diff --git "a/pretrain-checkpoint-10388/trainer_state.json" "b/pretrain-checkpoint-10388/trainer_state.json" new file mode 100644--- /dev/null +++ "b/pretrain-checkpoint-10388/trainer_state.json" @@ -0,0 +1,135110 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.9997593666530307, + "eval_steps": 500, + "global_step": 10388, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0, + "lm_loss": 9.0891, + "step": 0, + "vm_loss": 0.2726 + }, + { + "epoch": 0, + "lm_loss": 9.8504, + "step": 0, + "vm_loss": 0.2817 + }, + { + "epoch": 0, + "lm_loss": 9.959, + "step": 0, + "vm_loss": 0.3381 + }, + { + "epoch": 0, + "lm_loss": 10.3082, + "step": 0, + "vm_loss": 0.3483 + }, + { + "epoch": 0, + "lm_loss": 10.4086, + "step": 0, + "vm_loss": 0.3033 + }, + { + "epoch": 0, + "lm_loss": 10.3127, + "step": 0, + "vm_loss": 0.3025 + }, + { + "epoch": 0, + "lm_loss": 9.9909, + "step": 0, + "vm_loss": 0.2222 + }, + { + "epoch": 0, + "lm_loss": 10.4709, + "step": 0, + "vm_loss": 0.2834 + }, + { + "epoch": 0.0001925066775753784, + "grad_norm": 67.56311716151845, + "learning_rate": 6.41025641025641e-08, + "loss": 10.4004, + "step": 1 + }, + { + "epoch": 0.0003850133551507568, + "grad_norm": 67.54056470404237, + "learning_rate": 1.282051282051282e-07, + "loss": 10.3209, + "step": 2 + }, + { + "epoch": 0.0005775200327261352, + "grad_norm": 68.11791668225818, + "learning_rate": 1.9230769230769234e-07, + "loss": 10.3976, + "step": 3 + }, + { + "epoch": 0.0007700267103015136, + "grad_norm": 67.99840906336297, + "learning_rate": 2.564102564102564e-07, + "loss": 10.458, + "step": 4 + }, + { + "epoch": 0.000962533387876892, + "grad_norm": 67.95113424782106, + "learning_rate": 3.205128205128205e-07, + "loss": 10.4427, + "step": 5 + }, + { + "epoch": 0.0011550400654522703, + "grad_norm": 67.5200481108944, + "learning_rate": 3.846153846153847e-07, + "loss": 10.3632, + "step": 6 + }, + { + "epoch": 0.0013475467430276488, + "grad_norm": 67.25707550005988, + "learning_rate": 4.4871794871794876e-07, + "loss": 10.439, + "step": 7 + }, + { + "epoch": 0.0015400534206030272, + "grad_norm": 67.13059075776133, + "learning_rate": 5.128205128205128e-07, + "loss": 10.3051, + "step": 8 + }, + { + "epoch": 0.0015400534206030272, + "lm_loss": 9.8455, + "step": 8, + "vm_loss": 0.2447 + }, + { + "epoch": 0.0015400534206030272, + "lm_loss": 10.1427, + "step": 8, + "vm_loss": 0.2629 + }, + { + "epoch": 0.0015400534206030272, + "lm_loss": 9.5623, + "step": 8, + "vm_loss": 0.2549 + }, + { + "epoch": 0.0015400534206030272, + "lm_loss": 9.9834, + "step": 8, + "vm_loss": 0.3022 + }, + { + "epoch": 0.0015400534206030272, + "lm_loss": 9.5318, + "step": 8, + "vm_loss": 0.2696 + }, + { + "epoch": 0.0015400534206030272, + "lm_loss": 10.0009, + "step": 8, + "vm_loss": 0.2668 + }, + { + "epoch": 0.0015400534206030272, + "lm_loss": 10.0723, + "step": 8, + "vm_loss": 0.2553 + }, + { + "epoch": 0.0015400534206030272, + "lm_loss": 10.1124, + "step": 8, + "vm_loss": 0.3022 + }, + { + "epoch": 0.0017325600981784055, + "grad_norm": 66.30373484529393, + "learning_rate": 5.76923076923077e-07, + "loss": 10.3016, + "step": 9 + }, + { + "epoch": 0.001925066775753784, + "grad_norm": 62.07838358943419, + "learning_rate": 6.41025641025641e-07, + "loss": 10.2373, + "step": 10 + }, + { + "epoch": 0.002117573453329162, + "grad_norm": 61.37073291647445, + "learning_rate": 7.051282051282052e-07, + "loss": 10.1884, + "step": 11 + }, + { + "epoch": 0.0023100801309045406, + "grad_norm": 61.400977655344526, + "learning_rate": 7.692307692307694e-07, + "loss": 10.0603, + "step": 12 + }, + { + "epoch": 0.002502586808479919, + "grad_norm": 50.182015447034196, + "learning_rate": 8.333333333333333e-07, + "loss": 9.7238, + "step": 13 + }, + { + "epoch": 0.0026950934860552975, + "grad_norm": 49.97607872876963, + "learning_rate": 8.974358974358975e-07, + "loss": 9.7353, + "step": 14 + }, + { + "epoch": 0.002887600163630676, + "grad_norm": 48.99447824537502, + "learning_rate": 9.615384615384617e-07, + "loss": 9.6302, + "step": 15 + }, + { + "epoch": 0.0030801068412060545, + "grad_norm": 48.10312448024197, + "learning_rate": 1.0256410256410257e-06, + "loss": 9.6281, + "step": 16 + }, + { + "epoch": 0.0030801068412060545, + "lm_loss": 9.0049, + "step": 16, + "vm_loss": 0.273 + }, + { + "epoch": 0.0030801068412060545, + "lm_loss": 9.1357, + "step": 16, + "vm_loss": 0.2863 + }, + { + "epoch": 0.0030801068412060545, + "lm_loss": 8.6543, + "step": 16, + "vm_loss": 0.2357 + }, + { + "epoch": 0.0030801068412060545, + "lm_loss": 9.2545, + "step": 16, + "vm_loss": 0.1919 + }, + { + "epoch": 0.0030801068412060545, + "lm_loss": 9.4352, + "step": 16, + "vm_loss": 0.2756 + }, + { + "epoch": 0.0030801068412060545, + "lm_loss": 9.181, + "step": 16, + "vm_loss": 0.2176 + }, + { + "epoch": 0.0030801068412060545, + "lm_loss": 9.7709, + "step": 16, + "vm_loss": 0.3593 + }, + { + "epoch": 0.0030801068412060545, + "lm_loss": 8.8362, + "step": 16, + "vm_loss": 0.2298 + }, + { + "epoch": 0.003272613518781433, + "grad_norm": 45.12580958599678, + "learning_rate": 1.0897435897435899e-06, + "loss": 9.3547, + "step": 17 + }, + { + "epoch": 0.003465120196356811, + "grad_norm": 52.454030929150434, + "learning_rate": 1.153846153846154e-06, + "loss": 8.9004, + "step": 18 + }, + { + "epoch": 0.0036576268739321894, + "grad_norm": 49.37823434619467, + "learning_rate": 1.217948717948718e-06, + "loss": 8.885, + "step": 19 + }, + { + "epoch": 0.003850133551507568, + "grad_norm": 44.49325642467285, + "learning_rate": 1.282051282051282e-06, + "loss": 8.7918, + "step": 20 + }, + { + "epoch": 0.004042640229082946, + "grad_norm": 39.66645486846694, + "learning_rate": 1.3461538461538462e-06, + "loss": 8.7024, + "step": 21 + }, + { + "epoch": 0.004235146906658324, + "grad_norm": 38.43222276018352, + "learning_rate": 1.4102564102564104e-06, + "loss": 8.6347, + "step": 22 + }, + { + "epoch": 0.004427653584233703, + "grad_norm": 37.16065706647283, + "learning_rate": 1.4743589743589745e-06, + "loss": 8.4031, + "step": 23 + }, + { + "epoch": 0.004620160261809081, + "grad_norm": 40.34442755793192, + "learning_rate": 1.5384615384615387e-06, + "loss": 8.0557, + "step": 24 + }, + { + "epoch": 0.004620160261809081, + "lm_loss": 7.2879, + "step": 24, + "vm_loss": 0.2595 + }, + { + "epoch": 0.004620160261809081, + "lm_loss": 7.6324, + "step": 24, + "vm_loss": 0.2813 + }, + { + "epoch": 0.004620160261809081, + "lm_loss": 7.568, + "step": 24, + "vm_loss": 0.2289 + }, + { + "epoch": 0.004620160261809081, + "lm_loss": 7.6291, + "step": 24, + "vm_loss": 0.2515 + }, + { + "epoch": 0.004620160261809081, + "lm_loss": 7.6707, + "step": 24, + "vm_loss": 0.2657 + }, + { + "epoch": 0.004620160261809081, + "lm_loss": 7.5318, + "step": 24, + "vm_loss": 0.1969 + }, + { + "epoch": 0.004620160261809081, + "lm_loss": 7.5985, + "step": 24, + "vm_loss": 0.2884 + }, + { + "epoch": 0.004620160261809081, + "lm_loss": 7.6362, + "step": 24, + "vm_loss": 0.3132 + }, + { + "epoch": 0.00481266693938446, + "grad_norm": 39.60870239983037, + "learning_rate": 1.602564102564103e-06, + "loss": 7.8232, + "step": 25 + }, + { + "epoch": 0.005005173616959838, + "grad_norm": 39.4982593276097, + "learning_rate": 1.6666666666666667e-06, + "loss": 7.6741, + "step": 26 + }, + { + "epoch": 0.005197680294535217, + "grad_norm": 34.99435336916321, + "learning_rate": 1.7307692307692308e-06, + "loss": 7.4862, + "step": 27 + }, + { + "epoch": 0.005390186972110595, + "grad_norm": 30.470687063139007, + "learning_rate": 1.794871794871795e-06, + "loss": 7.416, + "step": 28 + }, + { + "epoch": 0.005582693649685973, + "grad_norm": 34.83143074077658, + "learning_rate": 1.8589743589743592e-06, + "loss": 7.4327, + "step": 29 + }, + { + "epoch": 0.005775200327261352, + "grad_norm": 36.79708928645268, + "learning_rate": 1.9230769230769234e-06, + "loss": 7.2392, + "step": 30 + }, + { + "epoch": 0.00596770700483673, + "grad_norm": 35.409654561711974, + "learning_rate": 1.987179487179487e-06, + "loss": 7.0838, + "step": 31 + }, + { + "epoch": 0.006160213682412109, + "grad_norm": 28.8551998968067, + "learning_rate": 2.0512820512820513e-06, + "loss": 6.8512, + "step": 32 + }, + { + "epoch": 0.006160213682412109, + "lm_loss": 5.9191, + "step": 32, + "vm_loss": 0.2617 + }, + { + "epoch": 0.006160213682412109, + "lm_loss": 6.5199, + "step": 32, + "vm_loss": 0.2488 + }, + { + "epoch": 0.006160213682412109, + "lm_loss": 6.9769, + "step": 32, + "vm_loss": 0.305 + }, + { + "epoch": 0.006160213682412109, + "lm_loss": 6.4268, + "step": 32, + "vm_loss": 0.3273 + }, + { + "epoch": 0.006160213682412109, + "lm_loss": 6.5986, + "step": 32, + "vm_loss": 0.2744 + }, + { + "epoch": 0.006160213682412109, + "lm_loss": 6.6533, + "step": 32, + "vm_loss": 0.2437 + }, + { + "epoch": 0.006160213682412109, + "lm_loss": 6.4629, + "step": 32, + "vm_loss": 0.2509 + }, + { + "epoch": 0.006160213682412109, + "lm_loss": 6.1676, + "step": 32, + "vm_loss": 0.2067 + }, + { + "epoch": 0.006352720359987487, + "grad_norm": 31.308951991177533, + "learning_rate": 2.1153846153846155e-06, + "loss": 6.7775, + "step": 33 + }, + { + "epoch": 0.006545227037562866, + "grad_norm": 39.603915694655704, + "learning_rate": 2.1794871794871797e-06, + "loss": 6.6871, + "step": 34 + }, + { + "epoch": 0.006737733715138244, + "grad_norm": 40.0007504045018, + "learning_rate": 2.243589743589744e-06, + "loss": 6.5464, + "step": 35 + }, + { + "epoch": 0.006930240392713622, + "grad_norm": 33.64736860797609, + "learning_rate": 2.307692307692308e-06, + "loss": 6.3299, + "step": 36 + }, + { + "epoch": 0.007122747070289001, + "grad_norm": 27.527054295587234, + "learning_rate": 2.371794871794872e-06, + "loss": 6.1941, + "step": 37 + }, + { + "epoch": 0.007315253747864379, + "grad_norm": 26.868103732745308, + "learning_rate": 2.435897435897436e-06, + "loss": 6.0506, + "step": 38 + }, + { + "epoch": 0.007507760425439758, + "grad_norm": 34.14811674624274, + "learning_rate": 2.5e-06, + "loss": 5.9916, + "step": 39 + }, + { + "epoch": 0.007700267103015136, + "grad_norm": 30.15744905014044, + "learning_rate": 2.564102564102564e-06, + "loss": 5.8761, + "step": 40 + }, + { + "epoch": 0.007700267103015136, + "lm_loss": 5.2691, + "step": 40, + "vm_loss": 0.2279 + }, + { + "epoch": 0.007700267103015136, + "lm_loss": 5.8057, + "step": 40, + "vm_loss": 0.2381 + }, + { + "epoch": 0.007700267103015136, + "lm_loss": 5.9513, + "step": 40, + "vm_loss": 0.1568 + }, + { + "epoch": 0.007700267103015136, + "lm_loss": 5.3976, + "step": 40, + "vm_loss": 0.2235 + }, + { + "epoch": 0.007700267103015136, + "lm_loss": 5.6982, + "step": 40, + "vm_loss": 0.2372 + }, + { + "epoch": 0.007700267103015136, + "lm_loss": 6.0874, + "step": 40, + "vm_loss": 0.1644 + }, + { + "epoch": 0.007700267103015136, + "lm_loss": 5.495, + "step": 40, + "vm_loss": 0.1906 + }, + { + "epoch": 0.007700267103015136, + "lm_loss": 5.3463, + "step": 40, + "vm_loss": 0.2759 + }, + { + "epoch": 0.007892773780590515, + "grad_norm": 24.27723895971218, + "learning_rate": 2.6282051282051286e-06, + "loss": 5.8348, + "step": 41 + }, + { + "epoch": 0.008085280458165893, + "grad_norm": 24.93716131037821, + "learning_rate": 2.6923076923076923e-06, + "loss": 5.8076, + "step": 42 + }, + { + "epoch": 0.00827778713574127, + "grad_norm": 25.96480278691456, + "learning_rate": 2.756410256410257e-06, + "loss": 5.5663, + "step": 43 + }, + { + "epoch": 0.008470293813316649, + "grad_norm": 22.172212733444184, + "learning_rate": 2.8205128205128207e-06, + "loss": 5.4675, + "step": 44 + }, + { + "epoch": 0.008662800490892028, + "grad_norm": 20.513481238836366, + "learning_rate": 2.8846153846153845e-06, + "loss": 5.4229, + "step": 45 + }, + { + "epoch": 0.008855307168467406, + "grad_norm": 21.502472356346956, + "learning_rate": 2.948717948717949e-06, + "loss": 5.3116, + "step": 46 + }, + { + "epoch": 0.009047813846042784, + "grad_norm": 19.880574155344647, + "learning_rate": 3.012820512820513e-06, + "loss": 5.3022, + "step": 47 + }, + { + "epoch": 0.009240320523618163, + "grad_norm": 17.837237753145605, + "learning_rate": 3.0769230769230774e-06, + "loss": 5.1557, + "step": 48 + }, + { + "epoch": 0.009240320523618163, + "lm_loss": 5.2803, + "step": 48, + "vm_loss": 0.2251 + }, + { + "epoch": 0.009240320523618163, + "lm_loss": 4.4979, + "step": 48, + "vm_loss": 0.2392 + }, + { + "epoch": 0.009240320523618163, + "lm_loss": 4.9999, + "step": 48, + "vm_loss": 0.2557 + }, + { + "epoch": 0.009240320523618163, + "lm_loss": 4.6382, + "step": 48, + "vm_loss": 0.1606 + }, + { + "epoch": 0.009240320523618163, + "lm_loss": 4.3124, + "step": 48, + "vm_loss": 0.206 + }, + { + "epoch": 0.009240320523618163, + "lm_loss": 4.6165, + "step": 48, + "vm_loss": 0.2116 + }, + { + "epoch": 0.009240320523618163, + "lm_loss": 4.7613, + "step": 48, + "vm_loss": 0.133 + }, + { + "epoch": 0.009240320523618163, + "lm_loss": 5.1469, + "step": 48, + "vm_loss": 0.1783 + }, + { + "epoch": 0.009432827201193542, + "grad_norm": 19.125369584951166, + "learning_rate": 3.141025641025641e-06, + "loss": 5.1127, + "step": 49 + }, + { + "epoch": 0.00962533387876892, + "grad_norm": 18.88008556622611, + "learning_rate": 3.205128205128206e-06, + "loss": 4.994, + "step": 50 + }, + { + "epoch": 0.009817840556344298, + "grad_norm": 17.793665649291906, + "learning_rate": 3.2692307692307696e-06, + "loss": 4.8572, + "step": 51 + }, + { + "epoch": 0.010010347233919676, + "grad_norm": 15.95988062195999, + "learning_rate": 3.3333333333333333e-06, + "loss": 4.8681, + "step": 52 + }, + { + "epoch": 0.010202853911495054, + "grad_norm": 14.079401638833641, + "learning_rate": 3.397435897435898e-06, + "loss": 4.7364, + "step": 53 + }, + { + "epoch": 0.010395360589070434, + "grad_norm": 15.252274114094016, + "learning_rate": 3.4615384615384617e-06, + "loss": 4.6981, + "step": 54 + }, + { + "epoch": 0.010587867266645812, + "grad_norm": 14.969515784687696, + "learning_rate": 3.5256410256410263e-06, + "loss": 4.5873, + "step": 55 + }, + { + "epoch": 0.01078037394422119, + "grad_norm": 14.594612860831406, + "learning_rate": 3.58974358974359e-06, + "loss": 4.6019, + "step": 56 + }, + { + "epoch": 0.01078037394422119, + "lm_loss": 4.0126, + "step": 56, + "vm_loss": 0.1791 + }, + { + "epoch": 0.01078037394422119, + "lm_loss": 4.126, + "step": 56, + "vm_loss": 0.2154 + }, + { + "epoch": 0.01078037394422119, + "lm_loss": 4.2023, + "step": 56, + "vm_loss": 0.2398 + }, + { + "epoch": 0.01078037394422119, + "lm_loss": 4.4627, + "step": 56, + "vm_loss": 0.216 + }, + { + "epoch": 0.01078037394422119, + "lm_loss": 4.3084, + "step": 56, + "vm_loss": 0.2611 + }, + { + "epoch": 0.01078037394422119, + "lm_loss": 4.1678, + "step": 56, + "vm_loss": 0.2361 + }, + { + "epoch": 0.01078037394422119, + "lm_loss": 4.646, + "step": 56, + "vm_loss": 0.2391 + }, + { + "epoch": 0.01078037394422119, + "lm_loss": 4.7998, + "step": 56, + "vm_loss": 0.3156 + }, + { + "epoch": 0.010972880621796568, + "grad_norm": 12.555763755713865, + "learning_rate": 3.653846153846154e-06, + "loss": 4.5263, + "step": 57 + }, + { + "epoch": 0.011165387299371946, + "grad_norm": 13.506694461540164, + "learning_rate": 3.7179487179487184e-06, + "loss": 4.3911, + "step": 58 + }, + { + "epoch": 0.011357893976947326, + "grad_norm": 13.69467907982829, + "learning_rate": 3.782051282051282e-06, + "loss": 4.4337, + "step": 59 + }, + { + "epoch": 0.011550400654522704, + "grad_norm": 11.933853505007217, + "learning_rate": 3.846153846153847e-06, + "loss": 4.3236, + "step": 60 + }, + { + "epoch": 0.011742907332098082, + "grad_norm": 11.05845132436078, + "learning_rate": 3.910256410256411e-06, + "loss": 4.336, + "step": 61 + }, + { + "epoch": 0.01193541400967346, + "grad_norm": 12.193715903046733, + "learning_rate": 3.974358974358974e-06, + "loss": 4.3291, + "step": 62 + }, + { + "epoch": 0.01212792068724884, + "grad_norm": 11.092158638717908, + "learning_rate": 4.0384615384615385e-06, + "loss": 4.1942, + "step": 63 + }, + { + "epoch": 0.012320427364824218, + "grad_norm": 10.453586342361072, + "learning_rate": 4.102564102564103e-06, + "loss": 4.2205, + "step": 64 + }, + { + "epoch": 0.012320427364824218, + "lm_loss": 4.2783, + "step": 64, + "vm_loss": 0.2464 + }, + { + "epoch": 0.012320427364824218, + "lm_loss": 4.4476, + "step": 64, + "vm_loss": 0.2418 + }, + { + "epoch": 0.012320427364824218, + "lm_loss": 4.248, + "step": 64, + "vm_loss": 0.2461 + }, + { + "epoch": 0.012320427364824218, + "lm_loss": 3.9782, + "step": 64, + "vm_loss": 0.1599 + }, + { + "epoch": 0.012320427364824218, + "lm_loss": 3.9591, + "step": 64, + "vm_loss": 0.2176 + }, + { + "epoch": 0.012320427364824218, + "lm_loss": 4.1235, + "step": 64, + "vm_loss": 0.2085 + }, + { + "epoch": 0.012320427364824218, + "lm_loss": 3.9965, + "step": 64, + "vm_loss": 0.1892 + }, + { + "epoch": 0.012320427364824218, + "lm_loss": 3.8855, + "step": 64, + "vm_loss": 0.2005 + }, + { + "epoch": 0.012512934042399596, + "grad_norm": 10.571151275524029, + "learning_rate": 4.166666666666667e-06, + "loss": 4.1813, + "step": 65 + }, + { + "epoch": 0.012705440719974974, + "grad_norm": 10.332021130036583, + "learning_rate": 4.230769230769231e-06, + "loss": 4.147, + "step": 66 + }, + { + "epoch": 0.012897947397550352, + "grad_norm": 9.078977356392668, + "learning_rate": 4.294871794871795e-06, + "loss": 4.1122, + "step": 67 + }, + { + "epoch": 0.013090454075125732, + "grad_norm": 9.249462822436183, + "learning_rate": 4.358974358974359e-06, + "loss": 4.0811, + "step": 68 + }, + { + "epoch": 0.01328296075270111, + "grad_norm": 10.233531959350929, + "learning_rate": 4.423076923076924e-06, + "loss": 4.0307, + "step": 69 + }, + { + "epoch": 0.013475467430276488, + "grad_norm": 9.487386336050799, + "learning_rate": 4.487179487179488e-06, + "loss": 3.9154, + "step": 70 + }, + { + "epoch": 0.013667974107851866, + "grad_norm": 9.328143981087013, + "learning_rate": 4.551282051282052e-06, + "loss": 3.8843, + "step": 71 + }, + { + "epoch": 0.013860480785427244, + "grad_norm": 8.526946206961572, + "learning_rate": 4.615384615384616e-06, + "loss": 3.894, + "step": 72 + }, + { + "epoch": 0.013860480785427244, + "lm_loss": 3.9402, + "step": 72, + "vm_loss": 0.2257 + }, + { + "epoch": 0.013860480785427244, + "lm_loss": 3.6234, + "step": 72, + "vm_loss": 0.2238 + }, + { + "epoch": 0.013860480785427244, + "lm_loss": 3.8825, + "step": 72, + "vm_loss": 0.2857 + }, + { + "epoch": 0.013860480785427244, + "lm_loss": 4.1888, + "step": 72, + "vm_loss": 0.2127 + }, + { + "epoch": 0.013860480785427244, + "lm_loss": 3.8483, + "step": 72, + "vm_loss": 0.2016 + }, + { + "epoch": 0.013860480785427244, + "lm_loss": 3.585, + "step": 72, + "vm_loss": 0.1955 + }, + { + "epoch": 0.013860480785427244, + "lm_loss": 3.6869, + "step": 72, + "vm_loss": 0.1909 + }, + { + "epoch": 0.013860480785427244, + "lm_loss": 3.7044, + "step": 72, + "vm_loss": 0.2009 + }, + { + "epoch": 0.014052987463002624, + "grad_norm": 8.976446161591616, + "learning_rate": 4.6794871794871795e-06, + "loss": 3.9352, + "step": 73 + }, + { + "epoch": 0.014245494140578002, + "grad_norm": 8.57017614217942, + "learning_rate": 4.743589743589744e-06, + "loss": 3.8516, + "step": 74 + }, + { + "epoch": 0.01443800081815338, + "grad_norm": 8.143374976367387, + "learning_rate": 4.807692307692308e-06, + "loss": 3.846, + "step": 75 + }, + { + "epoch": 0.014630507495728758, + "grad_norm": 8.268330580017869, + "learning_rate": 4.871794871794872e-06, + "loss": 3.7582, + "step": 76 + }, + { + "epoch": 0.014823014173304137, + "grad_norm": 7.974463043405866, + "learning_rate": 4.935897435897436e-06, + "loss": 3.8047, + "step": 77 + }, + { + "epoch": 0.015015520850879515, + "grad_norm": 7.243983118995253, + "learning_rate": 5e-06, + "loss": 3.7461, + "step": 78 + }, + { + "epoch": 0.015208027528454893, + "grad_norm": 6.784533296038897, + "learning_rate": 5.064102564102565e-06, + "loss": 3.7125, + "step": 79 + }, + { + "epoch": 0.015400534206030271, + "grad_norm": 7.5594748816071675, + "learning_rate": 5.128205128205128e-06, + "loss": 3.6808, + "step": 80 + }, + { + "epoch": 0.015400534206030271, + "lm_loss": 3.1976, + "step": 80, + "vm_loss": 0.1975 + }, + { + "epoch": 0.015400534206030271, + "lm_loss": 3.5419, + "step": 80, + "vm_loss": 0.2396 + }, + { + "epoch": 0.015400534206030271, + "lm_loss": 3.2571, + "step": 80, + "vm_loss": 0.2571 + }, + { + "epoch": 0.015400534206030271, + "lm_loss": 3.3721, + "step": 80, + "vm_loss": 0.2068 + }, + { + "epoch": 0.015400534206030271, + "lm_loss": 3.3478, + "step": 80, + "vm_loss": 0.197 + }, + { + "epoch": 0.015400534206030271, + "lm_loss": 2.9339, + "step": 80, + "vm_loss": 0.1939 + }, + { + "epoch": 0.015400534206030271, + "lm_loss": 3.8199, + "step": 80, + "vm_loss": 0.1557 + }, + { + "epoch": 0.015400534206030271, + "lm_loss": 3.6178, + "step": 80, + "vm_loss": 0.171 + }, + { + "epoch": 0.01559304088360565, + "grad_norm": 6.465158031015808, + "learning_rate": 5.192307692307693e-06, + "loss": 3.6744, + "step": 81 + }, + { + "epoch": 0.01578554756118103, + "grad_norm": 29.311170712167385, + "learning_rate": 5.256410256410257e-06, + "loss": 3.6273, + "step": 82 + }, + { + "epoch": 0.015978054238756405, + "grad_norm": 7.604136704214489, + "learning_rate": 5.320512820512821e-06, + "loss": 3.6615, + "step": 83 + }, + { + "epoch": 0.016170560916331785, + "grad_norm": 7.221914551372844, + "learning_rate": 5.384615384615385e-06, + "loss": 3.6641, + "step": 84 + }, + { + "epoch": 0.016363067593907165, + "grad_norm": 6.126998492930213, + "learning_rate": 5.448717948717949e-06, + "loss": 3.5409, + "step": 85 + }, + { + "epoch": 0.01655557427148254, + "grad_norm": 6.313025278980163, + "learning_rate": 5.512820512820514e-06, + "loss": 3.5991, + "step": 86 + }, + { + "epoch": 0.01674808094905792, + "grad_norm": 5.966017358027714, + "learning_rate": 5.576923076923077e-06, + "loss": 3.5317, + "step": 87 + }, + { + "epoch": 0.016940587626633297, + "grad_norm": 5.6035146911464855, + "learning_rate": 5.641025641025641e-06, + "loss": 3.5677, + "step": 88 + }, + { + "epoch": 0.016940587626633297, + "lm_loss": 3.6411, + "step": 88, + "vm_loss": 0.1805 + }, + { + "epoch": 0.016940587626633297, + "lm_loss": 3.3762, + "step": 88, + "vm_loss": 0.1739 + }, + { + "epoch": 0.016940587626633297, + "lm_loss": 3.2421, + "step": 88, + "vm_loss": 0.3299 + }, + { + "epoch": 0.016940587626633297, + "lm_loss": 3.1934, + "step": 88, + "vm_loss": 0.1558 + }, + { + "epoch": 0.016940587626633297, + "lm_loss": 2.9709, + "step": 88, + "vm_loss": 0.2082 + }, + { + "epoch": 0.016940587626633297, + "lm_loss": 3.2983, + "step": 88, + "vm_loss": 0.1868 + }, + { + "epoch": 0.016940587626633297, + "lm_loss": 3.6268, + "step": 88, + "vm_loss": 0.1523 + }, + { + "epoch": 0.016940587626633297, + "lm_loss": 3.1539, + "step": 88, + "vm_loss": 0.1471 + }, + { + "epoch": 0.017133094304208677, + "grad_norm": 7.174914137397918, + "learning_rate": 5.705128205128206e-06, + "loss": 3.525, + "step": 89 + }, + { + "epoch": 0.017325600981784057, + "grad_norm": 6.813223499274678, + "learning_rate": 5.769230769230769e-06, + "loss": 3.4842, + "step": 90 + }, + { + "epoch": 0.017518107659359433, + "grad_norm": 6.679518010719231, + "learning_rate": 5.833333333333334e-06, + "loss": 3.4315, + "step": 91 + }, + { + "epoch": 0.017710614336934813, + "grad_norm": 6.572097255682014, + "learning_rate": 5.897435897435898e-06, + "loss": 3.4732, + "step": 92 + }, + { + "epoch": 0.01790312101451019, + "grad_norm": 6.183997770654562, + "learning_rate": 5.961538461538462e-06, + "loss": 3.5838, + "step": 93 + }, + { + "epoch": 0.01809562769208557, + "grad_norm": 14.567870123394162, + "learning_rate": 6.025641025641026e-06, + "loss": 3.4775, + "step": 94 + }, + { + "epoch": 0.01828813436966095, + "grad_norm": 10.831077977064988, + "learning_rate": 6.08974358974359e-06, + "loss": 3.4321, + "step": 95 + }, + { + "epoch": 0.018480641047236325, + "grad_norm": 6.479427660592102, + "learning_rate": 6.153846153846155e-06, + "loss": 3.3915, + "step": 96 + }, + { + "epoch": 0.018480641047236325, + "lm_loss": 3.0723, + "step": 96, + "vm_loss": 0.1102 + }, + { + "epoch": 0.018480641047236325, + "lm_loss": 3.2662, + "step": 96, + "vm_loss": 0.185 + }, + { + "epoch": 0.018480641047236325, + "lm_loss": 3.3414, + "step": 96, + "vm_loss": 0.1149 + }, + { + "epoch": 0.018480641047236325, + "lm_loss": 3.2811, + "step": 96, + "vm_loss": 0.199 + }, + { + "epoch": 0.018480641047236325, + "lm_loss": 3.3468, + "step": 96, + "vm_loss": 0.1557 + }, + { + "epoch": 0.018480641047236325, + "lm_loss": 3.3057, + "step": 96, + "vm_loss": 0.1856 + }, + { + "epoch": 0.018480641047236325, + "lm_loss": 3.3988, + "step": 96, + "vm_loss": 0.162 + }, + { + "epoch": 0.018480641047236325, + "lm_loss": 3.2662, + "step": 96, + "vm_loss": 0.217 + }, + { + "epoch": 0.018673147724811705, + "grad_norm": 5.374474299425232, + "learning_rate": 6.217948717948718e-06, + "loss": 3.4264, + "step": 97 + }, + { + "epoch": 0.018865654402387085, + "grad_norm": 5.717364333879507, + "learning_rate": 6.282051282051282e-06, + "loss": 3.4044, + "step": 98 + }, + { + "epoch": 0.01905816107996246, + "grad_norm": 6.112177669084539, + "learning_rate": 6.3461538461538466e-06, + "loss": 3.4472, + "step": 99 + }, + { + "epoch": 0.01925066775753784, + "grad_norm": 4.467719887841041, + "learning_rate": 6.410256410256412e-06, + "loss": 3.4059, + "step": 100 + }, + { + "epoch": 0.019443174435113217, + "grad_norm": 8.25017683762081, + "learning_rate": 6.474358974358975e-06, + "loss": 3.4254, + "step": 101 + }, + { + "epoch": 0.019635681112688597, + "grad_norm": 5.735595100137572, + "learning_rate": 6.538461538461539e-06, + "loss": 3.3524, + "step": 102 + }, + { + "epoch": 0.019828187790263976, + "grad_norm": 6.23470405055773, + "learning_rate": 6.602564102564103e-06, + "loss": 3.3708, + "step": 103 + }, + { + "epoch": 0.020020694467839353, + "grad_norm": 6.098411804215811, + "learning_rate": 6.666666666666667e-06, + "loss": 3.358, + "step": 104 + }, + { + "epoch": 0.020020694467839353, + "lm_loss": 3.278, + "step": 104, + "vm_loss": 0.259 + }, + { + "epoch": 0.020020694467839353, + "lm_loss": 3.0134, + "step": 104, + "vm_loss": 0.2244 + }, + { + "epoch": 0.020020694467839353, + "lm_loss": 3.0034, + "step": 104, + "vm_loss": 0.2757 + }, + { + "epoch": 0.020020694467839353, + "lm_loss": 2.8226, + "step": 104, + "vm_loss": 0.1966 + }, + { + "epoch": 0.020020694467839353, + "lm_loss": 3.3339, + "step": 104, + "vm_loss": 0.1595 + }, + { + "epoch": 0.020020694467839353, + "lm_loss": 3.5293, + "step": 104, + "vm_loss": 0.2031 + }, + { + "epoch": 0.020020694467839353, + "lm_loss": 3.0764, + "step": 104, + "vm_loss": 0.1803 + }, + { + "epoch": 0.020020694467839353, + "lm_loss": 3.2529, + "step": 104, + "vm_loss": 0.2081 + }, + { + "epoch": 0.020213201145414732, + "grad_norm": 6.136288532192748, + "learning_rate": 6.730769230769232e-06, + "loss": 3.3935, + "step": 105 + }, + { + "epoch": 0.02040570782299011, + "grad_norm": 6.163567089968353, + "learning_rate": 6.794871794871796e-06, + "loss": 3.3032, + "step": 106 + }, + { + "epoch": 0.02059821450056549, + "grad_norm": 7.695161522115114, + "learning_rate": 6.858974358974359e-06, + "loss": 3.391, + "step": 107 + }, + { + "epoch": 0.020790721178140868, + "grad_norm": 8.284810986761684, + "learning_rate": 6.923076923076923e-06, + "loss": 3.3819, + "step": 108 + }, + { + "epoch": 0.020983227855716245, + "grad_norm": 6.829673110076974, + "learning_rate": 6.9871794871794876e-06, + "loss": 3.3247, + "step": 109 + }, + { + "epoch": 0.021175734533291624, + "grad_norm": 6.10533286566713, + "learning_rate": 7.051282051282053e-06, + "loss": 3.3494, + "step": 110 + }, + { + "epoch": 0.021368241210867, + "grad_norm": 10.155164642203296, + "learning_rate": 7.115384615384616e-06, + "loss": 3.3479, + "step": 111 + }, + { + "epoch": 0.02156074788844238, + "grad_norm": 9.492590327271007, + "learning_rate": 7.17948717948718e-06, + "loss": 3.3817, + "step": 112 + }, + { + "epoch": 0.02156074788844238, + "lm_loss": 2.8807, + "step": 112, + "vm_loss": 0.1734 + }, + { + "epoch": 0.02156074788844238, + "lm_loss": 3.1213, + "step": 112, + "vm_loss": 0.0985 + }, + { + "epoch": 0.02156074788844238, + "lm_loss": 3.221, + "step": 112, + "vm_loss": 0.2124 + }, + { + "epoch": 0.02156074788844238, + "lm_loss": 3.2356, + "step": 112, + "vm_loss": 0.1667 + }, + { + "epoch": 0.02156074788844238, + "lm_loss": 3.0598, + "step": 112, + "vm_loss": 0.1735 + }, + { + "epoch": 0.02156074788844238, + "lm_loss": 2.9602, + "step": 112, + "vm_loss": 0.1365 + }, + { + "epoch": 0.02156074788844238, + "lm_loss": 3.2267, + "step": 112, + "vm_loss": 0.2017 + }, + { + "epoch": 0.02156074788844238, + "lm_loss": 3.4341, + "step": 112, + "vm_loss": 0.2666 + }, + { + "epoch": 0.02175325456601776, + "grad_norm": 7.027929270068158, + "learning_rate": 7.243589743589744e-06, + "loss": 3.3258, + "step": 113 + }, + { + "epoch": 0.021945761243593136, + "grad_norm": 8.43797699584041, + "learning_rate": 7.307692307692308e-06, + "loss": 3.3145, + "step": 114 + }, + { + "epoch": 0.022138267921168516, + "grad_norm": 8.73386040218349, + "learning_rate": 7.371794871794873e-06, + "loss": 3.2792, + "step": 115 + }, + { + "epoch": 0.022330774598743892, + "grad_norm": 9.842525194493287, + "learning_rate": 7.435897435897437e-06, + "loss": 3.3344, + "step": 116 + }, + { + "epoch": 0.022523281276319272, + "grad_norm": 6.569715725981174, + "learning_rate": 7.500000000000001e-06, + "loss": 3.3751, + "step": 117 + }, + { + "epoch": 0.022715787953894652, + "grad_norm": 7.709524952064772, + "learning_rate": 7.564102564102564e-06, + "loss": 3.3565, + "step": 118 + }, + { + "epoch": 0.022908294631470028, + "grad_norm": 4.243711290650009, + "learning_rate": 7.6282051282051286e-06, + "loss": 3.2749, + "step": 119 + }, + { + "epoch": 0.023100801309045408, + "grad_norm": 13.538180070203053, + "learning_rate": 7.692307692307694e-06, + "loss": 3.2191, + "step": 120 + }, + { + "epoch": 0.023100801309045408, + "lm_loss": 3.0914, + "step": 120, + "vm_loss": 0.1795 + }, + { + "epoch": 0.023100801309045408, + "lm_loss": 3.1108, + "step": 120, + "vm_loss": 0.2037 + }, + { + "epoch": 0.023100801309045408, + "lm_loss": 3.3028, + "step": 120, + "vm_loss": 0.1673 + }, + { + "epoch": 0.023100801309045408, + "lm_loss": 3.2354, + "step": 120, + "vm_loss": 0.1284 + }, + { + "epoch": 0.023100801309045408, + "lm_loss": 2.8069, + "step": 120, + "vm_loss": 0.1626 + }, + { + "epoch": 0.023100801309045408, + "lm_loss": 2.93, + "step": 120, + "vm_loss": 0.1538 + }, + { + "epoch": 0.023100801309045408, + "lm_loss": 3.1315, + "step": 120, + "vm_loss": 0.1609 + }, + { + "epoch": 0.023100801309045408, + "lm_loss": 2.965, + "step": 120, + "vm_loss": 0.215 + }, + { + "epoch": 0.023293307986620784, + "grad_norm": 7.665741220768339, + "learning_rate": 7.756410256410258e-06, + "loss": 3.212, + "step": 121 + }, + { + "epoch": 0.023485814664196164, + "grad_norm": 10.226398256908233, + "learning_rate": 7.820512820512822e-06, + "loss": 3.2717, + "step": 122 + }, + { + "epoch": 0.023678321341771544, + "grad_norm": 5.948491598057429, + "learning_rate": 7.884615384615384e-06, + "loss": 3.2855, + "step": 123 + }, + { + "epoch": 0.02387082801934692, + "grad_norm": 14.647611390497222, + "learning_rate": 7.948717948717949e-06, + "loss": 3.2483, + "step": 124 + }, + { + "epoch": 0.0240633346969223, + "grad_norm": 8.256742315660889, + "learning_rate": 8.012820512820515e-06, + "loss": 3.2698, + "step": 125 + }, + { + "epoch": 0.02425584137449768, + "grad_norm": 7.138236374002898, + "learning_rate": 8.076923076923077e-06, + "loss": 3.2711, + "step": 126 + }, + { + "epoch": 0.024448348052073056, + "grad_norm": 9.256013416446951, + "learning_rate": 8.141025641025641e-06, + "loss": 3.3322, + "step": 127 + }, + { + "epoch": 0.024640854729648436, + "grad_norm": 7.537511602392607, + "learning_rate": 8.205128205128205e-06, + "loss": 3.2525, + "step": 128 + }, + { + "epoch": 0.024640854729648436, + "lm_loss": 3.1969, + "step": 128, + "vm_loss": 0.2097 + }, + { + "epoch": 0.024640854729648436, + "lm_loss": 3.5112, + "step": 128, + "vm_loss": 0.2458 + }, + { + "epoch": 0.024640854729648436, + "lm_loss": 3.2529, + "step": 128, + "vm_loss": 0.2193 + }, + { + "epoch": 0.024640854729648436, + "lm_loss": 3.2538, + "step": 128, + "vm_loss": 0.1252 + }, + { + "epoch": 0.024640854729648436, + "lm_loss": 3.1832, + "step": 128, + "vm_loss": 0.157 + }, + { + "epoch": 0.024640854729648436, + "lm_loss": 2.9437, + "step": 128, + "vm_loss": 0.1872 + }, + { + "epoch": 0.024640854729648436, + "lm_loss": 3.3408, + "step": 128, + "vm_loss": 0.1691 + }, + { + "epoch": 0.024640854729648436, + "lm_loss": 3.0984, + "step": 128, + "vm_loss": 0.1737 + }, + { + "epoch": 0.024833361407223812, + "grad_norm": 8.250978419450655, + "learning_rate": 8.26923076923077e-06, + "loss": 3.3227, + "step": 129 + }, + { + "epoch": 0.02502586808479919, + "grad_norm": 7.3628457173539, + "learning_rate": 8.333333333333334e-06, + "loss": 3.2849, + "step": 130 + }, + { + "epoch": 0.02521837476237457, + "grad_norm": 8.454176258940308, + "learning_rate": 8.397435897435898e-06, + "loss": 3.1913, + "step": 131 + }, + { + "epoch": 0.025410881439949948, + "grad_norm": 8.376173215840465, + "learning_rate": 8.461538461538462e-06, + "loss": 3.2459, + "step": 132 + }, + { + "epoch": 0.025603388117525328, + "grad_norm": 6.87582665691323, + "learning_rate": 8.525641025641026e-06, + "loss": 3.2125, + "step": 133 + }, + { + "epoch": 0.025795894795100704, + "grad_norm": 8.896240523232297, + "learning_rate": 8.58974358974359e-06, + "loss": 3.1578, + "step": 134 + }, + { + "epoch": 0.025988401472676084, + "grad_norm": 6.6724173301052865, + "learning_rate": 8.653846153846155e-06, + "loss": 3.2215, + "step": 135 + }, + { + "epoch": 0.026180908150251463, + "grad_norm": 6.6790152323660354, + "learning_rate": 8.717948717948719e-06, + "loss": 3.1951, + "step": 136 + }, + { + "epoch": 0.026180908150251463, + "lm_loss": 2.963, + "step": 136, + "vm_loss": 0.192 + }, + { + "epoch": 0.026180908150251463, + "lm_loss": 2.7938, + "step": 136, + "vm_loss": 0.1459 + }, + { + "epoch": 0.026180908150251463, + "lm_loss": 2.9, + "step": 136, + "vm_loss": 0.2829 + }, + { + "epoch": 0.026180908150251463, + "lm_loss": 3.0702, + "step": 136, + "vm_loss": 0.2815 + }, + { + "epoch": 0.026180908150251463, + "lm_loss": 2.7033, + "step": 136, + "vm_loss": 0.2858 + }, + { + "epoch": 0.026180908150251463, + "lm_loss": 2.5698, + "step": 136, + "vm_loss": 0.2013 + }, + { + "epoch": 0.026180908150251463, + "lm_loss": 2.808, + "step": 136, + "vm_loss": 0.1753 + }, + { + "epoch": 0.026180908150251463, + "lm_loss": 3.1382, + "step": 136, + "vm_loss": 0.1591 + }, + { + "epoch": 0.02637341482782684, + "grad_norm": 7.2813674097077685, + "learning_rate": 8.782051282051283e-06, + "loss": 3.2166, + "step": 137 + }, + { + "epoch": 0.02656592150540222, + "grad_norm": 8.149438744977191, + "learning_rate": 8.846153846153847e-06, + "loss": 3.1668, + "step": 138 + }, + { + "epoch": 0.026758428182977596, + "grad_norm": 8.287910029793027, + "learning_rate": 8.910256410256411e-06, + "loss": 3.1465, + "step": 139 + }, + { + "epoch": 0.026950934860552975, + "grad_norm": 6.719044446835403, + "learning_rate": 8.974358974358976e-06, + "loss": 3.2167, + "step": 140 + }, + { + "epoch": 0.027143441538128355, + "grad_norm": 7.011100244291259, + "learning_rate": 9.03846153846154e-06, + "loss": 3.2148, + "step": 141 + }, + { + "epoch": 0.02733594821570373, + "grad_norm": 6.1920104301004235, + "learning_rate": 9.102564102564104e-06, + "loss": 3.1763, + "step": 142 + }, + { + "epoch": 0.02752845489327911, + "grad_norm": 6.755555458151317, + "learning_rate": 9.166666666666666e-06, + "loss": 3.1924, + "step": 143 + }, + { + "epoch": 0.027720961570854488, + "grad_norm": 5.608491649533835, + "learning_rate": 9.230769230769232e-06, + "loss": 3.1493, + "step": 144 + }, + { + "epoch": 0.027720961570854488, + "lm_loss": 3.0104, + "step": 144, + "vm_loss": 0.2395 + }, + { + "epoch": 0.027720961570854488, + "lm_loss": 2.9678, + "step": 144, + "vm_loss": 0.1796 + }, + { + "epoch": 0.027720961570854488, + "lm_loss": 2.9748, + "step": 144, + "vm_loss": 0.1622 + }, + { + "epoch": 0.027720961570854488, + "lm_loss": 2.6869, + "step": 144, + "vm_loss": 0.2255 + }, + { + "epoch": 0.027720961570854488, + "lm_loss": 2.6758, + "step": 144, + "vm_loss": 0.1826 + }, + { + "epoch": 0.027720961570854488, + "lm_loss": 2.8615, + "step": 144, + "vm_loss": 0.1567 + }, + { + "epoch": 0.027720961570854488, + "lm_loss": 3.2455, + "step": 144, + "vm_loss": 0.2135 + }, + { + "epoch": 0.027720961570854488, + "lm_loss": 3.4464, + "step": 144, + "vm_loss": 0.1843 + }, + { + "epoch": 0.027913468248429867, + "grad_norm": 6.808337906129715, + "learning_rate": 9.294871794871796e-06, + "loss": 3.2025, + "step": 145 + }, + { + "epoch": 0.028105974926005247, + "grad_norm": 5.167818737857269, + "learning_rate": 9.358974358974359e-06, + "loss": 3.1271, + "step": 146 + }, + { + "epoch": 0.028298481603580623, + "grad_norm": 4.972735180235373, + "learning_rate": 9.423076923076923e-06, + "loss": 3.1229, + "step": 147 + }, + { + "epoch": 0.028490988281156003, + "grad_norm": 5.268973889273245, + "learning_rate": 9.487179487179487e-06, + "loss": 3.0918, + "step": 148 + }, + { + "epoch": 0.02868349495873138, + "grad_norm": 6.433975035834553, + "learning_rate": 9.551282051282053e-06, + "loss": 3.088, + "step": 149 + }, + { + "epoch": 0.02887600163630676, + "grad_norm": 4.771609768692185, + "learning_rate": 9.615384615384616e-06, + "loss": 3.0797, + "step": 150 + }, + { + "epoch": 0.02906850831388214, + "grad_norm": 5.673825586895475, + "learning_rate": 9.67948717948718e-06, + "loss": 3.132, + "step": 151 + }, + { + "epoch": 0.029261014991457515, + "grad_norm": 4.916995450862572, + "learning_rate": 9.743589743589744e-06, + "loss": 3.0724, + "step": 152 + }, + { + "epoch": 0.029261014991457515, + "lm_loss": 2.8541, + "step": 152, + "vm_loss": 0.162 + }, + { + "epoch": 0.029261014991457515, + "lm_loss": 2.814, + "step": 152, + "vm_loss": 0.1548 + }, + { + "epoch": 0.029261014991457515, + "lm_loss": 3.0889, + "step": 152, + "vm_loss": 0.2279 + }, + { + "epoch": 0.029261014991457515, + "lm_loss": 2.9812, + "step": 152, + "vm_loss": 0.2412 + }, + { + "epoch": 0.029261014991457515, + "lm_loss": 2.6343, + "step": 152, + "vm_loss": 0.1853 + }, + { + "epoch": 0.029261014991457515, + "lm_loss": 3.0641, + "step": 152, + "vm_loss": 0.1835 + }, + { + "epoch": 0.029261014991457515, + "lm_loss": 2.9087, + "step": 152, + "vm_loss": 0.1365 + }, + { + "epoch": 0.029261014991457515, + "lm_loss": 2.9947, + "step": 152, + "vm_loss": 0.21 + }, + { + "epoch": 0.029453521669032895, + "grad_norm": 5.345451285169419, + "learning_rate": 9.807692307692308e-06, + "loss": 3.1293, + "step": 153 + }, + { + "epoch": 0.029646028346608275, + "grad_norm": 5.255283821767704, + "learning_rate": 9.871794871794872e-06, + "loss": 3.0883, + "step": 154 + }, + { + "epoch": 0.02983853502418365, + "grad_norm": 7.053242397925785, + "learning_rate": 9.935897435897437e-06, + "loss": 3.1023, + "step": 155 + }, + { + "epoch": 0.03003104170175903, + "grad_norm": 5.450663155255625, + "learning_rate": 1e-05, + "loss": 3.0896, + "step": 156 + }, + { + "epoch": 0.030223548379334407, + "grad_norm": 6.9974723489596995, + "learning_rate": 1.0064102564102565e-05, + "loss": 3.0843, + "step": 157 + }, + { + "epoch": 0.030416055056909787, + "grad_norm": 6.575972837059719, + "learning_rate": 1.012820512820513e-05, + "loss": 3.053, + "step": 158 + }, + { + "epoch": 0.030608561734485167, + "grad_norm": 7.613242070548436, + "learning_rate": 1.0192307692307692e-05, + "loss": 3.0256, + "step": 159 + }, + { + "epoch": 0.030801068412060543, + "grad_norm": 5.877015920330703, + "learning_rate": 1.0256410256410256e-05, + "loss": 3.0322, + "step": 160 + }, + { + "epoch": 0.030801068412060543, + "lm_loss": 2.9073, + "step": 160, + "vm_loss": 0.2618 + }, + { + "epoch": 0.030801068412060543, + "lm_loss": 2.6029, + "step": 160, + "vm_loss": 0.2138 + }, + { + "epoch": 0.030801068412060543, + "lm_loss": 3.0235, + "step": 160, + "vm_loss": 0.2892 + }, + { + "epoch": 0.030801068412060543, + "lm_loss": 2.9285, + "step": 160, + "vm_loss": 0.2265 + }, + { + "epoch": 0.030801068412060543, + "lm_loss": 2.9395, + "step": 160, + "vm_loss": 0.1289 + }, + { + "epoch": 0.030801068412060543, + "lm_loss": 2.9529, + "step": 160, + "vm_loss": 0.2529 + }, + { + "epoch": 0.030801068412060543, + "lm_loss": 2.7037, + "step": 160, + "vm_loss": 0.2257 + }, + { + "epoch": 0.030801068412060543, + "lm_loss": 2.9467, + "step": 160, + "vm_loss": 0.2386 + }, + { + "epoch": 0.030993575089635923, + "grad_norm": 7.043293903695604, + "learning_rate": 1.0320512820512822e-05, + "loss": 3.1206, + "step": 161 + }, + { + "epoch": 0.0311860817672113, + "grad_norm": 5.026521806205481, + "learning_rate": 1.0384615384615386e-05, + "loss": 3.0801, + "step": 162 + }, + { + "epoch": 0.03137858844478668, + "grad_norm": 9.250564457388757, + "learning_rate": 1.044871794871795e-05, + "loss": 3.0663, + "step": 163 + }, + { + "epoch": 0.03157109512236206, + "grad_norm": 6.517869865889641, + "learning_rate": 1.0512820512820514e-05, + "loss": 3.057, + "step": 164 + }, + { + "epoch": 0.03176360179993744, + "grad_norm": 10.127518582941192, + "learning_rate": 1.0576923076923078e-05, + "loss": 3.0116, + "step": 165 + }, + { + "epoch": 0.03195610847751281, + "grad_norm": 7.467979551777786, + "learning_rate": 1.0641025641025643e-05, + "loss": 3.0232, + "step": 166 + }, + { + "epoch": 0.03214861515508819, + "grad_norm": 11.873610868910118, + "learning_rate": 1.0705128205128205e-05, + "loss": 3.0264, + "step": 167 + }, + { + "epoch": 0.03234112183266357, + "grad_norm": 7.355551676738349, + "learning_rate": 1.076923076923077e-05, + "loss": 3.0486, + "step": 168 + }, + { + "epoch": 0.03234112183266357, + "lm_loss": 2.8384, + "step": 168, + "vm_loss": 0.1768 + }, + { + "epoch": 0.03234112183266357, + "lm_loss": 2.8109, + "step": 168, + "vm_loss": 0.2679 + }, + { + "epoch": 0.03234112183266357, + "lm_loss": 3.158, + "step": 168, + "vm_loss": 0.1972 + }, + { + "epoch": 0.03234112183266357, + "lm_loss": 2.9281, + "step": 168, + "vm_loss": 0.1719 + }, + { + "epoch": 0.03234112183266357, + "lm_loss": 2.754, + "step": 168, + "vm_loss": 0.2043 + }, + { + "epoch": 0.03234112183266357, + "lm_loss": 2.8037, + "step": 168, + "vm_loss": 0.1854 + }, + { + "epoch": 0.03234112183266357, + "lm_loss": 2.6258, + "step": 168, + "vm_loss": 0.1763 + }, + { + "epoch": 0.03234112183266357, + "lm_loss": 2.6665, + "step": 168, + "vm_loss": 0.1676 + }, + { + "epoch": 0.03253362851023895, + "grad_norm": 10.013137269000024, + "learning_rate": 1.0833333333333334e-05, + "loss": 3.0366, + "step": 169 + }, + { + "epoch": 0.03272613518781433, + "grad_norm": 10.246365030507082, + "learning_rate": 1.0897435897435898e-05, + "loss": 3.0152, + "step": 170 + }, + { + "epoch": 0.0329186418653897, + "grad_norm": 12.843857885035476, + "learning_rate": 1.0961538461538464e-05, + "loss": 3.0374, + "step": 171 + }, + { + "epoch": 0.03311114854296508, + "grad_norm": 10.140554754776014, + "learning_rate": 1.1025641025641028e-05, + "loss": 3.0704, + "step": 172 + }, + { + "epoch": 0.03330365522054046, + "grad_norm": 7.867327631267269, + "learning_rate": 1.1089743589743592e-05, + "loss": 2.9356, + "step": 173 + }, + { + "epoch": 0.03349616189811584, + "grad_norm": 8.407609223787436, + "learning_rate": 1.1153846153846154e-05, + "loss": 2.9535, + "step": 174 + }, + { + "epoch": 0.03368866857569122, + "grad_norm": 8.469515518588421, + "learning_rate": 1.1217948717948719e-05, + "loss": 3.0489, + "step": 175 + }, + { + "epoch": 0.033881175253266595, + "grad_norm": 6.419257757286239, + "learning_rate": 1.1282051282051283e-05, + "loss": 2.9658, + "step": 176 + }, + { + "epoch": 0.033881175253266595, + "lm_loss": 3.1021, + "step": 176, + "vm_loss": 0.1547 + }, + { + "epoch": 0.033881175253266595, + "lm_loss": 2.7029, + "step": 176, + "vm_loss": 0.1769 + }, + { + "epoch": 0.033881175253266595, + "lm_loss": 3.1034, + "step": 176, + "vm_loss": 0.1611 + }, + { + "epoch": 0.033881175253266595, + "lm_loss": 2.6592, + "step": 176, + "vm_loss": 0.1725 + }, + { + "epoch": 0.033881175253266595, + "lm_loss": 2.5599, + "step": 176, + "vm_loss": 0.1273 + }, + { + "epoch": 0.033881175253266595, + "lm_loss": 2.65, + "step": 176, + "vm_loss": 0.2721 + }, + { + "epoch": 0.033881175253266595, + "lm_loss": 2.8335, + "step": 176, + "vm_loss": 0.1818 + }, + { + "epoch": 0.033881175253266595, + "lm_loss": 2.7471, + "step": 176, + "vm_loss": 0.2585 + }, + { + "epoch": 0.034073681930841974, + "grad_norm": 9.656691837353117, + "learning_rate": 1.1346153846153847e-05, + "loss": 2.977, + "step": 177 + }, + { + "epoch": 0.034266188608417354, + "grad_norm": 7.657196372897803, + "learning_rate": 1.1410256410256411e-05, + "loss": 2.953, + "step": 178 + }, + { + "epoch": 0.034458695285992734, + "grad_norm": 7.68150357555819, + "learning_rate": 1.1474358974358974e-05, + "loss": 2.9969, + "step": 179 + }, + { + "epoch": 0.034651201963568114, + "grad_norm": 6.58729845599872, + "learning_rate": 1.1538461538461538e-05, + "loss": 3.0113, + "step": 180 + }, + { + "epoch": 0.03484370864114349, + "grad_norm": 6.285673454310952, + "learning_rate": 1.1602564102564104e-05, + "loss": 2.9444, + "step": 181 + }, + { + "epoch": 0.035036215318718866, + "grad_norm": 5.761664043336924, + "learning_rate": 1.1666666666666668e-05, + "loss": 2.9881, + "step": 182 + }, + { + "epoch": 0.035228721996294246, + "grad_norm": 5.410695641675287, + "learning_rate": 1.1730769230769232e-05, + "loss": 2.9334, + "step": 183 + }, + { + "epoch": 0.035421228673869626, + "grad_norm": 6.023237121264645, + "learning_rate": 1.1794871794871796e-05, + "loss": 2.9166, + "step": 184 + }, + { + "epoch": 0.035421228673869626, + "lm_loss": 2.975, + "step": 184, + "vm_loss": 0.1624 + }, + { + "epoch": 0.035421228673869626, + "lm_loss": 3.5106, + "step": 184, + "vm_loss": 0.1783 + }, + { + "epoch": 0.035421228673869626, + "lm_loss": 3.418, + "step": 184, + "vm_loss": 0.2092 + }, + { + "epoch": 0.035421228673869626, + "lm_loss": 2.9029, + "step": 184, + "vm_loss": 0.1723 + }, + { + "epoch": 0.035421228673869626, + "lm_loss": 2.9491, + "step": 184, + "vm_loss": 0.1876 + }, + { + "epoch": 0.035421228673869626, + "lm_loss": 2.8136, + "step": 184, + "vm_loss": 0.1549 + }, + { + "epoch": 0.035421228673869626, + "lm_loss": 2.8904, + "step": 184, + "vm_loss": 0.2051 + }, + { + "epoch": 0.035421228673869626, + "lm_loss": 3.2085, + "step": 184, + "vm_loss": 0.1605 + }, + { + "epoch": 0.035613735351445006, + "grad_norm": 27.35729649401158, + "learning_rate": 1.185897435897436e-05, + "loss": 3.2101, + "step": 185 + }, + { + "epoch": 0.03580624202902038, + "grad_norm": 12.004355265836677, + "learning_rate": 1.1923076923076925e-05, + "loss": 3.2179, + "step": 186 + }, + { + "epoch": 0.03599874870659576, + "grad_norm": 13.72068537647716, + "learning_rate": 1.1987179487179487e-05, + "loss": 3.1634, + "step": 187 + }, + { + "epoch": 0.03619125538417114, + "grad_norm": 8.089655402274973, + "learning_rate": 1.2051282051282051e-05, + "loss": 3.0601, + "step": 188 + }, + { + "epoch": 0.03638376206174652, + "grad_norm": 48.303874189153646, + "learning_rate": 1.2115384615384615e-05, + "loss": 3.2995, + "step": 189 + }, + { + "epoch": 0.0365762687393219, + "grad_norm": 17.441258568769378, + "learning_rate": 1.217948717948718e-05, + "loss": 3.293, + "step": 190 + }, + { + "epoch": 0.03676877541689727, + "grad_norm": 12.222015674075696, + "learning_rate": 1.2243589743589746e-05, + "loss": 3.3524, + "step": 191 + }, + { + "epoch": 0.03696128209447265, + "grad_norm": 13.784523262562121, + "learning_rate": 1.230769230769231e-05, + "loss": 3.3192, + "step": 192 + }, + { + "epoch": 0.03696128209447265, + "lm_loss": 2.9604, + "step": 192, + "vm_loss": 0.2158 + }, + { + "epoch": 0.03696128209447265, + "lm_loss": 2.8938, + "step": 192, + "vm_loss": 0.217 + }, + { + "epoch": 0.03696128209447265, + "lm_loss": 3.1123, + "step": 192, + "vm_loss": 0.1741 + }, + { + "epoch": 0.03696128209447265, + "lm_loss": 2.9682, + "step": 192, + "vm_loss": 0.1867 + }, + { + "epoch": 0.03696128209447265, + "lm_loss": 3.089, + "step": 192, + "vm_loss": 0.2131 + }, + { + "epoch": 0.03696128209447265, + "lm_loss": 3.1525, + "step": 192, + "vm_loss": 0.1352 + }, + { + "epoch": 0.03696128209447265, + "lm_loss": 3.0075, + "step": 192, + "vm_loss": 0.2288 + }, + { + "epoch": 0.03696128209447265, + "lm_loss": 3.1707, + "step": 192, + "vm_loss": 0.1834 + }, + { + "epoch": 0.03715378877204803, + "grad_norm": 12.289969252302749, + "learning_rate": 1.2371794871794874e-05, + "loss": 3.291, + "step": 193 + }, + { + "epoch": 0.03734629544962341, + "grad_norm": 10.159277758140643, + "learning_rate": 1.2435897435897436e-05, + "loss": 3.136, + "step": 194 + }, + { + "epoch": 0.03753880212719879, + "grad_norm": 11.740631375133924, + "learning_rate": 1.25e-05, + "loss": 3.1739, + "step": 195 + }, + { + "epoch": 0.03773130880477417, + "grad_norm": 12.173446170654557, + "learning_rate": 1.2564102564102565e-05, + "loss": 3.1562, + "step": 196 + }, + { + "epoch": 0.03792381548234954, + "grad_norm": 8.518876013836504, + "learning_rate": 1.2628205128205129e-05, + "loss": 3.1074, + "step": 197 + }, + { + "epoch": 0.03811632215992492, + "grad_norm": 11.30323844302314, + "learning_rate": 1.2692307692307693e-05, + "loss": 3.1066, + "step": 198 + }, + { + "epoch": 0.0383088288375003, + "grad_norm": 7.691372851932579, + "learning_rate": 1.2756410256410257e-05, + "loss": 3.048, + "step": 199 + }, + { + "epoch": 0.03850133551507568, + "grad_norm": 8.468513935727255, + "learning_rate": 1.2820512820512823e-05, + "loss": 3.0237, + "step": 200 + }, + { + "epoch": 0.03850133551507568, + "lm_loss": 2.6696, + "step": 200, + "vm_loss": 0.197 + }, + { + "epoch": 0.03850133551507568, + "lm_loss": 2.7005, + "step": 200, + "vm_loss": 0.2201 + }, + { + "epoch": 0.03850133551507568, + "lm_loss": 2.8287, + "step": 200, + "vm_loss": 0.1616 + }, + { + "epoch": 0.03850133551507568, + "lm_loss": 2.8452, + "step": 200, + "vm_loss": 0.1832 + }, + { + "epoch": 0.03850133551507568, + "lm_loss": 2.9198, + "step": 200, + "vm_loss": 0.2 + }, + { + "epoch": 0.03850133551507568, + "lm_loss": 2.913, + "step": 200, + "vm_loss": 0.1677 + }, + { + "epoch": 0.03850133551507568, + "lm_loss": 2.7118, + "step": 200, + "vm_loss": 0.1508 + }, + { + "epoch": 0.03850133551507568, + "lm_loss": 2.9899, + "step": 200, + "vm_loss": 0.1693 + }, + { + "epoch": 0.03869384219265106, + "grad_norm": 7.116072705989573, + "learning_rate": 1.2884615384615386e-05, + "loss": 3.0521, + "step": 201 + }, + { + "epoch": 0.038886348870226434, + "grad_norm": 5.746238686341502, + "learning_rate": 1.294871794871795e-05, + "loss": 2.9985, + "step": 202 + }, + { + "epoch": 0.039078855547801813, + "grad_norm": 6.989671432523846, + "learning_rate": 1.3012820512820514e-05, + "loss": 2.9781, + "step": 203 + }, + { + "epoch": 0.03927136222537719, + "grad_norm": 5.9360335543901925, + "learning_rate": 1.3076923076923078e-05, + "loss": 2.933, + "step": 204 + }, + { + "epoch": 0.03946386890295257, + "grad_norm": 4.85956682392983, + "learning_rate": 1.3141025641025642e-05, + "loss": 2.9653, + "step": 205 + }, + { + "epoch": 0.03965637558052795, + "grad_norm": 6.550014255490412, + "learning_rate": 1.3205128205128207e-05, + "loss": 2.9506, + "step": 206 + }, + { + "epoch": 0.039848882258103326, + "grad_norm": 6.704605878999232, + "learning_rate": 1.3269230769230769e-05, + "loss": 2.9385, + "step": 207 + }, + { + "epoch": 0.040041388935678705, + "grad_norm": 4.346420782145899, + "learning_rate": 1.3333333333333333e-05, + "loss": 2.9289, + "step": 208 + }, + { + "epoch": 0.040041388935678705, + "lm_loss": 2.5593, + "step": 208, + "vm_loss": 0.2811 + }, + { + "epoch": 0.040041388935678705, + "lm_loss": 2.6893, + "step": 208, + "vm_loss": 0.1619 + }, + { + "epoch": 0.040041388935678705, + "lm_loss": 2.7023, + "step": 208, + "vm_loss": 0.2148 + }, + { + "epoch": 0.040041388935678705, + "lm_loss": 2.8206, + "step": 208, + "vm_loss": 0.1471 + }, + { + "epoch": 0.040041388935678705, + "lm_loss": 2.3611, + "step": 208, + "vm_loss": 0.1985 + }, + { + "epoch": 0.040041388935678705, + "lm_loss": 2.7323, + "step": 208, + "vm_loss": 0.1902 + }, + { + "epoch": 0.040041388935678705, + "lm_loss": 2.5944, + "step": 208, + "vm_loss": 0.2861 + }, + { + "epoch": 0.040041388935678705, + "lm_loss": 2.584, + "step": 208, + "vm_loss": 0.2026 + }, + { + "epoch": 0.040233895613254085, + "grad_norm": 5.774392950229877, + "learning_rate": 1.3397435897435897e-05, + "loss": 2.9859, + "step": 209 + }, + { + "epoch": 0.040426402290829465, + "grad_norm": 5.093636173741241, + "learning_rate": 1.3461538461538463e-05, + "loss": 2.9473, + "step": 210 + }, + { + "epoch": 0.040618908968404845, + "grad_norm": 4.300007709417803, + "learning_rate": 1.3525641025641028e-05, + "loss": 2.92, + "step": 211 + }, + { + "epoch": 0.04081141564598022, + "grad_norm": 5.809945210465388, + "learning_rate": 1.3589743589743592e-05, + "loss": 2.9335, + "step": 212 + }, + { + "epoch": 0.0410039223235556, + "grad_norm": 5.207221432710424, + "learning_rate": 1.3653846153846156e-05, + "loss": 2.9261, + "step": 213 + }, + { + "epoch": 0.04119642900113098, + "grad_norm": 4.896002202116585, + "learning_rate": 1.3717948717948718e-05, + "loss": 2.9121, + "step": 214 + }, + { + "epoch": 0.04138893567870636, + "grad_norm": 4.725368576157323, + "learning_rate": 1.3782051282051283e-05, + "loss": 2.8892, + "step": 215 + }, + { + "epoch": 0.041581442356281736, + "grad_norm": 5.002780050146259, + "learning_rate": 1.3846153846153847e-05, + "loss": 2.8562, + "step": 216 + }, + { + "epoch": 0.041581442356281736, + "lm_loss": 2.8098, + "step": 216, + "vm_loss": 0.1313 + }, + { + "epoch": 0.041581442356281736, + "lm_loss": 2.8638, + "step": 216, + "vm_loss": 0.2558 + }, + { + "epoch": 0.041581442356281736, + "lm_loss": 2.4597, + "step": 216, + "vm_loss": 0.2069 + }, + { + "epoch": 0.041581442356281736, + "lm_loss": 2.6925, + "step": 216, + "vm_loss": 0.1855 + }, + { + "epoch": 0.041581442356281736, + "lm_loss": 2.6028, + "step": 216, + "vm_loss": 0.1906 + }, + { + "epoch": 0.041581442356281736, + "lm_loss": 2.6288, + "step": 216, + "vm_loss": 0.2021 + }, + { + "epoch": 0.041581442356281736, + "lm_loss": 3.1163, + "step": 216, + "vm_loss": 0.1756 + }, + { + "epoch": 0.041581442356281736, + "lm_loss": 2.6315, + "step": 216, + "vm_loss": 0.2213 + }, + { + "epoch": 0.04177394903385711, + "grad_norm": 6.479679840921408, + "learning_rate": 1.3910256410256411e-05, + "loss": 2.9393, + "step": 217 + }, + { + "epoch": 0.04196645571143249, + "grad_norm": 4.572757285960612, + "learning_rate": 1.3974358974358975e-05, + "loss": 2.8804, + "step": 218 + }, + { + "epoch": 0.04215896238900787, + "grad_norm": 5.074904352294016, + "learning_rate": 1.403846153846154e-05, + "loss": 2.8821, + "step": 219 + }, + { + "epoch": 0.04235146906658325, + "grad_norm": 4.806943875282919, + "learning_rate": 1.4102564102564105e-05, + "loss": 2.9012, + "step": 220 + }, + { + "epoch": 0.04254397574415863, + "grad_norm": 5.470116200121896, + "learning_rate": 1.416666666666667e-05, + "loss": 2.8102, + "step": 221 + }, + { + "epoch": 0.042736482421734, + "grad_norm": 4.44921275322167, + "learning_rate": 1.4230769230769232e-05, + "loss": 2.8962, + "step": 222 + }, + { + "epoch": 0.04292898909930938, + "grad_norm": 5.662064155307462, + "learning_rate": 1.4294871794871796e-05, + "loss": 2.8708, + "step": 223 + }, + { + "epoch": 0.04312149577688476, + "grad_norm": 5.353328706835362, + "learning_rate": 1.435897435897436e-05, + "loss": 2.846, + "step": 224 + }, + { + "epoch": 0.04312149577688476, + "lm_loss": 2.4981, + "step": 224, + "vm_loss": 0.1698 + }, + { + "epoch": 0.04312149577688476, + "lm_loss": 2.7507, + "step": 224, + "vm_loss": 0.1174 + }, + { + "epoch": 0.04312149577688476, + "lm_loss": 2.8412, + "step": 224, + "vm_loss": 0.267 + }, + { + "epoch": 0.04312149577688476, + "lm_loss": 2.6686, + "step": 224, + "vm_loss": 0.1918 + }, + { + "epoch": 0.04312149577688476, + "lm_loss": 2.3326, + "step": 224, + "vm_loss": 0.2176 + }, + { + "epoch": 0.04312149577688476, + "lm_loss": 2.2581, + "step": 224, + "vm_loss": 0.1942 + }, + { + "epoch": 0.04312149577688476, + "lm_loss": 2.6294, + "step": 224, + "vm_loss": 0.1563 + }, + { + "epoch": 0.04312149577688476, + "lm_loss": 2.9014, + "step": 224, + "vm_loss": 0.1898 + }, + { + "epoch": 0.04331400245446014, + "grad_norm": 4.3566890598549755, + "learning_rate": 1.4423076923076924e-05, + "loss": 2.8706, + "step": 225 + }, + { + "epoch": 0.04350650913203552, + "grad_norm": 5.77724861491947, + "learning_rate": 1.4487179487179489e-05, + "loss": 2.8869, + "step": 226 + }, + { + "epoch": 0.04369901580961089, + "grad_norm": 4.689389375319442, + "learning_rate": 1.4551282051282051e-05, + "loss": 2.8259, + "step": 227 + }, + { + "epoch": 0.04389152248718627, + "grad_norm": 5.768321825370758, + "learning_rate": 1.4615384615384615e-05, + "loss": 2.8594, + "step": 228 + }, + { + "epoch": 0.04408402916476165, + "grad_norm": 5.055908443189705, + "learning_rate": 1.467948717948718e-05, + "loss": 2.8894, + "step": 229 + }, + { + "epoch": 0.04427653584233703, + "grad_norm": 4.829683354618881, + "learning_rate": 1.4743589743589745e-05, + "loss": 2.8516, + "step": 230 + }, + { + "epoch": 0.04446904251991241, + "grad_norm": 5.512977719670376, + "learning_rate": 1.480769230769231e-05, + "loss": 2.8592, + "step": 231 + }, + { + "epoch": 0.044661549197487785, + "grad_norm": 5.113545819702339, + "learning_rate": 1.4871794871794874e-05, + "loss": 2.841, + "step": 232 + }, + { + "epoch": 0.044661549197487785, + "lm_loss": 2.4145, + "step": 232, + "vm_loss": 0.1682 + }, + { + "epoch": 0.044661549197487785, + "lm_loss": 2.4598, + "step": 232, + "vm_loss": 0.2467 + }, + { + "epoch": 0.044661549197487785, + "lm_loss": 2.6577, + "step": 232, + "vm_loss": 0.2049 + }, + { + "epoch": 0.044661549197487785, + "lm_loss": 2.526, + "step": 232, + "vm_loss": 0.213 + }, + { + "epoch": 0.044661549197487785, + "lm_loss": 2.6536, + "step": 232, + "vm_loss": 0.1663 + }, + { + "epoch": 0.044661549197487785, + "lm_loss": 2.3985, + "step": 232, + "vm_loss": 0.1535 + }, + { + "epoch": 0.044661549197487785, + "lm_loss": 2.5585, + "step": 232, + "vm_loss": 0.2144 + }, + { + "epoch": 0.044661549197487785, + "lm_loss": 2.8372, + "step": 232, + "vm_loss": 0.2464 + }, + { + "epoch": 0.044854055875063165, + "grad_norm": 6.417788955615832, + "learning_rate": 1.4935897435897438e-05, + "loss": 2.8161, + "step": 233 + }, + { + "epoch": 0.045046562552638544, + "grad_norm": 3.71111057112546, + "learning_rate": 1.5000000000000002e-05, + "loss": 2.8409, + "step": 234 + }, + { + "epoch": 0.045239069230213924, + "grad_norm": 5.681294642281556, + "learning_rate": 1.5064102564102565e-05, + "loss": 2.8796, + "step": 235 + }, + { + "epoch": 0.045431575907789304, + "grad_norm": 4.2874115736934595, + "learning_rate": 1.5128205128205129e-05, + "loss": 2.846, + "step": 236 + }, + { + "epoch": 0.04562408258536468, + "grad_norm": 5.0998004345925905, + "learning_rate": 1.5192307692307693e-05, + "loss": 2.83, + "step": 237 + }, + { + "epoch": 0.045816589262940056, + "grad_norm": 4.857255113532344, + "learning_rate": 1.5256410256410257e-05, + "loss": 2.7729, + "step": 238 + }, + { + "epoch": 0.046009095940515436, + "grad_norm": 5.731345644511636, + "learning_rate": 1.5320512820512823e-05, + "loss": 2.8073, + "step": 239 + }, + { + "epoch": 0.046201602618090816, + "grad_norm": 4.074735776209843, + "learning_rate": 1.5384615384615387e-05, + "loss": 2.8033, + "step": 240 + }, + { + "epoch": 0.046201602618090816, + "lm_loss": 2.5881, + "step": 240, + "vm_loss": 0.1938 + }, + { + "epoch": 0.046201602618090816, + "lm_loss": 2.42, + "step": 240, + "vm_loss": 0.1851 + }, + { + "epoch": 0.046201602618090816, + "lm_loss": 2.8157, + "step": 240, + "vm_loss": 0.1463 + }, + { + "epoch": 0.046201602618090816, + "lm_loss": 2.755, + "step": 240, + "vm_loss": 0.1544 + }, + { + "epoch": 0.046201602618090816, + "lm_loss": 2.5736, + "step": 240, + "vm_loss": 0.1497 + }, + { + "epoch": 0.046201602618090816, + "lm_loss": 2.7088, + "step": 240, + "vm_loss": 0.225 + }, + { + "epoch": 0.046201602618090816, + "lm_loss": 2.4963, + "step": 240, + "vm_loss": 0.1749 + }, + { + "epoch": 0.046201602618090816, + "lm_loss": 2.8049, + "step": 240, + "vm_loss": 0.191 + }, + { + "epoch": 0.046394109295666196, + "grad_norm": 5.448771509383658, + "learning_rate": 1.544871794871795e-05, + "loss": 2.8039, + "step": 241 + }, + { + "epoch": 0.04658661597324157, + "grad_norm": 5.181558569222629, + "learning_rate": 1.5512820512820516e-05, + "loss": 2.8054, + "step": 242 + }, + { + "epoch": 0.04677912265081695, + "grad_norm": 4.837739297340354, + "learning_rate": 1.557692307692308e-05, + "loss": 2.8442, + "step": 243 + }, + { + "epoch": 0.04697162932839233, + "grad_norm": 5.31087875515781, + "learning_rate": 1.5641025641025644e-05, + "loss": 2.8542, + "step": 244 + }, + { + "epoch": 0.04716413600596771, + "grad_norm": 4.764992258784042, + "learning_rate": 1.5705128205128205e-05, + "loss": 2.7845, + "step": 245 + }, + { + "epoch": 0.04735664268354309, + "grad_norm": 6.284306004867487, + "learning_rate": 1.576923076923077e-05, + "loss": 2.7959, + "step": 246 + }, + { + "epoch": 0.04754914936111846, + "grad_norm": 5.321702902470818, + "learning_rate": 1.5833333333333333e-05, + "loss": 2.8045, + "step": 247 + }, + { + "epoch": 0.04774165603869384, + "grad_norm": 4.79852329988178, + "learning_rate": 1.5897435897435897e-05, + "loss": 2.8272, + "step": 248 + }, + { + "epoch": 0.04774165603869384, + "lm_loss": 2.1254, + "step": 248, + "vm_loss": 0.1234 + }, + { + "epoch": 0.04774165603869384, + "lm_loss": 2.7617, + "step": 248, + "vm_loss": 0.1652 + }, + { + "epoch": 0.04774165603869384, + "lm_loss": 2.5905, + "step": 248, + "vm_loss": 0.1817 + }, + { + "epoch": 0.04774165603869384, + "lm_loss": 2.71, + "step": 248, + "vm_loss": 0.1544 + }, + { + "epoch": 0.04774165603869384, + "lm_loss": 2.4503, + "step": 248, + "vm_loss": 0.1022 + }, + { + "epoch": 0.04774165603869384, + "lm_loss": 2.6204, + "step": 248, + "vm_loss": 0.2362 + }, + { + "epoch": 0.04774165603869384, + "lm_loss": 2.5351, + "step": 248, + "vm_loss": 0.1914 + }, + { + "epoch": 0.04774165603869384, + "lm_loss": 2.4125, + "step": 248, + "vm_loss": 0.1848 + }, + { + "epoch": 0.04793416271626922, + "grad_norm": 6.4595083159010835, + "learning_rate": 1.5961538461538465e-05, + "loss": 2.8033, + "step": 249 + }, + { + "epoch": 0.0481266693938446, + "grad_norm": 4.308638650480195, + "learning_rate": 1.602564102564103e-05, + "loss": 2.8103, + "step": 250 + }, + { + "epoch": 0.04831917607141998, + "grad_norm": 6.4101497723491425, + "learning_rate": 1.6089743589743593e-05, + "loss": 2.7673, + "step": 251 + }, + { + "epoch": 0.04851168274899536, + "grad_norm": 4.532903144942222, + "learning_rate": 1.6153846153846154e-05, + "loss": 2.8179, + "step": 252 + }, + { + "epoch": 0.04870418942657073, + "grad_norm": 4.748829857520044, + "learning_rate": 1.6217948717948718e-05, + "loss": 2.7881, + "step": 253 + }, + { + "epoch": 0.04889669610414611, + "grad_norm": 4.948904119589788, + "learning_rate": 1.6282051282051282e-05, + "loss": 2.7535, + "step": 254 + }, + { + "epoch": 0.04908920278172149, + "grad_norm": 4.6513772973707574, + "learning_rate": 1.6346153846153847e-05, + "loss": 2.7733, + "step": 255 + }, + { + "epoch": 0.04928170945929687, + "grad_norm": 4.294851496778557, + "learning_rate": 1.641025641025641e-05, + "loss": 2.8135, + "step": 256 + }, + { + "epoch": 0.04928170945929687, + "lm_loss": 2.7238, + "step": 256, + "vm_loss": 0.188 + }, + { + "epoch": 0.04928170945929687, + "lm_loss": 2.5891, + "step": 256, + "vm_loss": 0.1779 + }, + { + "epoch": 0.04928170945929687, + "lm_loss": 2.8327, + "step": 256, + "vm_loss": 0.1018 + }, + { + "epoch": 0.04928170945929687, + "lm_loss": 2.7131, + "step": 256, + "vm_loss": 0.1238 + }, + { + "epoch": 0.04928170945929687, + "lm_loss": 2.6686, + "step": 256, + "vm_loss": 0.1771 + }, + { + "epoch": 0.04928170945929687, + "lm_loss": 2.6499, + "step": 256, + "vm_loss": 0.1682 + }, + { + "epoch": 0.04928170945929687, + "lm_loss": 2.7558, + "step": 256, + "vm_loss": 0.1277 + }, + { + "epoch": 0.04928170945929687, + "lm_loss": 2.4651, + "step": 256, + "vm_loss": 0.1893 + }, + { + "epoch": 0.04947421613687225, + "grad_norm": 5.67444934915225, + "learning_rate": 1.6474358974358975e-05, + "loss": 2.75, + "step": 257 + }, + { + "epoch": 0.049666722814447624, + "grad_norm": 4.179304943641643, + "learning_rate": 1.653846153846154e-05, + "loss": 2.7486, + "step": 258 + }, + { + "epoch": 0.049859229492023004, + "grad_norm": 5.4761858408932, + "learning_rate": 1.6602564102564103e-05, + "loss": 2.7808, + "step": 259 + }, + { + "epoch": 0.05005173616959838, + "grad_norm": 4.61371812438267, + "learning_rate": 1.6666666666666667e-05, + "loss": 2.7898, + "step": 260 + }, + { + "epoch": 0.05024424284717376, + "grad_norm": 5.207440224754311, + "learning_rate": 1.673076923076923e-05, + "loss": 2.7671, + "step": 261 + }, + { + "epoch": 0.05043674952474914, + "grad_norm": 4.854998652086252, + "learning_rate": 1.6794871794871796e-05, + "loss": 2.7799, + "step": 262 + }, + { + "epoch": 0.050629256202324516, + "grad_norm": 5.043735751444399, + "learning_rate": 1.685897435897436e-05, + "loss": 2.7374, + "step": 263 + }, + { + "epoch": 0.050821762879899895, + "grad_norm": 5.142440309794773, + "learning_rate": 1.6923076923076924e-05, + "loss": 2.7828, + "step": 264 + }, + { + "epoch": 0.050821762879899895, + "lm_loss": 2.6345, + "step": 264, + "vm_loss": 0.2103 + }, + { + "epoch": 0.050821762879899895, + "lm_loss": 2.6301, + "step": 264, + "vm_loss": 0.151 + }, + { + "epoch": 0.050821762879899895, + "lm_loss": 2.5312, + "step": 264, + "vm_loss": 0.2164 + }, + { + "epoch": 0.050821762879899895, + "lm_loss": 2.6304, + "step": 264, + "vm_loss": 0.201 + }, + { + "epoch": 0.050821762879899895, + "lm_loss": 2.7943, + "step": 264, + "vm_loss": 0.2887 + }, + { + "epoch": 0.050821762879899895, + "lm_loss": 2.1806, + "step": 264, + "vm_loss": 0.2062 + }, + { + "epoch": 0.050821762879899895, + "lm_loss": 2.6636, + "step": 264, + "vm_loss": 0.1628 + }, + { + "epoch": 0.050821762879899895, + "lm_loss": 2.4347, + "step": 264, + "vm_loss": 0.2074 + }, + { + "epoch": 0.051014269557475275, + "grad_norm": 5.432414537043364, + "learning_rate": 1.698717948717949e-05, + "loss": 2.7929, + "step": 265 + }, + { + "epoch": 0.051206776235050655, + "grad_norm": 5.011886065042031, + "learning_rate": 1.7051282051282053e-05, + "loss": 2.752, + "step": 266 + }, + { + "epoch": 0.051399282912626035, + "grad_norm": 5.690458115360194, + "learning_rate": 1.7115384615384617e-05, + "loss": 2.7801, + "step": 267 + }, + { + "epoch": 0.05159178959020141, + "grad_norm": 4.9953507143641485, + "learning_rate": 1.717948717948718e-05, + "loss": 2.7851, + "step": 268 + }, + { + "epoch": 0.05178429626777679, + "grad_norm": 6.136330745638589, + "learning_rate": 1.7243589743589745e-05, + "loss": 2.8012, + "step": 269 + }, + { + "epoch": 0.05197680294535217, + "grad_norm": 5.398065985219351, + "learning_rate": 1.730769230769231e-05, + "loss": 2.758, + "step": 270 + }, + { + "epoch": 0.05216930962292755, + "grad_norm": 5.627593326165706, + "learning_rate": 1.7371794871794873e-05, + "loss": 2.7479, + "step": 271 + }, + { + "epoch": 0.05236181630050293, + "grad_norm": 5.4363601291925345, + "learning_rate": 1.7435897435897438e-05, + "loss": 2.7913, + "step": 272 + }, + { + "epoch": 0.05236181630050293, + "lm_loss": 2.6982, + "step": 272, + "vm_loss": 0.1953 + }, + { + "epoch": 0.05236181630050293, + "lm_loss": 2.7023, + "step": 272, + "vm_loss": 0.2113 + }, + { + "epoch": 0.05236181630050293, + "lm_loss": 2.5525, + "step": 272, + "vm_loss": 0.2256 + }, + { + "epoch": 0.05236181630050293, + "lm_loss": 2.607, + "step": 272, + "vm_loss": 0.2278 + }, + { + "epoch": 0.05236181630050293, + "lm_loss": 2.7356, + "step": 272, + "vm_loss": 0.2048 + }, + { + "epoch": 0.05236181630050293, + "lm_loss": 2.7488, + "step": 272, + "vm_loss": 0.166 + }, + { + "epoch": 0.05236181630050293, + "lm_loss": 2.5289, + "step": 272, + "vm_loss": 0.2322 + }, + { + "epoch": 0.05236181630050293, + "lm_loss": 2.4803, + "step": 272, + "vm_loss": 0.1715 + }, + { + "epoch": 0.0525543229780783, + "grad_norm": 6.71871940737774, + "learning_rate": 1.7500000000000002e-05, + "loss": 2.7886, + "step": 273 + }, + { + "epoch": 0.05274682965565368, + "grad_norm": 5.810392290014264, + "learning_rate": 1.7564102564102566e-05, + "loss": 2.8243, + "step": 274 + }, + { + "epoch": 0.05293933633322906, + "grad_norm": 4.220671687484487, + "learning_rate": 1.762820512820513e-05, + "loss": 2.7267, + "step": 275 + }, + { + "epoch": 0.05313184301080444, + "grad_norm": 3.997163303641086, + "learning_rate": 1.7692307692307694e-05, + "loss": 2.7728, + "step": 276 + }, + { + "epoch": 0.05332434968837982, + "grad_norm": 5.611693679867312, + "learning_rate": 1.775641025641026e-05, + "loss": 2.7983, + "step": 277 + }, + { + "epoch": 0.05351685636595519, + "grad_norm": 5.971487917375231, + "learning_rate": 1.7820512820512823e-05, + "loss": 2.7688, + "step": 278 + }, + { + "epoch": 0.05370936304353057, + "grad_norm": 5.502538116528449, + "learning_rate": 1.7884615384615387e-05, + "loss": 2.7181, + "step": 279 + }, + { + "epoch": 0.05390186972110595, + "grad_norm": 5.063807787749939, + "learning_rate": 1.794871794871795e-05, + "loss": 2.688, + "step": 280 + }, + { + "epoch": 0.05390186972110595, + "lm_loss": 2.3802, + "step": 280, + "vm_loss": 0.1336 + }, + { + "epoch": 0.05390186972110595, + "lm_loss": 2.3224, + "step": 280, + "vm_loss": 0.2148 + }, + { + "epoch": 0.05390186972110595, + "lm_loss": 2.7132, + "step": 280, + "vm_loss": 0.1906 + }, + { + "epoch": 0.05390186972110595, + "lm_loss": 2.5248, + "step": 280, + "vm_loss": 0.1877 + }, + { + "epoch": 0.05390186972110595, + "lm_loss": 2.3786, + "step": 280, + "vm_loss": 0.2183 + }, + { + "epoch": 0.05390186972110595, + "lm_loss": 2.5967, + "step": 280, + "vm_loss": 0.1829 + }, + { + "epoch": 0.05390186972110595, + "lm_loss": 2.6854, + "step": 280, + "vm_loss": 0.2494 + }, + { + "epoch": 0.05390186972110595, + "lm_loss": 2.6236, + "step": 280, + "vm_loss": 0.1349 + }, + { + "epoch": 0.05409437639868133, + "grad_norm": 5.557173810961538, + "learning_rate": 1.8012820512820515e-05, + "loss": 2.7521, + "step": 281 + }, + { + "epoch": 0.05428688307625671, + "grad_norm": 6.099894343529975, + "learning_rate": 1.807692307692308e-05, + "loss": 2.7306, + "step": 282 + }, + { + "epoch": 0.05447938975383208, + "grad_norm": 5.750374294384423, + "learning_rate": 1.8141025641025644e-05, + "loss": 2.6932, + "step": 283 + }, + { + "epoch": 0.05467189643140746, + "grad_norm": 4.596638318229386, + "learning_rate": 1.8205128205128208e-05, + "loss": 2.7395, + "step": 284 + }, + { + "epoch": 0.05486440310898284, + "grad_norm": 6.061063918190246, + "learning_rate": 1.826923076923077e-05, + "loss": 2.779, + "step": 285 + }, + { + "epoch": 0.05505690978655822, + "grad_norm": 6.843683391493255, + "learning_rate": 1.8333333333333333e-05, + "loss": 2.7758, + "step": 286 + }, + { + "epoch": 0.0552494164641336, + "grad_norm": 5.7417151251373, + "learning_rate": 1.8397435897435897e-05, + "loss": 2.6776, + "step": 287 + }, + { + "epoch": 0.055441923141708975, + "grad_norm": 5.95014629777928, + "learning_rate": 1.8461538461538465e-05, + "loss": 2.7491, + "step": 288 + }, + { + "epoch": 0.055441923141708975, + "lm_loss": 2.7807, + "step": 288, + "vm_loss": 0.1784 + }, + { + "epoch": 0.055441923141708975, + "lm_loss": 2.5052, + "step": 288, + "vm_loss": 0.1766 + }, + { + "epoch": 0.055441923141708975, + "lm_loss": 2.3852, + "step": 288, + "vm_loss": 0.158 + }, + { + "epoch": 0.055441923141708975, + "lm_loss": 2.5051, + "step": 288, + "vm_loss": 0.1478 + }, + { + "epoch": 0.055441923141708975, + "lm_loss": 2.5671, + "step": 288, + "vm_loss": 0.1419 + }, + { + "epoch": 0.055441923141708975, + "lm_loss": 2.7327, + "step": 288, + "vm_loss": 0.2274 + }, + { + "epoch": 0.055441923141708975, + "lm_loss": 2.9177, + "step": 288, + "vm_loss": 0.2134 + }, + { + "epoch": 0.055441923141708975, + "lm_loss": 2.2264, + "step": 288, + "vm_loss": 0.2377 + }, + { + "epoch": 0.055634429819284355, + "grad_norm": 7.481916905302612, + "learning_rate": 1.852564102564103e-05, + "loss": 2.7311, + "step": 289 + }, + { + "epoch": 0.055826936496859735, + "grad_norm": 10.666369396641484, + "learning_rate": 1.8589743589743593e-05, + "loss": 2.711, + "step": 290 + }, + { + "epoch": 0.056019443174435114, + "grad_norm": 6.643483793492336, + "learning_rate": 1.8653846153846157e-05, + "loss": 2.6804, + "step": 291 + }, + { + "epoch": 0.056211949852010494, + "grad_norm": 7.834274382181083, + "learning_rate": 1.8717948717948718e-05, + "loss": 2.7641, + "step": 292 + }, + { + "epoch": 0.05640445652958587, + "grad_norm": 5.481083990593822, + "learning_rate": 1.8782051282051282e-05, + "loss": 2.6757, + "step": 293 + }, + { + "epoch": 0.05659696320716125, + "grad_norm": 13.494098611415133, + "learning_rate": 1.8846153846153846e-05, + "loss": 2.7491, + "step": 294 + }, + { + "epoch": 0.056789469884736626, + "grad_norm": 7.929890808822521, + "learning_rate": 1.891025641025641e-05, + "loss": 2.7485, + "step": 295 + }, + { + "epoch": 0.056981976562312006, + "grad_norm": 8.078997650865817, + "learning_rate": 1.8974358974358975e-05, + "loss": 2.7422, + "step": 296 + }, + { + "epoch": 0.056981976562312006, + "lm_loss": 2.6819, + "step": 296, + "vm_loss": 0.1737 + }, + { + "epoch": 0.056981976562312006, + "lm_loss": 2.5034, + "step": 296, + "vm_loss": 0.2125 + }, + { + "epoch": 0.056981976562312006, + "lm_loss": 2.4533, + "step": 296, + "vm_loss": 0.2169 + }, + { + "epoch": 0.056981976562312006, + "lm_loss": 2.6871, + "step": 296, + "vm_loss": 0.1601 + }, + { + "epoch": 0.056981976562312006, + "lm_loss": 2.4849, + "step": 296, + "vm_loss": 0.134 + }, + { + "epoch": 0.056981976562312006, + "lm_loss": 2.5656, + "step": 296, + "vm_loss": 0.1683 + }, + { + "epoch": 0.056981976562312006, + "lm_loss": 2.5809, + "step": 296, + "vm_loss": 0.1474 + }, + { + "epoch": 0.056981976562312006, + "lm_loss": 2.4418, + "step": 296, + "vm_loss": 0.1517 + }, + { + "epoch": 0.057174483239887386, + "grad_norm": 6.614455299732487, + "learning_rate": 1.903846153846154e-05, + "loss": 2.7212, + "step": 297 + }, + { + "epoch": 0.05736698991746276, + "grad_norm": 5.590732664103167, + "learning_rate": 1.9102564102564106e-05, + "loss": 2.7189, + "step": 298 + }, + { + "epoch": 0.05755949659503814, + "grad_norm": 17.867175223198412, + "learning_rate": 1.916666666666667e-05, + "loss": 2.7574, + "step": 299 + }, + { + "epoch": 0.05775200327261352, + "grad_norm": 6.270512488879316, + "learning_rate": 1.923076923076923e-05, + "loss": 2.7043, + "step": 300 + }, + { + "epoch": 0.0579445099501889, + "grad_norm": 7.82771296805973, + "learning_rate": 1.9294871794871796e-05, + "loss": 2.7745, + "step": 301 + }, + { + "epoch": 0.05813701662776428, + "grad_norm": 6.860681826757564, + "learning_rate": 1.935897435897436e-05, + "loss": 2.7372, + "step": 302 + }, + { + "epoch": 0.05832952330533965, + "grad_norm": 7.9165276479442, + "learning_rate": 1.9423076923076924e-05, + "loss": 2.7511, + "step": 303 + }, + { + "epoch": 0.05852202998291503, + "grad_norm": 6.403564952036589, + "learning_rate": 1.9487179487179488e-05, + "loss": 2.725, + "step": 304 + }, + { + "epoch": 0.05852202998291503, + "lm_loss": 2.7356, + "step": 304, + "vm_loss": 0.1832 + }, + { + "epoch": 0.05852202998291503, + "lm_loss": 2.6297, + "step": 304, + "vm_loss": 0.1387 + }, + { + "epoch": 0.05852202998291503, + "lm_loss": 2.334, + "step": 304, + "vm_loss": 0.2014 + }, + { + "epoch": 0.05852202998291503, + "lm_loss": 2.5127, + "step": 304, + "vm_loss": 0.1754 + }, + { + "epoch": 0.05852202998291503, + "lm_loss": 2.5081, + "step": 304, + "vm_loss": 0.1757 + }, + { + "epoch": 0.05852202998291503, + "lm_loss": 2.4344, + "step": 304, + "vm_loss": 0.1567 + }, + { + "epoch": 0.05852202998291503, + "lm_loss": 2.8054, + "step": 304, + "vm_loss": 0.1783 + }, + { + "epoch": 0.05852202998291503, + "lm_loss": 2.5351, + "step": 304, + "vm_loss": 0.1959 + }, + { + "epoch": 0.05871453666049041, + "grad_norm": 6.838258056479387, + "learning_rate": 1.9551282051282052e-05, + "loss": 2.7151, + "step": 305 + }, + { + "epoch": 0.05890704333806579, + "grad_norm": 5.823482406268452, + "learning_rate": 1.9615384615384617e-05, + "loss": 2.7605, + "step": 306 + }, + { + "epoch": 0.05909955001564117, + "grad_norm": 7.102550818386516, + "learning_rate": 1.967948717948718e-05, + "loss": 2.7435, + "step": 307 + }, + { + "epoch": 0.05929205669321655, + "grad_norm": 5.5919661140690335, + "learning_rate": 1.9743589743589745e-05, + "loss": 2.7346, + "step": 308 + }, + { + "epoch": 0.05948456337079192, + "grad_norm": 6.437294533210534, + "learning_rate": 1.980769230769231e-05, + "loss": 2.6936, + "step": 309 + }, + { + "epoch": 0.0596770700483673, + "grad_norm": 7.530450249681175, + "learning_rate": 1.9871794871794873e-05, + "loss": 2.7374, + "step": 310 + }, + { + "epoch": 0.05986957672594268, + "grad_norm": 6.1723717590724005, + "learning_rate": 1.9935897435897437e-05, + "loss": 2.7802, + "step": 311 + }, + { + "epoch": 0.06006208340351806, + "grad_norm": 6.221761453623501, + "learning_rate": 2e-05, + "loss": 2.7176, + "step": 312 + }, + { + "epoch": 0.06006208340351806, + "lm_loss": 2.438, + "step": 312, + "vm_loss": 0.1857 + }, + { + "epoch": 0.06006208340351806, + "lm_loss": 2.4646, + "step": 312, + "vm_loss": 0.2016 + }, + { + "epoch": 0.06006208340351806, + "lm_loss": 2.6592, + "step": 312, + "vm_loss": 0.1966 + }, + { + "epoch": 0.06006208340351806, + "lm_loss": 2.5624, + "step": 312, + "vm_loss": 0.1687 + }, + { + "epoch": 0.06006208340351806, + "lm_loss": 2.7499, + "step": 312, + "vm_loss": 0.2068 + }, + { + "epoch": 0.06006208340351806, + "lm_loss": 2.4759, + "step": 312, + "vm_loss": 0.2478 + }, + { + "epoch": 0.06006208340351806, + "lm_loss": 2.4851, + "step": 312, + "vm_loss": 0.1711 + }, + { + "epoch": 0.06006208340351806, + "lm_loss": 2.534, + "step": 312, + "vm_loss": 0.1744 + }, + { + "epoch": 0.06025459008109344, + "grad_norm": 6.556227124834291, + "learning_rate": 1.9999999513936033e-05, + "loss": 2.726, + "step": 313 + }, + { + "epoch": 0.060447096758668814, + "grad_norm": 5.959865100550867, + "learning_rate": 1.9999998055744173e-05, + "loss": 2.7372, + "step": 314 + }, + { + "epoch": 0.060639603436244194, + "grad_norm": 5.833405481445536, + "learning_rate": 1.9999995625424566e-05, + "loss": 2.7118, + "step": 315 + }, + { + "epoch": 0.060832110113819574, + "grad_norm": 5.755580657232161, + "learning_rate": 1.9999992222977445e-05, + "loss": 2.7132, + "step": 316 + }, + { + "epoch": 0.06102461679139495, + "grad_norm": 5.755834599779296, + "learning_rate": 1.9999987848403148e-05, + "loss": 2.7577, + "step": 317 + }, + { + "epoch": 0.06121712346897033, + "grad_norm": 5.406238335625812, + "learning_rate": 1.999998250170209e-05, + "loss": 2.6694, + "step": 318 + }, + { + "epoch": 0.061409630146545706, + "grad_norm": 5.44359809849898, + "learning_rate": 1.9999976182874795e-05, + "loss": 2.702, + "step": 319 + }, + { + "epoch": 0.061602136824121086, + "grad_norm": 4.808249430953446, + "learning_rate": 1.9999968891921883e-05, + "loss": 2.725, + "step": 320 + }, + { + "epoch": 0.061602136824121086, + "lm_loss": 2.6041, + "step": 320, + "vm_loss": 0.1911 + }, + { + "epoch": 0.061602136824121086, + "lm_loss": 2.7065, + "step": 320, + "vm_loss": 0.1625 + }, + { + "epoch": 0.061602136824121086, + "lm_loss": 2.1316, + "step": 320, + "vm_loss": 0.1862 + }, + { + "epoch": 0.061602136824121086, + "lm_loss": 2.5332, + "step": 320, + "vm_loss": 0.1887 + }, + { + "epoch": 0.061602136824121086, + "lm_loss": 2.6907, + "step": 320, + "vm_loss": 0.2059 + }, + { + "epoch": 0.061602136824121086, + "lm_loss": 2.5532, + "step": 320, + "vm_loss": 0.1583 + }, + { + "epoch": 0.061602136824121086, + "lm_loss": 2.7553, + "step": 320, + "vm_loss": 0.1795 + }, + { + "epoch": 0.061602136824121086, + "lm_loss": 2.5232, + "step": 320, + "vm_loss": 0.2404 + }, + { + "epoch": 0.061794643501696465, + "grad_norm": 6.107397910778231, + "learning_rate": 1.9999960628844055e-05, + "loss": 2.6995, + "step": 321 + }, + { + "epoch": 0.061987150179271845, + "grad_norm": 6.137674852555086, + "learning_rate": 1.9999951393642117e-05, + "loss": 2.6803, + "step": 322 + }, + { + "epoch": 0.062179656856847225, + "grad_norm": 4.68510259297647, + "learning_rate": 1.9999941186316965e-05, + "loss": 2.7011, + "step": 323 + }, + { + "epoch": 0.0623721635344226, + "grad_norm": 5.37521618938404, + "learning_rate": 1.9999930006869593e-05, + "loss": 2.7482, + "step": 324 + }, + { + "epoch": 0.06256467021199798, + "grad_norm": 5.77930171018597, + "learning_rate": 1.999991785530109e-05, + "loss": 2.703, + "step": 325 + }, + { + "epoch": 0.06275717688957336, + "grad_norm": 4.852920462008338, + "learning_rate": 1.9999904731612635e-05, + "loss": 2.7423, + "step": 326 + }, + { + "epoch": 0.06294968356714874, + "grad_norm": 6.152316334811615, + "learning_rate": 1.9999890635805506e-05, + "loss": 2.7177, + "step": 327 + }, + { + "epoch": 0.06314219024472412, + "grad_norm": 5.0602729932716795, + "learning_rate": 1.999987556788107e-05, + "loss": 2.6476, + "step": 328 + }, + { + "epoch": 0.06314219024472412, + "lm_loss": 2.6998, + "step": 328, + "vm_loss": 0.2544 + }, + { + "epoch": 0.06314219024472412, + "lm_loss": 2.394, + "step": 328, + "vm_loss": 0.2385 + }, + { + "epoch": 0.06314219024472412, + "lm_loss": 2.6689, + "step": 328, + "vm_loss": 0.1637 + }, + { + "epoch": 0.06314219024472412, + "lm_loss": 2.8713, + "step": 328, + "vm_loss": 0.1683 + }, + { + "epoch": 0.06314219024472412, + "lm_loss": 2.5413, + "step": 328, + "vm_loss": 0.1094 + }, + { + "epoch": 0.06314219024472412, + "lm_loss": 2.5915, + "step": 328, + "vm_loss": 0.2122 + }, + { + "epoch": 0.06314219024472412, + "lm_loss": 2.1835, + "step": 328, + "vm_loss": 0.2006 + }, + { + "epoch": 0.06314219024472412, + "lm_loss": 2.3432, + "step": 328, + "vm_loss": 0.2225 + }, + { + "epoch": 0.0633346969222995, + "grad_norm": 5.53464512741921, + "learning_rate": 1.999985952784079e-05, + "loss": 2.7022, + "step": 329 + }, + { + "epoch": 0.06352720359987488, + "grad_norm": 5.071347296118864, + "learning_rate": 1.999984251568623e-05, + "loss": 2.6764, + "step": 330 + }, + { + "epoch": 0.06371971027745024, + "grad_norm": 4.8525379853183015, + "learning_rate": 1.9999824531419042e-05, + "loss": 2.6447, + "step": 331 + }, + { + "epoch": 0.06391221695502562, + "grad_norm": 4.974582527869404, + "learning_rate": 1.9999805575040976e-05, + "loss": 2.6804, + "step": 332 + }, + { + "epoch": 0.064104723632601, + "grad_norm": 4.445088280205642, + "learning_rate": 1.9999785646553872e-05, + "loss": 2.6619, + "step": 333 + }, + { + "epoch": 0.06429723031017638, + "grad_norm": 4.9470845625267765, + "learning_rate": 1.999976474595967e-05, + "loss": 2.7377, + "step": 334 + }, + { + "epoch": 0.06448973698775176, + "grad_norm": 5.472564416952947, + "learning_rate": 1.99997428732604e-05, + "loss": 2.6685, + "step": 335 + }, + { + "epoch": 0.06468224366532714, + "grad_norm": 4.325664357046897, + "learning_rate": 1.9999720028458185e-05, + "loss": 2.6585, + "step": 336 + }, + { + "epoch": 0.06468224366532714, + "lm_loss": 2.4, + "step": 336, + "vm_loss": 0.1781 + }, + { + "epoch": 0.06468224366532714, + "lm_loss": 2.4846, + "step": 336, + "vm_loss": 0.2043 + }, + { + "epoch": 0.06468224366532714, + "lm_loss": 2.7019, + "step": 336, + "vm_loss": 0.1693 + }, + { + "epoch": 0.06468224366532714, + "lm_loss": 2.6328, + "step": 336, + "vm_loss": 0.1672 + }, + { + "epoch": 0.06468224366532714, + "lm_loss": 2.6214, + "step": 336, + "vm_loss": 0.1762 + }, + { + "epoch": 0.06468224366532714, + "lm_loss": 2.4843, + "step": 336, + "vm_loss": 0.1179 + }, + { + "epoch": 0.06468224366532714, + "lm_loss": 2.3489, + "step": 336, + "vm_loss": 0.2547 + }, + { + "epoch": 0.06468224366532714, + "lm_loss": 2.7837, + "step": 336, + "vm_loss": 0.1768 + }, + { + "epoch": 0.06487475034290252, + "grad_norm": 5.665835095279862, + "learning_rate": 1.9999696211555254e-05, + "loss": 2.6667, + "step": 337 + }, + { + "epoch": 0.0650672570204779, + "grad_norm": 5.401863221189648, + "learning_rate": 1.9999671422553914e-05, + "loss": 2.6879, + "step": 338 + }, + { + "epoch": 0.06525976369805328, + "grad_norm": 5.134478680834371, + "learning_rate": 1.999964566145658e-05, + "loss": 2.6951, + "step": 339 + }, + { + "epoch": 0.06545227037562866, + "grad_norm": 5.10687350027081, + "learning_rate": 1.9999618928265758e-05, + "loss": 2.6853, + "step": 340 + }, + { + "epoch": 0.06564477705320404, + "grad_norm": 5.918541336370904, + "learning_rate": 1.9999591222984042e-05, + "loss": 2.7043, + "step": 341 + }, + { + "epoch": 0.0658372837307794, + "grad_norm": 5.401240241834477, + "learning_rate": 1.9999562545614125e-05, + "loss": 2.7002, + "step": 342 + }, + { + "epoch": 0.06602979040835479, + "grad_norm": 5.55776949635833, + "learning_rate": 1.9999532896158796e-05, + "loss": 2.6814, + "step": 343 + }, + { + "epoch": 0.06622229708593017, + "grad_norm": 4.940930329109183, + "learning_rate": 1.9999502274620942e-05, + "loss": 2.6935, + "step": 344 + }, + { + "epoch": 0.06622229708593017, + "lm_loss": 2.3664, + "step": 344, + "vm_loss": 0.2027 + }, + { + "epoch": 0.06622229708593017, + "lm_loss": 2.6444, + "step": 344, + "vm_loss": 0.2206 + }, + { + "epoch": 0.06622229708593017, + "lm_loss": 2.3721, + "step": 344, + "vm_loss": 0.188 + }, + { + "epoch": 0.06622229708593017, + "lm_loss": 2.4093, + "step": 344, + "vm_loss": 0.1907 + }, + { + "epoch": 0.06622229708593017, + "lm_loss": 2.4839, + "step": 344, + "vm_loss": 0.1584 + }, + { + "epoch": 0.06622229708593017, + "lm_loss": 2.3184, + "step": 344, + "vm_loss": 0.1386 + }, + { + "epoch": 0.06622229708593017, + "lm_loss": 2.3154, + "step": 344, + "vm_loss": 0.1418 + }, + { + "epoch": 0.06622229708593017, + "lm_loss": 2.5662, + "step": 344, + "vm_loss": 0.1927 + }, + { + "epoch": 0.06641480376350554, + "grad_norm": 5.795138431781234, + "learning_rate": 1.999947068100353e-05, + "loss": 2.6436, + "step": 345 + }, + { + "epoch": 0.06660731044108092, + "grad_norm": 4.1998792847036475, + "learning_rate": 1.9999438115309645e-05, + "loss": 2.6276, + "step": 346 + }, + { + "epoch": 0.0667998171186563, + "grad_norm": 5.81983355879425, + "learning_rate": 1.999940457754244e-05, + "loss": 2.6668, + "step": 347 + }, + { + "epoch": 0.06699232379623168, + "grad_norm": 4.675919662854624, + "learning_rate": 1.999937006770518e-05, + "loss": 2.6722, + "step": 348 + }, + { + "epoch": 0.06718483047380706, + "grad_norm": 5.551831943600783, + "learning_rate": 1.9999334585801223e-05, + "loss": 2.6038, + "step": 349 + }, + { + "epoch": 0.06737733715138244, + "grad_norm": 4.6632356114396005, + "learning_rate": 1.9999298131834015e-05, + "loss": 2.6595, + "step": 350 + }, + { + "epoch": 0.06756984382895782, + "grad_norm": 6.692896453659439, + "learning_rate": 1.9999260705807097e-05, + "loss": 2.6376, + "step": 351 + }, + { + "epoch": 0.06776235050653319, + "grad_norm": 5.343279283106529, + "learning_rate": 1.9999222307724112e-05, + "loss": 2.6047, + "step": 352 + }, + { + "epoch": 0.06776235050653319, + "lm_loss": 2.488, + "step": 352, + "vm_loss": 0.1852 + }, + { + "epoch": 0.06776235050653319, + "lm_loss": 2.4696, + "step": 352, + "vm_loss": 0.1415 + }, + { + "epoch": 0.06776235050653319, + "lm_loss": 2.3815, + "step": 352, + "vm_loss": 0.1449 + }, + { + "epoch": 0.06776235050653319, + "lm_loss": 2.3602, + "step": 352, + "vm_loss": 0.2399 + }, + { + "epoch": 0.06776235050653319, + "lm_loss": 2.5061, + "step": 352, + "vm_loss": 0.1495 + }, + { + "epoch": 0.06776235050653319, + "lm_loss": 2.4995, + "step": 352, + "vm_loss": 0.1858 + }, + { + "epoch": 0.06776235050653319, + "lm_loss": 2.6202, + "step": 352, + "vm_loss": 0.2258 + }, + { + "epoch": 0.06776235050653319, + "lm_loss": 2.2618, + "step": 352, + "vm_loss": 0.1772 + }, + { + "epoch": 0.06795485718410857, + "grad_norm": 4.913554407756347, + "learning_rate": 1.9999182937588793e-05, + "loss": 2.6339, + "step": 353 + }, + { + "epoch": 0.06814736386168395, + "grad_norm": 5.01951707989754, + "learning_rate": 1.9999142595404966e-05, + "loss": 2.6714, + "step": 354 + }, + { + "epoch": 0.06833987053925933, + "grad_norm": 5.629729224985602, + "learning_rate": 1.999910128117655e-05, + "loss": 2.6332, + "step": 355 + }, + { + "epoch": 0.06853237721683471, + "grad_norm": 6.026098965730194, + "learning_rate": 1.9999058994907564e-05, + "loss": 2.6521, + "step": 356 + }, + { + "epoch": 0.06872488389441009, + "grad_norm": 4.482636991401426, + "learning_rate": 1.9999015736602118e-05, + "loss": 2.6708, + "step": 357 + }, + { + "epoch": 0.06891739057198547, + "grad_norm": 4.728802487308409, + "learning_rate": 1.999897150626442e-05, + "loss": 2.6755, + "step": 358 + }, + { + "epoch": 0.06910989724956085, + "grad_norm": 4.421993026950499, + "learning_rate": 1.9998926303898766e-05, + "loss": 2.6556, + "step": 359 + }, + { + "epoch": 0.06930240392713623, + "grad_norm": 4.3441646914180865, + "learning_rate": 1.999888012950955e-05, + "loss": 2.6744, + "step": 360 + }, + { + "epoch": 0.06930240392713623, + "lm_loss": 2.4258, + "step": 360, + "vm_loss": 0.1655 + }, + { + "epoch": 0.06930240392713623, + "lm_loss": 2.5005, + "step": 360, + "vm_loss": 0.1746 + }, + { + "epoch": 0.06930240392713623, + "lm_loss": 2.6775, + "step": 360, + "vm_loss": 0.2432 + }, + { + "epoch": 0.06930240392713623, + "lm_loss": 2.6832, + "step": 360, + "vm_loss": 0.1881 + }, + { + "epoch": 0.06930240392713623, + "lm_loss": 2.34, + "step": 360, + "vm_loss": 0.1337 + }, + { + "epoch": 0.06930240392713623, + "lm_loss": 2.4566, + "step": 360, + "vm_loss": 0.1821 + }, + { + "epoch": 0.06930240392713623, + "lm_loss": 2.5994, + "step": 360, + "vm_loss": 0.211 + }, + { + "epoch": 0.06930240392713623, + "lm_loss": 2.3162, + "step": 360, + "vm_loss": 0.1979 + }, + { + "epoch": 0.06949491060471161, + "grad_norm": 5.2506038938294814, + "learning_rate": 1.9998832983101265e-05, + "loss": 2.6272, + "step": 361 + }, + { + "epoch": 0.06968741728228697, + "grad_norm": 5.284536186850513, + "learning_rate": 1.9998784864678492e-05, + "loss": 2.6672, + "step": 362 + }, + { + "epoch": 0.06987992395986235, + "grad_norm": 4.8286689622159455, + "learning_rate": 1.9998735774245906e-05, + "loss": 2.6614, + "step": 363 + }, + { + "epoch": 0.07007243063743773, + "grad_norm": 4.687684021072952, + "learning_rate": 1.9998685711808282e-05, + "loss": 2.6284, + "step": 364 + }, + { + "epoch": 0.07026493731501311, + "grad_norm": 4.1144995267156546, + "learning_rate": 1.9998634677370488e-05, + "loss": 2.7031, + "step": 365 + }, + { + "epoch": 0.07045744399258849, + "grad_norm": 4.292179436658936, + "learning_rate": 1.999858267093748e-05, + "loss": 2.6015, + "step": 366 + }, + { + "epoch": 0.07064995067016387, + "grad_norm": 4.274711940139827, + "learning_rate": 1.9998529692514318e-05, + "loss": 2.6082, + "step": 367 + }, + { + "epoch": 0.07084245734773925, + "grad_norm": 4.6390138924938995, + "learning_rate": 1.9998475742106155e-05, + "loss": 2.6544, + "step": 368 + }, + { + "epoch": 0.07084245734773925, + "lm_loss": 2.0815, + "step": 368, + "vm_loss": 0.1217 + }, + { + "epoch": 0.07084245734773925, + "lm_loss": 2.2327, + "step": 368, + "vm_loss": 0.2462 + }, + { + "epoch": 0.07084245734773925, + "lm_loss": 2.3392, + "step": 368, + "vm_loss": 0.1348 + }, + { + "epoch": 0.07084245734773925, + "lm_loss": 2.7784, + "step": 368, + "vm_loss": 0.2694 + }, + { + "epoch": 0.07084245734773925, + "lm_loss": 2.6468, + "step": 368, + "vm_loss": 0.2823 + }, + { + "epoch": 0.07084245734773925, + "lm_loss": 2.2761, + "step": 368, + "vm_loss": 0.3097 + }, + { + "epoch": 0.07084245734773925, + "lm_loss": 2.2707, + "step": 368, + "vm_loss": 0.1692 + }, + { + "epoch": 0.07084245734773925, + "lm_loss": 2.7296, + "step": 368, + "vm_loss": 0.2275 + }, + { + "epoch": 0.07103496402531463, + "grad_norm": 4.603914347689331, + "learning_rate": 1.9998420819718228e-05, + "loss": 2.6714, + "step": 369 + }, + { + "epoch": 0.07122747070289001, + "grad_norm": 4.380669375370557, + "learning_rate": 1.999836492535588e-05, + "loss": 2.626, + "step": 370 + }, + { + "epoch": 0.07141997738046539, + "grad_norm": 5.403224556955136, + "learning_rate": 1.999830805902455e-05, + "loss": 2.6733, + "step": 371 + }, + { + "epoch": 0.07161248405804076, + "grad_norm": 4.636809687245444, + "learning_rate": 1.9998250220729757e-05, + "loss": 2.6269, + "step": 372 + }, + { + "epoch": 0.07180499073561614, + "grad_norm": 5.143458583937728, + "learning_rate": 1.999819141047713e-05, + "loss": 2.6357, + "step": 373 + }, + { + "epoch": 0.07199749741319152, + "grad_norm": 4.1139032835810845, + "learning_rate": 1.9998131628272383e-05, + "loss": 2.6459, + "step": 374 + }, + { + "epoch": 0.0721900040907669, + "grad_norm": 4.693406696889142, + "learning_rate": 1.9998070874121324e-05, + "loss": 2.6304, + "step": 375 + }, + { + "epoch": 0.07238251076834228, + "grad_norm": 4.028661359851269, + "learning_rate": 1.999800914802987e-05, + "loss": 2.6701, + "step": 376 + }, + { + "epoch": 0.07238251076834228, + "lm_loss": 2.6267, + "step": 376, + "vm_loss": 0.2046 + }, + { + "epoch": 0.07238251076834228, + "lm_loss": 2.0724, + "step": 376, + "vm_loss": 0.1909 + }, + { + "epoch": 0.07238251076834228, + "lm_loss": 2.661, + "step": 376, + "vm_loss": 0.1199 + }, + { + "epoch": 0.07238251076834228, + "lm_loss": 2.3668, + "step": 376, + "vm_loss": 0.1408 + }, + { + "epoch": 0.07238251076834228, + "lm_loss": 2.7229, + "step": 376, + "vm_loss": 0.2276 + }, + { + "epoch": 0.07238251076834228, + "lm_loss": 2.6082, + "step": 376, + "vm_loss": 0.1584 + }, + { + "epoch": 0.07238251076834228, + "lm_loss": 2.5906, + "step": 376, + "vm_loss": 0.1745 + }, + { + "epoch": 0.07238251076834228, + "lm_loss": 2.4841, + "step": 376, + "vm_loss": 0.1454 + }, + { + "epoch": 0.07257501744591766, + "grad_norm": 4.189241594722147, + "learning_rate": 1.999794645000401e-05, + "loss": 2.6249, + "step": 377 + }, + { + "epoch": 0.07276752412349304, + "grad_norm": 4.2884548035205325, + "learning_rate": 1.9997882780049847e-05, + "loss": 2.6262, + "step": 378 + }, + { + "epoch": 0.07296003080106842, + "grad_norm": 4.259221190360824, + "learning_rate": 1.9997818138173564e-05, + "loss": 2.6289, + "step": 379 + }, + { + "epoch": 0.0731525374786438, + "grad_norm": 4.170838251896876, + "learning_rate": 1.999775252438145e-05, + "loss": 2.6766, + "step": 380 + }, + { + "epoch": 0.07334504415621917, + "grad_norm": 4.458199664716821, + "learning_rate": 1.999768593867988e-05, + "loss": 2.6342, + "step": 381 + }, + { + "epoch": 0.07353755083379454, + "grad_norm": 5.031529911214012, + "learning_rate": 1.9997618381075335e-05, + "loss": 2.6229, + "step": 382 + }, + { + "epoch": 0.07373005751136992, + "grad_norm": 4.122483247945086, + "learning_rate": 1.9997549851574372e-05, + "loss": 2.6343, + "step": 383 + }, + { + "epoch": 0.0739225641889453, + "grad_norm": 4.238488014723586, + "learning_rate": 1.999748035018366e-05, + "loss": 2.6133, + "step": 384 + }, + { + "epoch": 0.0739225641889453, + "lm_loss": 2.6191, + "step": 384, + "vm_loss": 0.1726 + }, + { + "epoch": 0.0739225641889453, + "lm_loss": 2.5066, + "step": 384, + "vm_loss": 0.1376 + }, + { + "epoch": 0.0739225641889453, + "lm_loss": 2.6732, + "step": 384, + "vm_loss": 0.2354 + }, + { + "epoch": 0.0739225641889453, + "lm_loss": 2.4402, + "step": 384, + "vm_loss": 0.2476 + }, + { + "epoch": 0.0739225641889453, + "lm_loss": 2.368, + "step": 384, + "vm_loss": 0.1814 + }, + { + "epoch": 0.0739225641889453, + "lm_loss": 2.3874, + "step": 384, + "vm_loss": 0.2107 + }, + { + "epoch": 0.0739225641889453, + "lm_loss": 2.2578, + "step": 384, + "vm_loss": 0.1044 + }, + { + "epoch": 0.0739225641889453, + "lm_loss": 2.3363, + "step": 384, + "vm_loss": 0.1159 + }, + { + "epoch": 0.07411507086652068, + "grad_norm": 6.204975124102081, + "learning_rate": 1.999740987690995e-05, + "loss": 2.5971, + "step": 385 + }, + { + "epoch": 0.07430757754409606, + "grad_norm": 4.54092037575377, + "learning_rate": 1.9997338431760096e-05, + "loss": 2.6696, + "step": 386 + }, + { + "epoch": 0.07450008422167144, + "grad_norm": 5.842899742795652, + "learning_rate": 1.9997266014741043e-05, + "loss": 2.5951, + "step": 387 + }, + { + "epoch": 0.07469259089924682, + "grad_norm": 4.654436316884126, + "learning_rate": 1.9997192625859835e-05, + "loss": 2.6004, + "step": 388 + }, + { + "epoch": 0.0748850975768222, + "grad_norm": 4.966095790053132, + "learning_rate": 1.9997118265123597e-05, + "loss": 2.6338, + "step": 389 + }, + { + "epoch": 0.07507760425439758, + "grad_norm": 6.255048217829622, + "learning_rate": 1.9997042932539568e-05, + "loss": 2.6397, + "step": 390 + }, + { + "epoch": 0.07527011093197296, + "grad_norm": 5.110274557779918, + "learning_rate": 1.9996966628115065e-05, + "loss": 2.6717, + "step": 391 + }, + { + "epoch": 0.07546261760954834, + "grad_norm": 5.020056293418851, + "learning_rate": 1.9996889351857505e-05, + "loss": 2.5814, + "step": 392 + }, + { + "epoch": 0.07546261760954834, + "lm_loss": 2.3172, + "step": 392, + "vm_loss": 0.1839 + }, + { + "epoch": 0.07546261760954834, + "lm_loss": 2.2725, + "step": 392, + "vm_loss": 0.1529 + }, + { + "epoch": 0.07546261760954834, + "lm_loss": 2.664, + "step": 392, + "vm_loss": 0.1455 + }, + { + "epoch": 0.07546261760954834, + "lm_loss": 2.4934, + "step": 392, + "vm_loss": 0.1866 + }, + { + "epoch": 0.07546261760954834, + "lm_loss": 2.6374, + "step": 392, + "vm_loss": 0.1812 + }, + { + "epoch": 0.07546261760954834, + "lm_loss": 2.2315, + "step": 392, + "vm_loss": 0.1771 + }, + { + "epoch": 0.07546261760954834, + "lm_loss": 2.5262, + "step": 392, + "vm_loss": 0.2159 + }, + { + "epoch": 0.07546261760954834, + "lm_loss": 2.2028, + "step": 392, + "vm_loss": 0.1845 + }, + { + "epoch": 0.0756551242871237, + "grad_norm": 6.239743206846836, + "learning_rate": 1.9996811103774402e-05, + "loss": 2.6338, + "step": 393 + }, + { + "epoch": 0.07584763096469908, + "grad_norm": 4.919758152236745, + "learning_rate": 1.9996731883873367e-05, + "loss": 2.6203, + "step": 394 + }, + { + "epoch": 0.07604013764227446, + "grad_norm": 5.911200849202589, + "learning_rate": 1.9996651692162094e-05, + "loss": 2.5953, + "step": 395 + }, + { + "epoch": 0.07623264431984984, + "grad_norm": 4.5688952866080745, + "learning_rate": 1.9996570528648386e-05, + "loss": 2.6177, + "step": 396 + }, + { + "epoch": 0.07642515099742522, + "grad_norm": 6.653621247777492, + "learning_rate": 1.9996488393340123e-05, + "loss": 2.5625, + "step": 397 + }, + { + "epoch": 0.0766176576750006, + "grad_norm": 5.109342299682795, + "learning_rate": 1.99964052862453e-05, + "loss": 2.6379, + "step": 398 + }, + { + "epoch": 0.07681016435257598, + "grad_norm": 5.1039939309649425, + "learning_rate": 1.9996321207371993e-05, + "loss": 2.6631, + "step": 399 + }, + { + "epoch": 0.07700267103015136, + "grad_norm": 5.827058092260074, + "learning_rate": 1.999623615672837e-05, + "loss": 2.5796, + "step": 400 + }, + { + "epoch": 0.07700267103015136, + "lm_loss": 2.2977, + "step": 400, + "vm_loss": 0.1672 + }, + { + "epoch": 0.07700267103015136, + "lm_loss": 2.6482, + "step": 400, + "vm_loss": 0.1375 + }, + { + "epoch": 0.07700267103015136, + "lm_loss": 2.35, + "step": 400, + "vm_loss": 0.2048 + }, + { + "epoch": 0.07700267103015136, + "lm_loss": 2.3909, + "step": 400, + "vm_loss": 0.1741 + }, + { + "epoch": 0.07700267103015136, + "lm_loss": 2.5429, + "step": 400, + "vm_loss": 0.2058 + }, + { + "epoch": 0.07700267103015136, + "lm_loss": 2.4245, + "step": 400, + "vm_loss": 0.2627 + }, + { + "epoch": 0.07700267103015136, + "lm_loss": 2.2211, + "step": 400, + "vm_loss": 0.1975 + }, + { + "epoch": 0.07700267103015136, + "lm_loss": 2.4429, + "step": 400, + "vm_loss": 0.1302 + }, + { + "epoch": 0.07719517770772674, + "grad_norm": 3.9323240756023714, + "learning_rate": 1.9996150134322705e-05, + "loss": 2.5995, + "step": 401 + }, + { + "epoch": 0.07738768438530212, + "grad_norm": 5.374848687755518, + "learning_rate": 1.999606314016336e-05, + "loss": 2.5589, + "step": 402 + }, + { + "epoch": 0.07758019106287749, + "grad_norm": 5.859427418048798, + "learning_rate": 1.9995975174258792e-05, + "loss": 2.6392, + "step": 403 + }, + { + "epoch": 0.07777269774045287, + "grad_norm": 4.26420722417678, + "learning_rate": 1.9995886236617547e-05, + "loss": 2.6561, + "step": 404 + }, + { + "epoch": 0.07796520441802825, + "grad_norm": 4.72753101872864, + "learning_rate": 1.9995796327248277e-05, + "loss": 2.6288, + "step": 405 + }, + { + "epoch": 0.07815771109560363, + "grad_norm": 3.8915702259247116, + "learning_rate": 1.9995705446159724e-05, + "loss": 2.5802, + "step": 406 + }, + { + "epoch": 0.078350217773179, + "grad_norm": 4.800953125330857, + "learning_rate": 1.9995613593360716e-05, + "loss": 2.6144, + "step": 407 + }, + { + "epoch": 0.07854272445075439, + "grad_norm": 4.942506031210902, + "learning_rate": 1.9995520768860185e-05, + "loss": 2.6788, + "step": 408 + }, + { + "epoch": 0.07854272445075439, + "lm_loss": 2.5765, + "step": 408, + "vm_loss": 0.2768 + }, + { + "epoch": 0.07854272445075439, + "lm_loss": 2.4396, + "step": 408, + "vm_loss": 0.1578 + }, + { + "epoch": 0.07854272445075439, + "lm_loss": 2.465, + "step": 408, + "vm_loss": 0.2104 + }, + { + "epoch": 0.07854272445075439, + "lm_loss": 2.4295, + "step": 408, + "vm_loss": 0.2158 + }, + { + "epoch": 0.07854272445075439, + "lm_loss": 2.4915, + "step": 408, + "vm_loss": 0.1911 + }, + { + "epoch": 0.07854272445075439, + "lm_loss": 2.3964, + "step": 408, + "vm_loss": 0.0995 + }, + { + "epoch": 0.07854272445075439, + "lm_loss": 2.2371, + "step": 408, + "vm_loss": 0.2062 + }, + { + "epoch": 0.07854272445075439, + "lm_loss": 2.342, + "step": 408, + "vm_loss": 0.3171 + }, + { + "epoch": 0.07873523112832977, + "grad_norm": 3.9198298213529337, + "learning_rate": 1.9995426972667157e-05, + "loss": 2.6314, + "step": 409 + }, + { + "epoch": 0.07892773780590515, + "grad_norm": 4.353713650508078, + "learning_rate": 1.9995332204790746e-05, + "loss": 2.5824, + "step": 410 + }, + { + "epoch": 0.07912024448348053, + "grad_norm": 5.289327370372657, + "learning_rate": 1.9995236465240168e-05, + "loss": 2.6272, + "step": 411 + }, + { + "epoch": 0.0793127511610559, + "grad_norm": 4.479585755033477, + "learning_rate": 1.999513975402473e-05, + "loss": 2.5951, + "step": 412 + }, + { + "epoch": 0.07950525783863127, + "grad_norm": 5.383623263190479, + "learning_rate": 1.9995042071153833e-05, + "loss": 2.645, + "step": 413 + }, + { + "epoch": 0.07969776451620665, + "grad_norm": 4.110562910059662, + "learning_rate": 1.999494341663697e-05, + "loss": 2.6094, + "step": 414 + }, + { + "epoch": 0.07989027119378203, + "grad_norm": 4.603499928423537, + "learning_rate": 1.9994843790483735e-05, + "loss": 2.6397, + "step": 415 + }, + { + "epoch": 0.08008277787135741, + "grad_norm": 4.732775210366881, + "learning_rate": 1.9994743192703815e-05, + "loss": 2.6386, + "step": 416 + }, + { + "epoch": 0.08008277787135741, + "lm_loss": 2.3904, + "step": 416, + "vm_loss": 0.1875 + }, + { + "epoch": 0.08008277787135741, + "lm_loss": 2.3909, + "step": 416, + "vm_loss": 0.162 + }, + { + "epoch": 0.08008277787135741, + "lm_loss": 2.7642, + "step": 416, + "vm_loss": 0.2751 + }, + { + "epoch": 0.08008277787135741, + "lm_loss": 2.3344, + "step": 416, + "vm_loss": 0.2419 + }, + { + "epoch": 0.08008277787135741, + "lm_loss": 2.3295, + "step": 416, + "vm_loss": 0.2019 + }, + { + "epoch": 0.08008277787135741, + "lm_loss": 2.4101, + "step": 416, + "vm_loss": 0.1555 + }, + { + "epoch": 0.08008277787135741, + "lm_loss": 2.4027, + "step": 416, + "vm_loss": 0.1753 + }, + { + "epoch": 0.08008277787135741, + "lm_loss": 2.3477, + "step": 416, + "vm_loss": 0.1931 + }, + { + "epoch": 0.08027528454893279, + "grad_norm": 4.017623192112295, + "learning_rate": 1.9994641623306984e-05, + "loss": 2.617, + "step": 417 + }, + { + "epoch": 0.08046779122650817, + "grad_norm": 3.8059314028452467, + "learning_rate": 1.9994539082303118e-05, + "loss": 2.6075, + "step": 418 + }, + { + "epoch": 0.08066029790408355, + "grad_norm": 4.329771878505528, + "learning_rate": 1.999443556970219e-05, + "loss": 2.6457, + "step": 419 + }, + { + "epoch": 0.08085280458165893, + "grad_norm": 4.471405981684093, + "learning_rate": 1.9994331085514255e-05, + "loss": 2.6095, + "step": 420 + }, + { + "epoch": 0.08104531125923431, + "grad_norm": 3.8388686285349407, + "learning_rate": 1.9994225629749475e-05, + "loss": 2.5931, + "step": 421 + }, + { + "epoch": 0.08123781793680969, + "grad_norm": 5.032789118058817, + "learning_rate": 1.99941192024181e-05, + "loss": 2.6351, + "step": 422 + }, + { + "epoch": 0.08143032461438506, + "grad_norm": 4.192509045251331, + "learning_rate": 1.9994011803530472e-05, + "loss": 2.6282, + "step": 423 + }, + { + "epoch": 0.08162283129196043, + "grad_norm": 4.812130088917483, + "learning_rate": 1.999390343309704e-05, + "loss": 2.6337, + "step": 424 + }, + { + "epoch": 0.08162283129196043, + "lm_loss": 2.2247, + "step": 424, + "vm_loss": 0.179 + }, + { + "epoch": 0.08162283129196043, + "lm_loss": 2.073, + "step": 424, + "vm_loss": 0.1515 + }, + { + "epoch": 0.08162283129196043, + "lm_loss": 2.4632, + "step": 424, + "vm_loss": 0.1772 + }, + { + "epoch": 0.08162283129196043, + "lm_loss": 2.4725, + "step": 424, + "vm_loss": 0.1629 + }, + { + "epoch": 0.08162283129196043, + "lm_loss": 2.7145, + "step": 424, + "vm_loss": 0.1679 + }, + { + "epoch": 0.08162283129196043, + "lm_loss": 1.8568, + "step": 424, + "vm_loss": 0.2437 + }, + { + "epoch": 0.08162283129196043, + "lm_loss": 2.5772, + "step": 424, + "vm_loss": 0.1876 + }, + { + "epoch": 0.08162283129196043, + "lm_loss": 2.4504, + "step": 424, + "vm_loss": 0.2325 + }, + { + "epoch": 0.08181533796953581, + "grad_norm": 4.972570940854288, + "learning_rate": 1.9993794091128336e-05, + "loss": 2.5861, + "step": 425 + }, + { + "epoch": 0.0820078446471112, + "grad_norm": 4.260208269673439, + "learning_rate": 1.9993683777634987e-05, + "loss": 2.6374, + "step": 426 + }, + { + "epoch": 0.08220035132468657, + "grad_norm": 5.126930289067559, + "learning_rate": 1.9993572492627716e-05, + "loss": 2.6622, + "step": 427 + }, + { + "epoch": 0.08239285800226195, + "grad_norm": 3.959883748126858, + "learning_rate": 1.9993460236117347e-05, + "loss": 2.5808, + "step": 428 + }, + { + "epoch": 0.08258536467983733, + "grad_norm": 4.136908618927918, + "learning_rate": 1.9993347008114788e-05, + "loss": 2.6109, + "step": 429 + }, + { + "epoch": 0.08277787135741271, + "grad_norm": 4.5427020652420085, + "learning_rate": 1.999323280863105e-05, + "loss": 2.6326, + "step": 430 + }, + { + "epoch": 0.0829703780349881, + "grad_norm": 4.829765682853637, + "learning_rate": 1.999311763767723e-05, + "loss": 2.5867, + "step": 431 + }, + { + "epoch": 0.08316288471256347, + "grad_norm": 3.944981469899085, + "learning_rate": 1.9993001495264527e-05, + "loss": 2.5931, + "step": 432 + }, + { + "epoch": 0.08316288471256347, + "lm_loss": 2.3912, + "step": 432, + "vm_loss": 0.1956 + }, + { + "epoch": 0.08316288471256347, + "lm_loss": 1.8917, + "step": 432, + "vm_loss": 0.1794 + }, + { + "epoch": 0.08316288471256347, + "lm_loss": 2.0879, + "step": 432, + "vm_loss": 0.1595 + }, + { + "epoch": 0.08316288471256347, + "lm_loss": 2.5934, + "step": 432, + "vm_loss": 0.1888 + }, + { + "epoch": 0.08316288471256347, + "lm_loss": 2.4406, + "step": 432, + "vm_loss": 0.1189 + }, + { + "epoch": 0.08316288471256347, + "lm_loss": 2.4875, + "step": 432, + "vm_loss": 0.2012 + }, + { + "epoch": 0.08316288471256347, + "lm_loss": 2.5105, + "step": 432, + "vm_loss": 0.1587 + }, + { + "epoch": 0.08316288471256347, + "lm_loss": 2.7124, + "step": 432, + "vm_loss": 0.1949 + }, + { + "epoch": 0.08335539139013884, + "grad_norm": 5.376991364774965, + "learning_rate": 1.999288438140423e-05, + "loss": 2.5925, + "step": 433 + }, + { + "epoch": 0.08354789806771422, + "grad_norm": 4.012543783778699, + "learning_rate": 1.9992766296107726e-05, + "loss": 2.5671, + "step": 434 + }, + { + "epoch": 0.0837404047452896, + "grad_norm": 4.407267023707095, + "learning_rate": 1.999264723938649e-05, + "loss": 2.5952, + "step": 435 + }, + { + "epoch": 0.08393291142286498, + "grad_norm": 5.033969418794113, + "learning_rate": 1.9992527211252103e-05, + "loss": 2.585, + "step": 436 + }, + { + "epoch": 0.08412541810044036, + "grad_norm": 4.843202599271736, + "learning_rate": 1.9992406211716226e-05, + "loss": 2.6513, + "step": 437 + }, + { + "epoch": 0.08431792477801574, + "grad_norm": 4.809826868336139, + "learning_rate": 1.9992284240790628e-05, + "loss": 2.642, + "step": 438 + }, + { + "epoch": 0.08451043145559112, + "grad_norm": 5.064952269495695, + "learning_rate": 1.999216129848716e-05, + "loss": 2.5747, + "step": 439 + }, + { + "epoch": 0.0847029381331665, + "grad_norm": 5.072105633259909, + "learning_rate": 1.999203738481778e-05, + "loss": 2.6159, + "step": 440 + }, + { + "epoch": 0.0847029381331665, + "lm_loss": 2.5864, + "step": 440, + "vm_loss": 0.149 + }, + { + "epoch": 0.0847029381331665, + "lm_loss": 2.3824, + "step": 440, + "vm_loss": 0.2236 + }, + { + "epoch": 0.0847029381331665, + "lm_loss": 2.6471, + "step": 440, + "vm_loss": 0.1722 + }, + { + "epoch": 0.0847029381331665, + "lm_loss": 2.3837, + "step": 440, + "vm_loss": 0.1755 + }, + { + "epoch": 0.0847029381331665, + "lm_loss": 2.3542, + "step": 440, + "vm_loss": 0.3 + }, + { + "epoch": 0.0847029381331665, + "lm_loss": 2.7226, + "step": 440, + "vm_loss": 0.1961 + }, + { + "epoch": 0.0847029381331665, + "lm_loss": 2.4981, + "step": 440, + "vm_loss": 0.2234 + }, + { + "epoch": 0.0847029381331665, + "lm_loss": 2.3625, + "step": 440, + "vm_loss": 0.2238 + }, + { + "epoch": 0.08489544481074188, + "grad_norm": 4.378055239698308, + "learning_rate": 1.9991912499794532e-05, + "loss": 2.6005, + "step": 441 + }, + { + "epoch": 0.08508795148831726, + "grad_norm": 4.915084013954159, + "learning_rate": 1.9991786643429553e-05, + "loss": 2.5852, + "step": 442 + }, + { + "epoch": 0.08528045816589262, + "grad_norm": 5.52644319746219, + "learning_rate": 1.9991659815735075e-05, + "loss": 2.6014, + "step": 443 + }, + { + "epoch": 0.085472964843468, + "grad_norm": 5.299377509082517, + "learning_rate": 1.999153201672344e-05, + "loss": 2.5953, + "step": 444 + }, + { + "epoch": 0.08566547152104338, + "grad_norm": 5.304927636380555, + "learning_rate": 1.9991403246407056e-05, + "loss": 2.5809, + "step": 445 + }, + { + "epoch": 0.08585797819861876, + "grad_norm": 4.75314150552903, + "learning_rate": 1.9991273504798456e-05, + "loss": 2.5666, + "step": 446 + }, + { + "epoch": 0.08605048487619414, + "grad_norm": 4.380128205715259, + "learning_rate": 1.9991142791910243e-05, + "loss": 2.5593, + "step": 447 + }, + { + "epoch": 0.08624299155376952, + "grad_norm": 4.318705685106545, + "learning_rate": 1.9991011107755126e-05, + "loss": 2.5956, + "step": 448 + }, + { + "epoch": 0.08624299155376952, + "lm_loss": 2.2428, + "step": 448, + "vm_loss": 0.228 + }, + { + "epoch": 0.08624299155376952, + "lm_loss": 2.4717, + "step": 448, + "vm_loss": 0.1186 + }, + { + "epoch": 0.08624299155376952, + "lm_loss": 2.4566, + "step": 448, + "vm_loss": 0.2012 + }, + { + "epoch": 0.08624299155376952, + "lm_loss": 2.3733, + "step": 448, + "vm_loss": 0.212 + }, + { + "epoch": 0.08624299155376952, + "lm_loss": 2.3158, + "step": 448, + "vm_loss": 0.1423 + }, + { + "epoch": 0.08624299155376952, + "lm_loss": 2.4755, + "step": 448, + "vm_loss": 0.2357 + }, + { + "epoch": 0.08624299155376952, + "lm_loss": 2.1083, + "step": 448, + "vm_loss": 0.2557 + }, + { + "epoch": 0.08624299155376952, + "lm_loss": 2.395, + "step": 448, + "vm_loss": 0.1879 + }, + { + "epoch": 0.0864354982313449, + "grad_norm": 5.100189768112497, + "learning_rate": 1.999087845234591e-05, + "loss": 2.5839, + "step": 449 + }, + { + "epoch": 0.08662800490892028, + "grad_norm": 4.566942736823465, + "learning_rate": 1.9990744825695485e-05, + "loss": 2.6003, + "step": 450 + }, + { + "epoch": 0.08682051158649566, + "grad_norm": 4.67341629268445, + "learning_rate": 1.9990610227816845e-05, + "loss": 2.5668, + "step": 451 + }, + { + "epoch": 0.08701301826407104, + "grad_norm": 4.137873231298762, + "learning_rate": 1.9990474658723073e-05, + "loss": 2.611, + "step": 452 + }, + { + "epoch": 0.08720552494164642, + "grad_norm": 3.8051621215802642, + "learning_rate": 1.9990338118427347e-05, + "loss": 2.5889, + "step": 453 + }, + { + "epoch": 0.08739803161922179, + "grad_norm": 5.225626633998041, + "learning_rate": 1.9990200606942943e-05, + "loss": 2.6107, + "step": 454 + }, + { + "epoch": 0.08759053829679717, + "grad_norm": 4.0165052400732755, + "learning_rate": 1.9990062124283233e-05, + "loss": 2.591, + "step": 455 + }, + { + "epoch": 0.08778304497437255, + "grad_norm": 4.661007634311046, + "learning_rate": 1.998992267046167e-05, + "loss": 2.6033, + "step": 456 + }, + { + "epoch": 0.08778304497437255, + "lm_loss": 2.5336, + "step": 456, + "vm_loss": 0.117 + }, + { + "epoch": 0.08778304497437255, + "lm_loss": 2.3767, + "step": 456, + "vm_loss": 0.2082 + }, + { + "epoch": 0.08778304497437255, + "lm_loss": 2.164, + "step": 456, + "vm_loss": 0.1636 + }, + { + "epoch": 0.08778304497437255, + "lm_loss": 2.3462, + "step": 456, + "vm_loss": 0.2143 + }, + { + "epoch": 0.08778304497437255, + "lm_loss": 2.3675, + "step": 456, + "vm_loss": 0.1613 + }, + { + "epoch": 0.08778304497437255, + "lm_loss": 2.3259, + "step": 456, + "vm_loss": 0.1577 + }, + { + "epoch": 0.08778304497437255, + "lm_loss": 2.4258, + "step": 456, + "vm_loss": 0.2104 + }, + { + "epoch": 0.08778304497437255, + "lm_loss": 2.4685, + "step": 456, + "vm_loss": 0.1802 + }, + { + "epoch": 0.08797555165194793, + "grad_norm": 4.616366067523772, + "learning_rate": 1.9989782245491816e-05, + "loss": 2.5818, + "step": 457 + }, + { + "epoch": 0.0881680583295233, + "grad_norm": 4.6502671095176975, + "learning_rate": 1.9989640849387318e-05, + "loss": 2.566, + "step": 458 + }, + { + "epoch": 0.08836056500709868, + "grad_norm": 4.566747237580736, + "learning_rate": 1.9989498482161928e-05, + "loss": 2.5371, + "step": 459 + }, + { + "epoch": 0.08855307168467406, + "grad_norm": 4.5108812834080885, + "learning_rate": 1.9989355143829483e-05, + "loss": 2.5752, + "step": 460 + }, + { + "epoch": 0.08874557836224944, + "grad_norm": 4.33856611291057, + "learning_rate": 1.9989210834403915e-05, + "loss": 2.6607, + "step": 461 + }, + { + "epoch": 0.08893808503982482, + "grad_norm": 4.284488850525381, + "learning_rate": 1.9989065553899255e-05, + "loss": 2.5752, + "step": 462 + }, + { + "epoch": 0.0891305917174002, + "grad_norm": 4.597566654401899, + "learning_rate": 1.9988919302329626e-05, + "loss": 2.5872, + "step": 463 + }, + { + "epoch": 0.08932309839497557, + "grad_norm": 4.123517766480121, + "learning_rate": 1.9988772079709244e-05, + "loss": 2.6068, + "step": 464 + }, + { + "epoch": 0.08932309839497557, + "lm_loss": 2.423, + "step": 464, + "vm_loss": 0.2284 + }, + { + "epoch": 0.08932309839497557, + "lm_loss": 2.4247, + "step": 464, + "vm_loss": 0.2015 + }, + { + "epoch": 0.08932309839497557, + "lm_loss": 2.526, + "step": 464, + "vm_loss": 0.1432 + }, + { + "epoch": 0.08932309839497557, + "lm_loss": 2.3393, + "step": 464, + "vm_loss": 0.1658 + }, + { + "epoch": 0.08932309839497557, + "lm_loss": 2.6121, + "step": 464, + "vm_loss": 0.1663 + }, + { + "epoch": 0.08932309839497557, + "lm_loss": 2.2647, + "step": 464, + "vm_loss": 0.2049 + }, + { + "epoch": 0.08932309839497557, + "lm_loss": 2.296, + "step": 464, + "vm_loss": 0.1852 + }, + { + "epoch": 0.08932309839497557, + "lm_loss": 2.4049, + "step": 464, + "vm_loss": 0.161 + }, + { + "epoch": 0.08951560507255095, + "grad_norm": 4.992122515913401, + "learning_rate": 1.9988623886052425e-05, + "loss": 2.6111, + "step": 465 + }, + { + "epoch": 0.08970811175012633, + "grad_norm": 4.106694664525713, + "learning_rate": 1.998847472137357e-05, + "loss": 2.5963, + "step": 466 + }, + { + "epoch": 0.08990061842770171, + "grad_norm": 4.874698464927889, + "learning_rate": 1.998832458568718e-05, + "loss": 2.6133, + "step": 467 + }, + { + "epoch": 0.09009312510527709, + "grad_norm": 4.217876395217213, + "learning_rate": 1.9988173479007857e-05, + "loss": 2.5935, + "step": 468 + }, + { + "epoch": 0.09028563178285247, + "grad_norm": 3.6072137765551355, + "learning_rate": 1.9988021401350285e-05, + "loss": 2.5561, + "step": 469 + }, + { + "epoch": 0.09047813846042785, + "grad_norm": 4.334501750094067, + "learning_rate": 1.9987868352729247e-05, + "loss": 2.5523, + "step": 470 + }, + { + "epoch": 0.09067064513800323, + "grad_norm": 4.749684049791752, + "learning_rate": 1.9987714333159624e-05, + "loss": 2.5596, + "step": 471 + }, + { + "epoch": 0.09086315181557861, + "grad_norm": 3.8240568308481566, + "learning_rate": 1.9987559342656387e-05, + "loss": 2.5928, + "step": 472 + }, + { + "epoch": 0.09086315181557861, + "lm_loss": 2.3947, + "step": 472, + "vm_loss": 0.1831 + }, + { + "epoch": 0.09086315181557861, + "lm_loss": 2.6004, + "step": 472, + "vm_loss": 0.1944 + }, + { + "epoch": 0.09086315181557861, + "lm_loss": 2.5982, + "step": 472, + "vm_loss": 0.1644 + }, + { + "epoch": 0.09086315181557861, + "lm_loss": 2.4629, + "step": 472, + "vm_loss": 0.1956 + }, + { + "epoch": 0.09086315181557861, + "lm_loss": 2.4183, + "step": 472, + "vm_loss": 0.2244 + }, + { + "epoch": 0.09086315181557861, + "lm_loss": 2.3785, + "step": 472, + "vm_loss": 0.2424 + }, + { + "epoch": 0.09086315181557861, + "lm_loss": 2.1391, + "step": 472, + "vm_loss": 0.1954 + }, + { + "epoch": 0.09086315181557861, + "lm_loss": 2.6103, + "step": 472, + "vm_loss": 0.137 + }, + { + "epoch": 0.09105565849315399, + "grad_norm": 4.663153349680087, + "learning_rate": 1.9987403381234604e-05, + "loss": 2.6152, + "step": 473 + }, + { + "epoch": 0.09124816517072935, + "grad_norm": 4.162147228936526, + "learning_rate": 1.9987246448909437e-05, + "loss": 2.5616, + "step": 474 + }, + { + "epoch": 0.09144067184830473, + "grad_norm": 4.424916784288473, + "learning_rate": 1.998708854569614e-05, + "loss": 2.5772, + "step": 475 + }, + { + "epoch": 0.09163317852588011, + "grad_norm": 3.634107321885927, + "learning_rate": 1.998692967161006e-05, + "loss": 2.5544, + "step": 476 + }, + { + "epoch": 0.09182568520345549, + "grad_norm": 3.8471275085344097, + "learning_rate": 1.998676982666665e-05, + "loss": 2.5619, + "step": 477 + }, + { + "epoch": 0.09201819188103087, + "grad_norm": 4.2965501168743065, + "learning_rate": 1.9986609010881447e-05, + "loss": 2.5775, + "step": 478 + }, + { + "epoch": 0.09221069855860625, + "grad_norm": 3.74049091618792, + "learning_rate": 1.998644722427008e-05, + "loss": 2.5879, + "step": 479 + }, + { + "epoch": 0.09240320523618163, + "grad_norm": 4.04950593906965, + "learning_rate": 1.998628446684828e-05, + "loss": 2.5497, + "step": 480 + }, + { + "epoch": 0.09240320523618163, + "lm_loss": 2.3726, + "step": 480, + "vm_loss": 0.1758 + }, + { + "epoch": 0.09240320523618163, + "lm_loss": 2.658, + "step": 480, + "vm_loss": 0.1125 + }, + { + "epoch": 0.09240320523618163, + "lm_loss": 2.2042, + "step": 480, + "vm_loss": 0.2009 + }, + { + "epoch": 0.09240320523618163, + "lm_loss": 2.2737, + "step": 480, + "vm_loss": 0.183 + }, + { + "epoch": 0.09240320523618163, + "lm_loss": 2.56, + "step": 480, + "vm_loss": 0.2221 + }, + { + "epoch": 0.09240320523618163, + "lm_loss": 2.6275, + "step": 480, + "vm_loss": 0.2009 + }, + { + "epoch": 0.09240320523618163, + "lm_loss": 2.4803, + "step": 480, + "vm_loss": 0.1679 + }, + { + "epoch": 0.09240320523618163, + "lm_loss": 2.4612, + "step": 480, + "vm_loss": 0.1651 + }, + { + "epoch": 0.09259571191375701, + "grad_norm": 4.168779019699929, + "learning_rate": 1.9986120738631866e-05, + "loss": 2.579, + "step": 481 + }, + { + "epoch": 0.09278821859133239, + "grad_norm": 4.520248602112067, + "learning_rate": 1.9985956039636755e-05, + "loss": 2.6008, + "step": 482 + }, + { + "epoch": 0.09298072526890777, + "grad_norm": 4.76251337913576, + "learning_rate": 1.9985790369878963e-05, + "loss": 2.5414, + "step": 483 + }, + { + "epoch": 0.09317323194648314, + "grad_norm": 4.833530895653689, + "learning_rate": 1.998562372937459e-05, + "loss": 2.5452, + "step": 484 + }, + { + "epoch": 0.09336573862405852, + "grad_norm": 4.472736876071227, + "learning_rate": 1.9985456118139836e-05, + "loss": 2.5462, + "step": 485 + }, + { + "epoch": 0.0935582453016339, + "grad_norm": 4.168361170527035, + "learning_rate": 1.9985287536190996e-05, + "loss": 2.5638, + "step": 486 + }, + { + "epoch": 0.09375075197920928, + "grad_norm": 4.587043052267748, + "learning_rate": 1.998511798354446e-05, + "loss": 2.5742, + "step": 487 + }, + { + "epoch": 0.09394325865678466, + "grad_norm": 3.8902232970242565, + "learning_rate": 1.9984947460216708e-05, + "loss": 2.6027, + "step": 488 + }, + { + "epoch": 0.09394325865678466, + "lm_loss": 2.2761, + "step": 488, + "vm_loss": 0.1691 + }, + { + "epoch": 0.09394325865678466, + "lm_loss": 2.3176, + "step": 488, + "vm_loss": 0.106 + }, + { + "epoch": 0.09394325865678466, + "lm_loss": 2.4059, + "step": 488, + "vm_loss": 0.1658 + }, + { + "epoch": 0.09394325865678466, + "lm_loss": 2.4569, + "step": 488, + "vm_loss": 0.1889 + }, + { + "epoch": 0.09394325865678466, + "lm_loss": 2.1078, + "step": 488, + "vm_loss": 0.2286 + }, + { + "epoch": 0.09394325865678466, + "lm_loss": 2.3137, + "step": 488, + "vm_loss": 0.1961 + }, + { + "epoch": 0.09394325865678466, + "lm_loss": 2.3958, + "step": 488, + "vm_loss": 0.1983 + }, + { + "epoch": 0.09394325865678466, + "lm_loss": 2.6177, + "step": 488, + "vm_loss": 0.1751 + }, + { + "epoch": 0.09413576533436004, + "grad_norm": 3.789531341506008, + "learning_rate": 1.998477596622432e-05, + "loss": 2.5858, + "step": 489 + }, + { + "epoch": 0.09432827201193542, + "grad_norm": 4.721478304358618, + "learning_rate": 1.9984603501583963e-05, + "loss": 2.5388, + "step": 490 + }, + { + "epoch": 0.0945207786895108, + "grad_norm": 3.4771208257789743, + "learning_rate": 1.9984430066312407e-05, + "loss": 2.6125, + "step": 491 + }, + { + "epoch": 0.09471328536708618, + "grad_norm": 4.2134738863418315, + "learning_rate": 1.9984255660426508e-05, + "loss": 2.6002, + "step": 492 + }, + { + "epoch": 0.09490579204466155, + "grad_norm": 4.312397139640785, + "learning_rate": 1.9984080283943224e-05, + "loss": 2.5406, + "step": 493 + }, + { + "epoch": 0.09509829872223692, + "grad_norm": 4.394364013461965, + "learning_rate": 1.9983903936879604e-05, + "loss": 2.5462, + "step": 494 + }, + { + "epoch": 0.0952908053998123, + "grad_norm": 4.800268761098634, + "learning_rate": 1.9983726619252792e-05, + "loss": 2.5699, + "step": 495 + }, + { + "epoch": 0.09548331207738768, + "grad_norm": 4.240062053830481, + "learning_rate": 1.998354833108002e-05, + "loss": 2.5816, + "step": 496 + }, + { + "epoch": 0.09548331207738768, + "lm_loss": 2.4354, + "step": 496, + "vm_loss": 0.2584 + }, + { + "epoch": 0.09548331207738768, + "lm_loss": 2.3032, + "step": 496, + "vm_loss": 0.1711 + }, + { + "epoch": 0.09548331207738768, + "lm_loss": 2.131, + "step": 496, + "vm_loss": 0.1004 + }, + { + "epoch": 0.09548331207738768, + "lm_loss": 2.4127, + "step": 496, + "vm_loss": 0.1785 + }, + { + "epoch": 0.09548331207738768, + "lm_loss": 2.6089, + "step": 496, + "vm_loss": 0.1387 + }, + { + "epoch": 0.09548331207738768, + "lm_loss": 2.3863, + "step": 496, + "vm_loss": 0.2124 + }, + { + "epoch": 0.09548331207738768, + "lm_loss": 2.5855, + "step": 496, + "vm_loss": 0.1672 + }, + { + "epoch": 0.09548331207738768, + "lm_loss": 2.3004, + "step": 496, + "vm_loss": 0.2103 + }, + { + "epoch": 0.09567581875496306, + "grad_norm": 4.447937835853206, + "learning_rate": 1.998336907237862e-05, + "loss": 2.5998, + "step": 497 + }, + { + "epoch": 0.09586832543253844, + "grad_norm": 4.824443198360054, + "learning_rate": 1.9983188843166028e-05, + "loss": 2.5347, + "step": 498 + }, + { + "epoch": 0.09606083211011382, + "grad_norm": 4.820549742018637, + "learning_rate": 1.9983007643459757e-05, + "loss": 2.5223, + "step": 499 + }, + { + "epoch": 0.0962533387876892, + "grad_norm": 5.713518136250507, + "learning_rate": 1.998282547327742e-05, + "loss": 2.572, + "step": 500 + }, + { + "epoch": 0.09644584546526458, + "grad_norm": 4.98156917712925, + "learning_rate": 1.998264233263673e-05, + "loss": 2.5726, + "step": 501 + }, + { + "epoch": 0.09663835214283996, + "grad_norm": 4.978553087117611, + "learning_rate": 1.9982458221555493e-05, + "loss": 2.5697, + "step": 502 + }, + { + "epoch": 0.09683085882041534, + "grad_norm": 4.663907646726513, + "learning_rate": 1.9982273140051602e-05, + "loss": 2.5794, + "step": 503 + }, + { + "epoch": 0.09702336549799072, + "grad_norm": 4.095342225561139, + "learning_rate": 1.998208708814305e-05, + "loss": 2.6095, + "step": 504 + }, + { + "epoch": 0.09702336549799072, + "lm_loss": 2.2248, + "step": 504, + "vm_loss": 0.1751 + }, + { + "epoch": 0.09702336549799072, + "lm_loss": 2.2301, + "step": 504, + "vm_loss": 0.2263 + }, + { + "epoch": 0.09702336549799072, + "lm_loss": 2.56, + "step": 504, + "vm_loss": 0.1618 + }, + { + "epoch": 0.09702336549799072, + "lm_loss": 2.5923, + "step": 504, + "vm_loss": 0.1728 + }, + { + "epoch": 0.09702336549799072, + "lm_loss": 2.6183, + "step": 504, + "vm_loss": 0.1991 + }, + { + "epoch": 0.09702336549799072, + "lm_loss": 2.3742, + "step": 504, + "vm_loss": 0.1322 + }, + { + "epoch": 0.09702336549799072, + "lm_loss": 2.4218, + "step": 504, + "vm_loss": 0.1578 + }, + { + "epoch": 0.09702336549799072, + "lm_loss": 2.2415, + "step": 504, + "vm_loss": 0.1642 + }, + { + "epoch": 0.09721587217556608, + "grad_norm": 4.483308439993457, + "learning_rate": 1.9981900065847927e-05, + "loss": 2.5545, + "step": 505 + }, + { + "epoch": 0.09740837885314146, + "grad_norm": 4.593686805486022, + "learning_rate": 1.998171207318441e-05, + "loss": 2.5599, + "step": 506 + }, + { + "epoch": 0.09760088553071684, + "grad_norm": 4.312691222179744, + "learning_rate": 1.9981523110170778e-05, + "loss": 2.5523, + "step": 507 + }, + { + "epoch": 0.09779339220829222, + "grad_norm": 4.216721216018249, + "learning_rate": 1.9981333176825397e-05, + "loss": 2.5527, + "step": 508 + }, + { + "epoch": 0.0979858988858676, + "grad_norm": 4.602239657520409, + "learning_rate": 1.9981142273166735e-05, + "loss": 2.6171, + "step": 509 + }, + { + "epoch": 0.09817840556344298, + "grad_norm": 4.195913233258438, + "learning_rate": 1.9980950399213344e-05, + "loss": 2.5911, + "step": 510 + }, + { + "epoch": 0.09837091224101836, + "grad_norm": 4.753370484619016, + "learning_rate": 1.9980757554983884e-05, + "loss": 2.59, + "step": 511 + }, + { + "epoch": 0.09856341891859374, + "grad_norm": 4.2637981024242935, + "learning_rate": 1.9980563740497096e-05, + "loss": 2.5816, + "step": 512 + }, + { + "epoch": 0.09856341891859374, + "lm_loss": 1.8557, + "step": 512, + "vm_loss": 0.1571 + }, + { + "epoch": 0.09856341891859374, + "lm_loss": 2.3957, + "step": 512, + "vm_loss": 0.1767 + }, + { + "epoch": 0.09856341891859374, + "lm_loss": 2.4068, + "step": 512, + "vm_loss": 0.1463 + }, + { + "epoch": 0.09856341891859374, + "lm_loss": 2.3137, + "step": 512, + "vm_loss": 0.1974 + }, + { + "epoch": 0.09856341891859374, + "lm_loss": 2.5145, + "step": 512, + "vm_loss": 0.1226 + }, + { + "epoch": 0.09856341891859374, + "lm_loss": 2.3018, + "step": 512, + "vm_loss": 0.1877 + }, + { + "epoch": 0.09856341891859374, + "lm_loss": 2.4041, + "step": 512, + "vm_loss": 0.175 + }, + { + "epoch": 0.09856341891859374, + "lm_loss": 2.5427, + "step": 512, + "vm_loss": 0.1949 + }, + { + "epoch": 0.09875592559616912, + "grad_norm": 3.9858606369345804, + "learning_rate": 1.9980368955771824e-05, + "loss": 2.5272, + "step": 513 + }, + { + "epoch": 0.0989484322737445, + "grad_norm": 4.2679744558612525, + "learning_rate": 1.9980173200827004e-05, + "loss": 2.5449, + "step": 514 + }, + { + "epoch": 0.09914093895131987, + "grad_norm": 3.980811997729134, + "learning_rate": 1.9979976475681665e-05, + "loss": 2.5657, + "step": 515 + }, + { + "epoch": 0.09933344562889525, + "grad_norm": 4.642889480523902, + "learning_rate": 1.9979778780354935e-05, + "loss": 2.5096, + "step": 516 + }, + { + "epoch": 0.09952595230647063, + "grad_norm": 3.3432377538770037, + "learning_rate": 1.9979580114866022e-05, + "loss": 2.572, + "step": 517 + }, + { + "epoch": 0.09971845898404601, + "grad_norm": 4.3653302984121325, + "learning_rate": 1.9979380479234253e-05, + "loss": 2.5336, + "step": 518 + }, + { + "epoch": 0.09991096566162139, + "grad_norm": 4.272995148347355, + "learning_rate": 1.9979179873479023e-05, + "loss": 2.6058, + "step": 519 + }, + { + "epoch": 0.10010347233919677, + "grad_norm": 4.221414867985027, + "learning_rate": 1.997897829761984e-05, + "loss": 2.5624, + "step": 520 + }, + { + "epoch": 0.10010347233919677, + "lm_loss": 2.5836, + "step": 520, + "vm_loss": 0.1734 + }, + { + "epoch": 0.10010347233919677, + "lm_loss": 2.1159, + "step": 520, + "vm_loss": 0.1922 + }, + { + "epoch": 0.10010347233919677, + "lm_loss": 2.4777, + "step": 520, + "vm_loss": 0.203 + }, + { + "epoch": 0.10010347233919677, + "lm_loss": 2.4033, + "step": 520, + "vm_loss": 0.1215 + }, + { + "epoch": 0.10010347233919677, + "lm_loss": 2.1587, + "step": 520, + "vm_loss": 0.1628 + }, + { + "epoch": 0.10010347233919677, + "lm_loss": 2.5878, + "step": 520, + "vm_loss": 0.2251 + }, + { + "epoch": 0.10010347233919677, + "lm_loss": 2.5319, + "step": 520, + "vm_loss": 0.1835 + }, + { + "epoch": 0.10010347233919677, + "lm_loss": 2.5733, + "step": 520, + "vm_loss": 0.1896 + }, + { + "epoch": 0.10029597901677215, + "grad_norm": 4.6442495897955665, + "learning_rate": 1.99787757516763e-05, + "loss": 2.5459, + "step": 521 + }, + { + "epoch": 0.10048848569434753, + "grad_norm": 5.069089669449857, + "learning_rate": 1.9978572235668085e-05, + "loss": 2.544, + "step": 522 + }, + { + "epoch": 0.1006809923719229, + "grad_norm": 4.463885809538296, + "learning_rate": 1.997836774961499e-05, + "loss": 2.5587, + "step": 523 + }, + { + "epoch": 0.10087349904949829, + "grad_norm": 4.034778740896075, + "learning_rate": 1.997816229353689e-05, + "loss": 2.593, + "step": 524 + }, + { + "epoch": 0.10106600572707365, + "grad_norm": 4.187143655625298, + "learning_rate": 1.9977955867453754e-05, + "loss": 2.5303, + "step": 525 + }, + { + "epoch": 0.10125851240464903, + "grad_norm": 4.6057469976851335, + "learning_rate": 1.9977748471385654e-05, + "loss": 2.5759, + "step": 526 + }, + { + "epoch": 0.10145101908222441, + "grad_norm": 3.9742265378043165, + "learning_rate": 1.997754010535275e-05, + "loss": 2.5935, + "step": 527 + }, + { + "epoch": 0.10164352575979979, + "grad_norm": 5.629343433645512, + "learning_rate": 1.9977330769375298e-05, + "loss": 2.594, + "step": 528 + }, + { + "epoch": 0.10164352575979979, + "lm_loss": 2.3092, + "step": 528, + "vm_loss": 0.1726 + }, + { + "epoch": 0.10164352575979979, + "lm_loss": 2.235, + "step": 528, + "vm_loss": 0.2396 + }, + { + "epoch": 0.10164352575979979, + "lm_loss": 2.1688, + "step": 528, + "vm_loss": 0.1011 + }, + { + "epoch": 0.10164352575979979, + "lm_loss": 2.789, + "step": 528, + "vm_loss": 0.1998 + }, + { + "epoch": 0.10164352575979979, + "lm_loss": 2.2814, + "step": 528, + "vm_loss": 0.1941 + }, + { + "epoch": 0.10164352575979979, + "lm_loss": 2.2993, + "step": 528, + "vm_loss": 0.2645 + }, + { + "epoch": 0.10164352575979979, + "lm_loss": 2.4847, + "step": 528, + "vm_loss": 0.1569 + }, + { + "epoch": 0.10164352575979979, + "lm_loss": 2.4784, + "step": 528, + "vm_loss": 0.1574 + }, + { + "epoch": 0.10183603243737517, + "grad_norm": 4.2954753814477185, + "learning_rate": 1.9977120463473647e-05, + "loss": 2.5763, + "step": 529 + }, + { + "epoch": 0.10202853911495055, + "grad_norm": 4.583368002012516, + "learning_rate": 1.9976909187668247e-05, + "loss": 2.5448, + "step": 530 + }, + { + "epoch": 0.10222104579252593, + "grad_norm": 4.543164436570776, + "learning_rate": 1.9976696941979625e-05, + "loss": 2.5663, + "step": 531 + }, + { + "epoch": 0.10241355247010131, + "grad_norm": 4.258242832977958, + "learning_rate": 1.9976483726428423e-05, + "loss": 2.5309, + "step": 532 + }, + { + "epoch": 0.10260605914767669, + "grad_norm": 4.936872883128827, + "learning_rate": 1.9976269541035365e-05, + "loss": 2.5791, + "step": 533 + }, + { + "epoch": 0.10279856582525207, + "grad_norm": 4.294882683273078, + "learning_rate": 1.997605438582128e-05, + "loss": 2.5756, + "step": 534 + }, + { + "epoch": 0.10299107250282744, + "grad_norm": 4.47189972091334, + "learning_rate": 1.9975838260807072e-05, + "loss": 2.5751, + "step": 535 + }, + { + "epoch": 0.10318357918040282, + "grad_norm": 5.534645060445187, + "learning_rate": 1.9975621166013758e-05, + "loss": 2.5431, + "step": 536 + }, + { + "epoch": 0.10318357918040282, + "lm_loss": 2.4895, + "step": 536, + "vm_loss": 0.1822 + }, + { + "epoch": 0.10318357918040282, + "lm_loss": 2.4698, + "step": 536, + "vm_loss": 0.1789 + }, + { + "epoch": 0.10318357918040282, + "lm_loss": 2.4426, + "step": 536, + "vm_loss": 0.1392 + }, + { + "epoch": 0.10318357918040282, + "lm_loss": 2.5139, + "step": 536, + "vm_loss": 0.2073 + }, + { + "epoch": 0.10318357918040282, + "lm_loss": 2.5653, + "step": 536, + "vm_loss": 0.1895 + }, + { + "epoch": 0.10318357918040282, + "lm_loss": 2.3175, + "step": 536, + "vm_loss": 0.193 + }, + { + "epoch": 0.10318357918040282, + "lm_loss": 2.585, + "step": 536, + "vm_loss": 0.2021 + }, + { + "epoch": 0.10318357918040282, + "lm_loss": 2.4231, + "step": 536, + "vm_loss": 0.1507 + }, + { + "epoch": 0.1033760858579782, + "grad_norm": 4.742741672338863, + "learning_rate": 1.997540310146244e-05, + "loss": 2.6073, + "step": 537 + }, + { + "epoch": 0.10356859253555357, + "grad_norm": 4.7129243689926925, + "learning_rate": 1.997518406717432e-05, + "loss": 2.5483, + "step": 538 + }, + { + "epoch": 0.10376109921312895, + "grad_norm": 5.111649367214757, + "learning_rate": 1.997496406317069e-05, + "loss": 2.5007, + "step": 539 + }, + { + "epoch": 0.10395360589070433, + "grad_norm": 5.272156736996118, + "learning_rate": 1.9974743089472935e-05, + "loss": 2.5842, + "step": 540 + }, + { + "epoch": 0.10414611256827971, + "grad_norm": 4.3925016569700155, + "learning_rate": 1.9974521146102535e-05, + "loss": 2.5717, + "step": 541 + }, + { + "epoch": 0.1043386192458551, + "grad_norm": 5.31383252091099, + "learning_rate": 1.9974298233081072e-05, + "loss": 2.5776, + "step": 542 + }, + { + "epoch": 0.10453112592343047, + "grad_norm": 4.395161100779029, + "learning_rate": 1.997407435043021e-05, + "loss": 2.5521, + "step": 543 + }, + { + "epoch": 0.10472363260100585, + "grad_norm": 5.2056347769229525, + "learning_rate": 1.9973849498171717e-05, + "loss": 2.5304, + "step": 544 + }, + { + "epoch": 0.10472363260100585, + "lm_loss": 2.303, + "step": 544, + "vm_loss": 0.1359 + }, + { + "epoch": 0.10472363260100585, + "lm_loss": 2.3496, + "step": 544, + "vm_loss": 0.1633 + }, + { + "epoch": 0.10472363260100585, + "lm_loss": 2.3177, + "step": 544, + "vm_loss": 0.2901 + }, + { + "epoch": 0.10472363260100585, + "lm_loss": 1.9506, + "step": 544, + "vm_loss": 0.1224 + }, + { + "epoch": 0.10472363260100585, + "lm_loss": 2.4513, + "step": 544, + "vm_loss": 0.2269 + }, + { + "epoch": 0.10472363260100585, + "lm_loss": 2.3186, + "step": 544, + "vm_loss": 0.1846 + }, + { + "epoch": 0.10472363260100585, + "lm_loss": 2.3219, + "step": 544, + "vm_loss": 0.2259 + }, + { + "epoch": 0.10472363260100585, + "lm_loss": 2.381, + "step": 544, + "vm_loss": 0.1353 + }, + { + "epoch": 0.10491613927858122, + "grad_norm": 4.452461920680797, + "learning_rate": 1.9973623676327447e-05, + "loss": 2.5622, + "step": 545 + }, + { + "epoch": 0.1051086459561566, + "grad_norm": 4.538593463208113, + "learning_rate": 1.997339688491936e-05, + "loss": 2.6382, + "step": 546 + }, + { + "epoch": 0.10530115263373198, + "grad_norm": 5.829048104888673, + "learning_rate": 1.9973169123969495e-05, + "loss": 2.5312, + "step": 547 + }, + { + "epoch": 0.10549365931130736, + "grad_norm": 4.0765426592671, + "learning_rate": 1.9972940393499997e-05, + "loss": 2.5475, + "step": 548 + }, + { + "epoch": 0.10568616598888274, + "grad_norm": 4.888477649510067, + "learning_rate": 1.9972710693533105e-05, + "loss": 2.5142, + "step": 549 + }, + { + "epoch": 0.10587867266645812, + "grad_norm": 4.364645119237847, + "learning_rate": 1.9972480024091143e-05, + "loss": 2.5429, + "step": 550 + }, + { + "epoch": 0.1060711793440335, + "grad_norm": 4.554702935944056, + "learning_rate": 1.9972248385196537e-05, + "loss": 2.5523, + "step": 551 + }, + { + "epoch": 0.10626368602160888, + "grad_norm": 4.562261967753434, + "learning_rate": 1.9972015776871804e-05, + "loss": 2.5612, + "step": 552 + }, + { + "epoch": 0.10626368602160888, + "lm_loss": 2.2495, + "step": 552, + "vm_loss": 0.1502 + }, + { + "epoch": 0.10626368602160888, + "lm_loss": 2.1304, + "step": 552, + "vm_loss": 0.1712 + }, + { + "epoch": 0.10626368602160888, + "lm_loss": 2.4615, + "step": 552, + "vm_loss": 0.1535 + }, + { + "epoch": 0.10626368602160888, + "lm_loss": 2.2341, + "step": 552, + "vm_loss": 0.2755 + }, + { + "epoch": 0.10626368602160888, + "lm_loss": 2.2722, + "step": 552, + "vm_loss": 0.1617 + }, + { + "epoch": 0.10626368602160888, + "lm_loss": 2.5001, + "step": 552, + "vm_loss": 0.233 + }, + { + "epoch": 0.10626368602160888, + "lm_loss": 2.1186, + "step": 552, + "vm_loss": 0.1459 + }, + { + "epoch": 0.10626368602160888, + "lm_loss": 1.8815, + "step": 552, + "vm_loss": 0.1585 + }, + { + "epoch": 0.10645619269918426, + "grad_norm": 4.029308585651912, + "learning_rate": 1.9971782199139565e-05, + "loss": 2.5253, + "step": 553 + }, + { + "epoch": 0.10664869937675964, + "grad_norm": 4.617136440972574, + "learning_rate": 1.9971547652022512e-05, + "loss": 2.5554, + "step": 554 + }, + { + "epoch": 0.106841206054335, + "grad_norm": 4.451605206367319, + "learning_rate": 1.9971312135543458e-05, + "loss": 2.5417, + "step": 555 + }, + { + "epoch": 0.10703371273191038, + "grad_norm": 3.8717236897340293, + "learning_rate": 1.9971075649725293e-05, + "loss": 2.5716, + "step": 556 + }, + { + "epoch": 0.10722621940948576, + "grad_norm": 4.904277182359794, + "learning_rate": 1.997083819459101e-05, + "loss": 2.546, + "step": 557 + }, + { + "epoch": 0.10741872608706114, + "grad_norm": 4.588915419700571, + "learning_rate": 1.9970599770163686e-05, + "loss": 2.5618, + "step": 558 + }, + { + "epoch": 0.10761123276463652, + "grad_norm": 4.54865545281532, + "learning_rate": 1.9970360376466506e-05, + "loss": 2.5528, + "step": 559 + }, + { + "epoch": 0.1078037394422119, + "grad_norm": 5.237608747956109, + "learning_rate": 1.9970120013522737e-05, + "loss": 2.5461, + "step": 560 + }, + { + "epoch": 0.1078037394422119, + "lm_loss": 2.2993, + "step": 560, + "vm_loss": 0.1628 + }, + { + "epoch": 0.1078037394422119, + "lm_loss": 2.0733, + "step": 560, + "vm_loss": 0.1442 + }, + { + "epoch": 0.1078037394422119, + "lm_loss": 2.0348, + "step": 560, + "vm_loss": 0.1293 + }, + { + "epoch": 0.1078037394422119, + "lm_loss": 2.3537, + "step": 560, + "vm_loss": 0.2592 + }, + { + "epoch": 0.1078037394422119, + "lm_loss": 2.2235, + "step": 560, + "vm_loss": 0.1084 + }, + { + "epoch": 0.1078037394422119, + "lm_loss": 2.2353, + "step": 560, + "vm_loss": 0.1531 + }, + { + "epoch": 0.1078037394422119, + "lm_loss": 2.2617, + "step": 560, + "vm_loss": 0.2525 + }, + { + "epoch": 0.1078037394422119, + "lm_loss": 2.2282, + "step": 560, + "vm_loss": 0.1504 + }, + { + "epoch": 0.10799624611978728, + "grad_norm": 4.155256976357799, + "learning_rate": 1.996987868135575e-05, + "loss": 2.5055, + "step": 561 + }, + { + "epoch": 0.10818875279736266, + "grad_norm": 5.164610891613552, + "learning_rate": 1.9969636379989e-05, + "loss": 2.5875, + "step": 562 + }, + { + "epoch": 0.10838125947493804, + "grad_norm": 5.18307245733418, + "learning_rate": 1.9969393109446052e-05, + "loss": 2.5472, + "step": 563 + }, + { + "epoch": 0.10857376615251342, + "grad_norm": 5.362741885855717, + "learning_rate": 1.996914886975054e-05, + "loss": 2.5797, + "step": 564 + }, + { + "epoch": 0.1087662728300888, + "grad_norm": 4.295717857849304, + "learning_rate": 1.996890366092622e-05, + "loss": 2.4908, + "step": 565 + }, + { + "epoch": 0.10895877950766417, + "grad_norm": 4.491151245485607, + "learning_rate": 1.9968657482996923e-05, + "loss": 2.5713, + "step": 566 + }, + { + "epoch": 0.10915128618523955, + "grad_norm": 3.5870286838918948, + "learning_rate": 1.996841033598658e-05, + "loss": 2.5951, + "step": 567 + }, + { + "epoch": 0.10934379286281493, + "grad_norm": 5.530468277715157, + "learning_rate": 1.9968162219919222e-05, + "loss": 2.5357, + "step": 568 + }, + { + "epoch": 0.10934379286281493, + "lm_loss": 2.2972, + "step": 568, + "vm_loss": 0.1815 + }, + { + "epoch": 0.10934379286281493, + "lm_loss": 2.4349, + "step": 568, + "vm_loss": 0.2109 + }, + { + "epoch": 0.10934379286281493, + "lm_loss": 2.6446, + "step": 568, + "vm_loss": 0.181 + }, + { + "epoch": 0.10934379286281493, + "lm_loss": 2.1997, + "step": 568, + "vm_loss": 0.1642 + }, + { + "epoch": 0.10934379286281493, + "lm_loss": 2.2931, + "step": 568, + "vm_loss": 0.225 + }, + { + "epoch": 0.10934379286281493, + "lm_loss": 2.4268, + "step": 568, + "vm_loss": 0.1656 + }, + { + "epoch": 0.10934379286281493, + "lm_loss": 2.4335, + "step": 568, + "vm_loss": 0.1749 + }, + { + "epoch": 0.10934379286281493, + "lm_loss": 2.517, + "step": 568, + "vm_loss": 0.194 + }, + { + "epoch": 0.1095362995403903, + "grad_norm": 4.306208266188474, + "learning_rate": 1.996791313481897e-05, + "loss": 2.5886, + "step": 569 + }, + { + "epoch": 0.10972880621796569, + "grad_norm": 4.84155708381945, + "learning_rate": 1.9967663080710026e-05, + "loss": 2.5134, + "step": 570 + }, + { + "epoch": 0.10992131289554107, + "grad_norm": 4.450894152483306, + "learning_rate": 1.9967412057616713e-05, + "loss": 2.507, + "step": 571 + }, + { + "epoch": 0.11011381957311644, + "grad_norm": 5.182548491387447, + "learning_rate": 1.9967160065563425e-05, + "loss": 2.5399, + "step": 572 + }, + { + "epoch": 0.11030632625069182, + "grad_norm": 4.405104545059833, + "learning_rate": 1.9966907104574663e-05, + "loss": 2.562, + "step": 573 + }, + { + "epoch": 0.1104988329282672, + "grad_norm": 5.0551007934908485, + "learning_rate": 1.9966653174675014e-05, + "loss": 2.5621, + "step": 574 + }, + { + "epoch": 0.11069133960584258, + "grad_norm": 4.499112987514951, + "learning_rate": 1.996639827588917e-05, + "loss": 2.5634, + "step": 575 + }, + { + "epoch": 0.11088384628341795, + "grad_norm": 4.4879177165491315, + "learning_rate": 1.99661424082419e-05, + "loss": 2.4971, + "step": 576 + }, + { + "epoch": 0.11088384628341795, + "lm_loss": 2.4575, + "step": 576, + "vm_loss": 0.1487 + }, + { + "epoch": 0.11088384628341795, + "lm_loss": 2.0256, + "step": 576, + "vm_loss": 0.0832 + }, + { + "epoch": 0.11088384628341795, + "lm_loss": 2.5426, + "step": 576, + "vm_loss": 0.1759 + }, + { + "epoch": 0.11088384628341795, + "lm_loss": 2.3797, + "step": 576, + "vm_loss": 0.1893 + }, + { + "epoch": 0.11088384628341795, + "lm_loss": 2.5603, + "step": 576, + "vm_loss": 0.2099 + }, + { + "epoch": 0.11088384628341795, + "lm_loss": 1.9635, + "step": 576, + "vm_loss": 0.1369 + }, + { + "epoch": 0.11088384628341795, + "lm_loss": 2.4542, + "step": 576, + "vm_loss": 0.1634 + }, + { + "epoch": 0.11088384628341795, + "lm_loss": 2.2695, + "step": 576, + "vm_loss": 0.1326 + }, + { + "epoch": 0.11107635296099333, + "grad_norm": 4.4604461446502555, + "learning_rate": 1.996588557175809e-05, + "loss": 2.5108, + "step": 577 + }, + { + "epoch": 0.11126885963856871, + "grad_norm": 3.6797171128629524, + "learning_rate": 1.9965627766462697e-05, + "loss": 2.5279, + "step": 578 + }, + { + "epoch": 0.11146136631614409, + "grad_norm": 3.9133489619595583, + "learning_rate": 1.996536899238079e-05, + "loss": 2.5544, + "step": 579 + }, + { + "epoch": 0.11165387299371947, + "grad_norm": 4.376642195522719, + "learning_rate": 1.9965109249537523e-05, + "loss": 2.5474, + "step": 580 + }, + { + "epoch": 0.11184637967129485, + "grad_norm": 3.8807954406788885, + "learning_rate": 1.9964848537958143e-05, + "loss": 2.5441, + "step": 581 + }, + { + "epoch": 0.11203888634887023, + "grad_norm": 4.443795091187734, + "learning_rate": 1.9964586857668002e-05, + "loss": 2.4777, + "step": 582 + }, + { + "epoch": 0.11223139302644561, + "grad_norm": 4.1714757307448815, + "learning_rate": 1.996432420869253e-05, + "loss": 2.5991, + "step": 583 + }, + { + "epoch": 0.11242389970402099, + "grad_norm": 4.0765940747119185, + "learning_rate": 1.996406059105726e-05, + "loss": 2.537, + "step": 584 + }, + { + "epoch": 0.11242389970402099, + "lm_loss": 2.4565, + "step": 584, + "vm_loss": 0.1827 + }, + { + "epoch": 0.11242389970402099, + "lm_loss": 2.2496, + "step": 584, + "vm_loss": 0.2172 + }, + { + "epoch": 0.11242389970402099, + "lm_loss": 2.2324, + "step": 584, + "vm_loss": 0.2361 + }, + { + "epoch": 0.11242389970402099, + "lm_loss": 2.4206, + "step": 584, + "vm_loss": 0.1575 + }, + { + "epoch": 0.11242389970402099, + "lm_loss": 2.0797, + "step": 584, + "vm_loss": 0.1843 + }, + { + "epoch": 0.11242389970402099, + "lm_loss": 2.4471, + "step": 584, + "vm_loss": 0.1716 + }, + { + "epoch": 0.11242389970402099, + "lm_loss": 1.8408, + "step": 584, + "vm_loss": 0.2423 + }, + { + "epoch": 0.11242389970402099, + "lm_loss": 2.0841, + "step": 584, + "vm_loss": 0.1713 + }, + { + "epoch": 0.11261640638159637, + "grad_norm": 4.56036467020508, + "learning_rate": 1.996379600478783e-05, + "loss": 2.4848, + "step": 585 + }, + { + "epoch": 0.11280891305917173, + "grad_norm": 4.387155945380377, + "learning_rate": 1.9963530449909954e-05, + "loss": 2.5938, + "step": 586 + }, + { + "epoch": 0.11300141973674711, + "grad_norm": 3.4866136531740146, + "learning_rate": 1.9963263926449446e-05, + "loss": 2.5449, + "step": 587 + }, + { + "epoch": 0.1131939264143225, + "grad_norm": 5.749970050845133, + "learning_rate": 1.9962996434432215e-05, + "loss": 2.5848, + "step": 588 + }, + { + "epoch": 0.11338643309189787, + "grad_norm": 4.3566581971219, + "learning_rate": 1.996272797388427e-05, + "loss": 2.558, + "step": 589 + }, + { + "epoch": 0.11357893976947325, + "grad_norm": 4.755457770589791, + "learning_rate": 1.9962458544831703e-05, + "loss": 2.5339, + "step": 590 + }, + { + "epoch": 0.11377144644704863, + "grad_norm": 4.487539284555335, + "learning_rate": 1.996218814730071e-05, + "loss": 2.571, + "step": 591 + }, + { + "epoch": 0.11396395312462401, + "grad_norm": 4.858345835334981, + "learning_rate": 1.9961916781317576e-05, + "loss": 2.5097, + "step": 592 + }, + { + "epoch": 0.11396395312462401, + "lm_loss": 2.1724, + "step": 592, + "vm_loss": 0.1132 + }, + { + "epoch": 0.11396395312462401, + "lm_loss": 2.1021, + "step": 592, + "vm_loss": 0.1762 + }, + { + "epoch": 0.11396395312462401, + "lm_loss": 2.1455, + "step": 592, + "vm_loss": 0.1669 + }, + { + "epoch": 0.11396395312462401, + "lm_loss": 2.603, + "step": 592, + "vm_loss": 0.135 + }, + { + "epoch": 0.11396395312462401, + "lm_loss": 2.3863, + "step": 592, + "vm_loss": 0.1551 + }, + { + "epoch": 0.11396395312462401, + "lm_loss": 2.4888, + "step": 592, + "vm_loss": 0.196 + }, + { + "epoch": 0.11396395312462401, + "lm_loss": 2.3053, + "step": 592, + "vm_loss": 0.1846 + }, + { + "epoch": 0.11396395312462401, + "lm_loss": 2.1713, + "step": 592, + "vm_loss": 0.249 + }, + { + "epoch": 0.11415645980219939, + "grad_norm": 3.987979843075841, + "learning_rate": 1.996164444690868e-05, + "loss": 2.531, + "step": 593 + }, + { + "epoch": 0.11434896647977477, + "grad_norm": 4.176283039869517, + "learning_rate": 1.9961371144100497e-05, + "loss": 2.5312, + "step": 594 + }, + { + "epoch": 0.11454147315735015, + "grad_norm": 5.447252848070803, + "learning_rate": 1.9961096872919596e-05, + "loss": 2.5583, + "step": 595 + }, + { + "epoch": 0.11473397983492552, + "grad_norm": 5.334194807974677, + "learning_rate": 1.9960821633392635e-05, + "loss": 2.4812, + "step": 596 + }, + { + "epoch": 0.1149264865125009, + "grad_norm": 6.811477152548263, + "learning_rate": 1.996054542554638e-05, + "loss": 2.5601, + "step": 597 + }, + { + "epoch": 0.11511899319007628, + "grad_norm": 4.334357344693945, + "learning_rate": 1.9960268249407674e-05, + "loss": 2.5511, + "step": 598 + }, + { + "epoch": 0.11531149986765166, + "grad_norm": 5.336792549265995, + "learning_rate": 1.9959990105003467e-05, + "loss": 2.5667, + "step": 599 + }, + { + "epoch": 0.11550400654522704, + "grad_norm": 5.893389448387921, + "learning_rate": 1.9959710992360798e-05, + "loss": 2.5293, + "step": 600 + }, + { + "epoch": 0.11550400654522704, + "lm_loss": 2.5349, + "step": 600, + "vm_loss": 0.1475 + }, + { + "epoch": 0.11550400654522704, + "lm_loss": 2.5449, + "step": 600, + "vm_loss": 0.1854 + }, + { + "epoch": 0.11550400654522704, + "lm_loss": 2.2599, + "step": 600, + "vm_loss": 0.256 + }, + { + "epoch": 0.11550400654522704, + "lm_loss": 2.4555, + "step": 600, + "vm_loss": 0.1905 + }, + { + "epoch": 0.11550400654522704, + "lm_loss": 2.2912, + "step": 600, + "vm_loss": 0.2175 + }, + { + "epoch": 0.11550400654522704, + "lm_loss": 2.3837, + "step": 600, + "vm_loss": 0.145 + }, + { + "epoch": 0.11550400654522704, + "lm_loss": 2.3388, + "step": 600, + "vm_loss": 0.1497 + }, + { + "epoch": 0.11550400654522704, + "lm_loss": 2.1978, + "step": 600, + "vm_loss": 0.2157 + }, + { + "epoch": 0.11569651322280242, + "grad_norm": 5.53391451210367, + "learning_rate": 1.9959430911506797e-05, + "loss": 2.5925, + "step": 601 + }, + { + "epoch": 0.1158890199003778, + "grad_norm": 4.656257247769798, + "learning_rate": 1.995914986246869e-05, + "loss": 2.5165, + "step": 602 + }, + { + "epoch": 0.11608152657795318, + "grad_norm": 6.197808096987862, + "learning_rate": 1.9958867845273805e-05, + "loss": 2.5937, + "step": 603 + }, + { + "epoch": 0.11627403325552856, + "grad_norm": 5.017573613725394, + "learning_rate": 1.9958584859949558e-05, + "loss": 2.5917, + "step": 604 + }, + { + "epoch": 0.11646653993310394, + "grad_norm": 4.739787351976849, + "learning_rate": 1.995830090652345e-05, + "loss": 2.5321, + "step": 605 + }, + { + "epoch": 0.1166590466106793, + "grad_norm": 4.252525284448323, + "learning_rate": 1.995801598502309e-05, + "loss": 2.5554, + "step": 606 + }, + { + "epoch": 0.11685155328825468, + "grad_norm": 4.356549863905389, + "learning_rate": 1.9957730095476177e-05, + "loss": 2.5117, + "step": 607 + }, + { + "epoch": 0.11704405996583006, + "grad_norm": 5.045659854669538, + "learning_rate": 1.9957443237910502e-05, + "loss": 2.552, + "step": 608 + }, + { + "epoch": 0.11704405996583006, + "lm_loss": 2.526, + "step": 608, + "vm_loss": 0.2532 + }, + { + "epoch": 0.11704405996583006, + "lm_loss": 2.2053, + "step": 608, + "vm_loss": 0.2431 + }, + { + "epoch": 0.11704405996583006, + "lm_loss": 2.4523, + "step": 608, + "vm_loss": 0.2149 + }, + { + "epoch": 0.11704405996583006, + "lm_loss": 2.7507, + "step": 608, + "vm_loss": 0.1393 + }, + { + "epoch": 0.11704405996583006, + "lm_loss": 2.4816, + "step": 608, + "vm_loss": 0.1639 + }, + { + "epoch": 0.11704405996583006, + "lm_loss": 2.2803, + "step": 608, + "vm_loss": 0.2002 + }, + { + "epoch": 0.11704405996583006, + "lm_loss": 2.3579, + "step": 608, + "vm_loss": 0.1043 + }, + { + "epoch": 0.11704405996583006, + "lm_loss": 2.283, + "step": 608, + "vm_loss": 0.2349 + }, + { + "epoch": 0.11723656664340544, + "grad_norm": 4.073958335085406, + "learning_rate": 1.995715541235395e-05, + "loss": 2.5839, + "step": 609 + }, + { + "epoch": 0.11742907332098082, + "grad_norm": 4.043061377223294, + "learning_rate": 1.9956866618834508e-05, + "loss": 2.5649, + "step": 610 + }, + { + "epoch": 0.1176215799985562, + "grad_norm": 4.438789195059056, + "learning_rate": 1.995657685738024e-05, + "loss": 2.4747, + "step": 611 + }, + { + "epoch": 0.11781408667613158, + "grad_norm": 4.190315304510294, + "learning_rate": 1.995628612801932e-05, + "loss": 2.5526, + "step": 612 + }, + { + "epoch": 0.11800659335370696, + "grad_norm": 4.046238216487343, + "learning_rate": 1.9955994430780012e-05, + "loss": 2.4863, + "step": 613 + }, + { + "epoch": 0.11819910003128234, + "grad_norm": 5.132396838070412, + "learning_rate": 1.9955701765690668e-05, + "loss": 2.5498, + "step": 614 + }, + { + "epoch": 0.11839160670885772, + "grad_norm": 3.9392603672246485, + "learning_rate": 1.9955408132779747e-05, + "loss": 2.5735, + "step": 615 + }, + { + "epoch": 0.1185841133864331, + "grad_norm": 5.46153809828257, + "learning_rate": 1.9955113532075784e-05, + "loss": 2.5266, + "step": 616 + }, + { + "epoch": 0.1185841133864331, + "lm_loss": 2.1994, + "step": 616, + "vm_loss": 0.187 + }, + { + "epoch": 0.1185841133864331, + "lm_loss": 2.1457, + "step": 616, + "vm_loss": 0.184 + }, + { + "epoch": 0.1185841133864331, + "lm_loss": 2.1226, + "step": 616, + "vm_loss": 0.141 + }, + { + "epoch": 0.1185841133864331, + "lm_loss": 2.1768, + "step": 616, + "vm_loss": 0.1621 + }, + { + "epoch": 0.1185841133864331, + "lm_loss": 2.6253, + "step": 616, + "vm_loss": 0.1815 + }, + { + "epoch": 0.1185841133864331, + "lm_loss": 2.3991, + "step": 616, + "vm_loss": 0.1982 + }, + { + "epoch": 0.1185841133864331, + "lm_loss": 2.3839, + "step": 616, + "vm_loss": 0.1451 + }, + { + "epoch": 0.1185841133864331, + "lm_loss": 2.5259, + "step": 616, + "vm_loss": 0.1553 + }, + { + "epoch": 0.11877662006400846, + "grad_norm": 4.083237404668489, + "learning_rate": 1.9954817963607426e-05, + "loss": 2.5464, + "step": 617 + }, + { + "epoch": 0.11896912674158384, + "grad_norm": 5.474657002245904, + "learning_rate": 1.99545214274034e-05, + "loss": 2.5376, + "step": 618 + }, + { + "epoch": 0.11916163341915922, + "grad_norm": 4.602705189412951, + "learning_rate": 1.9954223923492537e-05, + "loss": 2.5094, + "step": 619 + }, + { + "epoch": 0.1193541400967346, + "grad_norm": 5.151232392435146, + "learning_rate": 1.9953925451903757e-05, + "loss": 2.5739, + "step": 620 + }, + { + "epoch": 0.11954664677430998, + "grad_norm": 4.846012794073122, + "learning_rate": 1.9953626012666073e-05, + "loss": 2.5261, + "step": 621 + }, + { + "epoch": 0.11973915345188536, + "grad_norm": 4.762067948465738, + "learning_rate": 1.99533256058086e-05, + "loss": 2.5525, + "step": 622 + }, + { + "epoch": 0.11993166012946074, + "grad_norm": 4.450505712860721, + "learning_rate": 1.9953024231360537e-05, + "loss": 2.4879, + "step": 623 + }, + { + "epoch": 0.12012416680703612, + "grad_norm": 3.8589132116601914, + "learning_rate": 1.995272188935118e-05, + "loss": 2.5234, + "step": 624 + }, + { + "epoch": 0.12012416680703612, + "lm_loss": 2.5136, + "step": 624, + "vm_loss": 0.1301 + }, + { + "epoch": 0.12012416680703612, + "lm_loss": 2.4452, + "step": 624, + "vm_loss": 0.1765 + }, + { + "epoch": 0.12012416680703612, + "lm_loss": 2.1829, + "step": 624, + "vm_loss": 0.2665 + }, + { + "epoch": 0.12012416680703612, + "lm_loss": 2.1713, + "step": 624, + "vm_loss": 0.1648 + }, + { + "epoch": 0.12012416680703612, + "lm_loss": 2.4625, + "step": 624, + "vm_loss": 0.1692 + }, + { + "epoch": 0.12012416680703612, + "lm_loss": 2.3332, + "step": 624, + "vm_loss": 0.1797 + }, + { + "epoch": 0.12012416680703612, + "lm_loss": 2.2857, + "step": 624, + "vm_loss": 0.1905 + }, + { + "epoch": 0.12012416680703612, + "lm_loss": 2.7094, + "step": 624, + "vm_loss": 0.1627 + }, + { + "epoch": 0.1203166734846115, + "grad_norm": 5.487828235063577, + "learning_rate": 1.9952418579809927e-05, + "loss": 2.5577, + "step": 625 + }, + { + "epoch": 0.12050918016218688, + "grad_norm": 4.119771714153317, + "learning_rate": 1.9952114302766255e-05, + "loss": 2.5191, + "step": 626 + }, + { + "epoch": 0.12070168683976225, + "grad_norm": 4.457015090525097, + "learning_rate": 1.995180905824975e-05, + "loss": 2.492, + "step": 627 + }, + { + "epoch": 0.12089419351733763, + "grad_norm": 3.936169323856248, + "learning_rate": 1.995150284629008e-05, + "loss": 2.5349, + "step": 628 + }, + { + "epoch": 0.12108670019491301, + "grad_norm": 4.887220589290658, + "learning_rate": 1.9951195666917025e-05, + "loss": 2.5377, + "step": 629 + }, + { + "epoch": 0.12127920687248839, + "grad_norm": 5.273023501704847, + "learning_rate": 1.9950887520160433e-05, + "loss": 2.5603, + "step": 630 + }, + { + "epoch": 0.12147171355006377, + "grad_norm": 4.766747051272953, + "learning_rate": 1.9950578406050263e-05, + "loss": 2.5186, + "step": 631 + }, + { + "epoch": 0.12166422022763915, + "grad_norm": 5.756125396693531, + "learning_rate": 1.995026832461657e-05, + "loss": 2.4871, + "step": 632 + }, + { + "epoch": 0.12166422022763915, + "lm_loss": 2.3456, + "step": 632, + "vm_loss": 0.1742 + }, + { + "epoch": 0.12166422022763915, + "lm_loss": 2.5095, + "step": 632, + "vm_loss": 0.2327 + }, + { + "epoch": 0.12166422022763915, + "lm_loss": 2.155, + "step": 632, + "vm_loss": 0.1568 + }, + { + "epoch": 0.12166422022763915, + "lm_loss": 2.4375, + "step": 632, + "vm_loss": 0.1977 + }, + { + "epoch": 0.12166422022763915, + "lm_loss": 2.4639, + "step": 632, + "vm_loss": 0.2188 + }, + { + "epoch": 0.12166422022763915, + "lm_loss": 2.0473, + "step": 632, + "vm_loss": 0.1891 + }, + { + "epoch": 0.12166422022763915, + "lm_loss": 2.4015, + "step": 632, + "vm_loss": 0.1786 + }, + { + "epoch": 0.12166422022763915, + "lm_loss": 2.4911, + "step": 632, + "vm_loss": 0.171 + }, + { + "epoch": 0.12185672690521453, + "grad_norm": 6.055192025831539, + "learning_rate": 1.9949957275889496e-05, + "loss": 2.5765, + "step": 633 + }, + { + "epoch": 0.1220492335827899, + "grad_norm": 5.737076872236404, + "learning_rate": 1.9949645259899272e-05, + "loss": 2.4851, + "step": 634 + }, + { + "epoch": 0.12224174026036529, + "grad_norm": 4.772299580492336, + "learning_rate": 1.994933227667624e-05, + "loss": 2.4688, + "step": 635 + }, + { + "epoch": 0.12243424693794067, + "grad_norm": 5.643288397928099, + "learning_rate": 1.994901832625082e-05, + "loss": 2.5595, + "step": 636 + }, + { + "epoch": 0.12262675361551603, + "grad_norm": 4.039098871752698, + "learning_rate": 1.9948703408653535e-05, + "loss": 2.57, + "step": 637 + }, + { + "epoch": 0.12281926029309141, + "grad_norm": 4.9711410623891465, + "learning_rate": 1.9948387523915e-05, + "loss": 2.5426, + "step": 638 + }, + { + "epoch": 0.12301176697066679, + "grad_norm": 4.245752283373815, + "learning_rate": 1.9948070672065916e-05, + "loss": 2.5143, + "step": 639 + }, + { + "epoch": 0.12320427364824217, + "grad_norm": 4.509170010482479, + "learning_rate": 1.994775285313709e-05, + "loss": 2.5062, + "step": 640 + }, + { + "epoch": 0.12320427364824217, + "lm_loss": 2.6645, + "step": 640, + "vm_loss": 0.1628 + }, + { + "epoch": 0.12320427364824217, + "lm_loss": 2.3692, + "step": 640, + "vm_loss": 0.2272 + }, + { + "epoch": 0.12320427364824217, + "lm_loss": 2.458, + "step": 640, + "vm_loss": 0.1978 + }, + { + "epoch": 0.12320427364824217, + "lm_loss": 2.3372, + "step": 640, + "vm_loss": 0.1292 + }, + { + "epoch": 0.12320427364824217, + "lm_loss": 2.3324, + "step": 640, + "vm_loss": 0.216 + }, + { + "epoch": 0.12320427364824217, + "lm_loss": 2.4398, + "step": 640, + "vm_loss": 0.1255 + }, + { + "epoch": 0.12320427364824217, + "lm_loss": 2.1126, + "step": 640, + "vm_loss": 0.1741 + }, + { + "epoch": 0.12320427364824217, + "lm_loss": 2.4645, + "step": 640, + "vm_loss": 0.1978 + }, + { + "epoch": 0.12339678032581755, + "grad_norm": 4.553496428023007, + "learning_rate": 1.994743406715942e-05, + "loss": 2.5524, + "step": 641 + }, + { + "epoch": 0.12358928700339293, + "grad_norm": 3.8853493766983913, + "learning_rate": 1.9947114314163892e-05, + "loss": 2.5323, + "step": 642 + }, + { + "epoch": 0.12378179368096831, + "grad_norm": 5.3555711966836155, + "learning_rate": 1.9946793594181592e-05, + "loss": 2.5207, + "step": 643 + }, + { + "epoch": 0.12397430035854369, + "grad_norm": 3.9093572886523913, + "learning_rate": 1.99464719072437e-05, + "loss": 2.5258, + "step": 644 + }, + { + "epoch": 0.12416680703611907, + "grad_norm": 4.019594269327376, + "learning_rate": 1.9946149253381483e-05, + "loss": 2.5043, + "step": 645 + }, + { + "epoch": 0.12435931371369445, + "grad_norm": 4.493105766305279, + "learning_rate": 1.9945825632626313e-05, + "loss": 2.5364, + "step": 646 + }, + { + "epoch": 0.12455182039126982, + "grad_norm": 3.403117324921623, + "learning_rate": 1.9945501045009647e-05, + "loss": 2.524, + "step": 647 + }, + { + "epoch": 0.1247443270688452, + "grad_norm": 5.256055958991321, + "learning_rate": 1.9945175490563036e-05, + "loss": 2.5402, + "step": 648 + }, + { + "epoch": 0.1247443270688452, + "lm_loss": 2.5369, + "step": 648, + "vm_loss": 0.1844 + }, + { + "epoch": 0.1247443270688452, + "lm_loss": 2.4691, + "step": 648, + "vm_loss": 0.1109 + }, + { + "epoch": 0.1247443270688452, + "lm_loss": 2.5041, + "step": 648, + "vm_loss": 0.2007 + }, + { + "epoch": 0.1247443270688452, + "lm_loss": 2.4555, + "step": 648, + "vm_loss": 0.1968 + }, + { + "epoch": 0.1247443270688452, + "lm_loss": 2.5028, + "step": 648, + "vm_loss": 0.1971 + }, + { + "epoch": 0.1247443270688452, + "lm_loss": 2.3323, + "step": 648, + "vm_loss": 0.1231 + }, + { + "epoch": 0.1247443270688452, + "lm_loss": 1.974, + "step": 648, + "vm_loss": 0.1958 + }, + { + "epoch": 0.1247443270688452, + "lm_loss": 2.3771, + "step": 648, + "vm_loss": 0.172 + }, + { + "epoch": 0.12493683374642058, + "grad_norm": 3.51367171662114, + "learning_rate": 1.994484896931814e-05, + "loss": 2.5417, + "step": 649 + }, + { + "epoch": 0.12512934042399596, + "grad_norm": 4.647815520160439, + "learning_rate": 1.994452148130668e-05, + "loss": 2.5321, + "step": 650 + }, + { + "epoch": 0.12532184710157135, + "grad_norm": 3.7809143432498757, + "learning_rate": 1.9944193026560513e-05, + "loss": 2.566, + "step": 651 + }, + { + "epoch": 0.12551435377914671, + "grad_norm": 3.6761052173672835, + "learning_rate": 1.9943863605111558e-05, + "loss": 2.5652, + "step": 652 + }, + { + "epoch": 0.12570686045672208, + "grad_norm": 4.63139988481928, + "learning_rate": 1.9943533216991843e-05, + "loss": 2.4885, + "step": 653 + }, + { + "epoch": 0.12589936713429747, + "grad_norm": 3.7503339358328827, + "learning_rate": 1.994320186223348e-05, + "loss": 2.5568, + "step": 654 + }, + { + "epoch": 0.12609187381187284, + "grad_norm": 4.730960373444081, + "learning_rate": 1.994286954086869e-05, + "loss": 2.524, + "step": 655 + }, + { + "epoch": 0.12628438048944823, + "grad_norm": 3.86790112087384, + "learning_rate": 1.994253625292977e-05, + "loss": 2.5173, + "step": 656 + }, + { + "epoch": 0.12628438048944823, + "lm_loss": 2.4725, + "step": 656, + "vm_loss": 0.1413 + }, + { + "epoch": 0.12628438048944823, + "lm_loss": 1.934, + "step": 656, + "vm_loss": 0.2043 + }, + { + "epoch": 0.12628438048944823, + "lm_loss": 2.4389, + "step": 656, + "vm_loss": 0.2089 + }, + { + "epoch": 0.12628438048944823, + "lm_loss": 2.1853, + "step": 656, + "vm_loss": 0.1779 + }, + { + "epoch": 0.12628438048944823, + "lm_loss": 2.5635, + "step": 656, + "vm_loss": 0.1455 + }, + { + "epoch": 0.12628438048944823, + "lm_loss": 2.521, + "step": 656, + "vm_loss": 0.182 + }, + { + "epoch": 0.12628438048944823, + "lm_loss": 2.2869, + "step": 656, + "vm_loss": 0.1133 + }, + { + "epoch": 0.12628438048944823, + "lm_loss": 2.3508, + "step": 656, + "vm_loss": 0.1769 + }, + { + "epoch": 0.1264768871670236, + "grad_norm": 4.635303142095531, + "learning_rate": 1.9942201998449133e-05, + "loss": 2.5302, + "step": 657 + }, + { + "epoch": 0.126669393844599, + "grad_norm": 3.9988896990150247, + "learning_rate": 1.9941866777459253e-05, + "loss": 2.5095, + "step": 658 + }, + { + "epoch": 0.12686190052217436, + "grad_norm": 3.9154733464983833, + "learning_rate": 1.9941530589992733e-05, + "loss": 2.5567, + "step": 659 + }, + { + "epoch": 0.12705440719974975, + "grad_norm": 4.562643978524786, + "learning_rate": 1.994119343608225e-05, + "loss": 2.5156, + "step": 660 + }, + { + "epoch": 0.12724691387732512, + "grad_norm": 3.4115903425836267, + "learning_rate": 1.9940855315760582e-05, + "loss": 2.5357, + "step": 661 + }, + { + "epoch": 0.12743942055490048, + "grad_norm": 4.691181338330435, + "learning_rate": 1.9940516229060596e-05, + "loss": 2.5134, + "step": 662 + }, + { + "epoch": 0.12763192723247588, + "grad_norm": 3.367208174430154, + "learning_rate": 1.9940176176015253e-05, + "loss": 2.5319, + "step": 663 + }, + { + "epoch": 0.12782443391005124, + "grad_norm": 4.191450775924186, + "learning_rate": 1.9939835156657616e-05, + "loss": 2.5166, + "step": 664 + }, + { + "epoch": 0.12782443391005124, + "lm_loss": 2.2624, + "step": 664, + "vm_loss": 0.1341 + }, + { + "epoch": 0.12782443391005124, + "lm_loss": 2.5022, + "step": 664, + "vm_loss": 0.1497 + }, + { + "epoch": 0.12782443391005124, + "lm_loss": 2.3328, + "step": 664, + "vm_loss": 0.1311 + }, + { + "epoch": 0.12782443391005124, + "lm_loss": 2.4798, + "step": 664, + "vm_loss": 0.1817 + }, + { + "epoch": 0.12782443391005124, + "lm_loss": 2.1554, + "step": 664, + "vm_loss": 0.1295 + }, + { + "epoch": 0.12782443391005124, + "lm_loss": 2.0532, + "step": 664, + "vm_loss": 0.2217 + }, + { + "epoch": 0.12782443391005124, + "lm_loss": 2.3149, + "step": 664, + "vm_loss": 0.1993 + }, + { + "epoch": 0.12782443391005124, + "lm_loss": 2.1765, + "step": 664, + "vm_loss": 0.227 + }, + { + "epoch": 0.12801694058762664, + "grad_norm": 3.4969968141940955, + "learning_rate": 1.9939493171020836e-05, + "loss": 2.5157, + "step": 665 + }, + { + "epoch": 0.128209447265202, + "grad_norm": 3.8711315461127036, + "learning_rate": 1.9939150219138154e-05, + "loss": 2.5384, + "step": 666 + }, + { + "epoch": 0.1284019539427774, + "grad_norm": 3.774071886217187, + "learning_rate": 1.993880630104291e-05, + "loss": 2.5066, + "step": 667 + }, + { + "epoch": 0.12859446062035276, + "grad_norm": 4.038928846906131, + "learning_rate": 1.9938461416768544e-05, + "loss": 2.5013, + "step": 668 + }, + { + "epoch": 0.12878696729792816, + "grad_norm": 3.802648692275223, + "learning_rate": 1.9938115566348576e-05, + "loss": 2.5594, + "step": 669 + }, + { + "epoch": 0.12897947397550352, + "grad_norm": 3.453875121639923, + "learning_rate": 1.9937768749816628e-05, + "loss": 2.5198, + "step": 670 + }, + { + "epoch": 0.12917198065307892, + "grad_norm": 4.2472939694026905, + "learning_rate": 1.9937420967206415e-05, + "loss": 2.5353, + "step": 671 + }, + { + "epoch": 0.12936448733065428, + "grad_norm": 3.9958109356791236, + "learning_rate": 1.9937072218551752e-05, + "loss": 2.5283, + "step": 672 + }, + { + "epoch": 0.12936448733065428, + "lm_loss": 2.2537, + "step": 672, + "vm_loss": 0.1583 + }, + { + "epoch": 0.12936448733065428, + "lm_loss": 2.4968, + "step": 672, + "vm_loss": 0.134 + }, + { + "epoch": 0.12936448733065428, + "lm_loss": 2.4089, + "step": 672, + "vm_loss": 0.1597 + }, + { + "epoch": 0.12936448733065428, + "lm_loss": 2.5114, + "step": 672, + "vm_loss": 0.1938 + }, + { + "epoch": 0.12936448733065428, + "lm_loss": 2.2481, + "step": 672, + "vm_loss": 0.1668 + }, + { + "epoch": 0.12936448733065428, + "lm_loss": 2.1567, + "step": 672, + "vm_loss": 0.1149 + }, + { + "epoch": 0.12936448733065428, + "lm_loss": 2.313, + "step": 672, + "vm_loss": 0.167 + }, + { + "epoch": 0.12936448733065428, + "lm_loss": 2.4374, + "step": 672, + "vm_loss": 0.1151 + }, + { + "epoch": 0.12955699400822965, + "grad_norm": 3.9293362103244447, + "learning_rate": 1.9936722503886533e-05, + "loss": 2.5089, + "step": 673 + }, + { + "epoch": 0.12974950068580504, + "grad_norm": 4.632319353205451, + "learning_rate": 1.9936371823244764e-05, + "loss": 2.5573, + "step": 674 + }, + { + "epoch": 0.1299420073633804, + "grad_norm": 3.7231800984411265, + "learning_rate": 1.9936020176660524e-05, + "loss": 2.5283, + "step": 675 + }, + { + "epoch": 0.1301345140409558, + "grad_norm": 4.076787346661957, + "learning_rate": 1.9935667564168008e-05, + "loss": 2.5272, + "step": 676 + }, + { + "epoch": 0.13032702071853117, + "grad_norm": 3.5182137769058786, + "learning_rate": 1.993531398580149e-05, + "loss": 2.5304, + "step": 677 + }, + { + "epoch": 0.13051952739610656, + "grad_norm": 3.9492581536972673, + "learning_rate": 1.993495944159534e-05, + "loss": 2.5538, + "step": 678 + }, + { + "epoch": 0.13071203407368193, + "grad_norm": 4.455982751635128, + "learning_rate": 1.9934603931584032e-05, + "loss": 2.5007, + "step": 679 + }, + { + "epoch": 0.13090454075125732, + "grad_norm": 3.0802307570055922, + "learning_rate": 1.9934247455802115e-05, + "loss": 2.554, + "step": 680 + }, + { + "epoch": 0.13090454075125732, + "lm_loss": 2.243, + "step": 680, + "vm_loss": 0.1914 + }, + { + "epoch": 0.13090454075125732, + "lm_loss": 2.6919, + "step": 680, + "vm_loss": 0.1736 + }, + { + "epoch": 0.13090454075125732, + "lm_loss": 2.2703, + "step": 680, + "vm_loss": 0.1315 + }, + { + "epoch": 0.13090454075125732, + "lm_loss": 2.0587, + "step": 680, + "vm_loss": 0.1784 + }, + { + "epoch": 0.13090454075125732, + "lm_loss": 2.4997, + "step": 680, + "vm_loss": 0.2241 + }, + { + "epoch": 0.13090454075125732, + "lm_loss": 2.2966, + "step": 680, + "vm_loss": 0.1625 + }, + { + "epoch": 0.13090454075125732, + "lm_loss": 2.1264, + "step": 680, + "vm_loss": 0.1779 + }, + { + "epoch": 0.13090454075125732, + "lm_loss": 2.1005, + "step": 680, + "vm_loss": 0.1878 + }, + { + "epoch": 0.13109704742883269, + "grad_norm": 3.678063288166201, + "learning_rate": 1.9933890014284257e-05, + "loss": 2.5178, + "step": 681 + }, + { + "epoch": 0.13128955410640808, + "grad_norm": 3.871396444452642, + "learning_rate": 1.9933531607065193e-05, + "loss": 2.4804, + "step": 682 + }, + { + "epoch": 0.13148206078398345, + "grad_norm": 4.022100269229644, + "learning_rate": 1.993317223417977e-05, + "loss": 2.5173, + "step": 683 + }, + { + "epoch": 0.1316745674615588, + "grad_norm": 4.756744774856078, + "learning_rate": 1.993281189566292e-05, + "loss": 2.5432, + "step": 684 + }, + { + "epoch": 0.1318670741391342, + "grad_norm": 3.878310002826389, + "learning_rate": 1.9932450591549683e-05, + "loss": 2.5448, + "step": 685 + }, + { + "epoch": 0.13205958081670957, + "grad_norm": 4.18048753467834, + "learning_rate": 1.9932088321875175e-05, + "loss": 2.4943, + "step": 686 + }, + { + "epoch": 0.13225208749428496, + "grad_norm": 4.060923585685389, + "learning_rate": 1.9931725086674607e-05, + "loss": 2.5957, + "step": 687 + }, + { + "epoch": 0.13244459417186033, + "grad_norm": 3.4645542345182805, + "learning_rate": 1.99313608859833e-05, + "loss": 2.5397, + "step": 688 + }, + { + "epoch": 0.13244459417186033, + "lm_loss": 2.5851, + "step": 688, + "vm_loss": 0.222 + }, + { + "epoch": 0.13244459417186033, + "lm_loss": 2.2303, + "step": 688, + "vm_loss": 0.1517 + }, + { + "epoch": 0.13244459417186033, + "lm_loss": 2.4098, + "step": 688, + "vm_loss": 0.2349 + }, + { + "epoch": 0.13244459417186033, + "lm_loss": 2.3245, + "step": 688, + "vm_loss": 0.1569 + }, + { + "epoch": 0.13244459417186033, + "lm_loss": 2.5087, + "step": 688, + "vm_loss": 0.1535 + }, + { + "epoch": 0.13244459417186033, + "lm_loss": 2.4198, + "step": 688, + "vm_loss": 0.2447 + }, + { + "epoch": 0.13244459417186033, + "lm_loss": 2.1676, + "step": 688, + "vm_loss": 0.2177 + }, + { + "epoch": 0.13244459417186033, + "lm_loss": 2.4587, + "step": 688, + "vm_loss": 0.1465 + }, + { + "epoch": 0.13263710084943572, + "grad_norm": 3.959500347218049, + "learning_rate": 1.9930995719836658e-05, + "loss": 2.5491, + "step": 689 + }, + { + "epoch": 0.1328296075270111, + "grad_norm": 4.6051983094794835, + "learning_rate": 1.9930629588270177e-05, + "loss": 2.5777, + "step": 690 + }, + { + "epoch": 0.13302211420458648, + "grad_norm": 3.6599835646768666, + "learning_rate": 1.993026249131945e-05, + "loss": 2.4919, + "step": 691 + }, + { + "epoch": 0.13321462088216185, + "grad_norm": 4.050632043887986, + "learning_rate": 1.992989442902016e-05, + "loss": 2.5217, + "step": 692 + }, + { + "epoch": 0.13340712755973722, + "grad_norm": 4.671505037451218, + "learning_rate": 1.9929525401408095e-05, + "loss": 2.4872, + "step": 693 + }, + { + "epoch": 0.1335996342373126, + "grad_norm": 3.7477270661721054, + "learning_rate": 1.9929155408519124e-05, + "loss": 2.4912, + "step": 694 + }, + { + "epoch": 0.13379214091488797, + "grad_norm": 4.540438705544522, + "learning_rate": 1.9928784450389215e-05, + "loss": 2.532, + "step": 695 + }, + { + "epoch": 0.13398464759246337, + "grad_norm": 3.5958084799314105, + "learning_rate": 1.9928412527054434e-05, + "loss": 2.5505, + "step": 696 + }, + { + "epoch": 0.13398464759246337, + "lm_loss": 2.1628, + "step": 696, + "vm_loss": 0.172 + }, + { + "epoch": 0.13398464759246337, + "lm_loss": 2.0239, + "step": 696, + "vm_loss": 0.1817 + }, + { + "epoch": 0.13398464759246337, + "lm_loss": 2.5022, + "step": 696, + "vm_loss": 0.1987 + }, + { + "epoch": 0.13398464759246337, + "lm_loss": 2.1749, + "step": 696, + "vm_loss": 0.1419 + }, + { + "epoch": 0.13398464759246337, + "lm_loss": 2.4784, + "step": 696, + "vm_loss": 0.1388 + }, + { + "epoch": 0.13398464759246337, + "lm_loss": 2.2219, + "step": 696, + "vm_loss": 0.1678 + }, + { + "epoch": 0.13398464759246337, + "lm_loss": 2.0597, + "step": 696, + "vm_loss": 0.2234 + }, + { + "epoch": 0.13398464759246337, + "lm_loss": 2.3268, + "step": 696, + "vm_loss": 0.2184 + }, + { + "epoch": 0.13417715427003873, + "grad_norm": 5.293221432692755, + "learning_rate": 1.9928039638550932e-05, + "loss": 2.4637, + "step": 697 + }, + { + "epoch": 0.13436966094761413, + "grad_norm": 4.181332486198363, + "learning_rate": 1.992766578491496e-05, + "loss": 2.5523, + "step": 698 + }, + { + "epoch": 0.1345621676251895, + "grad_norm": 4.37083363256827, + "learning_rate": 1.992729096618286e-05, + "loss": 2.4945, + "step": 699 + }, + { + "epoch": 0.1347546743027649, + "grad_norm": 4.131147972223744, + "learning_rate": 1.9926915182391074e-05, + "loss": 2.5517, + "step": 700 + }, + { + "epoch": 0.13494718098034025, + "grad_norm": 3.748901096479565, + "learning_rate": 1.992653843357613e-05, + "loss": 2.4505, + "step": 701 + }, + { + "epoch": 0.13513968765791565, + "grad_norm": 3.5877001786459894, + "learning_rate": 1.9926160719774656e-05, + "loss": 2.4923, + "step": 702 + }, + { + "epoch": 0.135332194335491, + "grad_norm": 3.8824528589617797, + "learning_rate": 1.992578204102336e-05, + "loss": 2.5295, + "step": 703 + }, + { + "epoch": 0.13552470101306638, + "grad_norm": 3.8136063237230116, + "learning_rate": 1.9925402397359066e-05, + "loss": 2.5711, + "step": 704 + }, + { + "epoch": 0.13552470101306638, + "lm_loss": 2.2671, + "step": 704, + "vm_loss": 0.1756 + }, + { + "epoch": 0.13552470101306638, + "lm_loss": 2.3235, + "step": 704, + "vm_loss": 0.1699 + }, + { + "epoch": 0.13552470101306638, + "lm_loss": 2.3424, + "step": 704, + "vm_loss": 0.2006 + }, + { + "epoch": 0.13552470101306638, + "lm_loss": 1.6294, + "step": 704, + "vm_loss": 0.1862 + }, + { + "epoch": 0.13552470101306638, + "lm_loss": 2.1326, + "step": 704, + "vm_loss": 0.1479 + }, + { + "epoch": 0.13552470101306638, + "lm_loss": 2.3362, + "step": 704, + "vm_loss": 0.1689 + }, + { + "epoch": 0.13552470101306638, + "lm_loss": 2.5109, + "step": 704, + "vm_loss": 0.215 + }, + { + "epoch": 0.13552470101306638, + "lm_loss": 2.3083, + "step": 704, + "vm_loss": 0.195 + }, + { + "epoch": 0.13571720769064177, + "grad_norm": 4.0117866110071345, + "learning_rate": 1.9925021788818674e-05, + "loss": 2.5291, + "step": 705 + }, + { + "epoch": 0.13590971436821714, + "grad_norm": 3.7080704525190544, + "learning_rate": 1.992464021543919e-05, + "loss": 2.4973, + "step": 706 + }, + { + "epoch": 0.13610222104579253, + "grad_norm": 4.106054799135807, + "learning_rate": 1.99242576772577e-05, + "loss": 2.4842, + "step": 707 + }, + { + "epoch": 0.1362947277233679, + "grad_norm": 4.402486136739704, + "learning_rate": 1.9923874174311394e-05, + "loss": 2.4998, + "step": 708 + }, + { + "epoch": 0.1364872344009433, + "grad_norm": 4.21005751032192, + "learning_rate": 1.992348970663756e-05, + "loss": 2.5805, + "step": 709 + }, + { + "epoch": 0.13667974107851866, + "grad_norm": 4.087937474001111, + "learning_rate": 1.9923104274273564e-05, + "loss": 2.5068, + "step": 710 + }, + { + "epoch": 0.13687224775609405, + "grad_norm": 3.9605057651128153, + "learning_rate": 1.992271787725688e-05, + "loss": 2.5333, + "step": 711 + }, + { + "epoch": 0.13706475443366942, + "grad_norm": 4.393816375918899, + "learning_rate": 1.992233051562507e-05, + "loss": 2.4906, + "step": 712 + }, + { + "epoch": 0.13706475443366942, + "lm_loss": 2.3329, + "step": 712, + "vm_loss": 0.2081 + }, + { + "epoch": 0.13706475443366942, + "lm_loss": 2.4234, + "step": 712, + "vm_loss": 0.1992 + }, + { + "epoch": 0.13706475443366942, + "lm_loss": 2.1843, + "step": 712, + "vm_loss": 0.2479 + }, + { + "epoch": 0.13706475443366942, + "lm_loss": 2.2165, + "step": 712, + "vm_loss": 0.2064 + }, + { + "epoch": 0.13706475443366942, + "lm_loss": 1.7938, + "step": 712, + "vm_loss": 0.175 + }, + { + "epoch": 0.13706475443366942, + "lm_loss": 2.5259, + "step": 712, + "vm_loss": 0.1866 + }, + { + "epoch": 0.13706475443366942, + "lm_loss": 2.188, + "step": 712, + "vm_loss": 0.2361 + }, + { + "epoch": 0.13706475443366942, + "lm_loss": 2.3782, + "step": 712, + "vm_loss": 0.2383 + }, + { + "epoch": 0.13725726111124478, + "grad_norm": 4.04710857306534, + "learning_rate": 1.9921942189415793e-05, + "loss": 2.5318, + "step": 713 + }, + { + "epoch": 0.13744976778882018, + "grad_norm": 4.4872288353309235, + "learning_rate": 1.9921552898666793e-05, + "loss": 2.4813, + "step": 714 + }, + { + "epoch": 0.13764227446639554, + "grad_norm": 4.016406026527296, + "learning_rate": 1.9921162643415918e-05, + "loss": 2.4594, + "step": 715 + }, + { + "epoch": 0.13783478114397094, + "grad_norm": 4.044491673117378, + "learning_rate": 1.9920771423701108e-05, + "loss": 2.5857, + "step": 716 + }, + { + "epoch": 0.1380272878215463, + "grad_norm": 4.237752379784449, + "learning_rate": 1.9920379239560388e-05, + "loss": 2.5607, + "step": 717 + }, + { + "epoch": 0.1382197944991217, + "grad_norm": 5.218320403597712, + "learning_rate": 1.9919986091031885e-05, + "loss": 2.5589, + "step": 718 + }, + { + "epoch": 0.13841230117669706, + "grad_norm": 4.870607108724517, + "learning_rate": 1.9919591978153824e-05, + "loss": 2.5335, + "step": 719 + }, + { + "epoch": 0.13860480785427245, + "grad_norm": 3.871828669170681, + "learning_rate": 1.9919196900964517e-05, + "loss": 2.5532, + "step": 720 + }, + { + "epoch": 0.13860480785427245, + "lm_loss": 2.47, + "step": 720, + "vm_loss": 0.1103 + }, + { + "epoch": 0.13860480785427245, + "lm_loss": 2.5248, + "step": 720, + "vm_loss": 0.209 + }, + { + "epoch": 0.13860480785427245, + "lm_loss": 2.1865, + "step": 720, + "vm_loss": 0.1718 + }, + { + "epoch": 0.13860480785427245, + "lm_loss": 2.6737, + "step": 720, + "vm_loss": 0.1866 + }, + { + "epoch": 0.13860480785427245, + "lm_loss": 2.0873, + "step": 720, + "vm_loss": 0.1684 + }, + { + "epoch": 0.13860480785427245, + "lm_loss": 2.2774, + "step": 720, + "vm_loss": 0.1946 + }, + { + "epoch": 0.13860480785427245, + "lm_loss": 2.57, + "step": 720, + "vm_loss": 0.1666 + }, + { + "epoch": 0.13860480785427245, + "lm_loss": 2.0699, + "step": 720, + "vm_loss": 0.1858 + }, + { + "epoch": 0.13879731453184782, + "grad_norm": 4.075557972706211, + "learning_rate": 1.9918800859502366e-05, + "loss": 2.5329, + "step": 721 + }, + { + "epoch": 0.13898982120942321, + "grad_norm": 4.425157589383468, + "learning_rate": 1.991840385380587e-05, + "loss": 2.4517, + "step": 722 + }, + { + "epoch": 0.13918232788699858, + "grad_norm": 4.908035310675754, + "learning_rate": 1.991800588391363e-05, + "loss": 2.5122, + "step": 723 + }, + { + "epoch": 0.13937483456457395, + "grad_norm": 3.5165633378115744, + "learning_rate": 1.9917606949864326e-05, + "loss": 2.5141, + "step": 724 + }, + { + "epoch": 0.13956734124214934, + "grad_norm": 4.046152078981162, + "learning_rate": 1.9917207051696744e-05, + "loss": 2.5085, + "step": 725 + }, + { + "epoch": 0.1397598479197247, + "grad_norm": 4.221543390010374, + "learning_rate": 1.991680618944976e-05, + "loss": 2.4703, + "step": 726 + }, + { + "epoch": 0.1399523545973001, + "grad_norm": 4.1866542337112005, + "learning_rate": 1.991640436316234e-05, + "loss": 2.5176, + "step": 727 + }, + { + "epoch": 0.14014486127487547, + "grad_norm": 4.428208052748198, + "learning_rate": 1.991600157287355e-05, + "loss": 2.564, + "step": 728 + }, + { + "epoch": 0.14014486127487547, + "lm_loss": 2.3465, + "step": 728, + "vm_loss": 0.1945 + }, + { + "epoch": 0.14014486127487547, + "lm_loss": 2.2343, + "step": 728, + "vm_loss": 0.1971 + }, + { + "epoch": 0.14014486127487547, + "lm_loss": 2.6499, + "step": 728, + "vm_loss": 0.224 + }, + { + "epoch": 0.14014486127487547, + "lm_loss": 2.4633, + "step": 728, + "vm_loss": 0.1435 + }, + { + "epoch": 0.14014486127487547, + "lm_loss": 2.5692, + "step": 728, + "vm_loss": 0.1887 + }, + { + "epoch": 0.14014486127487547, + "lm_loss": 2.3546, + "step": 728, + "vm_loss": 0.206 + }, + { + "epoch": 0.14014486127487547, + "lm_loss": 2.4916, + "step": 728, + "vm_loss": 0.2559 + }, + { + "epoch": 0.14014486127487547, + "lm_loss": 2.5001, + "step": 728, + "vm_loss": 0.1853 + }, + { + "epoch": 0.14033736795245086, + "grad_norm": 4.2609027439682015, + "learning_rate": 1.9915597818622542e-05, + "loss": 2.5709, + "step": 729 + }, + { + "epoch": 0.14052987463002622, + "grad_norm": 4.128131763963246, + "learning_rate": 1.991519310044857e-05, + "loss": 2.4997, + "step": 730 + }, + { + "epoch": 0.14072238130760162, + "grad_norm": 3.6429004555596403, + "learning_rate": 1.9914787418390975e-05, + "loss": 2.5422, + "step": 731 + }, + { + "epoch": 0.14091488798517698, + "grad_norm": 3.9901359976747113, + "learning_rate": 1.9914380772489195e-05, + "loss": 2.5108, + "step": 732 + }, + { + "epoch": 0.14110739466275238, + "grad_norm": 4.431513703584899, + "learning_rate": 1.9913973162782764e-05, + "loss": 2.4951, + "step": 733 + }, + { + "epoch": 0.14129990134032774, + "grad_norm": 4.172522236655245, + "learning_rate": 1.99135645893113e-05, + "loss": 2.5202, + "step": 734 + }, + { + "epoch": 0.1414924080179031, + "grad_norm": 3.8441806336495117, + "learning_rate": 1.9913155052114533e-05, + "loss": 2.5012, + "step": 735 + }, + { + "epoch": 0.1416849146954785, + "grad_norm": 4.520989804769229, + "learning_rate": 1.9912744551232268e-05, + "loss": 2.5034, + "step": 736 + }, + { + "epoch": 0.1416849146954785, + "lm_loss": 2.3319, + "step": 736, + "vm_loss": 0.1687 + }, + { + "epoch": 0.1416849146954785, + "lm_loss": 2.4547, + "step": 736, + "vm_loss": 0.1925 + }, + { + "epoch": 0.1416849146954785, + "lm_loss": 2.3391, + "step": 736, + "vm_loss": 0.1284 + }, + { + "epoch": 0.1416849146954785, + "lm_loss": 2.8188, + "step": 736, + "vm_loss": 0.2065 + }, + { + "epoch": 0.1416849146954785, + "lm_loss": 2.169, + "step": 736, + "vm_loss": 0.1921 + }, + { + "epoch": 0.1416849146954785, + "lm_loss": 2.3449, + "step": 736, + "vm_loss": 0.1555 + }, + { + "epoch": 0.1416849146954785, + "lm_loss": 2.4636, + "step": 736, + "vm_loss": 0.1814 + }, + { + "epoch": 0.1416849146954785, + "lm_loss": 2.175, + "step": 736, + "vm_loss": 0.1559 + }, + { + "epoch": 0.14187742137305387, + "grad_norm": 3.933989005893063, + "learning_rate": 1.991233308670441e-05, + "loss": 2.5414, + "step": 737 + }, + { + "epoch": 0.14206992805062926, + "grad_norm": 4.136289663795889, + "learning_rate": 1.991192065857096e-05, + "loss": 2.4705, + "step": 738 + }, + { + "epoch": 0.14226243472820463, + "grad_norm": 4.288852677681718, + "learning_rate": 1.991150726687201e-05, + "loss": 2.5102, + "step": 739 + }, + { + "epoch": 0.14245494140578002, + "grad_norm": 3.2252252856749997, + "learning_rate": 1.9911092911647752e-05, + "loss": 2.4659, + "step": 740 + }, + { + "epoch": 0.1426474480833554, + "grad_norm": 4.352948548409321, + "learning_rate": 1.9910677592938458e-05, + "loss": 2.5335, + "step": 741 + }, + { + "epoch": 0.14283995476093078, + "grad_norm": 3.665563505299811, + "learning_rate": 1.9910261310784515e-05, + "loss": 2.5252, + "step": 742 + }, + { + "epoch": 0.14303246143850615, + "grad_norm": 3.46504171500494, + "learning_rate": 1.9909844065226376e-05, + "loss": 2.5128, + "step": 743 + }, + { + "epoch": 0.1432249681160815, + "grad_norm": 4.502805014824155, + "learning_rate": 1.9909425856304615e-05, + "loss": 2.5114, + "step": 744 + }, + { + "epoch": 0.1432249681160815, + "lm_loss": 2.3742, + "step": 744, + "vm_loss": 0.1837 + }, + { + "epoch": 0.1432249681160815, + "lm_loss": 2.3313, + "step": 744, + "vm_loss": 0.206 + }, + { + "epoch": 0.1432249681160815, + "lm_loss": 1.9662, + "step": 744, + "vm_loss": 0.2747 + }, + { + "epoch": 0.1432249681160815, + "lm_loss": 2.2348, + "step": 744, + "vm_loss": 0.2067 + }, + { + "epoch": 0.1432249681160815, + "lm_loss": 2.1076, + "step": 744, + "vm_loss": 0.2 + }, + { + "epoch": 0.1432249681160815, + "lm_loss": 2.706, + "step": 744, + "vm_loss": 0.1287 + }, + { + "epoch": 0.1432249681160815, + "lm_loss": 2.3584, + "step": 744, + "vm_loss": 0.259 + }, + { + "epoch": 0.1432249681160815, + "lm_loss": 2.3714, + "step": 744, + "vm_loss": 0.193 + }, + { + "epoch": 0.1434174747936569, + "grad_norm": 3.600535945519794, + "learning_rate": 1.990900668405988e-05, + "loss": 2.5253, + "step": 745 + }, + { + "epoch": 0.14360998147123227, + "grad_norm": 4.018715609335587, + "learning_rate": 1.990858654853292e-05, + "loss": 2.4983, + "step": 746 + }, + { + "epoch": 0.14380248814880767, + "grad_norm": 3.573250442551011, + "learning_rate": 1.990816544976458e-05, + "loss": 2.5331, + "step": 747 + }, + { + "epoch": 0.14399499482638303, + "grad_norm": 3.653256381564751, + "learning_rate": 1.99077433877958e-05, + "loss": 2.5117, + "step": 748 + }, + { + "epoch": 0.14418750150395843, + "grad_norm": 4.3701896250522045, + "learning_rate": 1.9907320362667605e-05, + "loss": 2.4485, + "step": 749 + }, + { + "epoch": 0.1443800081815338, + "grad_norm": 3.436477164554334, + "learning_rate": 1.9906896374421116e-05, + "loss": 2.5282, + "step": 750 + }, + { + "epoch": 0.14457251485910919, + "grad_norm": 4.534762046031325, + "learning_rate": 1.9906471423097552e-05, + "loss": 2.4481, + "step": 751 + }, + { + "epoch": 0.14476502153668455, + "grad_norm": 3.5823737485713867, + "learning_rate": 1.990604550873823e-05, + "loss": 2.5235, + "step": 752 + }, + { + "epoch": 0.14476502153668455, + "lm_loss": 2.2159, + "step": 752, + "vm_loss": 0.1585 + }, + { + "epoch": 0.14476502153668455, + "lm_loss": 2.0608, + "step": 752, + "vm_loss": 0.2139 + }, + { + "epoch": 0.14476502153668455, + "lm_loss": 2.3342, + "step": 752, + "vm_loss": 0.1674 + }, + { + "epoch": 0.14476502153668455, + "lm_loss": 2.5536, + "step": 752, + "vm_loss": 0.1614 + }, + { + "epoch": 0.14476502153668455, + "lm_loss": 2.667, + "step": 752, + "vm_loss": 0.1057 + }, + { + "epoch": 0.14476502153668455, + "lm_loss": 2.2093, + "step": 752, + "vm_loss": 0.1652 + }, + { + "epoch": 0.14476502153668455, + "lm_loss": 2.3884, + "step": 752, + "vm_loss": 0.1944 + }, + { + "epoch": 0.14476502153668455, + "lm_loss": 2.3651, + "step": 752, + "vm_loss": 0.2497 + }, + { + "epoch": 0.14495752821425995, + "grad_norm": 3.362839374780301, + "learning_rate": 1.9905618631384546e-05, + "loss": 2.4953, + "step": 753 + }, + { + "epoch": 0.1451500348918353, + "grad_norm": 3.890888572852814, + "learning_rate": 1.9905190791077998e-05, + "loss": 2.4955, + "step": 754 + }, + { + "epoch": 0.14534254156941068, + "grad_norm": 3.8663659137447395, + "learning_rate": 1.9904761987860187e-05, + "loss": 2.4857, + "step": 755 + }, + { + "epoch": 0.14553504824698607, + "grad_norm": 3.471695821517422, + "learning_rate": 1.9904332221772788e-05, + "loss": 2.498, + "step": 756 + }, + { + "epoch": 0.14572755492456144, + "grad_norm": 3.6899373830137754, + "learning_rate": 1.9903901492857586e-05, + "loss": 2.494, + "step": 757 + }, + { + "epoch": 0.14592006160213683, + "grad_norm": 4.180522440144561, + "learning_rate": 1.990346980115645e-05, + "loss": 2.4886, + "step": 758 + }, + { + "epoch": 0.1461125682797122, + "grad_norm": 3.836127376099166, + "learning_rate": 1.9903037146711348e-05, + "loss": 2.5123, + "step": 759 + }, + { + "epoch": 0.1463050749572876, + "grad_norm": 4.280059401564708, + "learning_rate": 1.9902603529564338e-05, + "loss": 2.5147, + "step": 760 + }, + { + "epoch": 0.1463050749572876, + "lm_loss": 2.5928, + "step": 760, + "vm_loss": 0.1738 + }, + { + "epoch": 0.1463050749572876, + "lm_loss": 2.3228, + "step": 760, + "vm_loss": 0.2248 + }, + { + "epoch": 0.1463050749572876, + "lm_loss": 2.1436, + "step": 760, + "vm_loss": 0.2843 + }, + { + "epoch": 0.1463050749572876, + "lm_loss": 2.4144, + "step": 760, + "vm_loss": 0.1942 + }, + { + "epoch": 0.1463050749572876, + "lm_loss": 2.3729, + "step": 760, + "vm_loss": 0.1444 + }, + { + "epoch": 0.1463050749572876, + "lm_loss": 2.4157, + "step": 760, + "vm_loss": 0.223 + }, + { + "epoch": 0.1463050749572876, + "lm_loss": 2.2469, + "step": 760, + "vm_loss": 0.2154 + }, + { + "epoch": 0.1463050749572876, + "lm_loss": 2.2256, + "step": 760, + "vm_loss": 0.1452 + }, + { + "epoch": 0.14649758163486296, + "grad_norm": 3.428443985869128, + "learning_rate": 1.9902168949757572e-05, + "loss": 2.5365, + "step": 761 + }, + { + "epoch": 0.14669008831243835, + "grad_norm": 3.8166352812621875, + "learning_rate": 1.9901733407333303e-05, + "loss": 2.4953, + "step": 762 + }, + { + "epoch": 0.14688259499001372, + "grad_norm": 3.942498205711259, + "learning_rate": 1.990129690233386e-05, + "loss": 2.5387, + "step": 763 + }, + { + "epoch": 0.14707510166758908, + "grad_norm": 3.317992072568667, + "learning_rate": 1.9900859434801686e-05, + "loss": 2.5173, + "step": 764 + }, + { + "epoch": 0.14726760834516447, + "grad_norm": 3.89094857705775, + "learning_rate": 1.9900421004779306e-05, + "loss": 2.4946, + "step": 765 + }, + { + "epoch": 0.14746011502273984, + "grad_norm": 3.786031588662704, + "learning_rate": 1.9899981612309344e-05, + "loss": 2.5156, + "step": 766 + }, + { + "epoch": 0.14765262170031523, + "grad_norm": 3.8286918262451297, + "learning_rate": 1.9899541257434506e-05, + "loss": 2.5095, + "step": 767 + }, + { + "epoch": 0.1478451283778906, + "grad_norm": 3.931170649796231, + "learning_rate": 1.989909994019761e-05, + "loss": 2.5604, + "step": 768 + }, + { + "epoch": 0.1478451283778906, + "lm_loss": 2.3366, + "step": 768, + "vm_loss": 0.1573 + }, + { + "epoch": 0.1478451283778906, + "lm_loss": 2.3119, + "step": 768, + "vm_loss": 0.1876 + }, + { + "epoch": 0.1478451283778906, + "lm_loss": 2.2553, + "step": 768, + "vm_loss": 0.1701 + }, + { + "epoch": 0.1478451283778906, + "lm_loss": 2.4439, + "step": 768, + "vm_loss": 0.1935 + }, + { + "epoch": 0.1478451283778906, + "lm_loss": 2.3012, + "step": 768, + "vm_loss": 0.1895 + }, + { + "epoch": 0.1478451283778906, + "lm_loss": 2.2725, + "step": 768, + "vm_loss": 0.1389 + }, + { + "epoch": 0.1478451283778906, + "lm_loss": 2.5608, + "step": 768, + "vm_loss": 0.2307 + }, + { + "epoch": 0.1478451283778906, + "lm_loss": 2.3226, + "step": 768, + "vm_loss": 0.1921 + }, + { + "epoch": 0.148037635055466, + "grad_norm": 3.7565547676940634, + "learning_rate": 1.9898657660641554e-05, + "loss": 2.5324, + "step": 769 + }, + { + "epoch": 0.14823014173304136, + "grad_norm": 3.7773119159102504, + "learning_rate": 1.989821441880933e-05, + "loss": 2.5483, + "step": 770 + }, + { + "epoch": 0.14842264841061675, + "grad_norm": 3.572617679441565, + "learning_rate": 1.989777021474403e-05, + "loss": 2.4803, + "step": 771 + }, + { + "epoch": 0.14861515508819212, + "grad_norm": 4.244362952287954, + "learning_rate": 1.9897325048488833e-05, + "loss": 2.5031, + "step": 772 + }, + { + "epoch": 0.1488076617657675, + "grad_norm": 3.6258598385306393, + "learning_rate": 1.989687892008702e-05, + "loss": 2.5049, + "step": 773 + }, + { + "epoch": 0.14900016844334288, + "grad_norm": 3.565874285155107, + "learning_rate": 1.989643182958196e-05, + "loss": 2.4807, + "step": 774 + }, + { + "epoch": 0.14919267512091824, + "grad_norm": 3.9979185276266462, + "learning_rate": 1.9895983777017113e-05, + "loss": 2.516, + "step": 775 + }, + { + "epoch": 0.14938518179849364, + "grad_norm": 3.6620937247709993, + "learning_rate": 1.9895534762436032e-05, + "loss": 2.5312, + "step": 776 + }, + { + "epoch": 0.14938518179849364, + "lm_loss": 1.929, + "step": 776, + "vm_loss": 0.2089 + }, + { + "epoch": 0.14938518179849364, + "lm_loss": 2.2116, + "step": 776, + "vm_loss": 0.1414 + }, + { + "epoch": 0.14938518179849364, + "lm_loss": 2.2605, + "step": 776, + "vm_loss": 0.1486 + }, + { + "epoch": 0.14938518179849364, + "lm_loss": 2.3101, + "step": 776, + "vm_loss": 0.1459 + }, + { + "epoch": 0.14938518179849364, + "lm_loss": 2.3977, + "step": 776, + "vm_loss": 0.1719 + }, + { + "epoch": 0.14938518179849364, + "lm_loss": 2.4413, + "step": 776, + "vm_loss": 0.167 + }, + { + "epoch": 0.14938518179849364, + "lm_loss": 2.2528, + "step": 776, + "vm_loss": 0.1728 + }, + { + "epoch": 0.14938518179849364, + "lm_loss": 2.34, + "step": 776, + "vm_loss": 0.2021 + }, + { + "epoch": 0.149577688476069, + "grad_norm": 3.6826109674237744, + "learning_rate": 1.9895084785882378e-05, + "loss": 2.4569, + "step": 777 + }, + { + "epoch": 0.1497701951536444, + "grad_norm": 3.1463332149594, + "learning_rate": 1.9894633847399883e-05, + "loss": 2.4866, + "step": 778 + }, + { + "epoch": 0.14996270183121976, + "grad_norm": 3.4325686250731393, + "learning_rate": 1.9894181947032388e-05, + "loss": 2.532, + "step": 779 + }, + { + "epoch": 0.15015520850879516, + "grad_norm": 3.498797083938133, + "learning_rate": 1.9893729084823826e-05, + "loss": 2.5261, + "step": 780 + }, + { + "epoch": 0.15034771518637052, + "grad_norm": 3.9926524983205844, + "learning_rate": 1.9893275260818222e-05, + "loss": 2.466, + "step": 781 + }, + { + "epoch": 0.15054022186394592, + "grad_norm": 3.6638849402622244, + "learning_rate": 1.9892820475059687e-05, + "loss": 2.5113, + "step": 782 + }, + { + "epoch": 0.15073272854152128, + "grad_norm": 4.326531067224903, + "learning_rate": 1.9892364727592436e-05, + "loss": 2.4637, + "step": 783 + }, + { + "epoch": 0.15092523521909668, + "grad_norm": 3.983710277535217, + "learning_rate": 1.9891908018460775e-05, + "loss": 2.4953, + "step": 784 + }, + { + "epoch": 0.15092523521909668, + "lm_loss": 2.4628, + "step": 784, + "vm_loss": 0.1432 + }, + { + "epoch": 0.15092523521909668, + "lm_loss": 2.5962, + "step": 784, + "vm_loss": 0.1441 + }, + { + "epoch": 0.15092523521909668, + "lm_loss": 2.4448, + "step": 784, + "vm_loss": 0.1746 + }, + { + "epoch": 0.15092523521909668, + "lm_loss": 2.3503, + "step": 784, + "vm_loss": 0.1932 + }, + { + "epoch": 0.15092523521909668, + "lm_loss": 2.532, + "step": 784, + "vm_loss": 0.1884 + }, + { + "epoch": 0.15092523521909668, + "lm_loss": 2.4043, + "step": 784, + "vm_loss": 0.1624 + }, + { + "epoch": 0.15092523521909668, + "lm_loss": 2.3167, + "step": 784, + "vm_loss": 0.1742 + }, + { + "epoch": 0.15092523521909668, + "lm_loss": 2.4208, + "step": 784, + "vm_loss": 0.1621 + }, + { + "epoch": 0.15111774189667204, + "grad_norm": 4.609559942458938, + "learning_rate": 1.98914503477091e-05, + "loss": 2.5614, + "step": 785 + }, + { + "epoch": 0.1513102485742474, + "grad_norm": 4.460768949829644, + "learning_rate": 1.98909917153819e-05, + "loss": 2.5284, + "step": 786 + }, + { + "epoch": 0.1515027552518228, + "grad_norm": 3.9804700459200415, + "learning_rate": 1.9890532121523766e-05, + "loss": 2.4754, + "step": 787 + }, + { + "epoch": 0.15169526192939817, + "grad_norm": 4.070777584649809, + "learning_rate": 1.989007156617937e-05, + "loss": 2.5261, + "step": 788 + }, + { + "epoch": 0.15188776860697356, + "grad_norm": 4.6263915929846675, + "learning_rate": 1.9889610049393487e-05, + "loss": 2.5019, + "step": 789 + }, + { + "epoch": 0.15208027528454893, + "grad_norm": 3.674440215497087, + "learning_rate": 1.9889147571210984e-05, + "loss": 2.518, + "step": 790 + }, + { + "epoch": 0.15227278196212432, + "grad_norm": 3.9061139696453693, + "learning_rate": 1.9888684131676816e-05, + "loss": 2.5138, + "step": 791 + }, + { + "epoch": 0.1524652886396997, + "grad_norm": 4.578408555264086, + "learning_rate": 1.9888219730836038e-05, + "loss": 2.5151, + "step": 792 + }, + { + "epoch": 0.1524652886396997, + "lm_loss": 1.688, + "step": 792, + "vm_loss": 0.1583 + }, + { + "epoch": 0.1524652886396997, + "lm_loss": 2.3761, + "step": 792, + "vm_loss": 0.1681 + }, + { + "epoch": 0.1524652886396997, + "lm_loss": 2.4514, + "step": 792, + "vm_loss": 0.1953 + }, + { + "epoch": 0.1524652886396997, + "lm_loss": 2.3851, + "step": 792, + "vm_loss": 0.2171 + }, + { + "epoch": 0.1524652886396997, + "lm_loss": 2.3004, + "step": 792, + "vm_loss": 0.1872 + }, + { + "epoch": 0.1524652886396997, + "lm_loss": 2.3727, + "step": 792, + "vm_loss": 0.1558 + }, + { + "epoch": 0.1524652886396997, + "lm_loss": 2.2116, + "step": 792, + "vm_loss": 0.1717 + }, + { + "epoch": 0.1524652886396997, + "lm_loss": 2.0514, + "step": 792, + "vm_loss": 0.1725 + }, + { + "epoch": 0.15265779531727508, + "grad_norm": 3.887310030307114, + "learning_rate": 1.9887754368733795e-05, + "loss": 2.4626, + "step": 793 + }, + { + "epoch": 0.15285030199485045, + "grad_norm": 3.8136046688558816, + "learning_rate": 1.9887288045415327e-05, + "loss": 2.577, + "step": 794 + }, + { + "epoch": 0.1530428086724258, + "grad_norm": 3.2654560002836774, + "learning_rate": 1.9886820760925962e-05, + "loss": 2.4736, + "step": 795 + }, + { + "epoch": 0.1532353153500012, + "grad_norm": 3.779358761458401, + "learning_rate": 1.9886352515311134e-05, + "loss": 2.5142, + "step": 796 + }, + { + "epoch": 0.15342782202757657, + "grad_norm": 3.8790729178900443, + "learning_rate": 1.9885883308616355e-05, + "loss": 2.5019, + "step": 797 + }, + { + "epoch": 0.15362032870515197, + "grad_norm": 3.891944942258998, + "learning_rate": 1.9885413140887244e-05, + "loss": 2.4944, + "step": 798 + }, + { + "epoch": 0.15381283538272733, + "grad_norm": 3.5925555580484185, + "learning_rate": 1.9884942012169503e-05, + "loss": 2.483, + "step": 799 + }, + { + "epoch": 0.15400534206030272, + "grad_norm": 4.474016329028718, + "learning_rate": 1.9884469922508934e-05, + "loss": 2.5211, + "step": 800 + }, + { + "epoch": 0.15400534206030272, + "lm_loss": 2.3964, + "step": 800, + "vm_loss": 0.181 + }, + { + "epoch": 0.15400534206030272, + "lm_loss": 2.3323, + "step": 800, + "vm_loss": 0.1478 + }, + { + "epoch": 0.15400534206030272, + "lm_loss": 2.4357, + "step": 800, + "vm_loss": 0.1491 + }, + { + "epoch": 0.15400534206030272, + "lm_loss": 2.4901, + "step": 800, + "vm_loss": 0.2229 + }, + { + "epoch": 0.15400534206030272, + "lm_loss": 2.2118, + "step": 800, + "vm_loss": 0.1938 + }, + { + "epoch": 0.15400534206030272, + "lm_loss": 2.109, + "step": 800, + "vm_loss": 0.1436 + }, + { + "epoch": 0.15400534206030272, + "lm_loss": 2.2988, + "step": 800, + "vm_loss": 0.1509 + }, + { + "epoch": 0.15400534206030272, + "lm_loss": 2.4196, + "step": 800, + "vm_loss": 0.2137 + }, + { + "epoch": 0.1541978487378781, + "grad_norm": 3.8196326951673463, + "learning_rate": 1.9883996871951427e-05, + "loss": 2.518, + "step": 801 + }, + { + "epoch": 0.15439035541545348, + "grad_norm": 3.7698113125530632, + "learning_rate": 1.988352286054297e-05, + "loss": 2.4896, + "step": 802 + }, + { + "epoch": 0.15458286209302885, + "grad_norm": 3.7304889831830463, + "learning_rate": 1.9883047888329647e-05, + "loss": 2.4967, + "step": 803 + }, + { + "epoch": 0.15477536877060424, + "grad_norm": 4.414808272225282, + "learning_rate": 1.988257195535763e-05, + "loss": 2.5113, + "step": 804 + }, + { + "epoch": 0.1549678754481796, + "grad_norm": 3.725610598629842, + "learning_rate": 1.9882095061673177e-05, + "loss": 2.5329, + "step": 805 + }, + { + "epoch": 0.15516038212575498, + "grad_norm": 4.534571389116044, + "learning_rate": 1.988161720732266e-05, + "loss": 2.5074, + "step": 806 + }, + { + "epoch": 0.15535288880333037, + "grad_norm": 4.220101350182375, + "learning_rate": 1.9881138392352528e-05, + "loss": 2.4932, + "step": 807 + }, + { + "epoch": 0.15554539548090573, + "grad_norm": 3.643618220848386, + "learning_rate": 1.9880658616809324e-05, + "loss": 2.5262, + "step": 808 + }, + { + "epoch": 0.15554539548090573, + "lm_loss": 2.4591, + "step": 808, + "vm_loss": 0.1949 + }, + { + "epoch": 0.15554539548090573, + "lm_loss": 2.474, + "step": 808, + "vm_loss": 0.212 + }, + { + "epoch": 0.15554539548090573, + "lm_loss": 2.4972, + "step": 808, + "vm_loss": 0.1378 + }, + { + "epoch": 0.15554539548090573, + "lm_loss": 2.2781, + "step": 808, + "vm_loss": 0.1588 + }, + { + "epoch": 0.15554539548090573, + "lm_loss": 2.3956, + "step": 808, + "vm_loss": 0.1544 + }, + { + "epoch": 0.15554539548090573, + "lm_loss": 2.4666, + "step": 808, + "vm_loss": 0.1656 + }, + { + "epoch": 0.15554539548090573, + "lm_loss": 2.5471, + "step": 808, + "vm_loss": 0.2226 + }, + { + "epoch": 0.15554539548090573, + "lm_loss": 2.1856, + "step": 808, + "vm_loss": 0.1999 + }, + { + "epoch": 0.15573790215848113, + "grad_norm": 3.7097444091461673, + "learning_rate": 1.9880177880739695e-05, + "loss": 2.5145, + "step": 809 + }, + { + "epoch": 0.1559304088360565, + "grad_norm": 4.292306449995038, + "learning_rate": 1.987969618419037e-05, + "loss": 2.509, + "step": 810 + }, + { + "epoch": 0.1561229155136319, + "grad_norm": 3.6939094755972994, + "learning_rate": 1.987921352720818e-05, + "loss": 2.5302, + "step": 811 + }, + { + "epoch": 0.15631542219120725, + "grad_norm": 4.052198836030496, + "learning_rate": 1.9878729909840043e-05, + "loss": 2.5107, + "step": 812 + }, + { + "epoch": 0.15650792886878265, + "grad_norm": 4.37266745542868, + "learning_rate": 1.987824533213297e-05, + "loss": 2.475, + "step": 813 + }, + { + "epoch": 0.156700435546358, + "grad_norm": 3.4514887647106898, + "learning_rate": 1.9877759794134072e-05, + "loss": 2.5024, + "step": 814 + }, + { + "epoch": 0.15689294222393338, + "grad_norm": 3.8212435643259095, + "learning_rate": 1.987727329589055e-05, + "loss": 2.4874, + "step": 815 + }, + { + "epoch": 0.15708544890150877, + "grad_norm": 3.7271970226065485, + "learning_rate": 1.9876785837449696e-05, + "loss": 2.5019, + "step": 816 + }, + { + "epoch": 0.15708544890150877, + "lm_loss": 2.4799, + "step": 816, + "vm_loss": 0.2399 + }, + { + "epoch": 0.15708544890150877, + "lm_loss": 2.0976, + "step": 816, + "vm_loss": 0.1674 + }, + { + "epoch": 0.15708544890150877, + "lm_loss": 2.0679, + "step": 816, + "vm_loss": 0.1345 + }, + { + "epoch": 0.15708544890150877, + "lm_loss": 2.2839, + "step": 816, + "vm_loss": 0.2008 + }, + { + "epoch": 0.15708544890150877, + "lm_loss": 2.2638, + "step": 816, + "vm_loss": 0.2365 + }, + { + "epoch": 0.15708544890150877, + "lm_loss": 2.3225, + "step": 816, + "vm_loss": 0.2095 + }, + { + "epoch": 0.15708544890150877, + "lm_loss": 2.187, + "step": 816, + "vm_loss": 0.1425 + }, + { + "epoch": 0.15708544890150877, + "lm_loss": 2.432, + "step": 816, + "vm_loss": 0.1433 + }, + { + "epoch": 0.15727795557908414, + "grad_norm": 4.094966795142, + "learning_rate": 1.9876297418858893e-05, + "loss": 2.4807, + "step": 817 + }, + { + "epoch": 0.15747046225665953, + "grad_norm": 4.094307033081914, + "learning_rate": 1.987580804016563e-05, + "loss": 2.5165, + "step": 818 + }, + { + "epoch": 0.1576629689342349, + "grad_norm": 4.442790577289076, + "learning_rate": 1.987531770141748e-05, + "loss": 2.5127, + "step": 819 + }, + { + "epoch": 0.1578554756118103, + "grad_norm": 4.139757998924265, + "learning_rate": 1.9874826402662105e-05, + "loss": 2.4873, + "step": 820 + }, + { + "epoch": 0.15804798228938566, + "grad_norm": 4.173747352310305, + "learning_rate": 1.9874334143947266e-05, + "loss": 2.4998, + "step": 821 + }, + { + "epoch": 0.15824048896696105, + "grad_norm": 4.088918468567153, + "learning_rate": 1.9873840925320823e-05, + "loss": 2.5387, + "step": 822 + }, + { + "epoch": 0.15843299564453642, + "grad_norm": 3.7010636854152015, + "learning_rate": 1.9873346746830712e-05, + "loss": 2.4738, + "step": 823 + }, + { + "epoch": 0.1586255023221118, + "grad_norm": 4.680368108743709, + "learning_rate": 1.9872851608524987e-05, + "loss": 2.4965, + "step": 824 + }, + { + "epoch": 0.1586255023221118, + "lm_loss": 2.2447, + "step": 824, + "vm_loss": 0.1719 + }, + { + "epoch": 0.1586255023221118, + "lm_loss": 2.6471, + "step": 824, + "vm_loss": 0.2275 + }, + { + "epoch": 0.1586255023221118, + "lm_loss": 2.3997, + "step": 824, + "vm_loss": 0.224 + }, + { + "epoch": 0.1586255023221118, + "lm_loss": 2.4949, + "step": 824, + "vm_loss": 0.1669 + }, + { + "epoch": 0.1586255023221118, + "lm_loss": 2.4591, + "step": 824, + "vm_loss": 0.169 + }, + { + "epoch": 0.1586255023221118, + "lm_loss": 2.5899, + "step": 824, + "vm_loss": 0.2055 + }, + { + "epoch": 0.1586255023221118, + "lm_loss": 2.3551, + "step": 824, + "vm_loss": 0.1623 + }, + { + "epoch": 0.1586255023221118, + "lm_loss": 2.114, + "step": 824, + "vm_loss": 0.1735 + }, + { + "epoch": 0.15881800899968718, + "grad_norm": 4.161591338676701, + "learning_rate": 1.9872355510451768e-05, + "loss": 2.5179, + "step": 825 + }, + { + "epoch": 0.15901051567726254, + "grad_norm": 3.9734022392053303, + "learning_rate": 1.9871858452659293e-05, + "loss": 2.4518, + "step": 826 + }, + { + "epoch": 0.15920302235483794, + "grad_norm": 4.186996813735879, + "learning_rate": 1.987136043519588e-05, + "loss": 2.5024, + "step": 827 + }, + { + "epoch": 0.1593955290324133, + "grad_norm": 4.203275787319111, + "learning_rate": 1.987086145810994e-05, + "loss": 2.4777, + "step": 828 + }, + { + "epoch": 0.1595880357099887, + "grad_norm": 3.7430402733378823, + "learning_rate": 1.9870361521449978e-05, + "loss": 2.5474, + "step": 829 + }, + { + "epoch": 0.15978054238756406, + "grad_norm": 4.129396091441689, + "learning_rate": 1.9869860625264602e-05, + "loss": 2.4963, + "step": 830 + }, + { + "epoch": 0.15997304906513946, + "grad_norm": 3.7953898576288325, + "learning_rate": 1.98693587696025e-05, + "loss": 2.5042, + "step": 831 + }, + { + "epoch": 0.16016555574271482, + "grad_norm": 3.7482538962335794, + "learning_rate": 1.9868855954512456e-05, + "loss": 2.4534, + "step": 832 + }, + { + "epoch": 0.16016555574271482, + "lm_loss": 2.4681, + "step": 832, + "vm_loss": 0.2164 + }, + { + "epoch": 0.16016555574271482, + "lm_loss": 2.0654, + "step": 832, + "vm_loss": 0.126 + }, + { + "epoch": 0.16016555574271482, + "lm_loss": 2.3052, + "step": 832, + "vm_loss": 0.2036 + }, + { + "epoch": 0.16016555574271482, + "lm_loss": 2.3923, + "step": 832, + "vm_loss": 0.1851 + }, + { + "epoch": 0.16016555574271482, + "lm_loss": 2.1613, + "step": 832, + "vm_loss": 0.1632 + }, + { + "epoch": 0.16016555574271482, + "lm_loss": 2.2553, + "step": 832, + "vm_loss": 0.1803 + }, + { + "epoch": 0.16016555574271482, + "lm_loss": 2.3776, + "step": 832, + "vm_loss": 0.1519 + }, + { + "epoch": 0.16016555574271482, + "lm_loss": 2.1413, + "step": 832, + "vm_loss": 0.1243 + }, + { + "epoch": 0.16035806242029021, + "grad_norm": 3.6482939857870846, + "learning_rate": 1.9868352180043358e-05, + "loss": 2.4955, + "step": 833 + }, + { + "epoch": 0.16055056909786558, + "grad_norm": 3.278285390521105, + "learning_rate": 1.9867847446244172e-05, + "loss": 2.4619, + "step": 834 + }, + { + "epoch": 0.16074307577544097, + "grad_norm": 3.9383036614192806, + "learning_rate": 1.9867341753163967e-05, + "loss": 2.4281, + "step": 835 + }, + { + "epoch": 0.16093558245301634, + "grad_norm": 4.372790171334646, + "learning_rate": 1.9866835100851903e-05, + "loss": 2.5259, + "step": 836 + }, + { + "epoch": 0.1611280891305917, + "grad_norm": 3.5797450957194745, + "learning_rate": 1.9866327489357236e-05, + "loss": 2.4525, + "step": 837 + }, + { + "epoch": 0.1613205958081671, + "grad_norm": 3.5250988605569873, + "learning_rate": 1.986581891872931e-05, + "loss": 2.4837, + "step": 838 + }, + { + "epoch": 0.16151310248574247, + "grad_norm": 3.7970270639036268, + "learning_rate": 1.9865309389017566e-05, + "loss": 2.5168, + "step": 839 + }, + { + "epoch": 0.16170560916331786, + "grad_norm": 3.4834087852061826, + "learning_rate": 1.986479890027153e-05, + "loss": 2.4664, + "step": 840 + }, + { + "epoch": 0.16170560916331786, + "lm_loss": 2.4628, + "step": 840, + "vm_loss": 0.1452 + }, + { + "epoch": 0.16170560916331786, + "lm_loss": 2.1535, + "step": 840, + "vm_loss": 0.1595 + }, + { + "epoch": 0.16170560916331786, + "lm_loss": 2.108, + "step": 840, + "vm_loss": 0.24 + }, + { + "epoch": 0.16170560916331786, + "lm_loss": 2.3511, + "step": 840, + "vm_loss": 0.135 + }, + { + "epoch": 0.16170560916331786, + "lm_loss": 2.4522, + "step": 840, + "vm_loss": 0.1477 + }, + { + "epoch": 0.16170560916331786, + "lm_loss": 2.4413, + "step": 840, + "vm_loss": 0.1909 + }, + { + "epoch": 0.16170560916331786, + "lm_loss": 1.9965, + "step": 840, + "vm_loss": 0.175 + }, + { + "epoch": 0.16170560916331786, + "lm_loss": 2.656, + "step": 840, + "vm_loss": 0.1464 + }, + { + "epoch": 0.16189811584089323, + "grad_norm": 3.8289089820304807, + "learning_rate": 1.986428745254084e-05, + "loss": 2.4937, + "step": 841 + }, + { + "epoch": 0.16209062251846862, + "grad_norm": 3.3458407030892325, + "learning_rate": 1.9863775045875206e-05, + "loss": 2.4868, + "step": 842 + }, + { + "epoch": 0.16228312919604398, + "grad_norm": 3.646234913416789, + "learning_rate": 1.986326168032444e-05, + "loss": 2.4751, + "step": 843 + }, + { + "epoch": 0.16247563587361938, + "grad_norm": 3.432423840366768, + "learning_rate": 1.986274735593845e-05, + "loss": 2.5625, + "step": 844 + }, + { + "epoch": 0.16266814255119474, + "grad_norm": 3.2465491995918563, + "learning_rate": 1.986223207276724e-05, + "loss": 2.528, + "step": 845 + }, + { + "epoch": 0.1628606492287701, + "grad_norm": 3.518014121969006, + "learning_rate": 1.9861715830860894e-05, + "loss": 2.5338, + "step": 846 + }, + { + "epoch": 0.1630531559063455, + "grad_norm": 3.5837082374926092, + "learning_rate": 1.9861198630269606e-05, + "loss": 2.5171, + "step": 847 + }, + { + "epoch": 0.16324566258392087, + "grad_norm": 4.13172619845174, + "learning_rate": 1.9860680471043644e-05, + "loss": 2.4927, + "step": 848 + }, + { + "epoch": 0.16324566258392087, + "lm_loss": 2.2073, + "step": 848, + "vm_loss": 0.1464 + }, + { + "epoch": 0.16324566258392087, + "lm_loss": 2.3082, + "step": 848, + "vm_loss": 0.2643 + }, + { + "epoch": 0.16324566258392087, + "lm_loss": 2.1762, + "step": 848, + "vm_loss": 0.1309 + }, + { + "epoch": 0.16324566258392087, + "lm_loss": 2.231, + "step": 848, + "vm_loss": 0.1563 + }, + { + "epoch": 0.16324566258392087, + "lm_loss": 1.9084, + "step": 848, + "vm_loss": 0.1956 + }, + { + "epoch": 0.16324566258392087, + "lm_loss": 2.5673, + "step": 848, + "vm_loss": 0.1938 + }, + { + "epoch": 0.16324566258392087, + "lm_loss": 2.4502, + "step": 848, + "vm_loss": 0.181 + }, + { + "epoch": 0.16324566258392087, + "lm_loss": 1.9246, + "step": 848, + "vm_loss": 0.1448 + }, + { + "epoch": 0.16343816926149626, + "grad_norm": 3.5100326464907714, + "learning_rate": 1.986016135323339e-05, + "loss": 2.527, + "step": 849 + }, + { + "epoch": 0.16363067593907163, + "grad_norm": 3.7155701556789156, + "learning_rate": 1.98596412768893e-05, + "loss": 2.4747, + "step": 850 + }, + { + "epoch": 0.16382318261664702, + "grad_norm": 3.8818291179174995, + "learning_rate": 1.985912024206194e-05, + "loss": 2.5051, + "step": 851 + }, + { + "epoch": 0.1640156892942224, + "grad_norm": 3.4959880929983753, + "learning_rate": 1.9858598248801953e-05, + "loss": 2.4791, + "step": 852 + }, + { + "epoch": 0.16420819597179778, + "grad_norm": 4.430584916408281, + "learning_rate": 1.9858075297160094e-05, + "loss": 2.5104, + "step": 853 + }, + { + "epoch": 0.16440070264937315, + "grad_norm": 3.6332412505498626, + "learning_rate": 1.985755138718719e-05, + "loss": 2.4906, + "step": 854 + }, + { + "epoch": 0.16459320932694854, + "grad_norm": 4.646771970769478, + "learning_rate": 1.9857026518934177e-05, + "loss": 2.5212, + "step": 855 + }, + { + "epoch": 0.1647857160045239, + "grad_norm": 3.6662405647563308, + "learning_rate": 1.985650069245208e-05, + "loss": 2.4986, + "step": 856 + }, + { + "epoch": 0.1647857160045239, + "lm_loss": 2.5841, + "step": 856, + "vm_loss": 0.2149 + }, + { + "epoch": 0.1647857160045239, + "lm_loss": 2.4106, + "step": 856, + "vm_loss": 0.1804 + }, + { + "epoch": 0.1647857160045239, + "lm_loss": 2.4381, + "step": 856, + "vm_loss": 0.1625 + }, + { + "epoch": 0.1647857160045239, + "lm_loss": 2.3658, + "step": 856, + "vm_loss": 0.1492 + }, + { + "epoch": 0.1647857160045239, + "lm_loss": 2.1363, + "step": 856, + "vm_loss": 0.2235 + }, + { + "epoch": 0.1647857160045239, + "lm_loss": 2.0291, + "step": 856, + "vm_loss": 0.2249 + }, + { + "epoch": 0.1647857160045239, + "lm_loss": 2.3136, + "step": 856, + "vm_loss": 0.154 + }, + { + "epoch": 0.1647857160045239, + "lm_loss": 2.2066, + "step": 856, + "vm_loss": 0.1691 + }, + { + "epoch": 0.16497822268209927, + "grad_norm": 3.6236749224200433, + "learning_rate": 1.985597390779201e-05, + "loss": 2.4565, + "step": 857 + }, + { + "epoch": 0.16517072935967467, + "grad_norm": 4.343106849979768, + "learning_rate": 1.9855446165005185e-05, + "loss": 2.4459, + "step": 858 + }, + { + "epoch": 0.16536323603725003, + "grad_norm": 4.063166972094065, + "learning_rate": 1.9854917464142903e-05, + "loss": 2.5496, + "step": 859 + }, + { + "epoch": 0.16555574271482543, + "grad_norm": 5.602233108872377, + "learning_rate": 1.9854387805256566e-05, + "loss": 2.4624, + "step": 860 + }, + { + "epoch": 0.1657482493924008, + "grad_norm": 3.4826940006454774, + "learning_rate": 1.9853857188397656e-05, + "loss": 2.4879, + "step": 861 + }, + { + "epoch": 0.1659407560699762, + "grad_norm": 4.0200536560380025, + "learning_rate": 1.985332561361776e-05, + "loss": 2.5539, + "step": 862 + }, + { + "epoch": 0.16613326274755155, + "grad_norm": 4.109384360418663, + "learning_rate": 1.9852793080968555e-05, + "loss": 2.4912, + "step": 863 + }, + { + "epoch": 0.16632576942512695, + "grad_norm": 3.400800481928985, + "learning_rate": 1.9852259590501807e-05, + "loss": 2.4795, + "step": 864 + }, + { + "epoch": 0.16632576942512695, + "lm_loss": 2.1267, + "step": 864, + "vm_loss": 0.1342 + }, + { + "epoch": 0.16632576942512695, + "lm_loss": 2.4347, + "step": 864, + "vm_loss": 0.1861 + }, + { + "epoch": 0.16632576942512695, + "lm_loss": 2.3805, + "step": 864, + "vm_loss": 0.1929 + }, + { + "epoch": 0.16632576942512695, + "lm_loss": 2.279, + "step": 864, + "vm_loss": 0.2296 + }, + { + "epoch": 0.16632576942512695, + "lm_loss": 2.5198, + "step": 864, + "vm_loss": 0.1991 + }, + { + "epoch": 0.16632576942512695, + "lm_loss": 2.2566, + "step": 864, + "vm_loss": 0.2571 + }, + { + "epoch": 0.16632576942512695, + "lm_loss": 2.3167, + "step": 864, + "vm_loss": 0.2091 + }, + { + "epoch": 0.16632576942512695, + "lm_loss": 2.4634, + "step": 864, + "vm_loss": 0.1732 + }, + { + "epoch": 0.1665182761027023, + "grad_norm": 5.066341540872207, + "learning_rate": 1.9851725142269382e-05, + "loss": 2.5316, + "step": 865 + }, + { + "epoch": 0.16671078278027768, + "grad_norm": 3.7992754143895437, + "learning_rate": 1.9851189736323228e-05, + "loss": 2.5325, + "step": 866 + }, + { + "epoch": 0.16690328945785307, + "grad_norm": 5.075925160303168, + "learning_rate": 1.9850653372715403e-05, + "loss": 2.4576, + "step": 867 + }, + { + "epoch": 0.16709579613542844, + "grad_norm": 3.7371478522469217, + "learning_rate": 1.9850116051498043e-05, + "loss": 2.554, + "step": 868 + }, + { + "epoch": 0.16728830281300383, + "grad_norm": 4.527764418179241, + "learning_rate": 1.984957777272338e-05, + "loss": 2.4809, + "step": 869 + }, + { + "epoch": 0.1674808094905792, + "grad_norm": 4.445340125418732, + "learning_rate": 1.9849038536443746e-05, + "loss": 2.4831, + "step": 870 + }, + { + "epoch": 0.1676733161681546, + "grad_norm": 3.7644555071649606, + "learning_rate": 1.9848498342711563e-05, + "loss": 2.4676, + "step": 871 + }, + { + "epoch": 0.16786582284572996, + "grad_norm": 4.221017632089052, + "learning_rate": 1.984795719157934e-05, + "loss": 2.4757, + "step": 872 + }, + { + "epoch": 0.16786582284572996, + "lm_loss": 2.0428, + "step": 872, + "vm_loss": 0.0955 + }, + { + "epoch": 0.16786582284572996, + "lm_loss": 2.2914, + "step": 872, + "vm_loss": 0.1962 + }, + { + "epoch": 0.16786582284572996, + "lm_loss": 2.3406, + "step": 872, + "vm_loss": 0.1847 + }, + { + "epoch": 0.16786582284572996, + "lm_loss": 2.4781, + "step": 872, + "vm_loss": 0.1579 + }, + { + "epoch": 0.16786582284572996, + "lm_loss": 2.5142, + "step": 872, + "vm_loss": 0.1883 + }, + { + "epoch": 0.16786582284572996, + "lm_loss": 2.6147, + "step": 872, + "vm_loss": 0.153 + }, + { + "epoch": 0.16786582284572996, + "lm_loss": 2.3595, + "step": 872, + "vm_loss": 0.1736 + }, + { + "epoch": 0.16786582284572996, + "lm_loss": 2.5091, + "step": 872, + "vm_loss": 0.151 + }, + { + "epoch": 0.16805832952330535, + "grad_norm": 3.6357447416847624, + "learning_rate": 1.9847415083099685e-05, + "loss": 2.491, + "step": 873 + }, + { + "epoch": 0.16825083620088072, + "grad_norm": 4.0084138737054555, + "learning_rate": 1.98468720173253e-05, + "loss": 2.5196, + "step": 874 + }, + { + "epoch": 0.1684433428784561, + "grad_norm": 3.5142195425760865, + "learning_rate": 1.9846327994308975e-05, + "loss": 2.5002, + "step": 875 + }, + { + "epoch": 0.16863584955603148, + "grad_norm": 4.427925772410468, + "learning_rate": 1.98457830141036e-05, + "loss": 2.4858, + "step": 876 + }, + { + "epoch": 0.16882835623360684, + "grad_norm": 4.542372084774133, + "learning_rate": 1.984523707676215e-05, + "loss": 2.5025, + "step": 877 + }, + { + "epoch": 0.16902086291118223, + "grad_norm": 3.8693440162781862, + "learning_rate": 1.98446901823377e-05, + "loss": 2.4649, + "step": 878 + }, + { + "epoch": 0.1692133695887576, + "grad_norm": 4.796691383754355, + "learning_rate": 1.984414233088341e-05, + "loss": 2.5548, + "step": 879 + }, + { + "epoch": 0.169405876266333, + "grad_norm": 4.551327231107463, + "learning_rate": 1.9843593522452546e-05, + "loss": 2.5246, + "step": 880 + }, + { + "epoch": 0.169405876266333, + "lm_loss": 2.3588, + "step": 880, + "vm_loss": 0.1733 + }, + { + "epoch": 0.169405876266333, + "lm_loss": 2.2537, + "step": 880, + "vm_loss": 0.2309 + }, + { + "epoch": 0.169405876266333, + "lm_loss": 2.1825, + "step": 880, + "vm_loss": 0.1124 + }, + { + "epoch": 0.169405876266333, + "lm_loss": 2.0459, + "step": 880, + "vm_loss": 0.1446 + }, + { + "epoch": 0.169405876266333, + "lm_loss": 2.2926, + "step": 880, + "vm_loss": 0.2376 + }, + { + "epoch": 0.169405876266333, + "lm_loss": 2.4112, + "step": 880, + "vm_loss": 0.1673 + }, + { + "epoch": 0.169405876266333, + "lm_loss": 2.5111, + "step": 880, + "vm_loss": 0.1799 + }, + { + "epoch": 0.169405876266333, + "lm_loss": 2.6628, + "step": 880, + "vm_loss": 0.1765 + }, + { + "epoch": 0.16959838294390836, + "grad_norm": 4.122078849569165, + "learning_rate": 1.9843043757098456e-05, + "loss": 2.53, + "step": 881 + }, + { + "epoch": 0.16979088962148375, + "grad_norm": 4.036775529873049, + "learning_rate": 1.984249303487458e-05, + "loss": 2.5269, + "step": 882 + }, + { + "epoch": 0.16998339629905912, + "grad_norm": 4.021417786112161, + "learning_rate": 1.984194135583446e-05, + "loss": 2.4871, + "step": 883 + }, + { + "epoch": 0.1701759029766345, + "grad_norm": 3.3089788906978823, + "learning_rate": 1.9841388720031727e-05, + "loss": 2.4875, + "step": 884 + }, + { + "epoch": 0.17036840965420988, + "grad_norm": 3.795220669792721, + "learning_rate": 1.98408351275201e-05, + "loss": 2.4582, + "step": 885 + }, + { + "epoch": 0.17056091633178525, + "grad_norm": 3.6286940192394126, + "learning_rate": 1.9840280578353397e-05, + "loss": 2.4753, + "step": 886 + }, + { + "epoch": 0.17075342300936064, + "grad_norm": 4.533779684106499, + "learning_rate": 1.983972507258553e-05, + "loss": 2.48, + "step": 887 + }, + { + "epoch": 0.170945929686936, + "grad_norm": 3.487651041614942, + "learning_rate": 1.9839168610270498e-05, + "loss": 2.4603, + "step": 888 + }, + { + "epoch": 0.170945929686936, + "lm_loss": 2.3489, + "step": 888, + "vm_loss": 0.1403 + }, + { + "epoch": 0.170945929686936, + "lm_loss": 2.3151, + "step": 888, + "vm_loss": 0.157 + }, + { + "epoch": 0.170945929686936, + "lm_loss": 2.0978, + "step": 888, + "vm_loss": 0.1754 + }, + { + "epoch": 0.170945929686936, + "lm_loss": 2.1745, + "step": 888, + "vm_loss": 0.1628 + }, + { + "epoch": 0.170945929686936, + "lm_loss": 2.1319, + "step": 888, + "vm_loss": 0.1404 + }, + { + "epoch": 0.170945929686936, + "lm_loss": 2.4738, + "step": 888, + "vm_loss": 0.2573 + }, + { + "epoch": 0.170945929686936, + "lm_loss": 2.2832, + "step": 888, + "vm_loss": 0.2312 + }, + { + "epoch": 0.170945929686936, + "lm_loss": 2.6107, + "step": 888, + "vm_loss": 0.1171 + }, + { + "epoch": 0.1711384363645114, + "grad_norm": 3.3852869513560475, + "learning_rate": 1.98386111914624e-05, + "loss": 2.4894, + "step": 889 + }, + { + "epoch": 0.17133094304208676, + "grad_norm": 3.7437518199127693, + "learning_rate": 1.9838052816215416e-05, + "loss": 2.5299, + "step": 890 + }, + { + "epoch": 0.17152344971966216, + "grad_norm": 3.5930595410013817, + "learning_rate": 1.9837493484583838e-05, + "loss": 2.4558, + "step": 891 + }, + { + "epoch": 0.17171595639723752, + "grad_norm": 3.2708157426537423, + "learning_rate": 1.9836933196622033e-05, + "loss": 2.4517, + "step": 892 + }, + { + "epoch": 0.17190846307481292, + "grad_norm": 3.535767008734297, + "learning_rate": 1.983637195238447e-05, + "loss": 2.4494, + "step": 893 + }, + { + "epoch": 0.17210096975238828, + "grad_norm": 3.8074124989101965, + "learning_rate": 1.9835809751925708e-05, + "loss": 2.4525, + "step": 894 + }, + { + "epoch": 0.17229347642996368, + "grad_norm": 3.3519842148845527, + "learning_rate": 1.9835246595300405e-05, + "loss": 2.5064, + "step": 895 + }, + { + "epoch": 0.17248598310753904, + "grad_norm": 3.541432511442601, + "learning_rate": 1.9834682482563302e-05, + "loss": 2.5128, + "step": 896 + }, + { + "epoch": 0.17248598310753904, + "lm_loss": 2.3096, + "step": 896, + "vm_loss": 0.1403 + }, + { + "epoch": 0.17248598310753904, + "lm_loss": 2.5469, + "step": 896, + "vm_loss": 0.283 + }, + { + "epoch": 0.17248598310753904, + "lm_loss": 2.3726, + "step": 896, + "vm_loss": 0.1604 + }, + { + "epoch": 0.17248598310753904, + "lm_loss": 2.3474, + "step": 896, + "vm_loss": 0.188 + }, + { + "epoch": 0.17248598310753904, + "lm_loss": 2.3731, + "step": 896, + "vm_loss": 0.1548 + }, + { + "epoch": 0.17248598310753904, + "lm_loss": 2.3883, + "step": 896, + "vm_loss": 0.1738 + }, + { + "epoch": 0.17248598310753904, + "lm_loss": 2.4883, + "step": 896, + "vm_loss": 0.2047 + }, + { + "epoch": 0.17248598310753904, + "lm_loss": 2.5499, + "step": 896, + "vm_loss": 0.1793 + }, + { + "epoch": 0.1726784897851144, + "grad_norm": 3.37154568668484, + "learning_rate": 1.983411741376924e-05, + "loss": 2.528, + "step": 897 + }, + { + "epoch": 0.1728709964626898, + "grad_norm": 3.6126627739882924, + "learning_rate": 1.9833551388973147e-05, + "loss": 2.565, + "step": 898 + }, + { + "epoch": 0.17306350314026517, + "grad_norm": 4.034874715261827, + "learning_rate": 1.9832984408230054e-05, + "loss": 2.5266, + "step": 899 + }, + { + "epoch": 0.17325600981784056, + "grad_norm": 3.5657829981459397, + "learning_rate": 1.9832416471595074e-05, + "loss": 2.4913, + "step": 900 + }, + { + "epoch": 0.17344851649541593, + "grad_norm": 3.9712059504488115, + "learning_rate": 1.983184757912342e-05, + "loss": 2.4832, + "step": 901 + }, + { + "epoch": 0.17364102317299132, + "grad_norm": 3.1856571783874323, + "learning_rate": 1.9831277730870396e-05, + "loss": 2.4019, + "step": 902 + }, + { + "epoch": 0.1738335298505667, + "grad_norm": 3.7649483355730653, + "learning_rate": 1.9830706926891396e-05, + "loss": 2.4939, + "step": 903 + }, + { + "epoch": 0.17402603652814208, + "grad_norm": 3.529662089759606, + "learning_rate": 1.9830135167241916e-05, + "loss": 2.5236, + "step": 904 + }, + { + "epoch": 0.17402603652814208, + "lm_loss": 2.1607, + "step": 904, + "vm_loss": 0.1763 + }, + { + "epoch": 0.17402603652814208, + "lm_loss": 2.3304, + "step": 904, + "vm_loss": 0.1432 + }, + { + "epoch": 0.17402603652814208, + "lm_loss": 2.3995, + "step": 904, + "vm_loss": 0.1896 + }, + { + "epoch": 0.17402603652814208, + "lm_loss": 1.8449, + "step": 904, + "vm_loss": 0.1664 + }, + { + "epoch": 0.17402603652814208, + "lm_loss": 2.53, + "step": 904, + "vm_loss": 0.2102 + }, + { + "epoch": 0.17402603652814208, + "lm_loss": 2.3159, + "step": 904, + "vm_loss": 0.1261 + }, + { + "epoch": 0.17402603652814208, + "lm_loss": 2.3825, + "step": 904, + "vm_loss": 0.2446 + }, + { + "epoch": 0.17402603652814208, + "lm_loss": 2.4334, + "step": 904, + "vm_loss": 0.1577 + }, + { + "epoch": 0.17421854320571745, + "grad_norm": 3.519274166730685, + "learning_rate": 1.982956245197753e-05, + "loss": 2.4378, + "step": 905 + }, + { + "epoch": 0.17441104988329284, + "grad_norm": 4.200817241002008, + "learning_rate": 1.9828988781153916e-05, + "loss": 2.5199, + "step": 906 + }, + { + "epoch": 0.1746035565608682, + "grad_norm": 4.039331012621496, + "learning_rate": 1.9828414154826845e-05, + "loss": 2.5342, + "step": 907 + }, + { + "epoch": 0.17479606323844357, + "grad_norm": 3.8594096740692976, + "learning_rate": 1.9827838573052176e-05, + "loss": 2.5359, + "step": 908 + }, + { + "epoch": 0.17498856991601897, + "grad_norm": 4.299297151095028, + "learning_rate": 1.982726203588586e-05, + "loss": 2.4471, + "step": 909 + }, + { + "epoch": 0.17518107659359433, + "grad_norm": 3.7133107730609445, + "learning_rate": 1.982668454338395e-05, + "loss": 2.5135, + "step": 910 + }, + { + "epoch": 0.17537358327116973, + "grad_norm": 4.271758266916981, + "learning_rate": 1.9826106095602582e-05, + "loss": 2.4855, + "step": 911 + }, + { + "epoch": 0.1755660899487451, + "grad_norm": 4.168423781562003, + "learning_rate": 1.982552669259799e-05, + "loss": 2.5005, + "step": 912 + }, + { + "epoch": 0.1755660899487451, + "lm_loss": 2.1874, + "step": 912, + "vm_loss": 0.1858 + }, + { + "epoch": 0.1755660899487451, + "lm_loss": 2.3565, + "step": 912, + "vm_loss": 0.1626 + }, + { + "epoch": 0.1755660899487451, + "lm_loss": 2.3739, + "step": 912, + "vm_loss": 0.1962 + }, + { + "epoch": 0.1755660899487451, + "lm_loss": 2.1453, + "step": 912, + "vm_loss": 0.1823 + }, + { + "epoch": 0.1755660899487451, + "lm_loss": 2.4343, + "step": 912, + "vm_loss": 0.145 + }, + { + "epoch": 0.1755660899487451, + "lm_loss": 2.3087, + "step": 912, + "vm_loss": 0.1595 + }, + { + "epoch": 0.1755660899487451, + "lm_loss": 2.2499, + "step": 912, + "vm_loss": 0.1637 + }, + { + "epoch": 0.1755660899487451, + "lm_loss": 2.2876, + "step": 912, + "vm_loss": 0.1919 + }, + { + "epoch": 0.17575859662632048, + "grad_norm": 3.3981216667121847, + "learning_rate": 1.9824946334426497e-05, + "loss": 2.4972, + "step": 913 + }, + { + "epoch": 0.17595110330389585, + "grad_norm": 4.79842294820674, + "learning_rate": 1.982436502114452e-05, + "loss": 2.4711, + "step": 914 + }, + { + "epoch": 0.17614360998147124, + "grad_norm": 3.76316178362478, + "learning_rate": 1.9823782752808574e-05, + "loss": 2.4849, + "step": 915 + }, + { + "epoch": 0.1763361166590466, + "grad_norm": 4.552832711876154, + "learning_rate": 1.9823199529475265e-05, + "loss": 2.4903, + "step": 916 + }, + { + "epoch": 0.17652862333662198, + "grad_norm": 3.493438864885672, + "learning_rate": 1.9822615351201283e-05, + "loss": 2.5204, + "step": 917 + }, + { + "epoch": 0.17672113001419737, + "grad_norm": 3.2804991008916864, + "learning_rate": 1.982203021804342e-05, + "loss": 2.48, + "step": 918 + }, + { + "epoch": 0.17691363669177274, + "grad_norm": 3.172639943448026, + "learning_rate": 1.982144413005856e-05, + "loss": 2.4697, + "step": 919 + }, + { + "epoch": 0.17710614336934813, + "grad_norm": 3.302360975300849, + "learning_rate": 1.982085708730368e-05, + "loss": 2.4715, + "step": 920 + }, + { + "epoch": 0.17710614336934813, + "lm_loss": 2.3409, + "step": 920, + "vm_loss": 0.2124 + }, + { + "epoch": 0.17710614336934813, + "lm_loss": 2.429, + "step": 920, + "vm_loss": 0.1952 + }, + { + "epoch": 0.17710614336934813, + "lm_loss": 2.0471, + "step": 920, + "vm_loss": 0.223 + }, + { + "epoch": 0.17710614336934813, + "lm_loss": 1.8902, + "step": 920, + "vm_loss": 0.1409 + }, + { + "epoch": 0.17710614336934813, + "lm_loss": 2.2868, + "step": 920, + "vm_loss": 0.1755 + }, + { + "epoch": 0.17710614336934813, + "lm_loss": 2.4169, + "step": 920, + "vm_loss": 0.1867 + }, + { + "epoch": 0.17710614336934813, + "lm_loss": 2.5888, + "step": 920, + "vm_loss": 0.1771 + }, + { + "epoch": 0.17710614336934813, + "lm_loss": 2.3245, + "step": 920, + "vm_loss": 0.2082 + }, + { + "epoch": 0.1772986500469235, + "grad_norm": 3.523413540578112, + "learning_rate": 1.9820269089835843e-05, + "loss": 2.4697, + "step": 921 + }, + { + "epoch": 0.1774911567244989, + "grad_norm": 4.113064333429806, + "learning_rate": 1.9819680137712214e-05, + "loss": 2.5192, + "step": 922 + }, + { + "epoch": 0.17768366340207425, + "grad_norm": 3.140013825303239, + "learning_rate": 1.9819090230990046e-05, + "loss": 2.4435, + "step": 923 + }, + { + "epoch": 0.17787617007964965, + "grad_norm": 4.368978262255207, + "learning_rate": 1.9818499369726684e-05, + "loss": 2.5135, + "step": 924 + }, + { + "epoch": 0.17806867675722501, + "grad_norm": 4.438369945176267, + "learning_rate": 1.9817907553979566e-05, + "loss": 2.5502, + "step": 925 + }, + { + "epoch": 0.1782611834348004, + "grad_norm": 3.7076680064270757, + "learning_rate": 1.9817314783806226e-05, + "loss": 2.4497, + "step": 926 + }, + { + "epoch": 0.17845369011237577, + "grad_norm": 3.9828981444572293, + "learning_rate": 1.9816721059264294e-05, + "loss": 2.5006, + "step": 927 + }, + { + "epoch": 0.17864619678995114, + "grad_norm": 4.0852706255795646, + "learning_rate": 1.9816126380411478e-05, + "loss": 2.4854, + "step": 928 + }, + { + "epoch": 0.17864619678995114, + "lm_loss": 2.4162, + "step": 928, + "vm_loss": 0.1811 + }, + { + "epoch": 0.17864619678995114, + "lm_loss": 2.3802, + "step": 928, + "vm_loss": 0.1678 + }, + { + "epoch": 0.17864619678995114, + "lm_loss": 2.3401, + "step": 928, + "vm_loss": 0.133 + }, + { + "epoch": 0.17864619678995114, + "lm_loss": 2.3518, + "step": 928, + "vm_loss": 0.0999 + }, + { + "epoch": 0.17864619678995114, + "lm_loss": 1.9935, + "step": 928, + "vm_loss": 0.2272 + }, + { + "epoch": 0.17864619678995114, + "lm_loss": 2.1281, + "step": 928, + "vm_loss": 0.1493 + }, + { + "epoch": 0.17864619678995114, + "lm_loss": 2.1832, + "step": 928, + "vm_loss": 0.2207 + }, + { + "epoch": 0.17864619678995114, + "lm_loss": 2.1482, + "step": 928, + "vm_loss": 0.1951 + }, + { + "epoch": 0.17883870346752653, + "grad_norm": 4.105545881370151, + "learning_rate": 1.9815530747305594e-05, + "loss": 2.491, + "step": 929 + }, + { + "epoch": 0.1790312101451019, + "grad_norm": 4.25919193880954, + "learning_rate": 1.9814934160004543e-05, + "loss": 2.5171, + "step": 930 + }, + { + "epoch": 0.1792237168226773, + "grad_norm": 3.7556766444343714, + "learning_rate": 1.9814336618566324e-05, + "loss": 2.4845, + "step": 931 + }, + { + "epoch": 0.17941622350025266, + "grad_norm": 3.4500733338305527, + "learning_rate": 1.9813738123049022e-05, + "loss": 2.5274, + "step": 932 + }, + { + "epoch": 0.17960873017782805, + "grad_norm": 3.7655259561953063, + "learning_rate": 1.981313867351082e-05, + "loss": 2.5393, + "step": 933 + }, + { + "epoch": 0.17980123685540342, + "grad_norm": 4.081020466463539, + "learning_rate": 1.9812538270009995e-05, + "loss": 2.4465, + "step": 934 + }, + { + "epoch": 0.1799937435329788, + "grad_norm": 3.3199140286584705, + "learning_rate": 1.981193691260491e-05, + "loss": 2.4707, + "step": 935 + }, + { + "epoch": 0.18018625021055418, + "grad_norm": 3.781768576733766, + "learning_rate": 1.9811334601354023e-05, + "loss": 2.4506, + "step": 936 + }, + { + "epoch": 0.18018625021055418, + "lm_loss": 2.6181, + "step": 936, + "vm_loss": 0.1575 + }, + { + "epoch": 0.18018625021055418, + "lm_loss": 1.8245, + "step": 936, + "vm_loss": 0.1791 + }, + { + "epoch": 0.18018625021055418, + "lm_loss": 2.3441, + "step": 936, + "vm_loss": 0.2084 + }, + { + "epoch": 0.18018625021055418, + "lm_loss": 2.433, + "step": 936, + "vm_loss": 0.2126 + }, + { + "epoch": 0.18018625021055418, + "lm_loss": 1.9038, + "step": 936, + "vm_loss": 0.225 + }, + { + "epoch": 0.18018625021055418, + "lm_loss": 2.3011, + "step": 936, + "vm_loss": 0.1849 + }, + { + "epoch": 0.18018625021055418, + "lm_loss": 2.2468, + "step": 936, + "vm_loss": 0.1741 + }, + { + "epoch": 0.18018625021055418, + "lm_loss": 2.2464, + "step": 936, + "vm_loss": 0.137 + }, + { + "epoch": 0.18037875688812954, + "grad_norm": 3.7818729181062607, + "learning_rate": 1.981073133631589e-05, + "loss": 2.4634, + "step": 937 + }, + { + "epoch": 0.18057126356570494, + "grad_norm": 4.113065116976769, + "learning_rate": 1.9810127117549158e-05, + "loss": 2.474, + "step": 938 + }, + { + "epoch": 0.1807637702432803, + "grad_norm": 4.041987198703892, + "learning_rate": 1.980952194511256e-05, + "loss": 2.4222, + "step": 939 + }, + { + "epoch": 0.1809562769208557, + "grad_norm": 3.821931670421027, + "learning_rate": 1.9808915819064926e-05, + "loss": 2.4771, + "step": 940 + }, + { + "epoch": 0.18114878359843106, + "grad_norm": 3.6492934028917734, + "learning_rate": 1.9808308739465184e-05, + "loss": 2.5089, + "step": 941 + }, + { + "epoch": 0.18134129027600646, + "grad_norm": 4.275026258897681, + "learning_rate": 1.980770070637235e-05, + "loss": 2.4623, + "step": 942 + }, + { + "epoch": 0.18153379695358182, + "grad_norm": 3.9938233056880157, + "learning_rate": 1.9807091719845527e-05, + "loss": 2.4876, + "step": 943 + }, + { + "epoch": 0.18172630363115722, + "grad_norm": 3.6050250421607037, + "learning_rate": 1.9806481779943922e-05, + "loss": 2.5289, + "step": 944 + }, + { + "epoch": 0.18172630363115722, + "lm_loss": 1.8762, + "step": 944, + "vm_loss": 0.1662 + }, + { + "epoch": 0.18172630363115722, + "lm_loss": 2.4668, + "step": 944, + "vm_loss": 0.1554 + }, + { + "epoch": 0.18172630363115722, + "lm_loss": 2.252, + "step": 944, + "vm_loss": 0.2038 + }, + { + "epoch": 0.18172630363115722, + "lm_loss": 2.4318, + "step": 944, + "vm_loss": 0.1403 + }, + { + "epoch": 0.18172630363115722, + "lm_loss": 2.4921, + "step": 944, + "vm_loss": 0.1518 + }, + { + "epoch": 0.18172630363115722, + "lm_loss": 2.2405, + "step": 944, + "vm_loss": 0.1531 + }, + { + "epoch": 0.18172630363115722, + "lm_loss": 1.9509, + "step": 944, + "vm_loss": 0.1679 + }, + { + "epoch": 0.18172630363115722, + "lm_loss": 2.14, + "step": 944, + "vm_loss": 0.1146 + }, + { + "epoch": 0.18191881030873258, + "grad_norm": 4.6365375048485, + "learning_rate": 1.9805870886726826e-05, + "loss": 2.4836, + "step": 945 + }, + { + "epoch": 0.18211131698630798, + "grad_norm": 3.7434358690966443, + "learning_rate": 1.9805259040253624e-05, + "loss": 2.4693, + "step": 946 + }, + { + "epoch": 0.18230382366388334, + "grad_norm": 4.356568670475241, + "learning_rate": 1.9804646240583797e-05, + "loss": 2.5226, + "step": 947 + }, + { + "epoch": 0.1824963303414587, + "grad_norm": 3.989996873737595, + "learning_rate": 1.980403248777692e-05, + "loss": 2.538, + "step": 948 + }, + { + "epoch": 0.1826888370190341, + "grad_norm": 3.731461961837993, + "learning_rate": 1.9803417781892655e-05, + "loss": 2.4724, + "step": 949 + }, + { + "epoch": 0.18288134369660947, + "grad_norm": 3.75401515076171, + "learning_rate": 1.9802802122990758e-05, + "loss": 2.4643, + "step": 950 + }, + { + "epoch": 0.18307385037418486, + "grad_norm": 3.6429819788861044, + "learning_rate": 1.980218551113108e-05, + "loss": 2.4784, + "step": 951 + }, + { + "epoch": 0.18326635705176023, + "grad_norm": 4.326937072731741, + "learning_rate": 1.9801567946373564e-05, + "loss": 2.4308, + "step": 952 + }, + { + "epoch": 0.18326635705176023, + "lm_loss": 2.3545, + "step": 952, + "vm_loss": 0.1786 + }, + { + "epoch": 0.18326635705176023, + "lm_loss": 2.5824, + "step": 952, + "vm_loss": 0.1769 + }, + { + "epoch": 0.18326635705176023, + "lm_loss": 1.7718, + "step": 952, + "vm_loss": 0.1413 + }, + { + "epoch": 0.18326635705176023, + "lm_loss": 2.1943, + "step": 952, + "vm_loss": 0.1012 + }, + { + "epoch": 0.18326635705176023, + "lm_loss": 2.5159, + "step": 952, + "vm_loss": 0.2275 + }, + { + "epoch": 0.18326635705176023, + "lm_loss": 2.125, + "step": 952, + "vm_loss": 0.2222 + }, + { + "epoch": 0.18326635705176023, + "lm_loss": 2.3212, + "step": 952, + "vm_loss": 0.1449 + }, + { + "epoch": 0.18326635705176023, + "lm_loss": 2.2838, + "step": 952, + "vm_loss": 0.1891 + }, + { + "epoch": 0.18345886372933562, + "grad_norm": 3.7770294529989483, + "learning_rate": 1.9800949428778245e-05, + "loss": 2.4882, + "step": 953 + }, + { + "epoch": 0.18365137040691099, + "grad_norm": 3.2466566302180664, + "learning_rate": 1.9800329958405254e-05, + "loss": 2.4741, + "step": 954 + }, + { + "epoch": 0.18384387708448638, + "grad_norm": 3.910691484176159, + "learning_rate": 1.9799709535314805e-05, + "loss": 2.4963, + "step": 955 + }, + { + "epoch": 0.18403638376206174, + "grad_norm": 3.8286337347381654, + "learning_rate": 1.9799088159567213e-05, + "loss": 2.5175, + "step": 956 + }, + { + "epoch": 0.18422889043963714, + "grad_norm": 3.5316038579186113, + "learning_rate": 1.9798465831222885e-05, + "loss": 2.499, + "step": 957 + }, + { + "epoch": 0.1844213971172125, + "grad_norm": 4.1670051970138475, + "learning_rate": 1.979784255034232e-05, + "loss": 2.4936, + "step": 958 + }, + { + "epoch": 0.18461390379478787, + "grad_norm": 3.385429672109255, + "learning_rate": 1.9797218316986112e-05, + "loss": 2.4689, + "step": 959 + }, + { + "epoch": 0.18480641047236326, + "grad_norm": 3.3078136609460094, + "learning_rate": 1.9796593131214935e-05, + "loss": 2.4422, + "step": 960 + }, + { + "epoch": 0.18480641047236326, + "lm_loss": 2.2358, + "step": 960, + "vm_loss": 0.2183 + }, + { + "epoch": 0.18480641047236326, + "lm_loss": 2.2605, + "step": 960, + "vm_loss": 0.1689 + }, + { + "epoch": 0.18480641047236326, + "lm_loss": 2.423, + "step": 960, + "vm_loss": 0.2152 + }, + { + "epoch": 0.18480641047236326, + "lm_loss": 2.3081, + "step": 960, + "vm_loss": 0.1349 + }, + { + "epoch": 0.18480641047236326, + "lm_loss": 1.4558, + "step": 960, + "vm_loss": 0.1423 + }, + { + "epoch": 0.18480641047236326, + "lm_loss": 2.3081, + "step": 960, + "vm_loss": 0.2512 + }, + { + "epoch": 0.18480641047236326, + "lm_loss": 2.286, + "step": 960, + "vm_loss": 0.1623 + }, + { + "epoch": 0.18480641047236326, + "lm_loss": 2.7513, + "step": 960, + "vm_loss": 0.1888 + }, + { + "epoch": 0.18499891714993863, + "grad_norm": 3.3534005649504213, + "learning_rate": 1.9795966993089574e-05, + "loss": 2.4932, + "step": 961 + }, + { + "epoch": 0.18519142382751402, + "grad_norm": 3.5507171279996403, + "learning_rate": 1.9795339902670893e-05, + "loss": 2.4604, + "step": 962 + }, + { + "epoch": 0.1853839305050894, + "grad_norm": 3.5148603601270105, + "learning_rate": 1.9794711860019852e-05, + "loss": 2.4512, + "step": 963 + }, + { + "epoch": 0.18557643718266478, + "grad_norm": 3.4062079659895774, + "learning_rate": 1.979408286519751e-05, + "loss": 2.5454, + "step": 964 + }, + { + "epoch": 0.18576894386024015, + "grad_norm": 3.997495292221776, + "learning_rate": 1.979345291826501e-05, + "loss": 2.4384, + "step": 965 + }, + { + "epoch": 0.18596145053781554, + "grad_norm": 3.4630392062229323, + "learning_rate": 1.9792822019283594e-05, + "loss": 2.4567, + "step": 966 + }, + { + "epoch": 0.1861539572153909, + "grad_norm": 4.048113515227834, + "learning_rate": 1.979219016831459e-05, + "loss": 2.4567, + "step": 967 + }, + { + "epoch": 0.18634646389296627, + "grad_norm": 3.4428917692998025, + "learning_rate": 1.979155736541942e-05, + "loss": 2.4915, + "step": 968 + }, + { + "epoch": 0.18634646389296627, + "lm_loss": 2.6, + "step": 968, + "vm_loss": 0.2593 + }, + { + "epoch": 0.18634646389296627, + "lm_loss": 2.2453, + "step": 968, + "vm_loss": 0.1728 + }, + { + "epoch": 0.18634646389296627, + "lm_loss": 2.4075, + "step": 968, + "vm_loss": 0.1269 + }, + { + "epoch": 0.18634646389296627, + "lm_loss": 2.2265, + "step": 968, + "vm_loss": 0.1433 + }, + { + "epoch": 0.18634646389296627, + "lm_loss": 2.3331, + "step": 968, + "vm_loss": 0.2269 + }, + { + "epoch": 0.18634646389296627, + "lm_loss": 2.0818, + "step": 968, + "vm_loss": 0.2031 + }, + { + "epoch": 0.18634646389296627, + "lm_loss": 2.2525, + "step": 968, + "vm_loss": 0.12 + }, + { + "epoch": 0.18634646389296627, + "lm_loss": 2.566, + "step": 968, + "vm_loss": 0.1129 + }, + { + "epoch": 0.18653897057054167, + "grad_norm": 3.9272927603774876, + "learning_rate": 1.9790923610659604e-05, + "loss": 2.4994, + "step": 969 + }, + { + "epoch": 0.18673147724811703, + "grad_norm": 3.6329926998473305, + "learning_rate": 1.9790288904096754e-05, + "loss": 2.4511, + "step": 970 + }, + { + "epoch": 0.18692398392569243, + "grad_norm": 4.653108912322447, + "learning_rate": 1.9789653245792565e-05, + "loss": 2.4973, + "step": 971 + }, + { + "epoch": 0.1871164906032678, + "grad_norm": 4.087354006557861, + "learning_rate": 1.9789016635808836e-05, + "loss": 2.5106, + "step": 972 + }, + { + "epoch": 0.1873089972808432, + "grad_norm": 3.4705490011992333, + "learning_rate": 1.978837907420745e-05, + "loss": 2.4917, + "step": 973 + }, + { + "epoch": 0.18750150395841855, + "grad_norm": 3.780693667091048, + "learning_rate": 1.978774056105039e-05, + "loss": 2.5198, + "step": 974 + }, + { + "epoch": 0.18769401063599395, + "grad_norm": 3.306947713109733, + "learning_rate": 1.9787101096399727e-05, + "loss": 2.4992, + "step": 975 + }, + { + "epoch": 0.1878865173135693, + "grad_norm": 3.9278538748370773, + "learning_rate": 1.978646068031762e-05, + "loss": 2.4832, + "step": 976 + }, + { + "epoch": 0.1878865173135693, + "lm_loss": 2.3012, + "step": 976, + "vm_loss": 0.1794 + }, + { + "epoch": 0.1878865173135693, + "lm_loss": 2.45, + "step": 976, + "vm_loss": 0.1252 + }, + { + "epoch": 0.1878865173135693, + "lm_loss": 2.3898, + "step": 976, + "vm_loss": 0.1251 + }, + { + "epoch": 0.1878865173135693, + "lm_loss": 2.3723, + "step": 976, + "vm_loss": 0.1905 + }, + { + "epoch": 0.1878865173135693, + "lm_loss": 2.063, + "step": 976, + "vm_loss": 0.1438 + }, + { + "epoch": 0.1878865173135693, + "lm_loss": 2.4101, + "step": 976, + "vm_loss": 0.1816 + }, + { + "epoch": 0.1878865173135693, + "lm_loss": 2.1547, + "step": 976, + "vm_loss": 0.1501 + }, + { + "epoch": 0.1878865173135693, + "lm_loss": 2.0522, + "step": 976, + "vm_loss": 0.1487 + }, + { + "epoch": 0.1880790239911447, + "grad_norm": 4.575225456367262, + "learning_rate": 1.978581931286633e-05, + "loss": 2.4422, + "step": 977 + }, + { + "epoch": 0.18827153066872007, + "grad_norm": 3.8872161486108014, + "learning_rate": 1.9785176994108207e-05, + "loss": 2.4964, + "step": 978 + }, + { + "epoch": 0.18846403734629544, + "grad_norm": 4.137412348549307, + "learning_rate": 1.9784533724105692e-05, + "loss": 2.5189, + "step": 979 + }, + { + "epoch": 0.18865654402387083, + "grad_norm": 4.200131678711624, + "learning_rate": 1.9783889502921315e-05, + "loss": 2.4432, + "step": 980 + }, + { + "epoch": 0.1888490507014462, + "grad_norm": 3.2020051860280714, + "learning_rate": 1.9783244330617713e-05, + "loss": 2.5203, + "step": 981 + }, + { + "epoch": 0.1890415573790216, + "grad_norm": 4.378339167178262, + "learning_rate": 1.9782598207257592e-05, + "loss": 2.514, + "step": 982 + }, + { + "epoch": 0.18923406405659696, + "grad_norm": 3.8179046337961053, + "learning_rate": 1.978195113290377e-05, + "loss": 2.4961, + "step": 983 + }, + { + "epoch": 0.18942657073417235, + "grad_norm": 3.795245357283514, + "learning_rate": 1.978130310761915e-05, + "loss": 2.4645, + "step": 984 + }, + { + "epoch": 0.18942657073417235, + "lm_loss": 2.3467, + "step": 984, + "vm_loss": 0.1505 + }, + { + "epoch": 0.18942657073417235, + "lm_loss": 2.3725, + "step": 984, + "vm_loss": 0.2331 + }, + { + "epoch": 0.18942657073417235, + "lm_loss": 2.5058, + "step": 984, + "vm_loss": 0.1347 + }, + { + "epoch": 0.18942657073417235, + "lm_loss": 2.4235, + "step": 984, + "vm_loss": 0.1706 + }, + { + "epoch": 0.18942657073417235, + "lm_loss": 2.4651, + "step": 984, + "vm_loss": 0.1275 + }, + { + "epoch": 0.18942657073417235, + "lm_loss": 2.2615, + "step": 984, + "vm_loss": 0.1337 + }, + { + "epoch": 0.18942657073417235, + "lm_loss": 2.19, + "step": 984, + "vm_loss": 0.218 + }, + { + "epoch": 0.18942657073417235, + "lm_loss": 2.3705, + "step": 984, + "vm_loss": 0.1926 + }, + { + "epoch": 0.18961907741174772, + "grad_norm": 3.7093620016369746, + "learning_rate": 1.978065413146673e-05, + "loss": 2.4914, + "step": 985 + }, + { + "epoch": 0.1898115840893231, + "grad_norm": 3.7049314660779817, + "learning_rate": 1.9780004204509594e-05, + "loss": 2.4619, + "step": 986 + }, + { + "epoch": 0.19000409076689848, + "grad_norm": 3.8158072288365994, + "learning_rate": 1.977935332681093e-05, + "loss": 2.4841, + "step": 987 + }, + { + "epoch": 0.19019659744447384, + "grad_norm": 3.6220529851068846, + "learning_rate": 1.9778701498434007e-05, + "loss": 2.5257, + "step": 988 + }, + { + "epoch": 0.19038910412204924, + "grad_norm": 4.103434716649326, + "learning_rate": 1.977804871944219e-05, + "loss": 2.5211, + "step": 989 + }, + { + "epoch": 0.1905816107996246, + "grad_norm": 4.3867567449904685, + "learning_rate": 1.9777394989898947e-05, + "loss": 2.4672, + "step": 990 + }, + { + "epoch": 0.1907741174772, + "grad_norm": 3.935406093562076, + "learning_rate": 1.9776740309867813e-05, + "loss": 2.4706, + "step": 991 + }, + { + "epoch": 0.19096662415477536, + "grad_norm": 3.5199978074956046, + "learning_rate": 1.9776084679412443e-05, + "loss": 2.5139, + "step": 992 + }, + { + "epoch": 0.19096662415477536, + "lm_loss": 2.2626, + "step": 992, + "vm_loss": 0.1644 + }, + { + "epoch": 0.19096662415477536, + "lm_loss": 2.2582, + "step": 992, + "vm_loss": 0.1594 + }, + { + "epoch": 0.19096662415477536, + "lm_loss": 2.5946, + "step": 992, + "vm_loss": 0.1621 + }, + { + "epoch": 0.19096662415477536, + "lm_loss": 2.187, + "step": 992, + "vm_loss": 0.1913 + }, + { + "epoch": 0.19096662415477536, + "lm_loss": 2.219, + "step": 992, + "vm_loss": 0.2317 + }, + { + "epoch": 0.19096662415477536, + "lm_loss": 2.1164, + "step": 992, + "vm_loss": 0.1684 + }, + { + "epoch": 0.19096662415477536, + "lm_loss": 2.4844, + "step": 992, + "vm_loss": 0.2143 + }, + { + "epoch": 0.19096662415477536, + "lm_loss": 2.3042, + "step": 992, + "vm_loss": 0.1737 + }, + { + "epoch": 0.19115913083235075, + "grad_norm": 4.032690702016478, + "learning_rate": 1.977542809859657e-05, + "loss": 2.487, + "step": 993 + }, + { + "epoch": 0.19135163750992612, + "grad_norm": 3.7662215343923013, + "learning_rate": 1.977477056748402e-05, + "loss": 2.458, + "step": 994 + }, + { + "epoch": 0.1915441441875015, + "grad_norm": 3.4108699826658255, + "learning_rate": 1.9774112086138718e-05, + "loss": 2.4901, + "step": 995 + }, + { + "epoch": 0.19173665086507688, + "grad_norm": 3.262581332430276, + "learning_rate": 1.9773452654624675e-05, + "loss": 2.4222, + "step": 996 + }, + { + "epoch": 0.19192915754265227, + "grad_norm": 3.4279722162029973, + "learning_rate": 1.9772792273005993e-05, + "loss": 2.4488, + "step": 997 + }, + { + "epoch": 0.19212166422022764, + "grad_norm": 3.490304265099083, + "learning_rate": 1.9772130941346866e-05, + "loss": 2.5138, + "step": 998 + }, + { + "epoch": 0.192314170897803, + "grad_norm": 3.4614595889352406, + "learning_rate": 1.9771468659711595e-05, + "loss": 2.4717, + "step": 999 + }, + { + "epoch": 0.1925066775753784, + "grad_norm": 3.9858027067320045, + "learning_rate": 1.9770805428164558e-05, + "loss": 2.5017, + "step": 1000 + }, + { + "epoch": 0.1925066775753784, + "lm_loss": 1.9985, + "step": 1000, + "vm_loss": 0.1912 + }, + { + "epoch": 0.1925066775753784, + "lm_loss": 2.2382, + "step": 1000, + "vm_loss": 0.305 + }, + { + "epoch": 0.1925066775753784, + "lm_loss": 2.1586, + "step": 1000, + "vm_loss": 0.2122 + }, + { + "epoch": 0.1925066775753784, + "lm_loss": 2.445, + "step": 1000, + "vm_loss": 0.1951 + }, + { + "epoch": 0.1925066775753784, + "lm_loss": 1.9637, + "step": 1000, + "vm_loss": 0.2108 + }, + { + "epoch": 0.1925066775753784, + "lm_loss": 2.442, + "step": 1000, + "vm_loss": 0.1698 + }, + { + "epoch": 0.1925066775753784, + "lm_loss": 2.3797, + "step": 1000, + "vm_loss": 0.1251 + }, + { + "epoch": 0.1925066775753784, + "lm_loss": 2.2713, + "step": 1000, + "vm_loss": 0.2927 + }, + { + "epoch": 0.19269918425295376, + "grad_norm": 3.957687521447021, + "learning_rate": 1.9770141246770226e-05, + "loss": 2.4373, + "step": 1001 + }, + { + "epoch": 0.19289169093052916, + "grad_norm": 3.4806180075557793, + "learning_rate": 1.976947611559317e-05, + "loss": 2.5069, + "step": 1002 + }, + { + "epoch": 0.19308419760810452, + "grad_norm": 3.5292738481587933, + "learning_rate": 1.9768810034698044e-05, + "loss": 2.4729, + "step": 1003 + }, + { + "epoch": 0.19327670428567992, + "grad_norm": 3.4395583234702505, + "learning_rate": 1.9768143004149607e-05, + "loss": 2.4949, + "step": 1004 + }, + { + "epoch": 0.19346921096325528, + "grad_norm": 3.6927098490518886, + "learning_rate": 1.9767475024012698e-05, + "loss": 2.5255, + "step": 1005 + }, + { + "epoch": 0.19366171764083068, + "grad_norm": 3.461277809334239, + "learning_rate": 1.9766806094352252e-05, + "loss": 2.5147, + "step": 1006 + }, + { + "epoch": 0.19385422431840604, + "grad_norm": 3.4270695858874936, + "learning_rate": 1.9766136215233297e-05, + "loss": 2.4699, + "step": 1007 + }, + { + "epoch": 0.19404673099598144, + "grad_norm": 4.211807705414343, + "learning_rate": 1.9765465386720963e-05, + "loss": 2.465, + "step": 1008 + }, + { + "epoch": 0.19404673099598144, + "lm_loss": 2.1625, + "step": 1008, + "vm_loss": 0.1435 + }, + { + "epoch": 0.19404673099598144, + "lm_loss": 2.4017, + "step": 1008, + "vm_loss": 0.2301 + }, + { + "epoch": 0.19404673099598144, + "lm_loss": 2.5536, + "step": 1008, + "vm_loss": 0.1806 + }, + { + "epoch": 0.19404673099598144, + "lm_loss": 2.126, + "step": 1008, + "vm_loss": 0.3028 + }, + { + "epoch": 0.19404673099598144, + "lm_loss": 2.1502, + "step": 1008, + "vm_loss": 0.1585 + }, + { + "epoch": 0.19404673099598144, + "lm_loss": 2.2478, + "step": 1008, + "vm_loss": 0.1854 + }, + { + "epoch": 0.19404673099598144, + "lm_loss": 2.1107, + "step": 1008, + "vm_loss": 0.1328 + }, + { + "epoch": 0.19404673099598144, + "lm_loss": 2.1554, + "step": 1008, + "vm_loss": 0.1652 + }, + { + "epoch": 0.1942392376735568, + "grad_norm": 3.159723117226434, + "learning_rate": 1.9764793608880454e-05, + "loss": 2.4801, + "step": 1009 + }, + { + "epoch": 0.19443174435113217, + "grad_norm": 3.540448326004961, + "learning_rate": 1.9764120881777077e-05, + "loss": 2.4163, + "step": 1010 + }, + { + "epoch": 0.19462425102870756, + "grad_norm": 3.776630767621123, + "learning_rate": 1.976344720547623e-05, + "loss": 2.4728, + "step": 1011 + }, + { + "epoch": 0.19481675770628293, + "grad_norm": 2.9084201315534233, + "learning_rate": 1.976277258004341e-05, + "loss": 2.4444, + "step": 1012 + }, + { + "epoch": 0.19500926438385832, + "grad_norm": 3.2219226578541766, + "learning_rate": 1.9762097005544187e-05, + "loss": 2.4562, + "step": 1013 + }, + { + "epoch": 0.1952017710614337, + "grad_norm": 3.5620050413053703, + "learning_rate": 1.976142048204424e-05, + "loss": 2.4781, + "step": 1014 + }, + { + "epoch": 0.19539427773900908, + "grad_norm": 3.1703341459018115, + "learning_rate": 1.976074300960934e-05, + "loss": 2.4632, + "step": 1015 + }, + { + "epoch": 0.19558678441658445, + "grad_norm": 3.616403122568653, + "learning_rate": 1.9760064588305347e-05, + "loss": 2.4929, + "step": 1016 + }, + { + "epoch": 0.19558678441658445, + "lm_loss": 2.2893, + "step": 1016, + "vm_loss": 0.1543 + }, + { + "epoch": 0.19558678441658445, + "lm_loss": 2.3458, + "step": 1016, + "vm_loss": 0.1698 + }, + { + "epoch": 0.19558678441658445, + "lm_loss": 2.4711, + "step": 1016, + "vm_loss": 0.1499 + }, + { + "epoch": 0.19558678441658445, + "lm_loss": 2.4778, + "step": 1016, + "vm_loss": 0.1648 + }, + { + "epoch": 0.19558678441658445, + "lm_loss": 2.7431, + "step": 1016, + "vm_loss": 0.1541 + }, + { + "epoch": 0.19558678441658445, + "lm_loss": 2.2508, + "step": 1016, + "vm_loss": 0.1661 + }, + { + "epoch": 0.19558678441658445, + "lm_loss": 2.3955, + "step": 1016, + "vm_loss": 0.19 + }, + { + "epoch": 0.19558678441658445, + "lm_loss": 2.3489, + "step": 1016, + "vm_loss": 0.1949 + }, + { + "epoch": 0.19577929109415984, + "grad_norm": 3.2476158500938044, + "learning_rate": 1.9759385218198203e-05, + "loss": 2.4891, + "step": 1017 + }, + { + "epoch": 0.1959717977717352, + "grad_norm": 4.018487468468996, + "learning_rate": 1.9758704899353954e-05, + "loss": 2.4978, + "step": 1018 + }, + { + "epoch": 0.19616430444931057, + "grad_norm": 3.491169193715776, + "learning_rate": 1.9758023631838744e-05, + "loss": 2.4992, + "step": 1019 + }, + { + "epoch": 0.19635681112688597, + "grad_norm": 4.7145408455772335, + "learning_rate": 1.9757341415718794e-05, + "loss": 2.5322, + "step": 1020 + }, + { + "epoch": 0.19654931780446133, + "grad_norm": 3.162191584676517, + "learning_rate": 1.9756658251060425e-05, + "loss": 2.5072, + "step": 1021 + }, + { + "epoch": 0.19674182448203673, + "grad_norm": 3.818865511666175, + "learning_rate": 1.9755974137930053e-05, + "loss": 2.466, + "step": 1022 + }, + { + "epoch": 0.1969343311596121, + "grad_norm": 3.5139150620052835, + "learning_rate": 1.9755289076394176e-05, + "loss": 2.4518, + "step": 1023 + }, + { + "epoch": 0.19712683783718749, + "grad_norm": 3.7638917320882177, + "learning_rate": 1.9754603066519395e-05, + "loss": 2.4568, + "step": 1024 + }, + { + "epoch": 0.19712683783718749, + "lm_loss": 2.2327, + "step": 1024, + "vm_loss": 0.2077 + }, + { + "epoch": 0.19712683783718749, + "lm_loss": 1.857, + "step": 1024, + "vm_loss": 0.1574 + }, + { + "epoch": 0.19712683783718749, + "lm_loss": 2.0651, + "step": 1024, + "vm_loss": 0.1599 + }, + { + "epoch": 0.19712683783718749, + "lm_loss": 2.5391, + "step": 1024, + "vm_loss": 0.1502 + }, + { + "epoch": 0.19712683783718749, + "lm_loss": 2.4881, + "step": 1024, + "vm_loss": 0.1376 + }, + { + "epoch": 0.19712683783718749, + "lm_loss": 2.3546, + "step": 1024, + "vm_loss": 0.1664 + }, + { + "epoch": 0.19712683783718749, + "lm_loss": 2.4412, + "step": 1024, + "vm_loss": 0.1202 + }, + { + "epoch": 0.19712683783718749, + "lm_loss": 2.289, + "step": 1024, + "vm_loss": 0.1884 + }, + { + "epoch": 0.19731934451476285, + "grad_norm": 3.386152807398567, + "learning_rate": 1.97539161083724e-05, + "loss": 2.5099, + "step": 1025 + }, + { + "epoch": 0.19751185119233824, + "grad_norm": 3.9435749377278726, + "learning_rate": 1.9753228202019966e-05, + "loss": 2.4697, + "step": 1026 + }, + { + "epoch": 0.1977043578699136, + "grad_norm": 3.5964264735097284, + "learning_rate": 1.9752539347528973e-05, + "loss": 2.4387, + "step": 1027 + }, + { + "epoch": 0.197896864547489, + "grad_norm": 3.9519620118813883, + "learning_rate": 1.9751849544966385e-05, + "loss": 2.4901, + "step": 1028 + }, + { + "epoch": 0.19808937122506437, + "grad_norm": 3.3338873628092025, + "learning_rate": 1.9751158794399255e-05, + "loss": 2.4591, + "step": 1029 + }, + { + "epoch": 0.19828187790263974, + "grad_norm": 3.4342699536094448, + "learning_rate": 1.9750467095894738e-05, + "loss": 2.4724, + "step": 1030 + }, + { + "epoch": 0.19847438458021513, + "grad_norm": 3.987252007022662, + "learning_rate": 1.9749774449520078e-05, + "loss": 2.4988, + "step": 1031 + }, + { + "epoch": 0.1986668912577905, + "grad_norm": 3.2412066444484293, + "learning_rate": 1.97490808553426e-05, + "loss": 2.4671, + "step": 1032 + }, + { + "epoch": 0.1986668912577905, + "lm_loss": 2.4341, + "step": 1032, + "vm_loss": 0.1054 + }, + { + "epoch": 0.1986668912577905, + "lm_loss": 2.0009, + "step": 1032, + "vm_loss": 0.1528 + }, + { + "epoch": 0.1986668912577905, + "lm_loss": 2.0975, + "step": 1032, + "vm_loss": 0.2117 + }, + { + "epoch": 0.1986668912577905, + "lm_loss": 2.3541, + "step": 1032, + "vm_loss": 0.1733 + }, + { + "epoch": 0.1986668912577905, + "lm_loss": 2.2321, + "step": 1032, + "vm_loss": 0.1368 + }, + { + "epoch": 0.1986668912577905, + "lm_loss": 2.0613, + "step": 1032, + "vm_loss": 0.1733 + }, + { + "epoch": 0.1986668912577905, + "lm_loss": 2.4225, + "step": 1032, + "vm_loss": 0.1611 + }, + { + "epoch": 0.1986668912577905, + "lm_loss": 2.1777, + "step": 1032, + "vm_loss": 0.1577 + }, + { + "epoch": 0.1988593979353659, + "grad_norm": 3.4767540050777366, + "learning_rate": 1.974838631342974e-05, + "loss": 2.4279, + "step": 1033 + }, + { + "epoch": 0.19905190461294126, + "grad_norm": 3.4765721191917813, + "learning_rate": 1.9747690823849012e-05, + "loss": 2.4845, + "step": 1034 + }, + { + "epoch": 0.19924441129051665, + "grad_norm": 4.157545201488476, + "learning_rate": 1.9746994386668022e-05, + "loss": 2.4575, + "step": 1035 + }, + { + "epoch": 0.19943691796809201, + "grad_norm": 3.5651942757837873, + "learning_rate": 1.974629700195448e-05, + "loss": 2.4731, + "step": 1036 + }, + { + "epoch": 0.1996294246456674, + "grad_norm": 3.3897257561244434, + "learning_rate": 1.974559866977618e-05, + "loss": 2.496, + "step": 1037 + }, + { + "epoch": 0.19982193132324277, + "grad_norm": 3.8318341664084294, + "learning_rate": 1.9744899390201006e-05, + "loss": 2.5046, + "step": 1038 + }, + { + "epoch": 0.20001443800081814, + "grad_norm": 3.417175940392068, + "learning_rate": 1.9744199163296934e-05, + "loss": 2.4673, + "step": 1039 + }, + { + "epoch": 0.20020694467839353, + "grad_norm": 3.3985560683942406, + "learning_rate": 1.9743497989132045e-05, + "loss": 2.4647, + "step": 1040 + }, + { + "epoch": 0.20020694467839353, + "lm_loss": 2.0189, + "step": 1040, + "vm_loss": 0.1312 + }, + { + "epoch": 0.20020694467839353, + "lm_loss": 2.1004, + "step": 1040, + "vm_loss": 0.2506 + }, + { + "epoch": 0.20020694467839353, + "lm_loss": 2.2524, + "step": 1040, + "vm_loss": 0.1759 + }, + { + "epoch": 0.20020694467839353, + "lm_loss": 2.1861, + "step": 1040, + "vm_loss": 0.1829 + }, + { + "epoch": 0.20020694467839353, + "lm_loss": 2.1416, + "step": 1040, + "vm_loss": 0.1625 + }, + { + "epoch": 0.20020694467839353, + "lm_loss": 2.2216, + "step": 1040, + "vm_loss": 0.12 + }, + { + "epoch": 0.20020694467839353, + "lm_loss": 2.3397, + "step": 1040, + "vm_loss": 0.1952 + }, + { + "epoch": 0.20020694467839353, + "lm_loss": 2.4533, + "step": 1040, + "vm_loss": 0.1584 + }, + { + "epoch": 0.2003994513559689, + "grad_norm": 3.8489824363798104, + "learning_rate": 1.974279586777449e-05, + "loss": 2.441, + "step": 1041 + }, + { + "epoch": 0.2005919580335443, + "grad_norm": 3.9382097624886683, + "learning_rate": 1.974209279929253e-05, + "loss": 2.4659, + "step": 1042 + }, + { + "epoch": 0.20078446471111966, + "grad_norm": 3.4128789112293547, + "learning_rate": 1.9741388783754515e-05, + "loss": 2.4814, + "step": 1043 + }, + { + "epoch": 0.20097697138869505, + "grad_norm": 4.191056146380478, + "learning_rate": 1.974068382122888e-05, + "loss": 2.4609, + "step": 1044 + }, + { + "epoch": 0.20116947806627042, + "grad_norm": 3.4651483243464045, + "learning_rate": 1.973997791178416e-05, + "loss": 2.49, + "step": 1045 + }, + { + "epoch": 0.2013619847438458, + "grad_norm": 3.5084783935666466, + "learning_rate": 1.9739271055488972e-05, + "loss": 2.4891, + "step": 1046 + }, + { + "epoch": 0.20155449142142118, + "grad_norm": 4.0241057682700925, + "learning_rate": 1.973856325241204e-05, + "loss": 2.4783, + "step": 1047 + }, + { + "epoch": 0.20174699809899657, + "grad_norm": 3.6096856231782635, + "learning_rate": 1.9737854502622165e-05, + "loss": 2.4573, + "step": 1048 + }, + { + "epoch": 0.20174699809899657, + "lm_loss": 1.9424, + "step": 1048, + "vm_loss": 0.1745 + }, + { + "epoch": 0.20174699809899657, + "lm_loss": 2.4505, + "step": 1048, + "vm_loss": 0.1864 + }, + { + "epoch": 0.20174699809899657, + "lm_loss": 2.2513, + "step": 1048, + "vm_loss": 0.0989 + }, + { + "epoch": 0.20174699809899657, + "lm_loss": 2.18, + "step": 1048, + "vm_loss": 0.2511 + }, + { + "epoch": 0.20174699809899657, + "lm_loss": 2.4586, + "step": 1048, + "vm_loss": 0.1759 + }, + { + "epoch": 0.20174699809899657, + "lm_loss": 2.4717, + "step": 1048, + "vm_loss": 0.1155 + }, + { + "epoch": 0.20174699809899657, + "lm_loss": 2.295, + "step": 1048, + "vm_loss": 0.2642 + }, + { + "epoch": 0.20174699809899657, + "lm_loss": 2.149, + "step": 1048, + "vm_loss": 0.1815 + }, + { + "epoch": 0.20193950477657194, + "grad_norm": 3.6071298244581804, + "learning_rate": 1.9737144806188248e-05, + "loss": 2.4893, + "step": 1049 + }, + { + "epoch": 0.2021320114541473, + "grad_norm": 3.6876354499888166, + "learning_rate": 1.9736434163179284e-05, + "loss": 2.5184, + "step": 1050 + }, + { + "epoch": 0.2023245181317227, + "grad_norm": 3.7532035449555123, + "learning_rate": 1.973572257366435e-05, + "loss": 2.4624, + "step": 1051 + }, + { + "epoch": 0.20251702480929806, + "grad_norm": 4.170890971005629, + "learning_rate": 1.9735010037712628e-05, + "loss": 2.4604, + "step": 1052 + }, + { + "epoch": 0.20270953148687346, + "grad_norm": 4.349845014346077, + "learning_rate": 1.9734296555393385e-05, + "loss": 2.5229, + "step": 1053 + }, + { + "epoch": 0.20290203816444882, + "grad_norm": 3.7706674313057746, + "learning_rate": 1.9733582126775978e-05, + "loss": 2.4935, + "step": 1054 + }, + { + "epoch": 0.20309454484202422, + "grad_norm": 3.66283713202524, + "learning_rate": 1.9732866751929856e-05, + "loss": 2.4703, + "step": 1055 + }, + { + "epoch": 0.20328705151959958, + "grad_norm": 4.190262556624985, + "learning_rate": 1.973215043092457e-05, + "loss": 2.4534, + "step": 1056 + }, + { + "epoch": 0.20328705151959958, + "lm_loss": 2.0002, + "step": 1056, + "vm_loss": 0.1778 + }, + { + "epoch": 0.20328705151959958, + "lm_loss": 2.1351, + "step": 1056, + "vm_loss": 0.1634 + }, + { + "epoch": 0.20328705151959958, + "lm_loss": 2.374, + "step": 1056, + "vm_loss": 0.2017 + }, + { + "epoch": 0.20328705151959958, + "lm_loss": 2.3327, + "step": 1056, + "vm_loss": 0.2053 + }, + { + "epoch": 0.20328705151959958, + "lm_loss": 2.3892, + "step": 1056, + "vm_loss": 0.209 + }, + { + "epoch": 0.20328705151959958, + "lm_loss": 2.1823, + "step": 1056, + "vm_loss": 0.1269 + }, + { + "epoch": 0.20328705151959958, + "lm_loss": 2.6833, + "step": 1056, + "vm_loss": 0.1803 + }, + { + "epoch": 0.20328705151959958, + "lm_loss": 2.233, + "step": 1056, + "vm_loss": 0.0971 + }, + { + "epoch": 0.20347955819717498, + "grad_norm": 3.0310609474543178, + "learning_rate": 1.973143316382975e-05, + "loss": 2.4414, + "step": 1057 + }, + { + "epoch": 0.20367206487475034, + "grad_norm": 3.6354270125806094, + "learning_rate": 1.9730714950715125e-05, + "loss": 2.4536, + "step": 1058 + }, + { + "epoch": 0.20386457155232574, + "grad_norm": 3.506230343601751, + "learning_rate": 1.9729995791650513e-05, + "loss": 2.4605, + "step": 1059 + }, + { + "epoch": 0.2040570782299011, + "grad_norm": 3.05679376572762, + "learning_rate": 1.9729275686705832e-05, + "loss": 2.4478, + "step": 1060 + }, + { + "epoch": 0.20424958490747647, + "grad_norm": 3.3411958432249764, + "learning_rate": 1.9728554635951077e-05, + "loss": 2.4445, + "step": 1061 + }, + { + "epoch": 0.20444209158505186, + "grad_norm": 3.0757403581456875, + "learning_rate": 1.9727832639456345e-05, + "loss": 2.4233, + "step": 1062 + }, + { + "epoch": 0.20463459826262723, + "grad_norm": 3.3958910262660913, + "learning_rate": 1.9727109697291828e-05, + "loss": 2.4771, + "step": 1063 + }, + { + "epoch": 0.20482710494020262, + "grad_norm": 3.5450974176658314, + "learning_rate": 1.9726385809527803e-05, + "loss": 2.4607, + "step": 1064 + }, + { + "epoch": 0.20482710494020262, + "lm_loss": 2.4008, + "step": 1064, + "vm_loss": 0.1815 + }, + { + "epoch": 0.20482710494020262, + "lm_loss": 2.5694, + "step": 1064, + "vm_loss": 0.1705 + }, + { + "epoch": 0.20482710494020262, + "lm_loss": 2.1236, + "step": 1064, + "vm_loss": 0.1912 + }, + { + "epoch": 0.20482710494020262, + "lm_loss": 2.3893, + "step": 1064, + "vm_loss": 0.1749 + }, + { + "epoch": 0.20482710494020262, + "lm_loss": 2.3014, + "step": 1064, + "vm_loss": 0.199 + }, + { + "epoch": 0.20482710494020262, + "lm_loss": 2.2917, + "step": 1064, + "vm_loss": 0.1506 + }, + { + "epoch": 0.20482710494020262, + "lm_loss": 2.3791, + "step": 1064, + "vm_loss": 0.1279 + }, + { + "epoch": 0.20482710494020262, + "lm_loss": 2.2845, + "step": 1064, + "vm_loss": 0.1706 + }, + { + "epoch": 0.20501961161777799, + "grad_norm": 3.673255582470371, + "learning_rate": 1.9725660976234637e-05, + "loss": 2.4733, + "step": 1065 + }, + { + "epoch": 0.20521211829535338, + "grad_norm": 3.5561683208373, + "learning_rate": 1.9724935197482798e-05, + "loss": 2.4681, + "step": 1066 + }, + { + "epoch": 0.20540462497292875, + "grad_norm": 3.9106349950784396, + "learning_rate": 1.9724208473342843e-05, + "loss": 2.4398, + "step": 1067 + }, + { + "epoch": 0.20559713165050414, + "grad_norm": 3.5814670866695906, + "learning_rate": 1.972348080388541e-05, + "loss": 2.4605, + "step": 1068 + }, + { + "epoch": 0.2057896383280795, + "grad_norm": 4.154509313324284, + "learning_rate": 1.9722752189181247e-05, + "loss": 2.5055, + "step": 1069 + }, + { + "epoch": 0.20598214500565487, + "grad_norm": 3.541309031296491, + "learning_rate": 1.9722022629301182e-05, + "loss": 2.4832, + "step": 1070 + }, + { + "epoch": 0.20617465168323026, + "grad_norm": 2.9521296949090616, + "learning_rate": 1.972129212431613e-05, + "loss": 2.519, + "step": 1071 + }, + { + "epoch": 0.20636715836080563, + "grad_norm": 3.652116900961156, + "learning_rate": 1.9720560674297118e-05, + "loss": 2.466, + "step": 1072 + }, + { + "epoch": 0.20636715836080563, + "lm_loss": 2.3107, + "step": 1072, + "vm_loss": 0.1926 + }, + { + "epoch": 0.20636715836080563, + "lm_loss": 2.3423, + "step": 1072, + "vm_loss": 0.1525 + }, + { + "epoch": 0.20636715836080563, + "lm_loss": 2.1638, + "step": 1072, + "vm_loss": 0.1813 + }, + { + "epoch": 0.20636715836080563, + "lm_loss": 1.954, + "step": 1072, + "vm_loss": 0.2159 + }, + { + "epoch": 0.20636715836080563, + "lm_loss": 2.1565, + "step": 1072, + "vm_loss": 0.2098 + }, + { + "epoch": 0.20636715836080563, + "lm_loss": 2.2434, + "step": 1072, + "vm_loss": 0.2372 + }, + { + "epoch": 0.20636715836080563, + "lm_loss": 2.3942, + "step": 1072, + "vm_loss": 0.1774 + }, + { + "epoch": 0.20636715836080563, + "lm_loss": 2.4193, + "step": 1072, + "vm_loss": 0.1884 + }, + { + "epoch": 0.20655966503838102, + "grad_norm": 3.462325461702056, + "learning_rate": 1.9719828279315244e-05, + "loss": 2.4689, + "step": 1073 + }, + { + "epoch": 0.2067521717159564, + "grad_norm": 3.4853023310226603, + "learning_rate": 1.9719094939441707e-05, + "loss": 2.4959, + "step": 1074 + }, + { + "epoch": 0.20694467839353178, + "grad_norm": 3.8084676541661566, + "learning_rate": 1.9718360654747803e-05, + "loss": 2.4487, + "step": 1075 + }, + { + "epoch": 0.20713718507110715, + "grad_norm": 3.307081892423532, + "learning_rate": 1.9717625425304903e-05, + "loss": 2.4767, + "step": 1076 + }, + { + "epoch": 0.20732969174868254, + "grad_norm": 3.156421642407463, + "learning_rate": 1.971688925118449e-05, + "loss": 2.4697, + "step": 1077 + }, + { + "epoch": 0.2075221984262579, + "grad_norm": 3.348708082298858, + "learning_rate": 1.971615213245813e-05, + "loss": 2.4388, + "step": 1078 + }, + { + "epoch": 0.2077147051038333, + "grad_norm": 2.9728931602673563, + "learning_rate": 1.971541406919747e-05, + "loss": 2.4722, + "step": 1079 + }, + { + "epoch": 0.20790721178140867, + "grad_norm": 3.5960490334354507, + "learning_rate": 1.9714675061474274e-05, + "loss": 2.5265, + "step": 1080 + }, + { + "epoch": 0.20790721178140867, + "lm_loss": 2.5364, + "step": 1080, + "vm_loss": 0.1945 + }, + { + "epoch": 0.20790721178140867, + "lm_loss": 2.1915, + "step": 1080, + "vm_loss": 0.1616 + }, + { + "epoch": 0.20790721178140867, + "lm_loss": 2.2793, + "step": 1080, + "vm_loss": 0.1973 + }, + { + "epoch": 0.20790721178140867, + "lm_loss": 2.1183, + "step": 1080, + "vm_loss": 0.1402 + }, + { + "epoch": 0.20790721178140867, + "lm_loss": 2.3568, + "step": 1080, + "vm_loss": 0.1577 + }, + { + "epoch": 0.20790721178140867, + "lm_loss": 2.4381, + "step": 1080, + "vm_loss": 0.1661 + }, + { + "epoch": 0.20790721178140867, + "lm_loss": 2.3992, + "step": 1080, + "vm_loss": 0.1703 + }, + { + "epoch": 0.20790721178140867, + "lm_loss": 2.3177, + "step": 1080, + "vm_loss": 0.1642 + }, + { + "epoch": 0.20809971845898403, + "grad_norm": 3.838037734841467, + "learning_rate": 1.9713935109360366e-05, + "loss": 2.4503, + "step": 1081 + }, + { + "epoch": 0.20829222513655943, + "grad_norm": 3.3836744697301504, + "learning_rate": 1.9713194212927694e-05, + "loss": 2.5003, + "step": 1082 + }, + { + "epoch": 0.2084847318141348, + "grad_norm": 3.268022060949484, + "learning_rate": 1.9712452372248277e-05, + "loss": 2.4584, + "step": 1083 + }, + { + "epoch": 0.2086772384917102, + "grad_norm": 3.0971479536525734, + "learning_rate": 1.9711709587394226e-05, + "loss": 2.4506, + "step": 1084 + }, + { + "epoch": 0.20886974516928555, + "grad_norm": 3.098899409918459, + "learning_rate": 1.9710965858437757e-05, + "loss": 2.4733, + "step": 1085 + }, + { + "epoch": 0.20906225184686095, + "grad_norm": 2.973243507493669, + "learning_rate": 1.9710221185451166e-05, + "loss": 2.4794, + "step": 1086 + }, + { + "epoch": 0.2092547585244363, + "grad_norm": 2.983289905551904, + "learning_rate": 1.9709475568506847e-05, + "loss": 2.4554, + "step": 1087 + }, + { + "epoch": 0.2094472652020117, + "grad_norm": 3.5919574904853717, + "learning_rate": 1.9708729007677282e-05, + "loss": 2.4788, + "step": 1088 + }, + { + "epoch": 0.2094472652020117, + "lm_loss": 2.3375, + "step": 1088, + "vm_loss": 0.2891 + }, + { + "epoch": 0.2094472652020117, + "lm_loss": 2.3548, + "step": 1088, + "vm_loss": 0.1501 + }, + { + "epoch": 0.2094472652020117, + "lm_loss": 2.1963, + "step": 1088, + "vm_loss": 0.1994 + }, + { + "epoch": 0.2094472652020117, + "lm_loss": 2.3831, + "step": 1088, + "vm_loss": 0.1719 + }, + { + "epoch": 0.2094472652020117, + "lm_loss": 2.0638, + "step": 1088, + "vm_loss": 0.2233 + }, + { + "epoch": 0.2094472652020117, + "lm_loss": 2.2605, + "step": 1088, + "vm_loss": 0.1999 + }, + { + "epoch": 0.2094472652020117, + "lm_loss": 2.4435, + "step": 1088, + "vm_loss": 0.2177 + }, + { + "epoch": 0.2094472652020117, + "lm_loss": 2.3499, + "step": 1088, + "vm_loss": 0.1966 + }, + { + "epoch": 0.20963977187958707, + "grad_norm": 3.107849031762314, + "learning_rate": 1.970798150303505e-05, + "loss": 2.4858, + "step": 1089 + }, + { + "epoch": 0.20983227855716244, + "grad_norm": 3.9196148085746296, + "learning_rate": 1.970723305465281e-05, + "loss": 2.4468, + "step": 1090 + }, + { + "epoch": 0.21002478523473783, + "grad_norm": 2.832563062531479, + "learning_rate": 1.9706483662603322e-05, + "loss": 2.4646, + "step": 1091 + }, + { + "epoch": 0.2102172919123132, + "grad_norm": 3.8235081517885434, + "learning_rate": 1.9705733326959448e-05, + "loss": 2.4678, + "step": 1092 + }, + { + "epoch": 0.2104097985898886, + "grad_norm": 3.7969224590107458, + "learning_rate": 1.9704982047794118e-05, + "loss": 2.4776, + "step": 1093 + }, + { + "epoch": 0.21060230526746396, + "grad_norm": 3.6819414000725974, + "learning_rate": 1.970422982518037e-05, + "loss": 2.4803, + "step": 1094 + }, + { + "epoch": 0.21079481194503935, + "grad_norm": 3.6814781999598116, + "learning_rate": 1.9703476659191326e-05, + "loss": 2.4354, + "step": 1095 + }, + { + "epoch": 0.21098731862261472, + "grad_norm": 3.3036012385776443, + "learning_rate": 1.9702722549900214e-05, + "loss": 2.4561, + "step": 1096 + }, + { + "epoch": 0.21098731862261472, + "lm_loss": 2.2784, + "step": 1096, + "vm_loss": 0.1865 + }, + { + "epoch": 0.21098731862261472, + "lm_loss": 2.6026, + "step": 1096, + "vm_loss": 0.1025 + }, + { + "epoch": 0.21098731862261472, + "lm_loss": 2.4877, + "step": 1096, + "vm_loss": 0.2261 + }, + { + "epoch": 0.21098731862261472, + "lm_loss": 2.5044, + "step": 1096, + "vm_loss": 0.2337 + }, + { + "epoch": 0.21098731862261472, + "lm_loss": 2.3991, + "step": 1096, + "vm_loss": 0.1334 + }, + { + "epoch": 0.21098731862261472, + "lm_loss": 2.0255, + "step": 1096, + "vm_loss": 0.171 + }, + { + "epoch": 0.21098731862261472, + "lm_loss": 1.956, + "step": 1096, + "vm_loss": 0.1324 + }, + { + "epoch": 0.21098731862261472, + "lm_loss": 2.2897, + "step": 1096, + "vm_loss": 0.1442 + }, + { + "epoch": 0.2111798253001901, + "grad_norm": 3.5654036696633677, + "learning_rate": 1.9701967497380334e-05, + "loss": 2.468, + "step": 1097 + }, + { + "epoch": 0.21137233197776548, + "grad_norm": 3.22316907138247, + "learning_rate": 1.9701211501705083e-05, + "loss": 2.4887, + "step": 1098 + }, + { + "epoch": 0.21156483865534087, + "grad_norm": 3.043080031212153, + "learning_rate": 1.9700454562947966e-05, + "loss": 2.4256, + "step": 1099 + }, + { + "epoch": 0.21175734533291624, + "grad_norm": 3.6046942557381687, + "learning_rate": 1.9699696681182557e-05, + "loss": 2.467, + "step": 1100 + }, + { + "epoch": 0.2119498520104916, + "grad_norm": 3.623433266963806, + "learning_rate": 1.9698937856482538e-05, + "loss": 2.4551, + "step": 1101 + }, + { + "epoch": 0.212142358688067, + "grad_norm": 3.690269380113225, + "learning_rate": 1.969817808892167e-05, + "loss": 2.4576, + "step": 1102 + }, + { + "epoch": 0.21233486536564236, + "grad_norm": 3.5394811084859117, + "learning_rate": 1.969741737857382e-05, + "loss": 2.4842, + "step": 1103 + }, + { + "epoch": 0.21252737204321775, + "grad_norm": 3.1776357190012714, + "learning_rate": 1.9696655725512933e-05, + "loss": 2.4614, + "step": 1104 + }, + { + "epoch": 0.21252737204321775, + "lm_loss": 2.3948, + "step": 1104, + "vm_loss": 0.1582 + }, + { + "epoch": 0.21252737204321775, + "lm_loss": 2.4987, + "step": 1104, + "vm_loss": 0.1417 + }, + { + "epoch": 0.21252737204321775, + "lm_loss": 2.3013, + "step": 1104, + "vm_loss": 0.2057 + }, + { + "epoch": 0.21252737204321775, + "lm_loss": 2.173, + "step": 1104, + "vm_loss": 0.215 + }, + { + "epoch": 0.21252737204321775, + "lm_loss": 1.949, + "step": 1104, + "vm_loss": 0.1731 + }, + { + "epoch": 0.21252737204321775, + "lm_loss": 2.0258, + "step": 1104, + "vm_loss": 0.216 + }, + { + "epoch": 0.21252737204321775, + "lm_loss": 2.3912, + "step": 1104, + "vm_loss": 0.1574 + }, + { + "epoch": 0.21252737204321775, + "lm_loss": 2.2166, + "step": 1104, + "vm_loss": 0.1692 + }, + { + "epoch": 0.21271987872079312, + "grad_norm": 3.657961006377713, + "learning_rate": 1.9695893129813054e-05, + "loss": 2.4767, + "step": 1105 + }, + { + "epoch": 0.21291238539836851, + "grad_norm": 3.4801132398158554, + "learning_rate": 1.9695129591548315e-05, + "loss": 2.5115, + "step": 1106 + }, + { + "epoch": 0.21310489207594388, + "grad_norm": 4.708400754862967, + "learning_rate": 1.9694365110792943e-05, + "loss": 2.478, + "step": 1107 + }, + { + "epoch": 0.21329739875351927, + "grad_norm": 3.7583293280439047, + "learning_rate": 1.9693599687621253e-05, + "loss": 2.4364, + "step": 1108 + }, + { + "epoch": 0.21348990543109464, + "grad_norm": 4.242514516222299, + "learning_rate": 1.969283332210766e-05, + "loss": 2.4977, + "step": 1109 + }, + { + "epoch": 0.21368241210867, + "grad_norm": 3.4472666671048606, + "learning_rate": 1.9692066014326655e-05, + "loss": 2.5105, + "step": 1110 + }, + { + "epoch": 0.2138749187862454, + "grad_norm": 3.8129388078345627, + "learning_rate": 1.969129776435284e-05, + "loss": 2.414, + "step": 1111 + }, + { + "epoch": 0.21406742546382077, + "grad_norm": 3.9965781838270242, + "learning_rate": 1.9690528572260894e-05, + "loss": 2.4672, + "step": 1112 + }, + { + "epoch": 0.21406742546382077, + "lm_loss": 2.1861, + "step": 1112, + "vm_loss": 0.1096 + }, + { + "epoch": 0.21406742546382077, + "lm_loss": 2.3975, + "step": 1112, + "vm_loss": 0.1581 + }, + { + "epoch": 0.21406742546382077, + "lm_loss": 2.4695, + "step": 1112, + "vm_loss": 0.178 + }, + { + "epoch": 0.21406742546382077, + "lm_loss": 2.341, + "step": 1112, + "vm_loss": 0.2007 + }, + { + "epoch": 0.21406742546382077, + "lm_loss": 2.2028, + "step": 1112, + "vm_loss": 0.1624 + }, + { + "epoch": 0.21406742546382077, + "lm_loss": 2.4051, + "step": 1112, + "vm_loss": 0.1319 + }, + { + "epoch": 0.21406742546382077, + "lm_loss": 2.3279, + "step": 1112, + "vm_loss": 0.2397 + }, + { + "epoch": 0.21406742546382077, + "lm_loss": 2.2707, + "step": 1112, + "vm_loss": 0.2604 + }, + { + "epoch": 0.21425993214139616, + "grad_norm": 3.425090927850954, + "learning_rate": 1.968975843812559e-05, + "loss": 2.4717, + "step": 1113 + }, + { + "epoch": 0.21445243881897152, + "grad_norm": 3.961245949081223, + "learning_rate": 1.9688987362021802e-05, + "loss": 2.4859, + "step": 1114 + }, + { + "epoch": 0.21464494549654692, + "grad_norm": 3.3767167640424223, + "learning_rate": 1.968821534402448e-05, + "loss": 2.4119, + "step": 1115 + }, + { + "epoch": 0.21483745217412228, + "grad_norm": 3.884816746855957, + "learning_rate": 1.9687442384208682e-05, + "loss": 2.4755, + "step": 1116 + }, + { + "epoch": 0.21502995885169768, + "grad_norm": 3.545492428730258, + "learning_rate": 1.9686668482649543e-05, + "loss": 2.5134, + "step": 1117 + }, + { + "epoch": 0.21522246552927304, + "grad_norm": 3.6174149838197707, + "learning_rate": 1.96858936394223e-05, + "loss": 2.5237, + "step": 1118 + }, + { + "epoch": 0.21541497220684844, + "grad_norm": 3.446055327133417, + "learning_rate": 1.9685117854602276e-05, + "loss": 2.4808, + "step": 1119 + }, + { + "epoch": 0.2156074788844238, + "grad_norm": 2.9068861070416987, + "learning_rate": 1.9684341128264884e-05, + "loss": 2.4478, + "step": 1120 + }, + { + "epoch": 0.2156074788844238, + "lm_loss": 2.5983, + "step": 1120, + "vm_loss": 0.2049 + }, + { + "epoch": 0.2156074788844238, + "lm_loss": 2.331, + "step": 1120, + "vm_loss": 0.2151 + }, + { + "epoch": 0.2156074788844238, + "lm_loss": 2.4922, + "step": 1120, + "vm_loss": 0.1803 + }, + { + "epoch": 0.2156074788844238, + "lm_loss": 2.3474, + "step": 1120, + "vm_loss": 0.154 + }, + { + "epoch": 0.2156074788844238, + "lm_loss": 2.2739, + "step": 1120, + "vm_loss": 0.1911 + }, + { + "epoch": 0.2156074788844238, + "lm_loss": 2.2397, + "step": 1120, + "vm_loss": 0.2368 + }, + { + "epoch": 0.2156074788844238, + "lm_loss": 2.4236, + "step": 1120, + "vm_loss": 0.1534 + }, + { + "epoch": 0.2156074788844238, + "lm_loss": 2.1614, + "step": 1120, + "vm_loss": 0.1921 + }, + { + "epoch": 0.21579998556199917, + "grad_norm": 3.288904935672784, + "learning_rate": 1.968356346048564e-05, + "loss": 2.452, + "step": 1121 + }, + { + "epoch": 0.21599249223957456, + "grad_norm": 3.310553622738488, + "learning_rate": 1.9682784851340137e-05, + "loss": 2.4679, + "step": 1122 + }, + { + "epoch": 0.21618499891714993, + "grad_norm": 3.584333329972139, + "learning_rate": 1.968200530090407e-05, + "loss": 2.4584, + "step": 1123 + }, + { + "epoch": 0.21637750559472532, + "grad_norm": 3.4314883097095725, + "learning_rate": 1.9681224809253217e-05, + "loss": 2.4644, + "step": 1124 + }, + { + "epoch": 0.2165700122723007, + "grad_norm": 3.5942601337019435, + "learning_rate": 1.968044337646345e-05, + "loss": 2.469, + "step": 1125 + }, + { + "epoch": 0.21676251894987608, + "grad_norm": 4.162594894638387, + "learning_rate": 1.9679661002610743e-05, + "loss": 2.4726, + "step": 1126 + }, + { + "epoch": 0.21695502562745145, + "grad_norm": 3.0808734276942604, + "learning_rate": 1.967887768777115e-05, + "loss": 2.4474, + "step": 1127 + }, + { + "epoch": 0.21714753230502684, + "grad_norm": 3.837693798700385, + "learning_rate": 1.967809343202081e-05, + "loss": 2.4836, + "step": 1128 + }, + { + "epoch": 0.21714753230502684, + "lm_loss": 2.2449, + "step": 1128, + "vm_loss": 0.2727 + }, + { + "epoch": 0.21714753230502684, + "lm_loss": 2.244, + "step": 1128, + "vm_loss": 0.1649 + }, + { + "epoch": 0.21714753230502684, + "lm_loss": 2.3455, + "step": 1128, + "vm_loss": 0.1945 + }, + { + "epoch": 0.21714753230502684, + "lm_loss": 2.2377, + "step": 1128, + "vm_loss": 0.2255 + }, + { + "epoch": 0.21714753230502684, + "lm_loss": 2.2009, + "step": 1128, + "vm_loss": 0.1941 + }, + { + "epoch": 0.21714753230502684, + "lm_loss": 2.3181, + "step": 1128, + "vm_loss": 0.1682 + }, + { + "epoch": 0.21714753230502684, + "lm_loss": 2.1726, + "step": 1128, + "vm_loss": 0.2108 + }, + { + "epoch": 0.21714753230502684, + "lm_loss": 2.1383, + "step": 1128, + "vm_loss": 0.1892 + }, + { + "epoch": 0.2173400389826022, + "grad_norm": 3.24752334731843, + "learning_rate": 1.967730823543597e-05, + "loss": 2.4561, + "step": 1129 + }, + { + "epoch": 0.2175325456601776, + "grad_norm": 3.3269744160512085, + "learning_rate": 1.9676522098092965e-05, + "loss": 2.4966, + "step": 1130 + }, + { + "epoch": 0.21772505233775297, + "grad_norm": 2.9074985667778734, + "learning_rate": 1.967573502006821e-05, + "loss": 2.4921, + "step": 1131 + }, + { + "epoch": 0.21791755901532833, + "grad_norm": 3.5730107471420745, + "learning_rate": 1.967494700143822e-05, + "loss": 2.5049, + "step": 1132 + }, + { + "epoch": 0.21811006569290373, + "grad_norm": 3.313305700715054, + "learning_rate": 1.9674158042279606e-05, + "loss": 2.4919, + "step": 1133 + }, + { + "epoch": 0.2183025723704791, + "grad_norm": 3.4326954502787537, + "learning_rate": 1.9673368142669066e-05, + "loss": 2.4775, + "step": 1134 + }, + { + "epoch": 0.21849507904805449, + "grad_norm": 3.3122493441262395, + "learning_rate": 1.9672577302683374e-05, + "loss": 2.4351, + "step": 1135 + }, + { + "epoch": 0.21868758572562985, + "grad_norm": 3.543085246451503, + "learning_rate": 1.9671785522399428e-05, + "loss": 2.4401, + "step": 1136 + }, + { + "epoch": 0.21868758572562985, + "lm_loss": 2.3683, + "step": 1136, + "vm_loss": 0.1161 + }, + { + "epoch": 0.21868758572562985, + "lm_loss": 2.4835, + "step": 1136, + "vm_loss": 0.1295 + }, + { + "epoch": 0.21868758572562985, + "lm_loss": 2.241, + "step": 1136, + "vm_loss": 0.1703 + }, + { + "epoch": 0.21868758572562985, + "lm_loss": 2.4631, + "step": 1136, + "vm_loss": 0.1968 + }, + { + "epoch": 0.21868758572562985, + "lm_loss": 2.1926, + "step": 1136, + "vm_loss": 0.1493 + }, + { + "epoch": 0.21868758572562985, + "lm_loss": 2.0073, + "step": 1136, + "vm_loss": 0.1428 + }, + { + "epoch": 0.21868758572562985, + "lm_loss": 2.5343, + "step": 1136, + "vm_loss": 0.1631 + }, + { + "epoch": 0.21868758572562985, + "lm_loss": 2.3711, + "step": 1136, + "vm_loss": 0.1343 + }, + { + "epoch": 0.21888009240320525, + "grad_norm": 3.3855325951831343, + "learning_rate": 1.9670992801894185e-05, + "loss": 2.4079, + "step": 1137 + }, + { + "epoch": 0.2190725990807806, + "grad_norm": 3.3264073223945982, + "learning_rate": 1.967019914124472e-05, + "loss": 2.4761, + "step": 1138 + }, + { + "epoch": 0.219265105758356, + "grad_norm": 3.0395092523524445, + "learning_rate": 1.9669404540528176e-05, + "loss": 2.476, + "step": 1139 + }, + { + "epoch": 0.21945761243593137, + "grad_norm": 3.460025319791979, + "learning_rate": 1.9668608999821806e-05, + "loss": 2.4243, + "step": 1140 + }, + { + "epoch": 0.21965011911350674, + "grad_norm": 3.4865871033475417, + "learning_rate": 1.966781251920294e-05, + "loss": 2.468, + "step": 1141 + }, + { + "epoch": 0.21984262579108213, + "grad_norm": 3.660505950427934, + "learning_rate": 1.9667015098749013e-05, + "loss": 2.4833, + "step": 1142 + }, + { + "epoch": 0.2200351324686575, + "grad_norm": 2.7877308478794673, + "learning_rate": 1.966621673853754e-05, + "loss": 2.4424, + "step": 1143 + }, + { + "epoch": 0.2202276391462329, + "grad_norm": 3.4580754913773006, + "learning_rate": 1.9665417438646132e-05, + "loss": 2.4281, + "step": 1144 + }, + { + "epoch": 0.2202276391462329, + "lm_loss": 2.3236, + "step": 1144, + "vm_loss": 0.2032 + }, + { + "epoch": 0.2202276391462329, + "lm_loss": 2.3738, + "step": 1144, + "vm_loss": 0.1959 + }, + { + "epoch": 0.2202276391462329, + "lm_loss": 2.4211, + "step": 1144, + "vm_loss": 0.18 + }, + { + "epoch": 0.2202276391462329, + "lm_loss": 2.2454, + "step": 1144, + "vm_loss": 0.1616 + }, + { + "epoch": 0.2202276391462329, + "lm_loss": 2.3133, + "step": 1144, + "vm_loss": 0.1874 + }, + { + "epoch": 0.2202276391462329, + "lm_loss": 2.2163, + "step": 1144, + "vm_loss": 0.2039 + }, + { + "epoch": 0.2202276391462329, + "lm_loss": 2.3465, + "step": 1144, + "vm_loss": 0.1691 + }, + { + "epoch": 0.2202276391462329, + "lm_loss": 2.1948, + "step": 1144, + "vm_loss": 0.2123 + }, + { + "epoch": 0.22042014582380826, + "grad_norm": 3.9941520772190584, + "learning_rate": 1.966461719915249e-05, + "loss": 2.4858, + "step": 1145 + }, + { + "epoch": 0.22061265250138365, + "grad_norm": 2.673214965013801, + "learning_rate": 1.966381602013441e-05, + "loss": 2.4529, + "step": 1146 + }, + { + "epoch": 0.22080515917895902, + "grad_norm": 3.525819465385896, + "learning_rate": 1.966301390166978e-05, + "loss": 2.4796, + "step": 1147 + }, + { + "epoch": 0.2209976658565344, + "grad_norm": 4.141749473586178, + "learning_rate": 1.9662210843836574e-05, + "loss": 2.5134, + "step": 1148 + }, + { + "epoch": 0.22119017253410977, + "grad_norm": 3.5395805230836235, + "learning_rate": 1.9661406846712854e-05, + "loss": 2.439, + "step": 1149 + }, + { + "epoch": 0.22138267921168517, + "grad_norm": 3.4650741859323415, + "learning_rate": 1.9660601910376784e-05, + "loss": 2.4731, + "step": 1150 + }, + { + "epoch": 0.22157518588926053, + "grad_norm": 3.663834711108934, + "learning_rate": 1.9659796034906614e-05, + "loss": 2.4649, + "step": 1151 + }, + { + "epoch": 0.2217676925668359, + "grad_norm": 4.219484135337479, + "learning_rate": 1.9658989220380686e-05, + "loss": 2.4461, + "step": 1152 + }, + { + "epoch": 0.2217676925668359, + "lm_loss": 2.5942, + "step": 1152, + "vm_loss": 0.2674 + }, + { + "epoch": 0.2217676925668359, + "lm_loss": 2.3894, + "step": 1152, + "vm_loss": 0.1989 + }, + { + "epoch": 0.2217676925668359, + "lm_loss": 2.4629, + "step": 1152, + "vm_loss": 0.2307 + }, + { + "epoch": 0.2217676925668359, + "lm_loss": 2.3915, + "step": 1152, + "vm_loss": 0.1001 + }, + { + "epoch": 0.2217676925668359, + "lm_loss": 2.0616, + "step": 1152, + "vm_loss": 0.1555 + }, + { + "epoch": 0.2217676925668359, + "lm_loss": 2.5497, + "step": 1152, + "vm_loss": 0.2099 + }, + { + "epoch": 0.2217676925668359, + "lm_loss": 2.2942, + "step": 1152, + "vm_loss": 0.224 + }, + { + "epoch": 0.2217676925668359, + "lm_loss": 2.2603, + "step": 1152, + "vm_loss": 0.2149 + }, + { + "epoch": 0.2219601992444113, + "grad_norm": 3.439145279154436, + "learning_rate": 1.9658181466877427e-05, + "loss": 2.491, + "step": 1153 + }, + { + "epoch": 0.22215270592198666, + "grad_norm": 3.2252892303841874, + "learning_rate": 1.965737277447537e-05, + "loss": 2.4905, + "step": 1154 + }, + { + "epoch": 0.22234521259956205, + "grad_norm": 3.554339049029039, + "learning_rate": 1.9656563143253122e-05, + "loss": 2.4758, + "step": 1155 + }, + { + "epoch": 0.22253771927713742, + "grad_norm": 3.7321968669043954, + "learning_rate": 1.9655752573289396e-05, + "loss": 2.4721, + "step": 1156 + }, + { + "epoch": 0.2227302259547128, + "grad_norm": 3.306356652848731, + "learning_rate": 1.9654941064662986e-05, + "loss": 2.5052, + "step": 1157 + }, + { + "epoch": 0.22292273263228818, + "grad_norm": 4.375134518763874, + "learning_rate": 1.965412861745278e-05, + "loss": 2.4728, + "step": 1158 + }, + { + "epoch": 0.22311523930986357, + "grad_norm": 3.961325960685138, + "learning_rate": 1.965331523173776e-05, + "loss": 2.451, + "step": 1159 + }, + { + "epoch": 0.22330774598743894, + "grad_norm": 3.467604030714786, + "learning_rate": 1.9652500907596998e-05, + "loss": 2.4754, + "step": 1160 + }, + { + "epoch": 0.22330774598743894, + "lm_loss": 2.3335, + "step": 1160, + "vm_loss": 0.1657 + }, + { + "epoch": 0.22330774598743894, + "lm_loss": 2.1967, + "step": 1160, + "vm_loss": 0.1763 + }, + { + "epoch": 0.22330774598743894, + "lm_loss": 2.2758, + "step": 1160, + "vm_loss": 0.1681 + }, + { + "epoch": 0.22330774598743894, + "lm_loss": 2.266, + "step": 1160, + "vm_loss": 0.1475 + }, + { + "epoch": 0.22330774598743894, + "lm_loss": 2.1162, + "step": 1160, + "vm_loss": 0.1221 + }, + { + "epoch": 0.22330774598743894, + "lm_loss": 2.3065, + "step": 1160, + "vm_loss": 0.1798 + }, + { + "epoch": 0.22330774598743894, + "lm_loss": 2.0585, + "step": 1160, + "vm_loss": 0.2048 + }, + { + "epoch": 0.22330774598743894, + "lm_loss": 2.433, + "step": 1160, + "vm_loss": 0.1685 + }, + { + "epoch": 0.2235002526650143, + "grad_norm": 4.073247633569005, + "learning_rate": 1.9651685645109657e-05, + "loss": 2.4406, + "step": 1161 + }, + { + "epoch": 0.2236927593425897, + "grad_norm": 3.813135705064817, + "learning_rate": 1.965086944435499e-05, + "loss": 2.4289, + "step": 1162 + }, + { + "epoch": 0.22388526602016506, + "grad_norm": 3.518834975932944, + "learning_rate": 1.965005230541234e-05, + "loss": 2.4556, + "step": 1163 + }, + { + "epoch": 0.22407777269774046, + "grad_norm": 3.5368983980092423, + "learning_rate": 1.9649234228361148e-05, + "loss": 2.4827, + "step": 1164 + }, + { + "epoch": 0.22427027937531582, + "grad_norm": 4.64997992121861, + "learning_rate": 1.964841521328094e-05, + "loss": 2.4536, + "step": 1165 + }, + { + "epoch": 0.22446278605289122, + "grad_norm": 3.879797133723813, + "learning_rate": 1.964759526025133e-05, + "loss": 2.41, + "step": 1166 + }, + { + "epoch": 0.22465529273046658, + "grad_norm": 3.1871897758966106, + "learning_rate": 1.964677436935204e-05, + "loss": 2.4376, + "step": 1167 + }, + { + "epoch": 0.22484779940804198, + "grad_norm": 4.147788638776459, + "learning_rate": 1.9645952540662855e-05, + "loss": 2.4666, + "step": 1168 + }, + { + "epoch": 0.22484779940804198, + "lm_loss": 1.9165, + "step": 1168, + "vm_loss": 0.1353 + }, + { + "epoch": 0.22484779940804198, + "lm_loss": 1.8994, + "step": 1168, + "vm_loss": 0.1985 + }, + { + "epoch": 0.22484779940804198, + "lm_loss": 2.3829, + "step": 1168, + "vm_loss": 0.1943 + }, + { + "epoch": 0.22484779940804198, + "lm_loss": 2.2215, + "step": 1168, + "vm_loss": 0.1614 + }, + { + "epoch": 0.22484779940804198, + "lm_loss": 2.1421, + "step": 1168, + "vm_loss": 0.168 + }, + { + "epoch": 0.22484779940804198, + "lm_loss": 2.0762, + "step": 1168, + "vm_loss": 0.1848 + }, + { + "epoch": 0.22484779940804198, + "lm_loss": 2.4662, + "step": 1168, + "vm_loss": 0.211 + }, + { + "epoch": 0.22484779940804198, + "lm_loss": 2.4899, + "step": 1168, + "vm_loss": 0.1186 + }, + { + "epoch": 0.22504030608561734, + "grad_norm": 4.075723294234439, + "learning_rate": 1.9645129774263678e-05, + "loss": 2.4739, + "step": 1169 + }, + { + "epoch": 0.22523281276319274, + "grad_norm": 3.6593286814891504, + "learning_rate": 1.9644306070234492e-05, + "loss": 2.4516, + "step": 1170 + }, + { + "epoch": 0.2254253194407681, + "grad_norm": 3.3902425689510305, + "learning_rate": 1.9643481428655366e-05, + "loss": 2.4688, + "step": 1171 + }, + { + "epoch": 0.22561782611834347, + "grad_norm": 3.7166742638096237, + "learning_rate": 1.9642655849606468e-05, + "loss": 2.5134, + "step": 1172 + }, + { + "epoch": 0.22581033279591886, + "grad_norm": 3.4153498463214, + "learning_rate": 1.9641829333168058e-05, + "loss": 2.4675, + "step": 1173 + }, + { + "epoch": 0.22600283947349423, + "grad_norm": 3.7397391809765352, + "learning_rate": 1.9641001879420482e-05, + "loss": 2.4392, + "step": 1174 + }, + { + "epoch": 0.22619534615106962, + "grad_norm": 3.911807991772737, + "learning_rate": 1.964017348844418e-05, + "loss": 2.5032, + "step": 1175 + }, + { + "epoch": 0.226387852828645, + "grad_norm": 3.2511768379737562, + "learning_rate": 1.9639344160319678e-05, + "loss": 2.4762, + "step": 1176 + }, + { + "epoch": 0.226387852828645, + "lm_loss": 2.1043, + "step": 1176, + "vm_loss": 0.1334 + }, + { + "epoch": 0.226387852828645, + "lm_loss": 2.4644, + "step": 1176, + "vm_loss": 0.1876 + }, + { + "epoch": 0.226387852828645, + "lm_loss": 2.5693, + "step": 1176, + "vm_loss": 0.167 + }, + { + "epoch": 0.226387852828645, + "lm_loss": 1.8619, + "step": 1176, + "vm_loss": 0.157 + }, + { + "epoch": 0.226387852828645, + "lm_loss": 2.4452, + "step": 1176, + "vm_loss": 0.175 + }, + { + "epoch": 0.226387852828645, + "lm_loss": 2.5079, + "step": 1176, + "vm_loss": 0.1582 + }, + { + "epoch": 0.226387852828645, + "lm_loss": 2.4046, + "step": 1176, + "vm_loss": 0.2209 + }, + { + "epoch": 0.226387852828645, + "lm_loss": 2.2349, + "step": 1176, + "vm_loss": 0.1517 + }, + { + "epoch": 0.22658035950622038, + "grad_norm": 3.178634843921187, + "learning_rate": 1.9638513895127604e-05, + "loss": 2.4444, + "step": 1177 + }, + { + "epoch": 0.22677286618379575, + "grad_norm": 3.9327577031615304, + "learning_rate": 1.9637682692948666e-05, + "loss": 2.4303, + "step": 1178 + }, + { + "epoch": 0.22696537286137114, + "grad_norm": 3.47990577513902, + "learning_rate": 1.963685055386367e-05, + "loss": 2.4917, + "step": 1179 + }, + { + "epoch": 0.2271578795389465, + "grad_norm": 3.943248486056169, + "learning_rate": 1.9636017477953502e-05, + "loss": 2.4956, + "step": 1180 + }, + { + "epoch": 0.2273503862165219, + "grad_norm": 3.290729248661274, + "learning_rate": 1.9635183465299157e-05, + "loss": 2.4889, + "step": 1181 + }, + { + "epoch": 0.22754289289409727, + "grad_norm": 4.1554701885405025, + "learning_rate": 1.963434851598171e-05, + "loss": 2.473, + "step": 1182 + }, + { + "epoch": 0.22773539957167263, + "grad_norm": 3.616210835013554, + "learning_rate": 1.963351263008233e-05, + "loss": 2.4482, + "step": 1183 + }, + { + "epoch": 0.22792790624924802, + "grad_norm": 3.4742629093571877, + "learning_rate": 1.963267580768227e-05, + "loss": 2.5033, + "step": 1184 + }, + { + "epoch": 0.22792790624924802, + "lm_loss": 2.3939, + "step": 1184, + "vm_loss": 0.1288 + }, + { + "epoch": 0.22792790624924802, + "lm_loss": 2.3575, + "step": 1184, + "vm_loss": 0.2145 + }, + { + "epoch": 0.22792790624924802, + "lm_loss": 2.1724, + "step": 1184, + "vm_loss": 0.1674 + }, + { + "epoch": 0.22792790624924802, + "lm_loss": 2.2225, + "step": 1184, + "vm_loss": 0.1371 + }, + { + "epoch": 0.22792790624924802, + "lm_loss": 2.3885, + "step": 1184, + "vm_loss": 0.1737 + }, + { + "epoch": 0.22792790624924802, + "lm_loss": 2.2578, + "step": 1184, + "vm_loss": 0.2011 + }, + { + "epoch": 0.22792790624924802, + "lm_loss": 2.4048, + "step": 1184, + "vm_loss": 0.1771 + }, + { + "epoch": 0.22792790624924802, + "lm_loss": 2.3607, + "step": 1184, + "vm_loss": 0.1311 + }, + { + "epoch": 0.2281204129268234, + "grad_norm": 3.7167764647989325, + "learning_rate": 1.963183804886289e-05, + "loss": 2.4591, + "step": 1185 + }, + { + "epoch": 0.22831291960439878, + "grad_norm": 4.118404926502813, + "learning_rate": 1.9630999353705617e-05, + "loss": 2.5048, + "step": 1186 + }, + { + "epoch": 0.22850542628197415, + "grad_norm": 3.267249100361888, + "learning_rate": 1.9630159722291996e-05, + "loss": 2.4638, + "step": 1187 + }, + { + "epoch": 0.22869793295954954, + "grad_norm": 3.3634596526994938, + "learning_rate": 1.962931915470364e-05, + "loss": 2.4699, + "step": 1188 + }, + { + "epoch": 0.2288904396371249, + "grad_norm": 4.438569797110638, + "learning_rate": 1.962847765102227e-05, + "loss": 2.508, + "step": 1189 + }, + { + "epoch": 0.2290829463147003, + "grad_norm": 3.0322702860915447, + "learning_rate": 1.9627635211329686e-05, + "loss": 2.4761, + "step": 1190 + }, + { + "epoch": 0.22927545299227567, + "grad_norm": 3.823828437286329, + "learning_rate": 1.9626791835707787e-05, + "loss": 2.4543, + "step": 1191 + }, + { + "epoch": 0.22946795966985103, + "grad_norm": 3.9025126813697057, + "learning_rate": 1.9625947524238564e-05, + "loss": 2.4187, + "step": 1192 + }, + { + "epoch": 0.22946795966985103, + "lm_loss": 2.2605, + "step": 1192, + "vm_loss": 0.1868 + }, + { + "epoch": 0.22946795966985103, + "lm_loss": 2.0292, + "step": 1192, + "vm_loss": 0.171 + }, + { + "epoch": 0.22946795966985103, + "lm_loss": 2.465, + "step": 1192, + "vm_loss": 0.2515 + }, + { + "epoch": 0.22946795966985103, + "lm_loss": 2.3289, + "step": 1192, + "vm_loss": 0.1464 + }, + { + "epoch": 0.22946795966985103, + "lm_loss": 2.2172, + "step": 1192, + "vm_loss": 0.1651 + }, + { + "epoch": 0.22946795966985103, + "lm_loss": 2.4247, + "step": 1192, + "vm_loss": 0.1967 + }, + { + "epoch": 0.22946795966985103, + "lm_loss": 2.1336, + "step": 1192, + "vm_loss": 0.1359 + }, + { + "epoch": 0.22946795966985103, + "lm_loss": 2.3907, + "step": 1192, + "vm_loss": 0.2451 + }, + { + "epoch": 0.22966046634742643, + "grad_norm": 3.2003435388077692, + "learning_rate": 1.9625102277004085e-05, + "loss": 2.4428, + "step": 1193 + }, + { + "epoch": 0.2298529730250018, + "grad_norm": 4.138706738787077, + "learning_rate": 1.9624256094086523e-05, + "loss": 2.4618, + "step": 1194 + }, + { + "epoch": 0.2300454797025772, + "grad_norm": 4.437828298246324, + "learning_rate": 1.9623408975568138e-05, + "loss": 2.4434, + "step": 1195 + }, + { + "epoch": 0.23023798638015255, + "grad_norm": 3.468856888317049, + "learning_rate": 1.9622560921531285e-05, + "loss": 2.4399, + "step": 1196 + }, + { + "epoch": 0.23043049305772795, + "grad_norm": 3.6938786693141674, + "learning_rate": 1.9621711932058395e-05, + "loss": 2.4629, + "step": 1197 + }, + { + "epoch": 0.2306229997353033, + "grad_norm": 3.5439846936354193, + "learning_rate": 1.9620862007232015e-05, + "loss": 2.4251, + "step": 1198 + }, + { + "epoch": 0.2308155064128787, + "grad_norm": 3.080250781285466, + "learning_rate": 1.9620011147134758e-05, + "loss": 2.4156, + "step": 1199 + }, + { + "epoch": 0.23100801309045407, + "grad_norm": 4.249715445311841, + "learning_rate": 1.961915935184934e-05, + "loss": 2.5074, + "step": 1200 + }, + { + "epoch": 0.23100801309045407, + "lm_loss": 2.428, + "step": 1200, + "vm_loss": 0.1942 + }, + { + "epoch": 0.23100801309045407, + "lm_loss": 2.2714, + "step": 1200, + "vm_loss": 0.1474 + }, + { + "epoch": 0.23100801309045407, + "lm_loss": 2.3676, + "step": 1200, + "vm_loss": 0.1581 + }, + { + "epoch": 0.23100801309045407, + "lm_loss": 2.2548, + "step": 1200, + "vm_loss": 0.2473 + }, + { + "epoch": 0.23100801309045407, + "lm_loss": 2.6291, + "step": 1200, + "vm_loss": 0.2803 + }, + { + "epoch": 0.23100801309045407, + "lm_loss": 1.759, + "step": 1200, + "vm_loss": 0.1435 + }, + { + "epoch": 0.23100801309045407, + "lm_loss": 2.2732, + "step": 1200, + "vm_loss": 0.2089 + }, + { + "epoch": 0.23100801309045407, + "lm_loss": 2.1425, + "step": 1200, + "vm_loss": 0.2043 + }, + { + "epoch": 0.23120051976802947, + "grad_norm": 3.0513229314448282, + "learning_rate": 1.961830662145857e-05, + "loss": 2.4736, + "step": 1201 + }, + { + "epoch": 0.23139302644560483, + "grad_norm": 3.081842396441924, + "learning_rate": 1.9617452956045344e-05, + "loss": 2.4947, + "step": 1202 + }, + { + "epoch": 0.2315855331231802, + "grad_norm": 3.7129843148889603, + "learning_rate": 1.9616598355692644e-05, + "loss": 2.4964, + "step": 1203 + }, + { + "epoch": 0.2317780398007556, + "grad_norm": 4.0149708384232, + "learning_rate": 1.9615742820483554e-05, + "loss": 2.4421, + "step": 1204 + }, + { + "epoch": 0.23197054647833096, + "grad_norm": 3.3334216835742896, + "learning_rate": 1.961488635050124e-05, + "loss": 2.3777, + "step": 1205 + }, + { + "epoch": 0.23216305315590635, + "grad_norm": 3.7976150665394166, + "learning_rate": 1.961402894582896e-05, + "loss": 2.4144, + "step": 1206 + }, + { + "epoch": 0.23235555983348172, + "grad_norm": 4.484708449826713, + "learning_rate": 1.961317060655007e-05, + "loss": 2.4671, + "step": 1207 + }, + { + "epoch": 0.2325480665110571, + "grad_norm": 3.305011549556479, + "learning_rate": 1.9612311332748008e-05, + "loss": 2.46, + "step": 1208 + }, + { + "epoch": 0.2325480665110571, + "lm_loss": 2.1799, + "step": 1208, + "vm_loss": 0.2228 + }, + { + "epoch": 0.2325480665110571, + "lm_loss": 2.0738, + "step": 1208, + "vm_loss": 0.1362 + }, + { + "epoch": 0.2325480665110571, + "lm_loss": 2.3666, + "step": 1208, + "vm_loss": 0.159 + }, + { + "epoch": 0.2325480665110571, + "lm_loss": 2.4713, + "step": 1208, + "vm_loss": 0.1716 + }, + { + "epoch": 0.2325480665110571, + "lm_loss": 2.387, + "step": 1208, + "vm_loss": 0.2495 + }, + { + "epoch": 0.2325480665110571, + "lm_loss": 2.447, + "step": 1208, + "vm_loss": 0.1899 + }, + { + "epoch": 0.2325480665110571, + "lm_loss": 1.5992, + "step": 1208, + "vm_loss": 0.192 + }, + { + "epoch": 0.2325480665110571, + "lm_loss": 2.3106, + "step": 1208, + "vm_loss": 0.1443 + }, + { + "epoch": 0.23274057318863248, + "grad_norm": 3.7303729944956676, + "learning_rate": 1.9611451124506308e-05, + "loss": 2.4799, + "step": 1209 + }, + { + "epoch": 0.23293307986620787, + "grad_norm": 3.961349763793609, + "learning_rate": 1.9610589981908593e-05, + "loss": 2.474, + "step": 1210 + }, + { + "epoch": 0.23312558654378324, + "grad_norm": 3.607499882312154, + "learning_rate": 1.9609727905038573e-05, + "loss": 2.5005, + "step": 1211 + }, + { + "epoch": 0.2333180932213586, + "grad_norm": 4.2850567074387484, + "learning_rate": 1.9608864893980056e-05, + "loss": 2.5114, + "step": 1212 + }, + { + "epoch": 0.233510599898934, + "grad_norm": 3.6643182984245413, + "learning_rate": 1.9608000948816942e-05, + "loss": 2.5217, + "step": 1213 + }, + { + "epoch": 0.23370310657650936, + "grad_norm": 3.1842880662237607, + "learning_rate": 1.9607136069633212e-05, + "loss": 2.4342, + "step": 1214 + }, + { + "epoch": 0.23389561325408476, + "grad_norm": 3.4572435575787273, + "learning_rate": 1.9606270256512947e-05, + "loss": 2.5053, + "step": 1215 + }, + { + "epoch": 0.23408811993166012, + "grad_norm": 3.8211503219535814, + "learning_rate": 1.9605403509540308e-05, + "loss": 2.4401, + "step": 1216 + }, + { + "epoch": 0.23408811993166012, + "lm_loss": 1.9969, + "step": 1216, + "vm_loss": 0.1779 + }, + { + "epoch": 0.23408811993166012, + "lm_loss": 2.138, + "step": 1216, + "vm_loss": 0.1898 + }, + { + "epoch": 0.23408811993166012, + "lm_loss": 2.1311, + "step": 1216, + "vm_loss": 0.2112 + }, + { + "epoch": 0.23408811993166012, + "lm_loss": 2.2109, + "step": 1216, + "vm_loss": 0.18 + }, + { + "epoch": 0.23408811993166012, + "lm_loss": 2.2883, + "step": 1216, + "vm_loss": 0.1715 + }, + { + "epoch": 0.23408811993166012, + "lm_loss": 2.4944, + "step": 1216, + "vm_loss": 0.1258 + }, + { + "epoch": 0.23408811993166012, + "lm_loss": 2.2771, + "step": 1216, + "vm_loss": 0.1514 + }, + { + "epoch": 0.23408811993166012, + "lm_loss": 1.9629, + "step": 1216, + "vm_loss": 0.1928 + }, + { + "epoch": 0.23428062660923551, + "grad_norm": 2.918449772579497, + "learning_rate": 1.960453582879956e-05, + "loss": 2.4251, + "step": 1217 + }, + { + "epoch": 0.23447313328681088, + "grad_norm": 3.5844067289270836, + "learning_rate": 1.9603667214375053e-05, + "loss": 2.4489, + "step": 1218 + }, + { + "epoch": 0.23466563996438627, + "grad_norm": 3.5047086080765086, + "learning_rate": 1.960279766635123e-05, + "loss": 2.4884, + "step": 1219 + }, + { + "epoch": 0.23485814664196164, + "grad_norm": 3.333499859532291, + "learning_rate": 1.960192718481261e-05, + "loss": 2.4472, + "step": 1220 + }, + { + "epoch": 0.23505065331953703, + "grad_norm": 3.149896738681449, + "learning_rate": 1.9601055769843832e-05, + "loss": 2.4542, + "step": 1221 + }, + { + "epoch": 0.2352431599971124, + "grad_norm": 3.453267975528268, + "learning_rate": 1.960018342152959e-05, + "loss": 2.4386, + "step": 1222 + }, + { + "epoch": 0.23543566667468777, + "grad_norm": 2.994502295368568, + "learning_rate": 1.9599310139954706e-05, + "loss": 2.4791, + "step": 1223 + }, + { + "epoch": 0.23562817335226316, + "grad_norm": 3.272488889736858, + "learning_rate": 1.959843592520406e-05, + "loss": 2.4356, + "step": 1224 + }, + { + "epoch": 0.23562817335226316, + "lm_loss": 2.0075, + "step": 1224, + "vm_loss": 0.202 + }, + { + "epoch": 0.23562817335226316, + "lm_loss": 2.4588, + "step": 1224, + "vm_loss": 0.2488 + }, + { + "epoch": 0.23562817335226316, + "lm_loss": 2.4585, + "step": 1224, + "vm_loss": 0.2306 + }, + { + "epoch": 0.23562817335226316, + "lm_loss": 2.3119, + "step": 1224, + "vm_loss": 0.2009 + }, + { + "epoch": 0.23562817335226316, + "lm_loss": 2.4838, + "step": 1224, + "vm_loss": 0.2062 + }, + { + "epoch": 0.23562817335226316, + "lm_loss": 2.3504, + "step": 1224, + "vm_loss": 0.1938 + }, + { + "epoch": 0.23562817335226316, + "lm_loss": 1.8162, + "step": 1224, + "vm_loss": 0.223 + }, + { + "epoch": 0.23562817335226316, + "lm_loss": 2.2534, + "step": 1224, + "vm_loss": 0.1763 + }, + { + "epoch": 0.23582068002983853, + "grad_norm": 2.9811635532020957, + "learning_rate": 1.9597560777362645e-05, + "loss": 2.5131, + "step": 1225 + }, + { + "epoch": 0.23601318670741392, + "grad_norm": 3.276735449151304, + "learning_rate": 1.9596684696515533e-05, + "loss": 2.4349, + "step": 1226 + }, + { + "epoch": 0.23620569338498928, + "grad_norm": 3.202355412954144, + "learning_rate": 1.959580768274789e-05, + "loss": 2.4488, + "step": 1227 + }, + { + "epoch": 0.23639820006256468, + "grad_norm": 2.9707272558132978, + "learning_rate": 1.9594929736144978e-05, + "loss": 2.4522, + "step": 1228 + }, + { + "epoch": 0.23659070674014004, + "grad_norm": 3.180961673277115, + "learning_rate": 1.9594050856792135e-05, + "loss": 2.4415, + "step": 1229 + }, + { + "epoch": 0.23678321341771544, + "grad_norm": 3.6708411494666424, + "learning_rate": 1.9593171044774807e-05, + "loss": 2.4995, + "step": 1230 + }, + { + "epoch": 0.2369757200952908, + "grad_norm": 3.1780451718881895, + "learning_rate": 1.959229030017852e-05, + "loss": 2.4377, + "step": 1231 + }, + { + "epoch": 0.2371682267728662, + "grad_norm": 3.954831781737448, + "learning_rate": 1.95914086230889e-05, + "loss": 2.4723, + "step": 1232 + }, + { + "epoch": 0.2371682267728662, + "lm_loss": 2.4703, + "step": 1232, + "vm_loss": 0.2084 + }, + { + "epoch": 0.2371682267728662, + "lm_loss": 1.9716, + "step": 1232, + "vm_loss": 0.2874 + }, + { + "epoch": 0.2371682267728662, + "lm_loss": 2.3523, + "step": 1232, + "vm_loss": 0.1645 + }, + { + "epoch": 0.2371682267728662, + "lm_loss": 2.1043, + "step": 1232, + "vm_loss": 0.1717 + }, + { + "epoch": 0.2371682267728662, + "lm_loss": 2.1198, + "step": 1232, + "vm_loss": 0.2386 + }, + { + "epoch": 0.2371682267728662, + "lm_loss": 1.6781, + "step": 1232, + "vm_loss": 0.114 + }, + { + "epoch": 0.2371682267728662, + "lm_loss": 2.5147, + "step": 1232, + "vm_loss": 0.2229 + }, + { + "epoch": 0.2371682267728662, + "lm_loss": 2.3105, + "step": 1232, + "vm_loss": 0.1309 + }, + { + "epoch": 0.23736073345044156, + "grad_norm": 3.4541065634328603, + "learning_rate": 1.9590526013591646e-05, + "loss": 2.4773, + "step": 1233 + }, + { + "epoch": 0.23755324012801693, + "grad_norm": 3.5129467842005, + "learning_rate": 1.9589642471772565e-05, + "loss": 2.4527, + "step": 1234 + }, + { + "epoch": 0.23774574680559232, + "grad_norm": 3.8413181697399685, + "learning_rate": 1.958875799771755e-05, + "loss": 2.4038, + "step": 1235 + }, + { + "epoch": 0.2379382534831677, + "grad_norm": 3.6433119169877033, + "learning_rate": 1.9587872591512583e-05, + "loss": 2.4911, + "step": 1236 + }, + { + "epoch": 0.23813076016074308, + "grad_norm": 3.312951522437006, + "learning_rate": 1.9586986253243734e-05, + "loss": 2.4925, + "step": 1237 + }, + { + "epoch": 0.23832326683831845, + "grad_norm": 3.2298244398229254, + "learning_rate": 1.9586098982997168e-05, + "loss": 2.5106, + "step": 1238 + }, + { + "epoch": 0.23851577351589384, + "grad_norm": 3.33885121791172, + "learning_rate": 1.958521078085914e-05, + "loss": 2.4396, + "step": 1239 + }, + { + "epoch": 0.2387082801934692, + "grad_norm": 3.7790297546295704, + "learning_rate": 1.9584321646915988e-05, + "loss": 2.4562, + "step": 1240 + }, + { + "epoch": 0.2387082801934692, + "lm_loss": 2.6071, + "step": 1240, + "vm_loss": 0.1864 + }, + { + "epoch": 0.2387082801934692, + "lm_loss": 1.871, + "step": 1240, + "vm_loss": 0.2463 + }, + { + "epoch": 0.2387082801934692, + "lm_loss": 2.339, + "step": 1240, + "vm_loss": 0.169 + }, + { + "epoch": 0.2387082801934692, + "lm_loss": 2.3169, + "step": 1240, + "vm_loss": 0.154 + }, + { + "epoch": 0.2387082801934692, + "lm_loss": 2.5389, + "step": 1240, + "vm_loss": 0.1502 + }, + { + "epoch": 0.2387082801934692, + "lm_loss": 2.4738, + "step": 1240, + "vm_loss": 0.1702 + }, + { + "epoch": 0.2387082801934692, + "lm_loss": 2.4138, + "step": 1240, + "vm_loss": 0.1745 + }, + { + "epoch": 0.2387082801934692, + "lm_loss": 2.4076, + "step": 1240, + "vm_loss": 0.1661 + }, + { + "epoch": 0.2389007868710446, + "grad_norm": 4.21761014010306, + "learning_rate": 1.958343158125416e-05, + "loss": 2.4804, + "step": 1241 + }, + { + "epoch": 0.23909329354861997, + "grad_norm": 3.253780108906917, + "learning_rate": 1.958254058396017e-05, + "loss": 2.4486, + "step": 1242 + }, + { + "epoch": 0.23928580022619533, + "grad_norm": 3.6916334941405746, + "learning_rate": 1.958164865512064e-05, + "loss": 2.4559, + "step": 1243 + }, + { + "epoch": 0.23947830690377073, + "grad_norm": 2.9460102699291153, + "learning_rate": 1.9580755794822278e-05, + "loss": 2.3986, + "step": 1244 + }, + { + "epoch": 0.2396708135813461, + "grad_norm": 3.5712419605370385, + "learning_rate": 1.9579862003151875e-05, + "loss": 2.458, + "step": 1245 + }, + { + "epoch": 0.2398633202589215, + "grad_norm": 3.85224886013358, + "learning_rate": 1.9578967280196327e-05, + "loss": 2.4255, + "step": 1246 + }, + { + "epoch": 0.24005582693649685, + "grad_norm": 3.28352008255624, + "learning_rate": 1.9578071626042602e-05, + "loss": 2.4651, + "step": 1247 + }, + { + "epoch": 0.24024833361407225, + "grad_norm": 3.9129803607234668, + "learning_rate": 1.957717504077778e-05, + "loss": 2.4547, + "step": 1248 + }, + { + "epoch": 0.24024833361407225, + "lm_loss": 2.5251, + "step": 1248, + "vm_loss": 0.1541 + }, + { + "epoch": 0.24024833361407225, + "lm_loss": 1.8931, + "step": 1248, + "vm_loss": 0.2147 + }, + { + "epoch": 0.24024833361407225, + "lm_loss": 2.4392, + "step": 1248, + "vm_loss": 0.2412 + }, + { + "epoch": 0.24024833361407225, + "lm_loss": 2.4346, + "step": 1248, + "vm_loss": 0.1223 + }, + { + "epoch": 0.24024833361407225, + "lm_loss": 2.3947, + "step": 1248, + "vm_loss": 0.1623 + }, + { + "epoch": 0.24024833361407225, + "lm_loss": 2.3603, + "step": 1248, + "vm_loss": 0.147 + }, + { + "epoch": 0.24024833361407225, + "lm_loss": 2.3085, + "step": 1248, + "vm_loss": 0.2455 + }, + { + "epoch": 0.24024833361407225, + "lm_loss": 2.4174, + "step": 1248, + "vm_loss": 0.1662 + }, + { + "epoch": 0.2404408402916476, + "grad_norm": 3.9670120959047233, + "learning_rate": 1.957627752448902e-05, + "loss": 2.4589, + "step": 1249 + }, + { + "epoch": 0.240633346969223, + "grad_norm": 3.583767071366644, + "learning_rate": 1.9575379077263562e-05, + "loss": 2.461, + "step": 1250 + }, + { + "epoch": 0.24082585364679837, + "grad_norm": 3.966554281561264, + "learning_rate": 1.9574479699188757e-05, + "loss": 2.4281, + "step": 1251 + }, + { + "epoch": 0.24101836032437376, + "grad_norm": 3.551034488151081, + "learning_rate": 1.9573579390352028e-05, + "loss": 2.5, + "step": 1252 + }, + { + "epoch": 0.24121086700194913, + "grad_norm": 3.6095440710159004, + "learning_rate": 1.9572678150840904e-05, + "loss": 2.4647, + "step": 1253 + }, + { + "epoch": 0.2414033736795245, + "grad_norm": 3.372422147812847, + "learning_rate": 1.957177598074299e-05, + "loss": 2.4433, + "step": 1254 + }, + { + "epoch": 0.2415958803570999, + "grad_norm": 3.2500028917779464, + "learning_rate": 1.9570872880145995e-05, + "loss": 2.4731, + "step": 1255 + }, + { + "epoch": 0.24178838703467526, + "grad_norm": 3.0608720031020376, + "learning_rate": 1.9569968849137705e-05, + "loss": 2.4711, + "step": 1256 + }, + { + "epoch": 0.24178838703467526, + "lm_loss": 2.2472, + "step": 1256, + "vm_loss": 0.2106 + }, + { + "epoch": 0.24178838703467526, + "lm_loss": 2.3767, + "step": 1256, + "vm_loss": 0.1891 + }, + { + "epoch": 0.24178838703467526, + "lm_loss": 2.3176, + "step": 1256, + "vm_loss": 0.1688 + }, + { + "epoch": 0.24178838703467526, + "lm_loss": 2.2331, + "step": 1256, + "vm_loss": 0.1593 + }, + { + "epoch": 0.24178838703467526, + "lm_loss": 2.1911, + "step": 1256, + "vm_loss": 0.1199 + }, + { + "epoch": 0.24178838703467526, + "lm_loss": 2.3035, + "step": 1256, + "vm_loss": 0.1939 + }, + { + "epoch": 0.24178838703467526, + "lm_loss": 2.2489, + "step": 1256, + "vm_loss": 0.1626 + }, + { + "epoch": 0.24178838703467526, + "lm_loss": 2.2317, + "step": 1256, + "vm_loss": 0.1561 + }, + { + "epoch": 0.24198089371225065, + "grad_norm": 3.7562562147713137, + "learning_rate": 1.9569063887806014e-05, + "loss": 2.4475, + "step": 1257 + }, + { + "epoch": 0.24217340038982602, + "grad_norm": 3.69405227034211, + "learning_rate": 1.9568157996238884e-05, + "loss": 2.4218, + "step": 1258 + }, + { + "epoch": 0.2423659070674014, + "grad_norm": 3.4793440501009294, + "learning_rate": 1.9567251174524384e-05, + "loss": 2.4519, + "step": 1259 + }, + { + "epoch": 0.24255841374497678, + "grad_norm": 3.210379678646978, + "learning_rate": 1.956634342275067e-05, + "loss": 2.4733, + "step": 1260 + }, + { + "epoch": 0.24275092042255217, + "grad_norm": 3.47679159835551, + "learning_rate": 1.9565434741005988e-05, + "loss": 2.4401, + "step": 1261 + }, + { + "epoch": 0.24294342710012753, + "grad_norm": 3.968353171325261, + "learning_rate": 1.956452512937867e-05, + "loss": 2.419, + "step": 1262 + }, + { + "epoch": 0.2431359337777029, + "grad_norm": 3.2161095814685465, + "learning_rate": 1.956361458795714e-05, + "loss": 2.4344, + "step": 1263 + }, + { + "epoch": 0.2433284404552783, + "grad_norm": 3.4924038956391836, + "learning_rate": 1.9562703116829923e-05, + "loss": 2.4614, + "step": 1264 + }, + { + "epoch": 0.2433284404552783, + "lm_loss": 2.1035, + "step": 1264, + "vm_loss": 0.1769 + }, + { + "epoch": 0.2433284404552783, + "lm_loss": 2.4302, + "step": 1264, + "vm_loss": 0.2315 + }, + { + "epoch": 0.2433284404552783, + "lm_loss": 2.25, + "step": 1264, + "vm_loss": 0.1868 + }, + { + "epoch": 0.2433284404552783, + "lm_loss": 2.2083, + "step": 1264, + "vm_loss": 0.1908 + }, + { + "epoch": 0.2433284404552783, + "lm_loss": 2.1024, + "step": 1264, + "vm_loss": 0.2034 + }, + { + "epoch": 0.2433284404552783, + "lm_loss": 2.1332, + "step": 1264, + "vm_loss": 0.21 + }, + { + "epoch": 0.2433284404552783, + "lm_loss": 2.3157, + "step": 1264, + "vm_loss": 0.1873 + }, + { + "epoch": 0.2433284404552783, + "lm_loss": 2.4478, + "step": 1264, + "vm_loss": 0.1428 + }, + { + "epoch": 0.24352094713285366, + "grad_norm": 3.533861837805012, + "learning_rate": 1.9561790716085617e-05, + "loss": 2.4888, + "step": 1265 + }, + { + "epoch": 0.24371345381042905, + "grad_norm": 3.2918500400473456, + "learning_rate": 1.9560877385812924e-05, + "loss": 2.4721, + "step": 1266 + }, + { + "epoch": 0.24390596048800442, + "grad_norm": 3.447267206266168, + "learning_rate": 1.955996312610063e-05, + "loss": 2.4205, + "step": 1267 + }, + { + "epoch": 0.2440984671655798, + "grad_norm": 3.3159754695482224, + "learning_rate": 1.9559047937037613e-05, + "loss": 2.4601, + "step": 1268 + }, + { + "epoch": 0.24429097384315518, + "grad_norm": 3.477322062869611, + "learning_rate": 1.9558131818712838e-05, + "loss": 2.4085, + "step": 1269 + }, + { + "epoch": 0.24448348052073057, + "grad_norm": 3.216674401765448, + "learning_rate": 1.955721477121537e-05, + "loss": 2.4391, + "step": 1270 + }, + { + "epoch": 0.24467598719830594, + "grad_norm": 3.393032976277194, + "learning_rate": 1.9556296794634348e-05, + "loss": 2.4743, + "step": 1271 + }, + { + "epoch": 0.24486849387588133, + "grad_norm": 3.230922177210364, + "learning_rate": 1.9555377889059022e-05, + "loss": 2.4189, + "step": 1272 + }, + { + "epoch": 0.24486849387588133, + "lm_loss": 2.0877, + "step": 1272, + "vm_loss": 0.2495 + }, + { + "epoch": 0.24486849387588133, + "lm_loss": 2.3314, + "step": 1272, + "vm_loss": 0.3155 + }, + { + "epoch": 0.24486849387588133, + "lm_loss": 1.8549, + "step": 1272, + "vm_loss": 0.2071 + }, + { + "epoch": 0.24486849387588133, + "lm_loss": 2.3719, + "step": 1272, + "vm_loss": 0.2241 + }, + { + "epoch": 0.24486849387588133, + "lm_loss": 2.4522, + "step": 1272, + "vm_loss": 0.1507 + }, + { + "epoch": 0.24486849387588133, + "lm_loss": 2.4764, + "step": 1272, + "vm_loss": 0.1921 + }, + { + "epoch": 0.24486849387588133, + "lm_loss": 2.5035, + "step": 1272, + "vm_loss": 0.1351 + }, + { + "epoch": 0.24486849387588133, + "lm_loss": 2.133, + "step": 1272, + "vm_loss": 0.1626 + }, + { + "epoch": 0.2450610005534567, + "grad_norm": 3.05452318873155, + "learning_rate": 1.9554458054578713e-05, + "loss": 2.4764, + "step": 1273 + }, + { + "epoch": 0.24525350723103206, + "grad_norm": 3.6540772139994715, + "learning_rate": 1.9553537291282846e-05, + "loss": 2.4683, + "step": 1274 + }, + { + "epoch": 0.24544601390860746, + "grad_norm": 3.1383818005000097, + "learning_rate": 1.9552615599260927e-05, + "loss": 2.4941, + "step": 1275 + }, + { + "epoch": 0.24563852058618282, + "grad_norm": 3.7826958435246265, + "learning_rate": 1.9551692978602557e-05, + "loss": 2.4663, + "step": 1276 + }, + { + "epoch": 0.24583102726375822, + "grad_norm": 3.500809634231439, + "learning_rate": 1.955076942939743e-05, + "loss": 2.4188, + "step": 1277 + }, + { + "epoch": 0.24602353394133358, + "grad_norm": 3.139786347407025, + "learning_rate": 1.9549844951735328e-05, + "loss": 2.5235, + "step": 1278 + }, + { + "epoch": 0.24621604061890898, + "grad_norm": 3.715662791454268, + "learning_rate": 1.9548919545706113e-05, + "loss": 2.4494, + "step": 1279 + }, + { + "epoch": 0.24640854729648434, + "grad_norm": 3.5537681789566737, + "learning_rate": 1.9547993211399753e-05, + "loss": 2.5021, + "step": 1280 + }, + { + "epoch": 0.24640854729648434, + "lm_loss": 2.2823, + "step": 1280, + "vm_loss": 0.1351 + }, + { + "epoch": 0.24640854729648434, + "lm_loss": 2.583, + "step": 1280, + "vm_loss": 0.1612 + }, + { + "epoch": 0.24640854729648434, + "lm_loss": 2.0475, + "step": 1280, + "vm_loss": 0.2781 + }, + { + "epoch": 0.24640854729648434, + "lm_loss": 2.2312, + "step": 1280, + "vm_loss": 0.1935 + }, + { + "epoch": 0.24640854729648434, + "lm_loss": 2.2127, + "step": 1280, + "vm_loss": 0.1453 + }, + { + "epoch": 0.24640854729648434, + "lm_loss": 2.2479, + "step": 1280, + "vm_loss": 0.1445 + }, + { + "epoch": 0.24640854729648434, + "lm_loss": 2.3117, + "step": 1280, + "vm_loss": 0.1334 + }, + { + "epoch": 0.24640854729648434, + "lm_loss": 2.4109, + "step": 1280, + "vm_loss": 0.1267 + }, + { + "epoch": 0.24660105397405974, + "grad_norm": 3.3442705721691133, + "learning_rate": 1.95470659489063e-05, + "loss": 2.468, + "step": 1281 + }, + { + "epoch": 0.2467935606516351, + "grad_norm": 3.5013485807753497, + "learning_rate": 1.954613775831589e-05, + "loss": 2.5184, + "step": 1282 + }, + { + "epoch": 0.2469860673292105, + "grad_norm": 3.3422968392427834, + "learning_rate": 1.9545208639718766e-05, + "loss": 2.4301, + "step": 1283 + }, + { + "epoch": 0.24717857400678586, + "grad_norm": 3.2711482106164613, + "learning_rate": 1.954427859320524e-05, + "loss": 2.5182, + "step": 1284 + }, + { + "epoch": 0.24737108068436123, + "grad_norm": 4.07606390410819, + "learning_rate": 1.9543347618865726e-05, + "loss": 2.427, + "step": 1285 + }, + { + "epoch": 0.24756358736193662, + "grad_norm": 2.9925954544734834, + "learning_rate": 1.9542415716790727e-05, + "loss": 2.4628, + "step": 1286 + }, + { + "epoch": 0.247756094039512, + "grad_norm": 3.7457811769328333, + "learning_rate": 1.9541482887070843e-05, + "loss": 2.4389, + "step": 1287 + }, + { + "epoch": 0.24794860071708738, + "grad_norm": 3.422721224804485, + "learning_rate": 1.9540549129796745e-05, + "loss": 2.4672, + "step": 1288 + }, + { + "epoch": 0.24794860071708738, + "lm_loss": 2.4687, + "step": 1288, + "vm_loss": 0.2037 + }, + { + "epoch": 0.24794860071708738, + "lm_loss": 2.397, + "step": 1288, + "vm_loss": 0.1799 + }, + { + "epoch": 0.24794860071708738, + "lm_loss": 1.786, + "step": 1288, + "vm_loss": 0.2771 + }, + { + "epoch": 0.24794860071708738, + "lm_loss": 2.4285, + "step": 1288, + "vm_loss": 0.17 + }, + { + "epoch": 0.24794860071708738, + "lm_loss": 2.4056, + "step": 1288, + "vm_loss": 0.1479 + }, + { + "epoch": 0.24794860071708738, + "lm_loss": 2.3301, + "step": 1288, + "vm_loss": 0.1602 + }, + { + "epoch": 0.24794860071708738, + "lm_loss": 2.3434, + "step": 1288, + "vm_loss": 0.1452 + }, + { + "epoch": 0.24794860071708738, + "lm_loss": 2.1714, + "step": 1288, + "vm_loss": 0.199 + }, + { + "epoch": 0.24814110739466275, + "grad_norm": 2.9637292844826795, + "learning_rate": 1.9539614445059215e-05, + "loss": 2.5068, + "step": 1289 + }, + { + "epoch": 0.24833361407223814, + "grad_norm": 4.205406904343167, + "learning_rate": 1.9538678832949113e-05, + "loss": 2.4976, + "step": 1290 + }, + { + "epoch": 0.2485261207498135, + "grad_norm": 3.7337525949892374, + "learning_rate": 1.9537742293557397e-05, + "loss": 2.4688, + "step": 1291 + }, + { + "epoch": 0.2487186274273889, + "grad_norm": 3.407423884312617, + "learning_rate": 1.9536804826975105e-05, + "loss": 2.4414, + "step": 1292 + }, + { + "epoch": 0.24891113410496427, + "grad_norm": 3.7277354429840126, + "learning_rate": 1.953586643329337e-05, + "loss": 2.4665, + "step": 1293 + }, + { + "epoch": 0.24910364078253963, + "grad_norm": 3.4387483267838443, + "learning_rate": 1.9534927112603417e-05, + "loss": 2.4555, + "step": 1294 + }, + { + "epoch": 0.24929614746011503, + "grad_norm": 4.057550120381568, + "learning_rate": 1.9533986864996563e-05, + "loss": 2.4908, + "step": 1295 + }, + { + "epoch": 0.2494886541376904, + "grad_norm": 3.9799671789650453, + "learning_rate": 1.9533045690564214e-05, + "loss": 2.4363, + "step": 1296 + }, + { + "epoch": 0.2494886541376904, + "lm_loss": 2.0481, + "step": 1296, + "vm_loss": 0.1162 + }, + { + "epoch": 0.2494886541376904, + "lm_loss": 2.4215, + "step": 1296, + "vm_loss": 0.2068 + }, + { + "epoch": 0.2494886541376904, + "lm_loss": 2.3141, + "step": 1296, + "vm_loss": 0.2174 + }, + { + "epoch": 0.2494886541376904, + "lm_loss": 2.2247, + "step": 1296, + "vm_loss": 0.1617 + }, + { + "epoch": 0.2494886541376904, + "lm_loss": 1.9348, + "step": 1296, + "vm_loss": 0.1766 + }, + { + "epoch": 0.2494886541376904, + "lm_loss": 2.2618, + "step": 1296, + "vm_loss": 0.221 + }, + { + "epoch": 0.2494886541376904, + "lm_loss": 2.4166, + "step": 1296, + "vm_loss": 0.1972 + }, + { + "epoch": 0.2494886541376904, + "lm_loss": 2.2877, + "step": 1296, + "vm_loss": 0.165 + }, + { + "epoch": 0.24968116081526578, + "grad_norm": 3.553468080904293, + "learning_rate": 1.9532103589397858e-05, + "loss": 2.438, + "step": 1297 + }, + { + "epoch": 0.24987366749284115, + "grad_norm": 3.3366699203215067, + "learning_rate": 1.953116056158908e-05, + "loss": 2.4579, + "step": 1298 + }, + { + "epoch": 0.25006617417041654, + "grad_norm": 3.549534831579184, + "learning_rate": 1.9530216607229557e-05, + "loss": 2.4496, + "step": 1299 + }, + { + "epoch": 0.2502586808479919, + "grad_norm": 2.9416537435647623, + "learning_rate": 1.9529271726411055e-05, + "loss": 2.486, + "step": 1300 + }, + { + "epoch": 0.2504511875255673, + "grad_norm": 3.549767881868953, + "learning_rate": 1.9528325919225426e-05, + "loss": 2.4643, + "step": 1301 + }, + { + "epoch": 0.2506436942031427, + "grad_norm": 3.0193358813288893, + "learning_rate": 1.9527379185764613e-05, + "loss": 2.405, + "step": 1302 + }, + { + "epoch": 0.25083620088071806, + "grad_norm": 3.270055103360828, + "learning_rate": 1.9526431526120656e-05, + "loss": 2.4652, + "step": 1303 + }, + { + "epoch": 0.25102870755829343, + "grad_norm": 2.6596369083952967, + "learning_rate": 1.952548294038567e-05, + "loss": 2.439, + "step": 1304 + }, + { + "epoch": 0.25102870755829343, + "lm_loss": 2.3055, + "step": 1304, + "vm_loss": 0.1628 + }, + { + "epoch": 0.25102870755829343, + "lm_loss": 2.2558, + "step": 1304, + "vm_loss": 0.1891 + }, + { + "epoch": 0.25102870755829343, + "lm_loss": 2.2685, + "step": 1304, + "vm_loss": 0.2205 + }, + { + "epoch": 0.25102870755829343, + "lm_loss": 1.9447, + "step": 1304, + "vm_loss": 0.2053 + }, + { + "epoch": 0.25102870755829343, + "lm_loss": 2.4669, + "step": 1304, + "vm_loss": 0.1086 + }, + { + "epoch": 0.25102870755829343, + "lm_loss": 2.3587, + "step": 1304, + "vm_loss": 0.1712 + }, + { + "epoch": 0.25102870755829343, + "lm_loss": 1.7869, + "step": 1304, + "vm_loss": 0.163 + }, + { + "epoch": 0.25102870755829343, + "lm_loss": 2.4312, + "step": 1304, + "vm_loss": 0.1517 + }, + { + "epoch": 0.2512212142358688, + "grad_norm": 3.2679977975074297, + "learning_rate": 1.9524533428651878e-05, + "loss": 2.4326, + "step": 1305 + }, + { + "epoch": 0.25141372091344416, + "grad_norm": 3.1348361909953466, + "learning_rate": 1.9523582991011585e-05, + "loss": 2.4319, + "step": 1306 + }, + { + "epoch": 0.2516062275910196, + "grad_norm": 3.1906128122893254, + "learning_rate": 1.952263162755718e-05, + "loss": 2.478, + "step": 1307 + }, + { + "epoch": 0.25179873426859495, + "grad_norm": 3.512905127957672, + "learning_rate": 1.952167933838115e-05, + "loss": 2.4409, + "step": 1308 + }, + { + "epoch": 0.2519912409461703, + "grad_norm": 2.8724466614974973, + "learning_rate": 1.9520726123576075e-05, + "loss": 2.4589, + "step": 1309 + }, + { + "epoch": 0.2521837476237457, + "grad_norm": 3.077330917970436, + "learning_rate": 1.951977198323461e-05, + "loss": 2.4494, + "step": 1310 + }, + { + "epoch": 0.2523762543013211, + "grad_norm": 3.2137246894170035, + "learning_rate": 1.9518816917449517e-05, + "loss": 2.4251, + "step": 1311 + }, + { + "epoch": 0.25256876097889647, + "grad_norm": 3.4836684105392846, + "learning_rate": 1.9517860926313644e-05, + "loss": 2.4533, + "step": 1312 + }, + { + "epoch": 0.25256876097889647, + "lm_loss": 2.1867, + "step": 1312, + "vm_loss": 0.1819 + }, + { + "epoch": 0.25256876097889647, + "lm_loss": 2.4755, + "step": 1312, + "vm_loss": 0.1749 + }, + { + "epoch": 0.25256876097889647, + "lm_loss": 2.4383, + "step": 1312, + "vm_loss": 0.1179 + }, + { + "epoch": 0.25256876097889647, + "lm_loss": 1.8094, + "step": 1312, + "vm_loss": 0.189 + }, + { + "epoch": 0.25256876097889647, + "lm_loss": 2.0335, + "step": 1312, + "vm_loss": 0.2433 + }, + { + "epoch": 0.25256876097889647, + "lm_loss": 2.6474, + "step": 1312, + "vm_loss": 0.1741 + }, + { + "epoch": 0.25256876097889647, + "lm_loss": 2.3003, + "step": 1312, + "vm_loss": 0.1389 + }, + { + "epoch": 0.25256876097889647, + "lm_loss": 2.7781, + "step": 1312, + "vm_loss": 0.1579 + }, + { + "epoch": 0.25276126765647183, + "grad_norm": 3.340385836767393, + "learning_rate": 1.951690400991991e-05, + "loss": 2.4892, + "step": 1313 + }, + { + "epoch": 0.2529537743340472, + "grad_norm": 3.069045119976783, + "learning_rate": 1.9515946168361357e-05, + "loss": 2.4618, + "step": 1314 + }, + { + "epoch": 0.25314628101162256, + "grad_norm": 3.3100112809818834, + "learning_rate": 1.9514987401731088e-05, + "loss": 2.4564, + "step": 1315 + }, + { + "epoch": 0.253338787689198, + "grad_norm": 2.997859414486016, + "learning_rate": 1.9514027710122313e-05, + "loss": 2.445, + "step": 1316 + }, + { + "epoch": 0.25353129436677335, + "grad_norm": 3.6982507774384494, + "learning_rate": 1.9513067093628323e-05, + "loss": 2.4981, + "step": 1317 + }, + { + "epoch": 0.2537238010443487, + "grad_norm": 2.586676960021598, + "learning_rate": 1.9512105552342507e-05, + "loss": 2.415, + "step": 1318 + }, + { + "epoch": 0.2539163077219241, + "grad_norm": 3.6333136142341704, + "learning_rate": 1.9511143086358334e-05, + "loss": 2.4571, + "step": 1319 + }, + { + "epoch": 0.2541088143994995, + "grad_norm": 3.483185217523218, + "learning_rate": 1.9510179695769367e-05, + "loss": 2.4789, + "step": 1320 + }, + { + "epoch": 0.2541088143994995, + "lm_loss": 2.492, + "step": 1320, + "vm_loss": 0.1001 + }, + { + "epoch": 0.2541088143994995, + "lm_loss": 2.3933, + "step": 1320, + "vm_loss": 0.1956 + }, + { + "epoch": 0.2541088143994995, + "lm_loss": 2.6844, + "step": 1320, + "vm_loss": 0.1681 + }, + { + "epoch": 0.2541088143994995, + "lm_loss": 2.2821, + "step": 1320, + "vm_loss": 0.1474 + }, + { + "epoch": 0.2541088143994995, + "lm_loss": 2.3256, + "step": 1320, + "vm_loss": 0.1271 + }, + { + "epoch": 0.2541088143994995, + "lm_loss": 2.3802, + "step": 1320, + "vm_loss": 0.1134 + }, + { + "epoch": 0.2541088143994995, + "lm_loss": 2.1818, + "step": 1320, + "vm_loss": 0.1981 + }, + { + "epoch": 0.2541088143994995, + "lm_loss": 2.3754, + "step": 1320, + "vm_loss": 0.132 + }, + { + "epoch": 0.25430132107707487, + "grad_norm": 2.826169024345817, + "learning_rate": 1.950921538066927e-05, + "loss": 2.4473, + "step": 1321 + }, + { + "epoch": 0.25449382775465024, + "grad_norm": 3.327700483085308, + "learning_rate": 1.9508250141151773e-05, + "loss": 2.4782, + "step": 1322 + }, + { + "epoch": 0.2546863344322256, + "grad_norm": 3.225481627547399, + "learning_rate": 1.9507283977310716e-05, + "loss": 2.3944, + "step": 1323 + }, + { + "epoch": 0.25487884110980097, + "grad_norm": 3.1353652483189, + "learning_rate": 1.9506316889240027e-05, + "loss": 2.4734, + "step": 1324 + }, + { + "epoch": 0.2550713477873764, + "grad_norm": 3.2868717470963733, + "learning_rate": 1.9505348877033715e-05, + "loss": 2.4282, + "step": 1325 + }, + { + "epoch": 0.25526385446495176, + "grad_norm": 2.876685465658946, + "learning_rate": 1.950437994078588e-05, + "loss": 2.4599, + "step": 1326 + }, + { + "epoch": 0.2554563611425271, + "grad_norm": 3.3548750590399443, + "learning_rate": 1.950341008059072e-05, + "loss": 2.4458, + "step": 1327 + }, + { + "epoch": 0.2556488678201025, + "grad_norm": 3.788430963411669, + "learning_rate": 1.9502439296542516e-05, + "loss": 2.4228, + "step": 1328 + }, + { + "epoch": 0.2556488678201025, + "lm_loss": 2.2098, + "step": 1328, + "vm_loss": 0.1965 + }, + { + "epoch": 0.2556488678201025, + "lm_loss": 1.9991, + "step": 1328, + "vm_loss": 0.1766 + }, + { + "epoch": 0.2556488678201025, + "lm_loss": 2.4696, + "step": 1328, + "vm_loss": 0.2352 + }, + { + "epoch": 0.2556488678201025, + "lm_loss": 2.272, + "step": 1328, + "vm_loss": 0.1721 + }, + { + "epoch": 0.2556488678201025, + "lm_loss": 2.3064, + "step": 1328, + "vm_loss": 0.1944 + }, + { + "epoch": 0.2556488678201025, + "lm_loss": 2.2167, + "step": 1328, + "vm_loss": 0.1275 + }, + { + "epoch": 0.2556488678201025, + "lm_loss": 2.2243, + "step": 1328, + "vm_loss": 0.1614 + }, + { + "epoch": 0.2556488678201025, + "lm_loss": 2.3176, + "step": 1328, + "vm_loss": 0.1606 + }, + { + "epoch": 0.2558413744976779, + "grad_norm": 3.157049014916354, + "learning_rate": 1.950146758873564e-05, + "loss": 2.4171, + "step": 1329 + }, + { + "epoch": 0.2560338811752533, + "grad_norm": 3.217759995974044, + "learning_rate": 1.9500494957264556e-05, + "loss": 2.4448, + "step": 1330 + }, + { + "epoch": 0.25622638785282864, + "grad_norm": 3.3655286097439756, + "learning_rate": 1.9499521402223816e-05, + "loss": 2.4174, + "step": 1331 + }, + { + "epoch": 0.256418894530404, + "grad_norm": 3.065707467387859, + "learning_rate": 1.949854692370806e-05, + "loss": 2.3962, + "step": 1332 + }, + { + "epoch": 0.25661140120797943, + "grad_norm": 3.3265994165512383, + "learning_rate": 1.949757152181202e-05, + "loss": 2.4279, + "step": 1333 + }, + { + "epoch": 0.2568039078855548, + "grad_norm": 3.047502649025238, + "learning_rate": 1.949659519663052e-05, + "loss": 2.4237, + "step": 1334 + }, + { + "epoch": 0.25699641456313016, + "grad_norm": 3.81421034363058, + "learning_rate": 1.9495617948258473e-05, + "loss": 2.44, + "step": 1335 + }, + { + "epoch": 0.2571889212407055, + "grad_norm": 3.4803572909974507, + "learning_rate": 1.949463977679087e-05, + "loss": 2.465, + "step": 1336 + }, + { + "epoch": 0.2571889212407055, + "lm_loss": 2.1166, + "step": 1336, + "vm_loss": 0.2559 + }, + { + "epoch": 0.2571889212407055, + "lm_loss": 2.0812, + "step": 1336, + "vm_loss": 0.1324 + }, + { + "epoch": 0.2571889212407055, + "lm_loss": 1.9693, + "step": 1336, + "vm_loss": 0.1819 + }, + { + "epoch": 0.2571889212407055, + "lm_loss": 2.096, + "step": 1336, + "vm_loss": 0.1757 + }, + { + "epoch": 0.2571889212407055, + "lm_loss": 2.21, + "step": 1336, + "vm_loss": 0.2052 + }, + { + "epoch": 0.2571889212407055, + "lm_loss": 2.2473, + "step": 1336, + "vm_loss": 0.2201 + }, + { + "epoch": 0.2571889212407055, + "lm_loss": 2.0484, + "step": 1336, + "vm_loss": 0.1342 + }, + { + "epoch": 0.2571889212407055, + "lm_loss": 2.4665, + "step": 1336, + "vm_loss": 0.1655 + }, + { + "epoch": 0.2573814279182809, + "grad_norm": 3.033006132100957, + "learning_rate": 1.9493660682322813e-05, + "loss": 2.4679, + "step": 1337 + }, + { + "epoch": 0.2575739345958563, + "grad_norm": 3.205101448085657, + "learning_rate": 1.949268066494948e-05, + "loss": 2.4853, + "step": 1338 + }, + { + "epoch": 0.2577664412734317, + "grad_norm": 3.831261203767552, + "learning_rate": 1.9491699724766136e-05, + "loss": 2.4555, + "step": 1339 + }, + { + "epoch": 0.25795894795100704, + "grad_norm": 2.973340038440222, + "learning_rate": 1.9490717861868146e-05, + "loss": 2.4245, + "step": 1340 + }, + { + "epoch": 0.2581514546285824, + "grad_norm": 3.86195014991082, + "learning_rate": 1.948973507635096e-05, + "loss": 2.4634, + "step": 1341 + }, + { + "epoch": 0.25834396130615783, + "grad_norm": 3.4165985199006323, + "learning_rate": 1.9488751368310113e-05, + "loss": 2.4493, + "step": 1342 + }, + { + "epoch": 0.2585364679837332, + "grad_norm": 3.2956956895908105, + "learning_rate": 1.9487766737841236e-05, + "loss": 2.4661, + "step": 1343 + }, + { + "epoch": 0.25872897466130856, + "grad_norm": 3.1985477918133864, + "learning_rate": 1.9486781185040052e-05, + "loss": 2.4651, + "step": 1344 + }, + { + "epoch": 0.25872897466130856, + "lm_loss": 1.876, + "step": 1344, + "vm_loss": 0.1496 + }, + { + "epoch": 0.25872897466130856, + "lm_loss": 1.9058, + "step": 1344, + "vm_loss": 0.2471 + }, + { + "epoch": 0.25872897466130856, + "lm_loss": 2.281, + "step": 1344, + "vm_loss": 0.2274 + }, + { + "epoch": 0.25872897466130856, + "lm_loss": 2.579, + "step": 1344, + "vm_loss": 0.2354 + }, + { + "epoch": 0.25872897466130856, + "lm_loss": 2.5787, + "step": 1344, + "vm_loss": 0.2315 + }, + { + "epoch": 0.25872897466130856, + "lm_loss": 2.202, + "step": 1344, + "vm_loss": 0.099 + }, + { + "epoch": 0.25872897466130856, + "lm_loss": 2.4344, + "step": 1344, + "vm_loss": 0.2358 + }, + { + "epoch": 0.25872897466130856, + "lm_loss": 2.3623, + "step": 1344, + "vm_loss": 0.1273 + }, + { + "epoch": 0.25892148133888393, + "grad_norm": 3.765784192387748, + "learning_rate": 1.948579471000236e-05, + "loss": 2.4244, + "step": 1345 + }, + { + "epoch": 0.2591139880164593, + "grad_norm": 3.79114609538082, + "learning_rate": 1.9484807312824066e-05, + "loss": 2.4661, + "step": 1346 + }, + { + "epoch": 0.2593064946940347, + "grad_norm": 3.811668255443407, + "learning_rate": 1.9483818993601157e-05, + "loss": 2.4524, + "step": 1347 + }, + { + "epoch": 0.2594990013716101, + "grad_norm": 3.8470479091873258, + "learning_rate": 1.9482829752429703e-05, + "loss": 2.4474, + "step": 1348 + }, + { + "epoch": 0.25969150804918545, + "grad_norm": 2.8976283748457243, + "learning_rate": 1.948183958940588e-05, + "loss": 2.4393, + "step": 1349 + }, + { + "epoch": 0.2598840147267608, + "grad_norm": 3.341725874920753, + "learning_rate": 1.9480848504625943e-05, + "loss": 2.4413, + "step": 1350 + }, + { + "epoch": 0.26007652140433624, + "grad_norm": 3.8403607779619704, + "learning_rate": 1.9479856498186232e-05, + "loss": 2.4955, + "step": 1351 + }, + { + "epoch": 0.2602690280819116, + "grad_norm": 3.2913681487474244, + "learning_rate": 1.947886357018319e-05, + "loss": 2.4731, + "step": 1352 + }, + { + "epoch": 0.2602690280819116, + "lm_loss": 2.0142, + "step": 1352, + "vm_loss": 0.2126 + }, + { + "epoch": 0.2602690280819116, + "lm_loss": 2.4495, + "step": 1352, + "vm_loss": 0.1895 + }, + { + "epoch": 0.2602690280819116, + "lm_loss": 2.1474, + "step": 1352, + "vm_loss": 0.2119 + }, + { + "epoch": 0.2602690280819116, + "lm_loss": 2.3824, + "step": 1352, + "vm_loss": 0.138 + }, + { + "epoch": 0.2602690280819116, + "lm_loss": 2.2786, + "step": 1352, + "vm_loss": 0.2129 + }, + { + "epoch": 0.2602690280819116, + "lm_loss": 2.2902, + "step": 1352, + "vm_loss": 0.1723 + }, + { + "epoch": 0.2602690280819116, + "lm_loss": 2.0131, + "step": 1352, + "vm_loss": 0.1014 + }, + { + "epoch": 0.2602690280819116, + "lm_loss": 2.254, + "step": 1352, + "vm_loss": 0.1357 + }, + { + "epoch": 0.26046153475948697, + "grad_norm": 3.3369202959380155, + "learning_rate": 1.9477869720713335e-05, + "loss": 2.427, + "step": 1353 + }, + { + "epoch": 0.26065404143706233, + "grad_norm": 4.0912223931257214, + "learning_rate": 1.9476874949873286e-05, + "loss": 2.4289, + "step": 1354 + }, + { + "epoch": 0.2608465481146377, + "grad_norm": 3.6923710272736114, + "learning_rate": 1.9475879257759753e-05, + "loss": 2.455, + "step": 1355 + }, + { + "epoch": 0.2610390547922131, + "grad_norm": 3.709818370142207, + "learning_rate": 1.9474882644469517e-05, + "loss": 2.4257, + "step": 1356 + }, + { + "epoch": 0.2612315614697885, + "grad_norm": 3.4055689067583432, + "learning_rate": 1.947388511009947e-05, + "loss": 2.3915, + "step": 1357 + }, + { + "epoch": 0.26142406814736385, + "grad_norm": 3.9842501473511356, + "learning_rate": 1.9472886654746588e-05, + "loss": 2.4562, + "step": 1358 + }, + { + "epoch": 0.2616165748249392, + "grad_norm": 2.952188124461196, + "learning_rate": 1.947188727850793e-05, + "loss": 2.4583, + "step": 1359 + }, + { + "epoch": 0.26180908150251464, + "grad_norm": 3.953323780222254, + "learning_rate": 1.9470886981480642e-05, + "loss": 2.4813, + "step": 1360 + }, + { + "epoch": 0.26180908150251464, + "lm_loss": 2.3377, + "step": 1360, + "vm_loss": 0.1776 + }, + { + "epoch": 0.26180908150251464, + "lm_loss": 2.4006, + "step": 1360, + "vm_loss": 0.1362 + }, + { + "epoch": 0.26180908150251464, + "lm_loss": 2.3592, + "step": 1360, + "vm_loss": 0.1912 + }, + { + "epoch": 0.26180908150251464, + "lm_loss": 2.3334, + "step": 1360, + "vm_loss": 0.1517 + }, + { + "epoch": 0.26180908150251464, + "lm_loss": 1.6674, + "step": 1360, + "vm_loss": 0.1866 + }, + { + "epoch": 0.26180908150251464, + "lm_loss": 2.1496, + "step": 1360, + "vm_loss": 0.1509 + }, + { + "epoch": 0.26180908150251464, + "lm_loss": 2.1673, + "step": 1360, + "vm_loss": 0.1756 + }, + { + "epoch": 0.26180908150251464, + "lm_loss": 2.4051, + "step": 1360, + "vm_loss": 0.1144 + }, + { + "epoch": 0.26200158818009, + "grad_norm": 3.6364914509651243, + "learning_rate": 1.9469885763761975e-05, + "loss": 2.4193, + "step": 1361 + }, + { + "epoch": 0.26219409485766537, + "grad_norm": 3.3224858959836725, + "learning_rate": 1.9468883625449256e-05, + "loss": 2.4438, + "step": 1362 + }, + { + "epoch": 0.26238660153524074, + "grad_norm": 3.4044018426757514, + "learning_rate": 1.9467880566639908e-05, + "loss": 2.4129, + "step": 1363 + }, + { + "epoch": 0.26257910821281616, + "grad_norm": 3.260301998937309, + "learning_rate": 1.9466876587431436e-05, + "loss": 2.463, + "step": 1364 + }, + { + "epoch": 0.2627716148903915, + "grad_norm": 3.3845445974390356, + "learning_rate": 1.9465871687921444e-05, + "loss": 2.4312, + "step": 1365 + }, + { + "epoch": 0.2629641215679669, + "grad_norm": 3.9139235265343135, + "learning_rate": 1.946486586820762e-05, + "loss": 2.3797, + "step": 1366 + }, + { + "epoch": 0.26315662824554226, + "grad_norm": 3.106114386511253, + "learning_rate": 1.9463859128387745e-05, + "loss": 2.4967, + "step": 1367 + }, + { + "epoch": 0.2633491349231176, + "grad_norm": 3.5513330707919453, + "learning_rate": 1.946285146855968e-05, + "loss": 2.5088, + "step": 1368 + }, + { + "epoch": 0.2633491349231176, + "lm_loss": 2.2648, + "step": 1368, + "vm_loss": 0.217 + }, + { + "epoch": 0.2633491349231176, + "lm_loss": 2.1635, + "step": 1368, + "vm_loss": 0.1691 + }, + { + "epoch": 0.2633491349231176, + "lm_loss": 2.4339, + "step": 1368, + "vm_loss": 0.1791 + }, + { + "epoch": 0.2633491349231176, + "lm_loss": 2.4929, + "step": 1368, + "vm_loss": 0.1999 + }, + { + "epoch": 0.2633491349231176, + "lm_loss": 2.3344, + "step": 1368, + "vm_loss": 0.2022 + }, + { + "epoch": 0.2633491349231176, + "lm_loss": 2.3458, + "step": 1368, + "vm_loss": 0.208 + }, + { + "epoch": 0.2633491349231176, + "lm_loss": 2.3191, + "step": 1368, + "vm_loss": 0.1261 + }, + { + "epoch": 0.2633491349231176, + "lm_loss": 2.2249, + "step": 1368, + "vm_loss": 0.1018 + }, + { + "epoch": 0.26354164160069304, + "grad_norm": 3.596890405237465, + "learning_rate": 1.946184288882139e-05, + "loss": 2.4508, + "step": 1369 + }, + { + "epoch": 0.2637341482782684, + "grad_norm": 2.888969322878278, + "learning_rate": 1.946083338927092e-05, + "loss": 2.4324, + "step": 1370 + }, + { + "epoch": 0.2639266549558438, + "grad_norm": 3.5671345359860647, + "learning_rate": 1.9459822970006407e-05, + "loss": 2.4462, + "step": 1371 + }, + { + "epoch": 0.26411916163341914, + "grad_norm": 3.473380576223788, + "learning_rate": 1.945881163112607e-05, + "loss": 2.4195, + "step": 1372 + }, + { + "epoch": 0.26431166831099456, + "grad_norm": 3.4430168173110456, + "learning_rate": 1.945779937272823e-05, + "loss": 2.4703, + "step": 1373 + }, + { + "epoch": 0.26450417498856993, + "grad_norm": 3.4405994117511294, + "learning_rate": 1.9456786194911292e-05, + "loss": 2.4396, + "step": 1374 + }, + { + "epoch": 0.2646966816661453, + "grad_norm": 3.115956749334274, + "learning_rate": 1.9455772097773746e-05, + "loss": 2.435, + "step": 1375 + }, + { + "epoch": 0.26488918834372066, + "grad_norm": 3.3354418666958927, + "learning_rate": 1.945475708141418e-05, + "loss": 2.3856, + "step": 1376 + }, + { + "epoch": 0.26488918834372066, + "lm_loss": 1.8427, + "step": 1376, + "vm_loss": 0.1941 + }, + { + "epoch": 0.26488918834372066, + "lm_loss": 2.3545, + "step": 1376, + "vm_loss": 0.1236 + }, + { + "epoch": 0.26488918834372066, + "lm_loss": 2.474, + "step": 1376, + "vm_loss": 0.2317 + }, + { + "epoch": 0.26488918834372066, + "lm_loss": 2.6104, + "step": 1376, + "vm_loss": 0.1725 + }, + { + "epoch": 0.26488918834372066, + "lm_loss": 2.4971, + "step": 1376, + "vm_loss": 0.1129 + }, + { + "epoch": 0.26488918834372066, + "lm_loss": 2.3644, + "step": 1376, + "vm_loss": 0.2272 + }, + { + "epoch": 0.26488918834372066, + "lm_loss": 2.3845, + "step": 1376, + "vm_loss": 0.1546 + }, + { + "epoch": 0.26488918834372066, + "lm_loss": 2.2048, + "step": 1376, + "vm_loss": 0.1536 + }, + { + "epoch": 0.265081695021296, + "grad_norm": 3.283703394033432, + "learning_rate": 1.9453741145931263e-05, + "loss": 2.5054, + "step": 1377 + }, + { + "epoch": 0.26527420169887145, + "grad_norm": 3.591374093888235, + "learning_rate": 1.9452724291423756e-05, + "loss": 2.4856, + "step": 1378 + }, + { + "epoch": 0.2654667083764468, + "grad_norm": 3.104396396592494, + "learning_rate": 1.9451706517990516e-05, + "loss": 2.4176, + "step": 1379 + }, + { + "epoch": 0.2656592150540222, + "grad_norm": 3.5614668340707767, + "learning_rate": 1.9450687825730476e-05, + "loss": 2.4932, + "step": 1380 + }, + { + "epoch": 0.26585172173159755, + "grad_norm": 2.9171927414336616, + "learning_rate": 1.9449668214742673e-05, + "loss": 2.4851, + "step": 1381 + }, + { + "epoch": 0.26604422840917297, + "grad_norm": 2.8508008003582366, + "learning_rate": 1.9448647685126222e-05, + "loss": 2.421, + "step": 1382 + }, + { + "epoch": 0.26623673508674833, + "grad_norm": 3.5220959827087968, + "learning_rate": 1.944762623698033e-05, + "loss": 2.487, + "step": 1383 + }, + { + "epoch": 0.2664292417643237, + "grad_norm": 3.164982774887524, + "learning_rate": 1.94466038704043e-05, + "loss": 2.4083, + "step": 1384 + }, + { + "epoch": 0.2664292417643237, + "lm_loss": 2.43, + "step": 1384, + "vm_loss": 0.1478 + }, + { + "epoch": 0.2664292417643237, + "lm_loss": 2.1552, + "step": 1384, + "vm_loss": 0.2158 + }, + { + "epoch": 0.2664292417643237, + "lm_loss": 2.3703, + "step": 1384, + "vm_loss": 0.1915 + }, + { + "epoch": 0.2664292417643237, + "lm_loss": 2.3977, + "step": 1384, + "vm_loss": 0.2207 + }, + { + "epoch": 0.2664292417643237, + "lm_loss": 2.1224, + "step": 1384, + "vm_loss": 0.1382 + }, + { + "epoch": 0.2664292417643237, + "lm_loss": 2.366, + "step": 1384, + "vm_loss": 0.1662 + }, + { + "epoch": 0.2664292417643237, + "lm_loss": 1.9032, + "step": 1384, + "vm_loss": 0.1806 + }, + { + "epoch": 0.2664292417643237, + "lm_loss": 2.2334, + "step": 1384, + "vm_loss": 0.1838 + }, + { + "epoch": 0.26662174844189906, + "grad_norm": 2.750466161768323, + "learning_rate": 1.9445580585497516e-05, + "loss": 2.423, + "step": 1385 + }, + { + "epoch": 0.26681425511947443, + "grad_norm": 3.9582665977384788, + "learning_rate": 1.9444556382359454e-05, + "loss": 2.4138, + "step": 1386 + }, + { + "epoch": 0.26700676179704985, + "grad_norm": 3.1158391955155476, + "learning_rate": 1.9443531261089682e-05, + "loss": 2.4077, + "step": 1387 + }, + { + "epoch": 0.2671992684746252, + "grad_norm": 3.0296666561383345, + "learning_rate": 1.944250522178785e-05, + "loss": 2.4164, + "step": 1388 + }, + { + "epoch": 0.2673917751522006, + "grad_norm": 3.465929984323705, + "learning_rate": 1.944147826455371e-05, + "loss": 2.4274, + "step": 1389 + }, + { + "epoch": 0.26758428182977595, + "grad_norm": 3.5465496574462967, + "learning_rate": 1.944045038948709e-05, + "loss": 2.4907, + "step": 1390 + }, + { + "epoch": 0.26777678850735137, + "grad_norm": 3.156895609755201, + "learning_rate": 1.943942159668791e-05, + "loss": 2.4799, + "step": 1391 + }, + { + "epoch": 0.26796929518492674, + "grad_norm": 3.4785568902455517, + "learning_rate": 1.943839188625619e-05, + "loss": 2.4296, + "step": 1392 + }, + { + "epoch": 0.26796929518492674, + "lm_loss": 2.3289, + "step": 1392, + "vm_loss": 0.1501 + }, + { + "epoch": 0.26796929518492674, + "lm_loss": 2.4526, + "step": 1392, + "vm_loss": 0.0885 + }, + { + "epoch": 0.26796929518492674, + "lm_loss": 2.3605, + "step": 1392, + "vm_loss": 0.1562 + }, + { + "epoch": 0.26796929518492674, + "lm_loss": 2.225, + "step": 1392, + "vm_loss": 0.2055 + }, + { + "epoch": 0.26796929518492674, + "lm_loss": 2.2868, + "step": 1392, + "vm_loss": 0.1709 + }, + { + "epoch": 0.26796929518492674, + "lm_loss": 2.3788, + "step": 1392, + "vm_loss": 0.1823 + }, + { + "epoch": 0.26796929518492674, + "lm_loss": 2.0234, + "step": 1392, + "vm_loss": 0.1437 + }, + { + "epoch": 0.26796929518492674, + "lm_loss": 2.0114, + "step": 1392, + "vm_loss": 0.2148 + }, + { + "epoch": 0.2681618018625021, + "grad_norm": 3.341224126619619, + "learning_rate": 1.9437361258292023e-05, + "loss": 2.4348, + "step": 1393 + }, + { + "epoch": 0.26835430854007747, + "grad_norm": 3.4269511921820475, + "learning_rate": 1.9436329712895605e-05, + "loss": 2.4465, + "step": 1394 + }, + { + "epoch": 0.2685468152176529, + "grad_norm": 3.70836013637945, + "learning_rate": 1.943529725016721e-05, + "loss": 2.4456, + "step": 1395 + }, + { + "epoch": 0.26873932189522826, + "grad_norm": 3.6695991073139846, + "learning_rate": 1.943426387020721e-05, + "loss": 2.466, + "step": 1396 + }, + { + "epoch": 0.2689318285728036, + "grad_norm": 3.5660145683217275, + "learning_rate": 1.9433229573116067e-05, + "loss": 2.4619, + "step": 1397 + }, + { + "epoch": 0.269124335250379, + "grad_norm": 3.2775950533856335, + "learning_rate": 1.943219435899432e-05, + "loss": 2.4565, + "step": 1398 + }, + { + "epoch": 0.26931684192795435, + "grad_norm": 3.6747508314816986, + "learning_rate": 1.9431158227942607e-05, + "loss": 2.4536, + "step": 1399 + }, + { + "epoch": 0.2695093486055298, + "grad_norm": 3.035128726047039, + "learning_rate": 1.9430121180061654e-05, + "loss": 2.4246, + "step": 1400 + }, + { + "epoch": 0.2695093486055298, + "lm_loss": 2.3388, + "step": 1400, + "vm_loss": 0.1728 + }, + { + "epoch": 0.2695093486055298, + "lm_loss": 2.3608, + "step": 1400, + "vm_loss": 0.1865 + }, + { + "epoch": 0.2695093486055298, + "lm_loss": 2.2271, + "step": 1400, + "vm_loss": 0.1483 + }, + { + "epoch": 0.2695093486055298, + "lm_loss": 2.1778, + "step": 1400, + "vm_loss": 0.1482 + }, + { + "epoch": 0.2695093486055298, + "lm_loss": 1.8123, + "step": 1400, + "vm_loss": 0.2525 + }, + { + "epoch": 0.2695093486055298, + "lm_loss": 2.3682, + "step": 1400, + "vm_loss": 0.1878 + }, + { + "epoch": 0.2695093486055298, + "lm_loss": 2.2861, + "step": 1400, + "vm_loss": 0.1902 + }, + { + "epoch": 0.2695093486055298, + "lm_loss": 2.3043, + "step": 1400, + "vm_loss": 0.2017 + }, + { + "epoch": 0.26970185528310514, + "grad_norm": 3.505646957946543, + "learning_rate": 1.9429083215452276e-05, + "loss": 2.4131, + "step": 1401 + }, + { + "epoch": 0.2698943619606805, + "grad_norm": 2.9293006954186085, + "learning_rate": 1.942804433421538e-05, + "loss": 2.4436, + "step": 1402 + }, + { + "epoch": 0.2700868686382559, + "grad_norm": 3.884522514501745, + "learning_rate": 1.9427004536451952e-05, + "loss": 2.4756, + "step": 1403 + }, + { + "epoch": 0.2702793753158313, + "grad_norm": 3.1622845713526013, + "learning_rate": 1.9425963822263077e-05, + "loss": 2.4612, + "step": 1404 + }, + { + "epoch": 0.27047188199340666, + "grad_norm": 3.271885497090578, + "learning_rate": 1.9424922191749927e-05, + "loss": 2.3933, + "step": 1405 + }, + { + "epoch": 0.270664388670982, + "grad_norm": 3.0806264069388924, + "learning_rate": 1.9423879645013758e-05, + "loss": 2.4557, + "step": 1406 + }, + { + "epoch": 0.2708568953485574, + "grad_norm": 3.158411007166911, + "learning_rate": 1.942283618215592e-05, + "loss": 2.4362, + "step": 1407 + }, + { + "epoch": 0.27104940202613276, + "grad_norm": 3.442724779764118, + "learning_rate": 1.942179180327785e-05, + "loss": 2.4523, + "step": 1408 + }, + { + "epoch": 0.27104940202613276, + "lm_loss": 2.1825, + "step": 1408, + "vm_loss": 0.1245 + }, + { + "epoch": 0.27104940202613276, + "lm_loss": 2.425, + "step": 1408, + "vm_loss": 0.1232 + }, + { + "epoch": 0.27104940202613276, + "lm_loss": 2.5548, + "step": 1408, + "vm_loss": 0.1889 + }, + { + "epoch": 0.27104940202613276, + "lm_loss": 2.3955, + "step": 1408, + "vm_loss": 0.2327 + }, + { + "epoch": 0.27104940202613276, + "lm_loss": 2.2641, + "step": 1408, + "vm_loss": 0.1262 + }, + { + "epoch": 0.27104940202613276, + "lm_loss": 1.8005, + "step": 1408, + "vm_loss": 0.1851 + }, + { + "epoch": 0.27104940202613276, + "lm_loss": 1.9291, + "step": 1408, + "vm_loss": 0.1958 + }, + { + "epoch": 0.27104940202613276, + "lm_loss": 2.0288, + "step": 1408, + "vm_loss": 0.1916 + }, + { + "epoch": 0.2712419087037082, + "grad_norm": 3.6250834309428543, + "learning_rate": 1.9420746508481083e-05, + "loss": 2.4288, + "step": 1409 + }, + { + "epoch": 0.27143441538128354, + "grad_norm": 3.2004605722111075, + "learning_rate": 1.9419700297867227e-05, + "loss": 2.4353, + "step": 1410 + }, + { + "epoch": 0.2716269220588589, + "grad_norm": 3.467379498683046, + "learning_rate": 1.9418653171537984e-05, + "loss": 2.4311, + "step": 1411 + }, + { + "epoch": 0.2718194287364343, + "grad_norm": 3.892785395896328, + "learning_rate": 1.941760512959516e-05, + "loss": 2.4796, + "step": 1412 + }, + { + "epoch": 0.2720119354140097, + "grad_norm": 3.8812363990587655, + "learning_rate": 1.9416556172140625e-05, + "loss": 2.4402, + "step": 1413 + }, + { + "epoch": 0.27220444209158506, + "grad_norm": 3.9808887010098664, + "learning_rate": 1.941550629927636e-05, + "loss": 2.3967, + "step": 1414 + }, + { + "epoch": 0.27239694876916043, + "grad_norm": 2.971071485447531, + "learning_rate": 1.9414455511104423e-05, + "loss": 2.4938, + "step": 1415 + }, + { + "epoch": 0.2725894554467358, + "grad_norm": 4.34825013055807, + "learning_rate": 1.9413403807726964e-05, + "loss": 2.4766, + "step": 1416 + }, + { + "epoch": 0.2725894554467358, + "lm_loss": 2.455, + "step": 1416, + "vm_loss": 0.1472 + }, + { + "epoch": 0.2725894554467358, + "lm_loss": 2.1661, + "step": 1416, + "vm_loss": 0.2036 + }, + { + "epoch": 0.2725894554467358, + "lm_loss": 2.1425, + "step": 1416, + "vm_loss": 0.1667 + }, + { + "epoch": 0.2725894554467358, + "lm_loss": 1.7711, + "step": 1416, + "vm_loss": 0.1853 + }, + { + "epoch": 0.2725894554467358, + "lm_loss": 2.3659, + "step": 1416, + "vm_loss": 0.1325 + }, + { + "epoch": 0.2725894554467358, + "lm_loss": 2.2913, + "step": 1416, + "vm_loss": 0.1851 + }, + { + "epoch": 0.2725894554467358, + "lm_loss": 2.3871, + "step": 1416, + "vm_loss": 0.1987 + }, + { + "epoch": 0.2725894554467358, + "lm_loss": 2.4161, + "step": 1416, + "vm_loss": 0.1411 + }, + { + "epoch": 0.27278196212431116, + "grad_norm": 3.80510396481745, + "learning_rate": 1.9412351189246222e-05, + "loss": 2.4145, + "step": 1417 + }, + { + "epoch": 0.2729744688018866, + "grad_norm": 3.324689546315768, + "learning_rate": 1.9411297655764526e-05, + "loss": 2.4223, + "step": 1418 + }, + { + "epoch": 0.27316697547946195, + "grad_norm": 3.2339236068441934, + "learning_rate": 1.9410243207384294e-05, + "loss": 2.4133, + "step": 1419 + }, + { + "epoch": 0.2733594821570373, + "grad_norm": 3.3032509446274823, + "learning_rate": 1.9409187844208026e-05, + "loss": 2.4938, + "step": 1420 + }, + { + "epoch": 0.2735519888346127, + "grad_norm": 3.4105288699708822, + "learning_rate": 1.940813156633832e-05, + "loss": 2.4174, + "step": 1421 + }, + { + "epoch": 0.2737444955121881, + "grad_norm": 2.981732450523626, + "learning_rate": 1.9407074373877868e-05, + "loss": 2.4556, + "step": 1422 + }, + { + "epoch": 0.27393700218976347, + "grad_norm": 3.483751064534896, + "learning_rate": 1.940601626692943e-05, + "loss": 2.4673, + "step": 1423 + }, + { + "epoch": 0.27412950886733883, + "grad_norm": 3.637206778259412, + "learning_rate": 1.9404957245595872e-05, + "loss": 2.4044, + "step": 1424 + }, + { + "epoch": 0.27412950886733883, + "lm_loss": 2.5646, + "step": 1424, + "vm_loss": 0.1605 + }, + { + "epoch": 0.27412950886733883, + "lm_loss": 2.2616, + "step": 1424, + "vm_loss": 0.1444 + }, + { + "epoch": 0.27412950886733883, + "lm_loss": 2.2578, + "step": 1424, + "vm_loss": 0.1506 + }, + { + "epoch": 0.27412950886733883, + "lm_loss": 2.4856, + "step": 1424, + "vm_loss": 0.1918 + }, + { + "epoch": 0.27412950886733883, + "lm_loss": 2.0239, + "step": 1424, + "vm_loss": 0.1592 + }, + { + "epoch": 0.27412950886733883, + "lm_loss": 2.4159, + "step": 1424, + "vm_loss": 0.1777 + }, + { + "epoch": 0.27412950886733883, + "lm_loss": 2.0632, + "step": 1424, + "vm_loss": 0.2392 + }, + { + "epoch": 0.27412950886733883, + "lm_loss": 2.3561, + "step": 1424, + "vm_loss": 0.118 + }, + { + "epoch": 0.2743220155449142, + "grad_norm": 3.334940308214593, + "learning_rate": 1.9403897309980148e-05, + "loss": 2.4394, + "step": 1425 + }, + { + "epoch": 0.27451452222248957, + "grad_norm": 3.359629194681667, + "learning_rate": 1.9402836460185294e-05, + "loss": 2.4494, + "step": 1426 + }, + { + "epoch": 0.274707028900065, + "grad_norm": 3.584232008094124, + "learning_rate": 1.9401774696314436e-05, + "loss": 2.4618, + "step": 1427 + }, + { + "epoch": 0.27489953557764035, + "grad_norm": 3.32365386990465, + "learning_rate": 1.9400712018470795e-05, + "loss": 2.4542, + "step": 1428 + }, + { + "epoch": 0.2750920422552157, + "grad_norm": 3.3221741105192013, + "learning_rate": 1.9399648426757677e-05, + "loss": 2.4543, + "step": 1429 + }, + { + "epoch": 0.2752845489327911, + "grad_norm": 3.5807113365247, + "learning_rate": 1.9398583921278473e-05, + "loss": 2.4822, + "step": 1430 + }, + { + "epoch": 0.2754770556103665, + "grad_norm": 3.8576944735433862, + "learning_rate": 1.939751850213667e-05, + "loss": 2.4871, + "step": 1431 + }, + { + "epoch": 0.27566956228794187, + "grad_norm": 3.3826688525455495, + "learning_rate": 1.939645216943584e-05, + "loss": 2.4167, + "step": 1432 + }, + { + "epoch": 0.27566956228794187, + "lm_loss": 2.2886, + "step": 1432, + "vm_loss": 0.1675 + }, + { + "epoch": 0.27566956228794187, + "lm_loss": 2.331, + "step": 1432, + "vm_loss": 0.2156 + }, + { + "epoch": 0.27566956228794187, + "lm_loss": 2.0088, + "step": 1432, + "vm_loss": 0.1431 + }, + { + "epoch": 0.27566956228794187, + "lm_loss": 2.0556, + "step": 1432, + "vm_loss": 0.1716 + }, + { + "epoch": 0.27566956228794187, + "lm_loss": 2.4222, + "step": 1432, + "vm_loss": 0.2471 + }, + { + "epoch": 0.27566956228794187, + "lm_loss": 2.308, + "step": 1432, + "vm_loss": 0.2009 + }, + { + "epoch": 0.27566956228794187, + "lm_loss": 2.2325, + "step": 1432, + "vm_loss": 0.1515 + }, + { + "epoch": 0.27566956228794187, + "lm_loss": 2.2871, + "step": 1432, + "vm_loss": 0.1482 + }, + { + "epoch": 0.27586206896551724, + "grad_norm": 3.1797972697553796, + "learning_rate": 1.9395384923279645e-05, + "loss": 2.4508, + "step": 1433 + }, + { + "epoch": 0.2760545756430926, + "grad_norm": 3.2526859642537445, + "learning_rate": 1.939431676377183e-05, + "loss": 2.4567, + "step": 1434 + }, + { + "epoch": 0.276247082320668, + "grad_norm": 3.4052983077775942, + "learning_rate": 1.939324769101624e-05, + "loss": 2.4154, + "step": 1435 + }, + { + "epoch": 0.2764395889982434, + "grad_norm": 3.4141383180878973, + "learning_rate": 1.9392177705116796e-05, + "loss": 2.4501, + "step": 1436 + }, + { + "epoch": 0.27663209567581876, + "grad_norm": 3.215217216481154, + "learning_rate": 1.939110680617752e-05, + "loss": 2.4618, + "step": 1437 + }, + { + "epoch": 0.2768246023533941, + "grad_norm": 3.477741379150412, + "learning_rate": 1.9390034994302516e-05, + "loss": 2.4218, + "step": 1438 + }, + { + "epoch": 0.2770171090309695, + "grad_norm": 3.2436896574811778, + "learning_rate": 1.9388962269595977e-05, + "loss": 2.4259, + "step": 1439 + }, + { + "epoch": 0.2772096157085449, + "grad_norm": 3.2570097703567935, + "learning_rate": 1.9387888632162185e-05, + "loss": 2.4457, + "step": 1440 + }, + { + "epoch": 0.2772096157085449, + "lm_loss": 2.3304, + "step": 1440, + "vm_loss": 0.1857 + }, + { + "epoch": 0.2772096157085449, + "lm_loss": 2.2604, + "step": 1440, + "vm_loss": 0.1624 + }, + { + "epoch": 0.2772096157085449, + "lm_loss": 2.3713, + "step": 1440, + "vm_loss": 0.1852 + }, + { + "epoch": 0.2772096157085449, + "lm_loss": 2.3503, + "step": 1440, + "vm_loss": 0.2156 + }, + { + "epoch": 0.2772096157085449, + "lm_loss": 1.9513, + "step": 1440, + "vm_loss": 0.1283 + }, + { + "epoch": 0.2772096157085449, + "lm_loss": 2.3759, + "step": 1440, + "vm_loss": 0.1565 + }, + { + "epoch": 0.2772096157085449, + "lm_loss": 2.3531, + "step": 1440, + "vm_loss": 0.1666 + }, + { + "epoch": 0.2772096157085449, + "lm_loss": 2.2514, + "step": 1440, + "vm_loss": 0.0969 + }, + { + "epoch": 0.2774021223861203, + "grad_norm": 3.3567399709409247, + "learning_rate": 1.938681408210551e-05, + "loss": 2.4071, + "step": 1441 + }, + { + "epoch": 0.27759462906369564, + "grad_norm": 3.2675266602580595, + "learning_rate": 1.9385738619530415e-05, + "loss": 2.4457, + "step": 1442 + }, + { + "epoch": 0.277787135741271, + "grad_norm": 3.405524210116633, + "learning_rate": 1.9384662244541448e-05, + "loss": 2.4597, + "step": 1443 + }, + { + "epoch": 0.27797964241884643, + "grad_norm": 3.7974134684640584, + "learning_rate": 1.9383584957243245e-05, + "loss": 2.3995, + "step": 1444 + }, + { + "epoch": 0.2781721490964218, + "grad_norm": 2.9300876578354194, + "learning_rate": 1.9382506757740532e-05, + "loss": 2.4493, + "step": 1445 + }, + { + "epoch": 0.27836465577399716, + "grad_norm": 4.225220921943164, + "learning_rate": 1.938142764613813e-05, + "loss": 2.4154, + "step": 1446 + }, + { + "epoch": 0.2785571624515725, + "grad_norm": 3.418211995090257, + "learning_rate": 1.9380347622540932e-05, + "loss": 2.447, + "step": 1447 + }, + { + "epoch": 0.2787496691291479, + "grad_norm": 3.706350884167747, + "learning_rate": 1.937926668705394e-05, + "loss": 2.4166, + "step": 1448 + }, + { + "epoch": 0.2787496691291479, + "lm_loss": 2.2721, + "step": 1448, + "vm_loss": 0.1857 + }, + { + "epoch": 0.2787496691291479, + "lm_loss": 2.0539, + "step": 1448, + "vm_loss": 0.2276 + }, + { + "epoch": 0.2787496691291479, + "lm_loss": 2.3414, + "step": 1448, + "vm_loss": 0.1996 + }, + { + "epoch": 0.2787496691291479, + "lm_loss": 2.2531, + "step": 1448, + "vm_loss": 0.1395 + }, + { + "epoch": 0.2787496691291479, + "lm_loss": 2.1248, + "step": 1448, + "vm_loss": 0.2533 + }, + { + "epoch": 0.2787496691291479, + "lm_loss": 2.3428, + "step": 1448, + "vm_loss": 0.1623 + }, + { + "epoch": 0.2787496691291479, + "lm_loss": 2.1109, + "step": 1448, + "vm_loss": 0.177 + }, + { + "epoch": 0.2787496691291479, + "lm_loss": 2.3689, + "step": 1448, + "vm_loss": 0.0945 + }, + { + "epoch": 0.2789421758067233, + "grad_norm": 3.30134788915163, + "learning_rate": 1.9378184839782226e-05, + "loss": 2.4076, + "step": 1449 + }, + { + "epoch": 0.2791346824842987, + "grad_norm": 3.2799994295793544, + "learning_rate": 1.937710208083096e-05, + "loss": 2.4107, + "step": 1450 + }, + { + "epoch": 0.27932718916187405, + "grad_norm": 3.1642706198204884, + "learning_rate": 1.9376018410305414e-05, + "loss": 2.4124, + "step": 1451 + }, + { + "epoch": 0.2795196958394494, + "grad_norm": 3.558069638639298, + "learning_rate": 1.9374933828310917e-05, + "loss": 2.4536, + "step": 1452 + }, + { + "epoch": 0.27971220251702483, + "grad_norm": 3.9670991485634928, + "learning_rate": 1.9373848334952913e-05, + "loss": 2.4421, + "step": 1453 + }, + { + "epoch": 0.2799047091946002, + "grad_norm": 2.934392409520615, + "learning_rate": 1.9372761930336925e-05, + "loss": 2.4224, + "step": 1454 + }, + { + "epoch": 0.28009721587217556, + "grad_norm": 3.0701092166421446, + "learning_rate": 1.9371674614568565e-05, + "loss": 2.4721, + "step": 1455 + }, + { + "epoch": 0.28028972254975093, + "grad_norm": 3.4830205583229037, + "learning_rate": 1.9370586387753532e-05, + "loss": 2.4002, + "step": 1456 + }, + { + "epoch": 0.28028972254975093, + "lm_loss": 2.1842, + "step": 1456, + "vm_loss": 0.1549 + }, + { + "epoch": 0.28028972254975093, + "lm_loss": 2.3036, + "step": 1456, + "vm_loss": 0.1558 + }, + { + "epoch": 0.28028972254975093, + "lm_loss": 2.3776, + "step": 1456, + "vm_loss": 0.1666 + }, + { + "epoch": 0.28028972254975093, + "lm_loss": 2.3612, + "step": 1456, + "vm_loss": 0.1646 + }, + { + "epoch": 0.28028972254975093, + "lm_loss": 2.0333, + "step": 1456, + "vm_loss": 0.182 + }, + { + "epoch": 0.28028972254975093, + "lm_loss": 2.639, + "step": 1456, + "vm_loss": 0.1368 + }, + { + "epoch": 0.28028972254975093, + "lm_loss": 2.1852, + "step": 1456, + "vm_loss": 0.122 + }, + { + "epoch": 0.28028972254975093, + "lm_loss": 2.0453, + "step": 1456, + "vm_loss": 0.2606 + }, + { + "epoch": 0.2804822292273263, + "grad_norm": 3.4796728604936105, + "learning_rate": 1.936949724999762e-05, + "loss": 2.4067, + "step": 1457 + }, + { + "epoch": 0.2806747359049017, + "grad_norm": 3.4553376674331338, + "learning_rate": 1.93684072014067e-05, + "loss": 2.4537, + "step": 1458 + }, + { + "epoch": 0.2808672425824771, + "grad_norm": 3.5616219282649104, + "learning_rate": 1.9367316242086747e-05, + "loss": 2.4485, + "step": 1459 + }, + { + "epoch": 0.28105974926005245, + "grad_norm": 3.831548443895444, + "learning_rate": 1.936622437214381e-05, + "loss": 2.4071, + "step": 1460 + }, + { + "epoch": 0.2812522559376278, + "grad_norm": 3.7627822599846765, + "learning_rate": 1.9365131591684037e-05, + "loss": 2.4595, + "step": 1461 + }, + { + "epoch": 0.28144476261520324, + "grad_norm": 3.3081301772927865, + "learning_rate": 1.936403790081366e-05, + "loss": 2.4481, + "step": 1462 + }, + { + "epoch": 0.2816372692927786, + "grad_norm": 3.333841573553775, + "learning_rate": 1.9362943299638994e-05, + "loss": 2.4035, + "step": 1463 + }, + { + "epoch": 0.28182977597035397, + "grad_norm": 4.0084181972830075, + "learning_rate": 1.9361847788266455e-05, + "loss": 2.4311, + "step": 1464 + }, + { + "epoch": 0.28182977597035397, + "lm_loss": 2.1462, + "step": 1464, + "vm_loss": 0.1706 + }, + { + "epoch": 0.28182977597035397, + "lm_loss": 2.2539, + "step": 1464, + "vm_loss": 0.1311 + }, + { + "epoch": 0.28182977597035397, + "lm_loss": 2.0586, + "step": 1464, + "vm_loss": 0.1986 + }, + { + "epoch": 0.28182977597035397, + "lm_loss": 2.3023, + "step": 1464, + "vm_loss": 0.1896 + }, + { + "epoch": 0.28182977597035397, + "lm_loss": 2.4273, + "step": 1464, + "vm_loss": 0.1569 + }, + { + "epoch": 0.28182977597035397, + "lm_loss": 2.1696, + "step": 1464, + "vm_loss": 0.1546 + }, + { + "epoch": 0.28182977597035397, + "lm_loss": 2.3907, + "step": 1464, + "vm_loss": 0.1472 + }, + { + "epoch": 0.28182977597035397, + "lm_loss": 2.0218, + "step": 1464, + "vm_loss": 0.2175 + }, + { + "epoch": 0.28202228264792933, + "grad_norm": 3.3816599717158793, + "learning_rate": 1.9360751366802534e-05, + "loss": 2.4262, + "step": 1465 + }, + { + "epoch": 0.28221478932550476, + "grad_norm": 3.031942496017371, + "learning_rate": 1.9359654035353825e-05, + "loss": 2.3665, + "step": 1466 + }, + { + "epoch": 0.2824072960030801, + "grad_norm": 3.0897395198458635, + "learning_rate": 1.9358555794027e-05, + "loss": 2.4771, + "step": 1467 + }, + { + "epoch": 0.2825998026806555, + "grad_norm": 3.0227620930025934, + "learning_rate": 1.9357456642928817e-05, + "loss": 2.3985, + "step": 1468 + }, + { + "epoch": 0.28279230935823085, + "grad_norm": 3.3676118116721767, + "learning_rate": 1.9356356582166134e-05, + "loss": 2.3991, + "step": 1469 + }, + { + "epoch": 0.2829848160358062, + "grad_norm": 3.157686084310653, + "learning_rate": 1.9355255611845885e-05, + "loss": 2.4119, + "step": 1470 + }, + { + "epoch": 0.28317732271338164, + "grad_norm": 3.118219953202858, + "learning_rate": 1.9354153732075106e-05, + "loss": 2.4515, + "step": 1471 + }, + { + "epoch": 0.283369829390957, + "grad_norm": 2.8497602634085433, + "learning_rate": 1.935305094296091e-05, + "loss": 2.4476, + "step": 1472 + }, + { + "epoch": 0.283369829390957, + "lm_loss": 2.065, + "step": 1472, + "vm_loss": 0.2413 + }, + { + "epoch": 0.283369829390957, + "lm_loss": 2.0688, + "step": 1472, + "vm_loss": 0.1385 + }, + { + "epoch": 0.283369829390957, + "lm_loss": 2.1513, + "step": 1472, + "vm_loss": 0.223 + }, + { + "epoch": 0.283369829390957, + "lm_loss": 2.4929, + "step": 1472, + "vm_loss": 0.1294 + }, + { + "epoch": 0.283369829390957, + "lm_loss": 2.2769, + "step": 1472, + "vm_loss": 0.1766 + }, + { + "epoch": 0.283369829390957, + "lm_loss": 2.2175, + "step": 1472, + "vm_loss": 0.1647 + }, + { + "epoch": 0.283369829390957, + "lm_loss": 1.8374, + "step": 1472, + "vm_loss": 0.1672 + }, + { + "epoch": 0.283369829390957, + "lm_loss": 2.2375, + "step": 1472, + "vm_loss": 0.1554 + }, + { + "epoch": 0.2835623360685324, + "grad_norm": 2.750610238396522, + "learning_rate": 1.93519472446105e-05, + "loss": 2.3855, + "step": 1473 + }, + { + "epoch": 0.28375484274610774, + "grad_norm": 3.167429351489139, + "learning_rate": 1.9350842637131172e-05, + "loss": 2.4213, + "step": 1474 + }, + { + "epoch": 0.28394734942368316, + "grad_norm": 2.688577038309872, + "learning_rate": 1.9349737120630302e-05, + "loss": 2.4289, + "step": 1475 + }, + { + "epoch": 0.2841398561012585, + "grad_norm": 3.8624307457942, + "learning_rate": 1.934863069521537e-05, + "loss": 2.3499, + "step": 1476 + }, + { + "epoch": 0.2843323627788339, + "grad_norm": 3.1920537613772146, + "learning_rate": 1.9347523360993933e-05, + "loss": 2.4393, + "step": 1477 + }, + { + "epoch": 0.28452486945640926, + "grad_norm": 3.4810979008869802, + "learning_rate": 1.9346415118073634e-05, + "loss": 2.451, + "step": 1478 + }, + { + "epoch": 0.2847173761339846, + "grad_norm": 3.6482575452682764, + "learning_rate": 1.934530596656221e-05, + "loss": 2.4376, + "step": 1479 + }, + { + "epoch": 0.28490988281156004, + "grad_norm": 3.0546001127709097, + "learning_rate": 1.9344195906567488e-05, + "loss": 2.407, + "step": 1480 + }, + { + "epoch": 0.28490988281156004, + "lm_loss": 2.3662, + "step": 1480, + "vm_loss": 0.1893 + }, + { + "epoch": 0.28490988281156004, + "lm_loss": 2.3432, + "step": 1480, + "vm_loss": 0.2485 + }, + { + "epoch": 0.28490988281156004, + "lm_loss": 2.451, + "step": 1480, + "vm_loss": 0.159 + }, + { + "epoch": 0.28490988281156004, + "lm_loss": 2.423, + "step": 1480, + "vm_loss": 0.1842 + }, + { + "epoch": 0.28490988281156004, + "lm_loss": 2.2159, + "step": 1480, + "vm_loss": 0.1914 + }, + { + "epoch": 0.28490988281156004, + "lm_loss": 1.9316, + "step": 1480, + "vm_loss": 0.1612 + }, + { + "epoch": 0.28490988281156004, + "lm_loss": 2.3108, + "step": 1480, + "vm_loss": 0.1435 + }, + { + "epoch": 0.28490988281156004, + "lm_loss": 2.2624, + "step": 1480, + "vm_loss": 0.1866 + }, + { + "epoch": 0.2851023894891354, + "grad_norm": 3.2276755655245597, + "learning_rate": 1.934308493819737e-05, + "loss": 2.4757, + "step": 1481 + }, + { + "epoch": 0.2852948961667108, + "grad_norm": 3.0527062119395536, + "learning_rate": 1.9341973061559866e-05, + "loss": 2.3694, + "step": 1482 + }, + { + "epoch": 0.28548740284428614, + "grad_norm": 2.7659991155426846, + "learning_rate": 1.9340860276763062e-05, + "loss": 2.4144, + "step": 1483 + }, + { + "epoch": 0.28567990952186156, + "grad_norm": 3.934423341342164, + "learning_rate": 1.9339746583915135e-05, + "loss": 2.4713, + "step": 1484 + }, + { + "epoch": 0.28587241619943693, + "grad_norm": 2.911922614405069, + "learning_rate": 1.9338631983124348e-05, + "loss": 2.412, + "step": 1485 + }, + { + "epoch": 0.2860649228770123, + "grad_norm": 3.2962340737937397, + "learning_rate": 1.9337516474499058e-05, + "loss": 2.4345, + "step": 1486 + }, + { + "epoch": 0.28625742955458766, + "grad_norm": 3.534613269749138, + "learning_rate": 1.9336400058147707e-05, + "loss": 2.4425, + "step": 1487 + }, + { + "epoch": 0.286449936232163, + "grad_norm": 2.925708784736393, + "learning_rate": 1.9335282734178816e-05, + "loss": 2.4239, + "step": 1488 + }, + { + "epoch": 0.286449936232163, + "lm_loss": 2.0142, + "step": 1488, + "vm_loss": 0.1647 + }, + { + "epoch": 0.286449936232163, + "lm_loss": 2.2237, + "step": 1488, + "vm_loss": 0.1487 + }, + { + "epoch": 0.286449936232163, + "lm_loss": 2.2606, + "step": 1488, + "vm_loss": 0.1634 + }, + { + "epoch": 0.286449936232163, + "lm_loss": 2.0704, + "step": 1488, + "vm_loss": 0.1723 + }, + { + "epoch": 0.286449936232163, + "lm_loss": 2.3242, + "step": 1488, + "vm_loss": 0.2087 + }, + { + "epoch": 0.286449936232163, + "lm_loss": 2.1596, + "step": 1488, + "vm_loss": 0.1608 + }, + { + "epoch": 0.286449936232163, + "lm_loss": 2.6922, + "step": 1488, + "vm_loss": 0.1549 + }, + { + "epoch": 0.286449936232163, + "lm_loss": 2.0314, + "step": 1488, + "vm_loss": 0.1677 + }, + { + "epoch": 0.28664244290973845, + "grad_norm": 3.432725225137702, + "learning_rate": 1.9334164502701017e-05, + "loss": 2.3839, + "step": 1489 + }, + { + "epoch": 0.2868349495873138, + "grad_norm": 3.6911435427842907, + "learning_rate": 1.9333045363823005e-05, + "loss": 2.4592, + "step": 1490 + }, + { + "epoch": 0.2870274562648892, + "grad_norm": 2.9310357638767326, + "learning_rate": 1.9331925317653583e-05, + "loss": 2.4692, + "step": 1491 + }, + { + "epoch": 0.28721996294246455, + "grad_norm": 3.263490254763622, + "learning_rate": 1.933080436430163e-05, + "loss": 2.4359, + "step": 1492 + }, + { + "epoch": 0.28741246962003997, + "grad_norm": 3.2613532343481295, + "learning_rate": 1.9329682503876115e-05, + "loss": 2.4698, + "step": 1493 + }, + { + "epoch": 0.28760497629761533, + "grad_norm": 3.7611595173855354, + "learning_rate": 1.93285597364861e-05, + "loss": 2.39, + "step": 1494 + }, + { + "epoch": 0.2877974829751907, + "grad_norm": 3.0550631689166075, + "learning_rate": 1.932743606224073e-05, + "loss": 2.4212, + "step": 1495 + }, + { + "epoch": 0.28798998965276607, + "grad_norm": 3.1079278386130396, + "learning_rate": 1.932631148124924e-05, + "loss": 2.4433, + "step": 1496 + }, + { + "epoch": 0.28798998965276607, + "lm_loss": 2.0973, + "step": 1496, + "vm_loss": 0.2133 + }, + { + "epoch": 0.28798998965276607, + "lm_loss": 2.1244, + "step": 1496, + "vm_loss": 0.1808 + }, + { + "epoch": 0.28798998965276607, + "lm_loss": 2.4423, + "step": 1496, + "vm_loss": 0.1441 + }, + { + "epoch": 0.28798998965276607, + "lm_loss": 2.1986, + "step": 1496, + "vm_loss": 0.1757 + }, + { + "epoch": 0.28798998965276607, + "lm_loss": 2.4945, + "step": 1496, + "vm_loss": 0.1428 + }, + { + "epoch": 0.28798998965276607, + "lm_loss": 2.4617, + "step": 1496, + "vm_loss": 0.173 + }, + { + "epoch": 0.28798998965276607, + "lm_loss": 2.3319, + "step": 1496, + "vm_loss": 0.1543 + }, + { + "epoch": 0.28798998965276607, + "lm_loss": 2.3622, + "step": 1496, + "vm_loss": 0.103 + }, + { + "epoch": 0.28818249633034143, + "grad_norm": 3.408789973939458, + "learning_rate": 1.9325185993620962e-05, + "loss": 2.4686, + "step": 1497 + }, + { + "epoch": 0.28837500300791685, + "grad_norm": 3.553674151823099, + "learning_rate": 1.9324059599465296e-05, + "loss": 2.4361, + "step": 1498 + }, + { + "epoch": 0.2885675096854922, + "grad_norm": 3.80201610175442, + "learning_rate": 1.932293229889175e-05, + "loss": 2.4753, + "step": 1499 + }, + { + "epoch": 0.2887600163630676, + "grad_norm": 3.150315227405042, + "learning_rate": 1.932180409200991e-05, + "loss": 2.44, + "step": 1500 + }, + { + "epoch": 0.28895252304064295, + "grad_norm": 3.0200214473873155, + "learning_rate": 1.9320674978929447e-05, + "loss": 2.4222, + "step": 1501 + }, + { + "epoch": 0.28914502971821837, + "grad_norm": 3.5690659587874842, + "learning_rate": 1.9319544959760135e-05, + "loss": 2.4609, + "step": 1502 + }, + { + "epoch": 0.28933753639579374, + "grad_norm": 3.0061943532336786, + "learning_rate": 1.931841403461182e-05, + "loss": 2.4438, + "step": 1503 + }, + { + "epoch": 0.2895300430733691, + "grad_norm": 3.5862230381876707, + "learning_rate": 1.9317282203594438e-05, + "loss": 2.4467, + "step": 1504 + }, + { + "epoch": 0.2895300430733691, + "lm_loss": 2.4456, + "step": 1504, + "vm_loss": 0.1528 + }, + { + "epoch": 0.2895300430733691, + "lm_loss": 2.4268, + "step": 1504, + "vm_loss": 0.1784 + }, + { + "epoch": 0.2895300430733691, + "lm_loss": 2.2311, + "step": 1504, + "vm_loss": 0.1974 + }, + { + "epoch": 0.2895300430733691, + "lm_loss": 2.253, + "step": 1504, + "vm_loss": 0.2262 + }, + { + "epoch": 0.2895300430733691, + "lm_loss": 2.2247, + "step": 1504, + "vm_loss": 0.1779 + }, + { + "epoch": 0.2895300430733691, + "lm_loss": 2.3179, + "step": 1504, + "vm_loss": 0.223 + }, + { + "epoch": 0.2895300430733691, + "lm_loss": 1.9328, + "step": 1504, + "vm_loss": 0.2 + }, + { + "epoch": 0.2895300430733691, + "lm_loss": 2.2715, + "step": 1504, + "vm_loss": 0.1611 + }, + { + "epoch": 0.28972254975094447, + "grad_norm": 3.6304841617843415, + "learning_rate": 1.931614946681803e-05, + "loss": 2.4679, + "step": 1505 + }, + { + "epoch": 0.2899150564285199, + "grad_norm": 3.647313916477359, + "learning_rate": 1.9315015824392703e-05, + "loss": 2.4313, + "step": 1506 + }, + { + "epoch": 0.29010756310609526, + "grad_norm": 3.466466129136221, + "learning_rate": 1.9313881276428663e-05, + "loss": 2.415, + "step": 1507 + }, + { + "epoch": 0.2903000697836706, + "grad_norm": 3.071139527153352, + "learning_rate": 1.9312745823036204e-05, + "loss": 2.4442, + "step": 1508 + }, + { + "epoch": 0.290492576461246, + "grad_norm": 3.048578618265307, + "learning_rate": 1.931160946432571e-05, + "loss": 2.4066, + "step": 1509 + }, + { + "epoch": 0.29068508313882135, + "grad_norm": 3.292399401220064, + "learning_rate": 1.931047220040764e-05, + "loss": 2.4343, + "step": 1510 + }, + { + "epoch": 0.2908775898163968, + "grad_norm": 3.02061935349102, + "learning_rate": 1.930933403139256e-05, + "loss": 2.4333, + "step": 1511 + }, + { + "epoch": 0.29107009649397214, + "grad_norm": 3.246994924559873, + "learning_rate": 1.9308194957391108e-05, + "loss": 2.4321, + "step": 1512 + }, + { + "epoch": 0.29107009649397214, + "lm_loss": 2.0297, + "step": 1512, + "vm_loss": 0.1497 + }, + { + "epoch": 0.29107009649397214, + "lm_loss": 2.309, + "step": 1512, + "vm_loss": 0.1535 + }, + { + "epoch": 0.29107009649397214, + "lm_loss": 2.1463, + "step": 1512, + "vm_loss": 0.1856 + }, + { + "epoch": 0.29107009649397214, + "lm_loss": 2.4213, + "step": 1512, + "vm_loss": 0.1436 + }, + { + "epoch": 0.29107009649397214, + "lm_loss": 2.3265, + "step": 1512, + "vm_loss": 0.1278 + }, + { + "epoch": 0.29107009649397214, + "lm_loss": 1.9666, + "step": 1512, + "vm_loss": 0.1285 + }, + { + "epoch": 0.29107009649397214, + "lm_loss": 2.3875, + "step": 1512, + "vm_loss": 0.1879 + }, + { + "epoch": 0.29107009649397214, + "lm_loss": 2.4664, + "step": 1512, + "vm_loss": 0.211 + }, + { + "epoch": 0.2912626031715475, + "grad_norm": 3.2806053534608557, + "learning_rate": 1.930705497851402e-05, + "loss": 2.3972, + "step": 1513 + }, + { + "epoch": 0.2914551098491229, + "grad_norm": 2.8277551831830317, + "learning_rate": 1.930591409487212e-05, + "loss": 2.469, + "step": 1514 + }, + { + "epoch": 0.2916476165266983, + "grad_norm": 3.0564067918013795, + "learning_rate": 1.9304772306576313e-05, + "loss": 2.4073, + "step": 1515 + }, + { + "epoch": 0.29184012320427366, + "grad_norm": 3.3798323473530063, + "learning_rate": 1.9303629613737592e-05, + "loss": 2.5061, + "step": 1516 + }, + { + "epoch": 0.292032629881849, + "grad_norm": 3.141003667262534, + "learning_rate": 1.9302486016467044e-05, + "loss": 2.4846, + "step": 1517 + }, + { + "epoch": 0.2922251365594244, + "grad_norm": 4.086867886740118, + "learning_rate": 1.930134151487584e-05, + "loss": 2.4152, + "step": 1518 + }, + { + "epoch": 0.29241764323699976, + "grad_norm": 3.1399383187177756, + "learning_rate": 1.9300196109075246e-05, + "loss": 2.3922, + "step": 1519 + }, + { + "epoch": 0.2926101499145752, + "grad_norm": 2.9732394632524817, + "learning_rate": 1.9299049799176602e-05, + "loss": 2.478, + "step": 1520 + }, + { + "epoch": 0.2926101499145752, + "lm_loss": 2.2772, + "step": 1520, + "vm_loss": 0.1447 + }, + { + "epoch": 0.2926101499145752, + "lm_loss": 2.0747, + "step": 1520, + "vm_loss": 0.1365 + }, + { + "epoch": 0.2926101499145752, + "lm_loss": 2.0432, + "step": 1520, + "vm_loss": 0.1829 + }, + { + "epoch": 0.2926101499145752, + "lm_loss": 2.4983, + "step": 1520, + "vm_loss": 0.1161 + }, + { + "epoch": 0.2926101499145752, + "lm_loss": 2.2147, + "step": 1520, + "vm_loss": 0.1677 + }, + { + "epoch": 0.2926101499145752, + "lm_loss": 2.2581, + "step": 1520, + "vm_loss": 0.1976 + }, + { + "epoch": 0.2926101499145752, + "lm_loss": 2.3986, + "step": 1520, + "vm_loss": 0.2364 + }, + { + "epoch": 0.2926101499145752, + "lm_loss": 2.2722, + "step": 1520, + "vm_loss": 0.1245 + }, + { + "epoch": 0.29280265659215055, + "grad_norm": 3.491632012710889, + "learning_rate": 1.9297902585291353e-05, + "loss": 2.4289, + "step": 1521 + }, + { + "epoch": 0.2929951632697259, + "grad_norm": 3.1107432971832485, + "learning_rate": 1.9296754467531015e-05, + "loss": 2.4146, + "step": 1522 + }, + { + "epoch": 0.2931876699473013, + "grad_norm": 3.3132383473598597, + "learning_rate": 1.92956054460072e-05, + "loss": 2.3997, + "step": 1523 + }, + { + "epoch": 0.2933801766248767, + "grad_norm": 3.2747616389183585, + "learning_rate": 1.929445552083161e-05, + "loss": 2.4231, + "step": 1524 + }, + { + "epoch": 0.29357268330245206, + "grad_norm": 2.6655935791159786, + "learning_rate": 1.9293304692116037e-05, + "loss": 2.4428, + "step": 1525 + }, + { + "epoch": 0.29376518998002743, + "grad_norm": 2.983970671287051, + "learning_rate": 1.929215295997235e-05, + "loss": 2.4056, + "step": 1526 + }, + { + "epoch": 0.2939576966576028, + "grad_norm": 3.090315462183964, + "learning_rate": 1.9291000324512514e-05, + "loss": 2.3686, + "step": 1527 + }, + { + "epoch": 0.29415020333517816, + "grad_norm": 2.69929539644961, + "learning_rate": 1.928984678584858e-05, + "loss": 2.4185, + "step": 1528 + }, + { + "epoch": 0.29415020333517816, + "lm_loss": 2.4492, + "step": 1528, + "vm_loss": 0.1415 + }, + { + "epoch": 0.29415020333517816, + "lm_loss": 2.2989, + "step": 1528, + "vm_loss": 0.1568 + }, + { + "epoch": 0.29415020333517816, + "lm_loss": 2.1769, + "step": 1528, + "vm_loss": 0.1225 + }, + { + "epoch": 0.29415020333517816, + "lm_loss": 2.2638, + "step": 1528, + "vm_loss": 0.1441 + }, + { + "epoch": 0.29415020333517816, + "lm_loss": 2.5292, + "step": 1528, + "vm_loss": 0.1863 + }, + { + "epoch": 0.29415020333517816, + "lm_loss": 2.2534, + "step": 1528, + "vm_loss": 0.1369 + }, + { + "epoch": 0.29415020333517816, + "lm_loss": 2.1655, + "step": 1528, + "vm_loss": 0.1818 + }, + { + "epoch": 0.29415020333517816, + "lm_loss": 2.466, + "step": 1528, + "vm_loss": 0.162 + }, + { + "epoch": 0.2943427100127536, + "grad_norm": 3.3696144018030414, + "learning_rate": 1.9288692344092686e-05, + "loss": 2.4349, + "step": 1529 + }, + { + "epoch": 0.29453521669032895, + "grad_norm": 3.1253590698791527, + "learning_rate": 1.928753699935706e-05, + "loss": 2.4333, + "step": 1530 + }, + { + "epoch": 0.2947277233679043, + "grad_norm": 3.0896832279657245, + "learning_rate": 1.9286380751754013e-05, + "loss": 2.4346, + "step": 1531 + }, + { + "epoch": 0.2949202300454797, + "grad_norm": 3.374611174377119, + "learning_rate": 1.9285223601395953e-05, + "loss": 2.4372, + "step": 1532 + }, + { + "epoch": 0.2951127367230551, + "grad_norm": 3.1184706610359454, + "learning_rate": 1.9284065548395362e-05, + "loss": 2.4198, + "step": 1533 + }, + { + "epoch": 0.29530524340063047, + "grad_norm": 2.650637065578818, + "learning_rate": 1.9282906592864826e-05, + "loss": 2.4497, + "step": 1534 + }, + { + "epoch": 0.29549775007820583, + "grad_norm": 3.56945496321312, + "learning_rate": 1.9281746734917005e-05, + "loss": 2.4252, + "step": 1535 + }, + { + "epoch": 0.2956902567557812, + "grad_norm": 3.5283418770831028, + "learning_rate": 1.9280585974664653e-05, + "loss": 2.3906, + "step": 1536 + }, + { + "epoch": 0.2956902567557812, + "lm_loss": 2.2659, + "step": 1536, + "vm_loss": 0.2062 + }, + { + "epoch": 0.2956902567557812, + "lm_loss": 2.3326, + "step": 1536, + "vm_loss": 0.1949 + }, + { + "epoch": 0.2956902567557812, + "lm_loss": 2.1007, + "step": 1536, + "vm_loss": 0.1362 + }, + { + "epoch": 0.2956902567557812, + "lm_loss": 2.1906, + "step": 1536, + "vm_loss": 0.1401 + }, + { + "epoch": 0.2956902567557812, + "lm_loss": 2.3567, + "step": 1536, + "vm_loss": 0.1199 + }, + { + "epoch": 0.2956902567557812, + "lm_loss": 2.241, + "step": 1536, + "vm_loss": 0.1925 + }, + { + "epoch": 0.2956902567557812, + "lm_loss": 2.5681, + "step": 1536, + "vm_loss": 0.1692 + }, + { + "epoch": 0.2956902567557812, + "lm_loss": 2.2083, + "step": 1536, + "vm_loss": 0.1694 + }, + { + "epoch": 0.2958827634333566, + "grad_norm": 3.4897159460884124, + "learning_rate": 1.927942431222061e-05, + "loss": 2.4126, + "step": 1537 + }, + { + "epoch": 0.296075270110932, + "grad_norm": 3.3246150847859552, + "learning_rate": 1.9278261747697804e-05, + "loss": 2.4071, + "step": 1538 + }, + { + "epoch": 0.29626777678850735, + "grad_norm": 3.3105966071770156, + "learning_rate": 1.9277098281209253e-05, + "loss": 2.4308, + "step": 1539 + }, + { + "epoch": 0.2964602834660827, + "grad_norm": 2.9720637859340067, + "learning_rate": 1.9275933912868065e-05, + "loss": 2.465, + "step": 1540 + }, + { + "epoch": 0.2966527901436581, + "grad_norm": 3.774152386581912, + "learning_rate": 1.927476864278742e-05, + "loss": 2.4356, + "step": 1541 + }, + { + "epoch": 0.2968452968212335, + "grad_norm": 3.2326649121042825, + "learning_rate": 1.927360247108061e-05, + "loss": 2.4369, + "step": 1542 + }, + { + "epoch": 0.29703780349880887, + "grad_norm": 3.482446958667124, + "learning_rate": 1.9272435397860992e-05, + "loss": 2.3995, + "step": 1543 + }, + { + "epoch": 0.29723031017638424, + "grad_norm": 3.528865631555095, + "learning_rate": 1.9271267423242028e-05, + "loss": 2.4558, + "step": 1544 + }, + { + "epoch": 0.29723031017638424, + "lm_loss": 2.3403, + "step": 1544, + "vm_loss": 0.167 + }, + { + "epoch": 0.29723031017638424, + "lm_loss": 1.8689, + "step": 1544, + "vm_loss": 0.1771 + }, + { + "epoch": 0.29723031017638424, + "lm_loss": 2.3338, + "step": 1544, + "vm_loss": 0.2052 + }, + { + "epoch": 0.29723031017638424, + "lm_loss": 2.3295, + "step": 1544, + "vm_loss": 0.1631 + }, + { + "epoch": 0.29723031017638424, + "lm_loss": 2.2525, + "step": 1544, + "vm_loss": 0.1404 + }, + { + "epoch": 0.29723031017638424, + "lm_loss": 2.4347, + "step": 1544, + "vm_loss": 0.1555 + }, + { + "epoch": 0.29723031017638424, + "lm_loss": 2.3204, + "step": 1544, + "vm_loss": 0.1527 + }, + { + "epoch": 0.29723031017638424, + "lm_loss": 2.1844, + "step": 1544, + "vm_loss": 0.2673 + }, + { + "epoch": 0.2974228168539596, + "grad_norm": 3.2098407381664082, + "learning_rate": 1.927009854733725e-05, + "loss": 2.4554, + "step": 1545 + }, + { + "epoch": 0.297615323531535, + "grad_norm": 3.4178964317192593, + "learning_rate": 1.9268928770260298e-05, + "loss": 2.4251, + "step": 1546 + }, + { + "epoch": 0.2978078302091104, + "grad_norm": 2.9512264174928124, + "learning_rate": 1.9267758092124883e-05, + "loss": 2.4219, + "step": 1547 + }, + { + "epoch": 0.29800033688668576, + "grad_norm": 3.112352823869454, + "learning_rate": 1.9266586513044818e-05, + "loss": 2.4294, + "step": 1548 + }, + { + "epoch": 0.2981928435642611, + "grad_norm": 3.579117818917497, + "learning_rate": 1.9265414033133986e-05, + "loss": 2.4237, + "step": 1549 + }, + { + "epoch": 0.2983853502418365, + "grad_norm": 3.3559863716959075, + "learning_rate": 1.926424065250637e-05, + "loss": 2.4521, + "step": 1550 + }, + { + "epoch": 0.2985778569194119, + "grad_norm": 2.72048828887211, + "learning_rate": 1.9263066371276037e-05, + "loss": 2.4223, + "step": 1551 + }, + { + "epoch": 0.2987703635969873, + "grad_norm": 3.2101721761303756, + "learning_rate": 1.9261891189557146e-05, + "loss": 2.4387, + "step": 1552 + }, + { + "epoch": 0.2987703635969873, + "lm_loss": 2.2197, + "step": 1552, + "vm_loss": 0.204 + }, + { + "epoch": 0.2987703635969873, + "lm_loss": 2.4019, + "step": 1552, + "vm_loss": 0.1566 + }, + { + "epoch": 0.2987703635969873, + "lm_loss": 2.2678, + "step": 1552, + "vm_loss": 0.1481 + }, + { + "epoch": 0.2987703635969873, + "lm_loss": 2.3116, + "step": 1552, + "vm_loss": 0.1717 + }, + { + "epoch": 0.2987703635969873, + "lm_loss": 2.105, + "step": 1552, + "vm_loss": 0.2016 + }, + { + "epoch": 0.2987703635969873, + "lm_loss": 2.0397, + "step": 1552, + "vm_loss": 0.1765 + }, + { + "epoch": 0.2987703635969873, + "lm_loss": 2.3848, + "step": 1552, + "vm_loss": 0.1818 + }, + { + "epoch": 0.2987703635969873, + "lm_loss": 2.1193, + "step": 1552, + "vm_loss": 0.1824 + }, + { + "epoch": 0.29896287027456264, + "grad_norm": 3.314986146789833, + "learning_rate": 1.9260715107463935e-05, + "loss": 2.4265, + "step": 1553 + }, + { + "epoch": 0.299155376952138, + "grad_norm": 2.782186426301867, + "learning_rate": 1.925953812511074e-05, + "loss": 2.3966, + "step": 1554 + }, + { + "epoch": 0.29934788362971343, + "grad_norm": 2.9291941363969816, + "learning_rate": 1.9258360242611973e-05, + "loss": 2.4862, + "step": 1555 + }, + { + "epoch": 0.2995403903072888, + "grad_norm": 2.9244554928875646, + "learning_rate": 1.925718146008214e-05, + "loss": 2.388, + "step": 1556 + }, + { + "epoch": 0.29973289698486416, + "grad_norm": 2.8639366877979575, + "learning_rate": 1.925600177763584e-05, + "loss": 2.4432, + "step": 1557 + }, + { + "epoch": 0.2999254036624395, + "grad_norm": 3.5674673086339546, + "learning_rate": 1.9254821195387744e-05, + "loss": 2.3614, + "step": 1558 + }, + { + "epoch": 0.3001179103400149, + "grad_norm": 3.061340156963764, + "learning_rate": 1.9253639713452624e-05, + "loss": 2.4346, + "step": 1559 + }, + { + "epoch": 0.3003104170175903, + "grad_norm": 3.3531783522059557, + "learning_rate": 1.9252457331945336e-05, + "loss": 2.4009, + "step": 1560 + }, + { + "epoch": 0.3003104170175903, + "lm_loss": 2.0283, + "step": 1560, + "vm_loss": 0.181 + }, + { + "epoch": 0.3003104170175903, + "lm_loss": 2.334, + "step": 1560, + "vm_loss": 0.1967 + }, + { + "epoch": 0.3003104170175903, + "lm_loss": 2.1629, + "step": 1560, + "vm_loss": 0.1965 + }, + { + "epoch": 0.3003104170175903, + "lm_loss": 2.0658, + "step": 1560, + "vm_loss": 0.1468 + }, + { + "epoch": 0.3003104170175903, + "lm_loss": 2.3044, + "step": 1560, + "vm_loss": 0.2496 + }, + { + "epoch": 0.3003104170175903, + "lm_loss": 2.354, + "step": 1560, + "vm_loss": 0.1461 + }, + { + "epoch": 0.3003104170175903, + "lm_loss": 2.1524, + "step": 1560, + "vm_loss": 0.2335 + }, + { + "epoch": 0.3003104170175903, + "lm_loss": 2.1873, + "step": 1560, + "vm_loss": 0.1562 + }, + { + "epoch": 0.3005029236951657, + "grad_norm": 3.167637171389351, + "learning_rate": 1.9251274050980825e-05, + "loss": 2.412, + "step": 1561 + }, + { + "epoch": 0.30069543037274105, + "grad_norm": 3.0716152116676283, + "learning_rate": 1.9250089870674115e-05, + "loss": 2.397, + "step": 1562 + }, + { + "epoch": 0.3008879370503164, + "grad_norm": 3.015825236503033, + "learning_rate": 1.9248904791140325e-05, + "loss": 2.4303, + "step": 1563 + }, + { + "epoch": 0.30108044372789183, + "grad_norm": 3.393406559313713, + "learning_rate": 1.9247718812494662e-05, + "loss": 2.4367, + "step": 1564 + }, + { + "epoch": 0.3012729504054672, + "grad_norm": 3.283385971741707, + "learning_rate": 1.924653193485242e-05, + "loss": 2.3824, + "step": 1565 + }, + { + "epoch": 0.30146545708304257, + "grad_norm": 2.945418391453257, + "learning_rate": 1.9245344158328975e-05, + "loss": 2.4084, + "step": 1566 + }, + { + "epoch": 0.30165796376061793, + "grad_norm": 2.659057019934935, + "learning_rate": 1.9244155483039794e-05, + "loss": 2.4126, + "step": 1567 + }, + { + "epoch": 0.30185047043819335, + "grad_norm": 2.99504473620692, + "learning_rate": 1.924296590910043e-05, + "loss": 2.4431, + "step": 1568 + }, + { + "epoch": 0.30185047043819335, + "lm_loss": 1.8937, + "step": 1568, + "vm_loss": 0.1264 + }, + { + "epoch": 0.30185047043819335, + "lm_loss": 2.0376, + "step": 1568, + "vm_loss": 0.1733 + }, + { + "epoch": 0.30185047043819335, + "lm_loss": 2.1112, + "step": 1568, + "vm_loss": 0.1828 + }, + { + "epoch": 0.30185047043819335, + "lm_loss": 2.2003, + "step": 1568, + "vm_loss": 0.1013 + }, + { + "epoch": 0.30185047043819335, + "lm_loss": 2.2269, + "step": 1568, + "vm_loss": 0.1503 + }, + { + "epoch": 0.30185047043819335, + "lm_loss": 2.4779, + "step": 1568, + "vm_loss": 0.1841 + }, + { + "epoch": 0.30185047043819335, + "lm_loss": 2.1138, + "step": 1568, + "vm_loss": 0.1814 + }, + { + "epoch": 0.30185047043819335, + "lm_loss": 2.1316, + "step": 1568, + "vm_loss": 0.1095 + }, + { + "epoch": 0.3020429771157687, + "grad_norm": 2.584119312003221, + "learning_rate": 1.9241775436626535e-05, + "loss": 2.4011, + "step": 1569 + }, + { + "epoch": 0.3022354837933441, + "grad_norm": 2.7809270869327407, + "learning_rate": 1.924058406573383e-05, + "loss": 2.4035, + "step": 1570 + }, + { + "epoch": 0.30242799047091945, + "grad_norm": 3.1652180231530807, + "learning_rate": 1.9239391796538125e-05, + "loss": 2.4615, + "step": 1571 + }, + { + "epoch": 0.3026204971484948, + "grad_norm": 3.122009999204937, + "learning_rate": 1.9238198629155337e-05, + "loss": 2.4567, + "step": 1572 + }, + { + "epoch": 0.30281300382607024, + "grad_norm": 2.919014482300737, + "learning_rate": 1.9237004563701446e-05, + "loss": 2.4131, + "step": 1573 + }, + { + "epoch": 0.3030055105036456, + "grad_norm": 3.4029298421946277, + "learning_rate": 1.9235809600292542e-05, + "loss": 2.4223, + "step": 1574 + }, + { + "epoch": 0.30319801718122097, + "grad_norm": 3.1332074317736507, + "learning_rate": 1.923461373904478e-05, + "loss": 2.4395, + "step": 1575 + }, + { + "epoch": 0.30339052385879633, + "grad_norm": 2.648814162106668, + "learning_rate": 1.923341698007442e-05, + "loss": 2.4365, + "step": 1576 + }, + { + "epoch": 0.30339052385879633, + "lm_loss": 2.3981, + "step": 1576, + "vm_loss": 0.1608 + }, + { + "epoch": 0.30339052385879633, + "lm_loss": 2.1836, + "step": 1576, + "vm_loss": 0.1638 + }, + { + "epoch": 0.30339052385879633, + "lm_loss": 2.0677, + "step": 1576, + "vm_loss": 0.1046 + }, + { + "epoch": 0.30339052385879633, + "lm_loss": 1.6026, + "step": 1576, + "vm_loss": 0.1611 + }, + { + "epoch": 0.30339052385879633, + "lm_loss": 1.8092, + "step": 1576, + "vm_loss": 0.1585 + }, + { + "epoch": 0.30339052385879633, + "lm_loss": 2.376, + "step": 1576, + "vm_loss": 0.1833 + }, + { + "epoch": 0.30339052385879633, + "lm_loss": 2.3269, + "step": 1576, + "vm_loss": 0.1692 + }, + { + "epoch": 0.30339052385879633, + "lm_loss": 1.8972, + "step": 1576, + "vm_loss": 0.2059 + }, + { + "epoch": 0.30358303053637176, + "grad_norm": 3.3762109369248328, + "learning_rate": 1.9232219323497797e-05, + "loss": 2.3929, + "step": 1577 + }, + { + "epoch": 0.3037755372139471, + "grad_norm": 3.4984837775593918, + "learning_rate": 1.9231020769431344e-05, + "loss": 2.4075, + "step": 1578 + }, + { + "epoch": 0.3039680438915225, + "grad_norm": 2.8195248783062508, + "learning_rate": 1.9229821317991572e-05, + "loss": 2.4866, + "step": 1579 + }, + { + "epoch": 0.30416055056909785, + "grad_norm": 3.624294792634172, + "learning_rate": 1.922862096929508e-05, + "loss": 2.4038, + "step": 1580 + }, + { + "epoch": 0.3043530572466732, + "grad_norm": 3.0880074995300597, + "learning_rate": 1.9227419723458565e-05, + "loss": 2.4406, + "step": 1581 + }, + { + "epoch": 0.30454556392424864, + "grad_norm": 2.9737859978674805, + "learning_rate": 1.9226217580598804e-05, + "loss": 2.4169, + "step": 1582 + }, + { + "epoch": 0.304738070601824, + "grad_norm": 3.093610857310697, + "learning_rate": 1.922501454083265e-05, + "loss": 2.4537, + "step": 1583 + }, + { + "epoch": 0.3049305772793994, + "grad_norm": 3.520144681125478, + "learning_rate": 1.9223810604277063e-05, + "loss": 2.4117, + "step": 1584 + }, + { + "epoch": 0.3049305772793994, + "lm_loss": 2.4783, + "step": 1584, + "vm_loss": 0.1862 + }, + { + "epoch": 0.3049305772793994, + "lm_loss": 2.4083, + "step": 1584, + "vm_loss": 0.2393 + }, + { + "epoch": 0.3049305772793994, + "lm_loss": 2.5333, + "step": 1584, + "vm_loss": 0.1538 + }, + { + "epoch": 0.3049305772793994, + "lm_loss": 2.2042, + "step": 1584, + "vm_loss": 0.1782 + }, + { + "epoch": 0.3049305772793994, + "lm_loss": 2.4878, + "step": 1584, + "vm_loss": 0.1477 + }, + { + "epoch": 0.3049305772793994, + "lm_loss": 2.4179, + "step": 1584, + "vm_loss": 0.1987 + }, + { + "epoch": 0.3049305772793994, + "lm_loss": 2.0712, + "step": 1584, + "vm_loss": 0.1585 + }, + { + "epoch": 0.3049305772793994, + "lm_loss": 2.2991, + "step": 1584, + "vm_loss": 0.2117 + }, + { + "epoch": 0.30512308395697474, + "grad_norm": 3.00242119123407, + "learning_rate": 1.9222605771049076e-05, + "loss": 2.4927, + "step": 1585 + }, + { + "epoch": 0.30531559063455016, + "grad_norm": 3.147923982339829, + "learning_rate": 1.9221400041265818e-05, + "loss": 2.3832, + "step": 1586 + }, + { + "epoch": 0.3055080973121255, + "grad_norm": 3.000091035285722, + "learning_rate": 1.9220193415044498e-05, + "loss": 2.4266, + "step": 1587 + }, + { + "epoch": 0.3057006039897009, + "grad_norm": 3.1592738723604588, + "learning_rate": 1.921898589250242e-05, + "loss": 2.4278, + "step": 1588 + }, + { + "epoch": 0.30589311066727626, + "grad_norm": 3.075560665763926, + "learning_rate": 1.9217777473756967e-05, + "loss": 2.394, + "step": 1589 + }, + { + "epoch": 0.3060856173448516, + "grad_norm": 2.7858440510575746, + "learning_rate": 1.921656815892561e-05, + "loss": 2.4336, + "step": 1590 + }, + { + "epoch": 0.30627812402242705, + "grad_norm": 3.535930254340447, + "learning_rate": 1.9215357948125918e-05, + "loss": 2.4183, + "step": 1591 + }, + { + "epoch": 0.3064706307000024, + "grad_norm": 3.1682366203268915, + "learning_rate": 1.9214146841475536e-05, + "loss": 2.4321, + "step": 1592 + }, + { + "epoch": 0.3064706307000024, + "lm_loss": 2.1046, + "step": 1592, + "vm_loss": 0.1713 + }, + { + "epoch": 0.3064706307000024, + "lm_loss": 2.2586, + "step": 1592, + "vm_loss": 0.1447 + }, + { + "epoch": 0.3064706307000024, + "lm_loss": 2.1536, + "step": 1592, + "vm_loss": 0.1758 + }, + { + "epoch": 0.3064706307000024, + "lm_loss": 2.2727, + "step": 1592, + "vm_loss": 0.1238 + }, + { + "epoch": 0.3064706307000024, + "lm_loss": 2.2912, + "step": 1592, + "vm_loss": 0.1719 + }, + { + "epoch": 0.3064706307000024, + "lm_loss": 2.1954, + "step": 1592, + "vm_loss": 0.1743 + }, + { + "epoch": 0.3064706307000024, + "lm_loss": 2.0684, + "step": 1592, + "vm_loss": 0.1333 + }, + { + "epoch": 0.3064706307000024, + "lm_loss": 2.0347, + "step": 1592, + "vm_loss": 0.1925 + }, + { + "epoch": 0.3066631373775778, + "grad_norm": 3.252129372031478, + "learning_rate": 1.9212934839092196e-05, + "loss": 2.4038, + "step": 1593 + }, + { + "epoch": 0.30685564405515314, + "grad_norm": 3.111933590591457, + "learning_rate": 1.9211721941093722e-05, + "loss": 2.4108, + "step": 1594 + }, + { + "epoch": 0.30704815073272856, + "grad_norm": 2.8854291095017355, + "learning_rate": 1.9210508147598022e-05, + "loss": 2.4226, + "step": 1595 + }, + { + "epoch": 0.30724065741030393, + "grad_norm": 3.5117309962573633, + "learning_rate": 1.9209293458723093e-05, + "loss": 2.4552, + "step": 1596 + }, + { + "epoch": 0.3074331640878793, + "grad_norm": 3.6809097376630766, + "learning_rate": 1.9208077874587017e-05, + "loss": 2.4058, + "step": 1597 + }, + { + "epoch": 0.30762567076545466, + "grad_norm": 3.324244241560003, + "learning_rate": 1.9206861395307973e-05, + "loss": 2.4333, + "step": 1598 + }, + { + "epoch": 0.30781817744303, + "grad_norm": 3.6698747241911995, + "learning_rate": 1.9205644021004203e-05, + "loss": 2.4491, + "step": 1599 + }, + { + "epoch": 0.30801068412060545, + "grad_norm": 3.0467367154505864, + "learning_rate": 1.9204425751794064e-05, + "loss": 2.367, + "step": 1600 + }, + { + "epoch": 0.30801068412060545, + "lm_loss": 2.3085, + "step": 1600, + "vm_loss": 0.216 + }, + { + "epoch": 0.30801068412060545, + "lm_loss": 2.2527, + "step": 1600, + "vm_loss": 0.1317 + }, + { + "epoch": 0.30801068412060545, + "lm_loss": 1.8658, + "step": 1600, + "vm_loss": 0.1895 + }, + { + "epoch": 0.30801068412060545, + "lm_loss": 2.4182, + "step": 1600, + "vm_loss": 0.1172 + }, + { + "epoch": 0.30801068412060545, + "lm_loss": 2.3723, + "step": 1600, + "vm_loss": 0.1541 + }, + { + "epoch": 0.30801068412060545, + "lm_loss": 2.579, + "step": 1600, + "vm_loss": 0.1639 + }, + { + "epoch": 0.30801068412060545, + "lm_loss": 2.4558, + "step": 1600, + "vm_loss": 0.1961 + }, + { + "epoch": 0.30801068412060545, + "lm_loss": 2.2496, + "step": 1600, + "vm_loss": 0.2593 + }, + { + "epoch": 0.3082031907981808, + "grad_norm": 3.9531101788806238, + "learning_rate": 1.9203206587795982e-05, + "loss": 2.3899, + "step": 1601 + }, + { + "epoch": 0.3083956974757562, + "grad_norm": 3.637420396432896, + "learning_rate": 1.9201986529128477e-05, + "loss": 2.3767, + "step": 1602 + }, + { + "epoch": 0.30858820415333155, + "grad_norm": 3.0524578861739937, + "learning_rate": 1.920076557591015e-05, + "loss": 2.3756, + "step": 1603 + }, + { + "epoch": 0.30878071083090697, + "grad_norm": 3.6819749716823393, + "learning_rate": 1.91995437282597e-05, + "loss": 2.4645, + "step": 1604 + }, + { + "epoch": 0.30897321750848233, + "grad_norm": 3.780474648887843, + "learning_rate": 1.9198320986295904e-05, + "loss": 2.4132, + "step": 1605 + }, + { + "epoch": 0.3091657241860577, + "grad_norm": 3.8836029892814876, + "learning_rate": 1.9197097350137625e-05, + "loss": 2.4527, + "step": 1606 + }, + { + "epoch": 0.30935823086363307, + "grad_norm": 3.1565123554083008, + "learning_rate": 1.9195872819903817e-05, + "loss": 2.4009, + "step": 1607 + }, + { + "epoch": 0.3095507375412085, + "grad_norm": 3.1515098808985527, + "learning_rate": 1.9194647395713525e-05, + "loss": 2.405, + "step": 1608 + }, + { + "epoch": 0.3095507375412085, + "lm_loss": 2.5084, + "step": 1608, + "vm_loss": 0.1675 + }, + { + "epoch": 0.3095507375412085, + "lm_loss": 2.2548, + "step": 1608, + "vm_loss": 0.2256 + }, + { + "epoch": 0.3095507375412085, + "lm_loss": 1.6081, + "step": 1608, + "vm_loss": 0.1329 + }, + { + "epoch": 0.3095507375412085, + "lm_loss": 2.4594, + "step": 1608, + "vm_loss": 0.2025 + }, + { + "epoch": 0.3095507375412085, + "lm_loss": 2.3055, + "step": 1608, + "vm_loss": 0.1898 + }, + { + "epoch": 0.3095507375412085, + "lm_loss": 2.0867, + "step": 1608, + "vm_loss": 0.1972 + }, + { + "epoch": 0.3095507375412085, + "lm_loss": 2.2439, + "step": 1608, + "vm_loss": 0.1845 + }, + { + "epoch": 0.3095507375412085, + "lm_loss": 2.2648, + "step": 1608, + "vm_loss": 0.1267 + }, + { + "epoch": 0.30974324421878385, + "grad_norm": 3.757860921384033, + "learning_rate": 1.919342107768587e-05, + "loss": 2.4035, + "step": 1609 + }, + { + "epoch": 0.3099357508963592, + "grad_norm": 3.741681337777263, + "learning_rate": 1.919219386594007e-05, + "loss": 2.4419, + "step": 1610 + }, + { + "epoch": 0.3101282575739346, + "grad_norm": 2.8799619899211075, + "learning_rate": 1.919096576059542e-05, + "loss": 2.408, + "step": 1611 + }, + { + "epoch": 0.31032076425150995, + "grad_norm": 3.5398337734615213, + "learning_rate": 1.9189736761771318e-05, + "loss": 2.4725, + "step": 1612 + }, + { + "epoch": 0.31051327092908537, + "grad_norm": 3.7278409587860595, + "learning_rate": 1.9188506869587223e-05, + "loss": 2.4305, + "step": 1613 + }, + { + "epoch": 0.31070577760666074, + "grad_norm": 2.931471786718931, + "learning_rate": 1.9187276084162714e-05, + "loss": 2.4303, + "step": 1614 + }, + { + "epoch": 0.3108982842842361, + "grad_norm": 3.3217212956114266, + "learning_rate": 1.9186044405617423e-05, + "loss": 2.3905, + "step": 1615 + }, + { + "epoch": 0.31109079096181147, + "grad_norm": 3.3395900084231633, + "learning_rate": 1.9184811834071097e-05, + "loss": 2.4765, + "step": 1616 + }, + { + "epoch": 0.31109079096181147, + "lm_loss": 2.3183, + "step": 1616, + "vm_loss": 0.139 + }, + { + "epoch": 0.31109079096181147, + "lm_loss": 2.3912, + "step": 1616, + "vm_loss": 0.2341 + }, + { + "epoch": 0.31109079096181147, + "lm_loss": 1.9993, + "step": 1616, + "vm_loss": 0.1552 + }, + { + "epoch": 0.31109079096181147, + "lm_loss": 2.2254, + "step": 1616, + "vm_loss": 0.1336 + }, + { + "epoch": 0.31109079096181147, + "lm_loss": 2.1837, + "step": 1616, + "vm_loss": 0.1904 + }, + { + "epoch": 0.31109079096181147, + "lm_loss": 2.1318, + "step": 1616, + "vm_loss": 0.1966 + }, + { + "epoch": 0.31109079096181147, + "lm_loss": 2.0301, + "step": 1616, + "vm_loss": 0.2629 + }, + { + "epoch": 0.31109079096181147, + "lm_loss": 2.2213, + "step": 1616, + "vm_loss": 0.2409 + }, + { + "epoch": 0.3112832976393869, + "grad_norm": 2.857989138150822, + "learning_rate": 1.918357836964355e-05, + "loss": 2.3996, + "step": 1617 + }, + { + "epoch": 0.31147580431696226, + "grad_norm": 3.414176573864841, + "learning_rate": 1.918234401245469e-05, + "loss": 2.4288, + "step": 1618 + }, + { + "epoch": 0.3116683109945376, + "grad_norm": 3.470435252438815, + "learning_rate": 1.9181108762624522e-05, + "loss": 2.4368, + "step": 1619 + }, + { + "epoch": 0.311860817672113, + "grad_norm": 3.3013239279231708, + "learning_rate": 1.9179872620273116e-05, + "loss": 2.4539, + "step": 1620 + }, + { + "epoch": 0.31205332434968835, + "grad_norm": 2.8064524571948204, + "learning_rate": 1.9178635585520648e-05, + "loss": 2.404, + "step": 1621 + }, + { + "epoch": 0.3122458310272638, + "grad_norm": 3.6435992475179266, + "learning_rate": 1.9177397658487376e-05, + "loss": 2.3827, + "step": 1622 + }, + { + "epoch": 0.31243833770483914, + "grad_norm": 2.9119566889862902, + "learning_rate": 1.9176158839293634e-05, + "loss": 2.3334, + "step": 1623 + }, + { + "epoch": 0.3126308443824145, + "grad_norm": 3.013198522022594, + "learning_rate": 1.9174919128059856e-05, + "loss": 2.3938, + "step": 1624 + }, + { + "epoch": 0.3126308443824145, + "lm_loss": 1.9192, + "step": 1624, + "vm_loss": 0.0947 + }, + { + "epoch": 0.3126308443824145, + "lm_loss": 2.3788, + "step": 1624, + "vm_loss": 0.1457 + }, + { + "epoch": 0.3126308443824145, + "lm_loss": 2.0472, + "step": 1624, + "vm_loss": 0.2036 + }, + { + "epoch": 0.3126308443824145, + "lm_loss": 2.2758, + "step": 1624, + "vm_loss": 0.1324 + }, + { + "epoch": 0.3126308443824145, + "lm_loss": 2.1313, + "step": 1624, + "vm_loss": 0.2324 + }, + { + "epoch": 0.3126308443824145, + "lm_loss": 2.1901, + "step": 1624, + "vm_loss": 0.2184 + }, + { + "epoch": 0.3126308443824145, + "lm_loss": 2.0816, + "step": 1624, + "vm_loss": 0.1776 + }, + { + "epoch": 0.3126308443824145, + "lm_loss": 2.2704, + "step": 1624, + "vm_loss": 0.1872 + }, + { + "epoch": 0.3128233510599899, + "grad_norm": 3.2380900769520746, + "learning_rate": 1.9173678524906557e-05, + "loss": 2.3892, + "step": 1625 + }, + { + "epoch": 0.3130158577375653, + "grad_norm": 2.982499010435214, + "learning_rate": 1.9172437029954342e-05, + "loss": 2.4341, + "step": 1626 + }, + { + "epoch": 0.31320836441514066, + "grad_norm": 2.7673419695399404, + "learning_rate": 1.9171194643323896e-05, + "loss": 2.4548, + "step": 1627 + }, + { + "epoch": 0.313400871092716, + "grad_norm": 3.0025337500252505, + "learning_rate": 1.9169951365136e-05, + "loss": 2.4074, + "step": 1628 + }, + { + "epoch": 0.3135933777702914, + "grad_norm": 3.0798793716423205, + "learning_rate": 1.9168707195511512e-05, + "loss": 2.4034, + "step": 1629 + }, + { + "epoch": 0.31378588444786676, + "grad_norm": 2.995941162377776, + "learning_rate": 1.916746213457138e-05, + "loss": 2.4033, + "step": 1630 + }, + { + "epoch": 0.3139783911254422, + "grad_norm": 2.9132283498345637, + "learning_rate": 1.9166216182436643e-05, + "loss": 2.4115, + "step": 1631 + }, + { + "epoch": 0.31417089780301755, + "grad_norm": 2.758941529694054, + "learning_rate": 1.9164969339228424e-05, + "loss": 2.4151, + "step": 1632 + }, + { + "epoch": 0.31417089780301755, + "lm_loss": 2.4015, + "step": 1632, + "vm_loss": 0.1654 + }, + { + "epoch": 0.31417089780301755, + "lm_loss": 2.3777, + "step": 1632, + "vm_loss": 0.1864 + }, + { + "epoch": 0.31417089780301755, + "lm_loss": 2.0602, + "step": 1632, + "vm_loss": 0.1996 + }, + { + "epoch": 0.31417089780301755, + "lm_loss": 2.354, + "step": 1632, + "vm_loss": 0.1625 + }, + { + "epoch": 0.31417089780301755, + "lm_loss": 2.1065, + "step": 1632, + "vm_loss": 0.1661 + }, + { + "epoch": 0.31417089780301755, + "lm_loss": 2.1425, + "step": 1632, + "vm_loss": 0.1681 + }, + { + "epoch": 0.31417089780301755, + "lm_loss": 2.5505, + "step": 1632, + "vm_loss": 0.135 + }, + { + "epoch": 0.31417089780301755, + "lm_loss": 2.1149, + "step": 1632, + "vm_loss": 0.1758 + }, + { + "epoch": 0.3143634044805929, + "grad_norm": 2.820036208327963, + "learning_rate": 1.916372160506793e-05, + "loss": 2.4471, + "step": 1633 + }, + { + "epoch": 0.3145559111581683, + "grad_norm": 2.785639301197258, + "learning_rate": 1.916247298007646e-05, + "loss": 2.4152, + "step": 1634 + }, + { + "epoch": 0.3147484178357437, + "grad_norm": 3.034776279384953, + "learning_rate": 1.9161223464375393e-05, + "loss": 2.4076, + "step": 1635 + }, + { + "epoch": 0.31494092451331906, + "grad_norm": 2.843864870634454, + "learning_rate": 1.9159973058086198e-05, + "loss": 2.398, + "step": 1636 + }, + { + "epoch": 0.31513343119089443, + "grad_norm": 3.0798090836111034, + "learning_rate": 1.9158721761330432e-05, + "loss": 2.3938, + "step": 1637 + }, + { + "epoch": 0.3153259378684698, + "grad_norm": 2.9122660704742147, + "learning_rate": 1.9157469574229736e-05, + "loss": 2.4125, + "step": 1638 + }, + { + "epoch": 0.3155184445460452, + "grad_norm": 3.228181786720458, + "learning_rate": 1.915621649690584e-05, + "loss": 2.3899, + "step": 1639 + }, + { + "epoch": 0.3157109512236206, + "grad_norm": 3.1463712301543514, + "learning_rate": 1.915496252948056e-05, + "loss": 2.4277, + "step": 1640 + }, + { + "epoch": 0.3157109512236206, + "lm_loss": 2.0584, + "step": 1640, + "vm_loss": 0.1587 + }, + { + "epoch": 0.3157109512236206, + "lm_loss": 2.3983, + "step": 1640, + "vm_loss": 0.214 + }, + { + "epoch": 0.3157109512236206, + "lm_loss": 2.1277, + "step": 1640, + "vm_loss": 0.1545 + }, + { + "epoch": 0.3157109512236206, + "lm_loss": 2.6039, + "step": 1640, + "vm_loss": 0.1639 + }, + { + "epoch": 0.3157109512236206, + "lm_loss": 2.4743, + "step": 1640, + "vm_loss": 0.2132 + }, + { + "epoch": 0.3157109512236206, + "lm_loss": 2.2705, + "step": 1640, + "vm_loss": 0.2481 + }, + { + "epoch": 0.3157109512236206, + "lm_loss": 2.0972, + "step": 1640, + "vm_loss": 0.2156 + }, + { + "epoch": 0.3157109512236206, + "lm_loss": 2.4121, + "step": 1640, + "vm_loss": 0.2644 + }, + { + "epoch": 0.31590345790119595, + "grad_norm": 2.874555616646559, + "learning_rate": 1.9153707672075793e-05, + "loss": 2.4827, + "step": 1641 + }, + { + "epoch": 0.3160959645787713, + "grad_norm": 3.545184904839079, + "learning_rate": 1.9152451924813532e-05, + "loss": 2.3885, + "step": 1642 + }, + { + "epoch": 0.3162884712563467, + "grad_norm": 3.224058729403147, + "learning_rate": 1.915119528781585e-05, + "loss": 2.4576, + "step": 1643 + }, + { + "epoch": 0.3164809779339221, + "grad_norm": 3.136075283475528, + "learning_rate": 1.9149937761204907e-05, + "loss": 2.4034, + "step": 1644 + }, + { + "epoch": 0.31667348461149747, + "grad_norm": 2.9142663182231536, + "learning_rate": 1.9148679345102952e-05, + "loss": 2.4013, + "step": 1645 + }, + { + "epoch": 0.31686599128907283, + "grad_norm": 3.1876018007775753, + "learning_rate": 1.914742003963232e-05, + "loss": 2.4584, + "step": 1646 + }, + { + "epoch": 0.3170584979666482, + "grad_norm": 3.419294524485164, + "learning_rate": 1.9146159844915427e-05, + "loss": 2.4318, + "step": 1647 + }, + { + "epoch": 0.3172510046442236, + "grad_norm": 2.9730271290774373, + "learning_rate": 1.9144898761074785e-05, + "loss": 2.3417, + "step": 1648 + }, + { + "epoch": 0.3172510046442236, + "lm_loss": 1.842, + "step": 1648, + "vm_loss": 0.2541 + }, + { + "epoch": 0.3172510046442236, + "lm_loss": 2.4948, + "step": 1648, + "vm_loss": 0.1793 + }, + { + "epoch": 0.3172510046442236, + "lm_loss": 2.05, + "step": 1648, + "vm_loss": 0.2094 + }, + { + "epoch": 0.3172510046442236, + "lm_loss": 2.2142, + "step": 1648, + "vm_loss": 0.149 + }, + { + "epoch": 0.3172510046442236, + "lm_loss": 2.013, + "step": 1648, + "vm_loss": 0.1743 + }, + { + "epoch": 0.3172510046442236, + "lm_loss": 2.0508, + "step": 1648, + "vm_loss": 0.228 + }, + { + "epoch": 0.3172510046442236, + "lm_loss": 2.2192, + "step": 1648, + "vm_loss": 0.1583 + }, + { + "epoch": 0.3172510046442236, + "lm_loss": 2.3756, + "step": 1648, + "vm_loss": 0.1493 + }, + { + "epoch": 0.317443511321799, + "grad_norm": 2.7837210529558276, + "learning_rate": 1.914363678823299e-05, + "loss": 2.4242, + "step": 1649 + }, + { + "epoch": 0.31763601799937435, + "grad_norm": 3.0424197234439223, + "learning_rate": 1.9142373926512713e-05, + "loss": 2.3841, + "step": 1650 + }, + { + "epoch": 0.3178285246769497, + "grad_norm": 2.9039461576908883, + "learning_rate": 1.9141110176036726e-05, + "loss": 2.4165, + "step": 1651 + }, + { + "epoch": 0.3180210313545251, + "grad_norm": 2.8248227439757554, + "learning_rate": 1.9139845536927882e-05, + "loss": 2.392, + "step": 1652 + }, + { + "epoch": 0.3182135380321005, + "grad_norm": 2.8970420013193574, + "learning_rate": 1.9138580009309118e-05, + "loss": 2.4086, + "step": 1653 + }, + { + "epoch": 0.3184060447096759, + "grad_norm": 3.1914263169991117, + "learning_rate": 1.9137313593303465e-05, + "loss": 2.4908, + "step": 1654 + }, + { + "epoch": 0.31859855138725124, + "grad_norm": 2.829729838111605, + "learning_rate": 1.9136046289034025e-05, + "loss": 2.3981, + "step": 1655 + }, + { + "epoch": 0.3187910580648266, + "grad_norm": 2.9959100645433243, + "learning_rate": 1.9134778096624e-05, + "loss": 2.4189, + "step": 1656 + }, + { + "epoch": 0.3187910580648266, + "lm_loss": 2.175, + "step": 1656, + "vm_loss": 0.1799 + }, + { + "epoch": 0.3187910580648266, + "lm_loss": 2.4143, + "step": 1656, + "vm_loss": 0.2296 + }, + { + "epoch": 0.3187910580648266, + "lm_loss": 2.2002, + "step": 1656, + "vm_loss": 0.1912 + }, + { + "epoch": 0.3187910580648266, + "lm_loss": 1.8123, + "step": 1656, + "vm_loss": 0.138 + }, + { + "epoch": 0.3187910580648266, + "lm_loss": 2.4234, + "step": 1656, + "vm_loss": 0.2117 + }, + { + "epoch": 0.3187910580648266, + "lm_loss": 2.0486, + "step": 1656, + "vm_loss": 0.1653 + }, + { + "epoch": 0.3187910580648266, + "lm_loss": 2.6399, + "step": 1656, + "vm_loss": 0.1923 + }, + { + "epoch": 0.3187910580648266, + "lm_loss": 2.4223, + "step": 1656, + "vm_loss": 0.1529 + }, + { + "epoch": 0.318983564742402, + "grad_norm": 3.190890267865329, + "learning_rate": 1.9133509016196683e-05, + "loss": 2.4414, + "step": 1657 + }, + { + "epoch": 0.3191760714199774, + "grad_norm": 3.1852361419316733, + "learning_rate": 1.9132239047875433e-05, + "loss": 2.4034, + "step": 1658 + }, + { + "epoch": 0.31936857809755276, + "grad_norm": 3.1914000050905473, + "learning_rate": 1.9130968191783715e-05, + "loss": 2.441, + "step": 1659 + }, + { + "epoch": 0.3195610847751281, + "grad_norm": 3.310642752320234, + "learning_rate": 1.9129696448045067e-05, + "loss": 2.4157, + "step": 1660 + }, + { + "epoch": 0.3197535914527035, + "grad_norm": 2.817753501061679, + "learning_rate": 1.912842381678312e-05, + "loss": 2.4377, + "step": 1661 + }, + { + "epoch": 0.3199460981302789, + "grad_norm": 3.1035160019109282, + "learning_rate": 1.9127150298121596e-05, + "loss": 2.4038, + "step": 1662 + }, + { + "epoch": 0.3201386048078543, + "grad_norm": 3.122888134091364, + "learning_rate": 1.912587589218429e-05, + "loss": 2.4088, + "step": 1663 + }, + { + "epoch": 0.32033111148542964, + "grad_norm": 3.019935114378441, + "learning_rate": 1.9124600599095094e-05, + "loss": 2.3971, + "step": 1664 + }, + { + "epoch": 0.32033111148542964, + "lm_loss": 2.081, + "step": 1664, + "vm_loss": 0.1584 + }, + { + "epoch": 0.32033111148542964, + "lm_loss": 2.3194, + "step": 1664, + "vm_loss": 0.1864 + }, + { + "epoch": 0.32033111148542964, + "lm_loss": 2.2441, + "step": 1664, + "vm_loss": 0.2384 + }, + { + "epoch": 0.32033111148542964, + "lm_loss": 2.1281, + "step": 1664, + "vm_loss": 0.2268 + }, + { + "epoch": 0.32033111148542964, + "lm_loss": 2.3737, + "step": 1664, + "vm_loss": 0.1669 + }, + { + "epoch": 0.32033111148542964, + "lm_loss": 1.9669, + "step": 1664, + "vm_loss": 0.2021 + }, + { + "epoch": 0.32033111148542964, + "lm_loss": 2.0802, + "step": 1664, + "vm_loss": 0.2034 + }, + { + "epoch": 0.32033111148542964, + "lm_loss": 2.6208, + "step": 1664, + "vm_loss": 0.1368 + }, + { + "epoch": 0.320523618163005, + "grad_norm": 3.0228353955625904, + "learning_rate": 1.9123324418977982e-05, + "loss": 2.4683, + "step": 1665 + }, + { + "epoch": 0.32071612484058043, + "grad_norm": 3.0205956839577044, + "learning_rate": 1.9122047351957012e-05, + "loss": 2.3904, + "step": 1666 + }, + { + "epoch": 0.3209086315181558, + "grad_norm": 3.2464196347789547, + "learning_rate": 1.9120769398156338e-05, + "loss": 2.3831, + "step": 1667 + }, + { + "epoch": 0.32110113819573116, + "grad_norm": 2.7621650225226544, + "learning_rate": 1.9119490557700192e-05, + "loss": 2.3747, + "step": 1668 + }, + { + "epoch": 0.3212936448733065, + "grad_norm": 2.898171373720304, + "learning_rate": 1.9118210830712886e-05, + "loss": 2.4245, + "step": 1669 + }, + { + "epoch": 0.32148615155088195, + "grad_norm": 3.2810832839521313, + "learning_rate": 1.9116930217318834e-05, + "loss": 2.4284, + "step": 1670 + }, + { + "epoch": 0.3216786582284573, + "grad_norm": 2.6934133638520765, + "learning_rate": 1.9115648717642527e-05, + "loss": 2.4347, + "step": 1671 + }, + { + "epoch": 0.3218711649060327, + "grad_norm": 3.0350892141694956, + "learning_rate": 1.911436633180854e-05, + "loss": 2.4417, + "step": 1672 + }, + { + "epoch": 0.3218711649060327, + "lm_loss": 2.4947, + "step": 1672, + "vm_loss": 0.2099 + }, + { + "epoch": 0.3218711649060327, + "lm_loss": 1.78, + "step": 1672, + "vm_loss": 0.1638 + }, + { + "epoch": 0.3218711649060327, + "lm_loss": 2.3463, + "step": 1672, + "vm_loss": 0.1886 + }, + { + "epoch": 0.3218711649060327, + "lm_loss": 2.0045, + "step": 1672, + "vm_loss": 0.1878 + }, + { + "epoch": 0.3218711649060327, + "lm_loss": 2.379, + "step": 1672, + "vm_loss": 0.1729 + }, + { + "epoch": 0.3218711649060327, + "lm_loss": 2.1747, + "step": 1672, + "vm_loss": 0.1727 + }, + { + "epoch": 0.3218711649060327, + "lm_loss": 2.2924, + "step": 1672, + "vm_loss": 0.183 + }, + { + "epoch": 0.3218711649060327, + "lm_loss": 2.1637, + "step": 1672, + "vm_loss": 0.1294 + }, + { + "epoch": 0.32206367158360805, + "grad_norm": 3.2101618268595367, + "learning_rate": 1.911308305994154e-05, + "loss": 2.4393, + "step": 1673 + }, + { + "epoch": 0.3222561782611834, + "grad_norm": 2.5739350618931116, + "learning_rate": 1.911179890216627e-05, + "loss": 2.4652, + "step": 1674 + }, + { + "epoch": 0.32244868493875883, + "grad_norm": 3.0075596556350295, + "learning_rate": 1.9110513858607576e-05, + "loss": 2.4356, + "step": 1675 + }, + { + "epoch": 0.3226411916163342, + "grad_norm": 2.862037629948916, + "learning_rate": 1.9109227929390378e-05, + "loss": 2.4667, + "step": 1676 + }, + { + "epoch": 0.32283369829390957, + "grad_norm": 2.776365763194087, + "learning_rate": 1.9107941114639685e-05, + "loss": 2.413, + "step": 1677 + }, + { + "epoch": 0.32302620497148493, + "grad_norm": 2.988130151602014, + "learning_rate": 1.9106653414480593e-05, + "loss": 2.3915, + "step": 1678 + }, + { + "epoch": 0.32321871164906035, + "grad_norm": 3.2591702958630417, + "learning_rate": 1.9105364829038278e-05, + "loss": 2.4165, + "step": 1679 + }, + { + "epoch": 0.3234112183266357, + "grad_norm": 2.794659666012045, + "learning_rate": 1.910407535843801e-05, + "loss": 2.4127, + "step": 1680 + }, + { + "epoch": 0.3234112183266357, + "lm_loss": 2.2186, + "step": 1680, + "vm_loss": 0.265 + }, + { + "epoch": 0.3234112183266357, + "lm_loss": 2.1581, + "step": 1680, + "vm_loss": 0.1523 + }, + { + "epoch": 0.3234112183266357, + "lm_loss": 2.2775, + "step": 1680, + "vm_loss": 0.1368 + }, + { + "epoch": 0.3234112183266357, + "lm_loss": 2.3566, + "step": 1680, + "vm_loss": 0.1784 + }, + { + "epoch": 0.3234112183266357, + "lm_loss": 1.9381, + "step": 1680, + "vm_loss": 0.1248 + }, + { + "epoch": 0.3234112183266357, + "lm_loss": 2.157, + "step": 1680, + "vm_loss": 0.217 + }, + { + "epoch": 0.3234112183266357, + "lm_loss": 2.1727, + "step": 1680, + "vm_loss": 0.1477 + }, + { + "epoch": 0.3234112183266357, + "lm_loss": 2.0669, + "step": 1680, + "vm_loss": 0.1803 + }, + { + "epoch": 0.3236037250042111, + "grad_norm": 3.073408880660531, + "learning_rate": 1.9102785002805145e-05, + "loss": 2.3909, + "step": 1681 + }, + { + "epoch": 0.32379623168178645, + "grad_norm": 2.9220658268757975, + "learning_rate": 1.9101493762265116e-05, + "loss": 2.414, + "step": 1682 + }, + { + "epoch": 0.3239887383593618, + "grad_norm": 2.9992045034805628, + "learning_rate": 1.9100201636943453e-05, + "loss": 2.38, + "step": 1683 + }, + { + "epoch": 0.32418124503693724, + "grad_norm": 2.9880082163340265, + "learning_rate": 1.909890862696576e-05, + "loss": 2.4029, + "step": 1684 + }, + { + "epoch": 0.3243737517145126, + "grad_norm": 3.0729355375353853, + "learning_rate": 1.9097614732457745e-05, + "loss": 2.4072, + "step": 1685 + }, + { + "epoch": 0.32456625839208797, + "grad_norm": 3.068605934741322, + "learning_rate": 1.9096319953545186e-05, + "loss": 2.4419, + "step": 1686 + }, + { + "epoch": 0.32475876506966334, + "grad_norm": 3.3148660484575436, + "learning_rate": 1.9095024290353948e-05, + "loss": 2.4111, + "step": 1687 + }, + { + "epoch": 0.32495127174723876, + "grad_norm": 3.229534212884842, + "learning_rate": 1.9093727743009993e-05, + "loss": 2.4089, + "step": 1688 + }, + { + "epoch": 0.32495127174723876, + "lm_loss": 2.3924, + "step": 1688, + "vm_loss": 0.1323 + }, + { + "epoch": 0.32495127174723876, + "lm_loss": 2.2844, + "step": 1688, + "vm_loss": 0.183 + }, + { + "epoch": 0.32495127174723876, + "lm_loss": 2.3729, + "step": 1688, + "vm_loss": 0.1703 + }, + { + "epoch": 0.32495127174723876, + "lm_loss": 2.2958, + "step": 1688, + "vm_loss": 0.1847 + }, + { + "epoch": 0.32495127174723876, + "lm_loss": 2.3885, + "step": 1688, + "vm_loss": 0.1664 + }, + { + "epoch": 0.32495127174723876, + "lm_loss": 2.1698, + "step": 1688, + "vm_loss": 0.1933 + }, + { + "epoch": 0.32495127174723876, + "lm_loss": 2.3188, + "step": 1688, + "vm_loss": 0.1698 + }, + { + "epoch": 0.32495127174723876, + "lm_loss": 2.1556, + "step": 1688, + "vm_loss": 0.1708 + }, + { + "epoch": 0.3251437784248141, + "grad_norm": 3.014794639758744, + "learning_rate": 1.9092430311639355e-05, + "loss": 2.4353, + "step": 1689 + }, + { + "epoch": 0.3253362851023895, + "grad_norm": 3.070224386406534, + "learning_rate": 1.909113199636817e-05, + "loss": 2.4172, + "step": 1690 + }, + { + "epoch": 0.32552879177996485, + "grad_norm": 2.832930561833768, + "learning_rate": 1.908983279732264e-05, + "loss": 2.4414, + "step": 1691 + }, + { + "epoch": 0.3257212984575402, + "grad_norm": 3.228483190648978, + "learning_rate": 1.908853271462907e-05, + "loss": 2.4296, + "step": 1692 + }, + { + "epoch": 0.32591380513511564, + "grad_norm": 3.259311739741599, + "learning_rate": 1.9087231748413847e-05, + "loss": 2.3699, + "step": 1693 + }, + { + "epoch": 0.326106311812691, + "grad_norm": 2.6838979497458655, + "learning_rate": 1.9085929898803437e-05, + "loss": 2.4361, + "step": 1694 + }, + { + "epoch": 0.3262988184902664, + "grad_norm": 3.615176605280966, + "learning_rate": 1.9084627165924397e-05, + "loss": 2.4544, + "step": 1695 + }, + { + "epoch": 0.32649132516784174, + "grad_norm": 2.963390452799069, + "learning_rate": 1.908332354990337e-05, + "loss": 2.4173, + "step": 1696 + }, + { + "epoch": 0.32649132516784174, + "lm_loss": 2.2288, + "step": 1696, + "vm_loss": 0.1661 + }, + { + "epoch": 0.32649132516784174, + "lm_loss": 2.2827, + "step": 1696, + "vm_loss": 0.195 + }, + { + "epoch": 0.32649132516784174, + "lm_loss": 2.5743, + "step": 1696, + "vm_loss": 0.1049 + }, + { + "epoch": 0.32649132516784174, + "lm_loss": 2.0604, + "step": 1696, + "vm_loss": 0.1643 + }, + { + "epoch": 0.32649132516784174, + "lm_loss": 2.3513, + "step": 1696, + "vm_loss": 0.2499 + }, + { + "epoch": 0.32649132516784174, + "lm_loss": 2.0254, + "step": 1696, + "vm_loss": 0.1222 + }, + { + "epoch": 0.32649132516784174, + "lm_loss": 2.0627, + "step": 1696, + "vm_loss": 0.1927 + }, + { + "epoch": 0.32649132516784174, + "lm_loss": 2.0476, + "step": 1696, + "vm_loss": 0.1947 + }, + { + "epoch": 0.32668383184541716, + "grad_norm": 3.059725368462857, + "learning_rate": 1.9082019050867086e-05, + "loss": 2.3985, + "step": 1697 + }, + { + "epoch": 0.3268763385229925, + "grad_norm": 3.0129163406099533, + "learning_rate": 1.9080713668942356e-05, + "loss": 2.4074, + "step": 1698 + }, + { + "epoch": 0.3270688452005679, + "grad_norm": 3.1096098924196807, + "learning_rate": 1.907940740425608e-05, + "loss": 2.4537, + "step": 1699 + }, + { + "epoch": 0.32726135187814326, + "grad_norm": 2.890239355800731, + "learning_rate": 1.9078100256935247e-05, + "loss": 2.424, + "step": 1700 + }, + { + "epoch": 0.3274538585557186, + "grad_norm": 3.170518539649865, + "learning_rate": 1.9076792227106926e-05, + "loss": 2.3731, + "step": 1701 + }, + { + "epoch": 0.32764636523329405, + "grad_norm": 3.4547990948980143, + "learning_rate": 1.9075483314898275e-05, + "loss": 2.4053, + "step": 1702 + }, + { + "epoch": 0.3278388719108694, + "grad_norm": 2.568403546763582, + "learning_rate": 1.9074173520436535e-05, + "loss": 2.3553, + "step": 1703 + }, + { + "epoch": 0.3280313785884448, + "grad_norm": 2.852690727468107, + "learning_rate": 1.9072862843849036e-05, + "loss": 2.4605, + "step": 1704 + }, + { + "epoch": 0.3280313785884448, + "lm_loss": 2.1918, + "step": 1704, + "vm_loss": 0.1622 + }, + { + "epoch": 0.3280313785884448, + "lm_loss": 2.2387, + "step": 1704, + "vm_loss": 0.1949 + }, + { + "epoch": 0.3280313785884448, + "lm_loss": 2.3272, + "step": 1704, + "vm_loss": 0.2146 + }, + { + "epoch": 0.3280313785884448, + "lm_loss": 2.2215, + "step": 1704, + "vm_loss": 0.1721 + }, + { + "epoch": 0.3280313785884448, + "lm_loss": 2.3218, + "step": 1704, + "vm_loss": 0.1788 + }, + { + "epoch": 0.3280313785884448, + "lm_loss": 2.3165, + "step": 1704, + "vm_loss": 0.2045 + }, + { + "epoch": 0.3280313785884448, + "lm_loss": 2.4307, + "step": 1704, + "vm_loss": 0.1509 + }, + { + "epoch": 0.3280313785884448, + "lm_loss": 2.1909, + "step": 1704, + "vm_loss": 0.1557 + }, + { + "epoch": 0.32822388526602014, + "grad_norm": 3.05549450215224, + "learning_rate": 1.9071551285263195e-05, + "loss": 2.4405, + "step": 1705 + }, + { + "epoch": 0.32841639194359556, + "grad_norm": 3.1269760108753033, + "learning_rate": 1.9070238844806507e-05, + "loss": 2.3716, + "step": 1706 + }, + { + "epoch": 0.32860889862117093, + "grad_norm": 3.097977814972827, + "learning_rate": 1.906892552260656e-05, + "loss": 2.4153, + "step": 1707 + }, + { + "epoch": 0.3288014052987463, + "grad_norm": 3.2565685454359348, + "learning_rate": 1.906761131879103e-05, + "loss": 2.4378, + "step": 1708 + }, + { + "epoch": 0.32899391197632166, + "grad_norm": 3.5405687730250266, + "learning_rate": 1.906629623348767e-05, + "loss": 2.4818, + "step": 1709 + }, + { + "epoch": 0.3291864186538971, + "grad_norm": 2.9302540013991396, + "learning_rate": 1.9064980266824323e-05, + "loss": 2.446, + "step": 1710 + }, + { + "epoch": 0.32937892533147245, + "grad_norm": 3.2652875543626037, + "learning_rate": 1.9063663418928923e-05, + "loss": 2.4593, + "step": 1711 + }, + { + "epoch": 0.3295714320090478, + "grad_norm": 3.460715665643919, + "learning_rate": 1.906234568992948e-05, + "loss": 2.4212, + "step": 1712 + }, + { + "epoch": 0.3295714320090478, + "lm_loss": 2.3905, + "step": 1712, + "vm_loss": 0.1948 + }, + { + "epoch": 0.3295714320090478, + "lm_loss": 2.4512, + "step": 1712, + "vm_loss": 0.197 + }, + { + "epoch": 0.3295714320090478, + "lm_loss": 2.0487, + "step": 1712, + "vm_loss": 0.238 + }, + { + "epoch": 0.3295714320090478, + "lm_loss": 2.1492, + "step": 1712, + "vm_loss": 0.1901 + }, + { + "epoch": 0.3295714320090478, + "lm_loss": 2.2014, + "step": 1712, + "vm_loss": 0.1451 + }, + { + "epoch": 0.3295714320090478, + "lm_loss": 2.4585, + "step": 1712, + "vm_loss": 0.1844 + }, + { + "epoch": 0.3295714320090478, + "lm_loss": 2.2054, + "step": 1712, + "vm_loss": 0.1206 + }, + { + "epoch": 0.3295714320090478, + "lm_loss": 2.2655, + "step": 1712, + "vm_loss": 0.1791 + }, + { + "epoch": 0.3297639386866232, + "grad_norm": 3.1561803387048624, + "learning_rate": 1.9061027079954093e-05, + "loss": 2.4079, + "step": 1713 + }, + { + "epoch": 0.32995644536419855, + "grad_norm": 4.209720800062066, + "learning_rate": 1.9059707589130946e-05, + "loss": 2.4078, + "step": 1714 + }, + { + "epoch": 0.33014895204177397, + "grad_norm": 3.0566142302859673, + "learning_rate": 1.9058387217588322e-05, + "loss": 2.3901, + "step": 1715 + }, + { + "epoch": 0.33034145871934933, + "grad_norm": 3.8504528922520103, + "learning_rate": 1.9057065965454562e-05, + "loss": 2.3737, + "step": 1716 + }, + { + "epoch": 0.3305339653969247, + "grad_norm": 3.5168665645160204, + "learning_rate": 1.905574383285812e-05, + "loss": 2.3657, + "step": 1717 + }, + { + "epoch": 0.33072647207450007, + "grad_norm": 3.154826785041629, + "learning_rate": 1.905442081992752e-05, + "loss": 2.4031, + "step": 1718 + }, + { + "epoch": 0.3309189787520755, + "grad_norm": 3.2774373440695252, + "learning_rate": 1.9053096926791377e-05, + "loss": 2.5024, + "step": 1719 + }, + { + "epoch": 0.33111148542965085, + "grad_norm": 3.676863375596527, + "learning_rate": 1.905177215357839e-05, + "loss": 2.4191, + "step": 1720 + }, + { + "epoch": 0.33111148542965085, + "lm_loss": 2.5633, + "step": 1720, + "vm_loss": 0.1273 + }, + { + "epoch": 0.33111148542965085, + "lm_loss": 2.5542, + "step": 1720, + "vm_loss": 0.1256 + }, + { + "epoch": 0.33111148542965085, + "lm_loss": 1.9477, + "step": 1720, + "vm_loss": 0.2004 + }, + { + "epoch": 0.33111148542965085, + "lm_loss": 2.1229, + "step": 1720, + "vm_loss": 0.1739 + }, + { + "epoch": 0.33111148542965085, + "lm_loss": 1.9846, + "step": 1720, + "vm_loss": 0.1195 + }, + { + "epoch": 0.33111148542965085, + "lm_loss": 2.0449, + "step": 1720, + "vm_loss": 0.1518 + }, + { + "epoch": 0.33111148542965085, + "lm_loss": 2.0597, + "step": 1720, + "vm_loss": 0.156 + }, + { + "epoch": 0.33111148542965085, + "lm_loss": 2.4556, + "step": 1720, + "vm_loss": 0.09 + }, + { + "epoch": 0.3313039921072262, + "grad_norm": 3.4302961132949092, + "learning_rate": 1.905044650041734e-05, + "loss": 2.3374, + "step": 1721 + }, + { + "epoch": 0.3314964987848016, + "grad_norm": 2.9757935556438113, + "learning_rate": 1.9049119967437108e-05, + "loss": 2.388, + "step": 1722 + }, + { + "epoch": 0.33168900546237695, + "grad_norm": 3.131951025886107, + "learning_rate": 1.9047792554766637e-05, + "loss": 2.3646, + "step": 1723 + }, + { + "epoch": 0.3318815121399524, + "grad_norm": 3.1522615196499633, + "learning_rate": 1.9046464262534978e-05, + "loss": 2.4163, + "step": 1724 + }, + { + "epoch": 0.33207401881752774, + "grad_norm": 3.205922169167952, + "learning_rate": 1.9045135090871254e-05, + "loss": 2.3618, + "step": 1725 + }, + { + "epoch": 0.3322665254951031, + "grad_norm": 3.5196443172186838, + "learning_rate": 1.9043805039904677e-05, + "loss": 2.4858, + "step": 1726 + }, + { + "epoch": 0.33245903217267847, + "grad_norm": 2.987227039786243, + "learning_rate": 1.9042474109764545e-05, + "loss": 2.3571, + "step": 1727 + }, + { + "epoch": 0.3326515388502539, + "grad_norm": 3.3088066481019505, + "learning_rate": 1.9041142300580247e-05, + "loss": 2.3551, + "step": 1728 + }, + { + "epoch": 0.3326515388502539, + "lm_loss": 2.2981, + "step": 1728, + "vm_loss": 0.153 + }, + { + "epoch": 0.3326515388502539, + "lm_loss": 2.4517, + "step": 1728, + "vm_loss": 0.1429 + }, + { + "epoch": 0.3326515388502539, + "lm_loss": 2.0383, + "step": 1728, + "vm_loss": 0.1942 + }, + { + "epoch": 0.3326515388502539, + "lm_loss": 1.8719, + "step": 1728, + "vm_loss": 0.1746 + }, + { + "epoch": 0.3326515388502539, + "lm_loss": 2.1821, + "step": 1728, + "vm_loss": 0.2171 + }, + { + "epoch": 0.3326515388502539, + "lm_loss": 2.31, + "step": 1728, + "vm_loss": 0.1749 + }, + { + "epoch": 0.3326515388502539, + "lm_loss": 2.3241, + "step": 1728, + "vm_loss": 0.1506 + }, + { + "epoch": 0.3326515388502539, + "lm_loss": 1.9967, + "step": 1728, + "vm_loss": 0.189 + }, + { + "epoch": 0.33284404552782926, + "grad_norm": 3.088237616719538, + "learning_rate": 1.9039809612481245e-05, + "loss": 2.3887, + "step": 1729 + }, + { + "epoch": 0.3330365522054046, + "grad_norm": 3.119223866435456, + "learning_rate": 1.9038476045597093e-05, + "loss": 2.4067, + "step": 1730 + }, + { + "epoch": 0.33322905888298, + "grad_norm": 2.6996055623863056, + "learning_rate": 1.903714160005744e-05, + "loss": 2.3805, + "step": 1731 + }, + { + "epoch": 0.33342156556055536, + "grad_norm": 3.385718510407307, + "learning_rate": 1.9035806275991996e-05, + "loss": 2.3792, + "step": 1732 + }, + { + "epoch": 0.3336140722381308, + "grad_norm": 2.7826967696854075, + "learning_rate": 1.9034470073530588e-05, + "loss": 2.3799, + "step": 1733 + }, + { + "epoch": 0.33380657891570614, + "grad_norm": 2.890236769159508, + "learning_rate": 1.90331329928031e-05, + "loss": 2.4318, + "step": 1734 + }, + { + "epoch": 0.3339990855932815, + "grad_norm": 2.906373585337404, + "learning_rate": 1.903179503393952e-05, + "loss": 2.3809, + "step": 1735 + }, + { + "epoch": 0.3341915922708569, + "grad_norm": 3.358284100078741, + "learning_rate": 1.903045619706991e-05, + "loss": 2.4329, + "step": 1736 + }, + { + "epoch": 0.3341915922708569, + "lm_loss": 2.1365, + "step": 1736, + "vm_loss": 0.1364 + }, + { + "epoch": 0.3341915922708569, + "lm_loss": 1.9399, + "step": 1736, + "vm_loss": 0.1694 + }, + { + "epoch": 0.3341915922708569, + "lm_loss": 1.9106, + "step": 1736, + "vm_loss": 0.1337 + }, + { + "epoch": 0.3341915922708569, + "lm_loss": 2.3076, + "step": 1736, + "vm_loss": 0.1751 + }, + { + "epoch": 0.3341915922708569, + "lm_loss": 2.2196, + "step": 1736, + "vm_loss": 0.2207 + }, + { + "epoch": 0.3341915922708569, + "lm_loss": 2.174, + "step": 1736, + "vm_loss": 0.1847 + }, + { + "epoch": 0.3341915922708569, + "lm_loss": 2.4505, + "step": 1736, + "vm_loss": 0.2018 + }, + { + "epoch": 0.3341915922708569, + "lm_loss": 2.2559, + "step": 1736, + "vm_loss": 0.1708 + }, + { + "epoch": 0.3343840989484323, + "grad_norm": 3.0287045228388507, + "learning_rate": 1.9029116482324427e-05, + "loss": 2.3799, + "step": 1737 + }, + { + "epoch": 0.33457660562600766, + "grad_norm": 3.146579728110227, + "learning_rate": 1.90277758898333e-05, + "loss": 2.4323, + "step": 1738 + }, + { + "epoch": 0.334769112303583, + "grad_norm": 3.300009059371359, + "learning_rate": 1.9026434419726866e-05, + "loss": 2.3945, + "step": 1739 + }, + { + "epoch": 0.3349616189811584, + "grad_norm": 3.2676158409395524, + "learning_rate": 1.9025092072135514e-05, + "loss": 2.4328, + "step": 1740 + }, + { + "epoch": 0.3351541256587338, + "grad_norm": 3.602263620536653, + "learning_rate": 1.9023748847189757e-05, + "loss": 2.3842, + "step": 1741 + }, + { + "epoch": 0.3353466323363092, + "grad_norm": 3.0877383327328305, + "learning_rate": 1.9022404745020165e-05, + "loss": 2.3776, + "step": 1742 + }, + { + "epoch": 0.33553913901388455, + "grad_norm": 3.7428783919632225, + "learning_rate": 1.9021059765757396e-05, + "loss": 2.4108, + "step": 1743 + }, + { + "epoch": 0.3357316456914599, + "grad_norm": 2.7777833462674875, + "learning_rate": 1.9019713909532207e-05, + "loss": 2.3975, + "step": 1744 + }, + { + "epoch": 0.3357316456914599, + "lm_loss": 2.1749, + "step": 1744, + "vm_loss": 0.1874 + }, + { + "epoch": 0.3357316456914599, + "lm_loss": 2.2814, + "step": 1744, + "vm_loss": 0.2063 + }, + { + "epoch": 0.3357316456914599, + "lm_loss": 1.8623, + "step": 1744, + "vm_loss": 0.1663 + }, + { + "epoch": 0.3357316456914599, + "lm_loss": 2.4433, + "step": 1744, + "vm_loss": 0.1022 + }, + { + "epoch": 0.3357316456914599, + "lm_loss": 2.3347, + "step": 1744, + "vm_loss": 0.1946 + }, + { + "epoch": 0.3357316456914599, + "lm_loss": 2.12, + "step": 1744, + "vm_loss": 0.1887 + }, + { + "epoch": 0.3357316456914599, + "lm_loss": 2.6005, + "step": 1744, + "vm_loss": 0.2413 + }, + { + "epoch": 0.3357316456914599, + "lm_loss": 2.1044, + "step": 1744, + "vm_loss": 0.2047 + }, + { + "epoch": 0.3359241523690353, + "grad_norm": 2.889883464809344, + "learning_rate": 1.901836717647543e-05, + "loss": 2.4433, + "step": 1745 + }, + { + "epoch": 0.3361166590466107, + "grad_norm": 3.50359113674953, + "learning_rate": 1.9017019566717984e-05, + "loss": 2.3941, + "step": 1746 + }, + { + "epoch": 0.33630916572418607, + "grad_norm": 3.4836649085891516, + "learning_rate": 1.9015671080390876e-05, + "loss": 2.4313, + "step": 1747 + }, + { + "epoch": 0.33650167240176143, + "grad_norm": 2.6283468362876437, + "learning_rate": 1.9014321717625198e-05, + "loss": 2.4079, + "step": 1748 + }, + { + "epoch": 0.3366941790793368, + "grad_norm": 2.9780936113384917, + "learning_rate": 1.9012971478552117e-05, + "loss": 2.4014, + "step": 1749 + }, + { + "epoch": 0.3368866857569122, + "grad_norm": 3.3658841309946586, + "learning_rate": 1.9011620363302898e-05, + "loss": 2.4677, + "step": 1750 + }, + { + "epoch": 0.3370791924344876, + "grad_norm": 3.1426882548979176, + "learning_rate": 1.901026837200889e-05, + "loss": 2.388, + "step": 1751 + }, + { + "epoch": 0.33727169911206295, + "grad_norm": 2.594427955619498, + "learning_rate": 1.9008915504801524e-05, + "loss": 2.3904, + "step": 1752 + }, + { + "epoch": 0.33727169911206295, + "lm_loss": 2.0674, + "step": 1752, + "vm_loss": 0.1897 + }, + { + "epoch": 0.33727169911206295, + "lm_loss": 2.2917, + "step": 1752, + "vm_loss": 0.2054 + }, + { + "epoch": 0.33727169911206295, + "lm_loss": 2.0626, + "step": 1752, + "vm_loss": 0.1948 + }, + { + "epoch": 0.33727169911206295, + "lm_loss": 2.2077, + "step": 1752, + "vm_loss": 0.1749 + }, + { + "epoch": 0.33727169911206295, + "lm_loss": 2.3863, + "step": 1752, + "vm_loss": 0.196 + }, + { + "epoch": 0.33727169911206295, + "lm_loss": 2.0476, + "step": 1752, + "vm_loss": 0.0952 + }, + { + "epoch": 0.33727169911206295, + "lm_loss": 2.317, + "step": 1752, + "vm_loss": 0.1756 + }, + { + "epoch": 0.33727169911206295, + "lm_loss": 2.4646, + "step": 1752, + "vm_loss": 0.1201 + }, + { + "epoch": 0.3374642057896383, + "grad_norm": 3.528213143762084, + "learning_rate": 1.900756176181231e-05, + "loss": 2.4266, + "step": 1753 + }, + { + "epoch": 0.3376567124672137, + "grad_norm": 3.2015170544565463, + "learning_rate": 1.900620714317285e-05, + "loss": 2.3885, + "step": 1754 + }, + { + "epoch": 0.3378492191447891, + "grad_norm": 2.8546790721533655, + "learning_rate": 1.900485164901484e-05, + "loss": 2.3852, + "step": 1755 + }, + { + "epoch": 0.33804172582236447, + "grad_norm": 3.589044908522458, + "learning_rate": 1.9003495279470035e-05, + "loss": 2.4071, + "step": 1756 + }, + { + "epoch": 0.33823423249993984, + "grad_norm": 3.2932653210014444, + "learning_rate": 1.9002138034670308e-05, + "loss": 2.4403, + "step": 1757 + }, + { + "epoch": 0.3384267391775152, + "grad_norm": 3.0482781234838994, + "learning_rate": 1.900077991474759e-05, + "loss": 2.4359, + "step": 1758 + }, + { + "epoch": 0.3386192458550906, + "grad_norm": 3.3508684315845363, + "learning_rate": 1.8999420919833912e-05, + "loss": 2.3675, + "step": 1759 + }, + { + "epoch": 0.338811752532666, + "grad_norm": 3.3243098362628505, + "learning_rate": 1.899806105006138e-05, + "loss": 2.4104, + "step": 1760 + }, + { + "epoch": 0.338811752532666, + "lm_loss": 2.2646, + "step": 1760, + "vm_loss": 0.1078 + }, + { + "epoch": 0.338811752532666, + "lm_loss": 2.2976, + "step": 1760, + "vm_loss": 0.1381 + }, + { + "epoch": 0.338811752532666, + "lm_loss": 2.1831, + "step": 1760, + "vm_loss": 0.176 + }, + { + "epoch": 0.338811752532666, + "lm_loss": 1.9797, + "step": 1760, + "vm_loss": 0.1489 + }, + { + "epoch": 0.338811752532666, + "lm_loss": 2.4605, + "step": 1760, + "vm_loss": 0.1876 + }, + { + "epoch": 0.338811752532666, + "lm_loss": 2.4575, + "step": 1760, + "vm_loss": 0.1957 + }, + { + "epoch": 0.338811752532666, + "lm_loss": 2.3641, + "step": 1760, + "vm_loss": 0.1334 + }, + { + "epoch": 0.338811752532666, + "lm_loss": 2.2425, + "step": 1760, + "vm_loss": 0.1975 + }, + { + "epoch": 0.33900425921024135, + "grad_norm": 3.642381160922666, + "learning_rate": 1.89967003055622e-05, + "loss": 2.3777, + "step": 1761 + }, + { + "epoch": 0.3391967658878167, + "grad_norm": 2.8564030083501253, + "learning_rate": 1.8995338686468648e-05, + "loss": 2.3715, + "step": 1762 + }, + { + "epoch": 0.3393892725653921, + "grad_norm": 3.233731017390185, + "learning_rate": 1.899397619291309e-05, + "loss": 2.3613, + "step": 1763 + }, + { + "epoch": 0.3395817792429675, + "grad_norm": 2.982087407599961, + "learning_rate": 1.8992612825027978e-05, + "loss": 2.394, + "step": 1764 + }, + { + "epoch": 0.3397742859205429, + "grad_norm": 3.3766804340379717, + "learning_rate": 1.8991248582945855e-05, + "loss": 2.4355, + "step": 1765 + }, + { + "epoch": 0.33996679259811824, + "grad_norm": 3.0277486123598254, + "learning_rate": 1.8989883466799335e-05, + "loss": 2.3914, + "step": 1766 + }, + { + "epoch": 0.3401592992756936, + "grad_norm": 3.3329987674067856, + "learning_rate": 1.898851747672113e-05, + "loss": 2.3859, + "step": 1767 + }, + { + "epoch": 0.340351805953269, + "grad_norm": 3.249698568758816, + "learning_rate": 1.8987150612844028e-05, + "loss": 2.4515, + "step": 1768 + }, + { + "epoch": 0.340351805953269, + "lm_loss": 2.0811, + "step": 1768, + "vm_loss": 0.1337 + }, + { + "epoch": 0.340351805953269, + "lm_loss": 2.4001, + "step": 1768, + "vm_loss": 0.2158 + }, + { + "epoch": 0.340351805953269, + "lm_loss": 2.3949, + "step": 1768, + "vm_loss": 0.1885 + }, + { + "epoch": 0.340351805953269, + "lm_loss": 2.4494, + "step": 1768, + "vm_loss": 0.1568 + }, + { + "epoch": 0.340351805953269, + "lm_loss": 2.3516, + "step": 1768, + "vm_loss": 0.1773 + }, + { + "epoch": 0.340351805953269, + "lm_loss": 2.0027, + "step": 1768, + "vm_loss": 0.1432 + }, + { + "epoch": 0.340351805953269, + "lm_loss": 2.2244, + "step": 1768, + "vm_loss": 0.1554 + }, + { + "epoch": 0.340351805953269, + "lm_loss": 2.3771, + "step": 1768, + "vm_loss": 0.1677 + }, + { + "epoch": 0.3405443126308444, + "grad_norm": 3.468558048038586, + "learning_rate": 1.8985782875300908e-05, + "loss": 2.3769, + "step": 1769 + }, + { + "epoch": 0.34073681930841976, + "grad_norm": 3.0027107780383906, + "learning_rate": 1.8984414264224727e-05, + "loss": 2.459, + "step": 1770 + }, + { + "epoch": 0.3409293259859951, + "grad_norm": 3.0096247219000176, + "learning_rate": 1.898304477974854e-05, + "loss": 2.383, + "step": 1771 + }, + { + "epoch": 0.3411218326635705, + "grad_norm": 3.6875624244607264, + "learning_rate": 1.8981674422005472e-05, + "loss": 2.398, + "step": 1772 + }, + { + "epoch": 0.3413143393411459, + "grad_norm": 3.3956076388785212, + "learning_rate": 1.898030319112874e-05, + "loss": 2.4397, + "step": 1773 + }, + { + "epoch": 0.3415068460187213, + "grad_norm": 3.277790075803118, + "learning_rate": 1.8978931087251648e-05, + "loss": 2.4013, + "step": 1774 + }, + { + "epoch": 0.34169935269629664, + "grad_norm": 3.2445901587391934, + "learning_rate": 1.897755811050758e-05, + "loss": 2.4201, + "step": 1775 + }, + { + "epoch": 0.341891859373872, + "grad_norm": 3.4711477530604067, + "learning_rate": 1.8976184261030007e-05, + "loss": 2.4039, + "step": 1776 + }, + { + "epoch": 0.341891859373872, + "lm_loss": 2.5154, + "step": 1776, + "vm_loss": 0.1395 + }, + { + "epoch": 0.341891859373872, + "lm_loss": 1.9922, + "step": 1776, + "vm_loss": 0.201 + }, + { + "epoch": 0.341891859373872, + "lm_loss": 2.6143, + "step": 1776, + "vm_loss": 0.2043 + }, + { + "epoch": 0.341891859373872, + "lm_loss": 2.3017, + "step": 1776, + "vm_loss": 0.1488 + }, + { + "epoch": 0.341891859373872, + "lm_loss": 2.2973, + "step": 1776, + "vm_loss": 0.1933 + }, + { + "epoch": 0.341891859373872, + "lm_loss": 2.3998, + "step": 1776, + "vm_loss": 0.1788 + }, + { + "epoch": 0.341891859373872, + "lm_loss": 2.0749, + "step": 1776, + "vm_loss": 0.2089 + }, + { + "epoch": 0.341891859373872, + "lm_loss": 2.3958, + "step": 1776, + "vm_loss": 0.1724 + }, + { + "epoch": 0.34208436605144743, + "grad_norm": 3.2357586471960693, + "learning_rate": 1.8974809538952483e-05, + "loss": 2.4398, + "step": 1777 + }, + { + "epoch": 0.3422768727290228, + "grad_norm": 3.531348848539006, + "learning_rate": 1.897343394440865e-05, + "loss": 2.3907, + "step": 1778 + }, + { + "epoch": 0.34246937940659816, + "grad_norm": 3.0828718583120547, + "learning_rate": 1.897205747753224e-05, + "loss": 2.3617, + "step": 1779 + }, + { + "epoch": 0.34266188608417353, + "grad_norm": 3.0926075152284924, + "learning_rate": 1.8970680138457052e-05, + "loss": 2.383, + "step": 1780 + }, + { + "epoch": 0.34285439276174895, + "grad_norm": 3.2922114486366545, + "learning_rate": 1.8969301927316984e-05, + "loss": 2.4189, + "step": 1781 + }, + { + "epoch": 0.3430468994393243, + "grad_norm": 3.4601428580471794, + "learning_rate": 1.8967922844246025e-05, + "loss": 2.4457, + "step": 1782 + }, + { + "epoch": 0.3432394061168997, + "grad_norm": 2.540788546760732, + "learning_rate": 1.8966542889378225e-05, + "loss": 2.4001, + "step": 1783 + }, + { + "epoch": 0.34343191279447505, + "grad_norm": 3.1119921856114297, + "learning_rate": 1.8965162062847743e-05, + "loss": 2.4513, + "step": 1784 + }, + { + "epoch": 0.34343191279447505, + "lm_loss": 2.0376, + "step": 1784, + "vm_loss": 0.2298 + }, + { + "epoch": 0.34343191279447505, + "lm_loss": 2.2602, + "step": 1784, + "vm_loss": 0.1673 + }, + { + "epoch": 0.34343191279447505, + "lm_loss": 2.2952, + "step": 1784, + "vm_loss": 0.1319 + }, + { + "epoch": 0.34343191279447505, + "lm_loss": 2.4144, + "step": 1784, + "vm_loss": 0.1509 + }, + { + "epoch": 0.34343191279447505, + "lm_loss": 2.3366, + "step": 1784, + "vm_loss": 0.1453 + }, + { + "epoch": 0.34343191279447505, + "lm_loss": 2.3469, + "step": 1784, + "vm_loss": 0.1692 + }, + { + "epoch": 0.34343191279447505, + "lm_loss": 2.2138, + "step": 1784, + "vm_loss": 0.1603 + }, + { + "epoch": 0.34343191279447505, + "lm_loss": 2.1255, + "step": 1784, + "vm_loss": 0.1336 + }, + { + "epoch": 0.3436244194720504, + "grad_norm": 3.356424725142414, + "learning_rate": 1.8963780364788815e-05, + "loss": 2.3973, + "step": 1785 + }, + { + "epoch": 0.34381692614962583, + "grad_norm": 2.8489180512515246, + "learning_rate": 1.896239779533575e-05, + "loss": 2.384, + "step": 1786 + }, + { + "epoch": 0.3440094328272012, + "grad_norm": 3.487823571010014, + "learning_rate": 1.896101435462296e-05, + "loss": 2.4175, + "step": 1787 + }, + { + "epoch": 0.34420193950477657, + "grad_norm": 3.362377925537072, + "learning_rate": 1.8959630042784928e-05, + "loss": 2.3878, + "step": 1788 + }, + { + "epoch": 0.34439444618235193, + "grad_norm": 3.371671022224365, + "learning_rate": 1.895824485995623e-05, + "loss": 2.423, + "step": 1789 + }, + { + "epoch": 0.34458695285992735, + "grad_norm": 3.5484698135490906, + "learning_rate": 1.895685880627152e-05, + "loss": 2.4268, + "step": 1790 + }, + { + "epoch": 0.3447794595375027, + "grad_norm": 3.1922204183491356, + "learning_rate": 1.895547188186554e-05, + "loss": 2.4092, + "step": 1791 + }, + { + "epoch": 0.3449719662150781, + "grad_norm": 3.7753734759492965, + "learning_rate": 1.8954084086873124e-05, + "loss": 2.4394, + "step": 1792 + }, + { + "epoch": 0.3449719662150781, + "lm_loss": 2.4267, + "step": 1792, + "vm_loss": 0.1323 + }, + { + "epoch": 0.3449719662150781, + "lm_loss": 2.2821, + "step": 1792, + "vm_loss": 0.1628 + }, + { + "epoch": 0.3449719662150781, + "lm_loss": 2.3579, + "step": 1792, + "vm_loss": 0.204 + }, + { + "epoch": 0.3449719662150781, + "lm_loss": 2.2514, + "step": 1792, + "vm_loss": 0.1376 + }, + { + "epoch": 0.3449719662150781, + "lm_loss": 2.3238, + "step": 1792, + "vm_loss": 0.1532 + }, + { + "epoch": 0.3449719662150781, + "lm_loss": 2.251, + "step": 1792, + "vm_loss": 0.195 + }, + { + "epoch": 0.3449719662150781, + "lm_loss": 2.344, + "step": 1792, + "vm_loss": 0.1592 + }, + { + "epoch": 0.3449719662150781, + "lm_loss": 2.3249, + "step": 1792, + "vm_loss": 0.1547 + }, + { + "epoch": 0.34516447289265345, + "grad_norm": 3.032929745047781, + "learning_rate": 1.8952695421429176e-05, + "loss": 2.4199, + "step": 1793 + }, + { + "epoch": 0.3453569795702288, + "grad_norm": 3.9556924058657357, + "learning_rate": 1.8951305885668695e-05, + "loss": 2.3523, + "step": 1794 + }, + { + "epoch": 0.34554948624780424, + "grad_norm": 3.3988968425269697, + "learning_rate": 1.8949915479726763e-05, + "loss": 2.3717, + "step": 1795 + }, + { + "epoch": 0.3457419929253796, + "grad_norm": 3.0620316604197964, + "learning_rate": 1.8948524203738545e-05, + "loss": 2.4092, + "step": 1796 + }, + { + "epoch": 0.34593449960295497, + "grad_norm": 3.847200885087783, + "learning_rate": 1.8947132057839287e-05, + "loss": 2.3864, + "step": 1797 + }, + { + "epoch": 0.34612700628053034, + "grad_norm": 3.5311669754787562, + "learning_rate": 1.8945739042164325e-05, + "loss": 2.4425, + "step": 1798 + }, + { + "epoch": 0.34631951295810576, + "grad_norm": 3.706395456300257, + "learning_rate": 1.894434515684908e-05, + "loss": 2.3992, + "step": 1799 + }, + { + "epoch": 0.3465120196356811, + "grad_norm": 3.172605360763211, + "learning_rate": 1.894295040202905e-05, + "loss": 2.3147, + "step": 1800 + }, + { + "epoch": 0.3465120196356811, + "lm_loss": 2.0065, + "step": 1800, + "vm_loss": 0.1531 + }, + { + "epoch": 0.3465120196356811, + "lm_loss": 2.1158, + "step": 1800, + "vm_loss": 0.2535 + }, + { + "epoch": 0.3465120196356811, + "lm_loss": 2.2855, + "step": 1800, + "vm_loss": 0.12 + }, + { + "epoch": 0.3465120196356811, + "lm_loss": 2.0779, + "step": 1800, + "vm_loss": 0.141 + }, + { + "epoch": 0.3465120196356811, + "lm_loss": 2.3025, + "step": 1800, + "vm_loss": 0.1874 + }, + { + "epoch": 0.3465120196356811, + "lm_loss": 2.0014, + "step": 1800, + "vm_loss": 0.1386 + }, + { + "epoch": 0.3465120196356811, + "lm_loss": 2.1758, + "step": 1800, + "vm_loss": 0.2182 + }, + { + "epoch": 0.3465120196356811, + "lm_loss": 2.1542, + "step": 1800, + "vm_loss": 0.1562 + }, + { + "epoch": 0.3467045263132565, + "grad_norm": 2.9197881276349613, + "learning_rate": 1.894155477783983e-05, + "loss": 2.3709, + "step": 1801 + }, + { + "epoch": 0.34689703299083186, + "grad_norm": 3.4664267219060725, + "learning_rate": 1.8940158284417093e-05, + "loss": 2.4017, + "step": 1802 + }, + { + "epoch": 0.3470895396684072, + "grad_norm": 3.2253861471370797, + "learning_rate": 1.8938760921896592e-05, + "loss": 2.3781, + "step": 1803 + }, + { + "epoch": 0.34728204634598264, + "grad_norm": 2.875584731666018, + "learning_rate": 1.8937362690414165e-05, + "loss": 2.4179, + "step": 1804 + }, + { + "epoch": 0.347474553023558, + "grad_norm": 3.0420742296641143, + "learning_rate": 1.8935963590105745e-05, + "loss": 2.3615, + "step": 1805 + }, + { + "epoch": 0.3476670597011334, + "grad_norm": 3.3754393151986926, + "learning_rate": 1.893456362110734e-05, + "loss": 2.4128, + "step": 1806 + }, + { + "epoch": 0.34785956637870874, + "grad_norm": 3.080489929696284, + "learning_rate": 1.8933162783555044e-05, + "loss": 2.4182, + "step": 1807 + }, + { + "epoch": 0.34805207305628416, + "grad_norm": 3.151003955574368, + "learning_rate": 1.8931761077585037e-05, + "loss": 2.3303, + "step": 1808 + }, + { + "epoch": 0.34805207305628416, + "lm_loss": 2.422, + "step": 1808, + "vm_loss": 0.1876 + }, + { + "epoch": 0.34805207305628416, + "lm_loss": 2.2289, + "step": 1808, + "vm_loss": 0.2409 + }, + { + "epoch": 0.34805207305628416, + "lm_loss": 2.4526, + "step": 1808, + "vm_loss": 0.1422 + }, + { + "epoch": 0.34805207305628416, + "lm_loss": 2.0436, + "step": 1808, + "vm_loss": 0.1241 + }, + { + "epoch": 0.34805207305628416, + "lm_loss": 2.2206, + "step": 1808, + "vm_loss": 0.1432 + }, + { + "epoch": 0.34805207305628416, + "lm_loss": 2.3812, + "step": 1808, + "vm_loss": 0.1871 + }, + { + "epoch": 0.34805207305628416, + "lm_loss": 2.4423, + "step": 1808, + "vm_loss": 0.2103 + }, + { + "epoch": 0.34805207305628416, + "lm_loss": 2.4581, + "step": 1808, + "vm_loss": 0.1861 + }, + { + "epoch": 0.3482445797338595, + "grad_norm": 3.316248251295914, + "learning_rate": 1.8930358503333584e-05, + "loss": 2.4207, + "step": 1809 + }, + { + "epoch": 0.3484370864114349, + "grad_norm": 3.4721530021081173, + "learning_rate": 1.892895506093703e-05, + "loss": 2.3729, + "step": 1810 + }, + { + "epoch": 0.34862959308901026, + "grad_norm": 3.141333593403813, + "learning_rate": 1.892755075053181e-05, + "loss": 2.372, + "step": 1811 + }, + { + "epoch": 0.3488220997665857, + "grad_norm": 2.937787306669567, + "learning_rate": 1.8926145572254437e-05, + "loss": 2.3917, + "step": 1812 + }, + { + "epoch": 0.34901460644416105, + "grad_norm": 3.3638315082157693, + "learning_rate": 1.892473952624152e-05, + "loss": 2.4192, + "step": 1813 + }, + { + "epoch": 0.3492071131217364, + "grad_norm": 3.254866099193651, + "learning_rate": 1.892333261262974e-05, + "loss": 2.3966, + "step": 1814 + }, + { + "epoch": 0.3493996197993118, + "grad_norm": 3.0544369888150924, + "learning_rate": 1.8921924831555865e-05, + "loss": 2.3895, + "step": 1815 + }, + { + "epoch": 0.34959212647688714, + "grad_norm": 2.860844842503817, + "learning_rate": 1.8920516183156753e-05, + "loss": 2.3796, + "step": 1816 + }, + { + "epoch": 0.34959212647688714, + "lm_loss": 2.3789, + "step": 1816, + "vm_loss": 0.1253 + }, + { + "epoch": 0.34959212647688714, + "lm_loss": 2.4993, + "step": 1816, + "vm_loss": 0.1857 + }, + { + "epoch": 0.34959212647688714, + "lm_loss": 2.1931, + "step": 1816, + "vm_loss": 0.216 + }, + { + "epoch": 0.34959212647688714, + "lm_loss": 2.212, + "step": 1816, + "vm_loss": 0.1707 + }, + { + "epoch": 0.34959212647688714, + "lm_loss": 2.19, + "step": 1816, + "vm_loss": 0.2078 + }, + { + "epoch": 0.34959212647688714, + "lm_loss": 2.533, + "step": 1816, + "vm_loss": 0.1672 + }, + { + "epoch": 0.34959212647688714, + "lm_loss": 2.1039, + "step": 1816, + "vm_loss": 0.1133 + }, + { + "epoch": 0.34959212647688714, + "lm_loss": 2.2825, + "step": 1816, + "vm_loss": 0.1455 + }, + { + "epoch": 0.34978463315446257, + "grad_norm": 2.6868805256373354, + "learning_rate": 1.891910666756934e-05, + "loss": 2.3841, + "step": 1817 + }, + { + "epoch": 0.34997713983203793, + "grad_norm": 2.9523439883969744, + "learning_rate": 1.8917696284930654e-05, + "loss": 2.3594, + "step": 1818 + }, + { + "epoch": 0.3501696465096133, + "grad_norm": 3.1216210261934156, + "learning_rate": 1.8916285035377794e-05, + "loss": 2.4226, + "step": 1819 + }, + { + "epoch": 0.35036215318718866, + "grad_norm": 2.971388675922259, + "learning_rate": 1.8914872919047957e-05, + "loss": 2.3859, + "step": 1820 + }, + { + "epoch": 0.3505546598647641, + "grad_norm": 2.588473122039049, + "learning_rate": 1.8913459936078416e-05, + "loss": 2.4038, + "step": 1821 + }, + { + "epoch": 0.35074716654233945, + "grad_norm": 2.8638933386115832, + "learning_rate": 1.8912046086606537e-05, + "loss": 2.4036, + "step": 1822 + }, + { + "epoch": 0.3509396732199148, + "grad_norm": 3.2413608139304357, + "learning_rate": 1.891063137076976e-05, + "loss": 2.348, + "step": 1823 + }, + { + "epoch": 0.3511321798974902, + "grad_norm": 2.9059829753499127, + "learning_rate": 1.890921578870561e-05, + "loss": 2.4431, + "step": 1824 + }, + { + "epoch": 0.3511321798974902, + "lm_loss": 2.1095, + "step": 1824, + "vm_loss": 0.11 + }, + { + "epoch": 0.3511321798974902, + "lm_loss": 2.2551, + "step": 1824, + "vm_loss": 0.1915 + }, + { + "epoch": 0.3511321798974902, + "lm_loss": 2.468, + "step": 1824, + "vm_loss": 0.1697 + }, + { + "epoch": 0.3511321798974902, + "lm_loss": 1.9787, + "step": 1824, + "vm_loss": 0.1991 + }, + { + "epoch": 0.3511321798974902, + "lm_loss": 2.1936, + "step": 1824, + "vm_loss": 0.2466 + }, + { + "epoch": 0.3511321798974902, + "lm_loss": 2.2338, + "step": 1824, + "vm_loss": 0.1603 + }, + { + "epoch": 0.3511321798974902, + "lm_loss": 2.3164, + "step": 1824, + "vm_loss": 0.2023 + }, + { + "epoch": 0.3511321798974902, + "lm_loss": 2.2622, + "step": 1824, + "vm_loss": 0.1843 + }, + { + "epoch": 0.35132468657506555, + "grad_norm": 2.9135405139075585, + "learning_rate": 1.8907799340551707e-05, + "loss": 2.3874, + "step": 1825 + }, + { + "epoch": 0.35151719325264097, + "grad_norm": 2.8260875008078696, + "learning_rate": 1.890638202644574e-05, + "loss": 2.4063, + "step": 1826 + }, + { + "epoch": 0.35170969993021634, + "grad_norm": 3.2235505478452797, + "learning_rate": 1.8904963846525498e-05, + "loss": 2.4551, + "step": 1827 + }, + { + "epoch": 0.3519022066077917, + "grad_norm": 3.3452384596793023, + "learning_rate": 1.8903544800928842e-05, + "loss": 2.4453, + "step": 1828 + }, + { + "epoch": 0.35209471328536707, + "grad_norm": 3.0456168870735483, + "learning_rate": 1.8902124889793717e-05, + "loss": 2.4042, + "step": 1829 + }, + { + "epoch": 0.3522872199629425, + "grad_norm": 3.1266663814464515, + "learning_rate": 1.8900704113258166e-05, + "loss": 2.3673, + "step": 1830 + }, + { + "epoch": 0.35247972664051785, + "grad_norm": 2.853481701145706, + "learning_rate": 1.88992824714603e-05, + "loss": 2.4409, + "step": 1831 + }, + { + "epoch": 0.3526722333180932, + "grad_norm": 3.3333952572076164, + "learning_rate": 1.8897859964538323e-05, + "loss": 2.4424, + "step": 1832 + }, + { + "epoch": 0.3526722333180932, + "lm_loss": 2.0206, + "step": 1832, + "vm_loss": 0.181 + }, + { + "epoch": 0.3526722333180932, + "lm_loss": 1.9559, + "step": 1832, + "vm_loss": 0.1949 + }, + { + "epoch": 0.3526722333180932, + "lm_loss": 2.1728, + "step": 1832, + "vm_loss": 0.2192 + }, + { + "epoch": 0.3526722333180932, + "lm_loss": 2.4046, + "step": 1832, + "vm_loss": 0.1567 + }, + { + "epoch": 0.3526722333180932, + "lm_loss": 2.0467, + "step": 1832, + "vm_loss": 0.1427 + }, + { + "epoch": 0.3526722333180932, + "lm_loss": 2.0041, + "step": 1832, + "vm_loss": 0.1799 + }, + { + "epoch": 0.3526722333180932, + "lm_loss": 2.2573, + "step": 1832, + "vm_loss": 0.2052 + }, + { + "epoch": 0.3526722333180932, + "lm_loss": 2.0069, + "step": 1832, + "vm_loss": 0.1806 + }, + { + "epoch": 0.3528647399956686, + "grad_norm": 3.1756662783733796, + "learning_rate": 1.8896436592630524e-05, + "loss": 2.367, + "step": 1833 + }, + { + "epoch": 0.35305724667324395, + "grad_norm": 2.768869024649846, + "learning_rate": 1.8895012355875262e-05, + "loss": 2.396, + "step": 1834 + }, + { + "epoch": 0.3532497533508194, + "grad_norm": 2.603034053391109, + "learning_rate": 1.8893587254411004e-05, + "loss": 2.3929, + "step": 1835 + }, + { + "epoch": 0.35344226002839474, + "grad_norm": 2.8764151681111314, + "learning_rate": 1.8892161288376282e-05, + "loss": 2.4061, + "step": 1836 + }, + { + "epoch": 0.3536347667059701, + "grad_norm": 3.5656811328659135, + "learning_rate": 1.889073445790972e-05, + "loss": 2.4311, + "step": 1837 + }, + { + "epoch": 0.35382727338354547, + "grad_norm": 3.0329038542118147, + "learning_rate": 1.8889306763150023e-05, + "loss": 2.3644, + "step": 1838 + }, + { + "epoch": 0.3540197800611209, + "grad_norm": 2.6652974066488095, + "learning_rate": 1.8887878204235977e-05, + "loss": 2.4213, + "step": 1839 + }, + { + "epoch": 0.35421228673869626, + "grad_norm": 3.1406848904227873, + "learning_rate": 1.8886448781306464e-05, + "loss": 2.3764, + "step": 1840 + }, + { + "epoch": 0.35421228673869626, + "lm_loss": 2.2437, + "step": 1840, + "vm_loss": 0.2457 + }, + { + "epoch": 0.35421228673869626, + "lm_loss": 2.2813, + "step": 1840, + "vm_loss": 0.2046 + }, + { + "epoch": 0.35421228673869626, + "lm_loss": 1.8656, + "step": 1840, + "vm_loss": 0.1354 + }, + { + "epoch": 0.35421228673869626, + "lm_loss": 2.3191, + "step": 1840, + "vm_loss": 0.1957 + }, + { + "epoch": 0.35421228673869626, + "lm_loss": 2.3323, + "step": 1840, + "vm_loss": 0.2042 + }, + { + "epoch": 0.35421228673869626, + "lm_loss": 2.399, + "step": 1840, + "vm_loss": 0.1638 + }, + { + "epoch": 0.35421228673869626, + "lm_loss": 2.1422, + "step": 1840, + "vm_loss": 0.2358 + }, + { + "epoch": 0.35421228673869626, + "lm_loss": 2.3123, + "step": 1840, + "vm_loss": 0.21 + }, + { + "epoch": 0.3544047934162716, + "grad_norm": 2.9327422211932563, + "learning_rate": 1.8885018494500436e-05, + "loss": 2.3661, + "step": 1841 + }, + { + "epoch": 0.354597300093847, + "grad_norm": 3.2333966435578896, + "learning_rate": 1.888358734395694e-05, + "loss": 2.385, + "step": 1842 + }, + { + "epoch": 0.3547898067714224, + "grad_norm": 3.1461269216384915, + "learning_rate": 1.88821553298151e-05, + "loss": 2.375, + "step": 1843 + }, + { + "epoch": 0.3549823134489978, + "grad_norm": 3.0833457080814863, + "learning_rate": 1.8880722452214123e-05, + "loss": 2.4091, + "step": 1844 + }, + { + "epoch": 0.35517482012657314, + "grad_norm": 3.2730325133341727, + "learning_rate": 1.887928871129331e-05, + "loss": 2.4039, + "step": 1845 + }, + { + "epoch": 0.3553673268041485, + "grad_norm": 2.680616521521122, + "learning_rate": 1.887785410719203e-05, + "loss": 2.4179, + "step": 1846 + }, + { + "epoch": 0.3555598334817239, + "grad_norm": 2.8682883067424667, + "learning_rate": 1.8876418640049757e-05, + "loss": 2.3778, + "step": 1847 + }, + { + "epoch": 0.3557523401592993, + "grad_norm": 3.032017642586472, + "learning_rate": 1.8874982310006024e-05, + "loss": 2.367, + "step": 1848 + }, + { + "epoch": 0.3557523401592993, + "lm_loss": 2.186, + "step": 1848, + "vm_loss": 0.1847 + }, + { + "epoch": 0.3557523401592993, + "lm_loss": 2.4609, + "step": 1848, + "vm_loss": 0.1875 + }, + { + "epoch": 0.3557523401592993, + "lm_loss": 2.2255, + "step": 1848, + "vm_loss": 0.2218 + }, + { + "epoch": 0.3557523401592993, + "lm_loss": 2.2172, + "step": 1848, + "vm_loss": 0.1734 + }, + { + "epoch": 0.3557523401592993, + "lm_loss": 2.3671, + "step": 1848, + "vm_loss": 0.1601 + }, + { + "epoch": 0.3557523401592993, + "lm_loss": 2.2448, + "step": 1848, + "vm_loss": 0.1371 + }, + { + "epoch": 0.3557523401592993, + "lm_loss": 2.3494, + "step": 1848, + "vm_loss": 0.1969 + }, + { + "epoch": 0.3557523401592993, + "lm_loss": 2.3271, + "step": 1848, + "vm_loss": 0.1392 + }, + { + "epoch": 0.35594484683687466, + "grad_norm": 2.9426966384262343, + "learning_rate": 1.8873545117200467e-05, + "loss": 2.4336, + "step": 1849 + }, + { + "epoch": 0.35613735351445003, + "grad_norm": 3.3873083524636898, + "learning_rate": 1.88721070617728e-05, + "loss": 2.401, + "step": 1850 + }, + { + "epoch": 0.3563298601920254, + "grad_norm": 3.0028306949601373, + "learning_rate": 1.887066814386282e-05, + "loss": 2.4453, + "step": 1851 + }, + { + "epoch": 0.3565223668696008, + "grad_norm": 3.2221959961558078, + "learning_rate": 1.8869228363610406e-05, + "loss": 2.4398, + "step": 1852 + }, + { + "epoch": 0.3567148735471762, + "grad_norm": 2.793047686194737, + "learning_rate": 1.8867787721155527e-05, + "loss": 2.4019, + "step": 1853 + }, + { + "epoch": 0.35690738022475155, + "grad_norm": 3.553907347466848, + "learning_rate": 1.8866346216638226e-05, + "loss": 2.3934, + "step": 1854 + }, + { + "epoch": 0.3570998869023269, + "grad_norm": 3.3494787533447306, + "learning_rate": 1.886490385019864e-05, + "loss": 2.3857, + "step": 1855 + }, + { + "epoch": 0.3572923935799023, + "grad_norm": 2.833512966230157, + "learning_rate": 1.8863460621976985e-05, + "loss": 2.3305, + "step": 1856 + }, + { + "epoch": 0.3572923935799023, + "lm_loss": 2.0537, + "step": 1856, + "vm_loss": 0.1693 + }, + { + "epoch": 0.3572923935799023, + "lm_loss": 2.144, + "step": 1856, + "vm_loss": 0.1805 + }, + { + "epoch": 0.3572923935799023, + "lm_loss": 2.2537, + "step": 1856, + "vm_loss": 0.2306 + }, + { + "epoch": 0.3572923935799023, + "lm_loss": 2.5965, + "step": 1856, + "vm_loss": 0.1495 + }, + { + "epoch": 0.3572923935799023, + "lm_loss": 2.0588, + "step": 1856, + "vm_loss": 0.1589 + }, + { + "epoch": 0.3572923935799023, + "lm_loss": 2.2977, + "step": 1856, + "vm_loss": 0.1537 + }, + { + "epoch": 0.3572923935799023, + "lm_loss": 2.3968, + "step": 1856, + "vm_loss": 0.1155 + }, + { + "epoch": 0.3572923935799023, + "lm_loss": 2.1899, + "step": 1856, + "vm_loss": 0.1624 + }, + { + "epoch": 0.3574849002574777, + "grad_norm": 2.798688270336243, + "learning_rate": 1.8862016532113562e-05, + "loss": 2.3998, + "step": 1857 + }, + { + "epoch": 0.35767740693505307, + "grad_norm": 3.070572149767001, + "learning_rate": 1.886057158074875e-05, + "loss": 2.3942, + "step": 1858 + }, + { + "epoch": 0.35786991361262843, + "grad_norm": 3.302594386145543, + "learning_rate": 1.8859125768023024e-05, + "loss": 2.385, + "step": 1859 + }, + { + "epoch": 0.3580624202902038, + "grad_norm": 2.6300784393289405, + "learning_rate": 1.885767909407693e-05, + "loss": 2.3233, + "step": 1860 + }, + { + "epoch": 0.3582549269677792, + "grad_norm": 3.1732526080158125, + "learning_rate": 1.8856231559051106e-05, + "loss": 2.3801, + "step": 1861 + }, + { + "epoch": 0.3584474336453546, + "grad_norm": 2.9475980974253666, + "learning_rate": 1.885478316308627e-05, + "loss": 2.3744, + "step": 1862 + }, + { + "epoch": 0.35863994032292995, + "grad_norm": 2.9845057808900037, + "learning_rate": 1.8853333906323226e-05, + "loss": 2.4119, + "step": 1863 + }, + { + "epoch": 0.3588324470005053, + "grad_norm": 3.122668779938046, + "learning_rate": 1.885188378890286e-05, + "loss": 2.3693, + "step": 1864 + }, + { + "epoch": 0.3588324470005053, + "lm_loss": 2.2618, + "step": 1864, + "vm_loss": 0.1565 + }, + { + "epoch": 0.3588324470005053, + "lm_loss": 2.1943, + "step": 1864, + "vm_loss": 0.1758 + }, + { + "epoch": 0.3588324470005053, + "lm_loss": 2.1947, + "step": 1864, + "vm_loss": 0.1776 + }, + { + "epoch": 0.3588324470005053, + "lm_loss": 2.395, + "step": 1864, + "vm_loss": 0.1834 + }, + { + "epoch": 0.3588324470005053, + "lm_loss": 1.795, + "step": 1864, + "vm_loss": 0.1294 + }, + { + "epoch": 0.3588324470005053, + "lm_loss": 1.8709, + "step": 1864, + "vm_loss": 0.1581 + }, + { + "epoch": 0.3588324470005053, + "lm_loss": 2.2653, + "step": 1864, + "vm_loss": 0.1617 + }, + { + "epoch": 0.3588324470005053, + "lm_loss": 1.7913, + "step": 1864, + "vm_loss": 0.2729 + }, + { + "epoch": 0.3590249536780807, + "grad_norm": 2.758639760471608, + "learning_rate": 1.8850432810966138e-05, + "loss": 2.3818, + "step": 1865 + }, + { + "epoch": 0.3592174603556561, + "grad_norm": 2.8130805823246354, + "learning_rate": 1.8848980972654117e-05, + "loss": 2.3793, + "step": 1866 + }, + { + "epoch": 0.35940996703323147, + "grad_norm": 2.9681859596450773, + "learning_rate": 1.8847528274107934e-05, + "loss": 2.4219, + "step": 1867 + }, + { + "epoch": 0.35960247371080684, + "grad_norm": 2.9066682466205656, + "learning_rate": 1.8846074715468813e-05, + "loss": 2.3847, + "step": 1868 + }, + { + "epoch": 0.3597949803883822, + "grad_norm": 2.7409136167481067, + "learning_rate": 1.8844620296878052e-05, + "loss": 2.4063, + "step": 1869 + }, + { + "epoch": 0.3599874870659576, + "grad_norm": 3.4945430884721254, + "learning_rate": 1.8843165018477045e-05, + "loss": 2.4089, + "step": 1870 + }, + { + "epoch": 0.360179993743533, + "grad_norm": 3.2025875184671793, + "learning_rate": 1.884170888040726e-05, + "loss": 2.4408, + "step": 1871 + }, + { + "epoch": 0.36037250042110835, + "grad_norm": 3.417333771236262, + "learning_rate": 1.884025188281025e-05, + "loss": 2.3907, + "step": 1872 + }, + { + "epoch": 0.36037250042110835, + "lm_loss": 2.1101, + "step": 1872, + "vm_loss": 0.0883 + }, + { + "epoch": 0.36037250042110835, + "lm_loss": 2.4935, + "step": 1872, + "vm_loss": 0.208 + }, + { + "epoch": 0.36037250042110835, + "lm_loss": 2.2495, + "step": 1872, + "vm_loss": 0.1997 + }, + { + "epoch": 0.36037250042110835, + "lm_loss": 1.9439, + "step": 1872, + "vm_loss": 0.1479 + }, + { + "epoch": 0.36037250042110835, + "lm_loss": 2.3628, + "step": 1872, + "vm_loss": 0.2658 + }, + { + "epoch": 0.36037250042110835, + "lm_loss": 2.2243, + "step": 1872, + "vm_loss": 0.1873 + }, + { + "epoch": 0.36037250042110835, + "lm_loss": 1.8425, + "step": 1872, + "vm_loss": 0.2241 + }, + { + "epoch": 0.36037250042110835, + "lm_loss": 2.1349, + "step": 1872, + "vm_loss": 0.1281 + }, + { + "epoch": 0.3605650070986837, + "grad_norm": 3.1327989390307103, + "learning_rate": 1.883879402582766e-05, + "loss": 2.4281, + "step": 1873 + }, + { + "epoch": 0.3607575137762591, + "grad_norm": 2.8734236311134635, + "learning_rate": 1.8837335309601214e-05, + "loss": 2.4036, + "step": 1874 + }, + { + "epoch": 0.3609500204538345, + "grad_norm": 3.49351557950339, + "learning_rate": 1.8835875734272705e-05, + "loss": 2.3593, + "step": 1875 + }, + { + "epoch": 0.3611425271314099, + "grad_norm": 4.007147016689313, + "learning_rate": 1.8834415299984035e-05, + "loss": 2.3566, + "step": 1876 + }, + { + "epoch": 0.36133503380898524, + "grad_norm": 3.2609412662507213, + "learning_rate": 1.8832954006877174e-05, + "loss": 2.3452, + "step": 1877 + }, + { + "epoch": 0.3615275404865606, + "grad_norm": 3.5884454466667903, + "learning_rate": 1.8831491855094176e-05, + "loss": 2.3751, + "step": 1878 + }, + { + "epoch": 0.361720047164136, + "grad_norm": 3.193687345270011, + "learning_rate": 1.883002884477718e-05, + "loss": 2.3745, + "step": 1879 + }, + { + "epoch": 0.3619125538417114, + "grad_norm": 3.2010730910497385, + "learning_rate": 1.8828564976068414e-05, + "loss": 2.3465, + "step": 1880 + }, + { + "epoch": 0.3619125538417114, + "lm_loss": 2.3082, + "step": 1880, + "vm_loss": 0.1229 + }, + { + "epoch": 0.3619125538417114, + "lm_loss": 2.5152, + "step": 1880, + "vm_loss": 0.1176 + }, + { + "epoch": 0.3619125538417114, + "lm_loss": 2.2222, + "step": 1880, + "vm_loss": 0.2007 + }, + { + "epoch": 0.3619125538417114, + "lm_loss": 2.3972, + "step": 1880, + "vm_loss": 0.1763 + }, + { + "epoch": 0.3619125538417114, + "lm_loss": 2.4112, + "step": 1880, + "vm_loss": 0.1632 + }, + { + "epoch": 0.3619125538417114, + "lm_loss": 2.4877, + "step": 1880, + "vm_loss": 0.1865 + }, + { + "epoch": 0.3619125538417114, + "lm_loss": 1.8775, + "step": 1880, + "vm_loss": 0.1909 + }, + { + "epoch": 0.3619125538417114, + "lm_loss": 2.4603, + "step": 1880, + "vm_loss": 0.1219 + }, + { + "epoch": 0.36210506051928676, + "grad_norm": 3.414759674740355, + "learning_rate": 1.882710024911018e-05, + "loss": 2.4158, + "step": 1881 + }, + { + "epoch": 0.3622975671968621, + "grad_norm": 3.0290105832648715, + "learning_rate": 1.882563466404487e-05, + "loss": 2.3991, + "step": 1882 + }, + { + "epoch": 0.36249007387443755, + "grad_norm": 3.3146882372720246, + "learning_rate": 1.882416822101496e-05, + "loss": 2.4032, + "step": 1883 + }, + { + "epoch": 0.3626825805520129, + "grad_norm": 3.746256597796705, + "learning_rate": 1.8822700920163002e-05, + "loss": 2.4028, + "step": 1884 + }, + { + "epoch": 0.3628750872295883, + "grad_norm": 2.7888523675049153, + "learning_rate": 1.882123276163164e-05, + "loss": 2.3824, + "step": 1885 + }, + { + "epoch": 0.36306759390716364, + "grad_norm": 3.098190781130727, + "learning_rate": 1.88197637455636e-05, + "loss": 2.3873, + "step": 1886 + }, + { + "epoch": 0.363260100584739, + "grad_norm": 3.1384152081597523, + "learning_rate": 1.8818293872101683e-05, + "loss": 2.4482, + "step": 1887 + }, + { + "epoch": 0.36345260726231443, + "grad_norm": 2.8336697049162636, + "learning_rate": 1.881682314138878e-05, + "loss": 2.361, + "step": 1888 + }, + { + "epoch": 0.36345260726231443, + "lm_loss": 2.0311, + "step": 1888, + "vm_loss": 0.2052 + }, + { + "epoch": 0.36345260726231443, + "lm_loss": 2.038, + "step": 1888, + "vm_loss": 0.206 + }, + { + "epoch": 0.36345260726231443, + "lm_loss": 2.2507, + "step": 1888, + "vm_loss": 0.18 + }, + { + "epoch": 0.36345260726231443, + "lm_loss": 2.2721, + "step": 1888, + "vm_loss": 0.2198 + }, + { + "epoch": 0.36345260726231443, + "lm_loss": 2.3505, + "step": 1888, + "vm_loss": 0.2067 + }, + { + "epoch": 0.36345260726231443, + "lm_loss": 2.1526, + "step": 1888, + "vm_loss": 0.1473 + }, + { + "epoch": 0.36345260726231443, + "lm_loss": 2.4873, + "step": 1888, + "vm_loss": 0.1685 + }, + { + "epoch": 0.36345260726231443, + "lm_loss": 2.3849, + "step": 1888, + "vm_loss": 0.1184 + }, + { + "epoch": 0.3636451139398898, + "grad_norm": 2.8860805308762987, + "learning_rate": 1.8815351553567875e-05, + "loss": 2.3837, + "step": 1889 + }, + { + "epoch": 0.36383762061746516, + "grad_norm": 3.0820414318235, + "learning_rate": 1.8813879108782014e-05, + "loss": 2.3988, + "step": 1890 + }, + { + "epoch": 0.36403012729504053, + "grad_norm": 3.0225315714033343, + "learning_rate": 1.881240580717434e-05, + "loss": 2.3823, + "step": 1891 + }, + { + "epoch": 0.36422263397261595, + "grad_norm": 3.10262796845587, + "learning_rate": 1.881093164888808e-05, + "loss": 2.3791, + "step": 1892 + }, + { + "epoch": 0.3644151406501913, + "grad_norm": 3.1554255793581354, + "learning_rate": 1.880945663406654e-05, + "loss": 2.3871, + "step": 1893 + }, + { + "epoch": 0.3646076473277667, + "grad_norm": 2.747965661804524, + "learning_rate": 1.8807980762853108e-05, + "loss": 2.3625, + "step": 1894 + }, + { + "epoch": 0.36480015400534205, + "grad_norm": 3.3691012981847837, + "learning_rate": 1.8806504035391257e-05, + "loss": 2.4316, + "step": 1895 + }, + { + "epoch": 0.3649926606829174, + "grad_norm": 3.2147784256728933, + "learning_rate": 1.8805026451824547e-05, + "loss": 2.3692, + "step": 1896 + }, + { + "epoch": 0.3649926606829174, + "lm_loss": 1.8805, + "step": 1896, + "vm_loss": 0.1814 + }, + { + "epoch": 0.3649926606829174, + "lm_loss": 1.9792, + "step": 1896, + "vm_loss": 0.1085 + }, + { + "epoch": 0.3649926606829174, + "lm_loss": 2.3455, + "step": 1896, + "vm_loss": 0.2345 + }, + { + "epoch": 0.3649926606829174, + "lm_loss": 1.9959, + "step": 1896, + "vm_loss": 0.1554 + }, + { + "epoch": 0.3649926606829174, + "lm_loss": 2.2495, + "step": 1896, + "vm_loss": 0.1686 + }, + { + "epoch": 0.3649926606829174, + "lm_loss": 2.4827, + "step": 1896, + "vm_loss": 0.1281 + }, + { + "epoch": 0.3649926606829174, + "lm_loss": 2.1369, + "step": 1896, + "vm_loss": 0.1381 + }, + { + "epoch": 0.3649926606829174, + "lm_loss": 2.3965, + "step": 1896, + "vm_loss": 0.2047 + }, + { + "epoch": 0.36518516736049283, + "grad_norm": 2.6187823658075997, + "learning_rate": 1.8803548012296616e-05, + "loss": 2.374, + "step": 1897 + }, + { + "epoch": 0.3653776740380682, + "grad_norm": 3.320039766282272, + "learning_rate": 1.8802068716951187e-05, + "loss": 2.3727, + "step": 1898 + }, + { + "epoch": 0.36557018071564357, + "grad_norm": 3.675441337628201, + "learning_rate": 1.8800588565932068e-05, + "loss": 2.3737, + "step": 1899 + }, + { + "epoch": 0.36576268739321893, + "grad_norm": 3.007094943657547, + "learning_rate": 1.879910755938315e-05, + "loss": 2.3961, + "step": 1900 + }, + { + "epoch": 0.36595519407079435, + "grad_norm": 3.0405979683581363, + "learning_rate": 1.8797625697448402e-05, + "loss": 2.3969, + "step": 1901 + }, + { + "epoch": 0.3661477007483697, + "grad_norm": 3.8330291231058227, + "learning_rate": 1.8796142980271887e-05, + "loss": 2.3956, + "step": 1902 + }, + { + "epoch": 0.3663402074259451, + "grad_norm": 3.4382474826056484, + "learning_rate": 1.879465940799773e-05, + "loss": 2.3945, + "step": 1903 + }, + { + "epoch": 0.36653271410352045, + "grad_norm": 2.9126509643226517, + "learning_rate": 1.8793174980770167e-05, + "loss": 2.3739, + "step": 1904 + }, + { + "epoch": 0.36653271410352045, + "lm_loss": 2.3646, + "step": 1904, + "vm_loss": 0.1655 + }, + { + "epoch": 0.36653271410352045, + "lm_loss": 1.9501, + "step": 1904, + "vm_loss": 0.1483 + }, + { + "epoch": 0.36653271410352045, + "lm_loss": 2.2726, + "step": 1904, + "vm_loss": 0.1279 + }, + { + "epoch": 0.36653271410352045, + "lm_loss": 2.3145, + "step": 1904, + "vm_loss": 0.1672 + }, + { + "epoch": 0.36653271410352045, + "lm_loss": 2.343, + "step": 1904, + "vm_loss": 0.2131 + }, + { + "epoch": 0.36653271410352045, + "lm_loss": 1.8461, + "step": 1904, + "vm_loss": 0.128 + }, + { + "epoch": 0.36653271410352045, + "lm_loss": 2.1248, + "step": 1904, + "vm_loss": 0.1535 + }, + { + "epoch": 0.36653271410352045, + "lm_loss": 2.1239, + "step": 1904, + "vm_loss": 0.2085 + }, + { + "epoch": 0.3667252207810958, + "grad_norm": 3.5683443964942034, + "learning_rate": 1.87916896987335e-05, + "loss": 2.4038, + "step": 1905 + }, + { + "epoch": 0.36691772745867124, + "grad_norm": 3.1788744561231446, + "learning_rate": 1.8790203562032114e-05, + "loss": 2.3457, + "step": 1906 + }, + { + "epoch": 0.3671102341362466, + "grad_norm": 3.0644531180815897, + "learning_rate": 1.8788716570810478e-05, + "loss": 2.3909, + "step": 1907 + }, + { + "epoch": 0.36730274081382197, + "grad_norm": 3.5609014826730947, + "learning_rate": 1.8787228725213155e-05, + "loss": 2.3211, + "step": 1908 + }, + { + "epoch": 0.36749524749139734, + "grad_norm": 3.3539129723707437, + "learning_rate": 1.878574002538478e-05, + "loss": 2.3694, + "step": 1909 + }, + { + "epoch": 0.36768775416897276, + "grad_norm": 2.8579089054359677, + "learning_rate": 1.878425047147007e-05, + "loss": 2.3609, + "step": 1910 + }, + { + "epoch": 0.3678802608465481, + "grad_norm": 3.226354542860252, + "learning_rate": 1.8782760063613827e-05, + "loss": 2.3683, + "step": 1911 + }, + { + "epoch": 0.3680727675241235, + "grad_norm": 3.847371245493553, + "learning_rate": 1.8781268801960947e-05, + "loss": 2.3816, + "step": 1912 + }, + { + "epoch": 0.3680727675241235, + "lm_loss": 2.2523, + "step": 1912, + "vm_loss": 0.1287 + }, + { + "epoch": 0.3680727675241235, + "lm_loss": 2.2065, + "step": 1912, + "vm_loss": 0.1458 + }, + { + "epoch": 0.3680727675241235, + "lm_loss": 2.0961, + "step": 1912, + "vm_loss": 0.1886 + }, + { + "epoch": 0.3680727675241235, + "lm_loss": 2.2932, + "step": 1912, + "vm_loss": 0.1323 + }, + { + "epoch": 0.3680727675241235, + "lm_loss": 2.2491, + "step": 1912, + "vm_loss": 0.1338 + }, + { + "epoch": 0.3680727675241235, + "lm_loss": 2.1999, + "step": 1912, + "vm_loss": 0.1184 + }, + { + "epoch": 0.3680727675241235, + "lm_loss": 1.9894, + "step": 1912, + "vm_loss": 0.1388 + }, + { + "epoch": 0.3680727675241235, + "lm_loss": 2.4337, + "step": 1912, + "vm_loss": 0.152 + }, + { + "epoch": 0.36826527420169886, + "grad_norm": 3.2629487262153583, + "learning_rate": 1.877977668665639e-05, + "loss": 2.3773, + "step": 1913 + }, + { + "epoch": 0.3684577808792743, + "grad_norm": 3.4941922701432473, + "learning_rate": 1.8778283717845216e-05, + "loss": 2.3957, + "step": 1914 + }, + { + "epoch": 0.36865028755684964, + "grad_norm": 3.367971513105956, + "learning_rate": 1.8776789895672557e-05, + "loss": 2.3977, + "step": 1915 + }, + { + "epoch": 0.368842794234425, + "grad_norm": 3.0785093524779445, + "learning_rate": 1.8775295220283633e-05, + "loss": 2.4138, + "step": 1916 + }, + { + "epoch": 0.3690353009120004, + "grad_norm": 3.163153285453088, + "learning_rate": 1.8773799691823745e-05, + "loss": 2.3586, + "step": 1917 + }, + { + "epoch": 0.36922780758957574, + "grad_norm": 3.066781181171109, + "learning_rate": 1.8772303310438275e-05, + "loss": 2.3674, + "step": 1918 + }, + { + "epoch": 0.36942031426715116, + "grad_norm": 3.140621907030957, + "learning_rate": 1.8770806076272694e-05, + "loss": 2.3615, + "step": 1919 + }, + { + "epoch": 0.36961282094472653, + "grad_norm": 3.3015796703934024, + "learning_rate": 1.8769307989472548e-05, + "loss": 2.4008, + "step": 1920 + }, + { + "epoch": 0.36961282094472653, + "lm_loss": 2.2325, + "step": 1920, + "vm_loss": 0.1834 + }, + { + "epoch": 0.36961282094472653, + "lm_loss": 1.9864, + "step": 1920, + "vm_loss": 0.1746 + }, + { + "epoch": 0.36961282094472653, + "lm_loss": 1.994, + "step": 1920, + "vm_loss": 0.2722 + }, + { + "epoch": 0.36961282094472653, + "lm_loss": 2.336, + "step": 1920, + "vm_loss": 0.1419 + }, + { + "epoch": 0.36961282094472653, + "lm_loss": 2.1098, + "step": 1920, + "vm_loss": 0.1503 + }, + { + "epoch": 0.36961282094472653, + "lm_loss": 2.2577, + "step": 1920, + "vm_loss": 0.1495 + }, + { + "epoch": 0.36961282094472653, + "lm_loss": 2.2293, + "step": 1920, + "vm_loss": 0.2073 + }, + { + "epoch": 0.36961282094472653, + "lm_loss": 2.423, + "step": 1920, + "vm_loss": 0.1074 + }, + { + "epoch": 0.3698053276223019, + "grad_norm": 3.8041246052027056, + "learning_rate": 1.8767809050183478e-05, + "loss": 2.3708, + "step": 1921 + }, + { + "epoch": 0.36999783429987726, + "grad_norm": 3.2521784935503453, + "learning_rate": 1.8766309258551195e-05, + "loss": 2.3597, + "step": 1922 + }, + { + "epoch": 0.3701903409774527, + "grad_norm": 3.248778469862273, + "learning_rate": 1.8764808614721495e-05, + "loss": 2.3936, + "step": 1923 + }, + { + "epoch": 0.37038284765502805, + "grad_norm": 3.560934149460666, + "learning_rate": 1.8763307118840264e-05, + "loss": 2.379, + "step": 1924 + }, + { + "epoch": 0.3705753543326034, + "grad_norm": 3.041505869883774, + "learning_rate": 1.8761804771053467e-05, + "loss": 2.3626, + "step": 1925 + }, + { + "epoch": 0.3707678610101788, + "grad_norm": 2.9401687744776828, + "learning_rate": 1.876030157150715e-05, + "loss": 2.348, + "step": 1926 + }, + { + "epoch": 0.37096036768775414, + "grad_norm": 3.0663044410933664, + "learning_rate": 1.8758797520347443e-05, + "loss": 2.4106, + "step": 1927 + }, + { + "epoch": 0.37115287436532957, + "grad_norm": 3.3873653068142526, + "learning_rate": 1.8757292617720558e-05, + "loss": 2.3837, + "step": 1928 + }, + { + "epoch": 0.37115287436532957, + "lm_loss": 2.3013, + "step": 1928, + "vm_loss": 0.1829 + }, + { + "epoch": 0.37115287436532957, + "lm_loss": 2.3144, + "step": 1928, + "vm_loss": 0.1887 + }, + { + "epoch": 0.37115287436532957, + "lm_loss": 2.2618, + "step": 1928, + "vm_loss": 0.1423 + }, + { + "epoch": 0.37115287436532957, + "lm_loss": 1.9892, + "step": 1928, + "vm_loss": 0.2157 + }, + { + "epoch": 0.37115287436532957, + "lm_loss": 2.1486, + "step": 1928, + "vm_loss": 0.1617 + }, + { + "epoch": 0.37115287436532957, + "lm_loss": 2.2018, + "step": 1928, + "vm_loss": 0.1451 + }, + { + "epoch": 0.37115287436532957, + "lm_loss": 2.3237, + "step": 1928, + "vm_loss": 0.1217 + }, + { + "epoch": 0.37115287436532957, + "lm_loss": 2.2211, + "step": 1928, + "vm_loss": 0.1945 + }, + { + "epoch": 0.37134538104290493, + "grad_norm": 3.377385777248594, + "learning_rate": 1.875578686377279e-05, + "loss": 2.3632, + "step": 1929 + }, + { + "epoch": 0.3715378877204803, + "grad_norm": 3.2211923182640727, + "learning_rate": 1.8754280258650522e-05, + "loss": 2.3739, + "step": 1930 + }, + { + "epoch": 0.37173039439805566, + "grad_norm": 2.889498804532783, + "learning_rate": 1.8752772802500215e-05, + "loss": 2.3582, + "step": 1931 + }, + { + "epoch": 0.3719229010756311, + "grad_norm": 2.5217446913551798, + "learning_rate": 1.8751264495468412e-05, + "loss": 2.3939, + "step": 1932 + }, + { + "epoch": 0.37211540775320645, + "grad_norm": 3.1654377622202854, + "learning_rate": 1.8749755337701738e-05, + "loss": 2.329, + "step": 1933 + }, + { + "epoch": 0.3723079144307818, + "grad_norm": 2.6859384383505804, + "learning_rate": 1.8748245329346895e-05, + "loss": 2.3646, + "step": 1934 + }, + { + "epoch": 0.3725004211083572, + "grad_norm": 2.7187052960278155, + "learning_rate": 1.8746734470550693e-05, + "loss": 2.3761, + "step": 1935 + }, + { + "epoch": 0.37269292778593255, + "grad_norm": 2.6963051840829473, + "learning_rate": 1.8745222761459995e-05, + "loss": 2.3997, + "step": 1936 + }, + { + "epoch": 0.37269292778593255, + "lm_loss": 1.9126, + "step": 1936, + "vm_loss": 0.1559 + }, + { + "epoch": 0.37269292778593255, + "lm_loss": 2.2587, + "step": 1936, + "vm_loss": 0.1499 + }, + { + "epoch": 0.37269292778593255, + "lm_loss": 2.1454, + "step": 1936, + "vm_loss": 0.1641 + }, + { + "epoch": 0.37269292778593255, + "lm_loss": 2.0813, + "step": 1936, + "vm_loss": 0.1222 + }, + { + "epoch": 0.37269292778593255, + "lm_loss": 2.2616, + "step": 1936, + "vm_loss": 0.2044 + }, + { + "epoch": 0.37269292778593255, + "lm_loss": 2.4239, + "step": 1936, + "vm_loss": 0.1577 + }, + { + "epoch": 0.37269292778593255, + "lm_loss": 1.7507, + "step": 1936, + "vm_loss": 0.1744 + }, + { + "epoch": 0.37269292778593255, + "lm_loss": 2.3021, + "step": 1936, + "vm_loss": 0.1777 + }, + { + "epoch": 0.37288543446350797, + "grad_norm": 3.0469395150127694, + "learning_rate": 1.8743710202221756e-05, + "loss": 2.3373, + "step": 1937 + }, + { + "epoch": 0.37307794114108334, + "grad_norm": 3.0424517645989617, + "learning_rate": 1.8742196792983023e-05, + "loss": 2.4125, + "step": 1938 + }, + { + "epoch": 0.3732704478186587, + "grad_norm": 2.9758849352586116, + "learning_rate": 1.874068253389092e-05, + "loss": 2.3958, + "step": 1939 + }, + { + "epoch": 0.37346295449623407, + "grad_norm": 2.847104432681737, + "learning_rate": 1.8739167425092644e-05, + "loss": 2.3616, + "step": 1940 + }, + { + "epoch": 0.3736554611738095, + "grad_norm": 2.712239491342634, + "learning_rate": 1.8737651466735487e-05, + "loss": 2.3634, + "step": 1941 + }, + { + "epoch": 0.37384796785138485, + "grad_norm": 2.9040744783936203, + "learning_rate": 1.8736134658966823e-05, + "loss": 2.3839, + "step": 1942 + }, + { + "epoch": 0.3740404745289602, + "grad_norm": 3.056239027324488, + "learning_rate": 1.8734617001934103e-05, + "loss": 2.3674, + "step": 1943 + }, + { + "epoch": 0.3742329812065356, + "grad_norm": 3.0675740117235444, + "learning_rate": 1.8733098495784864e-05, + "loss": 2.3601, + "step": 1944 + }, + { + "epoch": 0.3742329812065356, + "lm_loss": 2.1866, + "step": 1944, + "vm_loss": 0.1662 + }, + { + "epoch": 0.3742329812065356, + "lm_loss": 2.3248, + "step": 1944, + "vm_loss": 0.2112 + }, + { + "epoch": 0.3742329812065356, + "lm_loss": 1.9464, + "step": 1944, + "vm_loss": 0.1195 + }, + { + "epoch": 0.3742329812065356, + "lm_loss": 2.2396, + "step": 1944, + "vm_loss": 0.2181 + }, + { + "epoch": 0.3742329812065356, + "lm_loss": 2.1292, + "step": 1944, + "vm_loss": 0.1716 + }, + { + "epoch": 0.3742329812065356, + "lm_loss": 2.1945, + "step": 1944, + "vm_loss": 0.2114 + }, + { + "epoch": 0.3742329812065356, + "lm_loss": 2.0653, + "step": 1944, + "vm_loss": 0.1821 + }, + { + "epoch": 0.3742329812065356, + "lm_loss": 2.05, + "step": 1944, + "vm_loss": 0.129 + }, + { + "epoch": 0.37442548788411095, + "grad_norm": 2.8227348357938826, + "learning_rate": 1.873157914066672e-05, + "loss": 2.3979, + "step": 1945 + }, + { + "epoch": 0.3746179945616864, + "grad_norm": 2.8041852272865286, + "learning_rate": 1.8730058936727367e-05, + "loss": 2.4049, + "step": 1946 + }, + { + "epoch": 0.37481050123926174, + "grad_norm": 3.1714044280707805, + "learning_rate": 1.8728537884114603e-05, + "loss": 2.3971, + "step": 1947 + }, + { + "epoch": 0.3750030079168371, + "grad_norm": 2.8849558772183075, + "learning_rate": 1.8727015982976287e-05, + "loss": 2.3108, + "step": 1948 + }, + { + "epoch": 0.37519551459441247, + "grad_norm": 2.7366992104351677, + "learning_rate": 1.8725493233460362e-05, + "loss": 2.3581, + "step": 1949 + }, + { + "epoch": 0.3753880212719879, + "grad_norm": 2.9404370867883247, + "learning_rate": 1.8723969635714863e-05, + "loss": 2.3992, + "step": 1950 + }, + { + "epoch": 0.37558052794956326, + "grad_norm": 3.140588166744062, + "learning_rate": 1.872244518988791e-05, + "loss": 2.4254, + "step": 1951 + }, + { + "epoch": 0.3757730346271386, + "grad_norm": 2.780856850779236, + "learning_rate": 1.8720919896127686e-05, + "loss": 2.346, + "step": 1952 + }, + { + "epoch": 0.3757730346271386, + "lm_loss": 2.0607, + "step": 1952, + "vm_loss": 0.149 + }, + { + "epoch": 0.3757730346271386, + "lm_loss": 2.2176, + "step": 1952, + "vm_loss": 0.1687 + }, + { + "epoch": 0.3757730346271386, + "lm_loss": 2.4069, + "step": 1952, + "vm_loss": 0.1449 + }, + { + "epoch": 0.3757730346271386, + "lm_loss": 2.4947, + "step": 1952, + "vm_loss": 0.1372 + }, + { + "epoch": 0.3757730346271386, + "lm_loss": 2.1389, + "step": 1952, + "vm_loss": 0.1717 + }, + { + "epoch": 0.3757730346271386, + "lm_loss": 2.1236, + "step": 1952, + "vm_loss": 0.136 + }, + { + "epoch": 0.3757730346271386, + "lm_loss": 2.2782, + "step": 1952, + "vm_loss": 0.1539 + }, + { + "epoch": 0.3757730346271386, + "lm_loss": 2.1217, + "step": 1952, + "vm_loss": 0.1257 + }, + { + "epoch": 0.375965541304714, + "grad_norm": 3.3152655201256227, + "learning_rate": 1.8719393754582476e-05, + "loss": 2.3488, + "step": 1953 + }, + { + "epoch": 0.3761580479822894, + "grad_norm": 3.2171576874458947, + "learning_rate": 1.8717866765400638e-05, + "loss": 2.3665, + "step": 1954 + }, + { + "epoch": 0.3763505546598648, + "grad_norm": 3.11254153086239, + "learning_rate": 1.871633892873062e-05, + "loss": 2.3429, + "step": 1955 + }, + { + "epoch": 0.37654306133744014, + "grad_norm": 2.8596833908224295, + "learning_rate": 1.871481024472094e-05, + "loss": 2.3372, + "step": 1956 + }, + { + "epoch": 0.3767355680150155, + "grad_norm": 3.804049061089317, + "learning_rate": 1.871328071352021e-05, + "loss": 2.353, + "step": 1957 + }, + { + "epoch": 0.3769280746925909, + "grad_norm": 3.2778849941357224, + "learning_rate": 1.871175033527712e-05, + "loss": 2.3991, + "step": 1958 + }, + { + "epoch": 0.3771205813701663, + "grad_norm": 3.5397498020860474, + "learning_rate": 1.871021911014044e-05, + "loss": 2.4213, + "step": 1959 + }, + { + "epoch": 0.37731308804774166, + "grad_norm": 3.1265352783859597, + "learning_rate": 1.870868703825903e-05, + "loss": 2.3497, + "step": 1960 + }, + { + "epoch": 0.37731308804774166, + "lm_loss": 2.3613, + "step": 1960, + "vm_loss": 0.146 + }, + { + "epoch": 0.37731308804774166, + "lm_loss": 2.0751, + "step": 1960, + "vm_loss": 0.1317 + }, + { + "epoch": 0.37731308804774166, + "lm_loss": 1.8618, + "step": 1960, + "vm_loss": 0.1428 + }, + { + "epoch": 0.37731308804774166, + "lm_loss": 2.4324, + "step": 1960, + "vm_loss": 0.1545 + }, + { + "epoch": 0.37731308804774166, + "lm_loss": 2.4073, + "step": 1960, + "vm_loss": 0.2035 + }, + { + "epoch": 0.37731308804774166, + "lm_loss": 2.2026, + "step": 1960, + "vm_loss": 0.2031 + }, + { + "epoch": 0.37731308804774166, + "lm_loss": 2.5163, + "step": 1960, + "vm_loss": 0.164 + }, + { + "epoch": 0.37731308804774166, + "lm_loss": 2.3539, + "step": 1960, + "vm_loss": 0.1579 + }, + { + "epoch": 0.37750559472531703, + "grad_norm": 3.244911354964728, + "learning_rate": 1.870715411978182e-05, + "loss": 2.3894, + "step": 1961 + }, + { + "epoch": 0.3776981014028924, + "grad_norm": 3.333701896025753, + "learning_rate": 1.8705620354857833e-05, + "loss": 2.4247, + "step": 1962 + }, + { + "epoch": 0.3778906080804678, + "grad_norm": 3.123706656005585, + "learning_rate": 1.8704085743636172e-05, + "loss": 2.4053, + "step": 1963 + }, + { + "epoch": 0.3780831147580432, + "grad_norm": 3.0089791592640722, + "learning_rate": 1.8702550286266016e-05, + "loss": 2.3529, + "step": 1964 + }, + { + "epoch": 0.37827562143561855, + "grad_norm": 3.13980250888995, + "learning_rate": 1.8701013982896637e-05, + "loss": 2.3809, + "step": 1965 + }, + { + "epoch": 0.3784681281131939, + "grad_norm": 2.9609601146988185, + "learning_rate": 1.869947683367738e-05, + "loss": 2.3352, + "step": 1966 + }, + { + "epoch": 0.3786606347907693, + "grad_norm": 3.4079749990468704, + "learning_rate": 1.8697938838757676e-05, + "loss": 2.3317, + "step": 1967 + }, + { + "epoch": 0.3788531414683447, + "grad_norm": 3.007512275197292, + "learning_rate": 1.8696399998287035e-05, + "loss": 2.4, + "step": 1968 + }, + { + "epoch": 0.3788531414683447, + "lm_loss": 2.272, + "step": 1968, + "vm_loss": 0.1532 + }, + { + "epoch": 0.3788531414683447, + "lm_loss": 2.1801, + "step": 1968, + "vm_loss": 0.1815 + }, + { + "epoch": 0.3788531414683447, + "lm_loss": 2.2943, + "step": 1968, + "vm_loss": 0.2011 + }, + { + "epoch": 0.3788531414683447, + "lm_loss": 2.1813, + "step": 1968, + "vm_loss": 0.2048 + }, + { + "epoch": 0.3788531414683447, + "lm_loss": 2.1738, + "step": 1968, + "vm_loss": 0.1232 + }, + { + "epoch": 0.3788531414683447, + "lm_loss": 2.1781, + "step": 1968, + "vm_loss": 0.1709 + }, + { + "epoch": 0.3788531414683447, + "lm_loss": 2.374, + "step": 1968, + "vm_loss": 0.191 + }, + { + "epoch": 0.3788531414683447, + "lm_loss": 2.0512, + "step": 1968, + "vm_loss": 0.1974 + }, + { + "epoch": 0.37904564814592007, + "grad_norm": 3.369135889053919, + "learning_rate": 1.869486031241505e-05, + "loss": 2.3859, + "step": 1969 + }, + { + "epoch": 0.37923815482349543, + "grad_norm": 3.1435506605616523, + "learning_rate": 1.869331978129141e-05, + "loss": 2.3138, + "step": 1970 + }, + { + "epoch": 0.3794306615010708, + "grad_norm": 3.02540120868267, + "learning_rate": 1.869177840506587e-05, + "loss": 2.367, + "step": 1971 + }, + { + "epoch": 0.3796231681786462, + "grad_norm": 2.862784629608692, + "learning_rate": 1.8690236183888266e-05, + "loss": 2.3668, + "step": 1972 + }, + { + "epoch": 0.3798156748562216, + "grad_norm": 2.883959863544139, + "learning_rate": 1.868869311790852e-05, + "loss": 2.35, + "step": 1973 + }, + { + "epoch": 0.38000818153379695, + "grad_norm": 3.2223476977375585, + "learning_rate": 1.868714920727665e-05, + "loss": 2.4115, + "step": 1974 + }, + { + "epoch": 0.3802006882113723, + "grad_norm": 2.843032120928401, + "learning_rate": 1.8685604452142732e-05, + "loss": 2.3609, + "step": 1975 + }, + { + "epoch": 0.3803931948889477, + "grad_norm": 2.9367593648056785, + "learning_rate": 1.868405885265694e-05, + "loss": 2.3769, + "step": 1976 + }, + { + "epoch": 0.3803931948889477, + "lm_loss": 2.2925, + "step": 1976, + "vm_loss": 0.1757 + }, + { + "epoch": 0.3803931948889477, + "lm_loss": 2.3714, + "step": 1976, + "vm_loss": 0.1435 + }, + { + "epoch": 0.3803931948889477, + "lm_loss": 2.2931, + "step": 1976, + "vm_loss": 0.2274 + }, + { + "epoch": 0.3803931948889477, + "lm_loss": 2.4703, + "step": 1976, + "vm_loss": 0.1552 + }, + { + "epoch": 0.3803931948889477, + "lm_loss": 2.291, + "step": 1976, + "vm_loss": 0.1675 + }, + { + "epoch": 0.3803931948889477, + "lm_loss": 2.0574, + "step": 1976, + "vm_loss": 0.1851 + }, + { + "epoch": 0.3803931948889477, + "lm_loss": 2.1833, + "step": 1976, + "vm_loss": 0.1176 + }, + { + "epoch": 0.3803931948889477, + "lm_loss": 2.3593, + "step": 1976, + "vm_loss": 0.1253 + }, + { + "epoch": 0.3805857015665231, + "grad_norm": 3.402916162567342, + "learning_rate": 1.8682512408969522e-05, + "loss": 2.3653, + "step": 1977 + }, + { + "epoch": 0.38077820824409847, + "grad_norm": 3.0036919034170952, + "learning_rate": 1.8680965121230822e-05, + "loss": 2.3696, + "step": 1978 + }, + { + "epoch": 0.38097071492167384, + "grad_norm": 3.0463269897308956, + "learning_rate": 1.8679416989591245e-05, + "loss": 2.3773, + "step": 1979 + }, + { + "epoch": 0.3811632215992492, + "grad_norm": 3.264879139699, + "learning_rate": 1.86778680142013e-05, + "loss": 2.3906, + "step": 1980 + }, + { + "epoch": 0.3813557282768246, + "grad_norm": 3.1101290285189256, + "learning_rate": 1.867631819521156e-05, + "loss": 2.3463, + "step": 1981 + }, + { + "epoch": 0.3815482349544, + "grad_norm": 2.689597183845588, + "learning_rate": 1.8674767532772688e-05, + "loss": 2.3953, + "step": 1982 + }, + { + "epoch": 0.38174074163197536, + "grad_norm": 2.9753959196268536, + "learning_rate": 1.8673216027035425e-05, + "loss": 2.41, + "step": 1983 + }, + { + "epoch": 0.3819332483095507, + "grad_norm": 3.2863766862537886, + "learning_rate": 1.8671663678150605e-05, + "loss": 2.3829, + "step": 1984 + }, + { + "epoch": 0.3819332483095507, + "lm_loss": 2.6174, + "step": 1984, + "vm_loss": 0.139 + }, + { + "epoch": 0.3819332483095507, + "lm_loss": 2.2023, + "step": 1984, + "vm_loss": 0.1073 + }, + { + "epoch": 0.3819332483095507, + "lm_loss": 2.225, + "step": 1984, + "vm_loss": 0.1706 + }, + { + "epoch": 0.3819332483095507, + "lm_loss": 2.1383, + "step": 1984, + "vm_loss": 0.1905 + }, + { + "epoch": 0.3819332483095507, + "lm_loss": 2.1718, + "step": 1984, + "vm_loss": 0.1452 + }, + { + "epoch": 0.3819332483095507, + "lm_loss": 2.2941, + "step": 1984, + "vm_loss": 0.1618 + }, + { + "epoch": 0.3819332483095507, + "lm_loss": 2.1302, + "step": 1984, + "vm_loss": 0.1981 + }, + { + "epoch": 0.3819332483095507, + "lm_loss": 2.1689, + "step": 1984, + "vm_loss": 0.1744 + }, + { + "epoch": 0.38212575498712614, + "grad_norm": 3.1817050865598833, + "learning_rate": 1.8670110486269137e-05, + "loss": 2.3583, + "step": 1985 + }, + { + "epoch": 0.3823182616647015, + "grad_norm": 2.6313037644499824, + "learning_rate": 1.8668556451542e-05, + "loss": 2.3742, + "step": 1986 + }, + { + "epoch": 0.3825107683422769, + "grad_norm": 2.7610656914380067, + "learning_rate": 1.8667001574120272e-05, + "loss": 2.2977, + "step": 1987 + }, + { + "epoch": 0.38270327501985224, + "grad_norm": 3.34762927773237, + "learning_rate": 1.8665445854155114e-05, + "loss": 2.3691, + "step": 1988 + }, + { + "epoch": 0.3828957816974276, + "grad_norm": 3.0795798803149097, + "learning_rate": 1.8663889291797754e-05, + "loss": 2.4007, + "step": 1989 + }, + { + "epoch": 0.383088288375003, + "grad_norm": 3.4091518069598172, + "learning_rate": 1.8662331887199508e-05, + "loss": 2.4175, + "step": 1990 + }, + { + "epoch": 0.3832807950525784, + "grad_norm": 3.1702664908227343, + "learning_rate": 1.8660773640511782e-05, + "loss": 2.4171, + "step": 1991 + }, + { + "epoch": 0.38347330173015376, + "grad_norm": 3.0870067427358654, + "learning_rate": 1.8659214551886052e-05, + "loss": 2.3238, + "step": 1992 + }, + { + "epoch": 0.38347330173015376, + "lm_loss": 2.3383, + "step": 1992, + "vm_loss": 0.1633 + }, + { + "epoch": 0.38347330173015376, + "lm_loss": 2.3296, + "step": 1992, + "vm_loss": 0.1498 + }, + { + "epoch": 0.38347330173015376, + "lm_loss": 2.2698, + "step": 1992, + "vm_loss": 0.1439 + }, + { + "epoch": 0.38347330173015376, + "lm_loss": 2.4423, + "step": 1992, + "vm_loss": 0.1761 + }, + { + "epoch": 0.38347330173015376, + "lm_loss": 2.3427, + "step": 1992, + "vm_loss": 0.1547 + }, + { + "epoch": 0.38347330173015376, + "lm_loss": 2.5469, + "step": 1992, + "vm_loss": 0.201 + }, + { + "epoch": 0.38347330173015376, + "lm_loss": 2.3079, + "step": 1992, + "vm_loss": 0.1995 + }, + { + "epoch": 0.38347330173015376, + "lm_loss": 2.2981, + "step": 1992, + "vm_loss": 0.1786 + }, + { + "epoch": 0.3836658084077291, + "grad_norm": 2.839682502415458, + "learning_rate": 1.8657654621473884e-05, + "loss": 2.3972, + "step": 1993 + }, + { + "epoch": 0.38385831508530455, + "grad_norm": 3.6968356296057094, + "learning_rate": 1.8656093849426927e-05, + "loss": 2.3528, + "step": 1994 + }, + { + "epoch": 0.3840508217628799, + "grad_norm": 2.899987226556135, + "learning_rate": 1.8654532235896897e-05, + "loss": 2.3644, + "step": 1995 + }, + { + "epoch": 0.3842433284404553, + "grad_norm": 3.2858672919331573, + "learning_rate": 1.8652969781035617e-05, + "loss": 2.3945, + "step": 1996 + }, + { + "epoch": 0.38443583511803064, + "grad_norm": 2.8878937760027266, + "learning_rate": 1.8651406484994967e-05, + "loss": 2.3974, + "step": 1997 + }, + { + "epoch": 0.384628341795606, + "grad_norm": 3.3099261677660308, + "learning_rate": 1.8649842347926922e-05, + "loss": 2.3715, + "step": 1998 + }, + { + "epoch": 0.38482084847318143, + "grad_norm": 3.4595333570132487, + "learning_rate": 1.8648277369983537e-05, + "loss": 2.3914, + "step": 1999 + }, + { + "epoch": 0.3850133551507568, + "grad_norm": 2.7680952978850693, + "learning_rate": 1.8646711551316953e-05, + "loss": 2.3656, + "step": 2000 + }, + { + "epoch": 0.3850133551507568, + "lm_loss": 2.407, + "step": 2000, + "vm_loss": 0.1636 + }, + { + "epoch": 0.3850133551507568, + "lm_loss": 2.353, + "step": 2000, + "vm_loss": 0.1611 + }, + { + "epoch": 0.3850133551507568, + "lm_loss": 2.3701, + "step": 2000, + "vm_loss": 0.1459 + }, + { + "epoch": 0.3850133551507568, + "lm_loss": 2.2167, + "step": 2000, + "vm_loss": 0.1705 + }, + { + "epoch": 0.3850133551507568, + "lm_loss": 2.3023, + "step": 2000, + "vm_loss": 0.1992 + }, + { + "epoch": 0.3850133551507568, + "lm_loss": 2.458, + "step": 2000, + "vm_loss": 0.1757 + }, + { + "epoch": 0.3850133551507568, + "lm_loss": 2.421, + "step": 2000, + "vm_loss": 0.1864 + }, + { + "epoch": 0.3850133551507568, + "lm_loss": 1.949, + "step": 2000, + "vm_loss": 0.2042 + }, + { + "epoch": 0.38520586182833216, + "grad_norm": 2.8440346852863283, + "learning_rate": 1.8645144892079375e-05, + "loss": 2.3992, + "step": 2001 + }, + { + "epoch": 0.38539836850590753, + "grad_norm": 3.0163884086557418, + "learning_rate": 1.8643577392423117e-05, + "loss": 2.3894, + "step": 2002 + }, + { + "epoch": 0.38559087518348295, + "grad_norm": 3.246043913748404, + "learning_rate": 1.8642009052500545e-05, + "loss": 2.4099, + "step": 2003 + }, + { + "epoch": 0.3857833818610583, + "grad_norm": 2.839426449676454, + "learning_rate": 1.8640439872464134e-05, + "loss": 2.3676, + "step": 2004 + }, + { + "epoch": 0.3859758885386337, + "grad_norm": 3.343379803114167, + "learning_rate": 1.8638869852466427e-05, + "loss": 2.3827, + "step": 2005 + }, + { + "epoch": 0.38616839521620905, + "grad_norm": 3.0302767493948917, + "learning_rate": 1.8637298992660042e-05, + "loss": 2.412, + "step": 2006 + }, + { + "epoch": 0.3863609018937844, + "grad_norm": 2.9078652856877047, + "learning_rate": 1.8635727293197692e-05, + "loss": 2.3806, + "step": 2007 + }, + { + "epoch": 0.38655340857135984, + "grad_norm": 3.0201214146653195, + "learning_rate": 1.8634154754232166e-05, + "loss": 2.4042, + "step": 2008 + }, + { + "epoch": 0.38655340857135984, + "lm_loss": 2.2746, + "step": 2008, + "vm_loss": 0.0923 + }, + { + "epoch": 0.38655340857135984, + "lm_loss": 2.3249, + "step": 2008, + "vm_loss": 0.2573 + }, + { + "epoch": 0.38655340857135984, + "lm_loss": 2.2309, + "step": 2008, + "vm_loss": 0.1451 + }, + { + "epoch": 0.38655340857135984, + "lm_loss": 2.1614, + "step": 2008, + "vm_loss": 0.2161 + }, + { + "epoch": 0.38655340857135984, + "lm_loss": 2.1619, + "step": 2008, + "vm_loss": 0.1456 + }, + { + "epoch": 0.38655340857135984, + "lm_loss": 2.1641, + "step": 2008, + "vm_loss": 0.1638 + }, + { + "epoch": 0.38655340857135984, + "lm_loss": 1.9329, + "step": 2008, + "vm_loss": 0.1447 + }, + { + "epoch": 0.38655340857135984, + "lm_loss": 2.0703, + "step": 2008, + "vm_loss": 0.2889 + }, + { + "epoch": 0.3867459152489352, + "grad_norm": 3.0136423275193147, + "learning_rate": 1.863258137591634e-05, + "loss": 2.3728, + "step": 2009 + }, + { + "epoch": 0.38693842192651057, + "grad_norm": 2.879018694904631, + "learning_rate": 1.8631007158403155e-05, + "loss": 2.3785, + "step": 2010 + }, + { + "epoch": 0.38713092860408593, + "grad_norm": 2.85103774703889, + "learning_rate": 1.8629432101845655e-05, + "loss": 2.3931, + "step": 2011 + }, + { + "epoch": 0.38732343528166135, + "grad_norm": 2.8668263695269784, + "learning_rate": 1.862785620639695e-05, + "loss": 2.3724, + "step": 2012 + }, + { + "epoch": 0.3875159419592367, + "grad_norm": 2.5701259852923335, + "learning_rate": 1.8626279472210237e-05, + "loss": 2.3411, + "step": 2013 + }, + { + "epoch": 0.3877084486368121, + "grad_norm": 3.2875170339065534, + "learning_rate": 1.8624701899438802e-05, + "loss": 2.3735, + "step": 2014 + }, + { + "epoch": 0.38790095531438745, + "grad_norm": 3.126993072709185, + "learning_rate": 1.8623123488235995e-05, + "loss": 2.3321, + "step": 2015 + }, + { + "epoch": 0.3880934619919629, + "grad_norm": 2.6647001464043987, + "learning_rate": 1.862154423875527e-05, + "loss": 2.3769, + "step": 2016 + }, + { + "epoch": 0.3880934619919629, + "lm_loss": 2.1819, + "step": 2016, + "vm_loss": 0.1323 + }, + { + "epoch": 0.3880934619919629, + "lm_loss": 2.2146, + "step": 2016, + "vm_loss": 0.2051 + }, + { + "epoch": 0.3880934619919629, + "lm_loss": 2.2675, + "step": 2016, + "vm_loss": 0.1738 + }, + { + "epoch": 0.3880934619919629, + "lm_loss": 2.137, + "step": 2016, + "vm_loss": 0.1774 + }, + { + "epoch": 0.3880934619919629, + "lm_loss": 2.2228, + "step": 2016, + "vm_loss": 0.2099 + }, + { + "epoch": 0.3880934619919629, + "lm_loss": 2.199, + "step": 2016, + "vm_loss": 0.1647 + }, + { + "epoch": 0.3880934619919629, + "lm_loss": 2.2623, + "step": 2016, + "vm_loss": 0.263 + }, + { + "epoch": 0.3880934619919629, + "lm_loss": 2.4624, + "step": 2016, + "vm_loss": 0.1652 + }, + { + "epoch": 0.38828596866953824, + "grad_norm": 3.3470613889497596, + "learning_rate": 1.861996415115014e-05, + "loss": 2.4091, + "step": 2017 + }, + { + "epoch": 0.3884784753471136, + "grad_norm": 3.616999162685489, + "learning_rate": 1.861838322557421e-05, + "loss": 2.3646, + "step": 2018 + }, + { + "epoch": 0.38867098202468897, + "grad_norm": 3.0673270312334022, + "learning_rate": 1.861680146218117e-05, + "loss": 2.3448, + "step": 2019 + }, + { + "epoch": 0.38886348870226434, + "grad_norm": 3.1477514897276997, + "learning_rate": 1.8615218861124787e-05, + "loss": 2.3796, + "step": 2020 + }, + { + "epoch": 0.38905599537983976, + "grad_norm": 3.2590006522776997, + "learning_rate": 1.8613635422558913e-05, + "loss": 2.3036, + "step": 2021 + }, + { + "epoch": 0.3892485020574151, + "grad_norm": 3.2819230314377688, + "learning_rate": 1.8612051146637477e-05, + "loss": 2.4464, + "step": 2022 + }, + { + "epoch": 0.3894410087349905, + "grad_norm": 3.4984747289433997, + "learning_rate": 1.8610466033514483e-05, + "loss": 2.3916, + "step": 2023 + }, + { + "epoch": 0.38963351541256586, + "grad_norm": 2.9443748339448943, + "learning_rate": 1.8608880083344033e-05, + "loss": 2.4302, + "step": 2024 + }, + { + "epoch": 0.38963351541256586, + "lm_loss": 2.5331, + "step": 2024, + "vm_loss": 0.1517 + }, + { + "epoch": 0.38963351541256586, + "lm_loss": 2.1304, + "step": 2024, + "vm_loss": 0.1903 + }, + { + "epoch": 0.38963351541256586, + "lm_loss": 2.4083, + "step": 2024, + "vm_loss": 0.1096 + }, + { + "epoch": 0.38963351541256586, + "lm_loss": 2.3711, + "step": 2024, + "vm_loss": 0.1546 + }, + { + "epoch": 0.38963351541256586, + "lm_loss": 2.2704, + "step": 2024, + "vm_loss": 0.148 + }, + { + "epoch": 0.38963351541256586, + "lm_loss": 2.5538, + "step": 2024, + "vm_loss": 0.2058 + }, + { + "epoch": 0.38963351541256586, + "lm_loss": 2.3603, + "step": 2024, + "vm_loss": 0.2367 + }, + { + "epoch": 0.38963351541256586, + "lm_loss": 2.1702, + "step": 2024, + "vm_loss": 0.2124 + }, + { + "epoch": 0.3898260220901413, + "grad_norm": 3.4604129325753603, + "learning_rate": 1.86072932962803e-05, + "loss": 2.384, + "step": 2025 + }, + { + "epoch": 0.39001852876771664, + "grad_norm": 3.145886776532001, + "learning_rate": 1.860570567247754e-05, + "loss": 2.3878, + "step": 2026 + }, + { + "epoch": 0.390211035445292, + "grad_norm": 2.583739171946118, + "learning_rate": 1.8604117212090084e-05, + "loss": 2.3341, + "step": 2027 + }, + { + "epoch": 0.3904035421228674, + "grad_norm": 3.353559198885944, + "learning_rate": 1.860252791527236e-05, + "loss": 2.4004, + "step": 2028 + }, + { + "epoch": 0.39059604880044274, + "grad_norm": 3.029609564913458, + "learning_rate": 1.8600937782178868e-05, + "loss": 2.3855, + "step": 2029 + }, + { + "epoch": 0.39078855547801816, + "grad_norm": 2.9129425309627703, + "learning_rate": 1.859934681296418e-05, + "loss": 2.3812, + "step": 2030 + }, + { + "epoch": 0.39098106215559353, + "grad_norm": 3.5733235480970036, + "learning_rate": 1.8597755007782966e-05, + "loss": 2.3406, + "step": 2031 + }, + { + "epoch": 0.3911735688331689, + "grad_norm": 3.238856079367312, + "learning_rate": 1.859616236678997e-05, + "loss": 2.3559, + "step": 2032 + }, + { + "epoch": 0.3911735688331689, + "lm_loss": 2.1058, + "step": 2032, + "vm_loss": 0.2438 + }, + { + "epoch": 0.3911735688331689, + "lm_loss": 2.3157, + "step": 2032, + "vm_loss": 0.1458 + }, + { + "epoch": 0.3911735688331689, + "lm_loss": 2.1296, + "step": 2032, + "vm_loss": 0.1414 + }, + { + "epoch": 0.3911735688331689, + "lm_loss": 2.4608, + "step": 2032, + "vm_loss": 0.1785 + }, + { + "epoch": 0.3911735688331689, + "lm_loss": 2.1726, + "step": 2032, + "vm_loss": 0.1997 + }, + { + "epoch": 0.3911735688331689, + "lm_loss": 2.1331, + "step": 2032, + "vm_loss": 0.1576 + }, + { + "epoch": 0.3911735688331689, + "lm_loss": 2.225, + "step": 2032, + "vm_loss": 0.1677 + }, + { + "epoch": 0.3911735688331689, + "lm_loss": 2.1458, + "step": 2032, + "vm_loss": 0.2434 + }, + { + "epoch": 0.39136607551074426, + "grad_norm": 2.9747930290792097, + "learning_rate": 1.859456889014001e-05, + "loss": 2.3842, + "step": 2033 + }, + { + "epoch": 0.3915585821883197, + "grad_norm": 3.4932337159423663, + "learning_rate": 1.8592974577988e-05, + "loss": 2.358, + "step": 2034 + }, + { + "epoch": 0.39175108886589505, + "grad_norm": 2.7743135368373615, + "learning_rate": 1.8591379430488922e-05, + "loss": 2.3519, + "step": 2035 + }, + { + "epoch": 0.3919435955434704, + "grad_norm": 3.2273189259824573, + "learning_rate": 1.8589783447797852e-05, + "loss": 2.3795, + "step": 2036 + }, + { + "epoch": 0.3921361022210458, + "grad_norm": 3.3361305306755966, + "learning_rate": 1.8588186630069935e-05, + "loss": 2.3414, + "step": 2037 + }, + { + "epoch": 0.39232860889862115, + "grad_norm": 2.892889310143435, + "learning_rate": 1.85865889774604e-05, + "loss": 2.3361, + "step": 2038 + }, + { + "epoch": 0.39252111557619657, + "grad_norm": 3.444314411455052, + "learning_rate": 1.858499049012456e-05, + "loss": 2.3996, + "step": 2039 + }, + { + "epoch": 0.39271362225377193, + "grad_norm": 3.217486455718521, + "learning_rate": 1.8583391168217814e-05, + "loss": 2.3229, + "step": 2040 + }, + { + "epoch": 0.39271362225377193, + "lm_loss": 2.1637, + "step": 2040, + "vm_loss": 0.2215 + }, + { + "epoch": 0.39271362225377193, + "lm_loss": 2.4445, + "step": 2040, + "vm_loss": 0.1647 + }, + { + "epoch": 0.39271362225377193, + "lm_loss": 2.495, + "step": 2040, + "vm_loss": 0.1476 + }, + { + "epoch": 0.39271362225377193, + "lm_loss": 2.4946, + "step": 2040, + "vm_loss": 0.1149 + }, + { + "epoch": 0.39271362225377193, + "lm_loss": 2.2974, + "step": 2040, + "vm_loss": 0.1919 + }, + { + "epoch": 0.39271362225377193, + "lm_loss": 2.4617, + "step": 2040, + "vm_loss": 0.1682 + }, + { + "epoch": 0.39271362225377193, + "lm_loss": 2.228, + "step": 2040, + "vm_loss": 0.1795 + }, + { + "epoch": 0.39271362225377193, + "lm_loss": 2.2925, + "step": 2040, + "vm_loss": 0.2802 + }, + { + "epoch": 0.3929061289313473, + "grad_norm": 2.6109603581382097, + "learning_rate": 1.8581791011895633e-05, + "loss": 2.4388, + "step": 2041 + }, + { + "epoch": 0.39309863560892266, + "grad_norm": 2.790514869338661, + "learning_rate": 1.858019002131357e-05, + "loss": 2.3692, + "step": 2042 + }, + { + "epoch": 0.3932911422864981, + "grad_norm": 3.0620125357572796, + "learning_rate": 1.8578588196627266e-05, + "loss": 2.3519, + "step": 2043 + }, + { + "epoch": 0.39348364896407345, + "grad_norm": 2.524050758725505, + "learning_rate": 1.857698553799244e-05, + "loss": 2.3478, + "step": 2044 + }, + { + "epoch": 0.3936761556416488, + "grad_norm": 2.922462216205388, + "learning_rate": 1.8575382045564883e-05, + "loss": 2.3428, + "step": 2045 + }, + { + "epoch": 0.3938686623192242, + "grad_norm": 3.1158354683087017, + "learning_rate": 1.857377771950048e-05, + "loss": 2.3585, + "step": 2046 + }, + { + "epoch": 0.39406116899679955, + "grad_norm": 3.046297916046909, + "learning_rate": 1.8572172559955194e-05, + "loss": 2.3604, + "step": 2047 + }, + { + "epoch": 0.39425367567437497, + "grad_norm": 3.4195501727544704, + "learning_rate": 1.8570566567085067e-05, + "loss": 2.317, + "step": 2048 + }, + { + "epoch": 0.39425367567437497, + "lm_loss": 2.1239, + "step": 2048, + "vm_loss": 0.1488 + }, + { + "epoch": 0.39425367567437497, + "lm_loss": 1.8804, + "step": 2048, + "vm_loss": 0.1946 + }, + { + "epoch": 0.39425367567437497, + "lm_loss": 2.1542, + "step": 2048, + "vm_loss": 0.2163 + }, + { + "epoch": 0.39425367567437497, + "lm_loss": 2.1105, + "step": 2048, + "vm_loss": 0.1674 + }, + { + "epoch": 0.39425367567437497, + "lm_loss": 1.8584, + "step": 2048, + "vm_loss": 0.125 + }, + { + "epoch": 0.39425367567437497, + "lm_loss": 2.1507, + "step": 2048, + "vm_loss": 0.1854 + }, + { + "epoch": 0.39425367567437497, + "lm_loss": 2.1636, + "step": 2048, + "vm_loss": 0.1709 + }, + { + "epoch": 0.39425367567437497, + "lm_loss": 2.3265, + "step": 2048, + "vm_loss": 0.1319 + }, + { + "epoch": 0.39444618235195034, + "grad_norm": 3.1153836592471915, + "learning_rate": 1.8568959741046217e-05, + "loss": 2.3245, + "step": 2049 + }, + { + "epoch": 0.3946386890295257, + "grad_norm": 2.3565713206396324, + "learning_rate": 1.8567352081994852e-05, + "loss": 2.356, + "step": 2050 + }, + { + "epoch": 0.39483119570710107, + "grad_norm": 3.095627990215364, + "learning_rate": 1.8565743590087256e-05, + "loss": 2.3211, + "step": 2051 + }, + { + "epoch": 0.3950237023846765, + "grad_norm": 3.0688479603071634, + "learning_rate": 1.8564134265479798e-05, + "loss": 2.3572, + "step": 2052 + }, + { + "epoch": 0.39521620906225186, + "grad_norm": 2.773118438442102, + "learning_rate": 1.856252410832892e-05, + "loss": 2.3896, + "step": 2053 + }, + { + "epoch": 0.3954087157398272, + "grad_norm": 2.9907654759929025, + "learning_rate": 1.856091311879115e-05, + "loss": 2.3259, + "step": 2054 + }, + { + "epoch": 0.3956012224174026, + "grad_norm": 2.593740916753053, + "learning_rate": 1.85593012970231e-05, + "loss": 2.3194, + "step": 2055 + }, + { + "epoch": 0.395793729094978, + "grad_norm": 2.881627558220902, + "learning_rate": 1.855768864318146e-05, + "loss": 2.371, + "step": 2056 + }, + { + "epoch": 0.395793729094978, + "lm_loss": 2.1765, + "step": 2056, + "vm_loss": 0.1328 + }, + { + "epoch": 0.395793729094978, + "lm_loss": 1.94, + "step": 2056, + "vm_loss": 0.1777 + }, + { + "epoch": 0.395793729094978, + "lm_loss": 2.0098, + "step": 2056, + "vm_loss": 0.1802 + }, + { + "epoch": 0.395793729094978, + "lm_loss": 2.2663, + "step": 2056, + "vm_loss": 0.1252 + }, + { + "epoch": 0.395793729094978, + "lm_loss": 2.1563, + "step": 2056, + "vm_loss": 0.1936 + }, + { + "epoch": 0.395793729094978, + "lm_loss": 2.6052, + "step": 2056, + "vm_loss": 0.1602 + }, + { + "epoch": 0.395793729094978, + "lm_loss": 2.2295, + "step": 2056, + "vm_loss": 0.1267 + }, + { + "epoch": 0.395793729094978, + "lm_loss": 2.3169, + "step": 2056, + "vm_loss": 0.1474 + }, + { + "epoch": 0.3959862357725534, + "grad_norm": 3.16326948920532, + "learning_rate": 1.8556075157422996e-05, + "loss": 2.3685, + "step": 2057 + }, + { + "epoch": 0.39617874245012874, + "grad_norm": 3.034727417399684, + "learning_rate": 1.8554460839904566e-05, + "loss": 2.3517, + "step": 2058 + }, + { + "epoch": 0.3963712491277041, + "grad_norm": 2.743832446696198, + "learning_rate": 1.8552845690783096e-05, + "loss": 2.397, + "step": 2059 + }, + { + "epoch": 0.39656375580527947, + "grad_norm": 2.665699129525352, + "learning_rate": 1.8551229710215604e-05, + "loss": 2.3751, + "step": 2060 + }, + { + "epoch": 0.3967562624828549, + "grad_norm": 2.870773403906124, + "learning_rate": 1.8549612898359185e-05, + "loss": 2.3075, + "step": 2061 + }, + { + "epoch": 0.39694876916043026, + "grad_norm": 2.718877430868723, + "learning_rate": 1.8547995255371007e-05, + "loss": 2.3559, + "step": 2062 + }, + { + "epoch": 0.3971412758380056, + "grad_norm": 3.003171338836357, + "learning_rate": 1.8546376781408328e-05, + "loss": 2.4221, + "step": 2063 + }, + { + "epoch": 0.397333782515581, + "grad_norm": 2.69965600956393, + "learning_rate": 1.8544757476628487e-05, + "loss": 2.3154, + "step": 2064 + }, + { + "epoch": 0.397333782515581, + "lm_loss": 2.2021, + "step": 2064, + "vm_loss": 0.1739 + }, + { + "epoch": 0.397333782515581, + "lm_loss": 2.075, + "step": 2064, + "vm_loss": 0.1212 + }, + { + "epoch": 0.397333782515581, + "lm_loss": 1.9951, + "step": 2064, + "vm_loss": 0.1733 + }, + { + "epoch": 0.397333782515581, + "lm_loss": 2.2588, + "step": 2064, + "vm_loss": 0.1512 + }, + { + "epoch": 0.397333782515581, + "lm_loss": 2.3122, + "step": 2064, + "vm_loss": 0.1398 + }, + { + "epoch": 0.397333782515581, + "lm_loss": 2.1324, + "step": 2064, + "vm_loss": 0.2003 + }, + { + "epoch": 0.397333782515581, + "lm_loss": 2.1604, + "step": 2064, + "vm_loss": 0.1192 + }, + { + "epoch": 0.397333782515581, + "lm_loss": 2.1421, + "step": 2064, + "vm_loss": 0.1569 + }, + { + "epoch": 0.3975262891931564, + "grad_norm": 2.988678085262676, + "learning_rate": 1.85431373411889e-05, + "loss": 2.3549, + "step": 2065 + }, + { + "epoch": 0.3977187958707318, + "grad_norm": 2.8748570422625446, + "learning_rate": 1.8541516375247063e-05, + "loss": 2.3505, + "step": 2066 + }, + { + "epoch": 0.39791130254830714, + "grad_norm": 2.9680860132199283, + "learning_rate": 1.853989457896056e-05, + "loss": 2.3902, + "step": 2067 + }, + { + "epoch": 0.3981038092258825, + "grad_norm": 3.32396089252872, + "learning_rate": 1.8538271952487044e-05, + "loss": 2.3977, + "step": 2068 + }, + { + "epoch": 0.3982963159034579, + "grad_norm": 3.1991700906287055, + "learning_rate": 1.853664849598426e-05, + "loss": 2.3517, + "step": 2069 + }, + { + "epoch": 0.3984888225810333, + "grad_norm": 3.1527773494010525, + "learning_rate": 1.8535024209610026e-05, + "loss": 2.3576, + "step": 2070 + }, + { + "epoch": 0.39868132925860866, + "grad_norm": 3.3321723724755192, + "learning_rate": 1.8533399093522244e-05, + "loss": 2.3578, + "step": 2071 + }, + { + "epoch": 0.39887383593618403, + "grad_norm": 2.9962683363846616, + "learning_rate": 1.8531773147878895e-05, + "loss": 2.3266, + "step": 2072 + }, + { + "epoch": 0.39887383593618403, + "lm_loss": 2.2826, + "step": 2072, + "vm_loss": 0.2171 + }, + { + "epoch": 0.39887383593618403, + "lm_loss": 2.434, + "step": 2072, + "vm_loss": 0.1375 + }, + { + "epoch": 0.39887383593618403, + "lm_loss": 2.2177, + "step": 2072, + "vm_loss": 0.1401 + }, + { + "epoch": 0.39887383593618403, + "lm_loss": 1.7106, + "step": 2072, + "vm_loss": 0.1933 + }, + { + "epoch": 0.39887383593618403, + "lm_loss": 2.3132, + "step": 2072, + "vm_loss": 0.128 + }, + { + "epoch": 0.39887383593618403, + "lm_loss": 2.3104, + "step": 2072, + "vm_loss": 0.1336 + }, + { + "epoch": 0.39887383593618403, + "lm_loss": 2.1642, + "step": 2072, + "vm_loss": 0.131 + }, + { + "epoch": 0.39887383593618403, + "lm_loss": 2.1532, + "step": 2072, + "vm_loss": 0.1404 + }, + { + "epoch": 0.3990663426137594, + "grad_norm": 3.2315090828305393, + "learning_rate": 1.8530146372838045e-05, + "loss": 2.3467, + "step": 2073 + }, + { + "epoch": 0.3992588492913348, + "grad_norm": 2.956983044209395, + "learning_rate": 1.852851876855783e-05, + "loss": 2.335, + "step": 2074 + }, + { + "epoch": 0.3994513559689102, + "grad_norm": 3.2636455416558703, + "learning_rate": 1.8526890335196483e-05, + "loss": 2.3525, + "step": 2075 + }, + { + "epoch": 0.39964386264648555, + "grad_norm": 3.173723207522457, + "learning_rate": 1.8525261072912305e-05, + "loss": 2.3799, + "step": 2076 + }, + { + "epoch": 0.3998363693240609, + "grad_norm": 3.2814888328996474, + "learning_rate": 1.852363098186368e-05, + "loss": 2.3786, + "step": 2077 + }, + { + "epoch": 0.4000288760016363, + "grad_norm": 3.6152925232572755, + "learning_rate": 1.8522000062209078e-05, + "loss": 2.3779, + "step": 2078 + }, + { + "epoch": 0.4002213826792117, + "grad_norm": 3.020599153252269, + "learning_rate": 1.8520368314107036e-05, + "loss": 2.3436, + "step": 2079 + }, + { + "epoch": 0.40041388935678707, + "grad_norm": 3.05394719935269, + "learning_rate": 1.8518735737716188e-05, + "loss": 2.3577, + "step": 2080 + }, + { + "epoch": 0.40041388935678707, + "lm_loss": 1.9162, + "step": 2080, + "vm_loss": 0.1971 + }, + { + "epoch": 0.40041388935678707, + "lm_loss": 2.2728, + "step": 2080, + "vm_loss": 0.163 + }, + { + "epoch": 0.40041388935678707, + "lm_loss": 2.2192, + "step": 2080, + "vm_loss": 0.1723 + }, + { + "epoch": 0.40041388935678707, + "lm_loss": 2.2454, + "step": 2080, + "vm_loss": 0.2285 + }, + { + "epoch": 0.40041388935678707, + "lm_loss": 2.2141, + "step": 2080, + "vm_loss": 0.1989 + }, + { + "epoch": 0.40041388935678707, + "lm_loss": 1.9505, + "step": 2080, + "vm_loss": 0.1622 + }, + { + "epoch": 0.40041388935678707, + "lm_loss": 1.9737, + "step": 2080, + "vm_loss": 0.1511 + }, + { + "epoch": 0.40041388935678707, + "lm_loss": 2.1599, + "step": 2080, + "vm_loss": 0.1551 + }, + { + "epoch": 0.40060639603436243, + "grad_norm": 3.0858368231494193, + "learning_rate": 1.851710233319524e-05, + "loss": 2.3258, + "step": 2081 + }, + { + "epoch": 0.4007989027119378, + "grad_norm": 3.422540185958113, + "learning_rate": 1.8515468100702983e-05, + "loss": 2.4107, + "step": 2082 + }, + { + "epoch": 0.4009914093895132, + "grad_norm": 2.821456428774914, + "learning_rate": 1.851383304039828e-05, + "loss": 2.3222, + "step": 2083 + }, + { + "epoch": 0.4011839160670886, + "grad_norm": 3.142853241007964, + "learning_rate": 1.851219715244008e-05, + "loss": 2.3429, + "step": 2084 + }, + { + "epoch": 0.40137642274466395, + "grad_norm": 3.7750412960300346, + "learning_rate": 1.8510560436987414e-05, + "loss": 2.3645, + "step": 2085 + }, + { + "epoch": 0.4015689294222393, + "grad_norm": 3.029553677458068, + "learning_rate": 1.8508922894199393e-05, + "loss": 2.3588, + "step": 2086 + }, + { + "epoch": 0.40176143609981474, + "grad_norm": 3.1623164441063927, + "learning_rate": 1.8507284524235208e-05, + "loss": 2.3256, + "step": 2087 + }, + { + "epoch": 0.4019539427773901, + "grad_norm": 3.152002749911262, + "learning_rate": 1.850564532725412e-05, + "loss": 2.4107, + "step": 2088 + }, + { + "epoch": 0.4019539427773901, + "lm_loss": 2.2248, + "step": 2088, + "vm_loss": 0.1692 + }, + { + "epoch": 0.4019539427773901, + "lm_loss": 1.8678, + "step": 2088, + "vm_loss": 0.1804 + }, + { + "epoch": 0.4019539427773901, + "lm_loss": 2.1745, + "step": 2088, + "vm_loss": 0.1521 + }, + { + "epoch": 0.4019539427773901, + "lm_loss": 1.9515, + "step": 2088, + "vm_loss": 0.1845 + }, + { + "epoch": 0.4019539427773901, + "lm_loss": 2.129, + "step": 2088, + "vm_loss": 0.1966 + }, + { + "epoch": 0.4019539427773901, + "lm_loss": 2.1459, + "step": 2088, + "vm_loss": 0.1203 + }, + { + "epoch": 0.4019539427773901, + "lm_loss": 2.2922, + "step": 2088, + "vm_loss": 0.1705 + }, + { + "epoch": 0.4019539427773901, + "lm_loss": 2.3448, + "step": 2088, + "vm_loss": 0.175 + }, + { + "epoch": 0.40214644945496547, + "grad_norm": 2.773597977322333, + "learning_rate": 1.8504005303415494e-05, + "loss": 2.349, + "step": 2089 + }, + { + "epoch": 0.40233895613254084, + "grad_norm": 3.139764150050187, + "learning_rate": 1.8502364452878755e-05, + "loss": 2.3327, + "step": 2090 + }, + { + "epoch": 0.4025314628101162, + "grad_norm": 2.895376636997464, + "learning_rate": 1.850072277580341e-05, + "loss": 2.405, + "step": 2091 + }, + { + "epoch": 0.4027239694876916, + "grad_norm": 3.6007083949359076, + "learning_rate": 1.8499080272349057e-05, + "loss": 2.3429, + "step": 2092 + }, + { + "epoch": 0.402916476165267, + "grad_norm": 3.3544394363705448, + "learning_rate": 1.8497436942675366e-05, + "loss": 2.3403, + "step": 2093 + }, + { + "epoch": 0.40310898284284236, + "grad_norm": 3.386412673761896, + "learning_rate": 1.849579278694209e-05, + "loss": 2.3836, + "step": 2094 + }, + { + "epoch": 0.4033014895204177, + "grad_norm": 3.526287711151043, + "learning_rate": 1.849414780530906e-05, + "loss": 2.3513, + "step": 2095 + }, + { + "epoch": 0.40349399619799314, + "grad_norm": 3.4130453504031113, + "learning_rate": 1.8492501997936196e-05, + "loss": 2.3317, + "step": 2096 + }, + { + "epoch": 0.40349399619799314, + "lm_loss": 2.2077, + "step": 2096, + "vm_loss": 0.1421 + }, + { + "epoch": 0.40349399619799314, + "lm_loss": 2.07, + "step": 2096, + "vm_loss": 0.1663 + }, + { + "epoch": 0.40349399619799314, + "lm_loss": 2.3461, + "step": 2096, + "vm_loss": 0.1409 + }, + { + "epoch": 0.40349399619799314, + "lm_loss": 2.2929, + "step": 2096, + "vm_loss": 0.2165 + }, + { + "epoch": 0.40349399619799314, + "lm_loss": 2.23, + "step": 2096, + "vm_loss": 0.1693 + }, + { + "epoch": 0.40349399619799314, + "lm_loss": 2.0502, + "step": 2096, + "vm_loss": 0.1598 + }, + { + "epoch": 0.40349399619799314, + "lm_loss": 2.3213, + "step": 2096, + "vm_loss": 0.2232 + }, + { + "epoch": 0.40349399619799314, + "lm_loss": 2.2544, + "step": 2096, + "vm_loss": 0.2123 + }, + { + "epoch": 0.4036865028755685, + "grad_norm": 2.754506936894068, + "learning_rate": 1.8490855364983488e-05, + "loss": 2.3744, + "step": 2097 + }, + { + "epoch": 0.4038790095531439, + "grad_norm": 3.260158849495847, + "learning_rate": 1.8489207906611003e-05, + "loss": 2.3482, + "step": 2098 + }, + { + "epoch": 0.40407151623071924, + "grad_norm": 3.0051348691845177, + "learning_rate": 1.8487559622978906e-05, + "loss": 2.3223, + "step": 2099 + }, + { + "epoch": 0.4042640229082946, + "grad_norm": 2.9106647062416706, + "learning_rate": 1.8485910514247424e-05, + "loss": 2.3617, + "step": 2100 + }, + { + "epoch": 0.40445652958587003, + "grad_norm": 3.040003452725519, + "learning_rate": 1.8484260580576873e-05, + "loss": 2.3809, + "step": 2101 + }, + { + "epoch": 0.4046490362634454, + "grad_norm": 3.277968031109553, + "learning_rate": 1.8482609822127647e-05, + "loss": 2.3713, + "step": 2102 + }, + { + "epoch": 0.40484154294102076, + "grad_norm": 2.5418297864609767, + "learning_rate": 1.8480958239060226e-05, + "loss": 2.3364, + "step": 2103 + }, + { + "epoch": 0.4050340496185961, + "grad_norm": 3.045313253427444, + "learning_rate": 1.8479305831535156e-05, + "loss": 2.3705, + "step": 2104 + }, + { + "epoch": 0.4050340496185961, + "lm_loss": 2.0037, + "step": 2104, + "vm_loss": 0.1549 + }, + { + "epoch": 0.4050340496185961, + "lm_loss": 2.1825, + "step": 2104, + "vm_loss": 0.1171 + }, + { + "epoch": 0.4050340496185961, + "lm_loss": 2.2291, + "step": 2104, + "vm_loss": 0.2034 + }, + { + "epoch": 0.4050340496185961, + "lm_loss": 2.0108, + "step": 2104, + "vm_loss": 0.164 + }, + { + "epoch": 0.4050340496185961, + "lm_loss": 2.4695, + "step": 2104, + "vm_loss": 0.2458 + }, + { + "epoch": 0.4050340496185961, + "lm_loss": 1.902, + "step": 2104, + "vm_loss": 0.155 + }, + { + "epoch": 0.4050340496185961, + "lm_loss": 2.0359, + "step": 2104, + "vm_loss": 0.2108 + }, + { + "epoch": 0.4050340496185961, + "lm_loss": 2.1429, + "step": 2104, + "vm_loss": 0.1256 + }, + { + "epoch": 0.40522655629617155, + "grad_norm": 2.6914989609059177, + "learning_rate": 1.8477652599713082e-05, + "loss": 2.3515, + "step": 2105 + }, + { + "epoch": 0.4054190629737469, + "grad_norm": 2.9839354131602387, + "learning_rate": 1.8475998543754713e-05, + "loss": 2.3393, + "step": 2106 + }, + { + "epoch": 0.4056115696513223, + "grad_norm": 2.9286941595123963, + "learning_rate": 1.8474343663820844e-05, + "loss": 2.3487, + "step": 2107 + }, + { + "epoch": 0.40580407632889764, + "grad_norm": 2.7218229552518807, + "learning_rate": 1.847268796007235e-05, + "loss": 2.3624, + "step": 2108 + }, + { + "epoch": 0.405996583006473, + "grad_norm": 2.5674365360169564, + "learning_rate": 1.8471031432670194e-05, + "loss": 2.344, + "step": 2109 + }, + { + "epoch": 0.40618908968404843, + "grad_norm": 3.198034870853174, + "learning_rate": 1.8469374081775405e-05, + "loss": 2.388, + "step": 2110 + }, + { + "epoch": 0.4063815963616238, + "grad_norm": 3.1875221751385023, + "learning_rate": 1.84677159075491e-05, + "loss": 2.3866, + "step": 2111 + }, + { + "epoch": 0.40657410303919916, + "grad_norm": 2.9758315548135417, + "learning_rate": 1.8466056910152476e-05, + "loss": 2.3774, + "step": 2112 + }, + { + "epoch": 0.40657410303919916, + "lm_loss": 2.4807, + "step": 2112, + "vm_loss": 0.1375 + }, + { + "epoch": 0.40657410303919916, + "lm_loss": 1.9244, + "step": 2112, + "vm_loss": 0.1513 + }, + { + "epoch": 0.40657410303919916, + "lm_loss": 2.3026, + "step": 2112, + "vm_loss": 0.1758 + }, + { + "epoch": 0.40657410303919916, + "lm_loss": 1.8861, + "step": 2112, + "vm_loss": 0.2457 + }, + { + "epoch": 0.40657410303919916, + "lm_loss": 2.5146, + "step": 2112, + "vm_loss": 0.1584 + }, + { + "epoch": 0.40657410303919916, + "lm_loss": 2.1579, + "step": 2112, + "vm_loss": 0.168 + }, + { + "epoch": 0.40657410303919916, + "lm_loss": 2.178, + "step": 2112, + "vm_loss": 0.1353 + }, + { + "epoch": 0.40657410303919916, + "lm_loss": 2.0944, + "step": 2112, + "vm_loss": 0.2179 + }, + { + "epoch": 0.40676660971677453, + "grad_norm": 3.2055877847242895, + "learning_rate": 1.8464397089746807e-05, + "loss": 2.3455, + "step": 2113 + }, + { + "epoch": 0.40695911639434995, + "grad_norm": 2.9327102257208355, + "learning_rate": 1.8462736446493452e-05, + "loss": 2.3763, + "step": 2114 + }, + { + "epoch": 0.4071516230719253, + "grad_norm": 2.6493428633871234, + "learning_rate": 1.846107498055384e-05, + "loss": 2.3452, + "step": 2115 + }, + { + "epoch": 0.4073441297495007, + "grad_norm": 3.0305954613896686, + "learning_rate": 1.8459412692089497e-05, + "loss": 2.3569, + "step": 2116 + }, + { + "epoch": 0.40753663642707605, + "grad_norm": 3.408555878596119, + "learning_rate": 1.8457749581262006e-05, + "loss": 2.3873, + "step": 2117 + }, + { + "epoch": 0.40772914310465147, + "grad_norm": 2.8994020568100716, + "learning_rate": 1.8456085648233054e-05, + "loss": 2.3589, + "step": 2118 + }, + { + "epoch": 0.40792164978222684, + "grad_norm": 3.2053508022916324, + "learning_rate": 1.845442089316439e-05, + "loss": 2.3044, + "step": 2119 + }, + { + "epoch": 0.4081141564598022, + "grad_norm": 3.389674238590442, + "learning_rate": 1.8452755316217854e-05, + "loss": 2.3681, + "step": 2120 + }, + { + "epoch": 0.4081141564598022, + "lm_loss": 2.2301, + "step": 2120, + "vm_loss": 0.1348 + }, + { + "epoch": 0.4081141564598022, + "lm_loss": 2.1089, + "step": 2120, + "vm_loss": 0.2004 + }, + { + "epoch": 0.4081141564598022, + "lm_loss": 2.3274, + "step": 2120, + "vm_loss": 0.1299 + }, + { + "epoch": 0.4081141564598022, + "lm_loss": 2.4197, + "step": 2120, + "vm_loss": 0.1728 + }, + { + "epoch": 0.4081141564598022, + "lm_loss": 2.0335, + "step": 2120, + "vm_loss": 0.1778 + }, + { + "epoch": 0.4081141564598022, + "lm_loss": 2.3224, + "step": 2120, + "vm_loss": 0.1705 + }, + { + "epoch": 0.4081141564598022, + "lm_loss": 2.0235, + "step": 2120, + "vm_loss": 0.1541 + }, + { + "epoch": 0.4081141564598022, + "lm_loss": 2.2433, + "step": 2120, + "vm_loss": 0.1316 + }, + { + "epoch": 0.40830666313737757, + "grad_norm": 2.808924740374487, + "learning_rate": 1.8451088917555356e-05, + "loss": 2.3595, + "step": 2121 + }, + { + "epoch": 0.40849916981495293, + "grad_norm": 3.212233776272681, + "learning_rate": 1.8449421697338896e-05, + "loss": 2.3535, + "step": 2122 + }, + { + "epoch": 0.40869167649252836, + "grad_norm": 3.0112316348161388, + "learning_rate": 1.8447753655730547e-05, + "loss": 2.3898, + "step": 2123 + }, + { + "epoch": 0.4088841831701037, + "grad_norm": 2.5425130979155326, + "learning_rate": 1.8446084792892464e-05, + "loss": 2.381, + "step": 2124 + }, + { + "epoch": 0.4090766898476791, + "grad_norm": 2.9916612674600174, + "learning_rate": 1.8444415108986882e-05, + "loss": 2.3354, + "step": 2125 + }, + { + "epoch": 0.40926919652525445, + "grad_norm": 3.2384512062452933, + "learning_rate": 1.844274460417612e-05, + "loss": 2.3252, + "step": 2126 + }, + { + "epoch": 0.4094617032028299, + "grad_norm": 2.765098258363053, + "learning_rate": 1.8441073278622562e-05, + "loss": 2.3449, + "step": 2127 + }, + { + "epoch": 0.40965420988040524, + "grad_norm": 3.532905199614185, + "learning_rate": 1.843940113248869e-05, + "loss": 2.3256, + "step": 2128 + }, + { + "epoch": 0.40965420988040524, + "lm_loss": 2.3837, + "step": 2128, + "vm_loss": 0.1744 + }, + { + "epoch": 0.40965420988040524, + "lm_loss": 2.0457, + "step": 2128, + "vm_loss": 0.2055 + }, + { + "epoch": 0.40965420988040524, + "lm_loss": 2.3777, + "step": 2128, + "vm_loss": 0.1501 + }, + { + "epoch": 0.40965420988040524, + "lm_loss": 2.0095, + "step": 2128, + "vm_loss": 0.2033 + }, + { + "epoch": 0.40965420988040524, + "lm_loss": 2.357, + "step": 2128, + "vm_loss": 0.1724 + }, + { + "epoch": 0.40965420988040524, + "lm_loss": 2.0287, + "step": 2128, + "vm_loss": 0.2898 + }, + { + "epoch": 0.40965420988040524, + "lm_loss": 2.2274, + "step": 2128, + "vm_loss": 0.1408 + }, + { + "epoch": 0.40965420988040524, + "lm_loss": 2.2978, + "step": 2128, + "vm_loss": 0.2165 + }, + { + "epoch": 0.4098467165579806, + "grad_norm": 3.4233780188970067, + "learning_rate": 1.8437728165937056e-05, + "loss": 2.3334, + "step": 2129 + }, + { + "epoch": 0.41003922323555597, + "grad_norm": 3.036727315418898, + "learning_rate": 1.8436054379130296e-05, + "loss": 2.3704, + "step": 2130 + }, + { + "epoch": 0.41023172991313134, + "grad_norm": 3.473673653482789, + "learning_rate": 1.8434379772231122e-05, + "loss": 2.3206, + "step": 2131 + }, + { + "epoch": 0.41042423659070676, + "grad_norm": 3.3594688254290332, + "learning_rate": 1.8432704345402324e-05, + "loss": 2.3723, + "step": 2132 + }, + { + "epoch": 0.4106167432682821, + "grad_norm": 2.8684637280670726, + "learning_rate": 1.8431028098806783e-05, + "loss": 2.4048, + "step": 2133 + }, + { + "epoch": 0.4108092499458575, + "grad_norm": 2.9474018999457883, + "learning_rate": 1.8429351032607444e-05, + "loss": 2.3326, + "step": 2134 + }, + { + "epoch": 0.41100175662343286, + "grad_norm": 4.100170445873237, + "learning_rate": 1.842767314696734e-05, + "loss": 2.3102, + "step": 2135 + }, + { + "epoch": 0.4111942633010083, + "grad_norm": 2.994105072789109, + "learning_rate": 1.8425994442049585e-05, + "loss": 2.3435, + "step": 2136 + }, + { + "epoch": 0.4111942633010083, + "lm_loss": 2.0731, + "step": 2136, + "vm_loss": 0.1803 + }, + { + "epoch": 0.4111942633010083, + "lm_loss": 2.1314, + "step": 2136, + "vm_loss": 0.2025 + }, + { + "epoch": 0.4111942633010083, + "lm_loss": 2.1192, + "step": 2136, + "vm_loss": 0.1737 + }, + { + "epoch": 0.4111942633010083, + "lm_loss": 1.971, + "step": 2136, + "vm_loss": 0.2431 + }, + { + "epoch": 0.4111942633010083, + "lm_loss": 2.3416, + "step": 2136, + "vm_loss": 0.1884 + }, + { + "epoch": 0.4111942633010083, + "lm_loss": 2.1558, + "step": 2136, + "vm_loss": 0.1631 + }, + { + "epoch": 0.4111942633010083, + "lm_loss": 2.1965, + "step": 2136, + "vm_loss": 0.1657 + }, + { + "epoch": 0.4111942633010083, + "lm_loss": 2.0486, + "step": 2136, + "vm_loss": 0.2119 + }, + { + "epoch": 0.41138676997858364, + "grad_norm": 3.303565617203994, + "learning_rate": 1.8424314918017372e-05, + "loss": 2.3768, + "step": 2137 + }, + { + "epoch": 0.411579276656159, + "grad_norm": 3.202682378535108, + "learning_rate": 1.842263457503397e-05, + "loss": 2.3502, + "step": 2138 + }, + { + "epoch": 0.4117717833337344, + "grad_norm": 2.7570099050112837, + "learning_rate": 1.8420953413262727e-05, + "loss": 2.3837, + "step": 2139 + }, + { + "epoch": 0.41196429001130974, + "grad_norm": 3.021769115981808, + "learning_rate": 1.841927143286708e-05, + "loss": 2.3498, + "step": 2140 + }, + { + "epoch": 0.41215679668888516, + "grad_norm": 2.836439852485431, + "learning_rate": 1.8417588634010536e-05, + "loss": 2.3951, + "step": 2141 + }, + { + "epoch": 0.41234930336646053, + "grad_norm": 3.1906417927306014, + "learning_rate": 1.8415905016856682e-05, + "loss": 2.3617, + "step": 2142 + }, + { + "epoch": 0.4125418100440359, + "grad_norm": 2.802803268464032, + "learning_rate": 1.841422058156919e-05, + "loss": 2.3259, + "step": 2143 + }, + { + "epoch": 0.41273431672161126, + "grad_norm": 2.814776057707198, + "learning_rate": 1.8412535328311813e-05, + "loss": 2.383, + "step": 2144 + }, + { + "epoch": 0.41273431672161126, + "lm_loss": 2.146, + "step": 2144, + "vm_loss": 0.1444 + }, + { + "epoch": 0.41273431672161126, + "lm_loss": 2.2029, + "step": 2144, + "vm_loss": 0.2005 + }, + { + "epoch": 0.41273431672161126, + "lm_loss": 2.4944, + "step": 2144, + "vm_loss": 0.1191 + }, + { + "epoch": 0.41273431672161126, + "lm_loss": 2.3076, + "step": 2144, + "vm_loss": 0.2183 + }, + { + "epoch": 0.41273431672161126, + "lm_loss": 2.2968, + "step": 2144, + "vm_loss": 0.2033 + }, + { + "epoch": 0.41273431672161126, + "lm_loss": 2.2323, + "step": 2144, + "vm_loss": 0.1755 + }, + { + "epoch": 0.41273431672161126, + "lm_loss": 2.1842, + "step": 2144, + "vm_loss": 0.1289 + }, + { + "epoch": 0.41273431672161126, + "lm_loss": 2.2253, + "step": 2144, + "vm_loss": 0.2586 + }, + { + "epoch": 0.4129268233991867, + "grad_norm": 2.8833750937905935, + "learning_rate": 1.8410849257248372e-05, + "loss": 2.3451, + "step": 2145 + }, + { + "epoch": 0.41311933007676205, + "grad_norm": 3.1989008487113386, + "learning_rate": 1.8409162368542778e-05, + "loss": 2.3593, + "step": 2146 + }, + { + "epoch": 0.4133118367543374, + "grad_norm": 2.686935078478039, + "learning_rate": 1.8407474662359014e-05, + "loss": 2.3545, + "step": 2147 + }, + { + "epoch": 0.4135043434319128, + "grad_norm": 2.917875871207584, + "learning_rate": 1.840578613886115e-05, + "loss": 2.3165, + "step": 2148 + }, + { + "epoch": 0.41369685010948815, + "grad_norm": 2.9899575980229938, + "learning_rate": 1.8404096798213334e-05, + "loss": 2.3807, + "step": 2149 + }, + { + "epoch": 0.41388935678706357, + "grad_norm": 3.3527018619678497, + "learning_rate": 1.8402406640579788e-05, + "loss": 2.3335, + "step": 2150 + }, + { + "epoch": 0.41408186346463893, + "grad_norm": 3.041375448295186, + "learning_rate": 1.8400715666124815e-05, + "loss": 2.3613, + "step": 2151 + }, + { + "epoch": 0.4142743701422143, + "grad_norm": 3.012256771374496, + "learning_rate": 1.8399023875012808e-05, + "loss": 2.2652, + "step": 2152 + }, + { + "epoch": 0.4142743701422143, + "lm_loss": 2.1856, + "step": 2152, + "vm_loss": 0.1835 + }, + { + "epoch": 0.4142743701422143, + "lm_loss": 2.3424, + "step": 2152, + "vm_loss": 0.1178 + }, + { + "epoch": 0.4142743701422143, + "lm_loss": 2.4276, + "step": 2152, + "vm_loss": 0.1433 + }, + { + "epoch": 0.4142743701422143, + "lm_loss": 1.8783, + "step": 2152, + "vm_loss": 0.1838 + }, + { + "epoch": 0.4142743701422143, + "lm_loss": 2.3399, + "step": 2152, + "vm_loss": 0.191 + }, + { + "epoch": 0.4142743701422143, + "lm_loss": 2.1739, + "step": 2152, + "vm_loss": 0.2735 + }, + { + "epoch": 0.4142743701422143, + "lm_loss": 2.1701, + "step": 2152, + "vm_loss": 0.1429 + }, + { + "epoch": 0.4142743701422143, + "lm_loss": 2.3573, + "step": 2152, + "vm_loss": 0.1759 + }, + { + "epoch": 0.41446687681978966, + "grad_norm": 3.2521429507103083, + "learning_rate": 1.8397331267408222e-05, + "loss": 2.3721, + "step": 2153 + }, + { + "epoch": 0.4146593834973651, + "grad_norm": 2.8432046010965757, + "learning_rate": 1.8395637843475607e-05, + "loss": 2.3389, + "step": 2154 + }, + { + "epoch": 0.41485189017494045, + "grad_norm": 3.0496752744846938, + "learning_rate": 1.839394360337958e-05, + "loss": 2.3211, + "step": 2155 + }, + { + "epoch": 0.4150443968525158, + "grad_norm": 3.036044526875652, + "learning_rate": 1.8392248547284844e-05, + "loss": 2.3149, + "step": 2156 + }, + { + "epoch": 0.4152369035300912, + "grad_norm": 2.9626809902230207, + "learning_rate": 1.8390552675356183e-05, + "loss": 2.3476, + "step": 2157 + }, + { + "epoch": 0.4154294102076666, + "grad_norm": 2.6944664731149395, + "learning_rate": 1.8388855987758455e-05, + "loss": 2.3466, + "step": 2158 + }, + { + "epoch": 0.41562191688524197, + "grad_norm": 2.9261309163208478, + "learning_rate": 1.8387158484656603e-05, + "loss": 2.3725, + "step": 2159 + }, + { + "epoch": 0.41581442356281734, + "grad_norm": 2.70597099804516, + "learning_rate": 1.838546016621564e-05, + "loss": 2.4081, + "step": 2160 + }, + { + "epoch": 0.41581442356281734, + "lm_loss": 2.4882, + "step": 2160, + "vm_loss": 0.2385 + }, + { + "epoch": 0.41581442356281734, + "lm_loss": 2.1829, + "step": 2160, + "vm_loss": 0.1698 + }, + { + "epoch": 0.41581442356281734, + "lm_loss": 2.3456, + "step": 2160, + "vm_loss": 0.2613 + }, + { + "epoch": 0.41581442356281734, + "lm_loss": 1.9341, + "step": 2160, + "vm_loss": 0.1964 + }, + { + "epoch": 0.41581442356281734, + "lm_loss": 2.2251, + "step": 2160, + "vm_loss": 0.165 + }, + { + "epoch": 0.41581442356281734, + "lm_loss": 2.4087, + "step": 2160, + "vm_loss": 0.1293 + }, + { + "epoch": 0.41581442356281734, + "lm_loss": 2.0562, + "step": 2160, + "vm_loss": 0.2364 + }, + { + "epoch": 0.41581442356281734, + "lm_loss": 2.2579, + "step": 2160, + "vm_loss": 0.1415 + }, + { + "epoch": 0.4160069302403927, + "grad_norm": 3.08532306126558, + "learning_rate": 1.8383761032600663e-05, + "loss": 2.3714, + "step": 2161 + }, + { + "epoch": 0.41619943691796807, + "grad_norm": 2.9651120591167808, + "learning_rate": 1.838206108397686e-05, + "loss": 2.363, + "step": 2162 + }, + { + "epoch": 0.4163919435955435, + "grad_norm": 3.1146431347665966, + "learning_rate": 1.838036032050948e-05, + "loss": 2.3569, + "step": 2163 + }, + { + "epoch": 0.41658445027311886, + "grad_norm": 3.288019650218014, + "learning_rate": 1.8378658742363862e-05, + "loss": 2.3746, + "step": 2164 + }, + { + "epoch": 0.4167769569506942, + "grad_norm": 2.9221016590665103, + "learning_rate": 1.837695634970542e-05, + "loss": 2.3388, + "step": 2165 + }, + { + "epoch": 0.4169694636282696, + "grad_norm": 2.841662853804467, + "learning_rate": 1.8375253142699646e-05, + "loss": 2.3571, + "step": 2166 + }, + { + "epoch": 0.417161970305845, + "grad_norm": 2.732405558606511, + "learning_rate": 1.8373549121512118e-05, + "loss": 2.4483, + "step": 2167 + }, + { + "epoch": 0.4173544769834204, + "grad_norm": 3.1732136627728647, + "learning_rate": 1.8371844286308486e-05, + "loss": 2.3182, + "step": 2168 + }, + { + "epoch": 0.4173544769834204, + "lm_loss": 2.1019, + "step": 2168, + "vm_loss": 0.2212 + }, + { + "epoch": 0.4173544769834204, + "lm_loss": 2.1206, + "step": 2168, + "vm_loss": 0.1421 + }, + { + "epoch": 0.4173544769834204, + "lm_loss": 2.4115, + "step": 2168, + "vm_loss": 0.1361 + }, + { + "epoch": 0.4173544769834204, + "lm_loss": 2.3951, + "step": 2168, + "vm_loss": 0.1646 + }, + { + "epoch": 0.4173544769834204, + "lm_loss": 2.4041, + "step": 2168, + "vm_loss": 0.1325 + }, + { + "epoch": 0.4173544769834204, + "lm_loss": 2.2038, + "step": 2168, + "vm_loss": 0.2165 + }, + { + "epoch": 0.4173544769834204, + "lm_loss": 2.0637, + "step": 2168, + "vm_loss": 0.1856 + }, + { + "epoch": 0.4173544769834204, + "lm_loss": 2.1314, + "step": 2168, + "vm_loss": 0.1983 + }, + { + "epoch": 0.41754698366099574, + "grad_norm": 2.9700775384797655, + "learning_rate": 1.837013863725448e-05, + "loss": 2.3298, + "step": 2169 + }, + { + "epoch": 0.4177394903385711, + "grad_norm": 2.7890134844403875, + "learning_rate": 1.8368432174515915e-05, + "loss": 2.34, + "step": 2170 + }, + { + "epoch": 0.4179319970161465, + "grad_norm": 2.5921923600620347, + "learning_rate": 1.8366724898258678e-05, + "loss": 2.2839, + "step": 2171 + }, + { + "epoch": 0.4181245036937219, + "grad_norm": 3.014915369427207, + "learning_rate": 1.8365016808648743e-05, + "loss": 2.3425, + "step": 2172 + }, + { + "epoch": 0.41831701037129726, + "grad_norm": 3.0982445714073212, + "learning_rate": 1.8363307905852152e-05, + "loss": 2.3935, + "step": 2173 + }, + { + "epoch": 0.4185095170488726, + "grad_norm": 2.802939594824342, + "learning_rate": 1.836159819003503e-05, + "loss": 2.3589, + "step": 2174 + }, + { + "epoch": 0.418702023726448, + "grad_norm": 3.0903480637656253, + "learning_rate": 1.8359887661363594e-05, + "loss": 2.3021, + "step": 2175 + }, + { + "epoch": 0.4188945304040234, + "grad_norm": 3.066508193015433, + "learning_rate": 1.835817632000412e-05, + "loss": 2.3484, + "step": 2176 + }, + { + "epoch": 0.4188945304040234, + "lm_loss": 1.724, + "step": 2176, + "vm_loss": 0.2114 + }, + { + "epoch": 0.4188945304040234, + "lm_loss": 2.0409, + "step": 2176, + "vm_loss": 0.1534 + }, + { + "epoch": 0.4188945304040234, + "lm_loss": 1.9736, + "step": 2176, + "vm_loss": 0.1923 + }, + { + "epoch": 0.4188945304040234, + "lm_loss": 2.2024, + "step": 2176, + "vm_loss": 0.0899 + }, + { + "epoch": 0.4188945304040234, + "lm_loss": 1.7809, + "step": 2176, + "vm_loss": 0.1775 + }, + { + "epoch": 0.4188945304040234, + "lm_loss": 2.1523, + "step": 2176, + "vm_loss": 0.1853 + }, + { + "epoch": 0.4188945304040234, + "lm_loss": 2.161, + "step": 2176, + "vm_loss": 0.1155 + }, + { + "epoch": 0.4188945304040234, + "lm_loss": 2.2636, + "step": 2176, + "vm_loss": 0.1502 + }, + { + "epoch": 0.4190870370815988, + "grad_norm": 2.642592897201478, + "learning_rate": 1.8356464166122976e-05, + "loss": 2.3381, + "step": 2177 + }, + { + "epoch": 0.41927954375917414, + "grad_norm": 3.254076077578234, + "learning_rate": 1.8354751199886608e-05, + "loss": 2.3801, + "step": 2178 + }, + { + "epoch": 0.4194720504367495, + "grad_norm": 2.6657664927068643, + "learning_rate": 1.8353037421461527e-05, + "loss": 2.3021, + "step": 2179 + }, + { + "epoch": 0.4196645571143249, + "grad_norm": 2.7003895211298263, + "learning_rate": 1.835132283101435e-05, + "loss": 2.3323, + "step": 2180 + }, + { + "epoch": 0.4198570637919003, + "grad_norm": 2.9357196498250473, + "learning_rate": 1.8349607428711747e-05, + "loss": 2.3956, + "step": 2181 + }, + { + "epoch": 0.42004957046947566, + "grad_norm": 3.4185324094915193, + "learning_rate": 1.8347891214720477e-05, + "loss": 2.3317, + "step": 2182 + }, + { + "epoch": 0.42024207714705103, + "grad_norm": 2.6587801175604144, + "learning_rate": 1.834617418920738e-05, + "loss": 2.3195, + "step": 2183 + }, + { + "epoch": 0.4204345838246264, + "grad_norm": 3.0276757496206477, + "learning_rate": 1.8344456352339377e-05, + "loss": 2.3734, + "step": 2184 + }, + { + "epoch": 0.4204345838246264, + "lm_loss": 2.2437, + "step": 2184, + "vm_loss": 0.2399 + }, + { + "epoch": 0.4204345838246264, + "lm_loss": 2.0155, + "step": 2184, + "vm_loss": 0.1554 + }, + { + "epoch": 0.4204345838246264, + "lm_loss": 2.2087, + "step": 2184, + "vm_loss": 0.1255 + }, + { + "epoch": 0.4204345838246264, + "lm_loss": 2.1535, + "step": 2184, + "vm_loss": 0.1153 + }, + { + "epoch": 0.4204345838246264, + "lm_loss": 1.9832, + "step": 2184, + "vm_loss": 0.2594 + }, + { + "epoch": 0.4204345838246264, + "lm_loss": 2.1844, + "step": 2184, + "vm_loss": 0.1642 + }, + { + "epoch": 0.4204345838246264, + "lm_loss": 2.0304, + "step": 2184, + "vm_loss": 0.1669 + }, + { + "epoch": 0.4204345838246264, + "lm_loss": 2.2383, + "step": 2184, + "vm_loss": 0.2531 + }, + { + "epoch": 0.4206270905022018, + "grad_norm": 2.695100763528606, + "learning_rate": 1.8342737704283458e-05, + "loss": 2.3624, + "step": 2185 + }, + { + "epoch": 0.4208195971797772, + "grad_norm": 2.761006169969045, + "learning_rate": 1.8341018245206696e-05, + "loss": 2.3488, + "step": 2186 + }, + { + "epoch": 0.42101210385735255, + "grad_norm": 3.156939382528445, + "learning_rate": 1.8339297975276254e-05, + "loss": 2.3282, + "step": 2187 + }, + { + "epoch": 0.4212046105349279, + "grad_norm": 2.8258156355453723, + "learning_rate": 1.8337576894659353e-05, + "loss": 2.3752, + "step": 2188 + }, + { + "epoch": 0.42139711721250334, + "grad_norm": 2.8959402619317336, + "learning_rate": 1.833585500352331e-05, + "loss": 2.3784, + "step": 2189 + }, + { + "epoch": 0.4215896238900787, + "grad_norm": 3.1410318379606497, + "learning_rate": 1.8334132302035515e-05, + "loss": 2.36, + "step": 2190 + }, + { + "epoch": 0.42178213056765407, + "grad_norm": 2.7508664077296543, + "learning_rate": 1.8332408790363437e-05, + "loss": 2.3315, + "step": 2191 + }, + { + "epoch": 0.42197463724522943, + "grad_norm": 2.859582371003272, + "learning_rate": 1.833068446867462e-05, + "loss": 2.3644, + "step": 2192 + }, + { + "epoch": 0.42197463724522943, + "lm_loss": 2.3192, + "step": 2192, + "vm_loss": 0.1884 + }, + { + "epoch": 0.42197463724522943, + "lm_loss": 2.294, + "step": 2192, + "vm_loss": 0.136 + }, + { + "epoch": 0.42197463724522943, + "lm_loss": 2.0959, + "step": 2192, + "vm_loss": 0.1478 + }, + { + "epoch": 0.42197463724522943, + "lm_loss": 1.9382, + "step": 2192, + "vm_loss": 0.1769 + }, + { + "epoch": 0.42197463724522943, + "lm_loss": 2.3338, + "step": 2192, + "vm_loss": 0.2247 + }, + { + "epoch": 0.42197463724522943, + "lm_loss": 2.2903, + "step": 2192, + "vm_loss": 0.197 + }, + { + "epoch": 0.42197463724522943, + "lm_loss": 2.1545, + "step": 2192, + "vm_loss": 0.1549 + }, + { + "epoch": 0.42197463724522943, + "lm_loss": 2.1422, + "step": 2192, + "vm_loss": 0.1555 + }, + { + "epoch": 0.4221671439228048, + "grad_norm": 2.8583333800590656, + "learning_rate": 1.8328959337136692e-05, + "loss": 2.3448, + "step": 2193 + }, + { + "epoch": 0.4223596506003802, + "grad_norm": 2.98799330395155, + "learning_rate": 1.832723339591736e-05, + "loss": 2.3576, + "step": 2194 + }, + { + "epoch": 0.4225521572779556, + "grad_norm": 2.6870081808167705, + "learning_rate": 1.8325506645184402e-05, + "loss": 2.3232, + "step": 2195 + }, + { + "epoch": 0.42274466395553095, + "grad_norm": 3.203068497228845, + "learning_rate": 1.8323779085105684e-05, + "loss": 2.3231, + "step": 2196 + }, + { + "epoch": 0.4229371706331063, + "grad_norm": 3.0855690506625506, + "learning_rate": 1.8322050715849152e-05, + "loss": 2.3168, + "step": 2197 + }, + { + "epoch": 0.42312967731068174, + "grad_norm": 2.655088944444877, + "learning_rate": 1.8320321537582815e-05, + "loss": 2.3434, + "step": 2198 + }, + { + "epoch": 0.4233221839882571, + "grad_norm": 3.078639542604538, + "learning_rate": 1.831859155047478e-05, + "loss": 2.3247, + "step": 2199 + }, + { + "epoch": 0.42351469066583247, + "grad_norm": 3.3473765595244505, + "learning_rate": 1.8316860754693215e-05, + "loss": 2.327, + "step": 2200 + }, + { + "epoch": 0.42351469066583247, + "lm_loss": 2.075, + "step": 2200, + "vm_loss": 0.182 + }, + { + "epoch": 0.42351469066583247, + "lm_loss": 1.7723, + "step": 2200, + "vm_loss": 0.0993 + }, + { + "epoch": 0.42351469066583247, + "lm_loss": 2.2, + "step": 2200, + "vm_loss": 0.1674 + }, + { + "epoch": 0.42351469066583247, + "lm_loss": 2.2439, + "step": 2200, + "vm_loss": 0.1574 + }, + { + "epoch": 0.42351469066583247, + "lm_loss": 2.2557, + "step": 2200, + "vm_loss": 0.1677 + }, + { + "epoch": 0.42351469066583247, + "lm_loss": 2.1867, + "step": 2200, + "vm_loss": 0.1569 + }, + { + "epoch": 0.42351469066583247, + "lm_loss": 2.3099, + "step": 2200, + "vm_loss": 0.1619 + }, + { + "epoch": 0.42351469066583247, + "lm_loss": 1.9626, + "step": 2200, + "vm_loss": 0.1185 + }, + { + "epoch": 0.42370719734340784, + "grad_norm": 2.8337305175869747, + "learning_rate": 1.8315129150406386e-05, + "loss": 2.3027, + "step": 2201 + }, + { + "epoch": 0.4238997040209832, + "grad_norm": 2.9449433634328037, + "learning_rate": 1.831339673778262e-05, + "loss": 2.255, + "step": 2202 + }, + { + "epoch": 0.4240922106985586, + "grad_norm": 2.905475099765251, + "learning_rate": 1.8311663516990332e-05, + "loss": 2.3381, + "step": 2203 + }, + { + "epoch": 0.424284717376134, + "grad_norm": 2.9464403100852996, + "learning_rate": 1.8309929488198012e-05, + "loss": 2.3072, + "step": 2204 + }, + { + "epoch": 0.42447722405370936, + "grad_norm": 3.0311573399879497, + "learning_rate": 1.8308194651574232e-05, + "loss": 2.3421, + "step": 2205 + }, + { + "epoch": 0.4246697307312847, + "grad_norm": 2.7371020985877963, + "learning_rate": 1.8306459007287637e-05, + "loss": 2.3742, + "step": 2206 + }, + { + "epoch": 0.42486223740886014, + "grad_norm": 2.6068674367419216, + "learning_rate": 1.830472255550696e-05, + "loss": 2.3886, + "step": 2207 + }, + { + "epoch": 0.4250547440864355, + "grad_norm": 2.9373892553700953, + "learning_rate": 1.8302985296401e-05, + "loss": 2.3709, + "step": 2208 + }, + { + "epoch": 0.4250547440864355, + "lm_loss": 2.0728, + "step": 2208, + "vm_loss": 0.1149 + }, + { + "epoch": 0.4250547440864355, + "lm_loss": 2.2847, + "step": 2208, + "vm_loss": 0.1926 + }, + { + "epoch": 0.4250547440864355, + "lm_loss": 2.5463, + "step": 2208, + "vm_loss": 0.1761 + }, + { + "epoch": 0.4250547440864355, + "lm_loss": 1.9467, + "step": 2208, + "vm_loss": 0.1901 + }, + { + "epoch": 0.4250547440864355, + "lm_loss": 2.021, + "step": 2208, + "vm_loss": 0.1557 + }, + { + "epoch": 0.4250547440864355, + "lm_loss": 2.2198, + "step": 2208, + "vm_loss": 0.1148 + }, + { + "epoch": 0.4250547440864355, + "lm_loss": 1.8601, + "step": 2208, + "vm_loss": 0.1528 + }, + { + "epoch": 0.4250547440864355, + "lm_loss": 2.129, + "step": 2208, + "vm_loss": 0.2053 + }, + { + "epoch": 0.4252472507640109, + "grad_norm": 2.9983140048511654, + "learning_rate": 1.830124723013864e-05, + "loss": 2.2861, + "step": 2209 + }, + { + "epoch": 0.42543975744158624, + "grad_norm": 2.730833474461183, + "learning_rate": 1.829950835688885e-05, + "loss": 2.3471, + "step": 2210 + }, + { + "epoch": 0.4256322641191616, + "grad_norm": 2.862369895340129, + "learning_rate": 1.8297768676820665e-05, + "loss": 2.3215, + "step": 2211 + }, + { + "epoch": 0.42582477079673703, + "grad_norm": 3.6304801019321498, + "learning_rate": 1.8296028190103205e-05, + "loss": 2.2975, + "step": 2212 + }, + { + "epoch": 0.4260172774743124, + "grad_norm": 3.3409308437570027, + "learning_rate": 1.8294286896905665e-05, + "loss": 2.3725, + "step": 2213 + }, + { + "epoch": 0.42620978415188776, + "grad_norm": 2.8043646591601634, + "learning_rate": 1.8292544797397327e-05, + "loss": 2.3411, + "step": 2214 + }, + { + "epoch": 0.4264022908294631, + "grad_norm": 2.7328741821076625, + "learning_rate": 1.829080189174754e-05, + "loss": 2.3302, + "step": 2215 + }, + { + "epoch": 0.42659479750703855, + "grad_norm": 2.900396750536101, + "learning_rate": 1.828905818012574e-05, + "loss": 2.3147, + "step": 2216 + }, + { + "epoch": 0.42659479750703855, + "lm_loss": 2.2344, + "step": 2216, + "vm_loss": 0.1688 + }, + { + "epoch": 0.42659479750703855, + "lm_loss": 2.4235, + "step": 2216, + "vm_loss": 0.1721 + }, + { + "epoch": 0.42659479750703855, + "lm_loss": 2.2181, + "step": 2216, + "vm_loss": 0.1401 + }, + { + "epoch": 0.42659479750703855, + "lm_loss": 2.3286, + "step": 2216, + "vm_loss": 0.1629 + }, + { + "epoch": 0.42659479750703855, + "lm_loss": 2.1762, + "step": 2216, + "vm_loss": 0.1632 + }, + { + "epoch": 0.42659479750703855, + "lm_loss": 2.0343, + "step": 2216, + "vm_loss": 0.1838 + }, + { + "epoch": 0.42659479750703855, + "lm_loss": 2.3713, + "step": 2216, + "vm_loss": 0.1669 + }, + { + "epoch": 0.42659479750703855, + "lm_loss": 2.3332, + "step": 2216, + "vm_loss": 0.1504 + }, + { + "epoch": 0.4267873041846139, + "grad_norm": 2.8823395153204, + "learning_rate": 1.8287313662701436e-05, + "loss": 2.3646, + "step": 2217 + }, + { + "epoch": 0.4269798108621893, + "grad_norm": 3.139119355230146, + "learning_rate": 1.828556833964422e-05, + "loss": 2.3537, + "step": 2218 + }, + { + "epoch": 0.42717231753976465, + "grad_norm": 2.4723202448121073, + "learning_rate": 1.8283822211123758e-05, + "loss": 2.324, + "step": 2219 + }, + { + "epoch": 0.42736482421734, + "grad_norm": 3.01517865352798, + "learning_rate": 1.8282075277309792e-05, + "loss": 2.3434, + "step": 2220 + }, + { + "epoch": 0.42755733089491543, + "grad_norm": 2.9302152898677787, + "learning_rate": 1.8280327538372155e-05, + "loss": 2.3382, + "step": 2221 + }, + { + "epoch": 0.4277498375724908, + "grad_norm": 2.6665277903917004, + "learning_rate": 1.827857899448074e-05, + "loss": 2.2625, + "step": 2222 + }, + { + "epoch": 0.42794234425006616, + "grad_norm": 2.8370088248688377, + "learning_rate": 1.8276829645805536e-05, + "loss": 2.3409, + "step": 2223 + }, + { + "epoch": 0.42813485092764153, + "grad_norm": 2.7110791506392107, + "learning_rate": 1.8275079492516597e-05, + "loss": 2.3519, + "step": 2224 + }, + { + "epoch": 0.42813485092764153, + "lm_loss": 2.1767, + "step": 2224, + "vm_loss": 0.2208 + }, + { + "epoch": 0.42813485092764153, + "lm_loss": 1.8086, + "step": 2224, + "vm_loss": 0.1724 + }, + { + "epoch": 0.42813485092764153, + "lm_loss": 1.9442, + "step": 2224, + "vm_loss": 0.1005 + }, + { + "epoch": 0.42813485092764153, + "lm_loss": 2.2649, + "step": 2224, + "vm_loss": 0.1828 + }, + { + "epoch": 0.42813485092764153, + "lm_loss": 2.1648, + "step": 2224, + "vm_loss": 0.1485 + }, + { + "epoch": 0.42813485092764153, + "lm_loss": 2.4827, + "step": 2224, + "vm_loss": 0.1812 + }, + { + "epoch": 0.42813485092764153, + "lm_loss": 2.0415, + "step": 2224, + "vm_loss": 0.2364 + }, + { + "epoch": 0.42813485092764153, + "lm_loss": 1.9691, + "step": 2224, + "vm_loss": 0.188 + }, + { + "epoch": 0.42832735760521695, + "grad_norm": 2.9401484778077047, + "learning_rate": 1.8273328534784067e-05, + "loss": 2.3657, + "step": 2225 + }, + { + "epoch": 0.4285198642827923, + "grad_norm": 2.83229555133251, + "learning_rate": 1.8271576772778154e-05, + "loss": 2.3054, + "step": 2226 + }, + { + "epoch": 0.4287123709603677, + "grad_norm": 3.1111944626892387, + "learning_rate": 1.8269824206669153e-05, + "loss": 2.3393, + "step": 2227 + }, + { + "epoch": 0.42890487763794305, + "grad_norm": 3.3099832420602393, + "learning_rate": 1.826807083662744e-05, + "loss": 2.3056, + "step": 2228 + }, + { + "epoch": 0.42909738431551847, + "grad_norm": 2.605118641407932, + "learning_rate": 1.8266316662823456e-05, + "loss": 2.3289, + "step": 2229 + }, + { + "epoch": 0.42928989099309384, + "grad_norm": 3.0330197853056746, + "learning_rate": 1.826456168542774e-05, + "loss": 2.3253, + "step": 2230 + }, + { + "epoch": 0.4294823976706692, + "grad_norm": 3.017107128608619, + "learning_rate": 1.8262805904610894e-05, + "loss": 2.3361, + "step": 2231 + }, + { + "epoch": 0.42967490434824457, + "grad_norm": 2.9297229241573666, + "learning_rate": 1.82610493205436e-05, + "loss": 2.2937, + "step": 2232 + }, + { + "epoch": 0.42967490434824457, + "lm_loss": 1.8694, + "step": 2232, + "vm_loss": 0.2165 + }, + { + "epoch": 0.42967490434824457, + "lm_loss": 2.0562, + "step": 2232, + "vm_loss": 0.1376 + }, + { + "epoch": 0.42967490434824457, + "lm_loss": 1.9866, + "step": 2232, + "vm_loss": 0.085 + }, + { + "epoch": 0.42967490434824457, + "lm_loss": 2.0144, + "step": 2232, + "vm_loss": 0.1182 + }, + { + "epoch": 0.42967490434824457, + "lm_loss": 2.4143, + "step": 2232, + "vm_loss": 0.1122 + }, + { + "epoch": 0.42967490434824457, + "lm_loss": 2.2187, + "step": 2232, + "vm_loss": 0.1486 + }, + { + "epoch": 0.42967490434824457, + "lm_loss": 1.9456, + "step": 2232, + "vm_loss": 0.1428 + }, + { + "epoch": 0.42967490434824457, + "lm_loss": 2.1749, + "step": 2232, + "vm_loss": 0.1832 + }, + { + "epoch": 0.42986741102581993, + "grad_norm": 3.4703806471897813, + "learning_rate": 1.8259291933396625e-05, + "loss": 2.31, + "step": 2233 + }, + { + "epoch": 0.43005991770339536, + "grad_norm": 3.2460382714743212, + "learning_rate": 1.8257533743340806e-05, + "loss": 2.3453, + "step": 2234 + }, + { + "epoch": 0.4302524243809707, + "grad_norm": 2.7981051424157295, + "learning_rate": 1.825577475054706e-05, + "loss": 2.3402, + "step": 2235 + }, + { + "epoch": 0.4304449310585461, + "grad_norm": 3.0666500194026325, + "learning_rate": 1.8254014955186386e-05, + "loss": 2.3343, + "step": 2236 + }, + { + "epoch": 0.43063743773612145, + "grad_norm": 3.473243301255162, + "learning_rate": 1.825225435742986e-05, + "loss": 2.3174, + "step": 2237 + }, + { + "epoch": 0.4308299444136969, + "grad_norm": 2.825915484416556, + "learning_rate": 1.8250492957448634e-05, + "loss": 2.3386, + "step": 2238 + }, + { + "epoch": 0.43102245109127224, + "grad_norm": 2.813152685775438, + "learning_rate": 1.824873075541394e-05, + "loss": 2.2796, + "step": 2239 + }, + { + "epoch": 0.4312149577688476, + "grad_norm": 3.644551671361753, + "learning_rate": 1.8246967751497083e-05, + "loss": 2.3409, + "step": 2240 + }, + { + "epoch": 0.4312149577688476, + "lm_loss": 2.2338, + "step": 2240, + "vm_loss": 0.1459 + }, + { + "epoch": 0.4312149577688476, + "lm_loss": 2.4392, + "step": 2240, + "vm_loss": 0.1084 + }, + { + "epoch": 0.4312149577688476, + "lm_loss": 1.8853, + "step": 2240, + "vm_loss": 0.2131 + }, + { + "epoch": 0.4312149577688476, + "lm_loss": 2.2702, + "step": 2240, + "vm_loss": 0.1933 + }, + { + "epoch": 0.4312149577688476, + "lm_loss": 2.4741, + "step": 2240, + "vm_loss": 0.1401 + }, + { + "epoch": 0.4312149577688476, + "lm_loss": 1.8013, + "step": 2240, + "vm_loss": 0.1438 + }, + { + "epoch": 0.4312149577688476, + "lm_loss": 1.9907, + "step": 2240, + "vm_loss": 0.2033 + }, + { + "epoch": 0.4312149577688476, + "lm_loss": 2.1314, + "step": 2240, + "vm_loss": 0.1739 + }, + { + "epoch": 0.431407464446423, + "grad_norm": 2.5496307794075985, + "learning_rate": 1.824520394586945e-05, + "loss": 2.3183, + "step": 2241 + }, + { + "epoch": 0.43159997112399834, + "grad_norm": 3.1005403403092853, + "learning_rate": 1.8243439338702504e-05, + "loss": 2.3601, + "step": 2242 + }, + { + "epoch": 0.43179247780157376, + "grad_norm": 2.880706386710158, + "learning_rate": 1.8241673930167792e-05, + "loss": 2.334, + "step": 2243 + }, + { + "epoch": 0.4319849844791491, + "grad_norm": 2.7386247784669826, + "learning_rate": 1.8239907720436935e-05, + "loss": 2.3091, + "step": 2244 + }, + { + "epoch": 0.4321774911567245, + "grad_norm": 2.720028088706721, + "learning_rate": 1.8238140709681625e-05, + "loss": 2.2935, + "step": 2245 + }, + { + "epoch": 0.43236999783429986, + "grad_norm": 2.783310393714633, + "learning_rate": 1.8236372898073642e-05, + "loss": 2.3403, + "step": 2246 + }, + { + "epoch": 0.4325625045118753, + "grad_norm": 3.0276080413209896, + "learning_rate": 1.823460428578484e-05, + "loss": 2.384, + "step": 2247 + }, + { + "epoch": 0.43275501118945064, + "grad_norm": 2.9225361694585104, + "learning_rate": 1.8232834872987147e-05, + "loss": 2.2671, + "step": 2248 + }, + { + "epoch": 0.43275501118945064, + "lm_loss": 2.2045, + "step": 2248, + "vm_loss": 0.1858 + }, + { + "epoch": 0.43275501118945064, + "lm_loss": 2.195, + "step": 2248, + "vm_loss": 0.1143 + }, + { + "epoch": 0.43275501118945064, + "lm_loss": 1.9849, + "step": 2248, + "vm_loss": 0.1108 + }, + { + "epoch": 0.43275501118945064, + "lm_loss": 1.9011, + "step": 2248, + "vm_loss": 0.2152 + }, + { + "epoch": 0.43275501118945064, + "lm_loss": 2.3294, + "step": 2248, + "vm_loss": 0.1481 + }, + { + "epoch": 0.43275501118945064, + "lm_loss": 2.4616, + "step": 2248, + "vm_loss": 0.1873 + }, + { + "epoch": 0.43275501118945064, + "lm_loss": 2.1318, + "step": 2248, + "vm_loss": 0.213 + }, + { + "epoch": 0.43275501118945064, + "lm_loss": 2.3315, + "step": 2248, + "vm_loss": 0.1121 + }, + { + "epoch": 0.432947517867026, + "grad_norm": 2.662969001880157, + "learning_rate": 1.823106465985258e-05, + "loss": 2.3476, + "step": 2249 + }, + { + "epoch": 0.4331400245446014, + "grad_norm": 2.8890921929828735, + "learning_rate": 1.8229293646553224e-05, + "loss": 2.3723, + "step": 2250 + }, + { + "epoch": 0.43333253122217674, + "grad_norm": 2.954219283955891, + "learning_rate": 1.8227521833261237e-05, + "loss": 2.3179, + "step": 2251 + }, + { + "epoch": 0.43352503789975216, + "grad_norm": 2.7749063882076124, + "learning_rate": 1.822574922014887e-05, + "loss": 2.3006, + "step": 2252 + }, + { + "epoch": 0.43371754457732753, + "grad_norm": 2.743081860071622, + "learning_rate": 1.822397580738844e-05, + "loss": 2.313, + "step": 2253 + }, + { + "epoch": 0.4339100512549029, + "grad_norm": 2.5719047175625467, + "learning_rate": 1.822220159515234e-05, + "loss": 2.2858, + "step": 2254 + }, + { + "epoch": 0.43410255793247826, + "grad_norm": 2.9392031442512336, + "learning_rate": 1.8220426583613063e-05, + "loss": 2.3376, + "step": 2255 + }, + { + "epoch": 0.4342950646100537, + "grad_norm": 3.0591530218417593, + "learning_rate": 1.8218650772943144e-05, + "loss": 2.3579, + "step": 2256 + }, + { + "epoch": 0.4342950646100537, + "lm_loss": 2.0416, + "step": 2256, + "vm_loss": 0.1077 + }, + { + "epoch": 0.4342950646100537, + "lm_loss": 2.0622, + "step": 2256, + "vm_loss": 0.2354 + }, + { + "epoch": 0.4342950646100537, + "lm_loss": 2.0739, + "step": 2256, + "vm_loss": 0.2172 + }, + { + "epoch": 0.4342950646100537, + "lm_loss": 2.121, + "step": 2256, + "vm_loss": 0.0901 + }, + { + "epoch": 0.4342950646100537, + "lm_loss": 2.3271, + "step": 2256, + "vm_loss": 0.1731 + }, + { + "epoch": 0.4342950646100537, + "lm_loss": 2.247, + "step": 2256, + "vm_loss": 0.2204 + }, + { + "epoch": 0.4342950646100537, + "lm_loss": 2.4194, + "step": 2256, + "vm_loss": 0.1923 + }, + { + "epoch": 0.4342950646100537, + "lm_loss": 2.342, + "step": 2256, + "vm_loss": 0.2323 + }, + { + "epoch": 0.43448757128762905, + "grad_norm": 2.7995848004046953, + "learning_rate": 1.8216874163315227e-05, + "loss": 2.336, + "step": 2257 + }, + { + "epoch": 0.4346800779652044, + "grad_norm": 2.763459608904795, + "learning_rate": 1.8215096754902014e-05, + "loss": 2.3168, + "step": 2258 + }, + { + "epoch": 0.4348725846427798, + "grad_norm": 3.2027606616981004, + "learning_rate": 1.82133185478763e-05, + "loss": 2.3105, + "step": 2259 + }, + { + "epoch": 0.4350650913203552, + "grad_norm": 2.9535160926563018, + "learning_rate": 1.8211539542410938e-05, + "loss": 2.33, + "step": 2260 + }, + { + "epoch": 0.43525759799793057, + "grad_norm": 2.765769406785263, + "learning_rate": 1.8209759738678877e-05, + "loss": 2.3622, + "step": 2261 + }, + { + "epoch": 0.43545010467550593, + "grad_norm": 2.8346286786172663, + "learning_rate": 1.8207979136853136e-05, + "loss": 2.3126, + "step": 2262 + }, + { + "epoch": 0.4356426113530813, + "grad_norm": 2.7849536479320354, + "learning_rate": 1.8206197737106818e-05, + "loss": 2.3096, + "step": 2263 + }, + { + "epoch": 0.43583511803065667, + "grad_norm": 3.103087349009926, + "learning_rate": 1.8204415539613085e-05, + "loss": 2.2904, + "step": 2264 + }, + { + "epoch": 0.43583511803065667, + "lm_loss": 2.2558, + "step": 2264, + "vm_loss": 0.2054 + }, + { + "epoch": 0.43583511803065667, + "lm_loss": 2.0647, + "step": 2264, + "vm_loss": 0.1453 + }, + { + "epoch": 0.43583511803065667, + "lm_loss": 2.2168, + "step": 2264, + "vm_loss": 0.1681 + }, + { + "epoch": 0.43583511803065667, + "lm_loss": 2.2076, + "step": 2264, + "vm_loss": 0.1671 + }, + { + "epoch": 0.43583511803065667, + "lm_loss": 2.1873, + "step": 2264, + "vm_loss": 0.2243 + }, + { + "epoch": 0.43583511803065667, + "lm_loss": 2.2121, + "step": 2264, + "vm_loss": 0.237 + }, + { + "epoch": 0.43583511803065667, + "lm_loss": 2.4772, + "step": 2264, + "vm_loss": 0.1023 + }, + { + "epoch": 0.43583511803065667, + "lm_loss": 2.1969, + "step": 2264, + "vm_loss": 0.101 + }, + { + "epoch": 0.4360276247082321, + "grad_norm": 2.9321325294492033, + "learning_rate": 1.82026325445452e-05, + "loss": 2.3591, + "step": 2265 + }, + { + "epoch": 0.43622013138580745, + "grad_norm": 2.7059144075188124, + "learning_rate": 1.8200848752076486e-05, + "loss": 2.329, + "step": 2266 + }, + { + "epoch": 0.4364126380633828, + "grad_norm": 2.980462799239122, + "learning_rate": 1.8199064162380358e-05, + "loss": 2.3358, + "step": 2267 + }, + { + "epoch": 0.4366051447409582, + "grad_norm": 2.7277992230655874, + "learning_rate": 1.8197278775630295e-05, + "loss": 2.26, + "step": 2268 + }, + { + "epoch": 0.4367976514185336, + "grad_norm": 2.8509979052657264, + "learning_rate": 1.819549259199986e-05, + "loss": 2.2831, + "step": 2269 + }, + { + "epoch": 0.43699015809610897, + "grad_norm": 3.0805412306199207, + "learning_rate": 1.8193705611662697e-05, + "loss": 2.3273, + "step": 2270 + }, + { + "epoch": 0.43718266477368434, + "grad_norm": 2.8817601843232405, + "learning_rate": 1.8191917834792516e-05, + "loss": 2.3425, + "step": 2271 + }, + { + "epoch": 0.4373751714512597, + "grad_norm": 2.785495287162447, + "learning_rate": 1.819012926156312e-05, + "loss": 2.3611, + "step": 2272 + }, + { + "epoch": 0.4373751714512597, + "lm_loss": 1.9766, + "step": 2272, + "vm_loss": 0.1323 + }, + { + "epoch": 0.4373751714512597, + "lm_loss": 2.276, + "step": 2272, + "vm_loss": 0.1471 + }, + { + "epoch": 0.4373751714512597, + "lm_loss": 1.8094, + "step": 2272, + "vm_loss": 0.1697 + }, + { + "epoch": 0.4373751714512597, + "lm_loss": 2.1238, + "step": 2272, + "vm_loss": 0.1967 + }, + { + "epoch": 0.4373751714512597, + "lm_loss": 1.9581, + "step": 2272, + "vm_loss": 0.1865 + }, + { + "epoch": 0.4373751714512597, + "lm_loss": 1.9666, + "step": 2272, + "vm_loss": 0.1793 + }, + { + "epoch": 0.4373751714512597, + "lm_loss": 2.2968, + "step": 2272, + "vm_loss": 0.1832 + }, + { + "epoch": 0.4373751714512597, + "lm_loss": 2.2719, + "step": 2272, + "vm_loss": 0.1804 + }, + { + "epoch": 0.43756767812883507, + "grad_norm": 3.075200229363919, + "learning_rate": 1.8188339892148375e-05, + "loss": 2.3117, + "step": 2273 + }, + { + "epoch": 0.4377601848064105, + "grad_norm": 2.450610527956838, + "learning_rate": 1.8186549726722234e-05, + "loss": 2.2776, + "step": 2274 + }, + { + "epoch": 0.43795269148398586, + "grad_norm": 2.6649109372800104, + "learning_rate": 1.8184758765458722e-05, + "loss": 2.3495, + "step": 2275 + }, + { + "epoch": 0.4381451981615612, + "grad_norm": 2.797810230039664, + "learning_rate": 1.8182967008531948e-05, + "loss": 2.3035, + "step": 2276 + }, + { + "epoch": 0.4383377048391366, + "grad_norm": 3.0747072507211497, + "learning_rate": 1.8181174456116085e-05, + "loss": 2.3527, + "step": 2277 + }, + { + "epoch": 0.438530211516712, + "grad_norm": 3.1805540217005284, + "learning_rate": 1.81793811083854e-05, + "loss": 2.2976, + "step": 2278 + }, + { + "epoch": 0.4387227181942874, + "grad_norm": 2.8465572133598633, + "learning_rate": 1.817758696551423e-05, + "loss": 2.2874, + "step": 2279 + }, + { + "epoch": 0.43891522487186274, + "grad_norm": 2.966299399160946, + "learning_rate": 1.817579202767698e-05, + "loss": 2.3541, + "step": 2280 + }, + { + "epoch": 0.43891522487186274, + "lm_loss": 2.1517, + "step": 2280, + "vm_loss": 0.1372 + }, + { + "epoch": 0.43891522487186274, + "lm_loss": 2.3433, + "step": 2280, + "vm_loss": 0.1625 + }, + { + "epoch": 0.43891522487186274, + "lm_loss": 2.0524, + "step": 2280, + "vm_loss": 0.1827 + }, + { + "epoch": 0.43891522487186274, + "lm_loss": 1.9284, + "step": 2280, + "vm_loss": 0.144 + }, + { + "epoch": 0.43891522487186274, + "lm_loss": 2.0675, + "step": 2280, + "vm_loss": 0.2603 + }, + { + "epoch": 0.43891522487186274, + "lm_loss": 2.1306, + "step": 2280, + "vm_loss": 0.2119 + }, + { + "epoch": 0.43891522487186274, + "lm_loss": 2.1933, + "step": 2280, + "vm_loss": 0.1798 + }, + { + "epoch": 0.43891522487186274, + "lm_loss": 1.9605, + "step": 2280, + "vm_loss": 0.224 + }, + { + "epoch": 0.4391077315494381, + "grad_norm": 2.90105025988649, + "learning_rate": 1.8173996295048147e-05, + "loss": 2.2911, + "step": 2281 + }, + { + "epoch": 0.4393002382270135, + "grad_norm": 3.066794446216245, + "learning_rate": 1.8172199767802298e-05, + "loss": 2.318, + "step": 2282 + }, + { + "epoch": 0.4394927449045889, + "grad_norm": 2.920750217805513, + "learning_rate": 1.817040244611408e-05, + "loss": 2.362, + "step": 2283 + }, + { + "epoch": 0.43968525158216426, + "grad_norm": 2.709972371828956, + "learning_rate": 1.816860433015821e-05, + "loss": 2.3416, + "step": 2284 + }, + { + "epoch": 0.4398777582597396, + "grad_norm": 2.6873289268547413, + "learning_rate": 1.8166805420109498e-05, + "loss": 2.3161, + "step": 2285 + }, + { + "epoch": 0.440070264937315, + "grad_norm": 3.1716789181957035, + "learning_rate": 1.8165005716142813e-05, + "loss": 2.2905, + "step": 2286 + }, + { + "epoch": 0.4402627716148904, + "grad_norm": 2.8882577886129845, + "learning_rate": 1.816320521843311e-05, + "loss": 2.3165, + "step": 2287 + }, + { + "epoch": 0.4404552782924658, + "grad_norm": 2.9140995543471773, + "learning_rate": 1.8161403927155424e-05, + "loss": 2.2568, + "step": 2288 + }, + { + "epoch": 0.4404552782924658, + "lm_loss": 2.363, + "step": 2288, + "vm_loss": 0.1415 + }, + { + "epoch": 0.4404552782924658, + "lm_loss": 2.1881, + "step": 2288, + "vm_loss": 0.1181 + }, + { + "epoch": 0.4404552782924658, + "lm_loss": 2.0624, + "step": 2288, + "vm_loss": 0.2538 + }, + { + "epoch": 0.4404552782924658, + "lm_loss": 1.8617, + "step": 2288, + "vm_loss": 0.1768 + }, + { + "epoch": 0.4404552782924658, + "lm_loss": 2.3127, + "step": 2288, + "vm_loss": 0.1178 + }, + { + "epoch": 0.4404552782924658, + "lm_loss": 2.3468, + "step": 2288, + "vm_loss": 0.1735 + }, + { + "epoch": 0.4404552782924658, + "lm_loss": 2.2677, + "step": 2288, + "vm_loss": 0.0928 + }, + { + "epoch": 0.4404552782924658, + "lm_loss": 1.846, + "step": 2288, + "vm_loss": 0.1295 + }, + { + "epoch": 0.44064778497004115, + "grad_norm": 2.674997057544507, + "learning_rate": 1.815960184248486e-05, + "loss": 2.2859, + "step": 2289 + }, + { + "epoch": 0.4408402916476165, + "grad_norm": 2.95699712330038, + "learning_rate": 1.8157798964596606e-05, + "loss": 2.336, + "step": 2290 + }, + { + "epoch": 0.44103279832519193, + "grad_norm": 2.773987803306304, + "learning_rate": 1.8155995293665923e-05, + "loss": 2.31, + "step": 2291 + }, + { + "epoch": 0.4412253050027673, + "grad_norm": 2.8267279313377456, + "learning_rate": 1.8154190829868152e-05, + "loss": 2.2767, + "step": 2292 + }, + { + "epoch": 0.44141781168034266, + "grad_norm": 3.316062036862609, + "learning_rate": 1.815238557337871e-05, + "loss": 2.3168, + "step": 2293 + }, + { + "epoch": 0.44161031835791803, + "grad_norm": 2.975626667407499, + "learning_rate": 1.815057952437309e-05, + "loss": 2.3367, + "step": 2294 + }, + { + "epoch": 0.4418028250354934, + "grad_norm": 3.1478180106540403, + "learning_rate": 1.8148772683026865e-05, + "loss": 2.3538, + "step": 2295 + }, + { + "epoch": 0.4419953317130688, + "grad_norm": 3.1141203666257544, + "learning_rate": 1.8146965049515684e-05, + "loss": 2.2515, + "step": 2296 + }, + { + "epoch": 0.4419953317130688, + "lm_loss": 2.4991, + "step": 2296, + "vm_loss": 0.2327 + }, + { + "epoch": 0.4419953317130688, + "lm_loss": 2.552, + "step": 2296, + "vm_loss": 0.1546 + }, + { + "epoch": 0.4419953317130688, + "lm_loss": 1.3974, + "step": 2296, + "vm_loss": 0.1502 + }, + { + "epoch": 0.4419953317130688, + "lm_loss": 2.2506, + "step": 2296, + "vm_loss": 0.1705 + }, + { + "epoch": 0.4419953317130688, + "lm_loss": 2.2569, + "step": 2296, + "vm_loss": 0.1606 + }, + { + "epoch": 0.4419953317130688, + "lm_loss": 2.1856, + "step": 2296, + "vm_loss": 0.1524 + }, + { + "epoch": 0.4419953317130688, + "lm_loss": 2.3442, + "step": 2296, + "vm_loss": 0.198 + }, + { + "epoch": 0.4419953317130688, + "lm_loss": 2.2605, + "step": 2296, + "vm_loss": 0.1926 + }, + { + "epoch": 0.4421878383906442, + "grad_norm": 2.592237520048315, + "learning_rate": 1.8145156624015267e-05, + "loss": 2.2863, + "step": 2297 + }, + { + "epoch": 0.44238034506821955, + "grad_norm": 3.112368654795366, + "learning_rate": 1.8143347406701418e-05, + "loss": 2.2812, + "step": 2298 + }, + { + "epoch": 0.4425728517457949, + "grad_norm": 3.0672581311328173, + "learning_rate": 1.814153739775002e-05, + "loss": 2.34, + "step": 2299 + }, + { + "epoch": 0.44276535842337034, + "grad_norm": 2.7153923012881567, + "learning_rate": 1.8139726597337026e-05, + "loss": 2.289, + "step": 2300 + }, + { + "epoch": 0.4429578651009457, + "grad_norm": 3.0565394236394954, + "learning_rate": 1.813791500563847e-05, + "loss": 2.2871, + "step": 2301 + }, + { + "epoch": 0.44315037177852107, + "grad_norm": 3.367596758507889, + "learning_rate": 1.8136102622830458e-05, + "loss": 2.3192, + "step": 2302 + }, + { + "epoch": 0.44334287845609643, + "grad_norm": 2.8463625264033836, + "learning_rate": 1.8134289449089184e-05, + "loss": 2.3273, + "step": 2303 + }, + { + "epoch": 0.4435353851336718, + "grad_norm": 2.698690460089316, + "learning_rate": 1.8132475484590905e-05, + "loss": 2.357, + "step": 2304 + }, + { + "epoch": 0.4435353851336718, + "lm_loss": 2.1319, + "step": 2304, + "vm_loss": 0.2134 + }, + { + "epoch": 0.4435353851336718, + "lm_loss": 1.8273, + "step": 2304, + "vm_loss": 0.191 + }, + { + "epoch": 0.4435353851336718, + "lm_loss": 2.1133, + "step": 2304, + "vm_loss": 0.1666 + }, + { + "epoch": 0.4435353851336718, + "lm_loss": 2.1563, + "step": 2304, + "vm_loss": 0.1183 + }, + { + "epoch": 0.4435353851336718, + "lm_loss": 2.2225, + "step": 2304, + "vm_loss": 0.218 + }, + { + "epoch": 0.4435353851336718, + "lm_loss": 2.2088, + "step": 2304, + "vm_loss": 0.1755 + }, + { + "epoch": 0.4435353851336718, + "lm_loss": 1.6983, + "step": 2304, + "vm_loss": 0.1529 + }, + { + "epoch": 0.4435353851336718, + "lm_loss": 2.0593, + "step": 2304, + "vm_loss": 0.1199 + }, + { + "epoch": 0.4437278918112472, + "grad_norm": 3.00803392508199, + "learning_rate": 1.8130660729511964e-05, + "loss": 2.2784, + "step": 2305 + }, + { + "epoch": 0.4439203984888226, + "grad_norm": 3.049338533735284, + "learning_rate": 1.812884518402878e-05, + "loss": 2.3778, + "step": 2306 + }, + { + "epoch": 0.44411290516639795, + "grad_norm": 2.5404105702401845, + "learning_rate": 1.8127028848317843e-05, + "loss": 2.3092, + "step": 2307 + }, + { + "epoch": 0.4443054118439733, + "grad_norm": 3.0215819518567986, + "learning_rate": 1.812521172255573e-05, + "loss": 2.2888, + "step": 2308 + }, + { + "epoch": 0.44449791852154874, + "grad_norm": 2.9535979655659346, + "learning_rate": 1.8123393806919084e-05, + "loss": 2.3081, + "step": 2309 + }, + { + "epoch": 0.4446904251991241, + "grad_norm": 2.895735213659374, + "learning_rate": 1.8121575101584632e-05, + "loss": 2.3651, + "step": 2310 + }, + { + "epoch": 0.4448829318766995, + "grad_norm": 2.687726853708876, + "learning_rate": 1.8119755606729174e-05, + "loss": 2.2924, + "step": 2311 + }, + { + "epoch": 0.44507543855427484, + "grad_norm": 2.693461560074217, + "learning_rate": 1.811793532252959e-05, + "loss": 2.294, + "step": 2312 + }, + { + "epoch": 0.44507543855427484, + "lm_loss": 2.2555, + "step": 2312, + "vm_loss": 0.2395 + }, + { + "epoch": 0.44507543855427484, + "lm_loss": 1.9768, + "step": 2312, + "vm_loss": 0.1428 + }, + { + "epoch": 0.44507543855427484, + "lm_loss": 2.1486, + "step": 2312, + "vm_loss": 0.1173 + }, + { + "epoch": 0.44507543855427484, + "lm_loss": 2.0378, + "step": 2312, + "vm_loss": 0.1848 + }, + { + "epoch": 0.44507543855427484, + "lm_loss": 2.1273, + "step": 2312, + "vm_loss": 0.1765 + }, + { + "epoch": 0.44507543855427484, + "lm_loss": 2.331, + "step": 2312, + "vm_loss": 0.1441 + }, + { + "epoch": 0.44507543855427484, + "lm_loss": 2.2891, + "step": 2312, + "vm_loss": 0.116 + }, + { + "epoch": 0.44507543855427484, + "lm_loss": 2.2291, + "step": 2312, + "vm_loss": 0.1412 + }, + { + "epoch": 0.4452679452318502, + "grad_norm": 3.2736052081237696, + "learning_rate": 1.811611424916283e-05, + "loss": 2.3059, + "step": 2313 + }, + { + "epoch": 0.4454604519094256, + "grad_norm": 2.881535923317366, + "learning_rate": 1.8114292386805935e-05, + "loss": 2.3288, + "step": 2314 + }, + { + "epoch": 0.445652958587001, + "grad_norm": 3.0517008846081013, + "learning_rate": 1.8112469735636007e-05, + "loss": 2.2992, + "step": 2315 + }, + { + "epoch": 0.44584546526457636, + "grad_norm": 3.195731863253457, + "learning_rate": 1.8110646295830233e-05, + "loss": 2.313, + "step": 2316 + }, + { + "epoch": 0.4460379719421517, + "grad_norm": 2.78787429459504, + "learning_rate": 1.810882206756587e-05, + "loss": 2.3179, + "step": 2317 + }, + { + "epoch": 0.44623047861972714, + "grad_norm": 3.1160825668708614, + "learning_rate": 1.810699705102026e-05, + "loss": 2.2568, + "step": 2318 + }, + { + "epoch": 0.4464229852973025, + "grad_norm": 3.008668873403132, + "learning_rate": 1.810517124637082e-05, + "loss": 2.2848, + "step": 2319 + }, + { + "epoch": 0.4466154919748779, + "grad_norm": 2.9730995722507965, + "learning_rate": 1.8103344653795042e-05, + "loss": 2.3607, + "step": 2320 + }, + { + "epoch": 0.4466154919748779, + "lm_loss": 2.2469, + "step": 2320, + "vm_loss": 0.1817 + }, + { + "epoch": 0.4466154919748779, + "lm_loss": 2.3266, + "step": 2320, + "vm_loss": 0.1429 + }, + { + "epoch": 0.4466154919748779, + "lm_loss": 2.2016, + "step": 2320, + "vm_loss": 0.1378 + }, + { + "epoch": 0.4466154919748779, + "lm_loss": 1.9088, + "step": 2320, + "vm_loss": 0.1664 + }, + { + "epoch": 0.4466154919748779, + "lm_loss": 2.1452, + "step": 2320, + "vm_loss": 0.1909 + }, + { + "epoch": 0.4466154919748779, + "lm_loss": 2.2678, + "step": 2320, + "vm_loss": 0.2041 + }, + { + "epoch": 0.4466154919748779, + "lm_loss": 2.3045, + "step": 2320, + "vm_loss": 0.1763 + }, + { + "epoch": 0.4466154919748779, + "lm_loss": 1.8293, + "step": 2320, + "vm_loss": 0.1836 + }, + { + "epoch": 0.44680799865245324, + "grad_norm": 3.138343527926431, + "learning_rate": 1.8101517273470485e-05, + "loss": 2.3157, + "step": 2321 + }, + { + "epoch": 0.4470005053300286, + "grad_norm": 2.817627156353456, + "learning_rate": 1.8099689105574805e-05, + "loss": 2.33, + "step": 2322 + }, + { + "epoch": 0.44719301200760403, + "grad_norm": 3.0224937962314007, + "learning_rate": 1.8097860150285717e-05, + "loss": 2.2621, + "step": 2323 + }, + { + "epoch": 0.4473855186851794, + "grad_norm": 3.0258571758902186, + "learning_rate": 1.809603040778102e-05, + "loss": 2.2688, + "step": 2324 + }, + { + "epoch": 0.44757802536275476, + "grad_norm": 2.8483829903936484, + "learning_rate": 1.809419987823859e-05, + "loss": 2.2751, + "step": 2325 + }, + { + "epoch": 0.4477705320403301, + "grad_norm": 2.80689706768817, + "learning_rate": 1.8092368561836377e-05, + "loss": 2.2937, + "step": 2326 + }, + { + "epoch": 0.44796303871790555, + "grad_norm": 3.277412781913552, + "learning_rate": 1.809053645875241e-05, + "loss": 2.2877, + "step": 2327 + }, + { + "epoch": 0.4481555453954809, + "grad_norm": 2.9159806121008414, + "learning_rate": 1.808870356916479e-05, + "loss": 2.3153, + "step": 2328 + }, + { + "epoch": 0.4481555453954809, + "lm_loss": 2.2425, + "step": 2328, + "vm_loss": 0.1406 + }, + { + "epoch": 0.4481555453954809, + "lm_loss": 1.8858, + "step": 2328, + "vm_loss": 0.1486 + }, + { + "epoch": 0.4481555453954809, + "lm_loss": 2.0504, + "step": 2328, + "vm_loss": 0.1112 + }, + { + "epoch": 0.4481555453954809, + "lm_loss": 1.9213, + "step": 2328, + "vm_loss": 0.1725 + }, + { + "epoch": 0.4481555453954809, + "lm_loss": 1.8681, + "step": 2328, + "vm_loss": 0.1789 + }, + { + "epoch": 0.4481555453954809, + "lm_loss": 2.1767, + "step": 2328, + "vm_loss": 0.2274 + }, + { + "epoch": 0.4481555453954809, + "lm_loss": 2.2315, + "step": 2328, + "vm_loss": 0.1855 + }, + { + "epoch": 0.4481555453954809, + "lm_loss": 2.3613, + "step": 2328, + "vm_loss": 0.1284 + }, + { + "epoch": 0.4483480520730563, + "grad_norm": 3.2099408324439365, + "learning_rate": 1.8086869893251693e-05, + "loss": 2.3188, + "step": 2329 + }, + { + "epoch": 0.44854055875063165, + "grad_norm": 2.7266249885205487, + "learning_rate": 1.8085035431191388e-05, + "loss": 2.2785, + "step": 2330 + }, + { + "epoch": 0.44873306542820707, + "grad_norm": 3.1390339205874875, + "learning_rate": 1.8083200183162198e-05, + "loss": 2.3798, + "step": 2331 + }, + { + "epoch": 0.44892557210578243, + "grad_norm": 2.9238563664288826, + "learning_rate": 1.8081364149342535e-05, + "loss": 2.3034, + "step": 2332 + }, + { + "epoch": 0.4491180787833578, + "grad_norm": 2.9322900488499895, + "learning_rate": 1.8079527329910885e-05, + "loss": 2.2561, + "step": 2333 + }, + { + "epoch": 0.44931058546093317, + "grad_norm": 2.8551518564765472, + "learning_rate": 1.8077689725045818e-05, + "loss": 2.3207, + "step": 2334 + }, + { + "epoch": 0.44950309213850853, + "grad_norm": 2.8620785893899843, + "learning_rate": 1.807585133492596e-05, + "loss": 2.2502, + "step": 2335 + }, + { + "epoch": 0.44969559881608395, + "grad_norm": 3.229618433456999, + "learning_rate": 1.8074012159730034e-05, + "loss": 2.3092, + "step": 2336 + }, + { + "epoch": 0.44969559881608395, + "lm_loss": 2.1694, + "step": 2336, + "vm_loss": 0.2069 + }, + { + "epoch": 0.44969559881608395, + "lm_loss": 2.0757, + "step": 2336, + "vm_loss": 0.1366 + }, + { + "epoch": 0.44969559881608395, + "lm_loss": 2.3814, + "step": 2336, + "vm_loss": 0.1233 + }, + { + "epoch": 0.44969559881608395, + "lm_loss": 2.3128, + "step": 2336, + "vm_loss": 0.3002 + }, + { + "epoch": 0.44969559881608395, + "lm_loss": 2.1168, + "step": 2336, + "vm_loss": 0.1991 + }, + { + "epoch": 0.44969559881608395, + "lm_loss": 2.3629, + "step": 2336, + "vm_loss": 0.1138 + }, + { + "epoch": 0.44969559881608395, + "lm_loss": 2.3175, + "step": 2336, + "vm_loss": 0.2315 + }, + { + "epoch": 0.44969559881608395, + "lm_loss": 2.116, + "step": 2336, + "vm_loss": 0.1791 + }, + { + "epoch": 0.4498881054936593, + "grad_norm": 2.8240984073782736, + "learning_rate": 1.807217219963683e-05, + "loss": 2.3496, + "step": 2337 + }, + { + "epoch": 0.4500806121712347, + "grad_norm": 2.7334664084967812, + "learning_rate": 1.807033145482521e-05, + "loss": 2.2459, + "step": 2338 + }, + { + "epoch": 0.45027311884881005, + "grad_norm": 2.994307892206342, + "learning_rate": 1.806848992547413e-05, + "loss": 2.3126, + "step": 2339 + }, + { + "epoch": 0.45046562552638547, + "grad_norm": 2.926334002817865, + "learning_rate": 1.8066647611762596e-05, + "loss": 2.2942, + "step": 2340 + }, + { + "epoch": 0.45065813220396084, + "grad_norm": 2.6010397355706996, + "learning_rate": 1.8064804513869716e-05, + "loss": 2.2984, + "step": 2341 + }, + { + "epoch": 0.4508506388815362, + "grad_norm": 2.9548169531187147, + "learning_rate": 1.8062960631974657e-05, + "loss": 2.3312, + "step": 2342 + }, + { + "epoch": 0.45104314555911157, + "grad_norm": 2.9293199107374934, + "learning_rate": 1.806111596625667e-05, + "loss": 2.3716, + "step": 2343 + }, + { + "epoch": 0.45123565223668693, + "grad_norm": 3.0609017607682056, + "learning_rate": 1.805927051689508e-05, + "loss": 2.2608, + "step": 2344 + }, + { + "epoch": 0.45123565223668693, + "lm_loss": 1.8285, + "step": 2344, + "vm_loss": 0.1427 + }, + { + "epoch": 0.45123565223668693, + "lm_loss": 2.1332, + "step": 2344, + "vm_loss": 0.1245 + }, + { + "epoch": 0.45123565223668693, + "lm_loss": 2.3643, + "step": 2344, + "vm_loss": 0.1329 + }, + { + "epoch": 0.45123565223668693, + "lm_loss": 2.1365, + "step": 2344, + "vm_loss": 0.1446 + }, + { + "epoch": 0.45123565223668693, + "lm_loss": 2.1722, + "step": 2344, + "vm_loss": 0.1855 + }, + { + "epoch": 0.45123565223668693, + "lm_loss": 2.2516, + "step": 2344, + "vm_loss": 0.1665 + }, + { + "epoch": 0.45123565223668693, + "lm_loss": 2.0841, + "step": 2344, + "vm_loss": 0.1727 + }, + { + "epoch": 0.45123565223668693, + "lm_loss": 2.2478, + "step": 2344, + "vm_loss": 0.1252 + }, + { + "epoch": 0.45142815891426236, + "grad_norm": 2.848659581249523, + "learning_rate": 1.8057424284069287e-05, + "loss": 2.2831, + "step": 2345 + }, + { + "epoch": 0.4516206655918377, + "grad_norm": 3.1172548747918056, + "learning_rate": 1.805557726795877e-05, + "loss": 2.3554, + "step": 2346 + }, + { + "epoch": 0.4518131722694131, + "grad_norm": 2.766966260579383, + "learning_rate": 1.805372946874308e-05, + "loss": 2.2855, + "step": 2347 + }, + { + "epoch": 0.45200567894698845, + "grad_norm": 3.1466402905754913, + "learning_rate": 1.8051880886601845e-05, + "loss": 2.2876, + "step": 2348 + }, + { + "epoch": 0.4521981856245639, + "grad_norm": 2.8340130616752646, + "learning_rate": 1.805003152171478e-05, + "loss": 2.3182, + "step": 2349 + }, + { + "epoch": 0.45239069230213924, + "grad_norm": 3.1059274593784623, + "learning_rate": 1.8048181374261656e-05, + "loss": 2.256, + "step": 2350 + }, + { + "epoch": 0.4525831989797146, + "grad_norm": 2.868401927654121, + "learning_rate": 1.804633044442234e-05, + "loss": 2.2972, + "step": 2351 + }, + { + "epoch": 0.45277570565729, + "grad_norm": 2.855277595725172, + "learning_rate": 1.804447873237676e-05, + "loss": 2.2832, + "step": 2352 + }, + { + "epoch": 0.45277570565729, + "lm_loss": 1.975, + "step": 2352, + "vm_loss": 0.1969 + }, + { + "epoch": 0.45277570565729, + "lm_loss": 2.1758, + "step": 2352, + "vm_loss": 0.1215 + }, + { + "epoch": 0.45277570565729, + "lm_loss": 1.6455, + "step": 2352, + "vm_loss": 0.1977 + }, + { + "epoch": 0.45277570565729, + "lm_loss": 2.3612, + "step": 2352, + "vm_loss": 0.2306 + }, + { + "epoch": 0.45277570565729, + "lm_loss": 1.8856, + "step": 2352, + "vm_loss": 0.177 + }, + { + "epoch": 0.45277570565729, + "lm_loss": 2.1037, + "step": 2352, + "vm_loss": 0.1459 + }, + { + "epoch": 0.45277570565729, + "lm_loss": 2.2165, + "step": 2352, + "vm_loss": 0.1122 + }, + { + "epoch": 0.45277570565729, + "lm_loss": 2.2202, + "step": 2352, + "vm_loss": 0.2309 + }, + { + "epoch": 0.45296821233486534, + "grad_norm": 2.930887479217807, + "learning_rate": 1.804262623830493e-05, + "loss": 2.3317, + "step": 2353 + }, + { + "epoch": 0.45316071901244076, + "grad_norm": 2.8845706527768464, + "learning_rate": 1.8040772962386934e-05, + "loss": 2.2809, + "step": 2354 + }, + { + "epoch": 0.4533532256900161, + "grad_norm": 2.483947272889894, + "learning_rate": 1.8038918904802932e-05, + "loss": 2.304, + "step": 2355 + }, + { + "epoch": 0.4535457323675915, + "grad_norm": 3.103766313101901, + "learning_rate": 1.803706406573317e-05, + "loss": 2.2961, + "step": 2356 + }, + { + "epoch": 0.45373823904516686, + "grad_norm": 2.773846275925031, + "learning_rate": 1.8035208445357955e-05, + "loss": 2.3007, + "step": 2357 + }, + { + "epoch": 0.4539307457227423, + "grad_norm": 2.931230848773706, + "learning_rate": 1.8033352043857677e-05, + "loss": 2.2822, + "step": 2358 + }, + { + "epoch": 0.45412325240031765, + "grad_norm": 3.020776139813076, + "learning_rate": 1.8031494861412804e-05, + "loss": 2.2981, + "step": 2359 + }, + { + "epoch": 0.454315759077893, + "grad_norm": 3.122936422808837, + "learning_rate": 1.8029636898203877e-05, + "loss": 2.3055, + "step": 2360 + }, + { + "epoch": 0.454315759077893, + "lm_loss": 1.6894, + "step": 2360, + "vm_loss": 0.1853 + }, + { + "epoch": 0.454315759077893, + "lm_loss": 2.0346, + "step": 2360, + "vm_loss": 0.1755 + }, + { + "epoch": 0.454315759077893, + "lm_loss": 2.4783, + "step": 2360, + "vm_loss": 0.1707 + }, + { + "epoch": 0.454315759077893, + "lm_loss": 2.2976, + "step": 2360, + "vm_loss": 0.171 + }, + { + "epoch": 0.454315759077893, + "lm_loss": 1.8729, + "step": 2360, + "vm_loss": 0.1501 + }, + { + "epoch": 0.454315759077893, + "lm_loss": 2.1979, + "step": 2360, + "vm_loss": 0.1738 + }, + { + "epoch": 0.454315759077893, + "lm_loss": 2.3412, + "step": 2360, + "vm_loss": 0.1764 + }, + { + "epoch": 0.454315759077893, + "lm_loss": 2.0552, + "step": 2360, + "vm_loss": 0.0952 + }, + { + "epoch": 0.4545082657554684, + "grad_norm": 2.7638142005128223, + "learning_rate": 1.8027778154411515e-05, + "loss": 2.3154, + "step": 2361 + }, + { + "epoch": 0.4547007724330438, + "grad_norm": 3.064304999190845, + "learning_rate": 1.802591863021641e-05, + "loss": 2.3077, + "step": 2362 + }, + { + "epoch": 0.45489327911061916, + "grad_norm": 5.562591732814892, + "learning_rate": 1.8024058325799335e-05, + "loss": 2.3113, + "step": 2363 + }, + { + "epoch": 0.45508578578819453, + "grad_norm": 22.97063855165341, + "learning_rate": 1.8022197241341134e-05, + "loss": 2.5621, + "step": 2364 + }, + { + "epoch": 0.4552782924657699, + "grad_norm": 8.15658944492121, + "learning_rate": 1.8020335377022723e-05, + "loss": 2.666, + "step": 2365 + }, + { + "epoch": 0.45547079914334526, + "grad_norm": 6.85679448293632, + "learning_rate": 1.8018472733025108e-05, + "loss": 2.7253, + "step": 2366 + }, + { + "epoch": 0.4556633058209207, + "grad_norm": 5.327370792129126, + "learning_rate": 1.8016609309529353e-05, + "loss": 2.6148, + "step": 2367 + }, + { + "epoch": 0.45585581249849605, + "grad_norm": 5.658883351664112, + "learning_rate": 1.8014745106716612e-05, + "loss": 2.5471, + "step": 2368 + }, + { + "epoch": 0.45585581249849605, + "lm_loss": 2.2808, + "step": 2368, + "vm_loss": 0.1296 + }, + { + "epoch": 0.45585581249849605, + "lm_loss": 2.1064, + "step": 2368, + "vm_loss": 0.2059 + }, + { + "epoch": 0.45585581249849605, + "lm_loss": 2.4046, + "step": 2368, + "vm_loss": 0.1829 + }, + { + "epoch": 0.45585581249849605, + "lm_loss": 2.391, + "step": 2368, + "vm_loss": 0.1135 + }, + { + "epoch": 0.45585581249849605, + "lm_loss": 2.2072, + "step": 2368, + "vm_loss": 0.1622 + }, + { + "epoch": 0.45585581249849605, + "lm_loss": 2.4615, + "step": 2368, + "vm_loss": 0.1536 + }, + { + "epoch": 0.45585581249849605, + "lm_loss": 2.3372, + "step": 2368, + "vm_loss": 0.1414 + }, + { + "epoch": 0.45585581249849605, + "lm_loss": 2.2365, + "step": 2368, + "vm_loss": 0.1626 + }, + { + "epoch": 0.4560483191760714, + "grad_norm": 3.916581168435844, + "learning_rate": 1.801288012476811e-05, + "loss": 2.4711, + "step": 2369 + }, + { + "epoch": 0.4562408258536468, + "grad_norm": 17.491146549436323, + "learning_rate": 1.8011014363865143e-05, + "loss": 2.5565, + "step": 2370 + }, + { + "epoch": 0.4564333325312222, + "grad_norm": 3.8577337976652992, + "learning_rate": 1.800914782418909e-05, + "loss": 2.4503, + "step": 2371 + }, + { + "epoch": 0.45662583920879757, + "grad_norm": 4.307679763305614, + "learning_rate": 1.8007280505921404e-05, + "loss": 2.4555, + "step": 2372 + }, + { + "epoch": 0.45681834588637293, + "grad_norm": 4.4976377989056715, + "learning_rate": 1.8005412409243604e-05, + "loss": 2.495, + "step": 2373 + }, + { + "epoch": 0.4570108525639483, + "grad_norm": 3.9387534614636563, + "learning_rate": 1.8003543534337303e-05, + "loss": 2.5048, + "step": 2374 + }, + { + "epoch": 0.45720335924152367, + "grad_norm": 3.0372727514126185, + "learning_rate": 1.8001673881384176e-05, + "loss": 2.4397, + "step": 2375 + }, + { + "epoch": 0.4573958659190991, + "grad_norm": 4.210620707311464, + "learning_rate": 1.7999803450565974e-05, + "loss": 2.3988, + "step": 2376 + }, + { + "epoch": 0.4573958659190991, + "lm_loss": 2.1933, + "step": 2376, + "vm_loss": 0.1843 + }, + { + "epoch": 0.4573958659190991, + "lm_loss": 2.2809, + "step": 2376, + "vm_loss": 0.1678 + }, + { + "epoch": 0.4573958659190991, + "lm_loss": 2.4831, + "step": 2376, + "vm_loss": 0.1741 + }, + { + "epoch": 0.4573958659190991, + "lm_loss": 2.2587, + "step": 2376, + "vm_loss": 0.1264 + }, + { + "epoch": 0.4573958659190991, + "lm_loss": 2.0197, + "step": 2376, + "vm_loss": 0.158 + }, + { + "epoch": 0.4573958659190991, + "lm_loss": 2.1841, + "step": 2376, + "vm_loss": 0.1157 + }, + { + "epoch": 0.4573958659190991, + "lm_loss": 2.3972, + "step": 2376, + "vm_loss": 0.1552 + }, + { + "epoch": 0.4573958659190991, + "lm_loss": 2.2493, + "step": 2376, + "vm_loss": 0.1306 + }, + { + "epoch": 0.45758837259667445, + "grad_norm": 6.104813099053376, + "learning_rate": 1.7997932242064527e-05, + "loss": 2.3671, + "step": 2377 + }, + { + "epoch": 0.4577808792742498, + "grad_norm": 3.7715778172645056, + "learning_rate": 1.799606025606175e-05, + "loss": 2.4, + "step": 2378 + }, + { + "epoch": 0.4579733859518252, + "grad_norm": 3.218804062332221, + "learning_rate": 1.799418749273961e-05, + "loss": 2.4755, + "step": 2379 + }, + { + "epoch": 0.4581658926294006, + "grad_norm": 3.1883713762000196, + "learning_rate": 1.7992313952280175e-05, + "loss": 2.4082, + "step": 2380 + }, + { + "epoch": 0.45835839930697597, + "grad_norm": 3.889008555999607, + "learning_rate": 1.7990439634865566e-05, + "loss": 2.3559, + "step": 2381 + }, + { + "epoch": 0.45855090598455134, + "grad_norm": 2.930844843482591, + "learning_rate": 1.7988564540678e-05, + "loss": 2.3576, + "step": 2382 + }, + { + "epoch": 0.4587434126621267, + "grad_norm": 2.995541205198694, + "learning_rate": 1.798668866989976e-05, + "loss": 2.3986, + "step": 2383 + }, + { + "epoch": 0.45893591933970207, + "grad_norm": 3.4163952463172556, + "learning_rate": 1.79848120227132e-05, + "loss": 2.3668, + "step": 2384 + }, + { + "epoch": 0.45893591933970207, + "lm_loss": 2.095, + "step": 2384, + "vm_loss": 0.2456 + }, + { + "epoch": 0.45893591933970207, + "lm_loss": 2.1814, + "step": 2384, + "vm_loss": 0.1981 + }, + { + "epoch": 0.45893591933970207, + "lm_loss": 2.2911, + "step": 2384, + "vm_loss": 0.2083 + }, + { + "epoch": 0.45893591933970207, + "lm_loss": 2.3935, + "step": 2384, + "vm_loss": 0.2145 + }, + { + "epoch": 0.45893591933970207, + "lm_loss": 2.0076, + "step": 2384, + "vm_loss": 0.1959 + }, + { + "epoch": 0.45893591933970207, + "lm_loss": 2.0577, + "step": 2384, + "vm_loss": 0.2581 + }, + { + "epoch": 0.45893591933970207, + "lm_loss": 2.2319, + "step": 2384, + "vm_loss": 0.1164 + }, + { + "epoch": 0.45893591933970207, + "lm_loss": 2.0758, + "step": 2384, + "vm_loss": 0.223 + }, + { + "epoch": 0.4591284260172775, + "grad_norm": 3.083416169116607, + "learning_rate": 1.7982934599300754e-05, + "loss": 2.4253, + "step": 2385 + }, + { + "epoch": 0.45932093269485286, + "grad_norm": 3.1549298832566333, + "learning_rate": 1.7981056399844934e-05, + "loss": 2.3548, + "step": 2386 + }, + { + "epoch": 0.4595134393724282, + "grad_norm": 3.6372264184634973, + "learning_rate": 1.7979177424528325e-05, + "loss": 2.35, + "step": 2387 + }, + { + "epoch": 0.4597059460500036, + "grad_norm": 2.9445602397701194, + "learning_rate": 1.7977297673533586e-05, + "loss": 2.3203, + "step": 2388 + }, + { + "epoch": 0.459898452727579, + "grad_norm": 2.979596609096832, + "learning_rate": 1.7975417147043453e-05, + "loss": 2.3105, + "step": 2389 + }, + { + "epoch": 0.4600909594051544, + "grad_norm": 3.1092741789944376, + "learning_rate": 1.7973535845240743e-05, + "loss": 2.3725, + "step": 2390 + }, + { + "epoch": 0.46028346608272974, + "grad_norm": 2.960922411807894, + "learning_rate": 1.7971653768308334e-05, + "loss": 2.31, + "step": 2391 + }, + { + "epoch": 0.4604759727603051, + "grad_norm": 2.80248806718247, + "learning_rate": 1.796977091642919e-05, + "loss": 2.3358, + "step": 2392 + }, + { + "epoch": 0.4604759727603051, + "lm_loss": 2.1382, + "step": 2392, + "vm_loss": 0.1537 + }, + { + "epoch": 0.4604759727603051, + "lm_loss": 2.03, + "step": 2392, + "vm_loss": 0.1447 + }, + { + "epoch": 0.4604759727603051, + "lm_loss": 2.3969, + "step": 2392, + "vm_loss": 0.1564 + }, + { + "epoch": 0.4604759727603051, + "lm_loss": 2.2939, + "step": 2392, + "vm_loss": 0.1439 + }, + { + "epoch": 0.4604759727603051, + "lm_loss": 2.1927, + "step": 2392, + "vm_loss": 0.1642 + }, + { + "epoch": 0.4604759727603051, + "lm_loss": 1.9511, + "step": 2392, + "vm_loss": 0.2459 + }, + { + "epoch": 0.4604759727603051, + "lm_loss": 2.4046, + "step": 2392, + "vm_loss": 0.1658 + }, + { + "epoch": 0.4604759727603051, + "lm_loss": 2.1387, + "step": 2392, + "vm_loss": 0.1299 + }, + { + "epoch": 0.46066847943788053, + "grad_norm": 2.990245336043424, + "learning_rate": 1.7967887289786352e-05, + "loss": 2.3125, + "step": 2393 + }, + { + "epoch": 0.4608609861154559, + "grad_norm": 3.2394163591528478, + "learning_rate": 1.796600288856293e-05, + "loss": 2.3457, + "step": 2394 + }, + { + "epoch": 0.46105349279303126, + "grad_norm": 2.779711196596348, + "learning_rate": 1.7964117712942112e-05, + "loss": 2.3019, + "step": 2395 + }, + { + "epoch": 0.4612459994706066, + "grad_norm": 2.776740214777698, + "learning_rate": 1.7962231763107158e-05, + "loss": 2.3017, + "step": 2396 + }, + { + "epoch": 0.461438506148182, + "grad_norm": 3.017331508243643, + "learning_rate": 1.7960345039241415e-05, + "loss": 2.3713, + "step": 2397 + }, + { + "epoch": 0.4616310128257574, + "grad_norm": 3.03999712944687, + "learning_rate": 1.7958457541528292e-05, + "loss": 2.3442, + "step": 2398 + }, + { + "epoch": 0.4618235195033328, + "grad_norm": 3.0063769137489795, + "learning_rate": 1.7956569270151274e-05, + "loss": 2.3298, + "step": 2399 + }, + { + "epoch": 0.46201602618090815, + "grad_norm": 2.8908918455742194, + "learning_rate": 1.795468022529393e-05, + "loss": 2.3644, + "step": 2400 + }, + { + "epoch": 0.46201602618090815, + "lm_loss": 2.3922, + "step": 2400, + "vm_loss": 0.2353 + }, + { + "epoch": 0.46201602618090815, + "lm_loss": 2.0621, + "step": 2400, + "vm_loss": 0.167 + }, + { + "epoch": 0.46201602618090815, + "lm_loss": 2.3499, + "step": 2400, + "vm_loss": 0.1581 + }, + { + "epoch": 0.46201602618090815, + "lm_loss": 1.9638, + "step": 2400, + "vm_loss": 0.147 + }, + { + "epoch": 0.46201602618090815, + "lm_loss": 1.9578, + "step": 2400, + "vm_loss": 0.1534 + }, + { + "epoch": 0.46201602618090815, + "lm_loss": 2.4211, + "step": 2400, + "vm_loss": 0.1362 + }, + { + "epoch": 0.46201602618090815, + "lm_loss": 2.3138, + "step": 2400, + "vm_loss": 0.1601 + }, + { + "epoch": 0.46201602618090815, + "lm_loss": 2.3924, + "step": 2400, + "vm_loss": 0.226 + }, + { + "epoch": 0.4622085328584835, + "grad_norm": 2.865274471298513, + "learning_rate": 1.79527904071399e-05, + "loss": 2.3759, + "step": 2401 + }, + { + "epoch": 0.46240103953605893, + "grad_norm": 2.9435011174637213, + "learning_rate": 1.7950899815872894e-05, + "loss": 2.339, + "step": 2402 + }, + { + "epoch": 0.4625935462136343, + "grad_norm": 2.9114939429828786, + "learning_rate": 1.7949008451676703e-05, + "loss": 2.3239, + "step": 2403 + }, + { + "epoch": 0.46278605289120966, + "grad_norm": 3.07677194247858, + "learning_rate": 1.7947116314735197e-05, + "loss": 2.3558, + "step": 2404 + }, + { + "epoch": 0.46297855956878503, + "grad_norm": 3.125604945990555, + "learning_rate": 1.7945223405232307e-05, + "loss": 2.3249, + "step": 2405 + }, + { + "epoch": 0.4631710662463604, + "grad_norm": 3.262357826726187, + "learning_rate": 1.7943329723352057e-05, + "loss": 2.2219, + "step": 2406 + }, + { + "epoch": 0.4633635729239358, + "grad_norm": 2.7454123096868996, + "learning_rate": 1.7941435269278533e-05, + "loss": 2.3578, + "step": 2407 + }, + { + "epoch": 0.4635560796015112, + "grad_norm": 2.817129068134592, + "learning_rate": 1.7939540043195898e-05, + "loss": 2.2635, + "step": 2408 + }, + { + "epoch": 0.4635560796015112, + "lm_loss": 2.2138, + "step": 2408, + "vm_loss": 0.2145 + }, + { + "epoch": 0.4635560796015112, + "lm_loss": 2.4109, + "step": 2408, + "vm_loss": 0.1977 + }, + { + "epoch": 0.4635560796015112, + "lm_loss": 2.4497, + "step": 2408, + "vm_loss": 0.1382 + }, + { + "epoch": 0.4635560796015112, + "lm_loss": 2.198, + "step": 2408, + "vm_loss": 0.1555 + }, + { + "epoch": 0.4635560796015112, + "lm_loss": 1.9394, + "step": 2408, + "vm_loss": 0.2104 + }, + { + "epoch": 0.4635560796015112, + "lm_loss": 2.0549, + "step": 2408, + "vm_loss": 0.1644 + }, + { + "epoch": 0.4635560796015112, + "lm_loss": 2.2088, + "step": 2408, + "vm_loss": 0.1757 + }, + { + "epoch": 0.4635560796015112, + "lm_loss": 2.2197, + "step": 2408, + "vm_loss": 0.1515 + }, + { + "epoch": 0.46374858627908655, + "grad_norm": 3.11112440172383, + "learning_rate": 1.7937644045288395e-05, + "loss": 2.2915, + "step": 2409 + }, + { + "epoch": 0.4639410929566619, + "grad_norm": 3.125630245160032, + "learning_rate": 1.793574727574034e-05, + "loss": 2.3692, + "step": 2410 + }, + { + "epoch": 0.46413359963423734, + "grad_norm": 3.077932733601034, + "learning_rate": 1.7933849734736123e-05, + "loss": 2.3446, + "step": 2411 + }, + { + "epoch": 0.4643261063118127, + "grad_norm": 2.735712827532773, + "learning_rate": 1.7931951422460203e-05, + "loss": 2.3003, + "step": 2412 + }, + { + "epoch": 0.46451861298938807, + "grad_norm": 2.6751770647790414, + "learning_rate": 1.793005233909713e-05, + "loss": 2.3522, + "step": 2413 + }, + { + "epoch": 0.46471111966696343, + "grad_norm": 2.5434138639147617, + "learning_rate": 1.7928152484831512e-05, + "loss": 2.2867, + "step": 2414 + }, + { + "epoch": 0.4649036263445388, + "grad_norm": 2.853816866835236, + "learning_rate": 1.792625185984804e-05, + "loss": 2.3234, + "step": 2415 + }, + { + "epoch": 0.4650961330221142, + "grad_norm": 2.8692653804710972, + "learning_rate": 1.792435046433149e-05, + "loss": 2.3467, + "step": 2416 + }, + { + "epoch": 0.4650961330221142, + "lm_loss": 1.7357, + "step": 2416, + "vm_loss": 0.1757 + }, + { + "epoch": 0.4650961330221142, + "lm_loss": 1.9325, + "step": 2416, + "vm_loss": 0.1275 + }, + { + "epoch": 0.4650961330221142, + "lm_loss": 1.9983, + "step": 2416, + "vm_loss": 0.1518 + }, + { + "epoch": 0.4650961330221142, + "lm_loss": 1.971, + "step": 2416, + "vm_loss": 0.1367 + }, + { + "epoch": 0.4650961330221142, + "lm_loss": 2.07, + "step": 2416, + "vm_loss": 0.2543 + }, + { + "epoch": 0.4650961330221142, + "lm_loss": 1.9344, + "step": 2416, + "vm_loss": 0.1697 + }, + { + "epoch": 0.4650961330221142, + "lm_loss": 1.9694, + "step": 2416, + "vm_loss": 0.1418 + }, + { + "epoch": 0.4650961330221142, + "lm_loss": 2.1749, + "step": 2416, + "vm_loss": 0.1965 + }, + { + "epoch": 0.4652886396996896, + "grad_norm": 2.691369060497828, + "learning_rate": 1.7922448298466687e-05, + "loss": 2.3143, + "step": 2417 + }, + { + "epoch": 0.46548114637726495, + "grad_norm": 3.0007583602816896, + "learning_rate": 1.792054536243855e-05, + "loss": 2.2396, + "step": 2418 + }, + { + "epoch": 0.4656736530548403, + "grad_norm": 2.5770219085681996, + "learning_rate": 1.7918641656432073e-05, + "loss": 2.3234, + "step": 2419 + }, + { + "epoch": 0.46586615973241574, + "grad_norm": 2.5414521653131903, + "learning_rate": 1.7916737180632316e-05, + "loss": 2.2853, + "step": 2420 + }, + { + "epoch": 0.4660586664099911, + "grad_norm": 2.8400366264995167, + "learning_rate": 1.791483193522442e-05, + "loss": 2.2633, + "step": 2421 + }, + { + "epoch": 0.4662511730875665, + "grad_norm": 2.9830389865983182, + "learning_rate": 1.79129259203936e-05, + "loss": 2.3212, + "step": 2422 + }, + { + "epoch": 0.46644367976514184, + "grad_norm": 2.5753937513011347, + "learning_rate": 1.791101913632515e-05, + "loss": 2.2955, + "step": 2423 + }, + { + "epoch": 0.4666361864427172, + "grad_norm": 2.962506863682127, + "learning_rate": 1.790911158320442e-05, + "loss": 2.3183, + "step": 2424 + }, + { + "epoch": 0.4666361864427172, + "lm_loss": 2.151, + "step": 2424, + "vm_loss": 0.2178 + }, + { + "epoch": 0.4666361864427172, + "lm_loss": 1.882, + "step": 2424, + "vm_loss": 0.2117 + }, + { + "epoch": 0.4666361864427172, + "lm_loss": 2.3452, + "step": 2424, + "vm_loss": 0.1314 + }, + { + "epoch": 0.4666361864427172, + "lm_loss": 1.9555, + "step": 2424, + "vm_loss": 0.1361 + }, + { + "epoch": 0.4666361864427172, + "lm_loss": 2.0452, + "step": 2424, + "vm_loss": 0.1335 + }, + { + "epoch": 0.4666361864427172, + "lm_loss": 2.1511, + "step": 2424, + "vm_loss": 0.1654 + }, + { + "epoch": 0.4666361864427172, + "lm_loss": 2.1451, + "step": 2424, + "vm_loss": 0.132 + }, + { + "epoch": 0.4666361864427172, + "lm_loss": 1.926, + "step": 2424, + "vm_loss": 0.1724 + }, + { + "epoch": 0.4668286931202926, + "grad_norm": 2.790728340621961, + "learning_rate": 1.7907203261216862e-05, + "loss": 2.2839, + "step": 2425 + }, + { + "epoch": 0.467021199797868, + "grad_norm": 2.8447247087147938, + "learning_rate": 1.7905294170547983e-05, + "loss": 2.245, + "step": 2426 + }, + { + "epoch": 0.46721370647544336, + "grad_norm": 2.78874672244569, + "learning_rate": 1.7903384311383374e-05, + "loss": 2.2752, + "step": 2427 + }, + { + "epoch": 0.4674062131530187, + "grad_norm": 2.8421213752418777, + "learning_rate": 1.7901473683908694e-05, + "loss": 2.2709, + "step": 2428 + }, + { + "epoch": 0.46759871983059415, + "grad_norm": 2.7556359850468835, + "learning_rate": 1.7899562288309683e-05, + "loss": 2.2624, + "step": 2429 + }, + { + "epoch": 0.4677912265081695, + "grad_norm": 2.614723145076032, + "learning_rate": 1.7897650124772154e-05, + "loss": 2.3186, + "step": 2430 + }, + { + "epoch": 0.4679837331857449, + "grad_norm": 2.871992409114372, + "learning_rate": 1.7895737193481993e-05, + "loss": 2.3152, + "step": 2431 + }, + { + "epoch": 0.46817623986332024, + "grad_norm": 2.783729285624364, + "learning_rate": 1.7893823494625158e-05, + "loss": 2.3507, + "step": 2432 + }, + { + "epoch": 0.46817623986332024, + "lm_loss": 2.1246, + "step": 2432, + "vm_loss": 0.1464 + }, + { + "epoch": 0.46817623986332024, + "lm_loss": 2.3636, + "step": 2432, + "vm_loss": 0.2153 + }, + { + "epoch": 0.46817623986332024, + "lm_loss": 2.1422, + "step": 2432, + "vm_loss": 0.1535 + }, + { + "epoch": 0.46817623986332024, + "lm_loss": 2.1018, + "step": 2432, + "vm_loss": 0.1812 + }, + { + "epoch": 0.46817623986332024, + "lm_loss": 1.8501, + "step": 2432, + "vm_loss": 0.1071 + }, + { + "epoch": 0.46817623986332024, + "lm_loss": 2.3932, + "step": 2432, + "vm_loss": 0.1056 + }, + { + "epoch": 0.46817623986332024, + "lm_loss": 2.2017, + "step": 2432, + "vm_loss": 0.1615 + }, + { + "epoch": 0.46817623986332024, + "lm_loss": 1.8816, + "step": 2432, + "vm_loss": 0.2047 + }, + { + "epoch": 0.46836874654089566, + "grad_norm": 2.683320317098717, + "learning_rate": 1.7891909028387688e-05, + "loss": 2.2729, + "step": 2433 + }, + { + "epoch": 0.46856125321847103, + "grad_norm": 2.6615637362099966, + "learning_rate": 1.7889993794955695e-05, + "loss": 2.3065, + "step": 2434 + }, + { + "epoch": 0.4687537598960464, + "grad_norm": 2.7101461328072403, + "learning_rate": 1.7888077794515362e-05, + "loss": 2.2897, + "step": 2435 + }, + { + "epoch": 0.46894626657362176, + "grad_norm": 2.895318792334533, + "learning_rate": 1.788616102725295e-05, + "loss": 2.246, + "step": 2436 + }, + { + "epoch": 0.4691387732511971, + "grad_norm": 3.284331367802261, + "learning_rate": 1.7884243493354792e-05, + "loss": 2.3085, + "step": 2437 + }, + { + "epoch": 0.46933127992877255, + "grad_norm": 2.742929348158021, + "learning_rate": 1.78823251930073e-05, + "loss": 2.2718, + "step": 2438 + }, + { + "epoch": 0.4695237866063479, + "grad_norm": 2.838975070062055, + "learning_rate": 1.7880406126396952e-05, + "loss": 2.2585, + "step": 2439 + }, + { + "epoch": 0.4697162932839233, + "grad_norm": 3.21180468749498, + "learning_rate": 1.7878486293710308e-05, + "loss": 2.3511, + "step": 2440 + }, + { + "epoch": 0.4697162932839233, + "lm_loss": 2.2502, + "step": 2440, + "vm_loss": 0.1257 + }, + { + "epoch": 0.4697162932839233, + "lm_loss": 1.7412, + "step": 2440, + "vm_loss": 0.1683 + }, + { + "epoch": 0.4697162932839233, + "lm_loss": 2.3208, + "step": 2440, + "vm_loss": 0.1997 + }, + { + "epoch": 0.4697162932839233, + "lm_loss": 2.3066, + "step": 2440, + "vm_loss": 0.17 + }, + { + "epoch": 0.4697162932839233, + "lm_loss": 2.0328, + "step": 2440, + "vm_loss": 0.1833 + }, + { + "epoch": 0.4697162932839233, + "lm_loss": 1.981, + "step": 2440, + "vm_loss": 0.1717 + }, + { + "epoch": 0.4697162932839233, + "lm_loss": 2.2021, + "step": 2440, + "vm_loss": 0.1877 + }, + { + "epoch": 0.4697162932839233, + "lm_loss": 2.3943, + "step": 2440, + "vm_loss": 0.1681 + }, + { + "epoch": 0.46990879996149865, + "grad_norm": 2.6299559480312054, + "learning_rate": 1.7876565695134002e-05, + "loss": 2.3137, + "step": 2441 + }, + { + "epoch": 0.47010130663907407, + "grad_norm": 2.9379007138784874, + "learning_rate": 1.7874644330854742e-05, + "loss": 2.3026, + "step": 2442 + }, + { + "epoch": 0.47029381331664943, + "grad_norm": 2.8774306668916507, + "learning_rate": 1.7872722201059305e-05, + "loss": 2.3318, + "step": 2443 + }, + { + "epoch": 0.4704863199942248, + "grad_norm": 2.7792748659879907, + "learning_rate": 1.7870799305934547e-05, + "loss": 2.3365, + "step": 2444 + }, + { + "epoch": 0.47067882667180017, + "grad_norm": 3.0319054358484507, + "learning_rate": 1.7868875645667404e-05, + "loss": 2.261, + "step": 2445 + }, + { + "epoch": 0.47087133334937553, + "grad_norm": 2.7070302029013407, + "learning_rate": 1.786695122044487e-05, + "loss": 2.2547, + "step": 2446 + }, + { + "epoch": 0.47106384002695095, + "grad_norm": 3.440815425482522, + "learning_rate": 1.7865026030454037e-05, + "loss": 2.314, + "step": 2447 + }, + { + "epoch": 0.4712563467045263, + "grad_norm": 2.9166625202947967, + "learning_rate": 1.786310007588204e-05, + "loss": 2.3376, + "step": 2448 + }, + { + "epoch": 0.4712563467045263, + "lm_loss": 2.3058, + "step": 2448, + "vm_loss": 0.1383 + }, + { + "epoch": 0.4712563467045263, + "lm_loss": 2.3001, + "step": 2448, + "vm_loss": 0.111 + }, + { + "epoch": 0.4712563467045263, + "lm_loss": 1.9594, + "step": 2448, + "vm_loss": 0.1409 + }, + { + "epoch": 0.4712563467045263, + "lm_loss": 1.8172, + "step": 2448, + "vm_loss": 0.1096 + }, + { + "epoch": 0.4712563467045263, + "lm_loss": 2.3719, + "step": 2448, + "vm_loss": 0.1769 + }, + { + "epoch": 0.4712563467045263, + "lm_loss": 1.8892, + "step": 2448, + "vm_loss": 0.1321 + }, + { + "epoch": 0.4712563467045263, + "lm_loss": 2.1904, + "step": 2448, + "vm_loss": 0.1844 + }, + { + "epoch": 0.4712563467045263, + "lm_loss": 1.9629, + "step": 2448, + "vm_loss": 0.1281 + }, + { + "epoch": 0.4714488533821017, + "grad_norm": 2.93973245685304, + "learning_rate": 1.7861173356916127e-05, + "loss": 2.2693, + "step": 2449 + }, + { + "epoch": 0.47164136005967705, + "grad_norm": 3.467385242028855, + "learning_rate": 1.7859245873743586e-05, + "loss": 2.2733, + "step": 2450 + }, + { + "epoch": 0.47183386673725247, + "grad_norm": 2.774660294996914, + "learning_rate": 1.7857317626551798e-05, + "loss": 2.3051, + "step": 2451 + }, + { + "epoch": 0.47202637341482784, + "grad_norm": 2.858592246269067, + "learning_rate": 1.7855388615528213e-05, + "loss": 2.3226, + "step": 2452 + }, + { + "epoch": 0.4722188800924032, + "grad_norm": 3.25872473287099, + "learning_rate": 1.7853458840860353e-05, + "loss": 2.2839, + "step": 2453 + }, + { + "epoch": 0.47241138676997857, + "grad_norm": 3.2214287691275176, + "learning_rate": 1.785152830273582e-05, + "loss": 2.3369, + "step": 2454 + }, + { + "epoch": 0.47260389344755394, + "grad_norm": 3.0807183857014273, + "learning_rate": 1.7849597001342284e-05, + "loss": 2.2634, + "step": 2455 + }, + { + "epoch": 0.47279640012512936, + "grad_norm": 3.0540556998661263, + "learning_rate": 1.7847664936867493e-05, + "loss": 2.3348, + "step": 2456 + }, + { + "epoch": 0.47279640012512936, + "lm_loss": 1.7261, + "step": 2456, + "vm_loss": 0.2499 + }, + { + "epoch": 0.47279640012512936, + "lm_loss": 1.7832, + "step": 2456, + "vm_loss": 0.215 + }, + { + "epoch": 0.47279640012512936, + "lm_loss": 2.0767, + "step": 2456, + "vm_loss": 0.191 + }, + { + "epoch": 0.47279640012512936, + "lm_loss": 2.071, + "step": 2456, + "vm_loss": 0.1911 + }, + { + "epoch": 0.47279640012512936, + "lm_loss": 2.5267, + "step": 2456, + "vm_loss": 0.1267 + }, + { + "epoch": 0.47279640012512936, + "lm_loss": 2.0225, + "step": 2456, + "vm_loss": 0.1859 + }, + { + "epoch": 0.47279640012512936, + "lm_loss": 1.9774, + "step": 2456, + "vm_loss": 0.216 + }, + { + "epoch": 0.47279640012512936, + "lm_loss": 1.8656, + "step": 2456, + "vm_loss": 0.1543 + }, + { + "epoch": 0.4729889068027047, + "grad_norm": 2.9824302351413934, + "learning_rate": 1.7845732109499275e-05, + "loss": 2.2793, + "step": 2457 + }, + { + "epoch": 0.4731814134802801, + "grad_norm": 3.0379791470252595, + "learning_rate": 1.7843798519425514e-05, + "loss": 2.2933, + "step": 2458 + }, + { + "epoch": 0.47337392015785545, + "grad_norm": 2.7802610923880504, + "learning_rate": 1.7841864166834185e-05, + "loss": 2.3005, + "step": 2459 + }, + { + "epoch": 0.4735664268354309, + "grad_norm": 2.6795933679970974, + "learning_rate": 1.7839929051913336e-05, + "loss": 2.2457, + "step": 2460 + }, + { + "epoch": 0.47375893351300624, + "grad_norm": 3.345450928143097, + "learning_rate": 1.783799317485108e-05, + "loss": 2.27, + "step": 2461 + }, + { + "epoch": 0.4739514401905816, + "grad_norm": 2.759777826130828, + "learning_rate": 1.7836056535835607e-05, + "loss": 2.3254, + "step": 2462 + }, + { + "epoch": 0.474143946868157, + "grad_norm": 2.991248081618804, + "learning_rate": 1.783411913505519e-05, + "loss": 2.287, + "step": 2463 + }, + { + "epoch": 0.4743364535457324, + "grad_norm": 2.72177921061463, + "learning_rate": 1.7832180972698166e-05, + "loss": 2.3354, + "step": 2464 + }, + { + "epoch": 0.4743364535457324, + "lm_loss": 2.2283, + "step": 2464, + "vm_loss": 0.1201 + }, + { + "epoch": 0.4743364535457324, + "lm_loss": 1.9308, + "step": 2464, + "vm_loss": 0.1703 + }, + { + "epoch": 0.4743364535457324, + "lm_loss": 2.1478, + "step": 2464, + "vm_loss": 0.2 + }, + { + "epoch": 0.4743364535457324, + "lm_loss": 1.8642, + "step": 2464, + "vm_loss": 0.1883 + }, + { + "epoch": 0.4743364535457324, + "lm_loss": 1.9318, + "step": 2464, + "vm_loss": 0.1897 + }, + { + "epoch": 0.4743364535457324, + "lm_loss": 2.0204, + "step": 2464, + "vm_loss": 0.1727 + }, + { + "epoch": 0.4743364535457324, + "lm_loss": 2.0637, + "step": 2464, + "vm_loss": 0.1328 + }, + { + "epoch": 0.4743364535457324, + "lm_loss": 1.6153, + "step": 2464, + "vm_loss": 0.1616 + }, + { + "epoch": 0.47452896022330776, + "grad_norm": 2.8921298143247993, + "learning_rate": 1.7830242048952947e-05, + "loss": 2.2717, + "step": 2465 + }, + { + "epoch": 0.4747214669008831, + "grad_norm": 2.9135477026611243, + "learning_rate": 1.7828302364008024e-05, + "loss": 2.3188, + "step": 2466 + }, + { + "epoch": 0.4749139735784585, + "grad_norm": 2.76850667840445, + "learning_rate": 1.7826361918051956e-05, + "loss": 2.3106, + "step": 2467 + }, + { + "epoch": 0.47510648025603386, + "grad_norm": 2.6380143727757974, + "learning_rate": 1.782442071127338e-05, + "loss": 2.3306, + "step": 2468 + }, + { + "epoch": 0.4752989869336093, + "grad_norm": 2.7172113234387756, + "learning_rate": 1.7822478743861008e-05, + "loss": 2.2874, + "step": 2469 + }, + { + "epoch": 0.47549149361118465, + "grad_norm": 2.820359645241799, + "learning_rate": 1.7820536016003625e-05, + "loss": 2.2874, + "step": 2470 + }, + { + "epoch": 0.47568400028876, + "grad_norm": 2.7271457941493678, + "learning_rate": 1.781859252789009e-05, + "loss": 2.2711, + "step": 2471 + }, + { + "epoch": 0.4758765069663354, + "grad_norm": 2.7686974400428874, + "learning_rate": 1.7816648279709327e-05, + "loss": 2.2821, + "step": 2472 + }, + { + "epoch": 0.4758765069663354, + "lm_loss": 1.7014, + "step": 2472, + "vm_loss": 0.1801 + }, + { + "epoch": 0.4758765069663354, + "lm_loss": 2.0505, + "step": 2472, + "vm_loss": 0.1589 + }, + { + "epoch": 0.4758765069663354, + "lm_loss": 2.3833, + "step": 2472, + "vm_loss": 0.2545 + }, + { + "epoch": 0.4758765069663354, + "lm_loss": 2.3397, + "step": 2472, + "vm_loss": 0.177 + }, + { + "epoch": 0.4758765069663354, + "lm_loss": 2.1906, + "step": 2472, + "vm_loss": 0.2012 + }, + { + "epoch": 0.4758765069663354, + "lm_loss": 2.3236, + "step": 2472, + "vm_loss": 0.1966 + }, + { + "epoch": 0.4758765069663354, + "lm_loss": 1.89, + "step": 2472, + "vm_loss": 0.183 + }, + { + "epoch": 0.4758765069663354, + "lm_loss": 2.2237, + "step": 2472, + "vm_loss": 0.1498 + }, + { + "epoch": 0.4760690136439108, + "grad_norm": 2.8731253245191786, + "learning_rate": 1.781470327165035e-05, + "loss": 2.3367, + "step": 2473 + }, + { + "epoch": 0.47626152032148616, + "grad_norm": 2.9764662480395754, + "learning_rate": 1.7812757503902234e-05, + "loss": 2.3115, + "step": 2474 + }, + { + "epoch": 0.47645402699906153, + "grad_norm": 2.6349584211140864, + "learning_rate": 1.7810810976654135e-05, + "loss": 2.2839, + "step": 2475 + }, + { + "epoch": 0.4766465336766369, + "grad_norm": 3.2707711685971934, + "learning_rate": 1.7808863690095283e-05, + "loss": 2.3148, + "step": 2476 + }, + { + "epoch": 0.47683904035421226, + "grad_norm": 2.8203970868055928, + "learning_rate": 1.780691564441497e-05, + "loss": 2.324, + "step": 2477 + }, + { + "epoch": 0.4770315470317877, + "grad_norm": 2.7007971964851913, + "learning_rate": 1.780496683980258e-05, + "loss": 2.2756, + "step": 2478 + }, + { + "epoch": 0.47722405370936305, + "grad_norm": 3.022028469303706, + "learning_rate": 1.7803017276447558e-05, + "loss": 2.3281, + "step": 2479 + }, + { + "epoch": 0.4774165603869384, + "grad_norm": 3.0015100776468806, + "learning_rate": 1.7801066954539424e-05, + "loss": 2.2613, + "step": 2480 + }, + { + "epoch": 0.4774165603869384, + "lm_loss": 2.1357, + "step": 2480, + "vm_loss": 0.1967 + }, + { + "epoch": 0.4774165603869384, + "lm_loss": 1.6637, + "step": 2480, + "vm_loss": 0.1793 + }, + { + "epoch": 0.4774165603869384, + "lm_loss": 1.87, + "step": 2480, + "vm_loss": 0.2122 + }, + { + "epoch": 0.4774165603869384, + "lm_loss": 2.2797, + "step": 2480, + "vm_loss": 0.2007 + }, + { + "epoch": 0.4774165603869384, + "lm_loss": 2.4752, + "step": 2480, + "vm_loss": 0.1539 + }, + { + "epoch": 0.4774165603869384, + "lm_loss": 2.2192, + "step": 2480, + "vm_loss": 0.1526 + }, + { + "epoch": 0.4774165603869384, + "lm_loss": 1.976, + "step": 2480, + "vm_loss": 0.1465 + }, + { + "epoch": 0.4774165603869384, + "lm_loss": 2.2177, + "step": 2480, + "vm_loss": 0.1476 + }, + { + "epoch": 0.4776090670645138, + "grad_norm": 2.9507337386080583, + "learning_rate": 1.7799115874267782e-05, + "loss": 2.2796, + "step": 2481 + }, + { + "epoch": 0.4778015737420892, + "grad_norm": 3.273729245309616, + "learning_rate": 1.7797164035822295e-05, + "loss": 2.3087, + "step": 2482 + }, + { + "epoch": 0.47799408041966457, + "grad_norm": 2.9390748708887457, + "learning_rate": 1.779521143939271e-05, + "loss": 2.2512, + "step": 2483 + }, + { + "epoch": 0.47818658709723993, + "grad_norm": 3.0189722052742503, + "learning_rate": 1.7793258085168844e-05, + "loss": 2.278, + "step": 2484 + }, + { + "epoch": 0.4783790937748153, + "grad_norm": 2.867311762530984, + "learning_rate": 1.7791303973340584e-05, + "loss": 2.3131, + "step": 2485 + }, + { + "epoch": 0.47857160045239067, + "grad_norm": 2.8110016070823174, + "learning_rate": 1.77893491040979e-05, + "loss": 2.2791, + "step": 2486 + }, + { + "epoch": 0.4787641071299661, + "grad_norm": 3.054189715388609, + "learning_rate": 1.7787393477630826e-05, + "loss": 2.3342, + "step": 2487 + }, + { + "epoch": 0.47895661380754145, + "grad_norm": 2.6569225068447717, + "learning_rate": 1.778543709412948e-05, + "loss": 2.2865, + "step": 2488 + }, + { + "epoch": 0.47895661380754145, + "lm_loss": 2.3942, + "step": 2488, + "vm_loss": 0.168 + }, + { + "epoch": 0.47895661380754145, + "lm_loss": 1.8804, + "step": 2488, + "vm_loss": 0.167 + }, + { + "epoch": 0.47895661380754145, + "lm_loss": 1.9992, + "step": 2488, + "vm_loss": 0.2106 + }, + { + "epoch": 0.47895661380754145, + "lm_loss": 2.1193, + "step": 2488, + "vm_loss": 0.1868 + }, + { + "epoch": 0.47895661380754145, + "lm_loss": 2.0918, + "step": 2488, + "vm_loss": 0.1626 + }, + { + "epoch": 0.47895661380754145, + "lm_loss": 1.9725, + "step": 2488, + "vm_loss": 0.1722 + }, + { + "epoch": 0.47895661380754145, + "lm_loss": 2.1409, + "step": 2488, + "vm_loss": 0.1459 + }, + { + "epoch": 0.47895661380754145, + "lm_loss": 2.5955, + "step": 2488, + "vm_loss": 0.1886 + }, + { + "epoch": 0.4791491204851168, + "grad_norm": 2.8484718637776267, + "learning_rate": 1.778347995378404e-05, + "loss": 2.287, + "step": 2489 + }, + { + "epoch": 0.4793416271626922, + "grad_norm": 2.842830316607992, + "learning_rate": 1.778152205678477e-05, + "loss": 2.3335, + "step": 2490 + }, + { + "epoch": 0.4795341338402676, + "grad_norm": 2.8669813223498863, + "learning_rate": 1.7779563403322006e-05, + "loss": 2.2747, + "step": 2491 + }, + { + "epoch": 0.479726640517843, + "grad_norm": 2.6786687075178657, + "learning_rate": 1.777760399358614e-05, + "loss": 2.2678, + "step": 2492 + }, + { + "epoch": 0.47991914719541834, + "grad_norm": 2.7338259002136684, + "learning_rate": 1.7775643827767668e-05, + "loss": 2.341, + "step": 2493 + }, + { + "epoch": 0.4801116538729937, + "grad_norm": 2.827619438066888, + "learning_rate": 1.7773682906057133e-05, + "loss": 2.3136, + "step": 2494 + }, + { + "epoch": 0.48030416055056907, + "grad_norm": 2.7764024789498998, + "learning_rate": 1.777172122864517e-05, + "loss": 2.2741, + "step": 2495 + }, + { + "epoch": 0.4804966672281445, + "grad_norm": 2.5973000744295325, + "learning_rate": 1.776975879572247e-05, + "loss": 2.3121, + "step": 2496 + }, + { + "epoch": 0.4804966672281445, + "lm_loss": 2.2804, + "step": 2496, + "vm_loss": 0.1751 + }, + { + "epoch": 0.4804966672281445, + "lm_loss": 2.0859, + "step": 2496, + "vm_loss": 0.1619 + }, + { + "epoch": 0.4804966672281445, + "lm_loss": 1.9264, + "step": 2496, + "vm_loss": 0.2008 + }, + { + "epoch": 0.4804966672281445, + "lm_loss": 2.1099, + "step": 2496, + "vm_loss": 0.2074 + }, + { + "epoch": 0.4804966672281445, + "lm_loss": 1.9701, + "step": 2496, + "vm_loss": 0.2113 + }, + { + "epoch": 0.4804966672281445, + "lm_loss": 2.1673, + "step": 2496, + "vm_loss": 0.1609 + }, + { + "epoch": 0.4804966672281445, + "lm_loss": 1.8768, + "step": 2496, + "vm_loss": 0.1448 + }, + { + "epoch": 0.4804966672281445, + "lm_loss": 2.1848, + "step": 2496, + "vm_loss": 0.1678 + }, + { + "epoch": 0.48068917390571986, + "grad_norm": 2.758407370499133, + "learning_rate": 1.776779560747981e-05, + "loss": 2.3027, + "step": 2497 + }, + { + "epoch": 0.4808816805832952, + "grad_norm": 2.8175415601978298, + "learning_rate": 1.776583166410804e-05, + "loss": 2.2574, + "step": 2498 + }, + { + "epoch": 0.4810741872608706, + "grad_norm": 3.1160858530931383, + "learning_rate": 1.776386696579808e-05, + "loss": 2.2968, + "step": 2499 + }, + { + "epoch": 0.481266693938446, + "grad_norm": 2.9813271165326176, + "learning_rate": 1.7761901512740918e-05, + "loss": 2.2009, + "step": 2500 + }, + { + "epoch": 0.4814592006160214, + "grad_norm": 3.0056322173375865, + "learning_rate": 1.775993530512763e-05, + "loss": 2.2438, + "step": 2501 + }, + { + "epoch": 0.48165170729359674, + "grad_norm": 2.902724321473114, + "learning_rate": 1.7757968343149348e-05, + "loss": 2.2238, + "step": 2502 + }, + { + "epoch": 0.4818442139711721, + "grad_norm": 2.9153237156705303, + "learning_rate": 1.775600062699729e-05, + "loss": 2.2917, + "step": 2503 + }, + { + "epoch": 0.48203672064874753, + "grad_norm": 3.076326841136655, + "learning_rate": 1.7754032156862744e-05, + "loss": 2.2813, + "step": 2504 + }, + { + "epoch": 0.48203672064874753, + "lm_loss": 2.106, + "step": 2504, + "vm_loss": 0.146 + }, + { + "epoch": 0.48203672064874753, + "lm_loss": 2.2349, + "step": 2504, + "vm_loss": 0.1605 + }, + { + "epoch": 0.48203672064874753, + "lm_loss": 2.273, + "step": 2504, + "vm_loss": 0.137 + }, + { + "epoch": 0.48203672064874753, + "lm_loss": 1.9146, + "step": 2504, + "vm_loss": 0.1698 + }, + { + "epoch": 0.48203672064874753, + "lm_loss": 1.9185, + "step": 2504, + "vm_loss": 0.1538 + }, + { + "epoch": 0.48203672064874753, + "lm_loss": 2.3888, + "step": 2504, + "vm_loss": 0.1955 + }, + { + "epoch": 0.48203672064874753, + "lm_loss": 2.1171, + "step": 2504, + "vm_loss": 0.1607 + }, + { + "epoch": 0.48203672064874753, + "lm_loss": 2.0008, + "step": 2504, + "vm_loss": 0.2165 + }, + { + "epoch": 0.4822292273263229, + "grad_norm": 2.9137155613882237, + "learning_rate": 1.7752062932937072e-05, + "loss": 2.2365, + "step": 2505 + }, + { + "epoch": 0.48242173400389826, + "grad_norm": 2.8470226444867173, + "learning_rate": 1.7750092955411697e-05, + "loss": 2.3559, + "step": 2506 + }, + { + "epoch": 0.4826142406814736, + "grad_norm": 2.992200474800163, + "learning_rate": 1.7748122224478138e-05, + "loss": 2.2447, + "step": 2507 + }, + { + "epoch": 0.482806747359049, + "grad_norm": 3.332323118584798, + "learning_rate": 1.7746150740327978e-05, + "loss": 2.298, + "step": 2508 + }, + { + "epoch": 0.4829992540366244, + "grad_norm": 3.2181560702089893, + "learning_rate": 1.7744178503152856e-05, + "loss": 2.2955, + "step": 2509 + }, + { + "epoch": 0.4831917607141998, + "grad_norm": 3.3009885697712225, + "learning_rate": 1.7742205513144507e-05, + "loss": 2.2493, + "step": 2510 + }, + { + "epoch": 0.48338426739177515, + "grad_norm": 2.9802031659739896, + "learning_rate": 1.7740231770494733e-05, + "loss": 2.2029, + "step": 2511 + }, + { + "epoch": 0.4835767740693505, + "grad_norm": 2.702075454671211, + "learning_rate": 1.7738257275395404e-05, + "loss": 2.2539, + "step": 2512 + }, + { + "epoch": 0.4835767740693505, + "lm_loss": 1.7886, + "step": 2512, + "vm_loss": 0.1652 + }, + { + "epoch": 0.4835767740693505, + "lm_loss": 1.9571, + "step": 2512, + "vm_loss": 0.1383 + }, + { + "epoch": 0.4835767740693505, + "lm_loss": 2.1742, + "step": 2512, + "vm_loss": 0.1611 + }, + { + "epoch": 0.4835767740693505, + "lm_loss": 2.2608, + "step": 2512, + "vm_loss": 0.2197 + }, + { + "epoch": 0.4835767740693505, + "lm_loss": 2.0741, + "step": 2512, + "vm_loss": 0.2108 + }, + { + "epoch": 0.4835767740693505, + "lm_loss": 2.2156, + "step": 2512, + "vm_loss": 0.1177 + }, + { + "epoch": 0.4835767740693505, + "lm_loss": 2.3894, + "step": 2512, + "vm_loss": 0.1625 + }, + { + "epoch": 0.4835767740693505, + "lm_loss": 2.3169, + "step": 2512, + "vm_loss": 0.1733 + }, + { + "epoch": 0.48376928074692593, + "grad_norm": 2.656867607246949, + "learning_rate": 1.7736282028038467e-05, + "loss": 2.2801, + "step": 2513 + }, + { + "epoch": 0.4839617874245013, + "grad_norm": 2.990666451512867, + "learning_rate": 1.773430602861594e-05, + "loss": 2.3087, + "step": 2514 + }, + { + "epoch": 0.48415429410207667, + "grad_norm": 2.7272215355453886, + "learning_rate": 1.7732329277319914e-05, + "loss": 2.2135, + "step": 2515 + }, + { + "epoch": 0.48434680077965203, + "grad_norm": 2.8236867862317725, + "learning_rate": 1.773035177434256e-05, + "loss": 2.278, + "step": 2516 + }, + { + "epoch": 0.4845393074572274, + "grad_norm": 2.854742291753803, + "learning_rate": 1.772837351987611e-05, + "loss": 2.3181, + "step": 2517 + }, + { + "epoch": 0.4847318141348028, + "grad_norm": 3.1824360146062967, + "learning_rate": 1.7726394514112885e-05, + "loss": 2.2925, + "step": 2518 + }, + { + "epoch": 0.4849243208123782, + "grad_norm": 2.6495974401136557, + "learning_rate": 1.772441475724526e-05, + "loss": 2.2998, + "step": 2519 + }, + { + "epoch": 0.48511682748995355, + "grad_norm": 2.901398825129236, + "learning_rate": 1.7722434249465695e-05, + "loss": 2.3034, + "step": 2520 + }, + { + "epoch": 0.48511682748995355, + "lm_loss": 2.1055, + "step": 2520, + "vm_loss": 0.2231 + }, + { + "epoch": 0.48511682748995355, + "lm_loss": 2.2893, + "step": 2520, + "vm_loss": 0.2163 + }, + { + "epoch": 0.48511682748995355, + "lm_loss": 2.0676, + "step": 2520, + "vm_loss": 0.1504 + }, + { + "epoch": 0.48511682748995355, + "lm_loss": 2.2021, + "step": 2520, + "vm_loss": 0.2222 + }, + { + "epoch": 0.48511682748995355, + "lm_loss": 2.1718, + "step": 2520, + "vm_loss": 0.1373 + }, + { + "epoch": 0.48511682748995355, + "lm_loss": 2.2904, + "step": 2520, + "vm_loss": 0.142 + }, + { + "epoch": 0.48511682748995355, + "lm_loss": 2.141, + "step": 2520, + "vm_loss": 0.1648 + }, + { + "epoch": 0.48511682748995355, + "lm_loss": 2.0731, + "step": 2520, + "vm_loss": 0.1538 + }, + { + "epoch": 0.4853093341675289, + "grad_norm": 2.7079131059338817, + "learning_rate": 1.7720452990966728e-05, + "loss": 2.262, + "step": 2521 + }, + { + "epoch": 0.48550184084510434, + "grad_norm": 2.9569668962994458, + "learning_rate": 1.7718470981940953e-05, + "loss": 2.272, + "step": 2522 + }, + { + "epoch": 0.4856943475226797, + "grad_norm": 3.128942120995433, + "learning_rate": 1.771648822258105e-05, + "loss": 2.3106, + "step": 2523 + }, + { + "epoch": 0.48588685420025507, + "grad_norm": 3.0672495369227493, + "learning_rate": 1.771450471307977e-05, + "loss": 2.2554, + "step": 2524 + }, + { + "epoch": 0.48607936087783044, + "grad_norm": 2.748026796346768, + "learning_rate": 1.7712520453629933e-05, + "loss": 2.2535, + "step": 2525 + }, + { + "epoch": 0.4862718675554058, + "grad_norm": 2.5591060358947355, + "learning_rate": 1.771053544442444e-05, + "loss": 2.3121, + "step": 2526 + }, + { + "epoch": 0.4864643742329812, + "grad_norm": 2.507063965197852, + "learning_rate": 1.770854968565625e-05, + "loss": 2.2214, + "step": 2527 + }, + { + "epoch": 0.4866568809105566, + "grad_norm": 2.778943604490944, + "learning_rate": 1.7706563177518414e-05, + "loss": 2.2849, + "step": 2528 + }, + { + "epoch": 0.4866568809105566, + "lm_loss": 2.386, + "step": 2528, + "vm_loss": 0.1362 + }, + { + "epoch": 0.4866568809105566, + "lm_loss": 1.9305, + "step": 2528, + "vm_loss": 0.0614 + }, + { + "epoch": 0.4866568809105566, + "lm_loss": 2.2808, + "step": 2528, + "vm_loss": 0.2107 + }, + { + "epoch": 0.4866568809105566, + "lm_loss": 2.1614, + "step": 2528, + "vm_loss": 0.2191 + }, + { + "epoch": 0.4866568809105566, + "lm_loss": 1.9934, + "step": 2528, + "vm_loss": 0.146 + }, + { + "epoch": 0.4866568809105566, + "lm_loss": 2.1757, + "step": 2528, + "vm_loss": 0.1102 + }, + { + "epoch": 0.4866568809105566, + "lm_loss": 1.9836, + "step": 2528, + "vm_loss": 0.1967 + }, + { + "epoch": 0.4866568809105566, + "lm_loss": 1.9253, + "step": 2528, + "vm_loss": 0.1899 + }, + { + "epoch": 0.48684938758813195, + "grad_norm": 2.8527012368064257, + "learning_rate": 1.770457592020404e-05, + "loss": 2.2983, + "step": 2529 + }, + { + "epoch": 0.4870418942657073, + "grad_norm": 2.548903435725304, + "learning_rate": 1.7702587913906314e-05, + "loss": 2.2299, + "step": 2530 + }, + { + "epoch": 0.48723440094328274, + "grad_norm": 2.7534766543717537, + "learning_rate": 1.77005991588185e-05, + "loss": 2.2199, + "step": 2531 + }, + { + "epoch": 0.4874269076208581, + "grad_norm": 2.831929858131102, + "learning_rate": 1.7698609655133925e-05, + "loss": 2.243, + "step": 2532 + }, + { + "epoch": 0.4876194142984335, + "grad_norm": 2.8308981229969206, + "learning_rate": 1.7696619403046003e-05, + "loss": 2.2362, + "step": 2533 + }, + { + "epoch": 0.48781192097600884, + "grad_norm": 2.817520067738735, + "learning_rate": 1.7694628402748203e-05, + "loss": 2.2854, + "step": 2534 + }, + { + "epoch": 0.48800442765358426, + "grad_norm": 3.114029941678181, + "learning_rate": 1.769263665443408e-05, + "loss": 2.2538, + "step": 2535 + }, + { + "epoch": 0.4881969343311596, + "grad_norm": 2.95465485247302, + "learning_rate": 1.7690644158297258e-05, + "loss": 2.2373, + "step": 2536 + }, + { + "epoch": 0.4881969343311596, + "lm_loss": 2.1383, + "step": 2536, + "vm_loss": 0.1531 + }, + { + "epoch": 0.4881969343311596, + "lm_loss": 2.0472, + "step": 2536, + "vm_loss": 0.1963 + }, + { + "epoch": 0.4881969343311596, + "lm_loss": 2.1432, + "step": 2536, + "vm_loss": 0.2142 + }, + { + "epoch": 0.4881969343311596, + "lm_loss": 2.0421, + "step": 2536, + "vm_loss": 0.1432 + }, + { + "epoch": 0.4881969343311596, + "lm_loss": 2.0239, + "step": 2536, + "vm_loss": 0.2181 + }, + { + "epoch": 0.4881969343311596, + "lm_loss": 2.3476, + "step": 2536, + "vm_loss": 0.1197 + }, + { + "epoch": 0.4881969343311596, + "lm_loss": 2.1245, + "step": 2536, + "vm_loss": 0.1379 + }, + { + "epoch": 0.4881969343311596, + "lm_loss": 2.1739, + "step": 2536, + "vm_loss": 0.1057 + }, + { + "epoch": 0.488389441008735, + "grad_norm": 2.886846240006783, + "learning_rate": 1.768865091453143e-05, + "loss": 2.26, + "step": 2537 + }, + { + "epoch": 0.48858194768631036, + "grad_norm": 2.7188743383619447, + "learning_rate": 1.7686656923330366e-05, + "loss": 2.2457, + "step": 2538 + }, + { + "epoch": 0.4887744543638857, + "grad_norm": 2.879436959628215, + "learning_rate": 1.768466218488791e-05, + "loss": 2.3291, + "step": 2539 + }, + { + "epoch": 0.48896696104146115, + "grad_norm": 2.6543654927251783, + "learning_rate": 1.7682666699397974e-05, + "loss": 2.2987, + "step": 2540 + }, + { + "epoch": 0.4891594677190365, + "grad_norm": 2.988649180985386, + "learning_rate": 1.7680670467054543e-05, + "loss": 2.2739, + "step": 2541 + }, + { + "epoch": 0.4893519743966119, + "grad_norm": 2.4518801462538455, + "learning_rate": 1.7678673488051677e-05, + "loss": 2.3312, + "step": 2542 + }, + { + "epoch": 0.48954448107418724, + "grad_norm": 2.9967463023262577, + "learning_rate": 1.767667576258351e-05, + "loss": 2.2245, + "step": 2543 + }, + { + "epoch": 0.48973698775176266, + "grad_norm": 2.9455019108387908, + "learning_rate": 1.7674677290844245e-05, + "loss": 2.3238, + "step": 2544 + }, + { + "epoch": 0.48973698775176266, + "lm_loss": 2.2408, + "step": 2544, + "vm_loss": 0.1636 + }, + { + "epoch": 0.48973698775176266, + "lm_loss": 2.3415, + "step": 2544, + "vm_loss": 0.1142 + }, + { + "epoch": 0.48973698775176266, + "lm_loss": 2.1294, + "step": 2544, + "vm_loss": 0.1786 + }, + { + "epoch": 0.48973698775176266, + "lm_loss": 1.7967, + "step": 2544, + "vm_loss": 0.1561 + }, + { + "epoch": 0.48973698775176266, + "lm_loss": 1.762, + "step": 2544, + "vm_loss": 0.1202 + }, + { + "epoch": 0.48973698775176266, + "lm_loss": 2.1862, + "step": 2544, + "vm_loss": 0.1523 + }, + { + "epoch": 0.48973698775176266, + "lm_loss": 1.6257, + "step": 2544, + "vm_loss": 0.1879 + }, + { + "epoch": 0.48973698775176266, + "lm_loss": 2.1738, + "step": 2544, + "vm_loss": 0.1269 + }, + { + "epoch": 0.48992949442933803, + "grad_norm": 3.0964093031507316, + "learning_rate": 1.7672678073028163e-05, + "loss": 2.2093, + "step": 2545 + }, + { + "epoch": 0.4901220011069134, + "grad_norm": 2.9368734050734098, + "learning_rate": 1.767067810932961e-05, + "loss": 2.218, + "step": 2546 + }, + { + "epoch": 0.49031450778448876, + "grad_norm": 2.74702750810346, + "learning_rate": 1.7668677399943004e-05, + "loss": 2.2907, + "step": 2547 + }, + { + "epoch": 0.49050701446206413, + "grad_norm": 3.312589435932395, + "learning_rate": 1.7666675945062848e-05, + "loss": 2.2803, + "step": 2548 + }, + { + "epoch": 0.49069952113963955, + "grad_norm": 3.0587406364477436, + "learning_rate": 1.76646737448837e-05, + "loss": 2.3107, + "step": 2549 + }, + { + "epoch": 0.4908920278172149, + "grad_norm": 2.777917692566353, + "learning_rate": 1.7662670799600207e-05, + "loss": 2.263, + "step": 2550 + }, + { + "epoch": 0.4910845344947903, + "grad_norm": 2.9732451609985864, + "learning_rate": 1.766066710940708e-05, + "loss": 2.3238, + "step": 2551 + }, + { + "epoch": 0.49127704117236565, + "grad_norm": 3.036282775155767, + "learning_rate": 1.76586626744991e-05, + "loss": 2.233, + "step": 2552 + }, + { + "epoch": 0.49127704117236565, + "lm_loss": 1.8737, + "step": 2552, + "vm_loss": 0.1581 + }, + { + "epoch": 0.49127704117236565, + "lm_loss": 2.5071, + "step": 2552, + "vm_loss": 0.1848 + }, + { + "epoch": 0.49127704117236565, + "lm_loss": 1.9605, + "step": 2552, + "vm_loss": 0.1606 + }, + { + "epoch": 0.49127704117236565, + "lm_loss": 1.9639, + "step": 2552, + "vm_loss": 0.1532 + }, + { + "epoch": 0.49127704117236565, + "lm_loss": 2.1104, + "step": 2552, + "vm_loss": 0.1382 + }, + { + "epoch": 0.49127704117236565, + "lm_loss": 2.2875, + "step": 2552, + "vm_loss": 0.1647 + }, + { + "epoch": 0.49127704117236565, + "lm_loss": 2.0941, + "step": 2552, + "vm_loss": 0.1555 + }, + { + "epoch": 0.49127704117236565, + "lm_loss": 1.6552, + "step": 2552, + "vm_loss": 0.1164 + }, + { + "epoch": 0.49146954784994107, + "grad_norm": 2.972581653046284, + "learning_rate": 1.7656657495071122e-05, + "loss": 2.2559, + "step": 2553 + }, + { + "epoch": 0.49166205452751643, + "grad_norm": 2.9562968463803183, + "learning_rate": 1.765465157131808e-05, + "loss": 2.2865, + "step": 2554 + }, + { + "epoch": 0.4918545612050918, + "grad_norm": 2.72488944187822, + "learning_rate": 1.7652644903434975e-05, + "loss": 2.2532, + "step": 2555 + }, + { + "epoch": 0.49204706788266717, + "grad_norm": 3.1832018923207177, + "learning_rate": 1.765063749161688e-05, + "loss": 2.2995, + "step": 2556 + }, + { + "epoch": 0.49223957456024253, + "grad_norm": 2.785982255207764, + "learning_rate": 1.764862933605894e-05, + "loss": 2.2413, + "step": 2557 + }, + { + "epoch": 0.49243208123781795, + "grad_norm": 2.6755032984446565, + "learning_rate": 1.7646620436956372e-05, + "loss": 2.2768, + "step": 2558 + }, + { + "epoch": 0.4926245879153933, + "grad_norm": 3.097254628498717, + "learning_rate": 1.764461079450447e-05, + "loss": 2.2866, + "step": 2559 + }, + { + "epoch": 0.4928170945929687, + "grad_norm": 2.972208832970011, + "learning_rate": 1.76426004088986e-05, + "loss": 2.2834, + "step": 2560 + }, + { + "epoch": 0.4928170945929687, + "lm_loss": 2.0279, + "step": 2560, + "vm_loss": 0.1683 + }, + { + "epoch": 0.4928170945929687, + "lm_loss": 2.2367, + "step": 2560, + "vm_loss": 0.142 + }, + { + "epoch": 0.4928170945929687, + "lm_loss": 2.0743, + "step": 2560, + "vm_loss": 0.1285 + }, + { + "epoch": 0.4928170945929687, + "lm_loss": 1.7309, + "step": 2560, + "vm_loss": 0.1851 + }, + { + "epoch": 0.4928170945929687, + "lm_loss": 2.115, + "step": 2560, + "vm_loss": 0.1262 + }, + { + "epoch": 0.4928170945929687, + "lm_loss": 2.2572, + "step": 2560, + "vm_loss": 0.2288 + }, + { + "epoch": 0.4928170945929687, + "lm_loss": 1.9989, + "step": 2560, + "vm_loss": 0.2201 + }, + { + "epoch": 0.4928170945929687, + "lm_loss": 2.2212, + "step": 2560, + "vm_loss": 0.1659 + }, + { + "epoch": 0.49300960127054405, + "grad_norm": 2.9174311662819754, + "learning_rate": 1.7640589280334185e-05, + "loss": 2.2437, + "step": 2561 + }, + { + "epoch": 0.4932021079481195, + "grad_norm": 2.880484016044867, + "learning_rate": 1.7638577409006746e-05, + "loss": 2.2711, + "step": 2562 + }, + { + "epoch": 0.49339461462569484, + "grad_norm": 2.6745391159531273, + "learning_rate": 1.763656479511185e-05, + "loss": 2.2332, + "step": 2563 + }, + { + "epoch": 0.4935871213032702, + "grad_norm": 3.4460242205312097, + "learning_rate": 1.7634551438845166e-05, + "loss": 2.2399, + "step": 2564 + }, + { + "epoch": 0.49377962798084557, + "grad_norm": 3.05609538339663, + "learning_rate": 1.76325373404024e-05, + "loss": 2.2649, + "step": 2565 + }, + { + "epoch": 0.493972134658421, + "grad_norm": 2.9606522693095676, + "learning_rate": 1.7630522499979358e-05, + "loss": 2.2873, + "step": 2566 + }, + { + "epoch": 0.49416464133599636, + "grad_norm": 3.0479777234997627, + "learning_rate": 1.7628506917771906e-05, + "loss": 2.2346, + "step": 2567 + }, + { + "epoch": 0.4943571480135717, + "grad_norm": 3.2481405774702137, + "learning_rate": 1.7626490593975987e-05, + "loss": 2.2538, + "step": 2568 + }, + { + "epoch": 0.4943571480135717, + "lm_loss": 2.205, + "step": 2568, + "vm_loss": 0.184 + }, + { + "epoch": 0.4943571480135717, + "lm_loss": 2.2148, + "step": 2568, + "vm_loss": 0.0979 + }, + { + "epoch": 0.4943571480135717, + "lm_loss": 2.1169, + "step": 2568, + "vm_loss": 0.1697 + }, + { + "epoch": 0.4943571480135717, + "lm_loss": 2.2488, + "step": 2568, + "vm_loss": 0.1557 + }, + { + "epoch": 0.4943571480135717, + "lm_loss": 2.0137, + "step": 2568, + "vm_loss": 0.1573 + }, + { + "epoch": 0.4943571480135717, + "lm_loss": 2.0691, + "step": 2568, + "vm_loss": 0.1803 + }, + { + "epoch": 0.4943571480135717, + "lm_loss": 2.1003, + "step": 2568, + "vm_loss": 0.2279 + }, + { + "epoch": 0.4943571480135717, + "lm_loss": 2.2928, + "step": 2568, + "vm_loss": 0.1745 + }, + { + "epoch": 0.4945496546911471, + "grad_norm": 2.6619418250407727, + "learning_rate": 1.7624473528787605e-05, + "loss": 2.2489, + "step": 2569 + }, + { + "epoch": 0.49474216136872246, + "grad_norm": 2.9907414740006932, + "learning_rate": 1.7622455722402857e-05, + "loss": 2.2233, + "step": 2570 + }, + { + "epoch": 0.4949346680462979, + "grad_norm": 3.3050848080361033, + "learning_rate": 1.762043717501789e-05, + "loss": 2.2595, + "step": 2571 + }, + { + "epoch": 0.49512717472387324, + "grad_norm": 2.9552003063155086, + "learning_rate": 1.7618417886828934e-05, + "loss": 2.2703, + "step": 2572 + }, + { + "epoch": 0.4953196814014486, + "grad_norm": 3.1171436658654432, + "learning_rate": 1.7616397858032295e-05, + "loss": 2.2244, + "step": 2573 + }, + { + "epoch": 0.495512188079024, + "grad_norm": 2.776066597500756, + "learning_rate": 1.761437708882434e-05, + "loss": 2.1886, + "step": 2574 + }, + { + "epoch": 0.4957046947565994, + "grad_norm": 3.094713604254845, + "learning_rate": 1.7612355579401518e-05, + "loss": 2.2518, + "step": 2575 + }, + { + "epoch": 0.49589720143417476, + "grad_norm": 3.1307204060496607, + "learning_rate": 1.761033332996034e-05, + "loss": 2.2356, + "step": 2576 + }, + { + "epoch": 0.49589720143417476, + "lm_loss": 2.1416, + "step": 2576, + "vm_loss": 0.1698 + }, + { + "epoch": 0.49589720143417476, + "lm_loss": 2.0645, + "step": 2576, + "vm_loss": 0.212 + }, + { + "epoch": 0.49589720143417476, + "lm_loss": 2.2717, + "step": 2576, + "vm_loss": 0.1493 + }, + { + "epoch": 0.49589720143417476, + "lm_loss": 2.1734, + "step": 2576, + "vm_loss": 0.133 + }, + { + "epoch": 0.49589720143417476, + "lm_loss": 2.248, + "step": 2576, + "vm_loss": 0.2081 + }, + { + "epoch": 0.49589720143417476, + "lm_loss": 1.9946, + "step": 2576, + "vm_loss": 0.1948 + }, + { + "epoch": 0.49589720143417476, + "lm_loss": 2.0347, + "step": 2576, + "vm_loss": 0.1221 + }, + { + "epoch": 0.49589720143417476, + "lm_loss": 2.1461, + "step": 2576, + "vm_loss": 0.197 + }, + { + "epoch": 0.4960897081117501, + "grad_norm": 3.020120593297418, + "learning_rate": 1.7608310340697396e-05, + "loss": 2.2175, + "step": 2577 + }, + { + "epoch": 0.4962822147893255, + "grad_norm": 2.683664208066447, + "learning_rate": 1.7606286611809353e-05, + "loss": 2.2577, + "step": 2578 + }, + { + "epoch": 0.49647472146690086, + "grad_norm": 2.9558971024250775, + "learning_rate": 1.7604262143492937e-05, + "loss": 2.2257, + "step": 2579 + }, + { + "epoch": 0.4966672281444763, + "grad_norm": 2.803176100642997, + "learning_rate": 1.7602236935944953e-05, + "loss": 2.2715, + "step": 2580 + }, + { + "epoch": 0.49685973482205165, + "grad_norm": 2.6343832298658683, + "learning_rate": 1.7600210989362277e-05, + "loss": 2.2643, + "step": 2581 + }, + { + "epoch": 0.497052241499627, + "grad_norm": 2.9775843277826706, + "learning_rate": 1.7598184303941862e-05, + "loss": 2.2377, + "step": 2582 + }, + { + "epoch": 0.4972447481772024, + "grad_norm": 2.770780143857951, + "learning_rate": 1.7596156879880718e-05, + "loss": 2.2485, + "step": 2583 + }, + { + "epoch": 0.4974372548547778, + "grad_norm": 3.011594554533267, + "learning_rate": 1.7594128717375945e-05, + "loss": 2.2445, + "step": 2584 + }, + { + "epoch": 0.4974372548547778, + "lm_loss": 1.9146, + "step": 2584, + "vm_loss": 0.126 + }, + { + "epoch": 0.4974372548547778, + "lm_loss": 1.9133, + "step": 2584, + "vm_loss": 0.1672 + }, + { + "epoch": 0.4974372548547778, + "lm_loss": 2.1006, + "step": 2584, + "vm_loss": 0.1527 + }, + { + "epoch": 0.4974372548547778, + "lm_loss": 1.8866, + "step": 2584, + "vm_loss": 0.1397 + }, + { + "epoch": 0.4974372548547778, + "lm_loss": 2.3141, + "step": 2584, + "vm_loss": 0.1409 + }, + { + "epoch": 0.4974372548547778, + "lm_loss": 2.0353, + "step": 2584, + "vm_loss": 0.1732 + }, + { + "epoch": 0.4974372548547778, + "lm_loss": 2.2654, + "step": 2584, + "vm_loss": 0.1907 + }, + { + "epoch": 0.4974372548547778, + "lm_loss": 1.5699, + "step": 2584, + "vm_loss": 0.1348 + }, + { + "epoch": 0.49762976153235317, + "grad_norm": 2.733499147524477, + "learning_rate": 1.7592099816624703e-05, + "loss": 2.2869, + "step": 2585 + }, + { + "epoch": 0.49782226820992853, + "grad_norm": 2.680883007818799, + "learning_rate": 1.7590070177824228e-05, + "loss": 2.2111, + "step": 2586 + }, + { + "epoch": 0.4980147748875039, + "grad_norm": 2.7995595381169385, + "learning_rate": 1.7588039801171827e-05, + "loss": 2.3103, + "step": 2587 + }, + { + "epoch": 0.49820728156507926, + "grad_norm": 3.053585236789676, + "learning_rate": 1.7586008686864874e-05, + "loss": 2.2993, + "step": 2588 + }, + { + "epoch": 0.4983997882426547, + "grad_norm": 2.9638680394664774, + "learning_rate": 1.758397683510083e-05, + "loss": 2.2384, + "step": 2589 + }, + { + "epoch": 0.49859229492023005, + "grad_norm": 2.748332135849734, + "learning_rate": 1.7581944246077202e-05, + "loss": 2.2757, + "step": 2590 + }, + { + "epoch": 0.4987848015978054, + "grad_norm": 2.737966006045177, + "learning_rate": 1.7579910919991596e-05, + "loss": 2.2066, + "step": 2591 + }, + { + "epoch": 0.4989773082753808, + "grad_norm": 2.890117448056369, + "learning_rate": 1.7577876857041672e-05, + "loss": 2.2362, + "step": 2592 + }, + { + "epoch": 0.4989773082753808, + "lm_loss": 2.1523, + "step": 2592, + "vm_loss": 0.2719 + }, + { + "epoch": 0.4989773082753808, + "lm_loss": 2.0649, + "step": 2592, + "vm_loss": 0.2127 + }, + { + "epoch": 0.4989773082753808, + "lm_loss": 2.0689, + "step": 2592, + "vm_loss": 0.125 + }, + { + "epoch": 0.4989773082753808, + "lm_loss": 2.2836, + "step": 2592, + "vm_loss": 0.1464 + }, + { + "epoch": 0.4989773082753808, + "lm_loss": 1.8959, + "step": 2592, + "vm_loss": 0.1599 + }, + { + "epoch": 0.4989773082753808, + "lm_loss": 2.3432, + "step": 2592, + "vm_loss": 0.1606 + }, + { + "epoch": 0.4989773082753808, + "lm_loss": 2.2886, + "step": 2592, + "vm_loss": 0.1279 + }, + { + "epoch": 0.4989773082753808, + "lm_loss": 2.1799, + "step": 2592, + "vm_loss": 0.2484 + }, + { + "epoch": 0.4991698149529562, + "grad_norm": 2.802861383384065, + "learning_rate": 1.757584205742517e-05, + "loss": 2.2816, + "step": 2593 + }, + { + "epoch": 0.49936232163053157, + "grad_norm": 2.892799416827955, + "learning_rate": 1.75738065213399e-05, + "loss": 2.2579, + "step": 2594 + }, + { + "epoch": 0.49955482830810694, + "grad_norm": 2.463193662151769, + "learning_rate": 1.7571770248983734e-05, + "loss": 2.2585, + "step": 2595 + }, + { + "epoch": 0.4997473349856823, + "grad_norm": 2.815154361430906, + "learning_rate": 1.756973324055463e-05, + "loss": 2.2424, + "step": 2596 + }, + { + "epoch": 0.49993984166325767, + "grad_norm": 2.989821253217763, + "learning_rate": 1.756769549625061e-05, + "loss": 2.2718, + "step": 2597 + }, + { + "epoch": 0.5001323483408331, + "grad_norm": 2.940453260654077, + "learning_rate": 1.7565657016269767e-05, + "loss": 2.2653, + "step": 2598 + }, + { + "epoch": 0.5003248550184084, + "grad_norm": 2.699491163905135, + "learning_rate": 1.756361780081027e-05, + "loss": 2.2537, + "step": 2599 + }, + { + "epoch": 0.5005173616959838, + "grad_norm": 2.797466387789946, + "learning_rate": 1.7561577850070355e-05, + "loss": 2.2796, + "step": 2600 + }, + { + "epoch": 0.5005173616959838, + "lm_loss": 2.2221, + "step": 2600, + "vm_loss": 0.1416 + }, + { + "epoch": 0.5005173616959838, + "lm_loss": 2.2654, + "step": 2600, + "vm_loss": 0.118 + }, + { + "epoch": 0.5005173616959838, + "lm_loss": 1.8445, + "step": 2600, + "vm_loss": 0.1861 + }, + { + "epoch": 0.5005173616959838, + "lm_loss": 1.8306, + "step": 2600, + "vm_loss": 0.2082 + }, + { + "epoch": 0.5005173616959838, + "lm_loss": 1.7497, + "step": 2600, + "vm_loss": 0.1367 + }, + { + "epoch": 0.5005173616959838, + "lm_loss": 1.7837, + "step": 2600, + "vm_loss": 0.1699 + }, + { + "epoch": 0.5005173616959838, + "lm_loss": 1.836, + "step": 2600, + "vm_loss": 0.1616 + }, + { + "epoch": 0.5005173616959838, + "lm_loss": 2.0453, + "step": 2600, + "vm_loss": 0.1627 + }, + { + "epoch": 0.5007098683735592, + "grad_norm": 2.7794518667196986, + "learning_rate": 1.7559537164248333e-05, + "loss": 2.2319, + "step": 2601 + }, + { + "epoch": 0.5009023750511346, + "grad_norm": 2.8899109317913867, + "learning_rate": 1.7557495743542586e-05, + "loss": 2.2734, + "step": 2602 + }, + { + "epoch": 0.50109488172871, + "grad_norm": 3.0605169807087447, + "learning_rate": 1.755545358815156e-05, + "loss": 2.2713, + "step": 2603 + }, + { + "epoch": 0.5012873884062854, + "grad_norm": 2.7685890938663333, + "learning_rate": 1.7553410698273786e-05, + "loss": 2.2534, + "step": 2604 + }, + { + "epoch": 0.5014798950838607, + "grad_norm": 2.718212449509204, + "learning_rate": 1.7551367074107854e-05, + "loss": 2.212, + "step": 2605 + }, + { + "epoch": 0.5016724017614361, + "grad_norm": 2.9281001237128073, + "learning_rate": 1.754932271585243e-05, + "loss": 2.2476, + "step": 2606 + }, + { + "epoch": 0.5018649084390114, + "grad_norm": 2.8061292565480263, + "learning_rate": 1.754727762370626e-05, + "loss": 2.24, + "step": 2607 + }, + { + "epoch": 0.5020574151165869, + "grad_norm": 2.542839139968227, + "learning_rate": 1.7545231797868144e-05, + "loss": 2.2834, + "step": 2608 + }, + { + "epoch": 0.5020574151165869, + "lm_loss": 2.2796, + "step": 2608, + "vm_loss": 0.2087 + }, + { + "epoch": 0.5020574151165869, + "lm_loss": 1.9135, + "step": 2608, + "vm_loss": 0.1486 + }, + { + "epoch": 0.5020574151165869, + "lm_loss": 2.4112, + "step": 2608, + "vm_loss": 0.1229 + }, + { + "epoch": 0.5020574151165869, + "lm_loss": 1.9448, + "step": 2608, + "vm_loss": 0.1636 + }, + { + "epoch": 0.5020574151165869, + "lm_loss": 2.3266, + "step": 2608, + "vm_loss": 0.1838 + }, + { + "epoch": 0.5020574151165869, + "lm_loss": 2.0836, + "step": 2608, + "vm_loss": 0.1635 + }, + { + "epoch": 0.5020574151165869, + "lm_loss": 2.1216, + "step": 2608, + "vm_loss": 0.2106 + }, + { + "epoch": 0.5020574151165869, + "lm_loss": 2.3413, + "step": 2608, + "vm_loss": 0.229 + }, + { + "epoch": 0.5022499217941623, + "grad_norm": 2.719356695124388, + "learning_rate": 1.7543185238536967e-05, + "loss": 2.2869, + "step": 2609 + }, + { + "epoch": 0.5024424284717376, + "grad_norm": 2.730083865426326, + "learning_rate": 1.7541137945911676e-05, + "loss": 2.257, + "step": 2610 + }, + { + "epoch": 0.502634935149313, + "grad_norm": 2.785481551920978, + "learning_rate": 1.7539089920191298e-05, + "loss": 2.2287, + "step": 2611 + }, + { + "epoch": 0.5028274418268883, + "grad_norm": 2.8220640238962504, + "learning_rate": 1.753704116157493e-05, + "loss": 2.2565, + "step": 2612 + }, + { + "epoch": 0.5030199485044637, + "grad_norm": 2.9564037523078004, + "learning_rate": 1.753499167026173e-05, + "loss": 2.2344, + "step": 2613 + }, + { + "epoch": 0.5032124551820392, + "grad_norm": 2.637009324707848, + "learning_rate": 1.7532941446450943e-05, + "loss": 2.2918, + "step": 2614 + }, + { + "epoch": 0.5034049618596145, + "grad_norm": 3.0999628395536387, + "learning_rate": 1.753089049034187e-05, + "loss": 2.2758, + "step": 2615 + }, + { + "epoch": 0.5035974685371899, + "grad_norm": 2.8104568800068903, + "learning_rate": 1.752883880213389e-05, + "loss": 2.2073, + "step": 2616 + }, + { + "epoch": 0.5035974685371899, + "lm_loss": 2.0801, + "step": 2616, + "vm_loss": 0.1282 + }, + { + "epoch": 0.5035974685371899, + "lm_loss": 2.1941, + "step": 2616, + "vm_loss": 0.1049 + }, + { + "epoch": 0.5035974685371899, + "lm_loss": 1.9485, + "step": 2616, + "vm_loss": 0.1439 + }, + { + "epoch": 0.5035974685371899, + "lm_loss": 1.8634, + "step": 2616, + "vm_loss": 0.1249 + }, + { + "epoch": 0.5035974685371899, + "lm_loss": 2.1711, + "step": 2616, + "vm_loss": 0.2027 + }, + { + "epoch": 0.5035974685371899, + "lm_loss": 2.2103, + "step": 2616, + "vm_loss": 0.2481 + }, + { + "epoch": 0.5035974685371899, + "lm_loss": 2.1504, + "step": 2616, + "vm_loss": 0.2305 + }, + { + "epoch": 0.5035974685371899, + "lm_loss": 2.0238, + "step": 2616, + "vm_loss": 0.1384 + }, + { + "epoch": 0.5037899752147653, + "grad_norm": 2.861550022259338, + "learning_rate": 1.7526786382026463e-05, + "loss": 2.2016, + "step": 2617 + }, + { + "epoch": 0.5039824818923406, + "grad_norm": 2.782844768672414, + "learning_rate": 1.7524733230219102e-05, + "loss": 2.2173, + "step": 2618 + }, + { + "epoch": 0.504174988569916, + "grad_norm": 3.18662327250915, + "learning_rate": 1.75226793469114e-05, + "loss": 2.3004, + "step": 2619 + }, + { + "epoch": 0.5043674952474914, + "grad_norm": 2.9953073631385543, + "learning_rate": 1.7520624732303024e-05, + "loss": 2.279, + "step": 2620 + }, + { + "epoch": 0.5045600019250668, + "grad_norm": 2.8556103251754617, + "learning_rate": 1.7518569386593708e-05, + "loss": 2.2484, + "step": 2621 + }, + { + "epoch": 0.5047525086026422, + "grad_norm": 2.7715476509028685, + "learning_rate": 1.7516513309983253e-05, + "loss": 2.3008, + "step": 2622 + }, + { + "epoch": 0.5049450152802175, + "grad_norm": 3.0071218031797, + "learning_rate": 1.7514456502671543e-05, + "loss": 2.2333, + "step": 2623 + }, + { + "epoch": 0.5051375219577929, + "grad_norm": 2.81686704029729, + "learning_rate": 1.7512398964858523e-05, + "loss": 2.2838, + "step": 2624 + }, + { + "epoch": 0.5051375219577929, + "lm_loss": 2.3301, + "step": 2624, + "vm_loss": 0.1711 + }, + { + "epoch": 0.5051375219577929, + "lm_loss": 2.2091, + "step": 2624, + "vm_loss": 0.2072 + }, + { + "epoch": 0.5051375219577929, + "lm_loss": 2.0045, + "step": 2624, + "vm_loss": 0.1892 + }, + { + "epoch": 0.5051375219577929, + "lm_loss": 1.7932, + "step": 2624, + "vm_loss": 0.1877 + }, + { + "epoch": 0.5051375219577929, + "lm_loss": 2.0679, + "step": 2624, + "vm_loss": 0.2389 + }, + { + "epoch": 0.5051375219577929, + "lm_loss": 2.3463, + "step": 2624, + "vm_loss": 0.1936 + }, + { + "epoch": 0.5051375219577929, + "lm_loss": 2.1037, + "step": 2624, + "vm_loss": 0.1925 + }, + { + "epoch": 0.5051375219577929, + "lm_loss": 2.0524, + "step": 2624, + "vm_loss": 0.1467 + }, + { + "epoch": 0.5053300286353682, + "grad_norm": 2.9623821652152973, + "learning_rate": 1.7510340696744213e-05, + "loss": 2.2559, + "step": 2625 + }, + { + "epoch": 0.5055225353129437, + "grad_norm": 2.879773491619362, + "learning_rate": 1.75082816985287e-05, + "loss": 2.2652, + "step": 2626 + }, + { + "epoch": 0.5057150419905191, + "grad_norm": 2.8174046980388385, + "learning_rate": 1.7506221970412145e-05, + "loss": 2.2643, + "step": 2627 + }, + { + "epoch": 0.5059075486680944, + "grad_norm": 2.7382722430931645, + "learning_rate": 1.7504161512594782e-05, + "loss": 2.2383, + "step": 2628 + }, + { + "epoch": 0.5061000553456698, + "grad_norm": 2.799785111468018, + "learning_rate": 1.7502100325276918e-05, + "loss": 2.2455, + "step": 2629 + }, + { + "epoch": 0.5062925620232451, + "grad_norm": 2.903663411893443, + "learning_rate": 1.7500038408658922e-05, + "loss": 2.2606, + "step": 2630 + }, + { + "epoch": 0.5064850687008206, + "grad_norm": 2.400303835842584, + "learning_rate": 1.7497975762941237e-05, + "loss": 2.2132, + "step": 2631 + }, + { + "epoch": 0.506677575378396, + "grad_norm": 2.74395779442082, + "learning_rate": 1.749591238832438e-05, + "loss": 2.1893, + "step": 2632 + }, + { + "epoch": 0.506677575378396, + "lm_loss": 1.8266, + "step": 2632, + "vm_loss": 0.1244 + }, + { + "epoch": 0.506677575378396, + "lm_loss": 2.358, + "step": 2632, + "vm_loss": 0.2234 + }, + { + "epoch": 0.506677575378396, + "lm_loss": 2.102, + "step": 2632, + "vm_loss": 0.1776 + }, + { + "epoch": 0.506677575378396, + "lm_loss": 1.7411, + "step": 2632, + "vm_loss": 0.1923 + }, + { + "epoch": 0.506677575378396, + "lm_loss": 1.9312, + "step": 2632, + "vm_loss": 0.1149 + }, + { + "epoch": 0.506677575378396, + "lm_loss": 1.8583, + "step": 2632, + "vm_loss": 0.1323 + }, + { + "epoch": 0.506677575378396, + "lm_loss": 1.6884, + "step": 2632, + "vm_loss": 0.1775 + }, + { + "epoch": 0.506677575378396, + "lm_loss": 2.2887, + "step": 2632, + "vm_loss": 0.1805 + }, + { + "epoch": 0.5068700820559713, + "grad_norm": 2.7243347315537183, + "learning_rate": 1.7493848285008938e-05, + "loss": 2.2111, + "step": 2633 + }, + { + "epoch": 0.5070625887335467, + "grad_norm": 2.644455835470837, + "learning_rate": 1.749178345319557e-05, + "loss": 2.2662, + "step": 2634 + }, + { + "epoch": 0.5072550954111221, + "grad_norm": 2.982752342627713, + "learning_rate": 1.7489717893085007e-05, + "loss": 2.2514, + "step": 2635 + }, + { + "epoch": 0.5074476020886974, + "grad_norm": 2.966604760254571, + "learning_rate": 1.7487651604878037e-05, + "loss": 2.2519, + "step": 2636 + }, + { + "epoch": 0.5076401087662729, + "grad_norm": 2.655046520402533, + "learning_rate": 1.7485584588775536e-05, + "loss": 2.228, + "step": 2637 + }, + { + "epoch": 0.5078326154438482, + "grad_norm": 2.891107355714645, + "learning_rate": 1.7483516844978445e-05, + "loss": 2.2483, + "step": 2638 + }, + { + "epoch": 0.5080251221214236, + "grad_norm": 2.8454734626339753, + "learning_rate": 1.7481448373687775e-05, + "loss": 2.2529, + "step": 2639 + }, + { + "epoch": 0.508217628798999, + "grad_norm": 3.021217908752849, + "learning_rate": 1.747937917510461e-05, + "loss": 2.2402, + "step": 2640 + }, + { + "epoch": 0.508217628798999, + "lm_loss": 2.3467, + "step": 2640, + "vm_loss": 0.1561 + }, + { + "epoch": 0.508217628798999, + "lm_loss": 1.6598, + "step": 2640, + "vm_loss": 0.1322 + }, + { + "epoch": 0.508217628798999, + "lm_loss": 1.8667, + "step": 2640, + "vm_loss": 0.1947 + }, + { + "epoch": 0.508217628798999, + "lm_loss": 1.8767, + "step": 2640, + "vm_loss": 0.2188 + }, + { + "epoch": 0.508217628798999, + "lm_loss": 1.8069, + "step": 2640, + "vm_loss": 0.181 + }, + { + "epoch": 0.508217628798999, + "lm_loss": 2.0007, + "step": 2640, + "vm_loss": 0.1969 + }, + { + "epoch": 0.508217628798999, + "lm_loss": 1.7902, + "step": 2640, + "vm_loss": 0.1994 + }, + { + "epoch": 0.508217628798999, + "lm_loss": 2.229, + "step": 2640, + "vm_loss": 0.2275 + }, + { + "epoch": 0.5084101354765743, + "grad_norm": 2.7388572369836663, + "learning_rate": 1.7477309249430096e-05, + "loss": 2.2098, + "step": 2641 + }, + { + "epoch": 0.5086026421541497, + "grad_norm": 2.976865901748515, + "learning_rate": 1.747523859686546e-05, + "loss": 2.2313, + "step": 2642 + }, + { + "epoch": 0.508795148831725, + "grad_norm": 3.249671008927816, + "learning_rate": 1.7473167217612e-05, + "loss": 2.2461, + "step": 2643 + }, + { + "epoch": 0.5089876555093005, + "grad_norm": 2.5726386083371025, + "learning_rate": 1.7471095111871076e-05, + "loss": 2.2406, + "step": 2644 + }, + { + "epoch": 0.5091801621868759, + "grad_norm": 2.726950050216297, + "learning_rate": 1.7469022279844123e-05, + "loss": 2.2778, + "step": 2645 + }, + { + "epoch": 0.5093726688644512, + "grad_norm": 3.1198540195543933, + "learning_rate": 1.7466948721732646e-05, + "loss": 2.2201, + "step": 2646 + }, + { + "epoch": 0.5095651755420266, + "grad_norm": 2.7510537634054866, + "learning_rate": 1.7464874437738223e-05, + "loss": 2.2555, + "step": 2647 + }, + { + "epoch": 0.5097576822196019, + "grad_norm": 3.3430748962075296, + "learning_rate": 1.7462799428062505e-05, + "loss": 2.2254, + "step": 2648 + }, + { + "epoch": 0.5097576822196019, + "lm_loss": 1.9903, + "step": 2648, + "vm_loss": 0.1541 + }, + { + "epoch": 0.5097576822196019, + "lm_loss": 2.2131, + "step": 2648, + "vm_loss": 0.1519 + }, + { + "epoch": 0.5097576822196019, + "lm_loss": 2.0546, + "step": 2648, + "vm_loss": 0.1429 + }, + { + "epoch": 0.5097576822196019, + "lm_loss": 2.2459, + "step": 2648, + "vm_loss": 0.114 + }, + { + "epoch": 0.5097576822196019, + "lm_loss": 1.952, + "step": 2648, + "vm_loss": 0.2214 + }, + { + "epoch": 0.5097576822196019, + "lm_loss": 1.9912, + "step": 2648, + "vm_loss": 0.162 + }, + { + "epoch": 0.5097576822196019, + "lm_loss": 1.7982, + "step": 2648, + "vm_loss": 0.1887 + }, + { + "epoch": 0.5097576822196019, + "lm_loss": 2.3354, + "step": 2648, + "vm_loss": 0.1229 + }, + { + "epoch": 0.5099501888971774, + "grad_norm": 2.9979161019239604, + "learning_rate": 1.74607236929072e-05, + "loss": 2.2241, + "step": 2649 + }, + { + "epoch": 0.5101426955747528, + "grad_norm": 2.665715609670079, + "learning_rate": 1.7458647232474106e-05, + "loss": 2.2793, + "step": 2650 + }, + { + "epoch": 0.5103352022523281, + "grad_norm": 2.885459234109262, + "learning_rate": 1.745657004696508e-05, + "loss": 2.2464, + "step": 2651 + }, + { + "epoch": 0.5105277089299035, + "grad_norm": 2.6116294195633816, + "learning_rate": 1.7454492136582042e-05, + "loss": 2.2282, + "step": 2652 + }, + { + "epoch": 0.5107202156074789, + "grad_norm": 2.9933898549905487, + "learning_rate": 1.7452413501527e-05, + "loss": 2.2348, + "step": 2653 + }, + { + "epoch": 0.5109127222850542, + "grad_norm": 2.7557118029733236, + "learning_rate": 1.7450334142002022e-05, + "loss": 2.2753, + "step": 2654 + }, + { + "epoch": 0.5111052289626297, + "grad_norm": 2.610904800870158, + "learning_rate": 1.7448254058209244e-05, + "loss": 2.2483, + "step": 2655 + }, + { + "epoch": 0.511297735640205, + "grad_norm": 2.951826491871495, + "learning_rate": 1.7446173250350883e-05, + "loss": 2.2551, + "step": 2656 + }, + { + "epoch": 0.511297735640205, + "lm_loss": 2.2202, + "step": 2656, + "vm_loss": 0.1408 + }, + { + "epoch": 0.511297735640205, + "lm_loss": 2.053, + "step": 2656, + "vm_loss": 0.247 + }, + { + "epoch": 0.511297735640205, + "lm_loss": 2.1313, + "step": 2656, + "vm_loss": 0.1518 + }, + { + "epoch": 0.511297735640205, + "lm_loss": 1.8586, + "step": 2656, + "vm_loss": 0.2448 + }, + { + "epoch": 0.511297735640205, + "lm_loss": 2.1471, + "step": 2656, + "vm_loss": 0.2233 + }, + { + "epoch": 0.511297735640205, + "lm_loss": 1.7696, + "step": 2656, + "vm_loss": 0.178 + }, + { + "epoch": 0.511297735640205, + "lm_loss": 1.9407, + "step": 2656, + "vm_loss": 0.1629 + }, + { + "epoch": 0.511297735640205, + "lm_loss": 1.9062, + "step": 2656, + "vm_loss": 0.1791 + }, + { + "epoch": 0.5114902423177804, + "grad_norm": 2.790749191287158, + "learning_rate": 1.7444091718629217e-05, + "loss": 2.2562, + "step": 2657 + }, + { + "epoch": 0.5116827489953558, + "grad_norm": 2.973697486881595, + "learning_rate": 1.74420094632466e-05, + "loss": 2.2418, + "step": 2658 + }, + { + "epoch": 0.5118752556729311, + "grad_norm": 2.6160886356582522, + "learning_rate": 1.743992648440545e-05, + "loss": 2.276, + "step": 2659 + }, + { + "epoch": 0.5120677623505066, + "grad_norm": 2.741550296601859, + "learning_rate": 1.7437842782308262e-05, + "loss": 2.2539, + "step": 2660 + }, + { + "epoch": 0.5122602690280819, + "grad_norm": 2.882234469762646, + "learning_rate": 1.7435758357157597e-05, + "loss": 2.2583, + "step": 2661 + }, + { + "epoch": 0.5124527757056573, + "grad_norm": 2.7153662882756144, + "learning_rate": 1.743367320915609e-05, + "loss": 2.2076, + "step": 2662 + }, + { + "epoch": 0.5126452823832327, + "grad_norm": 2.8950592504870962, + "learning_rate": 1.7431587338506443e-05, + "loss": 2.2255, + "step": 2663 + }, + { + "epoch": 0.512837789060808, + "grad_norm": 2.932371797674889, + "learning_rate": 1.7429500745411426e-05, + "loss": 2.2391, + "step": 2664 + }, + { + "epoch": 0.512837789060808, + "lm_loss": 2.0825, + "step": 2664, + "vm_loss": 0.1228 + }, + { + "epoch": 0.512837789060808, + "lm_loss": 2.3607, + "step": 2664, + "vm_loss": 0.1861 + }, + { + "epoch": 0.512837789060808, + "lm_loss": 1.8591, + "step": 2664, + "vm_loss": 0.1657 + }, + { + "epoch": 0.512837789060808, + "lm_loss": 2.3858, + "step": 2664, + "vm_loss": 0.1497 + }, + { + "epoch": 0.512837789060808, + "lm_loss": 2.4122, + "step": 2664, + "vm_loss": 0.2299 + }, + { + "epoch": 0.512837789060808, + "lm_loss": 1.7084, + "step": 2664, + "vm_loss": 0.1644 + }, + { + "epoch": 0.512837789060808, + "lm_loss": 2.2069, + "step": 2664, + "vm_loss": 0.1795 + }, + { + "epoch": 0.512837789060808, + "lm_loss": 1.9393, + "step": 2664, + "vm_loss": 0.1768 + }, + { + "epoch": 0.5130302957383834, + "grad_norm": 2.8143085477673124, + "learning_rate": 1.742741343007389e-05, + "loss": 2.2604, + "step": 2665 + }, + { + "epoch": 0.5132228024159589, + "grad_norm": 2.915248151555896, + "learning_rate": 1.7425325392696742e-05, + "loss": 2.2219, + "step": 2666 + }, + { + "epoch": 0.5134153090935342, + "grad_norm": 2.7949596383277533, + "learning_rate": 1.7423236633482966e-05, + "loss": 2.1937, + "step": 2667 + }, + { + "epoch": 0.5136078157711096, + "grad_norm": 2.956405234343142, + "learning_rate": 1.7421147152635622e-05, + "loss": 2.2288, + "step": 2668 + }, + { + "epoch": 0.5138003224486849, + "grad_norm": 2.695272293516905, + "learning_rate": 1.7419056950357828e-05, + "loss": 2.197, + "step": 2669 + }, + { + "epoch": 0.5139928291262603, + "grad_norm": 2.891992805176657, + "learning_rate": 1.7416966026852784e-05, + "loss": 2.195, + "step": 2670 + }, + { + "epoch": 0.5141853358038357, + "grad_norm": 2.949105197723159, + "learning_rate": 1.7414874382323747e-05, + "loss": 2.2368, + "step": 2671 + }, + { + "epoch": 0.514377842481411, + "grad_norm": 2.6435504260712013, + "learning_rate": 1.741278201697406e-05, + "loss": 2.2133, + "step": 2672 + }, + { + "epoch": 0.514377842481411, + "lm_loss": 2.03, + "step": 2672, + "vm_loss": 0.162 + }, + { + "epoch": 0.514377842481411, + "lm_loss": 2.121, + "step": 2672, + "vm_loss": 0.1545 + }, + { + "epoch": 0.514377842481411, + "lm_loss": 1.8565, + "step": 2672, + "vm_loss": 0.1217 + }, + { + "epoch": 0.514377842481411, + "lm_loss": 1.8095, + "step": 2672, + "vm_loss": 0.1909 + }, + { + "epoch": 0.514377842481411, + "lm_loss": 1.9122, + "step": 2672, + "vm_loss": 0.1659 + }, + { + "epoch": 0.514377842481411, + "lm_loss": 2.29, + "step": 2672, + "vm_loss": 0.1613 + }, + { + "epoch": 0.514377842481411, + "lm_loss": 2.2422, + "step": 2672, + "vm_loss": 0.1768 + }, + { + "epoch": 0.514377842481411, + "lm_loss": 2.3059, + "step": 2672, + "vm_loss": 0.1661 + }, + { + "epoch": 0.5145703491589865, + "grad_norm": 2.7479058763492015, + "learning_rate": 1.7410688931007122e-05, + "loss": 2.2694, + "step": 2673 + }, + { + "epoch": 0.5147628558365618, + "grad_norm": 2.8164772207909947, + "learning_rate": 1.740859512462641e-05, + "loss": 2.2171, + "step": 2674 + }, + { + "epoch": 0.5149553625141372, + "grad_norm": 2.7092746900203504, + "learning_rate": 1.7406500598035466e-05, + "loss": 2.2407, + "step": 2675 + }, + { + "epoch": 0.5151478691917126, + "grad_norm": 2.6261928048335776, + "learning_rate": 1.7404405351437913e-05, + "loss": 2.1885, + "step": 2676 + }, + { + "epoch": 0.5153403758692879, + "grad_norm": 2.677192099785163, + "learning_rate": 1.7402309385037423e-05, + "loss": 2.2378, + "step": 2677 + }, + { + "epoch": 0.5155328825468634, + "grad_norm": 2.718749277700357, + "learning_rate": 1.740021269903776e-05, + "loss": 2.2922, + "step": 2678 + }, + { + "epoch": 0.5157253892244387, + "grad_norm": 2.9050086699200204, + "learning_rate": 1.7398115293642748e-05, + "loss": 2.2505, + "step": 2679 + }, + { + "epoch": 0.5159178959020141, + "grad_norm": 3.051972292329932, + "learning_rate": 1.7396017169056278e-05, + "loss": 2.2532, + "step": 2680 + }, + { + "epoch": 0.5159178959020141, + "lm_loss": 1.8824, + "step": 2680, + "vm_loss": 0.156 + }, + { + "epoch": 0.5159178959020141, + "lm_loss": 1.8886, + "step": 2680, + "vm_loss": 0.1744 + }, + { + "epoch": 0.5159178959020141, + "lm_loss": 2.0481, + "step": 2680, + "vm_loss": 0.1852 + }, + { + "epoch": 0.5159178959020141, + "lm_loss": 2.2578, + "step": 2680, + "vm_loss": 0.1891 + }, + { + "epoch": 0.5159178959020141, + "lm_loss": 2.1413, + "step": 2680, + "vm_loss": 0.1478 + }, + { + "epoch": 0.5159178959020141, + "lm_loss": 2.1407, + "step": 2680, + "vm_loss": 0.1905 + }, + { + "epoch": 0.5159178959020141, + "lm_loss": 2.0647, + "step": 2680, + "vm_loss": 0.1758 + }, + { + "epoch": 0.5159178959020141, + "lm_loss": 1.9486, + "step": 2680, + "vm_loss": 0.2478 + }, + { + "epoch": 0.5161104025795895, + "grad_norm": 2.903780041059459, + "learning_rate": 1.7393918325482316e-05, + "loss": 2.2337, + "step": 2681 + }, + { + "epoch": 0.5163029092571648, + "grad_norm": 3.034083796354088, + "learning_rate": 1.7391818763124902e-05, + "loss": 2.2708, + "step": 2682 + }, + { + "epoch": 0.5164954159347402, + "grad_norm": 2.6052578950476204, + "learning_rate": 1.7389718482188128e-05, + "loss": 2.2051, + "step": 2683 + }, + { + "epoch": 0.5166879226123157, + "grad_norm": 2.8678113364109756, + "learning_rate": 1.738761748287618e-05, + "loss": 2.2004, + "step": 2684 + }, + { + "epoch": 0.516880429289891, + "grad_norm": 3.099690877183125, + "learning_rate": 1.73855157653933e-05, + "loss": 2.2109, + "step": 2685 + }, + { + "epoch": 0.5170729359674664, + "grad_norm": 3.0547528797715455, + "learning_rate": 1.738341332994379e-05, + "loss": 2.1762, + "step": 2686 + }, + { + "epoch": 0.5172654426450417, + "grad_norm": 2.856551937302945, + "learning_rate": 1.7381310176732052e-05, + "loss": 2.2181, + "step": 2687 + }, + { + "epoch": 0.5174579493226171, + "grad_norm": 2.835751527725957, + "learning_rate": 1.7379206305962525e-05, + "loss": 2.226, + "step": 2688 + }, + { + "epoch": 0.5174579493226171, + "lm_loss": 1.9812, + "step": 2688, + "vm_loss": 0.1652 + }, + { + "epoch": 0.5174579493226171, + "lm_loss": 1.6525, + "step": 2688, + "vm_loss": 0.1411 + }, + { + "epoch": 0.5174579493226171, + "lm_loss": 2.2495, + "step": 2688, + "vm_loss": 0.1706 + }, + { + "epoch": 0.5174579493226171, + "lm_loss": 2.166, + "step": 2688, + "vm_loss": 0.2457 + }, + { + "epoch": 0.5174579493226171, + "lm_loss": 2.1551, + "step": 2688, + "vm_loss": 0.2164 + }, + { + "epoch": 0.5174579493226171, + "lm_loss": 2.059, + "step": 2688, + "vm_loss": 0.1326 + }, + { + "epoch": 0.5174579493226171, + "lm_loss": 2.0375, + "step": 2688, + "vm_loss": 0.168 + }, + { + "epoch": 0.5174579493226171, + "lm_loss": 1.7353, + "step": 2688, + "vm_loss": 0.1242 + }, + { + "epoch": 0.5176504560001925, + "grad_norm": 2.9119493379293977, + "learning_rate": 1.737710171783974e-05, + "loss": 2.2171, + "step": 2689 + }, + { + "epoch": 0.5178429626777679, + "grad_norm": 2.9504745826748042, + "learning_rate": 1.7374996412568286e-05, + "loss": 2.2538, + "step": 2690 + }, + { + "epoch": 0.5180354693553433, + "grad_norm": 2.904252616732406, + "learning_rate": 1.7372890390352827e-05, + "loss": 2.1859, + "step": 2691 + }, + { + "epoch": 0.5182279760329186, + "grad_norm": 2.907993300936597, + "learning_rate": 1.7370783651398094e-05, + "loss": 2.1808, + "step": 2692 + }, + { + "epoch": 0.518420482710494, + "grad_norm": 2.690731702715627, + "learning_rate": 1.7368676195908895e-05, + "loss": 2.1896, + "step": 2693 + }, + { + "epoch": 0.5186129893880694, + "grad_norm": 2.882498152916489, + "learning_rate": 1.7366568024090095e-05, + "loss": 2.2449, + "step": 2694 + }, + { + "epoch": 0.5188054960656447, + "grad_norm": 3.0117456470892465, + "learning_rate": 1.7364459136146635e-05, + "loss": 2.2272, + "step": 2695 + }, + { + "epoch": 0.5189980027432202, + "grad_norm": 3.041114105517781, + "learning_rate": 1.736234953228353e-05, + "loss": 2.2337, + "step": 2696 + }, + { + "epoch": 0.5189980027432202, + "lm_loss": 2.0561, + "step": 2696, + "vm_loss": 0.1547 + }, + { + "epoch": 0.5189980027432202, + "lm_loss": 2.0122, + "step": 2696, + "vm_loss": 0.1321 + }, + { + "epoch": 0.5189980027432202, + "lm_loss": 2.1078, + "step": 2696, + "vm_loss": 0.1099 + }, + { + "epoch": 0.5189980027432202, + "lm_loss": 2.1573, + "step": 2696, + "vm_loss": 0.1874 + }, + { + "epoch": 0.5189980027432202, + "lm_loss": 1.8558, + "step": 2696, + "vm_loss": 0.1667 + }, + { + "epoch": 0.5189980027432202, + "lm_loss": 2.2854, + "step": 2696, + "vm_loss": 0.1917 + }, + { + "epoch": 0.5189980027432202, + "lm_loss": 1.943, + "step": 2696, + "vm_loss": 0.1765 + }, + { + "epoch": 0.5189980027432202, + "lm_loss": 2.3349, + "step": 2696, + "vm_loss": 0.1853 + }, + { + "epoch": 0.5191905094207956, + "grad_norm": 2.6538177271964964, + "learning_rate": 1.7360239212705857e-05, + "loss": 2.252, + "step": 2697 + }, + { + "epoch": 0.5193830160983709, + "grad_norm": 2.9722479288517825, + "learning_rate": 1.7358128177618766e-05, + "loss": 2.2332, + "step": 2698 + }, + { + "epoch": 0.5195755227759463, + "grad_norm": 3.1149764375503075, + "learning_rate": 1.735601642722748e-05, + "loss": 2.2374, + "step": 2699 + }, + { + "epoch": 0.5197680294535216, + "grad_norm": 2.907294994411981, + "learning_rate": 1.735390396173729e-05, + "loss": 2.2109, + "step": 2700 + }, + { + "epoch": 0.519960536131097, + "grad_norm": 2.935189947020246, + "learning_rate": 1.7351790781353543e-05, + "loss": 2.2964, + "step": 2701 + }, + { + "epoch": 0.5201530428086725, + "grad_norm": 2.993038795619102, + "learning_rate": 1.734967688628168e-05, + "loss": 2.2453, + "step": 2702 + }, + { + "epoch": 0.5203455494862478, + "grad_norm": 3.220951353637244, + "learning_rate": 1.7347562276727194e-05, + "loss": 2.2849, + "step": 2703 + }, + { + "epoch": 0.5205380561638232, + "grad_norm": 2.8945886064359874, + "learning_rate": 1.734544695289565e-05, + "loss": 2.2433, + "step": 2704 + }, + { + "epoch": 0.5205380561638232, + "lm_loss": 2.0627, + "step": 2704, + "vm_loss": 0.1401 + }, + { + "epoch": 0.5205380561638232, + "lm_loss": 1.9644, + "step": 2704, + "vm_loss": 0.1499 + }, + { + "epoch": 0.5205380561638232, + "lm_loss": 2.1807, + "step": 2704, + "vm_loss": 0.1709 + }, + { + "epoch": 0.5205380561638232, + "lm_loss": 1.9107, + "step": 2704, + "vm_loss": 0.1771 + }, + { + "epoch": 0.5205380561638232, + "lm_loss": 1.859, + "step": 2704, + "vm_loss": 0.1899 + }, + { + "epoch": 0.5205380561638232, + "lm_loss": 1.711, + "step": 2704, + "vm_loss": 0.1399 + }, + { + "epoch": 0.5205380561638232, + "lm_loss": 2.3529, + "step": 2704, + "vm_loss": 0.1645 + }, + { + "epoch": 0.5205380561638232, + "lm_loss": 2.3153, + "step": 2704, + "vm_loss": 0.1807 + }, + { + "epoch": 0.5207305628413985, + "grad_norm": 3.0467579619785106, + "learning_rate": 1.734333091499269e-05, + "loss": 2.2089, + "step": 2705 + }, + { + "epoch": 0.5209230695189739, + "grad_norm": 3.2824128906538563, + "learning_rate": 1.7341214163224016e-05, + "loss": 2.228, + "step": 2706 + }, + { + "epoch": 0.5211155761965494, + "grad_norm": 3.191283673167816, + "learning_rate": 1.7339096697795402e-05, + "loss": 2.2052, + "step": 2707 + }, + { + "epoch": 0.5213080828741247, + "grad_norm": 3.336251698360061, + "learning_rate": 1.7336978518912695e-05, + "loss": 2.2402, + "step": 2708 + }, + { + "epoch": 0.5215005895517001, + "grad_norm": 2.5873556729138545, + "learning_rate": 1.733485962678181e-05, + "loss": 2.2573, + "step": 2709 + }, + { + "epoch": 0.5216930962292754, + "grad_norm": 2.819207031019723, + "learning_rate": 1.7332740021608722e-05, + "loss": 2.2262, + "step": 2710 + }, + { + "epoch": 0.5218856029068508, + "grad_norm": 2.7390418135056707, + "learning_rate": 1.73306197035995e-05, + "loss": 2.2327, + "step": 2711 + }, + { + "epoch": 0.5220781095844262, + "grad_norm": 2.7870993764546146, + "learning_rate": 1.7328498672960253e-05, + "loss": 2.243, + "step": 2712 + }, + { + "epoch": 0.5220781095844262, + "lm_loss": 2.1499, + "step": 2712, + "vm_loss": 0.1954 + }, + { + "epoch": 0.5220781095844262, + "lm_loss": 1.8771, + "step": 2712, + "vm_loss": 0.1598 + }, + { + "epoch": 0.5220781095844262, + "lm_loss": 1.8958, + "step": 2712, + "vm_loss": 0.1548 + }, + { + "epoch": 0.5220781095844262, + "lm_loss": 1.8842, + "step": 2712, + "vm_loss": 0.2084 + }, + { + "epoch": 0.5220781095844262, + "lm_loss": 2.0225, + "step": 2712, + "vm_loss": 0.1289 + }, + { + "epoch": 0.5220781095844262, + "lm_loss": 1.8018, + "step": 2712, + "vm_loss": 0.1597 + }, + { + "epoch": 0.5220781095844262, + "lm_loss": 2.153, + "step": 2712, + "vm_loss": 0.1089 + }, + { + "epoch": 0.5220781095844262, + "lm_loss": 2.3027, + "step": 2712, + "vm_loss": 0.1844 + }, + { + "epoch": 0.5222706162620016, + "grad_norm": 2.800823858479726, + "learning_rate": 1.7326376929897173e-05, + "loss": 2.2074, + "step": 2713 + }, + { + "epoch": 0.522463122939577, + "grad_norm": 2.983643744166364, + "learning_rate": 1.7324254474616528e-05, + "loss": 2.2719, + "step": 2714 + }, + { + "epoch": 0.5226556296171524, + "grad_norm": 2.6158176885582503, + "learning_rate": 1.7322131307324644e-05, + "loss": 2.1819, + "step": 2715 + }, + { + "epoch": 0.5228481362947277, + "grad_norm": 3.2589706095271462, + "learning_rate": 1.7320007428227918e-05, + "loss": 2.2301, + "step": 2716 + }, + { + "epoch": 0.5230406429723031, + "grad_norm": 3.0743230770092116, + "learning_rate": 1.731788283753282e-05, + "loss": 2.2033, + "step": 2717 + }, + { + "epoch": 0.5232331496498784, + "grad_norm": 2.796594062480214, + "learning_rate": 1.7315757535445884e-05, + "loss": 2.1677, + "step": 2718 + }, + { + "epoch": 0.5234256563274539, + "grad_norm": 2.9089157334218063, + "learning_rate": 1.7313631522173723e-05, + "loss": 2.1832, + "step": 2719 + }, + { + "epoch": 0.5236181630050293, + "grad_norm": 2.789662930529021, + "learning_rate": 1.7311504797923006e-05, + "loss": 2.2068, + "step": 2720 + }, + { + "epoch": 0.5236181630050293, + "lm_loss": 2.0746, + "step": 2720, + "vm_loss": 0.1648 + }, + { + "epoch": 0.5236181630050293, + "lm_loss": 1.8341, + "step": 2720, + "vm_loss": 0.1785 + }, + { + "epoch": 0.5236181630050293, + "lm_loss": 2.3011, + "step": 2720, + "vm_loss": 0.2025 + }, + { + "epoch": 0.5236181630050293, + "lm_loss": 1.905, + "step": 2720, + "vm_loss": 0.2093 + }, + { + "epoch": 0.5236181630050293, + "lm_loss": 2.3392, + "step": 2720, + "vm_loss": 0.157 + }, + { + "epoch": 0.5236181630050293, + "lm_loss": 1.8519, + "step": 2720, + "vm_loss": 0.1845 + }, + { + "epoch": 0.5236181630050293, + "lm_loss": 1.9364, + "step": 2720, + "vm_loss": 0.1111 + }, + { + "epoch": 0.5236181630050293, + "lm_loss": 2.2202, + "step": 2720, + "vm_loss": 0.1749 + }, + { + "epoch": 0.5238106696826046, + "grad_norm": 2.786669712127074, + "learning_rate": 1.7309377362900486e-05, + "loss": 2.2292, + "step": 2721 + }, + { + "epoch": 0.52400317636018, + "grad_norm": 2.8935011917470583, + "learning_rate": 1.730724921731297e-05, + "loss": 2.2302, + "step": 2722 + }, + { + "epoch": 0.5241956830377553, + "grad_norm": 2.6175762482978264, + "learning_rate": 1.730512036136734e-05, + "loss": 2.246, + "step": 2723 + }, + { + "epoch": 0.5243881897153307, + "grad_norm": 2.850558122727029, + "learning_rate": 1.7302990795270557e-05, + "loss": 2.2079, + "step": 2724 + }, + { + "epoch": 0.5245806963929062, + "grad_norm": 2.891895735027978, + "learning_rate": 1.730086051922963e-05, + "loss": 2.1856, + "step": 2725 + }, + { + "epoch": 0.5247732030704815, + "grad_norm": 2.9181011661840923, + "learning_rate": 1.729872953345166e-05, + "loss": 2.157, + "step": 2726 + }, + { + "epoch": 0.5249657097480569, + "grad_norm": 2.8679221618913235, + "learning_rate": 1.72965978381438e-05, + "loss": 2.281, + "step": 2727 + }, + { + "epoch": 0.5251582164256323, + "grad_norm": 2.7988322499357987, + "learning_rate": 1.7294465433513278e-05, + "loss": 2.2143, + "step": 2728 + }, + { + "epoch": 0.5251582164256323, + "lm_loss": 2.127, + "step": 2728, + "vm_loss": 0.1939 + }, + { + "epoch": 0.5251582164256323, + "lm_loss": 2.2974, + "step": 2728, + "vm_loss": 0.2194 + }, + { + "epoch": 0.5251582164256323, + "lm_loss": 1.8778, + "step": 2728, + "vm_loss": 0.2451 + }, + { + "epoch": 0.5251582164256323, + "lm_loss": 1.9747, + "step": 2728, + "vm_loss": 0.1779 + }, + { + "epoch": 0.5251582164256323, + "lm_loss": 1.9019, + "step": 2728, + "vm_loss": 0.1366 + }, + { + "epoch": 0.5251582164256323, + "lm_loss": 1.7037, + "step": 2728, + "vm_loss": 0.1924 + }, + { + "epoch": 0.5251582164256323, + "lm_loss": 1.9436, + "step": 2728, + "vm_loss": 0.1371 + }, + { + "epoch": 0.5251582164256323, + "lm_loss": 1.9532, + "step": 2728, + "vm_loss": 0.1894 + }, + { + "epoch": 0.5253507231032076, + "grad_norm": 2.682672642624575, + "learning_rate": 1.7292332319767394e-05, + "loss": 2.2503, + "step": 2729 + }, + { + "epoch": 0.525543229780783, + "grad_norm": 2.8102973003861247, + "learning_rate": 1.7290198497113513e-05, + "loss": 2.2087, + "step": 2730 + }, + { + "epoch": 0.5257357364583584, + "grad_norm": 2.828460901822948, + "learning_rate": 1.728806396575907e-05, + "loss": 2.2568, + "step": 2731 + }, + { + "epoch": 0.5259282431359338, + "grad_norm": 2.8097666507916483, + "learning_rate": 1.7285928725911562e-05, + "loss": 2.2266, + "step": 2732 + }, + { + "epoch": 0.5261207498135092, + "grad_norm": 2.8892431448880913, + "learning_rate": 1.7283792777778575e-05, + "loss": 2.2068, + "step": 2733 + }, + { + "epoch": 0.5263132564910845, + "grad_norm": 3.0437306974051848, + "learning_rate": 1.728165612156774e-05, + "loss": 2.2506, + "step": 2734 + }, + { + "epoch": 0.5265057631686599, + "grad_norm": 2.6083167184893927, + "learning_rate": 1.7279518757486767e-05, + "loss": 2.2036, + "step": 2735 + }, + { + "epoch": 0.5266982698462352, + "grad_norm": 3.3566222342520606, + "learning_rate": 1.7277380685743444e-05, + "loss": 2.235, + "step": 2736 + }, + { + "epoch": 0.5266982698462352, + "lm_loss": 2.1634, + "step": 2736, + "vm_loss": 0.2309 + }, + { + "epoch": 0.5266982698462352, + "lm_loss": 2.1888, + "step": 2736, + "vm_loss": 0.2093 + }, + { + "epoch": 0.5266982698462352, + "lm_loss": 2.0889, + "step": 2736, + "vm_loss": 0.1404 + }, + { + "epoch": 0.5266982698462352, + "lm_loss": 2.1414, + "step": 2736, + "vm_loss": 0.2849 + }, + { + "epoch": 0.5266982698462352, + "lm_loss": 2.4522, + "step": 2736, + "vm_loss": 0.2159 + }, + { + "epoch": 0.5266982698462352, + "lm_loss": 1.9322, + "step": 2736, + "vm_loss": 0.1453 + }, + { + "epoch": 0.5266982698462352, + "lm_loss": 1.735, + "step": 2736, + "vm_loss": 0.1987 + }, + { + "epoch": 0.5266982698462352, + "lm_loss": 2.0324, + "step": 2736, + "vm_loss": 0.1724 + }, + { + "epoch": 0.5268907765238107, + "grad_norm": 2.8248197200189247, + "learning_rate": 1.727524190654561e-05, + "loss": 2.3045, + "step": 2737 + }, + { + "epoch": 0.5270832832013861, + "grad_norm": 2.578205757388353, + "learning_rate": 1.7273102420101187e-05, + "loss": 2.2532, + "step": 2738 + }, + { + "epoch": 0.5272757898789614, + "grad_norm": 2.8895454269757854, + "learning_rate": 1.7270962226618156e-05, + "loss": 2.1591, + "step": 2739 + }, + { + "epoch": 0.5274682965565368, + "grad_norm": 2.9172714154375092, + "learning_rate": 1.726882132630458e-05, + "loss": 2.1599, + "step": 2740 + }, + { + "epoch": 0.5276608032341121, + "grad_norm": 2.9225337437434593, + "learning_rate": 1.726667971936857e-05, + "loss": 2.2545, + "step": 2741 + }, + { + "epoch": 0.5278533099116876, + "grad_norm": 2.8455547028252535, + "learning_rate": 1.7264537406018323e-05, + "loss": 2.232, + "step": 2742 + }, + { + "epoch": 0.528045816589263, + "grad_norm": 2.91143726475438, + "learning_rate": 1.7262394386462102e-05, + "loss": 2.2398, + "step": 2743 + }, + { + "epoch": 0.5282383232668383, + "grad_norm": 2.9939401927064826, + "learning_rate": 1.7260250660908233e-05, + "loss": 2.2467, + "step": 2744 + }, + { + "epoch": 0.5282383232668383, + "lm_loss": 1.624, + "step": 2744, + "vm_loss": 0.2133 + }, + { + "epoch": 0.5282383232668383, + "lm_loss": 2.0168, + "step": 2744, + "vm_loss": 0.1646 + }, + { + "epoch": 0.5282383232668383, + "lm_loss": 1.7295, + "step": 2744, + "vm_loss": 0.1664 + }, + { + "epoch": 0.5282383232668383, + "lm_loss": 2.048, + "step": 2744, + "vm_loss": 0.2146 + }, + { + "epoch": 0.5282383232668383, + "lm_loss": 1.7678, + "step": 2744, + "vm_loss": 0.1874 + }, + { + "epoch": 0.5282383232668383, + "lm_loss": 2.3112, + "step": 2744, + "vm_loss": 0.2289 + }, + { + "epoch": 0.5282383232668383, + "lm_loss": 2.3074, + "step": 2744, + "vm_loss": 0.1928 + }, + { + "epoch": 0.5282383232668383, + "lm_loss": 1.7058, + "step": 2744, + "vm_loss": 0.191 + }, + { + "epoch": 0.5284308299444137, + "grad_norm": 2.8288647038892303, + "learning_rate": 1.7258106229565115e-05, + "loss": 2.215, + "step": 2745 + }, + { + "epoch": 0.5286233366219891, + "grad_norm": 2.7066053185053094, + "learning_rate": 1.7255961092641212e-05, + "loss": 2.1805, + "step": 2746 + }, + { + "epoch": 0.5288158432995644, + "grad_norm": 2.984640230345748, + "learning_rate": 1.7253815250345058e-05, + "loss": 2.2483, + "step": 2747 + }, + { + "epoch": 0.5290083499771399, + "grad_norm": 2.8337366341359393, + "learning_rate": 1.7251668702885262e-05, + "loss": 2.2069, + "step": 2748 + }, + { + "epoch": 0.5292008566547152, + "grad_norm": 2.759195177892417, + "learning_rate": 1.724952145047049e-05, + "loss": 2.2198, + "step": 2749 + }, + { + "epoch": 0.5293933633322906, + "grad_norm": 3.2688140959407845, + "learning_rate": 1.7247373493309484e-05, + "loss": 2.2207, + "step": 2750 + }, + { + "epoch": 0.529585870009866, + "grad_norm": 3.1063217777207184, + "learning_rate": 1.7245224831611057e-05, + "loss": 2.193, + "step": 2751 + }, + { + "epoch": 0.5297783766874413, + "grad_norm": 2.868409794919299, + "learning_rate": 1.724307546558408e-05, + "loss": 2.2015, + "step": 2752 + }, + { + "epoch": 0.5297783766874413, + "lm_loss": 2.2212, + "step": 2752, + "vm_loss": 0.1071 + }, + { + "epoch": 0.5297783766874413, + "lm_loss": 2.0574, + "step": 2752, + "vm_loss": 0.2019 + }, + { + "epoch": 0.5297783766874413, + "lm_loss": 2.1518, + "step": 2752, + "vm_loss": 0.2377 + }, + { + "epoch": 0.5297783766874413, + "lm_loss": 2.2163, + "step": 2752, + "vm_loss": 0.1609 + }, + { + "epoch": 0.5297783766874413, + "lm_loss": 2.1522, + "step": 2752, + "vm_loss": 0.1578 + }, + { + "epoch": 0.5297783766874413, + "lm_loss": 1.8956, + "step": 2752, + "vm_loss": 0.2013 + }, + { + "epoch": 0.5297783766874413, + "lm_loss": 2.2918, + "step": 2752, + "vm_loss": 0.2074 + }, + { + "epoch": 0.5297783766874413, + "lm_loss": 2.1732, + "step": 2752, + "vm_loss": 0.1611 + }, + { + "epoch": 0.5299708833650167, + "grad_norm": 2.6304106171264325, + "learning_rate": 1.72409253954375e-05, + "loss": 2.2464, + "step": 2753 + }, + { + "epoch": 0.530163390042592, + "grad_norm": 2.9630607681296386, + "learning_rate": 1.7238774621380335e-05, + "loss": 2.2125, + "step": 2754 + }, + { + "epoch": 0.5303558967201675, + "grad_norm": 2.8777381660679406, + "learning_rate": 1.7236623143621666e-05, + "loss": 2.2109, + "step": 2755 + }, + { + "epoch": 0.5305484033977429, + "grad_norm": 2.981110883310165, + "learning_rate": 1.7234470962370646e-05, + "loss": 2.2314, + "step": 2756 + }, + { + "epoch": 0.5307409100753182, + "grad_norm": 3.030164749753389, + "learning_rate": 1.723231807783649e-05, + "loss": 2.2369, + "step": 2757 + }, + { + "epoch": 0.5309334167528936, + "grad_norm": 2.857420219750154, + "learning_rate": 1.723016449022849e-05, + "loss": 2.1863, + "step": 2758 + }, + { + "epoch": 0.531125923430469, + "grad_norm": 2.752568237645256, + "learning_rate": 1.7228010199756e-05, + "loss": 2.2047, + "step": 2759 + }, + { + "epoch": 0.5313184301080444, + "grad_norm": 2.812141531803607, + "learning_rate": 1.722585520662844e-05, + "loss": 2.278, + "step": 2760 + }, + { + "epoch": 0.5313184301080444, + "lm_loss": 2.1925, + "step": 2760, + "vm_loss": 0.1261 + }, + { + "epoch": 0.5313184301080444, + "lm_loss": 2.1859, + "step": 2760, + "vm_loss": 0.1818 + }, + { + "epoch": 0.5313184301080444, + "lm_loss": 1.9906, + "step": 2760, + "vm_loss": 0.1416 + }, + { + "epoch": 0.5313184301080444, + "lm_loss": 2.2004, + "step": 2760, + "vm_loss": 0.1429 + }, + { + "epoch": 0.5313184301080444, + "lm_loss": 1.8358, + "step": 2760, + "vm_loss": 0.1669 + }, + { + "epoch": 0.5313184301080444, + "lm_loss": 1.9115, + "step": 2760, + "vm_loss": 0.172 + }, + { + "epoch": 0.5313184301080444, + "lm_loss": 2.009, + "step": 2760, + "vm_loss": 0.2204 + }, + { + "epoch": 0.5313184301080444, + "lm_loss": 1.5956, + "step": 2760, + "vm_loss": 0.1282 + }, + { + "epoch": 0.5315109367856198, + "grad_norm": 3.1278518058623375, + "learning_rate": 1.7223699511055316e-05, + "loss": 2.1944, + "step": 2761 + }, + { + "epoch": 0.5317034434631951, + "grad_norm": 2.7367063073803077, + "learning_rate": 1.722154311324618e-05, + "loss": 2.2048, + "step": 2762 + }, + { + "epoch": 0.5318959501407705, + "grad_norm": 2.7763989944417538, + "learning_rate": 1.721938601341066e-05, + "loss": 2.2284, + "step": 2763 + }, + { + "epoch": 0.5320884568183459, + "grad_norm": 2.8416085685011425, + "learning_rate": 1.721722821175846e-05, + "loss": 2.1799, + "step": 2764 + }, + { + "epoch": 0.5322809634959212, + "grad_norm": 2.562367566254436, + "learning_rate": 1.721506970849934e-05, + "loss": 2.1748, + "step": 2765 + }, + { + "epoch": 0.5324734701734967, + "grad_norm": 2.7750132557249616, + "learning_rate": 1.7212910503843135e-05, + "loss": 2.1865, + "step": 2766 + }, + { + "epoch": 0.532665976851072, + "grad_norm": 2.941758721362244, + "learning_rate": 1.7210750597999753e-05, + "loss": 2.1543, + "step": 2767 + }, + { + "epoch": 0.5328584835286474, + "grad_norm": 3.0968797342934566, + "learning_rate": 1.7208589991179157e-05, + "loss": 2.1927, + "step": 2768 + }, + { + "epoch": 0.5328584835286474, + "lm_loss": 1.7928, + "step": 2768, + "vm_loss": 0.1899 + }, + { + "epoch": 0.5328584835286474, + "lm_loss": 2.2464, + "step": 2768, + "vm_loss": 0.2022 + }, + { + "epoch": 0.5328584835286474, + "lm_loss": 2.2616, + "step": 2768, + "vm_loss": 0.2046 + }, + { + "epoch": 0.5328584835286474, + "lm_loss": 2.0468, + "step": 2768, + "vm_loss": 0.1496 + }, + { + "epoch": 0.5328584835286474, + "lm_loss": 2.3645, + "step": 2768, + "vm_loss": 0.1542 + }, + { + "epoch": 0.5328584835286474, + "lm_loss": 1.9536, + "step": 2768, + "vm_loss": 0.1222 + }, + { + "epoch": 0.5328584835286474, + "lm_loss": 2.3235, + "step": 2768, + "vm_loss": 0.1796 + }, + { + "epoch": 0.5328584835286474, + "lm_loss": 1.8999, + "step": 2768, + "vm_loss": 0.2185 + }, + { + "epoch": 0.5330509902062228, + "grad_norm": 3.035133047702225, + "learning_rate": 1.7206428683591393e-05, + "loss": 2.2184, + "step": 2769 + }, + { + "epoch": 0.5332434968837981, + "grad_norm": 3.0391767015149704, + "learning_rate": 1.720426667544656e-05, + "loss": 2.1596, + "step": 2770 + }, + { + "epoch": 0.5334360035613736, + "grad_norm": 3.010789242514809, + "learning_rate": 1.720210396695484e-05, + "loss": 2.1463, + "step": 2771 + }, + { + "epoch": 0.5336285102389489, + "grad_norm": 2.8928833649492063, + "learning_rate": 1.719994055832647e-05, + "loss": 2.2712, + "step": 2772 + }, + { + "epoch": 0.5338210169165243, + "grad_norm": 3.0970783685947745, + "learning_rate": 1.7197776449771765e-05, + "loss": 2.2125, + "step": 2773 + }, + { + "epoch": 0.5340135235940997, + "grad_norm": 3.1253087245775246, + "learning_rate": 1.7195611641501103e-05, + "loss": 2.2247, + "step": 2774 + }, + { + "epoch": 0.534206030271675, + "grad_norm": 2.66580061465288, + "learning_rate": 1.719344613372493e-05, + "loss": 2.1911, + "step": 2775 + }, + { + "epoch": 0.5343985369492504, + "grad_norm": 2.8029580271758467, + "learning_rate": 1.719127992665376e-05, + "loss": 2.2206, + "step": 2776 + }, + { + "epoch": 0.5343985369492504, + "lm_loss": 1.9056, + "step": 2776, + "vm_loss": 0.1242 + }, + { + "epoch": 0.5343985369492504, + "lm_loss": 1.9529, + "step": 2776, + "vm_loss": 0.1151 + }, + { + "epoch": 0.5343985369492504, + "lm_loss": 2.3989, + "step": 2776, + "vm_loss": 0.1729 + }, + { + "epoch": 0.5343985369492504, + "lm_loss": 1.9788, + "step": 2776, + "vm_loss": 0.1425 + }, + { + "epoch": 0.5343985369492504, + "lm_loss": 2.0475, + "step": 2776, + "vm_loss": 0.1425 + }, + { + "epoch": 0.5343985369492504, + "lm_loss": 1.7666, + "step": 2776, + "vm_loss": 0.1359 + }, + { + "epoch": 0.5343985369492504, + "lm_loss": 1.6481, + "step": 2776, + "vm_loss": 0.169 + }, + { + "epoch": 0.5343985369492504, + "lm_loss": 1.8702, + "step": 2776, + "vm_loss": 0.1845 + }, + { + "epoch": 0.5345910436268259, + "grad_norm": 2.982385439722535, + "learning_rate": 1.718911302049818e-05, + "loss": 2.2108, + "step": 2777 + }, + { + "epoch": 0.5347835503044012, + "grad_norm": 2.9907747946050103, + "learning_rate": 1.7186945415468837e-05, + "loss": 2.1914, + "step": 2778 + }, + { + "epoch": 0.5349760569819766, + "grad_norm": 3.1081625210274915, + "learning_rate": 1.7184777111776455e-05, + "loss": 2.2222, + "step": 2779 + }, + { + "epoch": 0.5351685636595519, + "grad_norm": 2.886793828612081, + "learning_rate": 1.7182608109631816e-05, + "loss": 2.1566, + "step": 2780 + }, + { + "epoch": 0.5353610703371273, + "grad_norm": 3.1565866275434655, + "learning_rate": 1.7180438409245774e-05, + "loss": 2.2052, + "step": 2781 + }, + { + "epoch": 0.5355535770147027, + "grad_norm": 2.9984097833804024, + "learning_rate": 1.7178268010829256e-05, + "loss": 2.1325, + "step": 2782 + }, + { + "epoch": 0.535746083692278, + "grad_norm": 2.969263911422326, + "learning_rate": 1.717609691459325e-05, + "loss": 2.1218, + "step": 2783 + }, + { + "epoch": 0.5359385903698535, + "grad_norm": 3.202129451523313, + "learning_rate": 1.7173925120748816e-05, + "loss": 2.143, + "step": 2784 + }, + { + "epoch": 0.5359385903698535, + "lm_loss": 2.3837, + "step": 2784, + "vm_loss": 0.1155 + }, + { + "epoch": 0.5359385903698535, + "lm_loss": 2.3103, + "step": 2784, + "vm_loss": 0.2541 + }, + { + "epoch": 0.5359385903698535, + "lm_loss": 1.8356, + "step": 2784, + "vm_loss": 0.1745 + }, + { + "epoch": 0.5359385903698535, + "lm_loss": 2.1483, + "step": 2784, + "vm_loss": 0.1847 + }, + { + "epoch": 0.5359385903698535, + "lm_loss": 1.8324, + "step": 2784, + "vm_loss": 0.1842 + }, + { + "epoch": 0.5359385903698535, + "lm_loss": 2.2568, + "step": 2784, + "vm_loss": 0.1659 + }, + { + "epoch": 0.5359385903698535, + "lm_loss": 2.2442, + "step": 2784, + "vm_loss": 0.1961 + }, + { + "epoch": 0.5359385903698535, + "lm_loss": 2.0604, + "step": 2784, + "vm_loss": 0.1615 + }, + { + "epoch": 0.5361310970474288, + "grad_norm": 2.86582319485646, + "learning_rate": 1.7171752629507076e-05, + "loss": 2.2504, + "step": 2785 + }, + { + "epoch": 0.5363236037250042, + "grad_norm": 2.99909770256116, + "learning_rate": 1.716957944107923e-05, + "loss": 2.2329, + "step": 2786 + }, + { + "epoch": 0.5365161104025796, + "grad_norm": 2.9525908257221847, + "learning_rate": 1.7167405555676535e-05, + "loss": 2.1521, + "step": 2787 + }, + { + "epoch": 0.5367086170801549, + "grad_norm": 2.9226957819073816, + "learning_rate": 1.7165230973510323e-05, + "loss": 2.2156, + "step": 2788 + }, + { + "epoch": 0.5369011237577304, + "grad_norm": 2.8647890667620475, + "learning_rate": 1.7163055694791987e-05, + "loss": 2.2148, + "step": 2789 + }, + { + "epoch": 0.5370936304353058, + "grad_norm": 2.736339506663662, + "learning_rate": 1.7160879719733002e-05, + "loss": 2.1976, + "step": 2790 + }, + { + "epoch": 0.5372861371128811, + "grad_norm": 3.3970062916063815, + "learning_rate": 1.715870304854489e-05, + "loss": 2.2683, + "step": 2791 + }, + { + "epoch": 0.5374786437904565, + "grad_norm": 2.8899381446193178, + "learning_rate": 1.7156525681439252e-05, + "loss": 2.199, + "step": 2792 + }, + { + "epoch": 0.5374786437904565, + "lm_loss": 2.022, + "step": 2792, + "vm_loss": 0.1568 + }, + { + "epoch": 0.5374786437904565, + "lm_loss": 1.9709, + "step": 2792, + "vm_loss": 0.2046 + }, + { + "epoch": 0.5374786437904565, + "lm_loss": 1.962, + "step": 2792, + "vm_loss": 0.1954 + }, + { + "epoch": 0.5374786437904565, + "lm_loss": 2.1104, + "step": 2792, + "vm_loss": 0.144 + }, + { + "epoch": 0.5374786437904565, + "lm_loss": 2.4044, + "step": 2792, + "vm_loss": 0.1244 + }, + { + "epoch": 0.5374786437904565, + "lm_loss": 2.016, + "step": 2792, + "vm_loss": 0.1253 + }, + { + "epoch": 0.5374786437904565, + "lm_loss": 2.1678, + "step": 2792, + "vm_loss": 0.199 + }, + { + "epoch": 0.5374786437904565, + "lm_loss": 1.7413, + "step": 2792, + "vm_loss": 0.164 + }, + { + "epoch": 0.5376711504680318, + "grad_norm": 2.877157366137414, + "learning_rate": 1.7154347618627763e-05, + "loss": 2.1965, + "step": 2793 + }, + { + "epoch": 0.5378636571456072, + "grad_norm": 2.812799175092424, + "learning_rate": 1.7152168860322156e-05, + "loss": 2.1877, + "step": 2794 + }, + { + "epoch": 0.5380561638231827, + "grad_norm": 2.9351006381214706, + "learning_rate": 1.714998940673423e-05, + "loss": 2.2072, + "step": 2795 + }, + { + "epoch": 0.538248670500758, + "grad_norm": 3.1849733400213878, + "learning_rate": 1.7147809258075862e-05, + "loss": 2.1958, + "step": 2796 + }, + { + "epoch": 0.5384411771783334, + "grad_norm": 2.9396554295358626, + "learning_rate": 1.7145628414558982e-05, + "loss": 2.2508, + "step": 2797 + }, + { + "epoch": 0.5386336838559087, + "grad_norm": 2.724302442394363, + "learning_rate": 1.71434468763956e-05, + "loss": 2.1736, + "step": 2798 + }, + { + "epoch": 0.5388261905334841, + "grad_norm": 2.9456581261913795, + "learning_rate": 1.7141264643797796e-05, + "loss": 2.1665, + "step": 2799 + }, + { + "epoch": 0.5390186972110595, + "grad_norm": 3.3403082063681304, + "learning_rate": 1.7139081716977704e-05, + "loss": 2.1858, + "step": 2800 + }, + { + "epoch": 0.5390186972110595, + "lm_loss": 2.0765, + "step": 2800, + "vm_loss": 0.1951 + }, + { + "epoch": 0.5390186972110595, + "lm_loss": 2.2672, + "step": 2800, + "vm_loss": 0.1561 + }, + { + "epoch": 0.5390186972110595, + "lm_loss": 2.2168, + "step": 2800, + "vm_loss": 0.1364 + }, + { + "epoch": 0.5390186972110595, + "lm_loss": 1.8619, + "step": 2800, + "vm_loss": 0.1415 + }, + { + "epoch": 0.5390186972110595, + "lm_loss": 1.931, + "step": 2800, + "vm_loss": 0.246 + }, + { + "epoch": 0.5390186972110595, + "lm_loss": 2.0091, + "step": 2800, + "vm_loss": 0.1434 + }, + { + "epoch": 0.5390186972110595, + "lm_loss": 2.437, + "step": 2800, + "vm_loss": 0.1618 + }, + { + "epoch": 0.5390186972110595, + "lm_loss": 2.2046, + "step": 2800, + "vm_loss": 0.172 + }, + { + "epoch": 0.5392112038886349, + "grad_norm": 2.6705208808738887, + "learning_rate": 1.7136898096147533e-05, + "loss": 2.1814, + "step": 2801 + }, + { + "epoch": 0.5394037105662103, + "grad_norm": 3.0522969891492693, + "learning_rate": 1.7134713781519556e-05, + "loss": 2.2477, + "step": 2802 + }, + { + "epoch": 0.5395962172437856, + "grad_norm": 3.5958958471394062, + "learning_rate": 1.7132528773306124e-05, + "loss": 2.2008, + "step": 2803 + }, + { + "epoch": 0.539788723921361, + "grad_norm": 2.8131822172768723, + "learning_rate": 1.713034307171964e-05, + "loss": 2.1877, + "step": 2804 + }, + { + "epoch": 0.5399812305989364, + "grad_norm": 3.153468776070917, + "learning_rate": 1.7128156676972587e-05, + "loss": 2.2204, + "step": 2805 + }, + { + "epoch": 0.5401737372765117, + "grad_norm": 3.482230010017003, + "learning_rate": 1.712596958927751e-05, + "loss": 2.2671, + "step": 2806 + }, + { + "epoch": 0.5403662439540872, + "grad_norm": 2.71188788581524, + "learning_rate": 1.7123781808847023e-05, + "loss": 2.209, + "step": 2807 + }, + { + "epoch": 0.5405587506316626, + "grad_norm": 3.2041032310884523, + "learning_rate": 1.71215933358938e-05, + "loss": 2.2697, + "step": 2808 + }, + { + "epoch": 0.5405587506316626, + "lm_loss": 1.9634, + "step": 2808, + "vm_loss": 0.1411 + }, + { + "epoch": 0.5405587506316626, + "lm_loss": 2.0582, + "step": 2808, + "vm_loss": 0.1799 + }, + { + "epoch": 0.5405587506316626, + "lm_loss": 1.9243, + "step": 2808, + "vm_loss": 0.1876 + }, + { + "epoch": 0.5405587506316626, + "lm_loss": 1.8375, + "step": 2808, + "vm_loss": 0.1443 + }, + { + "epoch": 0.5405587506316626, + "lm_loss": 1.8677, + "step": 2808, + "vm_loss": 0.2043 + }, + { + "epoch": 0.5405587506316626, + "lm_loss": 2.0188, + "step": 2808, + "vm_loss": 0.1809 + }, + { + "epoch": 0.5405587506316626, + "lm_loss": 2.1829, + "step": 2808, + "vm_loss": 0.1263 + }, + { + "epoch": 0.5405587506316626, + "lm_loss": 2.0511, + "step": 2808, + "vm_loss": 0.1102 + }, + { + "epoch": 0.5407512573092379, + "grad_norm": 2.9209754893485624, + "learning_rate": 1.7119404170630594e-05, + "loss": 2.1637, + "step": 2809 + }, + { + "epoch": 0.5409437639868133, + "grad_norm": 2.8498416561913658, + "learning_rate": 1.7117214313270224e-05, + "loss": 2.1383, + "step": 2810 + }, + { + "epoch": 0.5411362706643886, + "grad_norm": 3.304679376655516, + "learning_rate": 1.7115023764025562e-05, + "loss": 2.2082, + "step": 2811 + }, + { + "epoch": 0.541328777341964, + "grad_norm": 2.8835366396841406, + "learning_rate": 1.711283252310956e-05, + "loss": 2.19, + "step": 2812 + }, + { + "epoch": 0.5415212840195395, + "grad_norm": 2.690694900171086, + "learning_rate": 1.7110640590735243e-05, + "loss": 2.1414, + "step": 2813 + }, + { + "epoch": 0.5417137906971148, + "grad_norm": 3.153531385476327, + "learning_rate": 1.7108447967115686e-05, + "loss": 2.2194, + "step": 2814 + }, + { + "epoch": 0.5419062973746902, + "grad_norm": 2.801025006421387, + "learning_rate": 1.7106254652464045e-05, + "loss": 2.1248, + "step": 2815 + }, + { + "epoch": 0.5420988040522655, + "grad_norm": 3.000140822298546, + "learning_rate": 1.7104060646993535e-05, + "loss": 2.2304, + "step": 2816 + }, + { + "epoch": 0.5420988040522655, + "lm_loss": 2.0456, + "step": 2816, + "vm_loss": 0.1848 + }, + { + "epoch": 0.5420988040522655, + "lm_loss": 2.2187, + "step": 2816, + "vm_loss": 0.146 + }, + { + "epoch": 0.5420988040522655, + "lm_loss": 2.0623, + "step": 2816, + "vm_loss": 0.2225 + }, + { + "epoch": 0.5420988040522655, + "lm_loss": 1.976, + "step": 2816, + "vm_loss": 0.1916 + }, + { + "epoch": 0.5420988040522655, + "lm_loss": 2.1747, + "step": 2816, + "vm_loss": 0.1324 + }, + { + "epoch": 0.5420988040522655, + "lm_loss": 1.9973, + "step": 2816, + "vm_loss": 0.1789 + }, + { + "epoch": 0.5420988040522655, + "lm_loss": 2.0629, + "step": 2816, + "vm_loss": 0.1927 + }, + { + "epoch": 0.5420988040522655, + "lm_loss": 1.9232, + "step": 2816, + "vm_loss": 0.1245 + }, + { + "epoch": 0.5422913107298409, + "grad_norm": 3.155359265008571, + "learning_rate": 1.7101865950917443e-05, + "loss": 2.1552, + "step": 2817 + }, + { + "epoch": 0.5424838174074164, + "grad_norm": 2.651478041574105, + "learning_rate": 1.709967056444912e-05, + "loss": 2.1768, + "step": 2818 + }, + { + "epoch": 0.5426763240849917, + "grad_norm": 2.620746514356997, + "learning_rate": 1.7097474487801987e-05, + "loss": 2.1803, + "step": 2819 + }, + { + "epoch": 0.5428688307625671, + "grad_norm": 2.7817344526616523, + "learning_rate": 1.709527772118953e-05, + "loss": 2.19, + "step": 2820 + }, + { + "epoch": 0.5430613374401424, + "grad_norm": 2.861634875619585, + "learning_rate": 1.7093080264825304e-05, + "loss": 2.2197, + "step": 2821 + }, + { + "epoch": 0.5432538441177178, + "grad_norm": 2.66975632541193, + "learning_rate": 1.709088211892293e-05, + "loss": 2.1843, + "step": 2822 + }, + { + "epoch": 0.5434463507952932, + "grad_norm": 2.8147879522962307, + "learning_rate": 1.7088683283696096e-05, + "loss": 2.2398, + "step": 2823 + }, + { + "epoch": 0.5436388574728686, + "grad_norm": 2.7414116432024893, + "learning_rate": 1.7086483759358556e-05, + "loss": 2.101, + "step": 2824 + }, + { + "epoch": 0.5436388574728686, + "lm_loss": 2.2263, + "step": 2824, + "vm_loss": 0.1673 + }, + { + "epoch": 0.5436388574728686, + "lm_loss": 1.8173, + "step": 2824, + "vm_loss": 0.1671 + }, + { + "epoch": 0.5436388574728686, + "lm_loss": 2.098, + "step": 2824, + "vm_loss": 0.1413 + }, + { + "epoch": 0.5436388574728686, + "lm_loss": 1.7867, + "step": 2824, + "vm_loss": 0.1533 + }, + { + "epoch": 0.5436388574728686, + "lm_loss": 1.8203, + "step": 2824, + "vm_loss": 0.1364 + }, + { + "epoch": 0.5436388574728686, + "lm_loss": 2.0425, + "step": 2824, + "vm_loss": 0.1308 + }, + { + "epoch": 0.5436388574728686, + "lm_loss": 2.0885, + "step": 2824, + "vm_loss": 0.254 + }, + { + "epoch": 0.5436388574728686, + "lm_loss": 2.3214, + "step": 2824, + "vm_loss": 0.2387 + }, + { + "epoch": 0.543831364150444, + "grad_norm": 3.080524081987845, + "learning_rate": 1.708428354612413e-05, + "loss": 2.1746, + "step": 2825 + }, + { + "epoch": 0.5440238708280194, + "grad_norm": 3.1100842289094266, + "learning_rate": 1.7082082644206708e-05, + "loss": 2.1619, + "step": 2826 + }, + { + "epoch": 0.5442163775055947, + "grad_norm": 2.8263010921321268, + "learning_rate": 1.707988105382025e-05, + "loss": 2.2335, + "step": 2827 + }, + { + "epoch": 0.5444088841831701, + "grad_norm": 2.8722930245985814, + "learning_rate": 1.7077678775178774e-05, + "loss": 2.1514, + "step": 2828 + }, + { + "epoch": 0.5446013908607454, + "grad_norm": 2.7052072708399395, + "learning_rate": 1.7075475808496368e-05, + "loss": 2.1564, + "step": 2829 + }, + { + "epoch": 0.5447938975383209, + "grad_norm": 2.9833975971555566, + "learning_rate": 1.7073272153987196e-05, + "loss": 2.1882, + "step": 2830 + }, + { + "epoch": 0.5449864042158963, + "grad_norm": 2.848153359786647, + "learning_rate": 1.7071067811865477e-05, + "loss": 2.138, + "step": 2831 + }, + { + "epoch": 0.5451789108934716, + "grad_norm": 2.7681203239902206, + "learning_rate": 1.70688627823455e-05, + "loss": 2.1482, + "step": 2832 + }, + { + "epoch": 0.5451789108934716, + "lm_loss": 1.8498, + "step": 2832, + "vm_loss": 0.1166 + }, + { + "epoch": 0.5451789108934716, + "lm_loss": 1.7863, + "step": 2832, + "vm_loss": 0.1505 + }, + { + "epoch": 0.5451789108934716, + "lm_loss": 2.1311, + "step": 2832, + "vm_loss": 0.1787 + }, + { + "epoch": 0.5451789108934716, + "lm_loss": 1.9125, + "step": 2832, + "vm_loss": 0.2082 + }, + { + "epoch": 0.5451789108934716, + "lm_loss": 2.0878, + "step": 2832, + "vm_loss": 0.2114 + }, + { + "epoch": 0.5451789108934716, + "lm_loss": 1.5625, + "step": 2832, + "vm_loss": 0.135 + }, + { + "epoch": 0.5451789108934716, + "lm_loss": 2.14, + "step": 2832, + "vm_loss": 0.2575 + }, + { + "epoch": 0.5451789108934716, + "lm_loss": 2.0169, + "step": 2832, + "vm_loss": 0.1512 + }, + { + "epoch": 0.545371417571047, + "grad_norm": 2.8729993302896286, + "learning_rate": 1.7066657065641624e-05, + "loss": 2.2296, + "step": 2833 + }, + { + "epoch": 0.5455639242486223, + "grad_norm": 2.904684301348938, + "learning_rate": 1.706445066196827e-05, + "loss": 2.2374, + "step": 2834 + }, + { + "epoch": 0.5457564309261977, + "grad_norm": 3.0279667340139156, + "learning_rate": 1.7062243571539935e-05, + "loss": 2.1113, + "step": 2835 + }, + { + "epoch": 0.5459489376037732, + "grad_norm": 2.83401379299048, + "learning_rate": 1.706003579457117e-05, + "loss": 2.1863, + "step": 2836 + }, + { + "epoch": 0.5461414442813485, + "grad_norm": 2.681881971783011, + "learning_rate": 1.70578273312766e-05, + "loss": 2.2348, + "step": 2837 + }, + { + "epoch": 0.5463339509589239, + "grad_norm": 2.9269528542419914, + "learning_rate": 1.705561818187092e-05, + "loss": 2.1534, + "step": 2838 + }, + { + "epoch": 0.5465264576364993, + "grad_norm": 2.8765052311848227, + "learning_rate": 1.7053408346568884e-05, + "loss": 2.2024, + "step": 2839 + }, + { + "epoch": 0.5467189643140746, + "grad_norm": 2.74906735945929, + "learning_rate": 1.7051197825585316e-05, + "loss": 2.1726, + "step": 2840 + }, + { + "epoch": 0.5467189643140746, + "lm_loss": 2.137, + "step": 2840, + "vm_loss": 0.1261 + }, + { + "epoch": 0.5467189643140746, + "lm_loss": 2.4126, + "step": 2840, + "vm_loss": 0.1139 + }, + { + "epoch": 0.5467189643140746, + "lm_loss": 1.8027, + "step": 2840, + "vm_loss": 0.1915 + }, + { + "epoch": 0.5467189643140746, + "lm_loss": 1.9414, + "step": 2840, + "vm_loss": 0.1785 + }, + { + "epoch": 0.5467189643140746, + "lm_loss": 2.0159, + "step": 2840, + "vm_loss": 0.2042 + }, + { + "epoch": 0.5467189643140746, + "lm_loss": 2.03, + "step": 2840, + "vm_loss": 0.2136 + }, + { + "epoch": 0.5467189643140746, + "lm_loss": 1.883, + "step": 2840, + "vm_loss": 0.1713 + }, + { + "epoch": 0.5467189643140746, + "lm_loss": 1.8566, + "step": 2840, + "vm_loss": 0.1311 + }, + { + "epoch": 0.54691147099165, + "grad_norm": 2.8649960595451627, + "learning_rate": 1.7048986619135108e-05, + "loss": 2.1805, + "step": 2841 + }, + { + "epoch": 0.5471039776692254, + "grad_norm": 2.943181826477159, + "learning_rate": 1.7046774727433223e-05, + "loss": 2.2107, + "step": 2842 + }, + { + "epoch": 0.5472964843468008, + "grad_norm": 2.700998685676341, + "learning_rate": 1.7044562150694675e-05, + "loss": 2.1649, + "step": 2843 + }, + { + "epoch": 0.5474889910243762, + "grad_norm": 2.7662927302752514, + "learning_rate": 1.704234888913456e-05, + "loss": 2.1768, + "step": 2844 + }, + { + "epoch": 0.5476814977019515, + "grad_norm": 3.134918485389084, + "learning_rate": 1.7040134942968037e-05, + "loss": 2.2271, + "step": 2845 + }, + { + "epoch": 0.5478740043795269, + "grad_norm": 2.6568599048952963, + "learning_rate": 1.7037920312410325e-05, + "loss": 2.2275, + "step": 2846 + }, + { + "epoch": 0.5480665110571022, + "grad_norm": 2.716705190357507, + "learning_rate": 1.7035704997676718e-05, + "loss": 2.1732, + "step": 2847 + }, + { + "epoch": 0.5482590177346777, + "grad_norm": 2.877005783063368, + "learning_rate": 1.7033488998982573e-05, + "loss": 2.2098, + "step": 2848 + }, + { + "epoch": 0.5482590177346777, + "lm_loss": 2.2362, + "step": 2848, + "vm_loss": 0.1766 + }, + { + "epoch": 0.5482590177346777, + "lm_loss": 1.9952, + "step": 2848, + "vm_loss": 0.1336 + }, + { + "epoch": 0.5482590177346777, + "lm_loss": 2.1946, + "step": 2848, + "vm_loss": 0.1777 + }, + { + "epoch": 0.5482590177346777, + "lm_loss": 2.178, + "step": 2848, + "vm_loss": 0.0952 + }, + { + "epoch": 0.5482590177346777, + "lm_loss": 1.4631, + "step": 2848, + "vm_loss": 0.148 + }, + { + "epoch": 0.5482590177346777, + "lm_loss": 2.1477, + "step": 2848, + "vm_loss": 0.2079 + }, + { + "epoch": 0.5482590177346777, + "lm_loss": 2.0217, + "step": 2848, + "vm_loss": 0.1804 + }, + { + "epoch": 0.5482590177346777, + "lm_loss": 1.9593, + "step": 2848, + "vm_loss": 0.1441 + }, + { + "epoch": 0.5484515244122531, + "grad_norm": 2.805428232674696, + "learning_rate": 1.703127231654331e-05, + "loss": 2.1781, + "step": 2849 + }, + { + "epoch": 0.5486440310898284, + "grad_norm": 2.825724704161364, + "learning_rate": 1.702905495057443e-05, + "loss": 2.1874, + "step": 2850 + }, + { + "epoch": 0.5488365377674038, + "grad_norm": 2.9679952407920496, + "learning_rate": 1.7026836901291475e-05, + "loss": 2.2046, + "step": 2851 + }, + { + "epoch": 0.5490290444449791, + "grad_norm": 2.5453465836182896, + "learning_rate": 1.7024618168910075e-05, + "loss": 2.1409, + "step": 2852 + }, + { + "epoch": 0.5492215511225546, + "grad_norm": 2.90415321555807, + "learning_rate": 1.7022398753645918e-05, + "loss": 2.1689, + "step": 2853 + }, + { + "epoch": 0.54941405780013, + "grad_norm": 2.699163942425459, + "learning_rate": 1.7020178655714757e-05, + "loss": 2.1864, + "step": 2854 + }, + { + "epoch": 0.5496065644777053, + "grad_norm": 2.6752299095346834, + "learning_rate": 1.7017957875332417e-05, + "loss": 2.1632, + "step": 2855 + }, + { + "epoch": 0.5497990711552807, + "grad_norm": 2.4777289329560186, + "learning_rate": 1.7015736412714786e-05, + "loss": 2.1717, + "step": 2856 + }, + { + "epoch": 0.5497990711552807, + "lm_loss": 2.3689, + "step": 2856, + "vm_loss": 0.094 + }, + { + "epoch": 0.5497990711552807, + "lm_loss": 1.9439, + "step": 2856, + "vm_loss": 0.1597 + }, + { + "epoch": 0.5497990711552807, + "lm_loss": 2.0768, + "step": 2856, + "vm_loss": 0.1721 + }, + { + "epoch": 0.5497990711552807, + "lm_loss": 1.8754, + "step": 2856, + "vm_loss": 0.1698 + }, + { + "epoch": 0.5497990711552807, + "lm_loss": 2.3018, + "step": 2856, + "vm_loss": 0.1614 + }, + { + "epoch": 0.5497990711552807, + "lm_loss": 2.1182, + "step": 2856, + "vm_loss": 0.1393 + }, + { + "epoch": 0.5497990711552807, + "lm_loss": 1.9846, + "step": 2856, + "vm_loss": 0.1686 + }, + { + "epoch": 0.5497990711552807, + "lm_loss": 1.9328, + "step": 2856, + "vm_loss": 0.1467 + }, + { + "epoch": 0.5499915778328561, + "grad_norm": 2.858328302557514, + "learning_rate": 1.701351426807782e-05, + "loss": 2.1587, + "step": 2857 + }, + { + "epoch": 0.5501840845104314, + "grad_norm": 2.903914100249301, + "learning_rate": 1.7011291441637537e-05, + "loss": 2.2268, + "step": 2858 + }, + { + "epoch": 0.5503765911880069, + "grad_norm": 3.0091860471084755, + "learning_rate": 1.7009067933610027e-05, + "loss": 2.1963, + "step": 2859 + }, + { + "epoch": 0.5505690978655822, + "grad_norm": 2.503947266375995, + "learning_rate": 1.7006843744211437e-05, + "loss": 2.2147, + "step": 2860 + }, + { + "epoch": 0.5507616045431576, + "grad_norm": 2.7778365266132807, + "learning_rate": 1.7004618873657994e-05, + "loss": 2.1082, + "step": 2861 + }, + { + "epoch": 0.550954111220733, + "grad_norm": 3.128715291659576, + "learning_rate": 1.7002393322165982e-05, + "loss": 2.2089, + "step": 2862 + }, + { + "epoch": 0.5511466178983083, + "grad_norm": 2.7948260365575583, + "learning_rate": 1.7000167089951755e-05, + "loss": 2.1869, + "step": 2863 + }, + { + "epoch": 0.5513391245758837, + "grad_norm": 2.7680988132528066, + "learning_rate": 1.6997940177231722e-05, + "loss": 2.2366, + "step": 2864 + }, + { + "epoch": 0.5513391245758837, + "lm_loss": 2.0758, + "step": 2864, + "vm_loss": 0.1162 + }, + { + "epoch": 0.5513391245758837, + "lm_loss": 2.3273, + "step": 2864, + "vm_loss": 0.181 + }, + { + "epoch": 0.5513391245758837, + "lm_loss": 1.9972, + "step": 2864, + "vm_loss": 0.2417 + }, + { + "epoch": 0.5513391245758837, + "lm_loss": 2.1159, + "step": 2864, + "vm_loss": 0.1719 + }, + { + "epoch": 0.5513391245758837, + "lm_loss": 1.7565, + "step": 2864, + "vm_loss": 0.1395 + }, + { + "epoch": 0.5513391245758837, + "lm_loss": 2.1037, + "step": 2864, + "vm_loss": 0.2238 + }, + { + "epoch": 0.5513391245758837, + "lm_loss": 1.7659, + "step": 2864, + "vm_loss": 0.1324 + }, + { + "epoch": 0.5513391245758837, + "lm_loss": 2.0511, + "step": 2864, + "vm_loss": 0.158 + }, + { + "epoch": 0.551531631253459, + "grad_norm": 2.880083912273121, + "learning_rate": 1.6995712584222378e-05, + "loss": 2.1681, + "step": 2865 + }, + { + "epoch": 0.5517241379310345, + "grad_norm": 3.0684582465130017, + "learning_rate": 1.699348431114027e-05, + "loss": 2.2094, + "step": 2866 + }, + { + "epoch": 0.5519166446086099, + "grad_norm": 2.734666545633309, + "learning_rate": 1.6991255358202012e-05, + "loss": 2.1669, + "step": 2867 + }, + { + "epoch": 0.5521091512861852, + "grad_norm": 2.9895071283120935, + "learning_rate": 1.698902572562429e-05, + "loss": 2.1927, + "step": 2868 + }, + { + "epoch": 0.5523016579637606, + "grad_norm": 2.906917339030937, + "learning_rate": 1.698679541362385e-05, + "loss": 2.1975, + "step": 2869 + }, + { + "epoch": 0.552494164641336, + "grad_norm": 2.9262093702459535, + "learning_rate": 1.698456442241751e-05, + "loss": 2.1954, + "step": 2870 + }, + { + "epoch": 0.5526866713189114, + "grad_norm": 2.8019443629237895, + "learning_rate": 1.698233275222215e-05, + "loss": 2.2183, + "step": 2871 + }, + { + "epoch": 0.5528791779964868, + "grad_norm": 3.077509499146404, + "learning_rate": 1.6980100403254714e-05, + "loss": 2.1781, + "step": 2872 + }, + { + "epoch": 0.5528791779964868, + "lm_loss": 2.0781, + "step": 2872, + "vm_loss": 0.1613 + }, + { + "epoch": 0.5528791779964868, + "lm_loss": 2.0759, + "step": 2872, + "vm_loss": 0.1644 + }, + { + "epoch": 0.5528791779964868, + "lm_loss": 2.194, + "step": 2872, + "vm_loss": 0.1971 + }, + { + "epoch": 0.5528791779964868, + "lm_loss": 2.2238, + "step": 2872, + "vm_loss": 0.1854 + }, + { + "epoch": 0.5528791779964868, + "lm_loss": 2.0502, + "step": 2872, + "vm_loss": 0.1488 + }, + { + "epoch": 0.5528791779964868, + "lm_loss": 2.0838, + "step": 2872, + "vm_loss": 0.1667 + }, + { + "epoch": 0.5528791779964868, + "lm_loss": 1.7142, + "step": 2872, + "vm_loss": 0.123 + }, + { + "epoch": 0.5528791779964868, + "lm_loss": 2.3354, + "step": 2872, + "vm_loss": 0.1655 + }, + { + "epoch": 0.5530716846740621, + "grad_norm": 2.890288961610733, + "learning_rate": 1.6977867375732223e-05, + "loss": 2.2113, + "step": 2873 + }, + { + "epoch": 0.5532641913516375, + "grad_norm": 2.7646624113582723, + "learning_rate": 1.6975633669871748e-05, + "loss": 2.2386, + "step": 2874 + }, + { + "epoch": 0.5534566980292129, + "grad_norm": 3.209625539138657, + "learning_rate": 1.697339928589043e-05, + "loss": 2.1852, + "step": 2875 + }, + { + "epoch": 0.5536492047067882, + "grad_norm": 2.835435847009742, + "learning_rate": 1.6971164224005492e-05, + "loss": 2.1425, + "step": 2876 + }, + { + "epoch": 0.5538417113843637, + "grad_norm": 2.8377171924653344, + "learning_rate": 1.69689284844342e-05, + "loss": 2.1973, + "step": 2877 + }, + { + "epoch": 0.554034218061939, + "grad_norm": 2.8293848207634245, + "learning_rate": 1.6966692067393905e-05, + "loss": 2.1897, + "step": 2878 + }, + { + "epoch": 0.5542267247395144, + "grad_norm": 2.858469689350339, + "learning_rate": 1.6964454973102008e-05, + "loss": 2.1411, + "step": 2879 + }, + { + "epoch": 0.5544192314170898, + "grad_norm": 2.758531640707539, + "learning_rate": 1.6962217201775986e-05, + "loss": 2.2343, + "step": 2880 + }, + { + "epoch": 0.5544192314170898, + "lm_loss": 2.0407, + "step": 2880, + "vm_loss": 0.1175 + }, + { + "epoch": 0.5544192314170898, + "lm_loss": 1.9243, + "step": 2880, + "vm_loss": 0.1309 + }, + { + "epoch": 0.5544192314170898, + "lm_loss": 2.1767, + "step": 2880, + "vm_loss": 0.1439 + }, + { + "epoch": 0.5544192314170898, + "lm_loss": 2.2357, + "step": 2880, + "vm_loss": 0.173 + }, + { + "epoch": 0.5544192314170898, + "lm_loss": 2.3737, + "step": 2880, + "vm_loss": 0.1877 + }, + { + "epoch": 0.5544192314170898, + "lm_loss": 1.9462, + "step": 2880, + "vm_loss": 0.1866 + }, + { + "epoch": 0.5544192314170898, + "lm_loss": 2.3412, + "step": 2880, + "vm_loss": 0.1628 + }, + { + "epoch": 0.5544192314170898, + "lm_loss": 1.9693, + "step": 2880, + "vm_loss": 0.1332 + }, + { + "epoch": 0.5546117380946651, + "grad_norm": 2.7825699052644626, + "learning_rate": 1.6959978753633382e-05, + "loss": 2.1886, + "step": 2881 + }, + { + "epoch": 0.5548042447722406, + "grad_norm": 2.878408221535546, + "learning_rate": 1.6957739628891797e-05, + "loss": 2.2125, + "step": 2882 + }, + { + "epoch": 0.5549967514498159, + "grad_norm": 2.8326487998668726, + "learning_rate": 1.6955499827768905e-05, + "loss": 2.1505, + "step": 2883 + }, + { + "epoch": 0.5551892581273913, + "grad_norm": 2.779430971347055, + "learning_rate": 1.6953259350482444e-05, + "loss": 2.2568, + "step": 2884 + }, + { + "epoch": 0.5553817648049667, + "grad_norm": 2.63230265808158, + "learning_rate": 1.695101819725021e-05, + "loss": 2.0978, + "step": 2885 + }, + { + "epoch": 0.555574271482542, + "grad_norm": 2.80406006509171, + "learning_rate": 1.6948776368290085e-05, + "loss": 2.1764, + "step": 2886 + }, + { + "epoch": 0.5557667781601174, + "grad_norm": 2.646791310225615, + "learning_rate": 1.6946533863819993e-05, + "loss": 2.1503, + "step": 2887 + }, + { + "epoch": 0.5559592848376929, + "grad_norm": 2.669685720958, + "learning_rate": 1.694429068405794e-05, + "loss": 2.1791, + "step": 2888 + }, + { + "epoch": 0.5559592848376929, + "lm_loss": 2.0098, + "step": 2888, + "vm_loss": 0.1754 + }, + { + "epoch": 0.5559592848376929, + "lm_loss": 2.0938, + "step": 2888, + "vm_loss": 0.1918 + }, + { + "epoch": 0.5559592848376929, + "lm_loss": 2.1374, + "step": 2888, + "vm_loss": 0.1505 + }, + { + "epoch": 0.5559592848376929, + "lm_loss": 1.973, + "step": 2888, + "vm_loss": 0.166 + }, + { + "epoch": 0.5559592848376929, + "lm_loss": 2.0896, + "step": 2888, + "vm_loss": 0.1687 + }, + { + "epoch": 0.5559592848376929, + "lm_loss": 2.0662, + "step": 2888, + "vm_loss": 0.1292 + }, + { + "epoch": 0.5559592848376929, + "lm_loss": 2.6298, + "step": 2888, + "vm_loss": 0.1379 + }, + { + "epoch": 0.5559592848376929, + "lm_loss": 2.1503, + "step": 2888, + "vm_loss": 0.2306 + }, + { + "epoch": 0.5561517915152682, + "grad_norm": 2.854749072449087, + "learning_rate": 1.6942046829221986e-05, + "loss": 2.251, + "step": 2889 + }, + { + "epoch": 0.5563442981928436, + "grad_norm": 2.9419846354740224, + "learning_rate": 1.6939802299530266e-05, + "loss": 2.1314, + "step": 2890 + }, + { + "epoch": 0.5565368048704189, + "grad_norm": 2.593903165665498, + "learning_rate": 1.6937557095200978e-05, + "loss": 2.1317, + "step": 2891 + }, + { + "epoch": 0.5567293115479943, + "grad_norm": 2.9585810463133506, + "learning_rate": 1.6935311216452382e-05, + "loss": 2.1902, + "step": 2892 + }, + { + "epoch": 0.5569218182255697, + "grad_norm": 2.714056102608393, + "learning_rate": 1.6933064663502805e-05, + "loss": 2.1536, + "step": 2893 + }, + { + "epoch": 0.557114324903145, + "grad_norm": 2.636195004657239, + "learning_rate": 1.6930817436570644e-05, + "loss": 2.2033, + "step": 2894 + }, + { + "epoch": 0.5573068315807205, + "grad_norm": 2.7343012068680674, + "learning_rate": 1.692856953587436e-05, + "loss": 2.1655, + "step": 2895 + }, + { + "epoch": 0.5574993382582958, + "grad_norm": 2.766015931399634, + "learning_rate": 1.6926320961632475e-05, + "loss": 2.1219, + "step": 2896 + }, + { + "epoch": 0.5574993382582958, + "lm_loss": 1.6178, + "step": 2896, + "vm_loss": 0.2169 + }, + { + "epoch": 0.5574993382582958, + "lm_loss": 2.216, + "step": 2896, + "vm_loss": 0.1645 + }, + { + "epoch": 0.5574993382582958, + "lm_loss": 2.1938, + "step": 2896, + "vm_loss": 0.2207 + }, + { + "epoch": 0.5574993382582958, + "lm_loss": 2.2074, + "step": 2896, + "vm_loss": 0.1449 + }, + { + "epoch": 0.5574993382582958, + "lm_loss": 2.2695, + "step": 2896, + "vm_loss": 0.1932 + }, + { + "epoch": 0.5574993382582958, + "lm_loss": 2.2981, + "step": 2896, + "vm_loss": 0.1238 + }, + { + "epoch": 0.5574993382582958, + "lm_loss": 2.1422, + "step": 2896, + "vm_loss": 0.2147 + }, + { + "epoch": 0.5574993382582958, + "lm_loss": 1.9907, + "step": 2896, + "vm_loss": 0.1535 + }, + { + "epoch": 0.5576918449358712, + "grad_norm": 2.847538388995897, + "learning_rate": 1.6924071714063574e-05, + "loss": 2.2251, + "step": 2897 + }, + { + "epoch": 0.5578843516134466, + "grad_norm": 2.8123256232025966, + "learning_rate": 1.6921821793386324e-05, + "loss": 2.1261, + "step": 2898 + }, + { + "epoch": 0.5580768582910219, + "grad_norm": 2.8942544463366713, + "learning_rate": 1.6919571199819437e-05, + "loss": 2.1939, + "step": 2899 + }, + { + "epoch": 0.5582693649685974, + "grad_norm": 2.841089072911991, + "learning_rate": 1.69173199335817e-05, + "loss": 2.1738, + "step": 2900 + }, + { + "epoch": 0.5584618716461728, + "grad_norm": 2.8796405858181204, + "learning_rate": 1.6915067994891967e-05, + "loss": 2.1771, + "step": 2901 + }, + { + "epoch": 0.5586543783237481, + "grad_norm": 2.923301911642831, + "learning_rate": 1.6912815383969155e-05, + "loss": 2.1657, + "step": 2902 + }, + { + "epoch": 0.5588468850013235, + "grad_norm": 2.6916849586351534, + "learning_rate": 1.6910562101032253e-05, + "loss": 2.1127, + "step": 2903 + }, + { + "epoch": 0.5590393916788988, + "grad_norm": 2.6608379909050535, + "learning_rate": 1.6908308146300297e-05, + "loss": 2.1615, + "step": 2904 + }, + { + "epoch": 0.5590393916788988, + "lm_loss": 2.1971, + "step": 2904, + "vm_loss": 0.1599 + }, + { + "epoch": 0.5590393916788988, + "lm_loss": 2.5155, + "step": 2904, + "vm_loss": 0.1478 + }, + { + "epoch": 0.5590393916788988, + "lm_loss": 2.2244, + "step": 2904, + "vm_loss": 0.1963 + }, + { + "epoch": 0.5590393916788988, + "lm_loss": 1.8414, + "step": 2904, + "vm_loss": 0.1421 + }, + { + "epoch": 0.5590393916788988, + "lm_loss": 1.6498, + "step": 2904, + "vm_loss": 0.1827 + }, + { + "epoch": 0.5590393916788988, + "lm_loss": 2.0944, + "step": 2904, + "vm_loss": 0.1563 + }, + { + "epoch": 0.5590393916788988, + "lm_loss": 1.7682, + "step": 2904, + "vm_loss": 0.1787 + }, + { + "epoch": 0.5590393916788988, + "lm_loss": 1.3593, + "step": 2904, + "vm_loss": 0.1609 + }, + { + "epoch": 0.5592318983564742, + "grad_norm": 3.1174771459968276, + "learning_rate": 1.690605351999241e-05, + "loss": 2.1874, + "step": 2905 + }, + { + "epoch": 0.5594244050340497, + "grad_norm": 2.9656710821677392, + "learning_rate": 1.6903798222327765e-05, + "loss": 2.1838, + "step": 2906 + }, + { + "epoch": 0.559616911711625, + "grad_norm": 2.6594457304989936, + "learning_rate": 1.6901542253525607e-05, + "loss": 2.229, + "step": 2907 + }, + { + "epoch": 0.5598094183892004, + "grad_norm": 2.8047743270863275, + "learning_rate": 1.6899285613805246e-05, + "loss": 2.1749, + "step": 2908 + }, + { + "epoch": 0.5600019250667757, + "grad_norm": 2.9705526417826555, + "learning_rate": 1.6897028303386057e-05, + "loss": 2.1882, + "step": 2909 + }, + { + "epoch": 0.5601944317443511, + "grad_norm": 2.7318849990106604, + "learning_rate": 1.6894770322487474e-05, + "loss": 2.2005, + "step": 2910 + }, + { + "epoch": 0.5603869384219266, + "grad_norm": 2.5160188269679615, + "learning_rate": 1.6892511671329008e-05, + "loss": 2.1305, + "step": 2911 + }, + { + "epoch": 0.5605794450995019, + "grad_norm": 2.763951952753921, + "learning_rate": 1.689025235013023e-05, + "loss": 2.1323, + "step": 2912 + }, + { + "epoch": 0.5605794450995019, + "lm_loss": 2.2732, + "step": 2912, + "vm_loss": 0.128 + }, + { + "epoch": 0.5605794450995019, + "lm_loss": 2.1882, + "step": 2912, + "vm_loss": 0.2117 + }, + { + "epoch": 0.5605794450995019, + "lm_loss": 1.9254, + "step": 2912, + "vm_loss": 0.2294 + }, + { + "epoch": 0.5605794450995019, + "lm_loss": 1.8028, + "step": 2912, + "vm_loss": 0.1514 + }, + { + "epoch": 0.5605794450995019, + "lm_loss": 2.0102, + "step": 2912, + "vm_loss": 0.2088 + }, + { + "epoch": 0.5605794450995019, + "lm_loss": 1.9107, + "step": 2912, + "vm_loss": 0.198 + }, + { + "epoch": 0.5605794450995019, + "lm_loss": 2.2454, + "step": 2912, + "vm_loss": 0.2492 + }, + { + "epoch": 0.5605794450995019, + "lm_loss": 1.7294, + "step": 2912, + "vm_loss": 0.1483 + }, + { + "epoch": 0.5607719517770773, + "grad_norm": 3.019655143388101, + "learning_rate": 1.6887992359110765e-05, + "loss": 2.1836, + "step": 2913 + }, + { + "epoch": 0.5609644584546526, + "grad_norm": 2.892364744751732, + "learning_rate": 1.6885731698490322e-05, + "loss": 2.1564, + "step": 2914 + }, + { + "epoch": 0.561156965132228, + "grad_norm": 2.771983307646824, + "learning_rate": 1.6883470368488666e-05, + "loss": 2.1416, + "step": 2915 + }, + { + "epoch": 0.5613494718098034, + "grad_norm": 3.039943481023494, + "learning_rate": 1.6881208369325622e-05, + "loss": 2.1488, + "step": 2916 + }, + { + "epoch": 0.5615419784873787, + "grad_norm": 2.8725679981974985, + "learning_rate": 1.6878945701221092e-05, + "loss": 2.1768, + "step": 2917 + }, + { + "epoch": 0.5617344851649542, + "grad_norm": 3.00403542377559, + "learning_rate": 1.687668236439503e-05, + "loss": 2.1777, + "step": 2918 + }, + { + "epoch": 0.5619269918425296, + "grad_norm": 2.7069314019052078, + "learning_rate": 1.6874418359067463e-05, + "loss": 2.2706, + "step": 2919 + }, + { + "epoch": 0.5621194985201049, + "grad_norm": 2.991886744904629, + "learning_rate": 1.6872153685458484e-05, + "loss": 2.1828, + "step": 2920 + }, + { + "epoch": 0.5621194985201049, + "lm_loss": 2.0101, + "step": 2920, + "vm_loss": 0.1592 + }, + { + "epoch": 0.5621194985201049, + "lm_loss": 1.9062, + "step": 2920, + "vm_loss": 0.1441 + }, + { + "epoch": 0.5621194985201049, + "lm_loss": 2.3523, + "step": 2920, + "vm_loss": 0.1451 + }, + { + "epoch": 0.5621194985201049, + "lm_loss": 1.8348, + "step": 2920, + "vm_loss": 0.1246 + }, + { + "epoch": 0.5621194985201049, + "lm_loss": 2.1886, + "step": 2920, + "vm_loss": 0.1426 + }, + { + "epoch": 0.5621194985201049, + "lm_loss": 2.1179, + "step": 2920, + "vm_loss": 0.1532 + }, + { + "epoch": 0.5621194985201049, + "lm_loss": 2.0022, + "step": 2920, + "vm_loss": 0.2107 + }, + { + "epoch": 0.5621194985201049, + "lm_loss": 1.7119, + "step": 2920, + "vm_loss": 0.2178 + }, + { + "epoch": 0.5623120051976803, + "grad_norm": 2.78633237825189, + "learning_rate": 1.6869888343788244e-05, + "loss": 2.1791, + "step": 2921 + }, + { + "epoch": 0.5625045118752556, + "grad_norm": 2.953817775717228, + "learning_rate": 1.6867622334276966e-05, + "loss": 2.09, + "step": 2922 + }, + { + "epoch": 0.562697018552831, + "grad_norm": 2.90390007602645, + "learning_rate": 1.6865355657144935e-05, + "loss": 2.1343, + "step": 2923 + }, + { + "epoch": 0.5628895252304065, + "grad_norm": 2.678492383665957, + "learning_rate": 1.68630883126125e-05, + "loss": 2.1936, + "step": 2924 + }, + { + "epoch": 0.5630820319079818, + "grad_norm": 2.9692553787278317, + "learning_rate": 1.6860820300900077e-05, + "loss": 2.167, + "step": 2925 + }, + { + "epoch": 0.5632745385855572, + "grad_norm": 2.778151163224821, + "learning_rate": 1.6858551622228144e-05, + "loss": 2.0763, + "step": 2926 + }, + { + "epoch": 0.5634670452631325, + "grad_norm": 2.9616010830806627, + "learning_rate": 1.685628227681725e-05, + "loss": 2.1559, + "step": 2927 + }, + { + "epoch": 0.5636595519407079, + "grad_norm": 2.8145982251849286, + "learning_rate": 1.6854012264888e-05, + "loss": 2.1599, + "step": 2928 + }, + { + "epoch": 0.5636595519407079, + "lm_loss": 2.0825, + "step": 2928, + "vm_loss": 0.1476 + }, + { + "epoch": 0.5636595519407079, + "lm_loss": 2.1065, + "step": 2928, + "vm_loss": 0.1777 + }, + { + "epoch": 0.5636595519407079, + "lm_loss": 1.75, + "step": 2928, + "vm_loss": 0.1011 + }, + { + "epoch": 0.5636595519407079, + "lm_loss": 1.3023, + "step": 2928, + "vm_loss": 0.1645 + }, + { + "epoch": 0.5636595519407079, + "lm_loss": 2.2106, + "step": 2928, + "vm_loss": 0.1419 + }, + { + "epoch": 0.5636595519407079, + "lm_loss": 1.954, + "step": 2928, + "vm_loss": 0.211 + }, + { + "epoch": 0.5636595519407079, + "lm_loss": 1.8126, + "step": 2928, + "vm_loss": 0.2232 + }, + { + "epoch": 0.5636595519407079, + "lm_loss": 1.8773, + "step": 2928, + "vm_loss": 0.1562 + }, + { + "epoch": 0.5638520586182834, + "grad_norm": 2.940348634021967, + "learning_rate": 1.685174158666107e-05, + "loss": 2.2084, + "step": 2929 + }, + { + "epoch": 0.5640445652958587, + "grad_norm": 2.770211973360341, + "learning_rate": 1.6849470242357197e-05, + "loss": 2.1725, + "step": 2930 + }, + { + "epoch": 0.5642370719734341, + "grad_norm": 2.9379068487275615, + "learning_rate": 1.6847198232197184e-05, + "loss": 2.2058, + "step": 2931 + }, + { + "epoch": 0.5644295786510095, + "grad_norm": 3.0554261235661193, + "learning_rate": 1.6844925556401906e-05, + "loss": 2.1566, + "step": 2932 + }, + { + "epoch": 0.5646220853285848, + "grad_norm": 3.0900310723957425, + "learning_rate": 1.6842652215192287e-05, + "loss": 2.1493, + "step": 2933 + }, + { + "epoch": 0.5648145920061602, + "grad_norm": 3.157840594279967, + "learning_rate": 1.6840378208789334e-05, + "loss": 2.1677, + "step": 2934 + }, + { + "epoch": 0.5650070986837356, + "grad_norm": 2.9556818086969416, + "learning_rate": 1.6838103537414103e-05, + "loss": 2.1675, + "step": 2935 + }, + { + "epoch": 0.565199605361311, + "grad_norm": 2.7230589166538777, + "learning_rate": 1.6835828201287723e-05, + "loss": 2.13, + "step": 2936 + }, + { + "epoch": 0.565199605361311, + "lm_loss": 2.2277, + "step": 2936, + "vm_loss": 0.2264 + }, + { + "epoch": 0.565199605361311, + "lm_loss": 1.8808, + "step": 2936, + "vm_loss": 0.1357 + }, + { + "epoch": 0.565199605361311, + "lm_loss": 1.9247, + "step": 2936, + "vm_loss": 0.242 + }, + { + "epoch": 0.565199605361311, + "lm_loss": 1.9168, + "step": 2936, + "vm_loss": 0.2063 + }, + { + "epoch": 0.565199605361311, + "lm_loss": 1.9045, + "step": 2936, + "vm_loss": 0.1649 + }, + { + "epoch": 0.565199605361311, + "lm_loss": 2.1572, + "step": 2936, + "vm_loss": 0.218 + }, + { + "epoch": 0.565199605361311, + "lm_loss": 2.1727, + "step": 2936, + "vm_loss": 0.2185 + }, + { + "epoch": 0.565199605361311, + "lm_loss": 2.1261, + "step": 2936, + "vm_loss": 0.201 + }, + { + "epoch": 0.5653921120388864, + "grad_norm": 2.9386711762826883, + "learning_rate": 1.6833552200631388e-05, + "loss": 2.1348, + "step": 2937 + }, + { + "epoch": 0.5655846187164617, + "grad_norm": 2.920716062415379, + "learning_rate": 1.683127553566635e-05, + "loss": 2.1997, + "step": 2938 + }, + { + "epoch": 0.5657771253940371, + "grad_norm": 2.909017783735447, + "learning_rate": 1.6828998206613937e-05, + "loss": 2.2063, + "step": 2939 + }, + { + "epoch": 0.5659696320716124, + "grad_norm": 3.0186595681504036, + "learning_rate": 1.682672021369552e-05, + "loss": 2.1127, + "step": 2940 + }, + { + "epoch": 0.5661621387491879, + "grad_norm": 2.7402832013425633, + "learning_rate": 1.682444155713257e-05, + "loss": 2.1851, + "step": 2941 + }, + { + "epoch": 0.5663546454267633, + "grad_norm": 2.861908479368446, + "learning_rate": 1.6822162237146585e-05, + "loss": 2.1627, + "step": 2942 + }, + { + "epoch": 0.5665471521043386, + "grad_norm": 3.093710581956597, + "learning_rate": 1.681988225395915e-05, + "loss": 2.1038, + "step": 2943 + }, + { + "epoch": 0.566739658781914, + "grad_norm": 2.8830932876222426, + "learning_rate": 1.681760160779191e-05, + "loss": 2.159, + "step": 2944 + }, + { + "epoch": 0.566739658781914, + "lm_loss": 2.2174, + "step": 2944, + "vm_loss": 0.1399 + }, + { + "epoch": 0.566739658781914, + "lm_loss": 2.1765, + "step": 2944, + "vm_loss": 0.1313 + }, + { + "epoch": 0.566739658781914, + "lm_loss": 2.1171, + "step": 2944, + "vm_loss": 0.1522 + }, + { + "epoch": 0.566739658781914, + "lm_loss": 2.0809, + "step": 2944, + "vm_loss": 0.2162 + }, + { + "epoch": 0.566739658781914, + "lm_loss": 2.4125, + "step": 2944, + "vm_loss": 0.211 + }, + { + "epoch": 0.566739658781914, + "lm_loss": 1.9601, + "step": 2944, + "vm_loss": 0.186 + }, + { + "epoch": 0.566739658781914, + "lm_loss": 2.2961, + "step": 2944, + "vm_loss": 0.1514 + }, + { + "epoch": 0.566739658781914, + "lm_loss": 1.9124, + "step": 2944, + "vm_loss": 0.1755 + }, + { + "epoch": 0.5669321654594893, + "grad_norm": 2.956744303687958, + "learning_rate": 1.6815320298866566e-05, + "loss": 2.2233, + "step": 2945 + }, + { + "epoch": 0.5671246721370647, + "grad_norm": 2.804794520316079, + "learning_rate": 1.6813038327404903e-05, + "loss": 2.1572, + "step": 2946 + }, + { + "epoch": 0.5673171788146402, + "grad_norm": 2.716289944626574, + "learning_rate": 1.6810755693628747e-05, + "loss": 2.1972, + "step": 2947 + }, + { + "epoch": 0.5675096854922155, + "grad_norm": 2.8787525521346584, + "learning_rate": 1.680847239776e-05, + "loss": 2.1898, + "step": 2948 + }, + { + "epoch": 0.5677021921697909, + "grad_norm": 3.0569900637485343, + "learning_rate": 1.6806188440020635e-05, + "loss": 2.1302, + "step": 2949 + }, + { + "epoch": 0.5678946988473663, + "grad_norm": 2.9551687345044777, + "learning_rate": 1.6803903820632674e-05, + "loss": 2.1972, + "step": 2950 + }, + { + "epoch": 0.5680872055249416, + "grad_norm": 2.9781623382651516, + "learning_rate": 1.6801618539818215e-05, + "loss": 2.2128, + "step": 2951 + }, + { + "epoch": 0.568279712202517, + "grad_norm": 2.952464885218521, + "learning_rate": 1.6799332597799413e-05, + "loss": 2.204, + "step": 2952 + }, + { + "epoch": 0.568279712202517, + "lm_loss": 1.9899, + "step": 2952, + "vm_loss": 0.1288 + }, + { + "epoch": 0.568279712202517, + "lm_loss": 1.7825, + "step": 2952, + "vm_loss": 0.2561 + }, + { + "epoch": 0.568279712202517, + "lm_loss": 1.7164, + "step": 2952, + "vm_loss": 0.1979 + }, + { + "epoch": 0.568279712202517, + "lm_loss": 2.0708, + "step": 2952, + "vm_loss": 0.1743 + }, + { + "epoch": 0.568279712202517, + "lm_loss": 2.2554, + "step": 2952, + "vm_loss": 0.1022 + }, + { + "epoch": 0.568279712202517, + "lm_loss": 1.7811, + "step": 2952, + "vm_loss": 0.2239 + }, + { + "epoch": 0.568279712202517, + "lm_loss": 2.3576, + "step": 2952, + "vm_loss": 0.1529 + }, + { + "epoch": 0.568279712202517, + "lm_loss": 1.6213, + "step": 2952, + "vm_loss": 0.1645 + }, + { + "epoch": 0.5684722188800924, + "grad_norm": 3.111689000237369, + "learning_rate": 1.6797045994798497e-05, + "loss": 2.1154, + "step": 2953 + }, + { + "epoch": 0.5686647255576678, + "grad_norm": 2.9058315009195947, + "learning_rate": 1.679475873103775e-05, + "loss": 2.0712, + "step": 2954 + }, + { + "epoch": 0.5688572322352432, + "grad_norm": 2.5687230547297975, + "learning_rate": 1.6792470806739522e-05, + "loss": 2.1406, + "step": 2955 + }, + { + "epoch": 0.5690497389128185, + "grad_norm": 3.2409232169140587, + "learning_rate": 1.6790182222126237e-05, + "loss": 2.1877, + "step": 2956 + }, + { + "epoch": 0.5692422455903939, + "grad_norm": 2.798838308512494, + "learning_rate": 1.678789297742036e-05, + "loss": 2.1487, + "step": 2957 + }, + { + "epoch": 0.5694347522679692, + "grad_norm": 3.0632329001100884, + "learning_rate": 1.6785603072844446e-05, + "loss": 2.1333, + "step": 2958 + }, + { + "epoch": 0.5696272589455447, + "grad_norm": 2.857039426648582, + "learning_rate": 1.6783312508621105e-05, + "loss": 2.1017, + "step": 2959 + }, + { + "epoch": 0.5698197656231201, + "grad_norm": 3.066759338733449, + "learning_rate": 1.6781021284972998e-05, + "loss": 2.1359, + "step": 2960 + }, + { + "epoch": 0.5698197656231201, + "lm_loss": 1.8409, + "step": 2960, + "vm_loss": 0.1634 + }, + { + "epoch": 0.5698197656231201, + "lm_loss": 2.053, + "step": 2960, + "vm_loss": 0.2123 + }, + { + "epoch": 0.5698197656231201, + "lm_loss": 2.0793, + "step": 2960, + "vm_loss": 0.1628 + }, + { + "epoch": 0.5698197656231201, + "lm_loss": 1.9363, + "step": 2960, + "vm_loss": 0.1449 + }, + { + "epoch": 0.5698197656231201, + "lm_loss": 2.1015, + "step": 2960, + "vm_loss": 0.1383 + }, + { + "epoch": 0.5698197656231201, + "lm_loss": 1.9023, + "step": 2960, + "vm_loss": 0.174 + }, + { + "epoch": 0.5698197656231201, + "lm_loss": 2.4383, + "step": 2960, + "vm_loss": 0.1639 + }, + { + "epoch": 0.5698197656231201, + "lm_loss": 1.9681, + "step": 2960, + "vm_loss": 0.2164 + }, + { + "epoch": 0.5700122723006954, + "grad_norm": 2.909279826468173, + "learning_rate": 1.677872940212287e-05, + "loss": 2.1331, + "step": 2961 + }, + { + "epoch": 0.5702047789782708, + "grad_norm": 2.588324998584431, + "learning_rate": 1.677643686029352e-05, + "loss": 2.1342, + "step": 2962 + }, + { + "epoch": 0.5703972856558462, + "grad_norm": 2.808406413835792, + "learning_rate": 1.677414365970781e-05, + "loss": 2.162, + "step": 2963 + }, + { + "epoch": 0.5705897923334216, + "grad_norm": 3.0400637773965977, + "learning_rate": 1.6771849800588665e-05, + "loss": 2.2225, + "step": 2964 + }, + { + "epoch": 0.570782299010997, + "grad_norm": 3.140516871478318, + "learning_rate": 1.676955528315909e-05, + "loss": 2.1632, + "step": 2965 + }, + { + "epoch": 0.5709748056885723, + "grad_norm": 2.953478609795563, + "learning_rate": 1.6767260107642128e-05, + "loss": 2.1879, + "step": 2966 + }, + { + "epoch": 0.5711673123661477, + "grad_norm": 2.9332075927894175, + "learning_rate": 1.6764964274260908e-05, + "loss": 2.1155, + "step": 2967 + }, + { + "epoch": 0.5713598190437231, + "grad_norm": 2.847837865025105, + "learning_rate": 1.676266778323861e-05, + "loss": 2.1645, + "step": 2968 + }, + { + "epoch": 0.5713598190437231, + "lm_loss": 2.1508, + "step": 2968, + "vm_loss": 0.1546 + }, + { + "epoch": 0.5713598190437231, + "lm_loss": 2.1353, + "step": 2968, + "vm_loss": 0.1621 + }, + { + "epoch": 0.5713598190437231, + "lm_loss": 2.0621, + "step": 2968, + "vm_loss": 0.1383 + }, + { + "epoch": 0.5713598190437231, + "lm_loss": 1.7121, + "step": 2968, + "vm_loss": 0.1505 + }, + { + "epoch": 0.5713598190437231, + "lm_loss": 2.2223, + "step": 2968, + "vm_loss": 0.2088 + }, + { + "epoch": 0.5713598190437231, + "lm_loss": 2.2704, + "step": 2968, + "vm_loss": 0.1467 + }, + { + "epoch": 0.5713598190437231, + "lm_loss": 2.3551, + "step": 2968, + "vm_loss": 0.12 + }, + { + "epoch": 0.5713598190437231, + "lm_loss": 1.8017, + "step": 2968, + "vm_loss": 0.1973 + }, + { + "epoch": 0.5715523257212984, + "grad_norm": 2.9490681437606456, + "learning_rate": 1.6760370634798486e-05, + "loss": 2.1815, + "step": 2969 + }, + { + "epoch": 0.5717448323988739, + "grad_norm": 3.068313561260416, + "learning_rate": 1.6758072829163843e-05, + "loss": 2.1503, + "step": 2970 + }, + { + "epoch": 0.5719373390764492, + "grad_norm": 2.860764696646835, + "learning_rate": 1.6755774366558064e-05, + "loss": 2.1224, + "step": 2971 + }, + { + "epoch": 0.5721298457540246, + "grad_norm": 2.8943423145574476, + "learning_rate": 1.6753475247204577e-05, + "loss": 2.1219, + "step": 2972 + }, + { + "epoch": 0.5723223524316, + "grad_norm": 2.9453364746513673, + "learning_rate": 1.67511754713269e-05, + "loss": 2.1063, + "step": 2973 + }, + { + "epoch": 0.5725148591091753, + "grad_norm": 3.022413204512645, + "learning_rate": 1.6748875039148592e-05, + "loss": 2.2033, + "step": 2974 + }, + { + "epoch": 0.5727073657867507, + "grad_norm": 3.1293250943764863, + "learning_rate": 1.674657395089329e-05, + "loss": 2.1785, + "step": 2975 + }, + { + "epoch": 0.572899872464326, + "grad_norm": 2.7538465108666155, + "learning_rate": 1.6744272206784685e-05, + "loss": 2.1158, + "step": 2976 + }, + { + "epoch": 0.572899872464326, + "lm_loss": 1.6389, + "step": 2976, + "vm_loss": 0.148 + }, + { + "epoch": 0.572899872464326, + "lm_loss": 2.0219, + "step": 2976, + "vm_loss": 0.157 + }, + { + "epoch": 0.572899872464326, + "lm_loss": 1.8265, + "step": 2976, + "vm_loss": 0.1878 + }, + { + "epoch": 0.572899872464326, + "lm_loss": 1.739, + "step": 2976, + "vm_loss": 0.1706 + }, + { + "epoch": 0.572899872464326, + "lm_loss": 2.0564, + "step": 2976, + "vm_loss": 0.158 + }, + { + "epoch": 0.572899872464326, + "lm_loss": 1.9844, + "step": 2976, + "vm_loss": 0.0958 + }, + { + "epoch": 0.572899872464326, + "lm_loss": 2.5677, + "step": 2976, + "vm_loss": 0.2092 + }, + { + "epoch": 0.572899872464326, + "lm_loss": 1.9938, + "step": 2976, + "vm_loss": 0.1352 + }, + { + "epoch": 0.5730923791419015, + "grad_norm": 2.65350249387357, + "learning_rate": 1.6741969807046533e-05, + "loss": 2.1802, + "step": 2977 + }, + { + "epoch": 0.5732848858194769, + "grad_norm": 2.619306264450932, + "learning_rate": 1.673966675190267e-05, + "loss": 2.107, + "step": 2978 + }, + { + "epoch": 0.5734773924970522, + "grad_norm": 2.8808668419166326, + "learning_rate": 1.6737363041576963e-05, + "loss": 2.176, + "step": 2979 + }, + { + "epoch": 0.5736698991746276, + "grad_norm": 2.9735296681186147, + "learning_rate": 1.673505867629338e-05, + "loss": 2.1654, + "step": 2980 + }, + { + "epoch": 0.573862405852203, + "grad_norm": 2.603425908930411, + "learning_rate": 1.673275365627592e-05, + "loss": 2.1068, + "step": 2981 + }, + { + "epoch": 0.5740549125297784, + "grad_norm": 2.9192074879905476, + "learning_rate": 1.6730447981748672e-05, + "loss": 2.1754, + "step": 2982 + }, + { + "epoch": 0.5742474192073538, + "grad_norm": 2.7902590181730185, + "learning_rate": 1.6728141652935773e-05, + "loss": 2.1504, + "step": 2983 + }, + { + "epoch": 0.5744399258849291, + "grad_norm": 3.0914333151363387, + "learning_rate": 1.6725834670061427e-05, + "loss": 2.1831, + "step": 2984 + }, + { + "epoch": 0.5744399258849291, + "lm_loss": 1.9061, + "step": 2984, + "vm_loss": 0.1523 + }, + { + "epoch": 0.5744399258849291, + "lm_loss": 1.942, + "step": 2984, + "vm_loss": 0.1741 + }, + { + "epoch": 0.5744399258849291, + "lm_loss": 2.0901, + "step": 2984, + "vm_loss": 0.132 + }, + { + "epoch": 0.5744399258849291, + "lm_loss": 1.9405, + "step": 2984, + "vm_loss": 0.1958 + }, + { + "epoch": 0.5744399258849291, + "lm_loss": 2.0959, + "step": 2984, + "vm_loss": 0.1439 + }, + { + "epoch": 0.5744399258849291, + "lm_loss": 2.287, + "step": 2984, + "vm_loss": 0.244 + }, + { + "epoch": 0.5744399258849291, + "lm_loss": 2.0566, + "step": 2984, + "vm_loss": 0.1941 + }, + { + "epoch": 0.5744399258849291, + "lm_loss": 2.1861, + "step": 2984, + "vm_loss": 0.1351 + }, + { + "epoch": 0.5746324325625045, + "grad_norm": 2.946787245867567, + "learning_rate": 1.67235270333499e-05, + "loss": 2.1674, + "step": 2985 + }, + { + "epoch": 0.5748249392400799, + "grad_norm": 3.0645177447167526, + "learning_rate": 1.672121874302553e-05, + "loss": 2.1586, + "step": 2986 + }, + { + "epoch": 0.5750174459176552, + "grad_norm": 2.9410114812127044, + "learning_rate": 1.6718909799312705e-05, + "loss": 2.1419, + "step": 2987 + }, + { + "epoch": 0.5752099525952307, + "grad_norm": 4.221624039449748, + "learning_rate": 1.671660020243589e-05, + "loss": 2.1172, + "step": 2988 + }, + { + "epoch": 0.575402459272806, + "grad_norm": 3.193419327774961, + "learning_rate": 1.6714289952619602e-05, + "loss": 2.1837, + "step": 2989 + }, + { + "epoch": 0.5755949659503814, + "grad_norm": 2.9341434918566116, + "learning_rate": 1.6711979050088434e-05, + "loss": 2.1157, + "step": 2990 + }, + { + "epoch": 0.5757874726279568, + "grad_norm": 2.776906643403077, + "learning_rate": 1.6709667495067027e-05, + "loss": 2.1757, + "step": 2991 + }, + { + "epoch": 0.5759799793055321, + "grad_norm": 2.9222000246874096, + "learning_rate": 1.67073552877801e-05, + "loss": 2.0911, + "step": 2992 + }, + { + "epoch": 0.5759799793055321, + "lm_loss": 1.9962, + "step": 2992, + "vm_loss": 0.1549 + }, + { + "epoch": 0.5759799793055321, + "lm_loss": 1.7887, + "step": 2992, + "vm_loss": 0.2351 + }, + { + "epoch": 0.5759799793055321, + "lm_loss": 1.9677, + "step": 2992, + "vm_loss": 0.1794 + }, + { + "epoch": 0.5759799793055321, + "lm_loss": 2.0608, + "step": 2992, + "vm_loss": 0.1856 + }, + { + "epoch": 0.5759799793055321, + "lm_loss": 1.9229, + "step": 2992, + "vm_loss": 0.1467 + }, + { + "epoch": 0.5759799793055321, + "lm_loss": 1.9755, + "step": 2992, + "vm_loss": 0.222 + }, + { + "epoch": 0.5759799793055321, + "lm_loss": 1.7754, + "step": 2992, + "vm_loss": 0.1242 + }, + { + "epoch": 0.5759799793055321, + "lm_loss": 1.5932, + "step": 2992, + "vm_loss": 0.2292 + }, + { + "epoch": 0.5761724859831076, + "grad_norm": 3.077133222557676, + "learning_rate": 1.6705042428452428e-05, + "loss": 2.1646, + "step": 2993 + }, + { + "epoch": 0.5763649926606829, + "grad_norm": 3.196324918071841, + "learning_rate": 1.6702728917308847e-05, + "loss": 2.1238, + "step": 2994 + }, + { + "epoch": 0.5765574993382583, + "grad_norm": 2.674037565069713, + "learning_rate": 1.6700414754574265e-05, + "loss": 2.1051, + "step": 2995 + }, + { + "epoch": 0.5767500060158337, + "grad_norm": 2.88322051954885, + "learning_rate": 1.6698099940473644e-05, + "loss": 2.1877, + "step": 2996 + }, + { + "epoch": 0.576942512693409, + "grad_norm": 3.042383218322858, + "learning_rate": 1.6695784475232012e-05, + "loss": 2.0791, + "step": 2997 + }, + { + "epoch": 0.5771350193709844, + "grad_norm": 3.012044312350685, + "learning_rate": 1.669346835907447e-05, + "loss": 2.1168, + "step": 2998 + }, + { + "epoch": 0.5773275260485599, + "grad_norm": 2.9135181808340525, + "learning_rate": 1.6691151592226168e-05, + "loss": 2.2086, + "step": 2999 + }, + { + "epoch": 0.5775200327261352, + "grad_norm": 3.0163349891080897, + "learning_rate": 1.6688834174912326e-05, + "loss": 2.1839, + "step": 3000 + }, + { + "epoch": 0.5775200327261352, + "lm_loss": 1.8148, + "step": 3000, + "vm_loss": 0.2075 + }, + { + "epoch": 0.5775200327261352, + "lm_loss": 2.0436, + "step": 3000, + "vm_loss": 0.1971 + }, + { + "epoch": 0.5775200327261352, + "lm_loss": 1.7701, + "step": 3000, + "vm_loss": 0.2007 + }, + { + "epoch": 0.5775200327261352, + "lm_loss": 1.7789, + "step": 3000, + "vm_loss": 0.1543 + }, + { + "epoch": 0.5775200327261352, + "lm_loss": 2.4492, + "step": 3000, + "vm_loss": 0.1039 + }, + { + "epoch": 0.5775200327261352, + "lm_loss": 2.2197, + "step": 3000, + "vm_loss": 0.1712 + }, + { + "epoch": 0.5775200327261352, + "lm_loss": 2.2634, + "step": 3000, + "vm_loss": 0.1547 + }, + { + "epoch": 0.5775200327261352, + "lm_loss": 2.1622, + "step": 3000, + "vm_loss": 0.2332 + }, + { + "epoch": 0.5777125394037106, + "grad_norm": 2.618486873029357, + "learning_rate": 1.6686516107358225e-05, + "loss": 2.1332, + "step": 3001 + }, + { + "epoch": 0.5779050460812859, + "grad_norm": 2.7642437850936843, + "learning_rate": 1.6684197389789214e-05, + "loss": 2.1207, + "step": 3002 + }, + { + "epoch": 0.5780975527588613, + "grad_norm": 2.841408811008923, + "learning_rate": 1.66818780224307e-05, + "loss": 2.2031, + "step": 3003 + }, + { + "epoch": 0.5782900594364367, + "grad_norm": 3.198645434761591, + "learning_rate": 1.6679558005508157e-05, + "loss": 2.1408, + "step": 3004 + }, + { + "epoch": 0.578482566114012, + "grad_norm": 2.7723782081090635, + "learning_rate": 1.667723733924712e-05, + "loss": 2.1571, + "step": 3005 + }, + { + "epoch": 0.5786750727915875, + "grad_norm": 2.853256361346693, + "learning_rate": 1.6674916023873186e-05, + "loss": 2.155, + "step": 3006 + }, + { + "epoch": 0.5788675794691628, + "grad_norm": 2.9502615222138635, + "learning_rate": 1.6672594059612018e-05, + "loss": 2.1177, + "step": 3007 + }, + { + "epoch": 0.5790600861467382, + "grad_norm": 3.1401898369039767, + "learning_rate": 1.667027144668934e-05, + "loss": 2.1398, + "step": 3008 + }, + { + "epoch": 0.5790600861467382, + "lm_loss": 1.7484, + "step": 3008, + "vm_loss": 0.1472 + }, + { + "epoch": 0.5790600861467382, + "lm_loss": 2.686, + "step": 3008, + "vm_loss": 0.1472 + }, + { + "epoch": 0.5790600861467382, + "lm_loss": 2.0838, + "step": 3008, + "vm_loss": 0.1685 + }, + { + "epoch": 0.5790600861467382, + "lm_loss": 1.9938, + "step": 3008, + "vm_loss": 0.1258 + }, + { + "epoch": 0.5790600861467382, + "lm_loss": 1.6312, + "step": 3008, + "vm_loss": 0.1622 + }, + { + "epoch": 0.5790600861467382, + "lm_loss": 2.0108, + "step": 3008, + "vm_loss": 0.1653 + }, + { + "epoch": 0.5790600861467382, + "lm_loss": 2.2867, + "step": 3008, + "vm_loss": 0.184 + }, + { + "epoch": 0.5790600861467382, + "lm_loss": 2.1133, + "step": 3008, + "vm_loss": 0.1684 + }, + { + "epoch": 0.5792525928243136, + "grad_norm": 3.137967040854821, + "learning_rate": 1.666794818533094e-05, + "loss": 2.1429, + "step": 3009 + }, + { + "epoch": 0.5794450995018889, + "grad_norm": 2.6415181296743824, + "learning_rate": 1.6665624275762663e-05, + "loss": 2.1185, + "step": 3010 + }, + { + "epoch": 0.5796376061794644, + "grad_norm": 2.8699039466992473, + "learning_rate": 1.6663299718210435e-05, + "loss": 2.1734, + "step": 3011 + }, + { + "epoch": 0.5798301128570398, + "grad_norm": 3.009270528154217, + "learning_rate": 1.6660974512900226e-05, + "loss": 2.1735, + "step": 3012 + }, + { + "epoch": 0.5800226195346151, + "grad_norm": 2.710147199758013, + "learning_rate": 1.6658648660058073e-05, + "loss": 2.1425, + "step": 3013 + }, + { + "epoch": 0.5802151262121905, + "grad_norm": 2.9133236226274435, + "learning_rate": 1.665632215991008e-05, + "loss": 2.1326, + "step": 3014 + }, + { + "epoch": 0.5804076328897658, + "grad_norm": 2.8180354871077666, + "learning_rate": 1.6653995012682414e-05, + "loss": 2.1402, + "step": 3015 + }, + { + "epoch": 0.5806001395673412, + "grad_norm": 2.7642663465213557, + "learning_rate": 1.6651667218601306e-05, + "loss": 2.1586, + "step": 3016 + }, + { + "epoch": 0.5806001395673412, + "lm_loss": 1.9077, + "step": 3016, + "vm_loss": 0.1775 + }, + { + "epoch": 0.5806001395673412, + "lm_loss": 1.786, + "step": 3016, + "vm_loss": 0.2205 + }, + { + "epoch": 0.5806001395673412, + "lm_loss": 1.523, + "step": 3016, + "vm_loss": 0.1128 + }, + { + "epoch": 0.5806001395673412, + "lm_loss": 2.0013, + "step": 3016, + "vm_loss": 0.1889 + }, + { + "epoch": 0.5806001395673412, + "lm_loss": 1.9704, + "step": 3016, + "vm_loss": 0.2197 + }, + { + "epoch": 0.5806001395673412, + "lm_loss": 1.8691, + "step": 3016, + "vm_loss": 0.1074 + }, + { + "epoch": 0.5806001395673412, + "lm_loss": 2.0971, + "step": 3016, + "vm_loss": 0.1419 + }, + { + "epoch": 0.5806001395673412, + "lm_loss": 1.9733, + "step": 3016, + "vm_loss": 0.1716 + }, + { + "epoch": 0.5807926462449167, + "grad_norm": 3.371445079678996, + "learning_rate": 1.6649338777893043e-05, + "loss": 2.1184, + "step": 3017 + }, + { + "epoch": 0.580985152922492, + "grad_norm": 2.7310585057137735, + "learning_rate": 1.664700969078398e-05, + "loss": 2.1046, + "step": 3018 + }, + { + "epoch": 0.5811776596000674, + "grad_norm": 2.6919097245860715, + "learning_rate": 1.6644679957500537e-05, + "loss": 2.1296, + "step": 3019 + }, + { + "epoch": 0.5813701662776427, + "grad_norm": 2.730289319124603, + "learning_rate": 1.6642349578269192e-05, + "loss": 2.1523, + "step": 3020 + }, + { + "epoch": 0.5815626729552181, + "grad_norm": 2.745369352029728, + "learning_rate": 1.6640018553316486e-05, + "loss": 2.1834, + "step": 3021 + }, + { + "epoch": 0.5817551796327936, + "grad_norm": 2.6476482993923125, + "learning_rate": 1.6637686882869027e-05, + "loss": 2.1435, + "step": 3022 + }, + { + "epoch": 0.5819476863103689, + "grad_norm": 2.913432247629352, + "learning_rate": 1.6635354567153482e-05, + "loss": 2.1485, + "step": 3023 + }, + { + "epoch": 0.5821401929879443, + "grad_norm": 3.0774270333031573, + "learning_rate": 1.663302160639658e-05, + "loss": 2.091, + "step": 3024 + }, + { + "epoch": 0.5821401929879443, + "lm_loss": 1.9161, + "step": 3024, + "vm_loss": 0.129 + }, + { + "epoch": 0.5821401929879443, + "lm_loss": 2.298, + "step": 3024, + "vm_loss": 0.1591 + }, + { + "epoch": 0.5821401929879443, + "lm_loss": 2.1439, + "step": 3024, + "vm_loss": 0.1816 + }, + { + "epoch": 0.5821401929879443, + "lm_loss": 1.6152, + "step": 3024, + "vm_loss": 0.1597 + }, + { + "epoch": 0.5821401929879443, + "lm_loss": 1.9405, + "step": 3024, + "vm_loss": 0.1325 + }, + { + "epoch": 0.5821401929879443, + "lm_loss": 2.1522, + "step": 3024, + "vm_loss": 0.1028 + }, + { + "epoch": 0.5821401929879443, + "lm_loss": 2.1761, + "step": 3024, + "vm_loss": 0.1147 + }, + { + "epoch": 0.5821401929879443, + "lm_loss": 1.6991, + "step": 3024, + "vm_loss": 0.1303 + }, + { + "epoch": 0.5823326996655196, + "grad_norm": 2.748384072711639, + "learning_rate": 1.663068800082512e-05, + "loss": 2.0845, + "step": 3025 + }, + { + "epoch": 0.582525206343095, + "grad_norm": 3.312325788378188, + "learning_rate": 1.6628353750665954e-05, + "loss": 2.0968, + "step": 3026 + }, + { + "epoch": 0.5827177130206704, + "grad_norm": 3.1289334873310395, + "learning_rate": 1.6626018856146e-05, + "loss": 2.1256, + "step": 3027 + }, + { + "epoch": 0.5829102196982457, + "grad_norm": 2.7340214554749327, + "learning_rate": 1.6623683317492247e-05, + "loss": 2.1634, + "step": 3028 + }, + { + "epoch": 0.5831027263758212, + "grad_norm": 3.048497445691234, + "learning_rate": 1.662134713493173e-05, + "loss": 2.1602, + "step": 3029 + }, + { + "epoch": 0.5832952330533966, + "grad_norm": 3.167323346776916, + "learning_rate": 1.6619010308691558e-05, + "loss": 2.1507, + "step": 3030 + }, + { + "epoch": 0.5834877397309719, + "grad_norm": 2.7814775701561465, + "learning_rate": 1.6616672838998904e-05, + "loss": 2.2123, + "step": 3031 + }, + { + "epoch": 0.5836802464085473, + "grad_norm": 2.773021395966257, + "learning_rate": 1.6614334726080997e-05, + "loss": 2.1371, + "step": 3032 + }, + { + "epoch": 0.5836802464085473, + "lm_loss": 1.6373, + "step": 3032, + "vm_loss": 0.2306 + }, + { + "epoch": 0.5836802464085473, + "lm_loss": 1.6518, + "step": 3032, + "vm_loss": 0.2021 + }, + { + "epoch": 0.5836802464085473, + "lm_loss": 2.0603, + "step": 3032, + "vm_loss": 0.1861 + }, + { + "epoch": 0.5836802464085473, + "lm_loss": 2.199, + "step": 3032, + "vm_loss": 0.204 + }, + { + "epoch": 0.5836802464085473, + "lm_loss": 2.1185, + "step": 3032, + "vm_loss": 0.1373 + }, + { + "epoch": 0.5836802464085473, + "lm_loss": 1.6362, + "step": 3032, + "vm_loss": 0.2011 + }, + { + "epoch": 0.5836802464085473, + "lm_loss": 2.0823, + "step": 3032, + "vm_loss": 0.1566 + }, + { + "epoch": 0.5836802464085473, + "lm_loss": 1.6695, + "step": 3032, + "vm_loss": 0.1581 + }, + { + "epoch": 0.5838727530861226, + "grad_norm": 3.151178804919212, + "learning_rate": 1.6611995970165137e-05, + "loss": 2.1769, + "step": 3033 + }, + { + "epoch": 0.584065259763698, + "grad_norm": 2.7766689792106063, + "learning_rate": 1.6609656571478673e-05, + "loss": 2.1652, + "step": 3034 + }, + { + "epoch": 0.5842577664412735, + "grad_norm": 2.993634674309778, + "learning_rate": 1.660731653024903e-05, + "loss": 2.112, + "step": 3035 + }, + { + "epoch": 0.5844502731188488, + "grad_norm": 2.923656336549419, + "learning_rate": 1.6604975846703685e-05, + "loss": 2.1523, + "step": 3036 + }, + { + "epoch": 0.5846427797964242, + "grad_norm": 3.101385612578588, + "learning_rate": 1.6602634521070185e-05, + "loss": 2.164, + "step": 3037 + }, + { + "epoch": 0.5848352864739995, + "grad_norm": 2.873618086647502, + "learning_rate": 1.6600292553576142e-05, + "loss": 2.1445, + "step": 3038 + }, + { + "epoch": 0.5850277931515749, + "grad_norm": 3.2203575151247286, + "learning_rate": 1.6597949944449214e-05, + "loss": 2.0717, + "step": 3039 + }, + { + "epoch": 0.5852202998291504, + "grad_norm": 2.7598521391898214, + "learning_rate": 1.659560669391714e-05, + "loss": 2.1171, + "step": 3040 + }, + { + "epoch": 0.5852202998291504, + "lm_loss": 1.9191, + "step": 3040, + "vm_loss": 0.1817 + }, + { + "epoch": 0.5852202998291504, + "lm_loss": 2.2079, + "step": 3040, + "vm_loss": 0.0775 + }, + { + "epoch": 0.5852202998291504, + "lm_loss": 1.8304, + "step": 3040, + "vm_loss": 0.1661 + }, + { + "epoch": 0.5852202998291504, + "lm_loss": 2.173, + "step": 3040, + "vm_loss": 0.197 + }, + { + "epoch": 0.5852202998291504, + "lm_loss": 2.1856, + "step": 3040, + "vm_loss": 0.1894 + }, + { + "epoch": 0.5852202998291504, + "lm_loss": 2.1718, + "step": 3040, + "vm_loss": 0.2608 + }, + { + "epoch": 0.5852202998291504, + "lm_loss": 1.809, + "step": 3040, + "vm_loss": 0.1305 + }, + { + "epoch": 0.5852202998291504, + "lm_loss": 1.8568, + "step": 3040, + "vm_loss": 0.1643 + }, + { + "epoch": 0.5854128065067257, + "grad_norm": 2.83765343751536, + "learning_rate": 1.6593262802207718e-05, + "loss": 2.1945, + "step": 3041 + }, + { + "epoch": 0.5856053131843011, + "grad_norm": 3.210067274546048, + "learning_rate": 1.6590918269548798e-05, + "loss": 2.1865, + "step": 3042 + }, + { + "epoch": 0.5857978198618765, + "grad_norm": 3.0551239908079775, + "learning_rate": 1.6588573096168298e-05, + "loss": 2.1211, + "step": 3043 + }, + { + "epoch": 0.5859903265394518, + "grad_norm": 2.9643746658660515, + "learning_rate": 1.65862272822942e-05, + "loss": 2.1489, + "step": 3044 + }, + { + "epoch": 0.5861828332170272, + "grad_norm": 3.0571060599250917, + "learning_rate": 1.6583880828154545e-05, + "loss": 2.1398, + "step": 3045 + }, + { + "epoch": 0.5863753398946026, + "grad_norm": 2.8324813461231853, + "learning_rate": 1.6581533733977448e-05, + "loss": 2.1551, + "step": 3046 + }, + { + "epoch": 0.586567846572178, + "grad_norm": 2.6363970533204686, + "learning_rate": 1.6579185999991066e-05, + "loss": 2.0789, + "step": 3047 + }, + { + "epoch": 0.5867603532497534, + "grad_norm": 3.1069228929429467, + "learning_rate": 1.657683762642363e-05, + "loss": 2.1731, + "step": 3048 + }, + { + "epoch": 0.5867603532497534, + "lm_loss": 1.9754, + "step": 3048, + "vm_loss": 0.1805 + }, + { + "epoch": 0.5867603532497534, + "lm_loss": 1.7068, + "step": 3048, + "vm_loss": 0.1165 + }, + { + "epoch": 0.5867603532497534, + "lm_loss": 2.2438, + "step": 3048, + "vm_loss": 0.1529 + }, + { + "epoch": 0.5867603532497534, + "lm_loss": 2.1466, + "step": 3048, + "vm_loss": 0.2059 + }, + { + "epoch": 0.5867603532497534, + "lm_loss": 1.9538, + "step": 3048, + "vm_loss": 0.1711 + }, + { + "epoch": 0.5867603532497534, + "lm_loss": 1.7407, + "step": 3048, + "vm_loss": 0.1459 + }, + { + "epoch": 0.5867603532497534, + "lm_loss": 1.7298, + "step": 3048, + "vm_loss": 0.1137 + }, + { + "epoch": 0.5867603532497534, + "lm_loss": 1.5824, + "step": 3048, + "vm_loss": 0.1688 + }, + { + "epoch": 0.5869528599273287, + "grad_norm": 3.0057477794377547, + "learning_rate": 1.657448861350344e-05, + "loss": 2.1495, + "step": 3049 + }, + { + "epoch": 0.5871453666049041, + "grad_norm": 2.5440215261832875, + "learning_rate": 1.6572138961458842e-05, + "loss": 2.0865, + "step": 3050 + }, + { + "epoch": 0.5873378732824794, + "grad_norm": 2.9842927925728486, + "learning_rate": 1.6569788670518253e-05, + "loss": 2.093, + "step": 3051 + }, + { + "epoch": 0.5875303799600549, + "grad_norm": 3.0767755972023614, + "learning_rate": 1.6567437740910156e-05, + "loss": 2.1278, + "step": 3052 + }, + { + "epoch": 0.5877228866376303, + "grad_norm": 2.9736798317030186, + "learning_rate": 1.6565086172863084e-05, + "loss": 2.0497, + "step": 3053 + }, + { + "epoch": 0.5879153933152056, + "grad_norm": 3.118449357781708, + "learning_rate": 1.6562733966605648e-05, + "loss": 2.1532, + "step": 3054 + }, + { + "epoch": 0.588107899992781, + "grad_norm": 3.107290224926494, + "learning_rate": 1.656038112236651e-05, + "loss": 2.0866, + "step": 3055 + }, + { + "epoch": 0.5883004066703563, + "grad_norm": 2.979031705867233, + "learning_rate": 1.655802764037439e-05, + "loss": 2.0899, + "step": 3056 + }, + { + "epoch": 0.5883004066703563, + "lm_loss": 2.0332, + "step": 3056, + "vm_loss": 0.1528 + }, + { + "epoch": 0.5883004066703563, + "lm_loss": 1.6015, + "step": 3056, + "vm_loss": 0.1481 + }, + { + "epoch": 0.5883004066703563, + "lm_loss": 1.8559, + "step": 3056, + "vm_loss": 0.1203 + }, + { + "epoch": 0.5883004066703563, + "lm_loss": 1.7194, + "step": 3056, + "vm_loss": 0.1933 + }, + { + "epoch": 0.5883004066703563, + "lm_loss": 1.4751, + "step": 3056, + "vm_loss": 0.1758 + }, + { + "epoch": 0.5883004066703563, + "lm_loss": 1.4304, + "step": 3056, + "vm_loss": 0.192 + }, + { + "epoch": 0.5883004066703563, + "lm_loss": 2.4113, + "step": 3056, + "vm_loss": 0.129 + }, + { + "epoch": 0.5883004066703563, + "lm_loss": 2.0223, + "step": 3056, + "vm_loss": 0.1601 + }, + { + "epoch": 0.5884929133479317, + "grad_norm": 2.756892024960601, + "learning_rate": 1.6555673520858084e-05, + "loss": 2.101, + "step": 3057 + }, + { + "epoch": 0.5886854200255072, + "grad_norm": 2.7666622277764503, + "learning_rate": 1.655331876404644e-05, + "loss": 2.0925, + "step": 3058 + }, + { + "epoch": 0.5888779267030825, + "grad_norm": 2.9767820145843316, + "learning_rate": 1.655096337016837e-05, + "loss": 2.1422, + "step": 3059 + }, + { + "epoch": 0.5890704333806579, + "grad_norm": 3.016400898066739, + "learning_rate": 1.6548607339452853e-05, + "loss": 2.1109, + "step": 3060 + }, + { + "epoch": 0.5892629400582333, + "grad_norm": 2.8856736593884977, + "learning_rate": 1.6546250672128915e-05, + "loss": 2.0473, + "step": 3061 + }, + { + "epoch": 0.5894554467358086, + "grad_norm": 2.809008754399697, + "learning_rate": 1.6543893368425664e-05, + "loss": 2.0572, + "step": 3062 + }, + { + "epoch": 0.589647953413384, + "grad_norm": 2.815648995782425, + "learning_rate": 1.654153542857226e-05, + "loss": 2.1103, + "step": 3063 + }, + { + "epoch": 0.5898404600909594, + "grad_norm": 2.81728503708321, + "learning_rate": 1.653917685279792e-05, + "loss": 2.1614, + "step": 3064 + }, + { + "epoch": 0.5898404600909594, + "lm_loss": 1.4508, + "step": 3064, + "vm_loss": 0.2674 + }, + { + "epoch": 0.5898404600909594, + "lm_loss": 2.4672, + "step": 3064, + "vm_loss": 0.1706 + }, + { + "epoch": 0.5898404600909594, + "lm_loss": 1.9315, + "step": 3064, + "vm_loss": 0.1596 + }, + { + "epoch": 0.5898404600909594, + "lm_loss": 1.5932, + "step": 3064, + "vm_loss": 0.2002 + }, + { + "epoch": 0.5898404600909594, + "lm_loss": 1.7128, + "step": 3064, + "vm_loss": 0.1885 + }, + { + "epoch": 0.5898404600909594, + "lm_loss": 2.3497, + "step": 3064, + "vm_loss": 0.1292 + }, + { + "epoch": 0.5898404600909594, + "lm_loss": 2.122, + "step": 3064, + "vm_loss": 0.1788 + }, + { + "epoch": 0.5898404600909594, + "lm_loss": 1.7593, + "step": 3064, + "vm_loss": 0.1801 + }, + { + "epoch": 0.5900329667685348, + "grad_norm": 2.819667052656945, + "learning_rate": 1.6536817641331928e-05, + "loss": 2.0813, + "step": 3065 + }, + { + "epoch": 0.5902254734461102, + "grad_norm": 2.786566063637792, + "learning_rate": 1.653445779440363e-05, + "loss": 2.1434, + "step": 3066 + }, + { + "epoch": 0.5904179801236855, + "grad_norm": 2.7336344675602042, + "learning_rate": 1.653209731224244e-05, + "loss": 2.1545, + "step": 3067 + }, + { + "epoch": 0.5906104868012609, + "grad_norm": 2.82808881389369, + "learning_rate": 1.6529736195077814e-05, + "loss": 2.1404, + "step": 3068 + }, + { + "epoch": 0.5908029934788362, + "grad_norm": 2.985314118518069, + "learning_rate": 1.652737444313929e-05, + "loss": 2.1506, + "step": 3069 + }, + { + "epoch": 0.5909955001564117, + "grad_norm": 2.6801192766996844, + "learning_rate": 1.6525012056656463e-05, + "loss": 2.1268, + "step": 3070 + }, + { + "epoch": 0.5911880068339871, + "grad_norm": 2.847983360219452, + "learning_rate": 1.6522649035858988e-05, + "loss": 2.1326, + "step": 3071 + }, + { + "epoch": 0.5913805135115624, + "grad_norm": 2.8784764803182115, + "learning_rate": 1.6520285380976576e-05, + "loss": 2.147, + "step": 3072 + }, + { + "epoch": 0.5913805135115624, + "lm_loss": 2.1412, + "step": 3072, + "vm_loss": 0.1497 + }, + { + "epoch": 0.5913805135115624, + "lm_loss": 1.5153, + "step": 3072, + "vm_loss": 0.1517 + }, + { + "epoch": 0.5913805135115624, + "lm_loss": 1.7926, + "step": 3072, + "vm_loss": 0.2407 + }, + { + "epoch": 0.5913805135115624, + "lm_loss": 1.969, + "step": 3072, + "vm_loss": 0.1824 + }, + { + "epoch": 0.5913805135115624, + "lm_loss": 2.1109, + "step": 3072, + "vm_loss": 0.2102 + }, + { + "epoch": 0.5913805135115624, + "lm_loss": 2.158, + "step": 3072, + "vm_loss": 0.1194 + }, + { + "epoch": 0.5913805135115624, + "lm_loss": 1.6353, + "step": 3072, + "vm_loss": 0.1834 + }, + { + "epoch": 0.5913805135115624, + "lm_loss": 2.1399, + "step": 3072, + "vm_loss": 0.1858 + }, + { + "epoch": 0.5915730201891378, + "grad_norm": 2.7136733115355316, + "learning_rate": 1.6517921092239002e-05, + "loss": 2.1694, + "step": 3073 + }, + { + "epoch": 0.5917655268667132, + "grad_norm": 2.846406856509497, + "learning_rate": 1.6515556169876115e-05, + "loss": 2.1305, + "step": 3074 + }, + { + "epoch": 0.5919580335442886, + "grad_norm": 2.7493678118332023, + "learning_rate": 1.6513190614117805e-05, + "loss": 2.1621, + "step": 3075 + }, + { + "epoch": 0.592150540221864, + "grad_norm": 2.8434055330118486, + "learning_rate": 1.6510824425194043e-05, + "loss": 2.1271, + "step": 3076 + }, + { + "epoch": 0.5923430468994393, + "grad_norm": 3.08171570849402, + "learning_rate": 1.6508457603334848e-05, + "loss": 2.1416, + "step": 3077 + }, + { + "epoch": 0.5925355535770147, + "grad_norm": 2.8191036195925223, + "learning_rate": 1.6506090148770306e-05, + "loss": 2.155, + "step": 3078 + }, + { + "epoch": 0.5927280602545901, + "grad_norm": 2.777495879092757, + "learning_rate": 1.6503722061730565e-05, + "loss": 2.0914, + "step": 3079 + }, + { + "epoch": 0.5929205669321654, + "grad_norm": 2.76847626487161, + "learning_rate": 1.650135334244583e-05, + "loss": 2.156, + "step": 3080 + }, + { + "epoch": 0.5929205669321654, + "lm_loss": 1.6757, + "step": 3080, + "vm_loss": 0.1609 + }, + { + "epoch": 0.5929205669321654, + "lm_loss": 2.1201, + "step": 3080, + "vm_loss": 0.1541 + }, + { + "epoch": 0.5929205669321654, + "lm_loss": 1.7977, + "step": 3080, + "vm_loss": 0.1517 + }, + { + "epoch": 0.5929205669321654, + "lm_loss": 1.7977, + "step": 3080, + "vm_loss": 0.1093 + }, + { + "epoch": 0.5929205669321654, + "lm_loss": 2.2029, + "step": 3080, + "vm_loss": 0.1664 + }, + { + "epoch": 0.5929205669321654, + "lm_loss": 2.2853, + "step": 3080, + "vm_loss": 0.1967 + }, + { + "epoch": 0.5929205669321654, + "lm_loss": 2.039, + "step": 3080, + "vm_loss": 0.1703 + }, + { + "epoch": 0.5929205669321654, + "lm_loss": 1.8213, + "step": 3080, + "vm_loss": 0.1981 + }, + { + "epoch": 0.5931130736097409, + "grad_norm": 2.7624164999638565, + "learning_rate": 1.6498983991146377e-05, + "loss": 2.0748, + "step": 3081 + }, + { + "epoch": 0.5933055802873162, + "grad_norm": 2.7991924711373293, + "learning_rate": 1.6496614008062533e-05, + "loss": 2.0823, + "step": 3082 + }, + { + "epoch": 0.5934980869648916, + "grad_norm": 3.0474521586445165, + "learning_rate": 1.6494243393424693e-05, + "loss": 2.1214, + "step": 3083 + }, + { + "epoch": 0.593690593642467, + "grad_norm": 3.251477309664315, + "learning_rate": 1.6491872147463307e-05, + "loss": 2.0788, + "step": 3084 + }, + { + "epoch": 0.5938831003200423, + "grad_norm": 2.9523167013991145, + "learning_rate": 1.6489500270408894e-05, + "loss": 2.1514, + "step": 3085 + }, + { + "epoch": 0.5940756069976177, + "grad_norm": 3.1621505174689855, + "learning_rate": 1.648712776249203e-05, + "loss": 2.1231, + "step": 3086 + }, + { + "epoch": 0.594268113675193, + "grad_norm": 2.7068999145353048, + "learning_rate": 1.648475462394335e-05, + "loss": 2.07, + "step": 3087 + }, + { + "epoch": 0.5944606203527685, + "grad_norm": 2.7609146520409897, + "learning_rate": 1.6482380854993558e-05, + "loss": 2.1109, + "step": 3088 + }, + { + "epoch": 0.5944606203527685, + "lm_loss": 2.3017, + "step": 3088, + "vm_loss": 0.1712 + }, + { + "epoch": 0.5944606203527685, + "lm_loss": 1.8878, + "step": 3088, + "vm_loss": 0.1683 + }, + { + "epoch": 0.5944606203527685, + "lm_loss": 1.7726, + "step": 3088, + "vm_loss": 0.149 + }, + { + "epoch": 0.5944606203527685, + "lm_loss": 1.7092, + "step": 3088, + "vm_loss": 0.1922 + }, + { + "epoch": 0.5944606203527685, + "lm_loss": 1.9233, + "step": 3088, + "vm_loss": 0.1396 + }, + { + "epoch": 0.5944606203527685, + "lm_loss": 1.9601, + "step": 3088, + "vm_loss": 0.1586 + }, + { + "epoch": 0.5944606203527685, + "lm_loss": 2.1384, + "step": 3088, + "vm_loss": 0.164 + }, + { + "epoch": 0.5944606203527685, + "lm_loss": 2.1962, + "step": 3088, + "vm_loss": 0.1468 + }, + { + "epoch": 0.5946531270303439, + "grad_norm": 3.006288736919539, + "learning_rate": 1.6480006455873415e-05, + "loss": 2.1321, + "step": 3089 + }, + { + "epoch": 0.5948456337079192, + "grad_norm": 3.0979170778252105, + "learning_rate": 1.6477631426813737e-05, + "loss": 2.0971, + "step": 3090 + }, + { + "epoch": 0.5950381403854946, + "grad_norm": 3.1360765057743705, + "learning_rate": 1.6475255768045415e-05, + "loss": 2.1783, + "step": 3091 + }, + { + "epoch": 0.59523064706307, + "grad_norm": 2.9230329805705573, + "learning_rate": 1.647287947979939e-05, + "loss": 2.0912, + "step": 3092 + }, + { + "epoch": 0.5954231537406454, + "grad_norm": 2.927314575830423, + "learning_rate": 1.6470502562306664e-05, + "loss": 2.1638, + "step": 3093 + }, + { + "epoch": 0.5956156604182208, + "grad_norm": 3.1089013455394894, + "learning_rate": 1.6468125015798307e-05, + "loss": 2.1538, + "step": 3094 + }, + { + "epoch": 0.5958081670957961, + "grad_norm": 2.9433711613829057, + "learning_rate": 1.646574684050545e-05, + "loss": 2.0776, + "step": 3095 + }, + { + "epoch": 0.5960006737733715, + "grad_norm": 2.774132946873311, + "learning_rate": 1.646336803665928e-05, + "loss": 2.0849, + "step": 3096 + }, + { + "epoch": 0.5960006737733715, + "lm_loss": 2.1088, + "step": 3096, + "vm_loss": 0.2047 + }, + { + "epoch": 0.5960006737733715, + "lm_loss": 1.8132, + "step": 3096, + "vm_loss": 0.2243 + }, + { + "epoch": 0.5960006737733715, + "lm_loss": 1.6197, + "step": 3096, + "vm_loss": 0.2126 + }, + { + "epoch": 0.5960006737733715, + "lm_loss": 1.9509, + "step": 3096, + "vm_loss": 0.1758 + }, + { + "epoch": 0.5960006737733715, + "lm_loss": 1.8054, + "step": 3096, + "vm_loss": 0.2375 + }, + { + "epoch": 0.5960006737733715, + "lm_loss": 1.9674, + "step": 3096, + "vm_loss": 0.1667 + }, + { + "epoch": 0.5960006737733715, + "lm_loss": 1.6279, + "step": 3096, + "vm_loss": 0.234 + }, + { + "epoch": 0.5960006737733715, + "lm_loss": 1.5834, + "step": 3096, + "vm_loss": 0.1697 + }, + { + "epoch": 0.5961931804509469, + "grad_norm": 3.1102912681314456, + "learning_rate": 1.6460988604491044e-05, + "loss": 2.0974, + "step": 3097 + }, + { + "epoch": 0.5963856871285222, + "grad_norm": 3.209904410802056, + "learning_rate": 1.6458608544232052e-05, + "loss": 2.098, + "step": 3098 + }, + { + "epoch": 0.5965781938060977, + "grad_norm": 2.959313334858762, + "learning_rate": 1.6456227856113683e-05, + "loss": 2.0795, + "step": 3099 + }, + { + "epoch": 0.596770700483673, + "grad_norm": 2.7199332191624537, + "learning_rate": 1.645384654036737e-05, + "loss": 2.138, + "step": 3100 + }, + { + "epoch": 0.5969632071612484, + "grad_norm": 2.759879116157722, + "learning_rate": 1.6451464597224603e-05, + "loss": 2.0507, + "step": 3101 + }, + { + "epoch": 0.5971557138388238, + "grad_norm": 3.119661728528989, + "learning_rate": 1.6449082026916934e-05, + "loss": 2.229, + "step": 3102 + }, + { + "epoch": 0.5973482205163991, + "grad_norm": 3.0034052779210647, + "learning_rate": 1.6446698829675986e-05, + "loss": 2.1487, + "step": 3103 + }, + { + "epoch": 0.5975407271939746, + "grad_norm": 2.863377385051883, + "learning_rate": 1.644431500573344e-05, + "loss": 2.108, + "step": 3104 + }, + { + "epoch": 0.5975407271939746, + "lm_loss": 2.2092, + "step": 3104, + "vm_loss": 0.2035 + }, + { + "epoch": 0.5975407271939746, + "lm_loss": 2.1008, + "step": 3104, + "vm_loss": 0.1878 + }, + { + "epoch": 0.5975407271939746, + "lm_loss": 2.213, + "step": 3104, + "vm_loss": 0.1493 + }, + { + "epoch": 0.5975407271939746, + "lm_loss": 2.0819, + "step": 3104, + "vm_loss": 0.1404 + }, + { + "epoch": 0.5975407271939746, + "lm_loss": 2.1218, + "step": 3104, + "vm_loss": 0.1822 + }, + { + "epoch": 0.5975407271939746, + "lm_loss": 1.9068, + "step": 3104, + "vm_loss": 0.1857 + }, + { + "epoch": 0.5975407271939746, + "lm_loss": 1.9765, + "step": 3104, + "vm_loss": 0.1267 + }, + { + "epoch": 0.5975407271939746, + "lm_loss": 1.92, + "step": 3104, + "vm_loss": 0.1573 + }, + { + "epoch": 0.59773323387155, + "grad_norm": 2.9689068731455723, + "learning_rate": 1.6441930555321026e-05, + "loss": 2.1197, + "step": 3105 + }, + { + "epoch": 0.5979257405491253, + "grad_norm": 3.097564050558102, + "learning_rate": 1.6439545478670543e-05, + "loss": 2.1217, + "step": 3106 + }, + { + "epoch": 0.5981182472267007, + "grad_norm": 2.9485061072165055, + "learning_rate": 1.6437159776013856e-05, + "loss": 2.1186, + "step": 3107 + }, + { + "epoch": 0.598310753904276, + "grad_norm": 2.9449434150118288, + "learning_rate": 1.6434773447582882e-05, + "loss": 2.1196, + "step": 3108 + }, + { + "epoch": 0.5985032605818514, + "grad_norm": 2.892442618301742, + "learning_rate": 1.6432386493609606e-05, + "loss": 2.1639, + "step": 3109 + }, + { + "epoch": 0.5986957672594269, + "grad_norm": 2.9301560663595287, + "learning_rate": 1.6429998914326064e-05, + "loss": 2.1238, + "step": 3110 + }, + { + "epoch": 0.5988882739370022, + "grad_norm": 2.5957474368595737, + "learning_rate": 1.6427610709964367e-05, + "loss": 2.0595, + "step": 3111 + }, + { + "epoch": 0.5990807806145776, + "grad_norm": 3.0215715144038473, + "learning_rate": 1.6425221880756677e-05, + "loss": 2.1745, + "step": 3112 + }, + { + "epoch": 0.5990807806145776, + "lm_loss": 2.3693, + "step": 3112, + "vm_loss": 0.182 + }, + { + "epoch": 0.5990807806145776, + "lm_loss": 1.8474, + "step": 3112, + "vm_loss": 0.098 + }, + { + "epoch": 0.5990807806145776, + "lm_loss": 1.991, + "step": 3112, + "vm_loss": 0.2447 + }, + { + "epoch": 0.5990807806145776, + "lm_loss": 2.0397, + "step": 3112, + "vm_loss": 0.2014 + }, + { + "epoch": 0.5990807806145776, + "lm_loss": 2.028, + "step": 3112, + "vm_loss": 0.1778 + }, + { + "epoch": 0.5990807806145776, + "lm_loss": 2.1633, + "step": 3112, + "vm_loss": 0.1638 + }, + { + "epoch": 0.5990807806145776, + "lm_loss": 2.1718, + "step": 3112, + "vm_loss": 0.1621 + }, + { + "epoch": 0.5990807806145776, + "lm_loss": 1.8395, + "step": 3112, + "vm_loss": 0.1935 + }, + { + "epoch": 0.5992732872921529, + "grad_norm": 2.6427740186479283, + "learning_rate": 1.6422832426935216e-05, + "loss": 2.1091, + "step": 3113 + }, + { + "epoch": 0.5994657939697283, + "grad_norm": 2.7658436602417034, + "learning_rate": 1.642044234873227e-05, + "loss": 2.1472, + "step": 3114 + }, + { + "epoch": 0.5996583006473037, + "grad_norm": 2.791107423450398, + "learning_rate": 1.6418051646380187e-05, + "loss": 2.1144, + "step": 3115 + }, + { + "epoch": 0.599850807324879, + "grad_norm": 2.9253078480863177, + "learning_rate": 1.6415660320111376e-05, + "loss": 2.1129, + "step": 3116 + }, + { + "epoch": 0.6000433140024545, + "grad_norm": 2.749328051887258, + "learning_rate": 1.64132683701583e-05, + "loss": 2.1051, + "step": 3117 + }, + { + "epoch": 0.6002358206800298, + "grad_norm": 2.6775405828032395, + "learning_rate": 1.641087579675349e-05, + "loss": 2.1119, + "step": 3118 + }, + { + "epoch": 0.6004283273576052, + "grad_norm": 2.8670121783474016, + "learning_rate": 1.6408482600129526e-05, + "loss": 2.1357, + "step": 3119 + }, + { + "epoch": 0.6006208340351806, + "grad_norm": 2.6600516360354214, + "learning_rate": 1.640608878051907e-05, + "loss": 2.1157, + "step": 3120 + }, + { + "epoch": 0.6006208340351806, + "lm_loss": 1.7266, + "step": 3120, + "vm_loss": 0.1393 + }, + { + "epoch": 0.6006208340351806, + "lm_loss": 1.7653, + "step": 3120, + "vm_loss": 0.1958 + }, + { + "epoch": 0.6006208340351806, + "lm_loss": 1.8918, + "step": 3120, + "vm_loss": 0.1887 + }, + { + "epoch": 0.6006208340351806, + "lm_loss": 1.9661, + "step": 3120, + "vm_loss": 0.157 + }, + { + "epoch": 0.6006208340351806, + "lm_loss": 1.7109, + "step": 3120, + "vm_loss": 0.1042 + }, + { + "epoch": 0.6006208340351806, + "lm_loss": 2.0747, + "step": 3120, + "vm_loss": 0.2553 + }, + { + "epoch": 0.6006208340351806, + "lm_loss": 2.0993, + "step": 3120, + "vm_loss": 0.1852 + }, + { + "epoch": 0.6006208340351806, + "lm_loss": 1.8689, + "step": 3120, + "vm_loss": 0.1623 + }, + { + "epoch": 0.6008133407127559, + "grad_norm": 2.8378894612568004, + "learning_rate": 1.6403694338154828e-05, + "loss": 2.1019, + "step": 3121 + }, + { + "epoch": 0.6010058473903314, + "grad_norm": 2.964683498745593, + "learning_rate": 1.6401299273269572e-05, + "loss": 2.1403, + "step": 3122 + }, + { + "epoch": 0.6011983540679068, + "grad_norm": 2.9847529592062187, + "learning_rate": 1.6398903586096128e-05, + "loss": 2.1044, + "step": 3123 + }, + { + "epoch": 0.6013908607454821, + "grad_norm": 2.7649675136474383, + "learning_rate": 1.6396507276867386e-05, + "loss": 2.1079, + "step": 3124 + }, + { + "epoch": 0.6015833674230575, + "grad_norm": 2.700862714423676, + "learning_rate": 1.6394110345816302e-05, + "loss": 2.1276, + "step": 3125 + }, + { + "epoch": 0.6017758741006328, + "grad_norm": 2.8538236861747146, + "learning_rate": 1.639171279317589e-05, + "loss": 2.1021, + "step": 3126 + }, + { + "epoch": 0.6019683807782082, + "grad_norm": 2.782355506492004, + "learning_rate": 1.638931461917922e-05, + "loss": 2.1307, + "step": 3127 + }, + { + "epoch": 0.6021608874557837, + "grad_norm": 2.715795496543863, + "learning_rate": 1.6386915824059427e-05, + "loss": 2.1066, + "step": 3128 + }, + { + "epoch": 0.6021608874557837, + "lm_loss": 2.254, + "step": 3128, + "vm_loss": 0.1918 + }, + { + "epoch": 0.6021608874557837, + "lm_loss": 2.2265, + "step": 3128, + "vm_loss": 0.2451 + }, + { + "epoch": 0.6021608874557837, + "lm_loss": 1.8418, + "step": 3128, + "vm_loss": 0.1162 + }, + { + "epoch": 0.6021608874557837, + "lm_loss": 1.7923, + "step": 3128, + "vm_loss": 0.2015 + }, + { + "epoch": 0.6021608874557837, + "lm_loss": 1.6608, + "step": 3128, + "vm_loss": 0.163 + }, + { + "epoch": 0.6021608874557837, + "lm_loss": 1.5838, + "step": 3128, + "vm_loss": 0.2162 + }, + { + "epoch": 0.6021608874557837, + "lm_loss": 1.7733, + "step": 3128, + "vm_loss": 0.2257 + }, + { + "epoch": 0.6021608874557837, + "lm_loss": 2.1976, + "step": 3128, + "vm_loss": 0.1788 + }, + { + "epoch": 0.602353394133359, + "grad_norm": 2.556823839721738, + "learning_rate": 1.63845164080497e-05, + "loss": 2.1295, + "step": 3129 + }, + { + "epoch": 0.6025459008109344, + "grad_norm": 2.758001356529022, + "learning_rate": 1.63821163713833e-05, + "loss": 2.0889, + "step": 3130 + }, + { + "epoch": 0.6027384074885097, + "grad_norm": 2.758340263093989, + "learning_rate": 1.6379715714293536e-05, + "loss": 2.0958, + "step": 3131 + }, + { + "epoch": 0.6029309141660851, + "grad_norm": 2.7591547941026775, + "learning_rate": 1.6377314437013786e-05, + "loss": 2.0781, + "step": 3132 + }, + { + "epoch": 0.6031234208436606, + "grad_norm": 2.8910216216542612, + "learning_rate": 1.6374912539777483e-05, + "loss": 2.0632, + "step": 3133 + }, + { + "epoch": 0.6033159275212359, + "grad_norm": 2.973880034324225, + "learning_rate": 1.637251002281812e-05, + "loss": 2.1312, + "step": 3134 + }, + { + "epoch": 0.6035084341988113, + "grad_norm": 2.8086452247351206, + "learning_rate": 1.6370106886369253e-05, + "loss": 2.1051, + "step": 3135 + }, + { + "epoch": 0.6037009408763867, + "grad_norm": 2.993744282956302, + "learning_rate": 1.63677031306645e-05, + "loss": 2.0856, + "step": 3136 + }, + { + "epoch": 0.6037009408763867, + "lm_loss": 1.9913, + "step": 3136, + "vm_loss": 0.1618 + }, + { + "epoch": 0.6037009408763867, + "lm_loss": 1.6417, + "step": 3136, + "vm_loss": 0.1459 + }, + { + "epoch": 0.6037009408763867, + "lm_loss": 1.9158, + "step": 3136, + "vm_loss": 0.1745 + }, + { + "epoch": 0.6037009408763867, + "lm_loss": 1.5652, + "step": 3136, + "vm_loss": 0.1995 + }, + { + "epoch": 0.6037009408763867, + "lm_loss": 1.8118, + "step": 3136, + "vm_loss": 0.0926 + }, + { + "epoch": 0.6037009408763867, + "lm_loss": 2.135, + "step": 3136, + "vm_loss": 0.1796 + }, + { + "epoch": 0.6037009408763867, + "lm_loss": 2.1011, + "step": 3136, + "vm_loss": 0.2212 + }, + { + "epoch": 0.6037009408763867, + "lm_loss": 1.9596, + "step": 3136, + "vm_loss": 0.111 + }, + { + "epoch": 0.603893447553962, + "grad_norm": 2.9773326560686337, + "learning_rate": 1.6365298755937534e-05, + "loss": 2.0918, + "step": 3137 + }, + { + "epoch": 0.6040859542315374, + "grad_norm": 2.7659726024628633, + "learning_rate": 1.63628937624221e-05, + "loss": 2.0297, + "step": 3138 + }, + { + "epoch": 0.6042784609091127, + "grad_norm": 3.1548979907377968, + "learning_rate": 1.6360488150351984e-05, + "loss": 2.0924, + "step": 3139 + }, + { + "epoch": 0.6044709675866882, + "grad_norm": 2.8928207432105726, + "learning_rate": 1.6358081919961044e-05, + "loss": 2.1029, + "step": 3140 + }, + { + "epoch": 0.6046634742642636, + "grad_norm": 2.823551057318725, + "learning_rate": 1.63556750714832e-05, + "loss": 2.1177, + "step": 3141 + }, + { + "epoch": 0.6048559809418389, + "grad_norm": 2.846249474094296, + "learning_rate": 1.6353267605152424e-05, + "loss": 2.1025, + "step": 3142 + }, + { + "epoch": 0.6050484876194143, + "grad_norm": 3.054820514672883, + "learning_rate": 1.6350859521202755e-05, + "loss": 2.096, + "step": 3143 + }, + { + "epoch": 0.6052409942969896, + "grad_norm": 3.0712096143284207, + "learning_rate": 1.634845081986829e-05, + "loss": 2.0949, + "step": 3144 + }, + { + "epoch": 0.6052409942969896, + "lm_loss": 1.9218, + "step": 3144, + "vm_loss": 0.1275 + }, + { + "epoch": 0.6052409942969896, + "lm_loss": 2.0666, + "step": 3144, + "vm_loss": 0.1824 + }, + { + "epoch": 0.6052409942969896, + "lm_loss": 1.6691, + "step": 3144, + "vm_loss": 0.1933 + }, + { + "epoch": 0.6052409942969896, + "lm_loss": 1.6648, + "step": 3144, + "vm_loss": 0.1919 + }, + { + "epoch": 0.6052409942969896, + "lm_loss": 1.3433, + "step": 3144, + "vm_loss": 0.1404 + }, + { + "epoch": 0.6052409942969896, + "lm_loss": 1.6114, + "step": 3144, + "vm_loss": 0.1696 + }, + { + "epoch": 0.6052409942969896, + "lm_loss": 1.8505, + "step": 3144, + "vm_loss": 0.1787 + }, + { + "epoch": 0.6052409942969896, + "lm_loss": 2.4911, + "step": 3144, + "vm_loss": 0.1919 + }, + { + "epoch": 0.605433500974565, + "grad_norm": 2.835328495524342, + "learning_rate": 1.6346041501383184e-05, + "loss": 2.0802, + "step": 3145 + }, + { + "epoch": 0.6056260076521405, + "grad_norm": 2.92849519481618, + "learning_rate": 1.6343631565981654e-05, + "loss": 2.0441, + "step": 3146 + }, + { + "epoch": 0.6058185143297158, + "grad_norm": 3.0463062663964324, + "learning_rate": 1.634122101389798e-05, + "loss": 2.1568, + "step": 3147 + }, + { + "epoch": 0.6060110210072912, + "grad_norm": 3.1554149138705214, + "learning_rate": 1.6338809845366493e-05, + "loss": 2.0897, + "step": 3148 + }, + { + "epoch": 0.6062035276848665, + "grad_norm": 2.731802671555569, + "learning_rate": 1.6336398060621592e-05, + "loss": 2.1514, + "step": 3149 + }, + { + "epoch": 0.6063960343624419, + "grad_norm": 3.0152084637803873, + "learning_rate": 1.6333985659897737e-05, + "loss": 2.1413, + "step": 3150 + }, + { + "epoch": 0.6065885410400174, + "grad_norm": 3.0776264682704255, + "learning_rate": 1.6331572643429433e-05, + "loss": 2.1152, + "step": 3151 + }, + { + "epoch": 0.6067810477175927, + "grad_norm": 2.963824143166886, + "learning_rate": 1.6329159011451273e-05, + "loss": 2.1244, + "step": 3152 + }, + { + "epoch": 0.6067810477175927, + "lm_loss": 1.6866, + "step": 3152, + "vm_loss": 0.183 + }, + { + "epoch": 0.6067810477175927, + "lm_loss": 2.1177, + "step": 3152, + "vm_loss": 0.1355 + }, + { + "epoch": 0.6067810477175927, + "lm_loss": 1.9209, + "step": 3152, + "vm_loss": 0.1008 + }, + { + "epoch": 0.6067810477175927, + "lm_loss": 1.9096, + "step": 3152, + "vm_loss": 0.1522 + }, + { + "epoch": 0.6067810477175927, + "lm_loss": 2.2478, + "step": 3152, + "vm_loss": 0.1636 + }, + { + "epoch": 0.6067810477175927, + "lm_loss": 2.1336, + "step": 3152, + "vm_loss": 0.1122 + }, + { + "epoch": 0.6067810477175927, + "lm_loss": 1.64, + "step": 3152, + "vm_loss": 0.1402 + }, + { + "epoch": 0.6067810477175927, + "lm_loss": 2.1637, + "step": 3152, + "vm_loss": 0.1764 + }, + { + "epoch": 0.6069735543951681, + "grad_norm": 3.1438201711506903, + "learning_rate": 1.6326744764197876e-05, + "loss": 2.1044, + "step": 3153 + }, + { + "epoch": 0.6071660610727435, + "grad_norm": 3.0859865381780835, + "learning_rate": 1.6324329901903946e-05, + "loss": 2.0864, + "step": 3154 + }, + { + "epoch": 0.6073585677503188, + "grad_norm": 3.2253098221943093, + "learning_rate": 1.632191442480424e-05, + "loss": 2.1114, + "step": 3155 + }, + { + "epoch": 0.6075510744278942, + "grad_norm": 2.8904717869222023, + "learning_rate": 1.6319498333133572e-05, + "loss": 2.1729, + "step": 3156 + }, + { + "epoch": 0.6077435811054696, + "grad_norm": 2.9691321538621867, + "learning_rate": 1.6317081627126817e-05, + "loss": 2.1507, + "step": 3157 + }, + { + "epoch": 0.607936087783045, + "grad_norm": 3.1396755992246854, + "learning_rate": 1.6314664307018902e-05, + "loss": 2.0873, + "step": 3158 + }, + { + "epoch": 0.6081285944606204, + "grad_norm": 3.1259666135796147, + "learning_rate": 1.6312246373044835e-05, + "loss": 2.0331, + "step": 3159 + }, + { + "epoch": 0.6083211011381957, + "grad_norm": 3.1071483225402625, + "learning_rate": 1.630982782543966e-05, + "loss": 2.1273, + "step": 3160 + }, + { + "epoch": 0.6083211011381957, + "lm_loss": 1.8931, + "step": 3160, + "vm_loss": 0.1465 + }, + { + "epoch": 0.6083211011381957, + "lm_loss": 1.8999, + "step": 3160, + "vm_loss": 0.1578 + }, + { + "epoch": 0.6083211011381957, + "lm_loss": 1.9565, + "step": 3160, + "vm_loss": 0.1715 + }, + { + "epoch": 0.6083211011381957, + "lm_loss": 1.5077, + "step": 3160, + "vm_loss": 0.1974 + }, + { + "epoch": 0.6083211011381957, + "lm_loss": 1.7355, + "step": 3160, + "vm_loss": 0.139 + }, + { + "epoch": 0.6083211011381957, + "lm_loss": 1.6954, + "step": 3160, + "vm_loss": 0.1684 + }, + { + "epoch": 0.6083211011381957, + "lm_loss": 1.415, + "step": 3160, + "vm_loss": 0.163 + }, + { + "epoch": 0.6083211011381957, + "lm_loss": 1.5183, + "step": 3160, + "vm_loss": 0.1816 + }, + { + "epoch": 0.6085136078157711, + "grad_norm": 3.0467831333516613, + "learning_rate": 1.6307408664438496e-05, + "loss": 2.0497, + "step": 3161 + }, + { + "epoch": 0.6087061144933464, + "grad_norm": 3.4622780167619815, + "learning_rate": 1.6304988890276515e-05, + "loss": 2.0739, + "step": 3162 + }, + { + "epoch": 0.6088986211709219, + "grad_norm": 2.8572980280905367, + "learning_rate": 1.6302568503188946e-05, + "loss": 2.1324, + "step": 3163 + }, + { + "epoch": 0.6090911278484973, + "grad_norm": 2.8955750566187457, + "learning_rate": 1.630014750341109e-05, + "loss": 2.1081, + "step": 3164 + }, + { + "epoch": 0.6092836345260726, + "grad_norm": 3.022213257586785, + "learning_rate": 1.629772589117829e-05, + "loss": 2.0795, + "step": 3165 + }, + { + "epoch": 0.609476141203648, + "grad_norm": 2.911666092157024, + "learning_rate": 1.6295303666725963e-05, + "loss": 2.0625, + "step": 3166 + }, + { + "epoch": 0.6096686478812233, + "grad_norm": 2.987814984399549, + "learning_rate": 1.629288083028958e-05, + "loss": 2.1602, + "step": 3167 + }, + { + "epoch": 0.6098611545587987, + "grad_norm": 2.762750417446928, + "learning_rate": 1.629045738210467e-05, + "loss": 2.0978, + "step": 3168 + }, + { + "epoch": 0.6098611545587987, + "lm_loss": 1.8585, + "step": 3168, + "vm_loss": 0.1626 + }, + { + "epoch": 0.6098611545587987, + "lm_loss": 2.1456, + "step": 3168, + "vm_loss": 0.148 + }, + { + "epoch": 0.6098611545587987, + "lm_loss": 1.8984, + "step": 3168, + "vm_loss": 0.2566 + }, + { + "epoch": 0.6098611545587987, + "lm_loss": 1.7668, + "step": 3168, + "vm_loss": 0.171 + }, + { + "epoch": 0.6098611545587987, + "lm_loss": 1.9968, + "step": 3168, + "vm_loss": 0.1701 + }, + { + "epoch": 0.6098611545587987, + "lm_loss": 2.0106, + "step": 3168, + "vm_loss": 0.1816 + }, + { + "epoch": 0.6098611545587987, + "lm_loss": 2.0301, + "step": 3168, + "vm_loss": 0.2122 + }, + { + "epoch": 0.6098611545587987, + "lm_loss": 2.0269, + "step": 3168, + "vm_loss": 0.2335 + }, + { + "epoch": 0.6100536612363742, + "grad_norm": 3.19094204683108, + "learning_rate": 1.6288033322406826e-05, + "loss": 2.0942, + "step": 3169 + }, + { + "epoch": 0.6102461679139495, + "grad_norm": 3.0645364923540077, + "learning_rate": 1.6285608651431692e-05, + "loss": 2.0482, + "step": 3170 + }, + { + "epoch": 0.6104386745915249, + "grad_norm": 3.2824346827346496, + "learning_rate": 1.6283183369414986e-05, + "loss": 2.0982, + "step": 3171 + }, + { + "epoch": 0.6106311812691003, + "grad_norm": 2.9755716710347073, + "learning_rate": 1.6280757476592467e-05, + "loss": 2.1002, + "step": 3172 + }, + { + "epoch": 0.6108236879466756, + "grad_norm": 3.0240372525686907, + "learning_rate": 1.6278330973199963e-05, + "loss": 2.0837, + "step": 3173 + }, + { + "epoch": 0.611016194624251, + "grad_norm": 2.782826203392463, + "learning_rate": 1.6275903859473374e-05, + "loss": 2.1048, + "step": 3174 + }, + { + "epoch": 0.6112087013018264, + "grad_norm": 2.827268432431336, + "learning_rate": 1.627347613564863e-05, + "loss": 2.1476, + "step": 3175 + }, + { + "epoch": 0.6114012079794018, + "grad_norm": 3.0814375790338935, + "learning_rate": 1.627104780196175e-05, + "loss": 2.0954, + "step": 3176 + }, + { + "epoch": 0.6114012079794018, + "lm_loss": 1.8808, + "step": 3176, + "vm_loss": 0.1295 + }, + { + "epoch": 0.6114012079794018, + "lm_loss": 2.0095, + "step": 3176, + "vm_loss": 0.1568 + }, + { + "epoch": 0.6114012079794018, + "lm_loss": 2.1868, + "step": 3176, + "vm_loss": 0.1495 + }, + { + "epoch": 0.6114012079794018, + "lm_loss": 1.96, + "step": 3176, + "vm_loss": 0.2162 + }, + { + "epoch": 0.6114012079794018, + "lm_loss": 2.0044, + "step": 3176, + "vm_loss": 0.1335 + }, + { + "epoch": 0.6114012079794018, + "lm_loss": 1.9393, + "step": 3176, + "vm_loss": 0.1591 + }, + { + "epoch": 0.6114012079794018, + "lm_loss": 1.902, + "step": 3176, + "vm_loss": 0.1335 + }, + { + "epoch": 0.6114012079794018, + "lm_loss": 2.1005, + "step": 3176, + "vm_loss": 0.2449 + }, + { + "epoch": 0.6115937146569772, + "grad_norm": 3.014359561279996, + "learning_rate": 1.626861885864879e-05, + "loss": 2.0651, + "step": 3177 + }, + { + "epoch": 0.6117862213345525, + "grad_norm": 2.676790591742261, + "learning_rate": 1.6266189305945878e-05, + "loss": 2.0786, + "step": 3178 + }, + { + "epoch": 0.6119787280121279, + "grad_norm": 2.9025690876455963, + "learning_rate": 1.62637591440892e-05, + "loss": 2.1289, + "step": 3179 + }, + { + "epoch": 0.6121712346897032, + "grad_norm": 2.9615786598423557, + "learning_rate": 1.6261328373314995e-05, + "loss": 2.0853, + "step": 3180 + }, + { + "epoch": 0.6123637413672787, + "grad_norm": 2.9624895726333076, + "learning_rate": 1.6258896993859566e-05, + "loss": 2.0707, + "step": 3181 + }, + { + "epoch": 0.6125562480448541, + "grad_norm": 3.1354391140772133, + "learning_rate": 1.6256465005959277e-05, + "loss": 2.0831, + "step": 3182 + }, + { + "epoch": 0.6127487547224294, + "grad_norm": 3.215196700630077, + "learning_rate": 1.625403240985054e-05, + "loss": 2.1399, + "step": 3183 + }, + { + "epoch": 0.6129412614000048, + "grad_norm": 3.1360356171676966, + "learning_rate": 1.6251599205769845e-05, + "loss": 2.0937, + "step": 3184 + }, + { + "epoch": 0.6129412614000048, + "lm_loss": 1.941, + "step": 3184, + "vm_loss": 0.1791 + }, + { + "epoch": 0.6129412614000048, + "lm_loss": 1.8345, + "step": 3184, + "vm_loss": 0.1036 + }, + { + "epoch": 0.6129412614000048, + "lm_loss": 1.9953, + "step": 3184, + "vm_loss": 0.1877 + }, + { + "epoch": 0.6129412614000048, + "lm_loss": 2.1133, + "step": 3184, + "vm_loss": 0.1885 + }, + { + "epoch": 0.6129412614000048, + "lm_loss": 1.6006, + "step": 3184, + "vm_loss": 0.1144 + }, + { + "epoch": 0.6129412614000048, + "lm_loss": 2.1592, + "step": 3184, + "vm_loss": 0.1513 + }, + { + "epoch": 0.6129412614000048, + "lm_loss": 1.5276, + "step": 3184, + "vm_loss": 0.1628 + }, + { + "epoch": 0.6129412614000048, + "lm_loss": 1.8149, + "step": 3184, + "vm_loss": 0.1252 + }, + { + "epoch": 0.6131337680775802, + "grad_norm": 2.9057593517280487, + "learning_rate": 1.6249165393953727e-05, + "loss": 2.0647, + "step": 3185 + }, + { + "epoch": 0.6133262747551556, + "grad_norm": 2.8131783955620966, + "learning_rate": 1.624673097463878e-05, + "loss": 2.1041, + "step": 3186 + }, + { + "epoch": 0.613518781432731, + "grad_norm": 2.9169879856427743, + "learning_rate": 1.6244295948061663e-05, + "loss": 2.0291, + "step": 3187 + }, + { + "epoch": 0.6137112881103063, + "grad_norm": 2.9397036118165714, + "learning_rate": 1.6241860314459096e-05, + "loss": 2.1053, + "step": 3188 + }, + { + "epoch": 0.6139037947878817, + "grad_norm": 3.0678595748184203, + "learning_rate": 1.6239424074067845e-05, + "loss": 2.0949, + "step": 3189 + }, + { + "epoch": 0.6140963014654571, + "grad_norm": 2.9734960974328, + "learning_rate": 1.6236987227124748e-05, + "loss": 2.1509, + "step": 3190 + }, + { + "epoch": 0.6142888081430324, + "grad_norm": 3.043773111444623, + "learning_rate": 1.62345497738667e-05, + "loss": 2.0506, + "step": 3191 + }, + { + "epoch": 0.6144813148206079, + "grad_norm": 2.893973337494174, + "learning_rate": 1.623211171453065e-05, + "loss": 2.0905, + "step": 3192 + }, + { + "epoch": 0.6144813148206079, + "lm_loss": 1.7668, + "step": 3192, + "vm_loss": 0.1867 + }, + { + "epoch": 0.6144813148206079, + "lm_loss": 1.7622, + "step": 3192, + "vm_loss": 0.1882 + }, + { + "epoch": 0.6144813148206079, + "lm_loss": 1.4562, + "step": 3192, + "vm_loss": 0.2336 + }, + { + "epoch": 0.6144813148206079, + "lm_loss": 2.215, + "step": 3192, + "vm_loss": 0.244 + }, + { + "epoch": 0.6144813148206079, + "lm_loss": 1.8997, + "step": 3192, + "vm_loss": 0.2005 + }, + { + "epoch": 0.6144813148206079, + "lm_loss": 1.9953, + "step": 3192, + "vm_loss": 0.1201 + }, + { + "epoch": 0.6144813148206079, + "lm_loss": 1.7962, + "step": 3192, + "vm_loss": 0.1596 + }, + { + "epoch": 0.6144813148206079, + "lm_loss": 1.9989, + "step": 3192, + "vm_loss": 0.2018 + }, + { + "epoch": 0.6146738214981832, + "grad_norm": 2.9494806129924522, + "learning_rate": 1.6229673049353612e-05, + "loss": 2.0856, + "step": 3193 + }, + { + "epoch": 0.6148663281757586, + "grad_norm": 2.8932857741551157, + "learning_rate": 1.622723377857265e-05, + "loss": 2.1079, + "step": 3194 + }, + { + "epoch": 0.615058834853334, + "grad_norm": 2.7985344129187144, + "learning_rate": 1.62247939024249e-05, + "loss": 2.0593, + "step": 3195 + }, + { + "epoch": 0.6152513415309093, + "grad_norm": 3.2135061394917948, + "learning_rate": 1.622235342114754e-05, + "loss": 2.1201, + "step": 3196 + }, + { + "epoch": 0.6154438482084847, + "grad_norm": 2.756267579527573, + "learning_rate": 1.621991233497782e-05, + "loss": 2.0738, + "step": 3197 + }, + { + "epoch": 0.61563635488606, + "grad_norm": 2.743092268777963, + "learning_rate": 1.6217470644153045e-05, + "loss": 2.1584, + "step": 3198 + }, + { + "epoch": 0.6158288615636355, + "grad_norm": 2.7047178972930674, + "learning_rate": 1.6215028348910577e-05, + "loss": 2.0681, + "step": 3199 + }, + { + "epoch": 0.6160213682412109, + "grad_norm": 2.8421509802042193, + "learning_rate": 1.6212585449487842e-05, + "loss": 2.0451, + "step": 3200 + }, + { + "epoch": 0.6160213682412109, + "lm_loss": 2.2161, + "step": 3200, + "vm_loss": 0.1554 + }, + { + "epoch": 0.6160213682412109, + "lm_loss": 1.9087, + "step": 3200, + "vm_loss": 0.1663 + }, + { + "epoch": 0.6160213682412109, + "lm_loss": 1.67, + "step": 3200, + "vm_loss": 0.1341 + }, + { + "epoch": 0.6160213682412109, + "lm_loss": 2.2056, + "step": 3200, + "vm_loss": 0.1843 + }, + { + "epoch": 0.6160213682412109, + "lm_loss": 1.8152, + "step": 3200, + "vm_loss": 0.1492 + }, + { + "epoch": 0.6160213682412109, + "lm_loss": 1.9843, + "step": 3200, + "vm_loss": 0.1731 + }, + { + "epoch": 0.6160213682412109, + "lm_loss": 1.3233, + "step": 3200, + "vm_loss": 0.1528 + }, + { + "epoch": 0.6160213682412109, + "lm_loss": 1.9735, + "step": 3200, + "vm_loss": 0.1528 + }, + { + "epoch": 0.6162138749187862, + "grad_norm": 3.070108243125282, + "learning_rate": 1.6210141946122322e-05, + "loss": 2.061, + "step": 3201 + }, + { + "epoch": 0.6164063815963616, + "grad_norm": 2.9327218614405948, + "learning_rate": 1.620769783905155e-05, + "loss": 2.155, + "step": 3202 + }, + { + "epoch": 0.616598888273937, + "grad_norm": 2.8836796347423186, + "learning_rate": 1.6205253128513133e-05, + "loss": 2.0599, + "step": 3203 + }, + { + "epoch": 0.6167913949515124, + "grad_norm": 2.7829783265112447, + "learning_rate": 1.620280781474472e-05, + "loss": 2.0259, + "step": 3204 + }, + { + "epoch": 0.6169839016290878, + "grad_norm": 2.9808814874585265, + "learning_rate": 1.620036189798403e-05, + "loss": 2.1368, + "step": 3205 + }, + { + "epoch": 0.6171764083066631, + "grad_norm": 2.971563304864646, + "learning_rate": 1.619791537846884e-05, + "loss": 2.0435, + "step": 3206 + }, + { + "epoch": 0.6173689149842385, + "grad_norm": 2.7751986637485477, + "learning_rate": 1.619546825643698e-05, + "loss": 2.0856, + "step": 3207 + }, + { + "epoch": 0.6175614216618139, + "grad_norm": 2.693359645171731, + "learning_rate": 1.6193020532126342e-05, + "loss": 2.0517, + "step": 3208 + }, + { + "epoch": 0.6175614216618139, + "lm_loss": 1.5563, + "step": 3208, + "vm_loss": 0.1711 + }, + { + "epoch": 0.6175614216618139, + "lm_loss": 1.7835, + "step": 3208, + "vm_loss": 0.202 + }, + { + "epoch": 0.6175614216618139, + "lm_loss": 1.714, + "step": 3208, + "vm_loss": 0.1726 + }, + { + "epoch": 0.6175614216618139, + "lm_loss": 1.6305, + "step": 3208, + "vm_loss": 0.2519 + }, + { + "epoch": 0.6175614216618139, + "lm_loss": 1.7705, + "step": 3208, + "vm_loss": 0.1723 + }, + { + "epoch": 0.6175614216618139, + "lm_loss": 1.8373, + "step": 3208, + "vm_loss": 0.1779 + }, + { + "epoch": 0.6175614216618139, + "lm_loss": 1.822, + "step": 3208, + "vm_loss": 0.1399 + }, + { + "epoch": 0.6175614216618139, + "lm_loss": 1.5346, + "step": 3208, + "vm_loss": 0.2133 + }, + { + "epoch": 0.6177539283393892, + "grad_norm": 3.310893855951052, + "learning_rate": 1.619057220577487e-05, + "loss": 2.0618, + "step": 3209 + }, + { + "epoch": 0.6179464350169647, + "grad_norm": 2.928554604886684, + "learning_rate": 1.618812327762059e-05, + "loss": 2.1162, + "step": 3210 + }, + { + "epoch": 0.61813894169454, + "grad_norm": 2.942721976499006, + "learning_rate": 1.6185673747901554e-05, + "loss": 2.1128, + "step": 3211 + }, + { + "epoch": 0.6183314483721154, + "grad_norm": 2.820986691806293, + "learning_rate": 1.6183223616855888e-05, + "loss": 2.1592, + "step": 3212 + }, + { + "epoch": 0.6185239550496908, + "grad_norm": 2.6620656782849816, + "learning_rate": 1.6180772884721786e-05, + "loss": 2.1516, + "step": 3213 + }, + { + "epoch": 0.6187164617272661, + "grad_norm": 2.834615064973116, + "learning_rate": 1.617832155173748e-05, + "loss": 2.0964, + "step": 3214 + }, + { + "epoch": 0.6189089684048416, + "grad_norm": 2.770172171950774, + "learning_rate": 1.6175869618141278e-05, + "loss": 2.0133, + "step": 3215 + }, + { + "epoch": 0.619101475082417, + "grad_norm": 2.9966203528062665, + "learning_rate": 1.6173417084171537e-05, + "loss": 2.1524, + "step": 3216 + }, + { + "epoch": 0.619101475082417, + "lm_loss": 2.1031, + "step": 3216, + "vm_loss": 0.1685 + }, + { + "epoch": 0.619101475082417, + "lm_loss": 2.1308, + "step": 3216, + "vm_loss": 0.1652 + }, + { + "epoch": 0.619101475082417, + "lm_loss": 1.4265, + "step": 3216, + "vm_loss": 0.1476 + }, + { + "epoch": 0.619101475082417, + "lm_loss": 1.8085, + "step": 3216, + "vm_loss": 0.17 + }, + { + "epoch": 0.619101475082417, + "lm_loss": 1.9499, + "step": 3216, + "vm_loss": 0.2371 + }, + { + "epoch": 0.619101475082417, + "lm_loss": 1.7959, + "step": 3216, + "vm_loss": 0.1957 + }, + { + "epoch": 0.619101475082417, + "lm_loss": 1.7617, + "step": 3216, + "vm_loss": 0.1896 + }, + { + "epoch": 0.619101475082417, + "lm_loss": 2.1991, + "step": 3216, + "vm_loss": 0.1846 + }, + { + "epoch": 0.6192939817599923, + "grad_norm": 2.984699383955704, + "learning_rate": 1.6170963950066672e-05, + "loss": 2.0398, + "step": 3217 + }, + { + "epoch": 0.6194864884375677, + "grad_norm": 3.0863065514460186, + "learning_rate": 1.6168510216065162e-05, + "loss": 2.1241, + "step": 3218 + }, + { + "epoch": 0.619678995115143, + "grad_norm": 3.0806461972237273, + "learning_rate": 1.616605588240554e-05, + "loss": 2.1218, + "step": 3219 + }, + { + "epoch": 0.6198715017927184, + "grad_norm": 2.874351171395082, + "learning_rate": 1.6163600949326405e-05, + "loss": 2.0915, + "step": 3220 + }, + { + "epoch": 0.6200640084702939, + "grad_norm": 2.7559362122654227, + "learning_rate": 1.6161145417066392e-05, + "loss": 2.0673, + "step": 3221 + }, + { + "epoch": 0.6202565151478692, + "grad_norm": 2.885909144866961, + "learning_rate": 1.615868928586423e-05, + "loss": 2.0847, + "step": 3222 + }, + { + "epoch": 0.6204490218254446, + "grad_norm": 2.8390260663978815, + "learning_rate": 1.615623255595867e-05, + "loss": 2.1185, + "step": 3223 + }, + { + "epoch": 0.6206415285030199, + "grad_norm": 2.829431338923187, + "learning_rate": 1.6153775227588547e-05, + "loss": 2.1076, + "step": 3224 + }, + { + "epoch": 0.6206415285030199, + "lm_loss": 2.3484, + "step": 3224, + "vm_loss": 0.1346 + }, + { + "epoch": 0.6206415285030199, + "lm_loss": 1.7484, + "step": 3224, + "vm_loss": 0.1988 + }, + { + "epoch": 0.6206415285030199, + "lm_loss": 2.1724, + "step": 3224, + "vm_loss": 0.1275 + }, + { + "epoch": 0.6206415285030199, + "lm_loss": 1.62, + "step": 3224, + "vm_loss": 0.1905 + }, + { + "epoch": 0.6206415285030199, + "lm_loss": 2.007, + "step": 3224, + "vm_loss": 0.139 + }, + { + "epoch": 0.6206415285030199, + "lm_loss": 1.5406, + "step": 3224, + "vm_loss": 0.2257 + }, + { + "epoch": 0.6206415285030199, + "lm_loss": 2.0054, + "step": 3224, + "vm_loss": 0.2293 + }, + { + "epoch": 0.6206415285030199, + "lm_loss": 1.9702, + "step": 3224, + "vm_loss": 0.1394 + }, + { + "epoch": 0.6208340351805953, + "grad_norm": 2.721669568085743, + "learning_rate": 1.6151317300992743e-05, + "loss": 2.0792, + "step": 3225 + }, + { + "epoch": 0.6210265418581707, + "grad_norm": 2.793934930081318, + "learning_rate": 1.61488587764102e-05, + "loss": 2.0452, + "step": 3226 + }, + { + "epoch": 0.621219048535746, + "grad_norm": 2.9110488755819075, + "learning_rate": 1.6146399654079915e-05, + "loss": 2.0789, + "step": 3227 + }, + { + "epoch": 0.6214115552133215, + "grad_norm": 2.7810142670961495, + "learning_rate": 1.614393993424095e-05, + "loss": 2.1153, + "step": 3228 + }, + { + "epoch": 0.6216040618908968, + "grad_norm": 2.8281314844694174, + "learning_rate": 1.6141479617132417e-05, + "loss": 1.9844, + "step": 3229 + }, + { + "epoch": 0.6217965685684722, + "grad_norm": 2.723547867862937, + "learning_rate": 1.6139018702993495e-05, + "loss": 2.003, + "step": 3230 + }, + { + "epoch": 0.6219890752460476, + "grad_norm": 2.720139820849798, + "learning_rate": 1.613655719206341e-05, + "loss": 2.1188, + "step": 3231 + }, + { + "epoch": 0.6221815819236229, + "grad_norm": 2.797496042123596, + "learning_rate": 1.613409508458146e-05, + "loss": 2.0817, + "step": 3232 + }, + { + "epoch": 0.6221815819236229, + "lm_loss": 1.4603, + "step": 3232, + "vm_loss": 0.2047 + }, + { + "epoch": 0.6221815819236229, + "lm_loss": 1.97, + "step": 3232, + "vm_loss": 0.1874 + }, + { + "epoch": 0.6221815819236229, + "lm_loss": 1.8605, + "step": 3232, + "vm_loss": 0.1649 + }, + { + "epoch": 0.6221815819236229, + "lm_loss": 2.1665, + "step": 3232, + "vm_loss": 0.1458 + }, + { + "epoch": 0.6221815819236229, + "lm_loss": 1.7954, + "step": 3232, + "vm_loss": 0.1292 + }, + { + "epoch": 0.6221815819236229, + "lm_loss": 2.0923, + "step": 3232, + "vm_loss": 0.1504 + }, + { + "epoch": 0.6221815819236229, + "lm_loss": 1.5658, + "step": 3232, + "vm_loss": 0.1456 + }, + { + "epoch": 0.6221815819236229, + "lm_loss": 1.6446, + "step": 3232, + "vm_loss": 0.2006 + }, + { + "epoch": 0.6223740886011984, + "grad_norm": 2.836820240361788, + "learning_rate": 1.613163238078699e-05, + "loss": 2.053, + "step": 3233 + }, + { + "epoch": 0.6225665952787738, + "grad_norm": 2.8689611065856404, + "learning_rate": 1.6129169080919405e-05, + "loss": 2.0844, + "step": 3234 + }, + { + "epoch": 0.6227591019563491, + "grad_norm": 2.958303885276219, + "learning_rate": 1.612670518521817e-05, + "loss": 2.1158, + "step": 3235 + }, + { + "epoch": 0.6229516086339245, + "grad_norm": 3.1440409362555144, + "learning_rate": 1.6124240693922808e-05, + "loss": 2.0957, + "step": 3236 + }, + { + "epoch": 0.6231441153114998, + "grad_norm": 2.72055694405428, + "learning_rate": 1.61217756072729e-05, + "loss": 2.033, + "step": 3237 + }, + { + "epoch": 0.6233366219890752, + "grad_norm": 3.1464460416021787, + "learning_rate": 1.611930992550808e-05, + "loss": 2.0671, + "step": 3238 + }, + { + "epoch": 0.6235291286666507, + "grad_norm": 3.2932572541966527, + "learning_rate": 1.6116843648868047e-05, + "loss": 2.0354, + "step": 3239 + }, + { + "epoch": 0.623721635344226, + "grad_norm": 3.0321716212657552, + "learning_rate": 1.6114376777592556e-05, + "loss": 2.079, + "step": 3240 + }, + { + "epoch": 0.623721635344226, + "lm_loss": 1.8431, + "step": 3240, + "vm_loss": 0.1987 + }, + { + "epoch": 0.623721635344226, + "lm_loss": 1.5771, + "step": 3240, + "vm_loss": 0.1908 + }, + { + "epoch": 0.623721635344226, + "lm_loss": 2.0431, + "step": 3240, + "vm_loss": 0.1179 + }, + { + "epoch": 0.623721635344226, + "lm_loss": 1.5818, + "step": 3240, + "vm_loss": 0.2094 + }, + { + "epoch": 0.623721635344226, + "lm_loss": 2.2321, + "step": 3240, + "vm_loss": 0.1557 + }, + { + "epoch": 0.623721635344226, + "lm_loss": 1.7004, + "step": 3240, + "vm_loss": 0.1574 + }, + { + "epoch": 0.623721635344226, + "lm_loss": 1.543, + "step": 3240, + "vm_loss": 0.1183 + }, + { + "epoch": 0.623721635344226, + "lm_loss": 1.6946, + "step": 3240, + "vm_loss": 0.1434 + }, + { + "epoch": 0.6239141420218014, + "grad_norm": 2.9879939493693977, + "learning_rate": 1.6111909311921413e-05, + "loss": 2.0229, + "step": 3241 + }, + { + "epoch": 0.6241066486993767, + "grad_norm": 3.22187543906383, + "learning_rate": 1.6109441252094495e-05, + "loss": 2.0647, + "step": 3242 + }, + { + "epoch": 0.6242991553769521, + "grad_norm": 3.0232401925477914, + "learning_rate": 1.610697259835172e-05, + "loss": 2.0914, + "step": 3243 + }, + { + "epoch": 0.6244916620545276, + "grad_norm": 3.062352805921721, + "learning_rate": 1.610450335093308e-05, + "loss": 2.1114, + "step": 3244 + }, + { + "epoch": 0.6246841687321029, + "grad_norm": 3.4633648377938755, + "learning_rate": 1.610203351007862e-05, + "loss": 2.0415, + "step": 3245 + }, + { + "epoch": 0.6248766754096783, + "grad_norm": 3.042531011877521, + "learning_rate": 1.6099563076028427e-05, + "loss": 2.0638, + "step": 3246 + }, + { + "epoch": 0.6250691820872537, + "grad_norm": 2.9386519431627427, + "learning_rate": 1.609709204902267e-05, + "loss": 2.0361, + "step": 3247 + }, + { + "epoch": 0.625261688764829, + "grad_norm": 3.4657798473256043, + "learning_rate": 1.609462042930156e-05, + "loss": 2.1306, + "step": 3248 + }, + { + "epoch": 0.625261688764829, + "lm_loss": 2.0203, + "step": 3248, + "vm_loss": 0.1773 + }, + { + "epoch": 0.625261688764829, + "lm_loss": 1.8791, + "step": 3248, + "vm_loss": 0.1747 + }, + { + "epoch": 0.625261688764829, + "lm_loss": 1.4903, + "step": 3248, + "vm_loss": 0.1864 + }, + { + "epoch": 0.625261688764829, + "lm_loss": 1.8407, + "step": 3248, + "vm_loss": 0.1997 + }, + { + "epoch": 0.625261688764829, + "lm_loss": 1.6741, + "step": 3248, + "vm_loss": 0.1887 + }, + { + "epoch": 0.625261688764829, + "lm_loss": 1.9715, + "step": 3248, + "vm_loss": 0.2259 + }, + { + "epoch": 0.625261688764829, + "lm_loss": 1.5132, + "step": 3248, + "vm_loss": 0.1644 + }, + { + "epoch": 0.625261688764829, + "lm_loss": 1.8687, + "step": 3248, + "vm_loss": 0.2442 + }, + { + "epoch": 0.6254541954424044, + "grad_norm": 3.160398971061847, + "learning_rate": 1.6092148217105372e-05, + "loss": 2.0839, + "step": 3249 + }, + { + "epoch": 0.6256467021199797, + "grad_norm": 2.8940007770699894, + "learning_rate": 1.6089675412674436e-05, + "loss": 2.0199, + "step": 3250 + }, + { + "epoch": 0.6258392087975552, + "grad_norm": 2.6958379679164155, + "learning_rate": 1.608720201624914e-05, + "loss": 2.0245, + "step": 3251 + }, + { + "epoch": 0.6260317154751306, + "grad_norm": 3.306982120459445, + "learning_rate": 1.6084728028069935e-05, + "loss": 2.0307, + "step": 3252 + }, + { + "epoch": 0.6262242221527059, + "grad_norm": 3.1510069115974915, + "learning_rate": 1.6082253448377312e-05, + "loss": 2.0957, + "step": 3253 + }, + { + "epoch": 0.6264167288302813, + "grad_norm": 2.9267431955331085, + "learning_rate": 1.6079778277411843e-05, + "loss": 2.0544, + "step": 3254 + }, + { + "epoch": 0.6266092355078566, + "grad_norm": 2.8269267507829356, + "learning_rate": 1.6077302515414142e-05, + "loss": 2.0425, + "step": 3255 + }, + { + "epoch": 0.626801742185432, + "grad_norm": 2.8171361752175192, + "learning_rate": 1.6074826162624883e-05, + "loss": 2.1019, + "step": 3256 + }, + { + "epoch": 0.626801742185432, + "lm_loss": 1.6816, + "step": 3256, + "vm_loss": 0.1453 + }, + { + "epoch": 0.626801742185432, + "lm_loss": 1.9142, + "step": 3256, + "vm_loss": 0.1295 + }, + { + "epoch": 0.626801742185432, + "lm_loss": 1.7568, + "step": 3256, + "vm_loss": 0.2309 + }, + { + "epoch": 0.626801742185432, + "lm_loss": 2.1106, + "step": 3256, + "vm_loss": 0.1654 + }, + { + "epoch": 0.626801742185432, + "lm_loss": 1.9164, + "step": 3256, + "vm_loss": 0.1607 + }, + { + "epoch": 0.626801742185432, + "lm_loss": 1.7232, + "step": 3256, + "vm_loss": 0.1295 + }, + { + "epoch": 0.626801742185432, + "lm_loss": 2.0661, + "step": 3256, + "vm_loss": 0.2257 + }, + { + "epoch": 0.626801742185432, + "lm_loss": 1.93, + "step": 3256, + "vm_loss": 0.1922 + }, + { + "epoch": 0.6269942488630075, + "grad_norm": 2.958991963577431, + "learning_rate": 1.6072349219284805e-05, + "loss": 2.0596, + "step": 3257 + }, + { + "epoch": 0.6271867555405828, + "grad_norm": 2.7804274531826523, + "learning_rate": 1.606987168563469e-05, + "loss": 2.0335, + "step": 3258 + }, + { + "epoch": 0.6273792622181582, + "grad_norm": 2.879610627506035, + "learning_rate": 1.6067393561915395e-05, + "loss": 2.0597, + "step": 3259 + }, + { + "epoch": 0.6275717688957335, + "grad_norm": 2.9926630797216034, + "learning_rate": 1.6064914848367818e-05, + "loss": 2.0735, + "step": 3260 + }, + { + "epoch": 0.6277642755733089, + "grad_norm": 2.8666754652569235, + "learning_rate": 1.606243554523293e-05, + "loss": 2.054, + "step": 3261 + }, + { + "epoch": 0.6279567822508844, + "grad_norm": 3.275475330705778, + "learning_rate": 1.605995565275174e-05, + "loss": 2.1118, + "step": 3262 + }, + { + "epoch": 0.6281492889284597, + "grad_norm": 2.835854144413279, + "learning_rate": 1.6057475171165333e-05, + "loss": 2.0543, + "step": 3263 + }, + { + "epoch": 0.6283417956060351, + "grad_norm": 3.0033977622092176, + "learning_rate": 1.605499410071484e-05, + "loss": 2.094, + "step": 3264 + }, + { + "epoch": 0.6283417956060351, + "lm_loss": 1.9847, + "step": 3264, + "vm_loss": 0.1007 + }, + { + "epoch": 0.6283417956060351, + "lm_loss": 1.4738, + "step": 3264, + "vm_loss": 0.1834 + }, + { + "epoch": 0.6283417956060351, + "lm_loss": 1.9652, + "step": 3264, + "vm_loss": 0.1837 + }, + { + "epoch": 0.6283417956060351, + "lm_loss": 1.4214, + "step": 3264, + "vm_loss": 0.1369 + }, + { + "epoch": 0.6283417956060351, + "lm_loss": 1.7637, + "step": 3264, + "vm_loss": 0.1725 + }, + { + "epoch": 0.6283417956060351, + "lm_loss": 2.0176, + "step": 3264, + "vm_loss": 0.1923 + }, + { + "epoch": 0.6283417956060351, + "lm_loss": 1.7222, + "step": 3264, + "vm_loss": 0.1663 + }, + { + "epoch": 0.6283417956060351, + "lm_loss": 1.7601, + "step": 3264, + "vm_loss": 0.1608 + }, + { + "epoch": 0.6285343022836105, + "grad_norm": 2.659107301330133, + "learning_rate": 1.6052512441641456e-05, + "loss": 2.0407, + "step": 3265 + }, + { + "epoch": 0.6287268089611858, + "grad_norm": 2.8663836480113534, + "learning_rate": 1.605003019418643e-05, + "loss": 2.0407, + "step": 3266 + }, + { + "epoch": 0.6289193156387612, + "grad_norm": 3.0307360465573834, + "learning_rate": 1.6047547358591063e-05, + "loss": 2.0345, + "step": 3267 + }, + { + "epoch": 0.6291118223163366, + "grad_norm": 2.9942403075182065, + "learning_rate": 1.604506393509673e-05, + "loss": 2.045, + "step": 3268 + }, + { + "epoch": 0.629304328993912, + "grad_norm": 2.9463247531711736, + "learning_rate": 1.6042579923944834e-05, + "loss": 2.012, + "step": 3269 + }, + { + "epoch": 0.6294968356714874, + "grad_norm": 2.9749780526600307, + "learning_rate": 1.6040095325376867e-05, + "loss": 2.076, + "step": 3270 + }, + { + "epoch": 0.6296893423490627, + "grad_norm": 2.9523475624585664, + "learning_rate": 1.6037610139634358e-05, + "loss": 2.0609, + "step": 3271 + }, + { + "epoch": 0.6298818490266381, + "grad_norm": 2.7609148414533142, + "learning_rate": 1.60351243669589e-05, + "loss": 2.0429, + "step": 3272 + }, + { + "epoch": 0.6298818490266381, + "lm_loss": 1.7073, + "step": 3272, + "vm_loss": 0.1606 + }, + { + "epoch": 0.6298818490266381, + "lm_loss": 1.8577, + "step": 3272, + "vm_loss": 0.2083 + }, + { + "epoch": 0.6298818490266381, + "lm_loss": 1.9248, + "step": 3272, + "vm_loss": 0.1104 + }, + { + "epoch": 0.6298818490266381, + "lm_loss": 1.7696, + "step": 3272, + "vm_loss": 0.1465 + }, + { + "epoch": 0.6298818490266381, + "lm_loss": 2.3402, + "step": 3272, + "vm_loss": 0.1358 + }, + { + "epoch": 0.6298818490266381, + "lm_loss": 2.0146, + "step": 3272, + "vm_loss": 0.2233 + }, + { + "epoch": 0.6298818490266381, + "lm_loss": 1.5198, + "step": 3272, + "vm_loss": 0.2117 + }, + { + "epoch": 0.6298818490266381, + "lm_loss": 1.6701, + "step": 3272, + "vm_loss": 0.1037 + }, + { + "epoch": 0.6300743557042134, + "grad_norm": 2.9176825605467607, + "learning_rate": 1.6032638007592143e-05, + "loss": 2.0413, + "step": 3273 + }, + { + "epoch": 0.6302668623817889, + "grad_norm": 2.978385302217501, + "learning_rate": 1.603015106177579e-05, + "loss": 1.9534, + "step": 3274 + }, + { + "epoch": 0.6304593690593643, + "grad_norm": 3.019605757575759, + "learning_rate": 1.60276635297516e-05, + "loss": 2.0332, + "step": 3275 + }, + { + "epoch": 0.6306518757369396, + "grad_norm": 2.978349274068598, + "learning_rate": 1.602517541176141e-05, + "loss": 2.0575, + "step": 3276 + }, + { + "epoch": 0.630844382414515, + "grad_norm": 2.928080702894649, + "learning_rate": 1.602268670804708e-05, + "loss": 2.0351, + "step": 3277 + }, + { + "epoch": 0.6310368890920904, + "grad_norm": 2.8851228608228907, + "learning_rate": 1.602019741885055e-05, + "loss": 2.0468, + "step": 3278 + }, + { + "epoch": 0.6312293957696657, + "grad_norm": 2.934821805893519, + "learning_rate": 1.6017707544413808e-05, + "loss": 2.0145, + "step": 3279 + }, + { + "epoch": 0.6314219024472412, + "grad_norm": 2.6787353699381624, + "learning_rate": 1.6015217084978908e-05, + "loss": 2.052, + "step": 3280 + }, + { + "epoch": 0.6314219024472412, + "lm_loss": 1.7433, + "step": 3280, + "vm_loss": 0.1663 + }, + { + "epoch": 0.6314219024472412, + "lm_loss": 1.6783, + "step": 3280, + "vm_loss": 0.1943 + }, + { + "epoch": 0.6314219024472412, + "lm_loss": 1.638, + "step": 3280, + "vm_loss": 0.2398 + }, + { + "epoch": 0.6314219024472412, + "lm_loss": 1.8944, + "step": 3280, + "vm_loss": 0.1291 + }, + { + "epoch": 0.6314219024472412, + "lm_loss": 1.9888, + "step": 3280, + "vm_loss": 0.1855 + }, + { + "epoch": 0.6314219024472412, + "lm_loss": 1.7269, + "step": 3280, + "vm_loss": 0.1202 + }, + { + "epoch": 0.6314219024472412, + "lm_loss": 1.8964, + "step": 3280, + "vm_loss": 0.1907 + }, + { + "epoch": 0.6314219024472412, + "lm_loss": 2.2046, + "step": 3280, + "vm_loss": 0.1276 + }, + { + "epoch": 0.6316144091248165, + "grad_norm": 2.8850037555915695, + "learning_rate": 1.6012726040787944e-05, + "loss": 2.054, + "step": 3281 + }, + { + "epoch": 0.6318069158023919, + "grad_norm": 2.951420455515323, + "learning_rate": 1.601023441208309e-05, + "loss": 2.1242, + "step": 3282 + }, + { + "epoch": 0.6319994224799673, + "grad_norm": 3.196164145490763, + "learning_rate": 1.600774219910655e-05, + "loss": 2.077, + "step": 3283 + }, + { + "epoch": 0.6321919291575426, + "grad_norm": 2.82576519307112, + "learning_rate": 1.6005249402100612e-05, + "loss": 1.9973, + "step": 3284 + }, + { + "epoch": 0.632384435835118, + "grad_norm": 2.9895702645153532, + "learning_rate": 1.6002756021307603e-05, + "loss": 2.0559, + "step": 3285 + }, + { + "epoch": 0.6325769425126934, + "grad_norm": 3.3972380122932937, + "learning_rate": 1.600026205696991e-05, + "loss": 2.0727, + "step": 3286 + }, + { + "epoch": 0.6327694491902688, + "grad_norm": 2.8123570879748407, + "learning_rate": 1.599776750932998e-05, + "loss": 2.074, + "step": 3287 + }, + { + "epoch": 0.6329619558678442, + "grad_norm": 3.016210079875932, + "learning_rate": 1.5995272378630313e-05, + "loss": 2.0213, + "step": 3288 + }, + { + "epoch": 0.6329619558678442, + "lm_loss": 1.8798, + "step": 3288, + "vm_loss": 0.2196 + }, + { + "epoch": 0.6329619558678442, + "lm_loss": 1.6479, + "step": 3288, + "vm_loss": 0.189 + }, + { + "epoch": 0.6329619558678442, + "lm_loss": 1.615, + "step": 3288, + "vm_loss": 0.2133 + }, + { + "epoch": 0.6329619558678442, + "lm_loss": 1.9822, + "step": 3288, + "vm_loss": 0.1498 + }, + { + "epoch": 0.6329619558678442, + "lm_loss": 1.799, + "step": 3288, + "vm_loss": 0.1469 + }, + { + "epoch": 0.6329619558678442, + "lm_loss": 1.7904, + "step": 3288, + "vm_loss": 0.2208 + }, + { + "epoch": 0.6329619558678442, + "lm_loss": 2.207, + "step": 3288, + "vm_loss": 0.1454 + }, + { + "epoch": 0.6329619558678442, + "lm_loss": 1.9522, + "step": 3288, + "vm_loss": 0.2228 + }, + { + "epoch": 0.6331544625454195, + "grad_norm": 3.6456296755612674, + "learning_rate": 1.599277666511347e-05, + "loss": 2.0451, + "step": 3289 + }, + { + "epoch": 0.6333469692229949, + "grad_norm": 3.0173959761456217, + "learning_rate": 1.5990280369022064e-05, + "loss": 2.0896, + "step": 3290 + }, + { + "epoch": 0.6335394759005702, + "grad_norm": 3.003685282374375, + "learning_rate": 1.598778349059877e-05, + "loss": 2.1061, + "step": 3291 + }, + { + "epoch": 0.6337319825781457, + "grad_norm": 3.467649304205802, + "learning_rate": 1.5985286030086315e-05, + "loss": 2.0255, + "step": 3292 + }, + { + "epoch": 0.6339244892557211, + "grad_norm": 3.118770581639357, + "learning_rate": 1.598278798772748e-05, + "loss": 2.0783, + "step": 3293 + }, + { + "epoch": 0.6341169959332964, + "grad_norm": 2.7747227595558503, + "learning_rate": 1.5980289363765113e-05, + "loss": 2.0201, + "step": 3294 + }, + { + "epoch": 0.6343095026108718, + "grad_norm": 2.976046922116868, + "learning_rate": 1.597779015844211e-05, + "loss": 2.0429, + "step": 3295 + }, + { + "epoch": 0.6345020092884472, + "grad_norm": 2.9661040809587047, + "learning_rate": 1.5975290372001423e-05, + "loss": 2.0405, + "step": 3296 + }, + { + "epoch": 0.6345020092884472, + "lm_loss": 1.8267, + "step": 3296, + "vm_loss": 0.1496 + }, + { + "epoch": 0.6345020092884472, + "lm_loss": 1.6175, + "step": 3296, + "vm_loss": 0.1906 + }, + { + "epoch": 0.6345020092884472, + "lm_loss": 1.7879, + "step": 3296, + "vm_loss": 0.1605 + }, + { + "epoch": 0.6345020092884472, + "lm_loss": 1.5495, + "step": 3296, + "vm_loss": 0.2055 + }, + { + "epoch": 0.6345020092884472, + "lm_loss": 2.0817, + "step": 3296, + "vm_loss": 0.157 + }, + { + "epoch": 0.6345020092884472, + "lm_loss": 2.0249, + "step": 3296, + "vm_loss": 0.1448 + }, + { + "epoch": 0.6345020092884472, + "lm_loss": 2.1368, + "step": 3296, + "vm_loss": 0.1582 + }, + { + "epoch": 0.6345020092884472, + "lm_loss": 1.75, + "step": 3296, + "vm_loss": 0.1818 + }, + { + "epoch": 0.6346945159660226, + "grad_norm": 2.7107119940730566, + "learning_rate": 1.597279000468607e-05, + "loss": 2.034, + "step": 3297 + }, + { + "epoch": 0.634887022643598, + "grad_norm": 2.888205903036789, + "learning_rate": 1.597028905673911e-05, + "loss": 2.0562, + "step": 3298 + }, + { + "epoch": 0.6350795293211733, + "grad_norm": 3.327241549349959, + "learning_rate": 1.596778752840367e-05, + "loss": 2.0407, + "step": 3299 + }, + { + "epoch": 0.6352720359987487, + "grad_norm": 3.053432659277193, + "learning_rate": 1.5965285419922936e-05, + "loss": 2.0587, + "step": 3300 + }, + { + "epoch": 0.6354645426763241, + "grad_norm": 2.760345374328317, + "learning_rate": 1.596278273154014e-05, + "loss": 2.0274, + "step": 3301 + }, + { + "epoch": 0.6356570493538994, + "grad_norm": 2.9083317304313994, + "learning_rate": 1.5960279463498576e-05, + "loss": 2.0866, + "step": 3302 + }, + { + "epoch": 0.6358495560314749, + "grad_norm": 2.669141789540415, + "learning_rate": 1.5957775616041594e-05, + "loss": 2.0148, + "step": 3303 + }, + { + "epoch": 0.6360420627090502, + "grad_norm": 2.8077454795126546, + "learning_rate": 1.5955271189412596e-05, + "loss": 2.0561, + "step": 3304 + }, + { + "epoch": 0.6360420627090502, + "lm_loss": 2.231, + "step": 3304, + "vm_loss": 0.1564 + }, + { + "epoch": 0.6360420627090502, + "lm_loss": 2.0571, + "step": 3304, + "vm_loss": 0.1618 + }, + { + "epoch": 0.6360420627090502, + "lm_loss": 2.3381, + "step": 3304, + "vm_loss": 0.2133 + }, + { + "epoch": 0.6360420627090502, + "lm_loss": 1.4315, + "step": 3304, + "vm_loss": 0.1601 + }, + { + "epoch": 0.6360420627090502, + "lm_loss": 1.6932, + "step": 3304, + "vm_loss": 0.1826 + }, + { + "epoch": 0.6360420627090502, + "lm_loss": 1.7046, + "step": 3304, + "vm_loss": 0.2253 + }, + { + "epoch": 0.6360420627090502, + "lm_loss": 1.2636, + "step": 3304, + "vm_loss": 0.196 + }, + { + "epoch": 0.6360420627090502, + "lm_loss": 2.1153, + "step": 3304, + "vm_loss": 0.1661 + }, + { + "epoch": 0.6362345693866256, + "grad_norm": 2.9974686567284006, + "learning_rate": 1.595276618385505e-05, + "loss": 2.07, + "step": 3305 + }, + { + "epoch": 0.636427076064201, + "grad_norm": 2.983821918017044, + "learning_rate": 1.5950260599612478e-05, + "loss": 2.0981, + "step": 3306 + }, + { + "epoch": 0.6366195827417763, + "grad_norm": 2.795410831641941, + "learning_rate": 1.5947754436928445e-05, + "loss": 2.069, + "step": 3307 + }, + { + "epoch": 0.6368120894193517, + "grad_norm": 2.9708374381594767, + "learning_rate": 1.594524769604659e-05, + "loss": 2.054, + "step": 3308 + }, + { + "epoch": 0.6370045960969272, + "grad_norm": 2.8290560136821172, + "learning_rate": 1.5942740377210597e-05, + "loss": 2.0266, + "step": 3309 + }, + { + "epoch": 0.6371971027745025, + "grad_norm": 2.6833896258540526, + "learning_rate": 1.5940232480664205e-05, + "loss": 2.1062, + "step": 3310 + }, + { + "epoch": 0.6373896094520779, + "grad_norm": 2.8616296115912934, + "learning_rate": 1.5937724006651222e-05, + "loss": 2.0091, + "step": 3311 + }, + { + "epoch": 0.6375821161296532, + "grad_norm": 2.9638169926847704, + "learning_rate": 1.5935214955415496e-05, + "loss": 2.0339, + "step": 3312 + }, + { + "epoch": 0.6375821161296532, + "lm_loss": 2.0585, + "step": 3312, + "vm_loss": 0.1967 + }, + { + "epoch": 0.6375821161296532, + "lm_loss": 1.6663, + "step": 3312, + "vm_loss": 0.2057 + }, + { + "epoch": 0.6375821161296532, + "lm_loss": 2.0045, + "step": 3312, + "vm_loss": 0.2354 + }, + { + "epoch": 0.6375821161296532, + "lm_loss": 1.53, + "step": 3312, + "vm_loss": 0.1436 + }, + { + "epoch": 0.6375821161296532, + "lm_loss": 2.0119, + "step": 3312, + "vm_loss": 0.2043 + }, + { + "epoch": 0.6375821161296532, + "lm_loss": 1.8432, + "step": 3312, + "vm_loss": 0.1152 + }, + { + "epoch": 0.6375821161296532, + "lm_loss": 1.9417, + "step": 3312, + "vm_loss": 0.2161 + }, + { + "epoch": 0.6375821161296532, + "lm_loss": 1.7011, + "step": 3312, + "vm_loss": 0.1962 + }, + { + "epoch": 0.6377746228072286, + "grad_norm": 2.9680209701670823, + "learning_rate": 1.5932705327200948e-05, + "loss": 2.0567, + "step": 3313 + }, + { + "epoch": 0.637967129484804, + "grad_norm": 3.145858056178398, + "learning_rate": 1.593019512225154e-05, + "loss": 2.064, + "step": 3314 + }, + { + "epoch": 0.6381596361623794, + "grad_norm": 2.8657016073930173, + "learning_rate": 1.5927684340811292e-05, + "loss": 2.0391, + "step": 3315 + }, + { + "epoch": 0.6383521428399548, + "grad_norm": 2.753424023140314, + "learning_rate": 1.5925172983124292e-05, + "loss": 2.0104, + "step": 3316 + }, + { + "epoch": 0.6385446495175301, + "grad_norm": 2.670779550129038, + "learning_rate": 1.592266104943467e-05, + "loss": 2.0245, + "step": 3317 + }, + { + "epoch": 0.6387371561951055, + "grad_norm": 2.8361908175389394, + "learning_rate": 1.5920148539986623e-05, + "loss": 2.016, + "step": 3318 + }, + { + "epoch": 0.6389296628726809, + "grad_norm": 2.802222203007585, + "learning_rate": 1.59176354550244e-05, + "loss": 2.0571, + "step": 3319 + }, + { + "epoch": 0.6391221695502562, + "grad_norm": 3.0332668143014265, + "learning_rate": 1.5915121794792302e-05, + "loss": 2.0137, + "step": 3320 + }, + { + "epoch": 0.6391221695502562, + "lm_loss": 2.1169, + "step": 3320, + "vm_loss": 0.1448 + }, + { + "epoch": 0.6391221695502562, + "lm_loss": 1.4978, + "step": 3320, + "vm_loss": 0.1309 + }, + { + "epoch": 0.6391221695502562, + "lm_loss": 2.097, + "step": 3320, + "vm_loss": 0.1545 + }, + { + "epoch": 0.6391221695502562, + "lm_loss": 1.794, + "step": 3320, + "vm_loss": 0.1678 + }, + { + "epoch": 0.6391221695502562, + "lm_loss": 1.6489, + "step": 3320, + "vm_loss": 0.155 + }, + { + "epoch": 0.6391221695502562, + "lm_loss": 2.1894, + "step": 3320, + "vm_loss": 0.2021 + }, + { + "epoch": 0.6391221695502562, + "lm_loss": 1.2987, + "step": 3320, + "vm_loss": 0.1787 + }, + { + "epoch": 0.6391221695502562, + "lm_loss": 1.9577, + "step": 3320, + "vm_loss": 0.0952 + }, + { + "epoch": 0.6393146762278317, + "grad_norm": 3.0584574826512205, + "learning_rate": 1.5912607559534684e-05, + "loss": 2.0532, + "step": 3321 + }, + { + "epoch": 0.639507182905407, + "grad_norm": 2.833461610120119, + "learning_rate": 1.5910092749495972e-05, + "loss": 2.0738, + "step": 3322 + }, + { + "epoch": 0.6396996895829824, + "grad_norm": 2.6985401222281262, + "learning_rate": 1.590757736492063e-05, + "loss": 2.0767, + "step": 3323 + }, + { + "epoch": 0.6398921962605578, + "grad_norm": 2.883571231355534, + "learning_rate": 1.5905061406053186e-05, + "loss": 1.9904, + "step": 3324 + }, + { + "epoch": 0.6400847029381331, + "grad_norm": 3.0584918647258514, + "learning_rate": 1.5902544873138228e-05, + "loss": 2.0095, + "step": 3325 + }, + { + "epoch": 0.6402772096157086, + "grad_norm": 2.8503687868954666, + "learning_rate": 1.5900027766420396e-05, + "loss": 2.044, + "step": 3326 + }, + { + "epoch": 0.640469716293284, + "grad_norm": 2.750696515497026, + "learning_rate": 1.5897510086144374e-05, + "loss": 2.0355, + "step": 3327 + }, + { + "epoch": 0.6406622229708593, + "grad_norm": 2.808866913889281, + "learning_rate": 1.5894991832554926e-05, + "loss": 2.0258, + "step": 3328 + }, + { + "epoch": 0.6406622229708593, + "lm_loss": 1.6966, + "step": 3328, + "vm_loss": 0.1142 + }, + { + "epoch": 0.6406622229708593, + "lm_loss": 2.1342, + "step": 3328, + "vm_loss": 0.1944 + }, + { + "epoch": 0.6406622229708593, + "lm_loss": 1.7212, + "step": 3328, + "vm_loss": 0.1472 + }, + { + "epoch": 0.6406622229708593, + "lm_loss": 1.7517, + "step": 3328, + "vm_loss": 0.1997 + }, + { + "epoch": 0.6406622229708593, + "lm_loss": 1.4576, + "step": 3328, + "vm_loss": 0.2306 + }, + { + "epoch": 0.6406622229708593, + "lm_loss": 1.7404, + "step": 3328, + "vm_loss": 0.1969 + }, + { + "epoch": 0.6406622229708593, + "lm_loss": 2.0482, + "step": 3328, + "vm_loss": 0.1318 + }, + { + "epoch": 0.6406622229708593, + "lm_loss": 1.7131, + "step": 3328, + "vm_loss": 0.1975 + }, + { + "epoch": 0.6408547296484347, + "grad_norm": 3.042957276070317, + "learning_rate": 1.589247300589685e-05, + "loss": 2.1082, + "step": 3329 + }, + { + "epoch": 0.64104723632601, + "grad_norm": 2.935121582805986, + "learning_rate": 1.5889953606415014e-05, + "loss": 2.0608, + "step": 3330 + }, + { + "epoch": 0.6412397430035854, + "grad_norm": 3.0704924786474366, + "learning_rate": 1.5887433634354335e-05, + "loss": 2.0611, + "step": 3331 + }, + { + "epoch": 0.6414322496811609, + "grad_norm": 2.8471577273649773, + "learning_rate": 1.5884913089959778e-05, + "loss": 2.0239, + "step": 3332 + }, + { + "epoch": 0.6416247563587362, + "grad_norm": 2.8555693950657925, + "learning_rate": 1.588239197347638e-05, + "loss": 2.0545, + "step": 3333 + }, + { + "epoch": 0.6418172630363116, + "grad_norm": 3.0491748486611576, + "learning_rate": 1.5879870285149232e-05, + "loss": 2.0432, + "step": 3334 + }, + { + "epoch": 0.6420097697138869, + "grad_norm": 2.822007576139157, + "learning_rate": 1.5877348025223458e-05, + "loss": 2.0804, + "step": 3335 + }, + { + "epoch": 0.6422022763914623, + "grad_norm": 2.9273675253440583, + "learning_rate": 1.5874825193944264e-05, + "loss": 2.0267, + "step": 3336 + }, + { + "epoch": 0.6422022763914623, + "lm_loss": 1.7186, + "step": 3336, + "vm_loss": 0.1408 + }, + { + "epoch": 0.6422022763914623, + "lm_loss": 2.0483, + "step": 3336, + "vm_loss": 0.2134 + }, + { + "epoch": 0.6422022763914623, + "lm_loss": 1.8428, + "step": 3336, + "vm_loss": 0.1924 + }, + { + "epoch": 0.6422022763914623, + "lm_loss": 1.7132, + "step": 3336, + "vm_loss": 0.1692 + }, + { + "epoch": 0.6422022763914623, + "lm_loss": 1.639, + "step": 3336, + "vm_loss": 0.117 + }, + { + "epoch": 0.6422022763914623, + "lm_loss": 1.7427, + "step": 3336, + "vm_loss": 0.2607 + }, + { + "epoch": 0.6422022763914623, + "lm_loss": 1.6197, + "step": 3336, + "vm_loss": 0.1647 + }, + { + "epoch": 0.6422022763914623, + "lm_loss": 1.4923, + "step": 3336, + "vm_loss": 0.1467 + }, + { + "epoch": 0.6423947830690377, + "grad_norm": 2.9028976799538664, + "learning_rate": 1.5872301791556904e-05, + "loss": 2.0061, + "step": 3337 + }, + { + "epoch": 0.642587289746613, + "grad_norm": 3.106169298332078, + "learning_rate": 1.586977781830668e-05, + "loss": 2.0524, + "step": 3338 + }, + { + "epoch": 0.6427797964241885, + "grad_norm": 3.0360476401193814, + "learning_rate": 1.5867253274438954e-05, + "loss": 2.1155, + "step": 3339 + }, + { + "epoch": 0.6429723031017639, + "grad_norm": 2.9732666727001673, + "learning_rate": 1.5864728160199145e-05, + "loss": 2.0105, + "step": 3340 + }, + { + "epoch": 0.6431648097793392, + "grad_norm": 3.057200243234311, + "learning_rate": 1.586220247583273e-05, + "loss": 2.0101, + "step": 3341 + }, + { + "epoch": 0.6433573164569146, + "grad_norm": 2.93237720166904, + "learning_rate": 1.5859676221585233e-05, + "loss": 2.0244, + "step": 3342 + }, + { + "epoch": 0.6435498231344899, + "grad_norm": 2.7617260927879292, + "learning_rate": 1.585714939770224e-05, + "loss": 2.0318, + "step": 3343 + }, + { + "epoch": 0.6437423298120654, + "grad_norm": 2.8552648221139925, + "learning_rate": 1.5854622004429392e-05, + "loss": 2.0279, + "step": 3344 + }, + { + "epoch": 0.6437423298120654, + "lm_loss": 1.2019, + "step": 3344, + "vm_loss": 0.1113 + }, + { + "epoch": 0.6437423298120654, + "lm_loss": 1.5573, + "step": 3344, + "vm_loss": 0.1667 + }, + { + "epoch": 0.6437423298120654, + "lm_loss": 1.3952, + "step": 3344, + "vm_loss": 0.2025 + }, + { + "epoch": 0.6437423298120654, + "lm_loss": 1.523, + "step": 3344, + "vm_loss": 0.1598 + }, + { + "epoch": 0.6437423298120654, + "lm_loss": 1.6737, + "step": 3344, + "vm_loss": 0.1813 + }, + { + "epoch": 0.6437423298120654, + "lm_loss": 1.987, + "step": 3344, + "vm_loss": 0.1396 + }, + { + "epoch": 0.6437423298120654, + "lm_loss": 1.3552, + "step": 3344, + "vm_loss": 0.1644 + }, + { + "epoch": 0.6437423298120654, + "lm_loss": 1.9727, + "step": 3344, + "vm_loss": 0.1748 + }, + { + "epoch": 0.6439348364896408, + "grad_norm": 2.8998247702239968, + "learning_rate": 1.5852094042012384e-05, + "loss": 1.9637, + "step": 3345 + }, + { + "epoch": 0.6441273431672161, + "grad_norm": 3.0268684376395307, + "learning_rate": 1.584956551069696e-05, + "loss": 2.0307, + "step": 3346 + }, + { + "epoch": 0.6443198498447915, + "grad_norm": 2.8674829263791044, + "learning_rate": 1.5847036410728933e-05, + "loss": 2.0548, + "step": 3347 + }, + { + "epoch": 0.6445123565223668, + "grad_norm": 2.874074737392052, + "learning_rate": 1.5844506742354163e-05, + "loss": 1.9918, + "step": 3348 + }, + { + "epoch": 0.6447048631999422, + "grad_norm": 2.8648274573427064, + "learning_rate": 1.584197650581856e-05, + "loss": 2.0863, + "step": 3349 + }, + { + "epoch": 0.6448973698775177, + "grad_norm": 2.7674233658501364, + "learning_rate": 1.5839445701368104e-05, + "loss": 2.0943, + "step": 3350 + }, + { + "epoch": 0.645089876555093, + "grad_norm": 2.8961242482338214, + "learning_rate": 1.5836914329248817e-05, + "loss": 2.0335, + "step": 3351 + }, + { + "epoch": 0.6452823832326684, + "grad_norm": 2.9747271683162566, + "learning_rate": 1.583438238970678e-05, + "loss": 2.0065, + "step": 3352 + }, + { + "epoch": 0.6452823832326684, + "lm_loss": 1.688, + "step": 3352, + "vm_loss": 0.1924 + }, + { + "epoch": 0.6452823832326684, + "lm_loss": 2.1008, + "step": 3352, + "vm_loss": 0.2152 + }, + { + "epoch": 0.6452823832326684, + "lm_loss": 1.8394, + "step": 3352, + "vm_loss": 0.1654 + }, + { + "epoch": 0.6452823832326684, + "lm_loss": 1.4674, + "step": 3352, + "vm_loss": 0.2615 + }, + { + "epoch": 0.6452823832326684, + "lm_loss": 1.8466, + "step": 3352, + "vm_loss": 0.1941 + }, + { + "epoch": 0.6452823832326684, + "lm_loss": 2.0024, + "step": 3352, + "vm_loss": 0.1742 + }, + { + "epoch": 0.6452823832326684, + "lm_loss": 2.1023, + "step": 3352, + "vm_loss": 0.2063 + }, + { + "epoch": 0.6452823832326684, + "lm_loss": 2.0437, + "step": 3352, + "vm_loss": 0.1241 + }, + { + "epoch": 0.6454748899102437, + "grad_norm": 2.9917526492886415, + "learning_rate": 1.5831849882988138e-05, + "loss": 2.1314, + "step": 3353 + }, + { + "epoch": 0.6456673965878191, + "grad_norm": 2.758164479447144, + "learning_rate": 1.5829316809339068e-05, + "loss": 1.9959, + "step": 3354 + }, + { + "epoch": 0.6458599032653946, + "grad_norm": 2.8806748694174558, + "learning_rate": 1.5826783169005827e-05, + "loss": 2.0012, + "step": 3355 + }, + { + "epoch": 0.6460524099429699, + "grad_norm": 2.92702907914621, + "learning_rate": 1.5824248962234715e-05, + "loss": 1.9718, + "step": 3356 + }, + { + "epoch": 0.6462449166205453, + "grad_norm": 2.9915276550254633, + "learning_rate": 1.5821714189272093e-05, + "loss": 2.0118, + "step": 3357 + }, + { + "epoch": 0.6464374232981207, + "grad_norm": 3.071158921315843, + "learning_rate": 1.581917885036437e-05, + "loss": 2.0091, + "step": 3358 + }, + { + "epoch": 0.646629929975696, + "grad_norm": 2.9865223380529216, + "learning_rate": 1.581664294575801e-05, + "loss": 2.081, + "step": 3359 + }, + { + "epoch": 0.6468224366532714, + "grad_norm": 2.874942712172753, + "learning_rate": 1.5814106475699546e-05, + "loss": 2.0613, + "step": 3360 + }, + { + "epoch": 0.6468224366532714, + "lm_loss": 1.6293, + "step": 3360, + "vm_loss": 0.1618 + }, + { + "epoch": 0.6468224366532714, + "lm_loss": 1.3367, + "step": 3360, + "vm_loss": 0.1434 + }, + { + "epoch": 0.6468224366532714, + "lm_loss": 2.2137, + "step": 3360, + "vm_loss": 0.2102 + }, + { + "epoch": 0.6468224366532714, + "lm_loss": 1.873, + "step": 3360, + "vm_loss": 0.1626 + }, + { + "epoch": 0.6468224366532714, + "lm_loss": 1.7447, + "step": 3360, + "vm_loss": 0.1653 + }, + { + "epoch": 0.6468224366532714, + "lm_loss": 1.9527, + "step": 3360, + "vm_loss": 0.1664 + }, + { + "epoch": 0.6468224366532714, + "lm_loss": 2.1401, + "step": 3360, + "vm_loss": 0.1738 + }, + { + "epoch": 0.6468224366532714, + "lm_loss": 1.7832, + "step": 3360, + "vm_loss": 0.1749 + }, + { + "epoch": 0.6470149433308467, + "grad_norm": 2.965084940060805, + "learning_rate": 1.581156944043554e-05, + "loss": 2.0341, + "step": 3361 + }, + { + "epoch": 0.6472074500084222, + "grad_norm": 2.9535612731752185, + "learning_rate": 1.580903184021264e-05, + "loss": 2.0015, + "step": 3362 + }, + { + "epoch": 0.6473999566859976, + "grad_norm": 3.0264711841637504, + "learning_rate": 1.580649367527752e-05, + "loss": 2.0291, + "step": 3363 + }, + { + "epoch": 0.6475924633635729, + "grad_norm": 3.0282319074475303, + "learning_rate": 1.5803954945876933e-05, + "loss": 2.0478, + "step": 3364 + }, + { + "epoch": 0.6477849700411483, + "grad_norm": 3.0511656046216085, + "learning_rate": 1.580141565225767e-05, + "loss": 2.016, + "step": 3365 + }, + { + "epoch": 0.6479774767187236, + "grad_norm": 3.0070316032997515, + "learning_rate": 1.5798875794666582e-05, + "loss": 2.0287, + "step": 3366 + }, + { + "epoch": 0.648169983396299, + "grad_norm": 2.9736436104394195, + "learning_rate": 1.5796335373350577e-05, + "loss": 1.9982, + "step": 3367 + }, + { + "epoch": 0.6483624900738745, + "grad_norm": 2.851670807729729, + "learning_rate": 1.5793794388556617e-05, + "loss": 1.9933, + "step": 3368 + }, + { + "epoch": 0.6483624900738745, + "lm_loss": 1.6781, + "step": 3368, + "vm_loss": 0.196 + }, + { + "epoch": 0.6483624900738745, + "lm_loss": 1.9421, + "step": 3368, + "vm_loss": 0.145 + }, + { + "epoch": 0.6483624900738745, + "lm_loss": 1.5757, + "step": 3368, + "vm_loss": 0.167 + }, + { + "epoch": 0.6483624900738745, + "lm_loss": 1.7888, + "step": 3368, + "vm_loss": 0.1753 + }, + { + "epoch": 0.6483624900738745, + "lm_loss": 1.523, + "step": 3368, + "vm_loss": 0.153 + }, + { + "epoch": 0.6483624900738745, + "lm_loss": 1.9573, + "step": 3368, + "vm_loss": 0.1791 + }, + { + "epoch": 0.6483624900738745, + "lm_loss": 1.7266, + "step": 3368, + "vm_loss": 0.1826 + }, + { + "epoch": 0.6483624900738745, + "lm_loss": 1.8996, + "step": 3368, + "vm_loss": 0.185 + }, + { + "epoch": 0.6485549967514498, + "grad_norm": 2.8822716894955556, + "learning_rate": 1.579125284053172e-05, + "loss": 1.9607, + "step": 3369 + }, + { + "epoch": 0.6487475034290252, + "grad_norm": 2.969764721719698, + "learning_rate": 1.5788710729522953e-05, + "loss": 1.9954, + "step": 3370 + }, + { + "epoch": 0.6489400101066005, + "grad_norm": 3.0955256143874283, + "learning_rate": 1.5786168055777445e-05, + "loss": 2.065, + "step": 3371 + }, + { + "epoch": 0.6491325167841759, + "grad_norm": 3.0266748730683055, + "learning_rate": 1.578362481954238e-05, + "loss": 1.9835, + "step": 3372 + }, + { + "epoch": 0.6493250234617514, + "grad_norm": 2.81747653422955, + "learning_rate": 1.5781081021064983e-05, + "loss": 2.0244, + "step": 3373 + }, + { + "epoch": 0.6495175301393267, + "grad_norm": 2.735177166904523, + "learning_rate": 1.5778536660592545e-05, + "loss": 1.9463, + "step": 3374 + }, + { + "epoch": 0.6497100368169021, + "grad_norm": 2.8850286110875585, + "learning_rate": 1.577599173837242e-05, + "loss": 2.0777, + "step": 3375 + }, + { + "epoch": 0.6499025434944775, + "grad_norm": 3.108746343431595, + "learning_rate": 1.5773446254652002e-05, + "loss": 2.0573, + "step": 3376 + }, + { + "epoch": 0.6499025434944775, + "lm_loss": 2.055, + "step": 3376, + "vm_loss": 0.2294 + }, + { + "epoch": 0.6499025434944775, + "lm_loss": 2.026, + "step": 3376, + "vm_loss": 0.1396 + }, + { + "epoch": 0.6499025434944775, + "lm_loss": 2.3197, + "step": 3376, + "vm_loss": 0.203 + }, + { + "epoch": 0.6499025434944775, + "lm_loss": 1.7229, + "step": 3376, + "vm_loss": 0.1229 + }, + { + "epoch": 0.6499025434944775, + "lm_loss": 1.871, + "step": 3376, + "vm_loss": 0.1994 + }, + { + "epoch": 0.6499025434944775, + "lm_loss": 2.1273, + "step": 3376, + "vm_loss": 0.2058 + }, + { + "epoch": 0.6499025434944775, + "lm_loss": 1.6647, + "step": 3376, + "vm_loss": 0.1279 + }, + { + "epoch": 0.6499025434944775, + "lm_loss": 1.9205, + "step": 3376, + "vm_loss": 0.2001 + }, + { + "epoch": 0.6500950501720528, + "grad_norm": 2.8810753317904245, + "learning_rate": 1.5770900209678742e-05, + "loss": 1.9721, + "step": 3377 + }, + { + "epoch": 0.6502875568496282, + "grad_norm": 3.2565079248723743, + "learning_rate": 1.5768353603700152e-05, + "loss": 1.9951, + "step": 3378 + }, + { + "epoch": 0.6504800635272036, + "grad_norm": 3.4762835316162803, + "learning_rate": 1.5765806436963793e-05, + "loss": 2.0728, + "step": 3379 + }, + { + "epoch": 0.650672570204779, + "grad_norm": 2.819181062391101, + "learning_rate": 1.5763258709717282e-05, + "loss": 2.044, + "step": 3380 + }, + { + "epoch": 0.6508650768823544, + "grad_norm": 2.7087932661598733, + "learning_rate": 1.576071042220829e-05, + "loss": 2.0344, + "step": 3381 + }, + { + "epoch": 0.6510575835599297, + "grad_norm": 2.8468133262027124, + "learning_rate": 1.5758161574684544e-05, + "loss": 2.0382, + "step": 3382 + }, + { + "epoch": 0.6512500902375051, + "grad_norm": 3.1302616853320964, + "learning_rate": 1.5755612167393827e-05, + "loss": 2.0711, + "step": 3383 + }, + { + "epoch": 0.6514425969150804, + "grad_norm": 3.073042992911792, + "learning_rate": 1.5753062200583967e-05, + "loss": 2.1246, + "step": 3384 + }, + { + "epoch": 0.6514425969150804, + "lm_loss": 1.4069, + "step": 3384, + "vm_loss": 0.175 + }, + { + "epoch": 0.6514425969150804, + "lm_loss": 1.5835, + "step": 3384, + "vm_loss": 0.1673 + }, + { + "epoch": 0.6514425969150804, + "lm_loss": 1.9258, + "step": 3384, + "vm_loss": 0.1632 + }, + { + "epoch": 0.6514425969150804, + "lm_loss": 1.3249, + "step": 3384, + "vm_loss": 0.1813 + }, + { + "epoch": 0.6514425969150804, + "lm_loss": 2.1316, + "step": 3384, + "vm_loss": 0.124 + }, + { + "epoch": 0.6514425969150804, + "lm_loss": 1.77, + "step": 3384, + "vm_loss": 0.213 + }, + { + "epoch": 0.6514425969150804, + "lm_loss": 1.8358, + "step": 3384, + "vm_loss": 0.1748 + }, + { + "epoch": 0.6514425969150804, + "lm_loss": 1.9636, + "step": 3384, + "vm_loss": 0.1411 + }, + { + "epoch": 0.6516351035926559, + "grad_norm": 2.7719716067245286, + "learning_rate": 1.575051167450286e-05, + "loss": 1.9522, + "step": 3385 + }, + { + "epoch": 0.6518276102702313, + "grad_norm": 3.0614920779669066, + "learning_rate": 1.5747960589398447e-05, + "loss": 2.0054, + "step": 3386 + }, + { + "epoch": 0.6520201169478066, + "grad_norm": 2.9202469318987765, + "learning_rate": 1.574540894551873e-05, + "loss": 1.9906, + "step": 3387 + }, + { + "epoch": 0.652212623625382, + "grad_norm": 3.1519819154667212, + "learning_rate": 1.5742856743111753e-05, + "loss": 2.0739, + "step": 3388 + }, + { + "epoch": 0.6524051303029574, + "grad_norm": 2.928325350516434, + "learning_rate": 1.574030398242563e-05, + "loss": 1.9817, + "step": 3389 + }, + { + "epoch": 0.6525976369805327, + "grad_norm": 3.397517726884812, + "learning_rate": 1.5737750663708526e-05, + "loss": 2.055, + "step": 3390 + }, + { + "epoch": 0.6527901436581082, + "grad_norm": 3.0333572272710354, + "learning_rate": 1.573519678720864e-05, + "loss": 2.0125, + "step": 3391 + }, + { + "epoch": 0.6529826503356835, + "grad_norm": 2.7685766341728137, + "learning_rate": 1.5732642353174257e-05, + "loss": 2.0549, + "step": 3392 + }, + { + "epoch": 0.6529826503356835, + "lm_loss": 1.9299, + "step": 3392, + "vm_loss": 0.2183 + }, + { + "epoch": 0.6529826503356835, + "lm_loss": 1.5882, + "step": 3392, + "vm_loss": 0.2296 + }, + { + "epoch": 0.6529826503356835, + "lm_loss": 1.9038, + "step": 3392, + "vm_loss": 0.1091 + }, + { + "epoch": 0.6529826503356835, + "lm_loss": 1.8111, + "step": 3392, + "vm_loss": 0.1592 + }, + { + "epoch": 0.6529826503356835, + "lm_loss": 1.9049, + "step": 3392, + "vm_loss": 0.164 + }, + { + "epoch": 0.6529826503356835, + "lm_loss": 2.0374, + "step": 3392, + "vm_loss": 0.1526 + }, + { + "epoch": 0.6529826503356835, + "lm_loss": 1.7034, + "step": 3392, + "vm_loss": 0.1239 + }, + { + "epoch": 0.6529826503356835, + "lm_loss": 2.21, + "step": 3392, + "vm_loss": 0.2505 + }, + { + "epoch": 0.6531751570132589, + "grad_norm": 2.972010163136967, + "learning_rate": 1.5730087361853696e-05, + "loss": 2.0188, + "step": 3393 + }, + { + "epoch": 0.6533676636908343, + "grad_norm": 3.3182017029799837, + "learning_rate": 1.5727531813495335e-05, + "loss": 2.0668, + "step": 3394 + }, + { + "epoch": 0.6535601703684096, + "grad_norm": 3.03414851839463, + "learning_rate": 1.5724975708347603e-05, + "loss": 2.014, + "step": 3395 + }, + { + "epoch": 0.653752677045985, + "grad_norm": 2.8983963533454644, + "learning_rate": 1.572241904665899e-05, + "loss": 2.0301, + "step": 3396 + }, + { + "epoch": 0.6539451837235604, + "grad_norm": 2.9062250834043186, + "learning_rate": 1.5719861828678033e-05, + "loss": 2.0651, + "step": 3397 + }, + { + "epoch": 0.6541376904011358, + "grad_norm": 2.8714890842775183, + "learning_rate": 1.5717304054653327e-05, + "loss": 2.0567, + "step": 3398 + }, + { + "epoch": 0.6543301970787112, + "grad_norm": 2.968658438168832, + "learning_rate": 1.5714745724833524e-05, + "loss": 1.996, + "step": 3399 + }, + { + "epoch": 0.6545227037562865, + "grad_norm": 2.747876452270412, + "learning_rate": 1.571218683946732e-05, + "loss": 1.9906, + "step": 3400 + }, + { + "epoch": 0.6545227037562865, + "lm_loss": 1.7027, + "step": 3400, + "vm_loss": 0.1818 + }, + { + "epoch": 0.6545227037562865, + "lm_loss": 1.7728, + "step": 3400, + "vm_loss": 0.1313 + }, + { + "epoch": 0.6545227037562865, + "lm_loss": 1.5423, + "step": 3400, + "vm_loss": 0.1291 + }, + { + "epoch": 0.6545227037562865, + "lm_loss": 1.661, + "step": 3400, + "vm_loss": 0.2097 + }, + { + "epoch": 0.6545227037562865, + "lm_loss": 1.7011, + "step": 3400, + "vm_loss": 0.1775 + }, + { + "epoch": 0.6545227037562865, + "lm_loss": 2.2959, + "step": 3400, + "vm_loss": 0.205 + }, + { + "epoch": 0.6545227037562865, + "lm_loss": 1.7793, + "step": 3400, + "vm_loss": 0.1847 + }, + { + "epoch": 0.6545227037562865, + "lm_loss": 2.2202, + "step": 3400, + "vm_loss": 0.1501 + }, + { + "epoch": 0.6547152104338619, + "grad_norm": 2.9835769459797272, + "learning_rate": 1.570962739880348e-05, + "loss": 2.0532, + "step": 3401 + }, + { + "epoch": 0.6549077171114372, + "grad_norm": 3.150741838620774, + "learning_rate": 1.5707067403090803e-05, + "loss": 2.0205, + "step": 3402 + }, + { + "epoch": 0.6551002237890127, + "grad_norm": 3.4170029251909724, + "learning_rate": 1.5704506852578165e-05, + "loss": 2.0636, + "step": 3403 + }, + { + "epoch": 0.6552927304665881, + "grad_norm": 3.110253786012647, + "learning_rate": 1.5701945747514476e-05, + "loss": 2.0175, + "step": 3404 + }, + { + "epoch": 0.6554852371441634, + "grad_norm": 2.68279732289107, + "learning_rate": 1.5699384088148713e-05, + "loss": 2.0557, + "step": 3405 + }, + { + "epoch": 0.6556777438217388, + "grad_norm": 2.7271677324414854, + "learning_rate": 1.56968218747299e-05, + "loss": 2.0537, + "step": 3406 + }, + { + "epoch": 0.6558702504993142, + "grad_norm": 2.697876411616252, + "learning_rate": 1.5694259107507117e-05, + "loss": 2.051, + "step": 3407 + }, + { + "epoch": 0.6560627571768896, + "grad_norm": 2.9578479773799264, + "learning_rate": 1.5691695786729498e-05, + "loss": 1.9997, + "step": 3408 + }, + { + "epoch": 0.6560627571768896, + "lm_loss": 1.941, + "step": 3408, + "vm_loss": 0.1942 + }, + { + "epoch": 0.6560627571768896, + "lm_loss": 1.7293, + "step": 3408, + "vm_loss": 0.1856 + }, + { + "epoch": 0.6560627571768896, + "lm_loss": 2.207, + "step": 3408, + "vm_loss": 0.1518 + }, + { + "epoch": 0.6560627571768896, + "lm_loss": 1.6512, + "step": 3408, + "vm_loss": 0.1595 + }, + { + "epoch": 0.6560627571768896, + "lm_loss": 1.556, + "step": 3408, + "vm_loss": 0.1322 + }, + { + "epoch": 0.6560627571768896, + "lm_loss": 1.4449, + "step": 3408, + "vm_loss": 0.1835 + }, + { + "epoch": 0.6560627571768896, + "lm_loss": 1.6831, + "step": 3408, + "vm_loss": 0.2399 + }, + { + "epoch": 0.6560627571768896, + "lm_loss": 2.4147, + "step": 3408, + "vm_loss": 0.1806 + }, + { + "epoch": 0.656255263854465, + "grad_norm": 3.0444447262285594, + "learning_rate": 1.5689131912646227e-05, + "loss": 2.0458, + "step": 3409 + }, + { + "epoch": 0.6564477705320403, + "grad_norm": 3.006755721771176, + "learning_rate": 1.5686567485506552e-05, + "loss": 2.0522, + "step": 3410 + }, + { + "epoch": 0.6566402772096157, + "grad_norm": 2.6689143571135006, + "learning_rate": 1.5684002505559766e-05, + "loss": 2.0127, + "step": 3411 + }, + { + "epoch": 0.6568327838871911, + "grad_norm": 2.7023171450616124, + "learning_rate": 1.5681436973055214e-05, + "loss": 2.0361, + "step": 3412 + }, + { + "epoch": 0.6570252905647664, + "grad_norm": 2.709842390603092, + "learning_rate": 1.5678870888242304e-05, + "loss": 2.0, + "step": 3413 + }, + { + "epoch": 0.6572177972423419, + "grad_norm": 2.922918978783161, + "learning_rate": 1.567630425137049e-05, + "loss": 2.0158, + "step": 3414 + }, + { + "epoch": 0.6574103039199172, + "grad_norm": 2.8302159315183886, + "learning_rate": 1.567373706268928e-05, + "loss": 1.9985, + "step": 3415 + }, + { + "epoch": 0.6576028105974926, + "grad_norm": 3.027700207435978, + "learning_rate": 1.567116932244824e-05, + "loss": 2.0694, + "step": 3416 + }, + { + "epoch": 0.6576028105974926, + "lm_loss": 1.9269, + "step": 3416, + "vm_loss": 0.2624 + }, + { + "epoch": 0.6576028105974926, + "lm_loss": 2.2354, + "step": 3416, + "vm_loss": 0.1938 + }, + { + "epoch": 0.6576028105974926, + "lm_loss": 2.1576, + "step": 3416, + "vm_loss": 0.1619 + }, + { + "epoch": 0.6576028105974926, + "lm_loss": 1.8478, + "step": 3416, + "vm_loss": 0.1141 + }, + { + "epoch": 0.6576028105974926, + "lm_loss": 2.3587, + "step": 3416, + "vm_loss": 0.1562 + }, + { + "epoch": 0.6576028105974926, + "lm_loss": 1.9866, + "step": 3416, + "vm_loss": 0.1521 + }, + { + "epoch": 0.6576028105974926, + "lm_loss": 2.0243, + "step": 3416, + "vm_loss": 0.1826 + }, + { + "epoch": 0.6576028105974926, + "lm_loss": 1.775, + "step": 3416, + "vm_loss": 0.173 + }, + { + "epoch": 0.657795317275068, + "grad_norm": 2.847442502518517, + "learning_rate": 1.5668601030896987e-05, + "loss": 2.0471, + "step": 3417 + }, + { + "epoch": 0.6579878239526433, + "grad_norm": 2.9557472795179085, + "learning_rate": 1.5666032188285188e-05, + "loss": 1.8998, + "step": 3418 + }, + { + "epoch": 0.6581803306302187, + "grad_norm": 2.901403000409162, + "learning_rate": 1.5663462794862575e-05, + "loss": 2.0332, + "step": 3419 + }, + { + "epoch": 0.6583728373077942, + "grad_norm": 2.8418445004034814, + "learning_rate": 1.566089285087892e-05, + "loss": 2.0205, + "step": 3420 + }, + { + "epoch": 0.6585653439853695, + "grad_norm": 2.826850492192919, + "learning_rate": 1.565832235658405e-05, + "loss": 2.0542, + "step": 3421 + }, + { + "epoch": 0.6587578506629449, + "grad_norm": 2.7836412097920373, + "learning_rate": 1.5655751312227863e-05, + "loss": 2.0041, + "step": 3422 + }, + { + "epoch": 0.6589503573405202, + "grad_norm": 2.80879958368805, + "learning_rate": 1.565317971806029e-05, + "loss": 2.0289, + "step": 3423 + }, + { + "epoch": 0.6591428640180956, + "grad_norm": 2.725278332808272, + "learning_rate": 1.565060757433132e-05, + "loss": 1.9269, + "step": 3424 + }, + { + "epoch": 0.6591428640180956, + "lm_loss": 1.9155, + "step": 3424, + "vm_loss": 0.1946 + }, + { + "epoch": 0.6591428640180956, + "lm_loss": 1.6846, + "step": 3424, + "vm_loss": 0.1533 + }, + { + "epoch": 0.6591428640180956, + "lm_loss": 1.2645, + "step": 3424, + "vm_loss": 0.2341 + }, + { + "epoch": 0.6591428640180956, + "lm_loss": 1.5602, + "step": 3424, + "vm_loss": 0.1549 + }, + { + "epoch": 0.6591428640180956, + "lm_loss": 1.5456, + "step": 3424, + "vm_loss": 0.1333 + }, + { + "epoch": 0.6591428640180956, + "lm_loss": 1.817, + "step": 3424, + "vm_loss": 0.1339 + }, + { + "epoch": 0.6591428640180956, + "lm_loss": 1.5246, + "step": 3424, + "vm_loss": 0.18 + }, + { + "epoch": 0.6591428640180956, + "lm_loss": 1.4716, + "step": 3424, + "vm_loss": 0.2201 + }, + { + "epoch": 0.659335370695671, + "grad_norm": 2.987240946372525, + "learning_rate": 1.5648034881291005e-05, + "loss": 1.973, + "step": 3425 + }, + { + "epoch": 0.6595278773732464, + "grad_norm": 2.8525445130621248, + "learning_rate": 1.5645461639189437e-05, + "loss": 1.9432, + "step": 3426 + }, + { + "epoch": 0.6597203840508218, + "grad_norm": 2.8161380609631794, + "learning_rate": 1.5642887848276775e-05, + "loss": 2.0197, + "step": 3427 + }, + { + "epoch": 0.6599128907283971, + "grad_norm": 2.805475129561156, + "learning_rate": 1.5640313508803217e-05, + "loss": 1.9881, + "step": 3428 + }, + { + "epoch": 0.6601053974059725, + "grad_norm": 2.88272232888688, + "learning_rate": 1.5637738621019026e-05, + "loss": 2.0012, + "step": 3429 + }, + { + "epoch": 0.6602979040835479, + "grad_norm": 2.8680119710289067, + "learning_rate": 1.5635163185174512e-05, + "loss": 1.9932, + "step": 3430 + }, + { + "epoch": 0.6604904107611232, + "grad_norm": 2.9984468936883126, + "learning_rate": 1.5632587201520042e-05, + "loss": 1.9723, + "step": 3431 + }, + { + "epoch": 0.6606829174386987, + "grad_norm": 2.7987091532425934, + "learning_rate": 1.5630010670306037e-05, + "loss": 2.0323, + "step": 3432 + }, + { + "epoch": 0.6606829174386987, + "lm_loss": 1.9288, + "step": 3432, + "vm_loss": 0.1695 + }, + { + "epoch": 0.6606829174386987, + "lm_loss": 1.6718, + "step": 3432, + "vm_loss": 0.1453 + }, + { + "epoch": 0.6606829174386987, + "lm_loss": 2.2938, + "step": 3432, + "vm_loss": 0.1843 + }, + { + "epoch": 0.6606829174386987, + "lm_loss": 1.6039, + "step": 3432, + "vm_loss": 0.1303 + }, + { + "epoch": 0.6606829174386987, + "lm_loss": 2.0922, + "step": 3432, + "vm_loss": 0.11 + }, + { + "epoch": 0.6606829174386987, + "lm_loss": 1.8396, + "step": 3432, + "vm_loss": 0.2052 + }, + { + "epoch": 0.6606829174386987, + "lm_loss": 1.5361, + "step": 3432, + "vm_loss": 0.0889 + }, + { + "epoch": 0.6606829174386987, + "lm_loss": 1.941, + "step": 3432, + "vm_loss": 0.1724 + }, + { + "epoch": 0.660875424116274, + "grad_norm": 2.8881863040351314, + "learning_rate": 1.5627433591782963e-05, + "loss": 2.0371, + "step": 3433 + }, + { + "epoch": 0.6610679307938494, + "grad_norm": 2.8017031745412315, + "learning_rate": 1.562485596620135e-05, + "loss": 1.9789, + "step": 3434 + }, + { + "epoch": 0.6612604374714248, + "grad_norm": 2.8428717915391193, + "learning_rate": 1.562227779381177e-05, + "loss": 1.9797, + "step": 3435 + }, + { + "epoch": 0.6614529441490001, + "grad_norm": 3.083704515675064, + "learning_rate": 1.5619699074864864e-05, + "loss": 2.0164, + "step": 3436 + }, + { + "epoch": 0.6616454508265756, + "grad_norm": 2.855110285247169, + "learning_rate": 1.5617119809611308e-05, + "loss": 2.0349, + "step": 3437 + }, + { + "epoch": 0.661837957504151, + "grad_norm": 2.944567963600729, + "learning_rate": 1.561453999830184e-05, + "loss": 2.0145, + "step": 3438 + }, + { + "epoch": 0.6620304641817263, + "grad_norm": 3.1023614657863194, + "learning_rate": 1.561195964118726e-05, + "loss": 1.988, + "step": 3439 + }, + { + "epoch": 0.6622229708593017, + "grad_norm": 2.8243815231457674, + "learning_rate": 1.56093787385184e-05, + "loss": 1.9731, + "step": 3440 + }, + { + "epoch": 0.6622229708593017, + "lm_loss": 2.1885, + "step": 3440, + "vm_loss": 0.1729 + }, + { + "epoch": 0.6622229708593017, + "lm_loss": 1.4917, + "step": 3440, + "vm_loss": 0.1868 + }, + { + "epoch": 0.6622229708593017, + "lm_loss": 1.6558, + "step": 3440, + "vm_loss": 0.1816 + }, + { + "epoch": 0.6622229708593017, + "lm_loss": 2.1221, + "step": 3440, + "vm_loss": 0.1363 + }, + { + "epoch": 0.6622229708593017, + "lm_loss": 2.1218, + "step": 3440, + "vm_loss": 0.136 + }, + { + "epoch": 0.6622229708593017, + "lm_loss": 1.8281, + "step": 3440, + "vm_loss": 0.1784 + }, + { + "epoch": 0.6622229708593017, + "lm_loss": 1.7897, + "step": 3440, + "vm_loss": 0.1588 + }, + { + "epoch": 0.6622229708593017, + "lm_loss": 2.1482, + "step": 3440, + "vm_loss": 0.103 + }, + { + "epoch": 0.662415477536877, + "grad_norm": 2.7397043699901813, + "learning_rate": 1.560679729054616e-05, + "loss": 1.991, + "step": 3441 + }, + { + "epoch": 0.6626079842144524, + "grad_norm": 3.0580459675589022, + "learning_rate": 1.5604215297521498e-05, + "loss": 2.0069, + "step": 3442 + }, + { + "epoch": 0.6628004908920279, + "grad_norm": 3.041994247593464, + "learning_rate": 1.5601632759695408e-05, + "loss": 1.979, + "step": 3443 + }, + { + "epoch": 0.6629929975696032, + "grad_norm": 2.9349032068403567, + "learning_rate": 1.5599049677318944e-05, + "loss": 1.9642, + "step": 3444 + }, + { + "epoch": 0.6631855042471786, + "grad_norm": 3.024445542281148, + "learning_rate": 1.5596466050643225e-05, + "loss": 2.0155, + "step": 3445 + }, + { + "epoch": 0.6633780109247539, + "grad_norm": 3.1267719054774523, + "learning_rate": 1.5593881879919403e-05, + "loss": 1.952, + "step": 3446 + }, + { + "epoch": 0.6635705176023293, + "grad_norm": 3.117584784320607, + "learning_rate": 1.5591297165398693e-05, + "loss": 2.028, + "step": 3447 + }, + { + "epoch": 0.6637630242799047, + "grad_norm": 2.9688443606527937, + "learning_rate": 1.5588711907332368e-05, + "loss": 2.0514, + "step": 3448 + }, + { + "epoch": 0.6637630242799047, + "lm_loss": 1.8242, + "step": 3448, + "vm_loss": 0.1973 + }, + { + "epoch": 0.6637630242799047, + "lm_loss": 1.5338, + "step": 3448, + "vm_loss": 0.1796 + }, + { + "epoch": 0.6637630242799047, + "lm_loss": 1.9291, + "step": 3448, + "vm_loss": 0.1517 + }, + { + "epoch": 0.6637630242799047, + "lm_loss": 2.2259, + "step": 3448, + "vm_loss": 0.1796 + }, + { + "epoch": 0.6637630242799047, + "lm_loss": 1.7583, + "step": 3448, + "vm_loss": 0.1308 + }, + { + "epoch": 0.6637630242799047, + "lm_loss": 1.8037, + "step": 3448, + "vm_loss": 0.1906 + }, + { + "epoch": 0.6637630242799047, + "lm_loss": 1.8267, + "step": 3448, + "vm_loss": 0.1229 + }, + { + "epoch": 0.6637630242799047, + "lm_loss": 1.9707, + "step": 3448, + "vm_loss": 0.115 + }, + { + "epoch": 0.6639555309574801, + "grad_norm": 3.0209883449226234, + "learning_rate": 1.5586126105971744e-05, + "loss": 1.9607, + "step": 3449 + }, + { + "epoch": 0.6641480376350555, + "grad_norm": 3.1492199274650203, + "learning_rate": 1.5583539761568194e-05, + "loss": 1.9427, + "step": 3450 + }, + { + "epoch": 0.6643405443126309, + "grad_norm": 2.9140437650949655, + "learning_rate": 1.558095287437315e-05, + "loss": 2.0092, + "step": 3451 + }, + { + "epoch": 0.6645330509902062, + "grad_norm": 3.1676452757072386, + "learning_rate": 1.5578365444638078e-05, + "loss": 1.9852, + "step": 3452 + }, + { + "epoch": 0.6647255576677816, + "grad_norm": 3.010447397114762, + "learning_rate": 1.557577747261452e-05, + "loss": 2.0513, + "step": 3453 + }, + { + "epoch": 0.6649180643453569, + "grad_norm": 3.022548399018991, + "learning_rate": 1.5573188958554056e-05, + "loss": 2.0029, + "step": 3454 + }, + { + "epoch": 0.6651105710229324, + "grad_norm": 2.845997979383061, + "learning_rate": 1.5570599902708325e-05, + "loss": 2.01, + "step": 3455 + }, + { + "epoch": 0.6653030777005078, + "grad_norm": 3.0622108945712543, + "learning_rate": 1.5568010305329012e-05, + "loss": 2.0546, + "step": 3456 + }, + { + "epoch": 0.6653030777005078, + "lm_loss": 2.2959, + "step": 3456, + "vm_loss": 0.1557 + }, + { + "epoch": 0.6653030777005078, + "lm_loss": 1.7777, + "step": 3456, + "vm_loss": 0.1111 + }, + { + "epoch": 0.6653030777005078, + "lm_loss": 1.508, + "step": 3456, + "vm_loss": 0.1607 + }, + { + "epoch": 0.6653030777005078, + "lm_loss": 1.6771, + "step": 3456, + "vm_loss": 0.1849 + }, + { + "epoch": 0.6653030777005078, + "lm_loss": 2.1553, + "step": 3456, + "vm_loss": 0.1373 + }, + { + "epoch": 0.6653030777005078, + "lm_loss": 1.6149, + "step": 3456, + "vm_loss": 0.1931 + }, + { + "epoch": 0.6653030777005078, + "lm_loss": 2.0503, + "step": 3456, + "vm_loss": 0.1533 + }, + { + "epoch": 0.6653030777005078, + "lm_loss": 1.5738, + "step": 3456, + "vm_loss": 0.1171 + }, + { + "epoch": 0.6654955843780831, + "grad_norm": 3.052977472133687, + "learning_rate": 1.5565420166667867e-05, + "loss": 2.0091, + "step": 3457 + }, + { + "epoch": 0.6656880910556585, + "grad_norm": 3.1487828529891964, + "learning_rate": 1.5562829486976675e-05, + "loss": 2.0333, + "step": 3458 + }, + { + "epoch": 0.6658805977332338, + "grad_norm": 2.8968394397514294, + "learning_rate": 1.5560238266507287e-05, + "loss": 2.005, + "step": 3459 + }, + { + "epoch": 0.6660731044108092, + "grad_norm": 2.877036579317055, + "learning_rate": 1.5557646505511605e-05, + "loss": 2.0145, + "step": 3460 + }, + { + "epoch": 0.6662656110883847, + "grad_norm": 2.7400517292784876, + "learning_rate": 1.555505420424158e-05, + "loss": 2.019, + "step": 3461 + }, + { + "epoch": 0.66645811776596, + "grad_norm": 3.2083964470678517, + "learning_rate": 1.5552461362949214e-05, + "loss": 1.9642, + "step": 3462 + }, + { + "epoch": 0.6666506244435354, + "grad_norm": 2.747255053376632, + "learning_rate": 1.5549867981886567e-05, + "loss": 2.0075, + "step": 3463 + }, + { + "epoch": 0.6668431311211107, + "grad_norm": 2.8665802575812127, + "learning_rate": 1.5547274061305753e-05, + "loss": 1.9852, + "step": 3464 + }, + { + "epoch": 0.6668431311211107, + "lm_loss": 2.3415, + "step": 3464, + "vm_loss": 0.1885 + }, + { + "epoch": 0.6668431311211107, + "lm_loss": 1.8594, + "step": 3464, + "vm_loss": 0.1746 + }, + { + "epoch": 0.6668431311211107, + "lm_loss": 1.8048, + "step": 3464, + "vm_loss": 0.1879 + }, + { + "epoch": 0.6668431311211107, + "lm_loss": 1.9043, + "step": 3464, + "vm_loss": 0.1953 + }, + { + "epoch": 0.6668431311211107, + "lm_loss": 1.9511, + "step": 3464, + "vm_loss": 0.2023 + }, + { + "epoch": 0.6668431311211107, + "lm_loss": 1.7978, + "step": 3464, + "vm_loss": 0.2501 + }, + { + "epoch": 0.6668431311211107, + "lm_loss": 1.6212, + "step": 3464, + "vm_loss": 0.1767 + }, + { + "epoch": 0.6668431311211107, + "lm_loss": 1.9876, + "step": 3464, + "vm_loss": 0.1522 + }, + { + "epoch": 0.6670356377986861, + "grad_norm": 2.9883267350361997, + "learning_rate": 1.5544679601458925e-05, + "loss": 2.01, + "step": 3465 + }, + { + "epoch": 0.6672281444762616, + "grad_norm": 3.021619780823117, + "learning_rate": 1.5542084602598305e-05, + "loss": 1.9351, + "step": 3466 + }, + { + "epoch": 0.6674206511538369, + "grad_norm": 3.0649062889123835, + "learning_rate": 1.5539489064976157e-05, + "loss": 2.0582, + "step": 3467 + }, + { + "epoch": 0.6676131578314123, + "grad_norm": 2.880175633373562, + "learning_rate": 1.55368929888448e-05, + "loss": 2.0809, + "step": 3468 + }, + { + "epoch": 0.6678056645089877, + "grad_norm": 2.7253570766901136, + "learning_rate": 1.553429637445661e-05, + "loss": 1.9596, + "step": 3469 + }, + { + "epoch": 0.667998171186563, + "grad_norm": 2.821924131951273, + "learning_rate": 1.5531699222064005e-05, + "loss": 2.0052, + "step": 3470 + }, + { + "epoch": 0.6681906778641384, + "grad_norm": 2.7434439103369708, + "learning_rate": 1.5529101531919465e-05, + "loss": 1.9824, + "step": 3471 + }, + { + "epoch": 0.6683831845417137, + "grad_norm": 3.0967602915104013, + "learning_rate": 1.5526503304275518e-05, + "loss": 2.0319, + "step": 3472 + }, + { + "epoch": 0.6683831845417137, + "lm_loss": 1.6055, + "step": 3472, + "vm_loss": 0.1326 + }, + { + "epoch": 0.6683831845417137, + "lm_loss": 1.9154, + "step": 3472, + "vm_loss": 0.1283 + }, + { + "epoch": 0.6683831845417137, + "lm_loss": 1.571, + "step": 3472, + "vm_loss": 0.1343 + }, + { + "epoch": 0.6683831845417137, + "lm_loss": 1.6757, + "step": 3472, + "vm_loss": 0.1865 + }, + { + "epoch": 0.6683831845417137, + "lm_loss": 1.7838, + "step": 3472, + "vm_loss": 0.1323 + }, + { + "epoch": 0.6683831845417137, + "lm_loss": 1.9148, + "step": 3472, + "vm_loss": 0.1684 + }, + { + "epoch": 0.6683831845417137, + "lm_loss": 1.8315, + "step": 3472, + "vm_loss": 0.1859 + }, + { + "epoch": 0.6683831845417137, + "lm_loss": 1.6614, + "step": 3472, + "vm_loss": 0.1662 + }, + { + "epoch": 0.6685756912192892, + "grad_norm": 3.081452169159797, + "learning_rate": 1.5523904539384746e-05, + "loss": 1.9643, + "step": 3473 + }, + { + "epoch": 0.6687681978968646, + "grad_norm": 3.2058894146864403, + "learning_rate": 1.5521305237499785e-05, + "loss": 2.0005, + "step": 3474 + }, + { + "epoch": 0.6689607045744399, + "grad_norm": 2.9795203766456693, + "learning_rate": 1.5518705398873312e-05, + "loss": 1.9782, + "step": 3475 + }, + { + "epoch": 0.6691532112520153, + "grad_norm": 2.7351741695000906, + "learning_rate": 1.551610502375807e-05, + "loss": 2.0507, + "step": 3476 + }, + { + "epoch": 0.6693457179295906, + "grad_norm": 2.67702214854956, + "learning_rate": 1.551350411240685e-05, + "loss": 2.0183, + "step": 3477 + }, + { + "epoch": 0.669538224607166, + "grad_norm": 2.7198653300284925, + "learning_rate": 1.551090266507249e-05, + "loss": 1.9985, + "step": 3478 + }, + { + "epoch": 0.6697307312847415, + "grad_norm": 2.8843065969743185, + "learning_rate": 1.5508300682007885e-05, + "loss": 1.9538, + "step": 3479 + }, + { + "epoch": 0.6699232379623168, + "grad_norm": 2.967227707552942, + "learning_rate": 1.5505698163465986e-05, + "loss": 2.0143, + "step": 3480 + }, + { + "epoch": 0.6699232379623168, + "lm_loss": 1.4283, + "step": 3480, + "vm_loss": 0.1867 + }, + { + "epoch": 0.6699232379623168, + "lm_loss": 1.9501, + "step": 3480, + "vm_loss": 0.1294 + }, + { + "epoch": 0.6699232379623168, + "lm_loss": 1.8329, + "step": 3480, + "vm_loss": 0.1613 + }, + { + "epoch": 0.6699232379623168, + "lm_loss": 1.8189, + "step": 3480, + "vm_loss": 0.1652 + }, + { + "epoch": 0.6699232379623168, + "lm_loss": 2.2648, + "step": 3480, + "vm_loss": 0.1428 + }, + { + "epoch": 0.6699232379623168, + "lm_loss": 2.2595, + "step": 3480, + "vm_loss": 0.1519 + }, + { + "epoch": 0.6699232379623168, + "lm_loss": 2.2414, + "step": 3480, + "vm_loss": 0.1374 + }, + { + "epoch": 0.6699232379623168, + "lm_loss": 1.6074, + "step": 3480, + "vm_loss": 0.1742 + }, + { + "epoch": 0.6701157446398922, + "grad_norm": 3.412154765084055, + "learning_rate": 1.5503095109699783e-05, + "loss": 1.9997, + "step": 3481 + }, + { + "epoch": 0.6703082513174676, + "grad_norm": 3.0734874352038335, + "learning_rate": 1.5500491520962333e-05, + "loss": 2.0571, + "step": 3482 + }, + { + "epoch": 0.6705007579950429, + "grad_norm": 2.8309247108213307, + "learning_rate": 1.5497887397506735e-05, + "loss": 2.0329, + "step": 3483 + }, + { + "epoch": 0.6706932646726184, + "grad_norm": 2.9852449930169342, + "learning_rate": 1.549528273958614e-05, + "loss": 1.9516, + "step": 3484 + }, + { + "epoch": 0.6708857713501937, + "grad_norm": 2.751853182656892, + "learning_rate": 1.549267754745376e-05, + "loss": 1.9794, + "step": 3485 + }, + { + "epoch": 0.6710782780277691, + "grad_norm": 2.8104850029035804, + "learning_rate": 1.5490071821362855e-05, + "loss": 1.9668, + "step": 3486 + }, + { + "epoch": 0.6712707847053445, + "grad_norm": 3.1035222028992684, + "learning_rate": 1.5487465561566725e-05, + "loss": 1.9581, + "step": 3487 + }, + { + "epoch": 0.6714632913829198, + "grad_norm": 3.1953144326479324, + "learning_rate": 1.5484858768318738e-05, + "loss": 1.9561, + "step": 3488 + }, + { + "epoch": 0.6714632913829198, + "lm_loss": 1.86, + "step": 3488, + "vm_loss": 0.1554 + }, + { + "epoch": 0.6714632913829198, + "lm_loss": 1.9647, + "step": 3488, + "vm_loss": 0.2146 + }, + { + "epoch": 0.6714632913829198, + "lm_loss": 1.919, + "step": 3488, + "vm_loss": 0.1816 + }, + { + "epoch": 0.6714632913829198, + "lm_loss": 1.9231, + "step": 3488, + "vm_loss": 0.2286 + }, + { + "epoch": 0.6714632913829198, + "lm_loss": 1.3727, + "step": 3488, + "vm_loss": 0.1834 + }, + { + "epoch": 0.6714632913829198, + "lm_loss": 1.5676, + "step": 3488, + "vm_loss": 0.1194 + }, + { + "epoch": 0.6714632913829198, + "lm_loss": 2.2973, + "step": 3488, + "vm_loss": 0.1768 + }, + { + "epoch": 0.6714632913829198, + "lm_loss": 2.233, + "step": 3488, + "vm_loss": 0.2083 + }, + { + "epoch": 0.6716557980604952, + "grad_norm": 3.04166470809487, + "learning_rate": 1.5482251441872306e-05, + "loss": 2.0468, + "step": 3489 + }, + { + "epoch": 0.6718483047380706, + "grad_norm": 3.145946815675012, + "learning_rate": 1.54796435824809e-05, + "loss": 2.0161, + "step": 3490 + }, + { + "epoch": 0.672040811415646, + "grad_norm": 2.822444943732931, + "learning_rate": 1.5477035190398028e-05, + "loss": 1.9597, + "step": 3491 + }, + { + "epoch": 0.6722333180932214, + "grad_norm": 2.9098668100064637, + "learning_rate": 1.5474426265877266e-05, + "loss": 1.9746, + "step": 3492 + }, + { + "epoch": 0.6724258247707967, + "grad_norm": 3.013575404485257, + "learning_rate": 1.5471816809172228e-05, + "loss": 1.9756, + "step": 3493 + }, + { + "epoch": 0.6726183314483721, + "grad_norm": 2.9444460624530175, + "learning_rate": 1.5469206820536595e-05, + "loss": 2.0041, + "step": 3494 + }, + { + "epoch": 0.6728108381259474, + "grad_norm": 2.979265390403357, + "learning_rate": 1.5466596300224087e-05, + "loss": 2.0074, + "step": 3495 + }, + { + "epoch": 0.6730033448035229, + "grad_norm": 2.9324685917280675, + "learning_rate": 1.5463985248488484e-05, + "loss": 1.9547, + "step": 3496 + }, + { + "epoch": 0.6730033448035229, + "lm_loss": 1.9443, + "step": 3496, + "vm_loss": 0.0981 + }, + { + "epoch": 0.6730033448035229, + "lm_loss": 2.3861, + "step": 3496, + "vm_loss": 0.2585 + }, + { + "epoch": 0.6730033448035229, + "lm_loss": 1.8024, + "step": 3496, + "vm_loss": 0.1564 + }, + { + "epoch": 0.6730033448035229, + "lm_loss": 1.7392, + "step": 3496, + "vm_loss": 0.1647 + }, + { + "epoch": 0.6730033448035229, + "lm_loss": 2.0842, + "step": 3496, + "vm_loss": 0.194 + }, + { + "epoch": 0.6730033448035229, + "lm_loss": 1.962, + "step": 3496, + "vm_loss": 0.1648 + }, + { + "epoch": 0.6730033448035229, + "lm_loss": 2.1859, + "step": 3496, + "vm_loss": 0.1943 + }, + { + "epoch": 0.6730033448035229, + "lm_loss": 2.2246, + "step": 3496, + "vm_loss": 0.1247 + }, + { + "epoch": 0.6731958514810983, + "grad_norm": 3.1319824903009676, + "learning_rate": 1.5461373665583606e-05, + "loss": 2.033, + "step": 3497 + }, + { + "epoch": 0.6733883581586736, + "grad_norm": 2.936784868205389, + "learning_rate": 1.545876155176334e-05, + "loss": 2.0058, + "step": 3498 + }, + { + "epoch": 0.673580864836249, + "grad_norm": 3.0620857184280483, + "learning_rate": 1.545614890728161e-05, + "loss": 1.9886, + "step": 3499 + }, + { + "epoch": 0.6737733715138244, + "grad_norm": 2.8170381406077, + "learning_rate": 1.5453535732392397e-05, + "loss": 1.9922, + "step": 3500 + }, + { + "epoch": 0.6739658781913997, + "grad_norm": 2.86332818896035, + "learning_rate": 1.5450922027349747e-05, + "loss": 2.0093, + "step": 3501 + }, + { + "epoch": 0.6741583848689752, + "grad_norm": 2.7930804059793273, + "learning_rate": 1.5448307792407737e-05, + "loss": 2.0342, + "step": 3502 + }, + { + "epoch": 0.6743508915465505, + "grad_norm": 2.9960072303819256, + "learning_rate": 1.5445693027820503e-05, + "loss": 2.0626, + "step": 3503 + }, + { + "epoch": 0.6745433982241259, + "grad_norm": 3.110512417480102, + "learning_rate": 1.544307773384224e-05, + "loss": 1.9193, + "step": 3504 + }, + { + "epoch": 0.6745433982241259, + "lm_loss": 1.8012, + "step": 3504, + "vm_loss": 0.2022 + }, + { + "epoch": 0.6745433982241259, + "lm_loss": 1.7025, + "step": 3504, + "vm_loss": 0.1008 + }, + { + "epoch": 0.6745433982241259, + "lm_loss": 1.9561, + "step": 3504, + "vm_loss": 0.1838 + }, + { + "epoch": 0.6745433982241259, + "lm_loss": 1.6308, + "step": 3504, + "vm_loss": 0.1665 + }, + { + "epoch": 0.6745433982241259, + "lm_loss": 1.6486, + "step": 3504, + "vm_loss": 0.1161 + }, + { + "epoch": 0.6745433982241259, + "lm_loss": 1.7085, + "step": 3504, + "vm_loss": 0.1935 + }, + { + "epoch": 0.6745433982241259, + "lm_loss": 1.7881, + "step": 3504, + "vm_loss": 0.1775 + }, + { + "epoch": 0.6745433982241259, + "lm_loss": 1.8465, + "step": 3504, + "vm_loss": 0.1313 + }, + { + "epoch": 0.6747359049017013, + "grad_norm": 3.150743693277032, + "learning_rate": 1.544046191072718e-05, + "loss": 1.9663, + "step": 3505 + }, + { + "epoch": 0.6749284115792766, + "grad_norm": 2.8846130321638426, + "learning_rate": 1.543784555872962e-05, + "loss": 1.9214, + "step": 3506 + }, + { + "epoch": 0.675120918256852, + "grad_norm": 3.0299567165442176, + "learning_rate": 1.54352286781039e-05, + "loss": 1.9829, + "step": 3507 + }, + { + "epoch": 0.6753134249344274, + "grad_norm": 2.981331500565479, + "learning_rate": 1.5432611269104416e-05, + "loss": 1.9782, + "step": 3508 + }, + { + "epoch": 0.6755059316120028, + "grad_norm": 2.8495231835086816, + "learning_rate": 1.5429993331985613e-05, + "loss": 1.9637, + "step": 3509 + }, + { + "epoch": 0.6756984382895782, + "grad_norm": 2.9049199773040524, + "learning_rate": 1.5427374867001993e-05, + "loss": 1.9932, + "step": 3510 + }, + { + "epoch": 0.6758909449671535, + "grad_norm": 3.1018396989524217, + "learning_rate": 1.5424755874408095e-05, + "loss": 1.9572, + "step": 3511 + }, + { + "epoch": 0.6760834516447289, + "grad_norm": 2.872796161759113, + "learning_rate": 1.5422136354458522e-05, + "loss": 2.0115, + "step": 3512 + }, + { + "epoch": 0.6760834516447289, + "lm_loss": 1.3793, + "step": 3512, + "vm_loss": 0.1435 + }, + { + "epoch": 0.6760834516447289, + "lm_loss": 1.3485, + "step": 3512, + "vm_loss": 0.1347 + }, + { + "epoch": 0.6760834516447289, + "lm_loss": 1.6604, + "step": 3512, + "vm_loss": 0.1415 + }, + { + "epoch": 0.6760834516447289, + "lm_loss": 1.3108, + "step": 3512, + "vm_loss": 0.1304 + }, + { + "epoch": 0.6760834516447289, + "lm_loss": 1.8638, + "step": 3512, + "vm_loss": 0.1582 + }, + { + "epoch": 0.6760834516447289, + "lm_loss": 1.6561, + "step": 3512, + "vm_loss": 0.161 + }, + { + "epoch": 0.6760834516447289, + "lm_loss": 1.938, + "step": 3512, + "vm_loss": 0.1078 + }, + { + "epoch": 0.6760834516447289, + "lm_loss": 2.223, + "step": 3512, + "vm_loss": 0.1862 + }, + { + "epoch": 0.6762759583223044, + "grad_norm": 2.7047768775248153, + "learning_rate": 1.5419516307407925e-05, + "loss": 1.9297, + "step": 3513 + }, + { + "epoch": 0.6764684649998797, + "grad_norm": 2.7927486272194435, + "learning_rate": 1.541689573351101e-05, + "loss": 2.1054, + "step": 3514 + }, + { + "epoch": 0.6766609716774551, + "grad_norm": 3.0514903264849935, + "learning_rate": 1.5414274633022533e-05, + "loss": 1.9215, + "step": 3515 + }, + { + "epoch": 0.6768534783550304, + "grad_norm": 2.7581916942521696, + "learning_rate": 1.5411653006197284e-05, + "loss": 2.0527, + "step": 3516 + }, + { + "epoch": 0.6770459850326058, + "grad_norm": 2.9838382743238934, + "learning_rate": 1.540903085329013e-05, + "loss": 2.0438, + "step": 3517 + }, + { + "epoch": 0.6772384917101812, + "grad_norm": 2.872790819795221, + "learning_rate": 1.5406408174555978e-05, + "loss": 1.9362, + "step": 3518 + }, + { + "epoch": 0.6774309983877566, + "grad_norm": 2.8707493577267456, + "learning_rate": 1.540378497024978e-05, + "loss": 1.9643, + "step": 3519 + }, + { + "epoch": 0.677623505065332, + "grad_norm": 2.89725179771934, + "learning_rate": 1.540116124062655e-05, + "loss": 2.0444, + "step": 3520 + }, + { + "epoch": 0.677623505065332, + "lm_loss": 1.7288, + "step": 3520, + "vm_loss": 0.122 + }, + { + "epoch": 0.677623505065332, + "lm_loss": 1.9892, + "step": 3520, + "vm_loss": 0.1651 + }, + { + "epoch": 0.677623505065332, + "lm_loss": 2.0788, + "step": 3520, + "vm_loss": 0.18 + }, + { + "epoch": 0.677623505065332, + "lm_loss": 1.7419, + "step": 3520, + "vm_loss": 0.1279 + }, + { + "epoch": 0.677623505065332, + "lm_loss": 1.8654, + "step": 3520, + "vm_loss": 0.1807 + }, + { + "epoch": 0.677623505065332, + "lm_loss": 1.9472, + "step": 3520, + "vm_loss": 0.2365 + }, + { + "epoch": 0.677623505065332, + "lm_loss": 1.8347, + "step": 3520, + "vm_loss": 0.1421 + }, + { + "epoch": 0.677623505065332, + "lm_loss": 2.0136, + "step": 3520, + "vm_loss": 0.137 + }, + { + "epoch": 0.6778160117429073, + "grad_norm": 2.8276560865305944, + "learning_rate": 1.5398536985941347e-05, + "loss": 2.0026, + "step": 3521 + }, + { + "epoch": 0.6780085184204827, + "grad_norm": 2.9067846424046593, + "learning_rate": 1.539591220644928e-05, + "loss": 1.9606, + "step": 3522 + }, + { + "epoch": 0.6782010250980581, + "grad_norm": 2.8439958339052245, + "learning_rate": 1.5393286902405513e-05, + "loss": 1.9776, + "step": 3523 + }, + { + "epoch": 0.6783935317756334, + "grad_norm": 2.8976549312203863, + "learning_rate": 1.5390661074065257e-05, + "loss": 2.0439, + "step": 3524 + }, + { + "epoch": 0.6785860384532089, + "grad_norm": 3.1006514730406867, + "learning_rate": 1.5388034721683783e-05, + "loss": 1.9781, + "step": 3525 + }, + { + "epoch": 0.6787785451307842, + "grad_norm": 2.7939585585356217, + "learning_rate": 1.5385407845516395e-05, + "loss": 1.9368, + "step": 3526 + }, + { + "epoch": 0.6789710518083596, + "grad_norm": 2.8590422138281264, + "learning_rate": 1.5382780445818468e-05, + "loss": 1.955, + "step": 3527 + }, + { + "epoch": 0.679163558485935, + "grad_norm": 3.065389616390679, + "learning_rate": 1.5380152522845416e-05, + "loss": 2.039, + "step": 3528 + }, + { + "epoch": 0.679163558485935, + "lm_loss": 1.68, + "step": 3528, + "vm_loss": 0.1275 + }, + { + "epoch": 0.679163558485935, + "lm_loss": 1.7364, + "step": 3528, + "vm_loss": 0.1166 + }, + { + "epoch": 0.679163558485935, + "lm_loss": 2.5211, + "step": 3528, + "vm_loss": 0.1935 + }, + { + "epoch": 0.679163558485935, + "lm_loss": 2.186, + "step": 3528, + "vm_loss": 0.2242 + }, + { + "epoch": 0.679163558485935, + "lm_loss": 1.5331, + "step": 3528, + "vm_loss": 0.1704 + }, + { + "epoch": 0.679163558485935, + "lm_loss": 1.727, + "step": 3528, + "vm_loss": 0.2105 + }, + { + "epoch": 0.679163558485935, + "lm_loss": 2.0293, + "step": 3528, + "vm_loss": 0.1568 + }, + { + "epoch": 0.679163558485935, + "lm_loss": 1.4385, + "step": 3528, + "vm_loss": 0.1711 + }, + { + "epoch": 0.6793560651635103, + "grad_norm": 3.14617451915695, + "learning_rate": 1.5377524076852704e-05, + "loss": 1.948, + "step": 3529 + }, + { + "epoch": 0.6795485718410857, + "grad_norm": 2.876815650812825, + "learning_rate": 1.5374895108095858e-05, + "loss": 1.9959, + "step": 3530 + }, + { + "epoch": 0.6797410785186612, + "grad_norm": 2.995124562180894, + "learning_rate": 1.537226561683044e-05, + "loss": 2.0018, + "step": 3531 + }, + { + "epoch": 0.6799335851962365, + "grad_norm": 2.8711812447192915, + "learning_rate": 1.5369635603312073e-05, + "loss": 1.9582, + "step": 3532 + }, + { + "epoch": 0.6801260918738119, + "grad_norm": 2.8699348550468917, + "learning_rate": 1.5367005067796426e-05, + "loss": 2.0119, + "step": 3533 + }, + { + "epoch": 0.6803185985513872, + "grad_norm": 2.7712292799927036, + "learning_rate": 1.5364374010539225e-05, + "loss": 1.9118, + "step": 3534 + }, + { + "epoch": 0.6805111052289626, + "grad_norm": 2.7486627795720633, + "learning_rate": 1.5361742431796235e-05, + "loss": 2.0, + "step": 3535 + }, + { + "epoch": 0.680703611906538, + "grad_norm": 2.8599852390444416, + "learning_rate": 1.535911033182329e-05, + "loss": 1.8943, + "step": 3536 + }, + { + "epoch": 0.680703611906538, + "lm_loss": 1.753, + "step": 3536, + "vm_loss": 0.1128 + }, + { + "epoch": 0.680703611906538, + "lm_loss": 2.017, + "step": 3536, + "vm_loss": 0.1309 + }, + { + "epoch": 0.680703611906538, + "lm_loss": 1.8967, + "step": 3536, + "vm_loss": 0.1209 + }, + { + "epoch": 0.680703611906538, + "lm_loss": 1.8915, + "step": 3536, + "vm_loss": 0.2178 + }, + { + "epoch": 0.680703611906538, + "lm_loss": 1.6509, + "step": 3536, + "vm_loss": 0.2328 + }, + { + "epoch": 0.680703611906538, + "lm_loss": 2.1272, + "step": 3536, + "vm_loss": 0.2106 + }, + { + "epoch": 0.680703611906538, + "lm_loss": 2.0965, + "step": 3536, + "vm_loss": 0.1305 + }, + { + "epoch": 0.680703611906538, + "lm_loss": 1.7545, + "step": 3536, + "vm_loss": 0.0953 + }, + { + "epoch": 0.6808961185841134, + "grad_norm": 3.0179392952066126, + "learning_rate": 1.5356477710876255e-05, + "loss": 1.9832, + "step": 3537 + }, + { + "epoch": 0.6810886252616888, + "grad_norm": 3.0031548743576773, + "learning_rate": 1.5353844569211056e-05, + "loss": 1.9833, + "step": 3538 + }, + { + "epoch": 0.6812811319392641, + "grad_norm": 2.765648008151935, + "learning_rate": 1.535121090708367e-05, + "loss": 1.8995, + "step": 3539 + }, + { + "epoch": 0.6814736386168395, + "grad_norm": 2.9755962524505675, + "learning_rate": 1.5348576724750123e-05, + "loss": 1.9483, + "step": 3540 + }, + { + "epoch": 0.6816661452944149, + "grad_norm": 2.9369714167325784, + "learning_rate": 1.5345942022466488e-05, + "loss": 2.0139, + "step": 3541 + }, + { + "epoch": 0.6818586519719902, + "grad_norm": 3.1180432527990414, + "learning_rate": 1.5343306800488896e-05, + "loss": 1.9913, + "step": 3542 + }, + { + "epoch": 0.6820511586495657, + "grad_norm": 3.0379557253781018, + "learning_rate": 1.534067105907352e-05, + "loss": 1.9596, + "step": 3543 + }, + { + "epoch": 0.682243665327141, + "grad_norm": 3.054172199435693, + "learning_rate": 1.533803479847659e-05, + "loss": 1.8881, + "step": 3544 + }, + { + "epoch": 0.682243665327141, + "lm_loss": 2.1323, + "step": 3544, + "vm_loss": 0.2054 + }, + { + "epoch": 0.682243665327141, + "lm_loss": 1.752, + "step": 3544, + "vm_loss": 0.1537 + }, + { + "epoch": 0.682243665327141, + "lm_loss": 1.884, + "step": 3544, + "vm_loss": 0.2146 + }, + { + "epoch": 0.682243665327141, + "lm_loss": 2.2649, + "step": 3544, + "vm_loss": 0.1704 + }, + { + "epoch": 0.682243665327141, + "lm_loss": 1.831, + "step": 3544, + "vm_loss": 0.2134 + }, + { + "epoch": 0.682243665327141, + "lm_loss": 1.8256, + "step": 3544, + "vm_loss": 0.1199 + }, + { + "epoch": 0.682243665327141, + "lm_loss": 1.8808, + "step": 3544, + "vm_loss": 0.2027 + }, + { + "epoch": 0.682243665327141, + "lm_loss": 1.8405, + "step": 3544, + "vm_loss": 0.1805 + }, + { + "epoch": 0.6824361720047164, + "grad_norm": 3.122498022349623, + "learning_rate": 1.5335398018954385e-05, + "loss": 1.9769, + "step": 3545 + }, + { + "epoch": 0.6826286786822918, + "grad_norm": 2.8998094669325027, + "learning_rate": 1.5332760720763232e-05, + "loss": 1.9797, + "step": 3546 + }, + { + "epoch": 0.6828211853598671, + "grad_norm": 2.94157799970006, + "learning_rate": 1.5330122904159513e-05, + "loss": 1.9511, + "step": 3547 + }, + { + "epoch": 0.6830136920374426, + "grad_norm": 3.0153973250992, + "learning_rate": 1.5327484569399655e-05, + "loss": 1.9852, + "step": 3548 + }, + { + "epoch": 0.683206198715018, + "grad_norm": 3.2771290967265823, + "learning_rate": 1.532484571674014e-05, + "loss": 1.9962, + "step": 3549 + }, + { + "epoch": 0.6833987053925933, + "grad_norm": 2.877853758691398, + "learning_rate": 1.5322206346437492e-05, + "loss": 1.9261, + "step": 3550 + }, + { + "epoch": 0.6835912120701687, + "grad_norm": 2.9096314659106577, + "learning_rate": 1.5319566458748303e-05, + "loss": 2.0331, + "step": 3551 + }, + { + "epoch": 0.683783718747744, + "grad_norm": 2.93229005971163, + "learning_rate": 1.5316926053929192e-05, + "loss": 1.9389, + "step": 3552 + }, + { + "epoch": 0.683783718747744, + "lm_loss": 1.6662, + "step": 3552, + "vm_loss": 0.195 + }, + { + "epoch": 0.683783718747744, + "lm_loss": 1.4992, + "step": 3552, + "vm_loss": 0.195 + }, + { + "epoch": 0.683783718747744, + "lm_loss": 1.9278, + "step": 3552, + "vm_loss": 0.1676 + }, + { + "epoch": 0.683783718747744, + "lm_loss": 2.0457, + "step": 3552, + "vm_loss": 0.1743 + }, + { + "epoch": 0.683783718747744, + "lm_loss": 1.9813, + "step": 3552, + "vm_loss": 0.1886 + }, + { + "epoch": 0.683783718747744, + "lm_loss": 1.6245, + "step": 3552, + "vm_loss": 0.1864 + }, + { + "epoch": 0.683783718747744, + "lm_loss": 2.0912, + "step": 3552, + "vm_loss": 0.1588 + }, + { + "epoch": 0.683783718747744, + "lm_loss": 1.5839, + "step": 3552, + "vm_loss": 0.176 + }, + { + "epoch": 0.6839762254253194, + "grad_norm": 3.093072813815383, + "learning_rate": 1.5314285132236845e-05, + "loss": 1.9847, + "step": 3553 + }, + { + "epoch": 0.6841687321028949, + "grad_norm": 2.848386084421865, + "learning_rate": 1.5311643693927997e-05, + "loss": 1.8948, + "step": 3554 + }, + { + "epoch": 0.6843612387804702, + "grad_norm": 3.191335796802072, + "learning_rate": 1.530900173925942e-05, + "loss": 1.9773, + "step": 3555 + }, + { + "epoch": 0.6845537454580456, + "grad_norm": 2.990210028114377, + "learning_rate": 1.5306359268487958e-05, + "loss": 1.9985, + "step": 3556 + }, + { + "epoch": 0.6847462521356209, + "grad_norm": 2.9093156860972815, + "learning_rate": 1.5303716281870485e-05, + "loss": 2.0383, + "step": 3557 + }, + { + "epoch": 0.6849387588131963, + "grad_norm": 2.8978387948492315, + "learning_rate": 1.5301072779663936e-05, + "loss": 1.9715, + "step": 3558 + }, + { + "epoch": 0.6851312654907717, + "grad_norm": 3.018804339661139, + "learning_rate": 1.5298428762125293e-05, + "loss": 1.9616, + "step": 3559 + }, + { + "epoch": 0.6853237721683471, + "grad_norm": 3.381703127663243, + "learning_rate": 1.5295784229511583e-05, + "loss": 1.9815, + "step": 3560 + }, + { + "epoch": 0.6853237721683471, + "lm_loss": 1.7722, + "step": 3560, + "vm_loss": 0.149 + }, + { + "epoch": 0.6853237721683471, + "lm_loss": 2.1165, + "step": 3560, + "vm_loss": 0.1991 + }, + { + "epoch": 0.6853237721683471, + "lm_loss": 1.4734, + "step": 3560, + "vm_loss": 0.1716 + }, + { + "epoch": 0.6853237721683471, + "lm_loss": 1.6724, + "step": 3560, + "vm_loss": 0.1423 + }, + { + "epoch": 0.6853237721683471, + "lm_loss": 2.1123, + "step": 3560, + "vm_loss": 0.1092 + }, + { + "epoch": 0.6853237721683471, + "lm_loss": 1.6136, + "step": 3560, + "vm_loss": 0.2185 + }, + { + "epoch": 0.6853237721683471, + "lm_loss": 1.9916, + "step": 3560, + "vm_loss": 0.1675 + }, + { + "epoch": 0.6853237721683471, + "lm_loss": 1.5897, + "step": 3560, + "vm_loss": 0.1461 + }, + { + "epoch": 0.6855162788459225, + "grad_norm": 2.830539865804742, + "learning_rate": 1.5293139182079894e-05, + "loss": 1.9684, + "step": 3561 + }, + { + "epoch": 0.6857087855234979, + "grad_norm": 2.9754972639752015, + "learning_rate": 1.5290493620087362e-05, + "loss": 1.9758, + "step": 3562 + }, + { + "epoch": 0.6859012922010732, + "grad_norm": 3.093228135430268, + "learning_rate": 1.5287847543791162e-05, + "loss": 2.0164, + "step": 3563 + }, + { + "epoch": 0.6860937988786486, + "grad_norm": 2.8552487592072344, + "learning_rate": 1.528520095344853e-05, + "loss": 1.9806, + "step": 3564 + }, + { + "epoch": 0.6862863055562239, + "grad_norm": 2.9246862871822805, + "learning_rate": 1.5282553849316744e-05, + "loss": 2.0243, + "step": 3565 + }, + { + "epoch": 0.6864788122337994, + "grad_norm": 3.1427723491893453, + "learning_rate": 1.5279906231653146e-05, + "loss": 1.989, + "step": 3566 + }, + { + "epoch": 0.6866713189113748, + "grad_norm": 2.9267215787326077, + "learning_rate": 1.527725810071511e-05, + "loss": 1.9616, + "step": 3567 + }, + { + "epoch": 0.6868638255889501, + "grad_norm": 3.178720792001327, + "learning_rate": 1.5274609456760073e-05, + "loss": 1.9679, + "step": 3568 + }, + { + "epoch": 0.6868638255889501, + "lm_loss": 2.0215, + "step": 3568, + "vm_loss": 0.2075 + }, + { + "epoch": 0.6868638255889501, + "lm_loss": 1.7344, + "step": 3568, + "vm_loss": 0.1682 + }, + { + "epoch": 0.6868638255889501, + "lm_loss": 1.9781, + "step": 3568, + "vm_loss": 0.1433 + }, + { + "epoch": 0.6868638255889501, + "lm_loss": 1.6267, + "step": 3568, + "vm_loss": 0.1624 + }, + { + "epoch": 0.6868638255889501, + "lm_loss": 1.6278, + "step": 3568, + "vm_loss": 0.1761 + }, + { + "epoch": 0.6868638255889501, + "lm_loss": 1.4253, + "step": 3568, + "vm_loss": 0.182 + }, + { + "epoch": 0.6868638255889501, + "lm_loss": 1.8945, + "step": 3568, + "vm_loss": 0.2569 + }, + { + "epoch": 0.6868638255889501, + "lm_loss": 1.9701, + "step": 3568, + "vm_loss": 0.1494 + }, + { + "epoch": 0.6870563322665255, + "grad_norm": 3.105526323629938, + "learning_rate": 1.5271960300045514e-05, + "loss": 1.9754, + "step": 3569 + }, + { + "epoch": 0.6872488389441008, + "grad_norm": 3.3532885915928636, + "learning_rate": 1.5269310630828963e-05, + "loss": 1.9227, + "step": 3570 + }, + { + "epoch": 0.6874413456216762, + "grad_norm": 3.043739984183519, + "learning_rate": 1.526666044936801e-05, + "loss": 2.012, + "step": 3571 + }, + { + "epoch": 0.6876338522992517, + "grad_norm": 2.9659747073530442, + "learning_rate": 1.5264009755920276e-05, + "loss": 2.0128, + "step": 3572 + }, + { + "epoch": 0.687826358976827, + "grad_norm": 3.355471471281819, + "learning_rate": 1.526135855074345e-05, + "loss": 1.9744, + "step": 3573 + }, + { + "epoch": 0.6880188656544024, + "grad_norm": 3.1248765784788493, + "learning_rate": 1.5258706834095256e-05, + "loss": 1.9422, + "step": 3574 + }, + { + "epoch": 0.6882113723319777, + "grad_norm": 2.857522659304052, + "learning_rate": 1.5256054606233481e-05, + "loss": 1.9944, + "step": 3575 + }, + { + "epoch": 0.6884038790095531, + "grad_norm": 3.1002121909964195, + "learning_rate": 1.5253401867415958e-05, + "loss": 1.9869, + "step": 3576 + }, + { + "epoch": 0.6884038790095531, + "lm_loss": 1.6013, + "step": 3576, + "vm_loss": 0.206 + }, + { + "epoch": 0.6884038790095531, + "lm_loss": 2.1497, + "step": 3576, + "vm_loss": 0.2021 + }, + { + "epoch": 0.6884038790095531, + "lm_loss": 2.248, + "step": 3576, + "vm_loss": 0.1278 + }, + { + "epoch": 0.6884038790095531, + "lm_loss": 1.8658, + "step": 3576, + "vm_loss": 0.1321 + }, + { + "epoch": 0.6884038790095531, + "lm_loss": 1.5132, + "step": 3576, + "vm_loss": 0.1748 + }, + { + "epoch": 0.6884038790095531, + "lm_loss": 1.2832, + "step": 3576, + "vm_loss": 0.1637 + }, + { + "epoch": 0.6884038790095531, + "lm_loss": 1.7683, + "step": 3576, + "vm_loss": 0.1757 + }, + { + "epoch": 0.6884038790095531, + "lm_loss": 1.7649, + "step": 3576, + "vm_loss": 0.1513 + }, + { + "epoch": 0.6885963856871286, + "grad_norm": 2.900853932298349, + "learning_rate": 1.525074861790056e-05, + "loss": 1.9473, + "step": 3577 + }, + { + "epoch": 0.6887888923647039, + "grad_norm": 2.836583621256632, + "learning_rate": 1.524809485794522e-05, + "loss": 1.9117, + "step": 3578 + }, + { + "epoch": 0.6889813990422793, + "grad_norm": 3.113230410017473, + "learning_rate": 1.5245440587807917e-05, + "loss": 1.9043, + "step": 3579 + }, + { + "epoch": 0.6891739057198547, + "grad_norm": 3.1593715705332692, + "learning_rate": 1.5242785807746678e-05, + "loss": 1.9086, + "step": 3580 + }, + { + "epoch": 0.68936641239743, + "grad_norm": 3.181101193388769, + "learning_rate": 1.5240130518019588e-05, + "loss": 1.9745, + "step": 3581 + }, + { + "epoch": 0.6895589190750054, + "grad_norm": 2.8789257814271627, + "learning_rate": 1.5237474718884767e-05, + "loss": 1.9761, + "step": 3582 + }, + { + "epoch": 0.6897514257525807, + "grad_norm": 3.1152786611065135, + "learning_rate": 1.5234818410600396e-05, + "loss": 1.936, + "step": 3583 + }, + { + "epoch": 0.6899439324301562, + "grad_norm": 2.876959467751351, + "learning_rate": 1.5232161593424704e-05, + "loss": 1.9802, + "step": 3584 + }, + { + "epoch": 0.6899439324301562, + "lm_loss": 1.9767, + "step": 3584, + "vm_loss": 0.2543 + }, + { + "epoch": 0.6899439324301562, + "lm_loss": 1.8907, + "step": 3584, + "vm_loss": 0.1914 + }, + { + "epoch": 0.6899439324301562, + "lm_loss": 1.5442, + "step": 3584, + "vm_loss": 0.1138 + }, + { + "epoch": 0.6899439324301562, + "lm_loss": 1.6459, + "step": 3584, + "vm_loss": 0.1347 + }, + { + "epoch": 0.6899439324301562, + "lm_loss": 1.7727, + "step": 3584, + "vm_loss": 0.1442 + }, + { + "epoch": 0.6899439324301562, + "lm_loss": 2.1747, + "step": 3584, + "vm_loss": 0.1148 + }, + { + "epoch": 0.6899439324301562, + "lm_loss": 1.8457, + "step": 3584, + "vm_loss": 0.1285 + }, + { + "epoch": 0.6899439324301562, + "lm_loss": 1.8709, + "step": 3584, + "vm_loss": 0.1535 + }, + { + "epoch": 0.6901364391077316, + "grad_norm": 2.8579243740427773, + "learning_rate": 1.5229504267615967e-05, + "loss": 1.9501, + "step": 3585 + }, + { + "epoch": 0.6903289457853069, + "grad_norm": 3.1640023759376334, + "learning_rate": 1.522684643343251e-05, + "loss": 1.9739, + "step": 3586 + }, + { + "epoch": 0.6905214524628823, + "grad_norm": 3.0110607080883125, + "learning_rate": 1.5224188091132708e-05, + "loss": 1.9444, + "step": 3587 + }, + { + "epoch": 0.6907139591404576, + "grad_norm": 2.99224492667875, + "learning_rate": 1.5221529240974987e-05, + "loss": 1.9935, + "step": 3588 + }, + { + "epoch": 0.690906465818033, + "grad_norm": 3.088337571848301, + "learning_rate": 1.5218869883217822e-05, + "loss": 1.9547, + "step": 3589 + }, + { + "epoch": 0.6910989724956085, + "grad_norm": 2.8448920643109803, + "learning_rate": 1.5216210018119735e-05, + "loss": 1.9707, + "step": 3590 + }, + { + "epoch": 0.6912914791731838, + "grad_norm": 2.8230195051109, + "learning_rate": 1.5213549645939299e-05, + "loss": 1.969, + "step": 3591 + }, + { + "epoch": 0.6914839858507592, + "grad_norm": 2.866308876996519, + "learning_rate": 1.5210888766935138e-05, + "loss": 1.9058, + "step": 3592 + }, + { + "epoch": 0.6914839858507592, + "lm_loss": 2.1078, + "step": 3592, + "vm_loss": 0.1299 + }, + { + "epoch": 0.6914839858507592, + "lm_loss": 1.6051, + "step": 3592, + "vm_loss": 0.1977 + }, + { + "epoch": 0.6914839858507592, + "lm_loss": 1.7529, + "step": 3592, + "vm_loss": 0.1874 + }, + { + "epoch": 0.6914839858507592, + "lm_loss": 1.5033, + "step": 3592, + "vm_loss": 0.0956 + }, + { + "epoch": 0.6914839858507592, + "lm_loss": 2.3358, + "step": 3592, + "vm_loss": 0.1135 + }, + { + "epoch": 0.6914839858507592, + "lm_loss": 1.9542, + "step": 3592, + "vm_loss": 0.1542 + }, + { + "epoch": 0.6914839858507592, + "lm_loss": 1.6266, + "step": 3592, + "vm_loss": 0.1122 + }, + { + "epoch": 0.6914839858507592, + "lm_loss": 2.1033, + "step": 3592, + "vm_loss": 0.1775 + }, + { + "epoch": 0.6916764925283346, + "grad_norm": 3.021169320973257, + "learning_rate": 1.5208227381365919e-05, + "loss": 1.9119, + "step": 3593 + }, + { + "epoch": 0.6918689992059099, + "grad_norm": 3.090184515345662, + "learning_rate": 1.5205565489490369e-05, + "loss": 1.9892, + "step": 3594 + }, + { + "epoch": 0.6920615058834854, + "grad_norm": 3.086627243066056, + "learning_rate": 1.5202903091567253e-05, + "loss": 1.931, + "step": 3595 + }, + { + "epoch": 0.6922540125610607, + "grad_norm": 2.9501333184256193, + "learning_rate": 1.5200240187855392e-05, + "loss": 1.92, + "step": 3596 + }, + { + "epoch": 0.6924465192386361, + "grad_norm": 2.824361407462446, + "learning_rate": 1.5197576778613657e-05, + "loss": 1.9343, + "step": 3597 + }, + { + "epoch": 0.6926390259162115, + "grad_norm": 2.8894982685009403, + "learning_rate": 1.519491286410096e-05, + "loss": 2.036, + "step": 3598 + }, + { + "epoch": 0.6928315325937868, + "grad_norm": 2.9326869747406703, + "learning_rate": 1.519224844457627e-05, + "loss": 1.9683, + "step": 3599 + }, + { + "epoch": 0.6930240392713622, + "grad_norm": 2.924726968764479, + "learning_rate": 1.5189583520298605e-05, + "loss": 1.898, + "step": 3600 + }, + { + "epoch": 0.6930240392713622, + "lm_loss": 1.5069, + "step": 3600, + "vm_loss": 0.1117 + }, + { + "epoch": 0.6930240392713622, + "lm_loss": 1.7374, + "step": 3600, + "vm_loss": 0.1463 + }, + { + "epoch": 0.6930240392713622, + "lm_loss": 1.9252, + "step": 3600, + "vm_loss": 0.2247 + }, + { + "epoch": 0.6930240392713622, + "lm_loss": 1.9606, + "step": 3600, + "vm_loss": 0.1098 + }, + { + "epoch": 0.6930240392713622, + "lm_loss": 1.3729, + "step": 3600, + "vm_loss": 0.1437 + }, + { + "epoch": 0.6930240392713622, + "lm_loss": 1.607, + "step": 3600, + "vm_loss": 0.137 + }, + { + "epoch": 0.6930240392713622, + "lm_loss": 1.6964, + "step": 3600, + "vm_loss": 0.1868 + }, + { + "epoch": 0.6930240392713622, + "lm_loss": 2.0102, + "step": 3600, + "vm_loss": 0.1337 + }, + { + "epoch": 0.6932165459489376, + "grad_norm": 3.0372860382335305, + "learning_rate": 1.5186918091527025e-05, + "loss": 1.9145, + "step": 3601 + }, + { + "epoch": 0.693409052626513, + "grad_norm": 2.9607599928161368, + "learning_rate": 1.5184252158520647e-05, + "loss": 1.8524, + "step": 3602 + }, + { + "epoch": 0.6936015593040884, + "grad_norm": 2.9089548754353394, + "learning_rate": 1.5181585721538636e-05, + "loss": 1.9669, + "step": 3603 + }, + { + "epoch": 0.6937940659816637, + "grad_norm": 2.819319271679664, + "learning_rate": 1.5178918780840198e-05, + "loss": 1.9424, + "step": 3604 + }, + { + "epoch": 0.6939865726592391, + "grad_norm": 2.974853590223117, + "learning_rate": 1.5176251336684597e-05, + "loss": 1.9874, + "step": 3605 + }, + { + "epoch": 0.6941790793368144, + "grad_norm": 2.8289683115907707, + "learning_rate": 1.5173583389331144e-05, + "loss": 1.9164, + "step": 3606 + }, + { + "epoch": 0.6943715860143899, + "grad_norm": 2.9868943352681194, + "learning_rate": 1.5170914939039195e-05, + "loss": 2.016, + "step": 3607 + }, + { + "epoch": 0.6945640926919653, + "grad_norm": 2.9452229250098387, + "learning_rate": 1.516824598606816e-05, + "loss": 1.987, + "step": 3608 + }, + { + "epoch": 0.6945640926919653, + "lm_loss": 1.8154, + "step": 3608, + "vm_loss": 0.1901 + }, + { + "epoch": 0.6945640926919653, + "lm_loss": 2.1617, + "step": 3608, + "vm_loss": 0.2547 + }, + { + "epoch": 0.6945640926919653, + "lm_loss": 1.9443, + "step": 3608, + "vm_loss": 0.1528 + }, + { + "epoch": 0.6945640926919653, + "lm_loss": 2.0248, + "step": 3608, + "vm_loss": 0.1722 + }, + { + "epoch": 0.6945640926919653, + "lm_loss": 2.1246, + "step": 3608, + "vm_loss": 0.1125 + }, + { + "epoch": 0.6945640926919653, + "lm_loss": 1.8375, + "step": 3608, + "vm_loss": 0.2023 + }, + { + "epoch": 0.6945640926919653, + "lm_loss": 2.3115, + "step": 3608, + "vm_loss": 0.1181 + }, + { + "epoch": 0.6945640926919653, + "lm_loss": 0.9842, + "step": 3608, + "vm_loss": 0.1557 + }, + { + "epoch": 0.6947565993695406, + "grad_norm": 2.948513754884626, + "learning_rate": 1.5165576530677492e-05, + "loss": 2.0207, + "step": 3609 + }, + { + "epoch": 0.694949106047116, + "grad_norm": 2.960162170971852, + "learning_rate": 1.5162906573126698e-05, + "loss": 1.9101, + "step": 3610 + }, + { + "epoch": 0.6951416127246914, + "grad_norm": 2.909200606908281, + "learning_rate": 1.5160236113675333e-05, + "loss": 1.9988, + "step": 3611 + }, + { + "epoch": 0.6953341194022667, + "grad_norm": 3.007786461709295, + "learning_rate": 1.5157565152583002e-05, + "loss": 1.9468, + "step": 3612 + }, + { + "epoch": 0.6955266260798422, + "grad_norm": 2.81787849759017, + "learning_rate": 1.5154893690109349e-05, + "loss": 2.0016, + "step": 3613 + }, + { + "epoch": 0.6957191327574175, + "grad_norm": 3.038201465904211, + "learning_rate": 1.5152221726514082e-05, + "loss": 1.9454, + "step": 3614 + }, + { + "epoch": 0.6959116394349929, + "grad_norm": 3.0036994997596316, + "learning_rate": 1.5149549262056946e-05, + "loss": 1.9501, + "step": 3615 + }, + { + "epoch": 0.6961041461125683, + "grad_norm": 3.1216605876343593, + "learning_rate": 1.5146876296997739e-05, + "loss": 1.9862, + "step": 3616 + }, + { + "epoch": 0.6961041461125683, + "lm_loss": 2.1122, + "step": 3616, + "vm_loss": 0.2121 + }, + { + "epoch": 0.6961041461125683, + "lm_loss": 1.6196, + "step": 3616, + "vm_loss": 0.158 + }, + { + "epoch": 0.6961041461125683, + "lm_loss": 1.8496, + "step": 3616, + "vm_loss": 0.2051 + }, + { + "epoch": 0.6961041461125683, + "lm_loss": 2.2242, + "step": 3616, + "vm_loss": 0.2501 + }, + { + "epoch": 0.6961041461125683, + "lm_loss": 1.4669, + "step": 3616, + "vm_loss": 0.1832 + }, + { + "epoch": 0.6961041461125683, + "lm_loss": 1.8816, + "step": 3616, + "vm_loss": 0.1626 + }, + { + "epoch": 0.6961041461125683, + "lm_loss": 1.359, + "step": 3616, + "vm_loss": 0.1237 + }, + { + "epoch": 0.6961041461125683, + "lm_loss": 1.988, + "step": 3616, + "vm_loss": 0.1436 + }, + { + "epoch": 0.6962966527901436, + "grad_norm": 3.048908171464537, + "learning_rate": 1.514420283159631e-05, + "loss": 1.9869, + "step": 3617 + }, + { + "epoch": 0.696489159467719, + "grad_norm": 2.780863729468389, + "learning_rate": 1.5141528866112551e-05, + "loss": 1.8739, + "step": 3618 + }, + { + "epoch": 0.6966816661452944, + "grad_norm": 2.915142335212109, + "learning_rate": 1.5138854400806408e-05, + "loss": 1.9529, + "step": 3619 + }, + { + "epoch": 0.6968741728228698, + "grad_norm": 3.060927670706388, + "learning_rate": 1.5136179435937871e-05, + "loss": 1.9732, + "step": 3620 + }, + { + "epoch": 0.6970666795004452, + "grad_norm": 2.7912706153539464, + "learning_rate": 1.513350397176698e-05, + "loss": 1.9412, + "step": 3621 + }, + { + "epoch": 0.6972591861780205, + "grad_norm": 2.868448117584272, + "learning_rate": 1.513082800855383e-05, + "loss": 2.0046, + "step": 3622 + }, + { + "epoch": 0.6974516928555959, + "grad_norm": 2.81072155310048, + "learning_rate": 1.5128151546558554e-05, + "loss": 1.9713, + "step": 3623 + }, + { + "epoch": 0.6976441995331714, + "grad_norm": 3.11880247067057, + "learning_rate": 1.5125474586041337e-05, + "loss": 1.9949, + "step": 3624 + }, + { + "epoch": 0.6976441995331714, + "lm_loss": 1.8441, + "step": 3624, + "vm_loss": 0.1263 + }, + { + "epoch": 0.6976441995331714, + "lm_loss": 1.8873, + "step": 3624, + "vm_loss": 0.1399 + }, + { + "epoch": 0.6976441995331714, + "lm_loss": 2.231, + "step": 3624, + "vm_loss": 0.1252 + }, + { + "epoch": 0.6976441995331714, + "lm_loss": 2.0389, + "step": 3624, + "vm_loss": 0.1539 + }, + { + "epoch": 0.6976441995331714, + "lm_loss": 2.0288, + "step": 3624, + "vm_loss": 0.161 + }, + { + "epoch": 0.6976441995331714, + "lm_loss": 1.7233, + "step": 3624, + "vm_loss": 0.1411 + }, + { + "epoch": 0.6976441995331714, + "lm_loss": 1.4897, + "step": 3624, + "vm_loss": 0.21 + }, + { + "epoch": 0.6976441995331714, + "lm_loss": 1.7322, + "step": 3624, + "vm_loss": 0.1589 + }, + { + "epoch": 0.6978367062107467, + "grad_norm": 2.8830523554256655, + "learning_rate": 1.5122797127262422e-05, + "loss": 1.9565, + "step": 3625 + }, + { + "epoch": 0.6980292128883221, + "grad_norm": 2.890140601798388, + "learning_rate": 1.5120119170482079e-05, + "loss": 1.9505, + "step": 3626 + }, + { + "epoch": 0.6982217195658974, + "grad_norm": 2.7371199214020074, + "learning_rate": 1.511744071596065e-05, + "loss": 1.9619, + "step": 3627 + }, + { + "epoch": 0.6984142262434728, + "grad_norm": 2.839582880806902, + "learning_rate": 1.5114761763958513e-05, + "loss": 1.9895, + "step": 3628 + }, + { + "epoch": 0.6986067329210482, + "grad_norm": 2.907063297765613, + "learning_rate": 1.5112082314736096e-05, + "loss": 2.0068, + "step": 3629 + }, + { + "epoch": 0.6987992395986236, + "grad_norm": 2.9998919856400037, + "learning_rate": 1.5109402368553876e-05, + "loss": 2.0009, + "step": 3630 + }, + { + "epoch": 0.698991746276199, + "grad_norm": 2.948567062565866, + "learning_rate": 1.5106721925672379e-05, + "loss": 1.9757, + "step": 3631 + }, + { + "epoch": 0.6991842529537743, + "grad_norm": 2.8687635027866394, + "learning_rate": 1.5104040986352173e-05, + "loss": 1.8674, + "step": 3632 + }, + { + "epoch": 0.6991842529537743, + "lm_loss": 2.153, + "step": 3632, + "vm_loss": 0.2083 + }, + { + "epoch": 0.6991842529537743, + "lm_loss": 1.9258, + "step": 3632, + "vm_loss": 0.1191 + }, + { + "epoch": 0.6991842529537743, + "lm_loss": 1.4406, + "step": 3632, + "vm_loss": 0.1755 + }, + { + "epoch": 0.6991842529537743, + "lm_loss": 1.7789, + "step": 3632, + "vm_loss": 0.2111 + }, + { + "epoch": 0.6991842529537743, + "lm_loss": 1.9998, + "step": 3632, + "vm_loss": 0.1668 + }, + { + "epoch": 0.6991842529537743, + "lm_loss": 1.6037, + "step": 3632, + "vm_loss": 0.2228 + }, + { + "epoch": 0.6991842529537743, + "lm_loss": 1.806, + "step": 3632, + "vm_loss": 0.1507 + }, + { + "epoch": 0.6991842529537743, + "lm_loss": 1.4199, + "step": 3632, + "vm_loss": 0.1708 + }, + { + "epoch": 0.6993767596313497, + "grad_norm": 3.1255405483968186, + "learning_rate": 1.5101359550853883e-05, + "loss": 1.9726, + "step": 3633 + }, + { + "epoch": 0.6995692663089251, + "grad_norm": 3.0628078020029395, + "learning_rate": 1.5098677619438183e-05, + "loss": 1.9393, + "step": 3634 + }, + { + "epoch": 0.6997617729865004, + "grad_norm": 2.804392237668242, + "learning_rate": 1.5095995192365784e-05, + "loss": 1.9323, + "step": 3635 + }, + { + "epoch": 0.6999542796640759, + "grad_norm": 3.0333090747620948, + "learning_rate": 1.5093312269897458e-05, + "loss": 1.9089, + "step": 3636 + }, + { + "epoch": 0.7001467863416512, + "grad_norm": 3.0138606987179255, + "learning_rate": 1.5090628852294014e-05, + "loss": 1.9429, + "step": 3637 + }, + { + "epoch": 0.7003392930192266, + "grad_norm": 2.9190957374143776, + "learning_rate": 1.5087944939816322e-05, + "loss": 1.9847, + "step": 3638 + }, + { + "epoch": 0.700531799696802, + "grad_norm": 2.8850694022946577, + "learning_rate": 1.508526053272528e-05, + "loss": 1.9084, + "step": 3639 + }, + { + "epoch": 0.7007243063743773, + "grad_norm": 2.959422989661412, + "learning_rate": 1.5082575631281862e-05, + "loss": 1.8962, + "step": 3640 + }, + { + "epoch": 0.7007243063743773, + "lm_loss": 1.8689, + "step": 3640, + "vm_loss": 0.1282 + }, + { + "epoch": 0.7007243063743773, + "lm_loss": 1.8218, + "step": 3640, + "vm_loss": 0.1851 + }, + { + "epoch": 0.7007243063743773, + "lm_loss": 1.7596, + "step": 3640, + "vm_loss": 0.2005 + }, + { + "epoch": 0.7007243063743773, + "lm_loss": 1.3301, + "step": 3640, + "vm_loss": 0.132 + }, + { + "epoch": 0.7007243063743773, + "lm_loss": 1.7032, + "step": 3640, + "vm_loss": 0.1206 + }, + { + "epoch": 0.7007243063743773, + "lm_loss": 1.1456, + "step": 3640, + "vm_loss": 0.2242 + }, + { + "epoch": 0.7007243063743773, + "lm_loss": 1.5863, + "step": 3640, + "vm_loss": 0.1563 + }, + { + "epoch": 0.7007243063743773, + "lm_loss": 2.0456, + "step": 3640, + "vm_loss": 0.2205 + }, + { + "epoch": 0.7009168130519527, + "grad_norm": 2.9769855647541705, + "learning_rate": 1.5079890235747066e-05, + "loss": 1.9123, + "step": 3641 + }, + { + "epoch": 0.7011093197295282, + "grad_norm": 3.2811469805953513, + "learning_rate": 1.5077204346381946e-05, + "loss": 1.9549, + "step": 3642 + }, + { + "epoch": 0.7013018264071035, + "grad_norm": 3.0379382934444403, + "learning_rate": 1.507451796344761e-05, + "loss": 1.9615, + "step": 3643 + }, + { + "epoch": 0.7014943330846789, + "grad_norm": 2.9636436235644736, + "learning_rate": 1.5071831087205205e-05, + "loss": 1.9491, + "step": 3644 + }, + { + "epoch": 0.7016868397622542, + "grad_norm": 3.036526421297367, + "learning_rate": 1.5069143717915928e-05, + "loss": 1.9216, + "step": 3645 + }, + { + "epoch": 0.7018793464398296, + "grad_norm": 3.1912969356836975, + "learning_rate": 1.5066455855841028e-05, + "loss": 1.9395, + "step": 3646 + }, + { + "epoch": 0.702071853117405, + "grad_norm": 2.9034727887408738, + "learning_rate": 1.5063767501241804e-05, + "loss": 1.9549, + "step": 3647 + }, + { + "epoch": 0.7022643597949804, + "grad_norm": 3.185919849681255, + "learning_rate": 1.5061078654379593e-05, + "loss": 1.9251, + "step": 3648 + }, + { + "epoch": 0.7022643597949804, + "lm_loss": 1.7484, + "step": 3648, + "vm_loss": 0.1406 + }, + { + "epoch": 0.7022643597949804, + "lm_loss": 1.675, + "step": 3648, + "vm_loss": 0.1936 + }, + { + "epoch": 0.7022643597949804, + "lm_loss": 1.8368, + "step": 3648, + "vm_loss": 0.1923 + }, + { + "epoch": 0.7022643597949804, + "lm_loss": 1.7594, + "step": 3648, + "vm_loss": 0.1608 + }, + { + "epoch": 0.7022643597949804, + "lm_loss": 1.6704, + "step": 3648, + "vm_loss": 0.209 + }, + { + "epoch": 0.7022643597949804, + "lm_loss": 1.7657, + "step": 3648, + "vm_loss": 0.2118 + }, + { + "epoch": 0.7022643597949804, + "lm_loss": 2.1663, + "step": 3648, + "vm_loss": 0.2122 + }, + { + "epoch": 0.7022643597949804, + "lm_loss": 1.9826, + "step": 3648, + "vm_loss": 0.1605 + }, + { + "epoch": 0.7024568664725558, + "grad_norm": 3.1924538518851757, + "learning_rate": 1.5058389315515786e-05, + "loss": 2.0014, + "step": 3649 + }, + { + "epoch": 0.7026493731501311, + "grad_norm": 3.3276572144474654, + "learning_rate": 1.5055699484911822e-05, + "loss": 2.0373, + "step": 3650 + }, + { + "epoch": 0.7028418798277065, + "grad_norm": 3.0158872153573943, + "learning_rate": 1.5053009162829188e-05, + "loss": 1.9828, + "step": 3651 + }, + { + "epoch": 0.7030343865052819, + "grad_norm": 2.879433544701226, + "learning_rate": 1.5050318349529416e-05, + "loss": 1.9595, + "step": 3652 + }, + { + "epoch": 0.7032268931828572, + "grad_norm": 2.9908264220168177, + "learning_rate": 1.504762704527409e-05, + "loss": 1.962, + "step": 3653 + }, + { + "epoch": 0.7034193998604327, + "grad_norm": 3.292653240593635, + "learning_rate": 1.5044935250324832e-05, + "loss": 1.9893, + "step": 3654 + }, + { + "epoch": 0.7036119065380081, + "grad_norm": 2.9446634675233496, + "learning_rate": 1.504224296494333e-05, + "loss": 1.8828, + "step": 3655 + }, + { + "epoch": 0.7038044132155834, + "grad_norm": 2.8583238934066952, + "learning_rate": 1.50395501893913e-05, + "loss": 1.9728, + "step": 3656 + }, + { + "epoch": 0.7038044132155834, + "lm_loss": 1.9844, + "step": 3656, + "vm_loss": 0.1972 + }, + { + "epoch": 0.7038044132155834, + "lm_loss": 1.6128, + "step": 3656, + "vm_loss": 0.1581 + }, + { + "epoch": 0.7038044132155834, + "lm_loss": 1.2618, + "step": 3656, + "vm_loss": 0.1098 + }, + { + "epoch": 0.7038044132155834, + "lm_loss": 1.8605, + "step": 3656, + "vm_loss": 0.1503 + }, + { + "epoch": 0.7038044132155834, + "lm_loss": 1.905, + "step": 3656, + "vm_loss": 0.1902 + }, + { + "epoch": 0.7038044132155834, + "lm_loss": 2.2919, + "step": 3656, + "vm_loss": 0.2154 + }, + { + "epoch": 0.7038044132155834, + "lm_loss": 1.7497, + "step": 3656, + "vm_loss": 0.2163 + }, + { + "epoch": 0.7038044132155834, + "lm_loss": 2.012, + "step": 3656, + "vm_loss": 0.151 + }, + { + "epoch": 0.7039969198931588, + "grad_norm": 3.153595845368003, + "learning_rate": 1.5036856923930514e-05, + "loss": 1.9101, + "step": 3657 + }, + { + "epoch": 0.7041894265707341, + "grad_norm": 2.9442524495402043, + "learning_rate": 1.5034163168822797e-05, + "loss": 1.8937, + "step": 3658 + }, + { + "epoch": 0.7043819332483096, + "grad_norm": 3.1614976300585127, + "learning_rate": 1.5031468924330015e-05, + "loss": 1.9508, + "step": 3659 + }, + { + "epoch": 0.704574439925885, + "grad_norm": 2.8529734534221487, + "learning_rate": 1.5028774190714082e-05, + "loss": 1.9771, + "step": 3660 + }, + { + "epoch": 0.7047669466034603, + "grad_norm": 2.910387990854519, + "learning_rate": 1.5026078968236962e-05, + "loss": 1.9205, + "step": 3661 + }, + { + "epoch": 0.7049594532810357, + "grad_norm": 2.8678511724490803, + "learning_rate": 1.5023383257160662e-05, + "loss": 1.9159, + "step": 3662 + }, + { + "epoch": 0.705151959958611, + "grad_norm": 2.9130142694048184, + "learning_rate": 1.5020687057747241e-05, + "loss": 1.9766, + "step": 3663 + }, + { + "epoch": 0.7053444666361864, + "grad_norm": 3.0179284921192084, + "learning_rate": 1.5017990370258806e-05, + "loss": 1.8681, + "step": 3664 + }, + { + "epoch": 0.7053444666361864, + "lm_loss": 1.7657, + "step": 3664, + "vm_loss": 0.1856 + }, + { + "epoch": 0.7053444666361864, + "lm_loss": 1.5371, + "step": 3664, + "vm_loss": 0.1272 + }, + { + "epoch": 0.7053444666361864, + "lm_loss": 1.8717, + "step": 3664, + "vm_loss": 0.2319 + }, + { + "epoch": 0.7053444666361864, + "lm_loss": 1.9186, + "step": 3664, + "vm_loss": 0.1822 + }, + { + "epoch": 0.7053444666361864, + "lm_loss": 1.7543, + "step": 3664, + "vm_loss": 0.1726 + }, + { + "epoch": 0.7053444666361864, + "lm_loss": 2.0273, + "step": 3664, + "vm_loss": 0.1784 + }, + { + "epoch": 0.7053444666361864, + "lm_loss": 1.8591, + "step": 3664, + "vm_loss": 0.2131 + }, + { + "epoch": 0.7053444666361864, + "lm_loss": 1.3255, + "step": 3664, + "vm_loss": 0.1991 + }, + { + "epoch": 0.7055369733137619, + "grad_norm": 2.953865498704835, + "learning_rate": 1.501529319495751e-05, + "loss": 1.9549, + "step": 3665 + }, + { + "epoch": 0.7057294799913372, + "grad_norm": 2.9171848906970625, + "learning_rate": 1.5012595532105547e-05, + "loss": 1.9099, + "step": 3666 + }, + { + "epoch": 0.7059219866689126, + "grad_norm": 2.933804306347362, + "learning_rate": 1.5009897381965171e-05, + "loss": 1.9525, + "step": 3667 + }, + { + "epoch": 0.7061144933464879, + "grad_norm": 2.732231035676331, + "learning_rate": 1.5007198744798673e-05, + "loss": 1.8986, + "step": 3668 + }, + { + "epoch": 0.7063070000240633, + "grad_norm": 2.9131324974146975, + "learning_rate": 1.5004499620868395e-05, + "loss": 2.0361, + "step": 3669 + }, + { + "epoch": 0.7064995067016387, + "grad_norm": 3.2010181370604514, + "learning_rate": 1.5001800010436731e-05, + "loss": 2.0474, + "step": 3670 + }, + { + "epoch": 0.7066920133792141, + "grad_norm": 2.800951628496817, + "learning_rate": 1.4999099913766113e-05, + "loss": 1.9389, + "step": 3671 + }, + { + "epoch": 0.7068845200567895, + "grad_norm": 2.9597387330200924, + "learning_rate": 1.4996399331119026e-05, + "loss": 1.9196, + "step": 3672 + }, + { + "epoch": 0.7068845200567895, + "lm_loss": 1.9407, + "step": 3672, + "vm_loss": 0.1456 + }, + { + "epoch": 0.7068845200567895, + "lm_loss": 1.8135, + "step": 3672, + "vm_loss": 0.2029 + }, + { + "epoch": 0.7068845200567895, + "lm_loss": 1.2727, + "step": 3672, + "vm_loss": 0.183 + }, + { + "epoch": 0.7068845200567895, + "lm_loss": 1.646, + "step": 3672, + "vm_loss": 0.2208 + }, + { + "epoch": 0.7068845200567895, + "lm_loss": 1.6671, + "step": 3672, + "vm_loss": 0.2486 + }, + { + "epoch": 0.7068845200567895, + "lm_loss": 2.1241, + "step": 3672, + "vm_loss": 0.1528 + }, + { + "epoch": 0.7068845200567895, + "lm_loss": 1.658, + "step": 3672, + "vm_loss": 0.1161 + }, + { + "epoch": 0.7068845200567895, + "lm_loss": 2.0151, + "step": 3672, + "vm_loss": 0.1546 + }, + { + "epoch": 0.7070770267343649, + "grad_norm": 2.9422197321951202, + "learning_rate": 1.4993698262758e-05, + "loss": 1.8897, + "step": 3673 + }, + { + "epoch": 0.7072695334119402, + "grad_norm": 2.982046999938329, + "learning_rate": 1.4990996708945618e-05, + "loss": 1.9786, + "step": 3674 + }, + { + "epoch": 0.7074620400895156, + "grad_norm": 2.993585891994863, + "learning_rate": 1.4988294669944501e-05, + "loss": 1.9215, + "step": 3675 + }, + { + "epoch": 0.7076545467670909, + "grad_norm": 3.4963731692640985, + "learning_rate": 1.4985592146017322e-05, + "loss": 2.0173, + "step": 3676 + }, + { + "epoch": 0.7078470534446664, + "grad_norm": 2.712323519776265, + "learning_rate": 1.4982889137426803e-05, + "loss": 1.9098, + "step": 3677 + }, + { + "epoch": 0.7080395601222418, + "grad_norm": 2.7697458932977037, + "learning_rate": 1.4980185644435712e-05, + "loss": 2.0103, + "step": 3678 + }, + { + "epoch": 0.7082320667998171, + "grad_norm": 3.3143028085309223, + "learning_rate": 1.4977481667306857e-05, + "loss": 1.9511, + "step": 3679 + }, + { + "epoch": 0.7084245734773925, + "grad_norm": 3.1932280932825714, + "learning_rate": 1.4974777206303106e-05, + "loss": 1.8988, + "step": 3680 + }, + { + "epoch": 0.7084245734773925, + "lm_loss": 1.2301, + "step": 3680, + "vm_loss": 0.1777 + }, + { + "epoch": 0.7084245734773925, + "lm_loss": 1.8124, + "step": 3680, + "vm_loss": 0.1518 + }, + { + "epoch": 0.7084245734773925, + "lm_loss": 1.8047, + "step": 3680, + "vm_loss": 0.1775 + }, + { + "epoch": 0.7084245734773925, + "lm_loss": 2.0136, + "step": 3680, + "vm_loss": 0.2167 + }, + { + "epoch": 0.7084245734773925, + "lm_loss": 1.9321, + "step": 3680, + "vm_loss": 0.1831 + }, + { + "epoch": 0.7084245734773925, + "lm_loss": 1.8036, + "step": 3680, + "vm_loss": 0.2131 + }, + { + "epoch": 0.7084245734773925, + "lm_loss": 2.2365, + "step": 3680, + "vm_loss": 0.1333 + }, + { + "epoch": 0.7084245734773925, + "lm_loss": 2.0735, + "step": 3680, + "vm_loss": 0.1678 + }, + { + "epoch": 0.7086170801549678, + "grad_norm": 2.892149321167442, + "learning_rate": 1.4972072261687365e-05, + "loss": 1.8933, + "step": 3681 + }, + { + "epoch": 0.7088095868325432, + "grad_norm": 2.9787878894004103, + "learning_rate": 1.4969366833722587e-05, + "loss": 1.959, + "step": 3682 + }, + { + "epoch": 0.7090020935101187, + "grad_norm": 2.8597305354027767, + "learning_rate": 1.4966660922671779e-05, + "loss": 1.8298, + "step": 3683 + }, + { + "epoch": 0.709194600187694, + "grad_norm": 3.0913370448707815, + "learning_rate": 1.4963954528797983e-05, + "loss": 1.9603, + "step": 3684 + }, + { + "epoch": 0.7093871068652694, + "grad_norm": 2.6438987003671577, + "learning_rate": 1.4961247652364303e-05, + "loss": 1.9356, + "step": 3685 + }, + { + "epoch": 0.7095796135428448, + "grad_norm": 3.056020359556225, + "learning_rate": 1.4958540293633875e-05, + "loss": 1.9182, + "step": 3686 + }, + { + "epoch": 0.7097721202204201, + "grad_norm": 2.8993099628532093, + "learning_rate": 1.4955832452869893e-05, + "loss": 1.9854, + "step": 3687 + }, + { + "epoch": 0.7099646268979956, + "grad_norm": 2.8790834945258923, + "learning_rate": 1.4953124130335594e-05, + "loss": 1.8998, + "step": 3688 + }, + { + "epoch": 0.7099646268979956, + "lm_loss": 1.5317, + "step": 3688, + "vm_loss": 0.2164 + }, + { + "epoch": 0.7099646268979956, + "lm_loss": 1.9062, + "step": 3688, + "vm_loss": 0.2458 + }, + { + "epoch": 0.7099646268979956, + "lm_loss": 1.8765, + "step": 3688, + "vm_loss": 0.1665 + }, + { + "epoch": 0.7099646268979956, + "lm_loss": 1.9494, + "step": 3688, + "vm_loss": 0.1706 + }, + { + "epoch": 0.7099646268979956, + "lm_loss": 2.3855, + "step": 3688, + "vm_loss": 0.175 + }, + { + "epoch": 0.7099646268979956, + "lm_loss": 2.3481, + "step": 3688, + "vm_loss": 0.1617 + }, + { + "epoch": 0.7099646268979956, + "lm_loss": 1.7073, + "step": 3688, + "vm_loss": 0.2296 + }, + { + "epoch": 0.7099646268979956, + "lm_loss": 1.6671, + "step": 3688, + "vm_loss": 0.1674 + }, + { + "epoch": 0.7101571335755709, + "grad_norm": 2.8292398192039285, + "learning_rate": 1.495041532629426e-05, + "loss": 1.954, + "step": 3689 + }, + { + "epoch": 0.7103496402531463, + "grad_norm": 3.0200014870401515, + "learning_rate": 1.4947706041009223e-05, + "loss": 1.8534, + "step": 3690 + }, + { + "epoch": 0.7105421469307217, + "grad_norm": 3.1136084865061946, + "learning_rate": 1.4944996274743857e-05, + "loss": 1.8933, + "step": 3691 + }, + { + "epoch": 0.710734653608297, + "grad_norm": 2.9356339267602305, + "learning_rate": 1.494228602776159e-05, + "loss": 1.9205, + "step": 3692 + }, + { + "epoch": 0.7109271602858724, + "grad_norm": 3.172558410462714, + "learning_rate": 1.493957530032589e-05, + "loss": 1.944, + "step": 3693 + }, + { + "epoch": 0.7111196669634477, + "grad_norm": 3.0320596542526688, + "learning_rate": 1.4936864092700278e-05, + "loss": 1.9274, + "step": 3694 + }, + { + "epoch": 0.7113121736410232, + "grad_norm": 2.9182609554141945, + "learning_rate": 1.4934152405148312e-05, + "loss": 1.9629, + "step": 3695 + }, + { + "epoch": 0.7115046803185986, + "grad_norm": 3.2744116488572588, + "learning_rate": 1.4931440237933605e-05, + "loss": 1.9073, + "step": 3696 + }, + { + "epoch": 0.7115046803185986, + "lm_loss": 1.8795, + "step": 3696, + "vm_loss": 0.1999 + }, + { + "epoch": 0.7115046803185986, + "lm_loss": 1.7848, + "step": 3696, + "vm_loss": 0.1661 + }, + { + "epoch": 0.7115046803185986, + "lm_loss": 2.033, + "step": 3696, + "vm_loss": 0.1511 + }, + { + "epoch": 0.7115046803185986, + "lm_loss": 1.95, + "step": 3696, + "vm_loss": 0.1588 + }, + { + "epoch": 0.7115046803185986, + "lm_loss": 1.7633, + "step": 3696, + "vm_loss": 0.1579 + }, + { + "epoch": 0.7115046803185986, + "lm_loss": 1.8678, + "step": 3696, + "vm_loss": 0.1596 + }, + { + "epoch": 0.7115046803185986, + "lm_loss": 1.9616, + "step": 3696, + "vm_loss": 0.1807 + }, + { + "epoch": 0.7115046803185986, + "lm_loss": 1.4825, + "step": 3696, + "vm_loss": 0.1989 + }, + { + "epoch": 0.7116971869961739, + "grad_norm": 2.8536925931608743, + "learning_rate": 1.492872759131982e-05, + "loss": 1.9543, + "step": 3697 + }, + { + "epoch": 0.7118896936737493, + "grad_norm": 3.0120423071835685, + "learning_rate": 1.4926014465570652e-05, + "loss": 1.9256, + "step": 3698 + }, + { + "epoch": 0.7120822003513246, + "grad_norm": 2.8958635643689137, + "learning_rate": 1.492330086094986e-05, + "loss": 1.9489, + "step": 3699 + }, + { + "epoch": 0.7122747070289001, + "grad_norm": 2.8932758814879502, + "learning_rate": 1.4920586777721231e-05, + "loss": 1.973, + "step": 3700 + }, + { + "epoch": 0.7124672137064755, + "grad_norm": 3.2130394399389415, + "learning_rate": 1.491787221614862e-05, + "loss": 1.9698, + "step": 3701 + }, + { + "epoch": 0.7126597203840508, + "grad_norm": 2.9738723358837187, + "learning_rate": 1.491515717649591e-05, + "loss": 1.9653, + "step": 3702 + }, + { + "epoch": 0.7128522270616262, + "grad_norm": 2.769676264659778, + "learning_rate": 1.4912441659027037e-05, + "loss": 1.8621, + "step": 3703 + }, + { + "epoch": 0.7130447337392016, + "grad_norm": 2.816529435197119, + "learning_rate": 1.4909725664005987e-05, + "loss": 1.8816, + "step": 3704 + }, + { + "epoch": 0.7130447337392016, + "lm_loss": 1.9877, + "step": 3704, + "vm_loss": 0.1922 + }, + { + "epoch": 0.7130447337392016, + "lm_loss": 1.5424, + "step": 3704, + "vm_loss": 0.1022 + }, + { + "epoch": 0.7130447337392016, + "lm_loss": 1.5501, + "step": 3704, + "vm_loss": 0.1896 + }, + { + "epoch": 0.7130447337392016, + "lm_loss": 1.6442, + "step": 3704, + "vm_loss": 0.1249 + }, + { + "epoch": 0.7130447337392016, + "lm_loss": 1.0074, + "step": 3704, + "vm_loss": 0.1853 + }, + { + "epoch": 0.7130447337392016, + "lm_loss": 1.7407, + "step": 3704, + "vm_loss": 0.1129 + }, + { + "epoch": 0.7130447337392016, + "lm_loss": 1.5802, + "step": 3704, + "vm_loss": 0.1058 + }, + { + "epoch": 0.7130447337392016, + "lm_loss": 1.6994, + "step": 3704, + "vm_loss": 0.1758 + }, + { + "epoch": 0.7132372404167769, + "grad_norm": 3.0258604172335963, + "learning_rate": 1.4907009191696791e-05, + "loss": 1.8583, + "step": 3705 + }, + { + "epoch": 0.7134297470943524, + "grad_norm": 3.120601881167479, + "learning_rate": 1.4904292242363521e-05, + "loss": 1.9304, + "step": 3706 + }, + { + "epoch": 0.7136222537719277, + "grad_norm": 3.0398261737868744, + "learning_rate": 1.4901574816270302e-05, + "loss": 1.933, + "step": 3707 + }, + { + "epoch": 0.7138147604495031, + "grad_norm": 2.911962354152422, + "learning_rate": 1.48988569136813e-05, + "loss": 1.9485, + "step": 3708 + }, + { + "epoch": 0.7140072671270785, + "grad_norm": 3.2596608325013667, + "learning_rate": 1.4896138534860728e-05, + "loss": 1.9451, + "step": 3709 + }, + { + "epoch": 0.7141997738046538, + "grad_norm": 2.996590040777344, + "learning_rate": 1.4893419680072856e-05, + "loss": 1.964, + "step": 3710 + }, + { + "epoch": 0.7143922804822292, + "grad_norm": 2.864444338506274, + "learning_rate": 1.4890700349581986e-05, + "loss": 1.9124, + "step": 3711 + }, + { + "epoch": 0.7145847871598046, + "grad_norm": 3.1503096212407846, + "learning_rate": 1.4887980543652467e-05, + "loss": 1.883, + "step": 3712 + }, + { + "epoch": 0.7145847871598046, + "lm_loss": 1.647, + "step": 3712, + "vm_loss": 0.1571 + }, + { + "epoch": 0.7145847871598046, + "lm_loss": 1.8455, + "step": 3712, + "vm_loss": 0.2255 + }, + { + "epoch": 0.7145847871598046, + "lm_loss": 1.4691, + "step": 3712, + "vm_loss": 0.1429 + }, + { + "epoch": 0.7145847871598046, + "lm_loss": 1.759, + "step": 3712, + "vm_loss": 0.1463 + }, + { + "epoch": 0.7145847871598046, + "lm_loss": 1.8049, + "step": 3712, + "vm_loss": 0.1967 + }, + { + "epoch": 0.7145847871598046, + "lm_loss": 1.5675, + "step": 3712, + "vm_loss": 0.1895 + }, + { + "epoch": 0.7145847871598046, + "lm_loss": 1.955, + "step": 3712, + "vm_loss": 0.154 + }, + { + "epoch": 0.7145847871598046, + "lm_loss": 1.9605, + "step": 3712, + "vm_loss": 0.1499 + }, + { + "epoch": 0.71477729383738, + "grad_norm": 3.121243343215388, + "learning_rate": 1.4885260262548706e-05, + "loss": 1.9278, + "step": 3713 + }, + { + "epoch": 0.7149698005149554, + "grad_norm": 2.9712268915660704, + "learning_rate": 1.488253950653515e-05, + "loss": 2.0209, + "step": 3714 + }, + { + "epoch": 0.7151623071925307, + "grad_norm": 2.973055936590317, + "learning_rate": 1.4879818275876283e-05, + "loss": 1.922, + "step": 3715 + }, + { + "epoch": 0.7153548138701061, + "grad_norm": 2.832989712910767, + "learning_rate": 1.4877096570836651e-05, + "loss": 1.9317, + "step": 3716 + }, + { + "epoch": 0.7155473205476814, + "grad_norm": 3.028559499643474, + "learning_rate": 1.4874374391680837e-05, + "loss": 1.9111, + "step": 3717 + }, + { + "epoch": 0.7157398272252569, + "grad_norm": 3.115262017561496, + "learning_rate": 1.4871651738673469e-05, + "loss": 2.0183, + "step": 3718 + }, + { + "epoch": 0.7159323339028323, + "grad_norm": 3.1181027224898594, + "learning_rate": 1.4868928612079226e-05, + "loss": 1.8976, + "step": 3719 + }, + { + "epoch": 0.7161248405804076, + "grad_norm": 3.1881361420256735, + "learning_rate": 1.486620501216283e-05, + "loss": 1.9471, + "step": 3720 + }, + { + "epoch": 0.7161248405804076, + "lm_loss": 1.5344, + "step": 3720, + "vm_loss": 0.1641 + }, + { + "epoch": 0.7161248405804076, + "lm_loss": 1.9682, + "step": 3720, + "vm_loss": 0.1686 + }, + { + "epoch": 0.7161248405804076, + "lm_loss": 2.2396, + "step": 3720, + "vm_loss": 0.1711 + }, + { + "epoch": 0.7161248405804076, + "lm_loss": 1.8707, + "step": 3720, + "vm_loss": 0.1771 + }, + { + "epoch": 0.7161248405804076, + "lm_loss": 1.6082, + "step": 3720, + "vm_loss": 0.1602 + }, + { + "epoch": 0.7161248405804076, + "lm_loss": 1.3282, + "step": 3720, + "vm_loss": 0.1856 + }, + { + "epoch": 0.7161248405804076, + "lm_loss": 1.8681, + "step": 3720, + "vm_loss": 0.134 + }, + { + "epoch": 0.7161248405804076, + "lm_loss": 1.8214, + "step": 3720, + "vm_loss": 0.1203 + }, + { + "epoch": 0.716317347257983, + "grad_norm": 2.963801681292483, + "learning_rate": 1.486348093918905e-05, + "loss": 1.935, + "step": 3721 + }, + { + "epoch": 0.7165098539355584, + "grad_norm": 2.7421865251275, + "learning_rate": 1.4860756393422699e-05, + "loss": 1.8775, + "step": 3722 + }, + { + "epoch": 0.7167023606131337, + "grad_norm": 2.7773370709149146, + "learning_rate": 1.4858031375128643e-05, + "loss": 1.8892, + "step": 3723 + }, + { + "epoch": 0.7168948672907092, + "grad_norm": 3.0049953609481146, + "learning_rate": 1.4855305884571782e-05, + "loss": 1.8382, + "step": 3724 + }, + { + "epoch": 0.7170873739682845, + "grad_norm": 3.0525509380576734, + "learning_rate": 1.4852579922017072e-05, + "loss": 1.8724, + "step": 3725 + }, + { + "epoch": 0.7172798806458599, + "grad_norm": 3.2518079410622778, + "learning_rate": 1.4849853487729512e-05, + "loss": 1.903, + "step": 3726 + }, + { + "epoch": 0.7174723873234353, + "grad_norm": 3.002528005269556, + "learning_rate": 1.4847126581974146e-05, + "loss": 1.907, + "step": 3727 + }, + { + "epoch": 0.7176648940010106, + "grad_norm": 2.870813502564371, + "learning_rate": 1.4844399205016062e-05, + "loss": 1.9513, + "step": 3728 + }, + { + "epoch": 0.7176648940010106, + "lm_loss": 1.9327, + "step": 3728, + "vm_loss": 0.1643 + }, + { + "epoch": 0.7176648940010106, + "lm_loss": 1.5341, + "step": 3728, + "vm_loss": 0.1647 + }, + { + "epoch": 0.7176648940010106, + "lm_loss": 1.844, + "step": 3728, + "vm_loss": 0.1591 + }, + { + "epoch": 0.7176648940010106, + "lm_loss": 1.7995, + "step": 3728, + "vm_loss": 0.1147 + }, + { + "epoch": 0.7176648940010106, + "lm_loss": 1.8214, + "step": 3728, + "vm_loss": 0.1823 + }, + { + "epoch": 0.7176648940010106, + "lm_loss": 1.8343, + "step": 3728, + "vm_loss": 0.2092 + }, + { + "epoch": 0.7176648940010106, + "lm_loss": 2.0218, + "step": 3728, + "vm_loss": 0.1727 + }, + { + "epoch": 0.7176648940010106, + "lm_loss": 1.5754, + "step": 3728, + "vm_loss": 0.1213 + }, + { + "epoch": 0.717857400678586, + "grad_norm": 2.889644092271477, + "learning_rate": 1.4841671357120395e-05, + "loss": 1.8583, + "step": 3729 + }, + { + "epoch": 0.7180499073561614, + "grad_norm": 2.909230833785038, + "learning_rate": 1.483894303855233e-05, + "loss": 1.935, + "step": 3730 + }, + { + "epoch": 0.7182424140337368, + "grad_norm": 3.035777813753681, + "learning_rate": 1.4836214249577094e-05, + "loss": 1.9072, + "step": 3731 + }, + { + "epoch": 0.7184349207113122, + "grad_norm": 2.9401793345334917, + "learning_rate": 1.4833484990459963e-05, + "loss": 1.9445, + "step": 3732 + }, + { + "epoch": 0.7186274273888875, + "grad_norm": 3.157828467371138, + "learning_rate": 1.4830755261466249e-05, + "loss": 1.934, + "step": 3733 + }, + { + "epoch": 0.7188199340664629, + "grad_norm": 3.342609333773877, + "learning_rate": 1.482802506286132e-05, + "loss": 1.907, + "step": 3734 + }, + { + "epoch": 0.7190124407440384, + "grad_norm": 3.2489124115875008, + "learning_rate": 1.482529439491059e-05, + "loss": 1.9053, + "step": 3735 + }, + { + "epoch": 0.7192049474216137, + "grad_norm": 2.8670753083817777, + "learning_rate": 1.4822563257879508e-05, + "loss": 1.8917, + "step": 3736 + }, + { + "epoch": 0.7192049474216137, + "lm_loss": 1.6906, + "step": 3736, + "vm_loss": 0.1499 + }, + { + "epoch": 0.7192049474216137, + "lm_loss": 1.597, + "step": 3736, + "vm_loss": 0.1409 + }, + { + "epoch": 0.7192049474216137, + "lm_loss": 2.1388, + "step": 3736, + "vm_loss": 0.1934 + }, + { + "epoch": 0.7192049474216137, + "lm_loss": 1.6235, + "step": 3736, + "vm_loss": 0.2226 + }, + { + "epoch": 0.7192049474216137, + "lm_loss": 1.9806, + "step": 3736, + "vm_loss": 0.1406 + }, + { + "epoch": 0.7192049474216137, + "lm_loss": 2.1045, + "step": 3736, + "vm_loss": 0.2493 + }, + { + "epoch": 0.7192049474216137, + "lm_loss": 2.2695, + "step": 3736, + "vm_loss": 0.2056 + }, + { + "epoch": 0.7192049474216137, + "lm_loss": 1.6723, + "step": 3736, + "vm_loss": 0.1453 + }, + { + "epoch": 0.7193974540991891, + "grad_norm": 3.1072404818057557, + "learning_rate": 1.4819831652033579e-05, + "loss": 1.9802, + "step": 3737 + }, + { + "epoch": 0.7195899607767644, + "grad_norm": 3.049431558234614, + "learning_rate": 1.481709957763835e-05, + "loss": 1.9003, + "step": 3738 + }, + { + "epoch": 0.7197824674543398, + "grad_norm": 3.08415414804755, + "learning_rate": 1.4814367034959414e-05, + "loss": 1.894, + "step": 3739 + }, + { + "epoch": 0.7199749741319152, + "grad_norm": 3.1817186645565174, + "learning_rate": 1.4811634024262409e-05, + "loss": 1.9126, + "step": 3740 + }, + { + "epoch": 0.7201674808094906, + "grad_norm": 3.229228692319169, + "learning_rate": 1.4808900545813016e-05, + "loss": 1.9417, + "step": 3741 + }, + { + "epoch": 0.720359987487066, + "grad_norm": 3.0200083591959395, + "learning_rate": 1.4806166599876967e-05, + "loss": 1.885, + "step": 3742 + }, + { + "epoch": 0.7205524941646413, + "grad_norm": 3.152478698419477, + "learning_rate": 1.4803432186720037e-05, + "loss": 1.8664, + "step": 3743 + }, + { + "epoch": 0.7207450008422167, + "grad_norm": 2.928741057067477, + "learning_rate": 1.4800697306608043e-05, + "loss": 1.896, + "step": 3744 + }, + { + "epoch": 0.7207450008422167, + "lm_loss": 1.1524, + "step": 3744, + "vm_loss": 0.1353 + }, + { + "epoch": 0.7207450008422167, + "lm_loss": 1.4933, + "step": 3744, + "vm_loss": 0.147 + }, + { + "epoch": 0.7207450008422167, + "lm_loss": 1.7221, + "step": 3744, + "vm_loss": 0.2314 + }, + { + "epoch": 0.7207450008422167, + "lm_loss": 1.7901, + "step": 3744, + "vm_loss": 0.1467 + }, + { + "epoch": 0.7207450008422167, + "lm_loss": 1.9902, + "step": 3744, + "vm_loss": 0.156 + }, + { + "epoch": 0.7207450008422167, + "lm_loss": 1.7596, + "step": 3744, + "vm_loss": 0.1246 + }, + { + "epoch": 0.7207450008422167, + "lm_loss": 1.555, + "step": 3744, + "vm_loss": 0.1909 + }, + { + "epoch": 0.7207450008422167, + "lm_loss": 2.3212, + "step": 3744, + "vm_loss": 0.2247 + }, + { + "epoch": 0.7209375075197921, + "grad_norm": 3.023002099864317, + "learning_rate": 1.4797961959806854e-05, + "loss": 1.885, + "step": 3745 + }, + { + "epoch": 0.7211300141973674, + "grad_norm": 3.2507530108483493, + "learning_rate": 1.4795226146582378e-05, + "loss": 1.8696, + "step": 3746 + }, + { + "epoch": 0.7213225208749429, + "grad_norm": 2.9215267991045475, + "learning_rate": 1.479248986720057e-05, + "loss": 1.9525, + "step": 3747 + }, + { + "epoch": 0.7215150275525182, + "grad_norm": 2.9915521516980466, + "learning_rate": 1.4789753121927435e-05, + "loss": 1.8833, + "step": 3748 + }, + { + "epoch": 0.7217075342300936, + "grad_norm": 2.9456974182390123, + "learning_rate": 1.4787015911029014e-05, + "loss": 1.9252, + "step": 3749 + }, + { + "epoch": 0.721900040907669, + "grad_norm": 3.1366470952785486, + "learning_rate": 1.4784278234771407e-05, + "loss": 1.9243, + "step": 3750 + }, + { + "epoch": 0.7220925475852443, + "grad_norm": 3.289474461723117, + "learning_rate": 1.4781540093420746e-05, + "loss": 1.9456, + "step": 3751 + }, + { + "epoch": 0.7222850542628197, + "grad_norm": 2.7924391041033676, + "learning_rate": 1.4778801487243214e-05, + "loss": 1.9359, + "step": 3752 + }, + { + "epoch": 0.7222850542628197, + "lm_loss": 2.18, + "step": 3752, + "vm_loss": 0.177 + }, + { + "epoch": 0.7222850542628197, + "lm_loss": 1.9218, + "step": 3752, + "vm_loss": 0.2452 + }, + { + "epoch": 0.7222850542628197, + "lm_loss": 1.5757, + "step": 3752, + "vm_loss": 0.1532 + }, + { + "epoch": 0.7222850542628197, + "lm_loss": 1.4875, + "step": 3752, + "vm_loss": 0.1336 + }, + { + "epoch": 0.7222850542628197, + "lm_loss": 1.2811, + "step": 3752, + "vm_loss": 0.1184 + }, + { + "epoch": 0.7222850542628197, + "lm_loss": 1.8495, + "step": 3752, + "vm_loss": 0.1255 + }, + { + "epoch": 0.7222850542628197, + "lm_loss": 1.9507, + "step": 3752, + "vm_loss": 0.1723 + }, + { + "epoch": 0.7222850542628197, + "lm_loss": 1.4793, + "step": 3752, + "vm_loss": 0.2049 + }, + { + "epoch": 0.7224775609403952, + "grad_norm": 3.0610919750334586, + "learning_rate": 1.4776062416505035e-05, + "loss": 1.8408, + "step": 3753 + }, + { + "epoch": 0.7226700676179705, + "grad_norm": 2.8901879349015247, + "learning_rate": 1.477332288147249e-05, + "loss": 1.9165, + "step": 3754 + }, + { + "epoch": 0.7228625742955459, + "grad_norm": 2.911078905028071, + "learning_rate": 1.477058288241189e-05, + "loss": 1.8403, + "step": 3755 + }, + { + "epoch": 0.7230550809731212, + "grad_norm": 3.1773776189665774, + "learning_rate": 1.47678424195896e-05, + "loss": 1.8859, + "step": 3756 + }, + { + "epoch": 0.7232475876506966, + "grad_norm": 3.322724287841504, + "learning_rate": 1.4765101493272032e-05, + "loss": 1.9475, + "step": 3757 + }, + { + "epoch": 0.723440094328272, + "grad_norm": 3.065576238962563, + "learning_rate": 1.476236010372563e-05, + "loss": 1.944, + "step": 3758 + }, + { + "epoch": 0.7236326010058474, + "grad_norm": 3.1019181158658453, + "learning_rate": 1.4759618251216903e-05, + "loss": 1.8806, + "step": 3759 + }, + { + "epoch": 0.7238251076834228, + "grad_norm": 2.9596534839491175, + "learning_rate": 1.4756875936012385e-05, + "loss": 1.9602, + "step": 3760 + }, + { + "epoch": 0.7238251076834228, + "lm_loss": 1.5672, + "step": 3760, + "vm_loss": 0.2213 + }, + { + "epoch": 0.7238251076834228, + "lm_loss": 1.4406, + "step": 3760, + "vm_loss": 0.1866 + }, + { + "epoch": 0.7238251076834228, + "lm_loss": 1.8501, + "step": 3760, + "vm_loss": 0.1605 + }, + { + "epoch": 0.7238251076834228, + "lm_loss": 1.8334, + "step": 3760, + "vm_loss": 0.2294 + }, + { + "epoch": 0.7238251076834228, + "lm_loss": 1.7821, + "step": 3760, + "vm_loss": 0.2131 + }, + { + "epoch": 0.7238251076834228, + "lm_loss": 1.9379, + "step": 3760, + "vm_loss": 0.1565 + }, + { + "epoch": 0.7238251076834228, + "lm_loss": 2.3439, + "step": 3760, + "vm_loss": 0.1571 + }, + { + "epoch": 0.7238251076834228, + "lm_loss": 2.0827, + "step": 3760, + "vm_loss": 0.2065 + }, + { + "epoch": 0.7240176143609981, + "grad_norm": 2.8078062475850794, + "learning_rate": 1.4754133158378668e-05, + "loss": 1.9215, + "step": 3761 + }, + { + "epoch": 0.7242101210385735, + "grad_norm": 2.8732470476455836, + "learning_rate": 1.4751389918582387e-05, + "loss": 1.966, + "step": 3762 + }, + { + "epoch": 0.7244026277161489, + "grad_norm": 2.9614104763428, + "learning_rate": 1.4748646216890215e-05, + "loss": 1.9411, + "step": 3763 + }, + { + "epoch": 0.7245951343937242, + "grad_norm": 3.131225361159734, + "learning_rate": 1.4745902053568882e-05, + "loss": 1.9054, + "step": 3764 + }, + { + "epoch": 0.7247876410712997, + "grad_norm": 2.91588752534843, + "learning_rate": 1.4743157428885147e-05, + "loss": 1.9081, + "step": 3765 + }, + { + "epoch": 0.7249801477488751, + "grad_norm": 2.9327040501890687, + "learning_rate": 1.4740412343105828e-05, + "loss": 1.8678, + "step": 3766 + }, + { + "epoch": 0.7251726544264504, + "grad_norm": 2.74747089193638, + "learning_rate": 1.4737666796497784e-05, + "loss": 1.8966, + "step": 3767 + }, + { + "epoch": 0.7253651611040258, + "grad_norm": 2.855983359239089, + "learning_rate": 1.4734920789327913e-05, + "loss": 1.8764, + "step": 3768 + }, + { + "epoch": 0.7253651611040258, + "lm_loss": 1.6491, + "step": 3768, + "vm_loss": 0.1713 + }, + { + "epoch": 0.7253651611040258, + "lm_loss": 1.6772, + "step": 3768, + "vm_loss": 0.1356 + }, + { + "epoch": 0.7253651611040258, + "lm_loss": 1.3075, + "step": 3768, + "vm_loss": 0.1303 + }, + { + "epoch": 0.7253651611040258, + "lm_loss": 1.464, + "step": 3768, + "vm_loss": 0.1796 + }, + { + "epoch": 0.7253651611040258, + "lm_loss": 1.6724, + "step": 3768, + "vm_loss": 0.1775 + }, + { + "epoch": 0.7253651611040258, + "lm_loss": 1.9063, + "step": 3768, + "vm_loss": 0.1368 + }, + { + "epoch": 0.7253651611040258, + "lm_loss": 1.744, + "step": 3768, + "vm_loss": 0.2026 + }, + { + "epoch": 0.7253651611040258, + "lm_loss": 2.2858, + "step": 3768, + "vm_loss": 0.2235 + }, + { + "epoch": 0.7255576677816011, + "grad_norm": 2.994524235639348, + "learning_rate": 1.4732174321863167e-05, + "loss": 1.9165, + "step": 3769 + }, + { + "epoch": 0.7257501744591766, + "grad_norm": 2.8683545229264475, + "learning_rate": 1.4729427394370533e-05, + "loss": 1.8537, + "step": 3770 + }, + { + "epoch": 0.725942681136752, + "grad_norm": 2.865619534192569, + "learning_rate": 1.4726680007117046e-05, + "loss": 1.8509, + "step": 3771 + }, + { + "epoch": 0.7261351878143273, + "grad_norm": 2.8866391106728604, + "learning_rate": 1.4723932160369794e-05, + "loss": 1.9106, + "step": 3772 + }, + { + "epoch": 0.7263276944919027, + "grad_norm": 3.0757442050545394, + "learning_rate": 1.4721183854395897e-05, + "loss": 1.947, + "step": 3773 + }, + { + "epoch": 0.726520201169478, + "grad_norm": 2.9798541110704146, + "learning_rate": 1.4718435089462528e-05, + "loss": 1.8744, + "step": 3774 + }, + { + "epoch": 0.7267127078470534, + "grad_norm": 2.813690311602499, + "learning_rate": 1.4715685865836904e-05, + "loss": 1.9043, + "step": 3775 + }, + { + "epoch": 0.7269052145246289, + "grad_norm": 2.881585400846457, + "learning_rate": 1.4712936183786281e-05, + "loss": 1.9092, + "step": 3776 + }, + { + "epoch": 0.7269052145246289, + "lm_loss": 2.0198, + "step": 3776, + "vm_loss": 0.1699 + }, + { + "epoch": 0.7269052145246289, + "lm_loss": 1.7665, + "step": 3776, + "vm_loss": 0.1533 + }, + { + "epoch": 0.7269052145246289, + "lm_loss": 1.6736, + "step": 3776, + "vm_loss": 0.1611 + }, + { + "epoch": 0.7269052145246289, + "lm_loss": 2.1747, + "step": 3776, + "vm_loss": 0.1325 + }, + { + "epoch": 0.7269052145246289, + "lm_loss": 1.9061, + "step": 3776, + "vm_loss": 0.1567 + }, + { + "epoch": 0.7269052145246289, + "lm_loss": 1.6564, + "step": 3776, + "vm_loss": 0.1588 + }, + { + "epoch": 0.7269052145246289, + "lm_loss": 1.1195, + "step": 3776, + "vm_loss": 0.1838 + }, + { + "epoch": 0.7269052145246289, + "lm_loss": 1.967, + "step": 3776, + "vm_loss": 0.1863 + }, + { + "epoch": 0.7270977212022042, + "grad_norm": 2.8651637905957283, + "learning_rate": 1.4710186043577965e-05, + "loss": 1.9225, + "step": 3777 + }, + { + "epoch": 0.7272902278797796, + "grad_norm": 2.823024656375641, + "learning_rate": 1.4707435445479306e-05, + "loss": 1.8473, + "step": 3778 + }, + { + "epoch": 0.7274827345573549, + "grad_norm": 2.9268825113603874, + "learning_rate": 1.4704684389757696e-05, + "loss": 1.9245, + "step": 3779 + }, + { + "epoch": 0.7276752412349303, + "grad_norm": 2.929838647278052, + "learning_rate": 1.470193287668057e-05, + "loss": 1.871, + "step": 3780 + }, + { + "epoch": 0.7278677479125057, + "grad_norm": 3.044792363818208, + "learning_rate": 1.4699180906515418e-05, + "loss": 1.8977, + "step": 3781 + }, + { + "epoch": 0.7280602545900811, + "grad_norm": 2.8703812272871043, + "learning_rate": 1.4696428479529758e-05, + "loss": 1.8681, + "step": 3782 + }, + { + "epoch": 0.7282527612676565, + "grad_norm": 2.984227209164664, + "learning_rate": 1.4693675595991163e-05, + "loss": 1.9523, + "step": 3783 + }, + { + "epoch": 0.7284452679452319, + "grad_norm": 2.9747978746757924, + "learning_rate": 1.4690922256167256e-05, + "loss": 1.985, + "step": 3784 + }, + { + "epoch": 0.7284452679452319, + "lm_loss": 1.3529, + "step": 3784, + "vm_loss": 0.1927 + }, + { + "epoch": 0.7284452679452319, + "lm_loss": 1.5735, + "step": 3784, + "vm_loss": 0.1323 + }, + { + "epoch": 0.7284452679452319, + "lm_loss": 1.2311, + "step": 3784, + "vm_loss": 0.1487 + }, + { + "epoch": 0.7284452679452319, + "lm_loss": 1.2852, + "step": 3784, + "vm_loss": 0.1859 + }, + { + "epoch": 0.7284452679452319, + "lm_loss": 1.714, + "step": 3784, + "vm_loss": 0.1929 + }, + { + "epoch": 0.7284452679452319, + "lm_loss": 1.721, + "step": 3784, + "vm_loss": 0.1807 + }, + { + "epoch": 0.7284452679452319, + "lm_loss": 1.3125, + "step": 3784, + "vm_loss": 0.1607 + }, + { + "epoch": 0.7284452679452319, + "lm_loss": 1.9266, + "step": 3784, + "vm_loss": 0.1314 + }, + { + "epoch": 0.7286377746228072, + "grad_norm": 2.8770598699520784, + "learning_rate": 1.4688168460325686e-05, + "loss": 1.8648, + "step": 3785 + }, + { + "epoch": 0.7288302813003826, + "grad_norm": 2.8249853074431477, + "learning_rate": 1.4685414208734168e-05, + "loss": 1.8871, + "step": 3786 + }, + { + "epoch": 0.7290227879779579, + "grad_norm": 2.6915223035627687, + "learning_rate": 1.468265950166044e-05, + "loss": 1.858, + "step": 3787 + }, + { + "epoch": 0.7292152946555334, + "grad_norm": 2.9186050017578653, + "learning_rate": 1.4679904339372301e-05, + "loss": 1.8947, + "step": 3788 + }, + { + "epoch": 0.7294078013331088, + "grad_norm": 3.0135445207936495, + "learning_rate": 1.467714872213759e-05, + "loss": 1.9631, + "step": 3789 + }, + { + "epoch": 0.7296003080106841, + "grad_norm": 2.9510883750314276, + "learning_rate": 1.467439265022418e-05, + "loss": 1.928, + "step": 3790 + }, + { + "epoch": 0.7297928146882595, + "grad_norm": 2.979859919913248, + "learning_rate": 1.4671636123900006e-05, + "loss": 1.9898, + "step": 3791 + }, + { + "epoch": 0.7299853213658348, + "grad_norm": 2.8783329180705644, + "learning_rate": 1.4668879143433031e-05, + "loss": 1.8716, + "step": 3792 + }, + { + "epoch": 0.7299853213658348, + "lm_loss": 2.201, + "step": 3792, + "vm_loss": 0.1409 + }, + { + "epoch": 0.7299853213658348, + "lm_loss": 1.986, + "step": 3792, + "vm_loss": 0.2028 + }, + { + "epoch": 0.7299853213658348, + "lm_loss": 1.8175, + "step": 3792, + "vm_loss": 0.1474 + }, + { + "epoch": 0.7299853213658348, + "lm_loss": 1.7665, + "step": 3792, + "vm_loss": 0.1654 + }, + { + "epoch": 0.7299853213658348, + "lm_loss": 1.974, + "step": 3792, + "vm_loss": 0.1661 + }, + { + "epoch": 0.7299853213658348, + "lm_loss": 1.44, + "step": 3792, + "vm_loss": 0.1365 + }, + { + "epoch": 0.7299853213658348, + "lm_loss": 1.4445, + "step": 3792, + "vm_loss": 0.173 + }, + { + "epoch": 0.7299853213658348, + "lm_loss": 1.5639, + "step": 3792, + "vm_loss": 0.1892 + }, + { + "epoch": 0.7301778280434102, + "grad_norm": 2.897848340764509, + "learning_rate": 1.466612170909127e-05, + "loss": 1.8876, + "step": 3793 + }, + { + "epoch": 0.7303703347209857, + "grad_norm": 2.855486079063728, + "learning_rate": 1.4663363821142784e-05, + "loss": 1.8899, + "step": 3794 + }, + { + "epoch": 0.730562841398561, + "grad_norm": 2.890471014035725, + "learning_rate": 1.466060547985567e-05, + "loss": 1.8745, + "step": 3795 + }, + { + "epoch": 0.7307553480761364, + "grad_norm": 3.120574012481081, + "learning_rate": 1.4657846685498079e-05, + "loss": 1.9247, + "step": 3796 + }, + { + "epoch": 0.7309478547537118, + "grad_norm": 3.1286212358505394, + "learning_rate": 1.4655087438338196e-05, + "loss": 1.9042, + "step": 3797 + }, + { + "epoch": 0.7311403614312871, + "grad_norm": 3.032055545764561, + "learning_rate": 1.465232773864426e-05, + "loss": 1.8474, + "step": 3798 + }, + { + "epoch": 0.7313328681088626, + "grad_norm": 2.961280707772565, + "learning_rate": 1.4649567586684548e-05, + "loss": 1.9814, + "step": 3799 + }, + { + "epoch": 0.7315253747864379, + "grad_norm": 2.825872236590409, + "learning_rate": 1.464680698272738e-05, + "loss": 1.8708, + "step": 3800 + }, + { + "epoch": 0.7315253747864379, + "lm_loss": 1.5248, + "step": 3800, + "vm_loss": 0.1962 + }, + { + "epoch": 0.7315253747864379, + "lm_loss": 2.0103, + "step": 3800, + "vm_loss": 0.1868 + }, + { + "epoch": 0.7315253747864379, + "lm_loss": 1.3539, + "step": 3800, + "vm_loss": 0.0982 + }, + { + "epoch": 0.7315253747864379, + "lm_loss": 1.7909, + "step": 3800, + "vm_loss": 0.1917 + }, + { + "epoch": 0.7315253747864379, + "lm_loss": 1.8984, + "step": 3800, + "vm_loss": 0.2291 + }, + { + "epoch": 0.7315253747864379, + "lm_loss": 1.7355, + "step": 3800, + "vm_loss": 0.1632 + }, + { + "epoch": 0.7315253747864379, + "lm_loss": 1.7069, + "step": 3800, + "vm_loss": 0.1441 + }, + { + "epoch": 0.7315253747864379, + "lm_loss": 1.6806, + "step": 3800, + "vm_loss": 0.2131 + }, + { + "epoch": 0.7317178814640133, + "grad_norm": 3.210465647848649, + "learning_rate": 1.4644045927041124e-05, + "loss": 1.8633, + "step": 3801 + }, + { + "epoch": 0.7319103881415887, + "grad_norm": 3.1170651130741187, + "learning_rate": 1.464128441989419e-05, + "loss": 1.859, + "step": 3802 + }, + { + "epoch": 0.732102894819164, + "grad_norm": 2.8536383287067286, + "learning_rate": 1.463852246155503e-05, + "loss": 1.8636, + "step": 3803 + }, + { + "epoch": 0.7322954014967394, + "grad_norm": 3.0796702239268665, + "learning_rate": 1.463576005229214e-05, + "loss": 1.8913, + "step": 3804 + }, + { + "epoch": 0.7324879081743148, + "grad_norm": 3.3263877220154847, + "learning_rate": 1.463299719237407e-05, + "loss": 1.8516, + "step": 3805 + }, + { + "epoch": 0.7326804148518902, + "grad_norm": 3.375887425072847, + "learning_rate": 1.4630233882069396e-05, + "loss": 1.859, + "step": 3806 + }, + { + "epoch": 0.7328729215294656, + "grad_norm": 2.9099849025767135, + "learning_rate": 1.462747012164675e-05, + "loss": 1.8462, + "step": 3807 + }, + { + "epoch": 0.7330654282070409, + "grad_norm": 2.8266158078709815, + "learning_rate": 1.4624705911374808e-05, + "loss": 1.8635, + "step": 3808 + }, + { + "epoch": 0.7330654282070409, + "lm_loss": 1.4052, + "step": 3808, + "vm_loss": 0.1464 + }, + { + "epoch": 0.7330654282070409, + "lm_loss": 1.5563, + "step": 3808, + "vm_loss": 0.24 + }, + { + "epoch": 0.7330654282070409, + "lm_loss": 1.9506, + "step": 3808, + "vm_loss": 0.1781 + }, + { + "epoch": 0.7330654282070409, + "lm_loss": 2.1204, + "step": 3808, + "vm_loss": 0.223 + }, + { + "epoch": 0.7330654282070409, + "lm_loss": 1.6784, + "step": 3808, + "vm_loss": 0.1507 + }, + { + "epoch": 0.7330654282070409, + "lm_loss": 1.585, + "step": 3808, + "vm_loss": 0.1549 + }, + { + "epoch": 0.7330654282070409, + "lm_loss": 1.5776, + "step": 3808, + "vm_loss": 0.1608 + }, + { + "epoch": 0.7330654282070409, + "lm_loss": 1.858, + "step": 3808, + "vm_loss": 0.1993 + }, + { + "epoch": 0.7332579348846163, + "grad_norm": 3.1287871877030957, + "learning_rate": 1.462194125152228e-05, + "loss": 1.9462, + "step": 3809 + }, + { + "epoch": 0.7334504415621916, + "grad_norm": 2.783954028897125, + "learning_rate": 1.4619176142357936e-05, + "loss": 1.9007, + "step": 3810 + }, + { + "epoch": 0.7336429482397671, + "grad_norm": 3.04246002898484, + "learning_rate": 1.4616410584150572e-05, + "loss": 1.9307, + "step": 3811 + }, + { + "epoch": 0.7338354549173425, + "grad_norm": 2.8826782971176583, + "learning_rate": 1.4613644577169035e-05, + "loss": 1.8615, + "step": 3812 + }, + { + "epoch": 0.7340279615949178, + "grad_norm": 2.910438167504957, + "learning_rate": 1.4610878121682223e-05, + "loss": 1.8768, + "step": 3813 + }, + { + "epoch": 0.7342204682724932, + "grad_norm": 2.898400065854732, + "learning_rate": 1.4608111217959067e-05, + "loss": 1.8516, + "step": 3814 + }, + { + "epoch": 0.7344129749500686, + "grad_norm": 3.0642227593692035, + "learning_rate": 1.4605343866268544e-05, + "loss": 1.9445, + "step": 3815 + }, + { + "epoch": 0.7346054816276439, + "grad_norm": 2.850021427137016, + "learning_rate": 1.460257606687968e-05, + "loss": 1.8347, + "step": 3816 + }, + { + "epoch": 0.7346054816276439, + "lm_loss": 1.6676, + "step": 3816, + "vm_loss": 0.154 + }, + { + "epoch": 0.7346054816276439, + "lm_loss": 1.7034, + "step": 3816, + "vm_loss": 0.1732 + }, + { + "epoch": 0.7346054816276439, + "lm_loss": 1.8896, + "step": 3816, + "vm_loss": 0.1987 + }, + { + "epoch": 0.7346054816276439, + "lm_loss": 1.8827, + "step": 3816, + "vm_loss": 0.1484 + }, + { + "epoch": 0.7346054816276439, + "lm_loss": 1.7872, + "step": 3816, + "vm_loss": 0.1964 + }, + { + "epoch": 0.7346054816276439, + "lm_loss": 1.9646, + "step": 3816, + "vm_loss": 0.1574 + }, + { + "epoch": 0.7346054816276439, + "lm_loss": 1.5278, + "step": 3816, + "vm_loss": 0.1654 + }, + { + "epoch": 0.7346054816276439, + "lm_loss": 1.2605, + "step": 3816, + "vm_loss": 0.1603 + }, + { + "epoch": 0.7347979883052194, + "grad_norm": 2.9055510497597026, + "learning_rate": 1.4599807820061536e-05, + "loss": 1.8787, + "step": 3817 + }, + { + "epoch": 0.7349904949827947, + "grad_norm": 2.971811352824652, + "learning_rate": 1.4597039126083224e-05, + "loss": 1.8528, + "step": 3818 + }, + { + "epoch": 0.7351830016603701, + "grad_norm": 2.9539883288609943, + "learning_rate": 1.4594269985213893e-05, + "loss": 1.8676, + "step": 3819 + }, + { + "epoch": 0.7353755083379455, + "grad_norm": 2.9868512124543294, + "learning_rate": 1.4591500397722746e-05, + "loss": 1.8643, + "step": 3820 + }, + { + "epoch": 0.7355680150155208, + "grad_norm": 3.101406742443786, + "learning_rate": 1.4588730363879015e-05, + "loss": 1.9055, + "step": 3821 + }, + { + "epoch": 0.7357605216930962, + "grad_norm": 2.9113109890318625, + "learning_rate": 1.458595988395199e-05, + "loss": 1.9581, + "step": 3822 + }, + { + "epoch": 0.7359530283706716, + "grad_norm": 2.8932218278534862, + "learning_rate": 1.4583188958210987e-05, + "loss": 1.9013, + "step": 3823 + }, + { + "epoch": 0.736145535048247, + "grad_norm": 2.8870778768085663, + "learning_rate": 1.4580417586925384e-05, + "loss": 1.9068, + "step": 3824 + }, + { + "epoch": 0.736145535048247, + "lm_loss": 1.8975, + "step": 3824, + "vm_loss": 0.156 + }, + { + "epoch": 0.736145535048247, + "lm_loss": 1.419, + "step": 3824, + "vm_loss": 0.2197 + }, + { + "epoch": 0.736145535048247, + "lm_loss": 1.5307, + "step": 3824, + "vm_loss": 0.1859 + }, + { + "epoch": 0.736145535048247, + "lm_loss": 2.175, + "step": 3824, + "vm_loss": 0.177 + }, + { + "epoch": 0.736145535048247, + "lm_loss": 2.3087, + "step": 3824, + "vm_loss": 0.1568 + }, + { + "epoch": 0.736145535048247, + "lm_loss": 1.5409, + "step": 3824, + "vm_loss": 0.1606 + }, + { + "epoch": 0.736145535048247, + "lm_loss": 1.7161, + "step": 3824, + "vm_loss": 0.1939 + }, + { + "epoch": 0.736145535048247, + "lm_loss": 1.7306, + "step": 3824, + "vm_loss": 0.1609 + }, + { + "epoch": 0.7363380417258224, + "grad_norm": 2.8516293532175463, + "learning_rate": 1.457764577036459e-05, + "loss": 1.9099, + "step": 3825 + }, + { + "epoch": 0.7365305484033977, + "grad_norm": 3.1458646036580364, + "learning_rate": 1.4574873508798063e-05, + "loss": 1.8971, + "step": 3826 + }, + { + "epoch": 0.7367230550809731, + "grad_norm": 2.9776755579716236, + "learning_rate": 1.4572100802495302e-05, + "loss": 1.8959, + "step": 3827 + }, + { + "epoch": 0.7369155617585486, + "grad_norm": 2.9628437232221123, + "learning_rate": 1.4569327651725847e-05, + "loss": 1.8474, + "step": 3828 + }, + { + "epoch": 0.7371080684361239, + "grad_norm": 2.910023063689976, + "learning_rate": 1.4566554056759286e-05, + "loss": 1.8876, + "step": 3829 + }, + { + "epoch": 0.7373005751136993, + "grad_norm": 2.846094873646071, + "learning_rate": 1.4563780017865248e-05, + "loss": 1.8749, + "step": 3830 + }, + { + "epoch": 0.7374930817912746, + "grad_norm": 2.942425683678951, + "learning_rate": 1.4561005535313402e-05, + "loss": 1.88, + "step": 3831 + }, + { + "epoch": 0.73768558846885, + "grad_norm": 2.8362740868832317, + "learning_rate": 1.4558230609373469e-05, + "loss": 1.9019, + "step": 3832 + }, + { + "epoch": 0.73768558846885, + "lm_loss": 1.9086, + "step": 3832, + "vm_loss": 0.2219 + }, + { + "epoch": 0.73768558846885, + "lm_loss": 1.8401, + "step": 3832, + "vm_loss": 0.2106 + }, + { + "epoch": 0.73768558846885, + "lm_loss": 1.2908, + "step": 3832, + "vm_loss": 0.1626 + }, + { + "epoch": 0.73768558846885, + "lm_loss": 1.7236, + "step": 3832, + "vm_loss": 0.1925 + }, + { + "epoch": 0.73768558846885, + "lm_loss": 1.5062, + "step": 3832, + "vm_loss": 0.1818 + }, + { + "epoch": 0.73768558846885, + "lm_loss": 2.0799, + "step": 3832, + "vm_loss": 0.178 + }, + { + "epoch": 0.73768558846885, + "lm_loss": 2.094, + "step": 3832, + "vm_loss": 0.1427 + }, + { + "epoch": 0.73768558846885, + "lm_loss": 1.2925, + "step": 3832, + "vm_loss": 0.1776 + }, + { + "epoch": 0.7378780951464254, + "grad_norm": 3.1971256387533082, + "learning_rate": 1.4555455240315202e-05, + "loss": 1.8861, + "step": 3833 + }, + { + "epoch": 0.7380706018240007, + "grad_norm": 2.881995289066406, + "learning_rate": 1.4552679428408402e-05, + "loss": 1.8704, + "step": 3834 + }, + { + "epoch": 0.7382631085015762, + "grad_norm": 2.9762213116119525, + "learning_rate": 1.4549903173922921e-05, + "loss": 1.9293, + "step": 3835 + }, + { + "epoch": 0.7384556151791515, + "grad_norm": 2.9306629177899377, + "learning_rate": 1.4547126477128637e-05, + "loss": 1.8696, + "step": 3836 + }, + { + "epoch": 0.7386481218567269, + "grad_norm": 3.077156437479805, + "learning_rate": 1.4544349338295487e-05, + "loss": 1.8942, + "step": 3837 + }, + { + "epoch": 0.7388406285343023, + "grad_norm": 3.164571209751457, + "learning_rate": 1.454157175769344e-05, + "loss": 1.922, + "step": 3838 + }, + { + "epoch": 0.7390331352118776, + "grad_norm": 2.8618423982224286, + "learning_rate": 1.4538793735592518e-05, + "loss": 1.8946, + "step": 3839 + }, + { + "epoch": 0.7392256418894531, + "grad_norm": 3.0244413345367693, + "learning_rate": 1.4536015272262773e-05, + "loss": 1.9509, + "step": 3840 + }, + { + "epoch": 0.7392256418894531, + "lm_loss": 2.3827, + "step": 3840, + "vm_loss": 0.1314 + }, + { + "epoch": 0.7392256418894531, + "lm_loss": 1.9091, + "step": 3840, + "vm_loss": 0.2034 + }, + { + "epoch": 0.7392256418894531, + "lm_loss": 1.7933, + "step": 3840, + "vm_loss": 0.1257 + }, + { + "epoch": 0.7392256418894531, + "lm_loss": 1.955, + "step": 3840, + "vm_loss": 0.1997 + }, + { + "epoch": 0.7392256418894531, + "lm_loss": 1.9445, + "step": 3840, + "vm_loss": 0.2062 + }, + { + "epoch": 0.7392256418894531, + "lm_loss": 2.1779, + "step": 3840, + "vm_loss": 0.1818 + }, + { + "epoch": 0.7392256418894531, + "lm_loss": 1.8821, + "step": 3840, + "vm_loss": 0.1621 + }, + { + "epoch": 0.7392256418894531, + "lm_loss": 1.8644, + "step": 3840, + "vm_loss": 0.1561 + }, + { + "epoch": 0.7394181485670284, + "grad_norm": 3.139005253671557, + "learning_rate": 1.4533236367974313e-05, + "loss": 1.8266, + "step": 3841 + }, + { + "epoch": 0.7396106552446038, + "grad_norm": 3.0658324622634305, + "learning_rate": 1.4530457022997282e-05, + "loss": 1.8826, + "step": 3842 + }, + { + "epoch": 0.7398031619221792, + "grad_norm": 2.971648309669812, + "learning_rate": 1.4527677237601865e-05, + "loss": 1.8877, + "step": 3843 + }, + { + "epoch": 0.7399956685997545, + "grad_norm": 3.111543085055416, + "learning_rate": 1.4524897012058297e-05, + "loss": 1.8498, + "step": 3844 + }, + { + "epoch": 0.7401881752773299, + "grad_norm": 2.95760332318267, + "learning_rate": 1.4522116346636846e-05, + "loss": 1.8323, + "step": 3845 + }, + { + "epoch": 0.7403806819549054, + "grad_norm": 2.8049768098510253, + "learning_rate": 1.4519335241607833e-05, + "loss": 1.8212, + "step": 3846 + }, + { + "epoch": 0.7405731886324807, + "grad_norm": 2.8587699703344644, + "learning_rate": 1.4516553697241615e-05, + "loss": 1.87, + "step": 3847 + }, + { + "epoch": 0.7407656953100561, + "grad_norm": 2.978140868099243, + "learning_rate": 1.4513771713808594e-05, + "loss": 1.9335, + "step": 3848 + }, + { + "epoch": 0.7407656953100561, + "lm_loss": 1.6118, + "step": 3848, + "vm_loss": 0.202 + }, + { + "epoch": 0.7407656953100561, + "lm_loss": 1.3833, + "step": 3848, + "vm_loss": 0.1463 + }, + { + "epoch": 0.7407656953100561, + "lm_loss": 1.0515, + "step": 3848, + "vm_loss": 0.1469 + }, + { + "epoch": 0.7407656953100561, + "lm_loss": 1.2491, + "step": 3848, + "vm_loss": 0.1632 + }, + { + "epoch": 0.7407656953100561, + "lm_loss": 1.8356, + "step": 3848, + "vm_loss": 0.1873 + }, + { + "epoch": 0.7407656953100561, + "lm_loss": 2.0404, + "step": 3848, + "vm_loss": 0.1752 + }, + { + "epoch": 0.7407656953100561, + "lm_loss": 1.4947, + "step": 3848, + "vm_loss": 0.1684 + }, + { + "epoch": 0.7407656953100561, + "lm_loss": 1.6058, + "step": 3848, + "vm_loss": 0.1189 + }, + { + "epoch": 0.7409582019876314, + "grad_norm": 2.9531610415798357, + "learning_rate": 1.4510989291579214e-05, + "loss": 1.883, + "step": 3849 + }, + { + "epoch": 0.7411507086652068, + "grad_norm": 3.157199596048571, + "learning_rate": 1.4508206430823963e-05, + "loss": 1.9209, + "step": 3850 + }, + { + "epoch": 0.7413432153427822, + "grad_norm": 3.053226852390783, + "learning_rate": 1.4505423131813371e-05, + "loss": 1.8842, + "step": 3851 + }, + { + "epoch": 0.7415357220203576, + "grad_norm": 3.2702361176148953, + "learning_rate": 1.4502639394818007e-05, + "loss": 1.8606, + "step": 3852 + }, + { + "epoch": 0.741728228697933, + "grad_norm": 3.104752763861082, + "learning_rate": 1.449985522010849e-05, + "loss": 1.8445, + "step": 3853 + }, + { + "epoch": 0.7419207353755083, + "grad_norm": 2.9520776078384436, + "learning_rate": 1.4497070607955477e-05, + "loss": 1.9598, + "step": 3854 + }, + { + "epoch": 0.7421132420530837, + "grad_norm": 3.0018615102635247, + "learning_rate": 1.4494285558629664e-05, + "loss": 1.8486, + "step": 3855 + }, + { + "epoch": 0.7423057487306591, + "grad_norm": 3.091778112936236, + "learning_rate": 1.4491500072401796e-05, + "loss": 1.9188, + "step": 3856 + }, + { + "epoch": 0.7423057487306591, + "lm_loss": 2.0952, + "step": 3856, + "vm_loss": 0.205 + }, + { + "epoch": 0.7423057487306591, + "lm_loss": 1.563, + "step": 3856, + "vm_loss": 0.1831 + }, + { + "epoch": 0.7423057487306591, + "lm_loss": 1.8727, + "step": 3856, + "vm_loss": 0.2032 + }, + { + "epoch": 0.7423057487306591, + "lm_loss": 1.4588, + "step": 3856, + "vm_loss": 0.1934 + }, + { + "epoch": 0.7423057487306591, + "lm_loss": 1.7593, + "step": 3856, + "vm_loss": 0.188 + }, + { + "epoch": 0.7423057487306591, + "lm_loss": 1.4849, + "step": 3856, + "vm_loss": 0.1635 + }, + { + "epoch": 0.7423057487306591, + "lm_loss": 0.7698, + "step": 3856, + "vm_loss": 0.1527 + }, + { + "epoch": 0.7423057487306591, + "lm_loss": 1.7999, + "step": 3856, + "vm_loss": 0.2298 + }, + { + "epoch": 0.7424982554082344, + "grad_norm": 3.2231424573590006, + "learning_rate": 1.448871414954266e-05, + "loss": 1.8714, + "step": 3857 + }, + { + "epoch": 0.7426907620858099, + "grad_norm": 3.0774949500867144, + "learning_rate": 1.448592779032308e-05, + "loss": 1.9052, + "step": 3858 + }, + { + "epoch": 0.7428832687633853, + "grad_norm": 2.8867989946937986, + "learning_rate": 1.4483140995013924e-05, + "loss": 1.8708, + "step": 3859 + }, + { + "epoch": 0.7430757754409606, + "grad_norm": 3.187271409887001, + "learning_rate": 1.448035376388611e-05, + "loss": 1.8576, + "step": 3860 + }, + { + "epoch": 0.743268282118536, + "grad_norm": 3.016348859284071, + "learning_rate": 1.4477566097210592e-05, + "loss": 1.8046, + "step": 3861 + }, + { + "epoch": 0.7434607887961113, + "grad_norm": 2.9555569253122562, + "learning_rate": 1.4474777995258363e-05, + "loss": 1.8916, + "step": 3862 + }, + { + "epoch": 0.7436532954736867, + "grad_norm": 2.8893921198800854, + "learning_rate": 1.4471989458300462e-05, + "loss": 1.9013, + "step": 3863 + }, + { + "epoch": 0.7438458021512622, + "grad_norm": 2.8337064941075525, + "learning_rate": 1.4469200486607972e-05, + "loss": 1.9004, + "step": 3864 + }, + { + "epoch": 0.7438458021512622, + "lm_loss": 1.9101, + "step": 3864, + "vm_loss": 0.193 + }, + { + "epoch": 0.7438458021512622, + "lm_loss": 1.2254, + "step": 3864, + "vm_loss": 0.1945 + }, + { + "epoch": 0.7438458021512622, + "lm_loss": 1.6179, + "step": 3864, + "vm_loss": 0.1803 + }, + { + "epoch": 0.7438458021512622, + "lm_loss": 1.4599, + "step": 3864, + "vm_loss": 0.1349 + }, + { + "epoch": 0.7438458021512622, + "lm_loss": 1.709, + "step": 3864, + "vm_loss": 0.1001 + }, + { + "epoch": 0.7438458021512622, + "lm_loss": 2.0548, + "step": 3864, + "vm_loss": 0.1215 + }, + { + "epoch": 0.7438458021512622, + "lm_loss": 1.8772, + "step": 3864, + "vm_loss": 0.1198 + }, + { + "epoch": 0.7438458021512622, + "lm_loss": 1.6035, + "step": 3864, + "vm_loss": 0.1123 + }, + { + "epoch": 0.7440383088288375, + "grad_norm": 2.813384024534517, + "learning_rate": 1.4466411080452019e-05, + "loss": 1.8162, + "step": 3865 + }, + { + "epoch": 0.7442308155064129, + "grad_norm": 2.9652358880743184, + "learning_rate": 1.4463621240103768e-05, + "loss": 1.8161, + "step": 3866 + }, + { + "epoch": 0.7444233221839882, + "grad_norm": 2.9258324606987824, + "learning_rate": 1.4460830965834425e-05, + "loss": 1.8261, + "step": 3867 + }, + { + "epoch": 0.7446158288615636, + "grad_norm": 3.2039421090786178, + "learning_rate": 1.445804025791524e-05, + "loss": 1.8737, + "step": 3868 + }, + { + "epoch": 0.744808335539139, + "grad_norm": 2.9797814709557016, + "learning_rate": 1.4455249116617506e-05, + "loss": 1.8664, + "step": 3869 + }, + { + "epoch": 0.7450008422167144, + "grad_norm": 2.849677007744024, + "learning_rate": 1.4452457542212561e-05, + "loss": 1.8846, + "step": 3870 + }, + { + "epoch": 0.7451933488942898, + "grad_norm": 2.9659264675401205, + "learning_rate": 1.444966553497178e-05, + "loss": 1.9239, + "step": 3871 + }, + { + "epoch": 0.7453858555718651, + "grad_norm": 2.904541785937833, + "learning_rate": 1.4446873095166578e-05, + "loss": 1.8107, + "step": 3872 + }, + { + "epoch": 0.7453858555718651, + "lm_loss": 1.1513, + "step": 3872, + "vm_loss": 0.1566 + }, + { + "epoch": 0.7453858555718651, + "lm_loss": 1.2626, + "step": 3872, + "vm_loss": 0.2322 + }, + { + "epoch": 0.7453858555718651, + "lm_loss": 1.163, + "step": 3872, + "vm_loss": 0.232 + }, + { + "epoch": 0.7453858555718651, + "lm_loss": 1.6007, + "step": 3872, + "vm_loss": 0.1843 + }, + { + "epoch": 0.7453858555718651, + "lm_loss": 1.802, + "step": 3872, + "vm_loss": 0.1467 + }, + { + "epoch": 0.7453858555718651, + "lm_loss": 1.5019, + "step": 3872, + "vm_loss": 0.0985 + }, + { + "epoch": 0.7453858555718651, + "lm_loss": 1.6935, + "step": 3872, + "vm_loss": 0.1963 + }, + { + "epoch": 0.7453858555718651, + "lm_loss": 1.921, + "step": 3872, + "vm_loss": 0.1714 + }, + { + "epoch": 0.7455783622494405, + "grad_norm": 2.9990768639811685, + "learning_rate": 1.4444080223068424e-05, + "loss": 1.8768, + "step": 3873 + }, + { + "epoch": 0.7457708689270159, + "grad_norm": 3.1413771295448405, + "learning_rate": 1.4441286918948811e-05, + "loss": 1.8594, + "step": 3874 + }, + { + "epoch": 0.7459633756045913, + "grad_norm": 3.0373578971492057, + "learning_rate": 1.443849318307929e-05, + "loss": 1.8458, + "step": 3875 + }, + { + "epoch": 0.7461558822821667, + "grad_norm": 2.944574147954027, + "learning_rate": 1.4435699015731449e-05, + "loss": 1.8835, + "step": 3876 + }, + { + "epoch": 0.7463483889597421, + "grad_norm": 3.14360339600945, + "learning_rate": 1.4432904417176911e-05, + "loss": 1.9239, + "step": 3877 + }, + { + "epoch": 0.7465408956373174, + "grad_norm": 2.877200407612022, + "learning_rate": 1.4430109387687352e-05, + "loss": 1.8334, + "step": 3878 + }, + { + "epoch": 0.7467334023148928, + "grad_norm": 2.9756564385533437, + "learning_rate": 1.4427313927534483e-05, + "loss": 1.7689, + "step": 3879 + }, + { + "epoch": 0.7469259089924681, + "grad_norm": 2.904140260457494, + "learning_rate": 1.4424518036990059e-05, + "loss": 1.9144, + "step": 3880 + }, + { + "epoch": 0.7469259089924681, + "lm_loss": 2.1963, + "step": 3880, + "vm_loss": 0.1126 + }, + { + "epoch": 0.7469259089924681, + "lm_loss": 1.3887, + "step": 3880, + "vm_loss": 0.1313 + }, + { + "epoch": 0.7469259089924681, + "lm_loss": 0.936, + "step": 3880, + "vm_loss": 0.1762 + }, + { + "epoch": 0.7469259089924681, + "lm_loss": 2.0121, + "step": 3880, + "vm_loss": 0.1302 + }, + { + "epoch": 0.7469259089924681, + "lm_loss": 1.6809, + "step": 3880, + "vm_loss": 0.148 + }, + { + "epoch": 0.7469259089924681, + "lm_loss": 1.8965, + "step": 3880, + "vm_loss": 0.1147 + }, + { + "epoch": 0.7469259089924681, + "lm_loss": 2.0161, + "step": 3880, + "vm_loss": 0.1948 + }, + { + "epoch": 0.7469259089924681, + "lm_loss": 1.5197, + "step": 3880, + "vm_loss": 0.1543 + }, + { + "epoch": 0.7471184156700436, + "grad_norm": 2.912253714852121, + "learning_rate": 1.4421721716325874e-05, + "loss": 1.8549, + "step": 3881 + }, + { + "epoch": 0.747310922347619, + "grad_norm": 2.7843082722882886, + "learning_rate": 1.4418924965813767e-05, + "loss": 1.8798, + "step": 3882 + }, + { + "epoch": 0.7475034290251943, + "grad_norm": 2.820575624027667, + "learning_rate": 1.4416127785725622e-05, + "loss": 1.9084, + "step": 3883 + }, + { + "epoch": 0.7476959357027697, + "grad_norm": 2.8362621376225086, + "learning_rate": 1.4413330176333356e-05, + "loss": 1.8479, + "step": 3884 + }, + { + "epoch": 0.747888442380345, + "grad_norm": 2.914174112542245, + "learning_rate": 1.4410532137908935e-05, + "loss": 1.8347, + "step": 3885 + }, + { + "epoch": 0.7480809490579204, + "grad_norm": 2.9674830287574023, + "learning_rate": 1.440773367072436e-05, + "loss": 1.8337, + "step": 3886 + }, + { + "epoch": 0.7482734557354959, + "grad_norm": 3.235735026248816, + "learning_rate": 1.4404934775051682e-05, + "loss": 1.8931, + "step": 3887 + }, + { + "epoch": 0.7484659624130712, + "grad_norm": 3.012888010875251, + "learning_rate": 1.440213545116299e-05, + "loss": 1.8514, + "step": 3888 + }, + { + "epoch": 0.7484659624130712, + "lm_loss": 1.8733, + "step": 3888, + "vm_loss": 0.1818 + }, + { + "epoch": 0.7484659624130712, + "lm_loss": 1.6941, + "step": 3888, + "vm_loss": 0.2008 + }, + { + "epoch": 0.7484659624130712, + "lm_loss": 1.5611, + "step": 3888, + "vm_loss": 0.1836 + }, + { + "epoch": 0.7484659624130712, + "lm_loss": 1.9527, + "step": 3888, + "vm_loss": 0.1701 + }, + { + "epoch": 0.7484659624130712, + "lm_loss": 2.2498, + "step": 3888, + "vm_loss": 0.1894 + }, + { + "epoch": 0.7484659624130712, + "lm_loss": 1.5923, + "step": 3888, + "vm_loss": 0.1612 + }, + { + "epoch": 0.7484659624130712, + "lm_loss": 2.1298, + "step": 3888, + "vm_loss": 0.1834 + }, + { + "epoch": 0.7484659624130712, + "lm_loss": 1.4539, + "step": 3888, + "vm_loss": 0.2041 + }, + { + "epoch": 0.7486584690906466, + "grad_norm": 3.0391284543297714, + "learning_rate": 1.4399335699330412e-05, + "loss": 1.9166, + "step": 3889 + }, + { + "epoch": 0.7488509757682219, + "grad_norm": 2.8420235929792352, + "learning_rate": 1.439653551982612e-05, + "loss": 1.8831, + "step": 3890 + }, + { + "epoch": 0.7490434824457973, + "grad_norm": 2.8722009357184386, + "learning_rate": 1.4393734912922326e-05, + "loss": 1.8296, + "step": 3891 + }, + { + "epoch": 0.7492359891233727, + "grad_norm": 2.9941890579370383, + "learning_rate": 1.4390933878891286e-05, + "loss": 1.8771, + "step": 3892 + }, + { + "epoch": 0.7494284958009481, + "grad_norm": 3.2520967650133987, + "learning_rate": 1.4388132418005299e-05, + "loss": 1.868, + "step": 3893 + }, + { + "epoch": 0.7496210024785235, + "grad_norm": 3.2932001252044913, + "learning_rate": 1.43853305305367e-05, + "loss": 1.8571, + "step": 3894 + }, + { + "epoch": 0.7498135091560989, + "grad_norm": 2.9912662393601996, + "learning_rate": 1.4382528216757867e-05, + "loss": 1.8718, + "step": 3895 + }, + { + "epoch": 0.7500060158336742, + "grad_norm": 3.029563285262763, + "learning_rate": 1.4379725476941222e-05, + "loss": 1.867, + "step": 3896 + }, + { + "epoch": 0.7500060158336742, + "lm_loss": 1.4198, + "step": 3896, + "vm_loss": 0.2212 + }, + { + "epoch": 0.7500060158336742, + "lm_loss": 1.8307, + "step": 3896, + "vm_loss": 0.1413 + }, + { + "epoch": 0.7500060158336742, + "lm_loss": 1.3123, + "step": 3896, + "vm_loss": 0.185 + }, + { + "epoch": 0.7500060158336742, + "lm_loss": 2.0322, + "step": 3896, + "vm_loss": 0.1737 + }, + { + "epoch": 0.7500060158336742, + "lm_loss": 1.7277, + "step": 3896, + "vm_loss": 0.1633 + }, + { + "epoch": 0.7500060158336742, + "lm_loss": 1.5033, + "step": 3896, + "vm_loss": 0.146 + }, + { + "epoch": 0.7500060158336742, + "lm_loss": 1.7703, + "step": 3896, + "vm_loss": 0.1649 + }, + { + "epoch": 0.7500060158336742, + "lm_loss": 1.7678, + "step": 3896, + "vm_loss": 0.2395 + }, + { + "epoch": 0.7501985225112496, + "grad_norm": 3.031277070056288, + "learning_rate": 1.437692231135923e-05, + "loss": 1.7953, + "step": 3897 + }, + { + "epoch": 0.7503910291888249, + "grad_norm": 3.0646516595423634, + "learning_rate": 1.4374118720284388e-05, + "loss": 1.8635, + "step": 3898 + }, + { + "epoch": 0.7505835358664004, + "grad_norm": 3.094125218287347, + "learning_rate": 1.437131470398925e-05, + "loss": 1.885, + "step": 3899 + }, + { + "epoch": 0.7507760425439758, + "grad_norm": 3.0173812164233795, + "learning_rate": 1.4368510262746393e-05, + "loss": 1.8942, + "step": 3900 + }, + { + "epoch": 0.7509685492215511, + "grad_norm": 3.1860832458680757, + "learning_rate": 1.4365705396828452e-05, + "loss": 1.8622, + "step": 3901 + }, + { + "epoch": 0.7511610558991265, + "grad_norm": 3.126314852927878, + "learning_rate": 1.4362900106508094e-05, + "loss": 1.8794, + "step": 3902 + }, + { + "epoch": 0.7513535625767018, + "grad_norm": 3.0744961197882485, + "learning_rate": 1.4360094392058024e-05, + "loss": 1.8516, + "step": 3903 + }, + { + "epoch": 0.7515460692542772, + "grad_norm": 2.9634641082677713, + "learning_rate": 1.4357288253750996e-05, + "loss": 1.9332, + "step": 3904 + }, + { + "epoch": 0.7515460692542772, + "lm_loss": 1.2288, + "step": 3904, + "vm_loss": 0.1788 + }, + { + "epoch": 0.7515460692542772, + "lm_loss": 1.4124, + "step": 3904, + "vm_loss": 0.1998 + }, + { + "epoch": 0.7515460692542772, + "lm_loss": 2.2023, + "step": 3904, + "vm_loss": 0.1357 + }, + { + "epoch": 0.7515460692542772, + "lm_loss": 2.0974, + "step": 3904, + "vm_loss": 0.1758 + }, + { + "epoch": 0.7515460692542772, + "lm_loss": 1.9227, + "step": 3904, + "vm_loss": 0.1381 + }, + { + "epoch": 0.7515460692542772, + "lm_loss": 2.2358, + "step": 3904, + "vm_loss": 0.1914 + }, + { + "epoch": 0.7515460692542772, + "lm_loss": 1.5846, + "step": 3904, + "vm_loss": 0.193 + }, + { + "epoch": 0.7515460692542772, + "lm_loss": 1.7607, + "step": 3904, + "vm_loss": 0.2853 + }, + { + "epoch": 0.7517385759318527, + "grad_norm": 2.978005621161641, + "learning_rate": 1.4354481691859809e-05, + "loss": 1.9035, + "step": 3905 + }, + { + "epoch": 0.751931082609428, + "grad_norm": 3.0099502006903487, + "learning_rate": 1.4351674706657288e-05, + "loss": 1.9248, + "step": 3906 + }, + { + "epoch": 0.7521235892870034, + "grad_norm": 2.9986193380978032, + "learning_rate": 1.4348867298416313e-05, + "loss": 1.8944, + "step": 3907 + }, + { + "epoch": 0.7523160959645788, + "grad_norm": 2.9627537913807522, + "learning_rate": 1.4346059467409798e-05, + "loss": 1.8015, + "step": 3908 + }, + { + "epoch": 0.7525086026421541, + "grad_norm": 3.2489308260690675, + "learning_rate": 1.4343251213910698e-05, + "loss": 1.7759, + "step": 3909 + }, + { + "epoch": 0.7527011093197296, + "grad_norm": 3.1126730832924987, + "learning_rate": 1.4340442538192018e-05, + "loss": 1.8406, + "step": 3910 + }, + { + "epoch": 0.7528936159973049, + "grad_norm": 2.972358642088698, + "learning_rate": 1.4337633440526791e-05, + "loss": 1.901, + "step": 3911 + }, + { + "epoch": 0.7530861226748803, + "grad_norm": 2.95847612504498, + "learning_rate": 1.4334823921188098e-05, + "loss": 1.869, + "step": 3912 + }, + { + "epoch": 0.7530861226748803, + "lm_loss": 1.6565, + "step": 3912, + "vm_loss": 0.2607 + }, + { + "epoch": 0.7530861226748803, + "lm_loss": 1.7178, + "step": 3912, + "vm_loss": 0.1615 + }, + { + "epoch": 0.7530861226748803, + "lm_loss": 2.1061, + "step": 3912, + "vm_loss": 0.1825 + }, + { + "epoch": 0.7530861226748803, + "lm_loss": 2.3604, + "step": 3912, + "vm_loss": 0.2185 + }, + { + "epoch": 0.7530861226748803, + "lm_loss": 1.7426, + "step": 3912, + "vm_loss": 0.1672 + }, + { + "epoch": 0.7530861226748803, + "lm_loss": 1.8165, + "step": 3912, + "vm_loss": 0.1569 + }, + { + "epoch": 0.7530861226748803, + "lm_loss": 1.9397, + "step": 3912, + "vm_loss": 0.1572 + }, + { + "epoch": 0.7530861226748803, + "lm_loss": 1.2593, + "step": 3912, + "vm_loss": 0.157 + }, + { + "epoch": 0.7532786293524557, + "grad_norm": 3.089997570161952, + "learning_rate": 1.4332013980449063e-05, + "loss": 1.8888, + "step": 3913 + }, + { + "epoch": 0.753471136030031, + "grad_norm": 2.948105995137009, + "learning_rate": 1.4329203618582845e-05, + "loss": 1.8735, + "step": 3914 + }, + { + "epoch": 0.7536636427076064, + "grad_norm": 2.9142879797968564, + "learning_rate": 1.4326392835862648e-05, + "loss": 1.8487, + "step": 3915 + }, + { + "epoch": 0.7538561493851818, + "grad_norm": 2.833880698658168, + "learning_rate": 1.4323581632561716e-05, + "loss": 1.8807, + "step": 3916 + }, + { + "epoch": 0.7540486560627572, + "grad_norm": 3.0335384496629163, + "learning_rate": 1.4320770008953338e-05, + "loss": 1.8224, + "step": 3917 + }, + { + "epoch": 0.7542411627403326, + "grad_norm": 3.352566358195461, + "learning_rate": 1.4317957965310834e-05, + "loss": 1.8276, + "step": 3918 + }, + { + "epoch": 0.7544336694179079, + "grad_norm": 3.091554004826876, + "learning_rate": 1.4315145501907575e-05, + "loss": 1.8271, + "step": 3919 + }, + { + "epoch": 0.7546261760954833, + "grad_norm": 3.2014966592167564, + "learning_rate": 1.4312332619016964e-05, + "loss": 1.7813, + "step": 3920 + }, + { + "epoch": 0.7546261760954833, + "lm_loss": 1.5071, + "step": 3920, + "vm_loss": 0.1733 + }, + { + "epoch": 0.7546261760954833, + "lm_loss": 1.4232, + "step": 3920, + "vm_loss": 0.1899 + }, + { + "epoch": 0.7546261760954833, + "lm_loss": 1.5671, + "step": 3920, + "vm_loss": 0.2175 + }, + { + "epoch": 0.7546261760954833, + "lm_loss": 1.7572, + "step": 3920, + "vm_loss": 0.1956 + }, + { + "epoch": 0.7546261760954833, + "lm_loss": 1.9343, + "step": 3920, + "vm_loss": 0.15 + }, + { + "epoch": 0.7546261760954833, + "lm_loss": 1.7613, + "step": 3920, + "vm_loss": 0.1473 + }, + { + "epoch": 0.7546261760954833, + "lm_loss": 1.9382, + "step": 3920, + "vm_loss": 0.1503 + }, + { + "epoch": 0.7546261760954833, + "lm_loss": 1.8083, + "step": 3920, + "vm_loss": 0.2572 + }, + { + "epoch": 0.7548186827730586, + "grad_norm": 2.8945642993549776, + "learning_rate": 1.430951931691245e-05, + "loss": 1.8143, + "step": 3921 + }, + { + "epoch": 0.7550111894506341, + "grad_norm": 3.0719264506978403, + "learning_rate": 1.4306705595867528e-05, + "loss": 1.852, + "step": 3922 + }, + { + "epoch": 0.7552036961282095, + "grad_norm": 3.3224304368249666, + "learning_rate": 1.4303891456155718e-05, + "loss": 1.8697, + "step": 3923 + }, + { + "epoch": 0.7553962028057848, + "grad_norm": 3.1948690992466373, + "learning_rate": 1.4301076898050601e-05, + "loss": 1.8788, + "step": 3924 + }, + { + "epoch": 0.7555887094833602, + "grad_norm": 3.1131279809084775, + "learning_rate": 1.429826192182578e-05, + "loss": 1.9402, + "step": 3925 + }, + { + "epoch": 0.7557812161609356, + "grad_norm": 2.959955782238859, + "learning_rate": 1.429544652775491e-05, + "loss": 1.8269, + "step": 3926 + }, + { + "epoch": 0.7559737228385109, + "grad_norm": 2.9504320946450777, + "learning_rate": 1.4292630716111682e-05, + "loss": 1.9012, + "step": 3927 + }, + { + "epoch": 0.7561662295160864, + "grad_norm": 3.212225613502672, + "learning_rate": 1.428981448716983e-05, + "loss": 1.9018, + "step": 3928 + }, + { + "epoch": 0.7561662295160864, + "lm_loss": 1.6364, + "step": 3928, + "vm_loss": 0.196 + }, + { + "epoch": 0.7561662295160864, + "lm_loss": 1.6102, + "step": 3928, + "vm_loss": 0.115 + }, + { + "epoch": 0.7561662295160864, + "lm_loss": 1.6576, + "step": 3928, + "vm_loss": 0.1622 + }, + { + "epoch": 0.7561662295160864, + "lm_loss": 1.2704, + "step": 3928, + "vm_loss": 0.207 + }, + { + "epoch": 0.7561662295160864, + "lm_loss": 1.701, + "step": 3928, + "vm_loss": 0.1556 + }, + { + "epoch": 0.7561662295160864, + "lm_loss": 1.9657, + "step": 3928, + "vm_loss": 0.1963 + }, + { + "epoch": 0.7561662295160864, + "lm_loss": 1.5115, + "step": 3928, + "vm_loss": 0.1384 + }, + { + "epoch": 0.7561662295160864, + "lm_loss": 2.0605, + "step": 3928, + "vm_loss": 0.2228 + }, + { + "epoch": 0.7563587361936617, + "grad_norm": 3.1072399237488777, + "learning_rate": 1.4286997841203127e-05, + "loss": 1.901, + "step": 3929 + }, + { + "epoch": 0.7565512428712371, + "grad_norm": 2.993995786318162, + "learning_rate": 1.4284180778485388e-05, + "loss": 1.8361, + "step": 3930 + }, + { + "epoch": 0.7567437495488125, + "grad_norm": 2.801774156807687, + "learning_rate": 1.4281363299290466e-05, + "loss": 1.824, + "step": 3931 + }, + { + "epoch": 0.7569362562263878, + "grad_norm": 2.82552839984301, + "learning_rate": 1.4278545403892257e-05, + "loss": 1.8591, + "step": 3932 + }, + { + "epoch": 0.7571287629039632, + "grad_norm": 2.942922458721903, + "learning_rate": 1.4275727092564696e-05, + "loss": 1.8325, + "step": 3933 + }, + { + "epoch": 0.7573212695815386, + "grad_norm": 2.9027549912041546, + "learning_rate": 1.4272908365581758e-05, + "loss": 1.7916, + "step": 3934 + }, + { + "epoch": 0.757513776259114, + "grad_norm": 2.883983175187057, + "learning_rate": 1.427008922321746e-05, + "loss": 1.8013, + "step": 3935 + }, + { + "epoch": 0.7577062829366894, + "grad_norm": 3.143747270042312, + "learning_rate": 1.4267269665745862e-05, + "loss": 1.863, + "step": 3936 + }, + { + "epoch": 0.7577062829366894, + "lm_loss": 1.8463, + "step": 3936, + "vm_loss": 0.1853 + }, + { + "epoch": 0.7577062829366894, + "lm_loss": 1.2841, + "step": 3936, + "vm_loss": 0.1931 + }, + { + "epoch": 0.7577062829366894, + "lm_loss": 1.7682, + "step": 3936, + "vm_loss": 0.1876 + }, + { + "epoch": 0.7577062829366894, + "lm_loss": 1.6743, + "step": 3936, + "vm_loss": 0.1228 + }, + { + "epoch": 0.7577062829366894, + "lm_loss": 1.6931, + "step": 3936, + "vm_loss": 0.1917 + }, + { + "epoch": 0.7577062829366894, + "lm_loss": 1.6102, + "step": 3936, + "vm_loss": 0.2155 + }, + { + "epoch": 0.7577062829366894, + "lm_loss": 1.7691, + "step": 3936, + "vm_loss": 0.2039 + }, + { + "epoch": 0.7577062829366894, + "lm_loss": 1.3207, + "step": 3936, + "vm_loss": 0.1859 + }, + { + "epoch": 0.7578987896142647, + "grad_norm": 3.1518217098390675, + "learning_rate": 1.4264449693441057e-05, + "loss": 1.8609, + "step": 3937 + }, + { + "epoch": 0.7580912962918401, + "grad_norm": 2.967204088171265, + "learning_rate": 1.4261629306577182e-05, + "loss": 1.8501, + "step": 3938 + }, + { + "epoch": 0.7582838029694156, + "grad_norm": 2.896278825295079, + "learning_rate": 1.4258808505428419e-05, + "loss": 1.8782, + "step": 3939 + }, + { + "epoch": 0.7584763096469909, + "grad_norm": 2.951709637457461, + "learning_rate": 1.425598729026898e-05, + "loss": 1.8666, + "step": 3940 + }, + { + "epoch": 0.7586688163245663, + "grad_norm": 2.8596163288953886, + "learning_rate": 1.4253165661373129e-05, + "loss": 1.7891, + "step": 3941 + }, + { + "epoch": 0.7588613230021416, + "grad_norm": 2.9194011470989087, + "learning_rate": 1.4250343619015162e-05, + "loss": 1.8329, + "step": 3942 + }, + { + "epoch": 0.759053829679717, + "grad_norm": 3.2166618225609045, + "learning_rate": 1.4247521163469413e-05, + "loss": 1.9026, + "step": 3943 + }, + { + "epoch": 0.7592463363572924, + "grad_norm": 3.163432110421066, + "learning_rate": 1.4244698295010268e-05, + "loss": 1.8788, + "step": 3944 + }, + { + "epoch": 0.7592463363572924, + "lm_loss": 1.9078, + "step": 3944, + "vm_loss": 0.1972 + }, + { + "epoch": 0.7592463363572924, + "lm_loss": 1.1845, + "step": 3944, + "vm_loss": 0.1337 + }, + { + "epoch": 0.7592463363572924, + "lm_loss": 1.6857, + "step": 3944, + "vm_loss": 0.1821 + }, + { + "epoch": 0.7592463363572924, + "lm_loss": 1.0323, + "step": 3944, + "vm_loss": 0.1652 + }, + { + "epoch": 0.7592463363572924, + "lm_loss": 1.9001, + "step": 3944, + "vm_loss": 0.1562 + }, + { + "epoch": 0.7592463363572924, + "lm_loss": 1.6769, + "step": 3944, + "vm_loss": 0.1763 + }, + { + "epoch": 0.7592463363572924, + "lm_loss": 1.372, + "step": 3944, + "vm_loss": 0.1509 + }, + { + "epoch": 0.7592463363572924, + "lm_loss": 1.7638, + "step": 3944, + "vm_loss": 0.1943 + }, + { + "epoch": 0.7594388430348677, + "grad_norm": 2.757870121723556, + "learning_rate": 1.4241875013912143e-05, + "loss": 1.8452, + "step": 3945 + }, + { + "epoch": 0.7596313497124432, + "grad_norm": 3.0363678141840293, + "learning_rate": 1.4239051320449501e-05, + "loss": 1.8401, + "step": 3946 + }, + { + "epoch": 0.7598238563900185, + "grad_norm": 3.180450693408773, + "learning_rate": 1.4236227214896831e-05, + "loss": 1.839, + "step": 3947 + }, + { + "epoch": 0.7600163630675939, + "grad_norm": 2.8819630276964983, + "learning_rate": 1.4233402697528685e-05, + "loss": 1.9173, + "step": 3948 + }, + { + "epoch": 0.7602088697451693, + "grad_norm": 2.987702219284944, + "learning_rate": 1.423057776861963e-05, + "loss": 1.7845, + "step": 3949 + }, + { + "epoch": 0.7604013764227446, + "grad_norm": 3.340641470073444, + "learning_rate": 1.4227752428444294e-05, + "loss": 1.9255, + "step": 3950 + }, + { + "epoch": 0.7605938831003201, + "grad_norm": 2.978987478726501, + "learning_rate": 1.4224926677277336e-05, + "loss": 1.8569, + "step": 3951 + }, + { + "epoch": 0.7607863897778954, + "grad_norm": 2.845046259713777, + "learning_rate": 1.4222100515393447e-05, + "loss": 1.8437, + "step": 3952 + }, + { + "epoch": 0.7607863897778954, + "lm_loss": 1.2545, + "step": 3952, + "vm_loss": 0.1808 + }, + { + "epoch": 0.7607863897778954, + "lm_loss": 1.1551, + "step": 3952, + "vm_loss": 0.1617 + }, + { + "epoch": 0.7607863897778954, + "lm_loss": 1.9904, + "step": 3952, + "vm_loss": 0.1618 + }, + { + "epoch": 0.7607863897778954, + "lm_loss": 1.3759, + "step": 3952, + "vm_loss": 0.1101 + }, + { + "epoch": 0.7607863897778954, + "lm_loss": 2.0572, + "step": 3952, + "vm_loss": 0.1537 + }, + { + "epoch": 0.7607863897778954, + "lm_loss": 1.9234, + "step": 3952, + "vm_loss": 0.1396 + }, + { + "epoch": 0.7607863897778954, + "lm_loss": 1.4797, + "step": 3952, + "vm_loss": 0.218 + }, + { + "epoch": 0.7607863897778954, + "lm_loss": 1.8059, + "step": 3952, + "vm_loss": 0.1404 + }, + { + "epoch": 0.7609788964554708, + "grad_norm": 2.985387755989666, + "learning_rate": 1.4219273943067375e-05, + "loss": 1.8321, + "step": 3953 + }, + { + "epoch": 0.7611714031330462, + "grad_norm": 3.085324422170434, + "learning_rate": 1.4216446960573893e-05, + "loss": 1.808, + "step": 3954 + }, + { + "epoch": 0.7613639098106215, + "grad_norm": 3.0042291326829376, + "learning_rate": 1.4213619568187827e-05, + "loss": 1.8534, + "step": 3955 + }, + { + "epoch": 0.7615564164881969, + "grad_norm": 3.1357714629379165, + "learning_rate": 1.4210791766184025e-05, + "loss": 1.8321, + "step": 3956 + }, + { + "epoch": 0.7617489231657724, + "grad_norm": 3.1411157814644577, + "learning_rate": 1.4207963554837391e-05, + "loss": 1.7995, + "step": 3957 + }, + { + "epoch": 0.7619414298433477, + "grad_norm": 3.0650157266410982, + "learning_rate": 1.420513493442287e-05, + "loss": 1.8373, + "step": 3958 + }, + { + "epoch": 0.7621339365209231, + "grad_norm": 2.989938974532215, + "learning_rate": 1.420230590521543e-05, + "loss": 1.8302, + "step": 3959 + }, + { + "epoch": 0.7623264431984984, + "grad_norm": 2.895483167815223, + "learning_rate": 1.4199476467490097e-05, + "loss": 1.7878, + "step": 3960 + }, + { + "epoch": 0.7623264431984984, + "lm_loss": 1.4025, + "step": 3960, + "vm_loss": 0.1929 + }, + { + "epoch": 0.7623264431984984, + "lm_loss": 2.04, + "step": 3960, + "vm_loss": 0.2038 + }, + { + "epoch": 0.7623264431984984, + "lm_loss": 1.3072, + "step": 3960, + "vm_loss": 0.1453 + }, + { + "epoch": 0.7623264431984984, + "lm_loss": 1.4161, + "step": 3960, + "vm_loss": 0.2055 + }, + { + "epoch": 0.7623264431984984, + "lm_loss": 1.302, + "step": 3960, + "vm_loss": 0.2173 + }, + { + "epoch": 0.7623264431984984, + "lm_loss": 1.7175, + "step": 3960, + "vm_loss": 0.1941 + }, + { + "epoch": 0.7623264431984984, + "lm_loss": 2.1627, + "step": 3960, + "vm_loss": 0.1561 + }, + { + "epoch": 0.7623264431984984, + "lm_loss": 1.6751, + "step": 3960, + "vm_loss": 0.16 + }, + { + "epoch": 0.7625189498760738, + "grad_norm": 3.1139646239547734, + "learning_rate": 1.4196646621521919e-05, + "loss": 1.8937, + "step": 3961 + }, + { + "epoch": 0.7627114565536492, + "grad_norm": 3.0772541018928634, + "learning_rate": 1.4193816367586e-05, + "loss": 1.8611, + "step": 3962 + }, + { + "epoch": 0.7629039632312246, + "grad_norm": 2.859952195349067, + "learning_rate": 1.4190985705957476e-05, + "loss": 1.8489, + "step": 3963 + }, + { + "epoch": 0.7630964699088, + "grad_norm": 3.1744025105175466, + "learning_rate": 1.4188154636911524e-05, + "loss": 1.9318, + "step": 3964 + }, + { + "epoch": 0.7632889765863753, + "grad_norm": 3.3165222272553185, + "learning_rate": 1.4185323160723357e-05, + "loss": 1.8166, + "step": 3965 + }, + { + "epoch": 0.7634814832639507, + "grad_norm": 3.0765716137629027, + "learning_rate": 1.4182491277668233e-05, + "loss": 1.7699, + "step": 3966 + }, + { + "epoch": 0.7636739899415261, + "grad_norm": 3.0423554445344227, + "learning_rate": 1.4179658988021447e-05, + "loss": 1.8542, + "step": 3967 + }, + { + "epoch": 0.7638664966191014, + "grad_norm": 3.2152753769254403, + "learning_rate": 1.4176826292058338e-05, + "loss": 1.8537, + "step": 3968 + }, + { + "epoch": 0.7638664966191014, + "lm_loss": 1.6772, + "step": 3968, + "vm_loss": 0.1917 + }, + { + "epoch": 0.7638664966191014, + "lm_loss": 1.2117, + "step": 3968, + "vm_loss": 0.1119 + }, + { + "epoch": 0.7638664966191014, + "lm_loss": 1.5107, + "step": 3968, + "vm_loss": 0.1801 + }, + { + "epoch": 0.7638664966191014, + "lm_loss": 1.5143, + "step": 3968, + "vm_loss": 0.1174 + }, + { + "epoch": 0.7638664966191014, + "lm_loss": 2.1202, + "step": 3968, + "vm_loss": 0.1366 + }, + { + "epoch": 0.7638664966191014, + "lm_loss": 1.851, + "step": 3968, + "vm_loss": 0.2014 + }, + { + "epoch": 0.7638664966191014, + "lm_loss": 1.7877, + "step": 3968, + "vm_loss": 0.2257 + }, + { + "epoch": 0.7638664966191014, + "lm_loss": 1.5673, + "step": 3968, + "vm_loss": 0.1497 + }, + { + "epoch": 0.7640590032966769, + "grad_norm": 3.31072984280053, + "learning_rate": 1.4173993190054272e-05, + "loss": 1.7911, + "step": 3969 + }, + { + "epoch": 0.7642515099742523, + "grad_norm": 3.0398249054418818, + "learning_rate": 1.417115968228467e-05, + "loss": 1.8015, + "step": 3970 + }, + { + "epoch": 0.7644440166518276, + "grad_norm": 2.8167325835752663, + "learning_rate": 1.416832576902498e-05, + "loss": 1.8447, + "step": 3971 + }, + { + "epoch": 0.764636523329403, + "grad_norm": 2.9676542282376155, + "learning_rate": 1.41654914505507e-05, + "loss": 1.8211, + "step": 3972 + }, + { + "epoch": 0.7648290300069783, + "grad_norm": 2.961992646510966, + "learning_rate": 1.4162656727137353e-05, + "loss": 1.8905, + "step": 3973 + }, + { + "epoch": 0.7650215366845537, + "grad_norm": 3.034034752343732, + "learning_rate": 1.4159821599060518e-05, + "loss": 1.8651, + "step": 3974 + }, + { + "epoch": 0.7652140433621292, + "grad_norm": 2.898247826972282, + "learning_rate": 1.4156986066595807e-05, + "loss": 1.8471, + "step": 3975 + }, + { + "epoch": 0.7654065500397045, + "grad_norm": 2.9700540489161766, + "learning_rate": 1.4154150130018867e-05, + "loss": 1.797, + "step": 3976 + }, + { + "epoch": 0.7654065500397045, + "lm_loss": 1.7395, + "step": 3976, + "vm_loss": 0.1633 + }, + { + "epoch": 0.7654065500397045, + "lm_loss": 1.7074, + "step": 3976, + "vm_loss": 0.18 + }, + { + "epoch": 0.7654065500397045, + "lm_loss": 1.4981, + "step": 3976, + "vm_loss": 0.1927 + }, + { + "epoch": 0.7654065500397045, + "lm_loss": 1.5866, + "step": 3976, + "vm_loss": 0.1484 + }, + { + "epoch": 0.7654065500397045, + "lm_loss": 1.8493, + "step": 3976, + "vm_loss": 0.1818 + }, + { + "epoch": 0.7654065500397045, + "lm_loss": 1.7097, + "step": 3976, + "vm_loss": 0.2271 + }, + { + "epoch": 0.7654065500397045, + "lm_loss": 1.4475, + "step": 3976, + "vm_loss": 0.1303 + }, + { + "epoch": 0.7654065500397045, + "lm_loss": 1.832, + "step": 3976, + "vm_loss": 0.1791 + }, + { + "epoch": 0.7655990567172799, + "grad_norm": 2.8183329341015435, + "learning_rate": 1.4151313789605386e-05, + "loss": 1.8533, + "step": 3977 + }, + { + "epoch": 0.7657915633948552, + "grad_norm": 2.9791558971756613, + "learning_rate": 1.4148477045631094e-05, + "loss": 1.8373, + "step": 3978 + }, + { + "epoch": 0.7659840700724306, + "grad_norm": 2.8972751134585657, + "learning_rate": 1.4145639898371755e-05, + "loss": 1.8384, + "step": 3979 + }, + { + "epoch": 0.766176576750006, + "grad_norm": 3.1050748687162018, + "learning_rate": 1.4142802348103184e-05, + "loss": 1.7923, + "step": 3980 + }, + { + "epoch": 0.7663690834275814, + "grad_norm": 3.0763978947148254, + "learning_rate": 1.4139964395101222e-05, + "loss": 1.8703, + "step": 3981 + }, + { + "epoch": 0.7665615901051568, + "grad_norm": 2.945435291021433, + "learning_rate": 1.4137126039641755e-05, + "loss": 1.828, + "step": 3982 + }, + { + "epoch": 0.7667540967827321, + "grad_norm": 2.79987981235688, + "learning_rate": 1.4134287282000708e-05, + "loss": 1.8288, + "step": 3983 + }, + { + "epoch": 0.7669466034603075, + "grad_norm": 2.8943327544633495, + "learning_rate": 1.4131448122454042e-05, + "loss": 1.8372, + "step": 3984 + }, + { + "epoch": 0.7669466034603075, + "lm_loss": 1.8878, + "step": 3984, + "vm_loss": 0.148 + }, + { + "epoch": 0.7669466034603075, + "lm_loss": 1.9113, + "step": 3984, + "vm_loss": 0.1615 + }, + { + "epoch": 0.7669466034603075, + "lm_loss": 1.3748, + "step": 3984, + "vm_loss": 0.2142 + }, + { + "epoch": 0.7669466034603075, + "lm_loss": 1.637, + "step": 3984, + "vm_loss": 0.1473 + }, + { + "epoch": 0.7669466034603075, + "lm_loss": 1.5189, + "step": 3984, + "vm_loss": 0.1534 + }, + { + "epoch": 0.7669466034603075, + "lm_loss": 1.5993, + "step": 3984, + "vm_loss": 0.1248 + }, + { + "epoch": 0.7669466034603075, + "lm_loss": 2.1075, + "step": 3984, + "vm_loss": 0.1115 + }, + { + "epoch": 0.7669466034603075, + "lm_loss": 1.5463, + "step": 3984, + "vm_loss": 0.2033 + }, + { + "epoch": 0.7671391101378829, + "grad_norm": 2.924952715945339, + "learning_rate": 1.412860856127776e-05, + "loss": 1.8579, + "step": 3985 + }, + { + "epoch": 0.7673316168154583, + "grad_norm": 3.1058606435304266, + "learning_rate": 1.412576859874791e-05, + "loss": 1.748, + "step": 3986 + }, + { + "epoch": 0.7675241234930337, + "grad_norm": 2.905379502230526, + "learning_rate": 1.412292823514057e-05, + "loss": 1.8257, + "step": 3987 + }, + { + "epoch": 0.7677166301706091, + "grad_norm": 3.4764941502537035, + "learning_rate": 1.4120087470731854e-05, + "loss": 1.8689, + "step": 3988 + }, + { + "epoch": 0.7679091368481844, + "grad_norm": 3.133069201261247, + "learning_rate": 1.4117246305797925e-05, + "loss": 1.8739, + "step": 3989 + }, + { + "epoch": 0.7681016435257598, + "grad_norm": 2.9230297544358126, + "learning_rate": 1.411440474061498e-05, + "loss": 1.8838, + "step": 3990 + }, + { + "epoch": 0.7682941502033351, + "grad_norm": 2.9363653302656885, + "learning_rate": 1.4111562775459256e-05, + "loss": 1.8304, + "step": 3991 + }, + { + "epoch": 0.7684866568809106, + "grad_norm": 2.859654908042205, + "learning_rate": 1.4108720410607028e-05, + "loss": 1.8379, + "step": 3992 + }, + { + "epoch": 0.7684866568809106, + "lm_loss": 1.143, + "step": 3992, + "vm_loss": 0.1919 + }, + { + "epoch": 0.7684866568809106, + "lm_loss": 2.0982, + "step": 3992, + "vm_loss": 0.1394 + }, + { + "epoch": 0.7684866568809106, + "lm_loss": 0.9737, + "step": 3992, + "vm_loss": 0.1414 + }, + { + "epoch": 0.7684866568809106, + "lm_loss": 1.7117, + "step": 3992, + "vm_loss": 0.1599 + }, + { + "epoch": 0.7684866568809106, + "lm_loss": 1.9586, + "step": 3992, + "vm_loss": 0.1347 + }, + { + "epoch": 0.7684866568809106, + "lm_loss": 1.5372, + "step": 3992, + "vm_loss": 0.15 + }, + { + "epoch": 0.7684866568809106, + "lm_loss": 1.2196, + "step": 3992, + "vm_loss": 0.1475 + }, + { + "epoch": 0.7684866568809106, + "lm_loss": 2.2931, + "step": 3992, + "vm_loss": 0.1703 + }, + { + "epoch": 0.768679163558486, + "grad_norm": 2.8818544641674846, + "learning_rate": 1.4105877646334611e-05, + "loss": 1.9196, + "step": 3993 + }, + { + "epoch": 0.7688716702360613, + "grad_norm": 2.8991168237385208, + "learning_rate": 1.4103034482918359e-05, + "loss": 1.7729, + "step": 3994 + }, + { + "epoch": 0.7690641769136367, + "grad_norm": 2.9582439988838196, + "learning_rate": 1.4100190920634657e-05, + "loss": 1.7994, + "step": 3995 + }, + { + "epoch": 0.769256683591212, + "grad_norm": 3.3426038447491124, + "learning_rate": 1.4097346959759943e-05, + "loss": 1.8557, + "step": 3996 + }, + { + "epoch": 0.7694491902687874, + "grad_norm": 3.1638795121153818, + "learning_rate": 1.4094502600570687e-05, + "loss": 1.7578, + "step": 3997 + }, + { + "epoch": 0.7696416969463629, + "grad_norm": 2.9947585256683027, + "learning_rate": 1.4091657843343392e-05, + "loss": 1.8195, + "step": 3998 + }, + { + "epoch": 0.7698342036239382, + "grad_norm": 2.850048292803185, + "learning_rate": 1.4088812688354607e-05, + "loss": 1.7869, + "step": 3999 + }, + { + "epoch": 0.7700267103015136, + "grad_norm": 3.1299515256116215, + "learning_rate": 1.4085967135880915e-05, + "loss": 1.7451, + "step": 4000 + }, + { + "epoch": 0.7700267103015136, + "lm_loss": 1.0991, + "step": 4000, + "vm_loss": 0.169 + }, + { + "epoch": 0.7700267103015136, + "lm_loss": 1.5031, + "step": 4000, + "vm_loss": 0.1357 + }, + { + "epoch": 0.7700267103015136, + "lm_loss": 1.4119, + "step": 4000, + "vm_loss": 0.1368 + }, + { + "epoch": 0.7700267103015136, + "lm_loss": 1.5303, + "step": 4000, + "vm_loss": 0.1783 + }, + { + "epoch": 0.7700267103015136, + "lm_loss": 1.7049, + "step": 4000, + "vm_loss": 0.1731 + }, + { + "epoch": 0.7700267103015136, + "lm_loss": 1.8045, + "step": 4000, + "vm_loss": 0.1059 + }, + { + "epoch": 0.7700267103015136, + "lm_loss": 1.8893, + "step": 4000, + "vm_loss": 0.1509 + }, + { + "epoch": 0.7700267103015136, + "lm_loss": 1.9339, + "step": 4000, + "vm_loss": 0.15 + }, + { + "epoch": 0.770219216979089, + "grad_norm": 2.88009095925609, + "learning_rate": 1.408312118619895e-05, + "loss": 1.7356, + "step": 4001 + }, + { + "epoch": 0.7704117236566643, + "grad_norm": 2.987194684422851, + "learning_rate": 1.408027483958536e-05, + "loss": 1.7879, + "step": 4002 + }, + { + "epoch": 0.7706042303342397, + "grad_norm": 3.1587514781539126, + "learning_rate": 1.4077428096316857e-05, + "loss": 1.8439, + "step": 4003 + }, + { + "epoch": 0.7707967370118151, + "grad_norm": 3.1832997675628163, + "learning_rate": 1.4074580956670176e-05, + "loss": 1.8227, + "step": 4004 + }, + { + "epoch": 0.7709892436893905, + "grad_norm": 3.1088278440263855, + "learning_rate": 1.4071733420922096e-05, + "loss": 1.8454, + "step": 4005 + }, + { + "epoch": 0.7711817503669659, + "grad_norm": 2.9423191545091947, + "learning_rate": 1.4068885489349437e-05, + "loss": 1.8288, + "step": 4006 + }, + { + "epoch": 0.7713742570445412, + "grad_norm": 3.1755948417808475, + "learning_rate": 1.406603716222905e-05, + "loss": 1.8897, + "step": 4007 + }, + { + "epoch": 0.7715667637221166, + "grad_norm": 2.913291665844025, + "learning_rate": 1.4063188439837831e-05, + "loss": 1.7643, + "step": 4008 + }, + { + "epoch": 0.7715667637221166, + "lm_loss": 1.5608, + "step": 4008, + "vm_loss": 0.1599 + }, + { + "epoch": 0.7715667637221166, + "lm_loss": 1.5677, + "step": 4008, + "vm_loss": 0.1585 + }, + { + "epoch": 0.7715667637221166, + "lm_loss": 1.7777, + "step": 4008, + "vm_loss": 0.2226 + }, + { + "epoch": 0.7715667637221166, + "lm_loss": 1.5992, + "step": 4008, + "vm_loss": 0.2 + }, + { + "epoch": 0.7715667637221166, + "lm_loss": 1.9004, + "step": 4008, + "vm_loss": 0.1539 + }, + { + "epoch": 0.7715667637221166, + "lm_loss": 1.4711, + "step": 4008, + "vm_loss": 0.2121 + }, + { + "epoch": 0.7715667637221166, + "lm_loss": 1.5642, + "step": 4008, + "vm_loss": 0.2166 + }, + { + "epoch": 0.7715667637221166, + "lm_loss": 1.3571, + "step": 4008, + "vm_loss": 0.1675 + }, + { + "epoch": 0.7717592703996919, + "grad_norm": 2.9424246623209163, + "learning_rate": 1.4060339322452717e-05, + "loss": 1.8602, + "step": 4009 + }, + { + "epoch": 0.7719517770772674, + "grad_norm": 3.137520034929048, + "learning_rate": 1.4057489810350668e-05, + "loss": 1.8616, + "step": 4010 + }, + { + "epoch": 0.7721442837548428, + "grad_norm": 3.1707206302899587, + "learning_rate": 1.4054639903808704e-05, + "loss": 1.8335, + "step": 4011 + }, + { + "epoch": 0.7723367904324181, + "grad_norm": 3.0661467867274075, + "learning_rate": 1.4051789603103865e-05, + "loss": 1.77, + "step": 4012 + }, + { + "epoch": 0.7725292971099935, + "grad_norm": 3.019270131821078, + "learning_rate": 1.4048938908513239e-05, + "loss": 1.7895, + "step": 4013 + }, + { + "epoch": 0.7727218037875688, + "grad_norm": 3.0046318497102473, + "learning_rate": 1.4046087820313949e-05, + "loss": 1.8895, + "step": 4014 + }, + { + "epoch": 0.7729143104651442, + "grad_norm": 2.936466085236794, + "learning_rate": 1.404323633878316e-05, + "loss": 1.8369, + "step": 4015 + }, + { + "epoch": 0.7731068171427197, + "grad_norm": 2.781151789315632, + "learning_rate": 1.4040384464198072e-05, + "loss": 1.7959, + "step": 4016 + }, + { + "epoch": 0.7731068171427197, + "lm_loss": 1.6099, + "step": 4016, + "vm_loss": 0.1494 + }, + { + "epoch": 0.7731068171427197, + "lm_loss": 1.2724, + "step": 4016, + "vm_loss": 0.1546 + }, + { + "epoch": 0.7731068171427197, + "lm_loss": 2.2063, + "step": 4016, + "vm_loss": 0.141 + }, + { + "epoch": 0.7731068171427197, + "lm_loss": 1.7715, + "step": 4016, + "vm_loss": 0.155 + }, + { + "epoch": 0.7731068171427197, + "lm_loss": 1.9839, + "step": 4016, + "vm_loss": 0.1337 + }, + { + "epoch": 0.7731068171427197, + "lm_loss": 1.52, + "step": 4016, + "vm_loss": 0.1481 + }, + { + "epoch": 0.7731068171427197, + "lm_loss": 2.1398, + "step": 4016, + "vm_loss": 0.1373 + }, + { + "epoch": 0.7731068171427197, + "lm_loss": 2.0469, + "step": 4016, + "vm_loss": 0.0888 + }, + { + "epoch": 0.773299323820295, + "grad_norm": 2.9666955011730693, + "learning_rate": 1.4037532196835918e-05, + "loss": 1.8401, + "step": 4017 + }, + { + "epoch": 0.7734918304978704, + "grad_norm": 3.125914517258326, + "learning_rate": 1.4034679536973985e-05, + "loss": 1.8088, + "step": 4018 + }, + { + "epoch": 0.7736843371754458, + "grad_norm": 3.2431790134318126, + "learning_rate": 1.4031826484889579e-05, + "loss": 1.8709, + "step": 4019 + }, + { + "epoch": 0.7738768438530211, + "grad_norm": 3.139605941407338, + "learning_rate": 1.4028973040860056e-05, + "loss": 1.8715, + "step": 4020 + }, + { + "epoch": 0.7740693505305966, + "grad_norm": 2.8109364526210823, + "learning_rate": 1.402611920516281e-05, + "loss": 1.8227, + "step": 4021 + }, + { + "epoch": 0.7742618572081719, + "grad_norm": 2.853754697606584, + "learning_rate": 1.4023264978075266e-05, + "loss": 1.8375, + "step": 4022 + }, + { + "epoch": 0.7744543638857473, + "grad_norm": 3.010463561666103, + "learning_rate": 1.4020410359874897e-05, + "loss": 1.8411, + "step": 4023 + }, + { + "epoch": 0.7746468705633227, + "grad_norm": 2.901744673093344, + "learning_rate": 1.40175553508392e-05, + "loss": 1.8887, + "step": 4024 + }, + { + "epoch": 0.7746468705633227, + "lm_loss": 1.4194, + "step": 4024, + "vm_loss": 0.1915 + }, + { + "epoch": 0.7746468705633227, + "lm_loss": 1.8114, + "step": 4024, + "vm_loss": 0.1657 + }, + { + "epoch": 0.7746468705633227, + "lm_loss": 1.4711, + "step": 4024, + "vm_loss": 0.1819 + }, + { + "epoch": 0.7746468705633227, + "lm_loss": 1.5008, + "step": 4024, + "vm_loss": 0.1735 + }, + { + "epoch": 0.7746468705633227, + "lm_loss": 1.7318, + "step": 4024, + "vm_loss": 0.181 + }, + { + "epoch": 0.7746468705633227, + "lm_loss": 1.6722, + "step": 4024, + "vm_loss": 0.1399 + }, + { + "epoch": 0.7746468705633227, + "lm_loss": 1.8567, + "step": 4024, + "vm_loss": 0.1987 + }, + { + "epoch": 0.7746468705633227, + "lm_loss": 1.5928, + "step": 4024, + "vm_loss": 0.1903 + }, + { + "epoch": 0.774839377240898, + "grad_norm": 3.0331698143575947, + "learning_rate": 1.4014699951245725e-05, + "loss": 1.8565, + "step": 4025 + }, + { + "epoch": 0.7750318839184734, + "grad_norm": 2.9077554051636714, + "learning_rate": 1.4011844161372054e-05, + "loss": 1.8223, + "step": 4026 + }, + { + "epoch": 0.7752243905960488, + "grad_norm": 3.0577890206872067, + "learning_rate": 1.4008987981495801e-05, + "loss": 1.8245, + "step": 4027 + }, + { + "epoch": 0.7754168972736242, + "grad_norm": 3.3130318926779316, + "learning_rate": 1.4006131411894627e-05, + "loss": 1.8063, + "step": 4028 + }, + { + "epoch": 0.7756094039511996, + "grad_norm": 3.3150186099020016, + "learning_rate": 1.4003274452846227e-05, + "loss": 1.8869, + "step": 4029 + }, + { + "epoch": 0.7758019106287749, + "grad_norm": 3.073970282576432, + "learning_rate": 1.4000417104628332e-05, + "loss": 1.828, + "step": 4030 + }, + { + "epoch": 0.7759944173063503, + "grad_norm": 2.8507468939052707, + "learning_rate": 1.3997559367518718e-05, + "loss": 1.7164, + "step": 4031 + }, + { + "epoch": 0.7761869239839257, + "grad_norm": 2.872735085083444, + "learning_rate": 1.3994701241795188e-05, + "loss": 1.7941, + "step": 4032 + }, + { + "epoch": 0.7761869239839257, + "lm_loss": 1.594, + "step": 4032, + "vm_loss": 0.1599 + }, + { + "epoch": 0.7761869239839257, + "lm_loss": 1.7514, + "step": 4032, + "vm_loss": 0.2017 + }, + { + "epoch": 0.7761869239839257, + "lm_loss": 1.86, + "step": 4032, + "vm_loss": 0.0957 + }, + { + "epoch": 0.7761869239839257, + "lm_loss": 2.5574, + "step": 4032, + "vm_loss": 0.1614 + }, + { + "epoch": 0.7761869239839257, + "lm_loss": 1.7639, + "step": 4032, + "vm_loss": 0.1434 + }, + { + "epoch": 0.7761869239839257, + "lm_loss": 1.9776, + "step": 4032, + "vm_loss": 0.1474 + }, + { + "epoch": 0.7761869239839257, + "lm_loss": 1.8151, + "step": 4032, + "vm_loss": 0.1591 + }, + { + "epoch": 0.7761869239839257, + "lm_loss": 1.5379, + "step": 4032, + "vm_loss": 0.2461 + }, + { + "epoch": 0.7763794306615011, + "grad_norm": 3.0256713042689265, + "learning_rate": 1.3991842727735589e-05, + "loss": 1.8689, + "step": 4033 + }, + { + "epoch": 0.7765719373390765, + "grad_norm": 2.962435975496588, + "learning_rate": 1.3988983825617806e-05, + "loss": 1.7957, + "step": 4034 + }, + { + "epoch": 0.7767644440166518, + "grad_norm": 3.1034203355082535, + "learning_rate": 1.3986124535719766e-05, + "loss": 1.8842, + "step": 4035 + }, + { + "epoch": 0.7769569506942272, + "grad_norm": 3.0571419352254394, + "learning_rate": 1.3983264858319416e-05, + "loss": 1.8052, + "step": 4036 + }, + { + "epoch": 0.7771494573718026, + "grad_norm": 2.9462460798693204, + "learning_rate": 1.3980404793694766e-05, + "loss": 1.7405, + "step": 4037 + }, + { + "epoch": 0.7773419640493779, + "grad_norm": 2.813958994943593, + "learning_rate": 1.3977544342123846e-05, + "loss": 1.8, + "step": 4038 + }, + { + "epoch": 0.7775344707269534, + "grad_norm": 2.7774017658501795, + "learning_rate": 1.3974683503884725e-05, + "loss": 1.8311, + "step": 4039 + }, + { + "epoch": 0.7777269774045287, + "grad_norm": 2.8141368449901516, + "learning_rate": 1.3971822279255518e-05, + "loss": 1.843, + "step": 4040 + }, + { + "epoch": 0.7777269774045287, + "lm_loss": 1.8025, + "step": 4040, + "vm_loss": 0.1738 + }, + { + "epoch": 0.7777269774045287, + "lm_loss": 1.6357, + "step": 4040, + "vm_loss": 0.1517 + }, + { + "epoch": 0.7777269774045287, + "lm_loss": 1.8278, + "step": 4040, + "vm_loss": 0.1821 + }, + { + "epoch": 0.7777269774045287, + "lm_loss": 2.1172, + "step": 4040, + "vm_loss": 0.2529 + }, + { + "epoch": 0.7777269774045287, + "lm_loss": 1.5749, + "step": 4040, + "vm_loss": 0.1648 + }, + { + "epoch": 0.7777269774045287, + "lm_loss": 1.357, + "step": 4040, + "vm_loss": 0.213 + }, + { + "epoch": 0.7777269774045287, + "lm_loss": 1.0937, + "step": 4040, + "vm_loss": 0.1375 + }, + { + "epoch": 0.7777269774045287, + "lm_loss": 0.9817, + "step": 4040, + "vm_loss": 0.1372 + }, + { + "epoch": 0.7779194840821041, + "grad_norm": 2.7949606433976886, + "learning_rate": 1.396896066851437e-05, + "loss": 1.8226, + "step": 4041 + }, + { + "epoch": 0.7781119907596795, + "grad_norm": 3.024343789695567, + "learning_rate": 1.396609867193947e-05, + "loss": 1.7849, + "step": 4042 + }, + { + "epoch": 0.7783044974372548, + "grad_norm": 3.2153926302851623, + "learning_rate": 1.3963236289809037e-05, + "loss": 1.8482, + "step": 4043 + }, + { + "epoch": 0.7784970041148302, + "grad_norm": 3.116901715360136, + "learning_rate": 1.396037352240133e-05, + "loss": 1.766, + "step": 4044 + }, + { + "epoch": 0.7786895107924056, + "grad_norm": 2.9049954421487496, + "learning_rate": 1.3957510369994651e-05, + "loss": 1.7699, + "step": 4045 + }, + { + "epoch": 0.778882017469981, + "grad_norm": 2.9345498375797137, + "learning_rate": 1.3954646832867332e-05, + "loss": 1.7634, + "step": 4046 + }, + { + "epoch": 0.7790745241475564, + "grad_norm": 2.8354739448950137, + "learning_rate": 1.3951782911297745e-05, + "loss": 1.766, + "step": 4047 + }, + { + "epoch": 0.7792670308251317, + "grad_norm": 2.94549970705335, + "learning_rate": 1.3948918605564302e-05, + "loss": 1.788, + "step": 4048 + }, + { + "epoch": 0.7792670308251317, + "lm_loss": 1.7841, + "step": 4048, + "vm_loss": 0.1329 + }, + { + "epoch": 0.7792670308251317, + "lm_loss": 2.1371, + "step": 4048, + "vm_loss": 0.1965 + }, + { + "epoch": 0.7792670308251317, + "lm_loss": 1.2237, + "step": 4048, + "vm_loss": 0.1808 + }, + { + "epoch": 0.7792670308251317, + "lm_loss": 1.4411, + "step": 4048, + "vm_loss": 0.1479 + }, + { + "epoch": 0.7792670308251317, + "lm_loss": 0.9289, + "step": 4048, + "vm_loss": 0.1679 + }, + { + "epoch": 0.7792670308251317, + "lm_loss": 1.8343, + "step": 4048, + "vm_loss": 0.1668 + }, + { + "epoch": 0.7792670308251317, + "lm_loss": 1.6576, + "step": 4048, + "vm_loss": 0.1353 + }, + { + "epoch": 0.7792670308251317, + "lm_loss": 1.1951, + "step": 4048, + "vm_loss": 0.232 + }, + { + "epoch": 0.7794595375027071, + "grad_norm": 2.9205606952664933, + "learning_rate": 1.3946053915945447e-05, + "loss": 1.8279, + "step": 4049 + }, + { + "epoch": 0.7796520441802826, + "grad_norm": 2.8229355954617112, + "learning_rate": 1.3943188842719669e-05, + "loss": 1.7886, + "step": 4050 + }, + { + "epoch": 0.7798445508578579, + "grad_norm": 2.8820103434268547, + "learning_rate": 1.3940323386165486e-05, + "loss": 1.8028, + "step": 4051 + }, + { + "epoch": 0.7800370575354333, + "grad_norm": 3.111333756648554, + "learning_rate": 1.393745754656146e-05, + "loss": 1.8146, + "step": 4052 + }, + { + "epoch": 0.7802295642130086, + "grad_norm": 2.9772298504003363, + "learning_rate": 1.3934591324186184e-05, + "loss": 1.7845, + "step": 4053 + }, + { + "epoch": 0.780422070890584, + "grad_norm": 3.0515547437708492, + "learning_rate": 1.3931724719318293e-05, + "loss": 1.7499, + "step": 4054 + }, + { + "epoch": 0.7806145775681594, + "grad_norm": 2.956722880524745, + "learning_rate": 1.3928857732236461e-05, + "loss": 1.7989, + "step": 4055 + }, + { + "epoch": 0.7808070842457348, + "grad_norm": 2.778985714413757, + "learning_rate": 1.3925990363219391e-05, + "loss": 1.9, + "step": 4056 + }, + { + "epoch": 0.7808070842457348, + "lm_loss": 1.3524, + "step": 4056, + "vm_loss": 0.1024 + }, + { + "epoch": 0.7808070842457348, + "lm_loss": 1.5343, + "step": 4056, + "vm_loss": 0.1874 + }, + { + "epoch": 0.7808070842457348, + "lm_loss": 0.8962, + "step": 4056, + "vm_loss": 0.1351 + }, + { + "epoch": 0.7808070842457348, + "lm_loss": 1.4084, + "step": 4056, + "vm_loss": 0.1491 + }, + { + "epoch": 0.7808070842457348, + "lm_loss": 1.666, + "step": 4056, + "vm_loss": 0.1854 + }, + { + "epoch": 0.7808070842457348, + "lm_loss": 1.5609, + "step": 4056, + "vm_loss": 0.1231 + }, + { + "epoch": 0.7808070842457348, + "lm_loss": 2.0982, + "step": 4056, + "vm_loss": 0.103 + }, + { + "epoch": 0.7808070842457348, + "lm_loss": 1.8253, + "step": 4056, + "vm_loss": 0.1976 + }, + { + "epoch": 0.7809995909233102, + "grad_norm": 3.0039588319775876, + "learning_rate": 1.3923122612545829e-05, + "loss": 1.7997, + "step": 4057 + }, + { + "epoch": 0.7811920976008855, + "grad_norm": 2.9983927081128234, + "learning_rate": 1.3920254480494558e-05, + "loss": 1.7633, + "step": 4058 + }, + { + "epoch": 0.7813846042784609, + "grad_norm": 3.067163532516728, + "learning_rate": 1.3917385967344395e-05, + "loss": 1.8634, + "step": 4059 + }, + { + "epoch": 0.7815771109560363, + "grad_norm": 3.0466989589705262, + "learning_rate": 1.39145170733742e-05, + "loss": 1.817, + "step": 4060 + }, + { + "epoch": 0.7817696176336116, + "grad_norm": 3.0451178017885723, + "learning_rate": 1.3911647798862864e-05, + "loss": 1.7693, + "step": 4061 + }, + { + "epoch": 0.7819621243111871, + "grad_norm": 3.011886601372958, + "learning_rate": 1.3908778144089318e-05, + "loss": 1.7836, + "step": 4062 + }, + { + "epoch": 0.7821546309887625, + "grad_norm": 2.829280743382815, + "learning_rate": 1.3905908109332526e-05, + "loss": 1.7758, + "step": 4063 + }, + { + "epoch": 0.7823471376663378, + "grad_norm": 2.9431053026682843, + "learning_rate": 1.39030376948715e-05, + "loss": 1.8196, + "step": 4064 + }, + { + "epoch": 0.7823471376663378, + "lm_loss": 2.0483, + "step": 4064, + "vm_loss": 0.1451 + }, + { + "epoch": 0.7823471376663378, + "lm_loss": 2.1775, + "step": 4064, + "vm_loss": 0.2173 + }, + { + "epoch": 0.7823471376663378, + "lm_loss": 1.1851, + "step": 4064, + "vm_loss": 0.1947 + }, + { + "epoch": 0.7823471376663378, + "lm_loss": 1.4748, + "step": 4064, + "vm_loss": 0.1636 + }, + { + "epoch": 0.7823471376663378, + "lm_loss": 1.0178, + "step": 4064, + "vm_loss": 0.213 + }, + { + "epoch": 0.7823471376663378, + "lm_loss": 1.0924, + "step": 4064, + "vm_loss": 0.1606 + }, + { + "epoch": 0.7823471376663378, + "lm_loss": 1.8808, + "step": 4064, + "vm_loss": 0.1365 + }, + { + "epoch": 0.7823471376663378, + "lm_loss": 1.1702, + "step": 4064, + "vm_loss": 0.1893 + }, + { + "epoch": 0.7825396443439132, + "grad_norm": 3.0483740041496254, + "learning_rate": 1.390016690098527e-05, + "loss": 1.8378, + "step": 4065 + }, + { + "epoch": 0.7827321510214885, + "grad_norm": 2.8271357025801698, + "learning_rate": 1.3897295727952922e-05, + "loss": 1.8298, + "step": 4066 + }, + { + "epoch": 0.7829246576990639, + "grad_norm": 3.0999878284764697, + "learning_rate": 1.3894424176053566e-05, + "loss": 1.7901, + "step": 4067 + }, + { + "epoch": 0.7831171643766394, + "grad_norm": 3.0944559937144858, + "learning_rate": 1.3891552245566357e-05, + "loss": 1.7819, + "step": 4068 + }, + { + "epoch": 0.7833096710542147, + "grad_norm": 3.06537944330935, + "learning_rate": 1.3888679936770485e-05, + "loss": 1.7079, + "step": 4069 + }, + { + "epoch": 0.7835021777317901, + "grad_norm": 3.1509569988191917, + "learning_rate": 1.3885807249945166e-05, + "loss": 1.8096, + "step": 4070 + }, + { + "epoch": 0.7836946844093654, + "grad_norm": 3.1595710943026174, + "learning_rate": 1.3882934185369674e-05, + "loss": 1.8333, + "step": 4071 + }, + { + "epoch": 0.7838871910869408, + "grad_norm": 3.1486737483786733, + "learning_rate": 1.38800607433233e-05, + "loss": 1.8572, + "step": 4072 + }, + { + "epoch": 0.7838871910869408, + "lm_loss": 1.7733, + "step": 4072, + "vm_loss": 0.1732 + }, + { + "epoch": 0.7838871910869408, + "lm_loss": 1.7505, + "step": 4072, + "vm_loss": 0.1514 + }, + { + "epoch": 0.7838871910869408, + "lm_loss": 1.2405, + "step": 4072, + "vm_loss": 0.1462 + }, + { + "epoch": 0.7838871910869408, + "lm_loss": 1.2191, + "step": 4072, + "vm_loss": 0.173 + }, + { + "epoch": 0.7838871910869408, + "lm_loss": 1.6826, + "step": 4072, + "vm_loss": 0.2052 + }, + { + "epoch": 0.7838871910869408, + "lm_loss": 1.2599, + "step": 4072, + "vm_loss": 0.1328 + }, + { + "epoch": 0.7838871910869408, + "lm_loss": 1.3386, + "step": 4072, + "vm_loss": 0.1878 + }, + { + "epoch": 0.7838871910869408, + "lm_loss": 1.304, + "step": 4072, + "vm_loss": 0.1186 + }, + { + "epoch": 0.7840796977645162, + "grad_norm": 2.942846956341721, + "learning_rate": 1.3877186924085381e-05, + "loss": 1.8078, + "step": 4073 + }, + { + "epoch": 0.7842722044420916, + "grad_norm": 2.9717820726789856, + "learning_rate": 1.3874312727935293e-05, + "loss": 1.799, + "step": 4074 + }, + { + "epoch": 0.784464711119667, + "grad_norm": 2.909170511089149, + "learning_rate": 1.3871438155152437e-05, + "loss": 1.7596, + "step": 4075 + }, + { + "epoch": 0.7846572177972423, + "grad_norm": 3.1610936927974502, + "learning_rate": 1.3868563206016263e-05, + "loss": 1.8184, + "step": 4076 + }, + { + "epoch": 0.7848497244748177, + "grad_norm": 3.012412494238963, + "learning_rate": 1.3865687880806253e-05, + "loss": 1.7573, + "step": 4077 + }, + { + "epoch": 0.7850422311523931, + "grad_norm": 3.0813443893217487, + "learning_rate": 1.3862812179801922e-05, + "loss": 1.7904, + "step": 4078 + }, + { + "epoch": 0.7852347378299684, + "grad_norm": 2.9588761143031785, + "learning_rate": 1.385993610328283e-05, + "loss": 1.7988, + "step": 4079 + }, + { + "epoch": 0.7854272445075439, + "grad_norm": 3.050427421099969, + "learning_rate": 1.3857059651528564e-05, + "loss": 1.7342, + "step": 4080 + }, + { + "epoch": 0.7854272445075439, + "lm_loss": 1.9363, + "step": 4080, + "vm_loss": 0.1232 + }, + { + "epoch": 0.7854272445075439, + "lm_loss": 1.9776, + "step": 4080, + "vm_loss": 0.1241 + }, + { + "epoch": 0.7854272445075439, + "lm_loss": 1.5176, + "step": 4080, + "vm_loss": 0.1603 + }, + { + "epoch": 0.7854272445075439, + "lm_loss": 1.8757, + "step": 4080, + "vm_loss": 0.1834 + }, + { + "epoch": 0.7854272445075439, + "lm_loss": 1.4139, + "step": 4080, + "vm_loss": 0.1283 + }, + { + "epoch": 0.7854272445075439, + "lm_loss": 1.684, + "step": 4080, + "vm_loss": 0.2414 + }, + { + "epoch": 0.7854272445075439, + "lm_loss": 1.7236, + "step": 4080, + "vm_loss": 0.1262 + }, + { + "epoch": 0.7854272445075439, + "lm_loss": 1.3506, + "step": 4080, + "vm_loss": 0.1546 + }, + { + "epoch": 0.7856197511851193, + "grad_norm": 3.1097761182725088, + "learning_rate": 1.3854182824818755e-05, + "loss": 1.7873, + "step": 4081 + }, + { + "epoch": 0.7858122578626946, + "grad_norm": 3.1260086699133427, + "learning_rate": 1.3851305623433064e-05, + "loss": 1.7808, + "step": 4082 + }, + { + "epoch": 0.78600476454027, + "grad_norm": 2.9690424458336833, + "learning_rate": 1.3848428047651195e-05, + "loss": 1.8014, + "step": 4083 + }, + { + "epoch": 0.7861972712178453, + "grad_norm": 2.9957559993132397, + "learning_rate": 1.3845550097752884e-05, + "loss": 1.7166, + "step": 4084 + }, + { + "epoch": 0.7863897778954207, + "grad_norm": 3.374527272864488, + "learning_rate": 1.3842671774017905e-05, + "loss": 1.8519, + "step": 4085 + }, + { + "epoch": 0.7865822845729962, + "grad_norm": 3.178359363226118, + "learning_rate": 1.3839793076726069e-05, + "loss": 1.9157, + "step": 4086 + }, + { + "epoch": 0.7867747912505715, + "grad_norm": 2.9233902786787405, + "learning_rate": 1.3836914006157218e-05, + "loss": 1.8245, + "step": 4087 + }, + { + "epoch": 0.7869672979281469, + "grad_norm": 2.938397457704189, + "learning_rate": 1.3834034562591236e-05, + "loss": 1.7517, + "step": 4088 + }, + { + "epoch": 0.7869672979281469, + "lm_loss": 1.2485, + "step": 4088, + "vm_loss": 0.2384 + }, + { + "epoch": 0.7869672979281469, + "lm_loss": 1.7862, + "step": 4088, + "vm_loss": 0.1271 + }, + { + "epoch": 0.7869672979281469, + "lm_loss": 1.7462, + "step": 4088, + "vm_loss": 0.1622 + }, + { + "epoch": 0.7869672979281469, + "lm_loss": 1.3506, + "step": 4088, + "vm_loss": 0.1964 + }, + { + "epoch": 0.7869672979281469, + "lm_loss": 1.8381, + "step": 4088, + "vm_loss": 0.151 + }, + { + "epoch": 0.7869672979281469, + "lm_loss": 1.7815, + "step": 4088, + "vm_loss": 0.1383 + }, + { + "epoch": 0.7869672979281469, + "lm_loss": 1.444, + "step": 4088, + "vm_loss": 0.2113 + }, + { + "epoch": 0.7869672979281469, + "lm_loss": 1.5421, + "step": 4088, + "vm_loss": 0.1667 + }, + { + "epoch": 0.7871598046057222, + "grad_norm": 2.9556575549687976, + "learning_rate": 1.3831154746308046e-05, + "loss": 1.8388, + "step": 4089 + }, + { + "epoch": 0.7873523112832976, + "grad_norm": 3.0504332699491536, + "learning_rate": 1.38282745575876e-05, + "loss": 1.809, + "step": 4090 + }, + { + "epoch": 0.7875448179608731, + "grad_norm": 3.325100100646713, + "learning_rate": 1.3825393996709885e-05, + "loss": 1.8219, + "step": 4091 + }, + { + "epoch": 0.7877373246384484, + "grad_norm": 3.1646913595687076, + "learning_rate": 1.3822513063954934e-05, + "loss": 1.8466, + "step": 4092 + }, + { + "epoch": 0.7879298313160238, + "grad_norm": 3.2056820159609902, + "learning_rate": 1.3819631759602812e-05, + "loss": 1.857, + "step": 4093 + }, + { + "epoch": 0.7881223379935991, + "grad_norm": 3.032242283613238, + "learning_rate": 1.3816750083933612e-05, + "loss": 1.8223, + "step": 4094 + }, + { + "epoch": 0.7883148446711745, + "grad_norm": 3.113099355250901, + "learning_rate": 1.3813868037227477e-05, + "loss": 1.7623, + "step": 4095 + }, + { + "epoch": 0.7885073513487499, + "grad_norm": 3.0948790490464617, + "learning_rate": 1.3810985619764573e-05, + "loss": 1.8179, + "step": 4096 + }, + { + "epoch": 0.7885073513487499, + "lm_loss": 1.4658, + "step": 4096, + "vm_loss": 0.1628 + }, + { + "epoch": 0.7885073513487499, + "lm_loss": 1.7345, + "step": 4096, + "vm_loss": 0.1414 + }, + { + "epoch": 0.7885073513487499, + "lm_loss": 1.6974, + "step": 4096, + "vm_loss": 0.1997 + }, + { + "epoch": 0.7885073513487499, + "lm_loss": 1.7658, + "step": 4096, + "vm_loss": 0.1938 + }, + { + "epoch": 0.7885073513487499, + "lm_loss": 1.9589, + "step": 4096, + "vm_loss": 0.1996 + }, + { + "epoch": 0.7885073513487499, + "lm_loss": 1.6987, + "step": 4096, + "vm_loss": 0.2306 + }, + { + "epoch": 0.7885073513487499, + "lm_loss": 1.6775, + "step": 4096, + "vm_loss": 0.1722 + }, + { + "epoch": 0.7885073513487499, + "lm_loss": 1.3665, + "step": 4096, + "vm_loss": 0.1621 + }, + { + "epoch": 0.7886998580263253, + "grad_norm": 3.112798483159566, + "learning_rate": 1.3808102831825109e-05, + "loss": 1.8462, + "step": 4097 + }, + { + "epoch": 0.7888923647039007, + "grad_norm": 3.0224642032864613, + "learning_rate": 1.3805219673689333e-05, + "loss": 1.7617, + "step": 4098 + }, + { + "epoch": 0.7890848713814761, + "grad_norm": 3.0285955634333965, + "learning_rate": 1.3802336145637519e-05, + "loss": 1.8123, + "step": 4099 + }, + { + "epoch": 0.7892773780590514, + "grad_norm": 2.93467580979092, + "learning_rate": 1.3799452247949985e-05, + "loss": 1.8467, + "step": 4100 + }, + { + "epoch": 0.7894698847366268, + "grad_norm": 2.99445117237434, + "learning_rate": 1.3796567980907085e-05, + "loss": 1.8077, + "step": 4101 + }, + { + "epoch": 0.7896623914142021, + "grad_norm": 3.0410283214470564, + "learning_rate": 1.3793683344789206e-05, + "loss": 1.7431, + "step": 4102 + }, + { + "epoch": 0.7898548980917776, + "grad_norm": 3.2777335966472214, + "learning_rate": 1.3790798339876769e-05, + "loss": 1.8565, + "step": 4103 + }, + { + "epoch": 0.790047404769353, + "grad_norm": 2.8317291643216564, + "learning_rate": 1.3787912966450234e-05, + "loss": 1.7599, + "step": 4104 + }, + { + "epoch": 0.790047404769353, + "lm_loss": 1.6602, + "step": 4104, + "vm_loss": 0.1152 + }, + { + "epoch": 0.790047404769353, + "lm_loss": 1.4861, + "step": 4104, + "vm_loss": 0.1721 + }, + { + "epoch": 0.790047404769353, + "lm_loss": 1.5513, + "step": 4104, + "vm_loss": 0.1373 + }, + { + "epoch": 0.790047404769353, + "lm_loss": 1.5775, + "step": 4104, + "vm_loss": 0.1492 + }, + { + "epoch": 0.790047404769353, + "lm_loss": 1.2776, + "step": 4104, + "vm_loss": 0.0974 + }, + { + "epoch": 0.790047404769353, + "lm_loss": 1.1686, + "step": 4104, + "vm_loss": 0.1309 + }, + { + "epoch": 0.790047404769353, + "lm_loss": 2.1077, + "step": 4104, + "vm_loss": 0.1614 + }, + { + "epoch": 0.790047404769353, + "lm_loss": 1.5043, + "step": 4104, + "vm_loss": 0.1524 + }, + { + "epoch": 0.7902399114469283, + "grad_norm": 3.0348646008479756, + "learning_rate": 1.3785027224790098e-05, + "loss": 1.7403, + "step": 4105 + }, + { + "epoch": 0.7904324181245037, + "grad_norm": 2.9492514307203472, + "learning_rate": 1.3782141115176893e-05, + "loss": 1.8339, + "step": 4106 + }, + { + "epoch": 0.790624924802079, + "grad_norm": 2.953620654582367, + "learning_rate": 1.3779254637891181e-05, + "loss": 1.7783, + "step": 4107 + }, + { + "epoch": 0.7908174314796544, + "grad_norm": 3.028874213426188, + "learning_rate": 1.377636779321357e-05, + "loss": 1.8489, + "step": 4108 + }, + { + "epoch": 0.7910099381572299, + "grad_norm": 3.086760055031574, + "learning_rate": 1.3773480581424694e-05, + "loss": 1.8296, + "step": 4109 + }, + { + "epoch": 0.7912024448348052, + "grad_norm": 2.9565670864497684, + "learning_rate": 1.3770593002805228e-05, + "loss": 1.8049, + "step": 4110 + }, + { + "epoch": 0.7913949515123806, + "grad_norm": 2.9900317204590183, + "learning_rate": 1.3767705057635885e-05, + "loss": 1.7794, + "step": 4111 + }, + { + "epoch": 0.791587458189956, + "grad_norm": 2.8574744503109986, + "learning_rate": 1.3764816746197405e-05, + "loss": 1.7062, + "step": 4112 + }, + { + "epoch": 0.791587458189956, + "lm_loss": 1.818, + "step": 4112, + "vm_loss": 0.1777 + }, + { + "epoch": 0.791587458189956, + "lm_loss": 1.8306, + "step": 4112, + "vm_loss": 0.1757 + }, + { + "epoch": 0.791587458189956, + "lm_loss": 1.937, + "step": 4112, + "vm_loss": 0.1607 + }, + { + "epoch": 0.791587458189956, + "lm_loss": 1.4639, + "step": 4112, + "vm_loss": 0.1724 + }, + { + "epoch": 0.791587458189956, + "lm_loss": 1.2267, + "step": 4112, + "vm_loss": 0.1577 + }, + { + "epoch": 0.791587458189956, + "lm_loss": 1.5697, + "step": 4112, + "vm_loss": 0.1418 + }, + { + "epoch": 0.791587458189956, + "lm_loss": 0.883, + "step": 4112, + "vm_loss": 0.1965 + }, + { + "epoch": 0.791587458189956, + "lm_loss": 1.985, + "step": 4112, + "vm_loss": 0.1322 + }, + { + "epoch": 0.7917799648675313, + "grad_norm": 3.0439372590121456, + "learning_rate": 1.3761928068770569e-05, + "loss": 1.8224, + "step": 4113 + }, + { + "epoch": 0.7919724715451067, + "grad_norm": 3.060132207663992, + "learning_rate": 1.3759039025636199e-05, + "loss": 1.8291, + "step": 4114 + }, + { + "epoch": 0.7921649782226821, + "grad_norm": 2.969659313034043, + "learning_rate": 1.3756149617075143e-05, + "loss": 1.7905, + "step": 4115 + }, + { + "epoch": 0.7923574849002575, + "grad_norm": 3.1349497200103285, + "learning_rate": 1.3753259843368287e-05, + "loss": 1.8445, + "step": 4116 + }, + { + "epoch": 0.7925499915778329, + "grad_norm": 3.032116286133235, + "learning_rate": 1.3750369704796557e-05, + "loss": 1.8869, + "step": 4117 + }, + { + "epoch": 0.7927424982554082, + "grad_norm": 2.961375721578509, + "learning_rate": 1.3747479201640914e-05, + "loss": 1.8675, + "step": 4118 + }, + { + "epoch": 0.7929350049329836, + "grad_norm": 2.9345590134424944, + "learning_rate": 1.3744588334182346e-05, + "loss": 1.7738, + "step": 4119 + }, + { + "epoch": 0.7931275116105589, + "grad_norm": 2.789534675391815, + "learning_rate": 1.3741697102701884e-05, + "loss": 1.7528, + "step": 4120 + }, + { + "epoch": 0.7931275116105589, + "lm_loss": 1.2631, + "step": 4120, + "vm_loss": 0.2968 + }, + { + "epoch": 0.7931275116105589, + "lm_loss": 1.7093, + "step": 4120, + "vm_loss": 0.1571 + }, + { + "epoch": 0.7931275116105589, + "lm_loss": 1.8607, + "step": 4120, + "vm_loss": 0.163 + }, + { + "epoch": 0.7931275116105589, + "lm_loss": 1.6359, + "step": 4120, + "vm_loss": 0.1152 + }, + { + "epoch": 0.7931275116105589, + "lm_loss": 1.6032, + "step": 4120, + "vm_loss": 0.1844 + }, + { + "epoch": 0.7931275116105589, + "lm_loss": 1.3036, + "step": 4120, + "vm_loss": 0.1734 + }, + { + "epoch": 0.7931275116105589, + "lm_loss": 1.4906, + "step": 4120, + "vm_loss": 0.1171 + }, + { + "epoch": 0.7931275116105589, + "lm_loss": 1.2669, + "step": 4120, + "vm_loss": 0.2248 + }, + { + "epoch": 0.7933200182881344, + "grad_norm": 2.8590303458393143, + "learning_rate": 1.3738805507480593e-05, + "loss": 1.7927, + "step": 4121 + }, + { + "epoch": 0.7935125249657098, + "grad_norm": 3.172233154223333, + "learning_rate": 1.3735913548799576e-05, + "loss": 1.9059, + "step": 4122 + }, + { + "epoch": 0.7937050316432851, + "grad_norm": 2.9152528967846534, + "learning_rate": 1.3733021226939962e-05, + "loss": 1.8546, + "step": 4123 + }, + { + "epoch": 0.7938975383208605, + "grad_norm": 3.0002474134073394, + "learning_rate": 1.3730128542182926e-05, + "loss": 1.8077, + "step": 4124 + }, + { + "epoch": 0.7940900449984358, + "grad_norm": 2.9023444029749377, + "learning_rate": 1.3727235494809677e-05, + "loss": 1.7438, + "step": 4125 + }, + { + "epoch": 0.7942825516760113, + "grad_norm": 2.9158866311670293, + "learning_rate": 1.372434208510145e-05, + "loss": 1.7853, + "step": 4126 + }, + { + "epoch": 0.7944750583535867, + "grad_norm": 3.1289160815928168, + "learning_rate": 1.3721448313339525e-05, + "loss": 1.7448, + "step": 4127 + }, + { + "epoch": 0.794667565031162, + "grad_norm": 3.0904899443513556, + "learning_rate": 1.371855417980521e-05, + "loss": 1.819, + "step": 4128 + }, + { + "epoch": 0.794667565031162, + "lm_loss": 1.3426, + "step": 4128, + "vm_loss": 0.1912 + }, + { + "epoch": 0.794667565031162, + "lm_loss": 1.8798, + "step": 4128, + "vm_loss": 0.1671 + }, + { + "epoch": 0.794667565031162, + "lm_loss": 1.2844, + "step": 4128, + "vm_loss": 0.1141 + }, + { + "epoch": 0.794667565031162, + "lm_loss": 1.5478, + "step": 4128, + "vm_loss": 0.1822 + }, + { + "epoch": 0.794667565031162, + "lm_loss": 1.8707, + "step": 4128, + "vm_loss": 0.1704 + }, + { + "epoch": 0.794667565031162, + "lm_loss": 1.6675, + "step": 4128, + "vm_loss": 0.2084 + }, + { + "epoch": 0.794667565031162, + "lm_loss": 1.7918, + "step": 4128, + "vm_loss": 0.1541 + }, + { + "epoch": 0.794667565031162, + "lm_loss": 1.8239, + "step": 4128, + "vm_loss": 0.1996 + }, + { + "epoch": 0.7948600717087374, + "grad_norm": 2.9179317679232053, + "learning_rate": 1.3715659684779857e-05, + "loss": 1.8219, + "step": 4129 + }, + { + "epoch": 0.7950525783863128, + "grad_norm": 3.071604802587858, + "learning_rate": 1.3712764828544844e-05, + "loss": 1.8166, + "step": 4130 + }, + { + "epoch": 0.7952450850638881, + "grad_norm": 2.961709357463004, + "learning_rate": 1.3709869611381592e-05, + "loss": 1.8102, + "step": 4131 + }, + { + "epoch": 0.7954375917414636, + "grad_norm": 2.8831109008537092, + "learning_rate": 1.3706974033571548e-05, + "loss": 1.7393, + "step": 4132 + }, + { + "epoch": 0.7956300984190389, + "grad_norm": 3.1013353062502684, + "learning_rate": 1.3704078095396204e-05, + "loss": 1.7627, + "step": 4133 + }, + { + "epoch": 0.7958226050966143, + "grad_norm": 3.0922116208491586, + "learning_rate": 1.3701181797137077e-05, + "loss": 1.7066, + "step": 4134 + }, + { + "epoch": 0.7960151117741897, + "grad_norm": 3.0261397418312312, + "learning_rate": 1.3698285139075732e-05, + "loss": 1.7395, + "step": 4135 + }, + { + "epoch": 0.796207618451765, + "grad_norm": 2.930750769491821, + "learning_rate": 1.369538812149375e-05, + "loss": 1.8324, + "step": 4136 + }, + { + "epoch": 0.796207618451765, + "lm_loss": 1.4476, + "step": 4136, + "vm_loss": 0.1536 + }, + { + "epoch": 0.796207618451765, + "lm_loss": 1.7678, + "step": 4136, + "vm_loss": 0.1829 + }, + { + "epoch": 0.796207618451765, + "lm_loss": 1.0921, + "step": 4136, + "vm_loss": 0.2071 + }, + { + "epoch": 0.796207618451765, + "lm_loss": 0.7299, + "step": 4136, + "vm_loss": 0.1631 + }, + { + "epoch": 0.796207618451765, + "lm_loss": 1.1343, + "step": 4136, + "vm_loss": 0.0957 + }, + { + "epoch": 0.796207618451765, + "lm_loss": 2.0802, + "step": 4136, + "vm_loss": 0.0988 + }, + { + "epoch": 0.796207618451765, + "lm_loss": 1.4803, + "step": 4136, + "vm_loss": 0.1539 + }, + { + "epoch": 0.796207618451765, + "lm_loss": 1.8327, + "step": 4136, + "vm_loss": 0.1967 + }, + { + "epoch": 0.7964001251293404, + "grad_norm": 3.1196954415221576, + "learning_rate": 1.3692490744672772e-05, + "loss": 1.7881, + "step": 4137 + }, + { + "epoch": 0.7965926318069158, + "grad_norm": 3.0863368487562424, + "learning_rate": 1.3689593008894448e-05, + "loss": 1.8187, + "step": 4138 + }, + { + "epoch": 0.7967851384844912, + "grad_norm": 2.8681528478976723, + "learning_rate": 1.3686694914440484e-05, + "loss": 1.8407, + "step": 4139 + }, + { + "epoch": 0.7969776451620666, + "grad_norm": 3.058069903730238, + "learning_rate": 1.3683796461592604e-05, + "loss": 1.7561, + "step": 4140 + }, + { + "epoch": 0.7971701518396419, + "grad_norm": 3.095041929547832, + "learning_rate": 1.3680897650632581e-05, + "loss": 1.8094, + "step": 4141 + }, + { + "epoch": 0.7973626585172173, + "grad_norm": 3.091448095377173, + "learning_rate": 1.3677998481842213e-05, + "loss": 1.8349, + "step": 4142 + }, + { + "epoch": 0.7975551651947927, + "grad_norm": 3.1095072101732737, + "learning_rate": 1.3675098955503336e-05, + "loss": 1.8109, + "step": 4143 + }, + { + "epoch": 0.7977476718723681, + "grad_norm": 3.040109112713848, + "learning_rate": 1.3672199071897825e-05, + "loss": 1.8036, + "step": 4144 + }, + { + "epoch": 0.7977476718723681, + "lm_loss": 1.3263, + "step": 4144, + "vm_loss": 0.1282 + }, + { + "epoch": 0.7977476718723681, + "lm_loss": 1.5142, + "step": 4144, + "vm_loss": 0.1961 + }, + { + "epoch": 0.7977476718723681, + "lm_loss": 2.0414, + "step": 4144, + "vm_loss": 0.2132 + }, + { + "epoch": 0.7977476718723681, + "lm_loss": 1.7033, + "step": 4144, + "vm_loss": 0.1691 + }, + { + "epoch": 0.7977476718723681, + "lm_loss": 1.4974, + "step": 4144, + "vm_loss": 0.1623 + }, + { + "epoch": 0.7977476718723681, + "lm_loss": 1.8169, + "step": 4144, + "vm_loss": 0.195 + }, + { + "epoch": 0.7977476718723681, + "lm_loss": 1.4401, + "step": 4144, + "vm_loss": 0.2103 + }, + { + "epoch": 0.7977476718723681, + "lm_loss": 1.0588, + "step": 4144, + "vm_loss": 0.1263 + }, + { + "epoch": 0.7979401785499435, + "grad_norm": 3.1020639310752025, + "learning_rate": 1.366929883130758e-05, + "loss": 1.7401, + "step": 4145 + }, + { + "epoch": 0.7981326852275188, + "grad_norm": 3.1023362498290865, + "learning_rate": 1.3666398234014546e-05, + "loss": 1.7683, + "step": 4146 + }, + { + "epoch": 0.7983251919050942, + "grad_norm": 2.9222792817742733, + "learning_rate": 1.3663497280300698e-05, + "loss": 1.6984, + "step": 4147 + }, + { + "epoch": 0.7985176985826696, + "grad_norm": 3.1709375783402236, + "learning_rate": 1.3660595970448044e-05, + "loss": 1.8345, + "step": 4148 + }, + { + "epoch": 0.7987102052602449, + "grad_norm": 3.2405450862529377, + "learning_rate": 1.3657694304738627e-05, + "loss": 1.7698, + "step": 4149 + }, + { + "epoch": 0.7989027119378204, + "grad_norm": 3.0996143666495724, + "learning_rate": 1.3654792283454529e-05, + "loss": 1.7334, + "step": 4150 + }, + { + "epoch": 0.7990952186153957, + "grad_norm": 3.153233711735116, + "learning_rate": 1.3651889906877865e-05, + "loss": 1.807, + "step": 4151 + }, + { + "epoch": 0.7992877252929711, + "grad_norm": 3.0355731091566382, + "learning_rate": 1.3648987175290777e-05, + "loss": 1.7163, + "step": 4152 + }, + { + "epoch": 0.7992877252929711, + "lm_loss": 1.4009, + "step": 4152, + "vm_loss": 0.149 + }, + { + "epoch": 0.7992877252929711, + "lm_loss": 2.0995, + "step": 4152, + "vm_loss": 0.1288 + }, + { + "epoch": 0.7992877252929711, + "lm_loss": 1.5432, + "step": 4152, + "vm_loss": 0.1971 + }, + { + "epoch": 0.7992877252929711, + "lm_loss": 1.8983, + "step": 4152, + "vm_loss": 0.1709 + }, + { + "epoch": 0.7992877252929711, + "lm_loss": 1.7048, + "step": 4152, + "vm_loss": 0.1671 + }, + { + "epoch": 0.7992877252929711, + "lm_loss": 1.7232, + "step": 4152, + "vm_loss": 0.2077 + }, + { + "epoch": 0.7992877252929711, + "lm_loss": 1.8925, + "step": 4152, + "vm_loss": 0.2076 + }, + { + "epoch": 0.7992877252929711, + "lm_loss": 1.6771, + "step": 4152, + "vm_loss": 0.1663 + }, + { + "epoch": 0.7994802319705465, + "grad_norm": 3.0203741570733946, + "learning_rate": 1.364608408897545e-05, + "loss": 1.7903, + "step": 4153 + }, + { + "epoch": 0.7996727386481218, + "grad_norm": 3.0844672072280517, + "learning_rate": 1.3643180648214105e-05, + "loss": 1.7259, + "step": 4154 + }, + { + "epoch": 0.7998652453256972, + "grad_norm": 3.0718964121654198, + "learning_rate": 1.3640276853288992e-05, + "loss": 1.7457, + "step": 4155 + }, + { + "epoch": 0.8000577520032726, + "grad_norm": 3.0013264252849163, + "learning_rate": 1.3637372704482397e-05, + "loss": 1.7349, + "step": 4156 + }, + { + "epoch": 0.800250258680848, + "grad_norm": 3.166518840818398, + "learning_rate": 1.3634468202076639e-05, + "loss": 1.8091, + "step": 4157 + }, + { + "epoch": 0.8004427653584234, + "grad_norm": 3.1510012996782413, + "learning_rate": 1.3631563346354072e-05, + "loss": 1.7817, + "step": 4158 + }, + { + "epoch": 0.8006352720359987, + "grad_norm": 2.9929104043703023, + "learning_rate": 1.3628658137597086e-05, + "loss": 1.7993, + "step": 4159 + }, + { + "epoch": 0.8008277787135741, + "grad_norm": 2.9075421072370107, + "learning_rate": 1.3625752576088105e-05, + "loss": 1.7418, + "step": 4160 + }, + { + "epoch": 0.8008277787135741, + "lm_loss": 1.62, + "step": 4160, + "vm_loss": 0.1835 + }, + { + "epoch": 0.8008277787135741, + "lm_loss": 1.3552, + "step": 4160, + "vm_loss": 0.1991 + }, + { + "epoch": 0.8008277787135741, + "lm_loss": 1.6661, + "step": 4160, + "vm_loss": 0.1693 + }, + { + "epoch": 0.8008277787135741, + "lm_loss": 1.0421, + "step": 4160, + "vm_loss": 0.1668 + }, + { + "epoch": 0.8008277787135741, + "lm_loss": 1.8515, + "step": 4160, + "vm_loss": 0.2132 + }, + { + "epoch": 0.8008277787135741, + "lm_loss": 1.8931, + "step": 4160, + "vm_loss": 0.2289 + }, + { + "epoch": 0.8008277787135741, + "lm_loss": 1.0161, + "step": 4160, + "vm_loss": 0.1207 + }, + { + "epoch": 0.8008277787135741, + "lm_loss": 1.0134, + "step": 4160, + "vm_loss": 0.1952 + }, + { + "epoch": 0.8010202853911496, + "grad_norm": 3.006315087639913, + "learning_rate": 1.3622846662109589e-05, + "loss": 1.8408, + "step": 4161 + }, + { + "epoch": 0.8012127920687249, + "grad_norm": 2.8956403409845257, + "learning_rate": 1.3619940395944027e-05, + "loss": 1.7977, + "step": 4162 + }, + { + "epoch": 0.8014052987463003, + "grad_norm": 2.8891525324244802, + "learning_rate": 1.3617033777873943e-05, + "loss": 1.7503, + "step": 4163 + }, + { + "epoch": 0.8015978054238756, + "grad_norm": 3.021975654259892, + "learning_rate": 1.3614126808181904e-05, + "loss": 1.781, + "step": 4164 + }, + { + "epoch": 0.801790312101451, + "grad_norm": 3.143310442513092, + "learning_rate": 1.36112194871505e-05, + "loss": 1.7302, + "step": 4165 + }, + { + "epoch": 0.8019828187790264, + "grad_norm": 3.2191722146098134, + "learning_rate": 1.3608311815062361e-05, + "loss": 1.7207, + "step": 4166 + }, + { + "epoch": 0.8021753254566018, + "grad_norm": 3.170107194061057, + "learning_rate": 1.360540379220015e-05, + "loss": 1.7594, + "step": 4167 + }, + { + "epoch": 0.8023678321341772, + "grad_norm": 3.0484191505845732, + "learning_rate": 1.3602495418846564e-05, + "loss": 1.774, + "step": 4168 + }, + { + "epoch": 0.8023678321341772, + "lm_loss": 1.2763, + "step": 4168, + "vm_loss": 0.1749 + }, + { + "epoch": 0.8023678321341772, + "lm_loss": 1.174, + "step": 4168, + "vm_loss": 0.2025 + }, + { + "epoch": 0.8023678321341772, + "lm_loss": 1.1064, + "step": 4168, + "vm_loss": 0.1621 + }, + { + "epoch": 0.8023678321341772, + "lm_loss": 1.2594, + "step": 4168, + "vm_loss": 0.1563 + }, + { + "epoch": 0.8023678321341772, + "lm_loss": 2.0125, + "step": 4168, + "vm_loss": 0.1338 + }, + { + "epoch": 0.8023678321341772, + "lm_loss": 1.9577, + "step": 4168, + "vm_loss": 0.1651 + }, + { + "epoch": 0.8023678321341772, + "lm_loss": 1.4861, + "step": 4168, + "vm_loss": 0.1617 + }, + { + "epoch": 0.8023678321341772, + "lm_loss": 1.7579, + "step": 4168, + "vm_loss": 0.1159 + }, + { + "epoch": 0.8025603388117525, + "grad_norm": 3.050749151806815, + "learning_rate": 1.3599586695284331e-05, + "loss": 1.694, + "step": 4169 + }, + { + "epoch": 0.8027528454893279, + "grad_norm": 2.86913173466317, + "learning_rate": 1.3596677621796221e-05, + "loss": 1.7799, + "step": 4170 + }, + { + "epoch": 0.8029453521669033, + "grad_norm": 2.8261896613352726, + "learning_rate": 1.3593768198665031e-05, + "loss": 1.743, + "step": 4171 + }, + { + "epoch": 0.8031378588444786, + "grad_norm": 2.9453427834323627, + "learning_rate": 1.3590858426173595e-05, + "loss": 1.7357, + "step": 4172 + }, + { + "epoch": 0.8033303655220541, + "grad_norm": 3.1042981432954364, + "learning_rate": 1.3587948304604781e-05, + "loss": 1.8153, + "step": 4173 + }, + { + "epoch": 0.8035228721996295, + "grad_norm": 3.1985162848087674, + "learning_rate": 1.3585037834241485e-05, + "loss": 1.7691, + "step": 4174 + }, + { + "epoch": 0.8037153788772048, + "grad_norm": 3.1839762209239124, + "learning_rate": 1.3582127015366649e-05, + "loss": 1.8489, + "step": 4175 + }, + { + "epoch": 0.8039078855547802, + "grad_norm": 3.1115510513237528, + "learning_rate": 1.3579215848263234e-05, + "loss": 1.7104, + "step": 4176 + }, + { + "epoch": 0.8039078855547802, + "lm_loss": 1.4175, + "step": 4176, + "vm_loss": 0.146 + }, + { + "epoch": 0.8039078855547802, + "lm_loss": 1.4508, + "step": 4176, + "vm_loss": 0.1762 + }, + { + "epoch": 0.8039078855547802, + "lm_loss": 1.7968, + "step": 4176, + "vm_loss": 0.1287 + }, + { + "epoch": 0.8039078855547802, + "lm_loss": 1.9964, + "step": 4176, + "vm_loss": 0.1483 + }, + { + "epoch": 0.8039078855547802, + "lm_loss": 1.0855, + "step": 4176, + "vm_loss": 0.1774 + }, + { + "epoch": 0.8039078855547802, + "lm_loss": 1.7647, + "step": 4176, + "vm_loss": 0.1991 + }, + { + "epoch": 0.8039078855547802, + "lm_loss": 1.7331, + "step": 4176, + "vm_loss": 0.1602 + }, + { + "epoch": 0.8039078855547802, + "lm_loss": 1.94, + "step": 4176, + "vm_loss": 0.1761 + }, + { + "epoch": 0.8041003922323555, + "grad_norm": 3.0471759909201683, + "learning_rate": 1.357630433321425e-05, + "loss": 1.7945, + "step": 4177 + }, + { + "epoch": 0.8042928989099309, + "grad_norm": 3.1023322126403063, + "learning_rate": 1.3573392470502729e-05, + "loss": 1.7121, + "step": 4178 + }, + { + "epoch": 0.8044854055875064, + "grad_norm": 3.1005887719437983, + "learning_rate": 1.3570480260411745e-05, + "loss": 1.8209, + "step": 4179 + }, + { + "epoch": 0.8046779122650817, + "grad_norm": 2.8865554987876743, + "learning_rate": 1.3567567703224398e-05, + "loss": 1.7591, + "step": 4180 + }, + { + "epoch": 0.8048704189426571, + "grad_norm": 2.9767501269679117, + "learning_rate": 1.356465479922383e-05, + "loss": 1.7614, + "step": 4181 + }, + { + "epoch": 0.8050629256202324, + "grad_norm": 2.9145814771687113, + "learning_rate": 1.3561741548693208e-05, + "loss": 1.7493, + "step": 4182 + }, + { + "epoch": 0.8052554322978078, + "grad_norm": 3.19190644211542, + "learning_rate": 1.355882795191574e-05, + "loss": 1.7349, + "step": 4183 + }, + { + "epoch": 0.8054479389753832, + "grad_norm": 3.2383860814875027, + "learning_rate": 1.3555914009174665e-05, + "loss": 1.8099, + "step": 4184 + }, + { + "epoch": 0.8054479389753832, + "lm_loss": 1.7311, + "step": 4184, + "vm_loss": 0.1708 + }, + { + "epoch": 0.8054479389753832, + "lm_loss": 1.6794, + "step": 4184, + "vm_loss": 0.1377 + }, + { + "epoch": 0.8054479389753832, + "lm_loss": 1.6862, + "step": 4184, + "vm_loss": 0.1839 + }, + { + "epoch": 0.8054479389753832, + "lm_loss": 1.3711, + "step": 4184, + "vm_loss": 0.1306 + }, + { + "epoch": 0.8054479389753832, + "lm_loss": 1.3464, + "step": 4184, + "vm_loss": 0.1746 + }, + { + "epoch": 0.8054479389753832, + "lm_loss": 1.8787, + "step": 4184, + "vm_loss": 0.148 + }, + { + "epoch": 0.8054479389753832, + "lm_loss": 2.037, + "step": 4184, + "vm_loss": 0.1514 + }, + { + "epoch": 0.8054479389753832, + "lm_loss": 1.5833, + "step": 4184, + "vm_loss": 0.199 + }, + { + "epoch": 0.8056404456529586, + "grad_norm": 3.0350628682987915, + "learning_rate": 1.3552999720753255e-05, + "loss": 1.7292, + "step": 4185 + }, + { + "epoch": 0.805832952330534, + "grad_norm": 2.8679878356224813, + "learning_rate": 1.3550085086934815e-05, + "loss": 1.6892, + "step": 4186 + }, + { + "epoch": 0.8060254590081093, + "grad_norm": 2.930655824720837, + "learning_rate": 1.3547170108002685e-05, + "loss": 1.8168, + "step": 4187 + }, + { + "epoch": 0.8062179656856847, + "grad_norm": 3.0648528032094515, + "learning_rate": 1.3544254784240243e-05, + "loss": 1.7858, + "step": 4188 + }, + { + "epoch": 0.8064104723632601, + "grad_norm": 3.1633340868402433, + "learning_rate": 1.3541339115930887e-05, + "loss": 1.7398, + "step": 4189 + }, + { + "epoch": 0.8066029790408354, + "grad_norm": 3.034954374668336, + "learning_rate": 1.3538423103358066e-05, + "loss": 1.7034, + "step": 4190 + }, + { + "epoch": 0.8067954857184109, + "grad_norm": 3.095265537520487, + "learning_rate": 1.3535506746805247e-05, + "loss": 1.7476, + "step": 4191 + }, + { + "epoch": 0.8069879923959863, + "grad_norm": 3.268188239359184, + "learning_rate": 1.3532590046555939e-05, + "loss": 1.7765, + "step": 4192 + }, + { + "epoch": 0.8069879923959863, + "lm_loss": 1.5298, + "step": 4192, + "vm_loss": 0.1768 + }, + { + "epoch": 0.8069879923959863, + "lm_loss": 1.5119, + "step": 4192, + "vm_loss": 0.1611 + }, + { + "epoch": 0.8069879923959863, + "lm_loss": 1.3325, + "step": 4192, + "vm_loss": 0.1154 + }, + { + "epoch": 0.8069879923959863, + "lm_loss": 1.9065, + "step": 4192, + "vm_loss": 0.1302 + }, + { + "epoch": 0.8069879923959863, + "lm_loss": 1.3308, + "step": 4192, + "vm_loss": 0.2217 + }, + { + "epoch": 0.8069879923959863, + "lm_loss": 1.3178, + "step": 4192, + "vm_loss": 0.13 + }, + { + "epoch": 0.8069879923959863, + "lm_loss": 1.5023, + "step": 4192, + "vm_loss": 0.2103 + }, + { + "epoch": 0.8069879923959863, + "lm_loss": 1.5513, + "step": 4192, + "vm_loss": 0.1499 + }, + { + "epoch": 0.8071804990735616, + "grad_norm": 3.230598820652005, + "learning_rate": 1.3529673002893686e-05, + "loss": 1.7385, + "step": 4193 + }, + { + "epoch": 0.807373005751137, + "grad_norm": 3.060410825098863, + "learning_rate": 1.3526755616102058e-05, + "loss": 1.7631, + "step": 4194 + }, + { + "epoch": 0.8075655124287123, + "grad_norm": 2.9509573237221067, + "learning_rate": 1.3523837886464664e-05, + "loss": 1.7553, + "step": 4195 + }, + { + "epoch": 0.8077580191062878, + "grad_norm": 3.0285404122721484, + "learning_rate": 1.3520919814265146e-05, + "loss": 1.7242, + "step": 4196 + }, + { + "epoch": 0.8079505257838632, + "grad_norm": 3.168218591535499, + "learning_rate": 1.3518001399787174e-05, + "loss": 1.6952, + "step": 4197 + }, + { + "epoch": 0.8081430324614385, + "grad_norm": 2.9021182303180666, + "learning_rate": 1.3515082643314457e-05, + "loss": 1.7218, + "step": 4198 + }, + { + "epoch": 0.8083355391390139, + "grad_norm": 3.0740080244492978, + "learning_rate": 1.3512163545130736e-05, + "loss": 1.7938, + "step": 4199 + }, + { + "epoch": 0.8085280458165892, + "grad_norm": 3.185109915921439, + "learning_rate": 1.3509244105519788e-05, + "loss": 1.7589, + "step": 4200 + }, + { + "epoch": 0.8085280458165892, + "lm_loss": 1.5822, + "step": 4200, + "vm_loss": 0.2174 + }, + { + "epoch": 0.8085280458165892, + "lm_loss": 1.1601, + "step": 4200, + "vm_loss": 0.1645 + }, + { + "epoch": 0.8085280458165892, + "lm_loss": 2.2057, + "step": 4200, + "vm_loss": 0.1875 + }, + { + "epoch": 0.8085280458165892, + "lm_loss": 1.3627, + "step": 4200, + "vm_loss": 0.1052 + }, + { + "epoch": 0.8085280458165892, + "lm_loss": 1.9234, + "step": 4200, + "vm_loss": 0.1326 + }, + { + "epoch": 0.8085280458165892, + "lm_loss": 0.8681, + "step": 4200, + "vm_loss": 0.1101 + }, + { + "epoch": 0.8085280458165892, + "lm_loss": 1.2193, + "step": 4200, + "vm_loss": 0.1685 + }, + { + "epoch": 0.8085280458165892, + "lm_loss": 1.3911, + "step": 4200, + "vm_loss": 0.2057 + }, + { + "epoch": 0.8087205524941646, + "grad_norm": 2.8091591189525618, + "learning_rate": 1.3506324324765414e-05, + "loss": 1.6959, + "step": 4201 + }, + { + "epoch": 0.8089130591717401, + "grad_norm": 3.0075184815212177, + "learning_rate": 1.3503404203151458e-05, + "loss": 1.766, + "step": 4202 + }, + { + "epoch": 0.8091055658493154, + "grad_norm": 2.950334567951103, + "learning_rate": 1.3500483740961792e-05, + "loss": 1.749, + "step": 4203 + }, + { + "epoch": 0.8092980725268908, + "grad_norm": 2.9699652426456447, + "learning_rate": 1.349756293848032e-05, + "loss": 1.7783, + "step": 4204 + }, + { + "epoch": 0.8094905792044662, + "grad_norm": 3.103827831584143, + "learning_rate": 1.3494641795990986e-05, + "loss": 1.7509, + "step": 4205 + }, + { + "epoch": 0.8096830858820415, + "grad_norm": 3.0540475358481642, + "learning_rate": 1.3491720313777756e-05, + "loss": 1.7321, + "step": 4206 + }, + { + "epoch": 0.8098755925596169, + "grad_norm": 3.1431337847449012, + "learning_rate": 1.3488798492124645e-05, + "loss": 1.697, + "step": 4207 + }, + { + "epoch": 0.8100680992371923, + "grad_norm": 3.036980372111481, + "learning_rate": 1.3485876331315682e-05, + "loss": 1.7405, + "step": 4208 + }, + { + "epoch": 0.8100680992371923, + "lm_loss": 1.4505, + "step": 4208, + "vm_loss": 0.1321 + }, + { + "epoch": 0.8100680992371923, + "lm_loss": 1.515, + "step": 4208, + "vm_loss": 0.2041 + }, + { + "epoch": 0.8100680992371923, + "lm_loss": 1.8511, + "step": 4208, + "vm_loss": 0.1229 + }, + { + "epoch": 0.8100680992371923, + "lm_loss": 1.4691, + "step": 4208, + "vm_loss": 0.1841 + }, + { + "epoch": 0.8100680992371923, + "lm_loss": 1.7261, + "step": 4208, + "vm_loss": 0.2225 + }, + { + "epoch": 0.8100680992371923, + "lm_loss": 1.8041, + "step": 4208, + "vm_loss": 0.1161 + }, + { + "epoch": 0.8100680992371923, + "lm_loss": 1.55, + "step": 4208, + "vm_loss": 0.173 + }, + { + "epoch": 0.8100680992371923, + "lm_loss": 1.7651, + "step": 4208, + "vm_loss": 0.2029 + }, + { + "epoch": 0.8102606059147677, + "grad_norm": 2.9630933375614137, + "learning_rate": 1.3482953831634941e-05, + "loss": 1.7285, + "step": 4209 + }, + { + "epoch": 0.8104531125923431, + "grad_norm": 3.1595450969622054, + "learning_rate": 1.348003099336653e-05, + "loss": 1.727, + "step": 4210 + }, + { + "epoch": 0.8106456192699184, + "grad_norm": 3.4164469045954564, + "learning_rate": 1.3477107816794583e-05, + "loss": 1.7991, + "step": 4211 + }, + { + "epoch": 0.8108381259474938, + "grad_norm": 3.092609435794148, + "learning_rate": 1.3474184302203275e-05, + "loss": 1.7423, + "step": 4212 + }, + { + "epoch": 0.8110306326250691, + "grad_norm": 2.970812730307828, + "learning_rate": 1.34712604498768e-05, + "loss": 1.7448, + "step": 4213 + }, + { + "epoch": 0.8112231393026446, + "grad_norm": 3.135328031365485, + "learning_rate": 1.3468336260099401e-05, + "loss": 1.7889, + "step": 4214 + }, + { + "epoch": 0.81141564598022, + "grad_norm": 3.434772059259396, + "learning_rate": 1.3465411733155342e-05, + "loss": 1.8172, + "step": 4215 + }, + { + "epoch": 0.8116081526577953, + "grad_norm": 3.2510891722237387, + "learning_rate": 1.346248686932893e-05, + "loss": 1.8063, + "step": 4216 + }, + { + "epoch": 0.8116081526577953, + "lm_loss": 1.8768, + "step": 4216, + "vm_loss": 0.128 + }, + { + "epoch": 0.8116081526577953, + "lm_loss": 1.9212, + "step": 4216, + "vm_loss": 0.1647 + }, + { + "epoch": 0.8116081526577953, + "lm_loss": 1.9487, + "step": 4216, + "vm_loss": 0.178 + }, + { + "epoch": 0.8116081526577953, + "lm_loss": 2.128, + "step": 4216, + "vm_loss": 0.1297 + }, + { + "epoch": 0.8116081526577953, + "lm_loss": 1.7591, + "step": 4216, + "vm_loss": 0.1224 + }, + { + "epoch": 0.8116081526577953, + "lm_loss": 1.5731, + "step": 4216, + "vm_loss": 0.1958 + }, + { + "epoch": 0.8116081526577953, + "lm_loss": 1.5061, + "step": 4216, + "vm_loss": 0.1459 + }, + { + "epoch": 0.8116081526577953, + "lm_loss": 1.5745, + "step": 4216, + "vm_loss": 0.2486 + }, + { + "epoch": 0.8118006593353707, + "grad_norm": 3.087688855434372, + "learning_rate": 1.3459561668904496e-05, + "loss": 1.8176, + "step": 4217 + }, + { + "epoch": 0.811993166012946, + "grad_norm": 3.0048426480157793, + "learning_rate": 1.3456636132166406e-05, + "loss": 1.7865, + "step": 4218 + }, + { + "epoch": 0.8121856726905214, + "grad_norm": 2.9385449217627073, + "learning_rate": 1.345371025939906e-05, + "loss": 1.8323, + "step": 4219 + }, + { + "epoch": 0.8123781793680969, + "grad_norm": 3.11194397351124, + "learning_rate": 1.3450784050886891e-05, + "loss": 1.7165, + "step": 4220 + }, + { + "epoch": 0.8125706860456722, + "grad_norm": 3.10133339124572, + "learning_rate": 1.3447857506914363e-05, + "loss": 1.7359, + "step": 4221 + }, + { + "epoch": 0.8127631927232476, + "grad_norm": 3.1377905145886444, + "learning_rate": 1.3444930627765978e-05, + "loss": 1.7256, + "step": 4222 + }, + { + "epoch": 0.812955699400823, + "grad_norm": 3.135955377110187, + "learning_rate": 1.344200341372626e-05, + "loss": 1.7197, + "step": 4223 + }, + { + "epoch": 0.8131482060783983, + "grad_norm": 2.995005178114232, + "learning_rate": 1.3439075865079777e-05, + "loss": 1.6991, + "step": 4224 + }, + { + "epoch": 0.8131482060783983, + "lm_loss": 1.5297, + "step": 4224, + "vm_loss": 0.1808 + }, + { + "epoch": 0.8131482060783983, + "lm_loss": 1.4745, + "step": 4224, + "vm_loss": 0.129 + }, + { + "epoch": 0.8131482060783983, + "lm_loss": 1.798, + "step": 4224, + "vm_loss": 0.161 + }, + { + "epoch": 0.8131482060783983, + "lm_loss": 1.1874, + "step": 4224, + "vm_loss": 0.1217 + }, + { + "epoch": 0.8131482060783983, + "lm_loss": 1.6013, + "step": 4224, + "vm_loss": 0.1841 + }, + { + "epoch": 0.8131482060783983, + "lm_loss": 2.045, + "step": 4224, + "vm_loss": 0.1468 + }, + { + "epoch": 0.8131482060783983, + "lm_loss": 0.9502, + "step": 4224, + "vm_loss": 0.2246 + }, + { + "epoch": 0.8131482060783983, + "lm_loss": 1.5346, + "step": 4224, + "vm_loss": 0.1682 + }, + { + "epoch": 0.8133407127559737, + "grad_norm": 3.004449221976845, + "learning_rate": 1.3436147982111116e-05, + "loss": 1.6784, + "step": 4225 + }, + { + "epoch": 0.8135332194335491, + "grad_norm": 3.1928208966914564, + "learning_rate": 1.3433219765104913e-05, + "loss": 1.7894, + "step": 4226 + }, + { + "epoch": 0.8137257261111245, + "grad_norm": 3.025728406355041, + "learning_rate": 1.3430291214345825e-05, + "loss": 1.7178, + "step": 4227 + }, + { + "epoch": 0.8139182327886999, + "grad_norm": 3.010848551962657, + "learning_rate": 1.3427362330118542e-05, + "loss": 1.7906, + "step": 4228 + }, + { + "epoch": 0.8141107394662752, + "grad_norm": 3.077800366060747, + "learning_rate": 1.3424433112707798e-05, + "loss": 1.7662, + "step": 4229 + }, + { + "epoch": 0.8143032461438506, + "grad_norm": 2.9697349009555674, + "learning_rate": 1.3421503562398339e-05, + "loss": 1.8232, + "step": 4230 + }, + { + "epoch": 0.814495752821426, + "grad_norm": 2.961893574534323, + "learning_rate": 1.3418573679474962e-05, + "loss": 1.7537, + "step": 4231 + }, + { + "epoch": 0.8146882594990014, + "grad_norm": 3.0719664336014314, + "learning_rate": 1.3415643464222487e-05, + "loss": 1.7107, + "step": 4232 + }, + { + "epoch": 0.8146882594990014, + "lm_loss": 1.7374, + "step": 4232, + "vm_loss": 0.2452 + }, + { + "epoch": 0.8146882594990014, + "lm_loss": 1.1992, + "step": 4232, + "vm_loss": 0.1452 + }, + { + "epoch": 0.8146882594990014, + "lm_loss": 1.6857, + "step": 4232, + "vm_loss": 0.1611 + }, + { + "epoch": 0.8146882594990014, + "lm_loss": 1.5649, + "step": 4232, + "vm_loss": 0.1123 + }, + { + "epoch": 0.8146882594990014, + "lm_loss": 1.6239, + "step": 4232, + "vm_loss": 0.2647 + }, + { + "epoch": 0.8146882594990014, + "lm_loss": 1.8705, + "step": 4232, + "vm_loss": 0.226 + }, + { + "epoch": 0.8146882594990014, + "lm_loss": 1.4023, + "step": 4232, + "vm_loss": 0.1215 + }, + { + "epoch": 0.8146882594990014, + "lm_loss": 1.3925, + "step": 4232, + "vm_loss": 0.1695 + }, + { + "epoch": 0.8148807661765768, + "grad_norm": 3.056474152000144, + "learning_rate": 1.3412712916925768e-05, + "loss": 1.7672, + "step": 4233 + }, + { + "epoch": 0.8150732728541521, + "grad_norm": 3.118553472919914, + "learning_rate": 1.3409782037869692e-05, + "loss": 1.7945, + "step": 4234 + }, + { + "epoch": 0.8152657795317275, + "grad_norm": 2.9588114611600616, + "learning_rate": 1.3406850827339177e-05, + "loss": 1.7608, + "step": 4235 + }, + { + "epoch": 0.8154582862093029, + "grad_norm": 2.952878484538887, + "learning_rate": 1.3403919285619179e-05, + "loss": 1.7732, + "step": 4236 + }, + { + "epoch": 0.8156507928868783, + "grad_norm": 3.139813173629082, + "learning_rate": 1.3400987412994673e-05, + "loss": 1.7564, + "step": 4237 + }, + { + "epoch": 0.8158432995644537, + "grad_norm": 3.113546562427499, + "learning_rate": 1.339805520975068e-05, + "loss": 1.7142, + "step": 4238 + }, + { + "epoch": 0.816035806242029, + "grad_norm": 3.0329534417999144, + "learning_rate": 1.3395122676172248e-05, + "loss": 1.8003, + "step": 4239 + }, + { + "epoch": 0.8162283129196044, + "grad_norm": 2.9078464008822977, + "learning_rate": 1.3392189812544454e-05, + "loss": 1.7531, + "step": 4240 + }, + { + "epoch": 0.8162283129196044, + "lm_loss": 1.3255, + "step": 4240, + "vm_loss": 0.1326 + }, + { + "epoch": 0.8162283129196044, + "lm_loss": 1.9207, + "step": 4240, + "vm_loss": 0.1842 + }, + { + "epoch": 0.8162283129196044, + "lm_loss": 1.3671, + "step": 4240, + "vm_loss": 0.1593 + }, + { + "epoch": 0.8162283129196044, + "lm_loss": 1.8623, + "step": 4240, + "vm_loss": 0.1441 + }, + { + "epoch": 0.8162283129196044, + "lm_loss": 1.2443, + "step": 4240, + "vm_loss": 0.1989 + }, + { + "epoch": 0.8162283129196044, + "lm_loss": 1.7799, + "step": 4240, + "vm_loss": 0.1465 + }, + { + "epoch": 0.8162283129196044, + "lm_loss": 1.9387, + "step": 4240, + "vm_loss": 0.1828 + }, + { + "epoch": 0.8162283129196044, + "lm_loss": 1.9082, + "step": 4240, + "vm_loss": 0.2437 + }, + { + "epoch": 0.8164208195971798, + "grad_norm": 3.0017982685155107, + "learning_rate": 1.3389256619152416e-05, + "loss": 1.8217, + "step": 4241 + }, + { + "epoch": 0.8166133262747551, + "grad_norm": 3.0487075049373074, + "learning_rate": 1.3386323096281268e-05, + "loss": 1.7352, + "step": 4242 + }, + { + "epoch": 0.8168058329523306, + "grad_norm": 2.977176962346174, + "learning_rate": 1.3383389244216193e-05, + "loss": 1.7995, + "step": 4243 + }, + { + "epoch": 0.8169983396299059, + "grad_norm": 2.8916049415010425, + "learning_rate": 1.3380455063242399e-05, + "loss": 1.7063, + "step": 4244 + }, + { + "epoch": 0.8171908463074813, + "grad_norm": 2.9897527401065895, + "learning_rate": 1.3377520553645122e-05, + "loss": 1.7536, + "step": 4245 + }, + { + "epoch": 0.8173833529850567, + "grad_norm": 2.97455430838434, + "learning_rate": 1.3374585715709638e-05, + "loss": 1.7192, + "step": 4246 + }, + { + "epoch": 0.817575859662632, + "grad_norm": 3.022859891085978, + "learning_rate": 1.3371650549721246e-05, + "loss": 1.7041, + "step": 4247 + }, + { + "epoch": 0.8177683663402074, + "grad_norm": 2.926727736452646, + "learning_rate": 1.3368715055965289e-05, + "loss": 1.6899, + "step": 4248 + }, + { + "epoch": 0.8177683663402074, + "lm_loss": 1.7005, + "step": 4248, + "vm_loss": 0.1792 + }, + { + "epoch": 0.8177683663402074, + "lm_loss": 1.0538, + "step": 4248, + "vm_loss": 0.1826 + }, + { + "epoch": 0.8177683663402074, + "lm_loss": 1.6263, + "step": 4248, + "vm_loss": 0.1551 + }, + { + "epoch": 0.8177683663402074, + "lm_loss": 1.5526, + "step": 4248, + "vm_loss": 0.1669 + }, + { + "epoch": 0.8177683663402074, + "lm_loss": 1.5682, + "step": 4248, + "vm_loss": 0.223 + }, + { + "epoch": 0.8177683663402074, + "lm_loss": 1.5879, + "step": 4248, + "vm_loss": 0.1681 + }, + { + "epoch": 0.8177683663402074, + "lm_loss": 1.8125, + "step": 4248, + "vm_loss": 0.1872 + }, + { + "epoch": 0.8177683663402074, + "lm_loss": 2.2098, + "step": 4248, + "vm_loss": 0.1633 + }, + { + "epoch": 0.8179608730177828, + "grad_norm": 2.9584910565929032, + "learning_rate": 1.3365779234727127e-05, + "loss": 1.7387, + "step": 4249 + }, + { + "epoch": 0.8181533796953582, + "grad_norm": 3.195450526833678, + "learning_rate": 1.336284308629216e-05, + "loss": 1.654, + "step": 4250 + }, + { + "epoch": 0.8183458863729336, + "grad_norm": 3.1213635065641743, + "learning_rate": 1.3359906610945829e-05, + "loss": 1.7943, + "step": 4251 + }, + { + "epoch": 0.8185383930505089, + "grad_norm": 3.060303564759357, + "learning_rate": 1.3356969808973583e-05, + "loss": 1.8135, + "step": 4252 + }, + { + "epoch": 0.8187308997280843, + "grad_norm": 2.907618765168843, + "learning_rate": 1.3354032680660928e-05, + "loss": 1.7595, + "step": 4253 + }, + { + "epoch": 0.8189234064056597, + "grad_norm": 3.0830533502737145, + "learning_rate": 1.3351095226293383e-05, + "loss": 1.7521, + "step": 4254 + }, + { + "epoch": 0.8191159130832351, + "grad_norm": 3.002790591094632, + "learning_rate": 1.3348157446156508e-05, + "loss": 1.7921, + "step": 4255 + }, + { + "epoch": 0.8193084197608105, + "grad_norm": 3.12771378595863, + "learning_rate": 1.3345219340535897e-05, + "loss": 1.7093, + "step": 4256 + }, + { + "epoch": 0.8193084197608105, + "lm_loss": 1.5638, + "step": 4256, + "vm_loss": 0.1772 + }, + { + "epoch": 0.8193084197608105, + "lm_loss": 1.5537, + "step": 4256, + "vm_loss": 0.1042 + }, + { + "epoch": 0.8193084197608105, + "lm_loss": 1.8795, + "step": 4256, + "vm_loss": 0.1811 + }, + { + "epoch": 0.8193084197608105, + "lm_loss": 1.833, + "step": 4256, + "vm_loss": 0.1003 + }, + { + "epoch": 0.8193084197608105, + "lm_loss": 1.6799, + "step": 4256, + "vm_loss": 0.2417 + }, + { + "epoch": 0.8193084197608105, + "lm_loss": 2.0418, + "step": 4256, + "vm_loss": 0.1966 + }, + { + "epoch": 0.8193084197608105, + "lm_loss": 1.5613, + "step": 4256, + "vm_loss": 0.1418 + }, + { + "epoch": 0.8193084197608105, + "lm_loss": 1.3785, + "step": 4256, + "vm_loss": 0.0856 + }, + { + "epoch": 0.8195009264383858, + "grad_norm": 3.087553954059264, + "learning_rate": 1.3342280909717166e-05, + "loss": 1.7217, + "step": 4257 + }, + { + "epoch": 0.8196934331159612, + "grad_norm": 3.297333111638422, + "learning_rate": 1.3339342153985973e-05, + "loss": 1.7545, + "step": 4258 + }, + { + "epoch": 0.8198859397935366, + "grad_norm": 3.126109433302256, + "learning_rate": 1.3336403073627997e-05, + "loss": 1.7496, + "step": 4259 + }, + { + "epoch": 0.8200784464711119, + "grad_norm": 3.0400943295068803, + "learning_rate": 1.3333463668928958e-05, + "loss": 1.7762, + "step": 4260 + }, + { + "epoch": 0.8202709531486874, + "grad_norm": 3.0445408784785633, + "learning_rate": 1.3330523940174603e-05, + "loss": 1.7057, + "step": 4261 + }, + { + "epoch": 0.8204634598262627, + "grad_norm": 2.933298159888886, + "learning_rate": 1.332758388765071e-05, + "loss": 1.7476, + "step": 4262 + }, + { + "epoch": 0.8206559665038381, + "grad_norm": 2.967201689873884, + "learning_rate": 1.3324643511643091e-05, + "loss": 1.7618, + "step": 4263 + }, + { + "epoch": 0.8208484731814135, + "grad_norm": 3.1038049182627345, + "learning_rate": 1.3321702812437591e-05, + "loss": 1.7328, + "step": 4264 + }, + { + "epoch": 0.8208484731814135, + "lm_loss": 1.3352, + "step": 4264, + "vm_loss": 0.2115 + }, + { + "epoch": 0.8208484731814135, + "lm_loss": 1.6949, + "step": 4264, + "vm_loss": 0.125 + }, + { + "epoch": 0.8208484731814135, + "lm_loss": 1.4472, + "step": 4264, + "vm_loss": 0.1165 + }, + { + "epoch": 0.8208484731814135, + "lm_loss": 2.0302, + "step": 4264, + "vm_loss": 0.1158 + }, + { + "epoch": 0.8208484731814135, + "lm_loss": 1.5004, + "step": 4264, + "vm_loss": 0.1711 + }, + { + "epoch": 0.8208484731814135, + "lm_loss": 2.2003, + "step": 4264, + "vm_loss": 0.198 + }, + { + "epoch": 0.8208484731814135, + "lm_loss": 1.6877, + "step": 4264, + "vm_loss": 0.1892 + }, + { + "epoch": 0.8208484731814135, + "lm_loss": 1.4109, + "step": 4264, + "vm_loss": 0.1669 + }, + { + "epoch": 0.8210409798589888, + "grad_norm": 3.4524913755032887, + "learning_rate": 1.3318761790320078e-05, + "loss": 1.7804, + "step": 4265 + }, + { + "epoch": 0.8212334865365643, + "grad_norm": 3.084612718140637, + "learning_rate": 1.3315820445576461e-05, + "loss": 1.7303, + "step": 4266 + }, + { + "epoch": 0.8214259932141396, + "grad_norm": 2.86453479820018, + "learning_rate": 1.3312878778492673e-05, + "loss": 1.7261, + "step": 4267 + }, + { + "epoch": 0.821618499891715, + "grad_norm": 2.9642940257281603, + "learning_rate": 1.3309936789354687e-05, + "loss": 1.7786, + "step": 4268 + }, + { + "epoch": 0.8218110065692904, + "grad_norm": 2.872544224702129, + "learning_rate": 1.3306994478448497e-05, + "loss": 1.6908, + "step": 4269 + }, + { + "epoch": 0.8220035132468657, + "grad_norm": 3.24797991298083, + "learning_rate": 1.3304051846060135e-05, + "loss": 1.7209, + "step": 4270 + }, + { + "epoch": 0.8221960199244411, + "grad_norm": 3.2505037606017484, + "learning_rate": 1.3301108892475662e-05, + "loss": 1.6965, + "step": 4271 + }, + { + "epoch": 0.8223885266020166, + "grad_norm": 3.31943131891016, + "learning_rate": 1.3298165617981171e-05, + "loss": 1.7208, + "step": 4272 + }, + { + "epoch": 0.8223885266020166, + "lm_loss": 1.3379, + "step": 4272, + "vm_loss": 0.1484 + }, + { + "epoch": 0.8223885266020166, + "lm_loss": 1.7845, + "step": 4272, + "vm_loss": 0.177 + }, + { + "epoch": 0.8223885266020166, + "lm_loss": 1.5951, + "step": 4272, + "vm_loss": 0.2236 + }, + { + "epoch": 0.8223885266020166, + "lm_loss": 1.9505, + "step": 4272, + "vm_loss": 0.1442 + }, + { + "epoch": 0.8223885266020166, + "lm_loss": 1.632, + "step": 4272, + "vm_loss": 0.1461 + }, + { + "epoch": 0.8223885266020166, + "lm_loss": 1.5017, + "step": 4272, + "vm_loss": 0.1583 + }, + { + "epoch": 0.8223885266020166, + "lm_loss": 1.7023, + "step": 4272, + "vm_loss": 0.2229 + }, + { + "epoch": 0.8223885266020166, + "lm_loss": 1.5454, + "step": 4272, + "vm_loss": 0.1333 + }, + { + "epoch": 0.8225810332795919, + "grad_norm": 3.173916383370582, + "learning_rate": 1.3295222022862786e-05, + "loss": 1.7939, + "step": 4273 + }, + { + "epoch": 0.8227735399571673, + "grad_norm": 3.020100243496548, + "learning_rate": 1.3292278107406662e-05, + "loss": 1.7385, + "step": 4274 + }, + { + "epoch": 0.8229660466347426, + "grad_norm": 2.8670773424380767, + "learning_rate": 1.3289333871898985e-05, + "loss": 1.6237, + "step": 4275 + }, + { + "epoch": 0.823158553312318, + "grad_norm": 3.078847320816505, + "learning_rate": 1.3286389316625974e-05, + "loss": 1.8541, + "step": 4276 + }, + { + "epoch": 0.8233510599898934, + "grad_norm": 3.267465954184543, + "learning_rate": 1.3283444441873879e-05, + "loss": 1.7641, + "step": 4277 + }, + { + "epoch": 0.8235435666674688, + "grad_norm": 3.0722589517473287, + "learning_rate": 1.3280499247928971e-05, + "loss": 1.7111, + "step": 4278 + }, + { + "epoch": 0.8237360733450442, + "grad_norm": 2.972695817002317, + "learning_rate": 1.3277553735077568e-05, + "loss": 1.6193, + "step": 4279 + }, + { + "epoch": 0.8239285800226195, + "grad_norm": 3.469561820609016, + "learning_rate": 1.3274607903606011e-05, + "loss": 1.7081, + "step": 4280 + }, + { + "epoch": 0.8239285800226195, + "lm_loss": 1.8575, + "step": 4280, + "vm_loss": 0.1863 + }, + { + "epoch": 0.8239285800226195, + "lm_loss": 1.5696, + "step": 4280, + "vm_loss": 0.1104 + }, + { + "epoch": 0.8239285800226195, + "lm_loss": 1.828, + "step": 4280, + "vm_loss": 0.11 + }, + { + "epoch": 0.8239285800226195, + "lm_loss": 1.9287, + "step": 4280, + "vm_loss": 0.1332 + }, + { + "epoch": 0.8239285800226195, + "lm_loss": 1.2578, + "step": 4280, + "vm_loss": 0.216 + }, + { + "epoch": 0.8239285800226195, + "lm_loss": 1.8995, + "step": 4280, + "vm_loss": 0.1498 + }, + { + "epoch": 0.8239285800226195, + "lm_loss": 1.0758, + "step": 4280, + "vm_loss": 0.1936 + }, + { + "epoch": 0.8239285800226195, + "lm_loss": 1.5689, + "step": 4280, + "vm_loss": 0.1486 + }, + { + "epoch": 0.8241210867001949, + "grad_norm": 3.3610551697349105, + "learning_rate": 1.3271661753800671e-05, + "loss": 1.7621, + "step": 4281 + }, + { + "epoch": 0.8243135933777703, + "grad_norm": 3.206184853890844, + "learning_rate": 1.326871528594795e-05, + "loss": 1.7821, + "step": 4282 + }, + { + "epoch": 0.8245061000553456, + "grad_norm": 3.2065666190871056, + "learning_rate": 1.3265768500334286e-05, + "loss": 1.7724, + "step": 4283 + }, + { + "epoch": 0.8246986067329211, + "grad_norm": 3.0557994407050213, + "learning_rate": 1.3262821397246142e-05, + "loss": 1.7552, + "step": 4284 + }, + { + "epoch": 0.8248911134104965, + "grad_norm": 3.16455530963429, + "learning_rate": 1.3259873976970017e-05, + "loss": 1.766, + "step": 4285 + }, + { + "epoch": 0.8250836200880718, + "grad_norm": 3.11495912863915, + "learning_rate": 1.3256926239792432e-05, + "loss": 1.7142, + "step": 4286 + }, + { + "epoch": 0.8252761267656472, + "grad_norm": 3.1639425631823235, + "learning_rate": 1.325397818599995e-05, + "loss": 1.7724, + "step": 4287 + }, + { + "epoch": 0.8254686334432225, + "grad_norm": 3.2540000858537304, + "learning_rate": 1.3251029815879157e-05, + "loss": 1.7853, + "step": 4288 + }, + { + "epoch": 0.8254686334432225, + "lm_loss": 1.0671, + "step": 4288, + "vm_loss": 0.1237 + }, + { + "epoch": 0.8254686334432225, + "lm_loss": 1.7359, + "step": 4288, + "vm_loss": 0.19 + }, + { + "epoch": 0.8254686334432225, + "lm_loss": 1.3392, + "step": 4288, + "vm_loss": 0.194 + }, + { + "epoch": 0.8254686334432225, + "lm_loss": 1.7199, + "step": 4288, + "vm_loss": 0.1536 + }, + { + "epoch": 0.8254686334432225, + "lm_loss": 1.3968, + "step": 4288, + "vm_loss": 0.1485 + }, + { + "epoch": 0.8254686334432225, + "lm_loss": 1.7895, + "step": 4288, + "vm_loss": 0.1697 + }, + { + "epoch": 0.8254686334432225, + "lm_loss": 1.5899, + "step": 4288, + "vm_loss": 0.2258 + }, + { + "epoch": 0.8254686334432225, + "lm_loss": 1.4685, + "step": 4288, + "vm_loss": 0.1074 + }, + { + "epoch": 0.8256611401207979, + "grad_norm": 3.3643625865291633, + "learning_rate": 1.3248081129716674e-05, + "loss": 1.714, + "step": 4289 + }, + { + "epoch": 0.8258536467983734, + "grad_norm": 3.1606649244447365, + "learning_rate": 1.3245132127799147e-05, + "loss": 1.6943, + "step": 4290 + }, + { + "epoch": 0.8260461534759487, + "grad_norm": 3.130702444656028, + "learning_rate": 1.3242182810413262e-05, + "loss": 1.6464, + "step": 4291 + }, + { + "epoch": 0.8262386601535241, + "grad_norm": 3.1671571621051915, + "learning_rate": 1.3239233177845732e-05, + "loss": 1.5863, + "step": 4292 + }, + { + "epoch": 0.8264311668310994, + "grad_norm": 3.1401928995314443, + "learning_rate": 1.323628323038329e-05, + "loss": 1.7095, + "step": 4293 + }, + { + "epoch": 0.8266236735086748, + "grad_norm": 3.2568144055943256, + "learning_rate": 1.3233332968312715e-05, + "loss": 1.7271, + "step": 4294 + }, + { + "epoch": 0.8268161801862502, + "grad_norm": 3.3262138501821514, + "learning_rate": 1.3230382391920808e-05, + "loss": 1.7909, + "step": 4295 + }, + { + "epoch": 0.8270086868638256, + "grad_norm": 3.147954434809932, + "learning_rate": 1.3227431501494406e-05, + "loss": 1.7132, + "step": 4296 + }, + { + "epoch": 0.8270086868638256, + "lm_loss": 1.2794, + "step": 4296, + "vm_loss": 0.1256 + }, + { + "epoch": 0.8270086868638256, + "lm_loss": 1.3709, + "step": 4296, + "vm_loss": 0.1878 + }, + { + "epoch": 0.8270086868638256, + "lm_loss": 1.6473, + "step": 4296, + "vm_loss": 0.1468 + }, + { + "epoch": 0.8270086868638256, + "lm_loss": 1.554, + "step": 4296, + "vm_loss": 0.1313 + }, + { + "epoch": 0.8270086868638256, + "lm_loss": 1.2883, + "step": 4296, + "vm_loss": 0.1441 + }, + { + "epoch": 0.8270086868638256, + "lm_loss": 1.7367, + "step": 4296, + "vm_loss": 0.206 + }, + { + "epoch": 0.8270086868638256, + "lm_loss": 1.7747, + "step": 4296, + "vm_loss": 0.1579 + }, + { + "epoch": 0.8270086868638256, + "lm_loss": 1.2201, + "step": 4296, + "vm_loss": 0.1661 + }, + { + "epoch": 0.827201193541401, + "grad_norm": 3.098110333123799, + "learning_rate": 1.3224480297320373e-05, + "loss": 1.7194, + "step": 4297 + }, + { + "epoch": 0.8273937002189763, + "grad_norm": 2.954003103505225, + "learning_rate": 1.3221528779685597e-05, + "loss": 1.7169, + "step": 4298 + }, + { + "epoch": 0.8275862068965517, + "grad_norm": 3.1261572280040575, + "learning_rate": 1.3218576948877013e-05, + "loss": 1.7307, + "step": 4299 + }, + { + "epoch": 0.8277787135741271, + "grad_norm": 2.9410075458142826, + "learning_rate": 1.3215624805181568e-05, + "loss": 1.7621, + "step": 4300 + }, + { + "epoch": 0.8279712202517024, + "grad_norm": 3.15622291770706, + "learning_rate": 1.3212672348886254e-05, + "loss": 1.7768, + "step": 4301 + }, + { + "epoch": 0.8281637269292779, + "grad_norm": 3.045413340542613, + "learning_rate": 1.3209719580278085e-05, + "loss": 1.6844, + "step": 4302 + }, + { + "epoch": 0.8283562336068533, + "grad_norm": 3.0723424443103045, + "learning_rate": 1.3206766499644111e-05, + "loss": 1.7145, + "step": 4303 + }, + { + "epoch": 0.8285487402844286, + "grad_norm": 3.080632806858383, + "learning_rate": 1.3203813107271401e-05, + "loss": 1.7135, + "step": 4304 + }, + { + "epoch": 0.8285487402844286, + "lm_loss": 1.4784, + "step": 4304, + "vm_loss": 0.1726 + }, + { + "epoch": 0.8285487402844286, + "lm_loss": 1.44, + "step": 4304, + "vm_loss": 0.1969 + }, + { + "epoch": 0.8285487402844286, + "lm_loss": 1.9401, + "step": 4304, + "vm_loss": 0.2073 + }, + { + "epoch": 0.8285487402844286, + "lm_loss": 2.0472, + "step": 4304, + "vm_loss": 0.1248 + }, + { + "epoch": 0.8285487402844286, + "lm_loss": 1.7871, + "step": 4304, + "vm_loss": 0.2297 + }, + { + "epoch": 0.8285487402844286, + "lm_loss": 1.8191, + "step": 4304, + "vm_loss": 0.1675 + }, + { + "epoch": 0.8285487402844286, + "lm_loss": 1.7695, + "step": 4304, + "vm_loss": 0.159 + }, + { + "epoch": 0.8285487402844286, + "lm_loss": 1.858, + "step": 4304, + "vm_loss": 0.2146 + }, + { + "epoch": 0.828741246962004, + "grad_norm": 2.997829906303288, + "learning_rate": 1.3200859403447072e-05, + "loss": 1.7718, + "step": 4305 + }, + { + "epoch": 0.8289337536395793, + "grad_norm": 3.0669689279731003, + "learning_rate": 1.319790538845826e-05, + "loss": 1.817, + "step": 4306 + }, + { + "epoch": 0.8291262603171548, + "grad_norm": 3.024933959494137, + "learning_rate": 1.3194951062592127e-05, + "loss": 1.7663, + "step": 4307 + }, + { + "epoch": 0.8293187669947302, + "grad_norm": 2.9890220690854803, + "learning_rate": 1.3191996426135876e-05, + "loss": 1.7223, + "step": 4308 + }, + { + "epoch": 0.8295112736723055, + "grad_norm": 3.0407935122446905, + "learning_rate": 1.3189041479376739e-05, + "loss": 1.6666, + "step": 4309 + }, + { + "epoch": 0.8297037803498809, + "grad_norm": 3.1377110576316087, + "learning_rate": 1.3186086222601965e-05, + "loss": 1.7573, + "step": 4310 + }, + { + "epoch": 0.8298962870274562, + "grad_norm": 2.880648423818378, + "learning_rate": 1.3183130656098852e-05, + "loss": 1.7631, + "step": 4311 + }, + { + "epoch": 0.8300887937050316, + "grad_norm": 3.100730411465285, + "learning_rate": 1.3180174780154712e-05, + "loss": 1.6916, + "step": 4312 + }, + { + "epoch": 0.8300887937050316, + "lm_loss": 1.3353, + "step": 4312, + "vm_loss": 0.1649 + }, + { + "epoch": 0.8300887937050316, + "lm_loss": 1.6853, + "step": 4312, + "vm_loss": 0.1797 + }, + { + "epoch": 0.8300887937050316, + "lm_loss": 1.286, + "step": 4312, + "vm_loss": 0.1856 + }, + { + "epoch": 0.8300887937050316, + "lm_loss": 1.7319, + "step": 4312, + "vm_loss": 0.189 + }, + { + "epoch": 0.8300887937050316, + "lm_loss": 1.436, + "step": 4312, + "vm_loss": 0.167 + }, + { + "epoch": 0.8300887937050316, + "lm_loss": 1.5393, + "step": 4312, + "vm_loss": 0.2059 + }, + { + "epoch": 0.8300887937050316, + "lm_loss": 1.4342, + "step": 4312, + "vm_loss": 0.1305 + }, + { + "epoch": 0.8300887937050316, + "lm_loss": 1.9062, + "step": 4312, + "vm_loss": 0.1779 + }, + { + "epoch": 0.8302813003826071, + "grad_norm": 2.9849119995428435, + "learning_rate": 1.3177218595056899e-05, + "loss": 1.665, + "step": 4313 + }, + { + "epoch": 0.8304738070601824, + "grad_norm": 3.1273888132548513, + "learning_rate": 1.3174262101092792e-05, + "loss": 1.7917, + "step": 4314 + }, + { + "epoch": 0.8306663137377578, + "grad_norm": 2.916513814222911, + "learning_rate": 1.3171305298549797e-05, + "loss": 1.6309, + "step": 4315 + }, + { + "epoch": 0.8308588204153332, + "grad_norm": 3.0272047536354023, + "learning_rate": 1.3168348187715353e-05, + "loss": 1.7967, + "step": 4316 + }, + { + "epoch": 0.8310513270929085, + "grad_norm": 2.924545959133155, + "learning_rate": 1.316539076887693e-05, + "loss": 1.7042, + "step": 4317 + }, + { + "epoch": 0.8312438337704839, + "grad_norm": 3.0858666430461996, + "learning_rate": 1.3162433042322026e-05, + "loss": 1.6617, + "step": 4318 + }, + { + "epoch": 0.8314363404480593, + "grad_norm": 3.1526376925494084, + "learning_rate": 1.3159475008338174e-05, + "loss": 1.7445, + "step": 4319 + }, + { + "epoch": 0.8316288471256347, + "grad_norm": 3.058549450276619, + "learning_rate": 1.3156516667212929e-05, + "loss": 1.6957, + "step": 4320 + }, + { + "epoch": 0.8316288471256347, + "lm_loss": 1.0864, + "step": 4320, + "vm_loss": 0.1851 + }, + { + "epoch": 0.8316288471256347, + "lm_loss": 1.7334, + "step": 4320, + "vm_loss": 0.106 + }, + { + "epoch": 0.8316288471256347, + "lm_loss": 1.6165, + "step": 4320, + "vm_loss": 0.1904 + }, + { + "epoch": 0.8316288471256347, + "lm_loss": 1.5458, + "step": 4320, + "vm_loss": 0.1162 + }, + { + "epoch": 0.8316288471256347, + "lm_loss": 1.9312, + "step": 4320, + "vm_loss": 0.1681 + }, + { + "epoch": 0.8316288471256347, + "lm_loss": 1.7114, + "step": 4320, + "vm_loss": 0.1881 + }, + { + "epoch": 0.8316288471256347, + "lm_loss": 1.6733, + "step": 4320, + "vm_loss": 0.1587 + }, + { + "epoch": 0.8316288471256347, + "lm_loss": 1.4645, + "step": 4320, + "vm_loss": 0.14 + }, + { + "epoch": 0.8318213538032101, + "grad_norm": 2.9855443525388656, + "learning_rate": 1.315355801923388e-05, + "loss": 1.7789, + "step": 4321 + }, + { + "epoch": 0.8320138604807854, + "grad_norm": 3.1908073180382623, + "learning_rate": 1.3150599064688643e-05, + "loss": 1.7147, + "step": 4322 + }, + { + "epoch": 0.8322063671583608, + "grad_norm": 3.1022150465040834, + "learning_rate": 1.3147639803864871e-05, + "loss": 1.76, + "step": 4323 + }, + { + "epoch": 0.8323988738359361, + "grad_norm": 2.9471853463082156, + "learning_rate": 1.314468023705024e-05, + "loss": 1.6761, + "step": 4324 + }, + { + "epoch": 0.8325913805135116, + "grad_norm": 2.854576162752735, + "learning_rate": 1.3141720364532458e-05, + "loss": 1.6797, + "step": 4325 + }, + { + "epoch": 0.832783887191087, + "grad_norm": 2.8976015643510107, + "learning_rate": 1.3138760186599262e-05, + "loss": 1.7644, + "step": 4326 + }, + { + "epoch": 0.8329763938686623, + "grad_norm": 3.042699031256594, + "learning_rate": 1.3135799703538418e-05, + "loss": 1.7433, + "step": 4327 + }, + { + "epoch": 0.8331689005462377, + "grad_norm": 3.112085598050744, + "learning_rate": 1.3132838915637727e-05, + "loss": 1.7964, + "step": 4328 + }, + { + "epoch": 0.8331689005462377, + "lm_loss": 1.6855, + "step": 4328, + "vm_loss": 0.1466 + }, + { + "epoch": 0.8331689005462377, + "lm_loss": 1.7287, + "step": 4328, + "vm_loss": 0.189 + }, + { + "epoch": 0.8331689005462377, + "lm_loss": 1.6022, + "step": 4328, + "vm_loss": 0.1321 + }, + { + "epoch": 0.8331689005462377, + "lm_loss": 2.0756, + "step": 4328, + "vm_loss": 0.2051 + }, + { + "epoch": 0.8331689005462377, + "lm_loss": 1.2478, + "step": 4328, + "vm_loss": 0.183 + }, + { + "epoch": 0.8331689005462377, + "lm_loss": 1.4121, + "step": 4328, + "vm_loss": 0.2168 + }, + { + "epoch": 0.8331689005462377, + "lm_loss": 1.7495, + "step": 4328, + "vm_loss": 0.1379 + }, + { + "epoch": 0.8331689005462377, + "lm_loss": 1.1925, + "step": 4328, + "vm_loss": 0.1261 + }, + { + "epoch": 0.833361407223813, + "grad_norm": 3.081484070977461, + "learning_rate": 1.312987782318501e-05, + "loss": 1.7265, + "step": 4329 + }, + { + "epoch": 0.8335539139013884, + "grad_norm": 3.041756979727332, + "learning_rate": 1.3126916426468128e-05, + "loss": 1.747, + "step": 4330 + }, + { + "epoch": 0.8337464205789639, + "grad_norm": 2.9962928499382917, + "learning_rate": 1.3123954725774964e-05, + "loss": 1.6656, + "step": 4331 + }, + { + "epoch": 0.8339389272565392, + "grad_norm": 3.219265615435624, + "learning_rate": 1.3120992721393433e-05, + "loss": 1.6059, + "step": 4332 + }, + { + "epoch": 0.8341314339341146, + "grad_norm": 2.929623078330894, + "learning_rate": 1.3118030413611483e-05, + "loss": 1.6296, + "step": 4333 + }, + { + "epoch": 0.83432394061169, + "grad_norm": 2.9423185987076317, + "learning_rate": 1.311506780271708e-05, + "loss": 1.6675, + "step": 4334 + }, + { + "epoch": 0.8345164472892653, + "grad_norm": 3.079786539206392, + "learning_rate": 1.3112104888998238e-05, + "loss": 1.7253, + "step": 4335 + }, + { + "epoch": 0.8347089539668407, + "grad_norm": 3.2597172240869536, + "learning_rate": 1.3109141672742983e-05, + "loss": 1.7855, + "step": 4336 + }, + { + "epoch": 0.8347089539668407, + "lm_loss": 1.9719, + "step": 4336, + "vm_loss": 0.2307 + }, + { + "epoch": 0.8347089539668407, + "lm_loss": 1.8905, + "step": 4336, + "vm_loss": 0.2066 + }, + { + "epoch": 0.8347089539668407, + "lm_loss": 2.231, + "step": 4336, + "vm_loss": 0.1668 + }, + { + "epoch": 0.8347089539668407, + "lm_loss": 1.2111, + "step": 4336, + "vm_loss": 0.2329 + }, + { + "epoch": 0.8347089539668407, + "lm_loss": 1.8987, + "step": 4336, + "vm_loss": 0.1884 + }, + { + "epoch": 0.8347089539668407, + "lm_loss": 1.567, + "step": 4336, + "vm_loss": 0.1325 + }, + { + "epoch": 0.8347089539668407, + "lm_loss": 1.356, + "step": 4336, + "vm_loss": 0.1401 + }, + { + "epoch": 0.8347089539668407, + "lm_loss": 1.0883, + "step": 4336, + "vm_loss": 0.1959 + }, + { + "epoch": 0.8349014606444161, + "grad_norm": 3.0634122335302343, + "learning_rate": 1.310617815423938e-05, + "loss": 1.679, + "step": 4337 + }, + { + "epoch": 0.8350939673219915, + "grad_norm": 3.163017311305193, + "learning_rate": 1.3103214333775522e-05, + "loss": 1.6907, + "step": 4338 + }, + { + "epoch": 0.8352864739995669, + "grad_norm": 3.1631439299909396, + "learning_rate": 1.3100250211639527e-05, + "loss": 1.685, + "step": 4339 + }, + { + "epoch": 0.8354789806771422, + "grad_norm": 2.970612701036097, + "learning_rate": 1.309728578811955e-05, + "loss": 1.5722, + "step": 4340 + }, + { + "epoch": 0.8356714873547176, + "grad_norm": 3.166437765091976, + "learning_rate": 1.3094321063503768e-05, + "loss": 1.7154, + "step": 4341 + }, + { + "epoch": 0.835863994032293, + "grad_norm": 3.0212925378787374, + "learning_rate": 1.309135603808039e-05, + "loss": 1.6008, + "step": 4342 + }, + { + "epoch": 0.8360565007098684, + "grad_norm": 3.231279639494313, + "learning_rate": 1.3088390712137656e-05, + "loss": 1.6666, + "step": 4343 + }, + { + "epoch": 0.8362490073874438, + "grad_norm": 2.9996254254066503, + "learning_rate": 1.308542508596383e-05, + "loss": 1.6851, + "step": 4344 + }, + { + "epoch": 0.8362490073874438, + "lm_loss": 1.5815, + "step": 4344, + "vm_loss": 0.1795 + }, + { + "epoch": 0.8362490073874438, + "lm_loss": 1.2942, + "step": 4344, + "vm_loss": 0.1643 + }, + { + "epoch": 0.8362490073874438, + "lm_loss": 1.1068, + "step": 4344, + "vm_loss": 0.1301 + }, + { + "epoch": 0.8362490073874438, + "lm_loss": 1.8251, + "step": 4344, + "vm_loss": 0.1502 + }, + { + "epoch": 0.8362490073874438, + "lm_loss": 1.4435, + "step": 4344, + "vm_loss": 0.1083 + }, + { + "epoch": 0.8362490073874438, + "lm_loss": 2.0858, + "step": 4344, + "vm_loss": 0.1325 + }, + { + "epoch": 0.8362490073874438, + "lm_loss": 1.4153, + "step": 4344, + "vm_loss": 0.1708 + }, + { + "epoch": 0.8362490073874438, + "lm_loss": 1.0147, + "step": 4344, + "vm_loss": 0.1933 + }, + { + "epoch": 0.8364415140650191, + "grad_norm": 2.9387825127261356, + "learning_rate": 1.3082459159847215e-05, + "loss": 1.6416, + "step": 4345 + }, + { + "epoch": 0.8366340207425945, + "grad_norm": 3.252453269638057, + "learning_rate": 1.307949293407613e-05, + "loss": 1.7184, + "step": 4346 + }, + { + "epoch": 0.8368265274201699, + "grad_norm": 3.0950856828521642, + "learning_rate": 1.3076526408938937e-05, + "loss": 1.7215, + "step": 4347 + }, + { + "epoch": 0.8370190340977453, + "grad_norm": 3.0720351857356163, + "learning_rate": 1.3073559584724014e-05, + "loss": 1.7436, + "step": 4348 + }, + { + "epoch": 0.8372115407753207, + "grad_norm": 2.9544535763763604, + "learning_rate": 1.3070592461719778e-05, + "loss": 1.7351, + "step": 4349 + }, + { + "epoch": 0.837404047452896, + "grad_norm": 2.9724622137453083, + "learning_rate": 1.3067625040214673e-05, + "loss": 1.6959, + "step": 4350 + }, + { + "epoch": 0.8375965541304714, + "grad_norm": 3.1319990036676733, + "learning_rate": 1.3064657320497165e-05, + "loss": 1.706, + "step": 4351 + }, + { + "epoch": 0.8377890608080468, + "grad_norm": 2.880729076499695, + "learning_rate": 1.3061689302855758e-05, + "loss": 1.7127, + "step": 4352 + }, + { + "epoch": 0.8377890608080468, + "lm_loss": 1.0865, + "step": 4352, + "vm_loss": 0.1867 + }, + { + "epoch": 0.8377890608080468, + "lm_loss": 2.0064, + "step": 4352, + "vm_loss": 0.1915 + }, + { + "epoch": 0.8377890608080468, + "lm_loss": 1.1778, + "step": 4352, + "vm_loss": 0.1275 + }, + { + "epoch": 0.8377890608080468, + "lm_loss": 1.3265, + "step": 4352, + "vm_loss": 0.1746 + }, + { + "epoch": 0.8377890608080468, + "lm_loss": 1.6751, + "step": 4352, + "vm_loss": 0.2303 + }, + { + "epoch": 0.8377890608080468, + "lm_loss": 1.9797, + "step": 4352, + "vm_loss": 0.1499 + }, + { + "epoch": 0.8377890608080468, + "lm_loss": 1.3836, + "step": 4352, + "vm_loss": 0.1184 + }, + { + "epoch": 0.8377890608080468, + "lm_loss": 1.9283, + "step": 4352, + "vm_loss": 0.1758 + }, + { + "epoch": 0.8379815674856221, + "grad_norm": 3.0943276550785894, + "learning_rate": 1.3058720987578978e-05, + "loss": 1.72, + "step": 4353 + }, + { + "epoch": 0.8381740741631976, + "grad_norm": 3.0955715215633486, + "learning_rate": 1.3055752374955389e-05, + "loss": 1.6804, + "step": 4354 + }, + { + "epoch": 0.8383665808407729, + "grad_norm": 3.236203418403068, + "learning_rate": 1.3052783465273573e-05, + "loss": 1.7305, + "step": 4355 + }, + { + "epoch": 0.8385590875183483, + "grad_norm": 3.0711202566535984, + "learning_rate": 1.3049814258822147e-05, + "loss": 1.6848, + "step": 4356 + }, + { + "epoch": 0.8387515941959237, + "grad_norm": 3.219756738286842, + "learning_rate": 1.3046844755889758e-05, + "loss": 1.7334, + "step": 4357 + }, + { + "epoch": 0.838944100873499, + "grad_norm": 3.1958736261796847, + "learning_rate": 1.3043874956765077e-05, + "loss": 1.7674, + "step": 4358 + }, + { + "epoch": 0.8391366075510744, + "grad_norm": 2.9699152078715993, + "learning_rate": 1.3040904861736809e-05, + "loss": 1.7144, + "step": 4359 + }, + { + "epoch": 0.8393291142286498, + "grad_norm": 3.2269581974200032, + "learning_rate": 1.3037934471093683e-05, + "loss": 1.712, + "step": 4360 + }, + { + "epoch": 0.8393291142286498, + "lm_loss": 1.3242, + "step": 4360, + "vm_loss": 0.1982 + }, + { + "epoch": 0.8393291142286498, + "lm_loss": 1.4208, + "step": 4360, + "vm_loss": 0.1746 + }, + { + "epoch": 0.8393291142286498, + "lm_loss": 1.3043, + "step": 4360, + "vm_loss": 0.1132 + }, + { + "epoch": 0.8393291142286498, + "lm_loss": 1.5691, + "step": 4360, + "vm_loss": 0.1389 + }, + { + "epoch": 0.8393291142286498, + "lm_loss": 1.0469, + "step": 4360, + "vm_loss": 0.1324 + }, + { + "epoch": 0.8393291142286498, + "lm_loss": 1.6433, + "step": 4360, + "vm_loss": 0.2061 + }, + { + "epoch": 0.8393291142286498, + "lm_loss": 1.6181, + "step": 4360, + "vm_loss": 0.1789 + }, + { + "epoch": 0.8393291142286498, + "lm_loss": 1.5244, + "step": 4360, + "vm_loss": 0.1433 + }, + { + "epoch": 0.8395216209062252, + "grad_norm": 3.0851228569188915, + "learning_rate": 1.303496378512446e-05, + "loss": 1.6432, + "step": 4361 + }, + { + "epoch": 0.8397141275838006, + "grad_norm": 3.0392999551247155, + "learning_rate": 1.303199280411793e-05, + "loss": 1.6857, + "step": 4362 + }, + { + "epoch": 0.8399066342613759, + "grad_norm": 3.1267006693387573, + "learning_rate": 1.3029021528362906e-05, + "loss": 1.6084, + "step": 4363 + }, + { + "epoch": 0.8400991409389513, + "grad_norm": 3.4465845069229832, + "learning_rate": 1.3026049958148236e-05, + "loss": 1.7086, + "step": 4364 + }, + { + "epoch": 0.8402916476165267, + "grad_norm": 3.036159481809941, + "learning_rate": 1.3023078093762802e-05, + "loss": 1.7075, + "step": 4365 + }, + { + "epoch": 0.8404841542941021, + "grad_norm": 2.9602071606780065, + "learning_rate": 1.3020105935495496e-05, + "loss": 1.6513, + "step": 4366 + }, + { + "epoch": 0.8406766609716775, + "grad_norm": 3.168469049745241, + "learning_rate": 1.3017133483635253e-05, + "loss": 1.7288, + "step": 4367 + }, + { + "epoch": 0.8408691676492528, + "grad_norm": 3.117941451459316, + "learning_rate": 1.3014160738471037e-05, + "loss": 1.6945, + "step": 4368 + }, + { + "epoch": 0.8408691676492528, + "lm_loss": 1.5681, + "step": 4368, + "vm_loss": 0.1339 + }, + { + "epoch": 0.8408691676492528, + "lm_loss": 1.3728, + "step": 4368, + "vm_loss": 0.1548 + }, + { + "epoch": 0.8408691676492528, + "lm_loss": 1.4545, + "step": 4368, + "vm_loss": 0.1526 + }, + { + "epoch": 0.8408691676492528, + "lm_loss": 1.5959, + "step": 4368, + "vm_loss": 0.2072 + }, + { + "epoch": 0.8408691676492528, + "lm_loss": 1.4277, + "step": 4368, + "vm_loss": 0.1323 + }, + { + "epoch": 0.8408691676492528, + "lm_loss": 1.0381, + "step": 4368, + "vm_loss": 0.093 + }, + { + "epoch": 0.8408691676492528, + "lm_loss": 1.368, + "step": 4368, + "vm_loss": 0.1767 + }, + { + "epoch": 0.8408691676492528, + "lm_loss": 1.4613, + "step": 4368, + "vm_loss": 0.165 + }, + { + "epoch": 0.8410616743268282, + "grad_norm": 3.1687155861974152, + "learning_rate": 1.301118770029184e-05, + "loss": 1.7255, + "step": 4369 + }, + { + "epoch": 0.8412541810044036, + "grad_norm": 3.020417425428901, + "learning_rate": 1.3008214369386667e-05, + "loss": 1.7248, + "step": 4370 + }, + { + "epoch": 0.8414466876819789, + "grad_norm": 3.15466560116373, + "learning_rate": 1.3005240746044572e-05, + "loss": 1.7554, + "step": 4371 + }, + { + "epoch": 0.8416391943595544, + "grad_norm": 3.133746879438481, + "learning_rate": 1.300226683055463e-05, + "loss": 1.6209, + "step": 4372 + }, + { + "epoch": 0.8418317010371297, + "grad_norm": 2.9935772664744937, + "learning_rate": 1.2999292623205942e-05, + "loss": 1.7297, + "step": 4373 + }, + { + "epoch": 0.8420242077147051, + "grad_norm": 3.114097538564018, + "learning_rate": 1.2996318124287638e-05, + "loss": 1.7117, + "step": 4374 + }, + { + "epoch": 0.8422167143922805, + "grad_norm": 2.9472148398573204, + "learning_rate": 1.2993343334088879e-05, + "loss": 1.7152, + "step": 4375 + }, + { + "epoch": 0.8424092210698558, + "grad_norm": 3.1484104763609326, + "learning_rate": 1.299036825289885e-05, + "loss": 1.7502, + "step": 4376 + }, + { + "epoch": 0.8424092210698558, + "lm_loss": 1.9482, + "step": 4376, + "vm_loss": 0.1899 + }, + { + "epoch": 0.8424092210698558, + "lm_loss": 1.4684, + "step": 4376, + "vm_loss": 0.1537 + }, + { + "epoch": 0.8424092210698558, + "lm_loss": 1.6806, + "step": 4376, + "vm_loss": 0.1436 + }, + { + "epoch": 0.8424092210698558, + "lm_loss": 1.6284, + "step": 4376, + "vm_loss": 0.1632 + }, + { + "epoch": 0.8424092210698558, + "lm_loss": 1.7249, + "step": 4376, + "vm_loss": 0.1531 + }, + { + "epoch": 0.8424092210698558, + "lm_loss": 2.0307, + "step": 4376, + "vm_loss": 0.1574 + }, + { + "epoch": 0.8424092210698558, + "lm_loss": 1.7772, + "step": 4376, + "vm_loss": 0.1526 + }, + { + "epoch": 0.8424092210698558, + "lm_loss": 1.747, + "step": 4376, + "vm_loss": 0.1928 + }, + { + "epoch": 0.8426017277474313, + "grad_norm": 3.057821633445439, + "learning_rate": 1.2987392881006773e-05, + "loss": 1.6721, + "step": 4377 + }, + { + "epoch": 0.8427942344250067, + "grad_norm": 2.9228310185035684, + "learning_rate": 1.2984417218701883e-05, + "loss": 1.6497, + "step": 4378 + }, + { + "epoch": 0.842986741102582, + "grad_norm": 2.9792571412327282, + "learning_rate": 1.2981441266273462e-05, + "loss": 1.6332, + "step": 4379 + }, + { + "epoch": 0.8431792477801574, + "grad_norm": 3.1047203335827795, + "learning_rate": 1.2978465024010803e-05, + "loss": 1.6713, + "step": 4380 + }, + { + "epoch": 0.8433717544577327, + "grad_norm": 3.046873451531447, + "learning_rate": 1.2975488492203241e-05, + "loss": 1.6209, + "step": 4381 + }, + { + "epoch": 0.8435642611353081, + "grad_norm": 3.0713850487777656, + "learning_rate": 1.2972511671140127e-05, + "loss": 1.6902, + "step": 4382 + }, + { + "epoch": 0.8437567678128836, + "grad_norm": 3.5186504370725973, + "learning_rate": 1.2969534561110848e-05, + "loss": 1.7662, + "step": 4383 + }, + { + "epoch": 0.8439492744904589, + "grad_norm": 2.9124896326540086, + "learning_rate": 1.2966557162404823e-05, + "loss": 1.6847, + "step": 4384 + }, + { + "epoch": 0.8439492744904589, + "lm_loss": 1.5537, + "step": 4384, + "vm_loss": 0.1451 + }, + { + "epoch": 0.8439492744904589, + "lm_loss": 1.4561, + "step": 4384, + "vm_loss": 0.1327 + }, + { + "epoch": 0.8439492744904589, + "lm_loss": 1.2491, + "step": 4384, + "vm_loss": 0.1454 + }, + { + "epoch": 0.8439492744904589, + "lm_loss": 2.3442, + "step": 4384, + "vm_loss": 0.1679 + }, + { + "epoch": 0.8439492744904589, + "lm_loss": 1.5548, + "step": 4384, + "vm_loss": 0.1655 + }, + { + "epoch": 0.8439492744904589, + "lm_loss": 1.2072, + "step": 4384, + "vm_loss": 0.088 + }, + { + "epoch": 0.8439492744904589, + "lm_loss": 1.6889, + "step": 4384, + "vm_loss": 0.14 + }, + { + "epoch": 0.8439492744904589, + "lm_loss": 1.057, + "step": 4384, + "vm_loss": 0.2097 + }, + { + "epoch": 0.8441417811680343, + "grad_norm": 3.112590158695623, + "learning_rate": 1.2963579475311485e-05, + "loss": 1.6484, + "step": 4385 + }, + { + "epoch": 0.8443342878456096, + "grad_norm": 3.062370038617411, + "learning_rate": 1.2960601500120307e-05, + "loss": 1.7194, + "step": 4386 + }, + { + "epoch": 0.844526794523185, + "grad_norm": 3.1758525210826174, + "learning_rate": 1.2957623237120784e-05, + "loss": 1.7476, + "step": 4387 + }, + { + "epoch": 0.8447193012007604, + "grad_norm": 3.1485560540709527, + "learning_rate": 1.2954644686602449e-05, + "loss": 1.7309, + "step": 4388 + }, + { + "epoch": 0.8449118078783358, + "grad_norm": 3.310210577452421, + "learning_rate": 1.2951665848854845e-05, + "loss": 1.7006, + "step": 4389 + }, + { + "epoch": 0.8451043145559112, + "grad_norm": 3.105159334962832, + "learning_rate": 1.2948686724167559e-05, + "loss": 1.6738, + "step": 4390 + }, + { + "epoch": 0.8452968212334865, + "grad_norm": 3.003040600761736, + "learning_rate": 1.2945707312830197e-05, + "loss": 1.6846, + "step": 4391 + }, + { + "epoch": 0.8454893279110619, + "grad_norm": 2.914091679808134, + "learning_rate": 1.29427276151324e-05, + "loss": 1.7869, + "step": 4392 + }, + { + "epoch": 0.8454893279110619, + "lm_loss": 1.3304, + "step": 4392, + "vm_loss": 0.2085 + }, + { + "epoch": 0.8454893279110619, + "lm_loss": 1.5971, + "step": 4392, + "vm_loss": 0.1559 + }, + { + "epoch": 0.8454893279110619, + "lm_loss": 1.6343, + "step": 4392, + "vm_loss": 0.1975 + }, + { + "epoch": 0.8454893279110619, + "lm_loss": 1.1903, + "step": 4392, + "vm_loss": 0.108 + }, + { + "epoch": 0.8454893279110619, + "lm_loss": 1.493, + "step": 4392, + "vm_loss": 0.1283 + }, + { + "epoch": 0.8454893279110619, + "lm_loss": 1.6251, + "step": 4392, + "vm_loss": 0.1155 + }, + { + "epoch": 0.8454893279110619, + "lm_loss": 1.8112, + "step": 4392, + "vm_loss": 0.1251 + }, + { + "epoch": 0.8454893279110619, + "lm_loss": 1.5236, + "step": 4392, + "vm_loss": 0.0975 + }, + { + "epoch": 0.8456818345886373, + "grad_norm": 3.1125788233275964, + "learning_rate": 1.293974763136383e-05, + "loss": 1.6903, + "step": 4393 + }, + { + "epoch": 0.8458743412662126, + "grad_norm": 3.179429058715911, + "learning_rate": 1.293676736181418e-05, + "loss": 1.7269, + "step": 4394 + }, + { + "epoch": 0.8460668479437881, + "grad_norm": 2.959998430945532, + "learning_rate": 1.293378680677317e-05, + "loss": 1.7335, + "step": 4395 + }, + { + "epoch": 0.8462593546213635, + "grad_norm": 3.1487827553199015, + "learning_rate": 1.2930805966530548e-05, + "loss": 1.7915, + "step": 4396 + }, + { + "epoch": 0.8464518612989388, + "grad_norm": 3.0411303880509957, + "learning_rate": 1.292782484137609e-05, + "loss": 1.7142, + "step": 4397 + }, + { + "epoch": 0.8466443679765142, + "grad_norm": 3.080035179409326, + "learning_rate": 1.29248434315996e-05, + "loss": 1.6986, + "step": 4398 + }, + { + "epoch": 0.8468368746540895, + "grad_norm": 3.0322106595129226, + "learning_rate": 1.2921861737490908e-05, + "loss": 1.7125, + "step": 4399 + }, + { + "epoch": 0.8470293813316649, + "grad_norm": 3.002032605090812, + "learning_rate": 1.2918879759339874e-05, + "loss": 1.6556, + "step": 4400 + }, + { + "epoch": 0.8470293813316649, + "lm_loss": 1.6954, + "step": 4400, + "vm_loss": 0.1623 + }, + { + "epoch": 0.8470293813316649, + "lm_loss": 1.1661, + "step": 4400, + "vm_loss": 0.1753 + }, + { + "epoch": 0.8470293813316649, + "lm_loss": 1.6242, + "step": 4400, + "vm_loss": 0.2181 + }, + { + "epoch": 0.8470293813316649, + "lm_loss": 1.3823, + "step": 4400, + "vm_loss": 0.1507 + }, + { + "epoch": 0.8470293813316649, + "lm_loss": 1.5291, + "step": 4400, + "vm_loss": 0.1657 + }, + { + "epoch": 0.8470293813316649, + "lm_loss": 1.3078, + "step": 4400, + "vm_loss": 0.1792 + }, + { + "epoch": 0.8470293813316649, + "lm_loss": 1.3767, + "step": 4400, + "vm_loss": 0.1678 + }, + { + "epoch": 0.8470293813316649, + "lm_loss": 1.4285, + "step": 4400, + "vm_loss": 0.1643 + }, + { + "epoch": 0.8472218880092404, + "grad_norm": 3.028198960753014, + "learning_rate": 1.2915897497436385e-05, + "loss": 1.7108, + "step": 4401 + }, + { + "epoch": 0.8474143946868157, + "grad_norm": 3.102521513788809, + "learning_rate": 1.2912914952070352e-05, + "loss": 1.7153, + "step": 4402 + }, + { + "epoch": 0.8476069013643911, + "grad_norm": 3.038183005438477, + "learning_rate": 1.2909932123531721e-05, + "loss": 1.726, + "step": 4403 + }, + { + "epoch": 0.8477994080419664, + "grad_norm": 2.997786049380668, + "learning_rate": 1.2906949012110456e-05, + "loss": 1.6443, + "step": 4404 + }, + { + "epoch": 0.8479919147195418, + "grad_norm": 3.065878758309054, + "learning_rate": 1.2903965618096557e-05, + "loss": 1.6612, + "step": 4405 + }, + { + "epoch": 0.8481844213971172, + "grad_norm": 3.108680060707114, + "learning_rate": 1.2900981941780049e-05, + "loss": 1.694, + "step": 4406 + }, + { + "epoch": 0.8483769280746926, + "grad_norm": 3.0741516778288434, + "learning_rate": 1.289799798345098e-05, + "loss": 1.7056, + "step": 4407 + }, + { + "epoch": 0.848569434752268, + "grad_norm": 2.8878425363227334, + "learning_rate": 1.2895013743399432e-05, + "loss": 1.6273, + "step": 4408 + }, + { + "epoch": 0.848569434752268, + "lm_loss": 1.2965, + "step": 4408, + "vm_loss": 0.1274 + }, + { + "epoch": 0.848569434752268, + "lm_loss": 1.7567, + "step": 4408, + "vm_loss": 0.1827 + }, + { + "epoch": 0.848569434752268, + "lm_loss": 1.3998, + "step": 4408, + "vm_loss": 0.1729 + }, + { + "epoch": 0.848569434752268, + "lm_loss": 1.0901, + "step": 4408, + "vm_loss": 0.181 + }, + { + "epoch": 0.848569434752268, + "lm_loss": 1.4649, + "step": 4408, + "vm_loss": 0.1826 + }, + { + "epoch": 0.848569434752268, + "lm_loss": 1.6219, + "step": 4408, + "vm_loss": 0.1278 + }, + { + "epoch": 0.848569434752268, + "lm_loss": 1.1227, + "step": 4408, + "vm_loss": 0.1599 + }, + { + "epoch": 0.848569434752268, + "lm_loss": 1.0629, + "step": 4408, + "vm_loss": 0.2062 + }, + { + "epoch": 0.8487619414298434, + "grad_norm": 3.0432887729568208, + "learning_rate": 1.2892029221915508e-05, + "loss": 1.6767, + "step": 4409 + }, + { + "epoch": 0.8489544481074187, + "grad_norm": 2.9597448142345173, + "learning_rate": 1.2889044419289347e-05, + "loss": 1.6346, + "step": 4410 + }, + { + "epoch": 0.8491469547849941, + "grad_norm": 2.9543102717920604, + "learning_rate": 1.2886059335811106e-05, + "loss": 1.6848, + "step": 4411 + }, + { + "epoch": 0.8493394614625694, + "grad_norm": 2.9453317598785005, + "learning_rate": 1.2883073971770971e-05, + "loss": 1.6299, + "step": 4412 + }, + { + "epoch": 0.8495319681401449, + "grad_norm": 3.1508828939284115, + "learning_rate": 1.2880088327459164e-05, + "loss": 1.6633, + "step": 4413 + }, + { + "epoch": 0.8497244748177203, + "grad_norm": 2.9977843801287003, + "learning_rate": 1.2877102403165922e-05, + "loss": 1.7089, + "step": 4414 + }, + { + "epoch": 0.8499169814952956, + "grad_norm": 3.1787254448781677, + "learning_rate": 1.2874116199181522e-05, + "loss": 1.6849, + "step": 4415 + }, + { + "epoch": 0.850109488172871, + "grad_norm": 3.026261071217002, + "learning_rate": 1.2871129715796253e-05, + "loss": 1.6622, + "step": 4416 + }, + { + "epoch": 0.850109488172871, + "lm_loss": 1.2626, + "step": 4416, + "vm_loss": 0.1377 + }, + { + "epoch": 0.850109488172871, + "lm_loss": 1.1737, + "step": 4416, + "vm_loss": 0.1724 + }, + { + "epoch": 0.850109488172871, + "lm_loss": 1.3827, + "step": 4416, + "vm_loss": 0.2169 + }, + { + "epoch": 0.850109488172871, + "lm_loss": 1.6548, + "step": 4416, + "vm_loss": 0.1552 + }, + { + "epoch": 0.850109488172871, + "lm_loss": 1.4932, + "step": 4416, + "vm_loss": 0.1114 + }, + { + "epoch": 0.850109488172871, + "lm_loss": 1.5004, + "step": 4416, + "vm_loss": 0.1749 + }, + { + "epoch": 0.850109488172871, + "lm_loss": 1.6964, + "step": 4416, + "vm_loss": 0.1671 + }, + { + "epoch": 0.850109488172871, + "lm_loss": 1.5854, + "step": 4416, + "vm_loss": 0.21 + }, + { + "epoch": 0.8503019948504463, + "grad_norm": 2.967167164075282, + "learning_rate": 1.2868142953300442e-05, + "loss": 1.6123, + "step": 4417 + }, + { + "epoch": 0.8504945015280218, + "grad_norm": 3.055304588664331, + "learning_rate": 1.2865155911984445e-05, + "loss": 1.6461, + "step": 4418 + }, + { + "epoch": 0.8506870082055972, + "grad_norm": 2.992944752205052, + "learning_rate": 1.2862168592138635e-05, + "loss": 1.7252, + "step": 4419 + }, + { + "epoch": 0.8508795148831725, + "grad_norm": 3.0834994495829675, + "learning_rate": 1.2859180994053424e-05, + "loss": 1.6663, + "step": 4420 + }, + { + "epoch": 0.8510720215607479, + "grad_norm": 2.961857124765966, + "learning_rate": 1.2856193118019235e-05, + "loss": 1.6574, + "step": 4421 + }, + { + "epoch": 0.8512645282383232, + "grad_norm": 3.0558728721543202, + "learning_rate": 1.2853204964326536e-05, + "loss": 1.6458, + "step": 4422 + }, + { + "epoch": 0.8514570349158986, + "grad_norm": 3.03977260308226, + "learning_rate": 1.2850216533265814e-05, + "loss": 1.6869, + "step": 4423 + }, + { + "epoch": 0.8516495415934741, + "grad_norm": 2.9482612800103607, + "learning_rate": 1.2847227825127578e-05, + "loss": 1.6941, + "step": 4424 + }, + { + "epoch": 0.8516495415934741, + "lm_loss": 1.7082, + "step": 4424, + "vm_loss": 0.1739 + }, + { + "epoch": 0.8516495415934741, + "lm_loss": 1.6234, + "step": 4424, + "vm_loss": 0.1619 + }, + { + "epoch": 0.8516495415934741, + "lm_loss": 1.8249, + "step": 4424, + "vm_loss": 0.1954 + }, + { + "epoch": 0.8516495415934741, + "lm_loss": 1.7877, + "step": 4424, + "vm_loss": 0.1249 + }, + { + "epoch": 0.8516495415934741, + "lm_loss": 1.4533, + "step": 4424, + "vm_loss": 0.1315 + }, + { + "epoch": 0.8516495415934741, + "lm_loss": 1.3378, + "step": 4424, + "vm_loss": 0.1233 + }, + { + "epoch": 0.8516495415934741, + "lm_loss": 1.687, + "step": 4424, + "vm_loss": 0.166 + }, + { + "epoch": 0.8516495415934741, + "lm_loss": 1.2291, + "step": 4424, + "vm_loss": 0.1605 + }, + { + "epoch": 0.8518420482710494, + "grad_norm": 3.050494491986149, + "learning_rate": 1.284423884020237e-05, + "loss": 1.7177, + "step": 4425 + }, + { + "epoch": 0.8520345549486248, + "grad_norm": 2.9041037013114965, + "learning_rate": 1.2841249578780756e-05, + "loss": 1.6738, + "step": 4426 + }, + { + "epoch": 0.8522270616262002, + "grad_norm": 3.0444074352669577, + "learning_rate": 1.2838260041153339e-05, + "loss": 1.6578, + "step": 4427 + }, + { + "epoch": 0.8524195683037755, + "grad_norm": 2.9682919905362106, + "learning_rate": 1.283527022761073e-05, + "loss": 1.7867, + "step": 4428 + }, + { + "epoch": 0.8526120749813509, + "grad_norm": 3.0700905398642218, + "learning_rate": 1.283228013844358e-05, + "loss": 1.6722, + "step": 4429 + }, + { + "epoch": 0.8528045816589263, + "grad_norm": 3.055669392950635, + "learning_rate": 1.2829289773942567e-05, + "loss": 1.6533, + "step": 4430 + }, + { + "epoch": 0.8529970883365017, + "grad_norm": 3.119651451324948, + "learning_rate": 1.2826299134398387e-05, + "loss": 1.6535, + "step": 4431 + }, + { + "epoch": 0.8531895950140771, + "grad_norm": 3.0988419364495665, + "learning_rate": 1.2823308220101777e-05, + "loss": 1.807, + "step": 4432 + }, + { + "epoch": 0.8531895950140771, + "lm_loss": 1.3772, + "step": 4432, + "vm_loss": 0.2209 + }, + { + "epoch": 0.8531895950140771, + "lm_loss": 1.2338, + "step": 4432, + "vm_loss": 0.1473 + }, + { + "epoch": 0.8531895950140771, + "lm_loss": 1.5472, + "step": 4432, + "vm_loss": 0.2098 + }, + { + "epoch": 0.8531895950140771, + "lm_loss": 1.5292, + "step": 4432, + "vm_loss": 0.1546 + }, + { + "epoch": 0.8531895950140771, + "lm_loss": 1.5988, + "step": 4432, + "vm_loss": 0.1384 + }, + { + "epoch": 0.8531895950140771, + "lm_loss": 1.8606, + "step": 4432, + "vm_loss": 0.1237 + }, + { + "epoch": 0.8531895950140771, + "lm_loss": 1.2638, + "step": 4432, + "vm_loss": 0.2367 + }, + { + "epoch": 0.8531895950140771, + "lm_loss": 0.9159, + "step": 4432, + "vm_loss": 0.1578 + }, + { + "epoch": 0.8533821016916524, + "grad_norm": 3.0419804980930594, + "learning_rate": 1.2820317031343485e-05, + "loss": 1.6427, + "step": 4433 + }, + { + "epoch": 0.8535746083692278, + "grad_norm": 2.8795738351575393, + "learning_rate": 1.2817325568414299e-05, + "loss": 1.6269, + "step": 4434 + }, + { + "epoch": 0.8537671150468031, + "grad_norm": 2.992128438235043, + "learning_rate": 1.2814333831605019e-05, + "loss": 1.7017, + "step": 4435 + }, + { + "epoch": 0.8539596217243786, + "grad_norm": 3.000331001767118, + "learning_rate": 1.2811341821206487e-05, + "loss": 1.6946, + "step": 4436 + }, + { + "epoch": 0.854152128401954, + "grad_norm": 3.074197489173797, + "learning_rate": 1.2808349537509564e-05, + "loss": 1.6514, + "step": 4437 + }, + { + "epoch": 0.8543446350795293, + "grad_norm": 3.158141085384541, + "learning_rate": 1.2805356980805132e-05, + "loss": 1.661, + "step": 4438 + }, + { + "epoch": 0.8545371417571047, + "grad_norm": 3.161881857328982, + "learning_rate": 1.2802364151384116e-05, + "loss": 1.6844, + "step": 4439 + }, + { + "epoch": 0.85472964843468, + "grad_norm": 3.1013498278455054, + "learning_rate": 1.2799371049537448e-05, + "loss": 1.6821, + "step": 4440 + }, + { + "epoch": 0.85472964843468, + "lm_loss": 1.4488, + "step": 4440, + "vm_loss": 0.0951 + }, + { + "epoch": 0.85472964843468, + "lm_loss": 1.118, + "step": 4440, + "vm_loss": 0.1862 + }, + { + "epoch": 0.85472964843468, + "lm_loss": 1.933, + "step": 4440, + "vm_loss": 0.1554 + }, + { + "epoch": 0.85472964843468, + "lm_loss": 0.9792, + "step": 4440, + "vm_loss": 0.1073 + }, + { + "epoch": 0.85472964843468, + "lm_loss": 1.7091, + "step": 4440, + "vm_loss": 0.1564 + }, + { + "epoch": 0.85472964843468, + "lm_loss": 1.3559, + "step": 4440, + "vm_loss": 0.1438 + }, + { + "epoch": 0.85472964843468, + "lm_loss": 1.5367, + "step": 4440, + "vm_loss": 0.1477 + }, + { + "epoch": 0.85472964843468, + "lm_loss": 0.9572, + "step": 4440, + "vm_loss": 0.1956 + }, + { + "epoch": 0.8549221551122554, + "grad_norm": 2.9380934733636392, + "learning_rate": 1.27963776755561e-05, + "loss": 1.6732, + "step": 4441 + }, + { + "epoch": 0.8551146617898309, + "grad_norm": 2.9949646721292864, + "learning_rate": 1.2793384029731066e-05, + "loss": 1.7065, + "step": 4442 + }, + { + "epoch": 0.8553071684674062, + "grad_norm": 3.0680098851192463, + "learning_rate": 1.2790390112353369e-05, + "loss": 1.712, + "step": 4443 + }, + { + "epoch": 0.8554996751449816, + "grad_norm": 3.091282238202021, + "learning_rate": 1.2787395923714049e-05, + "loss": 1.7368, + "step": 4444 + }, + { + "epoch": 0.855692181822557, + "grad_norm": 2.96441454518578, + "learning_rate": 1.2784401464104189e-05, + "loss": 1.6559, + "step": 4445 + }, + { + "epoch": 0.8558846885001323, + "grad_norm": 3.334169924401074, + "learning_rate": 1.2781406733814879e-05, + "loss": 1.7048, + "step": 4446 + }, + { + "epoch": 0.8560771951777078, + "grad_norm": 3.17574344447504, + "learning_rate": 1.2778411733137253e-05, + "loss": 1.698, + "step": 4447 + }, + { + "epoch": 0.8562697018552831, + "grad_norm": 3.0851685450318844, + "learning_rate": 1.2775416462362458e-05, + "loss": 1.7089, + "step": 4448 + }, + { + "epoch": 0.8562697018552831, + "lm_loss": 1.2059, + "step": 4448, + "vm_loss": 0.1723 + }, + { + "epoch": 0.8562697018552831, + "lm_loss": 1.2959, + "step": 4448, + "vm_loss": 0.2538 + }, + { + "epoch": 0.8562697018552831, + "lm_loss": 1.36, + "step": 4448, + "vm_loss": 0.1596 + }, + { + "epoch": 0.8562697018552831, + "lm_loss": 1.4915, + "step": 4448, + "vm_loss": 0.1285 + }, + { + "epoch": 0.8562697018552831, + "lm_loss": 1.697, + "step": 4448, + "vm_loss": 0.1522 + }, + { + "epoch": 0.8562697018552831, + "lm_loss": 1.7197, + "step": 4448, + "vm_loss": 0.1889 + }, + { + "epoch": 0.8562697018552831, + "lm_loss": 1.8211, + "step": 4448, + "vm_loss": 0.14 + }, + { + "epoch": 0.8562697018552831, + "lm_loss": 1.2567, + "step": 4448, + "vm_loss": 0.2277 + }, + { + "epoch": 0.8564622085328585, + "grad_norm": 2.986701654283633, + "learning_rate": 1.2772420921781677e-05, + "loss": 1.6713, + "step": 4449 + }, + { + "epoch": 0.8566547152104339, + "grad_norm": 2.9389940716368033, + "learning_rate": 1.276942511168611e-05, + "loss": 1.6625, + "step": 4450 + }, + { + "epoch": 0.8568472218880092, + "grad_norm": 3.0107046795792947, + "learning_rate": 1.2766429032366994e-05, + "loss": 1.6501, + "step": 4451 + }, + { + "epoch": 0.8570397285655846, + "grad_norm": 3.064247953908551, + "learning_rate": 1.2763432684115582e-05, + "loss": 1.6622, + "step": 4452 + }, + { + "epoch": 0.85723223524316, + "grad_norm": 3.106108831727385, + "learning_rate": 1.2760436067223156e-05, + "loss": 1.69, + "step": 4453 + }, + { + "epoch": 0.8574247419207354, + "grad_norm": 2.9829775742102895, + "learning_rate": 1.2757439181981033e-05, + "loss": 1.6909, + "step": 4454 + }, + { + "epoch": 0.8576172485983108, + "grad_norm": 3.0164088771005284, + "learning_rate": 1.275444202868054e-05, + "loss": 1.7258, + "step": 4455 + }, + { + "epoch": 0.8578097552758861, + "grad_norm": 3.015194334952046, + "learning_rate": 1.2751444607613044e-05, + "loss": 1.6578, + "step": 4456 + }, + { + "epoch": 0.8578097552758861, + "lm_loss": 1.5486, + "step": 4456, + "vm_loss": 0.1496 + }, + { + "epoch": 0.8578097552758861, + "lm_loss": 1.3585, + "step": 4456, + "vm_loss": 0.1278 + }, + { + "epoch": 0.8578097552758861, + "lm_loss": 1.7845, + "step": 4456, + "vm_loss": 0.1593 + }, + { + "epoch": 0.8578097552758861, + "lm_loss": 1.6806, + "step": 4456, + "vm_loss": 0.1825 + }, + { + "epoch": 0.8578097552758861, + "lm_loss": 1.7309, + "step": 4456, + "vm_loss": 0.1468 + }, + { + "epoch": 0.8578097552758861, + "lm_loss": 1.5521, + "step": 4456, + "vm_loss": 0.1612 + }, + { + "epoch": 0.8578097552758861, + "lm_loss": 1.2181, + "step": 4456, + "vm_loss": 0.2328 + }, + { + "epoch": 0.8578097552758861, + "lm_loss": 1.3713, + "step": 4456, + "vm_loss": 0.1498 + }, + { + "epoch": 0.8580022619534615, + "grad_norm": 2.9504431217016114, + "learning_rate": 1.274844691906993e-05, + "loss": 1.7202, + "step": 4457 + }, + { + "epoch": 0.8581947686310369, + "grad_norm": 2.9642851500204817, + "learning_rate": 1.2745448963342612e-05, + "loss": 1.5875, + "step": 4458 + }, + { + "epoch": 0.8583872753086123, + "grad_norm": 3.2369529928116125, + "learning_rate": 1.2742450740722532e-05, + "loss": 1.6929, + "step": 4459 + }, + { + "epoch": 0.8585797819861877, + "grad_norm": 3.1381033212467173, + "learning_rate": 1.2739452251501152e-05, + "loss": 1.6784, + "step": 4460 + }, + { + "epoch": 0.858772288663763, + "grad_norm": 3.0593129256530194, + "learning_rate": 1.2736453495969967e-05, + "loss": 1.632, + "step": 4461 + }, + { + "epoch": 0.8589647953413384, + "grad_norm": 3.089659793123969, + "learning_rate": 1.2733454474420493e-05, + "loss": 1.6274, + "step": 4462 + }, + { + "epoch": 0.8591573020189138, + "grad_norm": 3.066609905530649, + "learning_rate": 1.2730455187144269e-05, + "loss": 1.6699, + "step": 4463 + }, + { + "epoch": 0.8593498086964891, + "grad_norm": 3.0534938405455674, + "learning_rate": 1.2727455634432873e-05, + "loss": 1.6807, + "step": 4464 + }, + { + "epoch": 0.8593498086964891, + "lm_loss": 1.2368, + "step": 4464, + "vm_loss": 0.154 + }, + { + "epoch": 0.8593498086964891, + "lm_loss": 1.3027, + "step": 4464, + "vm_loss": 0.1382 + }, + { + "epoch": 0.8593498086964891, + "lm_loss": 1.1642, + "step": 4464, + "vm_loss": 0.1903 + }, + { + "epoch": 0.8593498086964891, + "lm_loss": 1.4768, + "step": 4464, + "vm_loss": 0.2516 + }, + { + "epoch": 0.8593498086964891, + "lm_loss": 0.9715, + "step": 4464, + "vm_loss": 0.206 + }, + { + "epoch": 0.8593498086964891, + "lm_loss": 1.4413, + "step": 4464, + "vm_loss": 0.1566 + }, + { + "epoch": 0.8593498086964891, + "lm_loss": 1.6963, + "step": 4464, + "vm_loss": 0.209 + }, + { + "epoch": 0.8593498086964891, + "lm_loss": 1.7286, + "step": 4464, + "vm_loss": 0.1349 + }, + { + "epoch": 0.8595423153740646, + "grad_norm": 3.0154793954165835, + "learning_rate": 1.2724455816577891e-05, + "loss": 1.6409, + "step": 4465 + }, + { + "epoch": 0.8597348220516399, + "grad_norm": 2.9078586481620374, + "learning_rate": 1.2721455733870952e-05, + "loss": 1.7146, + "step": 4466 + }, + { + "epoch": 0.8599273287292153, + "grad_norm": 3.078662308753128, + "learning_rate": 1.2718455386603695e-05, + "loss": 1.7405, + "step": 4467 + }, + { + "epoch": 0.8601198354067907, + "grad_norm": 3.034324836966756, + "learning_rate": 1.2715454775067798e-05, + "loss": 1.7353, + "step": 4468 + }, + { + "epoch": 0.860312342084366, + "grad_norm": 2.9498075382587996, + "learning_rate": 1.271245389955495e-05, + "loss": 1.7237, + "step": 4469 + }, + { + "epoch": 0.8605048487619414, + "grad_norm": 3.0301071703289217, + "learning_rate": 1.2709452760356884e-05, + "loss": 1.6602, + "step": 4470 + }, + { + "epoch": 0.8606973554395168, + "grad_norm": 3.0409535418330638, + "learning_rate": 1.2706451357765347e-05, + "loss": 1.6519, + "step": 4471 + }, + { + "epoch": 0.8608898621170922, + "grad_norm": 3.040719003145782, + "learning_rate": 1.2703449692072112e-05, + "loss": 1.7106, + "step": 4472 + }, + { + "epoch": 0.8608898621170922, + "lm_loss": 1.6528, + "step": 4472, + "vm_loss": 0.1864 + }, + { + "epoch": 0.8608898621170922, + "lm_loss": 1.3673, + "step": 4472, + "vm_loss": 0.1453 + }, + { + "epoch": 0.8608898621170922, + "lm_loss": 1.6238, + "step": 4472, + "vm_loss": 0.1382 + }, + { + "epoch": 0.8608898621170922, + "lm_loss": 1.6757, + "step": 4472, + "vm_loss": 0.1697 + }, + { + "epoch": 0.8608898621170922, + "lm_loss": 1.7708, + "step": 4472, + "vm_loss": 0.1596 + }, + { + "epoch": 0.8608898621170922, + "lm_loss": 1.2154, + "step": 4472, + "vm_loss": 0.1961 + }, + { + "epoch": 0.8608898621170922, + "lm_loss": 1.5934, + "step": 4472, + "vm_loss": 0.2166 + }, + { + "epoch": 0.8608898621170922, + "lm_loss": 1.2965, + "step": 4472, + "vm_loss": 0.1499 + }, + { + "epoch": 0.8610823687946676, + "grad_norm": 2.9392567593614833, + "learning_rate": 1.2700447763568977e-05, + "loss": 1.6685, + "step": 4473 + }, + { + "epoch": 0.8612748754722429, + "grad_norm": 3.0407914173970685, + "learning_rate": 1.2697445572547773e-05, + "loss": 1.72, + "step": 4474 + }, + { + "epoch": 0.8614673821498183, + "grad_norm": 3.1670146643943125, + "learning_rate": 1.2694443119300348e-05, + "loss": 1.6622, + "step": 4475 + }, + { + "epoch": 0.8616598888273937, + "grad_norm": 2.9350261507142448, + "learning_rate": 1.2691440404118582e-05, + "loss": 1.6374, + "step": 4476 + }, + { + "epoch": 0.8618523955049691, + "grad_norm": 3.016237493766547, + "learning_rate": 1.2688437427294371e-05, + "loss": 1.6382, + "step": 4477 + }, + { + "epoch": 0.8620449021825445, + "grad_norm": 3.0005654231300007, + "learning_rate": 1.2685434189119652e-05, + "loss": 1.6675, + "step": 4478 + }, + { + "epoch": 0.8622374088601198, + "grad_norm": 3.1857688717745827, + "learning_rate": 1.268243068988637e-05, + "loss": 1.6929, + "step": 4479 + }, + { + "epoch": 0.8624299155376952, + "grad_norm": 3.1171575576813715, + "learning_rate": 1.2679426929886507e-05, + "loss": 1.682, + "step": 4480 + }, + { + "epoch": 0.8624299155376952, + "lm_loss": 1.5832, + "step": 4480, + "vm_loss": 0.1688 + }, + { + "epoch": 0.8624299155376952, + "lm_loss": 1.8804, + "step": 4480, + "vm_loss": 0.1661 + }, + { + "epoch": 0.8624299155376952, + "lm_loss": 1.5709, + "step": 4480, + "vm_loss": 0.1491 + }, + { + "epoch": 0.8624299155376952, + "lm_loss": 2.0507, + "step": 4480, + "vm_loss": 0.2352 + }, + { + "epoch": 0.8624299155376952, + "lm_loss": 2.0701, + "step": 4480, + "vm_loss": 0.1268 + }, + { + "epoch": 0.8624299155376952, + "lm_loss": 1.1112, + "step": 4480, + "vm_loss": 0.1484 + }, + { + "epoch": 0.8624299155376952, + "lm_loss": 1.2841, + "step": 4480, + "vm_loss": 0.1022 + }, + { + "epoch": 0.8624299155376952, + "lm_loss": 1.8705, + "step": 4480, + "vm_loss": 0.1524 + }, + { + "epoch": 0.8626224222152706, + "grad_norm": 3.0897309797569377, + "learning_rate": 1.2676422909412067e-05, + "loss": 1.6388, + "step": 4481 + }, + { + "epoch": 0.862814928892846, + "grad_norm": 3.3078437971865875, + "learning_rate": 1.2673418628755078e-05, + "loss": 1.806, + "step": 4482 + }, + { + "epoch": 0.8630074355704214, + "grad_norm": 3.094966719813121, + "learning_rate": 1.2670414088207597e-05, + "loss": 1.7651, + "step": 4483 + }, + { + "epoch": 0.8631999422479967, + "grad_norm": 2.9244838672595446, + "learning_rate": 1.26674092880617e-05, + "loss": 1.6751, + "step": 4484 + }, + { + "epoch": 0.8633924489255721, + "grad_norm": 3.03126661454168, + "learning_rate": 1.2664404228609498e-05, + "loss": 1.6308, + "step": 4485 + }, + { + "epoch": 0.8635849556031475, + "grad_norm": 3.058533701565695, + "learning_rate": 1.2661398910143112e-05, + "loss": 1.6744, + "step": 4486 + }, + { + "epoch": 0.8637774622807228, + "grad_norm": 3.086312651540744, + "learning_rate": 1.2658393332954704e-05, + "loss": 1.6787, + "step": 4487 + }, + { + "epoch": 0.8639699689582983, + "grad_norm": 3.06225690621572, + "learning_rate": 1.2655387497336456e-05, + "loss": 1.6254, + "step": 4488 + }, + { + "epoch": 0.8639699689582983, + "lm_loss": 1.1454, + "step": 4488, + "vm_loss": 0.1885 + }, + { + "epoch": 0.8639699689582983, + "lm_loss": 1.9375, + "step": 4488, + "vm_loss": 0.1655 + }, + { + "epoch": 0.8639699689582983, + "lm_loss": 1.9362, + "step": 4488, + "vm_loss": 0.1746 + }, + { + "epoch": 0.8639699689582983, + "lm_loss": 1.9991, + "step": 4488, + "vm_loss": 0.1774 + }, + { + "epoch": 0.8639699689582983, + "lm_loss": 1.9301, + "step": 4488, + "vm_loss": 0.1436 + }, + { + "epoch": 0.8639699689582983, + "lm_loss": 1.5812, + "step": 4488, + "vm_loss": 0.2221 + }, + { + "epoch": 0.8639699689582983, + "lm_loss": 1.355, + "step": 4488, + "vm_loss": 0.1419 + }, + { + "epoch": 0.8639699689582983, + "lm_loss": 1.3635, + "step": 4488, + "vm_loss": 0.1446 + }, + { + "epoch": 0.8641624756358737, + "grad_norm": 3.2148140541238646, + "learning_rate": 1.265238140358057e-05, + "loss": 1.7138, + "step": 4489 + }, + { + "epoch": 0.864354982313449, + "grad_norm": 3.017382667130634, + "learning_rate": 1.2649375051979276e-05, + "loss": 1.6194, + "step": 4490 + }, + { + "epoch": 0.8645474889910244, + "grad_norm": 3.0050034680899467, + "learning_rate": 1.2646368442824832e-05, + "loss": 1.707, + "step": 4491 + }, + { + "epoch": 0.8647399956685997, + "grad_norm": 3.020961298784208, + "learning_rate": 1.2643361576409517e-05, + "loss": 1.6307, + "step": 4492 + }, + { + "epoch": 0.8649325023461751, + "grad_norm": 2.8863948839624225, + "learning_rate": 1.264035445302564e-05, + "loss": 1.6951, + "step": 4493 + }, + { + "epoch": 0.8651250090237506, + "grad_norm": 2.9371411689720572, + "learning_rate": 1.263734707296553e-05, + "loss": 1.6998, + "step": 4494 + }, + { + "epoch": 0.8653175157013259, + "grad_norm": 3.0191043075879636, + "learning_rate": 1.2634339436521543e-05, + "loss": 1.6333, + "step": 4495 + }, + { + "epoch": 0.8655100223789013, + "grad_norm": 3.0311774510797136, + "learning_rate": 1.263133154398606e-05, + "loss": 1.7016, + "step": 4496 + }, + { + "epoch": 0.8655100223789013, + "lm_loss": 1.3431, + "step": 4496, + "vm_loss": 0.1782 + }, + { + "epoch": 0.8655100223789013, + "lm_loss": 0.9305, + "step": 4496, + "vm_loss": 0.1038 + }, + { + "epoch": 0.8655100223789013, + "lm_loss": 1.4762, + "step": 4496, + "vm_loss": 0.1683 + }, + { + "epoch": 0.8655100223789013, + "lm_loss": 0.8536, + "step": 4496, + "vm_loss": 0.1155 + }, + { + "epoch": 0.8655100223789013, + "lm_loss": 1.2865, + "step": 4496, + "vm_loss": 0.1799 + }, + { + "epoch": 0.8655100223789013, + "lm_loss": 1.4997, + "step": 4496, + "vm_loss": 0.1679 + }, + { + "epoch": 0.8655100223789013, + "lm_loss": 1.7362, + "step": 4496, + "vm_loss": 0.155 + }, + { + "epoch": 0.8655100223789013, + "lm_loss": 0.8027, + "step": 4496, + "vm_loss": 0.1983 + }, + { + "epoch": 0.8657025290564766, + "grad_norm": 3.032642485101621, + "learning_rate": 1.2628323395651485e-05, + "loss": 1.6102, + "step": 4497 + }, + { + "epoch": 0.865895035734052, + "grad_norm": 3.1145686090229643, + "learning_rate": 1.2625314991810254e-05, + "loss": 1.6627, + "step": 4498 + }, + { + "epoch": 0.8660875424116274, + "grad_norm": 3.1790364780462492, + "learning_rate": 1.2622306332754812e-05, + "loss": 1.7303, + "step": 4499 + }, + { + "epoch": 0.8662800490892028, + "grad_norm": 3.0131872076240596, + "learning_rate": 1.2619297418777651e-05, + "loss": 1.7053, + "step": 4500 + }, + { + "epoch": 0.8664725557667782, + "grad_norm": 2.9860416847197535, + "learning_rate": 1.2616288250171268e-05, + "loss": 1.7084, + "step": 4501 + }, + { + "epoch": 0.8666650624443535, + "grad_norm": 3.018242603209765, + "learning_rate": 1.2613278827228197e-05, + "loss": 1.6736, + "step": 4502 + }, + { + "epoch": 0.8668575691219289, + "grad_norm": 3.0926621651436963, + "learning_rate": 1.2610269150240985e-05, + "loss": 1.6979, + "step": 4503 + }, + { + "epoch": 0.8670500757995043, + "grad_norm": 3.0550182199135905, + "learning_rate": 1.2607259219502222e-05, + "loss": 1.6118, + "step": 4504 + }, + { + "epoch": 0.8670500757995043, + "lm_loss": 1.1916, + "step": 4504, + "vm_loss": 0.1236 + }, + { + "epoch": 0.8670500757995043, + "lm_loss": 1.8031, + "step": 4504, + "vm_loss": 0.1892 + }, + { + "epoch": 0.8670500757995043, + "lm_loss": 1.8845, + "step": 4504, + "vm_loss": 0.1829 + }, + { + "epoch": 0.8670500757995043, + "lm_loss": 1.3913, + "step": 4504, + "vm_loss": 0.1229 + }, + { + "epoch": 0.8670500757995043, + "lm_loss": 1.7355, + "step": 4504, + "vm_loss": 0.1839 + }, + { + "epoch": 0.8670500757995043, + "lm_loss": 0.8855, + "step": 4504, + "vm_loss": 0.1201 + }, + { + "epoch": 0.8670500757995043, + "lm_loss": 1.6043, + "step": 4504, + "vm_loss": 0.2125 + }, + { + "epoch": 0.8670500757995043, + "lm_loss": 1.8649, + "step": 4504, + "vm_loss": 0.1755 + }, + { + "epoch": 0.8672425824770796, + "grad_norm": 3.040848271698335, + "learning_rate": 1.2604249035304505e-05, + "loss": 1.683, + "step": 4505 + }, + { + "epoch": 0.8674350891546551, + "grad_norm": 3.2964680800870596, + "learning_rate": 1.2601238597940462e-05, + "loss": 1.6255, + "step": 4506 + }, + { + "epoch": 0.8676275958322305, + "grad_norm": 3.2444759612907106, + "learning_rate": 1.2598227907702749e-05, + "loss": 1.6381, + "step": 4507 + }, + { + "epoch": 0.8678201025098058, + "grad_norm": 3.0614446983141534, + "learning_rate": 1.2595216964884043e-05, + "loss": 1.7674, + "step": 4508 + }, + { + "epoch": 0.8680126091873812, + "grad_norm": 3.05410010044593, + "learning_rate": 1.2592205769777044e-05, + "loss": 1.7364, + "step": 4509 + }, + { + "epoch": 0.8682051158649565, + "grad_norm": 3.0570537511777602, + "learning_rate": 1.258919432267448e-05, + "loss": 1.7035, + "step": 4510 + }, + { + "epoch": 0.8683976225425319, + "grad_norm": 3.166324911497316, + "learning_rate": 1.2586182623869102e-05, + "loss": 1.6489, + "step": 4511 + }, + { + "epoch": 0.8685901292201074, + "grad_norm": 3.061439217901443, + "learning_rate": 1.2583170673653688e-05, + "loss": 1.6768, + "step": 4512 + }, + { + "epoch": 0.8685901292201074, + "lm_loss": 1.7003, + "step": 4512, + "vm_loss": 0.1742 + }, + { + "epoch": 0.8685901292201074, + "lm_loss": 1.0974, + "step": 4512, + "vm_loss": 0.1254 + }, + { + "epoch": 0.8685901292201074, + "lm_loss": 1.581, + "step": 4512, + "vm_loss": 0.149 + }, + { + "epoch": 0.8685901292201074, + "lm_loss": 2.0384, + "step": 4512, + "vm_loss": 0.1505 + }, + { + "epoch": 0.8685901292201074, + "lm_loss": 1.8247, + "step": 4512, + "vm_loss": 0.2137 + }, + { + "epoch": 0.8685901292201074, + "lm_loss": 1.5041, + "step": 4512, + "vm_loss": 0.161 + }, + { + "epoch": 0.8685901292201074, + "lm_loss": 1.5306, + "step": 4512, + "vm_loss": 0.1439 + }, + { + "epoch": 0.8685901292201074, + "lm_loss": 1.4329, + "step": 4512, + "vm_loss": 0.2135 + }, + { + "epoch": 0.8687826358976827, + "grad_norm": 3.138110276354647, + "learning_rate": 1.2580158472321035e-05, + "loss": 1.6968, + "step": 4513 + }, + { + "epoch": 0.8689751425752581, + "grad_norm": 3.244684812037457, + "learning_rate": 1.2577146020163968e-05, + "loss": 1.6605, + "step": 4514 + }, + { + "epoch": 0.8691676492528334, + "grad_norm": 3.2220205622214824, + "learning_rate": 1.2574133317475337e-05, + "loss": 1.6659, + "step": 4515 + }, + { + "epoch": 0.8693601559304088, + "grad_norm": 3.126389018681553, + "learning_rate": 1.2571120364548015e-05, + "loss": 1.6461, + "step": 4516 + }, + { + "epoch": 0.8695526626079843, + "grad_norm": 3.0977651156155077, + "learning_rate": 1.25681071616749e-05, + "loss": 1.6534, + "step": 4517 + }, + { + "epoch": 0.8697451692855596, + "grad_norm": 2.9236833692927497, + "learning_rate": 1.2565093709148913e-05, + "loss": 1.6396, + "step": 4518 + }, + { + "epoch": 0.869937675963135, + "grad_norm": 2.97654897421359, + "learning_rate": 1.2562080007263e-05, + "loss": 1.6218, + "step": 4519 + }, + { + "epoch": 0.8701301826407104, + "grad_norm": 2.981091271799177, + "learning_rate": 1.2559066056310132e-05, + "loss": 1.6319, + "step": 4520 + }, + { + "epoch": 0.8701301826407104, + "lm_loss": 0.9042, + "step": 4520, + "vm_loss": 0.1029 + }, + { + "epoch": 0.8701301826407104, + "lm_loss": 1.1927, + "step": 4520, + "vm_loss": 0.1574 + }, + { + "epoch": 0.8701301826407104, + "lm_loss": 1.0412, + "step": 4520, + "vm_loss": 0.1885 + }, + { + "epoch": 0.8701301826407104, + "lm_loss": 1.3148, + "step": 4520, + "vm_loss": 0.1344 + }, + { + "epoch": 0.8701301826407104, + "lm_loss": 1.4818, + "step": 4520, + "vm_loss": 0.2433 + }, + { + "epoch": 0.8701301826407104, + "lm_loss": 1.555, + "step": 4520, + "vm_loss": 0.1705 + }, + { + "epoch": 0.8701301826407104, + "lm_loss": 1.3902, + "step": 4520, + "vm_loss": 0.1408 + }, + { + "epoch": 0.8701301826407104, + "lm_loss": 1.3759, + "step": 4520, + "vm_loss": 0.1975 + }, + { + "epoch": 0.8703226893182857, + "grad_norm": 3.114404094929091, + "learning_rate": 1.2556051856583302e-05, + "loss": 1.6005, + "step": 4521 + }, + { + "epoch": 0.8705151959958611, + "grad_norm": 3.0898387301273296, + "learning_rate": 1.2553037408375532e-05, + "loss": 1.704, + "step": 4522 + }, + { + "epoch": 0.8707077026734364, + "grad_norm": 3.416529763203889, + "learning_rate": 1.2550022711979861e-05, + "loss": 1.7238, + "step": 4523 + }, + { + "epoch": 0.8709002093510119, + "grad_norm": 3.1046861715040777, + "learning_rate": 1.2547007767689361e-05, + "loss": 1.7134, + "step": 4524 + }, + { + "epoch": 0.8710927160285873, + "grad_norm": 3.0865199969765102, + "learning_rate": 1.2543992575797118e-05, + "loss": 1.6264, + "step": 4525 + }, + { + "epoch": 0.8712852227061626, + "grad_norm": 3.041407015619365, + "learning_rate": 1.254097713659625e-05, + "loss": 1.6276, + "step": 4526 + }, + { + "epoch": 0.871477729383738, + "grad_norm": 2.97918325936035, + "learning_rate": 1.2537961450379896e-05, + "loss": 1.6819, + "step": 4527 + }, + { + "epoch": 0.8716702360613133, + "grad_norm": 3.1665083925795163, + "learning_rate": 1.2534945517441217e-05, + "loss": 1.705, + "step": 4528 + }, + { + "epoch": 0.8716702360613133, + "lm_loss": 1.262, + "step": 4528, + "vm_loss": 0.1188 + }, + { + "epoch": 0.8716702360613133, + "lm_loss": 1.6075, + "step": 4528, + "vm_loss": 0.1799 + }, + { + "epoch": 0.8716702360613133, + "lm_loss": 1.837, + "step": 4528, + "vm_loss": 0.1221 + }, + { + "epoch": 0.8716702360613133, + "lm_loss": 1.4592, + "step": 4528, + "vm_loss": 0.2896 + }, + { + "epoch": 0.8716702360613133, + "lm_loss": 1.9992, + "step": 4528, + "vm_loss": 0.1594 + }, + { + "epoch": 0.8716702360613133, + "lm_loss": 1.3408, + "step": 4528, + "vm_loss": 0.1928 + }, + { + "epoch": 0.8716702360613133, + "lm_loss": 0.9686, + "step": 4528, + "vm_loss": 0.0975 + }, + { + "epoch": 0.8716702360613133, + "lm_loss": 0.9556, + "step": 4528, + "vm_loss": 0.1654 + }, + { + "epoch": 0.8718627427388888, + "grad_norm": 3.2669378526116875, + "learning_rate": 1.2531929338073404e-05, + "loss": 1.6542, + "step": 4529 + }, + { + "epoch": 0.8720552494164642, + "grad_norm": 3.057730215153544, + "learning_rate": 1.2528912912569666e-05, + "loss": 1.6103, + "step": 4530 + }, + { + "epoch": 0.8722477560940395, + "grad_norm": 3.04827502993877, + "learning_rate": 1.252589624122324e-05, + "loss": 1.5632, + "step": 4531 + }, + { + "epoch": 0.8724402627716149, + "grad_norm": 3.1293689322359013, + "learning_rate": 1.2522879324327382e-05, + "loss": 1.6759, + "step": 4532 + }, + { + "epoch": 0.8726327694491902, + "grad_norm": 3.0097413213092943, + "learning_rate": 1.2519862162175377e-05, + "loss": 1.6904, + "step": 4533 + }, + { + "epoch": 0.8728252761267656, + "grad_norm": 2.9590036337945134, + "learning_rate": 1.2516844755060533e-05, + "loss": 1.6119, + "step": 4534 + }, + { + "epoch": 0.8730177828043411, + "grad_norm": 3.1314755033065547, + "learning_rate": 1.2513827103276177e-05, + "loss": 1.7016, + "step": 4535 + }, + { + "epoch": 0.8732102894819164, + "grad_norm": 2.99572618480157, + "learning_rate": 1.2510809207115666e-05, + "loss": 1.6597, + "step": 4536 + }, + { + "epoch": 0.8732102894819164, + "lm_loss": 1.6401, + "step": 4536, + "vm_loss": 0.1153 + }, + { + "epoch": 0.8732102894819164, + "lm_loss": 0.9012, + "step": 4536, + "vm_loss": 0.1809 + }, + { + "epoch": 0.8732102894819164, + "lm_loss": 0.8623, + "step": 4536, + "vm_loss": 0.1239 + }, + { + "epoch": 0.8732102894819164, + "lm_loss": 2.1662, + "step": 4536, + "vm_loss": 0.1925 + }, + { + "epoch": 0.8732102894819164, + "lm_loss": 1.6903, + "step": 4536, + "vm_loss": 0.1063 + }, + { + "epoch": 0.8732102894819164, + "lm_loss": 1.0089, + "step": 4536, + "vm_loss": 0.1996 + }, + { + "epoch": 0.8732102894819164, + "lm_loss": 1.3925, + "step": 4536, + "vm_loss": 0.1303 + }, + { + "epoch": 0.8732102894819164, + "lm_loss": 1.4715, + "step": 4536, + "vm_loss": 0.1459 + }, + { + "epoch": 0.8734027961594918, + "grad_norm": 2.8922712583563372, + "learning_rate": 1.2507791066872377e-05, + "loss": 1.6018, + "step": 4537 + }, + { + "epoch": 0.8735953028370672, + "grad_norm": 2.995805259893655, + "learning_rate": 1.2504772682839713e-05, + "loss": 1.6936, + "step": 4538 + }, + { + "epoch": 0.8737878095146425, + "grad_norm": 2.909318212187558, + "learning_rate": 1.2501754055311098e-05, + "loss": 1.6816, + "step": 4539 + }, + { + "epoch": 0.8739803161922179, + "grad_norm": 2.9946738866273748, + "learning_rate": 1.2498735184579982e-05, + "loss": 1.6868, + "step": 4540 + }, + { + "epoch": 0.8741728228697933, + "grad_norm": 2.99397166880558, + "learning_rate": 1.2495716070939841e-05, + "loss": 1.6723, + "step": 4541 + }, + { + "epoch": 0.8743653295473687, + "grad_norm": 3.102729386804844, + "learning_rate": 1.2492696714684165e-05, + "loss": 1.6424, + "step": 4542 + }, + { + "epoch": 0.8745578362249441, + "grad_norm": 3.000925337211229, + "learning_rate": 1.2489677116106478e-05, + "loss": 1.6273, + "step": 4543 + }, + { + "epoch": 0.8747503429025194, + "grad_norm": 3.1244958420367603, + "learning_rate": 1.2486657275500322e-05, + "loss": 1.6249, + "step": 4544 + }, + { + "epoch": 0.8747503429025194, + "lm_loss": 2.1406, + "step": 4544, + "vm_loss": 0.1921 + }, + { + "epoch": 0.8747503429025194, + "lm_loss": 1.6883, + "step": 4544, + "vm_loss": 0.1354 + }, + { + "epoch": 0.8747503429025194, + "lm_loss": 1.7223, + "step": 4544, + "vm_loss": 0.0714 + }, + { + "epoch": 0.8747503429025194, + "lm_loss": 1.6763, + "step": 4544, + "vm_loss": 0.1589 + }, + { + "epoch": 0.8747503429025194, + "lm_loss": 1.6168, + "step": 4544, + "vm_loss": 0.2027 + }, + { + "epoch": 0.8747503429025194, + "lm_loss": 1.1973, + "step": 4544, + "vm_loss": 0.1971 + }, + { + "epoch": 0.8747503429025194, + "lm_loss": 1.3835, + "step": 4544, + "vm_loss": 0.175 + }, + { + "epoch": 0.8747503429025194, + "lm_loss": 1.2531, + "step": 4544, + "vm_loss": 0.1503 + }, + { + "epoch": 0.8749428495800948, + "grad_norm": 3.0721203185954367, + "learning_rate": 1.2483637193159268e-05, + "loss": 1.6778, + "step": 4545 + }, + { + "epoch": 0.8751353562576701, + "grad_norm": 3.0051601609676575, + "learning_rate": 1.2480616869376903e-05, + "loss": 1.6284, + "step": 4546 + }, + { + "epoch": 0.8753278629352456, + "grad_norm": 3.0086585252084923, + "learning_rate": 1.247759630444684e-05, + "loss": 1.6178, + "step": 4547 + }, + { + "epoch": 0.875520369612821, + "grad_norm": 2.9493966968582845, + "learning_rate": 1.2474575498662722e-05, + "loss": 1.6195, + "step": 4548 + }, + { + "epoch": 0.8757128762903963, + "grad_norm": 3.073232391234651, + "learning_rate": 1.2471554452318201e-05, + "loss": 1.669, + "step": 4549 + }, + { + "epoch": 0.8759053829679717, + "grad_norm": 3.1985464544251907, + "learning_rate": 1.2468533165706971e-05, + "loss": 1.6932, + "step": 4550 + }, + { + "epoch": 0.8760978896455471, + "grad_norm": 3.0270338607373413, + "learning_rate": 1.2465511639122737e-05, + "loss": 1.6856, + "step": 4551 + }, + { + "epoch": 0.8762903963231224, + "grad_norm": 3.1794867515628527, + "learning_rate": 1.2462489872859225e-05, + "loss": 1.6008, + "step": 4552 + }, + { + "epoch": 0.8762903963231224, + "lm_loss": 0.8905, + "step": 4552, + "vm_loss": 0.1642 + }, + { + "epoch": 0.8762903963231224, + "lm_loss": 1.4828, + "step": 4552, + "vm_loss": 0.1505 + }, + { + "epoch": 0.8762903963231224, + "lm_loss": 1.149, + "step": 4552, + "vm_loss": 0.1048 + }, + { + "epoch": 0.8762903963231224, + "lm_loss": 1.513, + "step": 4552, + "vm_loss": 0.1539 + }, + { + "epoch": 0.8762903963231224, + "lm_loss": 1.3674, + "step": 4552, + "vm_loss": 0.1132 + }, + { + "epoch": 0.8762903963231224, + "lm_loss": 1.6657, + "step": 4552, + "vm_loss": 0.2663 + }, + { + "epoch": 0.8762903963231224, + "lm_loss": 1.4919, + "step": 4552, + "vm_loss": 0.166 + }, + { + "epoch": 0.8762903963231224, + "lm_loss": 1.6948, + "step": 4552, + "vm_loss": 0.1743 + }, + { + "epoch": 0.8764829030006979, + "grad_norm": 2.898069556738775, + "learning_rate": 1.2459467867210195e-05, + "loss": 1.6365, + "step": 4553 + }, + { + "epoch": 0.8766754096782732, + "grad_norm": 3.001681802908033, + "learning_rate": 1.2456445622469422e-05, + "loss": 1.6694, + "step": 4554 + }, + { + "epoch": 0.8768679163558486, + "grad_norm": 3.1156492602116796, + "learning_rate": 1.2453423138930704e-05, + "loss": 1.6596, + "step": 4555 + }, + { + "epoch": 0.877060423033424, + "grad_norm": 3.039170441785524, + "learning_rate": 1.2450400416887872e-05, + "loss": 1.6374, + "step": 4556 + }, + { + "epoch": 0.8772529297109993, + "grad_norm": 3.0219659222375914, + "learning_rate": 1.244737745663477e-05, + "loss": 1.649, + "step": 4557 + }, + { + "epoch": 0.8774454363885748, + "grad_norm": 3.08620213967334, + "learning_rate": 1.2444354258465268e-05, + "loss": 1.6217, + "step": 4558 + }, + { + "epoch": 0.8776379430661501, + "grad_norm": 3.103126411561901, + "learning_rate": 1.2441330822673257e-05, + "loss": 1.6227, + "step": 4559 + }, + { + "epoch": 0.8778304497437255, + "grad_norm": 3.009849825563307, + "learning_rate": 1.2438307149552659e-05, + "loss": 1.5811, + "step": 4560 + }, + { + "epoch": 0.8778304497437255, + "lm_loss": 1.7564, + "step": 4560, + "vm_loss": 0.1432 + }, + { + "epoch": 0.8778304497437255, + "lm_loss": 1.1507, + "step": 4560, + "vm_loss": 0.2036 + }, + { + "epoch": 0.8778304497437255, + "lm_loss": 1.2605, + "step": 4560, + "vm_loss": 0.2205 + }, + { + "epoch": 0.8778304497437255, + "lm_loss": 1.2455, + "step": 4560, + "vm_loss": 0.1708 + }, + { + "epoch": 0.8778304497437255, + "lm_loss": 1.5245, + "step": 4560, + "vm_loss": 0.1504 + }, + { + "epoch": 0.8778304497437255, + "lm_loss": 2.0303, + "step": 4560, + "vm_loss": 0.1204 + }, + { + "epoch": 0.8778304497437255, + "lm_loss": 1.2958, + "step": 4560, + "vm_loss": 0.1698 + }, + { + "epoch": 0.8778304497437255, + "lm_loss": 1.6348, + "step": 4560, + "vm_loss": 0.2421 + }, + { + "epoch": 0.8780229564213009, + "grad_norm": 3.0874447260341302, + "learning_rate": 1.2435283239397411e-05, + "loss": 1.6386, + "step": 4561 + }, + { + "epoch": 0.8782154630988762, + "grad_norm": 2.9737312909336198, + "learning_rate": 1.2432259092501474e-05, + "loss": 1.5963, + "step": 4562 + }, + { + "epoch": 0.8784079697764516, + "grad_norm": 3.01348030067374, + "learning_rate": 1.2429234709158837e-05, + "loss": 1.6532, + "step": 4563 + }, + { + "epoch": 0.878600476454027, + "grad_norm": 3.161406167177918, + "learning_rate": 1.2426210089663507e-05, + "loss": 1.691, + "step": 4564 + }, + { + "epoch": 0.8787929831316024, + "grad_norm": 3.190201401742706, + "learning_rate": 1.2423185234309516e-05, + "loss": 1.7028, + "step": 4565 + }, + { + "epoch": 0.8789854898091778, + "grad_norm": 3.2318175696114064, + "learning_rate": 1.2420160143390918e-05, + "loss": 1.6397, + "step": 4566 + }, + { + "epoch": 0.8791779964867531, + "grad_norm": 3.1098491512355766, + "learning_rate": 1.2417134817201792e-05, + "loss": 1.6048, + "step": 4567 + }, + { + "epoch": 0.8793705031643285, + "grad_norm": 3.0554103452449124, + "learning_rate": 1.2414109256036237e-05, + "loss": 1.7338, + "step": 4568 + }, + { + "epoch": 0.8793705031643285, + "lm_loss": 1.7922, + "step": 4568, + "vm_loss": 0.1493 + }, + { + "epoch": 0.8793705031643285, + "lm_loss": 0.8861, + "step": 4568, + "vm_loss": 0.2132 + }, + { + "epoch": 0.8793705031643285, + "lm_loss": 1.3009, + "step": 4568, + "vm_loss": 0.1244 + }, + { + "epoch": 0.8793705031643285, + "lm_loss": 1.2766, + "step": 4568, + "vm_loss": 0.1921 + }, + { + "epoch": 0.8793705031643285, + "lm_loss": 1.4508, + "step": 4568, + "vm_loss": 0.1635 + }, + { + "epoch": 0.8793705031643285, + "lm_loss": 1.1622, + "step": 4568, + "vm_loss": 0.1423 + }, + { + "epoch": 0.8793705031643285, + "lm_loss": 1.5518, + "step": 4568, + "vm_loss": 0.1774 + }, + { + "epoch": 0.8793705031643285, + "lm_loss": 1.9879, + "step": 4568, + "vm_loss": 0.1435 + }, + { + "epoch": 0.8795630098419039, + "grad_norm": 3.049499359721112, + "learning_rate": 1.2411083460188378e-05, + "loss": 1.6687, + "step": 4569 + }, + { + "epoch": 0.8797555165194793, + "grad_norm": 2.900962754963383, + "learning_rate": 1.240805742995236e-05, + "loss": 1.6325, + "step": 4570 + }, + { + "epoch": 0.8799480231970547, + "grad_norm": 3.08890790992242, + "learning_rate": 1.240503116562235e-05, + "loss": 1.6275, + "step": 4571 + }, + { + "epoch": 0.88014052987463, + "grad_norm": 3.0867500115031805, + "learning_rate": 1.240200466749254e-05, + "loss": 1.6601, + "step": 4572 + }, + { + "epoch": 0.8803330365522054, + "grad_norm": 3.145114025357283, + "learning_rate": 1.239897793585715e-05, + "loss": 1.68, + "step": 4573 + }, + { + "epoch": 0.8805255432297808, + "grad_norm": 3.1196334446703586, + "learning_rate": 1.2395950971010408e-05, + "loss": 1.604, + "step": 4574 + }, + { + "epoch": 0.8807180499073561, + "grad_norm": 3.09137320033844, + "learning_rate": 1.2392923773246583e-05, + "loss": 1.6026, + "step": 4575 + }, + { + "epoch": 0.8809105565849316, + "grad_norm": 3.3147088740344417, + "learning_rate": 1.238989634285995e-05, + "loss": 1.6865, + "step": 4576 + }, + { + "epoch": 0.8809105565849316, + "lm_loss": 1.809, + "step": 4576, + "vm_loss": 0.1408 + }, + { + "epoch": 0.8809105565849316, + "lm_loss": 1.4648, + "step": 4576, + "vm_loss": 0.1628 + }, + { + "epoch": 0.8809105565849316, + "lm_loss": 1.6058, + "step": 4576, + "vm_loss": 0.1401 + }, + { + "epoch": 0.8809105565849316, + "lm_loss": 1.6269, + "step": 4576, + "vm_loss": 0.1459 + }, + { + "epoch": 0.8809105565849316, + "lm_loss": 1.376, + "step": 4576, + "vm_loss": 0.1693 + }, + { + "epoch": 0.8809105565849316, + "lm_loss": 1.7382, + "step": 4576, + "vm_loss": 0.1007 + }, + { + "epoch": 0.8809105565849316, + "lm_loss": 1.6333, + "step": 4576, + "vm_loss": 0.2567 + }, + { + "epoch": 0.8809105565849316, + "lm_loss": 1.4625, + "step": 4576, + "vm_loss": 0.2702 + }, + { + "epoch": 0.8811030632625069, + "grad_norm": 3.19647188730665, + "learning_rate": 1.2386868680144816e-05, + "loss": 1.7354, + "step": 4577 + }, + { + "epoch": 0.8812955699400823, + "grad_norm": 2.928682338156493, + "learning_rate": 1.2383840785395511e-05, + "loss": 1.5272, + "step": 4578 + }, + { + "epoch": 0.8814880766176577, + "grad_norm": 2.9378246150967513, + "learning_rate": 1.2380812658906385e-05, + "loss": 1.6514, + "step": 4579 + }, + { + "epoch": 0.881680583295233, + "grad_norm": 3.1033376933264236, + "learning_rate": 1.2377784300971807e-05, + "loss": 1.6931, + "step": 4580 + }, + { + "epoch": 0.8818730899728084, + "grad_norm": 3.1889888253104086, + "learning_rate": 1.2374755711886173e-05, + "loss": 1.6055, + "step": 4581 + }, + { + "epoch": 0.8820655966503839, + "grad_norm": 3.0050036021158197, + "learning_rate": 1.2371726891943905e-05, + "loss": 1.5516, + "step": 4582 + }, + { + "epoch": 0.8822581033279592, + "grad_norm": 3.1669435906135748, + "learning_rate": 1.2368697841439439e-05, + "loss": 1.5779, + "step": 4583 + }, + { + "epoch": 0.8824506100055346, + "grad_norm": 3.182860087338219, + "learning_rate": 1.2365668560667237e-05, + "loss": 1.6214, + "step": 4584 + }, + { + "epoch": 0.8824506100055346, + "lm_loss": 1.6302, + "step": 4584, + "vm_loss": 0.1838 + }, + { + "epoch": 0.8824506100055346, + "lm_loss": 1.5885, + "step": 4584, + "vm_loss": 0.2405 + }, + { + "epoch": 0.8824506100055346, + "lm_loss": 1.5876, + "step": 4584, + "vm_loss": 0.2252 + }, + { + "epoch": 0.8824506100055346, + "lm_loss": 1.3184, + "step": 4584, + "vm_loss": 0.2534 + }, + { + "epoch": 0.8824506100055346, + "lm_loss": 1.2578, + "step": 4584, + "vm_loss": 0.0995 + }, + { + "epoch": 0.8824506100055346, + "lm_loss": 1.3042, + "step": 4584, + "vm_loss": 0.1601 + }, + { + "epoch": 0.8824506100055346, + "lm_loss": 0.9598, + "step": 4584, + "vm_loss": 0.2077 + }, + { + "epoch": 0.8824506100055346, + "lm_loss": 1.0264, + "step": 4584, + "vm_loss": 0.166 + }, + { + "epoch": 0.8826431166831099, + "grad_norm": 3.3881675973074805, + "learning_rate": 1.2362639049921785e-05, + "loss": 1.6034, + "step": 4585 + }, + { + "epoch": 0.8828356233606853, + "grad_norm": 3.3858077264681414, + "learning_rate": 1.2359609309497591e-05, + "loss": 1.5744, + "step": 4586 + }, + { + "epoch": 0.8830281300382608, + "grad_norm": 3.203102517177925, + "learning_rate": 1.2356579339689186e-05, + "loss": 1.704, + "step": 4587 + }, + { + "epoch": 0.8832206367158361, + "grad_norm": 3.1306568130015515, + "learning_rate": 1.2353549140791116e-05, + "loss": 1.567, + "step": 4588 + }, + { + "epoch": 0.8834131433934115, + "grad_norm": 3.2228695926530846, + "learning_rate": 1.2350518713097964e-05, + "loss": 1.6452, + "step": 4589 + }, + { + "epoch": 0.8836056500709868, + "grad_norm": 3.0623422546782244, + "learning_rate": 1.2347488056904316e-05, + "loss": 1.5413, + "step": 4590 + }, + { + "epoch": 0.8837981567485622, + "grad_norm": 3.2335550336654526, + "learning_rate": 1.2344457172504796e-05, + "loss": 1.697, + "step": 4591 + }, + { + "epoch": 0.8839906634261376, + "grad_norm": 3.0967465520611213, + "learning_rate": 1.2341426060194049e-05, + "loss": 1.5974, + "step": 4592 + }, + { + "epoch": 0.8839906634261376, + "lm_loss": 1.5585, + "step": 4592, + "vm_loss": 0.1552 + }, + { + "epoch": 0.8839906634261376, + "lm_loss": 1.5586, + "step": 4592, + "vm_loss": 0.1143 + }, + { + "epoch": 0.8839906634261376, + "lm_loss": 1.0075, + "step": 4592, + "vm_loss": 0.1615 + }, + { + "epoch": 0.8839906634261376, + "lm_loss": 1.4814, + "step": 4592, + "vm_loss": 0.2125 + }, + { + "epoch": 0.8839906634261376, + "lm_loss": 1.3402, + "step": 4592, + "vm_loss": 0.2109 + }, + { + "epoch": 0.8839906634261376, + "lm_loss": 1.4366, + "step": 4592, + "vm_loss": 0.1751 + }, + { + "epoch": 0.8839906634261376, + "lm_loss": 1.7803, + "step": 4592, + "vm_loss": 0.1199 + }, + { + "epoch": 0.8839906634261376, + "lm_loss": 1.3822, + "step": 4592, + "vm_loss": 0.1721 + }, + { + "epoch": 0.884183170103713, + "grad_norm": 3.315556386594215, + "learning_rate": 1.2338394720266729e-05, + "loss": 1.6535, + "step": 4593 + }, + { + "epoch": 0.8843756767812884, + "grad_norm": 3.1962428399109504, + "learning_rate": 1.2335363153017528e-05, + "loss": 1.635, + "step": 4594 + }, + { + "epoch": 0.8845681834588637, + "grad_norm": 3.0978866346938694, + "learning_rate": 1.2332331358741147e-05, + "loss": 1.692, + "step": 4595 + }, + { + "epoch": 0.8847606901364391, + "grad_norm": 3.0753693627008856, + "learning_rate": 1.232929933773232e-05, + "loss": 1.5809, + "step": 4596 + }, + { + "epoch": 0.8849531968140145, + "grad_norm": 3.149636508210439, + "learning_rate": 1.23262670902858e-05, + "loss": 1.7049, + "step": 4597 + }, + { + "epoch": 0.8851457034915898, + "grad_norm": 2.988612280970963, + "learning_rate": 1.2323234616696353e-05, + "loss": 1.6249, + "step": 4598 + }, + { + "epoch": 0.8853382101691653, + "grad_norm": 3.1006811744798806, + "learning_rate": 1.232020191725878e-05, + "loss": 1.6025, + "step": 4599 + }, + { + "epoch": 0.8855307168467407, + "grad_norm": 3.1546596868266668, + "learning_rate": 1.2317168992267897e-05, + "loss": 1.6837, + "step": 4600 + }, + { + "epoch": 0.8855307168467407, + "lm_loss": 1.6674, + "step": 4600, + "vm_loss": 0.1659 + }, + { + "epoch": 0.8855307168467407, + "lm_loss": 2.0395, + "step": 4600, + "vm_loss": 0.2249 + }, + { + "epoch": 0.8855307168467407, + "lm_loss": 1.7314, + "step": 4600, + "vm_loss": 0.2164 + }, + { + "epoch": 0.8855307168467407, + "lm_loss": 1.026, + "step": 4600, + "vm_loss": 0.162 + }, + { + "epoch": 0.8855307168467407, + "lm_loss": 1.4702, + "step": 4600, + "vm_loss": 0.1194 + }, + { + "epoch": 0.8855307168467407, + "lm_loss": 1.5369, + "step": 4600, + "vm_loss": 0.2163 + }, + { + "epoch": 0.8855307168467407, + "lm_loss": 1.2585, + "step": 4600, + "vm_loss": 0.1399 + }, + { + "epoch": 0.8855307168467407, + "lm_loss": 1.6183, + "step": 4600, + "vm_loss": 0.2088 + }, + { + "epoch": 0.885723223524316, + "grad_norm": 3.0676064950398714, + "learning_rate": 1.2314135842018541e-05, + "loss": 1.6357, + "step": 4601 + }, + { + "epoch": 0.8859157302018914, + "grad_norm": 3.4160529813487814, + "learning_rate": 1.2311102466805578e-05, + "loss": 1.6635, + "step": 4602 + }, + { + "epoch": 0.8861082368794667, + "grad_norm": 3.0827882577358348, + "learning_rate": 1.2308068866923885e-05, + "loss": 1.6261, + "step": 4603 + }, + { + "epoch": 0.8863007435570421, + "grad_norm": 3.1139809583041567, + "learning_rate": 1.2305035042668372e-05, + "loss": 1.6624, + "step": 4604 + }, + { + "epoch": 0.8864932502346176, + "grad_norm": 3.2010709841406575, + "learning_rate": 1.230200099433396e-05, + "loss": 1.6666, + "step": 4605 + }, + { + "epoch": 0.8866857569121929, + "grad_norm": 3.159535980287746, + "learning_rate": 1.2298966722215604e-05, + "loss": 1.6099, + "step": 4606 + }, + { + "epoch": 0.8868782635897683, + "grad_norm": 3.1742782734120714, + "learning_rate": 1.2295932226608267e-05, + "loss": 1.65, + "step": 4607 + }, + { + "epoch": 0.8870707702673436, + "grad_norm": 3.1888948681243336, + "learning_rate": 1.2292897507806942e-05, + "loss": 1.567, + "step": 4608 + }, + { + "epoch": 0.8870707702673436, + "lm_loss": 1.5239, + "step": 4608, + "vm_loss": 0.1499 + }, + { + "epoch": 0.8870707702673436, + "lm_loss": 1.1824, + "step": 4608, + "vm_loss": 0.1325 + }, + { + "epoch": 0.8870707702673436, + "lm_loss": 1.5153, + "step": 4608, + "vm_loss": 0.1978 + }, + { + "epoch": 0.8870707702673436, + "lm_loss": 1.562, + "step": 4608, + "vm_loss": 0.0847 + }, + { + "epoch": 0.8870707702673436, + "lm_loss": 1.2188, + "step": 4608, + "vm_loss": 0.1627 + }, + { + "epoch": 0.8870707702673436, + "lm_loss": 1.5364, + "step": 4608, + "vm_loss": 0.1723 + }, + { + "epoch": 0.8870707702673436, + "lm_loss": 1.7758, + "step": 4608, + "vm_loss": 0.1393 + }, + { + "epoch": 0.8870707702673436, + "lm_loss": 2.0222, + "step": 4608, + "vm_loss": 0.1678 + }, + { + "epoch": 0.887263276944919, + "grad_norm": 3.3027548119621435, + "learning_rate": 1.2289862566106649e-05, + "loss": 1.6529, + "step": 4609 + }, + { + "epoch": 0.8874557836224944, + "grad_norm": 3.1453994806310086, + "learning_rate": 1.2286827401802418e-05, + "loss": 1.4935, + "step": 4610 + }, + { + "epoch": 0.8876482903000698, + "grad_norm": 3.227512629656992, + "learning_rate": 1.2283792015189304e-05, + "loss": 1.6087, + "step": 4611 + }, + { + "epoch": 0.8878407969776452, + "grad_norm": 3.0819205992772725, + "learning_rate": 1.2280756406562388e-05, + "loss": 1.6505, + "step": 4612 + }, + { + "epoch": 0.8880333036552205, + "grad_norm": 3.1243158083345848, + "learning_rate": 1.2277720576216772e-05, + "loss": 1.6221, + "step": 4613 + }, + { + "epoch": 0.8882258103327959, + "grad_norm": 3.1095502589038673, + "learning_rate": 1.2274684524447575e-05, + "loss": 1.6685, + "step": 4614 + }, + { + "epoch": 0.8884183170103713, + "grad_norm": 3.253054573823724, + "learning_rate": 1.227164825154994e-05, + "loss": 1.6932, + "step": 4615 + }, + { + "epoch": 0.8886108236879466, + "grad_norm": 3.0614054837713764, + "learning_rate": 1.2268611757819034e-05, + "loss": 1.6843, + "step": 4616 + }, + { + "epoch": 0.8886108236879466, + "lm_loss": 0.6115, + "step": 4616, + "vm_loss": 0.1563 + }, + { + "epoch": 0.8886108236879466, + "lm_loss": 1.635, + "step": 4616, + "vm_loss": 0.2065 + }, + { + "epoch": 0.8886108236879466, + "lm_loss": 1.0678, + "step": 4616, + "vm_loss": 0.1891 + }, + { + "epoch": 0.8886108236879466, + "lm_loss": 2.0023, + "step": 4616, + "vm_loss": 0.1789 + }, + { + "epoch": 0.8886108236879466, + "lm_loss": 1.3538, + "step": 4616, + "vm_loss": 0.1826 + }, + { + "epoch": 0.8886108236879466, + "lm_loss": 1.0059, + "step": 4616, + "vm_loss": 0.1928 + }, + { + "epoch": 0.8886108236879466, + "lm_loss": 1.0953, + "step": 4616, + "vm_loss": 0.1783 + }, + { + "epoch": 0.8886108236879466, + "lm_loss": 1.3482, + "step": 4616, + "vm_loss": 0.1908 + }, + { + "epoch": 0.8888033303655221, + "grad_norm": 3.10550399722038, + "learning_rate": 1.2265575043550038e-05, + "loss": 1.565, + "step": 4617 + }, + { + "epoch": 0.8889958370430975, + "grad_norm": 3.100190892774192, + "learning_rate": 1.2262538109038162e-05, + "loss": 1.6939, + "step": 4618 + }, + { + "epoch": 0.8891883437206728, + "grad_norm": 3.0403326081130655, + "learning_rate": 1.225950095457864e-05, + "loss": 1.6972, + "step": 4619 + }, + { + "epoch": 0.8893808503982482, + "grad_norm": 3.128885863206769, + "learning_rate": 1.2256463580466713e-05, + "loss": 1.6154, + "step": 4620 + }, + { + "epoch": 0.8895733570758235, + "grad_norm": 3.0510610344879048, + "learning_rate": 1.2253425986997658e-05, + "loss": 1.6542, + "step": 4621 + }, + { + "epoch": 0.889765863753399, + "grad_norm": 2.9153665077793716, + "learning_rate": 1.225038817446677e-05, + "loss": 1.6346, + "step": 4622 + }, + { + "epoch": 0.8899583704309744, + "grad_norm": 2.942641990042011, + "learning_rate": 1.2247350143169358e-05, + "loss": 1.5289, + "step": 4623 + }, + { + "epoch": 0.8901508771085497, + "grad_norm": 3.1305139392744055, + "learning_rate": 1.2244311893400761e-05, + "loss": 1.6499, + "step": 4624 + }, + { + "epoch": 0.8901508771085497, + "lm_loss": 1.9097, + "step": 4624, + "vm_loss": 0.1491 + }, + { + "epoch": 0.8901508771085497, + "lm_loss": 1.4179, + "step": 4624, + "vm_loss": 0.1855 + }, + { + "epoch": 0.8901508771085497, + "lm_loss": 1.5429, + "step": 4624, + "vm_loss": 0.1783 + }, + { + "epoch": 0.8901508771085497, + "lm_loss": 1.9257, + "step": 4624, + "vm_loss": 0.1431 + }, + { + "epoch": 0.8901508771085497, + "lm_loss": 1.5695, + "step": 4624, + "vm_loss": 0.1973 + }, + { + "epoch": 0.8901508771085497, + "lm_loss": 1.3959, + "step": 4624, + "vm_loss": 0.1662 + }, + { + "epoch": 0.8901508771085497, + "lm_loss": 1.358, + "step": 4624, + "vm_loss": 0.2183 + }, + { + "epoch": 0.8901508771085497, + "lm_loss": 1.1129, + "step": 4624, + "vm_loss": 0.1069 + }, + { + "epoch": 0.8903433837861251, + "grad_norm": 3.0393107727471476, + "learning_rate": 1.2241273425456335e-05, + "loss": 1.6413, + "step": 4625 + }, + { + "epoch": 0.8905358904637004, + "grad_norm": 3.0958053591355994, + "learning_rate": 1.223823473963146e-05, + "loss": 1.6782, + "step": 4626 + }, + { + "epoch": 0.8907283971412758, + "grad_norm": 3.2182620886126374, + "learning_rate": 1.223519583622153e-05, + "loss": 1.6827, + "step": 4627 + }, + { + "epoch": 0.8909209038188513, + "grad_norm": 3.179706211206332, + "learning_rate": 1.223215671552197e-05, + "loss": 1.6415, + "step": 4628 + }, + { + "epoch": 0.8911134104964266, + "grad_norm": 3.244708835716366, + "learning_rate": 1.2229117377828219e-05, + "loss": 1.6588, + "step": 4629 + }, + { + "epoch": 0.891305917174002, + "grad_norm": 3.1654432788061952, + "learning_rate": 1.2226077823435738e-05, + "loss": 1.6757, + "step": 4630 + }, + { + "epoch": 0.8914984238515774, + "grad_norm": 3.10313238038243, + "learning_rate": 1.2223038052640016e-05, + "loss": 1.6096, + "step": 4631 + }, + { + "epoch": 0.8916909305291527, + "grad_norm": 3.052738658824864, + "learning_rate": 1.2219998065736554e-05, + "loss": 1.65, + "step": 4632 + }, + { + "epoch": 0.8916909305291527, + "lm_loss": 1.8079, + "step": 4632, + "vm_loss": 0.1638 + }, + { + "epoch": 0.8916909305291527, + "lm_loss": 1.6441, + "step": 4632, + "vm_loss": 0.146 + }, + { + "epoch": 0.8916909305291527, + "lm_loss": 1.2368, + "step": 4632, + "vm_loss": 0.1584 + }, + { + "epoch": 0.8916909305291527, + "lm_loss": 1.2051, + "step": 4632, + "vm_loss": 0.2239 + }, + { + "epoch": 0.8916909305291527, + "lm_loss": 1.6459, + "step": 4632, + "vm_loss": 0.098 + }, + { + "epoch": 0.8916909305291527, + "lm_loss": 1.3861, + "step": 4632, + "vm_loss": 0.1381 + }, + { + "epoch": 0.8916909305291527, + "lm_loss": 1.1626, + "step": 4632, + "vm_loss": 0.1575 + }, + { + "epoch": 0.8916909305291527, + "lm_loss": 2.021, + "step": 4632, + "vm_loss": 0.1634 + }, + { + "epoch": 0.8918834372067281, + "grad_norm": 3.2697272247795865, + "learning_rate": 1.2216957863020878e-05, + "loss": 1.5705, + "step": 4633 + }, + { + "epoch": 0.8920759438843034, + "grad_norm": 2.9708022167190373, + "learning_rate": 1.2213917444788532e-05, + "loss": 1.5708, + "step": 4634 + }, + { + "epoch": 0.8922684505618789, + "grad_norm": 3.0047296594198984, + "learning_rate": 1.2210876811335089e-05, + "loss": 1.5814, + "step": 4635 + }, + { + "epoch": 0.8924609572394543, + "grad_norm": 3.1603834086492255, + "learning_rate": 1.220783596295613e-05, + "loss": 1.6645, + "step": 4636 + }, + { + "epoch": 0.8926534639170296, + "grad_norm": 3.023515673247715, + "learning_rate": 1.2204794899947272e-05, + "loss": 1.5917, + "step": 4637 + }, + { + "epoch": 0.892845970594605, + "grad_norm": 3.1204589083766807, + "learning_rate": 1.2201753622604145e-05, + "loss": 1.6147, + "step": 4638 + }, + { + "epoch": 0.8930384772721803, + "grad_norm": 3.2572988148402566, + "learning_rate": 1.2198712131222395e-05, + "loss": 1.6018, + "step": 4639 + }, + { + "epoch": 0.8932309839497558, + "grad_norm": 3.2353028219647286, + "learning_rate": 1.2195670426097692e-05, + "loss": 1.5814, + "step": 4640 + }, + { + "epoch": 0.8932309839497558, + "lm_loss": 1.9769, + "step": 4640, + "vm_loss": 0.2061 + }, + { + "epoch": 0.8932309839497558, + "lm_loss": 1.3061, + "step": 4640, + "vm_loss": 0.1813 + }, + { + "epoch": 0.8932309839497558, + "lm_loss": 1.0332, + "step": 4640, + "vm_loss": 0.2129 + }, + { + "epoch": 0.8932309839497558, + "lm_loss": 1.294, + "step": 4640, + "vm_loss": 0.1868 + }, + { + "epoch": 0.8932309839497558, + "lm_loss": 2.0536, + "step": 4640, + "vm_loss": 0.1034 + }, + { + "epoch": 0.8932309839497558, + "lm_loss": 1.6611, + "step": 4640, + "vm_loss": 0.1663 + }, + { + "epoch": 0.8932309839497558, + "lm_loss": 1.6921, + "step": 4640, + "vm_loss": 0.1409 + }, + { + "epoch": 0.8932309839497558, + "lm_loss": 1.5264, + "step": 4640, + "vm_loss": 0.1176 + }, + { + "epoch": 0.8934234906273312, + "grad_norm": 3.366885372667514, + "learning_rate": 1.2192628507525737e-05, + "loss": 1.668, + "step": 4641 + }, + { + "epoch": 0.8936159973049065, + "grad_norm": 3.0493219596915204, + "learning_rate": 1.218958637580224e-05, + "loss": 1.586, + "step": 4642 + }, + { + "epoch": 0.8938085039824819, + "grad_norm": 3.18845784404462, + "learning_rate": 1.2186544031222932e-05, + "loss": 1.652, + "step": 4643 + }, + { + "epoch": 0.8940010106600572, + "grad_norm": 3.178947272327734, + "learning_rate": 1.218350147408357e-05, + "loss": 1.6399, + "step": 4644 + }, + { + "epoch": 0.8941935173376326, + "grad_norm": 3.00309772751451, + "learning_rate": 1.2180458704679931e-05, + "loss": 1.5833, + "step": 4645 + }, + { + "epoch": 0.8943860240152081, + "grad_norm": 3.0841302452372537, + "learning_rate": 1.2177415723307808e-05, + "loss": 1.6665, + "step": 4646 + }, + { + "epoch": 0.8945785306927834, + "grad_norm": 3.216045703054112, + "learning_rate": 1.2174372530263018e-05, + "loss": 1.645, + "step": 4647 + }, + { + "epoch": 0.8947710373703588, + "grad_norm": 3.05202555598051, + "learning_rate": 1.2171329125841404e-05, + "loss": 1.6664, + "step": 4648 + }, + { + "epoch": 0.8947710373703588, + "lm_loss": 1.8764, + "step": 4648, + "vm_loss": 0.1733 + }, + { + "epoch": 0.8947710373703588, + "lm_loss": 1.2324, + "step": 4648, + "vm_loss": 0.1451 + }, + { + "epoch": 0.8947710373703588, + "lm_loss": 1.0173, + "step": 4648, + "vm_loss": 0.1253 + }, + { + "epoch": 0.8947710373703588, + "lm_loss": 1.1536, + "step": 4648, + "vm_loss": 0.1377 + }, + { + "epoch": 0.8947710373703588, + "lm_loss": 1.2665, + "step": 4648, + "vm_loss": 0.1489 + }, + { + "epoch": 0.8947710373703588, + "lm_loss": 1.3861, + "step": 4648, + "vm_loss": 0.1632 + }, + { + "epoch": 0.8947710373703588, + "lm_loss": 1.4213, + "step": 4648, + "vm_loss": 0.2204 + }, + { + "epoch": 0.8947710373703588, + "lm_loss": 1.4593, + "step": 4648, + "vm_loss": 0.2035 + }, + { + "epoch": 0.8949635440479342, + "grad_norm": 3.2904167436694265, + "learning_rate": 1.2168285510338817e-05, + "loss": 1.6271, + "step": 4649 + }, + { + "epoch": 0.8951560507255095, + "grad_norm": 3.1996784863490455, + "learning_rate": 1.216524168405114e-05, + "loss": 1.6564, + "step": 4650 + }, + { + "epoch": 0.8953485574030849, + "grad_norm": 3.0093466192511378, + "learning_rate": 1.2162197647274269e-05, + "loss": 1.6286, + "step": 4651 + }, + { + "epoch": 0.8955410640806603, + "grad_norm": 3.190197375782913, + "learning_rate": 1.2159153400304124e-05, + "loss": 1.6191, + "step": 4652 + }, + { + "epoch": 0.8957335707582357, + "grad_norm": 3.095108948285983, + "learning_rate": 1.2156108943436644e-05, + "loss": 1.6293, + "step": 4653 + }, + { + "epoch": 0.8959260774358111, + "grad_norm": 3.169867952570169, + "learning_rate": 1.215306427696779e-05, + "loss": 1.6101, + "step": 4654 + }, + { + "epoch": 0.8961185841133864, + "grad_norm": 3.064936468378051, + "learning_rate": 1.2150019401193545e-05, + "loss": 1.6075, + "step": 4655 + }, + { + "epoch": 0.8963110907909618, + "grad_norm": 3.31709284902921, + "learning_rate": 1.2146974316409902e-05, + "loss": 1.6022, + "step": 4656 + }, + { + "epoch": 0.8963110907909618, + "lm_loss": 1.4719, + "step": 4656, + "vm_loss": 0.1312 + }, + { + "epoch": 0.8963110907909618, + "lm_loss": 1.3698, + "step": 4656, + "vm_loss": 0.1665 + }, + { + "epoch": 0.8963110907909618, + "lm_loss": 1.2543, + "step": 4656, + "vm_loss": 0.1979 + }, + { + "epoch": 0.8963110907909618, + "lm_loss": 1.6949, + "step": 4656, + "vm_loss": 0.1631 + }, + { + "epoch": 0.8963110907909618, + "lm_loss": 1.6257, + "step": 4656, + "vm_loss": 0.1498 + }, + { + "epoch": 0.8963110907909618, + "lm_loss": 1.2, + "step": 4656, + "vm_loss": 0.1948 + }, + { + "epoch": 0.8963110907909618, + "lm_loss": 1.6329, + "step": 4656, + "vm_loss": 0.1237 + }, + { + "epoch": 0.8963110907909618, + "lm_loss": 1.4026, + "step": 4656, + "vm_loss": 0.1902 + }, + { + "epoch": 0.8965035974685371, + "grad_norm": 3.1366733945081773, + "learning_rate": 1.2143929022912895e-05, + "loss": 1.5494, + "step": 4657 + }, + { + "epoch": 0.8966961041461126, + "grad_norm": 3.1187682095136795, + "learning_rate": 1.2140883520998552e-05, + "loss": 1.5779, + "step": 4658 + }, + { + "epoch": 0.896888610823688, + "grad_norm": 3.0839650711338087, + "learning_rate": 1.2137837810962942e-05, + "loss": 1.6034, + "step": 4659 + }, + { + "epoch": 0.8970811175012633, + "grad_norm": 3.0239628653720616, + "learning_rate": 1.213479189310215e-05, + "loss": 1.622, + "step": 4660 + }, + { + "epoch": 0.8972736241788387, + "grad_norm": 3.0494806224712994, + "learning_rate": 1.2131745767712268e-05, + "loss": 1.5916, + "step": 4661 + }, + { + "epoch": 0.8974661308564141, + "grad_norm": 3.076872414199497, + "learning_rate": 1.2128699435089429e-05, + "loss": 1.6399, + "step": 4662 + }, + { + "epoch": 0.8976586375339894, + "grad_norm": 3.0832554094676636, + "learning_rate": 1.2125652895529766e-05, + "loss": 1.6499, + "step": 4663 + }, + { + "epoch": 0.8978511442115649, + "grad_norm": 3.132694545451436, + "learning_rate": 1.2122606149329452e-05, + "loss": 1.6629, + "step": 4664 + }, + { + "epoch": 0.8978511442115649, + "lm_loss": 1.4086, + "step": 4664, + "vm_loss": 0.1654 + }, + { + "epoch": 0.8978511442115649, + "lm_loss": 1.6571, + "step": 4664, + "vm_loss": 0.1608 + }, + { + "epoch": 0.8978511442115649, + "lm_loss": 1.744, + "step": 4664, + "vm_loss": 0.1884 + }, + { + "epoch": 0.8978511442115649, + "lm_loss": 1.4371, + "step": 4664, + "vm_loss": 0.1887 + }, + { + "epoch": 0.8978511442115649, + "lm_loss": 1.5094, + "step": 4664, + "vm_loss": 0.1705 + }, + { + "epoch": 0.8978511442115649, + "lm_loss": 0.94, + "step": 4664, + "vm_loss": 0.1202 + }, + { + "epoch": 0.8978511442115649, + "lm_loss": 1.7128, + "step": 4664, + "vm_loss": 0.2407 + }, + { + "epoch": 0.8978511442115649, + "lm_loss": 1.5537, + "step": 4664, + "vm_loss": 0.2256 + }, + { + "epoch": 0.8980436508891402, + "grad_norm": 3.3090151082549486, + "learning_rate": 1.211955919678466e-05, + "loss": 1.6759, + "step": 4665 + }, + { + "epoch": 0.8982361575667156, + "grad_norm": 3.08192270973342, + "learning_rate": 1.2116512038191599e-05, + "loss": 1.5975, + "step": 4666 + }, + { + "epoch": 0.898428664244291, + "grad_norm": 3.1936096077047322, + "learning_rate": 1.2113464673846492e-05, + "loss": 1.6439, + "step": 4667 + }, + { + "epoch": 0.8986211709218663, + "grad_norm": 3.0011780195168476, + "learning_rate": 1.2110417104045575e-05, + "loss": 1.6437, + "step": 4668 + }, + { + "epoch": 0.8988136775994418, + "grad_norm": 3.037762702966735, + "learning_rate": 1.210736932908512e-05, + "loss": 1.6962, + "step": 4669 + }, + { + "epoch": 0.8990061842770171, + "grad_norm": 3.0322050904902, + "learning_rate": 1.2104321349261401e-05, + "loss": 1.5935, + "step": 4670 + }, + { + "epoch": 0.8991986909545925, + "grad_norm": 3.158070329535032, + "learning_rate": 1.2101273164870726e-05, + "loss": 1.6979, + "step": 4671 + }, + { + "epoch": 0.8993911976321679, + "grad_norm": 3.00476197813505, + "learning_rate": 1.2098224776209417e-05, + "loss": 1.5603, + "step": 4672 + }, + { + "epoch": 0.8993911976321679, + "lm_loss": 1.0689, + "step": 4672, + "vm_loss": 0.1162 + }, + { + "epoch": 0.8993911976321679, + "lm_loss": 2.5474, + "step": 4672, + "vm_loss": 0.1715 + }, + { + "epoch": 0.8993911976321679, + "lm_loss": 1.631, + "step": 4672, + "vm_loss": 0.177 + }, + { + "epoch": 0.8993911976321679, + "lm_loss": 1.789, + "step": 4672, + "vm_loss": 0.2855 + }, + { + "epoch": 0.8993911976321679, + "lm_loss": 1.5229, + "step": 4672, + "vm_loss": 0.1764 + }, + { + "epoch": 0.8993911976321679, + "lm_loss": 1.6291, + "step": 4672, + "vm_loss": 0.1076 + }, + { + "epoch": 0.8993911976321679, + "lm_loss": 1.2529, + "step": 4672, + "vm_loss": 0.1437 + }, + { + "epoch": 0.8993911976321679, + "lm_loss": 1.237, + "step": 4672, + "vm_loss": 0.1077 + }, + { + "epoch": 0.8995837043097432, + "grad_norm": 3.2543579634043627, + "learning_rate": 1.2095176183573815e-05, + "loss": 1.6247, + "step": 4673 + }, + { + "epoch": 0.8997762109873186, + "grad_norm": 3.148133273728255, + "learning_rate": 1.2092127387260283e-05, + "loss": 1.5773, + "step": 4674 + }, + { + "epoch": 0.899968717664894, + "grad_norm": 3.073996355350252, + "learning_rate": 1.2089078387565204e-05, + "loss": 1.6061, + "step": 4675 + }, + { + "epoch": 0.9001612243424694, + "grad_norm": 3.175612370012723, + "learning_rate": 1.2086029184784978e-05, + "loss": 1.6827, + "step": 4676 + }, + { + "epoch": 0.9003537310200448, + "grad_norm": 3.035258058385415, + "learning_rate": 1.2082979779216027e-05, + "loss": 1.6511, + "step": 4677 + }, + { + "epoch": 0.9005462376976201, + "grad_norm": 3.0649198727208358, + "learning_rate": 1.2079930171154792e-05, + "loss": 1.5653, + "step": 4678 + }, + { + "epoch": 0.9007387443751955, + "grad_norm": 3.055381101298269, + "learning_rate": 1.2076880360897737e-05, + "loss": 1.6153, + "step": 4679 + }, + { + "epoch": 0.9009312510527709, + "grad_norm": 3.0158721380168196, + "learning_rate": 1.2073830348741335e-05, + "loss": 1.5756, + "step": 4680 + }, + { + "epoch": 0.9009312510527709, + "lm_loss": 2.2093, + "step": 4680, + "vm_loss": 0.2302 + }, + { + "epoch": 0.9009312510527709, + "lm_loss": 0.9889, + "step": 4680, + "vm_loss": 0.154 + }, + { + "epoch": 0.9009312510527709, + "lm_loss": 1.5358, + "step": 4680, + "vm_loss": 0.1658 + }, + { + "epoch": 0.9009312510527709, + "lm_loss": 1.434, + "step": 4680, + "vm_loss": 0.1717 + }, + { + "epoch": 0.9009312510527709, + "lm_loss": 1.7574, + "step": 4680, + "vm_loss": 0.1896 + }, + { + "epoch": 0.9009312510527709, + "lm_loss": 1.0625, + "step": 4680, + "vm_loss": 0.1965 + }, + { + "epoch": 0.9009312510527709, + "lm_loss": 1.8779, + "step": 4680, + "vm_loss": 0.1701 + }, + { + "epoch": 0.9009312510527709, + "lm_loss": 0.9325, + "step": 4680, + "vm_loss": 0.1494 + }, + { + "epoch": 0.9011237577303463, + "grad_norm": 3.2162631844118277, + "learning_rate": 1.2070780134982095e-05, + "loss": 1.7162, + "step": 4681 + }, + { + "epoch": 0.9013162644079217, + "grad_norm": 3.1308054971656745, + "learning_rate": 1.206772971991653e-05, + "loss": 1.6107, + "step": 4682 + }, + { + "epoch": 0.901508771085497, + "grad_norm": 2.941418197703706, + "learning_rate": 1.2064679103841182e-05, + "loss": 1.5446, + "step": 4683 + }, + { + "epoch": 0.9017012777630724, + "grad_norm": 3.193208413458499, + "learning_rate": 1.2061628287052611e-05, + "loss": 1.6469, + "step": 4684 + }, + { + "epoch": 0.9018937844406478, + "grad_norm": 3.2828086615883274, + "learning_rate": 1.2058577269847394e-05, + "loss": 1.6149, + "step": 4685 + }, + { + "epoch": 0.9020862911182231, + "grad_norm": 3.1134899536792666, + "learning_rate": 1.2055526052522132e-05, + "loss": 1.6555, + "step": 4686 + }, + { + "epoch": 0.9022787977957986, + "grad_norm": 2.9969836519682254, + "learning_rate": 1.2052474635373436e-05, + "loss": 1.6475, + "step": 4687 + }, + { + "epoch": 0.9024713044733739, + "grad_norm": 2.990386236690656, + "learning_rate": 1.204942301869795e-05, + "loss": 1.6913, + "step": 4688 + }, + { + "epoch": 0.9024713044733739, + "lm_loss": 1.898, + "step": 4688, + "vm_loss": 0.1723 + }, + { + "epoch": 0.9024713044733739, + "lm_loss": 1.1247, + "step": 4688, + "vm_loss": 0.1494 + }, + { + "epoch": 0.9024713044733739, + "lm_loss": 1.6261, + "step": 4688, + "vm_loss": 0.2368 + }, + { + "epoch": 0.9024713044733739, + "lm_loss": 2.0588, + "step": 4688, + "vm_loss": 0.1534 + }, + { + "epoch": 0.9024713044733739, + "lm_loss": 1.422, + "step": 4688, + "vm_loss": 0.1587 + }, + { + "epoch": 0.9024713044733739, + "lm_loss": 1.116, + "step": 4688, + "vm_loss": 0.1875 + }, + { + "epoch": 0.9024713044733739, + "lm_loss": 1.6583, + "step": 4688, + "vm_loss": 0.1358 + }, + { + "epoch": 0.9024713044733739, + "lm_loss": 2.0492, + "step": 4688, + "vm_loss": 0.1625 + }, + { + "epoch": 0.9026638111509493, + "grad_norm": 3.1122430373755576, + "learning_rate": 1.2046371202792326e-05, + "loss": 1.661, + "step": 4689 + }, + { + "epoch": 0.9028563178285247, + "grad_norm": 3.0778606178917554, + "learning_rate": 1.2043319187953242e-05, + "loss": 1.5298, + "step": 4690 + }, + { + "epoch": 0.9030488245061, + "grad_norm": 3.107991430557598, + "learning_rate": 1.204026697447739e-05, + "loss": 1.5297, + "step": 4691 + }, + { + "epoch": 0.9032413311836754, + "grad_norm": 3.1109362734329538, + "learning_rate": 1.2037214562661483e-05, + "loss": 1.6395, + "step": 4692 + }, + { + "epoch": 0.9034338378612509, + "grad_norm": 3.1776933412960404, + "learning_rate": 1.2034161952802259e-05, + "loss": 1.6126, + "step": 4693 + }, + { + "epoch": 0.9036263445388262, + "grad_norm": 3.1758622802049192, + "learning_rate": 1.2031109145196469e-05, + "loss": 1.6582, + "step": 4694 + }, + { + "epoch": 0.9038188512164016, + "grad_norm": 3.0104186612043264, + "learning_rate": 1.2028056140140885e-05, + "loss": 1.6123, + "step": 4695 + }, + { + "epoch": 0.9040113578939769, + "grad_norm": 3.047143656417212, + "learning_rate": 1.2025002937932298e-05, + "loss": 1.6624, + "step": 4696 + }, + { + "epoch": 0.9040113578939769, + "lm_loss": 1.1889, + "step": 4696, + "vm_loss": 0.2658 + }, + { + "epoch": 0.9040113578939769, + "lm_loss": 1.362, + "step": 4696, + "vm_loss": 0.1953 + }, + { + "epoch": 0.9040113578939769, + "lm_loss": 1.1682, + "step": 4696, + "vm_loss": 0.1783 + }, + { + "epoch": 0.9040113578939769, + "lm_loss": 1.6901, + "step": 4696, + "vm_loss": 0.1591 + }, + { + "epoch": 0.9040113578939769, + "lm_loss": 0.6715, + "step": 4696, + "vm_loss": 0.1095 + }, + { + "epoch": 0.9040113578939769, + "lm_loss": 1.5931, + "step": 4696, + "vm_loss": 0.1271 + }, + { + "epoch": 0.9040113578939769, + "lm_loss": 1.4708, + "step": 4696, + "vm_loss": 0.1952 + }, + { + "epoch": 0.9040113578939769, + "lm_loss": 1.3216, + "step": 4696, + "vm_loss": 0.1389 + }, + { + "epoch": 0.9042038645715523, + "grad_norm": 2.997997235687829, + "learning_rate": 1.2021949538867514e-05, + "loss": 1.5484, + "step": 4697 + }, + { + "epoch": 0.9043963712491278, + "grad_norm": 2.922027475024749, + "learning_rate": 1.2018895943243372e-05, + "loss": 1.5326, + "step": 4698 + }, + { + "epoch": 0.9045888779267031, + "grad_norm": 3.009076763808899, + "learning_rate": 1.2015842151356711e-05, + "loss": 1.5415, + "step": 4699 + }, + { + "epoch": 0.9047813846042785, + "grad_norm": 3.1825166629669934, + "learning_rate": 1.2012788163504405e-05, + "loss": 1.6531, + "step": 4700 + }, + { + "epoch": 0.9049738912818538, + "grad_norm": 3.0516416241714244, + "learning_rate": 1.2009733979983338e-05, + "loss": 1.5371, + "step": 4701 + }, + { + "epoch": 0.9051663979594292, + "grad_norm": 3.0795298313082937, + "learning_rate": 1.2006679601090415e-05, + "loss": 1.6377, + "step": 4702 + }, + { + "epoch": 0.9053589046370046, + "grad_norm": 3.242419716000162, + "learning_rate": 1.2003625027122562e-05, + "loss": 1.615, + "step": 4703 + }, + { + "epoch": 0.90555141131458, + "grad_norm": 3.0705280806131863, + "learning_rate": 1.2000570258376724e-05, + "loss": 1.5279, + "step": 4704 + }, + { + "epoch": 0.90555141131458, + "lm_loss": 2.1423, + "step": 4704, + "vm_loss": 0.1696 + }, + { + "epoch": 0.90555141131458, + "lm_loss": 1.7026, + "step": 4704, + "vm_loss": 0.1497 + }, + { + "epoch": 0.90555141131458, + "lm_loss": 1.4717, + "step": 4704, + "vm_loss": 0.1727 + }, + { + "epoch": 0.90555141131458, + "lm_loss": 1.9995, + "step": 4704, + "vm_loss": 0.1685 + }, + { + "epoch": 0.90555141131458, + "lm_loss": 1.612, + "step": 4704, + "vm_loss": 0.1801 + }, + { + "epoch": 0.90555141131458, + "lm_loss": 1.9819, + "step": 4704, + "vm_loss": 0.1248 + }, + { + "epoch": 0.90555141131458, + "lm_loss": 1.5629, + "step": 4704, + "vm_loss": 0.1346 + }, + { + "epoch": 0.90555141131458, + "lm_loss": 1.5307, + "step": 4704, + "vm_loss": 0.1528 + }, + { + "epoch": 0.9057439179921554, + "grad_norm": 3.2109250004982073, + "learning_rate": 1.1997515295149861e-05, + "loss": 1.6352, + "step": 4705 + }, + { + "epoch": 0.9059364246697307, + "grad_norm": 3.120886181270095, + "learning_rate": 1.1994460137738954e-05, + "loss": 1.6207, + "step": 4706 + }, + { + "epoch": 0.9061289313473061, + "grad_norm": 3.244670606071805, + "learning_rate": 1.1991404786441007e-05, + "loss": 1.6012, + "step": 4707 + }, + { + "epoch": 0.9063214380248815, + "grad_norm": 3.385320122090827, + "learning_rate": 1.1988349241553038e-05, + "loss": 1.6448, + "step": 4708 + }, + { + "epoch": 0.9065139447024568, + "grad_norm": 3.1747655067058607, + "learning_rate": 1.1985293503372085e-05, + "loss": 1.5946, + "step": 4709 + }, + { + "epoch": 0.9067064513800323, + "grad_norm": 2.932729733122712, + "learning_rate": 1.1982237572195203e-05, + "loss": 1.56, + "step": 4710 + }, + { + "epoch": 0.9068989580576077, + "grad_norm": 3.0855871978194744, + "learning_rate": 1.1979181448319469e-05, + "loss": 1.6133, + "step": 4711 + }, + { + "epoch": 0.907091464735183, + "grad_norm": 3.386528743654921, + "learning_rate": 1.1976125132041974e-05, + "loss": 1.7261, + "step": 4712 + }, + { + "epoch": 0.907091464735183, + "lm_loss": 1.0121, + "step": 4712, + "vm_loss": 0.1672 + }, + { + "epoch": 0.907091464735183, + "lm_loss": 1.5156, + "step": 4712, + "vm_loss": 0.1663 + }, + { + "epoch": 0.907091464735183, + "lm_loss": 1.7849, + "step": 4712, + "vm_loss": 0.203 + }, + { + "epoch": 0.907091464735183, + "lm_loss": 1.9754, + "step": 4712, + "vm_loss": 0.2184 + }, + { + "epoch": 0.907091464735183, + "lm_loss": 1.2805, + "step": 4712, + "vm_loss": 0.1522 + }, + { + "epoch": 0.907091464735183, + "lm_loss": 1.4139, + "step": 4712, + "vm_loss": 0.1744 + }, + { + "epoch": 0.907091464735183, + "lm_loss": 1.0879, + "step": 4712, + "vm_loss": 0.1844 + }, + { + "epoch": 0.907091464735183, + "lm_loss": 1.1509, + "step": 4712, + "vm_loss": 0.1408 + }, + { + "epoch": 0.9072839714127584, + "grad_norm": 3.199990428689835, + "learning_rate": 1.1973068623659839e-05, + "loss": 1.6055, + "step": 4713 + }, + { + "epoch": 0.9074764780903337, + "grad_norm": 3.018276693329278, + "learning_rate": 1.1970011923470189e-05, + "loss": 1.538, + "step": 4714 + }, + { + "epoch": 0.9076689847679091, + "grad_norm": 3.0560692992183722, + "learning_rate": 1.1966955031770176e-05, + "loss": 1.6219, + "step": 4715 + }, + { + "epoch": 0.9078614914454846, + "grad_norm": 3.0018417725136213, + "learning_rate": 1.196389794885697e-05, + "loss": 1.6411, + "step": 4716 + }, + { + "epoch": 0.9080539981230599, + "grad_norm": 3.1087391739088837, + "learning_rate": 1.1960840675027754e-05, + "loss": 1.6407, + "step": 4717 + }, + { + "epoch": 0.9082465048006353, + "grad_norm": 3.0829636796115616, + "learning_rate": 1.195778321057974e-05, + "loss": 1.6297, + "step": 4718 + }, + { + "epoch": 0.9084390114782106, + "grad_norm": 3.08512040251026, + "learning_rate": 1.1954725555810151e-05, + "loss": 1.5259, + "step": 4719 + }, + { + "epoch": 0.908631518155786, + "grad_norm": 3.1531988826165147, + "learning_rate": 1.1951667711016233e-05, + "loss": 1.5728, + "step": 4720 + }, + { + "epoch": 0.908631518155786, + "lm_loss": 1.7369, + "step": 4720, + "vm_loss": 0.1392 + }, + { + "epoch": 0.908631518155786, + "lm_loss": 1.233, + "step": 4720, + "vm_loss": 0.1648 + }, + { + "epoch": 0.908631518155786, + "lm_loss": 1.0099, + "step": 4720, + "vm_loss": 0.1276 + }, + { + "epoch": 0.908631518155786, + "lm_loss": 1.0384, + "step": 4720, + "vm_loss": 0.1313 + }, + { + "epoch": 0.908631518155786, + "lm_loss": 1.4658, + "step": 4720, + "vm_loss": 0.1597 + }, + { + "epoch": 0.908631518155786, + "lm_loss": 1.0697, + "step": 4720, + "vm_loss": 0.2073 + }, + { + "epoch": 0.908631518155786, + "lm_loss": 1.7965, + "step": 4720, + "vm_loss": 0.1356 + }, + { + "epoch": 0.908631518155786, + "lm_loss": 1.9753, + "step": 4720, + "vm_loss": 0.1867 + }, + { + "epoch": 0.9088240248333614, + "grad_norm": 3.325567806677158, + "learning_rate": 1.194860967649524e-05, + "loss": 1.5958, + "step": 4721 + }, + { + "epoch": 0.9090165315109368, + "grad_norm": 3.2019621959431266, + "learning_rate": 1.1945551452544458e-05, + "loss": 1.5239, + "step": 4722 + }, + { + "epoch": 0.9092090381885122, + "grad_norm": 3.309722748894992, + "learning_rate": 1.1942493039461185e-05, + "loss": 1.6618, + "step": 4723 + }, + { + "epoch": 0.9094015448660876, + "grad_norm": 3.3072880613639044, + "learning_rate": 1.1939434437542734e-05, + "loss": 1.6342, + "step": 4724 + }, + { + "epoch": 0.9095940515436629, + "grad_norm": 3.049943449060783, + "learning_rate": 1.1936375647086444e-05, + "loss": 1.6307, + "step": 4725 + }, + { + "epoch": 0.9097865582212383, + "grad_norm": 3.3162130404318253, + "learning_rate": 1.1933316668389669e-05, + "loss": 1.6491, + "step": 4726 + }, + { + "epoch": 0.9099790648988136, + "grad_norm": 3.223514727480635, + "learning_rate": 1.1930257501749777e-05, + "loss": 1.6575, + "step": 4727 + }, + { + "epoch": 0.9101715715763891, + "grad_norm": 2.977685932359518, + "learning_rate": 1.1927198147464163e-05, + "loss": 1.6196, + "step": 4728 + }, + { + "epoch": 0.9101715715763891, + "lm_loss": 1.7089, + "step": 4728, + "vm_loss": 0.1398 + }, + { + "epoch": 0.9101715715763891, + "lm_loss": 0.9011, + "step": 4728, + "vm_loss": 0.1399 + }, + { + "epoch": 0.9101715715763891, + "lm_loss": 1.3424, + "step": 4728, + "vm_loss": 0.1513 + }, + { + "epoch": 0.9101715715763891, + "lm_loss": 1.3399, + "step": 4728, + "vm_loss": 0.1905 + }, + { + "epoch": 0.9101715715763891, + "lm_loss": 1.7321, + "step": 4728, + "vm_loss": 0.1748 + }, + { + "epoch": 0.9101715715763891, + "lm_loss": 1.7444, + "step": 4728, + "vm_loss": 0.1798 + }, + { + "epoch": 0.9101715715763891, + "lm_loss": 1.1951, + "step": 4728, + "vm_loss": 0.1741 + }, + { + "epoch": 0.9101715715763891, + "lm_loss": 1.9315, + "step": 4728, + "vm_loss": 0.1422 + }, + { + "epoch": 0.9103640782539645, + "grad_norm": 3.1635544184550577, + "learning_rate": 1.1924138605830231e-05, + "loss": 1.5845, + "step": 4729 + }, + { + "epoch": 0.9105565849315398, + "grad_norm": 3.021581326141417, + "learning_rate": 1.1921078877145414e-05, + "loss": 1.5498, + "step": 4730 + }, + { + "epoch": 0.9107490916091152, + "grad_norm": 3.434839780280532, + "learning_rate": 1.1918018961707147e-05, + "loss": 1.6179, + "step": 4731 + }, + { + "epoch": 0.9109415982866905, + "grad_norm": 3.2198833407364114, + "learning_rate": 1.1914958859812901e-05, + "loss": 1.5567, + "step": 4732 + }, + { + "epoch": 0.911134104964266, + "grad_norm": 3.1224098506504623, + "learning_rate": 1.1911898571760155e-05, + "loss": 1.5817, + "step": 4733 + }, + { + "epoch": 0.9113266116418414, + "grad_norm": 3.0629613940052614, + "learning_rate": 1.1908838097846404e-05, + "loss": 1.6571, + "step": 4734 + }, + { + "epoch": 0.9115191183194167, + "grad_norm": 3.1317016752153846, + "learning_rate": 1.1905777438369175e-05, + "loss": 1.6147, + "step": 4735 + }, + { + "epoch": 0.9117116249969921, + "grad_norm": 3.05355189921111, + "learning_rate": 1.1902716593625991e-05, + "loss": 1.6229, + "step": 4736 + }, + { + "epoch": 0.9117116249969921, + "lm_loss": 1.5109, + "step": 4736, + "vm_loss": 0.0979 + }, + { + "epoch": 0.9117116249969921, + "lm_loss": 1.4057, + "step": 4736, + "vm_loss": 0.2678 + }, + { + "epoch": 0.9117116249969921, + "lm_loss": 1.2949, + "step": 4736, + "vm_loss": 0.1078 + }, + { + "epoch": 0.9117116249969921, + "lm_loss": 1.5492, + "step": 4736, + "vm_loss": 0.2187 + }, + { + "epoch": 0.9117116249969921, + "lm_loss": 0.9376, + "step": 4736, + "vm_loss": 0.1047 + }, + { + "epoch": 0.9117116249969921, + "lm_loss": 1.8476, + "step": 4736, + "vm_loss": 0.1477 + }, + { + "epoch": 0.9117116249969921, + "lm_loss": 1.7221, + "step": 4736, + "vm_loss": 0.1849 + }, + { + "epoch": 0.9117116249969921, + "lm_loss": 1.4249, + "step": 4736, + "vm_loss": 0.2073 + }, + { + "epoch": 0.9119041316745674, + "grad_norm": 3.078609919769739, + "learning_rate": 1.1899655563914413e-05, + "loss": 1.5567, + "step": 4737 + }, + { + "epoch": 0.9120966383521428, + "grad_norm": 3.1494639008782683, + "learning_rate": 1.1896594349532014e-05, + "loss": 1.6325, + "step": 4738 + }, + { + "epoch": 0.9122891450297183, + "grad_norm": 3.1079321919728846, + "learning_rate": 1.1893532950776373e-05, + "loss": 1.5214, + "step": 4739 + }, + { + "epoch": 0.9124816517072936, + "grad_norm": 3.296226850772759, + "learning_rate": 1.1890471367945108e-05, + "loss": 1.5918, + "step": 4740 + }, + { + "epoch": 0.912674158384869, + "grad_norm": 3.314991454319871, + "learning_rate": 1.188740960133584e-05, + "loss": 1.6301, + "step": 4741 + }, + { + "epoch": 0.9128666650624444, + "grad_norm": 3.304338307960673, + "learning_rate": 1.1884347651246214e-05, + "loss": 1.6048, + "step": 4742 + }, + { + "epoch": 0.9130591717400197, + "grad_norm": 3.3300366135285278, + "learning_rate": 1.1881285517973885e-05, + "loss": 1.6116, + "step": 4743 + }, + { + "epoch": 0.9132516784175951, + "grad_norm": 3.163511130898148, + "learning_rate": 1.1878223201816534e-05, + "loss": 1.685, + "step": 4744 + }, + { + "epoch": 0.9132516784175951, + "lm_loss": 1.419, + "step": 4744, + "vm_loss": 0.169 + }, + { + "epoch": 0.9132516784175951, + "lm_loss": 1.1975, + "step": 4744, + "vm_loss": 0.1641 + }, + { + "epoch": 0.9132516784175951, + "lm_loss": 1.4845, + "step": 4744, + "vm_loss": 0.1448 + }, + { + "epoch": 0.9132516784175951, + "lm_loss": 1.5721, + "step": 4744, + "vm_loss": 0.197 + }, + { + "epoch": 0.9132516784175951, + "lm_loss": 1.3634, + "step": 4744, + "vm_loss": 0.1883 + }, + { + "epoch": 0.9132516784175951, + "lm_loss": 1.2942, + "step": 4744, + "vm_loss": 0.1767 + }, + { + "epoch": 0.9132516784175951, + "lm_loss": 1.5083, + "step": 4744, + "vm_loss": 0.161 + }, + { + "epoch": 0.9132516784175951, + "lm_loss": 1.0931, + "step": 4744, + "vm_loss": 0.2037 + }, + { + "epoch": 0.9134441850951704, + "grad_norm": 3.096686639377927, + "learning_rate": 1.187516070307186e-05, + "loss": 1.64, + "step": 4745 + }, + { + "epoch": 0.9136366917727459, + "grad_norm": 2.9079387174667533, + "learning_rate": 1.1872098022037575e-05, + "loss": 1.5758, + "step": 4746 + }, + { + "epoch": 0.9138291984503213, + "grad_norm": 3.061758563851062, + "learning_rate": 1.186903515901141e-05, + "loss": 1.5934, + "step": 4747 + }, + { + "epoch": 0.9140217051278966, + "grad_norm": 3.1823371658554103, + "learning_rate": 1.1865972114291116e-05, + "loss": 1.5801, + "step": 4748 + }, + { + "epoch": 0.914214211805472, + "grad_norm": 3.484095623995338, + "learning_rate": 1.1862908888174462e-05, + "loss": 1.6215, + "step": 4749 + }, + { + "epoch": 0.9144067184830473, + "grad_norm": 3.3834394171611537, + "learning_rate": 1.1859845480959229e-05, + "loss": 1.5714, + "step": 4750 + }, + { + "epoch": 0.9145992251606228, + "grad_norm": 3.107808325801354, + "learning_rate": 1.1856781892943218e-05, + "loss": 1.546, + "step": 4751 + }, + { + "epoch": 0.9147917318381982, + "grad_norm": 3.092399057497426, + "learning_rate": 1.1853718124424258e-05, + "loss": 1.6522, + "step": 4752 + }, + { + "epoch": 0.9147917318381982, + "lm_loss": 1.3328, + "step": 4752, + "vm_loss": 0.141 + }, + { + "epoch": 0.9147917318381982, + "lm_loss": 1.3626, + "step": 4752, + "vm_loss": 0.1956 + }, + { + "epoch": 0.9147917318381982, + "lm_loss": 1.2497, + "step": 4752, + "vm_loss": 0.2243 + }, + { + "epoch": 0.9147917318381982, + "lm_loss": 1.4696, + "step": 4752, + "vm_loss": 0.1715 + }, + { + "epoch": 0.9147917318381982, + "lm_loss": 1.528, + "step": 4752, + "vm_loss": 0.1683 + }, + { + "epoch": 0.9147917318381982, + "lm_loss": 2.1192, + "step": 4752, + "vm_loss": 0.1444 + }, + { + "epoch": 0.9147917318381982, + "lm_loss": 1.4258, + "step": 4752, + "vm_loss": 0.1793 + }, + { + "epoch": 0.9147917318381982, + "lm_loss": 1.5073, + "step": 4752, + "vm_loss": 0.1459 + }, + { + "epoch": 0.9149842385157735, + "grad_norm": 3.118080759115116, + "learning_rate": 1.1850654175700175e-05, + "loss": 1.5939, + "step": 4753 + }, + { + "epoch": 0.9151767451933489, + "grad_norm": 3.0603491941346945, + "learning_rate": 1.1847590047068834e-05, + "loss": 1.5756, + "step": 4754 + }, + { + "epoch": 0.9153692518709243, + "grad_norm": 3.054071426864371, + "learning_rate": 1.1844525738828098e-05, + "loss": 1.5916, + "step": 4755 + }, + { + "epoch": 0.9155617585484996, + "grad_norm": 3.1115514042059838, + "learning_rate": 1.1841461251275868e-05, + "loss": 1.629, + "step": 4756 + }, + { + "epoch": 0.9157542652260751, + "grad_norm": 3.156577782343747, + "learning_rate": 1.183839658471004e-05, + "loss": 1.5943, + "step": 4757 + }, + { + "epoch": 0.9159467719036504, + "grad_norm": 2.96346855001217, + "learning_rate": 1.1835331739428545e-05, + "loss": 1.5604, + "step": 4758 + }, + { + "epoch": 0.9161392785812258, + "grad_norm": 3.0608833534009663, + "learning_rate": 1.1832266715729324e-05, + "loss": 1.5823, + "step": 4759 + }, + { + "epoch": 0.9163317852588012, + "grad_norm": 3.213660262891851, + "learning_rate": 1.1829201513910339e-05, + "loss": 1.5654, + "step": 4760 + }, + { + "epoch": 0.9163317852588012, + "lm_loss": 0.971, + "step": 4760, + "vm_loss": 0.1798 + }, + { + "epoch": 0.9163317852588012, + "lm_loss": 1.468, + "step": 4760, + "vm_loss": 0.1915 + }, + { + "epoch": 0.9163317852588012, + "lm_loss": 1.0862, + "step": 4760, + "vm_loss": 0.1882 + }, + { + "epoch": 0.9163317852588012, + "lm_loss": 1.3217, + "step": 4760, + "vm_loss": 0.2288 + }, + { + "epoch": 0.9163317852588012, + "lm_loss": 1.7517, + "step": 4760, + "vm_loss": 0.1653 + }, + { + "epoch": 0.9163317852588012, + "lm_loss": 1.3305, + "step": 4760, + "vm_loss": 0.1513 + }, + { + "epoch": 0.9163317852588012, + "lm_loss": 1.3218, + "step": 4760, + "vm_loss": 0.1109 + }, + { + "epoch": 0.9163317852588012, + "lm_loss": 1.4472, + "step": 4760, + "vm_loss": 0.212 + }, + { + "epoch": 0.9165242919363765, + "grad_norm": 3.111033909598755, + "learning_rate": 1.182613613426956e-05, + "loss": 1.555, + "step": 4761 + }, + { + "epoch": 0.9167167986139519, + "grad_norm": 3.0694053880828096, + "learning_rate": 1.1823070577104989e-05, + "loss": 1.6144, + "step": 4762 + }, + { + "epoch": 0.9169093052915273, + "grad_norm": 3.0911651853895354, + "learning_rate": 1.1820004842714634e-05, + "loss": 1.6094, + "step": 4763 + }, + { + "epoch": 0.9171018119691027, + "grad_norm": 3.1440997607893215, + "learning_rate": 1.1816938931396525e-05, + "loss": 1.5272, + "step": 4764 + }, + { + "epoch": 0.9172943186466781, + "grad_norm": 3.2112795550665725, + "learning_rate": 1.18138728434487e-05, + "loss": 1.619, + "step": 4765 + }, + { + "epoch": 0.9174868253242534, + "grad_norm": 3.037661841808173, + "learning_rate": 1.1810806579169235e-05, + "loss": 1.6242, + "step": 4766 + }, + { + "epoch": 0.9176793320018288, + "grad_norm": 3.0503935184502766, + "learning_rate": 1.1807740138856203e-05, + "loss": 1.6127, + "step": 4767 + }, + { + "epoch": 0.9178718386794041, + "grad_norm": 3.013279554216417, + "learning_rate": 1.1804673522807699e-05, + "loss": 1.5879, + "step": 4768 + }, + { + "epoch": 0.9178718386794041, + "lm_loss": 1.0693, + "step": 4768, + "vm_loss": 0.1773 + }, + { + "epoch": 0.9178718386794041, + "lm_loss": 0.9161, + "step": 4768, + "vm_loss": 0.1714 + }, + { + "epoch": 0.9178718386794041, + "lm_loss": 1.3118, + "step": 4768, + "vm_loss": 0.1186 + }, + { + "epoch": 0.9178718386794041, + "lm_loss": 1.497, + "step": 4768, + "vm_loss": 0.2291 + }, + { + "epoch": 0.9178718386794041, + "lm_loss": 1.3785, + "step": 4768, + "vm_loss": 0.1443 + }, + { + "epoch": 0.9178718386794041, + "lm_loss": 1.2063, + "step": 4768, + "vm_loss": 0.1114 + }, + { + "epoch": 0.9178718386794041, + "lm_loss": 1.3182, + "step": 4768, + "vm_loss": 0.1141 + }, + { + "epoch": 0.9178718386794041, + "lm_loss": 1.4246, + "step": 4768, + "vm_loss": 0.1194 + }, + { + "epoch": 0.9180643453569796, + "grad_norm": 3.133762994320165, + "learning_rate": 1.180160673132184e-05, + "loss": 1.581, + "step": 4769 + }, + { + "epoch": 0.918256852034555, + "grad_norm": 3.272012218669178, + "learning_rate": 1.1798539764696756e-05, + "loss": 1.6411, + "step": 4770 + }, + { + "epoch": 0.9184493587121303, + "grad_norm": 3.183984436062204, + "learning_rate": 1.17954726232306e-05, + "loss": 1.6584, + "step": 4771 + }, + { + "epoch": 0.9186418653897057, + "grad_norm": 3.1608099526418023, + "learning_rate": 1.1792405307221535e-05, + "loss": 1.6111, + "step": 4772 + }, + { + "epoch": 0.9188343720672811, + "grad_norm": 3.094614310245401, + "learning_rate": 1.1789337816967744e-05, + "loss": 1.6069, + "step": 4773 + }, + { + "epoch": 0.9190268787448564, + "grad_norm": 3.113612205834525, + "learning_rate": 1.178627015276742e-05, + "loss": 1.6155, + "step": 4774 + }, + { + "epoch": 0.9192193854224319, + "grad_norm": 3.096776503243558, + "learning_rate": 1.1783202314918787e-05, + "loss": 1.5997, + "step": 4775 + }, + { + "epoch": 0.9194118921000072, + "grad_norm": 3.1498093917458228, + "learning_rate": 1.1780134303720074e-05, + "loss": 1.5243, + "step": 4776 + }, + { + "epoch": 0.9194118921000072, + "lm_loss": 1.2499, + "step": 4776, + "vm_loss": 0.1547 + }, + { + "epoch": 0.9194118921000072, + "lm_loss": 1.0967, + "step": 4776, + "vm_loss": 0.1676 + }, + { + "epoch": 0.9194118921000072, + "lm_loss": 1.3902, + "step": 4776, + "vm_loss": 0.1594 + }, + { + "epoch": 0.9194118921000072, + "lm_loss": 1.7551, + "step": 4776, + "vm_loss": 0.2257 + }, + { + "epoch": 0.9194118921000072, + "lm_loss": 0.8441, + "step": 4776, + "vm_loss": 0.2517 + }, + { + "epoch": 0.9194118921000072, + "lm_loss": 1.4534, + "step": 4776, + "vm_loss": 0.1763 + }, + { + "epoch": 0.9194118921000072, + "lm_loss": 1.457, + "step": 4776, + "vm_loss": 0.1164 + }, + { + "epoch": 0.9194118921000072, + "lm_loss": 0.8831, + "step": 4776, + "vm_loss": 0.1975 + }, + { + "epoch": 0.9196043987775826, + "grad_norm": 3.0232594043394485, + "learning_rate": 1.1777066119469533e-05, + "loss": 1.6018, + "step": 4777 + }, + { + "epoch": 0.919796905455158, + "grad_norm": 3.300903977480852, + "learning_rate": 1.177399776246543e-05, + "loss": 1.508, + "step": 4778 + }, + { + "epoch": 0.9199894121327333, + "grad_norm": 3.184781032146618, + "learning_rate": 1.1770929233006047e-05, + "loss": 1.5742, + "step": 4779 + }, + { + "epoch": 0.9201819188103088, + "grad_norm": 3.269287148303652, + "learning_rate": 1.1767860531389686e-05, + "loss": 1.5955, + "step": 4780 + }, + { + "epoch": 0.9203744254878841, + "grad_norm": 3.2165907530184694, + "learning_rate": 1.1764791657914664e-05, + "loss": 1.6458, + "step": 4781 + }, + { + "epoch": 0.9205669321654595, + "grad_norm": 3.0867529396215097, + "learning_rate": 1.1761722612879317e-05, + "loss": 1.5739, + "step": 4782 + }, + { + "epoch": 0.9207594388430349, + "grad_norm": 3.060016522500291, + "learning_rate": 1.1758653396581991e-05, + "loss": 1.643, + "step": 4783 + }, + { + "epoch": 0.9209519455206102, + "grad_norm": 3.029460367165218, + "learning_rate": 1.1755584009321055e-05, + "loss": 1.5523, + "step": 4784 + }, + { + "epoch": 0.9209519455206102, + "lm_loss": 1.4024, + "step": 4784, + "vm_loss": 0.1524 + }, + { + "epoch": 0.9209519455206102, + "lm_loss": 1.754, + "step": 4784, + "vm_loss": 0.1331 + }, + { + "epoch": 0.9209519455206102, + "lm_loss": 1.55, + "step": 4784, + "vm_loss": 0.1603 + }, + { + "epoch": 0.9209519455206102, + "lm_loss": 1.4865, + "step": 4784, + "vm_loss": 0.1827 + }, + { + "epoch": 0.9209519455206102, + "lm_loss": 1.0225, + "step": 4784, + "vm_loss": 0.1888 + }, + { + "epoch": 0.9209519455206102, + "lm_loss": 1.6498, + "step": 4784, + "vm_loss": 0.206 + }, + { + "epoch": 0.9209519455206102, + "lm_loss": 1.1726, + "step": 4784, + "vm_loss": 0.1841 + }, + { + "epoch": 0.9209519455206102, + "lm_loss": 1.3623, + "step": 4784, + "vm_loss": 0.1587 + }, + { + "epoch": 0.9211444521981856, + "grad_norm": 2.913313625258337, + "learning_rate": 1.1752514451394894e-05, + "loss": 1.6203, + "step": 4785 + }, + { + "epoch": 0.9213369588757611, + "grad_norm": 3.166308522409298, + "learning_rate": 1.1749444723101907e-05, + "loss": 1.5781, + "step": 4786 + }, + { + "epoch": 0.9215294655533364, + "grad_norm": 3.216965026399961, + "learning_rate": 1.1746374824740508e-05, + "loss": 1.5486, + "step": 4787 + }, + { + "epoch": 0.9217219722309118, + "grad_norm": 3.1735804589544108, + "learning_rate": 1.1743304756609133e-05, + "loss": 1.5616, + "step": 4788 + }, + { + "epoch": 0.9219144789084871, + "grad_norm": 3.2929616053865858, + "learning_rate": 1.1740234519006238e-05, + "loss": 1.5232, + "step": 4789 + }, + { + "epoch": 0.9221069855860625, + "grad_norm": 3.0941134779164434, + "learning_rate": 1.173716411223028e-05, + "loss": 1.5588, + "step": 4790 + }, + { + "epoch": 0.9222994922636379, + "grad_norm": 3.2018675263601333, + "learning_rate": 1.1734093536579742e-05, + "loss": 1.6539, + "step": 4791 + }, + { + "epoch": 0.9224919989412133, + "grad_norm": 3.1293930189346195, + "learning_rate": 1.173102279235313e-05, + "loss": 1.534, + "step": 4792 + }, + { + "epoch": 0.9224919989412133, + "lm_loss": 1.4114, + "step": 4792, + "vm_loss": 0.1996 + }, + { + "epoch": 0.9224919989412133, + "lm_loss": 0.817, + "step": 4792, + "vm_loss": 0.1847 + }, + { + "epoch": 0.9224919989412133, + "lm_loss": 1.6947, + "step": 4792, + "vm_loss": 0.1394 + }, + { + "epoch": 0.9224919989412133, + "lm_loss": 1.7196, + "step": 4792, + "vm_loss": 0.1193 + }, + { + "epoch": 0.9224919989412133, + "lm_loss": 1.0684, + "step": 4792, + "vm_loss": 0.0944 + }, + { + "epoch": 0.9224919989412133, + "lm_loss": 1.5546, + "step": 4792, + "vm_loss": 0.1675 + }, + { + "epoch": 0.9224919989412133, + "lm_loss": 1.1797, + "step": 4792, + "vm_loss": 0.1626 + }, + { + "epoch": 0.9224919989412133, + "lm_loss": 1.5089, + "step": 4792, + "vm_loss": 0.1933 + }, + { + "epoch": 0.9226845056187887, + "grad_norm": 3.2066999457012457, + "learning_rate": 1.172795187984896e-05, + "loss": 1.5393, + "step": 4793 + }, + { + "epoch": 0.922877012296364, + "grad_norm": 3.2279126103638798, + "learning_rate": 1.1724880799365757e-05, + "loss": 1.6209, + "step": 4794 + }, + { + "epoch": 0.9230695189739394, + "grad_norm": 3.0127613118616288, + "learning_rate": 1.1721809551202072e-05, + "loss": 1.532, + "step": 4795 + }, + { + "epoch": 0.9232620256515148, + "grad_norm": 3.215697232902078, + "learning_rate": 1.1718738135656471e-05, + "loss": 1.6465, + "step": 4796 + }, + { + "epoch": 0.9234545323290901, + "grad_norm": 3.3188923915044146, + "learning_rate": 1.1715666553027534e-05, + "loss": 1.5811, + "step": 4797 + }, + { + "epoch": 0.9236470390066656, + "grad_norm": 3.1733251299750815, + "learning_rate": 1.171259480361386e-05, + "loss": 1.5342, + "step": 4798 + }, + { + "epoch": 0.9238395456842409, + "grad_norm": 3.1543324824781265, + "learning_rate": 1.1709522887714058e-05, + "loss": 1.5265, + "step": 4799 + }, + { + "epoch": 0.9240320523618163, + "grad_norm": 3.2246702387975525, + "learning_rate": 1.1706450805626762e-05, + "loss": 1.5997, + "step": 4800 + }, + { + "epoch": 0.9240320523618163, + "lm_loss": 1.4951, + "step": 4800, + "vm_loss": 0.1984 + }, + { + "epoch": 0.9240320523618163, + "lm_loss": 1.3107, + "step": 4800, + "vm_loss": 0.2102 + }, + { + "epoch": 0.9240320523618163, + "lm_loss": 1.4419, + "step": 4800, + "vm_loss": 0.2452 + }, + { + "epoch": 0.9240320523618163, + "lm_loss": 1.6896, + "step": 4800, + "vm_loss": 0.1901 + }, + { + "epoch": 0.9240320523618163, + "lm_loss": 1.3192, + "step": 4800, + "vm_loss": 0.1787 + }, + { + "epoch": 0.9240320523618163, + "lm_loss": 0.8301, + "step": 4800, + "vm_loss": 0.2143 + }, + { + "epoch": 0.9240320523618163, + "lm_loss": 1.6884, + "step": 4800, + "vm_loss": 0.2358 + }, + { + "epoch": 0.9240320523618163, + "lm_loss": 1.5382, + "step": 4800, + "vm_loss": 0.1249 + }, + { + "epoch": 0.9242245590393917, + "grad_norm": 3.1085704583271836, + "learning_rate": 1.1703378557650612e-05, + "loss": 1.6157, + "step": 4801 + }, + { + "epoch": 0.924417065716967, + "grad_norm": 3.2843450494121336, + "learning_rate": 1.1700306144084279e-05, + "loss": 1.6467, + "step": 4802 + }, + { + "epoch": 0.9246095723945424, + "grad_norm": 3.1540049787069013, + "learning_rate": 1.1697233565226433e-05, + "loss": 1.6082, + "step": 4803 + }, + { + "epoch": 0.9248020790721179, + "grad_norm": 3.0073162271969256, + "learning_rate": 1.1694160821375768e-05, + "loss": 1.4865, + "step": 4804 + }, + { + "epoch": 0.9249945857496932, + "grad_norm": 3.109294811338232, + "learning_rate": 1.1691087912831004e-05, + "loss": 1.5301, + "step": 4805 + }, + { + "epoch": 0.9251870924272686, + "grad_norm": 3.2121240194224856, + "learning_rate": 1.168801483989085e-05, + "loss": 1.6176, + "step": 4806 + }, + { + "epoch": 0.9253795991048439, + "grad_norm": 3.3680376309864997, + "learning_rate": 1.1684941602854065e-05, + "loss": 1.596, + "step": 4807 + }, + { + "epoch": 0.9255721057824193, + "grad_norm": 3.156682359555501, + "learning_rate": 1.1681868202019397e-05, + "loss": 1.5878, + "step": 4808 + }, + { + "epoch": 0.9255721057824193, + "lm_loss": 1.8091, + "step": 4808, + "vm_loss": 0.2077 + }, + { + "epoch": 0.9255721057824193, + "lm_loss": 1.4879, + "step": 4808, + "vm_loss": 0.237 + }, + { + "epoch": 0.9255721057824193, + "lm_loss": 1.499, + "step": 4808, + "vm_loss": 0.1767 + }, + { + "epoch": 0.9255721057824193, + "lm_loss": 1.3911, + "step": 4808, + "vm_loss": 0.1431 + }, + { + "epoch": 0.9255721057824193, + "lm_loss": 1.0723, + "step": 4808, + "vm_loss": 0.1627 + }, + { + "epoch": 0.9255721057824193, + "lm_loss": 1.2265, + "step": 4808, + "vm_loss": 0.212 + }, + { + "epoch": 0.9255721057824193, + "lm_loss": 1.7926, + "step": 4808, + "vm_loss": 0.1686 + }, + { + "epoch": 0.9255721057824193, + "lm_loss": 1.7457, + "step": 4808, + "vm_loss": 0.1873 + }, + { + "epoch": 0.9257646124599948, + "grad_norm": 3.0406175944253895, + "learning_rate": 1.167879463768562e-05, + "loss": 1.6384, + "step": 4809 + }, + { + "epoch": 0.9259571191375701, + "grad_norm": 3.1469707634999065, + "learning_rate": 1.1675720910151529e-05, + "loss": 1.5512, + "step": 4810 + }, + { + "epoch": 0.9261496258151455, + "grad_norm": 3.0160858366002006, + "learning_rate": 1.1672647019715926e-05, + "loss": 1.5781, + "step": 4811 + }, + { + "epoch": 0.9263421324927208, + "grad_norm": 3.125319667772183, + "learning_rate": 1.1669572966677634e-05, + "loss": 1.5595, + "step": 4812 + }, + { + "epoch": 0.9265346391702962, + "grad_norm": 3.2166400520842777, + "learning_rate": 1.1666498751335487e-05, + "loss": 1.6011, + "step": 4813 + }, + { + "epoch": 0.9267271458478716, + "grad_norm": 3.180651338229441, + "learning_rate": 1.1663424373988341e-05, + "loss": 1.5731, + "step": 4814 + }, + { + "epoch": 0.926919652525447, + "grad_norm": 3.146645207159717, + "learning_rate": 1.166034983493507e-05, + "loss": 1.5267, + "step": 4815 + }, + { + "epoch": 0.9271121592030224, + "grad_norm": 3.479080493797216, + "learning_rate": 1.1657275134474547e-05, + "loss": 1.6216, + "step": 4816 + }, + { + "epoch": 0.9271121592030224, + "lm_loss": 1.814, + "step": 4816, + "vm_loss": 0.1702 + }, + { + "epoch": 0.9271121592030224, + "lm_loss": 1.2522, + "step": 4816, + "vm_loss": 0.2053 + }, + { + "epoch": 0.9271121592030224, + "lm_loss": 1.0369, + "step": 4816, + "vm_loss": 0.189 + }, + { + "epoch": 0.9271121592030224, + "lm_loss": 1.9353, + "step": 4816, + "vm_loss": 0.1617 + }, + { + "epoch": 0.9271121592030224, + "lm_loss": 1.3556, + "step": 4816, + "vm_loss": 0.1658 + }, + { + "epoch": 0.9271121592030224, + "lm_loss": 1.6112, + "step": 4816, + "vm_loss": 0.1815 + }, + { + "epoch": 0.9271121592030224, + "lm_loss": 1.7312, + "step": 4816, + "vm_loss": 0.2253 + }, + { + "epoch": 0.9271121592030224, + "lm_loss": 1.4574, + "step": 4816, + "vm_loss": 0.1931 + }, + { + "epoch": 0.9273046658805977, + "grad_norm": 3.2409739248579377, + "learning_rate": 1.165420027290568e-05, + "loss": 1.5664, + "step": 4817 + }, + { + "epoch": 0.9274971725581731, + "grad_norm": 3.153166675923825, + "learning_rate": 1.1651125250527385e-05, + "loss": 1.5624, + "step": 4818 + }, + { + "epoch": 0.9276896792357485, + "grad_norm": 3.05655847899456, + "learning_rate": 1.164805006763859e-05, + "loss": 1.5636, + "step": 4819 + }, + { + "epoch": 0.9278821859133238, + "grad_norm": 3.056913551557172, + "learning_rate": 1.1644974724538243e-05, + "loss": 1.4699, + "step": 4820 + }, + { + "epoch": 0.9280746925908993, + "grad_norm": 3.1345778526856387, + "learning_rate": 1.1641899221525306e-05, + "loss": 1.6289, + "step": 4821 + }, + { + "epoch": 0.9282671992684747, + "grad_norm": 3.0436407536550805, + "learning_rate": 1.1638823558898762e-05, + "loss": 1.5319, + "step": 4822 + }, + { + "epoch": 0.92845970594605, + "grad_norm": 3.135359867776693, + "learning_rate": 1.1635747736957598e-05, + "loss": 1.592, + "step": 4823 + }, + { + "epoch": 0.9286522126236254, + "grad_norm": 3.1211437400886526, + "learning_rate": 1.1632671756000831e-05, + "loss": 1.5556, + "step": 4824 + }, + { + "epoch": 0.9286522126236254, + "lm_loss": 1.7428, + "step": 4824, + "vm_loss": 0.1206 + }, + { + "epoch": 0.9286522126236254, + "lm_loss": 1.6523, + "step": 4824, + "vm_loss": 0.2443 + }, + { + "epoch": 0.9286522126236254, + "lm_loss": 1.4588, + "step": 4824, + "vm_loss": 0.1314 + }, + { + "epoch": 0.9286522126236254, + "lm_loss": 0.9358, + "step": 4824, + "vm_loss": 0.1458 + }, + { + "epoch": 0.9286522126236254, + "lm_loss": 1.2836, + "step": 4824, + "vm_loss": 0.1983 + }, + { + "epoch": 0.9286522126236254, + "lm_loss": 1.2809, + "step": 4824, + "vm_loss": 0.1598 + }, + { + "epoch": 0.9286522126236254, + "lm_loss": 2.1211, + "step": 4824, + "vm_loss": 0.1432 + }, + { + "epoch": 0.9286522126236254, + "lm_loss": 1.6074, + "step": 4824, + "vm_loss": 0.128 + }, + { + "epoch": 0.9288447193012007, + "grad_norm": 3.3336391041839124, + "learning_rate": 1.1629595616327479e-05, + "loss": 1.5579, + "step": 4825 + }, + { + "epoch": 0.9290372259787761, + "grad_norm": 3.2202924660440546, + "learning_rate": 1.1626519318236582e-05, + "loss": 1.536, + "step": 4826 + }, + { + "epoch": 0.9292297326563516, + "grad_norm": 3.138997658731568, + "learning_rate": 1.1623442862027204e-05, + "loss": 1.5365, + "step": 4827 + }, + { + "epoch": 0.9294222393339269, + "grad_norm": 2.917488871004586, + "learning_rate": 1.1620366247998404e-05, + "loss": 1.5782, + "step": 4828 + }, + { + "epoch": 0.9296147460115023, + "grad_norm": 2.9983484636208986, + "learning_rate": 1.1617289476449278e-05, + "loss": 1.5491, + "step": 4829 + }, + { + "epoch": 0.9298072526890776, + "grad_norm": 3.1563516285528626, + "learning_rate": 1.1614212547678923e-05, + "loss": 1.5611, + "step": 4830 + }, + { + "epoch": 0.929999759366653, + "grad_norm": 3.263707280856024, + "learning_rate": 1.1611135461986454e-05, + "loss": 1.5301, + "step": 4831 + }, + { + "epoch": 0.9301922660442284, + "grad_norm": 3.121231101363883, + "learning_rate": 1.1608058219671005e-05, + "loss": 1.6169, + "step": 4832 + }, + { + "epoch": 0.9301922660442284, + "lm_loss": 1.1342, + "step": 4832, + "vm_loss": 0.1469 + }, + { + "epoch": 0.9301922660442284, + "lm_loss": 1.4738, + "step": 4832, + "vm_loss": 0.2279 + }, + { + "epoch": 0.9301922660442284, + "lm_loss": 1.3081, + "step": 4832, + "vm_loss": 0.1039 + }, + { + "epoch": 0.9301922660442284, + "lm_loss": 1.1317, + "step": 4832, + "vm_loss": 0.1249 + }, + { + "epoch": 0.9301922660442284, + "lm_loss": 1.0903, + "step": 4832, + "vm_loss": 0.1815 + }, + { + "epoch": 0.9301922660442284, + "lm_loss": 1.1748, + "step": 4832, + "vm_loss": 0.2013 + }, + { + "epoch": 0.9301922660442284, + "lm_loss": 1.3884, + "step": 4832, + "vm_loss": 0.1401 + }, + { + "epoch": 0.9301922660442284, + "lm_loss": 1.3137, + "step": 4832, + "vm_loss": 0.2067 + }, + { + "epoch": 0.9303847727218038, + "grad_norm": 3.0644765110699805, + "learning_rate": 1.1604980821031726e-05, + "loss": 1.5475, + "step": 4833 + }, + { + "epoch": 0.9305772793993792, + "grad_norm": 3.1326240378006034, + "learning_rate": 1.160190326636778e-05, + "loss": 1.5892, + "step": 4834 + }, + { + "epoch": 0.9307697860769546, + "grad_norm": 3.2592622504085185, + "learning_rate": 1.1598825555978339e-05, + "loss": 1.5569, + "step": 4835 + }, + { + "epoch": 0.9309622927545299, + "grad_norm": 3.0310499672192215, + "learning_rate": 1.1595747690162601e-05, + "loss": 1.5168, + "step": 4836 + }, + { + "epoch": 0.9311547994321053, + "grad_norm": 3.062408033812157, + "learning_rate": 1.1592669669219771e-05, + "loss": 1.4959, + "step": 4837 + }, + { + "epoch": 0.9313473061096806, + "grad_norm": 3.2068415478092516, + "learning_rate": 1.1589591493449073e-05, + "loss": 1.5672, + "step": 4838 + }, + { + "epoch": 0.9315398127872561, + "grad_norm": 3.264087274465582, + "learning_rate": 1.1586513163149746e-05, + "loss": 1.5339, + "step": 4839 + }, + { + "epoch": 0.9317323194648315, + "grad_norm": 3.029278121521542, + "learning_rate": 1.158343467862104e-05, + "loss": 1.5309, + "step": 4840 + }, + { + "epoch": 0.9317323194648315, + "lm_loss": 1.5672, + "step": 4840, + "vm_loss": 0.1548 + }, + { + "epoch": 0.9317323194648315, + "lm_loss": 1.3623, + "step": 4840, + "vm_loss": 0.0931 + }, + { + "epoch": 0.9317323194648315, + "lm_loss": 1.633, + "step": 4840, + "vm_loss": 0.144 + }, + { + "epoch": 0.9317323194648315, + "lm_loss": 0.9986, + "step": 4840, + "vm_loss": 0.2033 + }, + { + "epoch": 0.9317323194648315, + "lm_loss": 1.2554, + "step": 4840, + "vm_loss": 0.1026 + }, + { + "epoch": 0.9317323194648315, + "lm_loss": 1.0123, + "step": 4840, + "vm_loss": 0.1055 + }, + { + "epoch": 0.9317323194648315, + "lm_loss": 1.3729, + "step": 4840, + "vm_loss": 0.1938 + }, + { + "epoch": 0.9317323194648315, + "lm_loss": 1.4821, + "step": 4840, + "vm_loss": 0.2007 + }, + { + "epoch": 0.9319248261424068, + "grad_norm": 3.063626137379955, + "learning_rate": 1.1580356040162228e-05, + "loss": 1.5395, + "step": 4841 + }, + { + "epoch": 0.9321173328199822, + "grad_norm": 3.0821042931468456, + "learning_rate": 1.1577277248072586e-05, + "loss": 1.4663, + "step": 4842 + }, + { + "epoch": 0.9323098394975575, + "grad_norm": 3.0416556570901956, + "learning_rate": 1.1574198302651421e-05, + "loss": 1.5184, + "step": 4843 + }, + { + "epoch": 0.932502346175133, + "grad_norm": 3.208528869496445, + "learning_rate": 1.1571119204198038e-05, + "loss": 1.5918, + "step": 4844 + }, + { + "epoch": 0.9326948528527084, + "grad_norm": 3.192967532999396, + "learning_rate": 1.156803995301177e-05, + "loss": 1.5279, + "step": 4845 + }, + { + "epoch": 0.9328873595302837, + "grad_norm": 3.1194253738676374, + "learning_rate": 1.1564960549391957e-05, + "loss": 1.4991, + "step": 4846 + }, + { + "epoch": 0.9330798662078591, + "grad_norm": 3.134893381699097, + "learning_rate": 1.1561880993637956e-05, + "loss": 1.531, + "step": 4847 + }, + { + "epoch": 0.9332723728854344, + "grad_norm": 3.050254221272552, + "learning_rate": 1.155880128604914e-05, + "loss": 1.5547, + "step": 4848 + }, + { + "epoch": 0.9332723728854344, + "lm_loss": 1.5866, + "step": 4848, + "vm_loss": 0.1845 + }, + { + "epoch": 0.9332723728854344, + "lm_loss": 1.1025, + "step": 4848, + "vm_loss": 0.13 + }, + { + "epoch": 0.9332723728854344, + "lm_loss": 1.2048, + "step": 4848, + "vm_loss": 0.1565 + }, + { + "epoch": 0.9332723728854344, + "lm_loss": 1.7769, + "step": 4848, + "vm_loss": 0.1386 + }, + { + "epoch": 0.9332723728854344, + "lm_loss": 1.4299, + "step": 4848, + "vm_loss": 0.1316 + }, + { + "epoch": 0.9332723728854344, + "lm_loss": 1.48, + "step": 4848, + "vm_loss": 0.1778 + }, + { + "epoch": 0.9332723728854344, + "lm_loss": 1.14, + "step": 4848, + "vm_loss": 0.1188 + }, + { + "epoch": 0.9332723728854344, + "lm_loss": 1.1291, + "step": 4848, + "vm_loss": 0.1442 + }, + { + "epoch": 0.9334648795630098, + "grad_norm": 3.1736928114218843, + "learning_rate": 1.1555721426924895e-05, + "loss": 1.5718, + "step": 4849 + }, + { + "epoch": 0.9336573862405853, + "grad_norm": 3.1994502425083837, + "learning_rate": 1.1552641416564626e-05, + "loss": 1.6227, + "step": 4850 + }, + { + "epoch": 0.9338498929181606, + "grad_norm": 3.0221815357792976, + "learning_rate": 1.1549561255267746e-05, + "loss": 1.5033, + "step": 4851 + }, + { + "epoch": 0.934042399595736, + "grad_norm": 3.164211916836797, + "learning_rate": 1.1546480943333687e-05, + "loss": 1.5602, + "step": 4852 + }, + { + "epoch": 0.9342349062733114, + "grad_norm": 3.1552740665362506, + "learning_rate": 1.1543400481061895e-05, + "loss": 1.5503, + "step": 4853 + }, + { + "epoch": 0.9344274129508867, + "grad_norm": 3.2818367107285193, + "learning_rate": 1.154031986875183e-05, + "loss": 1.6392, + "step": 4854 + }, + { + "epoch": 0.9346199196284621, + "grad_norm": 3.156085070543579, + "learning_rate": 1.1537239106702968e-05, + "loss": 1.4913, + "step": 4855 + }, + { + "epoch": 0.9348124263060374, + "grad_norm": 3.2001590277133745, + "learning_rate": 1.15341581952148e-05, + "loss": 1.5564, + "step": 4856 + }, + { + "epoch": 0.9348124263060374, + "lm_loss": 1.8592, + "step": 4856, + "vm_loss": 0.1813 + }, + { + "epoch": 0.9348124263060374, + "lm_loss": 1.4133, + "step": 4856, + "vm_loss": 0.1063 + }, + { + "epoch": 0.9348124263060374, + "lm_loss": 1.786, + "step": 4856, + "vm_loss": 0.1412 + }, + { + "epoch": 0.9348124263060374, + "lm_loss": 1.2183, + "step": 4856, + "vm_loss": 0.144 + }, + { + "epoch": 0.9348124263060374, + "lm_loss": 1.1631, + "step": 4856, + "vm_loss": 0.1327 + }, + { + "epoch": 0.9348124263060374, + "lm_loss": 1.0705, + "step": 4856, + "vm_loss": 0.1503 + }, + { + "epoch": 0.9348124263060374, + "lm_loss": 1.1934, + "step": 4856, + "vm_loss": 0.0829 + }, + { + "epoch": 0.9348124263060374, + "lm_loss": 1.7506, + "step": 4856, + "vm_loss": 0.1316 + }, + { + "epoch": 0.9350049329836129, + "grad_norm": 3.168393377303047, + "learning_rate": 1.1531077134586826e-05, + "loss": 1.497, + "step": 4857 + }, + { + "epoch": 0.9351974396611883, + "grad_norm": 3.0979508166853726, + "learning_rate": 1.1527995925118567e-05, + "loss": 1.58, + "step": 4858 + }, + { + "epoch": 0.9353899463387636, + "grad_norm": 3.2282771126891285, + "learning_rate": 1.1524914567109553e-05, + "loss": 1.457, + "step": 4859 + }, + { + "epoch": 0.935582453016339, + "grad_norm": 3.1004618219242768, + "learning_rate": 1.1521833060859338e-05, + "loss": 1.5539, + "step": 4860 + }, + { + "epoch": 0.9357749596939143, + "grad_norm": 3.165984828973839, + "learning_rate": 1.1518751406667477e-05, + "loss": 1.5899, + "step": 4861 + }, + { + "epoch": 0.9359674663714898, + "grad_norm": 3.0278481131988086, + "learning_rate": 1.1515669604833546e-05, + "loss": 1.5914, + "step": 4862 + }, + { + "epoch": 0.9361599730490652, + "grad_norm": 3.099379791712217, + "learning_rate": 1.1512587655657144e-05, + "loss": 1.4938, + "step": 4863 + }, + { + "epoch": 0.9363524797266405, + "grad_norm": 3.143625551503428, + "learning_rate": 1.1509505559437866e-05, + "loss": 1.5861, + "step": 4864 + }, + { + "epoch": 0.9363524797266405, + "lm_loss": 1.5652, + "step": 4864, + "vm_loss": 0.1897 + }, + { + "epoch": 0.9363524797266405, + "lm_loss": 1.3394, + "step": 4864, + "vm_loss": 0.1083 + }, + { + "epoch": 0.9363524797266405, + "lm_loss": 1.3699, + "step": 4864, + "vm_loss": 0.1076 + }, + { + "epoch": 0.9363524797266405, + "lm_loss": 1.7026, + "step": 4864, + "vm_loss": 0.1799 + }, + { + "epoch": 0.9363524797266405, + "lm_loss": 1.9699, + "step": 4864, + "vm_loss": 0.2315 + }, + { + "epoch": 0.9363524797266405, + "lm_loss": 1.6658, + "step": 4864, + "vm_loss": 0.1198 + }, + { + "epoch": 0.9363524797266405, + "lm_loss": 1.456, + "step": 4864, + "vm_loss": 0.2297 + }, + { + "epoch": 0.9363524797266405, + "lm_loss": 1.4579, + "step": 4864, + "vm_loss": 0.1636 + }, + { + "epoch": 0.9365449864042159, + "grad_norm": 3.0087095550939846, + "learning_rate": 1.1506423316475336e-05, + "loss": 1.6279, + "step": 4865 + }, + { + "epoch": 0.9367374930817913, + "grad_norm": 3.080304278923635, + "learning_rate": 1.1503340927069189e-05, + "loss": 1.6414, + "step": 4866 + }, + { + "epoch": 0.9369299997593666, + "grad_norm": 3.0244907233347624, + "learning_rate": 1.1500258391519068e-05, + "loss": 1.5328, + "step": 4867 + }, + { + "epoch": 0.9371225064369421, + "grad_norm": 3.0172207372915207, + "learning_rate": 1.149717571012464e-05, + "loss": 1.5867, + "step": 4868 + }, + { + "epoch": 0.9373150131145174, + "grad_norm": 3.171742951196958, + "learning_rate": 1.1494092883185574e-05, + "loss": 1.5866, + "step": 4869 + }, + { + "epoch": 0.9375075197920928, + "grad_norm": 3.0540213613615474, + "learning_rate": 1.1491009911001572e-05, + "loss": 1.4671, + "step": 4870 + }, + { + "epoch": 0.9377000264696682, + "grad_norm": 3.0893632063474312, + "learning_rate": 1.1487926793872326e-05, + "loss": 1.5704, + "step": 4871 + }, + { + "epoch": 0.9378925331472435, + "grad_norm": 3.1753938354348143, + "learning_rate": 1.1484843532097558e-05, + "loss": 1.5911, + "step": 4872 + }, + { + "epoch": 0.9378925331472435, + "lm_loss": 1.5414, + "step": 4872, + "vm_loss": 0.2056 + }, + { + "epoch": 0.9378925331472435, + "lm_loss": 1.4615, + "step": 4872, + "vm_loss": 0.1462 + }, + { + "epoch": 0.9378925331472435, + "lm_loss": 1.1818, + "step": 4872, + "vm_loss": 0.2019 + }, + { + "epoch": 0.9378925331472435, + "lm_loss": 0.9545, + "step": 4872, + "vm_loss": 0.1239 + }, + { + "epoch": 0.9378925331472435, + "lm_loss": 1.4708, + "step": 4872, + "vm_loss": 0.1512 + }, + { + "epoch": 0.9378925331472435, + "lm_loss": 1.3805, + "step": 4872, + "vm_loss": 0.1506 + }, + { + "epoch": 0.9378925331472435, + "lm_loss": 1.0126, + "step": 4872, + "vm_loss": 0.196 + }, + { + "epoch": 0.9378925331472435, + "lm_loss": 1.9353, + "step": 4872, + "vm_loss": 0.1852 + }, + { + "epoch": 0.938085039824819, + "grad_norm": 3.167477045695026, + "learning_rate": 1.1481760125977006e-05, + "loss": 1.5535, + "step": 4873 + }, + { + "epoch": 0.9382775465023943, + "grad_norm": 3.0832071739105547, + "learning_rate": 1.147867657581041e-05, + "loss": 1.5618, + "step": 4874 + }, + { + "epoch": 0.9384700531799697, + "grad_norm": 3.063530755053782, + "learning_rate": 1.1475592881897532e-05, + "loss": 1.5471, + "step": 4875 + }, + { + "epoch": 0.9386625598575451, + "grad_norm": 3.0586462970968653, + "learning_rate": 1.147250904453815e-05, + "loss": 1.5151, + "step": 4876 + }, + { + "epoch": 0.9388550665351204, + "grad_norm": 3.209239511768028, + "learning_rate": 1.1469425064032051e-05, + "loss": 1.6254, + "step": 4877 + }, + { + "epoch": 0.9390475732126958, + "grad_norm": 3.1461683265244993, + "learning_rate": 1.1466340940679033e-05, + "loss": 1.6053, + "step": 4878 + }, + { + "epoch": 0.9392400798902711, + "grad_norm": 3.2032952209867203, + "learning_rate": 1.1463256674778918e-05, + "loss": 1.5755, + "step": 4879 + }, + { + "epoch": 0.9394325865678466, + "grad_norm": 3.1882747477272666, + "learning_rate": 1.1460172266631532e-05, + "loss": 1.5079, + "step": 4880 + }, + { + "epoch": 0.9394325865678466, + "lm_loss": 1.0817, + "step": 4880, + "vm_loss": 0.1745 + }, + { + "epoch": 0.9394325865678466, + "lm_loss": 0.8867, + "step": 4880, + "vm_loss": 0.138 + }, + { + "epoch": 0.9394325865678466, + "lm_loss": 1.3817, + "step": 4880, + "vm_loss": 0.1508 + }, + { + "epoch": 0.9394325865678466, + "lm_loss": 1.8459, + "step": 4880, + "vm_loss": 0.1447 + }, + { + "epoch": 0.9394325865678466, + "lm_loss": 1.4768, + "step": 4880, + "vm_loss": 0.1671 + }, + { + "epoch": 0.9394325865678466, + "lm_loss": 1.5947, + "step": 4880, + "vm_loss": 0.1825 + }, + { + "epoch": 0.9394325865678466, + "lm_loss": 1.4836, + "step": 4880, + "vm_loss": 0.2103 + }, + { + "epoch": 0.9394325865678466, + "lm_loss": 1.4317, + "step": 4880, + "vm_loss": 0.1733 + }, + { + "epoch": 0.939625093245422, + "grad_norm": 3.0943695939502724, + "learning_rate": 1.145708771653672e-05, + "loss": 1.5843, + "step": 4881 + }, + { + "epoch": 0.9398175999229973, + "grad_norm": 3.05885073975982, + "learning_rate": 1.1454003024794344e-05, + "loss": 1.5338, + "step": 4882 + }, + { + "epoch": 0.9400101066005727, + "grad_norm": 3.283124191345047, + "learning_rate": 1.145091819170427e-05, + "loss": 1.5193, + "step": 4883 + }, + { + "epoch": 0.9402026132781481, + "grad_norm": 3.245001455358806, + "learning_rate": 1.1447833217566383e-05, + "loss": 1.6128, + "step": 4884 + }, + { + "epoch": 0.9403951199557234, + "grad_norm": 3.1659155142818802, + "learning_rate": 1.144474810268059e-05, + "loss": 1.4824, + "step": 4885 + }, + { + "epoch": 0.9405876266332989, + "grad_norm": 3.0214319883676155, + "learning_rate": 1.1441662847346794e-05, + "loss": 1.4376, + "step": 4886 + }, + { + "epoch": 0.9407801333108742, + "grad_norm": 3.1081845228797085, + "learning_rate": 1.1438577451864923e-05, + "loss": 1.537, + "step": 4887 + }, + { + "epoch": 0.9409726399884496, + "grad_norm": 3.0459142778145534, + "learning_rate": 1.1435491916534919e-05, + "loss": 1.6073, + "step": 4888 + }, + { + "epoch": 0.9409726399884496, + "lm_loss": 1.1136, + "step": 4888, + "vm_loss": 0.2038 + }, + { + "epoch": 0.9409726399884496, + "lm_loss": 1.8127, + "step": 4888, + "vm_loss": 0.1345 + }, + { + "epoch": 0.9409726399884496, + "lm_loss": 1.3128, + "step": 4888, + "vm_loss": 0.1613 + }, + { + "epoch": 0.9409726399884496, + "lm_loss": 0.8779, + "step": 4888, + "vm_loss": 0.1822 + }, + { + "epoch": 0.9409726399884496, + "lm_loss": 1.897, + "step": 4888, + "vm_loss": 0.191 + }, + { + "epoch": 0.9409726399884496, + "lm_loss": 1.0679, + "step": 4888, + "vm_loss": 0.2067 + }, + { + "epoch": 0.9409726399884496, + "lm_loss": 1.0779, + "step": 4888, + "vm_loss": 0.1063 + }, + { + "epoch": 0.9409726399884496, + "lm_loss": 1.1264, + "step": 4888, + "vm_loss": 0.1275 + }, + { + "epoch": 0.941165146666025, + "grad_norm": 3.1837593792149765, + "learning_rate": 1.1432406241656737e-05, + "loss": 1.4771, + "step": 4889 + }, + { + "epoch": 0.9413576533436003, + "grad_norm": 3.0957834234190247, + "learning_rate": 1.1429320427530343e-05, + "loss": 1.533, + "step": 4890 + }, + { + "epoch": 0.9415501600211758, + "grad_norm": 3.3753160246525225, + "learning_rate": 1.1426234474455718e-05, + "loss": 1.5847, + "step": 4891 + }, + { + "epoch": 0.9417426666987511, + "grad_norm": 3.2750259226281333, + "learning_rate": 1.1423148382732854e-05, + "loss": 1.5888, + "step": 4892 + }, + { + "epoch": 0.9419351733763265, + "grad_norm": 3.042623538360436, + "learning_rate": 1.142006215266176e-05, + "loss": 1.5139, + "step": 4893 + }, + { + "epoch": 0.9421276800539019, + "grad_norm": 3.0984437638331106, + "learning_rate": 1.1416975784542454e-05, + "loss": 1.587, + "step": 4894 + }, + { + "epoch": 0.9423201867314772, + "grad_norm": 3.1520703222636204, + "learning_rate": 1.1413889278674977e-05, + "loss": 1.5697, + "step": 4895 + }, + { + "epoch": 0.9425126934090526, + "grad_norm": 3.029448965933574, + "learning_rate": 1.1410802635359373e-05, + "loss": 1.4825, + "step": 4896 + }, + { + "epoch": 0.9425126934090526, + "lm_loss": 1.4535, + "step": 4896, + "vm_loss": 0.1198 + }, + { + "epoch": 0.9425126934090526, + "lm_loss": 1.6739, + "step": 4896, + "vm_loss": 0.1418 + }, + { + "epoch": 0.9425126934090526, + "lm_loss": 1.1151, + "step": 4896, + "vm_loss": 0.218 + }, + { + "epoch": 0.9425126934090526, + "lm_loss": 1.7162, + "step": 4896, + "vm_loss": 0.1817 + }, + { + "epoch": 0.9425126934090526, + "lm_loss": 1.3903, + "step": 4896, + "vm_loss": 0.1856 + }, + { + "epoch": 0.9425126934090526, + "lm_loss": 0.9864, + "step": 4896, + "vm_loss": 0.2469 + }, + { + "epoch": 0.9425126934090526, + "lm_loss": 1.5197, + "step": 4896, + "vm_loss": 0.1826 + }, + { + "epoch": 0.9425126934090526, + "lm_loss": 1.5114, + "step": 4896, + "vm_loss": 0.1444 + }, + { + "epoch": 0.9427052000866281, + "grad_norm": 3.0435527137690266, + "learning_rate": 1.14077158548957e-05, + "loss": 1.6032, + "step": 4897 + }, + { + "epoch": 0.9428977067642034, + "grad_norm": 3.162895857751957, + "learning_rate": 1.1404628937584036e-05, + "loss": 1.5725, + "step": 4898 + }, + { + "epoch": 0.9430902134417788, + "grad_norm": 3.143355252823054, + "learning_rate": 1.1401541883724474e-05, + "loss": 1.5566, + "step": 4899 + }, + { + "epoch": 0.9432827201193541, + "grad_norm": 3.001866948364369, + "learning_rate": 1.1398454693617107e-05, + "loss": 1.5556, + "step": 4900 + }, + { + "epoch": 0.9434752267969295, + "grad_norm": 3.311727949517917, + "learning_rate": 1.139536736756205e-05, + "loss": 1.6002, + "step": 4901 + }, + { + "epoch": 0.9436677334745049, + "grad_norm": 3.148587097344572, + "learning_rate": 1.1392279905859438e-05, + "loss": 1.5546, + "step": 4902 + }, + { + "epoch": 0.9438602401520803, + "grad_norm": 3.2665995722471854, + "learning_rate": 1.1389192308809402e-05, + "loss": 1.521, + "step": 4903 + }, + { + "epoch": 0.9440527468296557, + "grad_norm": 3.144212409101037, + "learning_rate": 1.1386104576712105e-05, + "loss": 1.6015, + "step": 4904 + }, + { + "epoch": 0.9440527468296557, + "lm_loss": 1.116, + "step": 4904, + "vm_loss": 0.1613 + }, + { + "epoch": 0.9440527468296557, + "lm_loss": 0.913, + "step": 4904, + "vm_loss": 0.1163 + }, + { + "epoch": 0.9440527468296557, + "lm_loss": 0.9958, + "step": 4904, + "vm_loss": 0.2041 + }, + { + "epoch": 0.9440527468296557, + "lm_loss": 1.2078, + "step": 4904, + "vm_loss": 0.169 + }, + { + "epoch": 0.9440527468296557, + "lm_loss": 1.2221, + "step": 4904, + "vm_loss": 0.1703 + }, + { + "epoch": 0.9440527468296557, + "lm_loss": 1.5112, + "step": 4904, + "vm_loss": 0.1252 + }, + { + "epoch": 0.9440527468296557, + "lm_loss": 1.4032, + "step": 4904, + "vm_loss": 0.1711 + }, + { + "epoch": 0.9440527468296557, + "lm_loss": 1.0256, + "step": 4904, + "vm_loss": 0.1755 + }, + { + "epoch": 0.944245253507231, + "grad_norm": 3.235582188083785, + "learning_rate": 1.1383016709867706e-05, + "loss": 1.5676, + "step": 4905 + }, + { + "epoch": 0.9444377601848064, + "grad_norm": 3.058808523747482, + "learning_rate": 1.1379928708576394e-05, + "loss": 1.5706, + "step": 4906 + }, + { + "epoch": 0.9446302668623818, + "grad_norm": 3.064475529218613, + "learning_rate": 1.137684057313835e-05, + "loss": 1.4793, + "step": 4907 + }, + { + "epoch": 0.9448227735399571, + "grad_norm": 3.0214177007846708, + "learning_rate": 1.1373752303853791e-05, + "loss": 1.5653, + "step": 4908 + }, + { + "epoch": 0.9450152802175326, + "grad_norm": 3.1990952707037126, + "learning_rate": 1.1370663901022934e-05, + "loss": 1.5365, + "step": 4909 + }, + { + "epoch": 0.9452077868951079, + "grad_norm": 3.250619324455714, + "learning_rate": 1.1367575364946006e-05, + "loss": 1.5833, + "step": 4910 + }, + { + "epoch": 0.9454002935726833, + "grad_norm": 3.2049096288589527, + "learning_rate": 1.136448669592326e-05, + "loss": 1.4932, + "step": 4911 + }, + { + "epoch": 0.9455928002502587, + "grad_norm": 3.2144282272893228, + "learning_rate": 1.1361397894254949e-05, + "loss": 1.5363, + "step": 4912 + }, + { + "epoch": 0.9455928002502587, + "lm_loss": 0.8845, + "step": 4912, + "vm_loss": 0.1219 + }, + { + "epoch": 0.9455928002502587, + "lm_loss": 1.5232, + "step": 4912, + "vm_loss": 0.1654 + }, + { + "epoch": 0.9455928002502587, + "lm_loss": 1.5202, + "step": 4912, + "vm_loss": 0.1611 + }, + { + "epoch": 0.9455928002502587, + "lm_loss": 1.5089, + "step": 4912, + "vm_loss": 0.1743 + }, + { + "epoch": 0.9455928002502587, + "lm_loss": 0.9956, + "step": 4912, + "vm_loss": 0.1141 + }, + { + "epoch": 0.9455928002502587, + "lm_loss": 1.3623, + "step": 4912, + "vm_loss": 0.1632 + }, + { + "epoch": 0.9455928002502587, + "lm_loss": 1.3111, + "step": 4912, + "vm_loss": 0.1607 + }, + { + "epoch": 0.9455928002502587, + "lm_loss": 1.616, + "step": 4912, + "vm_loss": 0.1586 + }, + { + "epoch": 0.945785306927834, + "grad_norm": 3.0988056937748083, + "learning_rate": 1.1358308960241341e-05, + "loss": 1.5214, + "step": 4913 + }, + { + "epoch": 0.9459778136054094, + "grad_norm": 3.05750733391802, + "learning_rate": 1.1355219894182733e-05, + "loss": 1.4986, + "step": 4914 + }, + { + "epoch": 0.9461703202829849, + "grad_norm": 3.0882525545721253, + "learning_rate": 1.1352130696379406e-05, + "loss": 1.5015, + "step": 4915 + }, + { + "epoch": 0.9463628269605602, + "grad_norm": 3.013030011802002, + "learning_rate": 1.1349041367131677e-05, + "loss": 1.5253, + "step": 4916 + }, + { + "epoch": 0.9465553336381356, + "grad_norm": 3.278662237320305, + "learning_rate": 1.134595190673987e-05, + "loss": 1.5729, + "step": 4917 + }, + { + "epoch": 0.9467478403157109, + "grad_norm": 3.201480421866968, + "learning_rate": 1.1342862315504317e-05, + "loss": 1.5561, + "step": 4918 + }, + { + "epoch": 0.9469403469932863, + "grad_norm": 3.1123366547274314, + "learning_rate": 1.1339772593725367e-05, + "loss": 1.5354, + "step": 4919 + }, + { + "epoch": 0.9471328536708618, + "grad_norm": 3.2351873894798415, + "learning_rate": 1.133668274170338e-05, + "loss": 1.5292, + "step": 4920 + }, + { + "epoch": 0.9471328536708618, + "lm_loss": 1.3133, + "step": 4920, + "vm_loss": 0.1023 + }, + { + "epoch": 0.9471328536708618, + "lm_loss": 1.2972, + "step": 4920, + "vm_loss": 0.1255 + }, + { + "epoch": 0.9471328536708618, + "lm_loss": 1.823, + "step": 4920, + "vm_loss": 0.187 + }, + { + "epoch": 0.9471328536708618, + "lm_loss": 1.1321, + "step": 4920, + "vm_loss": 0.1536 + }, + { + "epoch": 0.9471328536708618, + "lm_loss": 1.3792, + "step": 4920, + "vm_loss": 0.2137 + }, + { + "epoch": 0.9471328536708618, + "lm_loss": 1.3652, + "step": 4920, + "vm_loss": 0.1753 + }, + { + "epoch": 0.9471328536708618, + "lm_loss": 1.8067, + "step": 4920, + "vm_loss": 0.1512 + }, + { + "epoch": 0.9471328536708618, + "lm_loss": 1.2198, + "step": 4920, + "vm_loss": 0.1261 + }, + { + "epoch": 0.9473253603484371, + "grad_norm": 3.2310810068224383, + "learning_rate": 1.133359275973873e-05, + "loss": 1.5215, + "step": 4921 + }, + { + "epoch": 0.9475178670260125, + "grad_norm": 3.2755841817760167, + "learning_rate": 1.13305026481318e-05, + "loss": 1.5601, + "step": 4922 + }, + { + "epoch": 0.9477103737035878, + "grad_norm": 2.910984151233412, + "learning_rate": 1.1327412407182994e-05, + "loss": 1.4745, + "step": 4923 + }, + { + "epoch": 0.9479028803811632, + "grad_norm": 2.8948142549989857, + "learning_rate": 1.1324322037192716e-05, + "loss": 1.523, + "step": 4924 + }, + { + "epoch": 0.9480953870587386, + "grad_norm": 3.139939339963268, + "learning_rate": 1.1321231538461395e-05, + "loss": 1.5119, + "step": 4925 + }, + { + "epoch": 0.948287893736314, + "grad_norm": 3.0586700870669947, + "learning_rate": 1.1318140911289465e-05, + "loss": 1.6178, + "step": 4926 + }, + { + "epoch": 0.9484804004138894, + "grad_norm": 3.177624029359829, + "learning_rate": 1.1315050155977374e-05, + "loss": 1.5625, + "step": 4927 + }, + { + "epoch": 0.9486729070914648, + "grad_norm": 3.13465913917213, + "learning_rate": 1.1311959272825586e-05, + "loss": 1.5137, + "step": 4928 + }, + { + "epoch": 0.9486729070914648, + "lm_loss": 1.0588, + "step": 4928, + "vm_loss": 0.1541 + }, + { + "epoch": 0.9486729070914648, + "lm_loss": 1.5185, + "step": 4928, + "vm_loss": 0.1492 + }, + { + "epoch": 0.9486729070914648, + "lm_loss": 1.8094, + "step": 4928, + "vm_loss": 0.1828 + }, + { + "epoch": 0.9486729070914648, + "lm_loss": 1.2392, + "step": 4928, + "vm_loss": 0.1345 + }, + { + "epoch": 0.9486729070914648, + "lm_loss": 1.3031, + "step": 4928, + "vm_loss": 0.2132 + }, + { + "epoch": 0.9486729070914648, + "lm_loss": 1.3994, + "step": 4928, + "vm_loss": 0.2161 + }, + { + "epoch": 0.9486729070914648, + "lm_loss": 1.8366, + "step": 4928, + "vm_loss": 0.1933 + }, + { + "epoch": 0.9486729070914648, + "lm_loss": 1.0128, + "step": 4928, + "vm_loss": 0.1901 + }, + { + "epoch": 0.9488654137690401, + "grad_norm": 3.063110072336763, + "learning_rate": 1.1308868262134566e-05, + "loss": 1.5292, + "step": 4929 + }, + { + "epoch": 0.9490579204466155, + "grad_norm": 3.2467589947739204, + "learning_rate": 1.1305777124204812e-05, + "loss": 1.5051, + "step": 4930 + }, + { + "epoch": 0.9492504271241908, + "grad_norm": 3.1607679070207437, + "learning_rate": 1.1302685859336812e-05, + "loss": 1.52, + "step": 4931 + }, + { + "epoch": 0.9494429338017663, + "grad_norm": 3.3017046972525717, + "learning_rate": 1.1299594467831079e-05, + "loss": 1.5853, + "step": 4932 + }, + { + "epoch": 0.9496354404793417, + "grad_norm": 3.090866892150214, + "learning_rate": 1.1296502949988142e-05, + "loss": 1.5195, + "step": 4933 + }, + { + "epoch": 0.949827947156917, + "grad_norm": 3.1211796579681583, + "learning_rate": 1.1293411306108526e-05, + "loss": 1.5689, + "step": 4934 + }, + { + "epoch": 0.9500204538344924, + "grad_norm": 3.2692116963309683, + "learning_rate": 1.1290319536492786e-05, + "loss": 1.5567, + "step": 4935 + }, + { + "epoch": 0.9502129605120677, + "grad_norm": 3.1415776875048205, + "learning_rate": 1.1287227641441478e-05, + "loss": 1.5173, + "step": 4936 + }, + { + "epoch": 0.9502129605120677, + "lm_loss": 0.7196, + "step": 4936, + "vm_loss": 0.1627 + }, + { + "epoch": 0.9502129605120677, + "lm_loss": 1.6534, + "step": 4936, + "vm_loss": 0.2064 + }, + { + "epoch": 0.9502129605120677, + "lm_loss": 1.4973, + "step": 4936, + "vm_loss": 0.1301 + }, + { + "epoch": 0.9502129605120677, + "lm_loss": 1.3946, + "step": 4936, + "vm_loss": 0.166 + }, + { + "epoch": 0.9502129605120677, + "lm_loss": 1.5871, + "step": 4936, + "vm_loss": 0.1313 + }, + { + "epoch": 0.9502129605120677, + "lm_loss": 1.4575, + "step": 4936, + "vm_loss": 0.1275 + }, + { + "epoch": 0.9502129605120677, + "lm_loss": 1.3059, + "step": 4936, + "vm_loss": 0.1409 + }, + { + "epoch": 0.9502129605120677, + "lm_loss": 1.4286, + "step": 4936, + "vm_loss": 0.1103 + }, + { + "epoch": 0.9504054671896431, + "grad_norm": 3.222931344598351, + "learning_rate": 1.1284135621255177e-05, + "loss": 1.5321, + "step": 4937 + }, + { + "epoch": 0.9505979738672186, + "grad_norm": 3.1184862502659505, + "learning_rate": 1.1281043476234461e-05, + "loss": 1.4561, + "step": 4938 + }, + { + "epoch": 0.9507904805447939, + "grad_norm": 3.317785224622354, + "learning_rate": 1.127795120667993e-05, + "loss": 1.5369, + "step": 4939 + }, + { + "epoch": 0.9509829872223693, + "grad_norm": 3.2377981450902524, + "learning_rate": 1.1274858812892198e-05, + "loss": 1.5141, + "step": 4940 + }, + { + "epoch": 0.9511754938999446, + "grad_norm": 3.3142560982923057, + "learning_rate": 1.1271766295171873e-05, + "loss": 1.5112, + "step": 4941 + }, + { + "epoch": 0.95136800057752, + "grad_norm": 3.4039945829062193, + "learning_rate": 1.1268673653819594e-05, + "loss": 1.5644, + "step": 4942 + }, + { + "epoch": 0.9515605072550954, + "grad_norm": 3.1422287825617863, + "learning_rate": 1.1265580889136007e-05, + "loss": 1.5406, + "step": 4943 + }, + { + "epoch": 0.9517530139326708, + "grad_norm": 3.018636398312615, + "learning_rate": 1.1262488001421765e-05, + "loss": 1.4895, + "step": 4944 + }, + { + "epoch": 0.9517530139326708, + "lm_loss": 0.9212, + "step": 4944, + "vm_loss": 0.1176 + }, + { + "epoch": 0.9517530139326708, + "lm_loss": 1.36, + "step": 4944, + "vm_loss": 0.1902 + }, + { + "epoch": 0.9517530139326708, + "lm_loss": 1.2078, + "step": 4944, + "vm_loss": 0.1959 + }, + { + "epoch": 0.9517530139326708, + "lm_loss": 1.5963, + "step": 4944, + "vm_loss": 0.1419 + }, + { + "epoch": 0.9517530139326708, + "lm_loss": 1.5913, + "step": 4944, + "vm_loss": 0.1541 + }, + { + "epoch": 0.9517530139326708, + "lm_loss": 1.3362, + "step": 4944, + "vm_loss": 0.1562 + }, + { + "epoch": 0.9517530139326708, + "lm_loss": 0.9535, + "step": 4944, + "vm_loss": 0.2216 + }, + { + "epoch": 0.9517530139326708, + "lm_loss": 1.0879, + "step": 4944, + "vm_loss": 0.1906 + }, + { + "epoch": 0.9519455206102462, + "grad_norm": 3.2544283354247736, + "learning_rate": 1.1259394990977541e-05, + "loss": 1.472, + "step": 4945 + }, + { + "epoch": 0.9521380272878216, + "grad_norm": 3.2095407748938976, + "learning_rate": 1.1256301858104007e-05, + "loss": 1.4981, + "step": 4946 + }, + { + "epoch": 0.9523305339653969, + "grad_norm": 3.524097672988114, + "learning_rate": 1.125320860310186e-05, + "loss": 1.6084, + "step": 4947 + }, + { + "epoch": 0.9525230406429723, + "grad_norm": 3.2341711913449536, + "learning_rate": 1.1250115226271804e-05, + "loss": 1.4711, + "step": 4948 + }, + { + "epoch": 0.9527155473205476, + "grad_norm": 3.4667260649290346, + "learning_rate": 1.1247021727914555e-05, + "loss": 1.5321, + "step": 4949 + }, + { + "epoch": 0.9529080539981231, + "grad_norm": 3.0880161582827643, + "learning_rate": 1.124392810833084e-05, + "loss": 1.5284, + "step": 4950 + }, + { + "epoch": 0.9531005606756985, + "grad_norm": 3.0167625032502743, + "learning_rate": 1.1240834367821398e-05, + "loss": 1.5366, + "step": 4951 + }, + { + "epoch": 0.9532930673532738, + "grad_norm": 3.127109342620602, + "learning_rate": 1.1237740506686983e-05, + "loss": 1.5798, + "step": 4952 + }, + { + "epoch": 0.9532930673532738, + "lm_loss": 2.0237, + "step": 4952, + "vm_loss": 0.1348 + }, + { + "epoch": 0.9532930673532738, + "lm_loss": 1.2665, + "step": 4952, + "vm_loss": 0.218 + }, + { + "epoch": 0.9532930673532738, + "lm_loss": 1.6865, + "step": 4952, + "vm_loss": 0.1361 + }, + { + "epoch": 0.9532930673532738, + "lm_loss": 1.0832, + "step": 4952, + "vm_loss": 0.1978 + }, + { + "epoch": 0.9532930673532738, + "lm_loss": 1.2294, + "step": 4952, + "vm_loss": 0.2231 + }, + { + "epoch": 0.9532930673532738, + "lm_loss": 2.0328, + "step": 4952, + "vm_loss": 0.1491 + }, + { + "epoch": 0.9532930673532738, + "lm_loss": 1.4052, + "step": 4952, + "vm_loss": 0.153 + }, + { + "epoch": 0.9532930673532738, + "lm_loss": 1.5694, + "step": 4952, + "vm_loss": 0.2345 + }, + { + "epoch": 0.9534855740308492, + "grad_norm": 3.07905336068598, + "learning_rate": 1.1234646525228353e-05, + "loss": 1.5861, + "step": 4953 + }, + { + "epoch": 0.9536780807084245, + "grad_norm": 3.0700224772077807, + "learning_rate": 1.1231552423746284e-05, + "loss": 1.5368, + "step": 4954 + }, + { + "epoch": 0.953870587386, + "grad_norm": 3.1117391854579144, + "learning_rate": 1.1228458202541566e-05, + "loss": 1.4671, + "step": 4955 + }, + { + "epoch": 0.9540630940635754, + "grad_norm": 3.1579746964044366, + "learning_rate": 1.122536386191499e-05, + "loss": 1.56, + "step": 4956 + }, + { + "epoch": 0.9542556007411507, + "grad_norm": 3.288425003716889, + "learning_rate": 1.1222269402167372e-05, + "loss": 1.5579, + "step": 4957 + }, + { + "epoch": 0.9544481074187261, + "grad_norm": 3.1795499598907906, + "learning_rate": 1.1219174823599532e-05, + "loss": 1.5184, + "step": 4958 + }, + { + "epoch": 0.9546406140963015, + "grad_norm": 3.24444769593709, + "learning_rate": 1.1216080126512298e-05, + "loss": 1.5073, + "step": 4959 + }, + { + "epoch": 0.9548331207738768, + "grad_norm": 3.150687090032631, + "learning_rate": 1.1212985311206522e-05, + "loss": 1.5568, + "step": 4960 + }, + { + "epoch": 0.9548331207738768, + "lm_loss": 1.6337, + "step": 4960, + "vm_loss": 0.2233 + }, + { + "epoch": 0.9548331207738768, + "lm_loss": 1.7092, + "step": 4960, + "vm_loss": 0.2044 + }, + { + "epoch": 0.9548331207738768, + "lm_loss": 0.8714, + "step": 4960, + "vm_loss": 0.158 + }, + { + "epoch": 0.9548331207738768, + "lm_loss": 1.6399, + "step": 4960, + "vm_loss": 0.1555 + }, + { + "epoch": 0.9548331207738768, + "lm_loss": 1.3171, + "step": 4960, + "vm_loss": 0.1617 + }, + { + "epoch": 0.9548331207738768, + "lm_loss": 0.9431, + "step": 4960, + "vm_loss": 0.1485 + }, + { + "epoch": 0.9548331207738768, + "lm_loss": 1.8818, + "step": 4960, + "vm_loss": 0.2111 + }, + { + "epoch": 0.9548331207738768, + "lm_loss": 1.3153, + "step": 4960, + "vm_loss": 0.1362 + }, + { + "epoch": 0.9550256274514523, + "grad_norm": 3.070799484841203, + "learning_rate": 1.120989037798305e-05, + "loss": 1.5702, + "step": 4961 + }, + { + "epoch": 0.9552181341290276, + "grad_norm": 3.0812185508135452, + "learning_rate": 1.120679532714276e-05, + "loss": 1.5618, + "step": 4962 + }, + { + "epoch": 0.955410640806603, + "grad_norm": 3.0407337386431874, + "learning_rate": 1.120370015898652e-05, + "loss": 1.5118, + "step": 4963 + }, + { + "epoch": 0.9556031474841784, + "grad_norm": 3.1324404134633306, + "learning_rate": 1.1200604873815226e-05, + "loss": 1.5251, + "step": 4964 + }, + { + "epoch": 0.9557956541617537, + "grad_norm": 3.177728013772863, + "learning_rate": 1.1197509471929779e-05, + "loss": 1.5302, + "step": 4965 + }, + { + "epoch": 0.9559881608393291, + "grad_norm": 3.3034508925586885, + "learning_rate": 1.119441395363109e-05, + "loss": 1.592, + "step": 4966 + }, + { + "epoch": 0.9561806675169044, + "grad_norm": 3.2302100345526634, + "learning_rate": 1.1191318319220085e-05, + "loss": 1.5793, + "step": 4967 + }, + { + "epoch": 0.9563731741944799, + "grad_norm": 3.205608358848487, + "learning_rate": 1.1188222568997697e-05, + "loss": 1.5227, + "step": 4968 + }, + { + "epoch": 0.9563731741944799, + "lm_loss": 1.8363, + "step": 4968, + "vm_loss": 0.1191 + }, + { + "epoch": 0.9563731741944799, + "lm_loss": 1.2392, + "step": 4968, + "vm_loss": 0.1407 + }, + { + "epoch": 0.9563731741944799, + "lm_loss": 1.4104, + "step": 4968, + "vm_loss": 0.2182 + }, + { + "epoch": 0.9563731741944799, + "lm_loss": 1.4452, + "step": 4968, + "vm_loss": 0.1845 + }, + { + "epoch": 0.9563731741944799, + "lm_loss": 1.5054, + "step": 4968, + "vm_loss": 0.1682 + }, + { + "epoch": 0.9563731741944799, + "lm_loss": 1.0966, + "step": 4968, + "vm_loss": 0.1454 + }, + { + "epoch": 0.9563731741944799, + "lm_loss": 1.5982, + "step": 4968, + "vm_loss": 0.2177 + }, + { + "epoch": 0.9563731741944799, + "lm_loss": 1.3861, + "step": 4968, + "vm_loss": 0.1592 + }, + { + "epoch": 0.9565656808720553, + "grad_norm": 2.9280178432610175, + "learning_rate": 1.1185126703264875e-05, + "loss": 1.4848, + "step": 4969 + }, + { + "epoch": 0.9567581875496306, + "grad_norm": 3.051466697417576, + "learning_rate": 1.1182030722322571e-05, + "loss": 1.4295, + "step": 4970 + }, + { + "epoch": 0.956950694227206, + "grad_norm": 3.047311201224443, + "learning_rate": 1.1178934626471762e-05, + "loss": 1.3757, + "step": 4971 + }, + { + "epoch": 0.9571432009047813, + "grad_norm": 2.9806130932304327, + "learning_rate": 1.1175838416013425e-05, + "loss": 1.53, + "step": 4972 + }, + { + "epoch": 0.9573357075823568, + "grad_norm": 3.1935678923291047, + "learning_rate": 1.1172742091248548e-05, + "loss": 1.5851, + "step": 4973 + }, + { + "epoch": 0.9575282142599322, + "grad_norm": 3.3291942217519286, + "learning_rate": 1.1169645652478138e-05, + "loss": 1.5695, + "step": 4974 + }, + { + "epoch": 0.9577207209375075, + "grad_norm": 3.262252693044439, + "learning_rate": 1.1166549100003203e-05, + "loss": 1.585, + "step": 4975 + }, + { + "epoch": 0.9579132276150829, + "grad_norm": 3.397908129430199, + "learning_rate": 1.1163452434124773e-05, + "loss": 1.5269, + "step": 4976 + }, + { + "epoch": 0.9579132276150829, + "lm_loss": 1.2388, + "step": 4976, + "vm_loss": 0.1189 + }, + { + "epoch": 0.9579132276150829, + "lm_loss": 1.6814, + "step": 4976, + "vm_loss": 0.1911 + }, + { + "epoch": 0.9579132276150829, + "lm_loss": 1.6699, + "step": 4976, + "vm_loss": 0.1867 + }, + { + "epoch": 0.9579132276150829, + "lm_loss": 1.2209, + "step": 4976, + "vm_loss": 0.2209 + }, + { + "epoch": 0.9579132276150829, + "lm_loss": 1.4817, + "step": 4976, + "vm_loss": 0.224 + }, + { + "epoch": 0.9579132276150829, + "lm_loss": 1.4436, + "step": 4976, + "vm_loss": 0.1634 + }, + { + "epoch": 0.9579132276150829, + "lm_loss": 1.0202, + "step": 4976, + "vm_loss": 0.2728 + }, + { + "epoch": 0.9579132276150829, + "lm_loss": 1.3588, + "step": 4976, + "vm_loss": 0.22 + }, + { + "epoch": 0.9581057342926583, + "grad_norm": 3.1657081991723137, + "learning_rate": 1.1160355655143883e-05, + "loss": 1.6135, + "step": 4977 + }, + { + "epoch": 0.9582982409702336, + "grad_norm": 2.99265792139987, + "learning_rate": 1.1157258763361577e-05, + "loss": 1.4804, + "step": 4978 + }, + { + "epoch": 0.9584907476478091, + "grad_norm": 3.2195053117486006, + "learning_rate": 1.1154161759078912e-05, + "loss": 1.487, + "step": 4979 + }, + { + "epoch": 0.9586832543253844, + "grad_norm": 3.0686529605655664, + "learning_rate": 1.1151064642596958e-05, + "loss": 1.4882, + "step": 4980 + }, + { + "epoch": 0.9588757610029598, + "grad_norm": 3.2883065855922937, + "learning_rate": 1.1147967414216796e-05, + "loss": 1.5837, + "step": 4981 + }, + { + "epoch": 0.9590682676805352, + "grad_norm": 3.2242174201478404, + "learning_rate": 1.1144870074239514e-05, + "loss": 1.5435, + "step": 4982 + }, + { + "epoch": 0.9592607743581105, + "grad_norm": 3.161845572759309, + "learning_rate": 1.1141772622966214e-05, + "loss": 1.5549, + "step": 4983 + }, + { + "epoch": 0.959453281035686, + "grad_norm": 3.2259888525380815, + "learning_rate": 1.1138675060698007e-05, + "loss": 1.5718, + "step": 4984 + }, + { + "epoch": 0.959453281035686, + "lm_loss": 1.0418, + "step": 4984, + "vm_loss": 0.1455 + }, + { + "epoch": 0.959453281035686, + "lm_loss": 1.0052, + "step": 4984, + "vm_loss": 0.1357 + }, + { + "epoch": 0.959453281035686, + "lm_loss": 1.2592, + "step": 4984, + "vm_loss": 0.1541 + }, + { + "epoch": 0.959453281035686, + "lm_loss": 1.4809, + "step": 4984, + "vm_loss": 0.1336 + }, + { + "epoch": 0.959453281035686, + "lm_loss": 1.409, + "step": 4984, + "vm_loss": 0.0994 + }, + { + "epoch": 0.959453281035686, + "lm_loss": 1.356, + "step": 4984, + "vm_loss": 0.1663 + }, + { + "epoch": 0.959453281035686, + "lm_loss": 1.2268, + "step": 4984, + "vm_loss": 0.1177 + }, + { + "epoch": 0.959453281035686, + "lm_loss": 1.9241, + "step": 4984, + "vm_loss": 0.1541 + }, + { + "epoch": 0.9596457877132613, + "grad_norm": 3.274063562252294, + "learning_rate": 1.1135577387736016e-05, + "loss": 1.444, + "step": 4985 + }, + { + "epoch": 0.9598382943908367, + "grad_norm": 3.0000261767060614, + "learning_rate": 1.1132479604381377e-05, + "loss": 1.6082, + "step": 4986 + }, + { + "epoch": 0.9600308010684121, + "grad_norm": 3.049413535990162, + "learning_rate": 1.1129381710935229e-05, + "loss": 1.5844, + "step": 4987 + }, + { + "epoch": 0.9602233077459874, + "grad_norm": 3.132131404265152, + "learning_rate": 1.112628370769873e-05, + "loss": 1.5485, + "step": 4988 + }, + { + "epoch": 0.9604158144235628, + "grad_norm": 2.999358898174001, + "learning_rate": 1.1123185594973047e-05, + "loss": 1.4486, + "step": 4989 + }, + { + "epoch": 0.9606083211011381, + "grad_norm": 3.255676728811577, + "learning_rate": 1.1120087373059351e-05, + "loss": 1.5545, + "step": 4990 + }, + { + "epoch": 0.9608008277787136, + "grad_norm": 3.311115513310606, + "learning_rate": 1.1116989042258837e-05, + "loss": 1.5843, + "step": 4991 + }, + { + "epoch": 0.960993334456289, + "grad_norm": 3.2665485107454475, + "learning_rate": 1.1113890602872695e-05, + "loss": 1.5794, + "step": 4992 + }, + { + "epoch": 0.960993334456289, + "lm_loss": 0.9666, + "step": 4992, + "vm_loss": 0.1479 + }, + { + "epoch": 0.960993334456289, + "lm_loss": 1.5262, + "step": 4992, + "vm_loss": 0.1491 + }, + { + "epoch": 0.960993334456289, + "lm_loss": 1.4755, + "step": 4992, + "vm_loss": 0.1931 + }, + { + "epoch": 0.960993334456289, + "lm_loss": 1.5749, + "step": 4992, + "vm_loss": 0.1319 + }, + { + "epoch": 0.960993334456289, + "lm_loss": 1.5446, + "step": 4992, + "vm_loss": 0.1664 + }, + { + "epoch": 0.960993334456289, + "lm_loss": 1.9874, + "step": 4992, + "vm_loss": 0.1461 + }, + { + "epoch": 0.960993334456289, + "lm_loss": 1.4392, + "step": 4992, + "vm_loss": 0.1407 + }, + { + "epoch": 0.960993334456289, + "lm_loss": 1.1821, + "step": 4992, + "vm_loss": 0.17 + }, + { + "epoch": 0.9611858411338643, + "grad_norm": 3.1126080864229255, + "learning_rate": 1.1110792055202135e-05, + "loss": 1.4613, + "step": 4993 + }, + { + "epoch": 0.9613783478114397, + "grad_norm": 3.277587209054993, + "learning_rate": 1.1107693399548378e-05, + "loss": 1.5535, + "step": 4994 + }, + { + "epoch": 0.9615708544890151, + "grad_norm": 3.131997486153474, + "learning_rate": 1.110459463621265e-05, + "loss": 1.5459, + "step": 4995 + }, + { + "epoch": 0.9617633611665904, + "grad_norm": 3.165541572953028, + "learning_rate": 1.110149576549619e-05, + "loss": 1.5136, + "step": 4996 + }, + { + "epoch": 0.9619558678441659, + "grad_norm": 3.098776074179078, + "learning_rate": 1.1098396787700252e-05, + "loss": 1.4477, + "step": 4997 + }, + { + "epoch": 0.9621483745217412, + "grad_norm": 3.104081992397341, + "learning_rate": 1.1095297703126094e-05, + "loss": 1.5055, + "step": 4998 + }, + { + "epoch": 0.9623408811993166, + "grad_norm": 3.111710358114047, + "learning_rate": 1.1092198512074983e-05, + "loss": 1.5488, + "step": 4999 + }, + { + "epoch": 0.962533387876892, + "grad_norm": 3.1759162888321497, + "learning_rate": 1.1089099214848202e-05, + "loss": 1.4918, + "step": 5000 + }, + { + "epoch": 0.962533387876892, + "lm_loss": 1.7775, + "step": 5000, + "vm_loss": 0.1527 + }, + { + "epoch": 0.962533387876892, + "lm_loss": 0.6308, + "step": 5000, + "vm_loss": 0.1819 + }, + { + "epoch": 0.962533387876892, + "lm_loss": 1.2442, + "step": 5000, + "vm_loss": 0.1702 + }, + { + "epoch": 0.962533387876892, + "lm_loss": 1.1558, + "step": 5000, + "vm_loss": 0.18 + }, + { + "epoch": 0.962533387876892, + "lm_loss": 1.215, + "step": 5000, + "vm_loss": 0.1017 + }, + { + "epoch": 0.962533387876892, + "lm_loss": 1.4159, + "step": 5000, + "vm_loss": 0.2236 + }, + { + "epoch": 0.962533387876892, + "lm_loss": 1.6202, + "step": 5000, + "vm_loss": 0.1365 + }, + { + "epoch": 0.962533387876892, + "lm_loss": 1.4315, + "step": 5000, + "vm_loss": 0.2205 + }, + { + "epoch": 0.9627258945544673, + "grad_norm": 3.1277312529275623, + "learning_rate": 1.1085999811747048e-05, + "loss": 1.4734, + "step": 5001 + }, + { + "epoch": 0.9629184012320428, + "grad_norm": 3.0358493979105043, + "learning_rate": 1.1082900303072814e-05, + "loss": 1.4905, + "step": 5002 + }, + { + "epoch": 0.9631109079096181, + "grad_norm": 3.009859005626985, + "learning_rate": 1.107980068912682e-05, + "loss": 1.4066, + "step": 5003 + }, + { + "epoch": 0.9633034145871935, + "grad_norm": 3.2689938698285483, + "learning_rate": 1.1076700970210383e-05, + "loss": 1.5161, + "step": 5004 + }, + { + "epoch": 0.9634959212647689, + "grad_norm": 3.2371356512914633, + "learning_rate": 1.1073601146624837e-05, + "loss": 1.5081, + "step": 5005 + }, + { + "epoch": 0.9636884279423442, + "grad_norm": 3.2563042849276025, + "learning_rate": 1.1070501218671526e-05, + "loss": 1.5896, + "step": 5006 + }, + { + "epoch": 0.9638809346199196, + "grad_norm": 3.1266000777353633, + "learning_rate": 1.1067401186651799e-05, + "loss": 1.6047, + "step": 5007 + }, + { + "epoch": 0.9640734412974951, + "grad_norm": 3.0572994685307306, + "learning_rate": 1.1064301050867022e-05, + "loss": 1.4774, + "step": 5008 + }, + { + "epoch": 0.9640734412974951, + "lm_loss": 1.0822, + "step": 5008, + "vm_loss": 0.1287 + }, + { + "epoch": 0.9640734412974951, + "lm_loss": 1.0384, + "step": 5008, + "vm_loss": 0.1455 + }, + { + "epoch": 0.9640734412974951, + "lm_loss": 1.4654, + "step": 5008, + "vm_loss": 0.1464 + }, + { + "epoch": 0.9640734412974951, + "lm_loss": 1.6399, + "step": 5008, + "vm_loss": 0.1397 + }, + { + "epoch": 0.9640734412974951, + "lm_loss": 1.1153, + "step": 5008, + "vm_loss": 0.143 + }, + { + "epoch": 0.9640734412974951, + "lm_loss": 1.116, + "step": 5008, + "vm_loss": 0.1136 + }, + { + "epoch": 0.9640734412974951, + "lm_loss": 1.4629, + "step": 5008, + "vm_loss": 0.214 + }, + { + "epoch": 0.9640734412974951, + "lm_loss": 1.4472, + "step": 5008, + "vm_loss": 0.2289 + }, + { + "epoch": 0.9642659479750704, + "grad_norm": 3.0707765392144863, + "learning_rate": 1.1061200811618564e-05, + "loss": 1.5074, + "step": 5009 + }, + { + "epoch": 0.9644584546526458, + "grad_norm": 3.11302522345523, + "learning_rate": 1.1058100469207811e-05, + "loss": 1.5559, + "step": 5010 + }, + { + "epoch": 0.9646509613302211, + "grad_norm": 3.1106507784028286, + "learning_rate": 1.1055000023936155e-05, + "loss": 1.5193, + "step": 5011 + }, + { + "epoch": 0.9648434680077965, + "grad_norm": 3.0946780148701527, + "learning_rate": 1.1051899476105001e-05, + "loss": 1.5627, + "step": 5012 + }, + { + "epoch": 0.9650359746853719, + "grad_norm": 3.006655990895208, + "learning_rate": 1.104879882601576e-05, + "loss": 1.5304, + "step": 5013 + }, + { + "epoch": 0.9652284813629473, + "grad_norm": 3.0771532295837507, + "learning_rate": 1.1045698073969855e-05, + "loss": 1.4615, + "step": 5014 + }, + { + "epoch": 0.9654209880405227, + "grad_norm": 3.056591900513941, + "learning_rate": 1.1042597220268717e-05, + "loss": 1.4858, + "step": 5015 + }, + { + "epoch": 0.965613494718098, + "grad_norm": 3.1226024561361325, + "learning_rate": 1.1039496265213792e-05, + "loss": 1.5535, + "step": 5016 + }, + { + "epoch": 0.965613494718098, + "lm_loss": 1.5655, + "step": 5016, + "vm_loss": 0.1305 + }, + { + "epoch": 0.965613494718098, + "lm_loss": 1.4469, + "step": 5016, + "vm_loss": 0.1826 + }, + { + "epoch": 0.965613494718098, + "lm_loss": 1.0383, + "step": 5016, + "vm_loss": 0.1879 + }, + { + "epoch": 0.965613494718098, + "lm_loss": 1.0489, + "step": 5016, + "vm_loss": 0.1282 + }, + { + "epoch": 0.965613494718098, + "lm_loss": 1.0831, + "step": 5016, + "vm_loss": 0.2744 + }, + { + "epoch": 0.965613494718098, + "lm_loss": 1.2477, + "step": 5016, + "vm_loss": 0.1848 + }, + { + "epoch": 0.965613494718098, + "lm_loss": 2.135, + "step": 5016, + "vm_loss": 0.1816 + }, + { + "epoch": 0.965613494718098, + "lm_loss": 1.4885, + "step": 5016, + "vm_loss": 0.1426 + }, + { + "epoch": 0.9658060013956734, + "grad_norm": 3.227004644371464, + "learning_rate": 1.103639520910653e-05, + "loss": 1.4961, + "step": 5017 + }, + { + "epoch": 0.9659985080732488, + "grad_norm": 3.13013353110654, + "learning_rate": 1.1033294052248396e-05, + "loss": 1.4988, + "step": 5018 + }, + { + "epoch": 0.9661910147508241, + "grad_norm": 3.101879817042813, + "learning_rate": 1.103019279494086e-05, + "loss": 1.4998, + "step": 5019 + }, + { + "epoch": 0.9663835214283996, + "grad_norm": 3.197413848021874, + "learning_rate": 1.1027091437485404e-05, + "loss": 1.5503, + "step": 5020 + }, + { + "epoch": 0.9665760281059749, + "grad_norm": 3.0283583462031967, + "learning_rate": 1.1023989980183518e-05, + "loss": 1.5142, + "step": 5021 + }, + { + "epoch": 0.9667685347835503, + "grad_norm": 3.302367679081862, + "learning_rate": 1.1020888423336706e-05, + "loss": 1.5303, + "step": 5022 + }, + { + "epoch": 0.9669610414611257, + "grad_norm": 3.192947791422235, + "learning_rate": 1.1017786767246479e-05, + "loss": 1.5018, + "step": 5023 + }, + { + "epoch": 0.967153548138701, + "grad_norm": 3.164070760564195, + "learning_rate": 1.1014685012214356e-05, + "loss": 1.5219, + "step": 5024 + }, + { + "epoch": 0.967153548138701, + "lm_loss": 0.9907, + "step": 5024, + "vm_loss": 0.2035 + }, + { + "epoch": 0.967153548138701, + "lm_loss": 1.4281, + "step": 5024, + "vm_loss": 0.1621 + }, + { + "epoch": 0.967153548138701, + "lm_loss": 1.162, + "step": 5024, + "vm_loss": 0.1764 + }, + { + "epoch": 0.967153548138701, + "lm_loss": 1.2192, + "step": 5024, + "vm_loss": 0.1504 + }, + { + "epoch": 0.967153548138701, + "lm_loss": 1.3241, + "step": 5024, + "vm_loss": 0.2218 + }, + { + "epoch": 0.967153548138701, + "lm_loss": 1.4035, + "step": 5024, + "vm_loss": 0.1562 + }, + { + "epoch": 0.967153548138701, + "lm_loss": 1.3924, + "step": 5024, + "vm_loss": 0.2563 + }, + { + "epoch": 0.967153548138701, + "lm_loss": 1.7133, + "step": 5024, + "vm_loss": 0.1924 + }, + { + "epoch": 0.9673460548162764, + "grad_norm": 3.1564790637080673, + "learning_rate": 1.1011583158541872e-05, + "loss": 1.5896, + "step": 5025 + }, + { + "epoch": 0.9675385614938519, + "grad_norm": 3.110982167192257, + "learning_rate": 1.1008481206530559e-05, + "loss": 1.5237, + "step": 5026 + }, + { + "epoch": 0.9677310681714272, + "grad_norm": 3.066232119994031, + "learning_rate": 1.1005379156481975e-05, + "loss": 1.4477, + "step": 5027 + }, + { + "epoch": 0.9679235748490026, + "grad_norm": 3.1831212206810964, + "learning_rate": 1.1002277008697668e-05, + "loss": 1.5055, + "step": 5028 + }, + { + "epoch": 0.9681160815265779, + "grad_norm": 3.008984222314179, + "learning_rate": 1.0999174763479217e-05, + "loss": 1.5004, + "step": 5029 + }, + { + "epoch": 0.9683085882041533, + "grad_norm": 3.0763207553316767, + "learning_rate": 1.0996072421128196e-05, + "loss": 1.4857, + "step": 5030 + }, + { + "epoch": 0.9685010948817288, + "grad_norm": 3.1765247155977567, + "learning_rate": 1.0992969981946192e-05, + "loss": 1.4765, + "step": 5031 + }, + { + "epoch": 0.9686936015593041, + "grad_norm": 3.220280058020591, + "learning_rate": 1.0989867446234801e-05, + "loss": 1.4952, + "step": 5032 + }, + { + "epoch": 0.9686936015593041, + "lm_loss": 1.1544, + "step": 5032, + "vm_loss": 0.1267 + }, + { + "epoch": 0.9686936015593041, + "lm_loss": 1.2017, + "step": 5032, + "vm_loss": 0.1639 + }, + { + "epoch": 0.9686936015593041, + "lm_loss": 0.8891, + "step": 5032, + "vm_loss": 0.1029 + }, + { + "epoch": 0.9686936015593041, + "lm_loss": 1.2234, + "step": 5032, + "vm_loss": 0.1547 + }, + { + "epoch": 0.9686936015593041, + "lm_loss": 1.0949, + "step": 5032, + "vm_loss": 0.1487 + }, + { + "epoch": 0.9686936015593041, + "lm_loss": 1.2767, + "step": 5032, + "vm_loss": 0.1826 + }, + { + "epoch": 0.9686936015593041, + "lm_loss": 1.3779, + "step": 5032, + "vm_loss": 0.2074 + }, + { + "epoch": 0.9686936015593041, + "lm_loss": 1.2076, + "step": 5032, + "vm_loss": 0.2111 + }, + { + "epoch": 0.9688861082368795, + "grad_norm": 3.263019198728408, + "learning_rate": 1.098676481429563e-05, + "loss": 1.47, + "step": 5033 + }, + { + "epoch": 0.9690786149144548, + "grad_norm": 3.09992119773287, + "learning_rate": 1.0983662086430294e-05, + "loss": 1.4738, + "step": 5034 + }, + { + "epoch": 0.9692711215920302, + "grad_norm": 3.2311374956387824, + "learning_rate": 1.098055926294042e-05, + "loss": 1.5493, + "step": 5035 + }, + { + "epoch": 0.9694636282696056, + "grad_norm": 3.176599893392964, + "learning_rate": 1.0977456344127639e-05, + "loss": 1.5388, + "step": 5036 + }, + { + "epoch": 0.969656134947181, + "grad_norm": 3.2598170299157867, + "learning_rate": 1.0974353330293599e-05, + "loss": 1.5156, + "step": 5037 + }, + { + "epoch": 0.9698486416247564, + "grad_norm": 3.2021674598402416, + "learning_rate": 1.0971250221739947e-05, + "loss": 1.5872, + "step": 5038 + }, + { + "epoch": 0.9700411483023318, + "grad_norm": 3.0781674527047254, + "learning_rate": 1.0968147018768347e-05, + "loss": 1.505, + "step": 5039 + }, + { + "epoch": 0.9702336549799071, + "grad_norm": 3.173571737677203, + "learning_rate": 1.0965043721680472e-05, + "loss": 1.538, + "step": 5040 + }, + { + "epoch": 0.9702336549799071, + "lm_loss": 1.1073, + "step": 5040, + "vm_loss": 0.1653 + }, + { + "epoch": 0.9702336549799071, + "lm_loss": 1.4669, + "step": 5040, + "vm_loss": 0.1843 + }, + { + "epoch": 0.9702336549799071, + "lm_loss": 1.3553, + "step": 5040, + "vm_loss": 0.2674 + }, + { + "epoch": 0.9702336549799071, + "lm_loss": 1.5128, + "step": 5040, + "vm_loss": 0.1866 + }, + { + "epoch": 0.9702336549799071, + "lm_loss": 1.1608, + "step": 5040, + "vm_loss": 0.1614 + }, + { + "epoch": 0.9702336549799071, + "lm_loss": 1.4648, + "step": 5040, + "vm_loss": 0.1395 + }, + { + "epoch": 0.9702336549799071, + "lm_loss": 1.2252, + "step": 5040, + "vm_loss": 0.1099 + }, + { + "epoch": 0.9702336549799071, + "lm_loss": 1.45, + "step": 5040, + "vm_loss": 0.1584 + }, + { + "epoch": 0.9704261616574825, + "grad_norm": 3.309162253381353, + "learning_rate": 1.0961940330778001e-05, + "loss": 1.5304, + "step": 5041 + }, + { + "epoch": 0.9706186683350578, + "grad_norm": 3.0690111469541512, + "learning_rate": 1.0958836846362622e-05, + "loss": 1.4746, + "step": 5042 + }, + { + "epoch": 0.9708111750126333, + "grad_norm": 2.9626854571825785, + "learning_rate": 1.0955733268736037e-05, + "loss": 1.4372, + "step": 5043 + }, + { + "epoch": 0.9710036816902087, + "grad_norm": 3.106790003973013, + "learning_rate": 1.0952629598199946e-05, + "loss": 1.4666, + "step": 5044 + }, + { + "epoch": 0.971196188367784, + "grad_norm": 3.108409914143002, + "learning_rate": 1.0949525835056073e-05, + "loss": 1.4984, + "step": 5045 + }, + { + "epoch": 0.9713886950453594, + "grad_norm": 3.0322054387270554, + "learning_rate": 1.0946421979606141e-05, + "loss": 1.5261, + "step": 5046 + }, + { + "epoch": 0.9715812017229347, + "grad_norm": 3.1495917482279365, + "learning_rate": 1.0943318032151887e-05, + "loss": 1.5153, + "step": 5047 + }, + { + "epoch": 0.9717737084005101, + "grad_norm": 3.1222659581799515, + "learning_rate": 1.0940213992995048e-05, + "loss": 1.4472, + "step": 5048 + }, + { + "epoch": 0.9717737084005101, + "lm_loss": 1.6739, + "step": 5048, + "vm_loss": 0.1481 + }, + { + "epoch": 0.9717737084005101, + "lm_loss": 1.0821, + "step": 5048, + "vm_loss": 0.1218 + }, + { + "epoch": 0.9717737084005101, + "lm_loss": 1.2419, + "step": 5048, + "vm_loss": 0.1446 + }, + { + "epoch": 0.9717737084005101, + "lm_loss": 1.1225, + "step": 5048, + "vm_loss": 0.2065 + }, + { + "epoch": 0.9717737084005101, + "lm_loss": 1.1091, + "step": 5048, + "vm_loss": 0.1441 + }, + { + "epoch": 0.9717737084005101, + "lm_loss": 1.3134, + "step": 5048, + "vm_loss": 0.1632 + }, + { + "epoch": 0.9717737084005101, + "lm_loss": 1.3475, + "step": 5048, + "vm_loss": 0.1365 + }, + { + "epoch": 0.9717737084005101, + "lm_loss": 2.0123, + "step": 5048, + "vm_loss": 0.1257 + }, + { + "epoch": 0.9719662150780856, + "grad_norm": 3.255306351038778, + "learning_rate": 1.0937109862437384e-05, + "loss": 1.4684, + "step": 5049 + }, + { + "epoch": 0.9721587217556609, + "grad_norm": 3.3634241186903435, + "learning_rate": 1.0934005640780648e-05, + "loss": 1.5599, + "step": 5050 + }, + { + "epoch": 0.9723512284332363, + "grad_norm": 3.184275517596426, + "learning_rate": 1.0930901328326613e-05, + "loss": 1.509, + "step": 5051 + }, + { + "epoch": 0.9725437351108116, + "grad_norm": 3.1789682559324217, + "learning_rate": 1.0927796925377064e-05, + "loss": 1.5125, + "step": 5052 + }, + { + "epoch": 0.972736241788387, + "grad_norm": 3.080556359503767, + "learning_rate": 1.0924692432233781e-05, + "loss": 1.4913, + "step": 5053 + }, + { + "epoch": 0.9729287484659624, + "grad_norm": 3.089755182054428, + "learning_rate": 1.0921587849198565e-05, + "loss": 1.4843, + "step": 5054 + }, + { + "epoch": 0.9731212551435378, + "grad_norm": 2.993585810769113, + "learning_rate": 1.0918483176573217e-05, + "loss": 1.4952, + "step": 5055 + }, + { + "epoch": 0.9733137618211132, + "grad_norm": 3.324191859105468, + "learning_rate": 1.0915378414659557e-05, + "loss": 1.5028, + "step": 5056 + }, + { + "epoch": 0.9733137618211132, + "lm_loss": 1.5365, + "step": 5056, + "vm_loss": 0.1786 + }, + { + "epoch": 0.9733137618211132, + "lm_loss": 1.4559, + "step": 5056, + "vm_loss": 0.1708 + }, + { + "epoch": 0.9733137618211132, + "lm_loss": 1.3338, + "step": 5056, + "vm_loss": 0.1885 + }, + { + "epoch": 0.9733137618211132, + "lm_loss": 1.6344, + "step": 5056, + "vm_loss": 0.1551 + }, + { + "epoch": 0.9733137618211132, + "lm_loss": 1.3296, + "step": 5056, + "vm_loss": 0.1101 + }, + { + "epoch": 0.9733137618211132, + "lm_loss": 1.3883, + "step": 5056, + "vm_loss": 0.1619 + }, + { + "epoch": 0.9733137618211132, + "lm_loss": 1.2136, + "step": 5056, + "vm_loss": 0.1771 + }, + { + "epoch": 0.9733137618211132, + "lm_loss": 1.4391, + "step": 5056, + "vm_loss": 0.1324 + }, + { + "epoch": 0.9735062684986886, + "grad_norm": 3.3766287783857307, + "learning_rate": 1.0912273563759402e-05, + "loss": 1.5543, + "step": 5057 + }, + { + "epoch": 0.9736987751762639, + "grad_norm": 3.272054564204573, + "learning_rate": 1.0909168624174584e-05, + "loss": 1.4719, + "step": 5058 + }, + { + "epoch": 0.9738912818538393, + "grad_norm": 3.055140788789017, + "learning_rate": 1.0906063596206946e-05, + "loss": 1.4603, + "step": 5059 + }, + { + "epoch": 0.9740837885314146, + "grad_norm": 3.1514589908614883, + "learning_rate": 1.0902958480158335e-05, + "loss": 1.5118, + "step": 5060 + }, + { + "epoch": 0.9742762952089901, + "grad_norm": 3.1827112557344504, + "learning_rate": 1.0899853276330605e-05, + "loss": 1.5618, + "step": 5061 + }, + { + "epoch": 0.9744688018865655, + "grad_norm": 3.0904213088786245, + "learning_rate": 1.0896747985025626e-05, + "loss": 1.5089, + "step": 5062 + }, + { + "epoch": 0.9746613085641408, + "grad_norm": 3.015654347582122, + "learning_rate": 1.0893642606545269e-05, + "loss": 1.4205, + "step": 5063 + }, + { + "epoch": 0.9748538152417162, + "grad_norm": 3.153906222085746, + "learning_rate": 1.0890537141191417e-05, + "loss": 1.4576, + "step": 5064 + }, + { + "epoch": 0.9748538152417162, + "lm_loss": 1.5905, + "step": 5064, + "vm_loss": 0.1518 + }, + { + "epoch": 0.9748538152417162, + "lm_loss": 0.5789, + "step": 5064, + "vm_loss": 0.1423 + }, + { + "epoch": 0.9748538152417162, + "lm_loss": 1.4248, + "step": 5064, + "vm_loss": 0.1129 + }, + { + "epoch": 0.9748538152417162, + "lm_loss": 0.9698, + "step": 5064, + "vm_loss": 0.1822 + }, + { + "epoch": 0.9748538152417162, + "lm_loss": 1.1604, + "step": 5064, + "vm_loss": 0.1381 + }, + { + "epoch": 0.9748538152417162, + "lm_loss": 1.3652, + "step": 5064, + "vm_loss": 0.1351 + }, + { + "epoch": 0.9748538152417162, + "lm_loss": 1.6127, + "step": 5064, + "vm_loss": 0.1493 + }, + { + "epoch": 0.9748538152417162, + "lm_loss": 1.0081, + "step": 5064, + "vm_loss": 0.1716 + }, + { + "epoch": 0.9750463219192915, + "grad_norm": 3.246451313208057, + "learning_rate": 1.0887431589265964e-05, + "loss": 1.5028, + "step": 5065 + }, + { + "epoch": 0.975238828596867, + "grad_norm": 3.04927508526419, + "learning_rate": 1.0884325951070804e-05, + "loss": 1.4178, + "step": 5066 + }, + { + "epoch": 0.9754313352744424, + "grad_norm": 3.277779996302365, + "learning_rate": 1.088122022690785e-05, + "loss": 1.5067, + "step": 5067 + }, + { + "epoch": 0.9756238419520177, + "grad_norm": 3.0843807309244653, + "learning_rate": 1.0878114417079013e-05, + "loss": 1.4937, + "step": 5068 + }, + { + "epoch": 0.9758163486295931, + "grad_norm": 3.064278174023516, + "learning_rate": 1.0875008521886223e-05, + "loss": 1.4749, + "step": 5069 + }, + { + "epoch": 0.9760088553071685, + "grad_norm": 3.1611696501653164, + "learning_rate": 1.0871902541631409e-05, + "loss": 1.4488, + "step": 5070 + }, + { + "epoch": 0.9762013619847438, + "grad_norm": 3.1971325522563454, + "learning_rate": 1.0868796476616516e-05, + "loss": 1.5234, + "step": 5071 + }, + { + "epoch": 0.9763938686623193, + "grad_norm": 3.1726027096461182, + "learning_rate": 1.0865690327143488e-05, + "loss": 1.506, + "step": 5072 + }, + { + "epoch": 0.9763938686623193, + "lm_loss": 1.4105, + "step": 5072, + "vm_loss": 0.1702 + }, + { + "epoch": 0.9763938686623193, + "lm_loss": 1.1371, + "step": 5072, + "vm_loss": 0.1654 + }, + { + "epoch": 0.9763938686623193, + "lm_loss": 1.5114, + "step": 5072, + "vm_loss": 0.1556 + }, + { + "epoch": 0.9763938686623193, + "lm_loss": 1.6416, + "step": 5072, + "vm_loss": 0.1577 + }, + { + "epoch": 0.9763938686623193, + "lm_loss": 1.6405, + "step": 5072, + "vm_loss": 0.1939 + }, + { + "epoch": 0.9763938686623193, + "lm_loss": 1.2148, + "step": 5072, + "vm_loss": 0.1702 + }, + { + "epoch": 0.9763938686623193, + "lm_loss": 1.204, + "step": 5072, + "vm_loss": 0.1968 + }, + { + "epoch": 0.9763938686623193, + "lm_loss": 1.2776, + "step": 5072, + "vm_loss": 0.1648 + }, + { + "epoch": 0.9765863753398946, + "grad_norm": 3.2158857158618477, + "learning_rate": 1.0862584093514285e-05, + "loss": 1.4987, + "step": 5073 + }, + { + "epoch": 0.97677888201747, + "grad_norm": 3.256693245507279, + "learning_rate": 1.0859477776030872e-05, + "loss": 1.4808, + "step": 5074 + }, + { + "epoch": 0.9769713886950454, + "grad_norm": 3.197236321317148, + "learning_rate": 1.0856371374995222e-05, + "loss": 1.5149, + "step": 5075 + }, + { + "epoch": 0.9771638953726207, + "grad_norm": 3.1370566335817274, + "learning_rate": 1.085326489070932e-05, + "loss": 1.54, + "step": 5076 + }, + { + "epoch": 0.9773564020501961, + "grad_norm": 3.160503226869559, + "learning_rate": 1.0850158323475152e-05, + "loss": 1.4793, + "step": 5077 + }, + { + "epoch": 0.9775489087277714, + "grad_norm": 3.183313832897267, + "learning_rate": 1.0847051673594724e-05, + "loss": 1.5053, + "step": 5078 + }, + { + "epoch": 0.9777414154053469, + "grad_norm": 3.118247783485568, + "learning_rate": 1.084394494137003e-05, + "loss": 1.5068, + "step": 5079 + }, + { + "epoch": 0.9779339220829223, + "grad_norm": 3.089225277455859, + "learning_rate": 1.0840838127103093e-05, + "loss": 1.4629, + "step": 5080 + }, + { + "epoch": 0.9779339220829223, + "lm_loss": 1.4794, + "step": 5080, + "vm_loss": 0.1503 + }, + { + "epoch": 0.9779339220829223, + "lm_loss": 1.2263, + "step": 5080, + "vm_loss": 0.179 + }, + { + "epoch": 0.9779339220829223, + "lm_loss": 2.344, + "step": 5080, + "vm_loss": 0.1984 + }, + { + "epoch": 0.9779339220829223, + "lm_loss": 1.3033, + "step": 5080, + "vm_loss": 0.1038 + }, + { + "epoch": 0.9779339220829223, + "lm_loss": 1.8235, + "step": 5080, + "vm_loss": 0.1119 + }, + { + "epoch": 0.9779339220829223, + "lm_loss": 0.925, + "step": 5080, + "vm_loss": 0.157 + }, + { + "epoch": 0.9779339220829223, + "lm_loss": 1.4049, + "step": 5080, + "vm_loss": 0.1471 + }, + { + "epoch": 0.9779339220829223, + "lm_loss": 2.0164, + "step": 5080, + "vm_loss": 0.1379 + }, + { + "epoch": 0.9781264287604976, + "grad_norm": 3.1444033522273056, + "learning_rate": 1.0837731231095934e-05, + "loss": 1.504, + "step": 5081 + }, + { + "epoch": 0.978318935438073, + "grad_norm": 3.2710760827059326, + "learning_rate": 1.0834624253650579e-05, + "loss": 1.5012, + "step": 5082 + }, + { + "epoch": 0.9785114421156483, + "grad_norm": 3.18729257021845, + "learning_rate": 1.083151719506907e-05, + "loss": 1.5112, + "step": 5083 + }, + { + "epoch": 0.9787039487932238, + "grad_norm": 3.148707069033162, + "learning_rate": 1.0828410055653451e-05, + "loss": 1.5381, + "step": 5084 + }, + { + "epoch": 0.9788964554707992, + "grad_norm": 3.160382129589161, + "learning_rate": 1.0825302835705776e-05, + "loss": 1.4748, + "step": 5085 + }, + { + "epoch": 0.9790889621483745, + "grad_norm": 3.2375711105249776, + "learning_rate": 1.0822195535528106e-05, + "loss": 1.4919, + "step": 5086 + }, + { + "epoch": 0.9792814688259499, + "grad_norm": 3.020912128934804, + "learning_rate": 1.081908815542251e-05, + "loss": 1.4839, + "step": 5087 + }, + { + "epoch": 0.9794739755035253, + "grad_norm": 3.0581526085618225, + "learning_rate": 1.0815980695691068e-05, + "loss": 1.4536, + "step": 5088 + }, + { + "epoch": 0.9794739755035253, + "lm_loss": 0.8236, + "step": 5088, + "vm_loss": 0.1375 + }, + { + "epoch": 0.9794739755035253, + "lm_loss": 1.5163, + "step": 5088, + "vm_loss": 0.1286 + }, + { + "epoch": 0.9794739755035253, + "lm_loss": 0.983, + "step": 5088, + "vm_loss": 0.1874 + }, + { + "epoch": 0.9794739755035253, + "lm_loss": 0.9062, + "step": 5088, + "vm_loss": 0.1722 + }, + { + "epoch": 0.9794739755035253, + "lm_loss": 1.8458, + "step": 5088, + "vm_loss": 0.1258 + }, + { + "epoch": 0.9794739755035253, + "lm_loss": 1.5874, + "step": 5088, + "vm_loss": 0.1815 + }, + { + "epoch": 0.9794739755035253, + "lm_loss": 1.6175, + "step": 5088, + "vm_loss": 0.1591 + }, + { + "epoch": 0.9794739755035253, + "lm_loss": 0.9154, + "step": 5088, + "vm_loss": 0.1521 + }, + { + "epoch": 0.9796664821811006, + "grad_norm": 3.1865684570246646, + "learning_rate": 1.0812873156635864e-05, + "loss": 1.4647, + "step": 5089 + }, + { + "epoch": 0.9798589888586761, + "grad_norm": 3.257020889524908, + "learning_rate": 1.080976553855899e-05, + "loss": 1.5563, + "step": 5090 + }, + { + "epoch": 0.9800514955362514, + "grad_norm": 3.2491158883260223, + "learning_rate": 1.0806657841762542e-05, + "loss": 1.5032, + "step": 5091 + }, + { + "epoch": 0.9802440022138268, + "grad_norm": 3.2575254063268955, + "learning_rate": 1.0803550066548634e-05, + "loss": 1.4804, + "step": 5092 + }, + { + "epoch": 0.9804365088914022, + "grad_norm": 3.116673690439317, + "learning_rate": 1.080044221321938e-05, + "loss": 1.5548, + "step": 5093 + }, + { + "epoch": 0.9806290155689775, + "grad_norm": 3.0864541256514744, + "learning_rate": 1.0797334282076898e-05, + "loss": 1.4788, + "step": 5094 + }, + { + "epoch": 0.980821522246553, + "grad_norm": 3.0842548618909973, + "learning_rate": 1.079422627342333e-05, + "loss": 1.4056, + "step": 5095 + }, + { + "epoch": 0.9810140289241283, + "grad_norm": 3.083940344140672, + "learning_rate": 1.0791118187560801e-05, + "loss": 1.4139, + "step": 5096 + }, + { + "epoch": 0.9810140289241283, + "lm_loss": 0.8556, + "step": 5096, + "vm_loss": 0.1483 + }, + { + "epoch": 0.9810140289241283, + "lm_loss": 1.2532, + "step": 5096, + "vm_loss": 0.1933 + }, + { + "epoch": 0.9810140289241283, + "lm_loss": 1.1542, + "step": 5096, + "vm_loss": 0.1992 + }, + { + "epoch": 0.9810140289241283, + "lm_loss": 1.4572, + "step": 5096, + "vm_loss": 0.1985 + }, + { + "epoch": 0.9810140289241283, + "lm_loss": 2.0382, + "step": 5096, + "vm_loss": 0.1687 + }, + { + "epoch": 0.9810140289241283, + "lm_loss": 1.1248, + "step": 5096, + "vm_loss": 0.2109 + }, + { + "epoch": 0.9810140289241283, + "lm_loss": 1.5335, + "step": 5096, + "vm_loss": 0.1273 + }, + { + "epoch": 0.9810140289241283, + "lm_loss": 1.4494, + "step": 5096, + "vm_loss": 0.1073 + }, + { + "epoch": 0.9812065356017037, + "grad_norm": 3.1219095160790324, + "learning_rate": 1.0788010024791465e-05, + "loss": 1.452, + "step": 5097 + }, + { + "epoch": 0.9813990422792791, + "grad_norm": 3.491937895268139, + "learning_rate": 1.0784901785417474e-05, + "loss": 1.481, + "step": 5098 + }, + { + "epoch": 0.9815915489568544, + "grad_norm": 3.3704852163736874, + "learning_rate": 1.0781793469740984e-05, + "loss": 1.498, + "step": 5099 + }, + { + "epoch": 0.9817840556344298, + "grad_norm": 3.229131580133439, + "learning_rate": 1.0778685078064172e-05, + "loss": 1.4592, + "step": 5100 + }, + { + "epoch": 0.9819765623120053, + "grad_norm": 3.0934918125098454, + "learning_rate": 1.0775576610689205e-05, + "loss": 1.4983, + "step": 5101 + }, + { + "epoch": 0.9821690689895806, + "grad_norm": 3.175488113458974, + "learning_rate": 1.077246806791827e-05, + "loss": 1.4828, + "step": 5102 + }, + { + "epoch": 0.982361575667156, + "grad_norm": 3.0483937907567658, + "learning_rate": 1.0769359450053553e-05, + "loss": 1.4747, + "step": 5103 + }, + { + "epoch": 0.9825540823447313, + "grad_norm": 3.094902182125298, + "learning_rate": 1.0766250757397256e-05, + "loss": 1.5029, + "step": 5104 + }, + { + "epoch": 0.9825540823447313, + "lm_loss": 1.6889, + "step": 5104, + "vm_loss": 0.1808 + }, + { + "epoch": 0.9825540823447313, + "lm_loss": 1.0876, + "step": 5104, + "vm_loss": 0.1515 + }, + { + "epoch": 0.9825540823447313, + "lm_loss": 1.4541, + "step": 5104, + "vm_loss": 0.1706 + }, + { + "epoch": 0.9825540823447313, + "lm_loss": 1.2922, + "step": 5104, + "vm_loss": 0.2192 + }, + { + "epoch": 0.9825540823447313, + "lm_loss": 1.6232, + "step": 5104, + "vm_loss": 0.1833 + }, + { + "epoch": 0.9825540823447313, + "lm_loss": 1.1543, + "step": 5104, + "vm_loss": 0.2176 + }, + { + "epoch": 0.9825540823447313, + "lm_loss": 1.5891, + "step": 5104, + "vm_loss": 0.2756 + }, + { + "epoch": 0.9825540823447313, + "lm_loss": 1.3451, + "step": 5104, + "vm_loss": 0.1246 + }, + { + "epoch": 0.9827465890223067, + "grad_norm": 3.1152313142280277, + "learning_rate": 1.0763141990251586e-05, + "loss": 1.4765, + "step": 5105 + }, + { + "epoch": 0.9829390956998821, + "grad_norm": 3.1994604687130623, + "learning_rate": 1.0760033148918747e-05, + "loss": 1.4895, + "step": 5106 + }, + { + "epoch": 0.9831316023774574, + "grad_norm": 3.280600630824215, + "learning_rate": 1.0756924233700963e-05, + "loss": 1.4686, + "step": 5107 + }, + { + "epoch": 0.9833241090550329, + "grad_norm": 3.23383752018144, + "learning_rate": 1.0753815244900459e-05, + "loss": 1.4649, + "step": 5108 + }, + { + "epoch": 0.9835166157326082, + "grad_norm": 3.2192692766516626, + "learning_rate": 1.0750706182819468e-05, + "loss": 1.5051, + "step": 5109 + }, + { + "epoch": 0.9837091224101836, + "grad_norm": 3.2542010151470926, + "learning_rate": 1.0747597047760236e-05, + "loss": 1.5459, + "step": 5110 + }, + { + "epoch": 0.983901629087759, + "grad_norm": 3.14894053192367, + "learning_rate": 1.0744487840025001e-05, + "loss": 1.4758, + "step": 5111 + }, + { + "epoch": 0.9840941357653343, + "grad_norm": 3.043445294354467, + "learning_rate": 1.0741378559916029e-05, + "loss": 1.5055, + "step": 5112 + }, + { + "epoch": 0.9840941357653343, + "lm_loss": 1.6875, + "step": 5112, + "vm_loss": 0.1517 + }, + { + "epoch": 0.9840941357653343, + "lm_loss": 1.4693, + "step": 5112, + "vm_loss": 0.1763 + }, + { + "epoch": 0.9840941357653343, + "lm_loss": 0.8308, + "step": 5112, + "vm_loss": 0.174 + }, + { + "epoch": 0.9840941357653343, + "lm_loss": 1.071, + "step": 5112, + "vm_loss": 0.1125 + }, + { + "epoch": 0.9840941357653343, + "lm_loss": 1.404, + "step": 5112, + "vm_loss": 0.1988 + }, + { + "epoch": 0.9840941357653343, + "lm_loss": 1.4635, + "step": 5112, + "vm_loss": 0.1884 + }, + { + "epoch": 0.9840941357653343, + "lm_loss": 1.0517, + "step": 5112, + "vm_loss": 0.2233 + }, + { + "epoch": 0.9840941357653343, + "lm_loss": 1.3387, + "step": 5112, + "vm_loss": 0.1997 + }, + { + "epoch": 0.9842866424429098, + "grad_norm": 3.273247435364176, + "learning_rate": 1.0738269207735573e-05, + "loss": 1.4612, + "step": 5113 + }, + { + "epoch": 0.9844791491204851, + "grad_norm": 3.0481708944911214, + "learning_rate": 1.0735159783785905e-05, + "loss": 1.4686, + "step": 5114 + }, + { + "epoch": 0.9846716557980605, + "grad_norm": 3.1539878163603987, + "learning_rate": 1.07320502883693e-05, + "loss": 1.4064, + "step": 5115 + }, + { + "epoch": 0.9848641624756359, + "grad_norm": 3.3934956241595144, + "learning_rate": 1.0728940721788041e-05, + "loss": 1.4699, + "step": 5116 + }, + { + "epoch": 0.9850566691532112, + "grad_norm": 3.282275792838578, + "learning_rate": 1.072583108434442e-05, + "loss": 1.47, + "step": 5117 + }, + { + "epoch": 0.9852491758307866, + "grad_norm": 3.2967793734759714, + "learning_rate": 1.072272137634073e-05, + "loss": 1.4415, + "step": 5118 + }, + { + "epoch": 0.9854416825083621, + "grad_norm": 3.275878257788189, + "learning_rate": 1.0719611598079278e-05, + "loss": 1.4493, + "step": 5119 + }, + { + "epoch": 0.9856341891859374, + "grad_norm": 3.26253730246854, + "learning_rate": 1.0716501749862369e-05, + "loss": 1.4776, + "step": 5120 + }, + { + "epoch": 0.9856341891859374, + "lm_loss": 1.2419, + "step": 5120, + "vm_loss": 0.1612 + }, + { + "epoch": 0.9856341891859374, + "lm_loss": 0.8267, + "step": 5120, + "vm_loss": 0.1626 + }, + { + "epoch": 0.9856341891859374, + "lm_loss": 0.7665, + "step": 5120, + "vm_loss": 0.1853 + }, + { + "epoch": 0.9856341891859374, + "lm_loss": 1.5169, + "step": 5120, + "vm_loss": 0.157 + }, + { + "epoch": 0.9856341891859374, + "lm_loss": 1.1724, + "step": 5120, + "vm_loss": 0.1532 + }, + { + "epoch": 0.9856341891859374, + "lm_loss": 1.3158, + "step": 5120, + "vm_loss": 0.2295 + }, + { + "epoch": 0.9856341891859374, + "lm_loss": 1.6846, + "step": 5120, + "vm_loss": 0.2286 + }, + { + "epoch": 0.9856341891859374, + "lm_loss": 1.248, + "step": 5120, + "vm_loss": 0.1563 + }, + { + "epoch": 0.9858266958635128, + "grad_norm": 3.2041187833608653, + "learning_rate": 1.0713391831992324e-05, + "loss": 1.4751, + "step": 5121 + }, + { + "epoch": 0.9860192025410881, + "grad_norm": 3.1781097369895694, + "learning_rate": 1.0710281844771469e-05, + "loss": 1.5337, + "step": 5122 + }, + { + "epoch": 0.9862117092186635, + "grad_norm": 3.043691286169084, + "learning_rate": 1.0707171788502129e-05, + "loss": 1.4937, + "step": 5123 + }, + { + "epoch": 0.986404215896239, + "grad_norm": 3.044828485902192, + "learning_rate": 1.0704061663486645e-05, + "loss": 1.4685, + "step": 5124 + }, + { + "epoch": 0.9865967225738143, + "grad_norm": 3.264003149302052, + "learning_rate": 1.0700951470027357e-05, + "loss": 1.5052, + "step": 5125 + }, + { + "epoch": 0.9867892292513897, + "grad_norm": 3.2005612480115397, + "learning_rate": 1.0697841208426617e-05, + "loss": 1.4816, + "step": 5126 + }, + { + "epoch": 0.986981735928965, + "grad_norm": 3.3183210382633277, + "learning_rate": 1.069473087898679e-05, + "loss": 1.5739, + "step": 5127 + }, + { + "epoch": 0.9871742426065404, + "grad_norm": 3.3945775008555152, + "learning_rate": 1.0691620482010227e-05, + "loss": 1.5584, + "step": 5128 + }, + { + "epoch": 0.9871742426065404, + "lm_loss": 0.9012, + "step": 5128, + "vm_loss": 0.2294 + }, + { + "epoch": 0.9871742426065404, + "lm_loss": 1.6728, + "step": 5128, + "vm_loss": 0.1636 + }, + { + "epoch": 0.9871742426065404, + "lm_loss": 1.6825, + "step": 5128, + "vm_loss": 0.2086 + }, + { + "epoch": 0.9871742426065404, + "lm_loss": 1.4753, + "step": 5128, + "vm_loss": 0.2063 + }, + { + "epoch": 0.9871742426065404, + "lm_loss": 1.7834, + "step": 5128, + "vm_loss": 0.1635 + }, + { + "epoch": 0.9871742426065404, + "lm_loss": 0.9395, + "step": 5128, + "vm_loss": 0.1394 + }, + { + "epoch": 0.9871742426065404, + "lm_loss": 1.5501, + "step": 5128, + "vm_loss": 0.1745 + }, + { + "epoch": 0.9871742426065404, + "lm_loss": 1.4829, + "step": 5128, + "vm_loss": 0.1129 + }, + { + "epoch": 0.9873667492841158, + "grad_norm": 3.100445528016157, + "learning_rate": 1.068851001779931e-05, + "loss": 1.5566, + "step": 5129 + }, + { + "epoch": 0.9875592559616911, + "grad_norm": 3.0946163510110374, + "learning_rate": 1.0685399486656407e-05, + "loss": 1.527, + "step": 5130 + }, + { + "epoch": 0.9877517626392666, + "grad_norm": 3.067301694698744, + "learning_rate": 1.0682288888883905e-05, + "loss": 1.5129, + "step": 5131 + }, + { + "epoch": 0.987944269316842, + "grad_norm": 3.1427012755918624, + "learning_rate": 1.0679178224784197e-05, + "loss": 1.4775, + "step": 5132 + }, + { + "epoch": 0.9881367759944173, + "grad_norm": 3.108769437991071, + "learning_rate": 1.0676067494659673e-05, + "loss": 1.4521, + "step": 5133 + }, + { + "epoch": 0.9883292826719927, + "grad_norm": 3.1054006768686877, + "learning_rate": 1.0672956698812741e-05, + "loss": 1.478, + "step": 5134 + }, + { + "epoch": 0.988521789349568, + "grad_norm": 3.0890832467756337, + "learning_rate": 1.066984583754581e-05, + "loss": 1.4382, + "step": 5135 + }, + { + "epoch": 0.9887142960271434, + "grad_norm": 3.232718631336568, + "learning_rate": 1.0666734911161292e-05, + "loss": 1.4512, + "step": 5136 + }, + { + "epoch": 0.9887142960271434, + "lm_loss": 1.6239, + "step": 5136, + "vm_loss": 0.1259 + }, + { + "epoch": 0.9887142960271434, + "lm_loss": 1.3203, + "step": 5136, + "vm_loss": 0.2441 + }, + { + "epoch": 0.9887142960271434, + "lm_loss": 1.3079, + "step": 5136, + "vm_loss": 0.1727 + }, + { + "epoch": 0.9887142960271434, + "lm_loss": 1.604, + "step": 5136, + "vm_loss": 0.1292 + }, + { + "epoch": 0.9887142960271434, + "lm_loss": 1.3442, + "step": 5136, + "vm_loss": 0.1976 + }, + { + "epoch": 0.9887142960271434, + "lm_loss": 1.2042, + "step": 5136, + "vm_loss": 0.1076 + }, + { + "epoch": 0.9887142960271434, + "lm_loss": 1.2235, + "step": 5136, + "vm_loss": 0.1346 + }, + { + "epoch": 0.9887142960271434, + "lm_loss": 1.1693, + "step": 5136, + "vm_loss": 0.1376 + }, + { + "epoch": 0.9889068027047189, + "grad_norm": 3.2850293869554874, + "learning_rate": 1.0663623919961608e-05, + "loss": 1.4995, + "step": 5137 + }, + { + "epoch": 0.9890993093822942, + "grad_norm": 3.1394844876116466, + "learning_rate": 1.066051286424919e-05, + "loss": 1.415, + "step": 5138 + }, + { + "epoch": 0.9892918160598696, + "grad_norm": 3.15755217535084, + "learning_rate": 1.0657401744326474e-05, + "loss": 1.4169, + "step": 5139 + }, + { + "epoch": 0.9894843227374449, + "grad_norm": 3.323771989734763, + "learning_rate": 1.0654290560495897e-05, + "loss": 1.4747, + "step": 5140 + }, + { + "epoch": 0.9896768294150203, + "grad_norm": 3.252469853828255, + "learning_rate": 1.0651179313059904e-05, + "loss": 1.4618, + "step": 5141 + }, + { + "epoch": 0.9898693360925958, + "grad_norm": 3.309364710083231, + "learning_rate": 1.0648068002320953e-05, + "loss": 1.5262, + "step": 5142 + }, + { + "epoch": 0.9900618427701711, + "grad_norm": 3.183395597115284, + "learning_rate": 1.06449566285815e-05, + "loss": 1.5184, + "step": 5143 + }, + { + "epoch": 0.9902543494477465, + "grad_norm": 3.152919906818271, + "learning_rate": 1.0641845192144013e-05, + "loss": 1.4453, + "step": 5144 + }, + { + "epoch": 0.9902543494477465, + "lm_loss": 1.7308, + "step": 5144, + "vm_loss": 0.1611 + }, + { + "epoch": 0.9902543494477465, + "lm_loss": 1.5433, + "step": 5144, + "vm_loss": 0.2146 + }, + { + "epoch": 0.9902543494477465, + "lm_loss": 1.4702, + "step": 5144, + "vm_loss": 0.1929 + }, + { + "epoch": 0.9902543494477465, + "lm_loss": 1.0467, + "step": 5144, + "vm_loss": 0.1762 + }, + { + "epoch": 0.9902543494477465, + "lm_loss": 1.1147, + "step": 5144, + "vm_loss": 0.1218 + }, + { + "epoch": 0.9902543494477465, + "lm_loss": 1.1266, + "step": 5144, + "vm_loss": 0.2445 + }, + { + "epoch": 0.9902543494477465, + "lm_loss": 1.4071, + "step": 5144, + "vm_loss": 0.1544 + }, + { + "epoch": 0.9902543494477465, + "lm_loss": 0.7921, + "step": 5144, + "vm_loss": 0.1703 + }, + { + "epoch": 0.9904468561253218, + "grad_norm": 3.1507326274152048, + "learning_rate": 1.063873369331096e-05, + "loss": 1.4981, + "step": 5145 + }, + { + "epoch": 0.9906393628028972, + "grad_norm": 3.1033121527883765, + "learning_rate": 1.0635622132384821e-05, + "loss": 1.508, + "step": 5146 + }, + { + "epoch": 0.9908318694804726, + "grad_norm": 3.0980205790453987, + "learning_rate": 1.0632510509668079e-05, + "loss": 1.4269, + "step": 5147 + }, + { + "epoch": 0.991024376158048, + "grad_norm": 3.262012530974657, + "learning_rate": 1.0629398825463223e-05, + "loss": 1.5174, + "step": 5148 + }, + { + "epoch": 0.9912168828356234, + "grad_norm": 3.139943289625485, + "learning_rate": 1.0626287080072747e-05, + "loss": 1.4301, + "step": 5149 + }, + { + "epoch": 0.9914093895131988, + "grad_norm": 3.255552846527656, + "learning_rate": 1.0623175273799158e-05, + "loss": 1.4833, + "step": 5150 + }, + { + "epoch": 0.9916018961907741, + "grad_norm": 3.3293575832645, + "learning_rate": 1.0620063406944956e-05, + "loss": 1.5011, + "step": 5151 + }, + { + "epoch": 0.9917944028683495, + "grad_norm": 3.17686847821999, + "learning_rate": 1.0616951479812658e-05, + "loss": 1.4806, + "step": 5152 + }, + { + "epoch": 0.9917944028683495, + "lm_loss": 1.3465, + "step": 5152, + "vm_loss": 0.1403 + }, + { + "epoch": 0.9917944028683495, + "lm_loss": 1.4512, + "step": 5152, + "vm_loss": 0.2219 + }, + { + "epoch": 0.9917944028683495, + "lm_loss": 0.9472, + "step": 5152, + "vm_loss": 0.1573 + }, + { + "epoch": 0.9917944028683495, + "lm_loss": 1.7034, + "step": 5152, + "vm_loss": 0.2329 + }, + { + "epoch": 0.9917944028683495, + "lm_loss": 0.9384, + "step": 5152, + "vm_loss": 0.1558 + }, + { + "epoch": 0.9917944028683495, + "lm_loss": 1.446, + "step": 5152, + "vm_loss": 0.1987 + }, + { + "epoch": 0.9917944028683495, + "lm_loss": 1.068, + "step": 5152, + "vm_loss": 0.2087 + }, + { + "epoch": 0.9917944028683495, + "lm_loss": 1.4045, + "step": 5152, + "vm_loss": 0.1459 + }, + { + "epoch": 0.9919869095459248, + "grad_norm": 3.1996735775395897, + "learning_rate": 1.0613839492704784e-05, + "loss": 1.4944, + "step": 5153 + }, + { + "epoch": 0.9921794162235003, + "grad_norm": 3.155055301257671, + "learning_rate": 1.0610727445923858e-05, + "loss": 1.3565, + "step": 5154 + }, + { + "epoch": 0.9923719229010757, + "grad_norm": 3.2128393370425674, + "learning_rate": 1.0607615339772412e-05, + "loss": 1.4292, + "step": 5155 + }, + { + "epoch": 0.992564429578651, + "grad_norm": 3.1610813521395653, + "learning_rate": 1.0604503174552982e-05, + "loss": 1.4889, + "step": 5156 + }, + { + "epoch": 0.9927569362562264, + "grad_norm": 3.2208654860446315, + "learning_rate": 1.0601390950568106e-05, + "loss": 1.5059, + "step": 5157 + }, + { + "epoch": 0.9929494429338017, + "grad_norm": 3.0908580941570305, + "learning_rate": 1.059827866812034e-05, + "loss": 1.503, + "step": 5158 + }, + { + "epoch": 0.9931419496113771, + "grad_norm": 3.1712177645381994, + "learning_rate": 1.059516632751223e-05, + "loss": 1.4529, + "step": 5159 + }, + { + "epoch": 0.9933344562889526, + "grad_norm": 3.2481163389266205, + "learning_rate": 1.059205392904634e-05, + "loss": 1.5239, + "step": 5160 + }, + { + "epoch": 0.9933344562889526, + "lm_loss": 1.1123, + "step": 5160, + "vm_loss": 0.1655 + }, + { + "epoch": 0.9933344562889526, + "lm_loss": 1.4215, + "step": 5160, + "vm_loss": 0.1819 + }, + { + "epoch": 0.9933344562889526, + "lm_loss": 1.8061, + "step": 5160, + "vm_loss": 0.1858 + }, + { + "epoch": 0.9933344562889526, + "lm_loss": 0.9452, + "step": 5160, + "vm_loss": 0.1162 + }, + { + "epoch": 0.9933344562889526, + "lm_loss": 1.2034, + "step": 5160, + "vm_loss": 0.1701 + }, + { + "epoch": 0.9933344562889526, + "lm_loss": 1.1324, + "step": 5160, + "vm_loss": 0.1201 + }, + { + "epoch": 0.9933344562889526, + "lm_loss": 1.3233, + "step": 5160, + "vm_loss": 0.1898 + }, + { + "epoch": 0.9933344562889526, + "lm_loss": 1.3686, + "step": 5160, + "vm_loss": 0.1474 + }, + { + "epoch": 0.9935269629665279, + "grad_norm": 3.0479866632881305, + "learning_rate": 1.0588941473025234e-05, + "loss": 1.4826, + "step": 5161 + }, + { + "epoch": 0.9937194696441033, + "grad_norm": 3.311144798373948, + "learning_rate": 1.0585828959751478e-05, + "loss": 1.5256, + "step": 5162 + }, + { + "epoch": 0.9939119763216786, + "grad_norm": 3.316950033865326, + "learning_rate": 1.0582716389527659e-05, + "loss": 1.4482, + "step": 5163 + }, + { + "epoch": 0.994104482999254, + "grad_norm": 3.1207980520784546, + "learning_rate": 1.0579603762656347e-05, + "loss": 1.4965, + "step": 5164 + }, + { + "epoch": 0.9942969896768294, + "grad_norm": 3.1699648952579875, + "learning_rate": 1.0576491079440137e-05, + "loss": 1.4857, + "step": 5165 + }, + { + "epoch": 0.9944894963544048, + "grad_norm": 3.231213567919864, + "learning_rate": 1.0573378340181614e-05, + "loss": 1.4901, + "step": 5166 + }, + { + "epoch": 0.9946820030319802, + "grad_norm": 3.071529170750346, + "learning_rate": 1.0570265545183382e-05, + "loss": 1.396, + "step": 5167 + }, + { + "epoch": 0.9948745097095556, + "grad_norm": 3.130870206394583, + "learning_rate": 1.0567152694748046e-05, + "loss": 1.4962, + "step": 5168 + }, + { + "epoch": 0.9948745097095556, + "lm_loss": 1.4961, + "step": 5168, + "vm_loss": 0.1937 + }, + { + "epoch": 0.9948745097095556, + "lm_loss": 0.9975, + "step": 5168, + "vm_loss": 0.1887 + }, + { + "epoch": 0.9948745097095556, + "lm_loss": 1.04, + "step": 5168, + "vm_loss": 0.1851 + }, + { + "epoch": 0.9948745097095556, + "lm_loss": 1.3609, + "step": 5168, + "vm_loss": 0.098 + }, + { + "epoch": 0.9948745097095556, + "lm_loss": 1.1595, + "step": 5168, + "vm_loss": 0.2534 + }, + { + "epoch": 0.9948745097095556, + "lm_loss": 1.3601, + "step": 5168, + "vm_loss": 0.1441 + }, + { + "epoch": 0.9948745097095556, + "lm_loss": 1.3202, + "step": 5168, + "vm_loss": 0.2056 + }, + { + "epoch": 0.9948745097095556, + "lm_loss": 1.0364, + "step": 5168, + "vm_loss": 0.158 + }, + { + "epoch": 0.9950670163871309, + "grad_norm": 2.933246845790367, + "learning_rate": 1.0564039789178211e-05, + "loss": 1.4263, + "step": 5169 + }, + { + "epoch": 0.9952595230647063, + "grad_norm": 3.1811132991656295, + "learning_rate": 1.056092682877649e-05, + "loss": 1.5109, + "step": 5170 + }, + { + "epoch": 0.9954520297422816, + "grad_norm": 3.123891655430106, + "learning_rate": 1.0557813813845508e-05, + "loss": 1.3532, + "step": 5171 + }, + { + "epoch": 0.9956445364198571, + "grad_norm": 3.2588478318132807, + "learning_rate": 1.0554700744687885e-05, + "loss": 1.4477, + "step": 5172 + }, + { + "epoch": 0.9958370430974325, + "grad_norm": 3.249772397889448, + "learning_rate": 1.0551587621606253e-05, + "loss": 1.4698, + "step": 5173 + }, + { + "epoch": 0.9960295497750078, + "grad_norm": 3.23628835620586, + "learning_rate": 1.0548474444903247e-05, + "loss": 1.5267, + "step": 5174 + }, + { + "epoch": 0.9962220564525832, + "grad_norm": 3.2840124788427523, + "learning_rate": 1.054536121488151e-05, + "loss": 1.4961, + "step": 5175 + }, + { + "epoch": 0.9964145631301585, + "grad_norm": 3.1689419482787464, + "learning_rate": 1.0542247931843684e-05, + "loss": 1.4285, + "step": 5176 + }, + { + "epoch": 0.9964145631301585, + "lm_loss": 0.7649, + "step": 5176, + "vm_loss": 0.2461 + }, + { + "epoch": 0.9964145631301585, + "lm_loss": 1.8616, + "step": 5176, + "vm_loss": 0.1332 + }, + { + "epoch": 0.9964145631301585, + "lm_loss": 1.1569, + "step": 5176, + "vm_loss": 0.204 + }, + { + "epoch": 0.9964145631301585, + "lm_loss": 0.8808, + "step": 5176, + "vm_loss": 0.1462 + }, + { + "epoch": 0.9964145631301585, + "lm_loss": 1.2465, + "step": 5176, + "vm_loss": 0.1164 + }, + { + "epoch": 0.9964145631301585, + "lm_loss": 1.4974, + "step": 5176, + "vm_loss": 0.1817 + }, + { + "epoch": 0.9964145631301585, + "lm_loss": 2.4756, + "step": 5176, + "vm_loss": 0.0899 + }, + { + "epoch": 0.9964145631301585, + "lm_loss": 1.595, + "step": 5176, + "vm_loss": 0.0963 + }, + { + "epoch": 0.996607069807734, + "grad_norm": 3.0695706645997234, + "learning_rate": 1.0539134596092423e-05, + "loss": 1.4913, + "step": 5177 + }, + { + "epoch": 0.9967995764853094, + "grad_norm": 3.039550839618325, + "learning_rate": 1.0536021207930381e-05, + "loss": 1.4326, + "step": 5178 + }, + { + "epoch": 0.9969920831628847, + "grad_norm": 3.110007854319775, + "learning_rate": 1.0532907767660218e-05, + "loss": 1.4257, + "step": 5179 + }, + { + "epoch": 0.9971845898404601, + "grad_norm": 3.1108761420049773, + "learning_rate": 1.0529794275584606e-05, + "loss": 1.4324, + "step": 5180 + }, + { + "epoch": 0.9973770965180355, + "grad_norm": 3.3111678742001107, + "learning_rate": 1.0526680732006209e-05, + "loss": 1.5098, + "step": 5181 + }, + { + "epoch": 0.9975696031956108, + "grad_norm": 3.30252037218265, + "learning_rate": 1.0523567137227708e-05, + "loss": 1.4985, + "step": 5182 + }, + { + "epoch": 0.9977621098731863, + "grad_norm": 3.326749085967109, + "learning_rate": 1.0520453491551783e-05, + "loss": 1.4911, + "step": 5183 + }, + { + "epoch": 0.9979546165507616, + "grad_norm": 3.1907896097177093, + "learning_rate": 1.0517339795281118e-05, + "loss": 1.3981, + "step": 5184 + }, + { + "epoch": 0.9979546165507616, + "lm_loss": 1.1123, + "step": 5184, + "vm_loss": 0.1635 + }, + { + "epoch": 0.9979546165507616, + "lm_loss": 1.6794, + "step": 5184, + "vm_loss": 0.1264 + }, + { + "epoch": 0.9979546165507616, + "lm_loss": 1.7063, + "step": 5184, + "vm_loss": 0.1555 + }, + { + "epoch": 0.9979546165507616, + "lm_loss": 1.3821, + "step": 5184, + "vm_loss": 0.1758 + }, + { + "epoch": 0.9979546165507616, + "lm_loss": 1.7377, + "step": 5184, + "vm_loss": 0.1296 + }, + { + "epoch": 0.9979546165507616, + "lm_loss": 1.4997, + "step": 5184, + "vm_loss": 0.1186 + }, + { + "epoch": 0.9979546165507616, + "lm_loss": 1.0181, + "step": 5184, + "vm_loss": 0.1181 + }, + { + "epoch": 0.9979546165507616, + "lm_loss": 1.3014, + "step": 5184, + "vm_loss": 0.158 + }, + { + "epoch": 0.998147123228337, + "grad_norm": 3.0400737235446122, + "learning_rate": 1.051422604871841e-05, + "loss": 1.4413, + "step": 5185 + }, + { + "epoch": 0.9983396299059124, + "grad_norm": 3.038496730884325, + "learning_rate": 1.051111225216635e-05, + "loss": 1.4707, + "step": 5186 + }, + { + "epoch": 0.9985321365834877, + "grad_norm": 3.1326005174540743, + "learning_rate": 1.0507998405927638e-05, + "loss": 1.4119, + "step": 5187 + }, + { + "epoch": 0.9987246432610631, + "grad_norm": 3.2447187292621544, + "learning_rate": 1.0504884510304982e-05, + "loss": 1.4558, + "step": 5188 + }, + { + "epoch": 0.9989171499386384, + "grad_norm": 3.1455350352179012, + "learning_rate": 1.0501770565601093e-05, + "loss": 1.4679, + "step": 5189 + }, + { + "epoch": 0.9991096566162139, + "grad_norm": 3.2838123218271376, + "learning_rate": 1.0498656572118687e-05, + "loss": 1.488, + "step": 5190 + }, + { + "epoch": 0.9993021632937893, + "grad_norm": 3.2137620754875096, + "learning_rate": 1.0495542530160481e-05, + "loss": 1.4699, + "step": 5191 + }, + { + "epoch": 0.9994946699713646, + "grad_norm": 3.0667918137777845, + "learning_rate": 1.04924284400292e-05, + "loss": 1.4652, + "step": 5192 + }, + { + "epoch": 0.9994946699713646, + "lm_loss": 1.365, + "step": 5192, + "vm_loss": 0.1908 + }, + { + "epoch": 0.9994946699713646, + "lm_loss": 1.5673, + "step": 5192, + "vm_loss": 0.1667 + }, + { + "epoch": 0.9994946699713646, + "lm_loss": 1.4192, + "step": 5192, + "vm_loss": 0.1939 + }, + { + "epoch": 0.9994946699713646, + "lm_loss": 1.2579, + "step": 5192, + "vm_loss": 0.2228 + }, + { + "epoch": 0.9994946699713646, + "lm_loss": 0.6447, + "step": 5192, + "vm_loss": 0.1505 + }, + { + "epoch": 0.9994946699713646, + "lm_loss": 1.5067, + "step": 5192, + "vm_loss": 0.1803 + }, + { + "epoch": 0.9994946699713646, + "lm_loss": 1.3401, + "step": 5192, + "vm_loss": 0.1556 + }, + { + "epoch": 0.9994946699713646, + "lm_loss": 1.6762, + "step": 5192, + "vm_loss": 0.1309 + }, + { + "epoch": 0.99968717664894, + "grad_norm": 3.1994135536860604, + "learning_rate": 1.0489314302027576e-05, + "loss": 1.4936, + "step": 5193 + }, + { + "epoch": 0.9998796833265153, + "grad_norm": 3.22642452404787, + "learning_rate": 1.0486200116458343e-05, + "loss": 1.5552, + "step": 5194 + }, + { + "epoch": 1.0000721900040908, + "grad_norm": 3.162386536178728, + "learning_rate": 1.0483085883624236e-05, + "loss": 1.4202, + "step": 5195 + }, + { + "epoch": 1.0002646966816662, + "grad_norm": 3.179228071581397, + "learning_rate": 1.0479971603828001e-05, + "loss": 1.4598, + "step": 5196 + }, + { + "epoch": 1.0004572033592416, + "grad_norm": 3.184312446038585, + "learning_rate": 1.0476857277372387e-05, + "loss": 1.4779, + "step": 5197 + }, + { + "epoch": 1.0006497100368168, + "grad_norm": 3.157349199681182, + "learning_rate": 1.0473742904560143e-05, + "loss": 1.472, + "step": 5198 + }, + { + "epoch": 1.0008422167143922, + "grad_norm": 3.0979039251442155, + "learning_rate": 1.047062848569403e-05, + "loss": 1.4494, + "step": 5199 + }, + { + "epoch": 1.0010347233919676, + "grad_norm": 3.112808485950145, + "learning_rate": 1.0467514021076805e-05, + "loss": 1.3441, + "step": 5200 + }, + { + "epoch": 1.0010347233919676, + "lm_loss": 1.0705, + "step": 5200, + "vm_loss": 0.1427 + }, + { + "epoch": 1.0010347233919676, + "lm_loss": 1.3485, + "step": 5200, + "vm_loss": 0.1524 + }, + { + "epoch": 1.0010347233919676, + "lm_loss": 1.2488, + "step": 5200, + "vm_loss": 0.182 + }, + { + "epoch": 1.0010347233919676, + "lm_loss": 1.5624, + "step": 5200, + "vm_loss": 0.2688 + }, + { + "epoch": 1.0010347233919676, + "lm_loss": 1.4236, + "step": 5200, + "vm_loss": 0.2156 + }, + { + "epoch": 1.0010347233919676, + "lm_loss": 1.4954, + "step": 5200, + "vm_loss": 0.164 + }, + { + "epoch": 1.0010347233919676, + "lm_loss": 1.5853, + "step": 5200, + "vm_loss": 0.2351 + }, + { + "epoch": 1.0010347233919676, + "lm_loss": 1.7917, + "step": 5200, + "vm_loss": 0.156 + }, + { + "epoch": 1.001227230069543, + "grad_norm": 3.1661471596884248, + "learning_rate": 1.0464399511011236e-05, + "loss": 1.4379, + "step": 5201 + }, + { + "epoch": 1.0014197367471185, + "grad_norm": 3.496955929590711, + "learning_rate": 1.0461284955800094e-05, + "loss": 1.4888, + "step": 5202 + }, + { + "epoch": 1.001612243424694, + "grad_norm": 3.265446181662951, + "learning_rate": 1.0458170355746154e-05, + "loss": 1.4981, + "step": 5203 + }, + { + "epoch": 1.001804750102269, + "grad_norm": 3.0717926778926294, + "learning_rate": 1.0455055711152194e-05, + "loss": 1.3417, + "step": 5204 + }, + { + "epoch": 1.0019972567798445, + "grad_norm": 3.083236944411091, + "learning_rate": 1.0451941022320993e-05, + "loss": 1.3909, + "step": 5205 + }, + { + "epoch": 1.00218976345742, + "grad_norm": 3.133559481934841, + "learning_rate": 1.0448826289555345e-05, + "loss": 1.4288, + "step": 5206 + }, + { + "epoch": 1.0023822701349954, + "grad_norm": 3.1973796323832393, + "learning_rate": 1.0445711513158039e-05, + "loss": 1.3973, + "step": 5207 + }, + { + "epoch": 1.0025747768125708, + "grad_norm": 3.2553336060458156, + "learning_rate": 1.0442596693431872e-05, + "loss": 1.4556, + "step": 5208 + }, + { + "epoch": 1.0025747768125708, + "lm_loss": 1.0739, + "step": 5208, + "vm_loss": 0.1908 + }, + { + "epoch": 1.0025747768125708, + "lm_loss": 1.4179, + "step": 5208, + "vm_loss": 0.2097 + }, + { + "epoch": 1.0025747768125708, + "lm_loss": 1.2275, + "step": 5208, + "vm_loss": 0.1939 + }, + { + "epoch": 1.0025747768125708, + "lm_loss": 0.7862, + "step": 5208, + "vm_loss": 0.1552 + }, + { + "epoch": 1.0025747768125708, + "lm_loss": 0.9683, + "step": 5208, + "vm_loss": 0.1985 + }, + { + "epoch": 1.0025747768125708, + "lm_loss": 1.2458, + "step": 5208, + "vm_loss": 0.171 + }, + { + "epoch": 1.0025747768125708, + "lm_loss": 1.0941, + "step": 5208, + "vm_loss": 0.2062 + }, + { + "epoch": 1.0025747768125708, + "lm_loss": 0.6557, + "step": 5208, + "vm_loss": 0.2039 + }, + { + "epoch": 1.002767283490146, + "grad_norm": 3.1589191316239043, + "learning_rate": 1.0439481830679645e-05, + "loss": 1.3939, + "step": 5209 + }, + { + "epoch": 1.0029597901677214, + "grad_norm": 3.2808017576085167, + "learning_rate": 1.043636692520416e-05, + "loss": 1.5028, + "step": 5210 + }, + { + "epoch": 1.0031522968452968, + "grad_norm": 3.566053230283462, + "learning_rate": 1.0433251977308227e-05, + "loss": 1.4628, + "step": 5211 + }, + { + "epoch": 1.0033448035228723, + "grad_norm": 3.221899998514581, + "learning_rate": 1.0430136987294661e-05, + "loss": 1.4336, + "step": 5212 + }, + { + "epoch": 1.0035373102004477, + "grad_norm": 3.1795297123867585, + "learning_rate": 1.0427021955466274e-05, + "loss": 1.3693, + "step": 5213 + }, + { + "epoch": 1.0037298168780229, + "grad_norm": 3.1485205426822103, + "learning_rate": 1.042390688212589e-05, + "loss": 1.3793, + "step": 5214 + }, + { + "epoch": 1.0039223235555983, + "grad_norm": 3.0701711755441674, + "learning_rate": 1.0420791767576334e-05, + "loss": 1.4546, + "step": 5215 + }, + { + "epoch": 1.0041148302331737, + "grad_norm": 3.0752007514669315, + "learning_rate": 1.0417676612120438e-05, + "loss": 1.4924, + "step": 5216 + }, + { + "epoch": 1.0041148302331737, + "lm_loss": 1.0657, + "step": 5216, + "vm_loss": 0.1795 + }, + { + "epoch": 1.0041148302331737, + "lm_loss": 1.1886, + "step": 5216, + "vm_loss": 0.1581 + }, + { + "epoch": 1.0041148302331737, + "lm_loss": 0.9246, + "step": 5216, + "vm_loss": 0.1904 + }, + { + "epoch": 1.0041148302331737, + "lm_loss": 1.2511, + "step": 5216, + "vm_loss": 0.1519 + }, + { + "epoch": 1.0041148302331737, + "lm_loss": 1.5134, + "step": 5216, + "vm_loss": 0.1541 + }, + { + "epoch": 1.0041148302331737, + "lm_loss": 0.6979, + "step": 5216, + "vm_loss": 0.1653 + }, + { + "epoch": 1.0041148302331737, + "lm_loss": 1.0754, + "step": 5216, + "vm_loss": 0.1309 + }, + { + "epoch": 1.0041148302331737, + "lm_loss": 1.2109, + "step": 5216, + "vm_loss": 0.1701 + }, + { + "epoch": 1.0043073369107491, + "grad_norm": 3.0531474288076037, + "learning_rate": 1.0414561416061027e-05, + "loss": 1.3911, + "step": 5217 + }, + { + "epoch": 1.0044998435883246, + "grad_norm": 3.2132300977004653, + "learning_rate": 1.0411446179700944e-05, + "loss": 1.5404, + "step": 5218 + }, + { + "epoch": 1.0046923502658998, + "grad_norm": 3.0146976069540776, + "learning_rate": 1.0408330903343029e-05, + "loss": 1.4114, + "step": 5219 + }, + { + "epoch": 1.0048848569434752, + "grad_norm": 3.171097654113657, + "learning_rate": 1.0405215587290126e-05, + "loss": 1.4515, + "step": 5220 + }, + { + "epoch": 1.0050773636210506, + "grad_norm": 3.2455155961267215, + "learning_rate": 1.0402100231845084e-05, + "loss": 1.4366, + "step": 5221 + }, + { + "epoch": 1.005269870298626, + "grad_norm": 3.22901564982213, + "learning_rate": 1.0398984837310755e-05, + "loss": 1.4834, + "step": 5222 + }, + { + "epoch": 1.0054623769762014, + "grad_norm": 3.3496168743749046, + "learning_rate": 1.0395869403989998e-05, + "loss": 1.468, + "step": 5223 + }, + { + "epoch": 1.0056548836537766, + "grad_norm": 3.2536169087370728, + "learning_rate": 1.0392753932185667e-05, + "loss": 1.4217, + "step": 5224 + }, + { + "epoch": 1.0056548836537766, + "lm_loss": 1.8638, + "step": 5224, + "vm_loss": 0.1703 + }, + { + "epoch": 1.0056548836537766, + "lm_loss": 0.9168, + "step": 5224, + "vm_loss": 0.1601 + }, + { + "epoch": 1.0056548836537766, + "lm_loss": 1.3041, + "step": 5224, + "vm_loss": 0.1166 + }, + { + "epoch": 1.0056548836537766, + "lm_loss": 1.2719, + "step": 5224, + "vm_loss": 0.1278 + }, + { + "epoch": 1.0056548836537766, + "lm_loss": 1.4, + "step": 5224, + "vm_loss": 0.1748 + }, + { + "epoch": 1.0056548836537766, + "lm_loss": 1.0455, + "step": 5224, + "vm_loss": 0.1458 + }, + { + "epoch": 1.0056548836537766, + "lm_loss": 1.0459, + "step": 5224, + "vm_loss": 0.1677 + }, + { + "epoch": 1.0056548836537766, + "lm_loss": 1.2135, + "step": 5224, + "vm_loss": 0.2154 + }, + { + "epoch": 1.005847390331352, + "grad_norm": 3.2048423208939028, + "learning_rate": 1.0389638422200627e-05, + "loss": 1.3887, + "step": 5225 + }, + { + "epoch": 1.0060398970089275, + "grad_norm": 3.0113042110121984, + "learning_rate": 1.0386522874337752e-05, + "loss": 1.3036, + "step": 5226 + }, + { + "epoch": 1.006232403686503, + "grad_norm": 3.072168645649257, + "learning_rate": 1.0383407288899904e-05, + "loss": 1.4374, + "step": 5227 + }, + { + "epoch": 1.0064249103640783, + "grad_norm": 3.118892777007722, + "learning_rate": 1.0380291666189968e-05, + "loss": 1.4333, + "step": 5228 + }, + { + "epoch": 1.0066174170416535, + "grad_norm": 3.264699659391858, + "learning_rate": 1.0377176006510812e-05, + "loss": 1.4752, + "step": 5229 + }, + { + "epoch": 1.006809923719229, + "grad_norm": 3.2665980015175116, + "learning_rate": 1.0374060310165324e-05, + "loss": 1.4826, + "step": 5230 + }, + { + "epoch": 1.0070024303968044, + "grad_norm": 3.155272863651309, + "learning_rate": 1.0370944577456388e-05, + "loss": 1.4027, + "step": 5231 + }, + { + "epoch": 1.0071949370743798, + "grad_norm": 3.150861512859124, + "learning_rate": 1.0367828808686892e-05, + "loss": 1.4328, + "step": 5232 + }, + { + "epoch": 1.0071949370743798, + "lm_loss": 1.3273, + "step": 5232, + "vm_loss": 0.2 + }, + { + "epoch": 1.0071949370743798, + "lm_loss": 1.3858, + "step": 5232, + "vm_loss": 0.1637 + }, + { + "epoch": 1.0071949370743798, + "lm_loss": 1.6511, + "step": 5232, + "vm_loss": 0.2237 + }, + { + "epoch": 1.0071949370743798, + "lm_loss": 1.338, + "step": 5232, + "vm_loss": 0.1812 + }, + { + "epoch": 1.0071949370743798, + "lm_loss": 1.4826, + "step": 5232, + "vm_loss": 0.1449 + }, + { + "epoch": 1.0071949370743798, + "lm_loss": 1.6167, + "step": 5232, + "vm_loss": 0.1457 + }, + { + "epoch": 1.0071949370743798, + "lm_loss": 0.8488, + "step": 5232, + "vm_loss": 0.1441 + }, + { + "epoch": 1.0071949370743798, + "lm_loss": 1.508, + "step": 5232, + "vm_loss": 0.1258 + }, + { + "epoch": 1.0073874437519552, + "grad_norm": 3.359076758798513, + "learning_rate": 1.0364713004159733e-05, + "loss": 1.4635, + "step": 5233 + }, + { + "epoch": 1.0075799504295304, + "grad_norm": 3.1923613989881563, + "learning_rate": 1.0361597164177802e-05, + "loss": 1.4511, + "step": 5234 + }, + { + "epoch": 1.0077724571071058, + "grad_norm": 3.1955137645426013, + "learning_rate": 1.0358481289044002e-05, + "loss": 1.438, + "step": 5235 + }, + { + "epoch": 1.0079649637846813, + "grad_norm": 3.113364889754319, + "learning_rate": 1.035536537906123e-05, + "loss": 1.4117, + "step": 5236 + }, + { + "epoch": 1.0081574704622567, + "grad_norm": 3.0817782710398642, + "learning_rate": 1.03522494345324e-05, + "loss": 1.4212, + "step": 5237 + }, + { + "epoch": 1.008349977139832, + "grad_norm": 3.04033494996302, + "learning_rate": 1.034913345576042e-05, + "loss": 1.4207, + "step": 5238 + }, + { + "epoch": 1.0085424838174075, + "grad_norm": 3.317204646936617, + "learning_rate": 1.03460174430482e-05, + "loss": 1.461, + "step": 5239 + }, + { + "epoch": 1.0087349904949827, + "grad_norm": 3.091562523290644, + "learning_rate": 1.0342901396698658e-05, + "loss": 1.4214, + "step": 5240 + }, + { + "epoch": 1.0087349904949827, + "lm_loss": 1.1824, + "step": 5240, + "vm_loss": 0.1834 + }, + { + "epoch": 1.0087349904949827, + "lm_loss": 1.258, + "step": 5240, + "vm_loss": 0.2047 + }, + { + "epoch": 1.0087349904949827, + "lm_loss": 1.4302, + "step": 5240, + "vm_loss": 0.1716 + }, + { + "epoch": 1.0087349904949827, + "lm_loss": 1.3365, + "step": 5240, + "vm_loss": 0.1766 + }, + { + "epoch": 1.0087349904949827, + "lm_loss": 0.6611, + "step": 5240, + "vm_loss": 0.125 + }, + { + "epoch": 1.0087349904949827, + "lm_loss": 1.0134, + "step": 5240, + "vm_loss": 0.143 + }, + { + "epoch": 1.0087349904949827, + "lm_loss": 1.2141, + "step": 5240, + "vm_loss": 0.2307 + }, + { + "epoch": 1.0087349904949827, + "lm_loss": 0.7424, + "step": 5240, + "vm_loss": 0.1611 + }, + { + "epoch": 1.0089274971725581, + "grad_norm": 3.187239188299685, + "learning_rate": 1.0339785317014713e-05, + "loss": 1.4711, + "step": 5241 + }, + { + "epoch": 1.0091200038501336, + "grad_norm": 3.156670805972283, + "learning_rate": 1.0336669204299286e-05, + "loss": 1.3994, + "step": 5242 + }, + { + "epoch": 1.009312510527709, + "grad_norm": 3.482825333960884, + "learning_rate": 1.0333553058855309e-05, + "loss": 1.5643, + "step": 5243 + }, + { + "epoch": 1.0095050172052844, + "grad_norm": 3.193519212219886, + "learning_rate": 1.0330436880985704e-05, + "loss": 1.465, + "step": 5244 + }, + { + "epoch": 1.0096975238828596, + "grad_norm": 3.119312141286633, + "learning_rate": 1.0327320670993409e-05, + "loss": 1.4434, + "step": 5245 + }, + { + "epoch": 1.009890030560435, + "grad_norm": 3.027848459519069, + "learning_rate": 1.0324204429181355e-05, + "loss": 1.4603, + "step": 5246 + }, + { + "epoch": 1.0100825372380104, + "grad_norm": 3.12083207215003, + "learning_rate": 1.0321088155852484e-05, + "loss": 1.4209, + "step": 5247 + }, + { + "epoch": 1.0102750439155859, + "grad_norm": 3.1428892058115285, + "learning_rate": 1.0317971851309736e-05, + "loss": 1.4654, + "step": 5248 + }, + { + "epoch": 1.0102750439155859, + "lm_loss": 1.0534, + "step": 5248, + "vm_loss": 0.106 + }, + { + "epoch": 1.0102750439155859, + "lm_loss": 1.0712, + "step": 5248, + "vm_loss": 0.1868 + }, + { + "epoch": 1.0102750439155859, + "lm_loss": 0.8561, + "step": 5248, + "vm_loss": 0.146 + }, + { + "epoch": 1.0102750439155859, + "lm_loss": 0.9943, + "step": 5248, + "vm_loss": 0.1799 + }, + { + "epoch": 1.0102750439155859, + "lm_loss": 1.2064, + "step": 5248, + "vm_loss": 0.2008 + }, + { + "epoch": 1.0102750439155859, + "lm_loss": 0.8936, + "step": 5248, + "vm_loss": 0.1779 + }, + { + "epoch": 1.0102750439155859, + "lm_loss": 1.7405, + "step": 5248, + "vm_loss": 0.1481 + }, + { + "epoch": 1.0102750439155859, + "lm_loss": 1.2537, + "step": 5248, + "vm_loss": 0.1707 + }, + { + "epoch": 1.0104675505931613, + "grad_norm": 3.1568219977954275, + "learning_rate": 1.0314855515856055e-05, + "loss": 1.407, + "step": 5249 + }, + { + "epoch": 1.0106600572707365, + "grad_norm": 3.0545804169416173, + "learning_rate": 1.0311739149794394e-05, + "loss": 1.3806, + "step": 5250 + }, + { + "epoch": 1.010852563948312, + "grad_norm": 3.2868937988084896, + "learning_rate": 1.0308622753427692e-05, + "loss": 1.4317, + "step": 5251 + }, + { + "epoch": 1.0110450706258873, + "grad_norm": 3.279832328674485, + "learning_rate": 1.0305506327058916e-05, + "loss": 1.4319, + "step": 5252 + }, + { + "epoch": 1.0112375773034628, + "grad_norm": 3.1806904948089523, + "learning_rate": 1.0302389870991015e-05, + "loss": 1.3886, + "step": 5253 + }, + { + "epoch": 1.0114300839810382, + "grad_norm": 3.245406812470036, + "learning_rate": 1.0299273385526949e-05, + "loss": 1.4277, + "step": 5254 + }, + { + "epoch": 1.0116225906586134, + "grad_norm": 3.220546266751897, + "learning_rate": 1.0296156870969681e-05, + "loss": 1.4215, + "step": 5255 + }, + { + "epoch": 1.0118150973361888, + "grad_norm": 3.183225485454342, + "learning_rate": 1.0293040327622175e-05, + "loss": 1.4502, + "step": 5256 + }, + { + "epoch": 1.0118150973361888, + "lm_loss": 1.5459, + "step": 5256, + "vm_loss": 0.1959 + }, + { + "epoch": 1.0118150973361888, + "lm_loss": 0.8177, + "step": 5256, + "vm_loss": 0.1649 + }, + { + "epoch": 1.0118150973361888, + "lm_loss": 1.3076, + "step": 5256, + "vm_loss": 0.1692 + }, + { + "epoch": 1.0118150973361888, + "lm_loss": 1.0591, + "step": 5256, + "vm_loss": 0.1429 + }, + { + "epoch": 1.0118150973361888, + "lm_loss": 1.1081, + "step": 5256, + "vm_loss": 0.0943 + }, + { + "epoch": 1.0118150973361888, + "lm_loss": 1.2002, + "step": 5256, + "vm_loss": 0.1575 + }, + { + "epoch": 1.0118150973361888, + "lm_loss": 1.3334, + "step": 5256, + "vm_loss": 0.1797 + }, + { + "epoch": 1.0118150973361888, + "lm_loss": 1.6397, + "step": 5256, + "vm_loss": 0.2071 + }, + { + "epoch": 1.0120076040137642, + "grad_norm": 3.047694369454868, + "learning_rate": 1.0289923755787401e-05, + "loss": 1.3061, + "step": 5257 + }, + { + "epoch": 1.0122001106913396, + "grad_norm": 3.312699106019508, + "learning_rate": 1.0286807155768331e-05, + "loss": 1.4754, + "step": 5258 + }, + { + "epoch": 1.012392617368915, + "grad_norm": 3.3788030383218257, + "learning_rate": 1.0283690527867931e-05, + "loss": 1.4469, + "step": 5259 + }, + { + "epoch": 1.0125851240464903, + "grad_norm": 3.3165415825726896, + "learning_rate": 1.0280573872389187e-05, + "loss": 1.3928, + "step": 5260 + }, + { + "epoch": 1.0127776307240657, + "grad_norm": 3.3838556072765424, + "learning_rate": 1.0277457189635071e-05, + "loss": 1.3587, + "step": 5261 + }, + { + "epoch": 1.012970137401641, + "grad_norm": 3.466213026247168, + "learning_rate": 1.0274340479908568e-05, + "loss": 1.4747, + "step": 5262 + }, + { + "epoch": 1.0131626440792165, + "grad_norm": 3.2428300917265664, + "learning_rate": 1.027122374351266e-05, + "loss": 1.4718, + "step": 5263 + }, + { + "epoch": 1.013355150756792, + "grad_norm": 3.16897949739538, + "learning_rate": 1.0268106980750334e-05, + "loss": 1.3461, + "step": 5264 + }, + { + "epoch": 1.013355150756792, + "lm_loss": 1.1759, + "step": 5264, + "vm_loss": 0.1627 + }, + { + "epoch": 1.013355150756792, + "lm_loss": 1.4544, + "step": 5264, + "vm_loss": 0.1734 + }, + { + "epoch": 1.013355150756792, + "lm_loss": 1.2957, + "step": 5264, + "vm_loss": 0.1686 + }, + { + "epoch": 1.013355150756792, + "lm_loss": 1.0837, + "step": 5264, + "vm_loss": 0.2089 + }, + { + "epoch": 1.013355150756792, + "lm_loss": 1.3465, + "step": 5264, + "vm_loss": 0.1385 + }, + { + "epoch": 1.013355150756792, + "lm_loss": 1.1683, + "step": 5264, + "vm_loss": 0.1203 + }, + { + "epoch": 1.013355150756792, + "lm_loss": 1.303, + "step": 5264, + "vm_loss": 0.2007 + }, + { + "epoch": 1.013355150756792, + "lm_loss": 0.817, + "step": 5264, + "vm_loss": 0.1202 + }, + { + "epoch": 1.0135476574343674, + "grad_norm": 3.2342853446683413, + "learning_rate": 1.026499019192458e-05, + "loss": 1.4262, + "step": 5265 + }, + { + "epoch": 1.0137401641119426, + "grad_norm": 3.0567282752690628, + "learning_rate": 1.0261873377338387e-05, + "loss": 1.3825, + "step": 5266 + }, + { + "epoch": 1.013932670789518, + "grad_norm": 3.5180534312384153, + "learning_rate": 1.0258756537294754e-05, + "loss": 1.4764, + "step": 5267 + }, + { + "epoch": 1.0141251774670934, + "grad_norm": 3.2360901212068582, + "learning_rate": 1.0255639672096673e-05, + "loss": 1.3408, + "step": 5268 + }, + { + "epoch": 1.0143176841446688, + "grad_norm": 3.27572343652706, + "learning_rate": 1.0252522782047147e-05, + "loss": 1.4576, + "step": 5269 + }, + { + "epoch": 1.0145101908222443, + "grad_norm": 3.2020992380999926, + "learning_rate": 1.0249405867449176e-05, + "loss": 1.3733, + "step": 5270 + }, + { + "epoch": 1.0147026974998195, + "grad_norm": 3.20208234847383, + "learning_rate": 1.0246288928605762e-05, + "loss": 1.3456, + "step": 5271 + }, + { + "epoch": 1.0148952041773949, + "grad_norm": 3.1915674510142016, + "learning_rate": 1.0243171965819914e-05, + "loss": 1.4144, + "step": 5272 + }, + { + "epoch": 1.0148952041773949, + "lm_loss": 1.0684, + "step": 5272, + "vm_loss": 0.1814 + }, + { + "epoch": 1.0148952041773949, + "lm_loss": 1.3565, + "step": 5272, + "vm_loss": 0.1941 + }, + { + "epoch": 1.0148952041773949, + "lm_loss": 1.0271, + "step": 5272, + "vm_loss": 0.1781 + }, + { + "epoch": 1.0148952041773949, + "lm_loss": 1.4733, + "step": 5272, + "vm_loss": 0.1148 + }, + { + "epoch": 1.0148952041773949, + "lm_loss": 1.2065, + "step": 5272, + "vm_loss": 0.2133 + }, + { + "epoch": 1.0148952041773949, + "lm_loss": 1.2886, + "step": 5272, + "vm_loss": 0.1691 + }, + { + "epoch": 1.0148952041773949, + "lm_loss": 1.2064, + "step": 5272, + "vm_loss": 0.1564 + }, + { + "epoch": 1.0148952041773949, + "lm_loss": 1.1793, + "step": 5272, + "vm_loss": 0.1895 + }, + { + "epoch": 1.0150877108549703, + "grad_norm": 3.0389727232566557, + "learning_rate": 1.024005497939464e-05, + "loss": 1.3706, + "step": 5273 + }, + { + "epoch": 1.0152802175325457, + "grad_norm": 3.1730530785317783, + "learning_rate": 1.0236937969632951e-05, + "loss": 1.3743, + "step": 5274 + }, + { + "epoch": 1.0154727242101211, + "grad_norm": 3.241070767558403, + "learning_rate": 1.0233820936837859e-05, + "loss": 1.4349, + "step": 5275 + }, + { + "epoch": 1.0156652308876963, + "grad_norm": 3.163541255353608, + "learning_rate": 1.023070388131238e-05, + "loss": 1.39, + "step": 5276 + }, + { + "epoch": 1.0158577375652718, + "grad_norm": 3.279424677110529, + "learning_rate": 1.0227586803359532e-05, + "loss": 1.4857, + "step": 5277 + }, + { + "epoch": 1.0160502442428472, + "grad_norm": 3.0392038365640337, + "learning_rate": 1.0224469703282336e-05, + "loss": 1.3688, + "step": 5278 + }, + { + "epoch": 1.0162427509204226, + "grad_norm": 3.107688767971896, + "learning_rate": 1.0221352581383815e-05, + "loss": 1.3962, + "step": 5279 + }, + { + "epoch": 1.016435257597998, + "grad_norm": 3.229133483622222, + "learning_rate": 1.021823543796699e-05, + "loss": 1.3811, + "step": 5280 + }, + { + "epoch": 1.016435257597998, + "lm_loss": 1.3626, + "step": 5280, + "vm_loss": 0.205 + }, + { + "epoch": 1.016435257597998, + "lm_loss": 1.2113, + "step": 5280, + "vm_loss": 0.2032 + }, + { + "epoch": 1.016435257597998, + "lm_loss": 1.2378, + "step": 5280, + "vm_loss": 0.1471 + }, + { + "epoch": 1.016435257597998, + "lm_loss": 1.3814, + "step": 5280, + "vm_loss": 0.1647 + }, + { + "epoch": 1.016435257597998, + "lm_loss": 1.1118, + "step": 5280, + "vm_loss": 0.1606 + }, + { + "epoch": 1.016435257597998, + "lm_loss": 1.238, + "step": 5280, + "vm_loss": 0.1579 + }, + { + "epoch": 1.016435257597998, + "lm_loss": 0.9976, + "step": 5280, + "vm_loss": 0.1528 + }, + { + "epoch": 1.016435257597998, + "lm_loss": 0.4437, + "step": 5280, + "vm_loss": 0.1654 + }, + { + "epoch": 1.0166277642755732, + "grad_norm": 3.1895405736270965, + "learning_rate": 1.0215118273334885e-05, + "loss": 1.3868, + "step": 5281 + }, + { + "epoch": 1.0168202709531486, + "grad_norm": 3.3588583179219142, + "learning_rate": 1.0212001087790538e-05, + "loss": 1.4782, + "step": 5282 + }, + { + "epoch": 1.017012777630724, + "grad_norm": 3.1984493432076433, + "learning_rate": 1.0208883881636969e-05, + "loss": 1.339, + "step": 5283 + }, + { + "epoch": 1.0172052843082995, + "grad_norm": 3.318610884941977, + "learning_rate": 1.0205766655177217e-05, + "loss": 1.4481, + "step": 5284 + }, + { + "epoch": 1.017397790985875, + "grad_norm": 3.1703718895832793, + "learning_rate": 1.0202649408714309e-05, + "loss": 1.4301, + "step": 5285 + }, + { + "epoch": 1.01759029766345, + "grad_norm": 3.2425917172108045, + "learning_rate": 1.019953214255129e-05, + "loss": 1.4198, + "step": 5286 + }, + { + "epoch": 1.0177828043410255, + "grad_norm": 3.2128255289844456, + "learning_rate": 1.019641485699119e-05, + "loss": 1.4475, + "step": 5287 + }, + { + "epoch": 1.017975311018601, + "grad_norm": 3.269415254382278, + "learning_rate": 1.0193297552337055e-05, + "loss": 1.424, + "step": 5288 + }, + { + "epoch": 1.017975311018601, + "lm_loss": 0.871, + "step": 5288, + "vm_loss": 0.1745 + }, + { + "epoch": 1.017975311018601, + "lm_loss": 1.3448, + "step": 5288, + "vm_loss": 0.1368 + }, + { + "epoch": 1.017975311018601, + "lm_loss": 1.4553, + "step": 5288, + "vm_loss": 0.1873 + }, + { + "epoch": 1.017975311018601, + "lm_loss": 1.0574, + "step": 5288, + "vm_loss": 0.2351 + }, + { + "epoch": 1.017975311018601, + "lm_loss": 0.8342, + "step": 5288, + "vm_loss": 0.1225 + }, + { + "epoch": 1.017975311018601, + "lm_loss": 1.4492, + "step": 5288, + "vm_loss": 0.163 + }, + { + "epoch": 1.017975311018601, + "lm_loss": 0.9951, + "step": 5288, + "vm_loss": 0.1677 + }, + { + "epoch": 1.017975311018601, + "lm_loss": 0.9214, + "step": 5288, + "vm_loss": 0.1914 + }, + { + "epoch": 1.0181678176961764, + "grad_norm": 3.1809889748977906, + "learning_rate": 1.0190180228891928e-05, + "loss": 1.4325, + "step": 5289 + }, + { + "epoch": 1.0183603243737518, + "grad_norm": 3.161266610240595, + "learning_rate": 1.0187062886958845e-05, + "loss": 1.4522, + "step": 5290 + }, + { + "epoch": 1.018552831051327, + "grad_norm": 3.13937199218909, + "learning_rate": 1.018394552684086e-05, + "loss": 1.425, + "step": 5291 + }, + { + "epoch": 1.0187453377289024, + "grad_norm": 3.0696475000092236, + "learning_rate": 1.0180828148841013e-05, + "loss": 1.3128, + "step": 5292 + }, + { + "epoch": 1.0189378444064778, + "grad_norm": 3.2629891648249334, + "learning_rate": 1.0177710753262358e-05, + "loss": 1.4148, + "step": 5293 + }, + { + "epoch": 1.0191303510840533, + "grad_norm": 3.487317270262764, + "learning_rate": 1.0174593340407945e-05, + "loss": 1.4459, + "step": 5294 + }, + { + "epoch": 1.0193228577616287, + "grad_norm": 3.223753277963404, + "learning_rate": 1.0171475910580824e-05, + "loss": 1.3645, + "step": 5295 + }, + { + "epoch": 1.0195153644392039, + "grad_norm": 3.1659643364483814, + "learning_rate": 1.0168358464084052e-05, + "loss": 1.4245, + "step": 5296 + }, + { + "epoch": 1.0195153644392039, + "lm_loss": 1.2782, + "step": 5296, + "vm_loss": 0.0919 + }, + { + "epoch": 1.0195153644392039, + "lm_loss": 1.207, + "step": 5296, + "vm_loss": 0.2118 + }, + { + "epoch": 1.0195153644392039, + "lm_loss": 1.6258, + "step": 5296, + "vm_loss": 0.1419 + }, + { + "epoch": 1.0195153644392039, + "lm_loss": 1.4703, + "step": 5296, + "vm_loss": 0.2561 + }, + { + "epoch": 1.0195153644392039, + "lm_loss": 1.4977, + "step": 5296, + "vm_loss": 0.1504 + }, + { + "epoch": 1.0195153644392039, + "lm_loss": 1.5082, + "step": 5296, + "vm_loss": 0.1435 + }, + { + "epoch": 1.0195153644392039, + "lm_loss": 1.0653, + "step": 5296, + "vm_loss": 0.1677 + }, + { + "epoch": 1.0195153644392039, + "lm_loss": 1.4842, + "step": 5296, + "vm_loss": 0.1779 + }, + { + "epoch": 1.0197078711167793, + "grad_norm": 3.309585063882577, + "learning_rate": 1.0165241001220682e-05, + "loss": 1.4798, + "step": 5297 + }, + { + "epoch": 1.0199003777943547, + "grad_norm": 3.098357780652905, + "learning_rate": 1.0162123522293775e-05, + "loss": 1.3605, + "step": 5298 + }, + { + "epoch": 1.0200928844719301, + "grad_norm": 3.285065831004035, + "learning_rate": 1.0159006027606385e-05, + "loss": 1.453, + "step": 5299 + }, + { + "epoch": 1.0202853911495056, + "grad_norm": 3.1451059635329015, + "learning_rate": 1.0155888517461575e-05, + "loss": 1.4094, + "step": 5300 + }, + { + "epoch": 1.020477897827081, + "grad_norm": 3.2605192736138937, + "learning_rate": 1.0152770992162407e-05, + "loss": 1.4339, + "step": 5301 + }, + { + "epoch": 1.0206704045046562, + "grad_norm": 3.1099133662654515, + "learning_rate": 1.0149653452011943e-05, + "loss": 1.4244, + "step": 5302 + }, + { + "epoch": 1.0208629111822316, + "grad_norm": 3.1149443589911003, + "learning_rate": 1.0146535897313253e-05, + "loss": 1.4468, + "step": 5303 + }, + { + "epoch": 1.021055417859807, + "grad_norm": 3.1183860664006313, + "learning_rate": 1.0143418328369394e-05, + "loss": 1.4413, + "step": 5304 + }, + { + "epoch": 1.021055417859807, + "lm_loss": 1.0358, + "step": 5304, + "vm_loss": 0.1843 + }, + { + "epoch": 1.021055417859807, + "lm_loss": 1.2513, + "step": 5304, + "vm_loss": 0.15 + }, + { + "epoch": 1.021055417859807, + "lm_loss": 1.2671, + "step": 5304, + "vm_loss": 0.1004 + }, + { + "epoch": 1.021055417859807, + "lm_loss": 1.8601, + "step": 5304, + "vm_loss": 0.219 + }, + { + "epoch": 1.021055417859807, + "lm_loss": 1.3223, + "step": 5304, + "vm_loss": 0.1411 + }, + { + "epoch": 1.021055417859807, + "lm_loss": 1.0145, + "step": 5304, + "vm_loss": 0.208 + }, + { + "epoch": 1.021055417859807, + "lm_loss": 0.8064, + "step": 5304, + "vm_loss": 0.1421 + }, + { + "epoch": 1.021055417859807, + "lm_loss": 1.3773, + "step": 5304, + "vm_loss": 0.2019 + }, + { + "epoch": 1.0212479245373824, + "grad_norm": 3.1876945926345357, + "learning_rate": 1.014030074548344e-05, + "loss": 1.3092, + "step": 5305 + }, + { + "epoch": 1.0214404312149579, + "grad_norm": 3.160294845799342, + "learning_rate": 1.0137183148958462e-05, + "loss": 1.4389, + "step": 5306 + }, + { + "epoch": 1.021632937892533, + "grad_norm": 3.1488466954151177, + "learning_rate": 1.0134065539097524e-05, + "loss": 1.3584, + "step": 5307 + }, + { + "epoch": 1.0218254445701085, + "grad_norm": 3.2183981746042423, + "learning_rate": 1.0130947916203701e-05, + "loss": 1.4, + "step": 5308 + }, + { + "epoch": 1.022017951247684, + "grad_norm": 3.309119695013585, + "learning_rate": 1.0127830280580064e-05, + "loss": 1.4486, + "step": 5309 + }, + { + "epoch": 1.0222104579252593, + "grad_norm": 3.2917959183037135, + "learning_rate": 1.012471263252969e-05, + "loss": 1.4806, + "step": 5310 + }, + { + "epoch": 1.0224029646028348, + "grad_norm": 3.041168110733001, + "learning_rate": 1.0121594972355653e-05, + "loss": 1.3193, + "step": 5311 + }, + { + "epoch": 1.02259547128041, + "grad_norm": 3.1217711058701334, + "learning_rate": 1.0118477300361027e-05, + "loss": 1.4177, + "step": 5312 + }, + { + "epoch": 1.02259547128041, + "lm_loss": 1.1922, + "step": 5312, + "vm_loss": 0.1862 + }, + { + "epoch": 1.02259547128041, + "lm_loss": 1.8132, + "step": 5312, + "vm_loss": 0.1896 + }, + { + "epoch": 1.02259547128041, + "lm_loss": 1.3263, + "step": 5312, + "vm_loss": 0.1721 + }, + { + "epoch": 1.02259547128041, + "lm_loss": 1.2304, + "step": 5312, + "vm_loss": 0.1872 + }, + { + "epoch": 1.02259547128041, + "lm_loss": 1.4567, + "step": 5312, + "vm_loss": 0.1717 + }, + { + "epoch": 1.02259547128041, + "lm_loss": 1.2695, + "step": 5312, + "vm_loss": 0.1606 + }, + { + "epoch": 1.02259547128041, + "lm_loss": 1.0845, + "step": 5312, + "vm_loss": 0.1344 + }, + { + "epoch": 1.02259547128041, + "lm_loss": 1.31, + "step": 5312, + "vm_loss": 0.1869 + }, + { + "epoch": 1.0227879779579854, + "grad_norm": 3.209033675576354, + "learning_rate": 1.0115359616848894e-05, + "loss": 1.4252, + "step": 5313 + }, + { + "epoch": 1.0229804846355608, + "grad_norm": 3.1966544527759972, + "learning_rate": 1.0112241922122328e-05, + "loss": 1.4263, + "step": 5314 + }, + { + "epoch": 1.0231729913131362, + "grad_norm": 3.2289260728185973, + "learning_rate": 1.0109124216484413e-05, + "loss": 1.4179, + "step": 5315 + }, + { + "epoch": 1.0233654979907116, + "grad_norm": 3.2165733141536985, + "learning_rate": 1.0106006500238225e-05, + "loss": 1.394, + "step": 5316 + }, + { + "epoch": 1.0235580046682868, + "grad_norm": 3.2782471043668746, + "learning_rate": 1.0102888773686852e-05, + "loss": 1.4332, + "step": 5317 + }, + { + "epoch": 1.0237505113458623, + "grad_norm": 3.310818491794284, + "learning_rate": 1.0099771037133372e-05, + "loss": 1.4755, + "step": 5318 + }, + { + "epoch": 1.0239430180234377, + "grad_norm": 3.135705805781398, + "learning_rate": 1.0096653290880873e-05, + "loss": 1.3867, + "step": 5319 + }, + { + "epoch": 1.024135524701013, + "grad_norm": 3.2555511824794707, + "learning_rate": 1.0093535535232435e-05, + "loss": 1.4129, + "step": 5320 + }, + { + "epoch": 1.024135524701013, + "lm_loss": 1.2205, + "step": 5320, + "vm_loss": 0.1474 + }, + { + "epoch": 1.024135524701013, + "lm_loss": 1.2717, + "step": 5320, + "vm_loss": 0.1628 + }, + { + "epoch": 1.024135524701013, + "lm_loss": 1.4631, + "step": 5320, + "vm_loss": 0.1669 + }, + { + "epoch": 1.024135524701013, + "lm_loss": 1.2231, + "step": 5320, + "vm_loss": 0.1456 + }, + { + "epoch": 1.024135524701013, + "lm_loss": 1.1232, + "step": 5320, + "vm_loss": 0.2094 + }, + { + "epoch": 1.024135524701013, + "lm_loss": 1.2995, + "step": 5320, + "vm_loss": 0.1935 + }, + { + "epoch": 1.024135524701013, + "lm_loss": 1.3768, + "step": 5320, + "vm_loss": 0.1527 + }, + { + "epoch": 1.024135524701013, + "lm_loss": 1.2309, + "step": 5320, + "vm_loss": 0.1781 + }, + { + "epoch": 1.0243280313785885, + "grad_norm": 3.205451557131525, + "learning_rate": 1.0090417770491148e-05, + "loss": 1.4128, + "step": 5321 + }, + { + "epoch": 1.0245205380561637, + "grad_norm": 3.0897210498051084, + "learning_rate": 1.0087299996960096e-05, + "loss": 1.3047, + "step": 5322 + }, + { + "epoch": 1.0247130447337391, + "grad_norm": 3.1896416551894, + "learning_rate": 1.0084182214942369e-05, + "loss": 1.4053, + "step": 5323 + }, + { + "epoch": 1.0249055514113146, + "grad_norm": 3.348943987544542, + "learning_rate": 1.0081064424741052e-05, + "loss": 1.3919, + "step": 5324 + }, + { + "epoch": 1.02509805808889, + "grad_norm": 3.286855495325973, + "learning_rate": 1.0077946626659236e-05, + "loss": 1.4559, + "step": 5325 + }, + { + "epoch": 1.0252905647664654, + "grad_norm": 3.2149820459269023, + "learning_rate": 1.0074828821000012e-05, + "loss": 1.4314, + "step": 5326 + }, + { + "epoch": 1.0254830714440408, + "grad_norm": 3.1558744476761666, + "learning_rate": 1.0071711008066466e-05, + "loss": 1.4334, + "step": 5327 + }, + { + "epoch": 1.025675578121616, + "grad_norm": 2.9632586118212982, + "learning_rate": 1.0068593188161698e-05, + "loss": 1.3293, + "step": 5328 + }, + { + "epoch": 1.025675578121616, + "lm_loss": 1.707, + "step": 5328, + "vm_loss": 0.1203 + }, + { + "epoch": 1.025675578121616, + "lm_loss": 1.2245, + "step": 5328, + "vm_loss": 0.1443 + }, + { + "epoch": 1.025675578121616, + "lm_loss": 1.0639, + "step": 5328, + "vm_loss": 0.1993 + }, + { + "epoch": 1.025675578121616, + "lm_loss": 1.4365, + "step": 5328, + "vm_loss": 0.2185 + }, + { + "epoch": 1.025675578121616, + "lm_loss": 1.4362, + "step": 5328, + "vm_loss": 0.2132 + }, + { + "epoch": 1.025675578121616, + "lm_loss": 1.0708, + "step": 5328, + "vm_loss": 0.1994 + }, + { + "epoch": 1.025675578121616, + "lm_loss": 1.0495, + "step": 5328, + "vm_loss": 0.1708 + }, + { + "epoch": 1.025675578121616, + "lm_loss": 0.8801, + "step": 5328, + "vm_loss": 0.1854 + }, + { + "epoch": 1.0258680847991914, + "grad_norm": 3.1651625019448097, + "learning_rate": 1.006547536158879e-05, + "loss": 1.3872, + "step": 5329 + }, + { + "epoch": 1.0260605914767669, + "grad_norm": 3.182450699949572, + "learning_rate": 1.0062357528650841e-05, + "loss": 1.4508, + "step": 5330 + }, + { + "epoch": 1.0262530981543423, + "grad_norm": 3.133293572588481, + "learning_rate": 1.0059239689650941e-05, + "loss": 1.4227, + "step": 5331 + }, + { + "epoch": 1.0264456048319177, + "grad_norm": 3.1361916643565504, + "learning_rate": 1.0056121844892188e-05, + "loss": 1.3474, + "step": 5332 + }, + { + "epoch": 1.026638111509493, + "grad_norm": 3.1388939111256695, + "learning_rate": 1.0053003994677673e-05, + "loss": 1.4219, + "step": 5333 + }, + { + "epoch": 1.0268306181870683, + "grad_norm": 3.26017446039176, + "learning_rate": 1.004988613931049e-05, + "loss": 1.369, + "step": 5334 + }, + { + "epoch": 1.0270231248646438, + "grad_norm": 3.3943827858384217, + "learning_rate": 1.0046768279093737e-05, + "loss": 1.4274, + "step": 5335 + }, + { + "epoch": 1.0272156315422192, + "grad_norm": 3.3231513281638834, + "learning_rate": 1.0043650414330508e-05, + "loss": 1.424, + "step": 5336 + }, + { + "epoch": 1.0272156315422192, + "lm_loss": 1.6288, + "step": 5336, + "vm_loss": 0.2065 + }, + { + "epoch": 1.0272156315422192, + "lm_loss": 0.6245, + "step": 5336, + "vm_loss": 0.1128 + }, + { + "epoch": 1.0272156315422192, + "lm_loss": 0.7742, + "step": 5336, + "vm_loss": 0.1798 + }, + { + "epoch": 1.0272156315422192, + "lm_loss": 1.1095, + "step": 5336, + "vm_loss": 0.208 + }, + { + "epoch": 1.0272156315422192, + "lm_loss": 1.1431, + "step": 5336, + "vm_loss": 0.1537 + }, + { + "epoch": 1.0272156315422192, + "lm_loss": 1.198, + "step": 5336, + "vm_loss": 0.1619 + }, + { + "epoch": 1.0272156315422192, + "lm_loss": 1.5549, + "step": 5336, + "vm_loss": 0.2308 + }, + { + "epoch": 1.0272156315422192, + "lm_loss": 1.7397, + "step": 5336, + "vm_loss": 0.1462 + }, + { + "epoch": 1.0274081382197946, + "grad_norm": 3.249015798947832, + "learning_rate": 1.0040532545323902e-05, + "loss": 1.4466, + "step": 5337 + }, + { + "epoch": 1.0276006448973698, + "grad_norm": 3.0782205659937323, + "learning_rate": 1.0037414672377013e-05, + "loss": 1.3859, + "step": 5338 + }, + { + "epoch": 1.0277931515749452, + "grad_norm": 3.135001391128802, + "learning_rate": 1.0034296795792936e-05, + "loss": 1.443, + "step": 5339 + }, + { + "epoch": 1.0279856582525206, + "grad_norm": 3.139373370248466, + "learning_rate": 1.0031178915874775e-05, + "loss": 1.4479, + "step": 5340 + }, + { + "epoch": 1.028178164930096, + "grad_norm": 3.108944125463726, + "learning_rate": 1.0028061032925622e-05, + "loss": 1.4183, + "step": 5341 + }, + { + "epoch": 1.0283706716076715, + "grad_norm": 3.0654320462283726, + "learning_rate": 1.0024943147248582e-05, + "loss": 1.3772, + "step": 5342 + }, + { + "epoch": 1.0285631782852467, + "grad_norm": 3.1211612241691045, + "learning_rate": 1.0021825259146747e-05, + "loss": 1.4313, + "step": 5343 + }, + { + "epoch": 1.028755684962822, + "grad_norm": 3.1410284394050207, + "learning_rate": 1.0018707368923217e-05, + "loss": 1.4232, + "step": 5344 + }, + { + "epoch": 1.028755684962822, + "lm_loss": 1.3754, + "step": 5344, + "vm_loss": 0.2186 + }, + { + "epoch": 1.028755684962822, + "lm_loss": 1.3445, + "step": 5344, + "vm_loss": 0.1414 + }, + { + "epoch": 1.028755684962822, + "lm_loss": 1.1443, + "step": 5344, + "vm_loss": 0.2073 + }, + { + "epoch": 1.028755684962822, + "lm_loss": 0.819, + "step": 5344, + "vm_loss": 0.1774 + }, + { + "epoch": 1.028755684962822, + "lm_loss": 1.1697, + "step": 5344, + "vm_loss": 0.1427 + }, + { + "epoch": 1.028755684962822, + "lm_loss": 1.1347, + "step": 5344, + "vm_loss": 0.154 + }, + { + "epoch": 1.028755684962822, + "lm_loss": 1.4737, + "step": 5344, + "vm_loss": 0.1247 + }, + { + "epoch": 1.028755684962822, + "lm_loss": 0.9111, + "step": 5344, + "vm_loss": 0.1477 + }, + { + "epoch": 1.0289481916403975, + "grad_norm": 3.094445676840102, + "learning_rate": 1.0015589476881092e-05, + "loss": 1.3898, + "step": 5345 + }, + { + "epoch": 1.029140698317973, + "grad_norm": 3.1060274800819236, + "learning_rate": 1.001247158332347e-05, + "loss": 1.4405, + "step": 5346 + }, + { + "epoch": 1.0293332049955484, + "grad_norm": 3.0758249715764703, + "learning_rate": 1.0009353688553451e-05, + "loss": 1.4121, + "step": 5347 + }, + { + "epoch": 1.0295257116731236, + "grad_norm": 3.284471717421342, + "learning_rate": 1.0006235792874133e-05, + "loss": 1.3676, + "step": 5348 + }, + { + "epoch": 1.029718218350699, + "grad_norm": 3.249339545384867, + "learning_rate": 1.0003117896588619e-05, + "loss": 1.4919, + "step": 5349 + }, + { + "epoch": 1.0299107250282744, + "grad_norm": 3.1553690020413434, + "learning_rate": 1e-05, + "loss": 1.3574, + "step": 5350 + }, + { + "epoch": 1.0301032317058498, + "grad_norm": 3.132590582717691, + "learning_rate": 9.996882103411388e-06, + "loss": 1.3704, + "step": 5351 + }, + { + "epoch": 1.0302957383834253, + "grad_norm": 3.250763919568954, + "learning_rate": 9.993764207125868e-06, + "loss": 1.3527, + "step": 5352 + }, + { + "epoch": 1.0302957383834253, + "lm_loss": 1.4645, + "step": 5352, + "vm_loss": 0.146 + }, + { + "epoch": 1.0302957383834253, + "lm_loss": 1.3112, + "step": 5352, + "vm_loss": 0.1424 + }, + { + "epoch": 1.0302957383834253, + "lm_loss": 1.3946, + "step": 5352, + "vm_loss": 0.2251 + }, + { + "epoch": 1.0302957383834253, + "lm_loss": 1.0482, + "step": 5352, + "vm_loss": 0.1942 + }, + { + "epoch": 1.0302957383834253, + "lm_loss": 1.222, + "step": 5352, + "vm_loss": 0.2189 + }, + { + "epoch": 1.0302957383834253, + "lm_loss": 0.9827, + "step": 5352, + "vm_loss": 0.1828 + }, + { + "epoch": 1.0302957383834253, + "lm_loss": 1.2148, + "step": 5352, + "vm_loss": 0.2048 + }, + { + "epoch": 1.0302957383834253, + "lm_loss": 1.3597, + "step": 5352, + "vm_loss": 0.147 + }, + { + "epoch": 1.0304882450610005, + "grad_norm": 3.244305837429493, + "learning_rate": 9.990646311446552e-06, + "loss": 1.3827, + "step": 5353 + }, + { + "epoch": 1.0306807517385759, + "grad_norm": 3.2438888594859816, + "learning_rate": 9.987528416676534e-06, + "loss": 1.4393, + "step": 5354 + }, + { + "epoch": 1.0308732584161513, + "grad_norm": 3.2527600413831905, + "learning_rate": 9.984410523118913e-06, + "loss": 1.3645, + "step": 5355 + }, + { + "epoch": 1.0310657650937267, + "grad_norm": 3.1793339085471053, + "learning_rate": 9.981292631076785e-06, + "loss": 1.3937, + "step": 5356 + }, + { + "epoch": 1.0312582717713021, + "grad_norm": 3.2898567611297924, + "learning_rate": 9.978174740853257e-06, + "loss": 1.3974, + "step": 5357 + }, + { + "epoch": 1.0314507784488773, + "grad_norm": 3.202726762246539, + "learning_rate": 9.975056852751421e-06, + "loss": 1.4186, + "step": 5358 + }, + { + "epoch": 1.0316432851264528, + "grad_norm": 3.1197452600849216, + "learning_rate": 9.971938967074378e-06, + "loss": 1.367, + "step": 5359 + }, + { + "epoch": 1.0318357918040282, + "grad_norm": 3.2109037077170477, + "learning_rate": 9.968821084125228e-06, + "loss": 1.393, + "step": 5360 + }, + { + "epoch": 1.0318357918040282, + "lm_loss": 1.1128, + "step": 5360, + "vm_loss": 0.1253 + }, + { + "epoch": 1.0318357918040282, + "lm_loss": 1.4715, + "step": 5360, + "vm_loss": 0.1517 + }, + { + "epoch": 1.0318357918040282, + "lm_loss": 1.216, + "step": 5360, + "vm_loss": 0.1543 + }, + { + "epoch": 1.0318357918040282, + "lm_loss": 0.8798, + "step": 5360, + "vm_loss": 0.1901 + }, + { + "epoch": 1.0318357918040282, + "lm_loss": 1.4744, + "step": 5360, + "vm_loss": 0.1468 + }, + { + "epoch": 1.0318357918040282, + "lm_loss": 1.2315, + "step": 5360, + "vm_loss": 0.1428 + }, + { + "epoch": 1.0318357918040282, + "lm_loss": 1.4365, + "step": 5360, + "vm_loss": 0.1539 + }, + { + "epoch": 1.0318357918040282, + "lm_loss": 0.9068, + "step": 5360, + "vm_loss": 0.1556 + }, + { + "epoch": 1.0320282984816036, + "grad_norm": 3.193875723913729, + "learning_rate": 9.965703204207065e-06, + "loss": 1.3459, + "step": 5361 + }, + { + "epoch": 1.032220805159179, + "grad_norm": 3.199794479122657, + "learning_rate": 9.962585327622992e-06, + "loss": 1.3892, + "step": 5362 + }, + { + "epoch": 1.0324133118367544, + "grad_norm": 3.148181520641845, + "learning_rate": 9.959467454676102e-06, + "loss": 1.4401, + "step": 5363 + }, + { + "epoch": 1.0326058185143296, + "grad_norm": 3.209367370680021, + "learning_rate": 9.956349585669495e-06, + "loss": 1.4013, + "step": 5364 + }, + { + "epoch": 1.032798325191905, + "grad_norm": 3.322079761855245, + "learning_rate": 9.953231720906268e-06, + "loss": 1.4014, + "step": 5365 + }, + { + "epoch": 1.0329908318694805, + "grad_norm": 3.231172852821185, + "learning_rate": 9.950113860689512e-06, + "loss": 1.3964, + "step": 5366 + }, + { + "epoch": 1.033183338547056, + "grad_norm": 3.1515599259032987, + "learning_rate": 9.946996005322332e-06, + "loss": 1.3612, + "step": 5367 + }, + { + "epoch": 1.0333758452246313, + "grad_norm": 3.277200308074966, + "learning_rate": 9.943878155107817e-06, + "loss": 1.3995, + "step": 5368 + }, + { + "epoch": 1.0333758452246313, + "lm_loss": 1.2137, + "step": 5368, + "vm_loss": 0.1828 + }, + { + "epoch": 1.0333758452246313, + "lm_loss": 1.2698, + "step": 5368, + "vm_loss": 0.1621 + }, + { + "epoch": 1.0333758452246313, + "lm_loss": 1.303, + "step": 5368, + "vm_loss": 0.1441 + }, + { + "epoch": 1.0333758452246313, + "lm_loss": 1.4011, + "step": 5368, + "vm_loss": 0.1238 + }, + { + "epoch": 1.0333758452246313, + "lm_loss": 1.4055, + "step": 5368, + "vm_loss": 0.1468 + }, + { + "epoch": 1.0333758452246313, + "lm_loss": 1.2743, + "step": 5368, + "vm_loss": 0.1633 + }, + { + "epoch": 1.0333758452246313, + "lm_loss": 0.9325, + "step": 5368, + "vm_loss": 0.1469 + }, + { + "epoch": 1.0333758452246313, + "lm_loss": 1.0772, + "step": 5368, + "vm_loss": 0.1866 + }, + { + "epoch": 1.0335683519022065, + "grad_norm": 3.1449388887374754, + "learning_rate": 9.940760310349059e-06, + "loss": 1.3498, + "step": 5369 + }, + { + "epoch": 1.033760858579782, + "grad_norm": 3.310454158858666, + "learning_rate": 9.937642471349162e-06, + "loss": 1.4872, + "step": 5370 + }, + { + "epoch": 1.0339533652573574, + "grad_norm": 3.1365198713540816, + "learning_rate": 9.934524638411213e-06, + "loss": 1.4136, + "step": 5371 + }, + { + "epoch": 1.0341458719349328, + "grad_norm": 3.2456231230900103, + "learning_rate": 9.931406811838307e-06, + "loss": 1.3062, + "step": 5372 + }, + { + "epoch": 1.0343383786125082, + "grad_norm": 3.2647305859061784, + "learning_rate": 9.928288991933534e-06, + "loss": 1.4442, + "step": 5373 + }, + { + "epoch": 1.0345308852900834, + "grad_norm": 3.1062637771244574, + "learning_rate": 9.925171178999992e-06, + "loss": 1.3525, + "step": 5374 + }, + { + "epoch": 1.0347233919676588, + "grad_norm": 3.2955391344579996, + "learning_rate": 9.922053373340767e-06, + "loss": 1.3956, + "step": 5375 + }, + { + "epoch": 1.0349158986452343, + "grad_norm": 3.3745698577763057, + "learning_rate": 9.918935575258948e-06, + "loss": 1.4915, + "step": 5376 + }, + { + "epoch": 1.0349158986452343, + "lm_loss": 1.6253, + "step": 5376, + "vm_loss": 0.18 + }, + { + "epoch": 1.0349158986452343, + "lm_loss": 1.1861, + "step": 5376, + "vm_loss": 0.1734 + }, + { + "epoch": 1.0349158986452343, + "lm_loss": 1.4872, + "step": 5376, + "vm_loss": 0.1268 + }, + { + "epoch": 1.0349158986452343, + "lm_loss": 1.209, + "step": 5376, + "vm_loss": 0.1828 + }, + { + "epoch": 1.0349158986452343, + "lm_loss": 1.4559, + "step": 5376, + "vm_loss": 0.2011 + }, + { + "epoch": 1.0349158986452343, + "lm_loss": 1.7794, + "step": 5376, + "vm_loss": 0.1445 + }, + { + "epoch": 1.0349158986452343, + "lm_loss": 1.0959, + "step": 5376, + "vm_loss": 0.1646 + }, + { + "epoch": 1.0349158986452343, + "lm_loss": 0.8984, + "step": 5376, + "vm_loss": 0.175 + }, + { + "epoch": 1.0351084053228097, + "grad_norm": 3.300685691166521, + "learning_rate": 9.915817785057634e-06, + "loss": 1.4182, + "step": 5377 + }, + { + "epoch": 1.035300912000385, + "grad_norm": 3.165662753519481, + "learning_rate": 9.912700003039907e-06, + "loss": 1.3947, + "step": 5378 + }, + { + "epoch": 1.0354934186779603, + "grad_norm": 3.190423305945623, + "learning_rate": 9.909582229508852e-06, + "loss": 1.3968, + "step": 5379 + }, + { + "epoch": 1.0356859253555357, + "grad_norm": 3.1729390048145407, + "learning_rate": 9.906464464767567e-06, + "loss": 1.352, + "step": 5380 + }, + { + "epoch": 1.0358784320331111, + "grad_norm": 3.1471749043384367, + "learning_rate": 9.903346709119129e-06, + "loss": 1.387, + "step": 5381 + }, + { + "epoch": 1.0360709387106866, + "grad_norm": 3.1515754598937433, + "learning_rate": 9.900228962866631e-06, + "loss": 1.3318, + "step": 5382 + }, + { + "epoch": 1.036263445388262, + "grad_norm": 3.3230505120830216, + "learning_rate": 9.89711122631315e-06, + "loss": 1.4003, + "step": 5383 + }, + { + "epoch": 1.0364559520658372, + "grad_norm": 3.3918096353558775, + "learning_rate": 9.893993499761776e-06, + "loss": 1.3778, + "step": 5384 + }, + { + "epoch": 1.0364559520658372, + "lm_loss": 1.5019, + "step": 5384, + "vm_loss": 0.1526 + }, + { + "epoch": 1.0364559520658372, + "lm_loss": 1.1124, + "step": 5384, + "vm_loss": 0.1438 + }, + { + "epoch": 1.0364559520658372, + "lm_loss": 1.0978, + "step": 5384, + "vm_loss": 0.1449 + }, + { + "epoch": 1.0364559520658372, + "lm_loss": 1.6393, + "step": 5384, + "vm_loss": 0.1272 + }, + { + "epoch": 1.0364559520658372, + "lm_loss": 1.2078, + "step": 5384, + "vm_loss": 0.2061 + }, + { + "epoch": 1.0364559520658372, + "lm_loss": 1.2014, + "step": 5384, + "vm_loss": 0.1312 + }, + { + "epoch": 1.0364559520658372, + "lm_loss": 1.2002, + "step": 5384, + "vm_loss": 0.1427 + }, + { + "epoch": 1.0364559520658372, + "lm_loss": 1.9068, + "step": 5384, + "vm_loss": 0.1591 + }, + { + "epoch": 1.0366484587434126, + "grad_norm": 3.4488107236522176, + "learning_rate": 9.890875783515592e-06, + "loss": 1.4161, + "step": 5385 + }, + { + "epoch": 1.036840965420988, + "grad_norm": 3.2760276075089916, + "learning_rate": 9.887758077877673e-06, + "loss": 1.3369, + "step": 5386 + }, + { + "epoch": 1.0370334720985634, + "grad_norm": 3.254576549458216, + "learning_rate": 9.88464038315111e-06, + "loss": 1.3624, + "step": 5387 + }, + { + "epoch": 1.0372259787761389, + "grad_norm": 3.276148975684649, + "learning_rate": 9.881522699638976e-06, + "loss": 1.3929, + "step": 5388 + }, + { + "epoch": 1.0374184854537143, + "grad_norm": 3.26106298682237, + "learning_rate": 9.878405027644352e-06, + "loss": 1.3947, + "step": 5389 + }, + { + "epoch": 1.0376109921312895, + "grad_norm": 3.2202465582721866, + "learning_rate": 9.87528736747031e-06, + "loss": 1.3397, + "step": 5390 + }, + { + "epoch": 1.037803498808865, + "grad_norm": 3.2293015598232446, + "learning_rate": 9.872169719419938e-06, + "loss": 1.3353, + "step": 5391 + }, + { + "epoch": 1.0379960054864403, + "grad_norm": 3.5487671987950495, + "learning_rate": 9.869052083796302e-06, + "loss": 1.4769, + "step": 5392 + }, + { + "epoch": 1.0379960054864403, + "lm_loss": 1.285, + "step": 5392, + "vm_loss": 0.1572 + }, + { + "epoch": 1.0379960054864403, + "lm_loss": 1.4873, + "step": 5392, + "vm_loss": 0.15 + }, + { + "epoch": 1.0379960054864403, + "lm_loss": 1.2664, + "step": 5392, + "vm_loss": 0.1311 + }, + { + "epoch": 1.0379960054864403, + "lm_loss": 1.6228, + "step": 5392, + "vm_loss": 0.1648 + }, + { + "epoch": 1.0379960054864403, + "lm_loss": 1.3204, + "step": 5392, + "vm_loss": 0.1316 + }, + { + "epoch": 1.0379960054864403, + "lm_loss": 1.4353, + "step": 5392, + "vm_loss": 0.2064 + }, + { + "epoch": 1.0379960054864403, + "lm_loss": 1.1501, + "step": 5392, + "vm_loss": 0.2052 + }, + { + "epoch": 1.0379960054864403, + "lm_loss": 1.3081, + "step": 5392, + "vm_loss": 0.1898 + }, + { + "epoch": 1.0381885121640158, + "grad_norm": 3.108028384484025, + "learning_rate": 9.865934460902476e-06, + "loss": 1.4117, + "step": 5393 + }, + { + "epoch": 1.0383810188415912, + "grad_norm": 3.0883729779283806, + "learning_rate": 9.862816851041541e-06, + "loss": 1.3842, + "step": 5394 + }, + { + "epoch": 1.0385735255191664, + "grad_norm": 3.1929596088229384, + "learning_rate": 9.859699254516562e-06, + "loss": 1.3792, + "step": 5395 + }, + { + "epoch": 1.0387660321967418, + "grad_norm": 3.2863250794078804, + "learning_rate": 9.856581671630609e-06, + "loss": 1.4054, + "step": 5396 + }, + { + "epoch": 1.0389585388743172, + "grad_norm": 3.1925937920225405, + "learning_rate": 9.85346410268675e-06, + "loss": 1.3145, + "step": 5397 + }, + { + "epoch": 1.0391510455518926, + "grad_norm": 3.262136834044595, + "learning_rate": 9.850346547988059e-06, + "loss": 1.3324, + "step": 5398 + }, + { + "epoch": 1.039343552229468, + "grad_norm": 3.1923735629517824, + "learning_rate": 9.847229007837598e-06, + "loss": 1.3257, + "step": 5399 + }, + { + "epoch": 1.0395360589070433, + "grad_norm": 3.2093886194811674, + "learning_rate": 9.844111482538427e-06, + "loss": 1.3253, + "step": 5400 + }, + { + "epoch": 1.0395360589070433, + "lm_loss": 1.1906, + "step": 5400, + "vm_loss": 0.1874 + }, + { + "epoch": 1.0395360589070433, + "lm_loss": 1.3638, + "step": 5400, + "vm_loss": 0.1679 + }, + { + "epoch": 1.0395360589070433, + "lm_loss": 0.9292, + "step": 5400, + "vm_loss": 0.131 + }, + { + "epoch": 1.0395360589070433, + "lm_loss": 1.1142, + "step": 5400, + "vm_loss": 0.1502 + }, + { + "epoch": 1.0395360589070433, + "lm_loss": 1.6241, + "step": 5400, + "vm_loss": 0.1578 + }, + { + "epoch": 1.0395360589070433, + "lm_loss": 0.6412, + "step": 5400, + "vm_loss": 0.1233 + }, + { + "epoch": 1.0395360589070433, + "lm_loss": 0.977, + "step": 5400, + "vm_loss": 0.0819 + }, + { + "epoch": 1.0395360589070433, + "lm_loss": 1.7489, + "step": 5400, + "vm_loss": 0.1838 + }, + { + "epoch": 1.0397285655846187, + "grad_norm": 3.20446940176017, + "learning_rate": 9.84099397239362e-06, + "loss": 1.4377, + "step": 5401 + }, + { + "epoch": 1.039921072262194, + "grad_norm": 3.3628851219098705, + "learning_rate": 9.83787647770623e-06, + "loss": 1.3535, + "step": 5402 + }, + { + "epoch": 1.0401135789397695, + "grad_norm": 3.1598105163862638, + "learning_rate": 9.83475899877932e-06, + "loss": 1.3382, + "step": 5403 + }, + { + "epoch": 1.040306085617345, + "grad_norm": 3.1081425579274735, + "learning_rate": 9.831641535915953e-06, + "loss": 1.2984, + "step": 5404 + }, + { + "epoch": 1.0404985922949201, + "grad_norm": 3.337070743893075, + "learning_rate": 9.82852408941918e-06, + "loss": 1.4279, + "step": 5405 + }, + { + "epoch": 1.0406910989724956, + "grad_norm": 3.192534793407697, + "learning_rate": 9.82540665959206e-06, + "loss": 1.445, + "step": 5406 + }, + { + "epoch": 1.040883605650071, + "grad_norm": 3.2348921877666372, + "learning_rate": 9.822289246737642e-06, + "loss": 1.3463, + "step": 5407 + }, + { + "epoch": 1.0410761123276464, + "grad_norm": 3.137003976646942, + "learning_rate": 9.81917185115899e-06, + "loss": 1.3859, + "step": 5408 + }, + { + "epoch": 1.0410761123276464, + "lm_loss": 0.791, + "step": 5408, + "vm_loss": 0.1903 + }, + { + "epoch": 1.0410761123276464, + "lm_loss": 1.2919, + "step": 5408, + "vm_loss": 0.1662 + }, + { + "epoch": 1.0410761123276464, + "lm_loss": 1.1612, + "step": 5408, + "vm_loss": 0.1309 + }, + { + "epoch": 1.0410761123276464, + "lm_loss": 1.574, + "step": 5408, + "vm_loss": 0.1303 + }, + { + "epoch": 1.0410761123276464, + "lm_loss": 1.1696, + "step": 5408, + "vm_loss": 0.1126 + }, + { + "epoch": 1.0410761123276464, + "lm_loss": 0.9024, + "step": 5408, + "vm_loss": 0.1263 + }, + { + "epoch": 1.0410761123276464, + "lm_loss": 1.286, + "step": 5408, + "vm_loss": 0.2045 + }, + { + "epoch": 1.0410761123276464, + "lm_loss": 1.1437, + "step": 5408, + "vm_loss": 0.1033 + }, + { + "epoch": 1.0412686190052218, + "grad_norm": 3.195562145687875, + "learning_rate": 9.816054473159145e-06, + "loss": 1.289, + "step": 5409 + }, + { + "epoch": 1.041461125682797, + "grad_norm": 3.501157576003049, + "learning_rate": 9.812937113041155e-06, + "loss": 1.4108, + "step": 5410 + }, + { + "epoch": 1.0416536323603725, + "grad_norm": 3.2252080430300305, + "learning_rate": 9.809819771108076e-06, + "loss": 1.345, + "step": 5411 + }, + { + "epoch": 1.0418461390379479, + "grad_norm": 3.1686936266317023, + "learning_rate": 9.806702447662946e-06, + "loss": 1.3906, + "step": 5412 + }, + { + "epoch": 1.0420386457155233, + "grad_norm": 3.1699792283707944, + "learning_rate": 9.803585143008814e-06, + "loss": 1.3431, + "step": 5413 + }, + { + "epoch": 1.0422311523930987, + "grad_norm": 3.149811572761568, + "learning_rate": 9.800467857448713e-06, + "loss": 1.3529, + "step": 5414 + }, + { + "epoch": 1.042423659070674, + "grad_norm": 3.1949878323993324, + "learning_rate": 9.797350591285694e-06, + "loss": 1.4059, + "step": 5415 + }, + { + "epoch": 1.0426161657482493, + "grad_norm": 3.42104176058902, + "learning_rate": 9.79423334482279e-06, + "loss": 1.4167, + "step": 5416 + }, + { + "epoch": 1.0426161657482493, + "lm_loss": 1.2595, + "step": 5416, + "vm_loss": 0.2135 + }, + { + "epoch": 1.0426161657482493, + "lm_loss": 1.567, + "step": 5416, + "vm_loss": 0.1495 + }, + { + "epoch": 1.0426161657482493, + "lm_loss": 1.2305, + "step": 5416, + "vm_loss": 0.2172 + }, + { + "epoch": 1.0426161657482493, + "lm_loss": 1.1163, + "step": 5416, + "vm_loss": 0.1393 + }, + { + "epoch": 1.0426161657482493, + "lm_loss": 1.5555, + "step": 5416, + "vm_loss": 0.1447 + }, + { + "epoch": 1.0426161657482493, + "lm_loss": 1.2243, + "step": 5416, + "vm_loss": 0.1568 + }, + { + "epoch": 1.0426161657482493, + "lm_loss": 1.9649, + "step": 5416, + "vm_loss": 0.1923 + }, + { + "epoch": 1.0426161657482493, + "lm_loss": 1.0061, + "step": 5416, + "vm_loss": 0.1667 + }, + { + "epoch": 1.0428086724258248, + "grad_norm": 3.3125370344067804, + "learning_rate": 9.791116118363033e-06, + "loss": 1.3985, + "step": 5417 + }, + { + "epoch": 1.0430011791034002, + "grad_norm": 3.255981493727488, + "learning_rate": 9.787998912209466e-06, + "loss": 1.394, + "step": 5418 + }, + { + "epoch": 1.0431936857809756, + "grad_norm": 3.244088810004539, + "learning_rate": 9.784881726665117e-06, + "loss": 1.4061, + "step": 5419 + }, + { + "epoch": 1.0433861924585508, + "grad_norm": 3.2455077384495183, + "learning_rate": 9.781764562033011e-06, + "loss": 1.398, + "step": 5420 + }, + { + "epoch": 1.0435786991361262, + "grad_norm": 3.2520388112240517, + "learning_rate": 9.778647418616187e-06, + "loss": 1.336, + "step": 5421 + }, + { + "epoch": 1.0437712058137016, + "grad_norm": 3.295193023030233, + "learning_rate": 9.775530296717666e-06, + "loss": 1.3657, + "step": 5422 + }, + { + "epoch": 1.043963712491277, + "grad_norm": 3.370179588026639, + "learning_rate": 9.77241319664047e-06, + "loss": 1.327, + "step": 5423 + }, + { + "epoch": 1.0441562191688525, + "grad_norm": 3.192534190234887, + "learning_rate": 9.769296118687623e-06, + "loss": 1.3379, + "step": 5424 + }, + { + "epoch": 1.0441562191688525, + "lm_loss": 1.4854, + "step": 5424, + "vm_loss": 0.1546 + }, + { + "epoch": 1.0441562191688525, + "lm_loss": 1.796, + "step": 5424, + "vm_loss": 0.1689 + }, + { + "epoch": 1.0441562191688525, + "lm_loss": 1.0299, + "step": 5424, + "vm_loss": 0.1477 + }, + { + "epoch": 1.0441562191688525, + "lm_loss": 1.554, + "step": 5424, + "vm_loss": 0.1009 + }, + { + "epoch": 1.0441562191688525, + "lm_loss": 1.3919, + "step": 5424, + "vm_loss": 0.1736 + }, + { + "epoch": 1.0441562191688525, + "lm_loss": 1.0757, + "step": 5424, + "vm_loss": 0.2045 + }, + { + "epoch": 1.0441562191688525, + "lm_loss": 0.8189, + "step": 5424, + "vm_loss": 0.2273 + }, + { + "epoch": 1.0441562191688525, + "lm_loss": 0.7756, + "step": 5424, + "vm_loss": 0.1927 + }, + { + "epoch": 1.044348725846428, + "grad_norm": 3.4266427025458777, + "learning_rate": 9.766179063162143e-06, + "loss": 1.4116, + "step": 5425 + }, + { + "epoch": 1.044541232524003, + "grad_norm": 3.3775988136593296, + "learning_rate": 9.763062030367054e-06, + "loss": 1.3926, + "step": 5426 + }, + { + "epoch": 1.0447337392015785, + "grad_norm": 3.2975218995845075, + "learning_rate": 9.759945020605363e-06, + "loss": 1.4077, + "step": 5427 + }, + { + "epoch": 1.044926245879154, + "grad_norm": 3.3322400263594063, + "learning_rate": 9.756828034180088e-06, + "loss": 1.4358, + "step": 5428 + }, + { + "epoch": 1.0451187525567294, + "grad_norm": 3.1956105877185625, + "learning_rate": 9.753711071394242e-06, + "loss": 1.414, + "step": 5429 + }, + { + "epoch": 1.0453112592343048, + "grad_norm": 3.209203842790889, + "learning_rate": 9.75059413255083e-06, + "loss": 1.4038, + "step": 5430 + }, + { + "epoch": 1.04550376591188, + "grad_norm": 3.1365457548324613, + "learning_rate": 9.747477217952855e-06, + "loss": 1.3633, + "step": 5431 + }, + { + "epoch": 1.0456962725894554, + "grad_norm": 3.161884700784253, + "learning_rate": 9.74436032790333e-06, + "loss": 1.4195, + "step": 5432 + }, + { + "epoch": 1.0456962725894554, + "lm_loss": 1.2009, + "step": 5432, + "vm_loss": 0.1428 + }, + { + "epoch": 1.0456962725894554, + "lm_loss": 1.0991, + "step": 5432, + "vm_loss": 0.115 + }, + { + "epoch": 1.0456962725894554, + "lm_loss": 1.2694, + "step": 5432, + "vm_loss": 0.1656 + }, + { + "epoch": 1.0456962725894554, + "lm_loss": 1.6871, + "step": 5432, + "vm_loss": 0.1904 + }, + { + "epoch": 1.0456962725894554, + "lm_loss": 1.324, + "step": 5432, + "vm_loss": 0.1079 + }, + { + "epoch": 1.0456962725894554, + "lm_loss": 0.8793, + "step": 5432, + "vm_loss": 0.2057 + }, + { + "epoch": 1.0456962725894554, + "lm_loss": 1.1131, + "step": 5432, + "vm_loss": 0.0766 + }, + { + "epoch": 1.0456962725894554, + "lm_loss": 1.8577, + "step": 5432, + "vm_loss": 0.1562 + }, + { + "epoch": 1.0458887792670308, + "grad_norm": 3.268027301590538, + "learning_rate": 9.741243462705251e-06, + "loss": 1.3532, + "step": 5433 + }, + { + "epoch": 1.0460812859446063, + "grad_norm": 3.359714236854639, + "learning_rate": 9.738126622661613e-06, + "loss": 1.3868, + "step": 5434 + }, + { + "epoch": 1.0462737926221817, + "grad_norm": 3.3355513954355325, + "learning_rate": 9.735009808075422e-06, + "loss": 1.3391, + "step": 5435 + }, + { + "epoch": 1.0464662992997569, + "grad_norm": 3.2965579103438074, + "learning_rate": 9.731893019249669e-06, + "loss": 1.3524, + "step": 5436 + }, + { + "epoch": 1.0466588059773323, + "grad_norm": 3.4936833672772427, + "learning_rate": 9.72877625648734e-06, + "loss": 1.4113, + "step": 5437 + }, + { + "epoch": 1.0468513126549077, + "grad_norm": 3.2731217216358446, + "learning_rate": 9.725659520091433e-06, + "loss": 1.4354, + "step": 5438 + }, + { + "epoch": 1.0470438193324831, + "grad_norm": 3.11045425622832, + "learning_rate": 9.72254281036493e-06, + "loss": 1.3663, + "step": 5439 + }, + { + "epoch": 1.0472363260100586, + "grad_norm": 3.073939093369468, + "learning_rate": 9.719426127610818e-06, + "loss": 1.3429, + "step": 5440 + }, + { + "epoch": 1.0472363260100586, + "lm_loss": 0.8708, + "step": 5440, + "vm_loss": 0.2349 + }, + { + "epoch": 1.0472363260100586, + "lm_loss": 1.0635, + "step": 5440, + "vm_loss": 0.1769 + }, + { + "epoch": 1.0472363260100586, + "lm_loss": 1.7092, + "step": 5440, + "vm_loss": 0.2029 + }, + { + "epoch": 1.0472363260100586, + "lm_loss": 1.1106, + "step": 5440, + "vm_loss": 0.1453 + }, + { + "epoch": 1.0472363260100586, + "lm_loss": 1.2037, + "step": 5440, + "vm_loss": 0.1038 + }, + { + "epoch": 1.0472363260100586, + "lm_loss": 1.2045, + "step": 5440, + "vm_loss": 0.1367 + }, + { + "epoch": 1.0472363260100586, + "lm_loss": 1.0665, + "step": 5440, + "vm_loss": 0.1713 + }, + { + "epoch": 1.0472363260100586, + "lm_loss": 0.8388, + "step": 5440, + "vm_loss": 0.169 + }, + { + "epoch": 1.0474288326876338, + "grad_norm": 3.1840058326940213, + "learning_rate": 9.716309472132069e-06, + "loss": 1.3188, + "step": 5441 + }, + { + "epoch": 1.0476213393652092, + "grad_norm": 3.073987586177332, + "learning_rate": 9.713192844231674e-06, + "loss": 1.2815, + "step": 5442 + }, + { + "epoch": 1.0478138460427846, + "grad_norm": 3.1318191979639263, + "learning_rate": 9.7100762442126e-06, + "loss": 1.3444, + "step": 5443 + }, + { + "epoch": 1.04800635272036, + "grad_norm": 3.2230700538362544, + "learning_rate": 9.706959672377826e-06, + "loss": 1.2817, + "step": 5444 + }, + { + "epoch": 1.0481988593979354, + "grad_norm": 3.454312697421651, + "learning_rate": 9.703843129030324e-06, + "loss": 1.3167, + "step": 5445 + }, + { + "epoch": 1.0483913660755106, + "grad_norm": 3.4157576555049696, + "learning_rate": 9.700726614473055e-06, + "loss": 1.3455, + "step": 5446 + }, + { + "epoch": 1.048583872753086, + "grad_norm": 3.363796833928215, + "learning_rate": 9.69761012900899e-06, + "loss": 1.3621, + "step": 5447 + }, + { + "epoch": 1.0487763794306615, + "grad_norm": 3.3421109094411996, + "learning_rate": 9.694493672941086e-06, + "loss": 1.3925, + "step": 5448 + }, + { + "epoch": 1.0487763794306615, + "lm_loss": 1.8137, + "step": 5448, + "vm_loss": 0.1852 + }, + { + "epoch": 1.0487763794306615, + "lm_loss": 1.0316, + "step": 5448, + "vm_loss": 0.1973 + }, + { + "epoch": 1.0487763794306615, + "lm_loss": 1.6715, + "step": 5448, + "vm_loss": 0.1703 + }, + { + "epoch": 1.0487763794306615, + "lm_loss": 1.2971, + "step": 5448, + "vm_loss": 0.1682 + }, + { + "epoch": 1.0487763794306615, + "lm_loss": 1.3823, + "step": 5448, + "vm_loss": 0.1383 + }, + { + "epoch": 1.0487763794306615, + "lm_loss": 1.7679, + "step": 5448, + "vm_loss": 0.2053 + }, + { + "epoch": 1.0487763794306615, + "lm_loss": 1.4051, + "step": 5448, + "vm_loss": 0.141 + }, + { + "epoch": 1.0487763794306615, + "lm_loss": 0.9337, + "step": 5448, + "vm_loss": 0.1385 + }, + { + "epoch": 1.048968886108237, + "grad_norm": 3.2010205556991176, + "learning_rate": 9.69137724657231e-06, + "loss": 1.4144, + "step": 5449 + }, + { + "epoch": 1.0491613927858123, + "grad_norm": 3.163163683838556, + "learning_rate": 9.688260850205613e-06, + "loss": 1.3706, + "step": 5450 + }, + { + "epoch": 1.0493538994633875, + "grad_norm": 3.2024519043361077, + "learning_rate": 9.685144484143945e-06, + "loss": 1.3649, + "step": 5451 + }, + { + "epoch": 1.049546406140963, + "grad_norm": 3.2186631444263765, + "learning_rate": 9.682028148690267e-06, + "loss": 1.3479, + "step": 5452 + }, + { + "epoch": 1.0497389128185384, + "grad_norm": 3.1590699901924046, + "learning_rate": 9.67891184414752e-06, + "loss": 1.2377, + "step": 5453 + }, + { + "epoch": 1.0499314194961138, + "grad_norm": 3.4004263535349124, + "learning_rate": 9.675795570818645e-06, + "loss": 1.361, + "step": 5454 + }, + { + "epoch": 1.0501239261736892, + "grad_norm": 3.3908322976794607, + "learning_rate": 9.672679329006593e-06, + "loss": 1.3957, + "step": 5455 + }, + { + "epoch": 1.0503164328512646, + "grad_norm": 3.582504419373602, + "learning_rate": 9.669563119014299e-06, + "loss": 1.3794, + "step": 5456 + }, + { + "epoch": 1.0503164328512646, + "lm_loss": 1.1451, + "step": 5456, + "vm_loss": 0.1849 + }, + { + "epoch": 1.0503164328512646, + "lm_loss": 1.2296, + "step": 5456, + "vm_loss": 0.1836 + }, + { + "epoch": 1.0503164328512646, + "lm_loss": 1.8296, + "step": 5456, + "vm_loss": 0.1947 + }, + { + "epoch": 1.0503164328512646, + "lm_loss": 1.6063, + "step": 5456, + "vm_loss": 0.1507 + }, + { + "epoch": 1.0503164328512646, + "lm_loss": 1.1651, + "step": 5456, + "vm_loss": 0.138 + }, + { + "epoch": 1.0503164328512646, + "lm_loss": 1.1, + "step": 5456, + "vm_loss": 0.1004 + }, + { + "epoch": 1.0503164328512646, + "lm_loss": 1.3713, + "step": 5456, + "vm_loss": 0.1863 + }, + { + "epoch": 1.0503164328512646, + "lm_loss": 0.8877, + "step": 5456, + "vm_loss": 0.1139 + }, + { + "epoch": 1.0505089395288398, + "grad_norm": 3.258457749628126, + "learning_rate": 9.666446941144696e-06, + "loss": 1.3779, + "step": 5457 + }, + { + "epoch": 1.0507014462064153, + "grad_norm": 3.13787964693258, + "learning_rate": 9.663330795700714e-06, + "loss": 1.356, + "step": 5458 + }, + { + "epoch": 1.0508939528839907, + "grad_norm": 3.0945007688429027, + "learning_rate": 9.66021468298529e-06, + "loss": 1.3179, + "step": 5459 + }, + { + "epoch": 1.051086459561566, + "grad_norm": 3.1067042166063388, + "learning_rate": 9.657098603301347e-06, + "loss": 1.3659, + "step": 5460 + }, + { + "epoch": 1.0512789662391415, + "grad_norm": 3.1037167090908944, + "learning_rate": 9.653982556951802e-06, + "loss": 1.3527, + "step": 5461 + }, + { + "epoch": 1.0514714729167167, + "grad_norm": 3.302168792952475, + "learning_rate": 9.650866544239582e-06, + "loss": 1.3317, + "step": 5462 + }, + { + "epoch": 1.0516639795942921, + "grad_norm": 3.2818875272593484, + "learning_rate": 9.647750565467602e-06, + "loss": 1.3724, + "step": 5463 + }, + { + "epoch": 1.0518564862718676, + "grad_norm": 3.290746685639992, + "learning_rate": 9.644634620938771e-06, + "loss": 1.3587, + "step": 5464 + }, + { + "epoch": 1.0518564862718676, + "lm_loss": 1.1375, + "step": 5464, + "vm_loss": 0.1211 + }, + { + "epoch": 1.0518564862718676, + "lm_loss": 1.0107, + "step": 5464, + "vm_loss": 0.1409 + }, + { + "epoch": 1.0518564862718676, + "lm_loss": 0.808, + "step": 5464, + "vm_loss": 0.1544 + }, + { + "epoch": 1.0518564862718676, + "lm_loss": 1.3957, + "step": 5464, + "vm_loss": 0.2092 + }, + { + "epoch": 1.0518564862718676, + "lm_loss": 1.4544, + "step": 5464, + "vm_loss": 0.1909 + }, + { + "epoch": 1.0518564862718676, + "lm_loss": 1.144, + "step": 5464, + "vm_loss": 0.1302 + }, + { + "epoch": 1.0518564862718676, + "lm_loss": 1.3339, + "step": 5464, + "vm_loss": 0.15 + }, + { + "epoch": 1.0518564862718676, + "lm_loss": 1.6451, + "step": 5464, + "vm_loss": 0.1581 + }, + { + "epoch": 1.052048992949443, + "grad_norm": 3.1326206069678846, + "learning_rate": 9.641518710956001e-06, + "loss": 1.3477, + "step": 5465 + }, + { + "epoch": 1.0522414996270184, + "grad_norm": 3.204884544063744, + "learning_rate": 9.638402835822203e-06, + "loss": 1.3998, + "step": 5466 + }, + { + "epoch": 1.0524340063045936, + "grad_norm": 3.2836470642143762, + "learning_rate": 9.63528699584027e-06, + "loss": 1.4714, + "step": 5467 + }, + { + "epoch": 1.052626512982169, + "grad_norm": 3.24579415508618, + "learning_rate": 9.63217119131311e-06, + "loss": 1.3421, + "step": 5468 + }, + { + "epoch": 1.0528190196597444, + "grad_norm": 3.204660668685535, + "learning_rate": 9.629055422543613e-06, + "loss": 1.3596, + "step": 5469 + }, + { + "epoch": 1.0530115263373199, + "grad_norm": 3.232675305449826, + "learning_rate": 9.62593968983468e-06, + "loss": 1.3669, + "step": 5470 + }, + { + "epoch": 1.0532040330148953, + "grad_norm": 3.0967130902059075, + "learning_rate": 9.622823993489193e-06, + "loss": 1.3964, + "step": 5471 + }, + { + "epoch": 1.0533965396924705, + "grad_norm": 3.1226377764884528, + "learning_rate": 9.619708333810035e-06, + "loss": 1.3929, + "step": 5472 + }, + { + "epoch": 1.0533965396924705, + "lm_loss": 1.5588, + "step": 5472, + "vm_loss": 0.2292 + }, + { + "epoch": 1.0533965396924705, + "lm_loss": 0.894, + "step": 5472, + "vm_loss": 0.1325 + }, + { + "epoch": 1.0533965396924705, + "lm_loss": 1.4592, + "step": 5472, + "vm_loss": 0.2092 + }, + { + "epoch": 1.0533965396924705, + "lm_loss": 1.4189, + "step": 5472, + "vm_loss": 0.2134 + }, + { + "epoch": 1.0533965396924705, + "lm_loss": 1.0472, + "step": 5472, + "vm_loss": 0.1431 + }, + { + "epoch": 1.0533965396924705, + "lm_loss": 1.2052, + "step": 5472, + "vm_loss": 0.1119 + }, + { + "epoch": 1.0533965396924705, + "lm_loss": 0.9274, + "step": 5472, + "vm_loss": 0.1493 + }, + { + "epoch": 1.0533965396924705, + "lm_loss": 1.6927, + "step": 5472, + "vm_loss": 0.1493 + }, + { + "epoch": 1.053589046370046, + "grad_norm": 3.1968328058448, + "learning_rate": 9.616592711100097e-06, + "loss": 1.424, + "step": 5473 + }, + { + "epoch": 1.0537815530476213, + "grad_norm": 3.1657660770776648, + "learning_rate": 9.613477125662255e-06, + "loss": 1.3533, + "step": 5474 + }, + { + "epoch": 1.0539740597251968, + "grad_norm": 3.267773534815087, + "learning_rate": 9.610361577799373e-06, + "loss": 1.3784, + "step": 5475 + }, + { + "epoch": 1.0541665664027722, + "grad_norm": 3.2379643399334026, + "learning_rate": 9.607246067814338e-06, + "loss": 1.3694, + "step": 5476 + }, + { + "epoch": 1.0543590730803474, + "grad_norm": 3.2997355524433685, + "learning_rate": 9.604130596010009e-06, + "loss": 1.3979, + "step": 5477 + }, + { + "epoch": 1.0545515797579228, + "grad_norm": 3.3484553664352545, + "learning_rate": 9.601015162689245e-06, + "loss": 1.4398, + "step": 5478 + }, + { + "epoch": 1.0547440864354982, + "grad_norm": 3.2034867993858227, + "learning_rate": 9.597899768154917e-06, + "loss": 1.336, + "step": 5479 + }, + { + "epoch": 1.0549365931130736, + "grad_norm": 3.247468190994917, + "learning_rate": 9.594784412709876e-06, + "loss": 1.3769, + "step": 5480 + }, + { + "epoch": 1.0549365931130736, + "lm_loss": 1.2273, + "step": 5480, + "vm_loss": 0.2033 + }, + { + "epoch": 1.0549365931130736, + "lm_loss": 0.8006, + "step": 5480, + "vm_loss": 0.2182 + }, + { + "epoch": 1.0549365931130736, + "lm_loss": 1.2139, + "step": 5480, + "vm_loss": 0.1171 + }, + { + "epoch": 1.0549365931130736, + "lm_loss": 1.1277, + "step": 5480, + "vm_loss": 0.1697 + }, + { + "epoch": 1.0549365931130736, + "lm_loss": 0.8925, + "step": 5480, + "vm_loss": 0.1345 + }, + { + "epoch": 1.0549365931130736, + "lm_loss": 1.2873, + "step": 5480, + "vm_loss": 0.1708 + }, + { + "epoch": 1.0549365931130736, + "lm_loss": 1.7226, + "step": 5480, + "vm_loss": 0.1946 + }, + { + "epoch": 1.0549365931130736, + "lm_loss": 1.1429, + "step": 5480, + "vm_loss": 0.1557 + }, + { + "epoch": 1.055129099790649, + "grad_norm": 3.1462705924497696, + "learning_rate": 9.591669096656975e-06, + "loss": 1.3087, + "step": 5481 + }, + { + "epoch": 1.0553216064682243, + "grad_norm": 3.1607823724074673, + "learning_rate": 9.588553820299057e-06, + "loss": 1.401, + "step": 5482 + }, + { + "epoch": 1.0555141131457997, + "grad_norm": 3.1466229700315536, + "learning_rate": 9.585438583938976e-06, + "loss": 1.3744, + "step": 5483 + }, + { + "epoch": 1.055706619823375, + "grad_norm": 3.1055546860091576, + "learning_rate": 9.582323387879569e-06, + "loss": 1.2741, + "step": 5484 + }, + { + "epoch": 1.0558991265009505, + "grad_norm": 3.1687489440084318, + "learning_rate": 9.579208232423667e-06, + "loss": 1.3701, + "step": 5485 + }, + { + "epoch": 1.056091633178526, + "grad_norm": 3.2362720401673, + "learning_rate": 9.576093117874112e-06, + "loss": 1.4174, + "step": 5486 + }, + { + "epoch": 1.0562841398561011, + "grad_norm": 3.2887288922445426, + "learning_rate": 9.572978044533729e-06, + "loss": 1.416, + "step": 5487 + }, + { + "epoch": 1.0564766465336766, + "grad_norm": 3.202312231641938, + "learning_rate": 9.569863012705345e-06, + "loss": 1.3477, + "step": 5488 + }, + { + "epoch": 1.0564766465336766, + "lm_loss": 1.2546, + "step": 5488, + "vm_loss": 0.1512 + }, + { + "epoch": 1.0564766465336766, + "lm_loss": 0.9154, + "step": 5488, + "vm_loss": 0.1642 + }, + { + "epoch": 1.0564766465336766, + "lm_loss": 1.4171, + "step": 5488, + "vm_loss": 0.1617 + }, + { + "epoch": 1.0564766465336766, + "lm_loss": 0.7775, + "step": 5488, + "vm_loss": 0.1483 + }, + { + "epoch": 1.0564766465336766, + "lm_loss": 1.4801, + "step": 5488, + "vm_loss": 0.1865 + }, + { + "epoch": 1.0564766465336766, + "lm_loss": 0.9349, + "step": 5488, + "vm_loss": 0.1955 + }, + { + "epoch": 1.0564766465336766, + "lm_loss": 0.7298, + "step": 5488, + "vm_loss": 0.1261 + }, + { + "epoch": 1.0564766465336766, + "lm_loss": 1.5569, + "step": 5488, + "vm_loss": 0.1659 + }, + { + "epoch": 1.056669153211252, + "grad_norm": 3.354804592943215, + "learning_rate": 9.566748022691776e-06, + "loss": 1.324, + "step": 5489 + }, + { + "epoch": 1.0568616598888274, + "grad_norm": 3.1340332711030747, + "learning_rate": 9.563633074795843e-06, + "loss": 1.3168, + "step": 5490 + }, + { + "epoch": 1.0570541665664028, + "grad_norm": 3.19717365800173, + "learning_rate": 9.56051816932036e-06, + "loss": 1.3949, + "step": 5491 + }, + { + "epoch": 1.0572466732439783, + "grad_norm": 3.1980309002715295, + "learning_rate": 9.557403306568128e-06, + "loss": 1.3398, + "step": 5492 + }, + { + "epoch": 1.0574391799215535, + "grad_norm": 3.1294102643117334, + "learning_rate": 9.554288486841963e-06, + "loss": 1.3647, + "step": 5493 + }, + { + "epoch": 1.0576316865991289, + "grad_norm": 3.1787080791133797, + "learning_rate": 9.551173710444658e-06, + "loss": 1.3546, + "step": 5494 + }, + { + "epoch": 1.0578241932767043, + "grad_norm": 3.1608070672607878, + "learning_rate": 9.548058977679009e-06, + "loss": 1.4116, + "step": 5495 + }, + { + "epoch": 1.0580166999542797, + "grad_norm": 3.0895406186405743, + "learning_rate": 9.544944288847811e-06, + "loss": 1.2737, + "step": 5496 + }, + { + "epoch": 1.0580166999542797, + "lm_loss": 1.1827, + "step": 5496, + "vm_loss": 0.1714 + }, + { + "epoch": 1.0580166999542797, + "lm_loss": 1.018, + "step": 5496, + "vm_loss": 0.1615 + }, + { + "epoch": 1.0580166999542797, + "lm_loss": 1.7915, + "step": 5496, + "vm_loss": 0.1769 + }, + { + "epoch": 1.0580166999542797, + "lm_loss": 1.2585, + "step": 5496, + "vm_loss": 0.1336 + }, + { + "epoch": 1.0580166999542797, + "lm_loss": 1.1598, + "step": 5496, + "vm_loss": 0.1275 + }, + { + "epoch": 1.0580166999542797, + "lm_loss": 1.2979, + "step": 5496, + "vm_loss": 0.1647 + }, + { + "epoch": 1.0580166999542797, + "lm_loss": 1.3715, + "step": 5496, + "vm_loss": 0.1617 + }, + { + "epoch": 1.0580166999542797, + "lm_loss": 1.0014, + "step": 5496, + "vm_loss": 0.0986 + }, + { + "epoch": 1.0582092066318551, + "grad_norm": 3.1990702556244517, + "learning_rate": 9.541829644253851e-06, + "loss": 1.3976, + "step": 5497 + }, + { + "epoch": 1.0584017133094303, + "grad_norm": 3.2423345582479715, + "learning_rate": 9.53871504419991e-06, + "loss": 1.4015, + "step": 5498 + }, + { + "epoch": 1.0585942199870058, + "grad_norm": 3.234646377479755, + "learning_rate": 9.535600488988765e-06, + "loss": 1.3359, + "step": 5499 + }, + { + "epoch": 1.0587867266645812, + "grad_norm": 3.283801487146299, + "learning_rate": 9.532485978923199e-06, + "loss": 1.3731, + "step": 5500 + }, + { + "epoch": 1.0589792333421566, + "grad_norm": 3.4712007327410648, + "learning_rate": 9.529371514305976e-06, + "loss": 1.3819, + "step": 5501 + }, + { + "epoch": 1.059171740019732, + "grad_norm": 3.250546190873005, + "learning_rate": 9.526257095439859e-06, + "loss": 1.3562, + "step": 5502 + }, + { + "epoch": 1.0593642466973072, + "grad_norm": 3.1773738322890086, + "learning_rate": 9.523142722627616e-06, + "loss": 1.3108, + "step": 5503 + }, + { + "epoch": 1.0595567533748826, + "grad_norm": 3.1614215400053007, + "learning_rate": 9.520028396172002e-06, + "loss": 1.3152, + "step": 5504 + }, + { + "epoch": 1.0595567533748826, + "lm_loss": 1.0594, + "step": 5504, + "vm_loss": 0.1881 + }, + { + "epoch": 1.0595567533748826, + "lm_loss": 1.1106, + "step": 5504, + "vm_loss": 0.1534 + }, + { + "epoch": 1.0595567533748826, + "lm_loss": 1.6877, + "step": 5504, + "vm_loss": 0.1891 + }, + { + "epoch": 1.0595567533748826, + "lm_loss": 1.1166, + "step": 5504, + "vm_loss": 0.1642 + }, + { + "epoch": 1.0595567533748826, + "lm_loss": 1.3703, + "step": 5504, + "vm_loss": 0.174 + }, + { + "epoch": 1.0595567533748826, + "lm_loss": 0.9894, + "step": 5504, + "vm_loss": 0.1202 + }, + { + "epoch": 1.0595567533748826, + "lm_loss": 1.0837, + "step": 5504, + "vm_loss": 0.2254 + }, + { + "epoch": 1.0595567533748826, + "lm_loss": 1.198, + "step": 5504, + "vm_loss": 0.128 + }, + { + "epoch": 1.059749260052458, + "grad_norm": 3.14156539454186, + "learning_rate": 9.516914116375768e-06, + "loss": 1.3315, + "step": 5505 + }, + { + "epoch": 1.0599417667300335, + "grad_norm": 3.131932772847044, + "learning_rate": 9.51379988354166e-06, + "loss": 1.3831, + "step": 5506 + }, + { + "epoch": 1.060134273407609, + "grad_norm": 3.107728989187229, + "learning_rate": 9.510685697972429e-06, + "loss": 1.347, + "step": 5507 + }, + { + "epoch": 1.060326780085184, + "grad_norm": 3.27751325877077, + "learning_rate": 9.507571559970802e-06, + "loss": 1.4523, + "step": 5508 + }, + { + "epoch": 1.0605192867627595, + "grad_norm": 3.2564747721848604, + "learning_rate": 9.504457469839522e-06, + "loss": 1.3724, + "step": 5509 + }, + { + "epoch": 1.060711793440335, + "grad_norm": 3.1804606065999947, + "learning_rate": 9.501343427881317e-06, + "loss": 1.2729, + "step": 5510 + }, + { + "epoch": 1.0609043001179104, + "grad_norm": 3.2315894137636527, + "learning_rate": 9.49822943439891e-06, + "loss": 1.4042, + "step": 5511 + }, + { + "epoch": 1.0610968067954858, + "grad_norm": 3.2250274987852356, + "learning_rate": 9.49511548969502e-06, + "loss": 1.3667, + "step": 5512 + }, + { + "epoch": 1.0610968067954858, + "lm_loss": 1.3068, + "step": 5512, + "vm_loss": 0.1608 + }, + { + "epoch": 1.0610968067954858, + "lm_loss": 0.7451, + "step": 5512, + "vm_loss": 0.1532 + }, + { + "epoch": 1.0610968067954858, + "lm_loss": 1.3144, + "step": 5512, + "vm_loss": 0.2083 + }, + { + "epoch": 1.0610968067954858, + "lm_loss": 1.3018, + "step": 5512, + "vm_loss": 0.2 + }, + { + "epoch": 1.0610968067954858, + "lm_loss": 1.2121, + "step": 5512, + "vm_loss": 0.1684 + }, + { + "epoch": 1.0610968067954858, + "lm_loss": 1.469, + "step": 5512, + "vm_loss": 0.2339 + }, + { + "epoch": 1.0610968067954858, + "lm_loss": 1.5717, + "step": 5512, + "vm_loss": 0.1265 + }, + { + "epoch": 1.0610968067954858, + "lm_loss": 1.1969, + "step": 5512, + "vm_loss": 0.1868 + }, + { + "epoch": 1.061289313473061, + "grad_norm": 3.161972446545002, + "learning_rate": 9.492001594072364e-06, + "loss": 1.3499, + "step": 5513 + }, + { + "epoch": 1.0614818201506364, + "grad_norm": 3.1706424609459654, + "learning_rate": 9.488887747833655e-06, + "loss": 1.4026, + "step": 5514 + }, + { + "epoch": 1.0616743268282118, + "grad_norm": 3.2230485621414475, + "learning_rate": 9.485773951281594e-06, + "loss": 1.3903, + "step": 5515 + }, + { + "epoch": 1.0618668335057873, + "grad_norm": 3.0607407945570997, + "learning_rate": 9.482660204718882e-06, + "loss": 1.3522, + "step": 5516 + }, + { + "epoch": 1.0620593401833627, + "grad_norm": 3.1370278677580616, + "learning_rate": 9.47954650844822e-06, + "loss": 1.3724, + "step": 5517 + }, + { + "epoch": 1.062251846860938, + "grad_norm": 3.1852560086999655, + "learning_rate": 9.476432862772297e-06, + "loss": 1.3959, + "step": 5518 + }, + { + "epoch": 1.0624443535385133, + "grad_norm": 3.1122660643811337, + "learning_rate": 9.473319267993793e-06, + "loss": 1.383, + "step": 5519 + }, + { + "epoch": 1.0626368602160887, + "grad_norm": 3.115348971023863, + "learning_rate": 9.470205724415397e-06, + "loss": 1.3538, + "step": 5520 + }, + { + "epoch": 1.0626368602160887, + "lm_loss": 0.6927, + "step": 5520, + "vm_loss": 0.2302 + }, + { + "epoch": 1.0626368602160887, + "lm_loss": 1.4636, + "step": 5520, + "vm_loss": 0.2369 + }, + { + "epoch": 1.0626368602160887, + "lm_loss": 0.9029, + "step": 5520, + "vm_loss": 0.1514 + }, + { + "epoch": 1.0626368602160887, + "lm_loss": 1.2039, + "step": 5520, + "vm_loss": 0.1779 + }, + { + "epoch": 1.0626368602160887, + "lm_loss": 1.1532, + "step": 5520, + "vm_loss": 0.152 + }, + { + "epoch": 1.0626368602160887, + "lm_loss": 0.8009, + "step": 5520, + "vm_loss": 0.1272 + }, + { + "epoch": 1.0626368602160887, + "lm_loss": 0.8479, + "step": 5520, + "vm_loss": 0.1864 + }, + { + "epoch": 1.0626368602160887, + "lm_loss": 1.5394, + "step": 5520, + "vm_loss": 0.1724 + }, + { + "epoch": 1.0628293668936641, + "grad_norm": 3.214962216060597, + "learning_rate": 9.467092232339785e-06, + "loss": 1.3926, + "step": 5521 + }, + { + "epoch": 1.0630218735712396, + "grad_norm": 3.257986693392231, + "learning_rate": 9.463978792069624e-06, + "loss": 1.3756, + "step": 5522 + }, + { + "epoch": 1.063214380248815, + "grad_norm": 3.3293203971725145, + "learning_rate": 9.46086540390758e-06, + "loss": 1.3543, + "step": 5523 + }, + { + "epoch": 1.0634068869263902, + "grad_norm": 3.214518068462653, + "learning_rate": 9.457752068156318e-06, + "loss": 1.3075, + "step": 5524 + }, + { + "epoch": 1.0635993936039656, + "grad_norm": 3.30846353584957, + "learning_rate": 9.454638785118494e-06, + "loss": 1.3906, + "step": 5525 + }, + { + "epoch": 1.063791900281541, + "grad_norm": 3.1405904433221874, + "learning_rate": 9.451525555096753e-06, + "loss": 1.3856, + "step": 5526 + }, + { + "epoch": 1.0639844069591164, + "grad_norm": 3.1499927362033153, + "learning_rate": 9.44841237839375e-06, + "loss": 1.3788, + "step": 5527 + }, + { + "epoch": 1.0641769136366919, + "grad_norm": 3.1332903103264544, + "learning_rate": 9.445299255312117e-06, + "loss": 1.3226, + "step": 5528 + }, + { + "epoch": 1.0641769136366919, + "lm_loss": 0.5048, + "step": 5528, + "vm_loss": 0.1027 + }, + { + "epoch": 1.0641769136366919, + "lm_loss": 1.3757, + "step": 5528, + "vm_loss": 0.1541 + }, + { + "epoch": 1.0641769136366919, + "lm_loss": 1.0306, + "step": 5528, + "vm_loss": 0.2452 + }, + { + "epoch": 1.0641769136366919, + "lm_loss": 1.5186, + "step": 5528, + "vm_loss": 0.1714 + }, + { + "epoch": 1.0641769136366919, + "lm_loss": 1.0104, + "step": 5528, + "vm_loss": 0.137 + }, + { + "epoch": 1.0641769136366919, + "lm_loss": 1.0585, + "step": 5528, + "vm_loss": 0.1802 + }, + { + "epoch": 1.0641769136366919, + "lm_loss": 1.8206, + "step": 5528, + "vm_loss": 0.1185 + }, + { + "epoch": 1.0641769136366919, + "lm_loss": 1.0597, + "step": 5528, + "vm_loss": 0.1295 + }, + { + "epoch": 1.064369420314267, + "grad_norm": 3.0925379134096875, + "learning_rate": 9.442186186154493e-06, + "loss": 1.3705, + "step": 5529 + }, + { + "epoch": 1.0645619269918425, + "grad_norm": 3.3061318942947135, + "learning_rate": 9.439073171223511e-06, + "loss": 1.3784, + "step": 5530 + }, + { + "epoch": 1.064754433669418, + "grad_norm": 3.25435156961351, + "learning_rate": 9.435960210821792e-06, + "loss": 1.3472, + "step": 5531 + }, + { + "epoch": 1.0649469403469933, + "grad_norm": 3.26852388794344, + "learning_rate": 9.432847305251957e-06, + "loss": 1.3235, + "step": 5532 + }, + { + "epoch": 1.0651394470245688, + "grad_norm": 3.2186926054919303, + "learning_rate": 9.429734454816616e-06, + "loss": 1.3413, + "step": 5533 + }, + { + "epoch": 1.065331953702144, + "grad_norm": 3.342135743280298, + "learning_rate": 9.426621659818388e-06, + "loss": 1.346, + "step": 5534 + }, + { + "epoch": 1.0655244603797194, + "grad_norm": 3.2890832655300835, + "learning_rate": 9.42350892055987e-06, + "loss": 1.3569, + "step": 5535 + }, + { + "epoch": 1.0657169670572948, + "grad_norm": 3.2046026266689958, + "learning_rate": 9.420396237343654e-06, + "loss": 1.3518, + "step": 5536 + }, + { + "epoch": 1.0657169670572948, + "lm_loss": 1.172, + "step": 5536, + "vm_loss": 0.1542 + }, + { + "epoch": 1.0657169670572948, + "lm_loss": 1.0842, + "step": 5536, + "vm_loss": 0.1968 + }, + { + "epoch": 1.0657169670572948, + "lm_loss": 1.028, + "step": 5536, + "vm_loss": 0.1752 + }, + { + "epoch": 1.0657169670572948, + "lm_loss": 1.0291, + "step": 5536, + "vm_loss": 0.2056 + }, + { + "epoch": 1.0657169670572948, + "lm_loss": 1.2232, + "step": 5536, + "vm_loss": 0.1493 + }, + { + "epoch": 1.0657169670572948, + "lm_loss": 1.5943, + "step": 5536, + "vm_loss": 0.1699 + }, + { + "epoch": 1.0657169670572948, + "lm_loss": 1.6264, + "step": 5536, + "vm_loss": 0.2396 + }, + { + "epoch": 1.0657169670572948, + "lm_loss": 1.5356, + "step": 5536, + "vm_loss": 0.1374 + }, + { + "epoch": 1.0659094737348702, + "grad_norm": 3.2119574975529157, + "learning_rate": 9.417283610472345e-06, + "loss": 1.3855, + "step": 5537 + }, + { + "epoch": 1.0661019804124456, + "grad_norm": 3.209459213728874, + "learning_rate": 9.414171040248523e-06, + "loss": 1.4334, + "step": 5538 + }, + { + "epoch": 1.0662944870900208, + "grad_norm": 3.2173801614183684, + "learning_rate": 9.41105852697477e-06, + "loss": 1.4036, + "step": 5539 + }, + { + "epoch": 1.0664869937675963, + "grad_norm": 3.0550331138758278, + "learning_rate": 9.407946070953662e-06, + "loss": 1.323, + "step": 5540 + }, + { + "epoch": 1.0666795004451717, + "grad_norm": 3.356867240591646, + "learning_rate": 9.404833672487772e-06, + "loss": 1.3927, + "step": 5541 + }, + { + "epoch": 1.066872007122747, + "grad_norm": 3.3540095447689113, + "learning_rate": 9.401721331879665e-06, + "loss": 1.4108, + "step": 5542 + }, + { + "epoch": 1.0670645138003225, + "grad_norm": 3.2693585638277662, + "learning_rate": 9.398609049431894e-06, + "loss": 1.3606, + "step": 5543 + }, + { + "epoch": 1.0672570204778977, + "grad_norm": 3.257143753258995, + "learning_rate": 9.395496825447022e-06, + "loss": 1.383, + "step": 5544 + }, + { + "epoch": 1.0672570204778977, + "lm_loss": 0.7556, + "step": 5544, + "vm_loss": 0.1722 + }, + { + "epoch": 1.0672570204778977, + "lm_loss": 0.8079, + "step": 5544, + "vm_loss": 0.1931 + }, + { + "epoch": 1.0672570204778977, + "lm_loss": 1.2367, + "step": 5544, + "vm_loss": 0.1289 + }, + { + "epoch": 1.0672570204778977, + "lm_loss": 1.1791, + "step": 5544, + "vm_loss": 0.2135 + }, + { + "epoch": 1.0672570204778977, + "lm_loss": 1.3971, + "step": 5544, + "vm_loss": 0.1788 + }, + { + "epoch": 1.0672570204778977, + "lm_loss": 1.512, + "step": 5544, + "vm_loss": 0.1219 + }, + { + "epoch": 1.0672570204778977, + "lm_loss": 1.8917, + "step": 5544, + "vm_loss": 0.1209 + }, + { + "epoch": 1.0672570204778977, + "lm_loss": 0.9322, + "step": 5544, + "vm_loss": 0.204 + }, + { + "epoch": 1.0674495271554731, + "grad_norm": 3.2613928042766887, + "learning_rate": 9.392384660227591e-06, + "loss": 1.3759, + "step": 5545 + }, + { + "epoch": 1.0676420338330486, + "grad_norm": 3.1389333600898923, + "learning_rate": 9.389272554076142e-06, + "loss": 1.3723, + "step": 5546 + }, + { + "epoch": 1.067834540510624, + "grad_norm": 3.126299361772461, + "learning_rate": 9.386160507295217e-06, + "loss": 1.3228, + "step": 5547 + }, + { + "epoch": 1.0680270471881994, + "grad_norm": 3.2267531044216513, + "learning_rate": 9.383048520187344e-06, + "loss": 1.3486, + "step": 5548 + }, + { + "epoch": 1.0682195538657746, + "grad_norm": 3.1449322864087357, + "learning_rate": 9.379936593055048e-06, + "loss": 1.3331, + "step": 5549 + }, + { + "epoch": 1.06841206054335, + "grad_norm": 3.22693186688896, + "learning_rate": 9.376824726200847e-06, + "loss": 1.3342, + "step": 5550 + }, + { + "epoch": 1.0686045672209254, + "grad_norm": 3.3067000824509503, + "learning_rate": 9.373712919927255e-06, + "loss": 1.3576, + "step": 5551 + }, + { + "epoch": 1.0687970738985009, + "grad_norm": 3.4724923628475, + "learning_rate": 9.37060117453678e-06, + "loss": 1.433, + "step": 5552 + }, + { + "epoch": 1.0687970738985009, + "lm_loss": 0.9841, + "step": 5552, + "vm_loss": 0.1145 + }, + { + "epoch": 1.0687970738985009, + "lm_loss": 1.5305, + "step": 5552, + "vm_loss": 0.0796 + }, + { + "epoch": 1.0687970738985009, + "lm_loss": 1.6985, + "step": 5552, + "vm_loss": 0.1494 + }, + { + "epoch": 1.0687970738985009, + "lm_loss": 1.5509, + "step": 5552, + "vm_loss": 0.1576 + }, + { + "epoch": 1.0687970738985009, + "lm_loss": 0.9366, + "step": 5552, + "vm_loss": 0.1439 + }, + { + "epoch": 1.0687970738985009, + "lm_loss": 1.1486, + "step": 5552, + "vm_loss": 0.2215 + }, + { + "epoch": 1.0687970738985009, + "lm_loss": 1.841, + "step": 5552, + "vm_loss": 0.1578 + }, + { + "epoch": 1.0687970738985009, + "lm_loss": 1.1481, + "step": 5552, + "vm_loss": 0.1188 + }, + { + "epoch": 1.0689895805760763, + "grad_norm": 3.24294221239512, + "learning_rate": 9.367489490331923e-06, + "loss": 1.3523, + "step": 5553 + }, + { + "epoch": 1.0691820872536517, + "grad_norm": 3.132357727758026, + "learning_rate": 9.36437786761518e-06, + "loss": 1.3414, + "step": 5554 + }, + { + "epoch": 1.069374593931227, + "grad_norm": 3.2264972217198933, + "learning_rate": 9.361266306689043e-06, + "loss": 1.3088, + "step": 5555 + }, + { + "epoch": 1.0695671006088023, + "grad_norm": 3.1989211828158988, + "learning_rate": 9.358154807855992e-06, + "loss": 1.3214, + "step": 5556 + }, + { + "epoch": 1.0697596072863778, + "grad_norm": 3.248003844905881, + "learning_rate": 9.355043371418501e-06, + "loss": 1.3495, + "step": 5557 + }, + { + "epoch": 1.0699521139639532, + "grad_norm": 3.29257100010339, + "learning_rate": 9.351931997679049e-06, + "loss": 1.3572, + "step": 5558 + }, + { + "epoch": 1.0701446206415286, + "grad_norm": 3.273101648818866, + "learning_rate": 9.348820686940099e-06, + "loss": 1.3609, + "step": 5559 + }, + { + "epoch": 1.0703371273191038, + "grad_norm": 3.3311418105670203, + "learning_rate": 9.345709439504104e-06, + "loss": 1.3484, + "step": 5560 + }, + { + "epoch": 1.0703371273191038, + "lm_loss": 1.2996, + "step": 5560, + "vm_loss": 0.1979 + }, + { + "epoch": 1.0703371273191038, + "lm_loss": 1.2805, + "step": 5560, + "vm_loss": 0.16 + }, + { + "epoch": 1.0703371273191038, + "lm_loss": 0.7554, + "step": 5560, + "vm_loss": 0.1621 + }, + { + "epoch": 1.0703371273191038, + "lm_loss": 0.9952, + "step": 5560, + "vm_loss": 0.2108 + }, + { + "epoch": 1.0703371273191038, + "lm_loss": 1.5369, + "step": 5560, + "vm_loss": 0.176 + }, + { + "epoch": 1.0703371273191038, + "lm_loss": 1.3448, + "step": 5560, + "vm_loss": 0.1013 + }, + { + "epoch": 1.0703371273191038, + "lm_loss": 1.0685, + "step": 5560, + "vm_loss": 0.1372 + }, + { + "epoch": 1.0703371273191038, + "lm_loss": 0.8945, + "step": 5560, + "vm_loss": 0.174 + }, + { + "epoch": 1.0705296339966792, + "grad_norm": 3.2508346065296707, + "learning_rate": 9.342598255673528e-06, + "loss": 1.3502, + "step": 5561 + }, + { + "epoch": 1.0707221406742546, + "grad_norm": 3.2695752230137325, + "learning_rate": 9.339487135750811e-06, + "loss": 1.3456, + "step": 5562 + }, + { + "epoch": 1.07091464735183, + "grad_norm": 3.1136596016957325, + "learning_rate": 9.336376080038397e-06, + "loss": 1.2774, + "step": 5563 + }, + { + "epoch": 1.0711071540294055, + "grad_norm": 3.2071240217875294, + "learning_rate": 9.333265088838713e-06, + "loss": 1.3178, + "step": 5564 + }, + { + "epoch": 1.0712996607069807, + "grad_norm": 3.325872811973558, + "learning_rate": 9.330154162454194e-06, + "loss": 1.3159, + "step": 5565 + }, + { + "epoch": 1.071492167384556, + "grad_norm": 3.3147133497472145, + "learning_rate": 9.327043301187264e-06, + "loss": 1.4158, + "step": 5566 + }, + { + "epoch": 1.0716846740621315, + "grad_norm": 3.1779783457830755, + "learning_rate": 9.323932505340327e-06, + "loss": 1.356, + "step": 5567 + }, + { + "epoch": 1.071877180739707, + "grad_norm": 3.24784563060547, + "learning_rate": 9.320821775215807e-06, + "loss": 1.2972, + "step": 5568 + }, + { + "epoch": 1.071877180739707, + "lm_loss": 1.3385, + "step": 5568, + "vm_loss": 0.205 + }, + { + "epoch": 1.071877180739707, + "lm_loss": 0.9812, + "step": 5568, + "vm_loss": 0.1651 + }, + { + "epoch": 1.071877180739707, + "lm_loss": 1.2648, + "step": 5568, + "vm_loss": 0.135 + }, + { + "epoch": 1.071877180739707, + "lm_loss": 1.035, + "step": 5568, + "vm_loss": 0.1564 + }, + { + "epoch": 1.071877180739707, + "lm_loss": 1.7109, + "step": 5568, + "vm_loss": 0.2149 + }, + { + "epoch": 1.071877180739707, + "lm_loss": 1.3619, + "step": 5568, + "vm_loss": 0.2053 + }, + { + "epoch": 1.071877180739707, + "lm_loss": 1.6522, + "step": 5568, + "vm_loss": 0.1622 + }, + { + "epoch": 1.071877180739707, + "lm_loss": 0.805, + "step": 5568, + "vm_loss": 0.1816 + }, + { + "epoch": 1.0720696874172824, + "grad_norm": 3.47390906500689, + "learning_rate": 9.317711111116098e-06, + "loss": 1.4401, + "step": 5569 + }, + { + "epoch": 1.0722621940948576, + "grad_norm": 3.0915878509853334, + "learning_rate": 9.314600513343595e-06, + "loss": 1.3178, + "step": 5570 + }, + { + "epoch": 1.072454700772433, + "grad_norm": 3.1851176812917505, + "learning_rate": 9.311489982200695e-06, + "loss": 1.3234, + "step": 5571 + }, + { + "epoch": 1.0726472074500084, + "grad_norm": 3.2647896275943356, + "learning_rate": 9.308379517989775e-06, + "loss": 1.3934, + "step": 5572 + }, + { + "epoch": 1.0728397141275838, + "grad_norm": 3.1518661523441134, + "learning_rate": 9.305269121013214e-06, + "loss": 1.3106, + "step": 5573 + }, + { + "epoch": 1.0730322208051593, + "grad_norm": 3.1579274267979756, + "learning_rate": 9.302158791573384e-06, + "loss": 1.308, + "step": 5574 + }, + { + "epoch": 1.0732247274827347, + "grad_norm": 3.294144867619082, + "learning_rate": 9.299048529972647e-06, + "loss": 1.3786, + "step": 5575 + }, + { + "epoch": 1.0734172341603099, + "grad_norm": 3.2827801171325333, + "learning_rate": 9.295938336513362e-06, + "loss": 1.4, + "step": 5576 + }, + { + "epoch": 1.0734172341603099, + "lm_loss": 1.0909, + "step": 5576, + "vm_loss": 0.2028 + }, + { + "epoch": 1.0734172341603099, + "lm_loss": 0.8172, + "step": 5576, + "vm_loss": 0.2114 + }, + { + "epoch": 1.0734172341603099, + "lm_loss": 1.3225, + "step": 5576, + "vm_loss": 0.1618 + }, + { + "epoch": 1.0734172341603099, + "lm_loss": 1.0128, + "step": 5576, + "vm_loss": 0.1604 + }, + { + "epoch": 1.0734172341603099, + "lm_loss": 1.446, + "step": 5576, + "vm_loss": 0.1765 + }, + { + "epoch": 1.0734172341603099, + "lm_loss": 0.8409, + "step": 5576, + "vm_loss": 0.2027 + }, + { + "epoch": 1.0734172341603099, + "lm_loss": 1.2535, + "step": 5576, + "vm_loss": 0.1931 + }, + { + "epoch": 1.0734172341603099, + "lm_loss": 1.2566, + "step": 5576, + "vm_loss": 0.141 + }, + { + "epoch": 1.0736097408378853, + "grad_norm": 3.3056015475490987, + "learning_rate": 9.292828211497873e-06, + "loss": 1.4011, + "step": 5577 + }, + { + "epoch": 1.0738022475154607, + "grad_norm": 3.191573499434425, + "learning_rate": 9.289718155228535e-06, + "loss": 1.345, + "step": 5578 + }, + { + "epoch": 1.0739947541930361, + "grad_norm": 3.2581016160008955, + "learning_rate": 9.286608168007678e-06, + "loss": 1.3927, + "step": 5579 + }, + { + "epoch": 1.0741872608706116, + "grad_norm": 3.1994116204612206, + "learning_rate": 9.283498250137636e-06, + "loss": 1.3218, + "step": 5580 + }, + { + "epoch": 1.0743797675481868, + "grad_norm": 3.151571378839607, + "learning_rate": 9.280388401920724e-06, + "loss": 1.3232, + "step": 5581 + }, + { + "epoch": 1.0745722742257622, + "grad_norm": 3.165131733201607, + "learning_rate": 9.277278623659273e-06, + "loss": 1.3694, + "step": 5582 + }, + { + "epoch": 1.0747647809033376, + "grad_norm": 3.320884928187641, + "learning_rate": 9.274168915655583e-06, + "loss": 1.3723, + "step": 5583 + }, + { + "epoch": 1.074957287580913, + "grad_norm": 3.1834172221634387, + "learning_rate": 9.271059278211959e-06, + "loss": 1.3614, + "step": 5584 + }, + { + "epoch": 1.074957287580913, + "lm_loss": 1.196, + "step": 5584, + "vm_loss": 0.1141 + }, + { + "epoch": 1.074957287580913, + "lm_loss": 1.4323, + "step": 5584, + "vm_loss": 0.1916 + }, + { + "epoch": 1.074957287580913, + "lm_loss": 1.7479, + "step": 5584, + "vm_loss": 0.185 + }, + { + "epoch": 1.074957287580913, + "lm_loss": 0.8278, + "step": 5584, + "vm_loss": 0.1806 + }, + { + "epoch": 1.074957287580913, + "lm_loss": 1.8402, + "step": 5584, + "vm_loss": 0.1405 + }, + { + "epoch": 1.074957287580913, + "lm_loss": 1.0791, + "step": 5584, + "vm_loss": 0.1647 + }, + { + "epoch": 1.074957287580913, + "lm_loss": 1.6947, + "step": 5584, + "vm_loss": 0.167 + }, + { + "epoch": 1.074957287580913, + "lm_loss": 1.214, + "step": 5584, + "vm_loss": 0.1998 + }, + { + "epoch": 1.0751497942584884, + "grad_norm": 3.1486265201118853, + "learning_rate": 9.267949711630701e-06, + "loss": 1.3818, + "step": 5585 + }, + { + "epoch": 1.0753423009360636, + "grad_norm": 3.366567619134176, + "learning_rate": 9.264840216214099e-06, + "loss": 1.3255, + "step": 5586 + }, + { + "epoch": 1.075534807613639, + "grad_norm": 3.125600940091369, + "learning_rate": 9.261730792264429e-06, + "loss": 1.2776, + "step": 5587 + }, + { + "epoch": 1.0757273142912145, + "grad_norm": 3.0778228260380596, + "learning_rate": 9.258621440083973e-06, + "loss": 1.3043, + "step": 5588 + }, + { + "epoch": 1.07591982096879, + "grad_norm": 3.128962069772777, + "learning_rate": 9.255512159975e-06, + "loss": 1.3145, + "step": 5589 + }, + { + "epoch": 1.0761123276463653, + "grad_norm": 3.201602437326969, + "learning_rate": 9.252402952239767e-06, + "loss": 1.3456, + "step": 5590 + }, + { + "epoch": 1.0763048343239405, + "grad_norm": 3.1683037764909714, + "learning_rate": 9.249293817180532e-06, + "loss": 1.3172, + "step": 5591 + }, + { + "epoch": 1.076497341001516, + "grad_norm": 3.334446013280881, + "learning_rate": 9.246184755099545e-06, + "loss": 1.4251, + "step": 5592 + }, + { + "epoch": 1.076497341001516, + "lm_loss": 1.3221, + "step": 5592, + "vm_loss": 0.1453 + }, + { + "epoch": 1.076497341001516, + "lm_loss": 1.1799, + "step": 5592, + "vm_loss": 0.1263 + }, + { + "epoch": 1.076497341001516, + "lm_loss": 1.3996, + "step": 5592, + "vm_loss": 0.1251 + }, + { + "epoch": 1.076497341001516, + "lm_loss": 0.9323, + "step": 5592, + "vm_loss": 0.204 + }, + { + "epoch": 1.076497341001516, + "lm_loss": 1.3081, + "step": 5592, + "vm_loss": 0.1388 + }, + { + "epoch": 1.076497341001516, + "lm_loss": 1.3987, + "step": 5592, + "vm_loss": 0.203 + }, + { + "epoch": 1.076497341001516, + "lm_loss": 0.9182, + "step": 5592, + "vm_loss": 0.1243 + }, + { + "epoch": 1.076497341001516, + "lm_loss": 1.0443, + "step": 5592, + "vm_loss": 0.1355 + }, + { + "epoch": 1.0766898476790914, + "grad_norm": 3.0388159730339903, + "learning_rate": 9.243075766299042e-06, + "loss": 1.314, + "step": 5593 + }, + { + "epoch": 1.0768823543566668, + "grad_norm": 3.1361715749433436, + "learning_rate": 9.239966851081256e-06, + "loss": 1.3035, + "step": 5594 + }, + { + "epoch": 1.0770748610342422, + "grad_norm": 3.310299147627332, + "learning_rate": 9.236858009748417e-06, + "loss": 1.351, + "step": 5595 + }, + { + "epoch": 1.0772673677118174, + "grad_norm": 3.142324914262676, + "learning_rate": 9.233749242602745e-06, + "loss": 1.3388, + "step": 5596 + }, + { + "epoch": 1.0774598743893928, + "grad_norm": 3.1156432864499126, + "learning_rate": 9.23064054994645e-06, + "loss": 1.3499, + "step": 5597 + }, + { + "epoch": 1.0776523810669683, + "grad_norm": 3.2268799191935313, + "learning_rate": 9.227531932081732e-06, + "loss": 1.3481, + "step": 5598 + }, + { + "epoch": 1.0778448877445437, + "grad_norm": 3.293693605554753, + "learning_rate": 9.224423389310799e-06, + "loss": 1.2627, + "step": 5599 + }, + { + "epoch": 1.078037394422119, + "grad_norm": 3.20375506598723, + "learning_rate": 9.221314921935833e-06, + "loss": 1.3245, + "step": 5600 + }, + { + "epoch": 1.078037394422119, + "lm_loss": 1.5035, + "step": 5600, + "vm_loss": 0.1172 + }, + { + "epoch": 1.078037394422119, + "lm_loss": 1.3449, + "step": 5600, + "vm_loss": 0.1952 + }, + { + "epoch": 1.078037394422119, + "lm_loss": 1.1092, + "step": 5600, + "vm_loss": 0.1765 + }, + { + "epoch": 1.078037394422119, + "lm_loss": 1.2073, + "step": 5600, + "vm_loss": 0.1639 + }, + { + "epoch": 1.078037394422119, + "lm_loss": 0.8621, + "step": 5600, + "vm_loss": 0.1113 + }, + { + "epoch": 1.078037394422119, + "lm_loss": 1.3378, + "step": 5600, + "vm_loss": 0.1501 + }, + { + "epoch": 1.078037394422119, + "lm_loss": 1.4005, + "step": 5600, + "vm_loss": 0.1096 + }, + { + "epoch": 1.078037394422119, + "lm_loss": 1.3804, + "step": 5600, + "vm_loss": 0.1565 + }, + { + "epoch": 1.0782299010996943, + "grad_norm": 3.4237851172723883, + "learning_rate": 9.218206530259016e-06, + "loss": 1.4015, + "step": 5601 + }, + { + "epoch": 1.0784224077772697, + "grad_norm": 3.220122858294109, + "learning_rate": 9.21509821458253e-06, + "loss": 1.3112, + "step": 5602 + }, + { + "epoch": 1.0786149144548451, + "grad_norm": 3.2244602976168664, + "learning_rate": 9.211989975208538e-06, + "loss": 1.3542, + "step": 5603 + }, + { + "epoch": 1.0788074211324206, + "grad_norm": 3.211157743767312, + "learning_rate": 9.208881812439199e-06, + "loss": 1.4079, + "step": 5604 + }, + { + "epoch": 1.078999927809996, + "grad_norm": 3.0995121417458336, + "learning_rate": 9.205773726576673e-06, + "loss": 1.3484, + "step": 5605 + }, + { + "epoch": 1.0791924344875712, + "grad_norm": 3.4115777887664946, + "learning_rate": 9.202665717923103e-06, + "loss": 1.3965, + "step": 5606 + }, + { + "epoch": 1.0793849411651466, + "grad_norm": 3.1127732275476028, + "learning_rate": 9.199557786780626e-06, + "loss": 1.2936, + "step": 5607 + }, + { + "epoch": 1.079577447842722, + "grad_norm": 3.116278383862214, + "learning_rate": 9.196449933451366e-06, + "loss": 1.3132, + "step": 5608 + }, + { + "epoch": 1.079577447842722, + "lm_loss": 1.3763, + "step": 5608, + "vm_loss": 0.1814 + }, + { + "epoch": 1.079577447842722, + "lm_loss": 0.8764, + "step": 5608, + "vm_loss": 0.1831 + }, + { + "epoch": 1.079577447842722, + "lm_loss": 0.8006, + "step": 5608, + "vm_loss": 0.1608 + }, + { + "epoch": 1.079577447842722, + "lm_loss": 1.103, + "step": 5608, + "vm_loss": 0.1586 + }, + { + "epoch": 1.079577447842722, + "lm_loss": 1.3528, + "step": 5608, + "vm_loss": 0.1898 + }, + { + "epoch": 1.079577447842722, + "lm_loss": 0.9172, + "step": 5608, + "vm_loss": 0.1957 + }, + { + "epoch": 1.079577447842722, + "lm_loss": 1.5046, + "step": 5608, + "vm_loss": 0.1887 + }, + { + "epoch": 1.079577447842722, + "lm_loss": 0.5672, + "step": 5608, + "vm_loss": 0.1426 + }, + { + "epoch": 1.0797699545202974, + "grad_norm": 3.1853642651186154, + "learning_rate": 9.193342158237461e-06, + "loss": 1.3346, + "step": 5609 + }, + { + "epoch": 1.0799624611978729, + "grad_norm": 3.2617039931768024, + "learning_rate": 9.190234461441015e-06, + "loss": 1.3073, + "step": 5610 + }, + { + "epoch": 1.080154967875448, + "grad_norm": 3.272132300401591, + "learning_rate": 9.187126843364138e-06, + "loss": 1.3388, + "step": 5611 + }, + { + "epoch": 1.0803474745530235, + "grad_norm": 3.410332008063213, + "learning_rate": 9.184019304308933e-06, + "loss": 1.369, + "step": 5612 + }, + { + "epoch": 1.080539981230599, + "grad_norm": 3.500265904636164, + "learning_rate": 9.180911844577492e-06, + "loss": 1.4171, + "step": 5613 + }, + { + "epoch": 1.0807324879081743, + "grad_norm": 3.1231867091659318, + "learning_rate": 9.177804464471897e-06, + "loss": 1.3271, + "step": 5614 + }, + { + "epoch": 1.0809249945857498, + "grad_norm": 3.1650242742528083, + "learning_rate": 9.174697164294228e-06, + "loss": 1.3551, + "step": 5615 + }, + { + "epoch": 1.081117501263325, + "grad_norm": 3.1344394035953944, + "learning_rate": 9.171589944346552e-06, + "loss": 1.3229, + "step": 5616 + }, + { + "epoch": 1.081117501263325, + "lm_loss": 1.1695, + "step": 5616, + "vm_loss": 0.1476 + }, + { + "epoch": 1.081117501263325, + "lm_loss": 1.6816, + "step": 5616, + "vm_loss": 0.158 + }, + { + "epoch": 1.081117501263325, + "lm_loss": 1.4095, + "step": 5616, + "vm_loss": 0.1377 + }, + { + "epoch": 1.081117501263325, + "lm_loss": 1.3409, + "step": 5616, + "vm_loss": 0.1599 + }, + { + "epoch": 1.081117501263325, + "lm_loss": 1.3467, + "step": 5616, + "vm_loss": 0.1453 + }, + { + "epoch": 1.081117501263325, + "lm_loss": 0.7703, + "step": 5616, + "vm_loss": 0.1006 + }, + { + "epoch": 1.081117501263325, + "lm_loss": 1.14, + "step": 5616, + "vm_loss": 0.089 + }, + { + "epoch": 1.081117501263325, + "lm_loss": 0.7698, + "step": 5616, + "vm_loss": 0.2383 + }, + { + "epoch": 1.0813100079409004, + "grad_norm": 3.2346473025052904, + "learning_rate": 9.168482804930935e-06, + "loss": 1.3788, + "step": 5617 + }, + { + "epoch": 1.0815025146184758, + "grad_norm": 3.2466149876167174, + "learning_rate": 9.165375746349424e-06, + "loss": 1.3781, + "step": 5618 + }, + { + "epoch": 1.0816950212960512, + "grad_norm": 3.2442343757848104, + "learning_rate": 9.16226876890407e-06, + "loss": 1.3038, + "step": 5619 + }, + { + "epoch": 1.0818875279736266, + "grad_norm": 3.188876304079787, + "learning_rate": 9.15916187289691e-06, + "loss": 1.2986, + "step": 5620 + }, + { + "epoch": 1.082080034651202, + "grad_norm": 3.4713123129512615, + "learning_rate": 9.15605505862997e-06, + "loss": 1.3816, + "step": 5621 + }, + { + "epoch": 1.0822725413287773, + "grad_norm": 3.160790565480621, + "learning_rate": 9.15294832640528e-06, + "loss": 1.2938, + "step": 5622 + }, + { + "epoch": 1.0824650480063527, + "grad_norm": 3.1861005575735044, + "learning_rate": 9.14984167652485e-06, + "loss": 1.3553, + "step": 5623 + }, + { + "epoch": 1.082657554683928, + "grad_norm": 3.242793708904605, + "learning_rate": 9.146735109290685e-06, + "loss": 1.3536, + "step": 5624 + }, + { + "epoch": 1.082657554683928, + "lm_loss": 1.784, + "step": 5624, + "vm_loss": 0.1503 + }, + { + "epoch": 1.082657554683928, + "lm_loss": 0.9895, + "step": 5624, + "vm_loss": 0.1549 + }, + { + "epoch": 1.082657554683928, + "lm_loss": 1.4984, + "step": 5624, + "vm_loss": 0.1481 + }, + { + "epoch": 1.082657554683928, + "lm_loss": 0.901, + "step": 5624, + "vm_loss": 0.2059 + }, + { + "epoch": 1.082657554683928, + "lm_loss": 1.0796, + "step": 5624, + "vm_loss": 0.0945 + }, + { + "epoch": 1.082657554683928, + "lm_loss": 0.9166, + "step": 5624, + "vm_loss": 0.1451 + }, + { + "epoch": 1.082657554683928, + "lm_loss": 1.0005, + "step": 5624, + "vm_loss": 0.129 + }, + { + "epoch": 1.082657554683928, + "lm_loss": 1.0859, + "step": 5624, + "vm_loss": 0.2272 + }, + { + "epoch": 1.0828500613615035, + "grad_norm": 3.2347946021845155, + "learning_rate": 9.14362862500478e-06, + "loss": 1.3152, + "step": 5625 + }, + { + "epoch": 1.083042568039079, + "grad_norm": 3.3486793454807176, + "learning_rate": 9.140522223969132e-06, + "loss": 1.3904, + "step": 5626 + }, + { + "epoch": 1.0832350747166541, + "grad_norm": 3.182503554964886, + "learning_rate": 9.13741590648572e-06, + "loss": 1.2623, + "step": 5627 + }, + { + "epoch": 1.0834275813942296, + "grad_norm": 3.238017989777943, + "learning_rate": 9.134309672856513e-06, + "loss": 1.3072, + "step": 5628 + }, + { + "epoch": 1.083620088071805, + "grad_norm": 3.216862328776496, + "learning_rate": 9.131203523383487e-06, + "loss": 1.3622, + "step": 5629 + }, + { + "epoch": 1.0838125947493804, + "grad_norm": 3.2114941720973147, + "learning_rate": 9.128097458368593e-06, + "loss": 1.3098, + "step": 5630 + }, + { + "epoch": 1.0840051014269558, + "grad_norm": 3.1529689407249943, + "learning_rate": 9.12499147811378e-06, + "loss": 1.2471, + "step": 5631 + }, + { + "epoch": 1.084197608104531, + "grad_norm": 3.1505777818508, + "learning_rate": 9.121885582920987e-06, + "loss": 1.2947, + "step": 5632 + }, + { + "epoch": 1.084197608104531, + "lm_loss": 0.8096, + "step": 5632, + "vm_loss": 0.144 + }, + { + "epoch": 1.084197608104531, + "lm_loss": 1.4151, + "step": 5632, + "vm_loss": 0.1593 + }, + { + "epoch": 1.084197608104531, + "lm_loss": 1.5111, + "step": 5632, + "vm_loss": 0.1688 + }, + { + "epoch": 1.084197608104531, + "lm_loss": 1.0143, + "step": 5632, + "vm_loss": 0.147 + }, + { + "epoch": 1.084197608104531, + "lm_loss": 1.0986, + "step": 5632, + "vm_loss": 0.1325 + }, + { + "epoch": 1.084197608104531, + "lm_loss": 0.9617, + "step": 5632, + "vm_loss": 0.1348 + }, + { + "epoch": 1.084197608104531, + "lm_loss": 0.9046, + "step": 5632, + "vm_loss": 0.1395 + }, + { + "epoch": 1.084197608104531, + "lm_loss": 1.2678, + "step": 5632, + "vm_loss": 0.1514 + }, + { + "epoch": 1.0843901147821065, + "grad_norm": 3.2413186979891457, + "learning_rate": 9.118779773092154e-06, + "loss": 1.2555, + "step": 5633 + }, + { + "epoch": 1.0845826214596819, + "grad_norm": 3.295569042665185, + "learning_rate": 9.115674048929199e-06, + "loss": 1.3234, + "step": 5634 + }, + { + "epoch": 1.0847751281372573, + "grad_norm": 3.426311384291926, + "learning_rate": 9.112568410734039e-06, + "loss": 1.3859, + "step": 5635 + }, + { + "epoch": 1.0849676348148327, + "grad_norm": 3.2377294893857496, + "learning_rate": 9.109462858808586e-06, + "loss": 1.2857, + "step": 5636 + }, + { + "epoch": 1.085160141492408, + "grad_norm": 3.242225588204388, + "learning_rate": 9.106357393454734e-06, + "loss": 1.3465, + "step": 5637 + }, + { + "epoch": 1.0853526481699833, + "grad_norm": 3.1525927900142503, + "learning_rate": 9.10325201497438e-06, + "loss": 1.3269, + "step": 5638 + }, + { + "epoch": 1.0855451548475588, + "grad_norm": 3.1941445231616745, + "learning_rate": 9.100146723669397e-06, + "loss": 1.3537, + "step": 5639 + }, + { + "epoch": 1.0857376615251342, + "grad_norm": 3.1498306529255493, + "learning_rate": 9.09704151984167e-06, + "loss": 1.3055, + "step": 5640 + }, + { + "epoch": 1.0857376615251342, + "lm_loss": 0.7588, + "step": 5640, + "vm_loss": 0.1227 + }, + { + "epoch": 1.0857376615251342, + "lm_loss": 0.6631, + "step": 5640, + "vm_loss": 0.1212 + }, + { + "epoch": 1.0857376615251342, + "lm_loss": 1.1715, + "step": 5640, + "vm_loss": 0.1507 + }, + { + "epoch": 1.0857376615251342, + "lm_loss": 1.2777, + "step": 5640, + "vm_loss": 0.1841 + }, + { + "epoch": 1.0857376615251342, + "lm_loss": 1.0492, + "step": 5640, + "vm_loss": 0.1537 + }, + { + "epoch": 1.0857376615251342, + "lm_loss": 0.7675, + "step": 5640, + "vm_loss": 0.139 + }, + { + "epoch": 1.0857376615251342, + "lm_loss": 0.9948, + "step": 5640, + "vm_loss": 0.2053 + }, + { + "epoch": 1.0857376615251342, + "lm_loss": 1.5775, + "step": 5640, + "vm_loss": 0.1509 + }, + { + "epoch": 1.0859301682027096, + "grad_norm": 3.146314131853108, + "learning_rate": 9.093936403793057e-06, + "loss": 1.2698, + "step": 5641 + }, + { + "epoch": 1.086122674880285, + "grad_norm": 3.1552392868556, + "learning_rate": 9.090831375825416e-06, + "loss": 1.2445, + "step": 5642 + }, + { + "epoch": 1.0863151815578602, + "grad_norm": 3.3170929025036187, + "learning_rate": 9.087726436240601e-06, + "loss": 1.316, + "step": 5643 + }, + { + "epoch": 1.0865076882354356, + "grad_norm": 3.3415596394813414, + "learning_rate": 9.084621585340448e-06, + "loss": 1.2846, + "step": 5644 + }, + { + "epoch": 1.086700194913011, + "grad_norm": 3.3089984096724074, + "learning_rate": 9.081516823426783e-06, + "loss": 1.2825, + "step": 5645 + }, + { + "epoch": 1.0868927015905865, + "grad_norm": 3.3521087164646284, + "learning_rate": 9.078412150801438e-06, + "loss": 1.3557, + "step": 5646 + }, + { + "epoch": 1.087085208268162, + "grad_norm": 3.2254473086601916, + "learning_rate": 9.075307567766222e-06, + "loss": 1.2782, + "step": 5647 + }, + { + "epoch": 1.087277714945737, + "grad_norm": 3.1440803048245423, + "learning_rate": 9.072203074622941e-06, + "loss": 1.2887, + "step": 5648 + }, + { + "epoch": 1.087277714945737, + "lm_loss": 1.6449, + "step": 5648, + "vm_loss": 0.1782 + }, + { + "epoch": 1.087277714945737, + "lm_loss": 1.3673, + "step": 5648, + "vm_loss": 0.1558 + }, + { + "epoch": 1.087277714945737, + "lm_loss": 1.4146, + "step": 5648, + "vm_loss": 0.1578 + }, + { + "epoch": 1.087277714945737, + "lm_loss": 0.9623, + "step": 5648, + "vm_loss": 0.1606 + }, + { + "epoch": 1.087277714945737, + "lm_loss": 1.0349, + "step": 5648, + "vm_loss": 0.2062 + }, + { + "epoch": 1.087277714945737, + "lm_loss": 0.9455, + "step": 5648, + "vm_loss": 0.1353 + }, + { + "epoch": 1.087277714945737, + "lm_loss": 1.7619, + "step": 5648, + "vm_loss": 0.1328 + }, + { + "epoch": 1.087277714945737, + "lm_loss": 0.6367, + "step": 5648, + "vm_loss": 0.1166 + }, + { + "epoch": 1.0874702216233125, + "grad_norm": 3.2512703613371086, + "learning_rate": 9.069098671673387e-06, + "loss": 1.3273, + "step": 5649 + }, + { + "epoch": 1.087662728300888, + "grad_norm": 3.3341693502548595, + "learning_rate": 9.065994359219357e-06, + "loss": 1.3922, + "step": 5650 + }, + { + "epoch": 1.0878552349784634, + "grad_norm": 3.2341100594876075, + "learning_rate": 9.062890137562623e-06, + "loss": 1.4104, + "step": 5651 + }, + { + "epoch": 1.0880477416560388, + "grad_norm": 3.148063103142531, + "learning_rate": 9.059786007004953e-06, + "loss": 1.249, + "step": 5652 + }, + { + "epoch": 1.088240248333614, + "grad_norm": 3.2259478266781065, + "learning_rate": 9.056681967848117e-06, + "loss": 1.3031, + "step": 5653 + }, + { + "epoch": 1.0884327550111894, + "grad_norm": 3.251268724052633, + "learning_rate": 9.053578020393862e-06, + "loss": 1.3143, + "step": 5654 + }, + { + "epoch": 1.0886252616887648, + "grad_norm": 3.3933464605283468, + "learning_rate": 9.050474164943928e-06, + "loss": 1.347, + "step": 5655 + }, + { + "epoch": 1.0888177683663403, + "grad_norm": 3.2158435104353646, + "learning_rate": 9.047370401800055e-06, + "loss": 1.2745, + "step": 5656 + }, + { + "epoch": 1.0888177683663403, + "lm_loss": 0.8327, + "step": 5656, + "vm_loss": 0.1442 + }, + { + "epoch": 1.0888177683663403, + "lm_loss": 1.2655, + "step": 5656, + "vm_loss": 0.2085 + }, + { + "epoch": 1.0888177683663403, + "lm_loss": 1.2796, + "step": 5656, + "vm_loss": 0.1505 + }, + { + "epoch": 1.0888177683663403, + "lm_loss": 1.2851, + "step": 5656, + "vm_loss": 0.1352 + }, + { + "epoch": 1.0888177683663403, + "lm_loss": 1.1139, + "step": 5656, + "vm_loss": 0.199 + }, + { + "epoch": 1.0888177683663403, + "lm_loss": 1.0567, + "step": 5656, + "vm_loss": 0.1583 + }, + { + "epoch": 1.0888177683663403, + "lm_loss": 1.0972, + "step": 5656, + "vm_loss": 0.1639 + }, + { + "epoch": 1.0888177683663403, + "lm_loss": 1.3835, + "step": 5656, + "vm_loss": 0.1633 + }, + { + "epoch": 1.0890102750439157, + "grad_norm": 3.365445421169781, + "learning_rate": 9.044266731263966e-06, + "loss": 1.358, + "step": 5657 + }, + { + "epoch": 1.0892027817214909, + "grad_norm": 3.1799178912596457, + "learning_rate": 9.041163153637382e-06, + "loss": 1.2963, + "step": 5658 + }, + { + "epoch": 1.0893952883990663, + "grad_norm": 3.2056908790131966, + "learning_rate": 9.038059669222e-06, + "loss": 1.2634, + "step": 5659 + }, + { + "epoch": 1.0895877950766417, + "grad_norm": 3.144224581238796, + "learning_rate": 9.03495627831953e-06, + "loss": 1.3538, + "step": 5660 + }, + { + "epoch": 1.0897803017542171, + "grad_norm": 3.268945702475398, + "learning_rate": 9.031852981231656e-06, + "loss": 1.3097, + "step": 5661 + }, + { + "epoch": 1.0899728084317926, + "grad_norm": 3.283374190320138, + "learning_rate": 9.028749778260055e-06, + "loss": 1.4262, + "step": 5662 + }, + { + "epoch": 1.0901653151093678, + "grad_norm": 3.2891409008783263, + "learning_rate": 9.025646669706405e-06, + "loss": 1.3216, + "step": 5663 + }, + { + "epoch": 1.0903578217869432, + "grad_norm": 3.3716704336312477, + "learning_rate": 9.022543655872363e-06, + "loss": 1.3586, + "step": 5664 + }, + { + "epoch": 1.0903578217869432, + "lm_loss": 1.0418, + "step": 5664, + "vm_loss": 0.2043 + }, + { + "epoch": 1.0903578217869432, + "lm_loss": 0.9155, + "step": 5664, + "vm_loss": 0.1563 + }, + { + "epoch": 1.0903578217869432, + "lm_loss": 1.4843, + "step": 5664, + "vm_loss": 0.1712 + }, + { + "epoch": 1.0903578217869432, + "lm_loss": 1.0664, + "step": 5664, + "vm_loss": 0.1552 + }, + { + "epoch": 1.0903578217869432, + "lm_loss": 0.6404, + "step": 5664, + "vm_loss": 0.1479 + }, + { + "epoch": 1.0903578217869432, + "lm_loss": 0.9507, + "step": 5664, + "vm_loss": 0.2082 + }, + { + "epoch": 1.0903578217869432, + "lm_loss": 1.1979, + "step": 5664, + "vm_loss": 0.1582 + }, + { + "epoch": 1.0903578217869432, + "lm_loss": 0.7537, + "step": 5664, + "vm_loss": 0.1419 + }, + { + "epoch": 1.0905503284645186, + "grad_norm": 3.2136032971325723, + "learning_rate": 9.019440737059585e-06, + "loss": 1.2997, + "step": 5665 + }, + { + "epoch": 1.090742835142094, + "grad_norm": 3.1334220885118995, + "learning_rate": 9.016337913569708e-06, + "loss": 1.2393, + "step": 5666 + }, + { + "epoch": 1.0909353418196694, + "grad_norm": 3.1517433062241, + "learning_rate": 9.013235185704374e-06, + "loss": 1.295, + "step": 5667 + }, + { + "epoch": 1.0911278484972446, + "grad_norm": 3.3172375820447426, + "learning_rate": 9.010132553765204e-06, + "loss": 1.2911, + "step": 5668 + }, + { + "epoch": 1.09132035517482, + "grad_norm": 3.272666488681282, + "learning_rate": 9.00703001805381e-06, + "loss": 1.313, + "step": 5669 + }, + { + "epoch": 1.0915128618523955, + "grad_norm": 3.16269257199317, + "learning_rate": 9.003927578871807e-06, + "loss": 1.3056, + "step": 5670 + }, + { + "epoch": 1.091705368529971, + "grad_norm": 3.1779747990065084, + "learning_rate": 9.000825236520786e-06, + "loss": 1.2796, + "step": 5671 + }, + { + "epoch": 1.0918978752075463, + "grad_norm": 3.0754888218483245, + "learning_rate": 8.997722991302335e-06, + "loss": 1.3469, + "step": 5672 + }, + { + "epoch": 1.0918978752075463, + "lm_loss": 1.1048, + "step": 5672, + "vm_loss": 0.1478 + }, + { + "epoch": 1.0918978752075463, + "lm_loss": 1.3984, + "step": 5672, + "vm_loss": 0.1579 + }, + { + "epoch": 1.0918978752075463, + "lm_loss": 0.5434, + "step": 5672, + "vm_loss": 0.104 + }, + { + "epoch": 1.0918978752075463, + "lm_loss": 1.3016, + "step": 5672, + "vm_loss": 0.1905 + }, + { + "epoch": 1.0918978752075463, + "lm_loss": 0.7313, + "step": 5672, + "vm_loss": 0.1852 + }, + { + "epoch": 1.0918978752075463, + "lm_loss": 1.0544, + "step": 5672, + "vm_loss": 0.2133 + }, + { + "epoch": 1.0918978752075463, + "lm_loss": 1.3284, + "step": 5672, + "vm_loss": 0.1439 + }, + { + "epoch": 1.0918978752075463, + "lm_loss": 0.7356, + "step": 5672, + "vm_loss": 0.1676 + }, + { + "epoch": 1.0920903818851215, + "grad_norm": 3.1328898034537827, + "learning_rate": 8.99462084351803e-06, + "loss": 1.3003, + "step": 5673 + }, + { + "epoch": 1.092282888562697, + "grad_norm": 3.146352768171252, + "learning_rate": 8.991518793469444e-06, + "loss": 1.2844, + "step": 5674 + }, + { + "epoch": 1.0924753952402724, + "grad_norm": 3.2329772737455884, + "learning_rate": 8.988416841458131e-06, + "loss": 1.3251, + "step": 5675 + }, + { + "epoch": 1.0926679019178478, + "grad_norm": 3.2238944957365754, + "learning_rate": 8.985314987785644e-06, + "loss": 1.3392, + "step": 5676 + }, + { + "epoch": 1.0928604085954232, + "grad_norm": 3.23393808417215, + "learning_rate": 8.982213232753523e-06, + "loss": 1.3148, + "step": 5677 + }, + { + "epoch": 1.0930529152729984, + "grad_norm": 3.3104508630892138, + "learning_rate": 8.979111576663295e-06, + "loss": 1.3799, + "step": 5678 + }, + { + "epoch": 1.0932454219505738, + "grad_norm": 3.475890470701071, + "learning_rate": 8.976010019816484e-06, + "loss": 1.3488, + "step": 5679 + }, + { + "epoch": 1.0934379286281493, + "grad_norm": 3.192017942227653, + "learning_rate": 8.9729085625146e-06, + "loss": 1.3173, + "step": 5680 + }, + { + "epoch": 1.0934379286281493, + "lm_loss": 1.7857, + "step": 5680, + "vm_loss": 0.1317 + }, + { + "epoch": 1.0934379286281493, + "lm_loss": 1.9222, + "step": 5680, + "vm_loss": 0.1979 + }, + { + "epoch": 1.0934379286281493, + "lm_loss": 1.6439, + "step": 5680, + "vm_loss": 0.1592 + }, + { + "epoch": 1.0934379286281493, + "lm_loss": 0.9772, + "step": 5680, + "vm_loss": 0.1753 + }, + { + "epoch": 1.0934379286281493, + "lm_loss": 1.1941, + "step": 5680, + "vm_loss": 0.1617 + }, + { + "epoch": 1.0934379286281493, + "lm_loss": 1.1633, + "step": 5680, + "vm_loss": 0.1766 + }, + { + "epoch": 1.0934379286281493, + "lm_loss": 1.1305, + "step": 5680, + "vm_loss": 0.1868 + }, + { + "epoch": 1.0934379286281493, + "lm_loss": 0.9578, + "step": 5680, + "vm_loss": 0.2459 + }, + { + "epoch": 1.0936304353057247, + "grad_norm": 3.3268726040842975, + "learning_rate": 8.969807205059144e-06, + "loss": 1.3326, + "step": 5681 + }, + { + "epoch": 1.0938229419833, + "grad_norm": 3.3294333761238786, + "learning_rate": 8.966705947751608e-06, + "loss": 1.3391, + "step": 5682 + }, + { + "epoch": 1.0940154486608755, + "grad_norm": 3.1628002566022633, + "learning_rate": 8.96360479089347e-06, + "loss": 1.3703, + "step": 5683 + }, + { + "epoch": 1.0942079553384507, + "grad_norm": 3.2991841413351732, + "learning_rate": 8.96050373478621e-06, + "loss": 1.3413, + "step": 5684 + }, + { + "epoch": 1.0944004620160261, + "grad_norm": 3.1891290278593973, + "learning_rate": 8.957402779731288e-06, + "loss": 1.2926, + "step": 5685 + }, + { + "epoch": 1.0945929686936016, + "grad_norm": 3.1823080045827115, + "learning_rate": 8.954301926030146e-06, + "loss": 1.2862, + "step": 5686 + }, + { + "epoch": 1.094785475371177, + "grad_norm": 3.20346621057419, + "learning_rate": 8.951201173984243e-06, + "loss": 1.325, + "step": 5687 + }, + { + "epoch": 1.0949779820487524, + "grad_norm": 3.2578293288530737, + "learning_rate": 8.948100523895002e-06, + "loss": 1.3479, + "step": 5688 + }, + { + "epoch": 1.0949779820487524, + "lm_loss": 1.4778, + "step": 5688, + "vm_loss": 0.1587 + }, + { + "epoch": 1.0949779820487524, + "lm_loss": 1.2696, + "step": 5688, + "vm_loss": 0.1694 + }, + { + "epoch": 1.0949779820487524, + "lm_loss": 1.0023, + "step": 5688, + "vm_loss": 0.144 + }, + { + "epoch": 1.0949779820487524, + "lm_loss": 1.3103, + "step": 5688, + "vm_loss": 0.12 + }, + { + "epoch": 1.0949779820487524, + "lm_loss": 1.2342, + "step": 5688, + "vm_loss": 0.1936 + }, + { + "epoch": 1.0949779820487524, + "lm_loss": 1.3477, + "step": 5688, + "vm_loss": 0.191 + }, + { + "epoch": 1.0949779820487524, + "lm_loss": 1.4247, + "step": 5688, + "vm_loss": 0.1877 + }, + { + "epoch": 1.0949779820487524, + "lm_loss": 1.0773, + "step": 5688, + "vm_loss": 0.1719 + }, + { + "epoch": 1.0951704887263276, + "grad_norm": 3.2956872971471607, + "learning_rate": 8.944999976063848e-06, + "loss": 1.3785, + "step": 5689 + }, + { + "epoch": 1.095362995403903, + "grad_norm": 3.1829411772902527, + "learning_rate": 8.94189953079219e-06, + "loss": 1.2844, + "step": 5690 + }, + { + "epoch": 1.0955555020814784, + "grad_norm": 3.2735102528095523, + "learning_rate": 8.938799188381439e-06, + "loss": 1.3904, + "step": 5691 + }, + { + "epoch": 1.0957480087590539, + "grad_norm": 3.1418807431825573, + "learning_rate": 8.935698949132983e-06, + "loss": 1.3297, + "step": 5692 + }, + { + "epoch": 1.0959405154366293, + "grad_norm": 3.233228086065883, + "learning_rate": 8.932598813348203e-06, + "loss": 1.3426, + "step": 5693 + }, + { + "epoch": 1.0961330221142045, + "grad_norm": 3.074023225740406, + "learning_rate": 8.929498781328477e-06, + "loss": 1.2919, + "step": 5694 + }, + { + "epoch": 1.09632552879178, + "grad_norm": 3.307272176178498, + "learning_rate": 8.926398853375165e-06, + "loss": 1.3589, + "step": 5695 + }, + { + "epoch": 1.0965180354693553, + "grad_norm": 3.2056346120917243, + "learning_rate": 8.923299029789617e-06, + "loss": 1.2958, + "step": 5696 + }, + { + "epoch": 1.0965180354693553, + "lm_loss": 1.0955, + "step": 5696, + "vm_loss": 0.1857 + }, + { + "epoch": 1.0965180354693553, + "lm_loss": 1.1629, + "step": 5696, + "vm_loss": 0.1113 + }, + { + "epoch": 1.0965180354693553, + "lm_loss": 1.9042, + "step": 5696, + "vm_loss": 0.1614 + }, + { + "epoch": 1.0965180354693553, + "lm_loss": 1.1895, + "step": 5696, + "vm_loss": 0.2194 + }, + { + "epoch": 1.0965180354693553, + "lm_loss": 0.9745, + "step": 5696, + "vm_loss": 0.1556 + }, + { + "epoch": 1.0965180354693553, + "lm_loss": 1.536, + "step": 5696, + "vm_loss": 0.1577 + }, + { + "epoch": 1.0965180354693553, + "lm_loss": 0.7094, + "step": 5696, + "vm_loss": 0.1603 + }, + { + "epoch": 1.0965180354693553, + "lm_loss": 1.4681, + "step": 5696, + "vm_loss": 0.1785 + }, + { + "epoch": 1.0967105421469308, + "grad_norm": 3.2061990960286058, + "learning_rate": 8.92019931087318e-06, + "loss": 1.3578, + "step": 5697 + }, + { + "epoch": 1.0969030488245062, + "grad_norm": 3.2176623245472333, + "learning_rate": 8.917099696927188e-06, + "loss": 1.3112, + "step": 5698 + }, + { + "epoch": 1.0970955555020814, + "grad_norm": 3.389623603965464, + "learning_rate": 8.914000188252955e-06, + "loss": 1.3546, + "step": 5699 + }, + { + "epoch": 1.0972880621796568, + "grad_norm": 3.347557446122881, + "learning_rate": 8.9109007851518e-06, + "loss": 1.2894, + "step": 5700 + }, + { + "epoch": 1.0974805688572322, + "grad_norm": 3.434783087036281, + "learning_rate": 8.90780148792502e-06, + "loss": 1.3188, + "step": 5701 + }, + { + "epoch": 1.0976730755348076, + "grad_norm": 3.313694495487245, + "learning_rate": 8.904702296873913e-06, + "loss": 1.3537, + "step": 5702 + }, + { + "epoch": 1.097865582212383, + "grad_norm": 3.3140814474179985, + "learning_rate": 8.901603212299751e-06, + "loss": 1.4102, + "step": 5703 + }, + { + "epoch": 1.0980580888899585, + "grad_norm": 3.209801159029106, + "learning_rate": 8.898504234503811e-06, + "loss": 1.2962, + "step": 5704 + }, + { + "epoch": 1.0980580888899585, + "lm_loss": 1.3149, + "step": 5704, + "vm_loss": 0.173 + }, + { + "epoch": 1.0980580888899585, + "lm_loss": 1.516, + "step": 5704, + "vm_loss": 0.1604 + }, + { + "epoch": 1.0980580888899585, + "lm_loss": 0.5353, + "step": 5704, + "vm_loss": 0.1606 + }, + { + "epoch": 1.0980580888899585, + "lm_loss": 1.3332, + "step": 5704, + "vm_loss": 0.1773 + }, + { + "epoch": 1.0980580888899585, + "lm_loss": 0.8764, + "step": 5704, + "vm_loss": 0.1747 + }, + { + "epoch": 1.0980580888899585, + "lm_loss": 0.833, + "step": 5704, + "vm_loss": 0.1456 + }, + { + "epoch": 1.0980580888899585, + "lm_loss": 1.2967, + "step": 5704, + "vm_loss": 0.1899 + }, + { + "epoch": 1.0980580888899585, + "lm_loss": 0.9802, + "step": 5704, + "vm_loss": 0.1277 + }, + { + "epoch": 1.0982505955675337, + "grad_norm": 3.194259831195042, + "learning_rate": 8.895405363787356e-06, + "loss": 1.3314, + "step": 5705 + }, + { + "epoch": 1.098443102245109, + "grad_norm": 3.198044851177277, + "learning_rate": 8.892306600451628e-06, + "loss": 1.2774, + "step": 5706 + }, + { + "epoch": 1.0986356089226845, + "grad_norm": 3.2487984183833034, + "learning_rate": 8.889207944797866e-06, + "loss": 1.3128, + "step": 5707 + }, + { + "epoch": 1.09882811560026, + "grad_norm": 3.2111271010134255, + "learning_rate": 8.886109397127309e-06, + "loss": 1.2663, + "step": 5708 + }, + { + "epoch": 1.0990206222778354, + "grad_norm": 3.299987832538869, + "learning_rate": 8.883010957741168e-06, + "loss": 1.2782, + "step": 5709 + }, + { + "epoch": 1.0992131289554106, + "grad_norm": 3.346647907029083, + "learning_rate": 8.879912626940647e-06, + "loss": 1.2763, + "step": 5710 + }, + { + "epoch": 1.099405635632986, + "grad_norm": 3.329141543332642, + "learning_rate": 8.876814405026955e-06, + "loss": 1.2916, + "step": 5711 + }, + { + "epoch": 1.0995981423105614, + "grad_norm": 3.335237301203536, + "learning_rate": 8.873716292301273e-06, + "loss": 1.3337, + "step": 5712 + }, + { + "epoch": 1.0995981423105614, + "lm_loss": 0.9059, + "step": 5712, + "vm_loss": 0.1363 + }, + { + "epoch": 1.0995981423105614, + "lm_loss": 1.3644, + "step": 5712, + "vm_loss": 0.1309 + }, + { + "epoch": 1.0995981423105614, + "lm_loss": 1.0724, + "step": 5712, + "vm_loss": 0.1757 + }, + { + "epoch": 1.0995981423105614, + "lm_loss": 1.2328, + "step": 5712, + "vm_loss": 0.1882 + }, + { + "epoch": 1.0995981423105614, + "lm_loss": 0.9025, + "step": 5712, + "vm_loss": 0.178 + }, + { + "epoch": 1.0995981423105614, + "lm_loss": 0.7712, + "step": 5712, + "vm_loss": 0.223 + }, + { + "epoch": 1.0995981423105614, + "lm_loss": 0.9679, + "step": 5712, + "vm_loss": 0.1772 + }, + { + "epoch": 1.0995981423105614, + "lm_loss": 0.9716, + "step": 5712, + "vm_loss": 0.1897 + }, + { + "epoch": 1.0997906489881368, + "grad_norm": 3.3860530989647764, + "learning_rate": 8.870618289064776e-06, + "loss": 1.3245, + "step": 5713 + }, + { + "epoch": 1.0999831556657123, + "grad_norm": 3.116866692562132, + "learning_rate": 8.867520395618626e-06, + "loss": 1.2991, + "step": 5714 + }, + { + "epoch": 1.1001756623432875, + "grad_norm": 3.172194706063709, + "learning_rate": 8.864422612263986e-06, + "loss": 1.2737, + "step": 5715 + }, + { + "epoch": 1.1003681690208629, + "grad_norm": 3.275481186771253, + "learning_rate": 8.861324939301998e-06, + "loss": 1.2483, + "step": 5716 + }, + { + "epoch": 1.1005606756984383, + "grad_norm": 3.190524089796947, + "learning_rate": 8.858227377033787e-06, + "loss": 1.2717, + "step": 5717 + }, + { + "epoch": 1.1007531823760137, + "grad_norm": 3.2672227934924747, + "learning_rate": 8.855129925760488e-06, + "loss": 1.27, + "step": 5718 + }, + { + "epoch": 1.1009456890535891, + "grad_norm": 3.41309060278696, + "learning_rate": 8.852032585783206e-06, + "loss": 1.2894, + "step": 5719 + }, + { + "epoch": 1.1011381957311643, + "grad_norm": 3.5509748306118847, + "learning_rate": 8.848935357403041e-06, + "loss": 1.3076, + "step": 5720 + }, + { + "epoch": 1.1011381957311643, + "lm_loss": 1.6671, + "step": 5720, + "vm_loss": 0.1402 + }, + { + "epoch": 1.1011381957311643, + "lm_loss": 1.0089, + "step": 5720, + "vm_loss": 0.173 + }, + { + "epoch": 1.1011381957311643, + "lm_loss": 0.7442, + "step": 5720, + "vm_loss": 0.1025 + }, + { + "epoch": 1.1011381957311643, + "lm_loss": 1.1051, + "step": 5720, + "vm_loss": 0.2065 + }, + { + "epoch": 1.1011381957311643, + "lm_loss": 1.1958, + "step": 5720, + "vm_loss": 0.1261 + }, + { + "epoch": 1.1011381957311643, + "lm_loss": 1.062, + "step": 5720, + "vm_loss": 0.1406 + }, + { + "epoch": 1.1011381957311643, + "lm_loss": 1.1236, + "step": 5720, + "vm_loss": 0.182 + }, + { + "epoch": 1.1011381957311643, + "lm_loss": 1.1452, + "step": 5720, + "vm_loss": 0.2024 + }, + { + "epoch": 1.1013307024087398, + "grad_norm": 3.2172550917825746, + "learning_rate": 8.845838240921091e-06, + "loss": 1.2263, + "step": 5721 + }, + { + "epoch": 1.1015232090863152, + "grad_norm": 3.2650362536324367, + "learning_rate": 8.842741236638427e-06, + "loss": 1.3122, + "step": 5722 + }, + { + "epoch": 1.1017157157638906, + "grad_norm": 3.3086706903377947, + "learning_rate": 8.839644344856122e-06, + "loss": 1.3099, + "step": 5723 + }, + { + "epoch": 1.101908222441466, + "grad_norm": 3.0866868531425062, + "learning_rate": 8.836547565875227e-06, + "loss": 1.2873, + "step": 5724 + }, + { + "epoch": 1.1021007291190412, + "grad_norm": 3.2058832697428983, + "learning_rate": 8.833450899996799e-06, + "loss": 1.259, + "step": 5725 + }, + { + "epoch": 1.1022932357966166, + "grad_norm": 3.0449475786265503, + "learning_rate": 8.830354347521867e-06, + "loss": 1.3154, + "step": 5726 + }, + { + "epoch": 1.102485742474192, + "grad_norm": 3.186909057301959, + "learning_rate": 8.827257908751454e-06, + "loss": 1.2959, + "step": 5727 + }, + { + "epoch": 1.1026782491517675, + "grad_norm": 3.179916271433507, + "learning_rate": 8.824161583986578e-06, + "loss": 1.263, + "step": 5728 + }, + { + "epoch": 1.1026782491517675, + "lm_loss": 1.407, + "step": 5728, + "vm_loss": 0.2057 + }, + { + "epoch": 1.1026782491517675, + "lm_loss": 0.9904, + "step": 5728, + "vm_loss": 0.224 + }, + { + "epoch": 1.1026782491517675, + "lm_loss": 0.8983, + "step": 5728, + "vm_loss": 0.1021 + }, + { + "epoch": 1.1026782491517675, + "lm_loss": 0.8898, + "step": 5728, + "vm_loss": 0.1955 + }, + { + "epoch": 1.1026782491517675, + "lm_loss": 1.5359, + "step": 5728, + "vm_loss": 0.1971 + }, + { + "epoch": 1.1026782491517675, + "lm_loss": 1.2492, + "step": 5728, + "vm_loss": 0.168 + }, + { + "epoch": 1.1026782491517675, + "lm_loss": 1.6421, + "step": 5728, + "vm_loss": 0.1557 + }, + { + "epoch": 1.1026782491517675, + "lm_loss": 0.7561, + "step": 5728, + "vm_loss": 0.1674 + }, + { + "epoch": 1.102870755829343, + "grad_norm": 3.3544099807973398, + "learning_rate": 8.821065373528242e-06, + "loss": 1.3186, + "step": 5729 + }, + { + "epoch": 1.103063262506918, + "grad_norm": 3.2155243484799474, + "learning_rate": 8.817969277677434e-06, + "loss": 1.3368, + "step": 5730 + }, + { + "epoch": 1.1032557691844935, + "grad_norm": 3.2399010599616163, + "learning_rate": 8.814873296735129e-06, + "loss": 1.2513, + "step": 5731 + }, + { + "epoch": 1.103448275862069, + "grad_norm": 3.411926884325892, + "learning_rate": 8.811777431002307e-06, + "loss": 1.3891, + "step": 5732 + }, + { + "epoch": 1.1036407825396444, + "grad_norm": 3.239426826739809, + "learning_rate": 8.80868168077992e-06, + "loss": 1.2687, + "step": 5733 + }, + { + "epoch": 1.1038332892172198, + "grad_norm": 3.2149869406082408, + "learning_rate": 8.80558604636891e-06, + "loss": 1.2915, + "step": 5734 + }, + { + "epoch": 1.104025795894795, + "grad_norm": 3.20867661109182, + "learning_rate": 8.802490528070223e-06, + "loss": 1.2929, + "step": 5735 + }, + { + "epoch": 1.1042183025723704, + "grad_norm": 2.9938701382306245, + "learning_rate": 8.799395126184777e-06, + "loss": 1.266, + "step": 5736 + }, + { + "epoch": 1.1042183025723704, + "lm_loss": 1.2729, + "step": 5736, + "vm_loss": 0.1325 + }, + { + "epoch": 1.1042183025723704, + "lm_loss": 1.1825, + "step": 5736, + "vm_loss": 0.1852 + }, + { + "epoch": 1.1042183025723704, + "lm_loss": 0.9335, + "step": 5736, + "vm_loss": 0.1749 + }, + { + "epoch": 1.1042183025723704, + "lm_loss": 1.3149, + "step": 5736, + "vm_loss": 0.1518 + }, + { + "epoch": 1.1042183025723704, + "lm_loss": 1.2084, + "step": 5736, + "vm_loss": 0.2314 + }, + { + "epoch": 1.1042183025723704, + "lm_loss": 1.0026, + "step": 5736, + "vm_loss": 0.1172 + }, + { + "epoch": 1.1042183025723704, + "lm_loss": 1.2398, + "step": 5736, + "vm_loss": 0.1695 + }, + { + "epoch": 1.1042183025723704, + "lm_loss": 1.284, + "step": 5736, + "vm_loss": 0.1513 + }, + { + "epoch": 1.1044108092499458, + "grad_norm": 3.3294033524570925, + "learning_rate": 8.79629984101348e-06, + "loss": 1.3744, + "step": 5737 + }, + { + "epoch": 1.1046033159275213, + "grad_norm": 3.230884024334751, + "learning_rate": 8.793204672857244e-06, + "loss": 1.3228, + "step": 5738 + }, + { + "epoch": 1.1047958226050967, + "grad_norm": 3.1790858903114034, + "learning_rate": 8.790109622016953e-06, + "loss": 1.2911, + "step": 5739 + }, + { + "epoch": 1.1049883292826719, + "grad_norm": 3.1446978365129534, + "learning_rate": 8.787014688793483e-06, + "loss": 1.2558, + "step": 5740 + }, + { + "epoch": 1.1051808359602473, + "grad_norm": 3.3188546078909105, + "learning_rate": 8.783919873487703e-06, + "loss": 1.3439, + "step": 5741 + }, + { + "epoch": 1.1053733426378227, + "grad_norm": 3.178844859898877, + "learning_rate": 8.78082517640047e-06, + "loss": 1.2719, + "step": 5742 + }, + { + "epoch": 1.1055658493153981, + "grad_norm": 3.3052463644202037, + "learning_rate": 8.777730597832631e-06, + "loss": 1.293, + "step": 5743 + }, + { + "epoch": 1.1057583559929736, + "grad_norm": 3.223868764354965, + "learning_rate": 8.774636138085013e-06, + "loss": 1.2274, + "step": 5744 + }, + { + "epoch": 1.1057583559929736, + "lm_loss": 1.1531, + "step": 5744, + "vm_loss": 0.1115 + }, + { + "epoch": 1.1057583559929736, + "lm_loss": 1.5379, + "step": 5744, + "vm_loss": 0.2015 + }, + { + "epoch": 1.1057583559929736, + "lm_loss": 1.4541, + "step": 5744, + "vm_loss": 0.2231 + }, + { + "epoch": 1.1057583559929736, + "lm_loss": 1.1609, + "step": 5744, + "vm_loss": 0.1716 + }, + { + "epoch": 1.1057583559929736, + "lm_loss": 1.0592, + "step": 5744, + "vm_loss": 0.2418 + }, + { + "epoch": 1.1057583559929736, + "lm_loss": 0.6771, + "step": 5744, + "vm_loss": 0.1638 + }, + { + "epoch": 1.1057583559929736, + "lm_loss": 1.2431, + "step": 5744, + "vm_loss": 0.1685 + }, + { + "epoch": 1.1057583559929736, + "lm_loss": 1.4037, + "step": 5744, + "vm_loss": 0.2159 + }, + { + "epoch": 1.105950862670549, + "grad_norm": 3.3372441862643623, + "learning_rate": 8.771541797458438e-06, + "loss": 1.3425, + "step": 5745 + }, + { + "epoch": 1.1061433693481242, + "grad_norm": 3.282351538931925, + "learning_rate": 8.76844757625372e-06, + "loss": 1.3096, + "step": 5746 + }, + { + "epoch": 1.1063358760256996, + "grad_norm": 3.255183448764697, + "learning_rate": 8.765353474771652e-06, + "loss": 1.303, + "step": 5747 + }, + { + "epoch": 1.106528382703275, + "grad_norm": 3.2298008842137027, + "learning_rate": 8.76225949331302e-06, + "loss": 1.3707, + "step": 5748 + }, + { + "epoch": 1.1067208893808504, + "grad_norm": 3.2003155492387556, + "learning_rate": 8.759165632178605e-06, + "loss": 1.3075, + "step": 5749 + }, + { + "epoch": 1.1069133960584259, + "grad_norm": 3.0844910498814087, + "learning_rate": 8.756071891669164e-06, + "loss": 1.2853, + "step": 5750 + }, + { + "epoch": 1.107105902736001, + "grad_norm": 3.200859508529741, + "learning_rate": 8.752978272085445e-06, + "loss": 1.4472, + "step": 5751 + }, + { + "epoch": 1.1072984094135765, + "grad_norm": 3.1647514808431545, + "learning_rate": 8.749884773728197e-06, + "loss": 1.2861, + "step": 5752 + }, + { + "epoch": 1.1072984094135765, + "lm_loss": 1.1724, + "step": 5752, + "vm_loss": 0.1493 + }, + { + "epoch": 1.1072984094135765, + "lm_loss": 1.0232, + "step": 5752, + "vm_loss": 0.2277 + }, + { + "epoch": 1.1072984094135765, + "lm_loss": 1.339, + "step": 5752, + "vm_loss": 0.1616 + }, + { + "epoch": 1.1072984094135765, + "lm_loss": 1.7843, + "step": 5752, + "vm_loss": 0.2309 + }, + { + "epoch": 1.1072984094135765, + "lm_loss": 0.9503, + "step": 5752, + "vm_loss": 0.1245 + }, + { + "epoch": 1.1072984094135765, + "lm_loss": 0.7747, + "step": 5752, + "vm_loss": 0.1292 + }, + { + "epoch": 1.1072984094135765, + "lm_loss": 1.042, + "step": 5752, + "vm_loss": 0.1519 + }, + { + "epoch": 1.1072984094135765, + "lm_loss": 1.321, + "step": 5752, + "vm_loss": 0.1591 + }, + { + "epoch": 1.107490916091152, + "grad_norm": 3.2335417505779027, + "learning_rate": 8.746791396898142e-06, + "loss": 1.3539, + "step": 5753 + }, + { + "epoch": 1.1076834227687273, + "grad_norm": 3.129828256571666, + "learning_rate": 8.743698141895995e-06, + "loss": 1.2401, + "step": 5754 + }, + { + "epoch": 1.1078759294463028, + "grad_norm": 3.38379470508949, + "learning_rate": 8.740605009022462e-06, + "loss": 1.3669, + "step": 5755 + }, + { + "epoch": 1.108068436123878, + "grad_norm": 3.4064027881281316, + "learning_rate": 8.737511998578237e-06, + "loss": 1.2685, + "step": 5756 + }, + { + "epoch": 1.1082609428014534, + "grad_norm": 3.3082602514750192, + "learning_rate": 8.734419110863996e-06, + "loss": 1.3381, + "step": 5757 + }, + { + "epoch": 1.1084534494790288, + "grad_norm": 3.104944738599945, + "learning_rate": 8.731326346180406e-06, + "loss": 1.2362, + "step": 5758 + }, + { + "epoch": 1.1086459561566042, + "grad_norm": 3.2475149597922517, + "learning_rate": 8.728233704828129e-06, + "loss": 1.2941, + "step": 5759 + }, + { + "epoch": 1.1088384628341796, + "grad_norm": 3.1182486341455573, + "learning_rate": 8.725141187107809e-06, + "loss": 1.2464, + "step": 5760 + }, + { + "epoch": 1.1088384628341796, + "lm_loss": 0.7673, + "step": 5760, + "vm_loss": 0.2399 + }, + { + "epoch": 1.1088384628341796, + "lm_loss": 1.2846, + "step": 5760, + "vm_loss": 0.1154 + }, + { + "epoch": 1.1088384628341796, + "lm_loss": 0.7412, + "step": 5760, + "vm_loss": 0.2398 + }, + { + "epoch": 1.1088384628341796, + "lm_loss": 1.2939, + "step": 5760, + "vm_loss": 0.1882 + }, + { + "epoch": 1.1088384628341796, + "lm_loss": 0.8116, + "step": 5760, + "vm_loss": 0.1919 + }, + { + "epoch": 1.1088384628341796, + "lm_loss": 1.1136, + "step": 5760, + "vm_loss": 0.1837 + }, + { + "epoch": 1.1088384628341796, + "lm_loss": 1.2673, + "step": 5760, + "vm_loss": 0.1472 + }, + { + "epoch": 1.1088384628341796, + "lm_loss": 0.8531, + "step": 5760, + "vm_loss": 0.2475 + }, + { + "epoch": 1.1090309695117548, + "grad_norm": 3.1310694188883486, + "learning_rate": 8.722048793320069e-06, + "loss": 1.257, + "step": 5761 + }, + { + "epoch": 1.1092234761893303, + "grad_norm": 3.1253915911173493, + "learning_rate": 8.718956523765542e-06, + "loss": 1.2536, + "step": 5762 + }, + { + "epoch": 1.1094159828669057, + "grad_norm": 3.1741424388362764, + "learning_rate": 8.715864378744827e-06, + "loss": 1.2745, + "step": 5763 + }, + { + "epoch": 1.109608489544481, + "grad_norm": 3.2240120514473882, + "learning_rate": 8.712772358558526e-06, + "loss": 1.3256, + "step": 5764 + }, + { + "epoch": 1.1098009962220565, + "grad_norm": 3.24387418275331, + "learning_rate": 8.709680463507217e-06, + "loss": 1.2493, + "step": 5765 + }, + { + "epoch": 1.109993502899632, + "grad_norm": 3.319137751666423, + "learning_rate": 8.706588693891475e-06, + "loss": 1.298, + "step": 5766 + }, + { + "epoch": 1.1101860095772071, + "grad_norm": 3.249226861485674, + "learning_rate": 8.703497050011865e-06, + "loss": 1.2556, + "step": 5767 + }, + { + "epoch": 1.1103785162547826, + "grad_norm": 3.162930382574667, + "learning_rate": 8.700405532168921e-06, + "loss": 1.3077, + "step": 5768 + }, + { + "epoch": 1.1103785162547826, + "lm_loss": 0.8303, + "step": 5768, + "vm_loss": 0.1598 + }, + { + "epoch": 1.1103785162547826, + "lm_loss": 1.0244, + "step": 5768, + "vm_loss": 0.2202 + }, + { + "epoch": 1.1103785162547826, + "lm_loss": 1.1835, + "step": 5768, + "vm_loss": 0.173 + }, + { + "epoch": 1.1103785162547826, + "lm_loss": 1.0347, + "step": 5768, + "vm_loss": 0.1745 + }, + { + "epoch": 1.1103785162547826, + "lm_loss": 0.8018, + "step": 5768, + "vm_loss": 0.1876 + }, + { + "epoch": 1.1103785162547826, + "lm_loss": 1.0218, + "step": 5768, + "vm_loss": 0.2613 + }, + { + "epoch": 1.1103785162547826, + "lm_loss": 1.0776, + "step": 5768, + "vm_loss": 0.1617 + }, + { + "epoch": 1.1103785162547826, + "lm_loss": 1.4667, + "step": 5768, + "vm_loss": 0.1413 + }, + { + "epoch": 1.110571022932358, + "grad_norm": 3.07642110724025, + "learning_rate": 8.697314140663191e-06, + "loss": 1.2263, + "step": 5769 + }, + { + "epoch": 1.1107635296099334, + "grad_norm": 3.249267159171443, + "learning_rate": 8.694222875795193e-06, + "loss": 1.2744, + "step": 5770 + }, + { + "epoch": 1.1109560362875088, + "grad_norm": 3.13720427137784, + "learning_rate": 8.691131737865432e-06, + "loss": 1.2248, + "step": 5771 + }, + { + "epoch": 1.111148542965084, + "grad_norm": 3.2859805775860718, + "learning_rate": 8.688040727174419e-06, + "loss": 1.2715, + "step": 5772 + }, + { + "epoch": 1.1113410496426595, + "grad_norm": 3.2549457878110406, + "learning_rate": 8.684949844022629e-06, + "loss": 1.2662, + "step": 5773 + }, + { + "epoch": 1.1115335563202349, + "grad_norm": 3.3197574947459763, + "learning_rate": 8.681859088710538e-06, + "loss": 1.2975, + "step": 5774 + }, + { + "epoch": 1.1117260629978103, + "grad_norm": 3.3564377916550563, + "learning_rate": 8.678768461538605e-06, + "loss": 1.3153, + "step": 5775 + }, + { + "epoch": 1.1119185696753857, + "grad_norm": 3.3079923980813457, + "learning_rate": 8.675677962807285e-06, + "loss": 1.3404, + "step": 5776 + }, + { + "epoch": 1.1119185696753857, + "lm_loss": 1.1684, + "step": 5776, + "vm_loss": 0.1621 + }, + { + "epoch": 1.1119185696753857, + "lm_loss": 0.7326, + "step": 5776, + "vm_loss": 0.1369 + }, + { + "epoch": 1.1119185696753857, + "lm_loss": 1.3145, + "step": 5776, + "vm_loss": 0.1608 + }, + { + "epoch": 1.1119185696753857, + "lm_loss": 1.2669, + "step": 5776, + "vm_loss": 0.129 + }, + { + "epoch": 1.1119185696753857, + "lm_loss": 0.7184, + "step": 5776, + "vm_loss": 0.1774 + }, + { + "epoch": 1.1119185696753857, + "lm_loss": 1.6034, + "step": 5776, + "vm_loss": 0.1568 + }, + { + "epoch": 1.1119185696753857, + "lm_loss": 1.6335, + "step": 5776, + "vm_loss": 0.1809 + }, + { + "epoch": 1.1119185696753857, + "lm_loss": 0.5951, + "step": 5776, + "vm_loss": 0.1637 + }, + { + "epoch": 1.112111076352961, + "grad_norm": 3.296730670451773, + "learning_rate": 8.67258759281701e-06, + "loss": 1.2567, + "step": 5777 + }, + { + "epoch": 1.1123035830305363, + "grad_norm": 3.4502152249762306, + "learning_rate": 8.669497351868199e-06, + "loss": 1.367, + "step": 5778 + }, + { + "epoch": 1.1124960897081118, + "grad_norm": 3.236412433177419, + "learning_rate": 8.666407240261271e-06, + "loss": 1.2906, + "step": 5779 + }, + { + "epoch": 1.1126885963856872, + "grad_norm": 3.20910457197812, + "learning_rate": 8.663317258296624e-06, + "loss": 1.233, + "step": 5780 + }, + { + "epoch": 1.1128811030632626, + "grad_norm": 3.3524233100461003, + "learning_rate": 8.660227406274636e-06, + "loss": 1.331, + "step": 5781 + }, + { + "epoch": 1.1130736097408378, + "grad_norm": 3.3928806572816166, + "learning_rate": 8.657137684495685e-06, + "loss": 1.34, + "step": 5782 + }, + { + "epoch": 1.1132661164184132, + "grad_norm": 3.1867955984439162, + "learning_rate": 8.654048093260135e-06, + "loss": 1.2842, + "step": 5783 + }, + { + "epoch": 1.1134586230959886, + "grad_norm": 3.375041312887529, + "learning_rate": 8.650958632868325e-06, + "loss": 1.285, + "step": 5784 + }, + { + "epoch": 1.1134586230959886, + "lm_loss": 1.0885, + "step": 5784, + "vm_loss": 0.1402 + }, + { + "epoch": 1.1134586230959886, + "lm_loss": 0.9434, + "step": 5784, + "vm_loss": 0.1577 + }, + { + "epoch": 1.1134586230959886, + "lm_loss": 1.1283, + "step": 5784, + "vm_loss": 0.1699 + }, + { + "epoch": 1.1134586230959886, + "lm_loss": 1.0804, + "step": 5784, + "vm_loss": 0.1216 + }, + { + "epoch": 1.1134586230959886, + "lm_loss": 1.1903, + "step": 5784, + "vm_loss": 0.1507 + }, + { + "epoch": 1.1134586230959886, + "lm_loss": 1.0224, + "step": 5784, + "vm_loss": 0.1506 + }, + { + "epoch": 1.1134586230959886, + "lm_loss": 0.4357, + "step": 5784, + "vm_loss": 0.2024 + }, + { + "epoch": 1.1134586230959886, + "lm_loss": 1.1196, + "step": 5784, + "vm_loss": 0.2381 + }, + { + "epoch": 1.113651129773564, + "grad_norm": 3.211773769195659, + "learning_rate": 8.647869303620597e-06, + "loss": 1.2346, + "step": 5785 + }, + { + "epoch": 1.1138436364511395, + "grad_norm": 3.416427406500532, + "learning_rate": 8.644780105817272e-06, + "loss": 1.3322, + "step": 5786 + }, + { + "epoch": 1.1140361431287147, + "grad_norm": 3.194090236827222, + "learning_rate": 8.64169103975866e-06, + "loss": 1.2845, + "step": 5787 + }, + { + "epoch": 1.11422864980629, + "grad_norm": 3.2073354884866188, + "learning_rate": 8.638602105745058e-06, + "loss": 1.2709, + "step": 5788 + }, + { + "epoch": 1.1144211564838655, + "grad_norm": 3.248757242893497, + "learning_rate": 8.635513304076742e-06, + "loss": 1.3377, + "step": 5789 + }, + { + "epoch": 1.114613663161441, + "grad_norm": 3.103522565537857, + "learning_rate": 8.632424635053997e-06, + "loss": 1.2263, + "step": 5790 + }, + { + "epoch": 1.1148061698390164, + "grad_norm": 3.0884328320560743, + "learning_rate": 8.629336098977073e-06, + "loss": 1.26, + "step": 5791 + }, + { + "epoch": 1.1149986765165916, + "grad_norm": 3.383489323314709, + "learning_rate": 8.62624769614621e-06, + "loss": 1.3099, + "step": 5792 + }, + { + "epoch": 1.1149986765165916, + "lm_loss": 0.9667, + "step": 5792, + "vm_loss": 0.1408 + }, + { + "epoch": 1.1149986765165916, + "lm_loss": 1.3053, + "step": 5792, + "vm_loss": 0.2438 + }, + { + "epoch": 1.1149986765165916, + "lm_loss": 1.2457, + "step": 5792, + "vm_loss": 0.2179 + }, + { + "epoch": 1.1149986765165916, + "lm_loss": 0.7514, + "step": 5792, + "vm_loss": 0.1878 + }, + { + "epoch": 1.1149986765165916, + "lm_loss": 1.0549, + "step": 5792, + "vm_loss": 0.1739 + }, + { + "epoch": 1.1149986765165916, + "lm_loss": 0.7807, + "step": 5792, + "vm_loss": 0.1064 + }, + { + "epoch": 1.1149986765165916, + "lm_loss": 0.911, + "step": 5792, + "vm_loss": 0.1888 + }, + { + "epoch": 1.1149986765165916, + "lm_loss": 0.9711, + "step": 5792, + "vm_loss": 0.1778 + }, + { + "epoch": 1.115191183194167, + "grad_norm": 3.2417360932903425, + "learning_rate": 8.623159426861653e-06, + "loss": 1.2937, + "step": 5793 + }, + { + "epoch": 1.1153836898717424, + "grad_norm": 3.153840568085341, + "learning_rate": 8.620071291423613e-06, + "loss": 1.2251, + "step": 5794 + }, + { + "epoch": 1.1155761965493178, + "grad_norm": 3.255810899035514, + "learning_rate": 8.616983290132294e-06, + "loss": 1.2595, + "step": 5795 + }, + { + "epoch": 1.1157687032268933, + "grad_norm": 3.1984720836336007, + "learning_rate": 8.613895423287899e-06, + "loss": 1.2035, + "step": 5796 + }, + { + "epoch": 1.1159612099044685, + "grad_norm": 3.3253300663742875, + "learning_rate": 8.6108076911906e-06, + "loss": 1.2823, + "step": 5797 + }, + { + "epoch": 1.1161537165820439, + "grad_norm": 3.2594200375673217, + "learning_rate": 8.607720094140567e-06, + "loss": 1.3097, + "step": 5798 + }, + { + "epoch": 1.1163462232596193, + "grad_norm": 3.337625101396846, + "learning_rate": 8.60463263243795e-06, + "loss": 1.2551, + "step": 5799 + }, + { + "epoch": 1.1165387299371947, + "grad_norm": 3.29332614245858, + "learning_rate": 8.601545306382897e-06, + "loss": 1.3796, + "step": 5800 + }, + { + "epoch": 1.1165387299371947, + "lm_loss": 1.4544, + "step": 5800, + "vm_loss": 0.1364 + }, + { + "epoch": 1.1165387299371947, + "lm_loss": 0.7637, + "step": 5800, + "vm_loss": 0.1965 + }, + { + "epoch": 1.1165387299371947, + "lm_loss": 1.4044, + "step": 5800, + "vm_loss": 0.1783 + }, + { + "epoch": 1.1165387299371947, + "lm_loss": 1.1879, + "step": 5800, + "vm_loss": 0.1631 + }, + { + "epoch": 1.1165387299371947, + "lm_loss": 0.9673, + "step": 5800, + "vm_loss": 0.1737 + }, + { + "epoch": 1.1165387299371947, + "lm_loss": 1.3464, + "step": 5800, + "vm_loss": 0.1548 + }, + { + "epoch": 1.1165387299371947, + "lm_loss": 0.8967, + "step": 5800, + "vm_loss": 0.1241 + }, + { + "epoch": 1.1165387299371947, + "lm_loss": 0.9183, + "step": 5800, + "vm_loss": 0.1432 + }, + { + "epoch": 1.1167312366147701, + "grad_norm": 3.181126926582151, + "learning_rate": 8.59845811627553e-06, + "loss": 1.2678, + "step": 5801 + }, + { + "epoch": 1.1169237432923453, + "grad_norm": 3.2007875687220184, + "learning_rate": 8.595371062415962e-06, + "loss": 1.3012, + "step": 5802 + }, + { + "epoch": 1.1171162499699208, + "grad_norm": 3.3794410356754905, + "learning_rate": 8.592284145104303e-06, + "loss": 1.2866, + "step": 5803 + }, + { + "epoch": 1.1173087566474962, + "grad_norm": 3.270731015546087, + "learning_rate": 8.58919736464063e-06, + "loss": 1.2684, + "step": 5804 + }, + { + "epoch": 1.1175012633250716, + "grad_norm": 3.0871461124290884, + "learning_rate": 8.586110721325028e-06, + "loss": 1.2454, + "step": 5805 + }, + { + "epoch": 1.117693770002647, + "grad_norm": 3.196088735564785, + "learning_rate": 8.583024215457548e-06, + "loss": 1.249, + "step": 5806 + }, + { + "epoch": 1.1178862766802224, + "grad_norm": 3.232896060269576, + "learning_rate": 8.579937847338245e-06, + "loss": 1.3196, + "step": 5807 + }, + { + "epoch": 1.1180787833577976, + "grad_norm": 3.2574655536993955, + "learning_rate": 8.576851617267151e-06, + "loss": 1.255, + "step": 5808 + }, + { + "epoch": 1.1180787833577976, + "lm_loss": 1.5429, + "step": 5808, + "vm_loss": 0.1759 + }, + { + "epoch": 1.1180787833577976, + "lm_loss": 0.8541, + "step": 5808, + "vm_loss": 0.1467 + }, + { + "epoch": 1.1180787833577976, + "lm_loss": 1.1003, + "step": 5808, + "vm_loss": 0.1333 + }, + { + "epoch": 1.1180787833577976, + "lm_loss": 1.0175, + "step": 5808, + "vm_loss": 0.1691 + }, + { + "epoch": 1.1180787833577976, + "lm_loss": 0.5826, + "step": 5808, + "vm_loss": 0.1146 + }, + { + "epoch": 1.1180787833577976, + "lm_loss": 1.3037, + "step": 5808, + "vm_loss": 0.1584 + }, + { + "epoch": 1.1180787833577976, + "lm_loss": 0.8394, + "step": 5808, + "vm_loss": 0.1979 + }, + { + "epoch": 1.1180787833577976, + "lm_loss": 1.0671, + "step": 5808, + "vm_loss": 0.1556 + }, + { + "epoch": 1.118271290035373, + "grad_norm": 3.1902434672686613, + "learning_rate": 8.573765525544282e-06, + "loss": 1.2234, + "step": 5809 + }, + { + "epoch": 1.1184637967129485, + "grad_norm": 3.3514099123226946, + "learning_rate": 8.570679572469659e-06, + "loss": 1.3134, + "step": 5810 + }, + { + "epoch": 1.118656303390524, + "grad_norm": 3.3315402318895324, + "learning_rate": 8.567593758343266e-06, + "loss": 1.3401, + "step": 5811 + }, + { + "epoch": 1.1188488100680993, + "grad_norm": 3.173129076311302, + "learning_rate": 8.56450808346508e-06, + "loss": 1.2829, + "step": 5812 + }, + { + "epoch": 1.1190413167456745, + "grad_norm": 3.1240676379334684, + "learning_rate": 8.561422548135079e-06, + "loss": 1.2446, + "step": 5813 + }, + { + "epoch": 1.11923382342325, + "grad_norm": 3.183081385208541, + "learning_rate": 8.558337152653211e-06, + "loss": 1.3134, + "step": 5814 + }, + { + "epoch": 1.1194263301008254, + "grad_norm": 3.2413698585858035, + "learning_rate": 8.555251897319417e-06, + "loss": 1.3259, + "step": 5815 + }, + { + "epoch": 1.1196188367784008, + "grad_norm": 3.233607278621533, + "learning_rate": 8.552166782433617e-06, + "loss": 1.2863, + "step": 5816 + }, + { + "epoch": 1.1196188367784008, + "lm_loss": 1.1983, + "step": 5816, + "vm_loss": 0.1822 + }, + { + "epoch": 1.1196188367784008, + "lm_loss": 0.9664, + "step": 5816, + "vm_loss": 0.1395 + }, + { + "epoch": 1.1196188367784008, + "lm_loss": 1.0984, + "step": 5816, + "vm_loss": 0.1948 + }, + { + "epoch": 1.1196188367784008, + "lm_loss": 0.4811, + "step": 5816, + "vm_loss": 0.1375 + }, + { + "epoch": 1.1196188367784008, + "lm_loss": 0.6098, + "step": 5816, + "vm_loss": 0.151 + }, + { + "epoch": 1.1196188367784008, + "lm_loss": 1.2023, + "step": 5816, + "vm_loss": 0.1843 + }, + { + "epoch": 1.1196188367784008, + "lm_loss": 1.0636, + "step": 5816, + "vm_loss": 0.2021 + }, + { + "epoch": 1.1196188367784008, + "lm_loss": 0.9158, + "step": 5816, + "vm_loss": 0.1479 + }, + { + "epoch": 1.1198113434559762, + "grad_norm": 3.189399888909901, + "learning_rate": 8.549081808295732e-06, + "loss": 1.2151, + "step": 5817 + }, + { + "epoch": 1.1200038501335514, + "grad_norm": 3.4421164148864336, + "learning_rate": 8.54599697520566e-06, + "loss": 1.4045, + "step": 5818 + }, + { + "epoch": 1.1201963568111268, + "grad_norm": 3.2848889591705737, + "learning_rate": 8.542912283463278e-06, + "loss": 1.3319, + "step": 5819 + }, + { + "epoch": 1.1203888634887023, + "grad_norm": 3.0149955803195025, + "learning_rate": 8.53982773336847e-06, + "loss": 1.2412, + "step": 5820 + }, + { + "epoch": 1.1205813701662777, + "grad_norm": 3.3221193616576468, + "learning_rate": 8.536743325221087e-06, + "loss": 1.343, + "step": 5821 + }, + { + "epoch": 1.120773876843853, + "grad_norm": 3.1189604342214143, + "learning_rate": 8.53365905932097e-06, + "loss": 1.2208, + "step": 5822 + }, + { + "epoch": 1.1209663835214283, + "grad_norm": 3.2121365800015544, + "learning_rate": 8.530574935967952e-06, + "loss": 1.3094, + "step": 5823 + }, + { + "epoch": 1.1211588901990037, + "grad_norm": 3.2384096282188666, + "learning_rate": 8.527490955461852e-06, + "loss": 1.2752, + "step": 5824 + }, + { + "epoch": 1.1211588901990037, + "lm_loss": 1.3402, + "step": 5824, + "vm_loss": 0.1078 + }, + { + "epoch": 1.1211588901990037, + "lm_loss": 1.5689, + "step": 5824, + "vm_loss": 0.1381 + }, + { + "epoch": 1.1211588901990037, + "lm_loss": 1.0668, + "step": 5824, + "vm_loss": 0.1435 + }, + { + "epoch": 1.1211588901990037, + "lm_loss": 0.8923, + "step": 5824, + "vm_loss": 0.1059 + }, + { + "epoch": 1.1211588901990037, + "lm_loss": 1.1477, + "step": 5824, + "vm_loss": 0.1211 + }, + { + "epoch": 1.1211588901990037, + "lm_loss": 1.4189, + "step": 5824, + "vm_loss": 0.1487 + }, + { + "epoch": 1.1211588901990037, + "lm_loss": 0.9885, + "step": 5824, + "vm_loss": 0.2499 + }, + { + "epoch": 1.1211588901990037, + "lm_loss": 1.1444, + "step": 5824, + "vm_loss": 0.1393 + }, + { + "epoch": 1.1213513968765791, + "grad_norm": 3.1174973130473465, + "learning_rate": 8.52440711810247e-06, + "loss": 1.2267, + "step": 5825 + }, + { + "epoch": 1.1215439035541546, + "grad_norm": 3.2690984713514513, + "learning_rate": 8.521323424189593e-06, + "loss": 1.3266, + "step": 5826 + }, + { + "epoch": 1.12173641023173, + "grad_norm": 3.100651494825096, + "learning_rate": 8.518239874022997e-06, + "loss": 1.2071, + "step": 5827 + }, + { + "epoch": 1.1219289169093054, + "grad_norm": 3.2724168318632745, + "learning_rate": 8.515156467902444e-06, + "loss": 1.2676, + "step": 5828 + }, + { + "epoch": 1.1221214235868806, + "grad_norm": 3.2248893350241414, + "learning_rate": 8.512073206127677e-06, + "loss": 1.2517, + "step": 5829 + }, + { + "epoch": 1.122313930264456, + "grad_norm": 3.2508027984971823, + "learning_rate": 8.508990088998431e-06, + "loss": 1.295, + "step": 5830 + }, + { + "epoch": 1.1225064369420314, + "grad_norm": 3.2473741504206983, + "learning_rate": 8.505907116814427e-06, + "loss": 1.344, + "step": 5831 + }, + { + "epoch": 1.1226989436196069, + "grad_norm": 3.151055139151268, + "learning_rate": 8.502824289875366e-06, + "loss": 1.223, + "step": 5832 + }, + { + "epoch": 1.1226989436196069, + "lm_loss": 1.6531, + "step": 5832, + "vm_loss": 0.2195 + }, + { + "epoch": 1.1226989436196069, + "lm_loss": 0.6116, + "step": 5832, + "vm_loss": 0.2436 + }, + { + "epoch": 1.1226989436196069, + "lm_loss": 1.3, + "step": 5832, + "vm_loss": 0.1455 + }, + { + "epoch": 1.1226989436196069, + "lm_loss": 0.9203, + "step": 5832, + "vm_loss": 0.1323 + }, + { + "epoch": 1.1226989436196069, + "lm_loss": 1.1451, + "step": 5832, + "vm_loss": 0.1276 + }, + { + "epoch": 1.1226989436196069, + "lm_loss": 0.8976, + "step": 5832, + "vm_loss": 0.1809 + }, + { + "epoch": 1.1226989436196069, + "lm_loss": 1.0667, + "step": 5832, + "vm_loss": 0.1296 + }, + { + "epoch": 1.1226989436196069, + "lm_loss": 1.4443, + "step": 5832, + "vm_loss": 0.1593 + }, + { + "epoch": 1.1228914502971823, + "grad_norm": 3.273362627432169, + "learning_rate": 8.499741608480932e-06, + "loss": 1.3474, + "step": 5833 + }, + { + "epoch": 1.1230839569747575, + "grad_norm": 3.1271279159256675, + "learning_rate": 8.496659072930815e-06, + "loss": 1.2773, + "step": 5834 + }, + { + "epoch": 1.123276463652333, + "grad_norm": 3.2203187817709824, + "learning_rate": 8.493576683524667e-06, + "loss": 1.3296, + "step": 5835 + }, + { + "epoch": 1.1234689703299083, + "grad_norm": 3.2630457643465918, + "learning_rate": 8.490494440562136e-06, + "loss": 1.2938, + "step": 5836 + }, + { + "epoch": 1.1236614770074838, + "grad_norm": 3.2379638364155934, + "learning_rate": 8.48741234434286e-06, + "loss": 1.3148, + "step": 5837 + }, + { + "epoch": 1.1238539836850592, + "grad_norm": 3.170706942060178, + "learning_rate": 8.484330395166455e-06, + "loss": 1.3533, + "step": 5838 + }, + { + "epoch": 1.1240464903626344, + "grad_norm": 3.137551450835941, + "learning_rate": 8.48124859333253e-06, + "loss": 1.2266, + "step": 5839 + }, + { + "epoch": 1.1242389970402098, + "grad_norm": 3.2694774073982154, + "learning_rate": 8.478166939140665e-06, + "loss": 1.3561, + "step": 5840 + }, + { + "epoch": 1.1242389970402098, + "lm_loss": 1.4241, + "step": 5840, + "vm_loss": 0.1871 + }, + { + "epoch": 1.1242389970402098, + "lm_loss": 1.2203, + "step": 5840, + "vm_loss": 0.2032 + }, + { + "epoch": 1.1242389970402098, + "lm_loss": 0.9619, + "step": 5840, + "vm_loss": 0.1462 + }, + { + "epoch": 1.1242389970402098, + "lm_loss": 0.9268, + "step": 5840, + "vm_loss": 0.1974 + }, + { + "epoch": 1.1242389970402098, + "lm_loss": 0.8854, + "step": 5840, + "vm_loss": 0.1956 + }, + { + "epoch": 1.1242389970402098, + "lm_loss": 1.368, + "step": 5840, + "vm_loss": 0.1842 + }, + { + "epoch": 1.1242389970402098, + "lm_loss": 1.1761, + "step": 5840, + "vm_loss": 0.1947 + }, + { + "epoch": 1.1242389970402098, + "lm_loss": 0.8146, + "step": 5840, + "vm_loss": 0.1543 + }, + { + "epoch": 1.1244315037177852, + "grad_norm": 3.0620694997931004, + "learning_rate": 8.475085432890449e-06, + "loss": 1.2172, + "step": 5841 + }, + { + "epoch": 1.1246240103953606, + "grad_norm": 3.237102027297433, + "learning_rate": 8.472004074881438e-06, + "loss": 1.3298, + "step": 5842 + }, + { + "epoch": 1.124816517072936, + "grad_norm": 3.2295491664366978, + "learning_rate": 8.468922865413176e-06, + "loss": 1.2595, + "step": 5843 + }, + { + "epoch": 1.1250090237505113, + "grad_norm": 3.1840226859224017, + "learning_rate": 8.465841804785202e-06, + "loss": 1.272, + "step": 5844 + }, + { + "epoch": 1.1252015304280867, + "grad_norm": 3.356093941009692, + "learning_rate": 8.462760893297033e-06, + "loss": 1.3238, + "step": 5845 + }, + { + "epoch": 1.125394037105662, + "grad_norm": 3.42043759963782, + "learning_rate": 8.459680131248168e-06, + "loss": 1.3248, + "step": 5846 + }, + { + "epoch": 1.1255865437832375, + "grad_norm": 3.299890605639256, + "learning_rate": 8.456599518938107e-06, + "loss": 1.32, + "step": 5847 + }, + { + "epoch": 1.125779050460813, + "grad_norm": 3.19825838761476, + "learning_rate": 8.453519056666314e-06, + "loss": 1.3599, + "step": 5848 + }, + { + "epoch": 1.125779050460813, + "lm_loss": 1.5224, + "step": 5848, + "vm_loss": 0.1656 + }, + { + "epoch": 1.125779050460813, + "lm_loss": 1.3296, + "step": 5848, + "vm_loss": 0.1496 + }, + { + "epoch": 1.125779050460813, + "lm_loss": 0.5724, + "step": 5848, + "vm_loss": 0.1395 + }, + { + "epoch": 1.125779050460813, + "lm_loss": 1.1999, + "step": 5848, + "vm_loss": 0.1688 + }, + { + "epoch": 1.125779050460813, + "lm_loss": 0.8062, + "step": 5848, + "vm_loss": 0.1711 + }, + { + "epoch": 1.125779050460813, + "lm_loss": 0.7877, + "step": 5848, + "vm_loss": 0.2048 + }, + { + "epoch": 1.125779050460813, + "lm_loss": 0.9927, + "step": 5848, + "vm_loss": 0.1929 + }, + { + "epoch": 1.125779050460813, + "lm_loss": 1.9508, + "step": 5848, + "vm_loss": 0.1671 + }, + { + "epoch": 1.1259715571383881, + "grad_norm": 3.035662040743749, + "learning_rate": 8.450438744732259e-06, + "loss": 1.208, + "step": 5849 + }, + { + "epoch": 1.1261640638159636, + "grad_norm": 3.1088250859066857, + "learning_rate": 8.447358583435377e-06, + "loss": 1.2635, + "step": 5850 + }, + { + "epoch": 1.126356570493539, + "grad_norm": 3.21080519231543, + "learning_rate": 8.444278573075107e-06, + "loss": 1.3395, + "step": 5851 + }, + { + "epoch": 1.1265490771711144, + "grad_norm": 3.1600525452007626, + "learning_rate": 8.441198713950865e-06, + "loss": 1.2287, + "step": 5852 + }, + { + "epoch": 1.1267415838486898, + "grad_norm": 3.3862551949520965, + "learning_rate": 8.438119006362046e-06, + "loss": 1.3177, + "step": 5853 + }, + { + "epoch": 1.126934090526265, + "grad_norm": 3.1757911641365952, + "learning_rate": 8.435039450608046e-06, + "loss": 1.2492, + "step": 5854 + }, + { + "epoch": 1.1271265972038405, + "grad_norm": 3.2158351021886262, + "learning_rate": 8.431960046988235e-06, + "loss": 1.3062, + "step": 5855 + }, + { + "epoch": 1.1273191038814159, + "grad_norm": 3.4285830689747785, + "learning_rate": 8.428880795801965e-06, + "loss": 1.2645, + "step": 5856 + }, + { + "epoch": 1.1273191038814159, + "lm_loss": 1.2031, + "step": 5856, + "vm_loss": 0.2152 + }, + { + "epoch": 1.1273191038814159, + "lm_loss": 0.834, + "step": 5856, + "vm_loss": 0.1275 + }, + { + "epoch": 1.1273191038814159, + "lm_loss": 0.9468, + "step": 5856, + "vm_loss": 0.2031 + }, + { + "epoch": 1.1273191038814159, + "lm_loss": 1.0256, + "step": 5856, + "vm_loss": 0.2021 + }, + { + "epoch": 1.1273191038814159, + "lm_loss": 0.8655, + "step": 5856, + "vm_loss": 0.1995 + }, + { + "epoch": 1.1273191038814159, + "lm_loss": 1.1139, + "step": 5856, + "vm_loss": 0.1018 + }, + { + "epoch": 1.1273191038814159, + "lm_loss": 1.0159, + "step": 5856, + "vm_loss": 0.219 + }, + { + "epoch": 1.1273191038814159, + "lm_loss": 0.6217, + "step": 5856, + "vm_loss": 0.1255 + }, + { + "epoch": 1.1275116105589913, + "grad_norm": 3.140633895907306, + "learning_rate": 8.42580169734858e-06, + "loss": 1.2262, + "step": 5857 + }, + { + "epoch": 1.1277041172365667, + "grad_norm": 3.143927110954885, + "learning_rate": 8.422722751927415e-06, + "loss": 1.2241, + "step": 5858 + }, + { + "epoch": 1.127896623914142, + "grad_norm": 3.351408604636248, + "learning_rate": 8.419643959837777e-06, + "loss": 1.2718, + "step": 5859 + }, + { + "epoch": 1.1280891305917173, + "grad_norm": 3.3739720287411203, + "learning_rate": 8.416565321378962e-06, + "loss": 1.2684, + "step": 5860 + }, + { + "epoch": 1.1282816372692928, + "grad_norm": 3.2173463678522864, + "learning_rate": 8.413486836850258e-06, + "loss": 1.2844, + "step": 5861 + }, + { + "epoch": 1.1284741439468682, + "grad_norm": 3.1883903205622155, + "learning_rate": 8.41040850655093e-06, + "loss": 1.3117, + "step": 5862 + }, + { + "epoch": 1.1286666506244436, + "grad_norm": 3.1852220130364937, + "learning_rate": 8.407330330780234e-06, + "loss": 1.3195, + "step": 5863 + }, + { + "epoch": 1.1288591573020188, + "grad_norm": 3.316046859845407, + "learning_rate": 8.4042523098374e-06, + "loss": 1.2692, + "step": 5864 + }, + { + "epoch": 1.1288591573020188, + "lm_loss": 1.8406, + "step": 5864, + "vm_loss": 0.1442 + }, + { + "epoch": 1.1288591573020188, + "lm_loss": 0.9737, + "step": 5864, + "vm_loss": 0.1613 + }, + { + "epoch": 1.1288591573020188, + "lm_loss": 0.5428, + "step": 5864, + "vm_loss": 0.1323 + }, + { + "epoch": 1.1288591573020188, + "lm_loss": 0.9614, + "step": 5864, + "vm_loss": 0.1666 + }, + { + "epoch": 1.1288591573020188, + "lm_loss": 1.2193, + "step": 5864, + "vm_loss": 0.1897 + }, + { + "epoch": 1.1288591573020188, + "lm_loss": 0.8916, + "step": 5864, + "vm_loss": 0.1914 + }, + { + "epoch": 1.1288591573020188, + "lm_loss": 0.9738, + "step": 5864, + "vm_loss": 0.1467 + }, + { + "epoch": 1.1288591573020188, + "lm_loss": 0.8708, + "step": 5864, + "vm_loss": 0.1674 + }, + { + "epoch": 1.1290516639795942, + "grad_norm": 3.2270768209537355, + "learning_rate": 8.401174444021663e-06, + "loss": 1.2638, + "step": 5865 + }, + { + "epoch": 1.1292441706571696, + "grad_norm": 3.3164734166666596, + "learning_rate": 8.398096733632223e-06, + "loss": 1.2329, + "step": 5866 + }, + { + "epoch": 1.129436677334745, + "grad_norm": 3.2916633833882223, + "learning_rate": 8.395019178968274e-06, + "loss": 1.2415, + "step": 5867 + }, + { + "epoch": 1.1296291840123205, + "grad_norm": 3.5267657621273036, + "learning_rate": 8.391941780328996e-06, + "loss": 1.2922, + "step": 5868 + }, + { + "epoch": 1.1298216906898957, + "grad_norm": 3.1315513936167827, + "learning_rate": 8.38886453801355e-06, + "loss": 1.2486, + "step": 5869 + }, + { + "epoch": 1.130014197367471, + "grad_norm": 3.1883547318831607, + "learning_rate": 8.38578745232108e-06, + "loss": 1.2458, + "step": 5870 + }, + { + "epoch": 1.1302067040450465, + "grad_norm": 3.2144540780756685, + "learning_rate": 8.382710523550725e-06, + "loss": 1.2772, + "step": 5871 + }, + { + "epoch": 1.130399210722622, + "grad_norm": 3.0267568001221354, + "learning_rate": 8.379633752001597e-06, + "loss": 1.2134, + "step": 5872 + }, + { + "epoch": 1.130399210722622, + "lm_loss": 0.8084, + "step": 5872, + "vm_loss": 0.1572 + }, + { + "epoch": 1.130399210722622, + "lm_loss": 0.991, + "step": 5872, + "vm_loss": 0.1964 + }, + { + "epoch": 1.130399210722622, + "lm_loss": 1.2142, + "step": 5872, + "vm_loss": 0.1955 + }, + { + "epoch": 1.130399210722622, + "lm_loss": 1.483, + "step": 5872, + "vm_loss": 0.1462 + }, + { + "epoch": 1.130399210722622, + "lm_loss": 1.1382, + "step": 5872, + "vm_loss": 0.1176 + }, + { + "epoch": 1.130399210722622, + "lm_loss": 1.4086, + "step": 5872, + "vm_loss": 0.1536 + }, + { + "epoch": 1.130399210722622, + "lm_loss": 1.0364, + "step": 5872, + "vm_loss": 0.1514 + }, + { + "epoch": 1.130399210722622, + "lm_loss": 0.8559, + "step": 5872, + "vm_loss": 0.1879 + }, + { + "epoch": 1.1305917174001974, + "grad_norm": 3.1448279535608665, + "learning_rate": 8.376557137972801e-06, + "loss": 1.2348, + "step": 5873 + }, + { + "epoch": 1.1307842240777728, + "grad_norm": 3.2773959062530764, + "learning_rate": 8.373480681763418e-06, + "loss": 1.3179, + "step": 5874 + }, + { + "epoch": 1.130976730755348, + "grad_norm": 3.2631498231279292, + "learning_rate": 8.370404383672525e-06, + "loss": 1.3622, + "step": 5875 + }, + { + "epoch": 1.1311692374329234, + "grad_norm": 3.2126047187470883, + "learning_rate": 8.367328243999174e-06, + "loss": 1.2495, + "step": 5876 + }, + { + "epoch": 1.1313617441104988, + "grad_norm": 3.2704190844929886, + "learning_rate": 8.3642522630424e-06, + "loss": 1.3115, + "step": 5877 + }, + { + "epoch": 1.1315542507880743, + "grad_norm": 3.126034016582004, + "learning_rate": 8.36117644110124e-06, + "loss": 1.2036, + "step": 5878 + }, + { + "epoch": 1.1317467574656497, + "grad_norm": 3.319519104724287, + "learning_rate": 8.358100778474696e-06, + "loss": 1.3649, + "step": 5879 + }, + { + "epoch": 1.1319392641432249, + "grad_norm": 3.389219887371969, + "learning_rate": 8.355025275461764e-06, + "loss": 1.3423, + "step": 5880 + }, + { + "epoch": 1.1319392641432249, + "lm_loss": 0.9885, + "step": 5880, + "vm_loss": 0.1037 + }, + { + "epoch": 1.1319392641432249, + "lm_loss": 1.0471, + "step": 5880, + "vm_loss": 0.1675 + }, + { + "epoch": 1.1319392641432249, + "lm_loss": 1.2167, + "step": 5880, + "vm_loss": 0.2535 + }, + { + "epoch": 1.1319392641432249, + "lm_loss": 1.006, + "step": 5880, + "vm_loss": 0.1394 + }, + { + "epoch": 1.1319392641432249, + "lm_loss": 1.4664, + "step": 5880, + "vm_loss": 0.1889 + }, + { + "epoch": 1.1319392641432249, + "lm_loss": 1.0578, + "step": 5880, + "vm_loss": 0.1455 + }, + { + "epoch": 1.1319392641432249, + "lm_loss": 1.1965, + "step": 5880, + "vm_loss": 0.173 + }, + { + "epoch": 1.1319392641432249, + "lm_loss": 1.1448, + "step": 5880, + "vm_loss": 0.1074 + }, + { + "epoch": 1.1321317708208003, + "grad_norm": 3.0365568448688363, + "learning_rate": 8.351949932361413e-06, + "loss": 1.2472, + "step": 5881 + }, + { + "epoch": 1.1323242774983757, + "grad_norm": 3.1737492451190423, + "learning_rate": 8.34887474947262e-06, + "loss": 1.2445, + "step": 5882 + }, + { + "epoch": 1.1325167841759511, + "grad_norm": 3.155271224195876, + "learning_rate": 8.345799727094323e-06, + "loss": 1.3018, + "step": 5883 + }, + { + "epoch": 1.1327092908535266, + "grad_norm": 3.0813235955400273, + "learning_rate": 8.342724865525455e-06, + "loss": 1.3244, + "step": 5884 + }, + { + "epoch": 1.132901797531102, + "grad_norm": 3.2410567940081236, + "learning_rate": 8.339650165064935e-06, + "loss": 1.3045, + "step": 5885 + }, + { + "epoch": 1.1330943042086772, + "grad_norm": 3.1581908678685644, + "learning_rate": 8.33657562601166e-06, + "loss": 1.2405, + "step": 5886 + }, + { + "epoch": 1.1332868108862526, + "grad_norm": 3.1498172778383347, + "learning_rate": 8.333501248664513e-06, + "loss": 1.2199, + "step": 5887 + }, + { + "epoch": 1.133479317563828, + "grad_norm": 3.3202762759733138, + "learning_rate": 8.33042703332237e-06, + "loss": 1.2404, + "step": 5888 + }, + { + "epoch": 1.133479317563828, + "lm_loss": 1.3709, + "step": 5888, + "vm_loss": 0.1867 + }, + { + "epoch": 1.133479317563828, + "lm_loss": 0.8066, + "step": 5888, + "vm_loss": 0.1585 + }, + { + "epoch": 1.133479317563828, + "lm_loss": 1.247, + "step": 5888, + "vm_loss": 0.145 + }, + { + "epoch": 1.133479317563828, + "lm_loss": 0.7363, + "step": 5888, + "vm_loss": 0.1228 + }, + { + "epoch": 1.133479317563828, + "lm_loss": 1.52, + "step": 5888, + "vm_loss": 0.1726 + }, + { + "epoch": 1.133479317563828, + "lm_loss": 0.8289, + "step": 5888, + "vm_loss": 0.2152 + }, + { + "epoch": 1.133479317563828, + "lm_loss": 1.4974, + "step": 5888, + "vm_loss": 0.1832 + }, + { + "epoch": 1.133479317563828, + "lm_loss": 0.8929, + "step": 5888, + "vm_loss": 0.1811 + }, + { + "epoch": 1.1336718242414034, + "grad_norm": 3.2119194287590354, + "learning_rate": 8.327352980284076e-06, + "loss": 1.2301, + "step": 5889 + }, + { + "epoch": 1.1338643309189789, + "grad_norm": 3.367612511967922, + "learning_rate": 8.324279089848475e-06, + "loss": 1.2604, + "step": 5890 + }, + { + "epoch": 1.134056837596554, + "grad_norm": 3.393785170173505, + "learning_rate": 8.321205362314382e-06, + "loss": 1.3117, + "step": 5891 + }, + { + "epoch": 1.1342493442741295, + "grad_norm": 3.2545119201967827, + "learning_rate": 8.318131797980606e-06, + "loss": 1.2792, + "step": 5892 + }, + { + "epoch": 1.134441850951705, + "grad_norm": 3.2777173328509694, + "learning_rate": 8.31505839714594e-06, + "loss": 1.3164, + "step": 5893 + }, + { + "epoch": 1.1346343576292803, + "grad_norm": 3.2705333322198573, + "learning_rate": 8.311985160109148e-06, + "loss": 1.3177, + "step": 5894 + }, + { + "epoch": 1.1348268643068558, + "grad_norm": 3.112691149032303, + "learning_rate": 8.308912087169001e-06, + "loss": 1.2403, + "step": 5895 + }, + { + "epoch": 1.135019370984431, + "grad_norm": 3.1961449583180523, + "learning_rate": 8.305839178624233e-06, + "loss": 1.245, + "step": 5896 + }, + { + "epoch": 1.135019370984431, + "lm_loss": 1.1306, + "step": 5896, + "vm_loss": 0.2131 + }, + { + "epoch": 1.135019370984431, + "lm_loss": 1.5792, + "step": 5896, + "vm_loss": 0.1532 + }, + { + "epoch": 1.135019370984431, + "lm_loss": 1.2191, + "step": 5896, + "vm_loss": 0.1746 + }, + { + "epoch": 1.135019370984431, + "lm_loss": 1.7574, + "step": 5896, + "vm_loss": 0.1584 + }, + { + "epoch": 1.135019370984431, + "lm_loss": 1.3805, + "step": 5896, + "vm_loss": 0.1612 + }, + { + "epoch": 1.135019370984431, + "lm_loss": 1.5082, + "step": 5896, + "vm_loss": 0.121 + }, + { + "epoch": 1.135019370984431, + "lm_loss": 0.7554, + "step": 5896, + "vm_loss": 0.1647 + }, + { + "epoch": 1.135019370984431, + "lm_loss": 1.0155, + "step": 5896, + "vm_loss": 0.1578 + }, + { + "epoch": 1.1352118776620064, + "grad_norm": 3.25801437423002, + "learning_rate": 8.302766434773572e-06, + "loss": 1.33, + "step": 5897 + }, + { + "epoch": 1.1354043843395818, + "grad_norm": 3.193130018969506, + "learning_rate": 8.299693855915723e-06, + "loss": 1.2436, + "step": 5898 + }, + { + "epoch": 1.1355968910171572, + "grad_norm": 3.2663209610979114, + "learning_rate": 8.29662144234939e-06, + "loss": 1.2687, + "step": 5899 + }, + { + "epoch": 1.1357893976947326, + "grad_norm": 3.3788371678679905, + "learning_rate": 8.293549194373243e-06, + "loss": 1.2267, + "step": 5900 + }, + { + "epoch": 1.1359819043723078, + "grad_norm": 3.295447626922356, + "learning_rate": 8.290477112285944e-06, + "loss": 1.2073, + "step": 5901 + }, + { + "epoch": 1.1361744110498833, + "grad_norm": 3.3398406764439974, + "learning_rate": 8.287405196386144e-06, + "loss": 1.2767, + "step": 5902 + }, + { + "epoch": 1.1363669177274587, + "grad_norm": 3.2929088211476523, + "learning_rate": 8.284333446972471e-06, + "loss": 1.2379, + "step": 5903 + }, + { + "epoch": 1.136559424405034, + "grad_norm": 3.3768346170975154, + "learning_rate": 8.28126186434353e-06, + "loss": 1.2948, + "step": 5904 + }, + { + "epoch": 1.136559424405034, + "lm_loss": 1.1123, + "step": 5904, + "vm_loss": 0.1732 + }, + { + "epoch": 1.136559424405034, + "lm_loss": 1.1455, + "step": 5904, + "vm_loss": 0.1466 + }, + { + "epoch": 1.136559424405034, + "lm_loss": 0.6919, + "step": 5904, + "vm_loss": 0.2131 + }, + { + "epoch": 1.136559424405034, + "lm_loss": 1.0409, + "step": 5904, + "vm_loss": 0.1845 + }, + { + "epoch": 1.136559424405034, + "lm_loss": 1.1416, + "step": 5904, + "vm_loss": 0.1882 + }, + { + "epoch": 1.136559424405034, + "lm_loss": 1.0636, + "step": 5904, + "vm_loss": 0.1562 + }, + { + "epoch": 1.136559424405034, + "lm_loss": 0.5159, + "step": 5904, + "vm_loss": 0.143 + }, + { + "epoch": 1.136559424405034, + "lm_loss": 1.1402, + "step": 5904, + "vm_loss": 0.1266 + }, + { + "epoch": 1.1367519310826095, + "grad_norm": 3.347507969112467, + "learning_rate": 8.278190448797931e-06, + "loss": 1.2502, + "step": 5905 + }, + { + "epoch": 1.1369444377601847, + "grad_norm": 3.220006291494801, + "learning_rate": 8.275119200634248e-06, + "loss": 1.2404, + "step": 5906 + }, + { + "epoch": 1.1371369444377601, + "grad_norm": 3.302165627611826, + "learning_rate": 8.272048120151045e-06, + "loss": 1.3117, + "step": 5907 + }, + { + "epoch": 1.1373294511153356, + "grad_norm": 3.222033805945584, + "learning_rate": 8.26897720764687e-06, + "loss": 1.2809, + "step": 5908 + }, + { + "epoch": 1.137521957792911, + "grad_norm": 3.255902778533384, + "learning_rate": 8.26590646342026e-06, + "loss": 1.2362, + "step": 5909 + }, + { + "epoch": 1.1377144644704864, + "grad_norm": 3.132923384517969, + "learning_rate": 8.262835887769725e-06, + "loss": 1.2725, + "step": 5910 + }, + { + "epoch": 1.1379069711480616, + "grad_norm": 3.247885768061465, + "learning_rate": 8.259765480993767e-06, + "loss": 1.2829, + "step": 5911 + }, + { + "epoch": 1.138099477825637, + "grad_norm": 3.225341105290348, + "learning_rate": 8.256695243390868e-06, + "loss": 1.258, + "step": 5912 + }, + { + "epoch": 1.138099477825637, + "lm_loss": 0.8413, + "step": 5912, + "vm_loss": 0.141 + }, + { + "epoch": 1.138099477825637, + "lm_loss": 1.3644, + "step": 5912, + "vm_loss": 0.1992 + }, + { + "epoch": 1.138099477825637, + "lm_loss": 1.3922, + "step": 5912, + "vm_loss": 0.1429 + }, + { + "epoch": 1.138099477825637, + "lm_loss": 1.0489, + "step": 5912, + "vm_loss": 0.1699 + }, + { + "epoch": 1.138099477825637, + "lm_loss": 0.9715, + "step": 5912, + "vm_loss": 0.1588 + }, + { + "epoch": 1.138099477825637, + "lm_loss": 1.0801, + "step": 5912, + "vm_loss": 0.18 + }, + { + "epoch": 1.138099477825637, + "lm_loss": 0.8907, + "step": 5912, + "vm_loss": 0.1636 + }, + { + "epoch": 1.138099477825637, + "lm_loss": 0.8373, + "step": 5912, + "vm_loss": 0.1846 + }, + { + "epoch": 1.1382919845032125, + "grad_norm": 3.202713247720956, + "learning_rate": 8.253625175259496e-06, + "loss": 1.2277, + "step": 5913 + }, + { + "epoch": 1.1384844911807879, + "grad_norm": 3.274301887721809, + "learning_rate": 8.2505552768981e-06, + "loss": 1.3141, + "step": 5914 + }, + { + "epoch": 1.1386769978583633, + "grad_norm": 3.2081132999983923, + "learning_rate": 8.24748554860511e-06, + "loss": 1.2779, + "step": 5915 + }, + { + "epoch": 1.1388695045359385, + "grad_norm": 3.432795170897406, + "learning_rate": 8.244415990678949e-06, + "loss": 1.3048, + "step": 5916 + }, + { + "epoch": 1.139062011213514, + "grad_norm": 3.3953901171098724, + "learning_rate": 8.241346603418014e-06, + "loss": 1.3193, + "step": 5917 + }, + { + "epoch": 1.1392545178910893, + "grad_norm": 3.2766238592456514, + "learning_rate": 8.238277387120685e-06, + "loss": 1.2205, + "step": 5918 + }, + { + "epoch": 1.1394470245686648, + "grad_norm": 3.333379257890828, + "learning_rate": 8.235208342085337e-06, + "loss": 1.3327, + "step": 5919 + }, + { + "epoch": 1.1396395312462402, + "grad_norm": 3.2670742832098916, + "learning_rate": 8.232139468610316e-06, + "loss": 1.2547, + "step": 5920 + }, + { + "epoch": 1.1396395312462402, + "lm_loss": 1.4901, + "step": 5920, + "vm_loss": 0.1329 + }, + { + "epoch": 1.1396395312462402, + "lm_loss": 0.5242, + "step": 5920, + "vm_loss": 0.1776 + }, + { + "epoch": 1.1396395312462402, + "lm_loss": 0.9055, + "step": 5920, + "vm_loss": 0.2502 + }, + { + "epoch": 1.1396395312462402, + "lm_loss": 1.0267, + "step": 5920, + "vm_loss": 0.1693 + }, + { + "epoch": 1.1396395312462402, + "lm_loss": 0.9592, + "step": 5920, + "vm_loss": 0.1139 + }, + { + "epoch": 1.1396395312462402, + "lm_loss": 0.9626, + "step": 5920, + "vm_loss": 0.156 + }, + { + "epoch": 1.1396395312462402, + "lm_loss": 1.075, + "step": 5920, + "vm_loss": 0.1627 + }, + { + "epoch": 1.1396395312462402, + "lm_loss": 0.773, + "step": 5920, + "vm_loss": 0.1635 + }, + { + "epoch": 1.1398320379238154, + "grad_norm": 3.128453319459127, + "learning_rate": 8.229070766993955e-06, + "loss": 1.256, + "step": 5921 + }, + { + "epoch": 1.1400245446013908, + "grad_norm": 3.074504414024722, + "learning_rate": 8.226002237534574e-06, + "loss": 1.2452, + "step": 5922 + }, + { + "epoch": 1.1402170512789662, + "grad_norm": 3.1236767517262582, + "learning_rate": 8.22293388053047e-06, + "loss": 1.2531, + "step": 5923 + }, + { + "epoch": 1.1404095579565416, + "grad_norm": 3.1929280601830228, + "learning_rate": 8.21986569627993e-06, + "loss": 1.2414, + "step": 5924 + }, + { + "epoch": 1.140602064634117, + "grad_norm": 3.3336778312718036, + "learning_rate": 8.216797685081214e-06, + "loss": 1.2975, + "step": 5925 + }, + { + "epoch": 1.1407945713116923, + "grad_norm": 3.2610425030665087, + "learning_rate": 8.213729847232583e-06, + "loss": 1.2399, + "step": 5926 + }, + { + "epoch": 1.1409870779892677, + "grad_norm": 3.323426816433846, + "learning_rate": 8.210662183032262e-06, + "loss": 1.2964, + "step": 5927 + }, + { + "epoch": 1.141179584666843, + "grad_norm": 3.389401166964098, + "learning_rate": 8.207594692778464e-06, + "loss": 1.3115, + "step": 5928 + }, + { + "epoch": 1.141179584666843, + "lm_loss": 0.6674, + "step": 5928, + "vm_loss": 0.1499 + }, + { + "epoch": 1.141179584666843, + "lm_loss": 1.0127, + "step": 5928, + "vm_loss": 0.1883 + }, + { + "epoch": 1.141179584666843, + "lm_loss": 0.7549, + "step": 5928, + "vm_loss": 0.2239 + }, + { + "epoch": 1.141179584666843, + "lm_loss": 0.8433, + "step": 5928, + "vm_loss": 0.1168 + }, + { + "epoch": 1.141179584666843, + "lm_loss": 1.461, + "step": 5928, + "vm_loss": 0.1869 + }, + { + "epoch": 1.141179584666843, + "lm_loss": 0.849, + "step": 5928, + "vm_loss": 0.2322 + }, + { + "epoch": 1.141179584666843, + "lm_loss": 0.9908, + "step": 5928, + "vm_loss": 0.1728 + }, + { + "epoch": 1.141179584666843, + "lm_loss": 1.1951, + "step": 5928, + "vm_loss": 0.1122 + }, + { + "epoch": 1.1413720913444185, + "grad_norm": 3.2606204289380565, + "learning_rate": 8.204527376769401e-06, + "loss": 1.269, + "step": 5929 + }, + { + "epoch": 1.141564598021994, + "grad_norm": 3.1432490902994736, + "learning_rate": 8.201460235303245e-06, + "loss": 1.312, + "step": 5930 + }, + { + "epoch": 1.1417571046995691, + "grad_norm": 3.263003331658955, + "learning_rate": 8.198393268678163e-06, + "loss": 1.3229, + "step": 5931 + }, + { + "epoch": 1.1419496113771446, + "grad_norm": 3.105052795488984, + "learning_rate": 8.195326477192305e-06, + "loss": 1.2408, + "step": 5932 + }, + { + "epoch": 1.14214211805472, + "grad_norm": 3.201663012326055, + "learning_rate": 8.1922598611438e-06, + "loss": 1.3032, + "step": 5933 + }, + { + "epoch": 1.1423346247322954, + "grad_norm": 3.3066660747381014, + "learning_rate": 8.189193420830768e-06, + "loss": 1.3294, + "step": 5934 + }, + { + "epoch": 1.1425271314098708, + "grad_norm": 3.2277095117417303, + "learning_rate": 8.1861271565513e-06, + "loss": 1.3289, + "step": 5935 + }, + { + "epoch": 1.1427196380874463, + "grad_norm": 3.204952925050108, + "learning_rate": 8.18306106860348e-06, + "loss": 1.2099, + "step": 5936 + }, + { + "epoch": 1.1427196380874463, + "lm_loss": 1.0924, + "step": 5936, + "vm_loss": 0.1508 + }, + { + "epoch": 1.1427196380874463, + "lm_loss": 1.06, + "step": 5936, + "vm_loss": 0.1538 + }, + { + "epoch": 1.1427196380874463, + "lm_loss": 0.8774, + "step": 5936, + "vm_loss": 0.1288 + }, + { + "epoch": 1.1427196380874463, + "lm_loss": 0.7256, + "step": 5936, + "vm_loss": 0.1508 + }, + { + "epoch": 1.1427196380874463, + "lm_loss": 0.945, + "step": 5936, + "vm_loss": 0.1631 + }, + { + "epoch": 1.1427196380874463, + "lm_loss": 0.8488, + "step": 5936, + "vm_loss": 0.2005 + }, + { + "epoch": 1.1427196380874463, + "lm_loss": 0.8559, + "step": 5936, + "vm_loss": 0.1647 + }, + { + "epoch": 1.1427196380874463, + "lm_loss": 0.732, + "step": 5936, + "vm_loss": 0.1127 + }, + { + "epoch": 1.1429121447650215, + "grad_norm": 3.159000653909086, + "learning_rate": 8.17999515728537e-06, + "loss": 1.2107, + "step": 5937 + }, + { + "epoch": 1.1431046514425969, + "grad_norm": 3.161568254489513, + "learning_rate": 8.17692942289501e-06, + "loss": 1.2297, + "step": 5938 + }, + { + "epoch": 1.1432971581201723, + "grad_norm": 3.2931193580244518, + "learning_rate": 8.173863865730441e-06, + "loss": 1.2056, + "step": 5939 + }, + { + "epoch": 1.1434896647977477, + "grad_norm": 3.3584679898396033, + "learning_rate": 8.170798486089666e-06, + "loss": 1.2431, + "step": 5940 + }, + { + "epoch": 1.1436821714753231, + "grad_norm": 3.3366810272422684, + "learning_rate": 8.167733284270681e-06, + "loss": 1.2478, + "step": 5941 + }, + { + "epoch": 1.1438746781528983, + "grad_norm": 3.3192864453322706, + "learning_rate": 8.164668260571457e-06, + "loss": 1.2387, + "step": 5942 + }, + { + "epoch": 1.1440671848304738, + "grad_norm": 3.4568546688336435, + "learning_rate": 8.161603415289964e-06, + "loss": 1.2533, + "step": 5943 + }, + { + "epoch": 1.1442596915080492, + "grad_norm": 3.4068916522800383, + "learning_rate": 8.158538748724139e-06, + "loss": 1.2816, + "step": 5944 + }, + { + "epoch": 1.1442596915080492, + "lm_loss": 1.4009, + "step": 5944, + "vm_loss": 0.246 + }, + { + "epoch": 1.1442596915080492, + "lm_loss": 0.9023, + "step": 5944, + "vm_loss": 0.1045 + }, + { + "epoch": 1.1442596915080492, + "lm_loss": 0.953, + "step": 5944, + "vm_loss": 0.1358 + }, + { + "epoch": 1.1442596915080492, + "lm_loss": 1.2306, + "step": 5944, + "vm_loss": 0.2208 + }, + { + "epoch": 1.1442596915080492, + "lm_loss": 0.9315, + "step": 5944, + "vm_loss": 0.1791 + }, + { + "epoch": 1.1442596915080492, + "lm_loss": 1.2075, + "step": 5944, + "vm_loss": 0.1958 + }, + { + "epoch": 1.1442596915080492, + "lm_loss": 0.9148, + "step": 5944, + "vm_loss": 0.18 + }, + { + "epoch": 1.1442596915080492, + "lm_loss": 0.5915, + "step": 5944, + "vm_loss": 0.1626 + }, + { + "epoch": 1.1444521981856246, + "grad_norm": 3.215902215356208, + "learning_rate": 8.155474261171902e-06, + "loss": 1.2718, + "step": 5945 + }, + { + "epoch": 1.1446447048632, + "grad_norm": 3.3448726721315825, + "learning_rate": 8.152409952931168e-06, + "loss": 1.2588, + "step": 5946 + }, + { + "epoch": 1.1448372115407752, + "grad_norm": 3.188109293885731, + "learning_rate": 8.149345824299826e-06, + "loss": 1.2375, + "step": 5947 + }, + { + "epoch": 1.1450297182183506, + "grad_norm": 3.2546810917779494, + "learning_rate": 8.146281875575748e-06, + "loss": 1.277, + "step": 5948 + }, + { + "epoch": 1.145222224895926, + "grad_norm": 3.125567246071508, + "learning_rate": 8.14321810705678e-06, + "loss": 1.2126, + "step": 5949 + }, + { + "epoch": 1.1454147315735015, + "grad_norm": 3.1885004732648343, + "learning_rate": 8.140154519040775e-06, + "loss": 1.3086, + "step": 5950 + }, + { + "epoch": 1.145607238251077, + "grad_norm": 3.3430427028184564, + "learning_rate": 8.137091111825541e-06, + "loss": 1.371, + "step": 5951 + }, + { + "epoch": 1.1457997449286523, + "grad_norm": 3.2094088458729457, + "learning_rate": 8.134027885708884e-06, + "loss": 1.2888, + "step": 5952 + }, + { + "epoch": 1.1457997449286523, + "lm_loss": 0.6678, + "step": 5952, + "vm_loss": 0.1427 + }, + { + "epoch": 1.1457997449286523, + "lm_loss": 0.9595, + "step": 5952, + "vm_loss": 0.1931 + }, + { + "epoch": 1.1457997449286523, + "lm_loss": 1.3242, + "step": 5952, + "vm_loss": 0.1428 + }, + { + "epoch": 1.1457997449286523, + "lm_loss": 0.9153, + "step": 5952, + "vm_loss": 0.1384 + }, + { + "epoch": 1.1457997449286523, + "lm_loss": 1.095, + "step": 5952, + "vm_loss": 0.136 + }, + { + "epoch": 1.1457997449286523, + "lm_loss": 0.7651, + "step": 5952, + "vm_loss": 0.1386 + }, + { + "epoch": 1.1457997449286523, + "lm_loss": 1.0999, + "step": 5952, + "vm_loss": 0.1972 + }, + { + "epoch": 1.1457997449286523, + "lm_loss": 0.9255, + "step": 5952, + "vm_loss": 0.1819 + }, + { + "epoch": 1.1459922516062275, + "grad_norm": 3.120233319769545, + "learning_rate": 8.130964840988593e-06, + "loss": 1.223, + "step": 5953 + }, + { + "epoch": 1.146184758283803, + "grad_norm": 3.161250321376422, + "learning_rate": 8.127901977962428e-06, + "loss": 1.239, + "step": 5954 + }, + { + "epoch": 1.1463772649613784, + "grad_norm": 3.200221989792111, + "learning_rate": 8.124839296928144e-06, + "loss": 1.1959, + "step": 5955 + }, + { + "epoch": 1.1465697716389538, + "grad_norm": 3.4369949980712606, + "learning_rate": 8.121776798183468e-06, + "loss": 1.271, + "step": 5956 + }, + { + "epoch": 1.1467622783165292, + "grad_norm": 3.3972309079232956, + "learning_rate": 8.11871448202612e-06, + "loss": 1.3276, + "step": 5957 + }, + { + "epoch": 1.1469547849941044, + "grad_norm": 3.5101688273837417, + "learning_rate": 8.115652348753791e-06, + "loss": 1.2771, + "step": 5958 + }, + { + "epoch": 1.1471472916716798, + "grad_norm": 3.3620678161460162, + "learning_rate": 8.11259039866416e-06, + "loss": 1.2147, + "step": 5959 + }, + { + "epoch": 1.1473397983492553, + "grad_norm": 3.1849842004415088, + "learning_rate": 8.109528632054893e-06, + "loss": 1.2932, + "step": 5960 + }, + { + "epoch": 1.1473397983492553, + "lm_loss": 0.9591, + "step": 5960, + "vm_loss": 0.1558 + }, + { + "epoch": 1.1473397983492553, + "lm_loss": 1.7359, + "step": 5960, + "vm_loss": 0.1605 + }, + { + "epoch": 1.1473397983492553, + "lm_loss": 1.078, + "step": 5960, + "vm_loss": 0.1212 + }, + { + "epoch": 1.1473397983492553, + "lm_loss": 0.8653, + "step": 5960, + "vm_loss": 0.152 + }, + { + "epoch": 1.1473397983492553, + "lm_loss": 1.3356, + "step": 5960, + "vm_loss": 0.1852 + }, + { + "epoch": 1.1473397983492553, + "lm_loss": 1.6872, + "step": 5960, + "vm_loss": 0.1815 + }, + { + "epoch": 1.1473397983492553, + "lm_loss": 1.4369, + "step": 5960, + "vm_loss": 0.1345 + }, + { + "epoch": 1.1473397983492553, + "lm_loss": 1.3156, + "step": 5960, + "vm_loss": 0.1857 + }, + { + "epoch": 1.1475323050268307, + "grad_norm": 3.0988719823340025, + "learning_rate": 8.106467049223628e-06, + "loss": 1.2428, + "step": 5961 + }, + { + "epoch": 1.147724811704406, + "grad_norm": 3.1559597705391473, + "learning_rate": 8.103405650467988e-06, + "loss": 1.2601, + "step": 5962 + }, + { + "epoch": 1.1479173183819813, + "grad_norm": 3.218222551515623, + "learning_rate": 8.100344436085588e-06, + "loss": 1.2911, + "step": 5963 + }, + { + "epoch": 1.1481098250595567, + "grad_norm": 3.188426498694288, + "learning_rate": 8.097283406374012e-06, + "loss": 1.2724, + "step": 5964 + }, + { + "epoch": 1.1483023317371321, + "grad_norm": 3.1882918974843273, + "learning_rate": 8.094222561630832e-06, + "loss": 1.271, + "step": 5965 + }, + { + "epoch": 1.1484948384147076, + "grad_norm": 3.247748487957767, + "learning_rate": 8.091161902153594e-06, + "loss": 1.2275, + "step": 5966 + }, + { + "epoch": 1.148687345092283, + "grad_norm": 3.235754295006389, + "learning_rate": 8.088101428239847e-06, + "loss": 1.2666, + "step": 5967 + }, + { + "epoch": 1.1488798517698582, + "grad_norm": 3.2132228713360225, + "learning_rate": 8.085041140187102e-06, + "loss": 1.2314, + "step": 5968 + }, + { + "epoch": 1.1488798517698582, + "lm_loss": 0.7354, + "step": 5968, + "vm_loss": 0.1598 + }, + { + "epoch": 1.1488798517698582, + "lm_loss": 1.1644, + "step": 5968, + "vm_loss": 0.1878 + }, + { + "epoch": 1.1488798517698582, + "lm_loss": 1.0089, + "step": 5968, + "vm_loss": 0.1842 + }, + { + "epoch": 1.1488798517698582, + "lm_loss": 1.3632, + "step": 5968, + "vm_loss": 0.1761 + }, + { + "epoch": 1.1488798517698582, + "lm_loss": 1.1275, + "step": 5968, + "vm_loss": 0.1387 + }, + { + "epoch": 1.1488798517698582, + "lm_loss": 0.5409, + "step": 5968, + "vm_loss": 0.1532 + }, + { + "epoch": 1.1488798517698582, + "lm_loss": 0.6812, + "step": 5968, + "vm_loss": 0.2174 + }, + { + "epoch": 1.1488798517698582, + "lm_loss": 1.1398, + "step": 5968, + "vm_loss": 0.1991 + }, + { + "epoch": 1.1490723584474336, + "grad_norm": 3.1845594396762547, + "learning_rate": 8.081981038292853e-06, + "loss": 1.2379, + "step": 5969 + }, + { + "epoch": 1.149264865125009, + "grad_norm": 3.2295798992310862, + "learning_rate": 8.07892112285459e-06, + "loss": 1.2347, + "step": 5970 + }, + { + "epoch": 1.1494573718025844, + "grad_norm": 3.3013468623571653, + "learning_rate": 8.07586139416977e-06, + "loss": 1.2511, + "step": 5971 + }, + { + "epoch": 1.1496498784801599, + "grad_norm": 3.3203794304672014, + "learning_rate": 8.07280185253584e-06, + "loss": 1.1935, + "step": 5972 + }, + { + "epoch": 1.149842385157735, + "grad_norm": 3.338494989573673, + "learning_rate": 8.069742498250224e-06, + "loss": 1.2398, + "step": 5973 + }, + { + "epoch": 1.1500348918353105, + "grad_norm": 3.2671916194226425, + "learning_rate": 8.066683331610333e-06, + "loss": 1.2229, + "step": 5974 + }, + { + "epoch": 1.150227398512886, + "grad_norm": 3.2815867381174817, + "learning_rate": 8.06362435291356e-06, + "loss": 1.2301, + "step": 5975 + }, + { + "epoch": 1.1504199051904613, + "grad_norm": 3.2420687979436207, + "learning_rate": 8.060565562457268e-06, + "loss": 1.2742, + "step": 5976 + }, + { + "epoch": 1.1504199051904613, + "lm_loss": 1.4417, + "step": 5976, + "vm_loss": 0.2045 + }, + { + "epoch": 1.1504199051904613, + "lm_loss": 1.1356, + "step": 5976, + "vm_loss": 0.1525 + }, + { + "epoch": 1.1504199051904613, + "lm_loss": 1.517, + "step": 5976, + "vm_loss": 0.1922 + }, + { + "epoch": 1.1504199051904613, + "lm_loss": 1.6541, + "step": 5976, + "vm_loss": 0.1453 + }, + { + "epoch": 1.1504199051904613, + "lm_loss": 1.1276, + "step": 5976, + "vm_loss": 0.1499 + }, + { + "epoch": 1.1504199051904613, + "lm_loss": 0.9718, + "step": 5976, + "vm_loss": 0.1643 + }, + { + "epoch": 1.1504199051904613, + "lm_loss": 0.9761, + "step": 5976, + "vm_loss": 0.1594 + }, + { + "epoch": 1.1504199051904613, + "lm_loss": 0.7615, + "step": 5976, + "vm_loss": 0.1712 + }, + { + "epoch": 1.1506124118680368, + "grad_norm": 3.217745115199923, + "learning_rate": 8.057506960538818e-06, + "loss": 1.2809, + "step": 5977 + }, + { + "epoch": 1.150804918545612, + "grad_norm": 3.135667888303514, + "learning_rate": 8.054448547455546e-06, + "loss": 1.2145, + "step": 5978 + }, + { + "epoch": 1.1509974252231874, + "grad_norm": 3.1731805177447994, + "learning_rate": 8.05139032350476e-06, + "loss": 1.2485, + "step": 5979 + }, + { + "epoch": 1.1511899319007628, + "grad_norm": 3.3240257737439616, + "learning_rate": 8.04833228898377e-06, + "loss": 1.2794, + "step": 5980 + }, + { + "epoch": 1.1513824385783382, + "grad_norm": 3.339371548614613, + "learning_rate": 8.04527444418985e-06, + "loss": 1.2404, + "step": 5981 + }, + { + "epoch": 1.1515749452559136, + "grad_norm": 3.3516356036188597, + "learning_rate": 8.042216789420263e-06, + "loss": 1.2525, + "step": 5982 + }, + { + "epoch": 1.1517674519334888, + "grad_norm": 3.4654169910620967, + "learning_rate": 8.039159324972246e-06, + "loss": 1.2676, + "step": 5983 + }, + { + "epoch": 1.1519599586110643, + "grad_norm": 3.320542548458265, + "learning_rate": 8.036102051143036e-06, + "loss": 1.3017, + "step": 5984 + }, + { + "epoch": 1.1519599586110643, + "lm_loss": 1.4839, + "step": 5984, + "vm_loss": 0.1588 + }, + { + "epoch": 1.1519599586110643, + "lm_loss": 1.3175, + "step": 5984, + "vm_loss": 0.1524 + }, + { + "epoch": 1.1519599586110643, + "lm_loss": 1.2177, + "step": 5984, + "vm_loss": 0.1302 + }, + { + "epoch": 1.1519599586110643, + "lm_loss": 0.8308, + "step": 5984, + "vm_loss": 0.1543 + }, + { + "epoch": 1.1519599586110643, + "lm_loss": 1.1378, + "step": 5984, + "vm_loss": 0.1873 + }, + { + "epoch": 1.1519599586110643, + "lm_loss": 0.9164, + "step": 5984, + "vm_loss": 0.1854 + }, + { + "epoch": 1.1519599586110643, + "lm_loss": 1.1588, + "step": 5984, + "vm_loss": 0.1586 + }, + { + "epoch": 1.1519599586110643, + "lm_loss": 1.0239, + "step": 5984, + "vm_loss": 0.1826 + }, + { + "epoch": 1.1521524652886397, + "grad_norm": 3.5122424690887044, + "learning_rate": 8.03304496822983e-06, + "loss": 1.3349, + "step": 5985 + }, + { + "epoch": 1.152344971966215, + "grad_norm": 3.186790881694689, + "learning_rate": 8.029988076529813e-06, + "loss": 1.2459, + "step": 5986 + }, + { + "epoch": 1.1525374786437905, + "grad_norm": 3.2618014604310317, + "learning_rate": 8.026931376340163e-06, + "loss": 1.289, + "step": 5987 + }, + { + "epoch": 1.1527299853213657, + "grad_norm": 3.2814227532493754, + "learning_rate": 8.023874867958027e-06, + "loss": 1.2508, + "step": 5988 + }, + { + "epoch": 1.1529224919989411, + "grad_norm": 3.2872809651638812, + "learning_rate": 8.020818551680536e-06, + "loss": 1.2185, + "step": 5989 + }, + { + "epoch": 1.1531149986765166, + "grad_norm": 3.111117769936296, + "learning_rate": 8.017762427804799e-06, + "loss": 1.2512, + "step": 5990 + }, + { + "epoch": 1.153307505354092, + "grad_norm": 3.232585397008882, + "learning_rate": 8.014706496627919e-06, + "loss": 1.2449, + "step": 5991 + }, + { + "epoch": 1.1535000120316674, + "grad_norm": 3.4061255355504496, + "learning_rate": 8.011650758446967e-06, + "loss": 1.2965, + "step": 5992 + }, + { + "epoch": 1.1535000120316674, + "lm_loss": 1.1944, + "step": 5992, + "vm_loss": 0.1608 + }, + { + "epoch": 1.1535000120316674, + "lm_loss": 1.3503, + "step": 5992, + "vm_loss": 0.1918 + }, + { + "epoch": 1.1535000120316674, + "lm_loss": 1.222, + "step": 5992, + "vm_loss": 0.1209 + }, + { + "epoch": 1.1535000120316674, + "lm_loss": 0.94, + "step": 5992, + "vm_loss": 0.1411 + }, + { + "epoch": 1.1535000120316674, + "lm_loss": 1.4303, + "step": 5992, + "vm_loss": 0.1264 + }, + { + "epoch": 1.1535000120316674, + "lm_loss": 0.6596, + "step": 5992, + "vm_loss": 0.1774 + }, + { + "epoch": 1.1535000120316674, + "lm_loss": 1.3028, + "step": 5992, + "vm_loss": 0.1294 + }, + { + "epoch": 1.1535000120316674, + "lm_loss": 1.38, + "step": 5992, + "vm_loss": 0.2221 + }, + { + "epoch": 1.1536925187092426, + "grad_norm": 3.4460640069779753, + "learning_rate": 8.008595213558993e-06, + "loss": 1.2603, + "step": 5993 + }, + { + "epoch": 1.153885025386818, + "grad_norm": 3.1835671318564542, + "learning_rate": 8.005539862261048e-06, + "loss": 1.1515, + "step": 5994 + }, + { + "epoch": 1.1540775320643935, + "grad_norm": 3.2449092447379915, + "learning_rate": 8.002484704850142e-06, + "loss": 1.2132, + "step": 5995 + }, + { + "epoch": 1.1542700387419689, + "grad_norm": 3.4288666135643764, + "learning_rate": 7.999429741623278e-06, + "loss": 1.3617, + "step": 5996 + }, + { + "epoch": 1.1544625454195443, + "grad_norm": 3.072304956450402, + "learning_rate": 7.996374972877441e-06, + "loss": 1.1793, + "step": 5997 + }, + { + "epoch": 1.1546550520971195, + "grad_norm": 3.161630593528054, + "learning_rate": 7.993320398909587e-06, + "loss": 1.307, + "step": 5998 + }, + { + "epoch": 1.154847558774695, + "grad_norm": 3.179616861803642, + "learning_rate": 7.990266020016667e-06, + "loss": 1.263, + "step": 5999 + }, + { + "epoch": 1.1550400654522703, + "grad_norm": 3.047692704123104, + "learning_rate": 7.987211836495597e-06, + "loss": 1.2191, + "step": 6000 + }, + { + "epoch": 1.1550400654522703, + "lm_loss": 1.2796, + "step": 6000, + "vm_loss": 0.2105 + }, + { + "epoch": 1.1550400654522703, + "lm_loss": 1.1836, + "step": 6000, + "vm_loss": 0.1549 + }, + { + "epoch": 1.1550400654522703, + "lm_loss": 1.3955, + "step": 6000, + "vm_loss": 0.1456 + }, + { + "epoch": 1.1550400654522703, + "lm_loss": 0.7848, + "step": 6000, + "vm_loss": 0.176 + }, + { + "epoch": 1.1550400654522703, + "lm_loss": 0.8213, + "step": 6000, + "vm_loss": 0.194 + }, + { + "epoch": 1.1550400654522703, + "lm_loss": 1.0619, + "step": 6000, + "vm_loss": 0.1533 + }, + { + "epoch": 1.1550400654522703, + "lm_loss": 1.3353, + "step": 6000, + "vm_loss": 0.2056 + }, + { + "epoch": 1.1550400654522703, + "lm_loss": 1.0139, + "step": 6000, + "vm_loss": 0.107 + }, + { + "epoch": 1.1552325721298458, + "grad_norm": 3.2563527807845976, + "learning_rate": 7.984157848643292e-06, + "loss": 1.2719, + "step": 6001 + }, + { + "epoch": 1.1554250788074212, + "grad_norm": 3.2389211499290917, + "learning_rate": 7.981104056756633e-06, + "loss": 1.2772, + "step": 6002 + }, + { + "epoch": 1.1556175854849966, + "grad_norm": 3.2633922537389117, + "learning_rate": 7.978050461132486e-06, + "loss": 1.2615, + "step": 6003 + }, + { + "epoch": 1.1558100921625718, + "grad_norm": 3.2468537424803685, + "learning_rate": 7.974997062067707e-06, + "loss": 1.2582, + "step": 6004 + }, + { + "epoch": 1.1560025988401472, + "grad_norm": 3.1887455191950163, + "learning_rate": 7.97194385985912e-06, + "loss": 1.2566, + "step": 6005 + }, + { + "epoch": 1.1561951055177226, + "grad_norm": 3.215568593250897, + "learning_rate": 7.968890854803536e-06, + "loss": 1.2216, + "step": 6006 + }, + { + "epoch": 1.156387612195298, + "grad_norm": 3.3918618929471656, + "learning_rate": 7.965838047197743e-06, + "loss": 1.2283, + "step": 6007 + }, + { + "epoch": 1.1565801188728735, + "grad_norm": 3.2230562893790213, + "learning_rate": 7.96278543733852e-06, + "loss": 1.2536, + "step": 6008 + }, + { + "epoch": 1.1565801188728735, + "lm_loss": 1.6314, + "step": 6008, + "vm_loss": 0.1956 + }, + { + "epoch": 1.1565801188728735, + "lm_loss": 0.8353, + "step": 6008, + "vm_loss": 0.131 + }, + { + "epoch": 1.1565801188728735, + "lm_loss": 0.8045, + "step": 6008, + "vm_loss": 0.1182 + }, + { + "epoch": 1.1565801188728735, + "lm_loss": 1.1367, + "step": 6008, + "vm_loss": 0.147 + }, + { + "epoch": 1.1565801188728735, + "lm_loss": 0.9787, + "step": 6008, + "vm_loss": 0.1704 + }, + { + "epoch": 1.1565801188728735, + "lm_loss": 1.1859, + "step": 6008, + "vm_loss": 0.1986 + }, + { + "epoch": 1.1565801188728735, + "lm_loss": 0.7178, + "step": 6008, + "vm_loss": 0.1503 + }, + { + "epoch": 1.1565801188728735, + "lm_loss": 1.0924, + "step": 6008, + "vm_loss": 0.1008 + }, + { + "epoch": 1.1567726255504487, + "grad_norm": 3.1759366710330186, + "learning_rate": 7.959733025522615e-06, + "loss": 1.1742, + "step": 6009 + }, + { + "epoch": 1.156965132228024, + "grad_norm": 3.194702220347344, + "learning_rate": 7.95668081204676e-06, + "loss": 1.1728, + "step": 6010 + }, + { + "epoch": 1.1571576389055995, + "grad_norm": 3.329988549158313, + "learning_rate": 7.953628797207675e-06, + "loss": 1.2232, + "step": 6011 + }, + { + "epoch": 1.157350145583175, + "grad_norm": 3.2319188046055993, + "learning_rate": 7.950576981302053e-06, + "loss": 1.199, + "step": 6012 + }, + { + "epoch": 1.1575426522607504, + "grad_norm": 3.1842137489807962, + "learning_rate": 7.947525364626562e-06, + "loss": 1.2606, + "step": 6013 + }, + { + "epoch": 1.1577351589383258, + "grad_norm": 3.245136394895245, + "learning_rate": 7.944473947477871e-06, + "loss": 1.2613, + "step": 6014 + }, + { + "epoch": 1.157927665615901, + "grad_norm": 3.1626588182355615, + "learning_rate": 7.941422730152607e-06, + "loss": 1.246, + "step": 6015 + }, + { + "epoch": 1.1581201722934764, + "grad_norm": 3.2668529867141807, + "learning_rate": 7.938371712947392e-06, + "loss": 1.2052, + "step": 6016 + }, + { + "epoch": 1.1581201722934764, + "lm_loss": 0.6604, + "step": 6016, + "vm_loss": 0.1233 + }, + { + "epoch": 1.1581201722934764, + "lm_loss": 0.7238, + "step": 6016, + "vm_loss": 0.1796 + }, + { + "epoch": 1.1581201722934764, + "lm_loss": 0.9639, + "step": 6016, + "vm_loss": 0.2214 + }, + { + "epoch": 1.1581201722934764, + "lm_loss": 0.5416, + "step": 6016, + "vm_loss": 0.1832 + }, + { + "epoch": 1.1581201722934764, + "lm_loss": 1.1536, + "step": 6016, + "vm_loss": 0.1188 + }, + { + "epoch": 1.1581201722934764, + "lm_loss": 1.4388, + "step": 6016, + "vm_loss": 0.1133 + }, + { + "epoch": 1.1581201722934764, + "lm_loss": 1.2018, + "step": 6016, + "vm_loss": 0.1152 + }, + { + "epoch": 1.1581201722934764, + "lm_loss": 0.9978, + "step": 6016, + "vm_loss": 0.1607 + }, + { + "epoch": 1.1583126789710518, + "grad_norm": 3.2749108740168356, + "learning_rate": 7.93532089615882e-06, + "loss": 1.2607, + "step": 6017 + }, + { + "epoch": 1.1585051856486273, + "grad_norm": 3.2824637807279062, + "learning_rate": 7.932270280083473e-06, + "loss": 1.2124, + "step": 6018 + }, + { + "epoch": 1.1586976923262027, + "grad_norm": 3.251809751009011, + "learning_rate": 7.92921986501791e-06, + "loss": 1.2866, + "step": 6019 + }, + { + "epoch": 1.1588901990037779, + "grad_norm": 3.194500187230321, + "learning_rate": 7.926169651258668e-06, + "loss": 1.2563, + "step": 6020 + }, + { + "epoch": 1.1590827056813533, + "grad_norm": 3.2655303838480285, + "learning_rate": 7.923119639102268e-06, + "loss": 1.2299, + "step": 6021 + }, + { + "epoch": 1.1592752123589287, + "grad_norm": 3.0509843282729854, + "learning_rate": 7.920069828845211e-06, + "loss": 1.1647, + "step": 6022 + }, + { + "epoch": 1.1594677190365041, + "grad_norm": 3.137476953712626, + "learning_rate": 7.917020220783978e-06, + "loss": 1.2431, + "step": 6023 + }, + { + "epoch": 1.1596602257140796, + "grad_norm": 3.1925441219277215, + "learning_rate": 7.913970815215024e-06, + "loss": 1.2213, + "step": 6024 + }, + { + "epoch": 1.1596602257140796, + "lm_loss": 0.9437, + "step": 6024, + "vm_loss": 0.1154 + }, + { + "epoch": 1.1596602257140796, + "lm_loss": 1.0937, + "step": 6024, + "vm_loss": 0.1202 + }, + { + "epoch": 1.1596602257140796, + "lm_loss": 1.3067, + "step": 6024, + "vm_loss": 0.0873 + }, + { + "epoch": 1.1596602257140796, + "lm_loss": 0.9509, + "step": 6024, + "vm_loss": 0.1001 + }, + { + "epoch": 1.1596602257140796, + "lm_loss": 0.9217, + "step": 6024, + "vm_loss": 0.1752 + }, + { + "epoch": 1.1596602257140796, + "lm_loss": 1.0277, + "step": 6024, + "vm_loss": 0.2043 + }, + { + "epoch": 1.1596602257140796, + "lm_loss": 0.9979, + "step": 6024, + "vm_loss": 0.2206 + }, + { + "epoch": 1.1596602257140796, + "lm_loss": 0.8125, + "step": 6024, + "vm_loss": 0.194 + }, + { + "epoch": 1.1598527323916548, + "grad_norm": 3.177949817384393, + "learning_rate": 7.910921612434799e-06, + "loss": 1.268, + "step": 6025 + }, + { + "epoch": 1.1600452390692302, + "grad_norm": 3.16292343862426, + "learning_rate": 7.90787261273972e-06, + "loss": 1.2446, + "step": 6026 + }, + { + "epoch": 1.1602377457468056, + "grad_norm": 3.226058390566725, + "learning_rate": 7.904823816426185e-06, + "loss": 1.166, + "step": 6027 + }, + { + "epoch": 1.160430252424381, + "grad_norm": 3.653607627770296, + "learning_rate": 7.901775223790585e-06, + "loss": 1.2908, + "step": 6028 + }, + { + "epoch": 1.1606227591019564, + "grad_norm": 3.395645850098499, + "learning_rate": 7.898726835129277e-06, + "loss": 1.2342, + "step": 6029 + }, + { + "epoch": 1.1608152657795316, + "grad_norm": 3.402042348628373, + "learning_rate": 7.895678650738602e-06, + "loss": 1.221, + "step": 6030 + }, + { + "epoch": 1.161007772457107, + "grad_norm": 3.3067696899484282, + "learning_rate": 7.892630670914882e-06, + "loss": 1.2748, + "step": 6031 + }, + { + "epoch": 1.1612002791346825, + "grad_norm": 3.4454807452681817, + "learning_rate": 7.889582895954427e-06, + "loss": 1.326, + "step": 6032 + }, + { + "epoch": 1.1612002791346825, + "lm_loss": 0.9989, + "step": 6032, + "vm_loss": 0.1997 + }, + { + "epoch": 1.1612002791346825, + "lm_loss": 0.7525, + "step": 6032, + "vm_loss": 0.1093 + }, + { + "epoch": 1.1612002791346825, + "lm_loss": 0.5834, + "step": 6032, + "vm_loss": 0.1523 + }, + { + "epoch": 1.1612002791346825, + "lm_loss": 0.8088, + "step": 6032, + "vm_loss": 0.2102 + }, + { + "epoch": 1.1612002791346825, + "lm_loss": 1.1025, + "step": 6032, + "vm_loss": 0.1543 + }, + { + "epoch": 1.1612002791346825, + "lm_loss": 1.1073, + "step": 6032, + "vm_loss": 0.1193 + }, + { + "epoch": 1.1612002791346825, + "lm_loss": 0.6936, + "step": 6032, + "vm_loss": 0.1841 + }, + { + "epoch": 1.1612002791346825, + "lm_loss": 1.0145, + "step": 6032, + "vm_loss": 0.1653 + }, + { + "epoch": 1.161392785812258, + "grad_norm": 3.2488163594881927, + "learning_rate": 7.886535326153513e-06, + "loss": 1.2685, + "step": 6033 + }, + { + "epoch": 1.1615852924898333, + "grad_norm": 3.250922761189258, + "learning_rate": 7.883487961808401e-06, + "loss": 1.2791, + "step": 6034 + }, + { + "epoch": 1.1617777991674085, + "grad_norm": 3.1745511658623964, + "learning_rate": 7.880440803215341e-06, + "loss": 1.2866, + "step": 6035 + }, + { + "epoch": 1.161970305844984, + "grad_norm": 3.231440034221781, + "learning_rate": 7.87739385067055e-06, + "loss": 1.3415, + "step": 6036 + }, + { + "epoch": 1.1621628125225594, + "grad_norm": 3.1284013468259535, + "learning_rate": 7.874347104470234e-06, + "loss": 1.2376, + "step": 6037 + }, + { + "epoch": 1.1623553192001348, + "grad_norm": 3.2382772646271953, + "learning_rate": 7.871300564910576e-06, + "loss": 1.1916, + "step": 6038 + }, + { + "epoch": 1.1625478258777102, + "grad_norm": 3.2915301591193913, + "learning_rate": 7.868254232287733e-06, + "loss": 1.229, + "step": 6039 + }, + { + "epoch": 1.1627403325552854, + "grad_norm": 3.412554706239831, + "learning_rate": 7.865208106897856e-06, + "loss": 1.2492, + "step": 6040 + }, + { + "epoch": 1.1627403325552854, + "lm_loss": 1.5306, + "step": 6040, + "vm_loss": 0.1775 + }, + { + "epoch": 1.1627403325552854, + "lm_loss": 1.3196, + "step": 6040, + "vm_loss": 0.1465 + }, + { + "epoch": 1.1627403325552854, + "lm_loss": 0.9985, + "step": 6040, + "vm_loss": 0.1396 + }, + { + "epoch": 1.1627403325552854, + "lm_loss": 0.6738, + "step": 6040, + "vm_loss": 0.1558 + }, + { + "epoch": 1.1627403325552854, + "lm_loss": 1.2792, + "step": 6040, + "vm_loss": 0.1879 + }, + { + "epoch": 1.1627403325552854, + "lm_loss": 1.245, + "step": 6040, + "vm_loss": 0.1551 + }, + { + "epoch": 1.1627403325552854, + "lm_loss": 1.3332, + "step": 6040, + "vm_loss": 0.2609 + }, + { + "epoch": 1.1627403325552854, + "lm_loss": 1.0222, + "step": 6040, + "vm_loss": 0.1583 + }, + { + "epoch": 1.1629328392328608, + "grad_norm": 3.52821450340791, + "learning_rate": 7.862162189037058e-06, + "loss": 1.3276, + "step": 6041 + }, + { + "epoch": 1.1631253459104363, + "grad_norm": 3.3089665786852183, + "learning_rate": 7.85911647900145e-06, + "loss": 1.1754, + "step": 6042 + }, + { + "epoch": 1.1633178525880117, + "grad_norm": 3.3398472211853987, + "learning_rate": 7.85607097708711e-06, + "loss": 1.2402, + "step": 6043 + }, + { + "epoch": 1.163510359265587, + "grad_norm": 3.306070251280547, + "learning_rate": 7.853025683590096e-06, + "loss": 1.2843, + "step": 6044 + }, + { + "epoch": 1.1637028659431623, + "grad_norm": 3.2407089086776066, + "learning_rate": 7.849980598806459e-06, + "loss": 1.2803, + "step": 6045 + }, + { + "epoch": 1.1638953726207377, + "grad_norm": 3.219550718434643, + "learning_rate": 7.846935723032213e-06, + "loss": 1.2197, + "step": 6046 + }, + { + "epoch": 1.1640878792983131, + "grad_norm": 3.1097994335672725, + "learning_rate": 7.843891056563361e-06, + "loss": 1.1946, + "step": 6047 + }, + { + "epoch": 1.1642803859758886, + "grad_norm": 3.325837955519314, + "learning_rate": 7.840846599695878e-06, + "loss": 1.2862, + "step": 6048 + }, + { + "epoch": 1.1642803859758886, + "lm_loss": 1.158, + "step": 6048, + "vm_loss": 0.1303 + }, + { + "epoch": 1.1642803859758886, + "lm_loss": 1.3378, + "step": 6048, + "vm_loss": 0.1181 + }, + { + "epoch": 1.1642803859758886, + "lm_loss": 0.6989, + "step": 6048, + "vm_loss": 0.1363 + }, + { + "epoch": 1.1642803859758886, + "lm_loss": 0.8295, + "step": 6048, + "vm_loss": 0.1715 + }, + { + "epoch": 1.1642803859758886, + "lm_loss": 0.6564, + "step": 6048, + "vm_loss": 0.1582 + }, + { + "epoch": 1.1642803859758886, + "lm_loss": 1.0447, + "step": 6048, + "vm_loss": 0.1716 + }, + { + "epoch": 1.1642803859758886, + "lm_loss": 1.0332, + "step": 6048, + "vm_loss": 0.1942 + }, + { + "epoch": 1.1642803859758886, + "lm_loss": 1.3833, + "step": 6048, + "vm_loss": 0.1708 + }, + { + "epoch": 1.164472892653464, + "grad_norm": 3.2645210940418, + "learning_rate": 7.837802352725734e-06, + "loss": 1.1994, + "step": 6049 + }, + { + "epoch": 1.1646653993310392, + "grad_norm": 3.3228753397949546, + "learning_rate": 7.834758315948865e-06, + "loss": 1.247, + "step": 6050 + }, + { + "epoch": 1.1648579060086146, + "grad_norm": 3.0723297345773775, + "learning_rate": 7.831714489661181e-06, + "loss": 1.1127, + "step": 6051 + }, + { + "epoch": 1.16505041268619, + "grad_norm": 3.1659719455570547, + "learning_rate": 7.828670874158598e-06, + "loss": 1.1899, + "step": 6052 + }, + { + "epoch": 1.1652429193637655, + "grad_norm": 3.33908858568199, + "learning_rate": 7.825627469736984e-06, + "loss": 1.206, + "step": 6053 + }, + { + "epoch": 1.1654354260413409, + "grad_norm": 3.4273867064880825, + "learning_rate": 7.822584276692192e-06, + "loss": 1.2833, + "step": 6054 + }, + { + "epoch": 1.165627932718916, + "grad_norm": 3.251674497138722, + "learning_rate": 7.819541295320072e-06, + "loss": 1.2254, + "step": 6055 + }, + { + "epoch": 1.1658204393964915, + "grad_norm": 3.2769478411485764, + "learning_rate": 7.816498525916435e-06, + "loss": 1.246, + "step": 6056 + }, + { + "epoch": 1.1658204393964915, + "lm_loss": 0.8814, + "step": 6056, + "vm_loss": 0.1649 + }, + { + "epoch": 1.1658204393964915, + "lm_loss": 1.1356, + "step": 6056, + "vm_loss": 0.135 + }, + { + "epoch": 1.1658204393964915, + "lm_loss": 0.9583, + "step": 6056, + "vm_loss": 0.1982 + }, + { + "epoch": 1.1658204393964915, + "lm_loss": 1.0268, + "step": 6056, + "vm_loss": 0.1506 + }, + { + "epoch": 1.1658204393964915, + "lm_loss": 0.7276, + "step": 6056, + "vm_loss": 0.2051 + }, + { + "epoch": 1.1658204393964915, + "lm_loss": 1.0062, + "step": 6056, + "vm_loss": 0.2236 + }, + { + "epoch": 1.1658204393964915, + "lm_loss": 0.9913, + "step": 6056, + "vm_loss": 0.1546 + }, + { + "epoch": 1.1658204393964915, + "lm_loss": 0.646, + "step": 6056, + "vm_loss": 0.0789 + }, + { + "epoch": 1.166012946074067, + "grad_norm": 3.1425701585453956, + "learning_rate": 7.813455968777072e-06, + "loss": 1.1428, + "step": 6057 + }, + { + "epoch": 1.1662054527516423, + "grad_norm": 3.255369180601749, + "learning_rate": 7.810413624197764e-06, + "loss": 1.175, + "step": 6058 + }, + { + "epoch": 1.1663979594292178, + "grad_norm": 3.164628359517738, + "learning_rate": 7.807371492474266e-06, + "loss": 1.2214, + "step": 6059 + }, + { + "epoch": 1.166590466106793, + "grad_norm": 3.293675121730721, + "learning_rate": 7.80432957390231e-06, + "loss": 1.2633, + "step": 6060 + }, + { + "epoch": 1.1667829727843684, + "grad_norm": 3.2671911191419785, + "learning_rate": 7.80128786877761e-06, + "loss": 1.2637, + "step": 6061 + }, + { + "epoch": 1.1669754794619438, + "grad_norm": 3.2281453304153716, + "learning_rate": 7.798246377395858e-06, + "loss": 1.2298, + "step": 6062 + }, + { + "epoch": 1.1671679861395192, + "grad_norm": 3.1966186018123954, + "learning_rate": 7.79520510005273e-06, + "loss": 1.2091, + "step": 6063 + }, + { + "epoch": 1.1673604928170946, + "grad_norm": 3.3494192065334074, + "learning_rate": 7.792164037043874e-06, + "loss": 1.2768, + "step": 6064 + }, + { + "epoch": 1.1673604928170946, + "lm_loss": 1.2898, + "step": 6064, + "vm_loss": 0.1343 + }, + { + "epoch": 1.1673604928170946, + "lm_loss": 1.0545, + "step": 6064, + "vm_loss": 0.1776 + }, + { + "epoch": 1.1673604928170946, + "lm_loss": 1.0931, + "step": 6064, + "vm_loss": 0.2095 + }, + { + "epoch": 1.1673604928170946, + "lm_loss": 0.8892, + "step": 6064, + "vm_loss": 0.1356 + }, + { + "epoch": 1.1673604928170946, + "lm_loss": 0.9904, + "step": 6064, + "vm_loss": 0.2161 + }, + { + "epoch": 1.1673604928170946, + "lm_loss": 0.6541, + "step": 6064, + "vm_loss": 0.1743 + }, + { + "epoch": 1.1673604928170946, + "lm_loss": 1.0412, + "step": 6064, + "vm_loss": 0.1484 + }, + { + "epoch": 1.1673604928170946, + "lm_loss": 0.9578, + "step": 6064, + "vm_loss": 0.1261 + }, + { + "epoch": 1.16755299949467, + "grad_norm": 3.3255017161699714, + "learning_rate": 7.789123188664914e-06, + "loss": 1.3037, + "step": 6065 + }, + { + "epoch": 1.1677455061722453, + "grad_norm": 3.2095186649069887, + "learning_rate": 7.786082555211471e-06, + "loss": 1.2326, + "step": 6066 + }, + { + "epoch": 1.1679380128498207, + "grad_norm": 3.116170883234849, + "learning_rate": 7.783042136979127e-06, + "loss": 1.1927, + "step": 6067 + }, + { + "epoch": 1.168130519527396, + "grad_norm": 3.265854493646826, + "learning_rate": 7.780001934263447e-06, + "loss": 1.2211, + "step": 6068 + }, + { + "epoch": 1.1683230262049715, + "grad_norm": 3.235423868772814, + "learning_rate": 7.776961947359987e-06, + "loss": 1.2312, + "step": 6069 + }, + { + "epoch": 1.168515532882547, + "grad_norm": 3.142208526937211, + "learning_rate": 7.773922176564264e-06, + "loss": 1.2295, + "step": 6070 + }, + { + "epoch": 1.1687080395601221, + "grad_norm": 3.248111677667479, + "learning_rate": 7.770882622171783e-06, + "loss": 1.2616, + "step": 6071 + }, + { + "epoch": 1.1689005462376976, + "grad_norm": 3.2229762653269094, + "learning_rate": 7.767843284478031e-06, + "loss": 1.2146, + "step": 6072 + }, + { + "epoch": 1.1689005462376976, + "lm_loss": 1.1333, + "step": 6072, + "vm_loss": 0.179 + }, + { + "epoch": 1.1689005462376976, + "lm_loss": 1.0305, + "step": 6072, + "vm_loss": 0.1677 + }, + { + "epoch": 1.1689005462376976, + "lm_loss": 0.8646, + "step": 6072, + "vm_loss": 0.2033 + }, + { + "epoch": 1.1689005462376976, + "lm_loss": 1.1849, + "step": 6072, + "vm_loss": 0.1798 + }, + { + "epoch": 1.1689005462376976, + "lm_loss": 1.0135, + "step": 6072, + "vm_loss": 0.1666 + }, + { + "epoch": 1.1689005462376976, + "lm_loss": 0.8469, + "step": 6072, + "vm_loss": 0.1421 + }, + { + "epoch": 1.1689005462376976, + "lm_loss": 0.8382, + "step": 6072, + "vm_loss": 0.136 + }, + { + "epoch": 1.1689005462376976, + "lm_loss": 0.8109, + "step": 6072, + "vm_loss": 0.1835 + }, + { + "epoch": 1.169093052915273, + "grad_norm": 3.415748192258437, + "learning_rate": 7.764804163778472e-06, + "loss": 1.2553, + "step": 6073 + }, + { + "epoch": 1.1692855595928484, + "grad_norm": 3.243437880832997, + "learning_rate": 7.761765260368546e-06, + "loss": 1.2042, + "step": 6074 + }, + { + "epoch": 1.1694780662704238, + "grad_norm": 3.2296092720287004, + "learning_rate": 7.758726574543665e-06, + "loss": 1.1945, + "step": 6075 + }, + { + "epoch": 1.1696705729479993, + "grad_norm": 3.3953647599277943, + "learning_rate": 7.75568810659924e-06, + "loss": 1.3019, + "step": 6076 + }, + { + "epoch": 1.1698630796255745, + "grad_norm": 3.180131029246469, + "learning_rate": 7.752649856830645e-06, + "loss": 1.2322, + "step": 6077 + }, + { + "epoch": 1.1700555863031499, + "grad_norm": 3.1661847001069385, + "learning_rate": 7.749611825533231e-06, + "loss": 1.1808, + "step": 6078 + }, + { + "epoch": 1.1702480929807253, + "grad_norm": 3.35343023048648, + "learning_rate": 7.746574013002344e-06, + "loss": 1.2684, + "step": 6079 + }, + { + "epoch": 1.1704405996583007, + "grad_norm": 3.2200019012286556, + "learning_rate": 7.743536419533289e-06, + "loss": 1.2512, + "step": 6080 + }, + { + "epoch": 1.1704405996583007, + "lm_loss": 1.0463, + "step": 6080, + "vm_loss": 0.2373 + }, + { + "epoch": 1.1704405996583007, + "lm_loss": 0.9312, + "step": 6080, + "vm_loss": 0.1449 + }, + { + "epoch": 1.1704405996583007, + "lm_loss": 1.09, + "step": 6080, + "vm_loss": 0.2064 + }, + { + "epoch": 1.1704405996583007, + "lm_loss": 1.3708, + "step": 6080, + "vm_loss": 0.1342 + }, + { + "epoch": 1.1704405996583007, + "lm_loss": 1.1168, + "step": 6080, + "vm_loss": 0.1363 + }, + { + "epoch": 1.1704405996583007, + "lm_loss": 1.0453, + "step": 6080, + "vm_loss": 0.1789 + }, + { + "epoch": 1.1704405996583007, + "lm_loss": 1.0371, + "step": 6080, + "vm_loss": 0.0974 + }, + { + "epoch": 1.1704405996583007, + "lm_loss": 1.1337, + "step": 6080, + "vm_loss": 0.1314 + }, + { + "epoch": 1.1706331063358761, + "grad_norm": 3.2311523348726685, + "learning_rate": 7.740499045421366e-06, + "loss": 1.2094, + "step": 6081 + }, + { + "epoch": 1.1708256130134513, + "grad_norm": 3.189549567738214, + "learning_rate": 7.737461890961841e-06, + "loss": 1.1445, + "step": 6082 + }, + { + "epoch": 1.1710181196910268, + "grad_norm": 3.399362893807536, + "learning_rate": 7.734424956449965e-06, + "loss": 1.2159, + "step": 6083 + }, + { + "epoch": 1.1712106263686022, + "grad_norm": 3.4431427439200064, + "learning_rate": 7.73138824218097e-06, + "loss": 1.2297, + "step": 6084 + }, + { + "epoch": 1.1714031330461776, + "grad_norm": 3.286141287738985, + "learning_rate": 7.728351748450062e-06, + "loss": 1.2609, + "step": 6085 + }, + { + "epoch": 1.171595639723753, + "grad_norm": 3.3056881954486785, + "learning_rate": 7.725315475552428e-06, + "loss": 1.2308, + "step": 6086 + }, + { + "epoch": 1.1717881464013282, + "grad_norm": 3.2401397785309753, + "learning_rate": 7.722279423783231e-06, + "loss": 1.2215, + "step": 6087 + }, + { + "epoch": 1.1719806530789036, + "grad_norm": 3.3031077875404, + "learning_rate": 7.71924359343761e-06, + "loss": 1.2569, + "step": 6088 + }, + { + "epoch": 1.1719806530789036, + "lm_loss": 0.9556, + "step": 6088, + "vm_loss": 0.1724 + }, + { + "epoch": 1.1719806530789036, + "lm_loss": 1.4546, + "step": 6088, + "vm_loss": 0.1862 + }, + { + "epoch": 1.1719806530789036, + "lm_loss": 1.2894, + "step": 6088, + "vm_loss": 0.1456 + }, + { + "epoch": 1.1719806530789036, + "lm_loss": 1.2921, + "step": 6088, + "vm_loss": 0.1358 + }, + { + "epoch": 1.1719806530789036, + "lm_loss": 1.338, + "step": 6088, + "vm_loss": 0.1406 + }, + { + "epoch": 1.1719806530789036, + "lm_loss": 0.7538, + "step": 6088, + "vm_loss": 0.1292 + }, + { + "epoch": 1.1719806530789036, + "lm_loss": 1.363, + "step": 6088, + "vm_loss": 0.2098 + }, + { + "epoch": 1.1719806530789036, + "lm_loss": 1.1648, + "step": 6088, + "vm_loss": 0.1936 + }, + { + "epoch": 1.172173159756479, + "grad_norm": 3.177163103223128, + "learning_rate": 7.716207984810698e-06, + "loss": 1.2234, + "step": 6089 + }, + { + "epoch": 1.1723656664340545, + "grad_norm": 3.2012564040408074, + "learning_rate": 7.713172598197587e-06, + "loss": 1.2607, + "step": 6090 + }, + { + "epoch": 1.17255817311163, + "grad_norm": 3.171217374893182, + "learning_rate": 7.710137433893356e-06, + "loss": 1.2587, + "step": 6091 + }, + { + "epoch": 1.172750679789205, + "grad_norm": 3.2683926434896677, + "learning_rate": 7.707102492193058e-06, + "loss": 1.2019, + "step": 6092 + }, + { + "epoch": 1.1729431864667805, + "grad_norm": 3.2579314487402984, + "learning_rate": 7.704067773391736e-06, + "loss": 1.2201, + "step": 6093 + }, + { + "epoch": 1.173135693144356, + "grad_norm": 3.1222975049872645, + "learning_rate": 7.701033277784402e-06, + "loss": 1.2729, + "step": 6094 + }, + { + "epoch": 1.1733281998219314, + "grad_norm": 3.1749440792459844, + "learning_rate": 7.697999005666041e-06, + "loss": 1.2102, + "step": 6095 + }, + { + "epoch": 1.1735207064995068, + "grad_norm": 3.200408144892804, + "learning_rate": 7.69496495733163e-06, + "loss": 1.231, + "step": 6096 + }, + { + "epoch": 1.1735207064995068, + "lm_loss": 1.315, + "step": 6096, + "vm_loss": 0.1609 + }, + { + "epoch": 1.1735207064995068, + "lm_loss": 1.0508, + "step": 6096, + "vm_loss": 0.1237 + }, + { + "epoch": 1.1735207064995068, + "lm_loss": 1.3423, + "step": 6096, + "vm_loss": 0.1237 + }, + { + "epoch": 1.1735207064995068, + "lm_loss": 0.856, + "step": 6096, + "vm_loss": 0.1732 + }, + { + "epoch": 1.1735207064995068, + "lm_loss": 0.7529, + "step": 6096, + "vm_loss": 0.1627 + }, + { + "epoch": 1.1735207064995068, + "lm_loss": 1.1407, + "step": 6096, + "vm_loss": 0.1936 + }, + { + "epoch": 1.1735207064995068, + "lm_loss": 0.6215, + "step": 6096, + "vm_loss": 0.119 + }, + { + "epoch": 1.1735207064995068, + "lm_loss": 0.8128, + "step": 6096, + "vm_loss": 0.1604 + }, + { + "epoch": 1.173713213177082, + "grad_norm": 3.272116317791104, + "learning_rate": 7.691931133076116e-06, + "loss": 1.227, + "step": 6097 + }, + { + "epoch": 1.1739057198546574, + "grad_norm": 3.2679713191529296, + "learning_rate": 7.688897533194423e-06, + "loss": 1.187, + "step": 6098 + }, + { + "epoch": 1.1740982265322328, + "grad_norm": 3.114035420061861, + "learning_rate": 7.68586415798146e-06, + "loss": 1.1658, + "step": 6099 + }, + { + "epoch": 1.1742907332098083, + "grad_norm": 3.0285070977139186, + "learning_rate": 7.682831007732105e-06, + "loss": 1.1892, + "step": 6100 + }, + { + "epoch": 1.1744832398873837, + "grad_norm": 3.1505328288548453, + "learning_rate": 7.679798082741223e-06, + "loss": 1.2246, + "step": 6101 + }, + { + "epoch": 1.1746757465649589, + "grad_norm": 3.2339088438533117, + "learning_rate": 7.676765383303649e-06, + "loss": 1.2237, + "step": 6102 + }, + { + "epoch": 1.1748682532425343, + "grad_norm": 3.252943676747561, + "learning_rate": 7.673732909714204e-06, + "loss": 1.2636, + "step": 6103 + }, + { + "epoch": 1.1750607599201097, + "grad_norm": 3.269614692316229, + "learning_rate": 7.670700662267683e-06, + "loss": 1.253, + "step": 6104 + }, + { + "epoch": 1.1750607599201097, + "lm_loss": 0.7834, + "step": 6104, + "vm_loss": 0.1104 + }, + { + "epoch": 1.1750607599201097, + "lm_loss": 1.1494, + "step": 6104, + "vm_loss": 0.2235 + }, + { + "epoch": 1.1750607599201097, + "lm_loss": 0.6054, + "step": 6104, + "vm_loss": 0.1824 + }, + { + "epoch": 1.1750607599201097, + "lm_loss": 1.0462, + "step": 6104, + "vm_loss": 0.2106 + }, + { + "epoch": 1.1750607599201097, + "lm_loss": 1.3916, + "step": 6104, + "vm_loss": 0.1732 + }, + { + "epoch": 1.1750607599201097, + "lm_loss": 0.9219, + "step": 6104, + "vm_loss": 0.2069 + }, + { + "epoch": 1.1750607599201097, + "lm_loss": 1.1053, + "step": 6104, + "vm_loss": 0.151 + }, + { + "epoch": 1.1750607599201097, + "lm_loss": 1.1389, + "step": 6104, + "vm_loss": 0.1266 + }, + { + "epoch": 1.1752532665976851, + "grad_norm": 3.3297778299252276, + "learning_rate": 7.667668641258858e-06, + "loss": 1.2251, + "step": 6105 + }, + { + "epoch": 1.1754457732752606, + "grad_norm": 3.191894568523854, + "learning_rate": 7.664636846982476e-06, + "loss": 1.2583, + "step": 6106 + }, + { + "epoch": 1.1756382799528358, + "grad_norm": 3.4304407409016493, + "learning_rate": 7.661605279733275e-06, + "loss": 1.3137, + "step": 6107 + }, + { + "epoch": 1.1758307866304112, + "grad_norm": 3.3211786463611435, + "learning_rate": 7.658573939805958e-06, + "loss": 1.2498, + "step": 6108 + }, + { + "epoch": 1.1760232933079866, + "grad_norm": 3.2574586041382414, + "learning_rate": 7.655542827495205e-06, + "loss": 1.2671, + "step": 6109 + }, + { + "epoch": 1.176215799985562, + "grad_norm": 3.1480474455918177, + "learning_rate": 7.652511943095687e-06, + "loss": 1.1949, + "step": 6110 + }, + { + "epoch": 1.1764083066631374, + "grad_norm": 3.071867415544505, + "learning_rate": 7.649481286902043e-06, + "loss": 1.2004, + "step": 6111 + }, + { + "epoch": 1.1766008133407126, + "grad_norm": 3.2445520926328917, + "learning_rate": 7.646450859208884e-06, + "loss": 1.2997, + "step": 6112 + }, + { + "epoch": 1.1766008133407126, + "lm_loss": 1.3768, + "step": 6112, + "vm_loss": 0.1405 + }, + { + "epoch": 1.1766008133407126, + "lm_loss": 1.1704, + "step": 6112, + "vm_loss": 0.1498 + }, + { + "epoch": 1.1766008133407126, + "lm_loss": 0.9388, + "step": 6112, + "vm_loss": 0.1523 + }, + { + "epoch": 1.1766008133407126, + "lm_loss": 0.405, + "step": 6112, + "vm_loss": 0.2655 + }, + { + "epoch": 1.1766008133407126, + "lm_loss": 0.8671, + "step": 6112, + "vm_loss": 0.1343 + }, + { + "epoch": 1.1766008133407126, + "lm_loss": 1.9855, + "step": 6112, + "vm_loss": 0.2259 + }, + { + "epoch": 1.1766008133407126, + "lm_loss": 1.0908, + "step": 6112, + "vm_loss": 0.1212 + }, + { + "epoch": 1.1766008133407126, + "lm_loss": 0.996, + "step": 6112, + "vm_loss": 0.1734 + }, + { + "epoch": 1.176793320018288, + "grad_norm": 3.121422349257162, + "learning_rate": 7.643420660310817e-06, + "loss": 1.2327, + "step": 6113 + }, + { + "epoch": 1.1769858266958635, + "grad_norm": 3.1655764602384417, + "learning_rate": 7.640390690502412e-06, + "loss": 1.2626, + "step": 6114 + }, + { + "epoch": 1.177178333373439, + "grad_norm": 3.078722753836169, + "learning_rate": 7.637360950078218e-06, + "loss": 1.1797, + "step": 6115 + }, + { + "epoch": 1.1773708400510143, + "grad_norm": 3.1830917346580567, + "learning_rate": 7.634331439332765e-06, + "loss": 1.2305, + "step": 6116 + }, + { + "epoch": 1.1775633467285895, + "grad_norm": 3.150364731662425, + "learning_rate": 7.631302158560564e-06, + "loss": 1.2067, + "step": 6117 + }, + { + "epoch": 1.177755853406165, + "grad_norm": 3.121972753890976, + "learning_rate": 7.628273108056099e-06, + "loss": 1.1705, + "step": 6118 + }, + { + "epoch": 1.1779483600837404, + "grad_norm": 3.3939227409414996, + "learning_rate": 7.625244288113827e-06, + "loss": 1.2799, + "step": 6119 + }, + { + "epoch": 1.1781408667613158, + "grad_norm": 3.1952704291724796, + "learning_rate": 7.622215699028196e-06, + "loss": 1.1534, + "step": 6120 + }, + { + "epoch": 1.1781408667613158, + "lm_loss": 1.2247, + "step": 6120, + "vm_loss": 0.1581 + }, + { + "epoch": 1.1781408667613158, + "lm_loss": 0.9736, + "step": 6120, + "vm_loss": 0.1493 + }, + { + "epoch": 1.1781408667613158, + "lm_loss": 1.1532, + "step": 6120, + "vm_loss": 0.1527 + }, + { + "epoch": 1.1781408667613158, + "lm_loss": 1.4124, + "step": 6120, + "vm_loss": 0.1586 + }, + { + "epoch": 1.1781408667613158, + "lm_loss": 1.1035, + "step": 6120, + "vm_loss": 0.1323 + }, + { + "epoch": 1.1781408667613158, + "lm_loss": 1.3728, + "step": 6120, + "vm_loss": 0.142 + }, + { + "epoch": 1.1781408667613158, + "lm_loss": 0.5252, + "step": 6120, + "vm_loss": 0.1612 + }, + { + "epoch": 1.1781408667613158, + "lm_loss": 0.8146, + "step": 6120, + "vm_loss": 0.1943 + }, + { + "epoch": 1.1783333734388912, + "grad_norm": 3.191346384253806, + "learning_rate": 7.619187341093617e-06, + "loss": 1.1461, + "step": 6121 + }, + { + "epoch": 1.1785258801164664, + "grad_norm": 3.2489607203935673, + "learning_rate": 7.616159214604491e-06, + "loss": 1.2574, + "step": 6122 + }, + { + "epoch": 1.1787183867940418, + "grad_norm": 3.359279493380381, + "learning_rate": 7.613131319855185e-06, + "loss": 1.2626, + "step": 6123 + }, + { + "epoch": 1.1789108934716173, + "grad_norm": 3.2811659969587996, + "learning_rate": 7.610103657140052e-06, + "loss": 1.2282, + "step": 6124 + }, + { + "epoch": 1.1791034001491927, + "grad_norm": 3.293820787791637, + "learning_rate": 7.60707622675342e-06, + "loss": 1.2547, + "step": 6125 + }, + { + "epoch": 1.179295906826768, + "grad_norm": 3.1523615927974533, + "learning_rate": 7.604049028989591e-06, + "loss": 1.1417, + "step": 6126 + }, + { + "epoch": 1.1794884135043435, + "grad_norm": 3.2177422652202012, + "learning_rate": 7.601022064142853e-06, + "loss": 1.2666, + "step": 6127 + }, + { + "epoch": 1.1796809201819187, + "grad_norm": 3.141040953184416, + "learning_rate": 7.597995332507462e-06, + "loss": 1.2107, + "step": 6128 + }, + { + "epoch": 1.1796809201819187, + "lm_loss": 1.1697, + "step": 6128, + "vm_loss": 0.1991 + }, + { + "epoch": 1.1796809201819187, + "lm_loss": 0.888, + "step": 6128, + "vm_loss": 0.1724 + }, + { + "epoch": 1.1796809201819187, + "lm_loss": 1.2819, + "step": 6128, + "vm_loss": 0.1224 + }, + { + "epoch": 1.1796809201819187, + "lm_loss": 1.3237, + "step": 6128, + "vm_loss": 0.1981 + }, + { + "epoch": 1.1796809201819187, + "lm_loss": 1.1864, + "step": 6128, + "vm_loss": 0.1447 + }, + { + "epoch": 1.1796809201819187, + "lm_loss": 0.794, + "step": 6128, + "vm_loss": 0.157 + }, + { + "epoch": 1.1796809201819187, + "lm_loss": 1.2782, + "step": 6128, + "vm_loss": 0.1375 + }, + { + "epoch": 1.1796809201819187, + "lm_loss": 1.1459, + "step": 6128, + "vm_loss": 0.1694 + }, + { + "epoch": 1.1798734268594941, + "grad_norm": 3.351149528770669, + "learning_rate": 7.594968834377652e-06, + "loss": 1.3111, + "step": 6129 + }, + { + "epoch": 1.1800659335370696, + "grad_norm": 3.1601437871982014, + "learning_rate": 7.591942570047643e-06, + "loss": 1.2143, + "step": 6130 + }, + { + "epoch": 1.180258440214645, + "grad_norm": 3.4056196629676387, + "learning_rate": 7.588916539811626e-06, + "loss": 1.2268, + "step": 6131 + }, + { + "epoch": 1.1804509468922204, + "grad_norm": 3.323165121755806, + "learning_rate": 7.585890743963767e-06, + "loss": 1.217, + "step": 6132 + }, + { + "epoch": 1.1806434535697956, + "grad_norm": 3.2725761268098443, + "learning_rate": 7.582865182798209e-06, + "loss": 1.182, + "step": 6133 + }, + { + "epoch": 1.180835960247371, + "grad_norm": 3.3592922734893054, + "learning_rate": 7.579839856609085e-06, + "loss": 1.2307, + "step": 6134 + }, + { + "epoch": 1.1810284669249465, + "grad_norm": 3.325530009096257, + "learning_rate": 7.576814765690488e-06, + "loss": 1.2526, + "step": 6135 + }, + { + "epoch": 1.1812209736025219, + "grad_norm": 3.3165829033378604, + "learning_rate": 7.573789910336494e-06, + "loss": 1.2551, + "step": 6136 + }, + { + "epoch": 1.1812209736025219, + "lm_loss": 1.3722, + "step": 6136, + "vm_loss": 0.1144 + }, + { + "epoch": 1.1812209736025219, + "lm_loss": 0.7562, + "step": 6136, + "vm_loss": 0.2157 + }, + { + "epoch": 1.1812209736025219, + "lm_loss": 0.7106, + "step": 6136, + "vm_loss": 0.1063 + }, + { + "epoch": 1.1812209736025219, + "lm_loss": 0.7439, + "step": 6136, + "vm_loss": 0.2323 + }, + { + "epoch": 1.1812209736025219, + "lm_loss": 1.4404, + "step": 6136, + "vm_loss": 0.1833 + }, + { + "epoch": 1.1812209736025219, + "lm_loss": 1.3321, + "step": 6136, + "vm_loss": 0.1481 + }, + { + "epoch": 1.1812209736025219, + "lm_loss": 0.9439, + "step": 6136, + "vm_loss": 0.1875 + }, + { + "epoch": 1.1812209736025219, + "lm_loss": 0.7343, + "step": 6136, + "vm_loss": 0.1158 + }, + { + "epoch": 1.1814134802800973, + "grad_norm": 3.408472309339248, + "learning_rate": 7.570765290841165e-06, + "loss": 1.3067, + "step": 6137 + }, + { + "epoch": 1.1816059869576727, + "grad_norm": 3.260547138499601, + "learning_rate": 7.567740907498529e-06, + "loss": 1.296, + "step": 6138 + }, + { + "epoch": 1.181798493635248, + "grad_norm": 3.002320892257385, + "learning_rate": 7.564716760602595e-06, + "loss": 1.1626, + "step": 6139 + }, + { + "epoch": 1.1819910003128233, + "grad_norm": 3.2548984469588707, + "learning_rate": 7.561692850447343e-06, + "loss": 1.1986, + "step": 6140 + }, + { + "epoch": 1.1821835069903988, + "grad_norm": 3.0987378878945857, + "learning_rate": 7.558669177326745e-06, + "loss": 1.1862, + "step": 6141 + }, + { + "epoch": 1.1823760136679742, + "grad_norm": 3.0606482658399576, + "learning_rate": 7.555645741534736e-06, + "loss": 1.1741, + "step": 6142 + }, + { + "epoch": 1.1825685203455496, + "grad_norm": 3.1957034570874403, + "learning_rate": 7.552622543365233e-06, + "loss": 1.2273, + "step": 6143 + }, + { + "epoch": 1.1827610270231248, + "grad_norm": 3.3300397456899695, + "learning_rate": 7.549599583112131e-06, + "loss": 1.2496, + "step": 6144 + }, + { + "epoch": 1.1827610270231248, + "lm_loss": 1.0396, + "step": 6144, + "vm_loss": 0.1738 + }, + { + "epoch": 1.1827610270231248, + "lm_loss": 0.9948, + "step": 6144, + "vm_loss": 0.1222 + }, + { + "epoch": 1.1827610270231248, + "lm_loss": 0.6277, + "step": 6144, + "vm_loss": 0.1871 + }, + { + "epoch": 1.1827610270231248, + "lm_loss": 1.3043, + "step": 6144, + "vm_loss": 0.1885 + }, + { + "epoch": 1.1827610270231248, + "lm_loss": 0.7678, + "step": 6144, + "vm_loss": 0.1918 + }, + { + "epoch": 1.1827610270231248, + "lm_loss": 0.7145, + "step": 6144, + "vm_loss": 0.1658 + }, + { + "epoch": 1.1827610270231248, + "lm_loss": 0.6558, + "step": 6144, + "vm_loss": 0.1863 + }, + { + "epoch": 1.1827610270231248, + "lm_loss": 0.9492, + "step": 6144, + "vm_loss": 0.1418 + }, + { + "epoch": 1.1829535337007002, + "grad_norm": 3.350568048814192, + "learning_rate": 7.546576861069297e-06, + "loss": 1.191, + "step": 6145 + }, + { + "epoch": 1.1831460403782756, + "grad_norm": 3.300789334439735, + "learning_rate": 7.543554377530582e-06, + "loss": 1.2334, + "step": 6146 + }, + { + "epoch": 1.183338547055851, + "grad_norm": 3.3580018816327524, + "learning_rate": 7.5405321327898075e-06, + "loss": 1.2344, + "step": 6147 + }, + { + "epoch": 1.1835310537334265, + "grad_norm": 3.4427880431031794, + "learning_rate": 7.537510127140778e-06, + "loss": 1.2696, + "step": 6148 + }, + { + "epoch": 1.1837235604110017, + "grad_norm": 3.290988138106618, + "learning_rate": 7.5344883608772675e-06, + "loss": 1.2615, + "step": 6149 + }, + { + "epoch": 1.183916067088577, + "grad_norm": 3.368063465406094, + "learning_rate": 7.531466834293029e-06, + "loss": 1.2394, + "step": 6150 + }, + { + "epoch": 1.1841085737661525, + "grad_norm": 3.3293151424403273, + "learning_rate": 7.5284455476818e-06, + "loss": 1.2465, + "step": 6151 + }, + { + "epoch": 1.184301080443728, + "grad_norm": 3.1964368913305514, + "learning_rate": 7.525424501337284e-06, + "loss": 1.1736, + "step": 6152 + }, + { + "epoch": 1.184301080443728, + "lm_loss": 1.0186, + "step": 6152, + "vm_loss": 0.1413 + }, + { + "epoch": 1.184301080443728, + "lm_loss": 0.9387, + "step": 6152, + "vm_loss": 0.1977 + }, + { + "epoch": 1.184301080443728, + "lm_loss": 1.0602, + "step": 6152, + "vm_loss": 0.1899 + }, + { + "epoch": 1.184301080443728, + "lm_loss": 0.8528, + "step": 6152, + "vm_loss": 0.1831 + }, + { + "epoch": 1.184301080443728, + "lm_loss": 1.2751, + "step": 6152, + "vm_loss": 0.1561 + }, + { + "epoch": 1.184301080443728, + "lm_loss": 1.1112, + "step": 6152, + "vm_loss": 0.1302 + }, + { + "epoch": 1.184301080443728, + "lm_loss": 1.018, + "step": 6152, + "vm_loss": 0.1174 + }, + { + "epoch": 1.184301080443728, + "lm_loss": 0.8559, + "step": 6152, + "vm_loss": 0.1928 + }, + { + "epoch": 1.1844935871213034, + "grad_norm": 3.1730839466709466, + "learning_rate": 7.52240369555316e-06, + "loss": 1.1976, + "step": 6153 + }, + { + "epoch": 1.1846860937988786, + "grad_norm": 3.1648035431815815, + "learning_rate": 7.5193831306231e-06, + "loss": 1.1902, + "step": 6154 + }, + { + "epoch": 1.184878600476454, + "grad_norm": 3.045988424538661, + "learning_rate": 7.516362806840736e-06, + "loss": 1.1579, + "step": 6155 + }, + { + "epoch": 1.1850711071540294, + "grad_norm": 3.261703573080538, + "learning_rate": 7.513342724499682e-06, + "loss": 1.2734, + "step": 6156 + }, + { + "epoch": 1.1852636138316048, + "grad_norm": 3.163168691374363, + "learning_rate": 7.510322883893524e-06, + "loss": 1.1692, + "step": 6157 + }, + { + "epoch": 1.1854561205091803, + "grad_norm": 3.3998879838271527, + "learning_rate": 7.507303285315839e-06, + "loss": 1.186, + "step": 6158 + }, + { + "epoch": 1.1856486271867555, + "grad_norm": 3.326055087265486, + "learning_rate": 7.5042839290601655e-06, + "loss": 1.1941, + "step": 6159 + }, + { + "epoch": 1.1858411338643309, + "grad_norm": 3.2872318799370204, + "learning_rate": 7.501264815420019e-06, + "loss": 1.2264, + "step": 6160 + }, + { + "epoch": 1.1858411338643309, + "lm_loss": 1.0201, + "step": 6160, + "vm_loss": 0.1727 + }, + { + "epoch": 1.1858411338643309, + "lm_loss": 1.1475, + "step": 6160, + "vm_loss": 0.189 + }, + { + "epoch": 1.1858411338643309, + "lm_loss": 1.2055, + "step": 6160, + "vm_loss": 0.1346 + }, + { + "epoch": 1.1858411338643309, + "lm_loss": 1.5111, + "step": 6160, + "vm_loss": 0.1891 + }, + { + "epoch": 1.1858411338643309, + "lm_loss": 0.9498, + "step": 6160, + "vm_loss": 0.1871 + }, + { + "epoch": 1.1858411338643309, + "lm_loss": 0.9566, + "step": 6160, + "vm_loss": 0.1018 + }, + { + "epoch": 1.1858411338643309, + "lm_loss": 1.1052, + "step": 6160, + "vm_loss": 0.1983 + }, + { + "epoch": 1.1858411338643309, + "lm_loss": 1.1775, + "step": 6160, + "vm_loss": 0.1006 + }, + { + "epoch": 1.1860336405419063, + "grad_norm": 3.354725456308886, + "learning_rate": 7.498245944688904e-06, + "loss": 1.2821, + "step": 6161 + }, + { + "epoch": 1.1862261472194817, + "grad_norm": 3.3477471998239663, + "learning_rate": 7.495227317160292e-06, + "loss": 1.2759, + "step": 6162 + }, + { + "epoch": 1.1864186538970571, + "grad_norm": 3.2420694946380273, + "learning_rate": 7.492208933127625e-06, + "loss": 1.207, + "step": 6163 + }, + { + "epoch": 1.1866111605746323, + "grad_norm": 3.234054388727203, + "learning_rate": 7.489190792884338e-06, + "loss": 1.187, + "step": 6164 + }, + { + "epoch": 1.1868036672522078, + "grad_norm": 3.295862072446685, + "learning_rate": 7.486172896723826e-06, + "loss": 1.2846, + "step": 6165 + }, + { + "epoch": 1.1869961739297832, + "grad_norm": 3.167120671281481, + "learning_rate": 7.483155244939471e-06, + "loss": 1.1728, + "step": 6166 + }, + { + "epoch": 1.1871886806073586, + "grad_norm": 3.1183057937221235, + "learning_rate": 7.480137837824626e-06, + "loss": 1.1969, + "step": 6167 + }, + { + "epoch": 1.187381187284934, + "grad_norm": 3.321682198204172, + "learning_rate": 7.47712067567262e-06, + "loss": 1.2728, + "step": 6168 + }, + { + "epoch": 1.187381187284934, + "lm_loss": 1.1225, + "step": 6168, + "vm_loss": 0.1572 + }, + { + "epoch": 1.187381187284934, + "lm_loss": 1.4882, + "step": 6168, + "vm_loss": 0.1634 + }, + { + "epoch": 1.187381187284934, + "lm_loss": 0.8208, + "step": 6168, + "vm_loss": 0.1712 + }, + { + "epoch": 1.187381187284934, + "lm_loss": 1.0842, + "step": 6168, + "vm_loss": 0.1975 + }, + { + "epoch": 1.187381187284934, + "lm_loss": 0.8446, + "step": 6168, + "vm_loss": 0.143 + }, + { + "epoch": 1.187381187284934, + "lm_loss": 0.8169, + "step": 6168, + "vm_loss": 0.1861 + }, + { + "epoch": 1.187381187284934, + "lm_loss": 0.8156, + "step": 6168, + "vm_loss": 0.1489 + }, + { + "epoch": 1.187381187284934, + "lm_loss": 0.888, + "step": 6168, + "vm_loss": 0.2004 + }, + { + "epoch": 1.1875736939625092, + "grad_norm": 3.491126090797365, + "learning_rate": 7.474103758776764e-06, + "loss": 1.2885, + "step": 6169 + }, + { + "epoch": 1.1877662006400846, + "grad_norm": 3.323535183665952, + "learning_rate": 7.471087087430334e-06, + "loss": 1.1791, + "step": 6170 + }, + { + "epoch": 1.18795870731766, + "grad_norm": 3.236288698500348, + "learning_rate": 7.468070661926599e-06, + "loss": 1.1956, + "step": 6171 + }, + { + "epoch": 1.1881512139952355, + "grad_norm": 3.2000631102171253, + "learning_rate": 7.4650544825587865e-06, + "loss": 1.2091, + "step": 6172 + }, + { + "epoch": 1.188343720672811, + "grad_norm": 3.379853037894804, + "learning_rate": 7.46203854962011e-06, + "loss": 1.2954, + "step": 6173 + }, + { + "epoch": 1.188536227350386, + "grad_norm": 3.2048383490268413, + "learning_rate": 7.459022863403753e-06, + "loss": 1.2266, + "step": 6174 + }, + { + "epoch": 1.1887287340279615, + "grad_norm": 3.126968254247338, + "learning_rate": 7.456007424202886e-06, + "loss": 1.2586, + "step": 6175 + }, + { + "epoch": 1.188921240705537, + "grad_norm": 3.2755282263856302, + "learning_rate": 7.452992232310645e-06, + "loss": 1.216, + "step": 6176 + }, + { + "epoch": 1.188921240705537, + "lm_loss": 1.1578, + "step": 6176, + "vm_loss": 0.1729 + }, + { + "epoch": 1.188921240705537, + "lm_loss": 1.0037, + "step": 6176, + "vm_loss": 0.1513 + }, + { + "epoch": 1.188921240705537, + "lm_loss": 0.9248, + "step": 6176, + "vm_loss": 0.1602 + }, + { + "epoch": 1.188921240705537, + "lm_loss": 1.1977, + "step": 6176, + "vm_loss": 0.1853 + }, + { + "epoch": 1.188921240705537, + "lm_loss": 1.8386, + "step": 6176, + "vm_loss": 0.2537 + }, + { + "epoch": 1.188921240705537, + "lm_loss": 0.8929, + "step": 6176, + "vm_loss": 0.2167 + }, + { + "epoch": 1.188921240705537, + "lm_loss": 0.5072, + "step": 6176, + "vm_loss": 0.1651 + }, + { + "epoch": 1.188921240705537, + "lm_loss": 0.8753, + "step": 6176, + "vm_loss": 0.2088 + }, + { + "epoch": 1.1891137473831124, + "grad_norm": 3.16222103503267, + "learning_rate": 7.44997728802014e-06, + "loss": 1.228, + "step": 6177 + }, + { + "epoch": 1.1893062540606878, + "grad_norm": 3.2936959025738166, + "learning_rate": 7.446962591624471e-06, + "loss": 1.2465, + "step": 6178 + }, + { + "epoch": 1.189498760738263, + "grad_norm": 3.283102256055236, + "learning_rate": 7.443948143416702e-06, + "loss": 1.2371, + "step": 6179 + }, + { + "epoch": 1.1896912674158384, + "grad_norm": 3.3894301224994035, + "learning_rate": 7.440933943689874e-06, + "loss": 1.2532, + "step": 6180 + }, + { + "epoch": 1.1898837740934138, + "grad_norm": 3.2334481748368638, + "learning_rate": 7.437919992737003e-06, + "loss": 1.1663, + "step": 6181 + }, + { + "epoch": 1.1900762807709893, + "grad_norm": 3.2284119074789586, + "learning_rate": 7.434906290851091e-06, + "loss": 1.1445, + "step": 6182 + }, + { + "epoch": 1.1902687874485647, + "grad_norm": 3.277937001283184, + "learning_rate": 7.4318928383251035e-06, + "loss": 1.1553, + "step": 6183 + }, + { + "epoch": 1.1904612941261399, + "grad_norm": 3.3448301168805092, + "learning_rate": 7.428879635451986e-06, + "loss": 1.1992, + "step": 6184 + }, + { + "epoch": 1.1904612941261399, + "lm_loss": 1.2329, + "step": 6184, + "vm_loss": 0.1661 + }, + { + "epoch": 1.1904612941261399, + "lm_loss": 1.2779, + "step": 6184, + "vm_loss": 0.1429 + }, + { + "epoch": 1.1904612941261399, + "lm_loss": 0.923, + "step": 6184, + "vm_loss": 0.1278 + }, + { + "epoch": 1.1904612941261399, + "lm_loss": 0.7622, + "step": 6184, + "vm_loss": 0.1912 + }, + { + "epoch": 1.1904612941261399, + "lm_loss": 1.171, + "step": 6184, + "vm_loss": 0.1436 + }, + { + "epoch": 1.1904612941261399, + "lm_loss": 1.2231, + "step": 6184, + "vm_loss": 0.0985 + }, + { + "epoch": 1.1904612941261399, + "lm_loss": 1.0454, + "step": 6184, + "vm_loss": 0.1737 + }, + { + "epoch": 1.1904612941261399, + "lm_loss": 0.9305, + "step": 6184, + "vm_loss": 0.1656 + }, + { + "epoch": 1.1906538008037153, + "grad_norm": 3.2702402175724035, + "learning_rate": 7.425866682524665e-06, + "loss": 1.1968, + "step": 6185 + }, + { + "epoch": 1.1908463074812907, + "grad_norm": 3.272444529644762, + "learning_rate": 7.422853979836035e-06, + "loss": 1.1913, + "step": 6186 + }, + { + "epoch": 1.1910388141588661, + "grad_norm": 3.3329941529459552, + "learning_rate": 7.419841527678967e-06, + "loss": 1.2529, + "step": 6187 + }, + { + "epoch": 1.1912313208364416, + "grad_norm": 3.106954940272689, + "learning_rate": 7.416829326346313e-06, + "loss": 1.175, + "step": 6188 + }, + { + "epoch": 1.191423827514017, + "grad_norm": 3.162673809376489, + "learning_rate": 7.413817376130901e-06, + "loss": 1.1783, + "step": 6189 + }, + { + "epoch": 1.1916163341915922, + "grad_norm": 3.2304730104542565, + "learning_rate": 7.4108056773255245e-06, + "loss": 1.2486, + "step": 6190 + }, + { + "epoch": 1.1918088408691676, + "grad_norm": 3.3115602211061734, + "learning_rate": 7.407794230222958e-06, + "loss": 1.2557, + "step": 6191 + }, + { + "epoch": 1.192001347546743, + "grad_norm": 3.2728645813007744, + "learning_rate": 7.404783035115961e-06, + "loss": 1.1979, + "step": 6192 + }, + { + "epoch": 1.192001347546743, + "lm_loss": 1.2222, + "step": 6192, + "vm_loss": 0.1563 + }, + { + "epoch": 1.192001347546743, + "lm_loss": 0.6885, + "step": 6192, + "vm_loss": 0.1733 + }, + { + "epoch": 1.192001347546743, + "lm_loss": 1.0615, + "step": 6192, + "vm_loss": 0.1399 + }, + { + "epoch": 1.192001347546743, + "lm_loss": 0.6434, + "step": 6192, + "vm_loss": 0.1966 + }, + { + "epoch": 1.192001347546743, + "lm_loss": 0.7688, + "step": 6192, + "vm_loss": 0.1965 + }, + { + "epoch": 1.192001347546743, + "lm_loss": 1.1392, + "step": 6192, + "vm_loss": 0.1921 + }, + { + "epoch": 1.192001347546743, + "lm_loss": 0.5012, + "step": 6192, + "vm_loss": 0.163 + }, + { + "epoch": 1.192001347546743, + "lm_loss": 0.7217, + "step": 6192, + "vm_loss": 0.1183 + }, + { + "epoch": 1.1921938542243185, + "grad_norm": 3.125565278870283, + "learning_rate": 7.401772092297254e-06, + "loss": 1.1268, + "step": 6193 + }, + { + "epoch": 1.1923863609018939, + "grad_norm": 3.3155124403649756, + "learning_rate": 7.3987614020595385e-06, + "loss": 1.1991, + "step": 6194 + }, + { + "epoch": 1.192578867579469, + "grad_norm": 3.236628006851464, + "learning_rate": 7.395750964695499e-06, + "loss": 1.2418, + "step": 6195 + }, + { + "epoch": 1.1927713742570445, + "grad_norm": 3.549167723460142, + "learning_rate": 7.3927407804977814e-06, + "loss": 1.2797, + "step": 6196 + }, + { + "epoch": 1.19296388093462, + "grad_norm": 3.1920525656594854, + "learning_rate": 7.389730849759018e-06, + "loss": 1.2399, + "step": 6197 + }, + { + "epoch": 1.1931563876121953, + "grad_norm": 3.1733956925620603, + "learning_rate": 7.386721172771807e-06, + "loss": 1.2363, + "step": 6198 + }, + { + "epoch": 1.1933488942897708, + "grad_norm": 3.176200652429145, + "learning_rate": 7.383711749828735e-06, + "loss": 1.183, + "step": 6199 + }, + { + "epoch": 1.1935414009673462, + "grad_norm": 3.2896289527802676, + "learning_rate": 7.380702581222353e-06, + "loss": 1.2697, + "step": 6200 + }, + { + "epoch": 1.1935414009673462, + "lm_loss": 1.2398, + "step": 6200, + "vm_loss": 0.1035 + }, + { + "epoch": 1.1935414009673462, + "lm_loss": 1.7413, + "step": 6200, + "vm_loss": 0.1927 + }, + { + "epoch": 1.1935414009673462, + "lm_loss": 1.31, + "step": 6200, + "vm_loss": 0.2074 + }, + { + "epoch": 1.1935414009673462, + "lm_loss": 1.4241, + "step": 6200, + "vm_loss": 0.2191 + }, + { + "epoch": 1.1935414009673462, + "lm_loss": 0.7958, + "step": 6200, + "vm_loss": 0.1932 + }, + { + "epoch": 1.1935414009673462, + "lm_loss": 0.9645, + "step": 6200, + "vm_loss": 0.1682 + }, + { + "epoch": 1.1935414009673462, + "lm_loss": 0.5985, + "step": 6200, + "vm_loss": 0.1519 + }, + { + "epoch": 1.1935414009673462, + "lm_loss": 1.2084, + "step": 6200, + "vm_loss": 0.1749 + }, + { + "epoch": 1.1937339076449214, + "grad_norm": 3.2747695138186126, + "learning_rate": 7.377693667245187e-06, + "loss": 1.2394, + "step": 6201 + }, + { + "epoch": 1.1939264143224968, + "grad_norm": 3.443711273539638, + "learning_rate": 7.37468500818975e-06, + "loss": 1.2095, + "step": 6202 + }, + { + "epoch": 1.1941189210000722, + "grad_norm": 3.2764875825710256, + "learning_rate": 7.371676604348516e-06, + "loss": 1.2377, + "step": 6203 + }, + { + "epoch": 1.1943114276776476, + "grad_norm": 3.2969254826835095, + "learning_rate": 7.368668456013941e-06, + "loss": 1.2586, + "step": 6204 + }, + { + "epoch": 1.194503934355223, + "grad_norm": 3.318369219201361, + "learning_rate": 7.365660563478458e-06, + "loss": 1.2444, + "step": 6205 + }, + { + "epoch": 1.1946964410327983, + "grad_norm": 3.189390488154652, + "learning_rate": 7.362652927034474e-06, + "loss": 1.1092, + "step": 6206 + }, + { + "epoch": 1.1948889477103737, + "grad_norm": 3.290045806810415, + "learning_rate": 7.359645546974364e-06, + "loss": 1.2321, + "step": 6207 + }, + { + "epoch": 1.195081454387949, + "grad_norm": 3.0659480184230765, + "learning_rate": 7.3566384235904855e-06, + "loss": 1.147, + "step": 6208 + }, + { + "epoch": 1.195081454387949, + "lm_loss": 1.2341, + "step": 6208, + "vm_loss": 0.2125 + }, + { + "epoch": 1.195081454387949, + "lm_loss": 1.027, + "step": 6208, + "vm_loss": 0.1184 + }, + { + "epoch": 1.195081454387949, + "lm_loss": 0.6748, + "step": 6208, + "vm_loss": 0.2164 + }, + { + "epoch": 1.195081454387949, + "lm_loss": 1.2132, + "step": 6208, + "vm_loss": 0.167 + }, + { + "epoch": 1.195081454387949, + "lm_loss": 0.9857, + "step": 6208, + "vm_loss": 0.0964 + }, + { + "epoch": 1.195081454387949, + "lm_loss": 0.8908, + "step": 6208, + "vm_loss": 0.1479 + }, + { + "epoch": 1.195081454387949, + "lm_loss": 1.1228, + "step": 6208, + "vm_loss": 0.1822 + }, + { + "epoch": 1.195081454387949, + "lm_loss": 1.0035, + "step": 6208, + "vm_loss": 0.18 + }, + { + "epoch": 1.1952739610655245, + "grad_norm": 3.3454839972767085, + "learning_rate": 7.353631557175172e-06, + "loss": 1.235, + "step": 6209 + }, + { + "epoch": 1.1954664677431, + "grad_norm": 3.154619047150957, + "learning_rate": 7.3506249480207284e-06, + "loss": 1.1767, + "step": 6210 + }, + { + "epoch": 1.1956589744206751, + "grad_norm": 3.2720044374901507, + "learning_rate": 7.3476185964194345e-06, + "loss": 1.1863, + "step": 6211 + }, + { + "epoch": 1.1958514810982506, + "grad_norm": 3.2507692137042667, + "learning_rate": 7.344612502663548e-06, + "loss": 1.2062, + "step": 6212 + }, + { + "epoch": 1.196043987775826, + "grad_norm": 3.2641337513735436, + "learning_rate": 7.341606667045299e-06, + "loss": 1.2614, + "step": 6213 + }, + { + "epoch": 1.1962364944534014, + "grad_norm": 3.250203841808071, + "learning_rate": 7.338601089856894e-06, + "loss": 1.2484, + "step": 6214 + }, + { + "epoch": 1.1964290011309768, + "grad_norm": 3.2265894007170215, + "learning_rate": 7.3355957713905065e-06, + "loss": 1.1716, + "step": 6215 + }, + { + "epoch": 1.196621507808552, + "grad_norm": 3.169694952108079, + "learning_rate": 7.332590711938303e-06, + "loss": 1.1967, + "step": 6216 + }, + { + "epoch": 1.196621507808552, + "lm_loss": 0.7031, + "step": 6216, + "vm_loss": 0.144 + }, + { + "epoch": 1.196621507808552, + "lm_loss": 0.9983, + "step": 6216, + "vm_loss": 0.1483 + }, + { + "epoch": 1.196621507808552, + "lm_loss": 1.1673, + "step": 6216, + "vm_loss": 0.1289 + }, + { + "epoch": 1.196621507808552, + "lm_loss": 0.9241, + "step": 6216, + "vm_loss": 0.1334 + }, + { + "epoch": 1.196621507808552, + "lm_loss": 0.6495, + "step": 6216, + "vm_loss": 0.1495 + }, + { + "epoch": 1.196621507808552, + "lm_loss": 1.2762, + "step": 6216, + "vm_loss": 0.1271 + }, + { + "epoch": 1.196621507808552, + "lm_loss": 1.0543, + "step": 6216, + "vm_loss": 0.1738 + }, + { + "epoch": 1.196621507808552, + "lm_loss": 1.0742, + "step": 6216, + "vm_loss": 0.1705 + }, + { + "epoch": 1.1968140144861275, + "grad_norm": 3.2277696827229225, + "learning_rate": 7.329585911792407e-06, + "loss": 1.1916, + "step": 6217 + }, + { + "epoch": 1.1970065211637029, + "grad_norm": 3.163722348409198, + "learning_rate": 7.326581371244923e-06, + "loss": 1.154, + "step": 6218 + }, + { + "epoch": 1.1971990278412783, + "grad_norm": 3.248891868485399, + "learning_rate": 7.323577090587936e-06, + "loss": 1.1905, + "step": 6219 + }, + { + "epoch": 1.1973915345188537, + "grad_norm": 3.227041808296925, + "learning_rate": 7.320573070113497e-06, + "loss": 1.2002, + "step": 6220 + }, + { + "epoch": 1.197584041196429, + "grad_norm": 3.21744906848506, + "learning_rate": 7.31756931011363e-06, + "loss": 1.1798, + "step": 6221 + }, + { + "epoch": 1.1977765478740043, + "grad_norm": 3.212013137666816, + "learning_rate": 7.314565810880351e-06, + "loss": 1.1972, + "step": 6222 + }, + { + "epoch": 1.1979690545515798, + "grad_norm": 3.2475371789276353, + "learning_rate": 7.311562572705631e-06, + "loss": 1.1855, + "step": 6223 + }, + { + "epoch": 1.1981615612291552, + "grad_norm": 3.4267964686402532, + "learning_rate": 7.308559595881424e-06, + "loss": 1.2382, + "step": 6224 + }, + { + "epoch": 1.1981615612291552, + "lm_loss": 1.1657, + "step": 6224, + "vm_loss": 0.1426 + }, + { + "epoch": 1.1981615612291552, + "lm_loss": 0.9857, + "step": 6224, + "vm_loss": 0.217 + }, + { + "epoch": 1.1981615612291552, + "lm_loss": 1.2079, + "step": 6224, + "vm_loss": 0.1398 + }, + { + "epoch": 1.1981615612291552, + "lm_loss": 1.3074, + "step": 6224, + "vm_loss": 0.1396 + }, + { + "epoch": 1.1981615612291552, + "lm_loss": 0.9671, + "step": 6224, + "vm_loss": 0.1632 + }, + { + "epoch": 1.1981615612291552, + "lm_loss": 1.5091, + "step": 6224, + "vm_loss": 0.2195 + }, + { + "epoch": 1.1981615612291552, + "lm_loss": 0.7788, + "step": 6224, + "vm_loss": 0.1169 + }, + { + "epoch": 1.1981615612291552, + "lm_loss": 0.6039, + "step": 6224, + "vm_loss": 0.1313 + }, + { + "epoch": 1.1983540679067306, + "grad_norm": 3.299078269091446, + "learning_rate": 7.305556880699654e-06, + "loss": 1.1843, + "step": 6225 + }, + { + "epoch": 1.1985465745843058, + "grad_norm": 3.320527666963726, + "learning_rate": 7.302554427452231e-06, + "loss": 1.2126, + "step": 6226 + }, + { + "epoch": 1.1987390812618812, + "grad_norm": 3.1985635946986153, + "learning_rate": 7.299552236431025e-06, + "loss": 1.2148, + "step": 6227 + }, + { + "epoch": 1.1989315879394566, + "grad_norm": 3.275271839408709, + "learning_rate": 7.296550307927892e-06, + "loss": 1.1551, + "step": 6228 + }, + { + "epoch": 1.199124094617032, + "grad_norm": 3.1898739436906096, + "learning_rate": 7.293548642234658e-06, + "loss": 1.1853, + "step": 6229 + }, + { + "epoch": 1.1993166012946075, + "grad_norm": 3.3066667012176336, + "learning_rate": 7.290547239643117e-06, + "loss": 1.1874, + "step": 6230 + }, + { + "epoch": 1.1995091079721827, + "grad_norm": 3.343224828428256, + "learning_rate": 7.287546100445054e-06, + "loss": 1.2729, + "step": 6231 + }, + { + "epoch": 1.199701614649758, + "grad_norm": 3.285667226862546, + "learning_rate": 7.284545224932207e-06, + "loss": 1.2143, + "step": 6232 + }, + { + "epoch": 1.199701614649758, + "lm_loss": 1.0936, + "step": 6232, + "vm_loss": 0.1421 + }, + { + "epoch": 1.199701614649758, + "lm_loss": 0.8895, + "step": 6232, + "vm_loss": 0.1574 + }, + { + "epoch": 1.199701614649758, + "lm_loss": 1.177, + "step": 6232, + "vm_loss": 0.1431 + }, + { + "epoch": 1.199701614649758, + "lm_loss": 0.859, + "step": 6232, + "vm_loss": 0.1456 + }, + { + "epoch": 1.199701614649758, + "lm_loss": 1.0927, + "step": 6232, + "vm_loss": 0.1547 + }, + { + "epoch": 1.199701614649758, + "lm_loss": 1.0089, + "step": 6232, + "vm_loss": 0.1284 + }, + { + "epoch": 1.199701614649758, + "lm_loss": 0.954, + "step": 6232, + "vm_loss": 0.2099 + }, + { + "epoch": 1.199701614649758, + "lm_loss": 1.2182, + "step": 6232, + "vm_loss": 0.0866 + }, + { + "epoch": 1.1998941213273335, + "grad_norm": 3.197548288639276, + "learning_rate": 7.281544613396309e-06, + "loss": 1.1323, + "step": 6233 + }, + { + "epoch": 1.200086628004909, + "grad_norm": 3.3397984686715962, + "learning_rate": 7.278544266129054e-06, + "loss": 1.2801, + "step": 6234 + }, + { + "epoch": 1.2002791346824844, + "grad_norm": 3.31671961022355, + "learning_rate": 7.2755441834221095e-06, + "loss": 1.2424, + "step": 6235 + }, + { + "epoch": 1.2004716413600596, + "grad_norm": 3.2645689585151794, + "learning_rate": 7.272544365567131e-06, + "loss": 1.1644, + "step": 6236 + }, + { + "epoch": 1.200664148037635, + "grad_norm": 3.2489844026446018, + "learning_rate": 7.269544812855734e-06, + "loss": 1.2024, + "step": 6237 + }, + { + "epoch": 1.2008566547152104, + "grad_norm": 3.3981200333694983, + "learning_rate": 7.266545525579509e-06, + "loss": 1.2401, + "step": 6238 + }, + { + "epoch": 1.2010491613927858, + "grad_norm": 3.188410754158432, + "learning_rate": 7.263546504030036e-06, + "loss": 1.1297, + "step": 6239 + }, + { + "epoch": 1.2012416680703613, + "grad_norm": 3.25408926477826, + "learning_rate": 7.260547748498851e-06, + "loss": 1.2461, + "step": 6240 + }, + { + "epoch": 1.2012416680703613, + "lm_loss": 0.7198, + "step": 6240, + "vm_loss": 0.1629 + }, + { + "epoch": 1.2012416680703613, + "lm_loss": 1.2332, + "step": 6240, + "vm_loss": 0.1672 + }, + { + "epoch": 1.2012416680703613, + "lm_loss": 0.885, + "step": 6240, + "vm_loss": 0.1982 + }, + { + "epoch": 1.2012416680703613, + "lm_loss": 1.0609, + "step": 6240, + "vm_loss": 0.2137 + }, + { + "epoch": 1.2012416680703613, + "lm_loss": 0.6779, + "step": 6240, + "vm_loss": 0.198 + }, + { + "epoch": 1.2012416680703613, + "lm_loss": 1.2458, + "step": 6240, + "vm_loss": 0.2552 + }, + { + "epoch": 1.2012416680703613, + "lm_loss": 0.8837, + "step": 6240, + "vm_loss": 0.1853 + }, + { + "epoch": 1.2012416680703613, + "lm_loss": 0.9928, + "step": 6240, + "vm_loss": 0.2173 + }, + { + "epoch": 1.2014341747479365, + "grad_norm": 3.3782687461738488, + "learning_rate": 7.257549259277472e-06, + "loss": 1.2634, + "step": 6241 + }, + { + "epoch": 1.2016266814255119, + "grad_norm": 3.340928059291423, + "learning_rate": 7.254551036657389e-06, + "loss": 1.2107, + "step": 6242 + }, + { + "epoch": 1.2018191881030873, + "grad_norm": 3.2448429759403434, + "learning_rate": 7.251553080930073e-06, + "loss": 1.1888, + "step": 6243 + }, + { + "epoch": 1.2020116947806627, + "grad_norm": 3.214846528176014, + "learning_rate": 7.24855539238696e-06, + "loss": 1.1641, + "step": 6244 + }, + { + "epoch": 1.2022042014582381, + "grad_norm": 3.2422322180149292, + "learning_rate": 7.245557971319461e-06, + "loss": 1.2084, + "step": 6245 + }, + { + "epoch": 1.2023967081358133, + "grad_norm": 3.2217925250756676, + "learning_rate": 7.24256081801897e-06, + "loss": 1.1843, + "step": 6246 + }, + { + "epoch": 1.2025892148133888, + "grad_norm": 3.2929749335830274, + "learning_rate": 7.239563932776845e-06, + "loss": 1.1837, + "step": 6247 + }, + { + "epoch": 1.2027817214909642, + "grad_norm": 3.2701090684026175, + "learning_rate": 7.236567315884422e-06, + "loss": 1.2006, + "step": 6248 + }, + { + "epoch": 1.2027817214909642, + "lm_loss": 1.3905, + "step": 6248, + "vm_loss": 0.1451 + }, + { + "epoch": 1.2027817214909642, + "lm_loss": 1.171, + "step": 6248, + "vm_loss": 0.2274 + }, + { + "epoch": 1.2027817214909642, + "lm_loss": 1.0381, + "step": 6248, + "vm_loss": 0.1707 + }, + { + "epoch": 1.2027817214909642, + "lm_loss": 1.1331, + "step": 6248, + "vm_loss": 0.1736 + }, + { + "epoch": 1.2027817214909642, + "lm_loss": 1.5097, + "step": 6248, + "vm_loss": 0.1286 + }, + { + "epoch": 1.2027817214909642, + "lm_loss": 0.9433, + "step": 6248, + "vm_loss": 0.1534 + }, + { + "epoch": 1.2027817214909642, + "lm_loss": 1.1767, + "step": 6248, + "vm_loss": 0.1593 + }, + { + "epoch": 1.2027817214909642, + "lm_loss": 1.4988, + "step": 6248, + "vm_loss": 0.1655 + }, + { + "epoch": 1.2029742281685396, + "grad_norm": 3.310512096841856, + "learning_rate": 7.233570967633008e-06, + "loss": 1.1829, + "step": 6249 + }, + { + "epoch": 1.203166734846115, + "grad_norm": 3.2115793111578674, + "learning_rate": 7.23057488831389e-06, + "loss": 1.1287, + "step": 6250 + }, + { + "epoch": 1.2033592415236904, + "grad_norm": 3.4209483303028434, + "learning_rate": 7.227579078218328e-06, + "loss": 1.2503, + "step": 6251 + }, + { + "epoch": 1.2035517482012656, + "grad_norm": 3.3349988269434965, + "learning_rate": 7.224583537637544e-06, + "loss": 1.1451, + "step": 6252 + }, + { + "epoch": 1.203744254878841, + "grad_norm": 3.363070437375981, + "learning_rate": 7.221588266862749e-06, + "loss": 1.2628, + "step": 6253 + }, + { + "epoch": 1.2039367615564165, + "grad_norm": 3.1535084147021806, + "learning_rate": 7.218593266185125e-06, + "loss": 1.1363, + "step": 6254 + }, + { + "epoch": 1.204129268233992, + "grad_norm": 3.1279933342989863, + "learning_rate": 7.215598535895814e-06, + "loss": 1.1468, + "step": 6255 + }, + { + "epoch": 1.2043217749115673, + "grad_norm": 3.090503879220856, + "learning_rate": 7.212604076285952e-06, + "loss": 1.1315, + "step": 6256 + }, + { + "epoch": 1.2043217749115673, + "lm_loss": 1.673, + "step": 6256, + "vm_loss": 0.154 + }, + { + "epoch": 1.2043217749115673, + "lm_loss": 1.0383, + "step": 6256, + "vm_loss": 0.1746 + }, + { + "epoch": 1.2043217749115673, + "lm_loss": 1.2368, + "step": 6256, + "vm_loss": 0.1298 + }, + { + "epoch": 1.2043217749115673, + "lm_loss": 0.9823, + "step": 6256, + "vm_loss": 0.1768 + }, + { + "epoch": 1.2043217749115673, + "lm_loss": 1.2037, + "step": 6256, + "vm_loss": 0.1898 + }, + { + "epoch": 1.2043217749115673, + "lm_loss": 1.0105, + "step": 6256, + "vm_loss": 0.1575 + }, + { + "epoch": 1.2043217749115673, + "lm_loss": 1.2091, + "step": 6256, + "vm_loss": 0.1059 + }, + { + "epoch": 1.2043217749115673, + "lm_loss": 1.1998, + "step": 6256, + "vm_loss": 0.1886 + }, + { + "epoch": 1.2045142815891425, + "grad_norm": 3.1475802534019013, + "learning_rate": 7.2096098876466355e-06, + "loss": 1.2004, + "step": 6257 + }, + { + "epoch": 1.204706788266718, + "grad_norm": 3.190325446464954, + "learning_rate": 7.206615970268938e-06, + "loss": 1.135, + "step": 6258 + }, + { + "epoch": 1.2048992949442934, + "grad_norm": 3.313235756427296, + "learning_rate": 7.203622324443902e-06, + "loss": 1.2336, + "step": 6259 + }, + { + "epoch": 1.2050918016218688, + "grad_norm": 3.323245804434984, + "learning_rate": 7.200628950462556e-06, + "loss": 1.1946, + "step": 6260 + }, + { + "epoch": 1.2052843082994442, + "grad_norm": 3.2728561770340048, + "learning_rate": 7.1976358486158895e-06, + "loss": 1.2412, + "step": 6261 + }, + { + "epoch": 1.2054768149770196, + "grad_norm": 3.408384228137689, + "learning_rate": 7.194643019194867e-06, + "loss": 1.2359, + "step": 6262 + }, + { + "epoch": 1.2056693216545948, + "grad_norm": 3.244930210019856, + "learning_rate": 7.191650462490439e-06, + "loss": 1.1801, + "step": 6263 + }, + { + "epoch": 1.2058618283321703, + "grad_norm": 3.401049784818652, + "learning_rate": 7.188658178793516e-06, + "loss": 1.1734, + "step": 6264 + }, + { + "epoch": 1.2058618283321703, + "lm_loss": 1.3469, + "step": 6264, + "vm_loss": 0.1399 + }, + { + "epoch": 1.2058618283321703, + "lm_loss": 1.0489, + "step": 6264, + "vm_loss": 0.1989 + }, + { + "epoch": 1.2058618283321703, + "lm_loss": 0.9177, + "step": 6264, + "vm_loss": 0.1726 + }, + { + "epoch": 1.2058618283321703, + "lm_loss": 0.8576, + "step": 6264, + "vm_loss": 0.1582 + }, + { + "epoch": 1.2058618283321703, + "lm_loss": 0.9937, + "step": 6264, + "vm_loss": 0.1562 + }, + { + "epoch": 1.2058618283321703, + "lm_loss": 1.0464, + "step": 6264, + "vm_loss": 0.1941 + }, + { + "epoch": 1.2058618283321703, + "lm_loss": 0.9471, + "step": 6264, + "vm_loss": 0.1661 + }, + { + "epoch": 1.2058618283321703, + "lm_loss": 1.0336, + "step": 6264, + "vm_loss": 0.2043 + }, + { + "epoch": 1.2060543350097457, + "grad_norm": 3.235153447841721, + "learning_rate": 7.185666168394985e-06, + "loss": 1.1642, + "step": 6265 + }, + { + "epoch": 1.206246841687321, + "grad_norm": 3.1741085592825358, + "learning_rate": 7.182674431585703e-06, + "loss": 1.1767, + "step": 6266 + }, + { + "epoch": 1.2064393483648965, + "grad_norm": 3.107578353071712, + "learning_rate": 7.179682968656516e-06, + "loss": 1.2062, + "step": 6267 + }, + { + "epoch": 1.2066318550424717, + "grad_norm": 3.236103594447218, + "learning_rate": 7.1766917798982236e-06, + "loss": 1.2483, + "step": 6268 + }, + { + "epoch": 1.2068243617200471, + "grad_norm": 3.13447657632835, + "learning_rate": 7.1737008656016124e-06, + "loss": 1.1698, + "step": 6269 + }, + { + "epoch": 1.2070168683976226, + "grad_norm": 3.1977006305842606, + "learning_rate": 7.1707102260574376e-06, + "loss": 1.1787, + "step": 6270 + }, + { + "epoch": 1.207209375075198, + "grad_norm": 3.181931735917009, + "learning_rate": 7.1677198615564235e-06, + "loss": 1.2092, + "step": 6271 + }, + { + "epoch": 1.2074018817527734, + "grad_norm": 3.4251984634650205, + "learning_rate": 7.164729772389276e-06, + "loss": 1.3033, + "step": 6272 + }, + { + "epoch": 1.2074018817527734, + "lm_loss": 0.9955, + "step": 6272, + "vm_loss": 0.1954 + }, + { + "epoch": 1.2074018817527734, + "lm_loss": 1.0398, + "step": 6272, + "vm_loss": 0.2003 + }, + { + "epoch": 1.2074018817527734, + "lm_loss": 1.0763, + "step": 6272, + "vm_loss": 0.1432 + }, + { + "epoch": 1.2074018817527734, + "lm_loss": 1.0323, + "step": 6272, + "vm_loss": 0.131 + }, + { + "epoch": 1.2074018817527734, + "lm_loss": 0.9877, + "step": 6272, + "vm_loss": 0.1519 + }, + { + "epoch": 1.2074018817527734, + "lm_loss": 0.9707, + "step": 6272, + "vm_loss": 0.1441 + }, + { + "epoch": 1.2074018817527734, + "lm_loss": 1.2018, + "step": 6272, + "vm_loss": 0.1396 + }, + { + "epoch": 1.2074018817527734, + "lm_loss": 0.6506, + "step": 6272, + "vm_loss": 0.1813 + }, + { + "epoch": 1.2075943884303486, + "grad_norm": 3.1262888896545094, + "learning_rate": 7.161739958846665e-06, + "loss": 1.1531, + "step": 6273 + }, + { + "epoch": 1.207786895107924, + "grad_norm": 3.3181949647175104, + "learning_rate": 7.158750421219245e-06, + "loss": 1.2043, + "step": 6274 + }, + { + "epoch": 1.2079794017854995, + "grad_norm": 3.306411371650362, + "learning_rate": 7.155761159797635e-06, + "loss": 1.1639, + "step": 6275 + }, + { + "epoch": 1.2081719084630749, + "grad_norm": 3.344924594958964, + "learning_rate": 7.152772174872424e-06, + "loss": 1.2305, + "step": 6276 + }, + { + "epoch": 1.2083644151406503, + "grad_norm": 3.210511343625126, + "learning_rate": 7.14978346673419e-06, + "loss": 1.1543, + "step": 6277 + }, + { + "epoch": 1.2085569218182255, + "grad_norm": 3.2280306134212373, + "learning_rate": 7.146795035673466e-06, + "loss": 1.2473, + "step": 6278 + }, + { + "epoch": 1.208749428495801, + "grad_norm": 3.113007166867191, + "learning_rate": 7.143806881980765e-06, + "loss": 1.1412, + "step": 6279 + }, + { + "epoch": 1.2089419351733763, + "grad_norm": 3.147264123615081, + "learning_rate": 7.14081900594658e-06, + "loss": 1.1732, + "step": 6280 + }, + { + "epoch": 1.2089419351733763, + "lm_loss": 1.057, + "step": 6280, + "vm_loss": 0.1908 + }, + { + "epoch": 1.2089419351733763, + "lm_loss": 1.1339, + "step": 6280, + "vm_loss": 0.2002 + }, + { + "epoch": 1.2089419351733763, + "lm_loss": 0.9748, + "step": 6280, + "vm_loss": 0.1385 + }, + { + "epoch": 1.2089419351733763, + "lm_loss": 1.3231, + "step": 6280, + "vm_loss": 0.1713 + }, + { + "epoch": 1.2089419351733763, + "lm_loss": 0.9505, + "step": 6280, + "vm_loss": 0.2016 + }, + { + "epoch": 1.2089419351733763, + "lm_loss": 0.9946, + "step": 6280, + "vm_loss": 0.2088 + }, + { + "epoch": 1.2089419351733763, + "lm_loss": 0.8218, + "step": 6280, + "vm_loss": 0.1087 + }, + { + "epoch": 1.2089419351733763, + "lm_loss": 1.046, + "step": 6280, + "vm_loss": 0.1596 + }, + { + "epoch": 1.2091344418509518, + "grad_norm": 3.204502297688718, + "learning_rate": 7.137831407861366e-06, + "loss": 1.1991, + "step": 6281 + }, + { + "epoch": 1.2093269485285272, + "grad_norm": 3.2239978975507104, + "learning_rate": 7.13484408801556e-06, + "loss": 1.188, + "step": 6282 + }, + { + "epoch": 1.2095194552061024, + "grad_norm": 3.3273701498540675, + "learning_rate": 7.131857046699557e-06, + "loss": 1.1982, + "step": 6283 + }, + { + "epoch": 1.2097119618836778, + "grad_norm": 3.1209299108318946, + "learning_rate": 7.12887028420375e-06, + "loss": 1.1021, + "step": 6284 + }, + { + "epoch": 1.2099044685612532, + "grad_norm": 3.4801166094611258, + "learning_rate": 7.125883800818482e-06, + "loss": 1.2455, + "step": 6285 + }, + { + "epoch": 1.2100969752388286, + "grad_norm": 3.2662891795085245, + "learning_rate": 7.122897596834078e-06, + "loss": 1.1795, + "step": 6286 + }, + { + "epoch": 1.210289481916404, + "grad_norm": 3.493900537356155, + "learning_rate": 7.1199116725408375e-06, + "loss": 1.2837, + "step": 6287 + }, + { + "epoch": 1.2104819885939793, + "grad_norm": 3.503112137751488, + "learning_rate": 7.116926028229032e-06, + "loss": 1.2832, + "step": 6288 + }, + { + "epoch": 1.2104819885939793, + "lm_loss": 0.8538, + "step": 6288, + "vm_loss": 0.1518 + }, + { + "epoch": 1.2104819885939793, + "lm_loss": 1.2523, + "step": 6288, + "vm_loss": 0.2045 + }, + { + "epoch": 1.2104819885939793, + "lm_loss": 0.7868, + "step": 6288, + "vm_loss": 0.1481 + }, + { + "epoch": 1.2104819885939793, + "lm_loss": 1.4973, + "step": 6288, + "vm_loss": 0.1725 + }, + { + "epoch": 1.2104819885939793, + "lm_loss": 0.7266, + "step": 6288, + "vm_loss": 0.1642 + }, + { + "epoch": 1.2104819885939793, + "lm_loss": 0.8101, + "step": 6288, + "vm_loss": 0.1 + }, + { + "epoch": 1.2104819885939793, + "lm_loss": 1.1246, + "step": 6288, + "vm_loss": 0.1388 + }, + { + "epoch": 1.2104819885939793, + "lm_loss": 0.8861, + "step": 6288, + "vm_loss": 0.1597 + }, + { + "epoch": 1.2106744952715547, + "grad_norm": 3.2547246975430015, + "learning_rate": 7.113940664188898e-06, + "loss": 1.1725, + "step": 6289 + }, + { + "epoch": 1.21086700194913, + "grad_norm": 3.275173385223495, + "learning_rate": 7.110955580710656e-06, + "loss": 1.1757, + "step": 6290 + }, + { + "epoch": 1.2110595086267055, + "grad_norm": 3.108700331568262, + "learning_rate": 7.107970778084494e-06, + "loss": 1.1669, + "step": 6291 + }, + { + "epoch": 1.211252015304281, + "grad_norm": 3.1805530953219705, + "learning_rate": 7.1049862566005725e-06, + "loss": 1.208, + "step": 6292 + }, + { + "epoch": 1.2114445219818561, + "grad_norm": 3.0302930776576313, + "learning_rate": 7.102002016549022e-06, + "loss": 1.1075, + "step": 6293 + }, + { + "epoch": 1.2116370286594316, + "grad_norm": 3.1991494908193823, + "learning_rate": 7.099018058219953e-06, + "loss": 1.1518, + "step": 6294 + }, + { + "epoch": 1.211829535337007, + "grad_norm": 3.2709897240512604, + "learning_rate": 7.096034381903446e-06, + "loss": 1.2065, + "step": 6295 + }, + { + "epoch": 1.2120220420145824, + "grad_norm": 3.210550255538842, + "learning_rate": 7.093050987889547e-06, + "loss": 1.17, + "step": 6296 + }, + { + "epoch": 1.2120220420145824, + "lm_loss": 0.6208, + "step": 6296, + "vm_loss": 0.1293 + }, + { + "epoch": 1.2120220420145824, + "lm_loss": 0.9971, + "step": 6296, + "vm_loss": 0.1535 + }, + { + "epoch": 1.2120220420145824, + "lm_loss": 0.7363, + "step": 6296, + "vm_loss": 0.1936 + }, + { + "epoch": 1.2120220420145824, + "lm_loss": 1.2485, + "step": 6296, + "vm_loss": 0.1518 + }, + { + "epoch": 1.2120220420145824, + "lm_loss": 1.2022, + "step": 6296, + "vm_loss": 0.1684 + }, + { + "epoch": 1.2120220420145824, + "lm_loss": 1.1878, + "step": 6296, + "vm_loss": 0.1935 + }, + { + "epoch": 1.2120220420145824, + "lm_loss": 0.9401, + "step": 6296, + "vm_loss": 0.1872 + }, + { + "epoch": 1.2120220420145824, + "lm_loss": 0.921, + "step": 6296, + "vm_loss": 0.1888 + }, + { + "epoch": 1.2122145486921578, + "grad_norm": 3.22985518637462, + "learning_rate": 7.090067876468283e-06, + "loss": 1.1598, + "step": 6297 + }, + { + "epoch": 1.212407055369733, + "grad_norm": 3.1225585748900393, + "learning_rate": 7.087085047929652e-06, + "loss": 1.0889, + "step": 6298 + }, + { + "epoch": 1.2125995620473085, + "grad_norm": 3.4134806278107845, + "learning_rate": 7.084102502563621e-06, + "loss": 1.2457, + "step": 6299 + }, + { + "epoch": 1.2127920687248839, + "grad_norm": 3.2778289792966766, + "learning_rate": 7.081120240660128e-06, + "loss": 1.1413, + "step": 6300 + }, + { + "epoch": 1.2129845754024593, + "grad_norm": 3.5682183566647163, + "learning_rate": 7.078138262509096e-06, + "loss": 1.2339, + "step": 6301 + }, + { + "epoch": 1.2131770820800347, + "grad_norm": 3.2795777394160965, + "learning_rate": 7.075156568400406e-06, + "loss": 1.2264, + "step": 6302 + }, + { + "epoch": 1.21336958875761, + "grad_norm": 3.2023166526472666, + "learning_rate": 7.072175158623911e-06, + "loss": 1.1447, + "step": 6303 + }, + { + "epoch": 1.2135620954351853, + "grad_norm": 3.3501058884750163, + "learning_rate": 7.069194033469455e-06, + "loss": 1.2536, + "step": 6304 + }, + { + "epoch": 1.2135620954351853, + "lm_loss": 0.6913, + "step": 6304, + "vm_loss": 0.1833 + }, + { + "epoch": 1.2135620954351853, + "lm_loss": 1.0513, + "step": 6304, + "vm_loss": 0.1608 + }, + { + "epoch": 1.2135620954351853, + "lm_loss": 1.051, + "step": 6304, + "vm_loss": 0.1256 + }, + { + "epoch": 1.2135620954351853, + "lm_loss": 1.2587, + "step": 6304, + "vm_loss": 0.1556 + }, + { + "epoch": 1.2135620954351853, + "lm_loss": 0.3811, + "step": 6304, + "vm_loss": 0.2193 + }, + { + "epoch": 1.2135620954351853, + "lm_loss": 0.9453, + "step": 6304, + "vm_loss": 0.1612 + }, + { + "epoch": 1.2135620954351853, + "lm_loss": 0.5829, + "step": 6304, + "vm_loss": 0.192 + }, + { + "epoch": 1.2135620954351853, + "lm_loss": 1.4331, + "step": 6304, + "vm_loss": 0.1335 + }, + { + "epoch": 1.2137546021127608, + "grad_norm": 3.2077695690202193, + "learning_rate": 7.066213193226834e-06, + "loss": 1.1831, + "step": 6305 + }, + { + "epoch": 1.2139471087903362, + "grad_norm": 3.1771782135477307, + "learning_rate": 7.063232638185825e-06, + "loss": 1.228, + "step": 6306 + }, + { + "epoch": 1.2141396154679116, + "grad_norm": 3.1878522167575043, + "learning_rate": 7.060252368636171e-06, + "loss": 1.2435, + "step": 6307 + }, + { + "epoch": 1.2143321221454868, + "grad_norm": 3.1864072252025073, + "learning_rate": 7.057272384867603e-06, + "loss": 1.132, + "step": 6308 + }, + { + "epoch": 1.2145246288230622, + "grad_norm": 3.2777745304531503, + "learning_rate": 7.054292687169806e-06, + "loss": 1.247, + "step": 6309 + }, + { + "epoch": 1.2147171355006376, + "grad_norm": 3.220336951292951, + "learning_rate": 7.051313275832443e-06, + "loss": 1.1932, + "step": 6310 + }, + { + "epoch": 1.214909642178213, + "grad_norm": 3.3229143777514314, + "learning_rate": 7.0483341511451576e-06, + "loss": 1.1798, + "step": 6311 + }, + { + "epoch": 1.2151021488557885, + "grad_norm": 3.3720909089318867, + "learning_rate": 7.045355313397555e-06, + "loss": 1.2373, + "step": 6312 + }, + { + "epoch": 1.2151021488557885, + "lm_loss": 1.4526, + "step": 6312, + "vm_loss": 0.2176 + }, + { + "epoch": 1.2151021488557885, + "lm_loss": 1.2553, + "step": 6312, + "vm_loss": 0.1403 + }, + { + "epoch": 1.2151021488557885, + "lm_loss": 1.0319, + "step": 6312, + "vm_loss": 0.1677 + }, + { + "epoch": 1.2151021488557885, + "lm_loss": 0.7074, + "step": 6312, + "vm_loss": 0.1499 + }, + { + "epoch": 1.2151021488557885, + "lm_loss": 1.2028, + "step": 6312, + "vm_loss": 0.2683 + }, + { + "epoch": 1.2151021488557885, + "lm_loss": 0.9914, + "step": 6312, + "vm_loss": 0.127 + }, + { + "epoch": 1.2151021488557885, + "lm_loss": 0.682, + "step": 6312, + "vm_loss": 0.15 + }, + { + "epoch": 1.2151021488557885, + "lm_loss": 1.0474, + "step": 6312, + "vm_loss": 0.1887 + }, + { + "epoch": 1.215294655533364, + "grad_norm": 3.4816528653689436, + "learning_rate": 7.042376762879215e-06, + "loss": 1.2038, + "step": 6313 + }, + { + "epoch": 1.215487162210939, + "grad_norm": 3.3319608500571545, + "learning_rate": 7.0393984998796976e-06, + "loss": 1.1567, + "step": 6314 + }, + { + "epoch": 1.2156796688885145, + "grad_norm": 3.3912542959794973, + "learning_rate": 7.036420524688519e-06, + "loss": 1.2343, + "step": 6315 + }, + { + "epoch": 1.21587217556609, + "grad_norm": 3.269818379282169, + "learning_rate": 7.033442837595183e-06, + "loss": 1.1185, + "step": 6316 + }, + { + "epoch": 1.2160646822436654, + "grad_norm": 3.240081399511524, + "learning_rate": 7.030465438889152e-06, + "loss": 1.1855, + "step": 6317 + }, + { + "epoch": 1.2162571889212408, + "grad_norm": 3.2836433953377497, + "learning_rate": 7.027488328859876e-06, + "loss": 1.1753, + "step": 6318 + }, + { + "epoch": 1.216449695598816, + "grad_norm": 3.1727387759277366, + "learning_rate": 7.024511507796764e-06, + "loss": 1.1328, + "step": 6319 + }, + { + "epoch": 1.2166422022763914, + "grad_norm": 3.0946199516398156, + "learning_rate": 7.021534975989197e-06, + "loss": 1.1338, + "step": 6320 + }, + { + "epoch": 1.2166422022763914, + "lm_loss": 1.4044, + "step": 6320, + "vm_loss": 0.1276 + }, + { + "epoch": 1.2166422022763914, + "lm_loss": 0.699, + "step": 6320, + "vm_loss": 0.1354 + }, + { + "epoch": 1.2166422022763914, + "lm_loss": 1.157, + "step": 6320, + "vm_loss": 0.1303 + }, + { + "epoch": 1.2166422022763914, + "lm_loss": 1.1441, + "step": 6320, + "vm_loss": 0.1378 + }, + { + "epoch": 1.2166422022763914, + "lm_loss": 1.0404, + "step": 6320, + "vm_loss": 0.1475 + }, + { + "epoch": 1.2166422022763914, + "lm_loss": 1.1469, + "step": 6320, + "vm_loss": 0.1813 + }, + { + "epoch": 1.2166422022763914, + "lm_loss": 0.9252, + "step": 6320, + "vm_loss": 0.1579 + }, + { + "epoch": 1.2166422022763914, + "lm_loss": 0.8761, + "step": 6320, + "vm_loss": 0.1577 + }, + { + "epoch": 1.2168347089539668, + "grad_norm": 3.2000192647736236, + "learning_rate": 7.01855873372654e-06, + "loss": 1.1929, + "step": 6321 + }, + { + "epoch": 1.2170272156315423, + "grad_norm": 3.243909779380425, + "learning_rate": 7.015582781298118e-06, + "loss": 1.221, + "step": 6322 + }, + { + "epoch": 1.2172197223091177, + "grad_norm": 3.2608828606546645, + "learning_rate": 7.0126071189932336e-06, + "loss": 1.207, + "step": 6323 + }, + { + "epoch": 1.2174122289866929, + "grad_norm": 3.063443458622882, + "learning_rate": 7.00963174710115e-06, + "loss": 1.0766, + "step": 6324 + }, + { + "epoch": 1.2176047356642683, + "grad_norm": 3.181150767934691, + "learning_rate": 7.0066566659111246e-06, + "loss": 1.228, + "step": 6325 + }, + { + "epoch": 1.2177972423418437, + "grad_norm": 3.249325003248595, + "learning_rate": 7.003681875712366e-06, + "loss": 1.1501, + "step": 6326 + }, + { + "epoch": 1.2179897490194191, + "grad_norm": 3.39165345313089, + "learning_rate": 7.000707376794059e-06, + "loss": 1.2915, + "step": 6327 + }, + { + "epoch": 1.2181822556969946, + "grad_norm": 3.2178612272632887, + "learning_rate": 6.997733169445372e-06, + "loss": 1.1737, + "step": 6328 + }, + { + "epoch": 1.2181822556969946, + "lm_loss": 0.8076, + "step": 6328, + "vm_loss": 0.1492 + }, + { + "epoch": 1.2181822556969946, + "lm_loss": 1.3434, + "step": 6328, + "vm_loss": 0.1806 + }, + { + "epoch": 1.2181822556969946, + "lm_loss": 1.0055, + "step": 6328, + "vm_loss": 0.1391 + }, + { + "epoch": 1.2181822556969946, + "lm_loss": 1.3691, + "step": 6328, + "vm_loss": 0.1072 + }, + { + "epoch": 1.2181822556969946, + "lm_loss": 1.482, + "step": 6328, + "vm_loss": 0.2015 + }, + { + "epoch": 1.2181822556969946, + "lm_loss": 0.9006, + "step": 6328, + "vm_loss": 0.1897 + }, + { + "epoch": 1.2181822556969946, + "lm_loss": 0.6516, + "step": 6328, + "vm_loss": 0.1535 + }, + { + "epoch": 1.2181822556969946, + "lm_loss": 1.1731, + "step": 6328, + "vm_loss": 0.1384 + }, + { + "epoch": 1.21837476237457, + "grad_norm": 3.4710482151707986, + "learning_rate": 6.994759253955432e-06, + "loss": 1.2257, + "step": 6329 + }, + { + "epoch": 1.2185672690521452, + "grad_norm": 3.329881707321585, + "learning_rate": 6.991785630613334e-06, + "loss": 1.1587, + "step": 6330 + }, + { + "epoch": 1.2187597757297206, + "grad_norm": 3.348408955253127, + "learning_rate": 6.988812299708165e-06, + "loss": 1.1833, + "step": 6331 + }, + { + "epoch": 1.218952282407296, + "grad_norm": 3.26228539356878, + "learning_rate": 6.985839261528966e-06, + "loss": 1.1814, + "step": 6332 + }, + { + "epoch": 1.2191447890848714, + "grad_norm": 3.158730978599338, + "learning_rate": 6.982866516364749e-06, + "loss": 1.1598, + "step": 6333 + }, + { + "epoch": 1.2193372957624469, + "grad_norm": 3.0713112136553327, + "learning_rate": 6.979894064504508e-06, + "loss": 1.1843, + "step": 6334 + }, + { + "epoch": 1.219529802440022, + "grad_norm": 3.3747343042436464, + "learning_rate": 6.976921906237202e-06, + "loss": 1.1887, + "step": 6335 + }, + { + "epoch": 1.2197223091175975, + "grad_norm": 3.1122766112857003, + "learning_rate": 6.9739500418517645e-06, + "loss": 1.1417, + "step": 6336 + }, + { + "epoch": 1.2197223091175975, + "lm_loss": 1.1145, + "step": 6336, + "vm_loss": 0.1799 + }, + { + "epoch": 1.2197223091175975, + "lm_loss": 1.0423, + "step": 6336, + "vm_loss": 0.2208 + }, + { + "epoch": 1.2197223091175975, + "lm_loss": 0.7772, + "step": 6336, + "vm_loss": 0.1533 + }, + { + "epoch": 1.2197223091175975, + "lm_loss": 0.7666, + "step": 6336, + "vm_loss": 0.1268 + }, + { + "epoch": 1.2197223091175975, + "lm_loss": 0.7329, + "step": 6336, + "vm_loss": 0.2232 + }, + { + "epoch": 1.2197223091175975, + "lm_loss": 1.0783, + "step": 6336, + "vm_loss": 0.223 + }, + { + "epoch": 1.2197223091175975, + "lm_loss": 0.8958, + "step": 6336, + "vm_loss": 0.205 + }, + { + "epoch": 1.2197223091175975, + "lm_loss": 1.7043, + "step": 6336, + "vm_loss": 0.1795 + }, + { + "epoch": 1.219914815795173, + "grad_norm": 3.268007424360385, + "learning_rate": 6.970978471637097e-06, + "loss": 1.2128, + "step": 6337 + }, + { + "epoch": 1.2201073224727483, + "grad_norm": 3.2308761642508363, + "learning_rate": 6.968007195882074e-06, + "loss": 1.1716, + "step": 6338 + }, + { + "epoch": 1.2202998291503238, + "grad_norm": 3.139863694301323, + "learning_rate": 6.965036214875542e-06, + "loss": 1.0914, + "step": 6339 + }, + { + "epoch": 1.220492335827899, + "grad_norm": 3.3129502649341536, + "learning_rate": 6.962065528906321e-06, + "loss": 1.1707, + "step": 6340 + }, + { + "epoch": 1.2206848425054744, + "grad_norm": 3.26907779981963, + "learning_rate": 6.9590951382631926e-06, + "loss": 1.1628, + "step": 6341 + }, + { + "epoch": 1.2208773491830498, + "grad_norm": 3.4099939337519665, + "learning_rate": 6.956125043234925e-06, + "loss": 1.2209, + "step": 6342 + }, + { + "epoch": 1.2210698558606252, + "grad_norm": 3.3565261744081485, + "learning_rate": 6.953155244110246e-06, + "loss": 1.2062, + "step": 6343 + }, + { + "epoch": 1.2212623625382006, + "grad_norm": 3.267385174607392, + "learning_rate": 6.950185741177853e-06, + "loss": 1.1668, + "step": 6344 + }, + { + "epoch": 1.2212623625382006, + "lm_loss": 0.9226, + "step": 6344, + "vm_loss": 0.1451 + }, + { + "epoch": 1.2212623625382006, + "lm_loss": 0.9342, + "step": 6344, + "vm_loss": 0.2127 + }, + { + "epoch": 1.2212623625382006, + "lm_loss": 0.8098, + "step": 6344, + "vm_loss": 0.1349 + }, + { + "epoch": 1.2212623625382006, + "lm_loss": 0.8304, + "step": 6344, + "vm_loss": 0.182 + }, + { + "epoch": 1.2212623625382006, + "lm_loss": 1.1625, + "step": 6344, + "vm_loss": 0.141 + }, + { + "epoch": 1.2212623625382006, + "lm_loss": 0.5067, + "step": 6344, + "vm_loss": 0.2202 + }, + { + "epoch": 1.2212623625382006, + "lm_loss": 0.6432, + "step": 6344, + "vm_loss": 0.2229 + }, + { + "epoch": 1.2212623625382006, + "lm_loss": 1.0615, + "step": 6344, + "vm_loss": 0.1815 + }, + { + "epoch": 1.2214548692157758, + "grad_norm": 3.280510693864168, + "learning_rate": 6.947216534726428e-06, + "loss": 1.2158, + "step": 6345 + }, + { + "epoch": 1.2216473758933513, + "grad_norm": 3.077767741385324, + "learning_rate": 6.9442476250446156e-06, + "loss": 1.1494, + "step": 6346 + }, + { + "epoch": 1.2218398825709267, + "grad_norm": 3.162607141282282, + "learning_rate": 6.941279012421025e-06, + "loss": 1.1823, + "step": 6347 + }, + { + "epoch": 1.222032389248502, + "grad_norm": 3.006629932782205, + "learning_rate": 6.938310697144245e-06, + "loss": 1.0913, + "step": 6348 + }, + { + "epoch": 1.2222248959260775, + "grad_norm": 3.30982558084763, + "learning_rate": 6.935342679502839e-06, + "loss": 1.23, + "step": 6349 + }, + { + "epoch": 1.2224174026036527, + "grad_norm": 3.290391128679161, + "learning_rate": 6.932374959785333e-06, + "loss": 1.1852, + "step": 6350 + }, + { + "epoch": 1.2226099092812281, + "grad_norm": 3.2484693269368234, + "learning_rate": 6.9294075382802215e-06, + "loss": 1.2044, + "step": 6351 + }, + { + "epoch": 1.2228024159588036, + "grad_norm": 3.254706654428902, + "learning_rate": 6.926440415275988e-06, + "loss": 1.1562, + "step": 6352 + }, + { + "epoch": 1.2228024159588036, + "lm_loss": 0.5933, + "step": 6352, + "vm_loss": 0.1899 + }, + { + "epoch": 1.2228024159588036, + "lm_loss": 1.1309, + "step": 6352, + "vm_loss": 0.1983 + }, + { + "epoch": 1.2228024159588036, + "lm_loss": 0.9949, + "step": 6352, + "vm_loss": 0.2301 + }, + { + "epoch": 1.2228024159588036, + "lm_loss": 0.8022, + "step": 6352, + "vm_loss": 0.189 + }, + { + "epoch": 1.2228024159588036, + "lm_loss": 1.3695, + "step": 6352, + "vm_loss": 0.1442 + }, + { + "epoch": 1.2228024159588036, + "lm_loss": 0.8494, + "step": 6352, + "vm_loss": 0.16 + }, + { + "epoch": 1.2228024159588036, + "lm_loss": 0.9152, + "step": 6352, + "vm_loss": 0.1501 + }, + { + "epoch": 1.2228024159588036, + "lm_loss": 1.4296, + "step": 6352, + "vm_loss": 0.145 + }, + { + "epoch": 1.222994922636379, + "grad_norm": 3.268875392274639, + "learning_rate": 6.923473591061067e-06, + "loss": 1.1637, + "step": 6353 + }, + { + "epoch": 1.2231874293139544, + "grad_norm": 3.249527857636542, + "learning_rate": 6.920507065923871e-06, + "loss": 1.1749, + "step": 6354 + }, + { + "epoch": 1.2233799359915296, + "grad_norm": 3.337548369712692, + "learning_rate": 6.91754084015279e-06, + "loss": 1.1657, + "step": 6355 + }, + { + "epoch": 1.223572442669105, + "grad_norm": 3.2975232294242676, + "learning_rate": 6.914574914036172e-06, + "loss": 1.214, + "step": 6356 + }, + { + "epoch": 1.2237649493466805, + "grad_norm": 3.3902446782440214, + "learning_rate": 6.9116092878623485e-06, + "loss": 1.2485, + "step": 6357 + }, + { + "epoch": 1.2239574560242559, + "grad_norm": 3.042917875445795, + "learning_rate": 6.908643961919614e-06, + "loss": 1.1429, + "step": 6358 + }, + { + "epoch": 1.2241499627018313, + "grad_norm": 3.204473968924032, + "learning_rate": 6.905678936496235e-06, + "loss": 1.1691, + "step": 6359 + }, + { + "epoch": 1.2243424693794065, + "grad_norm": 3.127600914365728, + "learning_rate": 6.902714211880454e-06, + "loss": 1.163, + "step": 6360 + }, + { + "epoch": 1.2243424693794065, + "lm_loss": 0.3458, + "step": 6360, + "vm_loss": 0.1879 + }, + { + "epoch": 1.2243424693794065, + "lm_loss": 0.9782, + "step": 6360, + "vm_loss": 0.1434 + }, + { + "epoch": 1.2243424693794065, + "lm_loss": 1.0702, + "step": 6360, + "vm_loss": 0.1919 + }, + { + "epoch": 1.2243424693794065, + "lm_loss": 0.5649, + "step": 6360, + "vm_loss": 0.1311 + }, + { + "epoch": 1.2243424693794065, + "lm_loss": 1.158, + "step": 6360, + "vm_loss": 0.1265 + }, + { + "epoch": 1.2243424693794065, + "lm_loss": 0.7082, + "step": 6360, + "vm_loss": 0.1822 + }, + { + "epoch": 1.2243424693794065, + "lm_loss": 0.8934, + "step": 6360, + "vm_loss": 0.1341 + }, + { + "epoch": 1.2243424693794065, + "lm_loss": 1.1257, + "step": 6360, + "vm_loss": 0.1952 + }, + { + "epoch": 1.224534976056982, + "grad_norm": 3.272410678834247, + "learning_rate": 6.899749788360473e-06, + "loss": 1.1753, + "step": 6361 + }, + { + "epoch": 1.2247274827345573, + "grad_norm": 3.32810540797477, + "learning_rate": 6.896785666224482e-06, + "loss": 1.1767, + "step": 6362 + }, + { + "epoch": 1.2249199894121328, + "grad_norm": 3.22079894919327, + "learning_rate": 6.893821845760623e-06, + "loss": 1.1962, + "step": 6363 + }, + { + "epoch": 1.2251124960897082, + "grad_norm": 3.3457615918505397, + "learning_rate": 6.890858327257022e-06, + "loss": 1.1658, + "step": 6364 + }, + { + "epoch": 1.2253050027672834, + "grad_norm": 3.3464868265624204, + "learning_rate": 6.887895111001765e-06, + "loss": 1.1623, + "step": 6365 + }, + { + "epoch": 1.2254975094448588, + "grad_norm": 3.3999814426085737, + "learning_rate": 6.884932197282923e-06, + "loss": 1.1875, + "step": 6366 + }, + { + "epoch": 1.2256900161224342, + "grad_norm": 3.2790174787680284, + "learning_rate": 6.881969586388524e-06, + "loss": 1.1849, + "step": 6367 + }, + { + "epoch": 1.2258825228000096, + "grad_norm": 3.2084795826150914, + "learning_rate": 6.879007278606569e-06, + "loss": 1.1953, + "step": 6368 + }, + { + "epoch": 1.2258825228000096, + "lm_loss": 0.9642, + "step": 6368, + "vm_loss": 0.1943 + }, + { + "epoch": 1.2258825228000096, + "lm_loss": 0.8978, + "step": 6368, + "vm_loss": 0.1956 + }, + { + "epoch": 1.2258825228000096, + "lm_loss": 0.6809, + "step": 6368, + "vm_loss": 0.2233 + }, + { + "epoch": 1.2258825228000096, + "lm_loss": 1.2326, + "step": 6368, + "vm_loss": 0.1635 + }, + { + "epoch": 1.2258825228000096, + "lm_loss": 0.7029, + "step": 6368, + "vm_loss": 0.122 + }, + { + "epoch": 1.2258825228000096, + "lm_loss": 0.679, + "step": 6368, + "vm_loss": 0.2035 + }, + { + "epoch": 1.2258825228000096, + "lm_loss": 1.3457, + "step": 6368, + "vm_loss": 0.1518 + }, + { + "epoch": 1.2258825228000096, + "lm_loss": 1.1471, + "step": 6368, + "vm_loss": 0.1256 + }, + { + "epoch": 1.226075029477585, + "grad_norm": 3.2977343250752615, + "learning_rate": 6.8760452742250385e-06, + "loss": 1.1721, + "step": 6369 + }, + { + "epoch": 1.2262675361551603, + "grad_norm": 3.148962007475685, + "learning_rate": 6.873083573531876e-06, + "loss": 1.1561, + "step": 6370 + }, + { + "epoch": 1.2264600428327357, + "grad_norm": 3.1906776920193725, + "learning_rate": 6.870122176814991e-06, + "loss": 1.0846, + "step": 6371 + }, + { + "epoch": 1.226652549510311, + "grad_norm": 3.245159011983423, + "learning_rate": 6.867161084362275e-06, + "loss": 1.209, + "step": 6372 + }, + { + "epoch": 1.2268450561878865, + "grad_norm": 3.184375407596528, + "learning_rate": 6.864200296461585e-06, + "loss": 1.1654, + "step": 6373 + }, + { + "epoch": 1.227037562865462, + "grad_norm": 3.2360517733828567, + "learning_rate": 6.861239813400742e-06, + "loss": 1.2161, + "step": 6374 + }, + { + "epoch": 1.2272300695430371, + "grad_norm": 3.2703951014009562, + "learning_rate": 6.858279635467544e-06, + "loss": 1.1823, + "step": 6375 + }, + { + "epoch": 1.2274225762206126, + "grad_norm": 3.16553047828405, + "learning_rate": 6.855319762949763e-06, + "loss": 1.119, + "step": 6376 + }, + { + "epoch": 1.2274225762206126, + "lm_loss": 1.026, + "step": 6376, + "vm_loss": 0.1029 + }, + { + "epoch": 1.2274225762206126, + "lm_loss": 1.0472, + "step": 6376, + "vm_loss": 0.1666 + }, + { + "epoch": 1.2274225762206126, + "lm_loss": 1.3632, + "step": 6376, + "vm_loss": 0.1584 + }, + { + "epoch": 1.2274225762206126, + "lm_loss": 0.9518, + "step": 6376, + "vm_loss": 0.0994 + }, + { + "epoch": 1.2274225762206126, + "lm_loss": 1.502, + "step": 6376, + "vm_loss": 0.174 + }, + { + "epoch": 1.2274225762206126, + "lm_loss": 0.7018, + "step": 6376, + "vm_loss": 0.1198 + }, + { + "epoch": 1.2274225762206126, + "lm_loss": 0.6451, + "step": 6376, + "vm_loss": 0.1294 + }, + { + "epoch": 1.2274225762206126, + "lm_loss": 1.1055, + "step": 6376, + "vm_loss": 0.1956 + }, + { + "epoch": 1.227615082898188, + "grad_norm": 3.23252955467232, + "learning_rate": 6.852360196135133e-06, + "loss": 1.1875, + "step": 6377 + }, + { + "epoch": 1.2278075895757634, + "grad_norm": 3.122840578742874, + "learning_rate": 6.849400935311359e-06, + "loss": 1.1374, + "step": 6378 + }, + { + "epoch": 1.2280000962533388, + "grad_norm": 3.1446273771302784, + "learning_rate": 6.846441980766123e-06, + "loss": 1.0958, + "step": 6379 + }, + { + "epoch": 1.2281926029309143, + "grad_norm": 3.159614219358664, + "learning_rate": 6.843483332787076e-06, + "loss": 1.1042, + "step": 6380 + }, + { + "epoch": 1.2283851096084895, + "grad_norm": 3.319008321988319, + "learning_rate": 6.840524991661831e-06, + "loss": 1.1741, + "step": 6381 + }, + { + "epoch": 1.2285776162860649, + "grad_norm": 3.3108241669562153, + "learning_rate": 6.837566957677974e-06, + "loss": 1.1579, + "step": 6382 + }, + { + "epoch": 1.2287701229636403, + "grad_norm": 3.2603794702010216, + "learning_rate": 6.834609231123073e-06, + "loss": 1.13, + "step": 6383 + }, + { + "epoch": 1.2289626296412157, + "grad_norm": 3.2775946754971588, + "learning_rate": 6.831651812284652e-06, + "loss": 1.1505, + "step": 6384 + }, + { + "epoch": 1.2289626296412157, + "lm_loss": 1.2223, + "step": 6384, + "vm_loss": 0.1166 + }, + { + "epoch": 1.2289626296412157, + "lm_loss": 0.8183, + "step": 6384, + "vm_loss": 0.1329 + }, + { + "epoch": 1.2289626296412157, + "lm_loss": 0.7273, + "step": 6384, + "vm_loss": 0.1849 + }, + { + "epoch": 1.2289626296412157, + "lm_loss": 1.2707, + "step": 6384, + "vm_loss": 0.0932 + }, + { + "epoch": 1.2289626296412157, + "lm_loss": 1.2897, + "step": 6384, + "vm_loss": 0.2024 + }, + { + "epoch": 1.2289626296412157, + "lm_loss": 0.9332, + "step": 6384, + "vm_loss": 0.117 + }, + { + "epoch": 1.2289626296412157, + "lm_loss": 1.0724, + "step": 6384, + "vm_loss": 0.2008 + }, + { + "epoch": 1.2289626296412157, + "lm_loss": 0.6767, + "step": 6384, + "vm_loss": 0.219 + }, + { + "epoch": 1.2291551363187911, + "grad_norm": 3.252813045993992, + "learning_rate": 6.8286947014502045e-06, + "loss": 1.1225, + "step": 6385 + }, + { + "epoch": 1.2293476429963663, + "grad_norm": 3.2764823607593874, + "learning_rate": 6.82573789890721e-06, + "loss": 1.204, + "step": 6386 + }, + { + "epoch": 1.2295401496739418, + "grad_norm": 3.413546774064386, + "learning_rate": 6.822781404943103e-06, + "loss": 1.1847, + "step": 6387 + }, + { + "epoch": 1.2297326563515172, + "grad_norm": 3.204617337086198, + "learning_rate": 6.819825219845286e-06, + "loss": 1.1588, + "step": 6388 + }, + { + "epoch": 1.2299251630290926, + "grad_norm": 3.2242473502446565, + "learning_rate": 6.81686934390115e-06, + "loss": 1.1631, + "step": 6389 + }, + { + "epoch": 1.230117669706668, + "grad_norm": 3.16720224612367, + "learning_rate": 6.8139137773980364e-06, + "loss": 1.1456, + "step": 6390 + }, + { + "epoch": 1.2303101763842434, + "grad_norm": 3.225978099954139, + "learning_rate": 6.810958520623267e-06, + "loss": 1.142, + "step": 6391 + }, + { + "epoch": 1.2305026830618186, + "grad_norm": 3.330341542525514, + "learning_rate": 6.8080035738641236e-06, + "loss": 1.2045, + "step": 6392 + }, + { + "epoch": 1.2305026830618186, + "lm_loss": 0.6361, + "step": 6392, + "vm_loss": 0.2022 + }, + { + "epoch": 1.2305026830618186, + "lm_loss": 1.083, + "step": 6392, + "vm_loss": 0.1617 + }, + { + "epoch": 1.2305026830618186, + "lm_loss": 0.7344, + "step": 6392, + "vm_loss": 0.1544 + }, + { + "epoch": 1.2305026830618186, + "lm_loss": 1.1294, + "step": 6392, + "vm_loss": 0.1626 + }, + { + "epoch": 1.2305026830618186, + "lm_loss": 0.6235, + "step": 6392, + "vm_loss": 0.1592 + }, + { + "epoch": 1.2305026830618186, + "lm_loss": 0.9764, + "step": 6392, + "vm_loss": 0.139 + }, + { + "epoch": 1.2305026830618186, + "lm_loss": 1.3719, + "step": 6392, + "vm_loss": 0.2149 + }, + { + "epoch": 1.2305026830618186, + "lm_loss": 1.014, + "step": 6392, + "vm_loss": 0.2023 + }, + { + "epoch": 1.230695189739394, + "grad_norm": 3.334779462170675, + "learning_rate": 6.805048937407875e-06, + "loss": 1.1901, + "step": 6393 + }, + { + "epoch": 1.2308876964169695, + "grad_norm": 3.2258686608963942, + "learning_rate": 6.802094611541745e-06, + "loss": 1.18, + "step": 6394 + }, + { + "epoch": 1.231080203094545, + "grad_norm": 3.3416646873915297, + "learning_rate": 6.799140596552929e-06, + "loss": 1.1748, + "step": 6395 + }, + { + "epoch": 1.2312727097721203, + "grad_norm": 3.301910263871618, + "learning_rate": 6.7961868927286005e-06, + "loss": 1.1572, + "step": 6396 + }, + { + "epoch": 1.2314652164496955, + "grad_norm": 3.288587862360865, + "learning_rate": 6.793233500355893e-06, + "loss": 1.1634, + "step": 6397 + }, + { + "epoch": 1.231657723127271, + "grad_norm": 3.275838883678048, + "learning_rate": 6.790280419721919e-06, + "loss": 1.192, + "step": 6398 + }, + { + "epoch": 1.2318502298048464, + "grad_norm": 3.2206118944199322, + "learning_rate": 6.787327651113749e-06, + "loss": 1.1614, + "step": 6399 + }, + { + "epoch": 1.2320427364824218, + "grad_norm": 3.2060283515758496, + "learning_rate": 6.784375194818435e-06, + "loss": 1.1508, + "step": 6400 + }, + { + "epoch": 1.2320427364824218, + "lm_loss": 1.0285, + "step": 6400, + "vm_loss": 0.1474 + }, + { + "epoch": 1.2320427364824218, + "lm_loss": 1.1317, + "step": 6400, + "vm_loss": 0.1253 + }, + { + "epoch": 1.2320427364824218, + "lm_loss": 1.1628, + "step": 6400, + "vm_loss": 0.1562 + }, + { + "epoch": 1.2320427364824218, + "lm_loss": 0.8128, + "step": 6400, + "vm_loss": 0.1773 + }, + { + "epoch": 1.2320427364824218, + "lm_loss": 1.3343, + "step": 6400, + "vm_loss": 0.2293 + }, + { + "epoch": 1.2320427364824218, + "lm_loss": 0.817, + "step": 6400, + "vm_loss": 0.1497 + }, + { + "epoch": 1.2320427364824218, + "lm_loss": 0.6489, + "step": 6400, + "vm_loss": 0.2312 + }, + { + "epoch": 1.2320427364824218, + "lm_loss": 0.7846, + "step": 6400, + "vm_loss": 0.1386 + }, + { + "epoch": 1.2322352431599972, + "grad_norm": 3.1731773778458763, + "learning_rate": 6.7814230511229924e-06, + "loss": 1.1579, + "step": 6401 + }, + { + "epoch": 1.2324277498375724, + "grad_norm": 3.259987382322204, + "learning_rate": 6.778471220314403e-06, + "loss": 1.1926, + "step": 6402 + }, + { + "epoch": 1.2326202565151478, + "grad_norm": 3.2589509568281603, + "learning_rate": 6.775519702679631e-06, + "loss": 1.137, + "step": 6403 + }, + { + "epoch": 1.2328127631927233, + "grad_norm": 3.1838492740212248, + "learning_rate": 6.7725684985055965e-06, + "loss": 1.1618, + "step": 6404 + }, + { + "epoch": 1.2330052698702987, + "grad_norm": 3.385631725967053, + "learning_rate": 6.769617608079191e-06, + "loss": 1.2271, + "step": 6405 + }, + { + "epoch": 1.233197776547874, + "grad_norm": 3.272867752305655, + "learning_rate": 6.766667031687286e-06, + "loss": 1.1534, + "step": 6406 + }, + { + "epoch": 1.2333902832254493, + "grad_norm": 3.1986803444259, + "learning_rate": 6.763716769616714e-06, + "loss": 1.15, + "step": 6407 + }, + { + "epoch": 1.2335827899030247, + "grad_norm": 3.2869641475152886, + "learning_rate": 6.760766822154274e-06, + "loss": 1.1922, + "step": 6408 + }, + { + "epoch": 1.2335827899030247, + "lm_loss": 0.7554, + "step": 6408, + "vm_loss": 0.1668 + }, + { + "epoch": 1.2335827899030247, + "lm_loss": 1.1554, + "step": 6408, + "vm_loss": 0.1601 + }, + { + "epoch": 1.2335827899030247, + "lm_loss": 0.8012, + "step": 6408, + "vm_loss": 0.1799 + }, + { + "epoch": 1.2335827899030247, + "lm_loss": 0.7747, + "step": 6408, + "vm_loss": 0.148 + }, + { + "epoch": 1.2335827899030247, + "lm_loss": 1.5025, + "step": 6408, + "vm_loss": 0.1857 + }, + { + "epoch": 1.2335827899030247, + "lm_loss": 0.7256, + "step": 6408, + "vm_loss": 0.1638 + }, + { + "epoch": 1.2335827899030247, + "lm_loss": 1.2578, + "step": 6408, + "vm_loss": 0.187 + }, + { + "epoch": 1.2335827899030247, + "lm_loss": 1.2996, + "step": 6408, + "vm_loss": 0.1555 + }, + { + "epoch": 1.2337752965806001, + "grad_norm": 3.343995773366055, + "learning_rate": 6.7578171895867375e-06, + "loss": 1.1975, + "step": 6409 + }, + { + "epoch": 1.2339678032581756, + "grad_norm": 3.2356894446518942, + "learning_rate": 6.754867872200854e-06, + "loss": 1.144, + "step": 6410 + }, + { + "epoch": 1.234160309935751, + "grad_norm": 3.288407368586856, + "learning_rate": 6.751918870283331e-06, + "loss": 1.1831, + "step": 6411 + }, + { + "epoch": 1.2343528166133262, + "grad_norm": 3.245010164018591, + "learning_rate": 6.748970184120844e-06, + "loss": 1.1664, + "step": 6412 + }, + { + "epoch": 1.2345453232909016, + "grad_norm": 3.185605696019955, + "learning_rate": 6.7460218140000525e-06, + "loss": 1.1185, + "step": 6413 + }, + { + "epoch": 1.234737829968477, + "grad_norm": 3.2796317965871205, + "learning_rate": 6.743073760207573e-06, + "loss": 1.1644, + "step": 6414 + }, + { + "epoch": 1.2349303366460525, + "grad_norm": 3.302757000057848, + "learning_rate": 6.740126023029987e-06, + "loss": 1.2243, + "step": 6415 + }, + { + "epoch": 1.2351228433236279, + "grad_norm": 3.2120315257251666, + "learning_rate": 6.737178602753859e-06, + "loss": 1.177, + "step": 6416 + }, + { + "epoch": 1.2351228433236279, + "lm_loss": 1.2269, + "step": 6416, + "vm_loss": 0.2367 + }, + { + "epoch": 1.2351228433236279, + "lm_loss": 1.2172, + "step": 6416, + "vm_loss": 0.1833 + }, + { + "epoch": 1.2351228433236279, + "lm_loss": 1.1004, + "step": 6416, + "vm_loss": 0.1641 + }, + { + "epoch": 1.2351228433236279, + "lm_loss": 1.0071, + "step": 6416, + "vm_loss": 0.147 + }, + { + "epoch": 1.2351228433236279, + "lm_loss": 0.9398, + "step": 6416, + "vm_loss": 0.197 + }, + { + "epoch": 1.2351228433236279, + "lm_loss": 0.9857, + "step": 6416, + "vm_loss": 0.2076 + }, + { + "epoch": 1.2351228433236279, + "lm_loss": 0.604, + "step": 6416, + "vm_loss": 0.122 + }, + { + "epoch": 1.2351228433236279, + "lm_loss": 0.9268, + "step": 6416, + "vm_loss": 0.1554 + }, + { + "epoch": 1.235315350001203, + "grad_norm": 3.2166254039846454, + "learning_rate": 6.734231499665717e-06, + "loss": 1.1426, + "step": 6417 + }, + { + "epoch": 1.2355078566787785, + "grad_norm": 3.3645591744255614, + "learning_rate": 6.731284714052051e-06, + "loss": 1.1776, + "step": 6418 + }, + { + "epoch": 1.235700363356354, + "grad_norm": 3.3144847649924274, + "learning_rate": 6.728338246199331e-06, + "loss": 1.1761, + "step": 6419 + }, + { + "epoch": 1.2358928700339293, + "grad_norm": 3.396453192371253, + "learning_rate": 6.72539209639399e-06, + "loss": 1.2255, + "step": 6420 + }, + { + "epoch": 1.2360853767115048, + "grad_norm": 3.207005840991673, + "learning_rate": 6.7224462649224345e-06, + "loss": 1.1429, + "step": 6421 + }, + { + "epoch": 1.23627788338908, + "grad_norm": 3.2747951875335053, + "learning_rate": 6.719500752071033e-06, + "loss": 1.199, + "step": 6422 + }, + { + "epoch": 1.2364703900666554, + "grad_norm": 3.1125882856998617, + "learning_rate": 6.716555558126126e-06, + "loss": 1.1517, + "step": 6423 + }, + { + "epoch": 1.2366628967442308, + "grad_norm": 3.1624782067730504, + "learning_rate": 6.713610683374028e-06, + "loss": 1.1404, + "step": 6424 + }, + { + "epoch": 1.2366628967442308, + "lm_loss": 1.2258, + "step": 6424, + "vm_loss": 0.2141 + }, + { + "epoch": 1.2366628967442308, + "lm_loss": 1.2754, + "step": 6424, + "vm_loss": 0.2242 + }, + { + "epoch": 1.2366628967442308, + "lm_loss": 1.021, + "step": 6424, + "vm_loss": 0.1374 + }, + { + "epoch": 1.2366628967442308, + "lm_loss": 0.651, + "step": 6424, + "vm_loss": 0.1906 + }, + { + "epoch": 1.2366628967442308, + "lm_loss": 0.7658, + "step": 6424, + "vm_loss": 0.196 + }, + { + "epoch": 1.2366628967442308, + "lm_loss": 0.9653, + "step": 6424, + "vm_loss": 0.1614 + }, + { + "epoch": 1.2366628967442308, + "lm_loss": 0.9538, + "step": 6424, + "vm_loss": 0.1497 + }, + { + "epoch": 1.2366628967442308, + "lm_loss": 0.8322, + "step": 6424, + "vm_loss": 0.2061 + }, + { + "epoch": 1.2368554034218062, + "grad_norm": 3.1776533782410645, + "learning_rate": 6.710666128101018e-06, + "loss": 1.1754, + "step": 6425 + }, + { + "epoch": 1.2370479100993816, + "grad_norm": 3.223181011492992, + "learning_rate": 6.707721892593339e-06, + "loss": 1.1607, + "step": 6426 + }, + { + "epoch": 1.2372404167769568, + "grad_norm": 3.417475601262568, + "learning_rate": 6.704777977137217e-06, + "loss": 1.2818, + "step": 6427 + }, + { + "epoch": 1.2374329234545323, + "grad_norm": 3.418829666485802, + "learning_rate": 6.7018343820188324e-06, + "loss": 1.1741, + "step": 6428 + }, + { + "epoch": 1.2376254301321077, + "grad_norm": 3.3420881975265613, + "learning_rate": 6.698891107524339e-06, + "loss": 1.1949, + "step": 6429 + }, + { + "epoch": 1.237817936809683, + "grad_norm": 3.2995499156600494, + "learning_rate": 6.695948153939869e-06, + "loss": 1.2, + "step": 6430 + }, + { + "epoch": 1.2380104434872585, + "grad_norm": 3.349400151108026, + "learning_rate": 6.693005521551507e-06, + "loss": 1.187, + "step": 6431 + }, + { + "epoch": 1.2382029501648337, + "grad_norm": 3.364339599392584, + "learning_rate": 6.690063210645318e-06, + "loss": 1.1951, + "step": 6432 + }, + { + "epoch": 1.2382029501648337, + "lm_loss": 0.8332, + "step": 6432, + "vm_loss": 0.1509 + }, + { + "epoch": 1.2382029501648337, + "lm_loss": 1.1286, + "step": 6432, + "vm_loss": 0.1965 + }, + { + "epoch": 1.2382029501648337, + "lm_loss": 1.1205, + "step": 6432, + "vm_loss": 0.1594 + }, + { + "epoch": 1.2382029501648337, + "lm_loss": 1.1866, + "step": 6432, + "vm_loss": 0.2043 + }, + { + "epoch": 1.2382029501648337, + "lm_loss": 0.9926, + "step": 6432, + "vm_loss": 0.1635 + }, + { + "epoch": 1.2382029501648337, + "lm_loss": 1.3529, + "step": 6432, + "vm_loss": 0.234 + }, + { + "epoch": 1.2382029501648337, + "lm_loss": 0.5125, + "step": 6432, + "vm_loss": 0.1622 + }, + { + "epoch": 1.2382029501648337, + "lm_loss": 0.5856, + "step": 6432, + "vm_loss": 0.1712 + }, + { + "epoch": 1.2383954568424091, + "grad_norm": 3.2981878803705125, + "learning_rate": 6.687121221507327e-06, + "loss": 1.2182, + "step": 6433 + }, + { + "epoch": 1.2385879635199846, + "grad_norm": 3.181671269021458, + "learning_rate": 6.684179554423542e-06, + "loss": 1.1663, + "step": 6434 + }, + { + "epoch": 1.23878047019756, + "grad_norm": 3.1483686192548923, + "learning_rate": 6.681238209679926e-06, + "loss": 1.1028, + "step": 6435 + }, + { + "epoch": 1.2389729768751354, + "grad_norm": 3.2765035443444837, + "learning_rate": 6.6782971875624105e-06, + "loss": 1.2155, + "step": 6436 + }, + { + "epoch": 1.2391654835527106, + "grad_norm": 3.1288715823570956, + "learning_rate": 6.67535648835691e-06, + "loss": 1.1221, + "step": 6437 + }, + { + "epoch": 1.239357990230286, + "grad_norm": 3.236588441010955, + "learning_rate": 6.672416112349293e-06, + "loss": 1.1422, + "step": 6438 + }, + { + "epoch": 1.2395504969078615, + "grad_norm": 3.2081420535282974, + "learning_rate": 6.669476059825401e-06, + "loss": 1.1201, + "step": 6439 + }, + { + "epoch": 1.2397430035854369, + "grad_norm": 3.2849340398857074, + "learning_rate": 6.666536331071045e-06, + "loss": 1.1917, + "step": 6440 + }, + { + "epoch": 1.2397430035854369, + "lm_loss": 1.1492, + "step": 6440, + "vm_loss": 0.2317 + }, + { + "epoch": 1.2397430035854369, + "lm_loss": 1.1589, + "step": 6440, + "vm_loss": 0.1719 + }, + { + "epoch": 1.2397430035854369, + "lm_loss": 1.3083, + "step": 6440, + "vm_loss": 0.14 + }, + { + "epoch": 1.2397430035854369, + "lm_loss": 1.2385, + "step": 6440, + "vm_loss": 0.1869 + }, + { + "epoch": 1.2397430035854369, + "lm_loss": 1.2025, + "step": 6440, + "vm_loss": 0.1439 + }, + { + "epoch": 1.2397430035854369, + "lm_loss": 1.0232, + "step": 6440, + "vm_loss": 0.1614 + }, + { + "epoch": 1.2397430035854369, + "lm_loss": 1.582, + "step": 6440, + "vm_loss": 0.1469 + }, + { + "epoch": 1.2397430035854369, + "lm_loss": 1.1614, + "step": 6440, + "vm_loss": 0.1794 + }, + { + "epoch": 1.2399355102630123, + "grad_norm": 3.5703682908965857, + "learning_rate": 6.663596926372006e-06, + "loss": 1.2414, + "step": 6441 + }, + { + "epoch": 1.2401280169405877, + "grad_norm": 3.2380129483288442, + "learning_rate": 6.660657846014031e-06, + "loss": 1.1402, + "step": 6442 + }, + { + "epoch": 1.240320523618163, + "grad_norm": 3.4179083462183044, + "learning_rate": 6.6577190902828355e-06, + "loss": 1.2141, + "step": 6443 + }, + { + "epoch": 1.2405130302957383, + "grad_norm": 3.4118236295316784, + "learning_rate": 6.654780659464105e-06, + "loss": 1.1876, + "step": 6444 + }, + { + "epoch": 1.2407055369733138, + "grad_norm": 3.2447648945003484, + "learning_rate": 6.651842553843494e-06, + "loss": 1.213, + "step": 6445 + }, + { + "epoch": 1.2408980436508892, + "grad_norm": 3.25890208548692, + "learning_rate": 6.648904773706617e-06, + "loss": 1.1507, + "step": 6446 + }, + { + "epoch": 1.2410905503284646, + "grad_norm": 3.1274992619924795, + "learning_rate": 6.645967319339075e-06, + "loss": 1.1667, + "step": 6447 + }, + { + "epoch": 1.2412830570060398, + "grad_norm": 3.2156283476732366, + "learning_rate": 6.643030191026419e-06, + "loss": 1.1586, + "step": 6448 + }, + { + "epoch": 1.2412830570060398, + "lm_loss": 1.4617, + "step": 6448, + "vm_loss": 0.1913 + }, + { + "epoch": 1.2412830570060398, + "lm_loss": 1.0869, + "step": 6448, + "vm_loss": 0.1839 + }, + { + "epoch": 1.2412830570060398, + "lm_loss": 1.1408, + "step": 6448, + "vm_loss": 0.1565 + }, + { + "epoch": 1.2412830570060398, + "lm_loss": 0.7855, + "step": 6448, + "vm_loss": 0.2224 + }, + { + "epoch": 1.2412830570060398, + "lm_loss": 1.0465, + "step": 6448, + "vm_loss": 0.1211 + }, + { + "epoch": 1.2412830570060398, + "lm_loss": 0.7974, + "step": 6448, + "vm_loss": 0.2201 + }, + { + "epoch": 1.2412830570060398, + "lm_loss": 0.9376, + "step": 6448, + "vm_loss": 0.1372 + }, + { + "epoch": 1.2412830570060398, + "lm_loss": 0.8957, + "step": 6448, + "vm_loss": 0.1869 + }, + { + "epoch": 1.2414755636836152, + "grad_norm": 3.3060825377202665, + "learning_rate": 6.640093389054176e-06, + "loss": 1.2578, + "step": 6449 + }, + { + "epoch": 1.2416680703611906, + "grad_norm": 3.260136064612598, + "learning_rate": 6.637156913707839e-06, + "loss": 1.1925, + "step": 6450 + }, + { + "epoch": 1.241860577038766, + "grad_norm": 3.2069638338448416, + "learning_rate": 6.6342207652728765e-06, + "loss": 1.1439, + "step": 6451 + }, + { + "epoch": 1.2420530837163415, + "grad_norm": 3.297263510135486, + "learning_rate": 6.631284944034716e-06, + "loss": 1.1867, + "step": 6452 + }, + { + "epoch": 1.242245590393917, + "grad_norm": 3.342538461912467, + "learning_rate": 6.628349450278753e-06, + "loss": 1.1885, + "step": 6453 + }, + { + "epoch": 1.242438097071492, + "grad_norm": 3.192040432257448, + "learning_rate": 6.6254142842903644e-06, + "loss": 1.1086, + "step": 6454 + }, + { + "epoch": 1.2426306037490675, + "grad_norm": 3.467120863214339, + "learning_rate": 6.622479446354881e-06, + "loss": 1.2112, + "step": 6455 + }, + { + "epoch": 1.242823110426643, + "grad_norm": 3.316627894706597, + "learning_rate": 6.619544936757607e-06, + "loss": 1.1334, + "step": 6456 + }, + { + "epoch": 1.242823110426643, + "lm_loss": 0.976, + "step": 6456, + "vm_loss": 0.1931 + }, + { + "epoch": 1.242823110426643, + "lm_loss": 0.6858, + "step": 6456, + "vm_loss": 0.1637 + }, + { + "epoch": 1.242823110426643, + "lm_loss": 1.2426, + "step": 6456, + "vm_loss": 0.1767 + }, + { + "epoch": 1.242823110426643, + "lm_loss": 0.4044, + "step": 6456, + "vm_loss": 0.1596 + }, + { + "epoch": 1.242823110426643, + "lm_loss": 0.5019, + "step": 6456, + "vm_loss": 0.153 + }, + { + "epoch": 1.242823110426643, + "lm_loss": 0.9979, + "step": 6456, + "vm_loss": 0.1441 + }, + { + "epoch": 1.242823110426643, + "lm_loss": 0.7759, + "step": 6456, + "vm_loss": 0.1304 + }, + { + "epoch": 1.242823110426643, + "lm_loss": 1.1337, + "step": 6456, + "vm_loss": 0.2132 + }, + { + "epoch": 1.2430156171042184, + "grad_norm": 3.231262655764489, + "learning_rate": 6.6166107557838075e-06, + "loss": 1.14, + "step": 6457 + }, + { + "epoch": 1.2432081237817938, + "grad_norm": 3.509915664429392, + "learning_rate": 6.613676903718735e-06, + "loss": 1.2653, + "step": 6458 + }, + { + "epoch": 1.243400630459369, + "grad_norm": 3.2454478405907037, + "learning_rate": 6.610743380847588e-06, + "loss": 1.1575, + "step": 6459 + }, + { + "epoch": 1.2435931371369444, + "grad_norm": 3.368978239540738, + "learning_rate": 6.6078101874555465e-06, + "loss": 1.2271, + "step": 6460 + }, + { + "epoch": 1.2437856438145198, + "grad_norm": 3.3039800714411065, + "learning_rate": 6.604877323827755e-06, + "loss": 1.1796, + "step": 6461 + }, + { + "epoch": 1.2439781504920953, + "grad_norm": 3.0066908771144853, + "learning_rate": 6.601944790249322e-06, + "loss": 1.1622, + "step": 6462 + }, + { + "epoch": 1.2441706571696707, + "grad_norm": 3.229101237328442, + "learning_rate": 6.599012587005329e-06, + "loss": 1.1785, + "step": 6463 + }, + { + "epoch": 1.2443631638472459, + "grad_norm": 3.1910294129349004, + "learning_rate": 6.596080714380825e-06, + "loss": 1.2032, + "step": 6464 + }, + { + "epoch": 1.2443631638472459, + "lm_loss": 0.9415, + "step": 6464, + "vm_loss": 0.1487 + }, + { + "epoch": 1.2443631638472459, + "lm_loss": 0.8613, + "step": 6464, + "vm_loss": 0.2152 + }, + { + "epoch": 1.2443631638472459, + "lm_loss": 1.2942, + "step": 6464, + "vm_loss": 0.2125 + }, + { + "epoch": 1.2443631638472459, + "lm_loss": 0.8417, + "step": 6464, + "vm_loss": 0.1733 + }, + { + "epoch": 1.2443631638472459, + "lm_loss": 1.067, + "step": 6464, + "vm_loss": 0.1769 + }, + { + "epoch": 1.2443631638472459, + "lm_loss": 1.1984, + "step": 6464, + "vm_loss": 0.1974 + }, + { + "epoch": 1.2443631638472459, + "lm_loss": 0.8002, + "step": 6464, + "vm_loss": 0.0925 + }, + { + "epoch": 1.2443631638472459, + "lm_loss": 0.8312, + "step": 6464, + "vm_loss": 0.1657 + }, + { + "epoch": 1.2445556705248213, + "grad_norm": 3.2743376303242173, + "learning_rate": 6.593149172660825e-06, + "loss": 1.194, + "step": 6465 + }, + { + "epoch": 1.2447481772023967, + "grad_norm": 3.27407654661791, + "learning_rate": 6.5902179621303115e-06, + "loss": 1.2164, + "step": 6466 + }, + { + "epoch": 1.2449406838799721, + "grad_norm": 3.305632762462827, + "learning_rate": 6.587287083074234e-06, + "loss": 1.1852, + "step": 6467 + }, + { + "epoch": 1.2451331905575476, + "grad_norm": 3.328347173440295, + "learning_rate": 6.5843565357775165e-06, + "loss": 1.1505, + "step": 6468 + }, + { + "epoch": 1.2453256972351228, + "grad_norm": 3.160554069273794, + "learning_rate": 6.581426320525042e-06, + "loss": 1.0902, + "step": 6469 + }, + { + "epoch": 1.2455182039126982, + "grad_norm": 3.426550377743918, + "learning_rate": 6.578496437601661e-06, + "loss": 1.1708, + "step": 6470 + }, + { + "epoch": 1.2457107105902736, + "grad_norm": 3.3036999435897645, + "learning_rate": 6.575566887292205e-06, + "loss": 1.1476, + "step": 6471 + }, + { + "epoch": 1.245903217267849, + "grad_norm": 3.251543383195193, + "learning_rate": 6.572637669881459e-06, + "loss": 1.1237, + "step": 6472 + }, + { + "epoch": 1.245903217267849, + "lm_loss": 0.4743, + "step": 6472, + "vm_loss": 0.1313 + }, + { + "epoch": 1.245903217267849, + "lm_loss": 0.9738, + "step": 6472, + "vm_loss": 0.159 + }, + { + "epoch": 1.245903217267849, + "lm_loss": 0.9925, + "step": 6472, + "vm_loss": 0.1917 + }, + { + "epoch": 1.245903217267849, + "lm_loss": 0.8035, + "step": 6472, + "vm_loss": 0.1267 + }, + { + "epoch": 1.245903217267849, + "lm_loss": 1.4435, + "step": 6472, + "vm_loss": 0.1756 + }, + { + "epoch": 1.245903217267849, + "lm_loss": 1.0071, + "step": 6472, + "vm_loss": 0.1105 + }, + { + "epoch": 1.245903217267849, + "lm_loss": 0.9499, + "step": 6472, + "vm_loss": 0.1782 + }, + { + "epoch": 1.245903217267849, + "lm_loss": 1.0864, + "step": 6472, + "vm_loss": 0.1585 + }, + { + "epoch": 1.2460957239454244, + "grad_norm": 3.2760580026000072, + "learning_rate": 6.56970878565418e-06, + "loss": 1.1676, + "step": 6473 + }, + { + "epoch": 1.2462882306229996, + "grad_norm": 3.376827501506037, + "learning_rate": 6.566780234895089e-06, + "loss": 1.1636, + "step": 6474 + }, + { + "epoch": 1.246480737300575, + "grad_norm": 3.2769856178786783, + "learning_rate": 6.563852017888886e-06, + "loss": 1.1399, + "step": 6475 + }, + { + "epoch": 1.2466732439781505, + "grad_norm": 3.2301724721179808, + "learning_rate": 6.56092413492023e-06, + "loss": 1.1573, + "step": 6476 + }, + { + "epoch": 1.246865750655726, + "grad_norm": 3.300203600316979, + "learning_rate": 6.557996586273741e-06, + "loss": 1.1503, + "step": 6477 + }, + { + "epoch": 1.2470582573333013, + "grad_norm": 3.249049738345709, + "learning_rate": 6.5550693722340255e-06, + "loss": 1.197, + "step": 6478 + }, + { + "epoch": 1.2472507640108765, + "grad_norm": 3.2054957071922523, + "learning_rate": 6.5521424930856395e-06, + "loss": 1.1749, + "step": 6479 + }, + { + "epoch": 1.247443270688452, + "grad_norm": 3.3373034030763664, + "learning_rate": 6.54921594911311e-06, + "loss": 1.1994, + "step": 6480 + }, + { + "epoch": 1.247443270688452, + "lm_loss": 0.8822, + "step": 6480, + "vm_loss": 0.1511 + }, + { + "epoch": 1.247443270688452, + "lm_loss": 1.0125, + "step": 6480, + "vm_loss": 0.1542 + }, + { + "epoch": 1.247443270688452, + "lm_loss": 1.3557, + "step": 6480, + "vm_loss": 0.1622 + }, + { + "epoch": 1.247443270688452, + "lm_loss": 1.0088, + "step": 6480, + "vm_loss": 0.1458 + }, + { + "epoch": 1.247443270688452, + "lm_loss": 0.893, + "step": 6480, + "vm_loss": 0.1434 + }, + { + "epoch": 1.247443270688452, + "lm_loss": 0.8262, + "step": 6480, + "vm_loss": 0.1677 + }, + { + "epoch": 1.247443270688452, + "lm_loss": 0.7606, + "step": 6480, + "vm_loss": 0.2043 + }, + { + "epoch": 1.247443270688452, + "lm_loss": 1.307, + "step": 6480, + "vm_loss": 0.2147 + }, + { + "epoch": 1.2476357773660274, + "grad_norm": 3.1915285629620853, + "learning_rate": 6.546289740600943e-06, + "loss": 1.1186, + "step": 6481 + }, + { + "epoch": 1.2478282840436028, + "grad_norm": 3.2830967989858792, + "learning_rate": 6.543363867833597e-06, + "loss": 1.1439, + "step": 6482 + }, + { + "epoch": 1.2480207907211782, + "grad_norm": 3.1723211773780133, + "learning_rate": 6.540438331095509e-06, + "loss": 1.0864, + "step": 6483 + }, + { + "epoch": 1.2482132973987534, + "grad_norm": 3.393476732782133, + "learning_rate": 6.5375131306710725e-06, + "loss": 1.1336, + "step": 6484 + }, + { + "epoch": 1.2484058040763288, + "grad_norm": 3.3428417219994833, + "learning_rate": 6.534588266844659e-06, + "loss": 1.1682, + "step": 6485 + }, + { + "epoch": 1.2485983107539043, + "grad_norm": 3.452901540380813, + "learning_rate": 6.531663739900603e-06, + "loss": 1.2042, + "step": 6486 + }, + { + "epoch": 1.2487908174314797, + "grad_norm": 3.213686504230132, + "learning_rate": 6.528739550123201e-06, + "loss": 1.1636, + "step": 6487 + }, + { + "epoch": 1.248983324109055, + "grad_norm": 3.2043499127739143, + "learning_rate": 6.525815697796729e-06, + "loss": 1.1478, + "step": 6488 + }, + { + "epoch": 1.248983324109055, + "lm_loss": 0.9215, + "step": 6488, + "vm_loss": 0.2279 + }, + { + "epoch": 1.248983324109055, + "lm_loss": 0.9209, + "step": 6488, + "vm_loss": 0.176 + }, + { + "epoch": 1.248983324109055, + "lm_loss": 1.3809, + "step": 6488, + "vm_loss": 0.1955 + }, + { + "epoch": 1.248983324109055, + "lm_loss": 1.2241, + "step": 6488, + "vm_loss": 0.1722 + }, + { + "epoch": 1.248983324109055, + "lm_loss": 0.6929, + "step": 6488, + "vm_loss": 0.1554 + }, + { + "epoch": 1.248983324109055, + "lm_loss": 1.087, + "step": 6488, + "vm_loss": 0.1839 + }, + { + "epoch": 1.248983324109055, + "lm_loss": 0.7616, + "step": 6488, + "vm_loss": 0.2064 + }, + { + "epoch": 1.248983324109055, + "lm_loss": 0.8344, + "step": 6488, + "vm_loss": 0.0977 + }, + { + "epoch": 1.2491758307866303, + "grad_norm": 3.2593989964342853, + "learning_rate": 6.522892183205418e-06, + "loss": 1.1662, + "step": 6489 + }, + { + "epoch": 1.2493683374642057, + "grad_norm": 3.2876302717363863, + "learning_rate": 6.519969006633473e-06, + "loss": 1.2049, + "step": 6490 + }, + { + "epoch": 1.2495608441417811, + "grad_norm": 3.124470262346882, + "learning_rate": 6.517046168365059e-06, + "loss": 1.1201, + "step": 6491 + }, + { + "epoch": 1.2497533508193566, + "grad_norm": 3.2184367519592882, + "learning_rate": 6.514123668684321e-06, + "loss": 1.1273, + "step": 6492 + }, + { + "epoch": 1.249945857496932, + "grad_norm": 3.2300728259813365, + "learning_rate": 6.5112015078753595e-06, + "loss": 1.1094, + "step": 6493 + }, + { + "epoch": 1.2501383641745072, + "grad_norm": 3.306738755137397, + "learning_rate": 6.508279686222243e-06, + "loss": 1.1508, + "step": 6494 + }, + { + "epoch": 1.2503308708520826, + "grad_norm": 3.2526034007987814, + "learning_rate": 6.505358204009018e-06, + "loss": 1.1393, + "step": 6495 + }, + { + "epoch": 1.250523377529658, + "grad_norm": 3.2872425365484683, + "learning_rate": 6.502437061519684e-06, + "loss": 1.0873, + "step": 6496 + }, + { + "epoch": 1.250523377529658, + "lm_loss": 0.6758, + "step": 6496, + "vm_loss": 0.1807 + }, + { + "epoch": 1.250523377529658, + "lm_loss": 0.478, + "step": 6496, + "vm_loss": 0.2281 + }, + { + "epoch": 1.250523377529658, + "lm_loss": 1.2305, + "step": 6496, + "vm_loss": 0.2159 + }, + { + "epoch": 1.250523377529658, + "lm_loss": 1.398, + "step": 6496, + "vm_loss": 0.1434 + }, + { + "epoch": 1.250523377529658, + "lm_loss": 0.8182, + "step": 6496, + "vm_loss": 0.2075 + }, + { + "epoch": 1.250523377529658, + "lm_loss": 0.6764, + "step": 6496, + "vm_loss": 0.167 + }, + { + "epoch": 1.250523377529658, + "lm_loss": 0.7359, + "step": 6496, + "vm_loss": 0.1759 + }, + { + "epoch": 1.250523377529658, + "lm_loss": 0.8572, + "step": 6496, + "vm_loss": 0.0932 + }, + { + "epoch": 1.2507158842072335, + "grad_norm": 3.4920727769145343, + "learning_rate": 6.499516259038215e-06, + "loss": 1.1678, + "step": 6497 + }, + { + "epoch": 1.2509083908848089, + "grad_norm": 3.370107449387861, + "learning_rate": 6.496595796848545e-06, + "loss": 1.0923, + "step": 6498 + }, + { + "epoch": 1.251100897562384, + "grad_norm": 3.458431927595557, + "learning_rate": 6.49367567523459e-06, + "loss": 1.2007, + "step": 6499 + }, + { + "epoch": 1.2512934042399595, + "grad_norm": 3.398886026824005, + "learning_rate": 6.490755894480218e-06, + "loss": 1.2365, + "step": 6500 + }, + { + "epoch": 1.251485910917535, + "grad_norm": 3.2190851593638405, + "learning_rate": 6.487836454869265e-06, + "loss": 1.1265, + "step": 6501 + }, + { + "epoch": 1.2516784175951103, + "grad_norm": 3.1946081299341498, + "learning_rate": 6.484917356685546e-06, + "loss": 1.1497, + "step": 6502 + }, + { + "epoch": 1.2518709242726858, + "grad_norm": 3.3309267200587995, + "learning_rate": 6.48199860021283e-06, + "loss": 1.2098, + "step": 6503 + }, + { + "epoch": 1.252063430950261, + "grad_norm": 3.1452551879982056, + "learning_rate": 6.479080185734859e-06, + "loss": 1.1249, + "step": 6504 + }, + { + "epoch": 1.252063430950261, + "lm_loss": 1.3131, + "step": 6504, + "vm_loss": 0.1452 + }, + { + "epoch": 1.252063430950261, + "lm_loss": 1.3693, + "step": 6504, + "vm_loss": 0.1896 + }, + { + "epoch": 1.252063430950261, + "lm_loss": 1.3976, + "step": 6504, + "vm_loss": 0.1682 + }, + { + "epoch": 1.252063430950261, + "lm_loss": 1.2337, + "step": 6504, + "vm_loss": 0.1648 + }, + { + "epoch": 1.252063430950261, + "lm_loss": 0.9856, + "step": 6504, + "vm_loss": 0.1534 + }, + { + "epoch": 1.252063430950261, + "lm_loss": 1.3357, + "step": 6504, + "vm_loss": 0.154 + }, + { + "epoch": 1.252063430950261, + "lm_loss": 1.1298, + "step": 6504, + "vm_loss": 0.1385 + }, + { + "epoch": 1.252063430950261, + "lm_loss": 1.1775, + "step": 6504, + "vm_loss": 0.1264 + }, + { + "epoch": 1.2522559376278366, + "grad_norm": 3.185797758241765, + "learning_rate": 6.47616211353534e-06, + "loss": 1.1672, + "step": 6505 + }, + { + "epoch": 1.2524484443054118, + "grad_norm": 3.2365708025184077, + "learning_rate": 6.4732443838979454e-06, + "loss": 1.1654, + "step": 6506 + }, + { + "epoch": 1.2526409509829872, + "grad_norm": 3.1965223956458195, + "learning_rate": 6.470326997106319e-06, + "loss": 1.1631, + "step": 6507 + }, + { + "epoch": 1.2528334576605626, + "grad_norm": 3.2188409564472273, + "learning_rate": 6.467409953444061e-06, + "loss": 1.1702, + "step": 6508 + }, + { + "epoch": 1.2530259643381378, + "grad_norm": 3.285414402656991, + "learning_rate": 6.464493253194757e-06, + "loss": 1.2021, + "step": 6509 + }, + { + "epoch": 1.2532184710157135, + "grad_norm": 3.286313818782534, + "learning_rate": 6.461576896641939e-06, + "loss": 1.1634, + "step": 6510 + }, + { + "epoch": 1.2534109776932887, + "grad_norm": 3.438837267570832, + "learning_rate": 6.458660884069113e-06, + "loss": 1.1417, + "step": 6511 + }, + { + "epoch": 1.253603484370864, + "grad_norm": 3.3402585244739473, + "learning_rate": 6.455745215759761e-06, + "loss": 1.1229, + "step": 6512 + }, + { + "epoch": 1.253603484370864, + "lm_loss": 1.0565, + "step": 6512, + "vm_loss": 0.1454 + }, + { + "epoch": 1.253603484370864, + "lm_loss": 1.1177, + "step": 6512, + "vm_loss": 0.1615 + }, + { + "epoch": 1.253603484370864, + "lm_loss": 1.1827, + "step": 6512, + "vm_loss": 0.149 + }, + { + "epoch": 1.253603484370864, + "lm_loss": 0.6863, + "step": 6512, + "vm_loss": 0.1624 + }, + { + "epoch": 1.253603484370864, + "lm_loss": 0.458, + "step": 6512, + "vm_loss": 0.095 + }, + { + "epoch": 1.253603484370864, + "lm_loss": 1.2695, + "step": 6512, + "vm_loss": 0.1502 + }, + { + "epoch": 1.253603484370864, + "lm_loss": 0.9193, + "step": 6512, + "vm_loss": 0.1317 + }, + { + "epoch": 1.253603484370864, + "lm_loss": 0.8516, + "step": 6512, + "vm_loss": 0.1407 + }, + { + "epoch": 1.2537959910484395, + "grad_norm": 3.5496681882128875, + "learning_rate": 6.4528298919973165e-06, + "loss": 1.1636, + "step": 6513 + }, + { + "epoch": 1.253988497726015, + "grad_norm": 3.3582581441125363, + "learning_rate": 6.449914913065189e-06, + "loss": 1.1348, + "step": 6514 + }, + { + "epoch": 1.2541810044035904, + "grad_norm": 3.2356981689330873, + "learning_rate": 6.447000279246747e-06, + "loss": 1.1071, + "step": 6515 + }, + { + "epoch": 1.2543735110811656, + "grad_norm": 3.124705165766025, + "learning_rate": 6.444085990825338e-06, + "loss": 1.1098, + "step": 6516 + }, + { + "epoch": 1.254566017758741, + "grad_norm": 3.1864643929781, + "learning_rate": 6.441172048084264e-06, + "loss": 1.1456, + "step": 6517 + }, + { + "epoch": 1.2547585244363164, + "grad_norm": 3.2600036936890247, + "learning_rate": 6.438258451306794e-06, + "loss": 1.1983, + "step": 6518 + }, + { + "epoch": 1.2549510311138918, + "grad_norm": 3.104826943219379, + "learning_rate": 6.435345200776173e-06, + "loss": 1.1166, + "step": 6519 + }, + { + "epoch": 1.2551435377914673, + "grad_norm": 3.1496104537650393, + "learning_rate": 6.432432296775606e-06, + "loss": 1.1224, + "step": 6520 + }, + { + "epoch": 1.2551435377914673, + "lm_loss": 0.9849, + "step": 6520, + "vm_loss": 0.1531 + }, + { + "epoch": 1.2551435377914673, + "lm_loss": 0.9404, + "step": 6520, + "vm_loss": 0.1453 + }, + { + "epoch": 1.2551435377914673, + "lm_loss": 0.7559, + "step": 6520, + "vm_loss": 0.1679 + }, + { + "epoch": 1.2551435377914673, + "lm_loss": 0.8853, + "step": 6520, + "vm_loss": 0.159 + }, + { + "epoch": 1.2551435377914673, + "lm_loss": 0.6761, + "step": 6520, + "vm_loss": 0.1681 + }, + { + "epoch": 1.2551435377914673, + "lm_loss": 1.6008, + "step": 6520, + "vm_loss": 0.1418 + }, + { + "epoch": 1.2551435377914673, + "lm_loss": 0.8952, + "step": 6520, + "vm_loss": 0.0932 + }, + { + "epoch": 1.2551435377914673, + "lm_loss": 0.9865, + "step": 6520, + "vm_loss": 0.1917 + }, + { + "epoch": 1.2553360444690425, + "grad_norm": 3.1629690884168897, + "learning_rate": 6.429519739588257e-06, + "loss": 1.0974, + "step": 6521 + }, + { + "epoch": 1.2555285511466179, + "grad_norm": 3.2150610273477063, + "learning_rate": 6.426607529497273e-06, + "loss": 1.1424, + "step": 6522 + }, + { + "epoch": 1.2557210578241933, + "grad_norm": 3.221647749435571, + "learning_rate": 6.4236956667857545e-06, + "loss": 1.1249, + "step": 6523 + }, + { + "epoch": 1.2559135645017687, + "grad_norm": 3.445590731600045, + "learning_rate": 6.42078415173677e-06, + "loss": 1.1316, + "step": 6524 + }, + { + "epoch": 1.2561060711793441, + "grad_norm": 3.580801887650186, + "learning_rate": 6.417872984633356e-06, + "loss": 1.2201, + "step": 6525 + }, + { + "epoch": 1.2562985778569193, + "grad_norm": 3.5668018182268058, + "learning_rate": 6.4149621657585174e-06, + "loss": 1.1706, + "step": 6526 + }, + { + "epoch": 1.2564910845344948, + "grad_norm": 3.4298768836944586, + "learning_rate": 6.412051695395225e-06, + "loss": 1.197, + "step": 6527 + }, + { + "epoch": 1.2566835912120702, + "grad_norm": 3.195084452236325, + "learning_rate": 6.409141573826407e-06, + "loss": 1.1456, + "step": 6528 + }, + { + "epoch": 1.2566835912120702, + "lm_loss": 0.8882, + "step": 6528, + "vm_loss": 0.1879 + }, + { + "epoch": 1.2566835912120702, + "lm_loss": 0.8564, + "step": 6528, + "vm_loss": 0.1073 + }, + { + "epoch": 1.2566835912120702, + "lm_loss": 0.7425, + "step": 6528, + "vm_loss": 0.1365 + }, + { + "epoch": 1.2566835912120702, + "lm_loss": 0.6299, + "step": 6528, + "vm_loss": 0.2098 + }, + { + "epoch": 1.2566835912120702, + "lm_loss": 1.2098, + "step": 6528, + "vm_loss": 0.0902 + }, + { + "epoch": 1.2566835912120702, + "lm_loss": 1.3022, + "step": 6528, + "vm_loss": 0.1362 + }, + { + "epoch": 1.2566835912120702, + "lm_loss": 0.7828, + "step": 6528, + "vm_loss": 0.161 + }, + { + "epoch": 1.2566835912120702, + "lm_loss": 0.5261, + "step": 6528, + "vm_loss": 0.1608 + }, + { + "epoch": 1.2568760978896456, + "grad_norm": 3.253970234346718, + "learning_rate": 6.40623180133497e-06, + "loss": 1.1322, + "step": 6529 + }, + { + "epoch": 1.257068604567221, + "grad_norm": 3.26142651185074, + "learning_rate": 6.403322378203782e-06, + "loss": 1.1368, + "step": 6530 + }, + { + "epoch": 1.2572611112447962, + "grad_norm": 3.216254755246491, + "learning_rate": 6.400413304715674e-06, + "loss": 1.1666, + "step": 6531 + }, + { + "epoch": 1.2574536179223716, + "grad_norm": 3.2099764398814794, + "learning_rate": 6.39750458115344e-06, + "loss": 1.1747, + "step": 6532 + }, + { + "epoch": 1.257646124599947, + "grad_norm": 3.1863892210984774, + "learning_rate": 6.394596207799855e-06, + "loss": 1.1669, + "step": 6533 + }, + { + "epoch": 1.2578386312775225, + "grad_norm": 3.1253549474940114, + "learning_rate": 6.391688184937643e-06, + "loss": 1.1274, + "step": 6534 + }, + { + "epoch": 1.258031137955098, + "grad_norm": 3.3188478197006397, + "learning_rate": 6.388780512849501e-06, + "loss": 1.1915, + "step": 6535 + }, + { + "epoch": 1.258223644632673, + "grad_norm": 3.336536343674728, + "learning_rate": 6.385873191818099e-06, + "loss": 1.1862, + "step": 6536 + }, + { + "epoch": 1.258223644632673, + "lm_loss": 0.8618, + "step": 6536, + "vm_loss": 0.1778 + }, + { + "epoch": 1.258223644632673, + "lm_loss": 1.066, + "step": 6536, + "vm_loss": 0.1334 + }, + { + "epoch": 1.258223644632673, + "lm_loss": 0.6471, + "step": 6536, + "vm_loss": 0.1437 + }, + { + "epoch": 1.258223644632673, + "lm_loss": 0.5755, + "step": 6536, + "vm_loss": 0.1965 + }, + { + "epoch": 1.258223644632673, + "lm_loss": 0.6876, + "step": 6536, + "vm_loss": 0.1572 + }, + { + "epoch": 1.258223644632673, + "lm_loss": 1.1159, + "step": 6536, + "vm_loss": 0.2072 + }, + { + "epoch": 1.258223644632673, + "lm_loss": 1.5133, + "step": 6536, + "vm_loss": 0.1328 + }, + { + "epoch": 1.258223644632673, + "lm_loss": 0.8326, + "step": 6536, + "vm_loss": 0.1832 + }, + { + "epoch": 1.2584161513102485, + "grad_norm": 3.1439114082407924, + "learning_rate": 6.382966222126059e-06, + "loss": 1.1417, + "step": 6537 + }, + { + "epoch": 1.258608657987824, + "grad_norm": 3.198305721525765, + "learning_rate": 6.380059604055975e-06, + "loss": 1.1544, + "step": 6538 + }, + { + "epoch": 1.2588011646653994, + "grad_norm": 3.34381104771015, + "learning_rate": 6.377153337890414e-06, + "loss": 1.1539, + "step": 6539 + }, + { + "epoch": 1.2589936713429748, + "grad_norm": 3.2119355967724603, + "learning_rate": 6.374247423911897e-06, + "loss": 1.1326, + "step": 6540 + }, + { + "epoch": 1.25918617802055, + "grad_norm": 3.4657214946278008, + "learning_rate": 6.371341862402919e-06, + "loss": 1.2058, + "step": 6541 + }, + { + "epoch": 1.2593786846981254, + "grad_norm": 3.484099281784217, + "learning_rate": 6.368436653645931e-06, + "loss": 1.2004, + "step": 6542 + }, + { + "epoch": 1.2595711913757008, + "grad_norm": 3.352240398342553, + "learning_rate": 6.365531797923365e-06, + "loss": 1.1844, + "step": 6543 + }, + { + "epoch": 1.2597636980532763, + "grad_norm": 3.2715147242041733, + "learning_rate": 6.362627295517606e-06, + "loss": 1.1137, + "step": 6544 + }, + { + "epoch": 1.2597636980532763, + "lm_loss": 1.0429, + "step": 6544, + "vm_loss": 0.1474 + }, + { + "epoch": 1.2597636980532763, + "lm_loss": 1.3993, + "step": 6544, + "vm_loss": 0.1668 + }, + { + "epoch": 1.2597636980532763, + "lm_loss": 0.6411, + "step": 6544, + "vm_loss": 0.1757 + }, + { + "epoch": 1.2597636980532763, + "lm_loss": 1.4759, + "step": 6544, + "vm_loss": 0.1218 + }, + { + "epoch": 1.2597636980532763, + "lm_loss": 0.7102, + "step": 6544, + "vm_loss": 0.1965 + }, + { + "epoch": 1.2597636980532763, + "lm_loss": 1.2725, + "step": 6544, + "vm_loss": 0.2001 + }, + { + "epoch": 1.2597636980532763, + "lm_loss": 1.3043, + "step": 6544, + "vm_loss": 0.1686 + }, + { + "epoch": 1.2597636980532763, + "lm_loss": 0.875, + "step": 6544, + "vm_loss": 0.1854 + }, + { + "epoch": 1.2599562047308517, + "grad_norm": 3.4137467969118354, + "learning_rate": 6.359723146711009e-06, + "loss": 1.2157, + "step": 6545 + }, + { + "epoch": 1.2601487114084269, + "grad_norm": 3.243972711073635, + "learning_rate": 6.356819351785897e-06, + "loss": 1.1443, + "step": 6546 + }, + { + "epoch": 1.2603412180860023, + "grad_norm": 3.237286165514483, + "learning_rate": 6.3539159110245505e-06, + "loss": 1.1171, + "step": 6547 + }, + { + "epoch": 1.2605337247635777, + "grad_norm": 3.21306763724978, + "learning_rate": 6.351012824709228e-06, + "loss": 1.1475, + "step": 6548 + }, + { + "epoch": 1.2607262314411531, + "grad_norm": 3.1692013366104286, + "learning_rate": 6.34811009312214e-06, + "loss": 1.0841, + "step": 6549 + }, + { + "epoch": 1.2609187381187286, + "grad_norm": 3.340230730404477, + "learning_rate": 6.345207716545474e-06, + "loss": 1.2215, + "step": 6550 + }, + { + "epoch": 1.2611112447963038, + "grad_norm": 3.161544444710282, + "learning_rate": 6.342305695261377e-06, + "loss": 1.1384, + "step": 6551 + }, + { + "epoch": 1.2613037514738792, + "grad_norm": 3.1783228514730375, + "learning_rate": 6.339404029551959e-06, + "loss": 1.0953, + "step": 6552 + }, + { + "epoch": 1.2613037514738792, + "lm_loss": 0.9166, + "step": 6552, + "vm_loss": 0.1692 + }, + { + "epoch": 1.2613037514738792, + "lm_loss": 0.9839, + "step": 6552, + "vm_loss": 0.1903 + }, + { + "epoch": 1.2613037514738792, + "lm_loss": 0.8046, + "step": 6552, + "vm_loss": 0.1335 + }, + { + "epoch": 1.2613037514738792, + "lm_loss": 1.4118, + "step": 6552, + "vm_loss": 0.2149 + }, + { + "epoch": 1.2613037514738792, + "lm_loss": 1.1313, + "step": 6552, + "vm_loss": 0.1494 + }, + { + "epoch": 1.2613037514738792, + "lm_loss": 1.0796, + "step": 6552, + "vm_loss": 0.1386 + }, + { + "epoch": 1.2613037514738792, + "lm_loss": 1.2278, + "step": 6552, + "vm_loss": 0.1884 + }, + { + "epoch": 1.2613037514738792, + "lm_loss": 1.3371, + "step": 6552, + "vm_loss": 0.1655 + }, + { + "epoch": 1.2614962581514546, + "grad_norm": 3.2332800857758834, + "learning_rate": 6.336502719699306e-06, + "loss": 1.1484, + "step": 6553 + }, + { + "epoch": 1.26168876482903, + "grad_norm": 3.23907174208649, + "learning_rate": 6.333601765985458e-06, + "loss": 1.0782, + "step": 6554 + }, + { + "epoch": 1.2618812715066055, + "grad_norm": 3.2853273514445953, + "learning_rate": 6.33070116869242e-06, + "loss": 1.1368, + "step": 6555 + }, + { + "epoch": 1.2620737781841807, + "grad_norm": 3.3695517665808423, + "learning_rate": 6.327800928102178e-06, + "loss": 1.1909, + "step": 6556 + }, + { + "epoch": 1.262266284861756, + "grad_norm": 3.3504843754424214, + "learning_rate": 6.324901044496667e-06, + "loss": 1.1614, + "step": 6557 + }, + { + "epoch": 1.2624587915393315, + "grad_norm": 3.2518660288040078, + "learning_rate": 6.322001518157792e-06, + "loss": 1.08, + "step": 6558 + }, + { + "epoch": 1.262651298216907, + "grad_norm": 3.20670504449457, + "learning_rate": 6.319102349367421e-06, + "loss": 1.1324, + "step": 6559 + }, + { + "epoch": 1.2628438048944823, + "grad_norm": 3.088995480287488, + "learning_rate": 6.316203538407397e-06, + "loss": 1.0336, + "step": 6560 + }, + { + "epoch": 1.2628438048944823, + "lm_loss": 1.1798, + "step": 6560, + "vm_loss": 0.1718 + }, + { + "epoch": 1.2628438048944823, + "lm_loss": 1.4228, + "step": 6560, + "vm_loss": 0.1715 + }, + { + "epoch": 1.2628438048944823, + "lm_loss": 0.7189, + "step": 6560, + "vm_loss": 0.1467 + }, + { + "epoch": 1.2628438048944823, + "lm_loss": 0.5666, + "step": 6560, + "vm_loss": 0.1076 + }, + { + "epoch": 1.2628438048944823, + "lm_loss": 0.5334, + "step": 6560, + "vm_loss": 0.1743 + }, + { + "epoch": 1.2628438048944823, + "lm_loss": 0.9095, + "step": 6560, + "vm_loss": 0.1428 + }, + { + "epoch": 1.2628438048944823, + "lm_loss": 1.0363, + "step": 6560, + "vm_loss": 0.1086 + }, + { + "epoch": 1.2628438048944823, + "lm_loss": 0.7588, + "step": 6560, + "vm_loss": 0.1685 + }, + { + "epoch": 1.2630363115720575, + "grad_norm": 3.242657550232243, + "learning_rate": 6.31330508555952e-06, + "loss": 1.1563, + "step": 6561 + }, + { + "epoch": 1.263228818249633, + "grad_norm": 3.3398806685798044, + "learning_rate": 6.310406991105551e-06, + "loss": 1.1558, + "step": 6562 + }, + { + "epoch": 1.2634213249272084, + "grad_norm": 3.127694393940714, + "learning_rate": 6.307509255327231e-06, + "loss": 1.0772, + "step": 6563 + }, + { + "epoch": 1.2636138316047838, + "grad_norm": 3.1235472129727317, + "learning_rate": 6.30461187850625e-06, + "loss": 1.1906, + "step": 6564 + }, + { + "epoch": 1.2638063382823592, + "grad_norm": 3.176269750572582, + "learning_rate": 6.3017148609242725e-06, + "loss": 1.1497, + "step": 6565 + }, + { + "epoch": 1.2639988449599344, + "grad_norm": 3.022608979538775, + "learning_rate": 6.2988182028629245e-06, + "loss": 1.0713, + "step": 6566 + }, + { + "epoch": 1.26419135163751, + "grad_norm": 3.14718691820071, + "learning_rate": 6.295921904603799e-06, + "loss": 1.1319, + "step": 6567 + }, + { + "epoch": 1.2643838583150853, + "grad_norm": 3.1692395702697858, + "learning_rate": 6.293025966428456e-06, + "loss": 1.1177, + "step": 6568 + }, + { + "epoch": 1.2643838583150853, + "lm_loss": 1.0072, + "step": 6568, + "vm_loss": 0.1715 + }, + { + "epoch": 1.2643838583150853, + "lm_loss": 0.8117, + "step": 6568, + "vm_loss": 0.2362 + }, + { + "epoch": 1.2643838583150853, + "lm_loss": 0.7021, + "step": 6568, + "vm_loss": 0.1743 + }, + { + "epoch": 1.2643838583150853, + "lm_loss": 0.7703, + "step": 6568, + "vm_loss": 0.1565 + }, + { + "epoch": 1.2643838583150853, + "lm_loss": 1.3175, + "step": 6568, + "vm_loss": 0.1961 + }, + { + "epoch": 1.2643838583150853, + "lm_loss": 0.8012, + "step": 6568, + "vm_loss": 0.1579 + }, + { + "epoch": 1.2643838583150853, + "lm_loss": 0.686, + "step": 6568, + "vm_loss": 0.141 + }, + { + "epoch": 1.2643838583150853, + "lm_loss": 1.0748, + "step": 6568, + "vm_loss": 0.1213 + }, + { + "epoch": 1.2645763649926607, + "grad_norm": 3.2430794622913086, + "learning_rate": 6.290130388618412e-06, + "loss": 1.1208, + "step": 6569 + }, + { + "epoch": 1.264768871670236, + "grad_norm": 3.296057530080696, + "learning_rate": 6.287235171455157e-06, + "loss": 1.1312, + "step": 6570 + }, + { + "epoch": 1.2649613783478113, + "grad_norm": 3.4003208262780613, + "learning_rate": 6.284340315220146e-06, + "loss": 1.2242, + "step": 6571 + }, + { + "epoch": 1.265153885025387, + "grad_norm": 3.4538104824283096, + "learning_rate": 6.281445820194795e-06, + "loss": 1.1916, + "step": 6572 + }, + { + "epoch": 1.2653463917029621, + "grad_norm": 3.2820107919980726, + "learning_rate": 6.278551686660479e-06, + "loss": 1.1741, + "step": 6573 + }, + { + "epoch": 1.2655388983805376, + "grad_norm": 3.261920089294364, + "learning_rate": 6.275657914898554e-06, + "loss": 1.1182, + "step": 6574 + }, + { + "epoch": 1.265731405058113, + "grad_norm": 3.3961592496259656, + "learning_rate": 6.272764505190329e-06, + "loss": 1.1691, + "step": 6575 + }, + { + "epoch": 1.2659239117356884, + "grad_norm": 3.4323843898187607, + "learning_rate": 6.269871457817074e-06, + "loss": 1.2213, + "step": 6576 + }, + { + "epoch": 1.2659239117356884, + "lm_loss": 0.8828, + "step": 6576, + "vm_loss": 0.1373 + }, + { + "epoch": 1.2659239117356884, + "lm_loss": 1.7807, + "step": 6576, + "vm_loss": 0.1761 + }, + { + "epoch": 1.2659239117356884, + "lm_loss": 1.4014, + "step": 6576, + "vm_loss": 0.2309 + }, + { + "epoch": 1.2659239117356884, + "lm_loss": 0.8518, + "step": 6576, + "vm_loss": 0.1809 + }, + { + "epoch": 1.2659239117356884, + "lm_loss": 0.986, + "step": 6576, + "vm_loss": 0.1432 + }, + { + "epoch": 1.2659239117356884, + "lm_loss": 0.5707, + "step": 6576, + "vm_loss": 0.1731 + }, + { + "epoch": 1.2659239117356884, + "lm_loss": 0.8639, + "step": 6576, + "vm_loss": 0.154 + }, + { + "epoch": 1.2659239117356884, + "lm_loss": 1.0247, + "step": 6576, + "vm_loss": 0.1653 + }, + { + "epoch": 1.2661164184132638, + "grad_norm": 3.284454552025599, + "learning_rate": 6.2669787730600405e-06, + "loss": 1.1349, + "step": 6577 + }, + { + "epoch": 1.266308925090839, + "grad_norm": 3.2569641087487513, + "learning_rate": 6.264086451200431e-06, + "loss": 1.1411, + "step": 6578 + }, + { + "epoch": 1.2665014317684145, + "grad_norm": 3.1943833956424434, + "learning_rate": 6.261194492519408e-06, + "loss": 1.1579, + "step": 6579 + }, + { + "epoch": 1.2666939384459899, + "grad_norm": 3.3191513353771183, + "learning_rate": 6.258302897298119e-06, + "loss": 1.1858, + "step": 6580 + }, + { + "epoch": 1.2668864451235653, + "grad_norm": 3.2433977811436794, + "learning_rate": 6.255411665817659e-06, + "loss": 1.0819, + "step": 6581 + }, + { + "epoch": 1.2670789518011407, + "grad_norm": 3.2376220008229986, + "learning_rate": 6.252520798359092e-06, + "loss": 1.1653, + "step": 6582 + }, + { + "epoch": 1.267271458478716, + "grad_norm": 3.2963903999601727, + "learning_rate": 6.249630295203443e-06, + "loss": 1.204, + "step": 6583 + }, + { + "epoch": 1.2674639651562913, + "grad_norm": 3.3534538583674016, + "learning_rate": 6.246740156631714e-06, + "loss": 1.1484, + "step": 6584 + }, + { + "epoch": 1.2674639651562913, + "lm_loss": 1.0217, + "step": 6584, + "vm_loss": 0.1742 + }, + { + "epoch": 1.2674639651562913, + "lm_loss": 0.7434, + "step": 6584, + "vm_loss": 0.1342 + }, + { + "epoch": 1.2674639651562913, + "lm_loss": 1.1464, + "step": 6584, + "vm_loss": 0.2339 + }, + { + "epoch": 1.2674639651562913, + "lm_loss": 0.7456, + "step": 6584, + "vm_loss": 0.1196 + }, + { + "epoch": 1.2674639651562913, + "lm_loss": 0.9957, + "step": 6584, + "vm_loss": 0.1996 + }, + { + "epoch": 1.2674639651562913, + "lm_loss": 0.7101, + "step": 6584, + "vm_loss": 0.1911 + }, + { + "epoch": 1.2674639651562913, + "lm_loss": 0.9687, + "step": 6584, + "vm_loss": 0.1219 + }, + { + "epoch": 1.2674639651562913, + "lm_loss": 0.6178, + "step": 6584, + "vm_loss": 0.183 + }, + { + "epoch": 1.2676564718338668, + "grad_norm": 3.172844712094401, + "learning_rate": 6.243850382924861e-06, + "loss": 1.0895, + "step": 6585 + }, + { + "epoch": 1.2678489785114422, + "grad_norm": 3.35070716741379, + "learning_rate": 6.240960974363803e-06, + "loss": 1.1604, + "step": 6586 + }, + { + "epoch": 1.2680414851890176, + "grad_norm": 3.374475484371833, + "learning_rate": 6.238071931229433e-06, + "loss": 1.2074, + "step": 6587 + }, + { + "epoch": 1.2682339918665928, + "grad_norm": 3.430219849201853, + "learning_rate": 6.2351832538025995e-06, + "loss": 1.1668, + "step": 6588 + }, + { + "epoch": 1.2684264985441682, + "grad_norm": 3.195259537000875, + "learning_rate": 6.232294942364121e-06, + "loss": 1.1391, + "step": 6589 + }, + { + "epoch": 1.2686190052217436, + "grad_norm": 3.0949180533736316, + "learning_rate": 6.229406997194774e-06, + "loss": 1.1023, + "step": 6590 + }, + { + "epoch": 1.268811511899319, + "grad_norm": 3.320149514600776, + "learning_rate": 6.2265194185753085e-06, + "loss": 1.2249, + "step": 6591 + }, + { + "epoch": 1.2690040185768945, + "grad_norm": 3.1216746481851962, + "learning_rate": 6.223632206786434e-06, + "loss": 1.1303, + "step": 6592 + }, + { + "epoch": 1.2690040185768945, + "lm_loss": 0.7316, + "step": 6592, + "vm_loss": 0.1824 + }, + { + "epoch": 1.2690040185768945, + "lm_loss": 0.8746, + "step": 6592, + "vm_loss": 0.1632 + }, + { + "epoch": 1.2690040185768945, + "lm_loss": 0.8615, + "step": 6592, + "vm_loss": 0.1394 + }, + { + "epoch": 1.2690040185768945, + "lm_loss": 1.0139, + "step": 6592, + "vm_loss": 0.208 + }, + { + "epoch": 1.2690040185768945, + "lm_loss": 1.0843, + "step": 6592, + "vm_loss": 0.1299 + }, + { + "epoch": 1.2690040185768945, + "lm_loss": 1.2468, + "step": 6592, + "vm_loss": 0.146 + }, + { + "epoch": 1.2690040185768945, + "lm_loss": 0.4102, + "step": 6592, + "vm_loss": 0.1708 + }, + { + "epoch": 1.2690040185768945, + "lm_loss": 0.5111, + "step": 6592, + "vm_loss": 0.141 + }, + { + "epoch": 1.2691965252544697, + "grad_norm": 3.1963748838766026, + "learning_rate": 6.22074536210882e-06, + "loss": 1.109, + "step": 6593 + }, + { + "epoch": 1.269389031932045, + "grad_norm": 3.1486434960178946, + "learning_rate": 6.217858884823109e-06, + "loss": 1.1106, + "step": 6594 + }, + { + "epoch": 1.2695815386096205, + "grad_norm": 3.131291270941672, + "learning_rate": 6.214972775209904e-06, + "loss": 1.0971, + "step": 6595 + }, + { + "epoch": 1.269774045287196, + "grad_norm": 3.1404795494912183, + "learning_rate": 6.212087033549765e-06, + "loss": 1.0937, + "step": 6596 + }, + { + "epoch": 1.2699665519647714, + "grad_norm": 3.2213465906715313, + "learning_rate": 6.209201660123233e-06, + "loss": 1.0695, + "step": 6597 + }, + { + "epoch": 1.2701590586423466, + "grad_norm": 3.432325534461375, + "learning_rate": 6.206316655210798e-06, + "loss": 1.121, + "step": 6598 + }, + { + "epoch": 1.270351565319922, + "grad_norm": 3.3563740045164807, + "learning_rate": 6.203432019092919e-06, + "loss": 1.1182, + "step": 6599 + }, + { + "epoch": 1.2705440719974974, + "grad_norm": 3.256192106349421, + "learning_rate": 6.2005477520500145e-06, + "loss": 1.0479, + "step": 6600 + }, + { + "epoch": 1.2705440719974974, + "lm_loss": 0.8173, + "step": 6600, + "vm_loss": 0.1894 + }, + { + "epoch": 1.2705440719974974, + "lm_loss": 0.7971, + "step": 6600, + "vm_loss": 0.1674 + }, + { + "epoch": 1.2705440719974974, + "lm_loss": 1.109, + "step": 6600, + "vm_loss": 0.1486 + }, + { + "epoch": 1.2705440719974974, + "lm_loss": 0.9904, + "step": 6600, + "vm_loss": 0.1715 + }, + { + "epoch": 1.2705440719974974, + "lm_loss": 1.0624, + "step": 6600, + "vm_loss": 0.115 + }, + { + "epoch": 1.2705440719974974, + "lm_loss": 0.6629, + "step": 6600, + "vm_loss": 0.1629 + }, + { + "epoch": 1.2705440719974974, + "lm_loss": 0.9223, + "step": 6600, + "vm_loss": 0.1237 + }, + { + "epoch": 1.2705440719974974, + "lm_loss": 0.4428, + "step": 6600, + "vm_loss": 0.1678 + }, + { + "epoch": 1.2707365786750728, + "grad_norm": 3.4592258035144985, + "learning_rate": 6.197663854362483e-06, + "loss": 1.1377, + "step": 6601 + }, + { + "epoch": 1.2709290853526483, + "grad_norm": 3.2579943152181716, + "learning_rate": 6.194780326310672e-06, + "loss": 1.1475, + "step": 6602 + }, + { + "epoch": 1.2711215920302235, + "grad_norm": 3.2807791125826937, + "learning_rate": 6.191897168174892e-06, + "loss": 1.0765, + "step": 6603 + }, + { + "epoch": 1.2713140987077989, + "grad_norm": 3.2623462572463593, + "learning_rate": 6.18901438023543e-06, + "loss": 1.133, + "step": 6604 + }, + { + "epoch": 1.2715066053853743, + "grad_norm": 3.2587894175631957, + "learning_rate": 6.186131962772528e-06, + "loss": 1.128, + "step": 6605 + }, + { + "epoch": 1.2716991120629497, + "grad_norm": 3.3095294744490227, + "learning_rate": 6.183249916066389e-06, + "loss": 1.1393, + "step": 6606 + }, + { + "epoch": 1.2718916187405251, + "grad_norm": 3.218013281888009, + "learning_rate": 6.18036824039719e-06, + "loss": 1.1353, + "step": 6607 + }, + { + "epoch": 1.2720841254181003, + "grad_norm": 3.2932117842421555, + "learning_rate": 6.177486936045067e-06, + "loss": 1.114, + "step": 6608 + }, + { + "epoch": 1.2720841254181003, + "lm_loss": 1.0368, + "step": 6608, + "vm_loss": 0.1658 + }, + { + "epoch": 1.2720841254181003, + "lm_loss": 0.6892, + "step": 6608, + "vm_loss": 0.1676 + }, + { + "epoch": 1.2720841254181003, + "lm_loss": 1.0632, + "step": 6608, + "vm_loss": 0.195 + }, + { + "epoch": 1.2720841254181003, + "lm_loss": 1.5611, + "step": 6608, + "vm_loss": 0.124 + }, + { + "epoch": 1.2720841254181003, + "lm_loss": 0.5957, + "step": 6608, + "vm_loss": 0.1916 + }, + { + "epoch": 1.2720841254181003, + "lm_loss": 0.9344, + "step": 6608, + "vm_loss": 0.1251 + }, + { + "epoch": 1.2720841254181003, + "lm_loss": 1.3944, + "step": 6608, + "vm_loss": 0.1557 + }, + { + "epoch": 1.2720841254181003, + "lm_loss": 1.1844, + "step": 6608, + "vm_loss": 0.141 + }, + { + "epoch": 1.2722766320956758, + "grad_norm": 3.193454305440564, + "learning_rate": 6.174606003290118e-06, + "loss": 1.0823, + "step": 6609 + }, + { + "epoch": 1.2724691387732512, + "grad_norm": 3.2752768440607682, + "learning_rate": 6.171725442412404e-06, + "loss": 1.119, + "step": 6610 + }, + { + "epoch": 1.2726616454508266, + "grad_norm": 3.3589643097520345, + "learning_rate": 6.168845253691956e-06, + "loss": 1.1485, + "step": 6611 + }, + { + "epoch": 1.272854152128402, + "grad_norm": 3.299537038928917, + "learning_rate": 6.165965437408766e-06, + "loss": 1.0858, + "step": 6612 + }, + { + "epoch": 1.2730466588059772, + "grad_norm": 3.3592794693852746, + "learning_rate": 6.163085993842786e-06, + "loss": 1.1386, + "step": 6613 + }, + { + "epoch": 1.2732391654835526, + "grad_norm": 3.4163801863493464, + "learning_rate": 6.160206923273934e-06, + "loss": 1.1376, + "step": 6614 + }, + { + "epoch": 1.273431672161128, + "grad_norm": 3.314220347513823, + "learning_rate": 6.157328225982098e-06, + "loss": 1.1128, + "step": 6615 + }, + { + "epoch": 1.2736241788387035, + "grad_norm": 3.2548111536942796, + "learning_rate": 6.154449902247119e-06, + "loss": 1.089, + "step": 6616 + }, + { + "epoch": 1.2736241788387035, + "lm_loss": 0.8305, + "step": 6616, + "vm_loss": 0.1763 + }, + { + "epoch": 1.2736241788387035, + "lm_loss": 1.0349, + "step": 6616, + "vm_loss": 0.2181 + }, + { + "epoch": 1.2736241788387035, + "lm_loss": 0.9645, + "step": 6616, + "vm_loss": 0.1828 + }, + { + "epoch": 1.2736241788387035, + "lm_loss": 0.9943, + "step": 6616, + "vm_loss": 0.1954 + }, + { + "epoch": 1.2736241788387035, + "lm_loss": 0.9281, + "step": 6616, + "vm_loss": 0.1762 + }, + { + "epoch": 1.2736241788387035, + "lm_loss": 1.3873, + "step": 6616, + "vm_loss": 0.1101 + }, + { + "epoch": 1.2736241788387035, + "lm_loss": 1.0504, + "step": 6616, + "vm_loss": 0.1868 + }, + { + "epoch": 1.2736241788387035, + "lm_loss": 0.6133, + "step": 6616, + "vm_loss": 0.1257 + }, + { + "epoch": 1.273816685516279, + "grad_norm": 3.2634543760431813, + "learning_rate": 6.151571952348805e-06, + "loss": 1.0846, + "step": 6617 + }, + { + "epoch": 1.2740091921938541, + "grad_norm": 3.1855053218224816, + "learning_rate": 6.148694376566937e-06, + "loss": 1.1264, + "step": 6618 + }, + { + "epoch": 1.2742016988714295, + "grad_norm": 3.2428850014688977, + "learning_rate": 6.1458171751812505e-06, + "loss": 1.1414, + "step": 6619 + }, + { + "epoch": 1.274394205549005, + "grad_norm": 3.3641234481072706, + "learning_rate": 6.142940348471437e-06, + "loss": 1.1699, + "step": 6620 + }, + { + "epoch": 1.2745867122265804, + "grad_norm": 3.2307835337366466, + "learning_rate": 6.140063896717172e-06, + "loss": 1.1299, + "step": 6621 + }, + { + "epoch": 1.2747792189041558, + "grad_norm": 3.2586165239277554, + "learning_rate": 6.137187820198081e-06, + "loss": 1.1018, + "step": 6622 + }, + { + "epoch": 1.274971725581731, + "grad_norm": 3.4141249758897985, + "learning_rate": 6.134312119193752e-06, + "loss": 1.1981, + "step": 6623 + }, + { + "epoch": 1.2751642322593064, + "grad_norm": 3.2327333969691168, + "learning_rate": 6.131436793983738e-06, + "loss": 1.1014, + "step": 6624 + }, + { + "epoch": 1.2751642322593064, + "lm_loss": 0.8796, + "step": 6624, + "vm_loss": 0.1839 + }, + { + "epoch": 1.2751642322593064, + "lm_loss": 1.0196, + "step": 6624, + "vm_loss": 0.2056 + }, + { + "epoch": 1.2751642322593064, + "lm_loss": 0.966, + "step": 6624, + "vm_loss": 0.1706 + }, + { + "epoch": 1.2751642322593064, + "lm_loss": 1.0035, + "step": 6624, + "vm_loss": 0.1426 + }, + { + "epoch": 1.2751642322593064, + "lm_loss": 0.9303, + "step": 6624, + "vm_loss": 0.1585 + }, + { + "epoch": 1.2751642322593064, + "lm_loss": 0.8744, + "step": 6624, + "vm_loss": 0.2677 + }, + { + "epoch": 1.2751642322593064, + "lm_loss": 0.8096, + "step": 6624, + "vm_loss": 0.1953 + }, + { + "epoch": 1.2751642322593064, + "lm_loss": 0.4843, + "step": 6624, + "vm_loss": 0.1492 + }, + { + "epoch": 1.2753567389368818, + "grad_norm": 3.226464147486299, + "learning_rate": 6.128561844847567e-06, + "loss": 1.1454, + "step": 6625 + }, + { + "epoch": 1.2755492456144573, + "grad_norm": 3.1811055753109296, + "learning_rate": 6.125687272064713e-06, + "loss": 1.0637, + "step": 6626 + }, + { + "epoch": 1.2757417522920327, + "grad_norm": 3.1948663720014987, + "learning_rate": 6.122813075914619e-06, + "loss": 1.0897, + "step": 6627 + }, + { + "epoch": 1.2759342589696079, + "grad_norm": 3.330322912109146, + "learning_rate": 6.119939256676701e-06, + "loss": 1.1696, + "step": 6628 + }, + { + "epoch": 1.2761267656471835, + "grad_norm": 3.1985282596025346, + "learning_rate": 6.117065814630327e-06, + "loss": 1.0874, + "step": 6629 + }, + { + "epoch": 1.2763192723247587, + "grad_norm": 3.2024199818159347, + "learning_rate": 6.114192750054833e-06, + "loss": 1.1188, + "step": 6630 + }, + { + "epoch": 1.2765117790023341, + "grad_norm": 3.1585191029462982, + "learning_rate": 6.11132006322952e-06, + "loss": 1.1118, + "step": 6631 + }, + { + "epoch": 1.2767042856799096, + "grad_norm": 3.2365758802566145, + "learning_rate": 6.108447754433644e-06, + "loss": 1.1111, + "step": 6632 + }, + { + "epoch": 1.2767042856799096, + "lm_loss": 0.9974, + "step": 6632, + "vm_loss": 0.1274 + }, + { + "epoch": 1.2767042856799096, + "lm_loss": 0.7836, + "step": 6632, + "vm_loss": 0.1718 + }, + { + "epoch": 1.2767042856799096, + "lm_loss": 1.1529, + "step": 6632, + "vm_loss": 0.1975 + }, + { + "epoch": 1.2767042856799096, + "lm_loss": 0.8055, + "step": 6632, + "vm_loss": 0.1583 + }, + { + "epoch": 1.2767042856799096, + "lm_loss": 1.0699, + "step": 6632, + "vm_loss": 0.1702 + }, + { + "epoch": 1.2767042856799096, + "lm_loss": 1.4923, + "step": 6632, + "vm_loss": 0.2157 + }, + { + "epoch": 1.2767042856799096, + "lm_loss": 1.107, + "step": 6632, + "vm_loss": 0.1373 + }, + { + "epoch": 1.2767042856799096, + "lm_loss": 1.187, + "step": 6632, + "vm_loss": 0.147 + }, + { + "epoch": 1.2768967923574848, + "grad_norm": 3.278623149511034, + "learning_rate": 6.105575823946437e-06, + "loss": 1.1268, + "step": 6633 + }, + { + "epoch": 1.2770892990350604, + "grad_norm": 3.2865357228763497, + "learning_rate": 6.10270427204708e-06, + "loss": 1.1155, + "step": 6634 + }, + { + "epoch": 1.2772818057126356, + "grad_norm": 3.1019539250023143, + "learning_rate": 6.099833099014734e-06, + "loss": 1.0594, + "step": 6635 + }, + { + "epoch": 1.277474312390211, + "grad_norm": 3.2910216827595646, + "learning_rate": 6.096962305128506e-06, + "loss": 1.1217, + "step": 6636 + }, + { + "epoch": 1.2776668190677865, + "grad_norm": 3.369657710892053, + "learning_rate": 6.094091890667473e-06, + "loss": 1.213, + "step": 6637 + }, + { + "epoch": 1.2778593257453619, + "grad_norm": 3.3943682948687557, + "learning_rate": 6.091221855910685e-06, + "loss": 1.1327, + "step": 6638 + }, + { + "epoch": 1.2780518324229373, + "grad_norm": 3.317886043429176, + "learning_rate": 6.088352201137139e-06, + "loss": 1.1677, + "step": 6639 + }, + { + "epoch": 1.2782443391005125, + "grad_norm": 3.288379790735745, + "learning_rate": 6.085482926625803e-06, + "loss": 1.1482, + "step": 6640 + }, + { + "epoch": 1.2782443391005125, + "lm_loss": 0.6361, + "step": 6640, + "vm_loss": 0.1043 + }, + { + "epoch": 1.2782443391005125, + "lm_loss": 0.4126, + "step": 6640, + "vm_loss": 0.2592 + }, + { + "epoch": 1.2782443391005125, + "lm_loss": 1.4527, + "step": 6640, + "vm_loss": 0.1564 + }, + { + "epoch": 1.2782443391005125, + "lm_loss": 0.9723, + "step": 6640, + "vm_loss": 0.1957 + }, + { + "epoch": 1.2782443391005125, + "lm_loss": 0.8676, + "step": 6640, + "vm_loss": 0.135 + }, + { + "epoch": 1.2782443391005125, + "lm_loss": 1.1842, + "step": 6640, + "vm_loss": 0.2359 + }, + { + "epoch": 1.2782443391005125, + "lm_loss": 0.89, + "step": 6640, + "vm_loss": 0.1185 + }, + { + "epoch": 1.2782443391005125, + "lm_loss": 0.6887, + "step": 6640, + "vm_loss": 0.1598 + }, + { + "epoch": 1.278436845778088, + "grad_norm": 3.0646674234816405, + "learning_rate": 6.082614032655605e-06, + "loss": 1.0738, + "step": 6641 + }, + { + "epoch": 1.2786293524556633, + "grad_norm": 3.276718687991323, + "learning_rate": 6.079745519505446e-06, + "loss": 1.0999, + "step": 6642 + }, + { + "epoch": 1.2788218591332388, + "grad_norm": 3.2995560943464635, + "learning_rate": 6.076877387454176e-06, + "loss": 1.1246, + "step": 6643 + }, + { + "epoch": 1.2790143658108142, + "grad_norm": 3.2146849271138866, + "learning_rate": 6.074009636780612e-06, + "loss": 1.1043, + "step": 6644 + }, + { + "epoch": 1.2792068724883894, + "grad_norm": 3.390724218021271, + "learning_rate": 6.071142267763541e-06, + "loss": 1.1727, + "step": 6645 + }, + { + "epoch": 1.2793993791659648, + "grad_norm": 3.2144347061724132, + "learning_rate": 6.068275280681709e-06, + "loss": 1.1362, + "step": 6646 + }, + { + "epoch": 1.2795918858435402, + "grad_norm": 3.2317995678324802, + "learning_rate": 6.065408675813816e-06, + "loss": 1.1073, + "step": 6647 + }, + { + "epoch": 1.2797843925211156, + "grad_norm": 3.185002066065946, + "learning_rate": 6.0625424534385425e-06, + "loss": 1.1697, + "step": 6648 + }, + { + "epoch": 1.2797843925211156, + "lm_loss": 0.92, + "step": 6648, + "vm_loss": 0.199 + }, + { + "epoch": 1.2797843925211156, + "lm_loss": 1.0522, + "step": 6648, + "vm_loss": 0.1773 + }, + { + "epoch": 1.2797843925211156, + "lm_loss": 1.0565, + "step": 6648, + "vm_loss": 0.1463 + }, + { + "epoch": 1.2797843925211156, + "lm_loss": 0.9726, + "step": 6648, + "vm_loss": 0.1795 + }, + { + "epoch": 1.2797843925211156, + "lm_loss": 0.8785, + "step": 6648, + "vm_loss": 0.1495 + }, + { + "epoch": 1.2797843925211156, + "lm_loss": 1.0389, + "step": 6648, + "vm_loss": 0.1023 + }, + { + "epoch": 1.2797843925211156, + "lm_loss": 0.9572, + "step": 6648, + "vm_loss": 0.1817 + }, + { + "epoch": 1.2797843925211156, + "lm_loss": 0.5625, + "step": 6648, + "vm_loss": 0.18 + }, + { + "epoch": 1.279976899198691, + "grad_norm": 3.325125241980314, + "learning_rate": 6.0596766138345176e-06, + "loss": 1.1034, + "step": 6649 + }, + { + "epoch": 1.2801694058762663, + "grad_norm": 3.362687132237075, + "learning_rate": 6.056811157280333e-06, + "loss": 1.2115, + "step": 6650 + }, + { + "epoch": 1.2803619125538417, + "grad_norm": 3.267660427938564, + "learning_rate": 6.053946084054554e-06, + "loss": 1.1128, + "step": 6651 + }, + { + "epoch": 1.280554419231417, + "grad_norm": 3.3618210873818697, + "learning_rate": 6.051081394435702e-06, + "loss": 1.1879, + "step": 6652 + }, + { + "epoch": 1.2807469259089925, + "grad_norm": 3.4180425164302513, + "learning_rate": 6.048217088702258e-06, + "loss": 1.1624, + "step": 6653 + }, + { + "epoch": 1.280939432586568, + "grad_norm": 3.3554021744807034, + "learning_rate": 6.045353167132671e-06, + "loss": 1.1347, + "step": 6654 + }, + { + "epoch": 1.2811319392641431, + "grad_norm": 3.1485502998727277, + "learning_rate": 6.042489630005352e-06, + "loss": 1.1071, + "step": 6655 + }, + { + "epoch": 1.2813244459417186, + "grad_norm": 3.22438501287988, + "learning_rate": 6.039626477598673e-06, + "loss": 1.0869, + "step": 6656 + }, + { + "epoch": 1.2813244459417186, + "lm_loss": 0.9497, + "step": 6656, + "vm_loss": 0.131 + }, + { + "epoch": 1.2813244459417186, + "lm_loss": 0.762, + "step": 6656, + "vm_loss": 0.1571 + }, + { + "epoch": 1.2813244459417186, + "lm_loss": 1.0301, + "step": 6656, + "vm_loss": 0.2305 + }, + { + "epoch": 1.2813244459417186, + "lm_loss": 0.7932, + "step": 6656, + "vm_loss": 0.1871 + }, + { + "epoch": 1.2813244459417186, + "lm_loss": 1.4329, + "step": 6656, + "vm_loss": 0.1318 + }, + { + "epoch": 1.2813244459417186, + "lm_loss": 0.8006, + "step": 6656, + "vm_loss": 0.151 + }, + { + "epoch": 1.2813244459417186, + "lm_loss": 1.0734, + "step": 6656, + "vm_loss": 0.1603 + }, + { + "epoch": 1.2813244459417186, + "lm_loss": 0.8487, + "step": 6656, + "vm_loss": 0.2176 + }, + { + "epoch": 1.281516952619294, + "grad_norm": 3.3263745981245023, + "learning_rate": 6.036763710190968e-06, + "loss": 1.1661, + "step": 6657 + }, + { + "epoch": 1.2817094592968694, + "grad_norm": 3.3593013411646417, + "learning_rate": 6.033901328060532e-06, + "loss": 1.1653, + "step": 6658 + }, + { + "epoch": 1.2819019659744448, + "grad_norm": 3.113484541583496, + "learning_rate": 6.031039331485631e-06, + "loss": 1.0612, + "step": 6659 + }, + { + "epoch": 1.28209447265202, + "grad_norm": 3.1606516864254615, + "learning_rate": 6.028177720744486e-06, + "loss": 1.1231, + "step": 6660 + }, + { + "epoch": 1.2822869793295955, + "grad_norm": 3.257141326529588, + "learning_rate": 6.025316496115276e-06, + "loss": 1.1159, + "step": 6661 + }, + { + "epoch": 1.2824794860071709, + "grad_norm": 3.290057076531557, + "learning_rate": 6.022455657876158e-06, + "loss": 1.1142, + "step": 6662 + }, + { + "epoch": 1.2826719926847463, + "grad_norm": 3.377804369433278, + "learning_rate": 6.019595206305237e-06, + "loss": 1.1586, + "step": 6663 + }, + { + "epoch": 1.2828644993623217, + "grad_norm": 3.2187176409292833, + "learning_rate": 6.016735141680587e-06, + "loss": 1.0753, + "step": 6664 + }, + { + "epoch": 1.2828644993623217, + "lm_loss": 0.8042, + "step": 6664, + "vm_loss": 0.1783 + }, + { + "epoch": 1.2828644993623217, + "lm_loss": 0.8567, + "step": 6664, + "vm_loss": 0.1743 + }, + { + "epoch": 1.2828644993623217, + "lm_loss": 0.7168, + "step": 6664, + "vm_loss": 0.1493 + }, + { + "epoch": 1.2828644993623217, + "lm_loss": 0.6396, + "step": 6664, + "vm_loss": 0.1364 + }, + { + "epoch": 1.2828644993623217, + "lm_loss": 0.8455, + "step": 6664, + "vm_loss": 0.1763 + }, + { + "epoch": 1.2828644993623217, + "lm_loss": 1.1142, + "step": 6664, + "vm_loss": 0.1333 + }, + { + "epoch": 1.2828644993623217, + "lm_loss": 0.8613, + "step": 6664, + "vm_loss": 0.1765 + }, + { + "epoch": 1.2828644993623217, + "lm_loss": 1.1116, + "step": 6664, + "vm_loss": 0.15 + }, + { + "epoch": 1.283057006039897, + "grad_norm": 3.2248676846011546, + "learning_rate": 6.013875464280239e-06, + "loss": 1.0863, + "step": 6665 + }, + { + "epoch": 1.2832495127174723, + "grad_norm": 3.1852668351607805, + "learning_rate": 6.011016174382196e-06, + "loss": 1.075, + "step": 6666 + }, + { + "epoch": 1.2834420193950478, + "grad_norm": 3.4590980045580104, + "learning_rate": 6.008157272264415e-06, + "loss": 1.2116, + "step": 6667 + }, + { + "epoch": 1.2836345260726232, + "grad_norm": 3.294351295793464, + "learning_rate": 6.005298758204814e-06, + "loss": 1.1388, + "step": 6668 + }, + { + "epoch": 1.2838270327501986, + "grad_norm": 3.162108651038212, + "learning_rate": 6.002440632481285e-06, + "loss": 1.1085, + "step": 6669 + }, + { + "epoch": 1.2840195394277738, + "grad_norm": 3.3218715312943092, + "learning_rate": 5.999582895371669e-06, + "loss": 1.1459, + "step": 6670 + }, + { + "epoch": 1.2842120461053492, + "grad_norm": 3.2533756587593303, + "learning_rate": 5.996725547153773e-06, + "loss": 1.1284, + "step": 6671 + }, + { + "epoch": 1.2844045527829246, + "grad_norm": 3.238942824356261, + "learning_rate": 5.993868588105375e-06, + "loss": 1.1226, + "step": 6672 + }, + { + "epoch": 1.2844045527829246, + "lm_loss": 0.7356, + "step": 6672, + "vm_loss": 0.2005 + }, + { + "epoch": 1.2844045527829246, + "lm_loss": 1.0102, + "step": 6672, + "vm_loss": 0.1528 + }, + { + "epoch": 1.2844045527829246, + "lm_loss": 1.9887, + "step": 6672, + "vm_loss": 0.0918 + }, + { + "epoch": 1.2844045527829246, + "lm_loss": 0.9675, + "step": 6672, + "vm_loss": 0.1612 + }, + { + "epoch": 1.2844045527829246, + "lm_loss": 0.7122, + "step": 6672, + "vm_loss": 0.117 + }, + { + "epoch": 1.2844045527829246, + "lm_loss": 0.938, + "step": 6672, + "vm_loss": 0.1773 + }, + { + "epoch": 1.2844045527829246, + "lm_loss": 0.8966, + "step": 6672, + "vm_loss": 0.1564 + }, + { + "epoch": 1.2844045527829246, + "lm_loss": 1.2908, + "step": 6672, + "vm_loss": 0.1326 + }, + { + "epoch": 1.2845970594605, + "grad_norm": 3.2062820040120363, + "learning_rate": 5.991012018504201e-06, + "loss": 1.0981, + "step": 6673 + }, + { + "epoch": 1.2847895661380755, + "grad_norm": 3.244400001080694, + "learning_rate": 5.988155838627951e-06, + "loss": 1.1547, + "step": 6674 + }, + { + "epoch": 1.2849820728156507, + "grad_norm": 3.1113397184566853, + "learning_rate": 5.985300048754278e-06, + "loss": 1.052, + "step": 6675 + }, + { + "epoch": 1.285174579493226, + "grad_norm": 3.2833974257504477, + "learning_rate": 5.9824446491608035e-06, + "loss": 1.1253, + "step": 6676 + }, + { + "epoch": 1.2853670861708015, + "grad_norm": 3.152368089179338, + "learning_rate": 5.979589640125109e-06, + "loss": 1.0917, + "step": 6677 + }, + { + "epoch": 1.285559592848377, + "grad_norm": 3.2092598257476683, + "learning_rate": 5.976735021924735e-06, + "loss": 1.0606, + "step": 6678 + }, + { + "epoch": 1.2857520995259524, + "grad_norm": 3.477398016078983, + "learning_rate": 5.973880794837194e-06, + "loss": 1.176, + "step": 6679 + }, + { + "epoch": 1.2859446062035276, + "grad_norm": 3.35540579708099, + "learning_rate": 5.971026959139947e-06, + "loss": 1.1069, + "step": 6680 + }, + { + "epoch": 1.2859446062035276, + "lm_loss": 1.0275, + "step": 6680, + "vm_loss": 0.1519 + }, + { + "epoch": 1.2859446062035276, + "lm_loss": 0.9473, + "step": 6680, + "vm_loss": 0.2029 + }, + { + "epoch": 1.2859446062035276, + "lm_loss": 1.066, + "step": 6680, + "vm_loss": 0.1364 + }, + { + "epoch": 1.2859446062035276, + "lm_loss": 0.7842, + "step": 6680, + "vm_loss": 0.1972 + }, + { + "epoch": 1.2859446062035276, + "lm_loss": 0.6261, + "step": 6680, + "vm_loss": 0.1729 + }, + { + "epoch": 1.2859446062035276, + "lm_loss": 0.5599, + "step": 6680, + "vm_loss": 0.1546 + }, + { + "epoch": 1.2859446062035276, + "lm_loss": 1.114, + "step": 6680, + "vm_loss": 0.1875 + }, + { + "epoch": 1.2859446062035276, + "lm_loss": 0.9851, + "step": 6680, + "vm_loss": 0.1824 + }, + { + "epoch": 1.286137112881103, + "grad_norm": 3.342037762460158, + "learning_rate": 5.968173515110427e-06, + "loss": 1.1149, + "step": 6681 + }, + { + "epoch": 1.2863296195586784, + "grad_norm": 3.1508544719142715, + "learning_rate": 5.965320463026019e-06, + "loss": 1.0458, + "step": 6682 + }, + { + "epoch": 1.2865221262362538, + "grad_norm": 3.277391870723528, + "learning_rate": 5.962467803164083e-06, + "loss": 1.0688, + "step": 6683 + }, + { + "epoch": 1.2867146329138293, + "grad_norm": 3.173183626546501, + "learning_rate": 5.959615535801933e-06, + "loss": 1.1194, + "step": 6684 + }, + { + "epoch": 1.2869071395914045, + "grad_norm": 3.207537977150246, + "learning_rate": 5.956763661216841e-06, + "loss": 1.0952, + "step": 6685 + }, + { + "epoch": 1.2870996462689799, + "grad_norm": 3.33364042366414, + "learning_rate": 5.953912179686052e-06, + "loss": 1.1525, + "step": 6686 + }, + { + "epoch": 1.2872921529465553, + "grad_norm": 3.2416636686667455, + "learning_rate": 5.951061091486765e-06, + "loss": 1.1358, + "step": 6687 + }, + { + "epoch": 1.2874846596241307, + "grad_norm": 3.166505654927115, + "learning_rate": 5.948210396896137e-06, + "loss": 1.0706, + "step": 6688 + }, + { + "epoch": 1.2874846596241307, + "lm_loss": 1.3101, + "step": 6688, + "vm_loss": 0.1704 + }, + { + "epoch": 1.2874846596241307, + "lm_loss": 0.8622, + "step": 6688, + "vm_loss": 0.1385 + }, + { + "epoch": 1.2874846596241307, + "lm_loss": 0.893, + "step": 6688, + "vm_loss": 0.1275 + }, + { + "epoch": 1.2874846596241307, + "lm_loss": 0.8109, + "step": 6688, + "vm_loss": 0.1127 + }, + { + "epoch": 1.2874846596241307, + "lm_loss": 0.82, + "step": 6688, + "vm_loss": 0.1628 + }, + { + "epoch": 1.2874846596241307, + "lm_loss": 0.951, + "step": 6688, + "vm_loss": 0.1584 + }, + { + "epoch": 1.2874846596241307, + "lm_loss": 1.0102, + "step": 6688, + "vm_loss": 0.1971 + }, + { + "epoch": 1.2874846596241307, + "lm_loss": 0.7994, + "step": 6688, + "vm_loss": 0.1779 + }, + { + "epoch": 1.2876771663017061, + "grad_norm": 3.1673670985766935, + "learning_rate": 5.9453600961913e-06, + "loss": 1.0801, + "step": 6689 + }, + { + "epoch": 1.2878696729792813, + "grad_norm": 3.243468298626356, + "learning_rate": 5.942510189649333e-06, + "loss": 1.1216, + "step": 6690 + }, + { + "epoch": 1.2880621796568568, + "grad_norm": 3.302040344549393, + "learning_rate": 5.939660677547288e-06, + "loss": 1.1427, + "step": 6691 + }, + { + "epoch": 1.2882546863344322, + "grad_norm": 3.241573382388179, + "learning_rate": 5.936811560162169e-06, + "loss": 1.0603, + "step": 6692 + }, + { + "epoch": 1.2884471930120076, + "grad_norm": 3.214616070686676, + "learning_rate": 5.9339628377709525e-06, + "loss": 1.0731, + "step": 6693 + }, + { + "epoch": 1.288639699689583, + "grad_norm": 3.287339342192405, + "learning_rate": 5.931114510650566e-06, + "loss": 1.1013, + "step": 6694 + }, + { + "epoch": 1.2888322063671582, + "grad_norm": 3.356604233703808, + "learning_rate": 5.928266579077906e-06, + "loss": 1.1523, + "step": 6695 + }, + { + "epoch": 1.2890247130447339, + "grad_norm": 3.325152500691127, + "learning_rate": 5.925419043329827e-06, + "loss": 1.1452, + "step": 6696 + }, + { + "epoch": 1.2890247130447339, + "lm_loss": 0.7924, + "step": 6696, + "vm_loss": 0.1722 + }, + { + "epoch": 1.2890247130447339, + "lm_loss": 1.1074, + "step": 6696, + "vm_loss": 0.1332 + }, + { + "epoch": 1.2890247130447339, + "lm_loss": 0.6835, + "step": 6696, + "vm_loss": 0.1293 + }, + { + "epoch": 1.2890247130447339, + "lm_loss": 0.813, + "step": 6696, + "vm_loss": 0.1884 + }, + { + "epoch": 1.2890247130447339, + "lm_loss": 0.6944, + "step": 6696, + "vm_loss": 0.2483 + }, + { + "epoch": 1.2890247130447339, + "lm_loss": 0.7973, + "step": 6696, + "vm_loss": 0.1604 + }, + { + "epoch": 1.2890247130447339, + "lm_loss": 0.6998, + "step": 6696, + "vm_loss": 0.145 + }, + { + "epoch": 1.2890247130447339, + "lm_loss": 1.7891, + "step": 6696, + "vm_loss": 0.2293 + }, + { + "epoch": 1.289217219722309, + "grad_norm": 3.2846629066686273, + "learning_rate": 5.922571903683147e-06, + "loss": 1.1362, + "step": 6697 + }, + { + "epoch": 1.2894097263998845, + "grad_norm": 3.3136833032332187, + "learning_rate": 5.919725160414645e-06, + "loss": 1.1666, + "step": 6698 + }, + { + "epoch": 1.28960223307746, + "grad_norm": 3.1327110991150757, + "learning_rate": 5.9168788138010545e-06, + "loss": 1.0954, + "step": 6699 + }, + { + "epoch": 1.2897947397550353, + "grad_norm": 3.2799814769159092, + "learning_rate": 5.9140328641190855e-06, + "loss": 1.1077, + "step": 6700 + }, + { + "epoch": 1.2899872464326108, + "grad_norm": 3.253586257257136, + "learning_rate": 5.911187311645398e-06, + "loss": 1.153, + "step": 6701 + }, + { + "epoch": 1.290179753110186, + "grad_norm": 3.219647443264257, + "learning_rate": 5.9083421566566105e-06, + "loss": 1.1153, + "step": 6702 + }, + { + "epoch": 1.2903722597877614, + "grad_norm": 3.2309199762763443, + "learning_rate": 5.905497399429316e-06, + "loss": 1.1125, + "step": 6703 + }, + { + "epoch": 1.2905647664653368, + "grad_norm": 3.1445968019228556, + "learning_rate": 5.902653040240059e-06, + "loss": 1.0809, + "step": 6704 + }, + { + "epoch": 1.2905647664653368, + "lm_loss": 1.1049, + "step": 6704, + "vm_loss": 0.1625 + }, + { + "epoch": 1.2905647664653368, + "lm_loss": 0.8855, + "step": 6704, + "vm_loss": 0.1999 + }, + { + "epoch": 1.2905647664653368, + "lm_loss": 0.6965, + "step": 6704, + "vm_loss": 0.1079 + }, + { + "epoch": 1.2905647664653368, + "lm_loss": 0.4566, + "step": 6704, + "vm_loss": 0.1237 + }, + { + "epoch": 1.2905647664653368, + "lm_loss": 1.2037, + "step": 6704, + "vm_loss": 0.1634 + }, + { + "epoch": 1.2905647664653368, + "lm_loss": 1.0137, + "step": 6704, + "vm_loss": 0.2202 + }, + { + "epoch": 1.2905647664653368, + "lm_loss": 0.9589, + "step": 6704, + "vm_loss": 0.1796 + }, + { + "epoch": 1.2905647664653368, + "lm_loss": 0.7211, + "step": 6704, + "vm_loss": 0.1463 + }, + { + "epoch": 1.2907572731429122, + "grad_norm": 3.163381937510988, + "learning_rate": 5.8998090793653425e-06, + "loss": 1.0859, + "step": 6705 + }, + { + "epoch": 1.2909497798204876, + "grad_norm": 3.234409170746116, + "learning_rate": 5.896965517081645e-06, + "loss": 1.1419, + "step": 6706 + }, + { + "epoch": 1.2911422864980628, + "grad_norm": 3.25906867065638, + "learning_rate": 5.894122353665393e-06, + "loss": 1.0985, + "step": 6707 + }, + { + "epoch": 1.2913347931756383, + "grad_norm": 3.225383743928816, + "learning_rate": 5.891279589392975e-06, + "loss": 1.0706, + "step": 6708 + }, + { + "epoch": 1.2915272998532137, + "grad_norm": 3.309608742120105, + "learning_rate": 5.888437224540745e-06, + "loss": 1.0958, + "step": 6709 + }, + { + "epoch": 1.291719806530789, + "grad_norm": 3.3990285576293666, + "learning_rate": 5.885595259385022e-06, + "loss": 1.1407, + "step": 6710 + }, + { + "epoch": 1.2919123132083645, + "grad_norm": 3.2756592560484794, + "learning_rate": 5.882753694202079e-06, + "loss": 1.0561, + "step": 6711 + }, + { + "epoch": 1.2921048198859397, + "grad_norm": 3.3420990240590873, + "learning_rate": 5.879912529268148e-06, + "loss": 1.0662, + "step": 6712 + }, + { + "epoch": 1.2921048198859397, + "lm_loss": 1.0454, + "step": 6712, + "vm_loss": 0.1499 + }, + { + "epoch": 1.2921048198859397, + "lm_loss": 1.0143, + "step": 6712, + "vm_loss": 0.1177 + }, + { + "epoch": 1.2921048198859397, + "lm_loss": 1.5339, + "step": 6712, + "vm_loss": 0.1968 + }, + { + "epoch": 1.2921048198859397, + "lm_loss": 0.9684, + "step": 6712, + "vm_loss": 0.1549 + }, + { + "epoch": 1.2921048198859397, + "lm_loss": 0.6806, + "step": 6712, + "vm_loss": 0.1715 + }, + { + "epoch": 1.2921048198859397, + "lm_loss": 0.5913, + "step": 6712, + "vm_loss": 0.1579 + }, + { + "epoch": 1.2921048198859397, + "lm_loss": 0.819, + "step": 6712, + "vm_loss": 0.1792 + }, + { + "epoch": 1.2921048198859397, + "lm_loss": 0.8451, + "step": 6712, + "vm_loss": 0.1279 + }, + { + "epoch": 1.2922973265635151, + "grad_norm": 3.2263486408927813, + "learning_rate": 5.877071764859434e-06, + "loss": 1.0681, + "step": 6713 + }, + { + "epoch": 1.2924898332410906, + "grad_norm": 3.270108977460173, + "learning_rate": 5.874231401252091e-06, + "loss": 1.1024, + "step": 6714 + }, + { + "epoch": 1.292682339918666, + "grad_norm": 3.1513460724551288, + "learning_rate": 5.87139143872224e-06, + "loss": 1.0754, + "step": 6715 + }, + { + "epoch": 1.2928748465962414, + "grad_norm": 3.2395103295286867, + "learning_rate": 5.868551877545962e-06, + "loss": 1.1047, + "step": 6716 + }, + { + "epoch": 1.2930673532738166, + "grad_norm": 3.247469995101324, + "learning_rate": 5.865712717999297e-06, + "loss": 1.122, + "step": 6717 + }, + { + "epoch": 1.293259859951392, + "grad_norm": 3.2125923890321473, + "learning_rate": 5.862873960358251e-06, + "loss": 1.0786, + "step": 6718 + }, + { + "epoch": 1.2934523666289675, + "grad_norm": 3.2457073499096922, + "learning_rate": 5.86003560489878e-06, + "loss": 1.0713, + "step": 6719 + }, + { + "epoch": 1.2936448733065429, + "grad_norm": 3.3374953157867826, + "learning_rate": 5.857197651896819e-06, + "loss": 1.1625, + "step": 6720 + }, + { + "epoch": 1.2936448733065429, + "lm_loss": 0.8841, + "step": 6720, + "vm_loss": 0.1634 + }, + { + "epoch": 1.2936448733065429, + "lm_loss": 0.9326, + "step": 6720, + "vm_loss": 0.1375 + }, + { + "epoch": 1.2936448733065429, + "lm_loss": 1.6497, + "step": 6720, + "vm_loss": 0.1567 + }, + { + "epoch": 1.2936448733065429, + "lm_loss": 0.9635, + "step": 6720, + "vm_loss": 0.2087 + }, + { + "epoch": 1.2936448733065429, + "lm_loss": 0.6027, + "step": 6720, + "vm_loss": 0.154 + }, + { + "epoch": 1.2936448733065429, + "lm_loss": 1.0785, + "step": 6720, + "vm_loss": 0.1474 + }, + { + "epoch": 1.2936448733065429, + "lm_loss": 0.5596, + "step": 6720, + "vm_loss": 0.1802 + }, + { + "epoch": 1.2936448733065429, + "lm_loss": 1.061, + "step": 6720, + "vm_loss": 0.2327 + }, + { + "epoch": 1.2938373799841183, + "grad_norm": 3.297782511910469, + "learning_rate": 5.854360101628247e-06, + "loss": 1.1376, + "step": 6721 + }, + { + "epoch": 1.2940298866616935, + "grad_norm": 3.3118415906323686, + "learning_rate": 5.851522954368909e-06, + "loss": 1.1761, + "step": 6722 + }, + { + "epoch": 1.294222393339269, + "grad_norm": 3.2205943406359, + "learning_rate": 5.8486862103946176e-06, + "loss": 1.1086, + "step": 6723 + }, + { + "epoch": 1.2944149000168443, + "grad_norm": 3.175018010738224, + "learning_rate": 5.845849869981137e-06, + "loss": 1.0791, + "step": 6724 + }, + { + "epoch": 1.2946074066944198, + "grad_norm": 3.1853281329942096, + "learning_rate": 5.843013933404197e-06, + "loss": 1.0898, + "step": 6725 + }, + { + "epoch": 1.2947999133719952, + "grad_norm": 3.18817508751911, + "learning_rate": 5.8401784009394815e-06, + "loss": 1.0988, + "step": 6726 + }, + { + "epoch": 1.2949924200495704, + "grad_norm": 3.028986933563594, + "learning_rate": 5.837343272862649e-06, + "loss": 1.0568, + "step": 6727 + }, + { + "epoch": 1.2951849267271458, + "grad_norm": 3.286719058039817, + "learning_rate": 5.834508549449305e-06, + "loss": 1.0896, + "step": 6728 + }, + { + "epoch": 1.2951849267271458, + "lm_loss": 0.8548, + "step": 6728, + "vm_loss": 0.1542 + }, + { + "epoch": 1.2951849267271458, + "lm_loss": 0.6273, + "step": 6728, + "vm_loss": 0.1702 + }, + { + "epoch": 1.2951849267271458, + "lm_loss": 0.8368, + "step": 6728, + "vm_loss": 0.2093 + }, + { + "epoch": 1.2951849267271458, + "lm_loss": 1.0741, + "step": 6728, + "vm_loss": 0.1265 + }, + { + "epoch": 1.2951849267271458, + "lm_loss": 0.8913, + "step": 6728, + "vm_loss": 0.2136 + }, + { + "epoch": 1.2951849267271458, + "lm_loss": 1.2988, + "step": 6728, + "vm_loss": 0.1968 + }, + { + "epoch": 1.2951849267271458, + "lm_loss": 0.9203, + "step": 6728, + "vm_loss": 0.1214 + }, + { + "epoch": 1.2951849267271458, + "lm_loss": 0.8856, + "step": 6728, + "vm_loss": 0.158 + }, + { + "epoch": 1.2953774334047212, + "grad_norm": 3.3178976039301555, + "learning_rate": 5.831674230975021e-06, + "loss": 1.1736, + "step": 6729 + }, + { + "epoch": 1.2955699400822966, + "grad_norm": 3.2076307799395494, + "learning_rate": 5.828840317715331e-06, + "loss": 1.1022, + "step": 6730 + }, + { + "epoch": 1.295762446759872, + "grad_norm": 3.2225902895843124, + "learning_rate": 5.82600680994573e-06, + "loss": 1.1005, + "step": 6731 + }, + { + "epoch": 1.2959549534374473, + "grad_norm": 3.1994529657963153, + "learning_rate": 5.823173707941665e-06, + "loss": 1.0855, + "step": 6732 + }, + { + "epoch": 1.2961474601150227, + "grad_norm": 3.2928044386054993, + "learning_rate": 5.8203410119785515e-06, + "loss": 1.1189, + "step": 6733 + }, + { + "epoch": 1.296339966792598, + "grad_norm": 3.1860256568116854, + "learning_rate": 5.8175087223317674e-06, + "loss": 1.0791, + "step": 6734 + }, + { + "epoch": 1.2965324734701735, + "grad_norm": 3.3695415782671314, + "learning_rate": 5.814676839276648e-06, + "loss": 1.0887, + "step": 6735 + }, + { + "epoch": 1.296724980147749, + "grad_norm": 3.2316577594511116, + "learning_rate": 5.811845363088477e-06, + "loss": 1.07, + "step": 6736 + }, + { + "epoch": 1.296724980147749, + "lm_loss": 1.2677, + "step": 6736, + "vm_loss": 0.1752 + }, + { + "epoch": 1.296724980147749, + "lm_loss": 0.5743, + "step": 6736, + "vm_loss": 0.1737 + }, + { + "epoch": 1.296724980147749, + "lm_loss": 0.8285, + "step": 6736, + "vm_loss": 0.1078 + }, + { + "epoch": 1.296724980147749, + "lm_loss": 0.8711, + "step": 6736, + "vm_loss": 0.2158 + }, + { + "epoch": 1.296724980147749, + "lm_loss": 0.66, + "step": 6736, + "vm_loss": 0.1026 + }, + { + "epoch": 1.296724980147749, + "lm_loss": 0.9126, + "step": 6736, + "vm_loss": 0.1378 + }, + { + "epoch": 1.296724980147749, + "lm_loss": 1.3324, + "step": 6736, + "vm_loss": 0.1985 + }, + { + "epoch": 1.296724980147749, + "lm_loss": 1.1221, + "step": 6736, + "vm_loss": 0.1634 + }, + { + "epoch": 1.2969174868253242, + "grad_norm": 3.1988470027673, + "learning_rate": 5.809014294042528e-06, + "loss": 1.0707, + "step": 6737 + }, + { + "epoch": 1.2971099935028996, + "grad_norm": 3.2796317783655953, + "learning_rate": 5.806183632414003e-06, + "loss": 1.1394, + "step": 6738 + }, + { + "epoch": 1.297302500180475, + "grad_norm": 3.345223345634217, + "learning_rate": 5.8033533784780885e-06, + "loss": 1.1356, + "step": 6739 + }, + { + "epoch": 1.2974950068580504, + "grad_norm": 3.3762879257540472, + "learning_rate": 5.800523532509907e-06, + "loss": 1.1424, + "step": 6740 + }, + { + "epoch": 1.2976875135356258, + "grad_norm": 3.315179752645858, + "learning_rate": 5.797694094784574e-06, + "loss": 1.1435, + "step": 6741 + }, + { + "epoch": 1.297880020213201, + "grad_norm": 3.155546932446427, + "learning_rate": 5.794865065577134e-06, + "loss": 1.052, + "step": 6742 + }, + { + "epoch": 1.2980725268907765, + "grad_norm": 3.1691607236808803, + "learning_rate": 5.79203644516261e-06, + "loss": 1.1403, + "step": 6743 + }, + { + "epoch": 1.2982650335683519, + "grad_norm": 3.194397428183179, + "learning_rate": 5.7892082338159774e-06, + "loss": 1.121, + "step": 6744 + }, + { + "epoch": 1.2982650335683519, + "lm_loss": 0.8229, + "step": 6744, + "vm_loss": 0.1469 + }, + { + "epoch": 1.2982650335683519, + "lm_loss": 0.6625, + "step": 6744, + "vm_loss": 0.1533 + }, + { + "epoch": 1.2982650335683519, + "lm_loss": 1.0013, + "step": 6744, + "vm_loss": 0.1387 + }, + { + "epoch": 1.2982650335683519, + "lm_loss": 0.5418, + "step": 6744, + "vm_loss": 0.1578 + }, + { + "epoch": 1.2982650335683519, + "lm_loss": 1.0268, + "step": 6744, + "vm_loss": 0.219 + }, + { + "epoch": 1.2982650335683519, + "lm_loss": 1.1461, + "step": 6744, + "vm_loss": 0.1539 + }, + { + "epoch": 1.2982650335683519, + "lm_loss": 0.8486, + "step": 6744, + "vm_loss": 0.1839 + }, + { + "epoch": 1.2982650335683519, + "lm_loss": 1.0587, + "step": 6744, + "vm_loss": 0.1203 + }, + { + "epoch": 1.2984575402459273, + "grad_norm": 3.070647441645855, + "learning_rate": 5.786380431812178e-06, + "loss": 1.0944, + "step": 6745 + }, + { + "epoch": 1.2986500469235027, + "grad_norm": 3.3018689465183826, + "learning_rate": 5.783553039426107e-06, + "loss": 1.1229, + "step": 6746 + }, + { + "epoch": 1.298842553601078, + "grad_norm": 3.319140900202501, + "learning_rate": 5.78072605693263e-06, + "loss": 1.1085, + "step": 6747 + }, + { + "epoch": 1.2990350602786533, + "grad_norm": 3.1612195114340134, + "learning_rate": 5.777899484606555e-06, + "loss": 1.0235, + "step": 6748 + }, + { + "epoch": 1.2992275669562288, + "grad_norm": 3.362802521212921, + "learning_rate": 5.775073322722669e-06, + "loss": 1.1054, + "step": 6749 + }, + { + "epoch": 1.2994200736338042, + "grad_norm": 3.1598349090684983, + "learning_rate": 5.7722475715557056e-06, + "loss": 1.0791, + "step": 6750 + }, + { + "epoch": 1.2996125803113796, + "grad_norm": 3.3170068075520796, + "learning_rate": 5.769422231380369e-06, + "loss": 1.0847, + "step": 6751 + }, + { + "epoch": 1.2998050869889548, + "grad_norm": 3.3054676873467272, + "learning_rate": 5.766597302471319e-06, + "loss": 1.1031, + "step": 6752 + }, + { + "epoch": 1.2998050869889548, + "lm_loss": 0.9478, + "step": 6752, + "vm_loss": 0.1624 + }, + { + "epoch": 1.2998050869889548, + "lm_loss": 0.9842, + "step": 6752, + "vm_loss": 0.1448 + }, + { + "epoch": 1.2998050869889548, + "lm_loss": 1.1728, + "step": 6752, + "vm_loss": 0.1858 + }, + { + "epoch": 1.2998050869889548, + "lm_loss": 0.643, + "step": 6752, + "vm_loss": 0.1436 + }, + { + "epoch": 1.2998050869889548, + "lm_loss": 0.8266, + "step": 6752, + "vm_loss": 0.1692 + }, + { + "epoch": 1.2998050869889548, + "lm_loss": 0.5803, + "step": 6752, + "vm_loss": 0.1718 + }, + { + "epoch": 1.2998050869889548, + "lm_loss": 0.5385, + "step": 6752, + "vm_loss": 0.162 + }, + { + "epoch": 1.2998050869889548, + "lm_loss": 1.4366, + "step": 6752, + "vm_loss": 0.1162 + }, + { + "epoch": 1.2999975936665302, + "grad_norm": 3.2693039802953687, + "learning_rate": 5.763772785103166e-06, + "loss": 1.0795, + "step": 6753 + }, + { + "epoch": 1.3001901003441056, + "grad_norm": 3.453954912595241, + "learning_rate": 5.760948679550502e-06, + "loss": 1.1465, + "step": 6754 + }, + { + "epoch": 1.300382607021681, + "grad_norm": 3.210306459377581, + "learning_rate": 5.7581249860878566e-06, + "loss": 1.0641, + "step": 6755 + }, + { + "epoch": 1.3005751136992565, + "grad_norm": 3.2652307946047734, + "learning_rate": 5.755301704989735e-06, + "loss": 1.1324, + "step": 6756 + }, + { + "epoch": 1.3007676203768317, + "grad_norm": 3.337089800878578, + "learning_rate": 5.752478836530586e-06, + "loss": 1.1125, + "step": 6757 + }, + { + "epoch": 1.3009601270544073, + "grad_norm": 3.2797625564655655, + "learning_rate": 5.7496563809848434e-06, + "loss": 1.1189, + "step": 6758 + }, + { + "epoch": 1.3011526337319825, + "grad_norm": 3.139538214615867, + "learning_rate": 5.746834338626874e-06, + "loss": 1.0742, + "step": 6759 + }, + { + "epoch": 1.301345140409558, + "grad_norm": 3.221486001443372, + "learning_rate": 5.7440127097310214e-06, + "loss": 1.0935, + "step": 6760 + }, + { + "epoch": 1.301345140409558, + "lm_loss": 1.3481, + "step": 6760, + "vm_loss": 0.2456 + }, + { + "epoch": 1.301345140409558, + "lm_loss": 0.9318, + "step": 6760, + "vm_loss": 0.1649 + }, + { + "epoch": 1.301345140409558, + "lm_loss": 1.2141, + "step": 6760, + "vm_loss": 0.1781 + }, + { + "epoch": 1.301345140409558, + "lm_loss": 1.0681, + "step": 6760, + "vm_loss": 0.1586 + }, + { + "epoch": 1.301345140409558, + "lm_loss": 0.7319, + "step": 6760, + "vm_loss": 0.0679 + }, + { + "epoch": 1.301345140409558, + "lm_loss": 0.8683, + "step": 6760, + "vm_loss": 0.248 + }, + { + "epoch": 1.301345140409558, + "lm_loss": 1.4273, + "step": 6760, + "vm_loss": 0.1682 + }, + { + "epoch": 1.301345140409558, + "lm_loss": 0.8325, + "step": 6760, + "vm_loss": 0.1419 + }, + { + "epoch": 1.3015376470871334, + "grad_norm": 3.1533545710551114, + "learning_rate": 5.741191494571583e-06, + "loss": 1.1233, + "step": 6761 + }, + { + "epoch": 1.3017301537647086, + "grad_norm": 3.272179948393788, + "learning_rate": 5.738370693422822e-06, + "loss": 1.0779, + "step": 6762 + }, + { + "epoch": 1.3019226604422842, + "grad_norm": 3.214616809835346, + "learning_rate": 5.735550306558943e-06, + "loss": 1.0909, + "step": 6763 + }, + { + "epoch": 1.3021151671198594, + "grad_norm": 3.3897524145909483, + "learning_rate": 5.7327303342541415e-06, + "loss": 1.083, + "step": 6764 + }, + { + "epoch": 1.3023076737974348, + "grad_norm": 3.240392478199875, + "learning_rate": 5.729910776782542e-06, + "loss": 1.1251, + "step": 6765 + }, + { + "epoch": 1.3025001804750103, + "grad_norm": 3.309930806180067, + "learning_rate": 5.727091634418243e-06, + "loss": 1.1355, + "step": 6766 + }, + { + "epoch": 1.3026926871525857, + "grad_norm": 3.1743223658690334, + "learning_rate": 5.724272907435307e-06, + "loss": 1.0664, + "step": 6767 + }, + { + "epoch": 1.302885193830161, + "grad_norm": 3.3237043681773386, + "learning_rate": 5.721454596107747e-06, + "loss": 1.0529, + "step": 6768 + }, + { + "epoch": 1.302885193830161, + "lm_loss": 1.0997, + "step": 6768, + "vm_loss": 0.1915 + }, + { + "epoch": 1.302885193830161, + "lm_loss": 1.0425, + "step": 6768, + "vm_loss": 0.1671 + }, + { + "epoch": 1.302885193830161, + "lm_loss": 1.4424, + "step": 6768, + "vm_loss": 0.1519 + }, + { + "epoch": 1.302885193830161, + "lm_loss": 0.7211, + "step": 6768, + "vm_loss": 0.1668 + }, + { + "epoch": 1.302885193830161, + "lm_loss": 1.0533, + "step": 6768, + "vm_loss": 0.125 + }, + { + "epoch": 1.302885193830161, + "lm_loss": 0.6738, + "step": 6768, + "vm_loss": 0.14 + }, + { + "epoch": 1.302885193830161, + "lm_loss": 1.3093, + "step": 6768, + "vm_loss": 0.0997 + }, + { + "epoch": 1.302885193830161, + "lm_loss": 0.705, + "step": 6768, + "vm_loss": 0.1672 + }, + { + "epoch": 1.3030777005077363, + "grad_norm": 3.3347147320347985, + "learning_rate": 5.718636700709538e-06, + "loss": 1.1053, + "step": 6769 + }, + { + "epoch": 1.3032702071853117, + "grad_norm": 3.4127874156542446, + "learning_rate": 5.7158192215146154e-06, + "loss": 1.1561, + "step": 6770 + }, + { + "epoch": 1.3034627138628871, + "grad_norm": 3.1944048337543465, + "learning_rate": 5.713002158796874e-06, + "loss": 1.1098, + "step": 6771 + }, + { + "epoch": 1.3036552205404626, + "grad_norm": 3.2932699944967063, + "learning_rate": 5.710185512830171e-06, + "loss": 1.1432, + "step": 6772 + }, + { + "epoch": 1.303847727218038, + "grad_norm": 3.2294556580828027, + "learning_rate": 5.707369283888322e-06, + "loss": 1.0967, + "step": 6773 + }, + { + "epoch": 1.3040402338956132, + "grad_norm": 3.2082578451820116, + "learning_rate": 5.70455347224509e-06, + "loss": 1.0733, + "step": 6774 + }, + { + "epoch": 1.3042327405731886, + "grad_norm": 3.246994024496379, + "learning_rate": 5.701738078174224e-06, + "loss": 1.099, + "step": 6775 + }, + { + "epoch": 1.304425247250764, + "grad_norm": 3.2234213379107777, + "learning_rate": 5.6989231019494016e-06, + "loss": 1.1293, + "step": 6776 + }, + { + "epoch": 1.304425247250764, + "lm_loss": 0.9508, + "step": 6776, + "vm_loss": 0.1989 + }, + { + "epoch": 1.304425247250764, + "lm_loss": 1.07, + "step": 6776, + "vm_loss": 0.1682 + }, + { + "epoch": 1.304425247250764, + "lm_loss": 0.7285, + "step": 6776, + "vm_loss": 0.1753 + }, + { + "epoch": 1.304425247250764, + "lm_loss": 1.206, + "step": 6776, + "vm_loss": 0.141 + }, + { + "epoch": 1.304425247250764, + "lm_loss": 0.8687, + "step": 6776, + "vm_loss": 0.1639 + }, + { + "epoch": 1.304425247250764, + "lm_loss": 1.5568, + "step": 6776, + "vm_loss": 0.1607 + }, + { + "epoch": 1.304425247250764, + "lm_loss": 0.6493, + "step": 6776, + "vm_loss": 0.1538 + }, + { + "epoch": 1.304425247250764, + "lm_loss": 0.9439, + "step": 6776, + "vm_loss": 0.1888 + }, + { + "epoch": 1.3046177539283395, + "grad_norm": 3.3146946977391094, + "learning_rate": 5.696108543844282e-06, + "loss": 1.107, + "step": 6777 + }, + { + "epoch": 1.3048102606059149, + "grad_norm": 3.1934439487412423, + "learning_rate": 5.693294404132474e-06, + "loss": 1.0768, + "step": 6778 + }, + { + "epoch": 1.30500276728349, + "grad_norm": 3.1245360071591257, + "learning_rate": 5.690480683087554e-06, + "loss": 1.0278, + "step": 6779 + }, + { + "epoch": 1.3051952739610655, + "grad_norm": 3.4021738150462273, + "learning_rate": 5.687667380983037e-06, + "loss": 1.1556, + "step": 6780 + }, + { + "epoch": 1.305387780638641, + "grad_norm": 3.337973502226165, + "learning_rate": 5.68485449809243e-06, + "loss": 1.1215, + "step": 6781 + }, + { + "epoch": 1.3055802873162163, + "grad_norm": 3.341225219849192, + "learning_rate": 5.682042034689168e-06, + "loss": 1.1376, + "step": 6782 + }, + { + "epoch": 1.3057727939937918, + "grad_norm": 3.2288798743694858, + "learning_rate": 5.679229991046669e-06, + "loss": 1.0701, + "step": 6783 + }, + { + "epoch": 1.305965300671367, + "grad_norm": 3.201899431606587, + "learning_rate": 5.6764183674382834e-06, + "loss": 1.0944, + "step": 6784 + }, + { + "epoch": 1.305965300671367, + "lm_loss": 0.9724, + "step": 6784, + "vm_loss": 0.1806 + }, + { + "epoch": 1.305965300671367, + "lm_loss": 0.481, + "step": 6784, + "vm_loss": 0.2005 + }, + { + "epoch": 1.305965300671367, + "lm_loss": 1.2175, + "step": 6784, + "vm_loss": 0.1602 + }, + { + "epoch": 1.305965300671367, + "lm_loss": 0.9441, + "step": 6784, + "vm_loss": 0.1556 + }, + { + "epoch": 1.305965300671367, + "lm_loss": 0.8773, + "step": 6784, + "vm_loss": 0.1244 + }, + { + "epoch": 1.305965300671367, + "lm_loss": 0.8689, + "step": 6784, + "vm_loss": 0.2134 + }, + { + "epoch": 1.305965300671367, + "lm_loss": 0.907, + "step": 6784, + "vm_loss": 0.1224 + }, + { + "epoch": 1.305965300671367, + "lm_loss": 0.6336, + "step": 6784, + "vm_loss": 0.2249 + }, + { + "epoch": 1.3061578073489424, + "grad_norm": 3.2952599340064017, + "learning_rate": 5.673607164137356e-06, + "loss": 1.0991, + "step": 6785 + }, + { + "epoch": 1.3063503140265178, + "grad_norm": 3.072607409178018, + "learning_rate": 5.67079638141716e-06, + "loss": 1.0673, + "step": 6786 + }, + { + "epoch": 1.3065428207040932, + "grad_norm": 3.2593801050639857, + "learning_rate": 5.6679860195509416e-06, + "loss": 1.1151, + "step": 6787 + }, + { + "epoch": 1.3067353273816686, + "grad_norm": 3.201958699434688, + "learning_rate": 5.665176078811905e-06, + "loss": 1.0717, + "step": 6788 + }, + { + "epoch": 1.3069278340592438, + "grad_norm": 3.376825724493138, + "learning_rate": 5.662366559473212e-06, + "loss": 1.0886, + "step": 6789 + }, + { + "epoch": 1.3071203407368193, + "grad_norm": 3.1774777512029666, + "learning_rate": 5.659557461807987e-06, + "loss": 1.08, + "step": 6790 + }, + { + "epoch": 1.3073128474143947, + "grad_norm": 3.1173406672633295, + "learning_rate": 5.656748786089304e-06, + "loss": 1.0255, + "step": 6791 + }, + { + "epoch": 1.30750535409197, + "grad_norm": 3.292747526043047, + "learning_rate": 5.653940532590205e-06, + "loss": 1.1313, + "step": 6792 + }, + { + "epoch": 1.30750535409197, + "lm_loss": 1.0739, + "step": 6792, + "vm_loss": 0.1848 + }, + { + "epoch": 1.30750535409197, + "lm_loss": 1.2381, + "step": 6792, + "vm_loss": 0.1681 + }, + { + "epoch": 1.30750535409197, + "lm_loss": 0.8689, + "step": 6792, + "vm_loss": 0.1667 + }, + { + "epoch": 1.30750535409197, + "lm_loss": 0.8472, + "step": 6792, + "vm_loss": 0.1652 + }, + { + "epoch": 1.30750535409197, + "lm_loss": 1.0332, + "step": 6792, + "vm_loss": 0.1451 + }, + { + "epoch": 1.30750535409197, + "lm_loss": 0.9619, + "step": 6792, + "vm_loss": 0.1597 + }, + { + "epoch": 1.30750535409197, + "lm_loss": 0.8545, + "step": 6792, + "vm_loss": 0.1657 + }, + { + "epoch": 1.30750535409197, + "lm_loss": 0.9727, + "step": 6792, + "vm_loss": 0.1556 + }, + { + "epoch": 1.3076978607695455, + "grad_norm": 3.4114704574420416, + "learning_rate": 5.6511327015836895e-06, + "loss": 1.1314, + "step": 6793 + }, + { + "epoch": 1.3078903674471207, + "grad_norm": 3.2169803587369112, + "learning_rate": 5.648325293342714e-06, + "loss": 1.078, + "step": 6794 + }, + { + "epoch": 1.3080828741246961, + "grad_norm": 3.109197791591697, + "learning_rate": 5.645518308140192e-06, + "loss": 1.0197, + "step": 6795 + }, + { + "epoch": 1.3082753808022716, + "grad_norm": 3.2619593952993435, + "learning_rate": 5.642711746249006e-06, + "loss": 1.1091, + "step": 6796 + }, + { + "epoch": 1.308467887479847, + "grad_norm": 3.377584607874024, + "learning_rate": 5.639905607941976e-06, + "loss": 1.0918, + "step": 6797 + }, + { + "epoch": 1.3086603941574224, + "grad_norm": 3.2155422049561175, + "learning_rate": 5.637099893491911e-06, + "loss": 1.0899, + "step": 6798 + }, + { + "epoch": 1.3088529008349976, + "grad_norm": 3.2378673833520755, + "learning_rate": 5.634294603171551e-06, + "loss": 1.1037, + "step": 6799 + }, + { + "epoch": 1.309045407512573, + "grad_norm": 3.2521025421536254, + "learning_rate": 5.6314897372536115e-06, + "loss": 1.1077, + "step": 6800 + }, + { + "epoch": 1.309045407512573, + "lm_loss": 0.759, + "step": 6800, + "vm_loss": 0.1856 + }, + { + "epoch": 1.309045407512573, + "lm_loss": 0.5033, + "step": 6800, + "vm_loss": 0.1957 + }, + { + "epoch": 1.309045407512573, + "lm_loss": 0.6912, + "step": 6800, + "vm_loss": 0.188 + }, + { + "epoch": 1.309045407512573, + "lm_loss": 1.1405, + "step": 6800, + "vm_loss": 0.1922 + }, + { + "epoch": 1.309045407512573, + "lm_loss": 0.8219, + "step": 6800, + "vm_loss": 0.1068 + }, + { + "epoch": 1.309045407512573, + "lm_loss": 1.0372, + "step": 6800, + "vm_loss": 0.1626 + }, + { + "epoch": 1.309045407512573, + "lm_loss": 0.7955, + "step": 6800, + "vm_loss": 0.1498 + }, + { + "epoch": 1.309045407512573, + "lm_loss": 1.0129, + "step": 6800, + "vm_loss": 0.1516 + }, + { + "epoch": 1.3092379141901485, + "grad_norm": 3.151913377908758, + "learning_rate": 5.628685296010752e-06, + "loss": 1.0695, + "step": 6801 + }, + { + "epoch": 1.3094304208677239, + "grad_norm": 3.326215342257217, + "learning_rate": 5.625881279715615e-06, + "loss": 1.152, + "step": 6802 + }, + { + "epoch": 1.3096229275452993, + "grad_norm": 3.152581951826157, + "learning_rate": 5.6230776886407745e-06, + "loss": 1.0788, + "step": 6803 + }, + { + "epoch": 1.3098154342228745, + "grad_norm": 3.2537269140234204, + "learning_rate": 5.620274523058781e-06, + "loss": 1.0777, + "step": 6804 + }, + { + "epoch": 1.31000794090045, + "grad_norm": 3.14831837340861, + "learning_rate": 5.617471783242137e-06, + "loss": 1.0636, + "step": 6805 + }, + { + "epoch": 1.3102004475780253, + "grad_norm": 3.3643419637949425, + "learning_rate": 5.614669469463307e-06, + "loss": 1.1172, + "step": 6806 + }, + { + "epoch": 1.3103929542556008, + "grad_norm": 3.3022522755551864, + "learning_rate": 5.611867581994705e-06, + "loss": 1.1269, + "step": 6807 + }, + { + "epoch": 1.3105854609331762, + "grad_norm": 3.3039057834503116, + "learning_rate": 5.609066121108717e-06, + "loss": 1.1253, + "step": 6808 + }, + { + "epoch": 1.3105854609331762, + "lm_loss": 0.9438, + "step": 6808, + "vm_loss": 0.2584 + }, + { + "epoch": 1.3105854609331762, + "lm_loss": 0.6468, + "step": 6808, + "vm_loss": 0.1923 + }, + { + "epoch": 1.3105854609331762, + "lm_loss": 0.723, + "step": 6808, + "vm_loss": 0.1992 + }, + { + "epoch": 1.3105854609331762, + "lm_loss": 0.6605, + "step": 6808, + "vm_loss": 0.1724 + }, + { + "epoch": 1.3105854609331762, + "lm_loss": 0.7814, + "step": 6808, + "vm_loss": 0.1828 + }, + { + "epoch": 1.3105854609331762, + "lm_loss": 1.0829, + "step": 6808, + "vm_loss": 0.1483 + }, + { + "epoch": 1.3105854609331762, + "lm_loss": 1.4473, + "step": 6808, + "vm_loss": 0.1256 + }, + { + "epoch": 1.3105854609331762, + "lm_loss": 0.6278, + "step": 6808, + "vm_loss": 0.2379 + }, + { + "epoch": 1.3107779676107514, + "grad_norm": 3.1239119720757897, + "learning_rate": 5.606265087077677e-06, + "loss": 1.0447, + "step": 6809 + }, + { + "epoch": 1.3109704742883268, + "grad_norm": 3.286694789022265, + "learning_rate": 5.603464480173884e-06, + "loss": 1.1099, + "step": 6810 + }, + { + "epoch": 1.3111629809659022, + "grad_norm": 3.2785136526197416, + "learning_rate": 5.600664300669589e-06, + "loss": 1.0719, + "step": 6811 + }, + { + "epoch": 1.3113554876434776, + "grad_norm": 3.3207451556789422, + "learning_rate": 5.597864548837011e-06, + "loss": 1.0529, + "step": 6812 + }, + { + "epoch": 1.311547994321053, + "grad_norm": 3.270711211480238, + "learning_rate": 5.59506522494832e-06, + "loss": 1.129, + "step": 6813 + }, + { + "epoch": 1.3117405009986283, + "grad_norm": 3.354711886269982, + "learning_rate": 5.592266329275641e-06, + "loss": 1.0879, + "step": 6814 + }, + { + "epoch": 1.3119330076762037, + "grad_norm": 3.1895961410309415, + "learning_rate": 5.589467862091069e-06, + "loss": 1.0493, + "step": 6815 + }, + { + "epoch": 1.312125514353779, + "grad_norm": 3.2974611482813616, + "learning_rate": 5.586669823666646e-06, + "loss": 1.0905, + "step": 6816 + }, + { + "epoch": 1.312125514353779, + "lm_loss": 0.4995, + "step": 6816, + "vm_loss": 0.2086 + }, + { + "epoch": 1.312125514353779, + "lm_loss": 0.6886, + "step": 6816, + "vm_loss": 0.1549 + }, + { + "epoch": 1.312125514353779, + "lm_loss": 1.0722, + "step": 6816, + "vm_loss": 0.1521 + }, + { + "epoch": 1.312125514353779, + "lm_loss": 0.5194, + "step": 6816, + "vm_loss": 0.1361 + }, + { + "epoch": 1.312125514353779, + "lm_loss": 0.748, + "step": 6816, + "vm_loss": 0.2343 + }, + { + "epoch": 1.312125514353779, + "lm_loss": 0.8596, + "step": 6816, + "vm_loss": 0.2164 + }, + { + "epoch": 1.312125514353779, + "lm_loss": 1.0437, + "step": 6816, + "vm_loss": 0.2174 + }, + { + "epoch": 1.312125514353779, + "lm_loss": 1.2521, + "step": 6816, + "vm_loss": 0.226 + }, + { + "epoch": 1.3123180210313545, + "grad_norm": 3.338580201701724, + "learning_rate": 5.583872214274383e-06, + "loss": 1.0967, + "step": 6817 + }, + { + "epoch": 1.31251052770893, + "grad_norm": 3.385269464304009, + "learning_rate": 5.581075034186232e-06, + "loss": 1.1223, + "step": 6818 + }, + { + "epoch": 1.3127030343865052, + "grad_norm": 3.3341249863889426, + "learning_rate": 5.57827828367413e-06, + "loss": 1.1528, + "step": 6819 + }, + { + "epoch": 1.3128955410640808, + "grad_norm": 3.3671197998520226, + "learning_rate": 5.575481963009945e-06, + "loss": 1.1225, + "step": 6820 + }, + { + "epoch": 1.313088047741656, + "grad_norm": 3.246298777987742, + "learning_rate": 5.572686072465519e-06, + "loss": 1.091, + "step": 6821 + }, + { + "epoch": 1.3132805544192314, + "grad_norm": 3.1937464693038455, + "learning_rate": 5.569890612312649e-06, + "loss": 1.0635, + "step": 6822 + }, + { + "epoch": 1.3134730610968068, + "grad_norm": 3.180069204686733, + "learning_rate": 5.567095582823094e-06, + "loss": 1.0844, + "step": 6823 + }, + { + "epoch": 1.313665567774382, + "grad_norm": 3.1679948678829835, + "learning_rate": 5.564300984268556e-06, + "loss": 1.0516, + "step": 6824 + }, + { + "epoch": 1.313665567774382, + "lm_loss": 0.3728, + "step": 6824, + "vm_loss": 0.1166 + }, + { + "epoch": 1.313665567774382, + "lm_loss": 1.3732, + "step": 6824, + "vm_loss": 0.1392 + }, + { + "epoch": 1.313665567774382, + "lm_loss": 0.9333, + "step": 6824, + "vm_loss": 0.191 + }, + { + "epoch": 1.313665567774382, + "lm_loss": 1.1613, + "step": 6824, + "vm_loss": 0.1114 + }, + { + "epoch": 1.313665567774382, + "lm_loss": 0.9209, + "step": 6824, + "vm_loss": 0.1161 + }, + { + "epoch": 1.313665567774382, + "lm_loss": 0.8465, + "step": 6824, + "vm_loss": 0.234 + }, + { + "epoch": 1.313665567774382, + "lm_loss": 0.4611, + "step": 6824, + "vm_loss": 0.1939 + }, + { + "epoch": 1.313665567774382, + "lm_loss": 0.887, + "step": 6824, + "vm_loss": 0.169 + }, + { + "epoch": 1.3138580744519577, + "grad_norm": 3.1148225817967448, + "learning_rate": 5.561506816920713e-06, + "loss": 1.0379, + "step": 6825 + }, + { + "epoch": 1.3140505811295329, + "grad_norm": 3.3791816638663854, + "learning_rate": 5.558713081051192e-06, + "loss": 1.1062, + "step": 6826 + }, + { + "epoch": 1.3142430878071083, + "grad_norm": 3.309018968668874, + "learning_rate": 5.555919776931584e-06, + "loss": 1.0984, + "step": 6827 + }, + { + "epoch": 1.3144355944846837, + "grad_norm": 3.1616429580855407, + "learning_rate": 5.5531269048334216e-06, + "loss": 1.0309, + "step": 6828 + }, + { + "epoch": 1.3146281011622591, + "grad_norm": 3.271338691082631, + "learning_rate": 5.550334465028224e-06, + "loss": 1.0709, + "step": 6829 + }, + { + "epoch": 1.3148206078398346, + "grad_norm": 3.536727848506085, + "learning_rate": 5.547542457787442e-06, + "loss": 1.1801, + "step": 6830 + }, + { + "epoch": 1.3150131145174098, + "grad_norm": 3.1400289464688256, + "learning_rate": 5.544750883382495e-06, + "loss": 1.0285, + "step": 6831 + }, + { + "epoch": 1.3152056211949852, + "grad_norm": 3.2304741462335214, + "learning_rate": 5.541959742084761e-06, + "loss": 1.0434, + "step": 6832 + }, + { + "epoch": 1.3152056211949852, + "lm_loss": 0.7938, + "step": 6832, + "vm_loss": 0.2274 + }, + { + "epoch": 1.3152056211949852, + "lm_loss": 0.9232, + "step": 6832, + "vm_loss": 0.1196 + }, + { + "epoch": 1.3152056211949852, + "lm_loss": 1.1548, + "step": 6832, + "vm_loss": 0.2505 + }, + { + "epoch": 1.3152056211949852, + "lm_loss": 0.9291, + "step": 6832, + "vm_loss": 0.1386 + }, + { + "epoch": 1.3152056211949852, + "lm_loss": 0.894, + "step": 6832, + "vm_loss": 0.1631 + }, + { + "epoch": 1.3152056211949852, + "lm_loss": 1.0663, + "step": 6832, + "vm_loss": 0.1805 + }, + { + "epoch": 1.3152056211949852, + "lm_loss": 1.131, + "step": 6832, + "vm_loss": 0.153 + }, + { + "epoch": 1.3152056211949852, + "lm_loss": 1.0495, + "step": 6832, + "vm_loss": 0.13 + }, + { + "epoch": 1.3153981278725606, + "grad_norm": 3.3888403195494616, + "learning_rate": 5.539169034165577e-06, + "loss": 1.1579, + "step": 6833 + }, + { + "epoch": 1.315590634550136, + "grad_norm": 3.221327772775588, + "learning_rate": 5.536378759896237e-06, + "loss": 1.0684, + "step": 6834 + }, + { + "epoch": 1.3157831412277115, + "grad_norm": 3.303925706919441, + "learning_rate": 5.533588919547979e-06, + "loss": 1.1403, + "step": 6835 + }, + { + "epoch": 1.3159756479052866, + "grad_norm": 3.21299284286161, + "learning_rate": 5.530799513392029e-06, + "loss": 1.0419, + "step": 6836 + }, + { + "epoch": 1.316168154582862, + "grad_norm": 3.255937788119579, + "learning_rate": 5.52801054169954e-06, + "loss": 1.1186, + "step": 6837 + }, + { + "epoch": 1.3163606612604375, + "grad_norm": 3.3594248441002548, + "learning_rate": 5.525222004741639e-06, + "loss": 1.1198, + "step": 6838 + }, + { + "epoch": 1.316553167938013, + "grad_norm": 3.46863621840443, + "learning_rate": 5.522433902789408e-06, + "loss": 1.1787, + "step": 6839 + }, + { + "epoch": 1.3167456746155883, + "grad_norm": 3.2245547220750344, + "learning_rate": 5.519646236113891e-06, + "loss": 1.1158, + "step": 6840 + }, + { + "epoch": 1.3167456746155883, + "lm_loss": 1.1142, + "step": 6840, + "vm_loss": 0.1907 + }, + { + "epoch": 1.3167456746155883, + "lm_loss": 1.0466, + "step": 6840, + "vm_loss": 0.1945 + }, + { + "epoch": 1.3167456746155883, + "lm_loss": 0.9665, + "step": 6840, + "vm_loss": 0.1682 + }, + { + "epoch": 1.3167456746155883, + "lm_loss": 0.7343, + "step": 6840, + "vm_loss": 0.1466 + }, + { + "epoch": 1.3167456746155883, + "lm_loss": 1.0654, + "step": 6840, + "vm_loss": 0.1677 + }, + { + "epoch": 1.3167456746155883, + "lm_loss": 0.8259, + "step": 6840, + "vm_loss": 0.1038 + }, + { + "epoch": 1.3167456746155883, + "lm_loss": 0.8645, + "step": 6840, + "vm_loss": 0.1774 + }, + { + "epoch": 1.3167456746155883, + "lm_loss": 1.0416, + "step": 6840, + "vm_loss": 0.1876 + }, + { + "epoch": 1.3169381812931635, + "grad_norm": 3.2847984864002413, + "learning_rate": 5.516859004986077e-06, + "loss": 1.1061, + "step": 6841 + }, + { + "epoch": 1.317130687970739, + "grad_norm": 3.3258484033524542, + "learning_rate": 5.514072209676924e-06, + "loss": 1.1381, + "step": 6842 + }, + { + "epoch": 1.3173231946483144, + "grad_norm": 3.2890677771544548, + "learning_rate": 5.511285850457343e-06, + "loss": 1.1269, + "step": 6843 + }, + { + "epoch": 1.3175157013258898, + "grad_norm": 3.1683016735834064, + "learning_rate": 5.508499927598209e-06, + "loss": 1.0501, + "step": 6844 + }, + { + "epoch": 1.3177082080034652, + "grad_norm": 3.2720195812542077, + "learning_rate": 5.505714441370338e-06, + "loss": 1.1158, + "step": 6845 + }, + { + "epoch": 1.3179007146810404, + "grad_norm": 3.224613017112921, + "learning_rate": 5.502929392044528e-06, + "loss": 1.0876, + "step": 6846 + }, + { + "epoch": 1.3180932213586158, + "grad_norm": 3.2866542999664805, + "learning_rate": 5.500144779891513e-06, + "loss": 1.1203, + "step": 6847 + }, + { + "epoch": 1.3182857280361913, + "grad_norm": 3.298827306964532, + "learning_rate": 5.497360605181998e-06, + "loss": 1.1171, + "step": 6848 + }, + { + "epoch": 1.3182857280361913, + "lm_loss": 0.9218, + "step": 6848, + "vm_loss": 0.1404 + }, + { + "epoch": 1.3182857280361913, + "lm_loss": 0.6899, + "step": 6848, + "vm_loss": 0.1137 + }, + { + "epoch": 1.3182857280361913, + "lm_loss": 1.1841, + "step": 6848, + "vm_loss": 0.1655 + }, + { + "epoch": 1.3182857280361913, + "lm_loss": 0.9574, + "step": 6848, + "vm_loss": 0.1425 + }, + { + "epoch": 1.3182857280361913, + "lm_loss": 0.8867, + "step": 6848, + "vm_loss": 0.1641 + }, + { + "epoch": 1.3182857280361913, + "lm_loss": 0.6012, + "step": 6848, + "vm_loss": 0.1724 + }, + { + "epoch": 1.3182857280361913, + "lm_loss": 1.0803, + "step": 6848, + "vm_loss": 0.215 + }, + { + "epoch": 1.3182857280361913, + "lm_loss": 1.0218, + "step": 6848, + "vm_loss": 0.1843 + }, + { + "epoch": 1.3184782347137667, + "grad_norm": 3.392204290782093, + "learning_rate": 5.494576868186632e-06, + "loss": 1.0956, + "step": 6849 + }, + { + "epoch": 1.318670741391342, + "grad_norm": 3.2547383256188485, + "learning_rate": 5.491793569176042e-06, + "loss": 1.0644, + "step": 6850 + }, + { + "epoch": 1.3188632480689173, + "grad_norm": 3.2597836207170676, + "learning_rate": 5.48901070842079e-06, + "loss": 1.1214, + "step": 6851 + }, + { + "epoch": 1.3190557547464927, + "grad_norm": 3.1863533305785485, + "learning_rate": 5.4862282861914095e-06, + "loss": 1.0854, + "step": 6852 + }, + { + "epoch": 1.3192482614240681, + "grad_norm": 3.1722641521112207, + "learning_rate": 5.483446302758389e-06, + "loss": 1.0786, + "step": 6853 + }, + { + "epoch": 1.3194407681016436, + "grad_norm": 3.174585915549796, + "learning_rate": 5.480664758392169e-06, + "loss": 1.0427, + "step": 6854 + }, + { + "epoch": 1.319633274779219, + "grad_norm": 3.390440023339875, + "learning_rate": 5.477883653363155e-06, + "loss": 1.1294, + "step": 6855 + }, + { + "epoch": 1.3198257814567942, + "grad_norm": 3.098003098638949, + "learning_rate": 5.475102987941705e-06, + "loss": 1.0458, + "step": 6856 + }, + { + "epoch": 1.3198257814567942, + "lm_loss": 0.5035, + "step": 6856, + "vm_loss": 0.167 + }, + { + "epoch": 1.3198257814567942, + "lm_loss": 1.0625, + "step": 6856, + "vm_loss": 0.1921 + }, + { + "epoch": 1.3198257814567942, + "lm_loss": 0.8518, + "step": 6856, + "vm_loss": 0.1515 + }, + { + "epoch": 1.3198257814567942, + "lm_loss": 0.935, + "step": 6856, + "vm_loss": 0.1485 + }, + { + "epoch": 1.3198257814567942, + "lm_loss": 0.9772, + "step": 6856, + "vm_loss": 0.155 + }, + { + "epoch": 1.3198257814567942, + "lm_loss": 0.6268, + "step": 6856, + "vm_loss": 0.1428 + }, + { + "epoch": 1.3198257814567942, + "lm_loss": 0.851, + "step": 6856, + "vm_loss": 0.2018 + }, + { + "epoch": 1.3198257814567942, + "lm_loss": 1.0988, + "step": 6856, + "vm_loss": 0.1795 + }, + { + "epoch": 1.3200182881343696, + "grad_norm": 3.255400202211938, + "learning_rate": 5.472322762398139e-06, + "loss": 1.0887, + "step": 6857 + }, + { + "epoch": 1.320210794811945, + "grad_norm": 3.411683348119307, + "learning_rate": 5.469542977002722e-06, + "loss": 1.1152, + "step": 6858 + }, + { + "epoch": 1.3204033014895205, + "grad_norm": 3.474602284948717, + "learning_rate": 5.466763632025689e-06, + "loss": 1.1206, + "step": 6859 + }, + { + "epoch": 1.3205958081670959, + "grad_norm": 3.5411638429401653, + "learning_rate": 5.463984727737227e-06, + "loss": 1.141, + "step": 6860 + }, + { + "epoch": 1.320788314844671, + "grad_norm": 3.537454154531341, + "learning_rate": 5.461206264407487e-06, + "loss": 1.173, + "step": 6861 + }, + { + "epoch": 1.3209808215222465, + "grad_norm": 3.3068033135474515, + "learning_rate": 5.45842824230656e-06, + "loss": 1.0896, + "step": 6862 + }, + { + "epoch": 1.321173328199822, + "grad_norm": 3.3003051997031054, + "learning_rate": 5.455650661704517e-06, + "loss": 1.1119, + "step": 6863 + }, + { + "epoch": 1.3213658348773973, + "grad_norm": 3.278279944408037, + "learning_rate": 5.452873522871365e-06, + "loss": 1.1406, + "step": 6864 + }, + { + "epoch": 1.3213658348773973, + "lm_loss": 0.8291, + "step": 6864, + "vm_loss": 0.1709 + }, + { + "epoch": 1.3213658348773973, + "lm_loss": 0.7603, + "step": 6864, + "vm_loss": 0.2183 + }, + { + "epoch": 1.3213658348773973, + "lm_loss": 0.7874, + "step": 6864, + "vm_loss": 0.1515 + }, + { + "epoch": 1.3213658348773973, + "lm_loss": 0.7299, + "step": 6864, + "vm_loss": 0.1387 + }, + { + "epoch": 1.3213658348773973, + "lm_loss": 0.8268, + "step": 6864, + "vm_loss": 0.1403 + }, + { + "epoch": 1.3213658348773973, + "lm_loss": 0.7649, + "step": 6864, + "vm_loss": 0.1661 + }, + { + "epoch": 1.3213658348773973, + "lm_loss": 0.3361, + "step": 6864, + "vm_loss": 0.1226 + }, + { + "epoch": 1.3213658348773973, + "lm_loss": 0.5882, + "step": 6864, + "vm_loss": 0.2669 + }, + { + "epoch": 1.3215583415549728, + "grad_norm": 3.225164583265845, + "learning_rate": 5.450096826077086e-06, + "loss": 1.0848, + "step": 6865 + }, + { + "epoch": 1.321750848232548, + "grad_norm": 3.20575269499846, + "learning_rate": 5.447320571591598e-06, + "loss": 1.0497, + "step": 6866 + }, + { + "epoch": 1.3219433549101234, + "grad_norm": 3.1294913804345517, + "learning_rate": 5.4445447596848045e-06, + "loss": 1.1334, + "step": 6867 + }, + { + "epoch": 1.3221358615876988, + "grad_norm": 3.136796129316707, + "learning_rate": 5.441769390626537e-06, + "loss": 1.0217, + "step": 6868 + }, + { + "epoch": 1.3223283682652742, + "grad_norm": 3.3011942872919584, + "learning_rate": 5.438994464686601e-06, + "loss": 1.1015, + "step": 6869 + }, + { + "epoch": 1.3225208749428496, + "grad_norm": 3.0577145060756594, + "learning_rate": 5.436219982134756e-06, + "loss": 1.0318, + "step": 6870 + }, + { + "epoch": 1.3227133816204248, + "grad_norm": 3.219533394784254, + "learning_rate": 5.433445943240721e-06, + "loss": 1.0691, + "step": 6871 + }, + { + "epoch": 1.3229058882980003, + "grad_norm": 3.4439773962233744, + "learning_rate": 5.430672348274155e-06, + "loss": 1.1165, + "step": 6872 + }, + { + "epoch": 1.3229058882980003, + "lm_loss": 0.5442, + "step": 6872, + "vm_loss": 0.1242 + }, + { + "epoch": 1.3229058882980003, + "lm_loss": 0.8073, + "step": 6872, + "vm_loss": 0.2056 + }, + { + "epoch": 1.3229058882980003, + "lm_loss": 1.1496, + "step": 6872, + "vm_loss": 0.1287 + }, + { + "epoch": 1.3229058882980003, + "lm_loss": 1.0221, + "step": 6872, + "vm_loss": 0.1613 + }, + { + "epoch": 1.3229058882980003, + "lm_loss": 1.2131, + "step": 6872, + "vm_loss": 0.1172 + }, + { + "epoch": 1.3229058882980003, + "lm_loss": 0.718, + "step": 6872, + "vm_loss": 0.1339 + }, + { + "epoch": 1.3229058882980003, + "lm_loss": 0.6556, + "step": 6872, + "vm_loss": 0.1126 + }, + { + "epoch": 1.3229058882980003, + "lm_loss": 1.3355, + "step": 6872, + "vm_loss": 0.1333 + }, + { + "epoch": 1.3230983949755757, + "grad_norm": 3.347689661732486, + "learning_rate": 5.427899197504703e-06, + "loss": 1.0835, + "step": 6873 + }, + { + "epoch": 1.323290901653151, + "grad_norm": 3.434818005625957, + "learning_rate": 5.42512649120194e-06, + "loss": 1.088, + "step": 6874 + }, + { + "epoch": 1.3234834083307265, + "grad_norm": 3.3283256339375407, + "learning_rate": 5.422354229635412e-06, + "loss": 1.0945, + "step": 6875 + }, + { + "epoch": 1.3236759150083017, + "grad_norm": 3.1716535917928477, + "learning_rate": 5.419582413074618e-06, + "loss": 1.0332, + "step": 6876 + }, + { + "epoch": 1.3238684216858772, + "grad_norm": 3.2547821717140892, + "learning_rate": 5.416811041789014e-06, + "loss": 1.0515, + "step": 6877 + }, + { + "epoch": 1.3240609283634526, + "grad_norm": 3.101051179705844, + "learning_rate": 5.414040116048016e-06, + "loss": 1.044, + "step": 6878 + }, + { + "epoch": 1.324253435041028, + "grad_norm": 3.2552167748632503, + "learning_rate": 5.411269636120984e-06, + "loss": 1.1026, + "step": 6879 + }, + { + "epoch": 1.3244459417186034, + "grad_norm": 3.197140893983596, + "learning_rate": 5.408499602277256e-06, + "loss": 1.0838, + "step": 6880 + }, + { + "epoch": 1.3244459417186034, + "lm_loss": 0.4903, + "step": 6880, + "vm_loss": 0.1913 + }, + { + "epoch": 1.3244459417186034, + "lm_loss": 1.0475, + "step": 6880, + "vm_loss": 0.1658 + }, + { + "epoch": 1.3244459417186034, + "lm_loss": 0.8657, + "step": 6880, + "vm_loss": 0.1319 + }, + { + "epoch": 1.3244459417186034, + "lm_loss": 0.6593, + "step": 6880, + "vm_loss": 0.1741 + }, + { + "epoch": 1.3244459417186034, + "lm_loss": 0.8005, + "step": 6880, + "vm_loss": 0.115 + }, + { + "epoch": 1.3244459417186034, + "lm_loss": 0.744, + "step": 6880, + "vm_loss": 0.1444 + }, + { + "epoch": 1.3244459417186034, + "lm_loss": 1.0617, + "step": 6880, + "vm_loss": 0.1616 + }, + { + "epoch": 1.3244459417186034, + "lm_loss": 0.884, + "step": 6880, + "vm_loss": 0.1802 + }, + { + "epoch": 1.3246384483961786, + "grad_norm": 3.196365095129799, + "learning_rate": 5.405730014786107e-06, + "loss": 1.0868, + "step": 6881 + }, + { + "epoch": 1.3248309550737543, + "grad_norm": 3.2250465796722105, + "learning_rate": 5.4029608739167815e-06, + "loss": 1.091, + "step": 6882 + }, + { + "epoch": 1.3250234617513295, + "grad_norm": 3.4015932440409182, + "learning_rate": 5.400192179938465e-06, + "loss": 1.0743, + "step": 6883 + }, + { + "epoch": 1.3252159684289049, + "grad_norm": 3.25180967039191, + "learning_rate": 5.397423933120325e-06, + "loss": 1.0503, + "step": 6884 + }, + { + "epoch": 1.3254084751064803, + "grad_norm": 3.172798232766363, + "learning_rate": 5.394656133731458e-06, + "loss": 1.084, + "step": 6885 + }, + { + "epoch": 1.3256009817840555, + "grad_norm": 3.310818218043609, + "learning_rate": 5.3918887820409364e-06, + "loss": 1.0884, + "step": 6886 + }, + { + "epoch": 1.3257934884616311, + "grad_norm": 3.2509000113295103, + "learning_rate": 5.389121878317778e-06, + "loss": 1.0864, + "step": 6887 + }, + { + "epoch": 1.3259859951392063, + "grad_norm": 3.272075873522302, + "learning_rate": 5.386355422830969e-06, + "loss": 1.0847, + "step": 6888 + }, + { + "epoch": 1.3259859951392063, + "lm_loss": 0.3604, + "step": 6888, + "vm_loss": 0.1896 + }, + { + "epoch": 1.3259859951392063, + "lm_loss": 1.1042, + "step": 6888, + "vm_loss": 0.1072 + }, + { + "epoch": 1.3259859951392063, + "lm_loss": 1.2242, + "step": 6888, + "vm_loss": 0.1466 + }, + { + "epoch": 1.3259859951392063, + "lm_loss": 1.3887, + "step": 6888, + "vm_loss": 0.1545 + }, + { + "epoch": 1.3259859951392063, + "lm_loss": 0.6736, + "step": 6888, + "vm_loss": 0.1838 + }, + { + "epoch": 1.3259859951392063, + "lm_loss": 1.2054, + "step": 6888, + "vm_loss": 0.1915 + }, + { + "epoch": 1.3259859951392063, + "lm_loss": 0.6526, + "step": 6888, + "vm_loss": 0.1466 + }, + { + "epoch": 1.3259859951392063, + "lm_loss": 0.6192, + "step": 6888, + "vm_loss": 0.1786 + }, + { + "epoch": 1.3261785018167818, + "grad_norm": 3.2628570362596157, + "learning_rate": 5.383589415849434e-06, + "loss": 1.1237, + "step": 6889 + }, + { + "epoch": 1.3263710084943572, + "grad_norm": 3.2731584881707145, + "learning_rate": 5.3808238576420694e-06, + "loss": 1.0837, + "step": 6890 + }, + { + "epoch": 1.3265635151719326, + "grad_norm": 3.2213939239969513, + "learning_rate": 5.378058748477722e-06, + "loss": 1.0586, + "step": 6891 + }, + { + "epoch": 1.326756021849508, + "grad_norm": 3.3295600342354437, + "learning_rate": 5.375294088625196e-06, + "loss": 1.0885, + "step": 6892 + }, + { + "epoch": 1.3269485285270832, + "grad_norm": 3.2371652707263263, + "learning_rate": 5.372529878353252e-06, + "loss": 1.0947, + "step": 6893 + }, + { + "epoch": 1.3271410352046586, + "grad_norm": 3.2603293027875413, + "learning_rate": 5.36976611793061e-06, + "loss": 1.1055, + "step": 6894 + }, + { + "epoch": 1.327333541882234, + "grad_norm": 3.2627971521945747, + "learning_rate": 5.3670028076259355e-06, + "loss": 1.113, + "step": 6895 + }, + { + "epoch": 1.3275260485598095, + "grad_norm": 3.316257141872321, + "learning_rate": 5.364239947707861e-06, + "loss": 1.0874, + "step": 6896 + }, + { + "epoch": 1.3275260485598095, + "lm_loss": 0.6665, + "step": 6896, + "vm_loss": 0.2164 + }, + { + "epoch": 1.3275260485598095, + "lm_loss": 1.4276, + "step": 6896, + "vm_loss": 0.2144 + }, + { + "epoch": 1.3275260485598095, + "lm_loss": 0.9414, + "step": 6896, + "vm_loss": 0.2129 + }, + { + "epoch": 1.3275260485598095, + "lm_loss": 1.4092, + "step": 6896, + "vm_loss": 0.1462 + }, + { + "epoch": 1.3275260485598095, + "lm_loss": 0.9723, + "step": 6896, + "vm_loss": 0.1261 + }, + { + "epoch": 1.3275260485598095, + "lm_loss": 0.8288, + "step": 6896, + "vm_loss": 0.1172 + }, + { + "epoch": 1.3275260485598095, + "lm_loss": 0.4871, + "step": 6896, + "vm_loss": 0.1555 + }, + { + "epoch": 1.3275260485598095, + "lm_loss": 1.109, + "step": 6896, + "vm_loss": 0.2111 + }, + { + "epoch": 1.327718555237385, + "grad_norm": 3.370314938121935, + "learning_rate": 5.361477538444973e-06, + "loss": 1.1607, + "step": 6897 + }, + { + "epoch": 1.3279110619149601, + "grad_norm": 3.2472941820543486, + "learning_rate": 5.358715580105813e-06, + "loss": 1.0772, + "step": 6898 + }, + { + "epoch": 1.3281035685925355, + "grad_norm": 3.1629097179210035, + "learning_rate": 5.35595407295888e-06, + "loss": 1.023, + "step": 6899 + }, + { + "epoch": 1.328296075270111, + "grad_norm": 3.0412218058228198, + "learning_rate": 5.3531930172726195e-06, + "loss": 0.9986, + "step": 6900 + }, + { + "epoch": 1.3284885819476864, + "grad_norm": 3.2993112553088766, + "learning_rate": 5.350432413315455e-06, + "loss": 1.088, + "step": 6901 + }, + { + "epoch": 1.3286810886252618, + "grad_norm": 3.3455929183723128, + "learning_rate": 5.347672261355742e-06, + "loss": 1.1175, + "step": 6902 + }, + { + "epoch": 1.328873595302837, + "grad_norm": 3.1966685556094387, + "learning_rate": 5.344912561661806e-06, + "loss": 1.026, + "step": 6903 + }, + { + "epoch": 1.3290661019804124, + "grad_norm": 3.2649545462920186, + "learning_rate": 5.342153314501923e-06, + "loss": 1.0719, + "step": 6904 + }, + { + "epoch": 1.3290661019804124, + "lm_loss": 1.3055, + "step": 6904, + "vm_loss": 0.1398 + }, + { + "epoch": 1.3290661019804124, + "lm_loss": 0.8856, + "step": 6904, + "vm_loss": 0.1388 + }, + { + "epoch": 1.3290661019804124, + "lm_loss": 0.94, + "step": 6904, + "vm_loss": 0.203 + }, + { + "epoch": 1.3290661019804124, + "lm_loss": 1.1542, + "step": 6904, + "vm_loss": 0.2463 + }, + { + "epoch": 1.3290661019804124, + "lm_loss": 0.9263, + "step": 6904, + "vm_loss": 0.1715 + }, + { + "epoch": 1.3290661019804124, + "lm_loss": 0.4832, + "step": 6904, + "vm_loss": 0.1609 + }, + { + "epoch": 1.3290661019804124, + "lm_loss": 0.4448, + "step": 6904, + "vm_loss": 0.1273 + }, + { + "epoch": 1.3290661019804124, + "lm_loss": 0.7291, + "step": 6904, + "vm_loss": 0.1555 + }, + { + "epoch": 1.3292586086579878, + "grad_norm": 3.2793473622909697, + "learning_rate": 5.339394520144334e-06, + "loss": 1.0711, + "step": 6905 + }, + { + "epoch": 1.3294511153355633, + "grad_norm": 3.3569902689525515, + "learning_rate": 5.3366361788572215e-06, + "loss": 1.1332, + "step": 6906 + }, + { + "epoch": 1.3296436220131387, + "grad_norm": 3.244764772000376, + "learning_rate": 5.333878290908734e-06, + "loss": 1.0646, + "step": 6907 + }, + { + "epoch": 1.3298361286907139, + "grad_norm": 3.113239379918662, + "learning_rate": 5.331120856566972e-06, + "loss": 1.0743, + "step": 6908 + }, + { + "epoch": 1.3300286353682893, + "grad_norm": 3.3118697135160167, + "learning_rate": 5.3283638761e-06, + "loss": 1.1498, + "step": 6909 + }, + { + "epoch": 1.3302211420458647, + "grad_norm": 3.191066365421342, + "learning_rate": 5.325607349775818e-06, + "loss": 1.0525, + "step": 6910 + }, + { + "epoch": 1.3304136487234401, + "grad_norm": 3.2738481099074868, + "learning_rate": 5.3228512778624156e-06, + "loss": 1.1252, + "step": 6911 + }, + { + "epoch": 1.3306061554010156, + "grad_norm": 3.1918435934599034, + "learning_rate": 5.3200956606277006e-06, + "loss": 1.0633, + "step": 6912 + }, + { + "epoch": 1.3306061554010156, + "lm_loss": 1.5916, + "step": 6912, + "vm_loss": 0.1609 + }, + { + "epoch": 1.3306061554010156, + "lm_loss": 0.9236, + "step": 6912, + "vm_loss": 0.2144 + }, + { + "epoch": 1.3306061554010156, + "lm_loss": 0.8234, + "step": 6912, + "vm_loss": 0.1459 + }, + { + "epoch": 1.3306061554010156, + "lm_loss": 0.969, + "step": 6912, + "vm_loss": 0.1333 + }, + { + "epoch": 1.3306061554010156, + "lm_loss": 1.1552, + "step": 6912, + "vm_loss": 0.1708 + }, + { + "epoch": 1.3306061554010156, + "lm_loss": 0.9247, + "step": 6912, + "vm_loss": 0.1738 + }, + { + "epoch": 1.3306061554010156, + "lm_loss": 1.4569, + "step": 6912, + "vm_loss": 0.1476 + }, + { + "epoch": 1.3306061554010156, + "lm_loss": 0.9717, + "step": 6912, + "vm_loss": 0.1865 + }, + { + "epoch": 1.3307986620785908, + "grad_norm": 3.2889234517132193, + "learning_rate": 5.317340498339561e-06, + "loss": 1.1296, + "step": 6913 + }, + { + "epoch": 1.3309911687561662, + "grad_norm": 3.335211868132565, + "learning_rate": 5.314585791265835e-06, + "loss": 1.1246, + "step": 6914 + }, + { + "epoch": 1.3311836754337416, + "grad_norm": 3.2264391913559094, + "learning_rate": 5.311831539674317e-06, + "loss": 1.0785, + "step": 6915 + }, + { + "epoch": 1.331376182111317, + "grad_norm": 3.2205110850075434, + "learning_rate": 5.309077743832749e-06, + "loss": 1.065, + "step": 6916 + }, + { + "epoch": 1.3315686887888925, + "grad_norm": 3.154567295654948, + "learning_rate": 5.306324404008838e-06, + "loss": 1.0318, + "step": 6917 + }, + { + "epoch": 1.3317611954664677, + "grad_norm": 3.2282866306913927, + "learning_rate": 5.303571520470245e-06, + "loss": 1.0897, + "step": 6918 + }, + { + "epoch": 1.331953702144043, + "grad_norm": 3.2235587594358948, + "learning_rate": 5.300819093484586e-06, + "loss": 1.0563, + "step": 6919 + }, + { + "epoch": 1.3321462088216185, + "grad_norm": 3.343270087207791, + "learning_rate": 5.29806712331943e-06, + "loss": 1.1389, + "step": 6920 + }, + { + "epoch": 1.3321462088216185, + "lm_loss": 1.3698, + "step": 6920, + "vm_loss": 0.2275 + }, + { + "epoch": 1.3321462088216185, + "lm_loss": 0.9847, + "step": 6920, + "vm_loss": 0.1623 + }, + { + "epoch": 1.3321462088216185, + "lm_loss": 0.7467, + "step": 6920, + "vm_loss": 0.1333 + }, + { + "epoch": 1.3321462088216185, + "lm_loss": 0.5713, + "step": 6920, + "vm_loss": 0.1739 + }, + { + "epoch": 1.3321462088216185, + "lm_loss": 0.7602, + "step": 6920, + "vm_loss": 0.1611 + }, + { + "epoch": 1.3321462088216185, + "lm_loss": 0.9817, + "step": 6920, + "vm_loss": 0.1407 + }, + { + "epoch": 1.3321462088216185, + "lm_loss": 0.9181, + "step": 6920, + "vm_loss": 0.1995 + }, + { + "epoch": 1.3321462088216185, + "lm_loss": 0.8862, + "step": 6920, + "vm_loss": 0.1515 + }, + { + "epoch": 1.332338715499194, + "grad_norm": 3.2543245430100436, + "learning_rate": 5.295315610242305e-06, + "loss": 1.0613, + "step": 6921 + }, + { + "epoch": 1.3325312221767693, + "grad_norm": 3.1305727186729246, + "learning_rate": 5.292564554520698e-06, + "loss": 1.0366, + "step": 6922 + }, + { + "epoch": 1.3327237288543445, + "grad_norm": 3.320716713752849, + "learning_rate": 5.289813956422037e-06, + "loss": 1.0985, + "step": 6923 + }, + { + "epoch": 1.33291623553192, + "grad_norm": 3.3004947850582784, + "learning_rate": 5.28706381621372e-06, + "loss": 1.0944, + "step": 6924 + }, + { + "epoch": 1.3331087422094954, + "grad_norm": 3.2851821881022585, + "learning_rate": 5.284314134163098e-06, + "loss": 1.0328, + "step": 6925 + }, + { + "epoch": 1.3333012488870708, + "grad_norm": 3.4455814871621526, + "learning_rate": 5.281564910537476e-06, + "loss": 1.091, + "step": 6926 + }, + { + "epoch": 1.3334937555646462, + "grad_norm": 3.383897664710934, + "learning_rate": 5.278816145604103e-06, + "loss": 1.0939, + "step": 6927 + }, + { + "epoch": 1.3336862622422214, + "grad_norm": 3.2224026961725754, + "learning_rate": 5.27606783963021e-06, + "loss": 1.0291, + "step": 6928 + }, + { + "epoch": 1.3336862622422214, + "lm_loss": 0.9031, + "step": 6928, + "vm_loss": 0.1635 + }, + { + "epoch": 1.3336862622422214, + "lm_loss": 1.2411, + "step": 6928, + "vm_loss": 0.1721 + }, + { + "epoch": 1.3336862622422214, + "lm_loss": 1.0549, + "step": 6928, + "vm_loss": 0.1059 + }, + { + "epoch": 1.3336862622422214, + "lm_loss": 0.5974, + "step": 6928, + "vm_loss": 0.2139 + }, + { + "epoch": 1.3336862622422214, + "lm_loss": 0.8604, + "step": 6928, + "vm_loss": 0.1138 + }, + { + "epoch": 1.3336862622422214, + "lm_loss": 1.2005, + "step": 6928, + "vm_loss": 0.1687 + }, + { + "epoch": 1.3336862622422214, + "lm_loss": 1.5703, + "step": 6928, + "vm_loss": 0.1737 + }, + { + "epoch": 1.3336862622422214, + "lm_loss": 0.7812, + "step": 6928, + "vm_loss": 0.1835 + }, + { + "epoch": 1.3338787689197968, + "grad_norm": 3.199032374006238, + "learning_rate": 5.273319992882957e-06, + "loss": 1.0846, + "step": 6929 + }, + { + "epoch": 1.3340712755973723, + "grad_norm": 3.1363885753599736, + "learning_rate": 5.270572605629471e-06, + "loss": 1.0353, + "step": 6930 + }, + { + "epoch": 1.3342637822749477, + "grad_norm": 3.2030617130649355, + "learning_rate": 5.2678256781368364e-06, + "loss": 1.1086, + "step": 6931 + }, + { + "epoch": 1.334456288952523, + "grad_norm": 3.214798053725716, + "learning_rate": 5.2650792106720905e-06, + "loss": 1.0768, + "step": 6932 + }, + { + "epoch": 1.3346487956300983, + "grad_norm": 3.251462340548556, + "learning_rate": 5.26233320350222e-06, + "loss": 1.0658, + "step": 6933 + }, + { + "epoch": 1.3348413023076737, + "grad_norm": 3.3068442657570674, + "learning_rate": 5.259587656894174e-06, + "loss": 1.1081, + "step": 6934 + }, + { + "epoch": 1.3350338089852491, + "grad_norm": 3.3069805607946012, + "learning_rate": 5.256842571114855e-06, + "loss": 1.1131, + "step": 6935 + }, + { + "epoch": 1.3352263156628246, + "grad_norm": 3.255382601246712, + "learning_rate": 5.254097946431121e-06, + "loss": 1.0815, + "step": 6936 + }, + { + "epoch": 1.3352263156628246, + "lm_loss": 1.2063, + "step": 6936, + "vm_loss": 0.1439 + }, + { + "epoch": 1.3352263156628246, + "lm_loss": 0.5287, + "step": 6936, + "vm_loss": 0.2075 + }, + { + "epoch": 1.3352263156628246, + "lm_loss": 0.9248, + "step": 6936, + "vm_loss": 0.1606 + }, + { + "epoch": 1.3352263156628246, + "lm_loss": 0.4554, + "step": 6936, + "vm_loss": 0.1529 + }, + { + "epoch": 1.3352263156628246, + "lm_loss": 0.6011, + "step": 6936, + "vm_loss": 0.1055 + }, + { + "epoch": 1.3352263156628246, + "lm_loss": 0.9562, + "step": 6936, + "vm_loss": 0.1508 + }, + { + "epoch": 1.3352263156628246, + "lm_loss": 0.7735, + "step": 6936, + "vm_loss": 0.1891 + }, + { + "epoch": 1.3352263156628246, + "lm_loss": 1.3065, + "step": 6936, + "vm_loss": 0.1814 + }, + { + "epoch": 1.3354188223404, + "grad_norm": 3.2176035999449604, + "learning_rate": 5.251353783109785e-06, + "loss": 1.0451, + "step": 6937 + }, + { + "epoch": 1.3356113290179752, + "grad_norm": 3.355347432293387, + "learning_rate": 5.248610081417617e-06, + "loss": 1.1227, + "step": 6938 + }, + { + "epoch": 1.3358038356955506, + "grad_norm": 3.2924429880155244, + "learning_rate": 5.245866841621335e-06, + "loss": 1.064, + "step": 6939 + }, + { + "epoch": 1.335996342373126, + "grad_norm": 3.3502522848516865, + "learning_rate": 5.243124063987618e-06, + "loss": 1.1232, + "step": 6940 + }, + { + "epoch": 1.3361888490507015, + "grad_norm": 3.2857219256752, + "learning_rate": 5.2403817487831e-06, + "loss": 1.0804, + "step": 6941 + }, + { + "epoch": 1.3363813557282769, + "grad_norm": 3.2776621987920365, + "learning_rate": 5.237639896274369e-06, + "loss": 1.0797, + "step": 6942 + }, + { + "epoch": 1.336573862405852, + "grad_norm": 3.3159521584187477, + "learning_rate": 5.234898506727973e-06, + "loss": 1.0998, + "step": 6943 + }, + { + "epoch": 1.3367663690834277, + "grad_norm": 3.323041177276933, + "learning_rate": 5.232157580410397e-06, + "loss": 1.1238, + "step": 6944 + }, + { + "epoch": 1.3367663690834277, + "lm_loss": 1.4002, + "step": 6944, + "vm_loss": 0.181 + }, + { + "epoch": 1.3367663690834277, + "lm_loss": 0.7547, + "step": 6944, + "vm_loss": 0.1882 + }, + { + "epoch": 1.3367663690834277, + "lm_loss": 1.0656, + "step": 6944, + "vm_loss": 0.1482 + }, + { + "epoch": 1.3367663690834277, + "lm_loss": 0.6765, + "step": 6944, + "vm_loss": 0.1526 + }, + { + "epoch": 1.3367663690834277, + "lm_loss": 0.6627, + "step": 6944, + "vm_loss": 0.144 + }, + { + "epoch": 1.3367663690834277, + "lm_loss": 1.1616, + "step": 6944, + "vm_loss": 0.1726 + }, + { + "epoch": 1.3367663690834277, + "lm_loss": 0.5965, + "step": 6944, + "vm_loss": 0.1202 + }, + { + "epoch": 1.3367663690834277, + "lm_loss": 0.8376, + "step": 6944, + "vm_loss": 0.1649 + }, + { + "epoch": 1.336958875761003, + "grad_norm": 3.3570869409403774, + "learning_rate": 5.229417117588112e-06, + "loss": 1.0671, + "step": 6945 + }, + { + "epoch": 1.3371513824385783, + "grad_norm": 3.0934497499081917, + "learning_rate": 5.226677118527512e-06, + "loss": 1.0377, + "step": 6946 + }, + { + "epoch": 1.3373438891161538, + "grad_norm": 3.328485370491098, + "learning_rate": 5.223937583494964e-06, + "loss": 1.088, + "step": 6947 + }, + { + "epoch": 1.337536395793729, + "grad_norm": 3.4006204192044094, + "learning_rate": 5.221198512756789e-06, + "loss": 1.0992, + "step": 6948 + }, + { + "epoch": 1.3377289024713046, + "grad_norm": 3.2466918053787306, + "learning_rate": 5.218459906579259e-06, + "loss": 0.9971, + "step": 6949 + }, + { + "epoch": 1.3379214091488798, + "grad_norm": 3.3878006822575157, + "learning_rate": 5.215721765228596e-06, + "loss": 1.0872, + "step": 6950 + }, + { + "epoch": 1.3381139158264552, + "grad_norm": 3.4611325712261496, + "learning_rate": 5.212984088970987e-06, + "loss": 1.1445, + "step": 6951 + }, + { + "epoch": 1.3383064225040306, + "grad_norm": 3.2497383088604823, + "learning_rate": 5.2102468780725675e-06, + "loss": 1.0885, + "step": 6952 + }, + { + "epoch": 1.3383064225040306, + "lm_loss": 0.619, + "step": 6952, + "vm_loss": 0.1443 + }, + { + "epoch": 1.3383064225040306, + "lm_loss": 0.9264, + "step": 6952, + "vm_loss": 0.1174 + }, + { + "epoch": 1.3383064225040306, + "lm_loss": 0.9728, + "step": 6952, + "vm_loss": 0.191 + }, + { + "epoch": 1.3383064225040306, + "lm_loss": 0.8471, + "step": 6952, + "vm_loss": 0.194 + }, + { + "epoch": 1.3383064225040306, + "lm_loss": 0.8807, + "step": 6952, + "vm_loss": 0.164 + }, + { + "epoch": 1.3383064225040306, + "lm_loss": 0.9985, + "step": 6952, + "vm_loss": 0.1013 + }, + { + "epoch": 1.3383064225040306, + "lm_loss": 0.5578, + "step": 6952, + "vm_loss": 0.1939 + }, + { + "epoch": 1.3383064225040306, + "lm_loss": 0.9214, + "step": 6952, + "vm_loss": 0.1963 + }, + { + "epoch": 1.338498929181606, + "grad_norm": 3.129673353084442, + "learning_rate": 5.207510132799436e-06, + "loss": 1.0404, + "step": 6953 + }, + { + "epoch": 1.3386914358591815, + "grad_norm": 3.277355557917731, + "learning_rate": 5.204773853417624e-06, + "loss": 1.1306, + "step": 6954 + }, + { + "epoch": 1.3388839425367567, + "grad_norm": 3.0838139829589615, + "learning_rate": 5.20203804019315e-06, + "loss": 0.9889, + "step": 6955 + }, + { + "epoch": 1.339076449214332, + "grad_norm": 3.1098672808817702, + "learning_rate": 5.199302693391958e-06, + "loss": 0.9934, + "step": 6956 + }, + { + "epoch": 1.3392689558919075, + "grad_norm": 3.1955853435960506, + "learning_rate": 5.196567813279964e-06, + "loss": 1.0248, + "step": 6957 + }, + { + "epoch": 1.339461462569483, + "grad_norm": 3.243707155436304, + "learning_rate": 5.193833400123034e-06, + "loss": 1.0726, + "step": 6958 + }, + { + "epoch": 1.3396539692470584, + "grad_norm": 3.2248598012651035, + "learning_rate": 5.191099454186984e-06, + "loss": 1.1026, + "step": 6959 + }, + { + "epoch": 1.3398464759246336, + "grad_norm": 3.3288835063635784, + "learning_rate": 5.188365975737595e-06, + "loss": 1.0695, + "step": 6960 + }, + { + "epoch": 1.3398464759246336, + "lm_loss": 0.7154, + "step": 6960, + "vm_loss": 0.169 + }, + { + "epoch": 1.3398464759246336, + "lm_loss": 0.938, + "step": 6960, + "vm_loss": 0.124 + }, + { + "epoch": 1.3398464759246336, + "lm_loss": 0.3825, + "step": 6960, + "vm_loss": 0.105 + }, + { + "epoch": 1.3398464759246336, + "lm_loss": 0.9634, + "step": 6960, + "vm_loss": 0.1038 + }, + { + "epoch": 1.3398464759246336, + "lm_loss": 1.1795, + "step": 6960, + "vm_loss": 0.2573 + }, + { + "epoch": 1.3398464759246336, + "lm_loss": 0.8148, + "step": 6960, + "vm_loss": 0.1656 + }, + { + "epoch": 1.3398464759246336, + "lm_loss": 1.2566, + "step": 6960, + "vm_loss": 0.2478 + }, + { + "epoch": 1.3398464759246336, + "lm_loss": 0.9161, + "step": 6960, + "vm_loss": 0.1841 + }, + { + "epoch": 1.340038982602209, + "grad_norm": 3.2204728302890913, + "learning_rate": 5.185632965040589e-06, + "loss": 1.0324, + "step": 6961 + }, + { + "epoch": 1.3402314892797844, + "grad_norm": 3.1720486745009273, + "learning_rate": 5.182900422361652e-06, + "loss": 1.0168, + "step": 6962 + }, + { + "epoch": 1.3404239959573598, + "grad_norm": 3.217575165751346, + "learning_rate": 5.180168347966422e-06, + "loss": 1.1317, + "step": 6963 + }, + { + "epoch": 1.3406165026349353, + "grad_norm": 3.2126670111959887, + "learning_rate": 5.1774367421204975e-06, + "loss": 1.0256, + "step": 6964 + }, + { + "epoch": 1.3408090093125105, + "grad_norm": 3.245098369950989, + "learning_rate": 5.174705605089412e-06, + "loss": 1.0561, + "step": 6965 + }, + { + "epoch": 1.3410015159900859, + "grad_norm": 3.3202999568300204, + "learning_rate": 5.171974937138682e-06, + "loss": 1.1031, + "step": 6966 + }, + { + "epoch": 1.3411940226676613, + "grad_norm": 3.2245200417534234, + "learning_rate": 5.169244738533754e-06, + "loss": 0.9983, + "step": 6967 + }, + { + "epoch": 1.3413865293452367, + "grad_norm": 3.3479766639087245, + "learning_rate": 5.1665150095400395e-06, + "loss": 1.0662, + "step": 6968 + }, + { + "epoch": 1.3413865293452367, + "lm_loss": 0.9483, + "step": 6968, + "vm_loss": 0.094 + }, + { + "epoch": 1.3413865293452367, + "lm_loss": 0.6806, + "step": 6968, + "vm_loss": 0.1682 + }, + { + "epoch": 1.3413865293452367, + "lm_loss": 0.9263, + "step": 6968, + "vm_loss": 0.1351 + }, + { + "epoch": 1.3413865293452367, + "lm_loss": 0.5996, + "step": 6968, + "vm_loss": 0.1673 + }, + { + "epoch": 1.3413865293452367, + "lm_loss": 0.7838, + "step": 6968, + "vm_loss": 0.1928 + }, + { + "epoch": 1.3413865293452367, + "lm_loss": 0.5778, + "step": 6968, + "vm_loss": 0.1477 + }, + { + "epoch": 1.3413865293452367, + "lm_loss": 0.8887, + "step": 6968, + "vm_loss": 0.1241 + }, + { + "epoch": 1.3413865293452367, + "lm_loss": 0.9038, + "step": 6968, + "vm_loss": 0.1517 + }, + { + "epoch": 1.3415790360228121, + "grad_norm": 3.2784356051830805, + "learning_rate": 5.163785750422905e-06, + "loss": 1.0513, + "step": 6969 + }, + { + "epoch": 1.3417715427003873, + "grad_norm": 3.3227694192099086, + "learning_rate": 5.1610569614476735e-06, + "loss": 1.0921, + "step": 6970 + }, + { + "epoch": 1.3419640493779628, + "grad_norm": 3.310711693670043, + "learning_rate": 5.1583286428796045e-06, + "loss": 1.0513, + "step": 6971 + }, + { + "epoch": 1.3421565560555382, + "grad_norm": 3.266230708433354, + "learning_rate": 5.1556007949839435e-06, + "loss": 1.0617, + "step": 6972 + }, + { + "epoch": 1.3423490627331136, + "grad_norm": 3.443904753766547, + "learning_rate": 5.152873418025859e-06, + "loss": 1.1211, + "step": 6973 + }, + { + "epoch": 1.342541569410689, + "grad_norm": 3.238602411064094, + "learning_rate": 5.150146512270494e-06, + "loss": 1.0233, + "step": 6974 + }, + { + "epoch": 1.3427340760882642, + "grad_norm": 3.3151212479413306, + "learning_rate": 5.147420077982928e-06, + "loss": 1.0981, + "step": 6975 + }, + { + "epoch": 1.3429265827658396, + "grad_norm": 3.240780362478548, + "learning_rate": 5.144694115428222e-06, + "loss": 1.0353, + "step": 6976 + }, + { + "epoch": 1.3429265827658396, + "lm_loss": 0.7008, + "step": 6976, + "vm_loss": 0.1757 + }, + { + "epoch": 1.3429265827658396, + "lm_loss": 0.5197, + "step": 6976, + "vm_loss": 0.1819 + }, + { + "epoch": 1.3429265827658396, + "lm_loss": 0.7226, + "step": 6976, + "vm_loss": 0.1588 + }, + { + "epoch": 1.3429265827658396, + "lm_loss": 1.0323, + "step": 6976, + "vm_loss": 0.1725 + }, + { + "epoch": 1.3429265827658396, + "lm_loss": 1.1335, + "step": 6976, + "vm_loss": 0.1518 + }, + { + "epoch": 1.3429265827658396, + "lm_loss": 1.3661, + "step": 6976, + "vm_loss": 0.1418 + }, + { + "epoch": 1.3429265827658396, + "lm_loss": 0.8359, + "step": 6976, + "vm_loss": 0.124 + }, + { + "epoch": 1.3429265827658396, + "lm_loss": 0.9437, + "step": 6976, + "vm_loss": 0.1297 + }, + { + "epoch": 1.343119089443415, + "grad_norm": 3.361879349342819, + "learning_rate": 5.14196862487136e-06, + "loss": 1.0546, + "step": 6977 + }, + { + "epoch": 1.3433115961209905, + "grad_norm": 3.1506508763458676, + "learning_rate": 5.139243606577302e-06, + "loss": 1.0283, + "step": 6978 + }, + { + "epoch": 1.343504102798566, + "grad_norm": 3.242870749476808, + "learning_rate": 5.136519060810952e-06, + "loss": 1.0785, + "step": 6979 + }, + { + "epoch": 1.3436966094761411, + "grad_norm": 3.270217491963181, + "learning_rate": 5.1337949878371705e-06, + "loss": 1.0961, + "step": 6980 + }, + { + "epoch": 1.3438891161537165, + "grad_norm": 3.10994473058476, + "learning_rate": 5.1310713879207786e-06, + "loss": 1.0358, + "step": 6981 + }, + { + "epoch": 1.344081622831292, + "grad_norm": 3.2441401483734804, + "learning_rate": 5.1283482613265305e-06, + "loss": 1.07, + "step": 6982 + }, + { + "epoch": 1.3442741295088674, + "grad_norm": 3.2136902284051723, + "learning_rate": 5.1256256083191665e-06, + "loss": 1.0585, + "step": 6983 + }, + { + "epoch": 1.3444666361864428, + "grad_norm": 3.202221464912572, + "learning_rate": 5.1229034291633505e-06, + "loss": 1.0539, + "step": 6984 + }, + { + "epoch": 1.3444666361864428, + "lm_loss": 0.9509, + "step": 6984, + "vm_loss": 0.2163 + }, + { + "epoch": 1.3444666361864428, + "lm_loss": 0.5015, + "step": 6984, + "vm_loss": 0.1845 + }, + { + "epoch": 1.3444666361864428, + "lm_loss": 0.6393, + "step": 6984, + "vm_loss": 0.135 + }, + { + "epoch": 1.3444666361864428, + "lm_loss": 0.9299, + "step": 6984, + "vm_loss": 0.2031 + }, + { + "epoch": 1.3444666361864428, + "lm_loss": 0.9546, + "step": 6984, + "vm_loss": 0.174 + }, + { + "epoch": 1.3444666361864428, + "lm_loss": 1.2449, + "step": 6984, + "vm_loss": 0.1491 + }, + { + "epoch": 1.3444666361864428, + "lm_loss": 0.6238, + "step": 6984, + "vm_loss": 0.235 + }, + { + "epoch": 1.3444666361864428, + "lm_loss": 1.1046, + "step": 6984, + "vm_loss": 0.2099 + }, + { + "epoch": 1.344659142864018, + "grad_norm": 3.0768520183826746, + "learning_rate": 5.1201817241237175e-06, + "loss": 1.0507, + "step": 6985 + }, + { + "epoch": 1.3448516495415934, + "grad_norm": 3.261797715322335, + "learning_rate": 5.117460493464852e-06, + "loss": 1.0569, + "step": 6986 + }, + { + "epoch": 1.3450441562191688, + "grad_norm": 3.2171081270955604, + "learning_rate": 5.114739737451297e-06, + "loss": 1.0383, + "step": 6987 + }, + { + "epoch": 1.3452366628967443, + "grad_norm": 3.2706009020803033, + "learning_rate": 5.1120194563475315e-06, + "loss": 1.0872, + "step": 6988 + }, + { + "epoch": 1.3454291695743197, + "grad_norm": 3.1403946841379753, + "learning_rate": 5.109299650418018e-06, + "loss": 1.0149, + "step": 6989 + }, + { + "epoch": 1.3456216762518949, + "grad_norm": 3.14067313338354, + "learning_rate": 5.106580319927146e-06, + "loss": 1.04, + "step": 6990 + }, + { + "epoch": 1.3458141829294703, + "grad_norm": 3.1176850897260273, + "learning_rate": 5.103861465139275e-06, + "loss": 1.0332, + "step": 6991 + }, + { + "epoch": 1.3460066896070457, + "grad_norm": 3.1652729859081563, + "learning_rate": 5.101143086318703e-06, + "loss": 1.0394, + "step": 6992 + }, + { + "epoch": 1.3460066896070457, + "lm_loss": 0.4395, + "step": 6992, + "vm_loss": 0.1364 + }, + { + "epoch": 1.3460066896070457, + "lm_loss": 0.7579, + "step": 6992, + "vm_loss": 0.163 + }, + { + "epoch": 1.3460066896070457, + "lm_loss": 1.0986, + "step": 6992, + "vm_loss": 0.1752 + }, + { + "epoch": 1.3460066896070457, + "lm_loss": 1.1285, + "step": 6992, + "vm_loss": 0.1641 + }, + { + "epoch": 1.3460066896070457, + "lm_loss": 0.8946, + "step": 6992, + "vm_loss": 0.1282 + }, + { + "epoch": 1.3460066896070457, + "lm_loss": 0.8644, + "step": 6992, + "vm_loss": 0.1269 + }, + { + "epoch": 1.3460066896070457, + "lm_loss": 1.4697, + "step": 6992, + "vm_loss": 0.1036 + }, + { + "epoch": 1.3460066896070457, + "lm_loss": 0.874, + "step": 6992, + "vm_loss": 0.1193 + }, + { + "epoch": 1.3461991962846211, + "grad_norm": 3.2511624077096597, + "learning_rate": 5.098425183729704e-06, + "loss": 1.0265, + "step": 6993 + }, + { + "epoch": 1.3463917029621966, + "grad_norm": 3.313968484559467, + "learning_rate": 5.095707757636484e-06, + "loss": 1.0642, + "step": 6994 + }, + { + "epoch": 1.3465842096397718, + "grad_norm": 3.398714973198529, + "learning_rate": 5.092990808303213e-06, + "loss": 1.0852, + "step": 6995 + }, + { + "epoch": 1.3467767163173472, + "grad_norm": 3.2429646263022707, + "learning_rate": 5.090274335994014e-06, + "loss": 1.066, + "step": 6996 + }, + { + "epoch": 1.3469692229949226, + "grad_norm": 3.226575785924758, + "learning_rate": 5.0875583409729676e-06, + "loss": 1.0688, + "step": 6997 + }, + { + "epoch": 1.347161729672498, + "grad_norm": 3.3889982820683042, + "learning_rate": 5.084842823504096e-06, + "loss": 1.1398, + "step": 6998 + }, + { + "epoch": 1.3473542363500735, + "grad_norm": 3.3836863269700888, + "learning_rate": 5.082127783851385e-06, + "loss": 1.1321, + "step": 6999 + }, + { + "epoch": 1.3475467430276487, + "grad_norm": 3.1329966084118817, + "learning_rate": 5.07941322227877e-06, + "loss": 1.0241, + "step": 7000 + }, + { + "epoch": 1.3475467430276487, + "lm_loss": 1.3697, + "step": 7000, + "vm_loss": 0.1373 + }, + { + "epoch": 1.3475467430276487, + "lm_loss": 1.295, + "step": 7000, + "vm_loss": 0.1755 + }, + { + "epoch": 1.3475467430276487, + "lm_loss": 1.0614, + "step": 7000, + "vm_loss": 0.1602 + }, + { + "epoch": 1.3475467430276487, + "lm_loss": 1.4258, + "step": 7000, + "vm_loss": 0.1786 + }, + { + "epoch": 1.3475467430276487, + "lm_loss": 0.5607, + "step": 7000, + "vm_loss": 0.1399 + }, + { + "epoch": 1.3475467430276487, + "lm_loss": 0.6863, + "step": 7000, + "vm_loss": 0.1803 + }, + { + "epoch": 1.3475467430276487, + "lm_loss": 0.8159, + "step": 7000, + "vm_loss": 0.1933 + }, + { + "epoch": 1.3475467430276487, + "lm_loss": 0.8655, + "step": 7000, + "vm_loss": 0.1381 + }, + { + "epoch": 1.347739249705224, + "grad_norm": 3.1971140212292912, + "learning_rate": 5.076699139050144e-06, + "loss": 1.1068, + "step": 7001 + }, + { + "epoch": 1.3479317563827995, + "grad_norm": 3.2873754937552895, + "learning_rate": 5.073985534429349e-06, + "loss": 1.1125, + "step": 7002 + }, + { + "epoch": 1.348124263060375, + "grad_norm": 3.134492004716534, + "learning_rate": 5.071272408680181e-06, + "loss": 1.0287, + "step": 7003 + }, + { + "epoch": 1.3483167697379503, + "grad_norm": 3.3359563863463264, + "learning_rate": 5.068559762066396e-06, + "loss": 1.1014, + "step": 7004 + }, + { + "epoch": 1.3485092764155255, + "grad_norm": 3.1324659075162247, + "learning_rate": 5.065847594851692e-06, + "loss": 1.0521, + "step": 7005 + }, + { + "epoch": 1.348701783093101, + "grad_norm": 3.20751896207077, + "learning_rate": 5.063135907299726e-06, + "loss": 1.0577, + "step": 7006 + }, + { + "epoch": 1.3488942897706764, + "grad_norm": 3.4023816127728703, + "learning_rate": 5.060424699674109e-06, + "loss": 1.0772, + "step": 7007 + }, + { + "epoch": 1.3490867964482518, + "grad_norm": 3.2637932288110423, + "learning_rate": 5.057713972238414e-06, + "loss": 1.0365, + "step": 7008 + }, + { + "epoch": 1.3490867964482518, + "lm_loss": 1.0385, + "step": 7008, + "vm_loss": 0.1319 + }, + { + "epoch": 1.3490867964482518, + "lm_loss": 1.0117, + "step": 7008, + "vm_loss": 0.1255 + }, + { + "epoch": 1.3490867964482518, + "lm_loss": 1.7242, + "step": 7008, + "vm_loss": 0.1972 + }, + { + "epoch": 1.3490867964482518, + "lm_loss": 1.1684, + "step": 7008, + "vm_loss": 0.1732 + }, + { + "epoch": 1.3490867964482518, + "lm_loss": 1.0217, + "step": 7008, + "vm_loss": 0.1334 + }, + { + "epoch": 1.3490867964482518, + "lm_loss": 0.6708, + "step": 7008, + "vm_loss": 0.1568 + }, + { + "epoch": 1.3490867964482518, + "lm_loss": 0.6978, + "step": 7008, + "vm_loss": 0.2026 + }, + { + "epoch": 1.3490867964482518, + "lm_loss": 0.9669, + "step": 7008, + "vm_loss": 0.1449 + }, + { + "epoch": 1.3492793031258272, + "grad_norm": 3.3081169392962564, + "learning_rate": 5.055003725256142e-06, + "loss": 1.0759, + "step": 7009 + }, + { + "epoch": 1.3494718098034024, + "grad_norm": 3.2859245884411665, + "learning_rate": 5.05229395899078e-06, + "loss": 1.0728, + "step": 7010 + }, + { + "epoch": 1.349664316480978, + "grad_norm": 3.401787157871432, + "learning_rate": 5.049584673705742e-06, + "loss": 1.0894, + "step": 7011 + }, + { + "epoch": 1.3498568231585533, + "grad_norm": 3.3308832663866927, + "learning_rate": 5.046875869664407e-06, + "loss": 1.0842, + "step": 7012 + }, + { + "epoch": 1.3500493298361287, + "grad_norm": 3.2625058610122983, + "learning_rate": 5.044167547130108e-06, + "loss": 1.0355, + "step": 7013 + }, + { + "epoch": 1.350241836513704, + "grad_norm": 3.262555258201198, + "learning_rate": 5.04145970636613e-06, + "loss": 1.0667, + "step": 7014 + }, + { + "epoch": 1.3504343431912795, + "grad_norm": 3.1523685896757234, + "learning_rate": 5.0387523476357024e-06, + "loss": 1.035, + "step": 7015 + }, + { + "epoch": 1.350626849868855, + "grad_norm": 3.2261397396233473, + "learning_rate": 5.03604547120202e-06, + "loss": 1.0417, + "step": 7016 + }, + { + "epoch": 1.350626849868855, + "lm_loss": 0.7755, + "step": 7016, + "vm_loss": 0.1769 + }, + { + "epoch": 1.350626849868855, + "lm_loss": 1.5116, + "step": 7016, + "vm_loss": 0.2167 + }, + { + "epoch": 1.350626849868855, + "lm_loss": 1.3271, + "step": 7016, + "vm_loss": 0.1381 + }, + { + "epoch": 1.350626849868855, + "lm_loss": 0.9609, + "step": 7016, + "vm_loss": 0.1652 + }, + { + "epoch": 1.350626849868855, + "lm_loss": 0.854, + "step": 7016, + "vm_loss": 0.1777 + }, + { + "epoch": 1.350626849868855, + "lm_loss": 0.5871, + "step": 7016, + "vm_loss": 0.196 + }, + { + "epoch": 1.350626849868855, + "lm_loss": 1.1658, + "step": 7016, + "vm_loss": 0.144 + }, + { + "epoch": 1.350626849868855, + "lm_loss": 0.6783, + "step": 7016, + "vm_loss": 0.1392 + }, + { + "epoch": 1.3508193565464301, + "grad_norm": 3.276769872753125, + "learning_rate": 5.033339077328225e-06, + "loss": 1.083, + "step": 7017 + }, + { + "epoch": 1.3510118632240056, + "grad_norm": 3.1706901801997582, + "learning_rate": 5.0306331662774185e-06, + "loss": 1.0477, + "step": 7018 + }, + { + "epoch": 1.351204369901581, + "grad_norm": 3.2552947299878676, + "learning_rate": 5.027927738312637e-06, + "loss": 1.0454, + "step": 7019 + }, + { + "epoch": 1.3513968765791564, + "grad_norm": 3.407927209008117, + "learning_rate": 5.025222793696898e-06, + "loss": 1.1409, + "step": 7020 + }, + { + "epoch": 1.3515893832567318, + "grad_norm": 3.1788750969842368, + "learning_rate": 5.022518332693145e-06, + "loss": 1.0454, + "step": 7021 + }, + { + "epoch": 1.351781889934307, + "grad_norm": 3.3358747674809104, + "learning_rate": 5.019814355564292e-06, + "loss": 1.0628, + "step": 7022 + }, + { + "epoch": 1.3519743966118825, + "grad_norm": 3.2604331967790743, + "learning_rate": 5.017110862573198e-06, + "loss": 1.0653, + "step": 7023 + }, + { + "epoch": 1.3521669032894579, + "grad_norm": 3.242091255967768, + "learning_rate": 5.014407853982679e-06, + "loss": 1.0925, + "step": 7024 + }, + { + "epoch": 1.3521669032894579, + "lm_loss": 0.7482, + "step": 7024, + "vm_loss": 0.2584 + }, + { + "epoch": 1.3521669032894579, + "lm_loss": 1.0807, + "step": 7024, + "vm_loss": 0.173 + }, + { + "epoch": 1.3521669032894579, + "lm_loss": 0.9074, + "step": 7024, + "vm_loss": 0.1267 + }, + { + "epoch": 1.3521669032894579, + "lm_loss": 1.0855, + "step": 7024, + "vm_loss": 0.2429 + }, + { + "epoch": 1.3521669032894579, + "lm_loss": 1.5356, + "step": 7024, + "vm_loss": 0.1658 + }, + { + "epoch": 1.3521669032894579, + "lm_loss": 0.7081, + "step": 7024, + "vm_loss": 0.2358 + }, + { + "epoch": 1.3521669032894579, + "lm_loss": 0.6794, + "step": 7024, + "vm_loss": 0.1327 + }, + { + "epoch": 1.3521669032894579, + "lm_loss": 0.9537, + "step": 7024, + "vm_loss": 0.1649 + }, + { + "epoch": 1.3523594099670333, + "grad_norm": 3.4084755133963758, + "learning_rate": 5.011705330055503e-06, + "loss": 1.1128, + "step": 7025 + }, + { + "epoch": 1.3525519166446087, + "grad_norm": 3.1586039917168343, + "learning_rate": 5.009003291054382e-06, + "loss": 1.0662, + "step": 7026 + }, + { + "epoch": 1.352744423322184, + "grad_norm": 3.1051612367691717, + "learning_rate": 5.006301737242001e-06, + "loss": 0.981, + "step": 7027 + }, + { + "epoch": 1.3529369299997593, + "grad_norm": 3.257798011986029, + "learning_rate": 5.003600668880975e-06, + "loss": 1.0241, + "step": 7028 + }, + { + "epoch": 1.3531294366773348, + "grad_norm": 3.1825731148571967, + "learning_rate": 5.000900086233888e-06, + "loss": 1.0742, + "step": 7029 + }, + { + "epoch": 1.3533219433549102, + "grad_norm": 3.3461707133581724, + "learning_rate": 4.998199989563269e-06, + "loss": 1.0969, + "step": 7030 + }, + { + "epoch": 1.3535144500324856, + "grad_norm": 3.292655661511612, + "learning_rate": 4.995500379131607e-06, + "loss": 1.0967, + "step": 7031 + }, + { + "epoch": 1.3537069567100608, + "grad_norm": 3.243348524788829, + "learning_rate": 4.99280125520133e-06, + "loss": 1.0691, + "step": 7032 + }, + { + "epoch": 1.3537069567100608, + "lm_loss": 0.4519, + "step": 7032, + "vm_loss": 0.1788 + }, + { + "epoch": 1.3537069567100608, + "lm_loss": 0.6597, + "step": 7032, + "vm_loss": 0.1979 + }, + { + "epoch": 1.3537069567100608, + "lm_loss": 0.7942, + "step": 7032, + "vm_loss": 0.1827 + }, + { + "epoch": 1.3537069567100608, + "lm_loss": 0.6231, + "step": 7032, + "vm_loss": 0.2301 + }, + { + "epoch": 1.3537069567100608, + "lm_loss": 0.6757, + "step": 7032, + "vm_loss": 0.1631 + }, + { + "epoch": 1.3537069567100608, + "lm_loss": 1.002, + "step": 7032, + "vm_loss": 0.1497 + }, + { + "epoch": 1.3537069567100608, + "lm_loss": 0.9637, + "step": 7032, + "vm_loss": 0.2254 + }, + { + "epoch": 1.3537069567100608, + "lm_loss": 0.7412, + "step": 7032, + "vm_loss": 0.1848 + }, + { + "epoch": 1.3538994633876362, + "grad_norm": 3.2221471019601835, + "learning_rate": 4.990102618034832e-06, + "loss": 1.0794, + "step": 7033 + }, + { + "epoch": 1.3540919700652116, + "grad_norm": 3.3941904765736495, + "learning_rate": 4.987404467894456e-06, + "loss": 1.0973, + "step": 7034 + }, + { + "epoch": 1.354284476742787, + "grad_norm": 3.461218689878161, + "learning_rate": 4.984706805042497e-06, + "loss": 1.0949, + "step": 7035 + }, + { + "epoch": 1.3544769834203625, + "grad_norm": 3.2016768049193502, + "learning_rate": 4.9820096297411945e-06, + "loss": 1.0234, + "step": 7036 + }, + { + "epoch": 1.3546694900979377, + "grad_norm": 3.4973752911200022, + "learning_rate": 4.979312942252763e-06, + "loss": 1.185, + "step": 7037 + }, + { + "epoch": 1.3548619967755131, + "grad_norm": 3.0898950681327846, + "learning_rate": 4.976616742839342e-06, + "loss": 1.0132, + "step": 7038 + }, + { + "epoch": 1.3550545034530885, + "grad_norm": 3.2200748093646845, + "learning_rate": 4.973921031763042e-06, + "loss": 1.1154, + "step": 7039 + }, + { + "epoch": 1.355247010130664, + "grad_norm": 3.1017967673750433, + "learning_rate": 4.97122580928592e-06, + "loss": 1.0223, + "step": 7040 + }, + { + "epoch": 1.355247010130664, + "lm_loss": 0.9618, + "step": 7040, + "vm_loss": 0.174 + }, + { + "epoch": 1.355247010130664, + "lm_loss": 0.9607, + "step": 7040, + "vm_loss": 0.1904 + }, + { + "epoch": 1.355247010130664, + "lm_loss": 0.8893, + "step": 7040, + "vm_loss": 0.1685 + }, + { + "epoch": 1.355247010130664, + "lm_loss": 0.8751, + "step": 7040, + "vm_loss": 0.1876 + }, + { + "epoch": 1.355247010130664, + "lm_loss": 1.0891, + "step": 7040, + "vm_loss": 0.0878 + }, + { + "epoch": 1.355247010130664, + "lm_loss": 1.3179, + "step": 7040, + "vm_loss": 0.1373 + }, + { + "epoch": 1.355247010130664, + "lm_loss": 1.0796, + "step": 7040, + "vm_loss": 0.1489 + }, + { + "epoch": 1.355247010130664, + "lm_loss": 0.6549, + "step": 7040, + "vm_loss": 0.2317 + }, + { + "epoch": 1.3554395168082394, + "grad_norm": 3.228617061313726, + "learning_rate": 4.96853107566999e-06, + "loss": 1.0932, + "step": 7041 + }, + { + "epoch": 1.3556320234858146, + "grad_norm": 3.2204735756778393, + "learning_rate": 4.965836831177206e-06, + "loss": 1.1058, + "step": 7042 + }, + { + "epoch": 1.35582453016339, + "grad_norm": 3.203507371510655, + "learning_rate": 4.96314307606949e-06, + "loss": 1.0637, + "step": 7043 + }, + { + "epoch": 1.3560170368409654, + "grad_norm": 3.203881705389357, + "learning_rate": 4.960449810608705e-06, + "loss": 1.0911, + "step": 7044 + }, + { + "epoch": 1.3562095435185408, + "grad_norm": 3.1978205491362113, + "learning_rate": 4.957757035056674e-06, + "loss": 1.0314, + "step": 7045 + }, + { + "epoch": 1.3564020501961163, + "grad_norm": 3.2535014130887236, + "learning_rate": 4.955064749675168e-06, + "loss": 1.0927, + "step": 7046 + }, + { + "epoch": 1.3565945568736915, + "grad_norm": 3.2249263651046705, + "learning_rate": 4.9523729547259125e-06, + "loss": 1.0469, + "step": 7047 + }, + { + "epoch": 1.3567870635512669, + "grad_norm": 3.167104445490603, + "learning_rate": 4.949681650470587e-06, + "loss": 1.031, + "step": 7048 + }, + { + "epoch": 1.3567870635512669, + "lm_loss": 1.012, + "step": 7048, + "vm_loss": 0.1854 + }, + { + "epoch": 1.3567870635512669, + "lm_loss": 0.5898, + "step": 7048, + "vm_loss": 0.2261 + }, + { + "epoch": 1.3567870635512669, + "lm_loss": 0.8023, + "step": 7048, + "vm_loss": 0.1592 + }, + { + "epoch": 1.3567870635512669, + "lm_loss": 0.9925, + "step": 7048, + "vm_loss": 0.1843 + }, + { + "epoch": 1.3567870635512669, + "lm_loss": 0.726, + "step": 7048, + "vm_loss": 0.1381 + }, + { + "epoch": 1.3567870635512669, + "lm_loss": 0.8478, + "step": 7048, + "vm_loss": 0.2006 + }, + { + "epoch": 1.3567870635512669, + "lm_loss": 0.6665, + "step": 7048, + "vm_loss": 0.1177 + }, + { + "epoch": 1.3567870635512669, + "lm_loss": 1.1384, + "step": 7048, + "vm_loss": 0.1653 + }, + { + "epoch": 1.3569795702288423, + "grad_norm": 3.1222255461659945, + "learning_rate": 4.946990837170814e-06, + "loss": 1.0054, + "step": 7049 + }, + { + "epoch": 1.3571720769064177, + "grad_norm": 3.4422307586468976, + "learning_rate": 4.944300515088178e-06, + "loss": 1.1559, + "step": 7050 + }, + { + "epoch": 1.3573645835839931, + "grad_norm": 3.230864226898403, + "learning_rate": 4.941610684484214e-06, + "loss": 1.0781, + "step": 7051 + }, + { + "epoch": 1.3575570902615683, + "grad_norm": 3.2991112150014317, + "learning_rate": 4.938921345620411e-06, + "loss": 1.0897, + "step": 7052 + }, + { + "epoch": 1.3577495969391438, + "grad_norm": 3.363526726113174, + "learning_rate": 4.936232498758195e-06, + "loss": 1.0753, + "step": 7053 + }, + { + "epoch": 1.3579421036167192, + "grad_norm": 3.284159403744669, + "learning_rate": 4.933544144158972e-06, + "loss": 1.0254, + "step": 7054 + }, + { + "epoch": 1.3581346102942946, + "grad_norm": 3.209213872587984, + "learning_rate": 4.930856282084074e-06, + "loss": 1.0024, + "step": 7055 + }, + { + "epoch": 1.35832711697187, + "grad_norm": 3.232070739964759, + "learning_rate": 4.928168912794803e-06, + "loss": 1.0453, + "step": 7056 + }, + { + "epoch": 1.35832711697187, + "lm_loss": 1.2744, + "step": 7056, + "vm_loss": 0.1874 + }, + { + "epoch": 1.35832711697187, + "lm_loss": 0.7717, + "step": 7056, + "vm_loss": 0.1289 + }, + { + "epoch": 1.35832711697187, + "lm_loss": 1.1349, + "step": 7056, + "vm_loss": 0.2007 + }, + { + "epoch": 1.35832711697187, + "lm_loss": 0.9859, + "step": 7056, + "vm_loss": 0.1593 + }, + { + "epoch": 1.35832711697187, + "lm_loss": 0.7024, + "step": 7056, + "vm_loss": 0.1419 + }, + { + "epoch": 1.35832711697187, + "lm_loss": 0.8248, + "step": 7056, + "vm_loss": 0.1957 + }, + { + "epoch": 1.35832711697187, + "lm_loss": 1.1315, + "step": 7056, + "vm_loss": 0.1535 + }, + { + "epoch": 1.35832711697187, + "lm_loss": 1.0956, + "step": 7056, + "vm_loss": 0.1792 + }, + { + "epoch": 1.3585196236494452, + "grad_norm": 3.332246641255352, + "learning_rate": 4.925482036552393e-06, + "loss": 1.0732, + "step": 7057 + }, + { + "epoch": 1.3587121303270207, + "grad_norm": 3.3276336153700465, + "learning_rate": 4.922795653618059e-06, + "loss": 1.044, + "step": 7058 + }, + { + "epoch": 1.358904637004596, + "grad_norm": 3.2191386126354686, + "learning_rate": 4.920109764252939e-06, + "loss": 1.0459, + "step": 7059 + }, + { + "epoch": 1.3590971436821715, + "grad_norm": 3.375590773088858, + "learning_rate": 4.917424368718141e-06, + "loss": 1.0765, + "step": 7060 + }, + { + "epoch": 1.359289650359747, + "grad_norm": 3.213202976080089, + "learning_rate": 4.914739467274721e-06, + "loss": 1.0104, + "step": 7061 + }, + { + "epoch": 1.3594821570373221, + "grad_norm": 3.3084010657113203, + "learning_rate": 4.912055060183687e-06, + "loss": 1.0538, + "step": 7062 + }, + { + "epoch": 1.3596746637148975, + "grad_norm": 3.1141084709878037, + "learning_rate": 4.909371147705987e-06, + "loss": 0.964, + "step": 7063 + }, + { + "epoch": 1.359867170392473, + "grad_norm": 3.3617157213971813, + "learning_rate": 4.906687730102547e-06, + "loss": 1.0662, + "step": 7064 + }, + { + "epoch": 1.359867170392473, + "lm_loss": 0.8885, + "step": 7064, + "vm_loss": 0.196 + }, + { + "epoch": 1.359867170392473, + "lm_loss": 0.7341, + "step": 7064, + "vm_loss": 0.0982 + }, + { + "epoch": 1.359867170392473, + "lm_loss": 0.8265, + "step": 7064, + "vm_loss": 0.194 + }, + { + "epoch": 1.359867170392473, + "lm_loss": 0.7617, + "step": 7064, + "vm_loss": 0.132 + }, + { + "epoch": 1.359867170392473, + "lm_loss": 1.1231, + "step": 7064, + "vm_loss": 0.1338 + }, + { + "epoch": 1.359867170392473, + "lm_loss": 0.6978, + "step": 7064, + "vm_loss": 0.1872 + }, + { + "epoch": 1.359867170392473, + "lm_loss": 0.864, + "step": 7064, + "vm_loss": 0.194 + }, + { + "epoch": 1.359867170392473, + "lm_loss": 0.8986, + "step": 7064, + "vm_loss": 0.1659 + }, + { + "epoch": 1.3600596770700484, + "grad_norm": 3.1566727302445945, + "learning_rate": 4.9040048076342195e-06, + "loss": 1.0205, + "step": 7065 + }, + { + "epoch": 1.3602521837476238, + "grad_norm": 3.364266433699712, + "learning_rate": 4.90132238056182e-06, + "loss": 1.1213, + "step": 7066 + }, + { + "epoch": 1.360444690425199, + "grad_norm": 3.281674362027754, + "learning_rate": 4.898640449146118e-06, + "loss": 1.0511, + "step": 7067 + }, + { + "epoch": 1.3606371971027744, + "grad_norm": 3.3329300665891, + "learning_rate": 4.895959013647828e-06, + "loss": 1.0947, + "step": 7068 + }, + { + "epoch": 1.3608297037803498, + "grad_norm": 3.2369855966483914, + "learning_rate": 4.893278074327627e-06, + "loss": 1.0573, + "step": 7069 + }, + { + "epoch": 1.3610222104579253, + "grad_norm": 3.2681183810022216, + "learning_rate": 4.890597631446124e-06, + "loss": 1.1227, + "step": 7070 + }, + { + "epoch": 1.3612147171355007, + "grad_norm": 3.2419842159483814, + "learning_rate": 4.887917685263906e-06, + "loss": 1.0745, + "step": 7071 + }, + { + "epoch": 1.3614072238130759, + "grad_norm": 3.1726430750595567, + "learning_rate": 4.885238236041487e-06, + "loss": 1.0189, + "step": 7072 + }, + { + "epoch": 1.3614072238130759, + "lm_loss": 0.7693, + "step": 7072, + "vm_loss": 0.1495 + }, + { + "epoch": 1.3614072238130759, + "lm_loss": 1.1509, + "step": 7072, + "vm_loss": 0.1588 + }, + { + "epoch": 1.3614072238130759, + "lm_loss": 0.9204, + "step": 7072, + "vm_loss": 0.1895 + }, + { + "epoch": 1.3614072238130759, + "lm_loss": 0.775, + "step": 7072, + "vm_loss": 0.1573 + }, + { + "epoch": 1.3614072238130759, + "lm_loss": 1.1243, + "step": 7072, + "vm_loss": 0.1446 + }, + { + "epoch": 1.3614072238130759, + "lm_loss": 0.7317, + "step": 7072, + "vm_loss": 0.1807 + }, + { + "epoch": 1.3614072238130759, + "lm_loss": 0.7785, + "step": 7072, + "vm_loss": 0.1673 + }, + { + "epoch": 1.3614072238130759, + "lm_loss": 0.8687, + "step": 7072, + "vm_loss": 0.1625 + }, + { + "epoch": 1.3615997304906515, + "grad_norm": 3.1079160357638527, + "learning_rate": 4.882559284039353e-06, + "loss": 1.0343, + "step": 7073 + }, + { + "epoch": 1.3617922371682267, + "grad_norm": 3.347357138036633, + "learning_rate": 4.879880829517921e-06, + "loss": 1.0684, + "step": 7074 + }, + { + "epoch": 1.3619847438458021, + "grad_norm": 3.1897892769497442, + "learning_rate": 4.877202872737585e-06, + "loss": 1.0463, + "step": 7075 + }, + { + "epoch": 1.3621772505233776, + "grad_norm": 3.1912344431291984, + "learning_rate": 4.874525413958665e-06, + "loss": 1.0251, + "step": 7076 + }, + { + "epoch": 1.362369757200953, + "grad_norm": 3.237945837556285, + "learning_rate": 4.871848453441449e-06, + "loss": 1.077, + "step": 7077 + }, + { + "epoch": 1.3625622638785284, + "grad_norm": 3.2507323025596366, + "learning_rate": 4.8691719914461725e-06, + "loss": 1.0511, + "step": 7078 + }, + { + "epoch": 1.3627547705561036, + "grad_norm": 3.3846922632320147, + "learning_rate": 4.866496028233023e-06, + "loss": 1.0757, + "step": 7079 + }, + { + "epoch": 1.362947277233679, + "grad_norm": 3.308047938623538, + "learning_rate": 4.86382056406213e-06, + "loss": 1.1051, + "step": 7080 + }, + { + "epoch": 1.362947277233679, + "lm_loss": 0.848, + "step": 7080, + "vm_loss": 0.1735 + }, + { + "epoch": 1.362947277233679, + "lm_loss": 0.7518, + "step": 7080, + "vm_loss": 0.124 + }, + { + "epoch": 1.362947277233679, + "lm_loss": 1.1171, + "step": 7080, + "vm_loss": 0.2214 + }, + { + "epoch": 1.362947277233679, + "lm_loss": 0.846, + "step": 7080, + "vm_loss": 0.1504 + }, + { + "epoch": 1.362947277233679, + "lm_loss": 0.7262, + "step": 7080, + "vm_loss": 0.1566 + }, + { + "epoch": 1.362947277233679, + "lm_loss": 0.8489, + "step": 7080, + "vm_loss": 0.2092 + }, + { + "epoch": 1.362947277233679, + "lm_loss": 0.7605, + "step": 7080, + "vm_loss": 0.1295 + }, + { + "epoch": 1.362947277233679, + "lm_loss": 1.07, + "step": 7080, + "vm_loss": 0.1462 + }, + { + "epoch": 1.3631397839112545, + "grad_norm": 3.314393392928482, + "learning_rate": 4.861145599193596e-06, + "loss": 1.0714, + "step": 7081 + }, + { + "epoch": 1.3633322905888299, + "grad_norm": 3.234185518230245, + "learning_rate": 4.858471133887451e-06, + "loss": 0.9809, + "step": 7082 + }, + { + "epoch": 1.3635247972664053, + "grad_norm": 3.2570231782122177, + "learning_rate": 4.855797168403692e-06, + "loss": 1.0603, + "step": 7083 + }, + { + "epoch": 1.3637173039439805, + "grad_norm": 3.306835201132162, + "learning_rate": 4.853123703002262e-06, + "loss": 1.1068, + "step": 7084 + }, + { + "epoch": 1.363909810621556, + "grad_norm": 3.225418981328935, + "learning_rate": 4.8504507379430584e-06, + "loss": 1.0754, + "step": 7085 + }, + { + "epoch": 1.3641023172991313, + "grad_norm": 3.289673204544399, + "learning_rate": 4.8477782734859215e-06, + "loss": 1.0843, + "step": 7086 + }, + { + "epoch": 1.3642948239767068, + "grad_norm": 3.145958458809384, + "learning_rate": 4.845106309890654e-06, + "loss": 1.049, + "step": 7087 + }, + { + "epoch": 1.3644873306542822, + "grad_norm": 3.193507920095359, + "learning_rate": 4.842434847417001e-06, + "loss": 1.0174, + "step": 7088 + }, + { + "epoch": 1.3644873306542822, + "lm_loss": 0.6428, + "step": 7088, + "vm_loss": 0.1787 + }, + { + "epoch": 1.3644873306542822, + "lm_loss": 0.7582, + "step": 7088, + "vm_loss": 0.187 + }, + { + "epoch": 1.3644873306542822, + "lm_loss": 1.0783, + "step": 7088, + "vm_loss": 0.2249 + }, + { + "epoch": 1.3644873306542822, + "lm_loss": 0.6633, + "step": 7088, + "vm_loss": 0.1715 + }, + { + "epoch": 1.3644873306542822, + "lm_loss": 1.1022, + "step": 7088, + "vm_loss": 0.1978 + }, + { + "epoch": 1.3644873306542822, + "lm_loss": 0.9337, + "step": 7088, + "vm_loss": 0.1847 + }, + { + "epoch": 1.3644873306542822, + "lm_loss": 0.8289, + "step": 7088, + "vm_loss": 0.1834 + }, + { + "epoch": 1.3644873306542822, + "lm_loss": 0.6194, + "step": 7088, + "vm_loss": 0.1258 + }, + { + "epoch": 1.3646798373318574, + "grad_norm": 3.2094122593164616, + "learning_rate": 4.839763886324668e-06, + "loss": 1.0365, + "step": 7089 + }, + { + "epoch": 1.3648723440094328, + "grad_norm": 3.1914931873834833, + "learning_rate": 4.837093426873306e-06, + "loss": 1.0235, + "step": 7090 + }, + { + "epoch": 1.3650648506870082, + "grad_norm": 3.288741738240951, + "learning_rate": 4.834423469322509e-06, + "loss": 1.1068, + "step": 7091 + }, + { + "epoch": 1.3652573573645836, + "grad_norm": 3.2756853261272747, + "learning_rate": 4.831754013931844e-06, + "loss": 1.0786, + "step": 7092 + }, + { + "epoch": 1.365449864042159, + "grad_norm": 3.2630285015588445, + "learning_rate": 4.829085060960808e-06, + "loss": 1.0209, + "step": 7093 + }, + { + "epoch": 1.3656423707197343, + "grad_norm": 3.2784387953089786, + "learning_rate": 4.826416610668858e-06, + "loss": 1.0458, + "step": 7094 + }, + { + "epoch": 1.3658348773973097, + "grad_norm": 3.256025392665378, + "learning_rate": 4.823748663315404e-06, + "loss": 1.0558, + "step": 7095 + }, + { + "epoch": 1.366027384074885, + "grad_norm": 3.1385564305713367, + "learning_rate": 4.821081219159806e-06, + "loss": 0.9779, + "step": 7096 + }, + { + "epoch": 1.366027384074885, + "lm_loss": 0.8178, + "step": 7096, + "vm_loss": 0.1388 + }, + { + "epoch": 1.366027384074885, + "lm_loss": 0.8379, + "step": 7096, + "vm_loss": 0.2216 + }, + { + "epoch": 1.366027384074885, + "lm_loss": 1.0425, + "step": 7096, + "vm_loss": 0.1199 + }, + { + "epoch": 1.366027384074885, + "lm_loss": 0.5086, + "step": 7096, + "vm_loss": 0.1309 + }, + { + "epoch": 1.366027384074885, + "lm_loss": 1.1079, + "step": 7096, + "vm_loss": 0.1256 + }, + { + "epoch": 1.366027384074885, + "lm_loss": 1.2865, + "step": 7096, + "vm_loss": 0.2172 + }, + { + "epoch": 1.366027384074885, + "lm_loss": 0.9702, + "step": 7096, + "vm_loss": 0.1899 + }, + { + "epoch": 1.366027384074885, + "lm_loss": 1.0967, + "step": 7096, + "vm_loss": 0.2411 + }, + { + "epoch": 1.3662198907524605, + "grad_norm": 3.2470761727469672, + "learning_rate": 4.818414278461365e-06, + "loss": 1.0857, + "step": 7097 + }, + { + "epoch": 1.366412397430036, + "grad_norm": 3.1784661220901542, + "learning_rate": 4.815747841479356e-06, + "loss": 1.0154, + "step": 7098 + }, + { + "epoch": 1.3666049041076112, + "grad_norm": 3.3479864030220448, + "learning_rate": 4.813081908472977e-06, + "loss": 1.063, + "step": 7099 + }, + { + "epoch": 1.3667974107851866, + "grad_norm": 3.329823176932758, + "learning_rate": 4.810416479701402e-06, + "loss": 1.0397, + "step": 7100 + }, + { + "epoch": 1.366989917462762, + "grad_norm": 3.362421904172485, + "learning_rate": 4.8077515554237304e-06, + "loss": 1.0458, + "step": 7101 + }, + { + "epoch": 1.3671824241403374, + "grad_norm": 3.44068970315734, + "learning_rate": 4.8050871358990446e-06, + "loss": 1.0953, + "step": 7102 + }, + { + "epoch": 1.3673749308179128, + "grad_norm": 3.1922532266400223, + "learning_rate": 4.802423221386347e-06, + "loss": 1.064, + "step": 7103 + }, + { + "epoch": 1.367567437495488, + "grad_norm": 3.1764295308848087, + "learning_rate": 4.79975981214461e-06, + "loss": 0.9904, + "step": 7104 + }, + { + "epoch": 1.367567437495488, + "lm_loss": 0.955, + "step": 7104, + "vm_loss": 0.1363 + }, + { + "epoch": 1.367567437495488, + "lm_loss": 0.6801, + "step": 7104, + "vm_loss": 0.194 + }, + { + "epoch": 1.367567437495488, + "lm_loss": 0.828, + "step": 7104, + "vm_loss": 0.1722 + }, + { + "epoch": 1.367567437495488, + "lm_loss": 1.0659, + "step": 7104, + "vm_loss": 0.1728 + }, + { + "epoch": 1.367567437495488, + "lm_loss": 0.7548, + "step": 7104, + "vm_loss": 0.1594 + }, + { + "epoch": 1.367567437495488, + "lm_loss": 1.177, + "step": 7104, + "vm_loss": 0.1518 + }, + { + "epoch": 1.367567437495488, + "lm_loss": 0.9026, + "step": 7104, + "vm_loss": 0.1029 + }, + { + "epoch": 1.367567437495488, + "lm_loss": 1.478, + "step": 7104, + "vm_loss": 0.2062 + }, + { + "epoch": 1.3677599441730635, + "grad_norm": 3.3017230584823722, + "learning_rate": 4.797096908432749e-06, + "loss": 1.03, + "step": 7105 + }, + { + "epoch": 1.3679524508506389, + "grad_norm": 3.224838174502266, + "learning_rate": 4.794434510509632e-06, + "loss": 1.0323, + "step": 7106 + }, + { + "epoch": 1.3681449575282143, + "grad_norm": 3.3641021800481, + "learning_rate": 4.791772618634084e-06, + "loss": 1.1041, + "step": 7107 + }, + { + "epoch": 1.3683374642057897, + "grad_norm": 3.242532957559337, + "learning_rate": 4.789111233064867e-06, + "loss": 1.0518, + "step": 7108 + }, + { + "epoch": 1.368529970883365, + "grad_norm": 3.3298553167355105, + "learning_rate": 4.786450354060703e-06, + "loss": 1.0286, + "step": 7109 + }, + { + "epoch": 1.3687224775609403, + "grad_norm": 3.3661302962865727, + "learning_rate": 4.783789981880267e-06, + "loss": 1.0656, + "step": 7110 + }, + { + "epoch": 1.3689149842385158, + "grad_norm": 3.0979804569039397, + "learning_rate": 4.781130116782179e-06, + "loss": 0.9882, + "step": 7111 + }, + { + "epoch": 1.3691074909160912, + "grad_norm": 3.2378044900306406, + "learning_rate": 4.7784707590250124e-06, + "loss": 1.012, + "step": 7112 + }, + { + "epoch": 1.3691074909160912, + "lm_loss": 1.058, + "step": 7112, + "vm_loss": 0.1692 + }, + { + "epoch": 1.3691074909160912, + "lm_loss": 0.6313, + "step": 7112, + "vm_loss": 0.1611 + }, + { + "epoch": 1.3691074909160912, + "lm_loss": 1.413, + "step": 7112, + "vm_loss": 0.2613 + }, + { + "epoch": 1.3691074909160912, + "lm_loss": 1.1325, + "step": 7112, + "vm_loss": 0.1333 + }, + { + "epoch": 1.3691074909160912, + "lm_loss": 0.7352, + "step": 7112, + "vm_loss": 0.1509 + }, + { + "epoch": 1.3691074909160912, + "lm_loss": 0.7394, + "step": 7112, + "vm_loss": 0.1265 + }, + { + "epoch": 1.3691074909160912, + "lm_loss": 0.6363, + "step": 7112, + "vm_loss": 0.1249 + }, + { + "epoch": 1.3691074909160912, + "lm_loss": 0.7023, + "step": 7112, + "vm_loss": 0.1531 + }, + { + "epoch": 1.3692999975936666, + "grad_norm": 3.364143469123974, + "learning_rate": 4.775811908867295e-06, + "loss": 1.1084, + "step": 7113 + }, + { + "epoch": 1.3694925042712418, + "grad_norm": 3.2782866525261536, + "learning_rate": 4.773153566567489e-06, + "loss": 1.0511, + "step": 7114 + }, + { + "epoch": 1.3696850109488172, + "grad_norm": 3.193657171189352, + "learning_rate": 4.770495732384035e-06, + "loss": 1.0323, + "step": 7115 + }, + { + "epoch": 1.3698775176263926, + "grad_norm": 3.298515994385738, + "learning_rate": 4.767838406575297e-06, + "loss": 1.0548, + "step": 7116 + }, + { + "epoch": 1.370070024303968, + "grad_norm": 3.2968768391369414, + "learning_rate": 4.765181589399609e-06, + "loss": 1.0691, + "step": 7117 + }, + { + "epoch": 1.3702625309815435, + "grad_norm": 3.1937952089991013, + "learning_rate": 4.762525281115235e-06, + "loss": 1.0606, + "step": 7118 + }, + { + "epoch": 1.3704550376591187, + "grad_norm": 3.500103068445227, + "learning_rate": 4.759869481980418e-06, + "loss": 1.1315, + "step": 7119 + }, + { + "epoch": 1.3706475443366941, + "grad_norm": 3.345112654935229, + "learning_rate": 4.7572141922533245e-06, + "loss": 1.0523, + "step": 7120 + }, + { + "epoch": 1.3706475443366941, + "lm_loss": 0.7338, + "step": 7120, + "vm_loss": 0.1885 + }, + { + "epoch": 1.3706475443366941, + "lm_loss": 0.7907, + "step": 7120, + "vm_loss": 0.1971 + }, + { + "epoch": 1.3706475443366941, + "lm_loss": 0.8778, + "step": 7120, + "vm_loss": 0.1629 + }, + { + "epoch": 1.3706475443366941, + "lm_loss": 0.9886, + "step": 7120, + "vm_loss": 0.0891 + }, + { + "epoch": 1.3706475443366941, + "lm_loss": 1.3416, + "step": 7120, + "vm_loss": 0.1253 + }, + { + "epoch": 1.3706475443366941, + "lm_loss": 0.6807, + "step": 7120, + "vm_loss": 0.1116 + }, + { + "epoch": 1.3706475443366941, + "lm_loss": 1.0028, + "step": 7120, + "vm_loss": 0.1263 + }, + { + "epoch": 1.3706475443366941, + "lm_loss": 0.4997, + "step": 7120, + "vm_loss": 0.0931 + }, + { + "epoch": 1.3708400510142695, + "grad_norm": 3.211014526309793, + "learning_rate": 4.754559412192087e-06, + "loss": 1.0034, + "step": 7121 + }, + { + "epoch": 1.371032557691845, + "grad_norm": 3.3156406872211184, + "learning_rate": 4.751905142054783e-06, + "loss": 1.0655, + "step": 7122 + }, + { + "epoch": 1.3712250643694204, + "grad_norm": 3.27702613506469, + "learning_rate": 4.7492513820994455e-06, + "loss": 1.0251, + "step": 7123 + }, + { + "epoch": 1.3714175710469956, + "grad_norm": 3.2672661785670485, + "learning_rate": 4.746598132584047e-06, + "loss": 1.0664, + "step": 7124 + }, + { + "epoch": 1.371610077724571, + "grad_norm": 3.3222556304415773, + "learning_rate": 4.74394539376652e-06, + "loss": 1.0387, + "step": 7125 + }, + { + "epoch": 1.3718025844021464, + "grad_norm": 3.2365192202430575, + "learning_rate": 4.741293165904747e-06, + "loss": 1.023, + "step": 7126 + }, + { + "epoch": 1.3719950910797218, + "grad_norm": 3.3319102237630367, + "learning_rate": 4.738641449256555e-06, + "loss": 1.101, + "step": 7127 + }, + { + "epoch": 1.3721875977572973, + "grad_norm": 3.1907847592354575, + "learning_rate": 4.735990244079727e-06, + "loss": 1.0664, + "step": 7128 + }, + { + "epoch": 1.3721875977572973, + "lm_loss": 0.6782, + "step": 7128, + "vm_loss": 0.1464 + }, + { + "epoch": 1.3721875977572973, + "lm_loss": 0.8332, + "step": 7128, + "vm_loss": 0.193 + }, + { + "epoch": 1.3721875977572973, + "lm_loss": 0.4181, + "step": 7128, + "vm_loss": 0.1852 + }, + { + "epoch": 1.3721875977572973, + "lm_loss": 1.1329, + "step": 7128, + "vm_loss": 0.0839 + }, + { + "epoch": 1.3721875977572973, + "lm_loss": 0.8547, + "step": 7128, + "vm_loss": 0.1443 + }, + { + "epoch": 1.3721875977572973, + "lm_loss": 0.9379, + "step": 7128, + "vm_loss": 0.1991 + }, + { + "epoch": 1.3721875977572973, + "lm_loss": 0.802, + "step": 7128, + "vm_loss": 0.1206 + }, + { + "epoch": 1.3721875977572973, + "lm_loss": 1.0139, + "step": 7128, + "vm_loss": 0.1201 + }, + { + "epoch": 1.3723801044348725, + "grad_norm": 3.1155307860371484, + "learning_rate": 4.733339550631993e-06, + "loss": 0.9863, + "step": 7129 + }, + { + "epoch": 1.3725726111124479, + "grad_norm": 3.1389427447816463, + "learning_rate": 4.73068936917104e-06, + "loss": 0.9647, + "step": 7130 + }, + { + "epoch": 1.3727651177900233, + "grad_norm": 3.218499440956164, + "learning_rate": 4.72803969995449e-06, + "loss": 1.0451, + "step": 7131 + }, + { + "epoch": 1.3729576244675987, + "grad_norm": 3.302689711491638, + "learning_rate": 4.7253905432399295e-06, + "loss": 1.0729, + "step": 7132 + }, + { + "epoch": 1.3731501311451741, + "grad_norm": 3.2381152662064507, + "learning_rate": 4.72274189928489e-06, + "loss": 1.0452, + "step": 7133 + }, + { + "epoch": 1.3733426378227493, + "grad_norm": 3.338192506894632, + "learning_rate": 4.720093768346858e-06, + "loss": 1.048, + "step": 7134 + }, + { + "epoch": 1.373535144500325, + "grad_norm": 3.276929344775282, + "learning_rate": 4.717446150683252e-06, + "loss": 1.0199, + "step": 7135 + }, + { + "epoch": 1.3737276511779002, + "grad_norm": 3.3288586200508377, + "learning_rate": 4.714799046551474e-06, + "loss": 1.0117, + "step": 7136 + }, + { + "epoch": 1.3737276511779002, + "lm_loss": 0.9793, + "step": 7136, + "vm_loss": 0.1965 + }, + { + "epoch": 1.3737276511779002, + "lm_loss": 0.807, + "step": 7136, + "vm_loss": 0.1381 + }, + { + "epoch": 1.3737276511779002, + "lm_loss": 0.9508, + "step": 7136, + "vm_loss": 0.1881 + }, + { + "epoch": 1.3737276511779002, + "lm_loss": 0.8771, + "step": 7136, + "vm_loss": 0.1303 + }, + { + "epoch": 1.3737276511779002, + "lm_loss": 0.7458, + "step": 7136, + "vm_loss": 0.1341 + }, + { + "epoch": 1.3737276511779002, + "lm_loss": 0.6409, + "step": 7136, + "vm_loss": 0.1327 + }, + { + "epoch": 1.3737276511779002, + "lm_loss": 0.735, + "step": 7136, + "vm_loss": 0.1757 + }, + { + "epoch": 1.3737276511779002, + "lm_loss": 0.5206, + "step": 7136, + "vm_loss": 0.1491 + }, + { + "epoch": 1.3739201578554756, + "grad_norm": 3.1478304942148974, + "learning_rate": 4.712152456208841e-06, + "loss": 1.0212, + "step": 7137 + }, + { + "epoch": 1.374112664533051, + "grad_norm": 3.409246517272823, + "learning_rate": 4.709506379912641e-06, + "loss": 1.1143, + "step": 7138 + }, + { + "epoch": 1.3743051712106262, + "grad_norm": 3.379073869435661, + "learning_rate": 4.706860817920105e-06, + "loss": 1.0876, + "step": 7139 + }, + { + "epoch": 1.3744976778882019, + "grad_norm": 3.294287589027186, + "learning_rate": 4.704215770488422e-06, + "loss": 1.0562, + "step": 7140 + }, + { + "epoch": 1.374690184565777, + "grad_norm": 3.2621472671038028, + "learning_rate": 4.701571237874714e-06, + "loss": 1.0322, + "step": 7141 + }, + { + "epoch": 1.3748826912433525, + "grad_norm": 3.3430024728994443, + "learning_rate": 4.698927220336068e-06, + "loss": 1.0675, + "step": 7142 + }, + { + "epoch": 1.375075197920928, + "grad_norm": 3.2293080959707057, + "learning_rate": 4.696283718129517e-06, + "loss": 1.0128, + "step": 7143 + }, + { + "epoch": 1.3752677045985033, + "grad_norm": 3.280628068137271, + "learning_rate": 4.6936407315120465e-06, + "loss": 1.0512, + "step": 7144 + }, + { + "epoch": 1.3752677045985033, + "lm_loss": 0.4082, + "step": 7144, + "vm_loss": 0.1534 + }, + { + "epoch": 1.3752677045985033, + "lm_loss": 1.0373, + "step": 7144, + "vm_loss": 0.156 + }, + { + "epoch": 1.3752677045985033, + "lm_loss": 0.9889, + "step": 7144, + "vm_loss": 0.1268 + }, + { + "epoch": 1.3752677045985033, + "lm_loss": 0.8766, + "step": 7144, + "vm_loss": 0.1413 + }, + { + "epoch": 1.3752677045985033, + "lm_loss": 1.0124, + "step": 7144, + "vm_loss": 0.0992 + }, + { + "epoch": 1.3752677045985033, + "lm_loss": 0.6896, + "step": 7144, + "vm_loss": 0.1422 + }, + { + "epoch": 1.3752677045985033, + "lm_loss": 0.8161, + "step": 7144, + "vm_loss": 0.1724 + }, + { + "epoch": 1.3752677045985033, + "lm_loss": 1.1946, + "step": 7144, + "vm_loss": 0.1292 + }, + { + "epoch": 1.3754602112760788, + "grad_norm": 3.249676009271436, + "learning_rate": 4.690998260740579e-06, + "loss": 0.9981, + "step": 7145 + }, + { + "epoch": 1.375652717953654, + "grad_norm": 3.2617896197347966, + "learning_rate": 4.688356306072009e-06, + "loss": 1.0517, + "step": 7146 + }, + { + "epoch": 1.3758452246312294, + "grad_norm": 3.079311594261293, + "learning_rate": 4.685714867763158e-06, + "loss": 0.9999, + "step": 7147 + }, + { + "epoch": 1.3760377313088048, + "grad_norm": 3.126671100611527, + "learning_rate": 4.683073946070812e-06, + "loss": 0.9622, + "step": 7148 + }, + { + "epoch": 1.3762302379863802, + "grad_norm": 3.2983521018026587, + "learning_rate": 4.680433541251701e-06, + "loss": 1.0557, + "step": 7149 + }, + { + "epoch": 1.3764227446639556, + "grad_norm": 3.324486552324521, + "learning_rate": 4.677793653562507e-06, + "loss": 1.13, + "step": 7150 + }, + { + "epoch": 1.3766152513415308, + "grad_norm": 3.225894736096773, + "learning_rate": 4.6751542832598655e-06, + "loss": 1.0358, + "step": 7151 + }, + { + "epoch": 1.3768077580191063, + "grad_norm": 3.376397167759517, + "learning_rate": 4.672515430600347e-06, + "loss": 1.1284, + "step": 7152 + }, + { + "epoch": 1.3768077580191063, + "lm_loss": 0.7219, + "step": 7152, + "vm_loss": 0.1808 + }, + { + "epoch": 1.3768077580191063, + "lm_loss": 0.7553, + "step": 7152, + "vm_loss": 0.2024 + }, + { + "epoch": 1.3768077580191063, + "lm_loss": 0.8683, + "step": 7152, + "vm_loss": 0.1417 + }, + { + "epoch": 1.3768077580191063, + "lm_loss": 1.2722, + "step": 7152, + "vm_loss": 0.1547 + }, + { + "epoch": 1.3768077580191063, + "lm_loss": 0.3439, + "step": 7152, + "vm_loss": 0.1564 + }, + { + "epoch": 1.3768077580191063, + "lm_loss": 0.7411, + "step": 7152, + "vm_loss": 0.1675 + }, + { + "epoch": 1.3768077580191063, + "lm_loss": 1.1973, + "step": 7152, + "vm_loss": 0.1743 + }, + { + "epoch": 1.3768077580191063, + "lm_loss": 0.9364, + "step": 7152, + "vm_loss": 0.1634 + }, + { + "epoch": 1.3770002646966817, + "grad_norm": 3.184742649793817, + "learning_rate": 4.669877095840488e-06, + "loss": 1.049, + "step": 7153 + }, + { + "epoch": 1.377192771374257, + "grad_norm": 3.236740749731261, + "learning_rate": 4.667239279236768e-06, + "loss": 1.0181, + "step": 7154 + }, + { + "epoch": 1.3773852780518325, + "grad_norm": 3.2962556154139815, + "learning_rate": 4.664601981045616e-06, + "loss": 1.0817, + "step": 7155 + }, + { + "epoch": 1.3775777847294077, + "grad_norm": 3.2590386163660128, + "learning_rate": 4.6619652015234095e-06, + "loss": 1.0095, + "step": 7156 + }, + { + "epoch": 1.3777702914069831, + "grad_norm": 3.199013012293532, + "learning_rate": 4.659328940926484e-06, + "loss": 1.0261, + "step": 7157 + }, + { + "epoch": 1.3779627980845586, + "grad_norm": 3.3087070396825284, + "learning_rate": 4.656693199511108e-06, + "loss": 1.0567, + "step": 7158 + }, + { + "epoch": 1.378155304762134, + "grad_norm": 3.068071030562416, + "learning_rate": 4.654057977533515e-06, + "loss": 1.0008, + "step": 7159 + }, + { + "epoch": 1.3783478114397094, + "grad_norm": 3.0796367185420768, + "learning_rate": 4.65142327524988e-06, + "loss": 0.989, + "step": 7160 + }, + { + "epoch": 1.3783478114397094, + "lm_loss": 0.844, + "step": 7160, + "vm_loss": 0.1772 + }, + { + "epoch": 1.3783478114397094, + "lm_loss": 0.8576, + "step": 7160, + "vm_loss": 0.1789 + }, + { + "epoch": 1.3783478114397094, + "lm_loss": 1.0579, + "step": 7160, + "vm_loss": 0.1265 + }, + { + "epoch": 1.3783478114397094, + "lm_loss": 0.7478, + "step": 7160, + "vm_loss": 0.1344 + }, + { + "epoch": 1.3783478114397094, + "lm_loss": 0.8814, + "step": 7160, + "vm_loss": 0.1363 + }, + { + "epoch": 1.3783478114397094, + "lm_loss": 0.5713, + "step": 7160, + "vm_loss": 0.116 + }, + { + "epoch": 1.3783478114397094, + "lm_loss": 0.8367, + "step": 7160, + "vm_loss": 0.1518 + }, + { + "epoch": 1.3783478114397094, + "lm_loss": 0.9311, + "step": 7160, + "vm_loss": 0.1173 + }, + { + "epoch": 1.3785403181172846, + "grad_norm": 3.260286882713741, + "learning_rate": 4.648789092916335e-06, + "loss": 0.9486, + "step": 7161 + }, + { + "epoch": 1.37873282479486, + "grad_norm": 3.397740795172978, + "learning_rate": 4.646155430788944e-06, + "loss": 1.064, + "step": 7162 + }, + { + "epoch": 1.3789253314724355, + "grad_norm": 3.2767185316166674, + "learning_rate": 4.643522289123749e-06, + "loss": 1.024, + "step": 7163 + }, + { + "epoch": 1.3791178381500109, + "grad_norm": 3.2308683721170817, + "learning_rate": 4.640889668176713e-06, + "loss": 0.99, + "step": 7164 + }, + { + "epoch": 1.3793103448275863, + "grad_norm": 3.35784136839363, + "learning_rate": 4.6382575682037685e-06, + "loss": 1.0311, + "step": 7165 + }, + { + "epoch": 1.3795028515051615, + "grad_norm": 3.454245029992402, + "learning_rate": 4.635625989460778e-06, + "loss": 1.0897, + "step": 7166 + }, + { + "epoch": 1.379695358182737, + "grad_norm": 3.281500622963091, + "learning_rate": 4.632994932203579e-06, + "loss": 1.0475, + "step": 7167 + }, + { + "epoch": 1.3798878648603123, + "grad_norm": 3.402296868399998, + "learning_rate": 4.630364396687932e-06, + "loss": 1.0621, + "step": 7168 + }, + { + "epoch": 1.3798878648603123, + "lm_loss": 0.826, + "step": 7168, + "vm_loss": 0.1767 + }, + { + "epoch": 1.3798878648603123, + "lm_loss": 0.6723, + "step": 7168, + "vm_loss": 0.1929 + }, + { + "epoch": 1.3798878648603123, + "lm_loss": 1.1915, + "step": 7168, + "vm_loss": 0.2075 + }, + { + "epoch": 1.3798878648603123, + "lm_loss": 0.9041, + "step": 7168, + "vm_loss": 0.1205 + }, + { + "epoch": 1.3798878648603123, + "lm_loss": 1.0895, + "step": 7168, + "vm_loss": 0.1783 + }, + { + "epoch": 1.3798878648603123, + "lm_loss": 0.9383, + "step": 7168, + "vm_loss": 0.1723 + }, + { + "epoch": 1.3798878648603123, + "lm_loss": 0.9183, + "step": 7168, + "vm_loss": 0.1617 + }, + { + "epoch": 1.3798878648603123, + "lm_loss": 1.2286, + "step": 7168, + "vm_loss": 0.0968 + }, + { + "epoch": 1.3800803715378878, + "grad_norm": 3.0971321077542577, + "learning_rate": 4.627734383169563e-06, + "loss": 0.9602, + "step": 7169 + }, + { + "epoch": 1.3802728782154632, + "grad_norm": 3.3412425059318887, + "learning_rate": 4.625104891904144e-06, + "loss": 1.0162, + "step": 7170 + }, + { + "epoch": 1.3804653848930384, + "grad_norm": 3.2311206880194283, + "learning_rate": 4.622475923147295e-06, + "loss": 1.0286, + "step": 7171 + }, + { + "epoch": 1.3806578915706138, + "grad_norm": 3.303908705157134, + "learning_rate": 4.619847477154584e-06, + "loss": 1.0514, + "step": 7172 + }, + { + "epoch": 1.3808503982481892, + "grad_norm": 3.1139753400666725, + "learning_rate": 4.617219554181531e-06, + "loss": 0.987, + "step": 7173 + }, + { + "epoch": 1.3810429049257646, + "grad_norm": 3.3128177591935604, + "learning_rate": 4.614592154483607e-06, + "loss": 1.0418, + "step": 7174 + }, + { + "epoch": 1.38123541160334, + "grad_norm": 3.2468533366440444, + "learning_rate": 4.611965278316221e-06, + "loss": 1.05, + "step": 7175 + }, + { + "epoch": 1.3814279182809153, + "grad_norm": 3.2502978168802112, + "learning_rate": 4.609338925934743e-06, + "loss": 1.0678, + "step": 7176 + }, + { + "epoch": 1.3814279182809153, + "lm_loss": 1.1664, + "step": 7176, + "vm_loss": 0.2062 + }, + { + "epoch": 1.3814279182809153, + "lm_loss": 0.9134, + "step": 7176, + "vm_loss": 0.1438 + }, + { + "epoch": 1.3814279182809153, + "lm_loss": 1.6322, + "step": 7176, + "vm_loss": 0.1567 + }, + { + "epoch": 1.3814279182809153, + "lm_loss": 0.8187, + "step": 7176, + "vm_loss": 0.1269 + }, + { + "epoch": 1.3814279182809153, + "lm_loss": 0.741, + "step": 7176, + "vm_loss": 0.1233 + }, + { + "epoch": 1.3814279182809153, + "lm_loss": 1.3134, + "step": 7176, + "vm_loss": 0.1556 + }, + { + "epoch": 1.3814279182809153, + "lm_loss": 0.9905, + "step": 7176, + "vm_loss": 0.2035 + }, + { + "epoch": 1.3814279182809153, + "lm_loss": 0.8516, + "step": 7176, + "vm_loss": 0.1447 + }, + { + "epoch": 1.3816204249584907, + "grad_norm": 3.39920893618975, + "learning_rate": 4.606713097594488e-06, + "loss": 1.0945, + "step": 7177 + }, + { + "epoch": 1.381812931636066, + "grad_norm": 3.1672113566580884, + "learning_rate": 4.6040877935507245e-06, + "loss": 0.9937, + "step": 7178 + }, + { + "epoch": 1.3820054383136415, + "grad_norm": 3.2503382510793957, + "learning_rate": 4.6014630140586534e-06, + "loss": 1.0246, + "step": 7179 + }, + { + "epoch": 1.382197944991217, + "grad_norm": 3.216884506545426, + "learning_rate": 4.598838759373453e-06, + "loss": 0.9988, + "step": 7180 + }, + { + "epoch": 1.3823904516687922, + "grad_norm": 3.3271411537072213, + "learning_rate": 4.596215029750222e-06, + "loss": 1.0792, + "step": 7181 + }, + { + "epoch": 1.3825829583463676, + "grad_norm": 3.2775934137100413, + "learning_rate": 4.593591825444028e-06, + "loss": 1.0196, + "step": 7182 + }, + { + "epoch": 1.382775465023943, + "grad_norm": 3.128423763187306, + "learning_rate": 4.59096914670987e-06, + "loss": 0.9343, + "step": 7183 + }, + { + "epoch": 1.3829679717015184, + "grad_norm": 3.2861258216949993, + "learning_rate": 4.588346993802719e-06, + "loss": 1.0357, + "step": 7184 + }, + { + "epoch": 1.3829679717015184, + "lm_loss": 0.7793, + "step": 7184, + "vm_loss": 0.2527 + }, + { + "epoch": 1.3829679717015184, + "lm_loss": 0.8046, + "step": 7184, + "vm_loss": 0.2241 + }, + { + "epoch": 1.3829679717015184, + "lm_loss": 1.1453, + "step": 7184, + "vm_loss": 0.1605 + }, + { + "epoch": 1.3829679717015184, + "lm_loss": 0.3852, + "step": 7184, + "vm_loss": 0.1237 + }, + { + "epoch": 1.3829679717015184, + "lm_loss": 0.875, + "step": 7184, + "vm_loss": 0.1397 + }, + { + "epoch": 1.3829679717015184, + "lm_loss": 0.6063, + "step": 7184, + "vm_loss": 0.1571 + }, + { + "epoch": 1.3829679717015184, + "lm_loss": 0.7424, + "step": 7184, + "vm_loss": 0.1461 + }, + { + "epoch": 1.3829679717015184, + "lm_loss": 0.6702, + "step": 7184, + "vm_loss": 0.1399 + }, + { + "epoch": 1.3831604783790938, + "grad_norm": 3.3159232427173193, + "learning_rate": 4.5857253669774725e-06, + "loss": 1.0276, + "step": 7185 + }, + { + "epoch": 1.383352985056669, + "grad_norm": 3.171683064633105, + "learning_rate": 4.583104266488989e-06, + "loss": 0.9991, + "step": 7186 + }, + { + "epoch": 1.3835454917342445, + "grad_norm": 3.307096398905604, + "learning_rate": 4.580483692592074e-06, + "loss": 1.0719, + "step": 7187 + }, + { + "epoch": 1.3837379984118199, + "grad_norm": 3.199174668208879, + "learning_rate": 4.577863645541483e-06, + "loss": 1.0335, + "step": 7188 + }, + { + "epoch": 1.3839305050893953, + "grad_norm": 3.301151267278536, + "learning_rate": 4.575244125591906e-06, + "loss": 1.019, + "step": 7189 + }, + { + "epoch": 1.3841230117669707, + "grad_norm": 3.3132592548908075, + "learning_rate": 4.572625132998013e-06, + "loss": 1.0205, + "step": 7190 + }, + { + "epoch": 1.384315518444546, + "grad_norm": 3.294109865959035, + "learning_rate": 4.570006668014388e-06, + "loss": 1.0324, + "step": 7191 + }, + { + "epoch": 1.3845080251221213, + "grad_norm": 3.2962832572059053, + "learning_rate": 4.567388730895586e-06, + "loss": 1.0754, + "step": 7192 + }, + { + "epoch": 1.3845080251221213, + "lm_loss": 1.0252, + "step": 7192, + "vm_loss": 0.1505 + }, + { + "epoch": 1.3845080251221213, + "lm_loss": 0.9189, + "step": 7192, + "vm_loss": 0.1608 + }, + { + "epoch": 1.3845080251221213, + "lm_loss": 0.5341, + "step": 7192, + "vm_loss": 0.2006 + }, + { + "epoch": 1.3845080251221213, + "lm_loss": 1.2423, + "step": 7192, + "vm_loss": 0.1898 + }, + { + "epoch": 1.3845080251221213, + "lm_loss": 1.3822, + "step": 7192, + "vm_loss": 0.1491 + }, + { + "epoch": 1.3845080251221213, + "lm_loss": 1.5858, + "step": 7192, + "vm_loss": 0.1848 + }, + { + "epoch": 1.3845080251221213, + "lm_loss": 0.5449, + "step": 7192, + "vm_loss": 0.1499 + }, + { + "epoch": 1.3845080251221213, + "lm_loss": 0.9615, + "step": 7192, + "vm_loss": 0.1344 + }, + { + "epoch": 1.3847005317996968, + "grad_norm": 3.2869732233550604, + "learning_rate": 4.564771321896102e-06, + "loss": 1.0617, + "step": 7193 + }, + { + "epoch": 1.3848930384772722, + "grad_norm": 3.1752680099543724, + "learning_rate": 4.562154441270383e-06, + "loss": 1.025, + "step": 7194 + }, + { + "epoch": 1.3850855451548476, + "grad_norm": 3.9682637690835856, + "learning_rate": 4.559538089272825e-06, + "loss": 1.1528, + "step": 7195 + }, + { + "epoch": 1.3852780518324228, + "grad_norm": 3.3920053084676827, + "learning_rate": 4.556922266157762e-06, + "loss": 1.0521, + "step": 7196 + }, + { + "epoch": 1.3854705585099985, + "grad_norm": 3.1967024525321968, + "learning_rate": 4.5543069721795e-06, + "loss": 1.0219, + "step": 7197 + }, + { + "epoch": 1.3856630651875737, + "grad_norm": 3.427403027422847, + "learning_rate": 4.551692207592265e-06, + "loss": 1.0914, + "step": 7198 + }, + { + "epoch": 1.385855571865149, + "grad_norm": 3.147041901278296, + "learning_rate": 4.549077972650257e-06, + "loss": 0.9766, + "step": 7199 + }, + { + "epoch": 1.3860480785427245, + "grad_norm": 3.193079541884317, + "learning_rate": 4.546464267607601e-06, + "loss": 1.0413, + "step": 7200 + }, + { + "epoch": 1.3860480785427245, + "lm_loss": 0.5371, + "step": 7200, + "vm_loss": 0.2162 + }, + { + "epoch": 1.3860480785427245, + "lm_loss": 0.9204, + "step": 7200, + "vm_loss": 0.1667 + }, + { + "epoch": 1.3860480785427245, + "lm_loss": 0.8246, + "step": 7200, + "vm_loss": 0.2098 + }, + { + "epoch": 1.3860480785427245, + "lm_loss": 0.8937, + "step": 7200, + "vm_loss": 0.1334 + }, + { + "epoch": 1.3860480785427245, + "lm_loss": 1.0023, + "step": 7200, + "vm_loss": 0.1673 + }, + { + "epoch": 1.3860480785427245, + "lm_loss": 0.6684, + "step": 7200, + "vm_loss": 0.194 + }, + { + "epoch": 1.3860480785427245, + "lm_loss": 0.8016, + "step": 7200, + "vm_loss": 0.107 + }, + { + "epoch": 1.3860480785427245, + "lm_loss": 0.7231, + "step": 7200, + "vm_loss": 0.1863 + }, + { + "epoch": 1.3862405852202997, + "grad_norm": 3.396665186516052, + "learning_rate": 4.5438510927183964e-06, + "loss": 1.0713, + "step": 7201 + }, + { + "epoch": 1.3864330918978753, + "grad_norm": 3.27578743961322, + "learning_rate": 4.5412384482366665e-06, + "loss": 1.0203, + "step": 7202 + }, + { + "epoch": 1.3866255985754505, + "grad_norm": 3.359348201211265, + "learning_rate": 4.538626334416396e-06, + "loss": 1.0193, + "step": 7203 + }, + { + "epoch": 1.386818105253026, + "grad_norm": 3.2376379344104627, + "learning_rate": 4.536014751511518e-06, + "loss": 0.9915, + "step": 7204 + }, + { + "epoch": 1.3870106119306014, + "grad_norm": 3.23029642648798, + "learning_rate": 4.533403699775916e-06, + "loss": 1.0353, + "step": 7205 + }, + { + "epoch": 1.3872031186081768, + "grad_norm": 3.383302059490082, + "learning_rate": 4.530793179463407e-06, + "loss": 1.1001, + "step": 7206 + }, + { + "epoch": 1.3873956252857522, + "grad_norm": 3.371371020774235, + "learning_rate": 4.528183190827773e-06, + "loss": 1.0552, + "step": 7207 + }, + { + "epoch": 1.3875881319633274, + "grad_norm": 3.285004809659229, + "learning_rate": 4.525573734122738e-06, + "loss": 1.0414, + "step": 7208 + }, + { + "epoch": 1.3875881319633274, + "lm_loss": 0.7833, + "step": 7208, + "vm_loss": 0.1807 + }, + { + "epoch": 1.3875881319633274, + "lm_loss": 1.4645, + "step": 7208, + "vm_loss": 0.1446 + }, + { + "epoch": 1.3875881319633274, + "lm_loss": 0.9078, + "step": 7208, + "vm_loss": 0.1724 + }, + { + "epoch": 1.3875881319633274, + "lm_loss": 0.81, + "step": 7208, + "vm_loss": 0.2249 + }, + { + "epoch": 1.3875881319633274, + "lm_loss": 0.6973, + "step": 7208, + "vm_loss": 0.1416 + }, + { + "epoch": 1.3875881319633274, + "lm_loss": 1.0551, + "step": 7208, + "vm_loss": 0.1476 + }, + { + "epoch": 1.3875881319633274, + "lm_loss": 0.5329, + "step": 7208, + "vm_loss": 0.1648 + }, + { + "epoch": 1.3875881319633274, + "lm_loss": 0.6353, + "step": 7208, + "vm_loss": 0.1335 + }, + { + "epoch": 1.3877806386409028, + "grad_norm": 3.29047756916702, + "learning_rate": 4.522964809601978e-06, + "loss": 1.0515, + "step": 7209 + }, + { + "epoch": 1.3879731453184783, + "grad_norm": 3.327147942865889, + "learning_rate": 4.520356417519104e-06, + "loss": 1.0796, + "step": 7210 + }, + { + "epoch": 1.3881656519960537, + "grad_norm": 3.4521941405127112, + "learning_rate": 4.517748558127697e-06, + "loss": 1.0523, + "step": 7211 + }, + { + "epoch": 1.388358158673629, + "grad_norm": 3.3160959637144964, + "learning_rate": 4.5151412316812656e-06, + "loss": 1.0242, + "step": 7212 + }, + { + "epoch": 1.3885506653512043, + "grad_norm": 3.1856021908245724, + "learning_rate": 4.512534438433278e-06, + "loss": 1.0152, + "step": 7213 + }, + { + "epoch": 1.3887431720287797, + "grad_norm": 3.2348098983557727, + "learning_rate": 4.509928178637149e-06, + "loss": 1.0597, + "step": 7214 + }, + { + "epoch": 1.3889356787063551, + "grad_norm": 3.3849927104429693, + "learning_rate": 4.5073224525462386e-06, + "loss": 1.1143, + "step": 7215 + }, + { + "epoch": 1.3891281853839306, + "grad_norm": 3.2040462044368385, + "learning_rate": 4.504717260413863e-06, + "loss": 1.0179, + "step": 7216 + }, + { + "epoch": 1.3891281853839306, + "lm_loss": 0.9291, + "step": 7216, + "vm_loss": 0.1834 + }, + { + "epoch": 1.3891281853839306, + "lm_loss": 0.6274, + "step": 7216, + "vm_loss": 0.0889 + }, + { + "epoch": 1.3891281853839306, + "lm_loss": 0.9705, + "step": 7216, + "vm_loss": 0.2015 + }, + { + "epoch": 1.3891281853839306, + "lm_loss": 0.902, + "step": 7216, + "vm_loss": 0.1286 + }, + { + "epoch": 1.3891281853839306, + "lm_loss": 0.6908, + "step": 7216, + "vm_loss": 0.137 + }, + { + "epoch": 1.3891281853839306, + "lm_loss": 0.9793, + "step": 7216, + "vm_loss": 0.1884 + }, + { + "epoch": 1.3891281853839306, + "lm_loss": 1.2459, + "step": 7216, + "vm_loss": 0.2207 + }, + { + "epoch": 1.3891281853839306, + "lm_loss": 1.0427, + "step": 7216, + "vm_loss": 0.1514 + }, + { + "epoch": 1.389320692061506, + "grad_norm": 3.141025219087616, + "learning_rate": 4.502112602493266e-06, + "loss": 0.9903, + "step": 7217 + }, + { + "epoch": 1.3895131987390812, + "grad_norm": 3.2037202653091104, + "learning_rate": 4.49950847903767e-06, + "loss": 1.014, + "step": 7218 + }, + { + "epoch": 1.3897057054166566, + "grad_norm": 3.318574421679347, + "learning_rate": 4.496904890300219e-06, + "loss": 1.0174, + "step": 7219 + }, + { + "epoch": 1.389898212094232, + "grad_norm": 3.235564291228873, + "learning_rate": 4.494301836534016e-06, + "loss": 1.0587, + "step": 7220 + }, + { + "epoch": 1.3900907187718075, + "grad_norm": 3.2361534504003164, + "learning_rate": 4.491699317992114e-06, + "loss": 1.0199, + "step": 7221 + }, + { + "epoch": 1.3902832254493829, + "grad_norm": 3.4226133330254522, + "learning_rate": 4.489097334927514e-06, + "loss": 1.1493, + "step": 7222 + }, + { + "epoch": 1.390475732126958, + "grad_norm": 3.3597589452218757, + "learning_rate": 4.4864958875931545e-06, + "loss": 1.0947, + "step": 7223 + }, + { + "epoch": 1.3906682388045335, + "grad_norm": 3.219695591833373, + "learning_rate": 4.483894976241933e-06, + "loss": 1.0185, + "step": 7224 + }, + { + "epoch": 1.3906682388045335, + "lm_loss": 1.2147, + "step": 7224, + "vm_loss": 0.202 + }, + { + "epoch": 1.3906682388045335, + "lm_loss": 1.1027, + "step": 7224, + "vm_loss": 0.1533 + }, + { + "epoch": 1.3906682388045335, + "lm_loss": 1.3044, + "step": 7224, + "vm_loss": 0.1623 + }, + { + "epoch": 1.3906682388045335, + "lm_loss": 1.2123, + "step": 7224, + "vm_loss": 0.19 + }, + { + "epoch": 1.3906682388045335, + "lm_loss": 0.8591, + "step": 7224, + "vm_loss": 0.2573 + }, + { + "epoch": 1.3906682388045335, + "lm_loss": 0.8682, + "step": 7224, + "vm_loss": 0.1794 + }, + { + "epoch": 1.3906682388045335, + "lm_loss": 0.5287, + "step": 7224, + "vm_loss": 0.1876 + }, + { + "epoch": 1.3906682388045335, + "lm_loss": 1.0067, + "step": 7224, + "vm_loss": 0.1249 + }, + { + "epoch": 1.390860745482109, + "grad_norm": 3.2375185692310846, + "learning_rate": 4.48129460112669e-06, + "loss": 1.0076, + "step": 7225 + }, + { + "epoch": 1.3910532521596843, + "grad_norm": 3.411162459282257, + "learning_rate": 4.478694762500222e-06, + "loss": 1.0823, + "step": 7226 + }, + { + "epoch": 1.3912457588372598, + "grad_norm": 3.238380728340277, + "learning_rate": 4.476095460615253e-06, + "loss": 0.9842, + "step": 7227 + }, + { + "epoch": 1.391438265514835, + "grad_norm": 3.252013012438708, + "learning_rate": 4.473496695724484e-06, + "loss": 1.0752, + "step": 7228 + }, + { + "epoch": 1.3916307721924104, + "grad_norm": 3.2129535095421287, + "learning_rate": 4.470898468080538e-06, + "loss": 0.9922, + "step": 7229 + }, + { + "epoch": 1.3918232788699858, + "grad_norm": 3.25915344119369, + "learning_rate": 4.4683007779359975e-06, + "loss": 1.0342, + "step": 7230 + }, + { + "epoch": 1.3920157855475612, + "grad_norm": 3.187012010329501, + "learning_rate": 4.465703625543393e-06, + "loss": 0.9991, + "step": 7231 + }, + { + "epoch": 1.3922082922251366, + "grad_norm": 3.14853267219442, + "learning_rate": 4.463107011155204e-06, + "loss": 0.979, + "step": 7232 + }, + { + "epoch": 1.3922082922251366, + "lm_loss": 1.2003, + "step": 7232, + "vm_loss": 0.1968 + }, + { + "epoch": 1.3922082922251366, + "lm_loss": 0.7273, + "step": 7232, + "vm_loss": 0.172 + }, + { + "epoch": 1.3922082922251366, + "lm_loss": 0.9422, + "step": 7232, + "vm_loss": 0.1333 + }, + { + "epoch": 1.3922082922251366, + "lm_loss": 1.155, + "step": 7232, + "vm_loss": 0.1339 + }, + { + "epoch": 1.3922082922251366, + "lm_loss": 0.8178, + "step": 7232, + "vm_loss": 0.1535 + }, + { + "epoch": 1.3922082922251366, + "lm_loss": 0.8455, + "step": 7232, + "vm_loss": 0.1566 + }, + { + "epoch": 1.3922082922251366, + "lm_loss": 0.6767, + "step": 7232, + "vm_loss": 0.1596 + }, + { + "epoch": 1.3922082922251366, + "lm_loss": 1.0292, + "step": 7232, + "vm_loss": 0.1062 + }, + { + "epoch": 1.3924007989027118, + "grad_norm": 3.137616420048383, + "learning_rate": 4.460510935023848e-06, + "loss": 0.9778, + "step": 7233 + }, + { + "epoch": 1.3925933055802873, + "grad_norm": 3.1467254630452586, + "learning_rate": 4.4579153974016986e-06, + "loss": 1.0096, + "step": 7234 + }, + { + "epoch": 1.3927858122578627, + "grad_norm": 3.0698213665525205, + "learning_rate": 4.455320398541078e-06, + "loss": 0.9647, + "step": 7235 + }, + { + "epoch": 1.392978318935438, + "grad_norm": 3.3548613982649824, + "learning_rate": 4.4527259386942515e-06, + "loss": 0.9872, + "step": 7236 + }, + { + "epoch": 1.3931708256130135, + "grad_norm": 3.439315685990862, + "learning_rate": 4.450132018113434e-06, + "loss": 1.0785, + "step": 7237 + }, + { + "epoch": 1.3933633322905887, + "grad_norm": 3.220278425434203, + "learning_rate": 4.447538637050787e-06, + "loss": 0.9896, + "step": 7238 + }, + { + "epoch": 1.3935558389681642, + "grad_norm": 3.3971823209419956, + "learning_rate": 4.4449457957584255e-06, + "loss": 1.0137, + "step": 7239 + }, + { + "epoch": 1.3937483456457396, + "grad_norm": 3.4165500885908857, + "learning_rate": 4.4423534944883984e-06, + "loss": 1.0091, + "step": 7240 + }, + { + "epoch": 1.3937483456457396, + "lm_loss": 0.6314, + "step": 7240, + "vm_loss": 0.1298 + }, + { + "epoch": 1.3937483456457396, + "lm_loss": 0.9667, + "step": 7240, + "vm_loss": 0.1312 + }, + { + "epoch": 1.3937483456457396, + "lm_loss": 1.0063, + "step": 7240, + "vm_loss": 0.1933 + }, + { + "epoch": 1.3937483456457396, + "lm_loss": 0.7289, + "step": 7240, + "vm_loss": 0.1287 + }, + { + "epoch": 1.3937483456457396, + "lm_loss": 0.5011, + "step": 7240, + "vm_loss": 0.1464 + }, + { + "epoch": 1.3937483456457396, + "lm_loss": 1.1109, + "step": 7240, + "vm_loss": 0.1309 + }, + { + "epoch": 1.3937483456457396, + "lm_loss": 0.7889, + "step": 7240, + "vm_loss": 0.12 + }, + { + "epoch": 1.3937483456457396, + "lm_loss": 0.9704, + "step": 7240, + "vm_loss": 0.1903 + }, + { + "epoch": 1.393940852323315, + "grad_norm": 3.448420545946613, + "learning_rate": 4.439761733492716e-06, + "loss": 1.0251, + "step": 7241 + }, + { + "epoch": 1.3941333590008904, + "grad_norm": 3.1351231087001294, + "learning_rate": 4.437170513023328e-06, + "loss": 0.9533, + "step": 7242 + }, + { + "epoch": 1.3943258656784656, + "grad_norm": 3.236364256875787, + "learning_rate": 4.43457983333214e-06, + "loss": 1.0286, + "step": 7243 + }, + { + "epoch": 1.394518372356041, + "grad_norm": 3.3104200425336785, + "learning_rate": 4.4319896946709875e-06, + "loss": 1.0756, + "step": 7244 + }, + { + "epoch": 1.3947108790336165, + "grad_norm": 3.2787601550931895, + "learning_rate": 4.4294000972916785e-06, + "loss": 1.0304, + "step": 7245 + }, + { + "epoch": 1.3949033857111919, + "grad_norm": 3.1918141296807145, + "learning_rate": 4.426811041445946e-06, + "loss": 1.0365, + "step": 7246 + }, + { + "epoch": 1.3950958923887673, + "grad_norm": 3.25795711637226, + "learning_rate": 4.424222527385482e-06, + "loss": 1.0663, + "step": 7247 + }, + { + "epoch": 1.3952883990663425, + "grad_norm": 3.3175391760229354, + "learning_rate": 4.421634555361923e-06, + "loss": 1.0736, + "step": 7248 + }, + { + "epoch": 1.3952883990663425, + "lm_loss": 0.8402, + "step": 7248, + "vm_loss": 0.1309 + }, + { + "epoch": 1.3952883990663425, + "lm_loss": 0.817, + "step": 7248, + "vm_loss": 0.1377 + }, + { + "epoch": 1.3952883990663425, + "lm_loss": 0.6167, + "step": 7248, + "vm_loss": 0.1593 + }, + { + "epoch": 1.3952883990663425, + "lm_loss": 0.8723, + "step": 7248, + "vm_loss": 0.2015 + }, + { + "epoch": 1.3952883990663425, + "lm_loss": 0.5982, + "step": 7248, + "vm_loss": 0.1037 + }, + { + "epoch": 1.3952883990663425, + "lm_loss": 0.6458, + "step": 7248, + "vm_loss": 0.1978 + }, + { + "epoch": 1.3952883990663425, + "lm_loss": 1.2121, + "step": 7248, + "vm_loss": 0.1826 + }, + { + "epoch": 1.3952883990663425, + "lm_loss": 0.822, + "step": 7248, + "vm_loss": 0.1935 + }, + { + "epoch": 1.395480905743918, + "grad_norm": 3.2214795420753637, + "learning_rate": 4.419047125626859e-06, + "loss": 1.0083, + "step": 7249 + }, + { + "epoch": 1.3956734124214933, + "grad_norm": 3.326936723226034, + "learning_rate": 4.416460238431809e-06, + "loss": 1.0895, + "step": 7250 + }, + { + "epoch": 1.3958659190990688, + "grad_norm": 3.0973144320397346, + "learning_rate": 4.41387389402826e-06, + "loss": 0.9925, + "step": 7251 + }, + { + "epoch": 1.3960584257766442, + "grad_norm": 3.159966760526034, + "learning_rate": 4.411288092667636e-06, + "loss": 1.0237, + "step": 7252 + }, + { + "epoch": 1.3962509324542194, + "grad_norm": 3.1974776230684165, + "learning_rate": 4.408702834601309e-06, + "loss": 1.0049, + "step": 7253 + }, + { + "epoch": 1.3964434391317948, + "grad_norm": 3.126783721051538, + "learning_rate": 4.4061181200806e-06, + "loss": 0.9581, + "step": 7254 + }, + { + "epoch": 1.3966359458093702, + "grad_norm": 3.2599328173979245, + "learning_rate": 4.403533949356781e-06, + "loss": 1.0062, + "step": 7255 + }, + { + "epoch": 1.3968284524869456, + "grad_norm": 3.3935221452812177, + "learning_rate": 4.4009503226810576e-06, + "loss": 1.0422, + "step": 7256 + }, + { + "epoch": 1.3968284524869456, + "lm_loss": 1.0591, + "step": 7256, + "vm_loss": 0.1384 + }, + { + "epoch": 1.3968284524869456, + "lm_loss": 0.6725, + "step": 7256, + "vm_loss": 0.1807 + }, + { + "epoch": 1.3968284524869456, + "lm_loss": 0.4944, + "step": 7256, + "vm_loss": 0.126 + }, + { + "epoch": 1.3968284524869456, + "lm_loss": 0.7794, + "step": 7256, + "vm_loss": 0.1908 + }, + { + "epoch": 1.3968284524869456, + "lm_loss": 0.8425, + "step": 7256, + "vm_loss": 0.1685 + }, + { + "epoch": 1.3968284524869456, + "lm_loss": 0.7313, + "step": 7256, + "vm_loss": 0.1419 + }, + { + "epoch": 1.3968284524869456, + "lm_loss": 0.6278, + "step": 7256, + "vm_loss": 0.201 + }, + { + "epoch": 1.3968284524869456, + "lm_loss": 0.9993, + "step": 7256, + "vm_loss": 0.1263 + }, + { + "epoch": 1.397020959164521, + "grad_norm": 3.241265329943556, + "learning_rate": 4.3983672403045954e-06, + "loss": 1.0021, + "step": 7257 + }, + { + "epoch": 1.3972134658420963, + "grad_norm": 3.1852523397834274, + "learning_rate": 4.395784702478504e-06, + "loss": 1.0148, + "step": 7258 + }, + { + "epoch": 1.397405972519672, + "grad_norm": 3.300321343363957, + "learning_rate": 4.393202709453838e-06, + "loss": 1.024, + "step": 7259 + }, + { + "epoch": 1.3975984791972471, + "grad_norm": 3.2757612445628346, + "learning_rate": 4.390621261481603e-06, + "loss": 1.0259, + "step": 7260 + }, + { + "epoch": 1.3977909858748225, + "grad_norm": 3.4123510058924746, + "learning_rate": 4.388040358812741e-06, + "loss": 1.014, + "step": 7261 + }, + { + "epoch": 1.397983492552398, + "grad_norm": 3.1793761199222534, + "learning_rate": 4.38546000169816e-06, + "loss": 0.9836, + "step": 7262 + }, + { + "epoch": 1.3981759992299732, + "grad_norm": 3.2269409992976286, + "learning_rate": 4.3828801903886944e-06, + "loss": 0.9891, + "step": 7263 + }, + { + "epoch": 1.3983685059075488, + "grad_norm": 3.2880405892395603, + "learning_rate": 4.380300925135138e-06, + "loss": 1.0335, + "step": 7264 + }, + { + "epoch": 1.3983685059075488, + "lm_loss": 0.7709, + "step": 7264, + "vm_loss": 0.1326 + }, + { + "epoch": 1.3983685059075488, + "lm_loss": 0.8882, + "step": 7264, + "vm_loss": 0.2081 + }, + { + "epoch": 1.3983685059075488, + "lm_loss": 0.7829, + "step": 7264, + "vm_loss": 0.1472 + }, + { + "epoch": 1.3983685059075488, + "lm_loss": 0.8806, + "step": 7264, + "vm_loss": 0.1974 + }, + { + "epoch": 1.3983685059075488, + "lm_loss": 0.8995, + "step": 7264, + "vm_loss": 0.1501 + }, + { + "epoch": 1.3983685059075488, + "lm_loss": 0.5841, + "step": 7264, + "vm_loss": 0.0822 + }, + { + "epoch": 1.3983685059075488, + "lm_loss": 0.879, + "step": 7264, + "vm_loss": 0.2164 + }, + { + "epoch": 1.3983685059075488, + "lm_loss": 1.0041, + "step": 7264, + "vm_loss": 0.1558 + }, + { + "epoch": 1.398561012585124, + "grad_norm": 3.3126867405250153, + "learning_rate": 4.377722206188229e-06, + "loss": 1.0654, + "step": 7265 + }, + { + "epoch": 1.3987535192626994, + "grad_norm": 3.2267603303595034, + "learning_rate": 4.375144033798655e-06, + "loss": 0.9996, + "step": 7266 + }, + { + "epoch": 1.3989460259402748, + "grad_norm": 3.411733525371951, + "learning_rate": 4.372566408217041e-06, + "loss": 1.0859, + "step": 7267 + }, + { + "epoch": 1.3991385326178503, + "grad_norm": 3.26290067537603, + "learning_rate": 4.369989329693967e-06, + "loss": 1.0456, + "step": 7268 + }, + { + "epoch": 1.3993310392954257, + "grad_norm": 3.3516593555548324, + "learning_rate": 4.367412798479959e-06, + "loss": 1.0432, + "step": 7269 + }, + { + "epoch": 1.3995235459730009, + "grad_norm": 3.223393793471439, + "learning_rate": 4.364836814825493e-06, + "loss": 0.997, + "step": 7270 + }, + { + "epoch": 1.3997160526505763, + "grad_norm": 3.1981313427234417, + "learning_rate": 4.362261378980977e-06, + "loss": 1.0133, + "step": 7271 + }, + { + "epoch": 1.3999085593281517, + "grad_norm": 3.154688550220936, + "learning_rate": 4.359686491196789e-06, + "loss": 1.0019, + "step": 7272 + }, + { + "epoch": 1.3999085593281517, + "lm_loss": 0.9545, + "step": 7272, + "vm_loss": 0.2396 + }, + { + "epoch": 1.3999085593281517, + "lm_loss": 0.7766, + "step": 7272, + "vm_loss": 0.2045 + }, + { + "epoch": 1.3999085593281517, + "lm_loss": 0.8152, + "step": 7272, + "vm_loss": 0.1627 + }, + { + "epoch": 1.3999085593281517, + "lm_loss": 0.6654, + "step": 7272, + "vm_loss": 0.1383 + }, + { + "epoch": 1.3999085593281517, + "lm_loss": 0.6261, + "step": 7272, + "vm_loss": 0.1542 + }, + { + "epoch": 1.3999085593281517, + "lm_loss": 0.9085, + "step": 7272, + "vm_loss": 0.1857 + }, + { + "epoch": 1.3999085593281517, + "lm_loss": 0.3901, + "step": 7272, + "vm_loss": 0.1609 + }, + { + "epoch": 1.3999085593281517, + "lm_loss": 1.1798, + "step": 7272, + "vm_loss": 0.1773 + }, + { + "epoch": 1.4001010660057271, + "grad_norm": 3.329802661399455, + "learning_rate": 4.35711215172323e-06, + "loss": 0.9845, + "step": 7273 + }, + { + "epoch": 1.4002935726833026, + "grad_norm": 3.339628740789547, + "learning_rate": 4.354538360810564e-06, + "loss": 1.0145, + "step": 7274 + }, + { + "epoch": 1.4004860793608778, + "grad_norm": 3.5029633162411025, + "learning_rate": 4.351965118708996e-06, + "loss": 1.0453, + "step": 7275 + }, + { + "epoch": 1.4006785860384532, + "grad_norm": 3.546755808969498, + "learning_rate": 4.349392425668683e-06, + "loss": 1.1019, + "step": 7276 + }, + { + "epoch": 1.4008710927160286, + "grad_norm": 3.305754008367797, + "learning_rate": 4.346820281939713e-06, + "loss": 0.9926, + "step": 7277 + }, + { + "epoch": 1.401063599393604, + "grad_norm": 3.4479443017814604, + "learning_rate": 4.344248687772138e-06, + "loss": 1.0128, + "step": 7278 + }, + { + "epoch": 1.4012561060711795, + "grad_norm": 3.196193341633772, + "learning_rate": 4.341677643415949e-06, + "loss": 0.9909, + "step": 7279 + }, + { + "epoch": 1.4014486127487547, + "grad_norm": 3.163949879077566, + "learning_rate": 4.3391071491210834e-06, + "loss": 0.9313, + "step": 7280 + }, + { + "epoch": 1.4014486127487547, + "lm_loss": 1.121, + "step": 7280, + "vm_loss": 0.2672 + }, + { + "epoch": 1.4014486127487547, + "lm_loss": 1.1644, + "step": 7280, + "vm_loss": 0.1672 + }, + { + "epoch": 1.4014486127487547, + "lm_loss": 1.225, + "step": 7280, + "vm_loss": 0.1143 + }, + { + "epoch": 1.4014486127487547, + "lm_loss": 0.8647, + "step": 7280, + "vm_loss": 0.2452 + }, + { + "epoch": 1.4014486127487547, + "lm_loss": 1.1583, + "step": 7280, + "vm_loss": 0.2238 + }, + { + "epoch": 1.4014486127487547, + "lm_loss": 0.6586, + "step": 7280, + "vm_loss": 0.1912 + }, + { + "epoch": 1.4014486127487547, + "lm_loss": 1.2907, + "step": 7280, + "vm_loss": 0.1825 + }, + { + "epoch": 1.4014486127487547, + "lm_loss": 0.8495, + "step": 7280, + "vm_loss": 0.1408 + }, + { + "epoch": 1.40164111942633, + "grad_norm": 3.182995788297079, + "learning_rate": 4.336537205137431e-06, + "loss": 1.0415, + "step": 7281 + }, + { + "epoch": 1.4018336261039055, + "grad_norm": 3.136485722934549, + "learning_rate": 4.333967811714811e-06, + "loss": 0.9886, + "step": 7282 + }, + { + "epoch": 1.402026132781481, + "grad_norm": 3.2764522996051997, + "learning_rate": 4.3313989691030175e-06, + "loss": 0.9838, + "step": 7283 + }, + { + "epoch": 1.4022186394590563, + "grad_norm": 3.1908097624066807, + "learning_rate": 4.328830677551763e-06, + "loss": 1.0193, + "step": 7284 + }, + { + "epoch": 1.4024111461366315, + "grad_norm": 3.281168306914215, + "learning_rate": 4.3262629373107214e-06, + "loss": 1.081, + "step": 7285 + }, + { + "epoch": 1.402603652814207, + "grad_norm": 3.2179464085052722, + "learning_rate": 4.3236957486295115e-06, + "loss": 1.0098, + "step": 7286 + }, + { + "epoch": 1.4027961594917824, + "grad_norm": 3.2386755498098285, + "learning_rate": 4.321129111757699e-06, + "loss": 1.0356, + "step": 7287 + }, + { + "epoch": 1.4029886661693578, + "grad_norm": 3.31189265735079, + "learning_rate": 4.318563026944784e-06, + "loss": 1.046, + "step": 7288 + }, + { + "epoch": 1.4029886661693578, + "lm_loss": 0.8486, + "step": 7288, + "vm_loss": 0.133 + }, + { + "epoch": 1.4029886661693578, + "lm_loss": 0.7606, + "step": 7288, + "vm_loss": 0.2218 + }, + { + "epoch": 1.4029886661693578, + "lm_loss": 0.573, + "step": 7288, + "vm_loss": 0.1619 + }, + { + "epoch": 1.4029886661693578, + "lm_loss": 1.015, + "step": 7288, + "vm_loss": 0.1628 + }, + { + "epoch": 1.4029886661693578, + "lm_loss": 0.9283, + "step": 7288, + "vm_loss": 0.1541 + }, + { + "epoch": 1.4029886661693578, + "lm_loss": 0.7314, + "step": 7288, + "vm_loss": 0.1671 + }, + { + "epoch": 1.4029886661693578, + "lm_loss": 0.6255, + "step": 7288, + "vm_loss": 0.165 + }, + { + "epoch": 1.4029886661693578, + "lm_loss": 0.7905, + "step": 7288, + "vm_loss": 0.1997 + }, + { + "epoch": 1.4031811728469332, + "grad_norm": 3.335457634076318, + "learning_rate": 4.315997494440236e-06, + "loss": 1.0416, + "step": 7289 + }, + { + "epoch": 1.4033736795245084, + "grad_norm": 3.258655833881981, + "learning_rate": 4.3134325144934484e-06, + "loss": 1.0135, + "step": 7290 + }, + { + "epoch": 1.4035661862020838, + "grad_norm": 3.2833775289960925, + "learning_rate": 4.3108680873537765e-06, + "loss": 1.0548, + "step": 7291 + }, + { + "epoch": 1.4037586928796593, + "grad_norm": 3.2690774628374206, + "learning_rate": 4.308304213270504e-06, + "loss": 1.0575, + "step": 7292 + }, + { + "epoch": 1.4039511995572347, + "grad_norm": 3.0720614054330238, + "learning_rate": 4.3057408924928874e-06, + "loss": 0.9774, + "step": 7293 + }, + { + "epoch": 1.40414370623481, + "grad_norm": 3.320562023041671, + "learning_rate": 4.303178125270104e-06, + "loss": 1.0264, + "step": 7294 + }, + { + "epoch": 1.4043362129123853, + "grad_norm": 3.211414790894887, + "learning_rate": 4.30061591185129e-06, + "loss": 1.0217, + "step": 7295 + }, + { + "epoch": 1.4045287195899607, + "grad_norm": 3.15356726324523, + "learning_rate": 4.298054252485525e-06, + "loss": 0.9973, + "step": 7296 + }, + { + "epoch": 1.4045287195899607, + "lm_loss": 0.7628, + "step": 7296, + "vm_loss": 0.2266 + }, + { + "epoch": 1.4045287195899607, + "lm_loss": 0.8077, + "step": 7296, + "vm_loss": 0.1461 + }, + { + "epoch": 1.4045287195899607, + "lm_loss": 0.7567, + "step": 7296, + "vm_loss": 0.2034 + }, + { + "epoch": 1.4045287195899607, + "lm_loss": 0.8168, + "step": 7296, + "vm_loss": 0.1713 + }, + { + "epoch": 1.4045287195899607, + "lm_loss": 0.8702, + "step": 7296, + "vm_loss": 0.1442 + }, + { + "epoch": 1.4045287195899607, + "lm_loss": 0.9975, + "step": 7296, + "vm_loss": 0.2149 + }, + { + "epoch": 1.4045287195899607, + "lm_loss": 0.8532, + "step": 7296, + "vm_loss": 0.1811 + }, + { + "epoch": 1.4045287195899607, + "lm_loss": 1.1108, + "step": 7296, + "vm_loss": 0.1371 + }, + { + "epoch": 1.4047212262675361, + "grad_norm": 3.170620425114536, + "learning_rate": 4.295493147421836e-06, + "loss": 1.0224, + "step": 7297 + }, + { + "epoch": 1.4049137329451116, + "grad_norm": 3.309635710505961, + "learning_rate": 4.292932596909199e-06, + "loss": 1.0348, + "step": 7298 + }, + { + "epoch": 1.405106239622687, + "grad_norm": 3.2120827317240046, + "learning_rate": 4.2903726011965265e-06, + "loss": 0.9676, + "step": 7299 + }, + { + "epoch": 1.4052987463002622, + "grad_norm": 3.1528441126651416, + "learning_rate": 4.287813160532682e-06, + "loss": 0.9883, + "step": 7300 + }, + { + "epoch": 1.4054912529778376, + "grad_norm": 3.3699919996954524, + "learning_rate": 4.28525427516648e-06, + "loss": 1.06, + "step": 7301 + }, + { + "epoch": 1.405683759655413, + "grad_norm": 3.213543604286197, + "learning_rate": 4.282695945346674e-06, + "loss": 1.0053, + "step": 7302 + }, + { + "epoch": 1.4058762663329885, + "grad_norm": 3.4365591393205195, + "learning_rate": 4.2801381713219695e-06, + "loss": 1.0726, + "step": 7303 + }, + { + "epoch": 1.4060687730105639, + "grad_norm": 3.322222742331046, + "learning_rate": 4.277580953341015e-06, + "loss": 1.0318, + "step": 7304 + }, + { + "epoch": 1.4060687730105639, + "lm_loss": 0.5521, + "step": 7304, + "vm_loss": 0.1859 + }, + { + "epoch": 1.4060687730105639, + "lm_loss": 0.4944, + "step": 7304, + "vm_loss": 0.2401 + }, + { + "epoch": 1.4060687730105639, + "lm_loss": 0.6581, + "step": 7304, + "vm_loss": 0.114 + }, + { + "epoch": 1.4060687730105639, + "lm_loss": 0.8147, + "step": 7304, + "vm_loss": 0.1547 + }, + { + "epoch": 1.4060687730105639, + "lm_loss": 0.8176, + "step": 7304, + "vm_loss": 0.179 + }, + { + "epoch": 1.4060687730105639, + "lm_loss": 1.498, + "step": 7304, + "vm_loss": 0.1797 + }, + { + "epoch": 1.4060687730105639, + "lm_loss": 0.6341, + "step": 7304, + "vm_loss": 0.1062 + }, + { + "epoch": 1.4060687730105639, + "lm_loss": 0.7056, + "step": 7304, + "vm_loss": 0.1837 + }, + { + "epoch": 1.406261279688139, + "grad_norm": 3.1682524016922904, + "learning_rate": 4.2750242916523965e-06, + "loss": 0.9833, + "step": 7305 + }, + { + "epoch": 1.4064537863657145, + "grad_norm": 3.395791082098268, + "learning_rate": 4.272468186504668e-06, + "loss": 1.0588, + "step": 7306 + }, + { + "epoch": 1.40664629304329, + "grad_norm": 3.4898333003568442, + "learning_rate": 4.269912638146305e-06, + "loss": 1.1293, + "step": 7307 + }, + { + "epoch": 1.4068387997208653, + "grad_norm": 3.27212011522272, + "learning_rate": 4.267357646825746e-06, + "loss": 1.0164, + "step": 7308 + }, + { + "epoch": 1.4070313063984408, + "grad_norm": 3.2014180161296957, + "learning_rate": 4.264803212791359e-06, + "loss": 1.0056, + "step": 7309 + }, + { + "epoch": 1.407223813076016, + "grad_norm": 3.1555948868316146, + "learning_rate": 4.262249336291481e-06, + "loss": 1.0083, + "step": 7310 + }, + { + "epoch": 1.4074163197535914, + "grad_norm": 3.243827283181842, + "learning_rate": 4.259696017574371e-06, + "loss": 0.9986, + "step": 7311 + }, + { + "epoch": 1.4076088264311668, + "grad_norm": 3.098891302867036, + "learning_rate": 4.257143256888248e-06, + "loss": 0.9651, + "step": 7312 + }, + { + "epoch": 1.4076088264311668, + "lm_loss": 0.4941, + "step": 7312, + "vm_loss": 0.1604 + }, + { + "epoch": 1.4076088264311668, + "lm_loss": 0.9744, + "step": 7312, + "vm_loss": 0.1248 + }, + { + "epoch": 1.4076088264311668, + "lm_loss": 0.9488, + "step": 7312, + "vm_loss": 0.1756 + }, + { + "epoch": 1.4076088264311668, + "lm_loss": 0.9772, + "step": 7312, + "vm_loss": 0.1686 + }, + { + "epoch": 1.4076088264311668, + "lm_loss": 1.1317, + "step": 7312, + "vm_loss": 0.1047 + }, + { + "epoch": 1.4076088264311668, + "lm_loss": 0.9345, + "step": 7312, + "vm_loss": 0.1686 + }, + { + "epoch": 1.4076088264311668, + "lm_loss": 1.1215, + "step": 7312, + "vm_loss": 0.1017 + }, + { + "epoch": 1.4076088264311668, + "lm_loss": 0.8664, + "step": 7312, + "vm_loss": 0.1617 + }, + { + "epoch": 1.4078013331087422, + "grad_norm": 3.23687498459985, + "learning_rate": 4.254591054481274e-06, + "loss": 1.0063, + "step": 7313 + }, + { + "epoch": 1.4079938397863176, + "grad_norm": 3.2610700527557173, + "learning_rate": 4.2520394106015574e-06, + "loss": 1.0057, + "step": 7314 + }, + { + "epoch": 1.4081863464638928, + "grad_norm": 3.3554675937861194, + "learning_rate": 4.2494883254971444e-06, + "loss": 1.0421, + "step": 7315 + }, + { + "epoch": 1.4083788531414683, + "grad_norm": 3.393335455574113, + "learning_rate": 4.246937799416037e-06, + "loss": 0.9989, + "step": 7316 + }, + { + "epoch": 1.4085713598190437, + "grad_norm": 3.379727730482624, + "learning_rate": 4.244387832606179e-06, + "loss": 1.0274, + "step": 7317 + }, + { + "epoch": 1.408763866496619, + "grad_norm": 3.3099376666367046, + "learning_rate": 4.2418384253154585e-06, + "loss": 0.9735, + "step": 7318 + }, + { + "epoch": 1.4089563731741945, + "grad_norm": 3.436342655011878, + "learning_rate": 4.239289577791712e-06, + "loss": 1.0782, + "step": 7319 + }, + { + "epoch": 1.4091488798517697, + "grad_norm": 3.2924206949159482, + "learning_rate": 4.2367412902827185e-06, + "loss": 1.0684, + "step": 7320 + }, + { + "epoch": 1.4091488798517697, + "lm_loss": 0.836, + "step": 7320, + "vm_loss": 0.1112 + }, + { + "epoch": 1.4091488798517697, + "lm_loss": 0.4804, + "step": 7320, + "vm_loss": 0.1989 + }, + { + "epoch": 1.4091488798517697, + "lm_loss": 1.0266, + "step": 7320, + "vm_loss": 0.1431 + }, + { + "epoch": 1.4091488798517697, + "lm_loss": 0.7001, + "step": 7320, + "vm_loss": 0.1504 + }, + { + "epoch": 1.4091488798517697, + "lm_loss": 0.5321, + "step": 7320, + "vm_loss": 0.1382 + }, + { + "epoch": 1.4091488798517697, + "lm_loss": 0.9073, + "step": 7320, + "vm_loss": 0.1679 + }, + { + "epoch": 1.4091488798517697, + "lm_loss": 0.7403, + "step": 7320, + "vm_loss": 0.1569 + }, + { + "epoch": 1.4091488798517697, + "lm_loss": 0.4705, + "step": 7320, + "vm_loss": 0.1796 + }, + { + "epoch": 1.4093413865293454, + "grad_norm": 3.0987218544910435, + "learning_rate": 4.234193563036211e-06, + "loss": 0.9552, + "step": 7321 + }, + { + "epoch": 1.4095338932069206, + "grad_norm": 3.1630195661630913, + "learning_rate": 4.23164639629985e-06, + "loss": 0.952, + "step": 7322 + }, + { + "epoch": 1.409726399884496, + "grad_norm": 3.100224102880909, + "learning_rate": 4.229099790321259e-06, + "loss": 0.9906, + "step": 7323 + }, + { + "epoch": 1.4099189065620714, + "grad_norm": 3.2992946372833343, + "learning_rate": 4.226553745347999e-06, + "loss": 1.0069, + "step": 7324 + }, + { + "epoch": 1.4101114132396466, + "grad_norm": 3.2068404973815348, + "learning_rate": 4.224008261627583e-06, + "loss": 1.0178, + "step": 7325 + }, + { + "epoch": 1.4103039199172223, + "grad_norm": 3.359014259833218, + "learning_rate": 4.221463339407454e-06, + "loss": 1.0193, + "step": 7326 + }, + { + "epoch": 1.4104964265947975, + "grad_norm": 3.358961999615674, + "learning_rate": 4.218918978935022e-06, + "loss": 0.994, + "step": 7327 + }, + { + "epoch": 1.4106889332723729, + "grad_norm": 3.3486145025532803, + "learning_rate": 4.216375180457626e-06, + "loss": 1.0524, + "step": 7328 + }, + { + "epoch": 1.4106889332723729, + "lm_loss": 0.9263, + "step": 7328, + "vm_loss": 0.1385 + }, + { + "epoch": 1.4106889332723729, + "lm_loss": 0.5902, + "step": 7328, + "vm_loss": 0.1169 + }, + { + "epoch": 1.4106889332723729, + "lm_loss": 0.4089, + "step": 7328, + "vm_loss": 0.2104 + }, + { + "epoch": 1.4106889332723729, + "lm_loss": 0.9828, + "step": 7328, + "vm_loss": 0.1675 + }, + { + "epoch": 1.4106889332723729, + "lm_loss": 0.6561, + "step": 7328, + "vm_loss": 0.1425 + }, + { + "epoch": 1.4106889332723729, + "lm_loss": 0.5534, + "step": 7328, + "vm_loss": 0.1621 + }, + { + "epoch": 1.4106889332723729, + "lm_loss": 0.3088, + "step": 7328, + "vm_loss": 0.1493 + }, + { + "epoch": 1.4106889332723729, + "lm_loss": 0.545, + "step": 7328, + "vm_loss": 0.1097 + }, + { + "epoch": 1.4108814399499483, + "grad_norm": 3.165090778978362, + "learning_rate": 4.213831944222556e-06, + "loss": 0.9587, + "step": 7329 + }, + { + "epoch": 1.4110739466275237, + "grad_norm": 3.22520948138797, + "learning_rate": 4.211289270477047e-06, + "loss": 0.9913, + "step": 7330 + }, + { + "epoch": 1.4112664533050991, + "grad_norm": 3.29524439109055, + "learning_rate": 4.2087471594682835e-06, + "loss": 1.0009, + "step": 7331 + }, + { + "epoch": 1.4114589599826743, + "grad_norm": 3.155685687100094, + "learning_rate": 4.206205611443386e-06, + "loss": 0.9773, + "step": 7332 + }, + { + "epoch": 1.4116514666602498, + "grad_norm": 3.1631723233831446, + "learning_rate": 4.203664626649426e-06, + "loss": 1.0305, + "step": 7333 + }, + { + "epoch": 1.4118439733378252, + "grad_norm": 3.263985432511573, + "learning_rate": 4.201124205333421e-06, + "loss": 1.077, + "step": 7334 + }, + { + "epoch": 1.4120364800154006, + "grad_norm": 3.3198038015606812, + "learning_rate": 4.198584347742337e-06, + "loss": 1.0166, + "step": 7335 + }, + { + "epoch": 1.412228986692976, + "grad_norm": 3.3581877619581135, + "learning_rate": 4.196045054123068e-06, + "loss": 1.0288, + "step": 7336 + }, + { + "epoch": 1.412228986692976, + "lm_loss": 0.5751, + "step": 7336, + "vm_loss": 0.1102 + }, + { + "epoch": 1.412228986692976, + "lm_loss": 0.7448, + "step": 7336, + "vm_loss": 0.1971 + }, + { + "epoch": 1.412228986692976, + "lm_loss": 0.8573, + "step": 7336, + "vm_loss": 0.1398 + }, + { + "epoch": 1.412228986692976, + "lm_loss": 0.7549, + "step": 7336, + "vm_loss": 0.1761 + }, + { + "epoch": 1.412228986692976, + "lm_loss": 0.8223, + "step": 7336, + "vm_loss": 0.1389 + }, + { + "epoch": 1.412228986692976, + "lm_loss": 0.6548, + "step": 7336, + "vm_loss": 0.1291 + }, + { + "epoch": 1.412228986692976, + "lm_loss": 1.2283, + "step": 7336, + "vm_loss": 0.1408 + }, + { + "epoch": 1.412228986692976, + "lm_loss": 0.6998, + "step": 7336, + "vm_loss": 0.143 + }, + { + "epoch": 1.4124214933705512, + "grad_norm": 3.0879554842673844, + "learning_rate": 4.193506324722481e-06, + "loss": 0.9324, + "step": 7337 + }, + { + "epoch": 1.4126140000481267, + "grad_norm": 3.315599772061835, + "learning_rate": 4.190968159787364e-06, + "loss": 1.0635, + "step": 7338 + }, + { + "epoch": 1.412806506725702, + "grad_norm": 3.136467155455047, + "learning_rate": 4.18843055956446e-06, + "loss": 1.0225, + "step": 7339 + }, + { + "epoch": 1.4129990134032775, + "grad_norm": 3.152344002246507, + "learning_rate": 4.185893524300458e-06, + "loss": 0.9572, + "step": 7340 + }, + { + "epoch": 1.413191520080853, + "grad_norm": 3.359363349712454, + "learning_rate": 4.183357054241989e-06, + "loss": 1.0496, + "step": 7341 + }, + { + "epoch": 1.4133840267584281, + "grad_norm": 3.413428952585428, + "learning_rate": 4.180821149635634e-06, + "loss": 1.0504, + "step": 7342 + }, + { + "epoch": 1.4135765334360035, + "grad_norm": 3.1090198648209566, + "learning_rate": 4.178285810727906e-06, + "loss": 0.9559, + "step": 7343 + }, + { + "epoch": 1.413769040113579, + "grad_norm": 3.3678127079420777, + "learning_rate": 4.175751037765286e-06, + "loss": 1.0858, + "step": 7344 + }, + { + "epoch": 1.413769040113579, + "lm_loss": 0.5224, + "step": 7344, + "vm_loss": 0.1403 + }, + { + "epoch": 1.413769040113579, + "lm_loss": 0.6963, + "step": 7344, + "vm_loss": 0.137 + }, + { + "epoch": 1.413769040113579, + "lm_loss": 0.7576, + "step": 7344, + "vm_loss": 0.1767 + }, + { + "epoch": 1.413769040113579, + "lm_loss": 0.9183, + "step": 7344, + "vm_loss": 0.1426 + }, + { + "epoch": 1.413769040113579, + "lm_loss": 0.8531, + "step": 7344, + "vm_loss": 0.2048 + }, + { + "epoch": 1.413769040113579, + "lm_loss": 1.0998, + "step": 7344, + "vm_loss": 0.1602 + }, + { + "epoch": 1.413769040113579, + "lm_loss": 0.5426, + "step": 7344, + "vm_loss": 0.1356 + }, + { + "epoch": 1.413769040113579, + "lm_loss": 0.443, + "step": 7344, + "vm_loss": 0.2192 + }, + { + "epoch": 1.4139615467911544, + "grad_norm": 3.084931466295408, + "learning_rate": 4.173216830994175e-06, + "loss": 0.986, + "step": 7345 + }, + { + "epoch": 1.4141540534687298, + "grad_norm": 3.1417702184589587, + "learning_rate": 4.170683190660934e-06, + "loss": 0.9746, + "step": 7346 + }, + { + "epoch": 1.414346560146305, + "grad_norm": 3.3208463954550624, + "learning_rate": 4.168150117011866e-06, + "loss": 1.1104, + "step": 7347 + }, + { + "epoch": 1.4145390668238804, + "grad_norm": 3.2567999363989935, + "learning_rate": 4.16561761029322e-06, + "loss": 1.0384, + "step": 7348 + }, + { + "epoch": 1.4147315735014558, + "grad_norm": 3.1159148533776615, + "learning_rate": 4.1630856707511834e-06, + "loss": 1.0272, + "step": 7349 + }, + { + "epoch": 1.4149240801790313, + "grad_norm": 3.171403810309266, + "learning_rate": 4.1605542986318955e-06, + "loss": 0.9976, + "step": 7350 + }, + { + "epoch": 1.4151165868566067, + "grad_norm": 3.278656649292173, + "learning_rate": 4.158023494181439e-06, + "loss": 1.0282, + "step": 7351 + }, + { + "epoch": 1.4153090935341819, + "grad_norm": 3.3617582419369936, + "learning_rate": 4.155493257645842e-06, + "loss": 1.0848, + "step": 7352 + }, + { + "epoch": 1.4153090935341819, + "lm_loss": 0.7457, + "step": 7352, + "vm_loss": 0.1416 + }, + { + "epoch": 1.4153090935341819, + "lm_loss": 1.0191, + "step": 7352, + "vm_loss": 0.155 + }, + { + "epoch": 1.4153090935341819, + "lm_loss": 1.3959, + "step": 7352, + "vm_loss": 0.191 + }, + { + "epoch": 1.4153090935341819, + "lm_loss": 0.6692, + "step": 7352, + "vm_loss": 0.1899 + }, + { + "epoch": 1.4153090935341819, + "lm_loss": 0.9548, + "step": 7352, + "vm_loss": 0.1263 + }, + { + "epoch": 1.4153090935341819, + "lm_loss": 0.6711, + "step": 7352, + "vm_loss": 0.1688 + }, + { + "epoch": 1.4153090935341819, + "lm_loss": 0.8017, + "step": 7352, + "vm_loss": 0.1966 + }, + { + "epoch": 1.4153090935341819, + "lm_loss": 0.5682, + "step": 7352, + "vm_loss": 0.1474 + }, + { + "epoch": 1.4155016002117573, + "grad_norm": 3.296179896864732, + "learning_rate": 4.152963589271067e-06, + "loss": 1.0208, + "step": 7353 + }, + { + "epoch": 1.4156941068893327, + "grad_norm": 3.0462229680183643, + "learning_rate": 4.150434489303043e-06, + "loss": 0.9255, + "step": 7354 + }, + { + "epoch": 1.4158866135669081, + "grad_norm": 3.3604111462995485, + "learning_rate": 4.147905957987621e-06, + "loss": 1.0757, + "step": 7355 + }, + { + "epoch": 1.4160791202444836, + "grad_norm": 3.346826409899582, + "learning_rate": 4.145377995570611e-06, + "loss": 1.0275, + "step": 7356 + }, + { + "epoch": 1.4162716269220588, + "grad_norm": 3.0982447370218082, + "learning_rate": 4.142850602297761e-06, + "loss": 0.932, + "step": 7357 + }, + { + "epoch": 1.4164641335996342, + "grad_norm": 3.2149632167768187, + "learning_rate": 4.140323778414772e-06, + "loss": 0.9685, + "step": 7358 + }, + { + "epoch": 1.4166566402772096, + "grad_norm": 3.3380226432358673, + "learning_rate": 4.137797524167274e-06, + "loss": 1.0108, + "step": 7359 + }, + { + "epoch": 1.416849146954785, + "grad_norm": 3.1673162551324245, + "learning_rate": 4.135271839800857e-06, + "loss": 0.9908, + "step": 7360 + }, + { + "epoch": 1.416849146954785, + "lm_loss": 0.5839, + "step": 7360, + "vm_loss": 0.1816 + }, + { + "epoch": 1.416849146954785, + "lm_loss": 1.2805, + "step": 7360, + "vm_loss": 0.225 + }, + { + "epoch": 1.416849146954785, + "lm_loss": 1.0768, + "step": 7360, + "vm_loss": 0.1101 + }, + { + "epoch": 1.416849146954785, + "lm_loss": 0.7986, + "step": 7360, + "vm_loss": 0.1896 + }, + { + "epoch": 1.416849146954785, + "lm_loss": 1.5303, + "step": 7360, + "vm_loss": 0.1882 + }, + { + "epoch": 1.416849146954785, + "lm_loss": 0.7462, + "step": 7360, + "vm_loss": 0.1018 + }, + { + "epoch": 1.416849146954785, + "lm_loss": 1.0603, + "step": 7360, + "vm_loss": 0.1794 + }, + { + "epoch": 1.416849146954785, + "lm_loss": 0.5246, + "step": 7360, + "vm_loss": 0.1949 + }, + { + "epoch": 1.4170416536323605, + "grad_norm": 3.351910542398462, + "learning_rate": 4.132746725561049e-06, + "loss": 1.0354, + "step": 7361 + }, + { + "epoch": 1.4172341603099357, + "grad_norm": 3.1473049693140553, + "learning_rate": 4.130222181693323e-06, + "loss": 0.9857, + "step": 7362 + }, + { + "epoch": 1.417426666987511, + "grad_norm": 3.179922747859307, + "learning_rate": 4.127698208443097e-06, + "loss": 0.974, + "step": 7363 + }, + { + "epoch": 1.4176191736650865, + "grad_norm": 3.320696069472839, + "learning_rate": 4.125174806055735e-06, + "loss": 1.0389, + "step": 7364 + }, + { + "epoch": 1.417811680342662, + "grad_norm": 3.2790493766282345, + "learning_rate": 4.122651974776546e-06, + "loss": 1.0249, + "step": 7365 + }, + { + "epoch": 1.4180041870202373, + "grad_norm": 3.105329561705002, + "learning_rate": 4.120129714850773e-06, + "loss": 0.9381, + "step": 7366 + }, + { + "epoch": 1.4181966936978125, + "grad_norm": 3.0690335902779435, + "learning_rate": 4.11760802652362e-06, + "loss": 0.9104, + "step": 7367 + }, + { + "epoch": 1.418389200375388, + "grad_norm": 3.226350529488512, + "learning_rate": 4.115086910040222e-06, + "loss": 1.0282, + "step": 7368 + }, + { + "epoch": 1.418389200375388, + "lm_loss": 1.6842, + "step": 7368, + "vm_loss": 0.1623 + }, + { + "epoch": 1.418389200375388, + "lm_loss": 0.9523, + "step": 7368, + "vm_loss": 0.1316 + }, + { + "epoch": 1.418389200375388, + "lm_loss": 0.529, + "step": 7368, + "vm_loss": 0.1143 + }, + { + "epoch": 1.418389200375388, + "lm_loss": 0.5949, + "step": 7368, + "vm_loss": 0.1205 + }, + { + "epoch": 1.418389200375388, + "lm_loss": 1.0147, + "step": 7368, + "vm_loss": 0.1292 + }, + { + "epoch": 1.418389200375388, + "lm_loss": 0.9286, + "step": 7368, + "vm_loss": 0.2069 + }, + { + "epoch": 1.418389200375388, + "lm_loss": 1.0407, + "step": 7368, + "vm_loss": 0.2081 + }, + { + "epoch": 1.418389200375388, + "lm_loss": 0.6375, + "step": 7368, + "vm_loss": 0.1422 + }, + { + "epoch": 1.4185817070529634, + "grad_norm": 3.4301240144104828, + "learning_rate": 4.112566365645671e-06, + "loss": 1.0072, + "step": 7369 + }, + { + "epoch": 1.4187742137305388, + "grad_norm": 3.3228689913713416, + "learning_rate": 4.110046393584985e-06, + "loss": 1.0226, + "step": 7370 + }, + { + "epoch": 1.4189667204081142, + "grad_norm": 3.4187041171908503, + "learning_rate": 4.107526994103151e-06, + "loss": 1.069, + "step": 7371 + }, + { + "epoch": 1.4191592270856894, + "grad_norm": 3.19833901106283, + "learning_rate": 4.105008167445076e-06, + "loss": 0.9771, + "step": 7372 + }, + { + "epoch": 1.4193517337632648, + "grad_norm": 3.3456634701639083, + "learning_rate": 4.10248991385563e-06, + "loss": 1.0419, + "step": 7373 + }, + { + "epoch": 1.4195442404408403, + "grad_norm": 3.2372282551924627, + "learning_rate": 4.099972233579608e-06, + "loss": 1.0027, + "step": 7374 + }, + { + "epoch": 1.4197367471184157, + "grad_norm": 3.2251809983981645, + "learning_rate": 4.097455126861776e-06, + "loss": 0.9876, + "step": 7375 + }, + { + "epoch": 1.419929253795991, + "grad_norm": 3.3391858143242152, + "learning_rate": 4.094938593946817e-06, + "loss": 1.0079, + "step": 7376 + }, + { + "epoch": 1.419929253795991, + "lm_loss": 0.6546, + "step": 7376, + "vm_loss": 0.1941 + }, + { + "epoch": 1.419929253795991, + "lm_loss": 0.6136, + "step": 7376, + "vm_loss": 0.1591 + }, + { + "epoch": 1.419929253795991, + "lm_loss": 0.7492, + "step": 7376, + "vm_loss": 0.2151 + }, + { + "epoch": 1.419929253795991, + "lm_loss": 1.1023, + "step": 7376, + "vm_loss": 0.1368 + }, + { + "epoch": 1.419929253795991, + "lm_loss": 0.6279, + "step": 7376, + "vm_loss": 0.1545 + }, + { + "epoch": 1.419929253795991, + "lm_loss": 1.6072, + "step": 7376, + "vm_loss": 0.1621 + }, + { + "epoch": 1.419929253795991, + "lm_loss": 0.6872, + "step": 7376, + "vm_loss": 0.1632 + }, + { + "epoch": 1.419929253795991, + "lm_loss": 1.1598, + "step": 7376, + "vm_loss": 0.207 + }, + { + "epoch": 1.4201217604735663, + "grad_norm": 3.4046885308325536, + "learning_rate": 4.092422635079375e-06, + "loss": 1.0126, + "step": 7377 + }, + { + "epoch": 1.4203142671511417, + "grad_norm": 3.446814180320635, + "learning_rate": 4.089907250504032e-06, + "loss": 1.0212, + "step": 7378 + }, + { + "epoch": 1.4205067738287172, + "grad_norm": 3.4339173009741746, + "learning_rate": 4.087392440465321e-06, + "loss": 1.0277, + "step": 7379 + }, + { + "epoch": 1.4206992805062926, + "grad_norm": 3.242526341651682, + "learning_rate": 4.084878205207702e-06, + "loss": 1.029, + "step": 7380 + }, + { + "epoch": 1.420891787183868, + "grad_norm": 3.4595850655916522, + "learning_rate": 4.082364544975604e-06, + "loss": 1.0997, + "step": 7381 + }, + { + "epoch": 1.4210842938614432, + "grad_norm": 3.183047256656175, + "learning_rate": 4.079851460013377e-06, + "loss": 1.03, + "step": 7382 + }, + { + "epoch": 1.4212768005390186, + "grad_norm": 3.1627443896582776, + "learning_rate": 4.07733895056533e-06, + "loss": 1.0262, + "step": 7383 + }, + { + "epoch": 1.421469307216594, + "grad_norm": 3.158894113957345, + "learning_rate": 4.074827016875711e-06, + "loss": 1.0214, + "step": 7384 + }, + { + "epoch": 1.421469307216594, + "lm_loss": 0.8131, + "step": 7384, + "vm_loss": 0.1753 + }, + { + "epoch": 1.421469307216594, + "lm_loss": 1.016, + "step": 7384, + "vm_loss": 0.1766 + }, + { + "epoch": 1.421469307216594, + "lm_loss": 0.584, + "step": 7384, + "vm_loss": 0.1463 + }, + { + "epoch": 1.421469307216594, + "lm_loss": 0.767, + "step": 7384, + "vm_loss": 0.2098 + }, + { + "epoch": 1.421469307216594, + "lm_loss": 1.0661, + "step": 7384, + "vm_loss": 0.1486 + }, + { + "epoch": 1.421469307216594, + "lm_loss": 0.8573, + "step": 7384, + "vm_loss": 0.1978 + }, + { + "epoch": 1.421469307216594, + "lm_loss": 1.0721, + "step": 7384, + "vm_loss": 0.1553 + }, + { + "epoch": 1.421469307216594, + "lm_loss": 1.0122, + "step": 7384, + "vm_loss": 0.1163 + }, + { + "epoch": 1.4216618138941695, + "grad_norm": 3.350069305976011, + "learning_rate": 4.072315659188709e-06, + "loss": 1.0566, + "step": 7385 + }, + { + "epoch": 1.4218543205717449, + "grad_norm": 3.2171703777806897, + "learning_rate": 4.069804877748468e-06, + "loss": 1.0285, + "step": 7386 + }, + { + "epoch": 1.42204682724932, + "grad_norm": 3.190166627259227, + "learning_rate": 4.067294672799053e-06, + "loss": 1.0668, + "step": 7387 + }, + { + "epoch": 1.4222393339268957, + "grad_norm": 3.3119135660117944, + "learning_rate": 4.064785044584505e-06, + "loss": 1.0169, + "step": 7388 + }, + { + "epoch": 1.422431840604471, + "grad_norm": 3.207983359737915, + "learning_rate": 4.062275993348781e-06, + "loss": 0.9527, + "step": 7389 + }, + { + "epoch": 1.4226243472820463, + "grad_norm": 3.258990378604529, + "learning_rate": 4.0597675193357995e-06, + "loss": 1.0314, + "step": 7390 + }, + { + "epoch": 1.4228168539596218, + "grad_norm": 3.1921282043953663, + "learning_rate": 4.0572596227894054e-06, + "loss": 0.9862, + "step": 7391 + }, + { + "epoch": 1.4230093606371972, + "grad_norm": 3.340513437604555, + "learning_rate": 4.0547523039534135e-06, + "loss": 1.0409, + "step": 7392 + }, + { + "epoch": 1.4230093606371972, + "lm_loss": 0.5914, + "step": 7392, + "vm_loss": 0.1988 + }, + { + "epoch": 1.4230093606371972, + "lm_loss": 0.9033, + "step": 7392, + "vm_loss": 0.1519 + }, + { + "epoch": 1.4230093606371972, + "lm_loss": 0.6052, + "step": 7392, + "vm_loss": 0.149 + }, + { + "epoch": 1.4230093606371972, + "lm_loss": 1.1184, + "step": 7392, + "vm_loss": 0.1718 + }, + { + "epoch": 1.4230093606371972, + "lm_loss": 0.5199, + "step": 7392, + "vm_loss": 0.1731 + }, + { + "epoch": 1.4230093606371972, + "lm_loss": 0.5446, + "step": 7392, + "vm_loss": 0.1292 + }, + { + "epoch": 1.4230093606371972, + "lm_loss": 0.6481, + "step": 7392, + "vm_loss": 0.1737 + }, + { + "epoch": 1.4230093606371972, + "lm_loss": 0.7845, + "step": 7392, + "vm_loss": 0.2065 + }, + { + "epoch": 1.4232018673147726, + "grad_norm": 3.20556945845834, + "learning_rate": 4.052245563071557e-06, + "loss": 1.0324, + "step": 7393 + }, + { + "epoch": 1.4233943739923478, + "grad_norm": 3.4175801825526073, + "learning_rate": 4.049739400387524e-06, + "loss": 1.0217, + "step": 7394 + }, + { + "epoch": 1.4235868806699232, + "grad_norm": 3.3716952623686627, + "learning_rate": 4.0472338161449475e-06, + "loss": 1.0453, + "step": 7395 + }, + { + "epoch": 1.4237793873474986, + "grad_norm": 3.244058303981847, + "learning_rate": 4.044728810587406e-06, + "loss": 1.0354, + "step": 7396 + }, + { + "epoch": 1.423971894025074, + "grad_norm": 3.2972306629488317, + "learning_rate": 4.042224383958408e-06, + "loss": 1.0326, + "step": 7397 + }, + { + "epoch": 1.4241644007026495, + "grad_norm": 3.2273620322360648, + "learning_rate": 4.039720536501428e-06, + "loss": 0.9559, + "step": 7398 + }, + { + "epoch": 1.4243569073802247, + "grad_norm": 3.3238210969343887, + "learning_rate": 4.037217268459863e-06, + "loss": 1.046, + "step": 7399 + }, + { + "epoch": 1.4245494140578001, + "grad_norm": 3.2923909523340944, + "learning_rate": 4.034714580077066e-06, + "loss": 0.9389, + "step": 7400 + }, + { + "epoch": 1.4245494140578001, + "lm_loss": 0.6158, + "step": 7400, + "vm_loss": 0.1451 + }, + { + "epoch": 1.4245494140578001, + "lm_loss": 1.0048, + "step": 7400, + "vm_loss": 0.1543 + }, + { + "epoch": 1.4245494140578001, + "lm_loss": 0.7914, + "step": 7400, + "vm_loss": 0.1884 + }, + { + "epoch": 1.4245494140578001, + "lm_loss": 0.5698, + "step": 7400, + "vm_loss": 0.1455 + }, + { + "epoch": 1.4245494140578001, + "lm_loss": 0.9156, + "step": 7400, + "vm_loss": 0.1917 + }, + { + "epoch": 1.4245494140578001, + "lm_loss": 0.768, + "step": 7400, + "vm_loss": 0.2151 + }, + { + "epoch": 1.4245494140578001, + "lm_loss": 0.3554, + "step": 7400, + "vm_loss": 0.155 + }, + { + "epoch": 1.4245494140578001, + "lm_loss": 0.9098, + "step": 7400, + "vm_loss": 0.1997 + }, + { + "epoch": 1.4247419207353755, + "grad_norm": 3.2288875348629715, + "learning_rate": 4.03221247159633e-06, + "loss": 1.0197, + "step": 7401 + }, + { + "epoch": 1.424934427412951, + "grad_norm": 3.115225700658362, + "learning_rate": 4.029710943260895e-06, + "loss": 0.9632, + "step": 7402 + }, + { + "epoch": 1.4251269340905264, + "grad_norm": 3.1840041920195, + "learning_rate": 4.027209995313935e-06, + "loss": 1.0211, + "step": 7403 + }, + { + "epoch": 1.4253194407681016, + "grad_norm": 3.2325860128206916, + "learning_rate": 4.024709627998579e-06, + "loss": 0.9853, + "step": 7404 + }, + { + "epoch": 1.425511947445677, + "grad_norm": 3.2774833124718556, + "learning_rate": 4.022209841557892e-06, + "loss": 0.9616, + "step": 7405 + }, + { + "epoch": 1.4257044541232524, + "grad_norm": 3.3981942344028373, + "learning_rate": 4.019710636234888e-06, + "loss": 1.0437, + "step": 7406 + }, + { + "epoch": 1.4258969608008278, + "grad_norm": 3.301355253386875, + "learning_rate": 4.017212012272524e-06, + "loss": 1.0403, + "step": 7407 + }, + { + "epoch": 1.4260894674784033, + "grad_norm": 3.22398247074651, + "learning_rate": 4.014713969913686e-06, + "loss": 0.9735, + "step": 7408 + }, + { + "epoch": 1.4260894674784033, + "lm_loss": 0.8611, + "step": 7408, + "vm_loss": 0.1549 + }, + { + "epoch": 1.4260894674784033, + "lm_loss": 0.5353, + "step": 7408, + "vm_loss": 0.1873 + }, + { + "epoch": 1.4260894674784033, + "lm_loss": 1.1884, + "step": 7408, + "vm_loss": 0.1506 + }, + { + "epoch": 1.4260894674784033, + "lm_loss": 1.1233, + "step": 7408, + "vm_loss": 0.1503 + }, + { + "epoch": 1.4260894674784033, + "lm_loss": 1.0175, + "step": 7408, + "vm_loss": 0.1378 + }, + { + "epoch": 1.4260894674784033, + "lm_loss": 0.771, + "step": 7408, + "vm_loss": 0.2339 + }, + { + "epoch": 1.4260894674784033, + "lm_loss": 0.7138, + "step": 7408, + "vm_loss": 0.186 + }, + { + "epoch": 1.4260894674784033, + "lm_loss": 1.0238, + "step": 7408, + "vm_loss": 0.1577 + }, + { + "epoch": 1.4262819741559785, + "grad_norm": 3.303176690818786, + "learning_rate": 4.0122165094012335e-06, + "loss": 1.0029, + "step": 7409 + }, + { + "epoch": 1.4264744808335539, + "grad_norm": 3.256662276211901, + "learning_rate": 4.009719630977937e-06, + "loss": 1.0179, + "step": 7410 + }, + { + "epoch": 1.4266669875111293, + "grad_norm": 3.1580133047293906, + "learning_rate": 4.007223334886531e-06, + "loss": 1.0115, + "step": 7411 + }, + { + "epoch": 1.4268594941887047, + "grad_norm": 3.1874531311439176, + "learning_rate": 4.004727621369688e-06, + "loss": 0.9649, + "step": 7412 + }, + { + "epoch": 1.4270520008662801, + "grad_norm": 3.13307885663222, + "learning_rate": 4.002232490670024e-06, + "loss": 1.0101, + "step": 7413 + }, + { + "epoch": 1.4272445075438553, + "grad_norm": 3.4095476491296495, + "learning_rate": 3.999737943030088e-06, + "loss": 1.0812, + "step": 7414 + }, + { + "epoch": 1.4274370142214308, + "grad_norm": 3.1072478471573235, + "learning_rate": 3.997243978692399e-06, + "loss": 0.9711, + "step": 7415 + }, + { + "epoch": 1.4276295208990062, + "grad_norm": 3.255597463997609, + "learning_rate": 3.994750597899389e-06, + "loss": 1.0046, + "step": 7416 + }, + { + "epoch": 1.4276295208990062, + "lm_loss": 1.1918, + "step": 7416, + "vm_loss": 0.1391 + }, + { + "epoch": 1.4276295208990062, + "lm_loss": 0.5311, + "step": 7416, + "vm_loss": 0.1497 + }, + { + "epoch": 1.4276295208990062, + "lm_loss": 1.0459, + "step": 7416, + "vm_loss": 0.1514 + }, + { + "epoch": 1.4276295208990062, + "lm_loss": 0.5959, + "step": 7416, + "vm_loss": 0.141 + }, + { + "epoch": 1.4276295208990062, + "lm_loss": 0.611, + "step": 7416, + "vm_loss": 0.2386 + }, + { + "epoch": 1.4276295208990062, + "lm_loss": 0.4351, + "step": 7416, + "vm_loss": 0.1588 + }, + { + "epoch": 1.4276295208990062, + "lm_loss": 0.8348, + "step": 7416, + "vm_loss": 0.2063 + }, + { + "epoch": 1.4276295208990062, + "lm_loss": 0.5412, + "step": 7416, + "vm_loss": 0.1244 + }, + { + "epoch": 1.4278220275765816, + "grad_norm": 3.123729056411103, + "learning_rate": 3.992257800893453e-06, + "loss": 0.9645, + "step": 7417 + }, + { + "epoch": 1.428014534254157, + "grad_norm": 3.2674029856602966, + "learning_rate": 3.989765587916914e-06, + "loss": 0.9322, + "step": 7418 + }, + { + "epoch": 1.4282070409317322, + "grad_norm": 3.188003582916186, + "learning_rate": 3.987273959212059e-06, + "loss": 0.9531, + "step": 7419 + }, + { + "epoch": 1.4283995476093077, + "grad_norm": 3.5679165307966625, + "learning_rate": 3.984782915021098e-06, + "loss": 1.0613, + "step": 7420 + }, + { + "epoch": 1.428592054286883, + "grad_norm": 3.420518370485407, + "learning_rate": 3.982292455586195e-06, + "loss": 1.0468, + "step": 7421 + }, + { + "epoch": 1.4287845609644585, + "grad_norm": 3.329595565137712, + "learning_rate": 3.979802581149453e-06, + "loss": 0.9915, + "step": 7422 + }, + { + "epoch": 1.428977067642034, + "grad_norm": 3.3412893140265023, + "learning_rate": 3.977313291952926e-06, + "loss": 1.0029, + "step": 7423 + }, + { + "epoch": 1.4291695743196091, + "grad_norm": 3.2661413648901907, + "learning_rate": 3.974824588238595e-06, + "loss": 0.9889, + "step": 7424 + }, + { + "epoch": 1.4291695743196091, + "lm_loss": 1.0655, + "step": 7424, + "vm_loss": 0.1535 + }, + { + "epoch": 1.4291695743196091, + "lm_loss": 0.578, + "step": 7424, + "vm_loss": 0.2205 + }, + { + "epoch": 1.4291695743196091, + "lm_loss": 1.0376, + "step": 7424, + "vm_loss": 0.1147 + }, + { + "epoch": 1.4291695743196091, + "lm_loss": 1.2093, + "step": 7424, + "vm_loss": 0.1047 + }, + { + "epoch": 1.4291695743196091, + "lm_loss": 0.819, + "step": 7424, + "vm_loss": 0.0999 + }, + { + "epoch": 1.4291695743196091, + "lm_loss": 0.5052, + "step": 7424, + "vm_loss": 0.1983 + }, + { + "epoch": 1.4291695743196091, + "lm_loss": 0.7423, + "step": 7424, + "vm_loss": 0.1182 + }, + { + "epoch": 1.4291695743196091, + "lm_loss": 1.009, + "step": 7424, + "vm_loss": 0.1584 + }, + { + "epoch": 1.4293620809971845, + "grad_norm": 3.252995902185338, + "learning_rate": 3.972336470248399e-06, + "loss": 1.0009, + "step": 7425 + }, + { + "epoch": 1.42955458767476, + "grad_norm": 3.2873847364206763, + "learning_rate": 3.969848938224215e-06, + "loss": 1.0308, + "step": 7426 + }, + { + "epoch": 1.4297470943523354, + "grad_norm": 3.245030794213246, + "learning_rate": 3.967361992407861e-06, + "loss": 0.9839, + "step": 7427 + }, + { + "epoch": 1.4299396010299108, + "grad_norm": 3.291314172424728, + "learning_rate": 3.964875633041102e-06, + "loss": 1.0538, + "step": 7428 + }, + { + "epoch": 1.430132107707486, + "grad_norm": 3.2161676801464143, + "learning_rate": 3.962389860365643e-06, + "loss": 0.9574, + "step": 7429 + }, + { + "epoch": 1.4303246143850614, + "grad_norm": 3.0914050654666987, + "learning_rate": 3.959904674623137e-06, + "loss": 0.9535, + "step": 7430 + }, + { + "epoch": 1.4305171210626368, + "grad_norm": 3.212896007320673, + "learning_rate": 3.957420076055165e-06, + "loss": 0.9593, + "step": 7431 + }, + { + "epoch": 1.4307096277402123, + "grad_norm": 3.404286849714347, + "learning_rate": 3.954936064903276e-06, + "loss": 1.0614, + "step": 7432 + }, + { + "epoch": 1.4307096277402123, + "lm_loss": 0.7622, + "step": 7432, + "vm_loss": 0.164 + }, + { + "epoch": 1.4307096277402123, + "lm_loss": 0.9301, + "step": 7432, + "vm_loss": 0.138 + }, + { + "epoch": 1.4307096277402123, + "lm_loss": 0.8515, + "step": 7432, + "vm_loss": 0.1659 + }, + { + "epoch": 1.4307096277402123, + "lm_loss": 0.6838, + "step": 7432, + "vm_loss": 0.1539 + }, + { + "epoch": 1.4307096277402123, + "lm_loss": 1.0809, + "step": 7432, + "vm_loss": 0.1448 + }, + { + "epoch": 1.4307096277402123, + "lm_loss": 0.8926, + "step": 7432, + "vm_loss": 0.1824 + }, + { + "epoch": 1.4307096277402123, + "lm_loss": 1.3521, + "step": 7432, + "vm_loss": 0.1411 + }, + { + "epoch": 1.4307096277402123, + "lm_loss": 0.9398, + "step": 7432, + "vm_loss": 0.0998 + }, + { + "epoch": 1.4309021344177877, + "grad_norm": 3.345257249499339, + "learning_rate": 3.952452641408937e-06, + "loss": 1.0511, + "step": 7433 + }, + { + "epoch": 1.4310946410953629, + "grad_norm": 3.2836892473193955, + "learning_rate": 3.949969805813575e-06, + "loss": 0.9989, + "step": 7434 + }, + { + "epoch": 1.4312871477729383, + "grad_norm": 3.3396982441324647, + "learning_rate": 3.947487558358544e-06, + "loss": 1.0021, + "step": 7435 + }, + { + "epoch": 1.4314796544505137, + "grad_norm": 3.4172474902373176, + "learning_rate": 3.945005899285163e-06, + "loss": 1.0002, + "step": 7436 + }, + { + "epoch": 1.4316721611280891, + "grad_norm": 3.201080773306174, + "learning_rate": 3.94252482883467e-06, + "loss": 0.9697, + "step": 7437 + }, + { + "epoch": 1.4318646678056646, + "grad_norm": 3.2652406216236427, + "learning_rate": 3.940044347248263e-06, + "loss": 0.9904, + "step": 7438 + }, + { + "epoch": 1.4320571744832398, + "grad_norm": 3.351456416168999, + "learning_rate": 3.937564454767074e-06, + "loss": 1.0224, + "step": 7439 + }, + { + "epoch": 1.4322496811608152, + "grad_norm": 3.244925645364108, + "learning_rate": 3.935085151632185e-06, + "loss": 1.0115, + "step": 7440 + }, + { + "epoch": 1.4322496811608152, + "lm_loss": 0.4762, + "step": 7440, + "vm_loss": 0.1246 + }, + { + "epoch": 1.4322496811608152, + "lm_loss": 1.3445, + "step": 7440, + "vm_loss": 0.1946 + }, + { + "epoch": 1.4322496811608152, + "lm_loss": 1.1248, + "step": 7440, + "vm_loss": 0.1828 + }, + { + "epoch": 1.4322496811608152, + "lm_loss": 0.8252, + "step": 7440, + "vm_loss": 0.1221 + }, + { + "epoch": 1.4322496811608152, + "lm_loss": 0.7972, + "step": 7440, + "vm_loss": 0.1218 + }, + { + "epoch": 1.4322496811608152, + "lm_loss": 0.5959, + "step": 7440, + "vm_loss": 0.1845 + }, + { + "epoch": 1.4322496811608152, + "lm_loss": 0.7924, + "step": 7440, + "vm_loss": 0.1612 + }, + { + "epoch": 1.4322496811608152, + "lm_loss": 1.1844, + "step": 7440, + "vm_loss": 0.2002 + }, + { + "epoch": 1.4324421878383906, + "grad_norm": 3.220571344953075, + "learning_rate": 3.932606438084609e-06, + "loss": 0.9602, + "step": 7441 + }, + { + "epoch": 1.432634694515966, + "grad_norm": 3.2605862014377633, + "learning_rate": 3.9301283143653125e-06, + "loss": 0.9803, + "step": 7442 + }, + { + "epoch": 1.4328272011935415, + "grad_norm": 3.3283590305902697, + "learning_rate": 3.9276507807151985e-06, + "loss": 1.0042, + "step": 7443 + }, + { + "epoch": 1.4330197078711167, + "grad_norm": 3.3350422244063975, + "learning_rate": 3.925173837375118e-06, + "loss": 1.0209, + "step": 7444 + }, + { + "epoch": 1.433212214548692, + "grad_norm": 3.2043577402483088, + "learning_rate": 3.92269748458586e-06, + "loss": 0.9655, + "step": 7445 + }, + { + "epoch": 1.4334047212262675, + "grad_norm": 3.31921524712407, + "learning_rate": 3.920221722588161e-06, + "loss": 0.9912, + "step": 7446 + }, + { + "epoch": 1.433597227903843, + "grad_norm": 3.124418309022242, + "learning_rate": 3.91774655162269e-06, + "loss": 0.9347, + "step": 7447 + }, + { + "epoch": 1.4337897345814183, + "grad_norm": 3.3104114389444033, + "learning_rate": 3.915271971930069e-06, + "loss": 0.989, + "step": 7448 + }, + { + "epoch": 1.4337897345814183, + "lm_loss": 1.1521, + "step": 7448, + "vm_loss": 0.1624 + }, + { + "epoch": 1.4337897345814183, + "lm_loss": 0.4039, + "step": 7448, + "vm_loss": 0.2023 + }, + { + "epoch": 1.4337897345814183, + "lm_loss": 0.8436, + "step": 7448, + "vm_loss": 0.1603 + }, + { + "epoch": 1.4337897345814183, + "lm_loss": 1.0746, + "step": 7448, + "vm_loss": 0.1656 + }, + { + "epoch": 1.4337897345814183, + "lm_loss": 0.7943, + "step": 7448, + "vm_loss": 0.1946 + }, + { + "epoch": 1.4337897345814183, + "lm_loss": 0.7272, + "step": 7448, + "vm_loss": 0.1506 + }, + { + "epoch": 1.4337897345814183, + "lm_loss": 0.6191, + "step": 7448, + "vm_loss": 0.2139 + }, + { + "epoch": 1.4337897345814183, + "lm_loss": 0.5419, + "step": 7448, + "vm_loss": 0.1123 + }, + { + "epoch": 1.4339822412589935, + "grad_norm": 3.214734386182521, + "learning_rate": 3.912797983750859e-06, + "loss": 1.0084, + "step": 7449 + }, + { + "epoch": 1.4341747479365692, + "grad_norm": 3.236118347486923, + "learning_rate": 3.910324587325564e-06, + "loss": 0.933, + "step": 7450 + }, + { + "epoch": 1.4343672546141444, + "grad_norm": 3.2760855075400297, + "learning_rate": 3.90785178289463e-06, + "loss": 0.9781, + "step": 7451 + }, + { + "epoch": 1.4345597612917198, + "grad_norm": 3.4453944172252378, + "learning_rate": 3.905379570698439e-06, + "loss": 1.0589, + "step": 7452 + }, + { + "epoch": 1.4347522679692952, + "grad_norm": 3.286427005157244, + "learning_rate": 3.902907950977333e-06, + "loss": 1.027, + "step": 7453 + }, + { + "epoch": 1.4349447746468704, + "grad_norm": 3.350066963710581, + "learning_rate": 3.900436923971576e-06, + "loss": 1.0257, + "step": 7454 + }, + { + "epoch": 1.435137281324446, + "grad_norm": 3.2163408955439383, + "learning_rate": 3.897966489921384e-06, + "loss": 0.9931, + "step": 7455 + }, + { + "epoch": 1.4353297880020213, + "grad_norm": 3.1868054016766503, + "learning_rate": 3.895496649066919e-06, + "loss": 0.9636, + "step": 7456 + }, + { + "epoch": 1.4353297880020213, + "lm_loss": 0.8351, + "step": 7456, + "vm_loss": 0.1509 + }, + { + "epoch": 1.4353297880020213, + "lm_loss": 0.6497, + "step": 7456, + "vm_loss": 0.1041 + }, + { + "epoch": 1.4353297880020213, + "lm_loss": 0.5105, + "step": 7456, + "vm_loss": 0.1195 + }, + { + "epoch": 1.4353297880020213, + "lm_loss": 0.788, + "step": 7456, + "vm_loss": 0.1804 + }, + { + "epoch": 1.4353297880020213, + "lm_loss": 0.6853, + "step": 7456, + "vm_loss": 0.1824 + }, + { + "epoch": 1.4353297880020213, + "lm_loss": 0.8977, + "step": 7456, + "vm_loss": 0.1935 + }, + { + "epoch": 1.4353297880020213, + "lm_loss": 1.2236, + "step": 7456, + "vm_loss": 0.1615 + }, + { + "epoch": 1.4353297880020213, + "lm_loss": 0.8895, + "step": 7456, + "vm_loss": 0.1694 + }, + { + "epoch": 1.4355222946795967, + "grad_norm": 3.280423662805804, + "learning_rate": 3.893027401648281e-06, + "loss": 0.9728, + "step": 7457 + }, + { + "epoch": 1.435714801357172, + "grad_norm": 3.3222574346585723, + "learning_rate": 3.89055874790551e-06, + "loss": 0.9605, + "step": 7458 + }, + { + "epoch": 1.4359073080347475, + "grad_norm": 3.218268050577204, + "learning_rate": 3.888090688078589e-06, + "loss": 0.9926, + "step": 7459 + }, + { + "epoch": 1.436099814712323, + "grad_norm": 3.317277203857952, + "learning_rate": 3.885623222407448e-06, + "loss": 0.985, + "step": 7460 + }, + { + "epoch": 1.4362923213898982, + "grad_norm": 3.2611957253280526, + "learning_rate": 3.8831563511319585e-06, + "loss": 1.0346, + "step": 7461 + }, + { + "epoch": 1.4364848280674736, + "grad_norm": 3.33529861871596, + "learning_rate": 3.8806900744919205e-06, + "loss": 1.0221, + "step": 7462 + }, + { + "epoch": 1.436677334745049, + "grad_norm": 3.2641392981184447, + "learning_rate": 3.878224392727106e-06, + "loss": 1.002, + "step": 7463 + }, + { + "epoch": 1.4368698414226244, + "grad_norm": 3.308520639128367, + "learning_rate": 3.875759306077196e-06, + "loss": 1.0438, + "step": 7464 + }, + { + "epoch": 1.4368698414226244, + "lm_loss": 0.6734, + "step": 7464, + "vm_loss": 0.1295 + }, + { + "epoch": 1.4368698414226244, + "lm_loss": 0.8854, + "step": 7464, + "vm_loss": 0.16 + }, + { + "epoch": 1.4368698414226244, + "lm_loss": 0.4941, + "step": 7464, + "vm_loss": 0.1378 + }, + { + "epoch": 1.4368698414226244, + "lm_loss": 0.6, + "step": 7464, + "vm_loss": 0.1513 + }, + { + "epoch": 1.4368698414226244, + "lm_loss": 0.8256, + "step": 7464, + "vm_loss": 0.251 + }, + { + "epoch": 1.4368698414226244, + "lm_loss": 1.0003, + "step": 7464, + "vm_loss": 0.2095 + }, + { + "epoch": 1.4368698414226244, + "lm_loss": 0.6397, + "step": 7464, + "vm_loss": 0.1621 + }, + { + "epoch": 1.4368698414226244, + "lm_loss": 0.6008, + "step": 7464, + "vm_loss": 0.1198 + }, + { + "epoch": 1.4370623481001998, + "grad_norm": 3.2277235291794386, + "learning_rate": 3.8732948147818324e-06, + "loss": 0.9642, + "step": 7465 + }, + { + "epoch": 1.437254854777775, + "grad_norm": 3.203874049249102, + "learning_rate": 3.870830919080598e-06, + "loss": 0.9526, + "step": 7466 + }, + { + "epoch": 1.4374473614553505, + "grad_norm": 3.236389700174982, + "learning_rate": 3.868367619213012e-06, + "loss": 0.9435, + "step": 7467 + }, + { + "epoch": 1.4376398681329259, + "grad_norm": 3.2450882075874237, + "learning_rate": 3.865904915418544e-06, + "loss": 1.01, + "step": 7468 + }, + { + "epoch": 1.4378323748105013, + "grad_norm": 3.235749810889186, + "learning_rate": 3.863442807936592e-06, + "loss": 0.9921, + "step": 7469 + }, + { + "epoch": 1.4380248814880767, + "grad_norm": 3.3633187079149254, + "learning_rate": 3.86098129700651e-06, + "loss": 1.023, + "step": 7470 + }, + { + "epoch": 1.438217388165652, + "grad_norm": 3.4290620440258244, + "learning_rate": 3.858520382867586e-06, + "loss": 1.0356, + "step": 7471 + }, + { + "epoch": 1.4384098948432273, + "grad_norm": 3.4227166008932906, + "learning_rate": 3.856060065759053e-06, + "loss": 1.0122, + "step": 7472 + }, + { + "epoch": 1.4384098948432273, + "lm_loss": 0.4834, + "step": 7472, + "vm_loss": 0.1461 + }, + { + "epoch": 1.4384098948432273, + "lm_loss": 0.4899, + "step": 7472, + "vm_loss": 0.1979 + }, + { + "epoch": 1.4384098948432273, + "lm_loss": 0.9298, + "step": 7472, + "vm_loss": 0.1239 + }, + { + "epoch": 1.4384098948432273, + "lm_loss": 0.7222, + "step": 7472, + "vm_loss": 0.1921 + }, + { + "epoch": 1.4384098948432273, + "lm_loss": 0.6443, + "step": 7472, + "vm_loss": 0.1835 + }, + { + "epoch": 1.4384098948432273, + "lm_loss": 1.0035, + "step": 7472, + "vm_loss": 0.1463 + }, + { + "epoch": 1.4384098948432273, + "lm_loss": 0.9978, + "step": 7472, + "vm_loss": 0.2305 + }, + { + "epoch": 1.4384098948432273, + "lm_loss": 1.0775, + "step": 7472, + "vm_loss": 0.2082 + }, + { + "epoch": 1.4386024015208028, + "grad_norm": 3.2644956050559304, + "learning_rate": 3.853600345920087e-06, + "loss": 0.9965, + "step": 7473 + }, + { + "epoch": 1.4387949081983782, + "grad_norm": 3.2573708745019103, + "learning_rate": 3.851141223589805e-06, + "loss": 0.985, + "step": 7474 + }, + { + "epoch": 1.4389874148759536, + "grad_norm": 3.202165768748596, + "learning_rate": 3.848682699007259e-06, + "loss": 1.0116, + "step": 7475 + }, + { + "epoch": 1.4391799215535288, + "grad_norm": 3.3032648931040574, + "learning_rate": 3.846224772411454e-06, + "loss": 1.0396, + "step": 7476 + }, + { + "epoch": 1.4393724282311042, + "grad_norm": 3.2751949325642595, + "learning_rate": 3.8437674440413306e-06, + "loss": 1.0157, + "step": 7477 + }, + { + "epoch": 1.4395649349086796, + "grad_norm": 3.219988408009005, + "learning_rate": 3.8413107141357765e-06, + "loss": 0.9703, + "step": 7478 + }, + { + "epoch": 1.439757441586255, + "grad_norm": 3.1296905556426107, + "learning_rate": 3.838854582933605e-06, + "loss": 0.9741, + "step": 7479 + }, + { + "epoch": 1.4399499482638305, + "grad_norm": 3.2371632550965828, + "learning_rate": 3.836399050673601e-06, + "loss": 0.9632, + "step": 7480 + }, + { + "epoch": 1.4399499482638305, + "lm_loss": 0.7624, + "step": 7480, + "vm_loss": 0.1566 + }, + { + "epoch": 1.4399499482638305, + "lm_loss": 0.7381, + "step": 7480, + "vm_loss": 0.196 + }, + { + "epoch": 1.4399499482638305, + "lm_loss": 0.9242, + "step": 7480, + "vm_loss": 0.1823 + }, + { + "epoch": 1.4399499482638305, + "lm_loss": 0.647, + "step": 7480, + "vm_loss": 0.233 + }, + { + "epoch": 1.4399499482638305, + "lm_loss": 0.9376, + "step": 7480, + "vm_loss": 0.1264 + }, + { + "epoch": 1.4399499482638305, + "lm_loss": 0.8964, + "step": 7480, + "vm_loss": 0.16 + }, + { + "epoch": 1.4399499482638305, + "lm_loss": 0.8888, + "step": 7480, + "vm_loss": 0.1603 + }, + { + "epoch": 1.4399499482638305, + "lm_loss": 0.9, + "step": 7480, + "vm_loss": 0.1904 + }, + { + "epoch": 1.4401424549414057, + "grad_norm": 3.2498923138714635, + "learning_rate": 3.833944117594462e-06, + "loss": 0.9978, + "step": 7481 + }, + { + "epoch": 1.4403349616189811, + "grad_norm": 3.262256771287942, + "learning_rate": 3.831489783934843e-06, + "loss": 0.9932, + "step": 7482 + }, + { + "epoch": 1.4405274682965565, + "grad_norm": 3.4630054564912576, + "learning_rate": 3.829036049933329e-06, + "loss": 1.0508, + "step": 7483 + }, + { + "epoch": 1.440719974974132, + "grad_norm": 3.4278737502812477, + "learning_rate": 3.826582915828468e-06, + "loss": 1.0402, + "step": 7484 + }, + { + "epoch": 1.4409124816517074, + "grad_norm": 3.372472709575327, + "learning_rate": 3.824130381858725e-06, + "loss": 1.0717, + "step": 7485 + }, + { + "epoch": 1.4411049883292826, + "grad_norm": 3.1609286457287773, + "learning_rate": 3.821678448262522e-06, + "loss": 0.9934, + "step": 7486 + }, + { + "epoch": 1.441297495006858, + "grad_norm": 3.146847478090736, + "learning_rate": 3.8192271152782164e-06, + "loss": 0.962, + "step": 7487 + }, + { + "epoch": 1.4414900016844334, + "grad_norm": 3.296196532032015, + "learning_rate": 3.816776383144111e-06, + "loss": 1.0363, + "step": 7488 + }, + { + "epoch": 1.4414900016844334, + "lm_loss": 0.7895, + "step": 7488, + "vm_loss": 0.2194 + }, + { + "epoch": 1.4414900016844334, + "lm_loss": 0.7976, + "step": 7488, + "vm_loss": 0.1999 + }, + { + "epoch": 1.4414900016844334, + "lm_loss": 1.1302, + "step": 7488, + "vm_loss": 0.1248 + }, + { + "epoch": 1.4414900016844334, + "lm_loss": 0.7889, + "step": 7488, + "vm_loss": 0.1513 + }, + { + "epoch": 1.4414900016844334, + "lm_loss": 0.9545, + "step": 7488, + "vm_loss": 0.1431 + }, + { + "epoch": 1.4414900016844334, + "lm_loss": 0.4963, + "step": 7488, + "vm_loss": 0.2143 + }, + { + "epoch": 1.4414900016844334, + "lm_loss": 1.1055, + "step": 7488, + "vm_loss": 0.1831 + }, + { + "epoch": 1.4414900016844334, + "lm_loss": 0.9651, + "step": 7488, + "vm_loss": 0.1411 + }, + { + "epoch": 1.4416825083620088, + "grad_norm": 3.3643801054859632, + "learning_rate": 3.814326252098447e-06, + "loss": 1.0321, + "step": 7489 + }, + { + "epoch": 1.4418750150395843, + "grad_norm": 3.226943747711484, + "learning_rate": 3.8118767223794105e-06, + "loss": 0.9387, + "step": 7490 + }, + { + "epoch": 1.4420675217171595, + "grad_norm": 3.1776805066055407, + "learning_rate": 3.809427794225128e-06, + "loss": 0.9575, + "step": 7491 + }, + { + "epoch": 1.4422600283947349, + "grad_norm": 3.210129592832735, + "learning_rate": 3.8069794678736614e-06, + "loss": 0.9619, + "step": 7492 + }, + { + "epoch": 1.4424525350723103, + "grad_norm": 3.260087344804152, + "learning_rate": 3.8045317435630226e-06, + "loss": 0.9999, + "step": 7493 + }, + { + "epoch": 1.4426450417498857, + "grad_norm": 3.237644060631388, + "learning_rate": 3.8020846215311612e-06, + "loss": 1.0142, + "step": 7494 + }, + { + "epoch": 1.4428375484274611, + "grad_norm": 3.2343796105376588, + "learning_rate": 3.7996381020159733e-06, + "loss": 0.9763, + "step": 7495 + }, + { + "epoch": 1.4430300551050363, + "grad_norm": 3.2320392764616517, + "learning_rate": 3.7971921852552807e-06, + "loss": 0.9766, + "step": 7496 + }, + { + "epoch": 1.4430300551050363, + "lm_loss": 0.3994, + "step": 7496, + "vm_loss": 0.1647 + }, + { + "epoch": 1.4430300551050363, + "lm_loss": 0.7851, + "step": 7496, + "vm_loss": 0.1632 + }, + { + "epoch": 1.4430300551050363, + "lm_loss": 1.002, + "step": 7496, + "vm_loss": 0.1783 + }, + { + "epoch": 1.4430300551050363, + "lm_loss": 0.7841, + "step": 7496, + "vm_loss": 0.2111 + }, + { + "epoch": 1.4430300551050363, + "lm_loss": 0.7084, + "step": 7496, + "vm_loss": 0.1516 + }, + { + "epoch": 1.4430300551050363, + "lm_loss": 1.0134, + "step": 7496, + "vm_loss": 0.1345 + }, + { + "epoch": 1.4430300551050363, + "lm_loss": 0.737, + "step": 7496, + "vm_loss": 0.12 + }, + { + "epoch": 1.4430300551050363, + "lm_loss": 0.745, + "step": 7496, + "vm_loss": 0.2035 + }, + { + "epoch": 1.4432225617826118, + "grad_norm": 3.3132321566950442, + "learning_rate": 3.794746871486871e-06, + "loss": 1.0172, + "step": 7497 + }, + { + "epoch": 1.4434150684601872, + "grad_norm": 3.2689003122046763, + "learning_rate": 3.79230216094845e-06, + "loss": 0.9648, + "step": 7498 + }, + { + "epoch": 1.4436075751377626, + "grad_norm": 3.2688826987465878, + "learning_rate": 3.789858053877683e-06, + "loss": 0.9893, + "step": 7499 + }, + { + "epoch": 1.443800081815338, + "grad_norm": 3.3077582825469998, + "learning_rate": 3.7874145505121563e-06, + "loss": 1.0048, + "step": 7500 + }, + { + "epoch": 1.4439925884929132, + "grad_norm": 3.2895767499573365, + "learning_rate": 3.784971651089425e-06, + "loss": 1.017, + "step": 7501 + }, + { + "epoch": 1.4441850951704887, + "grad_norm": 3.2483683485805295, + "learning_rate": 3.782529355846959e-06, + "loss": 0.974, + "step": 7502 + }, + { + "epoch": 1.444377601848064, + "grad_norm": 3.3250399042741337, + "learning_rate": 3.780087665022184e-06, + "loss": 1.0322, + "step": 7503 + }, + { + "epoch": 1.4445701085256395, + "grad_norm": 3.089754405345806, + "learning_rate": 3.777646578852464e-06, + "loss": 0.9597, + "step": 7504 + }, + { + "epoch": 1.4445701085256395, + "lm_loss": 0.9068, + "step": 7504, + "vm_loss": 0.1563 + }, + { + "epoch": 1.4445701085256395, + "lm_loss": 0.6463, + "step": 7504, + "vm_loss": 0.2443 + }, + { + "epoch": 1.4445701085256395, + "lm_loss": 0.8423, + "step": 7504, + "vm_loss": 0.1493 + }, + { + "epoch": 1.4445701085256395, + "lm_loss": 0.7481, + "step": 7504, + "vm_loss": 0.229 + }, + { + "epoch": 1.4445701085256395, + "lm_loss": 0.8449, + "step": 7504, + "vm_loss": 0.1726 + }, + { + "epoch": 1.4445701085256395, + "lm_loss": 1.4216, + "step": 7504, + "vm_loss": 0.1722 + }, + { + "epoch": 1.4445701085256395, + "lm_loss": 0.6287, + "step": 7504, + "vm_loss": 0.2004 + }, + { + "epoch": 1.4445701085256395, + "lm_loss": 1.1786, + "step": 7504, + "vm_loss": 0.2206 + }, + { + "epoch": 1.444762615203215, + "grad_norm": 3.2833387844393664, + "learning_rate": 3.775206097575108e-06, + "loss": 1.0412, + "step": 7505 + }, + { + "epoch": 1.4449551218807901, + "grad_norm": 3.324964519919435, + "learning_rate": 3.7727662214273496e-06, + "loss": 1.023, + "step": 7506 + }, + { + "epoch": 1.4451476285583655, + "grad_norm": 3.2779973115036776, + "learning_rate": 3.7703269506463914e-06, + "loss": 1.0081, + "step": 7507 + }, + { + "epoch": 1.445340135235941, + "grad_norm": 3.1481965703019603, + "learning_rate": 3.767888285469351e-06, + "loss": 0.9619, + "step": 7508 + }, + { + "epoch": 1.4455326419135164, + "grad_norm": 3.2253701021785544, + "learning_rate": 3.7654502261333017e-06, + "loss": 0.9839, + "step": 7509 + }, + { + "epoch": 1.4457251485910918, + "grad_norm": 3.2457753010481505, + "learning_rate": 3.763012772875253e-06, + "loss": 0.9837, + "step": 7510 + }, + { + "epoch": 1.445917655268667, + "grad_norm": 3.445943380043002, + "learning_rate": 3.760575925932157e-06, + "loss": 1.0641, + "step": 7511 + }, + { + "epoch": 1.4461101619462426, + "grad_norm": 3.369873365835705, + "learning_rate": 3.75813968554091e-06, + "loss": 1.0189, + "step": 7512 + }, + { + "epoch": 1.4461101619462426, + "lm_loss": 1.0479, + "step": 7512, + "vm_loss": 0.1675 + }, + { + "epoch": 1.4461101619462426, + "lm_loss": 0.3101, + "step": 7512, + "vm_loss": 0.1831 + }, + { + "epoch": 1.4461101619462426, + "lm_loss": 0.9104, + "step": 7512, + "vm_loss": 0.113 + }, + { + "epoch": 1.4461101619462426, + "lm_loss": 0.9212, + "step": 7512, + "vm_loss": 0.1012 + }, + { + "epoch": 1.4461101619462426, + "lm_loss": 1.2068, + "step": 7512, + "vm_loss": 0.1248 + }, + { + "epoch": 1.4461101619462426, + "lm_loss": 0.8678, + "step": 7512, + "vm_loss": 0.1324 + }, + { + "epoch": 1.4461101619462426, + "lm_loss": 1.1408, + "step": 7512, + "vm_loss": 0.1849 + }, + { + "epoch": 1.4461101619462426, + "lm_loss": 1.1058, + "step": 7512, + "vm_loss": 0.1383 + }, + { + "epoch": 1.4463026686238178, + "grad_norm": 3.3482323034896657, + "learning_rate": 3.755704051938339e-06, + "loss": 1.0005, + "step": 7513 + }, + { + "epoch": 1.4464951753013933, + "grad_norm": 3.1836188944426587, + "learning_rate": 3.7532690253612226e-06, + "loss": 1.0087, + "step": 7514 + }, + { + "epoch": 1.4466876819789687, + "grad_norm": 3.293230993070609, + "learning_rate": 3.7508346060462752e-06, + "loss": 1.0309, + "step": 7515 + }, + { + "epoch": 1.4468801886565439, + "grad_norm": 3.1495133136119797, + "learning_rate": 3.748400794230158e-06, + "loss": 0.9427, + "step": 7516 + }, + { + "epoch": 1.4470726953341195, + "grad_norm": 3.230017133284445, + "learning_rate": 3.7459675901494587e-06, + "loss": 0.9685, + "step": 7517 + }, + { + "epoch": 1.4472652020116947, + "grad_norm": 3.328659724741726, + "learning_rate": 3.7435349940407274e-06, + "loss": 1.0522, + "step": 7518 + }, + { + "epoch": 1.4474577086892702, + "grad_norm": 3.1261749518043116, + "learning_rate": 3.741103006140436e-06, + "loss": 0.9525, + "step": 7519 + }, + { + "epoch": 1.4476502153668456, + "grad_norm": 3.271125639666758, + "learning_rate": 3.7386716266850066e-06, + "loss": 0.9951, + "step": 7520 + }, + { + "epoch": 1.4476502153668456, + "lm_loss": 0.4959, + "step": 7520, + "vm_loss": 0.1869 + }, + { + "epoch": 1.4476502153668456, + "lm_loss": 0.8442, + "step": 7520, + "vm_loss": 0.1457 + }, + { + "epoch": 1.4476502153668456, + "lm_loss": 0.7844, + "step": 7520, + "vm_loss": 0.1622 + }, + { + "epoch": 1.4476502153668456, + "lm_loss": 1.0791, + "step": 7520, + "vm_loss": 0.2007 + }, + { + "epoch": 1.4476502153668456, + "lm_loss": 1.194, + "step": 7520, + "vm_loss": 0.1446 + }, + { + "epoch": 1.4476502153668456, + "lm_loss": 0.8037, + "step": 7520, + "vm_loss": 0.1237 + }, + { + "epoch": 1.4476502153668456, + "lm_loss": 0.8036, + "step": 7520, + "vm_loss": 0.1968 + }, + { + "epoch": 1.4476502153668456, + "lm_loss": 0.8202, + "step": 7520, + "vm_loss": 0.1452 + }, + { + "epoch": 1.447842722044421, + "grad_norm": 3.306374288929964, + "learning_rate": 3.736240855910801e-06, + "loss": 1.0315, + "step": 7521 + }, + { + "epoch": 1.4480352287219964, + "grad_norm": 3.2919471301750813, + "learning_rate": 3.733810694054124e-06, + "loss": 1.0287, + "step": 7522 + }, + { + "epoch": 1.4482277353995716, + "grad_norm": 3.27774068438974, + "learning_rate": 3.7313811413512134e-06, + "loss": 0.9701, + "step": 7523 + }, + { + "epoch": 1.448420242077147, + "grad_norm": 3.1863582032918623, + "learning_rate": 3.7289521980382535e-06, + "loss": 0.924, + "step": 7524 + }, + { + "epoch": 1.4486127487547225, + "grad_norm": 3.566581678943995, + "learning_rate": 3.7265238643513713e-06, + "loss": 1.0628, + "step": 7525 + }, + { + "epoch": 1.4488052554322979, + "grad_norm": 3.1991363347929767, + "learning_rate": 3.724096140526633e-06, + "loss": 0.9763, + "step": 7526 + }, + { + "epoch": 1.4489977621098733, + "grad_norm": 3.3299171200360282, + "learning_rate": 3.7216690268000357e-06, + "loss": 1.0277, + "step": 7527 + }, + { + "epoch": 1.4491902687874485, + "grad_norm": 3.1947564410213483, + "learning_rate": 3.719242523407539e-06, + "loss": 0.9573, + "step": 7528 + }, + { + "epoch": 1.4491902687874485, + "lm_loss": 1.2342, + "step": 7528, + "vm_loss": 0.1783 + }, + { + "epoch": 1.4491902687874485, + "lm_loss": 0.798, + "step": 7528, + "vm_loss": 0.1366 + }, + { + "epoch": 1.4491902687874485, + "lm_loss": 0.426, + "step": 7528, + "vm_loss": 0.1083 + }, + { + "epoch": 1.4491902687874485, + "lm_loss": 0.9921, + "step": 7528, + "vm_loss": 0.2058 + }, + { + "epoch": 1.4491902687874485, + "lm_loss": 0.4276, + "step": 7528, + "vm_loss": 0.1405 + }, + { + "epoch": 1.4491902687874485, + "lm_loss": 0.9316, + "step": 7528, + "vm_loss": 0.1577 + }, + { + "epoch": 1.4491902687874485, + "lm_loss": 0.8508, + "step": 7528, + "vm_loss": 0.2739 + }, + { + "epoch": 1.4491902687874485, + "lm_loss": 0.9222, + "step": 7528, + "vm_loss": 0.1468 + }, + { + "epoch": 1.449382775465024, + "grad_norm": 3.2972334841729443, + "learning_rate": 3.7168166305850208e-06, + "loss": 0.9857, + "step": 7529 + }, + { + "epoch": 1.4495752821425993, + "grad_norm": 3.488950617590216, + "learning_rate": 3.7143913485683103e-06, + "loss": 1.0115, + "step": 7530 + }, + { + "epoch": 1.4497677888201748, + "grad_norm": 3.2191520296394294, + "learning_rate": 3.711966677593177e-06, + "loss": 0.9946, + "step": 7531 + }, + { + "epoch": 1.4499602954977502, + "grad_norm": 3.1475797537407226, + "learning_rate": 3.709542617895332e-06, + "loss": 0.9756, + "step": 7532 + }, + { + "epoch": 1.4501528021753254, + "grad_norm": 2.9694111380851993, + "learning_rate": 3.7071191697104248e-06, + "loss": 0.9059, + "step": 7533 + }, + { + "epoch": 1.4503453088529008, + "grad_norm": 3.277830864971584, + "learning_rate": 3.704696333274037e-06, + "loss": 0.9884, + "step": 7534 + }, + { + "epoch": 1.4505378155304762, + "grad_norm": 3.3678668580433313, + "learning_rate": 3.702274108821713e-06, + "loss": 1.0209, + "step": 7535 + }, + { + "epoch": 1.4507303222080516, + "grad_norm": 3.1660710396110043, + "learning_rate": 3.6998524965889136e-06, + "loss": 0.9524, + "step": 7536 + }, + { + "epoch": 1.4507303222080516, + "lm_loss": 1.076, + "step": 7536, + "vm_loss": 0.1322 + }, + { + "epoch": 1.4507303222080516, + "lm_loss": 0.4982, + "step": 7536, + "vm_loss": 0.1206 + }, + { + "epoch": 1.4507303222080516, + "lm_loss": 0.5622, + "step": 7536, + "vm_loss": 0.1205 + }, + { + "epoch": 1.4507303222080516, + "lm_loss": 0.847, + "step": 7536, + "vm_loss": 0.1401 + }, + { + "epoch": 1.4507303222080516, + "lm_loss": 0.8279, + "step": 7536, + "vm_loss": 0.1549 + }, + { + "epoch": 1.4507303222080516, + "lm_loss": 0.7906, + "step": 7536, + "vm_loss": 0.2187 + }, + { + "epoch": 1.4507303222080516, + "lm_loss": 0.5357, + "step": 7536, + "vm_loss": 0.1718 + }, + { + "epoch": 1.4507303222080516, + "lm_loss": 0.5858, + "step": 7536, + "vm_loss": 0.1212 + }, + { + "epoch": 1.450922828885627, + "grad_norm": 3.1028320177361417, + "learning_rate": 3.6974314968110547e-06, + "loss": 0.9129, + "step": 7537 + }, + { + "epoch": 1.4511153355632023, + "grad_norm": 3.2609419564341526, + "learning_rate": 3.6950111097234866e-06, + "loss": 0.9689, + "step": 7538 + }, + { + "epoch": 1.4513078422407777, + "grad_norm": 3.289604171766757, + "learning_rate": 3.6925913355615075e-06, + "loss": 0.9961, + "step": 7539 + }, + { + "epoch": 1.4515003489183531, + "grad_norm": 3.1669610502451655, + "learning_rate": 3.6901721745603414e-06, + "loss": 0.9502, + "step": 7540 + }, + { + "epoch": 1.4516928555959285, + "grad_norm": 3.495465152657839, + "learning_rate": 3.687753626955167e-06, + "loss": 1.0622, + "step": 7541 + }, + { + "epoch": 1.451885362273504, + "grad_norm": 3.289462938968708, + "learning_rate": 3.6853356929810966e-06, + "loss": 0.9704, + "step": 7542 + }, + { + "epoch": 1.4520778689510792, + "grad_norm": 3.260639672356218, + "learning_rate": 3.68291837287319e-06, + "loss": 0.9654, + "step": 7543 + }, + { + "epoch": 1.4522703756286546, + "grad_norm": 3.370648975426911, + "learning_rate": 3.6805016668664284e-06, + "loss": 1.0135, + "step": 7544 + }, + { + "epoch": 1.4522703756286546, + "lm_loss": 0.5624, + "step": 7544, + "vm_loss": 0.18 + }, + { + "epoch": 1.4522703756286546, + "lm_loss": 0.6554, + "step": 7544, + "vm_loss": 0.1835 + }, + { + "epoch": 1.4522703756286546, + "lm_loss": 0.9162, + "step": 7544, + "vm_loss": 0.2429 + }, + { + "epoch": 1.4522703756286546, + "lm_loss": 0.8813, + "step": 7544, + "vm_loss": 0.1377 + }, + { + "epoch": 1.4522703756286546, + "lm_loss": 0.4676, + "step": 7544, + "vm_loss": 0.2293 + }, + { + "epoch": 1.4522703756286546, + "lm_loss": 0.4517, + "step": 7544, + "vm_loss": 0.1447 + }, + { + "epoch": 1.4522703756286546, + "lm_loss": 0.786, + "step": 7544, + "vm_loss": 0.1597 + }, + { + "epoch": 1.4522703756286546, + "lm_loss": 0.865, + "step": 7544, + "vm_loss": 0.1491 + }, + { + "epoch": 1.45246288230623, + "grad_norm": 3.207943675242393, + "learning_rate": 3.678085575195761e-06, + "loss": 1.0242, + "step": 7545 + }, + { + "epoch": 1.4526553889838054, + "grad_norm": 3.33134265411261, + "learning_rate": 3.6756700980960548e-06, + "loss": 0.9791, + "step": 7546 + }, + { + "epoch": 1.4528478956613808, + "grad_norm": 3.3400801629272605, + "learning_rate": 3.6732552358021258e-06, + "loss": 0.9816, + "step": 7547 + }, + { + "epoch": 1.453040402338956, + "grad_norm": 3.340912625850119, + "learning_rate": 3.6708409885487305e-06, + "loss": 1.0513, + "step": 7548 + }, + { + "epoch": 1.4532329090165315, + "grad_norm": 3.3629712138048617, + "learning_rate": 3.6684273565705686e-06, + "loss": 1.0565, + "step": 7549 + }, + { + "epoch": 1.4534254156941069, + "grad_norm": 3.2395413896837746, + "learning_rate": 3.6660143401022687e-06, + "loss": 1.0295, + "step": 7550 + }, + { + "epoch": 1.4536179223716823, + "grad_norm": 3.122951720208705, + "learning_rate": 3.6636019393784093e-06, + "loss": 0.977, + "step": 7551 + }, + { + "epoch": 1.4538104290492577, + "grad_norm": 3.194321558893384, + "learning_rate": 3.6611901546335093e-06, + "loss": 0.9901, + "step": 7552 + }, + { + "epoch": 1.4538104290492577, + "lm_loss": 1.2886, + "step": 7552, + "vm_loss": 0.1748 + }, + { + "epoch": 1.4538104290492577, + "lm_loss": 0.8625, + "step": 7552, + "vm_loss": 0.1085 + }, + { + "epoch": 1.4538104290492577, + "lm_loss": 0.8445, + "step": 7552, + "vm_loss": 0.137 + }, + { + "epoch": 1.4538104290492577, + "lm_loss": 0.8414, + "step": 7552, + "vm_loss": 0.1448 + }, + { + "epoch": 1.4538104290492577, + "lm_loss": 0.8963, + "step": 7552, + "vm_loss": 0.252 + }, + { + "epoch": 1.4538104290492577, + "lm_loss": 1.0389, + "step": 7552, + "vm_loss": 0.1895 + }, + { + "epoch": 1.4538104290492577, + "lm_loss": 1.2937, + "step": 7552, + "vm_loss": 0.1785 + }, + { + "epoch": 1.4538104290492577, + "lm_loss": 0.7024, + "step": 7552, + "vm_loss": 0.1378 + }, + { + "epoch": 1.454002935726833, + "grad_norm": 3.394555528487962, + "learning_rate": 3.658778986102022e-06, + "loss": 1.0609, + "step": 7553 + }, + { + "epoch": 1.4541954424044083, + "grad_norm": 3.2455965484785323, + "learning_rate": 3.6563684340183458e-06, + "loss": 0.9662, + "step": 7554 + }, + { + "epoch": 1.4543879490819838, + "grad_norm": 3.1950471856719442, + "learning_rate": 3.6539584986168163e-06, + "loss": 0.9818, + "step": 7555 + }, + { + "epoch": 1.4545804557595592, + "grad_norm": 3.164321591551787, + "learning_rate": 3.651549180131714e-06, + "loss": 0.9722, + "step": 7556 + }, + { + "epoch": 1.4547729624371346, + "grad_norm": 3.1991018657563624, + "learning_rate": 3.6491404787972486e-06, + "loss": 0.9484, + "step": 7557 + }, + { + "epoch": 1.4549654691147098, + "grad_norm": 3.4178083616927935, + "learning_rate": 3.6467323948475786e-06, + "loss": 1.0199, + "step": 7558 + }, + { + "epoch": 1.4551579757922852, + "grad_norm": 3.3827364975113787, + "learning_rate": 3.6443249285168025e-06, + "loss": 1.0097, + "step": 7559 + }, + { + "epoch": 1.4553504824698607, + "grad_norm": 3.0954711936454777, + "learning_rate": 3.6419180800389607e-06, + "loss": 0.9177, + "step": 7560 + }, + { + "epoch": 1.4553504824698607, + "lm_loss": 0.8161, + "step": 7560, + "vm_loss": 0.1324 + }, + { + "epoch": 1.4553504824698607, + "lm_loss": 1.4125, + "step": 7560, + "vm_loss": 0.1713 + }, + { + "epoch": 1.4553504824698607, + "lm_loss": 0.899, + "step": 7560, + "vm_loss": 0.2148 + }, + { + "epoch": 1.4553504824698607, + "lm_loss": 0.5899, + "step": 7560, + "vm_loss": 0.1955 + }, + { + "epoch": 1.4553504824698607, + "lm_loss": 0.8906, + "step": 7560, + "vm_loss": 0.1822 + }, + { + "epoch": 1.4553504824698607, + "lm_loss": 0.8449, + "step": 7560, + "vm_loss": 0.1947 + }, + { + "epoch": 1.4553504824698607, + "lm_loss": 0.6252, + "step": 7560, + "vm_loss": 0.132 + }, + { + "epoch": 1.4553504824698607, + "lm_loss": 0.5667, + "step": 7560, + "vm_loss": 0.1508 + }, + { + "epoch": 1.455542989147436, + "grad_norm": 3.4012657303298885, + "learning_rate": 3.6395118496480176e-06, + "loss": 1.0099, + "step": 7561 + }, + { + "epoch": 1.4557354958250115, + "grad_norm": 3.1965460318289134, + "learning_rate": 3.637106237577903e-06, + "loss": 0.9137, + "step": 7562 + }, + { + "epoch": 1.4559280025025867, + "grad_norm": 3.156106085034886, + "learning_rate": 3.6347012440624653e-06, + "loss": 0.9592, + "step": 7563 + }, + { + "epoch": 1.4561205091801621, + "grad_norm": 3.1892515772761842, + "learning_rate": 3.6322968693355008e-06, + "loss": 0.9335, + "step": 7564 + }, + { + "epoch": 1.4563130158577375, + "grad_norm": 3.336148332296915, + "learning_rate": 3.629893113630749e-06, + "loss": 0.9825, + "step": 7565 + }, + { + "epoch": 1.456505522535313, + "grad_norm": 3.250439218883212, + "learning_rate": 3.627489977181887e-06, + "loss": 0.9767, + "step": 7566 + }, + { + "epoch": 1.4566980292128884, + "grad_norm": 3.1578545775510825, + "learning_rate": 3.625087460222524e-06, + "loss": 0.9337, + "step": 7567 + }, + { + "epoch": 1.4568905358904636, + "grad_norm": 3.3216048226693635, + "learning_rate": 3.622685562986219e-06, + "loss": 0.9791, + "step": 7568 + }, + { + "epoch": 1.4568905358904636, + "lm_loss": 0.7149, + "step": 7568, + "vm_loss": 0.1539 + }, + { + "epoch": 1.4568905358904636, + "lm_loss": 0.874, + "step": 7568, + "vm_loss": 0.1443 + }, + { + "epoch": 1.4568905358904636, + "lm_loss": 1.0607, + "step": 7568, + "vm_loss": 0.2261 + }, + { + "epoch": 1.4568905358904636, + "lm_loss": 0.9421, + "step": 7568, + "vm_loss": 0.2124 + }, + { + "epoch": 1.4568905358904636, + "lm_loss": 0.6371, + "step": 7568, + "vm_loss": 0.1119 + }, + { + "epoch": 1.4568905358904636, + "lm_loss": 0.9167, + "step": 7568, + "vm_loss": 0.2672 + }, + { + "epoch": 1.4568905358904636, + "lm_loss": 0.7039, + "step": 7568, + "vm_loss": 0.2106 + }, + { + "epoch": 1.4568905358904636, + "lm_loss": 0.5965, + "step": 7568, + "vm_loss": 0.1604 + }, + { + "epoch": 1.457083042568039, + "grad_norm": 3.1967627215085153, + "learning_rate": 3.6202842857064657e-06, + "loss": 0.9753, + "step": 7569 + }, + { + "epoch": 1.4572755492456144, + "grad_norm": 3.0597970150074993, + "learning_rate": 3.6178836286167052e-06, + "loss": 0.9412, + "step": 7570 + }, + { + "epoch": 1.4574680559231898, + "grad_norm": 3.1929048121324164, + "learning_rate": 3.6154835919502996e-06, + "loss": 0.9467, + "step": 7571 + }, + { + "epoch": 1.4576605626007653, + "grad_norm": 3.159884695980742, + "learning_rate": 3.6130841759405776e-06, + "loss": 0.9349, + "step": 7572 + }, + { + "epoch": 1.4578530692783405, + "grad_norm": 3.3245797728228084, + "learning_rate": 3.6106853808207833e-06, + "loss": 0.9987, + "step": 7573 + }, + { + "epoch": 1.458045575955916, + "grad_norm": 3.1664880575091017, + "learning_rate": 3.6082872068241124e-06, + "loss": 0.9684, + "step": 7574 + }, + { + "epoch": 1.4582380826334913, + "grad_norm": 3.1596500132994243, + "learning_rate": 3.6058896541836986e-06, + "loss": 0.9627, + "step": 7575 + }, + { + "epoch": 1.4584305893110667, + "grad_norm": 3.21400726079437, + "learning_rate": 3.6034927231326156e-06, + "loss": 0.9491, + "step": 7576 + }, + { + "epoch": 1.4584305893110667, + "lm_loss": 0.7674, + "step": 7576, + "vm_loss": 0.1198 + }, + { + "epoch": 1.4584305893110667, + "lm_loss": 0.784, + "step": 7576, + "vm_loss": 0.2127 + }, + { + "epoch": 1.4584305893110667, + "lm_loss": 1.1716, + "step": 7576, + "vm_loss": 0.0885 + }, + { + "epoch": 1.4584305893110667, + "lm_loss": 1.0539, + "step": 7576, + "vm_loss": 0.1306 + }, + { + "epoch": 1.4584305893110667, + "lm_loss": 1.1292, + "step": 7576, + "vm_loss": 0.1832 + }, + { + "epoch": 1.4584305893110667, + "lm_loss": 0.6197, + "step": 7576, + "vm_loss": 0.2005 + }, + { + "epoch": 1.4584305893110667, + "lm_loss": 0.5794, + "step": 7576, + "vm_loss": 0.208 + }, + { + "epoch": 1.4584305893110667, + "lm_loss": 1.299, + "step": 7576, + "vm_loss": 0.1638 + }, + { + "epoch": 1.4586230959886421, + "grad_norm": 3.303567985812882, + "learning_rate": 3.6010964139038783e-06, + "loss": 0.9938, + "step": 7577 + }, + { + "epoch": 1.4588156026662173, + "grad_norm": 3.2887744292475727, + "learning_rate": 3.59870072673043e-06, + "loss": 0.9542, + "step": 7578 + }, + { + "epoch": 1.459008109343793, + "grad_norm": 3.392341811907046, + "learning_rate": 3.596305661845172e-06, + "loss": 1.0073, + "step": 7579 + }, + { + "epoch": 1.4592006160213682, + "grad_norm": 3.4059394382802135, + "learning_rate": 3.593911219480929e-06, + "loss": 1.0078, + "step": 7580 + }, + { + "epoch": 1.4593931226989436, + "grad_norm": 3.325909839367532, + "learning_rate": 3.5915173998704733e-06, + "loss": 0.9616, + "step": 7581 + }, + { + "epoch": 1.459585629376519, + "grad_norm": 3.341954948429275, + "learning_rate": 3.5891242032465144e-06, + "loss": 0.9626, + "step": 7582 + }, + { + "epoch": 1.4597781360540945, + "grad_norm": 3.1673300061184744, + "learning_rate": 3.5867316298417053e-06, + "loss": 0.9053, + "step": 7583 + }, + { + "epoch": 1.4599706427316699, + "grad_norm": 3.463879385629071, + "learning_rate": 3.5843396798886275e-06, + "loss": 1.0202, + "step": 7584 + }, + { + "epoch": 1.4599706427316699, + "lm_loss": 0.5899, + "step": 7584, + "vm_loss": 0.1 + }, + { + "epoch": 1.4599706427316699, + "lm_loss": 0.7933, + "step": 7584, + "vm_loss": 0.1282 + }, + { + "epoch": 1.4599706427316699, + "lm_loss": 0.5765, + "step": 7584, + "vm_loss": 0.1828 + }, + { + "epoch": 1.4599706427316699, + "lm_loss": 1.3014, + "step": 7584, + "vm_loss": 0.129 + }, + { + "epoch": 1.4599706427316699, + "lm_loss": 0.5129, + "step": 7584, + "vm_loss": 0.1611 + }, + { + "epoch": 1.4599706427316699, + "lm_loss": 0.5223, + "step": 7584, + "vm_loss": 0.1846 + }, + { + "epoch": 1.4599706427316699, + "lm_loss": 0.6431, + "step": 7584, + "vm_loss": 0.1786 + }, + { + "epoch": 1.4599706427316699, + "lm_loss": 0.7816, + "step": 7584, + "vm_loss": 0.1926 + }, + { + "epoch": 1.460163149409245, + "grad_norm": 3.2615385854672314, + "learning_rate": 3.581948353619814e-06, + "loss": 0.9483, + "step": 7585 + }, + { + "epoch": 1.4603556560868205, + "grad_norm": 3.3058767381400593, + "learning_rate": 3.579557651267731e-06, + "loss": 0.9906, + "step": 7586 + }, + { + "epoch": 1.460548162764396, + "grad_norm": 3.2836430538495707, + "learning_rate": 3.5771675730647894e-06, + "loss": 0.9855, + "step": 7587 + }, + { + "epoch": 1.4607406694419713, + "grad_norm": 3.1257355895561467, + "learning_rate": 3.574778119243325e-06, + "loss": 0.9053, + "step": 7588 + }, + { + "epoch": 1.4609331761195468, + "grad_norm": 3.515096850020925, + "learning_rate": 3.572389290035636e-06, + "loss": 1.0862, + "step": 7589 + }, + { + "epoch": 1.461125682797122, + "grad_norm": 3.321891360331378, + "learning_rate": 3.5700010856739385e-06, + "loss": 0.9918, + "step": 7590 + }, + { + "epoch": 1.4613181894746974, + "grad_norm": 3.29240862255625, + "learning_rate": 3.567613506390399e-06, + "loss": 0.9889, + "step": 7591 + }, + { + "epoch": 1.4615106961522728, + "grad_norm": 3.2835252301405493, + "learning_rate": 3.5652265524171205e-06, + "loss": 0.9922, + "step": 7592 + }, + { + "epoch": 1.4615106961522728, + "lm_loss": 0.5149, + "step": 7592, + "vm_loss": 0.1547 + }, + { + "epoch": 1.4615106961522728, + "lm_loss": 0.9553, + "step": 7592, + "vm_loss": 0.162 + }, + { + "epoch": 1.4615106961522728, + "lm_loss": 0.5854, + "step": 7592, + "vm_loss": 0.1739 + }, + { + "epoch": 1.4615106961522728, + "lm_loss": 1.1794, + "step": 7592, + "vm_loss": 0.1874 + }, + { + "epoch": 1.4615106961522728, + "lm_loss": 1.0496, + "step": 7592, + "vm_loss": 0.2439 + }, + { + "epoch": 1.4615106961522728, + "lm_loss": 0.7187, + "step": 7592, + "vm_loss": 0.1318 + }, + { + "epoch": 1.4615106961522728, + "lm_loss": 0.8927, + "step": 7592, + "vm_loss": 0.2793 + }, + { + "epoch": 1.4615106961522728, + "lm_loss": 0.6064, + "step": 7592, + "vm_loss": 0.1653 + }, + { + "epoch": 1.4617032028298482, + "grad_norm": 3.3377264045779764, + "learning_rate": 3.5628402239861504e-06, + "loss": 0.9804, + "step": 7593 + }, + { + "epoch": 1.4618957095074236, + "grad_norm": 3.222452797517878, + "learning_rate": 3.5604545213294616e-06, + "loss": 0.9852, + "step": 7594 + }, + { + "epoch": 1.4620882161849988, + "grad_norm": 3.4280855320824832, + "learning_rate": 3.5580694446789786e-06, + "loss": 1.0191, + "step": 7595 + }, + { + "epoch": 1.4622807228625743, + "grad_norm": 3.284918299654645, + "learning_rate": 3.5556849942665626e-06, + "loss": 0.9974, + "step": 7596 + }, + { + "epoch": 1.4624732295401497, + "grad_norm": 3.3171573473139335, + "learning_rate": 3.5533011703240127e-06, + "loss": 0.9669, + "step": 7597 + }, + { + "epoch": 1.462665736217725, + "grad_norm": 3.3113878570313173, + "learning_rate": 3.550917973083069e-06, + "loss": 0.9983, + "step": 7598 + }, + { + "epoch": 1.4628582428953005, + "grad_norm": 3.1279239343627494, + "learning_rate": 3.5485354027753992e-06, + "loss": 0.9523, + "step": 7599 + }, + { + "epoch": 1.4630507495728757, + "grad_norm": 3.2349066112100706, + "learning_rate": 3.546153459632634e-06, + "loss": 0.9483, + "step": 7600 + }, + { + "epoch": 1.4630507495728757, + "lm_loss": 0.9693, + "step": 7600, + "vm_loss": 0.1988 + }, + { + "epoch": 1.4630507495728757, + "lm_loss": 0.8754, + "step": 7600, + "vm_loss": 0.1469 + }, + { + "epoch": 1.4630507495728757, + "lm_loss": 0.9397, + "step": 7600, + "vm_loss": 0.1385 + }, + { + "epoch": 1.4630507495728757, + "lm_loss": 0.9083, + "step": 7600, + "vm_loss": 0.1129 + }, + { + "epoch": 1.4630507495728757, + "lm_loss": 0.6628, + "step": 7600, + "vm_loss": 0.1603 + }, + { + "epoch": 1.4630507495728757, + "lm_loss": 1.4697, + "step": 7600, + "vm_loss": 0.136 + }, + { + "epoch": 1.4630507495728757, + "lm_loss": 0.8152, + "step": 7600, + "vm_loss": 0.1451 + }, + { + "epoch": 1.4630507495728757, + "lm_loss": 0.9698, + "step": 7600, + "vm_loss": 0.1689 + }, + { + "epoch": 1.4632432562504512, + "grad_norm": 3.2978215667176762, + "learning_rate": 3.543772143886318e-06, + "loss": 0.9641, + "step": 7601 + }, + { + "epoch": 1.4634357629280266, + "grad_norm": 3.1678881781243673, + "learning_rate": 3.5413914557679486e-06, + "loss": 0.9476, + "step": 7602 + }, + { + "epoch": 1.463628269605602, + "grad_norm": 3.080024560280289, + "learning_rate": 3.539011395508959e-06, + "loss": 0.9382, + "step": 7603 + }, + { + "epoch": 1.4638207762831774, + "grad_norm": 3.1301046620278803, + "learning_rate": 3.5366319633407255e-06, + "loss": 0.905, + "step": 7604 + }, + { + "epoch": 1.4640132829607526, + "grad_norm": 3.2358544988563875, + "learning_rate": 3.5342531594945493e-06, + "loss": 0.9543, + "step": 7605 + }, + { + "epoch": 1.464205789638328, + "grad_norm": 3.1965287900257318, + "learning_rate": 3.531874984201694e-06, + "loss": 0.9262, + "step": 7606 + }, + { + "epoch": 1.4643982963159035, + "grad_norm": 3.278284516734381, + "learning_rate": 3.5294974376933377e-06, + "loss": 0.9521, + "step": 7607 + }, + { + "epoch": 1.4645908029934789, + "grad_norm": 3.2343210218478275, + "learning_rate": 3.5271205202006163e-06, + "loss": 0.9504, + "step": 7608 + }, + { + "epoch": 1.4645908029934789, + "lm_loss": 0.6236, + "step": 7608, + "vm_loss": 0.1512 + }, + { + "epoch": 1.4645908029934789, + "lm_loss": 0.497, + "step": 7608, + "vm_loss": 0.1981 + }, + { + "epoch": 1.4645908029934789, + "lm_loss": 0.9825, + "step": 7608, + "vm_loss": 0.1647 + }, + { + "epoch": 1.4645908029934789, + "lm_loss": 0.8292, + "step": 7608, + "vm_loss": 0.1687 + }, + { + "epoch": 1.4645908029934789, + "lm_loss": 1.1448, + "step": 7608, + "vm_loss": 0.1992 + }, + { + "epoch": 1.4645908029934789, + "lm_loss": 0.9142, + "step": 7608, + "vm_loss": 0.1444 + }, + { + "epoch": 1.4645908029934789, + "lm_loss": 1.2976, + "step": 7608, + "vm_loss": 0.095 + }, + { + "epoch": 1.4645908029934789, + "lm_loss": 0.5753, + "step": 7608, + "vm_loss": 0.1146 + }, + { + "epoch": 1.4647833096710543, + "grad_norm": 3.3579104795701893, + "learning_rate": 3.5247442319545853e-06, + "loss": 0.9706, + "step": 7609 + }, + { + "epoch": 1.4649758163486295, + "grad_norm": 3.2936999429028813, + "learning_rate": 3.5223685731862657e-06, + "loss": 0.9457, + "step": 7610 + }, + { + "epoch": 1.465168323026205, + "grad_norm": 3.4391550431519478, + "learning_rate": 3.5199935441265887e-06, + "loss": 1.0147, + "step": 7611 + }, + { + "epoch": 1.4653608297037803, + "grad_norm": 3.344371873851089, + "learning_rate": 3.5176191450064445e-06, + "loss": 0.9753, + "step": 7612 + }, + { + "epoch": 1.4655533363813558, + "grad_norm": 3.3837372903693805, + "learning_rate": 3.5152453760566518e-06, + "loss": 0.9846, + "step": 7613 + }, + { + "epoch": 1.4657458430589312, + "grad_norm": 3.3644683039690526, + "learning_rate": 3.512872237507974e-06, + "loss": 0.9985, + "step": 7614 + }, + { + "epoch": 1.4659383497365064, + "grad_norm": 3.1706162100994115, + "learning_rate": 3.5104997295911115e-06, + "loss": 0.9482, + "step": 7615 + }, + { + "epoch": 1.4661308564140818, + "grad_norm": 3.596283859658559, + "learning_rate": 3.508127852536698e-06, + "loss": 1.0849, + "step": 7616 + }, + { + "epoch": 1.4661308564140818, + "lm_loss": 1.3985, + "step": 7616, + "vm_loss": 0.1106 + }, + { + "epoch": 1.4661308564140818, + "lm_loss": 0.7794, + "step": 7616, + "vm_loss": 0.1819 + }, + { + "epoch": 1.4661308564140818, + "lm_loss": 0.6363, + "step": 7616, + "vm_loss": 0.126 + }, + { + "epoch": 1.4661308564140818, + "lm_loss": 1.1086, + "step": 7616, + "vm_loss": 0.1108 + }, + { + "epoch": 1.4661308564140818, + "lm_loss": 0.6449, + "step": 7616, + "vm_loss": 0.1454 + }, + { + "epoch": 1.4661308564140818, + "lm_loss": 0.6519, + "step": 7616, + "vm_loss": 0.1798 + }, + { + "epoch": 1.4661308564140818, + "lm_loss": 0.8471, + "step": 7616, + "vm_loss": 0.1507 + }, + { + "epoch": 1.4661308564140818, + "lm_loss": 0.6101, + "step": 7616, + "vm_loss": 0.1878 + }, + { + "epoch": 1.4663233630916572, + "grad_norm": 3.365089333801048, + "learning_rate": 3.5057566065753114e-06, + "loss": 1.0188, + "step": 7617 + }, + { + "epoch": 1.4665158697692326, + "grad_norm": 3.3045802593950837, + "learning_rate": 3.5033859919374693e-06, + "loss": 1.032, + "step": 7618 + }, + { + "epoch": 1.466708376446808, + "grad_norm": 3.241155900566468, + "learning_rate": 3.5010160088536238e-06, + "loss": 1.0014, + "step": 7619 + }, + { + "epoch": 1.4669008831243833, + "grad_norm": 3.1511822679533976, + "learning_rate": 3.4986466575541688e-06, + "loss": 0.9604, + "step": 7620 + }, + { + "epoch": 1.4670933898019587, + "grad_norm": 3.1535278090043253, + "learning_rate": 3.4962779382694387e-06, + "loss": 0.9499, + "step": 7621 + }, + { + "epoch": 1.4672858964795341, + "grad_norm": 3.0753821712555545, + "learning_rate": 3.4939098512296933e-06, + "loss": 0.9172, + "step": 7622 + }, + { + "epoch": 1.4674784031571095, + "grad_norm": 3.342031497366783, + "learning_rate": 3.491542396665155e-06, + "loss": 1.0126, + "step": 7623 + }, + { + "epoch": 1.467670909834685, + "grad_norm": 3.1576460273515217, + "learning_rate": 3.4891755748059587e-06, + "loss": 0.9586, + "step": 7624 + }, + { + "epoch": 1.467670909834685, + "lm_loss": 0.8257, + "step": 7624, + "vm_loss": 0.1626 + }, + { + "epoch": 1.467670909834685, + "lm_loss": 0.805, + "step": 7624, + "vm_loss": 0.2393 + }, + { + "epoch": 1.467670909834685, + "lm_loss": 0.5289, + "step": 7624, + "vm_loss": 0.1621 + }, + { + "epoch": 1.467670909834685, + "lm_loss": 0.9875, + "step": 7624, + "vm_loss": 0.1664 + }, + { + "epoch": 1.467670909834685, + "lm_loss": 1.0995, + "step": 7624, + "vm_loss": 0.1553 + }, + { + "epoch": 1.467670909834685, + "lm_loss": 0.4283, + "step": 7624, + "vm_loss": 0.1162 + }, + { + "epoch": 1.467670909834685, + "lm_loss": 0.7987, + "step": 7624, + "vm_loss": 0.1296 + }, + { + "epoch": 1.467670909834685, + "lm_loss": 0.3783, + "step": 7624, + "vm_loss": 0.1415 + }, + { + "epoch": 1.4678634165122602, + "grad_norm": 3.098715100970303, + "learning_rate": 3.4868093858821982e-06, + "loss": 0.9012, + "step": 7625 + }, + { + "epoch": 1.4680559231898356, + "grad_norm": 3.340863438836121, + "learning_rate": 3.484443830123887e-06, + "loss": 1.0073, + "step": 7626 + }, + { + "epoch": 1.468248429867411, + "grad_norm": 3.255337529289207, + "learning_rate": 3.4820789077609997e-06, + "loss": 0.9847, + "step": 7627 + }, + { + "epoch": 1.4684409365449864, + "grad_norm": 3.2620871653989036, + "learning_rate": 3.4797146190234286e-06, + "loss": 1.0, + "step": 7628 + }, + { + "epoch": 1.4686334432225618, + "grad_norm": 3.32782763753346, + "learning_rate": 3.477350964141014e-06, + "loss": 0.9993, + "step": 7629 + }, + { + "epoch": 1.468825949900137, + "grad_norm": 3.453517657302616, + "learning_rate": 3.4749879433435364e-06, + "loss": 0.9681, + "step": 7630 + }, + { + "epoch": 1.4690184565777125, + "grad_norm": 3.314283208859271, + "learning_rate": 3.4726255568607125e-06, + "loss": 0.9771, + "step": 7631 + }, + { + "epoch": 1.4692109632552879, + "grad_norm": 3.3479673159347128, + "learning_rate": 3.4702638049221902e-06, + "loss": 0.9991, + "step": 7632 + }, + { + "epoch": 1.4692109632552879, + "lm_loss": 0.6978, + "step": 7632, + "vm_loss": 0.1323 + }, + { + "epoch": 1.4692109632552879, + "lm_loss": 0.8667, + "step": 7632, + "vm_loss": 0.1402 + }, + { + "epoch": 1.4692109632552879, + "lm_loss": 0.6932, + "step": 7632, + "vm_loss": 0.1741 + }, + { + "epoch": 1.4692109632552879, + "lm_loss": 0.6685, + "step": 7632, + "vm_loss": 0.1283 + }, + { + "epoch": 1.4692109632552879, + "lm_loss": 0.9055, + "step": 7632, + "vm_loss": 0.1055 + }, + { + "epoch": 1.4692109632552879, + "lm_loss": 0.4773, + "step": 7632, + "vm_loss": 0.1527 + }, + { + "epoch": 1.4692109632552879, + "lm_loss": 0.4983, + "step": 7632, + "vm_loss": 0.106 + }, + { + "epoch": 1.4692109632552879, + "lm_loss": 0.8448, + "step": 7632, + "vm_loss": 0.1267 + }, + { + "epoch": 1.4694034699328633, + "grad_norm": 3.399133316300144, + "learning_rate": 3.467902687757567e-06, + "loss": 0.9534, + "step": 7633 + }, + { + "epoch": 1.4695959766104387, + "grad_norm": 3.2852169695509814, + "learning_rate": 3.4655422055963715e-06, + "loss": 0.9406, + "step": 7634 + }, + { + "epoch": 1.469788483288014, + "grad_norm": 3.250399779790608, + "learning_rate": 3.4631823586680747e-06, + "loss": 0.9588, + "step": 7635 + }, + { + "epoch": 1.4699809899655896, + "grad_norm": 3.2291664040015764, + "learning_rate": 3.4608231472020815e-06, + "loss": 0.9892, + "step": 7636 + }, + { + "epoch": 1.4701734966431648, + "grad_norm": 3.2487218721689315, + "learning_rate": 3.4584645714277397e-06, + "loss": 1.028, + "step": 7637 + }, + { + "epoch": 1.4703660033207402, + "grad_norm": 3.147919480674612, + "learning_rate": 3.4561066315743365e-06, + "loss": 0.9763, + "step": 7638 + }, + { + "epoch": 1.4705585099983156, + "grad_norm": 3.148093332918957, + "learning_rate": 3.4537493278710853e-06, + "loss": 0.9365, + "step": 7639 + }, + { + "epoch": 1.4707510166758908, + "grad_norm": 3.1778092093940082, + "learning_rate": 3.4513926605471504e-06, + "loss": 0.9526, + "step": 7640 + }, + { + "epoch": 1.4707510166758908, + "lm_loss": 1.2307, + "step": 7640, + "vm_loss": 0.1674 + }, + { + "epoch": 1.4707510166758908, + "lm_loss": 0.7594, + "step": 7640, + "vm_loss": 0.1934 + }, + { + "epoch": 1.4707510166758908, + "lm_loss": 0.7765, + "step": 7640, + "vm_loss": 0.1479 + }, + { + "epoch": 1.4707510166758908, + "lm_loss": 0.7014, + "step": 7640, + "vm_loss": 0.1736 + }, + { + "epoch": 1.4707510166758908, + "lm_loss": 0.6204, + "step": 7640, + "vm_loss": 0.1461 + }, + { + "epoch": 1.4707510166758908, + "lm_loss": 0.6984, + "step": 7640, + "vm_loss": 0.1736 + }, + { + "epoch": 1.4707510166758908, + "lm_loss": 1.093, + "step": 7640, + "vm_loss": 0.1579 + }, + { + "epoch": 1.4707510166758908, + "lm_loss": 0.6794, + "step": 7640, + "vm_loss": 0.1602 + }, + { + "epoch": 1.4709435233534665, + "grad_norm": 3.106304167228322, + "learning_rate": 3.449036629831629e-06, + "loss": 0.9344, + "step": 7641 + }, + { + "epoch": 1.4711360300310417, + "grad_norm": 3.2077347146833985, + "learning_rate": 3.446681235953564e-06, + "loss": 0.97, + "step": 7642 + }, + { + "epoch": 1.471328536708617, + "grad_norm": 3.3466915520285894, + "learning_rate": 3.4443264791419164e-06, + "loss": 0.9722, + "step": 7643 + }, + { + "epoch": 1.4715210433861925, + "grad_norm": 3.164027558278825, + "learning_rate": 3.4419723596256126e-06, + "loss": 0.9772, + "step": 7644 + }, + { + "epoch": 1.471713550063768, + "grad_norm": 3.234882745388558, + "learning_rate": 3.439618877633495e-06, + "loss": 0.9574, + "step": 7645 + }, + { + "epoch": 1.4719060567413433, + "grad_norm": 3.133643979282079, + "learning_rate": 3.437266033394354e-06, + "loss": 0.934, + "step": 7646 + }, + { + "epoch": 1.4720985634189185, + "grad_norm": 3.540410827258322, + "learning_rate": 3.4349138271369163e-06, + "loss": 1.0199, + "step": 7647 + }, + { + "epoch": 1.472291070096494, + "grad_norm": 3.165299098636344, + "learning_rate": 3.43256225908985e-06, + "loss": 0.9181, + "step": 7648 + }, + { + "epoch": 1.472291070096494, + "lm_loss": 1.2548, + "step": 7648, + "vm_loss": 0.1461 + }, + { + "epoch": 1.472291070096494, + "lm_loss": 1.0031, + "step": 7648, + "vm_loss": 0.1383 + }, + { + "epoch": 1.472291070096494, + "lm_loss": 1.1507, + "step": 7648, + "vm_loss": 0.0744 + }, + { + "epoch": 1.472291070096494, + "lm_loss": 0.1779, + "step": 7648, + "vm_loss": 0.139 + }, + { + "epoch": 1.472291070096494, + "lm_loss": 0.6426, + "step": 7648, + "vm_loss": 0.1449 + }, + { + "epoch": 1.472291070096494, + "lm_loss": 0.5592, + "step": 7648, + "vm_loss": 0.1969 + }, + { + "epoch": 1.472291070096494, + "lm_loss": 1.0454, + "step": 7648, + "vm_loss": 0.1086 + }, + { + "epoch": 1.472291070096494, + "lm_loss": 1.1815, + "step": 7648, + "vm_loss": 0.1597 + }, + { + "epoch": 1.4724835767740694, + "grad_norm": 3.4563873464189956, + "learning_rate": 3.4302113294817505e-06, + "loss": 1.0134, + "step": 7649 + }, + { + "epoch": 1.4726760834516448, + "grad_norm": 3.277669385941886, + "learning_rate": 3.4278610385411615e-06, + "loss": 0.9396, + "step": 7650 + }, + { + "epoch": 1.4728685901292202, + "grad_norm": 3.2130327667350516, + "learning_rate": 3.425511386496563e-06, + "loss": 0.9478, + "step": 7651 + }, + { + "epoch": 1.4730610968067954, + "grad_norm": 3.3036002545304752, + "learning_rate": 3.423162373576372e-06, + "loss": 0.9713, + "step": 7652 + }, + { + "epoch": 1.4732536034843708, + "grad_norm": 3.270411874849448, + "learning_rate": 3.4208140000089352e-06, + "loss": 0.9717, + "step": 7653 + }, + { + "epoch": 1.4734461101619463, + "grad_norm": 3.198418358837812, + "learning_rate": 3.4184662660225567e-06, + "loss": 0.9559, + "step": 7654 + }, + { + "epoch": 1.4736386168395217, + "grad_norm": 3.282354105423872, + "learning_rate": 3.4161191718454545e-06, + "loss": 0.9626, + "step": 7655 + }, + { + "epoch": 1.473831123517097, + "grad_norm": 3.3841784366276286, + "learning_rate": 3.4137727177058023e-06, + "loss": 1.0399, + "step": 7656 + }, + { + "epoch": 1.473831123517097, + "lm_loss": 0.7541, + "step": 7656, + "vm_loss": 0.1779 + }, + { + "epoch": 1.473831123517097, + "lm_loss": 0.7946, + "step": 7656, + "vm_loss": 0.1218 + }, + { + "epoch": 1.473831123517097, + "lm_loss": 0.7296, + "step": 7656, + "vm_loss": 0.1294 + }, + { + "epoch": 1.473831123517097, + "lm_loss": 1.2273, + "step": 7656, + "vm_loss": 0.1371 + }, + { + "epoch": 1.473831123517097, + "lm_loss": 0.6341, + "step": 7656, + "vm_loss": 0.1303 + }, + { + "epoch": 1.473831123517097, + "lm_loss": 0.5763, + "step": 7656, + "vm_loss": 0.2179 + }, + { + "epoch": 1.473831123517097, + "lm_loss": 0.8286, + "step": 7656, + "vm_loss": 0.1612 + }, + { + "epoch": 1.473831123517097, + "lm_loss": 0.7555, + "step": 7656, + "vm_loss": 0.1452 + }, + { + "epoch": 1.4740236301946723, + "grad_norm": 3.1183530273437, + "learning_rate": 3.411426903831705e-06, + "loss": 0.9342, + "step": 7657 + }, + { + "epoch": 1.4742161368722477, + "grad_norm": 3.2607045404943658, + "learning_rate": 3.409081730451205e-06, + "loss": 0.9838, + "step": 7658 + }, + { + "epoch": 1.4744086435498232, + "grad_norm": 3.2481865915879067, + "learning_rate": 3.406737197792286e-06, + "loss": 0.9636, + "step": 7659 + }, + { + "epoch": 1.4746011502273986, + "grad_norm": 3.153801840715601, + "learning_rate": 3.4043933060828606e-06, + "loss": 0.9471, + "step": 7660 + }, + { + "epoch": 1.474793656904974, + "grad_norm": 3.268611465633069, + "learning_rate": 3.402050055550787e-06, + "loss": 1.0125, + "step": 7661 + }, + { + "epoch": 1.4749861635825492, + "grad_norm": 3.20372763783119, + "learning_rate": 3.399707446423862e-06, + "loss": 0.9685, + "step": 7662 + }, + { + "epoch": 1.4751786702601246, + "grad_norm": 3.199487868569656, + "learning_rate": 3.3973654789298162e-06, + "loss": 0.9174, + "step": 7663 + }, + { + "epoch": 1.4753711769377, + "grad_norm": 3.2885956979602775, + "learning_rate": 3.395024153296317e-06, + "loss": 0.9928, + "step": 7664 + }, + { + "epoch": 1.4753711769377, + "lm_loss": 0.7433, + "step": 7664, + "vm_loss": 0.0994 + }, + { + "epoch": 1.4753711769377, + "lm_loss": 0.6486, + "step": 7664, + "vm_loss": 0.3052 + }, + { + "epoch": 1.4753711769377, + "lm_loss": 0.6649, + "step": 7664, + "vm_loss": 0.1543 + }, + { + "epoch": 1.4753711769377, + "lm_loss": 1.3084, + "step": 7664, + "vm_loss": 0.177 + }, + { + "epoch": 1.4753711769377, + "lm_loss": 0.7433, + "step": 7664, + "vm_loss": 0.1953 + }, + { + "epoch": 1.4753711769377, + "lm_loss": 1.3535, + "step": 7664, + "vm_loss": 0.1837 + }, + { + "epoch": 1.4753711769377, + "lm_loss": 1.1201, + "step": 7664, + "vm_loss": 0.1285 + }, + { + "epoch": 1.4753711769377, + "lm_loss": 0.9644, + "step": 7664, + "vm_loss": 0.1403 + }, + { + "epoch": 1.4755636836152755, + "grad_norm": 3.2093659568400903, + "learning_rate": 3.392683469750976e-06, + "loss": 0.9625, + "step": 7665 + }, + { + "epoch": 1.4757561902928509, + "grad_norm": 3.2595373071337677, + "learning_rate": 3.390343428521331e-06, + "loss": 0.9584, + "step": 7666 + }, + { + "epoch": 1.475948696970426, + "grad_norm": 3.1292423262295874, + "learning_rate": 3.388004029834867e-06, + "loss": 0.9294, + "step": 7667 + }, + { + "epoch": 1.4761412036480015, + "grad_norm": 3.314468384780081, + "learning_rate": 3.3856652739190034e-06, + "loss": 0.9403, + "step": 7668 + }, + { + "epoch": 1.476333710325577, + "grad_norm": 3.3471634219320285, + "learning_rate": 3.3833271610011e-06, + "loss": 0.9793, + "step": 7669 + }, + { + "epoch": 1.4765262170031523, + "grad_norm": 3.319087740514296, + "learning_rate": 3.380989691308442e-06, + "loss": 0.9945, + "step": 7670 + }, + { + "epoch": 1.4767187236807278, + "grad_norm": 3.1405822665191283, + "learning_rate": 3.378652865068275e-06, + "loss": 0.9634, + "step": 7671 + }, + { + "epoch": 1.476911230358303, + "grad_norm": 3.220422842074132, + "learning_rate": 3.3763166825077576e-06, + "loss": 0.9852, + "step": 7672 + }, + { + "epoch": 1.476911230358303, + "lm_loss": 0.7613, + "step": 7672, + "vm_loss": 0.1293 + }, + { + "epoch": 1.476911230358303, + "lm_loss": 0.4764, + "step": 7672, + "vm_loss": 0.1548 + }, + { + "epoch": 1.476911230358303, + "lm_loss": 0.814, + "step": 7672, + "vm_loss": 0.1825 + }, + { + "epoch": 1.476911230358303, + "lm_loss": 0.6232, + "step": 7672, + "vm_loss": 0.2044 + }, + { + "epoch": 1.476911230358303, + "lm_loss": 0.873, + "step": 7672, + "vm_loss": 0.1844 + }, + { + "epoch": 1.476911230358303, + "lm_loss": 0.9057, + "step": 7672, + "vm_loss": 0.1922 + }, + { + "epoch": 1.476911230358303, + "lm_loss": 0.7622, + "step": 7672, + "vm_loss": 0.1551 + }, + { + "epoch": 1.476911230358303, + "lm_loss": 0.655, + "step": 7672, + "vm_loss": 0.1798 + }, + { + "epoch": 1.4771037370358784, + "grad_norm": 3.2286896358689208, + "learning_rate": 3.3739811438540037e-06, + "loss": 0.9723, + "step": 7673 + }, + { + "epoch": 1.4772962437134538, + "grad_norm": 3.580271965507969, + "learning_rate": 3.3716462493340473e-06, + "loss": 1.0689, + "step": 7674 + }, + { + "epoch": 1.4774887503910292, + "grad_norm": 3.229096785538811, + "learning_rate": 3.3693119991748836e-06, + "loss": 0.9519, + "step": 7675 + }, + { + "epoch": 1.4776812570686046, + "grad_norm": 3.266786355417555, + "learning_rate": 3.3669783936034216e-06, + "loss": 0.9196, + "step": 7676 + }, + { + "epoch": 1.4778737637461798, + "grad_norm": 3.261867454546461, + "learning_rate": 3.3646454328465205e-06, + "loss": 0.9656, + "step": 7677 + }, + { + "epoch": 1.4780662704237553, + "grad_norm": 3.266164914651739, + "learning_rate": 3.3623131171309755e-06, + "loss": 0.9595, + "step": 7678 + }, + { + "epoch": 1.4782587771013307, + "grad_norm": 3.331630261545895, + "learning_rate": 3.3599814466835156e-06, + "loss": 0.9914, + "step": 7679 + }, + { + "epoch": 1.4784512837789061, + "grad_norm": 3.140015401798985, + "learning_rate": 3.357650421730809e-06, + "loss": 0.9362, + "step": 7680 + }, + { + "epoch": 1.4784512837789061, + "lm_loss": 0.7588, + "step": 7680, + "vm_loss": 0.1961 + }, + { + "epoch": 1.4784512837789061, + "lm_loss": 0.4668, + "step": 7680, + "vm_loss": 0.1634 + }, + { + "epoch": 1.4784512837789061, + "lm_loss": 0.7358, + "step": 7680, + "vm_loss": 0.1885 + }, + { + "epoch": 1.4784512837789061, + "lm_loss": 0.9172, + "step": 7680, + "vm_loss": 0.1565 + }, + { + "epoch": 1.4784512837789061, + "lm_loss": 0.9064, + "step": 7680, + "vm_loss": 0.1593 + }, + { + "epoch": 1.4784512837789061, + "lm_loss": 1.0094, + "step": 7680, + "vm_loss": 0.1815 + }, + { + "epoch": 1.4784512837789061, + "lm_loss": 0.8003, + "step": 7680, + "vm_loss": 0.1934 + }, + { + "epoch": 1.4784512837789061, + "lm_loss": 1.0929, + "step": 7680, + "vm_loss": 0.1281 + }, + { + "epoch": 1.4786437904564815, + "grad_norm": 3.298129711406833, + "learning_rate": 3.3553200424994627e-06, + "loss": 0.984, + "step": 7681 + }, + { + "epoch": 1.4788362971340567, + "grad_norm": 3.168720645909617, + "learning_rate": 3.352990309216022e-06, + "loss": 0.9375, + "step": 7682 + }, + { + "epoch": 1.4790288038116322, + "grad_norm": 3.33303608005996, + "learning_rate": 3.350661222106959e-06, + "loss": 0.9492, + "step": 7683 + }, + { + "epoch": 1.4792213104892076, + "grad_norm": 3.3385378614676764, + "learning_rate": 3.3483327813986967e-06, + "loss": 0.9789, + "step": 7684 + }, + { + "epoch": 1.479413817166783, + "grad_norm": 3.1863423530664905, + "learning_rate": 3.346004987317586e-06, + "loss": 0.8918, + "step": 7685 + }, + { + "epoch": 1.4796063238443584, + "grad_norm": 3.2570743910180386, + "learning_rate": 3.343677840089924e-06, + "loss": 0.9321, + "step": 7686 + }, + { + "epoch": 1.4797988305219336, + "grad_norm": 3.337914097809864, + "learning_rate": 3.3413513399419295e-06, + "loss": 0.9712, + "step": 7687 + }, + { + "epoch": 1.479991337199509, + "grad_norm": 3.3116611752191747, + "learning_rate": 3.3390254870997796e-06, + "loss": 0.9216, + "step": 7688 + }, + { + "epoch": 1.479991337199509, + "lm_loss": 1.1127, + "step": 7688, + "vm_loss": 0.2356 + }, + { + "epoch": 1.479991337199509, + "lm_loss": 0.7878, + "step": 7688, + "vm_loss": 0.1568 + }, + { + "epoch": 1.479991337199509, + "lm_loss": 0.7703, + "step": 7688, + "vm_loss": 0.1993 + }, + { + "epoch": 1.479991337199509, + "lm_loss": 0.8123, + "step": 7688, + "vm_loss": 0.1541 + }, + { + "epoch": 1.479991337199509, + "lm_loss": 1.2013, + "step": 7688, + "vm_loss": 0.1201 + }, + { + "epoch": 1.479991337199509, + "lm_loss": 0.9865, + "step": 7688, + "vm_loss": 0.1517 + }, + { + "epoch": 1.479991337199509, + "lm_loss": 0.8533, + "step": 7688, + "vm_loss": 0.1257 + }, + { + "epoch": 1.479991337199509, + "lm_loss": 0.4776, + "step": 7688, + "vm_loss": 0.1446 + }, + { + "epoch": 1.4801838438770845, + "grad_norm": 3.3403632917325496, + "learning_rate": 3.336700281789568e-06, + "loss": 0.9764, + "step": 7689 + }, + { + "epoch": 1.4803763505546599, + "grad_norm": 3.3808395436349494, + "learning_rate": 3.3343757242373397e-06, + "loss": 1.0189, + "step": 7690 + }, + { + "epoch": 1.4805688572322353, + "grad_norm": 3.1848900222907552, + "learning_rate": 3.332051814669064e-06, + "loss": 0.9232, + "step": 7691 + }, + { + "epoch": 1.4807613639098105, + "grad_norm": 3.3423341415135206, + "learning_rate": 3.3297285533106648e-06, + "loss": 0.9983, + "step": 7692 + }, + { + "epoch": 1.480953870587386, + "grad_norm": 3.1437843463376605, + "learning_rate": 3.3274059403879865e-06, + "loss": 0.95, + "step": 7693 + }, + { + "epoch": 1.4811463772649613, + "grad_norm": 3.31769356295955, + "learning_rate": 3.3250839761268174e-06, + "loss": 0.9674, + "step": 7694 + }, + { + "epoch": 1.4813388839425368, + "grad_norm": 3.428510625370614, + "learning_rate": 3.322762660752883e-06, + "loss": 0.9887, + "step": 7695 + }, + { + "epoch": 1.4815313906201122, + "grad_norm": 3.3790686443996893, + "learning_rate": 3.3204419944918476e-06, + "loss": 0.9952, + "step": 7696 + }, + { + "epoch": 1.4815313906201122, + "lm_loss": 0.5251, + "step": 7696, + "vm_loss": 0.135 + }, + { + "epoch": 1.4815313906201122, + "lm_loss": 0.8638, + "step": 7696, + "vm_loss": 0.198 + }, + { + "epoch": 1.4815313906201122, + "lm_loss": 0.4394, + "step": 7696, + "vm_loss": 0.1069 + }, + { + "epoch": 1.4815313906201122, + "lm_loss": 0.4837, + "step": 7696, + "vm_loss": 0.1442 + }, + { + "epoch": 1.4815313906201122, + "lm_loss": 0.5822, + "step": 7696, + "vm_loss": 0.2029 + }, + { + "epoch": 1.4815313906201122, + "lm_loss": 0.5042, + "step": 7696, + "vm_loss": 0.1783 + }, + { + "epoch": 1.4815313906201122, + "lm_loss": 0.7243, + "step": 7696, + "vm_loss": 0.2335 + }, + { + "epoch": 1.4815313906201122, + "lm_loss": 0.816, + "step": 7696, + "vm_loss": 0.1322 + }, + { + "epoch": 1.4817238972976874, + "grad_norm": 3.0705739789682145, + "learning_rate": 3.3181219775693006e-06, + "loss": 0.9651, + "step": 7697 + }, + { + "epoch": 1.481916403975263, + "grad_norm": 3.4072719742980704, + "learning_rate": 3.315802610210791e-06, + "loss": 1.0181, + "step": 7698 + }, + { + "epoch": 1.4821089106528382, + "grad_norm": 3.2400219531711967, + "learning_rate": 3.313483892641779e-06, + "loss": 0.957, + "step": 7699 + }, + { + "epoch": 1.4823014173304137, + "grad_norm": 3.1793623976256797, + "learning_rate": 3.3111658250876787e-06, + "loss": 0.9297, + "step": 7700 + }, + { + "epoch": 1.482493924007989, + "grad_norm": 3.2544204417275795, + "learning_rate": 3.3088484077738357e-06, + "loss": 0.9594, + "step": 7701 + }, + { + "epoch": 1.4826864306855643, + "grad_norm": 3.300422945692872, + "learning_rate": 3.306531640925532e-06, + "loss": 0.9762, + "step": 7702 + }, + { + "epoch": 1.48287893736314, + "grad_norm": 3.1188481773430348, + "learning_rate": 3.3042155247679896e-06, + "loss": 0.9318, + "step": 7703 + }, + { + "epoch": 1.4830714440407151, + "grad_norm": 3.1249433926566157, + "learning_rate": 3.3019000595263573e-06, + "loss": 0.92, + "step": 7704 + }, + { + "epoch": 1.4830714440407151, + "lm_loss": 1.2974, + "step": 7704, + "vm_loss": 0.1591 + }, + { + "epoch": 1.4830714440407151, + "lm_loss": 0.9343, + "step": 7704, + "vm_loss": 0.1801 + }, + { + "epoch": 1.4830714440407151, + "lm_loss": 0.6972, + "step": 7704, + "vm_loss": 0.143 + }, + { + "epoch": 1.4830714440407151, + "lm_loss": 0.9308, + "step": 7704, + "vm_loss": 0.154 + }, + { + "epoch": 1.4830714440407151, + "lm_loss": 0.7107, + "step": 7704, + "vm_loss": 0.1713 + }, + { + "epoch": 1.4830714440407151, + "lm_loss": 0.7054, + "step": 7704, + "vm_loss": 0.1463 + }, + { + "epoch": 1.4830714440407151, + "lm_loss": 0.8292, + "step": 7704, + "vm_loss": 0.1676 + }, + { + "epoch": 1.4830714440407151, + "lm_loss": 0.9217, + "step": 7704, + "vm_loss": 0.1589 + }, + { + "epoch": 1.4832639507182905, + "grad_norm": 3.428671616486981, + "learning_rate": 3.299585245425738e-06, + "loss": 1.0303, + "step": 7705 + }, + { + "epoch": 1.483456457395866, + "grad_norm": 3.280696581633513, + "learning_rate": 3.297271082691155e-06, + "loss": 0.9336, + "step": 7706 + }, + { + "epoch": 1.4836489640734414, + "grad_norm": 3.0762891421045513, + "learning_rate": 3.294957571547577e-06, + "loss": 0.8753, + "step": 7707 + }, + { + "epoch": 1.4838414707510168, + "grad_norm": 3.360564615643978, + "learning_rate": 3.2926447122199e-06, + "loss": 0.9878, + "step": 7708 + }, + { + "epoch": 1.484033977428592, + "grad_norm": 3.460083945446107, + "learning_rate": 3.290332504932975e-06, + "loss": 1.0146, + "step": 7709 + }, + { + "epoch": 1.4842264841061674, + "grad_norm": 3.1599800146266896, + "learning_rate": 3.288020949911569e-06, + "loss": 0.9325, + "step": 7710 + }, + { + "epoch": 1.4844189907837428, + "grad_norm": 3.4002943891608135, + "learning_rate": 3.285710047380398e-06, + "loss": 1.0001, + "step": 7711 + }, + { + "epoch": 1.4846114974613183, + "grad_norm": 3.1981818334072885, + "learning_rate": 3.283399797564112e-06, + "loss": 0.9598, + "step": 7712 + }, + { + "epoch": 1.4846114974613183, + "lm_loss": 0.9238, + "step": 7712, + "vm_loss": 0.2249 + }, + { + "epoch": 1.4846114974613183, + "lm_loss": 0.6226, + "step": 7712, + "vm_loss": 0.1382 + }, + { + "epoch": 1.4846114974613183, + "lm_loss": 0.686, + "step": 7712, + "vm_loss": 0.2114 + }, + { + "epoch": 1.4846114974613183, + "lm_loss": 0.8612, + "step": 7712, + "vm_loss": 0.2043 + }, + { + "epoch": 1.4846114974613183, + "lm_loss": 0.9632, + "step": 7712, + "vm_loss": 0.2409 + }, + { + "epoch": 1.4846114974613183, + "lm_loss": 0.5601, + "step": 7712, + "vm_loss": 0.2169 + }, + { + "epoch": 1.4846114974613183, + "lm_loss": 0.7838, + "step": 7712, + "vm_loss": 0.1284 + }, + { + "epoch": 1.4846114974613183, + "lm_loss": 0.7952, + "step": 7712, + "vm_loss": 0.1365 + }, + { + "epoch": 1.4848040041388937, + "grad_norm": 3.202405742173062, + "learning_rate": 3.2810902006872993e-06, + "loss": 0.9525, + "step": 7713 + }, + { + "epoch": 1.4849965108164689, + "grad_norm": 3.259381340023161, + "learning_rate": 3.2787812569744716e-06, + "loss": 0.9509, + "step": 7714 + }, + { + "epoch": 1.4851890174940443, + "grad_norm": 3.254614066979019, + "learning_rate": 3.2764729666501027e-06, + "loss": 0.9457, + "step": 7715 + }, + { + "epoch": 1.4853815241716197, + "grad_norm": 3.350859717617728, + "learning_rate": 3.2741653299385767e-06, + "loss": 0.9589, + "step": 7716 + }, + { + "epoch": 1.4855740308491951, + "grad_norm": 3.1914184042538354, + "learning_rate": 3.271858347064233e-06, + "loss": 0.8791, + "step": 7717 + }, + { + "epoch": 1.4857665375267706, + "grad_norm": 3.2097459229258307, + "learning_rate": 3.2695520182513285e-06, + "loss": 0.9328, + "step": 7718 + }, + { + "epoch": 1.4859590442043458, + "grad_norm": 3.2882380193893472, + "learning_rate": 3.267246343724082e-06, + "loss": 0.9362, + "step": 7719 + }, + { + "epoch": 1.4861515508819212, + "grad_norm": 3.338042557480579, + "learning_rate": 3.2649413237066253e-06, + "loss": 0.9711, + "step": 7720 + }, + { + "epoch": 1.4861515508819212, + "lm_loss": 0.6465, + "step": 7720, + "vm_loss": 0.1714 + }, + { + "epoch": 1.4861515508819212, + "lm_loss": 0.4273, + "step": 7720, + "vm_loss": 0.1726 + }, + { + "epoch": 1.4861515508819212, + "lm_loss": 0.7071, + "step": 7720, + "vm_loss": 0.1479 + }, + { + "epoch": 1.4861515508819212, + "lm_loss": 0.5044, + "step": 7720, + "vm_loss": 0.1459 + }, + { + "epoch": 1.4861515508819212, + "lm_loss": 0.8187, + "step": 7720, + "vm_loss": 0.1749 + }, + { + "epoch": 1.4861515508819212, + "lm_loss": 0.7489, + "step": 7720, + "vm_loss": 0.1748 + }, + { + "epoch": 1.4861515508819212, + "lm_loss": 1.1517, + "step": 7720, + "vm_loss": 0.17 + }, + { + "epoch": 1.4861515508819212, + "lm_loss": 1.1211, + "step": 7720, + "vm_loss": 0.1994 + }, + { + "epoch": 1.4863440575594966, + "grad_norm": 3.327634152280964, + "learning_rate": 3.2626369584230387e-06, + "loss": 0.9911, + "step": 7721 + }, + { + "epoch": 1.486536564237072, + "grad_norm": 3.361983593182565, + "learning_rate": 3.260333248097335e-06, + "loss": 0.9757, + "step": 7722 + }, + { + "epoch": 1.4867290709146475, + "grad_norm": 3.3497177201880635, + "learning_rate": 3.2580301929534654e-06, + "loss": 0.9976, + "step": 7723 + }, + { + "epoch": 1.4869215775922227, + "grad_norm": 3.250072082007862, + "learning_rate": 3.25572779321532e-06, + "loss": 0.9274, + "step": 7724 + }, + { + "epoch": 1.487114084269798, + "grad_norm": 3.2983804550042017, + "learning_rate": 3.253426049106709e-06, + "loss": 0.9552, + "step": 7725 + }, + { + "epoch": 1.4873065909473735, + "grad_norm": 3.3550800568344843, + "learning_rate": 3.2511249608514083e-06, + "loss": 0.9635, + "step": 7726 + }, + { + "epoch": 1.487499097624949, + "grad_norm": 3.210736913768476, + "learning_rate": 3.2488245286731013e-06, + "loss": 0.9523, + "step": 7727 + }, + { + "epoch": 1.4876916043025243, + "grad_norm": 3.282491789538895, + "learning_rate": 3.2465247527954226e-06, + "loss": 0.9549, + "step": 7728 + }, + { + "epoch": 1.4876916043025243, + "lm_loss": 0.5348, + "step": 7728, + "vm_loss": 0.1185 + }, + { + "epoch": 1.4876916043025243, + "lm_loss": 0.8002, + "step": 7728, + "vm_loss": 0.097 + }, + { + "epoch": 1.4876916043025243, + "lm_loss": 0.7222, + "step": 7728, + "vm_loss": 0.231 + }, + { + "epoch": 1.4876916043025243, + "lm_loss": 0.7606, + "step": 7728, + "vm_loss": 0.1761 + }, + { + "epoch": 1.4876916043025243, + "lm_loss": 0.7557, + "step": 7728, + "vm_loss": 0.1662 + }, + { + "epoch": 1.4876916043025243, + "lm_loss": 0.6734, + "step": 7728, + "vm_loss": 0.1948 + }, + { + "epoch": 1.4876916043025243, + "lm_loss": 0.8212, + "step": 7728, + "vm_loss": 0.1591 + }, + { + "epoch": 1.4876916043025243, + "lm_loss": 0.869, + "step": 7728, + "vm_loss": 0.1735 + }, + { + "epoch": 1.4878841109800995, + "grad_norm": 3.1761805424602456, + "learning_rate": 3.2442256334419398e-06, + "loss": 0.9184, + "step": 7729 + }, + { + "epoch": 1.488076617657675, + "grad_norm": 3.179179281159974, + "learning_rate": 3.241927170836161e-06, + "loss": 0.9571, + "step": 7730 + }, + { + "epoch": 1.4882691243352504, + "grad_norm": 3.1717681626726977, + "learning_rate": 3.2396293652015155e-06, + "loss": 0.9527, + "step": 7731 + }, + { + "epoch": 1.4884616310128258, + "grad_norm": 3.201785275545528, + "learning_rate": 3.237332216761392e-06, + "loss": 0.918, + "step": 7732 + }, + { + "epoch": 1.4886541376904012, + "grad_norm": 3.246982176686414, + "learning_rate": 3.235035725739094e-06, + "loss": 0.9279, + "step": 7733 + }, + { + "epoch": 1.4888466443679764, + "grad_norm": 3.2145636026407387, + "learning_rate": 3.2327398923578755e-06, + "loss": 0.9563, + "step": 7734 + }, + { + "epoch": 1.4890391510455518, + "grad_norm": 3.2755514777636847, + "learning_rate": 3.2304447168409115e-06, + "loss": 0.9207, + "step": 7735 + }, + { + "epoch": 1.4892316577231273, + "grad_norm": 3.2709106612751224, + "learning_rate": 3.2281501994113363e-06, + "loss": 0.9221, + "step": 7736 + }, + { + "epoch": 1.4892316577231273, + "lm_loss": 0.7205, + "step": 7736, + "vm_loss": 0.1475 + }, + { + "epoch": 1.4892316577231273, + "lm_loss": 0.7752, + "step": 7736, + "vm_loss": 0.1988 + }, + { + "epoch": 1.4892316577231273, + "lm_loss": 0.6643, + "step": 7736, + "vm_loss": 0.1843 + }, + { + "epoch": 1.4892316577231273, + "lm_loss": 0.6819, + "step": 7736, + "vm_loss": 0.1313 + }, + { + "epoch": 1.4892316577231273, + "lm_loss": 0.8126, + "step": 7736, + "vm_loss": 0.1938 + }, + { + "epoch": 1.4892316577231273, + "lm_loss": 1.0848, + "step": 7736, + "vm_loss": 0.1358 + }, + { + "epoch": 1.4892316577231273, + "lm_loss": 0.677, + "step": 7736, + "vm_loss": 0.1329 + }, + { + "epoch": 1.4892316577231273, + "lm_loss": 1.0783, + "step": 7736, + "vm_loss": 0.1348 + }, + { + "epoch": 1.4894241644007027, + "grad_norm": 3.327641897128104, + "learning_rate": 3.225856340292195e-06, + "loss": 0.9833, + "step": 7737 + }, + { + "epoch": 1.489616671078278, + "grad_norm": 3.312587666576186, + "learning_rate": 3.2235631397064837e-06, + "loss": 0.9685, + "step": 7738 + }, + { + "epoch": 1.4898091777558533, + "grad_norm": 3.3874345900416296, + "learning_rate": 3.2212705978771318e-06, + "loss": 0.964, + "step": 7739 + }, + { + "epoch": 1.4900016844334287, + "grad_norm": 3.213508811423855, + "learning_rate": 3.218978715027006e-06, + "loss": 0.9759, + "step": 7740 + }, + { + "epoch": 1.4901941911110042, + "grad_norm": 3.1564147583199014, + "learning_rate": 3.2166874913789004e-06, + "loss": 0.9657, + "step": 7741 + }, + { + "epoch": 1.4903866977885796, + "grad_norm": 3.3153738830979163, + "learning_rate": 3.214396927155555e-06, + "loss": 0.9451, + "step": 7742 + }, + { + "epoch": 1.490579204466155, + "grad_norm": 3.360377429039677, + "learning_rate": 3.2121070225796416e-06, + "loss": 0.9449, + "step": 7743 + }, + { + "epoch": 1.4907717111437302, + "grad_norm": 3.245010344793086, + "learning_rate": 3.209817777873767e-06, + "loss": 0.9357, + "step": 7744 + }, + { + "epoch": 1.4907717111437302, + "lm_loss": 0.6168, + "step": 7744, + "vm_loss": 0.1949 + }, + { + "epoch": 1.4907717111437302, + "lm_loss": 0.4935, + "step": 7744, + "vm_loss": 0.2001 + }, + { + "epoch": 1.4907717111437302, + "lm_loss": 0.7921, + "step": 7744, + "vm_loss": 0.2396 + }, + { + "epoch": 1.4907717111437302, + "lm_loss": 1.4087, + "step": 7744, + "vm_loss": 0.1624 + }, + { + "epoch": 1.4907717111437302, + "lm_loss": 0.7285, + "step": 7744, + "vm_loss": 0.1485 + }, + { + "epoch": 1.4907717111437302, + "lm_loss": 0.9514, + "step": 7744, + "vm_loss": 0.1876 + }, + { + "epoch": 1.4907717111437302, + "lm_loss": 0.6106, + "step": 7744, + "vm_loss": 0.1155 + }, + { + "epoch": 1.4907717111437302, + "lm_loss": 0.7972, + "step": 7744, + "vm_loss": 0.1476 + }, + { + "epoch": 1.4909642178213056, + "grad_norm": 3.096542702836715, + "learning_rate": 3.2075291932604767e-06, + "loss": 0.9184, + "step": 7745 + }, + { + "epoch": 1.491156724498881, + "grad_norm": 3.2724392160972537, + "learning_rate": 3.20524126896225e-06, + "loss": 0.9962, + "step": 7746 + }, + { + "epoch": 1.4913492311764565, + "grad_norm": 3.1330953220531557, + "learning_rate": 3.202954005201506e-06, + "loss": 0.9348, + "step": 7747 + }, + { + "epoch": 1.4915417378540319, + "grad_norm": 3.144468917724164, + "learning_rate": 3.2006674022005857e-06, + "loss": 0.9417, + "step": 7748 + }, + { + "epoch": 1.491734244531607, + "grad_norm": 3.4473696949701855, + "learning_rate": 3.1983814601817888e-06, + "loss": 1.0588, + "step": 7749 + }, + { + "epoch": 1.4919267512091825, + "grad_norm": 3.244718457392763, + "learning_rate": 3.1960961793673285e-06, + "loss": 0.9457, + "step": 7750 + }, + { + "epoch": 1.492119257886758, + "grad_norm": 3.190004355305222, + "learning_rate": 3.1938115599793708e-06, + "loss": 0.9411, + "step": 7751 + }, + { + "epoch": 1.4923117645643333, + "grad_norm": 3.0831332085064496, + "learning_rate": 3.19152760224e-06, + "loss": 0.9433, + "step": 7752 + }, + { + "epoch": 1.4923117645643333, + "lm_loss": 0.9243, + "step": 7752, + "vm_loss": 0.118 + }, + { + "epoch": 1.4923117645643333, + "lm_loss": 0.7105, + "step": 7752, + "vm_loss": 0.165 + }, + { + "epoch": 1.4923117645643333, + "lm_loss": 0.7414, + "step": 7752, + "vm_loss": 0.1835 + }, + { + "epoch": 1.4923117645643333, + "lm_loss": 0.8962, + "step": 7752, + "vm_loss": 0.177 + }, + { + "epoch": 1.4923117645643333, + "lm_loss": 0.7186, + "step": 7752, + "vm_loss": 0.1759 + }, + { + "epoch": 1.4923117645643333, + "lm_loss": 0.8059, + "step": 7752, + "vm_loss": 0.1481 + }, + { + "epoch": 1.4923117645643333, + "lm_loss": 0.5081, + "step": 7752, + "vm_loss": 0.143 + }, + { + "epoch": 1.4923117645643333, + "lm_loss": 0.8889, + "step": 7752, + "vm_loss": 0.0949 + }, + { + "epoch": 1.4925042712419088, + "grad_norm": 3.2620693381317785, + "learning_rate": 3.1892443063712574e-06, + "loss": 0.9191, + "step": 7753 + }, + { + "epoch": 1.492696777919484, + "grad_norm": 3.2102291524560664, + "learning_rate": 3.1869616725951012e-06, + "loss": 0.9172, + "step": 7754 + }, + { + "epoch": 1.4928892845970594, + "grad_norm": 3.406145037898015, + "learning_rate": 3.184679701133434e-06, + "loss": 0.9816, + "step": 7755 + }, + { + "epoch": 1.4930817912746348, + "grad_norm": 3.2968253045028515, + "learning_rate": 3.1823983922080926e-06, + "loss": 0.9911, + "step": 7756 + }, + { + "epoch": 1.4932742979522102, + "grad_norm": 3.302648899155252, + "learning_rate": 3.1801177460408537e-06, + "loss": 0.9968, + "step": 7757 + }, + { + "epoch": 1.4934668046297856, + "grad_norm": 3.3309498232761885, + "learning_rate": 3.1778377628534194e-06, + "loss": 0.9394, + "step": 7758 + }, + { + "epoch": 1.4936593113073608, + "grad_norm": 3.2072188884160258, + "learning_rate": 3.1755584428674345e-06, + "loss": 0.9078, + "step": 7759 + }, + { + "epoch": 1.4938518179849363, + "grad_norm": 3.4244721701795333, + "learning_rate": 3.1732797863044795e-06, + "loss": 1.0008, + "step": 7760 + }, + { + "epoch": 1.4938518179849363, + "lm_loss": 0.9631, + "step": 7760, + "vm_loss": 0.1686 + }, + { + "epoch": 1.4938518179849363, + "lm_loss": 0.6483, + "step": 7760, + "vm_loss": 0.19 + }, + { + "epoch": 1.4938518179849363, + "lm_loss": 0.599, + "step": 7760, + "vm_loss": 0.2073 + }, + { + "epoch": 1.4938518179849363, + "lm_loss": 0.936, + "step": 7760, + "vm_loss": 0.112 + }, + { + "epoch": 1.4938518179849363, + "lm_loss": 0.9048, + "step": 7760, + "vm_loss": 0.1664 + }, + { + "epoch": 1.4938518179849363, + "lm_loss": 0.6556, + "step": 7760, + "vm_loss": 0.1857 + }, + { + "epoch": 1.4938518179849363, + "lm_loss": 1.3135, + "step": 7760, + "vm_loss": 0.1735 + }, + { + "epoch": 1.4938518179849363, + "lm_loss": 1.4535, + "step": 7760, + "vm_loss": 0.193 + }, + { + "epoch": 1.4940443246625117, + "grad_norm": 3.531376948022944, + "learning_rate": 3.171001793386068e-06, + "loss": 1.0326, + "step": 7761 + }, + { + "epoch": 1.4942368313400871, + "grad_norm": 3.2461755000387478, + "learning_rate": 3.1687244643336514e-06, + "loss": 0.9529, + "step": 7762 + }, + { + "epoch": 1.4944293380176625, + "grad_norm": 3.210999573322285, + "learning_rate": 3.166447799368617e-06, + "loss": 0.9209, + "step": 7763 + }, + { + "epoch": 1.4946218446952377, + "grad_norm": 3.3313638551805944, + "learning_rate": 3.16417179871228e-06, + "loss": 0.9788, + "step": 7764 + }, + { + "epoch": 1.4948143513728134, + "grad_norm": 3.280142236807375, + "learning_rate": 3.1618964625859006e-06, + "loss": 0.9403, + "step": 7765 + }, + { + "epoch": 1.4950068580503886, + "grad_norm": 3.3494071346407694, + "learning_rate": 3.1596217912106687e-06, + "loss": 0.969, + "step": 7766 + }, + { + "epoch": 1.495199364727964, + "grad_norm": 3.1817418460339617, + "learning_rate": 3.157347784807714e-06, + "loss": 0.9575, + "step": 7767 + }, + { + "epoch": 1.4953918714055394, + "grad_norm": 3.2022246116984205, + "learning_rate": 3.1550744435980995e-06, + "loss": 0.9762, + "step": 7768 + }, + { + "epoch": 1.4953918714055394, + "lm_loss": 1.0008, + "step": 7768, + "vm_loss": 0.12 + }, + { + "epoch": 1.4953918714055394, + "lm_loss": 0.553, + "step": 7768, + "vm_loss": 0.2039 + }, + { + "epoch": 1.4953918714055394, + "lm_loss": 0.4216, + "step": 7768, + "vm_loss": 0.2159 + }, + { + "epoch": 1.4953918714055394, + "lm_loss": 0.5774, + "step": 7768, + "vm_loss": 0.1533 + }, + { + "epoch": 1.4953918714055394, + "lm_loss": 0.6663, + "step": 7768, + "vm_loss": 0.1245 + }, + { + "epoch": 1.4953918714055394, + "lm_loss": 0.8418, + "step": 7768, + "vm_loss": 0.1853 + }, + { + "epoch": 1.4953918714055394, + "lm_loss": 0.7475, + "step": 7768, + "vm_loss": 0.1274 + }, + { + "epoch": 1.4953918714055394, + "lm_loss": 0.701, + "step": 7768, + "vm_loss": 0.1843 + }, + { + "epoch": 1.4955843780831148, + "grad_norm": 3.131577978457707, + "learning_rate": 3.1528017678028156e-06, + "loss": 0.9352, + "step": 7769 + }, + { + "epoch": 1.4957768847606903, + "grad_norm": 3.110431502337773, + "learning_rate": 3.1505297576428074e-06, + "loss": 0.935, + "step": 7770 + }, + { + "epoch": 1.4959693914382655, + "grad_norm": 3.081872289025915, + "learning_rate": 3.148258413338935e-06, + "loss": 0.8718, + "step": 7771 + }, + { + "epoch": 1.4961618981158409, + "grad_norm": 3.1016206680483887, + "learning_rate": 3.145987735112003e-06, + "loss": 0.9261, + "step": 7772 + }, + { + "epoch": 1.4963544047934163, + "grad_norm": 3.230725527636669, + "learning_rate": 3.143717723182751e-06, + "loss": 0.9083, + "step": 7773 + }, + { + "epoch": 1.4965469114709917, + "grad_norm": 3.4636176251919806, + "learning_rate": 3.1414483777718585e-06, + "loss": 1.0642, + "step": 7774 + }, + { + "epoch": 1.4967394181485671, + "grad_norm": 3.3413270290674393, + "learning_rate": 3.139179699099926e-06, + "loss": 0.9393, + "step": 7775 + }, + { + "epoch": 1.4969319248261423, + "grad_norm": 3.3708650824461834, + "learning_rate": 3.136911687387503e-06, + "loss": 0.981, + "step": 7776 + }, + { + "epoch": 1.4969319248261423, + "lm_loss": 0.6159, + "step": 7776, + "vm_loss": 0.1593 + }, + { + "epoch": 1.4969319248261423, + "lm_loss": 0.8462, + "step": 7776, + "vm_loss": 0.1951 + }, + { + "epoch": 1.4969319248261423, + "lm_loss": 0.4944, + "step": 7776, + "vm_loss": 0.1521 + }, + { + "epoch": 1.4969319248261423, + "lm_loss": 1.0182, + "step": 7776, + "vm_loss": 0.1339 + }, + { + "epoch": 1.4969319248261423, + "lm_loss": 0.8989, + "step": 7776, + "vm_loss": 0.1389 + }, + { + "epoch": 1.4969319248261423, + "lm_loss": 0.739, + "step": 7776, + "vm_loss": 0.1783 + }, + { + "epoch": 1.4969319248261423, + "lm_loss": 0.8847, + "step": 7776, + "vm_loss": 0.1994 + }, + { + "epoch": 1.4969319248261423, + "lm_loss": 0.7906, + "step": 7776, + "vm_loss": 0.1956 + }, + { + "epoch": 1.4971244315037178, + "grad_norm": 3.417335662213305, + "learning_rate": 3.1346443428550676e-06, + "loss": 0.9921, + "step": 7777 + }, + { + "epoch": 1.4973169381812932, + "grad_norm": 3.2780290555945455, + "learning_rate": 3.132377665723039e-06, + "loss": 0.9345, + "step": 7778 + }, + { + "epoch": 1.4975094448588686, + "grad_norm": 3.0854355870367742, + "learning_rate": 3.1301116562117572e-06, + "loss": 0.8491, + "step": 7779 + }, + { + "epoch": 1.497701951536444, + "grad_norm": 3.2628778850439533, + "learning_rate": 3.1278463145415205e-06, + "loss": 0.9483, + "step": 7780 + }, + { + "epoch": 1.4978944582140192, + "grad_norm": 3.3944971265147657, + "learning_rate": 3.12558164093254e-06, + "loss": 0.9707, + "step": 7781 + }, + { + "epoch": 1.4980869648915947, + "grad_norm": 3.4057424751099905, + "learning_rate": 3.1233176356049733e-06, + "loss": 0.9585, + "step": 7782 + }, + { + "epoch": 1.49827947156917, + "grad_norm": 3.3692897701341895, + "learning_rate": 3.1210542987789104e-06, + "loss": 0.9812, + "step": 7783 + }, + { + "epoch": 1.4984719782467455, + "grad_norm": 3.360540053812142, + "learning_rate": 3.118791630674378e-06, + "loss": 0.9934, + "step": 7784 + }, + { + "epoch": 1.4984719782467455, + "lm_loss": 0.8223, + "step": 7784, + "vm_loss": 0.1722 + }, + { + "epoch": 1.4984719782467455, + "lm_loss": 0.6921, + "step": 7784, + "vm_loss": 0.2364 + }, + { + "epoch": 1.4984719782467455, + "lm_loss": 0.7079, + "step": 7784, + "vm_loss": 0.1449 + }, + { + "epoch": 1.4984719782467455, + "lm_loss": 1.224, + "step": 7784, + "vm_loss": 0.1605 + }, + { + "epoch": 1.4984719782467455, + "lm_loss": 0.9882, + "step": 7784, + "vm_loss": 0.1481 + }, + { + "epoch": 1.4984719782467455, + "lm_loss": 0.4192, + "step": 7784, + "vm_loss": 0.1352 + }, + { + "epoch": 1.4984719782467455, + "lm_loss": 0.5468, + "step": 7784, + "vm_loss": 0.1959 + }, + { + "epoch": 1.4984719782467455, + "lm_loss": 0.957, + "step": 7784, + "vm_loss": 0.2157 + }, + { + "epoch": 1.498664484924321, + "grad_norm": 3.2540458010806375, + "learning_rate": 3.116529631511338e-06, + "loss": 0.9763, + "step": 7785 + }, + { + "epoch": 1.4988569916018961, + "grad_norm": 3.388813406527751, + "learning_rate": 3.1142683015096797e-06, + "loss": 0.9917, + "step": 7786 + }, + { + "epoch": 1.4990494982794715, + "grad_norm": 3.188809515025512, + "learning_rate": 3.1120076408892374e-06, + "loss": 0.8953, + "step": 7787 + }, + { + "epoch": 1.499242004957047, + "grad_norm": 3.130039817311817, + "learning_rate": 3.109747649869774e-06, + "loss": 0.9345, + "step": 7788 + }, + { + "epoch": 1.4994345116346224, + "grad_norm": 3.2290389686221417, + "learning_rate": 3.1074883286709936e-06, + "loss": 0.9626, + "step": 7789 + }, + { + "epoch": 1.4996270183121978, + "grad_norm": 3.141382440953624, + "learning_rate": 3.105229677512527e-06, + "loss": 0.9117, + "step": 7790 + }, + { + "epoch": 1.499819524989773, + "grad_norm": 3.1755283220419, + "learning_rate": 3.102971696613949e-06, + "loss": 0.9062, + "step": 7791 + }, + { + "epoch": 1.5000120316673484, + "grad_norm": 3.2476202278877357, + "learning_rate": 3.1007143861947575e-06, + "loss": 0.9752, + "step": 7792 + }, + { + "epoch": 1.5000120316673484, + "lm_loss": 0.7053, + "step": 7792, + "vm_loss": 0.1556 + }, + { + "epoch": 1.5000120316673484, + "lm_loss": 0.6637, + "step": 7792, + "vm_loss": 0.1211 + }, + { + "epoch": 1.5000120316673484, + "lm_loss": 0.8751, + "step": 7792, + "vm_loss": 0.1725 + }, + { + "epoch": 1.5000120316673484, + "lm_loss": 0.9144, + "step": 7792, + "vm_loss": 0.1684 + }, + { + "epoch": 1.5000120316673484, + "lm_loss": 0.5799, + "step": 7792, + "vm_loss": 0.131 + }, + { + "epoch": 1.5000120316673484, + "lm_loss": 1.0938, + "step": 7792, + "vm_loss": 0.158 + }, + { + "epoch": 1.5000120316673484, + "lm_loss": 0.8213, + "step": 7792, + "vm_loss": 0.1741 + }, + { + "epoch": 1.5000120316673484, + "lm_loss": 1.1349, + "step": 7792, + "vm_loss": 0.1953 + }, + { + "epoch": 1.5002045383449238, + "grad_norm": 3.294366351781694, + "learning_rate": 3.098457746474396e-06, + "loss": 0.963, + "step": 7793 + }, + { + "epoch": 1.5003970450224993, + "grad_norm": 3.3135509686339755, + "learning_rate": 3.0962017776722364e-06, + "loss": 0.967, + "step": 7794 + }, + { + "epoch": 1.5005895517000747, + "grad_norm": 3.3842790496195034, + "learning_rate": 3.0939464800075947e-06, + "loss": 1.0102, + "step": 7795 + }, + { + "epoch": 1.5007820583776499, + "grad_norm": 3.2071537212585497, + "learning_rate": 3.091691853699702e-06, + "loss": 0.9301, + "step": 7796 + }, + { + "epoch": 1.5009745650552253, + "grad_norm": 3.399682611476646, + "learning_rate": 3.08943789896775e-06, + "loss": 1.0149, + "step": 7797 + }, + { + "epoch": 1.5011670717328007, + "grad_norm": 3.180991284415726, + "learning_rate": 3.087184616030844e-06, + "loss": 0.9317, + "step": 7798 + }, + { + "epoch": 1.5013595784103761, + "grad_norm": 3.1782859750776145, + "learning_rate": 3.0849320051080368e-06, + "loss": 0.9469, + "step": 7799 + }, + { + "epoch": 1.5015520850879516, + "grad_norm": 3.3363292570567555, + "learning_rate": 3.082680066418301e-06, + "loss": 0.9758, + "step": 7800 + }, + { + "epoch": 1.5015520850879516, + "lm_loss": 0.5712, + "step": 7800, + "vm_loss": 0.1428 + }, + { + "epoch": 1.5015520850879516, + "lm_loss": 0.5198, + "step": 7800, + "vm_loss": 0.2224 + }, + { + "epoch": 1.5015520850879516, + "lm_loss": 0.8409, + "step": 7800, + "vm_loss": 0.1604 + }, + { + "epoch": 1.5015520850879516, + "lm_loss": 0.9893, + "step": 7800, + "vm_loss": 0.1883 + }, + { + "epoch": 1.5015520850879516, + "lm_loss": 0.9657, + "step": 7800, + "vm_loss": 0.1526 + }, + { + "epoch": 1.5015520850879516, + "lm_loss": 0.4379, + "step": 7800, + "vm_loss": 0.162 + }, + { + "epoch": 1.5015520850879516, + "lm_loss": 0.831, + "step": 7800, + "vm_loss": 0.1933 + }, + { + "epoch": 1.5015520850879516, + "lm_loss": 0.481, + "step": 7800, + "vm_loss": 0.173 + }, + { + "epoch": 1.5017445917655268, + "grad_norm": 3.2590669875017313, + "learning_rate": 3.0804288001805693e-06, + "loss": 0.9373, + "step": 7801 + }, + { + "epoch": 1.5019370984431022, + "grad_norm": 3.2236078569900646, + "learning_rate": 3.0781782066136812e-06, + "loss": 0.9446, + "step": 7802 + }, + { + "epoch": 1.5021296051206776, + "grad_norm": 3.408332658780235, + "learning_rate": 3.075928285936427e-06, + "loss": 1.0079, + "step": 7803 + }, + { + "epoch": 1.502322111798253, + "grad_norm": 3.3325751997240185, + "learning_rate": 3.0736790383675287e-06, + "loss": 0.979, + "step": 7804 + }, + { + "epoch": 1.5025146184758285, + "grad_norm": 3.112958850337292, + "learning_rate": 3.071430464125641e-06, + "loss": 0.9326, + "step": 7805 + }, + { + "epoch": 1.5027071251534037, + "grad_norm": 3.2691658090713736, + "learning_rate": 3.069182563429355e-06, + "loss": 0.9385, + "step": 7806 + }, + { + "epoch": 1.5028996318309793, + "grad_norm": 3.250751491877489, + "learning_rate": 3.066935336497198e-06, + "loss": 0.9826, + "step": 7807 + }, + { + "epoch": 1.5030921385085545, + "grad_norm": 3.3822783336599436, + "learning_rate": 3.0646887835476226e-06, + "loss": 1.0021, + "step": 7808 + }, + { + "epoch": 1.5030921385085545, + "lm_loss": 0.6411, + "step": 7808, + "vm_loss": 0.1462 + }, + { + "epoch": 1.5030921385085545, + "lm_loss": 0.7659, + "step": 7808, + "vm_loss": 0.1423 + }, + { + "epoch": 1.5030921385085545, + "lm_loss": 0.932, + "step": 7808, + "vm_loss": 0.1846 + }, + { + "epoch": 1.5030921385085545, + "lm_loss": 0.951, + "step": 7808, + "vm_loss": 0.2108 + }, + { + "epoch": 1.5030921385085545, + "lm_loss": 1.1533, + "step": 7808, + "vm_loss": 0.1659 + }, + { + "epoch": 1.5030921385085545, + "lm_loss": 0.4582, + "step": 7808, + "vm_loss": 0.1249 + }, + { + "epoch": 1.5030921385085545, + "lm_loss": 0.9484, + "step": 7808, + "vm_loss": 0.1549 + }, + { + "epoch": 1.5030921385085545, + "lm_loss": 0.6505, + "step": 7808, + "vm_loss": 0.1667 + }, + { + "epoch": 1.50328464518613, + "grad_norm": 3.2930240304573206, + "learning_rate": 3.0624429047990255e-06, + "loss": 0.9666, + "step": 7809 + }, + { + "epoch": 1.5034771518637053, + "grad_norm": 3.1852727989615, + "learning_rate": 3.0601977004697357e-06, + "loss": 0.9158, + "step": 7810 + }, + { + "epoch": 1.5036696585412805, + "grad_norm": 3.205499384792309, + "learning_rate": 3.0579531707780162e-06, + "loss": 0.9618, + "step": 7811 + }, + { + "epoch": 1.5038621652188562, + "grad_norm": 3.205904308491133, + "learning_rate": 3.055709315942066e-06, + "loss": 0.8971, + "step": 7812 + }, + { + "epoch": 1.5040546718964314, + "grad_norm": 3.1524511368743093, + "learning_rate": 3.053466136180007e-06, + "loss": 0.909, + "step": 7813 + }, + { + "epoch": 1.5042471785740068, + "grad_norm": 3.2974488401174673, + "learning_rate": 3.0512236317099173e-06, + "loss": 0.9637, + "step": 7814 + }, + { + "epoch": 1.5044396852515822, + "grad_norm": 3.4551727605621174, + "learning_rate": 3.0489818027497895e-06, + "loss": 0.9725, + "step": 7815 + }, + { + "epoch": 1.5046321919291574, + "grad_norm": 3.4765767203988043, + "learning_rate": 3.0467406495175633e-06, + "loss": 0.9499, + "step": 7816 + }, + { + "epoch": 1.5046321919291574, + "lm_loss": 1.5307, + "step": 7816, + "vm_loss": 0.128 + }, + { + "epoch": 1.5046321919291574, + "lm_loss": 0.3988, + "step": 7816, + "vm_loss": 0.1074 + }, + { + "epoch": 1.5046321919291574, + "lm_loss": 0.5943, + "step": 7816, + "vm_loss": 0.198 + }, + { + "epoch": 1.5046321919291574, + "lm_loss": 0.8914, + "step": 7816, + "vm_loss": 0.1713 + }, + { + "epoch": 1.5046321919291574, + "lm_loss": 1.108, + "step": 7816, + "vm_loss": 0.1865 + }, + { + "epoch": 1.5046321919291574, + "lm_loss": 0.87, + "step": 7816, + "vm_loss": 0.1972 + }, + { + "epoch": 1.5046321919291574, + "lm_loss": 0.8067, + "step": 7816, + "vm_loss": 0.1169 + }, + { + "epoch": 1.5046321919291574, + "lm_loss": 0.683, + "step": 7816, + "vm_loss": 0.1919 + }, + { + "epoch": 1.504824698606733, + "grad_norm": 3.291171560745147, + "learning_rate": 3.0445001722310973e-06, + "loss": 0.9842, + "step": 7817 + }, + { + "epoch": 1.5050172052843083, + "grad_norm": 3.2814565928466948, + "learning_rate": 3.0422603711082076e-06, + "loss": 0.9925, + "step": 7818 + }, + { + "epoch": 1.5052097119618837, + "grad_norm": 3.2003445203822074, + "learning_rate": 3.0400212463666224e-06, + "loss": 0.932, + "step": 7819 + }, + { + "epoch": 1.505402218639459, + "grad_norm": 3.279519565723928, + "learning_rate": 3.037782798224015e-06, + "loss": 0.9111, + "step": 7820 + }, + { + "epoch": 1.5055947253170343, + "grad_norm": 3.125263038178421, + "learning_rate": 3.0355450268979935e-06, + "loss": 0.9054, + "step": 7821 + }, + { + "epoch": 1.50578723199461, + "grad_norm": 3.3000042329584782, + "learning_rate": 3.0333079326060997e-06, + "loss": 0.9732, + "step": 7822 + }, + { + "epoch": 1.5059797386721852, + "grad_norm": 3.395311858369942, + "learning_rate": 3.0310715155657987e-06, + "loss": 0.9734, + "step": 7823 + }, + { + "epoch": 1.5061722453497606, + "grad_norm": 3.1519466658749464, + "learning_rate": 3.028835775994512e-06, + "loss": 0.9265, + "step": 7824 + }, + { + "epoch": 1.5061722453497606, + "lm_loss": 0.9728, + "step": 7824, + "vm_loss": 0.1828 + }, + { + "epoch": 1.5061722453497606, + "lm_loss": 0.6396, + "step": 7824, + "vm_loss": 0.1997 + }, + { + "epoch": 1.5061722453497606, + "lm_loss": 0.8694, + "step": 7824, + "vm_loss": 0.1243 + }, + { + "epoch": 1.5061722453497606, + "lm_loss": 0.7613, + "step": 7824, + "vm_loss": 0.1891 + }, + { + "epoch": 1.5061722453497606, + "lm_loss": 0.5175, + "step": 7824, + "vm_loss": 0.1363 + }, + { + "epoch": 1.5061722453497606, + "lm_loss": 0.9541, + "step": 7824, + "vm_loss": 0.1782 + }, + { + "epoch": 1.5061722453497606, + "lm_loss": 0.6703, + "step": 7824, + "vm_loss": 0.1927 + }, + { + "epoch": 1.5061722453497606, + "lm_loss": 0.695, + "step": 7824, + "vm_loss": 0.1352 + }, + { + "epoch": 1.506364752027336, + "grad_norm": 3.344800185688409, + "learning_rate": 3.026600714109571e-06, + "loss": 0.9597, + "step": 7825 + }, + { + "epoch": 1.5065572587049112, + "grad_norm": 3.3540640146375025, + "learning_rate": 3.0243663301282567e-06, + "loss": 0.9829, + "step": 7826 + }, + { + "epoch": 1.5067497653824868, + "grad_norm": 3.381203114039733, + "learning_rate": 3.0221326242677797e-06, + "loss": 1.0206, + "step": 7827 + }, + { + "epoch": 1.506942272060062, + "grad_norm": 3.286401521181983, + "learning_rate": 3.019899596745284e-06, + "loss": 0.9601, + "step": 7828 + }, + { + "epoch": 1.5071347787376375, + "grad_norm": 3.362663390847099, + "learning_rate": 3.017667247777852e-06, + "loss": 1.0066, + "step": 7829 + }, + { + "epoch": 1.5073272854152129, + "grad_norm": 3.3186556926533357, + "learning_rate": 3.0154355775824916e-06, + "loss": 0.963, + "step": 7830 + }, + { + "epoch": 1.507519792092788, + "grad_norm": 3.188453438557234, + "learning_rate": 3.013204586376152e-06, + "loss": 0.9411, + "step": 7831 + }, + { + "epoch": 1.5077122987703637, + "grad_norm": 3.26417788736505, + "learning_rate": 3.010974274375712e-06, + "loss": 0.9615, + "step": 7832 + }, + { + "epoch": 1.5077122987703637, + "lm_loss": 1.3322, + "step": 7832, + "vm_loss": 0.1764 + }, + { + "epoch": 1.5077122987703637, + "lm_loss": 0.5002, + "step": 7832, + "vm_loss": 0.2413 + }, + { + "epoch": 1.5077122987703637, + "lm_loss": 0.7861, + "step": 7832, + "vm_loss": 0.2121 + }, + { + "epoch": 1.5077122987703637, + "lm_loss": 0.5722, + "step": 7832, + "vm_loss": 0.1302 + }, + { + "epoch": 1.5077122987703637, + "lm_loss": 0.4425, + "step": 7832, + "vm_loss": 0.1637 + }, + { + "epoch": 1.5077122987703637, + "lm_loss": 0.6456, + "step": 7832, + "vm_loss": 0.1836 + }, + { + "epoch": 1.5077122987703637, + "lm_loss": 0.7936, + "step": 7832, + "vm_loss": 0.1999 + }, + { + "epoch": 1.5077122987703637, + "lm_loss": 0.9341, + "step": 7832, + "vm_loss": 0.1991 + }, + { + "epoch": 1.507904805447939, + "grad_norm": 3.2556036950745004, + "learning_rate": 3.008744641797993e-06, + "loss": 0.945, + "step": 7833 + }, + { + "epoch": 1.5080973121255143, + "grad_norm": 3.2045046291032966, + "learning_rate": 3.0065156888597313e-06, + "loss": 0.9807, + "step": 7834 + }, + { + "epoch": 1.5082898188030898, + "grad_norm": 3.3143383201647274, + "learning_rate": 3.004287415777625e-06, + "loss": 0.9492, + "step": 7835 + }, + { + "epoch": 1.508482325480665, + "grad_norm": 3.346507916101847, + "learning_rate": 3.0020598227682794e-06, + "loss": 0.942, + "step": 7836 + }, + { + "epoch": 1.5086748321582406, + "grad_norm": 3.012009149299936, + "learning_rate": 2.9998329100482493e-06, + "loss": 0.8855, + "step": 7837 + }, + { + "epoch": 1.5088673388358158, + "grad_norm": 3.2302572945298125, + "learning_rate": 2.9976066778340186e-06, + "loss": 0.935, + "step": 7838 + }, + { + "epoch": 1.5090598455133912, + "grad_norm": 3.0961712975697355, + "learning_rate": 2.9953811263420087e-06, + "loss": 0.8796, + "step": 7839 + }, + { + "epoch": 1.5092523521909667, + "grad_norm": 3.2979329118484606, + "learning_rate": 2.993156255788565e-06, + "loss": 0.9715, + "step": 7840 + }, + { + "epoch": 1.5092523521909667, + "lm_loss": 0.7013, + "step": 7840, + "vm_loss": 0.1883 + }, + { + "epoch": 1.5092523521909667, + "lm_loss": 0.7238, + "step": 7840, + "vm_loss": 0.1954 + }, + { + "epoch": 1.5092523521909667, + "lm_loss": 1.207, + "step": 7840, + "vm_loss": 0.1778 + }, + { + "epoch": 1.5092523521909667, + "lm_loss": 0.6218, + "step": 7840, + "vm_loss": 0.1356 + }, + { + "epoch": 1.5092523521909667, + "lm_loss": 0.8391, + "step": 7840, + "vm_loss": 0.2006 + }, + { + "epoch": 1.5092523521909667, + "lm_loss": 1.0426, + "step": 7840, + "vm_loss": 0.1548 + }, + { + "epoch": 1.5092523521909667, + "lm_loss": 0.8236, + "step": 7840, + "vm_loss": 0.1739 + }, + { + "epoch": 1.5092523521909667, + "lm_loss": 0.82, + "step": 7840, + "vm_loss": 0.2145 + }, + { + "epoch": 1.5094448588685418, + "grad_norm": 3.298616035639577, + "learning_rate": 2.990932066389978e-06, + "loss": 0.9608, + "step": 7841 + }, + { + "epoch": 1.5096373655461175, + "grad_norm": 3.333384899053493, + "learning_rate": 2.9887085583624655e-06, + "loss": 0.9888, + "step": 7842 + }, + { + "epoch": 1.5098298722236927, + "grad_norm": 3.2292780290333267, + "learning_rate": 2.9864857319221852e-06, + "loss": 0.9637, + "step": 7843 + }, + { + "epoch": 1.5100223789012681, + "grad_norm": 3.2134135467500458, + "learning_rate": 2.9842635872852135e-06, + "loss": 0.8908, + "step": 7844 + }, + { + "epoch": 1.5102148855788435, + "grad_norm": 3.2092819128941, + "learning_rate": 2.9820421246675855e-06, + "loss": 0.9236, + "step": 7845 + }, + { + "epoch": 1.5104073922564187, + "grad_norm": 3.3036654409188912, + "learning_rate": 2.9798213442852466e-06, + "loss": 0.9492, + "step": 7846 + }, + { + "epoch": 1.5105998989339944, + "grad_norm": 3.40963758868991, + "learning_rate": 2.977601246354086e-06, + "loss": 0.9789, + "step": 7847 + }, + { + "epoch": 1.5107924056115696, + "grad_norm": 3.149603980505544, + "learning_rate": 2.9753818310899276e-06, + "loss": 0.928, + "step": 7848 + }, + { + "epoch": 1.5107924056115696, + "lm_loss": 0.5397, + "step": 7848, + "vm_loss": 0.1543 + }, + { + "epoch": 1.5107924056115696, + "lm_loss": 0.6126, + "step": 7848, + "vm_loss": 0.2447 + }, + { + "epoch": 1.5107924056115696, + "lm_loss": 0.5625, + "step": 7848, + "vm_loss": 0.1749 + }, + { + "epoch": 1.5107924056115696, + "lm_loss": 0.3023, + "step": 7848, + "vm_loss": 0.1321 + }, + { + "epoch": 1.5107924056115696, + "lm_loss": 0.6832, + "step": 7848, + "vm_loss": 0.1275 + }, + { + "epoch": 1.5107924056115696, + "lm_loss": 0.6294, + "step": 7848, + "vm_loss": 0.1636 + }, + { + "epoch": 1.5107924056115696, + "lm_loss": 0.8626, + "step": 7848, + "vm_loss": 0.1262 + }, + { + "epoch": 1.5107924056115696, + "lm_loss": 0.7261, + "step": 7848, + "vm_loss": 0.1813 + }, + { + "epoch": 1.510984912289145, + "grad_norm": 3.175547072125256, + "learning_rate": 2.973163098708527e-06, + "loss": 0.9235, + "step": 7849 + }, + { + "epoch": 1.5111774189667204, + "grad_norm": 3.3815955939935405, + "learning_rate": 2.9709450494255744e-06, + "loss": 0.9804, + "step": 7850 + }, + { + "epoch": 1.5113699256442958, + "grad_norm": 3.038392663111802, + "learning_rate": 2.9687276834566856e-06, + "loss": 0.9041, + "step": 7851 + }, + { + "epoch": 1.5115624323218713, + "grad_norm": 3.211710795979048, + "learning_rate": 2.966511001017428e-06, + "loss": 0.9654, + "step": 7852 + }, + { + "epoch": 1.5117549389994465, + "grad_norm": 3.15260849877099, + "learning_rate": 2.964295002323283e-06, + "loss": 0.9459, + "step": 7853 + }, + { + "epoch": 1.5119474456770219, + "grad_norm": 3.250671400542431, + "learning_rate": 2.962079687589676e-06, + "loss": 0.9721, + "step": 7854 + }, + { + "epoch": 1.5121399523545973, + "grad_norm": 3.258175969691289, + "learning_rate": 2.9598650570319653e-06, + "loss": 0.9353, + "step": 7855 + }, + { + "epoch": 1.5123324590321727, + "grad_norm": 3.4596913053078513, + "learning_rate": 2.9576511108654436e-06, + "loss": 0.9975, + "step": 7856 + }, + { + "epoch": 1.5123324590321727, + "lm_loss": 0.7401, + "step": 7856, + "vm_loss": 0.1658 + }, + { + "epoch": 1.5123324590321727, + "lm_loss": 0.7333, + "step": 7856, + "vm_loss": 0.1974 + }, + { + "epoch": 1.5123324590321727, + "lm_loss": 0.8848, + "step": 7856, + "vm_loss": 0.1338 + }, + { + "epoch": 1.5123324590321727, + "lm_loss": 1.2021, + "step": 7856, + "vm_loss": 0.1092 + }, + { + "epoch": 1.5123324590321727, + "lm_loss": 0.6534, + "step": 7856, + "vm_loss": 0.1662 + }, + { + "epoch": 1.5123324590321727, + "lm_loss": 1.0, + "step": 7856, + "vm_loss": 0.2015 + }, + { + "epoch": 1.5123324590321727, + "lm_loss": 0.5091, + "step": 7856, + "vm_loss": 0.1313 + }, + { + "epoch": 1.5123324590321727, + "lm_loss": 0.7526, + "step": 7856, + "vm_loss": 0.1396 + }, + { + "epoch": 1.5125249657097481, + "grad_norm": 3.0854046974490945, + "learning_rate": 2.955437849305329e-06, + "loss": 0.9188, + "step": 7857 + }, + { + "epoch": 1.5127174723873233, + "grad_norm": 3.2122323287307184, + "learning_rate": 2.953225272566782e-06, + "loss": 0.9503, + "step": 7858 + }, + { + "epoch": 1.5129099790648988, + "grad_norm": 3.0845922810565005, + "learning_rate": 2.9510133808648923e-06, + "loss": 0.9004, + "step": 7859 + }, + { + "epoch": 1.5131024857424742, + "grad_norm": 3.3478893482677243, + "learning_rate": 2.948802174414688e-06, + "loss": 1.0322, + "step": 7860 + }, + { + "epoch": 1.5132949924200496, + "grad_norm": 3.2317287730803086, + "learning_rate": 2.9465916534311177e-06, + "loss": 0.909, + "step": 7861 + }, + { + "epoch": 1.513487499097625, + "grad_norm": 3.153682051162729, + "learning_rate": 2.944381818129084e-06, + "loss": 0.948, + "step": 7862 + }, + { + "epoch": 1.5136800057752002, + "grad_norm": 3.2725592119711675, + "learning_rate": 2.9421726687234033e-06, + "loss": 0.97, + "step": 7863 + }, + { + "epoch": 1.5138725124527757, + "grad_norm": 3.4323394694233187, + "learning_rate": 2.9399642054288336e-06, + "loss": 1.0502, + "step": 7864 + }, + { + "epoch": 1.5138725124527757, + "lm_loss": 0.6883, + "step": 7864, + "vm_loss": 0.1303 + }, + { + "epoch": 1.5138725124527757, + "lm_loss": 0.5412, + "step": 7864, + "vm_loss": 0.234 + }, + { + "epoch": 1.5138725124527757, + "lm_loss": 1.0955, + "step": 7864, + "vm_loss": 0.1701 + }, + { + "epoch": 1.5138725124527757, + "lm_loss": 0.775, + "step": 7864, + "vm_loss": 0.1522 + }, + { + "epoch": 1.5138725124527757, + "lm_loss": 0.5132, + "step": 7864, + "vm_loss": 0.098 + }, + { + "epoch": 1.5138725124527757, + "lm_loss": 0.6865, + "step": 7864, + "vm_loss": 0.1435 + }, + { + "epoch": 1.5138725124527757, + "lm_loss": 0.5881, + "step": 7864, + "vm_loss": 0.1623 + }, + { + "epoch": 1.5138725124527757, + "lm_loss": 0.7141, + "step": 7864, + "vm_loss": 0.1306 + }, + { + "epoch": 1.514065019130351, + "grad_norm": 3.324281687103269, + "learning_rate": 2.937756428460068e-06, + "loss": 0.9883, + "step": 7865 + }, + { + "epoch": 1.5142575258079265, + "grad_norm": 3.1941463767188365, + "learning_rate": 2.9355493380317334e-06, + "loss": 0.9133, + "step": 7866 + }, + { + "epoch": 1.514450032485502, + "grad_norm": 3.129856805838578, + "learning_rate": 2.933342934358381e-06, + "loss": 0.9373, + "step": 7867 + }, + { + "epoch": 1.5146425391630771, + "grad_norm": 3.293173417884313, + "learning_rate": 2.9311372176545028e-06, + "loss": 0.9619, + "step": 7868 + }, + { + "epoch": 1.5148350458406528, + "grad_norm": 3.1964471208157126, + "learning_rate": 2.9289321881345257e-06, + "loss": 0.9447, + "step": 7869 + }, + { + "epoch": 1.515027552518228, + "grad_norm": 3.3227774032784643, + "learning_rate": 2.9267278460128045e-06, + "loss": 0.9729, + "step": 7870 + }, + { + "epoch": 1.5152200591958034, + "grad_norm": 3.1392236777527724, + "learning_rate": 2.9245241915036304e-06, + "loss": 0.8933, + "step": 7871 + }, + { + "epoch": 1.5154125658733788, + "grad_norm": 3.270933592700056, + "learning_rate": 2.9223212248212275e-06, + "loss": 0.9828, + "step": 7872 + }, + { + "epoch": 1.5154125658733788, + "lm_loss": 0.372, + "step": 7872, + "vm_loss": 0.1465 + }, + { + "epoch": 1.5154125658733788, + "lm_loss": 0.6552, + "step": 7872, + "vm_loss": 0.1895 + }, + { + "epoch": 1.5154125658733788, + "lm_loss": 0.8957, + "step": 7872, + "vm_loss": 0.186 + }, + { + "epoch": 1.5154125658733788, + "lm_loss": 0.3996, + "step": 7872, + "vm_loss": 0.1566 + }, + { + "epoch": 1.5154125658733788, + "lm_loss": 0.9829, + "step": 7872, + "vm_loss": 0.1666 + }, + { + "epoch": 1.5154125658733788, + "lm_loss": 0.5987, + "step": 7872, + "vm_loss": 0.2297 + }, + { + "epoch": 1.5154125658733788, + "lm_loss": 1.1111, + "step": 7872, + "vm_loss": 0.1945 + }, + { + "epoch": 1.5154125658733788, + "lm_loss": 0.8696, + "step": 7872, + "vm_loss": 0.1477 + }, + { + "epoch": 1.515605072550954, + "grad_norm": 3.1978790823637406, + "learning_rate": 2.920118946179754e-06, + "loss": 0.957, + "step": 7873 + }, + { + "epoch": 1.5157975792285296, + "grad_norm": 3.314784829486023, + "learning_rate": 2.917917355793294e-06, + "loss": 0.9538, + "step": 7874 + }, + { + "epoch": 1.5159900859061048, + "grad_norm": 3.1440310455661353, + "learning_rate": 2.915716453875873e-06, + "loss": 0.9397, + "step": 7875 + }, + { + "epoch": 1.5161825925836803, + "grad_norm": 3.358587416688107, + "learning_rate": 2.9135162406414476e-06, + "loss": 0.9508, + "step": 7876 + }, + { + "epoch": 1.5163750992612557, + "grad_norm": 3.5239765529201366, + "learning_rate": 2.911316716303908e-06, + "loss": 1.0264, + "step": 7877 + }, + { + "epoch": 1.5165676059388309, + "grad_norm": 3.24052667970497, + "learning_rate": 2.909117881077069e-06, + "loss": 0.9333, + "step": 7878 + }, + { + "epoch": 1.5167601126164065, + "grad_norm": 3.0790327228456564, + "learning_rate": 2.906919735174698e-06, + "loss": 0.892, + "step": 7879 + }, + { + "epoch": 1.5169526192939817, + "grad_norm": 3.2202527269056747, + "learning_rate": 2.9047222788104712e-06, + "loss": 0.9154, + "step": 7880 + }, + { + "epoch": 1.5169526192939817, + "lm_loss": 0.7989, + "step": 7880, + "vm_loss": 0.1923 + }, + { + "epoch": 1.5169526192939817, + "lm_loss": 0.7383, + "step": 7880, + "vm_loss": 0.1622 + }, + { + "epoch": 1.5169526192939817, + "lm_loss": 0.7612, + "step": 7880, + "vm_loss": 0.1432 + }, + { + "epoch": 1.5169526192939817, + "lm_loss": 0.5935, + "step": 7880, + "vm_loss": 0.1424 + }, + { + "epoch": 1.5169526192939817, + "lm_loss": 0.7328, + "step": 7880, + "vm_loss": 0.1514 + }, + { + "epoch": 1.5169526192939817, + "lm_loss": 0.8368, + "step": 7880, + "vm_loss": 0.1861 + }, + { + "epoch": 1.5169526192939817, + "lm_loss": 0.5921, + "step": 7880, + "vm_loss": 0.1597 + }, + { + "epoch": 1.5169526192939817, + "lm_loss": 0.6966, + "step": 7880, + "vm_loss": 0.2142 + }, + { + "epoch": 1.5171451259715572, + "grad_norm": 3.212063517588743, + "learning_rate": 2.9025255121980155e-06, + "loss": 0.9165, + "step": 7881 + }, + { + "epoch": 1.5173376326491326, + "grad_norm": 3.2564129852326755, + "learning_rate": 2.9003294355508816e-06, + "loss": 0.9436, + "step": 7882 + }, + { + "epoch": 1.5175301393267078, + "grad_norm": 3.4035160443072385, + "learning_rate": 2.8981340490825616e-06, + "loss": 1.0312, + "step": 7883 + }, + { + "epoch": 1.5177226460042834, + "grad_norm": 3.306536776510789, + "learning_rate": 2.8959393530064696e-06, + "loss": 0.9688, + "step": 7884 + }, + { + "epoch": 1.5179151526818586, + "grad_norm": 3.19413145345373, + "learning_rate": 2.8937453475359578e-06, + "loss": 0.8942, + "step": 7885 + }, + { + "epoch": 1.518107659359434, + "grad_norm": 3.2393458137153597, + "learning_rate": 2.891552032884315e-06, + "loss": 0.934, + "step": 7886 + }, + { + "epoch": 1.5183001660370095, + "grad_norm": 3.0957271210821227, + "learning_rate": 2.8893594092647616e-06, + "loss": 0.8787, + "step": 7887 + }, + { + "epoch": 1.5184926727145847, + "grad_norm": 3.203300721557911, + "learning_rate": 2.8871674768904377e-06, + "loss": 0.9614, + "step": 7888 + }, + { + "epoch": 1.5184926727145847, + "lm_loss": 0.7213, + "step": 7888, + "vm_loss": 0.155 + }, + { + "epoch": 1.5184926727145847, + "lm_loss": 1.2036, + "step": 7888, + "vm_loss": 0.1552 + }, + { + "epoch": 1.5184926727145847, + "lm_loss": 0.9483, + "step": 7888, + "vm_loss": 0.1819 + }, + { + "epoch": 1.5184926727145847, + "lm_loss": 0.7984, + "step": 7888, + "vm_loss": 0.1895 + }, + { + "epoch": 1.5184926727145847, + "lm_loss": 0.9461, + "step": 7888, + "vm_loss": 0.1847 + }, + { + "epoch": 1.5184926727145847, + "lm_loss": 0.4063, + "step": 7888, + "vm_loss": 0.1541 + }, + { + "epoch": 1.5184926727145847, + "lm_loss": 1.0366, + "step": 7888, + "vm_loss": 0.2027 + }, + { + "epoch": 1.5184926727145847, + "lm_loss": 1.0646, + "step": 7888, + "vm_loss": 0.1418 + }, + { + "epoch": 1.5186851793921603, + "grad_norm": 3.2783994253213042, + "learning_rate": 2.8849762359744426e-06, + "loss": 0.9325, + "step": 7889 + }, + { + "epoch": 1.5188776860697355, + "grad_norm": 3.119216990453225, + "learning_rate": 2.882785686729781e-06, + "loss": 0.8994, + "step": 7890 + }, + { + "epoch": 1.519070192747311, + "grad_norm": 3.4122065764179554, + "learning_rate": 2.880595829369406e-06, + "loss": 0.9633, + "step": 7891 + }, + { + "epoch": 1.5192626994248863, + "grad_norm": 3.4339113385605677, + "learning_rate": 2.8784066641062013e-06, + "loss": 0.9425, + "step": 7892 + }, + { + "epoch": 1.5194552061024615, + "grad_norm": 3.412221430504424, + "learning_rate": 2.8762181911529798e-06, + "loss": 0.9181, + "step": 7893 + }, + { + "epoch": 1.5196477127800372, + "grad_norm": 3.2228622687664474, + "learning_rate": 2.8740304107224936e-06, + "loss": 0.8988, + "step": 7894 + }, + { + "epoch": 1.5198402194576124, + "grad_norm": 3.4439362547727788, + "learning_rate": 2.8718433230274124e-06, + "loss": 0.9846, + "step": 7895 + }, + { + "epoch": 1.5200327261351878, + "grad_norm": 3.2469484901917363, + "learning_rate": 2.8696569282803622e-06, + "loss": 0.9403, + "step": 7896 + }, + { + "epoch": 1.5200327261351878, + "lm_loss": 0.7343, + "step": 7896, + "vm_loss": 0.1046 + }, + { + "epoch": 1.5200327261351878, + "lm_loss": 1.0578, + "step": 7896, + "vm_loss": 0.1227 + }, + { + "epoch": 1.5200327261351878, + "lm_loss": 1.0158, + "step": 7896, + "vm_loss": 0.1476 + }, + { + "epoch": 1.5200327261351878, + "lm_loss": 0.6039, + "step": 7896, + "vm_loss": 0.1759 + }, + { + "epoch": 1.5200327261351878, + "lm_loss": 0.8944, + "step": 7896, + "vm_loss": 0.213 + }, + { + "epoch": 1.5200327261351878, + "lm_loss": 0.7894, + "step": 7896, + "vm_loss": 0.1591 + }, + { + "epoch": 1.5200327261351878, + "lm_loss": 0.4928, + "step": 7896, + "vm_loss": 0.1714 + }, + { + "epoch": 1.5200327261351878, + "lm_loss": 0.811, + "step": 7896, + "vm_loss": 0.1959 + }, + { + "epoch": 1.5202252328127632, + "grad_norm": 3.143612158537478, + "learning_rate": 2.8674712266938797e-06, + "loss": 0.9297, + "step": 7897 + }, + { + "epoch": 1.5204177394903384, + "grad_norm": 3.213090765660677, + "learning_rate": 2.8652862184804452e-06, + "loss": 0.9569, + "step": 7898 + }, + { + "epoch": 1.520610246167914, + "grad_norm": 3.1761350812790243, + "learning_rate": 2.863101903852471e-06, + "loss": 0.9358, + "step": 7899 + }, + { + "epoch": 1.5208027528454893, + "grad_norm": 3.37648226987177, + "learning_rate": 2.8609182830223015e-06, + "loss": 0.9764, + "step": 7900 + }, + { + "epoch": 1.5209952595230647, + "grad_norm": 3.287643373656489, + "learning_rate": 2.858735356202207e-06, + "loss": 0.9576, + "step": 7901 + }, + { + "epoch": 1.5211877662006401, + "grad_norm": 3.3216958851653082, + "learning_rate": 2.8565531236043997e-06, + "loss": 0.9921, + "step": 7902 + }, + { + "epoch": 1.5213802728782153, + "grad_norm": 3.210463538273395, + "learning_rate": 2.8543715854410203e-06, + "loss": 0.9243, + "step": 7903 + }, + { + "epoch": 1.521572779555791, + "grad_norm": 3.3266262878978288, + "learning_rate": 2.8521907419241445e-06, + "loss": 0.9833, + "step": 7904 + }, + { + "epoch": 1.521572779555791, + "lm_loss": 0.8762, + "step": 7904, + "vm_loss": 0.1526 + }, + { + "epoch": 1.521572779555791, + "lm_loss": 1.1756, + "step": 7904, + "vm_loss": 0.2393 + }, + { + "epoch": 1.521572779555791, + "lm_loss": 0.5915, + "step": 7904, + "vm_loss": 0.1429 + }, + { + "epoch": 1.521572779555791, + "lm_loss": 0.6991, + "step": 7904, + "vm_loss": 0.1429 + }, + { + "epoch": 1.521572779555791, + "lm_loss": 0.4825, + "step": 7904, + "vm_loss": 0.1226 + }, + { + "epoch": 1.521572779555791, + "lm_loss": 0.9025, + "step": 7904, + "vm_loss": 0.1788 + }, + { + "epoch": 1.521572779555791, + "lm_loss": 0.8409, + "step": 7904, + "vm_loss": 0.1685 + }, + { + "epoch": 1.521572779555791, + "lm_loss": 1.1326, + "step": 7904, + "vm_loss": 0.1662 + }, + { + "epoch": 1.5217652862333662, + "grad_norm": 3.200664760887337, + "learning_rate": 2.8500105932657706e-06, + "loss": 0.9314, + "step": 7905 + }, + { + "epoch": 1.5219577929109416, + "grad_norm": 3.1841842542374605, + "learning_rate": 2.847831139677848e-06, + "loss": 0.9176, + "step": 7906 + }, + { + "epoch": 1.522150299588517, + "grad_norm": 3.1638362796839776, + "learning_rate": 2.845652381372238e-06, + "loss": 0.96, + "step": 7907 + }, + { + "epoch": 1.5223428062660922, + "grad_norm": 3.3740739708409864, + "learning_rate": 2.8434743185607483e-06, + "loss": 0.9908, + "step": 7908 + }, + { + "epoch": 1.5225353129436678, + "grad_norm": 3.416488671591386, + "learning_rate": 2.8412969514551124e-06, + "loss": 1.0287, + "step": 7909 + }, + { + "epoch": 1.522727819621243, + "grad_norm": 3.195275143905709, + "learning_rate": 2.8391202802670025e-06, + "loss": 0.8638, + "step": 7910 + }, + { + "epoch": 1.5229203262988185, + "grad_norm": 3.4105116413427954, + "learning_rate": 2.8369443052080135e-06, + "loss": 0.9716, + "step": 7911 + }, + { + "epoch": 1.5231128329763939, + "grad_norm": 3.1091468107279936, + "learning_rate": 2.83476902648968e-06, + "loss": 0.8899, + "step": 7912 + }, + { + "epoch": 1.5231128329763939, + "lm_loss": 0.7429, + "step": 7912, + "vm_loss": 0.1948 + }, + { + "epoch": 1.5231128329763939, + "lm_loss": 0.6468, + "step": 7912, + "vm_loss": 0.1659 + }, + { + "epoch": 1.5231128329763939, + "lm_loss": 0.586, + "step": 7912, + "vm_loss": 0.1402 + }, + { + "epoch": 1.5231128329763939, + "lm_loss": 0.57, + "step": 7912, + "vm_loss": 0.1817 + }, + { + "epoch": 1.5231128329763939, + "lm_loss": 0.9127, + "step": 7912, + "vm_loss": 0.1374 + }, + { + "epoch": 1.5231128329763939, + "lm_loss": 0.3617, + "step": 7912, + "vm_loss": 0.1186 + }, + { + "epoch": 1.5231128329763939, + "lm_loss": 0.7489, + "step": 7912, + "vm_loss": 0.1425 + }, + { + "epoch": 1.5231128329763939, + "lm_loss": 0.5803, + "step": 7912, + "vm_loss": 0.1347 + }, + { + "epoch": 1.523305339653969, + "grad_norm": 3.3998330070262215, + "learning_rate": 2.8325944443234674e-06, + "loss": 0.9365, + "step": 7913 + }, + { + "epoch": 1.5234978463315447, + "grad_norm": 3.328980404324694, + "learning_rate": 2.830420558920772e-06, + "loss": 0.9587, + "step": 7914 + }, + { + "epoch": 1.52369035300912, + "grad_norm": 3.1617280526809552, + "learning_rate": 2.8282473704929283e-06, + "loss": 0.9315, + "step": 7915 + }, + { + "epoch": 1.5238828596866953, + "grad_norm": 3.2725409445742972, + "learning_rate": 2.826074879251186e-06, + "loss": 0.9593, + "step": 7916 + }, + { + "epoch": 1.5240753663642708, + "grad_norm": 3.172251503528973, + "learning_rate": 2.823903085406754e-06, + "loss": 0.9017, + "step": 7917 + }, + { + "epoch": 1.5242678730418462, + "grad_norm": 3.1936853799073064, + "learning_rate": 2.8217319891707474e-06, + "loss": 0.9365, + "step": 7918 + }, + { + "epoch": 1.5244603797194216, + "grad_norm": 3.386063629209984, + "learning_rate": 2.8195615907542285e-06, + "loss": 0.9875, + "step": 7919 + }, + { + "epoch": 1.5246528863969968, + "grad_norm": 3.0335218716968892, + "learning_rate": 2.8173918903681874e-06, + "loss": 0.8948, + "step": 7920 + }, + { + "epoch": 1.5246528863969968, + "lm_loss": 1.2054, + "step": 7920, + "vm_loss": 0.1947 + }, + { + "epoch": 1.5246528863969968, + "lm_loss": 0.875, + "step": 7920, + "vm_loss": 0.1303 + }, + { + "epoch": 1.5246528863969968, + "lm_loss": 0.3873, + "step": 7920, + "vm_loss": 0.106 + }, + { + "epoch": 1.5246528863969968, + "lm_loss": 1.2146, + "step": 7920, + "vm_loss": 0.1552 + }, + { + "epoch": 1.5246528863969968, + "lm_loss": 0.9866, + "step": 7920, + "vm_loss": 0.141 + }, + { + "epoch": 1.5246528863969968, + "lm_loss": 0.6228, + "step": 7920, + "vm_loss": 0.2196 + }, + { + "epoch": 1.5246528863969968, + "lm_loss": 1.0023, + "step": 7920, + "vm_loss": 0.1723 + }, + { + "epoch": 1.5246528863969968, + "lm_loss": 0.8476, + "step": 7920, + "vm_loss": 0.154 + }, + { + "epoch": 1.5248453930745722, + "grad_norm": 3.4371834327541237, + "learning_rate": 2.8152228882235512e-06, + "loss": 1.0209, + "step": 7921 + }, + { + "epoch": 1.5250378997521477, + "grad_norm": 3.218700290989853, + "learning_rate": 2.813054584531163e-06, + "loss": 0.9181, + "step": 7922 + }, + { + "epoch": 1.525230406429723, + "grad_norm": 3.2618155870182584, + "learning_rate": 2.8108869795018233e-06, + "loss": 0.9354, + "step": 7923 + }, + { + "epoch": 1.5254229131072985, + "grad_norm": 3.1782015142158144, + "learning_rate": 2.8087200733462427e-06, + "loss": 0.9015, + "step": 7924 + }, + { + "epoch": 1.5256154197848737, + "grad_norm": 3.1426280150538646, + "learning_rate": 2.806553866275077e-06, + "loss": 0.8952, + "step": 7925 + }, + { + "epoch": 1.5258079264624491, + "grad_norm": 3.2162292050820533, + "learning_rate": 2.8043883584988996e-06, + "loss": 0.9495, + "step": 7926 + }, + { + "epoch": 1.5260004331400245, + "grad_norm": 3.2438556475297, + "learning_rate": 2.802223550228239e-06, + "loss": 0.9145, + "step": 7927 + }, + { + "epoch": 1.5261929398176, + "grad_norm": 3.2794762771581363, + "learning_rate": 2.800059441673533e-06, + "loss": 0.959, + "step": 7928 + }, + { + "epoch": 1.5261929398176, + "lm_loss": 0.5727, + "step": 7928, + "vm_loss": 0.1715 + }, + { + "epoch": 1.5261929398176, + "lm_loss": 1.0174, + "step": 7928, + "vm_loss": 0.1412 + }, + { + "epoch": 1.5261929398176, + "lm_loss": 0.6226, + "step": 7928, + "vm_loss": 0.1729 + }, + { + "epoch": 1.5261929398176, + "lm_loss": 0.7331, + "step": 7928, + "vm_loss": 0.1596 + }, + { + "epoch": 1.5261929398176, + "lm_loss": 0.6039, + "step": 7928, + "vm_loss": 0.1756 + }, + { + "epoch": 1.5261929398176, + "lm_loss": 0.6562, + "step": 7928, + "vm_loss": 0.2045 + }, + { + "epoch": 1.5261929398176, + "lm_loss": 0.7794, + "step": 7928, + "vm_loss": 0.1581 + }, + { + "epoch": 1.5261929398176, + "lm_loss": 1.4351, + "step": 7928, + "vm_loss": 0.1456 + }, + { + "epoch": 1.5263854464951754, + "grad_norm": 3.193255170579407, + "learning_rate": 2.797896033045163e-06, + "loss": 0.9079, + "step": 7929 + }, + { + "epoch": 1.5265779531727506, + "grad_norm": 3.2150010122468755, + "learning_rate": 2.795733324553441e-06, + "loss": 0.9547, + "step": 7930 + }, + { + "epoch": 1.5267704598503262, + "grad_norm": 3.1670579134807415, + "learning_rate": 2.7935713164086086e-06, + "loss": 0.9161, + "step": 7931 + }, + { + "epoch": 1.5269629665279014, + "grad_norm": 3.488154195621208, + "learning_rate": 2.791410008820845e-06, + "loss": 1.0134, + "step": 7932 + }, + { + "epoch": 1.5271554732054768, + "grad_norm": 3.2146959501058037, + "learning_rate": 2.7892494020002505e-06, + "loss": 0.8919, + "step": 7933 + }, + { + "epoch": 1.5273479798830523, + "grad_norm": 3.169590916399676, + "learning_rate": 2.7870894961568662e-06, + "loss": 0.9452, + "step": 7934 + }, + { + "epoch": 1.5275404865606275, + "grad_norm": 3.1514412199949264, + "learning_rate": 2.7849302915006626e-06, + "loss": 0.8826, + "step": 7935 + }, + { + "epoch": 1.527732993238203, + "grad_norm": 3.2367947803501167, + "learning_rate": 2.782771788241543e-06, + "loss": 0.9734, + "step": 7936 + }, + { + "epoch": 1.527732993238203, + "lm_loss": 0.7398, + "step": 7936, + "vm_loss": 0.2004 + }, + { + "epoch": 1.527732993238203, + "lm_loss": 0.6008, + "step": 7936, + "vm_loss": 0.1049 + }, + { + "epoch": 1.527732993238203, + "lm_loss": 0.4836, + "step": 7936, + "vm_loss": 0.1842 + }, + { + "epoch": 1.527732993238203, + "lm_loss": 0.7368, + "step": 7936, + "vm_loss": 0.1328 + }, + { + "epoch": 1.527732993238203, + "lm_loss": 0.4844, + "step": 7936, + "vm_loss": 0.183 + }, + { + "epoch": 1.527732993238203, + "lm_loss": 0.6611, + "step": 7936, + "vm_loss": 0.1039 + }, + { + "epoch": 1.527732993238203, + "lm_loss": 0.7487, + "step": 7936, + "vm_loss": 0.168 + }, + { + "epoch": 1.527732993238203, + "lm_loss": 0.5715, + "step": 7936, + "vm_loss": 0.1831 + }, + { + "epoch": 1.5279254999157783, + "grad_norm": 3.3783687918573957, + "learning_rate": 2.780613986589341e-06, + "loss": 0.9606, + "step": 7937 + }, + { + "epoch": 1.5281180065933537, + "grad_norm": 3.3513477808613326, + "learning_rate": 2.778456886753824e-06, + "loss": 0.9474, + "step": 7938 + }, + { + "epoch": 1.5283105132709291, + "grad_norm": 3.353058471605618, + "learning_rate": 2.7763004889446845e-06, + "loss": 0.9644, + "step": 7939 + }, + { + "epoch": 1.5285030199485043, + "grad_norm": 3.3199404914339503, + "learning_rate": 2.7741447933715595e-06, + "loss": 0.9534, + "step": 7940 + }, + { + "epoch": 1.52869552662608, + "grad_norm": 3.1643395875412033, + "learning_rate": 2.7719898002440037e-06, + "loss": 0.9612, + "step": 7941 + }, + { + "epoch": 1.5288880333036552, + "grad_norm": 3.141052928744899, + "learning_rate": 2.7698355097715167e-06, + "loss": 0.9364, + "step": 7942 + }, + { + "epoch": 1.5290805399812306, + "grad_norm": 3.3226621707174906, + "learning_rate": 2.767681922163512e-06, + "loss": 1.0054, + "step": 7943 + }, + { + "epoch": 1.529273046658806, + "grad_norm": 3.2824671922228235, + "learning_rate": 2.765529037629359e-06, + "loss": 0.9463, + "step": 7944 + }, + { + "epoch": 1.529273046658806, + "lm_loss": 0.838, + "step": 7944, + "vm_loss": 0.1811 + }, + { + "epoch": 1.529273046658806, + "lm_loss": 0.9024, + "step": 7944, + "vm_loss": 0.1637 + }, + { + "epoch": 1.529273046658806, + "lm_loss": 0.7319, + "step": 7944, + "vm_loss": 0.2005 + }, + { + "epoch": 1.529273046658806, + "lm_loss": 0.9065, + "step": 7944, + "vm_loss": 0.223 + }, + { + "epoch": 1.529273046658806, + "lm_loss": 0.7323, + "step": 7944, + "vm_loss": 0.2251 + }, + { + "epoch": 1.529273046658806, + "lm_loss": 1.0154, + "step": 7944, + "vm_loss": 0.1485 + }, + { + "epoch": 1.529273046658806, + "lm_loss": 0.666, + "step": 7944, + "vm_loss": 0.1367 + }, + { + "epoch": 1.529273046658806, + "lm_loss": 0.9149, + "step": 7944, + "vm_loss": 0.1012 + }, + { + "epoch": 1.5294655533363812, + "grad_norm": 3.216402170517718, + "learning_rate": 2.7633768563783357e-06, + "loss": 0.9316, + "step": 7945 + }, + { + "epoch": 1.5296580600139569, + "grad_norm": 3.2903178911149498, + "learning_rate": 2.7612253786196663e-06, + "loss": 0.9497, + "step": 7946 + }, + { + "epoch": 1.529850566691532, + "grad_norm": 3.107944032566568, + "learning_rate": 2.7590746045625004e-06, + "loss": 0.8819, + "step": 7947 + }, + { + "epoch": 1.5300430733691075, + "grad_norm": 3.0358265909040303, + "learning_rate": 2.756924534415926e-06, + "loss": 0.8595, + "step": 7948 + }, + { + "epoch": 1.530235580046683, + "grad_norm": 3.362489582509592, + "learning_rate": 2.7547751683889478e-06, + "loss": 0.9955, + "step": 7949 + }, + { + "epoch": 1.5304280867242581, + "grad_norm": 3.368545454685749, + "learning_rate": 2.7526265066905177e-06, + "loss": 0.9856, + "step": 7950 + }, + { + "epoch": 1.5306205934018338, + "grad_norm": 3.3648455786830325, + "learning_rate": 2.7504785495295127e-06, + "loss": 0.9612, + "step": 7951 + }, + { + "epoch": 1.530813100079409, + "grad_norm": 3.0460179200882087, + "learning_rate": 2.7483312971147403e-06, + "loss": 0.8983, + "step": 7952 + }, + { + "epoch": 1.530813100079409, + "lm_loss": 0.5829, + "step": 7952, + "vm_loss": 0.1456 + }, + { + "epoch": 1.530813100079409, + "lm_loss": 0.7443, + "step": 7952, + "vm_loss": 0.2037 + }, + { + "epoch": 1.530813100079409, + "lm_loss": 1.0858, + "step": 7952, + "vm_loss": 0.1817 + }, + { + "epoch": 1.530813100079409, + "lm_loss": 0.7118, + "step": 7952, + "vm_loss": 0.1551 + }, + { + "epoch": 1.530813100079409, + "lm_loss": 0.5233, + "step": 7952, + "vm_loss": 0.2523 + }, + { + "epoch": 1.530813100079409, + "lm_loss": 0.9803, + "step": 7952, + "vm_loss": 0.202 + }, + { + "epoch": 1.530813100079409, + "lm_loss": 1.1859, + "step": 7952, + "vm_loss": 0.1098 + }, + { + "epoch": 1.530813100079409, + "lm_loss": 1.1364, + "step": 7952, + "vm_loss": 0.1543 + }, + { + "epoch": 1.5310056067569844, + "grad_norm": 3.3848290380901456, + "learning_rate": 2.746184749654942e-06, + "loss": 0.996, + "step": 7953 + }, + { + "epoch": 1.5311981134345598, + "grad_norm": 3.1296851778905186, + "learning_rate": 2.7440389073587927e-06, + "loss": 0.9301, + "step": 7954 + }, + { + "epoch": 1.531390620112135, + "grad_norm": 3.1792388112306456, + "learning_rate": 2.741893770434889e-06, + "loss": 0.9201, + "step": 7955 + }, + { + "epoch": 1.5315831267897106, + "grad_norm": 3.298553622733847, + "learning_rate": 2.7397493390917694e-06, + "loss": 0.9612, + "step": 7956 + }, + { + "epoch": 1.5317756334672858, + "grad_norm": 3.2565758429365514, + "learning_rate": 2.737605613537899e-06, + "loss": 0.9204, + "step": 7957 + }, + { + "epoch": 1.5319681401448613, + "grad_norm": 3.2749339786733027, + "learning_rate": 2.735462593981677e-06, + "loss": 0.9502, + "step": 7958 + }, + { + "epoch": 1.5321606468224367, + "grad_norm": 3.0270652651930816, + "learning_rate": 2.733320280631434e-06, + "loss": 0.8678, + "step": 7959 + }, + { + "epoch": 1.5323531535000119, + "grad_norm": 3.253472602030513, + "learning_rate": 2.7311786736954228e-06, + "loss": 0.9464, + "step": 7960 + }, + { + "epoch": 1.5323531535000119, + "lm_loss": 0.8057, + "step": 7960, + "vm_loss": 0.125 + }, + { + "epoch": 1.5323531535000119, + "lm_loss": 0.704, + "step": 7960, + "vm_loss": 0.1279 + }, + { + "epoch": 1.5323531535000119, + "lm_loss": 0.7755, + "step": 7960, + "vm_loss": 0.1328 + }, + { + "epoch": 1.5323531535000119, + "lm_loss": 0.6444, + "step": 7960, + "vm_loss": 0.1387 + }, + { + "epoch": 1.5323531535000119, + "lm_loss": 0.6986, + "step": 7960, + "vm_loss": 0.2247 + }, + { + "epoch": 1.5323531535000119, + "lm_loss": 0.708, + "step": 7960, + "vm_loss": 0.1142 + }, + { + "epoch": 1.5323531535000119, + "lm_loss": 0.5582, + "step": 7960, + "vm_loss": 0.1198 + }, + { + "epoch": 1.5323531535000119, + "lm_loss": 0.7389, + "step": 7960, + "vm_loss": 0.1773 + }, + { + "epoch": 1.5325456601775875, + "grad_norm": 3.2291358836197483, + "learning_rate": 2.729037773381845e-06, + "loss": 0.9131, + "step": 7961 + }, + { + "epoch": 1.5327381668551627, + "grad_norm": 3.1773580738369693, + "learning_rate": 2.726897579898815e-06, + "loss": 0.8895, + "step": 7962 + }, + { + "epoch": 1.5329306735327382, + "grad_norm": 3.2664355525792295, + "learning_rate": 2.7247580934543905e-06, + "loss": 0.9368, + "step": 7963 + }, + { + "epoch": 1.5331231802103136, + "grad_norm": 3.158994553941811, + "learning_rate": 2.722619314256557e-06, + "loss": 0.9145, + "step": 7964 + }, + { + "epoch": 1.5333156868878888, + "grad_norm": 3.530813063511815, + "learning_rate": 2.720481242513234e-06, + "loss": 1.0117, + "step": 7965 + }, + { + "epoch": 1.5335081935654644, + "grad_norm": 3.169532035430008, + "learning_rate": 2.7183438784322646e-06, + "loss": 0.8709, + "step": 7966 + }, + { + "epoch": 1.5337007002430396, + "grad_norm": 3.3416479133475465, + "learning_rate": 2.7162072222214285e-06, + "loss": 0.9149, + "step": 7967 + }, + { + "epoch": 1.533893206920615, + "grad_norm": 3.158020922994402, + "learning_rate": 2.714071274088438e-06, + "loss": 0.9398, + "step": 7968 + }, + { + "epoch": 1.533893206920615, + "lm_loss": 0.6734, + "step": 7968, + "vm_loss": 0.1853 + }, + { + "epoch": 1.533893206920615, + "lm_loss": 0.3745, + "step": 7968, + "vm_loss": 0.1692 + }, + { + "epoch": 1.533893206920615, + "lm_loss": 0.6806, + "step": 7968, + "vm_loss": 0.1688 + }, + { + "epoch": 1.533893206920615, + "lm_loss": 0.7573, + "step": 7968, + "vm_loss": 0.1341 + }, + { + "epoch": 1.533893206920615, + "lm_loss": 0.6443, + "step": 7968, + "vm_loss": 0.1463 + }, + { + "epoch": 1.533893206920615, + "lm_loss": 0.5062, + "step": 7968, + "vm_loss": 0.252 + }, + { + "epoch": 1.533893206920615, + "lm_loss": 0.6351, + "step": 7968, + "vm_loss": 0.1386 + }, + { + "epoch": 1.533893206920615, + "lm_loss": 0.7953, + "step": 7968, + "vm_loss": 0.1921 + }, + { + "epoch": 1.5340857135981905, + "grad_norm": 3.2099254580424375, + "learning_rate": 2.7119360342409363e-06, + "loss": 0.9432, + "step": 7969 + }, + { + "epoch": 1.5342782202757657, + "grad_norm": 3.1532831296313844, + "learning_rate": 2.7098015028864878e-06, + "loss": 0.905, + "step": 7970 + }, + { + "epoch": 1.5344707269533413, + "grad_norm": 3.3746696130800324, + "learning_rate": 2.7076676802326084e-06, + "loss": 0.9853, + "step": 7971 + }, + { + "epoch": 1.5346632336309165, + "grad_norm": 3.14625616291821, + "learning_rate": 2.705534566486723e-06, + "loss": 0.913, + "step": 7972 + }, + { + "epoch": 1.534855740308492, + "grad_norm": 3.279148845372519, + "learning_rate": 2.7034021618562024e-06, + "loss": 0.9157, + "step": 7973 + }, + { + "epoch": 1.5350482469860673, + "grad_norm": 3.1076794403716974, + "learning_rate": 2.7012704665483413e-06, + "loss": 0.8862, + "step": 7974 + }, + { + "epoch": 1.5352407536636425, + "grad_norm": 3.269619903444149, + "learning_rate": 2.6991394807703686e-06, + "loss": 0.9394, + "step": 7975 + }, + { + "epoch": 1.5354332603412182, + "grad_norm": 3.161744880009637, + "learning_rate": 2.697009204729447e-06, + "loss": 0.9244, + "step": 7976 + }, + { + "epoch": 1.5354332603412182, + "lm_loss": 0.8292, + "step": 7976, + "vm_loss": 0.1684 + }, + { + "epoch": 1.5354332603412182, + "lm_loss": 1.0268, + "step": 7976, + "vm_loss": 0.1634 + }, + { + "epoch": 1.5354332603412182, + "lm_loss": 0.6761, + "step": 7976, + "vm_loss": 0.1943 + }, + { + "epoch": 1.5354332603412182, + "lm_loss": 0.7027, + "step": 7976, + "vm_loss": 0.192 + }, + { + "epoch": 1.5354332603412182, + "lm_loss": 0.6313, + "step": 7976, + "vm_loss": 0.1451 + }, + { + "epoch": 1.5354332603412182, + "lm_loss": 0.6142, + "step": 7976, + "vm_loss": 0.1876 + }, + { + "epoch": 1.5354332603412182, + "lm_loss": 0.9596, + "step": 7976, + "vm_loss": 0.1635 + }, + { + "epoch": 1.5354332603412182, + "lm_loss": 0.8075, + "step": 7976, + "vm_loss": 0.1786 + }, + { + "epoch": 1.5356257670187934, + "grad_norm": 3.231054455586062, + "learning_rate": 2.6948796386326615e-06, + "loss": 0.9193, + "step": 7977 + }, + { + "epoch": 1.5358182736963688, + "grad_norm": 3.464960503544298, + "learning_rate": 2.6927507826870338e-06, + "loss": 0.973, + "step": 7978 + }, + { + "epoch": 1.5360107803739442, + "grad_norm": 3.4564040281116917, + "learning_rate": 2.690622637099517e-06, + "loss": 0.9814, + "step": 7979 + }, + { + "epoch": 1.5362032870515197, + "grad_norm": 3.174019810427211, + "learning_rate": 2.688495202076994e-06, + "loss": 0.9006, + "step": 7980 + }, + { + "epoch": 1.536395793729095, + "grad_norm": 3.2697717538707813, + "learning_rate": 2.6863684778262786e-06, + "loss": 0.9645, + "step": 7981 + }, + { + "epoch": 1.5365883004066703, + "grad_norm": 3.2959796547954796, + "learning_rate": 2.68424246455412e-06, + "loss": 0.9716, + "step": 7982 + }, + { + "epoch": 1.5367808070842457, + "grad_norm": 3.3474441174234273, + "learning_rate": 2.682117162467186e-06, + "loss": 0.9453, + "step": 7983 + }, + { + "epoch": 1.5369733137618211, + "grad_norm": 3.204225946582758, + "learning_rate": 2.6799925717720866e-06, + "loss": 0.9182, + "step": 7984 + }, + { + "epoch": 1.5369733137618211, + "lm_loss": 0.7671, + "step": 7984, + "vm_loss": 0.1229 + }, + { + "epoch": 1.5369733137618211, + "lm_loss": 1.0129, + "step": 7984, + "vm_loss": 0.1909 + }, + { + "epoch": 1.5369733137618211, + "lm_loss": 0.8394, + "step": 7984, + "vm_loss": 0.1994 + }, + { + "epoch": 1.5369733137618211, + "lm_loss": 0.6259, + "step": 7984, + "vm_loss": 0.1584 + }, + { + "epoch": 1.5369733137618211, + "lm_loss": 0.4944, + "step": 7984, + "vm_loss": 0.1261 + }, + { + "epoch": 1.5369733137618211, + "lm_loss": 0.7534, + "step": 7984, + "vm_loss": 0.1721 + }, + { + "epoch": 1.5369733137618211, + "lm_loss": 0.7852, + "step": 7984, + "vm_loss": 0.1613 + }, + { + "epoch": 1.5369733137618211, + "lm_loss": 0.8214, + "step": 7984, + "vm_loss": 0.0996 + }, + { + "epoch": 1.5371658204393965, + "grad_norm": 3.3253669885070973, + "learning_rate": 2.6778686926753594e-06, + "loss": 0.9288, + "step": 7985 + }, + { + "epoch": 1.537358327116972, + "grad_norm": 3.2155316285498348, + "learning_rate": 2.6757455253834763e-06, + "loss": 0.8838, + "step": 7986 + }, + { + "epoch": 1.5375508337945472, + "grad_norm": 3.2043528814507067, + "learning_rate": 2.6736230701028266e-06, + "loss": 0.9242, + "step": 7987 + }, + { + "epoch": 1.5377433404721226, + "grad_norm": 3.2211581773656515, + "learning_rate": 2.6715013270397517e-06, + "loss": 0.9192, + "step": 7988 + }, + { + "epoch": 1.537935847149698, + "grad_norm": 3.364400008071222, + "learning_rate": 2.6693802964005046e-06, + "loss": 0.9668, + "step": 7989 + }, + { + "epoch": 1.5381283538272734, + "grad_norm": 3.2061185646199495, + "learning_rate": 2.667259978391281e-06, + "loss": 0.9732, + "step": 7990 + }, + { + "epoch": 1.5383208605048488, + "grad_norm": 3.314719819803128, + "learning_rate": 2.6651403732181948e-06, + "loss": 0.9724, + "step": 7991 + }, + { + "epoch": 1.538513367182424, + "grad_norm": 3.3206698577924842, + "learning_rate": 2.66302148108731e-06, + "loss": 0.9471, + "step": 7992 + }, + { + "epoch": 1.538513367182424, + "lm_loss": 0.9808, + "step": 7992, + "vm_loss": 0.1688 + }, + { + "epoch": 1.538513367182424, + "lm_loss": 1.0178, + "step": 7992, + "vm_loss": 0.1578 + }, + { + "epoch": 1.538513367182424, + "lm_loss": 1.1058, + "step": 7992, + "vm_loss": 0.1394 + }, + { + "epoch": 1.538513367182424, + "lm_loss": 0.6054, + "step": 7992, + "vm_loss": 0.2311 + }, + { + "epoch": 1.538513367182424, + "lm_loss": 0.4173, + "step": 7992, + "vm_loss": 0.1687 + }, + { + "epoch": 1.538513367182424, + "lm_loss": 0.821, + "step": 7992, + "vm_loss": 0.1694 + }, + { + "epoch": 1.538513367182424, + "lm_loss": 0.6285, + "step": 7992, + "vm_loss": 0.1254 + }, + { + "epoch": 1.538513367182424, + "lm_loss": 1.0019, + "step": 7992, + "vm_loss": 0.1696 + }, + { + "epoch": 1.5387058738599997, + "grad_norm": 3.2942341475937136, + "learning_rate": 2.6609033022046026e-06, + "loss": 0.9568, + "step": 7993 + }, + { + "epoch": 1.5388983805375749, + "grad_norm": 3.133324745317571, + "learning_rate": 2.6587858367759878e-06, + "loss": 0.8964, + "step": 7994 + }, + { + "epoch": 1.5390908872151503, + "grad_norm": 3.3536422986677663, + "learning_rate": 2.656669085007312e-06, + "loss": 0.9711, + "step": 7995 + }, + { + "epoch": 1.5392833938927257, + "grad_norm": 3.328822575143836, + "learning_rate": 2.654553047104349e-06, + "loss": 0.9891, + "step": 7996 + }, + { + "epoch": 1.539475900570301, + "grad_norm": 3.211263241867042, + "learning_rate": 2.652437723272806e-06, + "loss": 0.943, + "step": 7997 + }, + { + "epoch": 1.5396684072478766, + "grad_norm": 3.120363786721185, + "learning_rate": 2.6503231137183192e-06, + "loss": 0.8712, + "step": 7998 + }, + { + "epoch": 1.5398609139254518, + "grad_norm": 3.295522695434396, + "learning_rate": 2.648209218646458e-06, + "loss": 0.9501, + "step": 7999 + }, + { + "epoch": 1.5400534206030272, + "grad_norm": 3.2722126523476027, + "learning_rate": 2.6460960382627155e-06, + "loss": 0.9344, + "step": 8000 + }, + { + "epoch": 1.5400534206030272, + "lm_loss": 0.9622, + "step": 8000, + "vm_loss": 0.1589 + }, + { + "epoch": 1.5400534206030272, + "lm_loss": 1.1346, + "step": 8000, + "vm_loss": 0.1565 + }, + { + "epoch": 1.5400534206030272, + "lm_loss": 0.5825, + "step": 8000, + "vm_loss": 0.1711 + }, + { + "epoch": 1.5400534206030272, + "lm_loss": 0.8418, + "step": 8000, + "vm_loss": 0.1129 + }, + { + "epoch": 1.5400534206030272, + "lm_loss": 1.0322, + "step": 8000, + "vm_loss": 0.2188 + }, + { + "epoch": 1.5400534206030272, + "lm_loss": 0.7109, + "step": 8000, + "vm_loss": 0.1493 + }, + { + "epoch": 1.5400534206030272, + "lm_loss": 0.9612, + "step": 8000, + "vm_loss": 0.2211 + }, + { + "epoch": 1.5400534206030272, + "lm_loss": 0.4593, + "step": 8000, + "vm_loss": 0.2239 + }, + { + "epoch": 1.5402459272806026, + "grad_norm": 3.383160649722326, + "learning_rate": 2.6439835727725217e-06, + "loss": 0.9684, + "step": 8001 + }, + { + "epoch": 1.5404384339581778, + "grad_norm": 3.187616805335515, + "learning_rate": 2.6418718223812357e-06, + "loss": 0.9313, + "step": 8002 + }, + { + "epoch": 1.5406309406357535, + "grad_norm": 3.3488764601943335, + "learning_rate": 2.639760787294149e-06, + "loss": 0.9342, + "step": 8003 + }, + { + "epoch": 1.5408234473133287, + "grad_norm": 3.421534642422455, + "learning_rate": 2.6376504677164727e-06, + "loss": 0.9464, + "step": 8004 + }, + { + "epoch": 1.541015953990904, + "grad_norm": 3.2180030301567606, + "learning_rate": 2.6355408638533684e-06, + "loss": 0.8909, + "step": 8005 + }, + { + "epoch": 1.5412084606684795, + "grad_norm": 3.225555599456573, + "learning_rate": 2.633431975909909e-06, + "loss": 0.944, + "step": 8006 + }, + { + "epoch": 1.5414009673460547, + "grad_norm": 3.1808055368756083, + "learning_rate": 2.6313238040911103e-06, + "loss": 0.9155, + "step": 8007 + }, + { + "epoch": 1.5415934740236303, + "grad_norm": 3.223747635586637, + "learning_rate": 2.629216348601905e-06, + "loss": 0.8708, + "step": 8008 + }, + { + "epoch": 1.5415934740236303, + "lm_loss": 0.7937, + "step": 8008, + "vm_loss": 0.1556 + }, + { + "epoch": 1.5415934740236303, + "lm_loss": 0.9909, + "step": 8008, + "vm_loss": 0.1363 + }, + { + "epoch": 1.5415934740236303, + "lm_loss": 1.0529, + "step": 8008, + "vm_loss": 0.1911 + }, + { + "epoch": 1.5415934740236303, + "lm_loss": 0.4188, + "step": 8008, + "vm_loss": 0.1399 + }, + { + "epoch": 1.5415934740236303, + "lm_loss": 0.78, + "step": 8008, + "vm_loss": 0.1663 + }, + { + "epoch": 1.5415934740236303, + "lm_loss": 0.6115, + "step": 8008, + "vm_loss": 0.1448 + }, + { + "epoch": 1.5415934740236303, + "lm_loss": 0.7757, + "step": 8008, + "vm_loss": 0.2248 + }, + { + "epoch": 1.5415934740236303, + "lm_loss": 0.5175, + "step": 8008, + "vm_loss": 0.1615 + }, + { + "epoch": 1.5417859807012055, + "grad_norm": 3.3587088199145763, + "learning_rate": 2.627109609647176e-06, + "loss": 0.9678, + "step": 8009 + }, + { + "epoch": 1.541978487378781, + "grad_norm": 3.2493155929426507, + "learning_rate": 2.625003587431717e-06, + "loss": 0.9342, + "step": 8010 + }, + { + "epoch": 1.5421709940563564, + "grad_norm": 3.1938556153950266, + "learning_rate": 2.6228982821602623e-06, + "loss": 0.8509, + "step": 8011 + }, + { + "epoch": 1.5423635007339316, + "grad_norm": 3.2148264232840216, + "learning_rate": 2.6207936940374767e-06, + "loss": 0.9426, + "step": 8012 + }, + { + "epoch": 1.5425560074115072, + "grad_norm": 3.3948024793351066, + "learning_rate": 2.6186898232679534e-06, + "loss": 0.9279, + "step": 8013 + }, + { + "epoch": 1.5427485140890824, + "grad_norm": 3.279163267063515, + "learning_rate": 2.616586670056208e-06, + "loss": 0.9461, + "step": 8014 + }, + { + "epoch": 1.5429410207666578, + "grad_norm": 3.2778972656379155, + "learning_rate": 2.6144842346067067e-06, + "loss": 0.948, + "step": 8015 + }, + { + "epoch": 1.5431335274442333, + "grad_norm": 3.287355788110965, + "learning_rate": 2.6123825171238216e-06, + "loss": 0.9588, + "step": 8016 + }, + { + "epoch": 1.5431335274442333, + "lm_loss": 1.1373, + "step": 8016, + "vm_loss": 0.1146 + }, + { + "epoch": 1.5431335274442333, + "lm_loss": 0.8576, + "step": 8016, + "vm_loss": 0.1391 + }, + { + "epoch": 1.5431335274442333, + "lm_loss": 1.014, + "step": 8016, + "vm_loss": 0.1559 + }, + { + "epoch": 1.5431335274442333, + "lm_loss": 0.4777, + "step": 8016, + "vm_loss": 0.1788 + }, + { + "epoch": 1.5431335274442333, + "lm_loss": 0.8893, + "step": 8016, + "vm_loss": 0.17 + }, + { + "epoch": 1.5431335274442333, + "lm_loss": 0.6298, + "step": 8016, + "vm_loss": 0.186 + }, + { + "epoch": 1.5431335274442333, + "lm_loss": 0.8481, + "step": 8016, + "vm_loss": 0.1274 + }, + { + "epoch": 1.5431335274442333, + "lm_loss": 1.1882, + "step": 8016, + "vm_loss": 0.1819 + }, + { + "epoch": 1.5433260341218085, + "grad_norm": 3.248078114485856, + "learning_rate": 2.610281517811872e-06, + "loss": 0.9421, + "step": 8017 + }, + { + "epoch": 1.543518540799384, + "grad_norm": 3.414508479095204, + "learning_rate": 2.6081812368751015e-06, + "loss": 1.026, + "step": 8018 + }, + { + "epoch": 1.5437110474769593, + "grad_norm": 3.3334142730825502, + "learning_rate": 2.606081674517683e-06, + "loss": 0.9362, + "step": 8019 + }, + { + "epoch": 1.5439035541545347, + "grad_norm": 3.2821418238884985, + "learning_rate": 2.6039828309437245e-06, + "loss": 0.9567, + "step": 8020 + }, + { + "epoch": 1.5440960608321102, + "grad_norm": 3.2290554862601826, + "learning_rate": 2.6018847063572516e-06, + "loss": 0.9544, + "step": 8021 + }, + { + "epoch": 1.5442885675096854, + "grad_norm": 3.1612207850093474, + "learning_rate": 2.599787300962241e-06, + "loss": 0.9301, + "step": 8022 + }, + { + "epoch": 1.544481074187261, + "grad_norm": 3.18829769328562, + "learning_rate": 2.597690614962578e-06, + "loss": 0.9023, + "step": 8023 + }, + { + "epoch": 1.5446735808648362, + "grad_norm": 3.2561905459158087, + "learning_rate": 2.595594648562093e-06, + "loss": 0.9232, + "step": 8024 + }, + { + "epoch": 1.5446735808648362, + "lm_loss": 0.7379, + "step": 8024, + "vm_loss": 0.1484 + }, + { + "epoch": 1.5446735808648362, + "lm_loss": 0.4718, + "step": 8024, + "vm_loss": 0.1683 + }, + { + "epoch": 1.5446735808648362, + "lm_loss": 1.0247, + "step": 8024, + "vm_loss": 0.1393 + }, + { + "epoch": 1.5446735808648362, + "lm_loss": 0.7464, + "step": 8024, + "vm_loss": 0.213 + }, + { + "epoch": 1.5446735808648362, + "lm_loss": 0.8225, + "step": 8024, + "vm_loss": 0.2319 + }, + { + "epoch": 1.5446735808648362, + "lm_loss": 0.5027, + "step": 8024, + "vm_loss": 0.1775 + }, + { + "epoch": 1.5446735808648362, + "lm_loss": 1.1778, + "step": 8024, + "vm_loss": 0.1728 + }, + { + "epoch": 1.5446735808648362, + "lm_loss": 1.0253, + "step": 8024, + "vm_loss": 0.1344 + }, + { + "epoch": 1.5448660875424116, + "grad_norm": 3.2077979756440507, + "learning_rate": 2.593499401964532e-06, + "loss": 0.9481, + "step": 8025 + }, + { + "epoch": 1.545058594219987, + "grad_norm": 3.337386603929552, + "learning_rate": 2.591404875373593e-06, + "loss": 0.9477, + "step": 8026 + }, + { + "epoch": 1.5452511008975622, + "grad_norm": 3.3735205655263134, + "learning_rate": 2.58931106899288e-06, + "loss": 0.9571, + "step": 8027 + }, + { + "epoch": 1.5454436075751379, + "grad_norm": 3.0976357802038845, + "learning_rate": 2.5872179830259413e-06, + "loss": 0.914, + "step": 8028 + }, + { + "epoch": 1.545636114252713, + "grad_norm": 3.341977295056129, + "learning_rate": 2.585125617676253e-06, + "loss": 0.9215, + "step": 8029 + }, + { + "epoch": 1.5458286209302885, + "grad_norm": 3.2096451182066077, + "learning_rate": 2.583033973147221e-06, + "loss": 0.9102, + "step": 8030 + }, + { + "epoch": 1.546021127607864, + "grad_norm": 3.4217362382686503, + "learning_rate": 2.5809430496421707e-06, + "loss": 0.9648, + "step": 8031 + }, + { + "epoch": 1.5462136342854391, + "grad_norm": 3.205804446345324, + "learning_rate": 2.57885284736438e-06, + "loss": 0.9243, + "step": 8032 + }, + { + "epoch": 1.5462136342854391, + "lm_loss": 0.74, + "step": 8032, + "vm_loss": 0.0934 + }, + { + "epoch": 1.5462136342854391, + "lm_loss": 0.4432, + "step": 8032, + "vm_loss": 0.0986 + }, + { + "epoch": 1.5462136342854391, + "lm_loss": 0.5572, + "step": 8032, + "vm_loss": 0.2166 + }, + { + "epoch": 1.5462136342854391, + "lm_loss": 0.9402, + "step": 8032, + "vm_loss": 0.1711 + }, + { + "epoch": 1.5462136342854391, + "lm_loss": 0.7416, + "step": 8032, + "vm_loss": 0.2138 + }, + { + "epoch": 1.5462136342854391, + "lm_loss": 0.7241, + "step": 8032, + "vm_loss": 0.2004 + }, + { + "epoch": 1.5462136342854391, + "lm_loss": 0.6982, + "step": 8032, + "vm_loss": 0.1826 + }, + { + "epoch": 1.5462136342854391, + "lm_loss": 0.5453, + "step": 8032, + "vm_loss": 0.1791 + }, + { + "epoch": 1.5464061409630148, + "grad_norm": 3.2343987829262457, + "learning_rate": 2.5767633665170343e-06, + "loss": 0.9199, + "step": 8033 + }, + { + "epoch": 1.54659864764059, + "grad_norm": 3.3110251423523716, + "learning_rate": 2.5746746073032624e-06, + "loss": 0.9565, + "step": 8034 + }, + { + "epoch": 1.5467911543181654, + "grad_norm": 3.3682507190932864, + "learning_rate": 2.572586569926111e-06, + "loss": 0.8937, + "step": 8035 + }, + { + "epoch": 1.5469836609957408, + "grad_norm": 3.449785579991025, + "learning_rate": 2.5704992545885755e-06, + "loss": 1.0068, + "step": 8036 + }, + { + "epoch": 1.547176167673316, + "grad_norm": 3.315921617406086, + "learning_rate": 2.56841266149356e-06, + "loss": 0.9175, + "step": 8037 + }, + { + "epoch": 1.5473686743508916, + "grad_norm": 3.214776155482678, + "learning_rate": 2.5663267908439115e-06, + "loss": 0.8991, + "step": 8038 + }, + { + "epoch": 1.5475611810284668, + "grad_norm": 3.177546508328573, + "learning_rate": 2.564241642842403e-06, + "loss": 0.9115, + "step": 8039 + }, + { + "epoch": 1.5477536877060423, + "grad_norm": 3.3409315281395693, + "learning_rate": 2.5621572176917387e-06, + "loss": 0.9464, + "step": 8040 + }, + { + "epoch": 1.5477536877060423, + "lm_loss": 0.7771, + "step": 8040, + "vm_loss": 0.2134 + }, + { + "epoch": 1.5477536877060423, + "lm_loss": 0.8359, + "step": 8040, + "vm_loss": 0.1553 + }, + { + "epoch": 1.5477536877060423, + "lm_loss": 0.6471, + "step": 8040, + "vm_loss": 0.1431 + }, + { + "epoch": 1.5477536877060423, + "lm_loss": 0.6839, + "step": 8040, + "vm_loss": 0.2457 + }, + { + "epoch": 1.5477536877060423, + "lm_loss": 0.5652, + "step": 8040, + "vm_loss": 0.1836 + }, + { + "epoch": 1.5477536877060423, + "lm_loss": 0.7827, + "step": 8040, + "vm_loss": 0.1491 + }, + { + "epoch": 1.5477536877060423, + "lm_loss": 0.4397, + "step": 8040, + "vm_loss": 0.1322 + }, + { + "epoch": 1.5477536877060423, + "lm_loss": 0.9897, + "step": 8040, + "vm_loss": 0.1745 + }, + { + "epoch": 1.5479461943836177, + "grad_norm": 3.414174748023272, + "learning_rate": 2.560073515594553e-06, + "loss": 0.997, + "step": 8041 + }, + { + "epoch": 1.5481387010611931, + "grad_norm": 3.1531722194752265, + "learning_rate": 2.557990536753401e-06, + "loss": 0.9037, + "step": 8042 + }, + { + "epoch": 1.5483312077387685, + "grad_norm": 3.057843892580745, + "learning_rate": 2.555908281370786e-06, + "loss": 0.8872, + "step": 8043 + }, + { + "epoch": 1.5485237144163437, + "grad_norm": 3.2116494676564162, + "learning_rate": 2.55382674964912e-06, + "loss": 0.8925, + "step": 8044 + }, + { + "epoch": 1.5487162210939192, + "grad_norm": 3.2344749266557287, + "learning_rate": 2.5517459417907575e-06, + "loss": 0.942, + "step": 8045 + }, + { + "epoch": 1.5489087277714946, + "grad_norm": 3.104221482293591, + "learning_rate": 2.5496658579979815e-06, + "loss": 0.8619, + "step": 8046 + }, + { + "epoch": 1.54910123444907, + "grad_norm": 5.665101920938779, + "learning_rate": 2.5475864984730047e-06, + "loss": 0.8937, + "step": 8047 + }, + { + "epoch": 1.5492937411266454, + "grad_norm": 3.299530216755856, + "learning_rate": 2.5455078634179585e-06, + "loss": 0.9431, + "step": 8048 + }, + { + "epoch": 1.5492937411266454, + "lm_loss": 0.792, + "step": 8048, + "vm_loss": 0.133 + }, + { + "epoch": 1.5492937411266454, + "lm_loss": 0.6506, + "step": 8048, + "vm_loss": 0.1515 + }, + { + "epoch": 1.5492937411266454, + "lm_loss": 0.6503, + "step": 8048, + "vm_loss": 0.1017 + }, + { + "epoch": 1.5492937411266454, + "lm_loss": 0.7102, + "step": 8048, + "vm_loss": 0.1541 + }, + { + "epoch": 1.5492937411266454, + "lm_loss": 0.5696, + "step": 8048, + "vm_loss": 0.1657 + }, + { + "epoch": 1.5492937411266454, + "lm_loss": 0.5421, + "step": 8048, + "vm_loss": 0.1951 + }, + { + "epoch": 1.5492937411266454, + "lm_loss": 1.3349, + "step": 8048, + "vm_loss": 0.2416 + }, + { + "epoch": 1.5492937411266454, + "lm_loss": 0.7006, + "step": 8048, + "vm_loss": 0.0982 + }, + { + "epoch": 1.5494862478042206, + "grad_norm": 3.385589099523751, + "learning_rate": 2.543429953034925e-06, + "loss": 0.9719, + "step": 8049 + }, + { + "epoch": 1.549678754481796, + "grad_norm": 3.3664696626675985, + "learning_rate": 2.541352767525894e-06, + "loss": 0.9519, + "step": 8050 + }, + { + "epoch": 1.5498712611593715, + "grad_norm": 3.3575633284852198, + "learning_rate": 2.5392763070928006e-06, + "loss": 0.9488, + "step": 8051 + }, + { + "epoch": 1.5500637678369469, + "grad_norm": 3.3275734245013973, + "learning_rate": 2.5372005719374957e-06, + "loss": 0.9372, + "step": 8052 + }, + { + "epoch": 1.5502562745145223, + "grad_norm": 3.4242098745925893, + "learning_rate": 2.5351255622617775e-06, + "loss": 0.954, + "step": 8053 + }, + { + "epoch": 1.5504487811920975, + "grad_norm": 3.14201328970626, + "learning_rate": 2.5330512782673554e-06, + "loss": 0.8365, + "step": 8054 + }, + { + "epoch": 1.550641287869673, + "grad_norm": 3.280764839740856, + "learning_rate": 2.5309777201558795e-06, + "loss": 0.9086, + "step": 8055 + }, + { + "epoch": 1.5508337945472483, + "grad_norm": 3.263028325756709, + "learning_rate": 2.5289048881289256e-06, + "loss": 0.9177, + "step": 8056 + }, + { + "epoch": 1.5508337945472483, + "lm_loss": 0.6132, + "step": 8056, + "vm_loss": 0.1768 + }, + { + "epoch": 1.5508337945472483, + "lm_loss": 0.349, + "step": 8056, + "vm_loss": 0.1675 + }, + { + "epoch": 1.5508337945472483, + "lm_loss": 0.8607, + "step": 8056, + "vm_loss": 0.1151 + }, + { + "epoch": 1.5508337945472483, + "lm_loss": 0.4599, + "step": 8056, + "vm_loss": 0.1287 + }, + { + "epoch": 1.5508337945472483, + "lm_loss": 0.5041, + "step": 8056, + "vm_loss": 0.1264 + }, + { + "epoch": 1.5508337945472483, + "lm_loss": 0.6696, + "step": 8056, + "vm_loss": 0.1256 + }, + { + "epoch": 1.5508337945472483, + "lm_loss": 0.6461, + "step": 8056, + "vm_loss": 0.2125 + }, + { + "epoch": 1.5508337945472483, + "lm_loss": 0.582, + "step": 8056, + "vm_loss": 0.1376 + }, + { + "epoch": 1.5510263012248238, + "grad_norm": 3.121599770952913, + "learning_rate": 2.5268327823880036e-06, + "loss": 0.865, + "step": 8057 + }, + { + "epoch": 1.5512188079023992, + "grad_norm": 3.425979639262565, + "learning_rate": 2.5247614031345412e-06, + "loss": 0.9736, + "step": 8058 + }, + { + "epoch": 1.5514113145799744, + "grad_norm": 3.189515678924552, + "learning_rate": 2.522690750569906e-06, + "loss": 0.9005, + "step": 8059 + }, + { + "epoch": 1.55160382125755, + "grad_norm": 3.3124622292775445, + "learning_rate": 2.5206208248953932e-06, + "loss": 0.9535, + "step": 8060 + }, + { + "epoch": 1.5517963279351252, + "grad_norm": 3.3259167107284067, + "learning_rate": 2.5185516263122256e-06, + "loss": 0.9381, + "step": 8061 + }, + { + "epoch": 1.5519888346127007, + "grad_norm": 3.2442572878533675, + "learning_rate": 2.5164831550215552e-06, + "loss": 0.9086, + "step": 8062 + }, + { + "epoch": 1.552181341290276, + "grad_norm": 3.2982612636229045, + "learning_rate": 2.5144154112244647e-06, + "loss": 0.9348, + "step": 8063 + }, + { + "epoch": 1.5523738479678513, + "grad_norm": 3.3743817947078125, + "learning_rate": 2.512348395121967e-06, + "loss": 0.9845, + "step": 8064 + }, + { + "epoch": 1.5523738479678513, + "lm_loss": 0.8487, + "step": 8064, + "vm_loss": 0.1124 + }, + { + "epoch": 1.5523738479678513, + "lm_loss": 1.189, + "step": 8064, + "vm_loss": 0.1446 + }, + { + "epoch": 1.5523738479678513, + "lm_loss": 0.7536, + "step": 8064, + "vm_loss": 0.1641 + }, + { + "epoch": 1.5523738479678513, + "lm_loss": 0.5355, + "step": 8064, + "vm_loss": 0.1373 + }, + { + "epoch": 1.5523738479678513, + "lm_loss": 0.9571, + "step": 8064, + "vm_loss": 0.1878 + }, + { + "epoch": 1.5523738479678513, + "lm_loss": 0.5759, + "step": 8064, + "vm_loss": 0.1755 + }, + { + "epoch": 1.5523738479678513, + "lm_loss": 0.8864, + "step": 8064, + "vm_loss": 0.1448 + }, + { + "epoch": 1.5523738479678513, + "lm_loss": 0.6029, + "step": 8064, + "vm_loss": 0.1823 + }, + { + "epoch": 1.552566354645427, + "grad_norm": 3.3081428980958436, + "learning_rate": 2.5102821069149963e-06, + "loss": 0.9282, + "step": 8065 + }, + { + "epoch": 1.5527588613230021, + "grad_norm": 3.32069548727019, + "learning_rate": 2.5082165468044306e-06, + "loss": 0.9059, + "step": 8066 + }, + { + "epoch": 1.5529513680005775, + "grad_norm": 3.187461595431289, + "learning_rate": 2.506151714991062e-06, + "loss": 0.9012, + "step": 8067 + }, + { + "epoch": 1.553143874678153, + "grad_norm": 3.289097079521282, + "learning_rate": 2.504087611675623e-06, + "loss": 0.9233, + "step": 8068 + }, + { + "epoch": 1.5533363813557282, + "grad_norm": 3.430215447197617, + "learning_rate": 2.5020242370587643e-06, + "loss": 0.9548, + "step": 8069 + }, + { + "epoch": 1.5535288880333038, + "grad_norm": 3.3077061842860265, + "learning_rate": 2.4999615913410823e-06, + "loss": 0.9433, + "step": 8070 + }, + { + "epoch": 1.553721394710879, + "grad_norm": 3.28873848326804, + "learning_rate": 2.4978996747230834e-06, + "loss": 0.9317, + "step": 8071 + }, + { + "epoch": 1.5539139013884544, + "grad_norm": 3.109929815527573, + "learning_rate": 2.4958384874052166e-06, + "loss": 0.8534, + "step": 8072 + }, + { + "epoch": 1.5539139013884544, + "lm_loss": 0.7684, + "step": 8072, + "vm_loss": 0.1455 + }, + { + "epoch": 1.5539139013884544, + "lm_loss": 0.905, + "step": 8072, + "vm_loss": 0.0838 + }, + { + "epoch": 1.5539139013884544, + "lm_loss": 0.9617, + "step": 8072, + "vm_loss": 0.178 + }, + { + "epoch": 1.5539139013884544, + "lm_loss": 0.7553, + "step": 8072, + "vm_loss": 0.1898 + }, + { + "epoch": 1.5539139013884544, + "lm_loss": 0.7515, + "step": 8072, + "vm_loss": 0.2147 + }, + { + "epoch": 1.5539139013884544, + "lm_loss": 0.7598, + "step": 8072, + "vm_loss": 0.1552 + }, + { + "epoch": 1.5539139013884544, + "lm_loss": 0.7956, + "step": 8072, + "vm_loss": 0.1254 + }, + { + "epoch": 1.5539139013884544, + "lm_loss": 1.2563, + "step": 8072, + "vm_loss": 0.1629 + }, + { + "epoch": 1.5541064080660298, + "grad_norm": 3.2748473322767326, + "learning_rate": 2.4937780295878557e-06, + "loss": 0.9205, + "step": 8073 + }, + { + "epoch": 1.554298914743605, + "grad_norm": 3.094646244520794, + "learning_rate": 2.4917183014713052e-06, + "loss": 0.8799, + "step": 8074 + }, + { + "epoch": 1.5544914214211807, + "grad_norm": 3.248429068905733, + "learning_rate": 2.4896593032557916e-06, + "loss": 0.935, + "step": 8075 + }, + { + "epoch": 1.5546839280987559, + "grad_norm": 3.4227510520669346, + "learning_rate": 2.4876010351414792e-06, + "loss": 0.977, + "step": 8076 + }, + { + "epoch": 1.5548764347763313, + "grad_norm": 3.238548932910585, + "learning_rate": 2.4855434973284585e-06, + "loss": 0.9099, + "step": 8077 + }, + { + "epoch": 1.5550689414539067, + "grad_norm": 3.4626305796193044, + "learning_rate": 2.4834866900167478e-06, + "loss": 1.0208, + "step": 8078 + }, + { + "epoch": 1.555261448131482, + "grad_norm": 3.186535890007029, + "learning_rate": 2.481430613406295e-06, + "loss": 0.9121, + "step": 8079 + }, + { + "epoch": 1.5554539548090576, + "grad_norm": 3.151463090537228, + "learning_rate": 2.4793752676969795e-06, + "loss": 0.8837, + "step": 8080 + }, + { + "epoch": 1.5554539548090576, + "lm_loss": 0.7908, + "step": 8080, + "vm_loss": 0.2115 + }, + { + "epoch": 1.5554539548090576, + "lm_loss": 1.0147, + "step": 8080, + "vm_loss": 0.2028 + }, + { + "epoch": 1.5554539548090576, + "lm_loss": 0.8523, + "step": 8080, + "vm_loss": 0.1413 + }, + { + "epoch": 1.5554539548090576, + "lm_loss": 0.4586, + "step": 8080, + "vm_loss": 0.0933 + }, + { + "epoch": 1.5554539548090576, + "lm_loss": 0.5975, + "step": 8080, + "vm_loss": 0.1221 + }, + { + "epoch": 1.5554539548090576, + "lm_loss": 0.9358, + "step": 8080, + "vm_loss": 0.1914 + }, + { + "epoch": 1.5554539548090576, + "lm_loss": 1.3041, + "step": 8080, + "vm_loss": 0.1429 + }, + { + "epoch": 1.5554539548090576, + "lm_loss": 0.503, + "step": 8080, + "vm_loss": 0.1678 + }, + { + "epoch": 1.5556464614866328, + "grad_norm": 3.1700068703120152, + "learning_rate": 2.477320653088602e-06, + "loss": 0.9198, + "step": 8081 + }, + { + "epoch": 1.5558389681642082, + "grad_norm": 3.2992851740245843, + "learning_rate": 2.4752667697809007e-06, + "loss": 0.9179, + "step": 8082 + }, + { + "epoch": 1.5560314748417836, + "grad_norm": 3.1071362393579975, + "learning_rate": 2.473213617973538e-06, + "loss": 0.9027, + "step": 8083 + }, + { + "epoch": 1.5562239815193588, + "grad_norm": 3.184432023927627, + "learning_rate": 2.4711611978661076e-06, + "loss": 0.882, + "step": 8084 + }, + { + "epoch": 1.5564164881969345, + "grad_norm": 3.4120498193014677, + "learning_rate": 2.4691095096581343e-06, + "loss": 0.9697, + "step": 8085 + }, + { + "epoch": 1.5566089948745097, + "grad_norm": 3.227166756512222, + "learning_rate": 2.4670585535490586e-06, + "loss": 0.9588, + "step": 8086 + }, + { + "epoch": 1.556801501552085, + "grad_norm": 3.2756054338265765, + "learning_rate": 2.465008329738272e-06, + "loss": 0.9371, + "step": 8087 + }, + { + "epoch": 1.5569940082296605, + "grad_norm": 3.398754887021963, + "learning_rate": 2.4629588384250737e-06, + "loss": 0.9529, + "step": 8088 + }, + { + "epoch": 1.5569940082296605, + "lm_loss": 0.6432, + "step": 8088, + "vm_loss": 0.1751 + }, + { + "epoch": 1.5569940082296605, + "lm_loss": 1.0005, + "step": 8088, + "vm_loss": 0.1609 + }, + { + "epoch": 1.5569940082296605, + "lm_loss": 0.9258, + "step": 8088, + "vm_loss": 0.1908 + }, + { + "epoch": 1.5569940082296605, + "lm_loss": 0.8778, + "step": 8088, + "vm_loss": 0.1745 + }, + { + "epoch": 1.5569940082296605, + "lm_loss": 0.8446, + "step": 8088, + "vm_loss": 0.1527 + }, + { + "epoch": 1.5569940082296605, + "lm_loss": 1.1582, + "step": 8088, + "vm_loss": 0.1737 + }, + { + "epoch": 1.5569940082296605, + "lm_loss": 0.6223, + "step": 8088, + "vm_loss": 0.166 + }, + { + "epoch": 1.5569940082296605, + "lm_loss": 0.5025, + "step": 8088, + "vm_loss": 0.163 + }, + { + "epoch": 1.5571865149072357, + "grad_norm": 3.261592689440988, + "learning_rate": 2.460910079808703e-06, + "loss": 0.9176, + "step": 8089 + }, + { + "epoch": 1.5573790215848113, + "grad_norm": 3.2467865653768526, + "learning_rate": 2.4588620540883257e-06, + "loss": 0.9096, + "step": 8090 + }, + { + "epoch": 1.5575715282623865, + "grad_norm": 3.2931729713179294, + "learning_rate": 2.4568147614630378e-06, + "loss": 0.8999, + "step": 8091 + }, + { + "epoch": 1.557764034939962, + "grad_norm": 3.2619116303949696, + "learning_rate": 2.4547682021318596e-06, + "loss": 0.9245, + "step": 8092 + }, + { + "epoch": 1.5579565416175374, + "grad_norm": 3.5874144049066996, + "learning_rate": 2.4527223762937422e-06, + "loss": 0.9962, + "step": 8093 + }, + { + "epoch": 1.5581490482951126, + "grad_norm": 3.4376427335455877, + "learning_rate": 2.450677284147569e-06, + "loss": 0.9681, + "step": 8094 + }, + { + "epoch": 1.5583415549726882, + "grad_norm": 3.1498222998796876, + "learning_rate": 2.44863292589215e-06, + "loss": 0.8592, + "step": 8095 + }, + { + "epoch": 1.5585340616502634, + "grad_norm": 3.2060332358741226, + "learning_rate": 2.4465893017262156e-06, + "loss": 0.9425, + "step": 8096 + }, + { + "epoch": 1.5585340616502634, + "lm_loss": 0.5833, + "step": 8096, + "vm_loss": 0.2157 + }, + { + "epoch": 1.5585340616502634, + "lm_loss": 0.7539, + "step": 8096, + "vm_loss": 0.1131 + }, + { + "epoch": 1.5585340616502634, + "lm_loss": 1.1119, + "step": 8096, + "vm_loss": 0.1873 + }, + { + "epoch": 1.5585340616502634, + "lm_loss": 0.8628, + "step": 8096, + "vm_loss": 0.1613 + }, + { + "epoch": 1.5585340616502634, + "lm_loss": 0.5656, + "step": 8096, + "vm_loss": 0.121 + }, + { + "epoch": 1.5585340616502634, + "lm_loss": 0.5463, + "step": 8096, + "vm_loss": 0.0986 + }, + { + "epoch": 1.5585340616502634, + "lm_loss": 0.9406, + "step": 8096, + "vm_loss": 0.2596 + }, + { + "epoch": 1.5585340616502634, + "lm_loss": 1.078, + "step": 8096, + "vm_loss": 0.1268 + }, + { + "epoch": 1.5587265683278388, + "grad_norm": 3.1548312965062, + "learning_rate": 2.444546411848443e-06, + "loss": 0.9039, + "step": 8097 + }, + { + "epoch": 1.5589190750054143, + "grad_norm": 3.20065192290026, + "learning_rate": 2.4425042564574186e-06, + "loss": 0.8895, + "step": 8098 + }, + { + "epoch": 1.5591115816829895, + "grad_norm": 3.384712201753457, + "learning_rate": 2.440462835751669e-06, + "loss": 0.9601, + "step": 8099 + }, + { + "epoch": 1.559304088360565, + "grad_norm": 3.24615811520348, + "learning_rate": 2.4384221499296466e-06, + "loss": 0.9596, + "step": 8100 + }, + { + "epoch": 1.5594965950381403, + "grad_norm": 3.254830866558642, + "learning_rate": 2.436382199189734e-06, + "loss": 0.9482, + "step": 8101 + }, + { + "epoch": 1.5596891017157157, + "grad_norm": 3.200481922264514, + "learning_rate": 2.4343429837302367e-06, + "loss": 0.902, + "step": 8102 + }, + { + "epoch": 1.5598816083932912, + "grad_norm": 3.129146938125482, + "learning_rate": 2.432304503749394e-06, + "loss": 0.9116, + "step": 8103 + }, + { + "epoch": 1.5600741150708666, + "grad_norm": 3.21760359488873, + "learning_rate": 2.4302667594453734e-06, + "loss": 0.8925, + "step": 8104 + }, + { + "epoch": 1.5600741150708666, + "lm_loss": 0.8803, + "step": 8104, + "vm_loss": 0.1492 + }, + { + "epoch": 1.5600741150708666, + "lm_loss": 0.9525, + "step": 8104, + "vm_loss": 0.13 + }, + { + "epoch": 1.5600741150708666, + "lm_loss": 0.5712, + "step": 8104, + "vm_loss": 0.1491 + }, + { + "epoch": 1.5600741150708666, + "lm_loss": 0.7088, + "step": 8104, + "vm_loss": 0.1951 + }, + { + "epoch": 1.5600741150708666, + "lm_loss": 0.6756, + "step": 8104, + "vm_loss": 0.1248 + }, + { + "epoch": 1.5600741150708666, + "lm_loss": 0.7027, + "step": 8104, + "vm_loss": 0.1589 + }, + { + "epoch": 1.5600741150708666, + "lm_loss": 0.9538, + "step": 8104, + "vm_loss": 0.1385 + }, + { + "epoch": 1.5600741150708666, + "lm_loss": 0.8504, + "step": 8104, + "vm_loss": 0.1673 + }, + { + "epoch": 1.560266621748442, + "grad_norm": 3.142369378834673, + "learning_rate": 2.428229751016269e-06, + "loss": 0.8685, + "step": 8105 + }, + { + "epoch": 1.5604591284260172, + "grad_norm": 3.1771725856632864, + "learning_rate": 2.4261934786601025e-06, + "loss": 0.9012, + "step": 8106 + }, + { + "epoch": 1.5606516351035926, + "grad_norm": 3.100174614819856, + "learning_rate": 2.4241579425748284e-06, + "loss": 0.8673, + "step": 8107 + }, + { + "epoch": 1.560844141781168, + "grad_norm": 3.2751778244121996, + "learning_rate": 2.422123142958328e-06, + "loss": 0.9254, + "step": 8108 + }, + { + "epoch": 1.5610366484587435, + "grad_norm": 3.474570815643663, + "learning_rate": 2.4200890800084052e-06, + "loss": 0.9307, + "step": 8109 + }, + { + "epoch": 1.5612291551363189, + "grad_norm": 3.2521525690403497, + "learning_rate": 2.418055753922799e-06, + "loss": 0.9234, + "step": 8110 + }, + { + "epoch": 1.561421661813894, + "grad_norm": 3.342543265991017, + "learning_rate": 2.416023164899175e-06, + "loss": 0.9208, + "step": 8111 + }, + { + "epoch": 1.5616141684914695, + "grad_norm": 3.312155075900073, + "learning_rate": 2.413991313135129e-06, + "loss": 0.9177, + "step": 8112 + }, + { + "epoch": 1.5616141684914695, + "lm_loss": 0.5547, + "step": 8112, + "vm_loss": 0.1987 + }, + { + "epoch": 1.5616141684914695, + "lm_loss": 0.566, + "step": 8112, + "vm_loss": 0.1099 + }, + { + "epoch": 1.5616141684914695, + "lm_loss": 0.7185, + "step": 8112, + "vm_loss": 0.1104 + }, + { + "epoch": 1.5616141684914695, + "lm_loss": 0.442, + "step": 8112, + "vm_loss": 0.1402 + }, + { + "epoch": 1.5616141684914695, + "lm_loss": 1.0472, + "step": 8112, + "vm_loss": 0.1583 + }, + { + "epoch": 1.5616141684914695, + "lm_loss": 0.9709, + "step": 8112, + "vm_loss": 0.2062 + }, + { + "epoch": 1.5616141684914695, + "lm_loss": 0.6095, + "step": 8112, + "vm_loss": 0.1518 + }, + { + "epoch": 1.5616141684914695, + "lm_loss": 0.328, + "step": 8112, + "vm_loss": 0.1185 + }, + { + "epoch": 1.561806675169045, + "grad_norm": 3.164265951924517, + "learning_rate": 2.411960198828175e-06, + "loss": 0.8689, + "step": 8113 + }, + { + "epoch": 1.5619991818466203, + "grad_norm": 3.2980978750404506, + "learning_rate": 2.409929822175775e-06, + "loss": 0.9352, + "step": 8114 + }, + { + "epoch": 1.5621916885241958, + "grad_norm": 3.195297586360202, + "learning_rate": 2.407900183375299e-06, + "loss": 0.9138, + "step": 8115 + }, + { + "epoch": 1.562384195201771, + "grad_norm": 3.3897318033117756, + "learning_rate": 2.4058712826240595e-06, + "loss": 0.9643, + "step": 8116 + }, + { + "epoch": 1.5625767018793464, + "grad_norm": 3.144066264093219, + "learning_rate": 2.4038431201192825e-06, + "loss": 0.8795, + "step": 8117 + }, + { + "epoch": 1.5627692085569218, + "grad_norm": 3.2175877368377055, + "learning_rate": 2.401815696058143e-06, + "loss": 0.9054, + "step": 8118 + }, + { + "epoch": 1.5629617152344972, + "grad_norm": 3.2806098294263677, + "learning_rate": 2.399789010637725e-06, + "loss": 0.9152, + "step": 8119 + }, + { + "epoch": 1.5631542219120727, + "grad_norm": 3.2346207666052007, + "learning_rate": 2.39776306405505e-06, + "loss": 0.8956, + "step": 8120 + }, + { + "epoch": 1.5631542219120727, + "lm_loss": 0.6953, + "step": 8120, + "vm_loss": 0.1597 + }, + { + "epoch": 1.5631542219120727, + "lm_loss": 0.8863, + "step": 8120, + "vm_loss": 0.1787 + }, + { + "epoch": 1.5631542219120727, + "lm_loss": 0.8472, + "step": 8120, + "vm_loss": 0.1367 + }, + { + "epoch": 1.5631542219120727, + "lm_loss": 0.8541, + "step": 8120, + "vm_loss": 0.1045 + }, + { + "epoch": 1.5631542219120727, + "lm_loss": 0.8565, + "step": 8120, + "vm_loss": 0.1693 + }, + { + "epoch": 1.5631542219120727, + "lm_loss": 0.3922, + "step": 8120, + "vm_loss": 0.2042 + }, + { + "epoch": 1.5631542219120727, + "lm_loss": 0.5924, + "step": 8120, + "vm_loss": 0.1396 + }, + { + "epoch": 1.5631542219120727, + "lm_loss": 1.0281, + "step": 8120, + "vm_loss": 0.1848 + }, + { + "epoch": 1.5633467285896478, + "grad_norm": 3.1363226237620894, + "learning_rate": 2.3957378565070656e-06, + "loss": 0.8878, + "step": 8121 + }, + { + "epoch": 1.5635392352672235, + "grad_norm": 3.204317566045012, + "learning_rate": 2.3937133881906484e-06, + "loss": 0.8759, + "step": 8122 + }, + { + "epoch": 1.5637317419447987, + "grad_norm": 3.154909219088451, + "learning_rate": 2.3916896593026027e-06, + "loss": 0.8993, + "step": 8123 + }, + { + "epoch": 1.5639242486223741, + "grad_norm": 3.1829763045546478, + "learning_rate": 2.389666670039664e-06, + "loss": 0.9449, + "step": 8124 + }, + { + "epoch": 1.5641167552999495, + "grad_norm": 3.280915198095316, + "learning_rate": 2.3876444205984872e-06, + "loss": 0.917, + "step": 8125 + }, + { + "epoch": 1.5643092619775247, + "grad_norm": 3.139616102539235, + "learning_rate": 2.3856229111756622e-06, + "loss": 0.8976, + "step": 8126 + }, + { + "epoch": 1.5645017686551004, + "grad_norm": 3.2043698280066177, + "learning_rate": 2.383602141967708e-06, + "loss": 0.9345, + "step": 8127 + }, + { + "epoch": 1.5646942753326756, + "grad_norm": 3.1811792140662, + "learning_rate": 2.381582113171066e-06, + "loss": 0.8973, + "step": 8128 + }, + { + "epoch": 1.5646942753326756, + "lm_loss": 0.6974, + "step": 8128, + "vm_loss": 0.1314 + }, + { + "epoch": 1.5646942753326756, + "lm_loss": 0.817, + "step": 8128, + "vm_loss": 0.1995 + }, + { + "epoch": 1.5646942753326756, + "lm_loss": 0.7631, + "step": 8128, + "vm_loss": 0.1931 + }, + { + "epoch": 1.5646942753326756, + "lm_loss": 0.5283, + "step": 8128, + "vm_loss": 0.1536 + }, + { + "epoch": 1.5646942753326756, + "lm_loss": 0.7091, + "step": 8128, + "vm_loss": 0.1297 + }, + { + "epoch": 1.5646942753326756, + "lm_loss": 0.5502, + "step": 8128, + "vm_loss": 0.1921 + }, + { + "epoch": 1.5646942753326756, + "lm_loss": 0.7149, + "step": 8128, + "vm_loss": 0.1646 + }, + { + "epoch": 1.5646942753326756, + "lm_loss": 1.6096, + "step": 8128, + "vm_loss": 0.1155 + }, + { + "epoch": 1.564886782010251, + "grad_norm": 3.340787377345133, + "learning_rate": 2.3795628249821147e-06, + "loss": 0.9549, + "step": 8129 + }, + { + "epoch": 1.5650792886878264, + "grad_norm": 3.2600492598972135, + "learning_rate": 2.3775442775971448e-06, + "loss": 0.936, + "step": 8130 + }, + { + "epoch": 1.5652717953654016, + "grad_norm": 3.2822791534156877, + "learning_rate": 2.375526471212396e-06, + "loss": 0.9235, + "step": 8131 + }, + { + "epoch": 1.5654643020429773, + "grad_norm": 3.282186614362665, + "learning_rate": 2.373509406024017e-06, + "loss": 0.9035, + "step": 8132 + }, + { + "epoch": 1.5656568087205525, + "grad_norm": 3.3265425431706808, + "learning_rate": 2.371493082228098e-06, + "loss": 0.9147, + "step": 8133 + }, + { + "epoch": 1.5658493153981279, + "grad_norm": 3.294296197470225, + "learning_rate": 2.3694775000206427e-06, + "loss": 0.915, + "step": 8134 + }, + { + "epoch": 1.5660418220757033, + "grad_norm": 3.2582322486076905, + "learning_rate": 2.3674626595976037e-06, + "loss": 0.8999, + "step": 8135 + }, + { + "epoch": 1.5662343287532785, + "grad_norm": 3.3380259469162965, + "learning_rate": 2.3654485611548394e-06, + "loss": 0.913, + "step": 8136 + }, + { + "epoch": 1.5662343287532785, + "lm_loss": 1.05, + "step": 8136, + "vm_loss": 0.1354 + }, + { + "epoch": 1.5662343287532785, + "lm_loss": 0.6783, + "step": 8136, + "vm_loss": 0.0798 + }, + { + "epoch": 1.5662343287532785, + "lm_loss": 0.7606, + "step": 8136, + "vm_loss": 0.2127 + }, + { + "epoch": 1.5662343287532785, + "lm_loss": 0.8195, + "step": 8136, + "vm_loss": 0.2809 + }, + { + "epoch": 1.5662343287532785, + "lm_loss": 0.6626, + "step": 8136, + "vm_loss": 0.1633 + }, + { + "epoch": 1.5662343287532785, + "lm_loss": 0.4933, + "step": 8136, + "vm_loss": 0.0801 + }, + { + "epoch": 1.5662343287532785, + "lm_loss": 0.7835, + "step": 8136, + "vm_loss": 0.1732 + }, + { + "epoch": 1.5662343287532785, + "lm_loss": 0.38, + "step": 8136, + "vm_loss": 0.1031 + }, + { + "epoch": 1.5664268354308541, + "grad_norm": 3.1797893554798224, + "learning_rate": 2.363435204888149e-06, + "loss": 0.8812, + "step": 8137 + }, + { + "epoch": 1.5666193421084293, + "grad_norm": 3.2443241293356406, + "learning_rate": 2.3614225909932575e-06, + "loss": 0.9267, + "step": 8138 + }, + { + "epoch": 1.5668118487860048, + "grad_norm": 3.124435209508619, + "learning_rate": 2.359410719665819e-06, + "loss": 0.8866, + "step": 8139 + }, + { + "epoch": 1.5670043554635802, + "grad_norm": 3.2166079328069515, + "learning_rate": 2.357399591101404e-06, + "loss": 0.9022, + "step": 8140 + }, + { + "epoch": 1.5671968621411554, + "grad_norm": 3.3804288754115923, + "learning_rate": 2.3553892054955317e-06, + "loss": 0.9399, + "step": 8141 + }, + { + "epoch": 1.567389368818731, + "grad_norm": 3.2918410629600627, + "learning_rate": 2.3533795630436297e-06, + "loss": 0.9403, + "step": 8142 + }, + { + "epoch": 1.5675818754963062, + "grad_norm": 3.350468217281528, + "learning_rate": 2.3513706639410625e-06, + "loss": 0.9298, + "step": 8143 + }, + { + "epoch": 1.5677743821738817, + "grad_norm": 3.0478209408176897, + "learning_rate": 2.3493625083831217e-06, + "loss": 0.863, + "step": 8144 + }, + { + "epoch": 1.5677743821738817, + "lm_loss": 0.8374, + "step": 8144, + "vm_loss": 0.1404 + }, + { + "epoch": 1.5677743821738817, + "lm_loss": 0.9028, + "step": 8144, + "vm_loss": 0.1496 + }, + { + "epoch": 1.5677743821738817, + "lm_loss": 0.7882, + "step": 8144, + "vm_loss": 0.1682 + }, + { + "epoch": 1.5677743821738817, + "lm_loss": 0.8617, + "step": 8144, + "vm_loss": 0.152 + }, + { + "epoch": 1.5677743821738817, + "lm_loss": 0.6157, + "step": 8144, + "vm_loss": 0.142 + }, + { + "epoch": 1.5677743821738817, + "lm_loss": 0.6503, + "step": 8144, + "vm_loss": 0.1857 + }, + { + "epoch": 1.5677743821738817, + "lm_loss": 0.7442, + "step": 8144, + "vm_loss": 0.1347 + }, + { + "epoch": 1.5677743821738817, + "lm_loss": 0.4915, + "step": 8144, + "vm_loss": 0.1568 + }, + { + "epoch": 1.567966888851457, + "grad_norm": 3.1616191401000275, + "learning_rate": 2.347355096565025e-06, + "loss": 0.9028, + "step": 8145 + }, + { + "epoch": 1.5681593955290323, + "grad_norm": 3.3562843850140274, + "learning_rate": 2.345348428681922e-06, + "loss": 0.9469, + "step": 8146 + }, + { + "epoch": 1.568351902206608, + "grad_norm": 3.2790231434199217, + "learning_rate": 2.3433425049288806e-06, + "loss": 0.9308, + "step": 8147 + }, + { + "epoch": 1.5685444088841831, + "grad_norm": 3.275086397844839, + "learning_rate": 2.341337325500904e-06, + "loss": 0.9008, + "step": 8148 + }, + { + "epoch": 1.5687369155617585, + "grad_norm": 3.2917473904467895, + "learning_rate": 2.3393328905929225e-06, + "loss": 0.9573, + "step": 8149 + }, + { + "epoch": 1.568929422239334, + "grad_norm": 3.153765642666675, + "learning_rate": 2.3373292003997963e-06, + "loss": 0.8683, + "step": 8150 + }, + { + "epoch": 1.5691219289169092, + "grad_norm": 3.357799039872151, + "learning_rate": 2.3353262551162993e-06, + "loss": 0.9611, + "step": 8151 + }, + { + "epoch": 1.5693144355944848, + "grad_norm": 3.15244956783198, + "learning_rate": 2.3333240549371572e-06, + "loss": 0.8572, + "step": 8152 + }, + { + "epoch": 1.5693144355944848, + "lm_loss": 0.7314, + "step": 8152, + "vm_loss": 0.1781 + }, + { + "epoch": 1.5693144355944848, + "lm_loss": 1.1789, + "step": 8152, + "vm_loss": 0.1506 + }, + { + "epoch": 1.5693144355944848, + "lm_loss": 0.4106, + "step": 8152, + "vm_loss": 0.0864 + }, + { + "epoch": 1.5693144355944848, + "lm_loss": 0.6878, + "step": 8152, + "vm_loss": 0.1279 + }, + { + "epoch": 1.5693144355944848, + "lm_loss": 0.9103, + "step": 8152, + "vm_loss": 0.2072 + }, + { + "epoch": 1.5693144355944848, + "lm_loss": 0.3278, + "step": 8152, + "vm_loss": 0.1749 + }, + { + "epoch": 1.5693144355944848, + "lm_loss": 0.8873, + "step": 8152, + "vm_loss": 0.1153 + }, + { + "epoch": 1.5693144355944848, + "lm_loss": 0.9183, + "step": 8152, + "vm_loss": 0.1563 + }, + { + "epoch": 1.56950694227206, + "grad_norm": 3.3885515098720504, + "learning_rate": 2.3313226000569975e-06, + "loss": 0.9583, + "step": 8153 + }, + { + "epoch": 1.5696994489496354, + "grad_norm": 3.178625228102367, + "learning_rate": 2.329321890670394e-06, + "loss": 0.9031, + "step": 8154 + }, + { + "epoch": 1.5698919556272108, + "grad_norm": 2.999660373423965, + "learning_rate": 2.3273219269718384e-06, + "loss": 0.8263, + "step": 8155 + }, + { + "epoch": 1.570084462304786, + "grad_norm": 3.397643679766613, + "learning_rate": 2.325322709155757e-06, + "loss": 0.9731, + "step": 8156 + }, + { + "epoch": 1.5702769689823617, + "grad_norm": 3.230941346911783, + "learning_rate": 2.3233242374164933e-06, + "loss": 0.8917, + "step": 8157 + }, + { + "epoch": 1.5704694756599369, + "grad_norm": 3.264226063388696, + "learning_rate": 2.3213265119483265e-06, + "loss": 0.9428, + "step": 8158 + }, + { + "epoch": 1.5706619823375123, + "grad_norm": 3.3648367888556803, + "learning_rate": 2.319329532945461e-06, + "loss": 0.9777, + "step": 8159 + }, + { + "epoch": 1.5708544890150877, + "grad_norm": 3.207624410673108, + "learning_rate": 2.3173333006020337e-06, + "loss": 0.8871, + "step": 8160 + }, + { + "epoch": 1.5708544890150877, + "lm_loss": 0.6826, + "step": 8160, + "vm_loss": 0.1382 + }, + { + "epoch": 1.5708544890150877, + "lm_loss": 1.0295, + "step": 8160, + "vm_loss": 0.1427 + }, + { + "epoch": 1.5708544890150877, + "lm_loss": 0.9451, + "step": 8160, + "vm_loss": 0.1033 + }, + { + "epoch": 1.5708544890150877, + "lm_loss": 0.6799, + "step": 8160, + "vm_loss": 0.1637 + }, + { + "epoch": 1.5708544890150877, + "lm_loss": 0.5979, + "step": 8160, + "vm_loss": 0.1614 + }, + { + "epoch": 1.5708544890150877, + "lm_loss": 1.0406, + "step": 8160, + "vm_loss": 0.1291 + }, + { + "epoch": 1.5708544890150877, + "lm_loss": 0.5861, + "step": 8160, + "vm_loss": 0.1672 + }, + { + "epoch": 1.5708544890150877, + "lm_loss": 0.6189, + "step": 8160, + "vm_loss": 0.1617 + }, + { + "epoch": 1.571046995692663, + "grad_norm": 3.1941149249792895, + "learning_rate": 2.315337815112093e-06, + "loss": 0.8958, + "step": 8161 + }, + { + "epoch": 1.5712395023702386, + "grad_norm": 3.257168598373191, + "learning_rate": 2.3133430766696383e-06, + "loss": 0.9373, + "step": 8162 + }, + { + "epoch": 1.5714320090478138, + "grad_norm": 3.3551957656097064, + "learning_rate": 2.311349085468575e-06, + "loss": 0.9508, + "step": 8163 + }, + { + "epoch": 1.5716245157253892, + "grad_norm": 3.2288193390098026, + "learning_rate": 2.3093558417027473e-06, + "loss": 0.9308, + "step": 8164 + }, + { + "epoch": 1.5718170224029646, + "grad_norm": 3.377165830861174, + "learning_rate": 2.3073633455659227e-06, + "loss": 0.9787, + "step": 8165 + }, + { + "epoch": 1.57200952908054, + "grad_norm": 3.2092360868528442, + "learning_rate": 2.3053715972517998e-06, + "loss": 0.9094, + "step": 8166 + }, + { + "epoch": 1.5722020357581155, + "grad_norm": 3.0851227318352046, + "learning_rate": 2.3033805969540024e-06, + "loss": 0.9008, + "step": 8167 + }, + { + "epoch": 1.5723945424356907, + "grad_norm": 3.2585858081746575, + "learning_rate": 2.3013903448660745e-06, + "loss": 0.9015, + "step": 8168 + }, + { + "epoch": 1.5723945424356907, + "lm_loss": 0.6834, + "step": 8168, + "vm_loss": 0.1651 + }, + { + "epoch": 1.5723945424356907, + "lm_loss": 0.6106, + "step": 8168, + "vm_loss": 0.1342 + }, + { + "epoch": 1.5723945424356907, + "lm_loss": 0.6573, + "step": 8168, + "vm_loss": 0.2406 + }, + { + "epoch": 1.5723945424356907, + "lm_loss": 0.6251, + "step": 8168, + "vm_loss": 0.2209 + }, + { + "epoch": 1.5723945424356907, + "lm_loss": 0.7978, + "step": 8168, + "vm_loss": 0.1653 + }, + { + "epoch": 1.5723945424356907, + "lm_loss": 0.6573, + "step": 8168, + "vm_loss": 0.1156 + }, + { + "epoch": 1.5723945424356907, + "lm_loss": 0.5448, + "step": 8168, + "vm_loss": 0.1304 + }, + { + "epoch": 1.5723945424356907, + "lm_loss": 0.9146, + "step": 8168, + "vm_loss": 0.1491 + }, + { + "epoch": 1.572587049113266, + "grad_norm": 3.3144961416756518, + "learning_rate": 2.299400841181504e-06, + "loss": 0.9481, + "step": 8169 + }, + { + "epoch": 1.5727795557908415, + "grad_norm": 3.188380465015555, + "learning_rate": 2.297412086093689e-06, + "loss": 0.9222, + "step": 8170 + }, + { + "epoch": 1.572972062468417, + "grad_norm": 3.19698615939922, + "learning_rate": 2.2954240797959635e-06, + "loss": 0.8456, + "step": 8171 + }, + { + "epoch": 1.5731645691459923, + "grad_norm": 3.1844655735565732, + "learning_rate": 2.2934368224815894e-06, + "loss": 0.9159, + "step": 8172 + }, + { + "epoch": 1.5733570758235675, + "grad_norm": 3.2158803168224974, + "learning_rate": 2.2914503143437526e-06, + "loss": 0.9468, + "step": 8173 + }, + { + "epoch": 1.573549582501143, + "grad_norm": 3.283085889625685, + "learning_rate": 2.2894645555755645e-06, + "loss": 0.9192, + "step": 8174 + }, + { + "epoch": 1.5737420891787184, + "grad_norm": 3.2326747144532852, + "learning_rate": 2.287479546370068e-06, + "loss": 0.8958, + "step": 8175 + }, + { + "epoch": 1.5739345958562938, + "grad_norm": 3.353028360845267, + "learning_rate": 2.285495286920232e-06, + "loss": 0.9647, + "step": 8176 + }, + { + "epoch": 1.5739345958562938, + "lm_loss": 0.8348, + "step": 8176, + "vm_loss": 0.174 + }, + { + "epoch": 1.5739345958562938, + "lm_loss": 0.5109, + "step": 8176, + "vm_loss": 0.1645 + }, + { + "epoch": 1.5739345958562938, + "lm_loss": 0.5087, + "step": 8176, + "vm_loss": 0.1431 + }, + { + "epoch": 1.5739345958562938, + "lm_loss": 0.8325, + "step": 8176, + "vm_loss": 0.2366 + }, + { + "epoch": 1.5739345958562938, + "lm_loss": 0.6733, + "step": 8176, + "vm_loss": 0.1798 + }, + { + "epoch": 1.5739345958562938, + "lm_loss": 0.9432, + "step": 8176, + "vm_loss": 0.1632 + }, + { + "epoch": 1.5739345958562938, + "lm_loss": 0.6622, + "step": 8176, + "vm_loss": 0.194 + }, + { + "epoch": 1.5739345958562938, + "lm_loss": 0.8813, + "step": 8176, + "vm_loss": 0.1431 + }, + { + "epoch": 1.5741271025338692, + "grad_norm": 3.3527984505299386, + "learning_rate": 2.283511777418954e-06, + "loss": 0.9355, + "step": 8177 + }, + { + "epoch": 1.5743196092114444, + "grad_norm": 3.006069459448071, + "learning_rate": 2.281529018059049e-06, + "loss": 0.8631, + "step": 8178 + }, + { + "epoch": 1.5745121158890198, + "grad_norm": 3.257133935818751, + "learning_rate": 2.2795470090332762e-06, + "loss": 0.9251, + "step": 8179 + }, + { + "epoch": 1.5747046225665953, + "grad_norm": 3.173798654869818, + "learning_rate": 2.277565750534305e-06, + "loss": 0.8903, + "step": 8180 + }, + { + "epoch": 1.5748971292441707, + "grad_norm": 3.1206637015978975, + "learning_rate": 2.275585242754741e-06, + "loss": 0.8856, + "step": 8181 + }, + { + "epoch": 1.5750896359217461, + "grad_norm": 3.272775075891993, + "learning_rate": 2.273605485887116e-06, + "loss": 0.9306, + "step": 8182 + }, + { + "epoch": 1.5752821425993213, + "grad_norm": 3.3119786055097755, + "learning_rate": 2.2716264801238908e-06, + "loss": 0.9059, + "step": 8183 + }, + { + "epoch": 1.575474649276897, + "grad_norm": 3.115321483924805, + "learning_rate": 2.2696482256574426e-06, + "loss": 0.9011, + "step": 8184 + }, + { + "epoch": 1.575474649276897, + "lm_loss": 0.8412, + "step": 8184, + "vm_loss": 0.1364 + }, + { + "epoch": 1.575474649276897, + "lm_loss": 0.872, + "step": 8184, + "vm_loss": 0.1733 + }, + { + "epoch": 1.575474649276897, + "lm_loss": 0.6991, + "step": 8184, + "vm_loss": 0.149 + }, + { + "epoch": 1.575474649276897, + "lm_loss": 0.8603, + "step": 8184, + "vm_loss": 0.1675 + }, + { + "epoch": 1.575474649276897, + "lm_loss": 0.9292, + "step": 8184, + "vm_loss": 0.1952 + }, + { + "epoch": 1.575474649276897, + "lm_loss": 0.9954, + "step": 8184, + "vm_loss": 0.1753 + }, + { + "epoch": 1.575474649276897, + "lm_loss": 0.7913, + "step": 8184, + "vm_loss": 0.1556 + }, + { + "epoch": 1.575474649276897, + "lm_loss": 0.5971, + "step": 8184, + "vm_loss": 0.1768 + }, + { + "epoch": 1.5756671559544722, + "grad_norm": 3.293245656035095, + "learning_rate": 2.2676707226800876e-06, + "loss": 0.9186, + "step": 8185 + }, + { + "epoch": 1.5758596626320476, + "grad_norm": 3.2870770804201244, + "learning_rate": 2.265693971384063e-06, + "loss": 0.9065, + "step": 8186 + }, + { + "epoch": 1.576052169309623, + "grad_norm": 3.251360928941749, + "learning_rate": 2.2637179719615367e-06, + "loss": 0.9301, + "step": 8187 + }, + { + "epoch": 1.5762446759871982, + "grad_norm": 3.331542614086288, + "learning_rate": 2.2617427246045976e-06, + "loss": 0.8974, + "step": 8188 + }, + { + "epoch": 1.5764371826647738, + "grad_norm": 3.1992042923231767, + "learning_rate": 2.259768229505268e-06, + "loss": 0.9179, + "step": 8189 + }, + { + "epoch": 1.576629689342349, + "grad_norm": 3.284790450948841, + "learning_rate": 2.2577944868554947e-06, + "loss": 0.9203, + "step": 8190 + }, + { + "epoch": 1.5768221960199245, + "grad_norm": 3.291675186957191, + "learning_rate": 2.255821496847147e-06, + "loss": 0.8864, + "step": 8191 + }, + { + "epoch": 1.5770147026974999, + "grad_norm": 3.426732793788415, + "learning_rate": 2.2538492596720275e-06, + "loss": 0.9391, + "step": 8192 + }, + { + "epoch": 1.5770147026974999, + "lm_loss": 0.6776, + "step": 8192, + "vm_loss": 0.2087 + }, + { + "epoch": 1.5770147026974999, + "lm_loss": 0.6365, + "step": 8192, + "vm_loss": 0.2183 + }, + { + "epoch": 1.5770147026974999, + "lm_loss": 0.5117, + "step": 8192, + "vm_loss": 0.2143 + }, + { + "epoch": 1.5770147026974999, + "lm_loss": 0.4408, + "step": 8192, + "vm_loss": 0.1412 + }, + { + "epoch": 1.5770147026974999, + "lm_loss": 0.6576, + "step": 8192, + "vm_loss": 0.0783 + }, + { + "epoch": 1.5770147026974999, + "lm_loss": 0.6194, + "step": 8192, + "vm_loss": 0.1757 + }, + { + "epoch": 1.5770147026974999, + "lm_loss": 0.4873, + "step": 8192, + "vm_loss": 0.1304 + }, + { + "epoch": 1.5770147026974999, + "lm_loss": 0.8917, + "step": 8192, + "vm_loss": 0.1478 + }, + { + "epoch": 1.577207209375075, + "grad_norm": 3.187442037946832, + "learning_rate": 2.251877775521861e-06, + "loss": 0.8918, + "step": 8193 + }, + { + "epoch": 1.5773997160526507, + "grad_norm": 3.3938651593298697, + "learning_rate": 2.2499070445883054e-06, + "loss": 0.9387, + "step": 8194 + }, + { + "epoch": 1.577592222730226, + "grad_norm": 3.1989251367932465, + "learning_rate": 2.247937067062932e-06, + "loss": 0.9224, + "step": 8195 + }, + { + "epoch": 1.5777847294078013, + "grad_norm": 3.1228217595534287, + "learning_rate": 2.245967843137259e-06, + "loss": 0.9039, + "step": 8196 + }, + { + "epoch": 1.5779772360853768, + "grad_norm": 3.220670253396689, + "learning_rate": 2.243999373002712e-06, + "loss": 0.8637, + "step": 8197 + }, + { + "epoch": 1.578169742762952, + "grad_norm": 3.3071434802258173, + "learning_rate": 2.242031656850654e-06, + "loss": 0.9318, + "step": 8198 + }, + { + "epoch": 1.5783622494405276, + "grad_norm": 3.192511413469724, + "learning_rate": 2.2400646948723724e-06, + "loss": 0.8818, + "step": 8199 + }, + { + "epoch": 1.5785547561181028, + "grad_norm": 3.2523337471462885, + "learning_rate": 2.238098487259084e-06, + "loss": 0.9011, + "step": 8200 + }, + { + "epoch": 1.5785547561181028, + "lm_loss": 0.6058, + "step": 8200, + "vm_loss": 0.2292 + }, + { + "epoch": 1.5785547561181028, + "lm_loss": 0.7306, + "step": 8200, + "vm_loss": 0.1315 + }, + { + "epoch": 1.5785547561181028, + "lm_loss": 0.9154, + "step": 8200, + "vm_loss": 0.0988 + }, + { + "epoch": 1.5785547561181028, + "lm_loss": 0.5544, + "step": 8200, + "vm_loss": 0.1249 + }, + { + "epoch": 1.5785547561181028, + "lm_loss": 0.5119, + "step": 8200, + "vm_loss": 0.1282 + }, + { + "epoch": 1.5785547561181028, + "lm_loss": 0.9309, + "step": 8200, + "vm_loss": 0.147 + }, + { + "epoch": 1.5785547561181028, + "lm_loss": 0.9151, + "step": 8200, + "vm_loss": 0.2426 + }, + { + "epoch": 1.5785547561181028, + "lm_loss": 0.8323, + "step": 8200, + "vm_loss": 0.1723 + }, + { + "epoch": 1.5787472627956782, + "grad_norm": 3.3301845399628953, + "learning_rate": 2.236133034201924e-06, + "loss": 0.8786, + "step": 8201 + }, + { + "epoch": 1.5789397694732537, + "grad_norm": 3.3761877741493973, + "learning_rate": 2.234168335891961e-06, + "loss": 0.9348, + "step": 8202 + }, + { + "epoch": 1.5791322761508289, + "grad_norm": 3.1487978734408872, + "learning_rate": 2.23220439252019e-06, + "loss": 0.8922, + "step": 8203 + }, + { + "epoch": 1.5793247828284045, + "grad_norm": 3.17940209113727, + "learning_rate": 2.2302412042775344e-06, + "loss": 0.9023, + "step": 8204 + }, + { + "epoch": 1.5795172895059797, + "grad_norm": 3.1770255131272873, + "learning_rate": 2.2282787713548315e-06, + "loss": 0.8736, + "step": 8205 + }, + { + "epoch": 1.5797097961835551, + "grad_norm": 3.2710488861028924, + "learning_rate": 2.226317093942868e-06, + "loss": 0.9217, + "step": 8206 + }, + { + "epoch": 1.5799023028611305, + "grad_norm": 3.2204827277678936, + "learning_rate": 2.224356172232335e-06, + "loss": 0.883, + "step": 8207 + }, + { + "epoch": 1.5800948095387057, + "grad_norm": 3.2602247354373763, + "learning_rate": 2.2223960064138605e-06, + "loss": 0.9261, + "step": 8208 + }, + { + "epoch": 1.5800948095387057, + "lm_loss": 0.8811, + "step": 8208, + "vm_loss": 0.162 + }, + { + "epoch": 1.5800948095387057, + "lm_loss": 0.7241, + "step": 8208, + "vm_loss": 0.2009 + }, + { + "epoch": 1.5800948095387057, + "lm_loss": 1.0384, + "step": 8208, + "vm_loss": 0.1229 + }, + { + "epoch": 1.5800948095387057, + "lm_loss": 1.0267, + "step": 8208, + "vm_loss": 0.1738 + }, + { + "epoch": 1.5800948095387057, + "lm_loss": 0.7457, + "step": 8208, + "vm_loss": 0.1711 + }, + { + "epoch": 1.5800948095387057, + "lm_loss": 0.4828, + "step": 8208, + "vm_loss": 0.2208 + }, + { + "epoch": 1.5800948095387057, + "lm_loss": 0.9754, + "step": 8208, + "vm_loss": 0.1615 + }, + { + "epoch": 1.5800948095387057, + "lm_loss": 0.6169, + "step": 8208, + "vm_loss": 0.1108 + }, + { + "epoch": 1.5802873162162814, + "grad_norm": 3.3094468273377937, + "learning_rate": 2.2204365966779983e-06, + "loss": 0.9531, + "step": 8209 + }, + { + "epoch": 1.5804798228938566, + "grad_norm": 3.1726877924759647, + "learning_rate": 2.218477943215229e-06, + "loss": 0.8924, + "step": 8210 + }, + { + "epoch": 1.580672329571432, + "grad_norm": 3.6417494085076387, + "learning_rate": 2.216520046215962e-06, + "loss": 1.0206, + "step": 8211 + }, + { + "epoch": 1.5808648362490074, + "grad_norm": 3.4330684466800463, + "learning_rate": 2.214562905870521e-06, + "loss": 0.9663, + "step": 8212 + }, + { + "epoch": 1.5810573429265826, + "grad_norm": 3.3981598688903825, + "learning_rate": 2.212606522369175e-06, + "loss": 0.9794, + "step": 8213 + }, + { + "epoch": 1.5812498496041583, + "grad_norm": 3.2584667017760722, + "learning_rate": 2.210650895902102e-06, + "loss": 0.8862, + "step": 8214 + }, + { + "epoch": 1.5814423562817335, + "grad_norm": 3.242158263707811, + "learning_rate": 2.208696026659417e-06, + "loss": 0.9195, + "step": 8215 + }, + { + "epoch": 1.5816348629593089, + "grad_norm": 3.156867509445372, + "learning_rate": 2.2067419148311585e-06, + "loss": 0.9064, + "step": 8216 + }, + { + "epoch": 1.5816348629593089, + "lm_loss": 0.7287, + "step": 8216, + "vm_loss": 0.1929 + }, + { + "epoch": 1.5816348629593089, + "lm_loss": 0.7278, + "step": 8216, + "vm_loss": 0.1778 + }, + { + "epoch": 1.5816348629593089, + "lm_loss": 0.6999, + "step": 8216, + "vm_loss": 0.1727 + }, + { + "epoch": 1.5816348629593089, + "lm_loss": 0.7356, + "step": 8216, + "vm_loss": 0.1028 + }, + { + "epoch": 1.5816348629593089, + "lm_loss": 0.8408, + "step": 8216, + "vm_loss": 0.2673 + }, + { + "epoch": 1.5816348629593089, + "lm_loss": 0.7208, + "step": 8216, + "vm_loss": 0.1462 + }, + { + "epoch": 1.5816348629593089, + "lm_loss": 0.6515, + "step": 8216, + "vm_loss": 0.2032 + }, + { + "epoch": 1.5816348629593089, + "lm_loss": 0.4321, + "step": 8216, + "vm_loss": 0.1334 + }, + { + "epoch": 1.5818273696368843, + "grad_norm": 3.1852110175199226, + "learning_rate": 2.2047885606072926e-06, + "loss": 0.9176, + "step": 8217 + }, + { + "epoch": 1.5820198763144595, + "grad_norm": 3.161167760440285, + "learning_rate": 2.2028359641777066e-06, + "loss": 0.8692, + "step": 8218 + }, + { + "epoch": 1.5822123829920351, + "grad_norm": 3.2571311621277252, + "learning_rate": 2.20088412573222e-06, + "loss": 0.9363, + "step": 8219 + }, + { + "epoch": 1.5824048896696103, + "grad_norm": 3.1570445081681355, + "learning_rate": 2.198933045460575e-06, + "loss": 0.8953, + "step": 8220 + }, + { + "epoch": 1.5825973963471858, + "grad_norm": 3.1799866214407064, + "learning_rate": 2.1969827235524467e-06, + "loss": 0.8934, + "step": 8221 + }, + { + "epoch": 1.5827899030247612, + "grad_norm": 3.3450851847850807, + "learning_rate": 2.1950331601974207e-06, + "loss": 0.9384, + "step": 8222 + }, + { + "epoch": 1.5829824097023364, + "grad_norm": 3.2608110680430316, + "learning_rate": 2.1930843555850324e-06, + "loss": 0.9044, + "step": 8223 + }, + { + "epoch": 1.583174916379912, + "grad_norm": 3.22262931269552, + "learning_rate": 2.1911363099047222e-06, + "loss": 0.9165, + "step": 8224 + }, + { + "epoch": 1.583174916379912, + "lm_loss": 0.836, + "step": 8224, + "vm_loss": 0.136 + }, + { + "epoch": 1.583174916379912, + "lm_loss": 0.881, + "step": 8224, + "vm_loss": 0.1955 + }, + { + "epoch": 1.583174916379912, + "lm_loss": 0.6971, + "step": 8224, + "vm_loss": 0.1834 + }, + { + "epoch": 1.583174916379912, + "lm_loss": 1.0072, + "step": 8224, + "vm_loss": 0.1953 + }, + { + "epoch": 1.583174916379912, + "lm_loss": 0.5985, + "step": 8224, + "vm_loss": 0.1662 + }, + { + "epoch": 1.583174916379912, + "lm_loss": 0.9451, + "step": 8224, + "vm_loss": 0.2185 + }, + { + "epoch": 1.583174916379912, + "lm_loss": 0.9946, + "step": 8224, + "vm_loss": 0.1552 + }, + { + "epoch": 1.583174916379912, + "lm_loss": 0.6879, + "step": 8224, + "vm_loss": 0.1201 + }, + { + "epoch": 1.5833674230574872, + "grad_norm": 3.463676796008061, + "learning_rate": 2.189189023345866e-06, + "loss": 0.9423, + "step": 8225 + }, + { + "epoch": 1.5835599297350627, + "grad_norm": 3.352628672710239, + "learning_rate": 2.1872424960977667e-06, + "loss": 0.9134, + "step": 8226 + }, + { + "epoch": 1.583752436412638, + "grad_norm": 3.5250084325447077, + "learning_rate": 2.185296728349654e-06, + "loss": 0.9722, + "step": 8227 + }, + { + "epoch": 1.5839449430902133, + "grad_norm": 3.278741367798804, + "learning_rate": 2.183351720290676e-06, + "loss": 0.9162, + "step": 8228 + }, + { + "epoch": 1.584137449767789, + "grad_norm": 3.2639077693837204, + "learning_rate": 2.1814074721099144e-06, + "loss": 0.9219, + "step": 8229 + }, + { + "epoch": 1.5843299564453641, + "grad_norm": 3.17069440568728, + "learning_rate": 2.1794639839963762e-06, + "loss": 0.9212, + "step": 8230 + }, + { + "epoch": 1.5845224631229395, + "grad_norm": 3.24572136550553, + "learning_rate": 2.1775212561389925e-06, + "loss": 0.9023, + "step": 8231 + }, + { + "epoch": 1.584714969800515, + "grad_norm": 3.173234626405369, + "learning_rate": 2.1755792887266236e-06, + "loss": 0.8955, + "step": 8232 + }, + { + "epoch": 1.584714969800515, + "lm_loss": 1.0978, + "step": 8232, + "vm_loss": 0.0847 + }, + { + "epoch": 1.584714969800515, + "lm_loss": 0.671, + "step": 8232, + "vm_loss": 0.2121 + }, + { + "epoch": 1.584714969800515, + "lm_loss": 0.5522, + "step": 8232, + "vm_loss": 0.1703 + }, + { + "epoch": 1.584714969800515, + "lm_loss": 0.7667, + "step": 8232, + "vm_loss": 0.1419 + }, + { + "epoch": 1.584714969800515, + "lm_loss": 0.9111, + "step": 8232, + "vm_loss": 0.145 + }, + { + "epoch": 1.584714969800515, + "lm_loss": 0.7633, + "step": 8232, + "vm_loss": 0.1308 + }, + { + "epoch": 1.584714969800515, + "lm_loss": 0.4721, + "step": 8232, + "vm_loss": 0.1307 + }, + { + "epoch": 1.584714969800515, + "lm_loss": 0.7309, + "step": 8232, + "vm_loss": 0.1926 + }, + { + "epoch": 1.5849074764780904, + "grad_norm": 3.2109544580273286, + "learning_rate": 2.173638081948046e-06, + "loss": 0.9018, + "step": 8233 + }, + { + "epoch": 1.5850999831556658, + "grad_norm": 3.3142580995582795, + "learning_rate": 2.1716976359919818e-06, + "loss": 0.9442, + "step": 8234 + }, + { + "epoch": 1.585292489833241, + "grad_norm": 3.2627328726224367, + "learning_rate": 2.169757951047057e-06, + "loss": 0.9443, + "step": 8235 + }, + { + "epoch": 1.5854849965108164, + "grad_norm": 3.2255978387830884, + "learning_rate": 2.1678190273018376e-06, + "loss": 0.9295, + "step": 8236 + }, + { + "epoch": 1.5856775031883918, + "grad_norm": 3.278314759275568, + "learning_rate": 2.1658808649448105e-06, + "loss": 0.9043, + "step": 8237 + }, + { + "epoch": 1.5858700098659673, + "grad_norm": 3.331337344067321, + "learning_rate": 2.1639434641643954e-06, + "loss": 0.9193, + "step": 8238 + }, + { + "epoch": 1.5860625165435427, + "grad_norm": 3.371963193118649, + "learning_rate": 2.1620068251489214e-06, + "loss": 0.9522, + "step": 8239 + }, + { + "epoch": 1.5862550232211179, + "grad_norm": 3.229596579387952, + "learning_rate": 2.1600709480866667e-06, + "loss": 0.8901, + "step": 8240 + }, + { + "epoch": 1.5862550232211179, + "lm_loss": 1.1651, + "step": 8240, + "vm_loss": 0.1415 + }, + { + "epoch": 1.5862550232211179, + "lm_loss": 0.8819, + "step": 8240, + "vm_loss": 0.1347 + }, + { + "epoch": 1.5862550232211179, + "lm_loss": 0.5262, + "step": 8240, + "vm_loss": 0.1228 + }, + { + "epoch": 1.5862550232211179, + "lm_loss": 0.7565, + "step": 8240, + "vm_loss": 0.1565 + }, + { + "epoch": 1.5862550232211179, + "lm_loss": 0.5928, + "step": 8240, + "vm_loss": 0.1788 + }, + { + "epoch": 1.5862550232211179, + "lm_loss": 0.5042, + "step": 8240, + "vm_loss": 0.1751 + }, + { + "epoch": 1.5862550232211179, + "lm_loss": 0.6945, + "step": 8240, + "vm_loss": 0.1634 + }, + { + "epoch": 1.5862550232211179, + "lm_loss": 0.626, + "step": 8240, + "vm_loss": 0.1652 + }, + { + "epoch": 1.5864475298986933, + "grad_norm": 3.1789923911149165, + "learning_rate": 2.158135833165815e-06, + "loss": 0.874, + "step": 8241 + }, + { + "epoch": 1.5866400365762687, + "grad_norm": 3.2201266594907314, + "learning_rate": 2.1562014805744913e-06, + "loss": 0.9344, + "step": 8242 + }, + { + "epoch": 1.5868325432538442, + "grad_norm": 3.268882139437811, + "learning_rate": 2.1542678905007287e-06, + "loss": 0.8923, + "step": 8243 + }, + { + "epoch": 1.5870250499314196, + "grad_norm": 3.1699739429129195, + "learning_rate": 2.152335063132508e-06, + "loss": 0.9255, + "step": 8244 + }, + { + "epoch": 1.5872175566089948, + "grad_norm": 3.1690268395937538, + "learning_rate": 2.150402998657718e-06, + "loss": 0.915, + "step": 8245 + }, + { + "epoch": 1.5874100632865704, + "grad_norm": 3.221311426877247, + "learning_rate": 2.1484716972641828e-06, + "loss": 0.9245, + "step": 8246 + }, + { + "epoch": 1.5876025699641456, + "grad_norm": 3.2618171724113623, + "learning_rate": 2.146541159139649e-06, + "loss": 0.9447, + "step": 8247 + }, + { + "epoch": 1.587795076641721, + "grad_norm": 3.2384727153698902, + "learning_rate": 2.144611384471792e-06, + "loss": 0.8857, + "step": 8248 + }, + { + "epoch": 1.587795076641721, + "lm_loss": 0.3583, + "step": 8248, + "vm_loss": 0.1374 + }, + { + "epoch": 1.587795076641721, + "lm_loss": 1.0985, + "step": 8248, + "vm_loss": 0.1681 + }, + { + "epoch": 1.587795076641721, + "lm_loss": 0.7634, + "step": 8248, + "vm_loss": 0.1271 + }, + { + "epoch": 1.587795076641721, + "lm_loss": 0.7449, + "step": 8248, + "vm_loss": 0.1615 + }, + { + "epoch": 1.587795076641721, + "lm_loss": 0.647, + "step": 8248, + "vm_loss": 0.1673 + }, + { + "epoch": 1.587795076641721, + "lm_loss": 0.852, + "step": 8248, + "vm_loss": 0.1653 + }, + { + "epoch": 1.587795076641721, + "lm_loss": 0.7354, + "step": 8248, + "vm_loss": 0.1316 + }, + { + "epoch": 1.587795076641721, + "lm_loss": 0.5444, + "step": 8248, + "vm_loss": 0.1388 + }, + { + "epoch": 1.5879875833192965, + "grad_norm": 3.2719808369185825, + "learning_rate": 2.142682373448205e-06, + "loss": 0.9129, + "step": 8249 + }, + { + "epoch": 1.5881800899968717, + "grad_norm": 3.288993460020872, + "learning_rate": 2.1407541262564157e-06, + "loss": 0.8914, + "step": 8250 + }, + { + "epoch": 1.5883725966744473, + "grad_norm": 3.2399579563294845, + "learning_rate": 2.1388266430838757e-06, + "loss": 0.9467, + "step": 8251 + }, + { + "epoch": 1.5885651033520225, + "grad_norm": 3.3038289880947387, + "learning_rate": 2.1368999241179578e-06, + "loss": 0.8996, + "step": 8252 + }, + { + "epoch": 1.588757610029598, + "grad_norm": 3.153658976784306, + "learning_rate": 2.134973969545967e-06, + "loss": 0.8724, + "step": 8253 + }, + { + "epoch": 1.5889501167071733, + "grad_norm": 3.4391392516375694, + "learning_rate": 2.133048779555129e-06, + "loss": 0.9789, + "step": 8254 + }, + { + "epoch": 1.5891426233847485, + "grad_norm": 3.263121193278418, + "learning_rate": 2.131124354332601e-06, + "loss": 0.8899, + "step": 8255 + }, + { + "epoch": 1.5893351300623242, + "grad_norm": 3.3863184790056238, + "learning_rate": 2.129200694065452e-06, + "loss": 0.9311, + "step": 8256 + }, + { + "epoch": 1.5893351300623242, + "lm_loss": 1.1368, + "step": 8256, + "vm_loss": 0.2064 + }, + { + "epoch": 1.5893351300623242, + "lm_loss": 0.8404, + "step": 8256, + "vm_loss": 0.1408 + }, + { + "epoch": 1.5893351300623242, + "lm_loss": 0.647, + "step": 8256, + "vm_loss": 0.126 + }, + { + "epoch": 1.5893351300623242, + "lm_loss": 0.9692, + "step": 8256, + "vm_loss": 0.1821 + }, + { + "epoch": 1.5893351300623242, + "lm_loss": 0.6332, + "step": 8256, + "vm_loss": 0.1298 + }, + { + "epoch": 1.5893351300623242, + "lm_loss": 0.9095, + "step": 8256, + "vm_loss": 0.1552 + }, + { + "epoch": 1.5893351300623242, + "lm_loss": 0.6873, + "step": 8256, + "vm_loss": 0.1436 + }, + { + "epoch": 1.5893351300623242, + "lm_loss": 0.7877, + "step": 8256, + "vm_loss": 0.1109 + }, + { + "epoch": 1.5895276367398994, + "grad_norm": 3.3680130258038123, + "learning_rate": 2.127277798940698e-06, + "loss": 0.9021, + "step": 8257 + }, + { + "epoch": 1.5897201434174748, + "grad_norm": 3.51722159129693, + "learning_rate": 2.1253556691452603e-06, + "loss": 0.9496, + "step": 8258 + }, + { + "epoch": 1.5899126500950502, + "grad_norm": 3.3200275621014845, + "learning_rate": 2.123434304866e-06, + "loss": 0.9232, + "step": 8259 + }, + { + "epoch": 1.5901051567726254, + "grad_norm": 3.5031689445659238, + "learning_rate": 2.121513706289692e-06, + "loss": 0.956, + "step": 8260 + }, + { + "epoch": 1.590297663450201, + "grad_norm": 3.0817432319221294, + "learning_rate": 2.1195938736030507e-06, + "loss": 0.8935, + "step": 8261 + }, + { + "epoch": 1.5904901701277763, + "grad_norm": 3.2296742571778934, + "learning_rate": 2.1176748069927035e-06, + "loss": 0.911, + "step": 8262 + }, + { + "epoch": 1.5906826768053517, + "grad_norm": 3.2895183400037054, + "learning_rate": 2.115756506645209e-06, + "loss": 0.9344, + "step": 8263 + }, + { + "epoch": 1.5908751834829271, + "grad_norm": 3.326861806896037, + "learning_rate": 2.1138389727470508e-06, + "loss": 0.9498, + "step": 8264 + }, + { + "epoch": 1.5908751834829271, + "lm_loss": 0.984, + "step": 8264, + "vm_loss": 0.1455 + }, + { + "epoch": 1.5908751834829271, + "lm_loss": 0.5109, + "step": 8264, + "vm_loss": 0.1731 + }, + { + "epoch": 1.5908751834829271, + "lm_loss": 0.5482, + "step": 8264, + "vm_loss": 0.134 + }, + { + "epoch": 1.5908751834829271, + "lm_loss": 0.5695, + "step": 8264, + "vm_loss": 0.1475 + }, + { + "epoch": 1.5908751834829271, + "lm_loss": 0.5779, + "step": 8264, + "vm_loss": 0.1808 + }, + { + "epoch": 1.5908751834829271, + "lm_loss": 0.7628, + "step": 8264, + "vm_loss": 0.1704 + }, + { + "epoch": 1.5908751834829271, + "lm_loss": 0.6919, + "step": 8264, + "vm_loss": 0.154 + }, + { + "epoch": 1.5908751834829271, + "lm_loss": 0.3581, + "step": 8264, + "vm_loss": 0.1932 + }, + { + "epoch": 1.5910676901605023, + "grad_norm": 3.0889364039039036, + "learning_rate": 2.111922205484641e-06, + "loss": 0.8708, + "step": 8265 + }, + { + "epoch": 1.591260196838078, + "grad_norm": 3.1380712248967235, + "learning_rate": 2.110006205044307e-06, + "loss": 0.866, + "step": 8266 + }, + { + "epoch": 1.5914527035156532, + "grad_norm": 3.374327927916381, + "learning_rate": 2.108090971612313e-06, + "loss": 0.9162, + "step": 8267 + }, + { + "epoch": 1.5916452101932286, + "grad_norm": 3.2724945876137017, + "learning_rate": 2.106176505374844e-06, + "loss": 0.9164, + "step": 8268 + }, + { + "epoch": 1.591837716870804, + "grad_norm": 3.1877542397708005, + "learning_rate": 2.1042628065180105e-06, + "loss": 0.9258, + "step": 8269 + }, + { + "epoch": 1.5920302235483792, + "grad_norm": 3.4767676783801815, + "learning_rate": 2.102349875227847e-06, + "loss": 0.9931, + "step": 8270 + }, + { + "epoch": 1.5922227302259548, + "grad_norm": 3.4162759032961847, + "learning_rate": 2.1004377116903195e-06, + "loss": 0.9238, + "step": 8271 + }, + { + "epoch": 1.59241523690353, + "grad_norm": 3.174705485908778, + "learning_rate": 2.098526316091308e-06, + "loss": 0.8909, + "step": 8272 + }, + { + "epoch": 1.59241523690353, + "lm_loss": 1.0846, + "step": 8272, + "vm_loss": 0.2141 + }, + { + "epoch": 1.59241523690353, + "lm_loss": 0.6833, + "step": 8272, + "vm_loss": 0.1333 + }, + { + "epoch": 1.59241523690353, + "lm_loss": 0.5514, + "step": 8272, + "vm_loss": 0.2057 + }, + { + "epoch": 1.59241523690353, + "lm_loss": 0.6213, + "step": 8272, + "vm_loss": 0.1145 + }, + { + "epoch": 1.59241523690353, + "lm_loss": 0.6342, + "step": 8272, + "vm_loss": 0.1636 + }, + { + "epoch": 1.59241523690353, + "lm_loss": 0.7591, + "step": 8272, + "vm_loss": 0.105 + }, + { + "epoch": 1.59241523690353, + "lm_loss": 0.553, + "step": 8272, + "vm_loss": 0.1479 + }, + { + "epoch": 1.59241523690353, + "lm_loss": 0.5246, + "step": 8272, + "vm_loss": 0.1166 + }, + { + "epoch": 1.5926077435811055, + "grad_norm": 3.209103944378494, + "learning_rate": 2.096615688616629e-06, + "loss": 0.851, + "step": 8273 + }, + { + "epoch": 1.5928002502586809, + "grad_norm": 3.1659651472179733, + "learning_rate": 2.094705829452018e-06, + "loss": 0.8988, + "step": 8274 + }, + { + "epoch": 1.592992756936256, + "grad_norm": 3.332303504014603, + "learning_rate": 2.0927967387831395e-06, + "loss": 0.9351, + "step": 8275 + }, + { + "epoch": 1.5931852636138317, + "grad_norm": 3.3133121839309543, + "learning_rate": 2.090888416795582e-06, + "loss": 0.9528, + "step": 8276 + }, + { + "epoch": 1.593377770291407, + "grad_norm": 3.1087676897121703, + "learning_rate": 2.088980863674853e-06, + "loss": 0.8593, + "step": 8277 + }, + { + "epoch": 1.5935702769689823, + "grad_norm": 3.2490769004783777, + "learning_rate": 2.0870740796064015e-06, + "loss": 0.8851, + "step": 8278 + }, + { + "epoch": 1.5937627836465578, + "grad_norm": 3.2246541185243704, + "learning_rate": 2.0851680647755812e-06, + "loss": 0.8852, + "step": 8279 + }, + { + "epoch": 1.593955290324133, + "grad_norm": 3.2637560147604745, + "learning_rate": 2.0832628193676864e-06, + "loss": 0.9056, + "step": 8280 + }, + { + "epoch": 1.593955290324133, + "lm_loss": 0.7534, + "step": 8280, + "vm_loss": 0.2144 + }, + { + "epoch": 1.593955290324133, + "lm_loss": 0.6807, + "step": 8280, + "vm_loss": 0.2277 + }, + { + "epoch": 1.593955290324133, + "lm_loss": 1.098, + "step": 8280, + "vm_loss": 0.1633 + }, + { + "epoch": 1.593955290324133, + "lm_loss": 0.726, + "step": 8280, + "vm_loss": 0.1523 + }, + { + "epoch": 1.593955290324133, + "lm_loss": 0.796, + "step": 8280, + "vm_loss": 0.1941 + }, + { + "epoch": 1.593955290324133, + "lm_loss": 0.283, + "step": 8280, + "vm_loss": 0.2627 + }, + { + "epoch": 1.593955290324133, + "lm_loss": 0.6848, + "step": 8280, + "vm_loss": 0.1281 + }, + { + "epoch": 1.593955290324133, + "lm_loss": 0.6416, + "step": 8280, + "vm_loss": 0.1572 + }, + { + "epoch": 1.5941477970017086, + "grad_norm": 3.307097229603673, + "learning_rate": 2.0813583435679295e-06, + "loss": 0.9457, + "step": 8281 + }, + { + "epoch": 1.5943403036792838, + "grad_norm": 3.2138158606667204, + "learning_rate": 2.0794546375614543e-06, + "loss": 0.8821, + "step": 8282 + }, + { + "epoch": 1.5945328103568592, + "grad_norm": 3.353456294736932, + "learning_rate": 2.077551701533318e-06, + "loss": 0.9028, + "step": 8283 + }, + { + "epoch": 1.5947253170344347, + "grad_norm": 3.1925695940369243, + "learning_rate": 2.0756495356685147e-06, + "loss": 0.8619, + "step": 8284 + }, + { + "epoch": 1.5949178237120099, + "grad_norm": 3.2155619911772026, + "learning_rate": 2.0737481401519576e-06, + "loss": 0.9192, + "step": 8285 + }, + { + "epoch": 1.5951103303895855, + "grad_norm": 3.3890080264834808, + "learning_rate": 2.0718475151684913e-06, + "loss": 0.9258, + "step": 8286 + }, + { + "epoch": 1.5953028370671607, + "grad_norm": 3.157819241474124, + "learning_rate": 2.0699476609028714e-06, + "loss": 0.8851, + "step": 8287 + }, + { + "epoch": 1.5954953437447361, + "grad_norm": 3.1975040811645044, + "learning_rate": 2.0680485775397983e-06, + "loss": 0.8677, + "step": 8288 + }, + { + "epoch": 1.5954953437447361, + "lm_loss": 0.7623, + "step": 8288, + "vm_loss": 0.1216 + }, + { + "epoch": 1.5954953437447361, + "lm_loss": 0.6548, + "step": 8288, + "vm_loss": 0.1728 + }, + { + "epoch": 1.5954953437447361, + "lm_loss": 0.8758, + "step": 8288, + "vm_loss": 0.1345 + }, + { + "epoch": 1.5954953437447361, + "lm_loss": 0.5852, + "step": 8288, + "vm_loss": 0.1643 + }, + { + "epoch": 1.5954953437447361, + "lm_loss": 0.9623, + "step": 8288, + "vm_loss": 0.1888 + }, + { + "epoch": 1.5954953437447361, + "lm_loss": 0.6871, + "step": 8288, + "vm_loss": 0.1164 + }, + { + "epoch": 1.5954953437447361, + "lm_loss": 0.7335, + "step": 8288, + "vm_loss": 0.1489 + }, + { + "epoch": 1.5954953437447361, + "lm_loss": 0.3618, + "step": 8288, + "vm_loss": 0.2121 + }, + { + "epoch": 1.5956878504223115, + "grad_norm": 3.178929153855023, + "learning_rate": 2.0661502652638822e-06, + "loss": 0.8982, + "step": 8289 + }, + { + "epoch": 1.5958803570998867, + "grad_norm": 3.211535634731193, + "learning_rate": 2.064252724259662e-06, + "loss": 0.941, + "step": 8290 + }, + { + "epoch": 1.5960728637774624, + "grad_norm": 3.154287789712948, + "learning_rate": 2.0623559547116055e-06, + "loss": 0.9034, + "step": 8291 + }, + { + "epoch": 1.5962653704550376, + "grad_norm": 3.1613609530987508, + "learning_rate": 2.060459956804103e-06, + "loss": 0.867, + "step": 8292 + }, + { + "epoch": 1.596457877132613, + "grad_norm": 3.2989629970352983, + "learning_rate": 2.0585647307214707e-06, + "loss": 0.9263, + "step": 8293 + }, + { + "epoch": 1.5966503838101884, + "grad_norm": 3.4278171879053065, + "learning_rate": 2.0566702766479454e-06, + "loss": 0.9745, + "step": 8294 + }, + { + "epoch": 1.5968428904877638, + "grad_norm": 3.0830439356882544, + "learning_rate": 2.0547765947676933e-06, + "loss": 0.8762, + "step": 8295 + }, + { + "epoch": 1.5970353971653393, + "grad_norm": 3.2516764486934235, + "learning_rate": 2.052883685264806e-06, + "loss": 0.8958, + "step": 8296 + }, + { + "epoch": 1.5970353971653393, + "lm_loss": 0.9544, + "step": 8296, + "vm_loss": 0.1839 + }, + { + "epoch": 1.5970353971653393, + "lm_loss": 0.6604, + "step": 8296, + "vm_loss": 0.1644 + }, + { + "epoch": 1.5970353971653393, + "lm_loss": 0.5058, + "step": 8296, + "vm_loss": 0.1321 + }, + { + "epoch": 1.5970353971653393, + "lm_loss": 0.4843, + "step": 8296, + "vm_loss": 0.2179 + }, + { + "epoch": 1.5970353971653393, + "lm_loss": 0.5797, + "step": 8296, + "vm_loss": 0.1381 + }, + { + "epoch": 1.5970353971653393, + "lm_loss": 1.0527, + "step": 8296, + "vm_loss": 0.2119 + }, + { + "epoch": 1.5970353971653393, + "lm_loss": 0.4596, + "step": 8296, + "vm_loss": 0.1195 + }, + { + "epoch": 1.5970353971653393, + "lm_loss": 0.7037, + "step": 8296, + "vm_loss": 0.171 + }, + { + "epoch": 1.5972279038429145, + "grad_norm": 3.4543530605829105, + "learning_rate": 2.050991548323298e-06, + "loss": 0.9574, + "step": 8297 + }, + { + "epoch": 1.5974204105204899, + "grad_norm": 3.3095564567194815, + "learning_rate": 2.0491001841271073e-06, + "loss": 0.9158, + "step": 8298 + }, + { + "epoch": 1.5976129171980653, + "grad_norm": 3.2974638837382204, + "learning_rate": 2.0472095928601045e-06, + "loss": 0.9056, + "step": 8299 + }, + { + "epoch": 1.5978054238756407, + "grad_norm": 3.2165264009311136, + "learning_rate": 2.0453197747060728e-06, + "loss": 0.8627, + "step": 8300 + }, + { + "epoch": 1.5979979305532162, + "grad_norm": 3.3553934836972297, + "learning_rate": 2.043430729848729e-06, + "loss": 0.9395, + "step": 8301 + }, + { + "epoch": 1.5981904372307913, + "grad_norm": 3.3075297979522795, + "learning_rate": 2.0415424584717113e-06, + "loss": 0.9105, + "step": 8302 + }, + { + "epoch": 1.5983829439083668, + "grad_norm": 3.270031232767467, + "learning_rate": 2.039654960758588e-06, + "loss": 0.9408, + "step": 8303 + }, + { + "epoch": 1.5985754505859422, + "grad_norm": 3.2969076462985982, + "learning_rate": 2.0377682368928407e-06, + "loss": 0.8967, + "step": 8304 + }, + { + "epoch": 1.5985754505859422, + "lm_loss": 0.6728, + "step": 8304, + "vm_loss": 0.1118 + }, + { + "epoch": 1.5985754505859422, + "lm_loss": 0.9064, + "step": 8304, + "vm_loss": 0.1144 + }, + { + "epoch": 1.5985754505859422, + "lm_loss": 0.7602, + "step": 8304, + "vm_loss": 0.2321 + }, + { + "epoch": 1.5985754505859422, + "lm_loss": 0.8914, + "step": 8304, + "vm_loss": 0.1101 + }, + { + "epoch": 1.5985754505859422, + "lm_loss": 0.6725, + "step": 8304, + "vm_loss": 0.1353 + }, + { + "epoch": 1.5985754505859422, + "lm_loss": 0.5932, + "step": 8304, + "vm_loss": 0.1575 + }, + { + "epoch": 1.5985754505859422, + "lm_loss": 0.8768, + "step": 8304, + "vm_loss": 0.1597 + }, + { + "epoch": 1.5985754505859422, + "lm_loss": 1.052, + "step": 8304, + "vm_loss": 0.1834 + }, + { + "epoch": 1.5987679572635176, + "grad_norm": 3.184496176439511, + "learning_rate": 2.0358822870578922e-06, + "loss": 0.8853, + "step": 8305 + }, + { + "epoch": 1.598960463941093, + "grad_norm": 3.2710727618572597, + "learning_rate": 2.033997111437074e-06, + "loss": 0.9099, + "step": 8306 + }, + { + "epoch": 1.5991529706186682, + "grad_norm": 3.2971407534972306, + "learning_rate": 2.0321127102136528e-06, + "loss": 0.9123, + "step": 8307 + }, + { + "epoch": 1.5993454772962439, + "grad_norm": 3.277469259024938, + "learning_rate": 2.0302290835708115e-06, + "loss": 0.92, + "step": 8308 + }, + { + "epoch": 1.599537983973819, + "grad_norm": 3.276318087830748, + "learning_rate": 2.0283462316916712e-06, + "loss": 0.9024, + "step": 8309 + }, + { + "epoch": 1.5997304906513945, + "grad_norm": 3.3002482656381487, + "learning_rate": 2.026464154759261e-06, + "loss": 0.9017, + "step": 8310 + }, + { + "epoch": 1.59992299732897, + "grad_norm": 3.0843286695321335, + "learning_rate": 2.024582852956547e-06, + "loss": 0.8882, + "step": 8311 + }, + { + "epoch": 1.6001155040065451, + "grad_norm": 3.2784101583412006, + "learning_rate": 2.022702326466416e-06, + "loss": 0.9017, + "step": 8312 + }, + { + "epoch": 1.6001155040065451, + "lm_loss": 1.2306, + "step": 8312, + "vm_loss": 0.1286 + }, + { + "epoch": 1.6001155040065451, + "lm_loss": 0.6293, + "step": 8312, + "vm_loss": 0.1713 + }, + { + "epoch": 1.6001155040065451, + "lm_loss": 0.7233, + "step": 8312, + "vm_loss": 0.2081 + }, + { + "epoch": 1.6001155040065451, + "lm_loss": 0.5911, + "step": 8312, + "vm_loss": 0.2123 + }, + { + "epoch": 1.6001155040065451, + "lm_loss": 0.7699, + "step": 8312, + "vm_loss": 0.1922 + }, + { + "epoch": 1.6001155040065451, + "lm_loss": 0.9302, + "step": 8312, + "vm_loss": 0.1195 + }, + { + "epoch": 1.6001155040065451, + "lm_loss": 0.4482, + "step": 8312, + "vm_loss": 0.1618 + }, + { + "epoch": 1.6001155040065451, + "lm_loss": 0.662, + "step": 8312, + "vm_loss": 0.1162 + }, + { + "epoch": 1.6003080106841208, + "grad_norm": 3.3629956586206133, + "learning_rate": 2.020822575471677e-06, + "loss": 0.951, + "step": 8313 + }, + { + "epoch": 1.600500517361696, + "grad_norm": 3.2308917588566586, + "learning_rate": 2.0189436001550668e-06, + "loss": 0.9026, + "step": 8314 + }, + { + "epoch": 1.6006930240392714, + "grad_norm": 3.1706581743766025, + "learning_rate": 2.0170654006992473e-06, + "loss": 0.926, + "step": 8315 + }, + { + "epoch": 1.6008855307168468, + "grad_norm": 3.223117217203848, + "learning_rate": 2.015187977286804e-06, + "loss": 0.9471, + "step": 8316 + }, + { + "epoch": 1.601078037394422, + "grad_norm": 3.150625318917613, + "learning_rate": 2.0133113301002426e-06, + "loss": 0.8854, + "step": 8317 + }, + { + "epoch": 1.6012705440719976, + "grad_norm": 3.1967382143421372, + "learning_rate": 2.0114354593220007e-06, + "loss": 0.8676, + "step": 8318 + }, + { + "epoch": 1.6014630507495728, + "grad_norm": 3.2693629092763588, + "learning_rate": 2.009560365134434e-06, + "loss": 0.9057, + "step": 8319 + }, + { + "epoch": 1.6016555574271483, + "grad_norm": 3.1873409554459875, + "learning_rate": 2.007686047719831e-06, + "loss": 0.8742, + "step": 8320 + }, + { + "epoch": 1.6016555574271483, + "lm_loss": 0.347, + "step": 8320, + "vm_loss": 0.1858 + }, + { + "epoch": 1.6016555574271483, + "lm_loss": 0.7822, + "step": 8320, + "vm_loss": 0.1943 + }, + { + "epoch": 1.6016555574271483, + "lm_loss": 0.8968, + "step": 8320, + "vm_loss": 0.1343 + }, + { + "epoch": 1.6016555574271483, + "lm_loss": 0.4761, + "step": 8320, + "vm_loss": 0.1102 + }, + { + "epoch": 1.6016555574271483, + "lm_loss": 0.8188, + "step": 8320, + "vm_loss": 0.1602 + }, + { + "epoch": 1.6016555574271483, + "lm_loss": 0.586, + "step": 8320, + "vm_loss": 0.1647 + }, + { + "epoch": 1.6016555574271483, + "lm_loss": 0.7331, + "step": 8320, + "vm_loss": 0.1617 + }, + { + "epoch": 1.6016555574271483, + "lm_loss": 0.7576, + "step": 8320, + "vm_loss": 0.1995 + }, + { + "epoch": 1.6018480641047237, + "grad_norm": 3.228129237529959, + "learning_rate": 2.0058125072603907e-06, + "loss": 0.8586, + "step": 8321 + }, + { + "epoch": 1.602040570782299, + "grad_norm": 3.137441253552322, + "learning_rate": 2.0039397439382544e-06, + "loss": 0.868, + "step": 8322 + }, + { + "epoch": 1.6022330774598745, + "grad_norm": 3.3876991628001774, + "learning_rate": 2.002067757935473e-06, + "loss": 0.9317, + "step": 8323 + }, + { + "epoch": 1.6024255841374497, + "grad_norm": 3.386745238087488, + "learning_rate": 2.0001965494340303e-06, + "loss": 0.9348, + "step": 8324 + }, + { + "epoch": 1.6026180908150252, + "grad_norm": 3.175238453048385, + "learning_rate": 1.9983261186158265e-06, + "loss": 0.8481, + "step": 8325 + }, + { + "epoch": 1.6028105974926006, + "grad_norm": 3.2653911147651695, + "learning_rate": 1.9964564656626993e-06, + "loss": 0.8898, + "step": 8326 + }, + { + "epoch": 1.6030031041701758, + "grad_norm": 3.3461166423031448, + "learning_rate": 1.994587590756397e-06, + "loss": 0.9328, + "step": 8327 + }, + { + "epoch": 1.6031956108477514, + "grad_norm": 3.334832116830459, + "learning_rate": 1.9927194940785998e-06, + "loss": 0.9333, + "step": 8328 + }, + { + "epoch": 1.6031956108477514, + "lm_loss": 0.6449, + "step": 8328, + "vm_loss": 0.1483 + }, + { + "epoch": 1.6031956108477514, + "lm_loss": 1.3609, + "step": 8328, + "vm_loss": 0.2348 + }, + { + "epoch": 1.6031956108477514, + "lm_loss": 0.5589, + "step": 8328, + "vm_loss": 0.1643 + }, + { + "epoch": 1.6031956108477514, + "lm_loss": 0.4405, + "step": 8328, + "vm_loss": 0.1855 + }, + { + "epoch": 1.6031956108477514, + "lm_loss": 0.4061, + "step": 8328, + "vm_loss": 0.2258 + }, + { + "epoch": 1.6031956108477514, + "lm_loss": 1.1004, + "step": 8328, + "vm_loss": 0.1945 + }, + { + "epoch": 1.6031956108477514, + "lm_loss": 0.6224, + "step": 8328, + "vm_loss": 0.107 + }, + { + "epoch": 1.6031956108477514, + "lm_loss": 0.5236, + "step": 8328, + "vm_loss": 0.1303 + }, + { + "epoch": 1.6033881175253266, + "grad_norm": 3.29165390533546, + "learning_rate": 1.9908521758109114e-06, + "loss": 0.8882, + "step": 8329 + }, + { + "epoch": 1.603580624202902, + "grad_norm": 3.228774750231894, + "learning_rate": 1.9889856361348603e-06, + "loss": 0.9116, + "step": 8330 + }, + { + "epoch": 1.6037731308804775, + "grad_norm": 3.2595149286983705, + "learning_rate": 1.987119875231891e-06, + "loss": 0.8965, + "step": 8331 + }, + { + "epoch": 1.6039656375580527, + "grad_norm": 3.2169430903579066, + "learning_rate": 1.9852548932833893e-06, + "loss": 0.8845, + "step": 8332 + }, + { + "epoch": 1.6041581442356283, + "grad_norm": 3.3134092040973497, + "learning_rate": 1.983390690470649e-06, + "loss": 0.8902, + "step": 8333 + }, + { + "epoch": 1.6043506509132035, + "grad_norm": 3.3056309612807455, + "learning_rate": 1.981527266974895e-06, + "loss": 0.8967, + "step": 8334 + }, + { + "epoch": 1.604543157590779, + "grad_norm": 3.3964609959525447, + "learning_rate": 1.9796646229772775e-06, + "loss": 0.9804, + "step": 8335 + }, + { + "epoch": 1.6047356642683543, + "grad_norm": 3.231816830687224, + "learning_rate": 1.977802758658869e-06, + "loss": 0.8844, + "step": 8336 + }, + { + "epoch": 1.6047356642683543, + "lm_loss": 0.4029, + "step": 8336, + "vm_loss": 0.1754 + }, + { + "epoch": 1.6047356642683543, + "lm_loss": 0.599, + "step": 8336, + "vm_loss": 0.1532 + }, + { + "epoch": 1.6047356642683543, + "lm_loss": 0.4715, + "step": 8336, + "vm_loss": 0.1185 + }, + { + "epoch": 1.6047356642683543, + "lm_loss": 1.0251, + "step": 8336, + "vm_loss": 0.1797 + }, + { + "epoch": 1.6047356642683543, + "lm_loss": 0.8673, + "step": 8336, + "vm_loss": 0.107 + }, + { + "epoch": 1.6047356642683543, + "lm_loss": 0.4617, + "step": 8336, + "vm_loss": 0.1268 + }, + { + "epoch": 1.6047356642683543, + "lm_loss": 0.7612, + "step": 8336, + "vm_loss": 0.2686 + }, + { + "epoch": 1.6047356642683543, + "lm_loss": 1.037, + "step": 8336, + "vm_loss": 0.1366 + }, + { + "epoch": 1.6049281709459295, + "grad_norm": 3.1876850179137777, + "learning_rate": 1.975941674200668e-06, + "loss": 0.8957, + "step": 8337 + }, + { + "epoch": 1.6051206776235052, + "grad_norm": 3.2441251256188663, + "learning_rate": 1.9740813697835914e-06, + "loss": 0.8945, + "step": 8338 + }, + { + "epoch": 1.6053131843010804, + "grad_norm": 3.256935183580076, + "learning_rate": 1.9722218455884866e-06, + "loss": 0.9107, + "step": 8339 + }, + { + "epoch": 1.6055056909786558, + "grad_norm": 3.420023418609238, + "learning_rate": 1.9703631017961243e-06, + "loss": 0.955, + "step": 8340 + }, + { + "epoch": 1.6056981976562312, + "grad_norm": 3.12517737846043, + "learning_rate": 1.968505138587201e-06, + "loss": 0.859, + "step": 8341 + }, + { + "epoch": 1.6058907043338064, + "grad_norm": 3.1094554742084193, + "learning_rate": 1.9666479561423247e-06, + "loss": 0.8861, + "step": 8342 + }, + { + "epoch": 1.606083211011382, + "grad_norm": 3.3213613728077442, + "learning_rate": 1.964791554642049e-06, + "loss": 0.9522, + "step": 8343 + }, + { + "epoch": 1.6062757176889573, + "grad_norm": 3.2132899690243306, + "learning_rate": 1.962935934266833e-06, + "loss": 0.8621, + "step": 8344 + }, + { + "epoch": 1.6062757176889573, + "lm_loss": 0.7518, + "step": 8344, + "vm_loss": 0.1974 + }, + { + "epoch": 1.6062757176889573, + "lm_loss": 0.86, + "step": 8344, + "vm_loss": 0.1958 + }, + { + "epoch": 1.6062757176889573, + "lm_loss": 0.7898, + "step": 8344, + "vm_loss": 0.1661 + }, + { + "epoch": 1.6062757176889573, + "lm_loss": 0.6418, + "step": 8344, + "vm_loss": 0.2243 + }, + { + "epoch": 1.6062757176889573, + "lm_loss": 0.7294, + "step": 8344, + "vm_loss": 0.1689 + }, + { + "epoch": 1.6062757176889573, + "lm_loss": 0.9021, + "step": 8344, + "vm_loss": 0.2092 + }, + { + "epoch": 1.6062757176889573, + "lm_loss": 0.9843, + "step": 8344, + "vm_loss": 0.1731 + }, + { + "epoch": 1.6062757176889573, + "lm_loss": 0.5909, + "step": 8344, + "vm_loss": 0.1206 + }, + { + "epoch": 1.6064682243665327, + "grad_norm": 3.226243287061956, + "learning_rate": 1.961081095197067e-06, + "loss": 0.9073, + "step": 8345 + }, + { + "epoch": 1.6066607310441081, + "grad_norm": 3.2271172538523634, + "learning_rate": 1.959227037613067e-06, + "loss": 0.8824, + "step": 8346 + }, + { + "epoch": 1.6068532377216833, + "grad_norm": 3.239449176094794, + "learning_rate": 1.957373761695073e-06, + "loss": 0.9052, + "step": 8347 + }, + { + "epoch": 1.607045744399259, + "grad_norm": 3.211515576315902, + "learning_rate": 1.9555212676232396e-06, + "loss": 0.8627, + "step": 8348 + }, + { + "epoch": 1.6072382510768342, + "grad_norm": 3.1530427504099037, + "learning_rate": 1.953669555577663e-06, + "loss": 0.8782, + "step": 8349 + }, + { + "epoch": 1.6074307577544096, + "grad_norm": 3.0249201247538653, + "learning_rate": 1.9518186257383453e-06, + "loss": 0.8474, + "step": 8350 + }, + { + "epoch": 1.607623264431985, + "grad_norm": 3.1234078016556253, + "learning_rate": 1.949968478285226e-06, + "loss": 0.8673, + "step": 8351 + }, + { + "epoch": 1.6078157711095602, + "grad_norm": 3.257557159383585, + "learning_rate": 1.9481191133981557e-06, + "loss": 0.9131, + "step": 8352 + }, + { + "epoch": 1.6078157711095602, + "lm_loss": 0.7146, + "step": 8352, + "vm_loss": 0.1659 + }, + { + "epoch": 1.6078157711095602, + "lm_loss": 0.4001, + "step": 8352, + "vm_loss": 0.1649 + }, + { + "epoch": 1.6078157711095602, + "lm_loss": 0.9887, + "step": 8352, + "vm_loss": 0.1443 + }, + { + "epoch": 1.6078157711095602, + "lm_loss": 0.4478, + "step": 8352, + "vm_loss": 0.1821 + }, + { + "epoch": 1.6078157711095602, + "lm_loss": 0.8284, + "step": 8352, + "vm_loss": 0.2053 + }, + { + "epoch": 1.6078157711095602, + "lm_loss": 0.5847, + "step": 8352, + "vm_loss": 0.1995 + }, + { + "epoch": 1.6078157711095602, + "lm_loss": 1.2224, + "step": 8352, + "vm_loss": 0.1842 + }, + { + "epoch": 1.6078157711095602, + "lm_loss": 0.8029, + "step": 8352, + "vm_loss": 0.1436 + }, + { + "epoch": 1.6080082777871358, + "grad_norm": 3.3511706936007744, + "learning_rate": 1.9462705312569265e-06, + "loss": 0.9353, + "step": 8353 + }, + { + "epoch": 1.608200784464711, + "grad_norm": 3.3495207489580316, + "learning_rate": 1.9444227320412355e-06, + "loss": 0.9749, + "step": 8354 + }, + { + "epoch": 1.6083932911422865, + "grad_norm": 3.4993689114699675, + "learning_rate": 1.9425757159307168e-06, + "loss": 0.9569, + "step": 8355 + }, + { + "epoch": 1.6085857978198619, + "grad_norm": 3.283540130190931, + "learning_rate": 1.9407294831049226e-06, + "loss": 0.9036, + "step": 8356 + }, + { + "epoch": 1.6087783044974373, + "grad_norm": 3.3853501016977914, + "learning_rate": 1.9388840337433313e-06, + "loss": 0.9441, + "step": 8357 + }, + { + "epoch": 1.6089708111750127, + "grad_norm": 3.2853629931186488, + "learning_rate": 1.937039368025346e-06, + "loss": 0.8918, + "step": 8358 + }, + { + "epoch": 1.609163317852588, + "grad_norm": 3.1723898464013534, + "learning_rate": 1.9351954861302847e-06, + "loss": 0.8858, + "step": 8359 + }, + { + "epoch": 1.6093558245301633, + "grad_norm": 3.252540386245679, + "learning_rate": 1.933352388237406e-06, + "loss": 0.888, + "step": 8360 + }, + { + "epoch": 1.6093558245301633, + "lm_loss": 0.8955, + "step": 8360, + "vm_loss": 0.1478 + }, + { + "epoch": 1.6093558245301633, + "lm_loss": 0.5522, + "step": 8360, + "vm_loss": 0.1398 + }, + { + "epoch": 1.6093558245301633, + "lm_loss": 1.2706, + "step": 8360, + "vm_loss": 0.1578 + }, + { + "epoch": 1.6093558245301633, + "lm_loss": 0.7644, + "step": 8360, + "vm_loss": 0.1426 + }, + { + "epoch": 1.6093558245301633, + "lm_loss": 0.4609, + "step": 8360, + "vm_loss": 0.1782 + }, + { + "epoch": 1.6093558245301633, + "lm_loss": 0.5524, + "step": 8360, + "vm_loss": 0.1501 + }, + { + "epoch": 1.6093558245301633, + "lm_loss": 0.7427, + "step": 8360, + "vm_loss": 0.1654 + }, + { + "epoch": 1.6093558245301633, + "lm_loss": 0.7653, + "step": 8360, + "vm_loss": 0.1654 + }, + { + "epoch": 1.6095483312077388, + "grad_norm": 3.187836269541993, + "learning_rate": 1.931510074525874e-06, + "loss": 0.8615, + "step": 8361 + }, + { + "epoch": 1.6097408378853142, + "grad_norm": 3.1978676330634546, + "learning_rate": 1.929668545174791e-06, + "loss": 0.8902, + "step": 8362 + }, + { + "epoch": 1.6099333445628896, + "grad_norm": 3.086626183351094, + "learning_rate": 1.9278278003631735e-06, + "loss": 0.8808, + "step": 8363 + }, + { + "epoch": 1.6101258512404648, + "grad_norm": 3.186101429427331, + "learning_rate": 1.9259878402699704e-06, + "loss": 0.894, + "step": 8364 + }, + { + "epoch": 1.6103183579180402, + "grad_norm": 3.098971139575795, + "learning_rate": 1.9241486650740406e-06, + "loss": 0.8355, + "step": 8365 + }, + { + "epoch": 1.6105108645956157, + "grad_norm": 3.193924022165544, + "learning_rate": 1.9223102749541856e-06, + "loss": 0.8806, + "step": 8366 + }, + { + "epoch": 1.610703371273191, + "grad_norm": 3.232335134679844, + "learning_rate": 1.920472670089114e-06, + "loss": 0.882, + "step": 8367 + }, + { + "epoch": 1.6108958779507665, + "grad_norm": 3.4555498922303887, + "learning_rate": 1.9186358506574686e-06, + "loss": 0.9542, + "step": 8368 + }, + { + "epoch": 1.6108958779507665, + "lm_loss": 0.6641, + "step": 8368, + "vm_loss": 0.1644 + }, + { + "epoch": 1.6108958779507665, + "lm_loss": 0.9509, + "step": 8368, + "vm_loss": 0.1704 + }, + { + "epoch": 1.6108958779507665, + "lm_loss": 1.4275, + "step": 8368, + "vm_loss": 0.2062 + }, + { + "epoch": 1.6108958779507665, + "lm_loss": 1.1082, + "step": 8368, + "vm_loss": 0.1883 + }, + { + "epoch": 1.6108958779507665, + "lm_loss": 1.3913, + "step": 8368, + "vm_loss": 0.219 + }, + { + "epoch": 1.6108958779507665, + "lm_loss": 1.4924, + "step": 8368, + "vm_loss": 0.1558 + }, + { + "epoch": 1.6108958779507665, + "lm_loss": 0.8781, + "step": 8368, + "vm_loss": 0.1685 + }, + { + "epoch": 1.6108958779507665, + "lm_loss": 0.4511, + "step": 8368, + "vm_loss": 0.1526 + }, + { + "epoch": 1.6110883846283417, + "grad_norm": 3.4808297222428286, + "learning_rate": 1.9167998168378034e-06, + "loss": 0.9614, + "step": 8369 + }, + { + "epoch": 1.6112808913059173, + "grad_norm": 3.322109682549969, + "learning_rate": 1.914964568808616e-06, + "loss": 0.9031, + "step": 8370 + }, + { + "epoch": 1.6114733979834925, + "grad_norm": 3.3946513457557725, + "learning_rate": 1.9131301067483075e-06, + "loss": 0.895, + "step": 8371 + }, + { + "epoch": 1.611665904661068, + "grad_norm": 3.244419771898723, + "learning_rate": 1.9112964308352145e-06, + "loss": 0.8263, + "step": 8372 + }, + { + "epoch": 1.6118584113386434, + "grad_norm": 3.273995896696473, + "learning_rate": 1.9094635412475927e-06, + "loss": 0.9115, + "step": 8373 + }, + { + "epoch": 1.6120509180162186, + "grad_norm": 3.128102081238093, + "learning_rate": 1.907631438163626e-06, + "loss": 0.8408, + "step": 8374 + }, + { + "epoch": 1.6122434246937942, + "grad_norm": 3.126744850391365, + "learning_rate": 1.9058001217614118e-06, + "loss": 0.8546, + "step": 8375 + }, + { + "epoch": 1.6124359313713694, + "grad_norm": 3.2844812113609563, + "learning_rate": 1.9039695922189817e-06, + "loss": 0.9252, + "step": 8376 + }, + { + "epoch": 1.6124359313713694, + "lm_loss": 0.8637, + "step": 8376, + "vm_loss": 0.1621 + }, + { + "epoch": 1.6124359313713694, + "lm_loss": 0.5745, + "step": 8376, + "vm_loss": 0.155 + }, + { + "epoch": 1.6124359313713694, + "lm_loss": 0.6015, + "step": 8376, + "vm_loss": 0.1822 + }, + { + "epoch": 1.6124359313713694, + "lm_loss": 0.5047, + "step": 8376, + "vm_loss": 0.2012 + }, + { + "epoch": 1.6124359313713694, + "lm_loss": 0.4382, + "step": 8376, + "vm_loss": 0.2181 + }, + { + "epoch": 1.6124359313713694, + "lm_loss": 1.0475, + "step": 8376, + "vm_loss": 0.2588 + }, + { + "epoch": 1.6124359313713694, + "lm_loss": 0.5956, + "step": 8376, + "vm_loss": 0.1625 + }, + { + "epoch": 1.6124359313713694, + "lm_loss": 0.6175, + "step": 8376, + "vm_loss": 0.122 + }, + { + "epoch": 1.6126284380489448, + "grad_norm": 3.0535297509297497, + "learning_rate": 1.902139849714285e-06, + "loss": 0.8695, + "step": 8377 + }, + { + "epoch": 1.6128209447265203, + "grad_norm": 3.282675394589569, + "learning_rate": 1.900310894425197e-06, + "loss": 0.925, + "step": 8378 + }, + { + "epoch": 1.6130134514040955, + "grad_norm": 3.15395348936995, + "learning_rate": 1.8984827265295147e-06, + "loss": 0.8882, + "step": 8379 + }, + { + "epoch": 1.613205958081671, + "grad_norm": 3.2341105730541524, + "learning_rate": 1.8966553462049608e-06, + "loss": 0.8913, + "step": 8380 + }, + { + "epoch": 1.6133984647592463, + "grad_norm": 3.395267805528291, + "learning_rate": 1.894828753629182e-06, + "loss": 0.9779, + "step": 8381 + }, + { + "epoch": 1.6135909714368217, + "grad_norm": 3.2149550462320766, + "learning_rate": 1.8930029489797408e-06, + "loss": 0.8954, + "step": 8382 + }, + { + "epoch": 1.6137834781143972, + "grad_norm": 3.1458913615442303, + "learning_rate": 1.8911779324341317e-06, + "loss": 0.893, + "step": 8383 + }, + { + "epoch": 1.6139759847919724, + "grad_norm": 3.4440901373487276, + "learning_rate": 1.8893537041697707e-06, + "loss": 0.9452, + "step": 8384 + }, + { + "epoch": 1.6139759847919724, + "lm_loss": 0.9268, + "step": 8384, + "vm_loss": 0.1072 + }, + { + "epoch": 1.6139759847919724, + "lm_loss": 0.726, + "step": 8384, + "vm_loss": 0.1993 + }, + { + "epoch": 1.6139759847919724, + "lm_loss": 0.7835, + "step": 8384, + "vm_loss": 0.2232 + }, + { + "epoch": 1.6139759847919724, + "lm_loss": 0.6222, + "step": 8384, + "vm_loss": 0.129 + }, + { + "epoch": 1.6139759847919724, + "lm_loss": 0.3847, + "step": 8384, + "vm_loss": 0.1694 + }, + { + "epoch": 1.6139759847919724, + "lm_loss": 0.8808, + "step": 8384, + "vm_loss": 0.1399 + }, + { + "epoch": 1.6139759847919724, + "lm_loss": 0.9483, + "step": 8384, + "vm_loss": 0.2061 + }, + { + "epoch": 1.6139759847919724, + "lm_loss": 0.4876, + "step": 8384, + "vm_loss": 0.1935 + }, + { + "epoch": 1.614168491469548, + "grad_norm": 3.364944851463711, + "learning_rate": 1.887530264363996e-06, + "loss": 0.9012, + "step": 8385 + }, + { + "epoch": 1.6143609981471232, + "grad_norm": 3.216208698112367, + "learning_rate": 1.8857076131940643e-06, + "loss": 0.8319, + "step": 8386 + }, + { + "epoch": 1.6145535048246986, + "grad_norm": 3.388066159595722, + "learning_rate": 1.88388575083717e-06, + "loss": 0.9089, + "step": 8387 + }, + { + "epoch": 1.614746011502274, + "grad_norm": 3.297383004575363, + "learning_rate": 1.8820646774704131e-06, + "loss": 0.8865, + "step": 8388 + }, + { + "epoch": 1.6149385181798492, + "grad_norm": 3.3618466590428953, + "learning_rate": 1.880244393270828e-06, + "loss": 0.9095, + "step": 8389 + }, + { + "epoch": 1.6151310248574249, + "grad_norm": 3.349619236553279, + "learning_rate": 1.8784248984153708e-06, + "loss": 0.903, + "step": 8390 + }, + { + "epoch": 1.615323531535, + "grad_norm": 3.282595188101464, + "learning_rate": 1.876606193080921e-06, + "loss": 0.897, + "step": 8391 + }, + { + "epoch": 1.6155160382125755, + "grad_norm": 3.2375819022347887, + "learning_rate": 1.8747882774442749e-06, + "loss": 0.8822, + "step": 8392 + }, + { + "epoch": 1.6155160382125755, + "lm_loss": 0.7315, + "step": 8392, + "vm_loss": 0.1619 + }, + { + "epoch": 1.6155160382125755, + "lm_loss": 0.5204, + "step": 8392, + "vm_loss": 0.1548 + }, + { + "epoch": 1.6155160382125755, + "lm_loss": 0.4955, + "step": 8392, + "vm_loss": 0.1711 + }, + { + "epoch": 1.6155160382125755, + "lm_loss": 0.7145, + "step": 8392, + "vm_loss": 0.0861 + }, + { + "epoch": 1.6155160382125755, + "lm_loss": 0.7463, + "step": 8392, + "vm_loss": 0.1669 + }, + { + "epoch": 1.6155160382125755, + "lm_loss": 0.8001, + "step": 8392, + "vm_loss": 0.1718 + }, + { + "epoch": 1.6155160382125755, + "lm_loss": 0.7204, + "step": 8392, + "vm_loss": 0.1675 + }, + { + "epoch": 1.6155160382125755, + "lm_loss": 0.6268, + "step": 8392, + "vm_loss": 0.248 + }, + { + "epoch": 1.615708544890151, + "grad_norm": 3.1966809273610783, + "learning_rate": 1.8729711516821603e-06, + "loss": 0.902, + "step": 8393 + }, + { + "epoch": 1.6159010515677261, + "grad_norm": 3.330473567364802, + "learning_rate": 1.8711548159712246e-06, + "loss": 0.9305, + "step": 8394 + }, + { + "epoch": 1.6160935582453018, + "grad_norm": 3.3675340132816243, + "learning_rate": 1.8693392704880409e-06, + "loss": 0.9552, + "step": 8395 + }, + { + "epoch": 1.616286064922877, + "grad_norm": 3.1219829488529887, + "learning_rate": 1.867524515409097e-06, + "loss": 0.8774, + "step": 8396 + }, + { + "epoch": 1.6164785716004524, + "grad_norm": 3.0600472748064145, + "learning_rate": 1.8657105509108208e-06, + "loss": 0.8205, + "step": 8397 + }, + { + "epoch": 1.6166710782780278, + "grad_norm": 3.0703600265301887, + "learning_rate": 1.8638973771695446e-06, + "loss": 0.8581, + "step": 8398 + }, + { + "epoch": 1.616863584955603, + "grad_norm": 3.1689308778930445, + "learning_rate": 1.8620849943615337e-06, + "loss": 0.903, + "step": 8399 + }, + { + "epoch": 1.6170560916331786, + "grad_norm": 3.3200598172334077, + "learning_rate": 1.8602734026629753e-06, + "loss": 0.924, + "step": 8400 + }, + { + "epoch": 1.6170560916331786, + "lm_loss": 0.4931, + "step": 8400, + "vm_loss": 0.1251 + }, + { + "epoch": 1.6170560916331786, + "lm_loss": 0.6529, + "step": 8400, + "vm_loss": 0.2643 + }, + { + "epoch": 1.6170560916331786, + "lm_loss": 0.722, + "step": 8400, + "vm_loss": 0.2154 + }, + { + "epoch": 1.6170560916331786, + "lm_loss": 0.6142, + "step": 8400, + "vm_loss": 0.236 + }, + { + "epoch": 1.6170560916331786, + "lm_loss": 0.7254, + "step": 8400, + "vm_loss": 0.1872 + }, + { + "epoch": 1.6170560916331786, + "lm_loss": 1.0071, + "step": 8400, + "vm_loss": 0.1262 + }, + { + "epoch": 1.6170560916331786, + "lm_loss": 1.5374, + "step": 8400, + "vm_loss": 0.1457 + }, + { + "epoch": 1.6170560916331786, + "lm_loss": 0.5954, + "step": 8400, + "vm_loss": 0.1247 + }, + { + "epoch": 1.6172485983107538, + "grad_norm": 3.158453591593248, + "learning_rate": 1.8584626022499808e-06, + "loss": 0.8749, + "step": 8401 + }, + { + "epoch": 1.6174411049883293, + "grad_norm": 3.2471861623623757, + "learning_rate": 1.8566525932985836e-06, + "loss": 0.913, + "step": 8402 + }, + { + "epoch": 1.6176336116659047, + "grad_norm": 3.0919409473059174, + "learning_rate": 1.854843375984734e-06, + "loss": 0.853, + "step": 8403 + }, + { + "epoch": 1.61782611834348, + "grad_norm": 3.1755453932865247, + "learning_rate": 1.8530349504843193e-06, + "loss": 0.8857, + "step": 8404 + }, + { + "epoch": 1.6180186250210555, + "grad_norm": 3.2251113956096527, + "learning_rate": 1.8512273169731353e-06, + "loss": 0.8854, + "step": 8405 + }, + { + "epoch": 1.6182111316986307, + "grad_norm": 3.2533062859646305, + "learning_rate": 1.8494204756269097e-06, + "loss": 0.8965, + "step": 8406 + }, + { + "epoch": 1.6184036383762062, + "grad_norm": 3.2917705464218963, + "learning_rate": 1.8476144266212903e-06, + "loss": 0.8747, + "step": 8407 + }, + { + "epoch": 1.6185961450537816, + "grad_norm": 3.155665333987961, + "learning_rate": 1.8458091701318504e-06, + "loss": 0.8611, + "step": 8408 + }, + { + "epoch": 1.6185961450537816, + "lm_loss": 0.7215, + "step": 8408, + "vm_loss": 0.2088 + }, + { + "epoch": 1.6185961450537816, + "lm_loss": 0.8097, + "step": 8408, + "vm_loss": 0.1638 + }, + { + "epoch": 1.6185961450537816, + "lm_loss": 0.6905, + "step": 8408, + "vm_loss": 0.2283 + }, + { + "epoch": 1.6185961450537816, + "lm_loss": 0.2811, + "step": 8408, + "vm_loss": 0.2079 + }, + { + "epoch": 1.6185961450537816, + "lm_loss": 0.9875, + "step": 8408, + "vm_loss": 0.1807 + }, + { + "epoch": 1.6185961450537816, + "lm_loss": 0.6309, + "step": 8408, + "vm_loss": 0.1961 + }, + { + "epoch": 1.6185961450537816, + "lm_loss": 0.5143, + "step": 8408, + "vm_loss": 0.1262 + }, + { + "epoch": 1.6185961450537816, + "lm_loss": 0.7739, + "step": 8408, + "vm_loss": 0.133 + }, + { + "epoch": 1.6187886517313568, + "grad_norm": 3.279446183978866, + "learning_rate": 1.8440047063340793e-06, + "loss": 0.9392, + "step": 8409 + }, + { + "epoch": 1.6189811584089324, + "grad_norm": 3.3061220484966984, + "learning_rate": 1.8422010354033958e-06, + "loss": 0.8998, + "step": 8410 + }, + { + "epoch": 1.6191736650865076, + "grad_norm": 3.289514214265429, + "learning_rate": 1.840398157515142e-06, + "loss": 0.9142, + "step": 8411 + }, + { + "epoch": 1.619366171764083, + "grad_norm": 3.223875227373888, + "learning_rate": 1.8385960728445795e-06, + "loss": 0.8899, + "step": 8412 + }, + { + "epoch": 1.6195586784416585, + "grad_norm": 3.414341773206159, + "learning_rate": 1.8367947815668897e-06, + "loss": 0.9362, + "step": 8413 + }, + { + "epoch": 1.6197511851192337, + "grad_norm": 3.321590689251058, + "learning_rate": 1.8349942838571898e-06, + "loss": 0.8785, + "step": 8414 + }, + { + "epoch": 1.6199436917968093, + "grad_norm": 3.302479337004412, + "learning_rate": 1.8331945798905048e-06, + "loss": 0.9395, + "step": 8415 + }, + { + "epoch": 1.6201361984743845, + "grad_norm": 3.2153763912663464, + "learning_rate": 1.831395669841789e-06, + "loss": 0.8722, + "step": 8416 + }, + { + "epoch": 1.6201361984743845, + "lm_loss": 0.8092, + "step": 8416, + "vm_loss": 0.117 + }, + { + "epoch": 1.6201361984743845, + "lm_loss": 0.7421, + "step": 8416, + "vm_loss": 0.1271 + }, + { + "epoch": 1.6201361984743845, + "lm_loss": 1.2135, + "step": 8416, + "vm_loss": 0.1081 + }, + { + "epoch": 1.6201361984743845, + "lm_loss": 0.8927, + "step": 8416, + "vm_loss": 0.2702 + }, + { + "epoch": 1.6201361984743845, + "lm_loss": 0.4406, + "step": 8416, + "vm_loss": 0.138 + }, + { + "epoch": 1.6201361984743845, + "lm_loss": 0.4905, + "step": 8416, + "vm_loss": 0.1828 + }, + { + "epoch": 1.6201361984743845, + "lm_loss": 0.6884, + "step": 8416, + "vm_loss": 0.1309 + }, + { + "epoch": 1.6201361984743845, + "lm_loss": 0.5488, + "step": 8416, + "vm_loss": 0.1282 + }, + { + "epoch": 1.62032870515196, + "grad_norm": 3.249231698173159, + "learning_rate": 1.8295975538859223e-06, + "loss": 0.8917, + "step": 8417 + }, + { + "epoch": 1.6205212118295353, + "grad_norm": 3.316148652215696, + "learning_rate": 1.827800232197705e-06, + "loss": 0.9102, + "step": 8418 + }, + { + "epoch": 1.6207137185071108, + "grad_norm": 3.1653376179242865, + "learning_rate": 1.8260037049518565e-06, + "loss": 0.8825, + "step": 8419 + }, + { + "epoch": 1.6209062251846862, + "grad_norm": 3.2845834132360365, + "learning_rate": 1.8242079723230232e-06, + "loss": 0.8823, + "step": 8420 + }, + { + "epoch": 1.6210987318622614, + "grad_norm": 3.151334676801339, + "learning_rate": 1.8224130344857738e-06, + "loss": 0.865, + "step": 8421 + }, + { + "epoch": 1.6212912385398368, + "grad_norm": 3.250751693760326, + "learning_rate": 1.8206188916145995e-06, + "loss": 0.8975, + "step": 8422 + }, + { + "epoch": 1.6214837452174122, + "grad_norm": 3.3547001182476883, + "learning_rate": 1.818825543883914e-06, + "loss": 0.9069, + "step": 8423 + }, + { + "epoch": 1.6216762518949877, + "grad_norm": 3.2711974117197395, + "learning_rate": 1.8170329914680529e-06, + "loss": 0.8804, + "step": 8424 + }, + { + "epoch": 1.6216762518949877, + "lm_loss": 0.6618, + "step": 8424, + "vm_loss": 0.179 + }, + { + "epoch": 1.6216762518949877, + "lm_loss": 0.8496, + "step": 8424, + "vm_loss": 0.1864 + }, + { + "epoch": 1.6216762518949877, + "lm_loss": 0.6816, + "step": 8424, + "vm_loss": 0.1305 + }, + { + "epoch": 1.6216762518949877, + "lm_loss": 0.4988, + "step": 8424, + "vm_loss": 0.2584 + }, + { + "epoch": 1.6216762518949877, + "lm_loss": 0.4379, + "step": 8424, + "vm_loss": 0.1662 + }, + { + "epoch": 1.6216762518949877, + "lm_loss": 0.7708, + "step": 8424, + "vm_loss": 0.3248 + }, + { + "epoch": 1.6216762518949877, + "lm_loss": 0.5707, + "step": 8424, + "vm_loss": 0.1851 + }, + { + "epoch": 1.6216762518949877, + "lm_loss": 0.5124, + "step": 8424, + "vm_loss": 0.1298 + }, + { + "epoch": 1.621868758572563, + "grad_norm": 3.436717805763262, + "learning_rate": 1.8152412345412785e-06, + "loss": 0.9877, + "step": 8425 + }, + { + "epoch": 1.6220612652501383, + "grad_norm": 3.2891167612215613, + "learning_rate": 1.813450273277767e-06, + "loss": 0.9038, + "step": 8426 + }, + { + "epoch": 1.6222537719277137, + "grad_norm": 3.0887591458038672, + "learning_rate": 1.811660107851626e-06, + "loss": 0.842, + "step": 8427 + }, + { + "epoch": 1.6224462786052891, + "grad_norm": 3.1012407371831148, + "learning_rate": 1.8098707384368818e-06, + "loss": 0.8232, + "step": 8428 + }, + { + "epoch": 1.6226387852828645, + "grad_norm": 3.1088858892259883, + "learning_rate": 1.8080821652074865e-06, + "loss": 0.8291, + "step": 8429 + }, + { + "epoch": 1.62283129196044, + "grad_norm": 3.317213780404013, + "learning_rate": 1.8062943883373052e-06, + "loss": 0.8629, + "step": 8430 + }, + { + "epoch": 1.6230237986380152, + "grad_norm": 3.402243252034283, + "learning_rate": 1.8045074080001423e-06, + "loss": 0.9125, + "step": 8431 + }, + { + "epoch": 1.6232163053155906, + "grad_norm": 3.323981140240041, + "learning_rate": 1.8027212243697079e-06, + "loss": 0.9263, + "step": 8432 + }, + { + "epoch": 1.6232163053155906, + "lm_loss": 0.7551, + "step": 8432, + "vm_loss": 0.1317 + }, + { + "epoch": 1.6232163053155906, + "lm_loss": 0.5408, + "step": 8432, + "vm_loss": 0.1065 + }, + { + "epoch": 1.6232163053155906, + "lm_loss": 0.3836, + "step": 8432, + "vm_loss": 0.2074 + }, + { + "epoch": 1.6232163053155906, + "lm_loss": 0.8257, + "step": 8432, + "vm_loss": 0.1731 + }, + { + "epoch": 1.6232163053155906, + "lm_loss": 0.8335, + "step": 8432, + "vm_loss": 0.2275 + }, + { + "epoch": 1.6232163053155906, + "lm_loss": 0.7664, + "step": 8432, + "vm_loss": 0.2516 + }, + { + "epoch": 1.6232163053155906, + "lm_loss": 0.6236, + "step": 8432, + "vm_loss": 0.1496 + }, + { + "epoch": 1.6232163053155906, + "lm_loss": 0.477, + "step": 8432, + "vm_loss": 0.124 + }, + { + "epoch": 1.623408811993166, + "grad_norm": 3.411989164891271, + "learning_rate": 1.8009358376196472e-06, + "loss": 0.9246, + "step": 8433 + }, + { + "epoch": 1.6236013186707414, + "grad_norm": 3.34692872799914, + "learning_rate": 1.799151247923514e-06, + "loss": 0.9258, + "step": 8434 + }, + { + "epoch": 1.6237938253483168, + "grad_norm": 3.198446431654013, + "learning_rate": 1.7973674554548047e-06, + "loss": 0.8479, + "step": 8435 + }, + { + "epoch": 1.623986332025892, + "grad_norm": 3.229715294554512, + "learning_rate": 1.7955844603869176e-06, + "loss": 0.8765, + "step": 8436 + }, + { + "epoch": 1.6241788387034677, + "grad_norm": 3.267660131337464, + "learning_rate": 1.7938022628931872e-06, + "loss": 0.9076, + "step": 8437 + }, + { + "epoch": 1.6243713453810429, + "grad_norm": 3.471213799150422, + "learning_rate": 1.7920208631468638e-06, + "loss": 0.9282, + "step": 8438 + }, + { + "epoch": 1.6245638520586183, + "grad_norm": 3.389314970537656, + "learning_rate": 1.7902402613211234e-06, + "loss": 0.9298, + "step": 8439 + }, + { + "epoch": 1.6247563587361937, + "grad_norm": 3.3980879821869245, + "learning_rate": 1.7884604575890641e-06, + "loss": 0.9302, + "step": 8440 + }, + { + "epoch": 1.6247563587361937, + "lm_loss": 0.7974, + "step": 8440, + "vm_loss": 0.188 + }, + { + "epoch": 1.6247563587361937, + "lm_loss": 0.8428, + "step": 8440, + "vm_loss": 0.1781 + }, + { + "epoch": 1.6247563587361937, + "lm_loss": 0.5434, + "step": 8440, + "vm_loss": 0.1814 + }, + { + "epoch": 1.6247563587361937, + "lm_loss": 1.0095, + "step": 8440, + "vm_loss": 0.1166 + }, + { + "epoch": 1.6247563587361937, + "lm_loss": 0.6346, + "step": 8440, + "vm_loss": 0.1204 + }, + { + "epoch": 1.6247563587361937, + "lm_loss": 1.308, + "step": 8440, + "vm_loss": 0.178 + }, + { + "epoch": 1.6247563587361937, + "lm_loss": 0.856, + "step": 8440, + "vm_loss": 0.1947 + }, + { + "epoch": 1.6247563587361937, + "lm_loss": 0.4346, + "step": 8440, + "vm_loss": 0.1756 + }, + { + "epoch": 1.624948865413769, + "grad_norm": 3.233786626856265, + "learning_rate": 1.7866814521237064e-06, + "loss": 0.919, + "step": 8441 + }, + { + "epoch": 1.6251413720913446, + "grad_norm": 3.1480802322852006, + "learning_rate": 1.7849032450979876e-06, + "loss": 0.8919, + "step": 8442 + }, + { + "epoch": 1.6253338787689198, + "grad_norm": 3.3445500803941233, + "learning_rate": 1.783125836684776e-06, + "loss": 0.9082, + "step": 8443 + }, + { + "epoch": 1.6255263854464952, + "grad_norm": 3.1112929076021, + "learning_rate": 1.7813492270568566e-06, + "loss": 0.8432, + "step": 8444 + }, + { + "epoch": 1.6257188921240706, + "grad_norm": 3.2506330381964625, + "learning_rate": 1.7795734163869405e-06, + "loss": 0.8703, + "step": 8445 + }, + { + "epoch": 1.6259113988016458, + "grad_norm": 3.3433536996165163, + "learning_rate": 1.7777984048476605e-06, + "loss": 0.908, + "step": 8446 + }, + { + "epoch": 1.6261039054792215, + "grad_norm": 3.2306418835900264, + "learning_rate": 1.776024192611563e-06, + "loss": 0.9196, + "step": 8447 + }, + { + "epoch": 1.6262964121567967, + "grad_norm": 3.186214674865702, + "learning_rate": 1.7742507798511345e-06, + "loss": 0.8619, + "step": 8448 + }, + { + "epoch": 1.6262964121567967, + "lm_loss": 0.8864, + "step": 8448, + "vm_loss": 0.1373 + }, + { + "epoch": 1.6262964121567967, + "lm_loss": 0.5078, + "step": 8448, + "vm_loss": 0.1136 + }, + { + "epoch": 1.6262964121567967, + "lm_loss": 0.8447, + "step": 8448, + "vm_loss": 0.1978 + }, + { + "epoch": 1.6262964121567967, + "lm_loss": 0.7178, + "step": 8448, + "vm_loss": 0.1661 + }, + { + "epoch": 1.6262964121567967, + "lm_loss": 0.7485, + "step": 8448, + "vm_loss": 0.1742 + }, + { + "epoch": 1.6262964121567967, + "lm_loss": 0.4119, + "step": 8448, + "vm_loss": 0.1177 + }, + { + "epoch": 1.6262964121567967, + "lm_loss": 0.6146, + "step": 8448, + "vm_loss": 0.2764 + }, + { + "epoch": 1.6262964121567967, + "lm_loss": 0.9143, + "step": 8448, + "vm_loss": 0.1552 + }, + { + "epoch": 1.626488918834372, + "grad_norm": 3.170357594475807, + "learning_rate": 1.7724781667387658e-06, + "loss": 0.8708, + "step": 8449 + }, + { + "epoch": 1.6266814255119475, + "grad_norm": 3.309933029916249, + "learning_rate": 1.7707063534467816e-06, + "loss": 0.9212, + "step": 8450 + }, + { + "epoch": 1.6268739321895227, + "grad_norm": 3.3356447852556803, + "learning_rate": 1.7689353401474197e-06, + "loss": 0.9163, + "step": 8451 + }, + { + "epoch": 1.6270664388670983, + "grad_norm": 3.2911631129879972, + "learning_rate": 1.7671651270128531e-06, + "loss": 0.9489, + "step": 8452 + }, + { + "epoch": 1.6272589455446735, + "grad_norm": 3.194873105069443, + "learning_rate": 1.7653957142151622e-06, + "loss": 0.8553, + "step": 8453 + }, + { + "epoch": 1.627451452222249, + "grad_norm": 3.3279564253509384, + "learning_rate": 1.7636271019263596e-06, + "loss": 0.908, + "step": 8454 + }, + { + "epoch": 1.6276439588998244, + "grad_norm": 3.1447300247087275, + "learning_rate": 1.7618592903183761e-06, + "loss": 0.863, + "step": 8455 + }, + { + "epoch": 1.6278364655773996, + "grad_norm": 3.1006719064477886, + "learning_rate": 1.7600922795630693e-06, + "loss": 0.8685, + "step": 8456 + }, + { + "epoch": 1.6278364655773996, + "lm_loss": 0.8434, + "step": 8456, + "vm_loss": 0.1501 + }, + { + "epoch": 1.6278364655773996, + "lm_loss": 0.591, + "step": 8456, + "vm_loss": 0.1471 + }, + { + "epoch": 1.6278364655773996, + "lm_loss": 0.8466, + "step": 8456, + "vm_loss": 0.1488 + }, + { + "epoch": 1.6278364655773996, + "lm_loss": 0.5618, + "step": 8456, + "vm_loss": 0.1438 + }, + { + "epoch": 1.6278364655773996, + "lm_loss": 0.8751, + "step": 8456, + "vm_loss": 0.1363 + }, + { + "epoch": 1.6278364655773996, + "lm_loss": 0.8954, + "step": 8456, + "vm_loss": 0.1621 + }, + { + "epoch": 1.6278364655773996, + "lm_loss": 0.797, + "step": 8456, + "vm_loss": 0.1427 + }, + { + "epoch": 1.6278364655773996, + "lm_loss": 0.8621, + "step": 8456, + "vm_loss": 0.2175 + }, + { + "epoch": 1.6280289722549752, + "grad_norm": 3.2863412717117297, + "learning_rate": 1.7583260698322069e-06, + "loss": 0.8907, + "step": 8457 + }, + { + "epoch": 1.6282214789325504, + "grad_norm": 3.261475185764014, + "learning_rate": 1.7565606612974983e-06, + "loss": 0.9099, + "step": 8458 + }, + { + "epoch": 1.6284139856101258, + "grad_norm": 3.335473571539051, + "learning_rate": 1.7547960541305542e-06, + "loss": 0.9085, + "step": 8459 + }, + { + "epoch": 1.6286064922877013, + "grad_norm": 3.3044845573058304, + "learning_rate": 1.7530322485029206e-06, + "loss": 0.8913, + "step": 8460 + }, + { + "epoch": 1.6287989989652765, + "grad_norm": 3.2754737833853933, + "learning_rate": 1.7512692445860624e-06, + "loss": 0.8734, + "step": 8461 + }, + { + "epoch": 1.6289915056428521, + "grad_norm": 3.4135468627258514, + "learning_rate": 1.749507042551365e-06, + "loss": 0.9834, + "step": 8462 + }, + { + "epoch": 1.6291840123204273, + "grad_norm": 3.149913399872908, + "learning_rate": 1.747745642570141e-06, + "loss": 0.8771, + "step": 8463 + }, + { + "epoch": 1.6293765189980027, + "grad_norm": 3.2509669728524124, + "learning_rate": 1.745985044813615e-06, + "loss": 0.882, + "step": 8464 + }, + { + "epoch": 1.6293765189980027, + "lm_loss": 0.3118, + "step": 8464, + "vm_loss": 0.2085 + }, + { + "epoch": 1.6293765189980027, + "lm_loss": 0.8766, + "step": 8464, + "vm_loss": 0.1892 + }, + { + "epoch": 1.6293765189980027, + "lm_loss": 0.5112, + "step": 8464, + "vm_loss": 0.0883 + }, + { + "epoch": 1.6293765189980027, + "lm_loss": 0.7389, + "step": 8464, + "vm_loss": 0.1441 + }, + { + "epoch": 1.6293765189980027, + "lm_loss": 0.8028, + "step": 8464, + "vm_loss": 0.1139 + }, + { + "epoch": 1.6293765189980027, + "lm_loss": 0.9244, + "step": 8464, + "vm_loss": 0.1226 + }, + { + "epoch": 1.6293765189980027, + "lm_loss": 0.7022, + "step": 8464, + "vm_loss": 0.1775 + }, + { + "epoch": 1.6293765189980027, + "lm_loss": 0.5815, + "step": 8464, + "vm_loss": 0.1527 + }, + { + "epoch": 1.6295690256755782, + "grad_norm": 3.3816111957347235, + "learning_rate": 1.7442252494529422e-06, + "loss": 0.9279, + "step": 8465 + }, + { + "epoch": 1.6297615323531534, + "grad_norm": 3.3325969122934556, + "learning_rate": 1.7424662566591967e-06, + "loss": 0.8919, + "step": 8466 + }, + { + "epoch": 1.629954039030729, + "grad_norm": 3.172126376671785, + "learning_rate": 1.7407080666033794e-06, + "loss": 0.8643, + "step": 8467 + }, + { + "epoch": 1.6301465457083042, + "grad_norm": 3.2714323919169916, + "learning_rate": 1.7389506794563993e-06, + "loss": 0.8975, + "step": 8468 + }, + { + "epoch": 1.6303390523858796, + "grad_norm": 3.1733953268311983, + "learning_rate": 1.7371940953891087e-06, + "loss": 0.8959, + "step": 8469 + }, + { + "epoch": 1.630531559063455, + "grad_norm": 3.3448553950899598, + "learning_rate": 1.7354383145722608e-06, + "loss": 0.9756, + "step": 8470 + }, + { + "epoch": 1.6307240657410302, + "grad_norm": 3.34341426493702, + "learning_rate": 1.7336833371765437e-06, + "loss": 0.9203, + "step": 8471 + }, + { + "epoch": 1.6309165724186059, + "grad_norm": 3.1229779210848463, + "learning_rate": 1.7319291633725633e-06, + "loss": 0.831, + "step": 8472 + }, + { + "epoch": 1.6309165724186059, + "lm_loss": 0.5602, + "step": 8472, + "vm_loss": 0.226 + }, + { + "epoch": 1.6309165724186059, + "lm_loss": 0.5658, + "step": 8472, + "vm_loss": 0.1131 + }, + { + "epoch": 1.6309165724186059, + "lm_loss": 0.8256, + "step": 8472, + "vm_loss": 0.215 + }, + { + "epoch": 1.6309165724186059, + "lm_loss": 0.6543, + "step": 8472, + "vm_loss": 0.2484 + }, + { + "epoch": 1.6309165724186059, + "lm_loss": 0.6027, + "step": 8472, + "vm_loss": 0.1322 + }, + { + "epoch": 1.6309165724186059, + "lm_loss": 1.0274, + "step": 8472, + "vm_loss": 0.2017 + }, + { + "epoch": 1.6309165724186059, + "lm_loss": 0.4151, + "step": 8472, + "vm_loss": 0.14 + }, + { + "epoch": 1.6309165724186059, + "lm_loss": 0.8648, + "step": 8472, + "vm_loss": 0.2343 + }, + { + "epoch": 1.631109079096181, + "grad_norm": 3.284544127801621, + "learning_rate": 1.7301757933308506e-06, + "loss": 0.9016, + "step": 8473 + }, + { + "epoch": 1.6313015857737565, + "grad_norm": 3.3163125254080814, + "learning_rate": 1.7284232272218503e-06, + "loss": 0.8838, + "step": 8474 + }, + { + "epoch": 1.631494092451332, + "grad_norm": 3.2834695956852467, + "learning_rate": 1.7266714652159354e-06, + "loss": 0.8553, + "step": 8475 + }, + { + "epoch": 1.6316865991289071, + "grad_norm": 3.3235561456293023, + "learning_rate": 1.7249205074834019e-06, + "loss": 0.9274, + "step": 8476 + }, + { + "epoch": 1.6318791058064828, + "grad_norm": 3.1648255902986917, + "learning_rate": 1.7231703541944666e-06, + "loss": 0.8467, + "step": 8477 + }, + { + "epoch": 1.632071612484058, + "grad_norm": 3.1710467917544287, + "learning_rate": 1.7214210055192604e-06, + "loss": 0.9198, + "step": 8478 + }, + { + "epoch": 1.6322641191616334, + "grad_norm": 3.045766966043062, + "learning_rate": 1.7196724616278504e-06, + "loss": 0.8223, + "step": 8479 + }, + { + "epoch": 1.6324566258392088, + "grad_norm": 3.1697415153983646, + "learning_rate": 1.7179247226902107e-06, + "loss": 0.8947, + "step": 8480 + }, + { + "epoch": 1.6324566258392088, + "lm_loss": 0.9598, + "step": 8480, + "vm_loss": 0.0892 + }, + { + "epoch": 1.6324566258392088, + "lm_loss": 0.6499, + "step": 8480, + "vm_loss": 0.1607 + }, + { + "epoch": 1.6324566258392088, + "lm_loss": 1.0453, + "step": 8480, + "vm_loss": 0.2684 + }, + { + "epoch": 1.6324566258392088, + "lm_loss": 0.6136, + "step": 8480, + "vm_loss": 0.1085 + }, + { + "epoch": 1.6324566258392088, + "lm_loss": 0.8492, + "step": 8480, + "vm_loss": 0.1676 + }, + { + "epoch": 1.6324566258392088, + "lm_loss": 0.8723, + "step": 8480, + "vm_loss": 0.1336 + }, + { + "epoch": 1.6324566258392088, + "lm_loss": 0.2933, + "step": 8480, + "vm_loss": 0.1975 + }, + { + "epoch": 1.6324566258392088, + "lm_loss": 0.6891, + "step": 8480, + "vm_loss": 0.1304 + }, + { + "epoch": 1.6326491325167842, + "grad_norm": 3.3418944145619416, + "learning_rate": 1.7161777888762466e-06, + "loss": 0.907, + "step": 8481 + }, + { + "epoch": 1.6328416391943597, + "grad_norm": 3.2800683917061684, + "learning_rate": 1.7144316603557832e-06, + "loss": 0.8738, + "step": 8482 + }, + { + "epoch": 1.6330341458719349, + "grad_norm": 3.169754492854183, + "learning_rate": 1.7126863372985658e-06, + "loss": 0.8656, + "step": 8483 + }, + { + "epoch": 1.6332266525495103, + "grad_norm": 3.3629042542528773, + "learning_rate": 1.7109418198742644e-06, + "loss": 0.9378, + "step": 8484 + }, + { + "epoch": 1.6334191592270857, + "grad_norm": 3.2717371783330926, + "learning_rate": 1.7091981082524633e-06, + "loss": 0.9161, + "step": 8485 + }, + { + "epoch": 1.6336116659046611, + "grad_norm": 3.105697767564221, + "learning_rate": 1.7074552026026759e-06, + "loss": 0.8368, + "step": 8486 + }, + { + "epoch": 1.6338041725822365, + "grad_norm": 3.2565201189663266, + "learning_rate": 1.7057131030943364e-06, + "loss": 0.8834, + "step": 8487 + }, + { + "epoch": 1.6339966792598117, + "grad_norm": 3.1515380798590265, + "learning_rate": 1.7039718098967983e-06, + "loss": 0.8381, + "step": 8488 + }, + { + "epoch": 1.6339966792598117, + "lm_loss": 0.7819, + "step": 8488, + "vm_loss": 0.2411 + }, + { + "epoch": 1.6339966792598117, + "lm_loss": 0.6532, + "step": 8488, + "vm_loss": 0.2146 + }, + { + "epoch": 1.6339966792598117, + "lm_loss": 1.2335, + "step": 8488, + "vm_loss": 0.2049 + }, + { + "epoch": 1.6339966792598117, + "lm_loss": 0.6933, + "step": 8488, + "vm_loss": 0.1606 + }, + { + "epoch": 1.6339966792598117, + "lm_loss": 0.4939, + "step": 8488, + "vm_loss": 0.1665 + }, + { + "epoch": 1.6339966792598117, + "lm_loss": 0.6833, + "step": 8488, + "vm_loss": 0.2046 + }, + { + "epoch": 1.6339966792598117, + "lm_loss": 0.7266, + "step": 8488, + "vm_loss": 0.1303 + }, + { + "epoch": 1.6339966792598117, + "lm_loss": 0.6452, + "step": 8488, + "vm_loss": 0.1142 + }, + { + "epoch": 1.6341891859373872, + "grad_norm": 3.2618775771290904, + "learning_rate": 1.7022313231793364e-06, + "loss": 0.8824, + "step": 8489 + }, + { + "epoch": 1.6343816926149626, + "grad_norm": 3.3458552636418193, + "learning_rate": 1.7004916431111529e-06, + "loss": 0.869, + "step": 8490 + }, + { + "epoch": 1.634574199292538, + "grad_norm": 3.294678228271462, + "learning_rate": 1.6987527698613603e-06, + "loss": 0.9186, + "step": 8491 + }, + { + "epoch": 1.6347667059701134, + "grad_norm": 3.1896106307745815, + "learning_rate": 1.6970147035990025e-06, + "loss": 0.8598, + "step": 8492 + }, + { + "epoch": 1.6349592126476886, + "grad_norm": 3.113586851436277, + "learning_rate": 1.695277444493042e-06, + "loss": 0.8252, + "step": 8493 + }, + { + "epoch": 1.635151719325264, + "grad_norm": 3.234654844009193, + "learning_rate": 1.6935409927123647e-06, + "loss": 0.8698, + "step": 8494 + }, + { + "epoch": 1.6353442260028395, + "grad_norm": 3.3971615641815767, + "learning_rate": 1.6918053484257691e-06, + "loss": 0.921, + "step": 8495 + }, + { + "epoch": 1.6355367326804149, + "grad_norm": 3.186607063898726, + "learning_rate": 1.69007051180199e-06, + "loss": 0.8914, + "step": 8496 + }, + { + "epoch": 1.6355367326804149, + "lm_loss": 0.5472, + "step": 8496, + "vm_loss": 0.1213 + }, + { + "epoch": 1.6355367326804149, + "lm_loss": 0.8396, + "step": 8496, + "vm_loss": 0.1862 + }, + { + "epoch": 1.6355367326804149, + "lm_loss": 0.7031, + "step": 8496, + "vm_loss": 0.1417 + }, + { + "epoch": 1.6355367326804149, + "lm_loss": 1.0508, + "step": 8496, + "vm_loss": 0.151 + }, + { + "epoch": 1.6355367326804149, + "lm_loss": 0.7819, + "step": 8496, + "vm_loss": 0.1054 + }, + { + "epoch": 1.6355367326804149, + "lm_loss": 0.3792, + "step": 8496, + "vm_loss": 0.1674 + }, + { + "epoch": 1.6355367326804149, + "lm_loss": 0.8541, + "step": 8496, + "vm_loss": 0.1048 + }, + { + "epoch": 1.6355367326804149, + "lm_loss": 0.6582, + "step": 8496, + "vm_loss": 0.1207 + }, + { + "epoch": 1.6357292393579903, + "grad_norm": 3.245727340390148, + "learning_rate": 1.6883364830096705e-06, + "loss": 0.8436, + "step": 8497 + }, + { + "epoch": 1.6359217460355655, + "grad_norm": 3.2731521555414744, + "learning_rate": 1.6866032622173822e-06, + "loss": 0.9053, + "step": 8498 + }, + { + "epoch": 1.6361142527131411, + "grad_norm": 3.370282944351226, + "learning_rate": 1.6848708495936161e-06, + "loss": 0.8897, + "step": 8499 + }, + { + "epoch": 1.6363067593907163, + "grad_norm": 3.402486474017815, + "learning_rate": 1.6831392453067885e-06, + "loss": 0.9346, + "step": 8500 + }, + { + "epoch": 1.6364992660682918, + "grad_norm": 3.151677143009986, + "learning_rate": 1.6814084495252258e-06, + "loss": 0.8477, + "step": 8501 + }, + { + "epoch": 1.6366917727458672, + "grad_norm": 3.161219138083262, + "learning_rate": 1.6796784624171891e-06, + "loss": 0.8901, + "step": 8502 + }, + { + "epoch": 1.6368842794234424, + "grad_norm": 3.309755527022719, + "learning_rate": 1.6779492841508527e-06, + "loss": 0.9112, + "step": 8503 + }, + { + "epoch": 1.637076786101018, + "grad_norm": 3.22390679508785, + "learning_rate": 1.6762209148943164e-06, + "loss": 0.9189, + "step": 8504 + }, + { + "epoch": 1.637076786101018, + "lm_loss": 0.7057, + "step": 8504, + "vm_loss": 0.1321 + }, + { + "epoch": 1.637076786101018, + "lm_loss": 0.7496, + "step": 8504, + "vm_loss": 0.134 + }, + { + "epoch": 1.637076786101018, + "lm_loss": 1.1109, + "step": 8504, + "vm_loss": 0.2299 + }, + { + "epoch": 1.637076786101018, + "lm_loss": 0.4952, + "step": 8504, + "vm_loss": 0.1388 + }, + { + "epoch": 1.637076786101018, + "lm_loss": 0.6524, + "step": 8504, + "vm_loss": 0.1505 + }, + { + "epoch": 1.637076786101018, + "lm_loss": 0.6033, + "step": 8504, + "vm_loss": 0.1702 + }, + { + "epoch": 1.637076786101018, + "lm_loss": 0.9249, + "step": 8504, + "vm_loss": 0.1452 + }, + { + "epoch": 1.637076786101018, + "lm_loss": 0.8034, + "step": 8504, + "vm_loss": 0.1836 + }, + { + "epoch": 1.6372692927785932, + "grad_norm": 3.3046820577186833, + "learning_rate": 1.6744933548155994e-06, + "loss": 0.8885, + "step": 8505 + }, + { + "epoch": 1.6374617994561687, + "grad_norm": 3.0990160240948894, + "learning_rate": 1.672766604082643e-06, + "loss": 0.8427, + "step": 8506 + }, + { + "epoch": 1.637654306133744, + "grad_norm": 3.356322915855534, + "learning_rate": 1.6710406628633113e-06, + "loss": 0.9442, + "step": 8507 + }, + { + "epoch": 1.6378468128113193, + "grad_norm": 3.3079469237203347, + "learning_rate": 1.6693155313253828e-06, + "loss": 0.9227, + "step": 8508 + }, + { + "epoch": 1.638039319488895, + "grad_norm": 3.2399651576207256, + "learning_rate": 1.6675912096365655e-06, + "loss": 0.8945, + "step": 8509 + }, + { + "epoch": 1.6382318261664701, + "grad_norm": 3.2265612099292786, + "learning_rate": 1.6658676979644862e-06, + "loss": 0.9297, + "step": 8510 + }, + { + "epoch": 1.6384243328440455, + "grad_norm": 3.35450050058546, + "learning_rate": 1.664144996476692e-06, + "loss": 0.9128, + "step": 8511 + }, + { + "epoch": 1.638616839521621, + "grad_norm": 3.237006886051614, + "learning_rate": 1.662423105340647e-06, + "loss": 0.9146, + "step": 8512 + }, + { + "epoch": 1.638616839521621, + "lm_loss": 0.6118, + "step": 8512, + "vm_loss": 0.1396 + }, + { + "epoch": 1.638616839521621, + "lm_loss": 0.4682, + "step": 8512, + "vm_loss": 0.1661 + }, + { + "epoch": 1.638616839521621, + "lm_loss": 0.6492, + "step": 8512, + "vm_loss": 0.2043 + }, + { + "epoch": 1.638616839521621, + "lm_loss": 0.4621, + "step": 8512, + "vm_loss": 0.1423 + }, + { + "epoch": 1.638616839521621, + "lm_loss": 0.5928, + "step": 8512, + "vm_loss": 0.1545 + }, + { + "epoch": 1.638616839521621, + "lm_loss": 0.4592, + "step": 8512, + "vm_loss": 0.1587 + }, + { + "epoch": 1.638616839521621, + "lm_loss": 0.529, + "step": 8512, + "vm_loss": 0.1219 + }, + { + "epoch": 1.638616839521621, + "lm_loss": 0.5069, + "step": 8512, + "vm_loss": 0.2069 + }, + { + "epoch": 1.6388093461991962, + "grad_norm": 3.1268579958972498, + "learning_rate": 1.6607020247237504e-06, + "loss": 0.8296, + "step": 8513 + }, + { + "epoch": 1.6390018528767718, + "grad_norm": 3.1389705693022103, + "learning_rate": 1.658981754793304e-06, + "loss": 0.856, + "step": 8514 + }, + { + "epoch": 1.639194359554347, + "grad_norm": 3.2782678120357076, + "learning_rate": 1.657262295716545e-06, + "loss": 0.8907, + "step": 8515 + }, + { + "epoch": 1.6393868662319224, + "grad_norm": 3.2826293137258773, + "learning_rate": 1.6555436476606246e-06, + "loss": 0.9095, + "step": 8516 + }, + { + "epoch": 1.6395793729094978, + "grad_norm": 3.2464552625179204, + "learning_rate": 1.6538258107926219e-06, + "loss": 0.8571, + "step": 8517 + }, + { + "epoch": 1.639771879587073, + "grad_norm": 3.2942051842256954, + "learning_rate": 1.652108785279526e-06, + "loss": 0.8816, + "step": 8518 + }, + { + "epoch": 1.6399643862646487, + "grad_norm": 3.362549487709971, + "learning_rate": 1.6503925712882573e-06, + "loss": 0.9566, + "step": 8519 + }, + { + "epoch": 1.6401568929422239, + "grad_norm": 3.1056394535793124, + "learning_rate": 1.6486771689856528e-06, + "loss": 0.8531, + "step": 8520 + }, + { + "epoch": 1.6401568929422239, + "lm_loss": 0.9199, + "step": 8520, + "vm_loss": 0.2464 + }, + { + "epoch": 1.6401568929422239, + "lm_loss": 0.7875, + "step": 8520, + "vm_loss": 0.1389 + }, + { + "epoch": 1.6401568929422239, + "lm_loss": 1.1308, + "step": 8520, + "vm_loss": 0.1887 + }, + { + "epoch": 1.6401568929422239, + "lm_loss": 0.5517, + "step": 8520, + "vm_loss": 0.1648 + }, + { + "epoch": 1.6401568929422239, + "lm_loss": 0.6735, + "step": 8520, + "vm_loss": 0.1728 + }, + { + "epoch": 1.6401568929422239, + "lm_loss": 0.4264, + "step": 8520, + "vm_loss": 0.1692 + }, + { + "epoch": 1.6401568929422239, + "lm_loss": 0.8659, + "step": 8520, + "vm_loss": 0.1628 + }, + { + "epoch": 1.6401568929422239, + "lm_loss": 0.3875, + "step": 8520, + "vm_loss": 0.2023 + }, + { + "epoch": 1.6403493996197993, + "grad_norm": 3.3269962025473387, + "learning_rate": 1.6469625785384746e-06, + "loss": 0.9149, + "step": 8521 + }, + { + "epoch": 1.6405419062973747, + "grad_norm": 3.3960128002494754, + "learning_rate": 1.645248800113396e-06, + "loss": 0.9088, + "step": 8522 + }, + { + "epoch": 1.64073441297495, + "grad_norm": 3.1854575576083626, + "learning_rate": 1.643535833877027e-06, + "loss": 0.8762, + "step": 8523 + }, + { + "epoch": 1.6409269196525256, + "grad_norm": 3.255071206202876, + "learning_rate": 1.641823679995883e-06, + "loss": 0.9089, + "step": 8524 + }, + { + "epoch": 1.6411194263301008, + "grad_norm": 3.2290354368499243, + "learning_rate": 1.6401123386364094e-06, + "loss": 0.8882, + "step": 8525 + }, + { + "epoch": 1.6413119330076762, + "grad_norm": 3.229150164708246, + "learning_rate": 1.6384018099649712e-06, + "loss": 0.9201, + "step": 8526 + }, + { + "epoch": 1.6415044396852516, + "grad_norm": 3.2942404252259494, + "learning_rate": 1.6366920941478526e-06, + "loss": 0.8578, + "step": 8527 + }, + { + "epoch": 1.6416969463628268, + "grad_norm": 3.4658893470830137, + "learning_rate": 1.6349831913512626e-06, + "loss": 0.961, + "step": 8528 + }, + { + "epoch": 1.6416969463628268, + "lm_loss": 0.6479, + "step": 8528, + "vm_loss": 0.1301 + }, + { + "epoch": 1.6416969463628268, + "lm_loss": 0.7459, + "step": 8528, + "vm_loss": 0.1316 + }, + { + "epoch": 1.6416969463628268, + "lm_loss": 0.6563, + "step": 8528, + "vm_loss": 0.1794 + }, + { + "epoch": 1.6416969463628268, + "lm_loss": 0.4744, + "step": 8528, + "vm_loss": 0.0857 + }, + { + "epoch": 1.6416969463628268, + "lm_loss": 0.5223, + "step": 8528, + "vm_loss": 0.1784 + }, + { + "epoch": 1.6416969463628268, + "lm_loss": 0.6838, + "step": 8528, + "vm_loss": 0.1999 + }, + { + "epoch": 1.6416969463628268, + "lm_loss": 0.6971, + "step": 8528, + "vm_loss": 0.2503 + }, + { + "epoch": 1.6416969463628268, + "lm_loss": 0.5268, + "step": 8528, + "vm_loss": 0.1376 + }, + { + "epoch": 1.6418894530404025, + "grad_norm": 3.0272633848553787, + "learning_rate": 1.6332751017413218e-06, + "loss": 0.8131, + "step": 8529 + }, + { + "epoch": 1.6420819597179777, + "grad_norm": 3.1892090525502366, + "learning_rate": 1.6315678254840883e-06, + "loss": 0.8799, + "step": 8530 + }, + { + "epoch": 1.642274466395553, + "grad_norm": 3.3471680456617467, + "learning_rate": 1.6298613627455218e-06, + "loss": 0.9013, + "step": 8531 + }, + { + "epoch": 1.6424669730731285, + "grad_norm": 3.3814493772479453, + "learning_rate": 1.6281557136915172e-06, + "loss": 0.9323, + "step": 8532 + }, + { + "epoch": 1.6426594797507037, + "grad_norm": 3.4312198003595062, + "learning_rate": 1.6264508784878841e-06, + "loss": 0.9264, + "step": 8533 + }, + { + "epoch": 1.6428519864282793, + "grad_norm": 3.332540023792863, + "learning_rate": 1.6247468573003567e-06, + "loss": 0.9336, + "step": 8534 + }, + { + "epoch": 1.6430444931058545, + "grad_norm": 3.1879344360605733, + "learning_rate": 1.6230436502945834e-06, + "loss": 0.887, + "step": 8535 + }, + { + "epoch": 1.64323699978343, + "grad_norm": 3.218520511293312, + "learning_rate": 1.6213412576361398e-06, + "loss": 0.8721, + "step": 8536 + }, + { + "epoch": 1.64323699978343, + "lm_loss": 0.5021, + "step": 8536, + "vm_loss": 0.1787 + }, + { + "epoch": 1.64323699978343, + "lm_loss": 0.8111, + "step": 8536, + "vm_loss": 0.1803 + }, + { + "epoch": 1.64323699978343, + "lm_loss": 1.0127, + "step": 8536, + "vm_loss": 0.1324 + }, + { + "epoch": 1.64323699978343, + "lm_loss": 0.6422, + "step": 8536, + "vm_loss": 0.1351 + }, + { + "epoch": 1.64323699978343, + "lm_loss": 0.5639, + "step": 8536, + "vm_loss": 0.2001 + }, + { + "epoch": 1.64323699978343, + "lm_loss": 0.5559, + "step": 8536, + "vm_loss": 0.1259 + }, + { + "epoch": 1.64323699978343, + "lm_loss": 0.8818, + "step": 8536, + "vm_loss": 0.2433 + }, + { + "epoch": 1.64323699978343, + "lm_loss": 0.8029, + "step": 8536, + "vm_loss": 0.1494 + }, + { + "epoch": 1.6434295064610054, + "grad_norm": 3.3612476835679668, + "learning_rate": 1.6196396794905212e-06, + "loss": 0.9, + "step": 8537 + }, + { + "epoch": 1.6436220131385806, + "grad_norm": 3.273889819108241, + "learning_rate": 1.617938916023143e-06, + "loss": 0.8946, + "step": 8538 + }, + { + "epoch": 1.6438145198161562, + "grad_norm": 3.12152354370881, + "learning_rate": 1.616238967399336e-06, + "loss": 0.8659, + "step": 8539 + }, + { + "epoch": 1.6440070264937314, + "grad_norm": 3.067912306756776, + "learning_rate": 1.6145398337843654e-06, + "loss": 0.8597, + "step": 8540 + }, + { + "epoch": 1.6441995331713068, + "grad_norm": 3.2016386983545146, + "learning_rate": 1.6128415153434017e-06, + "loss": 0.8772, + "step": 8541 + }, + { + "epoch": 1.6443920398488823, + "grad_norm": 3.273360390443935, + "learning_rate": 1.611144012241549e-06, + "loss": 0.9044, + "step": 8542 + }, + { + "epoch": 1.6445845465264577, + "grad_norm": 3.2786507347711518, + "learning_rate": 1.609447324643818e-06, + "loss": 0.912, + "step": 8543 + }, + { + "epoch": 1.6447770532040331, + "grad_norm": 3.132915795662491, + "learning_rate": 1.6077514527151573e-06, + "loss": 0.8538, + "step": 8544 + }, + { + "epoch": 1.6447770532040331, + "lm_loss": 0.6591, + "step": 8544, + "vm_loss": 0.2326 + }, + { + "epoch": 1.6447770532040331, + "lm_loss": 0.79, + "step": 8544, + "vm_loss": 0.1455 + }, + { + "epoch": 1.6447770532040331, + "lm_loss": 0.8092, + "step": 8544, + "vm_loss": 0.1212 + }, + { + "epoch": 1.6447770532040331, + "lm_loss": 0.6855, + "step": 8544, + "vm_loss": 0.1604 + }, + { + "epoch": 1.6447770532040331, + "lm_loss": 0.4217, + "step": 8544, + "vm_loss": 0.2128 + }, + { + "epoch": 1.6447770532040331, + "lm_loss": 0.6045, + "step": 8544, + "vm_loss": 0.1488 + }, + { + "epoch": 1.6447770532040331, + "lm_loss": 1.1429, + "step": 8544, + "vm_loss": 0.216 + }, + { + "epoch": 1.6447770532040331, + "lm_loss": 0.6354, + "step": 8544, + "vm_loss": 0.1527 + }, + { + "epoch": 1.6449695598816083, + "grad_norm": 3.2683914646985457, + "learning_rate": 1.6060563966204224e-06, + "loss": 0.9285, + "step": 8545 + }, + { + "epoch": 1.6451620665591837, + "grad_norm": 3.3960710038787734, + "learning_rate": 1.604362156524395e-06, + "loss": 0.9454, + "step": 8546 + }, + { + "epoch": 1.6453545732367592, + "grad_norm": 3.402781970100242, + "learning_rate": 1.6026687325917779e-06, + "loss": 0.9087, + "step": 8547 + }, + { + "epoch": 1.6455470799143346, + "grad_norm": 3.213231186840626, + "learning_rate": 1.6009761249871936e-06, + "loss": 0.9269, + "step": 8548 + }, + { + "epoch": 1.64573958659191, + "grad_norm": 3.3529769349102314, + "learning_rate": 1.5992843338751874e-06, + "loss": 0.9029, + "step": 8549 + }, + { + "epoch": 1.6459320932694852, + "grad_norm": 3.2361917082805016, + "learning_rate": 1.5975933594202142e-06, + "loss": 0.8635, + "step": 8550 + }, + { + "epoch": 1.6461245999470606, + "grad_norm": 3.1983628388541985, + "learning_rate": 1.5959032017866705e-06, + "loss": 0.864, + "step": 8551 + }, + { + "epoch": 1.646317106624636, + "grad_norm": 3.127493535109191, + "learning_rate": 1.5942138611388536e-06, + "loss": 0.824, + "step": 8552 + }, + { + "epoch": 1.646317106624636, + "lm_loss": 0.4443, + "step": 8552, + "vm_loss": 0.2146 + }, + { + "epoch": 1.646317106624636, + "lm_loss": 0.6904, + "step": 8552, + "vm_loss": 0.1582 + }, + { + "epoch": 1.646317106624636, + "lm_loss": 0.6321, + "step": 8552, + "vm_loss": 0.1679 + }, + { + "epoch": 1.646317106624636, + "lm_loss": 0.7198, + "step": 8552, + "vm_loss": 0.08 + }, + { + "epoch": 1.646317106624636, + "lm_loss": 0.7772, + "step": 8552, + "vm_loss": 0.1711 + }, + { + "epoch": 1.646317106624636, + "lm_loss": 0.7045, + "step": 8552, + "vm_loss": 0.146 + }, + { + "epoch": 1.646317106624636, + "lm_loss": 0.8953, + "step": 8552, + "vm_loss": 0.2365 + }, + { + "epoch": 1.646317106624636, + "lm_loss": 0.7245, + "step": 8552, + "vm_loss": 0.117 + }, + { + "epoch": 1.6465096133022115, + "grad_norm": 3.5815344476331736, + "learning_rate": 1.59252533764099e-06, + "loss": 0.9255, + "step": 8553 + }, + { + "epoch": 1.6467021199797869, + "grad_norm": 3.2068823413874283, + "learning_rate": 1.590837631457226e-06, + "loss": 0.8861, + "step": 8554 + }, + { + "epoch": 1.646894626657362, + "grad_norm": 3.2616126318494985, + "learning_rate": 1.5891507427516316e-06, + "loss": 0.8714, + "step": 8555 + }, + { + "epoch": 1.6470871333349375, + "grad_norm": 3.340829765072319, + "learning_rate": 1.587464671688187e-06, + "loss": 0.9373, + "step": 8556 + }, + { + "epoch": 1.647279640012513, + "grad_norm": 3.292852039147704, + "learning_rate": 1.5857794184308084e-06, + "loss": 0.91, + "step": 8557 + }, + { + "epoch": 1.6474721466900883, + "grad_norm": 3.1128553345387955, + "learning_rate": 1.5840949831433172e-06, + "loss": 0.8456, + "step": 8558 + }, + { + "epoch": 1.6476646533676638, + "grad_norm": 3.1853892489548463, + "learning_rate": 1.582411365989468e-06, + "loss": 0.8463, + "step": 8559 + }, + { + "epoch": 1.647857160045239, + "grad_norm": 3.1844989115021995, + "learning_rate": 1.5807285671329197e-06, + "loss": 0.8598, + "step": 8560 + }, + { + "epoch": 1.647857160045239, + "lm_loss": 0.8092, + "step": 8560, + "vm_loss": 0.1574 + }, + { + "epoch": 1.647857160045239, + "lm_loss": 0.5509, + "step": 8560, + "vm_loss": 0.171 + }, + { + "epoch": 1.647857160045239, + "lm_loss": 0.7338, + "step": 8560, + "vm_loss": 0.0948 + }, + { + "epoch": 1.647857160045239, + "lm_loss": 0.5898, + "step": 8560, + "vm_loss": 0.1842 + }, + { + "epoch": 1.647857160045239, + "lm_loss": 0.8763, + "step": 8560, + "vm_loss": 0.1927 + }, + { + "epoch": 1.647857160045239, + "lm_loss": 0.8373, + "step": 8560, + "vm_loss": 0.1588 + }, + { + "epoch": 1.647857160045239, + "lm_loss": 0.7365, + "step": 8560, + "vm_loss": 0.1688 + }, + { + "epoch": 1.647857160045239, + "lm_loss": 0.3853, + "step": 8560, + "vm_loss": 0.1687 + }, + { + "epoch": 1.6480496667228146, + "grad_norm": 3.105746661486708, + "learning_rate": 1.579046586737275e-06, + "loss": 0.8338, + "step": 8561 + }, + { + "epoch": 1.6482421734003898, + "grad_norm": 3.385922770303578, + "learning_rate": 1.5773654249660342e-06, + "loss": 0.9295, + "step": 8562 + }, + { + "epoch": 1.6484346800779652, + "grad_norm": 3.5486513855255257, + "learning_rate": 1.575685081982632e-06, + "loss": 0.9672, + "step": 8563 + }, + { + "epoch": 1.6486271867555407, + "grad_norm": 3.207737292454642, + "learning_rate": 1.5740055579504176e-06, + "loss": 0.885, + "step": 8564 + }, + { + "epoch": 1.6488196934331159, + "grad_norm": 3.2800049731018555, + "learning_rate": 1.5723268530326653e-06, + "loss": 0.8621, + "step": 8565 + }, + { + "epoch": 1.6490122001106915, + "grad_norm": 3.2939256305741473, + "learning_rate": 1.5706489673925618e-06, + "loss": 0.8813, + "step": 8566 + }, + { + "epoch": 1.6492047067882667, + "grad_norm": 3.261284274347596, + "learning_rate": 1.5689719011932213e-06, + "loss": 0.8581, + "step": 8567 + }, + { + "epoch": 1.6493972134658421, + "grad_norm": 3.3438703056349564, + "learning_rate": 1.567295654597677e-06, + "loss": 0.9254, + "step": 8568 + }, + { + "epoch": 1.6493972134658421, + "lm_loss": 0.8161, + "step": 8568, + "vm_loss": 0.1782 + }, + { + "epoch": 1.6493972134658421, + "lm_loss": 0.7037, + "step": 8568, + "vm_loss": 0.14 + }, + { + "epoch": 1.6493972134658421, + "lm_loss": 0.7304, + "step": 8568, + "vm_loss": 0.2118 + }, + { + "epoch": 1.6493972134658421, + "lm_loss": 0.877, + "step": 8568, + "vm_loss": 0.1501 + }, + { + "epoch": 1.6493972134658421, + "lm_loss": 0.7825, + "step": 8568, + "vm_loss": 0.2216 + }, + { + "epoch": 1.6493972134658421, + "lm_loss": 0.5804, + "step": 8568, + "vm_loss": 0.116 + }, + { + "epoch": 1.6493972134658421, + "lm_loss": 0.3336, + "step": 8568, + "vm_loss": 0.1687 + }, + { + "epoch": 1.6493972134658421, + "lm_loss": 0.4479, + "step": 8568, + "vm_loss": 0.1323 + }, + { + "epoch": 1.6495897201434175, + "grad_norm": 3.286907855805531, + "learning_rate": 1.5656202277688803e-06, + "loss": 0.8613, + "step": 8569 + }, + { + "epoch": 1.6497822268209927, + "grad_norm": 3.5446860409728846, + "learning_rate": 1.5639456208697058e-06, + "loss": 0.9323, + "step": 8570 + }, + { + "epoch": 1.6499747334985684, + "grad_norm": 3.469972201723012, + "learning_rate": 1.562271834062945e-06, + "loss": 0.916, + "step": 8571 + }, + { + "epoch": 1.6501672401761436, + "grad_norm": 3.431246399979431, + "learning_rate": 1.5605988675113137e-06, + "loss": 0.9067, + "step": 8572 + }, + { + "epoch": 1.650359746853719, + "grad_norm": 3.098578298550671, + "learning_rate": 1.5589267213774394e-06, + "loss": 0.8611, + "step": 8573 + }, + { + "epoch": 1.6505522535312944, + "grad_norm": 3.246010420679491, + "learning_rate": 1.5572553958238867e-06, + "loss": 0.8819, + "step": 8574 + }, + { + "epoch": 1.6507447602088696, + "grad_norm": 3.210330082708509, + "learning_rate": 1.55558489101312e-06, + "loss": 0.8519, + "step": 8575 + }, + { + "epoch": 1.6509372668864453, + "grad_norm": 3.2499514432970322, + "learning_rate": 1.5539152071075402e-06, + "loss": 0.8956, + "step": 8576 + }, + { + "epoch": 1.6509372668864453, + "lm_loss": 0.6336, + "step": 8576, + "vm_loss": 0.1452 + }, + { + "epoch": 1.6509372668864453, + "lm_loss": 0.8766, + "step": 8576, + "vm_loss": 0.1409 + }, + { + "epoch": 1.6509372668864453, + "lm_loss": 0.7394, + "step": 8576, + "vm_loss": 0.1312 + }, + { + "epoch": 1.6509372668864453, + "lm_loss": 0.7994, + "step": 8576, + "vm_loss": 0.1476 + }, + { + "epoch": 1.6509372668864453, + "lm_loss": 0.6676, + "step": 8576, + "vm_loss": 0.1913 + }, + { + "epoch": 1.6509372668864453, + "lm_loss": 0.9927, + "step": 8576, + "vm_loss": 0.2275 + }, + { + "epoch": 1.6509372668864453, + "lm_loss": 0.7093, + "step": 8576, + "vm_loss": 0.1291 + }, + { + "epoch": 1.6509372668864453, + "lm_loss": 0.7242, + "step": 8576, + "vm_loss": 0.1718 + }, + { + "epoch": 1.6511297735640205, + "grad_norm": 3.309146418973245, + "learning_rate": 1.5522463442694546e-06, + "loss": 0.8736, + "step": 8577 + }, + { + "epoch": 1.6513222802415959, + "grad_norm": 3.291156938701736, + "learning_rate": 1.5505783026611076e-06, + "loss": 0.8936, + "step": 8578 + }, + { + "epoch": 1.6515147869191713, + "grad_norm": 3.2619975801347234, + "learning_rate": 1.5489110824446462e-06, + "loss": 0.8843, + "step": 8579 + }, + { + "epoch": 1.6517072935967465, + "grad_norm": 3.217711669431549, + "learning_rate": 1.5472446837821487e-06, + "loss": 0.8909, + "step": 8580 + }, + { + "epoch": 1.6518998002743221, + "grad_norm": 3.221748673722485, + "learning_rate": 1.5455791068356108e-06, + "loss": 0.8783, + "step": 8581 + }, + { + "epoch": 1.6520923069518973, + "grad_norm": 3.0030024216598767, + "learning_rate": 1.5439143517669496e-06, + "loss": 0.8289, + "step": 8582 + }, + { + "epoch": 1.6522848136294728, + "grad_norm": 3.2378499897830086, + "learning_rate": 1.542250418737996e-06, + "loss": 0.8786, + "step": 8583 + }, + { + "epoch": 1.6524773203070482, + "grad_norm": 3.265688236045046, + "learning_rate": 1.5405873079105083e-06, + "loss": 0.8939, + "step": 8584 + }, + { + "epoch": 1.6524773203070482, + "lm_loss": 1.0133, + "step": 8584, + "vm_loss": 0.1675 + }, + { + "epoch": 1.6524773203070482, + "lm_loss": 0.764, + "step": 8584, + "vm_loss": 0.1924 + }, + { + "epoch": 1.6524773203070482, + "lm_loss": 0.7781, + "step": 8584, + "vm_loss": 0.2221 + }, + { + "epoch": 1.6524773203070482, + "lm_loss": 0.6213, + "step": 8584, + "vm_loss": 0.1701 + }, + { + "epoch": 1.6524773203070482, + "lm_loss": 1.0419, + "step": 8584, + "vm_loss": 0.1479 + }, + { + "epoch": 1.6524773203070482, + "lm_loss": 0.4164, + "step": 8584, + "vm_loss": 0.1795 + }, + { + "epoch": 1.6524773203070482, + "lm_loss": 0.8432, + "step": 8584, + "vm_loss": 0.1499 + }, + { + "epoch": 1.6524773203070482, + "lm_loss": 0.5623, + "step": 8584, + "vm_loss": 0.2833 + }, + { + "epoch": 1.6526698269846234, + "grad_norm": 3.148808554522905, + "learning_rate": 1.5389250194461602e-06, + "loss": 0.8789, + "step": 8585 + }, + { + "epoch": 1.652862333662199, + "grad_norm": 3.2207234898934125, + "learning_rate": 1.5372635535065506e-06, + "loss": 0.8985, + "step": 8586 + }, + { + "epoch": 1.6530548403397742, + "grad_norm": 3.2975874142122885, + "learning_rate": 1.5356029102531933e-06, + "loss": 0.8988, + "step": 8587 + }, + { + "epoch": 1.6532473470173497, + "grad_norm": 3.4629366259803867, + "learning_rate": 1.533943089847526e-06, + "loss": 0.9151, + "step": 8588 + }, + { + "epoch": 1.653439853694925, + "grad_norm": 3.3645320469139035, + "learning_rate": 1.5322840924509009e-06, + "loss": 0.9093, + "step": 8589 + }, + { + "epoch": 1.6536323603725003, + "grad_norm": 3.314965778125061, + "learning_rate": 1.5306259182245963e-06, + "loss": 0.8768, + "step": 8590 + }, + { + "epoch": 1.653824867050076, + "grad_norm": 3.4655778784003086, + "learning_rate": 1.528968567329806e-06, + "loss": 0.9478, + "step": 8591 + }, + { + "epoch": 1.6540173737276511, + "grad_norm": 3.263730130362224, + "learning_rate": 1.5273120399276486e-06, + "loss": 0.8664, + "step": 8592 + }, + { + "epoch": 1.6540173737276511, + "lm_loss": 1.0209, + "step": 8592, + "vm_loss": 0.1867 + }, + { + "epoch": 1.6540173737276511, + "lm_loss": 0.5994, + "step": 8592, + "vm_loss": 0.1656 + }, + { + "epoch": 1.6540173737276511, + "lm_loss": 0.5958, + "step": 8592, + "vm_loss": 0.1396 + }, + { + "epoch": 1.6540173737276511, + "lm_loss": 0.7338, + "step": 8592, + "vm_loss": 0.1738 + }, + { + "epoch": 1.6540173737276511, + "lm_loss": 0.3963, + "step": 8592, + "vm_loss": 0.1868 + }, + { + "epoch": 1.6540173737276511, + "lm_loss": 0.8106, + "step": 8592, + "vm_loss": 0.1263 + }, + { + "epoch": 1.6540173737276511, + "lm_loss": 0.8736, + "step": 8592, + "vm_loss": 0.1244 + }, + { + "epoch": 1.6540173737276511, + "lm_loss": 0.6983, + "step": 8592, + "vm_loss": 0.2055 + }, + { + "epoch": 1.6542098804052265, + "grad_norm": 3.0706368405825843, + "learning_rate": 1.5256563361791598e-06, + "loss": 0.8604, + "step": 8593 + }, + { + "epoch": 1.654402387082802, + "grad_norm": 3.2293666230213587, + "learning_rate": 1.5240014562452888e-06, + "loss": 0.8733, + "step": 8594 + }, + { + "epoch": 1.6545948937603772, + "grad_norm": 3.283028867293217, + "learning_rate": 1.522347400286921e-06, + "loss": 0.894, + "step": 8595 + }, + { + "epoch": 1.6547874004379528, + "grad_norm": 3.2526690937293057, + "learning_rate": 1.5206941684648446e-06, + "loss": 0.8828, + "step": 8596 + }, + { + "epoch": 1.654979907115528, + "grad_norm": 3.1684084185086996, + "learning_rate": 1.5190417609397768e-06, + "loss": 0.8325, + "step": 8597 + }, + { + "epoch": 1.6551724137931034, + "grad_norm": 3.4292913454824787, + "learning_rate": 1.5173901778723532e-06, + "loss": 0.9443, + "step": 8598 + }, + { + "epoch": 1.6553649204706788, + "grad_norm": 3.180749309509719, + "learning_rate": 1.5157394194231301e-06, + "loss": 0.8549, + "step": 8599 + }, + { + "epoch": 1.655557427148254, + "grad_norm": 3.147228553786558, + "learning_rate": 1.514089485752579e-06, + "loss": 0.8551, + "step": 8600 + }, + { + "epoch": 1.655557427148254, + "lm_loss": 0.7246, + "step": 8600, + "vm_loss": 0.1019 + }, + { + "epoch": 1.655557427148254, + "lm_loss": 0.7496, + "step": 8600, + "vm_loss": 0.2002 + }, + { + "epoch": 1.655557427148254, + "lm_loss": 0.867, + "step": 8600, + "vm_loss": 0.1587 + }, + { + "epoch": 1.655557427148254, + "lm_loss": 0.7668, + "step": 8600, + "vm_loss": 0.1382 + }, + { + "epoch": 1.655557427148254, + "lm_loss": 0.7195, + "step": 8600, + "vm_loss": 0.1621 + }, + { + "epoch": 1.655557427148254, + "lm_loss": 0.8501, + "step": 8600, + "vm_loss": 0.1752 + }, + { + "epoch": 1.655557427148254, + "lm_loss": 0.6099, + "step": 8600, + "vm_loss": 0.1473 + }, + { + "epoch": 1.655557427148254, + "lm_loss": 0.5455, + "step": 8600, + "vm_loss": 0.1664 + }, + { + "epoch": 1.6557499338258297, + "grad_norm": 3.292446592359703, + "learning_rate": 1.5124403770210972e-06, + "loss": 0.9009, + "step": 8601 + }, + { + "epoch": 1.6559424405034049, + "grad_norm": 3.3249381820495953, + "learning_rate": 1.5107920933889976e-06, + "loss": 0.9146, + "step": 8602 + }, + { + "epoch": 1.6561349471809803, + "grad_norm": 3.351641227618159, + "learning_rate": 1.5091446350165173e-06, + "loss": 0.865, + "step": 8603 + }, + { + "epoch": 1.6563274538585557, + "grad_norm": 3.1480824762923043, + "learning_rate": 1.507498002063803e-06, + "loss": 0.8724, + "step": 8604 + }, + { + "epoch": 1.656519960536131, + "grad_norm": 3.3068635263293973, + "learning_rate": 1.5058521946909399e-06, + "loss": 0.9284, + "step": 8605 + }, + { + "epoch": 1.6567124672137066, + "grad_norm": 3.1165652984161345, + "learning_rate": 1.5042072130579122e-06, + "loss": 0.8535, + "step": 8606 + }, + { + "epoch": 1.6569049738912818, + "grad_norm": 3.1629556822327647, + "learning_rate": 1.5025630573246363e-06, + "loss": 0.8807, + "step": 8607 + }, + { + "epoch": 1.6570974805688572, + "grad_norm": 3.1481275223575964, + "learning_rate": 1.5009197276509446e-06, + "loss": 0.8571, + "step": 8608 + }, + { + "epoch": 1.6570974805688572, + "lm_loss": 0.9178, + "step": 8608, + "vm_loss": 0.1613 + }, + { + "epoch": 1.6570974805688572, + "lm_loss": 0.7153, + "step": 8608, + "vm_loss": 0.1521 + }, + { + "epoch": 1.6570974805688572, + "lm_loss": 0.7857, + "step": 8608, + "vm_loss": 0.1649 + }, + { + "epoch": 1.6570974805688572, + "lm_loss": 0.5521, + "step": 8608, + "vm_loss": 0.1836 + }, + { + "epoch": 1.6570974805688572, + "lm_loss": 0.6812, + "step": 8608, + "vm_loss": 0.1564 + }, + { + "epoch": 1.6570974805688572, + "lm_loss": 0.7867, + "step": 8608, + "vm_loss": 0.1963 + }, + { + "epoch": 1.6570974805688572, + "lm_loss": 0.6027, + "step": 8608, + "vm_loss": 0.1955 + }, + { + "epoch": 1.6570974805688572, + "lm_loss": 0.9518, + "step": 8608, + "vm_loss": 0.1504 + }, + { + "epoch": 1.6572899872464326, + "grad_norm": 3.273485719571236, + "learning_rate": 1.4992772241965935e-06, + "loss": 0.873, + "step": 8609 + }, + { + "epoch": 1.657482493924008, + "grad_norm": 3.2619107685133697, + "learning_rate": 1.49763554712125e-06, + "loss": 0.8909, + "step": 8610 + }, + { + "epoch": 1.6576750006015835, + "grad_norm": 3.0510990190415352, + "learning_rate": 1.4959946965845084e-06, + "loss": 0.7988, + "step": 8611 + }, + { + "epoch": 1.6578675072791587, + "grad_norm": 3.3338252414019425, + "learning_rate": 1.4943546727458802e-06, + "loss": 0.9106, + "step": 8612 + }, + { + "epoch": 1.658060013956734, + "grad_norm": 3.480807468628937, + "learning_rate": 1.4927154757647967e-06, + "loss": 0.942, + "step": 8613 + }, + { + "epoch": 1.6582525206343095, + "grad_norm": 3.2978699376414076, + "learning_rate": 1.491077105800608e-06, + "loss": 0.9154, + "step": 8614 + }, + { + "epoch": 1.658445027311885, + "grad_norm": 3.2960106036443557, + "learning_rate": 1.4894395630125868e-06, + "loss": 0.8866, + "step": 8615 + }, + { + "epoch": 1.6586375339894603, + "grad_norm": 3.165725497970785, + "learning_rate": 1.4878028475599237e-06, + "loss": 0.8958, + "step": 8616 + }, + { + "epoch": 1.6586375339894603, + "lm_loss": 0.7562, + "step": 8616, + "vm_loss": 0.2352 + }, + { + "epoch": 1.6586375339894603, + "lm_loss": 0.9552, + "step": 8616, + "vm_loss": 0.1616 + }, + { + "epoch": 1.6586375339894603, + "lm_loss": 0.8778, + "step": 8616, + "vm_loss": 0.213 + }, + { + "epoch": 1.6586375339894603, + "lm_loss": 0.5895, + "step": 8616, + "vm_loss": 0.1436 + }, + { + "epoch": 1.6586375339894603, + "lm_loss": 0.7849, + "step": 8616, + "vm_loss": 0.1229 + }, + { + "epoch": 1.6586375339894603, + "lm_loss": 0.6183, + "step": 8616, + "vm_loss": 0.0964 + }, + { + "epoch": 1.6586375339894603, + "lm_loss": 0.5651, + "step": 8616, + "vm_loss": 0.1489 + }, + { + "epoch": 1.6586375339894603, + "lm_loss": 0.7164, + "step": 8616, + "vm_loss": 0.1942 + }, + { + "epoch": 1.6588300406670355, + "grad_norm": 3.448421431615067, + "learning_rate": 1.4861669596017237e-06, + "loss": 0.9136, + "step": 8617 + }, + { + "epoch": 1.659022547344611, + "grad_norm": 3.1740875942782276, + "learning_rate": 1.4845318992970203e-06, + "loss": 0.86, + "step": 8618 + }, + { + "epoch": 1.6592150540221864, + "grad_norm": 3.3788893341495903, + "learning_rate": 1.48289766680476e-06, + "loss": 0.9262, + "step": 8619 + }, + { + "epoch": 1.6594075606997618, + "grad_norm": 3.194499805785924, + "learning_rate": 1.4812642622838148e-06, + "loss": 0.8662, + "step": 8620 + }, + { + "epoch": 1.6596000673773372, + "grad_norm": 3.305860907010635, + "learning_rate": 1.4796316858929648e-06, + "loss": 0.8752, + "step": 8621 + }, + { + "epoch": 1.6597925740549124, + "grad_norm": 3.201176208449638, + "learning_rate": 1.4779999377909271e-06, + "loss": 0.8643, + "step": 8622 + }, + { + "epoch": 1.659985080732488, + "grad_norm": 3.316934044031229, + "learning_rate": 1.476369018136321e-06, + "loss": 0.9067, + "step": 8623 + }, + { + "epoch": 1.6601775874100633, + "grad_norm": 3.4796896556572374, + "learning_rate": 1.4747389270876977e-06, + "loss": 0.9132, + "step": 8624 + }, + { + "epoch": 1.6601775874100633, + "lm_loss": 1.0447, + "step": 8624, + "vm_loss": 0.2471 + }, + { + "epoch": 1.6601775874100633, + "lm_loss": 0.7276, + "step": 8624, + "vm_loss": 0.196 + }, + { + "epoch": 1.6601775874100633, + "lm_loss": 0.6373, + "step": 8624, + "vm_loss": 0.2002 + }, + { + "epoch": 1.6601775874100633, + "lm_loss": 0.4689, + "step": 8624, + "vm_loss": 0.1688 + }, + { + "epoch": 1.6601775874100633, + "lm_loss": 0.5324, + "step": 8624, + "vm_loss": 0.1547 + }, + { + "epoch": 1.6601775874100633, + "lm_loss": 0.7878, + "step": 8624, + "vm_loss": 0.2042 + }, + { + "epoch": 1.6601775874100633, + "lm_loss": 1.1226, + "step": 8624, + "vm_loss": 0.1537 + }, + { + "epoch": 1.6601775874100633, + "lm_loss": 0.9283, + "step": 8624, + "vm_loss": 0.1915 + }, + { + "epoch": 1.6603700940876387, + "grad_norm": 3.2580092370535447, + "learning_rate": 1.4731096648035158e-06, + "loss": 0.9013, + "step": 8625 + }, + { + "epoch": 1.6605626007652141, + "grad_norm": 3.1987584969560903, + "learning_rate": 1.4714812314421711e-06, + "loss": 0.8889, + "step": 8626 + }, + { + "epoch": 1.6607551074427893, + "grad_norm": 3.378742204547144, + "learning_rate": 1.469853627161959e-06, + "loss": 0.9387, + "step": 8627 + }, + { + "epoch": 1.660947614120365, + "grad_norm": 3.2596257329743303, + "learning_rate": 1.4682268521211075e-06, + "loss": 0.9044, + "step": 8628 + }, + { + "epoch": 1.6611401207979402, + "grad_norm": 3.1555085556971814, + "learning_rate": 1.4666009064777587e-06, + "loss": 0.8573, + "step": 8629 + }, + { + "epoch": 1.6613326274755156, + "grad_norm": 3.11067129782355, + "learning_rate": 1.4649757903899752e-06, + "loss": 0.8309, + "step": 8630 + }, + { + "epoch": 1.661525134153091, + "grad_norm": 3.215776551744369, + "learning_rate": 1.463351504015741e-06, + "loss": 0.864, + "step": 8631 + }, + { + "epoch": 1.6617176408306662, + "grad_norm": 3.1944682161970346, + "learning_rate": 1.461728047512958e-06, + "loss": 0.8349, + "step": 8632 + }, + { + "epoch": 1.6617176408306662, + "lm_loss": 0.8908, + "step": 8632, + "vm_loss": 0.1249 + }, + { + "epoch": 1.6617176408306662, + "lm_loss": 0.6637, + "step": 8632, + "vm_loss": 0.1742 + }, + { + "epoch": 1.6617176408306662, + "lm_loss": 0.4599, + "step": 8632, + "vm_loss": 0.117 + }, + { + "epoch": 1.6617176408306662, + "lm_loss": 1.2626, + "step": 8632, + "vm_loss": 0.1571 + }, + { + "epoch": 1.6617176408306662, + "lm_loss": 0.6261, + "step": 8632, + "vm_loss": 0.1854 + }, + { + "epoch": 1.6617176408306662, + "lm_loss": 0.719, + "step": 8632, + "vm_loss": 0.1769 + }, + { + "epoch": 1.6617176408306662, + "lm_loss": 0.708, + "step": 8632, + "vm_loss": 0.1815 + }, + { + "epoch": 1.6617176408306662, + "lm_loss": 0.6153, + "step": 8632, + "vm_loss": 0.138 + }, + { + "epoch": 1.6619101475082418, + "grad_norm": 3.0728873791224682, + "learning_rate": 1.4601054210394428e-06, + "loss": 0.809, + "step": 8633 + }, + { + "epoch": 1.662102654185817, + "grad_norm": 3.2411696687244813, + "learning_rate": 1.4584836247529377e-06, + "loss": 0.8674, + "step": 8634 + }, + { + "epoch": 1.6622951608633925, + "grad_norm": 3.1629452086313363, + "learning_rate": 1.4568626588111023e-06, + "loss": 0.8242, + "step": 8635 + }, + { + "epoch": 1.6624876675409679, + "grad_norm": 3.26054925572343, + "learning_rate": 1.455242523371515e-06, + "loss": 0.8585, + "step": 8636 + }, + { + "epoch": 1.662680174218543, + "grad_norm": 3.275617540411821, + "learning_rate": 1.4536232185916765e-06, + "loss": 0.9255, + "step": 8637 + }, + { + "epoch": 1.6628726808961187, + "grad_norm": 3.1331767130555166, + "learning_rate": 1.4520047446289964e-06, + "loss": 0.8042, + "step": 8638 + }, + { + "epoch": 1.663065187573694, + "grad_norm": 3.1747286387788893, + "learning_rate": 1.45038710164082e-06, + "loss": 0.8556, + "step": 8639 + }, + { + "epoch": 1.6632576942512693, + "grad_norm": 3.0959212464276904, + "learning_rate": 1.4487702897843969e-06, + "loss": 0.8385, + "step": 8640 + }, + { + "epoch": 1.6632576942512693, + "lm_loss": 0.6467, + "step": 8640, + "vm_loss": 0.1838 + }, + { + "epoch": 1.6632576942512693, + "lm_loss": 0.4168, + "step": 8640, + "vm_loss": 0.0948 + }, + { + "epoch": 1.6632576942512693, + "lm_loss": 0.8074, + "step": 8640, + "vm_loss": 0.1682 + }, + { + "epoch": 1.6632576942512693, + "lm_loss": 0.5727, + "step": 8640, + "vm_loss": 0.1776 + }, + { + "epoch": 1.6632576942512693, + "lm_loss": 0.9478, + "step": 8640, + "vm_loss": 0.1192 + }, + { + "epoch": 1.6632576942512693, + "lm_loss": 0.4803, + "step": 8640, + "vm_loss": 0.168 + }, + { + "epoch": 1.6632576942512693, + "lm_loss": 0.6475, + "step": 8640, + "vm_loss": 0.1355 + }, + { + "epoch": 1.6632576942512693, + "lm_loss": 1.1566, + "step": 8640, + "vm_loss": 0.1507 + }, + { + "epoch": 1.6634502009288448, + "grad_norm": 3.230317078988885, + "learning_rate": 1.4471543092169072e-06, + "loss": 0.8464, + "step": 8641 + }, + { + "epoch": 1.66364270760642, + "grad_norm": 3.327929402150926, + "learning_rate": 1.445539160095436e-06, + "loss": 0.8797, + "step": 8642 + }, + { + "epoch": 1.6638352142839956, + "grad_norm": 3.273601998759998, + "learning_rate": 1.4439248425770058e-06, + "loss": 0.8864, + "step": 8643 + }, + { + "epoch": 1.6640277209615708, + "grad_norm": 3.220464535294617, + "learning_rate": 1.4423113568185433e-06, + "loss": 0.8645, + "step": 8644 + }, + { + "epoch": 1.6642202276391462, + "grad_norm": 3.2937547286579676, + "learning_rate": 1.4406987029769026e-06, + "loss": 0.8529, + "step": 8645 + }, + { + "epoch": 1.6644127343167217, + "grad_norm": 3.157043837647187, + "learning_rate": 1.4390868812088521e-06, + "loss": 0.8293, + "step": 8646 + }, + { + "epoch": 1.6646052409942969, + "grad_norm": 3.2342571082236486, + "learning_rate": 1.4374758916710863e-06, + "loss": 0.8891, + "step": 8647 + }, + { + "epoch": 1.6647977476718725, + "grad_norm": 3.2269554137077585, + "learning_rate": 1.4358657345202043e-06, + "loss": 0.8856, + "step": 8648 + }, + { + "epoch": 1.6647977476718725, + "lm_loss": 1.0571, + "step": 8648, + "vm_loss": 0.1861 + }, + { + "epoch": 1.6647977476718725, + "lm_loss": 0.6571, + "step": 8648, + "vm_loss": 0.15 + }, + { + "epoch": 1.6647977476718725, + "lm_loss": 1.1517, + "step": 8648, + "vm_loss": 0.1698 + }, + { + "epoch": 1.6647977476718725, + "lm_loss": 0.7242, + "step": 8648, + "vm_loss": 0.1091 + }, + { + "epoch": 1.6647977476718725, + "lm_loss": 0.9443, + "step": 8648, + "vm_loss": 0.1777 + }, + { + "epoch": 1.6647977476718725, + "lm_loss": 0.6744, + "step": 8648, + "vm_loss": 0.1454 + }, + { + "epoch": 1.6647977476718725, + "lm_loss": 0.522, + "step": 8648, + "vm_loss": 0.1433 + }, + { + "epoch": 1.6647977476718725, + "lm_loss": 0.7312, + "step": 8648, + "vm_loss": 0.1543 + }, + { + "epoch": 1.6649902543494477, + "grad_norm": 3.2064001453693742, + "learning_rate": 1.4342564099127465e-06, + "loss": 0.8896, + "step": 8649 + }, + { + "epoch": 1.6651827610270231, + "grad_norm": 3.1865968676083605, + "learning_rate": 1.43264791800515e-06, + "loss": 0.8844, + "step": 8650 + }, + { + "epoch": 1.6653752677045985, + "grad_norm": 3.4245156551821254, + "learning_rate": 1.4310402589537852e-06, + "loss": 0.9231, + "step": 8651 + }, + { + "epoch": 1.6655677743821737, + "grad_norm": 3.286437050363198, + "learning_rate": 1.429433432914935e-06, + "loss": 0.8677, + "step": 8652 + }, + { + "epoch": 1.6657602810597494, + "grad_norm": 3.2948250752241552, + "learning_rate": 1.4278274400448056e-06, + "loss": 0.8979, + "step": 8653 + }, + { + "epoch": 1.6659527877373246, + "grad_norm": 3.3176248888995996, + "learning_rate": 1.4262222804995217e-06, + "loss": 0.9127, + "step": 8654 + }, + { + "epoch": 1.6661452944149, + "grad_norm": 3.396234409003037, + "learning_rate": 1.4246179544351202e-06, + "loss": 0.9161, + "step": 8655 + }, + { + "epoch": 1.6663378010924754, + "grad_norm": 3.2404596217464636, + "learning_rate": 1.4230144620075648e-06, + "loss": 0.878, + "step": 8656 + }, + { + "epoch": 1.6663378010924754, + "lm_loss": 0.3272, + "step": 8656, + "vm_loss": 0.2388 + }, + { + "epoch": 1.6663378010924754, + "lm_loss": 0.9471, + "step": 8656, + "vm_loss": 0.2977 + }, + { + "epoch": 1.6663378010924754, + "lm_loss": 0.9748, + "step": 8656, + "vm_loss": 0.1383 + }, + { + "epoch": 1.6663378010924754, + "lm_loss": 0.9703, + "step": 8656, + "vm_loss": 0.1995 + }, + { + "epoch": 1.6663378010924754, + "lm_loss": 0.3722, + "step": 8656, + "vm_loss": 0.1466 + }, + { + "epoch": 1.6663378010924754, + "lm_loss": 0.764, + "step": 8656, + "vm_loss": 0.0933 + }, + { + "epoch": 1.6663378010924754, + "lm_loss": 0.7902, + "step": 8656, + "vm_loss": 0.1104 + }, + { + "epoch": 1.6663378010924754, + "lm_loss": 0.9039, + "step": 8656, + "vm_loss": 0.1775 + }, + { + "epoch": 1.6665303077700506, + "grad_norm": 3.312944757013938, + "learning_rate": 1.4214118033727343e-06, + "loss": 0.9283, + "step": 8657 + }, + { + "epoch": 1.6667228144476263, + "grad_norm": 3.23270419046511, + "learning_rate": 1.4198099786864327e-06, + "loss": 0.8796, + "step": 8658 + }, + { + "epoch": 1.6669153211252015, + "grad_norm": 3.1366810466161796, + "learning_rate": 1.418208988104368e-06, + "loss": 0.862, + "step": 8659 + }, + { + "epoch": 1.6671078278027769, + "grad_norm": 3.3520670955629015, + "learning_rate": 1.416608831782188e-06, + "loss": 0.8655, + "step": 8660 + }, + { + "epoch": 1.6673003344803523, + "grad_norm": 3.3538257827072964, + "learning_rate": 1.4150095098754402e-06, + "loss": 0.903, + "step": 8661 + }, + { + "epoch": 1.6674928411579275, + "grad_norm": 3.2615696767489424, + "learning_rate": 1.4134110225396024e-06, + "loss": 0.8995, + "step": 8662 + }, + { + "epoch": 1.6676853478355032, + "grad_norm": 3.263676372790851, + "learning_rate": 1.4118133699300685e-06, + "loss": 0.8819, + "step": 8663 + }, + { + "epoch": 1.6678778545130784, + "grad_norm": 3.1008050641425493, + "learning_rate": 1.4102165522021516e-06, + "loss": 0.842, + "step": 8664 + }, + { + "epoch": 1.6678778545130784, + "lm_loss": 0.5606, + "step": 8664, + "vm_loss": 0.1786 + }, + { + "epoch": 1.6678778545130784, + "lm_loss": 0.7015, + "step": 8664, + "vm_loss": 0.1741 + }, + { + "epoch": 1.6678778545130784, + "lm_loss": 0.723, + "step": 8664, + "vm_loss": 0.1484 + }, + { + "epoch": 1.6678778545130784, + "lm_loss": 0.914, + "step": 8664, + "vm_loss": 0.1724 + }, + { + "epoch": 1.6678778545130784, + "lm_loss": 0.8645, + "step": 8664, + "vm_loss": 0.1514 + }, + { + "epoch": 1.6678778545130784, + "lm_loss": 0.6933, + "step": 8664, + "vm_loss": 0.1834 + }, + { + "epoch": 1.6678778545130784, + "lm_loss": 0.8451, + "step": 8664, + "vm_loss": 0.2329 + }, + { + "epoch": 1.6678778545130784, + "lm_loss": 0.6587, + "step": 8664, + "vm_loss": 0.1919 + }, + { + "epoch": 1.6680703611906538, + "grad_norm": 3.209617531633429, + "learning_rate": 1.4086205695110778e-06, + "loss": 0.8993, + "step": 8665 + }, + { + "epoch": 1.6682628678682292, + "grad_norm": 3.0392467582317244, + "learning_rate": 1.4070254220120039e-06, + "loss": 0.8147, + "step": 8666 + }, + { + "epoch": 1.6684553745458044, + "grad_norm": 3.2463083107447215, + "learning_rate": 1.4054311098599926e-06, + "loss": 0.8523, + "step": 8667 + }, + { + "epoch": 1.66864788122338, + "grad_norm": 3.1553068906414823, + "learning_rate": 1.403837633210038e-06, + "loss": 0.875, + "step": 8668 + }, + { + "epoch": 1.6688403879009552, + "grad_norm": 3.2210873315707955, + "learning_rate": 1.4022449922170368e-06, + "loss": 0.8686, + "step": 8669 + }, + { + "epoch": 1.6690328945785307, + "grad_norm": 3.106918134249493, + "learning_rate": 1.400653187035824e-06, + "loss": 0.858, + "step": 8670 + }, + { + "epoch": 1.669225401256106, + "grad_norm": 3.2638744093433254, + "learning_rate": 1.3990622178211365e-06, + "loss": 0.8831, + "step": 8671 + }, + { + "epoch": 1.6694179079336815, + "grad_norm": 3.259449104393686, + "learning_rate": 1.3974720847276412e-06, + "loss": 0.9039, + "step": 8672 + }, + { + "epoch": 1.6694179079336815, + "lm_loss": 0.6322, + "step": 8672, + "vm_loss": 0.1798 + }, + { + "epoch": 1.6694179079336815, + "lm_loss": 0.9701, + "step": 8672, + "vm_loss": 0.1308 + }, + { + "epoch": 1.6694179079336815, + "lm_loss": 0.7481, + "step": 8672, + "vm_loss": 0.1516 + }, + { + "epoch": 1.6694179079336815, + "lm_loss": 0.7956, + "step": 8672, + "vm_loss": 0.1116 + }, + { + "epoch": 1.6694179079336815, + "lm_loss": 0.9709, + "step": 8672, + "vm_loss": 0.154 + }, + { + "epoch": 1.6694179079336815, + "lm_loss": 0.693, + "step": 8672, + "vm_loss": 0.1394 + }, + { + "epoch": 1.6694179079336815, + "lm_loss": 0.445, + "step": 8672, + "vm_loss": 0.133 + }, + { + "epoch": 1.6694179079336815, + "lm_loss": 0.4672, + "step": 8672, + "vm_loss": 0.1483 + }, + { + "epoch": 1.669610414611257, + "grad_norm": 3.378183535267053, + "learning_rate": 1.3958827879099169e-06, + "loss": 0.8803, + "step": 8673 + }, + { + "epoch": 1.6698029212888321, + "grad_norm": 3.3190902676777747, + "learning_rate": 1.3942943275224641e-06, + "loss": 0.8641, + "step": 8674 + }, + { + "epoch": 1.6699954279664075, + "grad_norm": 3.1162925071191263, + "learning_rate": 1.3927067037197039e-06, + "loss": 0.8261, + "step": 8675 + }, + { + "epoch": 1.670187934643983, + "grad_norm": 3.0538030158057325, + "learning_rate": 1.3911199166559675e-06, + "loss": 0.8218, + "step": 8676 + }, + { + "epoch": 1.6703804413215584, + "grad_norm": 3.415094721865597, + "learning_rate": 1.38953396648552e-06, + "loss": 0.9105, + "step": 8677 + }, + { + "epoch": 1.6705729479991338, + "grad_norm": 3.1574624157504396, + "learning_rate": 1.3879488533625286e-06, + "loss": 0.8886, + "step": 8678 + }, + { + "epoch": 1.670765454676709, + "grad_norm": 3.181947112819833, + "learning_rate": 1.3863645774410883e-06, + "loss": 0.8558, + "step": 8679 + }, + { + "epoch": 1.6709579613542844, + "grad_norm": 3.420222345542465, + "learning_rate": 1.3847811388752119e-06, + "loss": 0.9115, + "step": 8680 + }, + { + "epoch": 1.6709579613542844, + "lm_loss": 0.5386, + "step": 8680, + "vm_loss": 0.2117 + }, + { + "epoch": 1.6709579613542844, + "lm_loss": 1.0263, + "step": 8680, + "vm_loss": 0.1886 + }, + { + "epoch": 1.6709579613542844, + "lm_loss": 0.7111, + "step": 8680, + "vm_loss": 0.1328 + }, + { + "epoch": 1.6709579613542844, + "lm_loss": 0.5011, + "step": 8680, + "vm_loss": 0.1045 + }, + { + "epoch": 1.6709579613542844, + "lm_loss": 0.6043, + "step": 8680, + "vm_loss": 0.1869 + }, + { + "epoch": 1.6709579613542844, + "lm_loss": 0.6528, + "step": 8680, + "vm_loss": 0.1638 + }, + { + "epoch": 1.6709579613542844, + "lm_loss": 0.8272, + "step": 8680, + "vm_loss": 0.1432 + }, + { + "epoch": 1.6709579613542844, + "lm_loss": 0.7701, + "step": 8680, + "vm_loss": 0.1322 + }, + { + "epoch": 1.6711504680318598, + "grad_norm": 3.300307039919615, + "learning_rate": 1.3831985378188317e-06, + "loss": 0.8677, + "step": 8681 + }, + { + "epoch": 1.6713429747094353, + "grad_norm": 3.2744881440019467, + "learning_rate": 1.3816167744257902e-06, + "loss": 0.8441, + "step": 8682 + }, + { + "epoch": 1.6715354813870107, + "grad_norm": 3.4880328278006796, + "learning_rate": 1.3800358488498645e-06, + "loss": 0.9261, + "step": 8683 + }, + { + "epoch": 1.671727988064586, + "grad_norm": 3.2779962678486436, + "learning_rate": 1.3784557612447325e-06, + "loss": 0.9037, + "step": 8684 + }, + { + "epoch": 1.6719204947421615, + "grad_norm": 3.195543899756157, + "learning_rate": 1.3768765117640059e-06, + "loss": 0.8676, + "step": 8685 + }, + { + "epoch": 1.6721130014197367, + "grad_norm": 3.19753293842354, + "learning_rate": 1.3752981005611988e-06, + "loss": 0.8521, + "step": 8686 + }, + { + "epoch": 1.6723055080973122, + "grad_norm": 3.304726085017399, + "learning_rate": 1.3737205277897637e-06, + "loss": 0.9105, + "step": 8687 + }, + { + "epoch": 1.6724980147748876, + "grad_norm": 3.2585186983972743, + "learning_rate": 1.3721437936030524e-06, + "loss": 0.8776, + "step": 8688 + }, + { + "epoch": 1.6724980147748876, + "lm_loss": 0.597, + "step": 8688, + "vm_loss": 0.1574 + }, + { + "epoch": 1.6724980147748876, + "lm_loss": 0.7772, + "step": 8688, + "vm_loss": 0.143 + }, + { + "epoch": 1.6724980147748876, + "lm_loss": 1.1408, + "step": 8688, + "vm_loss": 0.0958 + }, + { + "epoch": 1.6724980147748876, + "lm_loss": 0.5652, + "step": 8688, + "vm_loss": 0.1798 + }, + { + "epoch": 1.6724980147748876, + "lm_loss": 0.7737, + "step": 8688, + "vm_loss": 0.1081 + }, + { + "epoch": 1.6724980147748876, + "lm_loss": 0.4744, + "step": 8688, + "vm_loss": 0.0889 + }, + { + "epoch": 1.6724980147748876, + "lm_loss": 0.9416, + "step": 8688, + "vm_loss": 0.1622 + }, + { + "epoch": 1.6724980147748876, + "lm_loss": 0.551, + "step": 8688, + "vm_loss": 0.1418 + }, + { + "epoch": 1.6726905214524628, + "grad_norm": 3.0773575755433606, + "learning_rate": 1.370567898154348e-06, + "loss": 0.7906, + "step": 8689 + }, + { + "epoch": 1.6728830281300384, + "grad_norm": 3.211895231477191, + "learning_rate": 1.3689928415968456e-06, + "loss": 0.872, + "step": 8690 + }, + { + "epoch": 1.6730755348076136, + "grad_norm": 3.2176081339314586, + "learning_rate": 1.367418624083665e-06, + "loss": 0.8938, + "step": 8691 + }, + { + "epoch": 1.673268041485189, + "grad_norm": 3.3111613825138226, + "learning_rate": 1.365845245767835e-06, + "loss": 0.9052, + "step": 8692 + }, + { + "epoch": 1.6734605481627645, + "grad_norm": 3.1365353895482104, + "learning_rate": 1.3642727068023087e-06, + "loss": 0.8561, + "step": 8693 + }, + { + "epoch": 1.6736530548403397, + "grad_norm": 3.372813772824032, + "learning_rate": 1.3627010073399605e-06, + "loss": 0.8886, + "step": 8694 + }, + { + "epoch": 1.6738455615179153, + "grad_norm": 3.4618630067403378, + "learning_rate": 1.3611301475335759e-06, + "loss": 0.9459, + "step": 8695 + }, + { + "epoch": 1.6740380681954905, + "grad_norm": 3.1775714126265355, + "learning_rate": 1.359560127535865e-06, + "loss": 0.8436, + "step": 8696 + }, + { + "epoch": 1.6740380681954905, + "lm_loss": 0.6031, + "step": 8696, + "vm_loss": 0.1836 + }, + { + "epoch": 1.6740380681954905, + "lm_loss": 0.6202, + "step": 8696, + "vm_loss": 0.1992 + }, + { + "epoch": 1.6740380681954905, + "lm_loss": 0.6653, + "step": 8696, + "vm_loss": 0.1803 + }, + { + "epoch": 1.6740380681954905, + "lm_loss": 0.6178, + "step": 8696, + "vm_loss": 0.1768 + }, + { + "epoch": 1.6740380681954905, + "lm_loss": 0.4381, + "step": 8696, + "vm_loss": 0.1851 + }, + { + "epoch": 1.6740380681954905, + "lm_loss": 0.768, + "step": 8696, + "vm_loss": 0.1153 + }, + { + "epoch": 1.6740380681954905, + "lm_loss": 0.6019, + "step": 8696, + "vm_loss": 0.1668 + }, + { + "epoch": 1.6740380681954905, + "lm_loss": 0.878, + "step": 8696, + "vm_loss": 0.154 + }, + { + "epoch": 1.674230574873066, + "grad_norm": 3.272170119210459, + "learning_rate": 1.3579909474994534e-06, + "loss": 0.8851, + "step": 8697 + }, + { + "epoch": 1.6744230815506413, + "grad_norm": 3.4114111981751036, + "learning_rate": 1.3564226075768883e-06, + "loss": 0.9285, + "step": 8698 + }, + { + "epoch": 1.6746155882282165, + "grad_norm": 3.337898646747744, + "learning_rate": 1.3548551079206262e-06, + "loss": 0.8961, + "step": 8699 + }, + { + "epoch": 1.6748080949057922, + "grad_norm": 3.145509714349786, + "learning_rate": 1.353288448683051e-06, + "loss": 0.8561, + "step": 8700 + }, + { + "epoch": 1.6750006015833674, + "grad_norm": 3.185125432679613, + "learning_rate": 1.3517226300164632e-06, + "loss": 0.8621, + "step": 8701 + }, + { + "epoch": 1.6751931082609428, + "grad_norm": 3.2291425655737567, + "learning_rate": 1.3501576520730809e-06, + "loss": 0.884, + "step": 8702 + }, + { + "epoch": 1.6753856149385182, + "grad_norm": 3.2291281177363027, + "learning_rate": 1.3485935150050345e-06, + "loss": 0.8311, + "step": 8703 + }, + { + "epoch": 1.6755781216160934, + "grad_norm": 3.226636224996044, + "learning_rate": 1.347030218964387e-06, + "loss": 0.8528, + "step": 8704 + }, + { + "epoch": 1.6755781216160934, + "lm_loss": 0.7582, + "step": 8704, + "vm_loss": 0.2331 + }, + { + "epoch": 1.6755781216160934, + "lm_loss": 1.0301, + "step": 8704, + "vm_loss": 0.1868 + }, + { + "epoch": 1.6755781216160934, + "lm_loss": 0.5202, + "step": 8704, + "vm_loss": 0.1781 + }, + { + "epoch": 1.6755781216160934, + "lm_loss": 0.6198, + "step": 8704, + "vm_loss": 0.2241 + }, + { + "epoch": 1.6755781216160934, + "lm_loss": 0.9205, + "step": 8704, + "vm_loss": 0.1771 + }, + { + "epoch": 1.6755781216160934, + "lm_loss": 0.4732, + "step": 8704, + "vm_loss": 0.2216 + }, + { + "epoch": 1.6755781216160934, + "lm_loss": 0.529, + "step": 8704, + "vm_loss": 0.1735 + }, + { + "epoch": 1.6755781216160934, + "lm_loss": 0.4478, + "step": 8704, + "vm_loss": 0.1567 + }, + { + "epoch": 1.675770628293669, + "grad_norm": 3.1758902264920454, + "learning_rate": 1.3454677641031032e-06, + "loss": 0.864, + "step": 8705 + }, + { + "epoch": 1.6759631349712443, + "grad_norm": 3.307620336094291, + "learning_rate": 1.3439061505730777e-06, + "loss": 0.9329, + "step": 8706 + }, + { + "epoch": 1.6761556416488197, + "grad_norm": 3.28283674350169, + "learning_rate": 1.3423453785261164e-06, + "loss": 0.8889, + "step": 8707 + }, + { + "epoch": 1.6763481483263951, + "grad_norm": 3.2444700793163452, + "learning_rate": 1.3407854481139514e-06, + "loss": 0.8541, + "step": 8708 + }, + { + "epoch": 1.6765406550039703, + "grad_norm": 3.4211372911845004, + "learning_rate": 1.3392263594882226e-06, + "loss": 0.89, + "step": 8709 + }, + { + "epoch": 1.676733161681546, + "grad_norm": 3.4119764482832937, + "learning_rate": 1.3376681128004943e-06, + "loss": 0.9349, + "step": 8710 + }, + { + "epoch": 1.6769256683591212, + "grad_norm": 3.296501674875594, + "learning_rate": 1.3361107082022495e-06, + "loss": 0.9131, + "step": 8711 + }, + { + "epoch": 1.6771181750366966, + "grad_norm": 3.20740315599562, + "learning_rate": 1.3345541458448896e-06, + "loss": 0.882, + "step": 8712 + }, + { + "epoch": 1.6771181750366966, + "lm_loss": 0.7727, + "step": 8712, + "vm_loss": 0.1439 + }, + { + "epoch": 1.6771181750366966, + "lm_loss": 0.8545, + "step": 8712, + "vm_loss": 0.1458 + }, + { + "epoch": 1.6771181750366966, + "lm_loss": 0.364, + "step": 8712, + "vm_loss": 0.1409 + }, + { + "epoch": 1.6771181750366966, + "lm_loss": 0.7373, + "step": 8712, + "vm_loss": 0.1492 + }, + { + "epoch": 1.6771181750366966, + "lm_loss": 1.3608, + "step": 8712, + "vm_loss": 0.2275 + }, + { + "epoch": 1.6771181750366966, + "lm_loss": 0.8105, + "step": 8712, + "vm_loss": 0.1318 + }, + { + "epoch": 1.6771181750366966, + "lm_loss": 0.6354, + "step": 8712, + "vm_loss": 0.1835 + }, + { + "epoch": 1.6771181750366966, + "lm_loss": 1.1439, + "step": 8712, + "vm_loss": 0.1887 + }, + { + "epoch": 1.677310681714272, + "grad_norm": 3.2928954222811258, + "learning_rate": 1.3329984258797258e-06, + "loss": 0.8884, + "step": 8713 + }, + { + "epoch": 1.6775031883918472, + "grad_norm": 3.0144152690407395, + "learning_rate": 1.3314435484580035e-06, + "loss": 0.7848, + "step": 8714 + }, + { + "epoch": 1.6776956950694228, + "grad_norm": 3.3837874589624732, + "learning_rate": 1.3298895137308687e-06, + "loss": 0.9118, + "step": 8715 + }, + { + "epoch": 1.677888201746998, + "grad_norm": 3.406987466021496, + "learning_rate": 1.3283363218493962e-06, + "loss": 0.9177, + "step": 8716 + }, + { + "epoch": 1.6780807084245735, + "grad_norm": 3.1088730676335286, + "learning_rate": 1.326783972964576e-06, + "loss": 0.8484, + "step": 8717 + }, + { + "epoch": 1.6782732151021489, + "grad_norm": 3.396146071827399, + "learning_rate": 1.3252324672273165e-06, + "loss": 0.9127, + "step": 8718 + }, + { + "epoch": 1.678465721779724, + "grad_norm": 3.225824077691006, + "learning_rate": 1.3236818047884458e-06, + "loss": 0.8846, + "step": 8719 + }, + { + "epoch": 1.6786582284572997, + "grad_norm": 3.2206376541056025, + "learning_rate": 1.3221319857987024e-06, + "loss": 0.8779, + "step": 8720 + }, + { + "epoch": 1.6786582284572997, + "lm_loss": 0.9861, + "step": 8720, + "vm_loss": 0.2481 + }, + { + "epoch": 1.6786582284572997, + "lm_loss": 0.6711, + "step": 8720, + "vm_loss": 0.1123 + }, + { + "epoch": 1.6786582284572997, + "lm_loss": 0.8884, + "step": 8720, + "vm_loss": 0.1887 + }, + { + "epoch": 1.6786582284572997, + "lm_loss": 0.4481, + "step": 8720, + "vm_loss": 0.1382 + }, + { + "epoch": 1.6786582284572997, + "lm_loss": 0.6428, + "step": 8720, + "vm_loss": 0.1896 + }, + { + "epoch": 1.6786582284572997, + "lm_loss": 0.5598, + "step": 8720, + "vm_loss": 0.1768 + }, + { + "epoch": 1.6786582284572997, + "lm_loss": 0.3349, + "step": 8720, + "vm_loss": 0.1207 + }, + { + "epoch": 1.6786582284572997, + "lm_loss": 0.3278, + "step": 8720, + "vm_loss": 0.1925 + }, + { + "epoch": 1.678850735134875, + "grad_norm": 3.3206084775316884, + "learning_rate": 1.3205830104087558e-06, + "loss": 0.8839, + "step": 8721 + }, + { + "epoch": 1.6790432418124503, + "grad_norm": 3.409503190594079, + "learning_rate": 1.319034878769181e-06, + "loss": 0.8746, + "step": 8722 + }, + { + "epoch": 1.6792357484900258, + "grad_norm": 3.1442789119577252, + "learning_rate": 1.3174875910304784e-06, + "loss": 0.8429, + "step": 8723 + }, + { + "epoch": 1.679428255167601, + "grad_norm": 3.380796198679316, + "learning_rate": 1.3159411473430638e-06, + "loss": 0.9078, + "step": 8724 + }, + { + "epoch": 1.6796207618451766, + "grad_norm": 3.2554451948033876, + "learning_rate": 1.3143955478572735e-06, + "loss": 0.8668, + "step": 8725 + }, + { + "epoch": 1.6798132685227518, + "grad_norm": 3.237774080272247, + "learning_rate": 1.3128507927233536e-06, + "loss": 0.8673, + "step": 8726 + }, + { + "epoch": 1.6800057752003272, + "grad_norm": 3.244262203836014, + "learning_rate": 1.3113068820914798e-06, + "loss": 0.8683, + "step": 8727 + }, + { + "epoch": 1.6801982818779027, + "grad_norm": 3.1234860378205345, + "learning_rate": 1.3097638161117365e-06, + "loss": 0.8544, + "step": 8728 + }, + { + "epoch": 1.6801982818779027, + "lm_loss": 0.7836, + "step": 8728, + "vm_loss": 0.1786 + }, + { + "epoch": 1.6801982818779027, + "lm_loss": 1.1444, + "step": 8728, + "vm_loss": 0.1508 + }, + { + "epoch": 1.6801982818779027, + "lm_loss": 0.7403, + "step": 8728, + "vm_loss": 0.1574 + }, + { + "epoch": 1.6801982818779027, + "lm_loss": 0.6521, + "step": 8728, + "vm_loss": 0.2508 + }, + { + "epoch": 1.6801982818779027, + "lm_loss": 0.9282, + "step": 8728, + "vm_loss": 0.1441 + }, + { + "epoch": 1.6801982818779027, + "lm_loss": 0.4113, + "step": 8728, + "vm_loss": 0.2072 + }, + { + "epoch": 1.6801982818779027, + "lm_loss": 0.8431, + "step": 8728, + "vm_loss": 0.1554 + }, + { + "epoch": 1.6801982818779027, + "lm_loss": 0.8908, + "step": 8728, + "vm_loss": 0.1845 + }, + { + "epoch": 1.6803907885554779, + "grad_norm": 3.4425579834010627, + "learning_rate": 1.3082215949341338e-06, + "loss": 0.9077, + "step": 8729 + }, + { + "epoch": 1.6805832952330535, + "grad_norm": 3.2688507802303937, + "learning_rate": 1.306680218708587e-06, + "loss": 0.8422, + "step": 8730 + }, + { + "epoch": 1.6807758019106287, + "grad_norm": 3.3124274672826934, + "learning_rate": 1.3051396875849488e-06, + "loss": 0.8845, + "step": 8731 + }, + { + "epoch": 1.6809683085882041, + "grad_norm": 3.3166698275243136, + "learning_rate": 1.303600001712969e-06, + "loss": 0.8506, + "step": 8732 + }, + { + "epoch": 1.6811608152657795, + "grad_norm": 3.1674969990840522, + "learning_rate": 1.3020611612423296e-06, + "loss": 0.8312, + "step": 8733 + }, + { + "epoch": 1.681353321943355, + "grad_norm": 3.195604225373869, + "learning_rate": 1.3005231663226237e-06, + "loss": 0.8694, + "step": 8734 + }, + { + "epoch": 1.6815458286209304, + "grad_norm": 3.245661432867938, + "learning_rate": 1.2989860171033674e-06, + "loss": 0.851, + "step": 8735 + }, + { + "epoch": 1.6817383352985056, + "grad_norm": 3.224309719463014, + "learning_rate": 1.297449713733987e-06, + "loss": 0.8626, + "step": 8736 + }, + { + "epoch": 1.6817383352985056, + "lm_loss": 0.5702, + "step": 8736, + "vm_loss": 0.2285 + }, + { + "epoch": 1.6817383352985056, + "lm_loss": 0.394, + "step": 8736, + "vm_loss": 0.1362 + }, + { + "epoch": 1.6817383352985056, + "lm_loss": 0.9997, + "step": 8736, + "vm_loss": 0.1549 + }, + { + "epoch": 1.6817383352985056, + "lm_loss": 0.604, + "step": 8736, + "vm_loss": 0.2767 + }, + { + "epoch": 1.6817383352985056, + "lm_loss": 0.9586, + "step": 8736, + "vm_loss": 0.1643 + }, + { + "epoch": 1.6817383352985056, + "lm_loss": 0.4027, + "step": 8736, + "vm_loss": 0.1058 + }, + { + "epoch": 1.6817383352985056, + "lm_loss": 0.7967, + "step": 8736, + "vm_loss": 0.2396 + }, + { + "epoch": 1.6817383352985056, + "lm_loss": 0.9387, + "step": 8736, + "vm_loss": 0.1886 + }, + { + "epoch": 1.681930841976081, + "grad_norm": 3.3513537736415233, + "learning_rate": 1.2959142563638316e-06, + "loss": 0.9202, + "step": 8737 + }, + { + "epoch": 1.6821233486536564, + "grad_norm": 3.2121023934661728, + "learning_rate": 1.2943796451421686e-06, + "loss": 0.8521, + "step": 8738 + }, + { + "epoch": 1.6823158553312318, + "grad_norm": 3.1590992924814345, + "learning_rate": 1.292845880218182e-06, + "loss": 0.842, + "step": 8739 + }, + { + "epoch": 1.6825083620088073, + "grad_norm": 2.984186350413497, + "learning_rate": 1.2913129617409725e-06, + "loss": 0.8097, + "step": 8740 + }, + { + "epoch": 1.6827008686863825, + "grad_norm": 3.3496709157782623, + "learning_rate": 1.2897808898595599e-06, + "loss": 0.9128, + "step": 8741 + }, + { + "epoch": 1.6828933753639579, + "grad_norm": 3.2611090151595215, + "learning_rate": 1.2882496647228827e-06, + "loss": 0.8897, + "step": 8742 + }, + { + "epoch": 1.6830858820415333, + "grad_norm": 3.0652524913331263, + "learning_rate": 1.2867192864797929e-06, + "loss": 0.8291, + "step": 8743 + }, + { + "epoch": 1.6832783887191087, + "grad_norm": 3.3115779452351535, + "learning_rate": 1.2851897552790626e-06, + "loss": 0.8821, + "step": 8744 + }, + { + "epoch": 1.6832783887191087, + "lm_loss": 0.5846, + "step": 8744, + "vm_loss": 0.1361 + }, + { + "epoch": 1.6832783887191087, + "lm_loss": 0.8508, + "step": 8744, + "vm_loss": 0.1379 + }, + { + "epoch": 1.6832783887191087, + "lm_loss": 0.6689, + "step": 8744, + "vm_loss": 0.1158 + }, + { + "epoch": 1.6832783887191087, + "lm_loss": 1.1186, + "step": 8744, + "vm_loss": 0.2104 + }, + { + "epoch": 1.6832783887191087, + "lm_loss": 0.5572, + "step": 8744, + "vm_loss": 0.1508 + }, + { + "epoch": 1.6832783887191087, + "lm_loss": 1.2138, + "step": 8744, + "vm_loss": 0.2797 + }, + { + "epoch": 1.6832783887191087, + "lm_loss": 0.5603, + "step": 8744, + "vm_loss": 0.1773 + }, + { + "epoch": 1.6832783887191087, + "lm_loss": 0.7624, + "step": 8744, + "vm_loss": 0.1928 + }, + { + "epoch": 1.6834708953966842, + "grad_norm": 3.386865354669033, + "learning_rate": 1.2836610712693832e-06, + "loss": 0.9345, + "step": 8745 + }, + { + "epoch": 1.6836634020742594, + "grad_norm": 3.4846650447672722, + "learning_rate": 1.2821332345993654e-06, + "loss": 0.8977, + "step": 8746 + }, + { + "epoch": 1.683855908751835, + "grad_norm": 3.2421638668670525, + "learning_rate": 1.280606245417526e-06, + "loss": 0.8568, + "step": 8747 + }, + { + "epoch": 1.6840484154294102, + "grad_norm": 3.404072778626749, + "learning_rate": 1.2790801038723178e-06, + "loss": 0.9046, + "step": 8748 + }, + { + "epoch": 1.6842409221069856, + "grad_norm": 3.3313672278231157, + "learning_rate": 1.2775548101120949e-06, + "loss": 0.8769, + "step": 8749 + }, + { + "epoch": 1.684433428784561, + "grad_norm": 3.272495690307263, + "learning_rate": 1.276030364285138e-06, + "loss": 0.8636, + "step": 8750 + }, + { + "epoch": 1.6846259354621362, + "grad_norm": 3.231440650854355, + "learning_rate": 1.2745067665396394e-06, + "loss": 0.8935, + "step": 8751 + }, + { + "epoch": 1.6848184421397119, + "grad_norm": 3.2060360657411575, + "learning_rate": 1.2729840170237174e-06, + "loss": 0.844, + "step": 8752 + }, + { + "epoch": 1.6848184421397119, + "lm_loss": 0.5977, + "step": 8752, + "vm_loss": 0.159 + }, + { + "epoch": 1.6848184421397119, + "lm_loss": 0.42, + "step": 8752, + "vm_loss": 0.1235 + }, + { + "epoch": 1.6848184421397119, + "lm_loss": 0.485, + "step": 8752, + "vm_loss": 0.1187 + }, + { + "epoch": 1.6848184421397119, + "lm_loss": 0.4342, + "step": 8752, + "vm_loss": 0.1303 + }, + { + "epoch": 1.6848184421397119, + "lm_loss": 0.6021, + "step": 8752, + "vm_loss": 0.1271 + }, + { + "epoch": 1.6848184421397119, + "lm_loss": 0.9447, + "step": 8752, + "vm_loss": 0.186 + }, + { + "epoch": 1.6848184421397119, + "lm_loss": 0.6735, + "step": 8752, + "vm_loss": 0.2025 + }, + { + "epoch": 1.6848184421397119, + "lm_loss": 0.7368, + "step": 8752, + "vm_loss": 0.1675 + }, + { + "epoch": 1.685010948817287, + "grad_norm": 3.2971295275483072, + "learning_rate": 1.271462115885399e-06, + "loss": 0.8342, + "step": 8753 + }, + { + "epoch": 1.6852034554948625, + "grad_norm": 3.1585814923394135, + "learning_rate": 1.2699410632726327e-06, + "loss": 0.8511, + "step": 8754 + }, + { + "epoch": 1.685395962172438, + "grad_norm": 3.4391125454296194, + "learning_rate": 1.2684208593332858e-06, + "loss": 0.9064, + "step": 8755 + }, + { + "epoch": 1.6855884688500131, + "grad_norm": 3.315304705484362, + "learning_rate": 1.2669015042151422e-06, + "loss": 0.8752, + "step": 8756 + }, + { + "epoch": 1.6857809755275888, + "grad_norm": 3.2710178273322144, + "learning_rate": 1.2653829980658983e-06, + "loss": 0.8654, + "step": 8757 + }, + { + "epoch": 1.685973482205164, + "grad_norm": 3.236258565521527, + "learning_rate": 1.2638653410331781e-06, + "loss": 0.873, + "step": 8758 + }, + { + "epoch": 1.6861659888827394, + "grad_norm": 3.218784670151545, + "learning_rate": 1.2623485332645135e-06, + "loss": 0.8676, + "step": 8759 + }, + { + "epoch": 1.6863584955603148, + "grad_norm": 3.189487807466586, + "learning_rate": 1.2608325749073591e-06, + "loss": 0.8534, + "step": 8760 + }, + { + "epoch": 1.6863584955603148, + "lm_loss": 0.5938, + "step": 8760, + "vm_loss": 0.2072 + }, + { + "epoch": 1.6863584955603148, + "lm_loss": 0.4621, + "step": 8760, + "vm_loss": 0.1905 + }, + { + "epoch": 1.6863584955603148, + "lm_loss": 0.4728, + "step": 8760, + "vm_loss": 0.148 + }, + { + "epoch": 1.6863584955603148, + "lm_loss": 0.8739, + "step": 8760, + "vm_loss": 0.1682 + }, + { + "epoch": 1.6863584955603148, + "lm_loss": 0.9507, + "step": 8760, + "vm_loss": 0.1775 + }, + { + "epoch": 1.6863584955603148, + "lm_loss": 0.8914, + "step": 8760, + "vm_loss": 0.168 + }, + { + "epoch": 1.6863584955603148, + "lm_loss": 0.4753, + "step": 8760, + "vm_loss": 0.1462 + }, + { + "epoch": 1.6863584955603148, + "lm_loss": 1.0074, + "step": 8760, + "vm_loss": 0.1402 + }, + { + "epoch": 1.68655100223789, + "grad_norm": 3.318343862451477, + "learning_rate": 1.2593174661090834e-06, + "loss": 0.8901, + "step": 8761 + }, + { + "epoch": 1.6867435089154657, + "grad_norm": 3.3305765415755295, + "learning_rate": 1.2578032070169777e-06, + "loss": 0.9042, + "step": 8762 + }, + { + "epoch": 1.6869360155930408, + "grad_norm": 3.2263822495582075, + "learning_rate": 1.2562897977782461e-06, + "loss": 0.8663, + "step": 8763 + }, + { + "epoch": 1.6871285222706163, + "grad_norm": 3.197332160763477, + "learning_rate": 1.2547772385400081e-06, + "loss": 0.8793, + "step": 8764 + }, + { + "epoch": 1.6873210289481917, + "grad_norm": 3.169701305239922, + "learning_rate": 1.25326552944931e-06, + "loss": 0.8478, + "step": 8765 + }, + { + "epoch": 1.687513535625767, + "grad_norm": 3.196920467056742, + "learning_rate": 1.2517546706531047e-06, + "loss": 0.8465, + "step": 8766 + }, + { + "epoch": 1.6877060423033425, + "grad_norm": 3.2516569158577493, + "learning_rate": 1.2502446622982688e-06, + "loss": 0.8495, + "step": 8767 + }, + { + "epoch": 1.6878985489809177, + "grad_norm": 3.363300008182436, + "learning_rate": 1.2487355045315907e-06, + "loss": 0.8912, + "step": 8768 + }, + { + "epoch": 1.6878985489809177, + "lm_loss": 0.6801, + "step": 8768, + "vm_loss": 0.1306 + }, + { + "epoch": 1.6878985489809177, + "lm_loss": 0.4342, + "step": 8768, + "vm_loss": 0.2202 + }, + { + "epoch": 1.6878985489809177, + "lm_loss": 0.7569, + "step": 8768, + "vm_loss": 0.2014 + }, + { + "epoch": 1.6878985489809177, + "lm_loss": 0.4815, + "step": 8768, + "vm_loss": 0.1461 + }, + { + "epoch": 1.6878985489809177, + "lm_loss": 0.6115, + "step": 8768, + "vm_loss": 0.1669 + }, + { + "epoch": 1.6878985489809177, + "lm_loss": 0.6214, + "step": 8768, + "vm_loss": 0.1606 + }, + { + "epoch": 1.6878985489809177, + "lm_loss": 1.0029, + "step": 8768, + "vm_loss": 0.1516 + }, + { + "epoch": 1.6878985489809177, + "lm_loss": 0.6923, + "step": 8768, + "vm_loss": 0.1828 + }, + { + "epoch": 1.6880910556584932, + "grad_norm": 3.2129350435412074, + "learning_rate": 1.2472271974997863e-06, + "loss": 0.8576, + "step": 8769 + }, + { + "epoch": 1.6882835623360686, + "grad_norm": 3.222701045160172, + "learning_rate": 1.245719741349477e-06, + "loss": 0.8736, + "step": 8770 + }, + { + "epoch": 1.6884760690136438, + "grad_norm": 3.271693057210702, + "learning_rate": 1.2442131362272093e-06, + "loss": 0.8813, + "step": 8771 + }, + { + "epoch": 1.6886685756912194, + "grad_norm": 3.326589116107832, + "learning_rate": 1.2427073822794445e-06, + "loss": 0.8557, + "step": 8772 + }, + { + "epoch": 1.6888610823687946, + "grad_norm": 3.232868996477941, + "learning_rate": 1.2412024796525612e-06, + "loss": 0.8987, + "step": 8773 + }, + { + "epoch": 1.68905358904637, + "grad_norm": 3.1924700581425807, + "learning_rate": 1.2396984284928514e-06, + "loss": 0.833, + "step": 8774 + }, + { + "epoch": 1.6892460957239455, + "grad_norm": 3.25435308561204, + "learning_rate": 1.238195228946536e-06, + "loss": 0.8875, + "step": 8775 + }, + { + "epoch": 1.6894386024015207, + "grad_norm": 3.340291984095352, + "learning_rate": 1.2366928811597378e-06, + "loss": 0.8975, + "step": 8776 + }, + { + "epoch": 1.6894386024015207, + "lm_loss": 0.772, + "step": 8776, + "vm_loss": 0.2086 + }, + { + "epoch": 1.6894386024015207, + "lm_loss": 0.7891, + "step": 8776, + "vm_loss": 0.2087 + }, + { + "epoch": 1.6894386024015207, + "lm_loss": 0.4316, + "step": 8776, + "vm_loss": 0.1874 + }, + { + "epoch": 1.6894386024015207, + "lm_loss": 0.8097, + "step": 8776, + "vm_loss": 0.1224 + }, + { + "epoch": 1.6894386024015207, + "lm_loss": 0.8689, + "step": 8776, + "vm_loss": 0.1543 + }, + { + "epoch": 1.6894386024015207, + "lm_loss": 0.6848, + "step": 8776, + "vm_loss": 0.1563 + }, + { + "epoch": 1.6894386024015207, + "lm_loss": 0.8599, + "step": 8776, + "vm_loss": 0.2112 + }, + { + "epoch": 1.6894386024015207, + "lm_loss": 0.6415, + "step": 8776, + "vm_loss": 0.2249 + }, + { + "epoch": 1.6896311090790963, + "grad_norm": 3.280167002163764, + "learning_rate": 1.235191385278507e-06, + "loss": 0.9048, + "step": 8777 + }, + { + "epoch": 1.6898236157566715, + "grad_norm": 3.2665331231164743, + "learning_rate": 1.2336907414488075e-06, + "loss": 0.8697, + "step": 8778 + }, + { + "epoch": 1.690016122434247, + "grad_norm": 3.232331044596003, + "learning_rate": 1.2321909498165253e-06, + "loss": 0.8751, + "step": 8779 + }, + { + "epoch": 1.6902086291118223, + "grad_norm": 3.3207693032688947, + "learning_rate": 1.2306920105274522e-06, + "loss": 0.8903, + "step": 8780 + }, + { + "epoch": 1.6904011357893975, + "grad_norm": 3.1370262610715454, + "learning_rate": 1.2291939237273088e-06, + "loss": 0.8484, + "step": 8781 + }, + { + "epoch": 1.6905936424669732, + "grad_norm": 3.10460582839599, + "learning_rate": 1.2276966895617271e-06, + "loss": 0.8238, + "step": 8782 + }, + { + "epoch": 1.6907861491445484, + "grad_norm": 3.2125352772695206, + "learning_rate": 1.2262003081762575e-06, + "loss": 0.8644, + "step": 8783 + }, + { + "epoch": 1.6909786558221238, + "grad_norm": 3.4131757870152057, + "learning_rate": 1.2247047797163703e-06, + "loss": 0.9263, + "step": 8784 + }, + { + "epoch": 1.6909786558221238, + "lm_loss": 0.5714, + "step": 8784, + "vm_loss": 0.1823 + }, + { + "epoch": 1.6909786558221238, + "lm_loss": 0.4568, + "step": 8784, + "vm_loss": 0.1144 + }, + { + "epoch": 1.6909786558221238, + "lm_loss": 0.4888, + "step": 8784, + "vm_loss": 0.171 + }, + { + "epoch": 1.6909786558221238, + "lm_loss": 0.6263, + "step": 8784, + "vm_loss": 0.1468 + }, + { + "epoch": 1.6909786558221238, + "lm_loss": 0.8641, + "step": 8784, + "vm_loss": 0.1217 + }, + { + "epoch": 1.6909786558221238, + "lm_loss": 0.7196, + "step": 8784, + "vm_loss": 0.1307 + }, + { + "epoch": 1.6909786558221238, + "lm_loss": 0.4606, + "step": 8784, + "vm_loss": 0.2067 + }, + { + "epoch": 1.6909786558221238, + "lm_loss": 0.6796, + "step": 8784, + "vm_loss": 0.1813 + }, + { + "epoch": 1.6911711624996992, + "grad_norm": 3.2619366070349267, + "learning_rate": 1.2232101043274437e-06, + "loss": 0.8519, + "step": 8785 + }, + { + "epoch": 1.6913636691772744, + "grad_norm": 3.19861661496492, + "learning_rate": 1.2217162821547856e-06, + "loss": 0.8219, + "step": 8786 + }, + { + "epoch": 1.69155617585485, + "grad_norm": 3.215976733611148, + "learning_rate": 1.2202233133436103e-06, + "loss": 0.8398, + "step": 8787 + }, + { + "epoch": 1.6917486825324253, + "grad_norm": 3.2868616622423974, + "learning_rate": 1.218731198039056e-06, + "loss": 0.8733, + "step": 8788 + }, + { + "epoch": 1.6919411892100007, + "grad_norm": 3.296242635892787, + "learning_rate": 1.2172399363861731e-06, + "loss": 0.8692, + "step": 8789 + }, + { + "epoch": 1.6921336958875761, + "grad_norm": 3.445415280218687, + "learning_rate": 1.215749528529936e-06, + "loss": 0.906, + "step": 8790 + }, + { + "epoch": 1.6923262025651513, + "grad_norm": 3.1022956591400384, + "learning_rate": 1.214259974615225e-06, + "loss": 0.8629, + "step": 8791 + }, + { + "epoch": 1.692518709242727, + "grad_norm": 3.2987764466556206, + "learning_rate": 1.212771274786847e-06, + "loss": 0.8883, + "step": 8792 + }, + { + "epoch": 1.692518709242727, + "lm_loss": 0.9196, + "step": 8792, + "vm_loss": 0.1765 + }, + { + "epoch": 1.692518709242727, + "lm_loss": 0.7991, + "step": 8792, + "vm_loss": 0.1738 + }, + { + "epoch": 1.692518709242727, + "lm_loss": 0.706, + "step": 8792, + "vm_loss": 0.2039 + }, + { + "epoch": 1.692518709242727, + "lm_loss": 0.8006, + "step": 8792, + "vm_loss": 0.1312 + }, + { + "epoch": 1.692518709242727, + "lm_loss": 0.3766, + "step": 8792, + "vm_loss": 0.1893 + }, + { + "epoch": 1.692518709242727, + "lm_loss": 0.4099, + "step": 8792, + "vm_loss": 0.1826 + }, + { + "epoch": 1.692518709242727, + "lm_loss": 0.6701, + "step": 8792, + "vm_loss": 0.1826 + }, + { + "epoch": 1.692518709242727, + "lm_loss": 0.7107, + "step": 8792, + "vm_loss": 0.1203 + }, + { + "epoch": 1.6927112159203022, + "grad_norm": 3.293036383871833, + "learning_rate": 1.2112834291895237e-06, + "loss": 0.9201, + "step": 8793 + }, + { + "epoch": 1.6929037225978776, + "grad_norm": 3.1647996403784244, + "learning_rate": 1.2097964379678928e-06, + "loss": 0.8495, + "step": 8794 + }, + { + "epoch": 1.693096229275453, + "grad_norm": 3.2883462094761686, + "learning_rate": 1.2083103012665032e-06, + "loss": 0.8603, + "step": 8795 + }, + { + "epoch": 1.6932887359530284, + "grad_norm": 3.4132015726123632, + "learning_rate": 1.2068250192298359e-06, + "loss": 0.921, + "step": 8796 + }, + { + "epoch": 1.6934812426306038, + "grad_norm": 3.1573830503695017, + "learning_rate": 1.2053405920022709e-06, + "loss": 0.8526, + "step": 8797 + }, + { + "epoch": 1.693673749308179, + "grad_norm": 3.2215020935971395, + "learning_rate": 1.2038570197281185e-06, + "loss": 0.8734, + "step": 8798 + }, + { + "epoch": 1.6938662559857545, + "grad_norm": 3.234821680935905, + "learning_rate": 1.2023743025515977e-06, + "loss": 0.8849, + "step": 8799 + }, + { + "epoch": 1.6940587626633299, + "grad_norm": 3.3236629237547493, + "learning_rate": 1.200892440616851e-06, + "loss": 0.9253, + "step": 8800 + }, + { + "epoch": 1.6940587626633299, + "lm_loss": 0.6827, + "step": 8800, + "vm_loss": 0.1754 + }, + { + "epoch": 1.6940587626633299, + "lm_loss": 0.556, + "step": 8800, + "vm_loss": 0.1815 + }, + { + "epoch": 1.6940587626633299, + "lm_loss": 1.0148, + "step": 8800, + "vm_loss": 0.129 + }, + { + "epoch": 1.6940587626633299, + "lm_loss": 0.6856, + "step": 8800, + "vm_loss": 0.166 + }, + { + "epoch": 1.6940587626633299, + "lm_loss": 0.5674, + "step": 8800, + "vm_loss": 0.1537 + }, + { + "epoch": 1.6940587626633299, + "lm_loss": 0.9358, + "step": 8800, + "vm_loss": 0.2293 + }, + { + "epoch": 1.6940587626633299, + "lm_loss": 0.5597, + "step": 8800, + "vm_loss": 0.2256 + }, + { + "epoch": 1.6940587626633299, + "lm_loss": 0.4241, + "step": 8800, + "vm_loss": 0.1494 + }, + { + "epoch": 1.6942512693409053, + "grad_norm": 3.2657482098316617, + "learning_rate": 1.199411434067933e-06, + "loss": 0.8872, + "step": 8801 + }, + { + "epoch": 1.6944437760184807, + "grad_norm": 3.340099719276038, + "learning_rate": 1.1979312830488144e-06, + "loss": 0.9138, + "step": 8802 + }, + { + "epoch": 1.694636282696056, + "grad_norm": 3.1011893546733758, + "learning_rate": 1.1964519877033875e-06, + "loss": 0.7623, + "step": 8803 + }, + { + "epoch": 1.6948287893736314, + "grad_norm": 3.2498683166345437, + "learning_rate": 1.1949735481754565e-06, + "loss": 0.8725, + "step": 8804 + }, + { + "epoch": 1.6950212960512068, + "grad_norm": 3.2691608063931086, + "learning_rate": 1.1934959646087463e-06, + "loss": 0.8713, + "step": 8805 + }, + { + "epoch": 1.6952138027287822, + "grad_norm": 3.314856550691807, + "learning_rate": 1.1920192371468952e-06, + "loss": 0.9092, + "step": 8806 + }, + { + "epoch": 1.6954063094063576, + "grad_norm": 3.207018826587536, + "learning_rate": 1.1905433659334654e-06, + "loss": 0.8528, + "step": 8807 + }, + { + "epoch": 1.6955988160839328, + "grad_norm": 3.3303544085614267, + "learning_rate": 1.1890683511119227e-06, + "loss": 0.8891, + "step": 8808 + }, + { + "epoch": 1.6955988160839328, + "lm_loss": 0.7064, + "step": 8808, + "vm_loss": 0.2118 + }, + { + "epoch": 1.6955988160839328, + "lm_loss": 0.5326, + "step": 8808, + "vm_loss": 0.1773 + }, + { + "epoch": 1.6955988160839328, + "lm_loss": 0.7962, + "step": 8808, + "vm_loss": 0.1461 + }, + { + "epoch": 1.6955988160839328, + "lm_loss": 0.6603, + "step": 8808, + "vm_loss": 0.1774 + }, + { + "epoch": 1.6955988160839328, + "lm_loss": 0.5298, + "step": 8808, + "vm_loss": 0.1525 + }, + { + "epoch": 1.6955988160839328, + "lm_loss": 0.4801, + "step": 8808, + "vm_loss": 0.1375 + }, + { + "epoch": 1.6955988160839328, + "lm_loss": 0.2973, + "step": 8808, + "vm_loss": 0.1648 + }, + { + "epoch": 1.6955988160839328, + "lm_loss": 0.5085, + "step": 8808, + "vm_loss": 0.1853 + }, + { + "epoch": 1.6957913227615082, + "grad_norm": 3.242280329101607, + "learning_rate": 1.1875941928256619e-06, + "loss": 0.8831, + "step": 8809 + }, + { + "epoch": 1.6959838294390837, + "grad_norm": 3.3903089925129075, + "learning_rate": 1.1861208912179878e-06, + "loss": 0.921, + "step": 8810 + }, + { + "epoch": 1.696176336116659, + "grad_norm": 3.110336189759906, + "learning_rate": 1.1846484464321283e-06, + "loss": 0.8228, + "step": 8811 + }, + { + "epoch": 1.6963688427942345, + "grad_norm": 2.9195620286093154, + "learning_rate": 1.183176858611218e-06, + "loss": 0.8101, + "step": 8812 + }, + { + "epoch": 1.6965613494718097, + "grad_norm": 3.1954545593021906, + "learning_rate": 1.1817061278983199e-06, + "loss": 0.8299, + "step": 8813 + }, + { + "epoch": 1.6967538561493853, + "grad_norm": 3.3515066839941245, + "learning_rate": 1.1802362544364032e-06, + "loss": 0.8934, + "step": 8814 + }, + { + "epoch": 1.6969463628269605, + "grad_norm": 3.2847235418717515, + "learning_rate": 1.1787672383683613e-06, + "loss": 0.8472, + "step": 8815 + }, + { + "epoch": 1.697138869504536, + "grad_norm": 3.2277639208366975, + "learning_rate": 1.1772990798369987e-06, + "loss": 0.8431, + "step": 8816 + }, + { + "epoch": 1.697138869504536, + "lm_loss": 0.8186, + "step": 8816, + "vm_loss": 0.1371 + }, + { + "epoch": 1.697138869504536, + "lm_loss": 0.9216, + "step": 8816, + "vm_loss": 0.1953 + }, + { + "epoch": 1.697138869504536, + "lm_loss": 0.6937, + "step": 8816, + "vm_loss": 0.1394 + }, + { + "epoch": 1.697138869504536, + "lm_loss": 1.1941, + "step": 8816, + "vm_loss": 0.1351 + }, + { + "epoch": 1.697138869504536, + "lm_loss": 0.7434, + "step": 8816, + "vm_loss": 0.1725 + }, + { + "epoch": 1.697138869504536, + "lm_loss": 0.6492, + "step": 8816, + "vm_loss": 0.1688 + }, + { + "epoch": 1.697138869504536, + "lm_loss": 0.7571, + "step": 8816, + "vm_loss": 0.1851 + }, + { + "epoch": 1.697138869504536, + "lm_loss": 0.8377, + "step": 8816, + "vm_loss": 0.1592 + }, + { + "epoch": 1.6973313761821114, + "grad_norm": 3.2281535132777717, + "learning_rate": 1.1758317789850448e-06, + "loss": 0.8439, + "step": 8817 + }, + { + "epoch": 1.6975238828596866, + "grad_norm": 3.296686337121683, + "learning_rate": 1.1743653359551322e-06, + "loss": 0.8717, + "step": 8818 + }, + { + "epoch": 1.6977163895372622, + "grad_norm": 3.2527730765087655, + "learning_rate": 1.1728997508898231e-06, + "loss": 0.8311, + "step": 8819 + }, + { + "epoch": 1.6979088962148374, + "grad_norm": 3.36966773724426, + "learning_rate": 1.1714350239315897e-06, + "loss": 0.9131, + "step": 8820 + }, + { + "epoch": 1.6981014028924128, + "grad_norm": 3.115386333921638, + "learning_rate": 1.169971155222821e-06, + "loss": 0.807, + "step": 8821 + }, + { + "epoch": 1.6982939095699883, + "grad_norm": 3.2233436393329766, + "learning_rate": 1.1685081449058266e-06, + "loss": 0.8402, + "step": 8822 + }, + { + "epoch": 1.6984864162475635, + "grad_norm": 3.322835979842623, + "learning_rate": 1.1670459931228273e-06, + "loss": 0.9001, + "step": 8823 + }, + { + "epoch": 1.6986789229251391, + "grad_norm": 3.2516267740552194, + "learning_rate": 1.165584700015967e-06, + "loss": 0.8987, + "step": 8824 + }, + { + "epoch": 1.6986789229251391, + "lm_loss": 0.7088, + "step": 8824, + "vm_loss": 0.1498 + }, + { + "epoch": 1.6986789229251391, + "lm_loss": 0.6307, + "step": 8824, + "vm_loss": 0.1617 + }, + { + "epoch": 1.6986789229251391, + "lm_loss": 0.6178, + "step": 8824, + "vm_loss": 0.085 + }, + { + "epoch": 1.6986789229251391, + "lm_loss": 0.8213, + "step": 8824, + "vm_loss": 0.1479 + }, + { + "epoch": 1.6986789229251391, + "lm_loss": 0.548, + "step": 8824, + "vm_loss": 0.1723 + }, + { + "epoch": 1.6986789229251391, + "lm_loss": 0.6688, + "step": 8824, + "vm_loss": 0.1058 + }, + { + "epoch": 1.6986789229251391, + "lm_loss": 0.8781, + "step": 8824, + "vm_loss": 0.1532 + }, + { + "epoch": 1.6986789229251391, + "lm_loss": 0.8078, + "step": 8824, + "vm_loss": 0.1854 + }, + { + "epoch": 1.6988714296027143, + "grad_norm": 3.4522088082983546, + "learning_rate": 1.1641242657272956e-06, + "loss": 0.8911, + "step": 8825 + }, + { + "epoch": 1.6990639362802897, + "grad_norm": 3.0845567499891, + "learning_rate": 1.1626646903987904e-06, + "loss": 0.8277, + "step": 8826 + }, + { + "epoch": 1.6992564429578652, + "grad_norm": 3.313818978770263, + "learning_rate": 1.1612059741723404e-06, + "loss": 0.8722, + "step": 8827 + }, + { + "epoch": 1.6994489496354404, + "grad_norm": 3.273247086552356, + "learning_rate": 1.1597481171897517e-06, + "loss": 0.8533, + "step": 8828 + }, + { + "epoch": 1.699641456313016, + "grad_norm": 3.2043476651241707, + "learning_rate": 1.1582911195927427e-06, + "loss": 0.8389, + "step": 8829 + }, + { + "epoch": 1.6998339629905912, + "grad_norm": 3.047389210958841, + "learning_rate": 1.1568349815229584e-06, + "loss": 0.8037, + "step": 8830 + }, + { + "epoch": 1.7000264696681666, + "grad_norm": 3.1142536885519854, + "learning_rate": 1.1553797031219494e-06, + "loss": 0.8292, + "step": 8831 + }, + { + "epoch": 1.700218976345742, + "grad_norm": 3.3186625098060487, + "learning_rate": 1.1539252845311889e-06, + "loss": 0.8555, + "step": 8832 + }, + { + "epoch": 1.700218976345742, + "lm_loss": 0.5576, + "step": 8832, + "vm_loss": 0.1674 + }, + { + "epoch": 1.700218976345742, + "lm_loss": 0.6503, + "step": 8832, + "vm_loss": 0.1287 + }, + { + "epoch": 1.700218976345742, + "lm_loss": 0.8977, + "step": 8832, + "vm_loss": 0.17 + }, + { + "epoch": 1.700218976345742, + "lm_loss": 0.6176, + "step": 8832, + "vm_loss": 0.2102 + }, + { + "epoch": 1.700218976345742, + "lm_loss": 1.0169, + "step": 8832, + "vm_loss": 0.2235 + }, + { + "epoch": 1.700218976345742, + "lm_loss": 0.7094, + "step": 8832, + "vm_loss": 0.211 + }, + { + "epoch": 1.700218976345742, + "lm_loss": 0.7935, + "step": 8832, + "vm_loss": 0.1759 + }, + { + "epoch": 1.700218976345742, + "lm_loss": 0.6207, + "step": 8832, + "vm_loss": 0.1231 + }, + { + "epoch": 1.7004114830233172, + "grad_norm": 3.4903967972516603, + "learning_rate": 1.1524717258920659e-06, + "loss": 0.9224, + "step": 8833 + }, + { + "epoch": 1.7006039897008929, + "grad_norm": 3.23226587650842, + "learning_rate": 1.1510190273458854e-06, + "loss": 0.8495, + "step": 8834 + }, + { + "epoch": 1.700796496378468, + "grad_norm": 3.2672053812749438, + "learning_rate": 1.1495671890338655e-06, + "loss": 0.8811, + "step": 8835 + }, + { + "epoch": 1.7009890030560435, + "grad_norm": 3.485093973705232, + "learning_rate": 1.1481162110971444e-06, + "loss": 0.9157, + "step": 8836 + }, + { + "epoch": 1.701181509733619, + "grad_norm": 2.991145489076773, + "learning_rate": 1.146666093676776e-06, + "loss": 0.8088, + "step": 8837 + }, + { + "epoch": 1.7013740164111941, + "grad_norm": 3.15955071681223, + "learning_rate": 1.145216836913733e-06, + "loss": 0.8188, + "step": 8838 + }, + { + "epoch": 1.7015665230887698, + "grad_norm": 3.3892870227751133, + "learning_rate": 1.1437684409488946e-06, + "loss": 0.8398, + "step": 8839 + }, + { + "epoch": 1.701759029766345, + "grad_norm": 3.252825081253866, + "learning_rate": 1.1423209059230721e-06, + "loss": 0.8577, + "step": 8840 + }, + { + "epoch": 1.701759029766345, + "lm_loss": 0.8344, + "step": 8840, + "vm_loss": 0.1484 + }, + { + "epoch": 1.701759029766345, + "lm_loss": 0.359, + "step": 8840, + "vm_loss": 0.1531 + }, + { + "epoch": 1.701759029766345, + "lm_loss": 0.5752, + "step": 8840, + "vm_loss": 0.1222 + }, + { + "epoch": 1.701759029766345, + "lm_loss": 0.4119, + "step": 8840, + "vm_loss": 0.1315 + }, + { + "epoch": 1.701759029766345, + "lm_loss": 0.6752, + "step": 8840, + "vm_loss": 0.1848 + }, + { + "epoch": 1.701759029766345, + "lm_loss": 1.4176, + "step": 8840, + "vm_loss": 0.1649 + }, + { + "epoch": 1.701759029766345, + "lm_loss": 0.6824, + "step": 8840, + "vm_loss": 0.1445 + }, + { + "epoch": 1.701759029766345, + "lm_loss": 0.4542, + "step": 8840, + "vm_loss": 0.2145 + }, + { + "epoch": 1.7019515364439204, + "grad_norm": 3.1773351707510327, + "learning_rate": 1.1408742319769794e-06, + "loss": 0.8656, + "step": 8841 + }, + { + "epoch": 1.7021440431214958, + "grad_norm": 3.205217253908608, + "learning_rate": 1.1394284192512517e-06, + "loss": 0.8717, + "step": 8842 + }, + { + "epoch": 1.702336549799071, + "grad_norm": 3.2512623750535252, + "learning_rate": 1.137983467886442e-06, + "loss": 0.8895, + "step": 8843 + }, + { + "epoch": 1.7025290564766467, + "grad_norm": 3.0765128102293167, + "learning_rate": 1.1365393780230172e-06, + "loss": 0.8464, + "step": 8844 + }, + { + "epoch": 1.7027215631542219, + "grad_norm": 3.3757370732725995, + "learning_rate": 1.1350961498013635e-06, + "loss": 0.9124, + "step": 8845 + }, + { + "epoch": 1.7029140698317973, + "grad_norm": 3.44828825485161, + "learning_rate": 1.1336537833617777e-06, + "loss": 0.9366, + "step": 8846 + }, + { + "epoch": 1.7031065765093727, + "grad_norm": 3.352878235358373, + "learning_rate": 1.1322122788444779e-06, + "loss": 0.8902, + "step": 8847 + }, + { + "epoch": 1.703299083186948, + "grad_norm": 3.3688382639814862, + "learning_rate": 1.130771636389596e-06, + "loss": 0.9016, + "step": 8848 + }, + { + "epoch": 1.703299083186948, + "lm_loss": 0.5318, + "step": 8848, + "vm_loss": 0.1644 + }, + { + "epoch": 1.703299083186948, + "lm_loss": 0.6651, + "step": 8848, + "vm_loss": 0.1962 + }, + { + "epoch": 1.703299083186948, + "lm_loss": 0.5197, + "step": 8848, + "vm_loss": 0.1692 + }, + { + "epoch": 1.703299083186948, + "lm_loss": 1.1028, + "step": 8848, + "vm_loss": 0.1831 + }, + { + "epoch": 1.703299083186948, + "lm_loss": 0.921, + "step": 8848, + "vm_loss": 0.1588 + }, + { + "epoch": 1.703299083186948, + "lm_loss": 0.718, + "step": 8848, + "vm_loss": 0.1551 + }, + { + "epoch": 1.703299083186948, + "lm_loss": 0.7463, + "step": 8848, + "vm_loss": 0.1699 + }, + { + "epoch": 1.703299083186948, + "lm_loss": 0.4, + "step": 8848, + "vm_loss": 0.1741 + }, + { + "epoch": 1.7034915898645235, + "grad_norm": 3.092255925314974, + "learning_rate": 1.1293318561371824e-06, + "loss": 0.8447, + "step": 8849 + }, + { + "epoch": 1.7036840965420987, + "grad_norm": 3.235421314293682, + "learning_rate": 1.1278929382272007e-06, + "loss": 0.8711, + "step": 8850 + }, + { + "epoch": 1.7038766032196742, + "grad_norm": 3.3291793349026904, + "learning_rate": 1.1264548827995347e-06, + "loss": 0.889, + "step": 8851 + }, + { + "epoch": 1.7040691098972496, + "grad_norm": 3.1758817065037004, + "learning_rate": 1.1250176899939791e-06, + "loss": 0.8395, + "step": 8852 + }, + { + "epoch": 1.7042616165748248, + "grad_norm": 3.1218227764345423, + "learning_rate": 1.1235813599502477e-06, + "loss": 0.8277, + "step": 8853 + }, + { + "epoch": 1.7044541232524004, + "grad_norm": 3.272083794868553, + "learning_rate": 1.1221458928079697e-06, + "loss": 0.9104, + "step": 8854 + }, + { + "epoch": 1.7046466299299756, + "grad_norm": 3.155395903635095, + "learning_rate": 1.120711288706695e-06, + "loss": 0.8427, + "step": 8855 + }, + { + "epoch": 1.704839136607551, + "grad_norm": 3.235599106719716, + "learning_rate": 1.1192775477858776e-06, + "loss": 0.9063, + "step": 8856 + }, + { + "epoch": 1.704839136607551, + "lm_loss": 0.8269, + "step": 8856, + "vm_loss": 0.2233 + }, + { + "epoch": 1.704839136607551, + "lm_loss": 0.9562, + "step": 8856, + "vm_loss": 0.1814 + }, + { + "epoch": 1.704839136607551, + "lm_loss": 0.5613, + "step": 8856, + "vm_loss": 0.2062 + }, + { + "epoch": 1.704839136607551, + "lm_loss": 0.8156, + "step": 8856, + "vm_loss": 0.1799 + }, + { + "epoch": 1.704839136607551, + "lm_loss": 0.5601, + "step": 8856, + "vm_loss": 0.2429 + }, + { + "epoch": 1.704839136607551, + "lm_loss": 0.7895, + "step": 8856, + "vm_loss": 0.1688 + }, + { + "epoch": 1.704839136607551, + "lm_loss": 0.4669, + "step": 8856, + "vm_loss": 0.1186 + }, + { + "epoch": 1.704839136607551, + "lm_loss": 0.3733, + "step": 8856, + "vm_loss": 0.1718 + }, + { + "epoch": 1.7050316432851265, + "grad_norm": 3.2564330656743663, + "learning_rate": 1.1178446701849045e-06, + "loss": 0.9041, + "step": 8857 + }, + { + "epoch": 1.7052241499627019, + "grad_norm": 3.2676053615548217, + "learning_rate": 1.1164126560430633e-06, + "loss": 0.8497, + "step": 8858 + }, + { + "epoch": 1.7054166566402773, + "grad_norm": 3.209750321017128, + "learning_rate": 1.1149815054995682e-06, + "loss": 0.8197, + "step": 8859 + }, + { + "epoch": 1.7056091633178525, + "grad_norm": 3.1911299968321556, + "learning_rate": 1.1135512186935392e-06, + "loss": 0.8475, + "step": 8860 + }, + { + "epoch": 1.705801669995428, + "grad_norm": 3.3686046986211267, + "learning_rate": 1.1121217957640261e-06, + "loss": 0.8758, + "step": 8861 + }, + { + "epoch": 1.7059941766730033, + "grad_norm": 3.3407111736229838, + "learning_rate": 1.1106932368499824e-06, + "loss": 0.8809, + "step": 8862 + }, + { + "epoch": 1.7061866833505788, + "grad_norm": 3.4470173285355554, + "learning_rate": 1.1092655420902842e-06, + "loss": 0.9333, + "step": 8863 + }, + { + "epoch": 1.7063791900281542, + "grad_norm": 3.355824641161408, + "learning_rate": 1.1078387116237198e-06, + "loss": 0.8842, + "step": 8864 + }, + { + "epoch": 1.7063791900281542, + "lm_loss": 0.5672, + "step": 8864, + "vm_loss": 0.1215 + }, + { + "epoch": 1.7063791900281542, + "lm_loss": 0.4775, + "step": 8864, + "vm_loss": 0.2216 + }, + { + "epoch": 1.7063791900281542, + "lm_loss": 0.5918, + "step": 8864, + "vm_loss": 0.1373 + }, + { + "epoch": 1.7063791900281542, + "lm_loss": 1.0284, + "step": 8864, + "vm_loss": 0.1953 + }, + { + "epoch": 1.7063791900281542, + "lm_loss": 0.5056, + "step": 8864, + "vm_loss": 0.142 + }, + { + "epoch": 1.7063791900281542, + "lm_loss": 0.9401, + "step": 8864, + "vm_loss": 0.123 + }, + { + "epoch": 1.7063791900281542, + "lm_loss": 0.4532, + "step": 8864, + "vm_loss": 0.1597 + }, + { + "epoch": 1.7063791900281542, + "lm_loss": 0.511, + "step": 8864, + "vm_loss": 0.1145 + }, + { + "epoch": 1.7065716967057294, + "grad_norm": 3.4976075650895, + "learning_rate": 1.1064127455889974e-06, + "loss": 0.8965, + "step": 8865 + }, + { + "epoch": 1.7067642033833048, + "grad_norm": 3.3511588791340947, + "learning_rate": 1.1049876441247399e-06, + "loss": 0.9057, + "step": 8866 + }, + { + "epoch": 1.7069567100608802, + "grad_norm": 3.3104272371638275, + "learning_rate": 1.103563407369479e-06, + "loss": 0.8355, + "step": 8867 + }, + { + "epoch": 1.7071492167384557, + "grad_norm": 3.095068115777392, + "learning_rate": 1.1021400354616786e-06, + "loss": 0.838, + "step": 8868 + }, + { + "epoch": 1.707341723416031, + "grad_norm": 3.325558464061969, + "learning_rate": 1.100717528539702e-06, + "loss": 0.8676, + "step": 8869 + }, + { + "epoch": 1.7075342300936063, + "grad_norm": 3.079175326358344, + "learning_rate": 1.0992958867418358e-06, + "loss": 0.8183, + "step": 8870 + }, + { + "epoch": 1.7077267367711817, + "grad_norm": 3.2727756258494805, + "learning_rate": 1.0978751102062835e-06, + "loss": 0.8638, + "step": 8871 + }, + { + "epoch": 1.7079192434487571, + "grad_norm": 3.2040327259188897, + "learning_rate": 1.0964551990711624e-06, + "loss": 0.8556, + "step": 8872 + }, + { + "epoch": 1.7079192434487571, + "lm_loss": 0.5498, + "step": 8872, + "vm_loss": 0.1406 + }, + { + "epoch": 1.7079192434487571, + "lm_loss": 0.5675, + "step": 8872, + "vm_loss": 0.1157 + }, + { + "epoch": 1.7079192434487571, + "lm_loss": 0.6545, + "step": 8872, + "vm_loss": 0.1816 + }, + { + "epoch": 1.7079192434487571, + "lm_loss": 1.3122, + "step": 8872, + "vm_loss": 0.1963 + }, + { + "epoch": 1.7079192434487571, + "lm_loss": 0.9058, + "step": 8872, + "vm_loss": 0.1289 + }, + { + "epoch": 1.7079192434487571, + "lm_loss": 0.5915, + "step": 8872, + "vm_loss": 0.1451 + }, + { + "epoch": 1.7079192434487571, + "lm_loss": 1.0227, + "step": 8872, + "vm_loss": 0.1751 + }, + { + "epoch": 1.7079192434487571, + "lm_loss": 0.7995, + "step": 8872, + "vm_loss": 0.1471 + }, + { + "epoch": 1.7081117501263325, + "grad_norm": 3.549670602581958, + "learning_rate": 1.0950361534745035e-06, + "loss": 0.8971, + "step": 8873 + }, + { + "epoch": 1.708304256803908, + "grad_norm": 3.1678794102312886, + "learning_rate": 1.0936179735542607e-06, + "loss": 0.8323, + "step": 8874 + }, + { + "epoch": 1.7084967634814832, + "grad_norm": 3.319870734717857, + "learning_rate": 1.0922006594482958e-06, + "loss": 0.868, + "step": 8875 + }, + { + "epoch": 1.7086892701590588, + "grad_norm": 3.1925521191272277, + "learning_rate": 1.0907842112943922e-06, + "loss": 0.8196, + "step": 8876 + }, + { + "epoch": 1.708881776836634, + "grad_norm": 3.2962530887831862, + "learning_rate": 1.089368629230242e-06, + "loss": 0.8774, + "step": 8877 + }, + { + "epoch": 1.7090742835142094, + "grad_norm": 3.1481665874266356, + "learning_rate": 1.0879539133934646e-06, + "loss": 0.8174, + "step": 8878 + }, + { + "epoch": 1.7092667901917848, + "grad_norm": 3.1248029117457694, + "learning_rate": 1.086540063921584e-06, + "loss": 0.83, + "step": 8879 + }, + { + "epoch": 1.70945929686936, + "grad_norm": 3.144088871676336, + "learning_rate": 1.0851270809520443e-06, + "loss": 0.8409, + "step": 8880 + }, + { + "epoch": 1.70945929686936, + "lm_loss": 0.8083, + "step": 8880, + "vm_loss": 0.207 + }, + { + "epoch": 1.70945929686936, + "lm_loss": 0.8819, + "step": 8880, + "vm_loss": 0.1841 + }, + { + "epoch": 1.70945929686936, + "lm_loss": 0.7615, + "step": 8880, + "vm_loss": 0.2183 + }, + { + "epoch": 1.70945929686936, + "lm_loss": 0.8085, + "step": 8880, + "vm_loss": 0.1899 + }, + { + "epoch": 1.70945929686936, + "lm_loss": 0.8364, + "step": 8880, + "vm_loss": 0.1649 + }, + { + "epoch": 1.70945929686936, + "lm_loss": 1.0129, + "step": 8880, + "vm_loss": 0.1746 + }, + { + "epoch": 1.70945929686936, + "lm_loss": 0.5747, + "step": 8880, + "vm_loss": 0.1328 + }, + { + "epoch": 1.70945929686936, + "lm_loss": 0.5173, + "step": 8880, + "vm_loss": 0.1815 + }, + { + "epoch": 1.7096518035469357, + "grad_norm": 3.1494749438037895, + "learning_rate": 1.083714964622208e-06, + "loss": 0.8565, + "step": 8881 + }, + { + "epoch": 1.7098443102245109, + "grad_norm": 3.0538435932645385, + "learning_rate": 1.0823037150693517e-06, + "loss": 0.7939, + "step": 8882 + }, + { + "epoch": 1.7100368169020863, + "grad_norm": 3.3020248465267716, + "learning_rate": 1.0808933324306625e-06, + "loss": 0.9059, + "step": 8883 + }, + { + "epoch": 1.7102293235796617, + "grad_norm": 3.1571651179807114, + "learning_rate": 1.0794838168432498e-06, + "loss": 0.8262, + "step": 8884 + }, + { + "epoch": 1.710421830257237, + "grad_norm": 3.4548904385499117, + "learning_rate": 1.078075168444137e-06, + "loss": 0.9042, + "step": 8885 + }, + { + "epoch": 1.7106143369348126, + "grad_norm": 3.1062693327007698, + "learning_rate": 1.0766673873702637e-06, + "loss": 0.8193, + "step": 8886 + }, + { + "epoch": 1.7108068436123878, + "grad_norm": 3.24052405501799, + "learning_rate": 1.0752604737584816e-06, + "loss": 0.8652, + "step": 8887 + }, + { + "epoch": 1.7109993502899632, + "grad_norm": 3.354381429865366, + "learning_rate": 1.0738544277455632e-06, + "loss": 0.8329, + "step": 8888 + }, + { + "epoch": 1.7109993502899632, + "lm_loss": 0.786, + "step": 8888, + "vm_loss": 0.2125 + }, + { + "epoch": 1.7109993502899632, + "lm_loss": 0.8755, + "step": 8888, + "vm_loss": 0.1501 + }, + { + "epoch": 1.7109993502899632, + "lm_loss": 0.4714, + "step": 8888, + "vm_loss": 0.1179 + }, + { + "epoch": 1.7109993502899632, + "lm_loss": 0.3446, + "step": 8888, + "vm_loss": 0.161 + }, + { + "epoch": 1.7109993502899632, + "lm_loss": 1.0874, + "step": 8888, + "vm_loss": 0.1365 + }, + { + "epoch": 1.7109993502899632, + "lm_loss": 0.4309, + "step": 8888, + "vm_loss": 0.1325 + }, + { + "epoch": 1.7109993502899632, + "lm_loss": 0.5435, + "step": 8888, + "vm_loss": 0.1383 + }, + { + "epoch": 1.7109993502899632, + "lm_loss": 0.7541, + "step": 8888, + "vm_loss": 0.1147 + }, + { + "epoch": 1.7111918569675386, + "grad_norm": 3.0629785336148614, + "learning_rate": 1.0724492494681948e-06, + "loss": 0.8017, + "step": 8889 + }, + { + "epoch": 1.7113843636451138, + "grad_norm": 3.276674143732075, + "learning_rate": 1.0710449390629719e-06, + "loss": 0.8381, + "step": 8890 + }, + { + "epoch": 1.7115768703226895, + "grad_norm": 3.3575417068128925, + "learning_rate": 1.0696414966664204e-06, + "loss": 0.8774, + "step": 8891 + }, + { + "epoch": 1.7117693770002647, + "grad_norm": 3.1326103936571785, + "learning_rate": 1.0682389224149648e-06, + "loss": 0.8079, + "step": 8892 + }, + { + "epoch": 1.71196188367784, + "grad_norm": 3.272785269420774, + "learning_rate": 1.0668372164449591e-06, + "loss": 0.8698, + "step": 8893 + }, + { + "epoch": 1.7121543903554155, + "grad_norm": 3.1438979682985413, + "learning_rate": 1.0654363788926614e-06, + "loss": 0.8231, + "step": 8894 + }, + { + "epoch": 1.7123468970329907, + "grad_norm": 3.2423727464744165, + "learning_rate": 1.0640364098942569e-06, + "loss": 0.8686, + "step": 8895 + }, + { + "epoch": 1.7125394037105663, + "grad_norm": 3.412958246609101, + "learning_rate": 1.0626373095858367e-06, + "loss": 0.8951, + "step": 8896 + }, + { + "epoch": 1.7125394037105663, + "lm_loss": 0.8129, + "step": 8896, + "vm_loss": 0.1461 + }, + { + "epoch": 1.7125394037105663, + "lm_loss": 0.5878, + "step": 8896, + "vm_loss": 0.143 + }, + { + "epoch": 1.7125394037105663, + "lm_loss": 0.5934, + "step": 8896, + "vm_loss": 0.1666 + }, + { + "epoch": 1.7125394037105663, + "lm_loss": 0.8676, + "step": 8896, + "vm_loss": 0.1891 + }, + { + "epoch": 1.7125394037105663, + "lm_loss": 0.5091, + "step": 8896, + "vm_loss": 0.1372 + }, + { + "epoch": 1.7125394037105663, + "lm_loss": 0.6997, + "step": 8896, + "vm_loss": 0.174 + }, + { + "epoch": 1.7125394037105663, + "lm_loss": 0.8642, + "step": 8896, + "vm_loss": 0.1851 + }, + { + "epoch": 1.7125394037105663, + "lm_loss": 0.6114, + "step": 8896, + "vm_loss": 0.2271 + }, + { + "epoch": 1.7127319103881415, + "grad_norm": 3.277949673835552, + "learning_rate": 1.061239078103411e-06, + "loss": 0.8743, + "step": 8897 + }, + { + "epoch": 1.712924417065717, + "grad_norm": 3.4287290540715865, + "learning_rate": 1.0598417155829078e-06, + "loss": 0.8971, + "step": 8898 + }, + { + "epoch": 1.7131169237432924, + "grad_norm": 3.331824573606808, + "learning_rate": 1.0584452221601704e-06, + "loss": 0.8794, + "step": 8899 + }, + { + "epoch": 1.7133094304208676, + "grad_norm": 3.205020819960327, + "learning_rate": 1.0570495979709505e-06, + "loss": 0.8341, + "step": 8900 + }, + { + "epoch": 1.7135019370984432, + "grad_norm": 3.1647022962037012, + "learning_rate": 1.055654843150924e-06, + "loss": 0.8327, + "step": 8901 + }, + { + "epoch": 1.7136944437760184, + "grad_norm": 3.313356356073278, + "learning_rate": 1.0542609578356778e-06, + "loss": 0.8562, + "step": 8902 + }, + { + "epoch": 1.7138869504535938, + "grad_norm": 3.3872103573063823, + "learning_rate": 1.052867942160718e-06, + "loss": 0.9045, + "step": 8903 + }, + { + "epoch": 1.7140794571311693, + "grad_norm": 3.3207980889387207, + "learning_rate": 1.0514757962614576e-06, + "loss": 0.8746, + "step": 8904 + }, + { + "epoch": 1.7140794571311693, + "lm_loss": 0.9454, + "step": 8904, + "vm_loss": 0.1474 + }, + { + "epoch": 1.7140794571311693, + "lm_loss": 0.417, + "step": 8904, + "vm_loss": 0.1562 + }, + { + "epoch": 1.7140794571311693, + "lm_loss": 0.7027, + "step": 8904, + "vm_loss": 0.1679 + }, + { + "epoch": 1.7140794571311693, + "lm_loss": 0.7999, + "step": 8904, + "vm_loss": 0.164 + }, + { + "epoch": 1.7140794571311693, + "lm_loss": 0.4738, + "step": 8904, + "vm_loss": 0.1375 + }, + { + "epoch": 1.7140794571311693, + "lm_loss": 1.1223, + "step": 8904, + "vm_loss": 0.1188 + }, + { + "epoch": 1.7140794571311693, + "lm_loss": 0.8008, + "step": 8904, + "vm_loss": 0.1617 + }, + { + "epoch": 1.7140794571311693, + "lm_loss": 0.5958, + "step": 8904, + "vm_loss": 0.1978 + }, + { + "epoch": 1.7142719638087445, + "grad_norm": 3.2534060308958797, + "learning_rate": 1.0500845202732379e-06, + "loss": 0.8432, + "step": 8905 + }, + { + "epoch": 1.7144644704863201, + "grad_norm": 3.1889812206645853, + "learning_rate": 1.0486941143313045e-06, + "loss": 0.8458, + "step": 8906 + }, + { + "epoch": 1.7146569771638953, + "grad_norm": 3.1750135545687903, + "learning_rate": 1.0473045785708246e-06, + "loss": 0.8487, + "step": 8907 + }, + { + "epoch": 1.7148494838414707, + "grad_norm": 3.1354632913089318, + "learning_rate": 1.045915913126877e-06, + "loss": 0.8337, + "step": 8908 + }, + { + "epoch": 1.7150419905190462, + "grad_norm": 3.353165955052305, + "learning_rate": 1.0445281181344591e-06, + "loss": 0.9055, + "step": 8909 + }, + { + "epoch": 1.7152344971966214, + "grad_norm": 3.416826887911343, + "learning_rate": 1.0431411937284842e-06, + "loss": 0.8593, + "step": 8910 + }, + { + "epoch": 1.715427003874197, + "grad_norm": 3.0695219998235803, + "learning_rate": 1.041755140043773e-06, + "loss": 0.8203, + "step": 8911 + }, + { + "epoch": 1.7156195105517722, + "grad_norm": 3.1861432802485448, + "learning_rate": 1.040369957215076e-06, + "loss": 0.8685, + "step": 8912 + }, + { + "epoch": 1.7156195105517722, + "lm_loss": 0.8914, + "step": 8912, + "vm_loss": 0.133 + }, + { + "epoch": 1.7156195105517722, + "lm_loss": 0.742, + "step": 8912, + "vm_loss": 0.2242 + }, + { + "epoch": 1.7156195105517722, + "lm_loss": 0.4745, + "step": 8912, + "vm_loss": 0.1768 + }, + { + "epoch": 1.7156195105517722, + "lm_loss": 0.5232, + "step": 8912, + "vm_loss": 0.2139 + }, + { + "epoch": 1.7156195105517722, + "lm_loss": 0.5747, + "step": 8912, + "vm_loss": 0.1395 + }, + { + "epoch": 1.7156195105517722, + "lm_loss": 0.9237, + "step": 8912, + "vm_loss": 0.1344 + }, + { + "epoch": 1.7156195105517722, + "lm_loss": 1.1213, + "step": 8912, + "vm_loss": 0.1819 + }, + { + "epoch": 1.7156195105517722, + "lm_loss": 0.663, + "step": 8912, + "vm_loss": 0.1866 + }, + { + "epoch": 1.7158120172293476, + "grad_norm": 3.2214160179037585, + "learning_rate": 1.038985645377044e-06, + "loss": 0.8625, + "step": 8913 + }, + { + "epoch": 1.716004523906923, + "grad_norm": 3.2609101551257664, + "learning_rate": 1.037602204664252e-06, + "loss": 0.87, + "step": 8914 + }, + { + "epoch": 1.7161970305844982, + "grad_norm": 3.241086371573689, + "learning_rate": 1.0362196352111874e-06, + "loss": 0.8925, + "step": 8915 + }, + { + "epoch": 1.7163895372620739, + "grad_norm": 3.1531952615955987, + "learning_rate": 1.0348379371522577e-06, + "loss": 0.819, + "step": 8916 + }, + { + "epoch": 1.716582043939649, + "grad_norm": 3.134259475843129, + "learning_rate": 1.0334571106217762e-06, + "loss": 0.8379, + "step": 8917 + }, + { + "epoch": 1.7167745506172245, + "grad_norm": 3.199740697573619, + "learning_rate": 1.0320771557539788e-06, + "loss": 0.8611, + "step": 8918 + }, + { + "epoch": 1.7169670572948, + "grad_norm": 3.1992075535582165, + "learning_rate": 1.0306980726830151e-06, + "loss": 0.8327, + "step": 8919 + }, + { + "epoch": 1.7171595639723753, + "grad_norm": 3.391728007590299, + "learning_rate": 1.0293198615429523e-06, + "loss": 0.8824, + "step": 8920 + }, + { + "epoch": 1.7171595639723753, + "lm_loss": 0.7107, + "step": 8920, + "vm_loss": 0.1659 + }, + { + "epoch": 1.7171595639723753, + "lm_loss": 0.4212, + "step": 8920, + "vm_loss": 0.167 + }, + { + "epoch": 1.7171595639723753, + "lm_loss": 0.737, + "step": 8920, + "vm_loss": 0.2087 + }, + { + "epoch": 1.7171595639723753, + "lm_loss": 0.5547, + "step": 8920, + "vm_loss": 0.1219 + }, + { + "epoch": 1.7171595639723753, + "lm_loss": 0.6571, + "step": 8920, + "vm_loss": 0.1929 + }, + { + "epoch": 1.7171595639723753, + "lm_loss": 0.5277, + "step": 8920, + "vm_loss": 0.1642 + }, + { + "epoch": 1.7171595639723753, + "lm_loss": 0.4847, + "step": 8920, + "vm_loss": 0.1245 + }, + { + "epoch": 1.7171595639723753, + "lm_loss": 0.7419, + "step": 8920, + "vm_loss": 0.1965 + }, + { + "epoch": 1.7173520706499508, + "grad_norm": 3.069343905186388, + "learning_rate": 1.027942522467762e-06, + "loss": 0.824, + "step": 8921 + }, + { + "epoch": 1.717544577327526, + "grad_norm": 3.2927043065668857, + "learning_rate": 1.0265660555913504e-06, + "loss": 0.8538, + "step": 8922 + }, + { + "epoch": 1.7177370840051014, + "grad_norm": 3.116163717225568, + "learning_rate": 1.0251904610475183e-06, + "loss": 0.8523, + "step": 8923 + }, + { + "epoch": 1.7179295906826768, + "grad_norm": 3.352077722368089, + "learning_rate": 1.0238157389699954e-06, + "loss": 0.8643, + "step": 8924 + }, + { + "epoch": 1.7181220973602522, + "grad_norm": 3.116826650243536, + "learning_rate": 1.0224418894924226e-06, + "loss": 0.826, + "step": 8925 + }, + { + "epoch": 1.7183146040378277, + "grad_norm": 3.250029160354693, + "learning_rate": 1.0210689127483552e-06, + "loss": 0.8379, + "step": 8926 + }, + { + "epoch": 1.7185071107154029, + "grad_norm": 3.09853203365307, + "learning_rate": 1.0196968088712621e-06, + "loss": 0.7994, + "step": 8927 + }, + { + "epoch": 1.7186996173929783, + "grad_norm": 3.201837477430967, + "learning_rate": 1.0183255779945312e-06, + "loss": 0.837, + "step": 8928 + }, + { + "epoch": 1.7186996173929783, + "lm_loss": 0.7202, + "step": 8928, + "vm_loss": 0.1816 + }, + { + "epoch": 1.7186996173929783, + "lm_loss": 0.7553, + "step": 8928, + "vm_loss": 0.1405 + }, + { + "epoch": 1.7186996173929783, + "lm_loss": 0.9875, + "step": 8928, + "vm_loss": 0.1425 + }, + { + "epoch": 1.7186996173929783, + "lm_loss": 0.6147, + "step": 8928, + "vm_loss": 0.1394 + }, + { + "epoch": 1.7186996173929783, + "lm_loss": 0.5905, + "step": 8928, + "vm_loss": 0.1469 + }, + { + "epoch": 1.7186996173929783, + "lm_loss": 0.675, + "step": 8928, + "vm_loss": 0.1067 + }, + { + "epoch": 1.7186996173929783, + "lm_loss": 0.3813, + "step": 8928, + "vm_loss": 0.1646 + }, + { + "epoch": 1.7186996173929783, + "lm_loss": 0.6783, + "step": 8928, + "vm_loss": 0.2129 + }, + { + "epoch": 1.7188921240705537, + "grad_norm": 3.318475558073298, + "learning_rate": 1.0169552202514633e-06, + "loss": 0.8564, + "step": 8929 + }, + { + "epoch": 1.7190846307481291, + "grad_norm": 3.318063835923104, + "learning_rate": 1.0155857357752741e-06, + "loss": 0.8925, + "step": 8930 + }, + { + "epoch": 1.7192771374257045, + "grad_norm": 3.4515917711926685, + "learning_rate": 1.014217124699095e-06, + "loss": 0.9136, + "step": 8931 + }, + { + "epoch": 1.7194696441032797, + "grad_norm": 3.142491191565753, + "learning_rate": 1.012849387155974e-06, + "loss": 0.8201, + "step": 8932 + }, + { + "epoch": 1.7196621507808552, + "grad_norm": 3.2824590492340286, + "learning_rate": 1.011482523278874e-06, + "loss": 0.8733, + "step": 8933 + }, + { + "epoch": 1.7198546574584306, + "grad_norm": 3.3708114603109895, + "learning_rate": 1.0101165332006668e-06, + "loss": 0.8759, + "step": 8934 + }, + { + "epoch": 1.720047164136006, + "grad_norm": 3.32081758473011, + "learning_rate": 1.0087514170541479e-06, + "loss": 0.8303, + "step": 8935 + }, + { + "epoch": 1.7202396708135814, + "grad_norm": 3.340233994665464, + "learning_rate": 1.0073871749720221e-06, + "loss": 0.8813, + "step": 8936 + }, + { + "epoch": 1.7202396708135814, + "lm_loss": 0.6952, + "step": 8936, + "vm_loss": 0.162 + }, + { + "epoch": 1.7202396708135814, + "lm_loss": 0.8557, + "step": 8936, + "vm_loss": 0.1566 + }, + { + "epoch": 1.7202396708135814, + "lm_loss": 0.8264, + "step": 8936, + "vm_loss": 0.1663 + }, + { + "epoch": 1.7202396708135814, + "lm_loss": 0.7731, + "step": 8936, + "vm_loss": 0.1366 + }, + { + "epoch": 1.7202396708135814, + "lm_loss": 0.717, + "step": 8936, + "vm_loss": 0.1558 + }, + { + "epoch": 1.7202396708135814, + "lm_loss": 0.5549, + "step": 8936, + "vm_loss": 0.1486 + }, + { + "epoch": 1.7202396708135814, + "lm_loss": 0.4817, + "step": 8936, + "vm_loss": 0.1356 + }, + { + "epoch": 1.7202396708135814, + "lm_loss": 0.6742, + "step": 8936, + "vm_loss": 0.1407 + }, + { + "epoch": 1.7204321774911566, + "grad_norm": 3.2266470177293054, + "learning_rate": 1.0060238070869143e-06, + "loss": 0.8259, + "step": 8937 + }, + { + "epoch": 1.7206246841687323, + "grad_norm": 3.005420247059756, + "learning_rate": 1.0046613135313544e-06, + "loss": 0.7882, + "step": 8938 + }, + { + "epoch": 1.7208171908463075, + "grad_norm": 3.3415559983638525, + "learning_rate": 1.0032996944378027e-06, + "loss": 0.8775, + "step": 8939 + }, + { + "epoch": 1.7210096975238829, + "grad_norm": 3.44868233121355, + "learning_rate": 1.0019389499386212e-06, + "loss": 0.9046, + "step": 8940 + }, + { + "epoch": 1.7212022042014583, + "grad_norm": 3.1713647240290435, + "learning_rate": 1.0005790801660942e-06, + "loss": 0.875, + "step": 8941 + }, + { + "epoch": 1.7213947108790335, + "grad_norm": 3.2429112801164304, + "learning_rate": 9.992200852524124e-07, + "loss": 0.8531, + "step": 8942 + }, + { + "epoch": 1.7215872175566092, + "grad_norm": 3.3411078949752975, + "learning_rate": 9.978619653296961e-07, + "loss": 0.8855, + "step": 8943 + }, + { + "epoch": 1.7217797242341843, + "grad_norm": 3.2922186041006247, + "learning_rate": 9.965047205299649e-07, + "loss": 0.89, + "step": 8944 + }, + { + "epoch": 1.7217797242341843, + "lm_loss": 0.6766, + "step": 8944, + "vm_loss": 0.146 + }, + { + "epoch": 1.7217797242341843, + "lm_loss": 0.7868, + "step": 8944, + "vm_loss": 0.1826 + }, + { + "epoch": 1.7217797242341843, + "lm_loss": 0.4908, + "step": 8944, + "vm_loss": 0.1626 + }, + { + "epoch": 1.7217797242341843, + "lm_loss": 0.7814, + "step": 8944, + "vm_loss": 0.1498 + }, + { + "epoch": 1.7217797242341843, + "lm_loss": 0.8473, + "step": 8944, + "vm_loss": 0.1862 + }, + { + "epoch": 1.7217797242341843, + "lm_loss": 0.6605, + "step": 8944, + "vm_loss": 0.1947 + }, + { + "epoch": 1.7217797242341843, + "lm_loss": 0.5512, + "step": 8944, + "vm_loss": 0.1895 + }, + { + "epoch": 1.7217797242341843, + "lm_loss": 0.6782, + "step": 8944, + "vm_loss": 0.1047 + }, + { + "epoch": 1.7219722309117598, + "grad_norm": 3.3617177934642353, + "learning_rate": 9.951483509851644e-07, + "loss": 0.9132, + "step": 8945 + }, + { + "epoch": 1.7221647375893352, + "grad_norm": 3.324501972567766, + "learning_rate": 9.937928568271493e-07, + "loss": 0.8582, + "step": 8946 + }, + { + "epoch": 1.7223572442669104, + "grad_norm": 3.2989572301823626, + "learning_rate": 9.924382381876917e-07, + "loss": 0.8569, + "step": 8947 + }, + { + "epoch": 1.722549750944486, + "grad_norm": 3.132475058973454, + "learning_rate": 9.910844951984788e-07, + "loss": 0.836, + "step": 8948 + }, + { + "epoch": 1.7227422576220612, + "grad_norm": 3.0809040452973, + "learning_rate": 9.897316279911106e-07, + "loss": 0.8223, + "step": 8949 + }, + { + "epoch": 1.7229347642996367, + "grad_norm": 3.3126142331421957, + "learning_rate": 9.883796366971021e-07, + "loss": 0.8111, + "step": 8950 + }, + { + "epoch": 1.723127270977212, + "grad_norm": 3.450535666784199, + "learning_rate": 9.870285214478859e-07, + "loss": 0.9454, + "step": 8951 + }, + { + "epoch": 1.7233197776547873, + "grad_norm": 3.184740417489782, + "learning_rate": 9.856782823748067e-07, + "loss": 0.8782, + "step": 8952 + }, + { + "epoch": 1.7233197776547873, + "lm_loss": 0.8355, + "step": 8952, + "vm_loss": 0.2282 + }, + { + "epoch": 1.7233197776547873, + "lm_loss": 0.677, + "step": 8952, + "vm_loss": 0.2156 + }, + { + "epoch": 1.7233197776547873, + "lm_loss": 0.3752, + "step": 8952, + "vm_loss": 0.1546 + }, + { + "epoch": 1.7233197776547873, + "lm_loss": 0.7342, + "step": 8952, + "vm_loss": 0.1591 + }, + { + "epoch": 1.7233197776547873, + "lm_loss": 0.5538, + "step": 8952, + "vm_loss": 0.1396 + }, + { + "epoch": 1.7233197776547873, + "lm_loss": 0.8072, + "step": 8952, + "vm_loss": 0.1237 + }, + { + "epoch": 1.7233197776547873, + "lm_loss": 0.8726, + "step": 8952, + "vm_loss": 0.1622 + }, + { + "epoch": 1.7233197776547873, + "lm_loss": 0.8837, + "step": 8952, + "vm_loss": 0.2287 + }, + { + "epoch": 1.723512284332363, + "grad_norm": 3.2031496758580733, + "learning_rate": 9.843289196091243e-07, + "loss": 0.8411, + "step": 8953 + }, + { + "epoch": 1.7237047910099381, + "grad_norm": 3.1436221080331204, + "learning_rate": 9.829804332820182e-07, + "loss": 0.8143, + "step": 8954 + }, + { + "epoch": 1.7238972976875135, + "grad_norm": 3.32229552729267, + "learning_rate": 9.81632823524571e-07, + "loss": 0.8764, + "step": 8955 + }, + { + "epoch": 1.724089804365089, + "grad_norm": 3.2636142067710447, + "learning_rate": 9.802860904677959e-07, + "loss": 0.8467, + "step": 8956 + }, + { + "epoch": 1.7242823110426642, + "grad_norm": 3.2950080610204027, + "learning_rate": 9.789402342426058e-07, + "loss": 0.8946, + "step": 8957 + }, + { + "epoch": 1.7244748177202398, + "grad_norm": 3.1483267203520406, + "learning_rate": 9.775952549798406e-07, + "loss": 0.8181, + "step": 8958 + }, + { + "epoch": 1.724667324397815, + "grad_norm": 3.2048427322343973, + "learning_rate": 9.762511528102437e-07, + "loss": 0.8543, + "step": 8959 + }, + { + "epoch": 1.7248598310753904, + "grad_norm": 3.304744513621145, + "learning_rate": 9.749079278644846e-07, + "loss": 0.8927, + "step": 8960 + }, + { + "epoch": 1.7248598310753904, + "lm_loss": 0.4711, + "step": 8960, + "vm_loss": 0.2401 + }, + { + "epoch": 1.7248598310753904, + "lm_loss": 0.5376, + "step": 8960, + "vm_loss": 0.1088 + }, + { + "epoch": 1.7248598310753904, + "lm_loss": 0.8549, + "step": 8960, + "vm_loss": 0.1022 + }, + { + "epoch": 1.7248598310753904, + "lm_loss": 0.3762, + "step": 8960, + "vm_loss": 0.2143 + }, + { + "epoch": 1.7248598310753904, + "lm_loss": 1.2175, + "step": 8960, + "vm_loss": 0.1892 + }, + { + "epoch": 1.7248598310753904, + "lm_loss": 0.9253, + "step": 8960, + "vm_loss": 0.1382 + }, + { + "epoch": 1.7248598310753904, + "lm_loss": 0.6029, + "step": 8960, + "vm_loss": 0.1508 + }, + { + "epoch": 1.7248598310753904, + "lm_loss": 0.6167, + "step": 8960, + "vm_loss": 0.1544 + }, + { + "epoch": 1.7250523377529658, + "grad_norm": 3.334838370373949, + "learning_rate": 9.735655802731393e-07, + "loss": 0.8613, + "step": 8961 + }, + { + "epoch": 1.725244844430541, + "grad_norm": 3.234584263166714, + "learning_rate": 9.722241101667007e-07, + "loss": 0.859, + "step": 8962 + }, + { + "epoch": 1.7254373511081167, + "grad_norm": 3.3074746558211316, + "learning_rate": 9.70883517675577e-07, + "loss": 0.8494, + "step": 8963 + }, + { + "epoch": 1.725629857785692, + "grad_norm": 3.2494818069727116, + "learning_rate": 9.695438029300941e-07, + "loss": 0.8542, + "step": 8964 + }, + { + "epoch": 1.7258223644632673, + "grad_norm": 3.3043143389642595, + "learning_rate": 9.682049660604831e-07, + "loss": 0.8842, + "step": 8965 + }, + { + "epoch": 1.7260148711408427, + "grad_norm": 3.2213741608239213, + "learning_rate": 9.668670071969033e-07, + "loss": 0.8509, + "step": 8966 + }, + { + "epoch": 1.726207377818418, + "grad_norm": 3.126435078991705, + "learning_rate": 9.655299264694151e-07, + "loss": 0.825, + "step": 8967 + }, + { + "epoch": 1.7263998844959936, + "grad_norm": 3.229934544123826, + "learning_rate": 9.641937240080046e-07, + "loss": 0.8789, + "step": 8968 + }, + { + "epoch": 1.7263998844959936, + "lm_loss": 0.4709, + "step": 8968, + "vm_loss": 0.2089 + }, + { + "epoch": 1.7263998844959936, + "lm_loss": 0.8347, + "step": 8968, + "vm_loss": 0.1449 + }, + { + "epoch": 1.7263998844959936, + "lm_loss": 1.0164, + "step": 8968, + "vm_loss": 0.1211 + }, + { + "epoch": 1.7263998844959936, + "lm_loss": 0.6292, + "step": 8968, + "vm_loss": 0.2068 + }, + { + "epoch": 1.7263998844959936, + "lm_loss": 0.3918, + "step": 8968, + "vm_loss": 0.1826 + }, + { + "epoch": 1.7263998844959936, + "lm_loss": 0.9147, + "step": 8968, + "vm_loss": 0.2303 + }, + { + "epoch": 1.7263998844959936, + "lm_loss": 0.6953, + "step": 8968, + "vm_loss": 0.1982 + }, + { + "epoch": 1.7263998844959936, + "lm_loss": 0.8437, + "step": 8968, + "vm_loss": 0.186 + }, + { + "epoch": 1.7265923911735688, + "grad_norm": 3.2256585716227666, + "learning_rate": 9.628583999425644e-07, + "loss": 0.8918, + "step": 8969 + }, + { + "epoch": 1.7267848978511442, + "grad_norm": 3.102493749489921, + "learning_rate": 9.615239544029066e-07, + "loss": 0.8067, + "step": 8970 + }, + { + "epoch": 1.7269774045287196, + "grad_norm": 3.2600718670955873, + "learning_rate": 9.60190387518759e-07, + "loss": 0.8748, + "step": 8971 + }, + { + "epoch": 1.7271699112062948, + "grad_norm": 3.2294348220547633, + "learning_rate": 9.588576994197563e-07, + "loss": 0.8835, + "step": 8972 + }, + { + "epoch": 1.7273624178838705, + "grad_norm": 3.274912463134301, + "learning_rate": 9.575258902354545e-07, + "loss": 0.8528, + "step": 8973 + }, + { + "epoch": 1.7275549245614457, + "grad_norm": 3.4362174095597333, + "learning_rate": 9.561949600953246e-07, + "loss": 0.874, + "step": 8974 + }, + { + "epoch": 1.727747431239021, + "grad_norm": 3.2540138174963715, + "learning_rate": 9.548649091287499e-07, + "loss": 0.8661, + "step": 8975 + }, + { + "epoch": 1.7279399379165965, + "grad_norm": 3.3309790676960986, + "learning_rate": 9.535357374650234e-07, + "loss": 0.8776, + "step": 8976 + }, + { + "epoch": 1.7279399379165965, + "lm_loss": 0.7443, + "step": 8976, + "vm_loss": 0.1309 + }, + { + "epoch": 1.7279399379165965, + "lm_loss": 0.809, + "step": 8976, + "vm_loss": 0.1411 + }, + { + "epoch": 1.7279399379165965, + "lm_loss": 0.8912, + "step": 8976, + "vm_loss": 0.1064 + }, + { + "epoch": 1.7279399379165965, + "lm_loss": 0.7365, + "step": 8976, + "vm_loss": 0.1911 + }, + { + "epoch": 1.7279399379165965, + "lm_loss": 0.2947, + "step": 8976, + "vm_loss": 0.1361 + }, + { + "epoch": 1.7279399379165965, + "lm_loss": 0.4522, + "step": 8976, + "vm_loss": 0.1367 + }, + { + "epoch": 1.7279399379165965, + "lm_loss": 0.6013, + "step": 8976, + "vm_loss": 0.2124 + }, + { + "epoch": 1.7279399379165965, + "lm_loss": 0.9888, + "step": 8976, + "vm_loss": 0.1159 + }, + { + "epoch": 1.7281324445941717, + "grad_norm": 3.2086399070506224, + "learning_rate": 9.52207445233364e-07, + "loss": 0.829, + "step": 8977 + }, + { + "epoch": 1.7283249512717473, + "grad_norm": 3.2142612481278876, + "learning_rate": 9.508800325628953e-07, + "loss": 0.8203, + "step": 8978 + }, + { + "epoch": 1.7285174579493225, + "grad_norm": 3.30203227420588, + "learning_rate": 9.495534995826594e-07, + "loss": 0.8663, + "step": 8979 + }, + { + "epoch": 1.728709964626898, + "grad_norm": 3.3623202683828888, + "learning_rate": 9.482278464216121e-07, + "loss": 0.8906, + "step": 8980 + }, + { + "epoch": 1.7289024713044734, + "grad_norm": 3.168417966935436, + "learning_rate": 9.469030732086259e-07, + "loss": 0.8296, + "step": 8981 + }, + { + "epoch": 1.7290949779820486, + "grad_norm": 3.180727006826908, + "learning_rate": 9.4557918007248e-07, + "loss": 0.8367, + "step": 8982 + }, + { + "epoch": 1.7292874846596242, + "grad_norm": 3.2421341190237003, + "learning_rate": 9.442561671418826e-07, + "loss": 0.8449, + "step": 8983 + }, + { + "epoch": 1.7294799913371994, + "grad_norm": 3.1657655196022256, + "learning_rate": 9.429340345454397e-07, + "loss": 0.8446, + "step": 8984 + }, + { + "epoch": 1.7294799913371994, + "lm_loss": 1.0147, + "step": 8984, + "vm_loss": 0.1479 + }, + { + "epoch": 1.7294799913371994, + "lm_loss": 0.498, + "step": 8984, + "vm_loss": 0.1546 + }, + { + "epoch": 1.7294799913371994, + "lm_loss": 0.7227, + "step": 8984, + "vm_loss": 0.1778 + }, + { + "epoch": 1.7294799913371994, + "lm_loss": 0.7105, + "step": 8984, + "vm_loss": 0.2208 + }, + { + "epoch": 1.7294799913371994, + "lm_loss": 0.8562, + "step": 8984, + "vm_loss": 0.1435 + }, + { + "epoch": 1.7294799913371994, + "lm_loss": 0.7282, + "step": 8984, + "vm_loss": 0.1535 + }, + { + "epoch": 1.7294799913371994, + "lm_loss": 0.7734, + "step": 8984, + "vm_loss": 0.1846 + }, + { + "epoch": 1.7294799913371994, + "lm_loss": 1.0913, + "step": 8984, + "vm_loss": 0.1335 + }, + { + "epoch": 1.7296724980147749, + "grad_norm": 3.3580211341887938, + "learning_rate": 9.416127824116839e-07, + "loss": 0.8897, + "step": 8985 + }, + { + "epoch": 1.7298650046923503, + "grad_norm": 3.219064113154529, + "learning_rate": 9.402924108690536e-07, + "loss": 0.833, + "step": 8986 + }, + { + "epoch": 1.7300575113699257, + "grad_norm": 3.331850850845125, + "learning_rate": 9.389729200459108e-07, + "loss": 0.9122, + "step": 8987 + }, + { + "epoch": 1.7302500180475011, + "grad_norm": 3.3481557477256367, + "learning_rate": 9.376543100705238e-07, + "loss": 0.9224, + "step": 8988 + }, + { + "epoch": 1.7304425247250763, + "grad_norm": 3.1726878292574665, + "learning_rate": 9.363365810710789e-07, + "loss": 0.8387, + "step": 8989 + }, + { + "epoch": 1.7306350314026517, + "grad_norm": 3.361946644076714, + "learning_rate": 9.350197331756761e-07, + "loss": 0.8998, + "step": 8990 + }, + { + "epoch": 1.7308275380802272, + "grad_norm": 3.0532075814681106, + "learning_rate": 9.337037665123305e-07, + "loss": 0.8108, + "step": 8991 + }, + { + "epoch": 1.7310200447578026, + "grad_norm": 3.1757435609887685, + "learning_rate": 9.323886812089722e-07, + "loss": 0.8322, + "step": 8992 + }, + { + "epoch": 1.7310200447578026, + "lm_loss": 0.7484, + "step": 8992, + "vm_loss": 0.1643 + }, + { + "epoch": 1.7310200447578026, + "lm_loss": 0.6451, + "step": 8992, + "vm_loss": 0.1168 + }, + { + "epoch": 1.7310200447578026, + "lm_loss": 0.5853, + "step": 8992, + "vm_loss": 0.1656 + }, + { + "epoch": 1.7310200447578026, + "lm_loss": 0.7339, + "step": 8992, + "vm_loss": 0.1872 + }, + { + "epoch": 1.7310200447578026, + "lm_loss": 0.5924, + "step": 8992, + "vm_loss": 0.1163 + }, + { + "epoch": 1.7310200447578026, + "lm_loss": 0.7403, + "step": 8992, + "vm_loss": 0.1717 + }, + { + "epoch": 1.7310200447578026, + "lm_loss": 0.5906, + "step": 8992, + "vm_loss": 0.1688 + }, + { + "epoch": 1.7310200447578026, + "lm_loss": 0.6575, + "step": 8992, + "vm_loss": 0.097 + }, + { + "epoch": 1.731212551435378, + "grad_norm": 3.2269399865789836, + "learning_rate": 9.31074477393441e-07, + "loss": 0.8166, + "step": 8993 + }, + { + "epoch": 1.7314050581129532, + "grad_norm": 3.2261221746455986, + "learning_rate": 9.29761155193496e-07, + "loss": 0.8515, + "step": 8994 + }, + { + "epoch": 1.7315975647905286, + "grad_norm": 3.254771989080235, + "learning_rate": 9.284487147368093e-07, + "loss": 0.8676, + "step": 8995 + }, + { + "epoch": 1.731790071468104, + "grad_norm": 3.148020297369259, + "learning_rate": 9.271371561509657e-07, + "loss": 0.8014, + "step": 8996 + }, + { + "epoch": 1.7319825781456795, + "grad_norm": 3.138304011874794, + "learning_rate": 9.258264795634675e-07, + "loss": 0.8299, + "step": 8997 + }, + { + "epoch": 1.7321750848232549, + "grad_norm": 3.2520234328664914, + "learning_rate": 9.245166851017284e-07, + "loss": 0.8762, + "step": 8998 + }, + { + "epoch": 1.73236759150083, + "grad_norm": 3.2425417671841603, + "learning_rate": 9.232077728930744e-07, + "loss": 0.872, + "step": 8999 + }, + { + "epoch": 1.7325600981784057, + "grad_norm": 3.3174267535684945, + "learning_rate": 9.218997430647548e-07, + "loss": 0.9155, + "step": 9000 + }, + { + "epoch": 1.7325600981784057, + "lm_loss": 0.8622, + "step": 9000, + "vm_loss": 0.1443 + }, + { + "epoch": 1.7325600981784057, + "lm_loss": 0.4251, + "step": 9000, + "vm_loss": 0.1478 + }, + { + "epoch": 1.7325600981784057, + "lm_loss": 0.8288, + "step": 9000, + "vm_loss": 0.1633 + }, + { + "epoch": 1.7325600981784057, + "lm_loss": 0.8252, + "step": 9000, + "vm_loss": 0.2165 + }, + { + "epoch": 1.7325600981784057, + "lm_loss": 0.3604, + "step": 9000, + "vm_loss": 0.1249 + }, + { + "epoch": 1.7325600981784057, + "lm_loss": 0.7424, + "step": 9000, + "vm_loss": 0.1561 + }, + { + "epoch": 1.7325600981784057, + "lm_loss": 1.0767, + "step": 9000, + "vm_loss": 0.0914 + }, + { + "epoch": 1.7325600981784057, + "lm_loss": 0.5247, + "step": 9000, + "vm_loss": 0.1524 + }, + { + "epoch": 1.732752604855981, + "grad_norm": 3.2549588248316135, + "learning_rate": 9.205925957439199e-07, + "loss": 0.8588, + "step": 9001 + }, + { + "epoch": 1.7329451115335563, + "grad_norm": 3.2036334965312427, + "learning_rate": 9.192863310576472e-07, + "loss": 0.8346, + "step": 9002 + }, + { + "epoch": 1.7331376182111318, + "grad_norm": 3.373110106875319, + "learning_rate": 9.17980949132915e-07, + "loss": 0.9018, + "step": 9003 + }, + { + "epoch": 1.733330124888707, + "grad_norm": 3.226449924566269, + "learning_rate": 9.166764500966319e-07, + "loss": 0.8476, + "step": 9004 + }, + { + "epoch": 1.7335226315662826, + "grad_norm": 3.2134958873577055, + "learning_rate": 9.153728340756052e-07, + "loss": 0.8562, + "step": 9005 + }, + { + "epoch": 1.7337151382438578, + "grad_norm": 3.1391893644490643, + "learning_rate": 9.140701011965647e-07, + "loss": 0.8248, + "step": 9006 + }, + { + "epoch": 1.7339076449214332, + "grad_norm": 3.2479247629223145, + "learning_rate": 9.127682515861546e-07, + "loss": 0.8871, + "step": 9007 + }, + { + "epoch": 1.7341001515990087, + "grad_norm": 3.1520820331982593, + "learning_rate": 9.114672853709305e-07, + "loss": 0.8504, + "step": 9008 + }, + { + "epoch": 1.7341001515990087, + "lm_loss": 0.632, + "step": 9008, + "vm_loss": 0.1275 + }, + { + "epoch": 1.7341001515990087, + "lm_loss": 0.8274, + "step": 9008, + "vm_loss": 0.1251 + }, + { + "epoch": 1.7341001515990087, + "lm_loss": 0.3782, + "step": 9008, + "vm_loss": 0.1072 + }, + { + "epoch": 1.7341001515990087, + "lm_loss": 0.4288, + "step": 9008, + "vm_loss": 0.1482 + }, + { + "epoch": 1.7341001515990087, + "lm_loss": 0.6363, + "step": 9008, + "vm_loss": 0.1102 + }, + { + "epoch": 1.7341001515990087, + "lm_loss": 0.532, + "step": 9008, + "vm_loss": 0.1917 + }, + { + "epoch": 1.7341001515990087, + "lm_loss": 0.4103, + "step": 9008, + "vm_loss": 0.1106 + }, + { + "epoch": 1.7341001515990087, + "lm_loss": 0.5307, + "step": 9008, + "vm_loss": 0.0994 + }, + { + "epoch": 1.7342926582765839, + "grad_norm": 3.2181576364098854, + "learning_rate": 9.101672026773622e-07, + "loss": 0.8131, + "step": 9009 + }, + { + "epoch": 1.7344851649541595, + "grad_norm": 3.2520779694234347, + "learning_rate": 9.088680036318342e-07, + "loss": 0.885, + "step": 9010 + }, + { + "epoch": 1.7346776716317347, + "grad_norm": 3.303490851609354, + "learning_rate": 9.075696883606455e-07, + "loss": 0.9033, + "step": 9011 + }, + { + "epoch": 1.7348701783093101, + "grad_norm": 3.296322793675332, + "learning_rate": 9.062722569900085e-07, + "loss": 0.8446, + "step": 9012 + }, + { + "epoch": 1.7350626849868855, + "grad_norm": 3.446393330110615, + "learning_rate": 9.04975709646052e-07, + "loss": 0.9133, + "step": 9013 + }, + { + "epoch": 1.7352551916644607, + "grad_norm": 3.1315834919181134, + "learning_rate": 9.036800464548157e-07, + "loss": 0.8558, + "step": 9014 + }, + { + "epoch": 1.7354476983420364, + "grad_norm": 3.3058518717361016, + "learning_rate": 9.023852675422562e-07, + "loss": 0.8379, + "step": 9015 + }, + { + "epoch": 1.7356402050196116, + "grad_norm": 3.2571351661639434, + "learning_rate": 9.010913730342397e-07, + "loss": 0.8438, + "step": 9016 + }, + { + "epoch": 1.7356402050196116, + "lm_loss": 0.7141, + "step": 9016, + "vm_loss": 0.1332 + }, + { + "epoch": 1.7356402050196116, + "lm_loss": 0.7035, + "step": 9016, + "vm_loss": 0.1819 + }, + { + "epoch": 1.7356402050196116, + "lm_loss": 0.4547, + "step": 9016, + "vm_loss": 0.1393 + }, + { + "epoch": 1.7356402050196116, + "lm_loss": 0.582, + "step": 9016, + "vm_loss": 0.0812 + }, + { + "epoch": 1.7356402050196116, + "lm_loss": 0.6689, + "step": 9016, + "vm_loss": 0.1729 + }, + { + "epoch": 1.7356402050196116, + "lm_loss": 0.8338, + "step": 9016, + "vm_loss": 0.2081 + }, + { + "epoch": 1.7356402050196116, + "lm_loss": 0.3363, + "step": 9016, + "vm_loss": 0.1337 + }, + { + "epoch": 1.7356402050196116, + "lm_loss": 0.486, + "step": 9016, + "vm_loss": 0.2139 + }, + { + "epoch": 1.735832711697187, + "grad_norm": 3.182211175190881, + "learning_rate": 8.99798363056551e-07, + "loss": 0.855, + "step": 9017 + }, + { + "epoch": 1.7360252183747624, + "grad_norm": 3.195317246702894, + "learning_rate": 8.985062377348863e-07, + "loss": 0.859, + "step": 9018 + }, + { + "epoch": 1.7362177250523376, + "grad_norm": 3.4047623851792053, + "learning_rate": 8.972149971948607e-07, + "loss": 0.9195, + "step": 9019 + }, + { + "epoch": 1.7364102317299133, + "grad_norm": 3.4958953844186182, + "learning_rate": 8.959246415619905e-07, + "loss": 0.9325, + "step": 9020 + }, + { + "epoch": 1.7366027384074885, + "grad_norm": 3.252647251365429, + "learning_rate": 8.946351709617251e-07, + "loss": 0.849, + "step": 9021 + }, + { + "epoch": 1.7367952450850639, + "grad_norm": 3.4679321251220863, + "learning_rate": 8.933465855194101e-07, + "loss": 0.904, + "step": 9022 + }, + { + "epoch": 1.7369877517626393, + "grad_norm": 3.206615775500758, + "learning_rate": 8.920588853603163e-07, + "loss": 0.8236, + "step": 9023 + }, + { + "epoch": 1.7371802584402145, + "grad_norm": 3.290031510272475, + "learning_rate": 8.907720706096223e-07, + "loss": 0.849, + "step": 9024 + }, + { + "epoch": 1.7371802584402145, + "lm_loss": 0.9044, + "step": 9024, + "vm_loss": 0.162 + }, + { + "epoch": 1.7371802584402145, + "lm_loss": 0.5596, + "step": 9024, + "vm_loss": 0.1698 + }, + { + "epoch": 1.7371802584402145, + "lm_loss": 0.7644, + "step": 9024, + "vm_loss": 0.1443 + }, + { + "epoch": 1.7371802584402145, + "lm_loss": 0.7777, + "step": 9024, + "vm_loss": 0.1499 + }, + { + "epoch": 1.7371802584402145, + "lm_loss": 0.5802, + "step": 9024, + "vm_loss": 0.1264 + }, + { + "epoch": 1.7371802584402145, + "lm_loss": 0.5636, + "step": 9024, + "vm_loss": 0.168 + }, + { + "epoch": 1.7371802584402145, + "lm_loss": 0.6826, + "step": 9024, + "vm_loss": 0.1854 + }, + { + "epoch": 1.7371802584402145, + "lm_loss": 0.5797, + "step": 9024, + "vm_loss": 0.175 + }, + { + "epoch": 1.7373727651177902, + "grad_norm": 3.2458183846599966, + "learning_rate": 8.894861413924261e-07, + "loss": 0.8509, + "step": 9025 + }, + { + "epoch": 1.7375652717953654, + "grad_norm": 3.1980997709650154, + "learning_rate": 8.88201097833733e-07, + "loss": 0.86, + "step": 9026 + }, + { + "epoch": 1.7377577784729408, + "grad_norm": 3.2613122498285825, + "learning_rate": 8.869169400584665e-07, + "loss": 0.8707, + "step": 9027 + }, + { + "epoch": 1.7379502851505162, + "grad_norm": 3.237536655993925, + "learning_rate": 8.856336681914646e-07, + "loss": 0.8592, + "step": 9028 + }, + { + "epoch": 1.7381427918280914, + "grad_norm": 3.148861421584768, + "learning_rate": 8.843512823574784e-07, + "loss": 0.8571, + "step": 9029 + }, + { + "epoch": 1.738335298505667, + "grad_norm": 3.4609060602842887, + "learning_rate": 8.830697826811674e-07, + "loss": 0.9073, + "step": 9030 + }, + { + "epoch": 1.7385278051832422, + "grad_norm": 3.299052295075287, + "learning_rate": 8.817891692871161e-07, + "loss": 0.8489, + "step": 9031 + }, + { + "epoch": 1.7387203118608177, + "grad_norm": 3.172806601968051, + "learning_rate": 8.805094422998117e-07, + "loss": 0.8576, + "step": 9032 + }, + { + "epoch": 1.7387203118608177, + "lm_loss": 0.7352, + "step": 9032, + "vm_loss": 0.1898 + }, + { + "epoch": 1.7387203118608177, + "lm_loss": 0.5268, + "step": 9032, + "vm_loss": 0.2251 + }, + { + "epoch": 1.7387203118608177, + "lm_loss": 1.0179, + "step": 9032, + "vm_loss": 0.0958 + }, + { + "epoch": 1.7387203118608177, + "lm_loss": 1.0287, + "step": 9032, + "vm_loss": 0.2244 + }, + { + "epoch": 1.7387203118608177, + "lm_loss": 0.5046, + "step": 9032, + "vm_loss": 0.1833 + }, + { + "epoch": 1.7387203118608177, + "lm_loss": 0.4661, + "step": 9032, + "vm_loss": 0.1204 + }, + { + "epoch": 1.7387203118608177, + "lm_loss": 0.8889, + "step": 9032, + "vm_loss": 0.1477 + }, + { + "epoch": 1.7387203118608177, + "lm_loss": 1.2154, + "step": 9032, + "vm_loss": 0.1455 + }, + { + "epoch": 1.738912818538393, + "grad_norm": 3.314951912785957, + "learning_rate": 8.792306018436625e-07, + "loss": 0.8696, + "step": 9033 + }, + { + "epoch": 1.7391053252159683, + "grad_norm": 3.1266469335019242, + "learning_rate": 8.779526480429868e-07, + "loss": 0.8237, + "step": 9034 + }, + { + "epoch": 1.739297831893544, + "grad_norm": 3.136114447931284, + "learning_rate": 8.766755810220195e-07, + "loss": 0.8163, + "step": 9035 + }, + { + "epoch": 1.7394903385711191, + "grad_norm": 3.269079562078465, + "learning_rate": 8.753994009049082e-07, + "loss": 0.837, + "step": 9036 + }, + { + "epoch": 1.7396828452486945, + "grad_norm": 3.160373829140482, + "learning_rate": 8.741241078157103e-07, + "loss": 0.8679, + "step": 9037 + }, + { + "epoch": 1.73987535192627, + "grad_norm": 3.437835693214052, + "learning_rate": 8.728497018784065e-07, + "loss": 0.8683, + "step": 9038 + }, + { + "epoch": 1.7400678586038452, + "grad_norm": 3.1756604690853942, + "learning_rate": 8.71576183216879e-07, + "loss": 0.8264, + "step": 9039 + }, + { + "epoch": 1.7402603652814208, + "grad_norm": 3.3231204425868346, + "learning_rate": 8.703035519549341e-07, + "loss": 0.8767, + "step": 9040 + }, + { + "epoch": 1.7402603652814208, + "lm_loss": 0.6751, + "step": 9040, + "vm_loss": 0.1829 + }, + { + "epoch": 1.7402603652814208, + "lm_loss": 0.5237, + "step": 9040, + "vm_loss": 0.197 + }, + { + "epoch": 1.7402603652814208, + "lm_loss": 0.6484, + "step": 9040, + "vm_loss": 0.1733 + }, + { + "epoch": 1.7402603652814208, + "lm_loss": 0.6107, + "step": 9040, + "vm_loss": 0.1663 + }, + { + "epoch": 1.7402603652814208, + "lm_loss": 0.554, + "step": 9040, + "vm_loss": 0.2036 + }, + { + "epoch": 1.7402603652814208, + "lm_loss": 0.3648, + "step": 9040, + "vm_loss": 0.1222 + }, + { + "epoch": 1.7402603652814208, + "lm_loss": 1.052, + "step": 9040, + "vm_loss": 0.1071 + }, + { + "epoch": 1.7402603652814208, + "lm_loss": 0.737, + "step": 9040, + "vm_loss": 0.122 + }, + { + "epoch": 1.740452871958996, + "grad_norm": 3.2486605147990266, + "learning_rate": 8.690318082162874e-07, + "loss": 0.8301, + "step": 9041 + }, + { + "epoch": 1.7406453786365714, + "grad_norm": 3.563130851788097, + "learning_rate": 8.677609521245689e-07, + "loss": 0.8896, + "step": 9042 + }, + { + "epoch": 1.7408378853141468, + "grad_norm": 3.2319477625742743, + "learning_rate": 8.664909838033198e-07, + "loss": 0.8334, + "step": 9043 + }, + { + "epoch": 1.741030391991722, + "grad_norm": 3.4206303934273006, + "learning_rate": 8.65221903375999e-07, + "loss": 0.8898, + "step": 9044 + }, + { + "epoch": 1.7412228986692977, + "grad_norm": 3.1336442818814834, + "learning_rate": 8.639537109659768e-07, + "loss": 0.8028, + "step": 9045 + }, + { + "epoch": 1.741415405346873, + "grad_norm": 3.340455889622764, + "learning_rate": 8.626864066965401e-07, + "loss": 0.8669, + "step": 9046 + }, + { + "epoch": 1.7416079120244483, + "grad_norm": 3.406345664781677, + "learning_rate": 8.614199906908816e-07, + "loss": 0.8765, + "step": 9047 + }, + { + "epoch": 1.7418004187020237, + "grad_norm": 3.3483306032355102, + "learning_rate": 8.601544630721203e-07, + "loss": 0.8655, + "step": 9048 + }, + { + "epoch": 1.7418004187020237, + "lm_loss": 0.8541, + "step": 9048, + "vm_loss": 0.1315 + }, + { + "epoch": 1.7418004187020237, + "lm_loss": 0.7562, + "step": 9048, + "vm_loss": 0.1247 + }, + { + "epoch": 1.7418004187020237, + "lm_loss": 0.8092, + "step": 9048, + "vm_loss": 0.1198 + }, + { + "epoch": 1.7418004187020237, + "lm_loss": 0.322, + "step": 9048, + "vm_loss": 0.1775 + }, + { + "epoch": 1.7418004187020237, + "lm_loss": 0.9984, + "step": 9048, + "vm_loss": 0.1959 + }, + { + "epoch": 1.7418004187020237, + "lm_loss": 0.7207, + "step": 9048, + "vm_loss": 0.1482 + }, + { + "epoch": 1.7418004187020237, + "lm_loss": 0.3962, + "step": 9048, + "vm_loss": 0.1547 + }, + { + "epoch": 1.7418004187020237, + "lm_loss": 0.6018, + "step": 9048, + "vm_loss": 0.1199 + }, + { + "epoch": 1.7419929253795992, + "grad_norm": 3.1300994839524665, + "learning_rate": 8.588898239632759e-07, + "loss": 0.8061, + "step": 9049 + }, + { + "epoch": 1.7421854320571746, + "grad_norm": 3.1097956310816732, + "learning_rate": 8.576260734872909e-07, + "loss": 0.8035, + "step": 9050 + }, + { + "epoch": 1.7423779387347498, + "grad_norm": 3.2945801950378395, + "learning_rate": 8.56363211767014e-07, + "loss": 0.8551, + "step": 9051 + }, + { + "epoch": 1.7425704454123252, + "grad_norm": 3.3061136927750594, + "learning_rate": 8.551012389252178e-07, + "loss": 0.8974, + "step": 9052 + }, + { + "epoch": 1.7427629520899006, + "grad_norm": 3.1961286135381513, + "learning_rate": 8.538401550845755e-07, + "loss": 0.8356, + "step": 9053 + }, + { + "epoch": 1.742955458767476, + "grad_norm": 3.240790488252749, + "learning_rate": 8.525799603676843e-07, + "loss": 0.8297, + "step": 9054 + }, + { + "epoch": 1.7431479654450515, + "grad_norm": 3.351525773253788, + "learning_rate": 8.51320654897051e-07, + "loss": 0.8515, + "step": 9055 + }, + { + "epoch": 1.7433404721226267, + "grad_norm": 3.2579584368283894, + "learning_rate": 8.500622387950963e-07, + "loss": 0.8392, + "step": 9056 + }, + { + "epoch": 1.7433404721226267, + "lm_loss": 0.6674, + "step": 9056, + "vm_loss": 0.1741 + }, + { + "epoch": 1.7433404721226267, + "lm_loss": 0.631, + "step": 9056, + "vm_loss": 0.1513 + }, + { + "epoch": 1.7433404721226267, + "lm_loss": 0.8712, + "step": 9056, + "vm_loss": 0.1271 + }, + { + "epoch": 1.7433404721226267, + "lm_loss": 0.5818, + "step": 9056, + "vm_loss": 0.107 + }, + { + "epoch": 1.7433404721226267, + "lm_loss": 0.7312, + "step": 9056, + "vm_loss": 0.1441 + }, + { + "epoch": 1.7433404721226267, + "lm_loss": 0.9161, + "step": 9056, + "vm_loss": 0.1731 + }, + { + "epoch": 1.7433404721226267, + "lm_loss": 0.6469, + "step": 9056, + "vm_loss": 0.206 + }, + { + "epoch": 1.7433404721226267, + "lm_loss": 0.9487, + "step": 9056, + "vm_loss": 0.1506 + }, + { + "epoch": 1.743532978800202, + "grad_norm": 3.1220706014664295, + "learning_rate": 8.488047121841525e-07, + "loss": 0.8607, + "step": 9057 + }, + { + "epoch": 1.7437254854777775, + "grad_norm": 3.2105965996865558, + "learning_rate": 8.475480751864695e-07, + "loss": 0.8236, + "step": 9058 + }, + { + "epoch": 1.743917992155353, + "grad_norm": 3.195784925865514, + "learning_rate": 8.462923279242097e-07, + "loss": 0.8522, + "step": 9059 + }, + { + "epoch": 1.7441104988329283, + "grad_norm": 3.281388049722419, + "learning_rate": 8.450374705194431e-07, + "loss": 0.8566, + "step": 9060 + }, + { + "epoch": 1.7443030055105035, + "grad_norm": 3.2844970868074728, + "learning_rate": 8.43783503094161e-07, + "loss": 0.8465, + "step": 9061 + }, + { + "epoch": 1.7444955121880792, + "grad_norm": 3.206643886525386, + "learning_rate": 8.425304257702649e-07, + "loss": 0.8393, + "step": 9062 + }, + { + "epoch": 1.7446880188656544, + "grad_norm": 3.0373121499613815, + "learning_rate": 8.412782386695706e-07, + "loss": 0.8228, + "step": 9063 + }, + { + "epoch": 1.7448805255432298, + "grad_norm": 3.3303288410085816, + "learning_rate": 8.40026941913803e-07, + "loss": 0.8821, + "step": 9064 + }, + { + "epoch": 1.7448805255432298, + "lm_loss": 0.79, + "step": 9064, + "vm_loss": 0.175 + }, + { + "epoch": 1.7448805255432298, + "lm_loss": 0.4584, + "step": 9064, + "vm_loss": 0.1273 + }, + { + "epoch": 1.7448805255432298, + "lm_loss": 0.7011, + "step": 9064, + "vm_loss": 0.1458 + }, + { + "epoch": 1.7448805255432298, + "lm_loss": 0.5816, + "step": 9064, + "vm_loss": 0.2272 + }, + { + "epoch": 1.7448805255432298, + "lm_loss": 0.614, + "step": 9064, + "vm_loss": 0.1636 + }, + { + "epoch": 1.7448805255432298, + "lm_loss": 0.4739, + "step": 9064, + "vm_loss": 0.1193 + }, + { + "epoch": 1.7448805255432298, + "lm_loss": 0.7292, + "step": 9064, + "vm_loss": 0.1323 + }, + { + "epoch": 1.7448805255432298, + "lm_loss": 0.618, + "step": 9064, + "vm_loss": 0.1809 + }, + { + "epoch": 1.7450730322208052, + "grad_norm": 3.252755256884406, + "learning_rate": 8.387765356246103e-07, + "loss": 0.8396, + "step": 9065 + }, + { + "epoch": 1.7452655388983804, + "grad_norm": 3.251938740101238, + "learning_rate": 8.37527019923543e-07, + "loss": 0.8439, + "step": 9066 + }, + { + "epoch": 1.745458045575956, + "grad_norm": 3.4793442412046156, + "learning_rate": 8.36278394932073e-07, + "loss": 0.8608, + "step": 9067 + }, + { + "epoch": 1.7456505522535313, + "grad_norm": 3.135055004362577, + "learning_rate": 8.350306607715774e-07, + "loss": 0.8213, + "step": 9068 + }, + { + "epoch": 1.7458430589311067, + "grad_norm": 3.3190710950868585, + "learning_rate": 8.337838175633595e-07, + "loss": 0.8737, + "step": 9069 + }, + { + "epoch": 1.7460355656086821, + "grad_norm": 3.242552754421227, + "learning_rate": 8.325378654286231e-07, + "loss": 0.835, + "step": 9070 + }, + { + "epoch": 1.7462280722862573, + "grad_norm": 3.015522336621416, + "learning_rate": 8.312928044884927e-07, + "loss": 0.8116, + "step": 9071 + }, + { + "epoch": 1.746420578963833, + "grad_norm": 3.3116097643827125, + "learning_rate": 8.300486348640036e-07, + "loss": 0.8833, + "step": 9072 + }, + { + "epoch": 1.746420578963833, + "lm_loss": 0.6108, + "step": 9072, + "vm_loss": 0.1222 + }, + { + "epoch": 1.746420578963833, + "lm_loss": 0.9349, + "step": 9072, + "vm_loss": 0.16 + }, + { + "epoch": 1.746420578963833, + "lm_loss": 0.8165, + "step": 9072, + "vm_loss": 0.2001 + }, + { + "epoch": 1.746420578963833, + "lm_loss": 0.7814, + "step": 9072, + "vm_loss": 0.1267 + }, + { + "epoch": 1.746420578963833, + "lm_loss": 0.544, + "step": 9072, + "vm_loss": 0.1068 + }, + { + "epoch": 1.746420578963833, + "lm_loss": 0.5652, + "step": 9072, + "vm_loss": 0.114 + }, + { + "epoch": 1.746420578963833, + "lm_loss": 0.722, + "step": 9072, + "vm_loss": 0.0874 + }, + { + "epoch": 1.746420578963833, + "lm_loss": 0.7936, + "step": 9072, + "vm_loss": 0.1937 + }, + { + "epoch": 1.7466130856414082, + "grad_norm": 3.331721946012661, + "learning_rate": 8.288053566761067e-07, + "loss": 0.8773, + "step": 9073 + }, + { + "epoch": 1.7468055923189836, + "grad_norm": 3.1867124841193535, + "learning_rate": 8.275629700456588e-07, + "loss": 0.8513, + "step": 9074 + }, + { + "epoch": 1.746998098996559, + "grad_norm": 3.2228344828256517, + "learning_rate": 8.263214750934445e-07, + "loss": 0.8203, + "step": 9075 + }, + { + "epoch": 1.7471906056741342, + "grad_norm": 3.3618597439193683, + "learning_rate": 8.25080871940146e-07, + "loss": 0.8774, + "step": 9076 + }, + { + "epoch": 1.7473831123517098, + "grad_norm": 3.334811195752593, + "learning_rate": 8.238411607063678e-07, + "loss": 0.8588, + "step": 9077 + }, + { + "epoch": 1.747575619029285, + "grad_norm": 3.2305183102321933, + "learning_rate": 8.226023415126272e-07, + "loss": 0.8491, + "step": 9078 + }, + { + "epoch": 1.7477681257068605, + "grad_norm": 3.453126163810074, + "learning_rate": 8.21364414479352e-07, + "loss": 0.8949, + "step": 9079 + }, + { + "epoch": 1.7479606323844359, + "grad_norm": 3.3416361802800747, + "learning_rate": 8.201273797268861e-07, + "loss": 0.8731, + "step": 9080 + }, + { + "epoch": 1.7479606323844359, + "lm_loss": 0.7205, + "step": 9080, + "vm_loss": 0.1937 + }, + { + "epoch": 1.7479606323844359, + "lm_loss": 0.6912, + "step": 9080, + "vm_loss": 0.1695 + }, + { + "epoch": 1.7479606323844359, + "lm_loss": 0.4405, + "step": 9080, + "vm_loss": 0.2039 + }, + { + "epoch": 1.7479606323844359, + "lm_loss": 0.6079, + "step": 9080, + "vm_loss": 0.1546 + }, + { + "epoch": 1.7479606323844359, + "lm_loss": 0.5921, + "step": 9080, + "vm_loss": 0.2101 + }, + { + "epoch": 1.7479606323844359, + "lm_loss": 0.4593, + "step": 9080, + "vm_loss": 0.1569 + }, + { + "epoch": 1.7479606323844359, + "lm_loss": 0.771, + "step": 9080, + "vm_loss": 0.1606 + }, + { + "epoch": 1.7479606323844359, + "lm_loss": 0.7845, + "step": 9080, + "vm_loss": 0.2342 + }, + { + "epoch": 1.748153139062011, + "grad_norm": 3.207164905035659, + "learning_rate": 8.188912373754809e-07, + "loss": 0.8605, + "step": 9081 + }, + { + "epoch": 1.7483456457395867, + "grad_norm": 3.1780833176196768, + "learning_rate": 8.176559875453105e-07, + "loss": 0.8461, + "step": 9082 + }, + { + "epoch": 1.748538152417162, + "grad_norm": 3.4602807663147326, + "learning_rate": 8.164216303564532e-07, + "loss": 0.8661, + "step": 9083 + }, + { + "epoch": 1.7487306590947373, + "grad_norm": 3.3004493534347956, + "learning_rate": 8.151881659289085e-07, + "loss": 0.84, + "step": 9084 + }, + { + "epoch": 1.7489231657723128, + "grad_norm": 3.1962456191630015, + "learning_rate": 8.139555943825772e-07, + "loss": 0.8219, + "step": 9085 + }, + { + "epoch": 1.749115672449888, + "grad_norm": 3.3780791355413933, + "learning_rate": 8.127239158372902e-07, + "loss": 0.8564, + "step": 9086 + }, + { + "epoch": 1.7493081791274636, + "grad_norm": 3.075108601875423, + "learning_rate": 8.11493130412776e-07, + "loss": 0.8363, + "step": 9087 + }, + { + "epoch": 1.7495006858050388, + "grad_norm": 3.0302134505164555, + "learning_rate": 8.102632382286857e-07, + "loss": 0.8003, + "step": 9088 + }, + { + "epoch": 1.7495006858050388, + "lm_loss": 0.942, + "step": 9088, + "vm_loss": 0.2652 + }, + { + "epoch": 1.7495006858050388, + "lm_loss": 0.9533, + "step": 9088, + "vm_loss": 0.1399 + }, + { + "epoch": 1.7495006858050388, + "lm_loss": 0.7464, + "step": 9088, + "vm_loss": 0.1708 + }, + { + "epoch": 1.7495006858050388, + "lm_loss": 0.3457, + "step": 9088, + "vm_loss": 0.1025 + }, + { + "epoch": 1.7495006858050388, + "lm_loss": 0.5487, + "step": 9088, + "vm_loss": 0.1928 + }, + { + "epoch": 1.7495006858050388, + "lm_loss": 0.5846, + "step": 9088, + "vm_loss": 0.1589 + }, + { + "epoch": 1.7495006858050388, + "lm_loss": 0.4768, + "step": 9088, + "vm_loss": 0.1312 + }, + { + "epoch": 1.7495006858050388, + "lm_loss": 0.7956, + "step": 9088, + "vm_loss": 0.158 + }, + { + "epoch": 1.7496931924826142, + "grad_norm": 3.3065935870386256, + "learning_rate": 8.090342394045792e-07, + "loss": 0.8706, + "step": 9089 + }, + { + "epoch": 1.7498856991601897, + "grad_norm": 3.260407904345146, + "learning_rate": 8.07806134059933e-07, + "loss": 0.8845, + "step": 9090 + }, + { + "epoch": 1.7500782058377649, + "grad_norm": 3.0455748294240657, + "learning_rate": 8.065789223141318e-07, + "loss": 0.7834, + "step": 9091 + }, + { + "epoch": 1.7502707125153405, + "grad_norm": 3.324375831743756, + "learning_rate": 8.053526042864768e-07, + "loss": 0.8782, + "step": 9092 + }, + { + "epoch": 1.7504632191929157, + "grad_norm": 3.268674086627398, + "learning_rate": 8.041271800961836e-07, + "loss": 0.8401, + "step": 9093 + }, + { + "epoch": 1.7506557258704911, + "grad_norm": 3.381288763297877, + "learning_rate": 8.029026498623771e-07, + "loss": 0.9171, + "step": 9094 + }, + { + "epoch": 1.7508482325480665, + "grad_norm": 3.2580216589811895, + "learning_rate": 8.016790137040987e-07, + "loss": 0.8288, + "step": 9095 + }, + { + "epoch": 1.7510407392256417, + "grad_norm": 3.1325119015889236, + "learning_rate": 8.004562717403019e-07, + "loss": 0.7997, + "step": 9096 + }, + { + "epoch": 1.7510407392256417, + "lm_loss": 0.416, + "step": 9096, + "vm_loss": 0.1848 + }, + { + "epoch": 1.7510407392256417, + "lm_loss": 0.7128, + "step": 9096, + "vm_loss": 0.1723 + }, + { + "epoch": 1.7510407392256417, + "lm_loss": 1.0784, + "step": 9096, + "vm_loss": 0.1754 + }, + { + "epoch": 1.7510407392256417, + "lm_loss": 0.812, + "step": 9096, + "vm_loss": 0.141 + }, + { + "epoch": 1.7510407392256417, + "lm_loss": 0.7978, + "step": 9096, + "vm_loss": 0.203 + }, + { + "epoch": 1.7510407392256417, + "lm_loss": 0.6181, + "step": 9096, + "vm_loss": 0.1452 + }, + { + "epoch": 1.7510407392256417, + "lm_loss": 0.5631, + "step": 9096, + "vm_loss": 0.1297 + }, + { + "epoch": 1.7510407392256417, + "lm_loss": 0.9242, + "step": 9096, + "vm_loss": 0.1097 + }, + { + "epoch": 1.7512332459032174, + "grad_norm": 3.289409425976474, + "learning_rate": 7.992344240898519e-07, + "loss": 0.8586, + "step": 9097 + }, + { + "epoch": 1.7514257525807926, + "grad_norm": 3.2749794034205277, + "learning_rate": 7.98013470871527e-07, + "loss": 0.8573, + "step": 9098 + }, + { + "epoch": 1.751618259258368, + "grad_norm": 3.2429217630526708, + "learning_rate": 7.967934122040211e-07, + "loss": 0.8484, + "step": 9099 + }, + { + "epoch": 1.7518107659359434, + "grad_norm": 3.307285119440503, + "learning_rate": 7.955742482059381e-07, + "loss": 0.8769, + "step": 9100 + }, + { + "epoch": 1.7520032726135186, + "grad_norm": 3.200233189974417, + "learning_rate": 7.943559789957989e-07, + "loss": 0.8481, + "step": 9101 + }, + { + "epoch": 1.7521957792910943, + "grad_norm": 3.0708415896375816, + "learning_rate": 7.931386046920308e-07, + "loss": 0.8212, + "step": 9102 + }, + { + "epoch": 1.7523882859686695, + "grad_norm": 3.3267629971676937, + "learning_rate": 7.919221254129828e-07, + "loss": 0.8692, + "step": 9103 + }, + { + "epoch": 1.752580792646245, + "grad_norm": 3.087116651086049, + "learning_rate": 7.907065412769089e-07, + "loss": 0.7933, + "step": 9104 + }, + { + "epoch": 1.752580792646245, + "lm_loss": 0.7309, + "step": 9104, + "vm_loss": 0.1317 + }, + { + "epoch": 1.752580792646245, + "lm_loss": 0.6091, + "step": 9104, + "vm_loss": 0.1593 + }, + { + "epoch": 1.752580792646245, + "lm_loss": 0.9113, + "step": 9104, + "vm_loss": 0.1589 + }, + { + "epoch": 1.752580792646245, + "lm_loss": 0.7242, + "step": 9104, + "vm_loss": 0.2078 + }, + { + "epoch": 1.752580792646245, + "lm_loss": 1.087, + "step": 9104, + "vm_loss": 0.1561 + }, + { + "epoch": 1.752580792646245, + "lm_loss": 0.5255, + "step": 9104, + "vm_loss": 0.1175 + }, + { + "epoch": 1.752580792646245, + "lm_loss": 0.4267, + "step": 9104, + "vm_loss": 0.1151 + }, + { + "epoch": 1.752580792646245, + "lm_loss": 0.4205, + "step": 9104, + "vm_loss": 0.1398 + }, + { + "epoch": 1.7527732993238203, + "grad_norm": 3.147319210056095, + "learning_rate": 7.894918524019801e-07, + "loss": 0.8489, + "step": 9105 + }, + { + "epoch": 1.7529658060013955, + "grad_norm": 3.2160016766072737, + "learning_rate": 7.8827805890628e-07, + "loss": 0.8693, + "step": 9106 + }, + { + "epoch": 1.7531583126789712, + "grad_norm": 3.3616927221777626, + "learning_rate": 7.870651609078072e-07, + "loss": 0.8685, + "step": 9107 + }, + { + "epoch": 1.7533508193565464, + "grad_norm": 3.189108107766812, + "learning_rate": 7.858531585244655e-07, + "loss": 0.8495, + "step": 9108 + }, + { + "epoch": 1.7535433260341218, + "grad_norm": 3.227291159284179, + "learning_rate": 7.846420518740816e-07, + "loss": 0.8476, + "step": 9109 + }, + { + "epoch": 1.7537358327116972, + "grad_norm": 3.404899730000119, + "learning_rate": 7.834318410743891e-07, + "loss": 0.881, + "step": 9110 + }, + { + "epoch": 1.7539283393892726, + "grad_norm": 3.2260613862778578, + "learning_rate": 7.822225262430372e-07, + "loss": 0.8168, + "step": 9111 + }, + { + "epoch": 1.754120846066848, + "grad_norm": 3.165097971887034, + "learning_rate": 7.810141074975819e-07, + "loss": 0.8567, + "step": 9112 + }, + { + "epoch": 1.754120846066848, + "lm_loss": 0.7681, + "step": 9112, + "vm_loss": 0.1757 + }, + { + "epoch": 1.754120846066848, + "lm_loss": 0.2385, + "step": 9112, + "vm_loss": 0.1637 + }, + { + "epoch": 1.754120846066848, + "lm_loss": 0.6057, + "step": 9112, + "vm_loss": 0.1299 + }, + { + "epoch": 1.754120846066848, + "lm_loss": 0.7278, + "step": 9112, + "vm_loss": 0.1498 + }, + { + "epoch": 1.754120846066848, + "lm_loss": 0.9652, + "step": 9112, + "vm_loss": 0.1985 + }, + { + "epoch": 1.754120846066848, + "lm_loss": 1.06, + "step": 9112, + "vm_loss": 0.1565 + }, + { + "epoch": 1.754120846066848, + "lm_loss": 0.6473, + "step": 9112, + "vm_loss": 0.1792 + }, + { + "epoch": 1.754120846066848, + "lm_loss": 0.6349, + "step": 9112, + "vm_loss": 0.1726 + }, + { + "epoch": 1.7543133527444232, + "grad_norm": 3.1793618915258604, + "learning_rate": 7.798065849555037e-07, + "loss": 0.8427, + "step": 9113 + }, + { + "epoch": 1.7545058594219987, + "grad_norm": 3.3418824878281117, + "learning_rate": 7.785999587341853e-07, + "loss": 0.8785, + "step": 9114 + }, + { + "epoch": 1.754698366099574, + "grad_norm": 3.2938074926718293, + "learning_rate": 7.773942289509273e-07, + "loss": 0.8469, + "step": 9115 + }, + { + "epoch": 1.7548908727771495, + "grad_norm": 3.367803528827594, + "learning_rate": 7.761893957229405e-07, + "loss": 0.8718, + "step": 9116 + }, + { + "epoch": 1.755083379454725, + "grad_norm": 3.2042090946252206, + "learning_rate": 7.749854591673522e-07, + "loss": 0.8914, + "step": 9117 + }, + { + "epoch": 1.7552758861323001, + "grad_norm": 3.1210756403809903, + "learning_rate": 7.737824194012022e-07, + "loss": 0.8276, + "step": 9118 + }, + { + "epoch": 1.7554683928098755, + "grad_norm": 3.121507160293842, + "learning_rate": 7.725802765414359e-07, + "loss": 0.826, + "step": 9119 + }, + { + "epoch": 1.755660899487451, + "grad_norm": 3.2705283557820453, + "learning_rate": 7.713790307049207e-07, + "loss": 0.8562, + "step": 9120 + }, + { + "epoch": 1.755660899487451, + "lm_loss": 0.8927, + "step": 9120, + "vm_loss": 0.1948 + }, + { + "epoch": 1.755660899487451, + "lm_loss": 0.5068, + "step": 9120, + "vm_loss": 0.1747 + }, + { + "epoch": 1.755660899487451, + "lm_loss": 0.4187, + "step": 9120, + "vm_loss": 0.169 + }, + { + "epoch": 1.755660899487451, + "lm_loss": 0.6097, + "step": 9120, + "vm_loss": 0.1271 + }, + { + "epoch": 1.755660899487451, + "lm_loss": 0.909, + "step": 9120, + "vm_loss": 0.1497 + }, + { + "epoch": 1.755660899487451, + "lm_loss": 0.39, + "step": 9120, + "vm_loss": 0.1131 + }, + { + "epoch": 1.755660899487451, + "lm_loss": 0.5906, + "step": 9120, + "vm_loss": 0.1108 + }, + { + "epoch": 1.755660899487451, + "lm_loss": 0.4717, + "step": 9120, + "vm_loss": 0.1933 + }, + { + "epoch": 1.7558534061650264, + "grad_norm": 3.3482470951177246, + "learning_rate": 7.701786820084312e-07, + "loss": 0.8565, + "step": 9121 + }, + { + "epoch": 1.7560459128426018, + "grad_norm": 3.0913141418467007, + "learning_rate": 7.689792305686583e-07, + "loss": 0.82, + "step": 9122 + }, + { + "epoch": 1.756238419520177, + "grad_norm": 3.4185808812093708, + "learning_rate": 7.677806765022044e-07, + "loss": 0.8669, + "step": 9123 + }, + { + "epoch": 1.7564309261977527, + "grad_norm": 3.301512907921233, + "learning_rate": 7.665830199255842e-07, + "loss": 0.8959, + "step": 9124 + }, + { + "epoch": 1.7566234328753279, + "grad_norm": 3.3372726100674326, + "learning_rate": 7.653862609552221e-07, + "loss": 0.8926, + "step": 9125 + }, + { + "epoch": 1.7568159395529033, + "grad_norm": 3.202024410833478, + "learning_rate": 7.641903997074607e-07, + "loss": 0.8822, + "step": 9126 + }, + { + "epoch": 1.7570084462304787, + "grad_norm": 3.1979002553443014, + "learning_rate": 7.629954362985537e-07, + "loss": 0.8324, + "step": 9127 + }, + { + "epoch": 1.757200952908054, + "grad_norm": 3.176683496714559, + "learning_rate": 7.61801370844667e-07, + "loss": 0.8934, + "step": 9128 + }, + { + "epoch": 1.757200952908054, + "lm_loss": 0.3902, + "step": 9128, + "vm_loss": 0.1881 + }, + { + "epoch": 1.757200952908054, + "lm_loss": 0.4609, + "step": 9128, + "vm_loss": 0.1078 + }, + { + "epoch": 1.757200952908054, + "lm_loss": 0.5858, + "step": 9128, + "vm_loss": 0.1753 + }, + { + "epoch": 1.757200952908054, + "lm_loss": 0.7817, + "step": 9128, + "vm_loss": 0.1747 + }, + { + "epoch": 1.757200952908054, + "lm_loss": 0.9017, + "step": 9128, + "vm_loss": 0.238 + }, + { + "epoch": 1.757200952908054, + "lm_loss": 0.8686, + "step": 9128, + "vm_loss": 0.1274 + }, + { + "epoch": 1.757200952908054, + "lm_loss": 0.9927, + "step": 9128, + "vm_loss": 0.1209 + }, + { + "epoch": 1.757200952908054, + "lm_loss": 0.4221, + "step": 9128, + "vm_loss": 0.1142 + }, + { + "epoch": 1.7573934595856295, + "grad_norm": 3.3587036790139146, + "learning_rate": 7.606082034618756e-07, + "loss": 0.8737, + "step": 9129 + }, + { + "epoch": 1.7575859662632047, + "grad_norm": 3.1150339691544455, + "learning_rate": 7.594159342661767e-07, + "loss": 0.8039, + "step": 9130 + }, + { + "epoch": 1.7577784729407802, + "grad_norm": 3.015202813482452, + "learning_rate": 7.582245633734675e-07, + "loss": 0.794, + "step": 9131 + }, + { + "epoch": 1.7579709796183556, + "grad_norm": 3.226497666892498, + "learning_rate": 7.570340908995688e-07, + "loss": 0.8246, + "step": 9132 + }, + { + "epoch": 1.7581634862959308, + "grad_norm": 3.3910305075678546, + "learning_rate": 7.55844516960208e-07, + "loss": 0.8955, + "step": 9133 + }, + { + "epoch": 1.7583559929735064, + "grad_norm": 3.2541383324943576, + "learning_rate": 7.546558416710292e-07, + "loss": 0.8643, + "step": 9134 + }, + { + "epoch": 1.7585484996510816, + "grad_norm": 3.290207772610275, + "learning_rate": 7.534680651475834e-07, + "loss": 0.8428, + "step": 9135 + }, + { + "epoch": 1.758741006328657, + "grad_norm": 3.3118627213867504, + "learning_rate": 7.522811875053392e-07, + "loss": 0.8406, + "step": 9136 + }, + { + "epoch": 1.758741006328657, + "lm_loss": 0.7673, + "step": 9136, + "vm_loss": 0.1348 + }, + { + "epoch": 1.758741006328657, + "lm_loss": 0.6391, + "step": 9136, + "vm_loss": 0.1705 + }, + { + "epoch": 1.758741006328657, + "lm_loss": 0.9093, + "step": 9136, + "vm_loss": 0.2185 + }, + { + "epoch": 1.758741006328657, + "lm_loss": 0.7018, + "step": 9136, + "vm_loss": 0.1641 + }, + { + "epoch": 1.758741006328657, + "lm_loss": 0.2952, + "step": 9136, + "vm_loss": 0.1228 + }, + { + "epoch": 1.758741006328657, + "lm_loss": 0.7054, + "step": 9136, + "vm_loss": 0.1826 + }, + { + "epoch": 1.758741006328657, + "lm_loss": 0.7648, + "step": 9136, + "vm_loss": 0.1749 + }, + { + "epoch": 1.758741006328657, + "lm_loss": 0.73, + "step": 9136, + "vm_loss": 0.206 + }, + { + "epoch": 1.7589335130062325, + "grad_norm": 3.1548456763327217, + "learning_rate": 7.510952088596768e-07, + "loss": 0.8672, + "step": 9137 + }, + { + "epoch": 1.7591260196838077, + "grad_norm": 3.162241128142295, + "learning_rate": 7.499101293258881e-07, + "loss": 0.8168, + "step": 9138 + }, + { + "epoch": 1.7593185263613833, + "grad_norm": 3.127715819120615, + "learning_rate": 7.487259490191778e-07, + "loss": 0.8166, + "step": 9139 + }, + { + "epoch": 1.7595110330389585, + "grad_norm": 3.265498768933632, + "learning_rate": 7.47542668054666e-07, + "loss": 0.8607, + "step": 9140 + }, + { + "epoch": 1.759703539716534, + "grad_norm": 3.4265180534507675, + "learning_rate": 7.463602865473774e-07, + "loss": 0.8835, + "step": 9141 + }, + { + "epoch": 1.7598960463941093, + "grad_norm": 3.4210355309762432, + "learning_rate": 7.451788046122588e-07, + "loss": 0.8894, + "step": 9142 + }, + { + "epoch": 1.7600885530716845, + "grad_norm": 3.2067604294346888, + "learning_rate": 7.439982223641629e-07, + "loss": 0.836, + "step": 9143 + }, + { + "epoch": 1.7602810597492602, + "grad_norm": 3.2678172642125083, + "learning_rate": 7.428185399178601e-07, + "loss": 0.8891, + "step": 9144 + }, + { + "epoch": 1.7602810597492602, + "lm_loss": 0.5732, + "step": 9144, + "vm_loss": 0.2252 + }, + { + "epoch": 1.7602810597492602, + "lm_loss": 0.6637, + "step": 9144, + "vm_loss": 0.1772 + }, + { + "epoch": 1.7602810597492602, + "lm_loss": 0.9585, + "step": 9144, + "vm_loss": 0.1858 + }, + { + "epoch": 1.7602810597492602, + "lm_loss": 0.6224, + "step": 9144, + "vm_loss": 0.1836 + }, + { + "epoch": 1.7602810597492602, + "lm_loss": 0.6682, + "step": 9144, + "vm_loss": 0.1617 + }, + { + "epoch": 1.7602810597492602, + "lm_loss": 0.6115, + "step": 9144, + "vm_loss": 0.2027 + }, + { + "epoch": 1.7602810597492602, + "lm_loss": 0.5402, + "step": 9144, + "vm_loss": 0.1852 + }, + { + "epoch": 1.7602810597492602, + "lm_loss": 0.6333, + "step": 9144, + "vm_loss": 0.2137 + }, + { + "epoch": 1.7604735664268354, + "grad_norm": 3.406833009683654, + "learning_rate": 7.416397573880296e-07, + "loss": 0.9269, + "step": 9145 + }, + { + "epoch": 1.7606660731044108, + "grad_norm": 3.3343640162487835, + "learning_rate": 7.404618748892612e-07, + "loss": 0.879, + "step": 9146 + }, + { + "epoch": 1.7608585797819862, + "grad_norm": 3.4432827897339013, + "learning_rate": 7.392848925360651e-07, + "loss": 0.8682, + "step": 9147 + }, + { + "epoch": 1.7610510864595614, + "grad_norm": 3.1274069419447903, + "learning_rate": 7.381088104428558e-07, + "loss": 0.8222, + "step": 9148 + }, + { + "epoch": 1.761243593137137, + "grad_norm": 3.1699410686943543, + "learning_rate": 7.369336287239637e-07, + "loss": 0.8422, + "step": 9149 + }, + { + "epoch": 1.7614360998147123, + "grad_norm": 3.22319441201873, + "learning_rate": 7.35759347493632e-07, + "loss": 0.8326, + "step": 9150 + }, + { + "epoch": 1.7616286064922877, + "grad_norm": 3.06259210551004, + "learning_rate": 7.345859668660183e-07, + "loss": 0.7956, + "step": 9151 + }, + { + "epoch": 1.7618211131698631, + "grad_norm": 3.1027046636356843, + "learning_rate": 7.334134869551857e-07, + "loss": 0.8484, + "step": 9152 + }, + { + "epoch": 1.7618211131698631, + "lm_loss": 0.5319, + "step": 9152, + "vm_loss": 0.2124 + }, + { + "epoch": 1.7618211131698631, + "lm_loss": 0.6245, + "step": 9152, + "vm_loss": 0.1371 + }, + { + "epoch": 1.7618211131698631, + "lm_loss": 0.6299, + "step": 9152, + "vm_loss": 0.2012 + }, + { + "epoch": 1.7618211131698631, + "lm_loss": 0.7788, + "step": 9152, + "vm_loss": 0.1876 + }, + { + "epoch": 1.7618211131698631, + "lm_loss": 0.4139, + "step": 9152, + "vm_loss": 0.1724 + }, + { + "epoch": 1.7618211131698631, + "lm_loss": 0.9143, + "step": 9152, + "vm_loss": 0.1727 + }, + { + "epoch": 1.7618211131698631, + "lm_loss": 0.7375, + "step": 9152, + "vm_loss": 0.1323 + }, + { + "epoch": 1.7618211131698631, + "lm_loss": 0.6463, + "step": 9152, + "vm_loss": 0.1076 + }, + { + "epoch": 1.7620136198474383, + "grad_norm": 3.1839745482976185, + "learning_rate": 7.322419078751164e-07, + "loss": 0.8494, + "step": 9153 + }, + { + "epoch": 1.762206126525014, + "grad_norm": 3.271344556112259, + "learning_rate": 7.310712297397038e-07, + "loss": 0.8708, + "step": 9154 + }, + { + "epoch": 1.7623986332025892, + "grad_norm": 3.255038698813587, + "learning_rate": 7.299014526627523e-07, + "loss": 0.8741, + "step": 9155 + }, + { + "epoch": 1.7625911398801646, + "grad_norm": 3.293389916610071, + "learning_rate": 7.287325767579756e-07, + "loss": 0.8431, + "step": 9156 + }, + { + "epoch": 1.76278364655774, + "grad_norm": 3.3721474026357, + "learning_rate": 7.275646021390103e-07, + "loss": 0.8681, + "step": 9157 + }, + { + "epoch": 1.7629761532353152, + "grad_norm": 3.1726378324181828, + "learning_rate": 7.263975289193937e-07, + "loss": 0.8291, + "step": 9158 + }, + { + "epoch": 1.7631686599128908, + "grad_norm": 3.3492187415204926, + "learning_rate": 7.252313572125802e-07, + "loss": 0.877, + "step": 9159 + }, + { + "epoch": 1.763361166590466, + "grad_norm": 3.222152139483039, + "learning_rate": 7.240660871319383e-07, + "loss": 0.8315, + "step": 9160 + }, + { + "epoch": 1.763361166590466, + "lm_loss": 0.4679, + "step": 9160, + "vm_loss": 0.1919 + }, + { + "epoch": 1.763361166590466, + "lm_loss": 0.8754, + "step": 9160, + "vm_loss": 0.157 + }, + { + "epoch": 1.763361166590466, + "lm_loss": 0.8859, + "step": 9160, + "vm_loss": 0.134 + }, + { + "epoch": 1.763361166590466, + "lm_loss": 1.2349, + "step": 9160, + "vm_loss": 0.1625 + }, + { + "epoch": 1.763361166590466, + "lm_loss": 0.5745, + "step": 9160, + "vm_loss": 0.1618 + }, + { + "epoch": 1.763361166590466, + "lm_loss": 0.7343, + "step": 9160, + "vm_loss": 0.1514 + }, + { + "epoch": 1.763361166590466, + "lm_loss": 0.7035, + "step": 9160, + "vm_loss": 0.3009 + }, + { + "epoch": 1.763361166590466, + "lm_loss": 0.5711, + "step": 9160, + "vm_loss": 0.1591 + }, + { + "epoch": 1.7635536732680415, + "grad_norm": 3.1990832920156067, + "learning_rate": 7.229017187907473e-07, + "loss": 0.8512, + "step": 9161 + }, + { + "epoch": 1.7637461799456169, + "grad_norm": 3.2216247539173155, + "learning_rate": 7.217382523021988e-07, + "loss": 0.8391, + "step": 9162 + }, + { + "epoch": 1.763938686623192, + "grad_norm": 3.2241713916064523, + "learning_rate": 7.205756877793935e-07, + "loss": 0.8453, + "step": 9163 + }, + { + "epoch": 1.7641311933007677, + "grad_norm": 3.5027491028638207, + "learning_rate": 7.19414025335351e-07, + "loss": 0.8726, + "step": 9164 + }, + { + "epoch": 1.764323699978343, + "grad_norm": 3.3271537162747262, + "learning_rate": 7.182532650829976e-07, + "loss": 0.8964, + "step": 9165 + }, + { + "epoch": 1.7645162066559184, + "grad_norm": 3.212824167357, + "learning_rate": 7.170934071351754e-07, + "loss": 0.805, + "step": 9166 + }, + { + "epoch": 1.7647087133334938, + "grad_norm": 3.0850974213135967, + "learning_rate": 7.159344516046373e-07, + "loss": 0.8255, + "step": 9167 + }, + { + "epoch": 1.764901220011069, + "grad_norm": 3.025587768556213, + "learning_rate": 7.147763986040502e-07, + "loss": 0.7902, + "step": 9168 + }, + { + "epoch": 1.764901220011069, + "lm_loss": 0.6313, + "step": 9168, + "vm_loss": 0.2114 + }, + { + "epoch": 1.764901220011069, + "lm_loss": 0.6482, + "step": 9168, + "vm_loss": 0.1188 + }, + { + "epoch": 1.764901220011069, + "lm_loss": 0.5218, + "step": 9168, + "vm_loss": 0.1848 + }, + { + "epoch": 1.764901220011069, + "lm_loss": 0.4549, + "step": 9168, + "vm_loss": 0.1235 + }, + { + "epoch": 1.764901220011069, + "lm_loss": 0.5733, + "step": 9168, + "vm_loss": 0.1508 + }, + { + "epoch": 1.764901220011069, + "lm_loss": 0.7217, + "step": 9168, + "vm_loss": 0.1625 + }, + { + "epoch": 1.764901220011069, + "lm_loss": 0.3933, + "step": 9168, + "vm_loss": 0.1131 + }, + { + "epoch": 1.764901220011069, + "lm_loss": 0.9583, + "step": 9168, + "vm_loss": 0.1955 + }, + { + "epoch": 1.7650937266886446, + "grad_norm": 3.296325596838556, + "learning_rate": 7.136192482459881e-07, + "loss": 0.818, + "step": 9169 + }, + { + "epoch": 1.7652862333662198, + "grad_norm": 3.187321619404271, + "learning_rate": 7.124630006429423e-07, + "loss": 0.8105, + "step": 9170 + }, + { + "epoch": 1.7654787400437952, + "grad_norm": 3.128016239097787, + "learning_rate": 7.113076559073151e-07, + "loss": 0.8035, + "step": 9171 + }, + { + "epoch": 1.7656712467213707, + "grad_norm": 3.3196514589510953, + "learning_rate": 7.101532141514233e-07, + "loss": 0.9161, + "step": 9172 + }, + { + "epoch": 1.765863753398946, + "grad_norm": 3.224783099812493, + "learning_rate": 7.089996754874873e-07, + "loss": 0.8145, + "step": 9173 + }, + { + "epoch": 1.7660562600765215, + "grad_norm": 3.2808551712724916, + "learning_rate": 7.078470400276516e-07, + "loss": 0.8439, + "step": 9174 + }, + { + "epoch": 1.7662487667540967, + "grad_norm": 3.6337028868206334, + "learning_rate": 7.066953078839645e-07, + "loss": 0.9041, + "step": 9175 + }, + { + "epoch": 1.7664412734316721, + "grad_norm": 3.3396603853352307, + "learning_rate": 7.055444791683907e-07, + "loss": 0.8918, + "step": 9176 + }, + { + "epoch": 1.7664412734316721, + "lm_loss": 0.5954, + "step": 9176, + "vm_loss": 0.1328 + }, + { + "epoch": 1.7664412734316721, + "lm_loss": 0.874, + "step": 9176, + "vm_loss": 0.1263 + }, + { + "epoch": 1.7664412734316721, + "lm_loss": 1.1141, + "step": 9176, + "vm_loss": 0.158 + }, + { + "epoch": 1.7664412734316721, + "lm_loss": 0.8647, + "step": 9176, + "vm_loss": 0.1457 + }, + { + "epoch": 1.7664412734316721, + "lm_loss": 1.0597, + "step": 9176, + "vm_loss": 0.1999 + }, + { + "epoch": 1.7664412734316721, + "lm_loss": 0.6791, + "step": 9176, + "vm_loss": 0.147 + }, + { + "epoch": 1.7664412734316721, + "lm_loss": 0.3902, + "step": 9176, + "vm_loss": 0.225 + }, + { + "epoch": 1.7664412734316721, + "lm_loss": 0.6888, + "step": 9176, + "vm_loss": 0.1828 + }, + { + "epoch": 1.7666337801092475, + "grad_norm": 3.5058409873416636, + "learning_rate": 7.04394553992801e-07, + "loss": 0.9071, + "step": 9177 + }, + { + "epoch": 1.766826286786823, + "grad_norm": 3.2895467015421724, + "learning_rate": 7.032455324689902e-07, + "loss": 0.8981, + "step": 9178 + }, + { + "epoch": 1.7670187934643984, + "grad_norm": 3.122765185094544, + "learning_rate": 7.020974147086512e-07, + "loss": 0.8289, + "step": 9179 + }, + { + "epoch": 1.7672113001419736, + "grad_norm": 3.2268881524928177, + "learning_rate": 7.009502008233993e-07, + "loss": 0.8441, + "step": 9180 + }, + { + "epoch": 1.767403806819549, + "grad_norm": 3.2682147495198013, + "learning_rate": 6.998038909247573e-07, + "loss": 0.8385, + "step": 9181 + }, + { + "epoch": 1.7675963134971244, + "grad_norm": 3.095162711052163, + "learning_rate": 6.986584851241607e-07, + "loss": 0.8298, + "step": 9182 + }, + { + "epoch": 1.7677888201746998, + "grad_norm": 3.1787740444095, + "learning_rate": 6.975139835329603e-07, + "loss": 0.8268, + "step": 9183 + }, + { + "epoch": 1.7679813268522753, + "grad_norm": 3.3055546430014866, + "learning_rate": 6.963703862624104e-07, + "loss": 0.8479, + "step": 9184 + }, + { + "epoch": 1.7679813268522753, + "lm_loss": 0.6795, + "step": 9184, + "vm_loss": 0.2139 + }, + { + "epoch": 1.7679813268522753, + "lm_loss": 0.7216, + "step": 9184, + "vm_loss": 0.1312 + }, + { + "epoch": 1.7679813268522753, + "lm_loss": 0.696, + "step": 9184, + "vm_loss": 0.189 + }, + { + "epoch": 1.7679813268522753, + "lm_loss": 0.4869, + "step": 9184, + "vm_loss": 0.1419 + }, + { + "epoch": 1.7679813268522753, + "lm_loss": 0.501, + "step": 9184, + "vm_loss": 0.1836 + }, + { + "epoch": 1.7679813268522753, + "lm_loss": 0.7656, + "step": 9184, + "vm_loss": 0.2104 + }, + { + "epoch": 1.7679813268522753, + "lm_loss": 0.5313, + "step": 9184, + "vm_loss": 0.1525 + }, + { + "epoch": 1.7679813268522753, + "lm_loss": 0.7217, + "step": 9184, + "vm_loss": 0.1702 + }, + { + "epoch": 1.7681738335298505, + "grad_norm": 3.1302148405089247, + "learning_rate": 6.952276934236912e-07, + "loss": 0.8408, + "step": 9185 + }, + { + "epoch": 1.768366340207426, + "grad_norm": 3.1270879280449453, + "learning_rate": 6.940859051278814e-07, + "loss": 0.8286, + "step": 9186 + }, + { + "epoch": 1.7685588468850013, + "grad_norm": 3.2009342910364027, + "learning_rate": 6.92945021485979e-07, + "loss": 0.8318, + "step": 9187 + }, + { + "epoch": 1.7687513535625767, + "grad_norm": 3.306636940521181, + "learning_rate": 6.918050426088918e-07, + "loss": 0.8685, + "step": 9188 + }, + { + "epoch": 1.7689438602401522, + "grad_norm": 3.2144824607036644, + "learning_rate": 6.906659686074436e-07, + "loss": 0.8578, + "step": 9189 + }, + { + "epoch": 1.7691363669177274, + "grad_norm": 3.0542847184438764, + "learning_rate": 6.8952779959236e-07, + "loss": 0.8025, + "step": 9190 + }, + { + "epoch": 1.769328873595303, + "grad_norm": 3.2167422587656187, + "learning_rate": 6.883905356742948e-07, + "loss": 0.8599, + "step": 9191 + }, + { + "epoch": 1.7695213802728782, + "grad_norm": 3.342015309415466, + "learning_rate": 6.872541769637964e-07, + "loss": 0.8941, + "step": 9192 + }, + { + "epoch": 1.7695213802728782, + "lm_loss": 1.0404, + "step": 9192, + "vm_loss": 0.1418 + }, + { + "epoch": 1.7695213802728782, + "lm_loss": 0.7462, + "step": 9192, + "vm_loss": 0.1279 + }, + { + "epoch": 1.7695213802728782, + "lm_loss": 0.5148, + "step": 9192, + "vm_loss": 0.2237 + }, + { + "epoch": 1.7695213802728782, + "lm_loss": 0.8642, + "step": 9192, + "vm_loss": 0.1395 + }, + { + "epoch": 1.7695213802728782, + "lm_loss": 0.6004, + "step": 9192, + "vm_loss": 0.1647 + }, + { + "epoch": 1.7695213802728782, + "lm_loss": 0.5082, + "step": 9192, + "vm_loss": 0.2059 + }, + { + "epoch": 1.7695213802728782, + "lm_loss": 0.5121, + "step": 9192, + "vm_loss": 0.1306 + }, + { + "epoch": 1.7695213802728782, + "lm_loss": 0.8031, + "step": 9192, + "vm_loss": 0.1235 + }, + { + "epoch": 1.7697138869504536, + "grad_norm": 3.2796719124059557, + "learning_rate": 6.861187235713395e-07, + "loss": 0.8339, + "step": 9193 + }, + { + "epoch": 1.769906393628029, + "grad_norm": 3.2239013025223056, + "learning_rate": 6.849841756072984e-07, + "loss": 0.8495, + "step": 9194 + }, + { + "epoch": 1.7700989003056042, + "grad_norm": 3.3967382774906145, + "learning_rate": 6.838505331819733e-07, + "loss": 0.8917, + "step": 9195 + }, + { + "epoch": 1.7702914069831799, + "grad_norm": 3.3696275921566605, + "learning_rate": 6.82717796405562e-07, + "loss": 0.8608, + "step": 9196 + }, + { + "epoch": 1.770483913660755, + "grad_norm": 3.334879719118258, + "learning_rate": 6.815859653881851e-07, + "loss": 0.8665, + "step": 9197 + }, + { + "epoch": 1.7706764203383305, + "grad_norm": 3.149483726817334, + "learning_rate": 6.804550402398679e-07, + "loss": 0.7964, + "step": 9198 + }, + { + "epoch": 1.770868927015906, + "grad_norm": 3.4199878991709376, + "learning_rate": 6.79325021070556e-07, + "loss": 0.8987, + "step": 9199 + }, + { + "epoch": 1.7710614336934811, + "grad_norm": 3.3049947241360305, + "learning_rate": 6.781959079900958e-07, + "loss": 0.907, + "step": 9200 + }, + { + "epoch": 1.7710614336934811, + "lm_loss": 0.5237, + "step": 9200, + "vm_loss": 0.1248 + }, + { + "epoch": 1.7710614336934811, + "lm_loss": 0.6099, + "step": 9200, + "vm_loss": 0.1589 + }, + { + "epoch": 1.7710614336934811, + "lm_loss": 0.5679, + "step": 9200, + "vm_loss": 0.1314 + }, + { + "epoch": 1.7710614336934811, + "lm_loss": 0.7882, + "step": 9200, + "vm_loss": 0.1611 + }, + { + "epoch": 1.7710614336934811, + "lm_loss": 0.9503, + "step": 9200, + "vm_loss": 0.2522 + }, + { + "epoch": 1.7710614336934811, + "lm_loss": 0.5151, + "step": 9200, + "vm_loss": 0.1627 + }, + { + "epoch": 1.7710614336934811, + "lm_loss": 0.9043, + "step": 9200, + "vm_loss": 0.1322 + }, + { + "epoch": 1.7710614336934811, + "lm_loss": 0.8371, + "step": 9200, + "vm_loss": 0.1996 + }, + { + "epoch": 1.7712539403710568, + "grad_norm": 3.2678834259250555, + "learning_rate": 6.770677011082539e-07, + "loss": 0.8637, + "step": 9201 + }, + { + "epoch": 1.771446447048632, + "grad_norm": 3.1223605535717898, + "learning_rate": 6.75940400534707e-07, + "loss": 0.8432, + "step": 9202 + }, + { + "epoch": 1.7716389537262074, + "grad_norm": 3.150618501905829, + "learning_rate": 6.74814006379042e-07, + "loss": 0.8577, + "step": 9203 + }, + { + "epoch": 1.7718314604037828, + "grad_norm": 3.279169910201237, + "learning_rate": 6.736885187507602e-07, + "loss": 0.8826, + "step": 9204 + }, + { + "epoch": 1.772023967081358, + "grad_norm": 3.332192358964797, + "learning_rate": 6.72563937759273e-07, + "loss": 0.8396, + "step": 9205 + }, + { + "epoch": 1.7722164737589337, + "grad_norm": 3.291203924460957, + "learning_rate": 6.71440263513905e-07, + "loss": 0.8859, + "step": 9206 + }, + { + "epoch": 1.7724089804365089, + "grad_norm": 3.272812535001123, + "learning_rate": 6.703174961238867e-07, + "loss": 0.8438, + "step": 9207 + }, + { + "epoch": 1.7726014871140843, + "grad_norm": 3.316516959507834, + "learning_rate": 6.691956356983731e-07, + "loss": 0.8884, + "step": 9208 + }, + { + "epoch": 1.7726014871140843, + "lm_loss": 0.4017, + "step": 9208, + "vm_loss": 0.1899 + }, + { + "epoch": 1.7726014871140843, + "lm_loss": 0.4183, + "step": 9208, + "vm_loss": 0.2164 + }, + { + "epoch": 1.7726014871140843, + "lm_loss": 0.9697, + "step": 9208, + "vm_loss": 0.1683 + }, + { + "epoch": 1.7726014871140843, + "lm_loss": 0.5544, + "step": 9208, + "vm_loss": 0.1265 + }, + { + "epoch": 1.7726014871140843, + "lm_loss": 0.8673, + "step": 9208, + "vm_loss": 0.1584 + }, + { + "epoch": 1.7726014871140843, + "lm_loss": 0.8152, + "step": 9208, + "vm_loss": 0.2003 + }, + { + "epoch": 1.7726014871140843, + "lm_loss": 0.4683, + "step": 9208, + "vm_loss": 0.1861 + }, + { + "epoch": 1.7726014871140843, + "lm_loss": 0.7664, + "step": 9208, + "vm_loss": 0.2055 + }, + { + "epoch": 1.7727939937916597, + "grad_norm": 3.14522934926373, + "learning_rate": 6.680746823464179e-07, + "loss": 0.8285, + "step": 9209 + }, + { + "epoch": 1.772986500469235, + "grad_norm": 3.136181613504236, + "learning_rate": 6.669546361769952e-07, + "loss": 0.8155, + "step": 9210 + }, + { + "epoch": 1.7731790071468105, + "grad_norm": 3.164323045322968, + "learning_rate": 6.658354972989833e-07, + "loss": 0.8205, + "step": 9211 + }, + { + "epoch": 1.7733715138243857, + "grad_norm": 3.310678509658945, + "learning_rate": 6.647172658211831e-07, + "loss": 0.9001, + "step": 9212 + }, + { + "epoch": 1.7735640205019612, + "grad_norm": 3.283355496278776, + "learning_rate": 6.635999418522965e-07, + "loss": 0.8544, + "step": 9213 + }, + { + "epoch": 1.7737565271795366, + "grad_norm": 3.2715304259435842, + "learning_rate": 6.624835255009432e-07, + "loss": 0.8594, + "step": 9214 + }, + { + "epoch": 1.7739490338571118, + "grad_norm": 3.227025813962184, + "learning_rate": 6.61368016875652e-07, + "loss": 0.8573, + "step": 9215 + }, + { + "epoch": 1.7741415405346874, + "grad_norm": 3.380766274591073, + "learning_rate": 6.602534160848673e-07, + "loss": 0.872, + "step": 9216 + }, + { + "epoch": 1.7741415405346874, + "lm_loss": 0.5537, + "step": 9216, + "vm_loss": 0.1527 + }, + { + "epoch": 1.7741415405346874, + "lm_loss": 0.9421, + "step": 9216, + "vm_loss": 0.1859 + }, + { + "epoch": 1.7741415405346874, + "lm_loss": 0.4653, + "step": 9216, + "vm_loss": 0.1867 + }, + { + "epoch": 1.7741415405346874, + "lm_loss": 0.75, + "step": 9216, + "vm_loss": 0.1361 + }, + { + "epoch": 1.7741415405346874, + "lm_loss": 0.6928, + "step": 9216, + "vm_loss": 0.217 + }, + { + "epoch": 1.7741415405346874, + "lm_loss": 1.0293, + "step": 9216, + "vm_loss": 0.1436 + }, + { + "epoch": 1.7741415405346874, + "lm_loss": 0.9449, + "step": 9216, + "vm_loss": 0.1759 + }, + { + "epoch": 1.7741415405346874, + "lm_loss": 0.7624, + "step": 9216, + "vm_loss": 0.1071 + }, + { + "epoch": 1.7743340472122626, + "grad_norm": 3.378035724851111, + "learning_rate": 6.591397232369401e-07, + "loss": 0.9128, + "step": 9217 + }, + { + "epoch": 1.774526553889838, + "grad_norm": 3.1840416733891233, + "learning_rate": 6.58026938440135e-07, + "loss": 0.8548, + "step": 9218 + }, + { + "epoch": 1.7747190605674135, + "grad_norm": 3.362688991557411, + "learning_rate": 6.56915061802631e-07, + "loss": 0.8555, + "step": 9219 + }, + { + "epoch": 1.7749115672449887, + "grad_norm": 3.341396127842984, + "learning_rate": 6.558040934325183e-07, + "loss": 0.8593, + "step": 9220 + }, + { + "epoch": 1.7751040739225643, + "grad_norm": 3.189012700305559, + "learning_rate": 6.546940334377905e-07, + "loss": 0.8442, + "step": 9221 + }, + { + "epoch": 1.7752965806001395, + "grad_norm": 3.129851218420475, + "learning_rate": 6.535848819263679e-07, + "loss": 0.8633, + "step": 9222 + }, + { + "epoch": 1.775489087277715, + "grad_norm": 3.211557572174544, + "learning_rate": 6.524766390060677e-07, + "loss": 0.8528, + "step": 9223 + }, + { + "epoch": 1.7756815939552903, + "grad_norm": 3.0312822354676623, + "learning_rate": 6.513693047846292e-07, + "loss": 0.7963, + "step": 9224 + }, + { + "epoch": 1.7756815939552903, + "lm_loss": 0.6698, + "step": 9224, + "vm_loss": 0.1229 + }, + { + "epoch": 1.7756815939552903, + "lm_loss": 0.5015, + "step": 9224, + "vm_loss": 0.1241 + }, + { + "epoch": 1.7756815939552903, + "lm_loss": 0.6723, + "step": 9224, + "vm_loss": 0.1547 + }, + { + "epoch": 1.7756815939552903, + "lm_loss": 1.2025, + "step": 9224, + "vm_loss": 0.1382 + }, + { + "epoch": 1.7756815939552903, + "lm_loss": 0.6682, + "step": 9224, + "vm_loss": 0.2748 + }, + { + "epoch": 1.7756815939552903, + "lm_loss": 0.7978, + "step": 9224, + "vm_loss": 0.202 + }, + { + "epoch": 1.7756815939552903, + "lm_loss": 0.8342, + "step": 9224, + "vm_loss": 0.157 + }, + { + "epoch": 1.7756815939552903, + "lm_loss": 0.6983, + "step": 9224, + "vm_loss": 0.1317 + }, + { + "epoch": 1.7758741006328655, + "grad_norm": 3.3300521550975035, + "learning_rate": 6.502628793696974e-07, + "loss": 0.8522, + "step": 9225 + }, + { + "epoch": 1.7760666073104412, + "grad_norm": 3.0977194561261965, + "learning_rate": 6.49157362868832e-07, + "loss": 0.7942, + "step": 9226 + }, + { + "epoch": 1.7762591139880164, + "grad_norm": 3.280793552503617, + "learning_rate": 6.480527553895055e-07, + "loss": 0.8396, + "step": 9227 + }, + { + "epoch": 1.7764516206655918, + "grad_norm": 3.143475244861368, + "learning_rate": 6.469490570390935e-07, + "loss": 0.822, + "step": 9228 + }, + { + "epoch": 1.7766441273431672, + "grad_norm": 3.176365129783468, + "learning_rate": 6.458462679248967e-07, + "loss": 0.8596, + "step": 9229 + }, + { + "epoch": 1.7768366340207424, + "grad_norm": 3.202366959012867, + "learning_rate": 6.44744388154116e-07, + "loss": 0.8279, + "step": 9230 + }, + { + "epoch": 1.777029140698318, + "grad_norm": 3.302952553004911, + "learning_rate": 6.436434178338691e-07, + "loss": 0.8249, + "step": 9231 + }, + { + "epoch": 1.7772216473758933, + "grad_norm": 3.300311748020996, + "learning_rate": 6.425433570711848e-07, + "loss": 0.8362, + "step": 9232 + }, + { + "epoch": 1.7772216473758933, + "lm_loss": 0.6182, + "step": 9232, + "vm_loss": 0.2116 + }, + { + "epoch": 1.7772216473758933, + "lm_loss": 0.8127, + "step": 9232, + "vm_loss": 0.2078 + }, + { + "epoch": 1.7772216473758933, + "lm_loss": 0.5137, + "step": 9232, + "vm_loss": 0.1126 + }, + { + "epoch": 1.7772216473758933, + "lm_loss": 0.5472, + "step": 9232, + "vm_loss": 0.208 + }, + { + "epoch": 1.7772216473758933, + "lm_loss": 0.786, + "step": 9232, + "vm_loss": 0.1514 + }, + { + "epoch": 1.7772216473758933, + "lm_loss": 0.8349, + "step": 9232, + "vm_loss": 0.1728 + }, + { + "epoch": 1.7772216473758933, + "lm_loss": 0.65, + "step": 9232, + "vm_loss": 0.1238 + }, + { + "epoch": 1.7772216473758933, + "lm_loss": 0.8333, + "step": 9232, + "vm_loss": 0.1157 + }, + { + "epoch": 1.7774141540534687, + "grad_norm": 3.2229492400800837, + "learning_rate": 6.414442059730052e-07, + "loss": 0.8047, + "step": 9233 + }, + { + "epoch": 1.7776066607310441, + "grad_norm": 3.3264864929381126, + "learning_rate": 6.403459646461774e-07, + "loss": 0.8802, + "step": 9234 + }, + { + "epoch": 1.7777991674086195, + "grad_norm": 3.1954841826774105, + "learning_rate": 6.392486331974668e-07, + "loss": 0.8072, + "step": 9235 + }, + { + "epoch": 1.777991674086195, + "grad_norm": 3.128595033969252, + "learning_rate": 6.381522117335482e-07, + "loss": 0.8409, + "step": 9236 + }, + { + "epoch": 1.7781841807637702, + "grad_norm": 3.115331844638848, + "learning_rate": 6.370567003610095e-07, + "loss": 0.7891, + "step": 9237 + }, + { + "epoch": 1.7783766874413456, + "grad_norm": 3.3320659979008167, + "learning_rate": 6.359620991863425e-07, + "loss": 0.8697, + "step": 9238 + }, + { + "epoch": 1.778569194118921, + "grad_norm": 3.1837519072221894, + "learning_rate": 6.348684083159651e-07, + "loss": 0.8624, + "step": 9239 + }, + { + "epoch": 1.7787617007964964, + "grad_norm": 3.2117770108508, + "learning_rate": 6.337756278561902e-07, + "loss": 0.8624, + "step": 9240 + }, + { + "epoch": 1.7787617007964964, + "lm_loss": 0.7106, + "step": 9240, + "vm_loss": 0.2332 + }, + { + "epoch": 1.7787617007964964, + "lm_loss": 0.525, + "step": 9240, + "vm_loss": 0.1468 + }, + { + "epoch": 1.7787617007964964, + "lm_loss": 0.4679, + "step": 9240, + "vm_loss": 0.1832 + }, + { + "epoch": 1.7787617007964964, + "lm_loss": 0.575, + "step": 9240, + "vm_loss": 0.1001 + }, + { + "epoch": 1.7787617007964964, + "lm_loss": 0.6907, + "step": 9240, + "vm_loss": 0.1682 + }, + { + "epoch": 1.7787617007964964, + "lm_loss": 0.7037, + "step": 9240, + "vm_loss": 0.1974 + }, + { + "epoch": 1.7787617007964964, + "lm_loss": 1.2433, + "step": 9240, + "vm_loss": 0.1215 + }, + { + "epoch": 1.7787617007964964, + "lm_loss": 0.4828, + "step": 9240, + "vm_loss": 0.1711 + }, + { + "epoch": 1.7789542074740718, + "grad_norm": 3.2706895491636736, + "learning_rate": 6.326837579132538e-07, + "loss": 0.8557, + "step": 9241 + }, + { + "epoch": 1.779146714151647, + "grad_norm": 3.2307334036245505, + "learning_rate": 6.315927985933001e-07, + "loss": 0.8041, + "step": 9242 + }, + { + "epoch": 1.7793392208292225, + "grad_norm": 3.084748508457976, + "learning_rate": 6.305027500023841e-07, + "loss": 0.8194, + "step": 9243 + }, + { + "epoch": 1.779531727506798, + "grad_norm": 3.079293432116286, + "learning_rate": 6.294136122464701e-07, + "loss": 0.8057, + "step": 9244 + }, + { + "epoch": 1.7797242341843733, + "grad_norm": 3.271410006086443, + "learning_rate": 6.283253854314376e-07, + "loss": 0.841, + "step": 9245 + }, + { + "epoch": 1.7799167408619487, + "grad_norm": 3.008547302609984, + "learning_rate": 6.272380696630765e-07, + "loss": 0.7755, + "step": 9246 + }, + { + "epoch": 1.780109247539524, + "grad_norm": 3.238833149115958, + "learning_rate": 6.261516650470878e-07, + "loss": 0.8393, + "step": 9247 + }, + { + "epoch": 1.7803017542170994, + "grad_norm": 3.225089958380521, + "learning_rate": 6.250661716890838e-07, + "loss": 0.838, + "step": 9248 + }, + { + "epoch": 1.7803017542170994, + "lm_loss": 0.5288, + "step": 9248, + "vm_loss": 0.1484 + }, + { + "epoch": 1.7803017542170994, + "lm_loss": 0.7326, + "step": 9248, + "vm_loss": 0.2099 + }, + { + "epoch": 1.7803017542170994, + "lm_loss": 0.8235, + "step": 9248, + "vm_loss": 0.1413 + }, + { + "epoch": 1.7803017542170994, + "lm_loss": 1.0753, + "step": 9248, + "vm_loss": 0.1539 + }, + { + "epoch": 1.7803017542170994, + "lm_loss": 0.7818, + "step": 9248, + "vm_loss": 0.1624 + }, + { + "epoch": 1.7803017542170994, + "lm_loss": 0.466, + "step": 9248, + "vm_loss": 0.1262 + }, + { + "epoch": 1.7803017542170994, + "lm_loss": 0.641, + "step": 9248, + "vm_loss": 0.1944 + }, + { + "epoch": 1.7803017542170994, + "lm_loss": 0.7657, + "step": 9248, + "vm_loss": 0.1393 + }, + { + "epoch": 1.7804942608946748, + "grad_norm": 3.2228340982875765, + "learning_rate": 6.239815896945889e-07, + "loss": 0.8208, + "step": 9249 + }, + { + "epoch": 1.7806867675722502, + "grad_norm": 3.436525681666964, + "learning_rate": 6.228979191690387e-07, + "loss": 0.881, + "step": 9250 + }, + { + "epoch": 1.7808792742498256, + "grad_norm": 3.404937019759645, + "learning_rate": 6.218151602177769e-07, + "loss": 0.8889, + "step": 9251 + }, + { + "epoch": 1.7810717809274008, + "grad_norm": 3.245815431998424, + "learning_rate": 6.207333129460646e-07, + "loss": 0.8631, + "step": 9252 + }, + { + "epoch": 1.7812642876049765, + "grad_norm": 3.2823055949593254, + "learning_rate": 6.19652377459069e-07, + "loss": 0.8614, + "step": 9253 + }, + { + "epoch": 1.7814567942825517, + "grad_norm": 3.2936101358401655, + "learning_rate": 6.185723538618738e-07, + "loss": 0.8592, + "step": 9254 + }, + { + "epoch": 1.781649300960127, + "grad_norm": 3.382432999164009, + "learning_rate": 6.17493242259467e-07, + "loss": 0.8886, + "step": 9255 + }, + { + "epoch": 1.7818418076377025, + "grad_norm": 3.111526722908233, + "learning_rate": 6.164150427567572e-07, + "loss": 0.7941, + "step": 9256 + }, + { + "epoch": 1.7818418076377025, + "lm_loss": 0.3625, + "step": 9256, + "vm_loss": 0.1744 + }, + { + "epoch": 1.7818418076377025, + "lm_loss": 0.5348, + "step": 9256, + "vm_loss": 0.1489 + }, + { + "epoch": 1.7818418076377025, + "lm_loss": 0.7382, + "step": 9256, + "vm_loss": 0.1522 + }, + { + "epoch": 1.7818418076377025, + "lm_loss": 0.8101, + "step": 9256, + "vm_loss": 0.1552 + }, + { + "epoch": 1.7818418076377025, + "lm_loss": 0.4439, + "step": 9256, + "vm_loss": 0.1513 + }, + { + "epoch": 1.7818418076377025, + "lm_loss": 0.8838, + "step": 9256, + "vm_loss": 0.1205 + }, + { + "epoch": 1.7818418076377025, + "lm_loss": 0.547, + "step": 9256, + "vm_loss": 0.1713 + }, + { + "epoch": 1.7818418076377025, + "lm_loss": 0.7169, + "step": 9256, + "vm_loss": 0.1618 + }, + { + "epoch": 1.7820343143152777, + "grad_norm": 3.404613808835494, + "learning_rate": 6.153377554585549e-07, + "loss": 0.8366, + "step": 9257 + }, + { + "epoch": 1.7822268209928533, + "grad_norm": 3.054785570234413, + "learning_rate": 6.142613804695885e-07, + "loss": 0.801, + "step": 9258 + }, + { + "epoch": 1.7824193276704285, + "grad_norm": 3.33749363987345, + "learning_rate": 6.131859178944922e-07, + "loss": 0.8981, + "step": 9259 + }, + { + "epoch": 1.782611834348004, + "grad_norm": 3.2496571440326005, + "learning_rate": 6.121113678378199e-07, + "loss": 0.8918, + "step": 9260 + }, + { + "epoch": 1.7828043410255794, + "grad_norm": 3.1280889463683104, + "learning_rate": 6.110377304040271e-07, + "loss": 0.8239, + "step": 9261 + }, + { + "epoch": 1.7829968477031546, + "grad_norm": 3.222917663309125, + "learning_rate": 6.09965005697487e-07, + "loss": 0.834, + "step": 9262 + }, + { + "epoch": 1.7831893543807302, + "grad_norm": 3.3265008147886315, + "learning_rate": 6.088931938224818e-07, + "loss": 0.8709, + "step": 9263 + }, + { + "epoch": 1.7833818610583054, + "grad_norm": 3.1697446634346984, + "learning_rate": 6.078222948832058e-07, + "loss": 0.8191, + "step": 9264 + }, + { + "epoch": 1.7833818610583054, + "lm_loss": 0.5779, + "step": 9264, + "vm_loss": 0.1964 + }, + { + "epoch": 1.7833818610583054, + "lm_loss": 0.7652, + "step": 9264, + "vm_loss": 0.1483 + }, + { + "epoch": 1.7833818610583054, + "lm_loss": 1.0081, + "step": 9264, + "vm_loss": 0.1594 + }, + { + "epoch": 1.7833818610583054, + "lm_loss": 0.637, + "step": 9264, + "vm_loss": 0.1717 + }, + { + "epoch": 1.7833818610583054, + "lm_loss": 0.6511, + "step": 9264, + "vm_loss": 0.2324 + }, + { + "epoch": 1.7833818610583054, + "lm_loss": 0.8456, + "step": 9264, + "vm_loss": 0.1051 + }, + { + "epoch": 1.7833818610583054, + "lm_loss": 0.6702, + "step": 9264, + "vm_loss": 0.1772 + }, + { + "epoch": 1.7833818610583054, + "lm_loss": 1.1316, + "step": 9264, + "vm_loss": 0.2092 + }, + { + "epoch": 1.7835743677358809, + "grad_norm": 3.343411901700475, + "learning_rate": 6.067523089837635e-07, + "loss": 0.87, + "step": 9265 + }, + { + "epoch": 1.7837668744134563, + "grad_norm": 3.208265545162121, + "learning_rate": 6.056832362281728e-07, + "loss": 0.8293, + "step": 9266 + }, + { + "epoch": 1.7839593810910315, + "grad_norm": 3.2865771003767184, + "learning_rate": 6.046150767203585e-07, + "loss": 0.8311, + "step": 9267 + }, + { + "epoch": 1.7841518877686071, + "grad_norm": 3.2451078515808587, + "learning_rate": 6.035478305641617e-07, + "loss": 0.8504, + "step": 9268 + }, + { + "epoch": 1.7843443944461823, + "grad_norm": 3.239829724525075, + "learning_rate": 6.024814978633309e-07, + "loss": 0.8371, + "step": 9269 + }, + { + "epoch": 1.7845369011237577, + "grad_norm": 3.3536802046613094, + "learning_rate": 6.014160787215273e-07, + "loss": 0.8698, + "step": 9270 + }, + { + "epoch": 1.7847294078013332, + "grad_norm": 3.2248312329713555, + "learning_rate": 6.00351573242326e-07, + "loss": 0.8366, + "step": 9271 + }, + { + "epoch": 1.7849219144789084, + "grad_norm": 3.421789033489498, + "learning_rate": 5.992879815292052e-07, + "loss": 0.9078, + "step": 9272 + }, + { + "epoch": 1.7849219144789084, + "lm_loss": 0.6616, + "step": 9272, + "vm_loss": 0.1454 + }, + { + "epoch": 1.7849219144789084, + "lm_loss": 0.5141, + "step": 9272, + "vm_loss": 0.1829 + }, + { + "epoch": 1.7849219144789084, + "lm_loss": 0.7851, + "step": 9272, + "vm_loss": 0.1931 + }, + { + "epoch": 1.7849219144789084, + "lm_loss": 0.5386, + "step": 9272, + "vm_loss": 0.1356 + }, + { + "epoch": 1.7849219144789084, + "lm_loss": 0.4687, + "step": 9272, + "vm_loss": 0.1556 + }, + { + "epoch": 1.7849219144789084, + "lm_loss": 0.9167, + "step": 9272, + "vm_loss": 0.1988 + }, + { + "epoch": 1.7849219144789084, + "lm_loss": 0.5668, + "step": 9272, + "vm_loss": 0.1512 + }, + { + "epoch": 1.7849219144789084, + "lm_loss": 0.8033, + "step": 9272, + "vm_loss": 0.1814 + }, + { + "epoch": 1.785114421156484, + "grad_norm": 3.1573575071062043, + "learning_rate": 5.982253036855656e-07, + "loss": 0.814, + "step": 9273 + }, + { + "epoch": 1.7853069278340592, + "grad_norm": 3.296680516675027, + "learning_rate": 5.971635398147091e-07, + "loss": 0.9145, + "step": 9274 + }, + { + "epoch": 1.7854994345116346, + "grad_norm": 3.0551457943636358, + "learning_rate": 5.961026900198564e-07, + "loss": 0.814, + "step": 9275 + }, + { + "epoch": 1.78569194118921, + "grad_norm": 3.1344385936130648, + "learning_rate": 5.950427544041293e-07, + "loss": 0.8094, + "step": 9276 + }, + { + "epoch": 1.7858844478667852, + "grad_norm": 3.5087652720375533, + "learning_rate": 5.939837330705733e-07, + "loss": 0.9412, + "step": 9277 + }, + { + "epoch": 1.7860769545443609, + "grad_norm": 3.1947165381298914, + "learning_rate": 5.929256261221361e-07, + "loss": 0.8211, + "step": 9278 + }, + { + "epoch": 1.786269461221936, + "grad_norm": 3.206555448247763, + "learning_rate": 5.918684336616787e-07, + "loss": 0.8466, + "step": 9279 + }, + { + "epoch": 1.7864619678995115, + "grad_norm": 3.2183513926517504, + "learning_rate": 5.908121557919755e-07, + "loss": 0.824, + "step": 9280 + }, + { + "epoch": 1.7864619678995115, + "lm_loss": 0.8104, + "step": 9280, + "vm_loss": 0.1869 + }, + { + "epoch": 1.7864619678995115, + "lm_loss": 0.4222, + "step": 9280, + "vm_loss": 0.1592 + }, + { + "epoch": 1.7864619678995115, + "lm_loss": 0.9618, + "step": 9280, + "vm_loss": 0.1935 + }, + { + "epoch": 1.7864619678995115, + "lm_loss": 0.6091, + "step": 9280, + "vm_loss": 0.1 + }, + { + "epoch": 1.7864619678995115, + "lm_loss": 0.4865, + "step": 9280, + "vm_loss": 0.1725 + }, + { + "epoch": 1.7864619678995115, + "lm_loss": 0.5688, + "step": 9280, + "vm_loss": 0.1086 + }, + { + "epoch": 1.7864619678995115, + "lm_loss": 0.6236, + "step": 9280, + "vm_loss": 0.1244 + }, + { + "epoch": 1.7864619678995115, + "lm_loss": 0.9341, + "step": 9280, + "vm_loss": 0.2305 + }, + { + "epoch": 1.786654474577087, + "grad_norm": 3.215835781174425, + "learning_rate": 5.89756792615711e-07, + "loss": 0.8329, + "step": 9281 + }, + { + "epoch": 1.7868469812546621, + "grad_norm": 3.1170676980342606, + "learning_rate": 5.887023442354744e-07, + "loss": 0.8774, + "step": 9282 + }, + { + "epoch": 1.7870394879322378, + "grad_norm": 3.144025160494419, + "learning_rate": 5.876488107537792e-07, + "loss": 0.8312, + "step": 9283 + }, + { + "epoch": 1.787231994609813, + "grad_norm": 3.3151663693662865, + "learning_rate": 5.865961922730379e-07, + "loss": 0.8499, + "step": 9284 + }, + { + "epoch": 1.7874245012873884, + "grad_norm": 3.2966878742167944, + "learning_rate": 5.855444888955786e-07, + "loss": 0.8708, + "step": 9285 + }, + { + "epoch": 1.7876170079649638, + "grad_norm": 3.2192937950295186, + "learning_rate": 5.844937007236407e-07, + "loss": 0.8419, + "step": 9286 + }, + { + "epoch": 1.787809514642539, + "grad_norm": 3.2902553220441986, + "learning_rate": 5.834438278593768e-07, + "loss": 0.8622, + "step": 9287 + }, + { + "epoch": 1.7880020213201147, + "grad_norm": 3.242364040778866, + "learning_rate": 5.823948704048443e-07, + "loss": 0.8617, + "step": 9288 + }, + { + "epoch": 1.7880020213201147, + "lm_loss": 1.1926, + "step": 9288, + "vm_loss": 0.1792 + }, + { + "epoch": 1.7880020213201147, + "lm_loss": 0.4136, + "step": 9288, + "vm_loss": 0.1958 + }, + { + "epoch": 1.7880020213201147, + "lm_loss": 0.933, + "step": 9288, + "vm_loss": 0.1863 + }, + { + "epoch": 1.7880020213201147, + "lm_loss": 0.8711, + "step": 9288, + "vm_loss": 0.1747 + }, + { + "epoch": 1.7880020213201147, + "lm_loss": 0.601, + "step": 9288, + "vm_loss": 0.182 + }, + { + "epoch": 1.7880020213201147, + "lm_loss": 0.6969, + "step": 9288, + "vm_loss": 0.2151 + }, + { + "epoch": 1.7880020213201147, + "lm_loss": 0.7399, + "step": 9288, + "vm_loss": 0.2037 + }, + { + "epoch": 1.7880020213201147, + "lm_loss": 0.5307, + "step": 9288, + "vm_loss": 0.1029 + }, + { + "epoch": 1.7881945279976899, + "grad_norm": 3.2932063243846765, + "learning_rate": 5.813468284620161e-07, + "loss": 0.8655, + "step": 9289 + }, + { + "epoch": 1.7883870346752653, + "grad_norm": 3.3271120922929445, + "learning_rate": 5.802997021327761e-07, + "loss": 0.8659, + "step": 9290 + }, + { + "epoch": 1.7885795413528407, + "grad_norm": 3.2895643066155533, + "learning_rate": 5.792534915189185e-07, + "loss": 0.8566, + "step": 9291 + }, + { + "epoch": 1.788772048030416, + "grad_norm": 3.3427730531311877, + "learning_rate": 5.782081967221498e-07, + "loss": 0.893, + "step": 9292 + }, + { + "epoch": 1.7889645547079915, + "grad_norm": 3.32998639109651, + "learning_rate": 5.771638178440808e-07, + "loss": 0.8702, + "step": 9293 + }, + { + "epoch": 1.7891570613855667, + "grad_norm": 3.217540227518655, + "learning_rate": 5.761203549862448e-07, + "loss": 0.8015, + "step": 9294 + }, + { + "epoch": 1.7893495680631422, + "grad_norm": 3.4286631419637397, + "learning_rate": 5.750778082500763e-07, + "loss": 0.9207, + "step": 9295 + }, + { + "epoch": 1.7895420747407176, + "grad_norm": 3.273074842613617, + "learning_rate": 5.740361777369241e-07, + "loss": 0.8824, + "step": 9296 + }, + { + "epoch": 1.7895420747407176, + "lm_loss": 0.8196, + "step": 9296, + "vm_loss": 0.1114 + }, + { + "epoch": 1.7895420747407176, + "lm_loss": 0.9049, + "step": 9296, + "vm_loss": 0.1736 + }, + { + "epoch": 1.7895420747407176, + "lm_loss": 0.6444, + "step": 9296, + "vm_loss": 0.1304 + }, + { + "epoch": 1.7895420747407176, + "lm_loss": 0.5757, + "step": 9296, + "vm_loss": 0.1851 + }, + { + "epoch": 1.7895420747407176, + "lm_loss": 0.7011, + "step": 9296, + "vm_loss": 0.1558 + }, + { + "epoch": 1.7895420747407176, + "lm_loss": 0.4708, + "step": 9296, + "vm_loss": 0.2166 + }, + { + "epoch": 1.7895420747407176, + "lm_loss": 0.7206, + "step": 9296, + "vm_loss": 0.1647 + }, + { + "epoch": 1.7895420747407176, + "lm_loss": 0.8499, + "step": 9296, + "vm_loss": 0.1208 + }, + { + "epoch": 1.789734581418293, + "grad_norm": 3.2785972409992574, + "learning_rate": 5.729954635480483e-07, + "loss": 0.8562, + "step": 9297 + }, + { + "epoch": 1.7899270880958684, + "grad_norm": 3.2755376569906614, + "learning_rate": 5.719556657846226e-07, + "loss": 0.8112, + "step": 9298 + }, + { + "epoch": 1.7901195947734436, + "grad_norm": 3.2293600704366994, + "learning_rate": 5.709167845477225e-07, + "loss": 0.8297, + "step": 9299 + }, + { + "epoch": 1.790312101451019, + "grad_norm": 3.193441751466621, + "learning_rate": 5.698788199383476e-07, + "loss": 0.8165, + "step": 9300 + }, + { + "epoch": 1.7905046081285945, + "grad_norm": 3.1143475417157154, + "learning_rate": 5.688417720573958e-07, + "loss": 0.7916, + "step": 9301 + }, + { + "epoch": 1.7906971148061699, + "grad_norm": 3.1972935909874187, + "learning_rate": 5.678056410056854e-07, + "loss": 0.852, + "step": 9302 + }, + { + "epoch": 1.7908896214837453, + "grad_norm": 3.177973552713865, + "learning_rate": 5.667704268839358e-07, + "loss": 0.8133, + "step": 9303 + }, + { + "epoch": 1.7910821281613205, + "grad_norm": 3.2822464398528273, + "learning_rate": 5.65736129792791e-07, + "loss": 0.829, + "step": 9304 + }, + { + "epoch": 1.7910821281613205, + "lm_loss": 0.8761, + "step": 9304, + "vm_loss": 0.1284 + }, + { + "epoch": 1.7910821281613205, + "lm_loss": 0.8798, + "step": 9304, + "vm_loss": 0.182 + }, + { + "epoch": 1.7910821281613205, + "lm_loss": 0.7196, + "step": 9304, + "vm_loss": 0.1776 + }, + { + "epoch": 1.7910821281613205, + "lm_loss": 0.8113, + "step": 9304, + "vm_loss": 0.1282 + }, + { + "epoch": 1.7910821281613205, + "lm_loss": 0.7959, + "step": 9304, + "vm_loss": 0.2356 + }, + { + "epoch": 1.7910821281613205, + "lm_loss": 0.7527, + "step": 9304, + "vm_loss": 0.166 + }, + { + "epoch": 1.7910821281613205, + "lm_loss": 0.7275, + "step": 9304, + "vm_loss": 0.1917 + }, + { + "epoch": 1.7910821281613205, + "lm_loss": 0.629, + "step": 9304, + "vm_loss": 0.1013 + }, + { + "epoch": 1.791274634838896, + "grad_norm": 3.3303301003602193, + "learning_rate": 5.647027498327918e-07, + "loss": 0.9016, + "step": 9305 + }, + { + "epoch": 1.7914671415164714, + "grad_norm": 3.1263861161841198, + "learning_rate": 5.636702871043986e-07, + "loss": 0.8131, + "step": 9306 + }, + { + "epoch": 1.7916596481940468, + "grad_norm": 3.206954243730365, + "learning_rate": 5.626387417079793e-07, + "loss": 0.8302, + "step": 9307 + }, + { + "epoch": 1.7918521548716222, + "grad_norm": 3.2779989352350056, + "learning_rate": 5.616081137438123e-07, + "loss": 0.8658, + "step": 9308 + }, + { + "epoch": 1.7920446615491974, + "grad_norm": 3.1289923026812803, + "learning_rate": 5.60578403312092e-07, + "loss": 0.7966, + "step": 9309 + }, + { + "epoch": 1.7922371682267728, + "grad_norm": 3.27722554765375, + "learning_rate": 5.595496105129139e-07, + "loss": 0.8499, + "step": 9310 + }, + { + "epoch": 1.7924296749043482, + "grad_norm": 3.4300968204868525, + "learning_rate": 5.585217354462924e-07, + "loss": 0.8888, + "step": 9311 + }, + { + "epoch": 1.7926221815819237, + "grad_norm": 3.1934477494434863, + "learning_rate": 5.574947782121498e-07, + "loss": 0.8468, + "step": 9312 + }, + { + "epoch": 1.7926221815819237, + "lm_loss": 0.7531, + "step": 9312, + "vm_loss": 0.1311 + }, + { + "epoch": 1.7926221815819237, + "lm_loss": 0.6114, + "step": 9312, + "vm_loss": 0.1389 + }, + { + "epoch": 1.7926221815819237, + "lm_loss": 0.589, + "step": 9312, + "vm_loss": 0.1282 + }, + { + "epoch": 1.7926221815819237, + "lm_loss": 0.3324, + "step": 9312, + "vm_loss": 0.0991 + }, + { + "epoch": 1.7926221815819237, + "lm_loss": 0.9235, + "step": 9312, + "vm_loss": 0.162 + }, + { + "epoch": 1.7926221815819237, + "lm_loss": 0.75, + "step": 9312, + "vm_loss": 0.1987 + }, + { + "epoch": 1.7926221815819237, + "lm_loss": 0.5836, + "step": 9312, + "vm_loss": 0.1535 + }, + { + "epoch": 1.7926221815819237, + "lm_loss": 0.5484, + "step": 9312, + "vm_loss": 0.1653 + }, + { + "epoch": 1.792814688259499, + "grad_norm": 3.12519481115312, + "learning_rate": 5.564687389103207e-07, + "loss": 0.801, + "step": 9313 + }, + { + "epoch": 1.7930071949370743, + "grad_norm": 3.3483189553720942, + "learning_rate": 5.554436176405475e-07, + "loss": 0.9047, + "step": 9314 + }, + { + "epoch": 1.79319970161465, + "grad_norm": 3.284115775753295, + "learning_rate": 5.544194145024872e-07, + "loss": 0.8744, + "step": 9315 + }, + { + "epoch": 1.7933922082922251, + "grad_norm": 3.136461803652932, + "learning_rate": 5.533961295957013e-07, + "loss": 0.8326, + "step": 9316 + }, + { + "epoch": 1.7935847149698005, + "grad_norm": 3.372705292566454, + "learning_rate": 5.523737630196713e-07, + "loss": 0.8911, + "step": 9317 + }, + { + "epoch": 1.793777221647376, + "grad_norm": 3.3737858585394456, + "learning_rate": 5.513523148737809e-07, + "loss": 0.8638, + "step": 9318 + }, + { + "epoch": 1.7939697283249512, + "grad_norm": 3.345638230096811, + "learning_rate": 5.503317852573309e-07, + "loss": 0.8607, + "step": 9319 + }, + { + "epoch": 1.7941622350025268, + "grad_norm": 3.4251893513047853, + "learning_rate": 5.493121742695251e-07, + "loss": 0.897, + "step": 9320 + }, + { + "epoch": 1.7941622350025268, + "lm_loss": 0.419, + "step": 9320, + "vm_loss": 0.1827 + }, + { + "epoch": 1.7941622350025268, + "lm_loss": 0.7904, + "step": 9320, + "vm_loss": 0.1779 + }, + { + "epoch": 1.7941622350025268, + "lm_loss": 0.664, + "step": 9320, + "vm_loss": 0.1323 + }, + { + "epoch": 1.7941622350025268, + "lm_loss": 0.919, + "step": 9320, + "vm_loss": 0.1976 + }, + { + "epoch": 1.7941622350025268, + "lm_loss": 0.6742, + "step": 9320, + "vm_loss": 0.1537 + }, + { + "epoch": 1.7941622350025268, + "lm_loss": 0.5391, + "step": 9320, + "vm_loss": 0.1536 + }, + { + "epoch": 1.7941622350025268, + "lm_loss": 0.7375, + "step": 9320, + "vm_loss": 0.2275 + }, + { + "epoch": 1.7941622350025268, + "lm_loss": 0.5958, + "step": 9320, + "vm_loss": 0.2044 + }, + { + "epoch": 1.794354741680102, + "grad_norm": 3.092477120183323, + "learning_rate": 5.482934820094876e-07, + "loss": 0.8398, + "step": 9321 + }, + { + "epoch": 1.7945472483576774, + "grad_norm": 3.0705010731287135, + "learning_rate": 5.472757085762459e-07, + "loss": 0.786, + "step": 9322 + }, + { + "epoch": 1.7947397550352528, + "grad_norm": 3.2965471832632542, + "learning_rate": 5.462588540687397e-07, + "loss": 0.8568, + "step": 9323 + }, + { + "epoch": 1.794932261712828, + "grad_norm": 3.184817560153111, + "learning_rate": 5.452429185858221e-07, + "loss": 0.8206, + "step": 9324 + }, + { + "epoch": 1.7951247683904037, + "grad_norm": 3.260190325186805, + "learning_rate": 5.442279022262564e-07, + "loss": 0.822, + "step": 9325 + }, + { + "epoch": 1.795317275067979, + "grad_norm": 3.2684083878219803, + "learning_rate": 5.432138050887115e-07, + "loss": 0.8149, + "step": 9326 + }, + { + "epoch": 1.7955097817455543, + "grad_norm": 3.321153270924032, + "learning_rate": 5.422006272717717e-07, + "loss": 0.8805, + "step": 9327 + }, + { + "epoch": 1.7957022884231297, + "grad_norm": 2.979764277324341, + "learning_rate": 5.411883688739328e-07, + "loss": 0.7504, + "step": 9328 + }, + { + "epoch": 1.7957022884231297, + "lm_loss": 0.4891, + "step": 9328, + "vm_loss": 0.2018 + }, + { + "epoch": 1.7957022884231297, + "lm_loss": 0.5723, + "step": 9328, + "vm_loss": 0.1655 + }, + { + "epoch": 1.7957022884231297, + "lm_loss": 0.4893, + "step": 9328, + "vm_loss": 0.1509 + }, + { + "epoch": 1.7957022884231297, + "lm_loss": 0.6068, + "step": 9328, + "vm_loss": 0.1758 + }, + { + "epoch": 1.7957022884231297, + "lm_loss": 0.3643, + "step": 9328, + "vm_loss": 0.1396 + }, + { + "epoch": 1.7957022884231297, + "lm_loss": 0.5364, + "step": 9328, + "vm_loss": 0.1163 + }, + { + "epoch": 1.7957022884231297, + "lm_loss": 0.8323, + "step": 9328, + "vm_loss": 0.1776 + }, + { + "epoch": 1.7957022884231297, + "lm_loss": 0.6345, + "step": 9328, + "vm_loss": 0.1546 + }, + { + "epoch": 1.795894795100705, + "grad_norm": 3.3951599398942456, + "learning_rate": 5.401770299935971e-07, + "loss": 0.8778, + "step": 9329 + }, + { + "epoch": 1.7960873017782806, + "grad_norm": 3.2020205994392303, + "learning_rate": 5.391666107290805e-07, + "loss": 0.8502, + "step": 9330 + }, + { + "epoch": 1.7962798084558558, + "grad_norm": 3.1091143028082864, + "learning_rate": 5.381571111786099e-07, + "loss": 0.7952, + "step": 9331 + }, + { + "epoch": 1.7964723151334312, + "grad_norm": 3.3343196348526667, + "learning_rate": 5.371485314403202e-07, + "loss": 0.863, + "step": 9332 + }, + { + "epoch": 1.7966648218110066, + "grad_norm": 3.200568165098887, + "learning_rate": 5.361408716122585e-07, + "loss": 0.8341, + "step": 9333 + }, + { + "epoch": 1.7968573284885818, + "grad_norm": 3.0434407330028757, + "learning_rate": 5.351341317923808e-07, + "loss": 0.7836, + "step": 9334 + }, + { + "epoch": 1.7970498351661575, + "grad_norm": 3.104722092073949, + "learning_rate": 5.341283120785579e-07, + "loss": 0.8149, + "step": 9335 + }, + { + "epoch": 1.7972423418437327, + "grad_norm": 3.3548735604886257, + "learning_rate": 5.331234125685669e-07, + "loss": 0.8617, + "step": 9336 + }, + { + "epoch": 1.7972423418437327, + "lm_loss": 0.6442, + "step": 9336, + "vm_loss": 0.1453 + }, + { + "epoch": 1.7972423418437327, + "lm_loss": 0.4212, + "step": 9336, + "vm_loss": 0.2187 + }, + { + "epoch": 1.7972423418437327, + "lm_loss": 0.3811, + "step": 9336, + "vm_loss": 0.1698 + }, + { + "epoch": 1.7972423418437327, + "lm_loss": 0.5379, + "step": 9336, + "vm_loss": 0.1347 + }, + { + "epoch": 1.7972423418437327, + "lm_loss": 0.8259, + "step": 9336, + "vm_loss": 0.1428 + }, + { + "epoch": 1.7972423418437327, + "lm_loss": 0.4868, + "step": 9336, + "vm_loss": 0.1211 + }, + { + "epoch": 1.7972423418437327, + "lm_loss": 1.1915, + "step": 9336, + "vm_loss": 0.1606 + }, + { + "epoch": 1.7972423418437327, + "lm_loss": 0.4599, + "step": 9336, + "vm_loss": 0.1966 + }, + { + "epoch": 1.797434848521308, + "grad_norm": 3.1718756111237236, + "learning_rate": 5.321194333600954e-07, + "loss": 0.8357, + "step": 9337 + }, + { + "epoch": 1.7976273551988835, + "grad_norm": 3.2873528593143133, + "learning_rate": 5.311163745507464e-07, + "loss": 0.8655, + "step": 9338 + }, + { + "epoch": 1.7978198618764587, + "grad_norm": 3.2003733281005515, + "learning_rate": 5.301142362380274e-07, + "loss": 0.8433, + "step": 9339 + }, + { + "epoch": 1.7980123685540343, + "grad_norm": 3.3807919046718626, + "learning_rate": 5.291130185193594e-07, + "loss": 0.8485, + "step": 9340 + }, + { + "epoch": 1.7982048752316095, + "grad_norm": 3.4129025834983295, + "learning_rate": 5.281127214920745e-07, + "loss": 0.8408, + "step": 9341 + }, + { + "epoch": 1.798397381909185, + "grad_norm": 3.353193767271289, + "learning_rate": 5.271133452534149e-07, + "loss": 0.8932, + "step": 9342 + }, + { + "epoch": 1.7985898885867604, + "grad_norm": 3.361393517157605, + "learning_rate": 5.261148899005297e-07, + "loss": 0.8576, + "step": 9343 + }, + { + "epoch": 1.7987823952643356, + "grad_norm": 3.035842363424599, + "learning_rate": 5.251173555304845e-07, + "loss": 0.7771, + "step": 9344 + }, + { + "epoch": 1.7987823952643356, + "lm_loss": 0.8957, + "step": 9344, + "vm_loss": 0.1135 + }, + { + "epoch": 1.7987823952643356, + "lm_loss": 0.6786, + "step": 9344, + "vm_loss": 0.1625 + }, + { + "epoch": 1.7987823952643356, + "lm_loss": 0.7046, + "step": 9344, + "vm_loss": 0.1689 + }, + { + "epoch": 1.7987823952643356, + "lm_loss": 0.4745, + "step": 9344, + "vm_loss": 0.1765 + }, + { + "epoch": 1.7987823952643356, + "lm_loss": 0.5686, + "step": 9344, + "vm_loss": 0.1377 + }, + { + "epoch": 1.7987823952643356, + "lm_loss": 0.6368, + "step": 9344, + "vm_loss": 0.144 + }, + { + "epoch": 1.7987823952643356, + "lm_loss": 0.5009, + "step": 9344, + "vm_loss": 0.1979 + }, + { + "epoch": 1.7987823952643356, + "lm_loss": 0.3397, + "step": 9344, + "vm_loss": 0.1093 + }, + { + "epoch": 1.7989749019419112, + "grad_norm": 3.238650458103736, + "learning_rate": 5.241207422402516e-07, + "loss": 0.7828, + "step": 9345 + }, + { + "epoch": 1.7991674086194864, + "grad_norm": 3.313445480953595, + "learning_rate": 5.23125050126716e-07, + "loss": 0.849, + "step": 9346 + }, + { + "epoch": 1.7993599152970619, + "grad_norm": 3.1550807482936607, + "learning_rate": 5.221302792866667e-07, + "loss": 0.8189, + "step": 9347 + }, + { + "epoch": 1.7995524219746373, + "grad_norm": 3.4208320095160074, + "learning_rate": 5.211364298168142e-07, + "loss": 0.8748, + "step": 9348 + }, + { + "epoch": 1.7997449286522125, + "grad_norm": 3.3066552919313557, + "learning_rate": 5.201435018137701e-07, + "loss": 0.8006, + "step": 9349 + }, + { + "epoch": 1.7999374353297881, + "grad_norm": 3.2370739221568248, + "learning_rate": 5.191514953740606e-07, + "loss": 0.8029, + "step": 9350 + }, + { + "epoch": 1.8001299420073633, + "grad_norm": 3.328435341538871, + "learning_rate": 5.181604105941207e-07, + "loss": 0.8484, + "step": 9351 + }, + { + "epoch": 1.8003224486849387, + "grad_norm": 3.205933465828315, + "learning_rate": 5.171702475702967e-07, + "loss": 0.8239, + "step": 9352 + }, + { + "epoch": 1.8003224486849387, + "lm_loss": 0.4288, + "step": 9352, + "vm_loss": 0.1995 + }, + { + "epoch": 1.8003224486849387, + "lm_loss": 0.4346, + "step": 9352, + "vm_loss": 0.1706 + }, + { + "epoch": 1.8003224486849387, + "lm_loss": 0.5045, + "step": 9352, + "vm_loss": 0.0999 + }, + { + "epoch": 1.8003224486849387, + "lm_loss": 0.8446, + "step": 9352, + "vm_loss": 0.1791 + }, + { + "epoch": 1.8003224486849387, + "lm_loss": 0.6973, + "step": 9352, + "vm_loss": 0.136 + }, + { + "epoch": 1.8003224486849387, + "lm_loss": 0.6798, + "step": 9352, + "vm_loss": 0.145 + }, + { + "epoch": 1.8003224486849387, + "lm_loss": 0.9099, + "step": 9352, + "vm_loss": 0.1963 + }, + { + "epoch": 1.8003224486849387, + "lm_loss": 0.7159, + "step": 9352, + "vm_loss": 0.2359 + }, + { + "epoch": 1.8005149553625142, + "grad_norm": 3.0828030257960544, + "learning_rate": 5.16181006398847e-07, + "loss": 0.8273, + "step": 9353 + }, + { + "epoch": 1.8007074620400894, + "grad_norm": 3.3210035423085715, + "learning_rate": 5.15192687175935e-07, + "loss": 0.8551, + "step": 9354 + }, + { + "epoch": 1.800899968717665, + "grad_norm": 3.2880128154178903, + "learning_rate": 5.142052899976413e-07, + "loss": 0.8247, + "step": 9355 + }, + { + "epoch": 1.8010924753952402, + "grad_norm": 3.220242376130876, + "learning_rate": 5.132188149599526e-07, + "loss": 0.854, + "step": 9356 + }, + { + "epoch": 1.8012849820728156, + "grad_norm": 3.0951067978855797, + "learning_rate": 5.122332621587645e-07, + "loss": 0.8165, + "step": 9357 + }, + { + "epoch": 1.801477488750391, + "grad_norm": 3.389623384896271, + "learning_rate": 5.11248631689889e-07, + "loss": 0.8737, + "step": 9358 + }, + { + "epoch": 1.8016699954279662, + "grad_norm": 3.3559336549277643, + "learning_rate": 5.102649236490432e-07, + "loss": 0.85, + "step": 9359 + }, + { + "epoch": 1.8018625021055419, + "grad_norm": 3.4458790244766506, + "learning_rate": 5.092821381318558e-07, + "loss": 0.8883, + "step": 9360 + }, + { + "epoch": 1.8018625021055419, + "lm_loss": 0.5459, + "step": 9360, + "vm_loss": 0.181 + }, + { + "epoch": 1.8018625021055419, + "lm_loss": 0.7529, + "step": 9360, + "vm_loss": 0.1808 + }, + { + "epoch": 1.8018625021055419, + "lm_loss": 0.8306, + "step": 9360, + "vm_loss": 0.0763 + }, + { + "epoch": 1.8018625021055419, + "lm_loss": 0.441, + "step": 9360, + "vm_loss": 0.1237 + }, + { + "epoch": 1.8018625021055419, + "lm_loss": 0.833, + "step": 9360, + "vm_loss": 0.1329 + }, + { + "epoch": 1.8018625021055419, + "lm_loss": 0.6061, + "step": 9360, + "vm_loss": 0.1519 + }, + { + "epoch": 1.8018625021055419, + "lm_loss": 0.7371, + "step": 9360, + "vm_loss": 0.1552 + }, + { + "epoch": 1.8018625021055419, + "lm_loss": 0.7924, + "step": 9360, + "vm_loss": 0.1777 + }, + { + "epoch": 1.802055008783117, + "grad_norm": 3.2184758421167348, + "learning_rate": 5.083002752338651e-07, + "loss": 0.8368, + "step": 9361 + }, + { + "epoch": 1.8022475154606925, + "grad_norm": 3.2145464809813142, + "learning_rate": 5.073193350505224e-07, + "loss": 0.816, + "step": 9362 + }, + { + "epoch": 1.802440022138268, + "grad_norm": 3.3602811439505893, + "learning_rate": 5.063393176771891e-07, + "loss": 0.8698, + "step": 9363 + }, + { + "epoch": 1.8026325288158433, + "grad_norm": 3.2685173005618493, + "learning_rate": 5.053602232091304e-07, + "loss": 0.8383, + "step": 9364 + }, + { + "epoch": 1.8028250354934188, + "grad_norm": 3.391957322297027, + "learning_rate": 5.043820517415321e-07, + "loss": 0.8373, + "step": 9365 + }, + { + "epoch": 1.803017542170994, + "grad_norm": 3.0841682065647857, + "learning_rate": 5.034048033694816e-07, + "loss": 0.8386, + "step": 9366 + }, + { + "epoch": 1.8032100488485694, + "grad_norm": 3.1118568574819685, + "learning_rate": 5.024284781879829e-07, + "loss": 0.8053, + "step": 9367 + }, + { + "epoch": 1.8034025555261448, + "grad_norm": 3.113797492775831, + "learning_rate": 5.014530762919423e-07, + "loss": 0.817, + "step": 9368 + }, + { + "epoch": 1.8034025555261448, + "lm_loss": 0.9392, + "step": 9368, + "vm_loss": 0.1126 + }, + { + "epoch": 1.8034025555261448, + "lm_loss": 0.6223, + "step": 9368, + "vm_loss": 0.1318 + }, + { + "epoch": 1.8034025555261448, + "lm_loss": 0.5048, + "step": 9368, + "vm_loss": 0.1284 + }, + { + "epoch": 1.8034025555261448, + "lm_loss": 0.4922, + "step": 9368, + "vm_loss": 0.1764 + }, + { + "epoch": 1.8034025555261448, + "lm_loss": 1.0926, + "step": 9368, + "vm_loss": 0.1826 + }, + { + "epoch": 1.8034025555261448, + "lm_loss": 0.5785, + "step": 9368, + "vm_loss": 0.1987 + }, + { + "epoch": 1.8034025555261448, + "lm_loss": 0.6226, + "step": 9368, + "vm_loss": 0.1709 + }, + { + "epoch": 1.8034025555261448, + "lm_loss": 0.7299, + "step": 9368, + "vm_loss": 0.1578 + }, + { + "epoch": 1.8035950622037202, + "grad_norm": 3.280044274119619, + "learning_rate": 5.004785977761884e-07, + "loss": 0.8224, + "step": 9369 + }, + { + "epoch": 1.8037875688812957, + "grad_norm": 3.195232612770297, + "learning_rate": 4.995050427354464e-07, + "loss": 0.806, + "step": 9370 + }, + { + "epoch": 1.8039800755588709, + "grad_norm": 3.426690814831063, + "learning_rate": 4.98532411264362e-07, + "loss": 0.947, + "step": 9371 + }, + { + "epoch": 1.8041725822364463, + "grad_norm": 3.3469601396181567, + "learning_rate": 4.975607034574859e-07, + "loss": 0.8482, + "step": 9372 + }, + { + "epoch": 1.8043650889140217, + "grad_norm": 3.297804878638026, + "learning_rate": 4.965899194092816e-07, + "loss": 0.8558, + "step": 9373 + }, + { + "epoch": 1.8045575955915971, + "grad_norm": 3.2765246519035367, + "learning_rate": 4.956200592141213e-07, + "loss": 0.8691, + "step": 9374 + }, + { + "epoch": 1.8047501022691725, + "grad_norm": 3.1839352497095272, + "learning_rate": 4.946511229662876e-07, + "loss": 0.8421, + "step": 9375 + }, + { + "epoch": 1.8049426089467477, + "grad_norm": 3.3267236640530213, + "learning_rate": 4.936831107599749e-07, + "loss": 0.8471, + "step": 9376 + }, + { + "epoch": 1.8049426089467477, + "lm_loss": 1.0876, + "step": 9376, + "vm_loss": 0.2014 + }, + { + "epoch": 1.8049426089467477, + "lm_loss": 0.5483, + "step": 9376, + "vm_loss": 0.206 + }, + { + "epoch": 1.8049426089467477, + "lm_loss": 0.5349, + "step": 9376, + "vm_loss": 0.1636 + }, + { + "epoch": 1.8049426089467477, + "lm_loss": 0.9603, + "step": 9376, + "vm_loss": 0.1788 + }, + { + "epoch": 1.8049426089467477, + "lm_loss": 0.4511, + "step": 9376, + "vm_loss": 0.1387 + }, + { + "epoch": 1.8049426089467477, + "lm_loss": 0.3138, + "step": 9376, + "vm_loss": 0.1944 + }, + { + "epoch": 1.8049426089467477, + "lm_loss": 0.615, + "step": 9376, + "vm_loss": 0.1465 + }, + { + "epoch": 1.8049426089467477, + "lm_loss": 0.451, + "step": 9376, + "vm_loss": 0.1682 + }, + { + "epoch": 1.8051351156243234, + "grad_norm": 3.090837298837418, + "learning_rate": 4.927160226892835e-07, + "loss": 0.8196, + "step": 9377 + }, + { + "epoch": 1.8053276223018986, + "grad_norm": 3.1949496974142386, + "learning_rate": 4.917498588482295e-07, + "loss": 0.82, + "step": 9378 + }, + { + "epoch": 1.805520128979474, + "grad_norm": 3.193427470900945, + "learning_rate": 4.907846193307342e-07, + "loss": 0.8013, + "step": 9379 + }, + { + "epoch": 1.8057126356570494, + "grad_norm": 3.1994504460145645, + "learning_rate": 4.898203042306337e-07, + "loss": 0.843, + "step": 9380 + }, + { + "epoch": 1.8059051423346246, + "grad_norm": 3.086406355213787, + "learning_rate": 4.888569136416687e-07, + "loss": 0.8159, + "step": 9381 + }, + { + "epoch": 1.8060976490122003, + "grad_norm": 3.250327656138653, + "learning_rate": 4.878944476574965e-07, + "loss": 0.847, + "step": 9382 + }, + { + "epoch": 1.8062901556897755, + "grad_norm": 3.380748805639859, + "learning_rate": 4.869329063716776e-07, + "loss": 0.9055, + "step": 9383 + }, + { + "epoch": 1.8064826623673509, + "grad_norm": 3.272211286399803, + "learning_rate": 4.859722898776898e-07, + "loss": 0.8352, + "step": 9384 + }, + { + "epoch": 1.8064826623673509, + "lm_loss": 0.5525, + "step": 9384, + "vm_loss": 0.085 + }, + { + "epoch": 1.8064826623673509, + "lm_loss": 0.5594, + "step": 9384, + "vm_loss": 0.13 + }, + { + "epoch": 1.8064826623673509, + "lm_loss": 0.6448, + "step": 9384, + "vm_loss": 0.1511 + }, + { + "epoch": 1.8064826623673509, + "lm_loss": 0.4487, + "step": 9384, + "vm_loss": 0.1325 + }, + { + "epoch": 1.8064826623673509, + "lm_loss": 0.6913, + "step": 9384, + "vm_loss": 0.2148 + }, + { + "epoch": 1.8064826623673509, + "lm_loss": 1.022, + "step": 9384, + "vm_loss": 0.2565 + }, + { + "epoch": 1.8064826623673509, + "lm_loss": 0.7156, + "step": 9384, + "vm_loss": 0.1631 + }, + { + "epoch": 1.8064826623673509, + "lm_loss": 0.2478, + "step": 9384, + "vm_loss": 0.1447 + }, + { + "epoch": 1.8066751690449263, + "grad_norm": 3.1930833416047504, + "learning_rate": 4.850125982689125e-07, + "loss": 0.8275, + "step": 9385 + }, + { + "epoch": 1.8068676757225015, + "grad_norm": 3.28107320588364, + "learning_rate": 4.840538316386457e-07, + "loss": 0.84, + "step": 9386 + }, + { + "epoch": 1.8070601824000772, + "grad_norm": 3.2106028022349378, + "learning_rate": 4.830959900800902e-07, + "loss": 0.8351, + "step": 9387 + }, + { + "epoch": 1.8072526890776524, + "grad_norm": 3.2673671297868485, + "learning_rate": 4.821390736863607e-07, + "loss": 0.8328, + "step": 9388 + }, + { + "epoch": 1.8074451957552278, + "grad_norm": 3.1604949019357265, + "learning_rate": 4.811830825504815e-07, + "loss": 0.805, + "step": 9389 + }, + { + "epoch": 1.8076377024328032, + "grad_norm": 3.215626338988843, + "learning_rate": 4.802280167653906e-07, + "loss": 0.8289, + "step": 9390 + }, + { + "epoch": 1.8078302091103784, + "grad_norm": 3.22590524519466, + "learning_rate": 4.792738764239269e-07, + "loss": 0.8681, + "step": 9391 + }, + { + "epoch": 1.808022715787954, + "grad_norm": 3.117350840472475, + "learning_rate": 4.783206616188496e-07, + "loss": 0.776, + "step": 9392 + }, + { + "epoch": 1.808022715787954, + "lm_loss": 0.5703, + "step": 9392, + "vm_loss": 0.1705 + }, + { + "epoch": 1.808022715787954, + "lm_loss": 0.5303, + "step": 9392, + "vm_loss": 0.0931 + }, + { + "epoch": 1.808022715787954, + "lm_loss": 0.4471, + "step": 9392, + "vm_loss": 0.1681 + }, + { + "epoch": 1.808022715787954, + "lm_loss": 0.5546, + "step": 9392, + "vm_loss": 0.1388 + }, + { + "epoch": 1.808022715787954, + "lm_loss": 0.7287, + "step": 9392, + "vm_loss": 0.1597 + }, + { + "epoch": 1.808022715787954, + "lm_loss": 0.6972, + "step": 9392, + "vm_loss": 0.1829 + }, + { + "epoch": 1.808022715787954, + "lm_loss": 0.4452, + "step": 9392, + "vm_loss": 0.2241 + }, + { + "epoch": 1.808022715787954, + "lm_loss": 0.6672, + "step": 9392, + "vm_loss": 0.0943 + }, + { + "epoch": 1.8082152224655292, + "grad_norm": 3.328999970921193, + "learning_rate": 4.773683724428213e-07, + "loss": 0.8301, + "step": 9393 + }, + { + "epoch": 1.8084077291431047, + "grad_norm": 3.255319707033454, + "learning_rate": 4.764170089884179e-07, + "loss": 0.8164, + "step": 9394 + }, + { + "epoch": 1.80860023582068, + "grad_norm": 3.2216255086780863, + "learning_rate": 4.7546657134812213e-07, + "loss": 0.8269, + "step": 9395 + }, + { + "epoch": 1.8087927424982553, + "grad_norm": 3.291390470375985, + "learning_rate": 4.7451705961433115e-07, + "loss": 0.8629, + "step": 9396 + }, + { + "epoch": 1.808985249175831, + "grad_norm": 3.0083219781833357, + "learning_rate": 4.735684738793489e-07, + "loss": 0.7825, + "step": 9397 + }, + { + "epoch": 1.8091777558534061, + "grad_norm": 3.1930037153849566, + "learning_rate": 4.7262081423538717e-07, + "loss": 0.8528, + "step": 9398 + }, + { + "epoch": 1.8093702625309815, + "grad_norm": 3.2793185056427148, + "learning_rate": 4.7167408077457676e-07, + "loss": 0.8776, + "step": 9399 + }, + { + "epoch": 1.809562769208557, + "grad_norm": 3.3941752540386196, + "learning_rate": 4.7072827358894626e-07, + "loss": 0.8912, + "step": 9400 + }, + { + "epoch": 1.809562769208557, + "lm_loss": 0.3304, + "step": 9400, + "vm_loss": 0.1011 + }, + { + "epoch": 1.809562769208557, + "lm_loss": 0.8845, + "step": 9400, + "vm_loss": 0.1845 + }, + { + "epoch": 1.809562769208557, + "lm_loss": 0.9758, + "step": 9400, + "vm_loss": 0.1765 + }, + { + "epoch": 1.809562769208557, + "lm_loss": 0.5514, + "step": 9400, + "vm_loss": 0.1865 + }, + { + "epoch": 1.809562769208557, + "lm_loss": 0.9839, + "step": 9400, + "vm_loss": 0.1316 + }, + { + "epoch": 1.809562769208557, + "lm_loss": 0.7445, + "step": 9400, + "vm_loss": 0.1636 + }, + { + "epoch": 1.809562769208557, + "lm_loss": 0.5048, + "step": 9400, + "vm_loss": 0.2239 + }, + { + "epoch": 1.809562769208557, + "lm_loss": 0.3279, + "step": 9400, + "vm_loss": 0.1502 + }, + { + "epoch": 1.8097552758861322, + "grad_norm": 3.2830472742796197, + "learning_rate": 4.6978339277044427e-07, + "loss": 0.8475, + "step": 9401 + }, + { + "epoch": 1.8099477825637078, + "grad_norm": 3.067944874843008, + "learning_rate": 4.688394384109207e-07, + "loss": 0.7828, + "step": 9402 + }, + { + "epoch": 1.810140289241283, + "grad_norm": 3.3273113417882887, + "learning_rate": 4.678964106021455e-07, + "loss": 0.8227, + "step": 9403 + }, + { + "epoch": 1.8103327959188584, + "grad_norm": 3.2706842106191383, + "learning_rate": 4.6695430943578865e-07, + "loss": 0.857, + "step": 9404 + }, + { + "epoch": 1.8105253025964338, + "grad_norm": 3.128044813767231, + "learning_rate": 4.660131350034369e-07, + "loss": 0.8037, + "step": 9405 + }, + { + "epoch": 1.810717809274009, + "grad_norm": 3.220675583801118, + "learning_rate": 4.6507288739658264e-07, + "loss": 0.8216, + "step": 9406 + }, + { + "epoch": 1.8109103159515847, + "grad_norm": 3.3167907751594847, + "learning_rate": 4.641335667066338e-07, + "loss": 0.8782, + "step": 9407 + }, + { + "epoch": 1.81110282262916, + "grad_norm": 3.266160835205097, + "learning_rate": 4.6319517302489957e-07, + "loss": 0.8574, + "step": 9408 + }, + { + "epoch": 1.81110282262916, + "lm_loss": 0.4952, + "step": 9408, + "vm_loss": 0.1499 + }, + { + "epoch": 1.81110282262916, + "lm_loss": 1.0528, + "step": 9408, + "vm_loss": 0.2168 + }, + { + "epoch": 1.81110282262916, + "lm_loss": 0.4697, + "step": 9408, + "vm_loss": 0.1565 + }, + { + "epoch": 1.81110282262916, + "lm_loss": 0.7696, + "step": 9408, + "vm_loss": 0.1477 + }, + { + "epoch": 1.81110282262916, + "lm_loss": 0.4651, + "step": 9408, + "vm_loss": 0.1734 + }, + { + "epoch": 1.81110282262916, + "lm_loss": 0.4699, + "step": 9408, + "vm_loss": 0.1992 + }, + { + "epoch": 1.81110282262916, + "lm_loss": 0.4982, + "step": 9408, + "vm_loss": 0.1253 + }, + { + "epoch": 1.81110282262916, + "lm_loss": 0.8186, + "step": 9408, + "vm_loss": 0.1847 + }, + { + "epoch": 1.8112953293067353, + "grad_norm": 3.2946795730545384, + "learning_rate": 4.6225770644260595e-07, + "loss": 0.883, + "step": 9409 + }, + { + "epoch": 1.8114878359843107, + "grad_norm": 3.1772531443367757, + "learning_rate": 4.6132116705088657e-07, + "loss": 0.8252, + "step": 9410 + }, + { + "epoch": 1.811680342661886, + "grad_norm": 3.231380433652586, + "learning_rate": 4.603855549407854e-07, + "loss": 0.8472, + "step": 9411 + }, + { + "epoch": 1.8118728493394616, + "grad_norm": 3.374213779641825, + "learning_rate": 4.5945087020325517e-07, + "loss": 0.8649, + "step": 9412 + }, + { + "epoch": 1.8120653560170368, + "grad_norm": 3.2444244359321845, + "learning_rate": 4.58517112929161e-07, + "loss": 0.8248, + "step": 9413 + }, + { + "epoch": 1.8122578626946122, + "grad_norm": 3.4150854253433702, + "learning_rate": 4.575842832092736e-07, + "loss": 0.8831, + "step": 9414 + }, + { + "epoch": 1.8124503693721876, + "grad_norm": 3.2238964268883374, + "learning_rate": 4.566523811342771e-07, + "loss": 0.8321, + "step": 9415 + }, + { + "epoch": 1.8126428760497628, + "grad_norm": 3.2174936967269288, + "learning_rate": 4.5572140679476463e-07, + "loss": 0.8352, + "step": 9416 + }, + { + "epoch": 1.8126428760497628, + "lm_loss": 0.4912, + "step": 9416, + "vm_loss": 0.1057 + }, + { + "epoch": 1.8126428760497628, + "lm_loss": 0.4811, + "step": 9416, + "vm_loss": 0.172 + }, + { + "epoch": 1.8126428760497628, + "lm_loss": 0.75, + "step": 9416, + "vm_loss": 0.2226 + }, + { + "epoch": 1.8126428760497628, + "lm_loss": 0.5795, + "step": 9416, + "vm_loss": 0.1807 + }, + { + "epoch": 1.8126428760497628, + "lm_loss": 0.4087, + "step": 9416, + "vm_loss": 0.1429 + }, + { + "epoch": 1.8126428760497628, + "lm_loss": 0.6448, + "step": 9416, + "vm_loss": 0.1713 + }, + { + "epoch": 1.8126428760497628, + "lm_loss": 0.916, + "step": 9416, + "vm_loss": 0.1786 + }, + { + "epoch": 1.8126428760497628, + "lm_loss": 0.7205, + "step": 9416, + "vm_loss": 0.1924 + }, + { + "epoch": 1.8128353827273385, + "grad_norm": 3.0536734160637335, + "learning_rate": 4.5479136028123704e-07, + "loss": 0.7988, + "step": 9417 + }, + { + "epoch": 1.8130278894049137, + "grad_norm": 3.2131334457063887, + "learning_rate": 4.5386224168411097e-07, + "loss": 0.8364, + "step": 9418 + }, + { + "epoch": 1.813220396082489, + "grad_norm": 3.2724589199458913, + "learning_rate": 4.529340510937019e-07, + "loss": 0.8358, + "step": 9419 + }, + { + "epoch": 1.8134129027600645, + "grad_norm": 3.353461949216105, + "learning_rate": 4.520067886002488e-07, + "loss": 0.8724, + "step": 9420 + }, + { + "epoch": 1.8136054094376397, + "grad_norm": 3.3036144466552217, + "learning_rate": 4.5108045429388956e-07, + "loss": 0.8574, + "step": 9421 + }, + { + "epoch": 1.8137979161152153, + "grad_norm": 3.2005032457145903, + "learning_rate": 4.501550482646755e-07, + "loss": 0.8792, + "step": 9422 + }, + { + "epoch": 1.8139904227927905, + "grad_norm": 3.2556618639109933, + "learning_rate": 4.4923057060256903e-07, + "loss": 0.8406, + "step": 9423 + }, + { + "epoch": 1.814182929470366, + "grad_norm": 3.2181270573242378, + "learning_rate": 4.4830702139744386e-07, + "loss": 0.8199, + "step": 9424 + }, + { + "epoch": 1.814182929470366, + "lm_loss": 0.3965, + "step": 9424, + "vm_loss": 0.114 + }, + { + "epoch": 1.814182929470366, + "lm_loss": 0.9325, + "step": 9424, + "vm_loss": 0.1475 + }, + { + "epoch": 1.814182929470366, + "lm_loss": 0.5873, + "step": 9424, + "vm_loss": 0.1288 + }, + { + "epoch": 1.814182929470366, + "lm_loss": 0.7009, + "step": 9424, + "vm_loss": 0.248 + }, + { + "epoch": 1.814182929470366, + "lm_loss": 0.8435, + "step": 9424, + "vm_loss": 0.1568 + }, + { + "epoch": 1.814182929470366, + "lm_loss": 0.5541, + "step": 9424, + "vm_loss": 0.1426 + }, + { + "epoch": 1.814182929470366, + "lm_loss": 0.7144, + "step": 9424, + "vm_loss": 0.1633 + }, + { + "epoch": 1.814182929470366, + "lm_loss": 0.6195, + "step": 9424, + "vm_loss": 0.1474 + }, + { + "epoch": 1.8143754361479414, + "grad_norm": 3.340226311419131, + "learning_rate": 4.4738440073907484e-07, + "loss": 0.865, + "step": 9425 + }, + { + "epoch": 1.8145679428255168, + "grad_norm": 3.2507799288874555, + "learning_rate": 4.4646270871715693e-07, + "loss": 0.8509, + "step": 9426 + }, + { + "epoch": 1.8147604495030922, + "grad_norm": 3.300074590968059, + "learning_rate": 4.4554194542128837e-07, + "loss": 0.8662, + "step": 9427 + }, + { + "epoch": 1.8149529561806674, + "grad_norm": 3.3049519225319464, + "learning_rate": 4.446221109409821e-07, + "loss": 0.8622, + "step": 9428 + }, + { + "epoch": 1.8151454628582429, + "grad_norm": 3.2942704775606124, + "learning_rate": 4.43703205365652e-07, + "loss": 0.881, + "step": 9429 + }, + { + "epoch": 1.8153379695358183, + "grad_norm": 3.356351412684281, + "learning_rate": 4.427852287846346e-07, + "loss": 0.8809, + "step": 9430 + }, + { + "epoch": 1.8155304762133937, + "grad_norm": 3.2733802979010242, + "learning_rate": 4.418681812871639e-07, + "loss": 0.8267, + "step": 9431 + }, + { + "epoch": 1.8157229828909691, + "grad_norm": 3.3050535855163377, + "learning_rate": 4.409520629623898e-07, + "loss": 0.8543, + "step": 9432 + }, + { + "epoch": 1.8157229828909691, + "lm_loss": 0.638, + "step": 9432, + "vm_loss": 0.1496 + }, + { + "epoch": 1.8157229828909691, + "lm_loss": 1.0204, + "step": 9432, + "vm_loss": 0.2284 + }, + { + "epoch": 1.8157229828909691, + "lm_loss": 0.5644, + "step": 9432, + "vm_loss": 0.1782 + }, + { + "epoch": 1.8157229828909691, + "lm_loss": 0.4665, + "step": 9432, + "vm_loss": 0.1344 + }, + { + "epoch": 1.8157229828909691, + "lm_loss": 0.4625, + "step": 9432, + "vm_loss": 0.1683 + }, + { + "epoch": 1.8157229828909691, + "lm_loss": 0.7166, + "step": 9432, + "vm_loss": 0.1418 + }, + { + "epoch": 1.8157229828909691, + "lm_loss": 0.7052, + "step": 9432, + "vm_loss": 0.1666 + }, + { + "epoch": 1.8157229828909691, + "lm_loss": 0.7929, + "step": 9432, + "vm_loss": 0.1496 + }, + { + "epoch": 1.8159154895685443, + "grad_norm": 3.222408781441603, + "learning_rate": 4.400368738993721e-07, + "loss": 0.8449, + "step": 9433 + }, + { + "epoch": 1.8161079962461197, + "grad_norm": 3.266409053698989, + "learning_rate": 4.391226141870786e-07, + "loss": 0.8457, + "step": 9434 + }, + { + "epoch": 1.8163005029236952, + "grad_norm": 3.298062890561418, + "learning_rate": 4.3820928391438477e-07, + "loss": 0.86, + "step": 9435 + }, + { + "epoch": 1.8164930096012706, + "grad_norm": 3.260219123108118, + "learning_rate": 4.3729688317007966e-07, + "loss": 0.8366, + "step": 9436 + }, + { + "epoch": 1.816685516278846, + "grad_norm": 3.3936910604462884, + "learning_rate": 4.363854120428601e-07, + "loss": 0.8892, + "step": 9437 + }, + { + "epoch": 1.8168780229564212, + "grad_norm": 3.2660571408186656, + "learning_rate": 4.35474870621333e-07, + "loss": 0.843, + "step": 9438 + }, + { + "epoch": 1.8170705296339968, + "grad_norm": 3.389533328473793, + "learning_rate": 4.3456525899401414e-07, + "loss": 0.8682, + "step": 9439 + }, + { + "epoch": 1.817263036311572, + "grad_norm": 3.067503475867899, + "learning_rate": 4.336565772493306e-07, + "loss": 0.7984, + "step": 9440 + }, + { + "epoch": 1.817263036311572, + "lm_loss": 0.6951, + "step": 9440, + "vm_loss": 0.1623 + }, + { + "epoch": 1.817263036311572, + "lm_loss": 0.6871, + "step": 9440, + "vm_loss": 0.1802 + }, + { + "epoch": 1.817263036311572, + "lm_loss": 0.7169, + "step": 9440, + "vm_loss": 0.2062 + }, + { + "epoch": 1.817263036311572, + "lm_loss": 0.7186, + "step": 9440, + "vm_loss": 0.1236 + }, + { + "epoch": 1.817263036311572, + "lm_loss": 0.8027, + "step": 9440, + "vm_loss": 0.1118 + }, + { + "epoch": 1.817263036311572, + "lm_loss": 0.904, + "step": 9440, + "vm_loss": 0.1952 + }, + { + "epoch": 1.817263036311572, + "lm_loss": 0.3963, + "step": 9440, + "vm_loss": 0.1459 + }, + { + "epoch": 1.817263036311572, + "lm_loss": 0.818, + "step": 9440, + "vm_loss": 0.1539 + }, + { + "epoch": 1.8174555429891475, + "grad_norm": 3.331958164811603, + "learning_rate": 4.3274882547561734e-07, + "loss": 0.8464, + "step": 9441 + }, + { + "epoch": 1.8176480496667229, + "grad_norm": 3.1949212223058687, + "learning_rate": 4.318420037611182e-07, + "loss": 0.8341, + "step": 9442 + }, + { + "epoch": 1.817840556344298, + "grad_norm": 3.284468477427103, + "learning_rate": 4.3093611219398923e-07, + "loss": 0.8487, + "step": 9443 + }, + { + "epoch": 1.8180330630218737, + "grad_norm": 3.1956671405439945, + "learning_rate": 4.300311508622945e-07, + "loss": 0.8388, + "step": 9444 + }, + { + "epoch": 1.818225569699449, + "grad_norm": 3.173443300358134, + "learning_rate": 4.291271198540081e-07, + "loss": 0.8241, + "step": 9445 + }, + { + "epoch": 1.8184180763770244, + "grad_norm": 3.187716817338898, + "learning_rate": 4.2822401925700973e-07, + "loss": 0.8037, + "step": 9446 + }, + { + "epoch": 1.8186105830545998, + "grad_norm": 3.4068845747360985, + "learning_rate": 4.273218491590991e-07, + "loss": 0.8658, + "step": 9447 + }, + { + "epoch": 1.818803089732175, + "grad_norm": 3.2799445475725535, + "learning_rate": 4.264206096479728e-07, + "loss": 0.8527, + "step": 9448 + }, + { + "epoch": 1.818803089732175, + "lm_loss": 1.0119, + "step": 9448, + "vm_loss": 0.209 + }, + { + "epoch": 1.818803089732175, + "lm_loss": 0.4209, + "step": 9448, + "vm_loss": 0.1752 + }, + { + "epoch": 1.818803089732175, + "lm_loss": 0.6887, + "step": 9448, + "vm_loss": 0.1445 + }, + { + "epoch": 1.818803089732175, + "lm_loss": 0.3712, + "step": 9448, + "vm_loss": 0.1486 + }, + { + "epoch": 1.818803089732175, + "lm_loss": 0.3663, + "step": 9448, + "vm_loss": 0.183 + }, + { + "epoch": 1.818803089732175, + "lm_loss": 0.7808, + "step": 9448, + "vm_loss": 0.1236 + }, + { + "epoch": 1.818803089732175, + "lm_loss": 0.5196, + "step": 9448, + "vm_loss": 0.2027 + }, + { + "epoch": 1.818803089732175, + "lm_loss": 0.3506, + "step": 9448, + "vm_loss": 0.1426 + }, + { + "epoch": 1.8189955964097506, + "grad_norm": 3.240357748082226, + "learning_rate": 4.2552030081124627e-07, + "loss": 0.8317, + "step": 9449 + }, + { + "epoch": 1.8191881030873258, + "grad_norm": 3.1713314013418206, + "learning_rate": 4.246209227364395e-07, + "loss": 0.8174, + "step": 9450 + }, + { + "epoch": 1.8193806097649012, + "grad_norm": 3.26215107528053, + "learning_rate": 4.2372247551098477e-07, + "loss": 0.8379, + "step": 9451 + }, + { + "epoch": 1.8195731164424767, + "grad_norm": 3.1984297910279693, + "learning_rate": 4.228249592222211e-07, + "loss": 0.7654, + "step": 9452 + }, + { + "epoch": 1.8197656231200519, + "grad_norm": 3.265262614965891, + "learning_rate": 4.2192837395739874e-07, + "loss": 0.8143, + "step": 9453 + }, + { + "epoch": 1.8199581297976275, + "grad_norm": 3.314015175924182, + "learning_rate": 4.210327198036779e-07, + "loss": 0.8942, + "step": 9454 + }, + { + "epoch": 1.8201506364752027, + "grad_norm": 3.181754635117066, + "learning_rate": 4.201379968481267e-07, + "loss": 0.8382, + "step": 9455 + }, + { + "epoch": 1.8203431431527781, + "grad_norm": 3.271114711349186, + "learning_rate": 4.1924420517772567e-07, + "loss": 0.8319, + "step": 9456 + }, + { + "epoch": 1.8203431431527781, + "lm_loss": 0.7772, + "step": 9456, + "vm_loss": 0.1567 + }, + { + "epoch": 1.8203431431527781, + "lm_loss": 0.6097, + "step": 9456, + "vm_loss": 0.1379 + }, + { + "epoch": 1.8203431431527781, + "lm_loss": 0.568, + "step": 9456, + "vm_loss": 0.2546 + }, + { + "epoch": 1.8203431431527781, + "lm_loss": 1.1385, + "step": 9456, + "vm_loss": 0.2128 + }, + { + "epoch": 1.8203431431527781, + "lm_loss": 0.5089, + "step": 9456, + "vm_loss": 0.1452 + }, + { + "epoch": 1.8203431431527781, + "lm_loss": 0.3091, + "step": 9456, + "vm_loss": 0.1582 + }, + { + "epoch": 1.8203431431527781, + "lm_loss": 0.9031, + "step": 9456, + "vm_loss": 0.2212 + }, + { + "epoch": 1.8203431431527781, + "lm_loss": 0.4583, + "step": 9456, + "vm_loss": 0.1886 + }, + { + "epoch": 1.8205356498303535, + "grad_norm": 3.1912558163715032, + "learning_rate": 4.183513448793619e-07, + "loss": 0.8401, + "step": 9457 + }, + { + "epoch": 1.8207281565079287, + "grad_norm": 3.2081449124148573, + "learning_rate": 4.1745941603983155e-07, + "loss": 0.8489, + "step": 9458 + }, + { + "epoch": 1.8209206631855044, + "grad_norm": 3.3046801919679387, + "learning_rate": 4.1656841874584296e-07, + "loss": 0.8571, + "step": 9459 + }, + { + "epoch": 1.8211131698630796, + "grad_norm": 3.4789007492680213, + "learning_rate": 4.156783530840114e-07, + "loss": 0.9039, + "step": 9460 + }, + { + "epoch": 1.821305676540655, + "grad_norm": 3.199002017346473, + "learning_rate": 4.147892191408631e-07, + "loss": 0.8576, + "step": 9461 + }, + { + "epoch": 1.8214981832182304, + "grad_norm": 3.1372556741105897, + "learning_rate": 4.139010170028346e-07, + "loss": 0.8119, + "step": 9462 + }, + { + "epoch": 1.8216906898958056, + "grad_norm": 3.2893646961238825, + "learning_rate": 4.130137467562667e-07, + "loss": 0.8986, + "step": 9463 + }, + { + "epoch": 1.8218831965733813, + "grad_norm": 3.1882090770691525, + "learning_rate": 4.121274084874194e-07, + "loss": 0.7992, + "step": 9464 + }, + { + "epoch": 1.8218831965733813, + "lm_loss": 0.6522, + "step": 9464, + "vm_loss": 0.1834 + }, + { + "epoch": 1.8218831965733813, + "lm_loss": 0.7543, + "step": 9464, + "vm_loss": 0.1446 + }, + { + "epoch": 1.8218831965733813, + "lm_loss": 0.5803, + "step": 9464, + "vm_loss": 0.1796 + }, + { + "epoch": 1.8218831965733813, + "lm_loss": 0.521, + "step": 9464, + "vm_loss": 0.1924 + }, + { + "epoch": 1.8218831965733813, + "lm_loss": 0.7159, + "step": 9464, + "vm_loss": 0.1606 + }, + { + "epoch": 1.8218831965733813, + "lm_loss": 0.749, + "step": 9464, + "vm_loss": 0.1685 + }, + { + "epoch": 1.8218831965733813, + "lm_loss": 0.9796, + "step": 9464, + "vm_loss": 0.1914 + }, + { + "epoch": 1.8218831965733813, + "lm_loss": 0.516, + "step": 9464, + "vm_loss": 0.1594 + }, + { + "epoch": 1.8220757032509565, + "grad_norm": 3.1491455088349714, + "learning_rate": 4.1124200228245037e-07, + "loss": 0.8443, + "step": 9465 + }, + { + "epoch": 1.822268209928532, + "grad_norm": 3.190344975667269, + "learning_rate": 4.1035752822743524e-07, + "loss": 0.8145, + "step": 9466 + }, + { + "epoch": 1.8224607166061073, + "grad_norm": 3.3399938832626135, + "learning_rate": 4.0947398640835636e-07, + "loss": 0.8749, + "step": 9467 + }, + { + "epoch": 1.8226532232836825, + "grad_norm": 3.1419700470562497, + "learning_rate": 4.0859137691110495e-07, + "loss": 0.8157, + "step": 9468 + }, + { + "epoch": 1.8228457299612582, + "grad_norm": 3.217638726886422, + "learning_rate": 4.0770969982148025e-07, + "loss": 0.8412, + "step": 9469 + }, + { + "epoch": 1.8230382366388334, + "grad_norm": 3.3210595966246945, + "learning_rate": 4.0682895522519474e-07, + "loss": 0.847, + "step": 9470 + }, + { + "epoch": 1.8232307433164088, + "grad_norm": 3.3081842533032617, + "learning_rate": 4.059491432078666e-07, + "loss": 0.8349, + "step": 9471 + }, + { + "epoch": 1.8234232499939842, + "grad_norm": 3.217377490710455, + "learning_rate": 4.0507026385502747e-07, + "loss": 0.8739, + "step": 9472 + }, + { + "epoch": 1.8234232499939842, + "lm_loss": 0.6206, + "step": 9472, + "vm_loss": 0.1427 + }, + { + "epoch": 1.8234232499939842, + "lm_loss": 0.7198, + "step": 9472, + "vm_loss": 0.1805 + }, + { + "epoch": 1.8234232499939842, + "lm_loss": 0.7708, + "step": 9472, + "vm_loss": 0.1565 + }, + { + "epoch": 1.8234232499939842, + "lm_loss": 0.4317, + "step": 9472, + "vm_loss": 0.1807 + }, + { + "epoch": 1.8234232499939842, + "lm_loss": 0.4208, + "step": 9472, + "vm_loss": 0.1374 + }, + { + "epoch": 1.8234232499939842, + "lm_loss": 1.276, + "step": 9472, + "vm_loss": 0.1489 + }, + { + "epoch": 1.8234232499939842, + "lm_loss": 0.4207, + "step": 9472, + "vm_loss": 0.1713 + }, + { + "epoch": 1.8234232499939842, + "lm_loss": 0.9471, + "step": 9472, + "vm_loss": 0.1818 + }, + { + "epoch": 1.8236157566715594, + "grad_norm": 3.430338053425075, + "learning_rate": 4.0419231725211005e-07, + "loss": 0.8747, + "step": 9473 + }, + { + "epoch": 1.823808263349135, + "grad_norm": 3.307244858408728, + "learning_rate": 4.0331530348446944e-07, + "loss": 0.8541, + "step": 9474 + }, + { + "epoch": 1.8240007700267102, + "grad_norm": 3.3048154437997193, + "learning_rate": 4.024392226373563e-07, + "loss": 0.8774, + "step": 9475 + }, + { + "epoch": 1.8241932767042857, + "grad_norm": 3.3255389930089536, + "learning_rate": 4.015640747959404e-07, + "loss": 0.868, + "step": 9476 + }, + { + "epoch": 1.824385783381861, + "grad_norm": 3.3005337898423606, + "learning_rate": 4.0068986004529577e-07, + "loss": 0.8345, + "step": 9477 + }, + { + "epoch": 1.8245782900594363, + "grad_norm": 3.299226665971921, + "learning_rate": 3.9981657847040776e-07, + "loss": 0.8183, + "step": 9478 + }, + { + "epoch": 1.824770796737012, + "grad_norm": 3.2109005180579313, + "learning_rate": 3.989442301561719e-07, + "loss": 0.8291, + "step": 9479 + }, + { + "epoch": 1.8249633034145871, + "grad_norm": 3.2987902377342144, + "learning_rate": 3.9807281518738916e-07, + "loss": 0.8349, + "step": 9480 + }, + { + "epoch": 1.8249633034145871, + "lm_loss": 0.7434, + "step": 9480, + "vm_loss": 0.1807 + }, + { + "epoch": 1.8249633034145871, + "lm_loss": 0.4819, + "step": 9480, + "vm_loss": 0.17 + }, + { + "epoch": 1.8249633034145871, + "lm_loss": 0.5675, + "step": 9480, + "vm_loss": 0.219 + }, + { + "epoch": 1.8249633034145871, + "lm_loss": 0.8994, + "step": 9480, + "vm_loss": 0.2045 + }, + { + "epoch": 1.8249633034145871, + "lm_loss": 0.7241, + "step": 9480, + "vm_loss": 0.1751 + }, + { + "epoch": 1.8249633034145871, + "lm_loss": 1.0228, + "step": 9480, + "vm_loss": 0.1341 + }, + { + "epoch": 1.8249633034145871, + "lm_loss": 0.8299, + "step": 9480, + "vm_loss": 0.1348 + }, + { + "epoch": 1.8249633034145871, + "lm_loss": 0.6174, + "step": 9480, + "vm_loss": 0.1183 + }, + { + "epoch": 1.8251558100921625, + "grad_norm": 3.4308905616113776, + "learning_rate": 3.972023336487729e-07, + "loss": 0.8975, + "step": 9481 + }, + { + "epoch": 1.825348316769738, + "grad_norm": 3.1579522901354156, + "learning_rate": 3.9633278562494657e-07, + "loss": 0.8092, + "step": 9482 + }, + { + "epoch": 1.8255408234473132, + "grad_norm": 3.3288312197126007, + "learning_rate": 3.9546417120044035e-07, + "loss": 0.8663, + "step": 9483 + }, + { + "epoch": 1.8257333301248888, + "grad_norm": 3.392616518180647, + "learning_rate": 3.9459649045969216e-07, + "loss": 0.8615, + "step": 9484 + }, + { + "epoch": 1.825925836802464, + "grad_norm": 3.3863379603030577, + "learning_rate": 3.9372974348705685e-07, + "loss": 0.8821, + "step": 9485 + }, + { + "epoch": 1.8261183434800394, + "grad_norm": 3.4100559828135806, + "learning_rate": 3.9286393036678914e-07, + "loss": 0.8692, + "step": 9486 + }, + { + "epoch": 1.8263108501576149, + "grad_norm": 3.0070194853357903, + "learning_rate": 3.919990511830585e-07, + "loss": 0.7841, + "step": 9487 + }, + { + "epoch": 1.8265033568351903, + "grad_norm": 3.3824976826180415, + "learning_rate": 3.911351060199431e-07, + "loss": 0.8727, + "step": 9488 + }, + { + "epoch": 1.8265033568351903, + "lm_loss": 0.6035, + "step": 9488, + "vm_loss": 0.1573 + }, + { + "epoch": 1.8265033568351903, + "lm_loss": 0.4873, + "step": 9488, + "vm_loss": 0.1776 + }, + { + "epoch": 1.8265033568351903, + "lm_loss": 0.6396, + "step": 9488, + "vm_loss": 0.1172 + }, + { + "epoch": 1.8265033568351903, + "lm_loss": 0.2392, + "step": 9488, + "vm_loss": 0.1531 + }, + { + "epoch": 1.8265033568351903, + "lm_loss": 0.647, + "step": 9488, + "vm_loss": 0.2234 + }, + { + "epoch": 1.8265033568351903, + "lm_loss": 1.0678, + "step": 9488, + "vm_loss": 0.1564 + }, + { + "epoch": 1.8265033568351903, + "lm_loss": 0.7865, + "step": 9488, + "vm_loss": 0.1808 + }, + { + "epoch": 1.8265033568351903, + "lm_loss": 0.7229, + "step": 9488, + "vm_loss": 0.1882 + }, + { + "epoch": 1.8266958635127657, + "grad_norm": 3.1854970384473345, + "learning_rate": 3.9027209496142913e-07, + "loss": 0.8205, + "step": 9489 + }, + { + "epoch": 1.826888370190341, + "grad_norm": 3.136082335568287, + "learning_rate": 3.894100180914095e-07, + "loss": 0.8302, + "step": 9490 + }, + { + "epoch": 1.8270808768679163, + "grad_norm": 3.190175453799504, + "learning_rate": 3.885488754936939e-07, + "loss": 0.8387, + "step": 9491 + }, + { + "epoch": 1.8272733835454917, + "grad_norm": 3.3768010388907235, + "learning_rate": 3.876886672519931e-07, + "loss": 0.8705, + "step": 9492 + }, + { + "epoch": 1.8274658902230672, + "grad_norm": 3.2775909281947717, + "learning_rate": 3.868293934499312e-07, + "loss": 0.8396, + "step": 9493 + }, + { + "epoch": 1.8276583969006426, + "grad_norm": 3.290913518702874, + "learning_rate": 3.859710541710393e-07, + "loss": 0.8598, + "step": 9494 + }, + { + "epoch": 1.8278509035782178, + "grad_norm": 3.315770383857467, + "learning_rate": 3.8511364949876284e-07, + "loss": 0.8449, + "step": 9495 + }, + { + "epoch": 1.8280434102557932, + "grad_norm": 3.3448790442644154, + "learning_rate": 3.842571795164485e-07, + "loss": 0.8476, + "step": 9496 + }, + { + "epoch": 1.8280434102557932, + "lm_loss": 0.6862, + "step": 9496, + "vm_loss": 0.2112 + }, + { + "epoch": 1.8280434102557932, + "lm_loss": 0.7716, + "step": 9496, + "vm_loss": 0.1551 + }, + { + "epoch": 1.8280434102557932, + "lm_loss": 0.6419, + "step": 9496, + "vm_loss": 0.1774 + }, + { + "epoch": 1.8280434102557932, + "lm_loss": 0.7296, + "step": 9496, + "vm_loss": 0.153 + }, + { + "epoch": 1.8280434102557932, + "lm_loss": 0.8192, + "step": 9496, + "vm_loss": 0.1083 + }, + { + "epoch": 1.8280434102557932, + "lm_loss": 0.5988, + "step": 9496, + "vm_loss": 0.157 + }, + { + "epoch": 1.8280434102557932, + "lm_loss": 0.3493, + "step": 9496, + "vm_loss": 0.1632 + }, + { + "epoch": 1.8280434102557932, + "lm_loss": 0.419, + "step": 9496, + "vm_loss": 0.2318 + }, + { + "epoch": 1.8282359169333686, + "grad_norm": 3.1472182266491475, + "learning_rate": 3.834016443073574e-07, + "loss": 0.798, + "step": 9497 + }, + { + "epoch": 1.828428423610944, + "grad_norm": 3.1008637888315205, + "learning_rate": 3.8254704395465856e-07, + "loss": 0.7781, + "step": 9498 + }, + { + "epoch": 1.8286209302885195, + "grad_norm": 3.4187812345455386, + "learning_rate": 3.816933785414312e-07, + "loss": 0.9372, + "step": 9499 + }, + { + "epoch": 1.8288134369660947, + "grad_norm": 3.172118024424408, + "learning_rate": 3.8084064815066104e-07, + "loss": 0.7911, + "step": 9500 + }, + { + "epoch": 1.82900594364367, + "grad_norm": 3.1471065054341243, + "learning_rate": 3.79988852865244e-07, + "loss": 0.8491, + "step": 9501 + }, + { + "epoch": 1.8291984503212455, + "grad_norm": 3.2790091927382186, + "learning_rate": 3.791379927679884e-07, + "loss": 0.8461, + "step": 9502 + }, + { + "epoch": 1.829390956998821, + "grad_norm": 3.140193001946722, + "learning_rate": 3.7828806794160456e-07, + "loss": 0.8233, + "step": 9503 + }, + { + "epoch": 1.8295834636763963, + "grad_norm": 3.286493738407895, + "learning_rate": 3.774390784687187e-07, + "loss": 0.8077, + "step": 9504 + }, + { + "epoch": 1.8295834636763963, + "lm_loss": 0.7782, + "step": 9504, + "vm_loss": 0.1788 + }, + { + "epoch": 1.8295834636763963, + "lm_loss": 0.7257, + "step": 9504, + "vm_loss": 0.1896 + }, + { + "epoch": 1.8295834636763963, + "lm_loss": 0.7783, + "step": 9504, + "vm_loss": 0.0867 + }, + { + "epoch": 1.8295834636763963, + "lm_loss": 0.6613, + "step": 9504, + "vm_loss": 0.1642 + }, + { + "epoch": 1.8295834636763963, + "lm_loss": 0.8421, + "step": 9504, + "vm_loss": 0.1889 + }, + { + "epoch": 1.8295834636763963, + "lm_loss": 0.7399, + "step": 9504, + "vm_loss": 0.1577 + }, + { + "epoch": 1.8295834636763963, + "lm_loss": 0.6607, + "step": 9504, + "vm_loss": 0.1469 + }, + { + "epoch": 1.8295834636763963, + "lm_loss": 0.7154, + "step": 9504, + "vm_loss": 0.2408 + }, + { + "epoch": 1.8297759703539715, + "grad_norm": 3.1439730386916005, + "learning_rate": 3.765910244318638e-07, + "loss": 0.8058, + "step": 9505 + }, + { + "epoch": 1.8299684770315472, + "grad_norm": 3.254878105620397, + "learning_rate": 3.7574390591348044e-07, + "loss": 0.8451, + "step": 9506 + }, + { + "epoch": 1.8301609837091224, + "grad_norm": 3.231804991046132, + "learning_rate": 3.748977229959183e-07, + "loss": 0.8573, + "step": 9507 + }, + { + "epoch": 1.8303534903866978, + "grad_norm": 3.2481493719999626, + "learning_rate": 3.7405247576144055e-07, + "loss": 0.7731, + "step": 9508 + }, + { + "epoch": 1.8305459970642732, + "grad_norm": 3.3602611014099444, + "learning_rate": 3.732081642922125e-07, + "loss": 0.8738, + "step": 9509 + }, + { + "epoch": 1.8307385037418484, + "grad_norm": 3.152238527216324, + "learning_rate": 3.723647886703141e-07, + "loss": 0.7808, + "step": 9510 + }, + { + "epoch": 1.830931010419424, + "grad_norm": 3.346070852443939, + "learning_rate": 3.7152234897773085e-07, + "loss": 0.8679, + "step": 9511 + }, + { + "epoch": 1.8311235170969993, + "grad_norm": 3.3026290344589713, + "learning_rate": 3.706808452963606e-07, + "loss": 0.8437, + "step": 9512 + }, + { + "epoch": 1.8311235170969993, + "lm_loss": 0.661, + "step": 9512, + "vm_loss": 0.2038 + }, + { + "epoch": 1.8311235170969993, + "lm_loss": 0.5485, + "step": 9512, + "vm_loss": 0.1976 + }, + { + "epoch": 1.8311235170969993, + "lm_loss": 0.7937, + "step": 9512, + "vm_loss": 0.1688 + }, + { + "epoch": 1.8311235170969993, + "lm_loss": 0.6154, + "step": 9512, + "vm_loss": 0.2184 + }, + { + "epoch": 1.8311235170969993, + "lm_loss": 0.7715, + "step": 9512, + "vm_loss": 0.1487 + }, + { + "epoch": 1.8311235170969993, + "lm_loss": 0.7354, + "step": 9512, + "vm_loss": 0.1736 + }, + { + "epoch": 1.8311235170969993, + "lm_loss": 0.5383, + "step": 9512, + "vm_loss": 0.1786 + }, + { + "epoch": 1.8311235170969993, + "lm_loss": 0.8047, + "step": 9512, + "vm_loss": 0.1381 + }, + { + "epoch": 1.8313160237745747, + "grad_norm": 3.1994216360475933, + "learning_rate": 3.6984027770800676e-07, + "loss": 0.8237, + "step": 9513 + }, + { + "epoch": 1.8315085304521501, + "grad_norm": 3.151048861314282, + "learning_rate": 3.690006462943829e-07, + "loss": 0.8214, + "step": 9514 + }, + { + "epoch": 1.8317010371297253, + "grad_norm": 3.3502839719285844, + "learning_rate": 3.6816195113711374e-07, + "loss": 0.8248, + "step": 9515 + }, + { + "epoch": 1.831893543807301, + "grad_norm": 3.4547906990984503, + "learning_rate": 3.6732419231773063e-07, + "loss": 0.8845, + "step": 9516 + }, + { + "epoch": 1.8320860504848762, + "grad_norm": 3.2973478269386427, + "learning_rate": 3.664873699176719e-07, + "loss": 0.8647, + "step": 9517 + }, + { + "epoch": 1.8322785571624516, + "grad_norm": 3.346744758591867, + "learning_rate": 3.656514840182912e-07, + "loss": 0.8477, + "step": 9518 + }, + { + "epoch": 1.832471063840027, + "grad_norm": 3.27423976087775, + "learning_rate": 3.648165347008448e-07, + "loss": 0.875, + "step": 9519 + }, + { + "epoch": 1.8326635705176022, + "grad_norm": 3.334941521111932, + "learning_rate": 3.63982522046501e-07, + "loss": 0.839, + "step": 9520 + }, + { + "epoch": 1.8326635705176022, + "lm_loss": 0.9093, + "step": 9520, + "vm_loss": 0.1798 + }, + { + "epoch": 1.8326635705176022, + "lm_loss": 1.1019, + "step": 9520, + "vm_loss": 0.2605 + }, + { + "epoch": 1.8326635705176022, + "lm_loss": 0.3618, + "step": 9520, + "vm_loss": 0.1081 + }, + { + "epoch": 1.8326635705176022, + "lm_loss": 0.4845, + "step": 9520, + "vm_loss": 0.1619 + }, + { + "epoch": 1.8326635705176022, + "lm_loss": 0.6284, + "step": 9520, + "vm_loss": 0.1425 + }, + { + "epoch": 1.8326635705176022, + "lm_loss": 0.2545, + "step": 9520, + "vm_loss": 0.1405 + }, + { + "epoch": 1.8326635705176022, + "lm_loss": 0.4794, + "step": 9520, + "vm_loss": 0.2009 + }, + { + "epoch": 1.8326635705176022, + "lm_loss": 0.4807, + "step": 9520, + "vm_loss": 0.1183 + }, + { + "epoch": 1.8328560771951778, + "grad_norm": 3.2831563354004003, + "learning_rate": 3.6314944613633606e-07, + "loss": 0.8398, + "step": 9521 + }, + { + "epoch": 1.833048583872753, + "grad_norm": 3.1745320381986555, + "learning_rate": 3.623173070513375e-07, + "loss": 0.7842, + "step": 9522 + }, + { + "epoch": 1.8332410905503285, + "grad_norm": 3.2427427519757, + "learning_rate": 3.6148610487239946e-07, + "loss": 0.8635, + "step": 9523 + }, + { + "epoch": 1.8334335972279039, + "grad_norm": 3.199753075647246, + "learning_rate": 3.6065583968032283e-07, + "loss": 0.8115, + "step": 9524 + }, + { + "epoch": 1.833626103905479, + "grad_norm": 3.2788236138981333, + "learning_rate": 3.5982651155582303e-07, + "loss": 0.8359, + "step": 9525 + }, + { + "epoch": 1.8338186105830547, + "grad_norm": 3.294500466563377, + "learning_rate": 3.58998120579519e-07, + "loss": 0.8586, + "step": 9526 + }, + { + "epoch": 1.83401111726063, + "grad_norm": 3.298813101790542, + "learning_rate": 3.58170666831944e-07, + "loss": 0.8708, + "step": 9527 + }, + { + "epoch": 1.8342036239382054, + "grad_norm": 3.1303530840565905, + "learning_rate": 3.5734415039353263e-07, + "loss": 0.8267, + "step": 9528 + }, + { + "epoch": 1.8342036239382054, + "lm_loss": 0.5389, + "step": 9528, + "vm_loss": 0.179 + }, + { + "epoch": 1.8342036239382054, + "lm_loss": 0.5756, + "step": 9528, + "vm_loss": 0.1619 + }, + { + "epoch": 1.8342036239382054, + "lm_loss": 0.7273, + "step": 9528, + "vm_loss": 0.2108 + }, + { + "epoch": 1.8342036239382054, + "lm_loss": 0.6951, + "step": 9528, + "vm_loss": 0.173 + }, + { + "epoch": 1.8342036239382054, + "lm_loss": 0.5949, + "step": 9528, + "vm_loss": 0.1886 + }, + { + "epoch": 1.8342036239382054, + "lm_loss": 0.3952, + "step": 9528, + "vm_loss": 0.1725 + }, + { + "epoch": 1.8342036239382054, + "lm_loss": 0.7017, + "step": 9528, + "vm_loss": 0.1904 + }, + { + "epoch": 1.8342036239382054, + "lm_loss": 0.5159, + "step": 9528, + "vm_loss": 0.1982 + }, + { + "epoch": 1.8343961306157808, + "grad_norm": 3.2417793252633067, + "learning_rate": 3.565185713446373e-07, + "loss": 0.8656, + "step": 9529 + }, + { + "epoch": 1.834588637293356, + "grad_norm": 3.1744660241459424, + "learning_rate": 3.556939297655115e-07, + "loss": 0.8014, + "step": 9530 + }, + { + "epoch": 1.8347811439709316, + "grad_norm": 3.1962048198810504, + "learning_rate": 3.5487022573632343e-07, + "loss": 0.8138, + "step": 9531 + }, + { + "epoch": 1.8349736506485068, + "grad_norm": 3.2059522436308283, + "learning_rate": 3.540474593371468e-07, + "loss": 0.8284, + "step": 9532 + }, + { + "epoch": 1.8351661573260822, + "grad_norm": 3.255175893765566, + "learning_rate": 3.5322563064796535e-07, + "loss": 0.8372, + "step": 9533 + }, + { + "epoch": 1.8353586640036577, + "grad_norm": 3.4787265189268854, + "learning_rate": 3.524047397486696e-07, + "loss": 0.9039, + "step": 9534 + }, + { + "epoch": 1.8355511706812329, + "grad_norm": 3.1489074143203277, + "learning_rate": 3.515847867190625e-07, + "loss": 0.8423, + "step": 9535 + }, + { + "epoch": 1.8357436773588085, + "grad_norm": 3.1705660358493244, + "learning_rate": 3.507657716388535e-07, + "loss": 0.7831, + "step": 9536 + }, + { + "epoch": 1.8357436773588085, + "lm_loss": 0.6527, + "step": 9536, + "vm_loss": 0.1519 + }, + { + "epoch": 1.8357436773588085, + "lm_loss": 0.8496, + "step": 9536, + "vm_loss": 0.1578 + }, + { + "epoch": 1.8357436773588085, + "lm_loss": 0.572, + "step": 9536, + "vm_loss": 0.1426 + }, + { + "epoch": 1.8357436773588085, + "lm_loss": 0.6331, + "step": 9536, + "vm_loss": 0.1792 + }, + { + "epoch": 1.8357436773588085, + "lm_loss": 0.4811, + "step": 9536, + "vm_loss": 0.1514 + }, + { + "epoch": 1.8357436773588085, + "lm_loss": 0.9438, + "step": 9536, + "vm_loss": 0.1607 + }, + { + "epoch": 1.8357436773588085, + "lm_loss": 0.8788, + "step": 9536, + "vm_loss": 0.2183 + }, + { + "epoch": 1.8357436773588085, + "lm_loss": 0.9806, + "step": 9536, + "vm_loss": 0.1775 + }, + { + "epoch": 1.8359361840363837, + "grad_norm": 3.299423370254257, + "learning_rate": 3.4994769458766233e-07, + "loss": 0.8541, + "step": 9537 + }, + { + "epoch": 1.8361286907139591, + "grad_norm": 3.15513076282116, + "learning_rate": 3.491305556450131e-07, + "loss": 0.8091, + "step": 9538 + }, + { + "epoch": 1.8363211973915345, + "grad_norm": 3.4448679822407615, + "learning_rate": 3.483143548903467e-07, + "loss": 0.8889, + "step": 9539 + }, + { + "epoch": 1.8365137040691097, + "grad_norm": 3.2907299642195875, + "learning_rate": 3.474990924030042e-07, + "loss": 0.8491, + "step": 9540 + }, + { + "epoch": 1.8367062107466854, + "grad_norm": 3.113449439984556, + "learning_rate": 3.4668476826224205e-07, + "loss": 0.7998, + "step": 9541 + }, + { + "epoch": 1.8368987174242606, + "grad_norm": 3.3597768355411053, + "learning_rate": 3.458713825472226e-07, + "loss": 0.8691, + "step": 9542 + }, + { + "epoch": 1.837091224101836, + "grad_norm": 3.404091040106368, + "learning_rate": 3.4505893533701704e-07, + "loss": 0.8949, + "step": 9543 + }, + { + "epoch": 1.8372837307794114, + "grad_norm": 3.3427772751432934, + "learning_rate": 3.442474267106077e-07, + "loss": 0.8663, + "step": 9544 + }, + { + "epoch": 1.8372837307794114, + "lm_loss": 0.783, + "step": 9544, + "vm_loss": 0.1859 + }, + { + "epoch": 1.8372837307794114, + "lm_loss": 0.5112, + "step": 9544, + "vm_loss": 0.1801 + }, + { + "epoch": 1.8372837307794114, + "lm_loss": 0.7183, + "step": 9544, + "vm_loss": 0.1441 + }, + { + "epoch": 1.8372837307794114, + "lm_loss": 0.5819, + "step": 9544, + "vm_loss": 0.1012 + }, + { + "epoch": 1.8372837307794114, + "lm_loss": 0.3258, + "step": 9544, + "vm_loss": 0.1699 + }, + { + "epoch": 1.8372837307794114, + "lm_loss": 0.6095, + "step": 9544, + "vm_loss": 0.1288 + }, + { + "epoch": 1.8372837307794114, + "lm_loss": 0.5312, + "step": 9544, + "vm_loss": 0.1757 + }, + { + "epoch": 1.8372837307794114, + "lm_loss": 0.5948, + "step": 9544, + "vm_loss": 0.1727 + }, + { + "epoch": 1.8374762374569866, + "grad_norm": 3.3043947929699566, + "learning_rate": 3.434368567468782e-07, + "loss": 0.8461, + "step": 9545 + }, + { + "epoch": 1.8376687441345623, + "grad_norm": 3.4320712581029627, + "learning_rate": 3.4262722552463213e-07, + "loss": 0.8784, + "step": 9546 + }, + { + "epoch": 1.8378612508121375, + "grad_norm": 3.3807749870866783, + "learning_rate": 3.418185331225732e-07, + "loss": 0.8652, + "step": 9547 + }, + { + "epoch": 1.838053757489713, + "grad_norm": 3.221563034623357, + "learning_rate": 3.4101077961931627e-07, + "loss": 0.8169, + "step": 9548 + }, + { + "epoch": 1.8382462641672883, + "grad_norm": 3.255058121465018, + "learning_rate": 3.4020396509338636e-07, + "loss": 0.8196, + "step": 9549 + }, + { + "epoch": 1.8384387708448637, + "grad_norm": 3.1300023373373174, + "learning_rate": 3.393980896232174e-07, + "loss": 0.8137, + "step": 9550 + }, + { + "epoch": 1.8386312775224392, + "grad_norm": 3.219771332398671, + "learning_rate": 3.385931532871478e-07, + "loss": 0.8425, + "step": 9551 + }, + { + "epoch": 1.8388237842000144, + "grad_norm": 3.3004988231847174, + "learning_rate": 3.3778915616342947e-07, + "loss": 0.8592, + "step": 9552 + }, + { + "epoch": 1.8388237842000144, + "lm_loss": 0.3296, + "step": 9552, + "vm_loss": 0.1052 + }, + { + "epoch": 1.8388237842000144, + "lm_loss": 0.3474, + "step": 9552, + "vm_loss": 0.1604 + }, + { + "epoch": 1.8388237842000144, + "lm_loss": 0.6391, + "step": 9552, + "vm_loss": 0.1583 + }, + { + "epoch": 1.8388237842000144, + "lm_loss": 0.7476, + "step": 9552, + "vm_loss": 0.1641 + }, + { + "epoch": 1.8388237842000144, + "lm_loss": 1.0186, + "step": 9552, + "vm_loss": 0.1946 + }, + { + "epoch": 1.8388237842000144, + "lm_loss": 0.6157, + "step": 9552, + "vm_loss": 0.1618 + }, + { + "epoch": 1.8388237842000144, + "lm_loss": 0.8327, + "step": 9552, + "vm_loss": 0.1796 + }, + { + "epoch": 1.8388237842000144, + "lm_loss": 0.894, + "step": 9552, + "vm_loss": 0.1777 + }, + { + "epoch": 1.8390162908775898, + "grad_norm": 3.0862680791692325, + "learning_rate": 3.369860983302209e-07, + "loss": 0.809, + "step": 9553 + }, + { + "epoch": 1.8392087975551652, + "grad_norm": 3.3058611768910366, + "learning_rate": 3.361839798655897e-07, + "loss": 0.8709, + "step": 9554 + }, + { + "epoch": 1.8394013042327406, + "grad_norm": 3.4039592810382175, + "learning_rate": 3.353828008475113e-07, + "loss": 0.8918, + "step": 9555 + }, + { + "epoch": 1.839593810910316, + "grad_norm": 3.4500258485878517, + "learning_rate": 3.3458256135387225e-07, + "loss": 0.8848, + "step": 9556 + }, + { + "epoch": 1.8397863175878912, + "grad_norm": 3.150119150357368, + "learning_rate": 3.337832614624648e-07, + "loss": 0.8514, + "step": 9557 + }, + { + "epoch": 1.8399788242654667, + "grad_norm": 3.269558438969154, + "learning_rate": 3.3298490125099116e-07, + "loss": 0.8749, + "step": 9558 + }, + { + "epoch": 1.840171330943042, + "grad_norm": 3.1418983949093544, + "learning_rate": 3.3218748079706153e-07, + "loss": 0.8228, + "step": 9559 + }, + { + "epoch": 1.8403638376206175, + "grad_norm": 3.2807713960364704, + "learning_rate": 3.3139100017819835e-07, + "loss": 0.8698, + "step": 9560 + }, + { + "epoch": 1.8403638376206175, + "lm_loss": 0.3522, + "step": 9560, + "vm_loss": 0.1473 + }, + { + "epoch": 1.8403638376206175, + "lm_loss": 0.6446, + "step": 9560, + "vm_loss": 0.1203 + }, + { + "epoch": 1.8403638376206175, + "lm_loss": 0.5976, + "step": 9560, + "vm_loss": 0.1746 + }, + { + "epoch": 1.8403638376206175, + "lm_loss": 0.6688, + "step": 9560, + "vm_loss": 0.127 + }, + { + "epoch": 1.8403638376206175, + "lm_loss": 0.7808, + "step": 9560, + "vm_loss": 0.1561 + }, + { + "epoch": 1.8403638376206175, + "lm_loss": 0.587, + "step": 9560, + "vm_loss": 0.1482 + }, + { + "epoch": 1.8403638376206175, + "lm_loss": 0.3409, + "step": 9560, + "vm_loss": 0.1433 + }, + { + "epoch": 1.8403638376206175, + "lm_loss": 0.6537, + "step": 9560, + "vm_loss": 0.1408 + }, + { + "epoch": 1.840556344298193, + "grad_norm": 3.054919236659913, + "learning_rate": 3.3059545947182636e-07, + "loss": 0.7733, + "step": 9561 + }, + { + "epoch": 1.8407488509757681, + "grad_norm": 3.0885810012828063, + "learning_rate": 3.2980085875528254e-07, + "loss": 0.798, + "step": 9562 + }, + { + "epoch": 1.8409413576533435, + "grad_norm": 3.29829414893802, + "learning_rate": 3.2900719810581406e-07, + "loss": 0.8709, + "step": 9563 + }, + { + "epoch": 1.841133864330919, + "grad_norm": 3.135548771166346, + "learning_rate": 3.2821447760057467e-07, + "loss": 0.8009, + "step": 9564 + }, + { + "epoch": 1.8413263710084944, + "grad_norm": 3.217343032000977, + "learning_rate": 3.2742269731662504e-07, + "loss": 0.8472, + "step": 9565 + }, + { + "epoch": 1.8415188776860698, + "grad_norm": 3.317490777288784, + "learning_rate": 3.2663185733093796e-07, + "loss": 0.8525, + "step": 9566 + }, + { + "epoch": 1.841711384363645, + "grad_norm": 3.3714805204285785, + "learning_rate": 3.258419577203942e-07, + "loss": 0.8191, + "step": 9567 + }, + { + "epoch": 1.8419038910412207, + "grad_norm": 3.2876543754563516, + "learning_rate": 3.250529985617801e-07, + "loss": 0.8525, + "step": 9568 + }, + { + "epoch": 1.8419038910412207, + "lm_loss": 0.396, + "step": 9568, + "vm_loss": 0.1422 + }, + { + "epoch": 1.8419038910412207, + "lm_loss": 0.4368, + "step": 9568, + "vm_loss": 0.2023 + }, + { + "epoch": 1.8419038910412207, + "lm_loss": 0.7416, + "step": 9568, + "vm_loss": 0.2128 + }, + { + "epoch": 1.8419038910412207, + "lm_loss": 0.4, + "step": 9568, + "vm_loss": 0.1088 + }, + { + "epoch": 1.8419038910412207, + "lm_loss": 0.6, + "step": 9568, + "vm_loss": 0.1307 + }, + { + "epoch": 1.8419038910412207, + "lm_loss": 0.7134, + "step": 9568, + "vm_loss": 0.2191 + }, + { + "epoch": 1.8419038910412207, + "lm_loss": 0.4495, + "step": 9568, + "vm_loss": 0.1208 + }, + { + "epoch": 1.8419038910412207, + "lm_loss": 0.659, + "step": 9568, + "vm_loss": 0.1336 + }, + { + "epoch": 1.8420963977187959, + "grad_norm": 3.161443036247088, + "learning_rate": 3.2426497993179207e-07, + "loss": 0.8085, + "step": 9569 + }, + { + "epoch": 1.8422889043963713, + "grad_norm": 3.49922044050319, + "learning_rate": 3.2347790190703776e-07, + "loss": 0.905, + "step": 9570 + }, + { + "epoch": 1.8424814110739467, + "grad_norm": 3.115539250395291, + "learning_rate": 3.226917645640315e-07, + "loss": 0.824, + "step": 9571 + }, + { + "epoch": 1.842673917751522, + "grad_norm": 3.222247052149268, + "learning_rate": 3.2190656797919215e-07, + "loss": 0.8278, + "step": 9572 + }, + { + "epoch": 1.8428664244290975, + "grad_norm": 3.2205640216521836, + "learning_rate": 3.2112231222885536e-07, + "loss": 0.8168, + "step": 9573 + }, + { + "epoch": 1.8430589311066727, + "grad_norm": 3.2583617883956872, + "learning_rate": 3.203389973892579e-07, + "loss": 0.837, + "step": 9574 + }, + { + "epoch": 1.8432514377842482, + "grad_norm": 3.2637800564939297, + "learning_rate": 3.195566235365499e-07, + "loss": 0.8551, + "step": 9575 + }, + { + "epoch": 1.8434439444618236, + "grad_norm": 3.1884264404270524, + "learning_rate": 3.187751907467851e-07, + "loss": 0.843, + "step": 9576 + }, + { + "epoch": 1.8434439444618236, + "lm_loss": 0.9084, + "step": 9576, + "vm_loss": 0.1978 + }, + { + "epoch": 1.8434439444618236, + "lm_loss": 0.3514, + "step": 9576, + "vm_loss": 0.1841 + }, + { + "epoch": 1.8434439444618236, + "lm_loss": 0.6848, + "step": 9576, + "vm_loss": 0.1189 + }, + { + "epoch": 1.8434439444618236, + "lm_loss": 0.3035, + "step": 9576, + "vm_loss": 0.1619 + }, + { + "epoch": 1.8434439444618236, + "lm_loss": 0.7604, + "step": 9576, + "vm_loss": 0.1589 + }, + { + "epoch": 1.8434439444618236, + "lm_loss": 0.9533, + "step": 9576, + "vm_loss": 0.158 + }, + { + "epoch": 1.8434439444618236, + "lm_loss": 0.4598, + "step": 9576, + "vm_loss": 0.1808 + }, + { + "epoch": 1.8434439444618236, + "lm_loss": 0.4351, + "step": 9576, + "vm_loss": 0.1999 + }, + { + "epoch": 1.8436364511393988, + "grad_norm": 3.3462576067718404, + "learning_rate": 3.179946990959326e-07, + "loss": 0.8614, + "step": 9577 + }, + { + "epoch": 1.8438289578169744, + "grad_norm": 3.3052975459465146, + "learning_rate": 3.172151486598629e-07, + "loss": 0.8577, + "step": 9578 + }, + { + "epoch": 1.8440214644945496, + "grad_norm": 3.1065690570671673, + "learning_rate": 3.1643653951436095e-07, + "loss": 0.8116, + "step": 9579 + }, + { + "epoch": 1.844213971172125, + "grad_norm": 3.1430785507445966, + "learning_rate": 3.15658871735115e-07, + "loss": 0.794, + "step": 9580 + }, + { + "epoch": 1.8444064778497005, + "grad_norm": 3.0770032231485067, + "learning_rate": 3.14882145397728e-07, + "loss": 0.8068, + "step": 9581 + }, + { + "epoch": 1.8445989845272757, + "grad_norm": 3.4076122832152214, + "learning_rate": 3.141063605777028e-07, + "loss": 0.8473, + "step": 9582 + }, + { + "epoch": 1.8447914912048513, + "grad_norm": 3.1578171137205207, + "learning_rate": 3.133315173504592e-07, + "loss": 0.8162, + "step": 9583 + }, + { + "epoch": 1.8449839978824265, + "grad_norm": 3.250535141909998, + "learning_rate": 3.125576157913213e-07, + "loss": 0.848, + "step": 9584 + }, + { + "epoch": 1.8449839978824265, + "lm_loss": 0.7243, + "step": 9584, + "vm_loss": 0.2082 + }, + { + "epoch": 1.8449839978824265, + "lm_loss": 0.8587, + "step": 9584, + "vm_loss": 0.1866 + }, + { + "epoch": 1.8449839978824265, + "lm_loss": 0.4455, + "step": 9584, + "vm_loss": 0.1651 + }, + { + "epoch": 1.8449839978824265, + "lm_loss": 0.7672, + "step": 9584, + "vm_loss": 0.1518 + }, + { + "epoch": 1.8449839978824265, + "lm_loss": 0.9324, + "step": 9584, + "vm_loss": 0.2242 + }, + { + "epoch": 1.8449839978824265, + "lm_loss": 0.9937, + "step": 9584, + "vm_loss": 0.181 + }, + { + "epoch": 1.8449839978824265, + "lm_loss": 0.8959, + "step": 9584, + "vm_loss": 0.1673 + }, + { + "epoch": 1.8449839978824265, + "lm_loss": 0.7313, + "step": 9584, + "vm_loss": 0.1702 + }, + { + "epoch": 1.845176504560002, + "grad_norm": 3.3652124468615447, + "learning_rate": 3.117846559755211e-07, + "loss": 0.8722, + "step": 9585 + }, + { + "epoch": 1.8453690112375774, + "grad_norm": 3.2385447793080644, + "learning_rate": 3.110126379781997e-07, + "loss": 0.8278, + "step": 9586 + }, + { + "epoch": 1.8455615179151525, + "grad_norm": 3.1206017394097945, + "learning_rate": 3.102415618744092e-07, + "loss": 0.7581, + "step": 9587 + }, + { + "epoch": 1.8457540245927282, + "grad_norm": 3.279019781279383, + "learning_rate": 3.0947142773910863e-07, + "loss": 0.8382, + "step": 9588 + }, + { + "epoch": 1.8459465312703034, + "grad_norm": 3.376420562140214, + "learning_rate": 3.087022356471603e-07, + "loss": 0.8692, + "step": 9589 + }, + { + "epoch": 1.8461390379478788, + "grad_norm": 3.2286511414943875, + "learning_rate": 3.0793398567334544e-07, + "loss": 0.8321, + "step": 9590 + }, + { + "epoch": 1.8463315446254542, + "grad_norm": 3.173387587854778, + "learning_rate": 3.071666778923432e-07, + "loss": 0.8377, + "step": 9591 + }, + { + "epoch": 1.8465240513030294, + "grad_norm": 3.3325969290081763, + "learning_rate": 3.064003123787496e-07, + "loss": 0.8512, + "step": 9592 + }, + { + "epoch": 1.8465240513030294, + "lm_loss": 0.6598, + "step": 9592, + "vm_loss": 0.1752 + }, + { + "epoch": 1.8465240513030294, + "lm_loss": 0.7891, + "step": 9592, + "vm_loss": 0.1114 + }, + { + "epoch": 1.8465240513030294, + "lm_loss": 0.512, + "step": 9592, + "vm_loss": 0.2124 + }, + { + "epoch": 1.8465240513030294, + "lm_loss": 1.1378, + "step": 9592, + "vm_loss": 0.2229 + }, + { + "epoch": 1.8465240513030294, + "lm_loss": 0.3821, + "step": 9592, + "vm_loss": 0.1536 + }, + { + "epoch": 1.8465240513030294, + "lm_loss": 0.3317, + "step": 9592, + "vm_loss": 0.1613 + }, + { + "epoch": 1.8465240513030294, + "lm_loss": 0.9321, + "step": 9592, + "vm_loss": 0.158 + }, + { + "epoch": 1.8465240513030294, + "lm_loss": 0.5165, + "step": 9592, + "vm_loss": 0.1369 + }, + { + "epoch": 1.846716557980605, + "grad_norm": 3.2165937163238265, + "learning_rate": 3.056348892070593e-07, + "loss": 0.8403, + "step": 9593 + }, + { + "epoch": 1.8469090646581803, + "grad_norm": 3.2142485537295427, + "learning_rate": 3.0487040845168847e-07, + "loss": 0.8321, + "step": 9594 + }, + { + "epoch": 1.8471015713357557, + "grad_norm": 3.2185217155175603, + "learning_rate": 3.041068701869487e-07, + "loss": 0.8282, + "step": 9595 + }, + { + "epoch": 1.8472940780133311, + "grad_norm": 3.2449783636220006, + "learning_rate": 3.033442744870685e-07, + "loss": 0.8567, + "step": 9596 + }, + { + "epoch": 1.8474865846909063, + "grad_norm": 3.35156198992, + "learning_rate": 3.025826214261807e-07, + "loss": 0.8838, + "step": 9597 + }, + { + "epoch": 1.847679091368482, + "grad_norm": 3.35834157574361, + "learning_rate": 3.0182191107833047e-07, + "loss": 0.8278, + "step": 9598 + }, + { + "epoch": 1.8478715980460572, + "grad_norm": 3.312555654845415, + "learning_rate": 3.010621435174632e-07, + "loss": 0.866, + "step": 9599 + }, + { + "epoch": 1.8480641047236326, + "grad_norm": 3.0878744038169454, + "learning_rate": 3.0030331881744315e-07, + "loss": 0.7785, + "step": 9600 + }, + { + "epoch": 1.8480641047236326, + "lm_loss": 0.6234, + "step": 9600, + "vm_loss": 0.1426 + }, + { + "epoch": 1.8480641047236326, + "lm_loss": 0.4911, + "step": 9600, + "vm_loss": 0.1559 + }, + { + "epoch": 1.8480641047236326, + "lm_loss": 0.6624, + "step": 9600, + "vm_loss": 0.1593 + }, + { + "epoch": 1.8480641047236326, + "lm_loss": 0.9975, + "step": 9600, + "vm_loss": 0.1359 + }, + { + "epoch": 1.8480641047236326, + "lm_loss": 0.4601, + "step": 9600, + "vm_loss": 0.1409 + }, + { + "epoch": 1.8480641047236326, + "lm_loss": 0.5111, + "step": 9600, + "vm_loss": 0.1343 + }, + { + "epoch": 1.8480641047236326, + "lm_loss": 0.71, + "step": 9600, + "vm_loss": 0.107 + }, + { + "epoch": 1.8480641047236326, + "lm_loss": 0.9088, + "step": 9600, + "vm_loss": 0.153 + }, + { + "epoch": 1.848256611401208, + "grad_norm": 3.1479022702775414, + "learning_rate": 2.9954543705203564e-07, + "loss": 0.8215, + "step": 9601 + }, + { + "epoch": 1.8484491180787832, + "grad_norm": 3.1686049240452983, + "learning_rate": 2.987884982949163e-07, + "loss": 0.824, + "step": 9602 + }, + { + "epoch": 1.8486416247563588, + "grad_norm": 3.309996055049136, + "learning_rate": 2.9803250261966956e-07, + "loss": 0.8661, + "step": 9603 + }, + { + "epoch": 1.848834131433934, + "grad_norm": 3.174100501190331, + "learning_rate": 2.9727745009978994e-07, + "loss": 0.8088, + "step": 9604 + }, + { + "epoch": 1.8490266381115095, + "grad_norm": 3.3527667388312383, + "learning_rate": 2.9652334080867427e-07, + "loss": 0.8395, + "step": 9605 + }, + { + "epoch": 1.849219144789085, + "grad_norm": 3.199608288721758, + "learning_rate": 2.95770174819634e-07, + "loss": 0.8338, + "step": 9606 + }, + { + "epoch": 1.84941165146666, + "grad_norm": 3.377639433104591, + "learning_rate": 2.95017952205886e-07, + "loss": 0.8332, + "step": 9607 + }, + { + "epoch": 1.8496041581442357, + "grad_norm": 3.2989844050641555, + "learning_rate": 2.9426667304055525e-07, + "loss": 0.8392, + "step": 9608 + }, + { + "epoch": 1.8496041581442357, + "lm_loss": 0.506, + "step": 9608, + "vm_loss": 0.1554 + }, + { + "epoch": 1.8496041581442357, + "lm_loss": 0.4127, + "step": 9608, + "vm_loss": 0.1244 + }, + { + "epoch": 1.8496041581442357, + "lm_loss": 0.8654, + "step": 9608, + "vm_loss": 0.1782 + }, + { + "epoch": 1.8496041581442357, + "lm_loss": 0.3977, + "step": 9608, + "vm_loss": 0.129 + }, + { + "epoch": 1.8496041581442357, + "lm_loss": 0.4094, + "step": 9608, + "vm_loss": 0.1488 + }, + { + "epoch": 1.8496041581442357, + "lm_loss": 0.7141, + "step": 9608, + "vm_loss": 0.1089 + }, + { + "epoch": 1.8496041581442357, + "lm_loss": 0.8601, + "step": 9608, + "vm_loss": 0.1775 + }, + { + "epoch": 1.8496041581442357, + "lm_loss": 0.6407, + "step": 9608, + "vm_loss": 0.1405 + }, + { + "epoch": 1.849796664821811, + "grad_norm": 3.2639860090239, + "learning_rate": 2.935163373966787e-07, + "loss": 0.7994, + "step": 9609 + }, + { + "epoch": 1.8499891714993864, + "grad_norm": 3.160041798135915, + "learning_rate": 2.9276694534719375e-07, + "loss": 0.7838, + "step": 9610 + }, + { + "epoch": 1.8501816781769618, + "grad_norm": 3.2653342219607953, + "learning_rate": 2.920184969649553e-07, + "loss": 0.8396, + "step": 9611 + }, + { + "epoch": 1.8503741848545372, + "grad_norm": 3.2780055559578583, + "learning_rate": 2.912709923227197e-07, + "loss": 0.8416, + "step": 9612 + }, + { + "epoch": 1.8505666915321126, + "grad_norm": 3.2591498376637227, + "learning_rate": 2.9052443149315435e-07, + "loss": 0.818, + "step": 9613 + }, + { + "epoch": 1.8507591982096878, + "grad_norm": 3.235833185154272, + "learning_rate": 2.897788145488345e-07, + "loss": 0.8427, + "step": 9614 + }, + { + "epoch": 1.8509517048872632, + "grad_norm": 3.215842559161073, + "learning_rate": 2.890341415622444e-07, + "loss": 0.822, + "step": 9615 + }, + { + "epoch": 1.8511442115648387, + "grad_norm": 3.3361864719292975, + "learning_rate": 2.8829041260577394e-07, + "loss": 0.8582, + "step": 9616 + }, + { + "epoch": 1.8511442115648387, + "lm_loss": 0.5724, + "step": 9616, + "vm_loss": 0.1128 + }, + { + "epoch": 1.8511442115648387, + "lm_loss": 0.5626, + "step": 9616, + "vm_loss": 0.148 + }, + { + "epoch": 1.8511442115648387, + "lm_loss": 0.5549, + "step": 9616, + "vm_loss": 0.1576 + }, + { + "epoch": 1.8511442115648387, + "lm_loss": 0.7712, + "step": 9616, + "vm_loss": 0.1754 + }, + { + "epoch": 1.8511442115648387, + "lm_loss": 0.7828, + "step": 9616, + "vm_loss": 0.1441 + }, + { + "epoch": 1.8511442115648387, + "lm_loss": 0.3769, + "step": 9616, + "vm_loss": 0.1519 + }, + { + "epoch": 1.8511442115648387, + "lm_loss": 0.9738, + "step": 9616, + "vm_loss": 0.1705 + }, + { + "epoch": 1.8511442115648387, + "lm_loss": 0.5852, + "step": 9616, + "vm_loss": 0.2192 + }, + { + "epoch": 1.851336718242414, + "grad_norm": 3.38528162361654, + "learning_rate": 2.875476277517264e-07, + "loss": 0.8457, + "step": 9617 + }, + { + "epoch": 1.8515292249199895, + "grad_norm": 3.1992594617820678, + "learning_rate": 2.868057870723073e-07, + "loss": 0.8045, + "step": 9618 + }, + { + "epoch": 1.8517217315975647, + "grad_norm": 3.248271856675651, + "learning_rate": 2.860648906396346e-07, + "loss": 0.8501, + "step": 9619 + }, + { + "epoch": 1.8519142382751401, + "grad_norm": 3.2620568291910153, + "learning_rate": 2.853249385257295e-07, + "loss": 0.8175, + "step": 9620 + }, + { + "epoch": 1.8521067449527155, + "grad_norm": 3.393074803490776, + "learning_rate": 2.8458593080253004e-07, + "loss": 0.859, + "step": 9621 + }, + { + "epoch": 1.852299251630291, + "grad_norm": 3.2169530497221785, + "learning_rate": 2.838478675418732e-07, + "loss": 0.8264, + "step": 9622 + }, + { + "epoch": 1.8524917583078664, + "grad_norm": 3.372702668677499, + "learning_rate": 2.8311074881550936e-07, + "loss": 0.8654, + "step": 9623 + }, + { + "epoch": 1.8526842649854416, + "grad_norm": 3.3653128589035926, + "learning_rate": 2.8237457469509676e-07, + "loss": 0.8658, + "step": 9624 + }, + { + "epoch": 1.8526842649854416, + "lm_loss": 0.6055, + "step": 9624, + "vm_loss": 0.1274 + }, + { + "epoch": 1.8526842649854416, + "lm_loss": 0.524, + "step": 9624, + "vm_loss": 0.1343 + }, + { + "epoch": 1.8526842649854416, + "lm_loss": 0.4891, + "step": 9624, + "vm_loss": 0.1606 + }, + { + "epoch": 1.8526842649854416, + "lm_loss": 0.6838, + "step": 9624, + "vm_loss": 0.106 + }, + { + "epoch": 1.8526842649854416, + "lm_loss": 0.8178, + "step": 9624, + "vm_loss": 0.1568 + }, + { + "epoch": 1.8526842649854416, + "lm_loss": 0.5719, + "step": 9624, + "vm_loss": 0.1551 + }, + { + "epoch": 1.8526842649854416, + "lm_loss": 0.6926, + "step": 9624, + "vm_loss": 0.147 + }, + { + "epoch": 1.8526842649854416, + "lm_loss": 1.0273, + "step": 9624, + "vm_loss": 0.1413 + }, + { + "epoch": 1.852876771663017, + "grad_norm": 3.3638691215876744, + "learning_rate": 2.816393452521993e-07, + "loss": 0.8338, + "step": 9625 + }, + { + "epoch": 1.8530692783405924, + "grad_norm": 3.248971959179298, + "learning_rate": 2.8090506055829304e-07, + "loss": 0.8238, + "step": 9626 + }, + { + "epoch": 1.8532617850181679, + "grad_norm": 3.1110059934914163, + "learning_rate": 2.8017172068475763e-07, + "loss": 0.7994, + "step": 9627 + }, + { + "epoch": 1.8534542916957433, + "grad_norm": 3.2279332306566166, + "learning_rate": 2.794393257028838e-07, + "loss": 0.7852, + "step": 9628 + }, + { + "epoch": 1.8536467983733185, + "grad_norm": 3.2673131854492796, + "learning_rate": 2.7870787568387016e-07, + "loss": 0.8178, + "step": 9629 + }, + { + "epoch": 1.8538393050508941, + "grad_norm": 3.436545458566446, + "learning_rate": 2.7797737069882203e-07, + "loss": 0.9113, + "step": 9630 + }, + { + "epoch": 1.8540318117284693, + "grad_norm": 3.262198868037698, + "learning_rate": 2.772478108187548e-07, + "loss": 0.8269, + "step": 9631 + }, + { + "epoch": 1.8542243184060447, + "grad_norm": 3.2128607171330117, + "learning_rate": 2.765191961145908e-07, + "loss": 0.8536, + "step": 9632 + }, + { + "epoch": 1.8542243184060447, + "lm_loss": 0.9194, + "step": 9632, + "vm_loss": 0.2075 + }, + { + "epoch": 1.8542243184060447, + "lm_loss": 0.9703, + "step": 9632, + "vm_loss": 0.1937 + }, + { + "epoch": 1.8542243184060447, + "lm_loss": 1.2659, + "step": 9632, + "vm_loss": 0.1413 + }, + { + "epoch": 1.8542243184060447, + "lm_loss": 1.0282, + "step": 9632, + "vm_loss": 0.1692 + }, + { + "epoch": 1.8542243184060447, + "lm_loss": 0.6017, + "step": 9632, + "vm_loss": 0.1709 + }, + { + "epoch": 1.8542243184060447, + "lm_loss": 0.4856, + "step": 9632, + "vm_loss": 0.1676 + }, + { + "epoch": 1.8542243184060447, + "lm_loss": 0.6814, + "step": 9632, + "vm_loss": 0.124 + }, + { + "epoch": 1.8542243184060447, + "lm_loss": 0.79, + "step": 9632, + "vm_loss": 0.1867 + }, + { + "epoch": 1.8544168250836202, + "grad_norm": 3.386257287798221, + "learning_rate": 2.757915266571598e-07, + "loss": 0.8797, + "step": 9633 + }, + { + "epoch": 1.8546093317611954, + "grad_norm": 3.2129110113337815, + "learning_rate": 2.750648025172031e-07, + "loss": 0.8385, + "step": 9634 + }, + { + "epoch": 1.854801838438771, + "grad_norm": 3.0132512820000197, + "learning_rate": 2.7433902376536426e-07, + "loss": 0.7625, + "step": 9635 + }, + { + "epoch": 1.8549943451163462, + "grad_norm": 3.243941489796722, + "learning_rate": 2.7361419047220115e-07, + "loss": 0.8228, + "step": 9636 + }, + { + "epoch": 1.8551868517939216, + "grad_norm": 3.266034131874567, + "learning_rate": 2.7289030270817305e-07, + "loss": 0.8425, + "step": 9637 + }, + { + "epoch": 1.855379358471497, + "grad_norm": 3.244165053334818, + "learning_rate": 2.72167360543657e-07, + "loss": 0.8207, + "step": 9638 + }, + { + "epoch": 1.8555718651490722, + "grad_norm": 3.41014535113565, + "learning_rate": 2.714453640489267e-07, + "loss": 0.8575, + "step": 9639 + }, + { + "epoch": 1.8557643718266479, + "grad_norm": 3.262554039166125, + "learning_rate": 2.707243132941717e-07, + "loss": 0.81, + "step": 9640 + }, + { + "epoch": 1.8557643718266479, + "lm_loss": 1.0262, + "step": 9640, + "vm_loss": 0.1083 + }, + { + "epoch": 1.8557643718266479, + "lm_loss": 0.7494, + "step": 9640, + "vm_loss": 0.1946 + }, + { + "epoch": 1.8557643718266479, + "lm_loss": 0.4635, + "step": 9640, + "vm_loss": 0.1707 + }, + { + "epoch": 1.8557643718266479, + "lm_loss": 0.5387, + "step": 9640, + "vm_loss": 0.1429 + }, + { + "epoch": 1.8557643718266479, + "lm_loss": 0.8213, + "step": 9640, + "vm_loss": 0.1249 + }, + { + "epoch": 1.8557643718266479, + "lm_loss": 0.6055, + "step": 9640, + "vm_loss": 0.1259 + }, + { + "epoch": 1.8557643718266479, + "lm_loss": 0.4722, + "step": 9640, + "vm_loss": 0.169 + }, + { + "epoch": 1.8557643718266479, + "lm_loss": 0.7171, + "step": 9640, + "vm_loss": 0.144 + }, + { + "epoch": 1.855956878504223, + "grad_norm": 3.4041877919058945, + "learning_rate": 2.7000420834948693e-07, + "loss": 0.8514, + "step": 9641 + }, + { + "epoch": 1.8561493851817985, + "grad_norm": 3.330376841745715, + "learning_rate": 2.6928504928487755e-07, + "loss": 0.8356, + "step": 9642 + }, + { + "epoch": 1.856341891859374, + "grad_norm": 3.448141851539447, + "learning_rate": 2.68566836170252e-07, + "loss": 0.9203, + "step": 9643 + }, + { + "epoch": 1.8565343985369491, + "grad_norm": 3.2778757923510153, + "learning_rate": 2.6784956907543234e-07, + "loss": 0.8931, + "step": 9644 + }, + { + "epoch": 1.8567269052145248, + "grad_norm": 3.2501337223650655, + "learning_rate": 2.67133248070145e-07, + "loss": 0.8468, + "step": 9645 + }, + { + "epoch": 1.8569194118921, + "grad_norm": 3.2812044486300165, + "learning_rate": 2.664178732240252e-07, + "loss": 0.8609, + "step": 9646 + }, + { + "epoch": 1.8571119185696754, + "grad_norm": 3.144659049224215, + "learning_rate": 2.6570344460661647e-07, + "loss": 0.8019, + "step": 9647 + }, + { + "epoch": 1.8573044252472508, + "grad_norm": 3.2591443051106226, + "learning_rate": 2.6498996228737193e-07, + "loss": 0.8526, + "step": 9648 + }, + { + "epoch": 1.8573044252472508, + "lm_loss": 0.7227, + "step": 9648, + "vm_loss": 0.1435 + }, + { + "epoch": 1.8573044252472508, + "lm_loss": 0.9284, + "step": 9648, + "vm_loss": 0.1549 + }, + { + "epoch": 1.8573044252472508, + "lm_loss": 0.5626, + "step": 9648, + "vm_loss": 0.1453 + }, + { + "epoch": 1.8573044252472508, + "lm_loss": 0.5065, + "step": 9648, + "vm_loss": 0.1501 + }, + { + "epoch": 1.8573044252472508, + "lm_loss": 0.7692, + "step": 9648, + "vm_loss": 0.1545 + }, + { + "epoch": 1.8573044252472508, + "lm_loss": 0.7949, + "step": 9648, + "vm_loss": 0.1645 + }, + { + "epoch": 1.8573044252472508, + "lm_loss": 0.3661, + "step": 9648, + "vm_loss": 0.1142 + }, + { + "epoch": 1.8573044252472508, + "lm_loss": 0.4883, + "step": 9648, + "vm_loss": 0.1114 + }, + { + "epoch": 1.857496931924826, + "grad_norm": 3.145828079323454, + "learning_rate": 2.642774263356507e-07, + "loss": 0.7879, + "step": 9649 + }, + { + "epoch": 1.8576894386024017, + "grad_norm": 3.2049003241142073, + "learning_rate": 2.6356583682071945e-07, + "loss": 0.8402, + "step": 9650 + }, + { + "epoch": 1.8578819452799769, + "grad_norm": 3.2603877132951236, + "learning_rate": 2.6285519381175407e-07, + "loss": 0.8326, + "step": 9651 + }, + { + "epoch": 1.8580744519575523, + "grad_norm": 3.342407883174142, + "learning_rate": 2.6214549737783813e-07, + "loss": 0.8562, + "step": 9652 + }, + { + "epoch": 1.8582669586351277, + "grad_norm": 3.195245475083382, + "learning_rate": 2.614367475879642e-07, + "loss": 0.8118, + "step": 9653 + }, + { + "epoch": 1.858459465312703, + "grad_norm": 3.188890736193979, + "learning_rate": 2.607289445110284e-07, + "loss": 0.8486, + "step": 9654 + }, + { + "epoch": 1.8586519719902785, + "grad_norm": 3.2726839226499327, + "learning_rate": 2.600220882158444e-07, + "loss": 0.8276, + "step": 9655 + }, + { + "epoch": 1.8588444786678537, + "grad_norm": 3.2450376487877697, + "learning_rate": 2.5931617877112183e-07, + "loss": 0.84, + "step": 9656 + }, + { + "epoch": 1.8588444786678537, + "lm_loss": 0.7111, + "step": 9656, + "vm_loss": 0.1794 + }, + { + "epoch": 1.8588444786678537, + "lm_loss": 0.598, + "step": 9656, + "vm_loss": 0.1598 + }, + { + "epoch": 1.8588444786678537, + "lm_loss": 0.7622, + "step": 9656, + "vm_loss": 0.157 + }, + { + "epoch": 1.8588444786678537, + "lm_loss": 0.6085, + "step": 9656, + "vm_loss": 0.1302 + }, + { + "epoch": 1.8588444786678537, + "lm_loss": 0.3055, + "step": 9656, + "vm_loss": 0.1418 + }, + { + "epoch": 1.8588444786678537, + "lm_loss": 0.718, + "step": 9656, + "vm_loss": 0.1653 + }, + { + "epoch": 1.8588444786678537, + "lm_loss": 0.5804, + "step": 9656, + "vm_loss": 0.2014 + }, + { + "epoch": 1.8588444786678537, + "lm_loss": 0.5824, + "step": 9656, + "vm_loss": 0.1429 + }, + { + "epoch": 1.8590369853454292, + "grad_norm": 3.3464966422192277, + "learning_rate": 2.586112162454868e-07, + "loss": 0.8333, + "step": 9657 + }, + { + "epoch": 1.8592294920230046, + "grad_norm": 3.174787677173344, + "learning_rate": 2.579072007074701e-07, + "loss": 0.817, + "step": 9658 + }, + { + "epoch": 1.8594219987005798, + "grad_norm": 3.3124093480476815, + "learning_rate": 2.572041322255125e-07, + "loss": 0.8445, + "step": 9659 + }, + { + "epoch": 1.8596145053781554, + "grad_norm": 3.2256057403923366, + "learning_rate": 2.5650201086795946e-07, + "loss": 0.8252, + "step": 9660 + }, + { + "epoch": 1.8598070120557306, + "grad_norm": 3.360017715834636, + "learning_rate": 2.558008367030662e-07, + "loss": 0.8604, + "step": 9661 + }, + { + "epoch": 1.859999518733306, + "grad_norm": 3.3032221913333895, + "learning_rate": 2.551006097989961e-07, + "loss": 0.8583, + "step": 9662 + }, + { + "epoch": 1.8601920254108815, + "grad_norm": 3.3262230806983197, + "learning_rate": 2.5440133022382243e-07, + "loss": 0.8495, + "step": 9663 + }, + { + "epoch": 1.8603845320884567, + "grad_norm": 3.1775843456761144, + "learning_rate": 2.537029980455197e-07, + "loss": 0.8367, + "step": 9664 + }, + { + "epoch": 1.8603845320884567, + "lm_loss": 0.8675, + "step": 9664, + "vm_loss": 0.145 + }, + { + "epoch": 1.8603845320884567, + "lm_loss": 0.7792, + "step": 9664, + "vm_loss": 0.1486 + }, + { + "epoch": 1.8603845320884567, + "lm_loss": 0.5186, + "step": 9664, + "vm_loss": 0.1395 + }, + { + "epoch": 1.8603845320884567, + "lm_loss": 1.0022, + "step": 9664, + "vm_loss": 0.1375 + }, + { + "epoch": 1.8603845320884567, + "lm_loss": 0.9925, + "step": 9664, + "vm_loss": 0.2454 + }, + { + "epoch": 1.8603845320884567, + "lm_loss": 0.8654, + "step": 9664, + "vm_loss": 0.1783 + }, + { + "epoch": 1.8603845320884567, + "lm_loss": 0.6293, + "step": 9664, + "vm_loss": 0.1821 + }, + { + "epoch": 1.8603845320884567, + "lm_loss": 0.5801, + "step": 9664, + "vm_loss": 0.1632 + }, + { + "epoch": 1.8605770387660323, + "grad_norm": 3.47990501075481, + "learning_rate": 2.5300561333197803e-07, + "loss": 0.8945, + "step": 9665 + }, + { + "epoch": 1.8607695454436075, + "grad_norm": 3.2091242496639865, + "learning_rate": 2.52309176150991e-07, + "loss": 0.8193, + "step": 9666 + }, + { + "epoch": 1.860962052121183, + "grad_norm": 3.274251130854965, + "learning_rate": 2.516136865702612e-07, + "loss": 0.8353, + "step": 9667 + }, + { + "epoch": 1.8611545587987584, + "grad_norm": 3.140856919723914, + "learning_rate": 2.509191446574e-07, + "loss": 0.8232, + "step": 9668 + }, + { + "epoch": 1.8613470654763336, + "grad_norm": 3.1525851329813634, + "learning_rate": 2.5022555047992446e-07, + "loss": 0.8116, + "step": 9669 + }, + { + "epoch": 1.8615395721539092, + "grad_norm": 3.2726937006517747, + "learning_rate": 2.49532904105263e-07, + "loss": 0.8308, + "step": 9670 + }, + { + "epoch": 1.8617320788314844, + "grad_norm": 3.2448702586529796, + "learning_rate": 2.488412056007461e-07, + "loss": 0.8226, + "step": 9671 + }, + { + "epoch": 1.8619245855090598, + "grad_norm": 3.334841685342883, + "learning_rate": 2.481504550336189e-07, + "loss": 0.9059, + "step": 9672 + }, + { + "epoch": 1.8619245855090598, + "lm_loss": 0.383, + "step": 9672, + "vm_loss": 0.1784 + }, + { + "epoch": 1.8619245855090598, + "lm_loss": 1.1607, + "step": 9672, + "vm_loss": 0.1458 + }, + { + "epoch": 1.8619245855090598, + "lm_loss": 0.5706, + "step": 9672, + "vm_loss": 0.1482 + }, + { + "epoch": 1.8619245855090598, + "lm_loss": 0.4752, + "step": 9672, + "vm_loss": 0.1546 + }, + { + "epoch": 1.8619245855090598, + "lm_loss": 0.6216, + "step": 9672, + "vm_loss": 0.1605 + }, + { + "epoch": 1.8619245855090598, + "lm_loss": 0.603, + "step": 9672, + "vm_loss": 0.1533 + }, + { + "epoch": 1.8619245855090598, + "lm_loss": 0.6829, + "step": 9672, + "vm_loss": 0.1672 + }, + { + "epoch": 1.8619245855090598, + "lm_loss": 0.7276, + "step": 9672, + "vm_loss": 0.1585 + }, + { + "epoch": 1.8621170921866352, + "grad_norm": 3.1890160415655116, + "learning_rate": 2.474606524710288e-07, + "loss": 0.8122, + "step": 9673 + }, + { + "epoch": 1.8623095988642104, + "grad_norm": 3.366482000308505, + "learning_rate": 2.467717979800355e-07, + "loss": 0.8477, + "step": 9674 + }, + { + "epoch": 1.862502105541786, + "grad_norm": 3.3269393040648887, + "learning_rate": 2.4608389162760427e-07, + "loss": 0.8535, + "step": 9675 + }, + { + "epoch": 1.8626946122193613, + "grad_norm": 3.227811758422907, + "learning_rate": 2.4539693348060835e-07, + "loss": 0.8404, + "step": 9676 + }, + { + "epoch": 1.8628871188969367, + "grad_norm": 3.300521378748627, + "learning_rate": 2.447109236058265e-07, + "loss": 0.8878, + "step": 9677 + }, + { + "epoch": 1.8630796255745121, + "grad_norm": 3.112021656396733, + "learning_rate": 2.440258620699498e-07, + "loss": 0.7939, + "step": 9678 + }, + { + "epoch": 1.8632721322520875, + "grad_norm": 3.1811123186668464, + "learning_rate": 2.4334174893957505e-07, + "loss": 0.8201, + "step": 9679 + }, + { + "epoch": 1.863464638929663, + "grad_norm": 3.2446747349981604, + "learning_rate": 2.426585842812079e-07, + "loss": 0.8202, + "step": 9680 + }, + { + "epoch": 1.863464638929663, + "lm_loss": 0.5164, + "step": 9680, + "vm_loss": 0.1552 + }, + { + "epoch": 1.863464638929663, + "lm_loss": 0.4945, + "step": 9680, + "vm_loss": 0.2067 + }, + { + "epoch": 1.863464638929663, + "lm_loss": 0.565, + "step": 9680, + "vm_loss": 0.1217 + }, + { + "epoch": 1.863464638929663, + "lm_loss": 0.7844, + "step": 9680, + "vm_loss": 0.1886 + }, + { + "epoch": 1.863464638929663, + "lm_loss": 0.6199, + "step": 9680, + "vm_loss": 0.0831 + }, + { + "epoch": 1.863464638929663, + "lm_loss": 0.8042, + "step": 9680, + "vm_loss": 0.122 + }, + { + "epoch": 1.863464638929663, + "lm_loss": 0.5626, + "step": 9680, + "vm_loss": 0.128 + }, + { + "epoch": 1.863464638929663, + "lm_loss": 0.563, + "step": 9680, + "vm_loss": 0.1613 + }, + { + "epoch": 1.8636571456072382, + "grad_norm": 3.143049137460251, + "learning_rate": 2.4197636816125634e-07, + "loss": 0.8126, + "step": 9681 + }, + { + "epoch": 1.8638496522848136, + "grad_norm": 3.2426675089920205, + "learning_rate": 2.412951006460451e-07, + "loss": 0.8659, + "step": 9682 + }, + { + "epoch": 1.864042158962389, + "grad_norm": 3.2162269463474207, + "learning_rate": 2.406147818018001e-07, + "loss": 0.7919, + "step": 9683 + }, + { + "epoch": 1.8642346656399644, + "grad_norm": 3.1617350761602325, + "learning_rate": 2.399354116946584e-07, + "loss": 0.7993, + "step": 9684 + }, + { + "epoch": 1.8644271723175398, + "grad_norm": 3.1669635447357822, + "learning_rate": 2.392569903906605e-07, + "loss": 0.7747, + "step": 9685 + }, + { + "epoch": 1.864619678995115, + "grad_norm": 3.160352014176193, + "learning_rate": 2.385795179557604e-07, + "loss": 0.7938, + "step": 9686 + }, + { + "epoch": 1.8648121856726905, + "grad_norm": 3.125943571038174, + "learning_rate": 2.3790299445581645e-07, + "loss": 0.8248, + "step": 9687 + }, + { + "epoch": 1.865004692350266, + "grad_norm": 3.3510804410930666, + "learning_rate": 2.3722741995659382e-07, + "loss": 0.8523, + "step": 9688 + }, + { + "epoch": 1.865004692350266, + "lm_loss": 0.9826, + "step": 9688, + "vm_loss": 0.2301 + }, + { + "epoch": 1.865004692350266, + "lm_loss": 0.5577, + "step": 9688, + "vm_loss": 0.0892 + }, + { + "epoch": 1.865004692350266, + "lm_loss": 0.6451, + "step": 9688, + "vm_loss": 0.1591 + }, + { + "epoch": 1.865004692350266, + "lm_loss": 0.7339, + "step": 9688, + "vm_loss": 0.171 + }, + { + "epoch": 1.865004692350266, + "lm_loss": 0.5387, + "step": 9688, + "vm_loss": 0.156 + }, + { + "epoch": 1.865004692350266, + "lm_loss": 1.3206, + "step": 9688, + "vm_loss": 0.1629 + }, + { + "epoch": 1.865004692350266, + "lm_loss": 0.7708, + "step": 9688, + "vm_loss": 0.122 + }, + { + "epoch": 1.865004692350266, + "lm_loss": 0.7254, + "step": 9688, + "vm_loss": 0.1274 + }, + { + "epoch": 1.8651971990278413, + "grad_norm": 3.440153434596968, + "learning_rate": 2.365527945237689e-07, + "loss": 0.8661, + "step": 9689 + }, + { + "epoch": 1.8653897057054167, + "grad_norm": 3.094718765062354, + "learning_rate": 2.3587911822292364e-07, + "loss": 0.7645, + "step": 9690 + }, + { + "epoch": 1.865582212382992, + "grad_norm": 3.418161329806052, + "learning_rate": 2.352063911195479e-07, + "loss": 0.8774, + "step": 9691 + }, + { + "epoch": 1.8657747190605676, + "grad_norm": 3.164205496457555, + "learning_rate": 2.3453461327903825e-07, + "loss": 0.8191, + "step": 9692 + }, + { + "epoch": 1.8659672257381428, + "grad_norm": 3.321684118399626, + "learning_rate": 2.3386378476670245e-07, + "loss": 0.8577, + "step": 9693 + }, + { + "epoch": 1.8661597324157182, + "grad_norm": 3.142493114974839, + "learning_rate": 2.3319390564775168e-07, + "loss": 0.7871, + "step": 9694 + }, + { + "epoch": 1.8663522390932936, + "grad_norm": 3.3976251717189068, + "learning_rate": 2.3252497598730606e-07, + "loss": 0.864, + "step": 9695 + }, + { + "epoch": 1.8665447457708688, + "grad_norm": 3.2686074730985673, + "learning_rate": 2.318569958503958e-07, + "loss": 0.8157, + "step": 9696 + }, + { + "epoch": 1.8665447457708688, + "lm_loss": 0.6711, + "step": 9696, + "vm_loss": 0.1132 + }, + { + "epoch": 1.8665447457708688, + "lm_loss": 0.4747, + "step": 9696, + "vm_loss": 0.1571 + }, + { + "epoch": 1.8665447457708688, + "lm_loss": 0.5886, + "step": 9696, + "vm_loss": 0.1683 + }, + { + "epoch": 1.8665447457708688, + "lm_loss": 0.6015, + "step": 9696, + "vm_loss": 0.1717 + }, + { + "epoch": 1.8665447457708688, + "lm_loss": 0.798, + "step": 9696, + "vm_loss": 0.1509 + }, + { + "epoch": 1.8665447457708688, + "lm_loss": 1.0156, + "step": 9696, + "vm_loss": 0.1565 + }, + { + "epoch": 1.8665447457708688, + "lm_loss": 0.7591, + "step": 9696, + "vm_loss": 0.2009 + }, + { + "epoch": 1.8665447457708688, + "lm_loss": 0.7683, + "step": 9696, + "vm_loss": 0.2282 + }, + { + "epoch": 1.8667372524484445, + "grad_norm": 3.296592215486963, + "learning_rate": 2.311899653019578e-07, + "loss": 0.8498, + "step": 9697 + }, + { + "epoch": 1.8669297591260197, + "grad_norm": 3.354925632736047, + "learning_rate": 2.3052388440683248e-07, + "loss": 0.8834, + "step": 9698 + }, + { + "epoch": 1.867122265803595, + "grad_norm": 3.1147071902608916, + "learning_rate": 2.298587532297758e-07, + "loss": 0.7738, + "step": 9699 + }, + { + "epoch": 1.8673147724811705, + "grad_norm": 3.18120408239932, + "learning_rate": 2.291945718354438e-07, + "loss": 0.7941, + "step": 9700 + }, + { + "epoch": 1.8675072791587457, + "grad_norm": 3.267466408861681, + "learning_rate": 2.2853134028840594e-07, + "loss": 0.8526, + "step": 9701 + }, + { + "epoch": 1.8676997858363213, + "grad_norm": 3.068626957717172, + "learning_rate": 2.278690586531329e-07, + "loss": 0.7991, + "step": 9702 + }, + { + "epoch": 1.8678922925138965, + "grad_norm": 3.1655574221918465, + "learning_rate": 2.2720772699401205e-07, + "loss": 0.8058, + "step": 9703 + }, + { + "epoch": 1.868084799191472, + "grad_norm": 3.2355871296216, + "learning_rate": 2.2654734537532864e-07, + "loss": 0.8382, + "step": 9704 + }, + { + "epoch": 1.868084799191472, + "lm_loss": 0.598, + "step": 9704, + "vm_loss": 0.0869 + }, + { + "epoch": 1.868084799191472, + "lm_loss": 0.6105, + "step": 9704, + "vm_loss": 0.205 + }, + { + "epoch": 1.868084799191472, + "lm_loss": 0.6871, + "step": 9704, + "vm_loss": 0.1315 + }, + { + "epoch": 1.868084799191472, + "lm_loss": 0.4307, + "step": 9704, + "vm_loss": 0.1813 + }, + { + "epoch": 1.868084799191472, + "lm_loss": 0.2651, + "step": 9704, + "vm_loss": 0.2453 + }, + { + "epoch": 1.868084799191472, + "lm_loss": 0.7181, + "step": 9704, + "vm_loss": 0.1574 + }, + { + "epoch": 1.868084799191472, + "lm_loss": 1.0245, + "step": 9704, + "vm_loss": 0.1383 + }, + { + "epoch": 1.868084799191472, + "lm_loss": 0.6779, + "step": 9704, + "vm_loss": 0.1845 + }, + { + "epoch": 1.8682773058690474, + "grad_norm": 3.2927302339922853, + "learning_rate": 2.258879138612824e-07, + "loss": 0.8359, + "step": 9705 + }, + { + "epoch": 1.8684698125466226, + "grad_norm": 3.171509162301296, + "learning_rate": 2.2522943251597874e-07, + "loss": 0.8029, + "step": 9706 + }, + { + "epoch": 1.8686623192241982, + "grad_norm": 3.299729184506608, + "learning_rate": 2.2457190140343088e-07, + "loss": 0.8466, + "step": 9707 + }, + { + "epoch": 1.8688548259017734, + "grad_norm": 3.446661085823441, + "learning_rate": 2.2391532058755772e-07, + "loss": 0.8816, + "step": 9708 + }, + { + "epoch": 1.8690473325793489, + "grad_norm": 3.323875063457362, + "learning_rate": 2.232596901321882e-07, + "loss": 0.8703, + "step": 9709 + }, + { + "epoch": 1.8692398392569243, + "grad_norm": 3.095697258295933, + "learning_rate": 2.2260501010105795e-07, + "loss": 0.8163, + "step": 9710 + }, + { + "epoch": 1.8694323459344995, + "grad_norm": 3.29864542684858, + "learning_rate": 2.2195128055780945e-07, + "loss": 0.8424, + "step": 9711 + }, + { + "epoch": 1.8696248526120751, + "grad_norm": 3.2426546367183335, + "learning_rate": 2.2129850156599408e-07, + "loss": 0.8365, + "step": 9712 + }, + { + "epoch": 1.8696248526120751, + "lm_loss": 0.2949, + "step": 9712, + "vm_loss": 0.136 + }, + { + "epoch": 1.8696248526120751, + "lm_loss": 0.6369, + "step": 9712, + "vm_loss": 0.1262 + }, + { + "epoch": 1.8696248526120751, + "lm_loss": 0.5325, + "step": 9712, + "vm_loss": 0.1819 + }, + { + "epoch": 1.8696248526120751, + "lm_loss": 0.5593, + "step": 9712, + "vm_loss": 0.215 + }, + { + "epoch": 1.8696248526120751, + "lm_loss": 0.5492, + "step": 9712, + "vm_loss": 0.1597 + }, + { + "epoch": 1.8696248526120751, + "lm_loss": 0.453, + "step": 9712, + "vm_loss": 0.1217 + }, + { + "epoch": 1.8696248526120751, + "lm_loss": 0.5704, + "step": 9712, + "vm_loss": 0.1371 + }, + { + "epoch": 1.8696248526120751, + "lm_loss": 0.3374, + "step": 9712, + "vm_loss": 0.1515 + }, + { + "epoch": 1.8698173592896503, + "grad_norm": 3.1229958273898153, + "learning_rate": 2.2064667318907108e-07, + "loss": 0.7664, + "step": 9713 + }, + { + "epoch": 1.8700098659672257, + "grad_norm": 3.1754028997580828, + "learning_rate": 2.1999579549040752e-07, + "loss": 0.8223, + "step": 9714 + }, + { + "epoch": 1.8702023726448012, + "grad_norm": 3.3722238714422295, + "learning_rate": 2.193458685332728e-07, + "loss": 0.8716, + "step": 9715 + }, + { + "epoch": 1.8703948793223764, + "grad_norm": 3.3717148889086683, + "learning_rate": 2.186968923808519e-07, + "loss": 0.8529, + "step": 9716 + }, + { + "epoch": 1.870587385999952, + "grad_norm": 3.181198161958038, + "learning_rate": 2.180488670962322e-07, + "loss": 0.8044, + "step": 9717 + }, + { + "epoch": 1.8707798926775272, + "grad_norm": 3.085185442809127, + "learning_rate": 2.1740179274241103e-07, + "loss": 0.7873, + "step": 9718 + }, + { + "epoch": 1.8709723993551026, + "grad_norm": 3.1285213877219498, + "learning_rate": 2.1675566938229032e-07, + "loss": 0.8136, + "step": 9719 + }, + { + "epoch": 1.871164906032678, + "grad_norm": 3.2532147430005733, + "learning_rate": 2.1611049707868425e-07, + "loss": 0.8235, + "step": 9720 + }, + { + "epoch": 1.871164906032678, + "lm_loss": 0.591, + "step": 9720, + "vm_loss": 0.1376 + }, + { + "epoch": 1.871164906032678, + "lm_loss": 0.56, + "step": 9720, + "vm_loss": 0.1869 + }, + { + "epoch": 1.871164906032678, + "lm_loss": 0.9641, + "step": 9720, + "vm_loss": 0.0985 + }, + { + "epoch": 1.871164906032678, + "lm_loss": 0.5021, + "step": 9720, + "vm_loss": 0.1643 + }, + { + "epoch": 1.871164906032678, + "lm_loss": 0.6118, + "step": 9720, + "vm_loss": 0.1707 + }, + { + "epoch": 1.871164906032678, + "lm_loss": 0.5551, + "step": 9720, + "vm_loss": 0.1958 + }, + { + "epoch": 1.871164906032678, + "lm_loss": 0.3738, + "step": 9720, + "vm_loss": 0.1918 + }, + { + "epoch": 1.871164906032678, + "lm_loss": 0.5624, + "step": 9720, + "vm_loss": 0.1483 + }, + { + "epoch": 1.8713574127102532, + "grad_norm": 3.257328640580475, + "learning_rate": 2.1546627589430935e-07, + "loss": 0.8603, + "step": 9721 + }, + { + "epoch": 1.8715499193878289, + "grad_norm": 3.192045098216215, + "learning_rate": 2.1482300589179327e-07, + "loss": 0.8265, + "step": 9722 + }, + { + "epoch": 1.871742426065404, + "grad_norm": 3.194524546444081, + "learning_rate": 2.1418068713367046e-07, + "loss": 0.8358, + "step": 9723 + }, + { + "epoch": 1.8719349327429795, + "grad_norm": 3.287629593883907, + "learning_rate": 2.1353931968238206e-07, + "loss": 0.8221, + "step": 9724 + }, + { + "epoch": 1.872127439420555, + "grad_norm": 3.488361239853445, + "learning_rate": 2.128989036002771e-07, + "loss": 0.9036, + "step": 9725 + }, + { + "epoch": 1.8723199460981301, + "grad_norm": 3.132195822761052, + "learning_rate": 2.122594389496113e-07, + "loss": 0.824, + "step": 9726 + }, + { + "epoch": 1.8725124527757058, + "grad_norm": 3.292835570162771, + "learning_rate": 2.1162092579255055e-07, + "loss": 0.846, + "step": 9727 + }, + { + "epoch": 1.872704959453281, + "grad_norm": 3.262522182729641, + "learning_rate": 2.1098336419116628e-07, + "loss": 0.8523, + "step": 9728 + }, + { + "epoch": 1.872704959453281, + "lm_loss": 0.8178, + "step": 9728, + "vm_loss": 0.2007 + }, + { + "epoch": 1.872704959453281, + "lm_loss": 0.7685, + "step": 9728, + "vm_loss": 0.1549 + }, + { + "epoch": 1.872704959453281, + "lm_loss": 0.4841, + "step": 9728, + "vm_loss": 0.1708 + }, + { + "epoch": 1.872704959453281, + "lm_loss": 0.8488, + "step": 9728, + "vm_loss": 0.1996 + }, + { + "epoch": 1.872704959453281, + "lm_loss": 0.53, + "step": 9728, + "vm_loss": 0.1616 + }, + { + "epoch": 1.872704959453281, + "lm_loss": 0.6234, + "step": 9728, + "vm_loss": 0.1955 + }, + { + "epoch": 1.872704959453281, + "lm_loss": 0.7864, + "step": 9728, + "vm_loss": 0.202 + }, + { + "epoch": 1.872704959453281, + "lm_loss": 0.9771, + "step": 9728, + "vm_loss": 0.1742 + }, + { + "epoch": 1.8728974661308564, + "grad_norm": 3.1501397936146787, + "learning_rate": 2.1034675420743557e-07, + "loss": 0.8288, + "step": 9729 + }, + { + "epoch": 1.8730899728084318, + "grad_norm": 3.2668787761368185, + "learning_rate": 2.097110959032489e-07, + "loss": 0.83, + "step": 9730 + }, + { + "epoch": 1.873282479486007, + "grad_norm": 3.146049702582496, + "learning_rate": 2.0907638934039575e-07, + "loss": 0.8289, + "step": 9731 + }, + { + "epoch": 1.8734749861635827, + "grad_norm": 3.1668131620811293, + "learning_rate": 2.0844263458058122e-07, + "loss": 0.7958, + "step": 9732 + }, + { + "epoch": 1.8736674928411579, + "grad_norm": 3.3195585644546073, + "learning_rate": 2.0780983168541268e-07, + "loss": 0.8648, + "step": 9733 + }, + { + "epoch": 1.8738599995187333, + "grad_norm": 3.1890995210309288, + "learning_rate": 2.0717798071640761e-07, + "loss": 0.8216, + "step": 9734 + }, + { + "epoch": 1.8740525061963087, + "grad_norm": 3.2677739386101425, + "learning_rate": 2.0654708173499016e-07, + "loss": 0.8551, + "step": 9735 + }, + { + "epoch": 1.874245012873884, + "grad_norm": 3.200486750956276, + "learning_rate": 2.0591713480248909e-07, + "loss": 0.857, + "step": 9736 + }, + { + "epoch": 1.874245012873884, + "lm_loss": 0.4844, + "step": 9736, + "vm_loss": 0.1321 + }, + { + "epoch": 1.874245012873884, + "lm_loss": 0.9613, + "step": 9736, + "vm_loss": 0.164 + }, + { + "epoch": 1.874245012873884, + "lm_loss": 0.899, + "step": 9736, + "vm_loss": 0.1125 + }, + { + "epoch": 1.874245012873884, + "lm_loss": 0.9652, + "step": 9736, + "vm_loss": 0.1921 + }, + { + "epoch": 1.874245012873884, + "lm_loss": 0.5801, + "step": 9736, + "vm_loss": 0.2294 + }, + { + "epoch": 1.874245012873884, + "lm_loss": 0.5664, + "step": 9736, + "vm_loss": 0.1468 + }, + { + "epoch": 1.874245012873884, + "lm_loss": 0.7078, + "step": 9736, + "vm_loss": 0.1981 + }, + { + "epoch": 1.874245012873884, + "lm_loss": 0.6331, + "step": 9736, + "vm_loss": 0.1993 + }, + { + "epoch": 1.8744375195514595, + "grad_norm": 3.235305778019497, + "learning_rate": 2.0528813998014762e-07, + "loss": 0.8556, + "step": 9737 + }, + { + "epoch": 1.8746300262290347, + "grad_norm": 3.291903340896733, + "learning_rate": 2.0466009732910908e-07, + "loss": 0.8721, + "step": 9738 + }, + { + "epoch": 1.8748225329066102, + "grad_norm": 3.3385508919303493, + "learning_rate": 2.0403300691042906e-07, + "loss": 0.8523, + "step": 9739 + }, + { + "epoch": 1.8750150395841856, + "grad_norm": 3.309376141655951, + "learning_rate": 2.0340686878506655e-07, + "loss": 0.8545, + "step": 9740 + }, + { + "epoch": 1.875207546261761, + "grad_norm": 3.0862666470057896, + "learning_rate": 2.0278168301389177e-07, + "loss": 0.812, + "step": 9741 + }, + { + "epoch": 1.8754000529393364, + "grad_norm": 3.270874192099001, + "learning_rate": 2.0215744965768057e-07, + "loss": 0.8878, + "step": 9742 + }, + { + "epoch": 1.8755925596169116, + "grad_norm": 3.3824207127536945, + "learning_rate": 2.0153416877711552e-07, + "loss": 0.9079, + "step": 9743 + }, + { + "epoch": 1.875785066294487, + "grad_norm": 3.3554009239298974, + "learning_rate": 2.0091184043278923e-07, + "loss": 0.8626, + "step": 9744 + }, + { + "epoch": 1.875785066294487, + "lm_loss": 0.46, + "step": 9744, + "vm_loss": 0.1994 + }, + { + "epoch": 1.875785066294487, + "lm_loss": 0.701, + "step": 9744, + "vm_loss": 0.1157 + }, + { + "epoch": 1.875785066294487, + "lm_loss": 0.4721, + "step": 9744, + "vm_loss": 0.1697 + }, + { + "epoch": 1.875785066294487, + "lm_loss": 0.6171, + "step": 9744, + "vm_loss": 0.1682 + }, + { + "epoch": 1.875785066294487, + "lm_loss": 0.5504, + "step": 9744, + "vm_loss": 0.1867 + }, + { + "epoch": 1.875785066294487, + "lm_loss": 0.7514, + "step": 9744, + "vm_loss": 0.1125 + }, + { + "epoch": 1.875785066294487, + "lm_loss": 0.5252, + "step": 9744, + "vm_loss": 0.136 + }, + { + "epoch": 1.875785066294487, + "lm_loss": 0.7013, + "step": 9744, + "vm_loss": 0.2158 + }, + { + "epoch": 1.8759775729720625, + "grad_norm": 3.199373082523442, + "learning_rate": 2.0029046468519885e-07, + "loss": 0.8143, + "step": 9745 + }, + { + "epoch": 1.876170079649638, + "grad_norm": 3.1163404396667014, + "learning_rate": 1.9967004159474946e-07, + "loss": 0.7806, + "step": 9746 + }, + { + "epoch": 1.8763625863272133, + "grad_norm": 3.133494924560944, + "learning_rate": 1.990505712217561e-07, + "loss": 0.8267, + "step": 9747 + }, + { + "epoch": 1.8765550930047885, + "grad_norm": 3.337141915380523, + "learning_rate": 1.9843205362643724e-07, + "loss": 0.8555, + "step": 9748 + }, + { + "epoch": 1.876747599682364, + "grad_norm": 3.4311856183189002, + "learning_rate": 1.978144888689204e-07, + "loss": 0.8794, + "step": 9749 + }, + { + "epoch": 1.8769401063599394, + "grad_norm": 3.2747941313657574, + "learning_rate": 1.9719787700924308e-07, + "loss": 0.8788, + "step": 9750 + }, + { + "epoch": 1.8771326130375148, + "grad_norm": 3.1706260957535455, + "learning_rate": 1.9658221810734735e-07, + "loss": 0.8399, + "step": 9751 + }, + { + "epoch": 1.8773251197150902, + "grad_norm": 3.2379514835818037, + "learning_rate": 1.9596751222308085e-07, + "loss": 0.8317, + "step": 9752 + }, + { + "epoch": 1.8773251197150902, + "lm_loss": 0.8049, + "step": 9752, + "vm_loss": 0.1098 + }, + { + "epoch": 1.8773251197150902, + "lm_loss": 0.5397, + "step": 9752, + "vm_loss": 0.1894 + }, + { + "epoch": 1.8773251197150902, + "lm_loss": 0.9043, + "step": 9752, + "vm_loss": 0.132 + }, + { + "epoch": 1.8773251197150902, + "lm_loss": 0.899, + "step": 9752, + "vm_loss": 0.2533 + }, + { + "epoch": 1.8773251197150902, + "lm_loss": 0.5991, + "step": 9752, + "vm_loss": 0.1693 + }, + { + "epoch": 1.8773251197150902, + "lm_loss": 0.4712, + "step": 9752, + "vm_loss": 0.14 + }, + { + "epoch": 1.8773251197150902, + "lm_loss": 0.8153, + "step": 9752, + "vm_loss": 0.2072 + }, + { + "epoch": 1.8773251197150902, + "lm_loss": 0.4569, + "step": 9752, + "vm_loss": 0.1898 + }, + { + "epoch": 1.8775176263926654, + "grad_norm": 3.0641745742921978, + "learning_rate": 1.953537594162025e-07, + "loss": 0.783, + "step": 9753 + }, + { + "epoch": 1.877710133070241, + "grad_norm": 3.1238782542711694, + "learning_rate": 1.9474095974637673e-07, + "loss": 0.7744, + "step": 9754 + }, + { + "epoch": 1.8779026397478162, + "grad_norm": 3.305386572267922, + "learning_rate": 1.9412911327317595e-07, + "loss": 0.8296, + "step": 9755 + }, + { + "epoch": 1.8780951464253917, + "grad_norm": 3.0035863329513877, + "learning_rate": 1.9351822005607922e-07, + "loss": 0.7795, + "step": 9756 + }, + { + "epoch": 1.878287653102967, + "grad_norm": 3.319816005087962, + "learning_rate": 1.9290828015447237e-07, + "loss": 0.876, + "step": 9757 + }, + { + "epoch": 1.8784801597805423, + "grad_norm": 3.3085398684839333, + "learning_rate": 1.922992936276513e-07, + "loss": 0.8475, + "step": 9758 + }, + { + "epoch": 1.878672666458118, + "grad_norm": 3.1397651346353146, + "learning_rate": 1.9169126053481534e-07, + "loss": 0.8107, + "step": 9759 + }, + { + "epoch": 1.8788651731356931, + "grad_norm": 3.3281318374987263, + "learning_rate": 1.9108418093507385e-07, + "loss": 0.8659, + "step": 9760 + }, + { + "epoch": 1.8788651731356931, + "lm_loss": 0.7862, + "step": 9760, + "vm_loss": 0.1715 + }, + { + "epoch": 1.8788651731356931, + "lm_loss": 0.7759, + "step": 9760, + "vm_loss": 0.1916 + }, + { + "epoch": 1.8788651731356931, + "lm_loss": 0.825, + "step": 9760, + "vm_loss": 0.1758 + }, + { + "epoch": 1.8788651731356931, + "lm_loss": 0.5529, + "step": 9760, + "vm_loss": 0.1861 + }, + { + "epoch": 1.8788651731356931, + "lm_loss": 0.6048, + "step": 9760, + "vm_loss": 0.1454 + }, + { + "epoch": 1.8788651731356931, + "lm_loss": 0.6929, + "step": 9760, + "vm_loss": 0.1197 + }, + { + "epoch": 1.8788651731356931, + "lm_loss": 0.7371, + "step": 9760, + "vm_loss": 0.1649 + }, + { + "epoch": 1.8788651731356931, + "lm_loss": 0.5864, + "step": 9760, + "vm_loss": 0.1467 + }, + { + "epoch": 1.8790576798132685, + "grad_norm": 3.2187354431328825, + "learning_rate": 1.9047805488744188e-07, + "loss": 0.8416, + "step": 9761 + }, + { + "epoch": 1.879250186490844, + "grad_norm": 3.2612582877319465, + "learning_rate": 1.8987288245084556e-07, + "loss": 0.8063, + "step": 9762 + }, + { + "epoch": 1.8794426931684192, + "grad_norm": 3.257849488628638, + "learning_rate": 1.8926866368411012e-07, + "loss": 0.846, + "step": 9763 + }, + { + "epoch": 1.8796351998459948, + "grad_norm": 3.202178638773855, + "learning_rate": 1.8866539864597855e-07, + "loss": 0.8241, + "step": 9764 + }, + { + "epoch": 1.87982770652357, + "grad_norm": 3.3844732573164262, + "learning_rate": 1.8806308739509394e-07, + "loss": 0.9206, + "step": 9765 + }, + { + "epoch": 1.8800202132011454, + "grad_norm": 3.228501560165754, + "learning_rate": 1.8746172999000722e-07, + "loss": 0.8395, + "step": 9766 + }, + { + "epoch": 1.8802127198787209, + "grad_norm": 3.3034073993584556, + "learning_rate": 1.8686132648918055e-07, + "loss": 0.8477, + "step": 9767 + }, + { + "epoch": 1.880405226556296, + "grad_norm": 3.3027977811351277, + "learning_rate": 1.862618769509794e-07, + "loss": 0.8428, + "step": 9768 + }, + { + "epoch": 1.880405226556296, + "lm_loss": 0.9262, + "step": 9768, + "vm_loss": 0.1172 + }, + { + "epoch": 1.880405226556296, + "lm_loss": 0.6772, + "step": 9768, + "vm_loss": 0.1477 + }, + { + "epoch": 1.880405226556296, + "lm_loss": 1.2286, + "step": 9768, + "vm_loss": 0.1653 + }, + { + "epoch": 1.880405226556296, + "lm_loss": 0.7982, + "step": 9768, + "vm_loss": 0.0933 + }, + { + "epoch": 1.880405226556296, + "lm_loss": 0.5127, + "step": 9768, + "vm_loss": 0.2095 + }, + { + "epoch": 1.880405226556296, + "lm_loss": 0.5794, + "step": 9768, + "vm_loss": 0.1438 + }, + { + "epoch": 1.880405226556296, + "lm_loss": 0.5133, + "step": 9768, + "vm_loss": 0.1586 + }, + { + "epoch": 1.880405226556296, + "lm_loss": 0.5285, + "step": 9768, + "vm_loss": 0.1362 + }, + { + "epoch": 1.8805977332338717, + "grad_norm": 3.1662506759369897, + "learning_rate": 1.8566338143367835e-07, + "loss": 0.7944, + "step": 9769 + }, + { + "epoch": 1.880790239911447, + "grad_norm": 3.580192771625688, + "learning_rate": 1.8506583999545747e-07, + "loss": 0.9072, + "step": 9770 + }, + { + "epoch": 1.8809827465890223, + "grad_norm": 3.284328396357913, + "learning_rate": 1.8446925269440806e-07, + "loss": 0.8582, + "step": 9771 + }, + { + "epoch": 1.8811752532665977, + "grad_norm": 3.336742070100333, + "learning_rate": 1.8387361958852378e-07, + "loss": 0.8878, + "step": 9772 + }, + { + "epoch": 1.881367759944173, + "grad_norm": 3.2327159552030915, + "learning_rate": 1.8327894073570828e-07, + "loss": 0.8391, + "step": 9773 + }, + { + "epoch": 1.8815602666217486, + "grad_norm": 3.129188841043906, + "learning_rate": 1.8268521619377312e-07, + "loss": 0.7853, + "step": 9774 + }, + { + "epoch": 1.8817527732993238, + "grad_norm": 3.300167298964028, + "learning_rate": 1.820924460204354e-07, + "loss": 0.8463, + "step": 9775 + }, + { + "epoch": 1.8819452799768992, + "grad_norm": 3.02643565181241, + "learning_rate": 1.8150063027331798e-07, + "loss": 0.7795, + "step": 9776 + }, + { + "epoch": 1.8819452799768992, + "lm_loss": 0.3679, + "step": 9776, + "vm_loss": 0.1972 + }, + { + "epoch": 1.8819452799768992, + "lm_loss": 0.658, + "step": 9776, + "vm_loss": 0.1595 + }, + { + "epoch": 1.8819452799768992, + "lm_loss": 0.475, + "step": 9776, + "vm_loss": 0.153 + }, + { + "epoch": 1.8819452799768992, + "lm_loss": 0.6515, + "step": 9776, + "vm_loss": 0.2043 + }, + { + "epoch": 1.8819452799768992, + "lm_loss": 0.5932, + "step": 9776, + "vm_loss": 0.1187 + }, + { + "epoch": 1.8819452799768992, + "lm_loss": 0.6392, + "step": 9776, + "vm_loss": 0.1394 + }, + { + "epoch": 1.8819452799768992, + "lm_loss": 0.7081, + "step": 9776, + "vm_loss": 0.1482 + }, + { + "epoch": 1.8819452799768992, + "lm_loss": 0.7175, + "step": 9776, + "vm_loss": 0.1343 + }, + { + "epoch": 1.8821377866544746, + "grad_norm": 3.380567422586673, + "learning_rate": 1.8090976900995593e-07, + "loss": 0.8739, + "step": 9777 + }, + { + "epoch": 1.8823302933320498, + "grad_norm": 3.2600991120257357, + "learning_rate": 1.8031986228778664e-07, + "loss": 0.8276, + "step": 9778 + }, + { + "epoch": 1.8825228000096255, + "grad_norm": 3.2616190084311225, + "learning_rate": 1.7973091016415755e-07, + "loss": 0.8413, + "step": 9779 + }, + { + "epoch": 1.8827153066872007, + "grad_norm": 3.263726286884041, + "learning_rate": 1.7914291269632067e-07, + "loss": 0.8736, + "step": 9780 + }, + { + "epoch": 1.882907813364776, + "grad_norm": 3.146893889939857, + "learning_rate": 1.7855586994144025e-07, + "loss": 0.8242, + "step": 9781 + }, + { + "epoch": 1.8831003200423515, + "grad_norm": 3.367488898063329, + "learning_rate": 1.7796978195658067e-07, + "loss": 0.8462, + "step": 9782 + }, + { + "epoch": 1.8832928267199267, + "grad_norm": 3.132580133337081, + "learning_rate": 1.7738464879871963e-07, + "loss": 0.8128, + "step": 9783 + }, + { + "epoch": 1.8834853333975023, + "grad_norm": 3.2646899588958855, + "learning_rate": 1.7680047052473837e-07, + "loss": 0.7963, + "step": 9784 + }, + { + "epoch": 1.8834853333975023, + "lm_loss": 0.5743, + "step": 9784, + "vm_loss": 0.1736 + }, + { + "epoch": 1.8834853333975023, + "lm_loss": 0.9274, + "step": 9784, + "vm_loss": 0.2082 + }, + { + "epoch": 1.8834853333975023, + "lm_loss": 0.5862, + "step": 9784, + "vm_loss": 0.1774 + }, + { + "epoch": 1.8834853333975023, + "lm_loss": 1.095, + "step": 9784, + "vm_loss": 0.1471 + }, + { + "epoch": 1.8834853333975023, + "lm_loss": 0.7025, + "step": 9784, + "vm_loss": 0.1794 + }, + { + "epoch": 1.8834853333975023, + "lm_loss": 0.7589, + "step": 9784, + "vm_loss": 0.1293 + }, + { + "epoch": 1.8834853333975023, + "lm_loss": 0.4675, + "step": 9784, + "vm_loss": 0.1229 + }, + { + "epoch": 1.8834853333975023, + "lm_loss": 0.7371, + "step": 9784, + "vm_loss": 0.17 + }, + { + "epoch": 1.8836778400750775, + "grad_norm": 3.235645300484035, + "learning_rate": 1.7621724719142698e-07, + "loss": 0.8569, + "step": 9785 + }, + { + "epoch": 1.883870346752653, + "grad_norm": 3.2626734959587886, + "learning_rate": 1.756349788554812e-07, + "loss": 0.8253, + "step": 9786 + }, + { + "epoch": 1.8840628534302284, + "grad_norm": 3.1142683895937737, + "learning_rate": 1.750536655735069e-07, + "loss": 0.786, + "step": 9787 + }, + { + "epoch": 1.8842553601078036, + "grad_norm": 3.196050184789095, + "learning_rate": 1.744733074020133e-07, + "loss": 0.8077, + "step": 9788 + }, + { + "epoch": 1.8844478667853792, + "grad_norm": 3.2693673177035056, + "learning_rate": 1.738939043974197e-07, + "loss": 0.7947, + "step": 9789 + }, + { + "epoch": 1.8846403734629544, + "grad_norm": 3.2859657650689242, + "learning_rate": 1.7331545661604997e-07, + "loss": 0.8448, + "step": 9790 + }, + { + "epoch": 1.8848328801405299, + "grad_norm": 3.267741855875794, + "learning_rate": 1.7273796411414023e-07, + "loss": 0.8529, + "step": 9791 + }, + { + "epoch": 1.8850253868181053, + "grad_norm": 3.288271549946051, + "learning_rate": 1.7216142694782668e-07, + "loss": 0.8336, + "step": 9792 + }, + { + "epoch": 1.8850253868181053, + "lm_loss": 0.8748, + "step": 9792, + "vm_loss": 0.1658 + }, + { + "epoch": 1.8850253868181053, + "lm_loss": 0.4924, + "step": 9792, + "vm_loss": 0.1469 + }, + { + "epoch": 1.8850253868181053, + "lm_loss": 0.4441, + "step": 9792, + "vm_loss": 0.1303 + }, + { + "epoch": 1.8850253868181053, + "lm_loss": 0.5663, + "step": 9792, + "vm_loss": 0.0972 + }, + { + "epoch": 1.8850253868181053, + "lm_loss": 1.1189, + "step": 9792, + "vm_loss": 0.1599 + }, + { + "epoch": 1.8850253868181053, + "lm_loss": 0.8396, + "step": 9792, + "vm_loss": 0.1673 + }, + { + "epoch": 1.8850253868181053, + "lm_loss": 0.7968, + "step": 9792, + "vm_loss": 0.1779 + }, + { + "epoch": 1.8850253868181053, + "lm_loss": 0.9524, + "step": 9792, + "vm_loss": 0.185 + }, + { + "epoch": 1.8852178934956805, + "grad_norm": 3.4493391682207535, + "learning_rate": 1.7158584517315667e-07, + "loss": 0.8506, + "step": 9793 + }, + { + "epoch": 1.8854104001732561, + "grad_norm": 3.271613339158465, + "learning_rate": 1.710112188460844e-07, + "loss": 0.8623, + "step": 9794 + }, + { + "epoch": 1.8856029068508313, + "grad_norm": 3.380199588598048, + "learning_rate": 1.7043754802247293e-07, + "loss": 0.8605, + "step": 9795 + }, + { + "epoch": 1.8857954135284067, + "grad_norm": 3.228334940001412, + "learning_rate": 1.6986483275808762e-07, + "loss": 0.8377, + "step": 9796 + }, + { + "epoch": 1.8859879202059822, + "grad_norm": 3.0653181909163085, + "learning_rate": 1.6929307310860398e-07, + "loss": 0.7453, + "step": 9797 + }, + { + "epoch": 1.8861804268835574, + "grad_norm": 3.065149088907136, + "learning_rate": 1.6872226912960532e-07, + "loss": 0.7934, + "step": 9798 + }, + { + "epoch": 1.886372933561133, + "grad_norm": 3.322195596465199, + "learning_rate": 1.681524208765817e-07, + "loss": 0.8122, + "step": 9799 + }, + { + "epoch": 1.8865654402387082, + "grad_norm": 3.2393131813235705, + "learning_rate": 1.6758352840492874e-07, + "loss": 0.8148, + "step": 9800 + }, + { + "epoch": 1.8865654402387082, + "lm_loss": 0.7451, + "step": 9800, + "vm_loss": 0.0873 + }, + { + "epoch": 1.8865654402387082, + "lm_loss": 0.6702, + "step": 9800, + "vm_loss": 0.1843 + }, + { + "epoch": 1.8865654402387082, + "lm_loss": 1.2447, + "step": 9800, + "vm_loss": 0.1972 + }, + { + "epoch": 1.8865654402387082, + "lm_loss": 0.8943, + "step": 9800, + "vm_loss": 0.159 + }, + { + "epoch": 1.8865654402387082, + "lm_loss": 1.105, + "step": 9800, + "vm_loss": 0.1512 + }, + { + "epoch": 1.8865654402387082, + "lm_loss": 0.4952, + "step": 9800, + "vm_loss": 0.1787 + }, + { + "epoch": 1.8865654402387082, + "lm_loss": 0.4821, + "step": 9800, + "vm_loss": 0.1832 + }, + { + "epoch": 1.8865654402387082, + "lm_loss": 0.6276, + "step": 9800, + "vm_loss": 0.1895 + }, + { + "epoch": 1.8867579469162836, + "grad_norm": 3.2310193654960386, + "learning_rate": 1.6701559176994897e-07, + "loss": 0.8394, + "step": 9801 + }, + { + "epoch": 1.886950453593859, + "grad_norm": 3.079022742477775, + "learning_rate": 1.6644861102685595e-07, + "loss": 0.7981, + "step": 9802 + }, + { + "epoch": 1.8871429602714345, + "grad_norm": 3.088157744894846, + "learning_rate": 1.6588258623076448e-07, + "loss": 0.7396, + "step": 9803 + }, + { + "epoch": 1.8873354669490099, + "grad_norm": 3.2448328673875704, + "learning_rate": 1.6531751743670053e-07, + "loss": 0.837, + "step": 9804 + }, + { + "epoch": 1.887527973626585, + "grad_norm": 3.445973116057066, + "learning_rate": 1.6475340469959689e-07, + "loss": 0.8919, + "step": 9805 + }, + { + "epoch": 1.8877204803041605, + "grad_norm": 3.2088208986589937, + "learning_rate": 1.6419024807429296e-07, + "loss": 0.8406, + "step": 9806 + }, + { + "epoch": 1.887912986981736, + "grad_norm": 3.3249714268679655, + "learning_rate": 1.6362804761553164e-07, + "loss": 0.8394, + "step": 9807 + }, + { + "epoch": 1.8881054936593114, + "grad_norm": 3.1725927716841937, + "learning_rate": 1.630668033779692e-07, + "loss": 0.8087, + "step": 9808 + }, + { + "epoch": 1.8881054936593114, + "lm_loss": 0.5117, + "step": 9808, + "vm_loss": 0.1073 + }, + { + "epoch": 1.8881054936593114, + "lm_loss": 0.7332, + "step": 9808, + "vm_loss": 0.147 + }, + { + "epoch": 1.8881054936593114, + "lm_loss": 0.8572, + "step": 9808, + "vm_loss": 0.1405 + }, + { + "epoch": 1.8881054936593114, + "lm_loss": 0.6686, + "step": 9808, + "vm_loss": 0.261 + }, + { + "epoch": 1.8881054936593114, + "lm_loss": 0.7398, + "step": 9808, + "vm_loss": 0.1604 + }, + { + "epoch": 1.8881054936593114, + "lm_loss": 0.4486, + "step": 9808, + "vm_loss": 0.1692 + }, + { + "epoch": 1.8881054936593114, + "lm_loss": 0.5053, + "step": 9808, + "vm_loss": 0.15 + }, + { + "epoch": 1.8881054936593114, + "lm_loss": 0.6069, + "step": 9808, + "vm_loss": 0.1116 + }, + { + "epoch": 1.8882980003368868, + "grad_norm": 3.4642503341438844, + "learning_rate": 1.6250651541616426e-07, + "loss": 0.8963, + "step": 9809 + }, + { + "epoch": 1.888490507014462, + "grad_norm": 3.273486333381402, + "learning_rate": 1.619471837845854e-07, + "loss": 0.8556, + "step": 9810 + }, + { + "epoch": 1.8886830136920374, + "grad_norm": 3.2914875975985436, + "learning_rate": 1.6138880853760364e-07, + "loss": 0.8024, + "step": 9811 + }, + { + "epoch": 1.8888755203696128, + "grad_norm": 3.3718672402173944, + "learning_rate": 1.608313897295033e-07, + "loss": 0.8387, + "step": 9812 + }, + { + "epoch": 1.8890680270471882, + "grad_norm": 3.3052960490828283, + "learning_rate": 1.6027492741447214e-07, + "loss": 0.8813, + "step": 9813 + }, + { + "epoch": 1.8892605337247637, + "grad_norm": 3.26964543524852, + "learning_rate": 1.5971942164660359e-07, + "loss": 0.8378, + "step": 9814 + }, + { + "epoch": 1.8894530404023389, + "grad_norm": 3.181049582313773, + "learning_rate": 1.591648724799022e-07, + "loss": 0.8058, + "step": 9815 + }, + { + "epoch": 1.8896455470799145, + "grad_norm": 3.2651242285148045, + "learning_rate": 1.5861127996827597e-07, + "loss": 0.8509, + "step": 9816 + }, + { + "epoch": 1.8896455470799145, + "lm_loss": 0.4566, + "step": 9816, + "vm_loss": 0.1377 + }, + { + "epoch": 1.8896455470799145, + "lm_loss": 0.9221, + "step": 9816, + "vm_loss": 0.1208 + }, + { + "epoch": 1.8896455470799145, + "lm_loss": 1.0489, + "step": 9816, + "vm_loss": 0.1557 + }, + { + "epoch": 1.8896455470799145, + "lm_loss": 0.757, + "step": 9816, + "vm_loss": 0.155 + }, + { + "epoch": 1.8896455470799145, + "lm_loss": 0.4552, + "step": 9816, + "vm_loss": 0.1598 + }, + { + "epoch": 1.8896455470799145, + "lm_loss": 0.8636, + "step": 9816, + "vm_loss": 0.135 + }, + { + "epoch": 1.8896455470799145, + "lm_loss": 0.3805, + "step": 9816, + "vm_loss": 0.1142 + }, + { + "epoch": 1.8896455470799145, + "lm_loss": 0.5552, + "step": 9816, + "vm_loss": 0.1545 + }, + { + "epoch": 1.8898380537574897, + "grad_norm": 3.1680551751191195, + "learning_rate": 1.5805864416554185e-07, + "loss": 0.7921, + "step": 9817 + }, + { + "epoch": 1.8900305604350651, + "grad_norm": 3.2000399031928617, + "learning_rate": 1.575069651254224e-07, + "loss": 0.7975, + "step": 9818 + }, + { + "epoch": 1.8902230671126405, + "grad_norm": 3.1308927697117523, + "learning_rate": 1.569562429015481e-07, + "loss": 0.7988, + "step": 9819 + }, + { + "epoch": 1.8904155737902157, + "grad_norm": 3.166668783976193, + "learning_rate": 1.5640647754745607e-07, + "loss": 0.8149, + "step": 9820 + }, + { + "epoch": 1.8906080804677914, + "grad_norm": 3.364591069347157, + "learning_rate": 1.558576691165914e-07, + "loss": 0.8624, + "step": 9821 + }, + { + "epoch": 1.8908005871453666, + "grad_norm": 3.3995215959921588, + "learning_rate": 1.5530981766230356e-07, + "loss": 0.8447, + "step": 9822 + }, + { + "epoch": 1.890993093822942, + "grad_norm": 3.2098287751045365, + "learning_rate": 1.5476292323785337e-07, + "loss": 0.8205, + "step": 9823 + }, + { + "epoch": 1.8911856005005174, + "grad_norm": 3.112697861216151, + "learning_rate": 1.5421698589640267e-07, + "loss": 0.7992, + "step": 9824 + }, + { + "epoch": 1.8911856005005174, + "lm_loss": 0.413, + "step": 9824, + "vm_loss": 0.1581 + }, + { + "epoch": 1.8911856005005174, + "lm_loss": 0.6287, + "step": 9824, + "vm_loss": 0.1685 + }, + { + "epoch": 1.8911856005005174, + "lm_loss": 0.754, + "step": 9824, + "vm_loss": 0.1359 + }, + { + "epoch": 1.8911856005005174, + "lm_loss": 0.269, + "step": 9824, + "vm_loss": 0.138 + }, + { + "epoch": 1.8911856005005174, + "lm_loss": 0.8331, + "step": 9824, + "vm_loss": 0.1316 + }, + { + "epoch": 1.8911856005005174, + "lm_loss": 0.5199, + "step": 9824, + "vm_loss": 0.0919 + }, + { + "epoch": 1.8911856005005174, + "lm_loss": 0.5232, + "step": 9824, + "vm_loss": 0.1303 + }, + { + "epoch": 1.8911856005005174, + "lm_loss": 0.5469, + "step": 9824, + "vm_loss": 0.1868 + }, + { + "epoch": 1.8913781071780926, + "grad_norm": 3.2805933143061763, + "learning_rate": 1.536720056910268e-07, + "loss": 0.815, + "step": 9825 + }, + { + "epoch": 1.8915706138556683, + "grad_norm": 3.2215172728935597, + "learning_rate": 1.531279826747023e-07, + "loss": 0.8576, + "step": 9826 + }, + { + "epoch": 1.8917631205332435, + "grad_norm": 3.0919364531973725, + "learning_rate": 1.5258491690031796e-07, + "loss": 0.7929, + "step": 9827 + }, + { + "epoch": 1.891955627210819, + "grad_norm": 3.3238891873050016, + "learning_rate": 1.5204280842066266e-07, + "loss": 0.8445, + "step": 9828 + }, + { + "epoch": 1.8921481338883943, + "grad_norm": 3.553661107670562, + "learning_rate": 1.515016572884398e-07, + "loss": 0.9092, + "step": 9829 + }, + { + "epoch": 1.8923406405659695, + "grad_norm": 3.1649727484272643, + "learning_rate": 1.5096146355625396e-07, + "loss": 0.8489, + "step": 9830 + }, + { + "epoch": 1.8925331472435452, + "grad_norm": 3.4501169700865266, + "learning_rate": 1.5042222727662092e-07, + "loss": 0.8249, + "step": 9831 + }, + { + "epoch": 1.8927256539211204, + "grad_norm": 3.0495338169042023, + "learning_rate": 1.4988394850195876e-07, + "loss": 0.7709, + "step": 9832 + }, + { + "epoch": 1.8927256539211204, + "lm_loss": 0.5521, + "step": 9832, + "vm_loss": 0.1605 + }, + { + "epoch": 1.8927256539211204, + "lm_loss": 0.6678, + "step": 9832, + "vm_loss": 0.151 + }, + { + "epoch": 1.8927256539211204, + "lm_loss": 0.5387, + "step": 9832, + "vm_loss": 0.1515 + }, + { + "epoch": 1.8927256539211204, + "lm_loss": 0.8205, + "step": 9832, + "vm_loss": 0.1752 + }, + { + "epoch": 1.8927256539211204, + "lm_loss": 0.8843, + "step": 9832, + "vm_loss": 0.1452 + }, + { + "epoch": 1.8927256539211204, + "lm_loss": 0.5839, + "step": 9832, + "vm_loss": 0.1708 + }, + { + "epoch": 1.8927256539211204, + "lm_loss": 0.9296, + "step": 9832, + "vm_loss": 0.1849 + }, + { + "epoch": 1.8927256539211204, + "lm_loss": 0.8936, + "step": 9832, + "vm_loss": 0.1942 + }, + { + "epoch": 1.8929181605986958, + "grad_norm": 3.3740895263904562, + "learning_rate": 1.4934662728459893e-07, + "loss": 0.8342, + "step": 9833 + }, + { + "epoch": 1.8931106672762712, + "grad_norm": 3.3515787827393297, + "learning_rate": 1.4881026367677188e-07, + "loss": 0.9085, + "step": 9834 + }, + { + "epoch": 1.8933031739538464, + "grad_norm": 3.2759730607561797, + "learning_rate": 1.4827485773062146e-07, + "loss": 0.847, + "step": 9835 + }, + { + "epoch": 1.893495680631422, + "grad_norm": 3.2303666056170166, + "learning_rate": 1.4774040949819378e-07, + "loss": 0.7955, + "step": 9836 + }, + { + "epoch": 1.8936881873089972, + "grad_norm": 3.3719887987707042, + "learning_rate": 1.4720691903144734e-07, + "loss": 0.8362, + "step": 9837 + }, + { + "epoch": 1.8938806939865727, + "grad_norm": 3.115100220963446, + "learning_rate": 1.4667438638224064e-07, + "loss": 0.8204, + "step": 9838 + }, + { + "epoch": 1.894073200664148, + "grad_norm": 3.232999195490805, + "learning_rate": 1.4614281160234555e-07, + "loss": 0.8367, + "step": 9839 + }, + { + "epoch": 1.8942657073417233, + "grad_norm": 3.106815473788987, + "learning_rate": 1.4561219474343747e-07, + "loss": 0.7957, + "step": 9840 + }, + { + "epoch": 1.8942657073417233, + "lm_loss": 0.546, + "step": 9840, + "vm_loss": 0.1656 + }, + { + "epoch": 1.8942657073417233, + "lm_loss": 0.5616, + "step": 9840, + "vm_loss": 0.1984 + }, + { + "epoch": 1.8942657073417233, + "lm_loss": 0.7282, + "step": 9840, + "vm_loss": 0.1651 + }, + { + "epoch": 1.8942657073417233, + "lm_loss": 0.8378, + "step": 9840, + "vm_loss": 0.1423 + }, + { + "epoch": 1.8942657073417233, + "lm_loss": 1.0955, + "step": 9840, + "vm_loss": 0.1485 + }, + { + "epoch": 1.8942657073417233, + "lm_loss": 1.0232, + "step": 9840, + "vm_loss": 0.1505 + }, + { + "epoch": 1.8942657073417233, + "lm_loss": 0.4757, + "step": 9840, + "vm_loss": 0.1655 + }, + { + "epoch": 1.8942657073417233, + "lm_loss": 0.5586, + "step": 9840, + "vm_loss": 0.1867 + }, + { + "epoch": 1.894458214019299, + "grad_norm": 3.3231997355642933, + "learning_rate": 1.450825358570973e-07, + "loss": 0.8336, + "step": 9841 + }, + { + "epoch": 1.8946507206968741, + "grad_norm": 3.19496190043342, + "learning_rate": 1.4455383499481613e-07, + "loss": 0.8259, + "step": 9842 + }, + { + "epoch": 1.8948432273744495, + "grad_norm": 3.123342149495041, + "learning_rate": 1.4402609220799057e-07, + "loss": 0.8377, + "step": 9843 + }, + { + "epoch": 1.895035734052025, + "grad_norm": 3.2773964625620744, + "learning_rate": 1.4349930754792408e-07, + "loss": 0.8202, + "step": 9844 + }, + { + "epoch": 1.8952282407296002, + "grad_norm": 3.268955680949422, + "learning_rate": 1.4297348106582454e-07, + "loss": 0.8392, + "step": 9845 + }, + { + "epoch": 1.8954207474071758, + "grad_norm": 3.2694236180845584, + "learning_rate": 1.4244861281281218e-07, + "loss": 0.8205, + "step": 9846 + }, + { + "epoch": 1.895613254084751, + "grad_norm": 3.111651000924856, + "learning_rate": 1.4192470283990956e-07, + "loss": 0.8092, + "step": 9847 + }, + { + "epoch": 1.8958057607623264, + "grad_norm": 3.24948017852796, + "learning_rate": 1.41401751198047e-07, + "loss": 0.8289, + "step": 9848 + }, + { + "epoch": 1.8958057607623264, + "lm_loss": 0.5715, + "step": 9848, + "vm_loss": 0.2043 + }, + { + "epoch": 1.8958057607623264, + "lm_loss": 0.7065, + "step": 9848, + "vm_loss": 0.1414 + }, + { + "epoch": 1.8958057607623264, + "lm_loss": 0.3542, + "step": 9848, + "vm_loss": 0.1825 + }, + { + "epoch": 1.8958057607623264, + "lm_loss": 0.7414, + "step": 9848, + "vm_loss": 0.173 + }, + { + "epoch": 1.8958057607623264, + "lm_loss": 0.3797, + "step": 9848, + "vm_loss": 0.1307 + }, + { + "epoch": 1.8958057607623264, + "lm_loss": 0.4394, + "step": 9848, + "vm_loss": 0.1501 + }, + { + "epoch": 1.8958057607623264, + "lm_loss": 0.5934, + "step": 9848, + "vm_loss": 0.164 + }, + { + "epoch": 1.8958057607623264, + "lm_loss": 0.7687, + "step": 9848, + "vm_loss": 0.1566 + }, + { + "epoch": 1.8959982674399019, + "grad_norm": 3.0111185322965954, + "learning_rate": 1.4087975793806275e-07, + "loss": 0.7772, + "step": 9849 + }, + { + "epoch": 1.896190774117477, + "grad_norm": 3.35860871610262, + "learning_rate": 1.4035872311070175e-07, + "loss": 0.8679, + "step": 9850 + }, + { + "epoch": 1.8963832807950527, + "grad_norm": 3.324715125580184, + "learning_rate": 1.3983864676661352e-07, + "loss": 0.8518, + "step": 9851 + }, + { + "epoch": 1.896575787472628, + "grad_norm": 3.4247409431170412, + "learning_rate": 1.3931952895635649e-07, + "loss": 0.8679, + "step": 9852 + }, + { + "epoch": 1.8967682941502033, + "grad_norm": 3.2369895750285305, + "learning_rate": 1.3880136973039692e-07, + "loss": 0.8372, + "step": 9853 + }, + { + "epoch": 1.8969608008277787, + "grad_norm": 3.447736996587697, + "learning_rate": 1.3828416913910682e-07, + "loss": 0.8871, + "step": 9854 + }, + { + "epoch": 1.897153307505354, + "grad_norm": 3.0715163460263057, + "learning_rate": 1.3776792723276144e-07, + "loss": 0.8017, + "step": 9855 + }, + { + "epoch": 1.8973458141829296, + "grad_norm": 3.3755369082685713, + "learning_rate": 1.372526440615507e-07, + "loss": 0.8642, + "step": 9856 + }, + { + "epoch": 1.8973458141829296, + "lm_loss": 0.6191, + "step": 9856, + "vm_loss": 0.1634 + }, + { + "epoch": 1.8973458141829296, + "lm_loss": 0.6152, + "step": 9856, + "vm_loss": 0.2029 + }, + { + "epoch": 1.8973458141829296, + "lm_loss": 0.7694, + "step": 9856, + "vm_loss": 0.1322 + }, + { + "epoch": 1.8973458141829296, + "lm_loss": 0.4861, + "step": 9856, + "vm_loss": 0.1347 + }, + { + "epoch": 1.8973458141829296, + "lm_loss": 0.6503, + "step": 9856, + "vm_loss": 0.1445 + }, + { + "epoch": 1.8973458141829296, + "lm_loss": 0.52, + "step": 9856, + "vm_loss": 0.2035 + }, + { + "epoch": 1.8973458141829296, + "lm_loss": 0.6167, + "step": 9856, + "vm_loss": 0.1679 + }, + { + "epoch": 1.8973458141829296, + "lm_loss": 0.5787, + "step": 9856, + "vm_loss": 0.1091 + }, + { + "epoch": 1.8975383208605048, + "grad_norm": 3.338969650245965, + "learning_rate": 1.3673831967556228e-07, + "loss": 0.8449, + "step": 9857 + }, + { + "epoch": 1.8977308275380802, + "grad_norm": 3.202734200480825, + "learning_rate": 1.362249541247984e-07, + "loss": 0.8035, + "step": 9858 + }, + { + "epoch": 1.8979233342156556, + "grad_norm": 3.3410825440864502, + "learning_rate": 1.3571254745916363e-07, + "loss": 0.8668, + "step": 9859 + }, + { + "epoch": 1.8981158408932308, + "grad_norm": 3.2925573251381555, + "learning_rate": 1.3520109972846918e-07, + "loss": 0.8554, + "step": 9860 + }, + { + "epoch": 1.8983083475708065, + "grad_norm": 3.350636137787506, + "learning_rate": 1.3469061098243642e-07, + "loss": 0.8387, + "step": 9861 + }, + { + "epoch": 1.8985008542483817, + "grad_norm": 3.189333328035987, + "learning_rate": 1.3418108127069008e-07, + "loss": 0.8125, + "step": 9862 + }, + { + "epoch": 1.898693360925957, + "grad_norm": 3.25690960268992, + "learning_rate": 1.3367251064276386e-07, + "loss": 0.8412, + "step": 9863 + }, + { + "epoch": 1.8988858676035325, + "grad_norm": 3.1914562728521374, + "learning_rate": 1.3316489914809605e-07, + "loss": 0.7999, + "step": 9864 + }, + { + "epoch": 1.8988858676035325, + "lm_loss": 0.6434, + "step": 9864, + "vm_loss": 0.2341 + }, + { + "epoch": 1.8988858676035325, + "lm_loss": 0.8401, + "step": 9864, + "vm_loss": 0.1531 + }, + { + "epoch": 1.8988858676035325, + "lm_loss": 0.5685, + "step": 9864, + "vm_loss": 0.2014 + }, + { + "epoch": 1.8988858676035325, + "lm_loss": 0.4253, + "step": 9864, + "vm_loss": 0.1652 + }, + { + "epoch": 1.8988858676035325, + "lm_loss": 0.8736, + "step": 9864, + "vm_loss": 0.1649 + }, + { + "epoch": 1.8988858676035325, + "lm_loss": 0.8423, + "step": 9864, + "vm_loss": 0.1501 + }, + { + "epoch": 1.8988858676035325, + "lm_loss": 0.6308, + "step": 9864, + "vm_loss": 0.1939 + }, + { + "epoch": 1.8988858676035325, + "lm_loss": 0.6927, + "step": 9864, + "vm_loss": 0.1426 + }, + { + "epoch": 1.899078374281108, + "grad_norm": 3.37438863711428, + "learning_rate": 1.326582468360338e-07, + "loss": 0.8834, + "step": 9865 + }, + { + "epoch": 1.8992708809586833, + "grad_norm": 3.318364109074891, + "learning_rate": 1.3215255375582992e-07, + "loss": 0.8713, + "step": 9866 + }, + { + "epoch": 1.8994633876362585, + "grad_norm": 3.3323239664555646, + "learning_rate": 1.316478199566462e-07, + "loss": 0.8197, + "step": 9867 + }, + { + "epoch": 1.899655894313834, + "grad_norm": 3.2079348436948325, + "learning_rate": 1.3114404548754566e-07, + "loss": 0.7973, + "step": 9868 + }, + { + "epoch": 1.8998484009914094, + "grad_norm": 3.143200770719279, + "learning_rate": 1.3064123039750353e-07, + "loss": 0.819, + "step": 9869 + }, + { + "epoch": 1.9000409076689848, + "grad_norm": 3.328543245428068, + "learning_rate": 1.3013937473540073e-07, + "loss": 0.8418, + "step": 9870 + }, + { + "epoch": 1.9002334143465602, + "grad_norm": 3.2624666904693584, + "learning_rate": 1.2963847855002264e-07, + "loss": 0.854, + "step": 9871 + }, + { + "epoch": 1.9004259210241354, + "grad_norm": 3.281379078986719, + "learning_rate": 1.291385418900626e-07, + "loss": 0.8596, + "step": 9872 + }, + { + "epoch": 1.9004259210241354, + "lm_loss": 0.7164, + "step": 9872, + "vm_loss": 0.1558 + }, + { + "epoch": 1.9004259210241354, + "lm_loss": 0.999, + "step": 9872, + "vm_loss": 0.2213 + }, + { + "epoch": 1.9004259210241354, + "lm_loss": 0.6343, + "step": 9872, + "vm_loss": 0.2277 + }, + { + "epoch": 1.9004259210241354, + "lm_loss": 0.3848, + "step": 9872, + "vm_loss": 0.192 + }, + { + "epoch": 1.9004259210241354, + "lm_loss": 0.7855, + "step": 9872, + "vm_loss": 0.1141 + }, + { + "epoch": 1.9004259210241354, + "lm_loss": 0.515, + "step": 9872, + "vm_loss": 0.2058 + }, + { + "epoch": 1.9004259210241354, + "lm_loss": 0.5664, + "step": 9872, + "vm_loss": 0.1808 + }, + { + "epoch": 1.9004259210241354, + "lm_loss": 0.6668, + "step": 9872, + "vm_loss": 0.1505 + }, + { + "epoch": 1.9006184277017109, + "grad_norm": 3.2115571278196287, + "learning_rate": 1.2863956480412276e-07, + "loss": 0.8341, + "step": 9873 + }, + { + "epoch": 1.9008109343792863, + "grad_norm": 3.2056695215962416, + "learning_rate": 1.281415473407077e-07, + "loss": 0.8111, + "step": 9874 + }, + { + "epoch": 1.9010034410568617, + "grad_norm": 3.127953003319157, + "learning_rate": 1.2764448954823204e-07, + "loss": 0.785, + "step": 9875 + }, + { + "epoch": 1.9011959477344371, + "grad_norm": 3.1517299927787725, + "learning_rate": 1.2714839147501712e-07, + "loss": 0.8064, + "step": 9876 + }, + { + "epoch": 1.9013884544120123, + "grad_norm": 3.1911998113480315, + "learning_rate": 1.2665325316928878e-07, + "loss": 0.8385, + "step": 9877 + }, + { + "epoch": 1.9015809610895877, + "grad_norm": 3.3862107240064536, + "learning_rate": 1.2615907467918187e-07, + "loss": 0.9024, + "step": 9878 + }, + { + "epoch": 1.9017734677671632, + "grad_norm": 3.361385488648857, + "learning_rate": 1.2566585605273462e-07, + "loss": 0.8677, + "step": 9879 + }, + { + "epoch": 1.9019659744447386, + "grad_norm": 3.0676184066174286, + "learning_rate": 1.251735973378976e-07, + "loss": 0.7921, + "step": 9880 + }, + { + "epoch": 1.9019659744447386, + "lm_loss": 0.7984, + "step": 9880, + "vm_loss": 0.2474 + }, + { + "epoch": 1.9019659744447386, + "lm_loss": 0.8176, + "step": 9880, + "vm_loss": 0.1886 + }, + { + "epoch": 1.9019659744447386, + "lm_loss": 0.6584, + "step": 9880, + "vm_loss": 0.1411 + }, + { + "epoch": 1.9019659744447386, + "lm_loss": 0.4045, + "step": 9880, + "vm_loss": 0.1529 + }, + { + "epoch": 1.9019659744447386, + "lm_loss": 0.6691, + "step": 9880, + "vm_loss": 0.1379 + }, + { + "epoch": 1.9019659744447386, + "lm_loss": 0.6673, + "step": 9880, + "vm_loss": 0.1636 + }, + { + "epoch": 1.9019659744447386, + "lm_loss": 1.2163, + "step": 9880, + "vm_loss": 0.1181 + }, + { + "epoch": 1.9019659744447386, + "lm_loss": 0.9016, + "step": 9880, + "vm_loss": 0.1778 + }, + { + "epoch": 1.902158481122314, + "grad_norm": 3.5652974434076627, + "learning_rate": 1.2468229858252134e-07, + "loss": 0.9357, + "step": 9881 + }, + { + "epoch": 1.9023509877998892, + "grad_norm": 3.342158192730118, + "learning_rate": 1.2419195983436881e-07, + "loss": 0.8361, + "step": 9882 + }, + { + "epoch": 1.9025434944774648, + "grad_norm": 3.2976363601411336, + "learning_rate": 1.2370258114110523e-07, + "loss": 0.8614, + "step": 9883 + }, + { + "epoch": 1.90273600115504, + "grad_norm": 3.323113895401307, + "learning_rate": 1.2321416255030694e-07, + "loss": 0.8343, + "step": 9884 + }, + { + "epoch": 1.9029285078326155, + "grad_norm": 3.116439863955894, + "learning_rate": 1.2272670410945264e-07, + "loss": 0.7743, + "step": 9885 + }, + { + "epoch": 1.903121014510191, + "grad_norm": 3.2476080037993564, + "learning_rate": 1.2224020586593e-07, + "loss": 0.8236, + "step": 9886 + }, + { + "epoch": 1.903313521187766, + "grad_norm": 3.246154662985664, + "learning_rate": 1.2175466786703227e-07, + "loss": 0.81, + "step": 9887 + }, + { + "epoch": 1.9035060278653417, + "grad_norm": 3.2639114029355936, + "learning_rate": 1.2127009015996062e-07, + "loss": 0.8546, + "step": 9888 + }, + { + "epoch": 1.9035060278653417, + "lm_loss": 0.7507, + "step": 9888, + "vm_loss": 0.1679 + }, + { + "epoch": 1.9035060278653417, + "lm_loss": 0.7853, + "step": 9888, + "vm_loss": 0.1088 + }, + { + "epoch": 1.9035060278653417, + "lm_loss": 1.018, + "step": 9888, + "vm_loss": 0.1587 + }, + { + "epoch": 1.9035060278653417, + "lm_loss": 0.7663, + "step": 9888, + "vm_loss": 0.1975 + }, + { + "epoch": 1.9035060278653417, + "lm_loss": 0.6489, + "step": 9888, + "vm_loss": 0.1668 + }, + { + "epoch": 1.9035060278653417, + "lm_loss": 0.5422, + "step": 9888, + "vm_loss": 0.1493 + }, + { + "epoch": 1.9035060278653417, + "lm_loss": 0.7314, + "step": 9888, + "vm_loss": 0.1537 + }, + { + "epoch": 1.9035060278653417, + "lm_loss": 0.6153, + "step": 9888, + "vm_loss": 0.1825 + }, + { + "epoch": 1.903698534542917, + "grad_norm": 3.230043045515284, + "learning_rate": 1.2078647279182175e-07, + "loss": 0.8373, + "step": 9889 + }, + { + "epoch": 1.9038910412204924, + "grad_norm": 3.3429548658797508, + "learning_rate": 1.2030381580963036e-07, + "loss": 0.8423, + "step": 9890 + }, + { + "epoch": 1.9040835478980678, + "grad_norm": 3.3006630207801986, + "learning_rate": 1.1982211926030551e-07, + "loss": 0.8326, + "step": 9891 + }, + { + "epoch": 1.904276054575643, + "grad_norm": 3.261601858771046, + "learning_rate": 1.1934138319067646e-07, + "loss": 0.8742, + "step": 9892 + }, + { + "epoch": 1.9044685612532186, + "grad_norm": 3.193077274064226, + "learning_rate": 1.1886160764747356e-07, + "loss": 0.8441, + "step": 9893 + }, + { + "epoch": 1.9046610679307938, + "grad_norm": 3.226297739948393, + "learning_rate": 1.1838279267734176e-07, + "loss": 0.8236, + "step": 9894 + }, + { + "epoch": 1.9048535746083692, + "grad_norm": 2.893082136261642, + "learning_rate": 1.1790493832682271e-07, + "loss": 0.7491, + "step": 9895 + }, + { + "epoch": 1.9050460812859447, + "grad_norm": 3.300736608377902, + "learning_rate": 1.1742804464237368e-07, + "loss": 0.834, + "step": 9896 + }, + { + "epoch": 1.9050460812859447, + "lm_loss": 0.8853, + "step": 9896, + "vm_loss": 0.2287 + }, + { + "epoch": 1.9050460812859447, + "lm_loss": 0.642, + "step": 9896, + "vm_loss": 0.1259 + }, + { + "epoch": 1.9050460812859447, + "lm_loss": 0.7415, + "step": 9896, + "vm_loss": 0.1772 + }, + { + "epoch": 1.9050460812859447, + "lm_loss": 0.6543, + "step": 9896, + "vm_loss": 0.1222 + }, + { + "epoch": 1.9050460812859447, + "lm_loss": 0.4442, + "step": 9896, + "vm_loss": 0.1684 + }, + { + "epoch": 1.9050460812859447, + "lm_loss": 0.6913, + "step": 9896, + "vm_loss": 0.1229 + }, + { + "epoch": 1.9050460812859447, + "lm_loss": 0.4455, + "step": 9896, + "vm_loss": 0.2599 + }, + { + "epoch": 1.9050460812859447, + "lm_loss": 0.8575, + "step": 9896, + "vm_loss": 0.1421 + }, + { + "epoch": 1.9052385879635199, + "grad_norm": 3.274786349863418, + "learning_rate": 1.1695211167035314e-07, + "loss": 0.8625, + "step": 9897 + }, + { + "epoch": 1.9054310946410955, + "grad_norm": 3.2923203019976057, + "learning_rate": 1.1647713945702966e-07, + "loss": 0.8321, + "step": 9898 + }, + { + "epoch": 1.9056236013186707, + "grad_norm": 3.034853064152262, + "learning_rate": 1.1600312804857405e-07, + "loss": 0.8006, + "step": 9899 + }, + { + "epoch": 1.9058161079962461, + "grad_norm": 3.3021380821923656, + "learning_rate": 1.1553007749106837e-07, + "loss": 0.8258, + "step": 9900 + }, + { + "epoch": 1.9060086146738215, + "grad_norm": 3.1801359586345725, + "learning_rate": 1.1505798783049916e-07, + "loss": 0.8271, + "step": 9901 + }, + { + "epoch": 1.9062011213513967, + "grad_norm": 3.239456039272056, + "learning_rate": 1.1458685911275746e-07, + "loss": 0.8313, + "step": 9902 + }, + { + "epoch": 1.9063936280289724, + "grad_norm": 3.191016302967127, + "learning_rate": 1.1411669138364446e-07, + "loss": 0.8207, + "step": 9903 + }, + { + "epoch": 1.9065861347065476, + "grad_norm": 3.2500715005467096, + "learning_rate": 1.1364748468886688e-07, + "loss": 0.8536, + "step": 9904 + }, + { + "epoch": 1.9065861347065476, + "lm_loss": 0.7729, + "step": 9904, + "vm_loss": 0.1633 + }, + { + "epoch": 1.9065861347065476, + "lm_loss": 0.6795, + "step": 9904, + "vm_loss": 0.1888 + }, + { + "epoch": 1.9065861347065476, + "lm_loss": 0.6828, + "step": 9904, + "vm_loss": 0.1254 + }, + { + "epoch": 1.9065861347065476, + "lm_loss": 0.7674, + "step": 9904, + "vm_loss": 0.1731 + }, + { + "epoch": 1.9065861347065476, + "lm_loss": 0.752, + "step": 9904, + "vm_loss": 0.1725 + }, + { + "epoch": 1.9065861347065476, + "lm_loss": 0.4922, + "step": 9904, + "vm_loss": 0.1767 + }, + { + "epoch": 1.9065861347065476, + "lm_loss": 1.1116, + "step": 9904, + "vm_loss": 0.3085 + }, + { + "epoch": 1.9065861347065476, + "lm_loss": 0.411, + "step": 9904, + "vm_loss": 0.1588 + }, + { + "epoch": 1.906778641384123, + "grad_norm": 3.2610502351835127, + "learning_rate": 1.1317923907403827e-07, + "loss": 0.8658, + "step": 9905 + }, + { + "epoch": 1.9069711480616984, + "grad_norm": 3.2421997131543465, + "learning_rate": 1.1271195458467555e-07, + "loss": 0.8505, + "step": 9906 + }, + { + "epoch": 1.9071636547392736, + "grad_norm": 3.240259241230784, + "learning_rate": 1.122456312662068e-07, + "loss": 0.8247, + "step": 9907 + }, + { + "epoch": 1.9073561614168493, + "grad_norm": 3.2605844039607934, + "learning_rate": 1.1178026916396356e-07, + "loss": 0.8693, + "step": 9908 + }, + { + "epoch": 1.9075486680944245, + "grad_norm": 3.0947694256588076, + "learning_rate": 1.1131586832318631e-07, + "loss": 0.7981, + "step": 9909 + }, + { + "epoch": 1.907741174772, + "grad_norm": 3.3331988639281, + "learning_rate": 1.1085242878901892e-07, + "loss": 0.8602, + "step": 9910 + }, + { + "epoch": 1.9079336814495753, + "grad_norm": 3.2750565109747836, + "learning_rate": 1.1038995060651536e-07, + "loss": 0.832, + "step": 9911 + }, + { + "epoch": 1.9081261881271505, + "grad_norm": 3.2519543026683, + "learning_rate": 1.0992843382063301e-07, + "loss": 0.8251, + "step": 9912 + }, + { + "epoch": 1.9081261881271505, + "lm_loss": 0.9418, + "step": 9912, + "vm_loss": 0.2026 + }, + { + "epoch": 1.9081261881271505, + "lm_loss": 0.711, + "step": 9912, + "vm_loss": 0.1504 + }, + { + "epoch": 1.9081261881271505, + "lm_loss": 0.6705, + "step": 9912, + "vm_loss": 0.1449 + }, + { + "epoch": 1.9081261881271505, + "lm_loss": 0.4324, + "step": 9912, + "vm_loss": 0.1742 + }, + { + "epoch": 1.9081261881271505, + "lm_loss": 0.6006, + "step": 9912, + "vm_loss": 0.2166 + }, + { + "epoch": 1.9081261881271505, + "lm_loss": 0.5787, + "step": 9912, + "vm_loss": 0.1953 + }, + { + "epoch": 1.9081261881271505, + "lm_loss": 0.6499, + "step": 9912, + "vm_loss": 0.1298 + }, + { + "epoch": 1.9081261881271505, + "lm_loss": 0.3661, + "step": 9912, + "vm_loss": 0.1602 + }, + { + "epoch": 1.9083186948047262, + "grad_norm": 3.3275522055311857, + "learning_rate": 1.0946787847623819e-07, + "loss": 0.8957, + "step": 9913 + }, + { + "epoch": 1.9085112014823014, + "grad_norm": 3.181604524844051, + "learning_rate": 1.0900828461810176e-07, + "loss": 0.8435, + "step": 9914 + }, + { + "epoch": 1.9087037081598768, + "grad_norm": 3.2606772503624626, + "learning_rate": 1.0854965229090352e-07, + "loss": 0.8154, + "step": 9915 + }, + { + "epoch": 1.9088962148374522, + "grad_norm": 3.1688447263236825, + "learning_rate": 1.0809198153922673e-07, + "loss": 0.8417, + "step": 9916 + }, + { + "epoch": 1.9090887215150274, + "grad_norm": 3.1259354260841006, + "learning_rate": 1.0763527240756578e-07, + "loss": 0.8015, + "step": 9917 + }, + { + "epoch": 1.909281228192603, + "grad_norm": 3.262864519739653, + "learning_rate": 1.0717952494031514e-07, + "loss": 0.825, + "step": 9918 + }, + { + "epoch": 1.9094737348701782, + "grad_norm": 3.205057023786529, + "learning_rate": 1.067247391817805e-07, + "loss": 0.8381, + "step": 9919 + }, + { + "epoch": 1.9096662415477537, + "grad_norm": 3.4250298328493214, + "learning_rate": 1.0627091517617427e-07, + "loss": 0.867, + "step": 9920 + }, + { + "epoch": 1.9096662415477537, + "lm_loss": 0.2017, + "step": 9920, + "vm_loss": 0.1773 + }, + { + "epoch": 1.9096662415477537, + "lm_loss": 0.8163, + "step": 9920, + "vm_loss": 0.2009 + }, + { + "epoch": 1.9096662415477537, + "lm_loss": 0.4915, + "step": 9920, + "vm_loss": 0.1409 + }, + { + "epoch": 1.9096662415477537, + "lm_loss": 0.4239, + "step": 9920, + "vm_loss": 0.1622 + }, + { + "epoch": 1.9096662415477537, + "lm_loss": 0.7777, + "step": 9920, + "vm_loss": 0.1481 + }, + { + "epoch": 1.9096662415477537, + "lm_loss": 0.4986, + "step": 9920, + "vm_loss": 0.1265 + }, + { + "epoch": 1.9096662415477537, + "lm_loss": 0.4651, + "step": 9920, + "vm_loss": 0.1607 + }, + { + "epoch": 1.9096662415477537, + "lm_loss": 0.7211, + "step": 9920, + "vm_loss": 0.1058 + }, + { + "epoch": 1.909858748225329, + "grad_norm": 3.1613739876367526, + "learning_rate": 1.0581805296761227e-07, + "loss": 0.8356, + "step": 9921 + }, + { + "epoch": 1.9100512549029043, + "grad_norm": 3.3992340696033256, + "learning_rate": 1.0536615260011928e-07, + "loss": 0.8613, + "step": 9922 + }, + { + "epoch": 1.91024376158048, + "grad_norm": 3.444260482071373, + "learning_rate": 1.0491521411762572e-07, + "loss": 0.8776, + "step": 9923 + }, + { + "epoch": 1.9104362682580551, + "grad_norm": 3.336788292060354, + "learning_rate": 1.0446523756396764e-07, + "loss": 0.8368, + "step": 9924 + }, + { + "epoch": 1.9106287749356305, + "grad_norm": 3.3366431201524214, + "learning_rate": 1.0401622298289005e-07, + "loss": 0.869, + "step": 9925 + }, + { + "epoch": 1.910821281613206, + "grad_norm": 3.1982285187186585, + "learning_rate": 1.0356817041804246e-07, + "loss": 0.8512, + "step": 9926 + }, + { + "epoch": 1.9110137882907814, + "grad_norm": 3.2563313789034263, + "learning_rate": 1.0312107991298004e-07, + "loss": 0.8447, + "step": 9927 + }, + { + "epoch": 1.9112062949683568, + "grad_norm": 3.1270523977283617, + "learning_rate": 1.026749515111669e-07, + "loss": 0.7655, + "step": 9928 + }, + { + "epoch": 1.9112062949683568, + "lm_loss": 0.4665, + "step": 9928, + "vm_loss": 0.1631 + }, + { + "epoch": 1.9112062949683568, + "lm_loss": 0.6368, + "step": 9928, + "vm_loss": 0.2316 + }, + { + "epoch": 1.9112062949683568, + "lm_loss": 0.7044, + "step": 9928, + "vm_loss": 0.1677 + }, + { + "epoch": 1.9112062949683568, + "lm_loss": 0.5044, + "step": 9928, + "vm_loss": 0.1436 + }, + { + "epoch": 1.9112062949683568, + "lm_loss": 0.4933, + "step": 9928, + "vm_loss": 0.1197 + }, + { + "epoch": 1.9112062949683568, + "lm_loss": 0.6454, + "step": 9928, + "vm_loss": 0.1266 + }, + { + "epoch": 1.9112062949683568, + "lm_loss": 0.896, + "step": 9928, + "vm_loss": 0.1498 + }, + { + "epoch": 1.9112062949683568, + "lm_loss": 0.7782, + "step": 9928, + "vm_loss": 0.1269 + }, + { + "epoch": 1.911398801645932, + "grad_norm": 3.202314516521334, + "learning_rate": 1.0222978525597282e-07, + "loss": 0.7775, + "step": 9929 + }, + { + "epoch": 1.9115913083235074, + "grad_norm": 3.2311114879379663, + "learning_rate": 1.0178558119067316e-07, + "loss": 0.8112, + "step": 9930 + }, + { + "epoch": 1.9117838150010829, + "grad_norm": 3.2221828973454083, + "learning_rate": 1.0134233935844895e-07, + "loss": 0.8835, + "step": 9931 + }, + { + "epoch": 1.9119763216786583, + "grad_norm": 3.1304561268014393, + "learning_rate": 1.0090005980239236e-07, + "loss": 0.7846, + "step": 9932 + }, + { + "epoch": 1.9121688283562337, + "grad_norm": 3.155946356661212, + "learning_rate": 1.0045874256549348e-07, + "loss": 0.8246, + "step": 9933 + }, + { + "epoch": 1.912361335033809, + "grad_norm": 3.379190458325995, + "learning_rate": 1.0001838769065908e-07, + "loss": 0.8288, + "step": 9934 + }, + { + "epoch": 1.9125538417113843, + "grad_norm": 3.184016048369238, + "learning_rate": 9.957899522069491e-08, + "loss": 0.8187, + "step": 9935 + }, + { + "epoch": 1.9127463483889597, + "grad_norm": 3.3063694431187196, + "learning_rate": 9.914056519831572e-08, + "loss": 0.8487, + "step": 9936 + }, + { + "epoch": 1.9127463483889597, + "lm_loss": 0.433, + "step": 9936, + "vm_loss": 0.1627 + }, + { + "epoch": 1.9127463483889597, + "lm_loss": 0.3984, + "step": 9936, + "vm_loss": 0.0922 + }, + { + "epoch": 1.9127463483889597, + "lm_loss": 0.9213, + "step": 9936, + "vm_loss": 0.1655 + }, + { + "epoch": 1.9127463483889597, + "lm_loss": 0.5418, + "step": 9936, + "vm_loss": 0.1771 + }, + { + "epoch": 1.9127463483889597, + "lm_loss": 0.8912, + "step": 9936, + "vm_loss": 0.1655 + }, + { + "epoch": 1.9127463483889597, + "lm_loss": 0.4956, + "step": 9936, + "vm_loss": 0.1394 + }, + { + "epoch": 1.9127463483889597, + "lm_loss": 0.6429, + "step": 9936, + "vm_loss": 0.1558 + }, + { + "epoch": 1.9127463483889597, + "lm_loss": 0.7916, + "step": 9936, + "vm_loss": 0.1593 + }, + { + "epoch": 1.9129388550665352, + "grad_norm": 3.038788192887153, + "learning_rate": 9.870309766614184e-08, + "loss": 0.7942, + "step": 9937 + }, + { + "epoch": 1.9131313617441106, + "grad_norm": 3.3855092457664733, + "learning_rate": 9.826659266670258e-08, + "loss": 0.8926, + "step": 9938 + }, + { + "epoch": 1.9133238684216858, + "grad_norm": 3.2337467640974724, + "learning_rate": 9.783105024242956e-08, + "loss": 0.81, + "step": 9939 + }, + { + "epoch": 1.9135163750992612, + "grad_norm": 3.3771172845783615, + "learning_rate": 9.739647043566447e-08, + "loss": 0.8637, + "step": 9940 + }, + { + "epoch": 1.9137088817768366, + "grad_norm": 3.3089180985756435, + "learning_rate": 9.696285328865352e-08, + "loss": 0.8609, + "step": 9941 + }, + { + "epoch": 1.913901388454412, + "grad_norm": 3.160928207591143, + "learning_rate": 9.653019884355075e-08, + "loss": 0.838, + "step": 9942 + }, + { + "epoch": 1.9140938951319875, + "grad_norm": 2.986331310740156, + "learning_rate": 9.609850714241475e-08, + "loss": 0.7285, + "step": 9943 + }, + { + "epoch": 1.9142864018095627, + "grad_norm": 3.2505720078210443, + "learning_rate": 9.566777822721196e-08, + "loss": 0.8761, + "step": 9944 + }, + { + "epoch": 1.9142864018095627, + "lm_loss": 1.1197, + "step": 9944, + "vm_loss": 0.1493 + }, + { + "epoch": 1.9142864018095627, + "lm_loss": 0.4906, + "step": 9944, + "vm_loss": 0.1665 + }, + { + "epoch": 1.9142864018095627, + "lm_loss": 0.9033, + "step": 9944, + "vm_loss": 0.1883 + }, + { + "epoch": 1.9142864018095627, + "lm_loss": 0.5176, + "step": 9944, + "vm_loss": 0.171 + }, + { + "epoch": 1.9142864018095627, + "lm_loss": 0.8495, + "step": 9944, + "vm_loss": 0.1564 + }, + { + "epoch": 1.9142864018095627, + "lm_loss": 0.8973, + "step": 9944, + "vm_loss": 0.1873 + }, + { + "epoch": 1.9142864018095627, + "lm_loss": 0.7788, + "step": 9944, + "vm_loss": 0.2065 + }, + { + "epoch": 1.9142864018095627, + "lm_loss": 0.6433, + "step": 9944, + "vm_loss": 0.214 + }, + { + "epoch": 1.9144789084871383, + "grad_norm": 3.233812671799377, + "learning_rate": 9.523801213981443e-08, + "loss": 0.8472, + "step": 9945 + }, + { + "epoch": 1.9146714151647135, + "grad_norm": 3.4125357172174526, + "learning_rate": 9.480920892200096e-08, + "loss": 0.893, + "step": 9946 + }, + { + "epoch": 1.914863921842289, + "grad_norm": 3.187284373320313, + "learning_rate": 9.4381368615456e-08, + "loss": 0.7988, + "step": 9947 + }, + { + "epoch": 1.9150564285198644, + "grad_norm": 3.19593381936061, + "learning_rate": 9.395449126177291e-08, + "loss": 0.8139, + "step": 9948 + }, + { + "epoch": 1.9152489351974396, + "grad_norm": 3.1173367344759795, + "learning_rate": 9.352857690244854e-08, + "loss": 0.8378, + "step": 9949 + }, + { + "epoch": 1.9154414418750152, + "grad_norm": 3.237628201471339, + "learning_rate": 9.310362557888642e-08, + "loss": 0.8261, + "step": 9950 + }, + { + "epoch": 1.9156339485525904, + "grad_norm": 3.383036985375902, + "learning_rate": 9.267963733239904e-08, + "loss": 0.8709, + "step": 9951 + }, + { + "epoch": 1.9158264552301658, + "grad_norm": 3.260786093268801, + "learning_rate": 9.225661220420234e-08, + "loss": 0.8217, + "step": 9952 + }, + { + "epoch": 1.9158264552301658, + "lm_loss": 0.9081, + "step": 9952, + "vm_loss": 0.1153 + }, + { + "epoch": 1.9158264552301658, + "lm_loss": 0.6096, + "step": 9952, + "vm_loss": 0.2274 + }, + { + "epoch": 1.9158264552301658, + "lm_loss": 0.4406, + "step": 9952, + "vm_loss": 0.1576 + }, + { + "epoch": 1.9158264552301658, + "lm_loss": 0.4195, + "step": 9952, + "vm_loss": 0.1688 + }, + { + "epoch": 1.9158264552301658, + "lm_loss": 0.5355, + "step": 9952, + "vm_loss": 0.1398 + }, + { + "epoch": 1.9158264552301658, + "lm_loss": 0.6897, + "step": 9952, + "vm_loss": 0.1724 + }, + { + "epoch": 1.9158264552301658, + "lm_loss": 0.5694, + "step": 9952, + "vm_loss": 0.1288 + }, + { + "epoch": 1.9158264552301658, + "lm_loss": 1.0391, + "step": 9952, + "vm_loss": 0.1496 + }, + { + "epoch": 1.9160189619077412, + "grad_norm": 3.3606677888024263, + "learning_rate": 9.18345502354201e-08, + "loss": 0.866, + "step": 9953 + }, + { + "epoch": 1.9162114685853164, + "grad_norm": 3.129080059580664, + "learning_rate": 9.141345146708169e-08, + "loss": 0.8008, + "step": 9954 + }, + { + "epoch": 1.916403975262892, + "grad_norm": 3.206112737044632, + "learning_rate": 9.099331594012329e-08, + "loss": 0.8535, + "step": 9955 + }, + { + "epoch": 1.9165964819404673, + "grad_norm": 3.2603245484025107, + "learning_rate": 9.057414369538887e-08, + "loss": 0.8078, + "step": 9956 + }, + { + "epoch": 1.9167889886180427, + "grad_norm": 3.123323087210107, + "learning_rate": 9.015593477362472e-08, + "loss": 0.836, + "step": 9957 + }, + { + "epoch": 1.9169814952956181, + "grad_norm": 3.271809729413425, + "learning_rate": 8.973868921548834e-08, + "loss": 0.8384, + "step": 9958 + }, + { + "epoch": 1.9171740019731933, + "grad_norm": 3.2284625961389484, + "learning_rate": 8.932240706154171e-08, + "loss": 0.8067, + "step": 9959 + }, + { + "epoch": 1.917366508650769, + "grad_norm": 3.1097142499006507, + "learning_rate": 8.890708835225026e-08, + "loss": 0.7776, + "step": 9960 + }, + { + "epoch": 1.917366508650769, + "lm_loss": 0.8466, + "step": 9960, + "vm_loss": 0.1408 + }, + { + "epoch": 1.917366508650769, + "lm_loss": 0.672, + "step": 9960, + "vm_loss": 0.1637 + }, + { + "epoch": 1.917366508650769, + "lm_loss": 0.9304, + "step": 9960, + "vm_loss": 0.175 + }, + { + "epoch": 1.917366508650769, + "lm_loss": 0.424, + "step": 9960, + "vm_loss": 0.128 + }, + { + "epoch": 1.917366508650769, + "lm_loss": 0.8495, + "step": 9960, + "vm_loss": 0.2558 + }, + { + "epoch": 1.917366508650769, + "lm_loss": 0.6521, + "step": 9960, + "vm_loss": 0.1238 + }, + { + "epoch": 1.917366508650769, + "lm_loss": 0.5842, + "step": 9960, + "vm_loss": 0.1297 + }, + { + "epoch": 1.917366508650769, + "lm_loss": 0.6816, + "step": 9960, + "vm_loss": 0.1342 + }, + { + "epoch": 1.9175590153283442, + "grad_norm": 3.2158704157391607, + "learning_rate": 8.849273312799056e-08, + "loss": 0.809, + "step": 9961 + }, + { + "epoch": 1.9177515220059196, + "grad_norm": 3.268655244261384, + "learning_rate": 8.807934142904262e-08, + "loss": 0.8324, + "step": 9962 + }, + { + "epoch": 1.917944028683495, + "grad_norm": 3.3504453515127945, + "learning_rate": 8.766691329559207e-08, + "loss": 0.866, + "step": 9963 + }, + { + "epoch": 1.9181365353610702, + "grad_norm": 3.208161092329222, + "learning_rate": 8.72554487677335e-08, + "loss": 0.7797, + "step": 9964 + }, + { + "epoch": 1.9183290420386458, + "grad_norm": 3.2790459550726747, + "learning_rate": 8.684494788546716e-08, + "loss": 0.8068, + "step": 9965 + }, + { + "epoch": 1.918521548716221, + "grad_norm": 3.351630112811597, + "learning_rate": 8.643541068869776e-08, + "loss": 0.8796, + "step": 9966 + }, + { + "epoch": 1.9187140553937965, + "grad_norm": 3.3083529848965245, + "learning_rate": 8.602683721723793e-08, + "loss": 0.8853, + "step": 9967 + }, + { + "epoch": 1.918906562071372, + "grad_norm": 3.0830475507712447, + "learning_rate": 8.561922751080697e-08, + "loss": 0.7518, + "step": 9968 + }, + { + "epoch": 1.918906562071372, + "lm_loss": 0.406, + "step": 9968, + "vm_loss": 0.1586 + }, + { + "epoch": 1.918906562071372, + "lm_loss": 0.8093, + "step": 9968, + "vm_loss": 0.142 + }, + { + "epoch": 1.918906562071372, + "lm_loss": 0.9129, + "step": 9968, + "vm_loss": 0.1528 + }, + { + "epoch": 1.918906562071372, + "lm_loss": 0.4346, + "step": 9968, + "vm_loss": 0.2875 + }, + { + "epoch": 1.918906562071372, + "lm_loss": 1.1024, + "step": 9968, + "vm_loss": 0.2363 + }, + { + "epoch": 1.918906562071372, + "lm_loss": 0.85, + "step": 9968, + "vm_loss": 0.2356 + }, + { + "epoch": 1.918906562071372, + "lm_loss": 0.797, + "step": 9968, + "vm_loss": 0.1436 + }, + { + "epoch": 1.918906562071372, + "lm_loss": 0.5154, + "step": 9968, + "vm_loss": 0.1332 + }, + { + "epoch": 1.919099068748947, + "grad_norm": 3.2380418088297462, + "learning_rate": 8.521258160902767e-08, + "loss": 0.8781, + "step": 9969 + }, + { + "epoch": 1.9192915754265227, + "grad_norm": 3.070287601728227, + "learning_rate": 8.480689955143395e-08, + "loss": 0.8114, + "step": 9970 + }, + { + "epoch": 1.919484082104098, + "grad_norm": 3.30890792941629, + "learning_rate": 8.440218137746092e-08, + "loss": 0.8633, + "step": 9971 + }, + { + "epoch": 1.9196765887816734, + "grad_norm": 3.404212275537186, + "learning_rate": 8.399842712645378e-08, + "loss": 0.8702, + "step": 9972 + }, + { + "epoch": 1.9198690954592488, + "grad_norm": 3.304417692886842, + "learning_rate": 8.359563683766225e-08, + "loss": 0.8275, + "step": 9973 + }, + { + "epoch": 1.920061602136824, + "grad_norm": 3.104477219970434, + "learning_rate": 8.319381055024278e-08, + "loss": 0.8005, + "step": 9974 + }, + { + "epoch": 1.9202541088143996, + "grad_norm": 3.205647667063476, + "learning_rate": 8.279294830325746e-08, + "loss": 0.8473, + "step": 9975 + }, + { + "epoch": 1.9204466154919748, + "grad_norm": 3.393739336683005, + "learning_rate": 8.239305013567734e-08, + "loss": 0.8685, + "step": 9976 + }, + { + "epoch": 1.9204466154919748, + "lm_loss": 0.8843, + "step": 9976, + "vm_loss": 0.1441 + }, + { + "epoch": 1.9204466154919748, + "lm_loss": 0.9568, + "step": 9976, + "vm_loss": 0.1766 + }, + { + "epoch": 1.9204466154919748, + "lm_loss": 0.4966, + "step": 9976, + "vm_loss": 0.1885 + }, + { + "epoch": 1.9204466154919748, + "lm_loss": 0.5389, + "step": 9976, + "vm_loss": 0.1293 + }, + { + "epoch": 1.9204466154919748, + "lm_loss": 0.625, + "step": 9976, + "vm_loss": 0.1789 + }, + { + "epoch": 1.9204466154919748, + "lm_loss": 0.6157, + "step": 9976, + "vm_loss": 0.1381 + }, + { + "epoch": 1.9204466154919748, + "lm_loss": 0.5504, + "step": 9976, + "vm_loss": 0.1613 + }, + { + "epoch": 1.9204466154919748, + "lm_loss": 0.5517, + "step": 9976, + "vm_loss": 0.1232 + }, + { + "epoch": 1.9206391221695502, + "grad_norm": 3.3281365535731346, + "learning_rate": 8.199411608637465e-08, + "loss": 0.8461, + "step": 9977 + }, + { + "epoch": 1.9208316288471257, + "grad_norm": 3.116934828383036, + "learning_rate": 8.159614619413169e-08, + "loss": 0.7783, + "step": 9978 + }, + { + "epoch": 1.9210241355247009, + "grad_norm": 3.2335089372004338, + "learning_rate": 8.119914049763755e-08, + "loss": 0.8323, + "step": 9979 + }, + { + "epoch": 1.9212166422022765, + "grad_norm": 3.4439366277016625, + "learning_rate": 8.080309903548577e-08, + "loss": 0.8562, + "step": 9980 + }, + { + "epoch": 1.9214091488798517, + "grad_norm": 3.280723884331258, + "learning_rate": 8.040802184617446e-08, + "loss": 0.815, + "step": 9981 + }, + { + "epoch": 1.9216016555574271, + "grad_norm": 3.2593402630784105, + "learning_rate": 8.0013908968114e-08, + "loss": 0.7906, + "step": 9982 + }, + { + "epoch": 1.9217941622350025, + "grad_norm": 3.2339832880266983, + "learning_rate": 7.962076043961486e-08, + "loss": 0.79, + "step": 9983 + }, + { + "epoch": 1.9219866689125777, + "grad_norm": 3.103014281252802, + "learning_rate": 7.922857629889646e-08, + "loss": 0.774, + "step": 9984 + }, + { + "epoch": 1.9219866689125777, + "lm_loss": 0.5574, + "step": 9984, + "vm_loss": 0.1413 + }, + { + "epoch": 1.9219866689125777, + "lm_loss": 0.5323, + "step": 9984, + "vm_loss": 0.1358 + }, + { + "epoch": 1.9219866689125777, + "lm_loss": 0.6366, + "step": 9984, + "vm_loss": 0.1443 + }, + { + "epoch": 1.9219866689125777, + "lm_loss": 0.793, + "step": 9984, + "vm_loss": 0.1176 + }, + { + "epoch": 1.9219866689125777, + "lm_loss": 0.6337, + "step": 9984, + "vm_loss": 0.1541 + }, + { + "epoch": 1.9219866689125777, + "lm_loss": 0.433, + "step": 9984, + "vm_loss": 0.158 + }, + { + "epoch": 1.9219866689125777, + "lm_loss": 0.4286, + "step": 9984, + "vm_loss": 0.1335 + }, + { + "epoch": 1.9219866689125777, + "lm_loss": 0.4035, + "step": 9984, + "vm_loss": 0.186 + }, + { + "epoch": 1.9221791755901534, + "grad_norm": 3.2761356682684735, + "learning_rate": 7.883735658408386e-08, + "loss": 0.8048, + "step": 9985 + }, + { + "epoch": 1.9223716822677286, + "grad_norm": 3.227963267761403, + "learning_rate": 7.844710133320887e-08, + "loss": 0.8402, + "step": 9986 + }, + { + "epoch": 1.922564188945304, + "grad_norm": 3.2698343087583144, + "learning_rate": 7.805781058421003e-08, + "loss": 0.8189, + "step": 9987 + }, + { + "epoch": 1.9227566956228794, + "grad_norm": 3.1720528913860617, + "learning_rate": 7.766948437493039e-08, + "loss": 0.8152, + "step": 9988 + }, + { + "epoch": 1.9229492023004549, + "grad_norm": 3.317727886467578, + "learning_rate": 7.728212274312086e-08, + "loss": 0.8533, + "step": 9989 + }, + { + "epoch": 1.9231417089780303, + "grad_norm": 3.2837640255969647, + "learning_rate": 7.68957257264369e-08, + "loss": 0.8369, + "step": 9990 + }, + { + "epoch": 1.9233342156556055, + "grad_norm": 3.3213020320540214, + "learning_rate": 7.651029336244175e-08, + "loss": 0.8453, + "step": 9991 + }, + { + "epoch": 1.923526722333181, + "grad_norm": 3.279682932215763, + "learning_rate": 7.61258256886055e-08, + "loss": 0.8067, + "step": 9992 + }, + { + "epoch": 1.923526722333181, + "lm_loss": 0.7507, + "step": 9992, + "vm_loss": 0.1708 + }, + { + "epoch": 1.923526722333181, + "lm_loss": 0.5879, + "step": 9992, + "vm_loss": 0.1984 + }, + { + "epoch": 1.923526722333181, + "lm_loss": 0.8408, + "step": 9992, + "vm_loss": 0.2065 + }, + { + "epoch": 1.923526722333181, + "lm_loss": 0.9713, + "step": 9992, + "vm_loss": 0.1191 + }, + { + "epoch": 1.923526722333181, + "lm_loss": 1.0564, + "step": 9992, + "vm_loss": 0.1442 + }, + { + "epoch": 1.923526722333181, + "lm_loss": 0.848, + "step": 9992, + "vm_loss": 0.1468 + }, + { + "epoch": 1.923526722333181, + "lm_loss": 0.3305, + "step": 9992, + "vm_loss": 0.1844 + }, + { + "epoch": 1.923526722333181, + "lm_loss": 0.4904, + "step": 9992, + "vm_loss": 0.1791 + }, + { + "epoch": 1.9237192290107563, + "grad_norm": 3.204432979621257, + "learning_rate": 7.574232274230264e-08, + "loss": 0.8339, + "step": 9993 + }, + { + "epoch": 1.9239117356883317, + "grad_norm": 3.1319916054962462, + "learning_rate": 7.535978456081339e-08, + "loss": 0.7901, + "step": 9994 + }, + { + "epoch": 1.9241042423659072, + "grad_norm": 3.1394839770627154, + "learning_rate": 7.497821118132687e-08, + "loss": 0.8263, + "step": 9995 + }, + { + "epoch": 1.9242967490434824, + "grad_norm": 2.9846174050187675, + "learning_rate": 7.459760264093563e-08, + "loss": 0.7968, + "step": 9996 + }, + { + "epoch": 1.9244892557210578, + "grad_norm": 3.2778192705327136, + "learning_rate": 7.42179589766412e-08, + "loss": 0.8049, + "step": 9997 + }, + { + "epoch": 1.9246817623986332, + "grad_norm": 3.1502301575505856, + "learning_rate": 7.383928022534847e-08, + "loss": 0.8312, + "step": 9998 + }, + { + "epoch": 1.9248742690762086, + "grad_norm": 3.248549460842929, + "learning_rate": 7.346156642387137e-08, + "loss": 0.8528, + "step": 9999 + }, + { + "epoch": 1.925066775753784, + "grad_norm": 3.1471421618350512, + "learning_rate": 7.308481760892605e-08, + "loss": 0.7854, + "step": 10000 + }, + { + "epoch": 1.925066775753784, + "lm_loss": 0.6902, + "step": 10000, + "vm_loss": 0.1752 + }, + { + "epoch": 1.925066775753784, + "lm_loss": 0.5175, + "step": 10000, + "vm_loss": 0.1297 + }, + { + "epoch": 1.925066775753784, + "lm_loss": 0.2516, + "step": 10000, + "vm_loss": 0.1899 + }, + { + "epoch": 1.925066775753784, + "lm_loss": 0.5713, + "step": 10000, + "vm_loss": 0.2327 + }, + { + "epoch": 1.925066775753784, + "lm_loss": 0.6888, + "step": 10000, + "vm_loss": 0.1154 + }, + { + "epoch": 1.925066775753784, + "lm_loss": 0.702, + "step": 10000, + "vm_loss": 0.1679 + }, + { + "epoch": 1.925066775753784, + "lm_loss": 0.9082, + "step": 10000, + "vm_loss": 0.1238 + }, + { + "epoch": 1.925066775753784, + "lm_loss": 0.557, + "step": 10000, + "vm_loss": 0.2113 + }, + { + "epoch": 1.9252592824313592, + "grad_norm": 3.3684366618393975, + "learning_rate": 7.270903381713989e-08, + "loss": 0.8738, + "step": 10001 + }, + { + "epoch": 1.9254517891089347, + "grad_norm": 3.1193790826800014, + "learning_rate": 7.233421508504257e-08, + "loss": 0.7972, + "step": 10002 + }, + { + "epoch": 1.92564429578651, + "grad_norm": 3.3031903074242224, + "learning_rate": 7.196036144907159e-08, + "loss": 0.8707, + "step": 10003 + }, + { + "epoch": 1.9258368024640855, + "grad_norm": 3.3577966390052927, + "learning_rate": 7.1587472945569e-08, + "loss": 0.8781, + "step": 10004 + }, + { + "epoch": 1.926029309141661, + "grad_norm": 3.2136214711631816, + "learning_rate": 7.121554961078692e-08, + "loss": 0.8207, + "step": 10005 + }, + { + "epoch": 1.9262218158192361, + "grad_norm": 3.298741835417461, + "learning_rate": 7.084459148087864e-08, + "loss": 0.8375, + "step": 10006 + }, + { + "epoch": 1.9264143224968118, + "grad_norm": 3.179228770853609, + "learning_rate": 7.047459859190752e-08, + "loss": 0.8447, + "step": 10007 + }, + { + "epoch": 1.926606829174387, + "grad_norm": 3.1913765571956896, + "learning_rate": 7.01055709798415e-08, + "loss": 0.8008, + "step": 10008 + }, + { + "epoch": 1.926606829174387, + "lm_loss": 1.1802, + "step": 10008, + "vm_loss": 0.1411 + }, + { + "epoch": 1.926606829174387, + "lm_loss": 0.6101, + "step": 10008, + "vm_loss": 0.2149 + }, + { + "epoch": 1.926606829174387, + "lm_loss": 0.7753, + "step": 10008, + "vm_loss": 0.1785 + }, + { + "epoch": 1.926606829174387, + "lm_loss": 0.6306, + "step": 10008, + "vm_loss": 0.1789 + }, + { + "epoch": 1.926606829174387, + "lm_loss": 0.3485, + "step": 10008, + "vm_loss": 0.1429 + }, + { + "epoch": 1.926606829174387, + "lm_loss": 0.8707, + "step": 10008, + "vm_loss": 0.2313 + }, + { + "epoch": 1.926606829174387, + "lm_loss": 0.4871, + "step": 10008, + "vm_loss": 0.1631 + }, + { + "epoch": 1.926606829174387, + "lm_loss": 1.1411, + "step": 10008, + "vm_loss": 0.1381 + }, + { + "epoch": 1.9267993358519624, + "grad_norm": 3.313535315065674, + "learning_rate": 6.973750868055406e-08, + "loss": 0.8562, + "step": 10009 + }, + { + "epoch": 1.9269918425295378, + "grad_norm": 3.3256902367865253, + "learning_rate": 6.937041172982662e-08, + "loss": 0.8611, + "step": 10010 + }, + { + "epoch": 1.927184349207113, + "grad_norm": 3.3460536682412987, + "learning_rate": 6.900428016334393e-08, + "loss": 0.8395, + "step": 10011 + }, + { + "epoch": 1.9273768558846887, + "grad_norm": 3.3287282106338436, + "learning_rate": 6.863911401669976e-08, + "loss": 0.8505, + "step": 10012 + }, + { + "epoch": 1.9275693625622639, + "grad_norm": 3.048516301479563, + "learning_rate": 6.827491332539349e-08, + "loss": 0.8076, + "step": 10013 + }, + { + "epoch": 1.9277618692398393, + "grad_norm": 3.20842115740196, + "learning_rate": 6.791167812483013e-08, + "loss": 0.8419, + "step": 10014 + }, + { + "epoch": 1.9279543759174147, + "grad_norm": 3.3724980902572987, + "learning_rate": 6.75494084503181e-08, + "loss": 0.8225, + "step": 10015 + }, + { + "epoch": 1.92814688259499, + "grad_norm": 3.381450113165441, + "learning_rate": 6.718810433707924e-08, + "loss": 0.8765, + "step": 10016 + }, + { + "epoch": 1.92814688259499, + "lm_loss": 0.6617, + "step": 10016, + "vm_loss": 0.1409 + }, + { + "epoch": 1.92814688259499, + "lm_loss": 0.3258, + "step": 10016, + "vm_loss": 0.1662 + }, + { + "epoch": 1.92814688259499, + "lm_loss": 0.6057, + "step": 10016, + "vm_loss": 0.1154 + }, + { + "epoch": 1.92814688259499, + "lm_loss": 1.0099, + "step": 10016, + "vm_loss": 0.1065 + }, + { + "epoch": 1.92814688259499, + "lm_loss": 0.4767, + "step": 10016, + "vm_loss": 0.1367 + }, + { + "epoch": 1.92814688259499, + "lm_loss": 0.5711, + "step": 10016, + "vm_loss": 0.1781 + }, + { + "epoch": 1.92814688259499, + "lm_loss": 0.5324, + "step": 10016, + "vm_loss": 0.2103 + }, + { + "epoch": 1.92814688259499, + "lm_loss": 0.9001, + "step": 10016, + "vm_loss": 0.2371 + }, + { + "epoch": 1.9283393892725655, + "grad_norm": 3.153939724618915, + "learning_rate": 6.68277658202321e-08, + "loss": 0.8269, + "step": 10017 + }, + { + "epoch": 1.9285318959501407, + "grad_norm": 3.215528042978117, + "learning_rate": 6.646839293481088e-08, + "loss": 0.818, + "step": 10018 + }, + { + "epoch": 1.9287244026277162, + "grad_norm": 3.206212798912243, + "learning_rate": 6.610998571574656e-08, + "loss": 0.822, + "step": 10019 + }, + { + "epoch": 1.9289169093052916, + "grad_norm": 3.2554223949099996, + "learning_rate": 6.575254419788457e-08, + "loss": 0.8388, + "step": 10020 + }, + { + "epoch": 1.9291094159828668, + "grad_norm": 3.339778406575526, + "learning_rate": 6.539606841597046e-08, + "loss": 0.8634, + "step": 10021 + }, + { + "epoch": 1.9293019226604424, + "grad_norm": 3.329420873526788, + "learning_rate": 6.504055840465983e-08, + "loss": 0.8361, + "step": 10022 + }, + { + "epoch": 1.9294944293380176, + "grad_norm": 3.4356237032858625, + "learning_rate": 6.468601419851173e-08, + "loss": 0.8679, + "step": 10023 + }, + { + "epoch": 1.929686936015593, + "grad_norm": 3.2648005047078983, + "learning_rate": 6.433243583199412e-08, + "loss": 0.852, + "step": 10024 + }, + { + "epoch": 1.929686936015593, + "lm_loss": 0.4727, + "step": 10024, + "vm_loss": 0.1541 + }, + { + "epoch": 1.929686936015593, + "lm_loss": 0.7455, + "step": 10024, + "vm_loss": 0.1812 + }, + { + "epoch": 1.929686936015593, + "lm_loss": 0.578, + "step": 10024, + "vm_loss": 0.1594 + }, + { + "epoch": 1.929686936015593, + "lm_loss": 0.7262, + "step": 10024, + "vm_loss": 0.1302 + }, + { + "epoch": 1.929686936015593, + "lm_loss": 0.714, + "step": 10024, + "vm_loss": 0.1318 + }, + { + "epoch": 1.929686936015593, + "lm_loss": 0.4874, + "step": 10024, + "vm_loss": 0.2125 + }, + { + "epoch": 1.929686936015593, + "lm_loss": 0.4989, + "step": 10024, + "vm_loss": 0.1325 + }, + { + "epoch": 1.929686936015593, + "lm_loss": 1.4171, + "step": 10024, + "vm_loss": 0.1661 + }, + { + "epoch": 1.9298794426931685, + "grad_norm": 3.426280175069049, + "learning_rate": 6.397982333947617e-08, + "loss": 0.8751, + "step": 10025 + }, + { + "epoch": 1.9300719493707437, + "grad_norm": 3.385736953398937, + "learning_rate": 6.362817675523936e-08, + "loss": 0.8807, + "step": 10026 + }, + { + "epoch": 1.9302644560483193, + "grad_norm": 3.178883518479652, + "learning_rate": 6.327749611346745e-08, + "loss": 0.8011, + "step": 10027 + }, + { + "epoch": 1.9304569627258945, + "grad_norm": 3.283551467295873, + "learning_rate": 6.292778144824984e-08, + "loss": 0.8467, + "step": 10028 + }, + { + "epoch": 1.93064946940347, + "grad_norm": 3.3472098100925995, + "learning_rate": 6.257903279358491e-08, + "loss": 0.8881, + "step": 10029 + }, + { + "epoch": 1.9308419760810454, + "grad_norm": 3.407160695988864, + "learning_rate": 6.223125018337328e-08, + "loss": 0.8586, + "step": 10030 + }, + { + "epoch": 1.9310344827586206, + "grad_norm": 3.301421813918248, + "learning_rate": 6.188443365142682e-08, + "loss": 0.8625, + "step": 10031 + }, + { + "epoch": 1.9312269894361962, + "grad_norm": 3.0635081099564396, + "learning_rate": 6.153858323145856e-08, + "loss": 0.8045, + "step": 10032 + }, + { + "epoch": 1.9312269894361962, + "lm_loss": 0.8019, + "step": 10032, + "vm_loss": 0.1679 + }, + { + "epoch": 1.9312269894361962, + "lm_loss": 0.5311, + "step": 10032, + "vm_loss": 0.1926 + }, + { + "epoch": 1.9312269894361962, + "lm_loss": 0.5768, + "step": 10032, + "vm_loss": 0.1583 + }, + { + "epoch": 1.9312269894361962, + "lm_loss": 0.4148, + "step": 10032, + "vm_loss": 0.153 + }, + { + "epoch": 1.9312269894361962, + "lm_loss": 0.6214, + "step": 10032, + "vm_loss": 0.1701 + }, + { + "epoch": 1.9312269894361962, + "lm_loss": 0.6961, + "step": 10032, + "vm_loss": 0.1757 + }, + { + "epoch": 1.9312269894361962, + "lm_loss": 1.0139, + "step": 10032, + "vm_loss": 0.1365 + }, + { + "epoch": 1.9312269894361962, + "lm_loss": 0.7958, + "step": 10032, + "vm_loss": 0.2085 + }, + { + "epoch": 1.9314194961137714, + "grad_norm": 3.2680187578225643, + "learning_rate": 6.119369895709048e-08, + "loss": 0.8561, + "step": 10033 + }, + { + "epoch": 1.9316120027913468, + "grad_norm": 3.242346144980176, + "learning_rate": 6.0849780861848e-08, + "loss": 0.8132, + "step": 10034 + }, + { + "epoch": 1.9318045094689222, + "grad_norm": 3.352650529932269, + "learning_rate": 6.05068289791677e-08, + "loss": 0.8606, + "step": 10035 + }, + { + "epoch": 1.9319970161464974, + "grad_norm": 3.140601798771611, + "learning_rate": 6.016484334238515e-08, + "loss": 0.8158, + "step": 10036 + }, + { + "epoch": 1.932189522824073, + "grad_norm": 3.251800139852715, + "learning_rate": 5.982382398474817e-08, + "loss": 0.8417, + "step": 10037 + }, + { + "epoch": 1.9323820295016483, + "grad_norm": 3.1447314784625866, + "learning_rate": 5.948377093940694e-08, + "loss": 0.7978, + "step": 10038 + }, + { + "epoch": 1.9325745361792237, + "grad_norm": 3.2401004021930686, + "learning_rate": 5.914468423942055e-08, + "loss": 0.821, + "step": 10039 + }, + { + "epoch": 1.9327670428567991, + "grad_norm": 3.4522349113867152, + "learning_rate": 5.880656391775041e-08, + "loss": 0.9078, + "step": 10040 + }, + { + "epoch": 1.9327670428567991, + "lm_loss": 0.9056, + "step": 10040, + "vm_loss": 0.1649 + }, + { + "epoch": 1.9327670428567991, + "lm_loss": 0.4546, + "step": 10040, + "vm_loss": 0.2079 + }, + { + "epoch": 1.9327670428567991, + "lm_loss": 1.0492, + "step": 10040, + "vm_loss": 0.1102 + }, + { + "epoch": 1.9327670428567991, + "lm_loss": 0.8601, + "step": 10040, + "vm_loss": 0.1456 + }, + { + "epoch": 1.9327670428567991, + "lm_loss": 0.8724, + "step": 10040, + "vm_loss": 0.1795 + }, + { + "epoch": 1.9327670428567991, + "lm_loss": 0.7304, + "step": 10040, + "vm_loss": 0.1886 + }, + { + "epoch": 1.9327670428567991, + "lm_loss": 0.591, + "step": 10040, + "vm_loss": 0.1589 + }, + { + "epoch": 1.9327670428567991, + "lm_loss": 0.3179, + "step": 10040, + "vm_loss": 0.1658 + }, + { + "epoch": 1.9329595495343743, + "grad_norm": 3.212707950012238, + "learning_rate": 5.846941000726802e-08, + "loss": 0.8406, + "step": 10041 + }, + { + "epoch": 1.93315205621195, + "grad_norm": 3.36118334654075, + "learning_rate": 5.813322254074827e-08, + "loss": 0.8255, + "step": 10042 + }, + { + "epoch": 1.9333445628895252, + "grad_norm": 3.260913416622491, + "learning_rate": 5.779800155087167e-08, + "loss": 0.8723, + "step": 10043 + }, + { + "epoch": 1.9335370695671006, + "grad_norm": 3.1890492630043994, + "learning_rate": 5.746374707022884e-08, + "loss": 0.8362, + "step": 10044 + }, + { + "epoch": 1.933729576244676, + "grad_norm": 3.1999969718502035, + "learning_rate": 5.713045913131155e-08, + "loss": 0.8172, + "step": 10045 + }, + { + "epoch": 1.9339220829222512, + "grad_norm": 3.377020803701443, + "learning_rate": 5.679813776651943e-08, + "loss": 0.8583, + "step": 10046 + }, + { + "epoch": 1.9341145895998269, + "grad_norm": 3.1817266840059624, + "learning_rate": 5.646678300816e-08, + "loss": 0.8111, + "step": 10047 + }, + { + "epoch": 1.934307096277402, + "grad_norm": 3.1229156253114807, + "learning_rate": 5.6136394888443025e-08, + "loss": 0.8141, + "step": 10048 + }, + { + "epoch": 1.934307096277402, + "lm_loss": 0.5763, + "step": 10048, + "vm_loss": 0.1681 + }, + { + "epoch": 1.934307096277402, + "lm_loss": 0.6909, + "step": 10048, + "vm_loss": 0.178 + }, + { + "epoch": 1.934307096277402, + "lm_loss": 0.5703, + "step": 10048, + "vm_loss": 0.1797 + }, + { + "epoch": 1.934307096277402, + "lm_loss": 0.7966, + "step": 10048, + "vm_loss": 0.1305 + }, + { + "epoch": 1.934307096277402, + "lm_loss": 0.8155, + "step": 10048, + "vm_loss": 0.1481 + }, + { + "epoch": 1.934307096277402, + "lm_loss": 0.5978, + "step": 10048, + "vm_loss": 0.1741 + }, + { + "epoch": 1.934307096277402, + "lm_loss": 0.51, + "step": 10048, + "vm_loss": 0.1138 + }, + { + "epoch": 1.934307096277402, + "lm_loss": 0.4611, + "step": 10048, + "vm_loss": 0.171 + }, + { + "epoch": 1.9344996029549775, + "grad_norm": 3.2150886586817884, + "learning_rate": 5.580697343948838e-08, + "loss": 0.8283, + "step": 10049 + }, + { + "epoch": 1.934692109632553, + "grad_norm": 3.302288258596244, + "learning_rate": 5.5478518693319326e-08, + "loss": 0.8582, + "step": 10050 + }, + { + "epoch": 1.934884616310128, + "grad_norm": 3.0802396007611677, + "learning_rate": 5.515103068186478e-08, + "loss": 0.8074, + "step": 10051 + }, + { + "epoch": 1.9350771229877037, + "grad_norm": 3.3735249507637417, + "learning_rate": 5.482450943696371e-08, + "loss": 0.8362, + "step": 10052 + }, + { + "epoch": 1.935269629665279, + "grad_norm": 3.1308826976405473, + "learning_rate": 5.449895499035407e-08, + "loss": 0.7968, + "step": 10053 + }, + { + "epoch": 1.9354621363428544, + "grad_norm": 3.0514074876814554, + "learning_rate": 5.4174367373688305e-08, + "loss": 0.7674, + "step": 10054 + }, + { + "epoch": 1.9356546430204298, + "grad_norm": 3.069136174037404, + "learning_rate": 5.385074661851675e-08, + "loss": 0.8276, + "step": 10055 + }, + { + "epoch": 1.9358471496980052, + "grad_norm": 3.1611754259310887, + "learning_rate": 5.3528092756302e-08, + "loss": 0.8251, + "step": 10056 + }, + { + "epoch": 1.9358471496980052, + "lm_loss": 0.9981, + "step": 10056, + "vm_loss": 0.1287 + }, + { + "epoch": 1.9358471496980052, + "lm_loss": 0.4823, + "step": 10056, + "vm_loss": 0.1758 + }, + { + "epoch": 1.9358471496980052, + "lm_loss": 0.8971, + "step": 10056, + "vm_loss": 0.2144 + }, + { + "epoch": 1.9358471496980052, + "lm_loss": 0.6365, + "step": 10056, + "vm_loss": 0.1783 + }, + { + "epoch": 1.9358471496980052, + "lm_loss": 0.731, + "step": 10056, + "vm_loss": 0.1622 + }, + { + "epoch": 1.9358471496980052, + "lm_loss": 0.7451, + "step": 10056, + "vm_loss": 0.1431 + }, + { + "epoch": 1.9358471496980052, + "lm_loss": 0.8502, + "step": 10056, + "vm_loss": 0.1889 + }, + { + "epoch": 1.9358471496980052, + "lm_loss": 0.5152, + "step": 10056, + "vm_loss": 0.1076 + }, + { + "epoch": 1.9360396563755806, + "grad_norm": 3.155700554301023, + "learning_rate": 5.320640581840786e-08, + "loss": 0.8083, + "step": 10057 + }, + { + "epoch": 1.9362321630531558, + "grad_norm": 3.1966760555231475, + "learning_rate": 5.2885685836109316e-08, + "loss": 0.8, + "step": 10058 + }, + { + "epoch": 1.9364246697307312, + "grad_norm": 3.105026732066565, + "learning_rate": 5.2565932840582534e-08, + "loss": 0.7445, + "step": 10059 + }, + { + "epoch": 1.9366171764083067, + "grad_norm": 3.191592340812913, + "learning_rate": 5.2247146862911544e-08, + "loss": 0.8286, + "step": 10060 + }, + { + "epoch": 1.936809683085882, + "grad_norm": 3.400549073310816, + "learning_rate": 5.192932793408601e-08, + "loss": 0.8518, + "step": 10061 + }, + { + "epoch": 1.9370021897634575, + "grad_norm": 3.220631036128648, + "learning_rate": 5.1612476085004526e-08, + "loss": 0.8539, + "step": 10062 + }, + { + "epoch": 1.9371946964410327, + "grad_norm": 2.953337768751391, + "learning_rate": 5.129659134646581e-08, + "loss": 0.7467, + "step": 10063 + }, + { + "epoch": 1.9373872031186081, + "grad_norm": 3.2722567806989047, + "learning_rate": 5.098167374917973e-08, + "loss": 0.8442, + "step": 10064 + }, + { + "epoch": 1.9373872031186081, + "lm_loss": 0.5045, + "step": 10064, + "vm_loss": 0.2057 + }, + { + "epoch": 1.9373872031186081, + "lm_loss": 0.446, + "step": 10064, + "vm_loss": 0.1937 + }, + { + "epoch": 1.9373872031186081, + "lm_loss": 0.8359, + "step": 10064, + "vm_loss": 0.1623 + }, + { + "epoch": 1.9373872031186081, + "lm_loss": 0.8131, + "step": 10064, + "vm_loss": 0.164 + }, + { + "epoch": 1.9373872031186081, + "lm_loss": 0.8039, + "step": 10064, + "vm_loss": 0.1031 + }, + { + "epoch": 1.9373872031186081, + "lm_loss": 0.4256, + "step": 10064, + "vm_loss": 0.1022 + }, + { + "epoch": 1.9373872031186081, + "lm_loss": 0.4718, + "step": 10064, + "vm_loss": 0.184 + }, + { + "epoch": 1.9373872031186081, + "lm_loss": 0.6303, + "step": 10064, + "vm_loss": 0.191 + }, + { + "epoch": 1.9375797097961835, + "grad_norm": 3.199055324807108, + "learning_rate": 5.0667723323760684e-08, + "loss": 0.8222, + "step": 10065 + }, + { + "epoch": 1.937772216473759, + "grad_norm": 3.323644163833223, + "learning_rate": 5.03547401007276e-08, + "loss": 0.8447, + "step": 10066 + }, + { + "epoch": 1.9379647231513344, + "grad_norm": 3.226260452132444, + "learning_rate": 5.004272411050726e-08, + "loss": 0.8227, + "step": 10067 + }, + { + "epoch": 1.9381572298289096, + "grad_norm": 3.1074136891115147, + "learning_rate": 4.9731675383432046e-08, + "loss": 0.7887, + "step": 10068 + }, + { + "epoch": 1.9383497365064852, + "grad_norm": 3.1261300044448066, + "learning_rate": 4.942159394973778e-08, + "loss": 0.7901, + "step": 10069 + }, + { + "epoch": 1.9385422431840604, + "grad_norm": 3.284298668899398, + "learning_rate": 4.911247983957035e-08, + "loss": 0.8622, + "step": 10070 + }, + { + "epoch": 1.9387347498616359, + "grad_norm": 3.317572354106562, + "learning_rate": 4.8804333082977937e-08, + "loss": 0.8437, + "step": 10071 + }, + { + "epoch": 1.9389272565392113, + "grad_norm": 3.366810965033768, + "learning_rate": 4.84971537099177e-08, + "loss": 0.8647, + "step": 10072 + }, + { + "epoch": 1.9389272565392113, + "lm_loss": 0.2114, + "step": 10072, + "vm_loss": 0.1208 + }, + { + "epoch": 1.9389272565392113, + "lm_loss": 0.5062, + "step": 10072, + "vm_loss": 0.176 + }, + { + "epoch": 1.9389272565392113, + "lm_loss": 0.7494, + "step": 10072, + "vm_loss": 0.1437 + }, + { + "epoch": 1.9389272565392113, + "lm_loss": 1.0268, + "step": 10072, + "vm_loss": 0.1417 + }, + { + "epoch": 1.9389272565392113, + "lm_loss": 0.9819, + "step": 10072, + "vm_loss": 0.1343 + }, + { + "epoch": 1.9389272565392113, + "lm_loss": 0.6555, + "step": 10072, + "vm_loss": 0.1427 + }, + { + "epoch": 1.9389272565392113, + "lm_loss": 0.6, + "step": 10072, + "vm_loss": 0.1375 + }, + { + "epoch": 1.9389272565392113, + "lm_loss": 1.198, + "step": 10072, + "vm_loss": 0.1635 + }, + { + "epoch": 1.9391197632167865, + "grad_norm": 3.2492630354824334, + "learning_rate": 4.8190941750251296e-08, + "loss": 0.8259, + "step": 10073 + }, + { + "epoch": 1.9393122698943621, + "grad_norm": 3.1933105157475543, + "learning_rate": 4.788569723374603e-08, + "loss": 0.8248, + "step": 10074 + }, + { + "epoch": 1.9395047765719373, + "grad_norm": 3.3462045475034587, + "learning_rate": 4.7581420190077055e-08, + "loss": 0.8678, + "step": 10075 + }, + { + "epoch": 1.9396972832495127, + "grad_norm": 3.2147661053026537, + "learning_rate": 4.727811064882071e-08, + "loss": 0.8037, + "step": 10076 + }, + { + "epoch": 1.9398897899270882, + "grad_norm": 3.2360876825065965, + "learning_rate": 4.6975768639465626e-08, + "loss": 0.8703, + "step": 10077 + }, + { + "epoch": 1.9400822966046634, + "grad_norm": 3.170944778030707, + "learning_rate": 4.667439419140163e-08, + "loss": 0.817, + "step": 10078 + }, + { + "epoch": 1.940274803282239, + "grad_norm": 3.2852656658043684, + "learning_rate": 4.637398733392751e-08, + "loss": 0.8287, + "step": 10079 + }, + { + "epoch": 1.9404673099598142, + "grad_norm": 3.3230355976955686, + "learning_rate": 4.6074548096244346e-08, + "loss": 0.8599, + "step": 10080 + }, + { + "epoch": 1.9404673099598142, + "lm_loss": 0.3998, + "step": 10080, + "vm_loss": 0.1762 + }, + { + "epoch": 1.9404673099598142, + "lm_loss": 0.5445, + "step": 10080, + "vm_loss": 0.1563 + }, + { + "epoch": 1.9404673099598142, + "lm_loss": 0.6316, + "step": 10080, + "vm_loss": 0.1699 + }, + { + "epoch": 1.9404673099598142, + "lm_loss": 1.3365, + "step": 10080, + "vm_loss": 0.2069 + }, + { + "epoch": 1.9404673099598142, + "lm_loss": 0.758, + "step": 10080, + "vm_loss": 0.2109 + }, + { + "epoch": 1.9404673099598142, + "lm_loss": 0.4581, + "step": 10080, + "vm_loss": 0.1199 + }, + { + "epoch": 1.9404673099598142, + "lm_loss": 0.685, + "step": 10080, + "vm_loss": 0.1652 + }, + { + "epoch": 1.9404673099598142, + "lm_loss": 0.941, + "step": 10080, + "vm_loss": 0.1741 + }, + { + "epoch": 1.9406598166373896, + "grad_norm": 3.350455898571836, + "learning_rate": 4.5776076507464404e-08, + "loss": 0.8641, + "step": 10081 + }, + { + "epoch": 1.940852323314965, + "grad_norm": 3.2479816867411246, + "learning_rate": 4.5478572596601154e-08, + "loss": 0.8272, + "step": 10082 + }, + { + "epoch": 1.9410448299925402, + "grad_norm": 3.3150998372361475, + "learning_rate": 4.51820363925759e-08, + "loss": 0.8577, + "step": 10083 + }, + { + "epoch": 1.9412373366701159, + "grad_norm": 3.254625022717895, + "learning_rate": 4.4886467924215584e-08, + "loss": 0.8337, + "step": 10084 + }, + { + "epoch": 1.941429843347691, + "grad_norm": 3.228327176167047, + "learning_rate": 4.459186722025499e-08, + "loss": 0.8394, + "step": 10085 + }, + { + "epoch": 1.9416223500252665, + "grad_norm": 3.3950981044477597, + "learning_rate": 4.4298234309331224e-08, + "loss": 0.8483, + "step": 10086 + }, + { + "epoch": 1.941814856702842, + "grad_norm": 3.3142489393923906, + "learning_rate": 4.400556921998922e-08, + "loss": 0.8412, + "step": 10087 + }, + { + "epoch": 1.9420073633804171, + "grad_norm": 3.2547538438643744, + "learning_rate": 4.371387198068067e-08, + "loss": 0.828, + "step": 10088 + }, + { + "epoch": 1.9420073633804171, + "lm_loss": 0.7327, + "step": 10088, + "vm_loss": 0.1771 + }, + { + "epoch": 1.9420073633804171, + "lm_loss": 0.8187, + "step": 10088, + "vm_loss": 0.1217 + }, + { + "epoch": 1.9420073633804171, + "lm_loss": 0.8512, + "step": 10088, + "vm_loss": 0.1657 + }, + { + "epoch": 1.9420073633804171, + "lm_loss": 0.8304, + "step": 10088, + "vm_loss": 0.2053 + }, + { + "epoch": 1.9420073633804171, + "lm_loss": 0.8012, + "step": 10088, + "vm_loss": 0.1946 + }, + { + "epoch": 1.9420073633804171, + "lm_loss": 0.679, + "step": 10088, + "vm_loss": 0.143 + }, + { + "epoch": 1.9420073633804171, + "lm_loss": 0.3918, + "step": 10088, + "vm_loss": 0.1861 + }, + { + "epoch": 1.9420073633804171, + "lm_loss": 0.6492, + "step": 10088, + "vm_loss": 0.215 + }, + { + "epoch": 1.9421998700579928, + "grad_norm": 3.271133369606919, + "learning_rate": 4.342314261976177e-08, + "loss": 0.8464, + "step": 10089 + }, + { + "epoch": 1.942392376735568, + "grad_norm": 3.1352367547538695, + "learning_rate": 4.313338116549437e-08, + "loss": 0.8008, + "step": 10090 + }, + { + "epoch": 1.9425848834131434, + "grad_norm": 3.25802352719732, + "learning_rate": 4.284458764604926e-08, + "loss": 0.8104, + "step": 10091 + }, + { + "epoch": 1.9427773900907188, + "grad_norm": 3.2280681394410267, + "learning_rate": 4.255676208949844e-08, + "loss": 0.831, + "step": 10092 + }, + { + "epoch": 1.942969896768294, + "grad_norm": 3.558228797216585, + "learning_rate": 4.226990452382396e-08, + "loss": 0.8841, + "step": 10093 + }, + { + "epoch": 1.9431624034458697, + "grad_norm": 3.3898912323436314, + "learning_rate": 4.1984014976911293e-08, + "loss": 0.8543, + "step": 10094 + }, + { + "epoch": 1.9433549101234449, + "grad_norm": 3.4225633300778338, + "learning_rate": 4.1699093476551544e-08, + "loss": 0.8823, + "step": 10095 + }, + { + "epoch": 1.9435474168010203, + "grad_norm": 3.159583820201095, + "learning_rate": 4.141514005044589e-08, + "loss": 0.8061, + "step": 10096 + }, + { + "epoch": 1.9435474168010203, + "lm_loss": 0.5068, + "step": 10096, + "vm_loss": 0.198 + }, + { + "epoch": 1.9435474168010203, + "lm_loss": 0.5668, + "step": 10096, + "vm_loss": 0.1448 + }, + { + "epoch": 1.9435474168010203, + "lm_loss": 0.5469, + "step": 10096, + "vm_loss": 0.1852 + }, + { + "epoch": 1.9435474168010203, + "lm_loss": 0.392, + "step": 10096, + "vm_loss": 0.2203 + }, + { + "epoch": 1.9435474168010203, + "lm_loss": 0.9583, + "step": 10096, + "vm_loss": 0.1054 + }, + { + "epoch": 1.9435474168010203, + "lm_loss": 0.6466, + "step": 10096, + "vm_loss": 0.1357 + }, + { + "epoch": 1.9435474168010203, + "lm_loss": 0.9839, + "step": 10096, + "vm_loss": 0.0768 + }, + { + "epoch": 1.9435474168010203, + "lm_loss": 0.3741, + "step": 10096, + "vm_loss": 0.1322 + }, + { + "epoch": 1.9437399234785957, + "grad_norm": 3.507125190948868, + "learning_rate": 4.1132154726193364e-08, + "loss": 0.8778, + "step": 10097 + }, + { + "epoch": 1.943932430156171, + "grad_norm": 3.1920676319393144, + "learning_rate": 4.085013753130862e-08, + "loss": 0.813, + "step": 10098 + }, + { + "epoch": 1.9441249368337465, + "grad_norm": 3.1614246271362685, + "learning_rate": 4.0569088493204176e-08, + "loss": 0.816, + "step": 10099 + }, + { + "epoch": 1.9443174435113217, + "grad_norm": 3.235445689287706, + "learning_rate": 4.0289007639203736e-08, + "loss": 0.8248, + "step": 10100 + }, + { + "epoch": 1.9445099501888972, + "grad_norm": 3.2054400367484037, + "learning_rate": 4.000989499653218e-08, + "loss": 0.8427, + "step": 10101 + }, + { + "epoch": 1.9447024568664726, + "grad_norm": 3.323331805042728, + "learning_rate": 3.973175059232559e-08, + "loss": 0.8556, + "step": 10102 + }, + { + "epoch": 1.9448949635440478, + "grad_norm": 3.2620070208671272, + "learning_rate": 3.945457445362122e-08, + "loss": 0.8402, + "step": 10103 + }, + { + "epoch": 1.9450874702216234, + "grad_norm": 3.2572917031235606, + "learning_rate": 3.917836660736529e-08, + "loss": 0.8385, + "step": 10104 + }, + { + "epoch": 1.9450874702216234, + "lm_loss": 0.6567, + "step": 10104, + "vm_loss": 0.1573 + }, + { + "epoch": 1.9450874702216234, + "lm_loss": 0.4744, + "step": 10104, + "vm_loss": 0.175 + }, + { + "epoch": 1.9450874702216234, + "lm_loss": 0.6976, + "step": 10104, + "vm_loss": 0.1303 + }, + { + "epoch": 1.9450874702216234, + "lm_loss": 0.6968, + "step": 10104, + "vm_loss": 0.1418 + }, + { + "epoch": 1.9450874702216234, + "lm_loss": 0.6725, + "step": 10104, + "vm_loss": 0.1873 + }, + { + "epoch": 1.9450874702216234, + "lm_loss": 0.3953, + "step": 10104, + "vm_loss": 0.1356 + }, + { + "epoch": 1.9450874702216234, + "lm_loss": 0.6528, + "step": 10104, + "vm_loss": 0.1981 + }, + { + "epoch": 1.9450874702216234, + "lm_loss": 0.8002, + "step": 10104, + "vm_loss": 0.1677 + }, + { + "epoch": 1.9452799768991986, + "grad_norm": 3.263600797390648, + "learning_rate": 3.8903127080407446e-08, + "loss": 0.841, + "step": 10105 + }, + { + "epoch": 1.945472483576774, + "grad_norm": 3.2072187265620107, + "learning_rate": 3.8628855899506266e-08, + "loss": 0.8217, + "step": 10106 + }, + { + "epoch": 1.9456649902543495, + "grad_norm": 3.224795796927549, + "learning_rate": 3.835555309132266e-08, + "loss": 0.829, + "step": 10107 + }, + { + "epoch": 1.9458574969319247, + "grad_norm": 3.214690653187647, + "learning_rate": 3.808321868242648e-08, + "loss": 0.8478, + "step": 10108 + }, + { + "epoch": 1.9460500036095003, + "grad_norm": 3.0894031904245676, + "learning_rate": 3.7811852699291e-08, + "loss": 0.7829, + "step": 10109 + }, + { + "epoch": 1.9462425102870755, + "grad_norm": 3.276030170545593, + "learning_rate": 3.754145516829733e-08, + "loss": 0.8384, + "step": 10110 + }, + { + "epoch": 1.946435016964651, + "grad_norm": 3.153826866516344, + "learning_rate": 3.727202611573111e-08, + "loss": 0.8244, + "step": 10111 + }, + { + "epoch": 1.9466275236422264, + "grad_norm": 3.069215285630629, + "learning_rate": 3.700356556778473e-08, + "loss": 0.7866, + "step": 10112 + }, + { + "epoch": 1.9466275236422264, + "lm_loss": 0.6124, + "step": 10112, + "vm_loss": 0.1741 + }, + { + "epoch": 1.9466275236422264, + "lm_loss": 1.0928, + "step": 10112, + "vm_loss": 0.1032 + }, + { + "epoch": 1.9466275236422264, + "lm_loss": 0.7532, + "step": 10112, + "vm_loss": 0.1546 + }, + { + "epoch": 1.9466275236422264, + "lm_loss": 0.4381, + "step": 10112, + "vm_loss": 0.1908 + }, + { + "epoch": 1.9466275236422264, + "lm_loss": 0.328, + "step": 10112, + "vm_loss": 0.1919 + }, + { + "epoch": 1.9466275236422264, + "lm_loss": 0.509, + "step": 10112, + "vm_loss": 0.1484 + }, + { + "epoch": 1.9466275236422264, + "lm_loss": 0.9468, + "step": 10112, + "vm_loss": 0.1732 + }, + { + "epoch": 1.9466275236422264, + "lm_loss": 0.4347, + "step": 10112, + "vm_loss": 0.1285 + }, + { + "epoch": 1.9468200303198016, + "grad_norm": 3.4620028156184492, + "learning_rate": 3.6736073550556195e-08, + "loss": 0.9101, + "step": 10113 + }, + { + "epoch": 1.9470125369973772, + "grad_norm": 3.3082216244669422, + "learning_rate": 3.6469550090048043e-08, + "loss": 0.8545, + "step": 10114 + }, + { + "epoch": 1.9472050436749524, + "grad_norm": 3.260157406484515, + "learning_rate": 3.620399521217066e-08, + "loss": 0.8331, + "step": 10115 + }, + { + "epoch": 1.9473975503525278, + "grad_norm": 3.389742360017076, + "learning_rate": 3.5939408942738955e-08, + "loss": 0.8773, + "step": 10116 + }, + { + "epoch": 1.9475900570301032, + "grad_norm": 3.202245488634307, + "learning_rate": 3.567579130747456e-08, + "loss": 0.8243, + "step": 10117 + }, + { + "epoch": 1.9477825637076787, + "grad_norm": 3.364834556629764, + "learning_rate": 3.541314233200255e-08, + "loss": 0.8778, + "step": 10118 + }, + { + "epoch": 1.947975070385254, + "grad_norm": 3.143481608542886, + "learning_rate": 3.5151462041859154e-08, + "loss": 0.8149, + "step": 10119 + }, + { + "epoch": 1.9481675770628293, + "grad_norm": 3.233254059799246, + "learning_rate": 3.4890750462480695e-08, + "loss": 0.8151, + "step": 10120 + }, + { + "epoch": 1.9481675770628293, + "lm_loss": 1.0884, + "step": 10120, + "vm_loss": 0.2005 + }, + { + "epoch": 1.9481675770628293, + "lm_loss": 0.6766, + "step": 10120, + "vm_loss": 0.1226 + }, + { + "epoch": 1.9481675770628293, + "lm_loss": 0.7303, + "step": 10120, + "vm_loss": 0.1684 + }, + { + "epoch": 1.9481675770628293, + "lm_loss": 0.5767, + "step": 10120, + "vm_loss": 0.1803 + }, + { + "epoch": 1.9481675770628293, + "lm_loss": 0.5009, + "step": 10120, + "vm_loss": 0.224 + }, + { + "epoch": 1.9481675770628293, + "lm_loss": 0.4616, + "step": 10120, + "vm_loss": 0.2193 + }, + { + "epoch": 1.9481675770628293, + "lm_loss": 1.0803, + "step": 10120, + "vm_loss": 0.1534 + }, + { + "epoch": 1.9481675770628293, + "lm_loss": 0.9991, + "step": 10120, + "vm_loss": 0.1447 + }, + { + "epoch": 1.9483600837404047, + "grad_norm": 3.211727199026326, + "learning_rate": 3.463100761921245e-08, + "loss": 0.858, + "step": 10121 + }, + { + "epoch": 1.9485525904179801, + "grad_norm": 3.1663739986006947, + "learning_rate": 3.4372233537304236e-08, + "loss": 0.8063, + "step": 10122 + }, + { + "epoch": 1.9487450970955555, + "grad_norm": 3.1686360520601653, + "learning_rate": 3.411442824191369e-08, + "loss": 0.8354, + "step": 10123 + }, + { + "epoch": 1.948937603773131, + "grad_norm": 3.214496038640966, + "learning_rate": 3.385759175809966e-08, + "loss": 0.8113, + "step": 10124 + }, + { + "epoch": 1.9491301104507062, + "grad_norm": 3.241682633462666, + "learning_rate": 3.360172411083329e-08, + "loss": 0.8526, + "step": 10125 + }, + { + "epoch": 1.9493226171282816, + "grad_norm": 3.2212387952991834, + "learning_rate": 3.3346825324986896e-08, + "loss": 0.822, + "step": 10126 + }, + { + "epoch": 1.949515123805857, + "grad_norm": 3.215610096045776, + "learning_rate": 3.3092895425339553e-08, + "loss": 0.8031, + "step": 10127 + }, + { + "epoch": 1.9497076304834324, + "grad_norm": 3.3070457664905555, + "learning_rate": 3.283993443657596e-08, + "loss": 0.8749, + "step": 10128 + }, + { + "epoch": 1.9497076304834324, + "lm_loss": 0.5524, + "step": 10128, + "vm_loss": 0.1645 + }, + { + "epoch": 1.9497076304834324, + "lm_loss": 0.6875, + "step": 10128, + "vm_loss": 0.2007 + }, + { + "epoch": 1.9497076304834324, + "lm_loss": 0.8999, + "step": 10128, + "vm_loss": 0.1801 + }, + { + "epoch": 1.9497076304834324, + "lm_loss": 0.4945, + "step": 10128, + "vm_loss": 0.229 + }, + { + "epoch": 1.9497076304834324, + "lm_loss": 0.4811, + "step": 10128, + "vm_loss": 0.1668 + }, + { + "epoch": 1.9497076304834324, + "lm_loss": 0.9966, + "step": 10128, + "vm_loss": 0.1257 + }, + { + "epoch": 1.9497076304834324, + "lm_loss": 0.6103, + "step": 10128, + "vm_loss": 0.2317 + }, + { + "epoch": 1.9497076304834324, + "lm_loss": 0.4787, + "step": 10128, + "vm_loss": 0.1549 + }, + { + "epoch": 1.9499001371610079, + "grad_norm": 3.2293690580001324, + "learning_rate": 3.2587942383288664e-08, + "loss": 0.8323, + "step": 10129 + }, + { + "epoch": 1.950092643838583, + "grad_norm": 3.326533853363896, + "learning_rate": 3.233691928997362e-08, + "loss": 0.8557, + "step": 10130 + }, + { + "epoch": 1.9502851505161587, + "grad_norm": 3.146781148874096, + "learning_rate": 3.208686518103465e-08, + "loss": 0.8251, + "step": 10131 + }, + { + "epoch": 1.950477657193734, + "grad_norm": 3.256698827587504, + "learning_rate": 3.183778008077787e-08, + "loss": 0.8736, + "step": 10132 + }, + { + "epoch": 1.9506701638713093, + "grad_norm": 3.4029389029264325, + "learning_rate": 3.158966401341945e-08, + "loss": 0.8987, + "step": 10133 + }, + { + "epoch": 1.9508626705488847, + "grad_norm": 3.3123194247520926, + "learning_rate": 3.1342517003079e-08, + "loss": 0.8339, + "step": 10134 + }, + { + "epoch": 1.95105517722646, + "grad_norm": 3.2011050721371777, + "learning_rate": 3.109633907378173e-08, + "loss": 0.8205, + "step": 10135 + }, + { + "epoch": 1.9512476839040356, + "grad_norm": 3.187207986100193, + "learning_rate": 3.0851130249460737e-08, + "loss": 0.8746, + "step": 10136 + }, + { + "epoch": 1.9512476839040356, + "lm_loss": 0.8042, + "step": 10136, + "vm_loss": 0.1382 + }, + { + "epoch": 1.9512476839040356, + "lm_loss": 0.4996, + "step": 10136, + "vm_loss": 0.2241 + }, + { + "epoch": 1.9512476839040356, + "lm_loss": 0.3203, + "step": 10136, + "vm_loss": 0.119 + }, + { + "epoch": 1.9512476839040356, + "lm_loss": 0.817, + "step": 10136, + "vm_loss": 0.1825 + }, + { + "epoch": 1.9512476839040356, + "lm_loss": 0.8257, + "step": 10136, + "vm_loss": 0.1084 + }, + { + "epoch": 1.9512476839040356, + "lm_loss": 0.7018, + "step": 10136, + "vm_loss": 0.1325 + }, + { + "epoch": 1.9512476839040356, + "lm_loss": 0.5891, + "step": 10136, + "vm_loss": 0.1567 + }, + { + "epoch": 1.9512476839040356, + "lm_loss": 0.6358, + "step": 10136, + "vm_loss": 0.1761 + }, + { + "epoch": 1.9514401905816108, + "grad_norm": 3.128864839202728, + "learning_rate": 3.0606890553951384e-08, + "loss": 0.8242, + "step": 10137 + }, + { + "epoch": 1.9516326972591862, + "grad_norm": 3.365609096739883, + "learning_rate": 3.036362001099913e-08, + "loss": 0.8602, + "step": 10138 + }, + { + "epoch": 1.9518252039367616, + "grad_norm": 3.225548835249849, + "learning_rate": 3.012131864425061e-08, + "loss": 0.8303, + "step": 10139 + }, + { + "epoch": 1.9520177106143368, + "grad_norm": 3.3427118605209136, + "learning_rate": 2.987998647726365e-08, + "loss": 0.8636, + "step": 10140 + }, + { + "epoch": 1.9522102172919125, + "grad_norm": 3.2588357346657886, + "learning_rate": 2.963962353349503e-08, + "loss": 0.8466, + "step": 10141 + }, + { + "epoch": 1.9524027239694877, + "grad_norm": 3.3131573918408135, + "learning_rate": 2.9400229836314965e-08, + "loss": 0.8569, + "step": 10142 + }, + { + "epoch": 1.952595230647063, + "grad_norm": 3.2235368958643087, + "learning_rate": 2.9161805408992606e-08, + "loss": 0.8111, + "step": 10143 + }, + { + "epoch": 1.9527877373246385, + "grad_norm": 3.2608283611262268, + "learning_rate": 2.8924350274708303e-08, + "loss": 0.8679, + "step": 10144 + }, + { + "epoch": 1.9527877373246385, + "lm_loss": 0.4325, + "step": 10144, + "vm_loss": 0.1834 + }, + { + "epoch": 1.9527877373246385, + "lm_loss": 0.4472, + "step": 10144, + "vm_loss": 0.1386 + }, + { + "epoch": 1.9527877373246385, + "lm_loss": 0.7697, + "step": 10144, + "vm_loss": 0.2094 + }, + { + "epoch": 1.9527877373246385, + "lm_loss": 0.4781, + "step": 10144, + "vm_loss": 0.1242 + }, + { + "epoch": 1.9527877373246385, + "lm_loss": 0.752, + "step": 10144, + "vm_loss": 0.1864 + }, + { + "epoch": 1.9527877373246385, + "lm_loss": 0.8934, + "step": 10144, + "vm_loss": 0.1758 + }, + { + "epoch": 1.9527877373246385, + "lm_loss": 0.6567, + "step": 10144, + "vm_loss": 0.1644 + }, + { + "epoch": 1.9527877373246385, + "lm_loss": 0.6762, + "step": 10144, + "vm_loss": 0.1592 + }, + { + "epoch": 1.9529802440022137, + "grad_norm": 3.206351114458451, + "learning_rate": 2.868786445654359e-08, + "loss": 0.8153, + "step": 10145 + }, + { + "epoch": 1.9531727506797893, + "grad_norm": 3.341064358288513, + "learning_rate": 2.845234797748897e-08, + "loss": 0.855, + "step": 10146 + }, + { + "epoch": 1.9533652573573645, + "grad_norm": 3.2374080580376674, + "learning_rate": 2.8217800860439458e-08, + "loss": 0.8456, + "step": 10147 + }, + { + "epoch": 1.95355776403494, + "grad_norm": 3.234795006105892, + "learning_rate": 2.798422312819571e-08, + "loss": 0.8245, + "step": 10148 + }, + { + "epoch": 1.9537502707125154, + "grad_norm": 3.2679885685405994, + "learning_rate": 2.775161480346511e-08, + "loss": 0.848, + "step": 10149 + }, + { + "epoch": 1.9539427773900906, + "grad_norm": 3.208781692450976, + "learning_rate": 2.7519975908859576e-08, + "loss": 0.8482, + "step": 10150 + }, + { + "epoch": 1.9541352840676662, + "grad_norm": 3.1018058658870147, + "learning_rate": 2.7289306466897758e-08, + "loss": 0.7718, + "step": 10151 + }, + { + "epoch": 1.9543277907452414, + "grad_norm": 3.051579520195501, + "learning_rate": 2.7059606500003945e-08, + "loss": 0.8079, + "step": 10152 + }, + { + "epoch": 1.9543277907452414, + "lm_loss": 0.888, + "step": 10152, + "vm_loss": 0.2029 + }, + { + "epoch": 1.9543277907452414, + "lm_loss": 0.82, + "step": 10152, + "vm_loss": 0.1361 + }, + { + "epoch": 1.9543277907452414, + "lm_loss": 0.6736, + "step": 10152, + "vm_loss": 0.2013 + }, + { + "epoch": 1.9543277907452414, + "lm_loss": 0.772, + "step": 10152, + "vm_loss": 0.1668 + }, + { + "epoch": 1.9543277907452414, + "lm_loss": 0.7722, + "step": 10152, + "vm_loss": 0.1546 + }, + { + "epoch": 1.9543277907452414, + "lm_loss": 0.9116, + "step": 10152, + "vm_loss": 0.2154 + }, + { + "epoch": 1.9543277907452414, + "lm_loss": 0.6498, + "step": 10152, + "vm_loss": 0.1774 + }, + { + "epoch": 1.9543277907452414, + "lm_loss": 0.4299, + "step": 10152, + "vm_loss": 0.1735 + }, + { + "epoch": 1.9545202974228169, + "grad_norm": 3.2617761787851984, + "learning_rate": 2.6830876030506935e-08, + "loss": 0.8375, + "step": 10153 + }, + { + "epoch": 1.9547128041003923, + "grad_norm": 3.389546250088972, + "learning_rate": 2.6603115080643393e-08, + "loss": 0.8456, + "step": 10154 + }, + { + "epoch": 1.9549053107779675, + "grad_norm": 3.148277664659818, + "learning_rate": 2.6376323672553383e-08, + "loss": 0.8315, + "step": 10155 + }, + { + "epoch": 1.9550978174555431, + "grad_norm": 3.096522621588826, + "learning_rate": 2.6150501828284826e-08, + "loss": 0.8085, + "step": 10156 + }, + { + "epoch": 1.9552903241331183, + "grad_norm": 3.0705434290009443, + "learning_rate": 2.5925649569791266e-08, + "loss": 0.8087, + "step": 10157 + }, + { + "epoch": 1.9554828308106937, + "grad_norm": 3.13724463343224, + "learning_rate": 2.5701766918929672e-08, + "loss": 0.8028, + "step": 10158 + }, + { + "epoch": 1.9556753374882692, + "grad_norm": 3.207366228304054, + "learning_rate": 2.547885389746485e-08, + "loss": 0.8527, + "step": 10159 + }, + { + "epoch": 1.9558678441658444, + "grad_norm": 3.400290780938795, + "learning_rate": 2.5256910527066136e-08, + "loss": 0.8451, + "step": 10160 + }, + { + "epoch": 1.9558678441658444, + "lm_loss": 0.4303, + "step": 10160, + "vm_loss": 0.1774 + }, + { + "epoch": 1.9558678441658444, + "lm_loss": 0.5178, + "step": 10160, + "vm_loss": 0.1101 + }, + { + "epoch": 1.9558678441658444, + "lm_loss": 0.5244, + "step": 10160, + "vm_loss": 0.2812 + }, + { + "epoch": 1.9558678441658444, + "lm_loss": 0.5143, + "step": 10160, + "vm_loss": 0.1389 + }, + { + "epoch": 1.9558678441658444, + "lm_loss": 0.6512, + "step": 10160, + "vm_loss": 0.1473 + }, + { + "epoch": 1.9558678441658444, + "lm_loss": 0.4822, + "step": 10160, + "vm_loss": 0.228 + }, + { + "epoch": 1.9558678441658444, + "lm_loss": 0.6638, + "step": 10160, + "vm_loss": 0.1678 + }, + { + "epoch": 1.9558678441658444, + "lm_loss": 0.8131, + "step": 10160, + "vm_loss": 0.1511 + }, + { + "epoch": 1.95606035084342, + "grad_norm": 3.130830682759224, + "learning_rate": 2.5035936829310715e-08, + "loss": 0.8203, + "step": 10161 + }, + { + "epoch": 1.9562528575209952, + "grad_norm": 3.2748870706060855, + "learning_rate": 2.4815932825679177e-08, + "loss": 0.8311, + "step": 10162 + }, + { + "epoch": 1.9564453641985706, + "grad_norm": 3.14441595152886, + "learning_rate": 2.4596898537558868e-08, + "loss": 0.8117, + "step": 10163 + }, + { + "epoch": 1.956637870876146, + "grad_norm": 3.2182987337532265, + "learning_rate": 2.4378833986242745e-08, + "loss": 0.8529, + "step": 10164 + }, + { + "epoch": 1.9568303775537212, + "grad_norm": 3.198950858321288, + "learning_rate": 2.4161739192929413e-08, + "loss": 0.8091, + "step": 10165 + }, + { + "epoch": 1.9570228842312969, + "grad_norm": 3.1616524660278964, + "learning_rate": 2.39456141787231e-08, + "loss": 0.8138, + "step": 10166 + }, + { + "epoch": 1.957215390908872, + "grad_norm": 3.2708745002144646, + "learning_rate": 2.3730458964633663e-08, + "loss": 0.8255, + "step": 10167 + }, + { + "epoch": 1.9574078975864475, + "grad_norm": 3.2280621797080347, + "learning_rate": 2.3516273571577708e-08, + "loss": 0.8025, + "step": 10168 + }, + { + "epoch": 1.9574078975864475, + "lm_loss": 0.8537, + "step": 10168, + "vm_loss": 0.1948 + }, + { + "epoch": 1.9574078975864475, + "lm_loss": 0.7026, + "step": 10168, + "vm_loss": 0.1021 + }, + { + "epoch": 1.9574078975864475, + "lm_loss": 0.8448, + "step": 10168, + "vm_loss": 0.1231 + }, + { + "epoch": 1.9574078975864475, + "lm_loss": 0.5465, + "step": 10168, + "vm_loss": 0.1784 + }, + { + "epoch": 1.9574078975864475, + "lm_loss": 0.7371, + "step": 10168, + "vm_loss": 0.2035 + }, + { + "epoch": 1.9574078975864475, + "lm_loss": 0.6394, + "step": 10168, + "vm_loss": 0.1854 + }, + { + "epoch": 1.9574078975864475, + "lm_loss": 0.7971, + "step": 10168, + "vm_loss": 0.121 + }, + { + "epoch": 1.9574078975864475, + "lm_loss": 1.1849, + "step": 10168, + "vm_loss": 0.1744 + }, + { + "epoch": 1.957600404264023, + "grad_norm": 3.06664061265489, + "learning_rate": 2.3303058020376356e-08, + "loss": 0.8198, + "step": 10169 + }, + { + "epoch": 1.9577929109415981, + "grad_norm": 3.167473682828059, + "learning_rate": 2.309081233175747e-08, + "loss": 0.7841, + "step": 10170 + }, + { + "epoch": 1.9579854176191738, + "grad_norm": 3.498972712644818, + "learning_rate": 2.287953652635344e-08, + "loss": 0.8753, + "step": 10171 + }, + { + "epoch": 1.958177924296749, + "grad_norm": 3.2354186918751338, + "learning_rate": 2.2669230624702277e-08, + "loss": 0.8358, + "step": 10172 + }, + { + "epoch": 1.9583704309743244, + "grad_norm": 3.2327939391128457, + "learning_rate": 2.2459894647250956e-08, + "loss": 0.8227, + "step": 10173 + }, + { + "epoch": 1.9585629376518998, + "grad_norm": 3.1299842956838644, + "learning_rate": 2.225152861434654e-08, + "loss": 0.7831, + "step": 10174 + }, + { + "epoch": 1.958755444329475, + "grad_norm": 3.2964602800033216, + "learning_rate": 2.2044132546246156e-08, + "loss": 0.8349, + "step": 10175 + }, + { + "epoch": 1.9589479510070507, + "grad_norm": 3.1965252003900066, + "learning_rate": 2.183770646311145e-08, + "loss": 0.817, + "step": 10176 + }, + { + "epoch": 1.9589479510070507, + "lm_loss": 0.5856, + "step": 10176, + "vm_loss": 0.1725 + }, + { + "epoch": 1.9589479510070507, + "lm_loss": 0.7886, + "step": 10176, + "vm_loss": 0.1525 + }, + { + "epoch": 1.9589479510070507, + "lm_loss": 0.422, + "step": 10176, + "vm_loss": 0.2509 + }, + { + "epoch": 1.9589479510070507, + "lm_loss": 0.5699, + "step": 10176, + "vm_loss": 0.2431 + }, + { + "epoch": 1.9589479510070507, + "lm_loss": 1.1904, + "step": 10176, + "vm_loss": 0.1755 + }, + { + "epoch": 1.9589479510070507, + "lm_loss": 0.7439, + "step": 10176, + "vm_loss": 0.128 + }, + { + "epoch": 1.9589479510070507, + "lm_loss": 0.6648, + "step": 10176, + "vm_loss": 0.1854 + }, + { + "epoch": 1.9589479510070507, + "lm_loss": 0.5658, + "step": 10176, + "vm_loss": 0.1602 + }, + { + "epoch": 1.9591404576846259, + "grad_norm": 3.2230068015229763, + "learning_rate": 2.163225038501082e-08, + "loss": 0.8382, + "step": 10177 + }, + { + "epoch": 1.9593329643622013, + "grad_norm": 3.377070999729219, + "learning_rate": 2.142776433191496e-08, + "loss": 0.8503, + "step": 10178 + }, + { + "epoch": 1.9595254710397767, + "grad_norm": 3.2984859546716576, + "learning_rate": 2.1224248323703512e-08, + "loss": 0.8191, + "step": 10179 + }, + { + "epoch": 1.9597179777173521, + "grad_norm": 3.1991944841022906, + "learning_rate": 2.102170238016177e-08, + "loss": 0.8266, + "step": 10180 + }, + { + "epoch": 1.9599104843949275, + "grad_norm": 3.503770276607305, + "learning_rate": 2.0820126520978425e-08, + "loss": 0.8803, + "step": 10181 + }, + { + "epoch": 1.9601029910725027, + "grad_norm": 3.0893165955559643, + "learning_rate": 2.0619520765750022e-08, + "loss": 0.8125, + "step": 10182 + }, + { + "epoch": 1.9602954977500782, + "grad_norm": 3.2377109115152467, + "learning_rate": 2.0419885133976525e-08, + "loss": 0.8383, + "step": 10183 + }, + { + "epoch": 1.9604880044276536, + "grad_norm": 3.253946682056954, + "learning_rate": 2.0221219645067956e-08, + "loss": 0.8203, + "step": 10184 + }, + { + "epoch": 1.9604880044276536, + "lm_loss": 0.5413, + "step": 10184, + "vm_loss": 0.1347 + }, + { + "epoch": 1.9604880044276536, + "lm_loss": 0.6911, + "step": 10184, + "vm_loss": 0.1925 + }, + { + "epoch": 1.9604880044276536, + "lm_loss": 0.9271, + "step": 10184, + "vm_loss": 0.1562 + }, + { + "epoch": 1.9604880044276536, + "lm_loss": 0.3395, + "step": 10184, + "vm_loss": 0.1589 + }, + { + "epoch": 1.9604880044276536, + "lm_loss": 0.842, + "step": 10184, + "vm_loss": 0.1678 + }, + { + "epoch": 1.9604880044276536, + "lm_loss": 0.6407, + "step": 10184, + "vm_loss": 0.1628 + }, + { + "epoch": 1.9604880044276536, + "lm_loss": 0.5555, + "step": 10184, + "vm_loss": 0.124 + }, + { + "epoch": 1.9604880044276536, + "lm_loss": 0.4456, + "step": 10184, + "vm_loss": 0.1707 + }, + { + "epoch": 1.960680511105229, + "grad_norm": 3.202562127832382, + "learning_rate": 2.0023524318334432e-08, + "loss": 0.7775, + "step": 10185 + }, + { + "epoch": 1.9608730177828044, + "grad_norm": 3.3294954626682336, + "learning_rate": 1.9826799172996125e-08, + "loss": 0.8881, + "step": 10186 + }, + { + "epoch": 1.9610655244603796, + "grad_norm": 3.373220143029709, + "learning_rate": 1.9631044228176633e-08, + "loss": 0.8605, + "step": 10187 + }, + { + "epoch": 1.961258031137955, + "grad_norm": 3.0916739058517955, + "learning_rate": 1.9436259502906284e-08, + "loss": 0.7834, + "step": 10188 + }, + { + "epoch": 1.9614505378155305, + "grad_norm": 3.288469126668882, + "learning_rate": 1.9242445016118826e-08, + "loss": 0.8149, + "step": 10189 + }, + { + "epoch": 1.961643044493106, + "grad_norm": 3.422558579543728, + "learning_rate": 1.9049600786658073e-08, + "loss": 0.8883, + "step": 10190 + }, + { + "epoch": 1.9618355511706813, + "grad_norm": 3.277361239622352, + "learning_rate": 1.885772683326903e-08, + "loss": 0.827, + "step": 10191 + }, + { + "epoch": 1.9620280578482565, + "grad_norm": 3.29850755178506, + "learning_rate": 1.8666823174605665e-08, + "loss": 0.8558, + "step": 10192 + }, + { + "epoch": 1.9620280578482565, + "lm_loss": 0.8428, + "step": 10192, + "vm_loss": 0.1506 + }, + { + "epoch": 1.9620280578482565, + "lm_loss": 0.9339, + "step": 10192, + "vm_loss": 0.1561 + }, + { + "epoch": 1.9620280578482565, + "lm_loss": 0.5887, + "step": 10192, + "vm_loss": 0.1449 + }, + { + "epoch": 1.9620280578482565, + "lm_loss": 0.7009, + "step": 10192, + "vm_loss": 0.1745 + }, + { + "epoch": 1.9620280578482565, + "lm_loss": 0.8848, + "step": 10192, + "vm_loss": 0.1954 + }, + { + "epoch": 1.9620280578482565, + "lm_loss": 1.0809, + "step": 10192, + "vm_loss": 0.1045 + }, + { + "epoch": 1.9620280578482565, + "lm_loss": 0.6609, + "step": 10192, + "vm_loss": 0.1531 + }, + { + "epoch": 1.9620280578482565, + "lm_loss": 0.5582, + "step": 10192, + "vm_loss": 0.1224 + }, + { + "epoch": 1.9622205645258322, + "grad_norm": 3.349223870528647, + "learning_rate": 1.8476889829225352e-08, + "loss": 0.8317, + "step": 10193 + }, + { + "epoch": 1.9624130712034074, + "grad_norm": 3.18565841403218, + "learning_rate": 1.8287926815592217e-08, + "loss": 0.7681, + "step": 10194 + }, + { + "epoch": 1.9626055778809828, + "grad_norm": 3.4301533874208494, + "learning_rate": 1.8099934152076003e-08, + "loss": 0.8626, + "step": 10195 + }, + { + "epoch": 1.9627980845585582, + "grad_norm": 3.1925917490230042, + "learning_rate": 1.7912911856952098e-08, + "loss": 0.8263, + "step": 10196 + }, + { + "epoch": 1.9629905912361334, + "grad_norm": 3.4370946038941366, + "learning_rate": 1.7726859948401508e-08, + "loss": 0.8632, + "step": 10197 + }, + { + "epoch": 1.963183097913709, + "grad_norm": 3.4484958181406515, + "learning_rate": 1.7541778444509773e-08, + "loss": 0.8993, + "step": 10198 + }, + { + "epoch": 1.9633756045912842, + "grad_norm": 3.163027131757195, + "learning_rate": 1.735766736327138e-08, + "loss": 0.8003, + "step": 10199 + }, + { + "epoch": 1.9635681112688597, + "grad_norm": 3.1013512748531964, + "learning_rate": 1.717452672258202e-08, + "loss": 0.8072, + "step": 10200 + }, + { + "epoch": 1.9635681112688597, + "lm_loss": 0.7457, + "step": 10200, + "vm_loss": 0.1586 + }, + { + "epoch": 1.9635681112688597, + "lm_loss": 0.2945, + "step": 10200, + "vm_loss": 0.128 + }, + { + "epoch": 1.9635681112688597, + "lm_loss": 0.5406, + "step": 10200, + "vm_loss": 0.1336 + }, + { + "epoch": 1.9635681112688597, + "lm_loss": 0.7136, + "step": 10200, + "vm_loss": 0.1555 + }, + { + "epoch": 1.9635681112688597, + "lm_loss": 0.456, + "step": 10200, + "vm_loss": 0.1227 + }, + { + "epoch": 1.9635681112688597, + "lm_loss": 0.5551, + "step": 10200, + "vm_loss": 0.1637 + }, + { + "epoch": 1.9635681112688597, + "lm_loss": 0.2932, + "step": 10200, + "vm_loss": 0.181 + }, + { + "epoch": 1.9635681112688597, + "lm_loss": 0.5556, + "step": 10200, + "vm_loss": 0.2372 + }, + { + "epoch": 1.963760617946435, + "grad_norm": 3.1788950564569074, + "learning_rate": 1.6992356540247446e-08, + "loss": 0.8226, + "step": 10201 + }, + { + "epoch": 1.9639531246240103, + "grad_norm": 3.1253371178934324, + "learning_rate": 1.6811156833974605e-08, + "loss": 0.7988, + "step": 10202 + }, + { + "epoch": 1.964145631301586, + "grad_norm": 3.324519195360958, + "learning_rate": 1.663092762137941e-08, + "loss": 0.8552, + "step": 10203 + }, + { + "epoch": 1.9643381379791611, + "grad_norm": 3.01185292621742, + "learning_rate": 1.64516689199834e-08, + "loss": 0.7661, + "step": 10204 + }, + { + "epoch": 1.9645306446567365, + "grad_norm": 3.19546864611251, + "learning_rate": 1.627338074721263e-08, + "loss": 0.8266, + "step": 10205 + }, + { + "epoch": 1.964723151334312, + "grad_norm": 3.2437347023354377, + "learning_rate": 1.6096063120396576e-08, + "loss": 0.8578, + "step": 10206 + }, + { + "epoch": 1.9649156580118872, + "grad_norm": 3.2013652616262984, + "learning_rate": 1.5919716056775892e-08, + "loss": 0.7826, + "step": 10207 + }, + { + "epoch": 1.9651081646894628, + "grad_norm": 3.2535518492923075, + "learning_rate": 1.574433957349242e-08, + "loss": 0.8481, + "step": 10208 + }, + { + "epoch": 1.9651081646894628, + "lm_loss": 0.9669, + "step": 10208, + "vm_loss": 0.1234 + }, + { + "epoch": 1.9651081646894628, + "lm_loss": 0.8535, + "step": 10208, + "vm_loss": 0.1392 + }, + { + "epoch": 1.9651081646894628, + "lm_loss": 0.4212, + "step": 10208, + "vm_loss": 0.1495 + }, + { + "epoch": 1.9651081646894628, + "lm_loss": 0.5655, + "step": 10208, + "vm_loss": 0.1137 + }, + { + "epoch": 1.9651081646894628, + "lm_loss": 0.8385, + "step": 10208, + "vm_loss": 0.1871 + }, + { + "epoch": 1.9651081646894628, + "lm_loss": 0.5437, + "step": 10208, + "vm_loss": 0.1299 + }, + { + "epoch": 1.9651081646894628, + "lm_loss": 0.9348, + "step": 10208, + "vm_loss": 0.1149 + }, + { + "epoch": 1.9651081646894628, + "lm_loss": 0.6903, + "step": 10208, + "vm_loss": 0.1546 + }, + { + "epoch": 1.965300671367038, + "grad_norm": 3.2577807949095368, + "learning_rate": 1.556993368759585e-08, + "loss": 0.8217, + "step": 10209 + }, + { + "epoch": 1.9654931780446134, + "grad_norm": 3.1439382090766834, + "learning_rate": 1.5396498416038186e-08, + "loss": 0.8252, + "step": 10210 + }, + { + "epoch": 1.9656856847221889, + "grad_norm": 3.2243412725068956, + "learning_rate": 1.52240337756826e-08, + "loss": 0.8423, + "step": 10211 + }, + { + "epoch": 1.965878191399764, + "grad_norm": 3.2922122216103196, + "learning_rate": 1.5052539783292353e-08, + "loss": 0.8403, + "step": 10212 + }, + { + "epoch": 1.9660706980773397, + "grad_norm": 3.329557140736555, + "learning_rate": 1.4882016455540772e-08, + "loss": 0.8794, + "step": 10213 + }, + { + "epoch": 1.966263204754915, + "grad_norm": 3.2694591128578088, + "learning_rate": 1.4712463809004596e-08, + "loss": 0.8317, + "step": 10214 + }, + { + "epoch": 1.9664557114324903, + "grad_norm": 3.248884120966567, + "learning_rate": 1.4543881860165088e-08, + "loss": 0.8457, + "step": 10215 + }, + { + "epoch": 1.9666482181100657, + "grad_norm": 3.164956959347906, + "learning_rate": 1.437627062541136e-08, + "loss": 0.8018, + "step": 10216 + }, + { + "epoch": 1.9666482181100657, + "lm_loss": 0.4372, + "step": 10216, + "vm_loss": 0.1404 + }, + { + "epoch": 1.9666482181100657, + "lm_loss": 0.7794, + "step": 10216, + "vm_loss": 0.1914 + }, + { + "epoch": 1.9666482181100657, + "lm_loss": 0.8622, + "step": 10216, + "vm_loss": 0.1294 + }, + { + "epoch": 1.9666482181100657, + "lm_loss": 0.5169, + "step": 10216, + "vm_loss": 0.1855 + }, + { + "epoch": 1.9666482181100657, + "lm_loss": 0.7219, + "step": 10216, + "vm_loss": 0.204 + }, + { + "epoch": 1.9666482181100657, + "lm_loss": 0.6495, + "step": 10216, + "vm_loss": 0.1346 + }, + { + "epoch": 1.9666482181100657, + "lm_loss": 0.5811, + "step": 10216, + "vm_loss": 0.1229 + }, + { + "epoch": 1.9666482181100657, + "lm_loss": 0.5585, + "step": 10216, + "vm_loss": 0.1382 + }, + { + "epoch": 1.966840724787641, + "grad_norm": 3.3482625365034675, + "learning_rate": 1.4209630121039264e-08, + "loss": 0.8257, + "step": 10217 + }, + { + "epoch": 1.9670332314652166, + "grad_norm": 3.248368575866094, + "learning_rate": 1.4043960363244735e-08, + "loss": 0.8012, + "step": 10218 + }, + { + "epoch": 1.9672257381427918, + "grad_norm": 3.343227322493252, + "learning_rate": 1.3879261368135999e-08, + "loss": 0.8644, + "step": 10219 + }, + { + "epoch": 1.9674182448203672, + "grad_norm": 3.2652494759391955, + "learning_rate": 1.371553315172247e-08, + "loss": 0.8519, + "step": 10220 + }, + { + "epoch": 1.9676107514979426, + "grad_norm": 3.1809609814871243, + "learning_rate": 1.355277572992142e-08, + "loss": 0.8303, + "step": 10221 + }, + { + "epoch": 1.9678032581755178, + "grad_norm": 3.3971726500441566, + "learning_rate": 1.3390989118554631e-08, + "loss": 0.882, + "step": 10222 + }, + { + "epoch": 1.9679957648530935, + "grad_norm": 3.3313851744587692, + "learning_rate": 1.3230173333348417e-08, + "loss": 0.8406, + "step": 10223 + }, + { + "epoch": 1.9681882715306687, + "grad_norm": 3.3668217585292544, + "learning_rate": 1.3070328389939157e-08, + "loss": 0.8659, + "step": 10224 + }, + { + "epoch": 1.9681882715306687, + "lm_loss": 0.7855, + "step": 10224, + "vm_loss": 0.1344 + }, + { + "epoch": 1.9681882715306687, + "lm_loss": 0.6196, + "step": 10224, + "vm_loss": 0.156 + }, + { + "epoch": 1.9681882715306687, + "lm_loss": 0.6851, + "step": 10224, + "vm_loss": 0.2172 + }, + { + "epoch": 1.9681882715306687, + "lm_loss": 0.6548, + "step": 10224, + "vm_loss": 0.1959 + }, + { + "epoch": 1.9681882715306687, + "lm_loss": 1.0306, + "step": 10224, + "vm_loss": 0.1032 + }, + { + "epoch": 1.9681882715306687, + "lm_loss": 0.9352, + "step": 10224, + "vm_loss": 0.149 + }, + { + "epoch": 1.9681882715306687, + "lm_loss": 0.8014, + "step": 10224, + "vm_loss": 0.2218 + }, + { + "epoch": 1.9681882715306687, + "lm_loss": 0.5073, + "step": 10224, + "vm_loss": 0.2306 + }, + { + "epoch": 1.968380778208244, + "grad_norm": 3.2626960522704174, + "learning_rate": 1.2911454303863314e-08, + "loss": 0.8367, + "step": 10225 + }, + { + "epoch": 1.9685732848858195, + "grad_norm": 3.3376822624604934, + "learning_rate": 1.2753551090566307e-08, + "loss": 0.8271, + "step": 10226 + }, + { + "epoch": 1.9687657915633947, + "grad_norm": 3.159134516745942, + "learning_rate": 1.2596618765398083e-08, + "loss": 0.8115, + "step": 10227 + }, + { + "epoch": 1.9689582982409704, + "grad_norm": 3.287706200591301, + "learning_rate": 1.2440657343615325e-08, + "loss": 0.8354, + "step": 10228 + }, + { + "epoch": 1.9691508049185455, + "grad_norm": 3.124984658151881, + "learning_rate": 1.228566684037813e-08, + "loss": 0.7755, + "step": 10229 + }, + { + "epoch": 1.969343311596121, + "grad_norm": 3.371726410472818, + "learning_rate": 1.2131647270754443e-08, + "loss": 0.8585, + "step": 10230 + }, + { + "epoch": 1.9695358182736964, + "grad_norm": 3.122454071069264, + "learning_rate": 1.1978598649716733e-08, + "loss": 0.802, + "step": 10231 + }, + { + "epoch": 1.9697283249512716, + "grad_norm": 3.1684688914167434, + "learning_rate": 1.1826520992144208e-08, + "loss": 0.8123, + "step": 10232 + }, + { + "epoch": 1.9697283249512716, + "lm_loss": 0.8006, + "step": 10232, + "vm_loss": 0.214 + }, + { + "epoch": 1.9697283249512716, + "lm_loss": 0.632, + "step": 10232, + "vm_loss": 0.1461 + }, + { + "epoch": 1.9697283249512716, + "lm_loss": 1.0004, + "step": 10232, + "vm_loss": 0.1653 + }, + { + "epoch": 1.9697283249512716, + "lm_loss": 0.6753, + "step": 10232, + "vm_loss": 0.1727 + }, + { + "epoch": 1.9697283249512716, + "lm_loss": 0.7062, + "step": 10232, + "vm_loss": 0.2026 + }, + { + "epoch": 1.9697283249512716, + "lm_loss": 0.348, + "step": 10232, + "vm_loss": 0.157 + }, + { + "epoch": 1.9697283249512716, + "lm_loss": 0.4536, + "step": 10232, + "vm_loss": 0.2053 + }, + { + "epoch": 1.9697283249512716, + "lm_loss": 0.7613, + "step": 10232, + "vm_loss": 0.2098 + }, + { + "epoch": 1.9699208316288472, + "grad_norm": 3.104098898228278, + "learning_rate": 1.1675414312819488e-08, + "loss": 0.8349, + "step": 10233 + }, + { + "epoch": 1.9701133383064224, + "grad_norm": 3.3780090726454866, + "learning_rate": 1.1525278626431935e-08, + "loss": 0.858, + "step": 10234 + }, + { + "epoch": 1.9703058449839979, + "grad_norm": 3.069495782910486, + "learning_rate": 1.1376113947577649e-08, + "loss": 0.7911, + "step": 10235 + }, + { + "epoch": 1.9704983516615733, + "grad_norm": 3.2891369864121227, + "learning_rate": 1.1227920290757255e-08, + "loss": 0.8623, + "step": 10236 + }, + { + "epoch": 1.9706908583391485, + "grad_norm": 3.345186112844311, + "learning_rate": 1.1080697670375896e-08, + "loss": 0.8624, + "step": 10237 + }, + { + "epoch": 1.9708833650167241, + "grad_norm": 3.525090793930837, + "learning_rate": 1.0934446100746566e-08, + "loss": 0.9279, + "step": 10238 + }, + { + "epoch": 1.9710758716942993, + "grad_norm": 3.124773135436261, + "learning_rate": 1.0789165596086782e-08, + "loss": 0.7975, + "step": 10239 + }, + { + "epoch": 1.9712683783718747, + "grad_norm": 3.2598522818478672, + "learning_rate": 1.064485617051969e-08, + "loss": 0.8352, + "step": 10240 + }, + { + "epoch": 1.9712683783718747, + "lm_loss": 0.3821, + "step": 10240, + "vm_loss": 0.1801 + }, + { + "epoch": 1.9712683783718747, + "lm_loss": 0.539, + "step": 10240, + "vm_loss": 0.1265 + }, + { + "epoch": 1.9712683783718747, + "lm_loss": 0.7572, + "step": 10240, + "vm_loss": 0.1955 + }, + { + "epoch": 1.9712683783718747, + "lm_loss": 0.5064, + "step": 10240, + "vm_loss": 0.1084 + }, + { + "epoch": 1.9712683783718747, + "lm_loss": 1.0349, + "step": 10240, + "vm_loss": 0.1238 + }, + { + "epoch": 1.9712683783718747, + "lm_loss": 0.4137, + "step": 10240, + "vm_loss": 0.0911 + }, + { + "epoch": 1.9712683783718747, + "lm_loss": 0.4583, + "step": 10240, + "vm_loss": 0.2668 + }, + { + "epoch": 1.9712683783718747, + "lm_loss": 0.6371, + "step": 10240, + "vm_loss": 0.2232 + }, + { + "epoch": 1.9714608850494502, + "grad_norm": 3.288363565488047, + "learning_rate": 1.050151783807296e-08, + "loss": 0.8548, + "step": 10241 + }, + { + "epoch": 1.9716533917270256, + "grad_norm": 3.2925694024663748, + "learning_rate": 1.0359150612682112e-08, + "loss": 0.8807, + "step": 10242 + }, + { + "epoch": 1.971845898404601, + "grad_norm": 3.16333679507627, + "learning_rate": 1.0217754508187183e-08, + "loss": 0.8169, + "step": 10243 + }, + { + "epoch": 1.9720384050821762, + "grad_norm": 3.1850822637916933, + "learning_rate": 1.0077329538331626e-08, + "loss": 0.7628, + "step": 10244 + }, + { + "epoch": 1.9722309117597516, + "grad_norm": 3.222136683736353, + "learning_rate": 9.937875716770074e-09, + "loss": 0.8248, + "step": 10245 + }, + { + "epoch": 1.972423418437327, + "grad_norm": 3.022549407975293, + "learning_rate": 9.799393057056128e-09, + "loss": 0.7663, + "step": 10246 + }, + { + "epoch": 1.9726159251149025, + "grad_norm": 3.2308311320145915, + "learning_rate": 9.661881572652355e-09, + "loss": 0.8157, + "step": 10247 + }, + { + "epoch": 1.972808431792478, + "grad_norm": 3.1239226656656505, + "learning_rate": 9.525341276929167e-09, + "loss": 0.8128, + "step": 10248 + }, + { + "epoch": 1.972808431792478, + "lm_loss": 0.7372, + "step": 10248, + "vm_loss": 0.1565 + }, + { + "epoch": 1.972808431792478, + "lm_loss": 1.0928, + "step": 10248, + "vm_loss": 0.1632 + }, + { + "epoch": 1.972808431792478, + "lm_loss": 1.1286, + "step": 10248, + "vm_loss": 0.192 + }, + { + "epoch": 1.972808431792478, + "lm_loss": 0.6941, + "step": 10248, + "vm_loss": 0.1973 + }, + { + "epoch": 1.972808431792478, + "lm_loss": 0.4359, + "step": 10248, + "vm_loss": 0.1419 + }, + { + "epoch": 1.972808431792478, + "lm_loss": 0.3742, + "step": 10248, + "vm_loss": 0.1711 + }, + { + "epoch": 1.972808431792478, + "lm_loss": 0.6855, + "step": 10248, + "vm_loss": 0.1897 + }, + { + "epoch": 1.972808431792478, + "lm_loss": 0.3473, + "step": 10248, + "vm_loss": 0.1486 + }, + { + "epoch": 1.973000938470053, + "grad_norm": 3.2558960273622652, + "learning_rate": 9.389772183157064e-09, + "loss": 0.8398, + "step": 10249 + }, + { + "epoch": 1.9731934451476285, + "grad_norm": 3.266226389932565, + "learning_rate": 9.255174304516611e-09, + "loss": 0.7842, + "step": 10250 + }, + { + "epoch": 1.973385951825204, + "grad_norm": 3.29926940947336, + "learning_rate": 9.121547654091789e-09, + "loss": 0.8475, + "step": 10251 + }, + { + "epoch": 1.9735784585027794, + "grad_norm": 3.315875973625688, + "learning_rate": 8.988892244874425e-09, + "loss": 0.8708, + "step": 10252 + }, + { + "epoch": 1.9737709651803548, + "grad_norm": 3.0834907487715, + "learning_rate": 8.857208089758652e-09, + "loss": 0.7975, + "step": 10253 + }, + { + "epoch": 1.97396347185793, + "grad_norm": 3.2983948173305473, + "learning_rate": 8.726495201545338e-09, + "loss": 0.8462, + "step": 10254 + }, + { + "epoch": 1.9741559785355054, + "grad_norm": 3.5008527061621657, + "learning_rate": 8.596753592943208e-09, + "loss": 0.8811, + "step": 10255 + }, + { + "epoch": 1.9743484852130808, + "grad_norm": 3.2244309365477495, + "learning_rate": 8.467983276563285e-09, + "loss": 0.8253, + "step": 10256 + }, + { + "epoch": 1.9743484852130808, + "lm_loss": 0.9348, + "step": 10256, + "vm_loss": 0.1635 + }, + { + "epoch": 1.9743484852130808, + "lm_loss": 0.7365, + "step": 10256, + "vm_loss": 0.1465 + }, + { + "epoch": 1.9743484852130808, + "lm_loss": 0.5747, + "step": 10256, + "vm_loss": 0.1894 + }, + { + "epoch": 1.9743484852130808, + "lm_loss": 0.982, + "step": 10256, + "vm_loss": 0.1277 + }, + { + "epoch": 1.9743484852130808, + "lm_loss": 0.9199, + "step": 10256, + "vm_loss": 0.1351 + }, + { + "epoch": 1.9743484852130808, + "lm_loss": 0.7265, + "step": 10256, + "vm_loss": 0.1935 + }, + { + "epoch": 1.9743484852130808, + "lm_loss": 0.5097, + "step": 10256, + "vm_loss": 0.2047 + }, + { + "epoch": 1.9743484852130808, + "lm_loss": 0.8511, + "step": 10256, + "vm_loss": 0.1535 + }, + { + "epoch": 1.9745409918906562, + "grad_norm": 3.171550756440761, + "learning_rate": 8.340184264925555e-09, + "loss": 0.8221, + "step": 10257 + }, + { + "epoch": 1.9747334985682317, + "grad_norm": 3.226557052440261, + "learning_rate": 8.213356570452303e-09, + "loss": 0.8057, + "step": 10258 + }, + { + "epoch": 1.9749260052458069, + "grad_norm": 3.2514042950827373, + "learning_rate": 8.087500205472553e-09, + "loss": 0.8334, + "step": 10259 + }, + { + "epoch": 1.9751185119233825, + "grad_norm": 3.2403624841701526, + "learning_rate": 7.962615182222078e-09, + "loss": 0.8269, + "step": 10260 + }, + { + "epoch": 1.9753110186009577, + "grad_norm": 3.1835517355890492, + "learning_rate": 7.838701512840052e-09, + "loss": 0.8324, + "step": 10261 + }, + { + "epoch": 1.9755035252785331, + "grad_norm": 3.3599208788539783, + "learning_rate": 7.715759209373509e-09, + "loss": 0.8087, + "step": 10262 + }, + { + "epoch": 1.9756960319561085, + "grad_norm": 3.3042880082433514, + "learning_rate": 7.593788283775106e-09, + "loss": 0.8241, + "step": 10263 + }, + { + "epoch": 1.9758885386336837, + "grad_norm": 3.3454027302229434, + "learning_rate": 7.472788747899806e-09, + "loss": 0.8565, + "step": 10264 + }, + { + "epoch": 1.9758885386336837, + "lm_loss": 0.7951, + "step": 10264, + "vm_loss": 0.1608 + }, + { + "epoch": 1.9758885386336837, + "lm_loss": 0.7826, + "step": 10264, + "vm_loss": 0.2353 + }, + { + "epoch": 1.9758885386336837, + "lm_loss": 0.6852, + "step": 10264, + "vm_loss": 0.1997 + }, + { + "epoch": 1.9758885386336837, + "lm_loss": 0.3255, + "step": 10264, + "vm_loss": 0.1166 + }, + { + "epoch": 1.9758885386336837, + "lm_loss": 0.756, + "step": 10264, + "vm_loss": 0.1055 + }, + { + "epoch": 1.9758885386336837, + "lm_loss": 0.786, + "step": 10264, + "vm_loss": 0.1751 + }, + { + "epoch": 1.9758885386336837, + "lm_loss": 0.7667, + "step": 10264, + "vm_loss": 0.1228 + }, + { + "epoch": 1.9758885386336837, + "lm_loss": 0.4622, + "step": 10264, + "vm_loss": 0.1489 + }, + { + "epoch": 1.9760810453112594, + "grad_norm": 3.0452741802535757, + "learning_rate": 7.352760613510423e-09, + "loss": 0.7863, + "step": 10265 + }, + { + "epoch": 1.9762735519888346, + "grad_norm": 3.202450692236302, + "learning_rate": 7.233703892277622e-09, + "loss": 0.8467, + "step": 10266 + }, + { + "epoch": 1.97646605866641, + "grad_norm": 3.1561565482033687, + "learning_rate": 7.115618595772145e-09, + "loss": 0.8188, + "step": 10267 + }, + { + "epoch": 1.9766585653439854, + "grad_norm": 3.1538533029907616, + "learning_rate": 6.998504735475919e-09, + "loss": 0.8168, + "step": 10268 + }, + { + "epoch": 1.9768510720215606, + "grad_norm": 3.3838693872137138, + "learning_rate": 6.882362322772063e-09, + "loss": 0.9067, + "step": 10269 + }, + { + "epoch": 1.9770435786991363, + "grad_norm": 3.3483323662272118, + "learning_rate": 6.767191368952653e-09, + "loss": 0.8703, + "step": 10270 + }, + { + "epoch": 1.9772360853767115, + "grad_norm": 3.1963459426587577, + "learning_rate": 6.652991885212068e-09, + "loss": 0.803, + "step": 10271 + }, + { + "epoch": 1.977428592054287, + "grad_norm": 3.1553178914155118, + "learning_rate": 6.539763882653649e-09, + "loss": 0.8075, + "step": 10272 + }, + { + "epoch": 1.977428592054287, + "lm_loss": 0.6449, + "step": 10272, + "vm_loss": 0.1696 + }, + { + "epoch": 1.977428592054287, + "lm_loss": 0.9748, + "step": 10272, + "vm_loss": 0.1286 + }, + { + "epoch": 1.977428592054287, + "lm_loss": 0.3138, + "step": 10272, + "vm_loss": 0.1499 + }, + { + "epoch": 1.977428592054287, + "lm_loss": 0.4256, + "step": 10272, + "vm_loss": 0.1459 + }, + { + "epoch": 1.977428592054287, + "lm_loss": 0.7019, + "step": 10272, + "vm_loss": 0.2046 + }, + { + "epoch": 1.977428592054287, + "lm_loss": 0.9366, + "step": 10272, + "vm_loss": 0.1876 + }, + { + "epoch": 1.977428592054287, + "lm_loss": 0.5042, + "step": 10272, + "vm_loss": 0.1003 + }, + { + "epoch": 1.977428592054287, + "lm_loss": 0.4963, + "step": 10272, + "vm_loss": 0.1222 + }, + { + "epoch": 1.9776210987318623, + "grad_norm": 3.3462825031653223, + "learning_rate": 6.427507372283037e-09, + "loss": 0.8489, + "step": 10273 + }, + { + "epoch": 1.9778136054094375, + "grad_norm": 3.2860289450370157, + "learning_rate": 6.316222365014834e-09, + "loss": 0.8795, + "step": 10274 + }, + { + "epoch": 1.9780061120870132, + "grad_norm": 3.2593840235537304, + "learning_rate": 6.205908871665944e-09, + "loss": 0.8617, + "step": 10275 + }, + { + "epoch": 1.9781986187645884, + "grad_norm": 3.2826582733404597, + "learning_rate": 6.0965669029611205e-09, + "loss": 0.8416, + "step": 10276 + }, + { + "epoch": 1.9783911254421638, + "grad_norm": 3.2596252226643037, + "learning_rate": 5.988196469528529e-09, + "loss": 0.7999, + "step": 10277 + }, + { + "epoch": 1.9785836321197392, + "grad_norm": 3.076276653140899, + "learning_rate": 5.880797581904185e-09, + "loss": 0.7617, + "step": 10278 + }, + { + "epoch": 1.9787761387973144, + "grad_norm": 3.265425943029449, + "learning_rate": 5.774370250528627e-09, + "loss": 0.8102, + "step": 10279 + }, + { + "epoch": 1.97896864547489, + "grad_norm": 3.173618709191152, + "learning_rate": 5.668914485748022e-09, + "loss": 0.8411, + "step": 10280 + }, + { + "epoch": 1.97896864547489, + "lm_loss": 0.5742, + "step": 10280, + "vm_loss": 0.1818 + }, + { + "epoch": 1.97896864547489, + "lm_loss": 0.7841, + "step": 10280, + "vm_loss": 0.1628 + }, + { + "epoch": 1.97896864547489, + "lm_loss": 0.5466, + "step": 10280, + "vm_loss": 0.0883 + }, + { + "epoch": 1.97896864547489, + "lm_loss": 0.5518, + "step": 10280, + "vm_loss": 0.2193 + }, + { + "epoch": 1.97896864547489, + "lm_loss": 0.5762, + "step": 10280, + "vm_loss": 0.1721 + }, + { + "epoch": 1.97896864547489, + "lm_loss": 0.3086, + "step": 10280, + "vm_loss": 0.1401 + }, + { + "epoch": 1.97896864547489, + "lm_loss": 0.5317, + "step": 10280, + "vm_loss": 0.1979 + }, + { + "epoch": 1.97896864547489, + "lm_loss": 0.3185, + "step": 10280, + "vm_loss": 0.1659 + }, + { + "epoch": 1.9791611521524652, + "grad_norm": 3.2717339503674507, + "learning_rate": 5.564430297813062e-09, + "loss": 0.8324, + "step": 10281 + }, + { + "epoch": 1.9793536588300407, + "grad_norm": 3.316101689392939, + "learning_rate": 5.460917696882284e-09, + "loss": 0.8529, + "step": 10282 + }, + { + "epoch": 1.979546165507616, + "grad_norm": 3.267550552399221, + "learning_rate": 5.358376693017642e-09, + "loss": 0.8886, + "step": 10283 + }, + { + "epoch": 1.9797386721851913, + "grad_norm": 3.2156733526081287, + "learning_rate": 5.256807296187827e-09, + "loss": 0.8322, + "step": 10284 + }, + { + "epoch": 1.979931178862767, + "grad_norm": 3.4469724245984903, + "learning_rate": 5.156209516266053e-09, + "loss": 0.8639, + "step": 10285 + }, + { + "epoch": 1.9801236855403421, + "grad_norm": 3.3279532968208114, + "learning_rate": 5.056583363032275e-09, + "loss": 0.8547, + "step": 10286 + }, + { + "epoch": 1.9803161922179175, + "grad_norm": 3.3645429520649097, + "learning_rate": 4.957928846170967e-09, + "loss": 0.8652, + "step": 10287 + }, + { + "epoch": 1.980508698895493, + "grad_norm": 3.241975506559042, + "learning_rate": 4.860245975273348e-09, + "loss": 0.8114, + "step": 10288 + }, + { + "epoch": 1.980508698895493, + "lm_loss": 1.1911, + "step": 10288, + "vm_loss": 0.1036 + }, + { + "epoch": 1.980508698895493, + "lm_loss": 0.7126, + "step": 10288, + "vm_loss": 0.1838 + }, + { + "epoch": 1.980508698895493, + "lm_loss": 0.3572, + "step": 10288, + "vm_loss": 0.1652 + }, + { + "epoch": 1.980508698895493, + "lm_loss": 0.3932, + "step": 10288, + "vm_loss": 0.1762 + }, + { + "epoch": 1.980508698895493, + "lm_loss": 0.9089, + "step": 10288, + "vm_loss": 0.1599 + }, + { + "epoch": 1.980508698895493, + "lm_loss": 0.5353, + "step": 10288, + "vm_loss": 0.1859 + }, + { + "epoch": 1.980508698895493, + "lm_loss": 0.707, + "step": 10288, + "vm_loss": 0.199 + }, + { + "epoch": 1.980508698895493, + "lm_loss": 0.6611, + "step": 10288, + "vm_loss": 0.1412 + }, + { + "epoch": 1.9807012055730682, + "grad_norm": 3.2456576016656937, + "learning_rate": 4.763534759835153e-09, + "loss": 0.8479, + "step": 10289 + }, + { + "epoch": 1.9808937122506438, + "grad_norm": 3.290254551225311, + "learning_rate": 4.6677952092577525e-09, + "loss": 0.8887, + "step": 10290 + }, + { + "epoch": 1.981086218928219, + "grad_norm": 3.3586891231032743, + "learning_rate": 4.5730273328470355e-09, + "loss": 0.8324, + "step": 10291 + }, + { + "epoch": 1.9812787256057944, + "grad_norm": 3.5272546305254817, + "learning_rate": 4.4792311398178525e-09, + "loss": 0.8931, + "step": 10292 + }, + { + "epoch": 1.9814712322833699, + "grad_norm": 3.2049693464705302, + "learning_rate": 4.386406639287355e-09, + "loss": 0.8566, + "step": 10293 + }, + { + "epoch": 1.981663738960945, + "grad_norm": 3.0057599901725967, + "learning_rate": 4.2945538402794365e-09, + "loss": 0.7884, + "step": 10294 + }, + { + "epoch": 1.9818562456385207, + "grad_norm": 3.2166153382595417, + "learning_rate": 4.20367275172362e-09, + "loss": 0.8047, + "step": 10295 + }, + { + "epoch": 1.982048752316096, + "grad_norm": 3.2284488577300388, + "learning_rate": 4.11376338245395e-09, + "loss": 0.8246, + "step": 10296 + }, + { + "epoch": 1.982048752316096, + "lm_loss": 0.8484, + "step": 10296, + "vm_loss": 0.1439 + }, + { + "epoch": 1.982048752316096, + "lm_loss": 0.5269, + "step": 10296, + "vm_loss": 0.1415 + }, + { + "epoch": 1.982048752316096, + "lm_loss": 0.8258, + "step": 10296, + "vm_loss": 0.1802 + }, + { + "epoch": 1.982048752316096, + "lm_loss": 0.5883, + "step": 10296, + "vm_loss": 0.0965 + }, + { + "epoch": 1.982048752316096, + "lm_loss": 0.9753, + "step": 10296, + "vm_loss": 0.1006 + }, + { + "epoch": 1.982048752316096, + "lm_loss": 0.4661, + "step": 10296, + "vm_loss": 0.1736 + }, + { + "epoch": 1.982048752316096, + "lm_loss": 0.7675, + "step": 10296, + "vm_loss": 0.1161 + }, + { + "epoch": 1.982048752316096, + "lm_loss": 0.9277, + "step": 10296, + "vm_loss": 0.115 + }, + { + "epoch": 1.9822412589936713, + "grad_norm": 3.2106334566274106, + "learning_rate": 4.024825741212324e-09, + "loss": 0.8116, + "step": 10297 + }, + { + "epoch": 1.9824337656712467, + "grad_norm": 3.226807181677353, + "learning_rate": 3.936859836641826e-09, + "loss": 0.8142, + "step": 10298 + }, + { + "epoch": 1.982626272348822, + "grad_norm": 3.131980566722135, + "learning_rate": 3.849865677296727e-09, + "loss": 0.803, + "step": 10299 + }, + { + "epoch": 1.9828187790263976, + "grad_norm": 3.332934969562565, + "learning_rate": 3.763843271631373e-09, + "loss": 0.8338, + "step": 10300 + }, + { + "epoch": 1.9830112857039728, + "grad_norm": 3.2837185466607246, + "learning_rate": 3.6787926280112963e-09, + "loss": 0.8536, + "step": 10301 + }, + { + "epoch": 1.9832037923815482, + "grad_norm": 3.1764902479740234, + "learning_rate": 3.594713754702106e-09, + "loss": 0.8222, + "step": 10302 + }, + { + "epoch": 1.9833962990591236, + "grad_norm": 3.155282462746852, + "learning_rate": 3.5116066598783747e-09, + "loss": 0.8304, + "step": 10303 + }, + { + "epoch": 1.983588805736699, + "grad_norm": 3.1679409558615603, + "learning_rate": 3.4294713516180854e-09, + "loss": 0.8436, + "step": 10304 + }, + { + "epoch": 1.983588805736699, + "lm_loss": 0.5248, + "step": 10304, + "vm_loss": 0.1414 + }, + { + "epoch": 1.983588805736699, + "lm_loss": 0.5858, + "step": 10304, + "vm_loss": 0.1583 + }, + { + "epoch": 1.983588805736699, + "lm_loss": 0.5694, + "step": 10304, + "vm_loss": 0.1767 + }, + { + "epoch": 1.983588805736699, + "lm_loss": 0.5427, + "step": 10304, + "vm_loss": 0.202 + }, + { + "epoch": 1.983588805736699, + "lm_loss": 0.43, + "step": 10304, + "vm_loss": 0.1342 + }, + { + "epoch": 1.983588805736699, + "lm_loss": 0.6024, + "step": 10304, + "vm_loss": 0.1446 + }, + { + "epoch": 1.983588805736699, + "lm_loss": 0.6517, + "step": 10304, + "vm_loss": 0.1895 + }, + { + "epoch": 1.983588805736699, + "lm_loss": 0.4486, + "step": 10304, + "vm_loss": 0.1746 + }, + { + "epoch": 1.9837813124142745, + "grad_norm": 3.093616035277056, + "learning_rate": 3.348307837907072e-09, + "loss": 0.803, + "step": 10305 + }, + { + "epoch": 1.9839738190918497, + "grad_norm": 3.538443470696224, + "learning_rate": 3.2681161266356898e-09, + "loss": 0.9307, + "step": 10306 + }, + { + "epoch": 1.984166325769425, + "grad_norm": 3.2876091179167743, + "learning_rate": 3.1888962255988143e-09, + "loss": 0.8366, + "step": 10307 + }, + { + "epoch": 1.9843588324470005, + "grad_norm": 3.1854062953317075, + "learning_rate": 3.1106481424969524e-09, + "loss": 0.8386, + "step": 10308 + }, + { + "epoch": 1.984551339124576, + "grad_norm": 3.267671606002496, + "learning_rate": 3.033371884938463e-09, + "loss": 0.8041, + "step": 10309 + }, + { + "epoch": 1.9847438458021514, + "grad_norm": 3.408409421328903, + "learning_rate": 2.9570674604340046e-09, + "loss": 0.8645, + "step": 10310 + }, + { + "epoch": 1.9849363524797266, + "grad_norm": 3.1255932772739405, + "learning_rate": 2.881734876403197e-09, + "loss": 0.8077, + "step": 10311 + }, + { + "epoch": 1.985128859157302, + "grad_norm": 3.250726255756137, + "learning_rate": 2.807374140166852e-09, + "loss": 0.8137, + "step": 10312 + }, + { + "epoch": 1.985128859157302, + "lm_loss": 0.9023, + "step": 10312, + "vm_loss": 0.1408 + }, + { + "epoch": 1.985128859157302, + "lm_loss": 0.5499, + "step": 10312, + "vm_loss": 0.1842 + }, + { + "epoch": 1.985128859157302, + "lm_loss": 0.6769, + "step": 10312, + "vm_loss": 0.1349 + }, + { + "epoch": 1.985128859157302, + "lm_loss": 1.0874, + "step": 10312, + "vm_loss": 0.187 + }, + { + "epoch": 1.985128859157302, + "lm_loss": 0.405, + "step": 10312, + "vm_loss": 0.1743 + }, + { + "epoch": 1.985128859157302, + "lm_loss": 0.9951, + "step": 10312, + "vm_loss": 0.1758 + }, + { + "epoch": 1.985128859157302, + "lm_loss": 0.537, + "step": 10312, + "vm_loss": 0.1585 + }, + { + "epoch": 1.985128859157302, + "lm_loss": 0.513, + "step": 10312, + "vm_loss": 0.1712 + }, + { + "epoch": 1.9853213658348774, + "grad_norm": 3.1625704609042575, + "learning_rate": 2.7339852589569616e-09, + "loss": 0.822, + "step": 10313 + }, + { + "epoch": 1.9855138725124528, + "grad_norm": 3.4382298580330244, + "learning_rate": 2.6615682399044883e-09, + "loss": 0.8786, + "step": 10314 + }, + { + "epoch": 1.9857063791900282, + "grad_norm": 3.2080342224694225, + "learning_rate": 2.5901230900515773e-09, + "loss": 0.8206, + "step": 10315 + }, + { + "epoch": 1.9858988858676034, + "grad_norm": 3.2615351810331346, + "learning_rate": 2.519649816342673e-09, + "loss": 0.8478, + "step": 10316 + }, + { + "epoch": 1.9860913925451789, + "grad_norm": 3.329770608724312, + "learning_rate": 2.450148425628962e-09, + "loss": 0.8179, + "step": 10317 + }, + { + "epoch": 1.9862838992227543, + "grad_norm": 3.316899234656234, + "learning_rate": 2.3816189246672616e-09, + "loss": 0.837, + "step": 10318 + }, + { + "epoch": 1.9864764059003297, + "grad_norm": 3.2460592904816212, + "learning_rate": 2.31406132011891e-09, + "loss": 0.8129, + "step": 10319 + }, + { + "epoch": 1.9866689125779051, + "grad_norm": 3.3050644602280994, + "learning_rate": 2.247475618550876e-09, + "loss": 0.8333, + "step": 10320 + }, + { + "epoch": 1.9866689125779051, + "lm_loss": 0.4749, + "step": 10320, + "vm_loss": 0.1966 + }, + { + "epoch": 1.9866689125779051, + "lm_loss": 1.2338, + "step": 10320, + "vm_loss": 0.184 + }, + { + "epoch": 1.9866689125779051, + "lm_loss": 0.4694, + "step": 10320, + "vm_loss": 0.1417 + }, + { + "epoch": 1.9866689125779051, + "lm_loss": 0.7436, + "step": 10320, + "vm_loss": 0.1518 + }, + { + "epoch": 1.9866689125779051, + "lm_loss": 0.4122, + "step": 10320, + "vm_loss": 0.1415 + }, + { + "epoch": 1.9866689125779051, + "lm_loss": 0.4622, + "step": 10320, + "vm_loss": 0.2135 + }, + { + "epoch": 1.9866689125779051, + "lm_loss": 0.5574, + "step": 10320, + "vm_loss": 0.1617 + }, + { + "epoch": 1.9866689125779051, + "lm_loss": 0.6932, + "step": 10320, + "vm_loss": 0.1197 + }, + { + "epoch": 1.9868614192554803, + "grad_norm": 3.215475107716939, + "learning_rate": 2.181861826437981e-09, + "loss": 0.8053, + "step": 10321 + }, + { + "epoch": 1.987053925933056, + "grad_norm": 3.255744544623041, + "learning_rate": 2.1172199501573453e-09, + "loss": 0.8838, + "step": 10322 + }, + { + "epoch": 1.9872464326106312, + "grad_norm": 3.345130601404821, + "learning_rate": 2.0535499959917216e-09, + "loss": 0.8324, + "step": 10323 + }, + { + "epoch": 1.9874389392882066, + "grad_norm": 3.232323038434671, + "learning_rate": 1.9908519701339337e-09, + "loss": 0.8052, + "step": 10324 + }, + { + "epoch": 1.987631445965782, + "grad_norm": 3.2883399409649168, + "learning_rate": 1.929125878675775e-09, + "loss": 0.8242, + "step": 10325 + }, + { + "epoch": 1.9878239526433572, + "grad_norm": 3.2003250287087677, + "learning_rate": 1.868371727620222e-09, + "loss": 0.8589, + "step": 10326 + }, + { + "epoch": 1.9880164593209328, + "grad_norm": 3.1082088269988217, + "learning_rate": 1.8085895228725503e-09, + "loss": 0.8208, + "step": 10327 + }, + { + "epoch": 1.988208965998508, + "grad_norm": 3.4316102267662845, + "learning_rate": 1.7497792702436677e-09, + "loss": 0.8557, + "step": 10328 + }, + { + "epoch": 1.988208965998508, + "lm_loss": 0.694, + "step": 10328, + "vm_loss": 0.1319 + }, + { + "epoch": 1.988208965998508, + "lm_loss": 0.6742, + "step": 10328, + "vm_loss": 0.1884 + }, + { + "epoch": 1.988208965998508, + "lm_loss": 1.1165, + "step": 10328, + "vm_loss": 0.2022 + }, + { + "epoch": 1.988208965998508, + "lm_loss": 0.5041, + "step": 10328, + "vm_loss": 0.1798 + }, + { + "epoch": 1.988208965998508, + "lm_loss": 0.6939, + "step": 10328, + "vm_loss": 0.1193 + }, + { + "epoch": 1.988208965998508, + "lm_loss": 0.7195, + "step": 10328, + "vm_loss": 0.1222 + }, + { + "epoch": 1.988208965998508, + "lm_loss": 0.942, + "step": 10328, + "vm_loss": 0.1289 + }, + { + "epoch": 1.988208965998508, + "lm_loss": 0.695, + "step": 10328, + "vm_loss": 0.1403 + }, + { + "epoch": 1.9884014726760835, + "grad_norm": 3.2722569953632914, + "learning_rate": 1.6919409754512228e-09, + "loss": 0.8024, + "step": 10329 + }, + { + "epoch": 1.988593979353659, + "grad_norm": 3.2623332848424864, + "learning_rate": 1.6350746441173848e-09, + "loss": 0.8224, + "step": 10330 + }, + { + "epoch": 1.988786486031234, + "grad_norm": 3.34550357385071, + "learning_rate": 1.5791802817721746e-09, + "loss": 0.8518, + "step": 10331 + }, + { + "epoch": 1.9889789927088097, + "grad_norm": 3.3973080147039934, + "learning_rate": 1.5242578938468034e-09, + "loss": 0.8436, + "step": 10332 + }, + { + "epoch": 1.989171499386385, + "grad_norm": 3.115532979957815, + "learning_rate": 1.4703074856814437e-09, + "loss": 0.801, + "step": 10333 + }, + { + "epoch": 1.9893640060639604, + "grad_norm": 3.3441941543828406, + "learning_rate": 1.4173290625207891e-09, + "loss": 0.8841, + "step": 10334 + }, + { + "epoch": 1.9895565127415358, + "grad_norm": 3.416234636749379, + "learning_rate": 1.3653226295151646e-09, + "loss": 0.8598, + "step": 10335 + }, + { + "epoch": 1.989749019419111, + "grad_norm": 3.207004231361259, + "learning_rate": 1.314288191719415e-09, + "loss": 0.8037, + "step": 10336 + }, + { + "epoch": 1.989749019419111, + "lm_loss": 0.8757, + "step": 10336, + "vm_loss": 0.1926 + }, + { + "epoch": 1.989749019419111, + "lm_loss": 0.5764, + "step": 10336, + "vm_loss": 0.2512 + }, + { + "epoch": 1.989749019419111, + "lm_loss": 0.7766, + "step": 10336, + "vm_loss": 0.1338 + }, + { + "epoch": 1.989749019419111, + "lm_loss": 0.4248, + "step": 10336, + "vm_loss": 0.2268 + }, + { + "epoch": 1.989749019419111, + "lm_loss": 0.7605, + "step": 10336, + "vm_loss": 0.188 + }, + { + "epoch": 1.989749019419111, + "lm_loss": 0.7669, + "step": 10336, + "vm_loss": 0.1361 + }, + { + "epoch": 1.989749019419111, + "lm_loss": 0.5839, + "step": 10336, + "vm_loss": 0.1203 + }, + { + "epoch": 1.989749019419111, + "lm_loss": 0.427, + "step": 10336, + "vm_loss": 0.1357 + }, + { + "epoch": 1.9899415260966866, + "grad_norm": 3.244904998857969, + "learning_rate": 1.2642257540962377e-09, + "loss": 0.8464, + "step": 10337 + }, + { + "epoch": 1.9901340327742618, + "grad_norm": 3.0201925276628385, + "learning_rate": 1.21513532151063e-09, + "loss": 0.7615, + "step": 10338 + }, + { + "epoch": 1.9903265394518372, + "grad_norm": 3.3374943117759592, + "learning_rate": 1.1670168987365505e-09, + "loss": 0.8516, + "step": 10339 + }, + { + "epoch": 1.9905190461294127, + "grad_norm": 3.3054473535204343, + "learning_rate": 1.119870490450259e-09, + "loss": 0.8352, + "step": 10340 + }, + { + "epoch": 1.9907115528069879, + "grad_norm": 3.3707054368214338, + "learning_rate": 1.073696101235866e-09, + "loss": 0.8574, + "step": 10341 + }, + { + "epoch": 1.9909040594845635, + "grad_norm": 3.1316717434749033, + "learning_rate": 1.0284937355820035e-09, + "loss": 0.8127, + "step": 10342 + }, + { + "epoch": 1.9910965661621387, + "grad_norm": 3.1888661278329353, + "learning_rate": 9.842633978818238e-10, + "loss": 0.8332, + "step": 10343 + }, + { + "epoch": 1.9912890728397141, + "grad_norm": 3.1075185298567742, + "learning_rate": 9.410050924374414e-10, + "loss": 0.7789, + "step": 10344 + }, + { + "epoch": 1.9912890728397141, + "lm_loss": 0.906, + "step": 10344, + "vm_loss": 0.1824 + }, + { + "epoch": 1.9912890728397141, + "lm_loss": 0.8038, + "step": 10344, + "vm_loss": 0.2126 + }, + { + "epoch": 1.9912890728397141, + "lm_loss": 0.7587, + "step": 10344, + "vm_loss": 0.1908 + }, + { + "epoch": 1.9912890728397141, + "lm_loss": 0.49, + "step": 10344, + "vm_loss": 0.1608 + }, + { + "epoch": 1.9912890728397141, + "lm_loss": 0.7159, + "step": 10344, + "vm_loss": 0.1152 + }, + { + "epoch": 1.9912890728397141, + "lm_loss": 0.794, + "step": 10344, + "vm_loss": 0.182 + }, + { + "epoch": 1.9912890728397141, + "lm_loss": 0.6915, + "step": 10344, + "vm_loss": 0.1855 + }, + { + "epoch": 1.9912890728397141, + "lm_loss": 0.5468, + "step": 10344, + "vm_loss": 0.149 + }, + { + "epoch": 1.9914815795172895, + "grad_norm": 3.2954451576893073, + "learning_rate": 8.987188234521604e-10, + "loss": 0.8623, + "step": 10345 + }, + { + "epoch": 1.9916740861948647, + "grad_norm": 3.2869728648664127, + "learning_rate": 8.574045950360266e-10, + "loss": 0.8209, + "step": 10346 + }, + { + "epoch": 1.9918665928724404, + "grad_norm": 3.239888862869303, + "learning_rate": 8.170624112080472e-10, + "loss": 0.8102, + "step": 10347 + }, + { + "epoch": 1.9920590995500156, + "grad_norm": 3.296030330432085, + "learning_rate": 7.776922758884198e-10, + "loss": 0.8358, + "step": 10348 + }, + { + "epoch": 1.992251606227591, + "grad_norm": 3.1339022347341197, + "learning_rate": 7.392941929040832e-10, + "loss": 0.8165, + "step": 10349 + }, + { + "epoch": 1.9924441129051664, + "grad_norm": 3.093440823319644, + "learning_rate": 7.01868165987607e-10, + "loss": 0.7859, + "step": 10350 + }, + { + "epoch": 1.9926366195827416, + "grad_norm": 3.231619427735801, + "learning_rate": 6.654141987794127e-10, + "loss": 0.853, + "step": 10351 + }, + { + "epoch": 1.9928291262603173, + "grad_norm": 3.2424976031574197, + "learning_rate": 6.299322948211118e-10, + "loss": 0.8174, + "step": 10352 + }, + { + "epoch": 1.9928291262603173, + "lm_loss": 0.1905, + "step": 10352, + "vm_loss": 0.1723 + }, + { + "epoch": 1.9928291262603173, + "lm_loss": 0.5194, + "step": 10352, + "vm_loss": 0.1489 + }, + { + "epoch": 1.9928291262603173, + "lm_loss": 0.6333, + "step": 10352, + "vm_loss": 0.2156 + }, + { + "epoch": 1.9928291262603173, + "lm_loss": 0.6631, + "step": 10352, + "vm_loss": 0.2154 + }, + { + "epoch": 1.9928291262603173, + "lm_loss": 0.5846, + "step": 10352, + "vm_loss": 0.212 + }, + { + "epoch": 1.9928291262603173, + "lm_loss": 1.0106, + "step": 10352, + "vm_loss": 0.1682 + }, + { + "epoch": 1.9928291262603173, + "lm_loss": 0.3546, + "step": 10352, + "vm_loss": 0.2137 + }, + { + "epoch": 1.9928291262603173, + "lm_loss": 0.5083, + "step": 10352, + "vm_loss": 0.2589 + }, + { + "epoch": 1.9930216329378925, + "grad_norm": 3.1591245021862346, + "learning_rate": 5.95422457562167e-10, + "loss": 0.8265, + "step": 10353 + }, + { + "epoch": 1.993214139615468, + "grad_norm": 3.2729495364092145, + "learning_rate": 5.618846903576724e-10, + "loss": 0.8085, + "step": 10354 + }, + { + "epoch": 1.9934066462930433, + "grad_norm": 3.2095776619774266, + "learning_rate": 5.293189964694634e-10, + "loss": 0.8066, + "step": 10355 + }, + { + "epoch": 1.9935991529706185, + "grad_norm": 3.220226197953992, + "learning_rate": 4.977253790605652e-10, + "loss": 0.7952, + "step": 10356 + }, + { + "epoch": 1.9937916596481942, + "grad_norm": 3.132456075203666, + "learning_rate": 4.671038412051854e-10, + "loss": 0.8407, + "step": 10357 + }, + { + "epoch": 1.9939841663257694, + "grad_norm": 3.2305105672869043, + "learning_rate": 4.3745438587761145e-10, + "loss": 0.8255, + "step": 10358 + }, + { + "epoch": 1.9941766730033448, + "grad_norm": 3.2338921288859304, + "learning_rate": 4.0877701596220285e-10, + "loss": 0.7927, + "step": 10359 + }, + { + "epoch": 1.9943691796809202, + "grad_norm": 3.3457167140726805, + "learning_rate": 3.8107173424450915e-10, + "loss": 0.8743, + "step": 10360 + }, + { + "epoch": 1.9943691796809202, + "lm_loss": 0.7375, + "step": 10360, + "vm_loss": 0.1969 + }, + { + "epoch": 1.9943691796809202, + "lm_loss": 0.7251, + "step": 10360, + "vm_loss": 0.1206 + }, + { + "epoch": 1.9943691796809202, + "lm_loss": 0.6318, + "step": 10360, + "vm_loss": 0.1792 + }, + { + "epoch": 1.9943691796809202, + "lm_loss": 0.7596, + "step": 10360, + "vm_loss": 0.1662 + }, + { + "epoch": 1.9943691796809202, + "lm_loss": 0.4391, + "step": 10360, + "vm_loss": 0.1912 + }, + { + "epoch": 1.9943691796809202, + "lm_loss": 1.1354, + "step": 10360, + "vm_loss": 0.192 + }, + { + "epoch": 1.9943691796809202, + "lm_loss": 0.8815, + "step": 10360, + "vm_loss": 0.1123 + }, + { + "epoch": 1.9943691796809202, + "lm_loss": 0.6504, + "step": 10360, + "vm_loss": 0.2254 + }, + { + "epoch": 1.9945616863584954, + "grad_norm": 3.4593600694948745, + "learning_rate": 3.5433854342015185e-10, + "loss": 0.8727, + "step": 10361 + }, + { + "epoch": 1.994754193036071, + "grad_norm": 3.1775076621605325, + "learning_rate": 3.285774460870528e-10, + "loss": 0.8433, + "step": 10362 + }, + { + "epoch": 1.9949466997136462, + "grad_norm": 3.1351824845445986, + "learning_rate": 3.037884447487649e-10, + "loss": 0.822, + "step": 10363 + }, + { + "epoch": 1.9951392063912217, + "grad_norm": 3.2514329894600706, + "learning_rate": 2.799715418155824e-10, + "loss": 0.8202, + "step": 10364 + }, + { + "epoch": 1.995331713068797, + "grad_norm": 3.3190844000511444, + "learning_rate": 2.571267396034305e-10, + "loss": 0.8306, + "step": 10365 + }, + { + "epoch": 1.9955242197463725, + "grad_norm": 3.3019548448958655, + "learning_rate": 2.352540403327552e-10, + "loss": 0.8587, + "step": 10366 + }, + { + "epoch": 1.995716726423948, + "grad_norm": 3.27279222339282, + "learning_rate": 2.143534461296337e-10, + "loss": 0.8579, + "step": 10367 + }, + { + "epoch": 1.9959092331015231, + "grad_norm": 3.2293021714166197, + "learning_rate": 1.9442495902577408e-10, + "loss": 0.8297, + "step": 10368 + }, + { + "epoch": 1.9959092331015231, + "lm_loss": 0.4368, + "step": 10368, + "vm_loss": 0.1563 + }, + { + "epoch": 1.9959092331015231, + "lm_loss": 0.6769, + "step": 10368, + "vm_loss": 0.2091 + }, + { + "epoch": 1.9959092331015231, + "lm_loss": 0.473, + "step": 10368, + "vm_loss": 0.1902 + }, + { + "epoch": 1.9959092331015231, + "lm_loss": 0.7182, + "step": 10368, + "vm_loss": 0.1783 + }, + { + "epoch": 1.9959092331015231, + "lm_loss": 0.743, + "step": 10368, + "vm_loss": 0.2011 + }, + { + "epoch": 1.9959092331015231, + "lm_loss": 0.7095, + "step": 10368, + "vm_loss": 0.1719 + }, + { + "epoch": 1.9959092331015231, + "lm_loss": 0.8066, + "step": 10368, + "vm_loss": 0.1614 + }, + { + "epoch": 1.9959092331015231, + "lm_loss": 0.9259, + "step": 10368, + "vm_loss": 0.1939 + }, + { + "epoch": 1.9961017397790985, + "grad_norm": 3.3177833614222325, + "learning_rate": 1.754685809585155e-10, + "loss": 0.8713, + "step": 10369 + }, + { + "epoch": 1.996294246456674, + "grad_norm": 3.0334616208839367, + "learning_rate": 1.5748431377082817e-10, + "loss": 0.7873, + "step": 10370 + }, + { + "epoch": 1.9964867531342494, + "grad_norm": 3.0948606006893016, + "learning_rate": 1.4047215921131341e-10, + "loss": 0.7787, + "step": 10371 + }, + { + "epoch": 1.9966792598118248, + "grad_norm": 3.2382819962714517, + "learning_rate": 1.2443211893309326e-10, + "loss": 0.843, + "step": 10372 + }, + { + "epoch": 1.9968717664894, + "grad_norm": 3.201891769037206, + "learning_rate": 1.0936419449603108e-10, + "loss": 0.8204, + "step": 10373 + }, + { + "epoch": 1.9970642731669754, + "grad_norm": 3.2538395591913214, + "learning_rate": 9.526838736451105e-11, + "loss": 0.8629, + "step": 10374 + }, + { + "epoch": 1.9972567798445509, + "grad_norm": 3.2524755810004797, + "learning_rate": 8.21446989096586e-11, + "loss": 0.8652, + "step": 10375 + }, + { + "epoch": 1.9974492865221263, + "grad_norm": 3.4288566241263627, + "learning_rate": 6.999313040600974e-11, + "loss": 0.8887, + "step": 10376 + }, + { + "epoch": 1.9974492865221263, + "lm_loss": 0.5095, + "step": 10376, + "vm_loss": 0.153 + }, + { + "epoch": 1.9974492865221263, + "lm_loss": 0.9305, + "step": 10376, + "vm_loss": 0.1054 + }, + { + "epoch": 1.9974492865221263, + "lm_loss": 0.9219, + "step": 10376, + "vm_loss": 0.1622 + }, + { + "epoch": 1.9974492865221263, + "lm_loss": 0.4402, + "step": 10376, + "vm_loss": 0.1718 + }, + { + "epoch": 1.9974492865221263, + "lm_loss": 0.8247, + "step": 10376, + "vm_loss": 0.1454 + }, + { + "epoch": 1.9974492865221263, + "lm_loss": 0.6165, + "step": 10376, + "vm_loss": 0.1528 + }, + { + "epoch": 1.9974492865221263, + "lm_loss": 0.7405, + "step": 10376, + "vm_loss": 0.1221 + }, + { + "epoch": 1.9974492865221263, + "lm_loss": 0.7824, + "step": 10376, + "vm_loss": 0.1339 + }, + { + "epoch": 1.9976417931997017, + "grad_norm": 3.247147272467522, + "learning_rate": 5.881368303595202e-11, + "loss": 0.8193, + "step": 10377 + }, + { + "epoch": 1.997834299877277, + "grad_norm": 3.387983436094678, + "learning_rate": 4.860635788528356e-11, + "loss": 0.8887, + "step": 10378 + }, + { + "epoch": 1.9980268065548523, + "grad_norm": 3.2076775432881415, + "learning_rate": 3.937115594654373e-11, + "loss": 0.8144, + "step": 10379 + }, + { + "epoch": 1.9982193132324277, + "grad_norm": 3.328374671669054, + "learning_rate": 3.1108078119013175e-11, + "loss": 0.835, + "step": 10380 + }, + { + "epoch": 1.9984118199100032, + "grad_norm": 3.260221242139148, + "learning_rate": 2.3817125204272927e-11, + "loss": 0.8405, + "step": 10381 + }, + { + "epoch": 1.9986043265875786, + "grad_norm": 3.2192810824075053, + "learning_rate": 1.749829791064528e-11, + "loss": 0.8321, + "step": 10382 + }, + { + "epoch": 1.9987968332651538, + "grad_norm": 3.335088050030916, + "learning_rate": 1.2151596854304004e-11, + "loss": 0.8534, + "step": 10383 + }, + { + "epoch": 1.9989893399427294, + "grad_norm": 3.280067247254368, + "learning_rate": 7.777022553723258e-12, + "loss": 0.8263, + "step": 10384 + }, + { + "epoch": 1.9989893399427294, + "lm_loss": 0.4998, + "step": 10384, + "vm_loss": 0.1173 + }, + { + "epoch": 1.9989893399427294, + "lm_loss": 0.517, + "step": 10384, + "vm_loss": 0.2179 + }, + { + "epoch": 1.9989893399427294, + "lm_loss": 0.8516, + "step": 10384, + "vm_loss": 0.1947 + }, + { + "epoch": 1.9989893399427294, + "lm_loss": 0.5897, + "step": 10384, + "vm_loss": 0.1704 + }, + { + "epoch": 1.9989893399427294, + "lm_loss": 0.8321, + "step": 10384, + "vm_loss": 0.1231 + }, + { + "epoch": 1.9989893399427294, + "lm_loss": 0.5931, + "step": 10384, + "vm_loss": 0.1159 + }, + { + "epoch": 1.9989893399427294, + "lm_loss": 0.9918, + "step": 10384, + "vm_loss": 0.1984 + }, + { + "epoch": 1.9989893399427294, + "lm_loss": 0.6472, + "step": 10384, + "vm_loss": 0.1121 + }, + { + "epoch": 1.9991818466203046, + "grad_norm": 3.4572415083524772, + "learning_rate": 4.374575434118455e-12, + "loss": 0.8717, + "step": 10385 + }, + { + "epoch": 1.99937435329788, + "grad_norm": 3.3119665098252544, + "learning_rate": 1.944255827446284e-12, + "loss": 0.8518, + "step": 10386 + }, + { + "epoch": 1.9995668599754555, + "grad_norm": 3.1088014698276574, + "learning_rate": 4.860639690740243e-13, + "loss": 0.8285, + "step": 10387 + }, + { + "epoch": 1.9997593666530307, + "grad_norm": 3.255229368940458, + "learning_rate": 0.0, + "loss": 0.8638, + "step": 10388 + }, + { + "epoch": 1.9997593666530307, + "step": 10388, + "total_flos": 1.3983801185697792e+16, + "train_loss": 1.6275318770579388, + "train_runtime": 117073.7373, + "train_samples_per_second": 34.076, + "train_steps_per_second": 0.089 + } + ], + "logging_steps": 1.0, + "max_steps": 10388, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 1, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.3983801185697792e+16, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}