diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,30033 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.964785335262904, + "eval_steps": 500, + "global_step": 3000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "ETA": 0.0, + "epoch": 0.00032159511175430133, + "fp16_scale": 1.0, + "global_step": 1, + "grad_norm": 10.271092308212868, + "learning_rate": 2.127659574468085e-08, + "loss": 0.675, + "step": 1 + }, + { + "ETA": 7.02, + "epoch": 0.0006431902235086027, + "fp16_scale": 1.0, + "global_step": 2, + "grad_norm": 5.936799845967266, + "learning_rate": 4.25531914893617e-08, + "loss": 0.7339, + "step": 2 + }, + { + "ETA": 6.9, + "epoch": 0.000964785335262904, + "fp16_scale": 1.0, + "global_step": 3, + "grad_norm": 8.761149505364427, + "learning_rate": 6.382978723404254e-08, + "loss": 0.8264, + "step": 3 + }, + { + "ETA": 6.6, + "epoch": 0.0012863804470172053, + "fp16_scale": 1.0, + "global_step": 4, + "grad_norm": 8.798292347386411, + "learning_rate": 8.51063829787234e-08, + "loss": 0.8467, + "step": 4 + }, + { + "ETA": 6.56, + "epoch": 0.0016079755587715067, + "fp16_scale": 1.0, + "global_step": 5, + "grad_norm": 8.372692731162083, + "learning_rate": 1.0638297872340425e-07, + "loss": 0.7485, + "step": 5 + }, + { + "ETA": 6.47, + "epoch": 0.001929570670525808, + "fp16_scale": 1.0, + "global_step": 6, + "grad_norm": 9.34577570201643, + "learning_rate": 1.2765957446808508e-07, + "loss": 0.8435, + "step": 6 + }, + { + "ETA": 6.36, + "epoch": 0.0022511657822801095, + "fp16_scale": 1.0, + "global_step": 7, + "grad_norm": 6.855807739840747, + "learning_rate": 1.4893617021276595e-07, + "loss": 0.7391, + "step": 7 + }, + { + "ETA": 6.56, + "epoch": 0.0025727608940344106, + "fp16_scale": 1.0, + "global_step": 8, + "grad_norm": 8.247247002061396, + "learning_rate": 1.702127659574468e-07, + "loss": 0.7966, + "step": 8 + }, + { + "ETA": 6.21, + "epoch": 0.002894356005788712, + "fp16_scale": 1.0, + "global_step": 9, + "grad_norm": 11.091321882456889, + "learning_rate": 1.9148936170212765e-07, + "loss": 0.675, + "step": 9 + }, + { + "ETA": 6.21, + "epoch": 0.0032159511175430134, + "fp16_scale": 1.0, + "global_step": 10, + "grad_norm": 7.801672011661994, + "learning_rate": 2.127659574468085e-07, + "loss": 0.8447, + "step": 10 + }, + { + "ETA": 6.2, + "epoch": 0.0035375462292973146, + "fp16_scale": 1.0, + "global_step": 11, + "grad_norm": 7.778967208306511, + "learning_rate": 2.3404255319148937e-07, + "loss": 0.7343, + "step": 11 + }, + { + "ETA": 5.9, + "epoch": 0.003859141341051616, + "fp16_scale": 1.0, + "global_step": 12, + "grad_norm": 11.32488432224942, + "learning_rate": 2.5531914893617016e-07, + "loss": 0.7279, + "step": 12 + }, + { + "ETA": 5.92, + "epoch": 0.004180736452805918, + "fp16_scale": 1.0, + "global_step": 13, + "grad_norm": 7.951145078377419, + "learning_rate": 2.7659574468085106e-07, + "loss": 0.7681, + "step": 13 + }, + { + "ETA": 5.95, + "epoch": 0.004502331564560219, + "fp16_scale": 1.0, + "global_step": 14, + "grad_norm": 8.518318008722654, + "learning_rate": 2.978723404255319e-07, + "loss": 0.7737, + "step": 14 + }, + { + "ETA": 5.98, + "epoch": 0.00482392667631452, + "fp16_scale": 1.0, + "global_step": 15, + "grad_norm": 8.733510675457772, + "learning_rate": 3.1914893617021275e-07, + "loss": 0.763, + "step": 15 + }, + { + "ETA": 5.98, + "epoch": 0.005145521788068821, + "fp16_scale": 1.0, + "global_step": 16, + "grad_norm": 6.760829110381652, + "learning_rate": 3.404255319148936e-07, + "loss": 0.7064, + "step": 16 + }, + { + "ETA": 5.89, + "epoch": 0.005467116899823122, + "fp16_scale": 1.0, + "global_step": 17, + "grad_norm": 7.433915229026562, + "learning_rate": 3.617021276595745e-07, + "loss": 0.5313, + "step": 17 + }, + { + "ETA": 5.98, + "epoch": 0.005788712011577424, + "fp16_scale": 1.0, + "global_step": 18, + "grad_norm": 6.2692886913904236, + "learning_rate": 3.829787234042553e-07, + "loss": 0.7495, + "step": 18 + }, + { + "ETA": 5.98, + "epoch": 0.006110307123331726, + "fp16_scale": 1.0, + "global_step": 19, + "grad_norm": 7.660828413435692, + "learning_rate": 4.0425531914893614e-07, + "loss": 0.7335, + "step": 19 + }, + { + "ETA": 5.99, + "epoch": 0.006431902235086027, + "fp16_scale": 1.0, + "global_step": 20, + "grad_norm": 7.506163600068185, + "learning_rate": 4.25531914893617e-07, + "loss": 0.7548, + "step": 20 + }, + { + "ETA": 6.01, + "epoch": 0.006753497346840328, + "fp16_scale": 1.0, + "global_step": 21, + "grad_norm": 8.01415459477147, + "learning_rate": 4.4680851063829783e-07, + "loss": 0.797, + "step": 21 + }, + { + "ETA": 6.0, + "epoch": 0.007075092458594629, + "fp16_scale": 1.0, + "global_step": 22, + "grad_norm": 6.070403303562782, + "learning_rate": 4.6808510638297873e-07, + "loss": 0.7009, + "step": 22 + }, + { + "ETA": 5.98, + "epoch": 0.00739668757034893, + "fp16_scale": 1.0, + "global_step": 23, + "grad_norm": 7.036884976599998, + "learning_rate": 4.893617021276595e-07, + "loss": 0.6791, + "step": 23 + }, + { + "ETA": 6.0, + "epoch": 0.007718282682103232, + "fp16_scale": 1.0, + "global_step": 24, + "grad_norm": 6.889849361911478, + "learning_rate": 5.106382978723403e-07, + "loss": 0.7487, + "step": 24 + }, + { + "ETA": 6.01, + "epoch": 0.008039877793857533, + "fp16_scale": 1.0, + "global_step": 25, + "grad_norm": 6.628636420052721, + "learning_rate": 5.319148936170212e-07, + "loss": 0.6991, + "step": 25 + }, + { + "ETA": 6.05, + "epoch": 0.008361472905611836, + "fp16_scale": 1.0, + "global_step": 26, + "grad_norm": 5.624553775893587, + "learning_rate": 5.531914893617021e-07, + "loss": 0.6937, + "step": 26 + }, + { + "ETA": 5.92, + "epoch": 0.008683068017366137, + "fp16_scale": 1.0, + "global_step": 27, + "grad_norm": 5.196623996489182, + "learning_rate": 5.74468085106383e-07, + "loss": 0.5243, + "step": 27 + }, + { + "ETA": 5.94, + "epoch": 0.009004663129120438, + "fp16_scale": 1.0, + "global_step": 28, + "grad_norm": 5.11030998557107, + "learning_rate": 5.957446808510638e-07, + "loss": 0.6369, + "step": 28 + }, + { + "ETA": 5.96, + "epoch": 0.009326258240874739, + "fp16_scale": 1.0, + "global_step": 29, + "grad_norm": 3.7503987206102933, + "learning_rate": 6.170212765957446e-07, + "loss": 0.6579, + "step": 29 + }, + { + "ETA": 5.97, + "epoch": 0.00964785335262904, + "fp16_scale": 1.0, + "global_step": 30, + "grad_norm": 3.6997926154411065, + "learning_rate": 6.382978723404255e-07, + "loss": 0.6319, + "step": 30 + }, + { + "ETA": 5.96, + "epoch": 0.009969448464383341, + "fp16_scale": 1.0, + "global_step": 31, + "grad_norm": 3.325397798658775, + "learning_rate": 6.595744680851063e-07, + "loss": 0.6509, + "step": 31 + }, + { + "ETA": 5.97, + "epoch": 0.010291043576137643, + "fp16_scale": 1.0, + "global_step": 32, + "grad_norm": 3.5979872531541917, + "learning_rate": 6.808510638297872e-07, + "loss": 0.6236, + "step": 32 + }, + { + "ETA": 6.0, + "epoch": 0.010612638687891944, + "fp16_scale": 1.0, + "global_step": 33, + "grad_norm": 3.3313648341845217, + "learning_rate": 7.021276595744681e-07, + "loss": 0.5944, + "step": 33 + }, + { + "ETA": 6.03, + "epoch": 0.010934233799646245, + "fp16_scale": 1.0, + "global_step": 34, + "grad_norm": 2.6531843559585058, + "learning_rate": 7.23404255319149e-07, + "loss": 0.6317, + "step": 34 + }, + { + "ETA": 6.02, + "epoch": 0.011255828911400546, + "fp16_scale": 1.0, + "global_step": 35, + "grad_norm": 2.651824591691451, + "learning_rate": 7.446808510638297e-07, + "loss": 0.5799, + "step": 35 + }, + { + "ETA": 6.03, + "epoch": 0.011577424023154847, + "fp16_scale": 1.0, + "global_step": 36, + "grad_norm": 3.0054745271450587, + "learning_rate": 7.659574468085106e-07, + "loss": 0.575, + "step": 36 + }, + { + "ETA": 6.02, + "epoch": 0.01189901913490915, + "fp16_scale": 1.0, + "global_step": 37, + "grad_norm": 2.7676225818699907, + "learning_rate": 7.872340425531915e-07, + "loss": 0.5693, + "step": 37 + }, + { + "ETA": 5.94, + "epoch": 0.012220614246663451, + "fp16_scale": 1.0, + "global_step": 38, + "grad_norm": 4.307955418017736, + "learning_rate": 8.085106382978723e-07, + "loss": 0.549, + "step": 38 + }, + { + "ETA": 5.94, + "epoch": 0.012542209358417752, + "fp16_scale": 1.0, + "global_step": 39, + "grad_norm": 2.7592638323739282, + "learning_rate": 8.297872340425532e-07, + "loss": 0.6385, + "step": 39 + }, + { + "ETA": 5.94, + "epoch": 0.012863804470172054, + "fp16_scale": 1.0, + "global_step": 40, + "grad_norm": 3.1937611077592183, + "learning_rate": 8.51063829787234e-07, + "loss": 0.6341, + "step": 40 + }, + { + "ETA": 5.94, + "epoch": 0.013185399581926355, + "fp16_scale": 1.0, + "global_step": 41, + "grad_norm": 2.3642106915091006, + "learning_rate": 8.723404255319149e-07, + "loss": 0.5279, + "step": 41 + }, + { + "ETA": 5.94, + "epoch": 0.013506994693680656, + "fp16_scale": 1.0, + "global_step": 42, + "grad_norm": 2.4889004111147726, + "learning_rate": 8.936170212765957e-07, + "loss": 0.6534, + "step": 42 + }, + { + "ETA": 5.86, + "epoch": 0.013828589805434957, + "fp16_scale": 1.0, + "global_step": 43, + "grad_norm": 2.7252878090073382, + "learning_rate": 9.148936170212766e-07, + "loss": 0.5023, + "step": 43 + }, + { + "ETA": 5.89, + "epoch": 0.014150184917189258, + "fp16_scale": 1.0, + "global_step": 44, + "grad_norm": 2.2704994128835616, + "learning_rate": 9.361702127659575e-07, + "loss": 0.596, + "step": 44 + }, + { + "ETA": 5.9, + "epoch": 0.01447178002894356, + "fp16_scale": 1.0, + "global_step": 45, + "grad_norm": 2.1776982157402784, + "learning_rate": 9.574468085106384e-07, + "loss": 0.5203, + "step": 45 + }, + { + "ETA": 5.91, + "epoch": 0.01479337514069786, + "fp16_scale": 1.0, + "global_step": 46, + "grad_norm": 2.3223571282145277, + "learning_rate": 9.78723404255319e-07, + "loss": 0.6346, + "step": 46 + }, + { + "ETA": 5.85, + "epoch": 0.015114970252452163, + "fp16_scale": 1.0, + "global_step": 47, + "grad_norm": 2.553678306991624, + "learning_rate": 1e-06, + "loss": 0.4674, + "step": 47 + }, + { + "ETA": 5.85, + "epoch": 0.015436565364206465, + "fp16_scale": 1.0, + "global_step": 48, + "grad_norm": 2.197430468099514, + "learning_rate": 1.0212765957446806e-06, + "loss": 0.5526, + "step": 48 + }, + { + "ETA": 5.85, + "epoch": 0.015758160475960764, + "fp16_scale": 1.0, + "global_step": 49, + "grad_norm": 2.525799839180918, + "learning_rate": 1.0425531914893618e-06, + "loss": 0.5723, + "step": 49 + }, + { + "ETA": 5.85, + "epoch": 0.016079755587715065, + "fp16_scale": 1.0, + "global_step": 50, + "grad_norm": 2.2984636311934863, + "learning_rate": 1.0638297872340424e-06, + "loss": 0.5411, + "step": 50 + }, + { + "ETA": 5.85, + "epoch": 0.016401350699469366, + "fp16_scale": 1.0, + "global_step": 51, + "grad_norm": 2.1296579071495185, + "learning_rate": 1.0851063829787233e-06, + "loss": 0.5586, + "step": 51 + }, + { + "ETA": 5.79, + "epoch": 0.01672294581122367, + "fp16_scale": 1.0, + "global_step": 52, + "grad_norm": 2.3505923179519965, + "learning_rate": 1.1063829787234042e-06, + "loss": 0.4382, + "step": 52 + }, + { + "ETA": 5.81, + "epoch": 0.017044540922977972, + "fp16_scale": 1.0, + "global_step": 53, + "grad_norm": 2.17148946876222, + "learning_rate": 1.127659574468085e-06, + "loss": 0.5376, + "step": 53 + }, + { + "ETA": 5.8, + "epoch": 0.017366136034732273, + "fp16_scale": 1.0, + "global_step": 54, + "grad_norm": 1.9342880505071298, + "learning_rate": 1.148936170212766e-06, + "loss": 0.5249, + "step": 54 + }, + { + "ETA": 5.81, + "epoch": 0.017687731146486575, + "fp16_scale": 1.0, + "global_step": 55, + "grad_norm": 1.9801427724408849, + "learning_rate": 1.1702127659574467e-06, + "loss": 0.5198, + "step": 55 + }, + { + "ETA": 5.81, + "epoch": 0.018009326258240876, + "fp16_scale": 1.0, + "global_step": 56, + "grad_norm": 2.4245238767570356, + "learning_rate": 1.1914893617021276e-06, + "loss": 0.5221, + "step": 56 + }, + { + "ETA": 5.81, + "epoch": 0.018330921369995177, + "fp16_scale": 1.0, + "global_step": 57, + "grad_norm": 2.2187104912844138, + "learning_rate": 1.2127659574468085e-06, + "loss": 0.5299, + "step": 57 + }, + { + "ETA": 5.81, + "epoch": 0.018652516481749478, + "fp16_scale": 1.0, + "global_step": 58, + "grad_norm": 1.9435727492977588, + "learning_rate": 1.2340425531914892e-06, + "loss": 0.4573, + "step": 58 + }, + { + "ETA": 5.82, + "epoch": 0.01897411159350378, + "fp16_scale": 1.0, + "global_step": 59, + "grad_norm": 2.1196247403843183, + "learning_rate": 1.2553191489361701e-06, + "loss": 0.5041, + "step": 59 + }, + { + "ETA": 5.83, + "epoch": 0.01929570670525808, + "fp16_scale": 1.0, + "global_step": 60, + "grad_norm": 2.1089989366123785, + "learning_rate": 1.276595744680851e-06, + "loss": 0.5424, + "step": 60 + }, + { + "ETA": 5.85, + "epoch": 0.01961730181701238, + "fp16_scale": 1.0, + "global_step": 61, + "grad_norm": 2.2338975030115864, + "learning_rate": 1.297872340425532e-06, + "loss": 0.4935, + "step": 61 + }, + { + "ETA": 5.85, + "epoch": 0.019938896928766683, + "fp16_scale": 1.0, + "global_step": 62, + "grad_norm": 2.0431724462748213, + "learning_rate": 1.3191489361702126e-06, + "loss": 0.5799, + "step": 62 + }, + { + "ETA": 5.85, + "epoch": 0.020260492040520984, + "fp16_scale": 1.0, + "global_step": 63, + "grad_norm": 2.4551991336746606, + "learning_rate": 1.3404255319148935e-06, + "loss": 0.6281, + "step": 63 + }, + { + "ETA": 5.8, + "epoch": 0.020582087152275285, + "fp16_scale": 1.0, + "global_step": 64, + "grad_norm": 3.188788140029998, + "learning_rate": 1.3617021276595744e-06, + "loss": 0.5042, + "step": 64 + }, + { + "ETA": 5.81, + "epoch": 0.020903682264029586, + "fp16_scale": 1.0, + "global_step": 65, + "grad_norm": 2.0250469623805225, + "learning_rate": 1.3829787234042553e-06, + "loss": 0.5447, + "step": 65 + }, + { + "ETA": 5.81, + "epoch": 0.021225277375783887, + "fp16_scale": 1.0, + "global_step": 66, + "grad_norm": 2.186476354070767, + "learning_rate": 1.4042553191489362e-06, + "loss": 0.604, + "step": 66 + }, + { + "ETA": 5.81, + "epoch": 0.02154687248753819, + "fp16_scale": 1.0, + "global_step": 67, + "grad_norm": 1.9016721591480255, + "learning_rate": 1.4255319148936169e-06, + "loss": 0.4654, + "step": 67 + }, + { + "ETA": 5.81, + "epoch": 0.02186846759929249, + "fp16_scale": 1.0, + "global_step": 68, + "grad_norm": 1.8932285445408374, + "learning_rate": 1.446808510638298e-06, + "loss": 0.5115, + "step": 68 + }, + { + "ETA": 5.81, + "epoch": 0.02219006271104679, + "fp16_scale": 1.0, + "global_step": 69, + "grad_norm": 2.138350825095684, + "learning_rate": 1.4680851063829787e-06, + "loss": 0.599, + "step": 69 + }, + { + "ETA": 5.82, + "epoch": 0.022511657822801092, + "fp16_scale": 1.0, + "global_step": 70, + "grad_norm": 2.011538617509216, + "learning_rate": 1.4893617021276594e-06, + "loss": 0.516, + "step": 70 + }, + { + "ETA": 5.82, + "epoch": 0.022833252934555393, + "fp16_scale": 1.0, + "global_step": 71, + "grad_norm": 2.364474781341946, + "learning_rate": 1.5106382978723405e-06, + "loss": 0.5453, + "step": 71 + }, + { + "ETA": 5.82, + "epoch": 0.023154848046309694, + "fp16_scale": 1.0, + "global_step": 72, + "grad_norm": 1.842910522233911, + "learning_rate": 1.5319148936170212e-06, + "loss": 0.476, + "step": 72 + }, + { + "ETA": 5.82, + "epoch": 0.023476443158064, + "fp16_scale": 1.0, + "global_step": 73, + "grad_norm": 1.9116819343832798, + "learning_rate": 1.5531914893617019e-06, + "loss": 0.5051, + "step": 73 + }, + { + "ETA": 5.77, + "epoch": 0.0237980382698183, + "fp16_scale": 1.0, + "global_step": 74, + "grad_norm": 3.27843789986936, + "learning_rate": 1.574468085106383e-06, + "loss": 0.4861, + "step": 74 + }, + { + "ETA": 5.77, + "epoch": 0.0241196333815726, + "fp16_scale": 1.0, + "global_step": 75, + "grad_norm": 2.38109700900136, + "learning_rate": 1.5957446808510637e-06, + "loss": 0.5261, + "step": 75 + }, + { + "ETA": 5.77, + "epoch": 0.024441228493326903, + "fp16_scale": 1.0, + "global_step": 76, + "grad_norm": 2.23743872021617, + "learning_rate": 1.6170212765957446e-06, + "loss": 0.5736, + "step": 76 + }, + { + "ETA": 5.72, + "epoch": 0.024762823605081204, + "fp16_scale": 1.0, + "global_step": 77, + "grad_norm": 2.8559459749027383, + "learning_rate": 1.6382978723404255e-06, + "loss": 0.4531, + "step": 77 + }, + { + "ETA": 5.73, + "epoch": 0.025084418716835505, + "fp16_scale": 1.0, + "global_step": 78, + "grad_norm": 1.9454163160560185, + "learning_rate": 1.6595744680851064e-06, + "loss": 0.5391, + "step": 78 + }, + { + "ETA": 5.69, + "epoch": 0.025406013828589806, + "fp16_scale": 1.0, + "global_step": 79, + "grad_norm": 2.7935117012479136, + "learning_rate": 1.6808510638297873e-06, + "loss": 0.468, + "step": 79 + }, + { + "ETA": 5.71, + "epoch": 0.025727608940344107, + "fp16_scale": 1.0, + "global_step": 80, + "grad_norm": 2.0607040982783893, + "learning_rate": 1.702127659574468e-06, + "loss": 0.4884, + "step": 80 + }, + { + "ETA": 5.71, + "epoch": 0.02604920405209841, + "fp16_scale": 1.0, + "global_step": 81, + "grad_norm": 1.9616817221481178, + "learning_rate": 1.7234042553191488e-06, + "loss": 0.5125, + "step": 81 + }, + { + "ETA": 5.68, + "epoch": 0.02637079916385271, + "fp16_scale": 1.0, + "global_step": 82, + "grad_norm": 2.8684132922538823, + "learning_rate": 1.7446808510638297e-06, + "loss": 0.4395, + "step": 82 + }, + { + "ETA": 5.64, + "epoch": 0.02669239427560701, + "fp16_scale": 1.0, + "global_step": 83, + "grad_norm": 2.462224589767338, + "learning_rate": 1.7659574468085106e-06, + "loss": 0.4355, + "step": 83 + }, + { + "ETA": 5.64, + "epoch": 0.027013989387361312, + "fp16_scale": 1.0, + "global_step": 84, + "grad_norm": 2.2938452504024056, + "learning_rate": 1.7872340425531913e-06, + "loss": 0.5227, + "step": 84 + }, + { + "ETA": 5.64, + "epoch": 0.027335584499115613, + "fp16_scale": 1.0, + "global_step": 85, + "grad_norm": 2.1576688140194147, + "learning_rate": 1.8085106382978722e-06, + "loss": 0.5308, + "step": 85 + }, + { + "ETA": 5.64, + "epoch": 0.027657179610869914, + "fp16_scale": 1.0, + "global_step": 86, + "grad_norm": 2.120174390807831, + "learning_rate": 1.8297872340425531e-06, + "loss": 0.5216, + "step": 86 + }, + { + "ETA": 5.64, + "epoch": 0.027978774722624215, + "fp16_scale": 1.0, + "global_step": 87, + "grad_norm": 2.0757019848487057, + "learning_rate": 1.8510638297872338e-06, + "loss": 0.51, + "step": 87 + }, + { + "ETA": 5.64, + "epoch": 0.028300369834378516, + "fp16_scale": 1.0, + "global_step": 88, + "grad_norm": 2.1525748432569047, + "learning_rate": 1.872340425531915e-06, + "loss": 0.5147, + "step": 88 + }, + { + "ETA": 5.65, + "epoch": 0.028621964946132818, + "fp16_scale": 1.0, + "global_step": 89, + "grad_norm": 2.191002649456998, + "learning_rate": 1.8936170212765956e-06, + "loss": 0.5744, + "step": 89 + }, + { + "ETA": 5.67, + "epoch": 0.02894356005788712, + "fp16_scale": 1.0, + "global_step": 90, + "grad_norm": 2.0169223118103283, + "learning_rate": 1.9148936170212767e-06, + "loss": 0.5322, + "step": 90 + }, + { + "ETA": 5.67, + "epoch": 0.02926515516964142, + "fp16_scale": 1.0, + "global_step": 91, + "grad_norm": 2.043929358251754, + "learning_rate": 1.936170212765957e-06, + "loss": 0.5384, + "step": 91 + }, + { + "ETA": 5.68, + "epoch": 0.02958675028139572, + "fp16_scale": 1.0, + "global_step": 92, + "grad_norm": 1.9056931212133323, + "learning_rate": 1.957446808510638e-06, + "loss": 0.5185, + "step": 92 + }, + { + "ETA": 5.68, + "epoch": 0.029908345393150026, + "fp16_scale": 1.0, + "global_step": 93, + "grad_norm": 2.307999687316851, + "learning_rate": 1.978723404255319e-06, + "loss": 0.5083, + "step": 93 + }, + { + "ETA": 5.68, + "epoch": 0.030229940504904327, + "fp16_scale": 1.0, + "global_step": 94, + "grad_norm": 2.3069773784363874, + "learning_rate": 2e-06, + "loss": 0.4874, + "step": 94 + }, + { + "ETA": 5.68, + "epoch": 0.030551535616658628, + "fp16_scale": 1.0, + "global_step": 95, + "grad_norm": 2.1375152636420762, + "learning_rate": 1.999999457130956e-06, + "loss": 0.5308, + "step": 95 + }, + { + "ETA": 5.68, + "epoch": 0.03087313072841293, + "fp16_scale": 1.0, + "global_step": 96, + "grad_norm": 1.779984554116194, + "learning_rate": 1.999997828524414e-06, + "loss": 0.4612, + "step": 96 + }, + { + "ETA": 5.69, + "epoch": 0.03119472584016723, + "fp16_scale": 1.0, + "global_step": 97, + "grad_norm": 1.8897525194740394, + "learning_rate": 1.999995114182142e-06, + "loss": 0.4783, + "step": 97 + }, + { + "ETA": 5.7, + "epoch": 0.03151632095192153, + "fp16_scale": 1.0, + "global_step": 98, + "grad_norm": 2.140424509099989, + "learning_rate": 1.999991314107087e-06, + "loss": 0.6014, + "step": 98 + }, + { + "ETA": 5.69, + "epoch": 0.03183791606367583, + "fp16_scale": 1.0, + "global_step": 99, + "grad_norm": 2.0231806427993053, + "learning_rate": 1.9999864283033744e-06, + "loss": 0.5012, + "step": 99 + }, + { + "ETA": 5.7, + "epoch": 0.03215951117543013, + "fp16_scale": 1.0, + "global_step": 100, + "grad_norm": 2.0863695551562, + "learning_rate": 1.99998045677631e-06, + "loss": 0.5462, + "step": 100 + }, + { + "ETA": 5.66, + "epoch": 0.03248110628718443, + "fp16_scale": 1.0, + "global_step": 101, + "grad_norm": 2.437841320875372, + "learning_rate": 1.999973399532377e-06, + "loss": 0.446, + "step": 101 + }, + { + "ETA": 5.67, + "epoch": 0.03280270139893873, + "fp16_scale": 1.0, + "global_step": 102, + "grad_norm": 2.112285993018359, + "learning_rate": 1.999965256579237e-06, + "loss": 0.4646, + "step": 102 + }, + { + "ETA": 5.67, + "epoch": 0.03312429651069304, + "fp16_scale": 1.0, + "global_step": 103, + "grad_norm": 2.012409806465424, + "learning_rate": 1.9999560279257314e-06, + "loss": 0.425, + "step": 103 + }, + { + "ETA": 5.67, + "epoch": 0.03344589162244734, + "fp16_scale": 1.0, + "global_step": 104, + "grad_norm": 1.92659649084298, + "learning_rate": 1.9999457135818805e-06, + "loss": 0.5055, + "step": 104 + }, + { + "ETA": 5.67, + "epoch": 0.03376748673420164, + "fp16_scale": 1.0, + "global_step": 105, + "grad_norm": 1.9258326937148058, + "learning_rate": 1.9999343135588825e-06, + "loss": 0.5538, + "step": 105 + }, + { + "ETA": 5.68, + "epoch": 0.034089081845955944, + "fp16_scale": 1.0, + "global_step": 106, + "grad_norm": 2.092933785114832, + "learning_rate": 1.9999218278691153e-06, + "loss": 0.4819, + "step": 106 + }, + { + "ETA": 5.69, + "epoch": 0.034410676957710246, + "fp16_scale": 1.0, + "global_step": 107, + "grad_norm": 1.90548945056872, + "learning_rate": 1.999908256526135e-06, + "loss": 0.4903, + "step": 107 + }, + { + "ETA": 5.69, + "epoch": 0.03473227206946455, + "fp16_scale": 1.0, + "global_step": 108, + "grad_norm": 2.149863310234961, + "learning_rate": 1.9998935995446764e-06, + "loss": 0.5083, + "step": 108 + }, + { + "ETA": 5.69, + "epoch": 0.03505386718121885, + "fp16_scale": 1.0, + "global_step": 109, + "grad_norm": 2.019334705889536, + "learning_rate": 1.999877856940653e-06, + "loss": 0.4884, + "step": 109 + }, + { + "ETA": 5.69, + "epoch": 0.03537546229297315, + "fp16_scale": 1.0, + "global_step": 110, + "grad_norm": 2.0805274339165, + "learning_rate": 1.9998610287311573e-06, + "loss": 0.4998, + "step": 110 + }, + { + "ETA": 5.68, + "epoch": 0.03569705740472745, + "fp16_scale": 1.0, + "global_step": 111, + "grad_norm": 1.854134725693307, + "learning_rate": 1.9998431149344605e-06, + "loss": 0.4633, + "step": 111 + }, + { + "ETA": 5.65, + "epoch": 0.03601865251648175, + "fp16_scale": 1.0, + "global_step": 112, + "grad_norm": 2.435635768617461, + "learning_rate": 1.999824115570012e-06, + "loss": 0.4366, + "step": 112 + }, + { + "ETA": 5.63, + "epoch": 0.03634024762823605, + "fp16_scale": 1.0, + "global_step": 113, + "grad_norm": 2.71798042302074, + "learning_rate": 1.9998040306584397e-06, + "loss": 0.4651, + "step": 113 + }, + { + "ETA": 5.6, + "epoch": 0.036661842739990354, + "fp16_scale": 1.0, + "global_step": 114, + "grad_norm": 2.557939383567869, + "learning_rate": 1.999782860221552e-06, + "loss": 0.425, + "step": 114 + }, + { + "ETA": 5.6, + "epoch": 0.036983437851744655, + "fp16_scale": 1.0, + "global_step": 115, + "grad_norm": 1.9245468047531342, + "learning_rate": 1.999760604282333e-06, + "loss": 0.4295, + "step": 115 + }, + { + "ETA": 5.61, + "epoch": 0.037305032963498956, + "fp16_scale": 1.0, + "global_step": 116, + "grad_norm": 1.8876528914974746, + "learning_rate": 1.9997372628649476e-06, + "loss": 0.4415, + "step": 116 + }, + { + "ETA": 5.61, + "epoch": 0.03762662807525326, + "fp16_scale": 1.0, + "global_step": 117, + "grad_norm": 1.9802708149084507, + "learning_rate": 1.999712835994738e-06, + "loss": 0.5122, + "step": 117 + }, + { + "ETA": 5.61, + "epoch": 0.03794822318700756, + "fp16_scale": 1.0, + "global_step": 118, + "grad_norm": 2.1211433879182144, + "learning_rate": 1.9996873236982257e-06, + "loss": 0.5276, + "step": 118 + }, + { + "ETA": 5.61, + "epoch": 0.03826981829876186, + "fp16_scale": 1.0, + "global_step": 119, + "grad_norm": 1.621281570789123, + "learning_rate": 1.9996607260031105e-06, + "loss": 0.4932, + "step": 119 + }, + { + "ETA": 5.61, + "epoch": 0.03859141341051616, + "fp16_scale": 1.0, + "global_step": 120, + "grad_norm": 1.9653619168923202, + "learning_rate": 1.9996330429382703e-06, + "loss": 0.5654, + "step": 120 + }, + { + "ETA": 5.62, + "epoch": 0.03891300852227046, + "fp16_scale": 1.0, + "global_step": 121, + "grad_norm": 2.2123648273822942, + "learning_rate": 1.9996042745337615e-06, + "loss": 0.4775, + "step": 121 + }, + { + "ETA": 5.62, + "epoch": 0.03923460363402476, + "fp16_scale": 1.0, + "global_step": 122, + "grad_norm": 1.9972465034521092, + "learning_rate": 1.9995744208208192e-06, + "loss": 0.4768, + "step": 122 + }, + { + "ETA": 5.62, + "epoch": 0.039556198745779064, + "fp16_scale": 1.0, + "global_step": 123, + "grad_norm": 2.0828761416592663, + "learning_rate": 1.999543481831857e-06, + "loss": 0.5236, + "step": 123 + }, + { + "ETA": 5.63, + "epoch": 0.039877793857533365, + "fp16_scale": 1.0, + "global_step": 124, + "grad_norm": 2.262870812400955, + "learning_rate": 1.999511457600466e-06, + "loss": 0.5392, + "step": 124 + }, + { + "ETA": 5.63, + "epoch": 0.04019938896928767, + "fp16_scale": 1.0, + "global_step": 125, + "grad_norm": 2.006385958193691, + "learning_rate": 1.9994783481614164e-06, + "loss": 0.4454, + "step": 125 + }, + { + "ETA": 5.62, + "epoch": 0.04052098408104197, + "fp16_scale": 1.0, + "global_step": 126, + "grad_norm": 1.9169684519984513, + "learning_rate": 1.9994441535506564e-06, + "loss": 0.409, + "step": 126 + }, + { + "ETA": 5.63, + "epoch": 0.04084257919279627, + "fp16_scale": 1.0, + "global_step": 127, + "grad_norm": 1.7978841639226444, + "learning_rate": 1.9994088738053125e-06, + "loss": 0.5075, + "step": 127 + }, + { + "ETA": 5.63, + "epoch": 0.04116417430455057, + "fp16_scale": 1.0, + "global_step": 128, + "grad_norm": 2.281911267986894, + "learning_rate": 1.999372508963689e-06, + "loss": 0.5013, + "step": 128 + }, + { + "ETA": 5.62, + "epoch": 0.04148576941630487, + "fp16_scale": 1.0, + "global_step": 129, + "grad_norm": 2.0421301534446243, + "learning_rate": 1.999335059065269e-06, + "loss": 0.4435, + "step": 129 + }, + { + "ETA": 5.62, + "epoch": 0.04180736452805917, + "fp16_scale": 1.0, + "global_step": 130, + "grad_norm": 1.930763418660808, + "learning_rate": 1.9992965241507127e-06, + "loss": 0.4362, + "step": 130 + }, + { + "ETA": 5.6, + "epoch": 0.042128959639813474, + "fp16_scale": 1.0, + "global_step": 131, + "grad_norm": 2.1611057009273718, + "learning_rate": 1.9992569042618594e-06, + "loss": 0.4057, + "step": 131 + }, + { + "ETA": 5.6, + "epoch": 0.042450554751567775, + "fp16_scale": 1.0, + "global_step": 132, + "grad_norm": 2.068532510448745, + "learning_rate": 1.999216199441726e-06, + "loss": 0.5291, + "step": 132 + }, + { + "ETA": 5.61, + "epoch": 0.042772149863322076, + "fp16_scale": 1.0, + "global_step": 133, + "grad_norm": 2.1349463536595232, + "learning_rate": 1.999174409734507e-06, + "loss": 0.4926, + "step": 133 + }, + { + "ETA": 5.61, + "epoch": 0.04309374497507638, + "fp16_scale": 1.0, + "global_step": 134, + "grad_norm": 1.7812791752819839, + "learning_rate": 1.9991315351855745e-06, + "loss": 0.3955, + "step": 134 + }, + { + "ETA": 5.62, + "epoch": 0.04341534008683068, + "fp16_scale": 1.0, + "global_step": 135, + "grad_norm": 2.2711082613133877, + "learning_rate": 1.99908757584148e-06, + "loss": 0.5022, + "step": 135 + }, + { + "ETA": 5.61, + "epoch": 0.04373693519858498, + "fp16_scale": 1.0, + "global_step": 136, + "grad_norm": 2.2507108902753004, + "learning_rate": 1.9990425317499516e-06, + "loss": 0.4648, + "step": 136 + }, + { + "ETA": 5.61, + "epoch": 0.04405853031033928, + "fp16_scale": 1.0, + "global_step": 137, + "grad_norm": 2.1452354144039996, + "learning_rate": 1.998996402959895e-06, + "loss": 0.4525, + "step": 137 + }, + { + "ETA": 5.61, + "epoch": 0.04438012542209358, + "fp16_scale": 1.0, + "global_step": 138, + "grad_norm": 2.0144953999608965, + "learning_rate": 1.9989491895213946e-06, + "loss": 0.4986, + "step": 138 + }, + { + "ETA": 5.59, + "epoch": 0.04470172053384788, + "fp16_scale": 1.0, + "global_step": 139, + "grad_norm": 2.229946650769043, + "learning_rate": 1.998900891485711e-06, + "loss": 0.3849, + "step": 139 + }, + { + "ETA": 5.59, + "epoch": 0.045023315645602184, + "fp16_scale": 1.0, + "global_step": 140, + "grad_norm": 2.0253377758809625, + "learning_rate": 1.998851508905284e-06, + "loss": 0.581, + "step": 140 + }, + { + "ETA": 5.6, + "epoch": 0.045344910757356485, + "fp16_scale": 1.0, + "global_step": 141, + "grad_norm": 2.206479589908534, + "learning_rate": 1.9988010418337304e-06, + "loss": 0.4867, + "step": 141 + }, + { + "ETA": 5.61, + "epoch": 0.045666505869110786, + "fp16_scale": 1.0, + "global_step": 142, + "grad_norm": 1.7500501463013969, + "learning_rate": 1.998749490325843e-06, + "loss": 0.4973, + "step": 142 + }, + { + "ETA": 5.61, + "epoch": 0.04598810098086509, + "fp16_scale": 1.0, + "global_step": 143, + "grad_norm": 1.6690448702127838, + "learning_rate": 1.998696854437594e-06, + "loss": 0.4401, + "step": 143 + }, + { + "ETA": 5.61, + "epoch": 0.04630969609261939, + "fp16_scale": 1.0, + "global_step": 144, + "grad_norm": 1.9301595047071682, + "learning_rate": 1.998643134226132e-06, + "loss": 0.4081, + "step": 144 + }, + { + "ETA": 5.61, + "epoch": 0.0466312912043737, + "fp16_scale": 1.0, + "global_step": 145, + "grad_norm": 2.031377215245968, + "learning_rate": 1.998588329749783e-06, + "loss": 0.5083, + "step": 145 + }, + { + "ETA": 5.61, + "epoch": 0.046952886316128, + "fp16_scale": 1.0, + "global_step": 146, + "grad_norm": 1.9480273584696632, + "learning_rate": 1.998532441068051e-06, + "loss": 0.4895, + "step": 146 + }, + { + "ETA": 5.61, + "epoch": 0.0472744814278823, + "fp16_scale": 1.0, + "global_step": 147, + "grad_norm": 1.940140543978122, + "learning_rate": 1.9984754682416157e-06, + "loss": 0.5188, + "step": 147 + }, + { + "ETA": 5.59, + "epoch": 0.0475960765396366, + "fp16_scale": 1.0, + "global_step": 148, + "grad_norm": 2.566694912162044, + "learning_rate": 1.998417411332335e-06, + "loss": 0.4565, + "step": 148 + }, + { + "ETA": 5.6, + "epoch": 0.0479176716513909, + "fp16_scale": 1.0, + "global_step": 149, + "grad_norm": 2.1015541461979628, + "learning_rate": 1.9983582704032434e-06, + "loss": 0.466, + "step": 149 + }, + { + "ETA": 5.61, + "epoch": 0.0482392667631452, + "fp16_scale": 1.0, + "global_step": 150, + "grad_norm": 2.1246119471767777, + "learning_rate": 1.9982980455185523e-06, + "loss": 0.5224, + "step": 150 + }, + { + "ETA": 5.59, + "epoch": 0.048560861874899504, + "fp16_scale": 1.0, + "global_step": 151, + "grad_norm": 2.302387604082975, + "learning_rate": 1.9982367367436505e-06, + "loss": 0.4105, + "step": 151 + }, + { + "ETA": 5.57, + "epoch": 0.048882456986653805, + "fp16_scale": 1.0, + "global_step": 152, + "grad_norm": 2.2524238140564226, + "learning_rate": 1.998174344145103e-06, + "loss": 0.4797, + "step": 152 + }, + { + "ETA": 5.57, + "epoch": 0.049204052098408106, + "fp16_scale": 1.0, + "global_step": 153, + "grad_norm": 1.8770814388726234, + "learning_rate": 1.9981108677906516e-06, + "loss": 0.435, + "step": 153 + }, + { + "ETA": 5.57, + "epoch": 0.04952564721016241, + "fp16_scale": 1.0, + "global_step": 154, + "grad_norm": 2.2883451205924352, + "learning_rate": 1.9980463077492156e-06, + "loss": 0.5423, + "step": 154 + }, + { + "ETA": 5.55, + "epoch": 0.04984724232191671, + "fp16_scale": 1.0, + "global_step": 155, + "grad_norm": 2.0598058556836634, + "learning_rate": 1.9979806640908904e-06, + "loss": 0.4055, + "step": 155 + }, + { + "ETA": 5.55, + "epoch": 0.05016883743367101, + "fp16_scale": 1.0, + "global_step": 156, + "grad_norm": 2.151177357556838, + "learning_rate": 1.997913936886947e-06, + "loss": 0.541, + "step": 156 + }, + { + "ETA": 5.55, + "epoch": 0.05049043254542531, + "fp16_scale": 1.0, + "global_step": 157, + "grad_norm": 1.9583108423046163, + "learning_rate": 1.997846126209834e-06, + "loss": 0.5167, + "step": 157 + }, + { + "ETA": 5.55, + "epoch": 0.05081202765717961, + "fp16_scale": 1.0, + "global_step": 158, + "grad_norm": 1.7191485045098294, + "learning_rate": 1.997777232133176e-06, + "loss": 0.48, + "step": 158 + }, + { + "ETA": 5.55, + "epoch": 0.05113362276893391, + "fp16_scale": 1.0, + "global_step": 159, + "grad_norm": 1.871417794238402, + "learning_rate": 1.997707254731775e-06, + "loss": 0.4349, + "step": 159 + }, + { + "ETA": 5.55, + "epoch": 0.051455217880688214, + "fp16_scale": 1.0, + "global_step": 160, + "grad_norm": 1.8506517380221816, + "learning_rate": 1.997636194081606e-06, + "loss": 0.4633, + "step": 160 + }, + { + "ETA": 5.56, + "epoch": 0.051776812992442516, + "fp16_scale": 1.0, + "global_step": 161, + "grad_norm": 2.248773620263743, + "learning_rate": 1.997564050259824e-06, + "loss": 0.4719, + "step": 161 + }, + { + "ETA": 5.54, + "epoch": 0.05209840810419682, + "fp16_scale": 1.0, + "global_step": 162, + "grad_norm": 2.25152787158001, + "learning_rate": 1.997490823344758e-06, + "loss": 0.4203, + "step": 162 + }, + { + "ETA": 5.54, + "epoch": 0.05242000321595112, + "fp16_scale": 1.0, + "global_step": 163, + "grad_norm": 1.8999498537580275, + "learning_rate": 1.9974165134159125e-06, + "loss": 0.5587, + "step": 163 + }, + { + "ETA": 5.54, + "epoch": 0.05274159832770542, + "fp16_scale": 1.0, + "global_step": 164, + "grad_norm": 1.9716592311180214, + "learning_rate": 1.9973411205539693e-06, + "loss": 0.5083, + "step": 164 + }, + { + "ETA": 5.54, + "epoch": 0.05306319343945972, + "fp16_scale": 1.0, + "global_step": 165, + "grad_norm": 2.1338753921220284, + "learning_rate": 1.9972646448407852e-06, + "loss": 0.469, + "step": 165 + }, + { + "ETA": 5.54, + "epoch": 0.05338478855121402, + "fp16_scale": 1.0, + "global_step": 166, + "grad_norm": 1.8632227385106377, + "learning_rate": 1.9971870863593923e-06, + "loss": 0.4324, + "step": 166 + }, + { + "ETA": 5.54, + "epoch": 0.05370638366296832, + "fp16_scale": 1.0, + "global_step": 167, + "grad_norm": 1.9945406494512294, + "learning_rate": 1.9971084451939993e-06, + "loss": 0.4701, + "step": 167 + }, + { + "ETA": 5.56, + "epoch": 0.054027978774722624, + "fp16_scale": 1.0, + "global_step": 168, + "grad_norm": 2.014063734615643, + "learning_rate": 1.99702872142999e-06, + "loss": 0.5058, + "step": 168 + }, + { + "ETA": 5.56, + "epoch": 0.054349573886476925, + "fp16_scale": 1.0, + "global_step": 169, + "grad_norm": 1.9732053379353365, + "learning_rate": 1.9969479151539234e-06, + "loss": 0.4995, + "step": 169 + }, + { + "ETA": 5.56, + "epoch": 0.054671168998231226, + "fp16_scale": 1.0, + "global_step": 170, + "grad_norm": 1.7441435113487023, + "learning_rate": 1.9968660264535337e-06, + "loss": 0.4408, + "step": 170 + }, + { + "ETA": 5.56, + "epoch": 0.05499276410998553, + "fp16_scale": 1.0, + "global_step": 171, + "grad_norm": 2.252759572663441, + "learning_rate": 1.996783055417731e-06, + "loss": 0.4896, + "step": 171 + }, + { + "ETA": 5.56, + "epoch": 0.05531435922173983, + "fp16_scale": 1.0, + "global_step": 172, + "grad_norm": 1.8886045826013742, + "learning_rate": 1.9966990021365996e-06, + "loss": 0.4924, + "step": 172 + }, + { + "ETA": 5.56, + "epoch": 0.05563595433349413, + "fp16_scale": 1.0, + "global_step": 173, + "grad_norm": 1.9236696733010126, + "learning_rate": 1.9966138667014e-06, + "loss": 0.4812, + "step": 173 + }, + { + "ETA": 5.56, + "epoch": 0.05595754944524843, + "fp16_scale": 1.0, + "global_step": 174, + "grad_norm": 1.8929409263112333, + "learning_rate": 1.9965276492045658e-06, + "loss": 0.4767, + "step": 174 + }, + { + "ETA": 5.56, + "epoch": 0.05627914455700273, + "fp16_scale": 1.0, + "global_step": 175, + "grad_norm": 2.138949107889882, + "learning_rate": 1.996440349739708e-06, + "loss": 0.5348, + "step": 175 + }, + { + "ETA": 5.56, + "epoch": 0.05660073966875703, + "fp16_scale": 1.0, + "global_step": 176, + "grad_norm": 2.0879534448263315, + "learning_rate": 1.9963519684016104e-06, + "loss": 0.5359, + "step": 176 + }, + { + "ETA": 5.54, + "epoch": 0.056922334780511334, + "fp16_scale": 1.0, + "global_step": 177, + "grad_norm": 2.1666246712030013, + "learning_rate": 1.996262505286232e-06, + "loss": 0.4162, + "step": 177 + }, + { + "ETA": 5.55, + "epoch": 0.057243929892265635, + "fp16_scale": 1.0, + "global_step": 178, + "grad_norm": 1.7446995140521049, + "learning_rate": 1.9961719604907065e-06, + "loss": 0.4333, + "step": 178 + }, + { + "ETA": 5.54, + "epoch": 0.057565525004019936, + "fp16_scale": 1.0, + "global_step": 179, + "grad_norm": 2.243637912780736, + "learning_rate": 1.996080334113341e-06, + "loss": 0.4457, + "step": 179 + }, + { + "ETA": 5.53, + "epoch": 0.05788712011577424, + "fp16_scale": 1.0, + "global_step": 180, + "grad_norm": 1.9630983410591967, + "learning_rate": 1.995987626253619e-06, + "loss": 0.4734, + "step": 180 + }, + { + "ETA": 5.53, + "epoch": 0.05820871522752854, + "fp16_scale": 1.0, + "global_step": 181, + "grad_norm": 2.0683050944198205, + "learning_rate": 1.995893837012196e-06, + "loss": 0.4875, + "step": 181 + }, + { + "ETA": 5.53, + "epoch": 0.05853031033928284, + "fp16_scale": 1.0, + "global_step": 182, + "grad_norm": 1.9810359539411369, + "learning_rate": 1.9957989664909025e-06, + "loss": 0.5567, + "step": 182 + }, + { + "ETA": 5.53, + "epoch": 0.05885190545103714, + "fp16_scale": 1.0, + "global_step": 183, + "grad_norm": 1.9715690339103922, + "learning_rate": 1.995703014792744e-06, + "loss": 0.4645, + "step": 183 + }, + { + "ETA": 5.52, + "epoch": 0.05917350056279144, + "fp16_scale": 1.0, + "global_step": 184, + "grad_norm": 2.27653607742063, + "learning_rate": 1.995605982021898e-06, + "loss": 0.4036, + "step": 184 + }, + { + "ETA": 5.52, + "epoch": 0.059495095674545743, + "fp16_scale": 1.0, + "global_step": 185, + "grad_norm": 2.1737370192078567, + "learning_rate": 1.995507868283717e-06, + "loss": 0.5222, + "step": 185 + }, + { + "ETA": 5.52, + "epoch": 0.05981669078630005, + "fp16_scale": 1.0, + "global_step": 186, + "grad_norm": 2.218311186174217, + "learning_rate": 1.995408673684727e-06, + "loss": 0.4512, + "step": 186 + }, + { + "ETA": 5.52, + "epoch": 0.06013828589805435, + "fp16_scale": 1.0, + "global_step": 187, + "grad_norm": 2.0221690338291576, + "learning_rate": 1.995308398332627e-06, + "loss": 0.5215, + "step": 187 + }, + { + "ETA": 5.52, + "epoch": 0.060459881009808654, + "fp16_scale": 1.0, + "global_step": 188, + "grad_norm": 2.09772793745461, + "learning_rate": 1.99520704233629e-06, + "loss": 0.5167, + "step": 188 + }, + { + "ETA": 5.5, + "epoch": 0.060781476121562955, + "fp16_scale": 1.0, + "global_step": 189, + "grad_norm": 2.149692926673016, + "learning_rate": 1.995104605805762e-06, + "loss": 0.4484, + "step": 189 + }, + { + "ETA": 5.5, + "epoch": 0.061103071233317256, + "fp16_scale": 1.0, + "global_step": 190, + "grad_norm": 1.871556062942563, + "learning_rate": 1.995001088852262e-06, + "loss": 0.475, + "step": 190 + }, + { + "ETA": 5.5, + "epoch": 0.06142466634507156, + "fp16_scale": 1.0, + "global_step": 191, + "grad_norm": 2.063540687892712, + "learning_rate": 1.9948964915881833e-06, + "loss": 0.4103, + "step": 191 + }, + { + "ETA": 5.5, + "epoch": 0.06174626145682586, + "fp16_scale": 1.0, + "global_step": 192, + "grad_norm": 2.0723345018771515, + "learning_rate": 1.9947908141270895e-06, + "loss": 0.4584, + "step": 192 + }, + { + "ETA": 5.5, + "epoch": 0.06206785656858016, + "fp16_scale": 1.0, + "global_step": 193, + "grad_norm": 2.1660972095275164, + "learning_rate": 1.9946840565837204e-06, + "loss": 0.5952, + "step": 193 + }, + { + "ETA": 5.51, + "epoch": 0.06238945168033446, + "fp16_scale": 1.0, + "global_step": 194, + "grad_norm": 2.0273096299920965, + "learning_rate": 1.994576219073985e-06, + "loss": 0.5321, + "step": 194 + }, + { + "ETA": 5.51, + "epoch": 0.06271104679208876, + "fp16_scale": 1.0, + "global_step": 195, + "grad_norm": 2.141105415839073, + "learning_rate": 1.994467301714968e-06, + "loss": 0.4956, + "step": 195 + }, + { + "ETA": 5.51, + "epoch": 0.06303264190384306, + "fp16_scale": 1.0, + "global_step": 196, + "grad_norm": 1.8902357894072024, + "learning_rate": 1.9943573046249244e-06, + "loss": 0.5354, + "step": 196 + }, + { + "ETA": 5.51, + "epoch": 0.06335423701559736, + "fp16_scale": 1.0, + "global_step": 197, + "grad_norm": 1.8151811916682024, + "learning_rate": 1.9942462279232824e-06, + "loss": 0.4465, + "step": 197 + }, + { + "ETA": 5.51, + "epoch": 0.06367583212735166, + "fp16_scale": 1.0, + "global_step": 198, + "grad_norm": 1.9991294470323713, + "learning_rate": 1.9941340717306423e-06, + "loss": 0.4476, + "step": 198 + }, + { + "ETA": 5.51, + "epoch": 0.06399742723910597, + "fp16_scale": 1.0, + "global_step": 199, + "grad_norm": 1.8900639064764466, + "learning_rate": 1.9940208361687756e-06, + "loss": 0.4736, + "step": 199 + }, + { + "ETA": 5.51, + "epoch": 0.06431902235086026, + "fp16_scale": 1.0, + "global_step": 200, + "grad_norm": 2.0981671204932897, + "learning_rate": 1.993906521360628e-06, + "loss": 0.5347, + "step": 200 + }, + { + "ETA": 5.64, + "epoch": 0.06464061746261457, + "fp16_scale": 1.0, + "global_step": 201, + "grad_norm": 2.1689594357999513, + "learning_rate": 1.9937911274303143e-06, + "loss": 0.5682, + "step": 201 + }, + { + "ETA": 5.64, + "epoch": 0.06496221257436886, + "fp16_scale": 1.0, + "global_step": 202, + "grad_norm": 2.05451397459771, + "learning_rate": 1.993674654503122e-06, + "loss": 0.4233, + "step": 202 + }, + { + "ETA": 5.64, + "epoch": 0.06528380768612317, + "fp16_scale": 1.0, + "global_step": 203, + "grad_norm": 2.1267490771823003, + "learning_rate": 1.993557102705511e-06, + "loss": 0.4746, + "step": 203 + }, + { + "ETA": 5.64, + "epoch": 0.06560540279787747, + "fp16_scale": 1.0, + "global_step": 204, + "grad_norm": 2.0247502131741846, + "learning_rate": 1.9934384721651113e-06, + "loss": 0.4593, + "step": 204 + }, + { + "ETA": 5.62, + "epoch": 0.06592699790963177, + "fp16_scale": 1.0, + "global_step": 205, + "grad_norm": 2.3792680496159235, + "learning_rate": 1.9933187630107243e-06, + "loss": 0.4401, + "step": 205 + }, + { + "ETA": 5.62, + "epoch": 0.06624859302138608, + "fp16_scale": 1.0, + "global_step": 206, + "grad_norm": 1.8766085210911245, + "learning_rate": 1.9931979753723233e-06, + "loss": 0.5038, + "step": 206 + }, + { + "ETA": 5.63, + "epoch": 0.06657018813314038, + "fp16_scale": 1.0, + "global_step": 207, + "grad_norm": 2.13191856771071, + "learning_rate": 1.993076109381052e-06, + "loss": 0.4498, + "step": 207 + }, + { + "ETA": 5.63, + "epoch": 0.06689178324489468, + "fp16_scale": 1.0, + "global_step": 208, + "grad_norm": 2.7407131077048708, + "learning_rate": 1.9929531651692245e-06, + "loss": 0.4447, + "step": 208 + }, + { + "ETA": 5.63, + "epoch": 0.06721337835664898, + "fp16_scale": 1.0, + "global_step": 209, + "grad_norm": 2.1340982723217263, + "learning_rate": 1.992829142870326e-06, + "loss": 0.4677, + "step": 209 + }, + { + "ETA": 5.63, + "epoch": 0.06753497346840329, + "fp16_scale": 1.0, + "global_step": 210, + "grad_norm": 2.0796370416466363, + "learning_rate": 1.992704042619013e-06, + "loss": 0.4343, + "step": 210 + }, + { + "ETA": 5.62, + "epoch": 0.06785656858015758, + "fp16_scale": 1.0, + "global_step": 211, + "grad_norm": 2.0605035706073087, + "learning_rate": 1.992577864551111e-06, + "loss": 0.4034, + "step": 211 + }, + { + "ETA": 5.63, + "epoch": 0.06817816369191189, + "fp16_scale": 1.0, + "global_step": 212, + "grad_norm": 1.980858167776178, + "learning_rate": 1.9924506088036165e-06, + "loss": 0.4863, + "step": 212 + }, + { + "ETA": 5.62, + "epoch": 0.06849975880366618, + "fp16_scale": 1.0, + "global_step": 213, + "grad_norm": 2.114940893621464, + "learning_rate": 1.9923222755146957e-06, + "loss": 0.4586, + "step": 213 + }, + { + "ETA": 5.61, + "epoch": 0.06882135391542049, + "fp16_scale": 1.0, + "global_step": 214, + "grad_norm": 2.1489712242568233, + "learning_rate": 1.992192864823685e-06, + "loss": 0.3696, + "step": 214 + }, + { + "ETA": 5.61, + "epoch": 0.06914294902717479, + "fp16_scale": 1.0, + "global_step": 215, + "grad_norm": 2.28734431293892, + "learning_rate": 1.992062376871091e-06, + "loss": 0.5426, + "step": 215 + }, + { + "ETA": 5.61, + "epoch": 0.0694645441389291, + "fp16_scale": 1.0, + "global_step": 216, + "grad_norm": 1.961631918540186, + "learning_rate": 1.991930811798589e-06, + "loss": 0.3718, + "step": 216 + }, + { + "ETA": 5.61, + "epoch": 0.06978613925068339, + "fp16_scale": 1.0, + "global_step": 217, + "grad_norm": 2.201954772079071, + "learning_rate": 1.991798169749024e-06, + "loss": 0.4645, + "step": 217 + }, + { + "ETA": 5.61, + "epoch": 0.0701077343624377, + "fp16_scale": 1.0, + "global_step": 218, + "grad_norm": 1.683968639976631, + "learning_rate": 1.9916644508664113e-06, + "loss": 0.4626, + "step": 218 + }, + { + "ETA": 5.61, + "epoch": 0.07042932947419199, + "fp16_scale": 1.0, + "global_step": 219, + "grad_norm": 1.8954887135461083, + "learning_rate": 1.991529655295934e-06, + "loss": 0.4655, + "step": 219 + }, + { + "ETA": 5.61, + "epoch": 0.0707509245859463, + "fp16_scale": 1.0, + "global_step": 220, + "grad_norm": 1.9778515969605384, + "learning_rate": 1.9913937831839447e-06, + "loss": 0.4583, + "step": 220 + }, + { + "ETA": 5.59, + "epoch": 0.07107251969770059, + "fp16_scale": 1.0, + "global_step": 221, + "grad_norm": 2.263287480391095, + "learning_rate": 1.991256834677965e-06, + "loss": 0.4433, + "step": 221 + }, + { + "ETA": 5.59, + "epoch": 0.0713941148094549, + "fp16_scale": 1.0, + "global_step": 222, + "grad_norm": 2.3745252441991695, + "learning_rate": 1.991118809926685e-06, + "loss": 0.4594, + "step": 222 + }, + { + "ETA": 5.59, + "epoch": 0.0717157099212092, + "fp16_scale": 1.0, + "global_step": 223, + "grad_norm": 1.794064858274459, + "learning_rate": 1.990979709079964e-06, + "loss": 0.4435, + "step": 223 + }, + { + "ETA": 5.59, + "epoch": 0.0720373050329635, + "fp16_scale": 1.0, + "global_step": 224, + "grad_norm": 2.1273526787298747, + "learning_rate": 1.9908395322888292e-06, + "loss": 0.4445, + "step": 224 + }, + { + "ETA": 5.59, + "epoch": 0.0723589001447178, + "fp16_scale": 1.0, + "global_step": 225, + "grad_norm": 1.9294042532521454, + "learning_rate": 1.9906982797054747e-06, + "loss": 0.4026, + "step": 225 + }, + { + "ETA": 5.59, + "epoch": 0.0726804952564721, + "fp16_scale": 1.0, + "global_step": 226, + "grad_norm": 2.0009389469804906, + "learning_rate": 1.990555951483265e-06, + "loss": 0.4871, + "step": 226 + }, + { + "ETA": 5.59, + "epoch": 0.0730020903682264, + "fp16_scale": 1.0, + "global_step": 227, + "grad_norm": 1.9451852255069821, + "learning_rate": 1.99041254777673e-06, + "loss": 0.4363, + "step": 227 + }, + { + "ETA": 5.59, + "epoch": 0.07332368547998071, + "fp16_scale": 1.0, + "global_step": 228, + "grad_norm": 1.9439742030991134, + "learning_rate": 1.99026806874157e-06, + "loss": 0.3954, + "step": 228 + }, + { + "ETA": 5.57, + "epoch": 0.073645280591735, + "fp16_scale": 1.0, + "global_step": 229, + "grad_norm": 2.0564657113810303, + "learning_rate": 1.990122514534651e-06, + "loss": 0.4065, + "step": 229 + }, + { + "ETA": 5.56, + "epoch": 0.07396687570348931, + "fp16_scale": 1.0, + "global_step": 230, + "grad_norm": 2.0852125947795663, + "learning_rate": 1.9899758853140062e-06, + "loss": 0.3481, + "step": 230 + }, + { + "ETA": 5.54, + "epoch": 0.0742884708152436, + "fp16_scale": 1.0, + "global_step": 231, + "grad_norm": 2.0552294493253815, + "learning_rate": 1.9898281812388367e-06, + "loss": 0.3555, + "step": 231 + }, + { + "ETA": 5.54, + "epoch": 0.07461006592699791, + "fp16_scale": 1.0, + "global_step": 232, + "grad_norm": 1.9173492037411566, + "learning_rate": 1.9896794024695106e-06, + "loss": 0.441, + "step": 232 + }, + { + "ETA": 5.53, + "epoch": 0.0749316610387522, + "fp16_scale": 1.0, + "global_step": 233, + "grad_norm": 2.206279686080648, + "learning_rate": 1.9895295491675627e-06, + "loss": 0.4212, + "step": 233 + }, + { + "ETA": 5.53, + "epoch": 0.07525325615050651, + "fp16_scale": 1.0, + "global_step": 234, + "grad_norm": 2.1281257459738536, + "learning_rate": 1.989378621495694e-06, + "loss": 0.4442, + "step": 234 + }, + { + "ETA": 5.53, + "epoch": 0.07557485126226081, + "fp16_scale": 1.0, + "global_step": 235, + "grad_norm": 2.1215344537423224, + "learning_rate": 1.9892266196177734e-06, + "loss": 0.4542, + "step": 235 + }, + { + "ETA": 5.52, + "epoch": 0.07589644637401512, + "fp16_scale": 1.0, + "global_step": 236, + "grad_norm": 2.090130869035227, + "learning_rate": 1.9890735436988344e-06, + "loss": 0.5499, + "step": 236 + }, + { + "ETA": 5.52, + "epoch": 0.07621804148576941, + "fp16_scale": 1.0, + "global_step": 237, + "grad_norm": 1.937700495165257, + "learning_rate": 1.9889193939050776e-06, + "loss": 0.438, + "step": 237 + }, + { + "ETA": 5.52, + "epoch": 0.07653963659752372, + "fp16_scale": 1.0, + "global_step": 238, + "grad_norm": 2.0169598442542207, + "learning_rate": 1.988764170403869e-06, + "loss": 0.3989, + "step": 238 + }, + { + "ETA": 5.52, + "epoch": 0.07686123170927801, + "fp16_scale": 1.0, + "global_step": 239, + "grad_norm": 1.9172068608330903, + "learning_rate": 1.9886078733637405e-06, + "loss": 0.4921, + "step": 239 + }, + { + "ETA": 5.52, + "epoch": 0.07718282682103232, + "fp16_scale": 1.0, + "global_step": 240, + "grad_norm": 2.0508531717431784, + "learning_rate": 1.9884505029543905e-06, + "loss": 0.4666, + "step": 240 + }, + { + "ETA": 5.52, + "epoch": 0.07750442193278662, + "fp16_scale": 1.0, + "global_step": 241, + "grad_norm": 1.8813618835383783, + "learning_rate": 1.9882920593466815e-06, + "loss": 0.4799, + "step": 241 + }, + { + "ETA": 5.51, + "epoch": 0.07782601704454092, + "fp16_scale": 1.0, + "global_step": 242, + "grad_norm": 2.0156583473726357, + "learning_rate": 1.988132542712642e-06, + "loss": 0.4824, + "step": 242 + }, + { + "ETA": 5.52, + "epoch": 0.07814761215629522, + "fp16_scale": 1.0, + "global_step": 243, + "grad_norm": 1.8786998169485898, + "learning_rate": 1.9879719532254654e-06, + "loss": 0.4526, + "step": 243 + }, + { + "ETA": 5.52, + "epoch": 0.07846920726804953, + "fp16_scale": 1.0, + "global_step": 244, + "grad_norm": 1.9040461898330572, + "learning_rate": 1.9878102910595096e-06, + "loss": 0.4329, + "step": 244 + }, + { + "ETA": 5.52, + "epoch": 0.07879080237980382, + "fp16_scale": 1.0, + "global_step": 245, + "grad_norm": 2.178599517193598, + "learning_rate": 1.9876475563902967e-06, + "loss": 0.448, + "step": 245 + }, + { + "ETA": 5.52, + "epoch": 0.07911239749155813, + "fp16_scale": 1.0, + "global_step": 246, + "grad_norm": 2.242242117764703, + "learning_rate": 1.987483749394515e-06, + "loss": 0.4983, + "step": 246 + }, + { + "ETA": 5.5, + "epoch": 0.07943399260331242, + "fp16_scale": 1.0, + "global_step": 247, + "grad_norm": 2.5317565590513804, + "learning_rate": 1.9873188702500162e-06, + "loss": 0.4893, + "step": 247 + }, + { + "ETA": 5.49, + "epoch": 0.07975558771506673, + "fp16_scale": 1.0, + "global_step": 248, + "grad_norm": 2.307404938147276, + "learning_rate": 1.9871529191358147e-06, + "loss": 0.4137, + "step": 248 + }, + { + "ETA": 5.49, + "epoch": 0.08007718282682104, + "fp16_scale": 1.0, + "global_step": 249, + "grad_norm": 2.0851611752705255, + "learning_rate": 1.9869858962320907e-06, + "loss": 0.5131, + "step": 249 + }, + { + "ETA": 5.49, + "epoch": 0.08039877793857533, + "fp16_scale": 1.0, + "global_step": 250, + "grad_norm": 1.855787380898543, + "learning_rate": 1.986817801720187e-06, + "loss": 0.5206, + "step": 250 + }, + { + "ETA": 5.48, + "epoch": 0.08072037305032964, + "fp16_scale": 1.0, + "global_step": 251, + "grad_norm": 1.9574119303708928, + "learning_rate": 1.9866486357826107e-06, + "loss": 0.4617, + "step": 251 + }, + { + "ETA": 5.49, + "epoch": 0.08104196816208394, + "fp16_scale": 1.0, + "global_step": 252, + "grad_norm": 2.0464551158026674, + "learning_rate": 1.9864783986030313e-06, + "loss": 0.4136, + "step": 252 + }, + { + "ETA": 5.49, + "epoch": 0.08136356327383824, + "fp16_scale": 1.0, + "global_step": 253, + "grad_norm": 1.9828752781504224, + "learning_rate": 1.9863070903662816e-06, + "loss": 0.4896, + "step": 253 + }, + { + "ETA": 5.48, + "epoch": 0.08168515838559254, + "fp16_scale": 1.0, + "global_step": 254, + "grad_norm": 2.111681055324118, + "learning_rate": 1.986134711258358e-06, + "loss": 0.4898, + "step": 254 + }, + { + "ETA": 5.47, + "epoch": 0.08200675349734685, + "fp16_scale": 1.0, + "global_step": 255, + "grad_norm": 2.152892422133239, + "learning_rate": 1.9859612614664184e-06, + "loss": 0.3783, + "step": 255 + }, + { + "ETA": 5.47, + "epoch": 0.08232834860910114, + "fp16_scale": 1.0, + "global_step": 256, + "grad_norm": 2.1160619106041128, + "learning_rate": 1.9857867411787847e-06, + "loss": 0.4835, + "step": 256 + }, + { + "ETA": 5.47, + "epoch": 0.08264994372085545, + "fp16_scale": 1.0, + "global_step": 257, + "grad_norm": 2.0588220447500998, + "learning_rate": 1.9856111505849395e-06, + "loss": 0.5168, + "step": 257 + }, + { + "ETA": 5.47, + "epoch": 0.08297153883260974, + "fp16_scale": 1.0, + "global_step": 258, + "grad_norm": 2.155900845405635, + "learning_rate": 1.9854344898755286e-06, + "loss": 0.474, + "step": 258 + }, + { + "ETA": 5.46, + "epoch": 0.08329313394436405, + "fp16_scale": 1.0, + "global_step": 259, + "grad_norm": 2.4831085607503414, + "learning_rate": 1.985256759242359e-06, + "loss": 0.44, + "step": 259 + }, + { + "ETA": 5.46, + "epoch": 0.08361472905611834, + "fp16_scale": 1.0, + "global_step": 260, + "grad_norm": 2.1759924833508464, + "learning_rate": 1.9850779588783996e-06, + "loss": 0.47, + "step": 260 + }, + { + "ETA": 5.45, + "epoch": 0.08393632416787265, + "fp16_scale": 1.0, + "global_step": 261, + "grad_norm": 2.2814754125500345, + "learning_rate": 1.9848980889777815e-06, + "loss": 0.4207, + "step": 261 + }, + { + "ETA": 5.45, + "epoch": 0.08425791927962695, + "fp16_scale": 1.0, + "global_step": 262, + "grad_norm": 2.0695546230875115, + "learning_rate": 1.984717149735795e-06, + "loss": 0.5273, + "step": 262 + }, + { + "ETA": 5.45, + "epoch": 0.08457951439138126, + "fp16_scale": 1.0, + "global_step": 263, + "grad_norm": 2.011815287590636, + "learning_rate": 1.984535141348894e-06, + "loss": 0.4799, + "step": 263 + }, + { + "ETA": 5.45, + "epoch": 0.08490110950313555, + "fp16_scale": 1.0, + "global_step": 264, + "grad_norm": 1.7799971082339463, + "learning_rate": 1.9843520640146907e-06, + "loss": 0.5345, + "step": 264 + }, + { + "ETA": 5.45, + "epoch": 0.08522270461488986, + "fp16_scale": 1.0, + "global_step": 265, + "grad_norm": 2.022283024501158, + "learning_rate": 1.9841679179319603e-06, + "loss": 0.4232, + "step": 265 + }, + { + "ETA": 5.44, + "epoch": 0.08554429972664415, + "fp16_scale": 1.0, + "global_step": 266, + "grad_norm": 2.1994235877050237, + "learning_rate": 1.983982703300637e-06, + "loss": 0.4251, + "step": 266 + }, + { + "ETA": 5.44, + "epoch": 0.08586589483839846, + "fp16_scale": 1.0, + "global_step": 267, + "grad_norm": 1.7288150832221585, + "learning_rate": 1.9837964203218146e-06, + "loss": 0.4375, + "step": 267 + }, + { + "ETA": 5.44, + "epoch": 0.08618748995015275, + "fp16_scale": 1.0, + "global_step": 268, + "grad_norm": 1.8733532543135558, + "learning_rate": 1.9836090691977484e-06, + "loss": 0.452, + "step": 268 + }, + { + "ETA": 5.44, + "epoch": 0.08650908506190706, + "fp16_scale": 1.0, + "global_step": 269, + "grad_norm": 1.9609845801107526, + "learning_rate": 1.983420650131852e-06, + "loss": 0.4456, + "step": 269 + }, + { + "ETA": 5.44, + "epoch": 0.08683068017366136, + "fp16_scale": 1.0, + "global_step": 270, + "grad_norm": 2.0104836911219515, + "learning_rate": 1.9832311633287e-06, + "loss": 0.4146, + "step": 270 + }, + { + "ETA": 5.44, + "epoch": 0.08715227528541566, + "fp16_scale": 1.0, + "global_step": 271, + "grad_norm": 1.9748172750720034, + "learning_rate": 1.9830406089940248e-06, + "loss": 0.516, + "step": 271 + }, + { + "ETA": 5.44, + "epoch": 0.08747387039716996, + "fp16_scale": 1.0, + "global_step": 272, + "grad_norm": 1.7308986178352186, + "learning_rate": 1.982848987334719e-06, + "loss": 0.4786, + "step": 272 + }, + { + "ETA": 5.43, + "epoch": 0.08779546550892427, + "fp16_scale": 1.0, + "global_step": 273, + "grad_norm": 2.407921562331774, + "learning_rate": 1.9826562985588327e-06, + "loss": 0.5075, + "step": 273 + }, + { + "ETA": 5.42, + "epoch": 0.08811706062067856, + "fp16_scale": 1.0, + "global_step": 274, + "grad_norm": 2.4177834248326686, + "learning_rate": 1.9824625428755758e-06, + "loss": 0.3536, + "step": 274 + }, + { + "ETA": 5.42, + "epoch": 0.08843865573243287, + "fp16_scale": 1.0, + "global_step": 275, + "grad_norm": 1.9921521614436994, + "learning_rate": 1.9822677204953168e-06, + "loss": 0.4529, + "step": 275 + }, + { + "ETA": 5.41, + "epoch": 0.08876025084418716, + "fp16_scale": 1.0, + "global_step": 276, + "grad_norm": 2.1416846030041032, + "learning_rate": 1.9820718316295814e-06, + "loss": 0.4128, + "step": 276 + }, + { + "ETA": 5.41, + "epoch": 0.08908184595594147, + "fp16_scale": 1.0, + "global_step": 277, + "grad_norm": 1.96303970778231, + "learning_rate": 1.9818748764910537e-06, + "loss": 0.4963, + "step": 277 + }, + { + "ETA": 5.41, + "epoch": 0.08940344106769577, + "fp16_scale": 1.0, + "global_step": 278, + "grad_norm": 2.16885895447799, + "learning_rate": 1.981676855293575e-06, + "loss": 0.4541, + "step": 278 + }, + { + "ETA": 5.41, + "epoch": 0.08972503617945007, + "fp16_scale": 1.0, + "global_step": 279, + "grad_norm": 2.0061306880759386, + "learning_rate": 1.9814777682521446e-06, + "loss": 0.5096, + "step": 279 + }, + { + "ETA": 5.41, + "epoch": 0.09004663129120437, + "fp16_scale": 1.0, + "global_step": 280, + "grad_norm": 2.0713104021770787, + "learning_rate": 1.981277615582919e-06, + "loss": 0.479, + "step": 280 + }, + { + "ETA": 5.41, + "epoch": 0.09036822640295868, + "fp16_scale": 1.0, + "global_step": 281, + "grad_norm": 1.9008364956507453, + "learning_rate": 1.9810763975032115e-06, + "loss": 0.4401, + "step": 281 + }, + { + "ETA": 5.41, + "epoch": 0.09068982151471297, + "fp16_scale": 1.0, + "global_step": 282, + "grad_norm": 1.9828121117507633, + "learning_rate": 1.9808741142314927e-06, + "loss": 0.4563, + "step": 282 + }, + { + "ETA": 5.41, + "epoch": 0.09101141662646728, + "fp16_scale": 1.0, + "global_step": 283, + "grad_norm": 1.8067168743021271, + "learning_rate": 1.9806707659873885e-06, + "loss": 0.5321, + "step": 283 + }, + { + "ETA": 5.41, + "epoch": 0.09133301173822157, + "fp16_scale": 1.0, + "global_step": 284, + "grad_norm": 1.8139567240837908, + "learning_rate": 1.9804663529916823e-06, + "loss": 0.396, + "step": 284 + }, + { + "ETA": 5.41, + "epoch": 0.09165460684997588, + "fp16_scale": 1.0, + "global_step": 285, + "grad_norm": 1.9160801554334261, + "learning_rate": 1.980260875466313e-06, + "loss": 0.4233, + "step": 285 + }, + { + "ETA": 5.41, + "epoch": 0.09197620196173018, + "fp16_scale": 1.0, + "global_step": 286, + "grad_norm": 2.152353481726994, + "learning_rate": 1.9800543336343757e-06, + "loss": 0.551, + "step": 286 + }, + { + "ETA": 5.4, + "epoch": 0.09229779707348448, + "fp16_scale": 1.0, + "global_step": 287, + "grad_norm": 2.3467354491283334, + "learning_rate": 1.9798467277201197e-06, + "loss": 0.3933, + "step": 287 + }, + { + "ETA": 5.4, + "epoch": 0.09261939218523878, + "fp16_scale": 1.0, + "global_step": 288, + "grad_norm": 1.9965345728302655, + "learning_rate": 1.9796380579489517e-06, + "loss": 0.5057, + "step": 288 + }, + { + "ETA": 5.4, + "epoch": 0.09294098729699309, + "fp16_scale": 1.0, + "global_step": 289, + "grad_norm": 2.226085466112576, + "learning_rate": 1.9794283245474318e-06, + "loss": 0.4997, + "step": 289 + }, + { + "ETA": 5.4, + "epoch": 0.0932625824087474, + "fp16_scale": 1.0, + "global_step": 290, + "grad_norm": 2.3422961278712022, + "learning_rate": 1.9792175277432763e-06, + "loss": 0.4265, + "step": 290 + }, + { + "ETA": 5.4, + "epoch": 0.09358417752050169, + "fp16_scale": 1.0, + "global_step": 291, + "grad_norm": 2.250545144667838, + "learning_rate": 1.9790056677653543e-06, + "loss": 0.4607, + "step": 291 + }, + { + "ETA": 5.4, + "epoch": 0.093905772632256, + "fp16_scale": 1.0, + "global_step": 292, + "grad_norm": 2.107671337765196, + "learning_rate": 1.978792744843691e-06, + "loss": 0.4609, + "step": 292 + }, + { + "ETA": 5.4, + "epoch": 0.09422736774401029, + "fp16_scale": 1.0, + "global_step": 293, + "grad_norm": 2.3046588962914516, + "learning_rate": 1.9785787592094646e-06, + "loss": 0.49, + "step": 293 + }, + { + "ETA": 5.4, + "epoch": 0.0945489628557646, + "fp16_scale": 1.0, + "global_step": 294, + "grad_norm": 2.121159234142659, + "learning_rate": 1.978363711095007e-06, + "loss": 0.4361, + "step": 294 + }, + { + "ETA": 5.4, + "epoch": 0.09487055796751889, + "fp16_scale": 1.0, + "global_step": 295, + "grad_norm": 2.002846941930535, + "learning_rate": 1.9781476007338054e-06, + "loss": 0.432, + "step": 295 + }, + { + "ETA": 5.4, + "epoch": 0.0951921530792732, + "fp16_scale": 1.0, + "global_step": 296, + "grad_norm": 1.8538864150738767, + "learning_rate": 1.9779304283604985e-06, + "loss": 0.4812, + "step": 296 + }, + { + "ETA": 5.39, + "epoch": 0.0955137481910275, + "fp16_scale": 1.0, + "global_step": 297, + "grad_norm": 2.571711408648999, + "learning_rate": 1.977712194210878e-06, + "loss": 0.4292, + "step": 297 + }, + { + "ETA": 5.39, + "epoch": 0.0958353433027818, + "fp16_scale": 1.0, + "global_step": 298, + "grad_norm": 1.9498087105851019, + "learning_rate": 1.977492898521889e-06, + "loss": 0.5071, + "step": 298 + }, + { + "ETA": 5.39, + "epoch": 0.0961569384145361, + "fp16_scale": 1.0, + "global_step": 299, + "grad_norm": 2.0510326617660404, + "learning_rate": 1.9772725415316304e-06, + "loss": 0.4764, + "step": 299 + }, + { + "ETA": 5.39, + "epoch": 0.0964785335262904, + "fp16_scale": 1.0, + "global_step": 300, + "grad_norm": 1.9448634485999639, + "learning_rate": 1.977051123479351e-06, + "loss": 0.4956, + "step": 300 + }, + { + "ETA": 5.39, + "epoch": 0.0968001286380447, + "fp16_scale": 1.0, + "global_step": 301, + "grad_norm": 2.178769414009859, + "learning_rate": 1.9768286446054532e-06, + "loss": 0.4224, + "step": 301 + }, + { + "ETA": 5.39, + "epoch": 0.09712172374979901, + "fp16_scale": 1.0, + "global_step": 302, + "grad_norm": 2.2638075232189108, + "learning_rate": 1.976605105151491e-06, + "loss": 0.4662, + "step": 302 + }, + { + "ETA": 5.38, + "epoch": 0.0974433188615533, + "fp16_scale": 1.0, + "global_step": 303, + "grad_norm": 2.434637731614345, + "learning_rate": 1.9763805053601696e-06, + "loss": 0.4414, + "step": 303 + }, + { + "ETA": 5.38, + "epoch": 0.09776491397330761, + "fp16_scale": 1.0, + "global_step": 304, + "grad_norm": 2.1063739090403812, + "learning_rate": 1.976154845475345e-06, + "loss": 0.4902, + "step": 304 + }, + { + "ETA": 5.38, + "epoch": 0.0980865090850619, + "fp16_scale": 1.0, + "global_step": 305, + "grad_norm": 2.2403744821922458, + "learning_rate": 1.9759281257420257e-06, + "loss": 0.5037, + "step": 305 + }, + { + "ETA": 5.38, + "epoch": 0.09840810419681621, + "fp16_scale": 1.0, + "global_step": 306, + "grad_norm": 1.9278762220091021, + "learning_rate": 1.9757003464063693e-06, + "loss": 0.4761, + "step": 306 + }, + { + "ETA": 5.38, + "epoch": 0.0987296993085705, + "fp16_scale": 1.0, + "global_step": 307, + "grad_norm": 1.9065898198947517, + "learning_rate": 1.975471507715685e-06, + "loss": 0.4683, + "step": 307 + }, + { + "ETA": 5.38, + "epoch": 0.09905129442032481, + "fp16_scale": 1.0, + "global_step": 308, + "grad_norm": 1.9777547584058885, + "learning_rate": 1.9752416099184304e-06, + "loss": 0.4321, + "step": 308 + }, + { + "ETA": 5.38, + "epoch": 0.09937288953207911, + "fp16_scale": 1.0, + "global_step": 309, + "grad_norm": 1.963299362703367, + "learning_rate": 1.9750106532642156e-06, + "loss": 0.5347, + "step": 309 + }, + { + "ETA": 5.37, + "epoch": 0.09969448464383342, + "fp16_scale": 1.0, + "global_step": 310, + "grad_norm": 2.2616629673340327, + "learning_rate": 1.974778638003799e-06, + "loss": 0.4078, + "step": 310 + }, + { + "ETA": 5.37, + "epoch": 0.10001607975558771, + "fp16_scale": 1.0, + "global_step": 311, + "grad_norm": 1.8888751677487754, + "learning_rate": 1.974545564389088e-06, + "loss": 0.4648, + "step": 311 + }, + { + "ETA": 5.37, + "epoch": 0.10033767486734202, + "fp16_scale": 1.0, + "global_step": 312, + "grad_norm": 2.090204859896294, + "learning_rate": 1.974311432673139e-06, + "loss": 0.4834, + "step": 312 + }, + { + "ETA": 5.36, + "epoch": 0.10065926997909631, + "fp16_scale": 1.0, + "global_step": 313, + "grad_norm": 1.9850263954290188, + "learning_rate": 1.974076243110159e-06, + "loss": 0.4134, + "step": 313 + }, + { + "ETA": 5.36, + "epoch": 0.10098086509085062, + "fp16_scale": 1.0, + "global_step": 314, + "grad_norm": 1.8760687192321301, + "learning_rate": 1.973839995955501e-06, + "loss": 0.4857, + "step": 314 + }, + { + "ETA": 5.36, + "epoch": 0.10130246020260492, + "fp16_scale": 1.0, + "global_step": 315, + "grad_norm": 1.8290568280756117, + "learning_rate": 1.9736026914656684e-06, + "loss": 0.4375, + "step": 315 + }, + { + "ETA": 5.36, + "epoch": 0.10162405531435922, + "fp16_scale": 1.0, + "global_step": 316, + "grad_norm": 2.2258972292420514, + "learning_rate": 1.973364329898311e-06, + "loss": 0.5097, + "step": 316 + }, + { + "ETA": 5.36, + "epoch": 0.10194565042611352, + "fp16_scale": 1.0, + "global_step": 317, + "grad_norm": 1.9283114461835762, + "learning_rate": 1.973124911512228e-06, + "loss": 0.481, + "step": 317 + }, + { + "ETA": 5.35, + "epoch": 0.10226724553786783, + "fp16_scale": 1.0, + "global_step": 318, + "grad_norm": 1.8324474077343869, + "learning_rate": 1.9728844365673643e-06, + "loss": 0.39, + "step": 318 + }, + { + "ETA": 5.35, + "epoch": 0.10258884064962212, + "fp16_scale": 1.0, + "global_step": 319, + "grad_norm": 2.2401290418034145, + "learning_rate": 1.9726429053248126e-06, + "loss": 0.446, + "step": 319 + }, + { + "ETA": 5.35, + "epoch": 0.10291043576137643, + "fp16_scale": 1.0, + "global_step": 320, + "grad_norm": 1.9427407352785937, + "learning_rate": 1.9724003180468134e-06, + "loss": 0.5237, + "step": 320 + }, + { + "ETA": 5.35, + "epoch": 0.10323203087313072, + "fp16_scale": 1.0, + "global_step": 321, + "grad_norm": 2.1206260936732795, + "learning_rate": 1.972156674996752e-06, + "loss": 0.4907, + "step": 321 + }, + { + "ETA": 5.35, + "epoch": 0.10355362598488503, + "fp16_scale": 1.0, + "global_step": 322, + "grad_norm": 2.1426857430727035, + "learning_rate": 1.971911976439162e-06, + "loss": 0.4884, + "step": 322 + }, + { + "ETA": 5.35, + "epoch": 0.10387522109663933, + "fp16_scale": 1.0, + "global_step": 323, + "grad_norm": 1.9358955934852775, + "learning_rate": 1.9716662226397206e-06, + "loss": 0.4172, + "step": 323 + }, + { + "ETA": 5.34, + "epoch": 0.10419681620839363, + "fp16_scale": 1.0, + "global_step": 324, + "grad_norm": 1.9755616719498104, + "learning_rate": 1.971419413865253e-06, + "loss": 0.4955, + "step": 324 + }, + { + "ETA": 5.34, + "epoch": 0.10451841132014793, + "fp16_scale": 1.0, + "global_step": 325, + "grad_norm": 2.1111800831528167, + "learning_rate": 1.9711715503837286e-06, + "loss": 0.4747, + "step": 325 + }, + { + "ETA": 5.34, + "epoch": 0.10484000643190224, + "fp16_scale": 1.0, + "global_step": 326, + "grad_norm": 2.0681166737032473, + "learning_rate": 1.9709226324642626e-06, + "loss": 0.4525, + "step": 326 + }, + { + "ETA": 5.34, + "epoch": 0.10516160154365653, + "fp16_scale": 1.0, + "global_step": 327, + "grad_norm": 2.0329374279997983, + "learning_rate": 1.970672660377114e-06, + "loss": 0.4459, + "step": 327 + }, + { + "ETA": 5.33, + "epoch": 0.10548319665541084, + "fp16_scale": 1.0, + "global_step": 328, + "grad_norm": 2.074488988732474, + "learning_rate": 1.970421634393687e-06, + "loss": 0.3724, + "step": 328 + }, + { + "ETA": 5.33, + "epoch": 0.10580479176716513, + "fp16_scale": 1.0, + "global_step": 329, + "grad_norm": 2.138757297944091, + "learning_rate": 1.970169554786531e-06, + "loss": 0.4979, + "step": 329 + }, + { + "ETA": 5.33, + "epoch": 0.10612638687891944, + "fp16_scale": 1.0, + "global_step": 330, + "grad_norm": 1.8627660554651546, + "learning_rate": 1.9699164218293377e-06, + "loss": 0.4033, + "step": 330 + }, + { + "ETA": 5.33, + "epoch": 0.10644798199067375, + "fp16_scale": 1.0, + "global_step": 331, + "grad_norm": 2.0531385160925346, + "learning_rate": 1.9696622357969435e-06, + "loss": 0.4733, + "step": 331 + }, + { + "ETA": 5.33, + "epoch": 0.10676957710242804, + "fp16_scale": 1.0, + "global_step": 332, + "grad_norm": 1.995724099048772, + "learning_rate": 1.9694069969653276e-06, + "loss": 0.4718, + "step": 332 + }, + { + "ETA": 5.33, + "epoch": 0.10709117221418235, + "fp16_scale": 1.0, + "global_step": 333, + "grad_norm": 1.8849998772482415, + "learning_rate": 1.9691507056116124e-06, + "loss": 0.4744, + "step": 333 + }, + { + "ETA": 5.33, + "epoch": 0.10741276732593665, + "fp16_scale": 1.0, + "global_step": 334, + "grad_norm": 2.002411293165941, + "learning_rate": 1.9688933620140635e-06, + "loss": 0.5237, + "step": 334 + }, + { + "ETA": 5.33, + "epoch": 0.10773436243769095, + "fp16_scale": 1.0, + "global_step": 335, + "grad_norm": 2.398691984124437, + "learning_rate": 1.9686349664520887e-06, + "loss": 0.4134, + "step": 335 + }, + { + "ETA": 5.32, + "epoch": 0.10805595754944525, + "fp16_scale": 1.0, + "global_step": 336, + "grad_norm": 1.9498967989275895, + "learning_rate": 1.968375519206238e-06, + "loss": 0.3973, + "step": 336 + }, + { + "ETA": 5.32, + "epoch": 0.10837755266119956, + "fp16_scale": 1.0, + "global_step": 337, + "grad_norm": 1.9017393329115277, + "learning_rate": 1.9681150205582025e-06, + "loss": 0.4283, + "step": 337 + }, + { + "ETA": 5.31, + "epoch": 0.10869914777295385, + "fp16_scale": 1.0, + "global_step": 338, + "grad_norm": 2.0508551842273097, + "learning_rate": 1.967853470790816e-06, + "loss": 0.3679, + "step": 338 + }, + { + "ETA": 5.31, + "epoch": 0.10902074288470816, + "fp16_scale": 1.0, + "global_step": 339, + "grad_norm": 2.1077150608156603, + "learning_rate": 1.967590870188053e-06, + "loss": 0.4512, + "step": 339 + }, + { + "ETA": 5.31, + "epoch": 0.10934233799646245, + "fp16_scale": 1.0, + "global_step": 340, + "grad_norm": 1.9681869106158016, + "learning_rate": 1.967327219035029e-06, + "loss": 0.4334, + "step": 340 + }, + { + "ETA": 5.31, + "epoch": 0.10966393310821676, + "fp16_scale": 1.0, + "global_step": 341, + "grad_norm": 2.109716319501794, + "learning_rate": 1.967062517618e-06, + "loss": 0.3853, + "step": 341 + }, + { + "ETA": 5.31, + "epoch": 0.10998552821997105, + "fp16_scale": 1.0, + "global_step": 342, + "grad_norm": 2.016872674766734, + "learning_rate": 1.9667967662243624e-06, + "loss": 0.4655, + "step": 342 + }, + { + "ETA": 5.3, + "epoch": 0.11030712333172536, + "fp16_scale": 1.0, + "global_step": 343, + "grad_norm": 2.108575161654793, + "learning_rate": 1.966529965142653e-06, + "loss": 0.4595, + "step": 343 + }, + { + "ETA": 5.3, + "epoch": 0.11062871844347966, + "fp16_scale": 1.0, + "global_step": 344, + "grad_norm": 2.173420307534748, + "learning_rate": 1.966262114662547e-06, + "loss": 0.4948, + "step": 344 + }, + { + "ETA": 5.3, + "epoch": 0.11095031355523396, + "fp16_scale": 1.0, + "global_step": 345, + "grad_norm": 2.0730606440639785, + "learning_rate": 1.9659932150748607e-06, + "loss": 0.4506, + "step": 345 + }, + { + "ETA": 5.3, + "epoch": 0.11127190866698826, + "fp16_scale": 1.0, + "global_step": 346, + "grad_norm": 1.9958990928537186, + "learning_rate": 1.9657232666715485e-06, + "loss": 0.5104, + "step": 346 + }, + { + "ETA": 5.3, + "epoch": 0.11159350377874257, + "fp16_scale": 1.0, + "global_step": 347, + "grad_norm": 1.7452582290531609, + "learning_rate": 1.9654522697457033e-06, + "loss": 0.4345, + "step": 347 + }, + { + "ETA": 5.3, + "epoch": 0.11191509889049686, + "fp16_scale": 1.0, + "global_step": 348, + "grad_norm": 1.8489716903183635, + "learning_rate": 1.9651802245915573e-06, + "loss": 0.4735, + "step": 348 + }, + { + "ETA": 5.29, + "epoch": 0.11223669400225117, + "fp16_scale": 1.0, + "global_step": 349, + "grad_norm": 2.3048133813373703, + "learning_rate": 1.9649071315044794e-06, + "loss": 0.4097, + "step": 349 + }, + { + "ETA": 5.28, + "epoch": 0.11255828911400546, + "fp16_scale": 1.0, + "global_step": 350, + "grad_norm": 2.2961743129791756, + "learning_rate": 1.9646329907809786e-06, + "loss": 0.4117, + "step": 350 + }, + { + "ETA": 5.28, + "epoch": 0.11287988422575977, + "fp16_scale": 1.0, + "global_step": 351, + "grad_norm": 1.9315862702027242, + "learning_rate": 1.9643578027186984e-06, + "loss": 0.4826, + "step": 351 + }, + { + "ETA": 5.28, + "epoch": 0.11320147933751407, + "fp16_scale": 1.0, + "global_step": 352, + "grad_norm": 1.9851217902296738, + "learning_rate": 1.9640815676164216e-06, + "loss": 0.5012, + "step": 352 + }, + { + "ETA": 5.28, + "epoch": 0.11352307444926837, + "fp16_scale": 1.0, + "global_step": 353, + "grad_norm": 2.3317631555471037, + "learning_rate": 1.9638042857740673e-06, + "loss": 0.4248, + "step": 353 + }, + { + "ETA": 5.28, + "epoch": 0.11384466956102267, + "fp16_scale": 1.0, + "global_step": 354, + "grad_norm": 1.5533112095795296, + "learning_rate": 1.963525957492691e-06, + "loss": 0.4486, + "step": 354 + }, + { + "ETA": 5.27, + "epoch": 0.11416626467277698, + "fp16_scale": 1.0, + "global_step": 355, + "grad_norm": 1.9359397572861832, + "learning_rate": 1.9632465830744845e-06, + "loss": 0.4699, + "step": 355 + }, + { + "ETA": 5.27, + "epoch": 0.11448785978453127, + "fp16_scale": 1.0, + "global_step": 356, + "grad_norm": 2.1492595181809464, + "learning_rate": 1.9629661628227743e-06, + "loss": 0.4772, + "step": 356 + }, + { + "ETA": 5.27, + "epoch": 0.11480945489628558, + "fp16_scale": 1.0, + "global_step": 357, + "grad_norm": 1.9286738656392266, + "learning_rate": 1.9626846970420244e-06, + "loss": 0.4863, + "step": 357 + }, + { + "ETA": 5.27, + "epoch": 0.11513105000803987, + "fp16_scale": 1.0, + "global_step": 358, + "grad_norm": 2.026612586228611, + "learning_rate": 1.9624021860378324e-06, + "loss": 0.382, + "step": 358 + }, + { + "ETA": 5.26, + "epoch": 0.11545264511979418, + "fp16_scale": 1.0, + "global_step": 359, + "grad_norm": 2.1481595085938614, + "learning_rate": 1.962118630116931e-06, + "loss": 0.3635, + "step": 359 + }, + { + "ETA": 5.26, + "epoch": 0.11577424023154848, + "fp16_scale": 1.0, + "global_step": 360, + "grad_norm": 2.203817729440887, + "learning_rate": 1.9618340295871887e-06, + "loss": 0.5334, + "step": 360 + }, + { + "ETA": 5.26, + "epoch": 0.11609583534330278, + "fp16_scale": 1.0, + "global_step": 361, + "grad_norm": 1.852740179654267, + "learning_rate": 1.9615483847576057e-06, + "loss": 0.4132, + "step": 361 + }, + { + "ETA": 5.26, + "epoch": 0.11641743045505708, + "fp16_scale": 1.0, + "global_step": 362, + "grad_norm": 2.1161828900485076, + "learning_rate": 1.9612616959383188e-06, + "loss": 0.4846, + "step": 362 + }, + { + "ETA": 5.25, + "epoch": 0.11673902556681139, + "fp16_scale": 1.0, + "global_step": 363, + "grad_norm": 2.191037208198011, + "learning_rate": 1.960973963440596e-06, + "loss": 0.3523, + "step": 363 + }, + { + "ETA": 5.25, + "epoch": 0.11706062067856568, + "fp16_scale": 1.0, + "global_step": 364, + "grad_norm": 2.063164123006888, + "learning_rate": 1.96068518757684e-06, + "loss": 0.4727, + "step": 364 + }, + { + "ETA": 5.24, + "epoch": 0.11738221579031999, + "fp16_scale": 1.0, + "global_step": 365, + "grad_norm": 2.020245619278409, + "learning_rate": 1.9603953686605858e-06, + "loss": 0.4733, + "step": 365 + }, + { + "ETA": 5.23, + "epoch": 0.11770381090207428, + "fp16_scale": 1.0, + "global_step": 366, + "grad_norm": 2.4414730636621056, + "learning_rate": 1.9601045070065e-06, + "loss": 0.4116, + "step": 366 + }, + { + "ETA": 5.24, + "epoch": 0.11802540601382859, + "fp16_scale": 1.0, + "global_step": 367, + "grad_norm": 1.8085290808011578, + "learning_rate": 1.9598126029303836e-06, + "loss": 0.473, + "step": 367 + }, + { + "ETA": 5.24, + "epoch": 0.11834700112558288, + "fp16_scale": 1.0, + "global_step": 368, + "grad_norm": 2.061404295864455, + "learning_rate": 1.9595196567491665e-06, + "loss": 0.48, + "step": 368 + }, + { + "ETA": 5.23, + "epoch": 0.11866859623733719, + "fp16_scale": 1.0, + "global_step": 369, + "grad_norm": 1.95202055920713, + "learning_rate": 1.9592256687809125e-06, + "loss": 0.4003, + "step": 369 + }, + { + "ETA": 5.23, + "epoch": 0.11899019134909149, + "fp16_scale": 1.0, + "global_step": 370, + "grad_norm": 1.8825953559894752, + "learning_rate": 1.958930639344815e-06, + "loss": 0.4016, + "step": 370 + }, + { + "ETA": 5.23, + "epoch": 0.1193117864608458, + "fp16_scale": 1.0, + "global_step": 371, + "grad_norm": 1.983391946392834, + "learning_rate": 1.958634568761199e-06, + "loss": 0.4306, + "step": 371 + }, + { + "ETA": 5.22, + "epoch": 0.1196333815726001, + "fp16_scale": 1.0, + "global_step": 372, + "grad_norm": 2.133957388129558, + "learning_rate": 1.9583374573515197e-06, + "loss": 0.4762, + "step": 372 + }, + { + "ETA": 5.22, + "epoch": 0.1199549766843544, + "fp16_scale": 1.0, + "global_step": 373, + "grad_norm": 2.150798639136317, + "learning_rate": 1.958039305438362e-06, + "loss": 0.3935, + "step": 373 + }, + { + "ETA": 5.22, + "epoch": 0.1202765717961087, + "fp16_scale": 1.0, + "global_step": 374, + "grad_norm": 1.9460695795967955, + "learning_rate": 1.957740113345441e-06, + "loss": 0.442, + "step": 374 + }, + { + "ETA": 5.21, + "epoch": 0.120598166907863, + "fp16_scale": 1.0, + "global_step": 375, + "grad_norm": 2.071648691990363, + "learning_rate": 1.9574398813976005e-06, + "loss": 0.4515, + "step": 375 + }, + { + "ETA": 5.22, + "epoch": 0.12091976201961731, + "fp16_scale": 1.0, + "global_step": 376, + "grad_norm": 2.3073717685750634, + "learning_rate": 1.9571386099208142e-06, + "loss": 0.4752, + "step": 376 + }, + { + "ETA": 5.21, + "epoch": 0.1212413571313716, + "fp16_scale": 1.0, + "global_step": 377, + "grad_norm": 2.323900215946003, + "learning_rate": 1.956836299242184e-06, + "loss": 0.4397, + "step": 377 + }, + { + "ETA": 5.21, + "epoch": 0.12156295224312591, + "fp16_scale": 1.0, + "global_step": 378, + "grad_norm": 2.2973185498237587, + "learning_rate": 1.9565329496899403e-06, + "loss": 0.4131, + "step": 378 + }, + { + "ETA": 5.2, + "epoch": 0.1218845473548802, + "fp16_scale": 1.0, + "global_step": 379, + "grad_norm": 1.9221938623341537, + "learning_rate": 1.9562285615934408e-06, + "loss": 0.4085, + "step": 379 + }, + { + "ETA": 5.2, + "epoch": 0.12220614246663451, + "fp16_scale": 1.0, + "global_step": 380, + "grad_norm": 2.145805547493304, + "learning_rate": 1.9559231352831715e-06, + "loss": 0.5559, + "step": 380 + }, + { + "ETA": 5.2, + "epoch": 0.1225277375783888, + "fp16_scale": 1.0, + "global_step": 381, + "grad_norm": 2.004554060401739, + "learning_rate": 1.955616671090745e-06, + "loss": 0.4517, + "step": 381 + }, + { + "ETA": 5.2, + "epoch": 0.12284933269014311, + "fp16_scale": 1.0, + "global_step": 382, + "grad_norm": 2.096385241775676, + "learning_rate": 1.9553091693489016e-06, + "loss": 0.5358, + "step": 382 + }, + { + "ETA": 5.2, + "epoch": 0.12317092780189741, + "fp16_scale": 1.0, + "global_step": 383, + "grad_norm": 2.1179397128222703, + "learning_rate": 1.955000630391508e-06, + "loss": 0.469, + "step": 383 + }, + { + "ETA": 5.2, + "epoch": 0.12349252291365172, + "fp16_scale": 1.0, + "global_step": 384, + "grad_norm": 1.885263273180588, + "learning_rate": 1.9546910545535556e-06, + "loss": 0.3887, + "step": 384 + }, + { + "ETA": 5.2, + "epoch": 0.12381411802540601, + "fp16_scale": 1.0, + "global_step": 385, + "grad_norm": 2.205463140577137, + "learning_rate": 1.9543804421711636e-06, + "loss": 0.5082, + "step": 385 + }, + { + "ETA": 5.2, + "epoch": 0.12413571313716032, + "fp16_scale": 1.0, + "global_step": 386, + "grad_norm": 2.03347548234032, + "learning_rate": 1.954068793581575e-06, + "loss": 0.4349, + "step": 386 + }, + { + "ETA": 5.19, + "epoch": 0.12445730824891461, + "fp16_scale": 1.0, + "global_step": 387, + "grad_norm": 2.078026220588503, + "learning_rate": 1.9537561091231596e-06, + "loss": 0.4077, + "step": 387 + }, + { + "ETA": 5.19, + "epoch": 0.12477890336066892, + "fp16_scale": 1.0, + "global_step": 388, + "grad_norm": 1.9983428126578808, + "learning_rate": 1.95344238913541e-06, + "loss": 0.4936, + "step": 388 + }, + { + "ETA": 5.19, + "epoch": 0.12510049847242322, + "fp16_scale": 1.0, + "global_step": 389, + "grad_norm": 1.9843240996334781, + "learning_rate": 1.953127633958944e-06, + "loss": 0.4746, + "step": 389 + }, + { + "ETA": 5.19, + "epoch": 0.12542209358417752, + "fp16_scale": 1.0, + "global_step": 390, + "grad_norm": 2.0047320593603777, + "learning_rate": 1.952811843935503e-06, + "loss": 0.494, + "step": 390 + }, + { + "ETA": 5.18, + "epoch": 0.12574368869593183, + "fp16_scale": 1.0, + "global_step": 391, + "grad_norm": 2.1912314948321923, + "learning_rate": 1.9524950194079533e-06, + "loss": 0.467, + "step": 391 + }, + { + "ETA": 5.18, + "epoch": 0.1260652838076861, + "fp16_scale": 1.0, + "global_step": 392, + "grad_norm": 2.011092299877131, + "learning_rate": 1.952177160720282e-06, + "loss": 0.4499, + "step": 392 + }, + { + "ETA": 5.17, + "epoch": 0.12638687891944042, + "fp16_scale": 1.0, + "global_step": 393, + "grad_norm": 2.040087836819472, + "learning_rate": 1.9518582682176016e-06, + "loss": 0.3774, + "step": 393 + }, + { + "ETA": 5.17, + "epoch": 0.12670847403119473, + "fp16_scale": 1.0, + "global_step": 394, + "grad_norm": 1.9845613783170721, + "learning_rate": 1.9515383422461455e-06, + "loss": 0.4742, + "step": 394 + }, + { + "ETA": 5.17, + "epoch": 0.12703006914294904, + "fp16_scale": 1.0, + "global_step": 395, + "grad_norm": 1.7916576850809898, + "learning_rate": 1.9512173831532686e-06, + "loss": 0.4283, + "step": 395 + }, + { + "ETA": 5.17, + "epoch": 0.12735166425470332, + "fp16_scale": 1.0, + "global_step": 396, + "grad_norm": 1.8518207434023177, + "learning_rate": 1.95089539128745e-06, + "loss": 0.398, + "step": 396 + }, + { + "ETA": 5.17, + "epoch": 0.12767325936645763, + "fp16_scale": 1.0, + "global_step": 397, + "grad_norm": 1.9396466358034898, + "learning_rate": 1.950572366998287e-06, + "loss": 0.3812, + "step": 397 + }, + { + "ETA": 5.17, + "epoch": 0.12799485447821193, + "fp16_scale": 1.0, + "global_step": 398, + "grad_norm": 2.1996149256600623, + "learning_rate": 1.9502483106365e-06, + "loss": 0.4114, + "step": 398 + }, + { + "ETA": 5.17, + "epoch": 0.12831644958996624, + "fp16_scale": 1.0, + "global_step": 399, + "grad_norm": 2.0924296142233763, + "learning_rate": 1.94992322255393e-06, + "loss": 0.5468, + "step": 399 + }, + { + "ETA": 5.17, + "epoch": 0.12863804470172052, + "fp16_scale": 1.0, + "global_step": 400, + "grad_norm": 2.1111375806811656, + "learning_rate": 1.9495971031035363e-06, + "loss": 0.4113, + "step": 400 + }, + { + "ETA": 5.22, + "epoch": 0.12895963981347483, + "fp16_scale": 1.0, + "global_step": 401, + "grad_norm": 2.0505877642386077, + "learning_rate": 1.9492699526394e-06, + "loss": 0.533, + "step": 401 + }, + { + "ETA": 5.22, + "epoch": 0.12928123492522914, + "fp16_scale": 1.0, + "global_step": 402, + "grad_norm": 2.2752919331626162, + "learning_rate": 1.948941771516721e-06, + "loss": 0.4066, + "step": 402 + }, + { + "ETA": 5.21, + "epoch": 0.12960283003698345, + "fp16_scale": 1.0, + "global_step": 403, + "grad_norm": 1.9498187127382178, + "learning_rate": 1.9486125600918176e-06, + "loss": 0.3798, + "step": 403 + }, + { + "ETA": 5.21, + "epoch": 0.12992442514873773, + "fp16_scale": 1.0, + "global_step": 404, + "grad_norm": 2.019174704757042, + "learning_rate": 1.948282318722127e-06, + "loss": 0.4407, + "step": 404 + }, + { + "ETA": 5.21, + "epoch": 0.13024602026049203, + "fp16_scale": 1.0, + "global_step": 405, + "grad_norm": 1.930174696471365, + "learning_rate": 1.947951047766205e-06, + "loss": 0.4836, + "step": 405 + }, + { + "ETA": 5.2, + "epoch": 0.13056761537224634, + "fp16_scale": 1.0, + "global_step": 406, + "grad_norm": 2.3780292591951055, + "learning_rate": 1.9476187475837253e-06, + "loss": 0.4406, + "step": 406 + }, + { + "ETA": 5.2, + "epoch": 0.13088921048400065, + "fp16_scale": 1.0, + "global_step": 407, + "grad_norm": 1.9749968561931892, + "learning_rate": 1.947285418535479e-06, + "loss": 0.4926, + "step": 407 + }, + { + "ETA": 5.2, + "epoch": 0.13121080559575493, + "fp16_scale": 1.0, + "global_step": 408, + "grad_norm": 1.9118323778875925, + "learning_rate": 1.9469510609833736e-06, + "loss": 0.4718, + "step": 408 + }, + { + "ETA": 5.2, + "epoch": 0.13153240070750924, + "fp16_scale": 1.0, + "global_step": 409, + "grad_norm": 1.9557921694287344, + "learning_rate": 1.946615675290434e-06, + "loss": 0.4796, + "step": 409 + }, + { + "ETA": 5.2, + "epoch": 0.13185399581926355, + "fp16_scale": 1.0, + "global_step": 410, + "grad_norm": 1.7568646908908685, + "learning_rate": 1.9462792618208016e-06, + "loss": 0.4673, + "step": 410 + }, + { + "ETA": 5.2, + "epoch": 0.13217559093101786, + "fp16_scale": 1.0, + "global_step": 411, + "grad_norm": 1.9691516740573827, + "learning_rate": 1.945941820939733e-06, + "loss": 0.5125, + "step": 411 + }, + { + "ETA": 5.2, + "epoch": 0.13249718604277216, + "fp16_scale": 1.0, + "global_step": 412, + "grad_norm": 2.1649453318010603, + "learning_rate": 1.9456033530136006e-06, + "loss": 0.5044, + "step": 412 + }, + { + "ETA": 5.19, + "epoch": 0.13281878115452644, + "fp16_scale": 1.0, + "global_step": 413, + "grad_norm": 2.1501165173009693, + "learning_rate": 1.945263858409892e-06, + "loss": 0.5147, + "step": 413 + }, + { + "ETA": 5.19, + "epoch": 0.13314037626628075, + "fp16_scale": 1.0, + "global_step": 414, + "grad_norm": 2.1142807297947805, + "learning_rate": 1.9449233374972092e-06, + "loss": 0.5006, + "step": 414 + }, + { + "ETA": 5.19, + "epoch": 0.13346197137803506, + "fp16_scale": 1.0, + "global_step": 415, + "grad_norm": 1.937485606893142, + "learning_rate": 1.9445817906452695e-06, + "loss": 0.4537, + "step": 415 + }, + { + "ETA": 5.19, + "epoch": 0.13378356648978937, + "fp16_scale": 1.0, + "global_step": 416, + "grad_norm": 2.1203546732040595, + "learning_rate": 1.944239218224902e-06, + "loss": 0.4673, + "step": 416 + }, + { + "ETA": 5.19, + "epoch": 0.13410516160154365, + "fp16_scale": 1.0, + "global_step": 417, + "grad_norm": 1.9626627677598947, + "learning_rate": 1.9438956206080523e-06, + "loss": 0.3959, + "step": 417 + }, + { + "ETA": 5.19, + "epoch": 0.13442675671329796, + "fp16_scale": 1.0, + "global_step": 418, + "grad_norm": 1.947392905660049, + "learning_rate": 1.943550998167776e-06, + "loss": 0.59, + "step": 418 + }, + { + "ETA": 5.19, + "epoch": 0.13474835182505226, + "fp16_scale": 1.0, + "global_step": 419, + "grad_norm": 2.2259788409758157, + "learning_rate": 1.9432053512782435e-06, + "loss": 0.4874, + "step": 419 + }, + { + "ETA": 5.18, + "epoch": 0.13506994693680657, + "fp16_scale": 1.0, + "global_step": 420, + "grad_norm": 2.325649234666594, + "learning_rate": 1.9428586803147364e-06, + "loss": 0.5795, + "step": 420 + }, + { + "ETA": 5.18, + "epoch": 0.13539154204856085, + "fp16_scale": 1.0, + "global_step": 421, + "grad_norm": 2.0260415473338163, + "learning_rate": 1.942510985653649e-06, + "loss": 0.4914, + "step": 421 + }, + { + "ETA": 5.18, + "epoch": 0.13571313716031516, + "fp16_scale": 1.0, + "global_step": 422, + "grad_norm": 1.898521737983934, + "learning_rate": 1.942162267672486e-06, + "loss": 0.4755, + "step": 422 + }, + { + "ETA": 5.18, + "epoch": 0.13603473227206947, + "fp16_scale": 1.0, + "global_step": 423, + "grad_norm": 2.144860138023159, + "learning_rate": 1.941812526749865e-06, + "loss": 0.5083, + "step": 423 + }, + { + "ETA": 5.18, + "epoch": 0.13635632738382378, + "fp16_scale": 1.0, + "global_step": 424, + "grad_norm": 1.99456776136853, + "learning_rate": 1.9414617632655112e-06, + "loss": 0.4749, + "step": 424 + }, + { + "ETA": 5.18, + "epoch": 0.13667792249557806, + "fp16_scale": 1.0, + "global_step": 425, + "grad_norm": 2.254945105453509, + "learning_rate": 1.9411099776002635e-06, + "loss": 0.5435, + "step": 425 + }, + { + "ETA": 5.18, + "epoch": 0.13699951760733237, + "fp16_scale": 1.0, + "global_step": 426, + "grad_norm": 1.9695960187702708, + "learning_rate": 1.940757170136068e-06, + "loss": 0.4438, + "step": 426 + }, + { + "ETA": 5.18, + "epoch": 0.13732111271908667, + "fp16_scale": 1.0, + "global_step": 427, + "grad_norm": 2.0018331783259615, + "learning_rate": 1.9404033412559825e-06, + "loss": 0.4976, + "step": 427 + }, + { + "ETA": 5.17, + "epoch": 0.13764270783084098, + "fp16_scale": 1.0, + "global_step": 428, + "grad_norm": 1.9666528999917499, + "learning_rate": 1.940048491344171e-06, + "loss": 0.4785, + "step": 428 + }, + { + "ETA": 5.17, + "epoch": 0.13796430294259526, + "fp16_scale": 1.0, + "global_step": 429, + "grad_norm": 1.8885843306751877, + "learning_rate": 1.9396926207859082e-06, + "loss": 0.5042, + "step": 429 + }, + { + "ETA": 5.17, + "epoch": 0.13828589805434957, + "fp16_scale": 1.0, + "global_step": 430, + "grad_norm": 1.9135829325509608, + "learning_rate": 1.9393357299675764e-06, + "loss": 0.5285, + "step": 430 + }, + { + "ETA": 5.17, + "epoch": 0.13860749316610388, + "fp16_scale": 1.0, + "global_step": 431, + "grad_norm": 2.038493470995957, + "learning_rate": 1.9389778192766656e-06, + "loss": 0.4968, + "step": 431 + }, + { + "ETA": 5.17, + "epoch": 0.1389290882778582, + "fp16_scale": 1.0, + "global_step": 432, + "grad_norm": 1.8705922440534466, + "learning_rate": 1.938618889101773e-06, + "loss": 0.4157, + "step": 432 + }, + { + "ETA": 5.17, + "epoch": 0.13925068338961247, + "fp16_scale": 1.0, + "global_step": 433, + "grad_norm": 1.7352670425372603, + "learning_rate": 1.938258939832602e-06, + "loss": 0.4729, + "step": 433 + }, + { + "ETA": 5.17, + "epoch": 0.13957227850136678, + "fp16_scale": 1.0, + "global_step": 434, + "grad_norm": 2.4242807468324874, + "learning_rate": 1.9378979718599642e-06, + "loss": 0.4365, + "step": 434 + }, + { + "ETA": 5.16, + "epoch": 0.13989387361312108, + "fp16_scale": 1.0, + "global_step": 435, + "grad_norm": 1.916994664393028, + "learning_rate": 1.9375359855757766e-06, + "loss": 0.4091, + "step": 435 + }, + { + "ETA": 5.16, + "epoch": 0.1402154687248754, + "fp16_scale": 1.0, + "global_step": 436, + "grad_norm": 1.8881238474894722, + "learning_rate": 1.9371729813730604e-06, + "loss": 0.4938, + "step": 436 + }, + { + "ETA": 5.16, + "epoch": 0.14053706383662967, + "fp16_scale": 1.0, + "global_step": 437, + "grad_norm": 2.1349766080230346, + "learning_rate": 1.9368089596459438e-06, + "loss": 0.4765, + "step": 437 + }, + { + "ETA": 5.16, + "epoch": 0.14085865894838398, + "fp16_scale": 1.0, + "global_step": 438, + "grad_norm": 2.127160685881295, + "learning_rate": 1.936443920789658e-06, + "loss": 0.5, + "step": 438 + }, + { + "ETA": 5.15, + "epoch": 0.1411802540601383, + "fp16_scale": 1.0, + "global_step": 439, + "grad_norm": 1.8930260580501126, + "learning_rate": 1.9360778652005414e-06, + "loss": 0.4855, + "step": 439 + }, + { + "ETA": 5.15, + "epoch": 0.1415018491718926, + "fp16_scale": 1.0, + "global_step": 440, + "grad_norm": 1.9548799762103741, + "learning_rate": 1.9357107932760332e-06, + "loss": 0.4879, + "step": 440 + }, + { + "ETA": 5.15, + "epoch": 0.14182344428364688, + "fp16_scale": 1.0, + "global_step": 441, + "grad_norm": 1.9317495953477675, + "learning_rate": 1.9353427054146774e-06, + "loss": 0.5375, + "step": 441 + }, + { + "ETA": 5.16, + "epoch": 0.14214503939540118, + "fp16_scale": 1.0, + "global_step": 442, + "grad_norm": 1.9857593253761028, + "learning_rate": 1.934973602016122e-06, + "loss": 0.4163, + "step": 442 + }, + { + "ETA": 5.15, + "epoch": 0.1424666345071555, + "fp16_scale": 1.0, + "global_step": 443, + "grad_norm": 1.9173337263435726, + "learning_rate": 1.9346034834811153e-06, + "loss": 0.452, + "step": 443 + }, + { + "ETA": 5.15, + "epoch": 0.1427882296189098, + "fp16_scale": 1.0, + "global_step": 444, + "grad_norm": 2.181530341125949, + "learning_rate": 1.93423235021151e-06, + "loss": 0.3778, + "step": 444 + }, + { + "ETA": 5.14, + "epoch": 0.14310982473066408, + "fp16_scale": 1.0, + "global_step": 445, + "grad_norm": 1.9846459794648879, + "learning_rate": 1.9338602026102594e-06, + "loss": 0.3937, + "step": 445 + }, + { + "ETA": 5.14, + "epoch": 0.1434314198424184, + "fp16_scale": 1.0, + "global_step": 446, + "grad_norm": 1.8759579249811038, + "learning_rate": 1.9334870410814178e-06, + "loss": 0.4015, + "step": 446 + }, + { + "ETA": 5.13, + "epoch": 0.1437530149541727, + "fp16_scale": 1.0, + "global_step": 447, + "grad_norm": 1.8976000883677417, + "learning_rate": 1.9331128660301417e-06, + "loss": 0.3701, + "step": 447 + }, + { + "ETA": 5.13, + "epoch": 0.144074610065927, + "fp16_scale": 1.0, + "global_step": 448, + "grad_norm": 2.0830861561346796, + "learning_rate": 1.932737677862687e-06, + "loss": 0.4587, + "step": 448 + }, + { + "ETA": 5.13, + "epoch": 0.14439620517768129, + "fp16_scale": 1.0, + "global_step": 449, + "grad_norm": 2.023843607690428, + "learning_rate": 1.932361476986409e-06, + "loss": 0.5054, + "step": 449 + }, + { + "ETA": 5.13, + "epoch": 0.1447178002894356, + "fp16_scale": 1.0, + "global_step": 450, + "grad_norm": 2.096781199048923, + "learning_rate": 1.9319842638097644e-06, + "loss": 0.3842, + "step": 450 + }, + { + "ETA": 5.13, + "epoch": 0.1450393954011899, + "fp16_scale": 1.0, + "global_step": 451, + "grad_norm": 1.9321513122728664, + "learning_rate": 1.9316060387423074e-06, + "loss": 0.5316, + "step": 451 + }, + { + "ETA": 5.12, + "epoch": 0.1453609905129442, + "fp16_scale": 1.0, + "global_step": 452, + "grad_norm": 2.3279603223291074, + "learning_rate": 1.9312268021946916e-06, + "loss": 0.4583, + "step": 452 + }, + { + "ETA": 5.12, + "epoch": 0.1456825856246985, + "fp16_scale": 1.0, + "global_step": 453, + "grad_norm": 2.014031977512335, + "learning_rate": 1.9308465545786682e-06, + "loss": 0.4189, + "step": 453 + }, + { + "ETA": 5.11, + "epoch": 0.1460041807364528, + "fp16_scale": 1.0, + "global_step": 454, + "grad_norm": 7.418229393654079, + "learning_rate": 1.9304652963070866e-06, + "loss": 0.4562, + "step": 454 + }, + { + "ETA": 5.11, + "epoch": 0.1463257758482071, + "fp16_scale": 1.0, + "global_step": 455, + "grad_norm": 2.0383858891895215, + "learning_rate": 1.9300830277938934e-06, + "loss": 0.3989, + "step": 455 + }, + { + "ETA": 5.11, + "epoch": 0.14664737095996142, + "fp16_scale": 1.0, + "global_step": 456, + "grad_norm": 1.9722056588404926, + "learning_rate": 1.9296997494541327e-06, + "loss": 0.4759, + "step": 456 + }, + { + "ETA": 5.11, + "epoch": 0.14696896607171572, + "fp16_scale": 1.0, + "global_step": 457, + "grad_norm": 2.1272312848730106, + "learning_rate": 1.9293154617039436e-06, + "loss": 0.4357, + "step": 457 + }, + { + "ETA": 5.11, + "epoch": 0.14729056118347, + "fp16_scale": 1.0, + "global_step": 458, + "grad_norm": 1.8574626919046473, + "learning_rate": 1.928930164960562e-06, + "loss": 0.4691, + "step": 458 + }, + { + "ETA": 5.11, + "epoch": 0.1476121562952243, + "fp16_scale": 1.0, + "global_step": 459, + "grad_norm": 1.9651718600396562, + "learning_rate": 1.92854385964232e-06, + "loss": 0.4896, + "step": 459 + }, + { + "ETA": 5.11, + "epoch": 0.14793375140697862, + "fp16_scale": 1.0, + "global_step": 460, + "grad_norm": 1.9352993455502014, + "learning_rate": 1.9281565461686436e-06, + "loss": 0.4551, + "step": 460 + }, + { + "ETA": 5.1, + "epoch": 0.14825534651873293, + "fp16_scale": 1.0, + "global_step": 461, + "grad_norm": 1.912081992082377, + "learning_rate": 1.9277682249600533e-06, + "loss": 0.5035, + "step": 461 + }, + { + "ETA": 5.1, + "epoch": 0.1485769416304872, + "fp16_scale": 1.0, + "global_step": 462, + "grad_norm": 2.0499940009129007, + "learning_rate": 1.9273788964381647e-06, + "loss": 0.4614, + "step": 462 + }, + { + "ETA": 5.1, + "epoch": 0.14889853674224152, + "fp16_scale": 1.0, + "global_step": 463, + "grad_norm": 2.010885292855413, + "learning_rate": 1.9269885610256865e-06, + "loss": 0.4487, + "step": 463 + }, + { + "ETA": 5.1, + "epoch": 0.14922013185399582, + "fp16_scale": 1.0, + "global_step": 464, + "grad_norm": 2.0314188890871083, + "learning_rate": 1.926597219146421e-06, + "loss": 0.475, + "step": 464 + }, + { + "ETA": 5.1, + "epoch": 0.14954172696575013, + "fp16_scale": 1.0, + "global_step": 465, + "grad_norm": 1.9700836175690337, + "learning_rate": 1.9262048712252623e-06, + "loss": 0.478, + "step": 465 + }, + { + "ETA": 5.1, + "epoch": 0.1498633220775044, + "fp16_scale": 1.0, + "global_step": 466, + "grad_norm": 2.071677060071096, + "learning_rate": 1.925811517688198e-06, + "loss": 0.4219, + "step": 466 + }, + { + "ETA": 5.1, + "epoch": 0.15018491718925872, + "fp16_scale": 1.0, + "global_step": 467, + "grad_norm": 1.846185694632217, + "learning_rate": 1.9254171589623074e-06, + "loss": 0.4181, + "step": 467 + }, + { + "ETA": 5.1, + "epoch": 0.15050651230101303, + "fp16_scale": 1.0, + "global_step": 468, + "grad_norm": 1.8185100096356368, + "learning_rate": 1.92502179547576e-06, + "loss": 0.3918, + "step": 468 + }, + { + "ETA": 5.1, + "epoch": 0.15082810741276734, + "fp16_scale": 1.0, + "global_step": 469, + "grad_norm": 2.23559095566495, + "learning_rate": 1.9246254276578174e-06, + "loss": 0.5203, + "step": 469 + }, + { + "ETA": 5.09, + "epoch": 0.15114970252452162, + "fp16_scale": 1.0, + "global_step": 470, + "grad_norm": 2.0903476148768294, + "learning_rate": 1.924228055938831e-06, + "loss": 0.4333, + "step": 470 + }, + { + "ETA": 5.09, + "epoch": 0.15147129763627593, + "fp16_scale": 1.0, + "global_step": 471, + "grad_norm": 2.6044925679999684, + "learning_rate": 1.9238296807502427e-06, + "loss": 0.4802, + "step": 471 + }, + { + "ETA": 5.09, + "epoch": 0.15179289274803023, + "fp16_scale": 1.0, + "global_step": 472, + "grad_norm": 1.9959524898824763, + "learning_rate": 1.9234303025245833e-06, + "loss": 0.4816, + "step": 472 + }, + { + "ETA": 5.08, + "epoch": 0.15211448785978454, + "fp16_scale": 1.0, + "global_step": 473, + "grad_norm": 1.8932233597555206, + "learning_rate": 1.9230299216954734e-06, + "loss": 0.4265, + "step": 473 + }, + { + "ETA": 5.08, + "epoch": 0.15243608297153882, + "fp16_scale": 1.0, + "global_step": 474, + "grad_norm": 1.9164271497723384, + "learning_rate": 1.922628538697621e-06, + "loss": 0.4487, + "step": 474 + }, + { + "ETA": 5.08, + "epoch": 0.15275767808329313, + "fp16_scale": 1.0, + "global_step": 475, + "grad_norm": 1.936569233446201, + "learning_rate": 1.922226153966824e-06, + "loss": 0.4762, + "step": 475 + }, + { + "ETA": 5.08, + "epoch": 0.15307927319504744, + "fp16_scale": 1.0, + "global_step": 476, + "grad_norm": 2.027042737818172, + "learning_rate": 1.9218227679399657e-06, + "loss": 0.4988, + "step": 476 + }, + { + "ETA": 5.08, + "epoch": 0.15340086830680175, + "fp16_scale": 1.0, + "global_step": 477, + "grad_norm": 1.972876234639711, + "learning_rate": 1.921418381055018e-06, + "loss": 0.5094, + "step": 477 + }, + { + "ETA": 5.08, + "epoch": 0.15372246341855603, + "fp16_scale": 1.0, + "global_step": 478, + "grad_norm": 1.9803998308295407, + "learning_rate": 1.92101299375104e-06, + "loss": 0.4616, + "step": 478 + }, + { + "ETA": 5.08, + "epoch": 0.15404405853031033, + "fp16_scale": 1.0, + "global_step": 479, + "grad_norm": 2.032838491790377, + "learning_rate": 1.920606606468175e-06, + "loss": 0.4205, + "step": 479 + }, + { + "ETA": 5.08, + "epoch": 0.15436565364206464, + "fp16_scale": 1.0, + "global_step": 480, + "grad_norm": 2.111802314751689, + "learning_rate": 1.9201992196476533e-06, + "loss": 0.4826, + "step": 480 + }, + { + "ETA": 5.07, + "epoch": 0.15468724875381895, + "fp16_scale": 1.0, + "global_step": 481, + "grad_norm": 2.117507142649786, + "learning_rate": 1.919790833731791e-06, + "loss": 0.3695, + "step": 481 + }, + { + "ETA": 5.06, + "epoch": 0.15500884386557323, + "fp16_scale": 1.0, + "global_step": 482, + "grad_norm": 2.1214152384528373, + "learning_rate": 1.919381449163988e-06, + "loss": 0.4471, + "step": 482 + }, + { + "ETA": 5.06, + "epoch": 0.15533043897732754, + "fp16_scale": 1.0, + "global_step": 483, + "grad_norm": 2.095435501434622, + "learning_rate": 1.9189710663887276e-06, + "loss": 0.4821, + "step": 483 + }, + { + "ETA": 5.06, + "epoch": 0.15565203408908185, + "fp16_scale": 1.0, + "global_step": 484, + "grad_norm": 2.03919483012636, + "learning_rate": 1.9185596858515797e-06, + "loss": 0.3305, + "step": 484 + }, + { + "ETA": 5.05, + "epoch": 0.15597362920083616, + "fp16_scale": 1.0, + "global_step": 485, + "grad_norm": 2.057899679469343, + "learning_rate": 1.918147307999195e-06, + "loss": 0.4001, + "step": 485 + }, + { + "ETA": 5.05, + "epoch": 0.15629522431259044, + "fp16_scale": 1.0, + "global_step": 486, + "grad_norm": 2.181468956952193, + "learning_rate": 1.9177339332793075e-06, + "loss": 0.3925, + "step": 486 + }, + { + "ETA": 5.04, + "epoch": 0.15661681942434474, + "fp16_scale": 1.0, + "global_step": 487, + "grad_norm": 1.9415284085155948, + "learning_rate": 1.917319562140735e-06, + "loss": 0.4425, + "step": 487 + }, + { + "ETA": 5.04, + "epoch": 0.15693841453609905, + "fp16_scale": 1.0, + "global_step": 488, + "grad_norm": 2.0426162218981725, + "learning_rate": 1.9169041950333747e-06, + "loss": 0.4202, + "step": 488 + }, + { + "ETA": 5.04, + "epoch": 0.15726000964785336, + "fp16_scale": 1.0, + "global_step": 489, + "grad_norm": 1.7610985955624534, + "learning_rate": 1.9164878324082073e-06, + "loss": 0.4983, + "step": 489 + }, + { + "ETA": 5.04, + "epoch": 0.15758160475960764, + "fp16_scale": 1.0, + "global_step": 490, + "grad_norm": 1.830034328847447, + "learning_rate": 1.9160704747172933e-06, + "loss": 0.4906, + "step": 490 + }, + { + "ETA": 5.04, + "epoch": 0.15790319987136195, + "fp16_scale": 1.0, + "global_step": 491, + "grad_norm": 1.985103108443925, + "learning_rate": 1.9156521224137742e-06, + "loss": 0.4479, + "step": 491 + }, + { + "ETA": 5.04, + "epoch": 0.15822479498311626, + "fp16_scale": 1.0, + "global_step": 492, + "grad_norm": 1.9678965370654322, + "learning_rate": 1.91523277595187e-06, + "loss": 0.4031, + "step": 492 + }, + { + "ETA": 5.04, + "epoch": 0.15854639009487057, + "fp16_scale": 1.0, + "global_step": 493, + "grad_norm": 1.8801773585155792, + "learning_rate": 1.9148124357868828e-06, + "loss": 0.4581, + "step": 493 + }, + { + "ETA": 5.04, + "epoch": 0.15886798520662485, + "fp16_scale": 1.0, + "global_step": 494, + "grad_norm": 2.0481246439049863, + "learning_rate": 1.9143911023751902e-06, + "loss": 0.4102, + "step": 494 + }, + { + "ETA": 5.03, + "epoch": 0.15918958031837915, + "fp16_scale": 1.0, + "global_step": 495, + "grad_norm": 2.100424592359454, + "learning_rate": 1.9139687761742512e-06, + "loss": 0.4851, + "step": 495 + }, + { + "ETA": 5.03, + "epoch": 0.15951117543013346, + "fp16_scale": 1.0, + "global_step": 496, + "grad_norm": 2.0182661530368824, + "learning_rate": 1.9135454576426007e-06, + "loss": 0.4697, + "step": 496 + }, + { + "ETA": 5.03, + "epoch": 0.15983277054188777, + "fp16_scale": 1.0, + "global_step": 497, + "grad_norm": 2.442367034410736, + "learning_rate": 1.913121147239852e-06, + "loss": 0.4239, + "step": 497 + }, + { + "ETA": 5.03, + "epoch": 0.16015436565364208, + "fp16_scale": 1.0, + "global_step": 498, + "grad_norm": 2.114188022740362, + "learning_rate": 1.9126958454266954e-06, + "loss": 0.4819, + "step": 498 + }, + { + "ETA": 5.03, + "epoch": 0.16047596076539636, + "fp16_scale": 1.0, + "global_step": 499, + "grad_norm": 2.166891992796271, + "learning_rate": 1.9122695526648967e-06, + "loss": 0.5122, + "step": 499 + }, + { + "ETA": 5.03, + "epoch": 0.16079755587715067, + "fp16_scale": 1.0, + "global_step": 500, + "grad_norm": 2.0689448466980225, + "learning_rate": 1.9118422694172984e-06, + "loss": 0.4343, + "step": 500 + }, + { + "ETA": 5.03, + "epoch": 0.16111915098890497, + "fp16_scale": 1.0, + "global_step": 501, + "grad_norm": 1.9182557739692911, + "learning_rate": 1.9114139961478182e-06, + "loss": 0.3912, + "step": 501 + }, + { + "ETA": 5.03, + "epoch": 0.16144074610065928, + "fp16_scale": 1.0, + "global_step": 502, + "grad_norm": 2.023205269994096, + "learning_rate": 1.910984733321449e-06, + "loss": 0.4111, + "step": 502 + }, + { + "ETA": 5.02, + "epoch": 0.16176234121241356, + "fp16_scale": 1.0, + "global_step": 503, + "grad_norm": 1.8019197922059353, + "learning_rate": 1.9105544814042574e-06, + "loss": 0.4212, + "step": 503 + }, + { + "ETA": 5.02, + "epoch": 0.16208393632416787, + "fp16_scale": 1.0, + "global_step": 504, + "grad_norm": 1.997365979153901, + "learning_rate": 1.9101232408633842e-06, + "loss": 0.5027, + "step": 504 + }, + { + "ETA": 5.02, + "epoch": 0.16240553143592218, + "fp16_scale": 1.0, + "global_step": 505, + "grad_norm": 2.1495963330876875, + "learning_rate": 1.9096910121670443e-06, + "loss": 0.3767, + "step": 505 + }, + { + "ETA": 5.01, + "epoch": 0.1627271265476765, + "fp16_scale": 1.0, + "global_step": 506, + "grad_norm": 2.1017123576575787, + "learning_rate": 1.909257795784524e-06, + "loss": 0.4223, + "step": 506 + }, + { + "ETA": 5.01, + "epoch": 0.16304872165943077, + "fp16_scale": 1.0, + "global_step": 507, + "grad_norm": 1.8670850009944489, + "learning_rate": 1.9088235921861836e-06, + "loss": 0.4779, + "step": 507 + }, + { + "ETA": 5.01, + "epoch": 0.16337031677118508, + "fp16_scale": 1.0, + "global_step": 508, + "grad_norm": 2.1192504020841043, + "learning_rate": 1.908388401843454e-06, + "loss": 0.4734, + "step": 508 + }, + { + "ETA": 5.01, + "epoch": 0.16369191188293938, + "fp16_scale": 1.0, + "global_step": 509, + "grad_norm": 2.0041831162050276, + "learning_rate": 1.9079522252288387e-06, + "loss": 0.4586, + "step": 509 + }, + { + "ETA": 5.0, + "epoch": 0.1640135069946937, + "fp16_scale": 1.0, + "global_step": 510, + "grad_norm": 2.1063411475899234, + "learning_rate": 1.90751506281591e-06, + "loss": 0.4009, + "step": 510 + }, + { + "ETA": 5.0, + "epoch": 0.16433510210644797, + "fp16_scale": 1.0, + "global_step": 511, + "grad_norm": 2.569817215911691, + "learning_rate": 1.9070769150793127e-06, + "loss": 0.5017, + "step": 511 + }, + { + "ETA": 5.0, + "epoch": 0.16465669721820228, + "fp16_scale": 1.0, + "global_step": 512, + "grad_norm": 1.8061384404289822, + "learning_rate": 1.9066377824947603e-06, + "loss": 0.4823, + "step": 512 + }, + { + "ETA": 5.0, + "epoch": 0.1649782923299566, + "fp16_scale": 1.0, + "global_step": 513, + "grad_norm": 2.246681058475501, + "learning_rate": 1.9061976655390354e-06, + "loss": 0.3808, + "step": 513 + }, + { + "ETA": 4.99, + "epoch": 0.1652998874417109, + "fp16_scale": 1.0, + "global_step": 514, + "grad_norm": 2.2809229794345414, + "learning_rate": 1.9057565646899905e-06, + "loss": 0.373, + "step": 514 + }, + { + "ETA": 4.99, + "epoch": 0.16562148255346518, + "fp16_scale": 1.0, + "global_step": 515, + "grad_norm": 2.24739897262686, + "learning_rate": 1.9053144804265448e-06, + "loss": 0.4451, + "step": 515 + }, + { + "ETA": 4.98, + "epoch": 0.16594307766521948, + "fp16_scale": 1.0, + "global_step": 516, + "grad_norm": 1.9156854430478418, + "learning_rate": 1.9048714132286867e-06, + "loss": 0.4203, + "step": 516 + }, + { + "ETA": 4.98, + "epoch": 0.1662646727769738, + "fp16_scale": 1.0, + "global_step": 517, + "grad_norm": 1.926065867868701, + "learning_rate": 1.9044273635774704e-06, + "loss": 0.438, + "step": 517 + }, + { + "ETA": 4.97, + "epoch": 0.1665862678887281, + "fp16_scale": 1.0, + "global_step": 518, + "grad_norm": 2.4219151111659167, + "learning_rate": 1.903982331955018e-06, + "loss": 0.4361, + "step": 518 + }, + { + "ETA": 4.97, + "epoch": 0.16690786300048238, + "fp16_scale": 1.0, + "global_step": 519, + "grad_norm": 1.9730124219605953, + "learning_rate": 1.9035363188445178e-06, + "loss": 0.4027, + "step": 519 + }, + { + "ETA": 4.97, + "epoch": 0.1672294581122367, + "fp16_scale": 1.0, + "global_step": 520, + "grad_norm": 1.9555521797448514, + "learning_rate": 1.9030893247302217e-06, + "loss": 0.4653, + "step": 520 + }, + { + "ETA": 4.96, + "epoch": 0.167551053223991, + "fp16_scale": 1.0, + "global_step": 521, + "grad_norm": 1.9937224078125575, + "learning_rate": 1.90264135009745e-06, + "loss": 0.5008, + "step": 521 + }, + { + "ETA": 4.96, + "epoch": 0.1678726483357453, + "fp16_scale": 1.0, + "global_step": 522, + "grad_norm": 2.000642568224559, + "learning_rate": 1.9021923954325844e-06, + "loss": 0.5437, + "step": 522 + }, + { + "ETA": 4.96, + "epoch": 0.1681942434474996, + "fp16_scale": 1.0, + "global_step": 523, + "grad_norm": 1.9943212517219038, + "learning_rate": 1.901742461223073e-06, + "loss": 0.4717, + "step": 523 + }, + { + "ETA": 4.96, + "epoch": 0.1685158385592539, + "fp16_scale": 1.0, + "global_step": 524, + "grad_norm": 2.122839075375804, + "learning_rate": 1.9012915479574262e-06, + "loss": 0.485, + "step": 524 + }, + { + "ETA": 4.96, + "epoch": 0.1688374336710082, + "fp16_scale": 1.0, + "global_step": 525, + "grad_norm": 1.9468881722832705, + "learning_rate": 1.900839656125217e-06, + "loss": 0.4064, + "step": 525 + }, + { + "ETA": 4.96, + "epoch": 0.1691590287827625, + "fp16_scale": 1.0, + "global_step": 526, + "grad_norm": 2.1150096020163605, + "learning_rate": 1.900386786217083e-06, + "loss": 0.4396, + "step": 526 + }, + { + "ETA": 4.96, + "epoch": 0.1694806238945168, + "fp16_scale": 1.0, + "global_step": 527, + "grad_norm": 2.012879701644392, + "learning_rate": 1.8999329387247215e-06, + "loss": 0.4797, + "step": 527 + }, + { + "ETA": 4.95, + "epoch": 0.1698022190062711, + "fp16_scale": 1.0, + "global_step": 528, + "grad_norm": 2.031934706209426, + "learning_rate": 1.8994781141408919e-06, + "loss": 0.4477, + "step": 528 + }, + { + "ETA": 4.95, + "epoch": 0.1701238141180254, + "fp16_scale": 1.0, + "global_step": 529, + "grad_norm": 2.03686582745526, + "learning_rate": 1.8990223129594145e-06, + "loss": 0.4597, + "step": 529 + }, + { + "ETA": 4.95, + "epoch": 0.17044540922977972, + "fp16_scale": 1.0, + "global_step": 530, + "grad_norm": 1.8402927637301583, + "learning_rate": 1.8985655356751703e-06, + "loss": 0.4299, + "step": 530 + }, + { + "ETA": 4.95, + "epoch": 0.170767004341534, + "fp16_scale": 1.0, + "global_step": 531, + "grad_norm": 2.1026640221571515, + "learning_rate": 1.8981077827841e-06, + "loss": 0.4799, + "step": 531 + }, + { + "ETA": 4.95, + "epoch": 0.1710885994532883, + "fp16_scale": 1.0, + "global_step": 532, + "grad_norm": 1.919984508497446, + "learning_rate": 1.8976490547832032e-06, + "loss": 0.4408, + "step": 532 + }, + { + "ETA": 4.95, + "epoch": 0.1714101945650426, + "fp16_scale": 1.0, + "global_step": 533, + "grad_norm": 2.0140479254351025, + "learning_rate": 1.897189352170538e-06, + "loss": 0.3843, + "step": 533 + }, + { + "ETA": 4.94, + "epoch": 0.17173178967679692, + "fp16_scale": 1.0, + "global_step": 534, + "grad_norm": 2.223144969945552, + "learning_rate": 1.8967286754452212e-06, + "loss": 0.4629, + "step": 534 + }, + { + "ETA": 4.94, + "epoch": 0.1720533847885512, + "fp16_scale": 1.0, + "global_step": 535, + "grad_norm": 2.2632666917839246, + "learning_rate": 1.8962670251074274e-06, + "loss": 0.4977, + "step": 535 + }, + { + "ETA": 4.94, + "epoch": 0.1723749799003055, + "fp16_scale": 1.0, + "global_step": 536, + "grad_norm": 2.2717337535407816, + "learning_rate": 1.8958044016583874e-06, + "loss": 0.4284, + "step": 536 + }, + { + "ETA": 4.94, + "epoch": 0.17269657501205982, + "fp16_scale": 1.0, + "global_step": 537, + "grad_norm": 2.13873199798984, + "learning_rate": 1.8953408056003896e-06, + "loss": 0.415, + "step": 537 + }, + { + "ETA": 4.93, + "epoch": 0.17301817012381412, + "fp16_scale": 1.0, + "global_step": 538, + "grad_norm": 1.9101124996443284, + "learning_rate": 1.8948762374367778e-06, + "loss": 0.4479, + "step": 538 + }, + { + "ETA": 4.93, + "epoch": 0.17333976523556843, + "fp16_scale": 1.0, + "global_step": 539, + "grad_norm": 2.1281584409565477, + "learning_rate": 1.8944106976719512e-06, + "loss": 0.4806, + "step": 539 + }, + { + "ETA": 4.93, + "epoch": 0.1736613603473227, + "fp16_scale": 1.0, + "global_step": 540, + "grad_norm": 2.0622123122804354, + "learning_rate": 1.893944186811364e-06, + "loss": 0.4699, + "step": 540 + }, + { + "ETA": 4.93, + "epoch": 0.17398295545907702, + "fp16_scale": 1.0, + "global_step": 541, + "grad_norm": 2.041120255261446, + "learning_rate": 1.8934767053615247e-06, + "loss": 0.4499, + "step": 541 + }, + { + "ETA": 4.93, + "epoch": 0.17430455057083133, + "fp16_scale": 1.0, + "global_step": 542, + "grad_norm": 2.0870200132391044, + "learning_rate": 1.8930082538299965e-06, + "loss": 0.4764, + "step": 542 + }, + { + "ETA": 4.93, + "epoch": 0.17462614568258564, + "fp16_scale": 1.0, + "global_step": 543, + "grad_norm": 2.41639846747191, + "learning_rate": 1.892538832725394e-06, + "loss": 0.3978, + "step": 543 + }, + { + "ETA": 4.93, + "epoch": 0.17494774079433992, + "fp16_scale": 1.0, + "global_step": 544, + "grad_norm": 2.035605720561539, + "learning_rate": 1.8920684425573864e-06, + "loss": 0.4566, + "step": 544 + }, + { + "ETA": 4.92, + "epoch": 0.17526933590609423, + "fp16_scale": 1.0, + "global_step": 545, + "grad_norm": 1.83310911668132, + "learning_rate": 1.891597083836694e-06, + "loss": 0.5362, + "step": 545 + }, + { + "ETA": 4.92, + "epoch": 0.17559093101784853, + "fp16_scale": 1.0, + "global_step": 546, + "grad_norm": 1.896095684151234, + "learning_rate": 1.8911247570750883e-06, + "loss": 0.4819, + "step": 546 + }, + { + "ETA": 4.92, + "epoch": 0.17591252612960284, + "fp16_scale": 1.0, + "global_step": 547, + "grad_norm": 1.8853474029996398, + "learning_rate": 1.8906514627853934e-06, + "loss": 0.4873, + "step": 547 + }, + { + "ETA": 4.92, + "epoch": 0.17623412124135712, + "fp16_scale": 1.0, + "global_step": 548, + "grad_norm": 1.9571717149678636, + "learning_rate": 1.8901772014814822e-06, + "loss": 0.5372, + "step": 548 + }, + { + "ETA": 4.92, + "epoch": 0.17655571635311143, + "fp16_scale": 1.0, + "global_step": 549, + "grad_norm": 1.8769314230870588, + "learning_rate": 1.889701973678279e-06, + "loss": 0.4166, + "step": 549 + }, + { + "ETA": 4.92, + "epoch": 0.17687731146486574, + "fp16_scale": 1.0, + "global_step": 550, + "grad_norm": 1.926745259518635, + "learning_rate": 1.8892257798917557e-06, + "loss": 0.4563, + "step": 550 + }, + { + "ETA": 4.92, + "epoch": 0.17719890657662005, + "fp16_scale": 1.0, + "global_step": 551, + "grad_norm": 2.02103573757482, + "learning_rate": 1.888748620638935e-06, + "loss": 0.4151, + "step": 551 + }, + { + "ETA": 4.91, + "epoch": 0.17752050168837433, + "fp16_scale": 1.0, + "global_step": 552, + "grad_norm": 1.885756779896396, + "learning_rate": 1.8882704964378867e-06, + "loss": 0.4884, + "step": 552 + }, + { + "ETA": 4.91, + "epoch": 0.17784209680012864, + "fp16_scale": 1.0, + "global_step": 553, + "grad_norm": 2.0146509941279396, + "learning_rate": 1.8877914078077279e-06, + "loss": 0.3924, + "step": 553 + }, + { + "ETA": 4.91, + "epoch": 0.17816369191188294, + "fp16_scale": 1.0, + "global_step": 554, + "grad_norm": 2.4579834422780604, + "learning_rate": 1.8873113552686237e-06, + "loss": 0.4015, + "step": 554 + }, + { + "ETA": 4.9, + "epoch": 0.17848528702363725, + "fp16_scale": 1.0, + "global_step": 555, + "grad_norm": 1.934591746997279, + "learning_rate": 1.8868303393417856e-06, + "loss": 0.4506, + "step": 555 + }, + { + "ETA": 4.9, + "epoch": 0.17880688213539153, + "fp16_scale": 1.0, + "global_step": 556, + "grad_norm": 1.7235134069580018, + "learning_rate": 1.8863483605494706e-06, + "loss": 0.4576, + "step": 556 + }, + { + "ETA": 4.9, + "epoch": 0.17912847724714584, + "fp16_scale": 1.0, + "global_step": 557, + "grad_norm": 2.0894967580717423, + "learning_rate": 1.8858654194149816e-06, + "loss": 0.5248, + "step": 557 + }, + { + "ETA": 4.9, + "epoch": 0.17945007235890015, + "fp16_scale": 1.0, + "global_step": 558, + "grad_norm": 2.097258628227925, + "learning_rate": 1.8853815164626666e-06, + "loss": 0.4648, + "step": 558 + }, + { + "ETA": 4.9, + "epoch": 0.17977166747065446, + "fp16_scale": 1.0, + "global_step": 559, + "grad_norm": 1.8403413298853255, + "learning_rate": 1.8848966522179167e-06, + "loss": 0.4175, + "step": 559 + }, + { + "ETA": 4.89, + "epoch": 0.18009326258240874, + "fp16_scale": 1.0, + "global_step": 560, + "grad_norm": 2.0927307686172423, + "learning_rate": 1.8844108272071679e-06, + "loss": 0.3805, + "step": 560 + }, + { + "ETA": 4.89, + "epoch": 0.18041485769416304, + "fp16_scale": 1.0, + "global_step": 561, + "grad_norm": 2.131386571404521, + "learning_rate": 1.8839240419578987e-06, + "loss": 0.3927, + "step": 561 + }, + { + "ETA": 4.89, + "epoch": 0.18073645280591735, + "fp16_scale": 1.0, + "global_step": 562, + "grad_norm": 1.826343865639814, + "learning_rate": 1.8834362969986307e-06, + "loss": 0.4488, + "step": 562 + }, + { + "ETA": 4.89, + "epoch": 0.18105804791767166, + "fp16_scale": 1.0, + "global_step": 563, + "grad_norm": 1.948149057105855, + "learning_rate": 1.8829475928589268e-06, + "loss": 0.5413, + "step": 563 + }, + { + "ETA": 4.89, + "epoch": 0.18137964302942594, + "fp16_scale": 1.0, + "global_step": 564, + "grad_norm": 1.968769807335462, + "learning_rate": 1.8824579300693922e-06, + "loss": 0.5736, + "step": 564 + }, + { + "ETA": 4.89, + "epoch": 0.18170123814118025, + "fp16_scale": 1.0, + "global_step": 565, + "grad_norm": 1.8844946208572746, + "learning_rate": 1.881967309161672e-06, + "loss": 0.4289, + "step": 565 + }, + { + "ETA": 4.88, + "epoch": 0.18202283325293456, + "fp16_scale": 1.0, + "global_step": 566, + "grad_norm": 1.903822174394183, + "learning_rate": 1.881475730668452e-06, + "loss": 0.4828, + "step": 566 + }, + { + "ETA": 4.88, + "epoch": 0.18234442836468887, + "fp16_scale": 1.0, + "global_step": 567, + "grad_norm": 2.2108039620773954, + "learning_rate": 1.880983195123458e-06, + "loss": 0.5257, + "step": 567 + }, + { + "ETA": 4.88, + "epoch": 0.18266602347644315, + "fp16_scale": 1.0, + "global_step": 568, + "grad_norm": 2.099351752581281, + "learning_rate": 1.8804897030614546e-06, + "loss": 0.4821, + "step": 568 + }, + { + "ETA": 4.88, + "epoch": 0.18298761858819745, + "fp16_scale": 1.0, + "global_step": 569, + "grad_norm": 1.9382447625923098, + "learning_rate": 1.8799952550182445e-06, + "loss": 0.4487, + "step": 569 + }, + { + "ETA": 4.88, + "epoch": 0.18330921369995176, + "fp16_scale": 1.0, + "global_step": 570, + "grad_norm": 2.2681362181489053, + "learning_rate": 1.879499851530669e-06, + "loss": 0.6149, + "step": 570 + }, + { + "ETA": 4.88, + "epoch": 0.18363080881170607, + "fp16_scale": 1.0, + "global_step": 571, + "grad_norm": 1.940133081860052, + "learning_rate": 1.879003493136607e-06, + "loss": 0.4497, + "step": 571 + }, + { + "ETA": 4.88, + "epoch": 0.18395240392346035, + "fp16_scale": 1.0, + "global_step": 572, + "grad_norm": 2.0599883726403987, + "learning_rate": 1.8785061803749729e-06, + "loss": 0.4778, + "step": 572 + }, + { + "ETA": 4.88, + "epoch": 0.18427399903521466, + "fp16_scale": 1.0, + "global_step": 573, + "grad_norm": 2.110329637174342, + "learning_rate": 1.8780079137857188e-06, + "loss": 0.4481, + "step": 573 + }, + { + "ETA": 4.88, + "epoch": 0.18459559414696897, + "fp16_scale": 1.0, + "global_step": 574, + "grad_norm": 1.919884947344211, + "learning_rate": 1.877508693909831e-06, + "loss": 0.492, + "step": 574 + }, + { + "ETA": 4.87, + "epoch": 0.18491718925872327, + "fp16_scale": 1.0, + "global_step": 575, + "grad_norm": 1.8663129640547262, + "learning_rate": 1.8770085212893326e-06, + "loss": 0.3873, + "step": 575 + }, + { + "ETA": 4.87, + "epoch": 0.18523878437047755, + "fp16_scale": 1.0, + "global_step": 576, + "grad_norm": 2.1952302394856975, + "learning_rate": 1.876507396467279e-06, + "loss": 0.3941, + "step": 576 + }, + { + "ETA": 4.87, + "epoch": 0.18556037948223186, + "fp16_scale": 1.0, + "global_step": 577, + "grad_norm": 2.0533985048962893, + "learning_rate": 1.8760053199877607e-06, + "loss": 0.3433, + "step": 577 + }, + { + "ETA": 4.86, + "epoch": 0.18588197459398617, + "fp16_scale": 1.0, + "global_step": 578, + "grad_norm": 2.0182071082471293, + "learning_rate": 1.8755022923959016e-06, + "loss": 0.474, + "step": 578 + }, + { + "ETA": 4.86, + "epoch": 0.18620356970574048, + "fp16_scale": 1.0, + "global_step": 579, + "grad_norm": 2.002732610511696, + "learning_rate": 1.874998314237858e-06, + "loss": 0.469, + "step": 579 + }, + { + "ETA": 4.86, + "epoch": 0.1865251648174948, + "fp16_scale": 1.0, + "global_step": 580, + "grad_norm": 2.039418867547622, + "learning_rate": 1.8744933860608182e-06, + "loss": 0.4198, + "step": 580 + }, + { + "ETA": 4.86, + "epoch": 0.18684675992924907, + "fp16_scale": 1.0, + "global_step": 581, + "grad_norm": 1.8329540055828422, + "learning_rate": 1.873987508413001e-06, + "loss": 0.4763, + "step": 581 + }, + { + "ETA": 4.85, + "epoch": 0.18716835504100338, + "fp16_scale": 1.0, + "global_step": 582, + "grad_norm": 1.8742753050198495, + "learning_rate": 1.8734806818436582e-06, + "loss": 0.4124, + "step": 582 + }, + { + "ETA": 4.85, + "epoch": 0.18748995015275768, + "fp16_scale": 1.0, + "global_step": 583, + "grad_norm": 1.9397192271287038, + "learning_rate": 1.8729729069030702e-06, + "loss": 0.505, + "step": 583 + }, + { + "ETA": 4.85, + "epoch": 0.187811545264512, + "fp16_scale": 1.0, + "global_step": 584, + "grad_norm": 2.108304627582822, + "learning_rate": 1.8724641841425478e-06, + "loss": 0.4052, + "step": 584 + }, + { + "ETA": 4.85, + "epoch": 0.18813314037626627, + "fp16_scale": 1.0, + "global_step": 585, + "grad_norm": 1.9067581604657606, + "learning_rate": 1.8719545141144305e-06, + "loss": 0.4544, + "step": 585 + }, + { + "ETA": 4.84, + "epoch": 0.18845473548802058, + "fp16_scale": 1.0, + "global_step": 586, + "grad_norm": 1.892836923763899, + "learning_rate": 1.8714438973720866e-06, + "loss": 0.3964, + "step": 586 + }, + { + "ETA": 4.84, + "epoch": 0.1887763305997749, + "fp16_scale": 1.0, + "global_step": 587, + "grad_norm": 1.984445559124549, + "learning_rate": 1.8709323344699116e-06, + "loss": 0.428, + "step": 587 + }, + { + "ETA": 4.84, + "epoch": 0.1890979257115292, + "fp16_scale": 1.0, + "global_step": 588, + "grad_norm": 2.2199260725468366, + "learning_rate": 1.8704198259633297e-06, + "loss": 0.396, + "step": 588 + }, + { + "ETA": 4.83, + "epoch": 0.18941952082328348, + "fp16_scale": 1.0, + "global_step": 589, + "grad_norm": 1.772486612676109, + "learning_rate": 1.8699063724087903e-06, + "loss": 0.3727, + "step": 589 + }, + { + "ETA": 4.83, + "epoch": 0.18974111593503779, + "fp16_scale": 1.0, + "global_step": 590, + "grad_norm": 1.926100516202449, + "learning_rate": 1.8693919743637697e-06, + "loss": 0.3861, + "step": 590 + }, + { + "ETA": 4.83, + "epoch": 0.1900627110467921, + "fp16_scale": 1.0, + "global_step": 591, + "grad_norm": 2.3916796754464613, + "learning_rate": 1.8688766323867694e-06, + "loss": 0.366, + "step": 591 + }, + { + "ETA": 4.83, + "epoch": 0.1903843061585464, + "fp16_scale": 1.0, + "global_step": 592, + "grad_norm": 1.9010824784128724, + "learning_rate": 1.8683603470373156e-06, + "loss": 0.5022, + "step": 592 + }, + { + "ETA": 4.83, + "epoch": 0.19070590127030068, + "fp16_scale": 1.0, + "global_step": 593, + "grad_norm": 2.0597054389558402, + "learning_rate": 1.8678431188759594e-06, + "loss": 0.4443, + "step": 593 + }, + { + "ETA": 4.82, + "epoch": 0.191027496382055, + "fp16_scale": 1.0, + "global_step": 594, + "grad_norm": 1.7884018761750782, + "learning_rate": 1.867324948464275e-06, + "loss": 0.4631, + "step": 594 + }, + { + "ETA": 4.82, + "epoch": 0.1913490914938093, + "fp16_scale": 1.0, + "global_step": 595, + "grad_norm": 1.8868484963061303, + "learning_rate": 1.8668058363648595e-06, + "loss": 0.4437, + "step": 595 + }, + { + "ETA": 4.82, + "epoch": 0.1916706866055636, + "fp16_scale": 1.0, + "global_step": 596, + "grad_norm": 2.095772690216961, + "learning_rate": 1.866285783141333e-06, + "loss": 0.4201, + "step": 596 + }, + { + "ETA": 4.82, + "epoch": 0.1919922817173179, + "fp16_scale": 1.0, + "global_step": 597, + "grad_norm": 1.8552325554382783, + "learning_rate": 1.8657647893583367e-06, + "loss": 0.4808, + "step": 597 + }, + { + "ETA": 4.82, + "epoch": 0.1923138768290722, + "fp16_scale": 1.0, + "global_step": 598, + "grad_norm": 1.9484783011443043, + "learning_rate": 1.8652428555815337e-06, + "loss": 0.3913, + "step": 598 + }, + { + "ETA": 4.82, + "epoch": 0.1926354719408265, + "fp16_scale": 1.0, + "global_step": 599, + "grad_norm": 2.058521572514809, + "learning_rate": 1.8647199823776075e-06, + "loss": 0.4762, + "step": 599 + }, + { + "ETA": 4.82, + "epoch": 0.1929570670525808, + "fp16_scale": 1.0, + "global_step": 600, + "grad_norm": 2.1264069711297107, + "learning_rate": 1.864196170314261e-06, + "loss": 0.4808, + "step": 600 + }, + { + "ETA": 4.85, + "epoch": 0.1932786621643351, + "fp16_scale": 1.0, + "global_step": 601, + "grad_norm": 1.732500748250544, + "learning_rate": 1.8636714199602173e-06, + "loss": 0.4653, + "step": 601 + }, + { + "ETA": 4.84, + "epoch": 0.1936002572760894, + "fp16_scale": 1.0, + "global_step": 602, + "grad_norm": 2.1013280544007005, + "learning_rate": 1.8631457318852176e-06, + "loss": 0.3684, + "step": 602 + }, + { + "ETA": 4.84, + "epoch": 0.1939218523878437, + "fp16_scale": 1.0, + "global_step": 603, + "grad_norm": 1.9749589587766276, + "learning_rate": 1.8626191066600216e-06, + "loss": 0.4051, + "step": 603 + }, + { + "ETA": 4.84, + "epoch": 0.19424344749959802, + "fp16_scale": 1.0, + "global_step": 604, + "grad_norm": 2.175927882974133, + "learning_rate": 1.8620915448564066e-06, + "loss": 0.416, + "step": 604 + }, + { + "ETA": 4.84, + "epoch": 0.1945650426113523, + "fp16_scale": 1.0, + "global_step": 605, + "grad_norm": 2.031438401343737, + "learning_rate": 1.861563047047166e-06, + "loss": 0.4195, + "step": 605 + }, + { + "ETA": 4.83, + "epoch": 0.1948866377231066, + "fp16_scale": 1.0, + "global_step": 606, + "grad_norm": 2.183574618551414, + "learning_rate": 1.8610336138061102e-06, + "loss": 0.4374, + "step": 606 + }, + { + "ETA": 4.83, + "epoch": 0.1952082328348609, + "fp16_scale": 1.0, + "global_step": 607, + "grad_norm": 2.2343993713422217, + "learning_rate": 1.8605032457080652e-06, + "loss": 0.4276, + "step": 607 + }, + { + "ETA": 4.83, + "epoch": 0.19552982794661522, + "fp16_scale": 1.0, + "global_step": 608, + "grad_norm": 1.6430057100439763, + "learning_rate": 1.859971943328872e-06, + "loss": 0.3886, + "step": 608 + }, + { + "ETA": 4.83, + "epoch": 0.1958514230583695, + "fp16_scale": 1.0, + "global_step": 609, + "grad_norm": 1.9117037960619865, + "learning_rate": 1.8594397072453853e-06, + "loss": 0.4074, + "step": 609 + }, + { + "ETA": 4.82, + "epoch": 0.1961730181701238, + "fp16_scale": 1.0, + "global_step": 610, + "grad_norm": 2.421864106905433, + "learning_rate": 1.8589065380354745e-06, + "loss": 0.4771, + "step": 610 + }, + { + "ETA": 4.82, + "epoch": 0.19649461328187812, + "fp16_scale": 1.0, + "global_step": 611, + "grad_norm": 2.4002527375885623, + "learning_rate": 1.8583724362780212e-06, + "loss": 0.3781, + "step": 611 + }, + { + "ETA": 4.82, + "epoch": 0.19681620839363242, + "fp16_scale": 1.0, + "global_step": 612, + "grad_norm": 1.9699545619938954, + "learning_rate": 1.857837402552921e-06, + "loss": 0.4131, + "step": 612 + }, + { + "ETA": 4.81, + "epoch": 0.1971378035053867, + "fp16_scale": 1.0, + "global_step": 613, + "grad_norm": 2.2752226907372233, + "learning_rate": 1.8573014374410795e-06, + "loss": 0.5096, + "step": 613 + }, + { + "ETA": 4.81, + "epoch": 0.197459398617141, + "fp16_scale": 1.0, + "global_step": 614, + "grad_norm": 1.8314450152414132, + "learning_rate": 1.8567645415244148e-06, + "loss": 0.3465, + "step": 614 + }, + { + "ETA": 4.81, + "epoch": 0.19778099372889532, + "fp16_scale": 1.0, + "global_step": 615, + "grad_norm": 1.8977891548584482, + "learning_rate": 1.8562267153858552e-06, + "loss": 0.5017, + "step": 615 + }, + { + "ETA": 4.8, + "epoch": 0.19810258884064963, + "fp16_scale": 1.0, + "global_step": 616, + "grad_norm": 2.0420930643646806, + "learning_rate": 1.8556879596093391e-06, + "loss": 0.4094, + "step": 616 + }, + { + "ETA": 4.8, + "epoch": 0.1984241839524039, + "fp16_scale": 1.0, + "global_step": 617, + "grad_norm": 2.0549159698304513, + "learning_rate": 1.8551482747798141e-06, + "loss": 0.4693, + "step": 617 + }, + { + "ETA": 4.8, + "epoch": 0.19874577906415822, + "fp16_scale": 1.0, + "global_step": 618, + "grad_norm": 1.879700336166331, + "learning_rate": 1.8546076614832365e-06, + "loss": 0.4828, + "step": 618 + }, + { + "ETA": 4.8, + "epoch": 0.19906737417591253, + "fp16_scale": 1.0, + "global_step": 619, + "grad_norm": 1.8261289028259504, + "learning_rate": 1.8540661203065707e-06, + "loss": 0.4695, + "step": 619 + }, + { + "ETA": 4.8, + "epoch": 0.19938896928766683, + "fp16_scale": 1.0, + "global_step": 620, + "grad_norm": 1.8866623882949907, + "learning_rate": 1.853523651837789e-06, + "loss": 0.4323, + "step": 620 + }, + { + "ETA": 4.79, + "epoch": 0.19971056439942114, + "fp16_scale": 1.0, + "global_step": 621, + "grad_norm": 2.1287253170862184, + "learning_rate": 1.8529802566658692e-06, + "loss": 0.4183, + "step": 621 + }, + { + "ETA": 4.79, + "epoch": 0.20003215951117542, + "fp16_scale": 1.0, + "global_step": 622, + "grad_norm": 1.8618208672413505, + "learning_rate": 1.852435935380797e-06, + "loss": 0.4658, + "step": 622 + }, + { + "ETA": 4.78, + "epoch": 0.20035375462292973, + "fp16_scale": 1.0, + "global_step": 623, + "grad_norm": 2.254753764323054, + "learning_rate": 1.8518906885735625e-06, + "loss": 0.4352, + "step": 623 + }, + { + "ETA": 4.78, + "epoch": 0.20067534973468404, + "fp16_scale": 1.0, + "global_step": 624, + "grad_norm": 2.010192488268173, + "learning_rate": 1.851344516836161e-06, + "loss": 0.4339, + "step": 624 + }, + { + "ETA": 4.78, + "epoch": 0.20099694484643835, + "fp16_scale": 1.0, + "global_step": 625, + "grad_norm": 1.9849676704988954, + "learning_rate": 1.8507974207615916e-06, + "loss": 0.465, + "step": 625 + }, + { + "ETA": 4.78, + "epoch": 0.20131853995819263, + "fp16_scale": 1.0, + "global_step": 626, + "grad_norm": 1.8809543144615208, + "learning_rate": 1.8502494009438576e-06, + "loss": 0.5547, + "step": 626 + }, + { + "ETA": 4.77, + "epoch": 0.20164013506994694, + "fp16_scale": 1.0, + "global_step": 627, + "grad_norm": 2.1958689145152044, + "learning_rate": 1.849700457977965e-06, + "loss": 0.418, + "step": 627 + }, + { + "ETA": 4.77, + "epoch": 0.20196173018170124, + "fp16_scale": 1.0, + "global_step": 628, + "grad_norm": 2.1420083149279634, + "learning_rate": 1.8491505924599223e-06, + "loss": 0.4698, + "step": 628 + }, + { + "ETA": 4.77, + "epoch": 0.20228332529345555, + "fp16_scale": 1.0, + "global_step": 629, + "grad_norm": 2.182804317877393, + "learning_rate": 1.8485998049867386e-06, + "loss": 0.3843, + "step": 629 + }, + { + "ETA": 4.76, + "epoch": 0.20260492040520983, + "fp16_scale": 1.0, + "global_step": 630, + "grad_norm": 2.1621189673234813, + "learning_rate": 1.8480480961564257e-06, + "loss": 0.4753, + "step": 630 + }, + { + "ETA": 4.76, + "epoch": 0.20292651551696414, + "fp16_scale": 1.0, + "global_step": 631, + "grad_norm": 2.3352945651312056, + "learning_rate": 1.8474954665679948e-06, + "loss": 0.5026, + "step": 631 + }, + { + "ETA": 4.76, + "epoch": 0.20324811062871845, + "fp16_scale": 1.0, + "global_step": 632, + "grad_norm": 1.8208689508829023, + "learning_rate": 1.8469419168214566e-06, + "loss": 0.4212, + "step": 632 + }, + { + "ETA": 4.76, + "epoch": 0.20356970574047276, + "fp16_scale": 1.0, + "global_step": 633, + "grad_norm": 1.9206588859967546, + "learning_rate": 1.8463874475178214e-06, + "loss": 0.4007, + "step": 633 + }, + { + "ETA": 4.76, + "epoch": 0.20389130085222704, + "fp16_scale": 1.0, + "global_step": 634, + "grad_norm": 1.9366640673580708, + "learning_rate": 1.8458320592590973e-06, + "loss": 0.4894, + "step": 634 + }, + { + "ETA": 4.76, + "epoch": 0.20421289596398134, + "fp16_scale": 1.0, + "global_step": 635, + "grad_norm": 1.9962884829129268, + "learning_rate": 1.8452757526482907e-06, + "loss": 0.4686, + "step": 635 + }, + { + "ETA": 4.76, + "epoch": 0.20453449107573565, + "fp16_scale": 1.0, + "global_step": 636, + "grad_norm": 1.8664658036605462, + "learning_rate": 1.8447185282894049e-06, + "loss": 0.4387, + "step": 636 + }, + { + "ETA": 4.75, + "epoch": 0.20485608618748996, + "fp16_scale": 1.0, + "global_step": 637, + "grad_norm": 1.7841863488835676, + "learning_rate": 1.8441603867874397e-06, + "loss": 0.4318, + "step": 637 + }, + { + "ETA": 4.75, + "epoch": 0.20517768129924424, + "fp16_scale": 1.0, + "global_step": 638, + "grad_norm": 2.0306722810093936, + "learning_rate": 1.8436013287483902e-06, + "loss": 0.4738, + "step": 638 + }, + { + "ETA": 4.75, + "epoch": 0.20549927641099855, + "fp16_scale": 1.0, + "global_step": 639, + "grad_norm": 2.137654186818178, + "learning_rate": 1.843041354779247e-06, + "loss": 0.3841, + "step": 639 + }, + { + "ETA": 4.74, + "epoch": 0.20582087152275286, + "fp16_scale": 1.0, + "global_step": 640, + "grad_norm": 2.0265562481011754, + "learning_rate": 1.8424804654879961e-06, + "loss": 0.4185, + "step": 640 + }, + { + "ETA": 4.74, + "epoch": 0.20614246663450717, + "fp16_scale": 1.0, + "global_step": 641, + "grad_norm": 1.8130403454687563, + "learning_rate": 1.8419186614836153e-06, + "loss": 0.3894, + "step": 641 + }, + { + "ETA": 4.74, + "epoch": 0.20646406174626145, + "fp16_scale": 1.0, + "global_step": 642, + "grad_norm": 2.009846041145111, + "learning_rate": 1.841355943376077e-06, + "loss": 0.4266, + "step": 642 + }, + { + "ETA": 4.74, + "epoch": 0.20678565685801575, + "fp16_scale": 1.0, + "global_step": 643, + "grad_norm": 2.006466947197548, + "learning_rate": 1.840792311776346e-06, + "loss": 0.3877, + "step": 643 + }, + { + "ETA": 4.74, + "epoch": 0.20710725196977006, + "fp16_scale": 1.0, + "global_step": 644, + "grad_norm": 1.9422981714149796, + "learning_rate": 1.840227767296378e-06, + "loss": 0.4504, + "step": 644 + }, + { + "ETA": 4.73, + "epoch": 0.20742884708152437, + "fp16_scale": 1.0, + "global_step": 645, + "grad_norm": 1.9018256870200687, + "learning_rate": 1.8396623105491207e-06, + "loss": 0.4425, + "step": 645 + }, + { + "ETA": 4.73, + "epoch": 0.20775044219327865, + "fp16_scale": 1.0, + "global_step": 646, + "grad_norm": 2.027288659656249, + "learning_rate": 1.839095942148512e-06, + "loss": 0.3987, + "step": 646 + }, + { + "ETA": 4.73, + "epoch": 0.20807203730503296, + "fp16_scale": 1.0, + "global_step": 647, + "grad_norm": 1.872799097561611, + "learning_rate": 1.8385286627094797e-06, + "loss": 0.4947, + "step": 647 + }, + { + "ETA": 4.72, + "epoch": 0.20839363241678727, + "fp16_scale": 1.0, + "global_step": 648, + "grad_norm": 2.111928113572892, + "learning_rate": 1.8379604728479407e-06, + "loss": 0.4198, + "step": 648 + }, + { + "ETA": 4.72, + "epoch": 0.20871522752854157, + "fp16_scale": 1.0, + "global_step": 649, + "grad_norm": 1.9399081155702231, + "learning_rate": 1.8373913731808006e-06, + "loss": 0.4158, + "step": 649 + }, + { + "ETA": 4.71, + "epoch": 0.20903682264029586, + "fp16_scale": 1.0, + "global_step": 650, + "grad_norm": 2.196310553347871, + "learning_rate": 1.836821364325952e-06, + "loss": 0.3836, + "step": 650 + }, + { + "ETA": 4.71, + "epoch": 0.20935841775205016, + "fp16_scale": 1.0, + "global_step": 651, + "grad_norm": 2.119228948550605, + "learning_rate": 1.8362504469022757e-06, + "loss": 0.427, + "step": 651 + }, + { + "ETA": 4.71, + "epoch": 0.20968001286380447, + "fp16_scale": 1.0, + "global_step": 652, + "grad_norm": 2.0040639594431044, + "learning_rate": 1.8356786215296384e-06, + "loss": 0.4517, + "step": 652 + }, + { + "ETA": 4.71, + "epoch": 0.21000160797555878, + "fp16_scale": 1.0, + "global_step": 653, + "grad_norm": 2.072695395984966, + "learning_rate": 1.8351058888288927e-06, + "loss": 0.4989, + "step": 653 + }, + { + "ETA": 4.71, + "epoch": 0.21032320308731306, + "fp16_scale": 1.0, + "global_step": 654, + "grad_norm": 2.162234855537016, + "learning_rate": 1.8345322494218762e-06, + "loss": 0.5036, + "step": 654 + }, + { + "ETA": 4.7, + "epoch": 0.21064479819906737, + "fp16_scale": 1.0, + "global_step": 655, + "grad_norm": 2.2823715720576274, + "learning_rate": 1.8339577039314111e-06, + "loss": 0.3904, + "step": 655 + }, + { + "ETA": 4.7, + "epoch": 0.21096639331082168, + "fp16_scale": 1.0, + "global_step": 656, + "grad_norm": 2.3729771485791655, + "learning_rate": 1.8333822529813032e-06, + "loss": 0.464, + "step": 656 + }, + { + "ETA": 4.7, + "epoch": 0.21128798842257598, + "fp16_scale": 1.0, + "global_step": 657, + "grad_norm": 2.155816059326467, + "learning_rate": 1.8328058971963416e-06, + "loss": 0.4146, + "step": 657 + }, + { + "ETA": 4.7, + "epoch": 0.21160958353433026, + "fp16_scale": 1.0, + "global_step": 658, + "grad_norm": 1.9841320210831421, + "learning_rate": 1.8322286372022982e-06, + "loss": 0.4693, + "step": 658 + }, + { + "ETA": 4.69, + "epoch": 0.21193117864608457, + "fp16_scale": 1.0, + "global_step": 659, + "grad_norm": 2.1265935844277317, + "learning_rate": 1.8316504736259254e-06, + "loss": 0.5274, + "step": 659 + }, + { + "ETA": 4.69, + "epoch": 0.21225277375783888, + "fp16_scale": 1.0, + "global_step": 660, + "grad_norm": 2.084041465792925, + "learning_rate": 1.8310714070949578e-06, + "loss": 0.4314, + "step": 660 + }, + { + "ETA": 4.69, + "epoch": 0.2125743688695932, + "fp16_scale": 1.0, + "global_step": 661, + "grad_norm": 2.490711656110001, + "learning_rate": 1.8304914382381098e-06, + "loss": 0.4212, + "step": 661 + }, + { + "ETA": 4.69, + "epoch": 0.2128959639813475, + "fp16_scale": 1.0, + "global_step": 662, + "grad_norm": 2.170150924977779, + "learning_rate": 1.8299105676850757e-06, + "loss": 0.4744, + "step": 662 + }, + { + "ETA": 4.69, + "epoch": 0.21321755909310178, + "fp16_scale": 1.0, + "global_step": 663, + "grad_norm": 1.9143635362599047, + "learning_rate": 1.8293287960665294e-06, + "loss": 0.3939, + "step": 663 + }, + { + "ETA": 4.69, + "epoch": 0.21353915420485609, + "fp16_scale": 1.0, + "global_step": 664, + "grad_norm": 1.9950231918545598, + "learning_rate": 1.8287461240141215e-06, + "loss": 0.5137, + "step": 664 + }, + { + "ETA": 4.68, + "epoch": 0.2138607493166104, + "fp16_scale": 1.0, + "global_step": 665, + "grad_norm": 1.9897696769840771, + "learning_rate": 1.828162552160482e-06, + "loss": 0.4226, + "step": 665 + }, + { + "ETA": 4.68, + "epoch": 0.2141823444283647, + "fp16_scale": 1.0, + "global_step": 666, + "grad_norm": 1.9982248356650532, + "learning_rate": 1.827578081139217e-06, + "loss": 0.4246, + "step": 666 + }, + { + "ETA": 4.67, + "epoch": 0.21450393954011898, + "fp16_scale": 1.0, + "global_step": 667, + "grad_norm": 2.267752882574821, + "learning_rate": 1.8269927115849084e-06, + "loss": 0.4222, + "step": 667 + }, + { + "ETA": 4.67, + "epoch": 0.2148255346518733, + "fp16_scale": 1.0, + "global_step": 668, + "grad_norm": 2.1412154715946783, + "learning_rate": 1.8264064441331147e-06, + "loss": 0.4113, + "step": 668 + }, + { + "ETA": 4.67, + "epoch": 0.2151471297636276, + "fp16_scale": 1.0, + "global_step": 669, + "grad_norm": 2.231901326579392, + "learning_rate": 1.8258192794203686e-06, + "loss": 0.5034, + "step": 669 + }, + { + "ETA": 4.66, + "epoch": 0.2154687248753819, + "fp16_scale": 1.0, + "global_step": 670, + "grad_norm": 2.065323137925775, + "learning_rate": 1.8252312180841776e-06, + "loss": 0.3939, + "step": 670 + }, + { + "ETA": 4.66, + "epoch": 0.2157903199871362, + "fp16_scale": 1.0, + "global_step": 671, + "grad_norm": 1.8352790986920346, + "learning_rate": 1.8246422607630217e-06, + "loss": 0.5134, + "step": 671 + }, + { + "ETA": 4.66, + "epoch": 0.2161119150988905, + "fp16_scale": 1.0, + "global_step": 672, + "grad_norm": 1.8676361680015254, + "learning_rate": 1.8240524080963548e-06, + "loss": 0.4833, + "step": 672 + }, + { + "ETA": 4.66, + "epoch": 0.2164335102106448, + "fp16_scale": 1.0, + "global_step": 673, + "grad_norm": 1.7359585046809156, + "learning_rate": 1.823461660724602e-06, + "loss": 0.454, + "step": 673 + }, + { + "ETA": 4.65, + "epoch": 0.2167551053223991, + "fp16_scale": 1.0, + "global_step": 674, + "grad_norm": 2.5898757152168645, + "learning_rate": 1.8228700192891605e-06, + "loss": 0.4056, + "step": 674 + }, + { + "ETA": 4.65, + "epoch": 0.2170767004341534, + "fp16_scale": 1.0, + "global_step": 675, + "grad_norm": 1.7924015547754597, + "learning_rate": 1.8222774844323978e-06, + "loss": 0.4143, + "step": 675 + }, + { + "ETA": 4.65, + "epoch": 0.2173982955459077, + "fp16_scale": 1.0, + "global_step": 676, + "grad_norm": 1.9921557615333092, + "learning_rate": 1.8216840567976516e-06, + "loss": 0.4321, + "step": 676 + }, + { + "ETA": 4.65, + "epoch": 0.217719890657662, + "fp16_scale": 1.0, + "global_step": 677, + "grad_norm": 2.031491140076144, + "learning_rate": 1.821089737029229e-06, + "loss": 0.4886, + "step": 677 + }, + { + "ETA": 4.64, + "epoch": 0.21804148576941632, + "fp16_scale": 1.0, + "global_step": 678, + "grad_norm": 2.066052422348665, + "learning_rate": 1.8204945257724057e-06, + "loss": 0.3586, + "step": 678 + }, + { + "ETA": 4.64, + "epoch": 0.2183630808811706, + "fp16_scale": 1.0, + "global_step": 679, + "grad_norm": 2.112887004982494, + "learning_rate": 1.8198984236734245e-06, + "loss": 0.3591, + "step": 679 + }, + { + "ETA": 4.64, + "epoch": 0.2186846759929249, + "fp16_scale": 1.0, + "global_step": 680, + "grad_norm": 1.8330286025539153, + "learning_rate": 1.819301431379497e-06, + "loss": 0.4298, + "step": 680 + }, + { + "ETA": 4.64, + "epoch": 0.2190062711046792, + "fp16_scale": 1.0, + "global_step": 681, + "grad_norm": 2.059414327645457, + "learning_rate": 1.8187035495388e-06, + "loss": 0.3943, + "step": 681 + }, + { + "ETA": 4.64, + "epoch": 0.21932786621643352, + "fp16_scale": 1.0, + "global_step": 682, + "grad_norm": 1.9002396763580478, + "learning_rate": 1.8181047788004768e-06, + "loss": 0.4064, + "step": 682 + }, + { + "ETA": 4.63, + "epoch": 0.2196494613281878, + "fp16_scale": 1.0, + "global_step": 683, + "grad_norm": 1.9827519447925184, + "learning_rate": 1.8175051198146356e-06, + "loss": 0.5063, + "step": 683 + }, + { + "ETA": 4.63, + "epoch": 0.2199710564399421, + "fp16_scale": 1.0, + "global_step": 684, + "grad_norm": 2.0601645455000637, + "learning_rate": 1.816904573232349e-06, + "loss": 0.445, + "step": 684 + }, + { + "ETA": 4.63, + "epoch": 0.22029265155169642, + "fp16_scale": 1.0, + "global_step": 685, + "grad_norm": 1.8331284872167692, + "learning_rate": 1.8163031397056531e-06, + "loss": 0.4083, + "step": 685 + }, + { + "ETA": 4.63, + "epoch": 0.22061424666345072, + "fp16_scale": 1.0, + "global_step": 686, + "grad_norm": 3.296288761928108, + "learning_rate": 1.8157008198875477e-06, + "loss": 0.415, + "step": 686 + }, + { + "ETA": 4.63, + "epoch": 0.220935841775205, + "fp16_scale": 1.0, + "global_step": 687, + "grad_norm": 2.1112734371036406, + "learning_rate": 1.8150976144319936e-06, + "loss": 0.4791, + "step": 687 + }, + { + "ETA": 4.63, + "epoch": 0.2212574368869593, + "fp16_scale": 1.0, + "global_step": 688, + "grad_norm": 1.8901633853998032, + "learning_rate": 1.8144935239939142e-06, + "loss": 0.4727, + "step": 688 + }, + { + "ETA": 4.63, + "epoch": 0.22157903199871362, + "fp16_scale": 1.0, + "global_step": 689, + "grad_norm": 1.879120205442335, + "learning_rate": 1.813888549229194e-06, + "loss": 0.4175, + "step": 689 + }, + { + "ETA": 4.63, + "epoch": 0.22190062711046793, + "fp16_scale": 1.0, + "global_step": 690, + "grad_norm": 2.1020368460332337, + "learning_rate": 1.8132826907946765e-06, + "loss": 0.3883, + "step": 690 + }, + { + "ETA": 4.62, + "epoch": 0.2222222222222222, + "fp16_scale": 1.0, + "global_step": 691, + "grad_norm": 2.067374511395301, + "learning_rate": 1.8126759493481658e-06, + "loss": 0.4429, + "step": 691 + }, + { + "ETA": 4.62, + "epoch": 0.22254381733397652, + "fp16_scale": 1.0, + "global_step": 692, + "grad_norm": 1.9464387876385234, + "learning_rate": 1.812068325548424e-06, + "loss": 0.4954, + "step": 692 + }, + { + "ETA": 4.62, + "epoch": 0.22286541244573083, + "fp16_scale": 1.0, + "global_step": 693, + "grad_norm": 1.9262835125630478, + "learning_rate": 1.811459820055171e-06, + "loss": 0.4688, + "step": 693 + }, + { + "ETA": 4.62, + "epoch": 0.22318700755748513, + "fp16_scale": 1.0, + "global_step": 694, + "grad_norm": 2.1918110355029756, + "learning_rate": 1.810850433529085e-06, + "loss": 0.5037, + "step": 694 + }, + { + "ETA": 4.62, + "epoch": 0.22350860266923941, + "fp16_scale": 1.0, + "global_step": 695, + "grad_norm": 1.832957030289425, + "learning_rate": 1.8102401666317996e-06, + "loss": 0.5234, + "step": 695 + }, + { + "ETA": 4.62, + "epoch": 0.22383019778099372, + "fp16_scale": 1.0, + "global_step": 696, + "grad_norm": 1.8098859921784485, + "learning_rate": 1.8096290200259055e-06, + "loss": 0.4518, + "step": 696 + }, + { + "ETA": 4.62, + "epoch": 0.22415179289274803, + "fp16_scale": 1.0, + "global_step": 697, + "grad_norm": 2.048553780993463, + "learning_rate": 1.8090169943749474e-06, + "loss": 0.4365, + "step": 697 + }, + { + "ETA": 4.61, + "epoch": 0.22447338800450234, + "fp16_scale": 1.0, + "global_step": 698, + "grad_norm": 2.200686749413936, + "learning_rate": 1.8084040903434248e-06, + "loss": 0.3814, + "step": 698 + }, + { + "ETA": 4.61, + "epoch": 0.22479498311625662, + "fp16_scale": 1.0, + "global_step": 699, + "grad_norm": 1.924796579595174, + "learning_rate": 1.807790308596791e-06, + "loss": 0.4456, + "step": 699 + }, + { + "ETA": 4.61, + "epoch": 0.22511657822801093, + "fp16_scale": 1.0, + "global_step": 700, + "grad_norm": 1.709388799581436, + "learning_rate": 1.8071756498014525e-06, + "loss": 0.4432, + "step": 700 + }, + { + "ETA": 4.61, + "epoch": 0.22543817333976524, + "fp16_scale": 1.0, + "global_step": 701, + "grad_norm": 1.9075606175094577, + "learning_rate": 1.8065601146247676e-06, + "loss": 0.4508, + "step": 701 + }, + { + "ETA": 4.61, + "epoch": 0.22575976845151954, + "fp16_scale": 1.0, + "global_step": 702, + "grad_norm": 2.017820857885172, + "learning_rate": 1.805943703735046e-06, + "loss": 0.4179, + "step": 702 + }, + { + "ETA": 4.6, + "epoch": 0.22608136356327385, + "fp16_scale": 1.0, + "global_step": 703, + "grad_norm": 1.9259765770629065, + "learning_rate": 1.8053264178015489e-06, + "loss": 0.466, + "step": 703 + }, + { + "ETA": 4.6, + "epoch": 0.22640295867502813, + "fp16_scale": 1.0, + "global_step": 704, + "grad_norm": 1.9024356112618308, + "learning_rate": 1.8047082574944867e-06, + "loss": 0.46, + "step": 704 + }, + { + "ETA": 4.6, + "epoch": 0.22672455378678244, + "fp16_scale": 1.0, + "global_step": 705, + "grad_norm": 2.0781068091814423, + "learning_rate": 1.8040892234850202e-06, + "loss": 0.4756, + "step": 705 + }, + { + "ETA": 4.6, + "epoch": 0.22704614889853675, + "fp16_scale": 1.0, + "global_step": 706, + "grad_norm": 1.8396122753092925, + "learning_rate": 1.8034693164452577e-06, + "loss": 0.4244, + "step": 706 + }, + { + "ETA": 4.59, + "epoch": 0.22736774401029106, + "fp16_scale": 1.0, + "global_step": 707, + "grad_norm": 1.9992189970908851, + "learning_rate": 1.802848537048256e-06, + "loss": 0.3647, + "step": 707 + }, + { + "ETA": 4.59, + "epoch": 0.22768933912204534, + "fp16_scale": 1.0, + "global_step": 708, + "grad_norm": 2.1224032032365585, + "learning_rate": 1.8022268859680194e-06, + "loss": 0.4193, + "step": 708 + }, + { + "ETA": 4.59, + "epoch": 0.22801093423379964, + "fp16_scale": 1.0, + "global_step": 709, + "grad_norm": 2.1626700811466986, + "learning_rate": 1.8016043638794973e-06, + "loss": 0.4294, + "step": 709 + }, + { + "ETA": 4.59, + "epoch": 0.22833252934555395, + "fp16_scale": 1.0, + "global_step": 710, + "grad_norm": 1.9963804168815649, + "learning_rate": 1.8009809714585863e-06, + "loss": 0.4556, + "step": 710 + }, + { + "ETA": 4.59, + "epoch": 0.22865412445730826, + "fp16_scale": 1.0, + "global_step": 711, + "grad_norm": 1.820212185204622, + "learning_rate": 1.800356709382127e-06, + "loss": 0.4039, + "step": 711 + }, + { + "ETA": 4.58, + "epoch": 0.22897571956906254, + "fp16_scale": 1.0, + "global_step": 712, + "grad_norm": 1.965943253027829, + "learning_rate": 1.7997315783279045e-06, + "loss": 0.3419, + "step": 712 + }, + { + "ETA": 4.58, + "epoch": 0.22929731468081685, + "fp16_scale": 1.0, + "global_step": 713, + "grad_norm": 1.9727340324024272, + "learning_rate": 1.7991055789746477e-06, + "loss": 0.4642, + "step": 713 + }, + { + "ETA": 4.58, + "epoch": 0.22961890979257116, + "fp16_scale": 1.0, + "global_step": 714, + "grad_norm": 1.9682197257968195, + "learning_rate": 1.7984787120020272e-06, + "loss": 0.3678, + "step": 714 + }, + { + "ETA": 4.57, + "epoch": 0.22994050490432547, + "fp16_scale": 1.0, + "global_step": 715, + "grad_norm": 2.016209477905265, + "learning_rate": 1.7978509780906577e-06, + "loss": 0.4064, + "step": 715 + }, + { + "ETA": 4.57, + "epoch": 0.23026210001607975, + "fp16_scale": 1.0, + "global_step": 716, + "grad_norm": 1.9510813623596766, + "learning_rate": 1.7972223779220922e-06, + "loss": 0.4791, + "step": 716 + }, + { + "ETA": 4.57, + "epoch": 0.23058369512783405, + "fp16_scale": 1.0, + "global_step": 717, + "grad_norm": 1.9638577974984546, + "learning_rate": 1.796592912178827e-06, + "loss": 0.4723, + "step": 717 + }, + { + "ETA": 4.57, + "epoch": 0.23090529023958836, + "fp16_scale": 1.0, + "global_step": 718, + "grad_norm": 1.768505989368743, + "learning_rate": 1.795962581544297e-06, + "loss": 0.4377, + "step": 718 + }, + { + "ETA": 4.57, + "epoch": 0.23122688535134267, + "fp16_scale": 1.0, + "global_step": 719, + "grad_norm": 2.210067772931334, + "learning_rate": 1.7953313867028755e-06, + "loss": 0.5496, + "step": 719 + }, + { + "ETA": 4.56, + "epoch": 0.23154848046309695, + "fp16_scale": 1.0, + "global_step": 720, + "grad_norm": 1.8219268673341542, + "learning_rate": 1.7946993283398748e-06, + "loss": 0.4321, + "step": 720 + }, + { + "ETA": 4.56, + "epoch": 0.23187007557485126, + "fp16_scale": 1.0, + "global_step": 721, + "grad_norm": 2.004344691510899, + "learning_rate": 1.7940664071415457e-06, + "loss": 0.5344, + "step": 721 + }, + { + "ETA": 4.56, + "epoch": 0.23219167068660557, + "fp16_scale": 1.0, + "global_step": 722, + "grad_norm": 1.9855726746474036, + "learning_rate": 1.793432623795074e-06, + "loss": 0.4949, + "step": 722 + }, + { + "ETA": 4.56, + "epoch": 0.23251326579835987, + "fp16_scale": 1.0, + "global_step": 723, + "grad_norm": 2.0486130421428657, + "learning_rate": 1.7927979789885824e-06, + "loss": 0.4047, + "step": 723 + }, + { + "ETA": 4.56, + "epoch": 0.23283486091011416, + "fp16_scale": 1.0, + "global_step": 724, + "grad_norm": 1.9272440469802035, + "learning_rate": 1.7921624734111291e-06, + "loss": 0.4699, + "step": 724 + }, + { + "ETA": 4.56, + "epoch": 0.23315645602186846, + "fp16_scale": 1.0, + "global_step": 725, + "grad_norm": 1.9601881150311802, + "learning_rate": 1.7915261077527072e-06, + "loss": 0.3968, + "step": 725 + }, + { + "ETA": 4.56, + "epoch": 0.23347805113362277, + "fp16_scale": 1.0, + "global_step": 726, + "grad_norm": 2.0041671971661597, + "learning_rate": 1.7908888827042424e-06, + "loss": 0.4972, + "step": 726 + }, + { + "ETA": 4.56, + "epoch": 0.23379964624537708, + "fp16_scale": 1.0, + "global_step": 727, + "grad_norm": 1.9826428974386219, + "learning_rate": 1.7902507989575947e-06, + "loss": 0.399, + "step": 727 + }, + { + "ETA": 4.55, + "epoch": 0.23412124135713136, + "fp16_scale": 1.0, + "global_step": 728, + "grad_norm": 2.0330369293048265, + "learning_rate": 1.7896118572055555e-06, + "loss": 0.4726, + "step": 728 + }, + { + "ETA": 4.55, + "epoch": 0.23444283646888567, + "fp16_scale": 1.0, + "global_step": 729, + "grad_norm": 2.0163117339345944, + "learning_rate": 1.7889720581418488e-06, + "loss": 0.4936, + "step": 729 + }, + { + "ETA": 4.55, + "epoch": 0.23476443158063998, + "fp16_scale": 1.0, + "global_step": 730, + "grad_norm": 1.988671170173158, + "learning_rate": 1.7883314024611284e-06, + "loss": 0.4436, + "step": 730 + }, + { + "ETA": 4.55, + "epoch": 0.23508602669239428, + "fp16_scale": 1.0, + "global_step": 731, + "grad_norm": 2.0975771896673088, + "learning_rate": 1.7876898908589787e-06, + "loss": 0.4049, + "step": 731 + }, + { + "ETA": 4.55, + "epoch": 0.23540762180414856, + "fp16_scale": 1.0, + "global_step": 732, + "grad_norm": 2.082944961166679, + "learning_rate": 1.787047524031913e-06, + "loss": 0.4801, + "step": 732 + }, + { + "ETA": 4.55, + "epoch": 0.23572921691590287, + "fp16_scale": 1.0, + "global_step": 733, + "grad_norm": 1.899197242182982, + "learning_rate": 1.7864043026773738e-06, + "loss": 0.4641, + "step": 733 + }, + { + "ETA": 4.54, + "epoch": 0.23605081202765718, + "fp16_scale": 1.0, + "global_step": 734, + "grad_norm": 1.9972371822291792, + "learning_rate": 1.7857602274937306e-06, + "loss": 0.3807, + "step": 734 + }, + { + "ETA": 4.54, + "epoch": 0.2363724071394115, + "fp16_scale": 1.0, + "global_step": 735, + "grad_norm": 2.193975694579717, + "learning_rate": 1.7851152991802808e-06, + "loss": 0.3962, + "step": 735 + }, + { + "ETA": 4.54, + "epoch": 0.23669400225116577, + "fp16_scale": 1.0, + "global_step": 736, + "grad_norm": 1.9577948654527098, + "learning_rate": 1.7844695184372473e-06, + "loss": 0.4417, + "step": 736 + }, + { + "ETA": 4.53, + "epoch": 0.23701559736292008, + "fp16_scale": 1.0, + "global_step": 737, + "grad_norm": 1.8564902332589925, + "learning_rate": 1.7838228859657794e-06, + "loss": 0.436, + "step": 737 + }, + { + "ETA": 4.53, + "epoch": 0.23733719247467439, + "fp16_scale": 1.0, + "global_step": 738, + "grad_norm": 2.266637819531783, + "learning_rate": 1.78317540246795e-06, + "loss": 0.4337, + "step": 738 + }, + { + "ETA": 4.53, + "epoch": 0.2376587875864287, + "fp16_scale": 1.0, + "global_step": 739, + "grad_norm": 2.158320028995456, + "learning_rate": 1.7825270686467567e-06, + "loss": 0.5044, + "step": 739 + }, + { + "ETA": 4.53, + "epoch": 0.23798038269818297, + "fp16_scale": 1.0, + "global_step": 740, + "grad_norm": 2.119824332945789, + "learning_rate": 1.7818778852061206e-06, + "loss": 0.4589, + "step": 740 + }, + { + "ETA": 4.52, + "epoch": 0.23830197780993728, + "fp16_scale": 1.0, + "global_step": 741, + "grad_norm": 2.2978059185149107, + "learning_rate": 1.7812278528508842e-06, + "loss": 0.3989, + "step": 741 + }, + { + "ETA": 4.52, + "epoch": 0.2386235729216916, + "fp16_scale": 1.0, + "global_step": 742, + "grad_norm": 1.9956639370091604, + "learning_rate": 1.7805769722868128e-06, + "loss": 0.4349, + "step": 742 + }, + { + "ETA": 4.52, + "epoch": 0.2389451680334459, + "fp16_scale": 1.0, + "global_step": 743, + "grad_norm": 2.037044955159949, + "learning_rate": 1.7799252442205926e-06, + "loss": 0.4272, + "step": 743 + }, + { + "ETA": 4.52, + "epoch": 0.2392667631452002, + "fp16_scale": 1.0, + "global_step": 744, + "grad_norm": 1.8554231707091167, + "learning_rate": 1.779272669359829e-06, + "loss": 0.4673, + "step": 744 + }, + { + "ETA": 4.52, + "epoch": 0.2395883582569545, + "fp16_scale": 1.0, + "global_step": 745, + "grad_norm": 2.2053717811448412, + "learning_rate": 1.7786192484130477e-06, + "loss": 0.4457, + "step": 745 + }, + { + "ETA": 4.51, + "epoch": 0.2399099533687088, + "fp16_scale": 1.0, + "global_step": 746, + "grad_norm": 2.317068308385471, + "learning_rate": 1.7779649820896925e-06, + "loss": 0.4507, + "step": 746 + }, + { + "ETA": 4.51, + "epoch": 0.2402315484804631, + "fp16_scale": 1.0, + "global_step": 747, + "grad_norm": 2.2616330283114303, + "learning_rate": 1.7773098711001255e-06, + "loss": 0.3667, + "step": 747 + }, + { + "ETA": 4.51, + "epoch": 0.2405531435922174, + "fp16_scale": 1.0, + "global_step": 748, + "grad_norm": 2.139905482754699, + "learning_rate": 1.7766539161556251e-06, + "loss": 0.4098, + "step": 748 + }, + { + "ETA": 4.51, + "epoch": 0.2408747387039717, + "fp16_scale": 1.0, + "global_step": 749, + "grad_norm": 1.8311878172702902, + "learning_rate": 1.7759971179683872e-06, + "loss": 0.3987, + "step": 749 + }, + { + "ETA": 4.5, + "epoch": 0.241196333815726, + "fp16_scale": 1.0, + "global_step": 750, + "grad_norm": 2.138934959196834, + "learning_rate": 1.7753394772515228e-06, + "loss": 0.4585, + "step": 750 + }, + { + "ETA": 4.5, + "epoch": 0.2415179289274803, + "fp16_scale": 1.0, + "global_step": 751, + "grad_norm": 2.222212966993316, + "learning_rate": 1.7746809947190567e-06, + "loss": 0.3848, + "step": 751 + }, + { + "ETA": 4.5, + "epoch": 0.24183952403923462, + "fp16_scale": 1.0, + "global_step": 752, + "grad_norm": 2.1018816148178368, + "learning_rate": 1.7740216710859288e-06, + "loss": 0.4393, + "step": 752 + }, + { + "ETA": 4.5, + "epoch": 0.2421611191509889, + "fp16_scale": 1.0, + "global_step": 753, + "grad_norm": 1.9861261843879785, + "learning_rate": 1.773361507067992e-06, + "loss": 0.3987, + "step": 753 + }, + { + "ETA": 4.5, + "epoch": 0.2424827142627432, + "fp16_scale": 1.0, + "global_step": 754, + "grad_norm": 1.8913263997988305, + "learning_rate": 1.7727005033820114e-06, + "loss": 0.4597, + "step": 754 + }, + { + "ETA": 4.49, + "epoch": 0.2428043093744975, + "fp16_scale": 1.0, + "global_step": 755, + "grad_norm": 2.0630895122508397, + "learning_rate": 1.7720386607456636e-06, + "loss": 0.4939, + "step": 755 + }, + { + "ETA": 4.49, + "epoch": 0.24312590448625182, + "fp16_scale": 1.0, + "global_step": 756, + "grad_norm": 1.8784062689214744, + "learning_rate": 1.7713759798775372e-06, + "loss": 0.5216, + "step": 756 + }, + { + "ETA": 4.49, + "epoch": 0.2434474995980061, + "fp16_scale": 1.0, + "global_step": 757, + "grad_norm": 2.0242708323883543, + "learning_rate": 1.7707124614971294e-06, + "loss": 0.4566, + "step": 757 + }, + { + "ETA": 4.49, + "epoch": 0.2437690947097604, + "fp16_scale": 1.0, + "global_step": 758, + "grad_norm": 1.8976058234202664, + "learning_rate": 1.770048106324847e-06, + "loss": 0.4751, + "step": 758 + }, + { + "ETA": 4.49, + "epoch": 0.24409068982151472, + "fp16_scale": 1.0, + "global_step": 759, + "grad_norm": 2.1489217687260904, + "learning_rate": 1.7693829150820067e-06, + "loss": 0.3589, + "step": 759 + }, + { + "ETA": 4.49, + "epoch": 0.24441228493326903, + "fp16_scale": 1.0, + "global_step": 760, + "grad_norm": 1.8301097217537243, + "learning_rate": 1.7687168884908314e-06, + "loss": 0.4735, + "step": 760 + }, + { + "ETA": 4.48, + "epoch": 0.2447338800450233, + "fp16_scale": 1.0, + "global_step": 761, + "grad_norm": 1.9556663336829052, + "learning_rate": 1.7680500272744515e-06, + "loss": 0.4321, + "step": 761 + }, + { + "ETA": 4.48, + "epoch": 0.2450554751567776, + "fp16_scale": 1.0, + "global_step": 762, + "grad_norm": 2.0829984705854794, + "learning_rate": 1.767382332156904e-06, + "loss": 0.5225, + "step": 762 + }, + { + "ETA": 4.48, + "epoch": 0.24537707026853192, + "fp16_scale": 1.0, + "global_step": 763, + "grad_norm": 1.9064636228392189, + "learning_rate": 1.7667138038631305e-06, + "loss": 0.4515, + "step": 763 + }, + { + "ETA": 4.48, + "epoch": 0.24569866538028623, + "fp16_scale": 1.0, + "global_step": 764, + "grad_norm": 2.137222624168098, + "learning_rate": 1.766044443118978e-06, + "loss": 0.3982, + "step": 764 + }, + { + "ETA": 4.47, + "epoch": 0.2460202604920405, + "fp16_scale": 1.0, + "global_step": 765, + "grad_norm": 1.7814841302675692, + "learning_rate": 1.7653742506511966e-06, + "loss": 0.5006, + "step": 765 + }, + { + "ETA": 4.47, + "epoch": 0.24634185560379482, + "fp16_scale": 1.0, + "global_step": 766, + "grad_norm": 1.8475546846241377, + "learning_rate": 1.7647032271874399e-06, + "loss": 0.4, + "step": 766 + }, + { + "ETA": 4.47, + "epoch": 0.24666345071554913, + "fp16_scale": 1.0, + "global_step": 767, + "grad_norm": 2.145473176810697, + "learning_rate": 1.7640313734562638e-06, + "loss": 0.481, + "step": 767 + }, + { + "ETA": 4.47, + "epoch": 0.24698504582730343, + "fp16_scale": 1.0, + "global_step": 768, + "grad_norm": 1.922921093523383, + "learning_rate": 1.7633586901871248e-06, + "loss": 0.447, + "step": 768 + }, + { + "ETA": 4.47, + "epoch": 0.24730664093905771, + "fp16_scale": 1.0, + "global_step": 769, + "grad_norm": 2.0565372242087903, + "learning_rate": 1.7626851781103818e-06, + "loss": 0.4808, + "step": 769 + }, + { + "ETA": 4.47, + "epoch": 0.24762823605081202, + "fp16_scale": 1.0, + "global_step": 770, + "grad_norm": 1.9792795749250929, + "learning_rate": 1.762010837957292e-06, + "loss": 0.4895, + "step": 770 + }, + { + "ETA": 4.47, + "epoch": 0.24794983116256633, + "fp16_scale": 1.0, + "global_step": 771, + "grad_norm": 1.883691722510702, + "learning_rate": 1.7613356704600121e-06, + "loss": 0.4819, + "step": 771 + }, + { + "ETA": 4.46, + "epoch": 0.24827142627432064, + "fp16_scale": 1.0, + "global_step": 772, + "grad_norm": 2.2677718519687144, + "learning_rate": 1.7606596763515972e-06, + "loss": 0.392, + "step": 772 + }, + { + "ETA": 4.46, + "epoch": 0.24859302138607492, + "fp16_scale": 1.0, + "global_step": 773, + "grad_norm": 2.1000510484291746, + "learning_rate": 1.7599828563660001e-06, + "loss": 0.4982, + "step": 773 + }, + { + "ETA": 4.46, + "epoch": 0.24891461649782923, + "fp16_scale": 1.0, + "global_step": 774, + "grad_norm": 1.9092489891093536, + "learning_rate": 1.75930521123807e-06, + "loss": 0.4644, + "step": 774 + }, + { + "ETA": 4.46, + "epoch": 0.24923621160958354, + "fp16_scale": 1.0, + "global_step": 775, + "grad_norm": 1.8151348544203338, + "learning_rate": 1.7586267417035514e-06, + "loss": 0.4389, + "step": 775 + }, + { + "ETA": 4.46, + "epoch": 0.24955780672133784, + "fp16_scale": 1.0, + "global_step": 776, + "grad_norm": 2.000002819432597, + "learning_rate": 1.7579474484990855e-06, + "loss": 0.4121, + "step": 776 + }, + { + "ETA": 4.46, + "epoch": 0.24987940183309212, + "fp16_scale": 1.0, + "global_step": 777, + "grad_norm": 2.175264685351054, + "learning_rate": 1.757267332362206e-06, + "loss": 0.4345, + "step": 777 + }, + { + "ETA": 4.45, + "epoch": 0.25020099694484643, + "fp16_scale": 1.0, + "global_step": 778, + "grad_norm": 2.2492227906871243, + "learning_rate": 1.7565863940313413e-06, + "loss": 0.4726, + "step": 778 + }, + { + "ETA": 4.45, + "epoch": 0.2505225920566007, + "fp16_scale": 1.0, + "global_step": 779, + "grad_norm": 2.035071396505975, + "learning_rate": 1.755904634245812e-06, + "loss": 0.3934, + "step": 779 + }, + { + "ETA": 4.45, + "epoch": 0.25084418716835505, + "fp16_scale": 1.0, + "global_step": 780, + "grad_norm": 1.8616710950931066, + "learning_rate": 1.7552220537458305e-06, + "loss": 0.3698, + "step": 780 + }, + { + "ETA": 4.45, + "epoch": 0.25116578228010933, + "fp16_scale": 1.0, + "global_step": 781, + "grad_norm": 1.923068394286512, + "learning_rate": 1.7545386532725007e-06, + "loss": 0.3803, + "step": 781 + }, + { + "ETA": 4.44, + "epoch": 0.25148737739186366, + "fp16_scale": 1.0, + "global_step": 782, + "grad_norm": 1.900807572854149, + "learning_rate": 1.7538544335678162e-06, + "loss": 0.5162, + "step": 782 + }, + { + "ETA": 4.44, + "epoch": 0.25180897250361794, + "fp16_scale": 1.0, + "global_step": 783, + "grad_norm": 2.0443781807318606, + "learning_rate": 1.753169395374661e-06, + "loss": 0.412, + "step": 783 + }, + { + "ETA": 4.44, + "epoch": 0.2521305676153722, + "fp16_scale": 1.0, + "global_step": 784, + "grad_norm": 2.1369422399704097, + "learning_rate": 1.7524835394368065e-06, + "loss": 0.5262, + "step": 784 + }, + { + "ETA": 4.44, + "epoch": 0.25245216272712656, + "fp16_scale": 1.0, + "global_step": 785, + "grad_norm": 2.0901290992793378, + "learning_rate": 1.751796866498913e-06, + "loss": 0.4367, + "step": 785 + }, + { + "ETA": 4.43, + "epoch": 0.25277375783888084, + "fp16_scale": 1.0, + "global_step": 786, + "grad_norm": 1.9588990538395974, + "learning_rate": 1.7511093773065273e-06, + "loss": 0.4255, + "step": 786 + }, + { + "ETA": 4.43, + "epoch": 0.2530953529506352, + "fp16_scale": 1.0, + "global_step": 787, + "grad_norm": 2.3157784000612227, + "learning_rate": 1.7504210726060826e-06, + "loss": 0.4295, + "step": 787 + }, + { + "ETA": 4.43, + "epoch": 0.25341694806238946, + "fp16_scale": 1.0, + "global_step": 788, + "grad_norm": 1.7158073363100894, + "learning_rate": 1.7497319531448976e-06, + "loss": 0.443, + "step": 788 + }, + { + "ETA": 4.43, + "epoch": 0.25373854317414374, + "fp16_scale": 1.0, + "global_step": 789, + "grad_norm": 2.2600046462658505, + "learning_rate": 1.7490420196711755e-06, + "loss": 0.4934, + "step": 789 + }, + { + "ETA": 4.43, + "epoch": 0.2540601382858981, + "fp16_scale": 1.0, + "global_step": 790, + "grad_norm": 1.7129405170763643, + "learning_rate": 1.7483512729340032e-06, + "loss": 0.4376, + "step": 790 + }, + { + "ETA": 4.43, + "epoch": 0.25438173339765235, + "fp16_scale": 1.0, + "global_step": 791, + "grad_norm": 1.9897989562336427, + "learning_rate": 1.7476597136833511e-06, + "loss": 0.4754, + "step": 791 + }, + { + "ETA": 4.42, + "epoch": 0.25470332850940663, + "fp16_scale": 1.0, + "global_step": 792, + "grad_norm": 1.8201510096470175, + "learning_rate": 1.7469673426700713e-06, + "loss": 0.4198, + "step": 792 + }, + { + "ETA": 4.42, + "epoch": 0.25502492362116097, + "fp16_scale": 1.0, + "global_step": 793, + "grad_norm": 1.7981216800446684, + "learning_rate": 1.7462741606458973e-06, + "loss": 0.3971, + "step": 793 + }, + { + "ETA": 4.42, + "epoch": 0.25534651873291525, + "fp16_scale": 1.0, + "global_step": 794, + "grad_norm": 1.9620634932865766, + "learning_rate": 1.7455801683634431e-06, + "loss": 0.4493, + "step": 794 + }, + { + "ETA": 4.42, + "epoch": 0.2556681138446696, + "fp16_scale": 1.0, + "global_step": 795, + "grad_norm": 1.998101638799515, + "learning_rate": 1.7448853665762027e-06, + "loss": 0.4601, + "step": 795 + }, + { + "ETA": 4.42, + "epoch": 0.25598970895642387, + "fp16_scale": 1.0, + "global_step": 796, + "grad_norm": 2.1496093269738283, + "learning_rate": 1.744189756038549e-06, + "loss": 0.4203, + "step": 796 + }, + { + "ETA": 4.42, + "epoch": 0.25631130406817815, + "fp16_scale": 1.0, + "global_step": 797, + "grad_norm": 1.787968047373974, + "learning_rate": 1.7434933375057327e-06, + "loss": 0.4502, + "step": 797 + }, + { + "ETA": 4.42, + "epoch": 0.2566328991799325, + "fp16_scale": 1.0, + "global_step": 798, + "grad_norm": 1.7911742148217042, + "learning_rate": 1.7427961117338817e-06, + "loss": 0.4849, + "step": 798 + }, + { + "ETA": 4.41, + "epoch": 0.25695449429168676, + "fp16_scale": 1.0, + "global_step": 799, + "grad_norm": 1.9238691149579596, + "learning_rate": 1.742098079480001e-06, + "loss": 0.4768, + "step": 799 + }, + { + "ETA": 4.41, + "epoch": 0.25727608940344104, + "fp16_scale": 1.0, + "global_step": 800, + "grad_norm": 2.194856263523492, + "learning_rate": 1.7413992415019704e-06, + "loss": 0.4083, + "step": 800 + }, + { + "ETA": 4.43, + "epoch": 0.2575976845151954, + "fp16_scale": 1.0, + "global_step": 801, + "grad_norm": 2.2135807781089194, + "learning_rate": 1.7406995985585453e-06, + "loss": 0.4369, + "step": 801 + }, + { + "ETA": 4.43, + "epoch": 0.25791927962694966, + "fp16_scale": 1.0, + "global_step": 802, + "grad_norm": 1.9396102788499803, + "learning_rate": 1.7399991514093546e-06, + "loss": 0.4444, + "step": 802 + }, + { + "ETA": 4.43, + "epoch": 0.258240874738704, + "fp16_scale": 1.0, + "global_step": 803, + "grad_norm": 1.9528208225357846, + "learning_rate": 1.7392979008149e-06, + "loss": 0.4459, + "step": 803 + }, + { + "ETA": 4.43, + "epoch": 0.2585624698504583, + "fp16_scale": 1.0, + "global_step": 804, + "grad_norm": 2.0034358076573864, + "learning_rate": 1.7385958475365569e-06, + "loss": 0.4678, + "step": 804 + }, + { + "ETA": 4.43, + "epoch": 0.25888406496221256, + "fp16_scale": 1.0, + "global_step": 805, + "grad_norm": 2.1333845100844457, + "learning_rate": 1.7378929923365703e-06, + "loss": 0.45, + "step": 805 + }, + { + "ETA": 4.42, + "epoch": 0.2592056600739669, + "fp16_scale": 1.0, + "global_step": 806, + "grad_norm": 2.0507099534205753, + "learning_rate": 1.7371893359780573e-06, + "loss": 0.5479, + "step": 806 + }, + { + "ETA": 4.42, + "epoch": 0.2595272551857212, + "fp16_scale": 1.0, + "global_step": 807, + "grad_norm": 1.9725026084076829, + "learning_rate": 1.7364848792250047e-06, + "loss": 0.4651, + "step": 807 + }, + { + "ETA": 4.42, + "epoch": 0.25984885029747545, + "fp16_scale": 1.0, + "global_step": 808, + "grad_norm": 2.2343844990472124, + "learning_rate": 1.7357796228422675e-06, + "loss": 0.3991, + "step": 808 + }, + { + "ETA": 4.42, + "epoch": 0.2601704454092298, + "fp16_scale": 1.0, + "global_step": 809, + "grad_norm": 1.9731393977209701, + "learning_rate": 1.7350735675955695e-06, + "loss": 0.4105, + "step": 809 + }, + { + "ETA": 4.42, + "epoch": 0.26049204052098407, + "fp16_scale": 1.0, + "global_step": 810, + "grad_norm": 1.9473285535913734, + "learning_rate": 1.7343667142515021e-06, + "loss": 0.4022, + "step": 810 + }, + { + "ETA": 4.41, + "epoch": 0.2608136356327384, + "fp16_scale": 1.0, + "global_step": 811, + "grad_norm": 2.2285101547456128, + "learning_rate": 1.7336590635775228e-06, + "loss": 0.4212, + "step": 811 + }, + { + "ETA": 4.41, + "epoch": 0.2611352307444927, + "fp16_scale": 1.0, + "global_step": 812, + "grad_norm": 2.0016062409752933, + "learning_rate": 1.7329506163419546e-06, + "loss": 0.3362, + "step": 812 + }, + { + "ETA": 4.4, + "epoch": 0.26145682585624697, + "fp16_scale": 1.0, + "global_step": 813, + "grad_norm": 1.8228175063218146, + "learning_rate": 1.732241373313986e-06, + "loss": 0.4284, + "step": 813 + }, + { + "ETA": 4.4, + "epoch": 0.2617784209680013, + "fp16_scale": 1.0, + "global_step": 814, + "grad_norm": 1.7879482618101108, + "learning_rate": 1.731531335263669e-06, + "loss": 0.5041, + "step": 814 + }, + { + "ETA": 4.4, + "epoch": 0.2621000160797556, + "fp16_scale": 1.0, + "global_step": 815, + "grad_norm": 1.7973268099244846, + "learning_rate": 1.7308205029619186e-06, + "loss": 0.4017, + "step": 815 + }, + { + "ETA": 4.4, + "epoch": 0.26242161119150986, + "fp16_scale": 1.0, + "global_step": 816, + "grad_norm": 2.0170356461538064, + "learning_rate": 1.7301088771805134e-06, + "loss": 0.3851, + "step": 816 + }, + { + "ETA": 4.39, + "epoch": 0.2627432063032642, + "fp16_scale": 1.0, + "global_step": 817, + "grad_norm": 1.962065648722113, + "learning_rate": 1.729396458692092e-06, + "loss": 0.3713, + "step": 817 + }, + { + "ETA": 4.39, + "epoch": 0.2630648014150185, + "fp16_scale": 1.0, + "global_step": 818, + "grad_norm": 2.1056835285183833, + "learning_rate": 1.728683248270154e-06, + "loss": 0.5194, + "step": 818 + }, + { + "ETA": 4.39, + "epoch": 0.2633863965267728, + "fp16_scale": 1.0, + "global_step": 819, + "grad_norm": 2.0273393563710864, + "learning_rate": 1.72796924668906e-06, + "loss": 0.3721, + "step": 819 + }, + { + "ETA": 4.38, + "epoch": 0.2637079916385271, + "fp16_scale": 1.0, + "global_step": 820, + "grad_norm": 1.9509036703352076, + "learning_rate": 1.727254454724028e-06, + "loss": 0.349, + "step": 820 + }, + { + "ETA": 4.38, + "epoch": 0.2640295867502814, + "fp16_scale": 1.0, + "global_step": 821, + "grad_norm": 2.025537968078176, + "learning_rate": 1.726538873151135e-06, + "loss": 0.3888, + "step": 821 + }, + { + "ETA": 4.38, + "epoch": 0.2643511818620357, + "fp16_scale": 1.0, + "global_step": 822, + "grad_norm": 1.9390468488956132, + "learning_rate": 1.7258225027473153e-06, + "loss": 0.4164, + "step": 822 + }, + { + "ETA": 4.38, + "epoch": 0.26467277697379, + "fp16_scale": 1.0, + "global_step": 823, + "grad_norm": 1.927968786601008, + "learning_rate": 1.7251053442903594e-06, + "loss": 0.4315, + "step": 823 + }, + { + "ETA": 4.38, + "epoch": 0.2649943720855443, + "fp16_scale": 1.0, + "global_step": 824, + "grad_norm": 2.144994559077059, + "learning_rate": 1.7243873985589134e-06, + "loss": 0.4794, + "step": 824 + }, + { + "ETA": 4.37, + "epoch": 0.2653159671972986, + "fp16_scale": 1.0, + "global_step": 825, + "grad_norm": 2.2808454226695085, + "learning_rate": 1.723668666332479e-06, + "loss": 0.4492, + "step": 825 + }, + { + "ETA": 4.37, + "epoch": 0.2656375623090529, + "fp16_scale": 1.0, + "global_step": 826, + "grad_norm": 1.9193515378791834, + "learning_rate": 1.7229491483914106e-06, + "loss": 0.4437, + "step": 826 + }, + { + "ETA": 4.37, + "epoch": 0.2659591574208072, + "fp16_scale": 1.0, + "global_step": 827, + "grad_norm": 2.012237408269288, + "learning_rate": 1.7222288455169162e-06, + "loss": 0.4653, + "step": 827 + }, + { + "ETA": 4.37, + "epoch": 0.2662807525325615, + "fp16_scale": 1.0, + "global_step": 828, + "grad_norm": 1.9147271412988327, + "learning_rate": 1.7215077584910563e-06, + "loss": 0.4144, + "step": 828 + }, + { + "ETA": 4.37, + "epoch": 0.2666023476443158, + "fp16_scale": 1.0, + "global_step": 829, + "grad_norm": 1.9663151726569172, + "learning_rate": 1.7207858880967425e-06, + "loss": 0.4917, + "step": 829 + }, + { + "ETA": 4.37, + "epoch": 0.2669239427560701, + "fp16_scale": 1.0, + "global_step": 830, + "grad_norm": 2.2663738245216725, + "learning_rate": 1.7200632351177367e-06, + "loss": 0.4011, + "step": 830 + }, + { + "ETA": 4.36, + "epoch": 0.2672455378678244, + "fp16_scale": 1.0, + "global_step": 831, + "grad_norm": 2.0313714161063405, + "learning_rate": 1.719339800338651e-06, + "loss": 0.4241, + "step": 831 + }, + { + "ETA": 4.36, + "epoch": 0.26756713297957874, + "fp16_scale": 1.0, + "global_step": 832, + "grad_norm": 1.9746138792173442, + "learning_rate": 1.7186155845449464e-06, + "loss": 0.5026, + "step": 832 + }, + { + "ETA": 4.36, + "epoch": 0.267888728091333, + "fp16_scale": 1.0, + "global_step": 833, + "grad_norm": 2.1934796616910988, + "learning_rate": 1.7178905885229309e-06, + "loss": 0.573, + "step": 833 + }, + { + "ETA": 4.36, + "epoch": 0.2682103232030873, + "fp16_scale": 1.0, + "global_step": 834, + "grad_norm": 1.9015089104423855, + "learning_rate": 1.717164813059761e-06, + "loss": 0.4096, + "step": 834 + }, + { + "ETA": 4.36, + "epoch": 0.26853191831484163, + "fp16_scale": 1.0, + "global_step": 835, + "grad_norm": 2.047960992129641, + "learning_rate": 1.716438258943438e-06, + "loss": 0.3958, + "step": 835 + }, + { + "ETA": 4.36, + "epoch": 0.2688535134265959, + "fp16_scale": 1.0, + "global_step": 836, + "grad_norm": 2.0625477287596663, + "learning_rate": 1.71571092696281e-06, + "loss": 0.4992, + "step": 836 + }, + { + "ETA": 4.35, + "epoch": 0.2691751085383502, + "fp16_scale": 1.0, + "global_step": 837, + "grad_norm": 2.0557599608220283, + "learning_rate": 1.714982817907569e-06, + "loss": 0.4152, + "step": 837 + }, + { + "ETA": 4.35, + "epoch": 0.26949670365010453, + "fp16_scale": 1.0, + "global_step": 838, + "grad_norm": 2.2111570974142913, + "learning_rate": 1.7142539325682503e-06, + "loss": 0.4189, + "step": 838 + }, + { + "ETA": 4.35, + "epoch": 0.2698182987618588, + "fp16_scale": 1.0, + "global_step": 839, + "grad_norm": 2.0871791563213917, + "learning_rate": 1.7135242717362328e-06, + "loss": 0.3642, + "step": 839 + }, + { + "ETA": 4.34, + "epoch": 0.27013989387361315, + "fp16_scale": 1.0, + "global_step": 840, + "grad_norm": 2.13373016968836, + "learning_rate": 1.7127938362037373e-06, + "loss": 0.4134, + "step": 840 + }, + { + "ETA": 4.34, + "epoch": 0.2704614889853674, + "fp16_scale": 1.0, + "global_step": 841, + "grad_norm": 2.250770840572516, + "learning_rate": 1.7120626267638247e-06, + "loss": 0.3658, + "step": 841 + }, + { + "ETA": 4.33, + "epoch": 0.2707830840971217, + "fp16_scale": 1.0, + "global_step": 842, + "grad_norm": 1.9239987500787747, + "learning_rate": 1.7113306442103977e-06, + "loss": 0.4717, + "step": 842 + }, + { + "ETA": 4.33, + "epoch": 0.27110467920887604, + "fp16_scale": 1.0, + "global_step": 843, + "grad_norm": 1.9123381993159572, + "learning_rate": 1.7105978893381972e-06, + "loss": 0.4548, + "step": 843 + }, + { + "ETA": 4.33, + "epoch": 0.2714262743206303, + "fp16_scale": 1.0, + "global_step": 844, + "grad_norm": 1.9111183798624058, + "learning_rate": 1.7098643629428034e-06, + "loss": 0.4728, + "step": 844 + }, + { + "ETA": 4.33, + "epoch": 0.2717478694323846, + "fp16_scale": 1.0, + "global_step": 845, + "grad_norm": 2.008869802468323, + "learning_rate": 1.7091300658206334e-06, + "loss": 0.4089, + "step": 845 + }, + { + "ETA": 4.33, + "epoch": 0.27206946454413894, + "fp16_scale": 1.0, + "global_step": 846, + "grad_norm": 1.9139533175385044, + "learning_rate": 1.708394998768942e-06, + "loss": 0.4335, + "step": 846 + }, + { + "ETA": 4.33, + "epoch": 0.2723910596558932, + "fp16_scale": 1.0, + "global_step": 847, + "grad_norm": 1.9587937440179755, + "learning_rate": 1.707659162585819e-06, + "loss": 0.4303, + "step": 847 + }, + { + "ETA": 4.32, + "epoch": 0.27271265476764756, + "fp16_scale": 1.0, + "global_step": 848, + "grad_norm": 2.1733534262363423, + "learning_rate": 1.7069225580701904e-06, + "loss": 0.4729, + "step": 848 + }, + { + "ETA": 4.32, + "epoch": 0.27303424987940184, + "fp16_scale": 1.0, + "global_step": 849, + "grad_norm": 1.88692041884661, + "learning_rate": 1.7061851860218152e-06, + "loss": 0.3859, + "step": 849 + }, + { + "ETA": 4.32, + "epoch": 0.2733558449911561, + "fp16_scale": 1.0, + "global_step": 850, + "grad_norm": 1.8429844593171765, + "learning_rate": 1.7054470472412872e-06, + "loss": 0.3967, + "step": 850 + }, + { + "ETA": 4.32, + "epoch": 0.27367744010291045, + "fp16_scale": 1.0, + "global_step": 851, + "grad_norm": 1.8390089168268084, + "learning_rate": 1.7047081425300307e-06, + "loss": 0.4034, + "step": 851 + }, + { + "ETA": 4.32, + "epoch": 0.27399903521466473, + "fp16_scale": 1.0, + "global_step": 852, + "grad_norm": 1.95375035549875, + "learning_rate": 1.703968472690303e-06, + "loss": 0.5002, + "step": 852 + }, + { + "ETA": 4.32, + "epoch": 0.274320630326419, + "fp16_scale": 1.0, + "global_step": 853, + "grad_norm": 2.062375827886167, + "learning_rate": 1.7032280385251923e-06, + "loss": 0.5042, + "step": 853 + }, + { + "ETA": 4.32, + "epoch": 0.27464222543817335, + "fp16_scale": 1.0, + "global_step": 854, + "grad_norm": 2.0272940296944686, + "learning_rate": 1.7024868408386157e-06, + "loss": 0.4469, + "step": 854 + }, + { + "ETA": 4.32, + "epoch": 0.27496382054992763, + "fp16_scale": 1.0, + "global_step": 855, + "grad_norm": 1.914313565079873, + "learning_rate": 1.70174488043532e-06, + "loss": 0.43, + "step": 855 + }, + { + "ETA": 4.31, + "epoch": 0.27528541566168196, + "fp16_scale": 1.0, + "global_step": 856, + "grad_norm": 2.0827423924599904, + "learning_rate": 1.7010021581208797e-06, + "loss": 0.4696, + "step": 856 + }, + { + "ETA": 4.31, + "epoch": 0.27560701077343625, + "fp16_scale": 1.0, + "global_step": 857, + "grad_norm": 2.034557993310201, + "learning_rate": 1.7002586747016968e-06, + "loss": 0.4108, + "step": 857 + }, + { + "ETA": 4.3, + "epoch": 0.2759286058851905, + "fp16_scale": 1.0, + "global_step": 858, + "grad_norm": 1.9798550191722615, + "learning_rate": 1.6995144309849994e-06, + "loss": 0.3915, + "step": 858 + }, + { + "ETA": 4.3, + "epoch": 0.27625020099694486, + "fp16_scale": 1.0, + "global_step": 859, + "grad_norm": 1.9784747074833027, + "learning_rate": 1.6987694277788416e-06, + "loss": 0.4445, + "step": 859 + }, + { + "ETA": 4.3, + "epoch": 0.27657179610869914, + "fp16_scale": 1.0, + "global_step": 860, + "grad_norm": 1.9162443927414687, + "learning_rate": 1.6980236658921015e-06, + "loss": 0.4557, + "step": 860 + }, + { + "ETA": 4.3, + "epoch": 0.2768933912204534, + "fp16_scale": 1.0, + "global_step": 861, + "grad_norm": 2.225603044902912, + "learning_rate": 1.6972771461344812e-06, + "loss": 0.417, + "step": 861 + }, + { + "ETA": 4.29, + "epoch": 0.27721498633220776, + "fp16_scale": 1.0, + "global_step": 862, + "grad_norm": 1.8177365253866995, + "learning_rate": 1.6965298693165057e-06, + "loss": 0.426, + "step": 862 + }, + { + "ETA": 4.29, + "epoch": 0.27753658144396204, + "fp16_scale": 1.0, + "global_step": 863, + "grad_norm": 1.9528751550469554, + "learning_rate": 1.6957818362495218e-06, + "loss": 0.4988, + "step": 863 + }, + { + "ETA": 4.29, + "epoch": 0.2778581765557164, + "fp16_scale": 1.0, + "global_step": 864, + "grad_norm": 1.9734214739855664, + "learning_rate": 1.6950330477456975e-06, + "loss": 0.462, + "step": 864 + }, + { + "ETA": 4.29, + "epoch": 0.27817977166747065, + "fp16_scale": 1.0, + "global_step": 865, + "grad_norm": 2.048578997590825, + "learning_rate": 1.6942835046180214e-06, + "loss": 0.4576, + "step": 865 + }, + { + "ETA": 4.29, + "epoch": 0.27850136677922493, + "fp16_scale": 1.0, + "global_step": 866, + "grad_norm": 2.0365831905860485, + "learning_rate": 1.6935332076803005e-06, + "loss": 0.4645, + "step": 866 + }, + { + "ETA": 4.29, + "epoch": 0.27882296189097927, + "fp16_scale": 1.0, + "global_step": 867, + "grad_norm": 2.3348995559649004, + "learning_rate": 1.6927821577471609e-06, + "loss": 0.4275, + "step": 867 + }, + { + "ETA": 4.29, + "epoch": 0.27914455700273355, + "fp16_scale": 1.0, + "global_step": 868, + "grad_norm": 1.9424993100231185, + "learning_rate": 1.692030355634046e-06, + "loss": 0.462, + "step": 868 + }, + { + "ETA": 4.28, + "epoch": 0.2794661521144879, + "fp16_scale": 1.0, + "global_step": 869, + "grad_norm": 2.103025002987846, + "learning_rate": 1.6912778021572165e-06, + "loss": 0.4325, + "step": 869 + }, + { + "ETA": 4.28, + "epoch": 0.27978774722624217, + "fp16_scale": 1.0, + "global_step": 870, + "grad_norm": 2.0322857291386516, + "learning_rate": 1.6905244981337479e-06, + "loss": 0.4719, + "step": 870 + }, + { + "ETA": 4.28, + "epoch": 0.28010934233799645, + "fp16_scale": 1.0, + "global_step": 871, + "grad_norm": 2.2323938212614096, + "learning_rate": 1.689770444381531e-06, + "loss": 0.3803, + "step": 871 + }, + { + "ETA": 4.28, + "epoch": 0.2804309374497508, + "fp16_scale": 1.0, + "global_step": 872, + "grad_norm": 2.1196793658788686, + "learning_rate": 1.689015641719271e-06, + "loss": 0.4481, + "step": 872 + }, + { + "ETA": 4.28, + "epoch": 0.28075253256150506, + "fp16_scale": 1.0, + "global_step": 873, + "grad_norm": 2.0436425268766296, + "learning_rate": 1.688260090966486e-06, + "loss": 0.5382, + "step": 873 + }, + { + "ETA": 4.27, + "epoch": 0.28107412767325934, + "fp16_scale": 1.0, + "global_step": 874, + "grad_norm": 2.114287881621631, + "learning_rate": 1.687503792943506e-06, + "loss": 0.3502, + "step": 874 + }, + { + "ETA": 4.27, + "epoch": 0.2813957227850137, + "fp16_scale": 1.0, + "global_step": 875, + "grad_norm": 2.048699928774643, + "learning_rate": 1.6867467484714721e-06, + "loss": 0.4626, + "step": 875 + }, + { + "ETA": 4.27, + "epoch": 0.28171731789676796, + "fp16_scale": 1.0, + "global_step": 876, + "grad_norm": 1.9385321275469745, + "learning_rate": 1.6859889583723373e-06, + "loss": 0.4314, + "step": 876 + }, + { + "ETA": 4.27, + "epoch": 0.2820389130085223, + "fp16_scale": 1.0, + "global_step": 877, + "grad_norm": 1.7902775204514527, + "learning_rate": 1.6852304234688623e-06, + "loss": 0.4857, + "step": 877 + }, + { + "ETA": 4.26, + "epoch": 0.2823605081202766, + "fp16_scale": 1.0, + "global_step": 878, + "grad_norm": 1.9188851498442143, + "learning_rate": 1.6844711445846178e-06, + "loss": 0.387, + "step": 878 + }, + { + "ETA": 4.26, + "epoch": 0.28268210323203086, + "fp16_scale": 1.0, + "global_step": 879, + "grad_norm": 1.8294737345080043, + "learning_rate": 1.683711122543982e-06, + "loss": 0.4349, + "step": 879 + }, + { + "ETA": 4.26, + "epoch": 0.2830036983437852, + "fp16_scale": 1.0, + "global_step": 880, + "grad_norm": 1.8871558500356846, + "learning_rate": 1.6829503581721393e-06, + "loss": 0.4346, + "step": 880 + }, + { + "ETA": 4.26, + "epoch": 0.2833252934555395, + "fp16_scale": 1.0, + "global_step": 881, + "grad_norm": 1.8862954619001415, + "learning_rate": 1.6821888522950806e-06, + "loss": 0.4941, + "step": 881 + }, + { + "ETA": 4.25, + "epoch": 0.28364688856729375, + "fp16_scale": 1.0, + "global_step": 882, + "grad_norm": 2.309636485905092, + "learning_rate": 1.681426605739602e-06, + "loss": 0.388, + "step": 882 + }, + { + "ETA": 4.25, + "epoch": 0.2839684836790481, + "fp16_scale": 1.0, + "global_step": 883, + "grad_norm": 2.0456379778426883, + "learning_rate": 1.6806636193333038e-06, + "loss": 0.4262, + "step": 883 + }, + { + "ETA": 4.25, + "epoch": 0.28429007879080237, + "fp16_scale": 1.0, + "global_step": 884, + "grad_norm": 1.9450465927363632, + "learning_rate": 1.6798998939045892e-06, + "loss": 0.4945, + "step": 884 + }, + { + "ETA": 4.25, + "epoch": 0.2846116739025567, + "fp16_scale": 1.0, + "global_step": 885, + "grad_norm": 1.9607234693559452, + "learning_rate": 1.6791354302826637e-06, + "loss": 0.3906, + "step": 885 + }, + { + "ETA": 4.25, + "epoch": 0.284933269014311, + "fp16_scale": 1.0, + "global_step": 886, + "grad_norm": 2.0309419851069057, + "learning_rate": 1.6783702292975347e-06, + "loss": 0.4137, + "step": 886 + }, + { + "ETA": 4.24, + "epoch": 0.28525486412606527, + "fp16_scale": 1.0, + "global_step": 887, + "grad_norm": 2.0088058356002683, + "learning_rate": 1.6776042917800107e-06, + "loss": 0.4294, + "step": 887 + }, + { + "ETA": 4.24, + "epoch": 0.2855764592378196, + "fp16_scale": 1.0, + "global_step": 888, + "grad_norm": 1.869920815793303, + "learning_rate": 1.6768376185616983e-06, + "loss": 0.3826, + "step": 888 + }, + { + "ETA": 4.24, + "epoch": 0.2858980543495739, + "fp16_scale": 1.0, + "global_step": 889, + "grad_norm": 2.08059274751399, + "learning_rate": 1.6760702104750045e-06, + "loss": 0.4728, + "step": 889 + }, + { + "ETA": 4.24, + "epoch": 0.28621964946132816, + "fp16_scale": 1.0, + "global_step": 890, + "grad_norm": 2.219387716824284, + "learning_rate": 1.675302068353133e-06, + "loss": 0.4536, + "step": 890 + }, + { + "ETA": 4.24, + "epoch": 0.2865412445730825, + "fp16_scale": 1.0, + "global_step": 891, + "grad_norm": 2.057716974572081, + "learning_rate": 1.6745331930300856e-06, + "loss": 0.3842, + "step": 891 + }, + { + "ETA": 4.24, + "epoch": 0.2868628396848368, + "fp16_scale": 1.0, + "global_step": 892, + "grad_norm": 1.815613753259894, + "learning_rate": 1.6737635853406592e-06, + "loss": 0.4699, + "step": 892 + }, + { + "ETA": 4.23, + "epoch": 0.2871844347965911, + "fp16_scale": 1.0, + "global_step": 893, + "grad_norm": 1.9216240038533512, + "learning_rate": 1.6729932461204455e-06, + "loss": 0.4485, + "step": 893 + }, + { + "ETA": 4.23, + "epoch": 0.2875060299083454, + "fp16_scale": 1.0, + "global_step": 894, + "grad_norm": 1.8816264379862395, + "learning_rate": 1.6722221762058322e-06, + "loss": 0.4268, + "step": 894 + }, + { + "ETA": 4.23, + "epoch": 0.2878276250200997, + "fp16_scale": 1.0, + "global_step": 895, + "grad_norm": 2.237772525520464, + "learning_rate": 1.6714503764339985e-06, + "loss": 0.4008, + "step": 895 + }, + { + "ETA": 4.23, + "epoch": 0.288149220131854, + "fp16_scale": 1.0, + "global_step": 896, + "grad_norm": 2.2021509983138436, + "learning_rate": 1.6706778476429174e-06, + "loss": 0.3638, + "step": 896 + }, + { + "ETA": 4.22, + "epoch": 0.2884708152436083, + "fp16_scale": 1.0, + "global_step": 897, + "grad_norm": 2.3140625427022163, + "learning_rate": 1.6699045906713524e-06, + "loss": 0.4576, + "step": 897 + }, + { + "ETA": 4.22, + "epoch": 0.28879241035536257, + "fp16_scale": 1.0, + "global_step": 898, + "grad_norm": 1.9141845352200815, + "learning_rate": 1.669130606358858e-06, + "loss": 0.4727, + "step": 898 + }, + { + "ETA": 4.22, + "epoch": 0.2891140054671169, + "fp16_scale": 1.0, + "global_step": 899, + "grad_norm": 1.8455199467528947, + "learning_rate": 1.668355895545779e-06, + "loss": 0.526, + "step": 899 + }, + { + "ETA": 4.22, + "epoch": 0.2894356005788712, + "fp16_scale": 1.0, + "global_step": 900, + "grad_norm": 1.9324765236546506, + "learning_rate": 1.6675804590732479e-06, + "loss": 0.4558, + "step": 900 + }, + { + "ETA": 4.22, + "epoch": 0.2897571956906255, + "fp16_scale": 1.0, + "global_step": 901, + "grad_norm": 2.0324500329401682, + "learning_rate": 1.666804297783186e-06, + "loss": 0.3735, + "step": 901 + }, + { + "ETA": 4.21, + "epoch": 0.2900787908023798, + "fp16_scale": 1.0, + "global_step": 902, + "grad_norm": 2.2331769620950586, + "learning_rate": 1.6660274125183007e-06, + "loss": 0.432, + "step": 902 + }, + { + "ETA": 4.21, + "epoch": 0.2904003859141341, + "fp16_scale": 1.0, + "global_step": 903, + "grad_norm": 1.9605472275641855, + "learning_rate": 1.6652498041220864e-06, + "loss": 0.4608, + "step": 903 + }, + { + "ETA": 4.21, + "epoch": 0.2907219810258884, + "fp16_scale": 1.0, + "global_step": 904, + "grad_norm": 1.9836667106071602, + "learning_rate": 1.6644714734388216e-06, + "loss": 0.37, + "step": 904 + }, + { + "ETA": 4.21, + "epoch": 0.2910435761376427, + "fp16_scale": 1.0, + "global_step": 905, + "grad_norm": 2.0883224813836896, + "learning_rate": 1.6636924213135703e-06, + "loss": 0.4197, + "step": 905 + }, + { + "ETA": 4.21, + "epoch": 0.291365171249397, + "fp16_scale": 1.0, + "global_step": 906, + "grad_norm": 1.8957073666424162, + "learning_rate": 1.6629126485921784e-06, + "loss": 0.4679, + "step": 906 + }, + { + "ETA": 4.2, + "epoch": 0.2916867663611513, + "fp16_scale": 1.0, + "global_step": 907, + "grad_norm": 2.111429723636016, + "learning_rate": 1.6621321561212751e-06, + "loss": 0.4214, + "step": 907 + }, + { + "ETA": 4.2, + "epoch": 0.2920083614729056, + "fp16_scale": 1.0, + "global_step": 908, + "grad_norm": 2.0575896823495277, + "learning_rate": 1.6613509447482713e-06, + "loss": 0.4531, + "step": 908 + }, + { + "ETA": 4.2, + "epoch": 0.29232995658465993, + "fp16_scale": 1.0, + "global_step": 909, + "grad_norm": 1.9722130872935568, + "learning_rate": 1.6605690153213569e-06, + "loss": 0.4384, + "step": 909 + }, + { + "ETA": 4.2, + "epoch": 0.2926515516964142, + "fp16_scale": 1.0, + "global_step": 910, + "grad_norm": 1.8297545060113751, + "learning_rate": 1.6597863686895033e-06, + "loss": 0.3648, + "step": 910 + }, + { + "ETA": 4.19, + "epoch": 0.2929731468081685, + "fp16_scale": 1.0, + "global_step": 911, + "grad_norm": 1.8092968598163315, + "learning_rate": 1.6590030057024594e-06, + "loss": 0.383, + "step": 911 + }, + { + "ETA": 4.19, + "epoch": 0.29329474191992283, + "fp16_scale": 1.0, + "global_step": 912, + "grad_norm": 2.0699529416823705, + "learning_rate": 1.6582189272107524e-06, + "loss": 0.4354, + "step": 912 + }, + { + "ETA": 4.19, + "epoch": 0.2936163370316771, + "fp16_scale": 1.0, + "global_step": 913, + "grad_norm": 2.1270871985867372, + "learning_rate": 1.6574341340656859e-06, + "loss": 0.4982, + "step": 913 + }, + { + "ETA": 4.19, + "epoch": 0.29393793214343145, + "fp16_scale": 1.0, + "global_step": 914, + "grad_norm": 1.7365809088090376, + "learning_rate": 1.65664862711934e-06, + "loss": 0.4122, + "step": 914 + }, + { + "ETA": 4.19, + "epoch": 0.2942595272551857, + "fp16_scale": 1.0, + "global_step": 915, + "grad_norm": 2.2053078988627832, + "learning_rate": 1.6558624072245694e-06, + "loss": 0.414, + "step": 915 + }, + { + "ETA": 4.19, + "epoch": 0.29458112236694, + "fp16_scale": 1.0, + "global_step": 916, + "grad_norm": 1.9402797847729918, + "learning_rate": 1.6550754752350029e-06, + "loss": 0.5087, + "step": 916 + }, + { + "ETA": 4.18, + "epoch": 0.29490271747869434, + "fp16_scale": 1.0, + "global_step": 917, + "grad_norm": 2.0814519151266078, + "learning_rate": 1.654287832005043e-06, + "loss": 0.4751, + "step": 917 + }, + { + "ETA": 4.18, + "epoch": 0.2952243125904486, + "fp16_scale": 1.0, + "global_step": 918, + "grad_norm": 2.027778234865114, + "learning_rate": 1.6534994783898632e-06, + "loss": 0.532, + "step": 918 + }, + { + "ETA": 4.18, + "epoch": 0.2955459077022029, + "fp16_scale": 1.0, + "global_step": 919, + "grad_norm": 2.0031415254223517, + "learning_rate": 1.6527104152454094e-06, + "loss": 0.4551, + "step": 919 + }, + { + "ETA": 4.18, + "epoch": 0.29586750281395724, + "fp16_scale": 1.0, + "global_step": 920, + "grad_norm": 1.8159943503802232, + "learning_rate": 1.651920643428398e-06, + "loss": 0.4119, + "step": 920 + }, + { + "ETA": 4.18, + "epoch": 0.2961890979257115, + "fp16_scale": 1.0, + "global_step": 921, + "grad_norm": 1.9489260671056785, + "learning_rate": 1.6511301637963135e-06, + "loss": 0.4342, + "step": 921 + }, + { + "ETA": 4.18, + "epoch": 0.29651069303746586, + "fp16_scale": 1.0, + "global_step": 922, + "grad_norm": 2.3098551375258363, + "learning_rate": 1.6503389772074101e-06, + "loss": 0.4943, + "step": 922 + }, + { + "ETA": 4.18, + "epoch": 0.29683228814922014, + "fp16_scale": 1.0, + "global_step": 923, + "grad_norm": 1.788580790300466, + "learning_rate": 1.6495470845207096e-06, + "loss": 0.4268, + "step": 923 + }, + { + "ETA": 4.17, + "epoch": 0.2971538832609744, + "fp16_scale": 1.0, + "global_step": 924, + "grad_norm": 2.2036099523986175, + "learning_rate": 1.6487544865959993e-06, + "loss": 0.454, + "step": 924 + }, + { + "ETA": 4.17, + "epoch": 0.29747547837272875, + "fp16_scale": 1.0, + "global_step": 925, + "grad_norm": 2.119978543320096, + "learning_rate": 1.6479611842938336e-06, + "loss": 0.4539, + "step": 925 + }, + { + "ETA": 4.17, + "epoch": 0.29779707348448303, + "fp16_scale": 1.0, + "global_step": 926, + "grad_norm": 1.9519790961903443, + "learning_rate": 1.6471671784755307e-06, + "loss": 0.4717, + "step": 926 + }, + { + "ETA": 4.17, + "epoch": 0.2981186685962373, + "fp16_scale": 1.0, + "global_step": 927, + "grad_norm": 2.09850957875013, + "learning_rate": 1.6463724700031729e-06, + "loss": 0.4486, + "step": 927 + }, + { + "ETA": 4.17, + "epoch": 0.29844026370799165, + "fp16_scale": 1.0, + "global_step": 928, + "grad_norm": 1.967319260081159, + "learning_rate": 1.6455770597396057e-06, + "loss": 0.3927, + "step": 928 + }, + { + "ETA": 4.17, + "epoch": 0.29876185881974593, + "fp16_scale": 1.0, + "global_step": 929, + "grad_norm": 1.9256768270796039, + "learning_rate": 1.644780948548436e-06, + "loss": 0.4866, + "step": 929 + }, + { + "ETA": 4.17, + "epoch": 0.29908345393150026, + "fp16_scale": 1.0, + "global_step": 930, + "grad_norm": 2.8211455565522945, + "learning_rate": 1.6439841372940327e-06, + "loss": 0.4714, + "step": 930 + }, + { + "ETA": 4.16, + "epoch": 0.29940504904325455, + "fp16_scale": 1.0, + "global_step": 931, + "grad_norm": 1.9215018024476105, + "learning_rate": 1.6431866268415236e-06, + "loss": 0.4105, + "step": 931 + }, + { + "ETA": 4.16, + "epoch": 0.2997266441550088, + "fp16_scale": 1.0, + "global_step": 932, + "grad_norm": 2.274164232899391, + "learning_rate": 1.6423884180567957e-06, + "loss": 0.505, + "step": 932 + }, + { + "ETA": 4.16, + "epoch": 0.30004823926676316, + "fp16_scale": 1.0, + "global_step": 933, + "grad_norm": 2.29083762275944, + "learning_rate": 1.6415895118064957e-06, + "loss": 0.4023, + "step": 933 + }, + { + "ETA": 4.16, + "epoch": 0.30036983437851744, + "fp16_scale": 1.0, + "global_step": 934, + "grad_norm": 1.9240621370254039, + "learning_rate": 1.6407899089580259e-06, + "loss": 0.4576, + "step": 934 + }, + { + "ETA": 4.16, + "epoch": 0.3006914294902717, + "fp16_scale": 1.0, + "global_step": 935, + "grad_norm": 2.0423004097962707, + "learning_rate": 1.639989610379546e-06, + "loss": 0.4809, + "step": 935 + }, + { + "ETA": 4.16, + "epoch": 0.30101302460202606, + "fp16_scale": 1.0, + "global_step": 936, + "grad_norm": 2.1610394320106567, + "learning_rate": 1.63918861693997e-06, + "loss": 0.5741, + "step": 936 + }, + { + "ETA": 4.15, + "epoch": 0.30133461971378034, + "fp16_scale": 1.0, + "global_step": 937, + "grad_norm": 1.8416511356810563, + "learning_rate": 1.6383869295089679e-06, + "loss": 0.4964, + "step": 937 + }, + { + "ETA": 4.15, + "epoch": 0.3016562148255347, + "fp16_scale": 1.0, + "global_step": 938, + "grad_norm": 2.009089661026409, + "learning_rate": 1.6375845489569614e-06, + "loss": 0.4623, + "step": 938 + }, + { + "ETA": 4.15, + "epoch": 0.30197780993728895, + "fp16_scale": 1.0, + "global_step": 939, + "grad_norm": 1.9471135785187048, + "learning_rate": 1.636781476155126e-06, + "loss": 0.4525, + "step": 939 + }, + { + "ETA": 4.15, + "epoch": 0.30229940504904323, + "fp16_scale": 1.0, + "global_step": 940, + "grad_norm": 1.9459217505997315, + "learning_rate": 1.6359777119753883e-06, + "loss": 0.4674, + "step": 940 + }, + { + "ETA": 4.15, + "epoch": 0.30262100016079757, + "fp16_scale": 1.0, + "global_step": 941, + "grad_norm": 1.9527092305549782, + "learning_rate": 1.6351732572904257e-06, + "loss": 0.448, + "step": 941 + }, + { + "ETA": 4.15, + "epoch": 0.30294259527255185, + "fp16_scale": 1.0, + "global_step": 942, + "grad_norm": 2.0542514425282605, + "learning_rate": 1.6343681129736658e-06, + "loss": 0.4661, + "step": 942 + }, + { + "ETA": 4.14, + "epoch": 0.30326419038430613, + "fp16_scale": 1.0, + "global_step": 943, + "grad_norm": 1.8713043915130485, + "learning_rate": 1.6335622798992838e-06, + "loss": 0.4104, + "step": 943 + }, + { + "ETA": 4.14, + "epoch": 0.30358578549606047, + "fp16_scale": 1.0, + "global_step": 944, + "grad_norm": 1.9127342115214452, + "learning_rate": 1.6327557589422035e-06, + "loss": 0.4048, + "step": 944 + }, + { + "ETA": 4.14, + "epoch": 0.30390738060781475, + "fp16_scale": 1.0, + "global_step": 945, + "grad_norm": 1.822000083393841, + "learning_rate": 1.631948550978096e-06, + "loss": 0.4179, + "step": 945 + }, + { + "ETA": 4.14, + "epoch": 0.3042289757195691, + "fp16_scale": 1.0, + "global_step": 946, + "grad_norm": 2.246307562871448, + "learning_rate": 1.6311406568833768e-06, + "loss": 0.4699, + "step": 946 + }, + { + "ETA": 4.14, + "epoch": 0.30455057083132336, + "fp16_scale": 1.0, + "global_step": 947, + "grad_norm": 1.7991526158996067, + "learning_rate": 1.6303320775352076e-06, + "loss": 0.3842, + "step": 947 + }, + { + "ETA": 4.14, + "epoch": 0.30487216594307764, + "fp16_scale": 1.0, + "global_step": 948, + "grad_norm": 2.130316177876793, + "learning_rate": 1.6295228138114943e-06, + "loss": 0.4786, + "step": 948 + }, + { + "ETA": 4.14, + "epoch": 0.305193761054832, + "fp16_scale": 1.0, + "global_step": 949, + "grad_norm": 1.936392005000023, + "learning_rate": 1.628712866590885e-06, + "loss": 0.3948, + "step": 949 + }, + { + "ETA": 4.13, + "epoch": 0.30551535616658626, + "fp16_scale": 1.0, + "global_step": 950, + "grad_norm": 1.867987765910443, + "learning_rate": 1.62790223675277e-06, + "loss": 0.4887, + "step": 950 + }, + { + "ETA": 4.13, + "epoch": 0.3058369512783406, + "fp16_scale": 1.0, + "global_step": 951, + "grad_norm": 2.093894916092837, + "learning_rate": 1.6270909251772813e-06, + "loss": 0.387, + "step": 951 + }, + { + "ETA": 4.13, + "epoch": 0.3061585463900949, + "fp16_scale": 1.0, + "global_step": 952, + "grad_norm": 1.863470738012386, + "learning_rate": 1.6262789327452903e-06, + "loss": 0.4035, + "step": 952 + }, + { + "ETA": 4.12, + "epoch": 0.30648014150184916, + "fp16_scale": 1.0, + "global_step": 953, + "grad_norm": 2.059524569754237, + "learning_rate": 1.625466260338409e-06, + "loss": 0.4444, + "step": 953 + }, + { + "ETA": 4.12, + "epoch": 0.3068017366136035, + "fp16_scale": 1.0, + "global_step": 954, + "grad_norm": 2.0355267166690236, + "learning_rate": 1.6246529088389865e-06, + "loss": 0.4091, + "step": 954 + }, + { + "ETA": 4.12, + "epoch": 0.3071233317253578, + "fp16_scale": 1.0, + "global_step": 955, + "grad_norm": 2.108593636620605, + "learning_rate": 1.6238388791301088e-06, + "loss": 0.4331, + "step": 955 + }, + { + "ETA": 4.12, + "epoch": 0.30744492683711205, + "fp16_scale": 1.0, + "global_step": 956, + "grad_norm": 1.9504079277887791, + "learning_rate": 1.6230241720955995e-06, + "loss": 0.463, + "step": 956 + }, + { + "ETA": 4.12, + "epoch": 0.3077665219488664, + "fp16_scale": 1.0, + "global_step": 957, + "grad_norm": 1.9096490823504333, + "learning_rate": 1.6222087886200171e-06, + "loss": 0.4059, + "step": 957 + }, + { + "ETA": 4.12, + "epoch": 0.30808811706062067, + "fp16_scale": 1.0, + "global_step": 958, + "grad_norm": 1.8211137389504628, + "learning_rate": 1.6213927295886545e-06, + "loss": 0.3819, + "step": 958 + }, + { + "ETA": 4.12, + "epoch": 0.308409712172375, + "fp16_scale": 1.0, + "global_step": 959, + "grad_norm": 1.792072571331639, + "learning_rate": 1.6205759958875377e-06, + "loss": 0.4385, + "step": 959 + }, + { + "ETA": 4.11, + "epoch": 0.3087313072841293, + "fp16_scale": 1.0, + "global_step": 960, + "grad_norm": 1.7783900392994196, + "learning_rate": 1.6197585884034264e-06, + "loss": 0.3853, + "step": 960 + }, + { + "ETA": 4.11, + "epoch": 0.30905290239588357, + "fp16_scale": 1.0, + "global_step": 961, + "grad_norm": 2.1772217828206615, + "learning_rate": 1.61894050802381e-06, + "loss": 0.4426, + "step": 961 + }, + { + "ETA": 4.11, + "epoch": 0.3093744975076379, + "fp16_scale": 1.0, + "global_step": 962, + "grad_norm": 2.29495700071546, + "learning_rate": 1.6181217556369102e-06, + "loss": 0.4184, + "step": 962 + }, + { + "ETA": 4.11, + "epoch": 0.3096960926193922, + "fp16_scale": 1.0, + "global_step": 963, + "grad_norm": 2.200297165457558, + "learning_rate": 1.6173023321316774e-06, + "loss": 0.393, + "step": 963 + }, + { + "ETA": 4.1, + "epoch": 0.31001768773114646, + "fp16_scale": 1.0, + "global_step": 964, + "grad_norm": 2.3014226603894556, + "learning_rate": 1.6164822383977913e-06, + "loss": 0.4762, + "step": 964 + }, + { + "ETA": 4.1, + "epoch": 0.3103392828429008, + "fp16_scale": 1.0, + "global_step": 965, + "grad_norm": 2.043302691249108, + "learning_rate": 1.615661475325658e-06, + "loss": 0.499, + "step": 965 + }, + { + "ETA": 4.1, + "epoch": 0.3106608779546551, + "fp16_scale": 1.0, + "global_step": 966, + "grad_norm": 2.0101875742210735, + "learning_rate": 1.6148400438064125e-06, + "loss": 0.439, + "step": 966 + }, + { + "ETA": 4.1, + "epoch": 0.3109824730664094, + "fp16_scale": 1.0, + "global_step": 967, + "grad_norm": 2.106040675309942, + "learning_rate": 1.6140179447319131e-06, + "loss": 0.4656, + "step": 967 + }, + { + "ETA": 4.09, + "epoch": 0.3113040681781637, + "fp16_scale": 1.0, + "global_step": 968, + "grad_norm": 2.4306980079928797, + "learning_rate": 1.6131951789947449e-06, + "loss": 0.4361, + "step": 968 + }, + { + "ETA": 4.09, + "epoch": 0.311625663289918, + "fp16_scale": 1.0, + "global_step": 969, + "grad_norm": 1.9857925796900955, + "learning_rate": 1.6123717474882155e-06, + "loss": 0.4077, + "step": 969 + }, + { + "ETA": 4.09, + "epoch": 0.3119472584016723, + "fp16_scale": 1.0, + "global_step": 970, + "grad_norm": 1.9924019384133027, + "learning_rate": 1.6115476511063561e-06, + "loss": 0.4858, + "step": 970 + }, + { + "ETA": 4.09, + "epoch": 0.3122688535134266, + "fp16_scale": 1.0, + "global_step": 971, + "grad_norm": 1.764631728406469, + "learning_rate": 1.6107228907439195e-06, + "loss": 0.4526, + "step": 971 + }, + { + "ETA": 4.09, + "epoch": 0.3125904486251809, + "fp16_scale": 1.0, + "global_step": 972, + "grad_norm": 1.9406733716313436, + "learning_rate": 1.6098974672963794e-06, + "loss": 0.4555, + "step": 972 + }, + { + "ETA": 4.08, + "epoch": 0.3129120437369352, + "fp16_scale": 1.0, + "global_step": 973, + "grad_norm": 1.8258599665084803, + "learning_rate": 1.6090713816599293e-06, + "loss": 0.4453, + "step": 973 + }, + { + "ETA": 4.08, + "epoch": 0.3132336388486895, + "fp16_scale": 1.0, + "global_step": 974, + "grad_norm": 2.208412865540917, + "learning_rate": 1.608244634731482e-06, + "loss": 0.4072, + "step": 974 + }, + { + "ETA": 4.08, + "epoch": 0.3135552339604438, + "fp16_scale": 1.0, + "global_step": 975, + "grad_norm": 1.9984530555239954, + "learning_rate": 1.6074172274086684e-06, + "loss": 0.4352, + "step": 975 + }, + { + "ETA": 4.08, + "epoch": 0.3138768290721981, + "fp16_scale": 1.0, + "global_step": 976, + "grad_norm": 2.258343156213296, + "learning_rate": 1.6065891605898357e-06, + "loss": 0.4681, + "step": 976 + }, + { + "ETA": 4.08, + "epoch": 0.3141984241839524, + "fp16_scale": 1.0, + "global_step": 977, + "grad_norm": 1.7754507487479707, + "learning_rate": 1.605760435174048e-06, + "loss": 0.446, + "step": 977 + }, + { + "ETA": 4.07, + "epoch": 0.3145200192957067, + "fp16_scale": 1.0, + "global_step": 978, + "grad_norm": 1.942260767564616, + "learning_rate": 1.6049310520610834e-06, + "loss": 0.4362, + "step": 978 + }, + { + "ETA": 4.07, + "epoch": 0.314841614407461, + "fp16_scale": 1.0, + "global_step": 979, + "grad_norm": 1.9848142352329132, + "learning_rate": 1.6041010121514356e-06, + "loss": 0.4035, + "step": 979 + }, + { + "ETA": 4.07, + "epoch": 0.3151632095192153, + "fp16_scale": 1.0, + "global_step": 980, + "grad_norm": 1.8758060404416177, + "learning_rate": 1.6032703163463097e-06, + "loss": 0.4265, + "step": 980 + }, + { + "ETA": 4.07, + "epoch": 0.3154848046309696, + "fp16_scale": 1.0, + "global_step": 981, + "grad_norm": 2.0904424084804605, + "learning_rate": 1.6024389655476245e-06, + "loss": 0.3721, + "step": 981 + }, + { + "ETA": 4.07, + "epoch": 0.3158063997427239, + "fp16_scale": 1.0, + "global_step": 982, + "grad_norm": 2.0345875745531536, + "learning_rate": 1.6016069606580087e-06, + "loss": 0.4663, + "step": 982 + }, + { + "ETA": 4.07, + "epoch": 0.31612799485447823, + "fp16_scale": 1.0, + "global_step": 983, + "grad_norm": 1.9393973294357287, + "learning_rate": 1.600774302580802e-06, + "loss": 0.4289, + "step": 983 + }, + { + "ETA": 4.06, + "epoch": 0.3164495899662325, + "fp16_scale": 1.0, + "global_step": 984, + "grad_norm": 2.183008095835992, + "learning_rate": 1.599940992220053e-06, + "loss": 0.4253, + "step": 984 + }, + { + "ETA": 4.06, + "epoch": 0.3167711850779868, + "fp16_scale": 1.0, + "global_step": 985, + "grad_norm": 1.8314046538913937, + "learning_rate": 1.5991070304805181e-06, + "loss": 0.5015, + "step": 985 + }, + { + "ETA": 4.06, + "epoch": 0.31709278018974113, + "fp16_scale": 1.0, + "global_step": 986, + "grad_norm": 1.7814409496791646, + "learning_rate": 1.598272418267662e-06, + "loss": 0.4227, + "step": 986 + }, + { + "ETA": 4.06, + "epoch": 0.3174143753014954, + "fp16_scale": 1.0, + "global_step": 987, + "grad_norm": 1.98942113818272, + "learning_rate": 1.597437156487654e-06, + "loss": 0.4697, + "step": 987 + }, + { + "ETA": 4.06, + "epoch": 0.3177359704132497, + "fp16_scale": 1.0, + "global_step": 988, + "grad_norm": 2.2036326117118463, + "learning_rate": 1.5966012460473705e-06, + "loss": 0.4297, + "step": 988 + }, + { + "ETA": 4.05, + "epoch": 0.318057565525004, + "fp16_scale": 1.0, + "global_step": 989, + "grad_norm": 2.0946812395760563, + "learning_rate": 1.5957646878543913e-06, + "loss": 0.4558, + "step": 989 + }, + { + "ETA": 4.05, + "epoch": 0.3183791606367583, + "fp16_scale": 1.0, + "global_step": 990, + "grad_norm": 1.9755285676762397, + "learning_rate": 1.594927482816999e-06, + "loss": 0.4587, + "step": 990 + }, + { + "ETA": 4.05, + "epoch": 0.31870075574851264, + "fp16_scale": 1.0, + "global_step": 991, + "grad_norm": 1.7781947949105377, + "learning_rate": 1.594089631844179e-06, + "loss": 0.4095, + "step": 991 + }, + { + "ETA": 4.05, + "epoch": 0.3190223508602669, + "fp16_scale": 1.0, + "global_step": 992, + "grad_norm": 1.8978851692947698, + "learning_rate": 1.5932511358456183e-06, + "loss": 0.3714, + "step": 992 + }, + { + "ETA": 4.04, + "epoch": 0.3193439459720212, + "fp16_scale": 1.0, + "global_step": 993, + "grad_norm": 2.015619886492585, + "learning_rate": 1.5924119957317039e-06, + "loss": 0.4191, + "step": 993 + }, + { + "ETA": 4.04, + "epoch": 0.31966554108377554, + "fp16_scale": 1.0, + "global_step": 994, + "grad_norm": 1.8226320334133386, + "learning_rate": 1.5915722124135225e-06, + "loss": 0.4725, + "step": 994 + }, + { + "ETA": 4.04, + "epoch": 0.3199871361955298, + "fp16_scale": 1.0, + "global_step": 995, + "grad_norm": 2.1068079375079214, + "learning_rate": 1.590731786802858e-06, + "loss": 0.399, + "step": 995 + }, + { + "ETA": 4.04, + "epoch": 0.32030873130728416, + "fp16_scale": 1.0, + "global_step": 996, + "grad_norm": 2.105917564042965, + "learning_rate": 1.589890719812193e-06, + "loss": 0.3695, + "step": 996 + }, + { + "ETA": 4.04, + "epoch": 0.32063032641903844, + "fp16_scale": 1.0, + "global_step": 997, + "grad_norm": 1.9618509350466506, + "learning_rate": 1.589049012354706e-06, + "loss": 0.4866, + "step": 997 + }, + { + "ETA": 4.03, + "epoch": 0.3209519215307927, + "fp16_scale": 1.0, + "global_step": 998, + "grad_norm": 1.9767032515310012, + "learning_rate": 1.5882066653442706e-06, + "loss": 0.4888, + "step": 998 + }, + { + "ETA": 4.03, + "epoch": 0.32127351664254705, + "fp16_scale": 1.0, + "global_step": 999, + "grad_norm": 1.9047943312129536, + "learning_rate": 1.5873636796954554e-06, + "loss": 0.4402, + "step": 999 + }, + { + "ETA": 4.03, + "epoch": 0.32159511175430133, + "fp16_scale": 1.0, + "global_step": 1000, + "grad_norm": 1.8852021138237236, + "learning_rate": 1.586520056323522e-06, + "loss": 0.4348, + "step": 1000 + }, + { + "ETA": 4.05, + "epoch": 0.3219167068660556, + "fp16_scale": 1.0, + "global_step": 1001, + "grad_norm": 1.9352276953564838, + "learning_rate": 1.585675796144424e-06, + "loss": 0.4532, + "step": 1001 + }, + { + "ETA": 4.04, + "epoch": 0.32223830197780995, + "fp16_scale": 1.0, + "global_step": 1002, + "grad_norm": 1.8620206832959252, + "learning_rate": 1.5848309000748073e-06, + "loss": 0.4039, + "step": 1002 + }, + { + "ETA": 4.04, + "epoch": 0.32255989708956423, + "fp16_scale": 1.0, + "global_step": 1003, + "grad_norm": 1.8484776743122808, + "learning_rate": 1.5839853690320072e-06, + "loss": 0.4167, + "step": 1003 + }, + { + "ETA": 4.04, + "epoch": 0.32288149220131857, + "fp16_scale": 1.0, + "global_step": 1004, + "grad_norm": 1.8194759672775325, + "learning_rate": 1.5831392039340496e-06, + "loss": 0.4378, + "step": 1004 + }, + { + "ETA": 4.04, + "epoch": 0.32320308731307285, + "fp16_scale": 1.0, + "global_step": 1005, + "grad_norm": 2.193159959257116, + "learning_rate": 1.582292405699648e-06, + "loss": 0.4444, + "step": 1005 + }, + { + "ETA": 4.04, + "epoch": 0.3235246824248271, + "fp16_scale": 1.0, + "global_step": 1006, + "grad_norm": 2.0237663042766583, + "learning_rate": 1.5814449752482029e-06, + "loss": 0.4066, + "step": 1006 + }, + { + "ETA": 4.03, + "epoch": 0.32384627753658146, + "fp16_scale": 1.0, + "global_step": 1007, + "grad_norm": 1.9393019775060167, + "learning_rate": 1.5805969134998027e-06, + "loss": 0.5277, + "step": 1007 + }, + { + "ETA": 4.03, + "epoch": 0.32416787264833574, + "fp16_scale": 1.0, + "global_step": 1008, + "grad_norm": 1.909534410238616, + "learning_rate": 1.5797482213752197e-06, + "loss": 0.3949, + "step": 1008 + }, + { + "ETA": 4.03, + "epoch": 0.32448946776009, + "fp16_scale": 1.0, + "global_step": 1009, + "grad_norm": 1.8643349654376014, + "learning_rate": 1.5788988997959115e-06, + "loss": 0.3841, + "step": 1009 + }, + { + "ETA": 4.02, + "epoch": 0.32481106287184436, + "fp16_scale": 1.0, + "global_step": 1010, + "grad_norm": 2.1193315527683856, + "learning_rate": 1.5780489496840189e-06, + "loss": 0.3753, + "step": 1010 + }, + { + "ETA": 4.02, + "epoch": 0.32513265798359864, + "fp16_scale": 1.0, + "global_step": 1011, + "grad_norm": 2.0664476418474615, + "learning_rate": 1.577198371962365e-06, + "loss": 0.4472, + "step": 1011 + }, + { + "ETA": 4.02, + "epoch": 0.325454253095353, + "fp16_scale": 1.0, + "global_step": 1012, + "grad_norm": 2.0255892118055914, + "learning_rate": 1.5763471675544546e-06, + "loss": 0.393, + "step": 1012 + }, + { + "ETA": 4.01, + "epoch": 0.32577584820710725, + "fp16_scale": 1.0, + "global_step": 1013, + "grad_norm": 2.3105379871365797, + "learning_rate": 1.5754953373844728e-06, + "loss": 0.3603, + "step": 1013 + }, + { + "ETA": 4.01, + "epoch": 0.32609744331886154, + "fp16_scale": 1.0, + "global_step": 1014, + "grad_norm": 2.155938610942582, + "learning_rate": 1.5746428823772836e-06, + "loss": 0.4213, + "step": 1014 + }, + { + "ETA": 4.01, + "epoch": 0.32641903843061587, + "fp16_scale": 1.0, + "global_step": 1015, + "grad_norm": 2.244868296413816, + "learning_rate": 1.5737898034584306e-06, + "loss": 0.4233, + "step": 1015 + }, + { + "ETA": 4.01, + "epoch": 0.32674063354237015, + "fp16_scale": 1.0, + "global_step": 1016, + "grad_norm": 2.238154376167949, + "learning_rate": 1.5729361015541332e-06, + "loss": 0.4664, + "step": 1016 + }, + { + "ETA": 4.01, + "epoch": 0.32706222865412443, + "fp16_scale": 1.0, + "global_step": 1017, + "grad_norm": 2.4905645200147974, + "learning_rate": 1.5720817775912885e-06, + "loss": 0.4933, + "step": 1017 + }, + { + "ETA": 4.01, + "epoch": 0.32738382376587877, + "fp16_scale": 1.0, + "global_step": 1018, + "grad_norm": 2.2008658266116137, + "learning_rate": 1.5712268324974688e-06, + "loss": 0.4987, + "step": 1018 + }, + { + "ETA": 4.0, + "epoch": 0.32770541887763305, + "fp16_scale": 1.0, + "global_step": 1019, + "grad_norm": 2.064184305504791, + "learning_rate": 1.5703712672009205e-06, + "loss": 0.4306, + "step": 1019 + }, + { + "ETA": 4.0, + "epoch": 0.3280270139893874, + "fp16_scale": 1.0, + "global_step": 1020, + "grad_norm": 1.8987392916269317, + "learning_rate": 1.5695150826305631e-06, + "loss": 0.4248, + "step": 1020 + }, + { + "ETA": 4.0, + "epoch": 0.32834860910114166, + "fp16_scale": 1.0, + "global_step": 1021, + "grad_norm": 1.9675675056417445, + "learning_rate": 1.568658279715989e-06, + "loss": 0.5002, + "step": 1021 + }, + { + "ETA": 4.0, + "epoch": 0.32867020421289594, + "fp16_scale": 1.0, + "global_step": 1022, + "grad_norm": 2.0582941033337847, + "learning_rate": 1.5678008593874622e-06, + "loss": 0.4571, + "step": 1022 + }, + { + "ETA": 4.0, + "epoch": 0.3289917993246503, + "fp16_scale": 1.0, + "global_step": 1023, + "grad_norm": 2.0164501840616236, + "learning_rate": 1.5669428225759158e-06, + "loss": 0.5116, + "step": 1023 + }, + { + "ETA": 4.0, + "epoch": 0.32931339443640456, + "fp16_scale": 1.0, + "global_step": 1024, + "grad_norm": 2.1105696147832704, + "learning_rate": 1.566084170212953e-06, + "loss": 0.497, + "step": 1024 + }, + { + "ETA": 3.99, + "epoch": 0.32963498954815884, + "fp16_scale": 1.0, + "global_step": 1025, + "grad_norm": 1.829267150650572, + "learning_rate": 1.5652249032308462e-06, + "loss": 0.4634, + "step": 1025 + }, + { + "ETA": 3.99, + "epoch": 0.3299565846599132, + "fp16_scale": 1.0, + "global_step": 1026, + "grad_norm": 2.3222584826810238, + "learning_rate": 1.5643650225625338e-06, + "loss": 0.4429, + "step": 1026 + }, + { + "ETA": 3.99, + "epoch": 0.33027817977166746, + "fp16_scale": 1.0, + "global_step": 1027, + "grad_norm": 2.0208817981696914, + "learning_rate": 1.563504529141621e-06, + "loss": 0.4529, + "step": 1027 + }, + { + "ETA": 3.99, + "epoch": 0.3305997748834218, + "fp16_scale": 1.0, + "global_step": 1028, + "grad_norm": 2.224391812227433, + "learning_rate": 1.5626434239023782e-06, + "loss": 0.4128, + "step": 1028 + }, + { + "ETA": 3.98, + "epoch": 0.3309213699951761, + "fp16_scale": 1.0, + "global_step": 1029, + "grad_norm": 1.8325687051324988, + "learning_rate": 1.5617817077797405e-06, + "loss": 0.4281, + "step": 1029 + }, + { + "ETA": 3.98, + "epoch": 0.33124296510693035, + "fp16_scale": 1.0, + "global_step": 1030, + "grad_norm": 1.9386730764049669, + "learning_rate": 1.5609193817093057e-06, + "loss": 0.4135, + "step": 1030 + }, + { + "ETA": 3.98, + "epoch": 0.3315645602186847, + "fp16_scale": 1.0, + "global_step": 1031, + "grad_norm": 2.1193501727587174, + "learning_rate": 1.560056446627334e-06, + "loss": 0.3975, + "step": 1031 + }, + { + "ETA": 3.98, + "epoch": 0.33188615533043897, + "fp16_scale": 1.0, + "global_step": 1032, + "grad_norm": 2.0763726603941297, + "learning_rate": 1.5591929034707466e-06, + "loss": 0.4187, + "step": 1032 + }, + { + "ETA": 3.97, + "epoch": 0.3322077504421933, + "fp16_scale": 1.0, + "global_step": 1033, + "grad_norm": 2.0414959466932885, + "learning_rate": 1.558328753177126e-06, + "loss": 0.5019, + "step": 1033 + }, + { + "ETA": 3.97, + "epoch": 0.3325293455539476, + "fp16_scale": 1.0, + "global_step": 1034, + "grad_norm": 1.8954899661587834, + "learning_rate": 1.5574639966847126e-06, + "loss": 0.473, + "step": 1034 + }, + { + "ETA": 3.97, + "epoch": 0.33285094066570187, + "fp16_scale": 1.0, + "global_step": 1035, + "grad_norm": 1.9437209965979045, + "learning_rate": 1.5565986349324054e-06, + "loss": 0.4092, + "step": 1035 + }, + { + "ETA": 3.97, + "epoch": 0.3331725357774562, + "fp16_scale": 1.0, + "global_step": 1036, + "grad_norm": 1.94681689816743, + "learning_rate": 1.5557326688597608e-06, + "loss": 0.3887, + "step": 1036 + }, + { + "ETA": 3.97, + "epoch": 0.3334941308892105, + "fp16_scale": 1.0, + "global_step": 1037, + "grad_norm": 1.9569567697320496, + "learning_rate": 1.5548660994069907e-06, + "loss": 0.401, + "step": 1037 + }, + { + "ETA": 3.96, + "epoch": 0.33381572600096476, + "fp16_scale": 1.0, + "global_step": 1038, + "grad_norm": 2.205854240770515, + "learning_rate": 1.5539989275149629e-06, + "loss": 0.4019, + "step": 1038 + }, + { + "ETA": 3.96, + "epoch": 0.3341373211127191, + "fp16_scale": 1.0, + "global_step": 1039, + "grad_norm": 2.3043794541422664, + "learning_rate": 1.5531311541251992e-06, + "loss": 0.3857, + "step": 1039 + }, + { + "ETA": 3.96, + "epoch": 0.3344589162244734, + "fp16_scale": 1.0, + "global_step": 1040, + "grad_norm": 1.7645749873807048, + "learning_rate": 1.5522627801798743e-06, + "loss": 0.4517, + "step": 1040 + }, + { + "ETA": 3.96, + "epoch": 0.3347805113362277, + "fp16_scale": 1.0, + "global_step": 1041, + "grad_norm": 2.0863750253291182, + "learning_rate": 1.5513938066218142e-06, + "loss": 0.487, + "step": 1041 + }, + { + "ETA": 3.95, + "epoch": 0.335102106447982, + "fp16_scale": 1.0, + "global_step": 1042, + "grad_norm": 1.8860674534398028, + "learning_rate": 1.550524234394497e-06, + "loss": 0.4165, + "step": 1042 + }, + { + "ETA": 3.95, + "epoch": 0.3354237015597363, + "fp16_scale": 1.0, + "global_step": 1043, + "grad_norm": 2.0024574398746555, + "learning_rate": 1.5496540644420502e-06, + "loss": 0.4953, + "step": 1043 + }, + { + "ETA": 3.95, + "epoch": 0.3357452966714906, + "fp16_scale": 1.0, + "global_step": 1044, + "grad_norm": 1.803225260799288, + "learning_rate": 1.5487832977092507e-06, + "loss": 0.4748, + "step": 1044 + }, + { + "ETA": 3.95, + "epoch": 0.3360668917832449, + "fp16_scale": 1.0, + "global_step": 1045, + "grad_norm": 2.0544413463818794, + "learning_rate": 1.547911935141523e-06, + "loss": 0.3531, + "step": 1045 + }, + { + "ETA": 3.95, + "epoch": 0.3363884868949992, + "fp16_scale": 1.0, + "global_step": 1046, + "grad_norm": 1.880475734878519, + "learning_rate": 1.5470399776849386e-06, + "loss": 0.4196, + "step": 1046 + }, + { + "ETA": 3.94, + "epoch": 0.3367100820067535, + "fp16_scale": 1.0, + "global_step": 1047, + "grad_norm": 1.8844368312463697, + "learning_rate": 1.5461674262862147e-06, + "loss": 0.4545, + "step": 1047 + }, + { + "ETA": 3.94, + "epoch": 0.3370316771185078, + "fp16_scale": 1.0, + "global_step": 1048, + "grad_norm": 1.950863553954552, + "learning_rate": 1.5452942818927142e-06, + "loss": 0.4526, + "step": 1048 + }, + { + "ETA": 3.94, + "epoch": 0.3373532722302621, + "fp16_scale": 1.0, + "global_step": 1049, + "grad_norm": 2.0667239701990514, + "learning_rate": 1.5444205454524427e-06, + "loss": 0.3974, + "step": 1049 + }, + { + "ETA": 3.94, + "epoch": 0.3376748673420164, + "fp16_scale": 1.0, + "global_step": 1050, + "grad_norm": 1.7967936383477858, + "learning_rate": 1.543546217914049e-06, + "loss": 0.4819, + "step": 1050 + }, + { + "ETA": 3.94, + "epoch": 0.3379964624537707, + "fp16_scale": 1.0, + "global_step": 1051, + "grad_norm": 1.8505390025145074, + "learning_rate": 1.5426713002268246e-06, + "loss": 0.4628, + "step": 1051 + }, + { + "ETA": 3.93, + "epoch": 0.338318057565525, + "fp16_scale": 1.0, + "global_step": 1052, + "grad_norm": 2.6895805870985363, + "learning_rate": 1.5417957933407005e-06, + "loss": 0.3935, + "step": 1052 + }, + { + "ETA": 3.93, + "epoch": 0.3386396526772793, + "fp16_scale": 1.0, + "global_step": 1053, + "grad_norm": 2.0481066115640547, + "learning_rate": 1.5409196982062475e-06, + "loss": 0.3689, + "step": 1053 + }, + { + "ETA": 3.93, + "epoch": 0.3389612477890336, + "fp16_scale": 1.0, + "global_step": 1054, + "grad_norm": 2.0247938745469463, + "learning_rate": 1.5400430157746755e-06, + "loss": 0.3837, + "step": 1054 + }, + { + "ETA": 3.92, + "epoch": 0.3392828429007879, + "fp16_scale": 1.0, + "global_step": 1055, + "grad_norm": 2.1157710086241814, + "learning_rate": 1.539165746997833e-06, + "loss": 0.4054, + "step": 1055 + }, + { + "ETA": 3.92, + "epoch": 0.3396044380125422, + "fp16_scale": 1.0, + "global_step": 1056, + "grad_norm": 2.040310105981162, + "learning_rate": 1.5382878928282028e-06, + "loss": 0.4058, + "step": 1056 + }, + { + "ETA": 3.92, + "epoch": 0.33992603312429653, + "fp16_scale": 1.0, + "global_step": 1057, + "grad_norm": 1.9898876908794625, + "learning_rate": 1.5374094542189054e-06, + "loss": 0.4029, + "step": 1057 + }, + { + "ETA": 3.92, + "epoch": 0.3402476282360508, + "fp16_scale": 1.0, + "global_step": 1058, + "grad_norm": 1.864298916622204, + "learning_rate": 1.536530432123695e-06, + "loss": 0.491, + "step": 1058 + }, + { + "ETA": 3.92, + "epoch": 0.3405692233478051, + "fp16_scale": 1.0, + "global_step": 1059, + "grad_norm": 2.0137095046053894, + "learning_rate": 1.5356508274969593e-06, + "loss": 0.4754, + "step": 1059 + }, + { + "ETA": 3.91, + "epoch": 0.34089081845955943, + "fp16_scale": 1.0, + "global_step": 1060, + "grad_norm": 1.9153282169539023, + "learning_rate": 1.5347706412937184e-06, + "loss": 0.4654, + "step": 1060 + }, + { + "ETA": 3.91, + "epoch": 0.3412124135713137, + "fp16_scale": 1.0, + "global_step": 1061, + "grad_norm": 1.7275404045564595, + "learning_rate": 1.533889874469624e-06, + "loss": 0.3665, + "step": 1061 + }, + { + "ETA": 3.91, + "epoch": 0.341534008683068, + "fp16_scale": 1.0, + "global_step": 1062, + "grad_norm": 2.1859526858656197, + "learning_rate": 1.533008527980958e-06, + "loss": 0.4385, + "step": 1062 + }, + { + "ETA": 3.91, + "epoch": 0.3418556037948223, + "fp16_scale": 1.0, + "global_step": 1063, + "grad_norm": 2.1233723143512244, + "learning_rate": 1.5321266027846327e-06, + "loss": 0.4719, + "step": 1063 + }, + { + "ETA": 3.91, + "epoch": 0.3421771989065766, + "fp16_scale": 1.0, + "global_step": 1064, + "grad_norm": 2.0800026111171586, + "learning_rate": 1.531244099838187e-06, + "loss": 0.4873, + "step": 1064 + }, + { + "ETA": 3.91, + "epoch": 0.34249879401833094, + "fp16_scale": 1.0, + "global_step": 1065, + "grad_norm": 2.168644537014544, + "learning_rate": 1.5303610200997882e-06, + "loss": 0.472, + "step": 1065 + }, + { + "ETA": 3.9, + "epoch": 0.3428203891300852, + "fp16_scale": 1.0, + "global_step": 1066, + "grad_norm": 1.9707768684020286, + "learning_rate": 1.5294773645282296e-06, + "loss": 0.4336, + "step": 1066 + }, + { + "ETA": 3.9, + "epoch": 0.3431419842418395, + "fp16_scale": 1.0, + "global_step": 1067, + "grad_norm": 1.897045422023728, + "learning_rate": 1.52859313408293e-06, + "loss": 0.4241, + "step": 1067 + }, + { + "ETA": 3.9, + "epoch": 0.34346357935359384, + "fp16_scale": 1.0, + "global_step": 1068, + "grad_norm": 2.0755770205202766, + "learning_rate": 1.5277083297239318e-06, + "loss": 0.3643, + "step": 1068 + }, + { + "ETA": 3.9, + "epoch": 0.3437851744653481, + "fp16_scale": 1.0, + "global_step": 1069, + "grad_norm": 2.116766495277426, + "learning_rate": 1.5268229524119004e-06, + "loss": 0.3479, + "step": 1069 + }, + { + "ETA": 3.89, + "epoch": 0.3441067695771024, + "fp16_scale": 1.0, + "global_step": 1070, + "grad_norm": 2.0118423629926996, + "learning_rate": 1.5259370031081248e-06, + "loss": 0.3794, + "step": 1070 + }, + { + "ETA": 3.89, + "epoch": 0.34442836468885674, + "fp16_scale": 1.0, + "global_step": 1071, + "grad_norm": 2.300985748426234, + "learning_rate": 1.5250504827745127e-06, + "loss": 0.4406, + "step": 1071 + }, + { + "ETA": 3.89, + "epoch": 0.344749959800611, + "fp16_scale": 1.0, + "global_step": 1072, + "grad_norm": 2.0931326714632, + "learning_rate": 1.5241633923735938e-06, + "loss": 0.4, + "step": 1072 + }, + { + "ETA": 3.88, + "epoch": 0.34507155491236535, + "fp16_scale": 1.0, + "global_step": 1073, + "grad_norm": 1.8854353625333264, + "learning_rate": 1.5232757328685151e-06, + "loss": 0.3722, + "step": 1073 + }, + { + "ETA": 3.88, + "epoch": 0.34539315002411963, + "fp16_scale": 1.0, + "global_step": 1074, + "grad_norm": 1.8720316814062665, + "learning_rate": 1.5223875052230436e-06, + "loss": 0.4518, + "step": 1074 + }, + { + "ETA": 3.88, + "epoch": 0.3457147451358739, + "fp16_scale": 1.0, + "global_step": 1075, + "grad_norm": 2.2590433808420953, + "learning_rate": 1.521498710401561e-06, + "loss": 0.4969, + "step": 1075 + }, + { + "ETA": 3.88, + "epoch": 0.34603634024762825, + "fp16_scale": 1.0, + "global_step": 1076, + "grad_norm": 1.9492131114052138, + "learning_rate": 1.5206093493690652e-06, + "loss": 0.4088, + "step": 1076 + }, + { + "ETA": 3.88, + "epoch": 0.34635793535938253, + "fp16_scale": 1.0, + "global_step": 1077, + "grad_norm": 1.7446844440137887, + "learning_rate": 1.5197194230911705e-06, + "loss": 0.4018, + "step": 1077 + }, + { + "ETA": 3.88, + "epoch": 0.34667953047113687, + "fp16_scale": 1.0, + "global_step": 1078, + "grad_norm": 2.1056058348615556, + "learning_rate": 1.5188289325341033e-06, + "loss": 0.4783, + "step": 1078 + }, + { + "ETA": 3.87, + "epoch": 0.34700112558289115, + "fp16_scale": 1.0, + "global_step": 1079, + "grad_norm": 2.154899487781416, + "learning_rate": 1.5179378786647026e-06, + "loss": 0.4482, + "step": 1079 + }, + { + "ETA": 3.87, + "epoch": 0.3473227206946454, + "fp16_scale": 1.0, + "global_step": 1080, + "grad_norm": 1.8829126410069628, + "learning_rate": 1.5170462624504203e-06, + "loss": 0.4027, + "step": 1080 + }, + { + "ETA": 3.87, + "epoch": 0.34764431580639976, + "fp16_scale": 1.0, + "global_step": 1081, + "grad_norm": 1.9589407377100947, + "learning_rate": 1.516154084859318e-06, + "loss": 0.4548, + "step": 1081 + }, + { + "ETA": 3.87, + "epoch": 0.34796591091815404, + "fp16_scale": 1.0, + "global_step": 1082, + "grad_norm": 2.2242892428624144, + "learning_rate": 1.5152613468600661e-06, + "loss": 0.4382, + "step": 1082 + }, + { + "ETA": 3.87, + "epoch": 0.3482875060299083, + "fp16_scale": 1.0, + "global_step": 1083, + "grad_norm": 1.8970223057811344, + "learning_rate": 1.5143680494219453e-06, + "loss": 0.422, + "step": 1083 + }, + { + "ETA": 3.87, + "epoch": 0.34860910114166266, + "fp16_scale": 1.0, + "global_step": 1084, + "grad_norm": 1.7959123082952548, + "learning_rate": 1.5134741935148417e-06, + "loss": 0.447, + "step": 1084 + }, + { + "ETA": 3.86, + "epoch": 0.34893069625341694, + "fp16_scale": 1.0, + "global_step": 1085, + "grad_norm": 2.1068985939099196, + "learning_rate": 1.5125797801092497e-06, + "loss": 0.4406, + "step": 1085 + }, + { + "ETA": 3.86, + "epoch": 0.3492522913651713, + "fp16_scale": 1.0, + "global_step": 1086, + "grad_norm": 2.0786396617899117, + "learning_rate": 1.5116848101762672e-06, + "loss": 0.4329, + "step": 1086 + }, + { + "ETA": 3.86, + "epoch": 0.34957388647692555, + "fp16_scale": 1.0, + "global_step": 1087, + "grad_norm": 1.908634889904505, + "learning_rate": 1.5107892846875973e-06, + "loss": 0.4254, + "step": 1087 + }, + { + "ETA": 3.86, + "epoch": 0.34989548158867984, + "fp16_scale": 1.0, + "global_step": 1088, + "grad_norm": 2.223620445451567, + "learning_rate": 1.5098932046155463e-06, + "loss": 0.4824, + "step": 1088 + }, + { + "ETA": 3.86, + "epoch": 0.35021707670043417, + "fp16_scale": 1.0, + "global_step": 1089, + "grad_norm": 2.061418506233444, + "learning_rate": 1.5089965709330226e-06, + "loss": 0.4609, + "step": 1089 + }, + { + "ETA": 3.85, + "epoch": 0.35053867181218845, + "fp16_scale": 1.0, + "global_step": 1090, + "grad_norm": 2.0055627953071156, + "learning_rate": 1.5080993846135349e-06, + "loss": 0.4284, + "step": 1090 + }, + { + "ETA": 3.85, + "epoch": 0.35086026692394273, + "fp16_scale": 1.0, + "global_step": 1091, + "grad_norm": 1.8718837955396956, + "learning_rate": 1.5072016466311933e-06, + "loss": 0.4143, + "step": 1091 + }, + { + "ETA": 3.85, + "epoch": 0.35118186203569707, + "fp16_scale": 1.0, + "global_step": 1092, + "grad_norm": 2.0265850308325635, + "learning_rate": 1.506303357960706e-06, + "loss": 0.4691, + "step": 1092 + }, + { + "ETA": 3.85, + "epoch": 0.35150345714745135, + "fp16_scale": 1.0, + "global_step": 1093, + "grad_norm": 1.797624163727888, + "learning_rate": 1.5054045195773787e-06, + "loss": 0.3936, + "step": 1093 + }, + { + "ETA": 3.85, + "epoch": 0.3518250522592057, + "fp16_scale": 1.0, + "global_step": 1094, + "grad_norm": 1.9244718608465112, + "learning_rate": 1.504505132457115e-06, + "loss": 0.44, + "step": 1094 + }, + { + "ETA": 3.85, + "epoch": 0.35214664737095996, + "fp16_scale": 1.0, + "global_step": 1095, + "grad_norm": 1.8827263655898996, + "learning_rate": 1.5036051975764133e-06, + "loss": 0.497, + "step": 1095 + }, + { + "ETA": 3.84, + "epoch": 0.35246824248271424, + "fp16_scale": 1.0, + "global_step": 1096, + "grad_norm": 2.06998781133535, + "learning_rate": 1.5027047159123676e-06, + "loss": 0.4381, + "step": 1096 + }, + { + "ETA": 3.84, + "epoch": 0.3527898375944686, + "fp16_scale": 1.0, + "global_step": 1097, + "grad_norm": 1.9529943154306728, + "learning_rate": 1.5018036884426651e-06, + "loss": 0.4654, + "step": 1097 + }, + { + "ETA": 3.84, + "epoch": 0.35311143270622286, + "fp16_scale": 1.0, + "global_step": 1098, + "grad_norm": 2.102138183876875, + "learning_rate": 1.500902116145585e-06, + "loss": 0.3862, + "step": 1098 + }, + { + "ETA": 3.84, + "epoch": 0.35343302781797714, + "fp16_scale": 1.0, + "global_step": 1099, + "grad_norm": 2.1366269842497507, + "learning_rate": 1.5e-06, + "loss": 0.3634, + "step": 1099 + }, + { + "ETA": 3.83, + "epoch": 0.3537546229297315, + "fp16_scale": 1.0, + "global_step": 1100, + "grad_norm": 1.9739836451666781, + "learning_rate": 1.4990973409853709e-06, + "loss": 0.385, + "step": 1100 + }, + { + "ETA": 3.83, + "epoch": 0.35407621804148576, + "fp16_scale": 1.0, + "global_step": 1101, + "grad_norm": 2.0774369345981905, + "learning_rate": 1.4981941400817489e-06, + "loss": 0.4901, + "step": 1101 + }, + { + "ETA": 3.83, + "epoch": 0.3543978131532401, + "fp16_scale": 1.0, + "global_step": 1102, + "grad_norm": 1.991955578669468, + "learning_rate": 1.4972903982697742e-06, + "loss": 0.483, + "step": 1102 + }, + { + "ETA": 3.83, + "epoch": 0.3547194082649944, + "fp16_scale": 1.0, + "global_step": 1103, + "grad_norm": 1.9843357649460551, + "learning_rate": 1.4963861165306736e-06, + "loss": 0.4622, + "step": 1103 + }, + { + "ETA": 3.83, + "epoch": 0.35504100337674865, + "fp16_scale": 1.0, + "global_step": 1104, + "grad_norm": 1.7678654457273422, + "learning_rate": 1.4954812958462597e-06, + "loss": 0.3585, + "step": 1104 + }, + { + "ETA": 3.82, + "epoch": 0.355362598488503, + "fp16_scale": 1.0, + "global_step": 1105, + "grad_norm": 1.894926407127137, + "learning_rate": 1.4945759371989315e-06, + "loss": 0.3916, + "step": 1105 + }, + { + "ETA": 3.82, + "epoch": 0.35568419360025727, + "fp16_scale": 1.0, + "global_step": 1106, + "grad_norm": 1.8613555067449779, + "learning_rate": 1.4936700415716708e-06, + "loss": 0.4828, + "step": 1106 + }, + { + "ETA": 3.82, + "epoch": 0.35600578871201155, + "fp16_scale": 1.0, + "global_step": 1107, + "grad_norm": 2.1396721092458852, + "learning_rate": 1.4927636099480433e-06, + "loss": 0.5529, + "step": 1107 + }, + { + "ETA": 3.82, + "epoch": 0.3563273838237659, + "fp16_scale": 1.0, + "global_step": 1108, + "grad_norm": 2.0522833163242264, + "learning_rate": 1.4918566433121962e-06, + "loss": 0.3821, + "step": 1108 + }, + { + "ETA": 3.81, + "epoch": 0.35664897893552017, + "fp16_scale": 1.0, + "global_step": 1109, + "grad_norm": 1.9413873200967102, + "learning_rate": 1.4909491426488577e-06, + "loss": 0.4043, + "step": 1109 + }, + { + "ETA": 3.81, + "epoch": 0.3569705740472745, + "fp16_scale": 1.0, + "global_step": 1110, + "grad_norm": 1.8795456509662947, + "learning_rate": 1.4900411089433363e-06, + "loss": 0.4493, + "step": 1110 + }, + { + "ETA": 3.81, + "epoch": 0.3572921691590288, + "fp16_scale": 1.0, + "global_step": 1111, + "grad_norm": 2.082321891294859, + "learning_rate": 1.489132543181518e-06, + "loss": 0.4771, + "step": 1111 + }, + { + "ETA": 3.81, + "epoch": 0.35761376427078306, + "fp16_scale": 1.0, + "global_step": 1112, + "grad_norm": 2.224130407556845, + "learning_rate": 1.4882234463498677e-06, + "loss": 0.4975, + "step": 1112 + }, + { + "ETA": 3.81, + "epoch": 0.3579353593825374, + "fp16_scale": 1.0, + "global_step": 1113, + "grad_norm": 1.8071191267023712, + "learning_rate": 1.4873138194354266e-06, + "loss": 0.4052, + "step": 1113 + }, + { + "ETA": 3.81, + "epoch": 0.3582569544942917, + "fp16_scale": 1.0, + "global_step": 1114, + "grad_norm": 2.1122509646678598, + "learning_rate": 1.486403663425811e-06, + "loss": 0.4457, + "step": 1114 + }, + { + "ETA": 3.8, + "epoch": 0.358578549606046, + "fp16_scale": 1.0, + "global_step": 1115, + "grad_norm": 1.98561107580325, + "learning_rate": 1.485492979309212e-06, + "loss": 0.4065, + "step": 1115 + }, + { + "ETA": 3.8, + "epoch": 0.3589001447178003, + "fp16_scale": 1.0, + "global_step": 1116, + "grad_norm": 1.8581086982767931, + "learning_rate": 1.4845817680743941e-06, + "loss": 0.4603, + "step": 1116 + }, + { + "ETA": 3.8, + "epoch": 0.3592217398295546, + "fp16_scale": 1.0, + "global_step": 1117, + "grad_norm": 2.0238917547533957, + "learning_rate": 1.4836700307106939e-06, + "loss": 0.5412, + "step": 1117 + }, + { + "ETA": 3.8, + "epoch": 0.3595433349413089, + "fp16_scale": 1.0, + "global_step": 1118, + "grad_norm": 1.9850118939316719, + "learning_rate": 1.4827577682080198e-06, + "loss": 0.4695, + "step": 1118 + }, + { + "ETA": 3.8, + "epoch": 0.3598649300530632, + "fp16_scale": 1.0, + "global_step": 1119, + "grad_norm": 2.135523671395713, + "learning_rate": 1.4818449815568492e-06, + "loss": 0.4167, + "step": 1119 + }, + { + "ETA": 3.79, + "epoch": 0.3601865251648175, + "fp16_scale": 1.0, + "global_step": 1120, + "grad_norm": 2.0196024687993983, + "learning_rate": 1.4809316717482298e-06, + "loss": 0.466, + "step": 1120 + }, + { + "ETA": 3.79, + "epoch": 0.3605081202765718, + "fp16_scale": 1.0, + "global_step": 1121, + "grad_norm": 1.8288815589001348, + "learning_rate": 1.4800178397737771e-06, + "loss": 0.4073, + "step": 1121 + }, + { + "ETA": 3.79, + "epoch": 0.3608297153883261, + "fp16_scale": 1.0, + "global_step": 1122, + "grad_norm": 1.9586967489426974, + "learning_rate": 1.4791034866256728e-06, + "loss": 0.4799, + "step": 1122 + }, + { + "ETA": 3.79, + "epoch": 0.3611513105000804, + "fp16_scale": 1.0, + "global_step": 1123, + "grad_norm": 1.9743135446445312, + "learning_rate": 1.4781886132966652e-06, + "loss": 0.4088, + "step": 1123 + }, + { + "ETA": 3.79, + "epoch": 0.3614729056118347, + "fp16_scale": 1.0, + "global_step": 1124, + "grad_norm": 1.857070167265074, + "learning_rate": 1.477273220780067e-06, + "loss": 0.4299, + "step": 1124 + }, + { + "ETA": 3.79, + "epoch": 0.361794500723589, + "fp16_scale": 1.0, + "global_step": 1125, + "grad_norm": 1.86225308423235, + "learning_rate": 1.4763573100697548e-06, + "loss": 0.3947, + "step": 1125 + }, + { + "ETA": 3.78, + "epoch": 0.3621160958353433, + "fp16_scale": 1.0, + "global_step": 1126, + "grad_norm": 1.8722597560581546, + "learning_rate": 1.4754408821601675e-06, + "loss": 0.438, + "step": 1126 + }, + { + "ETA": 3.78, + "epoch": 0.3624376909470976, + "fp16_scale": 1.0, + "global_step": 1127, + "grad_norm": 1.7661756651665166, + "learning_rate": 1.4745239380463067e-06, + "loss": 0.3494, + "step": 1127 + }, + { + "ETA": 3.78, + "epoch": 0.3627592860588519, + "fp16_scale": 1.0, + "global_step": 1128, + "grad_norm": 1.8066850422576595, + "learning_rate": 1.4736064787237322e-06, + "loss": 0.4816, + "step": 1128 + }, + { + "ETA": 3.78, + "epoch": 0.3630808811706062, + "fp16_scale": 1.0, + "global_step": 1129, + "grad_norm": 1.8532213379815963, + "learning_rate": 1.4726885051885652e-06, + "loss": 0.4565, + "step": 1129 + }, + { + "ETA": 3.78, + "epoch": 0.3634024762823605, + "fp16_scale": 1.0, + "global_step": 1130, + "grad_norm": 1.888133209159448, + "learning_rate": 1.4717700184374846e-06, + "loss": 0.3954, + "step": 1130 + }, + { + "ETA": 3.77, + "epoch": 0.36372407139411483, + "fp16_scale": 1.0, + "global_step": 1131, + "grad_norm": 1.958330303482453, + "learning_rate": 1.4708510194677266e-06, + "loss": 0.3943, + "step": 1131 + }, + { + "ETA": 3.77, + "epoch": 0.3640456665058691, + "fp16_scale": 1.0, + "global_step": 1132, + "grad_norm": 2.4457901298456095, + "learning_rate": 1.4699315092770826e-06, + "loss": 0.4209, + "step": 1132 + }, + { + "ETA": 3.77, + "epoch": 0.3643672616176234, + "fp16_scale": 1.0, + "global_step": 1133, + "grad_norm": 2.086980629506438, + "learning_rate": 1.469011488863901e-06, + "loss": 0.435, + "step": 1133 + }, + { + "ETA": 3.76, + "epoch": 0.36468885672937773, + "fp16_scale": 1.0, + "global_step": 1134, + "grad_norm": 1.8818455568595487, + "learning_rate": 1.4680909592270818e-06, + "loss": 0.4205, + "step": 1134 + }, + { + "ETA": 3.76, + "epoch": 0.365010451841132, + "fp16_scale": 1.0, + "global_step": 1135, + "grad_norm": 1.799982532353527, + "learning_rate": 1.4671699213660802e-06, + "loss": 0.4193, + "step": 1135 + }, + { + "ETA": 3.76, + "epoch": 0.3653320469528863, + "fp16_scale": 1.0, + "global_step": 1136, + "grad_norm": 2.0392226099976845, + "learning_rate": 1.4662483762809013e-06, + "loss": 0.3781, + "step": 1136 + }, + { + "ETA": 3.76, + "epoch": 0.3656536420646406, + "fp16_scale": 1.0, + "global_step": 1137, + "grad_norm": 1.8884881243701355, + "learning_rate": 1.4653263249721018e-06, + "loss": 0.5207, + "step": 1137 + }, + { + "ETA": 3.76, + "epoch": 0.3659752371763949, + "fp16_scale": 1.0, + "global_step": 1138, + "grad_norm": 2.1148533457798178, + "learning_rate": 1.4644037684407881e-06, + "loss": 0.4956, + "step": 1138 + }, + { + "ETA": 3.76, + "epoch": 0.36629683228814924, + "fp16_scale": 1.0, + "global_step": 1139, + "grad_norm": 2.1641516091055233, + "learning_rate": 1.4634807076886154e-06, + "loss": 0.4612, + "step": 1139 + }, + { + "ETA": 3.75, + "epoch": 0.3666184273999035, + "fp16_scale": 1.0, + "global_step": 1140, + "grad_norm": 1.838643891924867, + "learning_rate": 1.462557143717785e-06, + "loss": 0.3633, + "step": 1140 + }, + { + "ETA": 3.75, + "epoch": 0.3669400225116578, + "fp16_scale": 1.0, + "global_step": 1141, + "grad_norm": 1.9660379929411695, + "learning_rate": 1.4616330775310462e-06, + "loss": 0.3676, + "step": 1141 + }, + { + "ETA": 3.75, + "epoch": 0.36726161762341214, + "fp16_scale": 1.0, + "global_step": 1142, + "grad_norm": 2.1266579922069444, + "learning_rate": 1.4607085101316922e-06, + "loss": 0.4574, + "step": 1142 + }, + { + "ETA": 3.75, + "epoch": 0.3675832127351664, + "fp16_scale": 1.0, + "global_step": 1143, + "grad_norm": 2.270188265598726, + "learning_rate": 1.4597834425235617e-06, + "loss": 0.5031, + "step": 1143 + }, + { + "ETA": 3.74, + "epoch": 0.3679048078469207, + "fp16_scale": 1.0, + "global_step": 1144, + "grad_norm": 1.8471350459801226, + "learning_rate": 1.4588578757110358e-06, + "loss": 0.4623, + "step": 1144 + }, + { + "ETA": 3.74, + "epoch": 0.36822640295867504, + "fp16_scale": 1.0, + "global_step": 1145, + "grad_norm": 1.9801295558849021, + "learning_rate": 1.457931810699037e-06, + "loss": 0.3618, + "step": 1145 + }, + { + "ETA": 3.74, + "epoch": 0.3685479980704293, + "fp16_scale": 1.0, + "global_step": 1146, + "grad_norm": 1.8706238235658559, + "learning_rate": 1.4570052484930299e-06, + "loss": 0.4039, + "step": 1146 + }, + { + "ETA": 3.74, + "epoch": 0.36886959318218365, + "fp16_scale": 1.0, + "global_step": 1147, + "grad_norm": 1.9248669172574233, + "learning_rate": 1.4560781900990184e-06, + "loss": 0.4597, + "step": 1147 + }, + { + "ETA": 3.74, + "epoch": 0.36919118829393793, + "fp16_scale": 1.0, + "global_step": 1148, + "grad_norm": 1.952201904699033, + "learning_rate": 1.4551506365235446e-06, + "loss": 0.4923, + "step": 1148 + }, + { + "ETA": 3.73, + "epoch": 0.3695127834056922, + "fp16_scale": 1.0, + "global_step": 1149, + "grad_norm": 1.972166684174856, + "learning_rate": 1.4542225887736894e-06, + "loss": 0.4514, + "step": 1149 + }, + { + "ETA": 3.73, + "epoch": 0.36983437851744655, + "fp16_scale": 1.0, + "global_step": 1150, + "grad_norm": 1.898042187195432, + "learning_rate": 1.4532940478570693e-06, + "loss": 0.4115, + "step": 1150 + }, + { + "ETA": 3.73, + "epoch": 0.37015597362920083, + "fp16_scale": 1.0, + "global_step": 1151, + "grad_norm": 2.075689020313724, + "learning_rate": 1.4523650147818362e-06, + "loss": 0.3744, + "step": 1151 + }, + { + "ETA": 3.72, + "epoch": 0.3704775687409551, + "fp16_scale": 1.0, + "global_step": 1152, + "grad_norm": 1.9696699565067173, + "learning_rate": 1.4514354905566774e-06, + "loss": 0.4121, + "step": 1152 + }, + { + "ETA": 3.72, + "epoch": 0.37079916385270945, + "fp16_scale": 1.0, + "global_step": 1153, + "grad_norm": 1.9542736425506968, + "learning_rate": 1.4505054761908123e-06, + "loss": 0.398, + "step": 1153 + }, + { + "ETA": 3.72, + "epoch": 0.3711207589644637, + "fp16_scale": 1.0, + "global_step": 1154, + "grad_norm": 1.7919743813455156, + "learning_rate": 1.4495749726939926e-06, + "loss": 0.4436, + "step": 1154 + }, + { + "ETA": 3.72, + "epoch": 0.37144235407621806, + "fp16_scale": 1.0, + "global_step": 1155, + "grad_norm": 2.3410907527300653, + "learning_rate": 1.448643981076502e-06, + "loss": 0.3645, + "step": 1155 + }, + { + "ETA": 3.71, + "epoch": 0.37176394918797234, + "fp16_scale": 1.0, + "global_step": 1156, + "grad_norm": 2.0876492805195705, + "learning_rate": 1.4477125023491535e-06, + "loss": 0.3779, + "step": 1156 + }, + { + "ETA": 3.71, + "epoch": 0.3720855442997266, + "fp16_scale": 1.0, + "global_step": 1157, + "grad_norm": 1.8698333270432437, + "learning_rate": 1.4467805375232888e-06, + "loss": 0.4079, + "step": 1157 + }, + { + "ETA": 3.71, + "epoch": 0.37240713941148096, + "fp16_scale": 1.0, + "global_step": 1158, + "grad_norm": 1.9124687998385432, + "learning_rate": 1.4458480876107777e-06, + "loss": 0.4337, + "step": 1158 + }, + { + "ETA": 3.71, + "epoch": 0.37272873452323524, + "fp16_scale": 1.0, + "global_step": 1159, + "grad_norm": 1.8427479575262413, + "learning_rate": 1.4449151536240165e-06, + "loss": 0.4576, + "step": 1159 + }, + { + "ETA": 3.71, + "epoch": 0.3730503296349896, + "fp16_scale": 1.0, + "global_step": 1160, + "grad_norm": 1.9713810052722738, + "learning_rate": 1.4439817365759272e-06, + "loss": 0.457, + "step": 1160 + }, + { + "ETA": 3.7, + "epoch": 0.37337192474674386, + "fp16_scale": 1.0, + "global_step": 1161, + "grad_norm": 2.164070706224667, + "learning_rate": 1.4430478374799564e-06, + "loss": 0.339, + "step": 1161 + }, + { + "ETA": 3.7, + "epoch": 0.37369351985849814, + "fp16_scale": 1.0, + "global_step": 1162, + "grad_norm": 2.145443423694682, + "learning_rate": 1.4421134573500736e-06, + "loss": 0.3927, + "step": 1162 + }, + { + "ETA": 3.7, + "epoch": 0.37401511497025247, + "fp16_scale": 1.0, + "global_step": 1163, + "grad_norm": 2.1312953943216737, + "learning_rate": 1.4411785972007712e-06, + "loss": 0.4088, + "step": 1163 + }, + { + "ETA": 3.69, + "epoch": 0.37433671008200675, + "fp16_scale": 1.0, + "global_step": 1164, + "grad_norm": 2.228988248090331, + "learning_rate": 1.4402432580470622e-06, + "loss": 0.4339, + "step": 1164 + }, + { + "ETA": 3.69, + "epoch": 0.37465830519376103, + "fp16_scale": 1.0, + "global_step": 1165, + "grad_norm": 2.1241862946324286, + "learning_rate": 1.43930744090448e-06, + "loss": 0.4488, + "step": 1165 + }, + { + "ETA": 3.69, + "epoch": 0.37497990030551537, + "fp16_scale": 1.0, + "global_step": 1166, + "grad_norm": 2.031680088017968, + "learning_rate": 1.4383711467890773e-06, + "loss": 0.407, + "step": 1166 + }, + { + "ETA": 3.69, + "epoch": 0.37530149541726965, + "fp16_scale": 1.0, + "global_step": 1167, + "grad_norm": 2.19404819289449, + "learning_rate": 1.437434376717424e-06, + "loss": 0.3353, + "step": 1167 + }, + { + "ETA": 3.69, + "epoch": 0.375623090529024, + "fp16_scale": 1.0, + "global_step": 1168, + "grad_norm": 2.0969147691699974, + "learning_rate": 1.436497131706607e-06, + "loss": 0.5032, + "step": 1168 + }, + { + "ETA": 3.68, + "epoch": 0.37594468564077826, + "fp16_scale": 1.0, + "global_step": 1169, + "grad_norm": 1.9700428428502548, + "learning_rate": 1.435559412774229e-06, + "loss": 0.5975, + "step": 1169 + }, + { + "ETA": 3.68, + "epoch": 0.37626628075253254, + "fp16_scale": 1.0, + "global_step": 1170, + "grad_norm": 1.9108611126655912, + "learning_rate": 1.4346212209384065e-06, + "loss": 0.4207, + "step": 1170 + }, + { + "ETA": 3.68, + "epoch": 0.3765878758642869, + "fp16_scale": 1.0, + "global_step": 1171, + "grad_norm": 2.175370372481936, + "learning_rate": 1.4336825572177714e-06, + "loss": 0.4746, + "step": 1171 + }, + { + "ETA": 3.68, + "epoch": 0.37690947097604116, + "fp16_scale": 1.0, + "global_step": 1172, + "grad_norm": 1.9183000041755622, + "learning_rate": 1.4327434226314656e-06, + "loss": 0.5358, + "step": 1172 + }, + { + "ETA": 3.68, + "epoch": 0.37723106608779544, + "fp16_scale": 1.0, + "global_step": 1173, + "grad_norm": 1.9645578277864115, + "learning_rate": 1.4318038181991439e-06, + "loss": 0.4341, + "step": 1173 + }, + { + "ETA": 3.68, + "epoch": 0.3775526611995498, + "fp16_scale": 1.0, + "global_step": 1174, + "grad_norm": 1.9791378068214516, + "learning_rate": 1.4308637449409703e-06, + "loss": 0.3746, + "step": 1174 + }, + { + "ETA": 3.67, + "epoch": 0.37787425631130406, + "fp16_scale": 1.0, + "global_step": 1175, + "grad_norm": 2.29483082945288, + "learning_rate": 1.4299232038776183e-06, + "loss": 0.4422, + "step": 1175 + }, + { + "ETA": 3.67, + "epoch": 0.3781958514230584, + "fp16_scale": 1.0, + "global_step": 1176, + "grad_norm": 1.8897784540429947, + "learning_rate": 1.4289821960302687e-06, + "loss": 0.4731, + "step": 1176 + }, + { + "ETA": 3.67, + "epoch": 0.3785174465348127, + "fp16_scale": 1.0, + "global_step": 1177, + "grad_norm": 2.0911449597866687, + "learning_rate": 1.4280407224206103e-06, + "loss": 0.3661, + "step": 1177 + }, + { + "ETA": 3.67, + "epoch": 0.37883904164656695, + "fp16_scale": 1.0, + "global_step": 1178, + "grad_norm": 2.0359556730170514, + "learning_rate": 1.4270987840708366e-06, + "loss": 0.3681, + "step": 1178 + }, + { + "ETA": 3.66, + "epoch": 0.3791606367583213, + "fp16_scale": 1.0, + "global_step": 1179, + "grad_norm": 2.1067301476592153, + "learning_rate": 1.4261563820036454e-06, + "loss": 0.5051, + "step": 1179 + }, + { + "ETA": 3.66, + "epoch": 0.37948223187007557, + "fp16_scale": 1.0, + "global_step": 1180, + "grad_norm": 1.841840073585346, + "learning_rate": 1.4252135172422395e-06, + "loss": 0.4756, + "step": 1180 + }, + { + "ETA": 3.66, + "epoch": 0.37980382698182985, + "fp16_scale": 1.0, + "global_step": 1181, + "grad_norm": 1.903373744148128, + "learning_rate": 1.4242701908103218e-06, + "loss": 0.4239, + "step": 1181 + }, + { + "ETA": 3.66, + "epoch": 0.3801254220935842, + "fp16_scale": 1.0, + "global_step": 1182, + "grad_norm": 1.9609762778755848, + "learning_rate": 1.4233264037320992e-06, + "loss": 0.4805, + "step": 1182 + }, + { + "ETA": 3.66, + "epoch": 0.38044701720533847, + "fp16_scale": 1.0, + "global_step": 1183, + "grad_norm": 2.0360191156115097, + "learning_rate": 1.422382157032276e-06, + "loss": 0.4749, + "step": 1183 + }, + { + "ETA": 3.66, + "epoch": 0.3807686123170928, + "fp16_scale": 1.0, + "global_step": 1184, + "grad_norm": 2.0375993595943322, + "learning_rate": 1.4214374517360575e-06, + "loss": 0.4791, + "step": 1184 + }, + { + "ETA": 3.65, + "epoch": 0.3810902074288471, + "fp16_scale": 1.0, + "global_step": 1185, + "grad_norm": 2.069701150745817, + "learning_rate": 1.4204922888691462e-06, + "loss": 0.5296, + "step": 1185 + }, + { + "ETA": 3.65, + "epoch": 0.38141180254060136, + "fp16_scale": 1.0, + "global_step": 1186, + "grad_norm": 1.7739466851664754, + "learning_rate": 1.4195466694577414e-06, + "loss": 0.3603, + "step": 1186 + }, + { + "ETA": 3.65, + "epoch": 0.3817333976523557, + "fp16_scale": 1.0, + "global_step": 1187, + "grad_norm": 1.8934344918273596, + "learning_rate": 1.4186005945285374e-06, + "loss": 0.4093, + "step": 1187 + }, + { + "ETA": 3.65, + "epoch": 0.38205499276411, + "fp16_scale": 1.0, + "global_step": 1188, + "grad_norm": 2.099195276315028, + "learning_rate": 1.4176540651087253e-06, + "loss": 0.4622, + "step": 1188 + }, + { + "ETA": 3.65, + "epoch": 0.38237658787586426, + "fp16_scale": 1.0, + "global_step": 1189, + "grad_norm": 2.0442817261367714, + "learning_rate": 1.4167070822259865e-06, + "loss": 0.4139, + "step": 1189 + }, + { + "ETA": 3.64, + "epoch": 0.3826981829876186, + "fp16_scale": 1.0, + "global_step": 1190, + "grad_norm": 1.9239751649252113, + "learning_rate": 1.4157596469084973e-06, + "loss": 0.4397, + "step": 1190 + }, + { + "ETA": 3.64, + "epoch": 0.3830197780993729, + "fp16_scale": 1.0, + "global_step": 1191, + "grad_norm": 1.820718900998498, + "learning_rate": 1.4148117601849245e-06, + "loss": 0.3975, + "step": 1191 + }, + { + "ETA": 3.64, + "epoch": 0.3833413732111272, + "fp16_scale": 1.0, + "global_step": 1192, + "grad_norm": 1.8203421531298762, + "learning_rate": 1.4138634230844239e-06, + "loss": 0.4198, + "step": 1192 + }, + { + "ETA": 3.64, + "epoch": 0.3836629683228815, + "fp16_scale": 1.0, + "global_step": 1193, + "grad_norm": 1.8982203182013695, + "learning_rate": 1.4129146366366422e-06, + "loss": 0.4429, + "step": 1193 + }, + { + "ETA": 3.64, + "epoch": 0.3839845634346358, + "fp16_scale": 1.0, + "global_step": 1194, + "grad_norm": 1.7244159154380014, + "learning_rate": 1.4119654018717125e-06, + "loss": 0.4209, + "step": 1194 + }, + { + "ETA": 3.64, + "epoch": 0.3843061585463901, + "fp16_scale": 1.0, + "global_step": 1195, + "grad_norm": 1.7945422968823292, + "learning_rate": 1.4110157198202547e-06, + "loss": 0.4141, + "step": 1195 + }, + { + "ETA": 3.63, + "epoch": 0.3846277536581444, + "fp16_scale": 1.0, + "global_step": 1196, + "grad_norm": 1.7696561457719395, + "learning_rate": 1.410065591513376e-06, + "loss": 0.462, + "step": 1196 + }, + { + "ETA": 3.63, + "epoch": 0.3849493487698987, + "fp16_scale": 1.0, + "global_step": 1197, + "grad_norm": 2.0332531442940955, + "learning_rate": 1.409115017982666e-06, + "loss": 0.4538, + "step": 1197 + }, + { + "ETA": 3.63, + "epoch": 0.385270943881653, + "fp16_scale": 1.0, + "global_step": 1198, + "grad_norm": 1.9883517762567522, + "learning_rate": 1.4081640002601981e-06, + "loss": 0.4626, + "step": 1198 + }, + { + "ETA": 3.63, + "epoch": 0.3855925389934073, + "fp16_scale": 1.0, + "global_step": 1199, + "grad_norm": 1.9941088081217284, + "learning_rate": 1.4072125393785294e-06, + "loss": 0.4557, + "step": 1199 + }, + { + "ETA": 3.63, + "epoch": 0.3859141341051616, + "fp16_scale": 1.0, + "global_step": 1200, + "grad_norm": 1.9058006896509434, + "learning_rate": 1.4062606363706971e-06, + "loss": 0.4577, + "step": 1200 + }, + { + "ETA": 3.64, + "epoch": 0.3862357292169159, + "fp16_scale": 1.0, + "global_step": 1201, + "grad_norm": 1.8311918641065574, + "learning_rate": 1.4053082922702183e-06, + "loss": 0.4237, + "step": 1201 + }, + { + "ETA": 3.64, + "epoch": 0.3865573243286702, + "fp16_scale": 1.0, + "global_step": 1202, + "grad_norm": 1.9026338376726455, + "learning_rate": 1.4043555081110892e-06, + "loss": 0.478, + "step": 1202 + }, + { + "ETA": 3.64, + "epoch": 0.3868789194404245, + "fp16_scale": 1.0, + "global_step": 1203, + "grad_norm": 1.9074712187600207, + "learning_rate": 1.4034022849277842e-06, + "loss": 0.4715, + "step": 1203 + }, + { + "ETA": 3.63, + "epoch": 0.3872005145521788, + "fp16_scale": 1.0, + "global_step": 1204, + "grad_norm": 1.889922757335601, + "learning_rate": 1.4024486237552537e-06, + "loss": 0.4644, + "step": 1204 + }, + { + "ETA": 3.63, + "epoch": 0.38752210966393313, + "fp16_scale": 1.0, + "global_step": 1205, + "grad_norm": 1.9012843410779694, + "learning_rate": 1.4014945256289239e-06, + "loss": 0.4764, + "step": 1205 + }, + { + "ETA": 3.63, + "epoch": 0.3878437047756874, + "fp16_scale": 1.0, + "global_step": 1206, + "grad_norm": 2.094894510992424, + "learning_rate": 1.4005399915846955e-06, + "loss": 0.4528, + "step": 1206 + }, + { + "ETA": 3.63, + "epoch": 0.3881652998874417, + "fp16_scale": 1.0, + "global_step": 1207, + "grad_norm": 1.715314005837692, + "learning_rate": 1.3995850226589431e-06, + "loss": 0.4322, + "step": 1207 + }, + { + "ETA": 3.63, + "epoch": 0.38848689499919603, + "fp16_scale": 1.0, + "global_step": 1208, + "grad_norm": 2.074525094768092, + "learning_rate": 1.3986296198885122e-06, + "loss": 0.4331, + "step": 1208 + }, + { + "ETA": 3.62, + "epoch": 0.3888084901109503, + "fp16_scale": 1.0, + "global_step": 1209, + "grad_norm": 1.6998510772388846, + "learning_rate": 1.39767378431072e-06, + "loss": 0.3926, + "step": 1209 + }, + { + "ETA": 3.62, + "epoch": 0.3891300852227046, + "fp16_scale": 1.0, + "global_step": 1210, + "grad_norm": 2.1933516360856986, + "learning_rate": 1.3967175169633536e-06, + "loss": 0.4228, + "step": 1210 + }, + { + "ETA": 3.62, + "epoch": 0.3894516803344589, + "fp16_scale": 1.0, + "global_step": 1211, + "grad_norm": 2.043829844085143, + "learning_rate": 1.395760818884669e-06, + "loss": 0.4713, + "step": 1211 + }, + { + "ETA": 3.62, + "epoch": 0.3897732754462132, + "fp16_scale": 1.0, + "global_step": 1212, + "grad_norm": 2.0322694459756407, + "learning_rate": 1.3948036911133899e-06, + "loss": 0.4377, + "step": 1212 + }, + { + "ETA": 3.62, + "epoch": 0.39009487055796754, + "fp16_scale": 1.0, + "global_step": 1213, + "grad_norm": 2.0356245607499166, + "learning_rate": 1.3938461346887061e-06, + "loss": 0.4721, + "step": 1213 + }, + { + "ETA": 3.62, + "epoch": 0.3904164656697218, + "fp16_scale": 1.0, + "global_step": 1214, + "grad_norm": 2.0212017990485, + "learning_rate": 1.3928881506502732e-06, + "loss": 0.4984, + "step": 1214 + }, + { + "ETA": 3.62, + "epoch": 0.3907380607814761, + "fp16_scale": 1.0, + "global_step": 1215, + "grad_norm": 1.7592455357833894, + "learning_rate": 1.3919297400382108e-06, + "loss": 0.4463, + "step": 1215 + }, + { + "ETA": 3.61, + "epoch": 0.39105965589323044, + "fp16_scale": 1.0, + "global_step": 1216, + "grad_norm": 2.174186834923421, + "learning_rate": 1.3909709038931021e-06, + "loss": 0.5003, + "step": 1216 + }, + { + "ETA": 3.61, + "epoch": 0.3913812510049847, + "fp16_scale": 1.0, + "global_step": 1217, + "grad_norm": 2.149807128715763, + "learning_rate": 1.3900116432559918e-06, + "loss": 0.3546, + "step": 1217 + }, + { + "ETA": 3.61, + "epoch": 0.391702846116739, + "fp16_scale": 1.0, + "global_step": 1218, + "grad_norm": 1.9846826547687488, + "learning_rate": 1.3890519591683858e-06, + "loss": 0.4332, + "step": 1218 + }, + { + "ETA": 3.61, + "epoch": 0.39202444122849334, + "fp16_scale": 1.0, + "global_step": 1219, + "grad_norm": 1.9078545416159736, + "learning_rate": 1.3880918526722496e-06, + "loss": 0.4545, + "step": 1219 + }, + { + "ETA": 3.6, + "epoch": 0.3923460363402476, + "fp16_scale": 1.0, + "global_step": 1220, + "grad_norm": 1.960043066110628, + "learning_rate": 1.3871313248100076e-06, + "loss": 0.3758, + "step": 1220 + }, + { + "ETA": 3.6, + "epoch": 0.39266763145200195, + "fp16_scale": 1.0, + "global_step": 1221, + "grad_norm": 2.015117131614875, + "learning_rate": 1.3861703766245412e-06, + "loss": 0.4708, + "step": 1221 + }, + { + "ETA": 3.6, + "epoch": 0.39298922656375623, + "fp16_scale": 1.0, + "global_step": 1222, + "grad_norm": 1.806741383707497, + "learning_rate": 1.3852090091591887e-06, + "loss": 0.4657, + "step": 1222 + }, + { + "ETA": 3.6, + "epoch": 0.3933108216755105, + "fp16_scale": 1.0, + "global_step": 1223, + "grad_norm": 1.9493700217050751, + "learning_rate": 1.3842472234577429e-06, + "loss": 0.3968, + "step": 1223 + }, + { + "ETA": 3.6, + "epoch": 0.39363241678726485, + "fp16_scale": 1.0, + "global_step": 1224, + "grad_norm": 1.7607255605255163, + "learning_rate": 1.3832850205644518e-06, + "loss": 0.4337, + "step": 1224 + }, + { + "ETA": 3.6, + "epoch": 0.39395401189901913, + "fp16_scale": 1.0, + "global_step": 1225, + "grad_norm": 2.0830654554064165, + "learning_rate": 1.3823224015240154e-06, + "loss": 0.4527, + "step": 1225 + }, + { + "ETA": 3.6, + "epoch": 0.3942756070107734, + "fp16_scale": 1.0, + "global_step": 1226, + "grad_norm": 1.8106200568360995, + "learning_rate": 1.3813593673815857e-06, + "loss": 0.4932, + "step": 1226 + }, + { + "ETA": 3.59, + "epoch": 0.39459720212252775, + "fp16_scale": 1.0, + "global_step": 1227, + "grad_norm": 2.1081485927831802, + "learning_rate": 1.3803959191827659e-06, + "loss": 0.4035, + "step": 1227 + }, + { + "ETA": 3.59, + "epoch": 0.394918797234282, + "fp16_scale": 1.0, + "global_step": 1228, + "grad_norm": 1.8585035098425444, + "learning_rate": 1.3794320579736083e-06, + "loss": 0.3873, + "step": 1228 + }, + { + "ETA": 3.59, + "epoch": 0.39524039234603636, + "fp16_scale": 1.0, + "global_step": 1229, + "grad_norm": 2.0499251240730882, + "learning_rate": 1.3784677848006135e-06, + "loss": 0.5385, + "step": 1229 + }, + { + "ETA": 3.59, + "epoch": 0.39556198745779064, + "fp16_scale": 1.0, + "global_step": 1230, + "grad_norm": 1.9552820293742015, + "learning_rate": 1.3775031007107299e-06, + "loss": 0.4264, + "step": 1230 + }, + { + "ETA": 3.59, + "epoch": 0.3958835825695449, + "fp16_scale": 1.0, + "global_step": 1231, + "grad_norm": 1.7723291980659213, + "learning_rate": 1.3765380067513518e-06, + "loss": 0.4062, + "step": 1231 + }, + { + "ETA": 3.58, + "epoch": 0.39620517768129926, + "fp16_scale": 1.0, + "global_step": 1232, + "grad_norm": 1.9574839735840515, + "learning_rate": 1.3755725039703179e-06, + "loss": 0.4256, + "step": 1232 + }, + { + "ETA": 3.58, + "epoch": 0.39652677279305354, + "fp16_scale": 1.0, + "global_step": 1233, + "grad_norm": 2.274789719814242, + "learning_rate": 1.374606593415912e-06, + "loss": 0.4074, + "step": 1233 + }, + { + "ETA": 3.58, + "epoch": 0.3968483679048078, + "fp16_scale": 1.0, + "global_step": 1234, + "grad_norm": 1.9997290474582707, + "learning_rate": 1.3736402761368594e-06, + "loss": 0.3597, + "step": 1234 + }, + { + "ETA": 3.58, + "epoch": 0.39716996301656216, + "fp16_scale": 1.0, + "global_step": 1235, + "grad_norm": 2.1530252175996925, + "learning_rate": 1.3726735531823286e-06, + "loss": 0.5291, + "step": 1235 + }, + { + "ETA": 3.57, + "epoch": 0.39749155812831644, + "fp16_scale": 1.0, + "global_step": 1236, + "grad_norm": 1.974745618508477, + "learning_rate": 1.3717064256019264e-06, + "loss": 0.4301, + "step": 1236 + }, + { + "ETA": 3.57, + "epoch": 0.39781315324007077, + "fp16_scale": 1.0, + "global_step": 1237, + "grad_norm": 1.9728394120343977, + "learning_rate": 1.3707388944457006e-06, + "loss": 0.4078, + "step": 1237 + }, + { + "ETA": 3.57, + "epoch": 0.39813474835182505, + "fp16_scale": 1.0, + "global_step": 1238, + "grad_norm": 2.078266927630806, + "learning_rate": 1.3697709607641365e-06, + "loss": 0.4179, + "step": 1238 + }, + { + "ETA": 3.57, + "epoch": 0.39845634346357933, + "fp16_scale": 1.0, + "global_step": 1239, + "grad_norm": 1.9533156961389537, + "learning_rate": 1.3688026256081563e-06, + "loss": 0.4724, + "step": 1239 + }, + { + "ETA": 3.57, + "epoch": 0.39877793857533367, + "fp16_scale": 1.0, + "global_step": 1240, + "grad_norm": 1.844753616843298, + "learning_rate": 1.3678338900291188e-06, + "loss": 0.4093, + "step": 1240 + }, + { + "ETA": 3.56, + "epoch": 0.39909953368708795, + "fp16_scale": 1.0, + "global_step": 1241, + "grad_norm": 1.932700638754792, + "learning_rate": 1.3668647550788172e-06, + "loss": 0.4067, + "step": 1241 + }, + { + "ETA": 3.56, + "epoch": 0.3994211287988423, + "fp16_scale": 1.0, + "global_step": 1242, + "grad_norm": 1.9695963095648668, + "learning_rate": 1.3658952218094779e-06, + "loss": 0.5026, + "step": 1242 + }, + { + "ETA": 3.56, + "epoch": 0.39974272391059656, + "fp16_scale": 1.0, + "global_step": 1243, + "grad_norm": 1.935446986486094, + "learning_rate": 1.3649252912737602e-06, + "loss": 0.4253, + "step": 1243 + }, + { + "ETA": 3.56, + "epoch": 0.40006431902235084, + "fp16_scale": 1.0, + "global_step": 1244, + "grad_norm": 2.2589571761933223, + "learning_rate": 1.3639549645247544e-06, + "loss": 0.3581, + "step": 1244 + }, + { + "ETA": 3.55, + "epoch": 0.4003859141341052, + "fp16_scale": 1.0, + "global_step": 1245, + "grad_norm": 2.1682110677343918, + "learning_rate": 1.3629842426159813e-06, + "loss": 0.3833, + "step": 1245 + }, + { + "ETA": 3.55, + "epoch": 0.40070750924585946, + "fp16_scale": 1.0, + "global_step": 1246, + "grad_norm": 2.0533016479117836, + "learning_rate": 1.362013126601391e-06, + "loss": 0.4652, + "step": 1246 + }, + { + "ETA": 3.55, + "epoch": 0.40102910435761374, + "fp16_scale": 1.0, + "global_step": 1247, + "grad_norm": 1.916742916306892, + "learning_rate": 1.3610416175353609e-06, + "loss": 0.4205, + "step": 1247 + }, + { + "ETA": 3.55, + "epoch": 0.4013506994693681, + "fp16_scale": 1.0, + "global_step": 1248, + "grad_norm": 2.2265100081173816, + "learning_rate": 1.3600697164726954e-06, + "loss": 0.4355, + "step": 1248 + }, + { + "ETA": 3.55, + "epoch": 0.40167229458112236, + "fp16_scale": 1.0, + "global_step": 1249, + "grad_norm": 2.05052390622889, + "learning_rate": 1.3590974244686246e-06, + "loss": 0.3994, + "step": 1249 + }, + { + "ETA": 3.55, + "epoch": 0.4019938896928767, + "fp16_scale": 1.0, + "global_step": 1250, + "grad_norm": 2.1337818401174067, + "learning_rate": 1.358124742578803e-06, + "loss": 0.4115, + "step": 1250 + }, + { + "ETA": 3.54, + "epoch": 0.402315484804631, + "fp16_scale": 1.0, + "global_step": 1251, + "grad_norm": 1.926233045970569, + "learning_rate": 1.3571516718593078e-06, + "loss": 0.4138, + "step": 1251 + }, + { + "ETA": 3.54, + "epoch": 0.40263707991638525, + "fp16_scale": 1.0, + "global_step": 1252, + "grad_norm": 1.8012746034968832, + "learning_rate": 1.3561782133666396e-06, + "loss": 0.4444, + "step": 1252 + }, + { + "ETA": 3.54, + "epoch": 0.4029586750281396, + "fp16_scale": 1.0, + "global_step": 1253, + "grad_norm": 1.8893945173687816, + "learning_rate": 1.355204368157719e-06, + "loss": 0.4106, + "step": 1253 + }, + { + "ETA": 3.54, + "epoch": 0.40328027013989387, + "fp16_scale": 1.0, + "global_step": 1254, + "grad_norm": 1.8259930391833834, + "learning_rate": 1.354230137289887e-06, + "loss": 0.3353, + "step": 1254 + }, + { + "ETA": 3.53, + "epoch": 0.40360186525164815, + "fp16_scale": 1.0, + "global_step": 1255, + "grad_norm": 2.1136932321072717, + "learning_rate": 1.3532555218209034e-06, + "loss": 0.3952, + "step": 1255 + }, + { + "ETA": 3.53, + "epoch": 0.4039234603634025, + "fp16_scale": 1.0, + "global_step": 1256, + "grad_norm": 2.078439130562752, + "learning_rate": 1.3522805228089449e-06, + "loss": 0.424, + "step": 1256 + }, + { + "ETA": 3.53, + "epoch": 0.40424505547515677, + "fp16_scale": 1.0, + "global_step": 1257, + "grad_norm": 2.1196724619474865, + "learning_rate": 1.3513051413126051e-06, + "loss": 0.4363, + "step": 1257 + }, + { + "ETA": 3.53, + "epoch": 0.4045666505869111, + "fp16_scale": 1.0, + "global_step": 1258, + "grad_norm": 1.9350173128719281, + "learning_rate": 1.350329378390893e-06, + "loss": 0.4015, + "step": 1258 + }, + { + "ETA": 3.52, + "epoch": 0.4048882456986654, + "fp16_scale": 1.0, + "global_step": 1259, + "grad_norm": 2.0707852214265996, + "learning_rate": 1.3493532351032317e-06, + "loss": 0.3762, + "step": 1259 + }, + { + "ETA": 3.52, + "epoch": 0.40520984081041966, + "fp16_scale": 1.0, + "global_step": 1260, + "grad_norm": 2.269699660425376, + "learning_rate": 1.3483767125094572e-06, + "loss": 0.3905, + "step": 1260 + }, + { + "ETA": 3.52, + "epoch": 0.405531435922174, + "fp16_scale": 1.0, + "global_step": 1261, + "grad_norm": 2.003311715529416, + "learning_rate": 1.3473998116698168e-06, + "loss": 0.4096, + "step": 1261 + }, + { + "ETA": 3.52, + "epoch": 0.4058530310339283, + "fp16_scale": 1.0, + "global_step": 1262, + "grad_norm": 1.9431104363624927, + "learning_rate": 1.3464225336449693e-06, + "loss": 0.4847, + "step": 1262 + }, + { + "ETA": 3.52, + "epoch": 0.40617462614568256, + "fp16_scale": 1.0, + "global_step": 1263, + "grad_norm": 1.9462633816271575, + "learning_rate": 1.3454448794959826e-06, + "loss": 0.4594, + "step": 1263 + }, + { + "ETA": 3.51, + "epoch": 0.4064962212574369, + "fp16_scale": 1.0, + "global_step": 1264, + "grad_norm": 2.07051637480553, + "learning_rate": 1.344466850284333e-06, + "loss": 0.3757, + "step": 1264 + }, + { + "ETA": 3.51, + "epoch": 0.4068178163691912, + "fp16_scale": 1.0, + "global_step": 1265, + "grad_norm": 2.2664569035479993, + "learning_rate": 1.3434884470719038e-06, + "loss": 0.4937, + "step": 1265 + }, + { + "ETA": 3.51, + "epoch": 0.4071394114809455, + "fp16_scale": 1.0, + "global_step": 1266, + "grad_norm": 2.016763153913431, + "learning_rate": 1.3425096709209855e-06, + "loss": 0.4492, + "step": 1266 + }, + { + "ETA": 3.51, + "epoch": 0.4074610065926998, + "fp16_scale": 1.0, + "global_step": 1267, + "grad_norm": 2.4052542025182904, + "learning_rate": 1.3415305228942717e-06, + "loss": 0.4578, + "step": 1267 + }, + { + "ETA": 3.51, + "epoch": 0.4077826017044541, + "fp16_scale": 1.0, + "global_step": 1268, + "grad_norm": 2.055440757308952, + "learning_rate": 1.3405510040548611e-06, + "loss": 0.4202, + "step": 1268 + }, + { + "ETA": 3.5, + "epoch": 0.4081041968162084, + "fp16_scale": 1.0, + "global_step": 1269, + "grad_norm": 1.7581727936551301, + "learning_rate": 1.3395711154662548e-06, + "loss": 0.4111, + "step": 1269 + }, + { + "ETA": 3.5, + "epoch": 0.4084257919279627, + "fp16_scale": 1.0, + "global_step": 1270, + "grad_norm": 2.143366393966735, + "learning_rate": 1.3385908581923547e-06, + "loss": 0.3887, + "step": 1270 + }, + { + "ETA": 3.5, + "epoch": 0.40874738703971697, + "fp16_scale": 1.0, + "global_step": 1271, + "grad_norm": 1.9717120060580338, + "learning_rate": 1.3376102332974639e-06, + "loss": 0.3799, + "step": 1271 + }, + { + "ETA": 3.5, + "epoch": 0.4090689821514713, + "fp16_scale": 1.0, + "global_step": 1272, + "grad_norm": 2.33211450756805, + "learning_rate": 1.336629241846284e-06, + "loss": 0.4056, + "step": 1272 + }, + { + "ETA": 3.49, + "epoch": 0.4093905772632256, + "fp16_scale": 1.0, + "global_step": 1273, + "grad_norm": 1.8909583337438918, + "learning_rate": 1.3356478849039148e-06, + "loss": 0.38, + "step": 1273 + }, + { + "ETA": 3.49, + "epoch": 0.4097121723749799, + "fp16_scale": 1.0, + "global_step": 1274, + "grad_norm": 2.1144769906691754, + "learning_rate": 1.334666163535853e-06, + "loss": 0.4052, + "step": 1274 + }, + { + "ETA": 3.49, + "epoch": 0.4100337674867342, + "fp16_scale": 1.0, + "global_step": 1275, + "grad_norm": 1.8883487851099277, + "learning_rate": 1.3336840788079905e-06, + "loss": 0.4357, + "step": 1275 + }, + { + "ETA": 3.49, + "epoch": 0.4103553625984885, + "fp16_scale": 1.0, + "global_step": 1276, + "grad_norm": 1.8964786960646427, + "learning_rate": 1.3327016317866146e-06, + "loss": 0.4628, + "step": 1276 + }, + { + "ETA": 3.49, + "epoch": 0.4106769577102428, + "fp16_scale": 1.0, + "global_step": 1277, + "grad_norm": 1.8450898049718576, + "learning_rate": 1.3317188235384051e-06, + "loss": 0.3882, + "step": 1277 + }, + { + "ETA": 3.48, + "epoch": 0.4109985528219971, + "fp16_scale": 1.0, + "global_step": 1278, + "grad_norm": 2.0586266436988576, + "learning_rate": 1.3307356551304347e-06, + "loss": 0.4246, + "step": 1278 + }, + { + "ETA": 3.48, + "epoch": 0.41132014793375143, + "fp16_scale": 1.0, + "global_step": 1279, + "grad_norm": 2.0311311801760064, + "learning_rate": 1.3297521276301665e-06, + "loss": 0.4669, + "step": 1279 + }, + { + "ETA": 3.48, + "epoch": 0.4116417430455057, + "fp16_scale": 1.0, + "global_step": 1280, + "grad_norm": 2.1469580929284033, + "learning_rate": 1.3287682421054538e-06, + "loss": 0.3494, + "step": 1280 + }, + { + "ETA": 3.48, + "epoch": 0.41196333815726, + "fp16_scale": 1.0, + "global_step": 1281, + "grad_norm": 2.1130366516356567, + "learning_rate": 1.3277839996245387e-06, + "loss": 0.4841, + "step": 1281 + }, + { + "ETA": 3.47, + "epoch": 0.41228493326901433, + "fp16_scale": 1.0, + "global_step": 1282, + "grad_norm": 2.1267466288125245, + "learning_rate": 1.3267994012560504e-06, + "loss": 0.4249, + "step": 1282 + }, + { + "ETA": 3.47, + "epoch": 0.4126065283807686, + "fp16_scale": 1.0, + "global_step": 1283, + "grad_norm": 2.036250364245317, + "learning_rate": 1.3258144480690054e-06, + "loss": 0.4517, + "step": 1283 + }, + { + "ETA": 3.47, + "epoch": 0.4129281234925229, + "fp16_scale": 1.0, + "global_step": 1284, + "grad_norm": 2.038258782557574, + "learning_rate": 1.3248291411328045e-06, + "loss": 0.5058, + "step": 1284 + }, + { + "ETA": 3.47, + "epoch": 0.4132497186042772, + "fp16_scale": 1.0, + "global_step": 1285, + "grad_norm": 2.158157978740966, + "learning_rate": 1.3238434815172333e-06, + "loss": 0.4319, + "step": 1285 + }, + { + "ETA": 3.47, + "epoch": 0.4135713137160315, + "fp16_scale": 1.0, + "global_step": 1286, + "grad_norm": 1.8677714276173643, + "learning_rate": 1.3228574702924595e-06, + "loss": 0.4284, + "step": 1286 + }, + { + "ETA": 3.47, + "epoch": 0.41389290882778584, + "fp16_scale": 1.0, + "global_step": 1287, + "grad_norm": 1.891794993060621, + "learning_rate": 1.3218711085290333e-06, + "loss": 0.4004, + "step": 1287 + }, + { + "ETA": 3.46, + "epoch": 0.4142145039395401, + "fp16_scale": 1.0, + "global_step": 1288, + "grad_norm": 1.9120263262061623, + "learning_rate": 1.3208843972978854e-06, + "loss": 0.4944, + "step": 1288 + }, + { + "ETA": 3.46, + "epoch": 0.4145360990512944, + "fp16_scale": 1.0, + "global_step": 1289, + "grad_norm": 1.8925294094801386, + "learning_rate": 1.3198973376703256e-06, + "loss": 0.3965, + "step": 1289 + }, + { + "ETA": 3.46, + "epoch": 0.41485769416304874, + "fp16_scale": 1.0, + "global_step": 1290, + "grad_norm": 1.9064706308968085, + "learning_rate": 1.3189099307180421e-06, + "loss": 0.4539, + "step": 1290 + }, + { + "ETA": 3.46, + "epoch": 0.415179289274803, + "fp16_scale": 1.0, + "global_step": 1291, + "grad_norm": 1.8373911459489973, + "learning_rate": 1.3179221775131004e-06, + "loss": 0.402, + "step": 1291 + }, + { + "ETA": 3.46, + "epoch": 0.4155008843865573, + "fp16_scale": 1.0, + "global_step": 1292, + "grad_norm": 1.9135215511363823, + "learning_rate": 1.3169340791279413e-06, + "loss": 0.3875, + "step": 1292 + }, + { + "ETA": 3.45, + "epoch": 0.41582247949831164, + "fp16_scale": 1.0, + "global_step": 1293, + "grad_norm": 1.9368908834871557, + "learning_rate": 1.3159456366353814e-06, + "loss": 0.4253, + "step": 1293 + }, + { + "ETA": 3.45, + "epoch": 0.4161440746100659, + "fp16_scale": 1.0, + "global_step": 1294, + "grad_norm": 1.9648459069380593, + "learning_rate": 1.3149568511086101e-06, + "loss": 0.4024, + "step": 1294 + }, + { + "ETA": 3.45, + "epoch": 0.41646566972182025, + "fp16_scale": 1.0, + "global_step": 1295, + "grad_norm": 2.479224124944344, + "learning_rate": 1.31396772362119e-06, + "loss": 0.4545, + "step": 1295 + }, + { + "ETA": 3.45, + "epoch": 0.41678726483357453, + "fp16_scale": 1.0, + "global_step": 1296, + "grad_norm": 1.8852621192902068, + "learning_rate": 1.3129782552470536e-06, + "loss": 0.4347, + "step": 1296 + }, + { + "ETA": 3.45, + "epoch": 0.4171088599453288, + "fp16_scale": 1.0, + "global_step": 1297, + "grad_norm": 1.6736510366199835, + "learning_rate": 1.311988447060505e-06, + "loss": 0.4099, + "step": 1297 + }, + { + "ETA": 3.45, + "epoch": 0.41743045505708315, + "fp16_scale": 1.0, + "global_step": 1298, + "grad_norm": 1.856676771155561, + "learning_rate": 1.3109983001362165e-06, + "loss": 0.4255, + "step": 1298 + }, + { + "ETA": 3.44, + "epoch": 0.41775205016883743, + "fp16_scale": 1.0, + "global_step": 1299, + "grad_norm": 2.023042928084088, + "learning_rate": 1.3100078155492283e-06, + "loss": 0.3876, + "step": 1299 + }, + { + "ETA": 3.44, + "epoch": 0.4180736452805917, + "fp16_scale": 1.0, + "global_step": 1300, + "grad_norm": 1.8877427544955612, + "learning_rate": 1.3090169943749473e-06, + "loss": 0.3698, + "step": 1300 + }, + { + "ETA": 3.44, + "epoch": 0.41839524039234605, + "fp16_scale": 1.0, + "global_step": 1301, + "grad_norm": 1.873112217728738, + "learning_rate": 1.308025837689146e-06, + "loss": 0.4068, + "step": 1301 + }, + { + "ETA": 3.44, + "epoch": 0.4187168355041003, + "fp16_scale": 1.0, + "global_step": 1302, + "grad_norm": 2.051326993625414, + "learning_rate": 1.3070343465679606e-06, + "loss": 0.4576, + "step": 1302 + }, + { + "ETA": 3.43, + "epoch": 0.41903843061585466, + "fp16_scale": 1.0, + "global_step": 1303, + "grad_norm": 2.0243280016873584, + "learning_rate": 1.3060425220878908e-06, + "loss": 0.4097, + "step": 1303 + }, + { + "ETA": 3.43, + "epoch": 0.41936002572760894, + "fp16_scale": 1.0, + "global_step": 1304, + "grad_norm": 2.39146033148339, + "learning_rate": 1.305050365325798e-06, + "loss": 0.363, + "step": 1304 + }, + { + "ETA": 3.43, + "epoch": 0.4196816208393632, + "fp16_scale": 1.0, + "global_step": 1305, + "grad_norm": 1.9115903336157114, + "learning_rate": 1.3040578773589056e-06, + "loss": 0.4096, + "step": 1305 + }, + { + "ETA": 3.43, + "epoch": 0.42000321595111756, + "fp16_scale": 1.0, + "global_step": 1306, + "grad_norm": 2.237936064048709, + "learning_rate": 1.3030650592647944e-06, + "loss": 0.3962, + "step": 1306 + }, + { + "ETA": 3.43, + "epoch": 0.42032481106287184, + "fp16_scale": 1.0, + "global_step": 1307, + "grad_norm": 1.956406833792594, + "learning_rate": 1.3020719121214054e-06, + "loss": 0.4493, + "step": 1307 + }, + { + "ETA": 3.42, + "epoch": 0.4206464061746261, + "fp16_scale": 1.0, + "global_step": 1308, + "grad_norm": 2.053709038298677, + "learning_rate": 1.3010784370070365e-06, + "loss": 0.4337, + "step": 1308 + }, + { + "ETA": 3.42, + "epoch": 0.42096800128638046, + "fp16_scale": 1.0, + "global_step": 1309, + "grad_norm": 1.9529762868034433, + "learning_rate": 1.3000846350003406e-06, + "loss": 0.4373, + "step": 1309 + }, + { + "ETA": 3.42, + "epoch": 0.42128959639813474, + "fp16_scale": 1.0, + "global_step": 1310, + "grad_norm": 1.7562374110675865, + "learning_rate": 1.2990905071803273e-06, + "loss": 0.3747, + "step": 1310 + }, + { + "ETA": 3.42, + "epoch": 0.42161119150988907, + "fp16_scale": 1.0, + "global_step": 1311, + "grad_norm": 1.7062053296457569, + "learning_rate": 1.2980960546263586e-06, + "loss": 0.4122, + "step": 1311 + }, + { + "ETA": 3.41, + "epoch": 0.42193278662164335, + "fp16_scale": 1.0, + "global_step": 1312, + "grad_norm": 1.8269535633366947, + "learning_rate": 1.2971012784181497e-06, + "loss": 0.3574, + "step": 1312 + }, + { + "ETA": 3.41, + "epoch": 0.42225438173339763, + "fp16_scale": 1.0, + "global_step": 1313, + "grad_norm": 1.8215195588670925, + "learning_rate": 1.296106179635767e-06, + "loss": 0.4707, + "step": 1313 + }, + { + "ETA": 3.41, + "epoch": 0.42257597684515197, + "fp16_scale": 1.0, + "global_step": 1314, + "grad_norm": 1.7891525419939078, + "learning_rate": 1.295110759359627e-06, + "loss": 0.3745, + "step": 1314 + }, + { + "ETA": 3.41, + "epoch": 0.42289757195690625, + "fp16_scale": 1.0, + "global_step": 1315, + "grad_norm": 1.6739666894383969, + "learning_rate": 1.2941150186704953e-06, + "loss": 0.4538, + "step": 1315 + }, + { + "ETA": 3.41, + "epoch": 0.42321916706866053, + "fp16_scale": 1.0, + "global_step": 1316, + "grad_norm": 1.879052796438246, + "learning_rate": 1.2931189586494857e-06, + "loss": 0.4704, + "step": 1316 + }, + { + "ETA": 3.4, + "epoch": 0.42354076218041486, + "fp16_scale": 1.0, + "global_step": 1317, + "grad_norm": 2.075420152410664, + "learning_rate": 1.2921225803780586e-06, + "loss": 0.4354, + "step": 1317 + }, + { + "ETA": 3.4, + "epoch": 0.42386235729216915, + "fp16_scale": 1.0, + "global_step": 1318, + "grad_norm": 1.9809921057553515, + "learning_rate": 1.2911258849380198e-06, + "loss": 0.4578, + "step": 1318 + }, + { + "ETA": 3.4, + "epoch": 0.4241839524039235, + "fp16_scale": 1.0, + "global_step": 1319, + "grad_norm": 1.9549169653914598, + "learning_rate": 1.2901288734115193e-06, + "loss": 0.3984, + "step": 1319 + }, + { + "ETA": 3.4, + "epoch": 0.42450554751567776, + "fp16_scale": 1.0, + "global_step": 1320, + "grad_norm": 1.8708757161316962, + "learning_rate": 1.2891315468810504e-06, + "loss": 0.3808, + "step": 1320 + }, + { + "ETA": 3.4, + "epoch": 0.42482714262743204, + "fp16_scale": 1.0, + "global_step": 1321, + "grad_norm": 2.0241645347316615, + "learning_rate": 1.288133906429449e-06, + "loss": 0.3758, + "step": 1321 + }, + { + "ETA": 3.4, + "epoch": 0.4251487377391864, + "fp16_scale": 1.0, + "global_step": 1322, + "grad_norm": 2.030835180450959, + "learning_rate": 1.2871359531398909e-06, + "loss": 0.4313, + "step": 1322 + }, + { + "ETA": 3.39, + "epoch": 0.42547033285094066, + "fp16_scale": 1.0, + "global_step": 1323, + "grad_norm": 2.041273747447046, + "learning_rate": 1.2861376880958922e-06, + "loss": 0.3822, + "step": 1323 + }, + { + "ETA": 3.39, + "epoch": 0.425791927962695, + "fp16_scale": 1.0, + "global_step": 1324, + "grad_norm": 1.9113739253795483, + "learning_rate": 1.2851391123813073e-06, + "loss": 0.4825, + "step": 1324 + }, + { + "ETA": 3.39, + "epoch": 0.4261135230744493, + "fp16_scale": 1.0, + "global_step": 1325, + "grad_norm": 1.9341991945682349, + "learning_rate": 1.2841402270803276e-06, + "loss": 0.5072, + "step": 1325 + }, + { + "ETA": 3.39, + "epoch": 0.42643511818620355, + "fp16_scale": 1.0, + "global_step": 1326, + "grad_norm": 2.098332183595364, + "learning_rate": 1.283141033277481e-06, + "loss": 0.4842, + "step": 1326 + }, + { + "ETA": 3.39, + "epoch": 0.4267567132979579, + "fp16_scale": 1.0, + "global_step": 1327, + "grad_norm": 1.8718407825339272, + "learning_rate": 1.2821415320576307e-06, + "loss": 0.4722, + "step": 1327 + }, + { + "ETA": 3.39, + "epoch": 0.42707830840971217, + "fp16_scale": 1.0, + "global_step": 1328, + "grad_norm": 1.8864295150142696, + "learning_rate": 1.2811417245059729e-06, + "loss": 0.5018, + "step": 1328 + }, + { + "ETA": 3.38, + "epoch": 0.42739990352146645, + "fp16_scale": 1.0, + "global_step": 1329, + "grad_norm": 1.9084232296496746, + "learning_rate": 1.2801416117080364e-06, + "loss": 0.4172, + "step": 1329 + }, + { + "ETA": 3.38, + "epoch": 0.4277214986332208, + "fp16_scale": 1.0, + "global_step": 1330, + "grad_norm": 1.9012788737997843, + "learning_rate": 1.2791411947496827e-06, + "loss": 0.4131, + "step": 1330 + }, + { + "ETA": 3.38, + "epoch": 0.42804309374497507, + "fp16_scale": 1.0, + "global_step": 1331, + "grad_norm": 2.1323509076818334, + "learning_rate": 1.2781404747171016e-06, + "loss": 0.4389, + "step": 1331 + }, + { + "ETA": 3.38, + "epoch": 0.4283646888567294, + "fp16_scale": 1.0, + "global_step": 1332, + "grad_norm": 2.2654401958205197, + "learning_rate": 1.2771394526968133e-06, + "loss": 0.4359, + "step": 1332 + }, + { + "ETA": 3.38, + "epoch": 0.4286862839684837, + "fp16_scale": 1.0, + "global_step": 1333, + "grad_norm": 1.7762420788935014, + "learning_rate": 1.2761381297756658e-06, + "loss": 0.422, + "step": 1333 + }, + { + "ETA": 3.38, + "epoch": 0.42900787908023796, + "fp16_scale": 1.0, + "global_step": 1334, + "grad_norm": 2.0063885492786806, + "learning_rate": 1.2751365070408334e-06, + "loss": 0.4716, + "step": 1334 + }, + { + "ETA": 3.37, + "epoch": 0.4293294741919923, + "fp16_scale": 1.0, + "global_step": 1335, + "grad_norm": 2.1038221381344164, + "learning_rate": 1.2741345855798159e-06, + "loss": 0.4982, + "step": 1335 + }, + { + "ETA": 3.37, + "epoch": 0.4296510693037466, + "fp16_scale": 1.0, + "global_step": 1336, + "grad_norm": 1.95117683758303, + "learning_rate": 1.273132366480438e-06, + "loss": 0.4467, + "step": 1336 + }, + { + "ETA": 3.37, + "epoch": 0.42997266441550086, + "fp16_scale": 1.0, + "global_step": 1337, + "grad_norm": 2.0886909752284852, + "learning_rate": 1.2721298508308463e-06, + "loss": 0.4383, + "step": 1337 + }, + { + "ETA": 3.37, + "epoch": 0.4302942595272552, + "fp16_scale": 1.0, + "global_step": 1338, + "grad_norm": 1.961099516106727, + "learning_rate": 1.271127039719511e-06, + "loss": 0.477, + "step": 1338 + }, + { + "ETA": 3.37, + "epoch": 0.4306158546390095, + "fp16_scale": 1.0, + "global_step": 1339, + "grad_norm": 2.32048606376842, + "learning_rate": 1.270123934235222e-06, + "loss": 0.4757, + "step": 1339 + }, + { + "ETA": 3.37, + "epoch": 0.4309374497507638, + "fp16_scale": 1.0, + "global_step": 1340, + "grad_norm": 2.011952545459391, + "learning_rate": 1.2691205354670892e-06, + "loss": 0.4333, + "step": 1340 + }, + { + "ETA": 3.36, + "epoch": 0.4312590448625181, + "fp16_scale": 1.0, + "global_step": 1341, + "grad_norm": 2.1176101531453697, + "learning_rate": 1.2681168445045412e-06, + "loss": 0.3868, + "step": 1341 + }, + { + "ETA": 3.36, + "epoch": 0.4315806399742724, + "fp16_scale": 1.0, + "global_step": 1342, + "grad_norm": 2.0411025678349266, + "learning_rate": 1.2671128624373229e-06, + "loss": 0.4032, + "step": 1342 + }, + { + "ETA": 3.36, + "epoch": 0.4319022350860267, + "fp16_scale": 1.0, + "global_step": 1343, + "grad_norm": 2.0078757906643276, + "learning_rate": 1.2661085903554962e-06, + "loss": 0.3884, + "step": 1343 + }, + { + "ETA": 3.36, + "epoch": 0.432223830197781, + "fp16_scale": 1.0, + "global_step": 1344, + "grad_norm": 1.961133658667528, + "learning_rate": 1.2651040293494376e-06, + "loss": 0.4223, + "step": 1344 + }, + { + "ETA": 3.35, + "epoch": 0.43254542530953527, + "fp16_scale": 1.0, + "global_step": 1345, + "grad_norm": 1.975805112272552, + "learning_rate": 1.2640991805098366e-06, + "loss": 0.4567, + "step": 1345 + }, + { + "ETA": 3.35, + "epoch": 0.4328670204212896, + "fp16_scale": 1.0, + "global_step": 1346, + "grad_norm": 2.1325255765239355, + "learning_rate": 1.2630940449276968e-06, + "loss": 0.5047, + "step": 1346 + }, + { + "ETA": 3.35, + "epoch": 0.4331886155330439, + "fp16_scale": 1.0, + "global_step": 1347, + "grad_norm": 2.0655733195455377, + "learning_rate": 1.262088623694332e-06, + "loss": 0.5378, + "step": 1347 + }, + { + "ETA": 3.35, + "epoch": 0.4335102106447982, + "fp16_scale": 1.0, + "global_step": 1348, + "grad_norm": 2.135734443396966, + "learning_rate": 1.2610829179013657e-06, + "loss": 0.4077, + "step": 1348 + }, + { + "ETA": 3.35, + "epoch": 0.4338318057565525, + "fp16_scale": 1.0, + "global_step": 1349, + "grad_norm": 2.1472325469193976, + "learning_rate": 1.2600769286407317e-06, + "loss": 0.3683, + "step": 1349 + }, + { + "ETA": 3.34, + "epoch": 0.4341534008683068, + "fp16_scale": 1.0, + "global_step": 1350, + "grad_norm": 1.8851290970225403, + "learning_rate": 1.2590706570046704e-06, + "loss": 0.3661, + "step": 1350 + }, + { + "ETA": 3.34, + "epoch": 0.4344749959800611, + "fp16_scale": 1.0, + "global_step": 1351, + "grad_norm": 2.1279825620137616, + "learning_rate": 1.2580641040857294e-06, + "loss": 0.426, + "step": 1351 + }, + { + "ETA": 3.34, + "epoch": 0.4347965910918154, + "fp16_scale": 1.0, + "global_step": 1352, + "grad_norm": 1.8516315571203608, + "learning_rate": 1.2570572709767616e-06, + "loss": 0.4227, + "step": 1352 + }, + { + "ETA": 3.34, + "epoch": 0.4351181862035697, + "fp16_scale": 1.0, + "global_step": 1353, + "grad_norm": 1.8200937623381561, + "learning_rate": 1.2560501587709238e-06, + "loss": 0.3648, + "step": 1353 + }, + { + "ETA": 3.34, + "epoch": 0.435439781315324, + "fp16_scale": 1.0, + "global_step": 1354, + "grad_norm": 2.0206700029119955, + "learning_rate": 1.2550427685616764e-06, + "loss": 0.3636, + "step": 1354 + }, + { + "ETA": 3.33, + "epoch": 0.4357613764270783, + "fp16_scale": 1.0, + "global_step": 1355, + "grad_norm": 1.9238979778464935, + "learning_rate": 1.2540351014427813e-06, + "loss": 0.4101, + "step": 1355 + }, + { + "ETA": 3.33, + "epoch": 0.43608297153883263, + "fp16_scale": 1.0, + "global_step": 1356, + "grad_norm": 2.278431500277815, + "learning_rate": 1.2530271585083006e-06, + "loss": 0.4308, + "step": 1356 + }, + { + "ETA": 3.33, + "epoch": 0.4364045666505869, + "fp16_scale": 1.0, + "global_step": 1357, + "grad_norm": 1.9638468459383565, + "learning_rate": 1.252018940852597e-06, + "loss": 0.492, + "step": 1357 + }, + { + "ETA": 3.33, + "epoch": 0.4367261617623412, + "fp16_scale": 1.0, + "global_step": 1358, + "grad_norm": 2.0124603726439796, + "learning_rate": 1.2510104495703304e-06, + "loss": 0.476, + "step": 1358 + }, + { + "ETA": 3.33, + "epoch": 0.4370477568740955, + "fp16_scale": 1.0, + "global_step": 1359, + "grad_norm": 1.968831758094997, + "learning_rate": 1.2500016857564584e-06, + "loss": 0.4057, + "step": 1359 + }, + { + "ETA": 3.32, + "epoch": 0.4373693519858498, + "fp16_scale": 1.0, + "global_step": 1360, + "grad_norm": 2.197815710520745, + "learning_rate": 1.248992650506234e-06, + "loss": 0.4393, + "step": 1360 + }, + { + "ETA": 3.32, + "epoch": 0.43769094709760414, + "fp16_scale": 1.0, + "global_step": 1361, + "grad_norm": 2.1321524386912087, + "learning_rate": 1.2479833449152054e-06, + "loss": 0.4704, + "step": 1361 + }, + { + "ETA": 3.32, + "epoch": 0.4380125422093584, + "fp16_scale": 1.0, + "global_step": 1362, + "grad_norm": 1.832170751052816, + "learning_rate": 1.2469737700792143e-06, + "loss": 0.4231, + "step": 1362 + }, + { + "ETA": 3.32, + "epoch": 0.4383341373211127, + "fp16_scale": 1.0, + "global_step": 1363, + "grad_norm": 2.2259342027789617, + "learning_rate": 1.2459639270943943e-06, + "loss": 0.3483, + "step": 1363 + }, + { + "ETA": 3.32, + "epoch": 0.43865573243286704, + "fp16_scale": 1.0, + "global_step": 1364, + "grad_norm": 1.8654567434816682, + "learning_rate": 1.2449538170571705e-06, + "loss": 0.5051, + "step": 1364 + }, + { + "ETA": 3.31, + "epoch": 0.4389773275446213, + "fp16_scale": 1.0, + "global_step": 1365, + "grad_norm": 2.201363172831595, + "learning_rate": 1.2439434410642578e-06, + "loss": 0.477, + "step": 1365 + }, + { + "ETA": 3.31, + "epoch": 0.4392989226563756, + "fp16_scale": 1.0, + "global_step": 1366, + "grad_norm": 2.1914787785831225, + "learning_rate": 1.2429328002126599e-06, + "loss": 0.4851, + "step": 1366 + }, + { + "ETA": 3.31, + "epoch": 0.43962051776812994, + "fp16_scale": 1.0, + "global_step": 1367, + "grad_norm": 2.1813556026033973, + "learning_rate": 1.2419218955996676e-06, + "loss": 0.4766, + "step": 1367 + }, + { + "ETA": 3.31, + "epoch": 0.4399421128798842, + "fp16_scale": 1.0, + "global_step": 1368, + "grad_norm": 1.9518526521459842, + "learning_rate": 1.2409107283228595e-06, + "loss": 0.4245, + "step": 1368 + }, + { + "ETA": 3.31, + "epoch": 0.44026370799163855, + "fp16_scale": 1.0, + "global_step": 1369, + "grad_norm": 2.1292985482112, + "learning_rate": 1.239899299480098e-06, + "loss": 0.4366, + "step": 1369 + }, + { + "ETA": 3.3, + "epoch": 0.44058530310339283, + "fp16_scale": 1.0, + "global_step": 1370, + "grad_norm": 1.8007064142102165, + "learning_rate": 1.2388876101695293e-06, + "loss": 0.4136, + "step": 1370 + }, + { + "ETA": 3.3, + "epoch": 0.4409068982151471, + "fp16_scale": 1.0, + "global_step": 1371, + "grad_norm": 1.8308364506958799, + "learning_rate": 1.2378756614895841e-06, + "loss": 0.3975, + "step": 1371 + }, + { + "ETA": 3.3, + "epoch": 0.44122849332690145, + "fp16_scale": 1.0, + "global_step": 1372, + "grad_norm": 2.030996011863319, + "learning_rate": 1.2368634545389732e-06, + "loss": 0.3726, + "step": 1372 + }, + { + "ETA": 3.3, + "epoch": 0.44155008843865573, + "fp16_scale": 1.0, + "global_step": 1373, + "grad_norm": 2.1848106243679775, + "learning_rate": 1.2358509904166875e-06, + "loss": 0.3896, + "step": 1373 + }, + { + "ETA": 3.3, + "epoch": 0.44187168355041, + "fp16_scale": 1.0, + "global_step": 1374, + "grad_norm": 2.1326047482073025, + "learning_rate": 1.2348382702219988e-06, + "loss": 0.5802, + "step": 1374 + }, + { + "ETA": 3.29, + "epoch": 0.44219327866216435, + "fp16_scale": 1.0, + "global_step": 1375, + "grad_norm": 1.8792653832315627, + "learning_rate": 1.2338252950544559e-06, + "loss": 0.3792, + "step": 1375 + }, + { + "ETA": 3.29, + "epoch": 0.4425148737739186, + "fp16_scale": 1.0, + "global_step": 1376, + "grad_norm": 1.8910372647492621, + "learning_rate": 1.2328120660138842e-06, + "loss": 0.4238, + "step": 1376 + }, + { + "ETA": 3.29, + "epoch": 0.44283646888567296, + "fp16_scale": 1.0, + "global_step": 1377, + "grad_norm": 1.9051361184992184, + "learning_rate": 1.2317985842003849e-06, + "loss": 0.3798, + "step": 1377 + }, + { + "ETA": 3.29, + "epoch": 0.44315806399742724, + "fp16_scale": 1.0, + "global_step": 1378, + "grad_norm": 1.8431495442965877, + "learning_rate": 1.2307848507143338e-06, + "loss": 0.4537, + "step": 1378 + }, + { + "ETA": 3.28, + "epoch": 0.4434796591091815, + "fp16_scale": 1.0, + "global_step": 1379, + "grad_norm": 2.224381994362364, + "learning_rate": 1.229770866656381e-06, + "loss": 0.4025, + "step": 1379 + }, + { + "ETA": 3.28, + "epoch": 0.44380125422093586, + "fp16_scale": 1.0, + "global_step": 1380, + "grad_norm": 1.9056671272062469, + "learning_rate": 1.2287566331274464e-06, + "loss": 0.4767, + "step": 1380 + }, + { + "ETA": 3.28, + "epoch": 0.44412284933269014, + "fp16_scale": 1.0, + "global_step": 1381, + "grad_norm": 1.857981416086925, + "learning_rate": 1.2277421512287224e-06, + "loss": 0.3869, + "step": 1381 + }, + { + "ETA": 3.28, + "epoch": 0.4444444444444444, + "fp16_scale": 1.0, + "global_step": 1382, + "grad_norm": 2.016952793287992, + "learning_rate": 1.2267274220616708e-06, + "loss": 0.4624, + "step": 1382 + }, + { + "ETA": 3.28, + "epoch": 0.44476603955619876, + "fp16_scale": 1.0, + "global_step": 1383, + "grad_norm": 2.2381982451702114, + "learning_rate": 1.2257124467280214e-06, + "loss": 0.4949, + "step": 1383 + }, + { + "ETA": 3.28, + "epoch": 0.44508763466795304, + "fp16_scale": 1.0, + "global_step": 1384, + "grad_norm": 1.8812941469093638, + "learning_rate": 1.2246972263297717e-06, + "loss": 0.4395, + "step": 1384 + }, + { + "ETA": 3.27, + "epoch": 0.44540922977970737, + "fp16_scale": 1.0, + "global_step": 1385, + "grad_norm": 1.8286698683678104, + "learning_rate": 1.2236817619691853e-06, + "loss": 0.416, + "step": 1385 + }, + { + "ETA": 3.27, + "epoch": 0.44573082489146165, + "fp16_scale": 1.0, + "global_step": 1386, + "grad_norm": 2.0179511134804606, + "learning_rate": 1.2226660547487903e-06, + "loss": 0.4194, + "step": 1386 + }, + { + "ETA": 3.27, + "epoch": 0.44605242000321593, + "fp16_scale": 1.0, + "global_step": 1387, + "grad_norm": 1.9439917510050146, + "learning_rate": 1.2216501057713788e-06, + "loss": 0.4202, + "step": 1387 + }, + { + "ETA": 3.27, + "epoch": 0.44637401511497027, + "fp16_scale": 1.0, + "global_step": 1388, + "grad_norm": 1.9922919677500652, + "learning_rate": 1.2206339161400057e-06, + "loss": 0.4109, + "step": 1388 + }, + { + "ETA": 3.27, + "epoch": 0.44669561022672455, + "fp16_scale": 1.0, + "global_step": 1389, + "grad_norm": 1.8899473270409248, + "learning_rate": 1.219617486957986e-06, + "loss": 0.4214, + "step": 1389 + }, + { + "ETA": 3.26, + "epoch": 0.44701720533847883, + "fp16_scale": 1.0, + "global_step": 1390, + "grad_norm": 1.940836601527028, + "learning_rate": 1.218600819328896e-06, + "loss": 0.4491, + "step": 1390 + }, + { + "ETA": 3.26, + "epoch": 0.44733880045023316, + "fp16_scale": 1.0, + "global_step": 1391, + "grad_norm": 2.0663114680502717, + "learning_rate": 1.2175839143565707e-06, + "loss": 0.3891, + "step": 1391 + }, + { + "ETA": 3.26, + "epoch": 0.44766039556198745, + "fp16_scale": 1.0, + "global_step": 1392, + "grad_norm": 1.9132479197270338, + "learning_rate": 1.2165667731451024e-06, + "loss": 0.5532, + "step": 1392 + }, + { + "ETA": 3.26, + "epoch": 0.4479819906737418, + "fp16_scale": 1.0, + "global_step": 1393, + "grad_norm": 1.6813844363293866, + "learning_rate": 1.2155493967988394e-06, + "loss": 0.3854, + "step": 1393 + }, + { + "ETA": 3.26, + "epoch": 0.44830358578549606, + "fp16_scale": 1.0, + "global_step": 1394, + "grad_norm": 1.7477356715026746, + "learning_rate": 1.2145317864223873e-06, + "loss": 0.3777, + "step": 1394 + }, + { + "ETA": 3.26, + "epoch": 0.44862518089725034, + "fp16_scale": 1.0, + "global_step": 1395, + "grad_norm": 2.1168924114259022, + "learning_rate": 1.2135139431206029e-06, + "loss": 0.4345, + "step": 1395 + }, + { + "ETA": 3.25, + "epoch": 0.4489467760090047, + "fp16_scale": 1.0, + "global_step": 1396, + "grad_norm": 2.285953739879689, + "learning_rate": 1.2124958679985987e-06, + "loss": 0.4351, + "step": 1396 + }, + { + "ETA": 3.25, + "epoch": 0.44926837112075896, + "fp16_scale": 1.0, + "global_step": 1397, + "grad_norm": 2.168364924134388, + "learning_rate": 1.211477562161737e-06, + "loss": 0.3574, + "step": 1397 + }, + { + "ETA": 3.25, + "epoch": 0.44958996623251324, + "fp16_scale": 1.0, + "global_step": 1398, + "grad_norm": 1.8445238015044143, + "learning_rate": 1.2104590267156312e-06, + "loss": 0.4202, + "step": 1398 + }, + { + "ETA": 3.25, + "epoch": 0.4499115613442676, + "fp16_scale": 1.0, + "global_step": 1399, + "grad_norm": 1.8589049162779738, + "learning_rate": 1.2094402627661446e-06, + "loss": 0.4862, + "step": 1399 + }, + { + "ETA": 3.25, + "epoch": 0.45023315645602185, + "fp16_scale": 1.0, + "global_step": 1400, + "grad_norm": 1.925609416627876, + "learning_rate": 1.2084212714193873e-06, + "loss": 0.4691, + "step": 1400 + }, + { + "ETA": 3.26, + "epoch": 0.4505547515677762, + "fp16_scale": 1.0, + "global_step": 1401, + "grad_norm": 1.9467422776423138, + "learning_rate": 1.2074020537817174e-06, + "loss": 0.4251, + "step": 1401 + }, + { + "ETA": 3.26, + "epoch": 0.45087634667953047, + "fp16_scale": 1.0, + "global_step": 1402, + "grad_norm": 2.0494031390059955, + "learning_rate": 1.2063826109597381e-06, + "loss": 0.4947, + "step": 1402 + }, + { + "ETA": 3.25, + "epoch": 0.45119794179128475, + "fp16_scale": 1.0, + "global_step": 1403, + "grad_norm": 1.8729328694911644, + "learning_rate": 1.2053629440602977e-06, + "loss": 0.4429, + "step": 1403 + }, + { + "ETA": 3.25, + "epoch": 0.4515195369030391, + "fp16_scale": 1.0, + "global_step": 1404, + "grad_norm": 1.803922890212383, + "learning_rate": 1.2043430541904869e-06, + "loss": 0.4167, + "step": 1404 + }, + { + "ETA": 3.25, + "epoch": 0.45184113201479337, + "fp16_scale": 1.0, + "global_step": 1405, + "grad_norm": 1.8310438597730947, + "learning_rate": 1.2033229424576394e-06, + "loss": 0.453, + "step": 1405 + }, + { + "ETA": 3.25, + "epoch": 0.4521627271265477, + "fp16_scale": 1.0, + "global_step": 1406, + "grad_norm": 1.9179874872619675, + "learning_rate": 1.2023026099693292e-06, + "loss": 0.3701, + "step": 1406 + }, + { + "ETA": 3.25, + "epoch": 0.452484322238302, + "fp16_scale": 1.0, + "global_step": 1407, + "grad_norm": 1.8600064163645063, + "learning_rate": 1.20128205783337e-06, + "loss": 0.383, + "step": 1407 + }, + { + "ETA": 3.25, + "epoch": 0.45280591735005626, + "fp16_scale": 1.0, + "global_step": 1408, + "grad_norm": 1.9029987264933161, + "learning_rate": 1.2002612871578141e-06, + "loss": 0.4902, + "step": 1408 + }, + { + "ETA": 3.24, + "epoch": 0.4531275124618106, + "fp16_scale": 1.0, + "global_step": 1409, + "grad_norm": 2.0699543502953985, + "learning_rate": 1.1992402990509514e-06, + "loss": 0.3728, + "step": 1409 + }, + { + "ETA": 3.24, + "epoch": 0.4534491075735649, + "fp16_scale": 1.0, + "global_step": 1410, + "grad_norm": 2.1131434946387633, + "learning_rate": 1.1982190946213076e-06, + "loss": 0.4799, + "step": 1410 + }, + { + "ETA": 3.24, + "epoch": 0.45377070268531916, + "fp16_scale": 1.0, + "global_step": 1411, + "grad_norm": 1.8855715674615943, + "learning_rate": 1.197197674977643e-06, + "loss": 0.3921, + "step": 1411 + }, + { + "ETA": 3.24, + "epoch": 0.4540922977970735, + "fp16_scale": 1.0, + "global_step": 1412, + "grad_norm": 1.8736848384438523, + "learning_rate": 1.1961760412289516e-06, + "loss": 0.4081, + "step": 1412 + }, + { + "ETA": 3.23, + "epoch": 0.4544138929088278, + "fp16_scale": 1.0, + "global_step": 1413, + "grad_norm": 2.0319908352667557, + "learning_rate": 1.1951541944844606e-06, + "loss": 0.418, + "step": 1413 + }, + { + "ETA": 3.23, + "epoch": 0.4547354880205821, + "fp16_scale": 1.0, + "global_step": 1414, + "grad_norm": 2.1642966866671673, + "learning_rate": 1.1941321358536277e-06, + "loss": 0.3824, + "step": 1414 + }, + { + "ETA": 3.23, + "epoch": 0.4550570831323364, + "fp16_scale": 1.0, + "global_step": 1415, + "grad_norm": 1.9719607908931798, + "learning_rate": 1.1931098664461406e-06, + "loss": 0.3655, + "step": 1415 + }, + { + "ETA": 3.22, + "epoch": 0.4553786782440907, + "fp16_scale": 1.0, + "global_step": 1416, + "grad_norm": 2.044659469680282, + "learning_rate": 1.1920873873719166e-06, + "loss": 0.3818, + "step": 1416 + }, + { + "ETA": 3.22, + "epoch": 0.455700273355845, + "fp16_scale": 1.0, + "global_step": 1417, + "grad_norm": 2.0818545976450205, + "learning_rate": 1.1910646997411001e-06, + "loss": 0.4219, + "step": 1417 + }, + { + "ETA": 3.22, + "epoch": 0.4560218684675993, + "fp16_scale": 1.0, + "global_step": 1418, + "grad_norm": 2.183461535181188, + "learning_rate": 1.1900418046640614e-06, + "loss": 0.3808, + "step": 1418 + }, + { + "ETA": 3.22, + "epoch": 0.45634346357935357, + "fp16_scale": 1.0, + "global_step": 1419, + "grad_norm": 2.1044355098668053, + "learning_rate": 1.1890187032513976e-06, + "loss": 0.4753, + "step": 1419 + }, + { + "ETA": 3.22, + "epoch": 0.4566650586911079, + "fp16_scale": 1.0, + "global_step": 1420, + "grad_norm": 1.9120578141345488, + "learning_rate": 1.187995396613928e-06, + "loss": 0.4635, + "step": 1420 + }, + { + "ETA": 3.22, + "epoch": 0.4569866538028622, + "fp16_scale": 1.0, + "global_step": 1421, + "grad_norm": 1.9900019777715063, + "learning_rate": 1.1869718858626963e-06, + "loss": 0.4125, + "step": 1421 + }, + { + "ETA": 3.21, + "epoch": 0.4573082489146165, + "fp16_scale": 1.0, + "global_step": 1422, + "grad_norm": 1.9526876565066824, + "learning_rate": 1.1859481721089668e-06, + "loss": 0.4535, + "step": 1422 + }, + { + "ETA": 3.21, + "epoch": 0.4576298440263708, + "fp16_scale": 1.0, + "global_step": 1423, + "grad_norm": 2.0529819147468453, + "learning_rate": 1.1849242564642244e-06, + "loss": 0.3939, + "step": 1423 + }, + { + "ETA": 3.21, + "epoch": 0.4579514391381251, + "fp16_scale": 1.0, + "global_step": 1424, + "grad_norm": 2.210405714780168, + "learning_rate": 1.1839001400401736e-06, + "loss": 0.392, + "step": 1424 + }, + { + "ETA": 3.21, + "epoch": 0.4582730342498794, + "fp16_scale": 1.0, + "global_step": 1425, + "grad_norm": 1.990997733736417, + "learning_rate": 1.1828758239487362e-06, + "loss": 0.4619, + "step": 1425 + }, + { + "ETA": 3.2, + "epoch": 0.4585946293616337, + "fp16_scale": 1.0, + "global_step": 1426, + "grad_norm": 1.7999255114643025, + "learning_rate": 1.1818513093020513e-06, + "loss": 0.4807, + "step": 1426 + }, + { + "ETA": 3.2, + "epoch": 0.458916224473388, + "fp16_scale": 1.0, + "global_step": 1427, + "grad_norm": 1.9179751151421047, + "learning_rate": 1.1808265972124738e-06, + "loss": 0.4581, + "step": 1427 + }, + { + "ETA": 3.2, + "epoch": 0.4592378195851423, + "fp16_scale": 1.0, + "global_step": 1428, + "grad_norm": 2.0878934873157715, + "learning_rate": 1.1798016887925726e-06, + "loss": 0.4967, + "step": 1428 + }, + { + "ETA": 3.2, + "epoch": 0.4595594146968966, + "fp16_scale": 1.0, + "global_step": 1429, + "grad_norm": 1.848865583148038, + "learning_rate": 1.1787765851551296e-06, + "loss": 0.4408, + "step": 1429 + }, + { + "ETA": 3.2, + "epoch": 0.45988100980865093, + "fp16_scale": 1.0, + "global_step": 1430, + "grad_norm": 1.9915372078204412, + "learning_rate": 1.1777512874131386e-06, + "loss": 0.4804, + "step": 1430 + }, + { + "ETA": 3.2, + "epoch": 0.4602026049204052, + "fp16_scale": 1.0, + "global_step": 1431, + "grad_norm": 1.7080786351610435, + "learning_rate": 1.1767257966798048e-06, + "loss": 0.4257, + "step": 1431 + }, + { + "ETA": 3.19, + "epoch": 0.4605242000321595, + "fp16_scale": 1.0, + "global_step": 1432, + "grad_norm": 2.0959852281717364, + "learning_rate": 1.1757001140685426e-06, + "loss": 0.4428, + "step": 1432 + }, + { + "ETA": 3.19, + "epoch": 0.46084579514391383, + "fp16_scale": 1.0, + "global_step": 1433, + "grad_norm": 2.0800453233062006, + "learning_rate": 1.1746742406929745e-06, + "loss": 0.4635, + "step": 1433 + }, + { + "ETA": 3.19, + "epoch": 0.4611673902556681, + "fp16_scale": 1.0, + "global_step": 1434, + "grad_norm": 1.9991439086645446, + "learning_rate": 1.1736481776669305e-06, + "loss": 0.4814, + "step": 1434 + }, + { + "ETA": 3.19, + "epoch": 0.4614889853674224, + "fp16_scale": 1.0, + "global_step": 1435, + "grad_norm": 2.0002709328065795, + "learning_rate": 1.1726219261044459e-06, + "loss": 0.4328, + "step": 1435 + }, + { + "ETA": 3.19, + "epoch": 0.4618105804791767, + "fp16_scale": 1.0, + "global_step": 1436, + "grad_norm": 2.03905864061523, + "learning_rate": 1.1715954871197615e-06, + "loss": 0.4916, + "step": 1436 + }, + { + "ETA": 3.18, + "epoch": 0.462132175590931, + "fp16_scale": 1.0, + "global_step": 1437, + "grad_norm": 1.8577890856786565, + "learning_rate": 1.1705688618273209e-06, + "loss": 0.3885, + "step": 1437 + }, + { + "ETA": 3.18, + "epoch": 0.46245377070268534, + "fp16_scale": 1.0, + "global_step": 1438, + "grad_norm": 2.1861978948056895, + "learning_rate": 1.1695420513417705e-06, + "loss": 0.394, + "step": 1438 + }, + { + "ETA": 3.18, + "epoch": 0.4627753658144396, + "fp16_scale": 1.0, + "global_step": 1439, + "grad_norm": 1.7540393312754712, + "learning_rate": 1.1685150567779575e-06, + "loss": 0.4458, + "step": 1439 + }, + { + "ETA": 3.18, + "epoch": 0.4630969609261939, + "fp16_scale": 1.0, + "global_step": 1440, + "grad_norm": 1.946279500911723, + "learning_rate": 1.1674878792509293e-06, + "loss": 0.4189, + "step": 1440 + }, + { + "ETA": 3.18, + "epoch": 0.46341855603794824, + "fp16_scale": 1.0, + "global_step": 1441, + "grad_norm": 1.978765866295056, + "learning_rate": 1.1664605198759312e-06, + "loss": 0.4118, + "step": 1441 + }, + { + "ETA": 3.18, + "epoch": 0.4637401511497025, + "fp16_scale": 1.0, + "global_step": 1442, + "grad_norm": 1.9550889217653187, + "learning_rate": 1.1654329797684065e-06, + "loss": 0.3716, + "step": 1442 + }, + { + "ETA": 3.17, + "epoch": 0.46406174626145685, + "fp16_scale": 1.0, + "global_step": 1443, + "grad_norm": 2.150992481802672, + "learning_rate": 1.1644052600439947e-06, + "loss": 0.3508, + "step": 1443 + }, + { + "ETA": 3.17, + "epoch": 0.46438334137321113, + "fp16_scale": 1.0, + "global_step": 1444, + "grad_norm": 1.673795590015164, + "learning_rate": 1.1633773618185302e-06, + "loss": 0.3795, + "step": 1444 + }, + { + "ETA": 3.17, + "epoch": 0.4647049364849654, + "fp16_scale": 1.0, + "global_step": 1445, + "grad_norm": 1.9830135159880347, + "learning_rate": 1.1623492862080412e-06, + "loss": 0.4145, + "step": 1445 + }, + { + "ETA": 3.17, + "epoch": 0.46502653159671975, + "fp16_scale": 1.0, + "global_step": 1446, + "grad_norm": 2.02563990801057, + "learning_rate": 1.1613210343287492e-06, + "loss": 0.4524, + "step": 1446 + }, + { + "ETA": 3.17, + "epoch": 0.46534812670847403, + "fp16_scale": 1.0, + "global_step": 1447, + "grad_norm": 2.0165226085304058, + "learning_rate": 1.1602926072970654e-06, + "loss": 0.4508, + "step": 1447 + }, + { + "ETA": 3.16, + "epoch": 0.4656697218202283, + "fp16_scale": 1.0, + "global_step": 1448, + "grad_norm": 1.8882974099870768, + "learning_rate": 1.1592640062295927e-06, + "loss": 0.4783, + "step": 1448 + }, + { + "ETA": 3.16, + "epoch": 0.46599131693198265, + "fp16_scale": 1.0, + "global_step": 1449, + "grad_norm": 2.0057867899838313, + "learning_rate": 1.1582352322431225e-06, + "loss": 0.3945, + "step": 1449 + }, + { + "ETA": 3.16, + "epoch": 0.4663129120437369, + "fp16_scale": 1.0, + "global_step": 1450, + "grad_norm": 1.8841958050366507, + "learning_rate": 1.157206286454634e-06, + "loss": 0.3801, + "step": 1450 + }, + { + "ETA": 3.16, + "epoch": 0.46663450715549126, + "fp16_scale": 1.0, + "global_step": 1451, + "grad_norm": 1.8272961060829263, + "learning_rate": 1.1561771699812922e-06, + "loss": 0.4317, + "step": 1451 + }, + { + "ETA": 3.16, + "epoch": 0.46695610226724554, + "fp16_scale": 1.0, + "global_step": 1452, + "grad_norm": 1.6842338631717175, + "learning_rate": 1.1551478839404494e-06, + "loss": 0.4365, + "step": 1452 + }, + { + "ETA": 3.16, + "epoch": 0.4672776973789998, + "fp16_scale": 1.0, + "global_step": 1453, + "grad_norm": 2.0903691759693372, + "learning_rate": 1.1541184294496391e-06, + "loss": 0.402, + "step": 1453 + }, + { + "ETA": 3.15, + "epoch": 0.46759929249075416, + "fp16_scale": 1.0, + "global_step": 1454, + "grad_norm": 1.7907585193865998, + "learning_rate": 1.15308880762658e-06, + "loss": 0.4354, + "step": 1454 + }, + { + "ETA": 3.15, + "epoch": 0.46792088760250844, + "fp16_scale": 1.0, + "global_step": 1455, + "grad_norm": 2.072868760983511, + "learning_rate": 1.152059019589172e-06, + "loss": 0.41, + "step": 1455 + }, + { + "ETA": 3.15, + "epoch": 0.4682424827142627, + "fp16_scale": 1.0, + "global_step": 1456, + "grad_norm": 1.9350679455167799, + "learning_rate": 1.1510290664554941e-06, + "loss": 0.379, + "step": 1456 + }, + { + "ETA": 3.15, + "epoch": 0.46856407782601706, + "fp16_scale": 1.0, + "global_step": 1457, + "grad_norm": 2.0604895433760464, + "learning_rate": 1.1499989493438074e-06, + "loss": 0.3475, + "step": 1457 + }, + { + "ETA": 3.14, + "epoch": 0.46888567293777134, + "fp16_scale": 1.0, + "global_step": 1458, + "grad_norm": 1.8593424610855758, + "learning_rate": 1.1489686693725478e-06, + "loss": 0.4129, + "step": 1458 + }, + { + "ETA": 3.14, + "epoch": 0.46920726804952567, + "fp16_scale": 1.0, + "global_step": 1459, + "grad_norm": 2.0070537049832047, + "learning_rate": 1.1479382276603299e-06, + "loss": 0.3448, + "step": 1459 + }, + { + "ETA": 3.14, + "epoch": 0.46952886316127995, + "fp16_scale": 1.0, + "global_step": 1460, + "grad_norm": 1.643190458066208, + "learning_rate": 1.1469076253259438e-06, + "loss": 0.3631, + "step": 1460 + }, + { + "ETA": 3.14, + "epoch": 0.46985045827303423, + "fp16_scale": 1.0, + "global_step": 1461, + "grad_norm": 2.070470565712216, + "learning_rate": 1.1458768634883534e-06, + "loss": 0.4025, + "step": 1461 + }, + { + "ETA": 3.13, + "epoch": 0.47017205338478857, + "fp16_scale": 1.0, + "global_step": 1462, + "grad_norm": 1.9505825330705497, + "learning_rate": 1.1448459432666959e-06, + "loss": 0.4438, + "step": 1462 + }, + { + "ETA": 3.13, + "epoch": 0.47049364849654285, + "fp16_scale": 1.0, + "global_step": 1463, + "grad_norm": 1.997402714028704, + "learning_rate": 1.1438148657802814e-06, + "loss": 0.4514, + "step": 1463 + }, + { + "ETA": 3.13, + "epoch": 0.47081524360829713, + "fp16_scale": 1.0, + "global_step": 1464, + "grad_norm": 2.0821132050682305, + "learning_rate": 1.1427836321485895e-06, + "loss": 0.3976, + "step": 1464 + }, + { + "ETA": 3.13, + "epoch": 0.47113683872005147, + "fp16_scale": 1.0, + "global_step": 1465, + "grad_norm": 1.9902860827448616, + "learning_rate": 1.14175224349127e-06, + "loss": 0.4509, + "step": 1465 + }, + { + "ETA": 3.13, + "epoch": 0.47145843383180575, + "fp16_scale": 1.0, + "global_step": 1466, + "grad_norm": 2.0287431890460694, + "learning_rate": 1.1407207009281402e-06, + "loss": 0.4753, + "step": 1466 + }, + { + "ETA": 3.12, + "epoch": 0.4717800289435601, + "fp16_scale": 1.0, + "global_step": 1467, + "grad_norm": 2.1874441731605354, + "learning_rate": 1.1396890055791862e-06, + "loss": 0.456, + "step": 1467 + }, + { + "ETA": 3.12, + "epoch": 0.47210162405531436, + "fp16_scale": 1.0, + "global_step": 1468, + "grad_norm": 2.019457884880683, + "learning_rate": 1.1386571585645579e-06, + "loss": 0.3394, + "step": 1468 + }, + { + "ETA": 3.12, + "epoch": 0.47242321916706864, + "fp16_scale": 1.0, + "global_step": 1469, + "grad_norm": 1.9878429225127454, + "learning_rate": 1.137625161004572e-06, + "loss": 0.419, + "step": 1469 + }, + { + "ETA": 3.12, + "epoch": 0.472744814278823, + "fp16_scale": 1.0, + "global_step": 1470, + "grad_norm": 1.8582254062968473, + "learning_rate": 1.1365930140197066e-06, + "loss": 0.4814, + "step": 1470 + }, + { + "ETA": 3.12, + "epoch": 0.47306640939057726, + "fp16_scale": 1.0, + "global_step": 1471, + "grad_norm": 2.2421415923963215, + "learning_rate": 1.1355607187306036e-06, + "loss": 0.3864, + "step": 1471 + }, + { + "ETA": 3.11, + "epoch": 0.47338800450233154, + "fp16_scale": 1.0, + "global_step": 1472, + "grad_norm": 2.0632495122510583, + "learning_rate": 1.1345282762580649e-06, + "loss": 0.419, + "step": 1472 + }, + { + "ETA": 3.11, + "epoch": 0.4737095996140859, + "fp16_scale": 1.0, + "global_step": 1473, + "grad_norm": 1.9216519384670905, + "learning_rate": 1.1334956877230527e-06, + "loss": 0.4142, + "step": 1473 + }, + { + "ETA": 3.11, + "epoch": 0.47403119472584015, + "fp16_scale": 1.0, + "global_step": 1474, + "grad_norm": 1.9825823580110198, + "learning_rate": 1.132462954246688e-06, + "loss": 0.4102, + "step": 1474 + }, + { + "ETA": 3.11, + "epoch": 0.4743527898375945, + "fp16_scale": 1.0, + "global_step": 1475, + "grad_norm": 1.9049563681116846, + "learning_rate": 1.1314300769502485e-06, + "loss": 0.4553, + "step": 1475 + }, + { + "ETA": 3.11, + "epoch": 0.47467438494934877, + "fp16_scale": 1.0, + "global_step": 1476, + "grad_norm": 1.9691137154721197, + "learning_rate": 1.130397056955169e-06, + "loss": 0.4579, + "step": 1476 + }, + { + "ETA": 3.1, + "epoch": 0.47499598006110305, + "fp16_scale": 1.0, + "global_step": 1477, + "grad_norm": 2.1773439122015863, + "learning_rate": 1.1293638953830378e-06, + "loss": 0.3821, + "step": 1477 + }, + { + "ETA": 3.1, + "epoch": 0.4753175751728574, + "fp16_scale": 1.0, + "global_step": 1478, + "grad_norm": 1.910864823603078, + "learning_rate": 1.1283305933555984e-06, + "loss": 0.4269, + "step": 1478 + }, + { + "ETA": 3.1, + "epoch": 0.47563917028461167, + "fp16_scale": 1.0, + "global_step": 1479, + "grad_norm": 2.045515841306016, + "learning_rate": 1.1272971519947457e-06, + "loss": 0.4844, + "step": 1479 + }, + { + "ETA": 3.1, + "epoch": 0.47596076539636595, + "fp16_scale": 1.0, + "global_step": 1480, + "grad_norm": 1.8911679526676894, + "learning_rate": 1.126263572422527e-06, + "loss": 0.4142, + "step": 1480 + }, + { + "ETA": 3.1, + "epoch": 0.4762823605081203, + "fp16_scale": 1.0, + "global_step": 1481, + "grad_norm": 1.9845520254385136, + "learning_rate": 1.1252298557611383e-06, + "loss": 0.456, + "step": 1481 + }, + { + "ETA": 3.1, + "epoch": 0.47660395561987456, + "fp16_scale": 1.0, + "global_step": 1482, + "grad_norm": 1.908810677619066, + "learning_rate": 1.124196003132926e-06, + "loss": 0.508, + "step": 1482 + }, + { + "ETA": 3.09, + "epoch": 0.4769255507316289, + "fp16_scale": 1.0, + "global_step": 1483, + "grad_norm": 1.771266096955992, + "learning_rate": 1.1231620156603823e-06, + "loss": 0.4114, + "step": 1483 + }, + { + "ETA": 3.09, + "epoch": 0.4772471458433832, + "fp16_scale": 1.0, + "global_step": 1484, + "grad_norm": 2.2235534069634197, + "learning_rate": 1.1221278944661472e-06, + "loss": 0.4922, + "step": 1484 + }, + { + "ETA": 3.09, + "epoch": 0.47756874095513746, + "fp16_scale": 1.0, + "global_step": 1485, + "grad_norm": 1.868268949487213, + "learning_rate": 1.1210936406730058e-06, + "loss": 0.321, + "step": 1485 + }, + { + "ETA": 3.09, + "epoch": 0.4778903360668918, + "fp16_scale": 1.0, + "global_step": 1486, + "grad_norm": 2.1301065625680717, + "learning_rate": 1.1200592554038865e-06, + "loss": 0.425, + "step": 1486 + }, + { + "ETA": 3.09, + "epoch": 0.4782119311786461, + "fp16_scale": 1.0, + "global_step": 1487, + "grad_norm": 2.1376344932652596, + "learning_rate": 1.1190247397818606e-06, + "loss": 0.4601, + "step": 1487 + }, + { + "ETA": 3.08, + "epoch": 0.4785335262904004, + "fp16_scale": 1.0, + "global_step": 1488, + "grad_norm": 1.8644548869345658, + "learning_rate": 1.1179900949301417e-06, + "loss": 0.4582, + "step": 1488 + }, + { + "ETA": 3.08, + "epoch": 0.4788551214021547, + "fp16_scale": 1.0, + "global_step": 1489, + "grad_norm": 2.064205355643897, + "learning_rate": 1.1169553219720826e-06, + "loss": 0.3806, + "step": 1489 + }, + { + "ETA": 3.08, + "epoch": 0.479176716513909, + "fp16_scale": 1.0, + "global_step": 1490, + "grad_norm": 1.9404339758716016, + "learning_rate": 1.1159204220311756e-06, + "loss": 0.5099, + "step": 1490 + }, + { + "ETA": 3.08, + "epoch": 0.4794983116256633, + "fp16_scale": 1.0, + "global_step": 1491, + "grad_norm": 1.750338191909074, + "learning_rate": 1.1148853962310516e-06, + "loss": 0.396, + "step": 1491 + }, + { + "ETA": 3.08, + "epoch": 0.4798199067374176, + "fp16_scale": 1.0, + "global_step": 1492, + "grad_norm": 1.8108943152358827, + "learning_rate": 1.113850245695477e-06, + "loss": 0.4023, + "step": 1492 + }, + { + "ETA": 3.07, + "epoch": 0.48014150184917187, + "fp16_scale": 1.0, + "global_step": 1493, + "grad_norm": 2.205008750952935, + "learning_rate": 1.1128149715483547e-06, + "loss": 0.3603, + "step": 1493 + }, + { + "ETA": 3.07, + "epoch": 0.4804630969609262, + "fp16_scale": 1.0, + "global_step": 1494, + "grad_norm": 1.9074915275539868, + "learning_rate": 1.1117795749137206e-06, + "loss": 0.4314, + "step": 1494 + }, + { + "ETA": 3.07, + "epoch": 0.4807846920726805, + "fp16_scale": 1.0, + "global_step": 1495, + "grad_norm": 2.0833759204535114, + "learning_rate": 1.1107440569157444e-06, + "loss": 0.3921, + "step": 1495 + }, + { + "ETA": 3.07, + "epoch": 0.4811062871844348, + "fp16_scale": 1.0, + "global_step": 1496, + "grad_norm": 2.158830571179874, + "learning_rate": 1.109708418678728e-06, + "loss": 0.3662, + "step": 1496 + }, + { + "ETA": 3.07, + "epoch": 0.4814278822961891, + "fp16_scale": 1.0, + "global_step": 1497, + "grad_norm": 1.9519051946568877, + "learning_rate": 1.1086726613271028e-06, + "loss": 0.4709, + "step": 1497 + }, + { + "ETA": 3.07, + "epoch": 0.4817494774079434, + "fp16_scale": 1.0, + "global_step": 1498, + "grad_norm": 1.8926894207331328, + "learning_rate": 1.1076367859854302e-06, + "loss": 0.4197, + "step": 1498 + }, + { + "ETA": 3.06, + "epoch": 0.4820710725196977, + "fp16_scale": 1.0, + "global_step": 1499, + "grad_norm": 1.817159057438267, + "learning_rate": 1.1066007937783995e-06, + "loss": 0.397, + "step": 1499 + }, + { + "ETA": 3.06, + "epoch": 0.482392667631452, + "fp16_scale": 1.0, + "global_step": 1500, + "grad_norm": 1.8333475714240695, + "learning_rate": 1.1055646858308264e-06, + "loss": 0.3492, + "step": 1500 + }, + { + "ETA": 3.06, + "epoch": 0.4827142627432063, + "fp16_scale": 1.0, + "global_step": 1501, + "grad_norm": 1.9852329828822428, + "learning_rate": 1.1045284632676535e-06, + "loss": 0.4314, + "step": 1501 + }, + { + "ETA": 3.06, + "epoch": 0.4830358578549606, + "fp16_scale": 1.0, + "global_step": 1502, + "grad_norm": 2.0970325834651855, + "learning_rate": 1.1034921272139466e-06, + "loss": 0.4529, + "step": 1502 + }, + { + "ETA": 3.06, + "epoch": 0.4833574529667149, + "fp16_scale": 1.0, + "global_step": 1503, + "grad_norm": 1.6668443903297765, + "learning_rate": 1.1024556787948955e-06, + "loss": 0.3715, + "step": 1503 + }, + { + "ETA": 3.05, + "epoch": 0.48367904807846923, + "fp16_scale": 1.0, + "global_step": 1504, + "grad_norm": 1.7112833732495047, + "learning_rate": 1.1014191191358117e-06, + "loss": 0.438, + "step": 1504 + }, + { + "ETA": 3.05, + "epoch": 0.4840006431902235, + "fp16_scale": 1.0, + "global_step": 1505, + "grad_norm": 2.1462971318756345, + "learning_rate": 1.1003824493621274e-06, + "loss": 0.4878, + "step": 1505 + }, + { + "ETA": 3.05, + "epoch": 0.4843222383019778, + "fp16_scale": 1.0, + "global_step": 1506, + "grad_norm": 2.0376448441760284, + "learning_rate": 1.0993456705993945e-06, + "loss": 0.4611, + "step": 1506 + }, + { + "ETA": 3.05, + "epoch": 0.48464383341373213, + "fp16_scale": 1.0, + "global_step": 1507, + "grad_norm": 2.0584199331524005, + "learning_rate": 1.0983087839732831e-06, + "loss": 0.4487, + "step": 1507 + }, + { + "ETA": 3.05, + "epoch": 0.4849654285254864, + "fp16_scale": 1.0, + "global_step": 1508, + "grad_norm": 1.9937428681748068, + "learning_rate": 1.0972717906095808e-06, + "loss": 0.412, + "step": 1508 + }, + { + "ETA": 3.05, + "epoch": 0.4852870236372407, + "fp16_scale": 1.0, + "global_step": 1509, + "grad_norm": 1.9568850624984397, + "learning_rate": 1.0962346916341902e-06, + "loss": 0.4019, + "step": 1509 + }, + { + "ETA": 3.04, + "epoch": 0.485608618748995, + "fp16_scale": 1.0, + "global_step": 1510, + "grad_norm": 1.86885155304055, + "learning_rate": 1.0951974881731298e-06, + "loss": 0.367, + "step": 1510 + }, + { + "ETA": 3.04, + "epoch": 0.4859302138607493, + "fp16_scale": 1.0, + "global_step": 1511, + "grad_norm": 1.995223230031954, + "learning_rate": 1.0941601813525308e-06, + "loss": 0.3998, + "step": 1511 + }, + { + "ETA": 3.04, + "epoch": 0.48625180897250364, + "fp16_scale": 1.0, + "global_step": 1512, + "grad_norm": 1.9989087916661044, + "learning_rate": 1.0931227722986363e-06, + "loss": 0.3847, + "step": 1512 + }, + { + "ETA": 3.04, + "epoch": 0.4865734040842579, + "fp16_scale": 1.0, + "global_step": 1513, + "grad_norm": 2.6041634816230093, + "learning_rate": 1.092085262137801e-06, + "loss": 0.4038, + "step": 1513 + }, + { + "ETA": 3.04, + "epoch": 0.4868949991960122, + "fp16_scale": 1.0, + "global_step": 1514, + "grad_norm": 2.0229525051667463, + "learning_rate": 1.0910476519964895e-06, + "loss": 0.4329, + "step": 1514 + }, + { + "ETA": 3.03, + "epoch": 0.48721659430776654, + "fp16_scale": 1.0, + "global_step": 1515, + "grad_norm": 1.7548148877659235, + "learning_rate": 1.0900099430012743e-06, + "loss": 0.3799, + "step": 1515 + }, + { + "ETA": 3.03, + "epoch": 0.4875381894195208, + "fp16_scale": 1.0, + "global_step": 1516, + "grad_norm": 1.9718031174852972, + "learning_rate": 1.088972136278836e-06, + "loss": 0.4295, + "step": 1516 + }, + { + "ETA": 3.03, + "epoch": 0.4878597845312751, + "fp16_scale": 1.0, + "global_step": 1517, + "grad_norm": 1.8079149325138577, + "learning_rate": 1.0879342329559602e-06, + "loss": 0.3546, + "step": 1517 + }, + { + "ETA": 3.03, + "epoch": 0.48818137964302943, + "fp16_scale": 1.0, + "global_step": 1518, + "grad_norm": 1.827517518365253, + "learning_rate": 1.0868962341595387e-06, + "loss": 0.3784, + "step": 1518 + }, + { + "ETA": 3.03, + "epoch": 0.4885029747547837, + "fp16_scale": 1.0, + "global_step": 1519, + "grad_norm": 1.9489494910458018, + "learning_rate": 1.085858141016566e-06, + "loss": 0.4402, + "step": 1519 + }, + { + "ETA": 3.03, + "epoch": 0.48882456986653805, + "fp16_scale": 1.0, + "global_step": 1520, + "grad_norm": 1.9675619743928485, + "learning_rate": 1.0848199546541391e-06, + "loss": 0.4943, + "step": 1520 + }, + { + "ETA": 3.02, + "epoch": 0.48914616497829233, + "fp16_scale": 1.0, + "global_step": 1521, + "grad_norm": 1.803594362549338, + "learning_rate": 1.0837816761994575e-06, + "loss": 0.4484, + "step": 1521 + }, + { + "ETA": 3.02, + "epoch": 0.4894677600900466, + "fp16_scale": 1.0, + "global_step": 1522, + "grad_norm": 1.9153258150997852, + "learning_rate": 1.082743306779819e-06, + "loss": 0.4015, + "step": 1522 + }, + { + "ETA": 3.02, + "epoch": 0.48978935520180095, + "fp16_scale": 1.0, + "global_step": 1523, + "grad_norm": 1.920659753656018, + "learning_rate": 1.0817048475226202e-06, + "loss": 0.4181, + "step": 1523 + }, + { + "ETA": 3.02, + "epoch": 0.4901109503135552, + "fp16_scale": 1.0, + "global_step": 1524, + "grad_norm": 1.8967381684793498, + "learning_rate": 1.080666299555357e-06, + "loss": 0.3888, + "step": 1524 + }, + { + "ETA": 3.02, + "epoch": 0.49043254542530956, + "fp16_scale": 1.0, + "global_step": 1525, + "grad_norm": 2.0195313420369074, + "learning_rate": 1.0796276640056198e-06, + "loss": 0.4935, + "step": 1525 + }, + { + "ETA": 3.02, + "epoch": 0.49075414053706384, + "fp16_scale": 1.0, + "global_step": 1526, + "grad_norm": 1.9326539942795364, + "learning_rate": 1.078588942001095e-06, + "loss": 0.4029, + "step": 1526 + }, + { + "ETA": 3.01, + "epoch": 0.4910757356488181, + "fp16_scale": 1.0, + "global_step": 1527, + "grad_norm": 1.7900283969233834, + "learning_rate": 1.0775501346695628e-06, + "loss": 0.4482, + "step": 1527 + }, + { + "ETA": 3.01, + "epoch": 0.49139733076057246, + "fp16_scale": 1.0, + "global_step": 1528, + "grad_norm": 1.840014114429064, + "learning_rate": 1.0765112431388954e-06, + "loss": 0.343, + "step": 1528 + }, + { + "ETA": 3.01, + "epoch": 0.49171892587232674, + "fp16_scale": 1.0, + "global_step": 1529, + "grad_norm": 1.9446197672831362, + "learning_rate": 1.0754722685370571e-06, + "loss": 0.4071, + "step": 1529 + }, + { + "ETA": 3.01, + "epoch": 0.492040520984081, + "fp16_scale": 1.0, + "global_step": 1530, + "grad_norm": 2.0102995975591575, + "learning_rate": 1.0744332119921027e-06, + "loss": 0.4499, + "step": 1530 + }, + { + "ETA": 3.01, + "epoch": 0.49236211609583536, + "fp16_scale": 1.0, + "global_step": 1531, + "grad_norm": 2.7472059083687204, + "learning_rate": 1.0733940746321746e-06, + "loss": 0.5255, + "step": 1531 + }, + { + "ETA": 3.0, + "epoch": 0.49268371120758964, + "fp16_scale": 1.0, + "global_step": 1532, + "grad_norm": 1.8943751253703869, + "learning_rate": 1.0723548575855045e-06, + "loss": 0.4162, + "step": 1532 + }, + { + "ETA": 3.0, + "epoch": 0.49300530631934397, + "fp16_scale": 1.0, + "global_step": 1533, + "grad_norm": 1.81240543059852, + "learning_rate": 1.07131556198041e-06, + "loss": 0.4207, + "step": 1533 + }, + { + "ETA": 3.0, + "epoch": 0.49332690143109825, + "fp16_scale": 1.0, + "global_step": 1534, + "grad_norm": 2.2554017559698076, + "learning_rate": 1.070276188945293e-06, + "loss": 0.3687, + "step": 1534 + }, + { + "ETA": 3.0, + "epoch": 0.49364849654285253, + "fp16_scale": 1.0, + "global_step": 1535, + "grad_norm": 1.8460626884530826, + "learning_rate": 1.0692367396086413e-06, + "loss": 0.432, + "step": 1535 + }, + { + "ETA": 3.0, + "epoch": 0.49397009165460687, + "fp16_scale": 1.0, + "global_step": 1536, + "grad_norm": 2.018607800130137, + "learning_rate": 1.0681972150990245e-06, + "loss": 0.4588, + "step": 1536 + }, + { + "ETA": 2.99, + "epoch": 0.49429168676636115, + "fp16_scale": 1.0, + "global_step": 1537, + "grad_norm": 2.003308828023504, + "learning_rate": 1.0671576165450934e-06, + "loss": 0.3649, + "step": 1537 + }, + { + "ETA": 2.99, + "epoch": 0.49461328187811543, + "fp16_scale": 1.0, + "global_step": 1538, + "grad_norm": 2.138620941424885, + "learning_rate": 1.0661179450755804e-06, + "loss": 0.4455, + "step": 1538 + }, + { + "ETA": 2.99, + "epoch": 0.49493487698986977, + "fp16_scale": 1.0, + "global_step": 1539, + "grad_norm": 2.008499212054112, + "learning_rate": 1.065078201819296e-06, + "loss": 0.4251, + "step": 1539 + }, + { + "ETA": 2.99, + "epoch": 0.49525647210162405, + "fp16_scale": 1.0, + "global_step": 1540, + "grad_norm": 2.082974182783928, + "learning_rate": 1.0640383879051294e-06, + "loss": 0.4122, + "step": 1540 + }, + { + "ETA": 2.99, + "epoch": 0.4955780672133784, + "fp16_scale": 1.0, + "global_step": 1541, + "grad_norm": 2.0246456266267274, + "learning_rate": 1.0629985044620458e-06, + "loss": 0.4179, + "step": 1541 + }, + { + "ETA": 2.98, + "epoch": 0.49589966232513266, + "fp16_scale": 1.0, + "global_step": 1542, + "grad_norm": 1.7869547994218695, + "learning_rate": 1.0619585526190864e-06, + "loss": 0.4072, + "step": 1542 + }, + { + "ETA": 2.98, + "epoch": 0.49622125743688694, + "fp16_scale": 1.0, + "global_step": 1543, + "grad_norm": 1.9121098432741022, + "learning_rate": 1.0609185335053668e-06, + "loss": 0.3966, + "step": 1543 + }, + { + "ETA": 2.98, + "epoch": 0.4965428525486413, + "fp16_scale": 1.0, + "global_step": 1544, + "grad_norm": 1.9761616742372048, + "learning_rate": 1.059878448250075e-06, + "loss": 0.4623, + "step": 1544 + }, + { + "ETA": 2.98, + "epoch": 0.49686444766039556, + "fp16_scale": 1.0, + "global_step": 1545, + "grad_norm": 1.9375219244560744, + "learning_rate": 1.0588382979824712e-06, + "loss": 0.4625, + "step": 1545 + }, + { + "ETA": 2.98, + "epoch": 0.49718604277214984, + "fp16_scale": 1.0, + "global_step": 1546, + "grad_norm": 1.8897575358362109, + "learning_rate": 1.0577980838318865e-06, + "loss": 0.4216, + "step": 1546 + }, + { + "ETA": 2.97, + "epoch": 0.4975076378839042, + "fp16_scale": 1.0, + "global_step": 1547, + "grad_norm": 1.9459334857630748, + "learning_rate": 1.0567578069277207e-06, + "loss": 0.4695, + "step": 1547 + }, + { + "ETA": 2.97, + "epoch": 0.49782923299565845, + "fp16_scale": 1.0, + "global_step": 1548, + "grad_norm": 2.1765654533864325, + "learning_rate": 1.055717468399442e-06, + "loss": 0.4074, + "step": 1548 + }, + { + "ETA": 2.97, + "epoch": 0.4981508281074128, + "fp16_scale": 1.0, + "global_step": 1549, + "grad_norm": 1.9986088515556024, + "learning_rate": 1.054677069376586e-06, + "loss": 0.4243, + "step": 1549 + }, + { + "ETA": 2.97, + "epoch": 0.49847242321916707, + "fp16_scale": 1.0, + "global_step": 1550, + "grad_norm": 2.0001655689294786, + "learning_rate": 1.053636610988753e-06, + "loss": 0.4046, + "step": 1550 + }, + { + "ETA": 2.97, + "epoch": 0.49879401833092135, + "fp16_scale": 1.0, + "global_step": 1551, + "grad_norm": 2.030441048637085, + "learning_rate": 1.0525960943656088e-06, + "loss": 0.3804, + "step": 1551 + }, + { + "ETA": 2.97, + "epoch": 0.4991156134426757, + "fp16_scale": 1.0, + "global_step": 1552, + "grad_norm": 1.9160403972161872, + "learning_rate": 1.0515555206368813e-06, + "loss": 0.4582, + "step": 1552 + }, + { + "ETA": 2.96, + "epoch": 0.49943720855442997, + "fp16_scale": 1.0, + "global_step": 1553, + "grad_norm": 1.9847974263885124, + "learning_rate": 1.0505148909323615e-06, + "loss": 0.4201, + "step": 1553 + }, + { + "ETA": 2.96, + "epoch": 0.49975880366618425, + "fp16_scale": 1.0, + "global_step": 1554, + "grad_norm": 1.8887130403081902, + "learning_rate": 1.0494742063819008e-06, + "loss": 0.3822, + "step": 1554 + }, + { + "ETA": 2.96, + "epoch": 0.5000803987779385, + "fp16_scale": 1.0, + "global_step": 1555, + "grad_norm": 1.8142692983493474, + "learning_rate": 1.04843346811541e-06, + "loss": 0.3874, + "step": 1555 + }, + { + "ETA": 2.96, + "epoch": 0.5004019938896929, + "fp16_scale": 1.0, + "global_step": 1556, + "grad_norm": 2.199494217948949, + "learning_rate": 1.047392677262858e-06, + "loss": 0.4794, + "step": 1556 + }, + { + "ETA": 2.96, + "epoch": 0.5007235890014472, + "fp16_scale": 1.0, + "global_step": 1557, + "grad_norm": 1.991244890119865, + "learning_rate": 1.0463518349542712e-06, + "loss": 0.4124, + "step": 1557 + }, + { + "ETA": 2.95, + "epoch": 0.5010451841132014, + "fp16_scale": 1.0, + "global_step": 1558, + "grad_norm": 1.9890509024092522, + "learning_rate": 1.0453109423197317e-06, + "loss": 0.3856, + "step": 1558 + }, + { + "ETA": 2.95, + "epoch": 0.5013667792249558, + "fp16_scale": 1.0, + "global_step": 1559, + "grad_norm": 2.1346695450030393, + "learning_rate": 1.0442700004893764e-06, + "loss": 0.4418, + "step": 1559 + }, + { + "ETA": 2.95, + "epoch": 0.5016883743367101, + "fp16_scale": 1.0, + "global_step": 1560, + "grad_norm": 1.8954676936171913, + "learning_rate": 1.0432290105933955e-06, + "loss": 0.4071, + "step": 1560 + }, + { + "ETA": 2.95, + "epoch": 0.5020099694484644, + "fp16_scale": 1.0, + "global_step": 1561, + "grad_norm": 1.8429515163232004, + "learning_rate": 1.0421879737620311e-06, + "loss": 0.3978, + "step": 1561 + }, + { + "ETA": 2.95, + "epoch": 0.5023315645602187, + "fp16_scale": 1.0, + "global_step": 1562, + "grad_norm": 1.8811312041106818, + "learning_rate": 1.041146891125577e-06, + "loss": 0.4292, + "step": 1562 + }, + { + "ETA": 2.95, + "epoch": 0.502653159671973, + "fp16_scale": 1.0, + "global_step": 1563, + "grad_norm": 2.149633377125283, + "learning_rate": 1.0401057638143759e-06, + "loss": 0.4273, + "step": 1563 + }, + { + "ETA": 2.94, + "epoch": 0.5029747547837273, + "fp16_scale": 1.0, + "global_step": 1564, + "grad_norm": 2.118221507675948, + "learning_rate": 1.0390645929588195e-06, + "loss": 0.4255, + "step": 1564 + }, + { + "ETA": 2.94, + "epoch": 0.5032963498954816, + "fp16_scale": 1.0, + "global_step": 1565, + "grad_norm": 2.108264126810423, + "learning_rate": 1.0380233796893464e-06, + "loss": 0.4181, + "step": 1565 + }, + { + "ETA": 2.94, + "epoch": 0.5036179450072359, + "fp16_scale": 1.0, + "global_step": 1566, + "grad_norm": 2.0316887594871957, + "learning_rate": 1.0369821251364418e-06, + "loss": 0.453, + "step": 1566 + }, + { + "ETA": 2.94, + "epoch": 0.5039395401189902, + "fp16_scale": 1.0, + "global_step": 1567, + "grad_norm": 2.3010183383648437, + "learning_rate": 1.0359408304306358e-06, + "loss": 0.445, + "step": 1567 + }, + { + "ETA": 2.94, + "epoch": 0.5042611352307445, + "fp16_scale": 1.0, + "global_step": 1568, + "grad_norm": 1.99798364861865, + "learning_rate": 1.034899496702501e-06, + "loss": 0.363, + "step": 1568 + }, + { + "ETA": 2.93, + "epoch": 0.5045827303424988, + "fp16_scale": 1.0, + "global_step": 1569, + "grad_norm": 1.93397754256817, + "learning_rate": 1.0338581250826535e-06, + "loss": 0.3803, + "step": 1569 + }, + { + "ETA": 2.93, + "epoch": 0.5049043254542531, + "fp16_scale": 1.0, + "global_step": 1570, + "grad_norm": 2.0151514240646855, + "learning_rate": 1.0328167167017498e-06, + "loss": 0.3669, + "step": 1570 + }, + { + "ETA": 2.93, + "epoch": 0.5052259205660073, + "fp16_scale": 1.0, + "global_step": 1571, + "grad_norm": 1.9817707611036037, + "learning_rate": 1.031775272690487e-06, + "loss": 0.5408, + "step": 1571 + }, + { + "ETA": 2.93, + "epoch": 0.5055475156777617, + "fp16_scale": 1.0, + "global_step": 1572, + "grad_norm": 1.8303409444087517, + "learning_rate": 1.0307337941796003e-06, + "loss": 0.3755, + "step": 1572 + }, + { + "ETA": 2.92, + "epoch": 0.505869110789516, + "fp16_scale": 1.0, + "global_step": 1573, + "grad_norm": 2.2494160842599698, + "learning_rate": 1.029692282299863e-06, + "loss": 0.3777, + "step": 1573 + }, + { + "ETA": 2.92, + "epoch": 0.5061907059012704, + "fp16_scale": 1.0, + "global_step": 1574, + "grad_norm": 1.988669734406121, + "learning_rate": 1.0286507381820837e-06, + "loss": 0.3368, + "step": 1574 + }, + { + "ETA": 2.92, + "epoch": 0.5065123010130246, + "fp16_scale": 1.0, + "global_step": 1575, + "grad_norm": 1.9620238986626437, + "learning_rate": 1.0276091629571067e-06, + "loss": 0.4226, + "step": 1575 + }, + { + "ETA": 2.92, + "epoch": 0.5068338961247789, + "fp16_scale": 1.0, + "global_step": 1576, + "grad_norm": 1.9486692517008308, + "learning_rate": 1.0265675577558098e-06, + "loss": 0.4111, + "step": 1576 + }, + { + "ETA": 2.92, + "epoch": 0.5071554912365333, + "fp16_scale": 1.0, + "global_step": 1577, + "grad_norm": 1.8619620960497394, + "learning_rate": 1.0255259237091037e-06, + "loss": 0.4338, + "step": 1577 + }, + { + "ETA": 2.92, + "epoch": 0.5074770863482875, + "fp16_scale": 1.0, + "global_step": 1578, + "grad_norm": 2.049798226514897, + "learning_rate": 1.02448426194793e-06, + "loss": 0.4825, + "step": 1578 + }, + { + "ETA": 2.91, + "epoch": 0.5077986814600418, + "fp16_scale": 1.0, + "global_step": 1579, + "grad_norm": 1.847056120246811, + "learning_rate": 1.0234425736032607e-06, + "loss": 0.4025, + "step": 1579 + }, + { + "ETA": 2.91, + "epoch": 0.5081202765717961, + "fp16_scale": 1.0, + "global_step": 1580, + "grad_norm": 1.8976665048301893, + "learning_rate": 1.022400859806096e-06, + "loss": 0.4233, + "step": 1580 + }, + { + "ETA": 2.91, + "epoch": 0.5084418716835504, + "fp16_scale": 1.0, + "global_step": 1581, + "grad_norm": 2.0290085809447476, + "learning_rate": 1.0213591216874646e-06, + "loss": 0.4502, + "step": 1581 + }, + { + "ETA": 2.91, + "epoch": 0.5087634667953047, + "fp16_scale": 1.0, + "global_step": 1582, + "grad_norm": 1.9375355634273952, + "learning_rate": 1.0203173603784216e-06, + "loss": 0.4502, + "step": 1582 + }, + { + "ETA": 2.91, + "epoch": 0.509085061907059, + "fp16_scale": 1.0, + "global_step": 1583, + "grad_norm": 2.14685888000696, + "learning_rate": 1.0192755770100466e-06, + "loss": 0.4561, + "step": 1583 + }, + { + "ETA": 2.9, + "epoch": 0.5094066570188133, + "fp16_scale": 1.0, + "global_step": 1584, + "grad_norm": 2.0901680388439803, + "learning_rate": 1.0182337727134429e-06, + "loss": 0.4819, + "step": 1584 + }, + { + "ETA": 2.9, + "epoch": 0.5097282521305676, + "fp16_scale": 1.0, + "global_step": 1585, + "grad_norm": 2.079279503569149, + "learning_rate": 1.0171919486197384e-06, + "loss": 0.4494, + "step": 1585 + }, + { + "ETA": 2.9, + "epoch": 0.5100498472423219, + "fp16_scale": 1.0, + "global_step": 1586, + "grad_norm": 1.9829272829978706, + "learning_rate": 1.0161501058600803e-06, + "loss": 0.5084, + "step": 1586 + }, + { + "ETA": 2.9, + "epoch": 0.5103714423540762, + "fp16_scale": 1.0, + "global_step": 1587, + "grad_norm": 2.262496505148507, + "learning_rate": 1.0151082455656367e-06, + "loss": 0.3741, + "step": 1587 + }, + { + "ETA": 2.9, + "epoch": 0.5106930374658305, + "fp16_scale": 1.0, + "global_step": 1588, + "grad_norm": 1.9231255778101124, + "learning_rate": 1.0140663688675959e-06, + "loss": 0.4202, + "step": 1588 + }, + { + "ETA": 2.89, + "epoch": 0.5110146325775848, + "fp16_scale": 1.0, + "global_step": 1589, + "grad_norm": 2.3576653684054167, + "learning_rate": 1.0130244768971628e-06, + "loss": 0.3904, + "step": 1589 + }, + { + "ETA": 2.89, + "epoch": 0.5113362276893392, + "fp16_scale": 1.0, + "global_step": 1590, + "grad_norm": 1.9822391498742844, + "learning_rate": 1.0119825707855588e-06, + "loss": 0.4375, + "step": 1590 + }, + { + "ETA": 2.89, + "epoch": 0.5116578228010934, + "fp16_scale": 1.0, + "global_step": 1591, + "grad_norm": 2.170209467428759, + "learning_rate": 1.0109406516640212e-06, + "loss": 0.4552, + "step": 1591 + }, + { + "ETA": 2.89, + "epoch": 0.5119794179128477, + "fp16_scale": 1.0, + "global_step": 1592, + "grad_norm": 1.8182302454950556, + "learning_rate": 1.0098987206638014e-06, + "loss": 0.4004, + "step": 1592 + }, + { + "ETA": 2.89, + "epoch": 0.5123010130246021, + "fp16_scale": 1.0, + "global_step": 1593, + "grad_norm": 2.2742069246283583, + "learning_rate": 1.0088567789161637e-06, + "loss": 0.4123, + "step": 1593 + }, + { + "ETA": 2.88, + "epoch": 0.5126226081363563, + "fp16_scale": 1.0, + "global_step": 1594, + "grad_norm": 1.8074854603327288, + "learning_rate": 1.0078148275523839e-06, + "loss": 0.4022, + "step": 1594 + }, + { + "ETA": 2.88, + "epoch": 0.5129442032481106, + "fp16_scale": 1.0, + "global_step": 1595, + "grad_norm": 1.8919083330036863, + "learning_rate": 1.006772867703748e-06, + "loss": 0.4717, + "step": 1595 + }, + { + "ETA": 2.88, + "epoch": 0.513265798359865, + "fp16_scale": 1.0, + "global_step": 1596, + "grad_norm": 1.930780179649879, + "learning_rate": 1.0057309005015517e-06, + "loss": 0.4667, + "step": 1596 + }, + { + "ETA": 2.88, + "epoch": 0.5135873934716192, + "fp16_scale": 1.0, + "global_step": 1597, + "grad_norm": 1.8823481611376727, + "learning_rate": 1.0046889270770987e-06, + "loss": 0.4272, + "step": 1597 + }, + { + "ETA": 2.88, + "epoch": 0.5139089885833735, + "fp16_scale": 1.0, + "global_step": 1598, + "grad_norm": 2.085460333805262, + "learning_rate": 1.0036469485616985e-06, + "loss": 0.4407, + "step": 1598 + }, + { + "ETA": 2.87, + "epoch": 0.5142305836951279, + "fp16_scale": 1.0, + "global_step": 1599, + "grad_norm": 2.050585262690548, + "learning_rate": 1.0026049660866675e-06, + "loss": 0.4578, + "step": 1599 + }, + { + "ETA": 2.87, + "epoch": 0.5145521788068821, + "fp16_scale": 1.0, + "global_step": 1600, + "grad_norm": 2.02747949847014, + "learning_rate": 1.001562980783326e-06, + "loss": 0.481, + "step": 1600 + }, + { + "ETA": 2.88, + "epoch": 0.5148737739186364, + "fp16_scale": 1.0, + "global_step": 1601, + "grad_norm": 2.1102242298586003, + "learning_rate": 1.0005209937829962e-06, + "loss": 0.4059, + "step": 1601 + }, + { + "ETA": 2.88, + "epoch": 0.5151953690303908, + "fp16_scale": 1.0, + "global_step": 1602, + "grad_norm": 1.9033299830802899, + "learning_rate": 9.994790062170037e-07, + "loss": 0.3734, + "step": 1602 + }, + { + "ETA": 2.88, + "epoch": 0.515516964142145, + "fp16_scale": 1.0, + "global_step": 1603, + "grad_norm": 2.223860335625996, + "learning_rate": 9.984370192166742e-07, + "loss": 0.4589, + "step": 1603 + }, + { + "ETA": 2.87, + "epoch": 0.5158385592538993, + "fp16_scale": 1.0, + "global_step": 1604, + "grad_norm": 1.91984635987963, + "learning_rate": 9.973950339133322e-07, + "loss": 0.3472, + "step": 1604 + }, + { + "ETA": 2.87, + "epoch": 0.5161601543656537, + "fp16_scale": 1.0, + "global_step": 1605, + "grad_norm": 2.075549344169445, + "learning_rate": 9.963530514383016e-07, + "loss": 0.4624, + "step": 1605 + }, + { + "ETA": 2.87, + "epoch": 0.516481749477408, + "fp16_scale": 1.0, + "global_step": 1606, + "grad_norm": 1.859380458646878, + "learning_rate": 9.953110729229016e-07, + "loss": 0.4281, + "step": 1606 + }, + { + "ETA": 2.87, + "epoch": 0.5168033445891622, + "fp16_scale": 1.0, + "global_step": 1607, + "grad_norm": 2.060696769744449, + "learning_rate": 9.942690994984484e-07, + "loss": 0.4407, + "step": 1607 + }, + { + "ETA": 2.87, + "epoch": 0.5171249397009166, + "fp16_scale": 1.0, + "global_step": 1608, + "grad_norm": 1.7215868508271845, + "learning_rate": 9.932271322962521e-07, + "loss": 0.4237, + "step": 1608 + }, + { + "ETA": 2.86, + "epoch": 0.5174465348126709, + "fp16_scale": 1.0, + "global_step": 1609, + "grad_norm": 1.8170851637621381, + "learning_rate": 9.921851724476158e-07, + "loss": 0.3627, + "step": 1609 + }, + { + "ETA": 2.86, + "epoch": 0.5177681299244251, + "fp16_scale": 1.0, + "global_step": 1610, + "grad_norm": 2.080001494718728, + "learning_rate": 9.911432210838363e-07, + "loss": 0.3805, + "step": 1610 + }, + { + "ETA": 2.86, + "epoch": 0.5180897250361794, + "fp16_scale": 1.0, + "global_step": 1611, + "grad_norm": 2.2985486698660007, + "learning_rate": 9.901012793361985e-07, + "loss": 0.3878, + "step": 1611 + }, + { + "ETA": 2.86, + "epoch": 0.5184113201479338, + "fp16_scale": 1.0, + "global_step": 1612, + "grad_norm": 2.5086366966372684, + "learning_rate": 9.890593483359787e-07, + "loss": 0.488, + "step": 1612 + }, + { + "ETA": 2.85, + "epoch": 0.518732915259688, + "fp16_scale": 1.0, + "global_step": 1613, + "grad_norm": 2.010293325396669, + "learning_rate": 9.880174292144416e-07, + "loss": 0.376, + "step": 1613 + }, + { + "ETA": 2.85, + "epoch": 0.5190545103714423, + "fp16_scale": 1.0, + "global_step": 1614, + "grad_norm": 1.8358532866623107, + "learning_rate": 9.869755231028373e-07, + "loss": 0.4112, + "step": 1614 + }, + { + "ETA": 2.85, + "epoch": 0.5193761054831967, + "fp16_scale": 1.0, + "global_step": 1615, + "grad_norm": 2.0705185022399926, + "learning_rate": 9.85933631132404e-07, + "loss": 0.4916, + "step": 1615 + }, + { + "ETA": 2.85, + "epoch": 0.5196977005949509, + "fp16_scale": 1.0, + "global_step": 1616, + "grad_norm": 2.023709420900433, + "learning_rate": 9.848917544343634e-07, + "loss": 0.5352, + "step": 1616 + }, + { + "ETA": 2.85, + "epoch": 0.5200192957067052, + "fp16_scale": 1.0, + "global_step": 1617, + "grad_norm": 1.92094948739541, + "learning_rate": 9.838498941399196e-07, + "loss": 0.4612, + "step": 1617 + }, + { + "ETA": 2.84, + "epoch": 0.5203408908184596, + "fp16_scale": 1.0, + "global_step": 1618, + "grad_norm": 2.1590515829343304, + "learning_rate": 9.828080513802617e-07, + "loss": 0.3923, + "step": 1618 + }, + { + "ETA": 2.84, + "epoch": 0.5206624859302139, + "fp16_scale": 1.0, + "global_step": 1619, + "grad_norm": 2.1403293792241818, + "learning_rate": 9.817662272865568e-07, + "loss": 0.4995, + "step": 1619 + }, + { + "ETA": 2.84, + "epoch": 0.5209840810419681, + "fp16_scale": 1.0, + "global_step": 1620, + "grad_norm": 1.9800035806978962, + "learning_rate": 9.807244229899535e-07, + "loss": 0.4261, + "step": 1620 + }, + { + "ETA": 2.84, + "epoch": 0.5213056761537225, + "fp16_scale": 1.0, + "global_step": 1621, + "grad_norm": 2.3143503505629144, + "learning_rate": 9.796826396215783e-07, + "loss": 0.4136, + "step": 1621 + }, + { + "ETA": 2.84, + "epoch": 0.5216272712654768, + "fp16_scale": 1.0, + "global_step": 1622, + "grad_norm": 2.0303042528874347, + "learning_rate": 9.786408783125353e-07, + "loss": 0.5375, + "step": 1622 + }, + { + "ETA": 2.83, + "epoch": 0.521948866377231, + "fp16_scale": 1.0, + "global_step": 1623, + "grad_norm": 1.7672886436411808, + "learning_rate": 9.775991401939043e-07, + "loss": 0.463, + "step": 1623 + }, + { + "ETA": 2.83, + "epoch": 0.5222704614889854, + "fp16_scale": 1.0, + "global_step": 1624, + "grad_norm": 1.9955680595303593, + "learning_rate": 9.765574263967395e-07, + "loss": 0.3811, + "step": 1624 + }, + { + "ETA": 2.83, + "epoch": 0.5225920566007397, + "fp16_scale": 1.0, + "global_step": 1625, + "grad_norm": 1.84911013814408, + "learning_rate": 9.7551573805207e-07, + "loss": 0.4093, + "step": 1625 + }, + { + "ETA": 2.83, + "epoch": 0.5229136517124939, + "fp16_scale": 1.0, + "global_step": 1626, + "grad_norm": 1.9727405644351568, + "learning_rate": 9.744740762908962e-07, + "loss": 0.4486, + "step": 1626 + }, + { + "ETA": 2.83, + "epoch": 0.5232352468242483, + "fp16_scale": 1.0, + "global_step": 1627, + "grad_norm": 1.8987616481187537, + "learning_rate": 9.734324422441903e-07, + "loss": 0.372, + "step": 1627 + }, + { + "ETA": 2.82, + "epoch": 0.5235568419360026, + "fp16_scale": 1.0, + "global_step": 1628, + "grad_norm": 2.104874268766279, + "learning_rate": 9.723908370428934e-07, + "loss": 0.4792, + "step": 1628 + }, + { + "ETA": 2.82, + "epoch": 0.5238784370477568, + "fp16_scale": 1.0, + "global_step": 1629, + "grad_norm": 2.037931033277432, + "learning_rate": 9.713492618179164e-07, + "loss": 0.4548, + "step": 1629 + }, + { + "ETA": 2.82, + "epoch": 0.5242000321595112, + "fp16_scale": 1.0, + "global_step": 1630, + "grad_norm": 1.9870517845156732, + "learning_rate": 9.703077177001373e-07, + "loss": 0.4648, + "step": 1630 + }, + { + "ETA": 2.82, + "epoch": 0.5245216272712655, + "fp16_scale": 1.0, + "global_step": 1631, + "grad_norm": 1.9429733091548653, + "learning_rate": 9.692662058203996e-07, + "loss": 0.4408, + "step": 1631 + }, + { + "ETA": 2.82, + "epoch": 0.5248432223830197, + "fp16_scale": 1.0, + "global_step": 1632, + "grad_norm": 1.9168587365422025, + "learning_rate": 9.68224727309513e-07, + "loss": 0.5181, + "step": 1632 + }, + { + "ETA": 2.81, + "epoch": 0.5251648174947741, + "fp16_scale": 1.0, + "global_step": 1633, + "grad_norm": 1.824832307942761, + "learning_rate": 9.6718328329825e-07, + "loss": 0.3516, + "step": 1633 + }, + { + "ETA": 2.81, + "epoch": 0.5254864126065284, + "fp16_scale": 1.0, + "global_step": 1634, + "grad_norm": 2.072616623060313, + "learning_rate": 9.661418749173467e-07, + "loss": 0.4084, + "step": 1634 + }, + { + "ETA": 2.81, + "epoch": 0.5258080077182827, + "fp16_scale": 1.0, + "global_step": 1635, + "grad_norm": 1.8799884615276101, + "learning_rate": 9.651005032974993e-07, + "loss": 0.4411, + "step": 1635 + }, + { + "ETA": 2.81, + "epoch": 0.526129602830037, + "fp16_scale": 1.0, + "global_step": 1636, + "grad_norm": 1.8586291089594524, + "learning_rate": 9.640591695693643e-07, + "loss": 0.4822, + "step": 1636 + }, + { + "ETA": 2.81, + "epoch": 0.5264511979417913, + "fp16_scale": 1.0, + "global_step": 1637, + "grad_norm": 1.9918129012674595, + "learning_rate": 9.63017874863558e-07, + "loss": 0.4686, + "step": 1637 + }, + { + "ETA": 2.8, + "epoch": 0.5267727930535456, + "fp16_scale": 1.0, + "global_step": 1638, + "grad_norm": 2.137331264992571, + "learning_rate": 9.619766203106533e-07, + "loss": 0.4136, + "step": 1638 + }, + { + "ETA": 2.8, + "epoch": 0.5270943881652999, + "fp16_scale": 1.0, + "global_step": 1639, + "grad_norm": 2.0635890205604763, + "learning_rate": 9.609354070411806e-07, + "loss": 0.3638, + "step": 1639 + }, + { + "ETA": 2.8, + "epoch": 0.5274159832770542, + "fp16_scale": 1.0, + "global_step": 1640, + "grad_norm": 1.88341098376735, + "learning_rate": 9.598942361856243e-07, + "loss": 0.4678, + "step": 1640 + }, + { + "ETA": 2.8, + "epoch": 0.5277375783888085, + "fp16_scale": 1.0, + "global_step": 1641, + "grad_norm": 2.123834976935097, + "learning_rate": 9.588531088744232e-07, + "loss": 0.3747, + "step": 1641 + }, + { + "ETA": 2.8, + "epoch": 0.5280591735005628, + "fp16_scale": 1.0, + "global_step": 1642, + "grad_norm": 2.039091565213063, + "learning_rate": 9.57812026237969e-07, + "loss": 0.4189, + "step": 1642 + }, + { + "ETA": 2.79, + "epoch": 0.5283807686123171, + "fp16_scale": 1.0, + "global_step": 1643, + "grad_norm": 1.9671824757357255, + "learning_rate": 9.567709894066044e-07, + "loss": 0.3653, + "step": 1643 + }, + { + "ETA": 2.79, + "epoch": 0.5287023637240714, + "fp16_scale": 1.0, + "global_step": 1644, + "grad_norm": 2.3620093742815667, + "learning_rate": 9.557299995106237e-07, + "loss": 0.3726, + "step": 1644 + }, + { + "ETA": 2.79, + "epoch": 0.5290239588358256, + "fp16_scale": 1.0, + "global_step": 1645, + "grad_norm": 1.9395855929320645, + "learning_rate": 9.546890576802684e-07, + "loss": 0.4576, + "step": 1645 + }, + { + "ETA": 2.79, + "epoch": 0.52934555394758, + "fp16_scale": 1.0, + "global_step": 1646, + "grad_norm": 1.8088910978805381, + "learning_rate": 9.536481650457289e-07, + "loss": 0.4414, + "step": 1646 + }, + { + "ETA": 2.79, + "epoch": 0.5296671490593343, + "fp16_scale": 1.0, + "global_step": 1647, + "grad_norm": 2.111357027253241, + "learning_rate": 9.52607322737142e-07, + "loss": 0.4036, + "step": 1647 + }, + { + "ETA": 2.78, + "epoch": 0.5299887441710887, + "fp16_scale": 1.0, + "global_step": 1648, + "grad_norm": 1.8565900369107433, + "learning_rate": 9.515665318845899e-07, + "loss": 0.4302, + "step": 1648 + }, + { + "ETA": 2.78, + "epoch": 0.5303103392828429, + "fp16_scale": 1.0, + "global_step": 1649, + "grad_norm": 2.1938904807691437, + "learning_rate": 9.505257936180991e-07, + "loss": 0.3928, + "step": 1649 + }, + { + "ETA": 2.78, + "epoch": 0.5306319343945972, + "fp16_scale": 1.0, + "global_step": 1650, + "grad_norm": 1.9079779935172827, + "learning_rate": 9.494851090676383e-07, + "loss": 0.3247, + "step": 1650 + }, + { + "ETA": 2.78, + "epoch": 0.5309535295063516, + "fp16_scale": 1.0, + "global_step": 1651, + "grad_norm": 1.890955381906359, + "learning_rate": 9.484444793631186e-07, + "loss": 0.4327, + "step": 1651 + }, + { + "ETA": 2.78, + "epoch": 0.5312751246181058, + "fp16_scale": 1.0, + "global_step": 1652, + "grad_norm": 2.1857793201911364, + "learning_rate": 9.474039056343916e-07, + "loss": 0.405, + "step": 1652 + }, + { + "ETA": 2.77, + "epoch": 0.5315967197298601, + "fp16_scale": 1.0, + "global_step": 1653, + "grad_norm": 2.1612198664743443, + "learning_rate": 9.46363389011247e-07, + "loss": 0.4921, + "step": 1653 + }, + { + "ETA": 2.77, + "epoch": 0.5319183148416144, + "fp16_scale": 1.0, + "global_step": 1654, + "grad_norm": 1.6790166735223522, + "learning_rate": 9.453229306234142e-07, + "loss": 0.4057, + "step": 1654 + }, + { + "ETA": 2.77, + "epoch": 0.5322399099533687, + "fp16_scale": 1.0, + "global_step": 1655, + "grad_norm": 1.81010700995378, + "learning_rate": 9.442825316005579e-07, + "loss": 0.3761, + "step": 1655 + }, + { + "ETA": 2.77, + "epoch": 0.532561505065123, + "fp16_scale": 1.0, + "global_step": 1656, + "grad_norm": 2.0145802715577465, + "learning_rate": 9.432421930722792e-07, + "loss": 0.4328, + "step": 1656 + }, + { + "ETA": 2.77, + "epoch": 0.5328831001768773, + "fp16_scale": 1.0, + "global_step": 1657, + "grad_norm": 1.931093282712056, + "learning_rate": 9.422019161681137e-07, + "loss": 0.5056, + "step": 1657 + }, + { + "ETA": 2.76, + "epoch": 0.5332046952886316, + "fp16_scale": 1.0, + "global_step": 1658, + "grad_norm": 1.9353050895637487, + "learning_rate": 9.411617020175287e-07, + "loss": 0.4093, + "step": 1658 + }, + { + "ETA": 2.76, + "epoch": 0.5335262904003859, + "fp16_scale": 1.0, + "global_step": 1659, + "grad_norm": 2.1583820544387824, + "learning_rate": 9.40121551749925e-07, + "loss": 0.4928, + "step": 1659 + }, + { + "ETA": 2.76, + "epoch": 0.5338478855121402, + "fp16_scale": 1.0, + "global_step": 1660, + "grad_norm": 1.792074199018544, + "learning_rate": 9.390814664946331e-07, + "loss": 0.4454, + "step": 1660 + }, + { + "ETA": 2.76, + "epoch": 0.5341694806238945, + "fp16_scale": 1.0, + "global_step": 1661, + "grad_norm": 1.9358430781906717, + "learning_rate": 9.380414473809136e-07, + "loss": 0.4129, + "step": 1661 + }, + { + "ETA": 2.76, + "epoch": 0.5344910757356488, + "fp16_scale": 1.0, + "global_step": 1662, + "grad_norm": 1.9529546238115019, + "learning_rate": 9.370014955379539e-07, + "loss": 0.4762, + "step": 1662 + }, + { + "ETA": 2.76, + "epoch": 0.5348126708474031, + "fp16_scale": 1.0, + "global_step": 1663, + "grad_norm": 2.2638924144295607, + "learning_rate": 9.359616120948707e-07, + "loss": 0.4251, + "step": 1663 + }, + { + "ETA": 2.75, + "epoch": 0.5351342659591575, + "fp16_scale": 1.0, + "global_step": 1664, + "grad_norm": 1.9813429016738413, + "learning_rate": 9.34921798180704e-07, + "loss": 0.3654, + "step": 1664 + }, + { + "ETA": 2.75, + "epoch": 0.5354558610709117, + "fp16_scale": 1.0, + "global_step": 1665, + "grad_norm": 1.960187197727769, + "learning_rate": 9.338820549244196e-07, + "loss": 0.4539, + "step": 1665 + }, + { + "ETA": 2.75, + "epoch": 0.535777456182666, + "fp16_scale": 1.0, + "global_step": 1666, + "grad_norm": 1.9770406298384058, + "learning_rate": 9.328423834549069e-07, + "loss": 0.4313, + "step": 1666 + }, + { + "ETA": 2.75, + "epoch": 0.5360990512944204, + "fp16_scale": 1.0, + "global_step": 1667, + "grad_norm": 1.9335789732605826, + "learning_rate": 9.318027849009758e-07, + "loss": 0.4187, + "step": 1667 + }, + { + "ETA": 2.75, + "epoch": 0.5364206464061746, + "fp16_scale": 1.0, + "global_step": 1668, + "grad_norm": 2.118099777727274, + "learning_rate": 9.307632603913587e-07, + "loss": 0.4702, + "step": 1668 + }, + { + "ETA": 2.75, + "epoch": 0.5367422415179289, + "fp16_scale": 1.0, + "global_step": 1669, + "grad_norm": 2.038387896608497, + "learning_rate": 9.297238110547074e-07, + "loss": 0.4608, + "step": 1669 + }, + { + "ETA": 2.74, + "epoch": 0.5370638366296833, + "fp16_scale": 1.0, + "global_step": 1670, + "grad_norm": 2.0556956671491324, + "learning_rate": 9.286844380195902e-07, + "loss": 0.4327, + "step": 1670 + }, + { + "ETA": 2.74, + "epoch": 0.5373854317414375, + "fp16_scale": 1.0, + "global_step": 1671, + "grad_norm": 1.9672969510997889, + "learning_rate": 9.276451424144956e-07, + "loss": 0.4308, + "step": 1671 + }, + { + "ETA": 2.74, + "epoch": 0.5377070268531918, + "fp16_scale": 1.0, + "global_step": 1672, + "grad_norm": 1.7491289758749573, + "learning_rate": 9.266059253678254e-07, + "loss": 0.3831, + "step": 1672 + }, + { + "ETA": 2.74, + "epoch": 0.5380286219649462, + "fp16_scale": 1.0, + "global_step": 1673, + "grad_norm": 1.9553234843934735, + "learning_rate": 9.255667880078974e-07, + "loss": 0.386, + "step": 1673 + }, + { + "ETA": 2.74, + "epoch": 0.5383502170767004, + "fp16_scale": 1.0, + "global_step": 1674, + "grad_norm": 1.952435448768448, + "learning_rate": 9.24527731462943e-07, + "loss": 0.4162, + "step": 1674 + }, + { + "ETA": 2.73, + "epoch": 0.5386718121884547, + "fp16_scale": 1.0, + "global_step": 1675, + "grad_norm": 2.2601203770527776, + "learning_rate": 9.234887568611047e-07, + "loss": 0.4065, + "step": 1675 + }, + { + "ETA": 2.73, + "epoch": 0.5389934073002091, + "fp16_scale": 1.0, + "global_step": 1676, + "grad_norm": 1.8281147986268291, + "learning_rate": 9.224498653304375e-07, + "loss": 0.3861, + "step": 1676 + }, + { + "ETA": 2.73, + "epoch": 0.5393150024119633, + "fp16_scale": 1.0, + "global_step": 1677, + "grad_norm": 2.0668405194874073, + "learning_rate": 9.214110579989049e-07, + "loss": 0.4329, + "step": 1677 + }, + { + "ETA": 2.73, + "epoch": 0.5396365975237176, + "fp16_scale": 1.0, + "global_step": 1678, + "grad_norm": 2.0307273211037775, + "learning_rate": 9.203723359943802e-07, + "loss": 0.5069, + "step": 1678 + }, + { + "ETA": 2.73, + "epoch": 0.539958192635472, + "fp16_scale": 1.0, + "global_step": 1679, + "grad_norm": 2.060692868732403, + "learning_rate": 9.193337004446427e-07, + "loss": 0.4973, + "step": 1679 + }, + { + "ETA": 2.72, + "epoch": 0.5402797877472263, + "fp16_scale": 1.0, + "global_step": 1680, + "grad_norm": 1.8842225015216565, + "learning_rate": 9.182951524773797e-07, + "loss": 0.4737, + "step": 1680 + }, + { + "ETA": 2.72, + "epoch": 0.5406013828589805, + "fp16_scale": 1.0, + "global_step": 1681, + "grad_norm": 2.049911978458765, + "learning_rate": 9.172566932201813e-07, + "loss": 0.3778, + "step": 1681 + }, + { + "ETA": 2.72, + "epoch": 0.5409229779707349, + "fp16_scale": 1.0, + "global_step": 1682, + "grad_norm": 2.04391244333585, + "learning_rate": 9.162183238005424e-07, + "loss": 0.4058, + "step": 1682 + }, + { + "ETA": 2.72, + "epoch": 0.5412445730824892, + "fp16_scale": 1.0, + "global_step": 1683, + "grad_norm": 2.1703937688139727, + "learning_rate": 9.151800453458607e-07, + "loss": 0.4195, + "step": 1683 + }, + { + "ETA": 2.72, + "epoch": 0.5415661681942434, + "fp16_scale": 1.0, + "global_step": 1684, + "grad_norm": 2.1288523361524985, + "learning_rate": 9.141418589834339e-07, + "loss": 0.4428, + "step": 1684 + }, + { + "ETA": 2.71, + "epoch": 0.5418877633059977, + "fp16_scale": 1.0, + "global_step": 1685, + "grad_norm": 2.1916580656636095, + "learning_rate": 9.131037658404614e-07, + "loss": 0.4564, + "step": 1685 + }, + { + "ETA": 2.71, + "epoch": 0.5422093584177521, + "fp16_scale": 1.0, + "global_step": 1686, + "grad_norm": 1.8491952040675952, + "learning_rate": 9.120657670440399e-07, + "loss": 0.3739, + "step": 1686 + }, + { + "ETA": 2.71, + "epoch": 0.5425309535295063, + "fp16_scale": 1.0, + "global_step": 1687, + "grad_norm": 1.8070925702546399, + "learning_rate": 9.110278637211642e-07, + "loss": 0.471, + "step": 1687 + }, + { + "ETA": 2.71, + "epoch": 0.5428525486412606, + "fp16_scale": 1.0, + "global_step": 1688, + "grad_norm": 2.057229379965033, + "learning_rate": 9.099900569987259e-07, + "loss": 0.4143, + "step": 1688 + }, + { + "ETA": 2.71, + "epoch": 0.543174143753015, + "fp16_scale": 1.0, + "global_step": 1689, + "grad_norm": 1.9522907381860064, + "learning_rate": 9.089523480035105e-07, + "loss": 0.5105, + "step": 1689 + }, + { + "ETA": 2.71, + "epoch": 0.5434957388647692, + "fp16_scale": 1.0, + "global_step": 1690, + "grad_norm": 1.9540070864190384, + "learning_rate": 9.07914737862199e-07, + "loss": 0.4394, + "step": 1690 + }, + { + "ETA": 2.7, + "epoch": 0.5438173339765235, + "fp16_scale": 1.0, + "global_step": 1691, + "grad_norm": 1.9054142712571198, + "learning_rate": 9.068772277013636e-07, + "loss": 0.4017, + "step": 1691 + }, + { + "ETA": 2.7, + "epoch": 0.5441389290882779, + "fp16_scale": 1.0, + "global_step": 1692, + "grad_norm": 1.8408541861671888, + "learning_rate": 9.058398186474693e-07, + "loss": 0.4782, + "step": 1692 + }, + { + "ETA": 2.7, + "epoch": 0.5444605242000322, + "fp16_scale": 1.0, + "global_step": 1693, + "grad_norm": 2.0129338146570173, + "learning_rate": 9.048025118268703e-07, + "loss": 0.4568, + "step": 1693 + }, + { + "ETA": 2.7, + "epoch": 0.5447821193117864, + "fp16_scale": 1.0, + "global_step": 1694, + "grad_norm": 1.9086720470669631, + "learning_rate": 9.037653083658097e-07, + "loss": 0.479, + "step": 1694 + }, + { + "ETA": 2.7, + "epoch": 0.5451037144235408, + "fp16_scale": 1.0, + "global_step": 1695, + "grad_norm": 1.8780951651815723, + "learning_rate": 9.027282093904194e-07, + "loss": 0.5396, + "step": 1695 + }, + { + "ETA": 2.7, + "epoch": 0.5454253095352951, + "fp16_scale": 1.0, + "global_step": 1696, + "grad_norm": 2.0141600550252567, + "learning_rate": 9.016912160267167e-07, + "loss": 0.4775, + "step": 1696 + }, + { + "ETA": 2.69, + "epoch": 0.5457469046470493, + "fp16_scale": 1.0, + "global_step": 1697, + "grad_norm": 1.9977542792940073, + "learning_rate": 9.006543294006055e-07, + "loss": 0.39, + "step": 1697 + }, + { + "ETA": 2.69, + "epoch": 0.5460684997588037, + "fp16_scale": 1.0, + "global_step": 1698, + "grad_norm": 2.0100574088084437, + "learning_rate": 8.996175506378727e-07, + "loss": 0.4399, + "step": 1698 + }, + { + "ETA": 2.69, + "epoch": 0.546390094870558, + "fp16_scale": 1.0, + "global_step": 1699, + "grad_norm": 1.8985816971991276, + "learning_rate": 8.985808808641883e-07, + "loss": 0.4302, + "step": 1699 + }, + { + "ETA": 2.69, + "epoch": 0.5467116899823122, + "fp16_scale": 1.0, + "global_step": 1700, + "grad_norm": 2.2538143685809047, + "learning_rate": 8.975443212051044e-07, + "loss": 0.4318, + "step": 1700 + }, + { + "ETA": 2.69, + "epoch": 0.5470332850940666, + "fp16_scale": 1.0, + "global_step": 1701, + "grad_norm": 1.8387660186964476, + "learning_rate": 8.965078727860531e-07, + "loss": 0.4254, + "step": 1701 + }, + { + "ETA": 2.68, + "epoch": 0.5473548802058209, + "fp16_scale": 1.0, + "global_step": 1702, + "grad_norm": 1.8393170690306526, + "learning_rate": 8.954715367323466e-07, + "loss": 0.4502, + "step": 1702 + }, + { + "ETA": 2.68, + "epoch": 0.5476764753175751, + "fp16_scale": 1.0, + "global_step": 1703, + "grad_norm": 1.9724573500731621, + "learning_rate": 8.944353141691737e-07, + "loss": 0.4194, + "step": 1703 + }, + { + "ETA": 2.68, + "epoch": 0.5479980704293295, + "fp16_scale": 1.0, + "global_step": 1704, + "grad_norm": 1.8424709457975637, + "learning_rate": 8.933992062216007e-07, + "loss": 0.4868, + "step": 1704 + }, + { + "ETA": 2.68, + "epoch": 0.5483196655410838, + "fp16_scale": 1.0, + "global_step": 1705, + "grad_norm": 1.7973152512620978, + "learning_rate": 8.9236321401457e-07, + "loss": 0.4057, + "step": 1705 + }, + { + "ETA": 2.68, + "epoch": 0.548641260652838, + "fp16_scale": 1.0, + "global_step": 1706, + "grad_norm": 1.9768906189493431, + "learning_rate": 8.913273386728968e-07, + "loss": 0.4456, + "step": 1706 + }, + { + "ETA": 2.67, + "epoch": 0.5489628557645924, + "fp16_scale": 1.0, + "global_step": 1707, + "grad_norm": 2.254650908488793, + "learning_rate": 8.90291581321272e-07, + "loss": 0.3038, + "step": 1707 + }, + { + "ETA": 2.67, + "epoch": 0.5492844508763467, + "fp16_scale": 1.0, + "global_step": 1708, + "grad_norm": 1.900765301423923, + "learning_rate": 8.892559430842554e-07, + "loss": 0.4334, + "step": 1708 + }, + { + "ETA": 2.67, + "epoch": 0.549606045988101, + "fp16_scale": 1.0, + "global_step": 1709, + "grad_norm": 1.957232263003515, + "learning_rate": 8.882204250862795e-07, + "loss": 0.481, + "step": 1709 + }, + { + "ETA": 2.67, + "epoch": 0.5499276410998553, + "fp16_scale": 1.0, + "global_step": 1710, + "grad_norm": 2.0269238406443773, + "learning_rate": 8.871850284516457e-07, + "loss": 0.4925, + "step": 1710 + }, + { + "ETA": 2.67, + "epoch": 0.5502492362116096, + "fp16_scale": 1.0, + "global_step": 1711, + "grad_norm": 2.1005374262104626, + "learning_rate": 8.861497543045229e-07, + "loss": 0.4553, + "step": 1711 + }, + { + "ETA": 2.67, + "epoch": 0.5505708313233639, + "fp16_scale": 1.0, + "global_step": 1712, + "grad_norm": 2.16953459934703, + "learning_rate": 8.851146037689485e-07, + "loss": 0.4664, + "step": 1712 + }, + { + "ETA": 2.66, + "epoch": 0.5508924264351182, + "fp16_scale": 1.0, + "global_step": 1713, + "grad_norm": 1.9012841499281627, + "learning_rate": 8.840795779688242e-07, + "loss": 0.4296, + "step": 1713 + }, + { + "ETA": 2.66, + "epoch": 0.5512140215468725, + "fp16_scale": 1.0, + "global_step": 1714, + "grad_norm": 1.7657029021922768, + "learning_rate": 8.830446780279175e-07, + "loss": 0.3813, + "step": 1714 + }, + { + "ETA": 2.66, + "epoch": 0.5515356166586268, + "fp16_scale": 1.0, + "global_step": 1715, + "grad_norm": 2.0263439287596676, + "learning_rate": 8.820099050698586e-07, + "loss": 0.4914, + "step": 1715 + }, + { + "ETA": 2.66, + "epoch": 0.551857211770381, + "fp16_scale": 1.0, + "global_step": 1716, + "grad_norm": 2.3080256513629274, + "learning_rate": 8.809752602181393e-07, + "loss": 0.437, + "step": 1716 + }, + { + "ETA": 2.66, + "epoch": 0.5521788068821354, + "fp16_scale": 1.0, + "global_step": 1717, + "grad_norm": 2.0337037323895424, + "learning_rate": 8.799407445961137e-07, + "loss": 0.3954, + "step": 1717 + }, + { + "ETA": 2.65, + "epoch": 0.5525004019938897, + "fp16_scale": 1.0, + "global_step": 1718, + "grad_norm": 1.976280328684517, + "learning_rate": 8.78906359326994e-07, + "loss": 0.4357, + "step": 1718 + }, + { + "ETA": 2.65, + "epoch": 0.552821997105644, + "fp16_scale": 1.0, + "global_step": 1719, + "grad_norm": 2.031342436164455, + "learning_rate": 8.778721055338528e-07, + "loss": 0.4661, + "step": 1719 + }, + { + "ETA": 2.65, + "epoch": 0.5531435922173983, + "fp16_scale": 1.0, + "global_step": 1720, + "grad_norm": 2.0845095138524368, + "learning_rate": 8.768379843396177e-07, + "loss": 0.4108, + "step": 1720 + }, + { + "ETA": 2.65, + "epoch": 0.5534651873291526, + "fp16_scale": 1.0, + "global_step": 1721, + "grad_norm": 2.0812859769989847, + "learning_rate": 8.758039968670742e-07, + "loss": 0.5438, + "step": 1721 + }, + { + "ETA": 2.65, + "epoch": 0.5537867824409068, + "fp16_scale": 1.0, + "global_step": 1722, + "grad_norm": 1.9524169274929275, + "learning_rate": 8.747701442388616e-07, + "loss": 0.5052, + "step": 1722 + }, + { + "ETA": 2.65, + "epoch": 0.5541083775526612, + "fp16_scale": 1.0, + "global_step": 1723, + "grad_norm": 1.8445450082133272, + "learning_rate": 8.737364275774729e-07, + "loss": 0.3534, + "step": 1723 + }, + { + "ETA": 2.64, + "epoch": 0.5544299726644155, + "fp16_scale": 1.0, + "global_step": 1724, + "grad_norm": 1.8526979368638332, + "learning_rate": 8.727028480052543e-07, + "loss": 0.438, + "step": 1724 + }, + { + "ETA": 2.64, + "epoch": 0.5547515677761699, + "fp16_scale": 1.0, + "global_step": 1725, + "grad_norm": 1.9320559143756728, + "learning_rate": 8.716694066444017e-07, + "loss": 0.4378, + "step": 1725 + }, + { + "ETA": 2.64, + "epoch": 0.5550731628879241, + "fp16_scale": 1.0, + "global_step": 1726, + "grad_norm": 2.1470404152795997, + "learning_rate": 8.706361046169623e-07, + "loss": 0.3622, + "step": 1726 + }, + { + "ETA": 2.64, + "epoch": 0.5553947579996784, + "fp16_scale": 1.0, + "global_step": 1727, + "grad_norm": 1.9500310475599905, + "learning_rate": 8.696029430448315e-07, + "loss": 0.3899, + "step": 1727 + }, + { + "ETA": 2.63, + "epoch": 0.5557163531114327, + "fp16_scale": 1.0, + "global_step": 1728, + "grad_norm": 1.98374866246575, + "learning_rate": 8.685699230497514e-07, + "loss": 0.3588, + "step": 1728 + }, + { + "ETA": 2.63, + "epoch": 0.556037948223187, + "fp16_scale": 1.0, + "global_step": 1729, + "grad_norm": 1.8915959141609786, + "learning_rate": 8.675370457533121e-07, + "loss": 0.4059, + "step": 1729 + }, + { + "ETA": 2.63, + "epoch": 0.5563595433349413, + "fp16_scale": 1.0, + "global_step": 1730, + "grad_norm": 1.8856876085991405, + "learning_rate": 8.665043122769472e-07, + "loss": 0.4323, + "step": 1730 + }, + { + "ETA": 2.63, + "epoch": 0.5566811384466956, + "fp16_scale": 1.0, + "global_step": 1731, + "grad_norm": 1.7684561210473508, + "learning_rate": 8.654717237419351e-07, + "loss": 0.4433, + "step": 1731 + }, + { + "ETA": 2.63, + "epoch": 0.5570027335584499, + "fp16_scale": 1.0, + "global_step": 1732, + "grad_norm": 1.7810245850770299, + "learning_rate": 8.644392812693968e-07, + "loss": 0.3711, + "step": 1732 + }, + { + "ETA": 2.63, + "epoch": 0.5573243286702042, + "fp16_scale": 1.0, + "global_step": 1733, + "grad_norm": 1.9821729103551067, + "learning_rate": 8.634069859802935e-07, + "loss": 0.4147, + "step": 1733 + }, + { + "ETA": 2.62, + "epoch": 0.5576459237819585, + "fp16_scale": 1.0, + "global_step": 1734, + "grad_norm": 1.9543986999749898, + "learning_rate": 8.623748389954281e-07, + "loss": 0.4229, + "step": 1734 + }, + { + "ETA": 2.62, + "epoch": 0.5579675188937128, + "fp16_scale": 1.0, + "global_step": 1735, + "grad_norm": 2.0466926107299686, + "learning_rate": 8.613428414354417e-07, + "loss": 0.4453, + "step": 1735 + }, + { + "ETA": 2.62, + "epoch": 0.5582891140054671, + "fp16_scale": 1.0, + "global_step": 1736, + "grad_norm": 2.0240704794095765, + "learning_rate": 8.603109944208139e-07, + "loss": 0.4059, + "step": 1736 + }, + { + "ETA": 2.62, + "epoch": 0.5586107091172214, + "fp16_scale": 1.0, + "global_step": 1737, + "grad_norm": 1.8044217907734543, + "learning_rate": 8.592792990718595e-07, + "loss": 0.4011, + "step": 1737 + }, + { + "ETA": 2.62, + "epoch": 0.5589323042289758, + "fp16_scale": 1.0, + "global_step": 1738, + "grad_norm": 1.765446521967734, + "learning_rate": 8.582477565087302e-07, + "loss": 0.3369, + "step": 1738 + }, + { + "ETA": 2.61, + "epoch": 0.55925389934073, + "fp16_scale": 1.0, + "global_step": 1739, + "grad_norm": 1.918372512515096, + "learning_rate": 8.572163678514106e-07, + "loss": 0.428, + "step": 1739 + }, + { + "ETA": 2.61, + "epoch": 0.5595754944524843, + "fp16_scale": 1.0, + "global_step": 1740, + "grad_norm": 2.12491006514322, + "learning_rate": 8.561851342197184e-07, + "loss": 0.3644, + "step": 1740 + }, + { + "ETA": 2.61, + "epoch": 0.5598970895642387, + "fp16_scale": 1.0, + "global_step": 1741, + "grad_norm": 2.0690566830004933, + "learning_rate": 8.55154056733304e-07, + "loss": 0.4755, + "step": 1741 + }, + { + "ETA": 2.61, + "epoch": 0.5602186846759929, + "fp16_scale": 1.0, + "global_step": 1742, + "grad_norm": 2.1818024567834136, + "learning_rate": 8.541231365116467e-07, + "loss": 0.3926, + "step": 1742 + }, + { + "ETA": 2.6, + "epoch": 0.5605402797877472, + "fp16_scale": 1.0, + "global_step": 1743, + "grad_norm": 1.8220517974524657, + "learning_rate": 8.530923746740563e-07, + "loss": 0.4561, + "step": 1743 + }, + { + "ETA": 2.6, + "epoch": 0.5608618748995016, + "fp16_scale": 1.0, + "global_step": 1744, + "grad_norm": 1.9285522640876034, + "learning_rate": 8.520617723396702e-07, + "loss": 0.4017, + "step": 1744 + }, + { + "ETA": 2.6, + "epoch": 0.5611834700112558, + "fp16_scale": 1.0, + "global_step": 1745, + "grad_norm": 2.154624360806632, + "learning_rate": 8.510313306274522e-07, + "loss": 0.4656, + "step": 1745 + }, + { + "ETA": 2.6, + "epoch": 0.5615050651230101, + "fp16_scale": 1.0, + "global_step": 1746, + "grad_norm": 1.952723697648608, + "learning_rate": 8.500010506561928e-07, + "loss": 0.4828, + "step": 1746 + }, + { + "ETA": 2.6, + "epoch": 0.5618266602347645, + "fp16_scale": 1.0, + "global_step": 1747, + "grad_norm": 2.341615494821478, + "learning_rate": 8.489709335445054e-07, + "loss": 0.4129, + "step": 1747 + }, + { + "ETA": 2.6, + "epoch": 0.5621482553465187, + "fp16_scale": 1.0, + "global_step": 1748, + "grad_norm": 1.9272337703544054, + "learning_rate": 8.479409804108282e-07, + "loss": 0.45, + "step": 1748 + }, + { + "ETA": 2.59, + "epoch": 0.562469850458273, + "fp16_scale": 1.0, + "global_step": 1749, + "grad_norm": 1.9766694566174827, + "learning_rate": 8.469111923734198e-07, + "loss": 0.4172, + "step": 1749 + }, + { + "ETA": 2.59, + "epoch": 0.5627914455700274, + "fp16_scale": 1.0, + "global_step": 1750, + "grad_norm": 1.9629971541372377, + "learning_rate": 8.45881570550361e-07, + "loss": 0.4689, + "step": 1750 + }, + { + "ETA": 2.59, + "epoch": 0.5631130406817816, + "fp16_scale": 1.0, + "global_step": 1751, + "grad_norm": 2.004159636242066, + "learning_rate": 8.44852116059551e-07, + "loss": 0.4394, + "step": 1751 + }, + { + "ETA": 2.59, + "epoch": 0.5634346357935359, + "fp16_scale": 1.0, + "global_step": 1752, + "grad_norm": 1.8743231940085416, + "learning_rate": 8.438228300187075e-07, + "loss": 0.3934, + "step": 1752 + }, + { + "ETA": 2.59, + "epoch": 0.5637562309052903, + "fp16_scale": 1.0, + "global_step": 1753, + "grad_norm": 1.9626815796530928, + "learning_rate": 8.42793713545366e-07, + "loss": 0.4799, + "step": 1753 + }, + { + "ETA": 2.59, + "epoch": 0.5640778260170446, + "fp16_scale": 1.0, + "global_step": 1754, + "grad_norm": 1.938175604853239, + "learning_rate": 8.417647677568772e-07, + "loss": 0.4137, + "step": 1754 + }, + { + "ETA": 2.58, + "epoch": 0.5643994211287988, + "fp16_scale": 1.0, + "global_step": 1755, + "grad_norm": 1.7744177383966495, + "learning_rate": 8.407359937704073e-07, + "loss": 0.4248, + "step": 1755 + }, + { + "ETA": 2.58, + "epoch": 0.5647210162405532, + "fp16_scale": 1.0, + "global_step": 1756, + "grad_norm": 1.9532503847097258, + "learning_rate": 8.397073927029348e-07, + "loss": 0.417, + "step": 1756 + }, + { + "ETA": 2.58, + "epoch": 0.5650426113523075, + "fp16_scale": 1.0, + "global_step": 1757, + "grad_norm": 2.22065066533937, + "learning_rate": 8.38678965671251e-07, + "loss": 0.3306, + "step": 1757 + }, + { + "ETA": 2.58, + "epoch": 0.5653642064640617, + "fp16_scale": 1.0, + "global_step": 1758, + "grad_norm": 1.953653843613687, + "learning_rate": 8.376507137919588e-07, + "loss": 0.4424, + "step": 1758 + }, + { + "ETA": 2.57, + "epoch": 0.565685801575816, + "fp16_scale": 1.0, + "global_step": 1759, + "grad_norm": 2.2588435224864343, + "learning_rate": 8.366226381814696e-07, + "loss": 0.4022, + "step": 1759 + }, + { + "ETA": 2.57, + "epoch": 0.5660073966875704, + "fp16_scale": 1.0, + "global_step": 1760, + "grad_norm": 1.8792259041029113, + "learning_rate": 8.355947399560055e-07, + "loss": 0.5325, + "step": 1760 + }, + { + "ETA": 2.57, + "epoch": 0.5663289917993246, + "fp16_scale": 1.0, + "global_step": 1761, + "grad_norm": 1.8657171909019314, + "learning_rate": 8.345670202315938e-07, + "loss": 0.4407, + "step": 1761 + }, + { + "ETA": 2.57, + "epoch": 0.566650586911079, + "fp16_scale": 1.0, + "global_step": 1762, + "grad_norm": 1.925518197428859, + "learning_rate": 8.335394801240689e-07, + "loss": 0.478, + "step": 1762 + }, + { + "ETA": 2.57, + "epoch": 0.5669721820228333, + "fp16_scale": 1.0, + "global_step": 1763, + "grad_norm": 2.1507431019069347, + "learning_rate": 8.325121207490709e-07, + "loss": 0.4391, + "step": 1763 + }, + { + "ETA": 2.56, + "epoch": 0.5672937771345875, + "fp16_scale": 1.0, + "global_step": 1764, + "grad_norm": 2.2289426766260294, + "learning_rate": 8.314849432220423e-07, + "loss": 0.5425, + "step": 1764 + }, + { + "ETA": 2.56, + "epoch": 0.5676153722463418, + "fp16_scale": 1.0, + "global_step": 1765, + "grad_norm": 1.9338689981686707, + "learning_rate": 8.304579486582295e-07, + "loss": 0.4208, + "step": 1765 + }, + { + "ETA": 2.56, + "epoch": 0.5679369673580962, + "fp16_scale": 1.0, + "global_step": 1766, + "grad_norm": 2.2381359699266907, + "learning_rate": 8.294311381726789e-07, + "loss": 0.5162, + "step": 1766 + }, + { + "ETA": 2.56, + "epoch": 0.5682585624698504, + "fp16_scale": 1.0, + "global_step": 1767, + "grad_norm": 1.927108842950315, + "learning_rate": 8.284045128802385e-07, + "loss": 0.4399, + "step": 1767 + }, + { + "ETA": 2.56, + "epoch": 0.5685801575816047, + "fp16_scale": 1.0, + "global_step": 1768, + "grad_norm": 1.8045672220973217, + "learning_rate": 8.273780738955544e-07, + "loss": 0.4396, + "step": 1768 + }, + { + "ETA": 2.56, + "epoch": 0.5689017526933591, + "fp16_scale": 1.0, + "global_step": 1769, + "grad_norm": 2.206743910505071, + "learning_rate": 8.263518223330696e-07, + "loss": 0.4406, + "step": 1769 + }, + { + "ETA": 2.55, + "epoch": 0.5692233478051134, + "fp16_scale": 1.0, + "global_step": 1770, + "grad_norm": 1.9521633490624264, + "learning_rate": 8.253257593070255e-07, + "loss": 0.4123, + "step": 1770 + }, + { + "ETA": 2.55, + "epoch": 0.5695449429168676, + "fp16_scale": 1.0, + "global_step": 1771, + "grad_norm": 1.7903253362340086, + "learning_rate": 8.242998859314572e-07, + "loss": 0.405, + "step": 1771 + }, + { + "ETA": 2.55, + "epoch": 0.569866538028622, + "fp16_scale": 1.0, + "global_step": 1772, + "grad_norm": 1.9837005731165283, + "learning_rate": 8.232742033201953e-07, + "loss": 0.373, + "step": 1772 + }, + { + "ETA": 2.55, + "epoch": 0.5701881331403763, + "fp16_scale": 1.0, + "global_step": 1773, + "grad_norm": 1.9642037637432663, + "learning_rate": 8.222487125868617e-07, + "loss": 0.3681, + "step": 1773 + }, + { + "ETA": 2.55, + "epoch": 0.5705097282521305, + "fp16_scale": 1.0, + "global_step": 1774, + "grad_norm": 1.9251607313366765, + "learning_rate": 8.212234148448707e-07, + "loss": 0.4476, + "step": 1774 + }, + { + "ETA": 2.54, + "epoch": 0.5708313233638849, + "fp16_scale": 1.0, + "global_step": 1775, + "grad_norm": 2.204764698489573, + "learning_rate": 8.201983112074276e-07, + "loss": 0.4402, + "step": 1775 + }, + { + "ETA": 2.54, + "epoch": 0.5711529184756392, + "fp16_scale": 1.0, + "global_step": 1776, + "grad_norm": 2.2923928454141587, + "learning_rate": 8.19173402787526e-07, + "loss": 0.4287, + "step": 1776 + }, + { + "ETA": 2.54, + "epoch": 0.5714745135873934, + "fp16_scale": 1.0, + "global_step": 1777, + "grad_norm": 1.8780913678933817, + "learning_rate": 8.181486906979487e-07, + "loss": 0.4122, + "step": 1777 + }, + { + "ETA": 2.54, + "epoch": 0.5717961086991478, + "fp16_scale": 1.0, + "global_step": 1778, + "grad_norm": 1.9990477720527768, + "learning_rate": 8.171241760512638e-07, + "loss": 0.4013, + "step": 1778 + }, + { + "ETA": 2.54, + "epoch": 0.5721177038109021, + "fp16_scale": 1.0, + "global_step": 1779, + "grad_norm": 2.1248581938501085, + "learning_rate": 8.160998599598265e-07, + "loss": 0.4272, + "step": 1779 + }, + { + "ETA": 2.53, + "epoch": 0.5724392989226563, + "fp16_scale": 1.0, + "global_step": 1780, + "grad_norm": 1.8442544901366762, + "learning_rate": 8.150757435357758e-07, + "loss": 0.4605, + "step": 1780 + }, + { + "ETA": 2.53, + "epoch": 0.5727608940344107, + "fp16_scale": 1.0, + "global_step": 1781, + "grad_norm": 1.7804841224343924, + "learning_rate": 8.140518278910329e-07, + "loss": 0.4858, + "step": 1781 + }, + { + "ETA": 2.53, + "epoch": 0.573082489146165, + "fp16_scale": 1.0, + "global_step": 1782, + "grad_norm": 1.9275225079051452, + "learning_rate": 8.130281141373036e-07, + "loss": 0.4632, + "step": 1782 + }, + { + "ETA": 2.53, + "epoch": 0.5734040842579193, + "fp16_scale": 1.0, + "global_step": 1783, + "grad_norm": 2.1636743932746736, + "learning_rate": 8.120046033860717e-07, + "loss": 0.4048, + "step": 1783 + }, + { + "ETA": 2.53, + "epoch": 0.5737256793696736, + "fp16_scale": 1.0, + "global_step": 1784, + "grad_norm": 1.9719324338725284, + "learning_rate": 8.109812967486024e-07, + "loss": 0.4411, + "step": 1784 + }, + { + "ETA": 2.52, + "epoch": 0.5740472744814279, + "fp16_scale": 1.0, + "global_step": 1785, + "grad_norm": 2.165944310471551, + "learning_rate": 8.099581953359387e-07, + "loss": 0.3551, + "step": 1785 + }, + { + "ETA": 2.52, + "epoch": 0.5743688695931822, + "fp16_scale": 1.0, + "global_step": 1786, + "grad_norm": 2.048182003544626, + "learning_rate": 8.089353002589001e-07, + "loss": 0.4871, + "step": 1786 + }, + { + "ETA": 2.52, + "epoch": 0.5746904647049365, + "fp16_scale": 1.0, + "global_step": 1787, + "grad_norm": 2.113670197158423, + "learning_rate": 8.079126126280835e-07, + "loss": 0.4502, + "step": 1787 + }, + { + "ETA": 2.52, + "epoch": 0.5750120598166908, + "fp16_scale": 1.0, + "global_step": 1788, + "grad_norm": 1.8462564358983096, + "learning_rate": 8.068901335538592e-07, + "loss": 0.433, + "step": 1788 + }, + { + "ETA": 2.52, + "epoch": 0.5753336549284451, + "fp16_scale": 1.0, + "global_step": 1789, + "grad_norm": 2.1693396197832975, + "learning_rate": 8.058678641463724e-07, + "loss": 0.481, + "step": 1789 + }, + { + "ETA": 2.51, + "epoch": 0.5756552500401994, + "fp16_scale": 1.0, + "global_step": 1790, + "grad_norm": 1.9947063436479482, + "learning_rate": 8.048458055155395e-07, + "loss": 0.3782, + "step": 1790 + }, + { + "ETA": 2.51, + "epoch": 0.5759768451519537, + "fp16_scale": 1.0, + "global_step": 1791, + "grad_norm": 1.8931074018887186, + "learning_rate": 8.038239587710484e-07, + "loss": 0.4266, + "step": 1791 + }, + { + "ETA": 2.51, + "epoch": 0.576298440263708, + "fp16_scale": 1.0, + "global_step": 1792, + "grad_norm": 1.9896410413935424, + "learning_rate": 8.028023250223573e-07, + "loss": 0.4381, + "step": 1792 + }, + { + "ETA": 2.51, + "epoch": 0.5766200353754622, + "fp16_scale": 1.0, + "global_step": 1793, + "grad_norm": 1.889691365312974, + "learning_rate": 8.017809053786924e-07, + "loss": 0.4842, + "step": 1793 + }, + { + "ETA": 2.51, + "epoch": 0.5769416304872166, + "fp16_scale": 1.0, + "global_step": 1794, + "grad_norm": 1.9760310787003852, + "learning_rate": 8.007597009490486e-07, + "loss": 0.4218, + "step": 1794 + }, + { + "ETA": 2.51, + "epoch": 0.5772632255989709, + "fp16_scale": 1.0, + "global_step": 1795, + "grad_norm": 1.7763686340074603, + "learning_rate": 7.997387128421858e-07, + "loss": 0.4331, + "step": 1795 + }, + { + "ETA": 2.5, + "epoch": 0.5775848207107251, + "fp16_scale": 1.0, + "global_step": 1796, + "grad_norm": 1.8898712096989858, + "learning_rate": 7.987179421666302e-07, + "loss": 0.3923, + "step": 1796 + }, + { + "ETA": 2.5, + "epoch": 0.5779064158224795, + "fp16_scale": 1.0, + "global_step": 1797, + "grad_norm": 2.399264277769306, + "learning_rate": 7.976973900306709e-07, + "loss": 0.3966, + "step": 1797 + }, + { + "ETA": 2.5, + "epoch": 0.5782280109342338, + "fp16_scale": 1.0, + "global_step": 1798, + "grad_norm": 1.8274115851856996, + "learning_rate": 7.966770575423605e-07, + "loss": 0.4037, + "step": 1798 + }, + { + "ETA": 2.5, + "epoch": 0.5785496060459882, + "fp16_scale": 1.0, + "global_step": 1799, + "grad_norm": 1.96548981746754, + "learning_rate": 7.956569458095133e-07, + "loss": 0.3995, + "step": 1799 + }, + { + "ETA": 2.49, + "epoch": 0.5788712011577424, + "fp16_scale": 1.0, + "global_step": 1800, + "grad_norm": 2.1163447856087463, + "learning_rate": 7.946370559397023e-07, + "loss": 0.4212, + "step": 1800 + }, + { + "ETA": 2.5, + "epoch": 0.5791927962694967, + "fp16_scale": 1.0, + "global_step": 1801, + "grad_norm": 1.9379575899574724, + "learning_rate": 7.936173890402619e-07, + "loss": 0.4296, + "step": 1801 + }, + { + "ETA": 2.5, + "epoch": 0.579514391381251, + "fp16_scale": 1.0, + "global_step": 1802, + "grad_norm": 1.9297845491686623, + "learning_rate": 7.92597946218283e-07, + "loss": 0.4376, + "step": 1802 + }, + { + "ETA": 2.5, + "epoch": 0.5798359864930053, + "fp16_scale": 1.0, + "global_step": 1803, + "grad_norm": 2.029934187436816, + "learning_rate": 7.915787285806127e-07, + "loss": 0.4706, + "step": 1803 + }, + { + "ETA": 2.49, + "epoch": 0.5801575816047596, + "fp16_scale": 1.0, + "global_step": 1804, + "grad_norm": 1.9195051447849678, + "learning_rate": 7.905597372338557e-07, + "loss": 0.3593, + "step": 1804 + }, + { + "ETA": 2.49, + "epoch": 0.580479176716514, + "fp16_scale": 1.0, + "global_step": 1805, + "grad_norm": 2.04487864218629, + "learning_rate": 7.895409732843688e-07, + "loss": 0.4215, + "step": 1805 + }, + { + "ETA": 2.49, + "epoch": 0.5808007718282682, + "fp16_scale": 1.0, + "global_step": 1806, + "grad_norm": 1.980442588823403, + "learning_rate": 7.885224378382631e-07, + "loss": 0.4153, + "step": 1806 + }, + { + "ETA": 2.49, + "epoch": 0.5811223669400225, + "fp16_scale": 1.0, + "global_step": 1807, + "grad_norm": 1.8145489663974113, + "learning_rate": 7.875041320014017e-07, + "loss": 0.4452, + "step": 1807 + }, + { + "ETA": 2.49, + "epoch": 0.5814439620517768, + "fp16_scale": 1.0, + "global_step": 1808, + "grad_norm": 1.8278254072744091, + "learning_rate": 7.864860568793971e-07, + "loss": 0.354, + "step": 1808 + }, + { + "ETA": 2.48, + "epoch": 0.5817655571635311, + "fp16_scale": 1.0, + "global_step": 1809, + "grad_norm": 2.0362472280698642, + "learning_rate": 7.854682135776131e-07, + "loss": 0.442, + "step": 1809 + }, + { + "ETA": 2.48, + "epoch": 0.5820871522752854, + "fp16_scale": 1.0, + "global_step": 1810, + "grad_norm": 1.8502710374670852, + "learning_rate": 7.844506032011604e-07, + "loss": 0.3702, + "step": 1810 + }, + { + "ETA": 2.48, + "epoch": 0.5824087473870397, + "fp16_scale": 1.0, + "global_step": 1811, + "grad_norm": 1.8714825606521435, + "learning_rate": 7.834332268548978e-07, + "loss": 0.3788, + "step": 1811 + }, + { + "ETA": 2.48, + "epoch": 0.582730342498794, + "fp16_scale": 1.0, + "global_step": 1812, + "grad_norm": 1.9831501683786872, + "learning_rate": 7.824160856434291e-07, + "loss": 0.4583, + "step": 1812 + }, + { + "ETA": 2.48, + "epoch": 0.5830519376105483, + "fp16_scale": 1.0, + "global_step": 1813, + "grad_norm": 2.0505702329085342, + "learning_rate": 7.813991806711039e-07, + "loss": 0.4133, + "step": 1813 + }, + { + "ETA": 2.47, + "epoch": 0.5833735327223026, + "fp16_scale": 1.0, + "global_step": 1814, + "grad_norm": 1.8947002380528628, + "learning_rate": 7.803825130420141e-07, + "loss": 0.3833, + "step": 1814 + }, + { + "ETA": 2.47, + "epoch": 0.583695127834057, + "fp16_scale": 1.0, + "global_step": 1815, + "grad_norm": 1.9725055107529188, + "learning_rate": 7.793660838599942e-07, + "loss": 0.3699, + "step": 1815 + }, + { + "ETA": 2.47, + "epoch": 0.5840167229458112, + "fp16_scale": 1.0, + "global_step": 1816, + "grad_norm": 1.8382586942742276, + "learning_rate": 7.783498942286211e-07, + "loss": 0.4018, + "step": 1816 + }, + { + "ETA": 2.47, + "epoch": 0.5843383180575655, + "fp16_scale": 1.0, + "global_step": 1817, + "grad_norm": 2.0241792152858986, + "learning_rate": 7.773339452512096e-07, + "loss": 0.4222, + "step": 1817 + }, + { + "ETA": 2.47, + "epoch": 0.5846599131693199, + "fp16_scale": 1.0, + "global_step": 1818, + "grad_norm": 2.0391250336105777, + "learning_rate": 7.763182380308146e-07, + "loss": 0.3534, + "step": 1818 + }, + { + "ETA": 2.46, + "epoch": 0.5849815082810741, + "fp16_scale": 1.0, + "global_step": 1819, + "grad_norm": 1.994254291959461, + "learning_rate": 7.753027736702282e-07, + "loss": 0.4023, + "step": 1819 + }, + { + "ETA": 2.46, + "epoch": 0.5853031033928284, + "fp16_scale": 1.0, + "global_step": 1820, + "grad_norm": 1.8984747643448134, + "learning_rate": 7.742875532719785e-07, + "loss": 0.4169, + "step": 1820 + }, + { + "ETA": 2.46, + "epoch": 0.5856246985045828, + "fp16_scale": 1.0, + "global_step": 1821, + "grad_norm": 2.032035605353064, + "learning_rate": 7.732725779383294e-07, + "loss": 0.4071, + "step": 1821 + }, + { + "ETA": 2.46, + "epoch": 0.585946293616337, + "fp16_scale": 1.0, + "global_step": 1822, + "grad_norm": 2.0041827741214555, + "learning_rate": 7.722578487712775e-07, + "loss": 0.5558, + "step": 1822 + }, + { + "ETA": 2.46, + "epoch": 0.5862678887280913, + "fp16_scale": 1.0, + "global_step": 1823, + "grad_norm": 2.1584081221774865, + "learning_rate": 7.712433668725536e-07, + "loss": 0.3674, + "step": 1823 + }, + { + "ETA": 2.45, + "epoch": 0.5865894838398457, + "fp16_scale": 1.0, + "global_step": 1824, + "grad_norm": 2.001619097438793, + "learning_rate": 7.70229133343619e-07, + "loss": 0.3917, + "step": 1824 + }, + { + "ETA": 2.45, + "epoch": 0.5869110789515999, + "fp16_scale": 1.0, + "global_step": 1825, + "grad_norm": 1.9983846520932798, + "learning_rate": 7.69215149285666e-07, + "loss": 0.4428, + "step": 1825 + }, + { + "ETA": 2.45, + "epoch": 0.5872326740633542, + "fp16_scale": 1.0, + "global_step": 1826, + "grad_norm": 1.9755229060951203, + "learning_rate": 7.682014157996154e-07, + "loss": 0.3363, + "step": 1826 + }, + { + "ETA": 2.45, + "epoch": 0.5875542691751086, + "fp16_scale": 1.0, + "global_step": 1827, + "grad_norm": 1.8847486667873525, + "learning_rate": 7.671879339861161e-07, + "loss": 0.3591, + "step": 1827 + }, + { + "ETA": 2.45, + "epoch": 0.5878758642868629, + "fp16_scale": 1.0, + "global_step": 1828, + "grad_norm": 1.7688928639790573, + "learning_rate": 7.661747049455443e-07, + "loss": 0.3714, + "step": 1828 + }, + { + "ETA": 2.44, + "epoch": 0.5881974593986171, + "fp16_scale": 1.0, + "global_step": 1829, + "grad_norm": 1.8099458098342016, + "learning_rate": 7.65161729778001e-07, + "loss": 0.3844, + "step": 1829 + }, + { + "ETA": 2.44, + "epoch": 0.5885190545103715, + "fp16_scale": 1.0, + "global_step": 1830, + "grad_norm": 2.010363991501376, + "learning_rate": 7.641490095833125e-07, + "loss": 0.4396, + "step": 1830 + }, + { + "ETA": 2.44, + "epoch": 0.5888406496221258, + "fp16_scale": 1.0, + "global_step": 1831, + "grad_norm": 2.270810950706374, + "learning_rate": 7.631365454610273e-07, + "loss": 0.3659, + "step": 1831 + }, + { + "ETA": 2.44, + "epoch": 0.58916224473388, + "fp16_scale": 1.0, + "global_step": 1832, + "grad_norm": 1.9878661551203023, + "learning_rate": 7.621243385104159e-07, + "loss": 0.485, + "step": 1832 + }, + { + "ETA": 2.44, + "epoch": 0.5894838398456343, + "fp16_scale": 1.0, + "global_step": 1833, + "grad_norm": 2.0226311551404885, + "learning_rate": 7.611123898304708e-07, + "loss": 0.3489, + "step": 1833 + }, + { + "ETA": 2.43, + "epoch": 0.5898054349573887, + "fp16_scale": 1.0, + "global_step": 1834, + "grad_norm": 1.9576918252737419, + "learning_rate": 7.601007005199021e-07, + "loss": 0.4721, + "step": 1834 + }, + { + "ETA": 2.43, + "epoch": 0.5901270300691429, + "fp16_scale": 1.0, + "global_step": 1835, + "grad_norm": 2.0687996999559446, + "learning_rate": 7.590892716771407e-07, + "loss": 0.4394, + "step": 1835 + }, + { + "ETA": 2.43, + "epoch": 0.5904486251808972, + "fp16_scale": 1.0, + "global_step": 1836, + "grad_norm": 2.2151135086262377, + "learning_rate": 7.580781044003324e-07, + "loss": 0.4364, + "step": 1836 + }, + { + "ETA": 2.43, + "epoch": 0.5907702202926516, + "fp16_scale": 1.0, + "global_step": 1837, + "grad_norm": 2.152376591059747, + "learning_rate": 7.570671997873404e-07, + "loss": 0.3745, + "step": 1837 + }, + { + "ETA": 2.42, + "epoch": 0.5910918154044058, + "fp16_scale": 1.0, + "global_step": 1838, + "grad_norm": 2.4006392471702873, + "learning_rate": 7.560565589357426e-07, + "loss": 0.3809, + "step": 1838 + }, + { + "ETA": 2.42, + "epoch": 0.5914134105161601, + "fp16_scale": 1.0, + "global_step": 1839, + "grad_norm": 1.9557808284807419, + "learning_rate": 7.550461829428296e-07, + "loss": 0.3583, + "step": 1839 + }, + { + "ETA": 2.42, + "epoch": 0.5917350056279145, + "fp16_scale": 1.0, + "global_step": 1840, + "grad_norm": 2.18443290175363, + "learning_rate": 7.540360729056058e-07, + "loss": 0.4636, + "step": 1840 + }, + { + "ETA": 2.42, + "epoch": 0.5920566007396687, + "fp16_scale": 1.0, + "global_step": 1841, + "grad_norm": 1.8793855185965898, + "learning_rate": 7.530262299207856e-07, + "loss": 0.5123, + "step": 1841 + }, + { + "ETA": 2.42, + "epoch": 0.592378195851423, + "fp16_scale": 1.0, + "global_step": 1842, + "grad_norm": 2.0013663810307314, + "learning_rate": 7.520166550847944e-07, + "loss": 0.3656, + "step": 1842 + }, + { + "ETA": 2.42, + "epoch": 0.5926997909631774, + "fp16_scale": 1.0, + "global_step": 1843, + "grad_norm": 2.063186431494864, + "learning_rate": 7.510073494937662e-07, + "loss": 0.4559, + "step": 1843 + }, + { + "ETA": 2.41, + "epoch": 0.5930213860749317, + "fp16_scale": 1.0, + "global_step": 1844, + "grad_norm": 1.9613673142244223, + "learning_rate": 7.499983142435418e-07, + "loss": 0.4223, + "step": 1844 + }, + { + "ETA": 2.41, + "epoch": 0.5933429811866859, + "fp16_scale": 1.0, + "global_step": 1845, + "grad_norm": 1.884823087504064, + "learning_rate": 7.489895504296697e-07, + "loss": 0.3825, + "step": 1845 + }, + { + "ETA": 2.41, + "epoch": 0.5936645762984403, + "fp16_scale": 1.0, + "global_step": 1846, + "grad_norm": 1.9092158504687446, + "learning_rate": 7.47981059147403e-07, + "loss": 0.4311, + "step": 1846 + }, + { + "ETA": 2.41, + "epoch": 0.5939861714101946, + "fp16_scale": 1.0, + "global_step": 1847, + "grad_norm": 2.0335014532424953, + "learning_rate": 7.469728414916994e-07, + "loss": 0.4201, + "step": 1847 + }, + { + "ETA": 2.41, + "epoch": 0.5943077665219488, + "fp16_scale": 1.0, + "global_step": 1848, + "grad_norm": 2.401168333894695, + "learning_rate": 7.45964898557219e-07, + "loss": 0.4907, + "step": 1848 + }, + { + "ETA": 2.4, + "epoch": 0.5946293616337032, + "fp16_scale": 1.0, + "global_step": 1849, + "grad_norm": 1.969279338138325, + "learning_rate": 7.449572314383236e-07, + "loss": 0.4598, + "step": 1849 + }, + { + "ETA": 2.4, + "epoch": 0.5949509567454575, + "fp16_scale": 1.0, + "global_step": 1850, + "grad_norm": 1.7639705258674894, + "learning_rate": 7.439498412290762e-07, + "loss": 0.3923, + "step": 1850 + }, + { + "ETA": 2.4, + "epoch": 0.5952725518572117, + "fp16_scale": 1.0, + "global_step": 1851, + "grad_norm": 2.0034800166056272, + "learning_rate": 7.429427290232384e-07, + "loss": 0.4317, + "step": 1851 + }, + { + "ETA": 2.4, + "epoch": 0.5955941469689661, + "fp16_scale": 1.0, + "global_step": 1852, + "grad_norm": 2.0895070852648097, + "learning_rate": 7.419358959142708e-07, + "loss": 0.4073, + "step": 1852 + }, + { + "ETA": 2.4, + "epoch": 0.5959157420807204, + "fp16_scale": 1.0, + "global_step": 1853, + "grad_norm": 1.9109434808336507, + "learning_rate": 7.409293429953296e-07, + "loss": 0.3811, + "step": 1853 + }, + { + "ETA": 2.39, + "epoch": 0.5962373371924746, + "fp16_scale": 1.0, + "global_step": 1854, + "grad_norm": 1.8864076099936178, + "learning_rate": 7.399230713592683e-07, + "loss": 0.4362, + "step": 1854 + }, + { + "ETA": 2.39, + "epoch": 0.596558932304229, + "fp16_scale": 1.0, + "global_step": 1855, + "grad_norm": 1.8161771650077314, + "learning_rate": 7.389170820986345e-07, + "loss": 0.3522, + "step": 1855 + }, + { + "ETA": 2.39, + "epoch": 0.5968805274159833, + "fp16_scale": 1.0, + "global_step": 1856, + "grad_norm": 2.1497221707520264, + "learning_rate": 7.379113763056679e-07, + "loss": 0.5376, + "step": 1856 + }, + { + "ETA": 2.39, + "epoch": 0.5972021225277376, + "fp16_scale": 1.0, + "global_step": 1857, + "grad_norm": 2.0648782697948525, + "learning_rate": 7.369059550723031e-07, + "loss": 0.4663, + "step": 1857 + }, + { + "ETA": 2.39, + "epoch": 0.5975237176394919, + "fp16_scale": 1.0, + "global_step": 1858, + "grad_norm": 1.9443453860732576, + "learning_rate": 7.359008194901632e-07, + "loss": 0.4608, + "step": 1858 + }, + { + "ETA": 2.38, + "epoch": 0.5978453127512462, + "fp16_scale": 1.0, + "global_step": 1859, + "grad_norm": 1.9649638216733079, + "learning_rate": 7.348959706505626e-07, + "loss": 0.4703, + "step": 1859 + }, + { + "ETA": 2.38, + "epoch": 0.5981669078630005, + "fp16_scale": 1.0, + "global_step": 1860, + "grad_norm": 1.8028674664504476, + "learning_rate": 7.338914096445041e-07, + "loss": 0.4174, + "step": 1860 + }, + { + "ETA": 2.38, + "epoch": 0.5984885029747548, + "fp16_scale": 1.0, + "global_step": 1861, + "grad_norm": 1.9784286461654301, + "learning_rate": 7.328871375626771e-07, + "loss": 0.4348, + "step": 1861 + }, + { + "ETA": 2.38, + "epoch": 0.5988100980865091, + "fp16_scale": 1.0, + "global_step": 1862, + "grad_norm": 1.9713681338338356, + "learning_rate": 7.31883155495459e-07, + "loss": 0.4722, + "step": 1862 + }, + { + "ETA": 2.38, + "epoch": 0.5991316931982634, + "fp16_scale": 1.0, + "global_step": 1863, + "grad_norm": 2.128705032392235, + "learning_rate": 7.308794645329105e-07, + "loss": 0.3703, + "step": 1863 + }, + { + "ETA": 2.38, + "epoch": 0.5994532883100177, + "fp16_scale": 1.0, + "global_step": 1864, + "grad_norm": 1.863216906100975, + "learning_rate": 7.298760657647778e-07, + "loss": 0.426, + "step": 1864 + }, + { + "ETA": 2.37, + "epoch": 0.599774883421772, + "fp16_scale": 1.0, + "global_step": 1865, + "grad_norm": 2.073626712959002, + "learning_rate": 7.288729602804891e-07, + "loss": 0.3698, + "step": 1865 + }, + { + "ETA": 2.37, + "epoch": 0.6000964785335263, + "fp16_scale": 1.0, + "global_step": 1866, + "grad_norm": 2.121206658616612, + "learning_rate": 7.278701491691537e-07, + "loss": 0.4141, + "step": 1866 + }, + { + "ETA": 2.37, + "epoch": 0.6004180736452805, + "fp16_scale": 1.0, + "global_step": 1867, + "grad_norm": 1.9570534911328257, + "learning_rate": 7.268676335195623e-07, + "loss": 0.3992, + "step": 1867 + }, + { + "ETA": 2.37, + "epoch": 0.6007396687570349, + "fp16_scale": 1.0, + "global_step": 1868, + "grad_norm": 1.830324613897496, + "learning_rate": 7.258654144201839e-07, + "loss": 0.3837, + "step": 1868 + }, + { + "ETA": 2.36, + "epoch": 0.6010612638687892, + "fp16_scale": 1.0, + "global_step": 1869, + "grad_norm": 2.1064898331486224, + "learning_rate": 7.248634929591667e-07, + "loss": 0.4326, + "step": 1869 + }, + { + "ETA": 2.36, + "epoch": 0.6013828589805434, + "fp16_scale": 1.0, + "global_step": 1870, + "grad_norm": 1.9603670404047848, + "learning_rate": 7.238618702243338e-07, + "loss": 0.3825, + "step": 1870 + }, + { + "ETA": 2.36, + "epoch": 0.6017044540922978, + "fp16_scale": 1.0, + "global_step": 1871, + "grad_norm": 1.717941156944836, + "learning_rate": 7.228605473031866e-07, + "loss": 0.3497, + "step": 1871 + }, + { + "ETA": 2.36, + "epoch": 0.6020260492040521, + "fp16_scale": 1.0, + "global_step": 1872, + "grad_norm": 1.8542634815483454, + "learning_rate": 7.218595252828985e-07, + "loss": 0.4263, + "step": 1872 + }, + { + "ETA": 2.36, + "epoch": 0.6023476443158065, + "fp16_scale": 1.0, + "global_step": 1873, + "grad_norm": 1.9691349002044445, + "learning_rate": 7.208588052503173e-07, + "loss": 0.4024, + "step": 1873 + }, + { + "ETA": 2.35, + "epoch": 0.6026692394275607, + "fp16_scale": 1.0, + "global_step": 1874, + "grad_norm": 2.1285946804020965, + "learning_rate": 7.198583882919636e-07, + "loss": 0.4332, + "step": 1874 + }, + { + "ETA": 2.35, + "epoch": 0.602990834539315, + "fp16_scale": 1.0, + "global_step": 1875, + "grad_norm": 2.10217432293654, + "learning_rate": 7.188582754940273e-07, + "loss": 0.3837, + "step": 1875 + }, + { + "ETA": 2.35, + "epoch": 0.6033124296510693, + "fp16_scale": 1.0, + "global_step": 1876, + "grad_norm": 1.8823643332979483, + "learning_rate": 7.178584679423694e-07, + "loss": 0.3772, + "step": 1876 + }, + { + "ETA": 2.35, + "epoch": 0.6036340247628236, + "fp16_scale": 1.0, + "global_step": 1877, + "grad_norm": 2.082038138133417, + "learning_rate": 7.168589667225191e-07, + "loss": 0.504, + "step": 1877 + }, + { + "ETA": 2.35, + "epoch": 0.6039556198745779, + "fp16_scale": 1.0, + "global_step": 1878, + "grad_norm": 2.0510651597553493, + "learning_rate": 7.158597729196723e-07, + "loss": 0.4473, + "step": 1878 + }, + { + "ETA": 2.35, + "epoch": 0.6042772149863322, + "fp16_scale": 1.0, + "global_step": 1879, + "grad_norm": 1.863623728356378, + "learning_rate": 7.14860887618693e-07, + "loss": 0.4507, + "step": 1879 + }, + { + "ETA": 2.34, + "epoch": 0.6045988100980865, + "fp16_scale": 1.0, + "global_step": 1880, + "grad_norm": 1.740905279842798, + "learning_rate": 7.138623119041079e-07, + "loss": 0.4278, + "step": 1880 + }, + { + "ETA": 2.34, + "epoch": 0.6049204052098408, + "fp16_scale": 1.0, + "global_step": 1881, + "grad_norm": 1.9298202082804343, + "learning_rate": 7.12864046860109e-07, + "loss": 0.4082, + "step": 1881 + }, + { + "ETA": 2.34, + "epoch": 0.6052420003215951, + "fp16_scale": 1.0, + "global_step": 1882, + "grad_norm": 2.0284492390917843, + "learning_rate": 7.118660935705509e-07, + "loss": 0.4157, + "step": 1882 + }, + { + "ETA": 2.34, + "epoch": 0.6055635954333494, + "fp16_scale": 1.0, + "global_step": 1883, + "grad_norm": 2.291831988981119, + "learning_rate": 7.108684531189496e-07, + "loss": 0.4533, + "step": 1883 + }, + { + "ETA": 2.34, + "epoch": 0.6058851905451037, + "fp16_scale": 1.0, + "global_step": 1884, + "grad_norm": 1.6440238224138444, + "learning_rate": 7.09871126588481e-07, + "loss": 0.3447, + "step": 1884 + }, + { + "ETA": 2.33, + "epoch": 0.606206785656858, + "fp16_scale": 1.0, + "global_step": 1885, + "grad_norm": 2.024599681836863, + "learning_rate": 7.088741150619803e-07, + "loss": 0.3901, + "step": 1885 + }, + { + "ETA": 2.33, + "epoch": 0.6065283807686123, + "fp16_scale": 1.0, + "global_step": 1886, + "grad_norm": 1.8890920986588906, + "learning_rate": 7.078774196219413e-07, + "loss": 0.457, + "step": 1886 + }, + { + "ETA": 2.33, + "epoch": 0.6068499758803666, + "fp16_scale": 1.0, + "global_step": 1887, + "grad_norm": 1.970377818957128, + "learning_rate": 7.06881041350514e-07, + "loss": 0.4278, + "step": 1887 + }, + { + "ETA": 2.33, + "epoch": 0.6071715709921209, + "fp16_scale": 1.0, + "global_step": 1888, + "grad_norm": 1.9325448841424304, + "learning_rate": 7.058849813295049e-07, + "loss": 0.4125, + "step": 1888 + }, + { + "ETA": 2.33, + "epoch": 0.6074931661038753, + "fp16_scale": 1.0, + "global_step": 1889, + "grad_norm": 1.964317539985117, + "learning_rate": 7.048892406403733e-07, + "loss": 0.4113, + "step": 1889 + }, + { + "ETA": 2.32, + "epoch": 0.6078147612156295, + "fp16_scale": 1.0, + "global_step": 1890, + "grad_norm": 1.8482170685923327, + "learning_rate": 7.03893820364233e-07, + "loss": 0.3398, + "step": 1890 + }, + { + "ETA": 2.32, + "epoch": 0.6081363563273838, + "fp16_scale": 1.0, + "global_step": 1891, + "grad_norm": 2.026553743125374, + "learning_rate": 7.028987215818505e-07, + "loss": 0.424, + "step": 1891 + }, + { + "ETA": 2.32, + "epoch": 0.6084579514391382, + "fp16_scale": 1.0, + "global_step": 1892, + "grad_norm": 2.1040219322450895, + "learning_rate": 7.019039453736413e-07, + "loss": 0.3661, + "step": 1892 + }, + { + "ETA": 2.32, + "epoch": 0.6087795465508924, + "fp16_scale": 1.0, + "global_step": 1893, + "grad_norm": 1.8607005041624993, + "learning_rate": 7.009094928196727e-07, + "loss": 0.4016, + "step": 1893 + }, + { + "ETA": 2.32, + "epoch": 0.6091011416626467, + "fp16_scale": 1.0, + "global_step": 1894, + "grad_norm": 2.033089810273623, + "learning_rate": 6.999153649996594e-07, + "loss": 0.3914, + "step": 1894 + }, + { + "ETA": 2.31, + "epoch": 0.6094227367744011, + "fp16_scale": 1.0, + "global_step": 1895, + "grad_norm": 1.980625160789773, + "learning_rate": 6.989215629929637e-07, + "loss": 0.4594, + "step": 1895 + }, + { + "ETA": 2.31, + "epoch": 0.6097443318861553, + "fp16_scale": 1.0, + "global_step": 1896, + "grad_norm": 1.83085786985307, + "learning_rate": 6.979280878785947e-07, + "loss": 0.3964, + "step": 1896 + }, + { + "ETA": 2.31, + "epoch": 0.6100659269979096, + "fp16_scale": 1.0, + "global_step": 1897, + "grad_norm": 1.9794632495257205, + "learning_rate": 6.969349407352056e-07, + "loss": 0.4243, + "step": 1897 + }, + { + "ETA": 2.31, + "epoch": 0.610387522109664, + "fp16_scale": 1.0, + "global_step": 1898, + "grad_norm": 2.0475496147883527, + "learning_rate": 6.959421226410947e-07, + "loss": 0.3993, + "step": 1898 + }, + { + "ETA": 2.31, + "epoch": 0.6107091172214182, + "fp16_scale": 1.0, + "global_step": 1899, + "grad_norm": 2.007163917754024, + "learning_rate": 6.949496346742017e-07, + "loss": 0.4549, + "step": 1899 + }, + { + "ETA": 2.3, + "epoch": 0.6110307123331725, + "fp16_scale": 1.0, + "global_step": 1900, + "grad_norm": 2.073288861320127, + "learning_rate": 6.939574779121093e-07, + "loss": 0.4442, + "step": 1900 + }, + { + "ETA": 2.3, + "epoch": 0.6113523074449269, + "fp16_scale": 1.0, + "global_step": 1901, + "grad_norm": 2.0703613897445434, + "learning_rate": 6.929656534320397e-07, + "loss": 0.3771, + "step": 1901 + }, + { + "ETA": 2.3, + "epoch": 0.6116739025566812, + "fp16_scale": 1.0, + "global_step": 1902, + "grad_norm": 2.1497329821966957, + "learning_rate": 6.919741623108542e-07, + "loss": 0.4377, + "step": 1902 + }, + { + "ETA": 2.3, + "epoch": 0.6119954976684354, + "fp16_scale": 1.0, + "global_step": 1903, + "grad_norm": 2.0351757605739116, + "learning_rate": 6.909830056250526e-07, + "loss": 0.5291, + "step": 1903 + }, + { + "ETA": 2.3, + "epoch": 0.6123170927801898, + "fp16_scale": 1.0, + "global_step": 1904, + "grad_norm": 2.07226105653468, + "learning_rate": 6.899921844507714e-07, + "loss": 0.4179, + "step": 1904 + }, + { + "ETA": 2.29, + "epoch": 0.6126386878919441, + "fp16_scale": 1.0, + "global_step": 1905, + "grad_norm": 1.9487416991930222, + "learning_rate": 6.890016998637836e-07, + "loss": 0.3382, + "step": 1905 + }, + { + "ETA": 2.29, + "epoch": 0.6129602830036983, + "fp16_scale": 1.0, + "global_step": 1906, + "grad_norm": 2.0221826982488005, + "learning_rate": 6.880115529394952e-07, + "loss": 0.4389, + "step": 1906 + }, + { + "ETA": 2.29, + "epoch": 0.6132818781154526, + "fp16_scale": 1.0, + "global_step": 1907, + "grad_norm": 1.9054340759625708, + "learning_rate": 6.870217447529463e-07, + "loss": 0.4863, + "step": 1907 + }, + { + "ETA": 2.29, + "epoch": 0.613603473227207, + "fp16_scale": 1.0, + "global_step": 1908, + "grad_norm": 2.0179621149474336, + "learning_rate": 6.860322763788101e-07, + "loss": 0.4488, + "step": 1908 + }, + { + "ETA": 2.29, + "epoch": 0.6139250683389612, + "fp16_scale": 1.0, + "global_step": 1909, + "grad_norm": 2.098744115240401, + "learning_rate": 6.850431488913895e-07, + "loss": 0.429, + "step": 1909 + }, + { + "ETA": 2.29, + "epoch": 0.6142466634507155, + "fp16_scale": 1.0, + "global_step": 1910, + "grad_norm": 2.0395094518502312, + "learning_rate": 6.840543633646186e-07, + "loss": 0.4948, + "step": 1910 + }, + { + "ETA": 2.28, + "epoch": 0.6145682585624699, + "fp16_scale": 1.0, + "global_step": 1911, + "grad_norm": 1.7849757703820304, + "learning_rate": 6.830659208720587e-07, + "loss": 0.4736, + "step": 1911 + }, + { + "ETA": 2.28, + "epoch": 0.6148898536742241, + "fp16_scale": 1.0, + "global_step": 1912, + "grad_norm": 2.15187119824761, + "learning_rate": 6.820778224868998e-07, + "loss": 0.4395, + "step": 1912 + }, + { + "ETA": 2.28, + "epoch": 0.6152114487859784, + "fp16_scale": 1.0, + "global_step": 1913, + "grad_norm": 1.8579408402211197, + "learning_rate": 6.810900692819581e-07, + "loss": 0.4385, + "step": 1913 + }, + { + "ETA": 2.28, + "epoch": 0.6155330438977328, + "fp16_scale": 1.0, + "global_step": 1914, + "grad_norm": 1.8482655430633033, + "learning_rate": 6.801026623296744e-07, + "loss": 0.4306, + "step": 1914 + }, + { + "ETA": 2.28, + "epoch": 0.615854639009487, + "fp16_scale": 1.0, + "global_step": 1915, + "grad_norm": 1.841883593780279, + "learning_rate": 6.791156027021147e-07, + "loss": 0.4127, + "step": 1915 + }, + { + "ETA": 2.28, + "epoch": 0.6161762341212413, + "fp16_scale": 1.0, + "global_step": 1916, + "grad_norm": 2.080739819301582, + "learning_rate": 6.781288914709665e-07, + "loss": 0.4478, + "step": 1916 + }, + { + "ETA": 2.27, + "epoch": 0.6164978292329957, + "fp16_scale": 1.0, + "global_step": 1917, + "grad_norm": 1.735666030974362, + "learning_rate": 6.771425297075404e-07, + "loss": 0.3685, + "step": 1917 + }, + { + "ETA": 2.27, + "epoch": 0.61681942434475, + "fp16_scale": 1.0, + "global_step": 1918, + "grad_norm": 2.2135331991280776, + "learning_rate": 6.76156518482767e-07, + "loss": 0.4302, + "step": 1918 + }, + { + "ETA": 2.27, + "epoch": 0.6171410194565042, + "fp16_scale": 1.0, + "global_step": 1919, + "grad_norm": 2.060755616454066, + "learning_rate": 6.751708588671954e-07, + "loss": 0.4693, + "step": 1919 + }, + { + "ETA": 2.27, + "epoch": 0.6174626145682586, + "fp16_scale": 1.0, + "global_step": 1920, + "grad_norm": 1.8526713368355843, + "learning_rate": 6.741855519309947e-07, + "loss": 0.4548, + "step": 1920 + }, + { + "ETA": 2.26, + "epoch": 0.6177842096800129, + "fp16_scale": 1.0, + "global_step": 1921, + "grad_norm": 2.0922034155667544, + "learning_rate": 6.732005987439493e-07, + "loss": 0.4153, + "step": 1921 + }, + { + "ETA": 2.26, + "epoch": 0.6181058047917671, + "fp16_scale": 1.0, + "global_step": 1922, + "grad_norm": 1.9621669577526282, + "learning_rate": 6.722160003754616e-07, + "loss": 0.433, + "step": 1922 + }, + { + "ETA": 2.26, + "epoch": 0.6184273999035215, + "fp16_scale": 1.0, + "global_step": 1923, + "grad_norm": 2.1596970828730875, + "learning_rate": 6.712317578945463e-07, + "loss": 0.4813, + "step": 1923 + }, + { + "ETA": 2.26, + "epoch": 0.6187489950152758, + "fp16_scale": 1.0, + "global_step": 1924, + "grad_norm": 1.969432853605627, + "learning_rate": 6.702478723698335e-07, + "loss": 0.5096, + "step": 1924 + }, + { + "ETA": 2.26, + "epoch": 0.61907059012703, + "fp16_scale": 1.0, + "global_step": 1925, + "grad_norm": 1.942849372023299, + "learning_rate": 6.692643448695653e-07, + "loss": 0.3988, + "step": 1925 + }, + { + "ETA": 2.26, + "epoch": 0.6193921852387844, + "fp16_scale": 1.0, + "global_step": 1926, + "grad_norm": 2.0304285304895644, + "learning_rate": 6.682811764615946e-07, + "loss": 0.4563, + "step": 1926 + }, + { + "ETA": 2.25, + "epoch": 0.6197137803505387, + "fp16_scale": 1.0, + "global_step": 1927, + "grad_norm": 1.9426106278397655, + "learning_rate": 6.672983682133854e-07, + "loss": 0.3837, + "step": 1927 + }, + { + "ETA": 2.25, + "epoch": 0.6200353754622929, + "fp16_scale": 1.0, + "global_step": 1928, + "grad_norm": 1.9978059912858224, + "learning_rate": 6.663159211920093e-07, + "loss": 0.4268, + "step": 1928 + }, + { + "ETA": 2.25, + "epoch": 0.6203569705740473, + "fp16_scale": 1.0, + "global_step": 1929, + "grad_norm": 1.8887839782993492, + "learning_rate": 6.653338364641471e-07, + "loss": 0.4658, + "step": 1929 + }, + { + "ETA": 2.25, + "epoch": 0.6206785656858016, + "fp16_scale": 1.0, + "global_step": 1930, + "grad_norm": 2.172058250104252, + "learning_rate": 6.643521150960854e-07, + "loss": 0.4601, + "step": 1930 + }, + { + "ETA": 2.25, + "epoch": 0.6210001607975558, + "fp16_scale": 1.0, + "global_step": 1931, + "grad_norm": 1.942563500353798, + "learning_rate": 6.633707581537158e-07, + "loss": 0.4898, + "step": 1931 + }, + { + "ETA": 2.25, + "epoch": 0.6213217559093102, + "fp16_scale": 1.0, + "global_step": 1932, + "grad_norm": 1.6498254195120632, + "learning_rate": 6.623897667025363e-07, + "loss": 0.4492, + "step": 1932 + }, + { + "ETA": 2.24, + "epoch": 0.6216433510210645, + "fp16_scale": 1.0, + "global_step": 1933, + "grad_norm": 2.3993883773088918, + "learning_rate": 6.614091418076452e-07, + "loss": 0.3456, + "step": 1933 + }, + { + "ETA": 2.24, + "epoch": 0.6219649461328188, + "fp16_scale": 1.0, + "global_step": 1934, + "grad_norm": 1.8617557065988062, + "learning_rate": 6.604288845337452e-07, + "loss": 0.4768, + "step": 1934 + }, + { + "ETA": 2.24, + "epoch": 0.622286541244573, + "fp16_scale": 1.0, + "global_step": 1935, + "grad_norm": 1.893530852376071, + "learning_rate": 6.59448995945139e-07, + "loss": 0.3804, + "step": 1935 + }, + { + "ETA": 2.24, + "epoch": 0.6226081363563274, + "fp16_scale": 1.0, + "global_step": 1936, + "grad_norm": 2.0786509523274965, + "learning_rate": 6.584694771057284e-07, + "loss": 0.3875, + "step": 1936 + }, + { + "ETA": 2.24, + "epoch": 0.6229297314680817, + "fp16_scale": 1.0, + "global_step": 1937, + "grad_norm": 1.912223968505479, + "learning_rate": 6.574903290790149e-07, + "loss": 0.4234, + "step": 1937 + }, + { + "ETA": 2.23, + "epoch": 0.623251326579836, + "fp16_scale": 1.0, + "global_step": 1938, + "grad_norm": 1.8817215013769721, + "learning_rate": 6.56511552928096e-07, + "loss": 0.456, + "step": 1938 + }, + { + "ETA": 2.23, + "epoch": 0.6235729216915903, + "fp16_scale": 1.0, + "global_step": 1939, + "grad_norm": 2.3199546637651274, + "learning_rate": 6.555331497156671e-07, + "loss": 0.4695, + "step": 1939 + }, + { + "ETA": 2.23, + "epoch": 0.6238945168033446, + "fp16_scale": 1.0, + "global_step": 1940, + "grad_norm": 1.8956014802235357, + "learning_rate": 6.545551205040173e-07, + "loss": 0.5008, + "step": 1940 + }, + { + "ETA": 2.23, + "epoch": 0.6242161119150988, + "fp16_scale": 1.0, + "global_step": 1941, + "grad_norm": 2.0032353917606947, + "learning_rate": 6.535774663550309e-07, + "loss": 0.4442, + "step": 1941 + }, + { + "ETA": 2.23, + "epoch": 0.6245377070268532, + "fp16_scale": 1.0, + "global_step": 1942, + "grad_norm": 1.757997986879624, + "learning_rate": 6.526001883301832e-07, + "loss": 0.3738, + "step": 1942 + }, + { + "ETA": 2.22, + "epoch": 0.6248593021386075, + "fp16_scale": 1.0, + "global_step": 1943, + "grad_norm": 1.761485498037052, + "learning_rate": 6.516232874905427e-07, + "loss": 0.4376, + "step": 1943 + }, + { + "ETA": 2.22, + "epoch": 0.6251808972503617, + "fp16_scale": 1.0, + "global_step": 1944, + "grad_norm": 1.9214973154495354, + "learning_rate": 6.506467648967683e-07, + "loss": 0.4894, + "step": 1944 + }, + { + "ETA": 2.22, + "epoch": 0.6255024923621161, + "fp16_scale": 1.0, + "global_step": 1945, + "grad_norm": 1.857485549019393, + "learning_rate": 6.496706216091065e-07, + "loss": 0.4339, + "step": 1945 + }, + { + "ETA": 2.22, + "epoch": 0.6258240874738704, + "fp16_scale": 1.0, + "global_step": 1946, + "grad_norm": 1.9873136616166283, + "learning_rate": 6.486948586873948e-07, + "loss": 0.3943, + "step": 1946 + }, + { + "ETA": 2.22, + "epoch": 0.6261456825856248, + "fp16_scale": 1.0, + "global_step": 1947, + "grad_norm": 1.9182273871188733, + "learning_rate": 6.477194771910553e-07, + "loss": 0.417, + "step": 1947 + }, + { + "ETA": 2.21, + "epoch": 0.626467277697379, + "fp16_scale": 1.0, + "global_step": 1948, + "grad_norm": 2.1600667481627926, + "learning_rate": 6.467444781790966e-07, + "loss": 0.4486, + "step": 1948 + }, + { + "ETA": 2.21, + "epoch": 0.6267888728091333, + "fp16_scale": 1.0, + "global_step": 1949, + "grad_norm": 1.7982034937822307, + "learning_rate": 6.457698627101131e-07, + "loss": 0.4166, + "step": 1949 + }, + { + "ETA": 2.21, + "epoch": 0.6271104679208876, + "fp16_scale": 1.0, + "global_step": 1950, + "grad_norm": 2.0885425256491366, + "learning_rate": 6.447956318422811e-07, + "loss": 0.3997, + "step": 1950 + }, + { + "ETA": 2.21, + "epoch": 0.6274320630326419, + "fp16_scale": 1.0, + "global_step": 1951, + "grad_norm": 1.8681279298899824, + "learning_rate": 6.438217866333607e-07, + "loss": 0.4551, + "step": 1951 + }, + { + "ETA": 2.21, + "epoch": 0.6277536581443962, + "fp16_scale": 1.0, + "global_step": 1952, + "grad_norm": 1.8907531289709132, + "learning_rate": 6.428483281406927e-07, + "loss": 0.449, + "step": 1952 + }, + { + "ETA": 2.2, + "epoch": 0.6280752532561505, + "fp16_scale": 1.0, + "global_step": 1953, + "grad_norm": 1.7998942403187896, + "learning_rate": 6.418752574211972e-07, + "loss": 0.4732, + "step": 1953 + }, + { + "ETA": 2.2, + "epoch": 0.6283968483679048, + "fp16_scale": 1.0, + "global_step": 1954, + "grad_norm": 1.883198087188713, + "learning_rate": 6.409025755313756e-07, + "loss": 0.4562, + "step": 1954 + }, + { + "ETA": 2.2, + "epoch": 0.6287184434796591, + "fp16_scale": 1.0, + "global_step": 1955, + "grad_norm": 1.975106498305029, + "learning_rate": 6.399302835273046e-07, + "loss": 0.4571, + "step": 1955 + }, + { + "ETA": 2.2, + "epoch": 0.6290400385914134, + "fp16_scale": 1.0, + "global_step": 1956, + "grad_norm": 1.8604538136213093, + "learning_rate": 6.38958382464639e-07, + "loss": 0.3717, + "step": 1956 + }, + { + "ETA": 2.2, + "epoch": 0.6293616337031677, + "fp16_scale": 1.0, + "global_step": 1957, + "grad_norm": 1.8549947953647987, + "learning_rate": 6.379868733986088e-07, + "loss": 0.5051, + "step": 1957 + }, + { + "ETA": 2.2, + "epoch": 0.629683228814922, + "fp16_scale": 1.0, + "global_step": 1958, + "grad_norm": 1.7524149471514499, + "learning_rate": 6.370157573840187e-07, + "loss": 0.4633, + "step": 1958 + }, + { + "ETA": 2.19, + "epoch": 0.6300048239266763, + "fp16_scale": 1.0, + "global_step": 1959, + "grad_norm": 1.9221223955013331, + "learning_rate": 6.360450354752458e-07, + "loss": 0.4509, + "step": 1959 + }, + { + "ETA": 2.19, + "epoch": 0.6303264190384306, + "fp16_scale": 1.0, + "global_step": 1960, + "grad_norm": 2.194917362984832, + "learning_rate": 6.3507470872624e-07, + "loss": 0.4769, + "step": 1960 + }, + { + "ETA": 2.19, + "epoch": 0.6306480141501849, + "fp16_scale": 1.0, + "global_step": 1961, + "grad_norm": 1.9024229881646617, + "learning_rate": 6.341047781905222e-07, + "loss": 0.4523, + "step": 1961 + }, + { + "ETA": 2.19, + "epoch": 0.6309696092619392, + "fp16_scale": 1.0, + "global_step": 1962, + "grad_norm": 1.8615673024806854, + "learning_rate": 6.331352449211826e-07, + "loss": 0.4447, + "step": 1962 + }, + { + "ETA": 2.19, + "epoch": 0.6312912043736936, + "fp16_scale": 1.0, + "global_step": 1963, + "grad_norm": 1.793802394033057, + "learning_rate": 6.321661099708811e-07, + "loss": 0.4711, + "step": 1963 + }, + { + "ETA": 2.18, + "epoch": 0.6316127994854478, + "fp16_scale": 1.0, + "global_step": 1964, + "grad_norm": 2.1104973254266373, + "learning_rate": 6.311973743918437e-07, + "loss": 0.3755, + "step": 1964 + }, + { + "ETA": 2.18, + "epoch": 0.6319343945972021, + "fp16_scale": 1.0, + "global_step": 1965, + "grad_norm": 2.1964254451114145, + "learning_rate": 6.302290392358635e-07, + "loss": 0.5188, + "step": 1965 + }, + { + "ETA": 2.18, + "epoch": 0.6322559897089565, + "fp16_scale": 1.0, + "global_step": 1966, + "grad_norm": 1.751616382161262, + "learning_rate": 6.292611055542996e-07, + "loss": 0.3545, + "step": 1966 + }, + { + "ETA": 2.18, + "epoch": 0.6325775848207107, + "fp16_scale": 1.0, + "global_step": 1967, + "grad_norm": 2.1647898881601546, + "learning_rate": 6.282935743980735e-07, + "loss": 0.4911, + "step": 1967 + }, + { + "ETA": 2.18, + "epoch": 0.632899179932465, + "fp16_scale": 1.0, + "global_step": 1968, + "grad_norm": 1.949564451269776, + "learning_rate": 6.273264468176715e-07, + "loss": 0.3563, + "step": 1968 + }, + { + "ETA": 2.17, + "epoch": 0.6332207750442194, + "fp16_scale": 1.0, + "global_step": 1969, + "grad_norm": 2.1152615023556467, + "learning_rate": 6.263597238631404e-07, + "loss": 0.4673, + "step": 1969 + }, + { + "ETA": 2.17, + "epoch": 0.6335423701559736, + "fp16_scale": 1.0, + "global_step": 1970, + "grad_norm": 2.0050156477818835, + "learning_rate": 6.253934065840879e-07, + "loss": 0.4085, + "step": 1970 + }, + { + "ETA": 2.17, + "epoch": 0.6338639652677279, + "fp16_scale": 1.0, + "global_step": 1971, + "grad_norm": 2.095570426093439, + "learning_rate": 6.244274960296823e-07, + "loss": 0.4026, + "step": 1971 + }, + { + "ETA": 2.17, + "epoch": 0.6341855603794823, + "fp16_scale": 1.0, + "global_step": 1972, + "grad_norm": 2.1479747684036172, + "learning_rate": 6.234619932486485e-07, + "loss": 0.4654, + "step": 1972 + }, + { + "ETA": 2.17, + "epoch": 0.6345071554912365, + "fp16_scale": 1.0, + "global_step": 1973, + "grad_norm": 1.8034781534833133, + "learning_rate": 6.224968992892701e-07, + "loss": 0.4604, + "step": 1973 + }, + { + "ETA": 2.16, + "epoch": 0.6348287506029908, + "fp16_scale": 1.0, + "global_step": 1974, + "grad_norm": 1.811041843738487, + "learning_rate": 6.215322151993863e-07, + "loss": 0.4874, + "step": 1974 + }, + { + "ETA": 2.16, + "epoch": 0.6351503457147452, + "fp16_scale": 1.0, + "global_step": 1975, + "grad_norm": 1.9841854199069409, + "learning_rate": 6.205679420263916e-07, + "loss": 0.4831, + "step": 1975 + }, + { + "ETA": 2.16, + "epoch": 0.6354719408264994, + "fp16_scale": 1.0, + "global_step": 1976, + "grad_norm": 1.9980204524170417, + "learning_rate": 6.196040808172343e-07, + "loss": 0.4439, + "step": 1976 + }, + { + "ETA": 2.16, + "epoch": 0.6357935359382537, + "fp16_scale": 1.0, + "global_step": 1977, + "grad_norm": 1.9608922773188318, + "learning_rate": 6.186406326184143e-07, + "loss": 0.4321, + "step": 1977 + }, + { + "ETA": 2.16, + "epoch": 0.636115131050008, + "fp16_scale": 1.0, + "global_step": 1978, + "grad_norm": 2.2580222602067272, + "learning_rate": 6.176775984759847e-07, + "loss": 0.4243, + "step": 1978 + }, + { + "ETA": 2.16, + "epoch": 0.6364367261617624, + "fp16_scale": 1.0, + "global_step": 1979, + "grad_norm": 2.1119575206185974, + "learning_rate": 6.167149794355481e-07, + "loss": 0.4395, + "step": 1979 + }, + { + "ETA": 2.15, + "epoch": 0.6367583212735166, + "fp16_scale": 1.0, + "global_step": 1980, + "grad_norm": 1.811932127171653, + "learning_rate": 6.157527765422573e-07, + "loss": 0.3301, + "step": 1980 + }, + { + "ETA": 2.15, + "epoch": 0.637079916385271, + "fp16_scale": 1.0, + "global_step": 1981, + "grad_norm": 1.9755538028177826, + "learning_rate": 6.147909908408115e-07, + "loss": 0.5009, + "step": 1981 + }, + { + "ETA": 2.15, + "epoch": 0.6374015114970253, + "fp16_scale": 1.0, + "global_step": 1982, + "grad_norm": 1.757935829368295, + "learning_rate": 6.138296233754587e-07, + "loss": 0.4297, + "step": 1982 + }, + { + "ETA": 2.15, + "epoch": 0.6377231066087795, + "fp16_scale": 1.0, + "global_step": 1983, + "grad_norm": 2.1585630597789818, + "learning_rate": 6.128686751899924e-07, + "loss": 0.4291, + "step": 1983 + }, + { + "ETA": 2.15, + "epoch": 0.6380447017205338, + "fp16_scale": 1.0, + "global_step": 1984, + "grad_norm": 1.9614757976606525, + "learning_rate": 6.119081473277501e-07, + "loss": 0.4499, + "step": 1984 + }, + { + "ETA": 2.14, + "epoch": 0.6383662968322882, + "fp16_scale": 1.0, + "global_step": 1985, + "grad_norm": 1.8461831674771794, + "learning_rate": 6.109480408316143e-07, + "loss": 0.361, + "step": 1985 + }, + { + "ETA": 2.14, + "epoch": 0.6386878919440424, + "fp16_scale": 1.0, + "global_step": 1986, + "grad_norm": 2.5933550939749392, + "learning_rate": 6.099883567440081e-07, + "loss": 0.486, + "step": 1986 + }, + { + "ETA": 2.14, + "epoch": 0.6390094870557967, + "fp16_scale": 1.0, + "global_step": 1987, + "grad_norm": 1.7600846185114918, + "learning_rate": 6.090290961068978e-07, + "loss": 0.4141, + "step": 1987 + }, + { + "ETA": 2.14, + "epoch": 0.6393310821675511, + "fp16_scale": 1.0, + "global_step": 1988, + "grad_norm": 2.1253603328467463, + "learning_rate": 6.080702599617892e-07, + "loss": 0.3644, + "step": 1988 + }, + { + "ETA": 2.14, + "epoch": 0.6396526772793053, + "fp16_scale": 1.0, + "global_step": 1989, + "grad_norm": 2.197696654861091, + "learning_rate": 6.07111849349727e-07, + "loss": 0.3995, + "step": 1989 + }, + { + "ETA": 2.13, + "epoch": 0.6399742723910596, + "fp16_scale": 1.0, + "global_step": 1990, + "grad_norm": 1.9931956613987192, + "learning_rate": 6.061538653112941e-07, + "loss": 0.4471, + "step": 1990 + }, + { + "ETA": 2.13, + "epoch": 0.640295867502814, + "fp16_scale": 1.0, + "global_step": 1991, + "grad_norm": 1.8791204448826448, + "learning_rate": 6.051963088866101e-07, + "loss": 0.3875, + "step": 1991 + }, + { + "ETA": 2.13, + "epoch": 0.6406174626145683, + "fp16_scale": 1.0, + "global_step": 1992, + "grad_norm": 1.865998253665651, + "learning_rate": 6.042391811153309e-07, + "loss": 0.4142, + "step": 1992 + }, + { + "ETA": 2.13, + "epoch": 0.6409390577263225, + "fp16_scale": 1.0, + "global_step": 1993, + "grad_norm": 2.0521484255436757, + "learning_rate": 6.032824830366466e-07, + "loss": 0.4343, + "step": 1993 + }, + { + "ETA": 2.13, + "epoch": 0.6412606528380769, + "fp16_scale": 1.0, + "global_step": 1994, + "grad_norm": 1.9804359524170014, + "learning_rate": 6.023262156892801e-07, + "loss": 0.3973, + "step": 1994 + }, + { + "ETA": 2.12, + "epoch": 0.6415822479498312, + "fp16_scale": 1.0, + "global_step": 1995, + "grad_norm": 2.1542457139304325, + "learning_rate": 6.01370380111488e-07, + "loss": 0.3889, + "step": 1995 + }, + { + "ETA": 2.12, + "epoch": 0.6419038430615854, + "fp16_scale": 1.0, + "global_step": 1996, + "grad_norm": 2.0882781621428017, + "learning_rate": 6.004149773410568e-07, + "loss": 0.4484, + "step": 1996 + }, + { + "ETA": 2.12, + "epoch": 0.6422254381733398, + "fp16_scale": 1.0, + "global_step": 1997, + "grad_norm": 2.234274236906816, + "learning_rate": 5.994600084153043e-07, + "loss": 0.4473, + "step": 1997 + }, + { + "ETA": 2.12, + "epoch": 0.6425470332850941, + "fp16_scale": 1.0, + "global_step": 1998, + "grad_norm": 2.0635888020273354, + "learning_rate": 5.985054743710763e-07, + "loss": 0.425, + "step": 1998 + }, + { + "ETA": 2.12, + "epoch": 0.6428686283968483, + "fp16_scale": 1.0, + "global_step": 1999, + "grad_norm": 1.8860137964659152, + "learning_rate": 5.975513762447464e-07, + "loss": 0.4515, + "step": 1999 + }, + { + "ETA": 2.11, + "epoch": 0.6431902235086027, + "fp16_scale": 1.0, + "global_step": 2000, + "grad_norm": 2.2500901908562296, + "learning_rate": 5.965977150722159e-07, + "loss": 0.3894, + "step": 2000 + }, + { + "ETA": 2.12, + "epoch": 0.643511818620357, + "fp16_scale": 1.0, + "global_step": 2001, + "grad_norm": 1.8436424869277004, + "learning_rate": 5.956444918889107e-07, + "loss": 0.3802, + "step": 2001 + }, + { + "ETA": 2.12, + "epoch": 0.6438334137321112, + "fp16_scale": 1.0, + "global_step": 2002, + "grad_norm": 2.054805311035249, + "learning_rate": 5.946917077297819e-07, + "loss": 0.4297, + "step": 2002 + }, + { + "ETA": 2.11, + "epoch": 0.6441550088438656, + "fp16_scale": 1.0, + "global_step": 2003, + "grad_norm": 1.7534913161624872, + "learning_rate": 5.937393636293029e-07, + "loss": 0.3728, + "step": 2003 + }, + { + "ETA": 2.11, + "epoch": 0.6444766039556199, + "fp16_scale": 1.0, + "global_step": 2004, + "grad_norm": 2.0300920903469537, + "learning_rate": 5.927874606214704e-07, + "loss": 0.4116, + "step": 2004 + }, + { + "ETA": 2.11, + "epoch": 0.6447981990673741, + "fp16_scale": 1.0, + "global_step": 2005, + "grad_norm": 2.2765492258820026, + "learning_rate": 5.918359997398019e-07, + "loss": 0.4623, + "step": 2005 + }, + { + "ETA": 2.11, + "epoch": 0.6451197941791285, + "fp16_scale": 1.0, + "global_step": 2006, + "grad_norm": 1.8058215948622816, + "learning_rate": 5.908849820173343e-07, + "loss": 0.3361, + "step": 2006 + }, + { + "ETA": 2.11, + "epoch": 0.6454413892908828, + "fp16_scale": 1.0, + "global_step": 2007, + "grad_norm": 1.9983789993118626, + "learning_rate": 5.899344084866243e-07, + "loss": 0.4584, + "step": 2007 + }, + { + "ETA": 2.1, + "epoch": 0.6457629844026371, + "fp16_scale": 1.0, + "global_step": 2008, + "grad_norm": 1.9215556764900221, + "learning_rate": 5.88984280179745e-07, + "loss": 0.4054, + "step": 2008 + }, + { + "ETA": 2.1, + "epoch": 0.6460845795143914, + "fp16_scale": 1.0, + "global_step": 2009, + "grad_norm": 2.00279358717218, + "learning_rate": 5.880345981282876e-07, + "loss": 0.4736, + "step": 2009 + }, + { + "ETA": 2.1, + "epoch": 0.6464061746261457, + "fp16_scale": 1.0, + "global_step": 2010, + "grad_norm": 2.0955002318335554, + "learning_rate": 5.87085363363358e-07, + "loss": 0.4548, + "step": 2010 + }, + { + "ETA": 2.1, + "epoch": 0.6467277697379, + "fp16_scale": 1.0, + "global_step": 2011, + "grad_norm": 2.073198212831422, + "learning_rate": 5.861365769155759e-07, + "loss": 0.3939, + "step": 2011 + }, + { + "ETA": 2.1, + "epoch": 0.6470493648496543, + "fp16_scale": 1.0, + "global_step": 2012, + "grad_norm": 2.135562977334264, + "learning_rate": 5.851882398150756e-07, + "loss": 0.3873, + "step": 2012 + }, + { + "ETA": 2.09, + "epoch": 0.6473709599614086, + "fp16_scale": 1.0, + "global_step": 2013, + "grad_norm": 1.6888098628966262, + "learning_rate": 5.842403530915024e-07, + "loss": 0.3549, + "step": 2013 + }, + { + "ETA": 2.09, + "epoch": 0.6476925550731629, + "fp16_scale": 1.0, + "global_step": 2014, + "grad_norm": 1.9980347065391566, + "learning_rate": 5.832929177740133e-07, + "loss": 0.4361, + "step": 2014 + }, + { + "ETA": 2.09, + "epoch": 0.6480141501849171, + "fp16_scale": 1.0, + "global_step": 2015, + "grad_norm": 1.8838140797820973, + "learning_rate": 5.823459348912747e-07, + "loss": 0.4179, + "step": 2015 + }, + { + "ETA": 2.09, + "epoch": 0.6483357452966715, + "fp16_scale": 1.0, + "global_step": 2016, + "grad_norm": 1.9322911975799548, + "learning_rate": 5.813994054714624e-07, + "loss": 0.3777, + "step": 2016 + }, + { + "ETA": 2.09, + "epoch": 0.6486573404084258, + "fp16_scale": 1.0, + "global_step": 2017, + "grad_norm": 1.9250320413518434, + "learning_rate": 5.80453330542259e-07, + "loss": 0.427, + "step": 2017 + }, + { + "ETA": 2.09, + "epoch": 0.64897893552018, + "fp16_scale": 1.0, + "global_step": 2018, + "grad_norm": 2.38827320000801, + "learning_rate": 5.795077111308539e-07, + "loss": 0.4415, + "step": 2018 + }, + { + "ETA": 2.08, + "epoch": 0.6493005306319344, + "fp16_scale": 1.0, + "global_step": 2019, + "grad_norm": 2.0461024210639307, + "learning_rate": 5.785625482639425e-07, + "loss": 0.4369, + "step": 2019 + }, + { + "ETA": 2.08, + "epoch": 0.6496221257436887, + "fp16_scale": 1.0, + "global_step": 2020, + "grad_norm": 2.156475214040958, + "learning_rate": 5.776178429677238e-07, + "loss": 0.4359, + "step": 2020 + }, + { + "ETA": 2.08, + "epoch": 0.649943720855443, + "fp16_scale": 1.0, + "global_step": 2021, + "grad_norm": 2.1010704366490227, + "learning_rate": 5.76673596267901e-07, + "loss": 0.4928, + "step": 2021 + }, + { + "ETA": 2.08, + "epoch": 0.6502653159671973, + "fp16_scale": 1.0, + "global_step": 2022, + "grad_norm": 1.8865559086622918, + "learning_rate": 5.757298091896783e-07, + "loss": 0.444, + "step": 2022 + }, + { + "ETA": 2.08, + "epoch": 0.6505869110789516, + "fp16_scale": 1.0, + "global_step": 2023, + "grad_norm": 2.018950994626792, + "learning_rate": 5.747864827577608e-07, + "loss": 0.3737, + "step": 2023 + }, + { + "ETA": 2.07, + "epoch": 0.650908506190706, + "fp16_scale": 1.0, + "global_step": 2024, + "grad_norm": 1.9639450586195586, + "learning_rate": 5.738436179963544e-07, + "loss": 0.3379, + "step": 2024 + }, + { + "ETA": 2.07, + "epoch": 0.6512301013024602, + "fp16_scale": 1.0, + "global_step": 2025, + "grad_norm": 2.0222112000629164, + "learning_rate": 5.729012159291633e-07, + "loss": 0.4636, + "step": 2025 + }, + { + "ETA": 2.07, + "epoch": 0.6515516964142145, + "fp16_scale": 1.0, + "global_step": 2026, + "grad_norm": 1.9355969546007485, + "learning_rate": 5.719592775793897e-07, + "loss": 0.463, + "step": 2026 + }, + { + "ETA": 2.07, + "epoch": 0.6518732915259688, + "fp16_scale": 1.0, + "global_step": 2027, + "grad_norm": 2.111716710658819, + "learning_rate": 5.710178039697313e-07, + "loss": 0.3557, + "step": 2027 + }, + { + "ETA": 2.07, + "epoch": 0.6521948866377231, + "fp16_scale": 1.0, + "global_step": 2028, + "grad_norm": 2.202734879548525, + "learning_rate": 5.700767961223818e-07, + "loss": 0.4938, + "step": 2028 + }, + { + "ETA": 2.07, + "epoch": 0.6525164817494774, + "fp16_scale": 1.0, + "global_step": 2029, + "grad_norm": 1.8289846529993485, + "learning_rate": 5.691362550590296e-07, + "loss": 0.415, + "step": 2029 + }, + { + "ETA": 2.06, + "epoch": 0.6528380768612317, + "fp16_scale": 1.0, + "global_step": 2030, + "grad_norm": 1.8551396366004673, + "learning_rate": 5.681961818008558e-07, + "loss": 0.4116, + "step": 2030 + }, + { + "ETA": 2.06, + "epoch": 0.653159671972986, + "fp16_scale": 1.0, + "global_step": 2031, + "grad_norm": 1.9685686893056107, + "learning_rate": 5.672565773685343e-07, + "loss": 0.4479, + "step": 2031 + }, + { + "ETA": 2.06, + "epoch": 0.6534812670847403, + "fp16_scale": 1.0, + "global_step": 2032, + "grad_norm": 1.8184331124615274, + "learning_rate": 5.663174427822284e-07, + "loss": 0.3493, + "step": 2032 + }, + { + "ETA": 2.06, + "epoch": 0.6538028621964946, + "fp16_scale": 1.0, + "global_step": 2033, + "grad_norm": 1.92902744488878, + "learning_rate": 5.653787790615934e-07, + "loss": 0.4476, + "step": 2033 + }, + { + "ETA": 2.06, + "epoch": 0.6541244573082489, + "fp16_scale": 1.0, + "global_step": 2034, + "grad_norm": 1.7453219695542925, + "learning_rate": 5.644405872257716e-07, + "loss": 0.4327, + "step": 2034 + }, + { + "ETA": 2.05, + "epoch": 0.6544460524200032, + "fp16_scale": 1.0, + "global_step": 2035, + "grad_norm": 2.0565556645252623, + "learning_rate": 5.635028682933928e-07, + "loss": 0.4334, + "step": 2035 + }, + { + "ETA": 2.05, + "epoch": 0.6547676475317575, + "fp16_scale": 1.0, + "global_step": 2036, + "grad_norm": 1.8334976412186053, + "learning_rate": 5.62565623282576e-07, + "loss": 0.3892, + "step": 2036 + }, + { + "ETA": 2.05, + "epoch": 0.6550892426435119, + "fp16_scale": 1.0, + "global_step": 2037, + "grad_norm": 2.061393856117499, + "learning_rate": 5.616288532109224e-07, + "loss": 0.4146, + "step": 2037 + }, + { + "ETA": 2.05, + "epoch": 0.6554108377552661, + "fp16_scale": 1.0, + "global_step": 2038, + "grad_norm": 1.9852001975556002, + "learning_rate": 5.606925590955198e-07, + "loss": 0.4019, + "step": 2038 + }, + { + "ETA": 2.05, + "epoch": 0.6557324328670204, + "fp16_scale": 1.0, + "global_step": 2039, + "grad_norm": 2.176443428455659, + "learning_rate": 5.597567419529381e-07, + "loss": 0.4754, + "step": 2039 + }, + { + "ETA": 2.05, + "epoch": 0.6560540279787748, + "fp16_scale": 1.0, + "global_step": 2040, + "grad_norm": 1.855250357011581, + "learning_rate": 5.58821402799229e-07, + "loss": 0.4311, + "step": 2040 + }, + { + "ETA": 2.04, + "epoch": 0.656375623090529, + "fp16_scale": 1.0, + "global_step": 2041, + "grad_norm": 2.314909571286062, + "learning_rate": 5.578865426499265e-07, + "loss": 0.4096, + "step": 2041 + }, + { + "ETA": 2.04, + "epoch": 0.6566972182022833, + "fp16_scale": 1.0, + "global_step": 2042, + "grad_norm": 2.1433437129678943, + "learning_rate": 5.569521625200435e-07, + "loss": 0.4667, + "step": 2042 + }, + { + "ETA": 2.04, + "epoch": 0.6570188133140377, + "fp16_scale": 1.0, + "global_step": 2043, + "grad_norm": 1.5629412526012503, + "learning_rate": 5.560182634240729e-07, + "loss": 0.4074, + "step": 2043 + }, + { + "ETA": 2.04, + "epoch": 0.6573404084257919, + "fp16_scale": 1.0, + "global_step": 2044, + "grad_norm": 2.1051220122344114, + "learning_rate": 5.550848463759834e-07, + "loss": 0.4296, + "step": 2044 + }, + { + "ETA": 2.04, + "epoch": 0.6576620035375462, + "fp16_scale": 1.0, + "global_step": 2045, + "grad_norm": 2.0755242456635004, + "learning_rate": 5.541519123892224e-07, + "loss": 0.4052, + "step": 2045 + }, + { + "ETA": 2.03, + "epoch": 0.6579835986493006, + "fp16_scale": 1.0, + "global_step": 2046, + "grad_norm": 2.0063920244386697, + "learning_rate": 5.532194624767111e-07, + "loss": 0.4453, + "step": 2046 + }, + { + "ETA": 2.03, + "epoch": 0.6583051937610548, + "fp16_scale": 1.0, + "global_step": 2047, + "grad_norm": 1.9310314752786921, + "learning_rate": 5.522874976508463e-07, + "loss": 0.422, + "step": 2047 + }, + { + "ETA": 2.03, + "epoch": 0.6586267888728091, + "fp16_scale": 1.0, + "global_step": 2048, + "grad_norm": 1.8147845689337565, + "learning_rate": 5.513560189234978e-07, + "loss": 0.5086, + "step": 2048 + }, + { + "ETA": 2.03, + "epoch": 0.6589483839845635, + "fp16_scale": 1.0, + "global_step": 2049, + "grad_norm": 1.9326449795983494, + "learning_rate": 5.504250273060072e-07, + "loss": 0.3729, + "step": 2049 + }, + { + "ETA": 2.03, + "epoch": 0.6592699790963177, + "fp16_scale": 1.0, + "global_step": 2050, + "grad_norm": 1.839187483126056, + "learning_rate": 5.49494523809188e-07, + "loss": 0.4142, + "step": 2050 + }, + { + "ETA": 2.02, + "epoch": 0.659591574208072, + "fp16_scale": 1.0, + "global_step": 2051, + "grad_norm": 2.0458852111091383, + "learning_rate": 5.485645094433227e-07, + "loss": 0.4033, + "step": 2051 + }, + { + "ETA": 2.02, + "epoch": 0.6599131693198264, + "fp16_scale": 1.0, + "global_step": 2052, + "grad_norm": 1.982259199461479, + "learning_rate": 5.476349852181634e-07, + "loss": 0.4611, + "step": 2052 + }, + { + "ETA": 2.02, + "epoch": 0.6602347644315807, + "fp16_scale": 1.0, + "global_step": 2053, + "grad_norm": 1.9886670061567648, + "learning_rate": 5.467059521429309e-07, + "loss": 0.425, + "step": 2053 + }, + { + "ETA": 2.02, + "epoch": 0.6605563595433349, + "fp16_scale": 1.0, + "global_step": 2054, + "grad_norm": 2.0980190758463624, + "learning_rate": 5.457774112263105e-07, + "loss": 0.4097, + "step": 2054 + }, + { + "ETA": 2.02, + "epoch": 0.6608779546550893, + "fp16_scale": 1.0, + "global_step": 2055, + "grad_norm": 1.7706319961454073, + "learning_rate": 5.448493634764554e-07, + "loss": 0.3716, + "step": 2055 + }, + { + "ETA": 2.01, + "epoch": 0.6611995497668436, + "fp16_scale": 1.0, + "global_step": 2056, + "grad_norm": 1.970227029214789, + "learning_rate": 5.439218099009822e-07, + "loss": 0.3874, + "step": 2056 + }, + { + "ETA": 2.01, + "epoch": 0.6615211448785978, + "fp16_scale": 1.0, + "global_step": 2057, + "grad_norm": 2.433884192071341, + "learning_rate": 5.429947515069699e-07, + "loss": 0.411, + "step": 2057 + }, + { + "ETA": 2.01, + "epoch": 0.6618427399903521, + "fp16_scale": 1.0, + "global_step": 2058, + "grad_norm": 2.353209000193179, + "learning_rate": 5.42068189300963e-07, + "loss": 0.462, + "step": 2058 + }, + { + "ETA": 2.01, + "epoch": 0.6621643351021065, + "fp16_scale": 1.0, + "global_step": 2059, + "grad_norm": 2.0345860122223227, + "learning_rate": 5.411421242889642e-07, + "loss": 0.4137, + "step": 2059 + }, + { + "ETA": 2.01, + "epoch": 0.6624859302138607, + "fp16_scale": 1.0, + "global_step": 2060, + "grad_norm": 1.957565225211963, + "learning_rate": 5.402165574764383e-07, + "loss": 0.4172, + "step": 2060 + }, + { + "ETA": 2.0, + "epoch": 0.662807525325615, + "fp16_scale": 1.0, + "global_step": 2061, + "grad_norm": 2.180623010242645, + "learning_rate": 5.392914898683077e-07, + "loss": 0.4962, + "step": 2061 + }, + { + "ETA": 2.0, + "epoch": 0.6631291204373694, + "fp16_scale": 1.0, + "global_step": 2062, + "grad_norm": 1.8248603953685665, + "learning_rate": 5.38366922468954e-07, + "loss": 0.4274, + "step": 2062 + }, + { + "ETA": 2.0, + "epoch": 0.6634507155491236, + "fp16_scale": 1.0, + "global_step": 2063, + "grad_norm": 1.8267433758122193, + "learning_rate": 5.374428562822151e-07, + "loss": 0.4159, + "step": 2063 + }, + { + "ETA": 2.0, + "epoch": 0.6637723106608779, + "fp16_scale": 1.0, + "global_step": 2064, + "grad_norm": 1.923510432210067, + "learning_rate": 5.365192923113846e-07, + "loss": 0.4498, + "step": 2064 + }, + { + "ETA": 2.0, + "epoch": 0.6640939057726323, + "fp16_scale": 1.0, + "global_step": 2065, + "grad_norm": 1.9820096390596678, + "learning_rate": 5.355962315592118e-07, + "loss": 0.4484, + "step": 2065 + }, + { + "ETA": 1.99, + "epoch": 0.6644155008843866, + "fp16_scale": 1.0, + "global_step": 2066, + "grad_norm": 1.954182301373188, + "learning_rate": 5.34673675027898e-07, + "loss": 0.4506, + "step": 2066 + }, + { + "ETA": 1.99, + "epoch": 0.6647370959961408, + "fp16_scale": 1.0, + "global_step": 2067, + "grad_norm": 2.10881723708717, + "learning_rate": 5.337516237190989e-07, + "loss": 0.3476, + "step": 2067 + }, + { + "ETA": 1.99, + "epoch": 0.6650586911078952, + "fp16_scale": 1.0, + "global_step": 2068, + "grad_norm": 2.0858184276372653, + "learning_rate": 5.328300786339199e-07, + "loss": 0.4174, + "step": 2068 + }, + { + "ETA": 1.99, + "epoch": 0.6653802862196495, + "fp16_scale": 1.0, + "global_step": 2069, + "grad_norm": 1.725948847457649, + "learning_rate": 5.319090407729179e-07, + "loss": 0.419, + "step": 2069 + }, + { + "ETA": 1.99, + "epoch": 0.6657018813314037, + "fp16_scale": 1.0, + "global_step": 2070, + "grad_norm": 1.9176502634160255, + "learning_rate": 5.309885111360993e-07, + "loss": 0.4788, + "step": 2070 + }, + { + "ETA": 1.98, + "epoch": 0.6660234764431581, + "fp16_scale": 1.0, + "global_step": 2071, + "grad_norm": 1.8852044620717578, + "learning_rate": 5.300684907229172e-07, + "loss": 0.3537, + "step": 2071 + }, + { + "ETA": 1.98, + "epoch": 0.6663450715549124, + "fp16_scale": 1.0, + "global_step": 2072, + "grad_norm": 1.9543436571900248, + "learning_rate": 5.291489805322738e-07, + "loss": 0.3813, + "step": 2072 + }, + { + "ETA": 1.98, + "epoch": 0.6666666666666666, + "fp16_scale": 1.0, + "global_step": 2073, + "grad_norm": 2.141372059399777, + "learning_rate": 5.282299815625153e-07, + "loss": 0.4728, + "step": 2073 + }, + { + "ETA": 1.98, + "epoch": 0.666988261778421, + "fp16_scale": 1.0, + "global_step": 2074, + "grad_norm": 2.0654724895865955, + "learning_rate": 5.273114948114346e-07, + "loss": 0.4454, + "step": 2074 + }, + { + "ETA": 1.98, + "epoch": 0.6673098568901753, + "fp16_scale": 1.0, + "global_step": 2075, + "grad_norm": 1.9880234369103011, + "learning_rate": 5.26393521276268e-07, + "loss": 0.4283, + "step": 2075 + }, + { + "ETA": 1.98, + "epoch": 0.6676314520019295, + "fp16_scale": 1.0, + "global_step": 2076, + "grad_norm": 1.9457472789077968, + "learning_rate": 5.254760619536935e-07, + "loss": 0.4031, + "step": 2076 + }, + { + "ETA": 1.97, + "epoch": 0.6679530471136839, + "fp16_scale": 1.0, + "global_step": 2077, + "grad_norm": 1.9498255842743843, + "learning_rate": 5.245591178398323e-07, + "loss": 0.4649, + "step": 2077 + }, + { + "ETA": 1.97, + "epoch": 0.6682746422254382, + "fp16_scale": 1.0, + "global_step": 2078, + "grad_norm": 1.9434489908906305, + "learning_rate": 5.23642689930245e-07, + "loss": 0.3411, + "step": 2078 + }, + { + "ETA": 1.97, + "epoch": 0.6685962373371924, + "fp16_scale": 1.0, + "global_step": 2079, + "grad_norm": 2.051200817295661, + "learning_rate": 5.227267792199332e-07, + "loss": 0.3771, + "step": 2079 + }, + { + "ETA": 1.97, + "epoch": 0.6689178324489468, + "fp16_scale": 1.0, + "global_step": 2080, + "grad_norm": 1.8532587279366703, + "learning_rate": 5.218113867033349e-07, + "loss": 0.3315, + "step": 2080 + }, + { + "ETA": 1.96, + "epoch": 0.6692394275607011, + "fp16_scale": 1.0, + "global_step": 2081, + "grad_norm": 2.0683333349623556, + "learning_rate": 5.208965133743271e-07, + "loss": 0.3896, + "step": 2081 + }, + { + "ETA": 1.96, + "epoch": 0.6695610226724554, + "fp16_scale": 1.0, + "global_step": 2082, + "grad_norm": 1.9150254189977027, + "learning_rate": 5.199821602262231e-07, + "loss": 0.491, + "step": 2082 + }, + { + "ETA": 1.96, + "epoch": 0.6698826177842097, + "fp16_scale": 1.0, + "global_step": 2083, + "grad_norm": 1.8251708279077623, + "learning_rate": 5.190683282517701e-07, + "loss": 0.369, + "step": 2083 + }, + { + "ETA": 1.96, + "epoch": 0.670204212895964, + "fp16_scale": 1.0, + "global_step": 2084, + "grad_norm": 2.280440590990741, + "learning_rate": 5.18155018443151e-07, + "loss": 0.3774, + "step": 2084 + }, + { + "ETA": 1.96, + "epoch": 0.6705258080077183, + "fp16_scale": 1.0, + "global_step": 2085, + "grad_norm": 2.0733753399267445, + "learning_rate": 5.172422317919804e-07, + "loss": 0.4403, + "step": 2085 + }, + { + "ETA": 1.95, + "epoch": 0.6708474031194726, + "fp16_scale": 1.0, + "global_step": 2086, + "grad_norm": 2.302744266658567, + "learning_rate": 5.163299692893059e-07, + "loss": 0.4789, + "step": 2086 + }, + { + "ETA": 1.95, + "epoch": 0.6711689982312269, + "fp16_scale": 1.0, + "global_step": 2087, + "grad_norm": 2.106795394185773, + "learning_rate": 5.15418231925606e-07, + "loss": 0.4034, + "step": 2087 + }, + { + "ETA": 1.95, + "epoch": 0.6714905933429812, + "fp16_scale": 1.0, + "global_step": 2088, + "grad_norm": 2.233870228931919, + "learning_rate": 5.14507020690788e-07, + "loss": 0.4579, + "step": 2088 + }, + { + "ETA": 1.95, + "epoch": 0.6718121884547354, + "fp16_scale": 1.0, + "global_step": 2089, + "grad_norm": 2.0075117201584884, + "learning_rate": 5.135963365741891e-07, + "loss": 0.4422, + "step": 2089 + }, + { + "ETA": 1.95, + "epoch": 0.6721337835664898, + "fp16_scale": 1.0, + "global_step": 2090, + "grad_norm": 2.077780167824397, + "learning_rate": 5.126861805645734e-07, + "loss": 0.4361, + "step": 2090 + }, + { + "ETA": 1.95, + "epoch": 0.6724553786782441, + "fp16_scale": 1.0, + "global_step": 2091, + "grad_norm": 1.6934528056624099, + "learning_rate": 5.11776553650132e-07, + "loss": 0.3995, + "step": 2091 + }, + { + "ETA": 1.94, + "epoch": 0.6727769737899983, + "fp16_scale": 1.0, + "global_step": 2092, + "grad_norm": 2.0314987863788754, + "learning_rate": 5.108674568184821e-07, + "loss": 0.4113, + "step": 2092 + }, + { + "ETA": 1.94, + "epoch": 0.6730985689017527, + "fp16_scale": 1.0, + "global_step": 2093, + "grad_norm": 1.9359311018879675, + "learning_rate": 5.099588910566637e-07, + "loss": 0.4002, + "step": 2093 + }, + { + "ETA": 1.94, + "epoch": 0.673420164013507, + "fp16_scale": 1.0, + "global_step": 2094, + "grad_norm": 1.8677083740014844, + "learning_rate": 5.090508573511423e-07, + "loss": 0.4368, + "step": 2094 + }, + { + "ETA": 1.94, + "epoch": 0.6737417591252612, + "fp16_scale": 1.0, + "global_step": 2095, + "grad_norm": 1.9545734304206386, + "learning_rate": 5.081433566878038e-07, + "loss": 0.4803, + "step": 2095 + }, + { + "ETA": 1.94, + "epoch": 0.6740633542370156, + "fp16_scale": 1.0, + "global_step": 2096, + "grad_norm": 2.0827276601919973, + "learning_rate": 5.072363900519566e-07, + "loss": 0.4697, + "step": 2096 + }, + { + "ETA": 1.93, + "epoch": 0.6743849493487699, + "fp16_scale": 1.0, + "global_step": 2097, + "grad_norm": 1.8459348603587467, + "learning_rate": 5.063299584283294e-07, + "loss": 0.4974, + "step": 2097 + }, + { + "ETA": 1.93, + "epoch": 0.6747065444605242, + "fp16_scale": 1.0, + "global_step": 2098, + "grad_norm": 1.9178692432374298, + "learning_rate": 5.054240628010686e-07, + "loss": 0.4044, + "step": 2098 + }, + { + "ETA": 1.93, + "epoch": 0.6750281395722785, + "fp16_scale": 1.0, + "global_step": 2099, + "grad_norm": 1.9316722920720346, + "learning_rate": 5.045187041537404e-07, + "loss": 0.4077, + "step": 2099 + }, + { + "ETA": 1.93, + "epoch": 0.6753497346840328, + "fp16_scale": 1.0, + "global_step": 2100, + "grad_norm": 2.189453020861492, + "learning_rate": 5.036138834693267e-07, + "loss": 0.3788, + "step": 2100 + }, + { + "ETA": 1.93, + "epoch": 0.6756713297957871, + "fp16_scale": 1.0, + "global_step": 2101, + "grad_norm": 1.8196764666790626, + "learning_rate": 5.02709601730226e-07, + "loss": 0.4728, + "step": 2101 + }, + { + "ETA": 1.92, + "epoch": 0.6759929249075414, + "fp16_scale": 1.0, + "global_step": 2102, + "grad_norm": 1.9021321042207269, + "learning_rate": 5.018058599182507e-07, + "loss": 0.4484, + "step": 2102 + }, + { + "ETA": 1.92, + "epoch": 0.6763145200192957, + "fp16_scale": 1.0, + "global_step": 2103, + "grad_norm": 1.9953437493352695, + "learning_rate": 5.009026590146293e-07, + "loss": 0.3999, + "step": 2103 + }, + { + "ETA": 1.92, + "epoch": 0.67663611513105, + "fp16_scale": 1.0, + "global_step": 2104, + "grad_norm": 2.096146733100405, + "learning_rate": 5.000000000000002e-07, + "loss": 0.4082, + "step": 2104 + }, + { + "ETA": 1.92, + "epoch": 0.6769577102428043, + "fp16_scale": 1.0, + "global_step": 2105, + "grad_norm": 2.115570096537946, + "learning_rate": 4.990978838544147e-07, + "loss": 0.3856, + "step": 2105 + }, + { + "ETA": 1.92, + "epoch": 0.6772793053545586, + "fp16_scale": 1.0, + "global_step": 2106, + "grad_norm": 1.8089565442410478, + "learning_rate": 4.981963115573352e-07, + "loss": 0.4085, + "step": 2106 + }, + { + "ETA": 1.92, + "epoch": 0.6776009004663129, + "fp16_scale": 1.0, + "global_step": 2107, + "grad_norm": 1.98835696095386, + "learning_rate": 4.972952840876325e-07, + "loss": 0.4526, + "step": 2107 + }, + { + "ETA": 1.91, + "epoch": 0.6779224955780672, + "fp16_scale": 1.0, + "global_step": 2108, + "grad_norm": 1.7542113635635128, + "learning_rate": 4.963948024235866e-07, + "loss": 0.441, + "step": 2108 + }, + { + "ETA": 1.91, + "epoch": 0.6782440906898215, + "fp16_scale": 1.0, + "global_step": 2109, + "grad_norm": 2.114280110588985, + "learning_rate": 4.954948675428853e-07, + "loss": 0.4141, + "step": 2109 + }, + { + "ETA": 1.91, + "epoch": 0.6785656858015758, + "fp16_scale": 1.0, + "global_step": 2110, + "grad_norm": 2.1118367919477357, + "learning_rate": 4.945954804226214e-07, + "loss": 0.4093, + "step": 2110 + }, + { + "ETA": 1.91, + "epoch": 0.6788872809133302, + "fp16_scale": 1.0, + "global_step": 2111, + "grad_norm": 1.9635385154854137, + "learning_rate": 4.936966420392944e-07, + "loss": 0.4003, + "step": 2111 + }, + { + "ETA": 1.91, + "epoch": 0.6792088760250844, + "fp16_scale": 1.0, + "global_step": 2112, + "grad_norm": 1.977750108398266, + "learning_rate": 4.927983533688067e-07, + "loss": 0.4322, + "step": 2112 + }, + { + "ETA": 1.9, + "epoch": 0.6795304711368387, + "fp16_scale": 1.0, + "global_step": 2113, + "grad_norm": 1.8893135953840186, + "learning_rate": 4.919006153864648e-07, + "loss": 0.4925, + "step": 2113 + }, + { + "ETA": 1.9, + "epoch": 0.6798520662485931, + "fp16_scale": 1.0, + "global_step": 2114, + "grad_norm": 2.14033345862397, + "learning_rate": 4.910034290669776e-07, + "loss": 0.3787, + "step": 2114 + }, + { + "ETA": 1.9, + "epoch": 0.6801736613603473, + "fp16_scale": 1.0, + "global_step": 2115, + "grad_norm": 1.7925381188694647, + "learning_rate": 4.901067953844537e-07, + "loss": 0.4401, + "step": 2115 + }, + { + "ETA": 1.9, + "epoch": 0.6804952564721016, + "fp16_scale": 1.0, + "global_step": 2116, + "grad_norm": 2.0063586648085887, + "learning_rate": 4.892107153124029e-07, + "loss": 0.453, + "step": 2116 + }, + { + "ETA": 1.9, + "epoch": 0.680816851583856, + "fp16_scale": 1.0, + "global_step": 2117, + "grad_norm": 2.206872828524902, + "learning_rate": 4.883151898237329e-07, + "loss": 0.3632, + "step": 2117 + }, + { + "ETA": 1.89, + "epoch": 0.6811384466956102, + "fp16_scale": 1.0, + "global_step": 2118, + "grad_norm": 2.048596899499758, + "learning_rate": 4.874202198907502e-07, + "loss": 0.4086, + "step": 2118 + }, + { + "ETA": 1.89, + "epoch": 0.6814600418073645, + "fp16_scale": 1.0, + "global_step": 2119, + "grad_norm": 1.725224121911189, + "learning_rate": 4.865258064851578e-07, + "loss": 0.4006, + "step": 2119 + }, + { + "ETA": 1.89, + "epoch": 0.6817816369191189, + "fp16_scale": 1.0, + "global_step": 2120, + "grad_norm": 2.1542462909235756, + "learning_rate": 4.856319505780547e-07, + "loss": 0.4415, + "step": 2120 + }, + { + "ETA": 1.89, + "epoch": 0.6821032320308731, + "fp16_scale": 1.0, + "global_step": 2121, + "grad_norm": 2.080492885117813, + "learning_rate": 4.847386531399339e-07, + "loss": 0.408, + "step": 2121 + }, + { + "ETA": 1.89, + "epoch": 0.6824248271426274, + "fp16_scale": 1.0, + "global_step": 2122, + "grad_norm": 2.2094173637517773, + "learning_rate": 4.838459151406822e-07, + "loss": 0.4346, + "step": 2122 + }, + { + "ETA": 1.88, + "epoch": 0.6827464222543818, + "fp16_scale": 1.0, + "global_step": 2123, + "grad_norm": 1.9387074487488687, + "learning_rate": 4.829537375495798e-07, + "loss": 0.435, + "step": 2123 + }, + { + "ETA": 1.88, + "epoch": 0.683068017366136, + "fp16_scale": 1.0, + "global_step": 2124, + "grad_norm": 2.0396843533535582, + "learning_rate": 4.82062121335297e-07, + "loss": 0.3722, + "step": 2124 + }, + { + "ETA": 1.88, + "epoch": 0.6833896124778903, + "fp16_scale": 1.0, + "global_step": 2125, + "grad_norm": 2.233077561115142, + "learning_rate": 4.811710674658968e-07, + "loss": 0.3987, + "step": 2125 + }, + { + "ETA": 1.88, + "epoch": 0.6837112075896447, + "fp16_scale": 1.0, + "global_step": 2126, + "grad_norm": 2.0868138718712257, + "learning_rate": 4.802805769088298e-07, + "loss": 0.4707, + "step": 2126 + }, + { + "ETA": 1.88, + "epoch": 0.684032802701399, + "fp16_scale": 1.0, + "global_step": 2127, + "grad_norm": 1.8915169582337128, + "learning_rate": 4.793906506309347e-07, + "loss": 0.4742, + "step": 2127 + }, + { + "ETA": 1.87, + "epoch": 0.6843543978131532, + "fp16_scale": 1.0, + "global_step": 2128, + "grad_norm": 2.06189918024411, + "learning_rate": 4.785012895984397e-07, + "loss": 0.3803, + "step": 2128 + }, + { + "ETA": 1.87, + "epoch": 0.6846759929249076, + "fp16_scale": 1.0, + "global_step": 2129, + "grad_norm": 1.8873364369445147, + "learning_rate": 4.776124947769566e-07, + "loss": 0.397, + "step": 2129 + }, + { + "ETA": 1.87, + "epoch": 0.6849975880366619, + "fp16_scale": 1.0, + "global_step": 2130, + "grad_norm": 2.0283057529756467, + "learning_rate": 4.767242671314846e-07, + "loss": 0.4939, + "step": 2130 + }, + { + "ETA": 1.87, + "epoch": 0.6853191831484161, + "fp16_scale": 1.0, + "global_step": 2131, + "grad_norm": 2.104418369293907, + "learning_rate": 4.758366076264061e-07, + "loss": 0.2901, + "step": 2131 + }, + { + "ETA": 1.87, + "epoch": 0.6856407782601704, + "fp16_scale": 1.0, + "global_step": 2132, + "grad_norm": 1.976994436965051, + "learning_rate": 4.7494951722548726e-07, + "loss": 0.4407, + "step": 2132 + }, + { + "ETA": 1.86, + "epoch": 0.6859623733719248, + "fp16_scale": 1.0, + "global_step": 2133, + "grad_norm": 2.227745365332368, + "learning_rate": 4.7406299689187557e-07, + "loss": 0.5125, + "step": 2133 + }, + { + "ETA": 1.86, + "epoch": 0.686283968483679, + "fp16_scale": 1.0, + "global_step": 2134, + "grad_norm": 1.8143522659584321, + "learning_rate": 4.731770475880994e-07, + "loss": 0.4689, + "step": 2134 + }, + { + "ETA": 1.86, + "epoch": 0.6866055635954333, + "fp16_scale": 1.0, + "global_step": 2135, + "grad_norm": 1.7957129290625795, + "learning_rate": 4.722916702760682e-07, + "loss": 0.381, + "step": 2135 + }, + { + "ETA": 1.86, + "epoch": 0.6869271587071877, + "fp16_scale": 1.0, + "global_step": 2136, + "grad_norm": 2.008111581914058, + "learning_rate": 4.714068659170698e-07, + "loss": 0.5392, + "step": 2136 + }, + { + "ETA": 1.86, + "epoch": 0.6872487538189419, + "fp16_scale": 1.0, + "global_step": 2137, + "grad_norm": 1.9949153156117356, + "learning_rate": 4.705226354717703e-07, + "loss": 0.3615, + "step": 2137 + }, + { + "ETA": 1.86, + "epoch": 0.6875703489306962, + "fp16_scale": 1.0, + "global_step": 2138, + "grad_norm": 1.8513909420105987, + "learning_rate": 4.6963897990021197e-07, + "loss": 0.3473, + "step": 2138 + }, + { + "ETA": 1.85, + "epoch": 0.6878919440424506, + "fp16_scale": 1.0, + "global_step": 2139, + "grad_norm": 2.0767110857719704, + "learning_rate": 4.687559001618131e-07, + "loss": 0.4204, + "step": 2139 + }, + { + "ETA": 1.85, + "epoch": 0.6882135391542048, + "fp16_scale": 1.0, + "global_step": 2140, + "grad_norm": 2.1147501902316455, + "learning_rate": 4.6787339721536724e-07, + "loss": 0.4133, + "step": 2140 + }, + { + "ETA": 1.85, + "epoch": 0.6885351342659591, + "fp16_scale": 1.0, + "global_step": 2141, + "grad_norm": 1.9236443785166624, + "learning_rate": 4.6699147201904143e-07, + "loss": 0.4067, + "step": 2141 + }, + { + "ETA": 1.85, + "epoch": 0.6888567293777135, + "fp16_scale": 1.0, + "global_step": 2142, + "grad_norm": 2.0996085018750157, + "learning_rate": 4.66110125530376e-07, + "loss": 0.4468, + "step": 2142 + }, + { + "ETA": 1.85, + "epoch": 0.6891783244894678, + "fp16_scale": 1.0, + "global_step": 2143, + "grad_norm": 1.828121616899122, + "learning_rate": 4.652293587062819e-07, + "loss": 0.4072, + "step": 2143 + }, + { + "ETA": 1.84, + "epoch": 0.689499919601222, + "fp16_scale": 1.0, + "global_step": 2144, + "grad_norm": 1.8826247487337693, + "learning_rate": 4.6434917250304076e-07, + "loss": 0.4363, + "step": 2144 + }, + { + "ETA": 1.84, + "epoch": 0.6898215147129764, + "fp16_scale": 1.0, + "global_step": 2145, + "grad_norm": 2.1052871533519397, + "learning_rate": 4.634695678763052e-07, + "loss": 0.4686, + "step": 2145 + }, + { + "ETA": 1.84, + "epoch": 0.6901431098247307, + "fp16_scale": 1.0, + "global_step": 2146, + "grad_norm": 1.978894968730985, + "learning_rate": 4.625905457810942e-07, + "loss": 0.4116, + "step": 2146 + }, + { + "ETA": 1.84, + "epoch": 0.6904647049364849, + "fp16_scale": 1.0, + "global_step": 2147, + "grad_norm": 1.8457556993416007, + "learning_rate": 4.617121071717971e-07, + "loss": 0.3796, + "step": 2147 + }, + { + "ETA": 1.84, + "epoch": 0.6907863000482393, + "fp16_scale": 1.0, + "global_step": 2148, + "grad_norm": 1.775233947226462, + "learning_rate": 4.608342530021669e-07, + "loss": 0.428, + "step": 2148 + }, + { + "ETA": 1.83, + "epoch": 0.6911078951599936, + "fp16_scale": 1.0, + "global_step": 2149, + "grad_norm": 2.4059448105074632, + "learning_rate": 4.599569842253244e-07, + "loss": 0.3959, + "step": 2149 + }, + { + "ETA": 1.83, + "epoch": 0.6914294902717478, + "fp16_scale": 1.0, + "global_step": 2150, + "grad_norm": 1.9336965225715175, + "learning_rate": 4.590803017937529e-07, + "loss": 0.4088, + "step": 2150 + }, + { + "ETA": 1.83, + "epoch": 0.6917510853835022, + "fp16_scale": 1.0, + "global_step": 2151, + "grad_norm": 1.9766746933849264, + "learning_rate": 4.582042066592998e-07, + "loss": 0.4478, + "step": 2151 + }, + { + "ETA": 1.83, + "epoch": 0.6920726804952565, + "fp16_scale": 1.0, + "global_step": 2152, + "grad_norm": 1.7331008700279942, + "learning_rate": 4.5732869977317535e-07, + "loss": 0.4382, + "step": 2152 + }, + { + "ETA": 1.83, + "epoch": 0.6923942756070107, + "fp16_scale": 1.0, + "global_step": 2153, + "grad_norm": 2.111208298261346, + "learning_rate": 4.5645378208595055e-07, + "loss": 0.4339, + "step": 2153 + }, + { + "ETA": 1.82, + "epoch": 0.6927158707187651, + "fp16_scale": 1.0, + "global_step": 2154, + "grad_norm": 1.9885480809010154, + "learning_rate": 4.555794545475573e-07, + "loss": 0.446, + "step": 2154 + }, + { + "ETA": 1.82, + "epoch": 0.6930374658305194, + "fp16_scale": 1.0, + "global_step": 2155, + "grad_norm": 2.0731087672033146, + "learning_rate": 4.547057181072861e-07, + "loss": 0.4682, + "step": 2155 + }, + { + "ETA": 1.82, + "epoch": 0.6933590609422737, + "fp16_scale": 1.0, + "global_step": 2156, + "grad_norm": 1.967125013570481, + "learning_rate": 4.5383257371378524e-07, + "loss": 0.4705, + "step": 2156 + }, + { + "ETA": 1.82, + "epoch": 0.693680656054028, + "fp16_scale": 1.0, + "global_step": 2157, + "grad_norm": 2.052917753719661, + "learning_rate": 4.5296002231506145e-07, + "loss": 0.4523, + "step": 2157 + }, + { + "ETA": 1.82, + "epoch": 0.6940022511657823, + "fp16_scale": 1.0, + "global_step": 2158, + "grad_norm": 2.205481787919023, + "learning_rate": 4.5208806485847693e-07, + "loss": 0.3519, + "step": 2158 + }, + { + "ETA": 1.81, + "epoch": 0.6943238462775366, + "fp16_scale": 1.0, + "global_step": 2159, + "grad_norm": 1.9347264421264965, + "learning_rate": 4.512167022907494e-07, + "loss": 0.5172, + "step": 2159 + }, + { + "ETA": 1.81, + "epoch": 0.6946454413892909, + "fp16_scale": 1.0, + "global_step": 2160, + "grad_norm": 1.9246825347822658, + "learning_rate": 4.503459355579501e-07, + "loss": 0.4234, + "step": 2160 + }, + { + "ETA": 1.81, + "epoch": 0.6949670365010452, + "fp16_scale": 1.0, + "global_step": 2161, + "grad_norm": 1.8277123255326062, + "learning_rate": 4.4947576560550326e-07, + "loss": 0.4084, + "step": 2161 + }, + { + "ETA": 1.81, + "epoch": 0.6952886316127995, + "fp16_scale": 1.0, + "global_step": 2162, + "grad_norm": 2.0570802166174214, + "learning_rate": 4.4860619337818586e-07, + "loss": 0.485, + "step": 2162 + }, + { + "ETA": 1.81, + "epoch": 0.6956102267245537, + "fp16_scale": 1.0, + "global_step": 2163, + "grad_norm": 1.9156853110868317, + "learning_rate": 4.477372198201256e-07, + "loss": 0.4717, + "step": 2163 + }, + { + "ETA": 1.81, + "epoch": 0.6959318218363081, + "fp16_scale": 1.0, + "global_step": 2164, + "grad_norm": 1.9396859746301647, + "learning_rate": 4.4686884587480056e-07, + "loss": 0.4048, + "step": 2164 + }, + { + "ETA": 1.8, + "epoch": 0.6962534169480624, + "fp16_scale": 1.0, + "global_step": 2165, + "grad_norm": 2.0950628522505985, + "learning_rate": 4.460010724850367e-07, + "loss": 0.3888, + "step": 2165 + }, + { + "ETA": 1.8, + "epoch": 0.6965750120598166, + "fp16_scale": 1.0, + "global_step": 2166, + "grad_norm": 2.257864856135109, + "learning_rate": 4.451339005930094e-07, + "loss": 0.3402, + "step": 2166 + }, + { + "ETA": 1.8, + "epoch": 0.696896607171571, + "fp16_scale": 1.0, + "global_step": 2167, + "grad_norm": 1.9499895335908553, + "learning_rate": 4.4426733114023975e-07, + "loss": 0.4476, + "step": 2167 + }, + { + "ETA": 1.8, + "epoch": 0.6972182022833253, + "fp16_scale": 1.0, + "global_step": 2168, + "grad_norm": 1.8549561588819414, + "learning_rate": 4.4340136506759486e-07, + "loss": 0.3655, + "step": 2168 + }, + { + "ETA": 1.79, + "epoch": 0.6975397973950795, + "fp16_scale": 1.0, + "global_step": 2169, + "grad_norm": 2.2544356108635646, + "learning_rate": 4.425360033152875e-07, + "loss": 0.3345, + "step": 2169 + }, + { + "ETA": 1.79, + "epoch": 0.6978613925068339, + "fp16_scale": 1.0, + "global_step": 2170, + "grad_norm": 1.9747679165142917, + "learning_rate": 4.416712468228738e-07, + "loss": 0.4603, + "step": 2170 + }, + { + "ETA": 1.79, + "epoch": 0.6981829876185882, + "fp16_scale": 1.0, + "global_step": 2171, + "grad_norm": 2.042816511574314, + "learning_rate": 4.408070965292533e-07, + "loss": 0.4746, + "step": 2171 + }, + { + "ETA": 1.79, + "epoch": 0.6985045827303425, + "fp16_scale": 1.0, + "global_step": 2172, + "grad_norm": 1.8521673086720718, + "learning_rate": 4.399435533726664e-07, + "loss": 0.3694, + "step": 2172 + }, + { + "ETA": 1.79, + "epoch": 0.6988261778420968, + "fp16_scale": 1.0, + "global_step": 2173, + "grad_norm": 2.0284223100857472, + "learning_rate": 4.3908061829069456e-07, + "loss": 0.4253, + "step": 2173 + }, + { + "ETA": 1.79, + "epoch": 0.6991477729538511, + "fp16_scale": 1.0, + "global_step": 2174, + "grad_norm": 2.1478253391524733, + "learning_rate": 4.382182922202595e-07, + "loss": 0.3783, + "step": 2174 + }, + { + "ETA": 1.78, + "epoch": 0.6994693680656054, + "fp16_scale": 1.0, + "global_step": 2175, + "grad_norm": 1.964934515334503, + "learning_rate": 4.3735657609762157e-07, + "loss": 0.4042, + "step": 2175 + }, + { + "ETA": 1.78, + "epoch": 0.6997909631773597, + "fp16_scale": 1.0, + "global_step": 2176, + "grad_norm": 1.9394408594447183, + "learning_rate": 4.364954708583791e-07, + "loss": 0.3743, + "step": 2176 + }, + { + "ETA": 1.78, + "epoch": 0.700112558289114, + "fp16_scale": 1.0, + "global_step": 2177, + "grad_norm": 2.2539404168304786, + "learning_rate": 4.3563497743746615e-07, + "loss": 0.43, + "step": 2177 + }, + { + "ETA": 1.78, + "epoch": 0.7004341534008683, + "fp16_scale": 1.0, + "global_step": 2178, + "grad_norm": 2.390806632951474, + "learning_rate": 4.347750967691539e-07, + "loss": 0.3694, + "step": 2178 + }, + { + "ETA": 1.77, + "epoch": 0.7007557485126226, + "fp16_scale": 1.0, + "global_step": 2179, + "grad_norm": 1.987087416778606, + "learning_rate": 4.339158297870469e-07, + "loss": 0.354, + "step": 2179 + }, + { + "ETA": 1.77, + "epoch": 0.7010773436243769, + "fp16_scale": 1.0, + "global_step": 2180, + "grad_norm": 2.0974630478121923, + "learning_rate": 4.330571774240842e-07, + "loss": 0.3789, + "step": 2180 + }, + { + "ETA": 1.77, + "epoch": 0.7013989387361312, + "fp16_scale": 1.0, + "global_step": 2181, + "grad_norm": 2.000184785878777, + "learning_rate": 4.3219914061253793e-07, + "loss": 0.3766, + "step": 2181 + }, + { + "ETA": 1.77, + "epoch": 0.7017205338478855, + "fp16_scale": 1.0, + "global_step": 2182, + "grad_norm": 2.1133007929286753, + "learning_rate": 4.313417202840106e-07, + "loss": 0.4462, + "step": 2182 + }, + { + "ETA": 1.77, + "epoch": 0.7020421289596398, + "fp16_scale": 1.0, + "global_step": 2183, + "grad_norm": 1.974731726211235, + "learning_rate": 4.3048491736943683e-07, + "loss": 0.4428, + "step": 2183 + }, + { + "ETA": 1.77, + "epoch": 0.7023637240713941, + "fp16_scale": 1.0, + "global_step": 2184, + "grad_norm": 1.9994266341684352, + "learning_rate": 4.2962873279907963e-07, + "loss": 0.4203, + "step": 2184 + }, + { + "ETA": 1.76, + "epoch": 0.7026853191831485, + "fp16_scale": 1.0, + "global_step": 2185, + "grad_norm": 2.1987989561751657, + "learning_rate": 4.2877316750253077e-07, + "loss": 0.3732, + "step": 2185 + }, + { + "ETA": 1.76, + "epoch": 0.7030069142949027, + "fp16_scale": 1.0, + "global_step": 2186, + "grad_norm": 1.9440227895993096, + "learning_rate": 4.2791822240871134e-07, + "loss": 0.3947, + "step": 2186 + }, + { + "ETA": 1.76, + "epoch": 0.703328509406657, + "fp16_scale": 1.0, + "global_step": 2187, + "grad_norm": 2.0467861264126075, + "learning_rate": 4.270638984458668e-07, + "loss": 0.3372, + "step": 2187 + }, + { + "ETA": 1.76, + "epoch": 0.7036501045184114, + "fp16_scale": 1.0, + "global_step": 2188, + "grad_norm": 1.771590413871306, + "learning_rate": 4.2621019654156976e-07, + "loss": 0.398, + "step": 2188 + }, + { + "ETA": 1.76, + "epoch": 0.7039716996301656, + "fp16_scale": 1.0, + "global_step": 2189, + "grad_norm": 2.05412758035908, + "learning_rate": 4.253571176227168e-07, + "loss": 0.3881, + "step": 2189 + }, + { + "ETA": 1.75, + "epoch": 0.7042932947419199, + "fp16_scale": 1.0, + "global_step": 2190, + "grad_norm": 1.9042821216063328, + "learning_rate": 4.245046626155275e-07, + "loss": 0.4662, + "step": 2190 + }, + { + "ETA": 1.75, + "epoch": 0.7046148898536743, + "fp16_scale": 1.0, + "global_step": 2191, + "grad_norm": 1.89030800830198, + "learning_rate": 4.236528324455454e-07, + "loss": 0.356, + "step": 2191 + }, + { + "ETA": 1.75, + "epoch": 0.7049364849654285, + "fp16_scale": 1.0, + "global_step": 2192, + "grad_norm": 1.9532946530918986, + "learning_rate": 4.2280162803763487e-07, + "loss": 0.4071, + "step": 2192 + }, + { + "ETA": 1.75, + "epoch": 0.7052580800771828, + "fp16_scale": 1.0, + "global_step": 2193, + "grad_norm": 2.2138783848840022, + "learning_rate": 4.2195105031598123e-07, + "loss": 0.4353, + "step": 2193 + }, + { + "ETA": 1.75, + "epoch": 0.7055796751889372, + "fp16_scale": 1.0, + "global_step": 2194, + "grad_norm": 1.7951242698976733, + "learning_rate": 4.211011002040885e-07, + "loss": 0.3116, + "step": 2194 + }, + { + "ETA": 1.74, + "epoch": 0.7059012703006914, + "fp16_scale": 1.0, + "global_step": 2195, + "grad_norm": 2.0072448973292905, + "learning_rate": 4.2025177862478057e-07, + "loss": 0.4307, + "step": 2195 + }, + { + "ETA": 1.74, + "epoch": 0.7062228654124457, + "fp16_scale": 1.0, + "global_step": 2196, + "grad_norm": 1.8684595639663752, + "learning_rate": 4.194030865001974e-07, + "loss": 0.3894, + "step": 2196 + }, + { + "ETA": 1.74, + "epoch": 0.7065444605242001, + "fp16_scale": 1.0, + "global_step": 2197, + "grad_norm": 1.829405941945004, + "learning_rate": 4.185550247517969e-07, + "loss": 0.3636, + "step": 2197 + }, + { + "ETA": 1.74, + "epoch": 0.7068660556359543, + "fp16_scale": 1.0, + "global_step": 2198, + "grad_norm": 2.1678571316880926, + "learning_rate": 4.1770759430035217e-07, + "loss": 0.345, + "step": 2198 + }, + { + "ETA": 1.74, + "epoch": 0.7071876507477086, + "fp16_scale": 1.0, + "global_step": 2199, + "grad_norm": 1.9344641864621686, + "learning_rate": 4.1686079606595027e-07, + "loss": 0.4396, + "step": 2199 + }, + { + "ETA": 1.73, + "epoch": 0.707509245859463, + "fp16_scale": 1.0, + "global_step": 2200, + "grad_norm": 1.950038444686929, + "learning_rate": 4.1601463096799274e-07, + "loss": 0.4394, + "step": 2200 + }, + { + "ETA": 1.74, + "epoch": 0.7078308409712173, + "fp16_scale": 1.0, + "global_step": 2201, + "grad_norm": 2.194075025199006, + "learning_rate": 4.151690999251928e-07, + "loss": 0.47, + "step": 2201 + }, + { + "ETA": 1.73, + "epoch": 0.7081524360829715, + "fp16_scale": 1.0, + "global_step": 2202, + "grad_norm": 2.0737549787241236, + "learning_rate": 4.1432420385557577e-07, + "loss": 0.3843, + "step": 2202 + }, + { + "ETA": 1.73, + "epoch": 0.7084740311947259, + "fp16_scale": 1.0, + "global_step": 2203, + "grad_norm": 1.922407145802047, + "learning_rate": 4.1347994367647797e-07, + "loss": 0.4362, + "step": 2203 + }, + { + "ETA": 1.73, + "epoch": 0.7087956263064802, + "fp16_scale": 1.0, + "global_step": 2204, + "grad_norm": 2.179140931747294, + "learning_rate": 4.126363203045443e-07, + "loss": 0.4519, + "step": 2204 + }, + { + "ETA": 1.73, + "epoch": 0.7091172214182344, + "fp16_scale": 1.0, + "global_step": 2205, + "grad_norm": 2.195258399591118, + "learning_rate": 4.117933346557293e-07, + "loss": 0.3956, + "step": 2205 + }, + { + "ETA": 1.73, + "epoch": 0.7094388165299887, + "fp16_scale": 1.0, + "global_step": 2206, + "grad_norm": 1.9770106328929318, + "learning_rate": 4.109509876452939e-07, + "loss": 0.4615, + "step": 2206 + }, + { + "ETA": 1.72, + "epoch": 0.7097604116417431, + "fp16_scale": 1.0, + "global_step": 2207, + "grad_norm": 1.9116333616905046, + "learning_rate": 4.101092801878068e-07, + "loss": 0.3323, + "step": 2207 + }, + { + "ETA": 1.72, + "epoch": 0.7100820067534973, + "fp16_scale": 1.0, + "global_step": 2208, + "grad_norm": 2.0070177339096715, + "learning_rate": 4.092682131971421e-07, + "loss": 0.5249, + "step": 2208 + }, + { + "ETA": 1.72, + "epoch": 0.7104036018652516, + "fp16_scale": 1.0, + "global_step": 2209, + "grad_norm": 1.9920602145139588, + "learning_rate": 4.0842778758647754e-07, + "loss": 0.4085, + "step": 2209 + }, + { + "ETA": 1.72, + "epoch": 0.710725196977006, + "fp16_scale": 1.0, + "global_step": 2210, + "grad_norm": 2.086253029698317, + "learning_rate": 4.0758800426829596e-07, + "loss": 0.3245, + "step": 2210 + }, + { + "ETA": 1.72, + "epoch": 0.7110467920887602, + "fp16_scale": 1.0, + "global_step": 2211, + "grad_norm": 1.8523812149571812, + "learning_rate": 4.0674886415438146e-07, + "loss": 0.3613, + "step": 2211 + }, + { + "ETA": 1.71, + "epoch": 0.7113683872005145, + "fp16_scale": 1.0, + "global_step": 2212, + "grad_norm": 1.878433095918315, + "learning_rate": 4.05910368155821e-07, + "loss": 0.3676, + "step": 2212 + }, + { + "ETA": 1.71, + "epoch": 0.7116899823122689, + "fp16_scale": 1.0, + "global_step": 2213, + "grad_norm": 2.07030816042548, + "learning_rate": 4.050725171830011e-07, + "loss": 0.4186, + "step": 2213 + }, + { + "ETA": 1.71, + "epoch": 0.7120115774240231, + "fp16_scale": 1.0, + "global_step": 2214, + "grad_norm": 1.991886177909802, + "learning_rate": 4.042353121456086e-07, + "loss": 0.4691, + "step": 2214 + }, + { + "ETA": 1.71, + "epoch": 0.7123331725357774, + "fp16_scale": 1.0, + "global_step": 2215, + "grad_norm": 2.2279734922669476, + "learning_rate": 4.0339875395262937e-07, + "loss": 0.4557, + "step": 2215 + }, + { + "ETA": 1.71, + "epoch": 0.7126547676475318, + "fp16_scale": 1.0, + "global_step": 2216, + "grad_norm": 2.119008274564329, + "learning_rate": 4.025628435123457e-07, + "loss": 0.3971, + "step": 2216 + }, + { + "ETA": 1.7, + "epoch": 0.7129763627592861, + "fp16_scale": 1.0, + "global_step": 2217, + "grad_norm": 1.7664745654548895, + "learning_rate": 4.017275817323382e-07, + "loss": 0.5009, + "step": 2217 + }, + { + "ETA": 1.7, + "epoch": 0.7132979578710403, + "fp16_scale": 1.0, + "global_step": 2218, + "grad_norm": 1.8939007121730727, + "learning_rate": 4.008929695194818e-07, + "loss": 0.4417, + "step": 2218 + }, + { + "ETA": 1.7, + "epoch": 0.7136195529827947, + "fp16_scale": 1.0, + "global_step": 2219, + "grad_norm": 2.017407527687127, + "learning_rate": 4.000590077799468e-07, + "loss": 0.3961, + "step": 2219 + }, + { + "ETA": 1.7, + "epoch": 0.713941148094549, + "fp16_scale": 1.0, + "global_step": 2220, + "grad_norm": 2.1317372458820447, + "learning_rate": 3.99225697419198e-07, + "loss": 0.5324, + "step": 2220 + }, + { + "ETA": 1.7, + "epoch": 0.7142627432063032, + "fp16_scale": 1.0, + "global_step": 2221, + "grad_norm": 1.981381768894328, + "learning_rate": 3.983930393419911e-07, + "loss": 0.3655, + "step": 2221 + }, + { + "ETA": 1.69, + "epoch": 0.7145843383180576, + "fp16_scale": 1.0, + "global_step": 2222, + "grad_norm": 2.3060469712534837, + "learning_rate": 3.9756103445237564e-07, + "loss": 0.3862, + "step": 2222 + }, + { + "ETA": 1.69, + "epoch": 0.7149059334298119, + "fp16_scale": 1.0, + "global_step": 2223, + "grad_norm": 2.214203361396411, + "learning_rate": 3.967296836536902e-07, + "loss": 0.4544, + "step": 2223 + }, + { + "ETA": 1.69, + "epoch": 0.7152275285415661, + "fp16_scale": 1.0, + "global_step": 2224, + "grad_norm": 2.4956829318349123, + "learning_rate": 3.9589898784856435e-07, + "loss": 0.4377, + "step": 2224 + }, + { + "ETA": 1.69, + "epoch": 0.7155491236533205, + "fp16_scale": 1.0, + "global_step": 2225, + "grad_norm": 1.8896479220175337, + "learning_rate": 3.9506894793891654e-07, + "loss": 0.4722, + "step": 2225 + }, + { + "ETA": 1.69, + "epoch": 0.7158707187650748, + "fp16_scale": 1.0, + "global_step": 2226, + "grad_norm": 1.8571813908637695, + "learning_rate": 3.94239564825952e-07, + "loss": 0.3962, + "step": 2226 + }, + { + "ETA": 1.68, + "epoch": 0.716192313876829, + "fp16_scale": 1.0, + "global_step": 2227, + "grad_norm": 2.193852216256546, + "learning_rate": 3.934108394101644e-07, + "loss": 0.3723, + "step": 2227 + }, + { + "ETA": 1.68, + "epoch": 0.7165139089885834, + "fp16_scale": 1.0, + "global_step": 2228, + "grad_norm": 1.8704927468231431, + "learning_rate": 3.925827725913315e-07, + "loss": 0.4438, + "step": 2228 + }, + { + "ETA": 1.68, + "epoch": 0.7168355041003377, + "fp16_scale": 1.0, + "global_step": 2229, + "grad_norm": 2.166932787688569, + "learning_rate": 3.9175536526851773e-07, + "loss": 0.4531, + "step": 2229 + }, + { + "ETA": 1.68, + "epoch": 0.717157099212092, + "fp16_scale": 1.0, + "global_step": 2230, + "grad_norm": 1.8801807926539045, + "learning_rate": 3.9092861834007074e-07, + "loss": 0.4107, + "step": 2230 + }, + { + "ETA": 1.68, + "epoch": 0.7174786943238463, + "fp16_scale": 1.0, + "global_step": 2231, + "grad_norm": 2.276845077870314, + "learning_rate": 3.901025327036206e-07, + "loss": 0.5075, + "step": 2231 + }, + { + "ETA": 1.68, + "epoch": 0.7178002894356006, + "fp16_scale": 1.0, + "global_step": 2232, + "grad_norm": 2.160478450654775, + "learning_rate": 3.892771092560807e-07, + "loss": 0.422, + "step": 2232 + }, + { + "ETA": 1.67, + "epoch": 0.7181218845473549, + "fp16_scale": 1.0, + "global_step": 2233, + "grad_norm": 2.15339668025265, + "learning_rate": 3.8845234889364386e-07, + "loss": 0.4008, + "step": 2233 + }, + { + "ETA": 1.67, + "epoch": 0.7184434796591092, + "fp16_scale": 1.0, + "global_step": 2234, + "grad_norm": 2.0181357334308663, + "learning_rate": 3.8762825251178466e-07, + "loss": 0.4553, + "step": 2234 + }, + { + "ETA": 1.67, + "epoch": 0.7187650747708635, + "fp16_scale": 1.0, + "global_step": 2235, + "grad_norm": 2.071829424313654, + "learning_rate": 3.868048210052551e-07, + "loss": 0.4629, + "step": 2235 + }, + { + "ETA": 1.67, + "epoch": 0.7190866698826178, + "fp16_scale": 1.0, + "global_step": 2236, + "grad_norm": 2.316577421948334, + "learning_rate": 3.859820552680867e-07, + "loss": 0.4874, + "step": 2236 + }, + { + "ETA": 1.67, + "epoch": 0.719408264994372, + "fp16_scale": 1.0, + "global_step": 2237, + "grad_norm": 1.8859096372417163, + "learning_rate": 3.851599561935877e-07, + "loss": 0.4471, + "step": 2237 + }, + { + "ETA": 1.66, + "epoch": 0.7197298601061264, + "fp16_scale": 1.0, + "global_step": 2238, + "grad_norm": 2.072028466306422, + "learning_rate": 3.843385246743417e-07, + "loss": 0.3532, + "step": 2238 + }, + { + "ETA": 1.66, + "epoch": 0.7200514552178807, + "fp16_scale": 1.0, + "global_step": 2239, + "grad_norm": 2.2699672922909206, + "learning_rate": 3.8351776160220894e-07, + "loss": 0.5076, + "step": 2239 + }, + { + "ETA": 1.66, + "epoch": 0.720373050329635, + "fp16_scale": 1.0, + "global_step": 2240, + "grad_norm": 2.1749742047782172, + "learning_rate": 3.8269766786832245e-07, + "loss": 0.4127, + "step": 2240 + }, + { + "ETA": 1.66, + "epoch": 0.7206946454413893, + "fp16_scale": 1.0, + "global_step": 2241, + "grad_norm": 1.8004861275069077, + "learning_rate": 3.818782443630897e-07, + "loss": 0.439, + "step": 2241 + }, + { + "ETA": 1.66, + "epoch": 0.7210162405531436, + "fp16_scale": 1.0, + "global_step": 2242, + "grad_norm": 1.9692018267027478, + "learning_rate": 3.8105949197619e-07, + "loss": 0.4527, + "step": 2242 + }, + { + "ETA": 1.65, + "epoch": 0.7213378356648978, + "fp16_scale": 1.0, + "global_step": 2243, + "grad_norm": 1.8598905917571928, + "learning_rate": 3.802414115965736e-07, + "loss": 0.3778, + "step": 2243 + }, + { + "ETA": 1.65, + "epoch": 0.7216594307766522, + "fp16_scale": 1.0, + "global_step": 2244, + "grad_norm": 1.9180948493602807, + "learning_rate": 3.794240041124622e-07, + "loss": 0.3541, + "step": 2244 + }, + { + "ETA": 1.65, + "epoch": 0.7219810258884065, + "fp16_scale": 1.0, + "global_step": 2245, + "grad_norm": 2.2140959932998925, + "learning_rate": 3.7860727041134553e-07, + "loss": 0.4151, + "step": 2245 + }, + { + "ETA": 1.65, + "epoch": 0.7223026210001608, + "fp16_scale": 1.0, + "global_step": 2246, + "grad_norm": 1.8895681799785997, + "learning_rate": 3.7779121137998273e-07, + "loss": 0.4091, + "step": 2246 + }, + { + "ETA": 1.65, + "epoch": 0.7226242161119151, + "fp16_scale": 1.0, + "global_step": 2247, + "grad_norm": 2.154992159649028, + "learning_rate": 3.769758279044005e-07, + "loss": 0.4338, + "step": 2247 + }, + { + "ETA": 1.64, + "epoch": 0.7229458112236694, + "fp16_scale": 1.0, + "global_step": 2248, + "grad_norm": 1.9326745202783173, + "learning_rate": 3.761611208698912e-07, + "loss": 0.346, + "step": 2248 + }, + { + "ETA": 1.64, + "epoch": 0.7232674063354237, + "fp16_scale": 1.0, + "global_step": 2249, + "grad_norm": 1.976550975144646, + "learning_rate": 3.7534709116101383e-07, + "loss": 0.3921, + "step": 2249 + }, + { + "ETA": 1.64, + "epoch": 0.723589001447178, + "fp16_scale": 1.0, + "global_step": 2250, + "grad_norm": 1.8544961526694541, + "learning_rate": 3.745337396615909e-07, + "loss": 0.3978, + "step": 2250 + }, + { + "ETA": 1.64, + "epoch": 0.7239105965589323, + "fp16_scale": 1.0, + "global_step": 2251, + "grad_norm": 1.9565133967962072, + "learning_rate": 3.737210672547093e-07, + "loss": 0.4828, + "step": 2251 + }, + { + "ETA": 1.64, + "epoch": 0.7242321916706866, + "fp16_scale": 1.0, + "global_step": 2252, + "grad_norm": 1.8860209211586123, + "learning_rate": 3.729090748227186e-07, + "loss": 0.5123, + "step": 2252 + }, + { + "ETA": 1.63, + "epoch": 0.7245537867824409, + "fp16_scale": 1.0, + "global_step": 2253, + "grad_norm": 2.0462207476491434, + "learning_rate": 3.7209776324723006e-07, + "loss": 0.3653, + "step": 2253 + }, + { + "ETA": 1.63, + "epoch": 0.7248753818941952, + "fp16_scale": 1.0, + "global_step": 2254, + "grad_norm": 1.830256212384786, + "learning_rate": 3.7128713340911534e-07, + "loss": 0.4377, + "step": 2254 + }, + { + "ETA": 1.63, + "epoch": 0.7251969770059495, + "fp16_scale": 1.0, + "global_step": 2255, + "grad_norm": 1.7475762399120987, + "learning_rate": 3.704771861885058e-07, + "loss": 0.4223, + "step": 2255 + }, + { + "ETA": 1.63, + "epoch": 0.7255185721177038, + "fp16_scale": 1.0, + "global_step": 2256, + "grad_norm": 1.976400225248446, + "learning_rate": 3.6966792246479253e-07, + "loss": 0.4513, + "step": 2256 + }, + { + "ETA": 1.63, + "epoch": 0.7258401672294581, + "fp16_scale": 1.0, + "global_step": 2257, + "grad_norm": 2.166177005329153, + "learning_rate": 3.6885934311662334e-07, + "loss": 0.4618, + "step": 2257 + }, + { + "ETA": 1.62, + "epoch": 0.7261617623412124, + "fp16_scale": 1.0, + "global_step": 2258, + "grad_norm": 1.8911059330295397, + "learning_rate": 3.680514490219041e-07, + "loss": 0.4218, + "step": 2258 + }, + { + "ETA": 1.62, + "epoch": 0.7264833574529667, + "fp16_scale": 1.0, + "global_step": 2259, + "grad_norm": 2.110615272128805, + "learning_rate": 3.672442410577965e-07, + "loss": 0.4195, + "step": 2259 + }, + { + "ETA": 1.62, + "epoch": 0.726804952564721, + "fp16_scale": 1.0, + "global_step": 2260, + "grad_norm": 1.8987697768798977, + "learning_rate": 3.6643772010071617e-07, + "loss": 0.4004, + "step": 2260 + }, + { + "ETA": 1.62, + "epoch": 0.7271265476764753, + "fp16_scale": 1.0, + "global_step": 2261, + "grad_norm": 2.1312838534787386, + "learning_rate": 3.656318870263344e-07, + "loss": 0.4779, + "step": 2261 + }, + { + "ETA": 1.62, + "epoch": 0.7274481427882297, + "fp16_scale": 1.0, + "global_step": 2262, + "grad_norm": 1.8400510723318806, + "learning_rate": 3.648267427095741e-07, + "loss": 0.4164, + "step": 2262 + }, + { + "ETA": 1.62, + "epoch": 0.7277697378999839, + "fp16_scale": 1.0, + "global_step": 2263, + "grad_norm": 2.1056389829781277, + "learning_rate": 3.6402228802461164e-07, + "loss": 0.5058, + "step": 2263 + }, + { + "ETA": 1.61, + "epoch": 0.7280913330117382, + "fp16_scale": 1.0, + "global_step": 2264, + "grad_norm": 1.851103265295296, + "learning_rate": 3.6321852384487395e-07, + "loss": 0.4628, + "step": 2264 + }, + { + "ETA": 1.61, + "epoch": 0.7284129281234926, + "fp16_scale": 1.0, + "global_step": 2265, + "grad_norm": 1.9934810942961594, + "learning_rate": 3.624154510430387e-07, + "loss": 0.4373, + "step": 2265 + }, + { + "ETA": 1.61, + "epoch": 0.7287345232352468, + "fp16_scale": 1.0, + "global_step": 2266, + "grad_norm": 2.0880001743606846, + "learning_rate": 3.616130704910324e-07, + "loss": 0.4658, + "step": 2266 + }, + { + "ETA": 1.61, + "epoch": 0.7290561183470011, + "fp16_scale": 1.0, + "global_step": 2267, + "grad_norm": 2.2273833324564873, + "learning_rate": 3.608113830600299e-07, + "loss": 0.4706, + "step": 2267 + }, + { + "ETA": 1.61, + "epoch": 0.7293777134587555, + "fp16_scale": 1.0, + "global_step": 2268, + "grad_norm": 1.9394095161631035, + "learning_rate": 3.6001038962045395e-07, + "loss": 0.3588, + "step": 2268 + }, + { + "ETA": 1.6, + "epoch": 0.7296993085705097, + "fp16_scale": 1.0, + "global_step": 2269, + "grad_norm": 2.2227064661542353, + "learning_rate": 3.592100910419738e-07, + "loss": 0.3615, + "step": 2269 + }, + { + "ETA": 1.6, + "epoch": 0.730020903682264, + "fp16_scale": 1.0, + "global_step": 2270, + "grad_norm": 2.0757467341583484, + "learning_rate": 3.5841048819350427e-07, + "loss": 0.3497, + "step": 2270 + }, + { + "ETA": 1.6, + "epoch": 0.7303424987940184, + "fp16_scale": 1.0, + "global_step": 2271, + "grad_norm": 1.9640186278757845, + "learning_rate": 3.576115819432043e-07, + "loss": 0.4717, + "step": 2271 + }, + { + "ETA": 1.6, + "epoch": 0.7306640939057726, + "fp16_scale": 1.0, + "global_step": 2272, + "grad_norm": 1.9313114039202268, + "learning_rate": 3.568133731584767e-07, + "loss": 0.4545, + "step": 2272 + }, + { + "ETA": 1.6, + "epoch": 0.7309856890175269, + "fp16_scale": 1.0, + "global_step": 2273, + "grad_norm": 1.78254561493774, + "learning_rate": 3.560158627059676e-07, + "loss": 0.4455, + "step": 2273 + }, + { + "ETA": 1.59, + "epoch": 0.7313072841292813, + "fp16_scale": 1.0, + "global_step": 2274, + "grad_norm": 1.9168963550033165, + "learning_rate": 3.552190514515636e-07, + "loss": 0.3942, + "step": 2274 + }, + { + "ETA": 1.59, + "epoch": 0.7316288792410356, + "fp16_scale": 1.0, + "global_step": 2275, + "grad_norm": 1.9743466155000438, + "learning_rate": 3.5442294026039433e-07, + "loss": 0.4919, + "step": 2275 + }, + { + "ETA": 1.59, + "epoch": 0.7319504743527898, + "fp16_scale": 1.0, + "global_step": 2276, + "grad_norm": 1.9509055281718113, + "learning_rate": 3.5362752999682724e-07, + "loss": 0.4776, + "step": 2276 + }, + { + "ETA": 1.59, + "epoch": 0.7322720694645442, + "fp16_scale": 1.0, + "global_step": 2277, + "grad_norm": 1.9184878740758493, + "learning_rate": 3.528328215244695e-07, + "loss": 0.3758, + "step": 2277 + }, + { + "ETA": 1.59, + "epoch": 0.7325936645762985, + "fp16_scale": 1.0, + "global_step": 2278, + "grad_norm": 1.9837661782560243, + "learning_rate": 3.5203881570616667e-07, + "loss": 0.5061, + "step": 2278 + }, + { + "ETA": 1.58, + "epoch": 0.7329152596880527, + "fp16_scale": 1.0, + "global_step": 2279, + "grad_norm": 1.9748864180701593, + "learning_rate": 3.512455134040008e-07, + "loss": 0.447, + "step": 2279 + }, + { + "ETA": 1.58, + "epoch": 0.733236854799807, + "fp16_scale": 1.0, + "global_step": 2280, + "grad_norm": 2.1059961968364056, + "learning_rate": 3.504529154792905e-07, + "loss": 0.4355, + "step": 2280 + }, + { + "ETA": 1.58, + "epoch": 0.7335584499115614, + "fp16_scale": 1.0, + "global_step": 2281, + "grad_norm": 2.077181442498018, + "learning_rate": 3.4966102279258956e-07, + "loss": 0.4614, + "step": 2281 + }, + { + "ETA": 1.58, + "epoch": 0.7338800450233156, + "fp16_scale": 1.0, + "global_step": 2282, + "grad_norm": 1.9069189086089477, + "learning_rate": 3.488698362036865e-07, + "loss": 0.4018, + "step": 2282 + }, + { + "ETA": 1.58, + "epoch": 0.7342016401350699, + "fp16_scale": 1.0, + "global_step": 2283, + "grad_norm": 1.9092208735298126, + "learning_rate": 3.4807935657160237e-07, + "loss": 0.3999, + "step": 2283 + }, + { + "ETA": 1.58, + "epoch": 0.7345232352468243, + "fp16_scale": 1.0, + "global_step": 2284, + "grad_norm": 2.433976642012897, + "learning_rate": 3.472895847545905e-07, + "loss": 0.3816, + "step": 2284 + }, + { + "ETA": 1.57, + "epoch": 0.7348448303585785, + "fp16_scale": 1.0, + "global_step": 2285, + "grad_norm": 2.032070228982937, + "learning_rate": 3.4650052161013675e-07, + "loss": 0.4042, + "step": 2285 + }, + { + "ETA": 1.57, + "epoch": 0.7351664254703328, + "fp16_scale": 1.0, + "global_step": 2286, + "grad_norm": 1.9062000739290756, + "learning_rate": 3.4571216799495694e-07, + "loss": 0.4139, + "step": 2286 + }, + { + "ETA": 1.57, + "epoch": 0.7354880205820872, + "fp16_scale": 1.0, + "global_step": 2287, + "grad_norm": 1.9101721394666102, + "learning_rate": 3.44924524764997e-07, + "loss": 0.407, + "step": 2287 + }, + { + "ETA": 1.57, + "epoch": 0.7358096156938414, + "fp16_scale": 1.0, + "global_step": 2288, + "grad_norm": 2.027508835356294, + "learning_rate": 3.441375927754309e-07, + "loss": 0.3725, + "step": 2288 + }, + { + "ETA": 1.57, + "epoch": 0.7361312108055957, + "fp16_scale": 1.0, + "global_step": 2289, + "grad_norm": 2.0837935450157685, + "learning_rate": 3.4335137288066006e-07, + "loss": 0.4872, + "step": 2289 + }, + { + "ETA": 1.56, + "epoch": 0.7364528059173501, + "fp16_scale": 1.0, + "global_step": 2290, + "grad_norm": 1.936697090462457, + "learning_rate": 3.4256586593431404e-07, + "loss": 0.5331, + "step": 2290 + }, + { + "ETA": 1.56, + "epoch": 0.7367744010291044, + "fp16_scale": 1.0, + "global_step": 2291, + "grad_norm": 1.8008781436797965, + "learning_rate": 3.417810727892475e-07, + "loss": 0.4559, + "step": 2291 + }, + { + "ETA": 1.56, + "epoch": 0.7370959961408586, + "fp16_scale": 1.0, + "global_step": 2292, + "grad_norm": 1.8397839886723912, + "learning_rate": 3.409969942975407e-07, + "loss": 0.4461, + "step": 2292 + }, + { + "ETA": 1.56, + "epoch": 0.737417591252613, + "fp16_scale": 1.0, + "global_step": 2293, + "grad_norm": 1.942121214519844, + "learning_rate": 3.4021363131049665e-07, + "loss": 0.3783, + "step": 2293 + }, + { + "ETA": 1.56, + "epoch": 0.7377391863643673, + "fp16_scale": 1.0, + "global_step": 2294, + "grad_norm": 2.0805562914405806, + "learning_rate": 3.3943098467864315e-07, + "loss": 0.4683, + "step": 2294 + }, + { + "ETA": 1.55, + "epoch": 0.7380607814761215, + "fp16_scale": 1.0, + "global_step": 2295, + "grad_norm": 2.100925544342399, + "learning_rate": 3.3864905525172913e-07, + "loss": 0.4538, + "step": 2295 + }, + { + "ETA": 1.55, + "epoch": 0.7383823765878759, + "fp16_scale": 1.0, + "global_step": 2296, + "grad_norm": 2.0389002485397487, + "learning_rate": 3.378678438787246e-07, + "loss": 0.4396, + "step": 2296 + }, + { + "ETA": 1.55, + "epoch": 0.7387039716996302, + "fp16_scale": 1.0, + "global_step": 2297, + "grad_norm": 1.7637039918390904, + "learning_rate": 3.370873514078215e-07, + "loss": 0.3777, + "step": 2297 + }, + { + "ETA": 1.55, + "epoch": 0.7390255668113844, + "fp16_scale": 1.0, + "global_step": 2298, + "grad_norm": 1.8904305792234644, + "learning_rate": 3.3630757868642965e-07, + "loss": 0.4037, + "step": 2298 + }, + { + "ETA": 1.55, + "epoch": 0.7393471619231388, + "fp16_scale": 1.0, + "global_step": 2299, + "grad_norm": 1.9799632883906064, + "learning_rate": 3.3552852656117837e-07, + "loss": 0.4252, + "step": 2299 + }, + { + "ETA": 1.54, + "epoch": 0.7396687570348931, + "fp16_scale": 1.0, + "global_step": 2300, + "grad_norm": 2.0009972657069133, + "learning_rate": 3.34750195877914e-07, + "loss": 0.3857, + "step": 2300 + }, + { + "ETA": 1.54, + "epoch": 0.7399903521466473, + "fp16_scale": 1.0, + "global_step": 2301, + "grad_norm": 1.8936424284333242, + "learning_rate": 3.339725874816994e-07, + "loss": 0.406, + "step": 2301 + }, + { + "ETA": 1.54, + "epoch": 0.7403119472584017, + "fp16_scale": 1.0, + "global_step": 2302, + "grad_norm": 2.056499301209626, + "learning_rate": 3.3319570221681404e-07, + "loss": 0.4455, + "step": 2302 + }, + { + "ETA": 1.54, + "epoch": 0.740633542370156, + "fp16_scale": 1.0, + "global_step": 2303, + "grad_norm": 2.072575629275413, + "learning_rate": 3.3241954092675186e-07, + "loss": 0.3989, + "step": 2303 + }, + { + "ETA": 1.54, + "epoch": 0.7409551374819102, + "fp16_scale": 1.0, + "global_step": 2304, + "grad_norm": 1.8390868300147452, + "learning_rate": 3.31644104454221e-07, + "loss": 0.4202, + "step": 2304 + }, + { + "ETA": 1.53, + "epoch": 0.7412767325936646, + "fp16_scale": 1.0, + "global_step": 2305, + "grad_norm": 1.9442922194970502, + "learning_rate": 3.308693936411421e-07, + "loss": 0.3555, + "step": 2305 + }, + { + "ETA": 1.53, + "epoch": 0.7415983277054189, + "fp16_scale": 1.0, + "global_step": 2306, + "grad_norm": 1.9263437995603583, + "learning_rate": 3.3009540932864777e-07, + "loss": 0.4344, + "step": 2306 + }, + { + "ETA": 1.53, + "epoch": 0.7419199228171732, + "fp16_scale": 1.0, + "global_step": 2307, + "grad_norm": 1.7755929053575106, + "learning_rate": 3.293221523570826e-07, + "loss": 0.4771, + "step": 2307 + }, + { + "ETA": 1.53, + "epoch": 0.7422415179289275, + "fp16_scale": 1.0, + "global_step": 2308, + "grad_norm": 2.00979769118329, + "learning_rate": 3.2854962356600126e-07, + "loss": 0.4371, + "step": 2308 + }, + { + "ETA": 1.53, + "epoch": 0.7425631130406818, + "fp16_scale": 1.0, + "global_step": 2309, + "grad_norm": 2.3919546338784436, + "learning_rate": 3.2777782379416796e-07, + "loss": 0.3822, + "step": 2309 + }, + { + "ETA": 1.53, + "epoch": 0.7428847081524361, + "fp16_scale": 1.0, + "global_step": 2310, + "grad_norm": 1.9899339960693896, + "learning_rate": 3.2700675387955434e-07, + "loss": 0.4021, + "step": 2310 + }, + { + "ETA": 1.52, + "epoch": 0.7432063032641903, + "fp16_scale": 1.0, + "global_step": 2311, + "grad_norm": 1.9185900559349034, + "learning_rate": 3.2623641465934114e-07, + "loss": 0.4688, + "step": 2311 + }, + { + "ETA": 1.52, + "epoch": 0.7435278983759447, + "fp16_scale": 1.0, + "global_step": 2312, + "grad_norm": 1.8672071580120455, + "learning_rate": 3.2546680696991437e-07, + "loss": 0.4186, + "step": 2312 + }, + { + "ETA": 1.52, + "epoch": 0.743849493487699, + "fp16_scale": 1.0, + "global_step": 2313, + "grad_norm": 1.9660977148948693, + "learning_rate": 3.246979316468665e-07, + "loss": 0.4413, + "step": 2313 + }, + { + "ETA": 1.52, + "epoch": 0.7441710885994532, + "fp16_scale": 1.0, + "global_step": 2314, + "grad_norm": 1.6828170728926204, + "learning_rate": 3.239297895249955e-07, + "loss": 0.4026, + "step": 2314 + }, + { + "ETA": 1.52, + "epoch": 0.7444926837112076, + "fp16_scale": 1.0, + "global_step": 2315, + "grad_norm": 2.1493388414577725, + "learning_rate": 3.2316238143830143e-07, + "loss": 0.3468, + "step": 2315 + }, + { + "ETA": 1.51, + "epoch": 0.7448142788229619, + "fp16_scale": 1.0, + "global_step": 2316, + "grad_norm": 2.3164200650949742, + "learning_rate": 3.223957082199895e-07, + "loss": 0.4149, + "step": 2316 + }, + { + "ETA": 1.51, + "epoch": 0.7451358739347161, + "fp16_scale": 1.0, + "global_step": 2317, + "grad_norm": 1.9651991773040327, + "learning_rate": 3.2162977070246545e-07, + "loss": 0.4702, + "step": 2317 + }, + { + "ETA": 1.51, + "epoch": 0.7454574690464705, + "fp16_scale": 1.0, + "global_step": 2318, + "grad_norm": 1.9553953252173792, + "learning_rate": 3.208645697173362e-07, + "loss": 0.4139, + "step": 2318 + }, + { + "ETA": 1.51, + "epoch": 0.7457790641582248, + "fp16_scale": 1.0, + "global_step": 2319, + "grad_norm": 1.9427431440690355, + "learning_rate": 3.2010010609541104e-07, + "loss": 0.4283, + "step": 2319 + }, + { + "ETA": 1.51, + "epoch": 0.7461006592699791, + "fp16_scale": 1.0, + "global_step": 2320, + "grad_norm": 1.8627030964806996, + "learning_rate": 3.193363806666961e-07, + "loss": 0.4009, + "step": 2320 + }, + { + "ETA": 1.5, + "epoch": 0.7464222543817334, + "fp16_scale": 1.0, + "global_step": 2321, + "grad_norm": 2.0199565518748193, + "learning_rate": 3.18573394260398e-07, + "loss": 0.3448, + "step": 2321 + }, + { + "ETA": 1.5, + "epoch": 0.7467438494934877, + "fp16_scale": 1.0, + "global_step": 2322, + "grad_norm": 1.698843236949823, + "learning_rate": 3.1781114770491966e-07, + "loss": 0.355, + "step": 2322 + }, + { + "ETA": 1.5, + "epoch": 0.747065444605242, + "fp16_scale": 1.0, + "global_step": 2323, + "grad_norm": 2.002294199773455, + "learning_rate": 3.1704964182786085e-07, + "loss": 0.4127, + "step": 2323 + }, + { + "ETA": 1.5, + "epoch": 0.7473870397169963, + "fp16_scale": 1.0, + "global_step": 2324, + "grad_norm": 1.8715369845319172, + "learning_rate": 3.1628887745601807e-07, + "loss": 0.3584, + "step": 2324 + }, + { + "ETA": 1.5, + "epoch": 0.7477086348287506, + "fp16_scale": 1.0, + "global_step": 2325, + "grad_norm": 1.8158810424221832, + "learning_rate": 3.155288554153819e-07, + "loss": 0.4242, + "step": 2325 + }, + { + "ETA": 1.49, + "epoch": 0.7480302299405049, + "fp16_scale": 1.0, + "global_step": 2326, + "grad_norm": 2.097106697099669, + "learning_rate": 3.147695765311377e-07, + "loss": 0.4529, + "step": 2326 + }, + { + "ETA": 1.49, + "epoch": 0.7483518250522592, + "fp16_scale": 1.0, + "global_step": 2327, + "grad_norm": 2.0899355326634392, + "learning_rate": 3.140110416276627e-07, + "loss": 0.466, + "step": 2327 + }, + { + "ETA": 1.49, + "epoch": 0.7486734201640135, + "fp16_scale": 1.0, + "global_step": 2328, + "grad_norm": 2.2487782125918603, + "learning_rate": 3.132532515285279e-07, + "loss": 0.3965, + "step": 2328 + }, + { + "ETA": 1.49, + "epoch": 0.7489950152757678, + "fp16_scale": 1.0, + "global_step": 2329, + "grad_norm": 1.8858384265180017, + "learning_rate": 3.1249620705649416e-07, + "loss": 0.4482, + "step": 2329 + }, + { + "ETA": 1.49, + "epoch": 0.7493166103875221, + "fp16_scale": 1.0, + "global_step": 2330, + "grad_norm": 2.0794448090461506, + "learning_rate": 3.1173990903351386e-07, + "loss": 0.5206, + "step": 2330 + }, + { + "ETA": 1.48, + "epoch": 0.7496382054992764, + "fp16_scale": 1.0, + "global_step": 2331, + "grad_norm": 1.9262669051509544, + "learning_rate": 3.109843582807289e-07, + "loss": 0.3939, + "step": 2331 + }, + { + "ETA": 1.48, + "epoch": 0.7499598006110307, + "fp16_scale": 1.0, + "global_step": 2332, + "grad_norm": 2.019912618397986, + "learning_rate": 3.1022955561846875e-07, + "loss": 0.499, + "step": 2332 + }, + { + "ETA": 1.48, + "epoch": 0.750281395722785, + "fp16_scale": 1.0, + "global_step": 2333, + "grad_norm": 2.1215830733501826, + "learning_rate": 3.0947550186625226e-07, + "loss": 0.3759, + "step": 2333 + }, + { + "ETA": 1.48, + "epoch": 0.7506029908345393, + "fp16_scale": 1.0, + "global_step": 2334, + "grad_norm": 1.9013856707341015, + "learning_rate": 3.0872219784278354e-07, + "loss": 0.3687, + "step": 2334 + }, + { + "ETA": 1.48, + "epoch": 0.7509245859462936, + "fp16_scale": 1.0, + "global_step": 2335, + "grad_norm": 1.7948480691627944, + "learning_rate": 3.0796964436595376e-07, + "loss": 0.3836, + "step": 2335 + }, + { + "ETA": 1.47, + "epoch": 0.751246181058048, + "fp16_scale": 1.0, + "global_step": 2336, + "grad_norm": 2.116207803452061, + "learning_rate": 3.072178422528392e-07, + "loss": 0.4881, + "step": 2336 + }, + { + "ETA": 1.47, + "epoch": 0.7515677761698022, + "fp16_scale": 1.0, + "global_step": 2337, + "grad_norm": 2.049696974555666, + "learning_rate": 3.0646679231969954e-07, + "loss": 0.464, + "step": 2337 + }, + { + "ETA": 1.47, + "epoch": 0.7518893712815565, + "fp16_scale": 1.0, + "global_step": 2338, + "grad_norm": 1.8532413576947044, + "learning_rate": 3.057164953819787e-07, + "loss": 0.3691, + "step": 2338 + }, + { + "ETA": 1.47, + "epoch": 0.7522109663933109, + "fp16_scale": 1.0, + "global_step": 2339, + "grad_norm": 1.8678216549910127, + "learning_rate": 3.0496695225430234e-07, + "loss": 0.3593, + "step": 2339 + }, + { + "ETA": 1.47, + "epoch": 0.7525325615050651, + "fp16_scale": 1.0, + "global_step": 2340, + "grad_norm": 2.1001635734513315, + "learning_rate": 3.0421816375047835e-07, + "loss": 0.3875, + "step": 2340 + }, + { + "ETA": 1.46, + "epoch": 0.7528541566168194, + "fp16_scale": 1.0, + "global_step": 2341, + "grad_norm": 1.8501131900814614, + "learning_rate": 3.034701306834944e-07, + "loss": 0.3891, + "step": 2341 + }, + { + "ETA": 1.46, + "epoch": 0.7531757517285738, + "fp16_scale": 1.0, + "global_step": 2342, + "grad_norm": 1.9754992574675254, + "learning_rate": 3.0272285386551867e-07, + "loss": 0.462, + "step": 2342 + }, + { + "ETA": 1.46, + "epoch": 0.753497346840328, + "fp16_scale": 1.0, + "global_step": 2343, + "grad_norm": 2.1449968620130138, + "learning_rate": 3.019763341078986e-07, + "loss": 0.4149, + "step": 2343 + }, + { + "ETA": 1.46, + "epoch": 0.7538189419520823, + "fp16_scale": 1.0, + "global_step": 2344, + "grad_norm": 2.1797413203765705, + "learning_rate": 3.012305722211583e-07, + "loss": 0.3578, + "step": 2344 + }, + { + "ETA": 1.46, + "epoch": 0.7541405370638367, + "fp16_scale": 1.0, + "global_step": 2345, + "grad_norm": 2.1557219054361143, + "learning_rate": 3.0048556901500067e-07, + "loss": 0.433, + "step": 2345 + }, + { + "ETA": 1.46, + "epoch": 0.7544621321755909, + "fp16_scale": 1.0, + "global_step": 2346, + "grad_norm": 1.8652460311633114, + "learning_rate": 2.997413252983033e-07, + "loss": 0.3479, + "step": 2346 + }, + { + "ETA": 1.45, + "epoch": 0.7547837272873452, + "fp16_scale": 1.0, + "global_step": 2347, + "grad_norm": 1.802175048918983, + "learning_rate": 2.9899784187912027e-07, + "loss": 0.4433, + "step": 2347 + }, + { + "ETA": 1.45, + "epoch": 0.7551053223990996, + "fp16_scale": 1.0, + "global_step": 2348, + "grad_norm": 1.9417291380518882, + "learning_rate": 2.982551195646801e-07, + "loss": 0.4191, + "step": 2348 + }, + { + "ETA": 1.45, + "epoch": 0.7554269175108539, + "fp16_scale": 1.0, + "global_step": 2349, + "grad_norm": 2.0141324436262273, + "learning_rate": 2.975131591613842e-07, + "loss": 0.4138, + "step": 2349 + }, + { + "ETA": 1.45, + "epoch": 0.7557485126226081, + "fp16_scale": 1.0, + "global_step": 2350, + "grad_norm": 2.0507366630364565, + "learning_rate": 2.9677196147480786e-07, + "loss": 0.3968, + "step": 2350 + }, + { + "ETA": 1.45, + "epoch": 0.7560701077343625, + "fp16_scale": 1.0, + "global_step": 2351, + "grad_norm": 1.8831942011400506, + "learning_rate": 2.960315273096968e-07, + "loss": 0.3856, + "step": 2351 + }, + { + "ETA": 1.44, + "epoch": 0.7563917028461168, + "fp16_scale": 1.0, + "global_step": 2352, + "grad_norm": 2.0522410847350034, + "learning_rate": 2.952918574699692e-07, + "loss": 0.4369, + "step": 2352 + }, + { + "ETA": 1.44, + "epoch": 0.756713297957871, + "fp16_scale": 1.0, + "global_step": 2353, + "grad_norm": 1.8757701859960927, + "learning_rate": 2.9455295275871294e-07, + "loss": 0.5093, + "step": 2353 + }, + { + "ETA": 1.44, + "epoch": 0.7570348930696253, + "fp16_scale": 1.0, + "global_step": 2354, + "grad_norm": 1.9690277816933504, + "learning_rate": 2.938148139781844e-07, + "loss": 0.4554, + "step": 2354 + }, + { + "ETA": 1.44, + "epoch": 0.7573564881813797, + "fp16_scale": 1.0, + "global_step": 2355, + "grad_norm": 1.8450578782643, + "learning_rate": 2.930774419298097e-07, + "loss": 0.4056, + "step": 2355 + }, + { + "ETA": 1.44, + "epoch": 0.7576780832931339, + "fp16_scale": 1.0, + "global_step": 2356, + "grad_norm": 1.7034100050008794, + "learning_rate": 2.923408374141808e-07, + "loss": 0.3957, + "step": 2356 + }, + { + "ETA": 1.43, + "epoch": 0.7579996784048882, + "fp16_scale": 1.0, + "global_step": 2357, + "grad_norm": 2.382291209090017, + "learning_rate": 2.91605001231058e-07, + "loss": 0.3588, + "step": 2357 + }, + { + "ETA": 1.43, + "epoch": 0.7583212735166426, + "fp16_scale": 1.0, + "global_step": 2358, + "grad_norm": 1.847158745485686, + "learning_rate": 2.9086993417936667e-07, + "loss": 0.3489, + "step": 2358 + }, + { + "ETA": 1.43, + "epoch": 0.7586428686283968, + "fp16_scale": 1.0, + "global_step": 2359, + "grad_norm": 2.1251062018931717, + "learning_rate": 2.9013563705719667e-07, + "loss": 0.4184, + "step": 2359 + }, + { + "ETA": 1.43, + "epoch": 0.7589644637401511, + "fp16_scale": 1.0, + "global_step": 2360, + "grad_norm": 1.9086332649739344, + "learning_rate": 2.89402110661803e-07, + "loss": 0.4698, + "step": 2360 + }, + { + "ETA": 1.43, + "epoch": 0.7592860588519055, + "fp16_scale": 1.0, + "global_step": 2361, + "grad_norm": 1.8856253887660206, + "learning_rate": 2.886693557896024e-07, + "loss": 0.5095, + "step": 2361 + }, + { + "ETA": 1.42, + "epoch": 0.7596076539636597, + "fp16_scale": 1.0, + "global_step": 2362, + "grad_norm": 1.8381726038916812, + "learning_rate": 2.879373732361755e-07, + "loss": 0.3904, + "step": 2362 + }, + { + "ETA": 1.42, + "epoch": 0.759929249075414, + "fp16_scale": 1.0, + "global_step": 2363, + "grad_norm": 1.9701301999772292, + "learning_rate": 2.8720616379626295e-07, + "loss": 0.3614, + "step": 2363 + }, + { + "ETA": 1.42, + "epoch": 0.7602508441871684, + "fp16_scale": 1.0, + "global_step": 2364, + "grad_norm": 1.9254175024012488, + "learning_rate": 2.86475728263767e-07, + "loss": 0.3778, + "step": 2364 + }, + { + "ETA": 1.42, + "epoch": 0.7605724392989227, + "fp16_scale": 1.0, + "global_step": 2365, + "grad_norm": 1.8753050225093335, + "learning_rate": 2.857460674317498e-07, + "loss": 0.4242, + "step": 2365 + }, + { + "ETA": 1.42, + "epoch": 0.7608940344106769, + "fp16_scale": 1.0, + "global_step": 2366, + "grad_norm": 2.0529313455452076, + "learning_rate": 2.85017182092431e-07, + "loss": 0.3934, + "step": 2366 + }, + { + "ETA": 1.41, + "epoch": 0.7612156295224313, + "fp16_scale": 1.0, + "global_step": 2367, + "grad_norm": 2.0042601366074413, + "learning_rate": 2.842890730371901e-07, + "loss": 0.5309, + "step": 2367 + }, + { + "ETA": 1.41, + "epoch": 0.7615372246341856, + "fp16_scale": 1.0, + "global_step": 2368, + "grad_norm": 1.9441968238348006, + "learning_rate": 2.8356174105656194e-07, + "loss": 0.4102, + "step": 2368 + }, + { + "ETA": 1.41, + "epoch": 0.7618588197459398, + "fp16_scale": 1.0, + "global_step": 2369, + "grad_norm": 1.841715153459455, + "learning_rate": 2.82835186940239e-07, + "loss": 0.5004, + "step": 2369 + }, + { + "ETA": 1.41, + "epoch": 0.7621804148576942, + "fp16_scale": 1.0, + "global_step": 2370, + "grad_norm": 2.018858225491701, + "learning_rate": 2.8210941147706914e-07, + "loss": 0.4003, + "step": 2370 + }, + { + "ETA": 1.41, + "epoch": 0.7625020099694485, + "fp16_scale": 1.0, + "global_step": 2371, + "grad_norm": 2.023807966756412, + "learning_rate": 2.8138441545505365e-07, + "loss": 0.4476, + "step": 2371 + }, + { + "ETA": 1.41, + "epoch": 0.7628236050812027, + "fp16_scale": 1.0, + "global_step": 2372, + "grad_norm": 1.7877444415267678, + "learning_rate": 2.8066019966134904e-07, + "loss": 0.4067, + "step": 2372 + }, + { + "ETA": 1.4, + "epoch": 0.7631452001929571, + "fp16_scale": 1.0, + "global_step": 2373, + "grad_norm": 1.9810597265207228, + "learning_rate": 2.7993676488226334e-07, + "loss": 0.3652, + "step": 2373 + }, + { + "ETA": 1.4, + "epoch": 0.7634667953047114, + "fp16_scale": 1.0, + "global_step": 2374, + "grad_norm": 1.8176940727449031, + "learning_rate": 2.792141119032575e-07, + "loss": 0.4308, + "step": 2374 + }, + { + "ETA": 1.4, + "epoch": 0.7637883904164656, + "fp16_scale": 1.0, + "global_step": 2375, + "grad_norm": 1.8175822612335957, + "learning_rate": 2.784922415089438e-07, + "loss": 0.3414, + "step": 2375 + }, + { + "ETA": 1.4, + "epoch": 0.76410998552822, + "fp16_scale": 1.0, + "global_step": 2376, + "grad_norm": 1.8542997787006494, + "learning_rate": 2.7777115448308373e-07, + "loss": 0.4791, + "step": 2376 + }, + { + "ETA": 1.4, + "epoch": 0.7644315806399743, + "fp16_scale": 1.0, + "global_step": 2377, + "grad_norm": 1.906502731802614, + "learning_rate": 2.7705085160858955e-07, + "loss": 0.4136, + "step": 2377 + }, + { + "ETA": 1.39, + "epoch": 0.7647531757517285, + "fp16_scale": 1.0, + "global_step": 2378, + "grad_norm": 2.107113884405421, + "learning_rate": 2.7633133366752094e-07, + "loss": 0.4473, + "step": 2378 + }, + { + "ETA": 1.39, + "epoch": 0.7650747708634829, + "fp16_scale": 1.0, + "global_step": 2379, + "grad_norm": 2.0367569002273727, + "learning_rate": 2.7561260144108624e-07, + "loss": 0.4173, + "step": 2379 + }, + { + "ETA": 1.39, + "epoch": 0.7653963659752372, + "fp16_scale": 1.0, + "global_step": 2380, + "grad_norm": 1.9880387158551962, + "learning_rate": 2.7489465570964065e-07, + "loss": 0.4354, + "step": 2380 + }, + { + "ETA": 1.39, + "epoch": 0.7657179610869915, + "fp16_scale": 1.0, + "global_step": 2381, + "grad_norm": 2.0044324427964866, + "learning_rate": 2.741774972526847e-07, + "loss": 0.4979, + "step": 2381 + }, + { + "ETA": 1.39, + "epoch": 0.7660395561987458, + "fp16_scale": 1.0, + "global_step": 2382, + "grad_norm": 2.077523049347562, + "learning_rate": 2.7346112684886516e-07, + "loss": 0.4095, + "step": 2382 + }, + { + "ETA": 1.38, + "epoch": 0.7663611513105001, + "fp16_scale": 1.0, + "global_step": 2383, + "grad_norm": 2.0103024321551763, + "learning_rate": 2.7274554527597206e-07, + "loss": 0.4386, + "step": 2383 + }, + { + "ETA": 1.38, + "epoch": 0.7666827464222544, + "fp16_scale": 1.0, + "global_step": 2384, + "grad_norm": 1.9720379413104863, + "learning_rate": 2.7203075331094014e-07, + "loss": 0.3507, + "step": 2384 + }, + { + "ETA": 1.38, + "epoch": 0.7670043415340086, + "fp16_scale": 1.0, + "global_step": 2385, + "grad_norm": 2.0404184176861273, + "learning_rate": 2.7131675172984556e-07, + "loss": 0.4214, + "step": 2385 + }, + { + "ETA": 1.38, + "epoch": 0.767325936645763, + "fp16_scale": 1.0, + "global_step": 2386, + "grad_norm": 1.6426678670639059, + "learning_rate": 2.7060354130790795e-07, + "loss": 0.4091, + "step": 2386 + }, + { + "ETA": 1.38, + "epoch": 0.7676475317575173, + "fp16_scale": 1.0, + "global_step": 2387, + "grad_norm": 1.876970222583723, + "learning_rate": 2.698911228194867e-07, + "loss": 0.3538, + "step": 2387 + }, + { + "ETA": 1.37, + "epoch": 0.7679691268692715, + "fp16_scale": 1.0, + "global_step": 2388, + "grad_norm": 1.981909107063528, + "learning_rate": 2.6917949703808107e-07, + "loss": 0.4145, + "step": 2388 + }, + { + "ETA": 1.37, + "epoch": 0.7682907219810259, + "fp16_scale": 1.0, + "global_step": 2389, + "grad_norm": 1.99038324454389, + "learning_rate": 2.6846866473633124e-07, + "loss": 0.4371, + "step": 2389 + }, + { + "ETA": 1.37, + "epoch": 0.7686123170927802, + "fp16_scale": 1.0, + "global_step": 2390, + "grad_norm": 2.232733886614169, + "learning_rate": 2.67758626686014e-07, + "loss": 0.4094, + "step": 2390 + }, + { + "ETA": 1.37, + "epoch": 0.7689339122045344, + "fp16_scale": 1.0, + "global_step": 2391, + "grad_norm": 2.0280191977141904, + "learning_rate": 2.670493836580453e-07, + "loss": 0.4044, + "step": 2391 + }, + { + "ETA": 1.37, + "epoch": 0.7692555073162888, + "fp16_scale": 1.0, + "global_step": 2392, + "grad_norm": 1.969272510274025, + "learning_rate": 2.6634093642247737e-07, + "loss": 0.4537, + "step": 2392 + }, + { + "ETA": 1.37, + "epoch": 0.7695771024280431, + "fp16_scale": 1.0, + "global_step": 2393, + "grad_norm": 1.872811807796311, + "learning_rate": 2.6563328574849775e-07, + "loss": 0.4049, + "step": 2393 + }, + { + "ETA": 1.36, + "epoch": 0.7698986975397974, + "fp16_scale": 1.0, + "global_step": 2394, + "grad_norm": 1.9581733890124202, + "learning_rate": 2.649264324044306e-07, + "loss": 0.3751, + "step": 2394 + }, + { + "ETA": 1.36, + "epoch": 0.7702202926515517, + "fp16_scale": 1.0, + "global_step": 2395, + "grad_norm": 1.9921380649074314, + "learning_rate": 2.642203771577326e-07, + "loss": 0.4639, + "step": 2395 + }, + { + "ETA": 1.36, + "epoch": 0.770541887763306, + "fp16_scale": 1.0, + "global_step": 2396, + "grad_norm": 2.255240623215617, + "learning_rate": 2.635151207749953e-07, + "loss": 0.4606, + "step": 2396 + }, + { + "ETA": 1.36, + "epoch": 0.7708634828750603, + "fp16_scale": 1.0, + "global_step": 2397, + "grad_norm": 2.0052310782073333, + "learning_rate": 2.628106640219424e-07, + "loss": 0.4452, + "step": 2397 + }, + { + "ETA": 1.36, + "epoch": 0.7711850779868146, + "fp16_scale": 1.0, + "global_step": 2398, + "grad_norm": 1.8495776836523259, + "learning_rate": 2.621070076634296e-07, + "loss": 0.3467, + "step": 2398 + }, + { + "ETA": 1.35, + "epoch": 0.7715066730985689, + "fp16_scale": 1.0, + "global_step": 2399, + "grad_norm": 1.9785103009015907, + "learning_rate": 2.614041524634434e-07, + "loss": 0.3818, + "step": 2399 + }, + { + "ETA": 1.35, + "epoch": 0.7718282682103232, + "fp16_scale": 1.0, + "global_step": 2400, + "grad_norm": 2.1935046919910555, + "learning_rate": 2.6070209918509977e-07, + "loss": 0.4879, + "step": 2400 + }, + { + "ETA": 1.35, + "epoch": 0.7721498633220775, + "fp16_scale": 1.0, + "global_step": 2401, + "grad_norm": 2.2724570068349834, + "learning_rate": 2.600008485906453e-07, + "loss": 0.4166, + "step": 2401 + }, + { + "ETA": 1.35, + "epoch": 0.7724714584338318, + "fp16_scale": 1.0, + "global_step": 2402, + "grad_norm": 2.2781218984515244, + "learning_rate": 2.593004014414544e-07, + "loss": 0.4997, + "step": 2402 + }, + { + "ETA": 1.35, + "epoch": 0.7727930535455861, + "fp16_scale": 1.0, + "global_step": 2403, + "grad_norm": 2.167867864658568, + "learning_rate": 2.5860075849802943e-07, + "loss": 0.4464, + "step": 2403 + }, + { + "ETA": 1.35, + "epoch": 0.7731146486573404, + "fp16_scale": 1.0, + "global_step": 2404, + "grad_norm": 1.8598594573970242, + "learning_rate": 2.5790192051999917e-07, + "loss": 0.3918, + "step": 2404 + }, + { + "ETA": 1.35, + "epoch": 0.7734362437690947, + "fp16_scale": 1.0, + "global_step": 2405, + "grad_norm": 1.9770572453766835, + "learning_rate": 2.572038882661183e-07, + "loss": 0.4527, + "step": 2405 + }, + { + "ETA": 1.34, + "epoch": 0.773757838880849, + "fp16_scale": 1.0, + "global_step": 2406, + "grad_norm": 1.7657481931797248, + "learning_rate": 2.565066624942677e-07, + "loss": 0.3527, + "step": 2406 + }, + { + "ETA": 1.34, + "epoch": 0.7740794339926033, + "fp16_scale": 1.0, + "global_step": 2407, + "grad_norm": 2.1870135146732643, + "learning_rate": 2.558102439614511e-07, + "loss": 0.383, + "step": 2407 + }, + { + "ETA": 1.34, + "epoch": 0.7744010291043576, + "fp16_scale": 1.0, + "global_step": 2408, + "grad_norm": 1.8817821440102491, + "learning_rate": 2.5511463342379714e-07, + "loss": 0.4598, + "step": 2408 + }, + { + "ETA": 1.34, + "epoch": 0.7747226242161119, + "fp16_scale": 1.0, + "global_step": 2409, + "grad_norm": 1.957924103152357, + "learning_rate": 2.54419831636557e-07, + "loss": 0.4187, + "step": 2409 + }, + { + "ETA": 1.34, + "epoch": 0.7750442193278663, + "fp16_scale": 1.0, + "global_step": 2410, + "grad_norm": 1.8293307605639495, + "learning_rate": 2.5372583935410274e-07, + "loss": 0.4452, + "step": 2410 + }, + { + "ETA": 1.33, + "epoch": 0.7753658144396205, + "fp16_scale": 1.0, + "global_step": 2411, + "grad_norm": 1.95054798880309, + "learning_rate": 2.5303265732992885e-07, + "loss": 0.4174, + "step": 2411 + }, + { + "ETA": 1.33, + "epoch": 0.7756874095513748, + "fp16_scale": 1.0, + "global_step": 2412, + "grad_norm": 1.9225610623389644, + "learning_rate": 2.5234028631664884e-07, + "loss": 0.4868, + "step": 2412 + }, + { + "ETA": 1.33, + "epoch": 0.7760090046631292, + "fp16_scale": 1.0, + "global_step": 2413, + "grad_norm": 1.9580567396648731, + "learning_rate": 2.516487270659966e-07, + "loss": 0.4789, + "step": 2413 + }, + { + "ETA": 1.33, + "epoch": 0.7763305997748834, + "fp16_scale": 1.0, + "global_step": 2414, + "grad_norm": 2.0889732155814107, + "learning_rate": 2.5095798032882443e-07, + "loss": 0.441, + "step": 2414 + }, + { + "ETA": 1.33, + "epoch": 0.7766521948866377, + "fp16_scale": 1.0, + "global_step": 2415, + "grad_norm": 2.2663999415670575, + "learning_rate": 2.502680468551025e-07, + "loss": 0.4823, + "step": 2415 + }, + { + "ETA": 1.32, + "epoch": 0.7769737899983921, + "fp16_scale": 1.0, + "global_step": 2416, + "grad_norm": 2.1777402998300217, + "learning_rate": 2.495789273939176e-07, + "loss": 0.4474, + "step": 2416 + }, + { + "ETA": 1.32, + "epoch": 0.7772953851101463, + "fp16_scale": 1.0, + "global_step": 2417, + "grad_norm": 1.9846358739140422, + "learning_rate": 2.4889062269347284e-07, + "loss": 0.4434, + "step": 2417 + }, + { + "ETA": 1.32, + "epoch": 0.7776169802219006, + "fp16_scale": 1.0, + "global_step": 2418, + "grad_norm": 2.052147639969581, + "learning_rate": 2.48203133501087e-07, + "loss": 0.4001, + "step": 2418 + }, + { + "ETA": 1.32, + "epoch": 0.777938575333655, + "fp16_scale": 1.0, + "global_step": 2419, + "grad_norm": 2.228658611393801, + "learning_rate": 2.475164605631933e-07, + "loss": 0.4636, + "step": 2419 + }, + { + "ETA": 1.32, + "epoch": 0.7782601704454092, + "fp16_scale": 1.0, + "global_step": 2420, + "grad_norm": 2.1585999473649693, + "learning_rate": 2.46830604625339e-07, + "loss": 0.3789, + "step": 2420 + }, + { + "ETA": 1.31, + "epoch": 0.7785817655571635, + "fp16_scale": 1.0, + "global_step": 2421, + "grad_norm": 2.1549970635685662, + "learning_rate": 2.4614556643218376e-07, + "loss": 0.4867, + "step": 2421 + }, + { + "ETA": 1.31, + "epoch": 0.7789033606689179, + "fp16_scale": 1.0, + "global_step": 2422, + "grad_norm": 1.742901679031034, + "learning_rate": 2.4546134672749943e-07, + "loss": 0.3308, + "step": 2422 + }, + { + "ETA": 1.31, + "epoch": 0.7792249557806721, + "fp16_scale": 1.0, + "global_step": 2423, + "grad_norm": 2.0792519984779365, + "learning_rate": 2.4477794625416945e-07, + "loss": 0.438, + "step": 2423 + }, + { + "ETA": 1.31, + "epoch": 0.7795465508924264, + "fp16_scale": 1.0, + "global_step": 2424, + "grad_norm": 1.9555432825383154, + "learning_rate": 2.4409536575418786e-07, + "loss": 0.4247, + "step": 2424 + }, + { + "ETA": 1.31, + "epoch": 0.7798681460041808, + "fp16_scale": 1.0, + "global_step": 2425, + "grad_norm": 2.170058553322506, + "learning_rate": 2.4341360596865865e-07, + "loss": 0.3825, + "step": 2425 + }, + { + "ETA": 1.3, + "epoch": 0.7801897411159351, + "fp16_scale": 1.0, + "global_step": 2426, + "grad_norm": 2.025218796942112, + "learning_rate": 2.427326676377939e-07, + "loss": 0.3754, + "step": 2426 + }, + { + "ETA": 1.3, + "epoch": 0.7805113362276893, + "fp16_scale": 1.0, + "global_step": 2427, + "grad_norm": 2.138378640496298, + "learning_rate": 2.4205255150091465e-07, + "loss": 0.5007, + "step": 2427 + }, + { + "ETA": 1.3, + "epoch": 0.7808329313394436, + "fp16_scale": 1.0, + "global_step": 2428, + "grad_norm": 1.9477397638022118, + "learning_rate": 2.413732582964486e-07, + "loss": 0.3996, + "step": 2428 + }, + { + "ETA": 1.3, + "epoch": 0.781154526451198, + "fp16_scale": 1.0, + "global_step": 2429, + "grad_norm": 1.9310566415678243, + "learning_rate": 2.4069478876193014e-07, + "loss": 0.4145, + "step": 2429 + }, + { + "ETA": 1.3, + "epoch": 0.7814761215629522, + "fp16_scale": 1.0, + "global_step": 2430, + "grad_norm": 1.7897591000650312, + "learning_rate": 2.4001714363399973e-07, + "loss": 0.4507, + "step": 2430 + }, + { + "ETA": 1.29, + "epoch": 0.7817977166747065, + "fp16_scale": 1.0, + "global_step": 2431, + "grad_norm": 1.775196010156822, + "learning_rate": 2.393403236484024e-07, + "loss": 0.4017, + "step": 2431 + }, + { + "ETA": 1.29, + "epoch": 0.7821193117864609, + "fp16_scale": 1.0, + "global_step": 2432, + "grad_norm": 2.0239457154136344, + "learning_rate": 2.386643295399878e-07, + "loss": 0.4194, + "step": 2432 + }, + { + "ETA": 1.29, + "epoch": 0.7824409068982151, + "fp16_scale": 1.0, + "global_step": 2433, + "grad_norm": 2.0350327598032734, + "learning_rate": 2.3798916204270815e-07, + "loss": 0.5327, + "step": 2433 + }, + { + "ETA": 1.29, + "epoch": 0.7827625020099694, + "fp16_scale": 1.0, + "global_step": 2434, + "grad_norm": 1.9191840675151848, + "learning_rate": 2.3731482188961815e-07, + "loss": 0.4084, + "step": 2434 + }, + { + "ETA": 1.29, + "epoch": 0.7830840971217238, + "fp16_scale": 1.0, + "global_step": 2435, + "grad_norm": 1.7817657574658297, + "learning_rate": 2.3664130981287488e-07, + "loss": 0.4291, + "step": 2435 + }, + { + "ETA": 1.29, + "epoch": 0.783405692233478, + "fp16_scale": 1.0, + "global_step": 2436, + "grad_norm": 1.7354952296947364, + "learning_rate": 2.359686265437363e-07, + "loss": 0.4298, + "step": 2436 + }, + { + "ETA": 1.28, + "epoch": 0.7837272873452323, + "fp16_scale": 1.0, + "global_step": 2437, + "grad_norm": 1.9891033243464187, + "learning_rate": 2.3529677281256023e-07, + "loss": 0.4185, + "step": 2437 + }, + { + "ETA": 1.28, + "epoch": 0.7840488824569867, + "fp16_scale": 1.0, + "global_step": 2438, + "grad_norm": 1.9224597958050123, + "learning_rate": 2.3462574934880363e-07, + "loss": 0.4331, + "step": 2438 + }, + { + "ETA": 1.28, + "epoch": 0.784370477568741, + "fp16_scale": 1.0, + "global_step": 2439, + "grad_norm": 1.933918397639351, + "learning_rate": 2.339555568810221e-07, + "loss": 0.3845, + "step": 2439 + }, + { + "ETA": 1.28, + "epoch": 0.7846920726804952, + "fp16_scale": 1.0, + "global_step": 2440, + "grad_norm": 2.206973782846126, + "learning_rate": 2.3328619613686929e-07, + "loss": 0.4741, + "step": 2440 + }, + { + "ETA": 1.28, + "epoch": 0.7850136677922496, + "fp16_scale": 1.0, + "global_step": 2441, + "grad_norm": 1.9425902063024618, + "learning_rate": 2.3261766784309566e-07, + "loss": 0.4425, + "step": 2441 + }, + { + "ETA": 1.27, + "epoch": 0.7853352629040039, + "fp16_scale": 1.0, + "global_step": 2442, + "grad_norm": 2.0404531728969673, + "learning_rate": 2.3194997272554816e-07, + "loss": 0.4983, + "step": 2442 + }, + { + "ETA": 1.27, + "epoch": 0.7856568580157581, + "fp16_scale": 1.0, + "global_step": 2443, + "grad_norm": 2.200435411586142, + "learning_rate": 2.3128311150916823e-07, + "loss": 0.4583, + "step": 2443 + }, + { + "ETA": 1.27, + "epoch": 0.7859784531275125, + "fp16_scale": 1.0, + "global_step": 2444, + "grad_norm": 2.319434408849042, + "learning_rate": 2.3061708491799315e-07, + "loss": 0.4072, + "step": 2444 + }, + { + "ETA": 1.27, + "epoch": 0.7863000482392668, + "fp16_scale": 1.0, + "global_step": 2445, + "grad_norm": 1.8877487328761051, + "learning_rate": 2.299518936751529e-07, + "loss": 0.3687, + "step": 2445 + }, + { + "ETA": 1.27, + "epoch": 0.786621643351021, + "fp16_scale": 1.0, + "global_step": 2446, + "grad_norm": 1.702793967365356, + "learning_rate": 2.2928753850287052e-07, + "loss": 0.417, + "step": 2446 + }, + { + "ETA": 1.26, + "epoch": 0.7869432384627754, + "fp16_scale": 1.0, + "global_step": 2447, + "grad_norm": 1.694650746618412, + "learning_rate": 2.2862402012246274e-07, + "loss": 0.379, + "step": 2447 + }, + { + "ETA": 1.26, + "epoch": 0.7872648335745297, + "fp16_scale": 1.0, + "global_step": 2448, + "grad_norm": 2.0788330389654797, + "learning_rate": 2.2796133925433604e-07, + "loss": 0.4545, + "step": 2448 + }, + { + "ETA": 1.26, + "epoch": 0.7875864286862839, + "fp16_scale": 1.0, + "global_step": 2449, + "grad_norm": 2.07207971611621, + "learning_rate": 2.2729949661798876e-07, + "loss": 0.4605, + "step": 2449 + }, + { + "ETA": 1.26, + "epoch": 0.7879080237980383, + "fp16_scale": 1.0, + "global_step": 2450, + "grad_norm": 1.9467495491281819, + "learning_rate": 2.2663849293200833e-07, + "loss": 0.354, + "step": 2450 + }, + { + "ETA": 1.26, + "epoch": 0.7882296189097926, + "fp16_scale": 1.0, + "global_step": 2451, + "grad_norm": 1.8780845183653034, + "learning_rate": 2.259783289140713e-07, + "loss": 0.3823, + "step": 2451 + }, + { + "ETA": 1.25, + "epoch": 0.7885512140215468, + "fp16_scale": 1.0, + "global_step": 2452, + "grad_norm": 1.778563629067184, + "learning_rate": 2.2531900528094338e-07, + "loss": 0.3928, + "step": 2452 + }, + { + "ETA": 1.25, + "epoch": 0.7888728091333012, + "fp16_scale": 1.0, + "global_step": 2453, + "grad_norm": 2.116978406408106, + "learning_rate": 2.2466052274847713e-07, + "loss": 0.4129, + "step": 2453 + }, + { + "ETA": 1.25, + "epoch": 0.7891944042450555, + "fp16_scale": 1.0, + "global_step": 2454, + "grad_norm": 1.7966541557113676, + "learning_rate": 2.2400288203161267e-07, + "loss": 0.4545, + "step": 2454 + }, + { + "ETA": 1.25, + "epoch": 0.7895159993568098, + "fp16_scale": 1.0, + "global_step": 2455, + "grad_norm": 1.6463550140147465, + "learning_rate": 2.233460838443747e-07, + "loss": 0.4503, + "step": 2455 + }, + { + "ETA": 1.25, + "epoch": 0.789837594468564, + "fp16_scale": 1.0, + "global_step": 2456, + "grad_norm": 2.037057951742195, + "learning_rate": 2.226901288998747e-07, + "loss": 0.457, + "step": 2456 + }, + { + "ETA": 1.25, + "epoch": 0.7901591895803184, + "fp16_scale": 1.0, + "global_step": 2457, + "grad_norm": 1.7667875112466476, + "learning_rate": 2.2203501791030755e-07, + "loss": 0.4695, + "step": 2457 + }, + { + "ETA": 1.24, + "epoch": 0.7904807846920727, + "fp16_scale": 1.0, + "global_step": 2458, + "grad_norm": 2.1244558523204042, + "learning_rate": 2.2138075158695223e-07, + "loss": 0.4613, + "step": 2458 + }, + { + "ETA": 1.24, + "epoch": 0.790802379803827, + "fp16_scale": 1.0, + "global_step": 2459, + "grad_norm": 1.846268770568288, + "learning_rate": 2.20727330640171e-07, + "loss": 0.4504, + "step": 2459 + }, + { + "ETA": 1.24, + "epoch": 0.7911239749155813, + "fp16_scale": 1.0, + "global_step": 2460, + "grad_norm": 2.3358489335845514, + "learning_rate": 2.2007475577940727e-07, + "loss": 0.4058, + "step": 2460 + }, + { + "ETA": 1.24, + "epoch": 0.7914455700273356, + "fp16_scale": 1.0, + "global_step": 2461, + "grad_norm": 1.8555064537306474, + "learning_rate": 2.1942302771318711e-07, + "loss": 0.3934, + "step": 2461 + }, + { + "ETA": 1.24, + "epoch": 0.7917671651390898, + "fp16_scale": 1.0, + "global_step": 2462, + "grad_norm": 1.9481266892670586, + "learning_rate": 2.1877214714911573e-07, + "loss": 0.4398, + "step": 2462 + }, + { + "ETA": 1.23, + "epoch": 0.7920887602508442, + "fp16_scale": 1.0, + "global_step": 2463, + "grad_norm": 1.9136236053642883, + "learning_rate": 2.1812211479387955e-07, + "loss": 0.4187, + "step": 2463 + }, + { + "ETA": 1.23, + "epoch": 0.7924103553625985, + "fp16_scale": 1.0, + "global_step": 2464, + "grad_norm": 2.0753059041092943, + "learning_rate": 2.174729313532433e-07, + "loss": 0.4092, + "step": 2464 + }, + { + "ETA": 1.23, + "epoch": 0.7927319504743527, + "fp16_scale": 1.0, + "global_step": 2465, + "grad_norm": 1.8422628993967345, + "learning_rate": 2.1682459753204996e-07, + "loss": 0.496, + "step": 2465 + }, + { + "ETA": 1.23, + "epoch": 0.7930535455861071, + "fp16_scale": 1.0, + "global_step": 2466, + "grad_norm": 1.8599436632392372, + "learning_rate": 2.1617711403422067e-07, + "loss": 0.438, + "step": 2466 + }, + { + "ETA": 1.23, + "epoch": 0.7933751406978614, + "fp16_scale": 1.0, + "global_step": 2467, + "grad_norm": 2.0420426155202156, + "learning_rate": 2.1553048156275278e-07, + "loss": 0.4036, + "step": 2467 + }, + { + "ETA": 1.22, + "epoch": 0.7936967358096156, + "fp16_scale": 1.0, + "global_step": 2468, + "grad_norm": 1.9215311888129323, + "learning_rate": 2.14884700819719e-07, + "loss": 0.4464, + "step": 2468 + }, + { + "ETA": 1.22, + "epoch": 0.79401833092137, + "fp16_scale": 1.0, + "global_step": 2469, + "grad_norm": 1.9252054519149342, + "learning_rate": 2.1423977250626935e-07, + "loss": 0.4183, + "step": 2469 + }, + { + "ETA": 1.22, + "epoch": 0.7943399260331243, + "fp16_scale": 1.0, + "global_step": 2470, + "grad_norm": 1.868769489318591, + "learning_rate": 2.1359569732262616e-07, + "loss": 0.4225, + "step": 2470 + }, + { + "ETA": 1.22, + "epoch": 0.7946615211448786, + "fp16_scale": 1.0, + "global_step": 2471, + "grad_norm": 1.914731506889317, + "learning_rate": 2.1295247596808707e-07, + "loss": 0.4664, + "step": 2471 + }, + { + "ETA": 1.22, + "epoch": 0.7949831162566329, + "fp16_scale": 1.0, + "global_step": 2472, + "grad_norm": 2.1494173384732305, + "learning_rate": 2.1231010914102132e-07, + "loss": 0.3298, + "step": 2472 + }, + { + "ETA": 1.21, + "epoch": 0.7953047113683872, + "fp16_scale": 1.0, + "global_step": 2473, + "grad_norm": 1.753058322961423, + "learning_rate": 2.1166859753887168e-07, + "loss": 0.4025, + "step": 2473 + }, + { + "ETA": 1.21, + "epoch": 0.7956263064801415, + "fp16_scale": 1.0, + "global_step": 2474, + "grad_norm": 1.987746944860503, + "learning_rate": 2.1102794185815097e-07, + "loss": 0.4229, + "step": 2474 + }, + { + "ETA": 1.21, + "epoch": 0.7959479015918958, + "fp16_scale": 1.0, + "global_step": 2475, + "grad_norm": 2.1355958067140475, + "learning_rate": 2.1038814279444406e-07, + "loss": 0.5087, + "step": 2475 + }, + { + "ETA": 1.21, + "epoch": 0.7962694967036501, + "fp16_scale": 1.0, + "global_step": 2476, + "grad_norm": 1.915840649659624, + "learning_rate": 2.0974920104240524e-07, + "loss": 0.3922, + "step": 2476 + }, + { + "ETA": 1.21, + "epoch": 0.7965910918154044, + "fp16_scale": 1.0, + "global_step": 2477, + "grad_norm": 1.993782877560204, + "learning_rate": 2.0911111729575736e-07, + "loss": 0.4834, + "step": 2477 + }, + { + "ETA": 1.21, + "epoch": 0.7969126869271587, + "fp16_scale": 1.0, + "global_step": 2478, + "grad_norm": 2.242003118407737, + "learning_rate": 2.0847389224729283e-07, + "loss": 0.3983, + "step": 2478 + }, + { + "ETA": 1.2, + "epoch": 0.797234282038913, + "fp16_scale": 1.0, + "global_step": 2479, + "grad_norm": 2.2093102230788406, + "learning_rate": 2.0783752658887066e-07, + "loss": 0.4651, + "step": 2479 + }, + { + "ETA": 1.2, + "epoch": 0.7975558771506673, + "fp16_scale": 1.0, + "global_step": 2480, + "grad_norm": 2.130014627732121, + "learning_rate": 2.0720202101141748e-07, + "loss": 0.5205, + "step": 2480 + }, + { + "ETA": 1.2, + "epoch": 0.7978774722624216, + "fp16_scale": 1.0, + "global_step": 2481, + "grad_norm": 2.2184590034367995, + "learning_rate": 2.0656737620492627e-07, + "loss": 0.429, + "step": 2481 + }, + { + "ETA": 1.2, + "epoch": 0.7981990673741759, + "fp16_scale": 1.0, + "global_step": 2482, + "grad_norm": 1.801017357698713, + "learning_rate": 2.0593359285845436e-07, + "loss": 0.391, + "step": 2482 + }, + { + "ETA": 1.2, + "epoch": 0.7985206624859302, + "fp16_scale": 1.0, + "global_step": 2483, + "grad_norm": 1.891796986862624, + "learning_rate": 2.053006716601251e-07, + "loss": 0.4492, + "step": 2483 + }, + { + "ETA": 1.19, + "epoch": 0.7988422575976846, + "fp16_scale": 1.0, + "global_step": 2484, + "grad_norm": 1.9651324001750246, + "learning_rate": 2.046686132971247e-07, + "loss": 0.4895, + "step": 2484 + }, + { + "ETA": 1.19, + "epoch": 0.7991638527094388, + "fp16_scale": 1.0, + "global_step": 2485, + "grad_norm": 2.0979890977550104, + "learning_rate": 2.0403741845570311e-07, + "loss": 0.3495, + "step": 2485 + }, + { + "ETA": 1.19, + "epoch": 0.7994854478211931, + "fp16_scale": 1.0, + "global_step": 2486, + "grad_norm": 2.0540695022360436, + "learning_rate": 2.0340708782117289e-07, + "loss": 0.3883, + "step": 2486 + }, + { + "ETA": 1.19, + "epoch": 0.7998070429329475, + "fp16_scale": 1.0, + "global_step": 2487, + "grad_norm": 1.957244486134892, + "learning_rate": 2.027776220779076e-07, + "loss": 0.4617, + "step": 2487 + }, + { + "ETA": 1.19, + "epoch": 0.8001286380447017, + "fp16_scale": 1.0, + "global_step": 2488, + "grad_norm": 1.8549920163018019, + "learning_rate": 2.0214902190934259e-07, + "loss": 0.401, + "step": 2488 + }, + { + "ETA": 1.18, + "epoch": 0.800450233156456, + "fp16_scale": 1.0, + "global_step": 2489, + "grad_norm": 2.012581416443408, + "learning_rate": 2.0152128799797253e-07, + "loss": 0.4624, + "step": 2489 + }, + { + "ETA": 1.18, + "epoch": 0.8007718282682104, + "fp16_scale": 1.0, + "global_step": 2490, + "grad_norm": 2.083048801769933, + "learning_rate": 2.0089442102535238e-07, + "loss": 0.4228, + "step": 2490 + }, + { + "ETA": 1.18, + "epoch": 0.8010934233799646, + "fp16_scale": 1.0, + "global_step": 2491, + "grad_norm": 2.0238387783724363, + "learning_rate": 2.0026842167209557e-07, + "loss": 0.4077, + "step": 2491 + }, + { + "ETA": 1.18, + "epoch": 0.8014150184917189, + "fp16_scale": 1.0, + "global_step": 2492, + "grad_norm": 1.9708344518247611, + "learning_rate": 1.99643290617873e-07, + "loss": 0.3877, + "step": 2492 + }, + { + "ETA": 1.18, + "epoch": 0.8017366136034733, + "fp16_scale": 1.0, + "global_step": 2493, + "grad_norm": 2.045313160170974, + "learning_rate": 1.9901902854141384e-07, + "loss": 0.4108, + "step": 2493 + }, + { + "ETA": 1.17, + "epoch": 0.8020582087152275, + "fp16_scale": 1.0, + "global_step": 2494, + "grad_norm": 1.7795147824502908, + "learning_rate": 1.983956361205027e-07, + "loss": 0.3458, + "step": 2494 + }, + { + "ETA": 1.17, + "epoch": 0.8023798038269818, + "fp16_scale": 1.0, + "global_step": 2495, + "grad_norm": 2.11604102663727, + "learning_rate": 1.9777311403198084e-07, + "loss": 0.4683, + "step": 2495 + }, + { + "ETA": 1.17, + "epoch": 0.8027013989387362, + "fp16_scale": 1.0, + "global_step": 2496, + "grad_norm": 1.9364654543210857, + "learning_rate": 1.971514629517438e-07, + "loss": 0.4086, + "step": 2496 + }, + { + "ETA": 1.17, + "epoch": 0.8030229940504904, + "fp16_scale": 1.0, + "global_step": 2497, + "grad_norm": 1.7758602896327396, + "learning_rate": 1.9653068355474212e-07, + "loss": 0.4368, + "step": 2497 + }, + { + "ETA": 1.17, + "epoch": 0.8033445891622447, + "fp16_scale": 1.0, + "global_step": 2498, + "grad_norm": 2.0880662850925247, + "learning_rate": 1.9591077651497977e-07, + "loss": 0.389, + "step": 2498 + }, + { + "ETA": 1.17, + "epoch": 0.803666184273999, + "fp16_scale": 1.0, + "global_step": 2499, + "grad_norm": 2.053422363344973, + "learning_rate": 1.9529174250551306e-07, + "loss": 0.4646, + "step": 2499 + }, + { + "ETA": 1.16, + "epoch": 0.8039877793857534, + "fp16_scale": 1.0, + "global_step": 2500, + "grad_norm": 1.9842140172033078, + "learning_rate": 1.946735821984513e-07, + "loss": 0.409, + "step": 2500 + }, + { + "ETA": 1.16, + "epoch": 0.8043093744975076, + "fp16_scale": 1.0, + "global_step": 2501, + "grad_norm": 2.2112034969132637, + "learning_rate": 1.94056296264954e-07, + "loss": 0.4633, + "step": 2501 + }, + { + "ETA": 1.16, + "epoch": 0.804630969609262, + "fp16_scale": 1.0, + "global_step": 2502, + "grad_norm": 1.9870326488709018, + "learning_rate": 1.9343988537523236e-07, + "loss": 0.3735, + "step": 2502 + }, + { + "ETA": 1.16, + "epoch": 0.8049525647210163, + "fp16_scale": 1.0, + "global_step": 2503, + "grad_norm": 1.994764445023778, + "learning_rate": 1.928243501985475e-07, + "loss": 0.4333, + "step": 2503 + }, + { + "ETA": 1.16, + "epoch": 0.8052741598327705, + "fp16_scale": 1.0, + "global_step": 2504, + "grad_norm": 1.9549067264880902, + "learning_rate": 1.9220969140320887e-07, + "loss": 0.4622, + "step": 2504 + }, + { + "ETA": 1.15, + "epoch": 0.8055957549445248, + "fp16_scale": 1.0, + "global_step": 2505, + "grad_norm": 2.040741415297998, + "learning_rate": 1.9159590965657534e-07, + "loss": 0.4144, + "step": 2505 + }, + { + "ETA": 1.15, + "epoch": 0.8059173500562792, + "fp16_scale": 1.0, + "global_step": 2506, + "grad_norm": 1.9184951697946278, + "learning_rate": 1.9098300562505264e-07, + "loss": 0.4467, + "step": 2506 + }, + { + "ETA": 1.15, + "epoch": 0.8062389451680334, + "fp16_scale": 1.0, + "global_step": 2507, + "grad_norm": 1.9242915649426138, + "learning_rate": 1.9037097997409436e-07, + "loss": 0.4843, + "step": 2507 + }, + { + "ETA": 1.15, + "epoch": 0.8065605402797877, + "fp16_scale": 1.0, + "global_step": 2508, + "grad_norm": 2.276806397279408, + "learning_rate": 1.8975983336820022e-07, + "loss": 0.4614, + "step": 2508 + }, + { + "ETA": 1.15, + "epoch": 0.8068821353915421, + "fp16_scale": 1.0, + "global_step": 2509, + "grad_norm": 2.0156629218691373, + "learning_rate": 1.8914956647091497e-07, + "loss": 0.4553, + "step": 2509 + }, + { + "ETA": 1.14, + "epoch": 0.8072037305032963, + "fp16_scale": 1.0, + "global_step": 2510, + "grad_norm": 2.0442056998554716, + "learning_rate": 1.8854017994482908e-07, + "loss": 0.4204, + "step": 2510 + }, + { + "ETA": 1.14, + "epoch": 0.8075253256150506, + "fp16_scale": 1.0, + "global_step": 2511, + "grad_norm": 2.264490983935522, + "learning_rate": 1.8793167445157608e-07, + "loss": 0.4189, + "step": 2511 + }, + { + "ETA": 1.14, + "epoch": 0.807846920726805, + "fp16_scale": 1.0, + "global_step": 2512, + "grad_norm": 1.9185061905528236, + "learning_rate": 1.8732405065183432e-07, + "loss": 0.4458, + "step": 2512 + }, + { + "ETA": 1.14, + "epoch": 0.8081685158385593, + "fp16_scale": 1.0, + "global_step": 2513, + "grad_norm": 1.8712179718797177, + "learning_rate": 1.8671730920532335e-07, + "loss": 0.3941, + "step": 2513 + }, + { + "ETA": 1.14, + "epoch": 0.8084901109503135, + "fp16_scale": 1.0, + "global_step": 2514, + "grad_norm": 1.7769623459440849, + "learning_rate": 1.8611145077080592e-07, + "loss": 0.3725, + "step": 2514 + }, + { + "ETA": 1.14, + "epoch": 0.8088117060620679, + "fp16_scale": 1.0, + "global_step": 2515, + "grad_norm": 1.7695957466228178, + "learning_rate": 1.8550647600608572e-07, + "loss": 0.3891, + "step": 2515 + }, + { + "ETA": 1.13, + "epoch": 0.8091333011738222, + "fp16_scale": 1.0, + "global_step": 2516, + "grad_norm": 1.9458140690971688, + "learning_rate": 1.8490238556800641e-07, + "loss": 0.3958, + "step": 2516 + }, + { + "ETA": 1.13, + "epoch": 0.8094548962855764, + "fp16_scale": 1.0, + "global_step": 2517, + "grad_norm": 1.9830296884981868, + "learning_rate": 1.842991801124526e-07, + "loss": 0.4352, + "step": 2517 + }, + { + "ETA": 1.13, + "epoch": 0.8097764913973308, + "fp16_scale": 1.0, + "global_step": 2518, + "grad_norm": 2.119460266750053, + "learning_rate": 1.8369686029434673e-07, + "loss": 0.4216, + "step": 2518 + }, + { + "ETA": 1.13, + "epoch": 0.8100980865090851, + "fp16_scale": 1.0, + "global_step": 2519, + "grad_norm": 2.100373649115543, + "learning_rate": 1.830954267676509e-07, + "loss": 0.4544, + "step": 2519 + }, + { + "ETA": 1.13, + "epoch": 0.8104196816208393, + "fp16_scale": 1.0, + "global_step": 2520, + "grad_norm": 2.1256103975037255, + "learning_rate": 1.824948801853643e-07, + "loss": 0.4149, + "step": 2520 + }, + { + "ETA": 1.12, + "epoch": 0.8107412767325937, + "fp16_scale": 1.0, + "global_step": 2521, + "grad_norm": 1.8692798779298303, + "learning_rate": 1.8189522119952304e-07, + "loss": 0.4557, + "step": 2521 + }, + { + "ETA": 1.12, + "epoch": 0.811062871844348, + "fp16_scale": 1.0, + "global_step": 2522, + "grad_norm": 2.1134773429500235, + "learning_rate": 1.8129645046120002e-07, + "loss": 0.3747, + "step": 2522 + }, + { + "ETA": 1.12, + "epoch": 0.8113844669561022, + "fp16_scale": 1.0, + "global_step": 2523, + "grad_norm": 1.943583459154084, + "learning_rate": 1.8069856862050303e-07, + "loss": 0.3523, + "step": 2523 + }, + { + "ETA": 1.12, + "epoch": 0.8117060620678566, + "fp16_scale": 1.0, + "global_step": 2524, + "grad_norm": 1.9621807831039595, + "learning_rate": 1.801015763265754e-07, + "loss": 0.5122, + "step": 2524 + }, + { + "ETA": 1.12, + "epoch": 0.8120276571796109, + "fp16_scale": 1.0, + "global_step": 2525, + "grad_norm": 1.8613041917834738, + "learning_rate": 1.7950547422759454e-07, + "loss": 0.416, + "step": 2525 + }, + { + "ETA": 1.11, + "epoch": 0.8123492522913651, + "fp16_scale": 1.0, + "global_step": 2526, + "grad_norm": 2.0614993988677512, + "learning_rate": 1.7891026297077094e-07, + "loss": 0.4028, + "step": 2526 + }, + { + "ETA": 1.11, + "epoch": 0.8126708474031195, + "fp16_scale": 1.0, + "global_step": 2527, + "grad_norm": 1.9527363939156346, + "learning_rate": 1.7831594320234844e-07, + "loss": 0.4435, + "step": 2527 + }, + { + "ETA": 1.11, + "epoch": 0.8129924425148738, + "fp16_scale": 1.0, + "global_step": 2528, + "grad_norm": 2.047131825759397, + "learning_rate": 1.777225155676021e-07, + "loss": 0.4773, + "step": 2528 + }, + { + "ETA": 1.11, + "epoch": 0.8133140376266281, + "fp16_scale": 1.0, + "global_step": 2529, + "grad_norm": 2.3245541025753864, + "learning_rate": 1.771299807108394e-07, + "loss": 0.4465, + "step": 2529 + }, + { + "ETA": 1.11, + "epoch": 0.8136356327383824, + "fp16_scale": 1.0, + "global_step": 2530, + "grad_norm": 2.209518083176055, + "learning_rate": 1.7653833927539773e-07, + "loss": 0.4224, + "step": 2530 + }, + { + "ETA": 1.1, + "epoch": 0.8139572278501367, + "fp16_scale": 1.0, + "global_step": 2531, + "grad_norm": 1.865852104287864, + "learning_rate": 1.7594759190364517e-07, + "loss": 0.4575, + "step": 2531 + }, + { + "ETA": 1.1, + "epoch": 0.814278822961891, + "fp16_scale": 1.0, + "global_step": 2532, + "grad_norm": 2.3157858690210467, + "learning_rate": 1.7535773923697828e-07, + "loss": 0.464, + "step": 2532 + }, + { + "ETA": 1.1, + "epoch": 0.8146004180736452, + "fp16_scale": 1.0, + "global_step": 2533, + "grad_norm": 1.8916738019343446, + "learning_rate": 1.7476878191582245e-07, + "loss": 0.4532, + "step": 2533 + }, + { + "ETA": 1.1, + "epoch": 0.8149220131853996, + "fp16_scale": 1.0, + "global_step": 2534, + "grad_norm": 2.141556612925703, + "learning_rate": 1.741807205796314e-07, + "loss": 0.4473, + "step": 2534 + }, + { + "ETA": 1.1, + "epoch": 0.8152436082971539, + "fp16_scale": 1.0, + "global_step": 2535, + "grad_norm": 1.8035567061387987, + "learning_rate": 1.7359355586688506e-07, + "loss": 0.4375, + "step": 2535 + }, + { + "ETA": 1.09, + "epoch": 0.8155652034089081, + "fp16_scale": 1.0, + "global_step": 2536, + "grad_norm": 1.8545618445407024, + "learning_rate": 1.7300728841509161e-07, + "loss": 0.3985, + "step": 2536 + }, + { + "ETA": 1.09, + "epoch": 0.8158867985206625, + "fp16_scale": 1.0, + "global_step": 2537, + "grad_norm": 1.9846775897391127, + "learning_rate": 1.7242191886078328e-07, + "loss": 0.4467, + "step": 2537 + }, + { + "ETA": 1.09, + "epoch": 0.8162083936324168, + "fp16_scale": 1.0, + "global_step": 2538, + "grad_norm": 2.013589406256994, + "learning_rate": 1.7183744783951792e-07, + "loss": 0.4546, + "step": 2538 + }, + { + "ETA": 1.09, + "epoch": 0.816529988744171, + "fp16_scale": 1.0, + "global_step": 2539, + "grad_norm": 2.1691878399885316, + "learning_rate": 1.712538759858786e-07, + "loss": 0.4676, + "step": 2539 + }, + { + "ETA": 1.09, + "epoch": 0.8168515838559254, + "fp16_scale": 1.0, + "global_step": 2540, + "grad_norm": 2.1063781504713495, + "learning_rate": 1.7067120393347078e-07, + "loss": 0.4208, + "step": 2540 + }, + { + "ETA": 1.09, + "epoch": 0.8171731789676797, + "fp16_scale": 1.0, + "global_step": 2541, + "grad_norm": 1.8546166854224735, + "learning_rate": 1.700894323149241e-07, + "loss": 0.4607, + "step": 2541 + }, + { + "ETA": 1.08, + "epoch": 0.8174947740794339, + "fp16_scale": 1.0, + "global_step": 2542, + "grad_norm": 1.9700426747155693, + "learning_rate": 1.6950856176189032e-07, + "loss": 0.347, + "step": 2542 + }, + { + "ETA": 1.08, + "epoch": 0.8178163691911883, + "fp16_scale": 1.0, + "global_step": 2543, + "grad_norm": 1.8289973718165615, + "learning_rate": 1.6892859290504236e-07, + "loss": 0.4975, + "step": 2543 + }, + { + "ETA": 1.08, + "epoch": 0.8181379643029426, + "fp16_scale": 1.0, + "global_step": 2544, + "grad_norm": 1.9524794021852336, + "learning_rate": 1.6834952637407484e-07, + "loss": 0.3824, + "step": 2544 + }, + { + "ETA": 1.08, + "epoch": 0.818459559414697, + "fp16_scale": 1.0, + "global_step": 2545, + "grad_norm": 2.019513824256391, + "learning_rate": 1.6777136279770198e-07, + "loss": 0.3291, + "step": 2545 + }, + { + "ETA": 1.08, + "epoch": 0.8187811545264512, + "fp16_scale": 1.0, + "global_step": 2546, + "grad_norm": 1.9393061584107576, + "learning_rate": 1.671941028036582e-07, + "loss": 0.4279, + "step": 2546 + }, + { + "ETA": 1.07, + "epoch": 0.8191027496382055, + "fp16_scale": 1.0, + "global_step": 2547, + "grad_norm": 1.9017947170826133, + "learning_rate": 1.666177470186967e-07, + "loss": 0.382, + "step": 2547 + }, + { + "ETA": 1.07, + "epoch": 0.8194243447499598, + "fp16_scale": 1.0, + "global_step": 2548, + "grad_norm": 2.101805217901928, + "learning_rate": 1.6604229606858898e-07, + "loss": 0.4719, + "step": 2548 + }, + { + "ETA": 1.07, + "epoch": 0.8197459398617141, + "fp16_scale": 1.0, + "global_step": 2549, + "grad_norm": 1.889196482177124, + "learning_rate": 1.65467750578124e-07, + "loss": 0.4763, + "step": 2549 + }, + { + "ETA": 1.07, + "epoch": 0.8200675349734684, + "fp16_scale": 1.0, + "global_step": 2550, + "grad_norm": 1.9490867359181265, + "learning_rate": 1.648941111711073e-07, + "loss": 0.4549, + "step": 2550 + }, + { + "ETA": 1.07, + "epoch": 0.8203891300852227, + "fp16_scale": 1.0, + "global_step": 2551, + "grad_norm": 1.776545407263281, + "learning_rate": 1.6432137847036142e-07, + "loss": 0.4146, + "step": 2551 + }, + { + "ETA": 1.06, + "epoch": 0.820710725196977, + "fp16_scale": 1.0, + "global_step": 2552, + "grad_norm": 1.9405113251265456, + "learning_rate": 1.6374955309772408e-07, + "loss": 0.4236, + "step": 2552 + }, + { + "ETA": 1.06, + "epoch": 0.8210323203087313, + "fp16_scale": 1.0, + "global_step": 2553, + "grad_norm": 2.1506190989888707, + "learning_rate": 1.631786356740479e-07, + "loss": 0.43, + "step": 2553 + }, + { + "ETA": 1.06, + "epoch": 0.8213539154204856, + "fp16_scale": 1.0, + "global_step": 2554, + "grad_norm": 1.9605395718845724, + "learning_rate": 1.6260862681919962e-07, + "loss": 0.3987, + "step": 2554 + }, + { + "ETA": 1.06, + "epoch": 0.8216755105322399, + "fp16_scale": 1.0, + "global_step": 2555, + "grad_norm": 1.9439136460842648, + "learning_rate": 1.6203952715205916e-07, + "loss": 0.4473, + "step": 2555 + }, + { + "ETA": 1.06, + "epoch": 0.8219971056439942, + "fp16_scale": 1.0, + "global_step": 2556, + "grad_norm": 1.8452912797379448, + "learning_rate": 1.6147133729052042e-07, + "loss": 0.4235, + "step": 2556 + }, + { + "ETA": 1.05, + "epoch": 0.8223187007557485, + "fp16_scale": 1.0, + "global_step": 2557, + "grad_norm": 1.9669060855358702, + "learning_rate": 1.6090405785148786e-07, + "loss": 0.4801, + "step": 2557 + }, + { + "ETA": 1.05, + "epoch": 0.8226402958675029, + "fp16_scale": 1.0, + "global_step": 2558, + "grad_norm": 1.9688328574724991, + "learning_rate": 1.6033768945087934e-07, + "loss": 0.4557, + "step": 2558 + }, + { + "ETA": 1.05, + "epoch": 0.8229618909792571, + "fp16_scale": 1.0, + "global_step": 2559, + "grad_norm": 1.937784146822814, + "learning_rate": 1.5977223270362194e-07, + "loss": 0.4149, + "step": 2559 + }, + { + "ETA": 1.05, + "epoch": 0.8232834860910114, + "fp16_scale": 1.0, + "global_step": 2560, + "grad_norm": 1.8457875986594938, + "learning_rate": 1.5920768822365416e-07, + "loss": 0.4222, + "step": 2560 + }, + { + "ETA": 1.05, + "epoch": 0.8236050812027658, + "fp16_scale": 1.0, + "global_step": 2561, + "grad_norm": 2.1278834935103, + "learning_rate": 1.58644056623923e-07, + "loss": 0.3592, + "step": 2561 + }, + { + "ETA": 1.05, + "epoch": 0.82392667631452, + "fp16_scale": 1.0, + "global_step": 2562, + "grad_norm": 1.9791492685702856, + "learning_rate": 1.5808133851638472e-07, + "loss": 0.4564, + "step": 2562 + }, + { + "ETA": 1.04, + "epoch": 0.8242482714262743, + "fp16_scale": 1.0, + "global_step": 2563, + "grad_norm": 1.923078974095852, + "learning_rate": 1.5751953451200384e-07, + "loss": 0.3662, + "step": 2563 + }, + { + "ETA": 1.04, + "epoch": 0.8245698665380287, + "fp16_scale": 1.0, + "global_step": 2564, + "grad_norm": 2.2040797032388064, + "learning_rate": 1.5695864522075254e-07, + "loss": 0.3789, + "step": 2564 + }, + { + "ETA": 1.04, + "epoch": 0.8248914616497829, + "fp16_scale": 1.0, + "global_step": 2565, + "grad_norm": 1.8952652841970785, + "learning_rate": 1.563986712516099e-07, + "loss": 0.4193, + "step": 2565 + }, + { + "ETA": 1.04, + "epoch": 0.8252130567615372, + "fp16_scale": 1.0, + "global_step": 2566, + "grad_norm": 1.9349291792067729, + "learning_rate": 1.5583961321256056e-07, + "loss": 0.419, + "step": 2566 + }, + { + "ETA": 1.04, + "epoch": 0.8255346518732916, + "fp16_scale": 1.0, + "global_step": 2567, + "grad_norm": 2.2330237281409895, + "learning_rate": 1.5528147171059514e-07, + "loss": 0.4592, + "step": 2567 + }, + { + "ETA": 1.03, + "epoch": 0.8258562469850458, + "fp16_scale": 1.0, + "global_step": 2568, + "grad_norm": 2.0969844108060323, + "learning_rate": 1.547242473517092e-07, + "loss": 0.4139, + "step": 2568 + }, + { + "ETA": 1.03, + "epoch": 0.8261778420968001, + "fp16_scale": 1.0, + "global_step": 2569, + "grad_norm": 2.0395160920440794, + "learning_rate": 1.5416794074090255e-07, + "loss": 0.328, + "step": 2569 + }, + { + "ETA": 1.03, + "epoch": 0.8264994372085545, + "fp16_scale": 1.0, + "global_step": 2570, + "grad_norm": 2.068855299846136, + "learning_rate": 1.5361255248217864e-07, + "loss": 0.4236, + "step": 2570 + }, + { + "ETA": 1.03, + "epoch": 0.8268210323203087, + "fp16_scale": 1.0, + "global_step": 2571, + "grad_norm": 2.210493605981183, + "learning_rate": 1.530580831785434e-07, + "loss": 0.5278, + "step": 2571 + }, + { + "ETA": 1.03, + "epoch": 0.827142627432063, + "fp16_scale": 1.0, + "global_step": 2572, + "grad_norm": 2.020852969906599, + "learning_rate": 1.525045334320051e-07, + "loss": 0.4251, + "step": 2572 + }, + { + "ETA": 1.02, + "epoch": 0.8274642225438174, + "fp16_scale": 1.0, + "global_step": 2573, + "grad_norm": 2.2499177103886714, + "learning_rate": 1.5195190384357404e-07, + "loss": 0.4723, + "step": 2573 + }, + { + "ETA": 1.02, + "epoch": 0.8277858176555717, + "fp16_scale": 1.0, + "global_step": 2574, + "grad_norm": 1.9345280398813554, + "learning_rate": 1.5140019501326108e-07, + "loss": 0.4186, + "step": 2574 + }, + { + "ETA": 1.02, + "epoch": 0.8281074127673259, + "fp16_scale": 1.0, + "global_step": 2575, + "grad_norm": 2.0608889214705353, + "learning_rate": 1.5084940754007792e-07, + "loss": 0.3819, + "step": 2575 + }, + { + "ETA": 1.02, + "epoch": 0.8284290078790802, + "fp16_scale": 1.0, + "global_step": 2576, + "grad_norm": 2.0624830140043606, + "learning_rate": 1.5029954202203487e-07, + "loss": 0.408, + "step": 2576 + }, + { + "ETA": 1.02, + "epoch": 0.8287506029908346, + "fp16_scale": 1.0, + "global_step": 2577, + "grad_norm": 2.0575361441388993, + "learning_rate": 1.497505990561424e-07, + "loss": 0.4474, + "step": 2577 + }, + { + "ETA": 1.01, + "epoch": 0.8290721981025888, + "fp16_scale": 1.0, + "global_step": 2578, + "grad_norm": 1.9315086577025948, + "learning_rate": 1.4920257923840862e-07, + "loss": 0.4437, + "step": 2578 + }, + { + "ETA": 1.01, + "epoch": 0.8293937932143431, + "fp16_scale": 1.0, + "global_step": 2579, + "grad_norm": 2.1467622948579743, + "learning_rate": 1.4865548316383892e-07, + "loss": 0.3698, + "step": 2579 + }, + { + "ETA": 1.01, + "epoch": 0.8297153883260975, + "fp16_scale": 1.0, + "global_step": 2580, + "grad_norm": 1.9042671934926998, + "learning_rate": 1.4810931142643734e-07, + "loss": 0.4359, + "step": 2580 + }, + { + "ETA": 1.01, + "epoch": 0.8300369834378517, + "fp16_scale": 1.0, + "global_step": 2581, + "grad_norm": 2.111438866818042, + "learning_rate": 1.475640646192028e-07, + "loss": 0.4663, + "step": 2581 + }, + { + "ETA": 1.01, + "epoch": 0.830358578549606, + "fp16_scale": 1.0, + "global_step": 2582, + "grad_norm": 1.850302035062646, + "learning_rate": 1.470197433341307e-07, + "loss": 0.3967, + "step": 2582 + }, + { + "ETA": 1.0, + "epoch": 0.8306801736613604, + "fp16_scale": 1.0, + "global_step": 2583, + "grad_norm": 2.000143436859863, + "learning_rate": 1.4647634816221132e-07, + "loss": 0.4992, + "step": 2583 + }, + { + "ETA": 1.0, + "epoch": 0.8310017687731146, + "fp16_scale": 1.0, + "global_step": 2584, + "grad_norm": 1.7734052413804926, + "learning_rate": 1.459338796934293e-07, + "loss": 0.4448, + "step": 2584 + }, + { + "ETA": 1.0, + "epoch": 0.8313233638848689, + "fp16_scale": 1.0, + "global_step": 2585, + "grad_norm": 1.9917405909212338, + "learning_rate": 1.4539233851676346e-07, + "loss": 0.4446, + "step": 2585 + }, + { + "ETA": 1.0, + "epoch": 0.8316449589966233, + "fp16_scale": 1.0, + "global_step": 2586, + "grad_norm": 1.9828738133796069, + "learning_rate": 1.4485172522018573e-07, + "loss": 0.3661, + "step": 2586 + }, + { + "ETA": 1.0, + "epoch": 0.8319665541083775, + "fp16_scale": 1.0, + "global_step": 2587, + "grad_norm": 1.8872427807549932, + "learning_rate": 1.443120403906608e-07, + "loss": 0.388, + "step": 2587 + }, + { + "ETA": 0.99, + "epoch": 0.8322881492201318, + "fp16_scale": 1.0, + "global_step": 2588, + "grad_norm": 1.8174817377215846, + "learning_rate": 1.4377328461414462e-07, + "loss": 0.3715, + "step": 2588 + }, + { + "ETA": 0.99, + "epoch": 0.8326097443318862, + "fp16_scale": 1.0, + "global_step": 2589, + "grad_norm": 2.1715968227378024, + "learning_rate": 1.4323545847558517e-07, + "loss": 0.4259, + "step": 2589 + }, + { + "ETA": 0.99, + "epoch": 0.8329313394436405, + "fp16_scale": 1.0, + "global_step": 2590, + "grad_norm": 1.9014250535210901, + "learning_rate": 1.4269856255892033e-07, + "loss": 0.3336, + "step": 2590 + }, + { + "ETA": 0.99, + "epoch": 0.8332529345553947, + "fp16_scale": 1.0, + "global_step": 2591, + "grad_norm": 1.9357048316498084, + "learning_rate": 1.421625974470788e-07, + "loss": 0.4073, + "step": 2591 + }, + { + "ETA": 0.99, + "epoch": 0.8335745296671491, + "fp16_scale": 1.0, + "global_step": 2592, + "grad_norm": 2.1649140880066517, + "learning_rate": 1.416275637219786e-07, + "loss": 0.4152, + "step": 2592 + }, + { + "ETA": 0.99, + "epoch": 0.8338961247789034, + "fp16_scale": 1.0, + "global_step": 2593, + "grad_norm": 1.8427275663772384, + "learning_rate": 1.4109346196452553e-07, + "loss": 0.4478, + "step": 2593 + }, + { + "ETA": 0.98, + "epoch": 0.8342177198906576, + "fp16_scale": 1.0, + "global_step": 2594, + "grad_norm": 1.7796464256515407, + "learning_rate": 1.4056029275461478e-07, + "loss": 0.4066, + "step": 2594 + }, + { + "ETA": 0.98, + "epoch": 0.834539315002412, + "fp16_scale": 1.0, + "global_step": 2595, + "grad_norm": 1.9569002544326037, + "learning_rate": 1.4002805667112817e-07, + "loss": 0.4346, + "step": 2595 + }, + { + "ETA": 0.98, + "epoch": 0.8348609101141663, + "fp16_scale": 1.0, + "global_step": 2596, + "grad_norm": 2.178954877658416, + "learning_rate": 1.3949675429193465e-07, + "loss": 0.4346, + "step": 2596 + }, + { + "ETA": 0.98, + "epoch": 0.8351825052259205, + "fp16_scale": 1.0, + "global_step": 2597, + "grad_norm": 2.083800666836877, + "learning_rate": 1.3896638619388978e-07, + "loss": 0.3998, + "step": 2597 + }, + { + "ETA": 0.98, + "epoch": 0.8355041003376749, + "fp16_scale": 1.0, + "global_step": 2598, + "grad_norm": 1.98421065941732, + "learning_rate": 1.3843695295283408e-07, + "loss": 0.3668, + "step": 2598 + }, + { + "ETA": 0.97, + "epoch": 0.8358256954494292, + "fp16_scale": 1.0, + "global_step": 2599, + "grad_norm": 1.9859012175962345, + "learning_rate": 1.379084551435936e-07, + "loss": 0.4887, + "step": 2599 + }, + { + "ETA": 0.97, + "epoch": 0.8361472905611834, + "fp16_scale": 1.0, + "global_step": 2600, + "grad_norm": 1.9265365356699595, + "learning_rate": 1.373808933399785e-07, + "loss": 0.4329, + "step": 2600 + }, + { + "ETA": 0.97, + "epoch": 0.8364688856729378, + "fp16_scale": 1.0, + "global_step": 2601, + "grad_norm": 1.9302765311369212, + "learning_rate": 1.368542681147824e-07, + "loss": 0.3832, + "step": 2601 + }, + { + "ETA": 0.97, + "epoch": 0.8367904807846921, + "fp16_scale": 1.0, + "global_step": 2602, + "grad_norm": 2.0436185754092717, + "learning_rate": 1.3632858003978264e-07, + "loss": 0.44, + "step": 2602 + }, + { + "ETA": 0.97, + "epoch": 0.8371120758964464, + "fp16_scale": 1.0, + "global_step": 2603, + "grad_norm": 2.0618549069106717, + "learning_rate": 1.358038296857389e-07, + "loss": 0.4329, + "step": 2603 + }, + { + "ETA": 0.97, + "epoch": 0.8374336710082007, + "fp16_scale": 1.0, + "global_step": 2604, + "grad_norm": 1.9472952381372493, + "learning_rate": 1.352800176223926e-07, + "loss": 0.4845, + "step": 2604 + }, + { + "ETA": 0.96, + "epoch": 0.837755266119955, + "fp16_scale": 1.0, + "global_step": 2605, + "grad_norm": 1.8453708180134525, + "learning_rate": 1.3475714441846608e-07, + "loss": 0.4239, + "step": 2605 + }, + { + "ETA": 0.96, + "epoch": 0.8380768612317093, + "fp16_scale": 1.0, + "global_step": 2606, + "grad_norm": 1.8924860485851713, + "learning_rate": 1.3423521064166333e-07, + "loss": 0.4157, + "step": 2606 + }, + { + "ETA": 0.96, + "epoch": 0.8383984563434635, + "fp16_scale": 1.0, + "global_step": 2607, + "grad_norm": 1.9001377997466726, + "learning_rate": 1.3371421685866702e-07, + "loss": 0.3487, + "step": 2607 + }, + { + "ETA": 0.96, + "epoch": 0.8387200514552179, + "fp16_scale": 1.0, + "global_step": 2608, + "grad_norm": 2.068520573315801, + "learning_rate": 1.3319416363514025e-07, + "loss": 0.4534, + "step": 2608 + }, + { + "ETA": 0.96, + "epoch": 0.8390416465669722, + "fp16_scale": 1.0, + "global_step": 2609, + "grad_norm": 2.1990426728773427, + "learning_rate": 1.32675051535725e-07, + "loss": 0.36, + "step": 2609 + }, + { + "ETA": 0.95, + "epoch": 0.8393632416787264, + "fp16_scale": 1.0, + "global_step": 2610, + "grad_norm": 1.8830612508120763, + "learning_rate": 1.3215688112404043e-07, + "loss": 0.4514, + "step": 2610 + }, + { + "ETA": 0.95, + "epoch": 0.8396848367904808, + "fp16_scale": 1.0, + "global_step": 2611, + "grad_norm": 1.9048686120765927, + "learning_rate": 1.316396529626843e-07, + "loss": 0.4104, + "step": 2611 + }, + { + "ETA": 0.95, + "epoch": 0.8400064319022351, + "fp16_scale": 1.0, + "global_step": 2612, + "grad_norm": 1.9535023774292661, + "learning_rate": 1.311233676132306e-07, + "loss": 0.3632, + "step": 2612 + }, + { + "ETA": 0.95, + "epoch": 0.8403280270139893, + "fp16_scale": 1.0, + "global_step": 2613, + "grad_norm": 1.9993295777536153, + "learning_rate": 1.306080256362302e-07, + "loss": 0.4098, + "step": 2613 + }, + { + "ETA": 0.95, + "epoch": 0.8406496221257437, + "fp16_scale": 1.0, + "global_step": 2614, + "grad_norm": 1.9916304814324868, + "learning_rate": 1.3009362759120978e-07, + "loss": 0.4332, + "step": 2614 + }, + { + "ETA": 0.94, + "epoch": 0.840971217237498, + "fp16_scale": 1.0, + "global_step": 2615, + "grad_norm": 1.9272510478037563, + "learning_rate": 1.2958017403667033e-07, + "loss": 0.3614, + "step": 2615 + }, + { + "ETA": 0.94, + "epoch": 0.8412928123492522, + "fp16_scale": 1.0, + "global_step": 2616, + "grad_norm": 2.10425792824437, + "learning_rate": 1.2906766553008842e-07, + "loss": 0.341, + "step": 2616 + }, + { + "ETA": 0.94, + "epoch": 0.8416144074610066, + "fp16_scale": 1.0, + "global_step": 2617, + "grad_norm": 1.9511163169652415, + "learning_rate": 1.285561026279136e-07, + "loss": 0.4548, + "step": 2617 + }, + { + "ETA": 0.94, + "epoch": 0.8419360025727609, + "fp16_scale": 1.0, + "global_step": 2618, + "grad_norm": 2.0474974091401017, + "learning_rate": 1.280454858855694e-07, + "loss": 0.3889, + "step": 2618 + }, + { + "ETA": 0.94, + "epoch": 0.8422575976845152, + "fp16_scale": 1.0, + "global_step": 2619, + "grad_norm": 3.18643441734776, + "learning_rate": 1.2753581585745222e-07, + "loss": 0.4233, + "step": 2619 + }, + { + "ETA": 0.94, + "epoch": 0.8425791927962695, + "fp16_scale": 1.0, + "global_step": 2620, + "grad_norm": 1.8786996363129471, + "learning_rate": 1.2702709309692962e-07, + "loss": 0.4216, + "step": 2620 + }, + { + "ETA": 0.93, + "epoch": 0.8429007879080238, + "fp16_scale": 1.0, + "global_step": 2621, + "grad_norm": 1.956621262661703, + "learning_rate": 1.2651931815634175e-07, + "loss": 0.4248, + "step": 2621 + }, + { + "ETA": 0.93, + "epoch": 0.8432223830197781, + "fp16_scale": 1.0, + "global_step": 2622, + "grad_norm": 1.8212862537387442, + "learning_rate": 1.260124915869988e-07, + "loss": 0.4636, + "step": 2622 + }, + { + "ETA": 0.93, + "epoch": 0.8435439781315324, + "fp16_scale": 1.0, + "global_step": 2623, + "grad_norm": 1.7661124921171842, + "learning_rate": 1.2550661393918215e-07, + "loss": 0.3898, + "step": 2623 + }, + { + "ETA": 0.93, + "epoch": 0.8438655732432867, + "fp16_scale": 1.0, + "global_step": 2624, + "grad_norm": 1.955439051149376, + "learning_rate": 1.2500168576214197e-07, + "loss": 0.4064, + "step": 2624 + }, + { + "ETA": 0.93, + "epoch": 0.844187168355041, + "fp16_scale": 1.0, + "global_step": 2625, + "grad_norm": 1.96149002647226, + "learning_rate": 1.2449770760409816e-07, + "loss": 0.4436, + "step": 2625 + }, + { + "ETA": 0.92, + "epoch": 0.8445087634667953, + "fp16_scale": 1.0, + "global_step": 2626, + "grad_norm": 2.0730734735820278, + "learning_rate": 1.2399468001223933e-07, + "loss": 0.3844, + "step": 2626 + }, + { + "ETA": 0.92, + "epoch": 0.8448303585785496, + "fp16_scale": 1.0, + "global_step": 2627, + "grad_norm": 1.697317809477219, + "learning_rate": 1.2349260353272117e-07, + "loss": 0.4151, + "step": 2627 + }, + { + "ETA": 0.92, + "epoch": 0.8451519536903039, + "fp16_scale": 1.0, + "global_step": 2628, + "grad_norm": 2.0335135160703293, + "learning_rate": 1.2299147871066772e-07, + "loss": 0.4135, + "step": 2628 + }, + { + "ETA": 0.92, + "epoch": 0.8454735488020582, + "fp16_scale": 1.0, + "global_step": 2629, + "grad_norm": 1.9677654311782051, + "learning_rate": 1.2249130609016878e-07, + "loss": 0.3195, + "step": 2629 + }, + { + "ETA": 0.92, + "epoch": 0.8457951439138125, + "fp16_scale": 1.0, + "global_step": 2630, + "grad_norm": 1.9458438484963143, + "learning_rate": 1.2199208621428114e-07, + "loss": 0.4446, + "step": 2630 + }, + { + "ETA": 0.91, + "epoch": 0.8461167390255668, + "fp16_scale": 1.0, + "global_step": 2631, + "grad_norm": 1.9836968497409233, + "learning_rate": 1.2149381962502704e-07, + "loss": 0.4003, + "step": 2631 + }, + { + "ETA": 0.91, + "epoch": 0.8464383341373211, + "fp16_scale": 1.0, + "global_step": 2632, + "grad_norm": 1.939942071471052, + "learning_rate": 1.2099650686339303e-07, + "loss": 0.4848, + "step": 2632 + }, + { + "ETA": 0.91, + "epoch": 0.8467599292490754, + "fp16_scale": 1.0, + "global_step": 2633, + "grad_norm": 1.9449697207489793, + "learning_rate": 1.2050014846933088e-07, + "loss": 0.3906, + "step": 2633 + }, + { + "ETA": 0.91, + "epoch": 0.8470815243608297, + "fp16_scale": 1.0, + "global_step": 2634, + "grad_norm": 2.130988635225944, + "learning_rate": 1.200047449817555e-07, + "loss": 0.4357, + "step": 2634 + }, + { + "ETA": 0.91, + "epoch": 0.8474031194725841, + "fp16_scale": 1.0, + "global_step": 2635, + "grad_norm": 1.9468733306836092, + "learning_rate": 1.195102969385454e-07, + "loss": 0.4015, + "step": 2635 + }, + { + "ETA": 0.9, + "epoch": 0.8477247145843383, + "fp16_scale": 1.0, + "global_step": 2636, + "grad_norm": 1.874569127546959, + "learning_rate": 1.1901680487654198e-07, + "loss": 0.3889, + "step": 2636 + }, + { + "ETA": 0.9, + "epoch": 0.8480463096960926, + "fp16_scale": 1.0, + "global_step": 2637, + "grad_norm": 2.015964467356171, + "learning_rate": 1.185242693315479e-07, + "loss": 0.4379, + "step": 2637 + }, + { + "ETA": 0.9, + "epoch": 0.848367904807847, + "fp16_scale": 1.0, + "global_step": 2638, + "grad_norm": 2.09912297413883, + "learning_rate": 1.1803269083832812e-07, + "loss": 0.391, + "step": 2638 + }, + { + "ETA": 0.9, + "epoch": 0.8486894999196012, + "fp16_scale": 1.0, + "global_step": 2639, + "grad_norm": 2.099103717179329, + "learning_rate": 1.175420699306079e-07, + "loss": 0.4343, + "step": 2639 + }, + { + "ETA": 0.9, + "epoch": 0.8490110950313555, + "fp16_scale": 1.0, + "global_step": 2640, + "grad_norm": 1.9964999767032128, + "learning_rate": 1.1705240714107301e-07, + "loss": 0.3826, + "step": 2640 + }, + { + "ETA": 0.9, + "epoch": 0.8493326901431099, + "fp16_scale": 1.0, + "global_step": 2641, + "grad_norm": 1.9542936081754776, + "learning_rate": 1.1656370300136942e-07, + "loss": 0.3872, + "step": 2641 + }, + { + "ETA": 0.89, + "epoch": 0.8496542852548641, + "fp16_scale": 1.0, + "global_step": 2642, + "grad_norm": 2.152354652101079, + "learning_rate": 1.1607595804210124e-07, + "loss": 0.5051, + "step": 2642 + }, + { + "ETA": 0.89, + "epoch": 0.8499758803666184, + "fp16_scale": 1.0, + "global_step": 2643, + "grad_norm": 2.0311135244125484, + "learning_rate": 1.1558917279283231e-07, + "loss": 0.4417, + "step": 2643 + }, + { + "ETA": 0.89, + "epoch": 0.8502974754783728, + "fp16_scale": 1.0, + "global_step": 2644, + "grad_norm": 2.1813857236598975, + "learning_rate": 1.151033477820833e-07, + "loss": 0.4612, + "step": 2644 + }, + { + "ETA": 0.89, + "epoch": 0.850619070590127, + "fp16_scale": 1.0, + "global_step": 2645, + "grad_norm": 2.149405263063172, + "learning_rate": 1.1461848353733361e-07, + "loss": 0.4194, + "step": 2645 + }, + { + "ETA": 0.89, + "epoch": 0.8509406657018813, + "fp16_scale": 1.0, + "global_step": 2646, + "grad_norm": 2.2238454576920534, + "learning_rate": 1.1413458058501802e-07, + "loss": 0.4094, + "step": 2646 + }, + { + "ETA": 0.88, + "epoch": 0.8512622608136357, + "fp16_scale": 1.0, + "global_step": 2647, + "grad_norm": 1.8842189951772699, + "learning_rate": 1.1365163945052925e-07, + "loss": 0.3989, + "step": 2647 + }, + { + "ETA": 0.88, + "epoch": 0.85158385592539, + "fp16_scale": 1.0, + "global_step": 2648, + "grad_norm": 1.8363827348750568, + "learning_rate": 1.1316966065821454e-07, + "loss": 0.4368, + "step": 2648 + }, + { + "ETA": 0.88, + "epoch": 0.8519054510371442, + "fp16_scale": 1.0, + "global_step": 2649, + "grad_norm": 1.9344762430833913, + "learning_rate": 1.1268864473137629e-07, + "loss": 0.4235, + "step": 2649 + }, + { + "ETA": 0.88, + "epoch": 0.8522270461488985, + "fp16_scale": 1.0, + "global_step": 2650, + "grad_norm": 2.07811357918809, + "learning_rate": 1.122085921922723e-07, + "loss": 0.5248, + "step": 2650 + }, + { + "ETA": 0.88, + "epoch": 0.8525486412606529, + "fp16_scale": 1.0, + "global_step": 2651, + "grad_norm": 1.8786943966368947, + "learning_rate": 1.1172950356211353e-07, + "loss": 0.469, + "step": 2651 + }, + { + "ETA": 0.87, + "epoch": 0.8528702363724071, + "fp16_scale": 1.0, + "global_step": 2652, + "grad_norm": 1.8848309361301367, + "learning_rate": 1.1125137936106487e-07, + "loss": 0.4202, + "step": 2652 + }, + { + "ETA": 0.87, + "epoch": 0.8531918314841614, + "fp16_scale": 1.0, + "global_step": 2653, + "grad_norm": 1.974186893266101, + "learning_rate": 1.1077422010824422e-07, + "loss": 0.4633, + "step": 2653 + }, + { + "ETA": 0.87, + "epoch": 0.8535134265959158, + "fp16_scale": 1.0, + "global_step": 2654, + "grad_norm": 2.0723784802037435, + "learning_rate": 1.1029802632172114e-07, + "loss": 0.4298, + "step": 2654 + }, + { + "ETA": 0.87, + "epoch": 0.85383502170767, + "fp16_scale": 1.0, + "global_step": 2655, + "grad_norm": 2.0158722543798717, + "learning_rate": 1.0982279851851773e-07, + "loss": 0.4086, + "step": 2655 + }, + { + "ETA": 0.87, + "epoch": 0.8541566168194243, + "fp16_scale": 1.0, + "global_step": 2656, + "grad_norm": 2.017577807401529, + "learning_rate": 1.0934853721460669e-07, + "loss": 0.5093, + "step": 2656 + }, + { + "ETA": 0.87, + "epoch": 0.8544782119311787, + "fp16_scale": 1.0, + "global_step": 2657, + "grad_norm": 2.142932809761941, + "learning_rate": 1.0887524292491146e-07, + "loss": 0.4552, + "step": 2657 + }, + { + "ETA": 0.86, + "epoch": 0.8547998070429329, + "fp16_scale": 1.0, + "global_step": 2658, + "grad_norm": 2.255076092998724, + "learning_rate": 1.0840291616330621e-07, + "loss": 0.4351, + "step": 2658 + }, + { + "ETA": 0.86, + "epoch": 0.8551214021546872, + "fp16_scale": 1.0, + "global_step": 2659, + "grad_norm": 2.0571021381863024, + "learning_rate": 1.079315574426135e-07, + "loss": 0.4973, + "step": 2659 + }, + { + "ETA": 0.86, + "epoch": 0.8554429972664416, + "fp16_scale": 1.0, + "global_step": 2660, + "grad_norm": 2.0775891851057935, + "learning_rate": 1.0746116727460585e-07, + "loss": 0.4605, + "step": 2660 + }, + { + "ETA": 0.86, + "epoch": 0.8557645923781958, + "fp16_scale": 1.0, + "global_step": 2661, + "grad_norm": 2.12686010950699, + "learning_rate": 1.0699174617000351e-07, + "loss": 0.3744, + "step": 2661 + }, + { + "ETA": 0.86, + "epoch": 0.8560861874899501, + "fp16_scale": 1.0, + "global_step": 2662, + "grad_norm": 1.9566894590483266, + "learning_rate": 1.0652329463847497e-07, + "loss": 0.3784, + "step": 2662 + }, + { + "ETA": 0.85, + "epoch": 0.8564077826017045, + "fp16_scale": 1.0, + "global_step": 2663, + "grad_norm": 1.8731186442283223, + "learning_rate": 1.0605581318863576e-07, + "loss": 0.3945, + "step": 2663 + }, + { + "ETA": 0.85, + "epoch": 0.8567293777134588, + "fp16_scale": 1.0, + "global_step": 2664, + "grad_norm": 1.866417490280438, + "learning_rate": 1.0558930232804874e-07, + "loss": 0.4314, + "step": 2664 + }, + { + "ETA": 0.85, + "epoch": 0.857050972825213, + "fp16_scale": 1.0, + "global_step": 2665, + "grad_norm": 1.9607139553663306, + "learning_rate": 1.0512376256322231e-07, + "loss": 0.4226, + "step": 2665 + }, + { + "ETA": 0.85, + "epoch": 0.8573725679369674, + "fp16_scale": 1.0, + "global_step": 2666, + "grad_norm": 2.044435407805264, + "learning_rate": 1.0465919439961024e-07, + "loss": 0.4644, + "step": 2666 + }, + { + "ETA": 0.85, + "epoch": 0.8576941630487217, + "fp16_scale": 1.0, + "global_step": 2667, + "grad_norm": 2.2725526784619823, + "learning_rate": 1.0419559834161262e-07, + "loss": 0.4162, + "step": 2667 + }, + { + "ETA": 0.84, + "epoch": 0.8580157581604759, + "fp16_scale": 1.0, + "global_step": 2668, + "grad_norm": 2.093071455508944, + "learning_rate": 1.0373297489257271e-07, + "loss": 0.4536, + "step": 2668 + }, + { + "ETA": 0.84, + "epoch": 0.8583373532722303, + "fp16_scale": 1.0, + "global_step": 2669, + "grad_norm": 2.008772185369336, + "learning_rate": 1.0327132455477872e-07, + "loss": 0.4453, + "step": 2669 + }, + { + "ETA": 0.84, + "epoch": 0.8586589483839846, + "fp16_scale": 1.0, + "global_step": 2670, + "grad_norm": 2.178045069885504, + "learning_rate": 1.0281064782946213e-07, + "loss": 0.4281, + "step": 2670 + }, + { + "ETA": 0.84, + "epoch": 0.8589805434957388, + "fp16_scale": 1.0, + "global_step": 2671, + "grad_norm": 1.7575765820723166, + "learning_rate": 1.0235094521679688e-07, + "loss": 0.3138, + "step": 2671 + }, + { + "ETA": 0.84, + "epoch": 0.8593021386074932, + "fp16_scale": 1.0, + "global_step": 2672, + "grad_norm": 1.800148660908802, + "learning_rate": 1.0189221721590002e-07, + "loss": 0.3898, + "step": 2672 + }, + { + "ETA": 0.83, + "epoch": 0.8596237337192475, + "fp16_scale": 1.0, + "global_step": 2673, + "grad_norm": 1.8282739859272057, + "learning_rate": 1.014344643248295e-07, + "loss": 0.4007, + "step": 2673 + }, + { + "ETA": 0.83, + "epoch": 0.8599453288310017, + "fp16_scale": 1.0, + "global_step": 2674, + "grad_norm": 1.9206677472736489, + "learning_rate": 1.0097768704058541e-07, + "loss": 0.3872, + "step": 2674 + }, + { + "ETA": 0.83, + "epoch": 0.8602669239427561, + "fp16_scale": 1.0, + "global_step": 2675, + "grad_norm": 1.921346141580359, + "learning_rate": 1.0052188585910837e-07, + "loss": 0.4243, + "step": 2675 + }, + { + "ETA": 0.83, + "epoch": 0.8605885190545104, + "fp16_scale": 1.0, + "global_step": 2676, + "grad_norm": 2.109667789704599, + "learning_rate": 1.0006706127527864e-07, + "loss": 0.3659, + "step": 2676 + }, + { + "ETA": 0.83, + "epoch": 0.8609101141662646, + "fp16_scale": 1.0, + "global_step": 2677, + "grad_norm": 2.4039133973685844, + "learning_rate": 9.961321378291709e-08, + "loss": 0.3993, + "step": 2677 + }, + { + "ETA": 0.82, + "epoch": 0.861231709278019, + "fp16_scale": 1.0, + "global_step": 2678, + "grad_norm": 1.9960946402114894, + "learning_rate": 9.916034387478277e-08, + "loss": 0.4358, + "step": 2678 + }, + { + "ETA": 0.82, + "epoch": 0.8615533043897733, + "fp16_scale": 1.0, + "global_step": 2679, + "grad_norm": 1.8065461152088664, + "learning_rate": 9.870845204257394e-08, + "loss": 0.4388, + "step": 2679 + }, + { + "ETA": 0.82, + "epoch": 0.8618748995015276, + "fp16_scale": 1.0, + "global_step": 2680, + "grad_norm": 2.033474167397842, + "learning_rate": 9.825753877692689e-08, + "loss": 0.4701, + "step": 2680 + }, + { + "ETA": 0.82, + "epoch": 0.8621964946132818, + "fp16_scale": 1.0, + "global_step": 2681, + "grad_norm": 1.9026420844908551, + "learning_rate": 9.780760456741554e-08, + "loss": 0.4008, + "step": 2681 + }, + { + "ETA": 0.82, + "epoch": 0.8625180897250362, + "fp16_scale": 1.0, + "global_step": 2682, + "grad_norm": 2.306147518834476, + "learning_rate": 9.735864990255016e-08, + "loss": 0.4138, + "step": 2682 + }, + { + "ETA": 0.82, + "epoch": 0.8628396848367905, + "fp16_scale": 1.0, + "global_step": 2683, + "grad_norm": 1.7908639279612824, + "learning_rate": 9.691067526977803e-08, + "loss": 0.4172, + "step": 2683 + }, + { + "ETA": 0.81, + "epoch": 0.8631612799485447, + "fp16_scale": 1.0, + "global_step": 2684, + "grad_norm": 1.9653635756092456, + "learning_rate": 9.646368115548231e-08, + "loss": 0.4443, + "step": 2684 + }, + { + "ETA": 0.81, + "epoch": 0.8634828750602991, + "fp16_scale": 1.0, + "global_step": 2685, + "grad_norm": 1.8720711898442557, + "learning_rate": 9.601766804498157e-08, + "loss": 0.4053, + "step": 2685 + }, + { + "ETA": 0.81, + "epoch": 0.8638044701720534, + "fp16_scale": 1.0, + "global_step": 2686, + "grad_norm": 1.9331466478282657, + "learning_rate": 9.557263642252944e-08, + "loss": 0.3921, + "step": 2686 + }, + { + "ETA": 0.81, + "epoch": 0.8641260652838076, + "fp16_scale": 1.0, + "global_step": 2687, + "grad_norm": 1.9640580317615708, + "learning_rate": 9.512858677131341e-08, + "loss": 0.4809, + "step": 2687 + }, + { + "ETA": 0.81, + "epoch": 0.864447660395562, + "fp16_scale": 1.0, + "global_step": 2688, + "grad_norm": 1.8557612803074546, + "learning_rate": 9.468551957345505e-08, + "loss": 0.4485, + "step": 2688 + }, + { + "ETA": 0.8, + "epoch": 0.8647692555073163, + "fp16_scale": 1.0, + "global_step": 2689, + "grad_norm": 2.320872209504059, + "learning_rate": 9.424343531000967e-08, + "loss": 0.4452, + "step": 2689 + }, + { + "ETA": 0.8, + "epoch": 0.8650908506190705, + "fp16_scale": 1.0, + "global_step": 2690, + "grad_norm": 2.0004259465596457, + "learning_rate": 9.380233446096441e-08, + "loss": 0.4013, + "step": 2690 + }, + { + "ETA": 0.8, + "epoch": 0.8654124457308249, + "fp16_scale": 1.0, + "global_step": 2691, + "grad_norm": 2.426862627617974, + "learning_rate": 9.336221750523965e-08, + "loss": 0.37, + "step": 2691 + }, + { + "ETA": 0.8, + "epoch": 0.8657340408425792, + "fp16_scale": 1.0, + "global_step": 2692, + "grad_norm": 2.0556781304120655, + "learning_rate": 9.292308492068713e-08, + "loss": 0.3694, + "step": 2692 + }, + { + "ETA": 0.8, + "epoch": 0.8660556359543335, + "fp16_scale": 1.0, + "global_step": 2693, + "grad_norm": 2.319035031009702, + "learning_rate": 9.24849371840899e-08, + "loss": 0.4246, + "step": 2693 + }, + { + "ETA": 0.79, + "epoch": 0.8663772310660878, + "fp16_scale": 1.0, + "global_step": 2694, + "grad_norm": 1.7868633454422285, + "learning_rate": 9.204777477116155e-08, + "loss": 0.4009, + "step": 2694 + }, + { + "ETA": 0.79, + "epoch": 0.8666988261778421, + "fp16_scale": 1.0, + "global_step": 2695, + "grad_norm": 2.0346070033620487, + "learning_rate": 9.161159815654573e-08, + "loss": 0.4271, + "step": 2695 + }, + { + "ETA": 0.79, + "epoch": 0.8670204212895964, + "fp16_scale": 1.0, + "global_step": 2696, + "grad_norm": 1.776428647891114, + "learning_rate": 9.11764078138162e-08, + "loss": 0.359, + "step": 2696 + }, + { + "ETA": 0.79, + "epoch": 0.8673420164013507, + "fp16_scale": 1.0, + "global_step": 2697, + "grad_norm": 1.974715017816616, + "learning_rate": 9.074220421547563e-08, + "loss": 0.4793, + "step": 2697 + }, + { + "ETA": 0.79, + "epoch": 0.867663611513105, + "fp16_scale": 1.0, + "global_step": 2698, + "grad_norm": 2.0667641671046297, + "learning_rate": 9.030898783295571e-08, + "loss": 0.4323, + "step": 2698 + }, + { + "ETA": 0.78, + "epoch": 0.8679852066248593, + "fp16_scale": 1.0, + "global_step": 2699, + "grad_norm": 2.3097626181233077, + "learning_rate": 8.987675913661574e-08, + "loss": 0.3421, + "step": 2699 + }, + { + "ETA": 0.78, + "epoch": 0.8683068017366136, + "fp16_scale": 1.0, + "global_step": 2700, + "grad_norm": 1.9625198456674338, + "learning_rate": 8.944551859574268e-08, + "loss": 0.4458, + "step": 2700 + }, + { + "ETA": 0.78, + "epoch": 0.8686283968483679, + "fp16_scale": 1.0, + "global_step": 2701, + "grad_norm": 2.01924383728897, + "learning_rate": 8.901526667855097e-08, + "loss": 0.3976, + "step": 2701 + }, + { + "ETA": 0.78, + "epoch": 0.8689499919601222, + "fp16_scale": 1.0, + "global_step": 2702, + "grad_norm": 1.708273426556552, + "learning_rate": 8.858600385218151e-08, + "loss": 0.3952, + "step": 2702 + }, + { + "ETA": 0.78, + "epoch": 0.8692715870718765, + "fp16_scale": 1.0, + "global_step": 2703, + "grad_norm": 2.04331318356076, + "learning_rate": 8.815773058270148e-08, + "loss": 0.3858, + "step": 2703 + }, + { + "ETA": 0.78, + "epoch": 0.8695931821836308, + "fp16_scale": 1.0, + "global_step": 2704, + "grad_norm": 1.9244381399724477, + "learning_rate": 8.773044733510337e-08, + "loss": 0.4292, + "step": 2704 + }, + { + "ETA": 0.77, + "epoch": 0.8699147772953851, + "fp16_scale": 1.0, + "global_step": 2705, + "grad_norm": 1.9755569703236773, + "learning_rate": 8.730415457330464e-08, + "loss": 0.4936, + "step": 2705 + }, + { + "ETA": 0.77, + "epoch": 0.8702363724071394, + "fp16_scale": 1.0, + "global_step": 2706, + "grad_norm": 1.9797344852629037, + "learning_rate": 8.687885276014784e-08, + "loss": 0.5014, + "step": 2706 + }, + { + "ETA": 0.77, + "epoch": 0.8705579675188937, + "fp16_scale": 1.0, + "global_step": 2707, + "grad_norm": 2.0958508443954567, + "learning_rate": 8.645454235739902e-08, + "loss": 0.4487, + "step": 2707 + }, + { + "ETA": 0.77, + "epoch": 0.870879562630648, + "fp16_scale": 1.0, + "global_step": 2708, + "grad_norm": 1.8913047874929232, + "learning_rate": 8.603122382574868e-08, + "loss": 0.4689, + "step": 2708 + }, + { + "ETA": 0.77, + "epoch": 0.8712011577424024, + "fp16_scale": 1.0, + "global_step": 2709, + "grad_norm": 1.9407118115129578, + "learning_rate": 8.560889762480949e-08, + "loss": 0.4018, + "step": 2709 + }, + { + "ETA": 0.76, + "epoch": 0.8715227528541566, + "fp16_scale": 1.0, + "global_step": 2710, + "grad_norm": 1.92871027174814, + "learning_rate": 8.518756421311734e-08, + "loss": 0.4175, + "step": 2710 + }, + { + "ETA": 0.76, + "epoch": 0.8718443479659109, + "fp16_scale": 1.0, + "global_step": 2711, + "grad_norm": 1.8382717354407931, + "learning_rate": 8.476722404812975e-08, + "loss": 0.3714, + "step": 2711 + }, + { + "ETA": 0.76, + "epoch": 0.8721659430776653, + "fp16_scale": 1.0, + "global_step": 2712, + "grad_norm": 2.097631682170976, + "learning_rate": 8.434787758622597e-08, + "loss": 0.4672, + "step": 2712 + }, + { + "ETA": 0.76, + "epoch": 0.8724875381894195, + "fp16_scale": 1.0, + "global_step": 2713, + "grad_norm": 2.2517755829643176, + "learning_rate": 8.392952528270659e-08, + "loss": 0.4415, + "step": 2713 + }, + { + "ETA": 0.76, + "epoch": 0.8728091333011738, + "fp16_scale": 1.0, + "global_step": 2714, + "grad_norm": 1.961201675919311, + "learning_rate": 8.351216759179247e-08, + "loss": 0.4135, + "step": 2714 + }, + { + "ETA": 0.75, + "epoch": 0.8731307284129282, + "fp16_scale": 1.0, + "global_step": 2715, + "grad_norm": 2.041682500027486, + "learning_rate": 8.309580496662527e-08, + "loss": 0.3429, + "step": 2715 + }, + { + "ETA": 0.75, + "epoch": 0.8734523235246824, + "fp16_scale": 1.0, + "global_step": 2716, + "grad_norm": 1.993755032287539, + "learning_rate": 8.268043785926526e-08, + "loss": 0.485, + "step": 2716 + }, + { + "ETA": 0.75, + "epoch": 0.8737739186364367, + "fp16_scale": 1.0, + "global_step": 2717, + "grad_norm": 2.050738264187792, + "learning_rate": 8.226606672069226e-08, + "loss": 0.3753, + "step": 2717 + }, + { + "ETA": 0.75, + "epoch": 0.874095513748191, + "fp16_scale": 1.0, + "global_step": 2718, + "grad_norm": 2.1505532918193864, + "learning_rate": 8.185269200080502e-08, + "loss": 0.3564, + "step": 2718 + }, + { + "ETA": 0.75, + "epoch": 0.8744171088599453, + "fp16_scale": 1.0, + "global_step": 2719, + "grad_norm": 2.214996888291509, + "learning_rate": 8.144031414842012e-08, + "loss": 0.4368, + "step": 2719 + }, + { + "ETA": 0.74, + "epoch": 0.8747387039716996, + "fp16_scale": 1.0, + "global_step": 2720, + "grad_norm": 1.96114501865112, + "learning_rate": 8.102893361127216e-08, + "loss": 0.4957, + "step": 2720 + }, + { + "ETA": 0.74, + "epoch": 0.875060299083454, + "fp16_scale": 1.0, + "global_step": 2721, + "grad_norm": 1.7876486697715273, + "learning_rate": 8.061855083601232e-08, + "loss": 0.3758, + "step": 2721 + }, + { + "ETA": 0.74, + "epoch": 0.8753818941952083, + "fp16_scale": 1.0, + "global_step": 2722, + "grad_norm": 2.0905142702443866, + "learning_rate": 8.020916626820918e-08, + "loss": 0.4295, + "step": 2722 + }, + { + "ETA": 0.74, + "epoch": 0.8757034893069625, + "fp16_scale": 1.0, + "global_step": 2723, + "grad_norm": 2.113068852745191, + "learning_rate": 7.98007803523466e-08, + "loss": 0.3791, + "step": 2723 + }, + { + "ETA": 0.74, + "epoch": 0.8760250844187168, + "fp16_scale": 1.0, + "global_step": 2724, + "grad_norm": 1.9401652158306832, + "learning_rate": 7.939339353182517e-08, + "loss": 0.3808, + "step": 2724 + }, + { + "ETA": 0.73, + "epoch": 0.8763466795304712, + "fp16_scale": 1.0, + "global_step": 2725, + "grad_norm": 2.034665160963121, + "learning_rate": 7.898700624896027e-08, + "loss": 0.3465, + "step": 2725 + }, + { + "ETA": 0.73, + "epoch": 0.8766682746422254, + "fp16_scale": 1.0, + "global_step": 2726, + "grad_norm": 1.8507702561681787, + "learning_rate": 7.858161894498172e-08, + "loss": 0.4083, + "step": 2726 + }, + { + "ETA": 0.73, + "epoch": 0.8769898697539797, + "fp16_scale": 1.0, + "global_step": 2727, + "grad_norm": 2.1941389801374247, + "learning_rate": 7.817723206003446e-08, + "loss": 0.3664, + "step": 2727 + }, + { + "ETA": 0.73, + "epoch": 0.8773114648657341, + "fp16_scale": 1.0, + "global_step": 2728, + "grad_norm": 1.8409107945974215, + "learning_rate": 7.777384603317638e-08, + "loss": 0.5171, + "step": 2728 + }, + { + "ETA": 0.73, + "epoch": 0.8776330599774883, + "fp16_scale": 1.0, + "global_step": 2729, + "grad_norm": 1.932922086837846, + "learning_rate": 7.737146130237871e-08, + "loss": 0.4135, + "step": 2729 + }, + { + "ETA": 0.73, + "epoch": 0.8779546550892426, + "fp16_scale": 1.0, + "global_step": 2730, + "grad_norm": 2.286567034059594, + "learning_rate": 7.697007830452673e-08, + "loss": 0.4116, + "step": 2730 + }, + { + "ETA": 0.72, + "epoch": 0.878276250200997, + "fp16_scale": 1.0, + "global_step": 2731, + "grad_norm": 2.0746783678866607, + "learning_rate": 7.656969747541663e-08, + "loss": 0.3823, + "step": 2731 + }, + { + "ETA": 0.72, + "epoch": 0.8785978453127512, + "fp16_scale": 1.0, + "global_step": 2732, + "grad_norm": 1.8478197569383832, + "learning_rate": 7.617031924975736e-08, + "loss": 0.4208, + "step": 2732 + }, + { + "ETA": 0.72, + "epoch": 0.8789194404245055, + "fp16_scale": 1.0, + "global_step": 2733, + "grad_norm": 2.002994476522452, + "learning_rate": 7.577194406116915e-08, + "loss": 0.342, + "step": 2733 + }, + { + "ETA": 0.72, + "epoch": 0.8792410355362599, + "fp16_scale": 1.0, + "global_step": 2734, + "grad_norm": 1.859219332222486, + "learning_rate": 7.53745723421827e-08, + "loss": 0.4539, + "step": 2734 + }, + { + "ETA": 0.72, + "epoch": 0.8795626306480141, + "fp16_scale": 1.0, + "global_step": 2735, + "grad_norm": 1.6261147657752693, + "learning_rate": 7.497820452423998e-08, + "loss": 0.3786, + "step": 2735 + }, + { + "ETA": 0.71, + "epoch": 0.8798842257597684, + "fp16_scale": 1.0, + "global_step": 2736, + "grad_norm": 2.5133372634963385, + "learning_rate": 7.458284103769252e-08, + "loss": 0.4415, + "step": 2736 + }, + { + "ETA": 0.71, + "epoch": 0.8802058208715228, + "fp16_scale": 1.0, + "global_step": 2737, + "grad_norm": 1.985319547549968, + "learning_rate": 7.418848231180175e-08, + "loss": 0.439, + "step": 2737 + }, + { + "ETA": 0.71, + "epoch": 0.8805274159832771, + "fp16_scale": 1.0, + "global_step": 2738, + "grad_norm": 1.940018471801165, + "learning_rate": 7.379512877473748e-08, + "loss": 0.3708, + "step": 2738 + }, + { + "ETA": 0.71, + "epoch": 0.8808490110950313, + "fp16_scale": 1.0, + "global_step": 2739, + "grad_norm": 1.9191589322087108, + "learning_rate": 7.340278085357909e-08, + "loss": 0.3948, + "step": 2739 + }, + { + "ETA": 0.71, + "epoch": 0.8811706062067857, + "fp16_scale": 1.0, + "global_step": 2740, + "grad_norm": 1.9885083066858489, + "learning_rate": 7.301143897431339e-08, + "loss": 0.4462, + "step": 2740 + }, + { + "ETA": 0.7, + "epoch": 0.88149220131854, + "fp16_scale": 1.0, + "global_step": 2741, + "grad_norm": 2.0307542959584604, + "learning_rate": 7.262110356183516e-08, + "loss": 0.4325, + "step": 2741 + }, + { + "ETA": 0.7, + "epoch": 0.8818137964302942, + "fp16_scale": 1.0, + "global_step": 2742, + "grad_norm": 1.837682696054374, + "learning_rate": 7.223177503994671e-08, + "loss": 0.3813, + "step": 2742 + }, + { + "ETA": 0.7, + "epoch": 0.8821353915420486, + "fp16_scale": 1.0, + "global_step": 2743, + "grad_norm": 2.0378733960131723, + "learning_rate": 7.184345383135648e-08, + "loss": 0.4802, + "step": 2743 + }, + { + "ETA": 0.7, + "epoch": 0.8824569866538029, + "fp16_scale": 1.0, + "global_step": 2744, + "grad_norm": 1.7920779749086864, + "learning_rate": 7.145614035767988e-08, + "loss": 0.4465, + "step": 2744 + }, + { + "ETA": 0.7, + "epoch": 0.8827785817655571, + "fp16_scale": 1.0, + "global_step": 2745, + "grad_norm": 1.9659612418072967, + "learning_rate": 7.106983503943764e-08, + "loss": 0.4623, + "step": 2745 + }, + { + "ETA": 0.69, + "epoch": 0.8831001768773115, + "fp16_scale": 1.0, + "global_step": 2746, + "grad_norm": 2.2622281114188953, + "learning_rate": 7.068453829605625e-08, + "loss": 0.4464, + "step": 2746 + }, + { + "ETA": 0.69, + "epoch": 0.8834217719890658, + "fp16_scale": 1.0, + "global_step": 2747, + "grad_norm": 2.0562916836782987, + "learning_rate": 7.030025054586731e-08, + "loss": 0.4161, + "step": 2747 + }, + { + "ETA": 0.69, + "epoch": 0.88374336710082, + "fp16_scale": 1.0, + "global_step": 2748, + "grad_norm": 1.9244367543472374, + "learning_rate": 6.991697220610638e-08, + "loss": 0.4549, + "step": 2748 + }, + { + "ETA": 0.69, + "epoch": 0.8840649622125744, + "fp16_scale": 1.0, + "global_step": 2749, + "grad_norm": 2.0026094793621714, + "learning_rate": 6.953470369291348e-08, + "loss": 0.4025, + "step": 2749 + }, + { + "ETA": 0.69, + "epoch": 0.8843865573243287, + "fp16_scale": 1.0, + "global_step": 2750, + "grad_norm": 2.0039552999787085, + "learning_rate": 6.915344542133195e-08, + "loss": 0.4755, + "step": 2750 + }, + { + "ETA": 0.69, + "epoch": 0.8847081524360829, + "fp16_scale": 1.0, + "global_step": 2751, + "grad_norm": 2.168757933772285, + "learning_rate": 6.877319780530844e-08, + "loss": 0.3671, + "step": 2751 + }, + { + "ETA": 0.68, + "epoch": 0.8850297475478373, + "fp16_scale": 1.0, + "global_step": 2752, + "grad_norm": 2.0765117403074647, + "learning_rate": 6.839396125769258e-08, + "loss": 0.36, + "step": 2752 + }, + { + "ETA": 0.68, + "epoch": 0.8853513426595916, + "fp16_scale": 1.0, + "global_step": 2753, + "grad_norm": 2.0748444198922704, + "learning_rate": 6.801573619023549e-08, + "loss": 0.364, + "step": 2753 + }, + { + "ETA": 0.68, + "epoch": 0.8856729377713459, + "fp16_scale": 1.0, + "global_step": 2754, + "grad_norm": 2.0661161487993818, + "learning_rate": 6.763852301359086e-08, + "loss": 0.4385, + "step": 2754 + }, + { + "ETA": 0.68, + "epoch": 0.8859945328831001, + "fp16_scale": 1.0, + "global_step": 2755, + "grad_norm": 1.889073661932241, + "learning_rate": 6.72623221373132e-08, + "loss": 0.4008, + "step": 2755 + }, + { + "ETA": 0.68, + "epoch": 0.8863161279948545, + "fp16_scale": 1.0, + "global_step": 2756, + "grad_norm": 2.1217875274225824, + "learning_rate": 6.688713396985835e-08, + "loss": 0.4568, + "step": 2756 + }, + { + "ETA": 0.67, + "epoch": 0.8866377231066088, + "fp16_scale": 1.0, + "global_step": 2757, + "grad_norm": 1.9332812064355187, + "learning_rate": 6.651295891858211e-08, + "loss": 0.3302, + "step": 2757 + }, + { + "ETA": 0.67, + "epoch": 0.886959318218363, + "fp16_scale": 1.0, + "global_step": 2758, + "grad_norm": 2.952396656380801, + "learning_rate": 6.613979738974073e-08, + "loss": 0.4236, + "step": 2758 + }, + { + "ETA": 0.67, + "epoch": 0.8872809133301174, + "fp16_scale": 1.0, + "global_step": 2759, + "grad_norm": 1.9415780066928712, + "learning_rate": 6.576764978849003e-08, + "loss": 0.4657, + "step": 2759 + }, + { + "ETA": 0.67, + "epoch": 0.8876025084418717, + "fp16_scale": 1.0, + "global_step": 2760, + "grad_norm": 2.33496607348724, + "learning_rate": 6.539651651888455e-08, + "loss": 0.4484, + "step": 2760 + }, + { + "ETA": 0.67, + "epoch": 0.8879241035536259, + "fp16_scale": 1.0, + "global_step": 2761, + "grad_norm": 1.9715363195482916, + "learning_rate": 6.50263979838781e-08, + "loss": 0.4244, + "step": 2761 + }, + { + "ETA": 0.66, + "epoch": 0.8882456986653803, + "fp16_scale": 1.0, + "global_step": 2762, + "grad_norm": 2.1190461218668277, + "learning_rate": 6.46572945853222e-08, + "loss": 0.3667, + "step": 2762 + }, + { + "ETA": 0.66, + "epoch": 0.8885672937771346, + "fp16_scale": 1.0, + "global_step": 2763, + "grad_norm": 1.9899442435298367, + "learning_rate": 6.428920672396665e-08, + "loss": 0.3984, + "step": 2763 + }, + { + "ETA": 0.66, + "epoch": 0.8888888888888888, + "fp16_scale": 1.0, + "global_step": 2764, + "grad_norm": 2.1433779788227505, + "learning_rate": 6.392213479945851e-08, + "loss": 0.4387, + "step": 2764 + }, + { + "ETA": 0.66, + "epoch": 0.8892104840006432, + "fp16_scale": 1.0, + "global_step": 2765, + "grad_norm": 1.958060690192412, + "learning_rate": 6.355607921034145e-08, + "loss": 0.3963, + "step": 2765 + }, + { + "ETA": 0.66, + "epoch": 0.8895320791123975, + "fp16_scale": 1.0, + "global_step": 2766, + "grad_norm": 2.359572994511273, + "learning_rate": 6.319104035405642e-08, + "loss": 0.4135, + "step": 2766 + }, + { + "ETA": 0.65, + "epoch": 0.8898536742241518, + "fp16_scale": 1.0, + "global_step": 2767, + "grad_norm": 1.9099708137215627, + "learning_rate": 6.282701862693962e-08, + "loss": 0.5077, + "step": 2767 + }, + { + "ETA": 0.65, + "epoch": 0.8901752693359061, + "fp16_scale": 1.0, + "global_step": 2768, + "grad_norm": 2.014617572852332, + "learning_rate": 6.246401442422345e-08, + "loss": 0.3983, + "step": 2768 + }, + { + "ETA": 0.65, + "epoch": 0.8904968644476604, + "fp16_scale": 1.0, + "global_step": 2769, + "grad_norm": 2.3235058046937676, + "learning_rate": 6.21020281400354e-08, + "loss": 0.4439, + "step": 2769 + }, + { + "ETA": 0.65, + "epoch": 0.8908184595594147, + "fp16_scale": 1.0, + "global_step": 2770, + "grad_norm": 1.7802979753919927, + "learning_rate": 6.174106016739777e-08, + "loss": 0.4451, + "step": 2770 + }, + { + "ETA": 0.65, + "epoch": 0.891140054671169, + "fp16_scale": 1.0, + "global_step": 2771, + "grad_norm": 1.9911169744032278, + "learning_rate": 6.138111089822729e-08, + "loss": 0.4853, + "step": 2771 + }, + { + "ETA": 0.64, + "epoch": 0.8914616497829233, + "fp16_scale": 1.0, + "global_step": 2772, + "grad_norm": 1.9804843218216104, + "learning_rate": 6.102218072333443e-08, + "loss": 0.4372, + "step": 2772 + }, + { + "ETA": 0.64, + "epoch": 0.8917832448946776, + "fp16_scale": 1.0, + "global_step": 2773, + "grad_norm": 1.8782788491617932, + "learning_rate": 6.066427003242358e-08, + "loss": 0.47, + "step": 2773 + }, + { + "ETA": 0.64, + "epoch": 0.8921048400064319, + "fp16_scale": 1.0, + "global_step": 2774, + "grad_norm": 2.0014379014585333, + "learning_rate": 6.030737921409168e-08, + "loss": 0.4152, + "step": 2774 + }, + { + "ETA": 0.64, + "epoch": 0.8924264351181862, + "fp16_scale": 1.0, + "global_step": 2775, + "grad_norm": 1.7100955869657102, + "learning_rate": 5.995150865582887e-08, + "loss": 0.4357, + "step": 2775 + }, + { + "ETA": 0.64, + "epoch": 0.8927480302299405, + "fp16_scale": 1.0, + "global_step": 2776, + "grad_norm": 2.0662115662141978, + "learning_rate": 5.9596658744017645e-08, + "loss": 0.4112, + "step": 2776 + }, + { + "ETA": 0.64, + "epoch": 0.8930696253416948, + "fp16_scale": 1.0, + "global_step": 2777, + "grad_norm": 1.9429734424099012, + "learning_rate": 5.924282986393159e-08, + "loss": 0.3925, + "step": 2777 + }, + { + "ETA": 0.63, + "epoch": 0.8933912204534491, + "fp16_scale": 1.0, + "global_step": 2778, + "grad_norm": 1.973763094763008, + "learning_rate": 5.889002239973651e-08, + "loss": 0.3393, + "step": 2778 + }, + { + "ETA": 0.63, + "epoch": 0.8937128155652034, + "fp16_scale": 1.0, + "global_step": 2779, + "grad_norm": 1.885547944082085, + "learning_rate": 5.8538236734488765e-08, + "loss": 0.4881, + "step": 2779 + }, + { + "ETA": 0.63, + "epoch": 0.8940344106769577, + "fp16_scale": 1.0, + "global_step": 2780, + "grad_norm": 1.7525422661166352, + "learning_rate": 5.81874732501354e-08, + "loss": 0.3359, + "step": 2780 + }, + { + "ETA": 0.63, + "epoch": 0.894356005788712, + "fp16_scale": 1.0, + "global_step": 2781, + "grad_norm": 1.9032992627126815, + "learning_rate": 5.783773232751399e-08, + "loss": 0.4142, + "step": 2781 + }, + { + "ETA": 0.63, + "epoch": 0.8946776009004663, + "fp16_scale": 1.0, + "global_step": 2782, + "grad_norm": 2.1776207196827646, + "learning_rate": 5.7489014346351114e-08, + "loss": 0.4638, + "step": 2782 + }, + { + "ETA": 0.62, + "epoch": 0.8949991960122207, + "fp16_scale": 1.0, + "global_step": 2783, + "grad_norm": 1.8962053377438484, + "learning_rate": 5.71413196852637e-08, + "loss": 0.3545, + "step": 2783 + }, + { + "ETA": 0.62, + "epoch": 0.8953207911239749, + "fp16_scale": 1.0, + "global_step": 2784, + "grad_norm": 2.0714169257639443, + "learning_rate": 5.6794648721756656e-08, + "loss": 0.5256, + "step": 2784 + }, + { + "ETA": 0.62, + "epoch": 0.8956423862357292, + "fp16_scale": 1.0, + "global_step": 2785, + "grad_norm": 2.0333492450531825, + "learning_rate": 5.6449001832223895e-08, + "loss": 0.4191, + "step": 2785 + }, + { + "ETA": 0.62, + "epoch": 0.8959639813474836, + "fp16_scale": 1.0, + "global_step": 2786, + "grad_norm": 1.8074962470777867, + "learning_rate": 5.610437939194779e-08, + "loss": 0.4456, + "step": 2786 + }, + { + "ETA": 0.62, + "epoch": 0.8962855764592378, + "fp16_scale": 1.0, + "global_step": 2787, + "grad_norm": 1.9607028035361536, + "learning_rate": 5.5760781775097574e-08, + "loss": 0.4336, + "step": 2787 + }, + { + "ETA": 0.61, + "epoch": 0.8966071715709921, + "fp16_scale": 1.0, + "global_step": 2788, + "grad_norm": 2.0004431256527466, + "learning_rate": 5.5418209354730626e-08, + "loss": 0.5431, + "step": 2788 + }, + { + "ETA": 0.61, + "epoch": 0.8969287666827465, + "fp16_scale": 1.0, + "global_step": 2789, + "grad_norm": 2.0203590109563883, + "learning_rate": 5.507666250279053e-08, + "loss": 0.4809, + "step": 2789 + }, + { + "ETA": 0.61, + "epoch": 0.8972503617945007, + "fp16_scale": 1.0, + "global_step": 2790, + "grad_norm": 1.92302946795985, + "learning_rate": 5.4736141590107866e-08, + "loss": 0.3758, + "step": 2790 + }, + { + "ETA": 0.61, + "epoch": 0.897571956906255, + "fp16_scale": 1.0, + "global_step": 2791, + "grad_norm": 1.9859721627098932, + "learning_rate": 5.4396646986399454e-08, + "loss": 0.3476, + "step": 2791 + }, + { + "ETA": 0.61, + "epoch": 0.8978935520180094, + "fp16_scale": 1.0, + "global_step": 2792, + "grad_norm": 1.9804620448459982, + "learning_rate": 5.4058179060267e-08, + "loss": 0.3944, + "step": 2792 + }, + { + "ETA": 0.6, + "epoch": 0.8982151471297636, + "fp16_scale": 1.0, + "global_step": 2793, + "grad_norm": 2.0532529562429547, + "learning_rate": 5.372073817919842e-08, + "loss": 0.5245, + "step": 2793 + }, + { + "ETA": 0.6, + "epoch": 0.8985367422415179, + "fp16_scale": 1.0, + "global_step": 2794, + "grad_norm": 1.9961537606507216, + "learning_rate": 5.3384324709565887e-08, + "loss": 0.4047, + "step": 2794 + }, + { + "ETA": 0.6, + "epoch": 0.8988583373532723, + "fp16_scale": 1.0, + "global_step": 2795, + "grad_norm": 2.4223557942125753, + "learning_rate": 5.3048939016626547e-08, + "loss": 0.3971, + "step": 2795 + }, + { + "ETA": 0.6, + "epoch": 0.8991799324650265, + "fp16_scale": 1.0, + "global_step": 2796, + "grad_norm": 1.934772789614562, + "learning_rate": 5.2714581464521016e-08, + "loss": 0.4174, + "step": 2796 + }, + { + "ETA": 0.6, + "epoch": 0.8995015275767808, + "fp16_scale": 1.0, + "global_step": 2797, + "grad_norm": 2.0986507181843743, + "learning_rate": 5.238125241627456e-08, + "loss": 0.4943, + "step": 2797 + }, + { + "ETA": 0.6, + "epoch": 0.8998231226885351, + "fp16_scale": 1.0, + "global_step": 2798, + "grad_norm": 1.9990106115715338, + "learning_rate": 5.204895223379491e-08, + "loss": 0.3806, + "step": 2798 + }, + { + "ETA": 0.59, + "epoch": 0.9001447178002895, + "fp16_scale": 1.0, + "global_step": 2799, + "grad_norm": 2.1018154315633675, + "learning_rate": 5.171768127787302e-08, + "loss": 0.4229, + "step": 2799 + }, + { + "ETA": 0.59, + "epoch": 0.9004663129120437, + "fp16_scale": 1.0, + "global_step": 2800, + "grad_norm": 1.9412235090122163, + "learning_rate": 5.1387439908182505e-08, + "loss": 0.362, + "step": 2800 + }, + { + "ETA": 0.59, + "epoch": 0.900787908023798, + "fp16_scale": 1.0, + "global_step": 2801, + "grad_norm": 1.9229842640914792, + "learning_rate": 5.105822848327879e-08, + "loss": 0.4345, + "step": 2801 + }, + { + "ETA": 0.59, + "epoch": 0.9011095031355524, + "fp16_scale": 1.0, + "global_step": 2802, + "grad_norm": 2.072485326088834, + "learning_rate": 5.073004736059949e-08, + "loss": 0.4725, + "step": 2802 + }, + { + "ETA": 0.59, + "epoch": 0.9014310982473066, + "fp16_scale": 1.0, + "global_step": 2803, + "grad_norm": 1.9110256803258092, + "learning_rate": 5.040289689646338e-08, + "loss": 0.4398, + "step": 2803 + }, + { + "ETA": 0.58, + "epoch": 0.9017526933590609, + "fp16_scale": 1.0, + "global_step": 2804, + "grad_norm": 1.9401269117169266, + "learning_rate": 5.007677744606986e-08, + "loss": 0.3845, + "step": 2804 + }, + { + "ETA": 0.58, + "epoch": 0.9020742884708153, + "fp16_scale": 1.0, + "global_step": 2805, + "grad_norm": 1.882339884357719, + "learning_rate": 4.9751689363499714e-08, + "loss": 0.3854, + "step": 2805 + }, + { + "ETA": 0.58, + "epoch": 0.9023958835825695, + "fp16_scale": 1.0, + "global_step": 2806, + "grad_norm": 1.9100152569468716, + "learning_rate": 4.942763300171293e-08, + "loss": 0.3934, + "step": 2806 + }, + { + "ETA": 0.58, + "epoch": 0.9027174786943238, + "fp16_scale": 1.0, + "global_step": 2807, + "grad_norm": 2.038750726725319, + "learning_rate": 4.9104608712550065e-08, + "loss": 0.4337, + "step": 2807 + }, + { + "ETA": 0.58, + "epoch": 0.9030390738060782, + "fp16_scale": 1.0, + "global_step": 2808, + "grad_norm": 2.017505647201532, + "learning_rate": 4.878261684673102e-08, + "loss": 0.3796, + "step": 2808 + }, + { + "ETA": 0.57, + "epoch": 0.9033606689178324, + "fp16_scale": 1.0, + "global_step": 2809, + "grad_norm": 2.053613315651322, + "learning_rate": 4.846165775385458e-08, + "loss": 0.3799, + "step": 2809 + }, + { + "ETA": 0.57, + "epoch": 0.9036822640295867, + "fp16_scale": 1.0, + "global_step": 2810, + "grad_norm": 2.2447420755578946, + "learning_rate": 4.814173178239833e-08, + "loss": 0.4516, + "step": 2810 + }, + { + "ETA": 0.57, + "epoch": 0.9040038591413411, + "fp16_scale": 1.0, + "global_step": 2811, + "grad_norm": 2.0541863410234313, + "learning_rate": 4.782283927971775e-08, + "loss": 0.4281, + "step": 2811 + }, + { + "ETA": 0.57, + "epoch": 0.9043254542530954, + "fp16_scale": 1.0, + "global_step": 2812, + "grad_norm": 2.0116360550988697, + "learning_rate": 4.7504980592046776e-08, + "loss": 0.3497, + "step": 2812 + }, + { + "ETA": 0.57, + "epoch": 0.9046470493648496, + "fp16_scale": 1.0, + "global_step": 2813, + "grad_norm": 2.0850594510307157, + "learning_rate": 4.7188156064496664e-08, + "loss": 0.3847, + "step": 2813 + }, + { + "ETA": 0.56, + "epoch": 0.904968644476604, + "fp16_scale": 1.0, + "global_step": 2814, + "grad_norm": 2.2255842436670634, + "learning_rate": 4.687236604105615e-08, + "loss": 0.4049, + "step": 2814 + }, + { + "ETA": 0.56, + "epoch": 0.9052902395883583, + "fp16_scale": 1.0, + "global_step": 2815, + "grad_norm": 1.8471804725282406, + "learning_rate": 4.655761086459009e-08, + "loss": 0.475, + "step": 2815 + }, + { + "ETA": 0.56, + "epoch": 0.9056118347001125, + "fp16_scale": 1.0, + "global_step": 2816, + "grad_norm": 2.1853785377843438, + "learning_rate": 4.624389087684033e-08, + "loss": 0.4647, + "step": 2816 + }, + { + "ETA": 0.56, + "epoch": 0.9059334298118669, + "fp16_scale": 1.0, + "global_step": 2817, + "grad_norm": 1.8683504344226407, + "learning_rate": 4.593120641842474e-08, + "loss": 0.4378, + "step": 2817 + }, + { + "ETA": 0.56, + "epoch": 0.9062550249236212, + "fp16_scale": 1.0, + "global_step": 2818, + "grad_norm": 2.1039763344759344, + "learning_rate": 4.5619557828836306e-08, + "loss": 0.4204, + "step": 2818 + }, + { + "ETA": 0.56, + "epoch": 0.9065766200353754, + "fp16_scale": 1.0, + "global_step": 2819, + "grad_norm": 1.952491669989993, + "learning_rate": 4.530894544644426e-08, + "loss": 0.4794, + "step": 2819 + }, + { + "ETA": 0.55, + "epoch": 0.9068982151471298, + "fp16_scale": 1.0, + "global_step": 2820, + "grad_norm": 2.1989145210470182, + "learning_rate": 4.499936960849226e-08, + "loss": 0.3845, + "step": 2820 + }, + { + "ETA": 0.55, + "epoch": 0.9072198102588841, + "fp16_scale": 1.0, + "global_step": 2821, + "grad_norm": 2.3718244597709783, + "learning_rate": 4.4690830651098244e-08, + "loss": 0.4097, + "step": 2821 + }, + { + "ETA": 0.55, + "epoch": 0.9075414053706383, + "fp16_scale": 1.0, + "global_step": 2822, + "grad_norm": 1.9915383448367294, + "learning_rate": 4.4383328909255e-08, + "loss": 0.4298, + "step": 2822 + }, + { + "ETA": 0.55, + "epoch": 0.9078630004823927, + "fp16_scale": 1.0, + "global_step": 2823, + "grad_norm": 1.9772001686657124, + "learning_rate": 4.40768647168287e-08, + "loss": 0.4083, + "step": 2823 + }, + { + "ETA": 0.55, + "epoch": 0.908184595594147, + "fp16_scale": 1.0, + "global_step": 2824, + "grad_norm": 2.0575240055420942, + "learning_rate": 4.377143840655917e-08, + "loss": 0.375, + "step": 2824 + }, + { + "ETA": 0.54, + "epoch": 0.9085061907059012, + "fp16_scale": 1.0, + "global_step": 2825, + "grad_norm": 1.848169752420248, + "learning_rate": 4.34670503100596e-08, + "loss": 0.4511, + "step": 2825 + }, + { + "ETA": 0.54, + "epoch": 0.9088277858176556, + "fp16_scale": 1.0, + "global_step": 2826, + "grad_norm": 2.0723450252659372, + "learning_rate": 4.316370075781573e-08, + "loss": 0.3619, + "step": 2826 + }, + { + "ETA": 0.54, + "epoch": 0.9091493809294099, + "fp16_scale": 1.0, + "global_step": 2827, + "grad_norm": 2.0247285800808315, + "learning_rate": 4.286139007918566e-08, + "loss": 0.4377, + "step": 2827 + }, + { + "ETA": 0.54, + "epoch": 0.9094709760411642, + "fp16_scale": 1.0, + "global_step": 2828, + "grad_norm": 1.939722941084154, + "learning_rate": 4.2560118602399386e-08, + "loss": 0.4625, + "step": 2828 + }, + { + "ETA": 0.54, + "epoch": 0.9097925711529185, + "fp16_scale": 1.0, + "global_step": 2829, + "grad_norm": 2.1283893092626305, + "learning_rate": 4.225988665455904e-08, + "loss": 0.4464, + "step": 2829 + }, + { + "ETA": 0.53, + "epoch": 0.9101141662646728, + "fp16_scale": 1.0, + "global_step": 2830, + "grad_norm": 1.9020144535129746, + "learning_rate": 4.1960694561637864e-08, + "loss": 0.3999, + "step": 2830 + }, + { + "ETA": 0.53, + "epoch": 0.9104357613764271, + "fp16_scale": 1.0, + "global_step": 2831, + "grad_norm": 1.7322995221783446, + "learning_rate": 4.166254264848024e-08, + "loss": 0.4352, + "step": 2831 + }, + { + "ETA": 0.53, + "epoch": 0.9107573564881813, + "fp16_scale": 1.0, + "global_step": 2832, + "grad_norm": 1.9560527597958781, + "learning_rate": 4.136543123880088e-08, + "loss": 0.4196, + "step": 2832 + }, + { + "ETA": 0.53, + "epoch": 0.9110789515999357, + "fp16_scale": 1.0, + "global_step": 2833, + "grad_norm": 2.1021864770481207, + "learning_rate": 4.1069360655184846e-08, + "loss": 0.435, + "step": 2833 + }, + { + "ETA": 0.53, + "epoch": 0.91140054671169, + "fp16_scale": 1.0, + "global_step": 2834, + "grad_norm": 1.7474690163098885, + "learning_rate": 4.077433121908747e-08, + "loss": 0.3684, + "step": 2834 + }, + { + "ETA": 0.52, + "epoch": 0.9117221418234442, + "fp16_scale": 1.0, + "global_step": 2835, + "grad_norm": 1.992238578274917, + "learning_rate": 4.048034325083327e-08, + "loss": 0.4108, + "step": 2835 + }, + { + "ETA": 0.52, + "epoch": 0.9120437369351986, + "fp16_scale": 1.0, + "global_step": 2836, + "grad_norm": 2.0955800830552187, + "learning_rate": 4.018739706961649e-08, + "loss": 0.4941, + "step": 2836 + }, + { + "ETA": 0.52, + "epoch": 0.9123653320469529, + "fp16_scale": 1.0, + "global_step": 2837, + "grad_norm": 2.0180449576883186, + "learning_rate": 3.9895492993499594e-08, + "loss": 0.3799, + "step": 2837 + }, + { + "ETA": 0.52, + "epoch": 0.9126869271587071, + "fp16_scale": 1.0, + "global_step": 2838, + "grad_norm": 1.7623029939806376, + "learning_rate": 3.9604631339414276e-08, + "loss": 0.3238, + "step": 2838 + }, + { + "ETA": 0.52, + "epoch": 0.9130085222704615, + "fp16_scale": 1.0, + "global_step": 2839, + "grad_norm": 1.939639310069537, + "learning_rate": 3.9314812423159924e-08, + "loss": 0.4883, + "step": 2839 + }, + { + "ETA": 0.51, + "epoch": 0.9133301173822158, + "fp16_scale": 1.0, + "global_step": 2840, + "grad_norm": 1.7967992231208605, + "learning_rate": 3.902603655940384e-08, + "loss": 0.3382, + "step": 2840 + }, + { + "ETA": 0.51, + "epoch": 0.91365171249397, + "fp16_scale": 1.0, + "global_step": 2841, + "grad_norm": 1.9918374371163514, + "learning_rate": 3.87383040616811e-08, + "loss": 0.4233, + "step": 2841 + }, + { + "ETA": 0.51, + "epoch": 0.9139733076057244, + "fp16_scale": 1.0, + "global_step": 2842, + "grad_norm": 2.4450115166099176, + "learning_rate": 3.845161524239393e-08, + "loss": 0.4185, + "step": 2842 + }, + { + "ETA": 0.51, + "epoch": 0.9142949027174787, + "fp16_scale": 1.0, + "global_step": 2843, + "grad_norm": 2.137554399172394, + "learning_rate": 3.816597041281144e-08, + "loss": 0.4271, + "step": 2843 + }, + { + "ETA": 0.51, + "epoch": 0.914616497829233, + "fp16_scale": 1.0, + "global_step": 2844, + "grad_norm": 2.149222564381253, + "learning_rate": 3.788136988306878e-08, + "loss": 0.3694, + "step": 2844 + }, + { + "ETA": 0.51, + "epoch": 0.9149380929409873, + "fp16_scale": 1.0, + "global_step": 2845, + "grad_norm": 1.8491213389245522, + "learning_rate": 3.7597813962167654e-08, + "loss": 0.4423, + "step": 2845 + }, + { + "ETA": 0.5, + "epoch": 0.9152596880527416, + "fp16_scale": 1.0, + "global_step": 2846, + "grad_norm": 2.031649759451373, + "learning_rate": 3.731530295797558e-08, + "loss": 0.4607, + "step": 2846 + }, + { + "ETA": 0.5, + "epoch": 0.9155812831644959, + "fp16_scale": 1.0, + "global_step": 2847, + "grad_norm": 1.9071908232824537, + "learning_rate": 3.7033837177225415e-08, + "loss": 0.3631, + "step": 2847 + }, + { + "ETA": 0.5, + "epoch": 0.9159028782762502, + "fp16_scale": 1.0, + "global_step": 2848, + "grad_norm": 1.8521844096559676, + "learning_rate": 3.675341692551559e-08, + "loss": 0.4, + "step": 2848 + }, + { + "ETA": 0.5, + "epoch": 0.9162244733880045, + "fp16_scale": 1.0, + "global_step": 2849, + "grad_norm": 1.8572444407739268, + "learning_rate": 3.647404250730879e-08, + "loss": 0.42, + "step": 2849 + }, + { + "ETA": 0.5, + "epoch": 0.9165460684997588, + "fp16_scale": 1.0, + "global_step": 2850, + "grad_norm": 1.9641467246546185, + "learning_rate": 3.619571422593248e-08, + "loss": 0.4721, + "step": 2850 + }, + { + "ETA": 0.49, + "epoch": 0.9168676636115131, + "fp16_scale": 1.0, + "global_step": 2851, + "grad_norm": 2.026162989565498, + "learning_rate": 3.591843238357828e-08, + "loss": 0.327, + "step": 2851 + }, + { + "ETA": 0.49, + "epoch": 0.9171892587232674, + "fp16_scale": 1.0, + "global_step": 2852, + "grad_norm": 1.9214575809984122, + "learning_rate": 3.5642197281301576e-08, + "loss": 0.4278, + "step": 2852 + }, + { + "ETA": 0.49, + "epoch": 0.9175108538350217, + "fp16_scale": 1.0, + "global_step": 2853, + "grad_norm": 1.9719888765763525, + "learning_rate": 3.536700921902169e-08, + "loss": 0.4269, + "step": 2853 + }, + { + "ETA": 0.49, + "epoch": 0.917832448946776, + "fp16_scale": 1.0, + "global_step": 2854, + "grad_norm": 2.030921485554089, + "learning_rate": 3.509286849552029e-08, + "loss": 0.42, + "step": 2854 + }, + { + "ETA": 0.49, + "epoch": 0.9181540440585303, + "fp16_scale": 1.0, + "global_step": 2855, + "grad_norm": 2.16948803003424, + "learning_rate": 3.481977540844283e-08, + "loss": 0.3734, + "step": 2855 + }, + { + "ETA": 0.48, + "epoch": 0.9184756391702846, + "fp16_scale": 1.0, + "global_step": 2856, + "grad_norm": 2.1816436495070497, + "learning_rate": 3.454773025429658e-08, + "loss": 0.3648, + "step": 2856 + }, + { + "ETA": 0.48, + "epoch": 0.918797234282039, + "fp16_scale": 1.0, + "global_step": 2857, + "grad_norm": 2.0823465631539473, + "learning_rate": 3.427673332845138e-08, + "loss": 0.4859, + "step": 2857 + }, + { + "ETA": 0.48, + "epoch": 0.9191188293937932, + "fp16_scale": 1.0, + "global_step": 2858, + "grad_norm": 1.9732601534126495, + "learning_rate": 3.4006784925139085e-08, + "loss": 0.4306, + "step": 2858 + }, + { + "ETA": 0.48, + "epoch": 0.9194404245055475, + "fp16_scale": 1.0, + "global_step": 2859, + "grad_norm": 2.171560970560878, + "learning_rate": 3.373788533745281e-08, + "loss": 0.3704, + "step": 2859 + }, + { + "ETA": 0.48, + "epoch": 0.9197620196173019, + "fp16_scale": 1.0, + "global_step": 2860, + "grad_norm": 1.952877951419747, + "learning_rate": 3.347003485734712e-08, + "loss": 0.3489, + "step": 2860 + }, + { + "ETA": 0.47, + "epoch": 0.9200836147290561, + "fp16_scale": 1.0, + "global_step": 2861, + "grad_norm": 1.7911008016079333, + "learning_rate": 3.3203233775637494e-08, + "loss": 0.3741, + "step": 2861 + }, + { + "ETA": 0.47, + "epoch": 0.9204052098408104, + "fp16_scale": 1.0, + "global_step": 2862, + "grad_norm": 1.9835673260363862, + "learning_rate": 3.2937482381999895e-08, + "loss": 0.483, + "step": 2862 + }, + { + "ETA": 0.47, + "epoch": 0.9207268049525648, + "fp16_scale": 1.0, + "global_step": 2863, + "grad_norm": 2.1148827379064876, + "learning_rate": 3.267278096497084e-08, + "loss": 0.3681, + "step": 2863 + }, + { + "ETA": 0.47, + "epoch": 0.921048400064319, + "fp16_scale": 1.0, + "global_step": 2864, + "grad_norm": 2.0236888634451855, + "learning_rate": 3.2409129811946765e-08, + "loss": 0.4312, + "step": 2864 + }, + { + "ETA": 0.47, + "epoch": 0.9213699951760733, + "fp16_scale": 1.0, + "global_step": 2865, + "grad_norm": 2.039835627064776, + "learning_rate": 3.214652920918393e-08, + "loss": 0.3642, + "step": 2865 + }, + { + "ETA": 0.46, + "epoch": 0.9216915902878277, + "fp16_scale": 1.0, + "global_step": 2866, + "grad_norm": 2.122661662967067, + "learning_rate": 3.1884979441797576e-08, + "loss": 0.462, + "step": 2866 + }, + { + "ETA": 0.46, + "epoch": 0.9220131853995819, + "fp16_scale": 1.0, + "global_step": 2867, + "grad_norm": 2.060438498644924, + "learning_rate": 3.162448079376212e-08, + "loss": 0.3579, + "step": 2867 + }, + { + "ETA": 0.46, + "epoch": 0.9223347805113362, + "fp16_scale": 1.0, + "global_step": 2868, + "grad_norm": 1.94447250213451, + "learning_rate": 3.136503354791109e-08, + "loss": 0.4399, + "step": 2868 + }, + { + "ETA": 0.46, + "epoch": 0.9226563756230906, + "fp16_scale": 1.0, + "global_step": 2869, + "grad_norm": 2.1724875728398736, + "learning_rate": 3.1106637985936155e-08, + "loss": 0.384, + "step": 2869 + }, + { + "ETA": 0.46, + "epoch": 0.9229779707348448, + "fp16_scale": 1.0, + "global_step": 2870, + "grad_norm": 1.9524800347584632, + "learning_rate": 3.084929438838746e-08, + "loss": 0.455, + "step": 2870 + }, + { + "ETA": 0.45, + "epoch": 0.9232995658465991, + "fp16_scale": 1.0, + "global_step": 2871, + "grad_norm": 2.0985641238180297, + "learning_rate": 3.059300303467238e-08, + "loss": 0.381, + "step": 2871 + }, + { + "ETA": 0.45, + "epoch": 0.9236211609583534, + "fp16_scale": 1.0, + "global_step": 2872, + "grad_norm": 2.0603571877937856, + "learning_rate": 3.033776420305656e-08, + "loss": 0.4047, + "step": 2872 + }, + { + "ETA": 0.45, + "epoch": 0.9239427560701078, + "fp16_scale": 1.0, + "global_step": 2873, + "grad_norm": 2.0350019436918396, + "learning_rate": 3.0083578170662095e-08, + "loss": 0.378, + "step": 2873 + }, + { + "ETA": 0.45, + "epoch": 0.924264351181862, + "fp16_scale": 1.0, + "global_step": 2874, + "grad_norm": 1.920722594905657, + "learning_rate": 2.983044521346878e-08, + "loss": 0.3673, + "step": 2874 + }, + { + "ETA": 0.45, + "epoch": 0.9245859462936163, + "fp16_scale": 1.0, + "global_step": 2875, + "grad_norm": 1.9377182625281728, + "learning_rate": 2.957836560631266e-08, + "loss": 0.4096, + "step": 2875 + }, + { + "ETA": 0.45, + "epoch": 0.9249075414053707, + "fp16_scale": 1.0, + "global_step": 2876, + "grad_norm": 2.1329634427360338, + "learning_rate": 2.9327339622886027e-08, + "loss": 0.4077, + "step": 2876 + }, + { + "ETA": 0.44, + "epoch": 0.9252291365171249, + "fp16_scale": 1.0, + "global_step": 2877, + "grad_norm": 2.0851122865652125, + "learning_rate": 2.907736753573764e-08, + "loss": 0.3713, + "step": 2877 + }, + { + "ETA": 0.44, + "epoch": 0.9255507316288792, + "fp16_scale": 1.0, + "global_step": 2878, + "grad_norm": 2.1182511867806917, + "learning_rate": 2.88284496162714e-08, + "loss": 0.4121, + "step": 2878 + }, + { + "ETA": 0.44, + "epoch": 0.9258723267406336, + "fp16_scale": 1.0, + "global_step": 2879, + "grad_norm": 1.686680202974622, + "learning_rate": 2.8580586134746898e-08, + "loss": 0.4095, + "step": 2879 + }, + { + "ETA": 0.44, + "epoch": 0.9261939218523878, + "fp16_scale": 1.0, + "global_step": 2880, + "grad_norm": 2.003043949217861, + "learning_rate": 2.833377736027931e-08, + "loss": 0.4378, + "step": 2880 + }, + { + "ETA": 0.44, + "epoch": 0.9265155169641421, + "fp16_scale": 1.0, + "global_step": 2881, + "grad_norm": 2.0132547667998177, + "learning_rate": 2.8088023560838058e-08, + "loss": 0.4162, + "step": 2881 + }, + { + "ETA": 0.43, + "epoch": 0.9268371120758965, + "fp16_scale": 1.0, + "global_step": 2882, + "grad_norm": 1.9477678640121898, + "learning_rate": 2.7843325003247707e-08, + "loss": 0.3662, + "step": 2882 + }, + { + "ETA": 0.43, + "epoch": 0.9271587071876507, + "fp16_scale": 1.0, + "global_step": 2883, + "grad_norm": 2.03755480900658, + "learning_rate": 2.7599681953186405e-08, + "loss": 0.4026, + "step": 2883 + }, + { + "ETA": 0.43, + "epoch": 0.927480302299405, + "fp16_scale": 1.0, + "global_step": 2884, + "grad_norm": 1.896138339130577, + "learning_rate": 2.7357094675186987e-08, + "loss": 0.4692, + "step": 2884 + }, + { + "ETA": 0.43, + "epoch": 0.9278018974111594, + "fp16_scale": 1.0, + "global_step": 2885, + "grad_norm": 1.8473039605003492, + "learning_rate": 2.7115563432635547e-08, + "loss": 0.3465, + "step": 2885 + }, + { + "ETA": 0.43, + "epoch": 0.9281234925229137, + "fp16_scale": 1.0, + "global_step": 2886, + "grad_norm": 1.7983228307103274, + "learning_rate": 2.6875088487771757e-08, + "loss": 0.3294, + "step": 2886 + }, + { + "ETA": 0.42, + "epoch": 0.9284450876346679, + "fp16_scale": 1.0, + "global_step": 2887, + "grad_norm": 2.0419374524211076, + "learning_rate": 2.6635670101688547e-08, + "loss": 0.4591, + "step": 2887 + }, + { + "ETA": 0.42, + "epoch": 0.9287666827464223, + "fp16_scale": 1.0, + "global_step": 2888, + "grad_norm": 2.0402655206198377, + "learning_rate": 2.639730853433142e-08, + "loss": 0.4047, + "step": 2888 + }, + { + "ETA": 0.42, + "epoch": 0.9290882778581766, + "fp16_scale": 1.0, + "global_step": 2889, + "grad_norm": 3.28262931467128, + "learning_rate": 2.6160004044498808e-08, + "loss": 0.3441, + "step": 2889 + }, + { + "ETA": 0.42, + "epoch": 0.9294098729699308, + "fp16_scale": 1.0, + "global_step": 2890, + "grad_norm": 1.938396177219089, + "learning_rate": 2.5923756889841052e-08, + "loss": 0.3975, + "step": 2890 + }, + { + "ETA": 0.42, + "epoch": 0.9297314680816852, + "fp16_scale": 1.0, + "global_step": 2891, + "grad_norm": 1.966063074085021, + "learning_rate": 2.5688567326860644e-08, + "loss": 0.4875, + "step": 2891 + }, + { + "ETA": 0.41, + "epoch": 0.9300530631934395, + "fp16_scale": 1.0, + "global_step": 2892, + "grad_norm": 1.8360764536400855, + "learning_rate": 2.5454435610912095e-08, + "loss": 0.417, + "step": 2892 + }, + { + "ETA": 0.41, + "epoch": 0.9303746583051937, + "fp16_scale": 1.0, + "global_step": 2893, + "grad_norm": 1.75421471852118, + "learning_rate": 2.5221361996200952e-08, + "loss": 0.4182, + "step": 2893 + }, + { + "ETA": 0.41, + "epoch": 0.9306962534169481, + "fp16_scale": 1.0, + "global_step": 2894, + "grad_norm": 1.9087059051516082, + "learning_rate": 2.4989346735784124e-08, + "loss": 0.4365, + "step": 2894 + }, + { + "ETA": 0.41, + "epoch": 0.9310178485287024, + "fp16_scale": 1.0, + "global_step": 2895, + "grad_norm": 2.0869932193712404, + "learning_rate": 2.4758390081569436e-08, + "loss": 0.4637, + "step": 2895 + }, + { + "ETA": 0.41, + "epoch": 0.9313394436404566, + "fp16_scale": 1.0, + "global_step": 2896, + "grad_norm": 2.237552466479958, + "learning_rate": 2.4528492284315305e-08, + "loss": 0.4443, + "step": 2896 + }, + { + "ETA": 0.4, + "epoch": 0.931661038752211, + "fp16_scale": 1.0, + "global_step": 2897, + "grad_norm": 2.198283922950274, + "learning_rate": 2.429965359363073e-08, + "loss": 0.3911, + "step": 2897 + }, + { + "ETA": 0.4, + "epoch": 0.9319826338639653, + "fp16_scale": 1.0, + "global_step": 2898, + "grad_norm": 2.0013742597536557, + "learning_rate": 2.407187425797419e-08, + "loss": 0.4223, + "step": 2898 + }, + { + "ETA": 0.4, + "epoch": 0.9323042289757195, + "fp16_scale": 1.0, + "global_step": 2899, + "grad_norm": 1.895460601105781, + "learning_rate": 2.384515452465474e-08, + "loss": 0.475, + "step": 2899 + }, + { + "ETA": 0.4, + "epoch": 0.9326258240874739, + "fp16_scale": 1.0, + "global_step": 2900, + "grad_norm": 2.3961486345211536, + "learning_rate": 2.3619494639830374e-08, + "loss": 0.4462, + "step": 2900 + }, + { + "ETA": 0.4, + "epoch": 0.9329474191992282, + "fp16_scale": 1.0, + "global_step": 2901, + "grad_norm": 2.092962793312189, + "learning_rate": 2.3394894848508874e-08, + "loss": 0.4551, + "step": 2901 + }, + { + "ETA": 0.4, + "epoch": 0.9332690143109825, + "fp16_scale": 1.0, + "global_step": 2902, + "grad_norm": 1.8940398717493065, + "learning_rate": 2.317135539454662e-08, + "loss": 0.4258, + "step": 2902 + }, + { + "ETA": 0.39, + "epoch": 0.9335906094227368, + "fp16_scale": 1.0, + "global_step": 2903, + "grad_norm": 1.8358028292874564, + "learning_rate": 2.2948876520648917e-08, + "loss": 0.4338, + "step": 2903 + }, + { + "ETA": 0.39, + "epoch": 0.9339122045344911, + "fp16_scale": 1.0, + "global_step": 2904, + "grad_norm": 2.0039504383224016, + "learning_rate": 2.2727458468369654e-08, + "loss": 0.4437, + "step": 2904 + }, + { + "ETA": 0.39, + "epoch": 0.9342337996462454, + "fp16_scale": 1.0, + "global_step": 2905, + "grad_norm": 1.9121237637779611, + "learning_rate": 2.2507101478110745e-08, + "loss": 0.4524, + "step": 2905 + }, + { + "ETA": 0.39, + "epoch": 0.9345553947579996, + "fp16_scale": 1.0, + "global_step": 2906, + "grad_norm": 2.0617501161978526, + "learning_rate": 2.228780578912226e-08, + "loss": 0.4526, + "step": 2906 + }, + { + "ETA": 0.39, + "epoch": 0.934876989869754, + "fp16_scale": 1.0, + "global_step": 2907, + "grad_norm": 1.9074988731134586, + "learning_rate": 2.206957163950174e-08, + "loss": 0.4001, + "step": 2907 + }, + { + "ETA": 0.38, + "epoch": 0.9351985849815083, + "fp16_scale": 1.0, + "global_step": 2908, + "grad_norm": 2.522027191574107, + "learning_rate": 2.185239926619431e-08, + "loss": 0.4575, + "step": 2908 + }, + { + "ETA": 0.38, + "epoch": 0.9355201800932625, + "fp16_scale": 1.0, + "global_step": 2909, + "grad_norm": 1.9950749602428057, + "learning_rate": 2.1636288904992585e-08, + "loss": 0.4594, + "step": 2909 + }, + { + "ETA": 0.38, + "epoch": 0.9358417752050169, + "fp16_scale": 1.0, + "global_step": 2910, + "grad_norm": 2.405769795260131, + "learning_rate": 2.1421240790535424e-08, + "loss": 0.4531, + "step": 2910 + }, + { + "ETA": 0.38, + "epoch": 0.9361633703167712, + "fp16_scale": 1.0, + "global_step": 2911, + "grad_norm": 2.0040134486983425, + "learning_rate": 2.1207255156309056e-08, + "loss": 0.4508, + "step": 2911 + }, + { + "ETA": 0.38, + "epoch": 0.9364849654285254, + "fp16_scale": 1.0, + "global_step": 2912, + "grad_norm": 1.9088454468379767, + "learning_rate": 2.099433223464564e-08, + "loss": 0.4288, + "step": 2912 + }, + { + "ETA": 0.37, + "epoch": 0.9368065605402798, + "fp16_scale": 1.0, + "global_step": 2913, + "grad_norm": 2.059182017213907, + "learning_rate": 2.0782472256723803e-08, + "loss": 0.4029, + "step": 2913 + }, + { + "ETA": 0.37, + "epoch": 0.9371281556520341, + "fp16_scale": 1.0, + "global_step": 2914, + "grad_norm": 2.069901122293247, + "learning_rate": 2.0571675452567993e-08, + "loss": 0.3054, + "step": 2914 + }, + { + "ETA": 0.37, + "epoch": 0.9374497507637883, + "fp16_scale": 1.0, + "global_step": 2915, + "grad_norm": 1.9778287081296801, + "learning_rate": 2.0361942051048242e-08, + "loss": 0.4049, + "step": 2915 + }, + { + "ETA": 0.37, + "epoch": 0.9377713458755427, + "fp16_scale": 1.0, + "global_step": 2916, + "grad_norm": 1.9012498076195208, + "learning_rate": 2.0153272279880173e-08, + "loss": 0.4042, + "step": 2916 + }, + { + "ETA": 0.37, + "epoch": 0.938092940987297, + "fp16_scale": 1.0, + "global_step": 2917, + "grad_norm": 2.0072285388411335, + "learning_rate": 1.9945666365624447e-08, + "loss": 0.3625, + "step": 2917 + }, + { + "ETA": 0.36, + "epoch": 0.9384145360990513, + "fp16_scale": 1.0, + "global_step": 2918, + "grad_norm": 1.9604109361099546, + "learning_rate": 1.973912453368676e-08, + "loss": 0.3789, + "step": 2918 + }, + { + "ETA": 0.36, + "epoch": 0.9387361312108056, + "fp16_scale": 1.0, + "global_step": 2919, + "grad_norm": 1.8108215220097155, + "learning_rate": 1.95336470083175e-08, + "loss": 0.3918, + "step": 2919 + }, + { + "ETA": 0.36, + "epoch": 0.9390577263225599, + "fp16_scale": 1.0, + "global_step": 2920, + "grad_norm": 2.2514629786650167, + "learning_rate": 1.9329234012611327e-08, + "loss": 0.3537, + "step": 2920 + }, + { + "ETA": 0.36, + "epoch": 0.9393793214343142, + "fp16_scale": 1.0, + "global_step": 2921, + "grad_norm": 1.986437118979266, + "learning_rate": 1.9125885768507267e-08, + "loss": 0.5235, + "step": 2921 + }, + { + "ETA": 0.36, + "epoch": 0.9397009165460685, + "fp16_scale": 1.0, + "global_step": 2922, + "grad_norm": 1.9941076820573982, + "learning_rate": 1.892360249678826e-08, + "loss": 0.4799, + "step": 2922 + }, + { + "ETA": 0.36, + "epoch": 0.9400225116578228, + "fp16_scale": 1.0, + "global_step": 2923, + "grad_norm": 2.317297842742614, + "learning_rate": 1.872238441708085e-08, + "loss": 0.3844, + "step": 2923 + }, + { + "ETA": 0.35, + "epoch": 0.9403441067695771, + "fp16_scale": 1.0, + "global_step": 2924, + "grad_norm": 1.9462108083364182, + "learning_rate": 1.8522231747855388e-08, + "loss": 0.4104, + "step": 2924 + }, + { + "ETA": 0.35, + "epoch": 0.9406657018813314, + "fp16_scale": 1.0, + "global_step": 2925, + "grad_norm": 1.8474139095167101, + "learning_rate": 1.8323144706425155e-08, + "loss": 0.4414, + "step": 2925 + }, + { + "ETA": 0.35, + "epoch": 0.9409872969930857, + "fp16_scale": 1.0, + "global_step": 2926, + "grad_norm": 1.924080350838665, + "learning_rate": 1.812512350894646e-08, + "loss": 0.3743, + "step": 2926 + }, + { + "ETA": 0.35, + "epoch": 0.94130889210484, + "fp16_scale": 1.0, + "global_step": 2927, + "grad_norm": 2.0265808388741546, + "learning_rate": 1.792816837041844e-08, + "loss": 0.4951, + "step": 2927 + }, + { + "ETA": 0.35, + "epoch": 0.9416304872165943, + "fp16_scale": 1.0, + "global_step": 2928, + "grad_norm": 2.1163588652864913, + "learning_rate": 1.7732279504683034e-08, + "loss": 0.4028, + "step": 2928 + }, + { + "ETA": 0.34, + "epoch": 0.9419520823283486, + "fp16_scale": 1.0, + "global_step": 2929, + "grad_norm": 2.2452526256173764, + "learning_rate": 1.7537457124423893e-08, + "loss": 0.4152, + "step": 2929 + }, + { + "ETA": 0.34, + "epoch": 0.9422736774401029, + "fp16_scale": 1.0, + "global_step": 2930, + "grad_norm": 1.951578725577269, + "learning_rate": 1.7343701441167258e-08, + "loss": 0.5007, + "step": 2930 + }, + { + "ETA": 0.34, + "epoch": 0.9425952725518573, + "fp16_scale": 1.0, + "global_step": 2931, + "grad_norm": 2.0595012579859735, + "learning_rate": 1.7151012665281183e-08, + "loss": 0.346, + "step": 2931 + }, + { + "ETA": 0.34, + "epoch": 0.9429168676636115, + "fp16_scale": 1.0, + "global_step": 2932, + "grad_norm": 1.8322769366624494, + "learning_rate": 1.6959391005975098e-08, + "loss": 0.4124, + "step": 2932 + }, + { + "ETA": 0.34, + "epoch": 0.9432384627753658, + "fp16_scale": 1.0, + "global_step": 2933, + "grad_norm": 1.9282113927559967, + "learning_rate": 1.6768836671299912e-08, + "loss": 0.4012, + "step": 2933 + }, + { + "ETA": 0.33, + "epoch": 0.9435600578871202, + "fp16_scale": 1.0, + "global_step": 2934, + "grad_norm": 2.1884584348146543, + "learning_rate": 1.6579349868147686e-08, + "loss": 0.5475, + "step": 2934 + }, + { + "ETA": 0.33, + "epoch": 0.9438816529988744, + "fp16_scale": 1.0, + "global_step": 2935, + "grad_norm": 2.073075463862458, + "learning_rate": 1.6390930802251624e-08, + "loss": 0.3919, + "step": 2935 + }, + { + "ETA": 0.33, + "epoch": 0.9442032481106287, + "fp16_scale": 1.0, + "global_step": 2936, + "grad_norm": 1.9712325183264225, + "learning_rate": 1.620357967818531e-08, + "loss": 0.43, + "step": 2936 + }, + { + "ETA": 0.33, + "epoch": 0.9445248432223831, + "fp16_scale": 1.0, + "global_step": 2937, + "grad_norm": 2.197163721181814, + "learning_rate": 1.6017296699363138e-08, + "loss": 0.5202, + "step": 2937 + }, + { + "ETA": 0.33, + "epoch": 0.9448464383341373, + "fp16_scale": 1.0, + "global_step": 2938, + "grad_norm": 1.919809845775633, + "learning_rate": 1.583208206803954e-08, + "loss": 0.3665, + "step": 2938 + }, + { + "ETA": 0.32, + "epoch": 0.9451680334458916, + "fp16_scale": 1.0, + "global_step": 2939, + "grad_norm": 1.8314609230798584, + "learning_rate": 1.5647935985309003e-08, + "loss": 0.4703, + "step": 2939 + }, + { + "ETA": 0.32, + "epoch": 0.945489628557646, + "fp16_scale": 1.0, + "global_step": 2940, + "grad_norm": 2.0837053070772114, + "learning_rate": 1.5464858651106138e-08, + "loss": 0.4618, + "step": 2940 + }, + { + "ETA": 0.32, + "epoch": 0.9458112236694002, + "fp16_scale": 1.0, + "global_step": 2941, + "grad_norm": 1.7579175386781978, + "learning_rate": 1.528285026420484e-08, + "loss": 0.3877, + "step": 2941 + }, + { + "ETA": 0.32, + "epoch": 0.9461328187811545, + "fp16_scale": 1.0, + "global_step": 2942, + "grad_norm": 1.8483540718218372, + "learning_rate": 1.5101911022218693e-08, + "loss": 0.4942, + "step": 2942 + }, + { + "ETA": 0.32, + "epoch": 0.9464544138929089, + "fp16_scale": 1.0, + "global_step": 2943, + "grad_norm": 1.9770167749618421, + "learning_rate": 1.4922041121600337e-08, + "loss": 0.3881, + "step": 2943 + }, + { + "ETA": 0.32, + "epoch": 0.9467760090046631, + "fp16_scale": 1.0, + "global_step": 2944, + "grad_norm": 2.0364922257199267, + "learning_rate": 1.4743240757641107e-08, + "loss": 0.462, + "step": 2944 + }, + { + "ETA": 0.31, + "epoch": 0.9470976041164174, + "fp16_scale": 1.0, + "global_step": 2945, + "grad_norm": 2.45622442544849, + "learning_rate": 1.4565510124471492e-08, + "loss": 0.3526, + "step": 2945 + }, + { + "ETA": 0.31, + "epoch": 0.9474191992281717, + "fp16_scale": 1.0, + "global_step": 2946, + "grad_norm": 2.015600297041838, + "learning_rate": 1.4388849415060466e-08, + "loss": 0.3525, + "step": 2946 + }, + { + "ETA": 0.31, + "epoch": 0.9477407943399261, + "fp16_scale": 1.0, + "global_step": 2947, + "grad_norm": 2.1377403208434806, + "learning_rate": 1.4213258821215379e-08, + "loss": 0.4307, + "step": 2947 + }, + { + "ETA": 0.31, + "epoch": 0.9480623894516803, + "fp16_scale": 1.0, + "global_step": 2948, + "grad_norm": 1.7641667527380287, + "learning_rate": 1.4038738533581617e-08, + "loss": 0.3583, + "step": 2948 + }, + { + "ETA": 0.31, + "epoch": 0.9483839845634346, + "fp16_scale": 1.0, + "global_step": 2949, + "grad_norm": 1.7784574412022152, + "learning_rate": 1.3865288741642168e-08, + "loss": 0.4862, + "step": 2949 + }, + { + "ETA": 0.3, + "epoch": 0.948705579675189, + "fp16_scale": 1.0, + "global_step": 2950, + "grad_norm": 1.9576693109144088, + "learning_rate": 1.3692909633718497e-08, + "loss": 0.4525, + "step": 2950 + }, + { + "ETA": 0.3, + "epoch": 0.9490271747869432, + "fp16_scale": 1.0, + "global_step": 2951, + "grad_norm": 2.133508948989494, + "learning_rate": 1.3521601396968896e-08, + "loss": 0.4069, + "step": 2951 + }, + { + "ETA": 0.3, + "epoch": 0.9493487698986975, + "fp16_scale": 1.0, + "global_step": 2952, + "grad_norm": 1.8553115893701249, + "learning_rate": 1.3351364217389249e-08, + "loss": 0.4627, + "step": 2952 + }, + { + "ETA": 0.3, + "epoch": 0.9496703650104519, + "fp16_scale": 1.0, + "global_step": 2953, + "grad_norm": 1.9719888974956736, + "learning_rate": 1.3182198279812816e-08, + "loss": 0.4568, + "step": 2953 + }, + { + "ETA": 0.3, + "epoch": 0.9499919601222061, + "fp16_scale": 1.0, + "global_step": 2954, + "grad_norm": 2.0719174871476014, + "learning_rate": 1.3014103767909235e-08, + "loss": 0.3627, + "step": 2954 + }, + { + "ETA": 0.29, + "epoch": 0.9503135552339604, + "fp16_scale": 1.0, + "global_step": 2955, + "grad_norm": 2.087689015845033, + "learning_rate": 1.2847080864185177e-08, + "loss": 0.4018, + "step": 2955 + }, + { + "ETA": 0.29, + "epoch": 0.9506351503457148, + "fp16_scale": 1.0, + "global_step": 2956, + "grad_norm": 1.9434984481486384, + "learning_rate": 1.2681129749983809e-08, + "loss": 0.4133, + "step": 2956 + }, + { + "ETA": 0.29, + "epoch": 0.950956745457469, + "fp16_scale": 1.0, + "global_step": 2957, + "grad_norm": 1.9024416054546027, + "learning_rate": 1.2516250605484558e-08, + "loss": 0.4437, + "step": 2957 + }, + { + "ETA": 0.29, + "epoch": 0.9512783405692233, + "fp16_scale": 1.0, + "global_step": 2958, + "grad_norm": 2.020528747188203, + "learning_rate": 1.2352443609703e-08, + "loss": 0.394, + "step": 2958 + }, + { + "ETA": 0.29, + "epoch": 0.9515999356809777, + "fp16_scale": 1.0, + "global_step": 2959, + "grad_norm": 1.8568009487685488, + "learning_rate": 1.218970894049065e-08, + "loss": 0.4347, + "step": 2959 + }, + { + "ETA": 0.28, + "epoch": 0.9519215307927319, + "fp16_scale": 1.0, + "global_step": 2960, + "grad_norm": 1.7086439917684098, + "learning_rate": 1.2028046774534616e-08, + "loss": 0.3959, + "step": 2960 + }, + { + "ETA": 0.28, + "epoch": 0.9522431259044862, + "fp16_scale": 1.0, + "global_step": 2961, + "grad_norm": 1.9627415938801038, + "learning_rate": 1.186745728735783e-08, + "loss": 0.3955, + "step": 2961 + }, + { + "ETA": 0.28, + "epoch": 0.9525647210162406, + "fp16_scale": 1.0, + "global_step": 2962, + "grad_norm": 1.7178424258131575, + "learning_rate": 1.170794065331837e-08, + "loss": 0.384, + "step": 2962 + }, + { + "ETA": 0.28, + "epoch": 0.9528863161279949, + "fp16_scale": 1.0, + "global_step": 2963, + "grad_norm": 1.923561456451965, + "learning_rate": 1.1549497045609368e-08, + "loss": 0.4176, + "step": 2963 + }, + { + "ETA": 0.28, + "epoch": 0.9532079112397491, + "fp16_scale": 1.0, + "global_step": 2964, + "grad_norm": 1.8165583592615953, + "learning_rate": 1.1392126636259324e-08, + "loss": 0.3305, + "step": 2964 + }, + { + "ETA": 0.27, + "epoch": 0.9535295063515035, + "fp16_scale": 1.0, + "global_step": 2965, + "grad_norm": 1.8570178517632707, + "learning_rate": 1.123582959613123e-08, + "loss": 0.4354, + "step": 2965 + }, + { + "ETA": 0.27, + "epoch": 0.9538511014632578, + "fp16_scale": 1.0, + "global_step": 2966, + "grad_norm": 2.0259558536558853, + "learning_rate": 1.1080606094922562e-08, + "loss": 0.3532, + "step": 2966 + }, + { + "ETA": 0.27, + "epoch": 0.954172696575012, + "fp16_scale": 1.0, + "global_step": 2967, + "grad_norm": 1.9570562339180626, + "learning_rate": 1.0926456301165621e-08, + "loss": 0.4097, + "step": 2967 + }, + { + "ETA": 0.27, + "epoch": 0.9544942916867664, + "fp16_scale": 1.0, + "global_step": 2968, + "grad_norm": 1.9588007205081928, + "learning_rate": 1.0773380382226415e-08, + "loss": 0.4465, + "step": 2968 + }, + { + "ETA": 0.27, + "epoch": 0.9548158867985207, + "fp16_scale": 1.0, + "global_step": 2969, + "grad_norm": 1.8906510859736605, + "learning_rate": 1.0621378504305666e-08, + "loss": 0.455, + "step": 2969 + }, + { + "ETA": 0.27, + "epoch": 0.9551374819102749, + "fp16_scale": 1.0, + "global_step": 2970, + "grad_norm": 2.0645576486022135, + "learning_rate": 1.047045083243725e-08, + "loss": 0.3481, + "step": 2970 + }, + { + "ETA": 0.26, + "epoch": 0.9554590770220293, + "fp16_scale": 1.0, + "global_step": 2971, + "grad_norm": 2.045479632270176, + "learning_rate": 1.0320597530489417e-08, + "loss": 0.4637, + "step": 2971 + }, + { + "ETA": 0.26, + "epoch": 0.9557806721337836, + "fp16_scale": 1.0, + "global_step": 2972, + "grad_norm": 1.660848396106032, + "learning_rate": 1.0171818761163353e-08, + "loss": 0.4502, + "step": 2972 + }, + { + "ETA": 0.26, + "epoch": 0.9561022672455378, + "fp16_scale": 1.0, + "global_step": 2973, + "grad_norm": 2.439452414713532, + "learning_rate": 1.0024114685993956e-08, + "loss": 0.4045, + "step": 2973 + }, + { + "ETA": 0.26, + "epoch": 0.9564238623572922, + "fp16_scale": 1.0, + "global_step": 2974, + "grad_norm": 1.7980539379155829, + "learning_rate": 9.877485465349056e-09, + "loss": 0.4406, + "step": 2974 + }, + { + "ETA": 0.26, + "epoch": 0.9567454574690465, + "fp16_scale": 1.0, + "global_step": 2975, + "grad_norm": 1.8403094333533125, + "learning_rate": 9.731931258429638e-09, + "loss": 0.4131, + "step": 2975 + }, + { + "ETA": 0.25, + "epoch": 0.9570670525808008, + "fp16_scale": 1.0, + "global_step": 2976, + "grad_norm": 1.8125355911510845, + "learning_rate": 9.587452223269622e-09, + "loss": 0.3938, + "step": 2976 + }, + { + "ETA": 0.25, + "epoch": 0.957388647692555, + "fp16_scale": 1.0, + "global_step": 2977, + "grad_norm": 1.8495426433052407, + "learning_rate": 9.444048516735193e-09, + "loss": 0.3742, + "step": 2977 + }, + { + "ETA": 0.25, + "epoch": 0.9577102428043094, + "fp16_scale": 1.0, + "global_step": 2978, + "grad_norm": 2.179984387368447, + "learning_rate": 9.30172029452514e-09, + "loss": 0.4132, + "step": 2978 + }, + { + "ETA": 0.25, + "epoch": 0.9580318379160637, + "fp16_scale": 1.0, + "global_step": 2979, + "grad_norm": 1.9759395060978266, + "learning_rate": 9.16046771117085e-09, + "loss": 0.4631, + "step": 2979 + }, + { + "ETA": 0.25, + "epoch": 0.958353433027818, + "fp16_scale": 1.0, + "global_step": 2980, + "grad_norm": 1.8666175143157948, + "learning_rate": 9.020290920035534e-09, + "loss": 0.4602, + "step": 2980 + }, + { + "ETA": 0.24, + "epoch": 0.9586750281395723, + "fp16_scale": 1.0, + "global_step": 2981, + "grad_norm": 2.052054203830057, + "learning_rate": 8.881190073314559e-09, + "loss": 0.4006, + "step": 2981 + }, + { + "ETA": 0.24, + "epoch": 0.9589966232513266, + "fp16_scale": 1.0, + "global_step": 2982, + "grad_norm": 2.0317608519735, + "learning_rate": 8.743165322035007e-09, + "loss": 0.464, + "step": 2982 + }, + { + "ETA": 0.24, + "epoch": 0.9593182183630808, + "fp16_scale": 1.0, + "global_step": 2983, + "grad_norm": 1.774378858660257, + "learning_rate": 8.606216816055333e-09, + "loss": 0.4483, + "step": 2983 + }, + { + "ETA": 0.24, + "epoch": 0.9596398134748352, + "fp16_scale": 1.0, + "global_step": 2984, + "grad_norm": 2.0955463172220608, + "learning_rate": 8.470344704066046e-09, + "loss": 0.3876, + "step": 2984 + }, + { + "ETA": 0.24, + "epoch": 0.9599614085865895, + "fp16_scale": 1.0, + "global_step": 2985, + "grad_norm": 2.1051645851664857, + "learning_rate": 8.335549133588582e-09, + "loss": 0.398, + "step": 2985 + }, + { + "ETA": 0.23, + "epoch": 0.9602830036983437, + "fp16_scale": 1.0, + "global_step": 2986, + "grad_norm": 2.029552173358812, + "learning_rate": 8.20183025097565e-09, + "loss": 0.3471, + "step": 2986 + }, + { + "ETA": 0.23, + "epoch": 0.9606045988100981, + "fp16_scale": 1.0, + "global_step": 2987, + "grad_norm": 2.0325420544793973, + "learning_rate": 8.069188201410892e-09, + "loss": 0.3908, + "step": 2987 + }, + { + "ETA": 0.23, + "epoch": 0.9609261939218524, + "fp16_scale": 1.0, + "global_step": 2988, + "grad_norm": 2.2358667514528054, + "learning_rate": 7.937623128908887e-09, + "loss": 0.4238, + "step": 2988 + }, + { + "ETA": 0.23, + "epoch": 0.9612477890336066, + "fp16_scale": 1.0, + "global_step": 2989, + "grad_norm": 1.8586219935882644, + "learning_rate": 7.807135176314706e-09, + "loss": 0.3451, + "step": 2989 + }, + { + "ETA": 0.23, + "epoch": 0.961569384145361, + "fp16_scale": 1.0, + "global_step": 2990, + "grad_norm": 1.7855331290795615, + "learning_rate": 7.677724485304237e-09, + "loss": 0.3704, + "step": 2990 + }, + { + "ETA": 0.23, + "epoch": 0.9618909792571153, + "fp16_scale": 1.0, + "global_step": 2991, + "grad_norm": 2.1596260546169046, + "learning_rate": 7.549391196383536e-09, + "loss": 0.4331, + "step": 2991 + }, + { + "ETA": 0.22, + "epoch": 0.9622125743688696, + "fp16_scale": 1.0, + "global_step": 2992, + "grad_norm": 1.9673441050483644, + "learning_rate": 7.422135448889033e-09, + "loss": 0.4345, + "step": 2992 + }, + { + "ETA": 0.22, + "epoch": 0.9625341694806239, + "fp16_scale": 1.0, + "global_step": 2993, + "grad_norm": 2.2208780929202523, + "learning_rate": 7.295957380986983e-09, + "loss": 0.3715, + "step": 2993 + }, + { + "ETA": 0.22, + "epoch": 0.9628557645923782, + "fp16_scale": 1.0, + "global_step": 2994, + "grad_norm": 2.0173989523449323, + "learning_rate": 7.170857129673913e-09, + "loss": 0.4656, + "step": 2994 + }, + { + "ETA": 0.22, + "epoch": 0.9631773597041325, + "fp16_scale": 1.0, + "global_step": 2995, + "grad_norm": 1.8048564034676648, + "learning_rate": 7.0468348307757275e-09, + "loss": 0.4128, + "step": 2995 + }, + { + "ETA": 0.22, + "epoch": 0.9634989548158868, + "fp16_scale": 1.0, + "global_step": 2996, + "grad_norm": 1.813233816899862, + "learning_rate": 6.923890618948158e-09, + "loss": 0.3979, + "step": 2996 + }, + { + "ETA": 0.21, + "epoch": 0.9638205499276411, + "fp16_scale": 1.0, + "global_step": 2997, + "grad_norm": 2.1973299162727242, + "learning_rate": 6.80202462767665e-09, + "loss": 0.439, + "step": 2997 + }, + { + "ETA": 0.21, + "epoch": 0.9641421450393954, + "fp16_scale": 1.0, + "global_step": 2998, + "grad_norm": 1.9469707157712282, + "learning_rate": 6.681236989275585e-09, + "loss": 0.3835, + "step": 2998 + }, + { + "ETA": 0.21, + "epoch": 0.9644637401511497, + "fp16_scale": 1.0, + "global_step": 2999, + "grad_norm": 1.8263752228757655, + "learning_rate": 6.561527834888725e-09, + "loss": 0.4488, + "step": 2999 + }, + { + "ETA": 0.21, + "epoch": 0.964785335262904, + "fp16_scale": 1.0, + "global_step": 3000, + "grad_norm": 1.8783427695492874, + "learning_rate": 6.442897294488881e-09, + "loss": 0.4241, + "step": 3000 + } + ], + "logging_steps": 1, + "max_steps": 3109, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 6197817830014976.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}