diff --git "a/rm-harmless-hs/checkpoint-950/trainer_state.json" "b/rm-harmless-hs/checkpoint-950/trainer_state.json" new file mode 100644--- /dev/null +++ "b/rm-harmless-hs/checkpoint-950/trainer_state.json" @@ -0,0 +1,15221 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 4.922279792746114, + "eval_steps": 1, + "global_step": 950, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0051813471502590676, + "grad_norm": 159.0, + "learning_rate": 4.994818652849741e-05, + "loss": 0.9219, + "step": 1 + }, + { + "epoch": 0.0051813471502590676, + "eval_accuracy": 0.46733668341708545, + "eval_loss": 0.7203988432884216, + "eval_runtime": 23.3133, + "eval_samples_per_second": 17.072, + "eval_steps_per_second": 2.145, + "step": 1 + }, + { + "epoch": 0.010362694300518135, + "grad_norm": 137.0, + "learning_rate": 4.989637305699482e-05, + "loss": 0.707, + "step": 2 + }, + { + "epoch": 0.010362694300518135, + "eval_accuracy": 0.46733668341708545, + "eval_loss": 0.7385560870170593, + "eval_runtime": 23.5535, + "eval_samples_per_second": 16.898, + "eval_steps_per_second": 2.123, + "step": 2 + }, + { + "epoch": 0.015544041450777202, + "grad_norm": 91.5, + "learning_rate": 4.984455958549223e-05, + "loss": 0.7344, + "step": 3 + }, + { + "epoch": 0.015544041450777202, + "eval_accuracy": 0.550251256281407, + "eval_loss": 0.713489294052124, + "eval_runtime": 23.6182, + "eval_samples_per_second": 16.851, + "eval_steps_per_second": 2.117, + "step": 3 + }, + { + "epoch": 0.02072538860103627, + "grad_norm": 23.5, + "learning_rate": 4.979274611398964e-05, + "loss": 0.3867, + "step": 4 + }, + { + "epoch": 0.02072538860103627, + "eval_accuracy": 0.6256281407035176, + "eval_loss": 0.7309594750404358, + "eval_runtime": 23.6854, + "eval_samples_per_second": 16.804, + "eval_steps_per_second": 2.111, + "step": 4 + }, + { + "epoch": 0.025906735751295335, + "grad_norm": 63.0, + "learning_rate": 4.974093264248705e-05, + "loss": 0.8945, + "step": 5 + }, + { + "epoch": 0.025906735751295335, + "eval_accuracy": 0.6306532663316583, + "eval_loss": 0.7841944098472595, + "eval_runtime": 23.7238, + "eval_samples_per_second": 16.776, + "eval_steps_per_second": 2.108, + "step": 5 + }, + { + "epoch": 0.031088082901554404, + "grad_norm": 45.5, + "learning_rate": 4.968911917098446e-05, + "loss": 0.7656, + "step": 6 + }, + { + "epoch": 0.031088082901554404, + "eval_accuracy": 0.5979899497487438, + "eval_loss": 0.8126373887062073, + "eval_runtime": 23.7268, + "eval_samples_per_second": 16.774, + "eval_steps_per_second": 2.107, + "step": 6 + }, + { + "epoch": 0.03626943005181347, + "grad_norm": 59.25, + "learning_rate": 4.963730569948187e-05, + "loss": 1.5078, + "step": 7 + }, + { + "epoch": 0.03626943005181347, + "eval_accuracy": 0.5829145728643216, + "eval_loss": 0.7515114545822144, + "eval_runtime": 23.7309, + "eval_samples_per_second": 16.771, + "eval_steps_per_second": 2.107, + "step": 7 + }, + { + "epoch": 0.04145077720207254, + "grad_norm": 58.0, + "learning_rate": 4.9585492227979277e-05, + "loss": 0.4258, + "step": 8 + }, + { + "epoch": 0.04145077720207254, + "eval_accuracy": 0.4648241206030151, + "eval_loss": 0.9782702326774597, + "eval_runtime": 23.7326, + "eval_samples_per_second": 16.77, + "eval_steps_per_second": 2.107, + "step": 8 + }, + { + "epoch": 0.046632124352331605, + "grad_norm": 62.25, + "learning_rate": 4.9533678756476685e-05, + "loss": 1.0547, + "step": 9 + }, + { + "epoch": 0.046632124352331605, + "eval_accuracy": 0.45979899497487436, + "eval_loss": 1.0166261196136475, + "eval_runtime": 23.7123, + "eval_samples_per_second": 16.785, + "eval_steps_per_second": 2.109, + "step": 9 + }, + { + "epoch": 0.05181347150259067, + "grad_norm": 109.0, + "learning_rate": 4.948186528497409e-05, + "loss": 0.7656, + "step": 10 + }, + { + "epoch": 0.05181347150259067, + "eval_accuracy": 0.4547738693467337, + "eval_loss": 1.0814815759658813, + "eval_runtime": 23.6693, + "eval_samples_per_second": 16.815, + "eval_steps_per_second": 2.112, + "step": 10 + }, + { + "epoch": 0.05699481865284974, + "grad_norm": 160.0, + "learning_rate": 4.943005181347151e-05, + "loss": 1.0938, + "step": 11 + }, + { + "epoch": 0.05699481865284974, + "eval_accuracy": 0.457286432160804, + "eval_loss": 1.0878022909164429, + "eval_runtime": 23.663, + "eval_samples_per_second": 16.819, + "eval_steps_per_second": 2.113, + "step": 11 + }, + { + "epoch": 0.06217616580310881, + "grad_norm": 70.0, + "learning_rate": 4.937823834196891e-05, + "loss": 0.6133, + "step": 12 + }, + { + "epoch": 0.06217616580310881, + "eval_accuracy": 0.4849246231155779, + "eval_loss": 1.0733354091644287, + "eval_runtime": 23.6222, + "eval_samples_per_second": 16.849, + "eval_steps_per_second": 2.117, + "step": 12 + }, + { + "epoch": 0.06735751295336788, + "grad_norm": 106.0, + "learning_rate": 4.9326424870466325e-05, + "loss": 2.4062, + "step": 13 + }, + { + "epoch": 0.06735751295336788, + "eval_accuracy": 0.46984924623115576, + "eval_loss": 0.9452536106109619, + "eval_runtime": 23.5746, + "eval_samples_per_second": 16.883, + "eval_steps_per_second": 2.121, + "step": 13 + }, + { + "epoch": 0.07253886010362694, + "grad_norm": 54.75, + "learning_rate": 4.9274611398963734e-05, + "loss": 0.7188, + "step": 14 + }, + { + "epoch": 0.07253886010362694, + "eval_accuracy": 0.4824120603015075, + "eval_loss": 0.8462625741958618, + "eval_runtime": 23.5633, + "eval_samples_per_second": 16.891, + "eval_steps_per_second": 2.122, + "step": 14 + }, + { + "epoch": 0.07772020725388601, + "grad_norm": 49.0, + "learning_rate": 4.922279792746114e-05, + "loss": 0.9766, + "step": 15 + }, + { + "epoch": 0.07772020725388601, + "eval_accuracy": 0.5452261306532663, + "eval_loss": 0.7269354462623596, + "eval_runtime": 23.5919, + "eval_samples_per_second": 16.87, + "eval_steps_per_second": 2.119, + "step": 15 + }, + { + "epoch": 0.08290155440414508, + "grad_norm": 34.0, + "learning_rate": 4.917098445595855e-05, + "loss": 0.6289, + "step": 16 + }, + { + "epoch": 0.08290155440414508, + "eval_accuracy": 0.5728643216080402, + "eval_loss": 0.6750942468643188, + "eval_runtime": 23.5748, + "eval_samples_per_second": 16.882, + "eval_steps_per_second": 2.121, + "step": 16 + }, + { + "epoch": 0.08808290155440414, + "grad_norm": 109.0, + "learning_rate": 4.911917098445596e-05, + "loss": 0.5781, + "step": 17 + }, + { + "epoch": 0.08808290155440414, + "eval_accuracy": 0.6030150753768844, + "eval_loss": 0.6482215523719788, + "eval_runtime": 23.5403, + "eval_samples_per_second": 16.907, + "eval_steps_per_second": 2.124, + "step": 17 + }, + { + "epoch": 0.09326424870466321, + "grad_norm": 29.625, + "learning_rate": 4.9067357512953374e-05, + "loss": 0.9141, + "step": 18 + }, + { + "epoch": 0.09326424870466321, + "eval_accuracy": 0.6256281407035176, + "eval_loss": 0.6388779878616333, + "eval_runtime": 23.607, + "eval_samples_per_second": 16.859, + "eval_steps_per_second": 2.118, + "step": 18 + }, + { + "epoch": 0.09844559585492228, + "grad_norm": 33.0, + "learning_rate": 4.9015544041450776e-05, + "loss": 0.6289, + "step": 19 + }, + { + "epoch": 0.09844559585492228, + "eval_accuracy": 0.6457286432160804, + "eval_loss": 0.6634736061096191, + "eval_runtime": 23.6582, + "eval_samples_per_second": 16.823, + "eval_steps_per_second": 2.113, + "step": 19 + }, + { + "epoch": 0.10362694300518134, + "grad_norm": 109.0, + "learning_rate": 4.896373056994819e-05, + "loss": 1.0312, + "step": 20 + }, + { + "epoch": 0.10362694300518134, + "eval_accuracy": 0.628140703517588, + "eval_loss": 0.7022416591644287, + "eval_runtime": 23.7072, + "eval_samples_per_second": 16.788, + "eval_steps_per_second": 2.109, + "step": 20 + }, + { + "epoch": 0.10880829015544041, + "grad_norm": 33.75, + "learning_rate": 4.89119170984456e-05, + "loss": 0.7617, + "step": 21 + }, + { + "epoch": 0.10880829015544041, + "eval_accuracy": 0.6231155778894473, + "eval_loss": 0.7502355575561523, + "eval_runtime": 23.7321, + "eval_samples_per_second": 16.771, + "eval_steps_per_second": 2.107, + "step": 21 + }, + { + "epoch": 0.11398963730569948, + "grad_norm": 27.75, + "learning_rate": 4.886010362694301e-05, + "loss": 0.5781, + "step": 22 + }, + { + "epoch": 0.11398963730569948, + "eval_accuracy": 0.5628140703517588, + "eval_loss": 0.7993286848068237, + "eval_runtime": 23.7441, + "eval_samples_per_second": 16.762, + "eval_steps_per_second": 2.106, + "step": 22 + }, + { + "epoch": 0.11917098445595854, + "grad_norm": 95.5, + "learning_rate": 4.8808290155440416e-05, + "loss": 0.8008, + "step": 23 + }, + { + "epoch": 0.11917098445595854, + "eval_accuracy": 0.5100502512562815, + "eval_loss": 0.8114596605300903, + "eval_runtime": 23.7307, + "eval_samples_per_second": 16.772, + "eval_steps_per_second": 2.107, + "step": 23 + }, + { + "epoch": 0.12435233160621761, + "grad_norm": 37.5, + "learning_rate": 4.8756476683937825e-05, + "loss": 0.3691, + "step": 24 + }, + { + "epoch": 0.12435233160621761, + "eval_accuracy": 0.49748743718592964, + "eval_loss": 0.7979153394699097, + "eval_runtime": 23.7344, + "eval_samples_per_second": 16.769, + "eval_steps_per_second": 2.107, + "step": 24 + }, + { + "epoch": 0.12953367875647667, + "grad_norm": 28.75, + "learning_rate": 4.870466321243523e-05, + "loss": 0.4688, + "step": 25 + }, + { + "epoch": 0.12953367875647667, + "eval_accuracy": 0.5050251256281407, + "eval_loss": 0.7843906879425049, + "eval_runtime": 23.6938, + "eval_samples_per_second": 16.798, + "eval_steps_per_second": 2.11, + "step": 25 + }, + { + "epoch": 0.13471502590673576, + "grad_norm": 43.25, + "learning_rate": 4.865284974093264e-05, + "loss": 0.2676, + "step": 26 + }, + { + "epoch": 0.13471502590673576, + "eval_accuracy": 0.5276381909547738, + "eval_loss": 0.7802685499191284, + "eval_runtime": 23.628, + "eval_samples_per_second": 16.844, + "eval_steps_per_second": 2.116, + "step": 26 + }, + { + "epoch": 0.13989637305699482, + "grad_norm": 56.75, + "learning_rate": 4.860103626943006e-05, + "loss": 0.8828, + "step": 27 + }, + { + "epoch": 0.13989637305699482, + "eval_accuracy": 0.49748743718592964, + "eval_loss": 0.7800329923629761, + "eval_runtime": 23.6083, + "eval_samples_per_second": 16.859, + "eval_steps_per_second": 2.118, + "step": 27 + }, + { + "epoch": 0.14507772020725387, + "grad_norm": 33.5, + "learning_rate": 4.8549222797927465e-05, + "loss": 0.5352, + "step": 28 + }, + { + "epoch": 0.14507772020725387, + "eval_accuracy": 0.4723618090452261, + "eval_loss": 0.7683534622192383, + "eval_runtime": 23.5773, + "eval_samples_per_second": 16.881, + "eval_steps_per_second": 2.121, + "step": 28 + }, + { + "epoch": 0.15025906735751296, + "grad_norm": 18.5, + "learning_rate": 4.8497409326424874e-05, + "loss": 0.5273, + "step": 29 + }, + { + "epoch": 0.15025906735751296, + "eval_accuracy": 0.48743718592964824, + "eval_loss": 0.7782663106918335, + "eval_runtime": 23.505, + "eval_samples_per_second": 16.933, + "eval_steps_per_second": 2.127, + "step": 29 + }, + { + "epoch": 0.15544041450777202, + "grad_norm": 150.0, + "learning_rate": 4.844559585492228e-05, + "loss": 1.1875, + "step": 30 + }, + { + "epoch": 0.15544041450777202, + "eval_accuracy": 0.5125628140703518, + "eval_loss": 0.7549858689308167, + "eval_runtime": 23.5259, + "eval_samples_per_second": 16.918, + "eval_steps_per_second": 2.125, + "step": 30 + }, + { + "epoch": 0.16062176165803108, + "grad_norm": 28.125, + "learning_rate": 4.839378238341969e-05, + "loss": 0.6016, + "step": 31 + }, + { + "epoch": 0.16062176165803108, + "eval_accuracy": 0.5150753768844221, + "eval_loss": 0.7510992288589478, + "eval_runtime": 23.63, + "eval_samples_per_second": 16.843, + "eval_steps_per_second": 2.116, + "step": 31 + }, + { + "epoch": 0.16580310880829016, + "grad_norm": 34.25, + "learning_rate": 4.83419689119171e-05, + "loss": 0.9141, + "step": 32 + }, + { + "epoch": 0.16580310880829016, + "eval_accuracy": 0.5402010050251256, + "eval_loss": 0.7427763938903809, + "eval_runtime": 23.7355, + "eval_samples_per_second": 16.768, + "eval_steps_per_second": 2.107, + "step": 32 + }, + { + "epoch": 0.17098445595854922, + "grad_norm": 19.5, + "learning_rate": 4.829015544041451e-05, + "loss": 0.6445, + "step": 33 + }, + { + "epoch": 0.17098445595854922, + "eval_accuracy": 0.5552763819095478, + "eval_loss": 0.7356705665588379, + "eval_runtime": 23.7046, + "eval_samples_per_second": 16.79, + "eval_steps_per_second": 2.109, + "step": 33 + }, + { + "epoch": 0.17616580310880828, + "grad_norm": 41.25, + "learning_rate": 4.823834196891192e-05, + "loss": 0.5312, + "step": 34 + }, + { + "epoch": 0.17616580310880828, + "eval_accuracy": 0.5703517587939698, + "eval_loss": 0.7327064871788025, + "eval_runtime": 23.7135, + "eval_samples_per_second": 16.784, + "eval_steps_per_second": 2.109, + "step": 34 + }, + { + "epoch": 0.18134715025906736, + "grad_norm": 28.0, + "learning_rate": 4.818652849740933e-05, + "loss": 0.8125, + "step": 35 + }, + { + "epoch": 0.18134715025906736, + "eval_accuracy": 0.5628140703517588, + "eval_loss": 0.7242462038993835, + "eval_runtime": 23.7086, + "eval_samples_per_second": 16.787, + "eval_steps_per_second": 2.109, + "step": 35 + }, + { + "epoch": 0.18652849740932642, + "grad_norm": 27.375, + "learning_rate": 4.813471502590674e-05, + "loss": 0.6875, + "step": 36 + }, + { + "epoch": 0.18652849740932642, + "eval_accuracy": 0.6080402010050251, + "eval_loss": 0.7315287590026855, + "eval_runtime": 23.7039, + "eval_samples_per_second": 16.79, + "eval_steps_per_second": 2.109, + "step": 36 + }, + { + "epoch": 0.19170984455958548, + "grad_norm": 29.375, + "learning_rate": 4.808290155440415e-05, + "loss": 0.8945, + "step": 37 + }, + { + "epoch": 0.19170984455958548, + "eval_accuracy": 0.6180904522613065, + "eval_loss": 0.7307828068733215, + "eval_runtime": 23.7029, + "eval_samples_per_second": 16.791, + "eval_steps_per_second": 2.109, + "step": 37 + }, + { + "epoch": 0.19689119170984457, + "grad_norm": 121.0, + "learning_rate": 4.8031088082901556e-05, + "loss": 0.5938, + "step": 38 + }, + { + "epoch": 0.19689119170984457, + "eval_accuracy": 0.585427135678392, + "eval_loss": 0.7384775280952454, + "eval_runtime": 23.6157, + "eval_samples_per_second": 16.853, + "eval_steps_per_second": 2.117, + "step": 38 + }, + { + "epoch": 0.20207253886010362, + "grad_norm": 214.0, + "learning_rate": 4.7979274611398965e-05, + "loss": 0.9805, + "step": 39 + }, + { + "epoch": 0.20207253886010362, + "eval_accuracy": 0.5879396984924623, + "eval_loss": 0.746800422668457, + "eval_runtime": 23.5814, + "eval_samples_per_second": 16.878, + "eval_steps_per_second": 2.12, + "step": 39 + }, + { + "epoch": 0.20725388601036268, + "grad_norm": 14.5, + "learning_rate": 4.792746113989637e-05, + "loss": 0.668, + "step": 40 + }, + { + "epoch": 0.20725388601036268, + "eval_accuracy": 0.5804020100502513, + "eval_loss": 0.7726916074752808, + "eval_runtime": 23.633, + "eval_samples_per_second": 16.841, + "eval_steps_per_second": 2.116, + "step": 40 + }, + { + "epoch": 0.21243523316062177, + "grad_norm": 13.5625, + "learning_rate": 4.787564766839379e-05, + "loss": 0.6641, + "step": 41 + }, + { + "epoch": 0.21243523316062177, + "eval_accuracy": 0.5678391959798995, + "eval_loss": 0.7937343120574951, + "eval_runtime": 23.6904, + "eval_samples_per_second": 16.8, + "eval_steps_per_second": 2.111, + "step": 41 + }, + { + "epoch": 0.21761658031088082, + "grad_norm": 16.25, + "learning_rate": 4.782383419689119e-05, + "loss": 0.8906, + "step": 42 + }, + { + "epoch": 0.21761658031088082, + "eval_accuracy": 0.5728643216080402, + "eval_loss": 0.8176036477088928, + "eval_runtime": 23.7162, + "eval_samples_per_second": 16.782, + "eval_steps_per_second": 2.108, + "step": 42 + }, + { + "epoch": 0.22279792746113988, + "grad_norm": 11.5625, + "learning_rate": 4.7772020725388605e-05, + "loss": 0.5078, + "step": 43 + }, + { + "epoch": 0.22279792746113988, + "eval_accuracy": 0.5804020100502513, + "eval_loss": 0.8413944840431213, + "eval_runtime": 23.7037, + "eval_samples_per_second": 16.791, + "eval_steps_per_second": 2.109, + "step": 43 + }, + { + "epoch": 0.22797927461139897, + "grad_norm": 900.0, + "learning_rate": 4.7720207253886013e-05, + "loss": 1.1406, + "step": 44 + }, + { + "epoch": 0.22797927461139897, + "eval_accuracy": 0.5728643216080402, + "eval_loss": 0.8346419334411621, + "eval_runtime": 23.701, + "eval_samples_per_second": 16.793, + "eval_steps_per_second": 2.11, + "step": 44 + }, + { + "epoch": 0.23316062176165803, + "grad_norm": 9.5625, + "learning_rate": 4.766839378238342e-05, + "loss": 0.4492, + "step": 45 + }, + { + "epoch": 0.23316062176165803, + "eval_accuracy": 0.5753768844221105, + "eval_loss": 0.8348774909973145, + "eval_runtime": 23.6779, + "eval_samples_per_second": 16.809, + "eval_steps_per_second": 2.112, + "step": 45 + }, + { + "epoch": 0.23834196891191708, + "grad_norm": 16.75, + "learning_rate": 4.761658031088083e-05, + "loss": 0.8672, + "step": 46 + }, + { + "epoch": 0.23834196891191708, + "eval_accuracy": 0.585427135678392, + "eval_loss": 0.8115185499191284, + "eval_runtime": 23.6135, + "eval_samples_per_second": 16.855, + "eval_steps_per_second": 2.117, + "step": 46 + }, + { + "epoch": 0.24352331606217617, + "grad_norm": 113.5, + "learning_rate": 4.756476683937824e-05, + "loss": 1.8359, + "step": 47 + }, + { + "epoch": 0.24352331606217617, + "eval_accuracy": 0.592964824120603, + "eval_loss": 0.7704342007637024, + "eval_runtime": 23.612, + "eval_samples_per_second": 16.856, + "eval_steps_per_second": 2.118, + "step": 47 + }, + { + "epoch": 0.24870466321243523, + "grad_norm": 11.8125, + "learning_rate": 4.7512953367875654e-05, + "loss": 0.543, + "step": 48 + }, + { + "epoch": 0.24870466321243523, + "eval_accuracy": 0.5979899497487438, + "eval_loss": 0.755182147026062, + "eval_runtime": 23.5617, + "eval_samples_per_second": 16.892, + "eval_steps_per_second": 2.122, + "step": 48 + }, + { + "epoch": 0.2538860103626943, + "grad_norm": 10.5625, + "learning_rate": 4.7461139896373056e-05, + "loss": 0.6719, + "step": 49 + }, + { + "epoch": 0.2538860103626943, + "eval_accuracy": 0.6180904522613065, + "eval_loss": 0.7425212264060974, + "eval_runtime": 23.5299, + "eval_samples_per_second": 16.915, + "eval_steps_per_second": 2.125, + "step": 49 + }, + { + "epoch": 0.25906735751295334, + "grad_norm": 10.9375, + "learning_rate": 4.740932642487047e-05, + "loss": 0.4688, + "step": 50 + }, + { + "epoch": 0.25906735751295334, + "eval_accuracy": 0.6256281407035176, + "eval_loss": 0.7508440613746643, + "eval_runtime": 23.5176, + "eval_samples_per_second": 16.923, + "eval_steps_per_second": 2.126, + "step": 50 + }, + { + "epoch": 0.26424870466321243, + "grad_norm": 8.5, + "learning_rate": 4.735751295336788e-05, + "loss": 0.3828, + "step": 51 + }, + { + "epoch": 0.26424870466321243, + "eval_accuracy": 0.6482412060301508, + "eval_loss": 0.7443074584007263, + "eval_runtime": 23.4623, + "eval_samples_per_second": 16.963, + "eval_steps_per_second": 2.131, + "step": 51 + }, + { + "epoch": 0.2694300518134715, + "grad_norm": 74.0, + "learning_rate": 4.730569948186529e-05, + "loss": 0.4766, + "step": 52 + }, + { + "epoch": 0.2694300518134715, + "eval_accuracy": 0.6557788944723618, + "eval_loss": 0.7366716265678406, + "eval_runtime": 23.4685, + "eval_samples_per_second": 16.959, + "eval_steps_per_second": 2.131, + "step": 52 + }, + { + "epoch": 0.27461139896373055, + "grad_norm": 29.875, + "learning_rate": 4.7253886010362696e-05, + "loss": 0.3691, + "step": 53 + }, + { + "epoch": 0.27461139896373055, + "eval_accuracy": 0.6507537688442211, + "eval_loss": 0.7361612915992737, + "eval_runtime": 23.6018, + "eval_samples_per_second": 16.863, + "eval_steps_per_second": 2.118, + "step": 53 + }, + { + "epoch": 0.27979274611398963, + "grad_norm": 14.3125, + "learning_rate": 4.7202072538860104e-05, + "loss": 0.8906, + "step": 54 + }, + { + "epoch": 0.27979274611398963, + "eval_accuracy": 0.6683417085427136, + "eval_loss": 0.7379671931266785, + "eval_runtime": 23.7035, + "eval_samples_per_second": 16.791, + "eval_steps_per_second": 2.109, + "step": 54 + }, + { + "epoch": 0.2849740932642487, + "grad_norm": 151.0, + "learning_rate": 4.715025906735751e-05, + "loss": 0.9453, + "step": 55 + }, + { + "epoch": 0.2849740932642487, + "eval_accuracy": 0.6557788944723618, + "eval_loss": 0.7540633082389832, + "eval_runtime": 23.7098, + "eval_samples_per_second": 16.786, + "eval_steps_per_second": 2.109, + "step": 55 + }, + { + "epoch": 0.29015544041450775, + "grad_norm": 32.5, + "learning_rate": 4.709844559585492e-05, + "loss": 0.418, + "step": 56 + }, + { + "epoch": 0.29015544041450775, + "eval_accuracy": 0.6633165829145728, + "eval_loss": 0.7577536106109619, + "eval_runtime": 23.7199, + "eval_samples_per_second": 16.779, + "eval_steps_per_second": 2.108, + "step": 56 + }, + { + "epoch": 0.29533678756476683, + "grad_norm": 10.5, + "learning_rate": 4.7046632124352336e-05, + "loss": 0.5078, + "step": 57 + }, + { + "epoch": 0.29533678756476683, + "eval_accuracy": 0.6658291457286433, + "eval_loss": 0.7663905620574951, + "eval_runtime": 23.6944, + "eval_samples_per_second": 16.797, + "eval_steps_per_second": 2.11, + "step": 57 + }, + { + "epoch": 0.3005181347150259, + "grad_norm": 17.25, + "learning_rate": 4.6994818652849745e-05, + "loss": 0.8789, + "step": 58 + }, + { + "epoch": 0.3005181347150259, + "eval_accuracy": 0.6557788944723618, + "eval_loss": 0.7679020166397095, + "eval_runtime": 23.6785, + "eval_samples_per_second": 16.808, + "eval_steps_per_second": 2.112, + "step": 58 + }, + { + "epoch": 0.30569948186528495, + "grad_norm": 12.8125, + "learning_rate": 4.694300518134715e-05, + "loss": 0.4062, + "step": 59 + }, + { + "epoch": 0.30569948186528495, + "eval_accuracy": 0.6582914572864321, + "eval_loss": 0.7692956924438477, + "eval_runtime": 23.6294, + "eval_samples_per_second": 16.843, + "eval_steps_per_second": 2.116, + "step": 59 + }, + { + "epoch": 0.31088082901554404, + "grad_norm": 13.5625, + "learning_rate": 4.689119170984456e-05, + "loss": 0.7812, + "step": 60 + }, + { + "epoch": 0.31088082901554404, + "eval_accuracy": 0.6407035175879398, + "eval_loss": 0.7538473606109619, + "eval_runtime": 23.6061, + "eval_samples_per_second": 16.86, + "eval_steps_per_second": 2.118, + "step": 60 + }, + { + "epoch": 0.3160621761658031, + "grad_norm": 100.5, + "learning_rate": 4.683937823834197e-05, + "loss": 1.1797, + "step": 61 + }, + { + "epoch": 0.3160621761658031, + "eval_accuracy": 0.6557788944723618, + "eval_loss": 0.7485670447349548, + "eval_runtime": 23.5684, + "eval_samples_per_second": 16.887, + "eval_steps_per_second": 2.121, + "step": 61 + }, + { + "epoch": 0.32124352331606215, + "grad_norm": 253.0, + "learning_rate": 4.678756476683938e-05, + "loss": 0.6758, + "step": 62 + }, + { + "epoch": 0.32124352331606215, + "eval_accuracy": 0.6532663316582915, + "eval_loss": 0.733099102973938, + "eval_runtime": 23.4938, + "eval_samples_per_second": 16.941, + "eval_steps_per_second": 2.128, + "step": 62 + }, + { + "epoch": 0.32642487046632124, + "grad_norm": 9.875, + "learning_rate": 4.673575129533679e-05, + "loss": 0.5742, + "step": 63 + }, + { + "epoch": 0.32642487046632124, + "eval_accuracy": 0.6407035175879398, + "eval_loss": 0.7366912961006165, + "eval_runtime": 23.5835, + "eval_samples_per_second": 16.876, + "eval_steps_per_second": 2.12, + "step": 63 + }, + { + "epoch": 0.3316062176165803, + "grad_norm": 12.1875, + "learning_rate": 4.66839378238342e-05, + "loss": 0.7578, + "step": 64 + }, + { + "epoch": 0.3316062176165803, + "eval_accuracy": 0.6432160804020101, + "eval_loss": 0.7372605204582214, + "eval_runtime": 23.6146, + "eval_samples_per_second": 16.854, + "eval_steps_per_second": 2.117, + "step": 64 + }, + { + "epoch": 0.33678756476683935, + "grad_norm": 12.875, + "learning_rate": 4.663212435233161e-05, + "loss": 0.6445, + "step": 65 + }, + { + "epoch": 0.33678756476683935, + "eval_accuracy": 0.635678391959799, + "eval_loss": 0.7386149764060974, + "eval_runtime": 23.6718, + "eval_samples_per_second": 16.813, + "eval_steps_per_second": 2.112, + "step": 65 + }, + { + "epoch": 0.34196891191709844, + "grad_norm": 120.5, + "learning_rate": 4.658031088082902e-05, + "loss": 1.0781, + "step": 66 + }, + { + "epoch": 0.34196891191709844, + "eval_accuracy": 0.628140703517588, + "eval_loss": 0.7280739545822144, + "eval_runtime": 23.69, + "eval_samples_per_second": 16.8, + "eval_steps_per_second": 2.111, + "step": 66 + }, + { + "epoch": 0.3471502590673575, + "grad_norm": 7.46875, + "learning_rate": 4.652849740932643e-05, + "loss": 0.3672, + "step": 67 + }, + { + "epoch": 0.3471502590673575, + "eval_accuracy": 0.635678391959799, + "eval_loss": 0.7309594750404358, + "eval_runtime": 23.7069, + "eval_samples_per_second": 16.788, + "eval_steps_per_second": 2.109, + "step": 67 + }, + { + "epoch": 0.35233160621761656, + "grad_norm": 16.875, + "learning_rate": 4.6476683937823836e-05, + "loss": 0.7148, + "step": 68 + }, + { + "epoch": 0.35233160621761656, + "eval_accuracy": 0.6256281407035176, + "eval_loss": 0.7312146425247192, + "eval_runtime": 23.7123, + "eval_samples_per_second": 16.785, + "eval_steps_per_second": 2.109, + "step": 68 + }, + { + "epoch": 0.35751295336787564, + "grad_norm": 146.0, + "learning_rate": 4.6424870466321244e-05, + "loss": 1.2344, + "step": 69 + }, + { + "epoch": 0.35751295336787564, + "eval_accuracy": 0.6180904522613065, + "eval_loss": 0.7239910364151001, + "eval_runtime": 23.7016, + "eval_samples_per_second": 16.792, + "eval_steps_per_second": 2.11, + "step": 69 + }, + { + "epoch": 0.3626943005181347, + "grad_norm": 9.5625, + "learning_rate": 4.637305699481865e-05, + "loss": 0.4355, + "step": 70 + }, + { + "epoch": 0.3626943005181347, + "eval_accuracy": 0.6482412060301508, + "eval_loss": 0.7159037590026855, + "eval_runtime": 23.6498, + "eval_samples_per_second": 16.829, + "eval_steps_per_second": 2.114, + "step": 70 + }, + { + "epoch": 0.36787564766839376, + "grad_norm": 11.0625, + "learning_rate": 4.632124352331607e-05, + "loss": 0.5664, + "step": 71 + }, + { + "epoch": 0.36787564766839376, + "eval_accuracy": 0.635678391959799, + "eval_loss": 0.7206540703773499, + "eval_runtime": 23.6248, + "eval_samples_per_second": 16.847, + "eval_steps_per_second": 2.116, + "step": 71 + }, + { + "epoch": 0.37305699481865284, + "grad_norm": 205.0, + "learning_rate": 4.626943005181347e-05, + "loss": 1.3984, + "step": 72 + }, + { + "epoch": 0.37305699481865284, + "eval_accuracy": 0.6231155778894473, + "eval_loss": 0.7386345863342285, + "eval_runtime": 23.687, + "eval_samples_per_second": 16.802, + "eval_steps_per_second": 2.111, + "step": 72 + }, + { + "epoch": 0.37823834196891193, + "grad_norm": 4.46875, + "learning_rate": 4.6217616580310885e-05, + "loss": 0.1045, + "step": 73 + }, + { + "epoch": 0.37823834196891193, + "eval_accuracy": 0.628140703517588, + "eval_loss": 0.7332364916801453, + "eval_runtime": 23.7126, + "eval_samples_per_second": 16.784, + "eval_steps_per_second": 2.109, + "step": 73 + }, + { + "epoch": 0.38341968911917096, + "grad_norm": 14.0, + "learning_rate": 4.616580310880829e-05, + "loss": 0.5469, + "step": 74 + }, + { + "epoch": 0.38341968911917096, + "eval_accuracy": 0.628140703517588, + "eval_loss": 0.7495877742767334, + "eval_runtime": 23.7179, + "eval_samples_per_second": 16.781, + "eval_steps_per_second": 2.108, + "step": 74 + }, + { + "epoch": 0.38860103626943004, + "grad_norm": 29.5, + "learning_rate": 4.61139896373057e-05, + "loss": 0.4824, + "step": 75 + }, + { + "epoch": 0.38860103626943004, + "eval_accuracy": 0.6231155778894473, + "eval_loss": 0.7662531137466431, + "eval_runtime": 23.7047, + "eval_samples_per_second": 16.79, + "eval_steps_per_second": 2.109, + "step": 75 + }, + { + "epoch": 0.39378238341968913, + "grad_norm": 23.375, + "learning_rate": 4.606217616580311e-05, + "loss": 0.8633, + "step": 76 + }, + { + "epoch": 0.39378238341968913, + "eval_accuracy": 0.6381909547738693, + "eval_loss": 0.7647024393081665, + "eval_runtime": 23.7243, + "eval_samples_per_second": 16.776, + "eval_steps_per_second": 2.108, + "step": 76 + }, + { + "epoch": 0.39896373056994816, + "grad_norm": 19.125, + "learning_rate": 4.601036269430052e-05, + "loss": 0.7422, + "step": 77 + }, + { + "epoch": 0.39896373056994816, + "eval_accuracy": 0.628140703517588, + "eval_loss": 0.7746937870979309, + "eval_runtime": 23.6369, + "eval_samples_per_second": 16.838, + "eval_steps_per_second": 2.115, + "step": 77 + }, + { + "epoch": 0.40414507772020725, + "grad_norm": 13.75, + "learning_rate": 4.5958549222797934e-05, + "loss": 0.7734, + "step": 78 + }, + { + "epoch": 0.40414507772020725, + "eval_accuracy": 0.6206030150753769, + "eval_loss": 0.7783055901527405, + "eval_runtime": 23.5709, + "eval_samples_per_second": 16.885, + "eval_steps_per_second": 2.121, + "step": 78 + }, + { + "epoch": 0.40932642487046633, + "grad_norm": 41.75, + "learning_rate": 4.5906735751295335e-05, + "loss": 1.4453, + "step": 79 + }, + { + "epoch": 0.40932642487046633, + "eval_accuracy": 0.6432160804020101, + "eval_loss": 0.7571254968643188, + "eval_runtime": 23.5639, + "eval_samples_per_second": 16.89, + "eval_steps_per_second": 2.122, + "step": 79 + }, + { + "epoch": 0.41450777202072536, + "grad_norm": 13.5, + "learning_rate": 4.585492227979275e-05, + "loss": 0.5234, + "step": 80 + }, + { + "epoch": 0.41450777202072536, + "eval_accuracy": 0.635678391959799, + "eval_loss": 0.754514753818512, + "eval_runtime": 23.4957, + "eval_samples_per_second": 16.939, + "eval_steps_per_second": 2.128, + "step": 80 + }, + { + "epoch": 0.41968911917098445, + "grad_norm": 17.75, + "learning_rate": 4.580310880829016e-05, + "loss": 0.7188, + "step": 81 + }, + { + "epoch": 0.41968911917098445, + "eval_accuracy": 0.6306532663316583, + "eval_loss": 0.7605017423629761, + "eval_runtime": 23.5089, + "eval_samples_per_second": 16.93, + "eval_steps_per_second": 2.127, + "step": 81 + }, + { + "epoch": 0.42487046632124353, + "grad_norm": 42.25, + "learning_rate": 4.575129533678757e-05, + "loss": 0.4258, + "step": 82 + }, + { + "epoch": 0.42487046632124353, + "eval_accuracy": 0.6432160804020101, + "eval_loss": 0.751982569694519, + "eval_runtime": 23.5575, + "eval_samples_per_second": 16.895, + "eval_steps_per_second": 2.122, + "step": 82 + }, + { + "epoch": 0.43005181347150256, + "grad_norm": 19.125, + "learning_rate": 4.5699481865284976e-05, + "loss": 1.1953, + "step": 83 + }, + { + "epoch": 0.43005181347150256, + "eval_accuracy": 0.6407035175879398, + "eval_loss": 0.752591073513031, + "eval_runtime": 23.6787, + "eval_samples_per_second": 16.808, + "eval_steps_per_second": 2.112, + "step": 83 + }, + { + "epoch": 0.43523316062176165, + "grad_norm": 8.625, + "learning_rate": 4.5647668393782384e-05, + "loss": 0.3711, + "step": 84 + }, + { + "epoch": 0.43523316062176165, + "eval_accuracy": 0.6381909547738693, + "eval_loss": 0.7537099719047546, + "eval_runtime": 23.6911, + "eval_samples_per_second": 16.8, + "eval_steps_per_second": 2.11, + "step": 84 + }, + { + "epoch": 0.44041450777202074, + "grad_norm": 5.75, + "learning_rate": 4.55958549222798e-05, + "loss": 0.2188, + "step": 85 + }, + { + "epoch": 0.44041450777202074, + "eval_accuracy": 0.628140703517588, + "eval_loss": 0.7552803158760071, + "eval_runtime": 23.7483, + "eval_samples_per_second": 16.759, + "eval_steps_per_second": 2.105, + "step": 85 + }, + { + "epoch": 0.44559585492227977, + "grad_norm": 10.75, + "learning_rate": 4.55440414507772e-05, + "loss": 0.3555, + "step": 86 + }, + { + "epoch": 0.44559585492227977, + "eval_accuracy": 0.628140703517588, + "eval_loss": 0.7666850090026855, + "eval_runtime": 23.752, + "eval_samples_per_second": 16.756, + "eval_steps_per_second": 2.105, + "step": 86 + }, + { + "epoch": 0.45077720207253885, + "grad_norm": 12.625, + "learning_rate": 4.5492227979274616e-05, + "loss": 0.6484, + "step": 87 + }, + { + "epoch": 0.45077720207253885, + "eval_accuracy": 0.6306532663316583, + "eval_loss": 0.7797974348068237, + "eval_runtime": 23.6975, + "eval_samples_per_second": 16.795, + "eval_steps_per_second": 2.11, + "step": 87 + }, + { + "epoch": 0.45595854922279794, + "grad_norm": 14.6875, + "learning_rate": 4.5440414507772025e-05, + "loss": 0.8516, + "step": 88 + }, + { + "epoch": 0.45595854922279794, + "eval_accuracy": 0.6206030150753769, + "eval_loss": 0.7789337038993835, + "eval_runtime": 23.6825, + "eval_samples_per_second": 16.806, + "eval_steps_per_second": 2.111, + "step": 88 + }, + { + "epoch": 0.46113989637305697, + "grad_norm": 27.25, + "learning_rate": 4.538860103626943e-05, + "loss": 1.8125, + "step": 89 + }, + { + "epoch": 0.46113989637305697, + "eval_accuracy": 0.628140703517588, + "eval_loss": 0.7647809386253357, + "eval_runtime": 23.6303, + "eval_samples_per_second": 16.843, + "eval_steps_per_second": 2.116, + "step": 89 + }, + { + "epoch": 0.46632124352331605, + "grad_norm": 15.25, + "learning_rate": 4.533678756476684e-05, + "loss": 0.8047, + "step": 90 + }, + { + "epoch": 0.46632124352331605, + "eval_accuracy": 0.6381909547738693, + "eval_loss": 0.7367894053459167, + "eval_runtime": 23.6063, + "eval_samples_per_second": 16.86, + "eval_steps_per_second": 2.118, + "step": 90 + }, + { + "epoch": 0.47150259067357514, + "grad_norm": 109.0, + "learning_rate": 4.528497409326425e-05, + "loss": 1.7109, + "step": 91 + }, + { + "epoch": 0.47150259067357514, + "eval_accuracy": 0.6407035175879398, + "eval_loss": 0.7258166074752808, + "eval_runtime": 23.515, + "eval_samples_per_second": 16.925, + "eval_steps_per_second": 2.126, + "step": 91 + }, + { + "epoch": 0.47668393782383417, + "grad_norm": 10.8125, + "learning_rate": 4.523316062176166e-05, + "loss": 0.5938, + "step": 92 + }, + { + "epoch": 0.47668393782383417, + "eval_accuracy": 0.6331658291457286, + "eval_loss": 0.7118011713027954, + "eval_runtime": 23.4901, + "eval_samples_per_second": 16.943, + "eval_steps_per_second": 2.129, + "step": 92 + }, + { + "epoch": 0.48186528497409326, + "grad_norm": 9.0, + "learning_rate": 4.5181347150259067e-05, + "loss": 0.5469, + "step": 93 + }, + { + "epoch": 0.48186528497409326, + "eval_accuracy": 0.6381909547738693, + "eval_loss": 0.7092886567115784, + "eval_runtime": 23.5146, + "eval_samples_per_second": 16.926, + "eval_steps_per_second": 2.126, + "step": 93 + }, + { + "epoch": 0.48704663212435234, + "grad_norm": 13.3125, + "learning_rate": 4.512953367875648e-05, + "loss": 0.457, + "step": 94 + }, + { + "epoch": 0.48704663212435234, + "eval_accuracy": 0.628140703517588, + "eval_loss": 0.6999843120574951, + "eval_runtime": 23.6638, + "eval_samples_per_second": 16.819, + "eval_steps_per_second": 2.113, + "step": 94 + }, + { + "epoch": 0.49222797927461137, + "grad_norm": 9.6875, + "learning_rate": 4.507772020725389e-05, + "loss": 0.6211, + "step": 95 + }, + { + "epoch": 0.49222797927461137, + "eval_accuracy": 0.6306532663316583, + "eval_loss": 0.6926429271697998, + "eval_runtime": 23.6743, + "eval_samples_per_second": 16.811, + "eval_steps_per_second": 2.112, + "step": 95 + }, + { + "epoch": 0.49740932642487046, + "grad_norm": 18.625, + "learning_rate": 4.50259067357513e-05, + "loss": 1.0078, + "step": 96 + }, + { + "epoch": 0.49740932642487046, + "eval_accuracy": 0.6381909547738693, + "eval_loss": 0.675212025642395, + "eval_runtime": 23.697, + "eval_samples_per_second": 16.795, + "eval_steps_per_second": 2.11, + "step": 96 + }, + { + "epoch": 0.5025906735751295, + "grad_norm": 12.375, + "learning_rate": 4.497409326424871e-05, + "loss": 0.6914, + "step": 97 + }, + { + "epoch": 0.5025906735751295, + "eval_accuracy": 0.6532663316582915, + "eval_loss": 0.6650832295417786, + "eval_runtime": 23.7104, + "eval_samples_per_second": 16.786, + "eval_steps_per_second": 2.109, + "step": 97 + }, + { + "epoch": 0.5077720207253886, + "grad_norm": 15.125, + "learning_rate": 4.4922279792746115e-05, + "loss": 1.0, + "step": 98 + }, + { + "epoch": 0.5077720207253886, + "eval_accuracy": 0.6482412060301508, + "eval_loss": 0.6655739545822144, + "eval_runtime": 23.6841, + "eval_samples_per_second": 16.805, + "eval_steps_per_second": 2.111, + "step": 98 + }, + { + "epoch": 0.5129533678756477, + "grad_norm": 11.0625, + "learning_rate": 4.4870466321243524e-05, + "loss": 0.5898, + "step": 99 + }, + { + "epoch": 0.5129533678756477, + "eval_accuracy": 0.6457286432160804, + "eval_loss": 0.6619621515274048, + "eval_runtime": 23.7091, + "eval_samples_per_second": 16.787, + "eval_steps_per_second": 2.109, + "step": 99 + }, + { + "epoch": 0.5181347150259067, + "grad_norm": 318.0, + "learning_rate": 4.481865284974093e-05, + "loss": 1.6797, + "step": 100 + }, + { + "epoch": 0.5181347150259067, + "eval_accuracy": 0.6457286432160804, + "eval_loss": 0.6478878855705261, + "eval_runtime": 23.5817, + "eval_samples_per_second": 16.878, + "eval_steps_per_second": 2.12, + "step": 100 + }, + { + "epoch": 0.5233160621761658, + "grad_norm": 7.53125, + "learning_rate": 4.476683937823835e-05, + "loss": 0.5078, + "step": 101 + }, + { + "epoch": 0.5233160621761658, + "eval_accuracy": 0.6457286432160804, + "eval_loss": 0.6403305530548096, + "eval_runtime": 23.554, + "eval_samples_per_second": 16.897, + "eval_steps_per_second": 2.123, + "step": 101 + }, + { + "epoch": 0.5284974093264249, + "grad_norm": 27.125, + "learning_rate": 4.471502590673575e-05, + "loss": 0.6836, + "step": 102 + }, + { + "epoch": 0.5284974093264249, + "eval_accuracy": 0.6708542713567839, + "eval_loss": 0.6396435499191284, + "eval_runtime": 23.4978, + "eval_samples_per_second": 16.938, + "eval_steps_per_second": 2.128, + "step": 102 + }, + { + "epoch": 0.533678756476684, + "grad_norm": 8.4375, + "learning_rate": 4.4663212435233164e-05, + "loss": 0.668, + "step": 103 + }, + { + "epoch": 0.533678756476684, + "eval_accuracy": 0.6658291457286433, + "eval_loss": 0.6415279507637024, + "eval_runtime": 23.6443, + "eval_samples_per_second": 16.833, + "eval_steps_per_second": 2.115, + "step": 103 + }, + { + "epoch": 0.538860103626943, + "grad_norm": 8.3125, + "learning_rate": 4.461139896373057e-05, + "loss": 0.6094, + "step": 104 + }, + { + "epoch": 0.538860103626943, + "eval_accuracy": 0.6733668341708543, + "eval_loss": 0.6463567614555359, + "eval_runtime": 23.6631, + "eval_samples_per_second": 16.819, + "eval_steps_per_second": 2.113, + "step": 104 + }, + { + "epoch": 0.5440414507772021, + "grad_norm": 11.4375, + "learning_rate": 4.455958549222798e-05, + "loss": 0.7266, + "step": 105 + }, + { + "epoch": 0.5440414507772021, + "eval_accuracy": 0.6708542713567839, + "eval_loss": 0.646415650844574, + "eval_runtime": 23.6892, + "eval_samples_per_second": 16.801, + "eval_steps_per_second": 2.111, + "step": 105 + }, + { + "epoch": 0.5492227979274611, + "grad_norm": 8.9375, + "learning_rate": 4.450777202072539e-05, + "loss": 0.7266, + "step": 106 + }, + { + "epoch": 0.5492227979274611, + "eval_accuracy": 0.6608040201005025, + "eval_loss": 0.645100474357605, + "eval_runtime": 23.7086, + "eval_samples_per_second": 16.787, + "eval_steps_per_second": 2.109, + "step": 106 + }, + { + "epoch": 0.5544041450777202, + "grad_norm": 9.375, + "learning_rate": 4.44559585492228e-05, + "loss": 0.5508, + "step": 107 + }, + { + "epoch": 0.5544041450777202, + "eval_accuracy": 0.6683417085427136, + "eval_loss": 0.6430197954177856, + "eval_runtime": 23.6952, + "eval_samples_per_second": 16.797, + "eval_steps_per_second": 2.11, + "step": 107 + }, + { + "epoch": 0.5595854922279793, + "grad_norm": 14.9375, + "learning_rate": 4.440414507772021e-05, + "loss": 0.6562, + "step": 108 + }, + { + "epoch": 0.5595854922279793, + "eval_accuracy": 0.6582914572864321, + "eval_loss": 0.6414101719856262, + "eval_runtime": 23.6943, + "eval_samples_per_second": 16.797, + "eval_steps_per_second": 2.11, + "step": 108 + }, + { + "epoch": 0.5647668393782384, + "grad_norm": 21.75, + "learning_rate": 4.4352331606217615e-05, + "loss": 0.9258, + "step": 109 + }, + { + "epoch": 0.5647668393782384, + "eval_accuracy": 0.6482412060301508, + "eval_loss": 0.6451593637466431, + "eval_runtime": 23.6644, + "eval_samples_per_second": 16.819, + "eval_steps_per_second": 2.113, + "step": 109 + }, + { + "epoch": 0.5699481865284974, + "grad_norm": 8.125, + "learning_rate": 4.430051813471503e-05, + "loss": 0.6484, + "step": 110 + }, + { + "epoch": 0.5699481865284974, + "eval_accuracy": 0.6482412060301508, + "eval_loss": 0.6474952697753906, + "eval_runtime": 23.5996, + "eval_samples_per_second": 16.865, + "eval_steps_per_second": 2.119, + "step": 110 + }, + { + "epoch": 0.5751295336787565, + "grad_norm": 19.375, + "learning_rate": 4.424870466321244e-05, + "loss": 0.5859, + "step": 111 + }, + { + "epoch": 0.5751295336787565, + "eval_accuracy": 0.6381909547738693, + "eval_loss": 0.652991533279419, + "eval_runtime": 23.5384, + "eval_samples_per_second": 16.909, + "eval_steps_per_second": 2.124, + "step": 111 + }, + { + "epoch": 0.5803108808290155, + "grad_norm": 11.125, + "learning_rate": 4.419689119170985e-05, + "loss": 0.8398, + "step": 112 + }, + { + "epoch": 0.5803108808290155, + "eval_accuracy": 0.635678391959799, + "eval_loss": 0.6619032621383667, + "eval_runtime": 23.5456, + "eval_samples_per_second": 16.903, + "eval_steps_per_second": 2.124, + "step": 112 + }, + { + "epoch": 0.5854922279792746, + "grad_norm": 10.75, + "learning_rate": 4.4145077720207255e-05, + "loss": 0.6367, + "step": 113 + }, + { + "epoch": 0.5854922279792746, + "eval_accuracy": 0.6256281407035176, + "eval_loss": 0.6710898280143738, + "eval_runtime": 23.5573, + "eval_samples_per_second": 16.895, + "eval_steps_per_second": 2.122, + "step": 113 + }, + { + "epoch": 0.5906735751295337, + "grad_norm": 27.375, + "learning_rate": 4.4093264248704664e-05, + "loss": 0.6367, + "step": 114 + }, + { + "epoch": 0.5906735751295337, + "eval_accuracy": 0.6306532663316583, + "eval_loss": 0.6657310128211975, + "eval_runtime": 23.6625, + "eval_samples_per_second": 16.82, + "eval_steps_per_second": 2.113, + "step": 114 + }, + { + "epoch": 0.5958549222797928, + "grad_norm": 24.25, + "learning_rate": 4.404145077720208e-05, + "loss": 0.75, + "step": 115 + }, + { + "epoch": 0.5958549222797928, + "eval_accuracy": 0.6407035175879398, + "eval_loss": 0.657565176486969, + "eval_runtime": 23.7252, + "eval_samples_per_second": 16.775, + "eval_steps_per_second": 2.107, + "step": 115 + }, + { + "epoch": 0.6010362694300518, + "grad_norm": 8.5, + "learning_rate": 4.398963730569948e-05, + "loss": 0.6875, + "step": 116 + }, + { + "epoch": 0.6010362694300518, + "eval_accuracy": 0.6482412060301508, + "eval_loss": 0.6540122628211975, + "eval_runtime": 23.6906, + "eval_samples_per_second": 16.8, + "eval_steps_per_second": 2.111, + "step": 116 + }, + { + "epoch": 0.6062176165803109, + "grad_norm": 9.375, + "learning_rate": 4.3937823834196896e-05, + "loss": 0.6953, + "step": 117 + }, + { + "epoch": 0.6062176165803109, + "eval_accuracy": 0.6331658291457286, + "eval_loss": 0.6604310870170593, + "eval_runtime": 23.7017, + "eval_samples_per_second": 16.792, + "eval_steps_per_second": 2.11, + "step": 117 + }, + { + "epoch": 0.6113989637305699, + "grad_norm": 8.875, + "learning_rate": 4.3886010362694304e-05, + "loss": 0.6758, + "step": 118 + }, + { + "epoch": 0.6113989637305699, + "eval_accuracy": 0.6231155778894473, + "eval_loss": 0.661510705947876, + "eval_runtime": 23.6881, + "eval_samples_per_second": 16.802, + "eval_steps_per_second": 2.111, + "step": 118 + }, + { + "epoch": 0.616580310880829, + "grad_norm": 20.875, + "learning_rate": 4.383419689119171e-05, + "loss": 1.0, + "step": 119 + }, + { + "epoch": 0.616580310880829, + "eval_accuracy": 0.6206030150753769, + "eval_loss": 0.6571529507637024, + "eval_runtime": 23.7126, + "eval_samples_per_second": 16.784, + "eval_steps_per_second": 2.109, + "step": 119 + }, + { + "epoch": 0.6217616580310881, + "grad_norm": 8.4375, + "learning_rate": 4.378238341968912e-05, + "loss": 0.582, + "step": 120 + }, + { + "epoch": 0.6217616580310881, + "eval_accuracy": 0.6381909547738693, + "eval_loss": 0.6559359431266785, + "eval_runtime": 23.6147, + "eval_samples_per_second": 16.854, + "eval_steps_per_second": 2.117, + "step": 120 + }, + { + "epoch": 0.6269430051813472, + "grad_norm": 7.5, + "learning_rate": 4.373056994818653e-05, + "loss": 0.5977, + "step": 121 + }, + { + "epoch": 0.6269430051813472, + "eval_accuracy": 0.6432160804020101, + "eval_loss": 0.6563481688499451, + "eval_runtime": 23.5467, + "eval_samples_per_second": 16.903, + "eval_steps_per_second": 2.123, + "step": 121 + }, + { + "epoch": 0.6321243523316062, + "grad_norm": 8.3125, + "learning_rate": 4.367875647668394e-05, + "loss": 0.5977, + "step": 122 + }, + { + "epoch": 0.6321243523316062, + "eval_accuracy": 0.6507537688442211, + "eval_loss": 0.6563677787780762, + "eval_runtime": 23.5552, + "eval_samples_per_second": 16.896, + "eval_steps_per_second": 2.123, + "step": 122 + }, + { + "epoch": 0.6373056994818653, + "grad_norm": 8.0, + "learning_rate": 4.3626943005181346e-05, + "loss": 0.6016, + "step": 123 + }, + { + "epoch": 0.6373056994818653, + "eval_accuracy": 0.6457286432160804, + "eval_loss": 0.6557396650314331, + "eval_runtime": 23.4825, + "eval_samples_per_second": 16.949, + "eval_steps_per_second": 2.129, + "step": 123 + }, + { + "epoch": 0.6424870466321243, + "grad_norm": 11.0625, + "learning_rate": 4.357512953367876e-05, + "loss": 0.625, + "step": 124 + }, + { + "epoch": 0.6424870466321243, + "eval_accuracy": 0.635678391959799, + "eval_loss": 0.6570547819137573, + "eval_runtime": 23.5278, + "eval_samples_per_second": 16.916, + "eval_steps_per_second": 2.125, + "step": 124 + }, + { + "epoch": 0.6476683937823834, + "grad_norm": 11.9375, + "learning_rate": 4.352331606217617e-05, + "loss": 0.875, + "step": 125 + }, + { + "epoch": 0.6476683937823834, + "eval_accuracy": 0.6381909547738693, + "eval_loss": 0.65558260679245, + "eval_runtime": 23.6306, + "eval_samples_per_second": 16.843, + "eval_steps_per_second": 2.116, + "step": 125 + }, + { + "epoch": 0.6528497409326425, + "grad_norm": 12.1875, + "learning_rate": 4.347150259067358e-05, + "loss": 1.0, + "step": 126 + }, + { + "epoch": 0.6528497409326425, + "eval_accuracy": 0.6482412060301508, + "eval_loss": 0.654895544052124, + "eval_runtime": 23.6479, + "eval_samples_per_second": 16.83, + "eval_steps_per_second": 2.114, + "step": 126 + }, + { + "epoch": 0.6580310880829016, + "grad_norm": 12.0625, + "learning_rate": 4.341968911917099e-05, + "loss": 0.3691, + "step": 127 + }, + { + "epoch": 0.6580310880829016, + "eval_accuracy": 0.6532663316582915, + "eval_loss": 0.6522652506828308, + "eval_runtime": 23.6848, + "eval_samples_per_second": 16.804, + "eval_steps_per_second": 2.111, + "step": 127 + }, + { + "epoch": 0.6632124352331606, + "grad_norm": 7.40625, + "learning_rate": 4.3367875647668395e-05, + "loss": 0.6328, + "step": 128 + }, + { + "epoch": 0.6632124352331606, + "eval_accuracy": 0.6582914572864321, + "eval_loss": 0.6493797302246094, + "eval_runtime": 23.6896, + "eval_samples_per_second": 16.801, + "eval_steps_per_second": 2.111, + "step": 128 + }, + { + "epoch": 0.6683937823834197, + "grad_norm": 7.0, + "learning_rate": 4.3316062176165804e-05, + "loss": 0.5352, + "step": 129 + }, + { + "epoch": 0.6683937823834197, + "eval_accuracy": 0.6482412060301508, + "eval_loss": 0.6486337780952454, + "eval_runtime": 23.7062, + "eval_samples_per_second": 16.789, + "eval_steps_per_second": 2.109, + "step": 129 + }, + { + "epoch": 0.6735751295336787, + "grad_norm": 14.4375, + "learning_rate": 4.326424870466321e-05, + "loss": 1.2969, + "step": 130 + }, + { + "epoch": 0.6735751295336787, + "eval_accuracy": 0.6582914572864321, + "eval_loss": 0.6467886567115784, + "eval_runtime": 23.686, + "eval_samples_per_second": 16.803, + "eval_steps_per_second": 2.111, + "step": 130 + }, + { + "epoch": 0.6787564766839378, + "grad_norm": 8.6875, + "learning_rate": 4.321243523316063e-05, + "loss": 0.6016, + "step": 131 + }, + { + "epoch": 0.6787564766839378, + "eval_accuracy": 0.6633165829145728, + "eval_loss": 0.6467886567115784, + "eval_runtime": 23.665, + "eval_samples_per_second": 16.818, + "eval_steps_per_second": 2.113, + "step": 131 + }, + { + "epoch": 0.6839378238341969, + "grad_norm": 8.3125, + "learning_rate": 4.3160621761658036e-05, + "loss": 0.6016, + "step": 132 + }, + { + "epoch": 0.6839378238341969, + "eval_accuracy": 0.6532663316582915, + "eval_loss": 0.6463371515274048, + "eval_runtime": 23.6103, + "eval_samples_per_second": 16.857, + "eval_steps_per_second": 2.118, + "step": 132 + }, + { + "epoch": 0.689119170984456, + "grad_norm": 8.25, + "learning_rate": 4.3108808290155444e-05, + "loss": 0.6367, + "step": 133 + }, + { + "epoch": 0.689119170984456, + "eval_accuracy": 0.6683417085427136, + "eval_loss": 0.6469653248786926, + "eval_runtime": 23.6155, + "eval_samples_per_second": 16.853, + "eval_steps_per_second": 2.117, + "step": 133 + }, + { + "epoch": 0.694300518134715, + "grad_norm": 9.625, + "learning_rate": 4.305699481865285e-05, + "loss": 0.8477, + "step": 134 + }, + { + "epoch": 0.694300518134715, + "eval_accuracy": 0.6633165829145728, + "eval_loss": 0.6467493772506714, + "eval_runtime": 23.5476, + "eval_samples_per_second": 16.902, + "eval_steps_per_second": 2.123, + "step": 134 + }, + { + "epoch": 0.6994818652849741, + "grad_norm": 7.125, + "learning_rate": 4.300518134715026e-05, + "loss": 0.4375, + "step": 135 + }, + { + "epoch": 0.6994818652849741, + "eval_accuracy": 0.6758793969849246, + "eval_loss": 0.6458268165588379, + "eval_runtime": 23.5632, + "eval_samples_per_second": 16.891, + "eval_steps_per_second": 2.122, + "step": 135 + }, + { + "epoch": 0.7046632124352331, + "grad_norm": 17.0, + "learning_rate": 4.295336787564767e-05, + "loss": 0.9844, + "step": 136 + }, + { + "epoch": 0.7046632124352331, + "eval_accuracy": 0.6683417085427136, + "eval_loss": 0.6434909105300903, + "eval_runtime": 23.5504, + "eval_samples_per_second": 16.9, + "eval_steps_per_second": 2.123, + "step": 136 + }, + { + "epoch": 0.7098445595854922, + "grad_norm": 6.90625, + "learning_rate": 4.290155440414508e-05, + "loss": 0.5703, + "step": 137 + }, + { + "epoch": 0.7098445595854922, + "eval_accuracy": 0.6733668341708543, + "eval_loss": 0.6412138938903809, + "eval_runtime": 23.6273, + "eval_samples_per_second": 16.845, + "eval_steps_per_second": 2.116, + "step": 137 + }, + { + "epoch": 0.7150259067357513, + "grad_norm": 16.625, + "learning_rate": 4.284974093264249e-05, + "loss": 0.9648, + "step": 138 + }, + { + "epoch": 0.7150259067357513, + "eval_accuracy": 0.6733668341708543, + "eval_loss": 0.6374843120574951, + "eval_runtime": 23.6722, + "eval_samples_per_second": 16.813, + "eval_steps_per_second": 2.112, + "step": 138 + }, + { + "epoch": 0.7202072538860104, + "grad_norm": 6.9375, + "learning_rate": 4.2797927461139894e-05, + "loss": 0.6289, + "step": 139 + }, + { + "epoch": 0.7202072538860104, + "eval_accuracy": 0.6658291457286433, + "eval_loss": 0.6345202326774597, + "eval_runtime": 23.689, + "eval_samples_per_second": 16.801, + "eval_steps_per_second": 2.111, + "step": 139 + }, + { + "epoch": 0.7253886010362695, + "grad_norm": 7.8125, + "learning_rate": 4.274611398963731e-05, + "loss": 0.7188, + "step": 140 + }, + { + "epoch": 0.7253886010362695, + "eval_accuracy": 0.6733668341708543, + "eval_loss": 0.6332836151123047, + "eval_runtime": 23.6828, + "eval_samples_per_second": 16.805, + "eval_steps_per_second": 2.111, + "step": 140 + }, + { + "epoch": 0.7305699481865285, + "grad_norm": 8.6875, + "learning_rate": 4.269430051813472e-05, + "loss": 0.7812, + "step": 141 + }, + { + "epoch": 0.7305699481865285, + "eval_accuracy": 0.6507537688442211, + "eval_loss": 0.6318310499191284, + "eval_runtime": 23.6898, + "eval_samples_per_second": 16.8, + "eval_steps_per_second": 2.111, + "step": 141 + }, + { + "epoch": 0.7357512953367875, + "grad_norm": 7.5625, + "learning_rate": 4.2642487046632127e-05, + "loss": 0.6016, + "step": 142 + }, + { + "epoch": 0.7357512953367875, + "eval_accuracy": 0.6809045226130653, + "eval_loss": 0.6313403248786926, + "eval_runtime": 23.6677, + "eval_samples_per_second": 16.816, + "eval_steps_per_second": 2.113, + "step": 142 + }, + { + "epoch": 0.7409326424870466, + "grad_norm": 10.3125, + "learning_rate": 4.2590673575129535e-05, + "loss": 0.9453, + "step": 143 + }, + { + "epoch": 0.7409326424870466, + "eval_accuracy": 0.6834170854271356, + "eval_loss": 0.6319684386253357, + "eval_runtime": 23.6017, + "eval_samples_per_second": 16.863, + "eval_steps_per_second": 2.118, + "step": 143 + }, + { + "epoch": 0.7461139896373057, + "grad_norm": 7.59375, + "learning_rate": 4.253886010362694e-05, + "loss": 0.7734, + "step": 144 + }, + { + "epoch": 0.7461139896373057, + "eval_accuracy": 0.6884422110552764, + "eval_loss": 0.632164716720581, + "eval_runtime": 23.5775, + "eval_samples_per_second": 16.881, + "eval_steps_per_second": 2.121, + "step": 144 + }, + { + "epoch": 0.7512953367875648, + "grad_norm": 7.3125, + "learning_rate": 4.248704663212436e-05, + "loss": 0.6641, + "step": 145 + }, + { + "epoch": 0.7512953367875648, + "eval_accuracy": 0.6809045226130653, + "eval_loss": 0.6330480575561523, + "eval_runtime": 23.5459, + "eval_samples_per_second": 16.903, + "eval_steps_per_second": 2.124, + "step": 145 + }, + { + "epoch": 0.7564766839378239, + "grad_norm": 7.90625, + "learning_rate": 4.243523316062176e-05, + "loss": 0.6328, + "step": 146 + }, + { + "epoch": 0.7564766839378239, + "eval_accuracy": 0.6959798994974874, + "eval_loss": 0.6339510083198547, + "eval_runtime": 23.4736, + "eval_samples_per_second": 16.955, + "eval_steps_per_second": 2.13, + "step": 146 + }, + { + "epoch": 0.7616580310880829, + "grad_norm": 8.1875, + "learning_rate": 4.2383419689119175e-05, + "loss": 0.7734, + "step": 147 + }, + { + "epoch": 0.7616580310880829, + "eval_accuracy": 0.6909547738693468, + "eval_loss": 0.6343239545822144, + "eval_runtime": 23.5275, + "eval_samples_per_second": 16.916, + "eval_steps_per_second": 2.125, + "step": 147 + }, + { + "epoch": 0.7668393782383419, + "grad_norm": 9.875, + "learning_rate": 4.2331606217616584e-05, + "loss": 0.5703, + "step": 148 + }, + { + "epoch": 0.7668393782383419, + "eval_accuracy": 0.6909547738693468, + "eval_loss": 0.6348735690116882, + "eval_runtime": 23.6078, + "eval_samples_per_second": 16.859, + "eval_steps_per_second": 2.118, + "step": 148 + }, + { + "epoch": 0.772020725388601, + "grad_norm": 6.5, + "learning_rate": 4.227979274611399e-05, + "loss": 0.6289, + "step": 149 + }, + { + "epoch": 0.772020725388601, + "eval_accuracy": 0.6683417085427136, + "eval_loss": 0.6365616917610168, + "eval_runtime": 23.6584, + "eval_samples_per_second": 16.823, + "eval_steps_per_second": 2.113, + "step": 149 + }, + { + "epoch": 0.7772020725388601, + "grad_norm": 8.4375, + "learning_rate": 4.22279792746114e-05, + "loss": 0.7734, + "step": 150 + }, + { + "epoch": 0.7772020725388601, + "eval_accuracy": 0.6758793969849246, + "eval_loss": 0.6376217007637024, + "eval_runtime": 23.6769, + "eval_samples_per_second": 16.81, + "eval_steps_per_second": 2.112, + "step": 150 + }, + { + "epoch": 0.7823834196891192, + "grad_norm": 7.46875, + "learning_rate": 4.217616580310881e-05, + "loss": 0.7188, + "step": 151 + }, + { + "epoch": 0.7823834196891192, + "eval_accuracy": 0.6708542713567839, + "eval_loss": 0.639309823513031, + "eval_runtime": 23.5332, + "eval_samples_per_second": 16.912, + "eval_steps_per_second": 2.125, + "step": 151 + }, + { + "epoch": 0.7875647668393783, + "grad_norm": 6.125, + "learning_rate": 4.212435233160622e-05, + "loss": 0.4434, + "step": 152 + }, + { + "epoch": 0.7875647668393783, + "eval_accuracy": 0.6658291457286433, + "eval_loss": 0.6415672302246094, + "eval_runtime": 23.5075, + "eval_samples_per_second": 16.931, + "eval_steps_per_second": 2.127, + "step": 152 + }, + { + "epoch": 0.7927461139896373, + "grad_norm": 6.15625, + "learning_rate": 4.2072538860103626e-05, + "loss": 0.5625, + "step": 153 + }, + { + "epoch": 0.7927461139896373, + "eval_accuracy": 0.6608040201005025, + "eval_loss": 0.6433731317520142, + "eval_runtime": 23.6051, + "eval_samples_per_second": 16.861, + "eval_steps_per_second": 2.118, + "step": 153 + }, + { + "epoch": 0.7979274611398963, + "grad_norm": 8.0, + "learning_rate": 4.202072538860104e-05, + "loss": 0.7422, + "step": 154 + }, + { + "epoch": 0.7979274611398963, + "eval_accuracy": 0.6582914572864321, + "eval_loss": 0.6435105204582214, + "eval_runtime": 23.7009, + "eval_samples_per_second": 16.793, + "eval_steps_per_second": 2.11, + "step": 154 + }, + { + "epoch": 0.8031088082901554, + "grad_norm": 9.5625, + "learning_rate": 4.196891191709845e-05, + "loss": 0.6641, + "step": 155 + }, + { + "epoch": 0.8031088082901554, + "eval_accuracy": 0.6407035175879398, + "eval_loss": 0.6442956924438477, + "eval_runtime": 23.6704, + "eval_samples_per_second": 16.814, + "eval_steps_per_second": 2.112, + "step": 155 + }, + { + "epoch": 0.8082901554404145, + "grad_norm": 6.09375, + "learning_rate": 4.191709844559586e-05, + "loss": 0.5195, + "step": 156 + }, + { + "epoch": 0.8082901554404145, + "eval_accuracy": 0.6557788944723618, + "eval_loss": 0.6439031362533569, + "eval_runtime": 23.6756, + "eval_samples_per_second": 16.811, + "eval_steps_per_second": 2.112, + "step": 156 + }, + { + "epoch": 0.8134715025906736, + "grad_norm": 10.875, + "learning_rate": 4.1865284974093266e-05, + "loss": 0.8242, + "step": 157 + }, + { + "epoch": 0.8134715025906736, + "eval_accuracy": 0.6457286432160804, + "eval_loss": 0.6454342007637024, + "eval_runtime": 23.6787, + "eval_samples_per_second": 16.808, + "eval_steps_per_second": 2.112, + "step": 157 + }, + { + "epoch": 0.8186528497409327, + "grad_norm": 6.03125, + "learning_rate": 4.1813471502590675e-05, + "loss": 0.5703, + "step": 158 + }, + { + "epoch": 0.8186528497409327, + "eval_accuracy": 0.6507537688442211, + "eval_loss": 0.6455715894699097, + "eval_runtime": 23.6738, + "eval_samples_per_second": 16.812, + "eval_steps_per_second": 2.112, + "step": 158 + }, + { + "epoch": 0.8238341968911918, + "grad_norm": 7.25, + "learning_rate": 4.176165803108808e-05, + "loss": 0.6602, + "step": 159 + }, + { + "epoch": 0.8238341968911918, + "eval_accuracy": 0.6582914572864321, + "eval_loss": 0.6457482576370239, + "eval_runtime": 23.6971, + "eval_samples_per_second": 16.795, + "eval_steps_per_second": 2.11, + "step": 159 + }, + { + "epoch": 0.8290155440414507, + "grad_norm": 6.0625, + "learning_rate": 4.170984455958549e-05, + "loss": 0.4219, + "step": 160 + }, + { + "epoch": 0.8290155440414507, + "eval_accuracy": 0.6482412060301508, + "eval_loss": 0.6459249258041382, + "eval_runtime": 23.6074, + "eval_samples_per_second": 16.859, + "eval_steps_per_second": 2.118, + "step": 160 + }, + { + "epoch": 0.8341968911917098, + "grad_norm": 6.875, + "learning_rate": 4.165803108808291e-05, + "loss": 0.6602, + "step": 161 + }, + { + "epoch": 0.8341968911917098, + "eval_accuracy": 0.635678391959799, + "eval_loss": 0.646454930305481, + "eval_runtime": 23.609, + "eval_samples_per_second": 16.858, + "eval_steps_per_second": 2.118, + "step": 161 + }, + { + "epoch": 0.8393782383419689, + "grad_norm": 7.65625, + "learning_rate": 4.1606217616580315e-05, + "loss": 0.8281, + "step": 162 + }, + { + "epoch": 0.8393782383419689, + "eval_accuracy": 0.6306532663316583, + "eval_loss": 0.6467100977897644, + "eval_runtime": 23.5829, + "eval_samples_per_second": 16.877, + "eval_steps_per_second": 2.12, + "step": 162 + }, + { + "epoch": 0.844559585492228, + "grad_norm": 6.875, + "learning_rate": 4.1554404145077724e-05, + "loss": 0.6328, + "step": 163 + }, + { + "epoch": 0.844559585492228, + "eval_accuracy": 0.6457286432160804, + "eval_loss": 0.6473971605300903, + "eval_runtime": 23.6487, + "eval_samples_per_second": 16.83, + "eval_steps_per_second": 2.114, + "step": 163 + }, + { + "epoch": 0.8497409326424871, + "grad_norm": 6.4375, + "learning_rate": 4.150259067357513e-05, + "loss": 0.5586, + "step": 164 + }, + { + "epoch": 0.8497409326424871, + "eval_accuracy": 0.635678391959799, + "eval_loss": 0.6480448842048645, + "eval_runtime": 23.6761, + "eval_samples_per_second": 16.81, + "eval_steps_per_second": 2.112, + "step": 164 + }, + { + "epoch": 0.8549222797927462, + "grad_norm": 7.25, + "learning_rate": 4.145077720207254e-05, + "loss": 0.7188, + "step": 165 + }, + { + "epoch": 0.8549222797927462, + "eval_accuracy": 0.6256281407035176, + "eval_loss": 0.648476779460907, + "eval_runtime": 23.6749, + "eval_samples_per_second": 16.811, + "eval_steps_per_second": 2.112, + "step": 165 + }, + { + "epoch": 0.8601036269430051, + "grad_norm": 7.09375, + "learning_rate": 4.139896373056995e-05, + "loss": 0.6602, + "step": 166 + }, + { + "epoch": 0.8601036269430051, + "eval_accuracy": 0.6180904522613065, + "eval_loss": 0.6493600606918335, + "eval_runtime": 23.6773, + "eval_samples_per_second": 16.809, + "eval_steps_per_second": 2.112, + "step": 166 + }, + { + "epoch": 0.8652849740932642, + "grad_norm": 10.125, + "learning_rate": 4.134715025906736e-05, + "loss": 0.7734, + "step": 167 + }, + { + "epoch": 0.8652849740932642, + "eval_accuracy": 0.6206030150753769, + "eval_loss": 0.649006724357605, + "eval_runtime": 23.6949, + "eval_samples_per_second": 16.797, + "eval_steps_per_second": 2.11, + "step": 167 + }, + { + "epoch": 0.8704663212435233, + "grad_norm": 6.25, + "learning_rate": 4.129533678756477e-05, + "loss": 0.4492, + "step": 168 + }, + { + "epoch": 0.8704663212435233, + "eval_accuracy": 0.6080402010050251, + "eval_loss": 0.6479271650314331, + "eval_runtime": 23.6225, + "eval_samples_per_second": 16.848, + "eval_steps_per_second": 2.117, + "step": 168 + }, + { + "epoch": 0.8756476683937824, + "grad_norm": 6.25, + "learning_rate": 4.1243523316062174e-05, + "loss": 0.4844, + "step": 169 + }, + { + "epoch": 0.8756476683937824, + "eval_accuracy": 0.6155778894472361, + "eval_loss": 0.6470241546630859, + "eval_runtime": 23.6016, + "eval_samples_per_second": 16.863, + "eval_steps_per_second": 2.119, + "step": 169 + }, + { + "epoch": 0.8808290155440415, + "grad_norm": 9.4375, + "learning_rate": 4.119170984455959e-05, + "loss": 0.75, + "step": 170 + }, + { + "epoch": 0.8808290155440415, + "eval_accuracy": 0.6130653266331658, + "eval_loss": 0.6477504968643188, + "eval_runtime": 23.6537, + "eval_samples_per_second": 16.826, + "eval_steps_per_second": 2.114, + "step": 170 + }, + { + "epoch": 0.8860103626943006, + "grad_norm": 7.9375, + "learning_rate": 4.1139896373057e-05, + "loss": 0.4434, + "step": 171 + }, + { + "epoch": 0.8860103626943006, + "eval_accuracy": 0.6155778894472361, + "eval_loss": 0.6493208408355713, + "eval_runtime": 23.72, + "eval_samples_per_second": 16.779, + "eval_steps_per_second": 2.108, + "step": 171 + }, + { + "epoch": 0.8911917098445595, + "grad_norm": 7.53125, + "learning_rate": 4.1088082901554406e-05, + "loss": 0.6641, + "step": 172 + }, + { + "epoch": 0.8911917098445595, + "eval_accuracy": 0.6055276381909548, + "eval_loss": 0.6501452326774597, + "eval_runtime": 23.6798, + "eval_samples_per_second": 16.808, + "eval_steps_per_second": 2.112, + "step": 172 + }, + { + "epoch": 0.8963730569948186, + "grad_norm": 8.0625, + "learning_rate": 4.1036269430051815e-05, + "loss": 0.6055, + "step": 173 + }, + { + "epoch": 0.8963730569948186, + "eval_accuracy": 0.6206030150753769, + "eval_loss": 0.6505967378616333, + "eval_runtime": 23.682, + "eval_samples_per_second": 16.806, + "eval_steps_per_second": 2.111, + "step": 173 + }, + { + "epoch": 0.9015544041450777, + "grad_norm": 6.9375, + "learning_rate": 4.098445595854922e-05, + "loss": 0.6602, + "step": 174 + }, + { + "epoch": 0.9015544041450777, + "eval_accuracy": 0.6381909547738693, + "eval_loss": 0.6506949067115784, + "eval_runtime": 23.6632, + "eval_samples_per_second": 16.819, + "eval_steps_per_second": 2.113, + "step": 174 + }, + { + "epoch": 0.9067357512953368, + "grad_norm": 8.875, + "learning_rate": 4.093264248704664e-05, + "loss": 0.832, + "step": 175 + }, + { + "epoch": 0.9067357512953368, + "eval_accuracy": 0.6306532663316583, + "eval_loss": 0.6519119143486023, + "eval_runtime": 23.6461, + "eval_samples_per_second": 16.832, + "eval_steps_per_second": 2.115, + "step": 175 + }, + { + "epoch": 0.9119170984455959, + "grad_norm": 8.375, + "learning_rate": 4.088082901554404e-05, + "loss": 0.5234, + "step": 176 + }, + { + "epoch": 0.9119170984455959, + "eval_accuracy": 0.6256281407035176, + "eval_loss": 0.6522455811500549, + "eval_runtime": 23.5409, + "eval_samples_per_second": 16.907, + "eval_steps_per_second": 2.124, + "step": 176 + }, + { + "epoch": 0.917098445595855, + "grad_norm": 8.6875, + "learning_rate": 4.0829015544041455e-05, + "loss": 0.7656, + "step": 177 + }, + { + "epoch": 0.917098445595855, + "eval_accuracy": 0.635678391959799, + "eval_loss": 0.6532663106918335, + "eval_runtime": 23.5473, + "eval_samples_per_second": 16.902, + "eval_steps_per_second": 2.123, + "step": 177 + }, + { + "epoch": 0.9222797927461139, + "grad_norm": 15.875, + "learning_rate": 4.0777202072538863e-05, + "loss": 0.6875, + "step": 178 + }, + { + "epoch": 0.9222797927461139, + "eval_accuracy": 0.6256281407035176, + "eval_loss": 0.6490656137466431, + "eval_runtime": 23.4762, + "eval_samples_per_second": 16.953, + "eval_steps_per_second": 2.13, + "step": 178 + }, + { + "epoch": 0.927461139896373, + "grad_norm": 7.8125, + "learning_rate": 4.072538860103627e-05, + "loss": 0.6836, + "step": 179 + }, + { + "epoch": 0.927461139896373, + "eval_accuracy": 0.628140703517588, + "eval_loss": 0.6481823325157166, + "eval_runtime": 23.5597, + "eval_samples_per_second": 16.893, + "eval_steps_per_second": 2.122, + "step": 179 + }, + { + "epoch": 0.9326424870466321, + "grad_norm": 25.125, + "learning_rate": 4.067357512953368e-05, + "loss": 0.582, + "step": 180 + }, + { + "epoch": 0.9326424870466321, + "eval_accuracy": 0.6231155778894473, + "eval_loss": 0.6482019424438477, + "eval_runtime": 23.6028, + "eval_samples_per_second": 16.862, + "eval_steps_per_second": 2.118, + "step": 180 + }, + { + "epoch": 0.9378238341968912, + "grad_norm": 8.9375, + "learning_rate": 4.062176165803109e-05, + "loss": 0.6875, + "step": 181 + }, + { + "epoch": 0.9378238341968912, + "eval_accuracy": 0.6306532663316583, + "eval_loss": 0.647161602973938, + "eval_runtime": 23.6591, + "eval_samples_per_second": 16.822, + "eval_steps_per_second": 2.113, + "step": 181 + }, + { + "epoch": 0.9430051813471503, + "grad_norm": 6.5, + "learning_rate": 4.05699481865285e-05, + "loss": 0.416, + "step": 182 + }, + { + "epoch": 0.9430051813471503, + "eval_accuracy": 0.6256281407035176, + "eval_loss": 0.6480841636657715, + "eval_runtime": 23.6852, + "eval_samples_per_second": 16.804, + "eval_steps_per_second": 2.111, + "step": 182 + }, + { + "epoch": 0.9481865284974094, + "grad_norm": 6.65625, + "learning_rate": 4.0518134715025906e-05, + "loss": 0.5508, + "step": 183 + }, + { + "epoch": 0.9481865284974094, + "eval_accuracy": 0.6256281407035176, + "eval_loss": 0.6514407992362976, + "eval_runtime": 23.7245, + "eval_samples_per_second": 16.776, + "eval_steps_per_second": 2.108, + "step": 183 + }, + { + "epoch": 0.9533678756476683, + "grad_norm": 7.96875, + "learning_rate": 4.046632124352332e-05, + "loss": 0.6914, + "step": 184 + }, + { + "epoch": 0.9533678756476683, + "eval_accuracy": 0.6306532663316583, + "eval_loss": 0.6534429788589478, + "eval_runtime": 23.6779, + "eval_samples_per_second": 16.809, + "eval_steps_per_second": 2.112, + "step": 184 + }, + { + "epoch": 0.9585492227979274, + "grad_norm": 6.84375, + "learning_rate": 4.041450777202073e-05, + "loss": 0.6289, + "step": 185 + }, + { + "epoch": 0.9585492227979274, + "eval_accuracy": 0.6306532663316583, + "eval_loss": 0.6553470492362976, + "eval_runtime": 23.6743, + "eval_samples_per_second": 16.812, + "eval_steps_per_second": 2.112, + "step": 185 + }, + { + "epoch": 0.9637305699481865, + "grad_norm": 13.875, + "learning_rate": 4.036269430051814e-05, + "loss": 0.7383, + "step": 186 + }, + { + "epoch": 0.9637305699481865, + "eval_accuracy": 0.635678391959799, + "eval_loss": 0.656289279460907, + "eval_runtime": 23.6664, + "eval_samples_per_second": 16.817, + "eval_steps_per_second": 2.113, + "step": 186 + }, + { + "epoch": 0.9689119170984456, + "grad_norm": 11.6875, + "learning_rate": 4.0310880829015546e-05, + "loss": 0.5625, + "step": 187 + }, + { + "epoch": 0.9689119170984456, + "eval_accuracy": 0.6432160804020101, + "eval_loss": 0.6584681272506714, + "eval_runtime": 23.6614, + "eval_samples_per_second": 16.821, + "eval_steps_per_second": 2.113, + "step": 187 + }, + { + "epoch": 0.9740932642487047, + "grad_norm": 13.0625, + "learning_rate": 4.0259067357512954e-05, + "loss": 0.7734, + "step": 188 + }, + { + "epoch": 0.9740932642487047, + "eval_accuracy": 0.6507537688442211, + "eval_loss": 0.6626884341239929, + "eval_runtime": 23.6145, + "eval_samples_per_second": 16.854, + "eval_steps_per_second": 2.117, + "step": 188 + }, + { + "epoch": 0.9792746113989638, + "grad_norm": 6.375, + "learning_rate": 4.020725388601036e-05, + "loss": 0.4199, + "step": 189 + }, + { + "epoch": 0.9792746113989638, + "eval_accuracy": 0.6482412060301508, + "eval_loss": 0.6657113432884216, + "eval_runtime": 23.5402, + "eval_samples_per_second": 16.907, + "eval_steps_per_second": 2.124, + "step": 189 + }, + { + "epoch": 0.9844559585492227, + "grad_norm": 6.4375, + "learning_rate": 4.015544041450777e-05, + "loss": 0.5547, + "step": 190 + }, + { + "epoch": 0.9844559585492227, + "eval_accuracy": 0.6557788944723618, + "eval_loss": 0.6646121144294739, + "eval_runtime": 23.5622, + "eval_samples_per_second": 16.891, + "eval_steps_per_second": 2.122, + "step": 190 + }, + { + "epoch": 0.9896373056994818, + "grad_norm": 7.21875, + "learning_rate": 4.0103626943005186e-05, + "loss": 0.582, + "step": 191 + }, + { + "epoch": 0.9896373056994818, + "eval_accuracy": 0.6432160804020101, + "eval_loss": 0.662138819694519, + "eval_runtime": 23.5431, + "eval_samples_per_second": 16.905, + "eval_steps_per_second": 2.124, + "step": 191 + }, + { + "epoch": 0.9948186528497409, + "grad_norm": 7.40625, + "learning_rate": 4.0051813471502595e-05, + "loss": 0.582, + "step": 192 + }, + { + "epoch": 0.9948186528497409, + "eval_accuracy": 0.6608040201005025, + "eval_loss": 0.6599206924438477, + "eval_runtime": 23.5434, + "eval_samples_per_second": 16.905, + "eval_steps_per_second": 2.124, + "step": 192 + }, + { + "epoch": 1.0, + "grad_norm": 15.25, + "learning_rate": 4e-05, + "loss": 0.6758, + "step": 193 + }, + { + "epoch": 1.0, + "eval_accuracy": 0.6633165829145728, + "eval_loss": 0.6572314500808716, + "eval_runtime": 23.6323, + "eval_samples_per_second": 16.841, + "eval_steps_per_second": 2.116, + "step": 193 + }, + { + "epoch": 1.005181347150259, + "grad_norm": 6.25, + "learning_rate": 3.994818652849741e-05, + "loss": 0.4336, + "step": 194 + }, + { + "epoch": 1.005181347150259, + "eval_accuracy": 0.6683417085427136, + "eval_loss": 0.6553863286972046, + "eval_runtime": 23.6747, + "eval_samples_per_second": 16.811, + "eval_steps_per_second": 2.112, + "step": 194 + }, + { + "epoch": 1.0103626943005182, + "grad_norm": 6.5, + "learning_rate": 3.989637305699482e-05, + "loss": 0.3828, + "step": 195 + }, + { + "epoch": 1.0103626943005182, + "eval_accuracy": 0.6758793969849246, + "eval_loss": 0.6535804271697998, + "eval_runtime": 23.6886, + "eval_samples_per_second": 16.801, + "eval_steps_per_second": 2.111, + "step": 195 + }, + { + "epoch": 1.0155440414507773, + "grad_norm": 7.0625, + "learning_rate": 3.984455958549223e-05, + "loss": 0.7109, + "step": 196 + }, + { + "epoch": 1.0155440414507773, + "eval_accuracy": 0.6758793969849246, + "eval_loss": 0.6520689129829407, + "eval_runtime": 23.6793, + "eval_samples_per_second": 16.808, + "eval_steps_per_second": 2.112, + "step": 196 + }, + { + "epoch": 1.0207253886010363, + "grad_norm": 9.8125, + "learning_rate": 3.979274611398964e-05, + "loss": 0.5977, + "step": 197 + }, + { + "epoch": 1.0207253886010363, + "eval_accuracy": 0.6683417085427136, + "eval_loss": 0.6507145166397095, + "eval_runtime": 23.6809, + "eval_samples_per_second": 16.807, + "eval_steps_per_second": 2.111, + "step": 197 + }, + { + "epoch": 1.0259067357512954, + "grad_norm": 12.4375, + "learning_rate": 3.974093264248705e-05, + "loss": 0.707, + "step": 198 + }, + { + "epoch": 1.0259067357512954, + "eval_accuracy": 0.6633165829145728, + "eval_loss": 0.6495171189308167, + "eval_runtime": 23.6686, + "eval_samples_per_second": 16.816, + "eval_steps_per_second": 2.113, + "step": 198 + }, + { + "epoch": 1.0310880829015545, + "grad_norm": 7.5, + "learning_rate": 3.9689119170984454e-05, + "loss": 0.5469, + "step": 199 + }, + { + "epoch": 1.0310880829015545, + "eval_accuracy": 0.6608040201005025, + "eval_loss": 0.6492815613746643, + "eval_runtime": 23.6606, + "eval_samples_per_second": 16.821, + "eval_steps_per_second": 2.113, + "step": 199 + }, + { + "epoch": 1.0362694300518134, + "grad_norm": 7.34375, + "learning_rate": 3.963730569948187e-05, + "loss": 0.6367, + "step": 200 + }, + { + "epoch": 1.0362694300518134, + "eval_accuracy": 0.6557788944723618, + "eval_loss": 0.6477112174034119, + "eval_runtime": 23.5857, + "eval_samples_per_second": 16.875, + "eval_steps_per_second": 2.12, + "step": 200 + }, + { + "epoch": 1.0414507772020725, + "grad_norm": 6.34375, + "learning_rate": 3.958549222797928e-05, + "loss": 0.5469, + "step": 201 + }, + { + "epoch": 1.0414507772020725, + "eval_accuracy": 0.6557788944723618, + "eval_loss": 0.6466315984725952, + "eval_runtime": 23.4463, + "eval_samples_per_second": 16.975, + "eval_steps_per_second": 2.133, + "step": 201 + }, + { + "epoch": 1.0466321243523315, + "grad_norm": 5.84375, + "learning_rate": 3.9533678756476686e-05, + "loss": 0.2188, + "step": 202 + }, + { + "epoch": 1.0466321243523315, + "eval_accuracy": 0.6633165829145728, + "eval_loss": 0.6468671560287476, + "eval_runtime": 23.4855, + "eval_samples_per_second": 16.947, + "eval_steps_per_second": 2.129, + "step": 202 + }, + { + "epoch": 1.0518134715025906, + "grad_norm": 7.53125, + "learning_rate": 3.9481865284974094e-05, + "loss": 0.6914, + "step": 203 + }, + { + "epoch": 1.0518134715025906, + "eval_accuracy": 0.6633165829145728, + "eval_loss": 0.6452968120574951, + "eval_runtime": 23.6163, + "eval_samples_per_second": 16.853, + "eval_steps_per_second": 2.117, + "step": 203 + }, + { + "epoch": 1.0569948186528497, + "grad_norm": 10.8125, + "learning_rate": 3.94300518134715e-05, + "loss": 1.0547, + "step": 204 + }, + { + "epoch": 1.0569948186528497, + "eval_accuracy": 0.6557788944723618, + "eval_loss": 0.6440209150314331, + "eval_runtime": 23.6272, + "eval_samples_per_second": 16.845, + "eval_steps_per_second": 2.116, + "step": 204 + }, + { + "epoch": 1.0621761658031088, + "grad_norm": 12.25, + "learning_rate": 3.937823834196892e-05, + "loss": 0.3887, + "step": 205 + }, + { + "epoch": 1.0621761658031088, + "eval_accuracy": 0.6633165829145728, + "eval_loss": 0.6423524022102356, + "eval_runtime": 23.6926, + "eval_samples_per_second": 16.798, + "eval_steps_per_second": 2.11, + "step": 205 + }, + { + "epoch": 1.067357512953368, + "grad_norm": 7.5, + "learning_rate": 3.932642487046632e-05, + "loss": 0.5312, + "step": 206 + }, + { + "epoch": 1.067357512953368, + "eval_accuracy": 0.6557788944723618, + "eval_loss": 0.6398202180862427, + "eval_runtime": 23.672, + "eval_samples_per_second": 16.813, + "eval_steps_per_second": 2.112, + "step": 206 + }, + { + "epoch": 1.072538860103627, + "grad_norm": 10.0, + "learning_rate": 3.9274611398963735e-05, + "loss": 0.7422, + "step": 207 + }, + { + "epoch": 1.072538860103627, + "eval_accuracy": 0.6683417085427136, + "eval_loss": 0.6371702551841736, + "eval_runtime": 23.7327, + "eval_samples_per_second": 16.77, + "eval_steps_per_second": 2.107, + "step": 207 + }, + { + "epoch": 1.077720207253886, + "grad_norm": 13.1875, + "learning_rate": 3.922279792746114e-05, + "loss": 0.7812, + "step": 208 + }, + { + "epoch": 1.077720207253886, + "eval_accuracy": 0.6708542713567839, + "eval_loss": 0.6344613432884216, + "eval_runtime": 23.6784, + "eval_samples_per_second": 16.809, + "eval_steps_per_second": 2.112, + "step": 208 + }, + { + "epoch": 1.0829015544041452, + "grad_norm": 7.0625, + "learning_rate": 3.917098445595855e-05, + "loss": 0.6367, + "step": 209 + }, + { + "epoch": 1.0829015544041452, + "eval_accuracy": 0.6758793969849246, + "eval_loss": 0.6319880485534668, + "eval_runtime": 23.6834, + "eval_samples_per_second": 16.805, + "eval_steps_per_second": 2.111, + "step": 209 + }, + { + "epoch": 1.0880829015544042, + "grad_norm": 6.46875, + "learning_rate": 3.911917098445596e-05, + "loss": 0.6133, + "step": 210 + }, + { + "epoch": 1.0880829015544042, + "eval_accuracy": 0.6809045226130653, + "eval_loss": 0.6307710409164429, + "eval_runtime": 23.692, + "eval_samples_per_second": 16.799, + "eval_steps_per_second": 2.11, + "step": 210 + }, + { + "epoch": 1.093264248704663, + "grad_norm": 8.75, + "learning_rate": 3.906735751295337e-05, + "loss": 0.7617, + "step": 211 + }, + { + "epoch": 1.093264248704663, + "eval_accuracy": 0.6708542713567839, + "eval_loss": 0.6279640197753906, + "eval_runtime": 23.6215, + "eval_samples_per_second": 16.849, + "eval_steps_per_second": 2.117, + "step": 211 + }, + { + "epoch": 1.0984455958549222, + "grad_norm": 4.59375, + "learning_rate": 3.901554404145078e-05, + "loss": 0.3145, + "step": 212 + }, + { + "epoch": 1.0984455958549222, + "eval_accuracy": 0.6809045226130653, + "eval_loss": 0.626354455947876, + "eval_runtime": 23.611, + "eval_samples_per_second": 16.857, + "eval_steps_per_second": 2.118, + "step": 212 + }, + { + "epoch": 1.1036269430051813, + "grad_norm": 11.25, + "learning_rate": 3.8963730569948185e-05, + "loss": 1.1016, + "step": 213 + }, + { + "epoch": 1.1036269430051813, + "eval_accuracy": 0.6758793969849246, + "eval_loss": 0.6237829923629761, + "eval_runtime": 23.5795, + "eval_samples_per_second": 16.879, + "eval_steps_per_second": 2.12, + "step": 213 + }, + { + "epoch": 1.1088082901554404, + "grad_norm": 6.34375, + "learning_rate": 3.89119170984456e-05, + "loss": 0.5312, + "step": 214 + }, + { + "epoch": 1.1088082901554404, + "eval_accuracy": 0.6834170854271356, + "eval_loss": 0.621741533279419, + "eval_runtime": 23.5524, + "eval_samples_per_second": 16.898, + "eval_steps_per_second": 2.123, + "step": 214 + }, + { + "epoch": 1.1139896373056994, + "grad_norm": 15.875, + "learning_rate": 3.886010362694301e-05, + "loss": 0.4473, + "step": 215 + }, + { + "epoch": 1.1139896373056994, + "eval_accuracy": 0.6809045226130653, + "eval_loss": 0.6221537590026855, + "eval_runtime": 23.6289, + "eval_samples_per_second": 16.844, + "eval_steps_per_second": 2.116, + "step": 215 + }, + { + "epoch": 1.1191709844559585, + "grad_norm": 8.6875, + "learning_rate": 3.880829015544042e-05, + "loss": 0.7891, + "step": 216 + }, + { + "epoch": 1.1191709844559585, + "eval_accuracy": 0.6884422110552764, + "eval_loss": 0.6211526393890381, + "eval_runtime": 23.7224, + "eval_samples_per_second": 16.777, + "eval_steps_per_second": 2.108, + "step": 216 + }, + { + "epoch": 1.1243523316062176, + "grad_norm": 17.625, + "learning_rate": 3.8756476683937826e-05, + "loss": 0.8477, + "step": 217 + }, + { + "epoch": 1.1243523316062176, + "eval_accuracy": 0.6859296482412061, + "eval_loss": 0.6199748516082764, + "eval_runtime": 23.6728, + "eval_samples_per_second": 16.813, + "eval_steps_per_second": 2.112, + "step": 217 + }, + { + "epoch": 1.1295336787564767, + "grad_norm": 8.3125, + "learning_rate": 3.8704663212435234e-05, + "loss": 0.6875, + "step": 218 + }, + { + "epoch": 1.1295336787564767, + "eval_accuracy": 0.6809045226130653, + "eval_loss": 0.6203281879425049, + "eval_runtime": 23.674, + "eval_samples_per_second": 16.812, + "eval_steps_per_second": 2.112, + "step": 218 + }, + { + "epoch": 1.1347150259067358, + "grad_norm": 6.34375, + "learning_rate": 3.865284974093264e-05, + "loss": 0.3828, + "step": 219 + }, + { + "epoch": 1.1347150259067358, + "eval_accuracy": 0.6733668341708543, + "eval_loss": 0.619739294052124, + "eval_runtime": 23.6886, + "eval_samples_per_second": 16.801, + "eval_steps_per_second": 2.111, + "step": 219 + }, + { + "epoch": 1.1398963730569949, + "grad_norm": 8.1875, + "learning_rate": 3.860103626943005e-05, + "loss": 0.7969, + "step": 220 + }, + { + "epoch": 1.1398963730569949, + "eval_accuracy": 0.6884422110552764, + "eval_loss": 0.6196411848068237, + "eval_runtime": 23.7383, + "eval_samples_per_second": 16.766, + "eval_steps_per_second": 2.106, + "step": 220 + }, + { + "epoch": 1.145077720207254, + "grad_norm": 6.40625, + "learning_rate": 3.8549222797927466e-05, + "loss": 0.4961, + "step": 221 + }, + { + "epoch": 1.145077720207254, + "eval_accuracy": 0.6809045226130653, + "eval_loss": 0.6197981834411621, + "eval_runtime": 23.6661, + "eval_samples_per_second": 16.817, + "eval_steps_per_second": 2.113, + "step": 221 + }, + { + "epoch": 1.150259067357513, + "grad_norm": 8.0, + "learning_rate": 3.8497409326424875e-05, + "loss": 0.832, + "step": 222 + }, + { + "epoch": 1.150259067357513, + "eval_accuracy": 0.678391959798995, + "eval_loss": 0.6194449067115784, + "eval_runtime": 23.6337, + "eval_samples_per_second": 16.84, + "eval_steps_per_second": 2.116, + "step": 222 + }, + { + "epoch": 1.1554404145077721, + "grad_norm": 6.96875, + "learning_rate": 3.844559585492228e-05, + "loss": 0.5312, + "step": 223 + }, + { + "epoch": 1.1554404145077721, + "eval_accuracy": 0.6809045226130653, + "eval_loss": 0.6180511713027954, + "eval_runtime": 23.5685, + "eval_samples_per_second": 16.887, + "eval_steps_per_second": 2.121, + "step": 223 + }, + { + "epoch": 1.160621761658031, + "grad_norm": 7.40625, + "learning_rate": 3.839378238341969e-05, + "loss": 0.7109, + "step": 224 + }, + { + "epoch": 1.160621761658031, + "eval_accuracy": 0.6834170854271356, + "eval_loss": 0.6184830665588379, + "eval_runtime": 23.5762, + "eval_samples_per_second": 16.881, + "eval_steps_per_second": 2.121, + "step": 224 + }, + { + "epoch": 1.16580310880829, + "grad_norm": 9.9375, + "learning_rate": 3.83419689119171e-05, + "loss": 0.8867, + "step": 225 + }, + { + "epoch": 1.16580310880829, + "eval_accuracy": 0.6809045226130653, + "eval_loss": 0.6188952326774597, + "eval_runtime": 23.5118, + "eval_samples_per_second": 16.928, + "eval_steps_per_second": 2.127, + "step": 225 + }, + { + "epoch": 1.1709844559585492, + "grad_norm": 8.5, + "learning_rate": 3.829015544041451e-05, + "loss": 0.8594, + "step": 226 + }, + { + "epoch": 1.1709844559585492, + "eval_accuracy": 0.6809045226130653, + "eval_loss": 0.6189149022102356, + "eval_runtime": 23.4925, + "eval_samples_per_second": 16.942, + "eval_steps_per_second": 2.128, + "step": 226 + }, + { + "epoch": 1.1761658031088082, + "grad_norm": 5.40625, + "learning_rate": 3.8238341968911917e-05, + "loss": 0.3965, + "step": 227 + }, + { + "epoch": 1.1761658031088082, + "eval_accuracy": 0.6733668341708543, + "eval_loss": 0.6198374629020691, + "eval_runtime": 23.5208, + "eval_samples_per_second": 16.921, + "eval_steps_per_second": 2.126, + "step": 227 + }, + { + "epoch": 1.1813471502590673, + "grad_norm": 11.5, + "learning_rate": 3.818652849740933e-05, + "loss": 1.0391, + "step": 228 + }, + { + "epoch": 1.1813471502590673, + "eval_accuracy": 0.6859296482412061, + "eval_loss": 0.6190326809883118, + "eval_runtime": 23.6516, + "eval_samples_per_second": 16.828, + "eval_steps_per_second": 2.114, + "step": 228 + }, + { + "epoch": 1.1865284974093264, + "grad_norm": 7.09375, + "learning_rate": 3.8134715025906733e-05, + "loss": 0.6211, + "step": 229 + }, + { + "epoch": 1.1865284974093264, + "eval_accuracy": 0.6859296482412061, + "eval_loss": 0.6189737915992737, + "eval_runtime": 23.6577, + "eval_samples_per_second": 16.823, + "eval_steps_per_second": 2.113, + "step": 229 + }, + { + "epoch": 1.1917098445595855, + "grad_norm": 7.125, + "learning_rate": 3.808290155440415e-05, + "loss": 0.5938, + "step": 230 + }, + { + "epoch": 1.1917098445595855, + "eval_accuracy": 0.6733668341708543, + "eval_loss": 0.618541955947876, + "eval_runtime": 23.6745, + "eval_samples_per_second": 16.811, + "eval_steps_per_second": 2.112, + "step": 230 + }, + { + "epoch": 1.1968911917098446, + "grad_norm": 7.09375, + "learning_rate": 3.803108808290156e-05, + "loss": 0.6289, + "step": 231 + }, + { + "epoch": 1.1968911917098446, + "eval_accuracy": 0.6884422110552764, + "eval_loss": 0.6181297302246094, + "eval_runtime": 23.7298, + "eval_samples_per_second": 16.772, + "eval_steps_per_second": 2.107, + "step": 231 + }, + { + "epoch": 1.2020725388601037, + "grad_norm": 6.09375, + "learning_rate": 3.7979274611398965e-05, + "loss": 0.5703, + "step": 232 + }, + { + "epoch": 1.2020725388601037, + "eval_accuracy": 0.6834170854271356, + "eval_loss": 0.617796003818512, + "eval_runtime": 23.6754, + "eval_samples_per_second": 16.811, + "eval_steps_per_second": 2.112, + "step": 232 + }, + { + "epoch": 1.2072538860103628, + "grad_norm": 8.3125, + "learning_rate": 3.7927461139896374e-05, + "loss": 0.7227, + "step": 233 + }, + { + "epoch": 1.2072538860103628, + "eval_accuracy": 0.678391959798995, + "eval_loss": 0.6188363432884216, + "eval_runtime": 23.6792, + "eval_samples_per_second": 16.808, + "eval_steps_per_second": 2.112, + "step": 233 + }, + { + "epoch": 1.2124352331606219, + "grad_norm": 7.5, + "learning_rate": 3.787564766839378e-05, + "loss": 0.6719, + "step": 234 + }, + { + "epoch": 1.2124352331606219, + "eval_accuracy": 0.6758793969849246, + "eval_loss": 0.6190915703773499, + "eval_runtime": 23.6816, + "eval_samples_per_second": 16.806, + "eval_steps_per_second": 2.111, + "step": 234 + }, + { + "epoch": 1.2176165803108807, + "grad_norm": 5.78125, + "learning_rate": 3.78238341968912e-05, + "loss": 0.4199, + "step": 235 + }, + { + "epoch": 1.2176165803108807, + "eval_accuracy": 0.6809045226130653, + "eval_loss": 0.620485246181488, + "eval_runtime": 23.639, + "eval_samples_per_second": 16.837, + "eval_steps_per_second": 2.115, + "step": 235 + }, + { + "epoch": 1.2227979274611398, + "grad_norm": 7.34375, + "learning_rate": 3.77720207253886e-05, + "loss": 0.7188, + "step": 236 + }, + { + "epoch": 1.2227979274611398, + "eval_accuracy": 0.6758793969849246, + "eval_loss": 0.6218593120574951, + "eval_runtime": 23.6257, + "eval_samples_per_second": 16.846, + "eval_steps_per_second": 2.116, + "step": 236 + }, + { + "epoch": 1.2279792746113989, + "grad_norm": 9.9375, + "learning_rate": 3.7720207253886014e-05, + "loss": 0.9258, + "step": 237 + }, + { + "epoch": 1.2279792746113989, + "eval_accuracy": 0.678391959798995, + "eval_loss": 0.6227425932884216, + "eval_runtime": 23.5425, + "eval_samples_per_second": 16.906, + "eval_steps_per_second": 2.124, + "step": 237 + }, + { + "epoch": 1.233160621761658, + "grad_norm": 16.875, + "learning_rate": 3.766839378238342e-05, + "loss": 0.5625, + "step": 238 + }, + { + "epoch": 1.233160621761658, + "eval_accuracy": 0.6809045226130653, + "eval_loss": 0.6227818727493286, + "eval_runtime": 23.5136, + "eval_samples_per_second": 16.926, + "eval_steps_per_second": 2.126, + "step": 238 + }, + { + "epoch": 1.238341968911917, + "grad_norm": 8.5, + "learning_rate": 3.761658031088083e-05, + "loss": 0.6602, + "step": 239 + }, + { + "epoch": 1.238341968911917, + "eval_accuracy": 0.678391959798995, + "eval_loss": 0.62433260679245, + "eval_runtime": 23.5365, + "eval_samples_per_second": 16.91, + "eval_steps_per_second": 2.124, + "step": 239 + }, + { + "epoch": 1.2435233160621761, + "grad_norm": 8.8125, + "learning_rate": 3.756476683937824e-05, + "loss": 0.5547, + "step": 240 + }, + { + "epoch": 1.2435233160621761, + "eval_accuracy": 0.6834170854271356, + "eval_loss": 0.6250981688499451, + "eval_runtime": 23.5521, + "eval_samples_per_second": 16.899, + "eval_steps_per_second": 2.123, + "step": 240 + }, + { + "epoch": 1.2487046632124352, + "grad_norm": 7.3125, + "learning_rate": 3.751295336787565e-05, + "loss": 0.582, + "step": 241 + }, + { + "epoch": 1.2487046632124352, + "eval_accuracy": 0.6708542713567839, + "eval_loss": 0.625706672668457, + "eval_runtime": 23.6015, + "eval_samples_per_second": 16.863, + "eval_steps_per_second": 2.119, + "step": 241 + }, + { + "epoch": 1.2538860103626943, + "grad_norm": 6.25, + "learning_rate": 3.7461139896373056e-05, + "loss": 0.5586, + "step": 242 + }, + { + "epoch": 1.2538860103626943, + "eval_accuracy": 0.678391959798995, + "eval_loss": 0.627551794052124, + "eval_runtime": 23.6371, + "eval_samples_per_second": 16.838, + "eval_steps_per_second": 2.115, + "step": 242 + }, + { + "epoch": 1.2590673575129534, + "grad_norm": 4.9375, + "learning_rate": 3.7409326424870465e-05, + "loss": 0.3242, + "step": 243 + }, + { + "epoch": 1.2590673575129534, + "eval_accuracy": 0.6809045226130653, + "eval_loss": 0.6292595863342285, + "eval_runtime": 23.7023, + "eval_samples_per_second": 16.792, + "eval_steps_per_second": 2.11, + "step": 243 + }, + { + "epoch": 1.2642487046632125, + "grad_norm": 10.0, + "learning_rate": 3.735751295336788e-05, + "loss": 1.0, + "step": 244 + }, + { + "epoch": 1.2642487046632125, + "eval_accuracy": 0.6733668341708543, + "eval_loss": 0.6318310499191284, + "eval_runtime": 23.6517, + "eval_samples_per_second": 16.828, + "eval_steps_per_second": 2.114, + "step": 244 + }, + { + "epoch": 1.2694300518134716, + "grad_norm": 4.3125, + "learning_rate": 3.730569948186529e-05, + "loss": 0.1953, + "step": 245 + }, + { + "epoch": 1.2694300518134716, + "eval_accuracy": 0.6683417085427136, + "eval_loss": 0.6344810128211975, + "eval_runtime": 23.7123, + "eval_samples_per_second": 16.785, + "eval_steps_per_second": 2.109, + "step": 245 + }, + { + "epoch": 1.2746113989637307, + "grad_norm": 8.25, + "learning_rate": 3.72538860103627e-05, + "loss": 0.7461, + "step": 246 + }, + { + "epoch": 1.2746113989637307, + "eval_accuracy": 0.6658291457286433, + "eval_loss": 0.6361887454986572, + "eval_runtime": 23.6632, + "eval_samples_per_second": 16.819, + "eval_steps_per_second": 2.113, + "step": 246 + }, + { + "epoch": 1.2797927461139897, + "grad_norm": 4.71875, + "learning_rate": 3.7202072538860105e-05, + "loss": 0.3574, + "step": 247 + }, + { + "epoch": 1.2797927461139897, + "eval_accuracy": 0.6658291457286433, + "eval_loss": 0.6381517052650452, + "eval_runtime": 23.6574, + "eval_samples_per_second": 16.823, + "eval_steps_per_second": 2.114, + "step": 247 + }, + { + "epoch": 1.2849740932642488, + "grad_norm": 3.8125, + "learning_rate": 3.7150259067357514e-05, + "loss": 0.1641, + "step": 248 + }, + { + "epoch": 1.2849740932642488, + "eval_accuracy": 0.678391959798995, + "eval_loss": 0.64386385679245, + "eval_runtime": 23.7016, + "eval_samples_per_second": 16.792, + "eval_steps_per_second": 2.11, + "step": 248 + }, + { + "epoch": 1.2901554404145077, + "grad_norm": 11.25, + "learning_rate": 3.709844559585492e-05, + "loss": 0.7148, + "step": 249 + }, + { + "epoch": 1.2901554404145077, + "eval_accuracy": 0.6733668341708543, + "eval_loss": 0.647122323513031, + "eval_runtime": 23.6326, + "eval_samples_per_second": 16.841, + "eval_steps_per_second": 2.116, + "step": 249 + }, + { + "epoch": 1.2953367875647668, + "grad_norm": 8.125, + "learning_rate": 3.704663212435233e-05, + "loss": 0.6211, + "step": 250 + }, + { + "epoch": 1.2953367875647668, + "eval_accuracy": 0.6658291457286433, + "eval_loss": 0.6512837409973145, + "eval_runtime": 23.6074, + "eval_samples_per_second": 16.859, + "eval_steps_per_second": 2.118, + "step": 250 + }, + { + "epoch": 1.3005181347150259, + "grad_norm": 11.0, + "learning_rate": 3.6994818652849746e-05, + "loss": 0.9688, + "step": 251 + }, + { + "epoch": 1.3005181347150259, + "eval_accuracy": 0.6758793969849246, + "eval_loss": 0.6545814871788025, + "eval_runtime": 23.4283, + "eval_samples_per_second": 16.988, + "eval_steps_per_second": 2.134, + "step": 251 + }, + { + "epoch": 1.305699481865285, + "grad_norm": 11.1875, + "learning_rate": 3.6943005181347154e-05, + "loss": 0.9883, + "step": 252 + }, + { + "epoch": 1.305699481865285, + "eval_accuracy": 0.6683417085427136, + "eval_loss": 0.65625, + "eval_runtime": 23.4625, + "eval_samples_per_second": 16.963, + "eval_steps_per_second": 2.131, + "step": 252 + }, + { + "epoch": 1.310880829015544, + "grad_norm": 11.875, + "learning_rate": 3.689119170984456e-05, + "loss": 0.9648, + "step": 253 + }, + { + "epoch": 1.310880829015544, + "eval_accuracy": 0.6758793969849246, + "eval_loss": 0.6564462780952454, + "eval_runtime": 23.5515, + "eval_samples_per_second": 16.899, + "eval_steps_per_second": 2.123, + "step": 253 + }, + { + "epoch": 1.3160621761658031, + "grad_norm": 11.4375, + "learning_rate": 3.683937823834197e-05, + "loss": 0.6484, + "step": 254 + }, + { + "epoch": 1.3160621761658031, + "eval_accuracy": 0.6708542713567839, + "eval_loss": 0.6585074067115784, + "eval_runtime": 23.6513, + "eval_samples_per_second": 16.828, + "eval_steps_per_second": 2.114, + "step": 254 + }, + { + "epoch": 1.3212435233160622, + "grad_norm": 5.84375, + "learning_rate": 3.678756476683938e-05, + "loss": 0.4453, + "step": 255 + }, + { + "epoch": 1.3212435233160622, + "eval_accuracy": 0.6733668341708543, + "eval_loss": 0.6610199213027954, + "eval_runtime": 23.6761, + "eval_samples_per_second": 16.81, + "eval_steps_per_second": 2.112, + "step": 255 + }, + { + "epoch": 1.3264248704663213, + "grad_norm": 9.1875, + "learning_rate": 3.673575129533679e-05, + "loss": 0.3945, + "step": 256 + }, + { + "epoch": 1.3264248704663213, + "eval_accuracy": 0.6758793969849246, + "eval_loss": 0.6629436016082764, + "eval_runtime": 23.6417, + "eval_samples_per_second": 16.835, + "eval_steps_per_second": 2.115, + "step": 256 + }, + { + "epoch": 1.3316062176165804, + "grad_norm": 7.0625, + "learning_rate": 3.6683937823834196e-05, + "loss": 0.3965, + "step": 257 + }, + { + "epoch": 1.3316062176165804, + "eval_accuracy": 0.6809045226130653, + "eval_loss": 0.6674191355705261, + "eval_runtime": 23.7041, + "eval_samples_per_second": 16.79, + "eval_steps_per_second": 2.109, + "step": 257 + }, + { + "epoch": 1.3367875647668392, + "grad_norm": 11.25, + "learning_rate": 3.663212435233161e-05, + "loss": 0.8281, + "step": 258 + }, + { + "epoch": 1.3367875647668392, + "eval_accuracy": 0.6834170854271356, + "eval_loss": 0.6693624258041382, + "eval_runtime": 23.6666, + "eval_samples_per_second": 16.817, + "eval_steps_per_second": 2.113, + "step": 258 + }, + { + "epoch": 1.3419689119170983, + "grad_norm": 6.34375, + "learning_rate": 3.658031088082901e-05, + "loss": 0.4277, + "step": 259 + }, + { + "epoch": 1.3419689119170983, + "eval_accuracy": 0.6733668341708543, + "eval_loss": 0.6731705665588379, + "eval_runtime": 23.6462, + "eval_samples_per_second": 16.831, + "eval_steps_per_second": 2.115, + "step": 259 + }, + { + "epoch": 1.3471502590673574, + "grad_norm": 5.875, + "learning_rate": 3.652849740932643e-05, + "loss": 0.459, + "step": 260 + }, + { + "epoch": 1.3471502590673574, + "eval_accuracy": 0.6733668341708543, + "eval_loss": 0.6757223606109619, + "eval_runtime": 23.6496, + "eval_samples_per_second": 16.829, + "eval_steps_per_second": 2.114, + "step": 260 + }, + { + "epoch": 1.3523316062176165, + "grad_norm": 9.4375, + "learning_rate": 3.647668393782384e-05, + "loss": 0.7852, + "step": 261 + }, + { + "epoch": 1.3523316062176165, + "eval_accuracy": 0.6683417085427136, + "eval_loss": 0.6805315613746643, + "eval_runtime": 23.656, + "eval_samples_per_second": 16.824, + "eval_steps_per_second": 2.114, + "step": 261 + }, + { + "epoch": 1.3575129533678756, + "grad_norm": 4.34375, + "learning_rate": 3.6424870466321245e-05, + "loss": 0.1299, + "step": 262 + }, + { + "epoch": 1.3575129533678756, + "eval_accuracy": 0.6658291457286433, + "eval_loss": 0.6862437129020691, + "eval_runtime": 23.7234, + "eval_samples_per_second": 16.777, + "eval_steps_per_second": 2.108, + "step": 262 + }, + { + "epoch": 1.3626943005181347, + "grad_norm": 15.5, + "learning_rate": 3.6373056994818654e-05, + "loss": 0.9961, + "step": 263 + }, + { + "epoch": 1.3626943005181347, + "eval_accuracy": 0.678391959798995, + "eval_loss": 0.6902481317520142, + "eval_runtime": 23.6698, + "eval_samples_per_second": 16.815, + "eval_steps_per_second": 2.112, + "step": 263 + }, + { + "epoch": 1.3678756476683938, + "grad_norm": 6.40625, + "learning_rate": 3.632124352331606e-05, + "loss": 0.4258, + "step": 264 + }, + { + "epoch": 1.3678756476683938, + "eval_accuracy": 0.6633165829145728, + "eval_loss": 0.6958228349685669, + "eval_runtime": 23.6363, + "eval_samples_per_second": 16.839, + "eval_steps_per_second": 2.115, + "step": 264 + }, + { + "epoch": 1.3730569948186528, + "grad_norm": 6.53125, + "learning_rate": 3.626943005181348e-05, + "loss": 0.4355, + "step": 265 + }, + { + "epoch": 1.3730569948186528, + "eval_accuracy": 0.6758793969849246, + "eval_loss": 0.6993954181671143, + "eval_runtime": 23.5968, + "eval_samples_per_second": 16.867, + "eval_steps_per_second": 2.119, + "step": 265 + }, + { + "epoch": 1.378238341968912, + "grad_norm": 28.5, + "learning_rate": 3.621761658031088e-05, + "loss": 1.2031, + "step": 266 + }, + { + "epoch": 1.378238341968912, + "eval_accuracy": 0.6758793969849246, + "eval_loss": 0.7014957666397095, + "eval_runtime": 23.6388, + "eval_samples_per_second": 16.837, + "eval_steps_per_second": 2.115, + "step": 266 + }, + { + "epoch": 1.383419689119171, + "grad_norm": 5.65625, + "learning_rate": 3.6165803108808294e-05, + "loss": 0.3398, + "step": 267 + }, + { + "epoch": 1.383419689119171, + "eval_accuracy": 0.678391959798995, + "eval_loss": 0.705048680305481, + "eval_runtime": 23.628, + "eval_samples_per_second": 16.844, + "eval_steps_per_second": 2.116, + "step": 267 + }, + { + "epoch": 1.38860103626943, + "grad_norm": 9.0625, + "learning_rate": 3.61139896373057e-05, + "loss": 0.5195, + "step": 268 + }, + { + "epoch": 1.38860103626943, + "eval_accuracy": 0.6708542713567839, + "eval_loss": 0.7062853574752808, + "eval_runtime": 23.6465, + "eval_samples_per_second": 16.831, + "eval_steps_per_second": 2.114, + "step": 268 + }, + { + "epoch": 1.3937823834196892, + "grad_norm": 10.8125, + "learning_rate": 3.606217616580311e-05, + "loss": 0.7148, + "step": 269 + }, + { + "epoch": 1.3937823834196892, + "eval_accuracy": 0.6733668341708543, + "eval_loss": 0.7110552787780762, + "eval_runtime": 23.6584, + "eval_samples_per_second": 16.823, + "eval_steps_per_second": 2.113, + "step": 269 + }, + { + "epoch": 1.3989637305699483, + "grad_norm": 10.75, + "learning_rate": 3.601036269430052e-05, + "loss": 0.8438, + "step": 270 + }, + { + "epoch": 1.3989637305699483, + "eval_accuracy": 0.6758793969849246, + "eval_loss": 0.7138034105300903, + "eval_runtime": 23.6525, + "eval_samples_per_second": 16.827, + "eval_steps_per_second": 2.114, + "step": 270 + }, + { + "epoch": 1.4041450777202074, + "grad_norm": 14.0625, + "learning_rate": 3.595854922279793e-05, + "loss": 0.9141, + "step": 271 + }, + { + "epoch": 1.4041450777202074, + "eval_accuracy": 0.6582914572864321, + "eval_loss": 0.7121152877807617, + "eval_runtime": 23.6746, + "eval_samples_per_second": 16.811, + "eval_steps_per_second": 2.112, + "step": 271 + }, + { + "epoch": 1.4093264248704664, + "grad_norm": 12.0625, + "learning_rate": 3.590673575129534e-05, + "loss": 1.0078, + "step": 272 + }, + { + "epoch": 1.4093264248704664, + "eval_accuracy": 0.6633165829145728, + "eval_loss": 0.7091708779335022, + "eval_runtime": 23.6747, + "eval_samples_per_second": 16.811, + "eval_steps_per_second": 2.112, + "step": 272 + }, + { + "epoch": 1.4145077720207253, + "grad_norm": 10.625, + "learning_rate": 3.5854922279792744e-05, + "loss": 0.7305, + "step": 273 + }, + { + "epoch": 1.4145077720207253, + "eval_accuracy": 0.6708542713567839, + "eval_loss": 0.7040672302246094, + "eval_runtime": 23.6619, + "eval_samples_per_second": 16.82, + "eval_steps_per_second": 2.113, + "step": 273 + }, + { + "epoch": 1.4196891191709844, + "grad_norm": 9.625, + "learning_rate": 3.580310880829016e-05, + "loss": 0.5742, + "step": 274 + }, + { + "epoch": 1.4196891191709844, + "eval_accuracy": 0.6658291457286433, + "eval_loss": 0.7003179788589478, + "eval_runtime": 23.6844, + "eval_samples_per_second": 16.804, + "eval_steps_per_second": 2.111, + "step": 274 + }, + { + "epoch": 1.4248704663212435, + "grad_norm": 22.375, + "learning_rate": 3.575129533678757e-05, + "loss": 0.6016, + "step": 275 + }, + { + "epoch": 1.4248704663212435, + "eval_accuracy": 0.6683417085427136, + "eval_loss": 0.698512077331543, + "eval_runtime": 23.7207, + "eval_samples_per_second": 16.779, + "eval_steps_per_second": 2.108, + "step": 275 + }, + { + "epoch": 1.4300518134715026, + "grad_norm": 14.4375, + "learning_rate": 3.5699481865284977e-05, + "loss": 0.4238, + "step": 276 + }, + { + "epoch": 1.4300518134715026, + "eval_accuracy": 0.6633165829145728, + "eval_loss": 0.6935458779335022, + "eval_runtime": 23.6683, + "eval_samples_per_second": 16.816, + "eval_steps_per_second": 2.113, + "step": 276 + }, + { + "epoch": 1.4352331606217616, + "grad_norm": 8.3125, + "learning_rate": 3.5647668393782385e-05, + "loss": 0.582, + "step": 277 + }, + { + "epoch": 1.4352331606217616, + "eval_accuracy": 0.6708542713567839, + "eval_loss": 0.6897377371788025, + "eval_runtime": 23.6469, + "eval_samples_per_second": 16.831, + "eval_steps_per_second": 2.114, + "step": 277 + }, + { + "epoch": 1.4404145077720207, + "grad_norm": 6.1875, + "learning_rate": 3.559585492227979e-05, + "loss": 0.416, + "step": 278 + }, + { + "epoch": 1.4404145077720207, + "eval_accuracy": 0.6557788944723618, + "eval_loss": 0.686793327331543, + "eval_runtime": 23.6436, + "eval_samples_per_second": 16.833, + "eval_steps_per_second": 2.115, + "step": 278 + }, + { + "epoch": 1.4455958549222798, + "grad_norm": 9.875, + "learning_rate": 3.55440414507772e-05, + "loss": 0.4277, + "step": 279 + }, + { + "epoch": 1.4455958549222798, + "eval_accuracy": 0.6708542713567839, + "eval_loss": 0.6826122999191284, + "eval_runtime": 23.5911, + "eval_samples_per_second": 16.871, + "eval_steps_per_second": 2.119, + "step": 279 + }, + { + "epoch": 1.450777202072539, + "grad_norm": 17.125, + "learning_rate": 3.549222797927461e-05, + "loss": 1.0938, + "step": 280 + }, + { + "epoch": 1.450777202072539, + "eval_accuracy": 0.6658291457286433, + "eval_loss": 0.6798052787780762, + "eval_runtime": 23.6107, + "eval_samples_per_second": 16.857, + "eval_steps_per_second": 2.118, + "step": 280 + }, + { + "epoch": 1.455958549222798, + "grad_norm": 6.59375, + "learning_rate": 3.5440414507772025e-05, + "loss": 0.3555, + "step": 281 + }, + { + "epoch": 1.455958549222798, + "eval_accuracy": 0.6658291457286433, + "eval_loss": 0.6782545447349548, + "eval_runtime": 23.6365, + "eval_samples_per_second": 16.838, + "eval_steps_per_second": 2.115, + "step": 281 + }, + { + "epoch": 1.4611398963730569, + "grad_norm": 7.03125, + "learning_rate": 3.5388601036269434e-05, + "loss": 0.5859, + "step": 282 + }, + { + "epoch": 1.4611398963730569, + "eval_accuracy": 0.6683417085427136, + "eval_loss": 0.6789612174034119, + "eval_runtime": 23.6354, + "eval_samples_per_second": 16.839, + "eval_steps_per_second": 2.115, + "step": 282 + }, + { + "epoch": 1.466321243523316, + "grad_norm": 15.4375, + "learning_rate": 3.533678756476684e-05, + "loss": 1.2734, + "step": 283 + }, + { + "epoch": 1.466321243523316, + "eval_accuracy": 0.6683417085427136, + "eval_loss": 0.6776067614555359, + "eval_runtime": 23.7058, + "eval_samples_per_second": 16.789, + "eval_steps_per_second": 2.109, + "step": 283 + }, + { + "epoch": 1.471502590673575, + "grad_norm": 8.125, + "learning_rate": 3.528497409326425e-05, + "loss": 0.4551, + "step": 284 + }, + { + "epoch": 1.471502590673575, + "eval_accuracy": 0.6658291457286433, + "eval_loss": 0.6754671931266785, + "eval_runtime": 23.6738, + "eval_samples_per_second": 16.812, + "eval_steps_per_second": 2.112, + "step": 284 + }, + { + "epoch": 1.4766839378238341, + "grad_norm": 10.375, + "learning_rate": 3.523316062176166e-05, + "loss": 0.582, + "step": 285 + }, + { + "epoch": 1.4766839378238341, + "eval_accuracy": 0.6708542713567839, + "eval_loss": 0.6741716265678406, + "eval_runtime": 23.714, + "eval_samples_per_second": 16.783, + "eval_steps_per_second": 2.108, + "step": 285 + }, + { + "epoch": 1.4818652849740932, + "grad_norm": 11.0625, + "learning_rate": 3.518134715025907e-05, + "loss": 0.5078, + "step": 286 + }, + { + "epoch": 1.4818652849740932, + "eval_accuracy": 0.6708542713567839, + "eval_loss": 0.6731312870979309, + "eval_runtime": 23.6618, + "eval_samples_per_second": 16.82, + "eval_steps_per_second": 2.113, + "step": 286 + }, + { + "epoch": 1.4870466321243523, + "grad_norm": 7.59375, + "learning_rate": 3.5129533678756476e-05, + "loss": 0.4668, + "step": 287 + }, + { + "epoch": 1.4870466321243523, + "eval_accuracy": 0.6758793969849246, + "eval_loss": 0.6729742288589478, + "eval_runtime": 23.6713, + "eval_samples_per_second": 16.814, + "eval_steps_per_second": 2.112, + "step": 287 + }, + { + "epoch": 1.4922279792746114, + "grad_norm": 7.0625, + "learning_rate": 3.507772020725389e-05, + "loss": 0.4336, + "step": 288 + }, + { + "epoch": 1.4922279792746114, + "eval_accuracy": 0.6658291457286433, + "eval_loss": 0.6729546189308167, + "eval_runtime": 23.6681, + "eval_samples_per_second": 16.816, + "eval_steps_per_second": 2.113, + "step": 288 + }, + { + "epoch": 1.4974093264248705, + "grad_norm": 9.875, + "learning_rate": 3.502590673575129e-05, + "loss": 0.4902, + "step": 289 + }, + { + "epoch": 1.4974093264248705, + "eval_accuracy": 0.6733668341708543, + "eval_loss": 0.6731116771697998, + "eval_runtime": 23.6974, + "eval_samples_per_second": 16.795, + "eval_steps_per_second": 2.11, + "step": 289 + }, + { + "epoch": 1.5025906735751295, + "grad_norm": 17.5, + "learning_rate": 3.497409326424871e-05, + "loss": 1.3672, + "step": 290 + }, + { + "epoch": 1.5025906735751295, + "eval_accuracy": 0.6733668341708543, + "eval_loss": 0.6709720492362976, + "eval_runtime": 23.6252, + "eval_samples_per_second": 16.846, + "eval_steps_per_second": 2.116, + "step": 290 + }, + { + "epoch": 1.5077720207253886, + "grad_norm": 13.25, + "learning_rate": 3.4922279792746116e-05, + "loss": 0.6328, + "step": 291 + }, + { + "epoch": 1.5077720207253886, + "eval_accuracy": 0.6683417085427136, + "eval_loss": 0.6689502000808716, + "eval_runtime": 23.6217, + "eval_samples_per_second": 16.849, + "eval_steps_per_second": 2.117, + "step": 291 + }, + { + "epoch": 1.5129533678756477, + "grad_norm": 31.75, + "learning_rate": 3.4870466321243525e-05, + "loss": 1.4844, + "step": 292 + }, + { + "epoch": 1.5129533678756477, + "eval_accuracy": 0.6733668341708543, + "eval_loss": 0.6669676303863525, + "eval_runtime": 23.567, + "eval_samples_per_second": 16.888, + "eval_steps_per_second": 2.122, + "step": 292 + }, + { + "epoch": 1.5181347150259068, + "grad_norm": 8.875, + "learning_rate": 3.481865284974093e-05, + "loss": 0.6797, + "step": 293 + }, + { + "epoch": 1.5181347150259068, + "eval_accuracy": 0.6683417085427136, + "eval_loss": 0.6640821099281311, + "eval_runtime": 23.5831, + "eval_samples_per_second": 16.877, + "eval_steps_per_second": 2.12, + "step": 293 + }, + { + "epoch": 1.5233160621761659, + "grad_norm": 9.8125, + "learning_rate": 3.476683937823834e-05, + "loss": 0.6289, + "step": 294 + }, + { + "epoch": 1.5233160621761659, + "eval_accuracy": 0.6708542713567839, + "eval_loss": 0.6612358689308167, + "eval_runtime": 23.5326, + "eval_samples_per_second": 16.913, + "eval_steps_per_second": 2.125, + "step": 294 + }, + { + "epoch": 1.528497409326425, + "grad_norm": 24.125, + "learning_rate": 3.471502590673576e-05, + "loss": 0.707, + "step": 295 + }, + { + "epoch": 1.528497409326425, + "eval_accuracy": 0.6834170854271356, + "eval_loss": 0.6599599719047546, + "eval_runtime": 23.5247, + "eval_samples_per_second": 16.918, + "eval_steps_per_second": 2.125, + "step": 295 + }, + { + "epoch": 1.533678756476684, + "grad_norm": 11.0, + "learning_rate": 3.466321243523316e-05, + "loss": 0.6602, + "step": 296 + }, + { + "epoch": 1.533678756476684, + "eval_accuracy": 0.6708542713567839, + "eval_loss": 0.6596262454986572, + "eval_runtime": 23.5653, + "eval_samples_per_second": 16.889, + "eval_steps_per_second": 2.122, + "step": 296 + }, + { + "epoch": 1.5388601036269431, + "grad_norm": 8.875, + "learning_rate": 3.4611398963730574e-05, + "loss": 0.6562, + "step": 297 + }, + { + "epoch": 1.5388601036269431, + "eval_accuracy": 0.678391959798995, + "eval_loss": 0.6592532992362976, + "eval_runtime": 23.518, + "eval_samples_per_second": 16.923, + "eval_steps_per_second": 2.126, + "step": 297 + }, + { + "epoch": 1.5440414507772022, + "grad_norm": 14.375, + "learning_rate": 3.455958549222798e-05, + "loss": 0.7148, + "step": 298 + }, + { + "epoch": 1.5440414507772022, + "eval_accuracy": 0.6683417085427136, + "eval_loss": 0.6590962409973145, + "eval_runtime": 23.5831, + "eval_samples_per_second": 16.876, + "eval_steps_per_second": 2.12, + "step": 298 + }, + { + "epoch": 1.549222797927461, + "grad_norm": 5.78125, + "learning_rate": 3.450777202072539e-05, + "loss": 0.4082, + "step": 299 + }, + { + "epoch": 1.549222797927461, + "eval_accuracy": 0.6683417085427136, + "eval_loss": 0.6611377000808716, + "eval_runtime": 23.6173, + "eval_samples_per_second": 16.852, + "eval_steps_per_second": 2.117, + "step": 299 + }, + { + "epoch": 1.5544041450777202, + "grad_norm": 10.3125, + "learning_rate": 3.44559585492228e-05, + "loss": 0.8242, + "step": 300 + }, + { + "epoch": 1.5544041450777202, + "eval_accuracy": 0.6708542713567839, + "eval_loss": 0.6608825325965881, + "eval_runtime": 23.6417, + "eval_samples_per_second": 16.835, + "eval_steps_per_second": 2.115, + "step": 300 + }, + { + "epoch": 1.5595854922279793, + "grad_norm": 4.875, + "learning_rate": 3.440414507772021e-05, + "loss": 0.3867, + "step": 301 + }, + { + "epoch": 1.5595854922279793, + "eval_accuracy": 0.6708542713567839, + "eval_loss": 0.6618443727493286, + "eval_runtime": 23.6225, + "eval_samples_per_second": 16.848, + "eval_steps_per_second": 2.117, + "step": 301 + }, + { + "epoch": 1.5647668393782384, + "grad_norm": 6.53125, + "learning_rate": 3.435233160621762e-05, + "loss": 0.373, + "step": 302 + }, + { + "epoch": 1.5647668393782384, + "eval_accuracy": 0.678391959798995, + "eval_loss": 0.662786602973938, + "eval_runtime": 23.5624, + "eval_samples_per_second": 16.891, + "eval_steps_per_second": 2.122, + "step": 302 + }, + { + "epoch": 1.5699481865284974, + "grad_norm": 9.9375, + "learning_rate": 3.4300518134715024e-05, + "loss": 0.8125, + "step": 303 + }, + { + "epoch": 1.5699481865284974, + "eval_accuracy": 0.6708542713567839, + "eval_loss": 0.6617462038993835, + "eval_runtime": 23.6249, + "eval_samples_per_second": 16.847, + "eval_steps_per_second": 2.116, + "step": 303 + }, + { + "epoch": 1.5751295336787565, + "grad_norm": 11.0625, + "learning_rate": 3.424870466321244e-05, + "loss": 0.8359, + "step": 304 + }, + { + "epoch": 1.5751295336787565, + "eval_accuracy": 0.6758793969849246, + "eval_loss": 0.6596066355705261, + "eval_runtime": 23.6514, + "eval_samples_per_second": 16.828, + "eval_steps_per_second": 2.114, + "step": 304 + }, + { + "epoch": 1.5803108808290154, + "grad_norm": 17.875, + "learning_rate": 3.419689119170985e-05, + "loss": 0.4922, + "step": 305 + }, + { + "epoch": 1.5803108808290154, + "eval_accuracy": 0.6834170854271356, + "eval_loss": 0.658232569694519, + "eval_runtime": 23.66, + "eval_samples_per_second": 16.822, + "eval_steps_per_second": 2.113, + "step": 305 + }, + { + "epoch": 1.5854922279792745, + "grad_norm": 6.34375, + "learning_rate": 3.4145077720207256e-05, + "loss": 0.5781, + "step": 306 + }, + { + "epoch": 1.5854922279792745, + "eval_accuracy": 0.6758793969849246, + "eval_loss": 0.6578007340431213, + "eval_runtime": 23.6442, + "eval_samples_per_second": 16.833, + "eval_steps_per_second": 2.115, + "step": 306 + }, + { + "epoch": 1.5906735751295336, + "grad_norm": 13.9375, + "learning_rate": 3.4093264248704665e-05, + "loss": 0.6484, + "step": 307 + }, + { + "epoch": 1.5906735751295336, + "eval_accuracy": 0.6708542713567839, + "eval_loss": 0.65691739320755, + "eval_runtime": 23.6915, + "eval_samples_per_second": 16.799, + "eval_steps_per_second": 2.11, + "step": 307 + }, + { + "epoch": 1.5958549222797926, + "grad_norm": 11.125, + "learning_rate": 3.404145077720207e-05, + "loss": 0.9805, + "step": 308 + }, + { + "epoch": 1.5958549222797926, + "eval_accuracy": 0.6733668341708543, + "eval_loss": 0.6560733318328857, + "eval_runtime": 23.6544, + "eval_samples_per_second": 16.826, + "eval_steps_per_second": 2.114, + "step": 308 + }, + { + "epoch": 1.6010362694300517, + "grad_norm": 9.4375, + "learning_rate": 3.398963730569948e-05, + "loss": 0.8359, + "step": 309 + }, + { + "epoch": 1.6010362694300517, + "eval_accuracy": 0.6884422110552764, + "eval_loss": 0.6554452180862427, + "eval_runtime": 23.6418, + "eval_samples_per_second": 16.835, + "eval_steps_per_second": 2.115, + "step": 309 + }, + { + "epoch": 1.6062176165803108, + "grad_norm": 11.5625, + "learning_rate": 3.393782383419689e-05, + "loss": 0.6797, + "step": 310 + }, + { + "epoch": 1.6062176165803108, + "eval_accuracy": 0.6859296482412061, + "eval_loss": 0.6549937129020691, + "eval_runtime": 23.7088, + "eval_samples_per_second": 16.787, + "eval_steps_per_second": 2.109, + "step": 310 + }, + { + "epoch": 1.61139896373057, + "grad_norm": 11.3125, + "learning_rate": 3.3886010362694305e-05, + "loss": 1.0156, + "step": 311 + }, + { + "epoch": 1.61139896373057, + "eval_accuracy": 0.6834170854271356, + "eval_loss": 0.6533448696136475, + "eval_runtime": 23.6471, + "eval_samples_per_second": 16.831, + "eval_steps_per_second": 2.114, + "step": 311 + }, + { + "epoch": 1.616580310880829, + "grad_norm": 8.0625, + "learning_rate": 3.3834196891191713e-05, + "loss": 0.8984, + "step": 312 + }, + { + "epoch": 1.616580310880829, + "eval_accuracy": 0.6884422110552764, + "eval_loss": 0.6514407992362976, + "eval_runtime": 23.6394, + "eval_samples_per_second": 16.836, + "eval_steps_per_second": 2.115, + "step": 312 + }, + { + "epoch": 1.621761658031088, + "grad_norm": 8.9375, + "learning_rate": 3.378238341968912e-05, + "loss": 0.8711, + "step": 313 + }, + { + "epoch": 1.621761658031088, + "eval_accuracy": 0.6834170854271356, + "eval_loss": 0.6496741771697998, + "eval_runtime": 23.5726, + "eval_samples_per_second": 16.884, + "eval_steps_per_second": 2.121, + "step": 313 + }, + { + "epoch": 1.6269430051813472, + "grad_norm": 9.8125, + "learning_rate": 3.373056994818653e-05, + "loss": 0.6289, + "step": 314 + }, + { + "epoch": 1.6269430051813472, + "eval_accuracy": 0.6909547738693468, + "eval_loss": 0.6471812129020691, + "eval_runtime": 23.5983, + "eval_samples_per_second": 16.866, + "eval_steps_per_second": 2.119, + "step": 314 + }, + { + "epoch": 1.6321243523316062, + "grad_norm": 6.375, + "learning_rate": 3.367875647668394e-05, + "loss": 0.4863, + "step": 315 + }, + { + "epoch": 1.6321243523316062, + "eval_accuracy": 0.6834170854271356, + "eval_loss": 0.6469653248786926, + "eval_runtime": 23.5414, + "eval_samples_per_second": 16.906, + "eval_steps_per_second": 2.124, + "step": 315 + }, + { + "epoch": 1.6373056994818653, + "grad_norm": 8.75, + "learning_rate": 3.362694300518135e-05, + "loss": 0.6797, + "step": 316 + }, + { + "epoch": 1.6373056994818653, + "eval_accuracy": 0.6934673366834171, + "eval_loss": 0.6460034251213074, + "eval_runtime": 23.5267, + "eval_samples_per_second": 16.917, + "eval_steps_per_second": 2.125, + "step": 316 + }, + { + "epoch": 1.6424870466321244, + "grad_norm": 6.625, + "learning_rate": 3.3575129533678756e-05, + "loss": 0.6172, + "step": 317 + }, + { + "epoch": 1.6424870466321244, + "eval_accuracy": 0.678391959798995, + "eval_loss": 0.6442171931266785, + "eval_runtime": 23.5157, + "eval_samples_per_second": 16.925, + "eval_steps_per_second": 2.126, + "step": 317 + }, + { + "epoch": 1.6476683937823835, + "grad_norm": 9.9375, + "learning_rate": 3.352331606217617e-05, + "loss": 0.6055, + "step": 318 + }, + { + "epoch": 1.6476683937823835, + "eval_accuracy": 0.6884422110552764, + "eval_loss": 0.6437264680862427, + "eval_runtime": 23.5541, + "eval_samples_per_second": 16.897, + "eval_steps_per_second": 2.123, + "step": 318 + }, + { + "epoch": 1.6528497409326426, + "grad_norm": 10.8125, + "learning_rate": 3.347150259067357e-05, + "loss": 0.793, + "step": 319 + }, + { + "epoch": 1.6528497409326426, + "eval_accuracy": 0.6809045226130653, + "eval_loss": 0.6424309015274048, + "eval_runtime": 23.5046, + "eval_samples_per_second": 16.933, + "eval_steps_per_second": 2.127, + "step": 319 + }, + { + "epoch": 1.6580310880829017, + "grad_norm": 6.4375, + "learning_rate": 3.341968911917099e-05, + "loss": 0.3848, + "step": 320 + }, + { + "epoch": 1.6580310880829017, + "eval_accuracy": 0.6859296482412061, + "eval_loss": 0.6412138938903809, + "eval_runtime": 23.5439, + "eval_samples_per_second": 16.905, + "eval_steps_per_second": 2.124, + "step": 320 + }, + { + "epoch": 1.6632124352331608, + "grad_norm": 7.625, + "learning_rate": 3.3367875647668396e-05, + "loss": 0.6992, + "step": 321 + }, + { + "epoch": 1.6632124352331608, + "eval_accuracy": 0.6884422110552764, + "eval_loss": 0.6410176157951355, + "eval_runtime": 23.5276, + "eval_samples_per_second": 16.916, + "eval_steps_per_second": 2.125, + "step": 321 + }, + { + "epoch": 1.6683937823834198, + "grad_norm": 6.0625, + "learning_rate": 3.3316062176165804e-05, + "loss": 0.4277, + "step": 322 + }, + { + "epoch": 1.6683937823834198, + "eval_accuracy": 0.678391959798995, + "eval_loss": 0.6406838893890381, + "eval_runtime": 23.6405, + "eval_samples_per_second": 16.836, + "eval_steps_per_second": 2.115, + "step": 322 + }, + { + "epoch": 1.6735751295336787, + "grad_norm": 22.625, + "learning_rate": 3.326424870466321e-05, + "loss": 0.8828, + "step": 323 + }, + { + "epoch": 1.6735751295336787, + "eval_accuracy": 0.6934673366834171, + "eval_loss": 0.6387798190116882, + "eval_runtime": 23.6269, + "eval_samples_per_second": 16.845, + "eval_steps_per_second": 2.116, + "step": 323 + }, + { + "epoch": 1.6787564766839378, + "grad_norm": 7.09375, + "learning_rate": 3.321243523316062e-05, + "loss": 0.6367, + "step": 324 + }, + { + "epoch": 1.6787564766839378, + "eval_accuracy": 0.6859296482412061, + "eval_loss": 0.6390153765678406, + "eval_runtime": 23.6355, + "eval_samples_per_second": 16.839, + "eval_steps_per_second": 2.115, + "step": 324 + }, + { + "epoch": 1.6839378238341969, + "grad_norm": 9.5625, + "learning_rate": 3.3160621761658036e-05, + "loss": 0.5234, + "step": 325 + }, + { + "epoch": 1.6839378238341969, + "eval_accuracy": 0.678391959798995, + "eval_loss": 0.6381517052650452, + "eval_runtime": 23.6462, + "eval_samples_per_second": 16.831, + "eval_steps_per_second": 2.115, + "step": 325 + }, + { + "epoch": 1.689119170984456, + "grad_norm": 7.0625, + "learning_rate": 3.310880829015544e-05, + "loss": 0.6172, + "step": 326 + }, + { + "epoch": 1.689119170984456, + "eval_accuracy": 0.6859296482412061, + "eval_loss": 0.6377394795417786, + "eval_runtime": 23.6845, + "eval_samples_per_second": 16.804, + "eval_steps_per_second": 2.111, + "step": 326 + }, + { + "epoch": 1.694300518134715, + "grad_norm": 5.59375, + "learning_rate": 3.305699481865285e-05, + "loss": 0.4414, + "step": 327 + }, + { + "epoch": 1.694300518134715, + "eval_accuracy": 0.6809045226130653, + "eval_loss": 0.6377002000808716, + "eval_runtime": 23.69, + "eval_samples_per_second": 16.8, + "eval_steps_per_second": 2.111, + "step": 327 + }, + { + "epoch": 1.6994818652849741, + "grad_norm": 5.5625, + "learning_rate": 3.300518134715026e-05, + "loss": 0.543, + "step": 328 + }, + { + "epoch": 1.6994818652849741, + "eval_accuracy": 0.6834170854271356, + "eval_loss": 0.6381909251213074, + "eval_runtime": 23.6437, + "eval_samples_per_second": 16.833, + "eval_steps_per_second": 2.115, + "step": 328 + }, + { + "epoch": 1.704663212435233, + "grad_norm": 6.75, + "learning_rate": 3.295336787564767e-05, + "loss": 0.4844, + "step": 329 + }, + { + "epoch": 1.704663212435233, + "eval_accuracy": 0.6834170854271356, + "eval_loss": 0.6387209296226501, + "eval_runtime": 23.6542, + "eval_samples_per_second": 16.826, + "eval_steps_per_second": 2.114, + "step": 329 + }, + { + "epoch": 1.709844559585492, + "grad_norm": 8.4375, + "learning_rate": 3.290155440414508e-05, + "loss": 0.7461, + "step": 330 + }, + { + "epoch": 1.709844559585492, + "eval_accuracy": 0.6834170854271356, + "eval_loss": 0.6385050415992737, + "eval_runtime": 23.6676, + "eval_samples_per_second": 16.816, + "eval_steps_per_second": 2.113, + "step": 330 + }, + { + "epoch": 1.7150259067357512, + "grad_norm": 9.375, + "learning_rate": 3.284974093264249e-05, + "loss": 0.8242, + "step": 331 + }, + { + "epoch": 1.7150259067357512, + "eval_accuracy": 0.6859296482412061, + "eval_loss": 0.6382105946540833, + "eval_runtime": 23.6732, + "eval_samples_per_second": 16.812, + "eval_steps_per_second": 2.112, + "step": 331 + }, + { + "epoch": 1.7202072538860103, + "grad_norm": 9.3125, + "learning_rate": 3.27979274611399e-05, + "loss": 0.5273, + "step": 332 + }, + { + "epoch": 1.7202072538860103, + "eval_accuracy": 0.6758793969849246, + "eval_loss": 0.6377002000808716, + "eval_runtime": 23.6749, + "eval_samples_per_second": 16.811, + "eval_steps_per_second": 2.112, + "step": 332 + }, + { + "epoch": 1.7253886010362693, + "grad_norm": 12.75, + "learning_rate": 3.2746113989637304e-05, + "loss": 1.0469, + "step": 333 + }, + { + "epoch": 1.7253886010362693, + "eval_accuracy": 0.6733668341708543, + "eval_loss": 0.6377590894699097, + "eval_runtime": 23.6834, + "eval_samples_per_second": 16.805, + "eval_steps_per_second": 2.111, + "step": 333 + }, + { + "epoch": 1.7305699481865284, + "grad_norm": 7.9375, + "learning_rate": 3.269430051813472e-05, + "loss": 0.7695, + "step": 334 + }, + { + "epoch": 1.7305699481865284, + "eval_accuracy": 0.6809045226130653, + "eval_loss": 0.6362084150314331, + "eval_runtime": 23.7122, + "eval_samples_per_second": 16.785, + "eval_steps_per_second": 2.109, + "step": 334 + }, + { + "epoch": 1.7357512953367875, + "grad_norm": 5.0625, + "learning_rate": 3.264248704663213e-05, + "loss": 0.3945, + "step": 335 + }, + { + "epoch": 1.7357512953367875, + "eval_accuracy": 0.6834170854271356, + "eval_loss": 0.6346772909164429, + "eval_runtime": 23.6107, + "eval_samples_per_second": 16.857, + "eval_steps_per_second": 2.118, + "step": 335 + }, + { + "epoch": 1.7409326424870466, + "grad_norm": 7.40625, + "learning_rate": 3.2590673575129536e-05, + "loss": 0.7891, + "step": 336 + }, + { + "epoch": 1.7409326424870466, + "eval_accuracy": 0.6834170854271356, + "eval_loss": 0.6336369514465332, + "eval_runtime": 23.618, + "eval_samples_per_second": 16.852, + "eval_steps_per_second": 2.117, + "step": 336 + }, + { + "epoch": 1.7461139896373057, + "grad_norm": 9.625, + "learning_rate": 3.2538860103626944e-05, + "loss": 0.832, + "step": 337 + }, + { + "epoch": 1.7461139896373057, + "eval_accuracy": 0.6834170854271356, + "eval_loss": 0.6320273280143738, + "eval_runtime": 23.5946, + "eval_samples_per_second": 16.868, + "eval_steps_per_second": 2.119, + "step": 337 + }, + { + "epoch": 1.7512953367875648, + "grad_norm": 7.78125, + "learning_rate": 3.248704663212435e-05, + "loss": 0.7695, + "step": 338 + }, + { + "epoch": 1.7512953367875648, + "eval_accuracy": 0.6834170854271356, + "eval_loss": 0.6302999258041382, + "eval_runtime": 23.5379, + "eval_samples_per_second": 16.909, + "eval_steps_per_second": 2.124, + "step": 338 + }, + { + "epoch": 1.7564766839378239, + "grad_norm": 6.6875, + "learning_rate": 3.243523316062176e-05, + "loss": 0.6094, + "step": 339 + }, + { + "epoch": 1.7564766839378239, + "eval_accuracy": 0.6834170854271356, + "eval_loss": 0.6292399764060974, + "eval_runtime": 23.574, + "eval_samples_per_second": 16.883, + "eval_steps_per_second": 2.121, + "step": 339 + }, + { + "epoch": 1.761658031088083, + "grad_norm": 5.65625, + "learning_rate": 3.238341968911917e-05, + "loss": 0.5195, + "step": 340 + }, + { + "epoch": 1.761658031088083, + "eval_accuracy": 0.6859296482412061, + "eval_loss": 0.6274732947349548, + "eval_runtime": 23.5145, + "eval_samples_per_second": 16.926, + "eval_steps_per_second": 2.126, + "step": 340 + }, + { + "epoch": 1.766839378238342, + "grad_norm": 13.375, + "learning_rate": 3.2331606217616585e-05, + "loss": 0.832, + "step": 341 + }, + { + "epoch": 1.766839378238342, + "eval_accuracy": 0.6859296482412061, + "eval_loss": 0.6259029507637024, + "eval_runtime": 23.5481, + "eval_samples_per_second": 16.902, + "eval_steps_per_second": 2.123, + "step": 341 + }, + { + "epoch": 1.7720207253886011, + "grad_norm": 4.9375, + "learning_rate": 3.227979274611399e-05, + "loss": 0.2773, + "step": 342 + }, + { + "epoch": 1.7720207253886011, + "eval_accuracy": 0.6859296482412061, + "eval_loss": 0.6244111061096191, + "eval_runtime": 23.5009, + "eval_samples_per_second": 16.936, + "eval_steps_per_second": 2.128, + "step": 342 + }, + { + "epoch": 1.7772020725388602, + "grad_norm": 4.96875, + "learning_rate": 3.22279792746114e-05, + "loss": 0.4238, + "step": 343 + }, + { + "epoch": 1.7772020725388602, + "eval_accuracy": 0.6909547738693468, + "eval_loss": 0.6229389309883118, + "eval_runtime": 23.5402, + "eval_samples_per_second": 16.907, + "eval_steps_per_second": 2.124, + "step": 343 + }, + { + "epoch": 1.7823834196891193, + "grad_norm": 6.03125, + "learning_rate": 3.217616580310881e-05, + "loss": 0.5195, + "step": 344 + }, + { + "epoch": 1.7823834196891193, + "eval_accuracy": 0.6884422110552764, + "eval_loss": 0.6220163106918335, + "eval_runtime": 23.5096, + "eval_samples_per_second": 16.929, + "eval_steps_per_second": 2.127, + "step": 344 + }, + { + "epoch": 1.7875647668393784, + "grad_norm": 10.3125, + "learning_rate": 3.212435233160622e-05, + "loss": 0.9375, + "step": 345 + }, + { + "epoch": 1.7875647668393784, + "eval_accuracy": 0.6934673366834171, + "eval_loss": 0.6224874258041382, + "eval_runtime": 23.5383, + "eval_samples_per_second": 16.909, + "eval_steps_per_second": 2.124, + "step": 345 + }, + { + "epoch": 1.7927461139896375, + "grad_norm": 5.15625, + "learning_rate": 3.207253886010363e-05, + "loss": 0.4004, + "step": 346 + }, + { + "epoch": 1.7927461139896375, + "eval_accuracy": 0.7010050251256281, + "eval_loss": 0.6217807531356812, + "eval_runtime": 23.6486, + "eval_samples_per_second": 16.83, + "eval_steps_per_second": 2.114, + "step": 346 + }, + { + "epoch": 1.7979274611398963, + "grad_norm": 6.125, + "learning_rate": 3.2020725388601035e-05, + "loss": 0.5703, + "step": 347 + }, + { + "epoch": 1.7979274611398963, + "eval_accuracy": 0.6934673366834171, + "eval_loss": 0.6214863657951355, + "eval_runtime": 23.6349, + "eval_samples_per_second": 16.839, + "eval_steps_per_second": 2.116, + "step": 347 + }, + { + "epoch": 1.8031088082901554, + "grad_norm": 7.0, + "learning_rate": 3.196891191709845e-05, + "loss": 0.5508, + "step": 348 + }, + { + "epoch": 1.8031088082901554, + "eval_accuracy": 0.6934673366834171, + "eval_loss": 0.6211722493171692, + "eval_runtime": 23.6832, + "eval_samples_per_second": 16.805, + "eval_steps_per_second": 2.111, + "step": 348 + }, + { + "epoch": 1.8082901554404145, + "grad_norm": 6.3125, + "learning_rate": 3.191709844559586e-05, + "loss": 0.4648, + "step": 349 + }, + { + "epoch": 1.8082901554404145, + "eval_accuracy": 0.6959798994974874, + "eval_loss": 0.6203281879425049, + "eval_runtime": 23.6518, + "eval_samples_per_second": 16.827, + "eval_steps_per_second": 2.114, + "step": 349 + }, + { + "epoch": 1.8134715025906736, + "grad_norm": 7.03125, + "learning_rate": 3.186528497409327e-05, + "loss": 0.5898, + "step": 350 + }, + { + "epoch": 1.8134715025906736, + "eval_accuracy": 0.6909547738693468, + "eval_loss": 0.6198767423629761, + "eval_runtime": 23.6644, + "eval_samples_per_second": 16.819, + "eval_steps_per_second": 2.113, + "step": 350 + }, + { + "epoch": 1.8186528497409327, + "grad_norm": 7.78125, + "learning_rate": 3.1813471502590676e-05, + "loss": 0.7539, + "step": 351 + }, + { + "epoch": 1.8186528497409327, + "eval_accuracy": 0.6984924623115578, + "eval_loss": 0.6190326809883118, + "eval_runtime": 23.5998, + "eval_samples_per_second": 16.865, + "eval_steps_per_second": 2.119, + "step": 351 + }, + { + "epoch": 1.8238341968911918, + "grad_norm": 8.1875, + "learning_rate": 3.1761658031088084e-05, + "loss": 0.6211, + "step": 352 + }, + { + "epoch": 1.8238341968911918, + "eval_accuracy": 0.6934673366834171, + "eval_loss": 0.6185222864151001, + "eval_runtime": 23.4606, + "eval_samples_per_second": 16.965, + "eval_steps_per_second": 2.131, + "step": 352 + }, + { + "epoch": 1.8290155440414506, + "grad_norm": 6.28125, + "learning_rate": 3.170984455958549e-05, + "loss": 0.5195, + "step": 353 + }, + { + "epoch": 1.8290155440414506, + "eval_accuracy": 0.7085427135678392, + "eval_loss": 0.6171090006828308, + "eval_runtime": 23.599, + "eval_samples_per_second": 16.865, + "eval_steps_per_second": 2.119, + "step": 353 + }, + { + "epoch": 1.8341968911917097, + "grad_norm": 4.125, + "learning_rate": 3.16580310880829e-05, + "loss": 0.2754, + "step": 354 + }, + { + "epoch": 1.8341968911917097, + "eval_accuracy": 0.6959798994974874, + "eval_loss": 0.6168930530548096, + "eval_runtime": 23.6226, + "eval_samples_per_second": 16.848, + "eval_steps_per_second": 2.117, + "step": 354 + }, + { + "epoch": 1.8393782383419688, + "grad_norm": 5.84375, + "learning_rate": 3.1606217616580316e-05, + "loss": 0.5234, + "step": 355 + }, + { + "epoch": 1.8393782383419688, + "eval_accuracy": 0.6959798994974874, + "eval_loss": 0.617796003818512, + "eval_runtime": 23.6737, + "eval_samples_per_second": 16.812, + "eval_steps_per_second": 2.112, + "step": 355 + }, + { + "epoch": 1.8445595854922279, + "grad_norm": 8.0, + "learning_rate": 3.155440414507772e-05, + "loss": 0.8047, + "step": 356 + }, + { + "epoch": 1.8445595854922279, + "eval_accuracy": 0.6884422110552764, + "eval_loss": 0.617894172668457, + "eval_runtime": 23.6591, + "eval_samples_per_second": 16.822, + "eval_steps_per_second": 2.113, + "step": 356 + }, + { + "epoch": 1.849740932642487, + "grad_norm": 6.71875, + "learning_rate": 3.150259067357513e-05, + "loss": 0.6797, + "step": 357 + }, + { + "epoch": 1.849740932642487, + "eval_accuracy": 0.6909547738693468, + "eval_loss": 0.6174819469451904, + "eval_runtime": 23.6576, + "eval_samples_per_second": 16.823, + "eval_steps_per_second": 2.113, + "step": 357 + }, + { + "epoch": 1.854922279792746, + "grad_norm": 7.125, + "learning_rate": 3.145077720207254e-05, + "loss": 0.6211, + "step": 358 + }, + { + "epoch": 1.854922279792746, + "eval_accuracy": 0.6984924623115578, + "eval_loss": 0.6182867288589478, + "eval_runtime": 23.703, + "eval_samples_per_second": 16.791, + "eval_steps_per_second": 2.109, + "step": 358 + }, + { + "epoch": 1.8601036269430051, + "grad_norm": 10.3125, + "learning_rate": 3.139896373056995e-05, + "loss": 0.9297, + "step": 359 + }, + { + "epoch": 1.8601036269430051, + "eval_accuracy": 0.7035175879396985, + "eval_loss": 0.618463397026062, + "eval_runtime": 23.6923, + "eval_samples_per_second": 16.799, + "eval_steps_per_second": 2.11, + "step": 359 + }, + { + "epoch": 1.8652849740932642, + "grad_norm": 6.8125, + "learning_rate": 3.134715025906736e-05, + "loss": 0.4883, + "step": 360 + }, + { + "epoch": 1.8652849740932642, + "eval_accuracy": 0.6984924623115578, + "eval_loss": 0.6196411848068237, + "eval_runtime": 23.6607, + "eval_samples_per_second": 16.821, + "eval_steps_per_second": 2.113, + "step": 360 + }, + { + "epoch": 1.8704663212435233, + "grad_norm": 7.625, + "learning_rate": 3.1295336787564767e-05, + "loss": 0.6797, + "step": 361 + }, + { + "epoch": 1.8704663212435233, + "eval_accuracy": 0.6934673366834171, + "eval_loss": 0.6205637454986572, + "eval_runtime": 23.6669, + "eval_samples_per_second": 16.817, + "eval_steps_per_second": 2.113, + "step": 361 + }, + { + "epoch": 1.8756476683937824, + "grad_norm": 4.78125, + "learning_rate": 3.124352331606218e-05, + "loss": 0.4219, + "step": 362 + }, + { + "epoch": 1.8756476683937824, + "eval_accuracy": 0.6884422110552764, + "eval_loss": 0.620485246181488, + "eval_runtime": 23.717, + "eval_samples_per_second": 16.781, + "eval_steps_per_second": 2.108, + "step": 362 + }, + { + "epoch": 1.8808290155440415, + "grad_norm": 5.84375, + "learning_rate": 3.1191709844559583e-05, + "loss": 0.5312, + "step": 363 + }, + { + "epoch": 1.8808290155440415, + "eval_accuracy": 0.6934673366834171, + "eval_loss": 0.6215844750404358, + "eval_runtime": 23.7024, + "eval_samples_per_second": 16.792, + "eval_steps_per_second": 2.109, + "step": 363 + }, + { + "epoch": 1.8860103626943006, + "grad_norm": 13.6875, + "learning_rate": 3.1139896373057e-05, + "loss": 0.9453, + "step": 364 + }, + { + "epoch": 1.8860103626943006, + "eval_accuracy": 0.6884422110552764, + "eval_loss": 0.6222322583198547, + "eval_runtime": 23.6463, + "eval_samples_per_second": 16.831, + "eval_steps_per_second": 2.114, + "step": 364 + }, + { + "epoch": 1.8911917098445596, + "grad_norm": 13.75, + "learning_rate": 3.108808290155441e-05, + "loss": 1.3672, + "step": 365 + }, + { + "epoch": 1.8911917098445596, + "eval_accuracy": 0.6984924623115578, + "eval_loss": 0.6221537590026855, + "eval_runtime": 23.6511, + "eval_samples_per_second": 16.828, + "eval_steps_per_second": 2.114, + "step": 365 + }, + { + "epoch": 1.8963730569948187, + "grad_norm": 7.8125, + "learning_rate": 3.1036269430051815e-05, + "loss": 0.5547, + "step": 366 + }, + { + "epoch": 1.8963730569948187, + "eval_accuracy": 0.6909547738693468, + "eval_loss": 0.6233903765678406, + "eval_runtime": 23.6265, + "eval_samples_per_second": 16.846, + "eval_steps_per_second": 2.116, + "step": 366 + }, + { + "epoch": 1.9015544041450778, + "grad_norm": 15.0, + "learning_rate": 3.0984455958549224e-05, + "loss": 1.5703, + "step": 367 + }, + { + "epoch": 1.9015544041450778, + "eval_accuracy": 0.6984924623115578, + "eval_loss": 0.6225855946540833, + "eval_runtime": 23.5801, + "eval_samples_per_second": 16.879, + "eval_steps_per_second": 2.12, + "step": 367 + }, + { + "epoch": 1.906735751295337, + "grad_norm": 8.5625, + "learning_rate": 3.093264248704663e-05, + "loss": 0.707, + "step": 368 + }, + { + "epoch": 1.906735751295337, + "eval_accuracy": 0.6859296482412061, + "eval_loss": 0.622408926486969, + "eval_runtime": 23.5671, + "eval_samples_per_second": 16.888, + "eval_steps_per_second": 2.122, + "step": 368 + }, + { + "epoch": 1.911917098445596, + "grad_norm": 5.4375, + "learning_rate": 3.088082901554404e-05, + "loss": 0.4414, + "step": 369 + }, + { + "epoch": 1.911917098445596, + "eval_accuracy": 0.6959798994974874, + "eval_loss": 0.6227622628211975, + "eval_runtime": 23.5656, + "eval_samples_per_second": 16.889, + "eval_steps_per_second": 2.122, + "step": 369 + }, + { + "epoch": 1.917098445595855, + "grad_norm": 5.4375, + "learning_rate": 3.082901554404145e-05, + "loss": 0.4727, + "step": 370 + }, + { + "epoch": 1.917098445595855, + "eval_accuracy": 0.6934673366834171, + "eval_loss": 0.6226444840431213, + "eval_runtime": 23.5842, + "eval_samples_per_second": 16.876, + "eval_steps_per_second": 2.12, + "step": 370 + }, + { + "epoch": 1.922279792746114, + "grad_norm": 7.53125, + "learning_rate": 3.0777202072538864e-05, + "loss": 0.5234, + "step": 371 + }, + { + "epoch": 1.922279792746114, + "eval_accuracy": 0.6934673366834171, + "eval_loss": 0.6228211522102356, + "eval_runtime": 23.5858, + "eval_samples_per_second": 16.875, + "eval_steps_per_second": 2.12, + "step": 371 + }, + { + "epoch": 1.927461139896373, + "grad_norm": 8.75, + "learning_rate": 3.072538860103627e-05, + "loss": 0.8398, + "step": 372 + }, + { + "epoch": 1.927461139896373, + "eval_accuracy": 0.6859296482412061, + "eval_loss": 0.6229389309883118, + "eval_runtime": 23.6628, + "eval_samples_per_second": 16.82, + "eval_steps_per_second": 2.113, + "step": 372 + }, + { + "epoch": 1.932642487046632, + "grad_norm": 8.8125, + "learning_rate": 3.067357512953368e-05, + "loss": 0.8398, + "step": 373 + }, + { + "epoch": 1.932642487046632, + "eval_accuracy": 0.6834170854271356, + "eval_loss": 0.6226052045822144, + "eval_runtime": 23.6438, + "eval_samples_per_second": 16.833, + "eval_steps_per_second": 2.115, + "step": 373 + }, + { + "epoch": 1.9378238341968912, + "grad_norm": 7.3125, + "learning_rate": 3.062176165803109e-05, + "loss": 0.7031, + "step": 374 + }, + { + "epoch": 1.9378238341968912, + "eval_accuracy": 0.6859296482412061, + "eval_loss": 0.6223893165588379, + "eval_runtime": 23.6616, + "eval_samples_per_second": 16.821, + "eval_steps_per_second": 2.113, + "step": 374 + }, + { + "epoch": 1.9430051813471503, + "grad_norm": 7.90625, + "learning_rate": 3.05699481865285e-05, + "loss": 0.8281, + "step": 375 + }, + { + "epoch": 1.9430051813471503, + "eval_accuracy": 0.6859296482412061, + "eval_loss": 0.6218593120574951, + "eval_runtime": 23.7067, + "eval_samples_per_second": 16.789, + "eval_steps_per_second": 2.109, + "step": 375 + }, + { + "epoch": 1.9481865284974094, + "grad_norm": 9.25, + "learning_rate": 3.0518134715025906e-05, + "loss": 0.6367, + "step": 376 + }, + { + "epoch": 1.9481865284974094, + "eval_accuracy": 0.6809045226130653, + "eval_loss": 0.6208385825157166, + "eval_runtime": 23.6584, + "eval_samples_per_second": 16.823, + "eval_steps_per_second": 2.113, + "step": 376 + }, + { + "epoch": 1.9533678756476682, + "grad_norm": 6.90625, + "learning_rate": 3.0466321243523315e-05, + "loss": 0.6875, + "step": 377 + }, + { + "epoch": 1.9533678756476682, + "eval_accuracy": 0.6934673366834171, + "eval_loss": 0.6193074584007263, + "eval_runtime": 23.6657, + "eval_samples_per_second": 16.818, + "eval_steps_per_second": 2.113, + "step": 377 + }, + { + "epoch": 1.9585492227979273, + "grad_norm": 5.6875, + "learning_rate": 3.0414507772020727e-05, + "loss": 0.5, + "step": 378 + }, + { + "epoch": 1.9585492227979273, + "eval_accuracy": 0.6934673366834171, + "eval_loss": 0.6188560128211975, + "eval_runtime": 23.6553, + "eval_samples_per_second": 16.825, + "eval_steps_per_second": 2.114, + "step": 378 + }, + { + "epoch": 1.9637305699481864, + "grad_norm": 10.3125, + "learning_rate": 3.0362694300518135e-05, + "loss": 0.8086, + "step": 379 + }, + { + "epoch": 1.9637305699481864, + "eval_accuracy": 0.6859296482412061, + "eval_loss": 0.6191504597663879, + "eval_runtime": 23.7099, + "eval_samples_per_second": 16.786, + "eval_steps_per_second": 2.109, + "step": 379 + }, + { + "epoch": 1.9689119170984455, + "grad_norm": 9.1875, + "learning_rate": 3.0310880829015547e-05, + "loss": 0.6172, + "step": 380 + }, + { + "epoch": 1.9689119170984455, + "eval_accuracy": 0.6884422110552764, + "eval_loss": 0.6183063983917236, + "eval_runtime": 23.6976, + "eval_samples_per_second": 16.795, + "eval_steps_per_second": 2.11, + "step": 380 + }, + { + "epoch": 1.9740932642487046, + "grad_norm": 7.5, + "learning_rate": 3.0259067357512955e-05, + "loss": 0.7266, + "step": 381 + }, + { + "epoch": 1.9740932642487046, + "eval_accuracy": 0.6934673366834171, + "eval_loss": 0.6173052787780762, + "eval_runtime": 23.6466, + "eval_samples_per_second": 16.831, + "eval_steps_per_second": 2.114, + "step": 381 + }, + { + "epoch": 1.9792746113989637, + "grad_norm": 7.5, + "learning_rate": 3.020725388601036e-05, + "loss": 0.6953, + "step": 382 + }, + { + "epoch": 1.9792746113989637, + "eval_accuracy": 0.6884422110552764, + "eval_loss": 0.6169323325157166, + "eval_runtime": 23.5457, + "eval_samples_per_second": 16.903, + "eval_steps_per_second": 2.124, + "step": 382 + }, + { + "epoch": 1.9844559585492227, + "grad_norm": 4.25, + "learning_rate": 3.0155440414507776e-05, + "loss": 0.2598, + "step": 383 + }, + { + "epoch": 1.9844559585492227, + "eval_accuracy": 0.6809045226130653, + "eval_loss": 0.6166574954986572, + "eval_runtime": 23.5577, + "eval_samples_per_second": 16.895, + "eval_steps_per_second": 2.122, + "step": 383 + }, + { + "epoch": 1.9896373056994818, + "grad_norm": 6.0625, + "learning_rate": 3.010362694300518e-05, + "loss": 0.5156, + "step": 384 + }, + { + "epoch": 1.9896373056994818, + "eval_accuracy": 0.6934673366834171, + "eval_loss": 0.6162845492362976, + "eval_runtime": 23.528, + "eval_samples_per_second": 16.916, + "eval_steps_per_second": 2.125, + "step": 384 + }, + { + "epoch": 1.994818652849741, + "grad_norm": 5.53125, + "learning_rate": 3.0051813471502592e-05, + "loss": 0.5078, + "step": 385 + }, + { + "epoch": 1.994818652849741, + "eval_accuracy": 0.6884422110552764, + "eval_loss": 0.6161863803863525, + "eval_runtime": 23.536, + "eval_samples_per_second": 16.91, + "eval_steps_per_second": 2.124, + "step": 385 + }, + { + "epoch": 2.0, + "grad_norm": 10.375, + "learning_rate": 3e-05, + "loss": 0.4863, + "step": 386 + }, + { + "epoch": 2.0, + "eval_accuracy": 0.6834170854271356, + "eval_loss": 0.6158919334411621, + "eval_runtime": 23.6283, + "eval_samples_per_second": 16.844, + "eval_steps_per_second": 2.116, + "step": 386 + }, + { + "epoch": 2.005181347150259, + "grad_norm": 8.3125, + "learning_rate": 2.9948186528497413e-05, + "loss": 0.5781, + "step": 387 + }, + { + "epoch": 2.005181347150259, + "eval_accuracy": 0.6884422110552764, + "eval_loss": 0.6153227090835571, + "eval_runtime": 23.6301, + "eval_samples_per_second": 16.843, + "eval_steps_per_second": 2.116, + "step": 387 + }, + { + "epoch": 2.010362694300518, + "grad_norm": 22.375, + "learning_rate": 2.989637305699482e-05, + "loss": 1.5, + "step": 388 + }, + { + "epoch": 2.010362694300518, + "eval_accuracy": 0.6909547738693468, + "eval_loss": 0.6150478720664978, + "eval_runtime": 23.6512, + "eval_samples_per_second": 16.828, + "eval_steps_per_second": 2.114, + "step": 388 + }, + { + "epoch": 2.0155440414507773, + "grad_norm": 6.125, + "learning_rate": 2.9844559585492226e-05, + "loss": 0.6289, + "step": 389 + }, + { + "epoch": 2.0155440414507773, + "eval_accuracy": 0.6834170854271356, + "eval_loss": 0.6142234802246094, + "eval_runtime": 23.6659, + "eval_samples_per_second": 16.817, + "eval_steps_per_second": 2.113, + "step": 389 + }, + { + "epoch": 2.0207253886010363, + "grad_norm": 7.5, + "learning_rate": 2.9792746113989638e-05, + "loss": 0.8828, + "step": 390 + }, + { + "epoch": 2.0207253886010363, + "eval_accuracy": 0.6884422110552764, + "eval_loss": 0.6144786477088928, + "eval_runtime": 23.6637, + "eval_samples_per_second": 16.819, + "eval_steps_per_second": 2.113, + "step": 390 + }, + { + "epoch": 2.0259067357512954, + "grad_norm": 6.125, + "learning_rate": 2.9740932642487046e-05, + "loss": 0.5859, + "step": 391 + }, + { + "epoch": 2.0259067357512954, + "eval_accuracy": 0.6859296482412061, + "eval_loss": 0.6144982576370239, + "eval_runtime": 23.6531, + "eval_samples_per_second": 16.827, + "eval_steps_per_second": 2.114, + "step": 391 + }, + { + "epoch": 2.0310880829015545, + "grad_norm": 6.09375, + "learning_rate": 2.9689119170984458e-05, + "loss": 0.6797, + "step": 392 + }, + { + "epoch": 2.0310880829015545, + "eval_accuracy": 0.6959798994974874, + "eval_loss": 0.6141253113746643, + "eval_runtime": 23.6479, + "eval_samples_per_second": 16.83, + "eval_steps_per_second": 2.114, + "step": 392 + }, + { + "epoch": 2.0362694300518136, + "grad_norm": 5.96875, + "learning_rate": 2.9637305699481866e-05, + "loss": 0.5391, + "step": 393 + }, + { + "epoch": 2.0362694300518136, + "eval_accuracy": 0.6909547738693468, + "eval_loss": 0.613320529460907, + "eval_runtime": 23.6431, + "eval_samples_per_second": 16.834, + "eval_steps_per_second": 2.115, + "step": 393 + }, + { + "epoch": 2.0414507772020727, + "grad_norm": 5.65625, + "learning_rate": 2.9585492227979278e-05, + "loss": 0.5156, + "step": 394 + }, + { + "epoch": 2.0414507772020727, + "eval_accuracy": 0.6884422110552764, + "eval_loss": 0.6140075325965881, + "eval_runtime": 23.6593, + "eval_samples_per_second": 16.822, + "eval_steps_per_second": 2.113, + "step": 394 + }, + { + "epoch": 2.0466321243523318, + "grad_norm": 7.03125, + "learning_rate": 2.9533678756476683e-05, + "loss": 0.6328, + "step": 395 + }, + { + "epoch": 2.0466321243523318, + "eval_accuracy": 0.6859296482412061, + "eval_loss": 0.6133794188499451, + "eval_runtime": 23.6615, + "eval_samples_per_second": 16.821, + "eval_steps_per_second": 2.113, + "step": 395 + }, + { + "epoch": 2.051813471502591, + "grad_norm": 6.75, + "learning_rate": 2.94818652849741e-05, + "loss": 0.75, + "step": 396 + }, + { + "epoch": 2.051813471502591, + "eval_accuracy": 0.6934673366834171, + "eval_loss": 0.6141842007637024, + "eval_runtime": 23.6713, + "eval_samples_per_second": 16.814, + "eval_steps_per_second": 2.112, + "step": 396 + }, + { + "epoch": 2.05699481865285, + "grad_norm": 6.3125, + "learning_rate": 2.9430051813471504e-05, + "loss": 0.5898, + "step": 397 + }, + { + "epoch": 2.05699481865285, + "eval_accuracy": 0.6834170854271356, + "eval_loss": 0.6142234802246094, + "eval_runtime": 23.6574, + "eval_samples_per_second": 16.823, + "eval_steps_per_second": 2.114, + "step": 397 + }, + { + "epoch": 2.062176165803109, + "grad_norm": 5.71875, + "learning_rate": 2.9378238341968912e-05, + "loss": 0.5234, + "step": 398 + }, + { + "epoch": 2.062176165803109, + "eval_accuracy": 0.6834170854271356, + "eval_loss": 0.6141645908355713, + "eval_runtime": 23.6229, + "eval_samples_per_second": 16.848, + "eval_steps_per_second": 2.117, + "step": 398 + }, + { + "epoch": 2.0673575129533677, + "grad_norm": 5.1875, + "learning_rate": 2.9326424870466324e-05, + "loss": 0.4492, + "step": 399 + }, + { + "epoch": 2.0673575129533677, + "eval_accuracy": 0.6909547738693468, + "eval_loss": 0.6137327551841736, + "eval_runtime": 23.6005, + "eval_samples_per_second": 16.864, + "eval_steps_per_second": 2.119, + "step": 399 + }, + { + "epoch": 2.0725388601036268, + "grad_norm": 10.125, + "learning_rate": 2.9274611398963732e-05, + "loss": 1.1094, + "step": 400 + }, + { + "epoch": 2.0725388601036268, + "eval_accuracy": 0.6884422110552764, + "eval_loss": 0.6137327551841736, + "eval_runtime": 23.5951, + "eval_samples_per_second": 16.868, + "eval_steps_per_second": 2.119, + "step": 400 + }, + { + "epoch": 2.077720207253886, + "grad_norm": 7.09375, + "learning_rate": 2.9222797927461144e-05, + "loss": 0.6562, + "step": 401 + }, + { + "epoch": 2.077720207253886, + "eval_accuracy": 0.6934673366834171, + "eval_loss": 0.6141057014465332, + "eval_runtime": 23.4117, + "eval_samples_per_second": 17.0, + "eval_steps_per_second": 2.136, + "step": 401 + }, + { + "epoch": 2.082901554404145, + "grad_norm": 8.9375, + "learning_rate": 2.917098445595855e-05, + "loss": 0.9453, + "step": 402 + }, + { + "epoch": 2.082901554404145, + "eval_accuracy": 0.678391959798995, + "eval_loss": 0.6143019795417786, + "eval_runtime": 23.4458, + "eval_samples_per_second": 16.975, + "eval_steps_per_second": 2.133, + "step": 402 + }, + { + "epoch": 2.088082901554404, + "grad_norm": 6.53125, + "learning_rate": 2.911917098445596e-05, + "loss": 0.6367, + "step": 403 + }, + { + "epoch": 2.088082901554404, + "eval_accuracy": 0.6884422110552764, + "eval_loss": 0.61394864320755, + "eval_runtime": 23.6127, + "eval_samples_per_second": 16.855, + "eval_steps_per_second": 2.118, + "step": 403 + }, + { + "epoch": 2.093264248704663, + "grad_norm": 5.34375, + "learning_rate": 2.906735751295337e-05, + "loss": 0.4766, + "step": 404 + }, + { + "epoch": 2.093264248704663, + "eval_accuracy": 0.6884422110552764, + "eval_loss": 0.613241970539093, + "eval_runtime": 23.6078, + "eval_samples_per_second": 16.859, + "eval_steps_per_second": 2.118, + "step": 404 + }, + { + "epoch": 2.098445595854922, + "grad_norm": 6.5625, + "learning_rate": 2.9015544041450778e-05, + "loss": 0.6523, + "step": 405 + }, + { + "epoch": 2.098445595854922, + "eval_accuracy": 0.6859296482412061, + "eval_loss": 0.6137130856513977, + "eval_runtime": 23.6298, + "eval_samples_per_second": 16.843, + "eval_steps_per_second": 2.116, + "step": 405 + }, + { + "epoch": 2.1036269430051813, + "grad_norm": 5.53125, + "learning_rate": 2.896373056994819e-05, + "loss": 0.5117, + "step": 406 + }, + { + "epoch": 2.1036269430051813, + "eval_accuracy": 0.6859296482412061, + "eval_loss": 0.6134775280952454, + "eval_runtime": 23.6785, + "eval_samples_per_second": 16.808, + "eval_steps_per_second": 2.112, + "step": 406 + }, + { + "epoch": 2.1088082901554404, + "grad_norm": 8.1875, + "learning_rate": 2.8911917098445594e-05, + "loss": 0.8008, + "step": 407 + }, + { + "epoch": 2.1088082901554404, + "eval_accuracy": 0.6884422110552764, + "eval_loss": 0.6127709150314331, + "eval_runtime": 23.6527, + "eval_samples_per_second": 16.827, + "eval_steps_per_second": 2.114, + "step": 407 + }, + { + "epoch": 2.1139896373056994, + "grad_norm": 7.03125, + "learning_rate": 2.886010362694301e-05, + "loss": 0.6719, + "step": 408 + }, + { + "epoch": 2.1139896373056994, + "eval_accuracy": 0.6909547738693468, + "eval_loss": 0.6129671931266785, + "eval_runtime": 23.6697, + "eval_samples_per_second": 16.815, + "eval_steps_per_second": 2.112, + "step": 408 + }, + { + "epoch": 2.1191709844559585, + "grad_norm": 6.0625, + "learning_rate": 2.8808290155440415e-05, + "loss": 0.5781, + "step": 409 + }, + { + "epoch": 2.1191709844559585, + "eval_accuracy": 0.6859296482412061, + "eval_loss": 0.6129475235939026, + "eval_runtime": 23.6651, + "eval_samples_per_second": 16.818, + "eval_steps_per_second": 2.113, + "step": 409 + }, + { + "epoch": 2.1243523316062176, + "grad_norm": 5.90625, + "learning_rate": 2.8756476683937827e-05, + "loss": 0.5703, + "step": 410 + }, + { + "epoch": 2.1243523316062176, + "eval_accuracy": 0.678391959798995, + "eval_loss": 0.6136934757232666, + "eval_runtime": 23.7072, + "eval_samples_per_second": 16.788, + "eval_steps_per_second": 2.109, + "step": 410 + }, + { + "epoch": 2.1295336787564767, + "grad_norm": 8.0, + "learning_rate": 2.8704663212435235e-05, + "loss": 0.8086, + "step": 411 + }, + { + "epoch": 2.1295336787564767, + "eval_accuracy": 0.6884422110552764, + "eval_loss": 0.6131045818328857, + "eval_runtime": 23.669, + "eval_samples_per_second": 16.815, + "eval_steps_per_second": 2.112, + "step": 411 + }, + { + "epoch": 2.134715025906736, + "grad_norm": 5.9375, + "learning_rate": 2.865284974093264e-05, + "loss": 0.5898, + "step": 412 + }, + { + "epoch": 2.134715025906736, + "eval_accuracy": 0.6959798994974874, + "eval_loss": 0.6133401393890381, + "eval_runtime": 23.6642, + "eval_samples_per_second": 16.819, + "eval_steps_per_second": 2.113, + "step": 412 + }, + { + "epoch": 2.139896373056995, + "grad_norm": 15.0, + "learning_rate": 2.8601036269430055e-05, + "loss": 0.707, + "step": 413 + }, + { + "epoch": 2.139896373056995, + "eval_accuracy": 0.6834170854271356, + "eval_loss": 0.6132027506828308, + "eval_runtime": 23.6993, + "eval_samples_per_second": 16.794, + "eval_steps_per_second": 2.11, + "step": 413 + }, + { + "epoch": 2.145077720207254, + "grad_norm": 6.96875, + "learning_rate": 2.854922279792746e-05, + "loss": 0.7188, + "step": 414 + }, + { + "epoch": 2.145077720207254, + "eval_accuracy": 0.6934673366834171, + "eval_loss": 0.6131045818328857, + "eval_runtime": 23.5977, + "eval_samples_per_second": 16.866, + "eval_steps_per_second": 2.119, + "step": 414 + }, + { + "epoch": 2.150259067357513, + "grad_norm": 5.25, + "learning_rate": 2.8497409326424872e-05, + "loss": 0.4512, + "step": 415 + }, + { + "epoch": 2.150259067357513, + "eval_accuracy": 0.6834170854271356, + "eval_loss": 0.612712025642395, + "eval_runtime": 23.559, + "eval_samples_per_second": 16.894, + "eval_steps_per_second": 2.122, + "step": 415 + }, + { + "epoch": 2.155440414507772, + "grad_norm": 5.375, + "learning_rate": 2.844559585492228e-05, + "loss": 0.4629, + "step": 416 + }, + { + "epoch": 2.155440414507772, + "eval_accuracy": 0.6884422110552764, + "eval_loss": 0.6131241917610168, + "eval_runtime": 23.5082, + "eval_samples_per_second": 16.93, + "eval_steps_per_second": 2.127, + "step": 416 + }, + { + "epoch": 2.160621761658031, + "grad_norm": 7.34375, + "learning_rate": 2.8393782383419692e-05, + "loss": 0.7422, + "step": 417 + }, + { + "epoch": 2.160621761658031, + "eval_accuracy": 0.6809045226130653, + "eval_loss": 0.6125942468643188, + "eval_runtime": 23.5285, + "eval_samples_per_second": 16.916, + "eval_steps_per_second": 2.125, + "step": 417 + }, + { + "epoch": 2.1658031088082903, + "grad_norm": 7.5625, + "learning_rate": 2.83419689119171e-05, + "loss": 0.6289, + "step": 418 + }, + { + "epoch": 2.1658031088082903, + "eval_accuracy": 0.6909547738693468, + "eval_loss": 0.6123586893081665, + "eval_runtime": 23.4699, + "eval_samples_per_second": 16.958, + "eval_steps_per_second": 2.13, + "step": 418 + }, + { + "epoch": 2.1709844559585494, + "grad_norm": 8.4375, + "learning_rate": 2.8290155440414506e-05, + "loss": 0.875, + "step": 419 + }, + { + "epoch": 2.1709844559585494, + "eval_accuracy": 0.6884422110552764, + "eval_loss": 0.6130456924438477, + "eval_runtime": 23.5234, + "eval_samples_per_second": 16.919, + "eval_steps_per_second": 2.126, + "step": 419 + }, + { + "epoch": 2.1761658031088085, + "grad_norm": 6.375, + "learning_rate": 2.8238341968911917e-05, + "loss": 0.6797, + "step": 420 + }, + { + "epoch": 2.1761658031088085, + "eval_accuracy": 0.6859296482412061, + "eval_loss": 0.6128297448158264, + "eval_runtime": 23.5877, + "eval_samples_per_second": 16.873, + "eval_steps_per_second": 2.12, + "step": 420 + }, + { + "epoch": 2.1813471502590676, + "grad_norm": 6.71875, + "learning_rate": 2.8186528497409326e-05, + "loss": 0.5469, + "step": 421 + }, + { + "epoch": 2.1813471502590676, + "eval_accuracy": 0.6758793969849246, + "eval_loss": 0.6129671931266785, + "eval_runtime": 23.6256, + "eval_samples_per_second": 16.846, + "eval_steps_per_second": 2.116, + "step": 421 + }, + { + "epoch": 2.186528497409326, + "grad_norm": 5.34375, + "learning_rate": 2.8134715025906738e-05, + "loss": 0.4375, + "step": 422 + }, + { + "epoch": 2.186528497409326, + "eval_accuracy": 0.6859296482412061, + "eval_loss": 0.6131241917610168, + "eval_runtime": 23.6922, + "eval_samples_per_second": 16.799, + "eval_steps_per_second": 2.11, + "step": 422 + }, + { + "epoch": 2.1917098445595853, + "grad_norm": 5.21875, + "learning_rate": 2.8082901554404146e-05, + "loss": 0.3926, + "step": 423 + }, + { + "epoch": 2.1917098445595853, + "eval_accuracy": 0.6834170854271356, + "eval_loss": 0.6141449213027954, + "eval_runtime": 23.6535, + "eval_samples_per_second": 16.826, + "eval_steps_per_second": 2.114, + "step": 423 + }, + { + "epoch": 2.1968911917098444, + "grad_norm": 7.25, + "learning_rate": 2.8031088082901558e-05, + "loss": 0.6133, + "step": 424 + }, + { + "epoch": 2.1968911917098444, + "eval_accuracy": 0.6859296482412061, + "eval_loss": 0.613987922668457, + "eval_runtime": 23.7027, + "eval_samples_per_second": 16.791, + "eval_steps_per_second": 2.109, + "step": 424 + }, + { + "epoch": 2.2020725388601035, + "grad_norm": 5.59375, + "learning_rate": 2.7979274611398963e-05, + "loss": 0.4414, + "step": 425 + }, + { + "epoch": 2.2020725388601035, + "eval_accuracy": 0.6859296482412061, + "eval_loss": 0.6142823696136475, + "eval_runtime": 23.6497, + "eval_samples_per_second": 16.829, + "eval_steps_per_second": 2.114, + "step": 425 + }, + { + "epoch": 2.2072538860103625, + "grad_norm": 6.59375, + "learning_rate": 2.7927461139896378e-05, + "loss": 0.6914, + "step": 426 + }, + { + "epoch": 2.2072538860103625, + "eval_accuracy": 0.6859296482412061, + "eval_loss": 0.615303099155426, + "eval_runtime": 23.6517, + "eval_samples_per_second": 16.828, + "eval_steps_per_second": 2.114, + "step": 426 + }, + { + "epoch": 2.2124352331606216, + "grad_norm": 9.1875, + "learning_rate": 2.7875647668393783e-05, + "loss": 0.9727, + "step": 427 + }, + { + "epoch": 2.2124352331606216, + "eval_accuracy": 0.6884422110552764, + "eval_loss": 0.6153423190116882, + "eval_runtime": 23.6941, + "eval_samples_per_second": 16.797, + "eval_steps_per_second": 2.11, + "step": 427 + }, + { + "epoch": 2.2176165803108807, + "grad_norm": 8.875, + "learning_rate": 2.782383419689119e-05, + "loss": 0.7812, + "step": 428 + }, + { + "epoch": 2.2176165803108807, + "eval_accuracy": 0.6884422110552764, + "eval_loss": 0.6156367659568787, + "eval_runtime": 23.6612, + "eval_samples_per_second": 16.821, + "eval_steps_per_second": 2.113, + "step": 428 + }, + { + "epoch": 2.22279792746114, + "grad_norm": 6.0, + "learning_rate": 2.7772020725388603e-05, + "loss": 0.5859, + "step": 429 + }, + { + "epoch": 2.22279792746114, + "eval_accuracy": 0.6859296482412061, + "eval_loss": 0.615911602973938, + "eval_runtime": 23.6248, + "eval_samples_per_second": 16.847, + "eval_steps_per_second": 2.116, + "step": 429 + }, + { + "epoch": 2.227979274611399, + "grad_norm": 5.625, + "learning_rate": 2.7720207253886012e-05, + "loss": 0.418, + "step": 430 + }, + { + "epoch": 2.227979274611399, + "eval_accuracy": 0.6934673366834171, + "eval_loss": 0.6161274909973145, + "eval_runtime": 23.594, + "eval_samples_per_second": 16.869, + "eval_steps_per_second": 2.119, + "step": 430 + }, + { + "epoch": 2.233160621761658, + "grad_norm": 6.59375, + "learning_rate": 2.7668393782383424e-05, + "loss": 0.6719, + "step": 431 + }, + { + "epoch": 2.233160621761658, + "eval_accuracy": 0.6884422110552764, + "eval_loss": 0.6172659993171692, + "eval_runtime": 23.6408, + "eval_samples_per_second": 16.835, + "eval_steps_per_second": 2.115, + "step": 431 + }, + { + "epoch": 2.238341968911917, + "grad_norm": 8.3125, + "learning_rate": 2.761658031088083e-05, + "loss": 0.9141, + "step": 432 + }, + { + "epoch": 2.238341968911917, + "eval_accuracy": 0.6884422110552764, + "eval_loss": 0.6166771650314331, + "eval_runtime": 23.6749, + "eval_samples_per_second": 16.811, + "eval_steps_per_second": 2.112, + "step": 432 + }, + { + "epoch": 2.243523316062176, + "grad_norm": 7.875, + "learning_rate": 2.756476683937824e-05, + "loss": 0.8945, + "step": 433 + }, + { + "epoch": 2.243523316062176, + "eval_accuracy": 0.6909547738693468, + "eval_loss": 0.6168930530548096, + "eval_runtime": 23.6462, + "eval_samples_per_second": 16.831, + "eval_steps_per_second": 2.115, + "step": 433 + }, + { + "epoch": 2.2487046632124352, + "grad_norm": 4.53125, + "learning_rate": 2.751295336787565e-05, + "loss": 0.3496, + "step": 434 + }, + { + "epoch": 2.2487046632124352, + "eval_accuracy": 0.7010050251256281, + "eval_loss": 0.6169126629829407, + "eval_runtime": 23.7151, + "eval_samples_per_second": 16.783, + "eval_steps_per_second": 2.108, + "step": 434 + }, + { + "epoch": 2.2538860103626943, + "grad_norm": 5.78125, + "learning_rate": 2.7461139896373057e-05, + "loss": 0.5039, + "step": 435 + }, + { + "epoch": 2.2538860103626943, + "eval_accuracy": 0.6934673366834171, + "eval_loss": 0.6164416074752808, + "eval_runtime": 23.6605, + "eval_samples_per_second": 16.821, + "eval_steps_per_second": 2.113, + "step": 435 + }, + { + "epoch": 2.2590673575129534, + "grad_norm": 8.0625, + "learning_rate": 2.740932642487047e-05, + "loss": 0.6914, + "step": 436 + }, + { + "epoch": 2.2590673575129534, + "eval_accuracy": 0.6984924623115578, + "eval_loss": 0.6163238286972046, + "eval_runtime": 23.6888, + "eval_samples_per_second": 16.801, + "eval_steps_per_second": 2.111, + "step": 436 + }, + { + "epoch": 2.2642487046632125, + "grad_norm": 5.34375, + "learning_rate": 2.7357512953367874e-05, + "loss": 0.291, + "step": 437 + }, + { + "epoch": 2.2642487046632125, + "eval_accuracy": 0.6959798994974874, + "eval_loss": 0.6165593862533569, + "eval_runtime": 23.6955, + "eval_samples_per_second": 16.796, + "eval_steps_per_second": 2.11, + "step": 437 + }, + { + "epoch": 2.2694300518134716, + "grad_norm": 7.9375, + "learning_rate": 2.730569948186529e-05, + "loss": 0.7695, + "step": 438 + }, + { + "epoch": 2.2694300518134716, + "eval_accuracy": 0.6984924623115578, + "eval_loss": 0.6155386567115784, + "eval_runtime": 23.6616, + "eval_samples_per_second": 16.821, + "eval_steps_per_second": 2.113, + "step": 438 + }, + { + "epoch": 2.2746113989637307, + "grad_norm": 7.625, + "learning_rate": 2.7253886010362694e-05, + "loss": 0.6719, + "step": 439 + }, + { + "epoch": 2.2746113989637307, + "eval_accuracy": 0.6959798994974874, + "eval_loss": 0.6152442097663879, + "eval_runtime": 23.7037, + "eval_samples_per_second": 16.791, + "eval_steps_per_second": 2.109, + "step": 439 + }, + { + "epoch": 2.2797927461139897, + "grad_norm": 5.3125, + "learning_rate": 2.7202072538860106e-05, + "loss": 0.459, + "step": 440 + }, + { + "epoch": 2.2797927461139897, + "eval_accuracy": 0.6909547738693468, + "eval_loss": 0.6154797673225403, + "eval_runtime": 23.6706, + "eval_samples_per_second": 16.814, + "eval_steps_per_second": 2.112, + "step": 440 + }, + { + "epoch": 2.284974093264249, + "grad_norm": 11.125, + "learning_rate": 2.7150259067357515e-05, + "loss": 0.6758, + "step": 441 + }, + { + "epoch": 2.284974093264249, + "eval_accuracy": 0.6934673366834171, + "eval_loss": 0.6149497628211975, + "eval_runtime": 23.6013, + "eval_samples_per_second": 16.863, + "eval_steps_per_second": 2.119, + "step": 441 + }, + { + "epoch": 2.290155440414508, + "grad_norm": 4.03125, + "learning_rate": 2.709844559585492e-05, + "loss": 0.3066, + "step": 442 + }, + { + "epoch": 2.290155440414508, + "eval_accuracy": 0.6909547738693468, + "eval_loss": 0.6150086522102356, + "eval_runtime": 23.5729, + "eval_samples_per_second": 16.884, + "eval_steps_per_second": 2.121, + "step": 442 + }, + { + "epoch": 2.295336787564767, + "grad_norm": 4.96875, + "learning_rate": 2.7046632124352335e-05, + "loss": 0.416, + "step": 443 + }, + { + "epoch": 2.295336787564767, + "eval_accuracy": 0.6909547738693468, + "eval_loss": 0.6153815984725952, + "eval_runtime": 23.5942, + "eval_samples_per_second": 16.869, + "eval_steps_per_second": 2.119, + "step": 443 + }, + { + "epoch": 2.300518134715026, + "grad_norm": 6.4375, + "learning_rate": 2.699481865284974e-05, + "loss": 0.4922, + "step": 444 + }, + { + "epoch": 2.300518134715026, + "eval_accuracy": 0.6934673366834171, + "eval_loss": 0.615303099155426, + "eval_runtime": 23.5706, + "eval_samples_per_second": 16.885, + "eval_steps_per_second": 2.121, + "step": 444 + }, + { + "epoch": 2.305699481865285, + "grad_norm": 7.53125, + "learning_rate": 2.694300518134715e-05, + "loss": 0.6484, + "step": 445 + }, + { + "epoch": 2.305699481865285, + "eval_accuracy": 0.6934673366834171, + "eval_loss": 0.6155189871788025, + "eval_runtime": 23.6119, + "eval_samples_per_second": 16.856, + "eval_steps_per_second": 2.118, + "step": 445 + }, + { + "epoch": 2.3108808290155443, + "grad_norm": 5.5625, + "learning_rate": 2.689119170984456e-05, + "loss": 0.3887, + "step": 446 + }, + { + "epoch": 2.3108808290155443, + "eval_accuracy": 0.6909547738693468, + "eval_loss": 0.6149889826774597, + "eval_runtime": 23.6401, + "eval_samples_per_second": 16.836, + "eval_steps_per_second": 2.115, + "step": 446 + }, + { + "epoch": 2.3160621761658033, + "grad_norm": 6.53125, + "learning_rate": 2.6839378238341972e-05, + "loss": 0.7031, + "step": 447 + }, + { + "epoch": 2.3160621761658033, + "eval_accuracy": 0.6934673366834171, + "eval_loss": 0.6156564354896545, + "eval_runtime": 23.6403, + "eval_samples_per_second": 16.836, + "eval_steps_per_second": 2.115, + "step": 447 + }, + { + "epoch": 2.321243523316062, + "grad_norm": 8.75, + "learning_rate": 2.678756476683938e-05, + "loss": 0.9141, + "step": 448 + }, + { + "epoch": 2.321243523316062, + "eval_accuracy": 0.6884422110552764, + "eval_loss": 0.6155189871788025, + "eval_runtime": 23.6439, + "eval_samples_per_second": 16.833, + "eval_steps_per_second": 2.115, + "step": 448 + }, + { + "epoch": 2.326424870466321, + "grad_norm": 5.21875, + "learning_rate": 2.6735751295336785e-05, + "loss": 0.4551, + "step": 449 + }, + { + "epoch": 2.326424870466321, + "eval_accuracy": 0.6909547738693468, + "eval_loss": 0.6157153248786926, + "eval_runtime": 23.6475, + "eval_samples_per_second": 16.831, + "eval_steps_per_second": 2.114, + "step": 449 + }, + { + "epoch": 2.33160621761658, + "grad_norm": 7.0625, + "learning_rate": 2.6683937823834197e-05, + "loss": 0.5938, + "step": 450 + }, + { + "epoch": 2.33160621761658, + "eval_accuracy": 0.6884422110552764, + "eval_loss": 0.6153619885444641, + "eval_runtime": 23.6952, + "eval_samples_per_second": 16.797, + "eval_steps_per_second": 2.11, + "step": 450 + }, + { + "epoch": 2.3367875647668392, + "grad_norm": 5.34375, + "learning_rate": 2.6632124352331606e-05, + "loss": 0.459, + "step": 451 + }, + { + "epoch": 2.3367875647668392, + "eval_accuracy": 0.6934673366834171, + "eval_loss": 0.6160686016082764, + "eval_runtime": 23.5818, + "eval_samples_per_second": 16.877, + "eval_steps_per_second": 2.12, + "step": 451 + }, + { + "epoch": 2.3419689119170983, + "grad_norm": 7.0625, + "learning_rate": 2.6580310880829017e-05, + "loss": 0.7734, + "step": 452 + }, + { + "epoch": 2.3419689119170983, + "eval_accuracy": 0.6834170854271356, + "eval_loss": 0.6156367659568787, + "eval_runtime": 23.4756, + "eval_samples_per_second": 16.954, + "eval_steps_per_second": 2.13, + "step": 452 + }, + { + "epoch": 2.3471502590673574, + "grad_norm": 6.28125, + "learning_rate": 2.6528497409326426e-05, + "loss": 0.582, + "step": 453 + }, + { + "epoch": 2.3471502590673574, + "eval_accuracy": 0.6934673366834171, + "eval_loss": 0.6155189871788025, + "eval_runtime": 23.5994, + "eval_samples_per_second": 16.865, + "eval_steps_per_second": 2.119, + "step": 453 + }, + { + "epoch": 2.3523316062176165, + "grad_norm": 6.59375, + "learning_rate": 2.6476683937823838e-05, + "loss": 0.4375, + "step": 454 + }, + { + "epoch": 2.3523316062176165, + "eval_accuracy": 0.6909547738693468, + "eval_loss": 0.6167556643486023, + "eval_runtime": 23.5968, + "eval_samples_per_second": 16.867, + "eval_steps_per_second": 2.119, + "step": 454 + }, + { + "epoch": 2.3575129533678756, + "grad_norm": 4.71875, + "learning_rate": 2.6424870466321246e-05, + "loss": 0.3984, + "step": 455 + }, + { + "epoch": 2.3575129533678756, + "eval_accuracy": 0.6934673366834171, + "eval_loss": 0.6162452697753906, + "eval_runtime": 23.6692, + "eval_samples_per_second": 16.815, + "eval_steps_per_second": 2.112, + "step": 455 + }, + { + "epoch": 2.3626943005181347, + "grad_norm": 5.46875, + "learning_rate": 2.6373056994818658e-05, + "loss": 0.4414, + "step": 456 + }, + { + "epoch": 2.3626943005181347, + "eval_accuracy": 0.6934673366834171, + "eval_loss": 0.6175015568733215, + "eval_runtime": 23.6275, + "eval_samples_per_second": 16.845, + "eval_steps_per_second": 2.116, + "step": 456 + }, + { + "epoch": 2.3678756476683938, + "grad_norm": 7.03125, + "learning_rate": 2.6321243523316063e-05, + "loss": 0.6875, + "step": 457 + }, + { + "epoch": 2.3678756476683938, + "eval_accuracy": 0.6959798994974874, + "eval_loss": 0.6173052787780762, + "eval_runtime": 23.6347, + "eval_samples_per_second": 16.84, + "eval_steps_per_second": 2.116, + "step": 457 + }, + { + "epoch": 2.373056994818653, + "grad_norm": 8.25, + "learning_rate": 2.626943005181347e-05, + "loss": 0.5352, + "step": 458 + }, + { + "epoch": 2.373056994818653, + "eval_accuracy": 0.6909547738693468, + "eval_loss": 0.6184830665588379, + "eval_runtime": 23.6762, + "eval_samples_per_second": 16.81, + "eval_steps_per_second": 2.112, + "step": 458 + }, + { + "epoch": 2.378238341968912, + "grad_norm": 7.28125, + "learning_rate": 2.6217616580310883e-05, + "loss": 0.3828, + "step": 459 + }, + { + "epoch": 2.378238341968912, + "eval_accuracy": 0.6909547738693468, + "eval_loss": 0.6195822954177856, + "eval_runtime": 23.6814, + "eval_samples_per_second": 16.806, + "eval_steps_per_second": 2.111, + "step": 459 + }, + { + "epoch": 2.383419689119171, + "grad_norm": 6.40625, + "learning_rate": 2.616580310880829e-05, + "loss": 0.6016, + "step": 460 + }, + { + "epoch": 2.383419689119171, + "eval_accuracy": 0.6809045226130653, + "eval_loss": 0.6200337409973145, + "eval_runtime": 23.642, + "eval_samples_per_second": 16.834, + "eval_steps_per_second": 2.115, + "step": 460 + }, + { + "epoch": 2.38860103626943, + "grad_norm": 11.25, + "learning_rate": 2.6113989637305703e-05, + "loss": 1.0391, + "step": 461 + }, + { + "epoch": 2.38860103626943, + "eval_accuracy": 0.6909547738693468, + "eval_loss": 0.6205441355705261, + "eval_runtime": 23.642, + "eval_samples_per_second": 16.834, + "eval_steps_per_second": 2.115, + "step": 461 + }, + { + "epoch": 2.393782383419689, + "grad_norm": 5.1875, + "learning_rate": 2.6062176165803108e-05, + "loss": 0.3926, + "step": 462 + }, + { + "epoch": 2.393782383419689, + "eval_accuracy": 0.6859296482412061, + "eval_loss": 0.6216040849685669, + "eval_runtime": 23.6459, + "eval_samples_per_second": 16.832, + "eval_steps_per_second": 2.115, + "step": 462 + }, + { + "epoch": 2.3989637305699483, + "grad_norm": 8.6875, + "learning_rate": 2.6010362694300523e-05, + "loss": 0.8789, + "step": 463 + }, + { + "epoch": 2.3989637305699483, + "eval_accuracy": 0.6859296482412061, + "eval_loss": 0.6211133599281311, + "eval_runtime": 23.6853, + "eval_samples_per_second": 16.804, + "eval_steps_per_second": 2.111, + "step": 463 + }, + { + "epoch": 2.4041450777202074, + "grad_norm": 7.84375, + "learning_rate": 2.595854922279793e-05, + "loss": 0.6875, + "step": 464 + }, + { + "epoch": 2.4041450777202074, + "eval_accuracy": 0.6884422110552764, + "eval_loss": 0.6215452551841736, + "eval_runtime": 23.6578, + "eval_samples_per_second": 16.823, + "eval_steps_per_second": 2.113, + "step": 464 + }, + { + "epoch": 2.4093264248704664, + "grad_norm": 7.65625, + "learning_rate": 2.5906735751295337e-05, + "loss": 0.6562, + "step": 465 + }, + { + "epoch": 2.4093264248704664, + "eval_accuracy": 0.6934673366834171, + "eval_loss": 0.6214274764060974, + "eval_runtime": 23.6459, + "eval_samples_per_second": 16.832, + "eval_steps_per_second": 2.115, + "step": 465 + }, + { + "epoch": 2.4145077720207255, + "grad_norm": 5.78125, + "learning_rate": 2.585492227979275e-05, + "loss": 0.5273, + "step": 466 + }, + { + "epoch": 2.4145077720207255, + "eval_accuracy": 0.6934673366834171, + "eval_loss": 0.6225267052650452, + "eval_runtime": 23.6574, + "eval_samples_per_second": 16.823, + "eval_steps_per_second": 2.114, + "step": 466 + }, + { + "epoch": 2.4196891191709846, + "grad_norm": 4.625, + "learning_rate": 2.5803108808290154e-05, + "loss": 0.3086, + "step": 467 + }, + { + "epoch": 2.4196891191709846, + "eval_accuracy": 0.6909547738693468, + "eval_loss": 0.6228014826774597, + "eval_runtime": 23.6561, + "eval_samples_per_second": 16.824, + "eval_steps_per_second": 2.114, + "step": 467 + }, + { + "epoch": 2.4248704663212437, + "grad_norm": 8.375, + "learning_rate": 2.575129533678757e-05, + "loss": 0.5938, + "step": 468 + }, + { + "epoch": 2.4248704663212437, + "eval_accuracy": 0.6834170854271356, + "eval_loss": 0.6235081553459167, + "eval_runtime": 23.661, + "eval_samples_per_second": 16.821, + "eval_steps_per_second": 2.113, + "step": 468 + }, + { + "epoch": 2.4300518134715023, + "grad_norm": 6.3125, + "learning_rate": 2.5699481865284974e-05, + "loss": 0.5664, + "step": 469 + }, + { + "epoch": 2.4300518134715023, + "eval_accuracy": 0.6959798994974874, + "eval_loss": 0.6238026022911072, + "eval_runtime": 23.6431, + "eval_samples_per_second": 16.834, + "eval_steps_per_second": 2.115, + "step": 469 + }, + { + "epoch": 2.4352331606217614, + "grad_norm": 12.75, + "learning_rate": 2.5647668393782386e-05, + "loss": 1.2422, + "step": 470 + }, + { + "epoch": 2.4352331606217614, + "eval_accuracy": 0.6859296482412061, + "eval_loss": 0.6233314871788025, + "eval_runtime": 23.6077, + "eval_samples_per_second": 16.859, + "eval_steps_per_second": 2.118, + "step": 470 + }, + { + "epoch": 2.4404145077720205, + "grad_norm": 4.46875, + "learning_rate": 2.5595854922279794e-05, + "loss": 0.3633, + "step": 471 + }, + { + "epoch": 2.4404145077720205, + "eval_accuracy": 0.6859296482412061, + "eval_loss": 0.6233511567115784, + "eval_runtime": 23.5901, + "eval_samples_per_second": 16.871, + "eval_steps_per_second": 2.12, + "step": 471 + }, + { + "epoch": 2.4455958549222796, + "grad_norm": 6.0625, + "learning_rate": 2.55440414507772e-05, + "loss": 0.5625, + "step": 472 + }, + { + "epoch": 2.4455958549222796, + "eval_accuracy": 0.6859296482412061, + "eval_loss": 0.6231548190116882, + "eval_runtime": 23.6198, + "eval_samples_per_second": 16.85, + "eval_steps_per_second": 2.117, + "step": 472 + }, + { + "epoch": 2.4507772020725387, + "grad_norm": 5.59375, + "learning_rate": 2.5492227979274614e-05, + "loss": 0.5039, + "step": 473 + }, + { + "epoch": 2.4507772020725387, + "eval_accuracy": 0.6809045226130653, + "eval_loss": 0.6226837038993835, + "eval_runtime": 23.5729, + "eval_samples_per_second": 16.884, + "eval_steps_per_second": 2.121, + "step": 473 + }, + { + "epoch": 2.4559585492227978, + "grad_norm": 5.125, + "learning_rate": 2.544041450777202e-05, + "loss": 0.4902, + "step": 474 + }, + { + "epoch": 2.4559585492227978, + "eval_accuracy": 0.6884422110552764, + "eval_loss": 0.6227229833602905, + "eval_runtime": 23.6381, + "eval_samples_per_second": 16.837, + "eval_steps_per_second": 2.115, + "step": 474 + }, + { + "epoch": 2.461139896373057, + "grad_norm": 17.25, + "learning_rate": 2.538860103626943e-05, + "loss": 0.7734, + "step": 475 + }, + { + "epoch": 2.461139896373057, + "eval_accuracy": 0.6884422110552764, + "eval_loss": 0.6224285364151001, + "eval_runtime": 23.6357, + "eval_samples_per_second": 16.839, + "eval_steps_per_second": 2.115, + "step": 475 + }, + { + "epoch": 2.466321243523316, + "grad_norm": 6.375, + "learning_rate": 2.533678756476684e-05, + "loss": 0.5078, + "step": 476 + }, + { + "epoch": 2.466321243523316, + "eval_accuracy": 0.6909547738693468, + "eval_loss": 0.6228603720664978, + "eval_runtime": 23.6522, + "eval_samples_per_second": 16.827, + "eval_steps_per_second": 2.114, + "step": 476 + }, + { + "epoch": 2.471502590673575, + "grad_norm": 12.625, + "learning_rate": 2.528497409326425e-05, + "loss": 1.3906, + "step": 477 + }, + { + "epoch": 2.471502590673575, + "eval_accuracy": 0.6834170854271356, + "eval_loss": 0.6220163106918335, + "eval_runtime": 23.6843, + "eval_samples_per_second": 16.804, + "eval_steps_per_second": 2.111, + "step": 477 + }, + { + "epoch": 2.476683937823834, + "grad_norm": 6.1875, + "learning_rate": 2.523316062176166e-05, + "loss": 0.6484, + "step": 478 + }, + { + "epoch": 2.476683937823834, + "eval_accuracy": 0.6834170854271356, + "eval_loss": 0.6219574213027954, + "eval_runtime": 23.6501, + "eval_samples_per_second": 16.829, + "eval_steps_per_second": 2.114, + "step": 478 + }, + { + "epoch": 2.481865284974093, + "grad_norm": 4.5, + "learning_rate": 2.5181347150259065e-05, + "loss": 0.2891, + "step": 479 + }, + { + "epoch": 2.481865284974093, + "eval_accuracy": 0.678391959798995, + "eval_loss": 0.6220163106918335, + "eval_runtime": 23.7076, + "eval_samples_per_second": 16.788, + "eval_steps_per_second": 2.109, + "step": 479 + }, + { + "epoch": 2.4870466321243523, + "grad_norm": 5.375, + "learning_rate": 2.5129533678756477e-05, + "loss": 0.3711, + "step": 480 + }, + { + "epoch": 2.4870466321243523, + "eval_accuracy": 0.6809045226130653, + "eval_loss": 0.6223304271697998, + "eval_runtime": 23.6726, + "eval_samples_per_second": 16.813, + "eval_steps_per_second": 2.112, + "step": 480 + }, + { + "epoch": 2.4922279792746114, + "grad_norm": 5.1875, + "learning_rate": 2.5077720207253885e-05, + "loss": 0.3574, + "step": 481 + }, + { + "epoch": 2.4922279792746114, + "eval_accuracy": 0.6884422110552764, + "eval_loss": 0.6226444840431213, + "eval_runtime": 23.7252, + "eval_samples_per_second": 16.775, + "eval_steps_per_second": 2.107, + "step": 481 + }, + { + "epoch": 2.4974093264248705, + "grad_norm": 4.6875, + "learning_rate": 2.5025906735751297e-05, + "loss": 0.3105, + "step": 482 + }, + { + "epoch": 2.4974093264248705, + "eval_accuracy": 0.6809045226130653, + "eval_loss": 0.6235081553459167, + "eval_runtime": 23.6683, + "eval_samples_per_second": 16.816, + "eval_steps_per_second": 2.113, + "step": 482 + }, + { + "epoch": 2.5025906735751295, + "grad_norm": 6.0625, + "learning_rate": 2.4974093264248705e-05, + "loss": 0.5156, + "step": 483 + }, + { + "epoch": 2.5025906735751295, + "eval_accuracy": 0.6859296482412061, + "eval_loss": 0.623724102973938, + "eval_runtime": 23.6412, + "eval_samples_per_second": 16.835, + "eval_steps_per_second": 2.115, + "step": 483 + }, + { + "epoch": 2.5077720207253886, + "grad_norm": 6.84375, + "learning_rate": 2.4922279792746114e-05, + "loss": 0.5586, + "step": 484 + }, + { + "epoch": 2.5077720207253886, + "eval_accuracy": 0.6809045226130653, + "eval_loss": 0.6236259341239929, + "eval_runtime": 23.6004, + "eval_samples_per_second": 16.864, + "eval_steps_per_second": 2.119, + "step": 484 + }, + { + "epoch": 2.5129533678756477, + "grad_norm": 10.1875, + "learning_rate": 2.4870466321243526e-05, + "loss": 0.9883, + "step": 485 + }, + { + "epoch": 2.5129533678756477, + "eval_accuracy": 0.6934673366834171, + "eval_loss": 0.6235866546630859, + "eval_runtime": 23.5749, + "eval_samples_per_second": 16.882, + "eval_steps_per_second": 2.121, + "step": 485 + }, + { + "epoch": 2.518134715025907, + "grad_norm": 8.125, + "learning_rate": 2.4818652849740934e-05, + "loss": 0.7578, + "step": 486 + }, + { + "epoch": 2.518134715025907, + "eval_accuracy": 0.6834170854271356, + "eval_loss": 0.6242148280143738, + "eval_runtime": 23.597, + "eval_samples_per_second": 16.867, + "eval_steps_per_second": 2.119, + "step": 486 + }, + { + "epoch": 2.523316062176166, + "grad_norm": 6.34375, + "learning_rate": 2.4766839378238342e-05, + "loss": 0.4551, + "step": 487 + }, + { + "epoch": 2.523316062176166, + "eval_accuracy": 0.678391959798995, + "eval_loss": 0.6239203810691833, + "eval_runtime": 23.5796, + "eval_samples_per_second": 16.879, + "eval_steps_per_second": 2.12, + "step": 487 + }, + { + "epoch": 2.528497409326425, + "grad_norm": 7.125, + "learning_rate": 2.4715025906735754e-05, + "loss": 0.6172, + "step": 488 + }, + { + "epoch": 2.528497409326425, + "eval_accuracy": 0.6809045226130653, + "eval_loss": 0.6234492659568787, + "eval_runtime": 23.5288, + "eval_samples_per_second": 16.915, + "eval_steps_per_second": 2.125, + "step": 488 + }, + { + "epoch": 2.533678756476684, + "grad_norm": 6.25, + "learning_rate": 2.4663212435233163e-05, + "loss": 0.5078, + "step": 489 + }, + { + "epoch": 2.533678756476684, + "eval_accuracy": 0.6834170854271356, + "eval_loss": 0.6235866546630859, + "eval_runtime": 23.5174, + "eval_samples_per_second": 16.924, + "eval_steps_per_second": 2.126, + "step": 489 + }, + { + "epoch": 2.538860103626943, + "grad_norm": 9.8125, + "learning_rate": 2.461139896373057e-05, + "loss": 1.0547, + "step": 490 + }, + { + "epoch": 2.538860103626943, + "eval_accuracy": 0.6934673366834171, + "eval_loss": 0.6228014826774597, + "eval_runtime": 23.5388, + "eval_samples_per_second": 16.908, + "eval_steps_per_second": 2.124, + "step": 490 + }, + { + "epoch": 2.5440414507772022, + "grad_norm": 7.1875, + "learning_rate": 2.455958549222798e-05, + "loss": 0.707, + "step": 491 + }, + { + "epoch": 2.5440414507772022, + "eval_accuracy": 0.6834170854271356, + "eval_loss": 0.6229389309883118, + "eval_runtime": 23.5964, + "eval_samples_per_second": 16.867, + "eval_steps_per_second": 2.119, + "step": 491 + }, + { + "epoch": 2.5492227979274613, + "grad_norm": 7.21875, + "learning_rate": 2.4507772020725388e-05, + "loss": 0.4336, + "step": 492 + }, + { + "epoch": 2.5492227979274613, + "eval_accuracy": 0.6834170854271356, + "eval_loss": 0.6223500370979309, + "eval_runtime": 23.6238, + "eval_samples_per_second": 16.847, + "eval_steps_per_second": 2.117, + "step": 492 + }, + { + "epoch": 2.5544041450777204, + "grad_norm": 9.125, + "learning_rate": 2.44559585492228e-05, + "loss": 0.7031, + "step": 493 + }, + { + "epoch": 2.5544041450777204, + "eval_accuracy": 0.6959798994974874, + "eval_loss": 0.6215648651123047, + "eval_runtime": 23.6314, + "eval_samples_per_second": 16.842, + "eval_steps_per_second": 2.116, + "step": 493 + }, + { + "epoch": 2.5595854922279795, + "grad_norm": 4.28125, + "learning_rate": 2.4404145077720208e-05, + "loss": 0.3223, + "step": 494 + }, + { + "epoch": 2.5595854922279795, + "eval_accuracy": 0.6809045226130653, + "eval_loss": 0.6223107576370239, + "eval_runtime": 23.6819, + "eval_samples_per_second": 16.806, + "eval_steps_per_second": 2.111, + "step": 494 + }, + { + "epoch": 2.5647668393782386, + "grad_norm": 5.25, + "learning_rate": 2.4352331606217617e-05, + "loss": 0.4219, + "step": 495 + }, + { + "epoch": 2.5647668393782386, + "eval_accuracy": 0.6884422110552764, + "eval_loss": 0.6218985319137573, + "eval_runtime": 23.6427, + "eval_samples_per_second": 16.834, + "eval_steps_per_second": 2.115, + "step": 495 + }, + { + "epoch": 2.5699481865284977, + "grad_norm": 9.5625, + "learning_rate": 2.430051813471503e-05, + "loss": 0.6406, + "step": 496 + }, + { + "epoch": 2.5699481865284977, + "eval_accuracy": 0.6758793969849246, + "eval_loss": 0.6226052045822144, + "eval_runtime": 23.6921, + "eval_samples_per_second": 16.799, + "eval_steps_per_second": 2.11, + "step": 496 + }, + { + "epoch": 2.5751295336787567, + "grad_norm": 4.375, + "learning_rate": 2.4248704663212437e-05, + "loss": 0.334, + "step": 497 + }, + { + "epoch": 2.5751295336787567, + "eval_accuracy": 0.6733668341708543, + "eval_loss": 0.6228603720664978, + "eval_runtime": 23.6469, + "eval_samples_per_second": 16.831, + "eval_steps_per_second": 2.114, + "step": 497 + }, + { + "epoch": 2.5803108808290154, + "grad_norm": 9.875, + "learning_rate": 2.4196891191709845e-05, + "loss": 0.5664, + "step": 498 + }, + { + "epoch": 2.5803108808290154, + "eval_accuracy": 0.6809045226130653, + "eval_loss": 0.6235081553459167, + "eval_runtime": 23.6392, + "eval_samples_per_second": 16.836, + "eval_steps_per_second": 2.115, + "step": 498 + }, + { + "epoch": 2.5854922279792745, + "grad_norm": 6.90625, + "learning_rate": 2.4145077720207254e-05, + "loss": 0.625, + "step": 499 + }, + { + "epoch": 2.5854922279792745, + "eval_accuracy": 0.6758793969849246, + "eval_loss": 0.6236259341239929, + "eval_runtime": 23.6559, + "eval_samples_per_second": 16.825, + "eval_steps_per_second": 2.114, + "step": 499 + }, + { + "epoch": 2.5906735751295336, + "grad_norm": 6.46875, + "learning_rate": 2.4093264248704665e-05, + "loss": 0.5039, + "step": 500 + }, + { + "epoch": 2.5906735751295336, + "eval_accuracy": 0.6809045226130653, + "eval_loss": 0.6237829923629761, + "eval_runtime": 23.6943, + "eval_samples_per_second": 16.797, + "eval_steps_per_second": 2.11, + "step": 500 + }, + { + "epoch": 2.5958549222797926, + "grad_norm": 8.125, + "learning_rate": 2.4041450777202074e-05, + "loss": 0.7188, + "step": 501 + }, + { + "epoch": 2.5958549222797926, + "eval_accuracy": 0.6809045226130653, + "eval_loss": 0.624391496181488, + "eval_runtime": 23.6253, + "eval_samples_per_second": 16.846, + "eval_steps_per_second": 2.116, + "step": 501 + }, + { + "epoch": 2.6010362694300517, + "grad_norm": 3.546875, + "learning_rate": 2.3989637305699482e-05, + "loss": 0.2178, + "step": 502 + }, + { + "epoch": 2.6010362694300517, + "eval_accuracy": 0.6809045226130653, + "eval_loss": 0.6249018311500549, + "eval_runtime": 23.4696, + "eval_samples_per_second": 16.958, + "eval_steps_per_second": 2.13, + "step": 502 + }, + { + "epoch": 2.606217616580311, + "grad_norm": 3.546875, + "learning_rate": 2.3937823834196894e-05, + "loss": 0.2266, + "step": 503 + }, + { + "epoch": 2.606217616580311, + "eval_accuracy": 0.678391959798995, + "eval_loss": 0.6256281137466431, + "eval_runtime": 23.5606, + "eval_samples_per_second": 16.893, + "eval_steps_per_second": 2.122, + "step": 503 + }, + { + "epoch": 2.61139896373057, + "grad_norm": 7.53125, + "learning_rate": 2.3886010362694303e-05, + "loss": 0.6953, + "step": 504 + }, + { + "epoch": 2.61139896373057, + "eval_accuracy": 0.6834170854271356, + "eval_loss": 0.6266881227493286, + "eval_runtime": 23.6034, + "eval_samples_per_second": 16.862, + "eval_steps_per_second": 2.118, + "step": 504 + }, + { + "epoch": 2.616580310880829, + "grad_norm": 5.8125, + "learning_rate": 2.383419689119171e-05, + "loss": 0.3672, + "step": 505 + }, + { + "epoch": 2.616580310880829, + "eval_accuracy": 0.6758793969849246, + "eval_loss": 0.6284547448158264, + "eval_runtime": 23.6812, + "eval_samples_per_second": 16.807, + "eval_steps_per_second": 2.111, + "step": 505 + }, + { + "epoch": 2.621761658031088, + "grad_norm": 6.25, + "learning_rate": 2.378238341968912e-05, + "loss": 0.4805, + "step": 506 + }, + { + "epoch": 2.621761658031088, + "eval_accuracy": 0.6809045226130653, + "eval_loss": 0.6300643682479858, + "eval_runtime": 23.6502, + "eval_samples_per_second": 16.829, + "eval_steps_per_second": 2.114, + "step": 506 + }, + { + "epoch": 2.626943005181347, + "grad_norm": 9.25, + "learning_rate": 2.3730569948186528e-05, + "loss": 1.0781, + "step": 507 + }, + { + "epoch": 2.626943005181347, + "eval_accuracy": 0.678391959798995, + "eval_loss": 0.6313599348068237, + "eval_runtime": 23.6955, + "eval_samples_per_second": 16.796, + "eval_steps_per_second": 2.11, + "step": 507 + }, + { + "epoch": 2.6321243523316062, + "grad_norm": 5.5, + "learning_rate": 2.367875647668394e-05, + "loss": 0.3867, + "step": 508 + }, + { + "epoch": 2.6321243523316062, + "eval_accuracy": 0.6758793969849246, + "eval_loss": 0.6326358318328857, + "eval_runtime": 23.6408, + "eval_samples_per_second": 16.835, + "eval_steps_per_second": 2.115, + "step": 508 + }, + { + "epoch": 2.6373056994818653, + "grad_norm": 7.5625, + "learning_rate": 2.3626943005181348e-05, + "loss": 0.7227, + "step": 509 + }, + { + "epoch": 2.6373056994818653, + "eval_accuracy": 0.6733668341708543, + "eval_loss": 0.633421003818512, + "eval_runtime": 23.678, + "eval_samples_per_second": 16.809, + "eval_steps_per_second": 2.112, + "step": 509 + }, + { + "epoch": 2.6424870466321244, + "grad_norm": 6.625, + "learning_rate": 2.3575129533678756e-05, + "loss": 0.543, + "step": 510 + }, + { + "epoch": 2.6424870466321244, + "eval_accuracy": 0.6733668341708543, + "eval_loss": 0.6351484060287476, + "eval_runtime": 23.6466, + "eval_samples_per_second": 16.831, + "eval_steps_per_second": 2.114, + "step": 510 + }, + { + "epoch": 2.6476683937823835, + "grad_norm": 11.75, + "learning_rate": 2.3523316062176168e-05, + "loss": 1.0547, + "step": 511 + }, + { + "epoch": 2.6476683937823835, + "eval_accuracy": 0.6758793969849246, + "eval_loss": 0.6356980204582214, + "eval_runtime": 23.7004, + "eval_samples_per_second": 16.793, + "eval_steps_per_second": 2.11, + "step": 511 + }, + { + "epoch": 2.6528497409326426, + "grad_norm": 7.03125, + "learning_rate": 2.3471502590673577e-05, + "loss": 0.5117, + "step": 512 + }, + { + "epoch": 2.6528497409326426, + "eval_accuracy": 0.678391959798995, + "eval_loss": 0.6362280249595642, + "eval_runtime": 23.6531, + "eval_samples_per_second": 16.827, + "eval_steps_per_second": 2.114, + "step": 512 + }, + { + "epoch": 2.6580310880829017, + "grad_norm": 9.125, + "learning_rate": 2.3419689119170985e-05, + "loss": 0.8242, + "step": 513 + }, + { + "epoch": 2.6580310880829017, + "eval_accuracy": 0.678391959798995, + "eval_loss": 0.6368561387062073, + "eval_runtime": 23.7077, + "eval_samples_per_second": 16.788, + "eval_steps_per_second": 2.109, + "step": 513 + }, + { + "epoch": 2.6632124352331608, + "grad_norm": 7.125, + "learning_rate": 2.3367875647668393e-05, + "loss": 0.5859, + "step": 514 + }, + { + "epoch": 2.6632124352331608, + "eval_accuracy": 0.6733668341708543, + "eval_loss": 0.638701319694519, + "eval_runtime": 23.6337, + "eval_samples_per_second": 16.84, + "eval_steps_per_second": 2.116, + "step": 514 + }, + { + "epoch": 2.66839378238342, + "grad_norm": 6.875, + "learning_rate": 2.3316062176165805e-05, + "loss": 0.6914, + "step": 515 + }, + { + "epoch": 2.66839378238342, + "eval_accuracy": 0.6733668341708543, + "eval_loss": 0.6394864916801453, + "eval_runtime": 23.5949, + "eval_samples_per_second": 16.868, + "eval_steps_per_second": 2.119, + "step": 515 + }, + { + "epoch": 2.6735751295336785, + "grad_norm": 7.125, + "learning_rate": 2.3264248704663214e-05, + "loss": 0.7227, + "step": 516 + }, + { + "epoch": 2.6735751295336785, + "eval_accuracy": 0.6733668341708543, + "eval_loss": 0.6396435499191284, + "eval_runtime": 23.5666, + "eval_samples_per_second": 16.888, + "eval_steps_per_second": 2.122, + "step": 516 + }, + { + "epoch": 2.6787564766839376, + "grad_norm": 7.65625, + "learning_rate": 2.3212435233160622e-05, + "loss": 0.6797, + "step": 517 + }, + { + "epoch": 2.6787564766839376, + "eval_accuracy": 0.678391959798995, + "eval_loss": 0.6416457295417786, + "eval_runtime": 23.5897, + "eval_samples_per_second": 16.872, + "eval_steps_per_second": 2.12, + "step": 517 + }, + { + "epoch": 2.6839378238341967, + "grad_norm": 6.125, + "learning_rate": 2.3160621761658034e-05, + "loss": 0.4395, + "step": 518 + }, + { + "epoch": 2.6839378238341967, + "eval_accuracy": 0.6809045226130653, + "eval_loss": 0.641979455947876, + "eval_runtime": 23.5427, + "eval_samples_per_second": 16.905, + "eval_steps_per_second": 2.124, + "step": 518 + }, + { + "epoch": 2.6891191709844557, + "grad_norm": 9.375, + "learning_rate": 2.3108808290155442e-05, + "loss": 0.5234, + "step": 519 + }, + { + "epoch": 2.6891191709844557, + "eval_accuracy": 0.6733668341708543, + "eval_loss": 0.6423524022102356, + "eval_runtime": 23.5339, + "eval_samples_per_second": 16.912, + "eval_steps_per_second": 2.125, + "step": 519 + }, + { + "epoch": 2.694300518134715, + "grad_norm": 5.96875, + "learning_rate": 2.305699481865285e-05, + "loss": 0.4707, + "step": 520 + }, + { + "epoch": 2.694300518134715, + "eval_accuracy": 0.6758793969849246, + "eval_loss": 0.6430982947349548, + "eval_runtime": 23.5239, + "eval_samples_per_second": 16.919, + "eval_steps_per_second": 2.125, + "step": 520 + }, + { + "epoch": 2.699481865284974, + "grad_norm": 7.65625, + "learning_rate": 2.300518134715026e-05, + "loss": 0.6641, + "step": 521 + }, + { + "epoch": 2.699481865284974, + "eval_accuracy": 0.6758793969849246, + "eval_loss": 0.6431571841239929, + "eval_runtime": 23.5622, + "eval_samples_per_second": 16.891, + "eval_steps_per_second": 2.122, + "step": 521 + }, + { + "epoch": 2.704663212435233, + "grad_norm": 7.0625, + "learning_rate": 2.2953367875647668e-05, + "loss": 0.543, + "step": 522 + }, + { + "epoch": 2.704663212435233, + "eval_accuracy": 0.6733668341708543, + "eval_loss": 0.6444527506828308, + "eval_runtime": 23.5567, + "eval_samples_per_second": 16.895, + "eval_steps_per_second": 2.123, + "step": 522 + }, + { + "epoch": 2.709844559585492, + "grad_norm": 7.65625, + "learning_rate": 2.290155440414508e-05, + "loss": 0.4199, + "step": 523 + }, + { + "epoch": 2.709844559585492, + "eval_accuracy": 0.6809045226130653, + "eval_loss": 0.6457679271697998, + "eval_runtime": 23.51, + "eval_samples_per_second": 16.929, + "eval_steps_per_second": 2.127, + "step": 523 + }, + { + "epoch": 2.715025906735751, + "grad_norm": 6.28125, + "learning_rate": 2.2849740932642488e-05, + "loss": 0.582, + "step": 524 + }, + { + "epoch": 2.715025906735751, + "eval_accuracy": 0.6809045226130653, + "eval_loss": 0.6464942097663879, + "eval_runtime": 23.5504, + "eval_samples_per_second": 16.9, + "eval_steps_per_second": 2.123, + "step": 524 + }, + { + "epoch": 2.7202072538860103, + "grad_norm": 8.125, + "learning_rate": 2.27979274611399e-05, + "loss": 0.6914, + "step": 525 + }, + { + "epoch": 2.7202072538860103, + "eval_accuracy": 0.6708542713567839, + "eval_loss": 0.6470634341239929, + "eval_runtime": 23.5451, + "eval_samples_per_second": 16.904, + "eval_steps_per_second": 2.124, + "step": 525 + }, + { + "epoch": 2.7253886010362693, + "grad_norm": 5.9375, + "learning_rate": 2.2746113989637308e-05, + "loss": 0.5117, + "step": 526 + }, + { + "epoch": 2.7253886010362693, + "eval_accuracy": 0.6733668341708543, + "eval_loss": 0.6472597122192383, + "eval_runtime": 23.5514, + "eval_samples_per_second": 16.899, + "eval_steps_per_second": 2.123, + "step": 526 + }, + { + "epoch": 2.7305699481865284, + "grad_norm": 10.375, + "learning_rate": 2.2694300518134716e-05, + "loss": 0.9531, + "step": 527 + }, + { + "epoch": 2.7305699481865284, + "eval_accuracy": 0.678391959798995, + "eval_loss": 0.647161602973938, + "eval_runtime": 23.5433, + "eval_samples_per_second": 16.905, + "eval_steps_per_second": 2.124, + "step": 527 + }, + { + "epoch": 2.7357512953367875, + "grad_norm": 7.40625, + "learning_rate": 2.2642487046632125e-05, + "loss": 0.7383, + "step": 528 + }, + { + "epoch": 2.7357512953367875, + "eval_accuracy": 0.6884422110552764, + "eval_loss": 0.6462389826774597, + "eval_runtime": 23.5035, + "eval_samples_per_second": 16.934, + "eval_steps_per_second": 2.127, + "step": 528 + }, + { + "epoch": 2.7409326424870466, + "grad_norm": 7.0625, + "learning_rate": 2.2590673575129533e-05, + "loss": 0.6406, + "step": 529 + }, + { + "epoch": 2.7409326424870466, + "eval_accuracy": 0.6834170854271356, + "eval_loss": 0.6467886567115784, + "eval_runtime": 23.5458, + "eval_samples_per_second": 16.903, + "eval_steps_per_second": 2.124, + "step": 529 + }, + { + "epoch": 2.7461139896373057, + "grad_norm": 9.1875, + "learning_rate": 2.2538860103626945e-05, + "loss": 0.8359, + "step": 530 + }, + { + "epoch": 2.7461139896373057, + "eval_accuracy": 0.6758793969849246, + "eval_loss": 0.645846426486969, + "eval_runtime": 23.5339, + "eval_samples_per_second": 16.912, + "eval_steps_per_second": 2.125, + "step": 530 + }, + { + "epoch": 2.7512953367875648, + "grad_norm": 8.8125, + "learning_rate": 2.2487046632124354e-05, + "loss": 0.7188, + "step": 531 + }, + { + "epoch": 2.7512953367875648, + "eval_accuracy": 0.6884422110552764, + "eval_loss": 0.6464353203773499, + "eval_runtime": 23.6386, + "eval_samples_per_second": 16.837, + "eval_steps_per_second": 2.115, + "step": 531 + }, + { + "epoch": 2.756476683937824, + "grad_norm": 9.75, + "learning_rate": 2.2435233160621762e-05, + "loss": 0.5703, + "step": 532 + }, + { + "epoch": 2.756476683937824, + "eval_accuracy": 0.678391959798995, + "eval_loss": 0.645139753818512, + "eval_runtime": 23.6225, + "eval_samples_per_second": 16.848, + "eval_steps_per_second": 2.117, + "step": 532 + }, + { + "epoch": 2.761658031088083, + "grad_norm": 16.0, + "learning_rate": 2.2383419689119174e-05, + "loss": 1.4297, + "step": 533 + }, + { + "epoch": 2.761658031088083, + "eval_accuracy": 0.6809045226130653, + "eval_loss": 0.6443545818328857, + "eval_runtime": 23.6418, + "eval_samples_per_second": 16.835, + "eval_steps_per_second": 2.115, + "step": 533 + }, + { + "epoch": 2.766839378238342, + "grad_norm": 6.625, + "learning_rate": 2.2331606217616582e-05, + "loss": 0.457, + "step": 534 + }, + { + "epoch": 2.766839378238342, + "eval_accuracy": 0.6884422110552764, + "eval_loss": 0.6432945728302002, + "eval_runtime": 23.6483, + "eval_samples_per_second": 16.83, + "eval_steps_per_second": 2.114, + "step": 534 + }, + { + "epoch": 2.772020725388601, + "grad_norm": 20.125, + "learning_rate": 2.227979274611399e-05, + "loss": 0.5859, + "step": 535 + }, + { + "epoch": 2.772020725388601, + "eval_accuracy": 0.6859296482412061, + "eval_loss": 0.6422738432884216, + "eval_runtime": 23.6956, + "eval_samples_per_second": 16.796, + "eval_steps_per_second": 2.11, + "step": 535 + }, + { + "epoch": 2.77720207253886, + "grad_norm": 5.625, + "learning_rate": 2.22279792746114e-05, + "loss": 0.4062, + "step": 536 + }, + { + "epoch": 2.77720207253886, + "eval_accuracy": 0.6859296482412061, + "eval_loss": 0.6423524022102356, + "eval_runtime": 23.696, + "eval_samples_per_second": 16.796, + "eval_steps_per_second": 2.11, + "step": 536 + }, + { + "epoch": 2.7823834196891193, + "grad_norm": 6.5625, + "learning_rate": 2.2176165803108807e-05, + "loss": 0.4746, + "step": 537 + }, + { + "epoch": 2.7823834196891193, + "eval_accuracy": 0.6859296482412061, + "eval_loss": 0.6420579552650452, + "eval_runtime": 23.6507, + "eval_samples_per_second": 16.828, + "eval_steps_per_second": 2.114, + "step": 537 + }, + { + "epoch": 2.7875647668393784, + "grad_norm": 5.84375, + "learning_rate": 2.212435233160622e-05, + "loss": 0.3516, + "step": 538 + }, + { + "epoch": 2.7875647668393784, + "eval_accuracy": 0.6884422110552764, + "eval_loss": 0.6404876112937927, + "eval_runtime": 23.6533, + "eval_samples_per_second": 16.826, + "eval_steps_per_second": 2.114, + "step": 538 + }, + { + "epoch": 2.7927461139896375, + "grad_norm": 6.71875, + "learning_rate": 2.2072538860103628e-05, + "loss": 0.6406, + "step": 539 + }, + { + "epoch": 2.7927461139896375, + "eval_accuracy": 0.6809045226130653, + "eval_loss": 0.6408998370170593, + "eval_runtime": 23.6557, + "eval_samples_per_second": 16.825, + "eval_steps_per_second": 2.114, + "step": 539 + }, + { + "epoch": 2.7979274611398965, + "grad_norm": 6.46875, + "learning_rate": 2.202072538860104e-05, + "loss": 0.4473, + "step": 540 + }, + { + "epoch": 2.7979274611398965, + "eval_accuracy": 0.6934673366834171, + "eval_loss": 0.6398791074752808, + "eval_runtime": 23.6715, + "eval_samples_per_second": 16.813, + "eval_steps_per_second": 2.112, + "step": 540 + }, + { + "epoch": 2.8031088082901556, + "grad_norm": 11.25, + "learning_rate": 2.1968911917098448e-05, + "loss": 0.7344, + "step": 541 + }, + { + "epoch": 2.8031088082901556, + "eval_accuracy": 0.6934673366834171, + "eval_loss": 0.6394864916801453, + "eval_runtime": 23.606, + "eval_samples_per_second": 16.86, + "eval_steps_per_second": 2.118, + "step": 541 + }, + { + "epoch": 2.8082901554404147, + "grad_norm": 3.84375, + "learning_rate": 2.1917098445595856e-05, + "loss": 0.2168, + "step": 542 + }, + { + "epoch": 2.8082901554404147, + "eval_accuracy": 0.6909547738693468, + "eval_loss": 0.6404483318328857, + "eval_runtime": 23.5641, + "eval_samples_per_second": 16.89, + "eval_steps_per_second": 2.122, + "step": 542 + }, + { + "epoch": 2.813471502590674, + "grad_norm": 9.5625, + "learning_rate": 2.1865284974093265e-05, + "loss": 0.7812, + "step": 543 + }, + { + "epoch": 2.813471502590674, + "eval_accuracy": 0.6884422110552764, + "eval_loss": 0.639309823513031, + "eval_runtime": 23.5927, + "eval_samples_per_second": 16.87, + "eval_steps_per_second": 2.119, + "step": 543 + }, + { + "epoch": 2.818652849740933, + "grad_norm": 7.34375, + "learning_rate": 2.1813471502590673e-05, + "loss": 0.5352, + "step": 544 + }, + { + "epoch": 2.818652849740933, + "eval_accuracy": 0.6934673366834171, + "eval_loss": 0.6386620402336121, + "eval_runtime": 23.5856, + "eval_samples_per_second": 16.875, + "eval_steps_per_second": 2.12, + "step": 544 + }, + { + "epoch": 2.823834196891192, + "grad_norm": 10.375, + "learning_rate": 2.1761658031088085e-05, + "loss": 0.8398, + "step": 545 + }, + { + "epoch": 2.823834196891192, + "eval_accuracy": 0.6934673366834171, + "eval_loss": 0.6370524764060974, + "eval_runtime": 23.5396, + "eval_samples_per_second": 16.908, + "eval_steps_per_second": 2.124, + "step": 545 + }, + { + "epoch": 2.8290155440414506, + "grad_norm": 7.0625, + "learning_rate": 2.1709844559585493e-05, + "loss": 0.668, + "step": 546 + }, + { + "epoch": 2.8290155440414506, + "eval_accuracy": 0.6984924623115578, + "eval_loss": 0.6359139680862427, + "eval_runtime": 23.5604, + "eval_samples_per_second": 16.893, + "eval_steps_per_second": 2.122, + "step": 546 + }, + { + "epoch": 2.8341968911917097, + "grad_norm": 5.15625, + "learning_rate": 2.1658031088082902e-05, + "loss": 0.4727, + "step": 547 + }, + { + "epoch": 2.8341968911917097, + "eval_accuracy": 0.6909547738693468, + "eval_loss": 0.6351680159568787, + "eval_runtime": 23.6154, + "eval_samples_per_second": 16.853, + "eval_steps_per_second": 2.117, + "step": 547 + }, + { + "epoch": 2.839378238341969, + "grad_norm": 3.109375, + "learning_rate": 2.1606217616580314e-05, + "loss": 0.1631, + "step": 548 + }, + { + "epoch": 2.839378238341969, + "eval_accuracy": 0.6959798994974874, + "eval_loss": 0.6350895166397095, + "eval_runtime": 23.6749, + "eval_samples_per_second": 16.811, + "eval_steps_per_second": 2.112, + "step": 548 + }, + { + "epoch": 2.844559585492228, + "grad_norm": 5.0, + "learning_rate": 2.1554404145077722e-05, + "loss": 0.3965, + "step": 549 + }, + { + "epoch": 2.844559585492228, + "eval_accuracy": 0.6959798994974874, + "eval_loss": 0.6345987915992737, + "eval_runtime": 23.6514, + "eval_samples_per_second": 16.828, + "eval_steps_per_second": 2.114, + "step": 549 + }, + { + "epoch": 2.849740932642487, + "grad_norm": 7.0, + "learning_rate": 2.150259067357513e-05, + "loss": 0.6797, + "step": 550 + }, + { + "epoch": 2.849740932642487, + "eval_accuracy": 0.6984924623115578, + "eval_loss": 0.6338332295417786, + "eval_runtime": 23.6542, + "eval_samples_per_second": 16.826, + "eval_steps_per_second": 2.114, + "step": 550 + }, + { + "epoch": 2.854922279792746, + "grad_norm": 8.6875, + "learning_rate": 2.145077720207254e-05, + "loss": 0.7578, + "step": 551 + }, + { + "epoch": 2.854922279792746, + "eval_accuracy": 0.6909547738693468, + "eval_loss": 0.6336958408355713, + "eval_runtime": 23.5033, + "eval_samples_per_second": 16.934, + "eval_steps_per_second": 2.127, + "step": 551 + }, + { + "epoch": 2.860103626943005, + "grad_norm": 6.78125, + "learning_rate": 2.1398963730569947e-05, + "loss": 0.6484, + "step": 552 + }, + { + "epoch": 2.860103626943005, + "eval_accuracy": 0.6934673366834171, + "eval_loss": 0.6330480575561523, + "eval_runtime": 23.4512, + "eval_samples_per_second": 16.971, + "eval_steps_per_second": 2.132, + "step": 552 + }, + { + "epoch": 2.865284974093264, + "grad_norm": 8.5, + "learning_rate": 2.134715025906736e-05, + "loss": 0.7227, + "step": 553 + }, + { + "epoch": 2.865284974093264, + "eval_accuracy": 0.6984924623115578, + "eval_loss": 0.6334406137466431, + "eval_runtime": 23.5943, + "eval_samples_per_second": 16.868, + "eval_steps_per_second": 2.119, + "step": 553 + }, + { + "epoch": 2.8704663212435233, + "grad_norm": 5.53125, + "learning_rate": 2.1295336787564767e-05, + "loss": 0.4629, + "step": 554 + }, + { + "epoch": 2.8704663212435233, + "eval_accuracy": 0.6909547738693468, + "eval_loss": 0.6327340006828308, + "eval_runtime": 23.6533, + "eval_samples_per_second": 16.826, + "eval_steps_per_second": 2.114, + "step": 554 + }, + { + "epoch": 2.8756476683937824, + "grad_norm": 7.78125, + "learning_rate": 2.124352331606218e-05, + "loss": 0.5859, + "step": 555 + }, + { + "epoch": 2.8756476683937824, + "eval_accuracy": 0.6959798994974874, + "eval_loss": 0.632851779460907, + "eval_runtime": 23.6398, + "eval_samples_per_second": 16.836, + "eval_steps_per_second": 2.115, + "step": 555 + }, + { + "epoch": 2.8808290155440415, + "grad_norm": 4.65625, + "learning_rate": 2.1191709844559588e-05, + "loss": 0.3496, + "step": 556 + }, + { + "epoch": 2.8808290155440415, + "eval_accuracy": 0.6909547738693468, + "eval_loss": 0.6314777135848999, + "eval_runtime": 23.6878, + "eval_samples_per_second": 16.802, + "eval_steps_per_second": 2.111, + "step": 556 + }, + { + "epoch": 2.8860103626943006, + "grad_norm": 4.65625, + "learning_rate": 2.1139896373056996e-05, + "loss": 0.2617, + "step": 557 + }, + { + "epoch": 2.8860103626943006, + "eval_accuracy": 0.6959798994974874, + "eval_loss": 0.6320862174034119, + "eval_runtime": 23.6585, + "eval_samples_per_second": 16.823, + "eval_steps_per_second": 2.113, + "step": 557 + }, + { + "epoch": 2.8911917098445596, + "grad_norm": 8.4375, + "learning_rate": 2.1088082901554405e-05, + "loss": 0.8047, + "step": 558 + }, + { + "epoch": 2.8911917098445596, + "eval_accuracy": 0.6859296482412061, + "eval_loss": 0.6323806643486023, + "eval_runtime": 23.694, + "eval_samples_per_second": 16.798, + "eval_steps_per_second": 2.11, + "step": 558 + }, + { + "epoch": 2.8963730569948187, + "grad_norm": 6.375, + "learning_rate": 2.1036269430051813e-05, + "loss": 0.4863, + "step": 559 + }, + { + "epoch": 2.8963730569948187, + "eval_accuracy": 0.6884422110552764, + "eval_loss": 0.6323021650314331, + "eval_runtime": 23.6756, + "eval_samples_per_second": 16.811, + "eval_steps_per_second": 2.112, + "step": 559 + }, + { + "epoch": 2.901554404145078, + "grad_norm": 5.625, + "learning_rate": 2.0984455958549225e-05, + "loss": 0.4531, + "step": 560 + }, + { + "epoch": 2.901554404145078, + "eval_accuracy": 0.6984924623115578, + "eval_loss": 0.6317524909973145, + "eval_runtime": 23.6815, + "eval_samples_per_second": 16.806, + "eval_steps_per_second": 2.111, + "step": 560 + }, + { + "epoch": 2.906735751295337, + "grad_norm": 5.875, + "learning_rate": 2.0932642487046633e-05, + "loss": 0.4727, + "step": 561 + }, + { + "epoch": 2.906735751295337, + "eval_accuracy": 0.6934673366834171, + "eval_loss": 0.6320666074752808, + "eval_runtime": 23.6597, + "eval_samples_per_second": 16.822, + "eval_steps_per_second": 2.113, + "step": 561 + }, + { + "epoch": 2.911917098445596, + "grad_norm": 12.0, + "learning_rate": 2.088082901554404e-05, + "loss": 1.0156, + "step": 562 + }, + { + "epoch": 2.911917098445596, + "eval_accuracy": 0.6934673366834171, + "eval_loss": 0.6323021650314331, + "eval_runtime": 23.6429, + "eval_samples_per_second": 16.834, + "eval_steps_per_second": 2.115, + "step": 562 + }, + { + "epoch": 2.917098445595855, + "grad_norm": 10.9375, + "learning_rate": 2.0829015544041453e-05, + "loss": 0.9922, + "step": 563 + }, + { + "epoch": 2.917098445595855, + "eval_accuracy": 0.6884422110552764, + "eval_loss": 0.6315169334411621, + "eval_runtime": 23.6587, + "eval_samples_per_second": 16.823, + "eval_steps_per_second": 2.113, + "step": 563 + }, + { + "epoch": 2.9222797927461137, + "grad_norm": 12.3125, + "learning_rate": 2.0777202072538862e-05, + "loss": 0.9219, + "step": 564 + }, + { + "epoch": 2.9222797927461137, + "eval_accuracy": 0.6984924623115578, + "eval_loss": 0.6311439871788025, + "eval_runtime": 23.6932, + "eval_samples_per_second": 16.798, + "eval_steps_per_second": 2.11, + "step": 564 + }, + { + "epoch": 2.927461139896373, + "grad_norm": 6.625, + "learning_rate": 2.072538860103627e-05, + "loss": 0.4258, + "step": 565 + }, + { + "epoch": 2.927461139896373, + "eval_accuracy": 0.6909547738693468, + "eval_loss": 0.6306728720664978, + "eval_runtime": 23.6432, + "eval_samples_per_second": 16.834, + "eval_steps_per_second": 2.115, + "step": 565 + }, + { + "epoch": 2.932642487046632, + "grad_norm": 7.9375, + "learning_rate": 2.067357512953368e-05, + "loss": 0.7734, + "step": 566 + }, + { + "epoch": 2.932642487046632, + "eval_accuracy": 0.6959798994974874, + "eval_loss": 0.6306925415992737, + "eval_runtime": 23.6597, + "eval_samples_per_second": 16.822, + "eval_steps_per_second": 2.113, + "step": 566 + }, + { + "epoch": 2.937823834196891, + "grad_norm": 7.28125, + "learning_rate": 2.0621761658031087e-05, + "loss": 0.6719, + "step": 567 + }, + { + "epoch": 2.937823834196891, + "eval_accuracy": 0.7010050251256281, + "eval_loss": 0.630790650844574, + "eval_runtime": 23.6111, + "eval_samples_per_second": 16.856, + "eval_steps_per_second": 2.118, + "step": 567 + }, + { + "epoch": 2.94300518134715, + "grad_norm": 18.125, + "learning_rate": 2.05699481865285e-05, + "loss": 0.4473, + "step": 568 + }, + { + "epoch": 2.94300518134715, + "eval_accuracy": 0.6959798994974874, + "eval_loss": 0.6299073696136475, + "eval_runtime": 23.6067, + "eval_samples_per_second": 16.86, + "eval_steps_per_second": 2.118, + "step": 568 + }, + { + "epoch": 2.948186528497409, + "grad_norm": 6.78125, + "learning_rate": 2.0518134715025907e-05, + "loss": 0.6133, + "step": 569 + }, + { + "epoch": 2.948186528497409, + "eval_accuracy": 0.7010050251256281, + "eval_loss": 0.6303392052650452, + "eval_runtime": 23.5904, + "eval_samples_per_second": 16.871, + "eval_steps_per_second": 2.12, + "step": 569 + }, + { + "epoch": 2.9533678756476682, + "grad_norm": 9.0625, + "learning_rate": 2.046632124352332e-05, + "loss": 0.7461, + "step": 570 + }, + { + "epoch": 2.9533678756476682, + "eval_accuracy": 0.6984924623115578, + "eval_loss": 0.6298288106918335, + "eval_runtime": 23.6327, + "eval_samples_per_second": 16.841, + "eval_steps_per_second": 2.116, + "step": 570 + }, + { + "epoch": 2.9585492227979273, + "grad_norm": 11.4375, + "learning_rate": 2.0414507772020728e-05, + "loss": 0.6992, + "step": 571 + }, + { + "epoch": 2.9585492227979273, + "eval_accuracy": 0.6959798994974874, + "eval_loss": 0.6301036477088928, + "eval_runtime": 23.581, + "eval_samples_per_second": 16.878, + "eval_steps_per_second": 2.12, + "step": 571 + }, + { + "epoch": 2.9637305699481864, + "grad_norm": 7.15625, + "learning_rate": 2.0362694300518136e-05, + "loss": 0.6406, + "step": 572 + }, + { + "epoch": 2.9637305699481864, + "eval_accuracy": 0.6934673366834171, + "eval_loss": 0.6294362545013428, + "eval_runtime": 23.6223, + "eval_samples_per_second": 16.848, + "eval_steps_per_second": 2.117, + "step": 572 + }, + { + "epoch": 2.9689119170984455, + "grad_norm": 10.9375, + "learning_rate": 2.0310880829015544e-05, + "loss": 1.0, + "step": 573 + }, + { + "epoch": 2.9689119170984455, + "eval_accuracy": 0.6934673366834171, + "eval_loss": 0.6292399764060974, + "eval_runtime": 23.5749, + "eval_samples_per_second": 16.882, + "eval_steps_per_second": 2.121, + "step": 573 + }, + { + "epoch": 2.9740932642487046, + "grad_norm": 8.5, + "learning_rate": 2.0259067357512953e-05, + "loss": 0.7578, + "step": 574 + }, + { + "epoch": 2.9740932642487046, + "eval_accuracy": 0.6934673366834171, + "eval_loss": 0.6292595863342285, + "eval_runtime": 23.6214, + "eval_samples_per_second": 16.849, + "eval_steps_per_second": 2.117, + "step": 574 + }, + { + "epoch": 2.9792746113989637, + "grad_norm": 8.5, + "learning_rate": 2.0207253886010365e-05, + "loss": 0.5859, + "step": 575 + }, + { + "epoch": 2.9792746113989637, + "eval_accuracy": 0.6984924623115578, + "eval_loss": 0.6280621886253357, + "eval_runtime": 23.5732, + "eval_samples_per_second": 16.884, + "eval_steps_per_second": 2.121, + "step": 575 + }, + { + "epoch": 2.9844559585492227, + "grad_norm": 6.84375, + "learning_rate": 2.0155440414507773e-05, + "loss": 0.6602, + "step": 576 + }, + { + "epoch": 2.9844559585492227, + "eval_accuracy": 0.6959798994974874, + "eval_loss": 0.6278659105300903, + "eval_runtime": 23.6094, + "eval_samples_per_second": 16.858, + "eval_steps_per_second": 2.118, + "step": 576 + }, + { + "epoch": 2.989637305699482, + "grad_norm": 12.25, + "learning_rate": 2.010362694300518e-05, + "loss": 1.0469, + "step": 577 + }, + { + "epoch": 2.989637305699482, + "eval_accuracy": 0.7010050251256281, + "eval_loss": 0.6269629597663879, + "eval_runtime": 23.5695, + "eval_samples_per_second": 16.886, + "eval_steps_per_second": 2.121, + "step": 577 + }, + { + "epoch": 2.994818652849741, + "grad_norm": 9.1875, + "learning_rate": 2.0051813471502593e-05, + "loss": 0.832, + "step": 578 + }, + { + "epoch": 2.994818652849741, + "eval_accuracy": 0.6984924623115578, + "eval_loss": 0.6267862915992737, + "eval_runtime": 23.5749, + "eval_samples_per_second": 16.882, + "eval_steps_per_second": 2.121, + "step": 578 + }, + { + "epoch": 3.0, + "grad_norm": 1.484375, + "learning_rate": 2e-05, + "loss": 0.0212, + "step": 579 + }, + { + "epoch": 3.0, + "eval_accuracy": 0.7035175879396985, + "eval_loss": 0.6258636713027954, + "eval_runtime": 23.5669, + "eval_samples_per_second": 16.888, + "eval_steps_per_second": 2.122, + "step": 579 + }, + { + "epoch": 3.005181347150259, + "grad_norm": 6.34375, + "learning_rate": 1.994818652849741e-05, + "loss": 0.5781, + "step": 580 + }, + { + "epoch": 3.005181347150259, + "eval_accuracy": 0.6959798994974874, + "eval_loss": 0.6258047819137573, + "eval_runtime": 23.6164, + "eval_samples_per_second": 16.853, + "eval_steps_per_second": 2.117, + "step": 580 + }, + { + "epoch": 3.010362694300518, + "grad_norm": 6.46875, + "learning_rate": 1.989637305699482e-05, + "loss": 0.5273, + "step": 581 + }, + { + "epoch": 3.010362694300518, + "eval_accuracy": 0.7010050251256281, + "eval_loss": 0.6260011196136475, + "eval_runtime": 23.5971, + "eval_samples_per_second": 16.866, + "eval_steps_per_second": 2.119, + "step": 581 + }, + { + "epoch": 3.0155440414507773, + "grad_norm": 5.21875, + "learning_rate": 1.9844559585492227e-05, + "loss": 0.4395, + "step": 582 + }, + { + "epoch": 3.0155440414507773, + "eval_accuracy": 0.6984924623115578, + "eval_loss": 0.6259422302246094, + "eval_runtime": 23.6069, + "eval_samples_per_second": 16.859, + "eval_steps_per_second": 2.118, + "step": 582 + }, + { + "epoch": 3.0207253886010363, + "grad_norm": 10.5, + "learning_rate": 1.979274611398964e-05, + "loss": 0.9062, + "step": 583 + }, + { + "epoch": 3.0207253886010363, + "eval_accuracy": 0.6959798994974874, + "eval_loss": 0.6246073842048645, + "eval_runtime": 23.6148, + "eval_samples_per_second": 16.854, + "eval_steps_per_second": 2.117, + "step": 583 + }, + { + "epoch": 3.0259067357512954, + "grad_norm": 6.71875, + "learning_rate": 1.9740932642487047e-05, + "loss": 0.5859, + "step": 584 + }, + { + "epoch": 3.0259067357512954, + "eval_accuracy": 0.7010050251256281, + "eval_loss": 0.6245484948158264, + "eval_runtime": 23.6099, + "eval_samples_per_second": 16.857, + "eval_steps_per_second": 2.118, + "step": 584 + }, + { + "epoch": 3.0310880829015545, + "grad_norm": 6.6875, + "learning_rate": 1.968911917098446e-05, + "loss": 0.6211, + "step": 585 + }, + { + "epoch": 3.0310880829015545, + "eval_accuracy": 0.7010050251256281, + "eval_loss": 0.6240185499191284, + "eval_runtime": 23.6571, + "eval_samples_per_second": 16.824, + "eval_steps_per_second": 2.114, + "step": 585 + }, + { + "epoch": 3.0362694300518136, + "grad_norm": 6.3125, + "learning_rate": 1.9637305699481867e-05, + "loss": 0.4805, + "step": 586 + }, + { + "epoch": 3.0362694300518136, + "eval_accuracy": 0.7060301507537688, + "eval_loss": 0.6237437129020691, + "eval_runtime": 23.6147, + "eval_samples_per_second": 16.854, + "eval_steps_per_second": 2.117, + "step": 586 + }, + { + "epoch": 3.0414507772020727, + "grad_norm": 8.25, + "learning_rate": 1.9585492227979276e-05, + "loss": 0.8438, + "step": 587 + }, + { + "epoch": 3.0414507772020727, + "eval_accuracy": 0.7010050251256281, + "eval_loss": 0.6238811016082764, + "eval_runtime": 23.6195, + "eval_samples_per_second": 16.85, + "eval_steps_per_second": 2.117, + "step": 587 + }, + { + "epoch": 3.0466321243523318, + "grad_norm": 8.4375, + "learning_rate": 1.9533678756476684e-05, + "loss": 0.707, + "step": 588 + }, + { + "epoch": 3.0466321243523318, + "eval_accuracy": 0.6984924623115578, + "eval_loss": 0.6230370402336121, + "eval_runtime": 23.6198, + "eval_samples_per_second": 16.85, + "eval_steps_per_second": 2.117, + "step": 588 + }, + { + "epoch": 3.051813471502591, + "grad_norm": 6.9375, + "learning_rate": 1.9481865284974093e-05, + "loss": 0.5156, + "step": 589 + }, + { + "epoch": 3.051813471502591, + "eval_accuracy": 0.7010050251256281, + "eval_loss": 0.622448205947876, + "eval_runtime": 23.6204, + "eval_samples_per_second": 16.85, + "eval_steps_per_second": 2.117, + "step": 589 + }, + { + "epoch": 3.05699481865285, + "grad_norm": 5.46875, + "learning_rate": 1.9430051813471504e-05, + "loss": 0.4238, + "step": 590 + }, + { + "epoch": 3.05699481865285, + "eval_accuracy": 0.6984924623115578, + "eval_loss": 0.623115599155426, + "eval_runtime": 23.6249, + "eval_samples_per_second": 16.847, + "eval_steps_per_second": 2.116, + "step": 590 + }, + { + "epoch": 3.062176165803109, + "grad_norm": 7.125, + "learning_rate": 1.9378238341968913e-05, + "loss": 0.6602, + "step": 591 + }, + { + "epoch": 3.062176165803109, + "eval_accuracy": 0.6984924623115578, + "eval_loss": 0.6220948696136475, + "eval_runtime": 23.6629, + "eval_samples_per_second": 16.82, + "eval_steps_per_second": 2.113, + "step": 591 + }, + { + "epoch": 3.0673575129533677, + "grad_norm": 16.0, + "learning_rate": 1.932642487046632e-05, + "loss": 1.0781, + "step": 592 + }, + { + "epoch": 3.0673575129533677, + "eval_accuracy": 0.6959798994974874, + "eval_loss": 0.6213096976280212, + "eval_runtime": 23.6293, + "eval_samples_per_second": 16.844, + "eval_steps_per_second": 2.116, + "step": 592 + }, + { + "epoch": 3.0725388601036268, + "grad_norm": 7.65625, + "learning_rate": 1.9274611398963733e-05, + "loss": 0.6602, + "step": 593 + }, + { + "epoch": 3.0725388601036268, + "eval_accuracy": 0.6959798994974874, + "eval_loss": 0.6214078068733215, + "eval_runtime": 23.6322, + "eval_samples_per_second": 16.841, + "eval_steps_per_second": 2.116, + "step": 593 + }, + { + "epoch": 3.077720207253886, + "grad_norm": 9.0, + "learning_rate": 1.922279792746114e-05, + "loss": 0.8945, + "step": 594 + }, + { + "epoch": 3.077720207253886, + "eval_accuracy": 0.6984924623115578, + "eval_loss": 0.6208778023719788, + "eval_runtime": 23.6789, + "eval_samples_per_second": 16.808, + "eval_steps_per_second": 2.112, + "step": 594 + }, + { + "epoch": 3.082901554404145, + "grad_norm": 7.1875, + "learning_rate": 1.917098445595855e-05, + "loss": 0.6055, + "step": 595 + }, + { + "epoch": 3.082901554404145, + "eval_accuracy": 0.6959798994974874, + "eval_loss": 0.62042635679245, + "eval_runtime": 23.6346, + "eval_samples_per_second": 16.84, + "eval_steps_per_second": 2.116, + "step": 595 + }, + { + "epoch": 3.088082901554404, + "grad_norm": 4.8125, + "learning_rate": 1.9119170984455958e-05, + "loss": 0.3379, + "step": 596 + }, + { + "epoch": 3.088082901554404, + "eval_accuracy": 0.6959798994974874, + "eval_loss": 0.6199356317520142, + "eval_runtime": 23.6318, + "eval_samples_per_second": 16.842, + "eval_steps_per_second": 2.116, + "step": 596 + }, + { + "epoch": 3.093264248704663, + "grad_norm": 7.1875, + "learning_rate": 1.9067357512953367e-05, + "loss": 0.6328, + "step": 597 + }, + { + "epoch": 3.093264248704663, + "eval_accuracy": 0.6959798994974874, + "eval_loss": 0.6198374629020691, + "eval_runtime": 23.6372, + "eval_samples_per_second": 16.838, + "eval_steps_per_second": 2.115, + "step": 597 + }, + { + "epoch": 3.098445595854922, + "grad_norm": 4.34375, + "learning_rate": 1.901554404145078e-05, + "loss": 0.2119, + "step": 598 + }, + { + "epoch": 3.098445595854922, + "eval_accuracy": 0.6984924623115578, + "eval_loss": 0.619170069694519, + "eval_runtime": 23.6337, + "eval_samples_per_second": 16.84, + "eval_steps_per_second": 2.116, + "step": 598 + }, + { + "epoch": 3.1036269430051813, + "grad_norm": 6.125, + "learning_rate": 1.8963730569948187e-05, + "loss": 0.5391, + "step": 599 + }, + { + "epoch": 3.1036269430051813, + "eval_accuracy": 0.6984924623115578, + "eval_loss": 0.6201907992362976, + "eval_runtime": 23.6676, + "eval_samples_per_second": 16.816, + "eval_steps_per_second": 2.113, + "step": 599 + }, + { + "epoch": 3.1088082901554404, + "grad_norm": 6.03125, + "learning_rate": 1.89119170984456e-05, + "loss": 0.6211, + "step": 600 + }, + { + "epoch": 3.1088082901554404, + "eval_accuracy": 0.6959798994974874, + "eval_loss": 0.6193074584007263, + "eval_runtime": 23.6316, + "eval_samples_per_second": 16.842, + "eval_steps_per_second": 2.116, + "step": 600 + }, + { + "epoch": 3.1139896373056994, + "grad_norm": 5.65625, + "learning_rate": 1.8860103626943007e-05, + "loss": 0.543, + "step": 601 + }, + { + "epoch": 3.1139896373056994, + "eval_accuracy": 0.6959798994974874, + "eval_loss": 0.6195626854896545, + "eval_runtime": 23.5642, + "eval_samples_per_second": 16.89, + "eval_steps_per_second": 2.122, + "step": 601 + }, + { + "epoch": 3.1191709844559585, + "grad_norm": 5.96875, + "learning_rate": 1.8808290155440416e-05, + "loss": 0.4219, + "step": 602 + }, + { + "epoch": 3.1191709844559585, + "eval_accuracy": 0.6959798994974874, + "eval_loss": 0.6197196841239929, + "eval_runtime": 23.5213, + "eval_samples_per_second": 16.921, + "eval_steps_per_second": 2.126, + "step": 602 + }, + { + "epoch": 3.1243523316062176, + "grad_norm": 6.71875, + "learning_rate": 1.8756476683937824e-05, + "loss": 0.7227, + "step": 603 + }, + { + "epoch": 3.1243523316062176, + "eval_accuracy": 0.6934673366834171, + "eval_loss": 0.6197000741958618, + "eval_runtime": 23.5808, + "eval_samples_per_second": 16.878, + "eval_steps_per_second": 2.12, + "step": 603 + }, + { + "epoch": 3.1295336787564767, + "grad_norm": 11.125, + "learning_rate": 1.8704663212435232e-05, + "loss": 1.0, + "step": 604 + }, + { + "epoch": 3.1295336787564767, + "eval_accuracy": 0.6934673366834171, + "eval_loss": 0.6213489174842834, + "eval_runtime": 23.6482, + "eval_samples_per_second": 16.83, + "eval_steps_per_second": 2.114, + "step": 604 + }, + { + "epoch": 3.134715025906736, + "grad_norm": 5.3125, + "learning_rate": 1.8652849740932644e-05, + "loss": 0.4512, + "step": 605 + }, + { + "epoch": 3.134715025906736, + "eval_accuracy": 0.6959798994974874, + "eval_loss": 0.6202496886253357, + "eval_runtime": 23.6668, + "eval_samples_per_second": 16.817, + "eval_steps_per_second": 2.113, + "step": 605 + }, + { + "epoch": 3.139896373056995, + "grad_norm": 7.28125, + "learning_rate": 1.8601036269430053e-05, + "loss": 0.6758, + "step": 606 + }, + { + "epoch": 3.139896373056995, + "eval_accuracy": 0.6959798994974874, + "eval_loss": 0.6204656362533569, + "eval_runtime": 23.6332, + "eval_samples_per_second": 16.841, + "eval_steps_per_second": 2.116, + "step": 606 + }, + { + "epoch": 3.145077720207254, + "grad_norm": 12.5, + "learning_rate": 1.854922279792746e-05, + "loss": 0.8906, + "step": 607 + }, + { + "epoch": 3.145077720207254, + "eval_accuracy": 0.6984924623115578, + "eval_loss": 0.620524525642395, + "eval_runtime": 23.6301, + "eval_samples_per_second": 16.843, + "eval_steps_per_second": 2.116, + "step": 607 + }, + { + "epoch": 3.150259067357513, + "grad_norm": 6.84375, + "learning_rate": 1.8497409326424873e-05, + "loss": 0.5234, + "step": 608 + }, + { + "epoch": 3.150259067357513, + "eval_accuracy": 0.6934673366834171, + "eval_loss": 0.620387077331543, + "eval_runtime": 23.6501, + "eval_samples_per_second": 16.829, + "eval_steps_per_second": 2.114, + "step": 608 + }, + { + "epoch": 3.155440414507772, + "grad_norm": 6.34375, + "learning_rate": 1.844559585492228e-05, + "loss": 0.5898, + "step": 609 + }, + { + "epoch": 3.155440414507772, + "eval_accuracy": 0.7010050251256281, + "eval_loss": 0.6203674674034119, + "eval_runtime": 23.642, + "eval_samples_per_second": 16.834, + "eval_steps_per_second": 2.115, + "step": 609 + }, + { + "epoch": 3.160621761658031, + "grad_norm": 19.0, + "learning_rate": 1.839378238341969e-05, + "loss": 0.7031, + "step": 610 + }, + { + "epoch": 3.160621761658031, + "eval_accuracy": 0.6934673366834171, + "eval_loss": 0.6209563612937927, + "eval_runtime": 23.6403, + "eval_samples_per_second": 16.836, + "eval_steps_per_second": 2.115, + "step": 610 + }, + { + "epoch": 3.1658031088082903, + "grad_norm": 6.6875, + "learning_rate": 1.8341968911917098e-05, + "loss": 0.6914, + "step": 611 + }, + { + "epoch": 3.1658031088082903, + "eval_accuracy": 0.6959798994974874, + "eval_loss": 0.6209563612937927, + "eval_runtime": 23.6387, + "eval_samples_per_second": 16.837, + "eval_steps_per_second": 2.115, + "step": 611 + }, + { + "epoch": 3.1709844559585494, + "grad_norm": 4.8125, + "learning_rate": 1.8290155440414507e-05, + "loss": 0.4121, + "step": 612 + }, + { + "epoch": 3.1709844559585494, + "eval_accuracy": 0.6984924623115578, + "eval_loss": 0.621054470539093, + "eval_runtime": 23.6825, + "eval_samples_per_second": 16.806, + "eval_steps_per_second": 2.111, + "step": 612 + }, + { + "epoch": 3.1761658031088085, + "grad_norm": 10.0, + "learning_rate": 1.823834196891192e-05, + "loss": 0.9688, + "step": 613 + }, + { + "epoch": 3.1761658031088085, + "eval_accuracy": 0.6984924623115578, + "eval_loss": 0.6208385825157166, + "eval_runtime": 23.6475, + "eval_samples_per_second": 16.831, + "eval_steps_per_second": 2.114, + "step": 613 + }, + { + "epoch": 3.1813471502590676, + "grad_norm": 12.375, + "learning_rate": 1.8186528497409327e-05, + "loss": 1.2891, + "step": 614 + }, + { + "epoch": 3.1813471502590676, + "eval_accuracy": 0.6909547738693468, + "eval_loss": 0.6209955811500549, + "eval_runtime": 23.6452, + "eval_samples_per_second": 16.832, + "eval_steps_per_second": 2.115, + "step": 614 + }, + { + "epoch": 3.186528497409326, + "grad_norm": 6.40625, + "learning_rate": 1.813471502590674e-05, + "loss": 0.5781, + "step": 615 + }, + { + "epoch": 3.186528497409326, + "eval_accuracy": 0.6884422110552764, + "eval_loss": 0.620387077331543, + "eval_runtime": 23.6294, + "eval_samples_per_second": 16.843, + "eval_steps_per_second": 2.116, + "step": 615 + }, + { + "epoch": 3.1917098445595853, + "grad_norm": 4.75, + "learning_rate": 1.8082901554404147e-05, + "loss": 0.4102, + "step": 616 + }, + { + "epoch": 3.1917098445595853, + "eval_accuracy": 0.6909547738693468, + "eval_loss": 0.6201122999191284, + "eval_runtime": 23.673, + "eval_samples_per_second": 16.812, + "eval_steps_per_second": 2.112, + "step": 616 + }, + { + "epoch": 3.1968911917098444, + "grad_norm": 6.6875, + "learning_rate": 1.8031088082901555e-05, + "loss": 0.6133, + "step": 617 + }, + { + "epoch": 3.1968911917098444, + "eval_accuracy": 0.6909547738693468, + "eval_loss": 0.619817852973938, + "eval_runtime": 23.6433, + "eval_samples_per_second": 16.833, + "eval_steps_per_second": 2.115, + "step": 617 + }, + { + "epoch": 3.2020725388601035, + "grad_norm": 7.8125, + "learning_rate": 1.7979274611398964e-05, + "loss": 0.7422, + "step": 618 + }, + { + "epoch": 3.2020725388601035, + "eval_accuracy": 0.6959798994974874, + "eval_loss": 0.6195234060287476, + "eval_runtime": 23.6427, + "eval_samples_per_second": 16.834, + "eval_steps_per_second": 2.115, + "step": 618 + }, + { + "epoch": 3.2072538860103625, + "grad_norm": 11.75, + "learning_rate": 1.7927461139896372e-05, + "loss": 0.9961, + "step": 619 + }, + { + "epoch": 3.2072538860103625, + "eval_accuracy": 0.7010050251256281, + "eval_loss": 0.619209349155426, + "eval_runtime": 23.6413, + "eval_samples_per_second": 16.835, + "eval_steps_per_second": 2.115, + "step": 619 + }, + { + "epoch": 3.2124352331606216, + "grad_norm": 8.0625, + "learning_rate": 1.7875647668393784e-05, + "loss": 0.6211, + "step": 620 + }, + { + "epoch": 3.2124352331606216, + "eval_accuracy": 0.6884422110552764, + "eval_loss": 0.6193860173225403, + "eval_runtime": 23.68, + "eval_samples_per_second": 16.807, + "eval_steps_per_second": 2.111, + "step": 620 + }, + { + "epoch": 3.2176165803108807, + "grad_norm": 4.15625, + "learning_rate": 1.7823834196891192e-05, + "loss": 0.3027, + "step": 621 + }, + { + "epoch": 3.2176165803108807, + "eval_accuracy": 0.6984924623115578, + "eval_loss": 0.6192682385444641, + "eval_runtime": 23.6475, + "eval_samples_per_second": 16.831, + "eval_steps_per_second": 2.114, + "step": 621 + }, + { + "epoch": 3.22279792746114, + "grad_norm": 12.625, + "learning_rate": 1.77720207253886e-05, + "loss": 0.7539, + "step": 622 + }, + { + "epoch": 3.22279792746114, + "eval_accuracy": 0.6909547738693468, + "eval_loss": 0.6186793446540833, + "eval_runtime": 23.6866, + "eval_samples_per_second": 16.803, + "eval_steps_per_second": 2.111, + "step": 622 + }, + { + "epoch": 3.227979274611399, + "grad_norm": 6.6875, + "learning_rate": 1.7720207253886013e-05, + "loss": 0.5938, + "step": 623 + }, + { + "epoch": 3.227979274611399, + "eval_accuracy": 0.6909547738693468, + "eval_loss": 0.6189934015274048, + "eval_runtime": 23.6434, + "eval_samples_per_second": 16.833, + "eval_steps_per_second": 2.115, + "step": 623 + }, + { + "epoch": 3.233160621761658, + "grad_norm": 5.15625, + "learning_rate": 1.766839378238342e-05, + "loss": 0.4414, + "step": 624 + }, + { + "epoch": 3.233160621761658, + "eval_accuracy": 0.6984924623115578, + "eval_loss": 0.6188363432884216, + "eval_runtime": 23.6377, + "eval_samples_per_second": 16.838, + "eval_steps_per_second": 2.115, + "step": 624 + }, + { + "epoch": 3.238341968911917, + "grad_norm": 7.6875, + "learning_rate": 1.761658031088083e-05, + "loss": 0.5938, + "step": 625 + }, + { + "epoch": 3.238341968911917, + "eval_accuracy": 0.6884422110552764, + "eval_loss": 0.6181297302246094, + "eval_runtime": 23.6482, + "eval_samples_per_second": 16.83, + "eval_steps_per_second": 2.114, + "step": 625 + }, + { + "epoch": 3.243523316062176, + "grad_norm": 9.1875, + "learning_rate": 1.7564766839378238e-05, + "loss": 0.8164, + "step": 626 + }, + { + "epoch": 3.243523316062176, + "eval_accuracy": 0.6859296482412061, + "eval_loss": 0.6179726719856262, + "eval_runtime": 23.652, + "eval_samples_per_second": 16.827, + "eval_steps_per_second": 2.114, + "step": 626 + }, + { + "epoch": 3.2487046632124352, + "grad_norm": 5.3125, + "learning_rate": 1.7512953367875646e-05, + "loss": 0.5195, + "step": 627 + }, + { + "epoch": 3.2487046632124352, + "eval_accuracy": 0.6909547738693468, + "eval_loss": 0.6183260083198547, + "eval_runtime": 23.6565, + "eval_samples_per_second": 16.824, + "eval_steps_per_second": 2.114, + "step": 627 + }, + { + "epoch": 3.2538860103626943, + "grad_norm": 5.46875, + "learning_rate": 1.7461139896373058e-05, + "loss": 0.3457, + "step": 628 + }, + { + "epoch": 3.2538860103626943, + "eval_accuracy": 0.6959798994974874, + "eval_loss": 0.6179137825965881, + "eval_runtime": 23.6506, + "eval_samples_per_second": 16.828, + "eval_steps_per_second": 2.114, + "step": 628 + }, + { + "epoch": 3.2590673575129534, + "grad_norm": 6.1875, + "learning_rate": 1.7409326424870467e-05, + "loss": 0.5234, + "step": 629 + }, + { + "epoch": 3.2590673575129534, + "eval_accuracy": 0.6909547738693468, + "eval_loss": 0.6179137825965881, + "eval_runtime": 23.6636, + "eval_samples_per_second": 16.819, + "eval_steps_per_second": 2.113, + "step": 629 + }, + { + "epoch": 3.2642487046632125, + "grad_norm": 6.96875, + "learning_rate": 1.735751295336788e-05, + "loss": 0.707, + "step": 630 + }, + { + "epoch": 3.2642487046632125, + "eval_accuracy": 0.6934673366834171, + "eval_loss": 0.61785489320755, + "eval_runtime": 23.6615, + "eval_samples_per_second": 16.821, + "eval_steps_per_second": 2.113, + "step": 630 + }, + { + "epoch": 3.2694300518134716, + "grad_norm": 5.6875, + "learning_rate": 1.7305699481865287e-05, + "loss": 0.4844, + "step": 631 + }, + { + "epoch": 3.2694300518134716, + "eval_accuracy": 0.6884422110552764, + "eval_loss": 0.6176978349685669, + "eval_runtime": 23.6482, + "eval_samples_per_second": 16.83, + "eval_steps_per_second": 2.114, + "step": 631 + }, + { + "epoch": 3.2746113989637307, + "grad_norm": 12.25, + "learning_rate": 1.7253886010362695e-05, + "loss": 1.0781, + "step": 632 + }, + { + "epoch": 3.2746113989637307, + "eval_accuracy": 0.6934673366834171, + "eval_loss": 0.6172463893890381, + "eval_runtime": 23.6972, + "eval_samples_per_second": 16.795, + "eval_steps_per_second": 2.11, + "step": 632 + }, + { + "epoch": 3.2797927461139897, + "grad_norm": 6.625, + "learning_rate": 1.7202072538860104e-05, + "loss": 0.5273, + "step": 633 + }, + { + "epoch": 3.2797927461139897, + "eval_accuracy": 0.6934673366834171, + "eval_loss": 0.6176782250404358, + "eval_runtime": 23.6615, + "eval_samples_per_second": 16.821, + "eval_steps_per_second": 2.113, + "step": 633 + }, + { + "epoch": 3.284974093264249, + "grad_norm": 6.8125, + "learning_rate": 1.7150259067357512e-05, + "loss": 0.5195, + "step": 634 + }, + { + "epoch": 3.284974093264249, + "eval_accuracy": 0.6934673366834171, + "eval_loss": 0.6176782250404358, + "eval_runtime": 23.6594, + "eval_samples_per_second": 16.822, + "eval_steps_per_second": 2.113, + "step": 634 + }, + { + "epoch": 3.290155440414508, + "grad_norm": 8.1875, + "learning_rate": 1.7098445595854924e-05, + "loss": 0.7344, + "step": 635 + }, + { + "epoch": 3.290155440414508, + "eval_accuracy": 0.7010050251256281, + "eval_loss": 0.6171678900718689, + "eval_runtime": 23.6507, + "eval_samples_per_second": 16.828, + "eval_steps_per_second": 2.114, + "step": 635 + }, + { + "epoch": 3.295336787564767, + "grad_norm": 7.3125, + "learning_rate": 1.7046632124352332e-05, + "loss": 0.7422, + "step": 636 + }, + { + "epoch": 3.295336787564767, + "eval_accuracy": 0.6934673366834171, + "eval_loss": 0.6169323325157166, + "eval_runtime": 23.6515, + "eval_samples_per_second": 16.828, + "eval_steps_per_second": 2.114, + "step": 636 + }, + { + "epoch": 3.300518134715026, + "grad_norm": 6.9375, + "learning_rate": 1.699481865284974e-05, + "loss": 0.6211, + "step": 637 + }, + { + "epoch": 3.300518134715026, + "eval_accuracy": 0.6909547738693468, + "eval_loss": 0.6170697212219238, + "eval_runtime": 23.6567, + "eval_samples_per_second": 16.824, + "eval_steps_per_second": 2.114, + "step": 637 + }, + { + "epoch": 3.305699481865285, + "grad_norm": 6.3125, + "learning_rate": 1.6943005181347153e-05, + "loss": 0.5391, + "step": 638 + }, + { + "epoch": 3.305699481865285, + "eval_accuracy": 0.6984924623115578, + "eval_loss": 0.6171286106109619, + "eval_runtime": 23.6331, + "eval_samples_per_second": 16.841, + "eval_steps_per_second": 2.116, + "step": 638 + }, + { + "epoch": 3.3108808290155443, + "grad_norm": 4.28125, + "learning_rate": 1.689119170984456e-05, + "loss": 0.2773, + "step": 639 + }, + { + "epoch": 3.3108808290155443, + "eval_accuracy": 0.6884422110552764, + "eval_loss": 0.616539716720581, + "eval_runtime": 23.576, + "eval_samples_per_second": 16.882, + "eval_steps_per_second": 2.121, + "step": 639 + }, + { + "epoch": 3.3160621761658033, + "grad_norm": 9.3125, + "learning_rate": 1.683937823834197e-05, + "loss": 0.9336, + "step": 640 + }, + { + "epoch": 3.3160621761658033, + "eval_accuracy": 0.6834170854271356, + "eval_loss": 0.6170304417610168, + "eval_runtime": 23.5585, + "eval_samples_per_second": 16.894, + "eval_steps_per_second": 2.122, + "step": 640 + }, + { + "epoch": 3.321243523316062, + "grad_norm": 6.6875, + "learning_rate": 1.6787564766839378e-05, + "loss": 0.4609, + "step": 641 + }, + { + "epoch": 3.321243523316062, + "eval_accuracy": 0.6959798994974874, + "eval_loss": 0.61652010679245, + "eval_runtime": 23.5889, + "eval_samples_per_second": 16.872, + "eval_steps_per_second": 2.12, + "step": 641 + }, + { + "epoch": 3.326424870466321, + "grad_norm": 7.9375, + "learning_rate": 1.6735751295336786e-05, + "loss": 0.6836, + "step": 642 + }, + { + "epoch": 3.326424870466321, + "eval_accuracy": 0.6934673366834171, + "eval_loss": 0.6168341636657715, + "eval_runtime": 23.543, + "eval_samples_per_second": 16.905, + "eval_steps_per_second": 2.124, + "step": 642 + }, + { + "epoch": 3.33160621761658, + "grad_norm": 7.0, + "learning_rate": 1.6683937823834198e-05, + "loss": 0.5078, + "step": 643 + }, + { + "epoch": 3.33160621761658, + "eval_accuracy": 0.6934673366834171, + "eval_loss": 0.6166378855705261, + "eval_runtime": 23.6008, + "eval_samples_per_second": 16.864, + "eval_steps_per_second": 2.119, + "step": 643 + }, + { + "epoch": 3.3367875647668392, + "grad_norm": 5.8125, + "learning_rate": 1.6632124352331606e-05, + "loss": 0.3516, + "step": 644 + }, + { + "epoch": 3.3367875647668392, + "eval_accuracy": 0.6884422110552764, + "eval_loss": 0.6168145537376404, + "eval_runtime": 23.5958, + "eval_samples_per_second": 16.867, + "eval_steps_per_second": 2.119, + "step": 644 + }, + { + "epoch": 3.3419689119170983, + "grad_norm": 6.0625, + "learning_rate": 1.6580310880829018e-05, + "loss": 0.5078, + "step": 645 + }, + { + "epoch": 3.3419689119170983, + "eval_accuracy": 0.6934673366834171, + "eval_loss": 0.6166378855705261, + "eval_runtime": 23.6692, + "eval_samples_per_second": 16.815, + "eval_steps_per_second": 2.112, + "step": 645 + }, + { + "epoch": 3.3471502590673574, + "grad_norm": 6.21875, + "learning_rate": 1.6528497409326427e-05, + "loss": 0.5078, + "step": 646 + }, + { + "epoch": 3.3471502590673574, + "eval_accuracy": 0.7060301507537688, + "eval_loss": 0.6169519424438477, + "eval_runtime": 23.6376, + "eval_samples_per_second": 16.838, + "eval_steps_per_second": 2.115, + "step": 646 + }, + { + "epoch": 3.3523316062176165, + "grad_norm": 4.71875, + "learning_rate": 1.6476683937823835e-05, + "loss": 0.2891, + "step": 647 + }, + { + "epoch": 3.3523316062176165, + "eval_accuracy": 0.6934673366834171, + "eval_loss": 0.6173445582389832, + "eval_runtime": 23.6485, + "eval_samples_per_second": 16.83, + "eval_steps_per_second": 2.114, + "step": 647 + }, + { + "epoch": 3.3575129533678756, + "grad_norm": 9.0625, + "learning_rate": 1.6424870466321243e-05, + "loss": 0.875, + "step": 648 + }, + { + "epoch": 3.3575129533678756, + "eval_accuracy": 0.6909547738693468, + "eval_loss": 0.6174426674842834, + "eval_runtime": 23.6926, + "eval_samples_per_second": 16.798, + "eval_steps_per_second": 2.11, + "step": 648 + }, + { + "epoch": 3.3626943005181347, + "grad_norm": 8.625, + "learning_rate": 1.6373056994818652e-05, + "loss": 0.9375, + "step": 649 + }, + { + "epoch": 3.3626943005181347, + "eval_accuracy": 0.6959798994974874, + "eval_loss": 0.6176390051841736, + "eval_runtime": 23.6578, + "eval_samples_per_second": 16.823, + "eval_steps_per_second": 2.113, + "step": 649 + }, + { + "epoch": 3.3678756476683938, + "grad_norm": 6.78125, + "learning_rate": 1.6321243523316064e-05, + "loss": 0.4863, + "step": 650 + }, + { + "epoch": 3.3678756476683938, + "eval_accuracy": 0.6884422110552764, + "eval_loss": 0.6173641681671143, + "eval_runtime": 23.6427, + "eval_samples_per_second": 16.834, + "eval_steps_per_second": 2.115, + "step": 650 + }, + { + "epoch": 3.373056994818653, + "grad_norm": 7.09375, + "learning_rate": 1.6269430051813472e-05, + "loss": 0.7344, + "step": 651 + }, + { + "epoch": 3.373056994818653, + "eval_accuracy": 0.6934673366834171, + "eval_loss": 0.6170893311500549, + "eval_runtime": 23.5782, + "eval_samples_per_second": 16.88, + "eval_steps_per_second": 2.121, + "step": 651 + }, + { + "epoch": 3.378238341968912, + "grad_norm": 5.15625, + "learning_rate": 1.621761658031088e-05, + "loss": 0.416, + "step": 652 + }, + { + "epoch": 3.378238341968912, + "eval_accuracy": 0.6909547738693468, + "eval_loss": 0.6170304417610168, + "eval_runtime": 23.5443, + "eval_samples_per_second": 16.904, + "eval_steps_per_second": 2.124, + "step": 652 + }, + { + "epoch": 3.383419689119171, + "grad_norm": 5.25, + "learning_rate": 1.6165803108808292e-05, + "loss": 0.4727, + "step": 653 + }, + { + "epoch": 3.383419689119171, + "eval_accuracy": 0.7010050251256281, + "eval_loss": 0.6167359948158264, + "eval_runtime": 23.6048, + "eval_samples_per_second": 16.861, + "eval_steps_per_second": 2.118, + "step": 653 + }, + { + "epoch": 3.38860103626943, + "grad_norm": 6.09375, + "learning_rate": 1.61139896373057e-05, + "loss": 0.4668, + "step": 654 + }, + { + "epoch": 3.38860103626943, + "eval_accuracy": 0.6959798994974874, + "eval_loss": 0.6174426674842834, + "eval_runtime": 23.6334, + "eval_samples_per_second": 16.841, + "eval_steps_per_second": 2.116, + "step": 654 + }, + { + "epoch": 3.393782383419689, + "grad_norm": 6.21875, + "learning_rate": 1.606217616580311e-05, + "loss": 0.4863, + "step": 655 + }, + { + "epoch": 3.393782383419689, + "eval_accuracy": 0.6959798994974874, + "eval_loss": 0.6170501112937927, + "eval_runtime": 23.651, + "eval_samples_per_second": 16.828, + "eval_steps_per_second": 2.114, + "step": 655 + }, + { + "epoch": 3.3989637305699483, + "grad_norm": 6.25, + "learning_rate": 1.6010362694300518e-05, + "loss": 0.5508, + "step": 656 + }, + { + "epoch": 3.3989637305699483, + "eval_accuracy": 0.7035175879396985, + "eval_loss": 0.6171875, + "eval_runtime": 23.6544, + "eval_samples_per_second": 16.826, + "eval_steps_per_second": 2.114, + "step": 656 + }, + { + "epoch": 3.4041450777202074, + "grad_norm": 6.3125, + "learning_rate": 1.595854922279793e-05, + "loss": 0.6094, + "step": 657 + }, + { + "epoch": 3.4041450777202074, + "eval_accuracy": 0.6984924623115578, + "eval_loss": 0.617226779460907, + "eval_runtime": 23.6861, + "eval_samples_per_second": 16.803, + "eval_steps_per_second": 2.111, + "step": 657 + }, + { + "epoch": 3.4093264248704664, + "grad_norm": 7.25, + "learning_rate": 1.5906735751295338e-05, + "loss": 0.4785, + "step": 658 + }, + { + "epoch": 3.4093264248704664, + "eval_accuracy": 0.6959798994974874, + "eval_loss": 0.6174623370170593, + "eval_runtime": 23.6706, + "eval_samples_per_second": 16.814, + "eval_steps_per_second": 2.112, + "step": 658 + }, + { + "epoch": 3.4145077720207255, + "grad_norm": 6.34375, + "learning_rate": 1.5854922279792746e-05, + "loss": 0.5391, + "step": 659 + }, + { + "epoch": 3.4145077720207255, + "eval_accuracy": 0.7035175879396985, + "eval_loss": 0.6169912219047546, + "eval_runtime": 23.6512, + "eval_samples_per_second": 16.828, + "eval_steps_per_second": 2.114, + "step": 659 + }, + { + "epoch": 3.4196891191709846, + "grad_norm": 6.59375, + "learning_rate": 1.5803108808290158e-05, + "loss": 0.5625, + "step": 660 + }, + { + "epoch": 3.4196891191709846, + "eval_accuracy": 0.7010050251256281, + "eval_loss": 0.6175015568733215, + "eval_runtime": 23.6494, + "eval_samples_per_second": 16.829, + "eval_steps_per_second": 2.114, + "step": 660 + }, + { + "epoch": 3.4248704663212437, + "grad_norm": 9.375, + "learning_rate": 1.5751295336787566e-05, + "loss": 0.6758, + "step": 661 + }, + { + "epoch": 3.4248704663212437, + "eval_accuracy": 0.6959798994974874, + "eval_loss": 0.6176978349685669, + "eval_runtime": 23.6403, + "eval_samples_per_second": 16.836, + "eval_steps_per_second": 2.115, + "step": 661 + }, + { + "epoch": 3.4300518134715023, + "grad_norm": 7.25, + "learning_rate": 1.5699481865284975e-05, + "loss": 0.4609, + "step": 662 + }, + { + "epoch": 3.4300518134715023, + "eval_accuracy": 0.6959798994974874, + "eval_loss": 0.6178156137466431, + "eval_runtime": 23.6615, + "eval_samples_per_second": 16.821, + "eval_steps_per_second": 2.113, + "step": 662 + }, + { + "epoch": 3.4352331606217614, + "grad_norm": 5.625, + "learning_rate": 1.5647668393782383e-05, + "loss": 0.5312, + "step": 663 + }, + { + "epoch": 3.4352331606217614, + "eval_accuracy": 0.6909547738693468, + "eval_loss": 0.6170697212219238, + "eval_runtime": 23.6322, + "eval_samples_per_second": 16.841, + "eval_steps_per_second": 2.116, + "step": 663 + }, + { + "epoch": 3.4404145077720205, + "grad_norm": 5.75, + "learning_rate": 1.5595854922279792e-05, + "loss": 0.4375, + "step": 664 + }, + { + "epoch": 3.4404145077720205, + "eval_accuracy": 0.6984924623115578, + "eval_loss": 0.6172071099281311, + "eval_runtime": 23.6057, + "eval_samples_per_second": 16.86, + "eval_steps_per_second": 2.118, + "step": 664 + }, + { + "epoch": 3.4455958549222796, + "grad_norm": 16.875, + "learning_rate": 1.5544041450777204e-05, + "loss": 1.5859, + "step": 665 + }, + { + "epoch": 3.4455958549222796, + "eval_accuracy": 0.6934673366834171, + "eval_loss": 0.6167556643486023, + "eval_runtime": 23.631, + "eval_samples_per_second": 16.842, + "eval_steps_per_second": 2.116, + "step": 665 + }, + { + "epoch": 3.4507772020725387, + "grad_norm": 4.5, + "learning_rate": 1.5492227979274612e-05, + "loss": 0.3262, + "step": 666 + }, + { + "epoch": 3.4507772020725387, + "eval_accuracy": 0.6959798994974874, + "eval_loss": 0.6166378855705261, + "eval_runtime": 23.5776, + "eval_samples_per_second": 16.88, + "eval_steps_per_second": 2.121, + "step": 666 + }, + { + "epoch": 3.4559585492227978, + "grad_norm": 11.5625, + "learning_rate": 1.544041450777202e-05, + "loss": 0.8477, + "step": 667 + }, + { + "epoch": 3.4559585492227978, + "eval_accuracy": 0.6959798994974874, + "eval_loss": 0.6169715523719788, + "eval_runtime": 23.6027, + "eval_samples_per_second": 16.862, + "eval_steps_per_second": 2.118, + "step": 667 + }, + { + "epoch": 3.461139896373057, + "grad_norm": 4.96875, + "learning_rate": 1.5388601036269432e-05, + "loss": 0.3887, + "step": 668 + }, + { + "epoch": 3.461139896373057, + "eval_accuracy": 0.6959798994974874, + "eval_loss": 0.6171875, + "eval_runtime": 23.5938, + "eval_samples_per_second": 16.869, + "eval_steps_per_second": 2.119, + "step": 668 + }, + { + "epoch": 3.466321243523316, + "grad_norm": 5.1875, + "learning_rate": 1.533678756476684e-05, + "loss": 0.3184, + "step": 669 + }, + { + "epoch": 3.466321243523316, + "eval_accuracy": 0.6984924623115578, + "eval_loss": 0.6168734431266785, + "eval_runtime": 23.5394, + "eval_samples_per_second": 16.908, + "eval_steps_per_second": 2.124, + "step": 669 + }, + { + "epoch": 3.471502590673575, + "grad_norm": 5.03125, + "learning_rate": 1.528497409326425e-05, + "loss": 0.416, + "step": 670 + }, + { + "epoch": 3.471502590673575, + "eval_accuracy": 0.6984924623115578, + "eval_loss": 0.6175015568733215, + "eval_runtime": 23.5672, + "eval_samples_per_second": 16.888, + "eval_steps_per_second": 2.122, + "step": 670 + }, + { + "epoch": 3.476683937823834, + "grad_norm": 6.65625, + "learning_rate": 1.5233160621761657e-05, + "loss": 0.4395, + "step": 671 + }, + { + "epoch": 3.476683937823834, + "eval_accuracy": 0.6959798994974874, + "eval_loss": 0.6171875, + "eval_runtime": 23.5217, + "eval_samples_per_second": 16.921, + "eval_steps_per_second": 2.126, + "step": 671 + }, + { + "epoch": 3.481865284974093, + "grad_norm": 5.875, + "learning_rate": 1.5181347150259068e-05, + "loss": 0.4375, + "step": 672 + }, + { + "epoch": 3.481865284974093, + "eval_accuracy": 0.6934673366834171, + "eval_loss": 0.6174034476280212, + "eval_runtime": 23.5578, + "eval_samples_per_second": 16.895, + "eval_steps_per_second": 2.122, + "step": 672 + }, + { + "epoch": 3.4870466321243523, + "grad_norm": 4.71875, + "learning_rate": 1.5129533678756478e-05, + "loss": 0.4434, + "step": 673 + }, + { + "epoch": 3.4870466321243523, + "eval_accuracy": 0.6909547738693468, + "eval_loss": 0.6176193356513977, + "eval_runtime": 23.5676, + "eval_samples_per_second": 16.888, + "eval_steps_per_second": 2.122, + "step": 673 + }, + { + "epoch": 3.4922279792746114, + "grad_norm": 10.5, + "learning_rate": 1.5077720207253888e-05, + "loss": 0.9258, + "step": 674 + }, + { + "epoch": 3.4922279792746114, + "eval_accuracy": 0.6959798994974874, + "eval_loss": 0.6175801157951355, + "eval_runtime": 23.6225, + "eval_samples_per_second": 16.848, + "eval_steps_per_second": 2.117, + "step": 674 + }, + { + "epoch": 3.4974093264248705, + "grad_norm": 10.0, + "learning_rate": 1.5025906735751296e-05, + "loss": 0.7305, + "step": 675 + }, + { + "epoch": 3.4974093264248705, + "eval_accuracy": 0.7010050251256281, + "eval_loss": 0.6177371144294739, + "eval_runtime": 23.612, + "eval_samples_per_second": 16.856, + "eval_steps_per_second": 2.118, + "step": 675 + }, + { + "epoch": 3.5025906735751295, + "grad_norm": 3.875, + "learning_rate": 1.4974093264248706e-05, + "loss": 0.2451, + "step": 676 + }, + { + "epoch": 3.5025906735751295, + "eval_accuracy": 0.7010050251256281, + "eval_loss": 0.6174426674842834, + "eval_runtime": 23.6277, + "eval_samples_per_second": 16.845, + "eval_steps_per_second": 2.116, + "step": 676 + }, + { + "epoch": 3.5077720207253886, + "grad_norm": 5.9375, + "learning_rate": 1.4922279792746113e-05, + "loss": 0.498, + "step": 677 + }, + { + "epoch": 3.5077720207253886, + "eval_accuracy": 0.7010050251256281, + "eval_loss": 0.6181100606918335, + "eval_runtime": 23.6452, + "eval_samples_per_second": 16.832, + "eval_steps_per_second": 2.115, + "step": 677 + }, + { + "epoch": 3.5129533678756477, + "grad_norm": 6.71875, + "learning_rate": 1.4870466321243523e-05, + "loss": 0.5039, + "step": 678 + }, + { + "epoch": 3.5129533678756477, + "eval_accuracy": 0.6909547738693468, + "eval_loss": 0.6180315613746643, + "eval_runtime": 23.6842, + "eval_samples_per_second": 16.804, + "eval_steps_per_second": 2.111, + "step": 678 + }, + { + "epoch": 3.518134715025907, + "grad_norm": 5.1875, + "learning_rate": 1.4818652849740933e-05, + "loss": 0.4121, + "step": 679 + }, + { + "epoch": 3.518134715025907, + "eval_accuracy": 0.6959798994974874, + "eval_loss": 0.6176782250404358, + "eval_runtime": 23.6923, + "eval_samples_per_second": 16.799, + "eval_steps_per_second": 2.11, + "step": 679 + }, + { + "epoch": 3.523316062176166, + "grad_norm": 5.65625, + "learning_rate": 1.4766839378238342e-05, + "loss": 0.459, + "step": 680 + }, + { + "epoch": 3.523316062176166, + "eval_accuracy": 0.6909547738693468, + "eval_loss": 0.6184241771697998, + "eval_runtime": 23.6659, + "eval_samples_per_second": 16.817, + "eval_steps_per_second": 2.113, + "step": 680 + }, + { + "epoch": 3.528497409326425, + "grad_norm": 8.375, + "learning_rate": 1.4715025906735752e-05, + "loss": 0.8008, + "step": 681 + }, + { + "epoch": 3.528497409326425, + "eval_accuracy": 0.6984924623115578, + "eval_loss": 0.6182278394699097, + "eval_runtime": 23.7036, + "eval_samples_per_second": 16.791, + "eval_steps_per_second": 2.109, + "step": 681 + }, + { + "epoch": 3.533678756476684, + "grad_norm": 5.09375, + "learning_rate": 1.4663212435233162e-05, + "loss": 0.3984, + "step": 682 + }, + { + "epoch": 3.533678756476684, + "eval_accuracy": 0.6959798994974874, + "eval_loss": 0.6184830665588379, + "eval_runtime": 23.6595, + "eval_samples_per_second": 16.822, + "eval_steps_per_second": 2.113, + "step": 682 + }, + { + "epoch": 3.538860103626943, + "grad_norm": 6.875, + "learning_rate": 1.4611398963730572e-05, + "loss": 0.5195, + "step": 683 + }, + { + "epoch": 3.538860103626943, + "eval_accuracy": 0.6959798994974874, + "eval_loss": 0.6189934015274048, + "eval_runtime": 23.6653, + "eval_samples_per_second": 16.818, + "eval_steps_per_second": 2.113, + "step": 683 + }, + { + "epoch": 3.5440414507772022, + "grad_norm": 7.4375, + "learning_rate": 1.455958549222798e-05, + "loss": 0.6875, + "step": 684 + }, + { + "epoch": 3.5440414507772022, + "eval_accuracy": 0.6984924623115578, + "eval_loss": 0.6189149022102356, + "eval_runtime": 23.6513, + "eval_samples_per_second": 16.828, + "eval_steps_per_second": 2.114, + "step": 684 + }, + { + "epoch": 3.5492227979274613, + "grad_norm": 6.84375, + "learning_rate": 1.4507772020725389e-05, + "loss": 0.5938, + "step": 685 + }, + { + "epoch": 3.5492227979274613, + "eval_accuracy": 0.6959798994974874, + "eval_loss": 0.6185811758041382, + "eval_runtime": 23.6419, + "eval_samples_per_second": 16.835, + "eval_steps_per_second": 2.115, + "step": 685 + }, + { + "epoch": 3.5544041450777204, + "grad_norm": 6.0, + "learning_rate": 1.4455958549222797e-05, + "loss": 0.4629, + "step": 686 + }, + { + "epoch": 3.5544041450777204, + "eval_accuracy": 0.6984924623115578, + "eval_loss": 0.617835283279419, + "eval_runtime": 23.5476, + "eval_samples_per_second": 16.902, + "eval_steps_per_second": 2.123, + "step": 686 + }, + { + "epoch": 3.5595854922279795, + "grad_norm": 12.5, + "learning_rate": 1.4404145077720207e-05, + "loss": 0.8867, + "step": 687 + }, + { + "epoch": 3.5595854922279795, + "eval_accuracy": 0.6909547738693468, + "eval_loss": 0.6186400651931763, + "eval_runtime": 23.5578, + "eval_samples_per_second": 16.895, + "eval_steps_per_second": 2.122, + "step": 687 + }, + { + "epoch": 3.5647668393782386, + "grad_norm": 7.125, + "learning_rate": 1.4352331606217617e-05, + "loss": 0.6523, + "step": 688 + }, + { + "epoch": 3.5647668393782386, + "eval_accuracy": 0.6934673366834171, + "eval_loss": 0.6187382340431213, + "eval_runtime": 23.492, + "eval_samples_per_second": 16.942, + "eval_steps_per_second": 2.128, + "step": 688 + }, + { + "epoch": 3.5699481865284977, + "grad_norm": 7.0625, + "learning_rate": 1.4300518134715028e-05, + "loss": 0.6211, + "step": 689 + }, + { + "epoch": 3.5699481865284977, + "eval_accuracy": 0.6959798994974874, + "eval_loss": 0.6186596751213074, + "eval_runtime": 23.5327, + "eval_samples_per_second": 16.913, + "eval_steps_per_second": 2.125, + "step": 689 + }, + { + "epoch": 3.5751295336787567, + "grad_norm": 7.09375, + "learning_rate": 1.4248704663212436e-05, + "loss": 0.6875, + "step": 690 + }, + { + "epoch": 3.5751295336787567, + "eval_accuracy": 0.6934673366834171, + "eval_loss": 0.6188363432884216, + "eval_runtime": 23.5754, + "eval_samples_per_second": 16.882, + "eval_steps_per_second": 2.121, + "step": 690 + }, + { + "epoch": 3.5803108808290154, + "grad_norm": 6.9375, + "learning_rate": 1.4196891191709846e-05, + "loss": 0.2578, + "step": 691 + }, + { + "epoch": 3.5803108808290154, + "eval_accuracy": 0.6984924623115578, + "eval_loss": 0.6183456182479858, + "eval_runtime": 23.6578, + "eval_samples_per_second": 16.823, + "eval_steps_per_second": 2.113, + "step": 691 + }, + { + "epoch": 3.5854922279792745, + "grad_norm": 7.0625, + "learning_rate": 1.4145077720207253e-05, + "loss": 0.5508, + "step": 692 + }, + { + "epoch": 3.5854922279792745, + "eval_accuracy": 0.7035175879396985, + "eval_loss": 0.6180904507637024, + "eval_runtime": 23.6817, + "eval_samples_per_second": 16.806, + "eval_steps_per_second": 2.111, + "step": 692 + }, + { + "epoch": 3.5906735751295336, + "grad_norm": 24.5, + "learning_rate": 1.4093264248704663e-05, + "loss": 1.2734, + "step": 693 + }, + { + "epoch": 3.5906735751295336, + "eval_accuracy": 0.6934673366834171, + "eval_loss": 0.6183260083198547, + "eval_runtime": 23.6564, + "eval_samples_per_second": 16.824, + "eval_steps_per_second": 2.114, + "step": 693 + }, + { + "epoch": 3.5958549222797926, + "grad_norm": 9.3125, + "learning_rate": 1.4041450777202073e-05, + "loss": 0.8906, + "step": 694 + }, + { + "epoch": 3.5958549222797926, + "eval_accuracy": 0.6959798994974874, + "eval_loss": 0.6182278394699097, + "eval_runtime": 23.6609, + "eval_samples_per_second": 16.821, + "eval_steps_per_second": 2.113, + "step": 694 + }, + { + "epoch": 3.6010362694300517, + "grad_norm": 5.875, + "learning_rate": 1.3989637305699481e-05, + "loss": 0.4707, + "step": 695 + }, + { + "epoch": 3.6010362694300517, + "eval_accuracy": 0.6984924623115578, + "eval_loss": 0.6179726719856262, + "eval_runtime": 23.6902, + "eval_samples_per_second": 16.8, + "eval_steps_per_second": 2.111, + "step": 695 + }, + { + "epoch": 3.606217616580311, + "grad_norm": 5.78125, + "learning_rate": 1.3937823834196892e-05, + "loss": 0.4785, + "step": 696 + }, + { + "epoch": 3.606217616580311, + "eval_accuracy": 0.6884422110552764, + "eval_loss": 0.6182671189308167, + "eval_runtime": 23.6642, + "eval_samples_per_second": 16.819, + "eval_steps_per_second": 2.113, + "step": 696 + }, + { + "epoch": 3.61139896373057, + "grad_norm": 6.6875, + "learning_rate": 1.3886010362694302e-05, + "loss": 0.5039, + "step": 697 + }, + { + "epoch": 3.61139896373057, + "eval_accuracy": 0.6909547738693468, + "eval_loss": 0.6182475090026855, + "eval_runtime": 23.6765, + "eval_samples_per_second": 16.81, + "eval_steps_per_second": 2.112, + "step": 697 + }, + { + "epoch": 3.616580310880829, + "grad_norm": 21.625, + "learning_rate": 1.3834196891191712e-05, + "loss": 0.8828, + "step": 698 + }, + { + "epoch": 3.616580310880829, + "eval_accuracy": 0.6959798994974874, + "eval_loss": 0.6180904507637024, + "eval_runtime": 23.7175, + "eval_samples_per_second": 16.781, + "eval_steps_per_second": 2.108, + "step": 698 + }, + { + "epoch": 3.621761658031088, + "grad_norm": 5.3125, + "learning_rate": 1.378238341968912e-05, + "loss": 0.3066, + "step": 699 + }, + { + "epoch": 3.621761658031088, + "eval_accuracy": 0.6984924623115578, + "eval_loss": 0.6179530620574951, + "eval_runtime": 23.6572, + "eval_samples_per_second": 16.824, + "eval_steps_per_second": 2.114, + "step": 699 + }, + { + "epoch": 3.626943005181347, + "grad_norm": 10.375, + "learning_rate": 1.3730569948186529e-05, + "loss": 0.6328, + "step": 700 + }, + { + "epoch": 3.626943005181347, + "eval_accuracy": 0.6984924623115578, + "eval_loss": 0.6181493401527405, + "eval_runtime": 23.5952, + "eval_samples_per_second": 16.868, + "eval_steps_per_second": 2.119, + "step": 700 + }, + { + "epoch": 3.6321243523316062, + "grad_norm": 7.40625, + "learning_rate": 1.3678756476683937e-05, + "loss": 0.6328, + "step": 701 + }, + { + "epoch": 3.6321243523316062, + "eval_accuracy": 0.6934673366834171, + "eval_loss": 0.6183063983917236, + "eval_runtime": 23.5775, + "eval_samples_per_second": 16.88, + "eval_steps_per_second": 2.121, + "step": 701 + }, + { + "epoch": 3.6373056994818653, + "grad_norm": 10.0, + "learning_rate": 1.3626943005181347e-05, + "loss": 0.8672, + "step": 702 + }, + { + "epoch": 3.6373056994818653, + "eval_accuracy": 0.6934673366834171, + "eval_loss": 0.6185615658760071, + "eval_runtime": 23.5714, + "eval_samples_per_second": 16.885, + "eval_steps_per_second": 2.121, + "step": 702 + }, + { + "epoch": 3.6424870466321244, + "grad_norm": 7.0625, + "learning_rate": 1.3575129533678757e-05, + "loss": 0.6094, + "step": 703 + }, + { + "epoch": 3.6424870466321244, + "eval_accuracy": 0.6959798994974874, + "eval_loss": 0.6183063983917236, + "eval_runtime": 23.6287, + "eval_samples_per_second": 16.844, + "eval_steps_per_second": 2.116, + "step": 703 + }, + { + "epoch": 3.6476683937823835, + "grad_norm": 9.25, + "learning_rate": 1.3523316062176167e-05, + "loss": 0.5547, + "step": 704 + }, + { + "epoch": 3.6476683937823835, + "eval_accuracy": 0.7035175879396985, + "eval_loss": 0.6179726719856262, + "eval_runtime": 23.6203, + "eval_samples_per_second": 16.85, + "eval_steps_per_second": 2.117, + "step": 704 + }, + { + "epoch": 3.6528497409326426, + "grad_norm": 9.4375, + "learning_rate": 1.3471502590673576e-05, + "loss": 0.8867, + "step": 705 + }, + { + "epoch": 3.6528497409326426, + "eval_accuracy": 0.7010050251256281, + "eval_loss": 0.6188363432884216, + "eval_runtime": 23.6723, + "eval_samples_per_second": 16.813, + "eval_steps_per_second": 2.112, + "step": 705 + }, + { + "epoch": 3.6580310880829017, + "grad_norm": 6.9375, + "learning_rate": 1.3419689119170986e-05, + "loss": 0.6406, + "step": 706 + }, + { + "epoch": 3.6580310880829017, + "eval_accuracy": 0.6934673366834171, + "eval_loss": 0.6186008453369141, + "eval_runtime": 23.6543, + "eval_samples_per_second": 16.826, + "eval_steps_per_second": 2.114, + "step": 706 + }, + { + "epoch": 3.6632124352331608, + "grad_norm": 5.96875, + "learning_rate": 1.3367875647668393e-05, + "loss": 0.4844, + "step": 707 + }, + { + "epoch": 3.6632124352331608, + "eval_accuracy": 0.6909547738693468, + "eval_loss": 0.6197589635848999, + "eval_runtime": 23.6321, + "eval_samples_per_second": 16.842, + "eval_steps_per_second": 2.116, + "step": 707 + }, + { + "epoch": 3.66839378238342, + "grad_norm": 6.5625, + "learning_rate": 1.3316062176165803e-05, + "loss": 0.5078, + "step": 708 + }, + { + "epoch": 3.66839378238342, + "eval_accuracy": 0.6909547738693468, + "eval_loss": 0.619111180305481, + "eval_runtime": 23.6647, + "eval_samples_per_second": 16.818, + "eval_steps_per_second": 2.113, + "step": 708 + }, + { + "epoch": 3.6735751295336785, + "grad_norm": 7.78125, + "learning_rate": 1.3264248704663213e-05, + "loss": 0.6484, + "step": 709 + }, + { + "epoch": 3.6735751295336785, + "eval_accuracy": 0.6959798994974874, + "eval_loss": 0.6194645166397095, + "eval_runtime": 23.6413, + "eval_samples_per_second": 16.835, + "eval_steps_per_second": 2.115, + "step": 709 + }, + { + "epoch": 3.6787564766839376, + "grad_norm": 10.125, + "learning_rate": 1.3212435233160623e-05, + "loss": 0.6602, + "step": 710 + }, + { + "epoch": 3.6787564766839376, + "eval_accuracy": 0.6984924623115578, + "eval_loss": 0.6189541220664978, + "eval_runtime": 23.6439, + "eval_samples_per_second": 16.833, + "eval_steps_per_second": 2.115, + "step": 710 + }, + { + "epoch": 3.6839378238341967, + "grad_norm": 10.375, + "learning_rate": 1.3160621761658031e-05, + "loss": 0.6758, + "step": 711 + }, + { + "epoch": 3.6839378238341967, + "eval_accuracy": 0.7010050251256281, + "eval_loss": 0.619111180305481, + "eval_runtime": 23.6799, + "eval_samples_per_second": 16.807, + "eval_steps_per_second": 2.111, + "step": 711 + }, + { + "epoch": 3.6891191709844557, + "grad_norm": 7.46875, + "learning_rate": 1.3108808290155442e-05, + "loss": 0.6484, + "step": 712 + }, + { + "epoch": 3.6891191709844557, + "eval_accuracy": 0.6934673366834171, + "eval_loss": 0.619111180305481, + "eval_runtime": 23.6431, + "eval_samples_per_second": 16.834, + "eval_steps_per_second": 2.115, + "step": 712 + }, + { + "epoch": 3.694300518134715, + "grad_norm": 6.03125, + "learning_rate": 1.3056994818652852e-05, + "loss": 0.4805, + "step": 713 + }, + { + "epoch": 3.694300518134715, + "eval_accuracy": 0.6909547738693468, + "eval_loss": 0.6192682385444641, + "eval_runtime": 23.6395, + "eval_samples_per_second": 16.836, + "eval_steps_per_second": 2.115, + "step": 713 + }, + { + "epoch": 3.699481865284974, + "grad_norm": 5.0625, + "learning_rate": 1.3005181347150262e-05, + "loss": 0.4238, + "step": 714 + }, + { + "epoch": 3.699481865284974, + "eval_accuracy": 0.6959798994974874, + "eval_loss": 0.6192289590835571, + "eval_runtime": 23.6813, + "eval_samples_per_second": 16.807, + "eval_steps_per_second": 2.111, + "step": 714 + }, + { + "epoch": 3.704663212435233, + "grad_norm": 19.25, + "learning_rate": 1.2953367875647668e-05, + "loss": 0.9922, + "step": 715 + }, + { + "epoch": 3.704663212435233, + "eval_accuracy": 0.7010050251256281, + "eval_loss": 0.6193467378616333, + "eval_runtime": 23.6381, + "eval_samples_per_second": 16.837, + "eval_steps_per_second": 2.115, + "step": 715 + }, + { + "epoch": 3.709844559585492, + "grad_norm": 10.4375, + "learning_rate": 1.2901554404145077e-05, + "loss": 0.7812, + "step": 716 + }, + { + "epoch": 3.709844559585492, + "eval_accuracy": 0.6859296482412061, + "eval_loss": 0.6194252371788025, + "eval_runtime": 23.6689, + "eval_samples_per_second": 16.815, + "eval_steps_per_second": 2.112, + "step": 716 + }, + { + "epoch": 3.715025906735751, + "grad_norm": 5.6875, + "learning_rate": 1.2849740932642487e-05, + "loss": 0.4336, + "step": 717 + }, + { + "epoch": 3.715025906735751, + "eval_accuracy": 0.6934673366834171, + "eval_loss": 0.6196215748786926, + "eval_runtime": 23.6699, + "eval_samples_per_second": 16.815, + "eval_steps_per_second": 2.112, + "step": 717 + }, + { + "epoch": 3.7202072538860103, + "grad_norm": 6.0625, + "learning_rate": 1.2797927461139897e-05, + "loss": 0.4746, + "step": 718 + }, + { + "epoch": 3.7202072538860103, + "eval_accuracy": 0.6959798994974874, + "eval_loss": 0.6194645166397095, + "eval_runtime": 23.6288, + "eval_samples_per_second": 16.844, + "eval_steps_per_second": 2.116, + "step": 718 + }, + { + "epoch": 3.7253886010362693, + "grad_norm": 11.0, + "learning_rate": 1.2746113989637307e-05, + "loss": 0.9219, + "step": 719 + }, + { + "epoch": 3.7253886010362693, + "eval_accuracy": 0.6934673366834171, + "eval_loss": 0.6189345121383667, + "eval_runtime": 23.663, + "eval_samples_per_second": 16.82, + "eval_steps_per_second": 2.113, + "step": 719 + }, + { + "epoch": 3.7305699481865284, + "grad_norm": 7.875, + "learning_rate": 1.2694300518134716e-05, + "loss": 0.5898, + "step": 720 + }, + { + "epoch": 3.7305699481865284, + "eval_accuracy": 0.6959798994974874, + "eval_loss": 0.6192289590835571, + "eval_runtime": 23.6249, + "eval_samples_per_second": 16.847, + "eval_steps_per_second": 2.116, + "step": 720 + }, + { + "epoch": 3.7357512953367875, + "grad_norm": 8.75, + "learning_rate": 1.2642487046632126e-05, + "loss": 0.6602, + "step": 721 + }, + { + "epoch": 3.7357512953367875, + "eval_accuracy": 0.6934673366834171, + "eval_loss": 0.6196804046630859, + "eval_runtime": 23.6252, + "eval_samples_per_second": 16.846, + "eval_steps_per_second": 2.116, + "step": 721 + }, + { + "epoch": 3.7409326424870466, + "grad_norm": 7.0625, + "learning_rate": 1.2590673575129532e-05, + "loss": 0.6641, + "step": 722 + }, + { + "epoch": 3.7409326424870466, + "eval_accuracy": 0.6884422110552764, + "eval_loss": 0.619111180305481, + "eval_runtime": 23.665, + "eval_samples_per_second": 16.818, + "eval_steps_per_second": 2.113, + "step": 722 + }, + { + "epoch": 3.7461139896373057, + "grad_norm": 5.3125, + "learning_rate": 1.2538860103626943e-05, + "loss": 0.4082, + "step": 723 + }, + { + "epoch": 3.7461139896373057, + "eval_accuracy": 0.6934673366834171, + "eval_loss": 0.6191896796226501, + "eval_runtime": 23.6621, + "eval_samples_per_second": 16.82, + "eval_steps_per_second": 2.113, + "step": 723 + }, + { + "epoch": 3.7512953367875648, + "grad_norm": 6.71875, + "learning_rate": 1.2487046632124353e-05, + "loss": 0.6094, + "step": 724 + }, + { + "epoch": 3.7512953367875648, + "eval_accuracy": 0.6909547738693468, + "eval_loss": 0.6190326809883118, + "eval_runtime": 23.6518, + "eval_samples_per_second": 16.827, + "eval_steps_per_second": 2.114, + "step": 724 + }, + { + "epoch": 3.756476683937824, + "grad_norm": 4.09375, + "learning_rate": 1.2435233160621763e-05, + "loss": 0.2383, + "step": 725 + }, + { + "epoch": 3.756476683937824, + "eval_accuracy": 0.6934673366834171, + "eval_loss": 0.6200141310691833, + "eval_runtime": 23.6522, + "eval_samples_per_second": 16.827, + "eval_steps_per_second": 2.114, + "step": 725 + }, + { + "epoch": 3.761658031088083, + "grad_norm": 8.625, + "learning_rate": 1.2383419689119171e-05, + "loss": 0.4766, + "step": 726 + }, + { + "epoch": 3.761658031088083, + "eval_accuracy": 0.6934673366834171, + "eval_loss": 0.6203478574752808, + "eval_runtime": 23.6096, + "eval_samples_per_second": 16.858, + "eval_steps_per_second": 2.118, + "step": 726 + }, + { + "epoch": 3.766839378238342, + "grad_norm": 7.71875, + "learning_rate": 1.2331606217616581e-05, + "loss": 0.582, + "step": 727 + }, + { + "epoch": 3.766839378238342, + "eval_accuracy": 0.6909547738693468, + "eval_loss": 0.6201515197753906, + "eval_runtime": 23.6491, + "eval_samples_per_second": 16.829, + "eval_steps_per_second": 2.114, + "step": 727 + }, + { + "epoch": 3.772020725388601, + "grad_norm": 12.1875, + "learning_rate": 1.227979274611399e-05, + "loss": 1.0625, + "step": 728 + }, + { + "epoch": 3.772020725388601, + "eval_accuracy": 0.6959798994974874, + "eval_loss": 0.6206815242767334, + "eval_runtime": 23.6075, + "eval_samples_per_second": 16.859, + "eval_steps_per_second": 2.118, + "step": 728 + }, + { + "epoch": 3.77720207253886, + "grad_norm": 6.84375, + "learning_rate": 1.22279792746114e-05, + "loss": 0.3262, + "step": 729 + }, + { + "epoch": 3.77720207253886, + "eval_accuracy": 0.6934673366834171, + "eval_loss": 0.6207404136657715, + "eval_runtime": 23.6123, + "eval_samples_per_second": 16.856, + "eval_steps_per_second": 2.118, + "step": 729 + }, + { + "epoch": 3.7823834196891193, + "grad_norm": 9.5, + "learning_rate": 1.2176165803108808e-05, + "loss": 0.7031, + "step": 730 + }, + { + "epoch": 3.7823834196891193, + "eval_accuracy": 0.7010050251256281, + "eval_loss": 0.6206815242767334, + "eval_runtime": 23.6162, + "eval_samples_per_second": 16.853, + "eval_steps_per_second": 2.117, + "step": 730 + }, + { + "epoch": 3.7875647668393784, + "grad_norm": 5.34375, + "learning_rate": 1.2124352331606218e-05, + "loss": 0.377, + "step": 731 + }, + { + "epoch": 3.7875647668393784, + "eval_accuracy": 0.6934673366834171, + "eval_loss": 0.6207600235939026, + "eval_runtime": 23.6144, + "eval_samples_per_second": 16.854, + "eval_steps_per_second": 2.117, + "step": 731 + }, + { + "epoch": 3.7927461139896375, + "grad_norm": 23.125, + "learning_rate": 1.2072538860103627e-05, + "loss": 0.9375, + "step": 732 + }, + { + "epoch": 3.7927461139896375, + "eval_accuracy": 0.6959798994974874, + "eval_loss": 0.6219378113746643, + "eval_runtime": 23.6126, + "eval_samples_per_second": 16.855, + "eval_steps_per_second": 2.118, + "step": 732 + }, + { + "epoch": 3.7979274611398965, + "grad_norm": 7.46875, + "learning_rate": 1.2020725388601037e-05, + "loss": 0.3594, + "step": 733 + }, + { + "epoch": 3.7979274611398965, + "eval_accuracy": 0.6984924623115578, + "eval_loss": 0.6208778023719788, + "eval_runtime": 23.6188, + "eval_samples_per_second": 16.851, + "eval_steps_per_second": 2.117, + "step": 733 + }, + { + "epoch": 3.8031088082901556, + "grad_norm": 4.78125, + "learning_rate": 1.1968911917098447e-05, + "loss": 0.3047, + "step": 734 + }, + { + "epoch": 3.8031088082901556, + "eval_accuracy": 0.6884422110552764, + "eval_loss": 0.6214470863342285, + "eval_runtime": 23.6511, + "eval_samples_per_second": 16.828, + "eval_steps_per_second": 2.114, + "step": 734 + }, + { + "epoch": 3.8082901554404147, + "grad_norm": 7.0, + "learning_rate": 1.1917098445595855e-05, + "loss": 0.6875, + "step": 735 + }, + { + "epoch": 3.8082901554404147, + "eval_accuracy": 0.6959798994974874, + "eval_loss": 0.6220163106918335, + "eval_runtime": 23.6181, + "eval_samples_per_second": 16.851, + "eval_steps_per_second": 2.117, + "step": 735 + }, + { + "epoch": 3.813471502590674, + "grad_norm": 7.15625, + "learning_rate": 1.1865284974093264e-05, + "loss": 0.6367, + "step": 736 + }, + { + "epoch": 3.813471502590674, + "eval_accuracy": 0.6984924623115578, + "eval_loss": 0.6219770908355713, + "eval_runtime": 23.6119, + "eval_samples_per_second": 16.856, + "eval_steps_per_second": 2.118, + "step": 736 + }, + { + "epoch": 3.818652849740933, + "grad_norm": 6.1875, + "learning_rate": 1.1813471502590674e-05, + "loss": 0.5508, + "step": 737 + }, + { + "epoch": 3.818652849740933, + "eval_accuracy": 0.6959798994974874, + "eval_loss": 0.6218396425247192, + "eval_runtime": 23.6145, + "eval_samples_per_second": 16.854, + "eval_steps_per_second": 2.117, + "step": 737 + }, + { + "epoch": 3.823834196891192, + "grad_norm": 7.09375, + "learning_rate": 1.1761658031088084e-05, + "loss": 0.5039, + "step": 738 + }, + { + "epoch": 3.823834196891192, + "eval_accuracy": 0.7035175879396985, + "eval_loss": 0.6214863657951355, + "eval_runtime": 23.6477, + "eval_samples_per_second": 16.83, + "eval_steps_per_second": 2.114, + "step": 738 + }, + { + "epoch": 3.8290155440414506, + "grad_norm": 5.09375, + "learning_rate": 1.1709844559585493e-05, + "loss": 0.4473, + "step": 739 + }, + { + "epoch": 3.8290155440414506, + "eval_accuracy": 0.6984924623115578, + "eval_loss": 0.6212704181671143, + "eval_runtime": 23.6181, + "eval_samples_per_second": 16.851, + "eval_steps_per_second": 2.117, + "step": 739 + }, + { + "epoch": 3.8341968911917097, + "grad_norm": 8.625, + "learning_rate": 1.1658031088082903e-05, + "loss": 0.7734, + "step": 740 + }, + { + "epoch": 3.8341968911917097, + "eval_accuracy": 0.6959798994974874, + "eval_loss": 0.6226052045822144, + "eval_runtime": 23.6224, + "eval_samples_per_second": 16.848, + "eval_steps_per_second": 2.117, + "step": 740 + }, + { + "epoch": 3.839378238341969, + "grad_norm": 9.6875, + "learning_rate": 1.1606217616580311e-05, + "loss": 0.8164, + "step": 741 + }, + { + "epoch": 3.839378238341969, + "eval_accuracy": 0.6959798994974874, + "eval_loss": 0.6222126483917236, + "eval_runtime": 23.6391, + "eval_samples_per_second": 16.837, + "eval_steps_per_second": 2.115, + "step": 741 + }, + { + "epoch": 3.844559585492228, + "grad_norm": 5.28125, + "learning_rate": 1.1554404145077721e-05, + "loss": 0.416, + "step": 742 + }, + { + "epoch": 3.844559585492228, + "eval_accuracy": 0.6934673366834171, + "eval_loss": 0.6228014826774597, + "eval_runtime": 23.6324, + "eval_samples_per_second": 16.841, + "eval_steps_per_second": 2.116, + "step": 742 + }, + { + "epoch": 3.849740932642487, + "grad_norm": 5.5, + "learning_rate": 1.150259067357513e-05, + "loss": 0.3379, + "step": 743 + }, + { + "epoch": 3.849740932642487, + "eval_accuracy": 0.6934673366834171, + "eval_loss": 0.6225267052650452, + "eval_runtime": 23.6758, + "eval_samples_per_second": 16.81, + "eval_steps_per_second": 2.112, + "step": 743 + }, + { + "epoch": 3.854922279792746, + "grad_norm": 6.0625, + "learning_rate": 1.145077720207254e-05, + "loss": 0.4414, + "step": 744 + }, + { + "epoch": 3.854922279792746, + "eval_accuracy": 0.6984924623115578, + "eval_loss": 0.6229978203773499, + "eval_runtime": 23.637, + "eval_samples_per_second": 16.838, + "eval_steps_per_second": 2.115, + "step": 744 + }, + { + "epoch": 3.860103626943005, + "grad_norm": 12.125, + "learning_rate": 1.139896373056995e-05, + "loss": 0.9141, + "step": 745 + }, + { + "epoch": 3.860103626943005, + "eval_accuracy": 0.6959798994974874, + "eval_loss": 0.6235474348068237, + "eval_runtime": 23.6831, + "eval_samples_per_second": 16.805, + "eval_steps_per_second": 2.111, + "step": 745 + }, + { + "epoch": 3.865284974093264, + "grad_norm": 11.375, + "learning_rate": 1.1347150259067358e-05, + "loss": 0.8711, + "step": 746 + }, + { + "epoch": 3.865284974093264, + "eval_accuracy": 0.7010050251256281, + "eval_loss": 0.6226052045822144, + "eval_runtime": 23.6973, + "eval_samples_per_second": 16.795, + "eval_steps_per_second": 2.11, + "step": 746 + }, + { + "epoch": 3.8704663212435233, + "grad_norm": 5.1875, + "learning_rate": 1.1295336787564767e-05, + "loss": 0.4355, + "step": 747 + }, + { + "epoch": 3.8704663212435233, + "eval_accuracy": 0.7010050251256281, + "eval_loss": 0.6235474348068237, + "eval_runtime": 23.6532, + "eval_samples_per_second": 16.827, + "eval_steps_per_second": 2.114, + "step": 747 + }, + { + "epoch": 3.8756476683937824, + "grad_norm": 7.34375, + "learning_rate": 1.1243523316062177e-05, + "loss": 0.7383, + "step": 748 + }, + { + "epoch": 3.8756476683937824, + "eval_accuracy": 0.7010050251256281, + "eval_loss": 0.6227622628211975, + "eval_runtime": 23.6509, + "eval_samples_per_second": 16.828, + "eval_steps_per_second": 2.114, + "step": 748 + }, + { + "epoch": 3.8808290155440415, + "grad_norm": 6.46875, + "learning_rate": 1.1191709844559587e-05, + "loss": 0.5898, + "step": 749 + }, + { + "epoch": 3.8808290155440415, + "eval_accuracy": 0.6934673366834171, + "eval_loss": 0.6229389309883118, + "eval_runtime": 23.6917, + "eval_samples_per_second": 16.799, + "eval_steps_per_second": 2.11, + "step": 749 + }, + { + "epoch": 3.8860103626943006, + "grad_norm": 6.1875, + "learning_rate": 1.1139896373056995e-05, + "loss": 0.5625, + "step": 750 + }, + { + "epoch": 3.8860103626943006, + "eval_accuracy": 0.6959798994974874, + "eval_loss": 0.6229978203773499, + "eval_runtime": 23.6599, + "eval_samples_per_second": 16.822, + "eval_steps_per_second": 2.113, + "step": 750 + }, + { + "epoch": 3.8911917098445596, + "grad_norm": 10.125, + "learning_rate": 1.1088082901554404e-05, + "loss": 0.8984, + "step": 751 + }, + { + "epoch": 3.8911917098445596, + "eval_accuracy": 0.7010050251256281, + "eval_loss": 0.6231940984725952, + "eval_runtime": 23.6281, + "eval_samples_per_second": 16.844, + "eval_steps_per_second": 2.116, + "step": 751 + }, + { + "epoch": 3.8963730569948187, + "grad_norm": 12.6875, + "learning_rate": 1.1036269430051814e-05, + "loss": 0.918, + "step": 752 + }, + { + "epoch": 3.8963730569948187, + "eval_accuracy": 0.6934673366834171, + "eval_loss": 0.6234296560287476, + "eval_runtime": 23.5357, + "eval_samples_per_second": 16.91, + "eval_steps_per_second": 2.124, + "step": 752 + }, + { + "epoch": 3.901554404145078, + "grad_norm": 14.125, + "learning_rate": 1.0984455958549224e-05, + "loss": 1.1953, + "step": 753 + }, + { + "epoch": 3.901554404145078, + "eval_accuracy": 0.7010050251256281, + "eval_loss": 0.6233707666397095, + "eval_runtime": 23.6405, + "eval_samples_per_second": 16.836, + "eval_steps_per_second": 2.115, + "step": 753 + }, + { + "epoch": 3.906735751295337, + "grad_norm": 7.46875, + "learning_rate": 1.0932642487046632e-05, + "loss": 0.6992, + "step": 754 + }, + { + "epoch": 3.906735751295337, + "eval_accuracy": 0.6959798994974874, + "eval_loss": 0.6229192614555359, + "eval_runtime": 23.6841, + "eval_samples_per_second": 16.805, + "eval_steps_per_second": 2.111, + "step": 754 + }, + { + "epoch": 3.911917098445596, + "grad_norm": 3.890625, + "learning_rate": 1.0880829015544042e-05, + "loss": 0.2559, + "step": 755 + }, + { + "epoch": 3.911917098445596, + "eval_accuracy": 0.6959798994974874, + "eval_loss": 0.6233707666397095, + "eval_runtime": 23.6506, + "eval_samples_per_second": 16.828, + "eval_steps_per_second": 2.114, + "step": 755 + }, + { + "epoch": 3.917098445595855, + "grad_norm": 5.03125, + "learning_rate": 1.0829015544041451e-05, + "loss": 0.3848, + "step": 756 + }, + { + "epoch": 3.917098445595855, + "eval_accuracy": 0.7035175879396985, + "eval_loss": 0.6236652135848999, + "eval_runtime": 23.6628, + "eval_samples_per_second": 16.82, + "eval_steps_per_second": 2.113, + "step": 756 + }, + { + "epoch": 3.9222797927461137, + "grad_norm": 7.15625, + "learning_rate": 1.0777202072538861e-05, + "loss": 0.6719, + "step": 757 + }, + { + "epoch": 3.9222797927461137, + "eval_accuracy": 0.6909547738693468, + "eval_loss": 0.6237044334411621, + "eval_runtime": 23.6784, + "eval_samples_per_second": 16.809, + "eval_steps_per_second": 2.112, + "step": 757 + }, + { + "epoch": 3.927461139896373, + "grad_norm": 6.5, + "learning_rate": 1.072538860103627e-05, + "loss": 0.5898, + "step": 758 + }, + { + "epoch": 3.927461139896373, + "eval_accuracy": 0.7060301507537688, + "eval_loss": 0.6235474348068237, + "eval_runtime": 23.6757, + "eval_samples_per_second": 16.81, + "eval_steps_per_second": 2.112, + "step": 758 + }, + { + "epoch": 3.932642487046632, + "grad_norm": 5.34375, + "learning_rate": 1.067357512953368e-05, + "loss": 0.4102, + "step": 759 + }, + { + "epoch": 3.932642487046632, + "eval_accuracy": 0.6984924623115578, + "eval_loss": 0.6241952180862427, + "eval_runtime": 23.7165, + "eval_samples_per_second": 16.782, + "eval_steps_per_second": 2.108, + "step": 759 + }, + { + "epoch": 3.937823834196891, + "grad_norm": 5.8125, + "learning_rate": 1.062176165803109e-05, + "loss": 0.3633, + "step": 760 + }, + { + "epoch": 3.937823834196891, + "eval_accuracy": 0.6934673366834171, + "eval_loss": 0.6244699954986572, + "eval_runtime": 23.6547, + "eval_samples_per_second": 16.825, + "eval_steps_per_second": 2.114, + "step": 760 + }, + { + "epoch": 3.94300518134715, + "grad_norm": 7.0, + "learning_rate": 1.0569948186528498e-05, + "loss": 0.5898, + "step": 761 + }, + { + "epoch": 3.94300518134715, + "eval_accuracy": 0.6959798994974874, + "eval_loss": 0.6246662735939026, + "eval_runtime": 23.6966, + "eval_samples_per_second": 16.796, + "eval_steps_per_second": 2.11, + "step": 761 + }, + { + "epoch": 3.948186528497409, + "grad_norm": 4.46875, + "learning_rate": 1.0518134715025906e-05, + "loss": 0.2637, + "step": 762 + }, + { + "epoch": 3.948186528497409, + "eval_accuracy": 0.7035175879396985, + "eval_loss": 0.625, + "eval_runtime": 23.62, + "eval_samples_per_second": 16.85, + "eval_steps_per_second": 2.117, + "step": 762 + }, + { + "epoch": 3.9533678756476682, + "grad_norm": 8.3125, + "learning_rate": 1.0466321243523317e-05, + "loss": 0.8125, + "step": 763 + }, + { + "epoch": 3.9533678756476682, + "eval_accuracy": 0.6959798994974874, + "eval_loss": 0.6252748370170593, + "eval_runtime": 23.5898, + "eval_samples_per_second": 16.872, + "eval_steps_per_second": 2.12, + "step": 763 + }, + { + "epoch": 3.9585492227979273, + "grad_norm": 8.625, + "learning_rate": 1.0414507772020727e-05, + "loss": 0.6562, + "step": 764 + }, + { + "epoch": 3.9585492227979273, + "eval_accuracy": 0.6984924623115578, + "eval_loss": 0.6259814500808716, + "eval_runtime": 23.5659, + "eval_samples_per_second": 16.889, + "eval_steps_per_second": 2.122, + "step": 764 + }, + { + "epoch": 3.9637305699481864, + "grad_norm": 7.0625, + "learning_rate": 1.0362694300518135e-05, + "loss": 0.5703, + "step": 765 + }, + { + "epoch": 3.9637305699481864, + "eval_accuracy": 0.6984924623115578, + "eval_loss": 0.6259225606918335, + "eval_runtime": 23.5418, + "eval_samples_per_second": 16.906, + "eval_steps_per_second": 2.124, + "step": 765 + }, + { + "epoch": 3.9689119170984455, + "grad_norm": 6.0625, + "learning_rate": 1.0310880829015544e-05, + "loss": 0.543, + "step": 766 + }, + { + "epoch": 3.9689119170984455, + "eval_accuracy": 0.6984924623115578, + "eval_loss": 0.6257851719856262, + "eval_runtime": 23.5622, + "eval_samples_per_second": 16.891, + "eval_steps_per_second": 2.122, + "step": 766 + }, + { + "epoch": 3.9740932642487046, + "grad_norm": 7.40625, + "learning_rate": 1.0259067357512954e-05, + "loss": 0.75, + "step": 767 + }, + { + "epoch": 3.9740932642487046, + "eval_accuracy": 0.7035175879396985, + "eval_loss": 0.625569224357605, + "eval_runtime": 23.5089, + "eval_samples_per_second": 16.93, + "eval_steps_per_second": 2.127, + "step": 767 + }, + { + "epoch": 3.9792746113989637, + "grad_norm": 15.75, + "learning_rate": 1.0207253886010364e-05, + "loss": 0.4473, + "step": 768 + }, + { + "epoch": 3.9792746113989637, + "eval_accuracy": 0.7010050251256281, + "eval_loss": 0.6255300045013428, + "eval_runtime": 23.4966, + "eval_samples_per_second": 16.939, + "eval_steps_per_second": 2.128, + "step": 768 + }, + { + "epoch": 3.9844559585492227, + "grad_norm": 8.25, + "learning_rate": 1.0155440414507772e-05, + "loss": 0.7266, + "step": 769 + }, + { + "epoch": 3.9844559585492227, + "eval_accuracy": 0.6984924623115578, + "eval_loss": 0.624293327331543, + "eval_runtime": 23.5206, + "eval_samples_per_second": 16.921, + "eval_steps_per_second": 2.126, + "step": 769 + }, + { + "epoch": 3.989637305699482, + "grad_norm": 7.1875, + "learning_rate": 1.0103626943005182e-05, + "loss": 0.4746, + "step": 770 + }, + { + "epoch": 3.989637305699482, + "eval_accuracy": 0.7010050251256281, + "eval_loss": 0.6247251629829407, + "eval_runtime": 23.5871, + "eval_samples_per_second": 16.874, + "eval_steps_per_second": 2.12, + "step": 770 + }, + { + "epoch": 3.994818652849741, + "grad_norm": 7.28125, + "learning_rate": 1.005181347150259e-05, + "loss": 0.5156, + "step": 771 + }, + { + "epoch": 3.994818652849741, + "eval_accuracy": 0.6959798994974874, + "eval_loss": 0.6251177787780762, + "eval_runtime": 23.6195, + "eval_samples_per_second": 16.85, + "eval_steps_per_second": 2.117, + "step": 771 + }, + { + "epoch": 4.0, + "grad_norm": 8.0625, + "learning_rate": 1e-05, + "loss": 0.3223, + "step": 772 + }, + { + "epoch": 4.0, + "eval_accuracy": 0.6959798994974874, + "eval_loss": 0.6251570582389832, + "eval_runtime": 23.6173, + "eval_samples_per_second": 16.852, + "eval_steps_per_second": 2.117, + "step": 772 + }, + { + "epoch": 4.005181347150259, + "grad_norm": 8.625, + "learning_rate": 9.94818652849741e-06, + "loss": 0.5039, + "step": 773 + }, + { + "epoch": 4.005181347150259, + "eval_accuracy": 0.6984924623115578, + "eval_loss": 0.6253926157951355, + "eval_runtime": 23.6604, + "eval_samples_per_second": 16.821, + "eval_steps_per_second": 2.113, + "step": 773 + }, + { + "epoch": 4.010362694300518, + "grad_norm": 5.84375, + "learning_rate": 9.89637305699482e-06, + "loss": 0.3145, + "step": 774 + }, + { + "epoch": 4.010362694300518, + "eval_accuracy": 0.6909547738693468, + "eval_loss": 0.625647783279419, + "eval_runtime": 23.6178, + "eval_samples_per_second": 16.852, + "eval_steps_per_second": 2.117, + "step": 774 + }, + { + "epoch": 4.015544041450777, + "grad_norm": 9.875, + "learning_rate": 9.84455958549223e-06, + "loss": 0.7695, + "step": 775 + }, + { + "epoch": 4.015544041450777, + "eval_accuracy": 0.6934673366834171, + "eval_loss": 0.625, + "eval_runtime": 23.6227, + "eval_samples_per_second": 16.848, + "eval_steps_per_second": 2.117, + "step": 775 + }, + { + "epoch": 4.020725388601036, + "grad_norm": 4.25, + "learning_rate": 9.792746113989638e-06, + "loss": 0.2949, + "step": 776 + }, + { + "epoch": 4.020725388601036, + "eval_accuracy": 0.7035175879396985, + "eval_loss": 0.6257262825965881, + "eval_runtime": 23.6284, + "eval_samples_per_second": 16.844, + "eval_steps_per_second": 2.116, + "step": 776 + }, + { + "epoch": 4.025906735751295, + "grad_norm": 7.5, + "learning_rate": 9.740932642487046e-06, + "loss": 0.5, + "step": 777 + }, + { + "epoch": 4.025906735751295, + "eval_accuracy": 0.7010050251256281, + "eval_loss": 0.6258833408355713, + "eval_runtime": 23.6256, + "eval_samples_per_second": 16.846, + "eval_steps_per_second": 2.116, + "step": 777 + }, + { + "epoch": 4.0310880829015545, + "grad_norm": 8.125, + "learning_rate": 9.689119170984456e-06, + "loss": 0.5898, + "step": 778 + }, + { + "epoch": 4.0310880829015545, + "eval_accuracy": 0.6984924623115578, + "eval_loss": 0.6261973977088928, + "eval_runtime": 23.6639, + "eval_samples_per_second": 16.819, + "eval_steps_per_second": 2.113, + "step": 778 + }, + { + "epoch": 4.036269430051814, + "grad_norm": 7.75, + "learning_rate": 9.637305699481867e-06, + "loss": 0.5156, + "step": 779 + }, + { + "epoch": 4.036269430051814, + "eval_accuracy": 0.7010050251256281, + "eval_loss": 0.6260011196136475, + "eval_runtime": 23.6226, + "eval_samples_per_second": 16.848, + "eval_steps_per_second": 2.117, + "step": 779 + }, + { + "epoch": 4.041450777202073, + "grad_norm": 6.75, + "learning_rate": 9.585492227979275e-06, + "loss": 0.4297, + "step": 780 + }, + { + "epoch": 4.041450777202073, + "eval_accuracy": 0.6984924623115578, + "eval_loss": 0.6266096234321594, + "eval_runtime": 23.6565, + "eval_samples_per_second": 16.824, + "eval_steps_per_second": 2.114, + "step": 780 + }, + { + "epoch": 4.046632124352332, + "grad_norm": 6.40625, + "learning_rate": 9.533678756476683e-06, + "loss": 0.6562, + "step": 781 + }, + { + "epoch": 4.046632124352332, + "eval_accuracy": 0.7035175879396985, + "eval_loss": 0.6264133453369141, + "eval_runtime": 23.6585, + "eval_samples_per_second": 16.823, + "eval_steps_per_second": 2.113, + "step": 781 + }, + { + "epoch": 4.051813471502591, + "grad_norm": 10.9375, + "learning_rate": 9.481865284974093e-06, + "loss": 0.7891, + "step": 782 + }, + { + "epoch": 4.051813471502591, + "eval_accuracy": 0.7010050251256281, + "eval_loss": 0.6268059015274048, + "eval_runtime": 23.6244, + "eval_samples_per_second": 16.847, + "eval_steps_per_second": 2.116, + "step": 782 + }, + { + "epoch": 4.05699481865285, + "grad_norm": 6.1875, + "learning_rate": 9.430051813471504e-06, + "loss": 0.4453, + "step": 783 + }, + { + "epoch": 4.05699481865285, + "eval_accuracy": 0.6959798994974874, + "eval_loss": 0.6262366771697998, + "eval_runtime": 23.6358, + "eval_samples_per_second": 16.839, + "eval_steps_per_second": 2.115, + "step": 783 + }, + { + "epoch": 4.062176165803109, + "grad_norm": 10.625, + "learning_rate": 9.378238341968912e-06, + "loss": 0.7734, + "step": 784 + }, + { + "epoch": 4.062176165803109, + "eval_accuracy": 0.6984924623115578, + "eval_loss": 0.6264329552650452, + "eval_runtime": 23.622, + "eval_samples_per_second": 16.849, + "eval_steps_per_second": 2.117, + "step": 784 + }, + { + "epoch": 4.067357512953368, + "grad_norm": 11.5625, + "learning_rate": 9.326424870466322e-06, + "loss": 0.9688, + "step": 785 + }, + { + "epoch": 4.067357512953368, + "eval_accuracy": 0.7010050251256281, + "eval_loss": 0.626354455947876, + "eval_runtime": 23.6269, + "eval_samples_per_second": 16.845, + "eval_steps_per_second": 2.116, + "step": 785 + }, + { + "epoch": 4.072538860103627, + "grad_norm": 5.40625, + "learning_rate": 9.27461139896373e-06, + "loss": 0.4062, + "step": 786 + }, + { + "epoch": 4.072538860103627, + "eval_accuracy": 0.6959798994974874, + "eval_loss": 0.6261385083198547, + "eval_runtime": 23.6631, + "eval_samples_per_second": 16.819, + "eval_steps_per_second": 2.113, + "step": 786 + }, + { + "epoch": 4.077720207253886, + "grad_norm": 11.8125, + "learning_rate": 9.22279792746114e-06, + "loss": 0.7266, + "step": 787 + }, + { + "epoch": 4.077720207253886, + "eval_accuracy": 0.6984924623115578, + "eval_loss": 0.6265310645103455, + "eval_runtime": 23.6687, + "eval_samples_per_second": 16.815, + "eval_steps_per_second": 2.112, + "step": 787 + }, + { + "epoch": 4.082901554404145, + "grad_norm": 5.53125, + "learning_rate": 9.170984455958549e-06, + "loss": 0.4824, + "step": 788 + }, + { + "epoch": 4.082901554404145, + "eval_accuracy": 0.6934673366834171, + "eval_loss": 0.626884400844574, + "eval_runtime": 23.6287, + "eval_samples_per_second": 16.844, + "eval_steps_per_second": 2.116, + "step": 788 + }, + { + "epoch": 4.0880829015544045, + "grad_norm": 6.0625, + "learning_rate": 9.11917098445596e-06, + "loss": 0.4434, + "step": 789 + }, + { + "epoch": 4.0880829015544045, + "eval_accuracy": 0.6959798994974874, + "eval_loss": 0.626982569694519, + "eval_runtime": 23.6378, + "eval_samples_per_second": 16.837, + "eval_steps_per_second": 2.115, + "step": 789 + }, + { + "epoch": 4.0932642487046635, + "grad_norm": 6.125, + "learning_rate": 9.06735751295337e-06, + "loss": 0.5547, + "step": 790 + }, + { + "epoch": 4.0932642487046635, + "eval_accuracy": 0.6984924623115578, + "eval_loss": 0.626923680305481, + "eval_runtime": 23.6315, + "eval_samples_per_second": 16.842, + "eval_steps_per_second": 2.116, + "step": 790 + }, + { + "epoch": 4.098445595854923, + "grad_norm": 6.25, + "learning_rate": 9.015544041450778e-06, + "loss": 0.459, + "step": 791 + }, + { + "epoch": 4.098445595854923, + "eval_accuracy": 0.7035175879396985, + "eval_loss": 0.6269040703773499, + "eval_runtime": 23.67, + "eval_samples_per_second": 16.815, + "eval_steps_per_second": 2.112, + "step": 791 + }, + { + "epoch": 4.103626943005182, + "grad_norm": 6.34375, + "learning_rate": 8.963730569948186e-06, + "loss": 0.543, + "step": 792 + }, + { + "epoch": 4.103626943005182, + "eval_accuracy": 0.7010050251256281, + "eval_loss": 0.6264133453369141, + "eval_runtime": 23.675, + "eval_samples_per_second": 16.811, + "eval_steps_per_second": 2.112, + "step": 792 + }, + { + "epoch": 4.108808290155441, + "grad_norm": 10.625, + "learning_rate": 8.911917098445596e-06, + "loss": 0.8594, + "step": 793 + }, + { + "epoch": 4.108808290155441, + "eval_accuracy": 0.7010050251256281, + "eval_loss": 0.6271396279335022, + "eval_runtime": 23.6301, + "eval_samples_per_second": 16.843, + "eval_steps_per_second": 2.116, + "step": 793 + }, + { + "epoch": 4.1139896373057, + "grad_norm": 7.4375, + "learning_rate": 8.860103626943006e-06, + "loss": 0.75, + "step": 794 + }, + { + "epoch": 4.1139896373057, + "eval_accuracy": 0.6859296482412061, + "eval_loss": 0.6272181272506714, + "eval_runtime": 23.6284, + "eval_samples_per_second": 16.844, + "eval_steps_per_second": 2.116, + "step": 794 + }, + { + "epoch": 4.119170984455959, + "grad_norm": 5.78125, + "learning_rate": 8.808290155440415e-06, + "loss": 0.4512, + "step": 795 + }, + { + "epoch": 4.119170984455959, + "eval_accuracy": 0.7010050251256281, + "eval_loss": 0.6268451809883118, + "eval_runtime": 23.6324, + "eval_samples_per_second": 16.841, + "eval_steps_per_second": 2.116, + "step": 795 + }, + { + "epoch": 4.124352331606218, + "grad_norm": 6.0625, + "learning_rate": 8.756476683937823e-06, + "loss": 0.416, + "step": 796 + }, + { + "epoch": 4.124352331606218, + "eval_accuracy": 0.6909547738693468, + "eval_loss": 0.6274732947349548, + "eval_runtime": 23.6274, + "eval_samples_per_second": 16.845, + "eval_steps_per_second": 2.116, + "step": 796 + }, + { + "epoch": 4.129533678756476, + "grad_norm": 8.6875, + "learning_rate": 8.704663212435233e-06, + "loss": 0.7852, + "step": 797 + }, + { + "epoch": 4.129533678756476, + "eval_accuracy": 0.6984924623115578, + "eval_loss": 0.6269040703773499, + "eval_runtime": 23.6588, + "eval_samples_per_second": 16.822, + "eval_steps_per_second": 2.113, + "step": 797 + }, + { + "epoch": 4.134715025906735, + "grad_norm": 5.59375, + "learning_rate": 8.652849740932643e-06, + "loss": 0.3105, + "step": 798 + }, + { + "epoch": 4.134715025906735, + "eval_accuracy": 0.7035175879396985, + "eval_loss": 0.6268451809883118, + "eval_runtime": 23.6259, + "eval_samples_per_second": 16.846, + "eval_steps_per_second": 2.116, + "step": 798 + }, + { + "epoch": 4.139896373056994, + "grad_norm": 6.4375, + "learning_rate": 8.601036269430052e-06, + "loss": 0.5664, + "step": 799 + }, + { + "epoch": 4.139896373056994, + "eval_accuracy": 0.6984924623115578, + "eval_loss": 0.6271199584007263, + "eval_runtime": 23.6195, + "eval_samples_per_second": 16.85, + "eval_steps_per_second": 2.117, + "step": 799 + }, + { + "epoch": 4.1450777202072535, + "grad_norm": 5.59375, + "learning_rate": 8.549222797927462e-06, + "loss": 0.4199, + "step": 800 + }, + { + "epoch": 4.1450777202072535, + "eval_accuracy": 0.6959798994974874, + "eval_loss": 0.6271199584007263, + "eval_runtime": 23.6577, + "eval_samples_per_second": 16.823, + "eval_steps_per_second": 2.113, + "step": 800 + }, + { + "epoch": 4.150259067357513, + "grad_norm": 7.28125, + "learning_rate": 8.49740932642487e-06, + "loss": 0.5625, + "step": 801 + }, + { + "epoch": 4.150259067357513, + "eval_accuracy": 0.6984924623115578, + "eval_loss": 0.6272377371788025, + "eval_runtime": 23.558, + "eval_samples_per_second": 16.894, + "eval_steps_per_second": 2.122, + "step": 801 + }, + { + "epoch": 4.155440414507772, + "grad_norm": 5.375, + "learning_rate": 8.44559585492228e-06, + "loss": 0.3359, + "step": 802 + }, + { + "epoch": 4.155440414507772, + "eval_accuracy": 0.6934673366834171, + "eval_loss": 0.6279444098472595, + "eval_runtime": 23.454, + "eval_samples_per_second": 16.969, + "eval_steps_per_second": 2.132, + "step": 802 + }, + { + "epoch": 4.160621761658031, + "grad_norm": 8.125, + "learning_rate": 8.393782383419689e-06, + "loss": 0.6484, + "step": 803 + }, + { + "epoch": 4.160621761658031, + "eval_accuracy": 0.6959798994974874, + "eval_loss": 0.6273947954177856, + "eval_runtime": 23.5302, + "eval_samples_per_second": 16.914, + "eval_steps_per_second": 2.125, + "step": 803 + }, + { + "epoch": 4.16580310880829, + "grad_norm": 9.75, + "learning_rate": 8.341968911917099e-06, + "loss": 0.6367, + "step": 804 + }, + { + "epoch": 4.16580310880829, + "eval_accuracy": 0.6934673366834171, + "eval_loss": 0.6274536848068237, + "eval_runtime": 23.6159, + "eval_samples_per_second": 16.853, + "eval_steps_per_second": 2.117, + "step": 804 + }, + { + "epoch": 4.170984455958549, + "grad_norm": 7.65625, + "learning_rate": 8.290155440414509e-06, + "loss": 0.6719, + "step": 805 + }, + { + "epoch": 4.170984455958549, + "eval_accuracy": 0.6984924623115578, + "eval_loss": 0.6284547448158264, + "eval_runtime": 23.5907, + "eval_samples_per_second": 16.871, + "eval_steps_per_second": 2.119, + "step": 805 + }, + { + "epoch": 4.176165803108808, + "grad_norm": 7.4375, + "learning_rate": 8.238341968911918e-06, + "loss": 0.668, + "step": 806 + }, + { + "epoch": 4.176165803108808, + "eval_accuracy": 0.6959798994974874, + "eval_loss": 0.6276892423629761, + "eval_runtime": 23.6048, + "eval_samples_per_second": 16.861, + "eval_steps_per_second": 2.118, + "step": 806 + }, + { + "epoch": 4.181347150259067, + "grad_norm": 8.625, + "learning_rate": 8.186528497409326e-06, + "loss": 0.7578, + "step": 807 + }, + { + "epoch": 4.181347150259067, + "eval_accuracy": 0.6984924623115578, + "eval_loss": 0.627591073513031, + "eval_runtime": 23.6546, + "eval_samples_per_second": 16.825, + "eval_steps_per_second": 2.114, + "step": 807 + }, + { + "epoch": 4.186528497409326, + "grad_norm": 8.625, + "learning_rate": 8.134715025906736e-06, + "loss": 0.7812, + "step": 808 + }, + { + "epoch": 4.186528497409326, + "eval_accuracy": 0.7010050251256281, + "eval_loss": 0.6278659105300903, + "eval_runtime": 23.6196, + "eval_samples_per_second": 16.85, + "eval_steps_per_second": 2.117, + "step": 808 + }, + { + "epoch": 4.191709844559585, + "grad_norm": 5.0, + "learning_rate": 8.082901554404146e-06, + "loss": 0.3711, + "step": 809 + }, + { + "epoch": 4.191709844559585, + "eval_accuracy": 0.6934673366834171, + "eval_loss": 0.6276892423629761, + "eval_runtime": 23.6136, + "eval_samples_per_second": 16.855, + "eval_steps_per_second": 2.117, + "step": 809 + }, + { + "epoch": 4.196891191709844, + "grad_norm": 21.125, + "learning_rate": 8.031088082901555e-06, + "loss": 1.125, + "step": 810 + }, + { + "epoch": 4.196891191709844, + "eval_accuracy": 0.6984924623115578, + "eval_loss": 0.6279444098472595, + "eval_runtime": 23.6456, + "eval_samples_per_second": 16.832, + "eval_steps_per_second": 2.115, + "step": 810 + }, + { + "epoch": 4.2020725388601035, + "grad_norm": 7.78125, + "learning_rate": 7.979274611398965e-06, + "loss": 0.6094, + "step": 811 + }, + { + "epoch": 4.2020725388601035, + "eval_accuracy": 0.6959798994974874, + "eval_loss": 0.6277481317520142, + "eval_runtime": 23.6511, + "eval_samples_per_second": 16.828, + "eval_steps_per_second": 2.114, + "step": 811 + }, + { + "epoch": 4.2072538860103625, + "grad_norm": 6.5625, + "learning_rate": 7.927461139896373e-06, + "loss": 0.4199, + "step": 812 + }, + { + "epoch": 4.2072538860103625, + "eval_accuracy": 0.6909547738693468, + "eval_loss": 0.6277088522911072, + "eval_runtime": 23.6132, + "eval_samples_per_second": 16.855, + "eval_steps_per_second": 2.117, + "step": 812 + }, + { + "epoch": 4.212435233160622, + "grad_norm": 6.5625, + "learning_rate": 7.875647668393783e-06, + "loss": 0.3984, + "step": 813 + }, + { + "epoch": 4.212435233160622, + "eval_accuracy": 0.6934673366834171, + "eval_loss": 0.628337025642395, + "eval_runtime": 23.6606, + "eval_samples_per_second": 16.821, + "eval_steps_per_second": 2.113, + "step": 813 + }, + { + "epoch": 4.217616580310881, + "grad_norm": 8.1875, + "learning_rate": 7.823834196891192e-06, + "loss": 0.8633, + "step": 814 + }, + { + "epoch": 4.217616580310881, + "eval_accuracy": 0.6934673366834171, + "eval_loss": 0.6273947954177856, + "eval_runtime": 23.6241, + "eval_samples_per_second": 16.847, + "eval_steps_per_second": 2.116, + "step": 814 + }, + { + "epoch": 4.22279792746114, + "grad_norm": 12.0, + "learning_rate": 7.772020725388602e-06, + "loss": 0.5195, + "step": 815 + }, + { + "epoch": 4.22279792746114, + "eval_accuracy": 0.6959798994974874, + "eval_loss": 0.6279640197753906, + "eval_runtime": 23.6288, + "eval_samples_per_second": 16.844, + "eval_steps_per_second": 2.116, + "step": 815 + }, + { + "epoch": 4.227979274611399, + "grad_norm": 9.9375, + "learning_rate": 7.72020725388601e-06, + "loss": 0.5039, + "step": 816 + }, + { + "epoch": 4.227979274611399, + "eval_accuracy": 0.7010050251256281, + "eval_loss": 0.6277481317520142, + "eval_runtime": 23.6709, + "eval_samples_per_second": 16.814, + "eval_steps_per_second": 2.112, + "step": 816 + }, + { + "epoch": 4.233160621761658, + "grad_norm": 6.4375, + "learning_rate": 7.66839378238342e-06, + "loss": 0.4746, + "step": 817 + }, + { + "epoch": 4.233160621761658, + "eval_accuracy": 0.6934673366834171, + "eval_loss": 0.6284744143486023, + "eval_runtime": 23.6727, + "eval_samples_per_second": 16.813, + "eval_steps_per_second": 2.112, + "step": 817 + }, + { + "epoch": 4.238341968911917, + "grad_norm": 8.5, + "learning_rate": 7.616580310880829e-06, + "loss": 0.8359, + "step": 818 + }, + { + "epoch": 4.238341968911917, + "eval_accuracy": 0.6959798994974874, + "eval_loss": 0.6284940242767334, + "eval_runtime": 23.6315, + "eval_samples_per_second": 16.842, + "eval_steps_per_second": 2.116, + "step": 818 + }, + { + "epoch": 4.243523316062176, + "grad_norm": 7.15625, + "learning_rate": 7.564766839378239e-06, + "loss": 0.5547, + "step": 819 + }, + { + "epoch": 4.243523316062176, + "eval_accuracy": 0.6909547738693468, + "eval_loss": 0.6288473606109619, + "eval_runtime": 23.6356, + "eval_samples_per_second": 16.839, + "eval_steps_per_second": 2.115, + "step": 819 + }, + { + "epoch": 4.248704663212435, + "grad_norm": 18.0, + "learning_rate": 7.512953367875648e-06, + "loss": 0.9102, + "step": 820 + }, + { + "epoch": 4.248704663212435, + "eval_accuracy": 0.6959798994974874, + "eval_loss": 0.6281603574752808, + "eval_runtime": 23.6307, + "eval_samples_per_second": 16.843, + "eval_steps_per_second": 2.116, + "step": 820 + }, + { + "epoch": 4.253886010362694, + "grad_norm": 9.5625, + "learning_rate": 7.4611398963730565e-06, + "loss": 0.5156, + "step": 821 + }, + { + "epoch": 4.253886010362694, + "eval_accuracy": 0.6984924623115578, + "eval_loss": 0.6286118030548096, + "eval_runtime": 23.6286, + "eval_samples_per_second": 16.844, + "eval_steps_per_second": 2.116, + "step": 821 + }, + { + "epoch": 4.259067357512953, + "grad_norm": 11.0, + "learning_rate": 7.409326424870467e-06, + "loss": 1.0625, + "step": 822 + }, + { + "epoch": 4.259067357512953, + "eval_accuracy": 0.6959798994974874, + "eval_loss": 0.6283566355705261, + "eval_runtime": 23.6274, + "eval_samples_per_second": 16.845, + "eval_steps_per_second": 2.116, + "step": 822 + }, + { + "epoch": 4.2642487046632125, + "grad_norm": 7.09375, + "learning_rate": 7.357512953367876e-06, + "loss": 0.4414, + "step": 823 + }, + { + "epoch": 4.2642487046632125, + "eval_accuracy": 0.6909547738693468, + "eval_loss": 0.6282192468643188, + "eval_runtime": 23.6665, + "eval_samples_per_second": 16.817, + "eval_steps_per_second": 2.113, + "step": 823 + }, + { + "epoch": 4.269430051813472, + "grad_norm": 6.40625, + "learning_rate": 7.305699481865286e-06, + "loss": 0.5703, + "step": 824 + }, + { + "epoch": 4.269430051813472, + "eval_accuracy": 0.6984924623115578, + "eval_loss": 0.6276892423629761, + "eval_runtime": 23.6286, + "eval_samples_per_second": 16.844, + "eval_steps_per_second": 2.116, + "step": 824 + }, + { + "epoch": 4.274611398963731, + "grad_norm": 5.71875, + "learning_rate": 7.253886010362694e-06, + "loss": 0.4668, + "step": 825 + }, + { + "epoch": 4.274611398963731, + "eval_accuracy": 0.6984924623115578, + "eval_loss": 0.6285529136657715, + "eval_runtime": 23.6748, + "eval_samples_per_second": 16.811, + "eval_steps_per_second": 2.112, + "step": 825 + }, + { + "epoch": 4.27979274611399, + "grad_norm": 8.3125, + "learning_rate": 7.202072538860104e-06, + "loss": 0.707, + "step": 826 + }, + { + "epoch": 4.27979274611399, + "eval_accuracy": 0.6984924623115578, + "eval_loss": 0.6283173561096191, + "eval_runtime": 23.6396, + "eval_samples_per_second": 16.836, + "eval_steps_per_second": 2.115, + "step": 826 + }, + { + "epoch": 4.284974093264249, + "grad_norm": 3.796875, + "learning_rate": 7.150259067357514e-06, + "loss": 0.2432, + "step": 827 + }, + { + "epoch": 4.284974093264249, + "eval_accuracy": 0.7010050251256281, + "eval_loss": 0.6281210780143738, + "eval_runtime": 23.6386, + "eval_samples_per_second": 16.837, + "eval_steps_per_second": 2.115, + "step": 827 + }, + { + "epoch": 4.290155440414508, + "grad_norm": 7.3125, + "learning_rate": 7.098445595854923e-06, + "loss": 0.6719, + "step": 828 + }, + { + "epoch": 4.290155440414508, + "eval_accuracy": 0.6959798994974874, + "eval_loss": 0.6287688612937927, + "eval_runtime": 23.6469, + "eval_samples_per_second": 16.831, + "eval_steps_per_second": 2.114, + "step": 828 + }, + { + "epoch": 4.295336787564767, + "grad_norm": 6.53125, + "learning_rate": 7.0466321243523315e-06, + "loss": 0.5547, + "step": 829 + }, + { + "epoch": 4.295336787564767, + "eval_accuracy": 0.6984924623115578, + "eval_loss": 0.6283566355705261, + "eval_runtime": 23.6383, + "eval_samples_per_second": 16.837, + "eval_steps_per_second": 2.115, + "step": 829 + }, + { + "epoch": 4.300518134715026, + "grad_norm": 5.5625, + "learning_rate": 6.994818652849741e-06, + "loss": 0.3691, + "step": 830 + }, + { + "epoch": 4.300518134715026, + "eval_accuracy": 0.6984924623115578, + "eval_loss": 0.6276695728302002, + "eval_runtime": 23.6775, + "eval_samples_per_second": 16.809, + "eval_steps_per_second": 2.112, + "step": 830 + }, + { + "epoch": 4.305699481865285, + "grad_norm": 8.0625, + "learning_rate": 6.943005181347151e-06, + "loss": 0.6484, + "step": 831 + }, + { + "epoch": 4.305699481865285, + "eval_accuracy": 0.6959798994974874, + "eval_loss": 0.6281210780143738, + "eval_runtime": 23.6484, + "eval_samples_per_second": 16.83, + "eval_steps_per_second": 2.114, + "step": 831 + }, + { + "epoch": 4.310880829015544, + "grad_norm": 9.6875, + "learning_rate": 6.89119170984456e-06, + "loss": 0.8867, + "step": 832 + }, + { + "epoch": 4.310880829015544, + "eval_accuracy": 0.6984924623115578, + "eval_loss": 0.6285529136657715, + "eval_runtime": 23.6419, + "eval_samples_per_second": 16.835, + "eval_steps_per_second": 2.115, + "step": 832 + }, + { + "epoch": 4.316062176165803, + "grad_norm": 7.40625, + "learning_rate": 6.8393782383419685e-06, + "loss": 0.6602, + "step": 833 + }, + { + "epoch": 4.316062176165803, + "eval_accuracy": 0.6984924623115578, + "eval_loss": 0.6279640197753906, + "eval_runtime": 23.6487, + "eval_samples_per_second": 16.83, + "eval_steps_per_second": 2.114, + "step": 833 + }, + { + "epoch": 4.321243523316062, + "grad_norm": 4.28125, + "learning_rate": 6.787564766839379e-06, + "loss": 0.3027, + "step": 834 + }, + { + "epoch": 4.321243523316062, + "eval_accuracy": 0.6959798994974874, + "eval_loss": 0.6281210780143738, + "eval_runtime": 23.6607, + "eval_samples_per_second": 16.821, + "eval_steps_per_second": 2.113, + "step": 834 + }, + { + "epoch": 4.3264248704663215, + "grad_norm": 12.4375, + "learning_rate": 6.735751295336788e-06, + "loss": 1.0078, + "step": 835 + }, + { + "epoch": 4.3264248704663215, + "eval_accuracy": 0.7035175879396985, + "eval_loss": 0.6286118030548096, + "eval_runtime": 23.7227, + "eval_samples_per_second": 16.777, + "eval_steps_per_second": 2.108, + "step": 835 + }, + { + "epoch": 4.331606217616581, + "grad_norm": 5.625, + "learning_rate": 6.683937823834196e-06, + "loss": 0.4355, + "step": 836 + }, + { + "epoch": 4.331606217616581, + "eval_accuracy": 0.6984924623115578, + "eval_loss": 0.6279640197753906, + "eval_runtime": 23.6714, + "eval_samples_per_second": 16.814, + "eval_steps_per_second": 2.112, + "step": 836 + }, + { + "epoch": 4.33678756476684, + "grad_norm": 7.96875, + "learning_rate": 6.6321243523316064e-06, + "loss": 0.707, + "step": 837 + }, + { + "epoch": 4.33678756476684, + "eval_accuracy": 0.7010050251256281, + "eval_loss": 0.6274144053459167, + "eval_runtime": 23.6584, + "eval_samples_per_second": 16.823, + "eval_steps_per_second": 2.113, + "step": 837 + }, + { + "epoch": 4.341968911917099, + "grad_norm": 9.0625, + "learning_rate": 6.580310880829016e-06, + "loss": 0.625, + "step": 838 + }, + { + "epoch": 4.341968911917099, + "eval_accuracy": 0.6959798994974874, + "eval_loss": 0.6286510825157166, + "eval_runtime": 23.6488, + "eval_samples_per_second": 16.83, + "eval_steps_per_second": 2.114, + "step": 838 + }, + { + "epoch": 4.347150259067358, + "grad_norm": 7.09375, + "learning_rate": 6.528497409326426e-06, + "loss": 0.5156, + "step": 839 + }, + { + "epoch": 4.347150259067358, + "eval_accuracy": 0.6984924623115578, + "eval_loss": 0.6281799674034119, + "eval_runtime": 23.6884, + "eval_samples_per_second": 16.801, + "eval_steps_per_second": 2.111, + "step": 839 + }, + { + "epoch": 4.352331606217617, + "grad_norm": 11.5, + "learning_rate": 6.476683937823834e-06, + "loss": 0.9609, + "step": 840 + }, + { + "epoch": 4.352331606217617, + "eval_accuracy": 0.6909547738693468, + "eval_loss": 0.6290240287780762, + "eval_runtime": 23.6488, + "eval_samples_per_second": 16.83, + "eval_steps_per_second": 2.114, + "step": 840 + }, + { + "epoch": 4.357512953367876, + "grad_norm": 7.25, + "learning_rate": 6.4248704663212435e-06, + "loss": 0.5664, + "step": 841 + }, + { + "epoch": 4.357512953367876, + "eval_accuracy": 0.6959798994974874, + "eval_loss": 0.6283566355705261, + "eval_runtime": 23.6878, + "eval_samples_per_second": 16.802, + "eval_steps_per_second": 2.111, + "step": 841 + }, + { + "epoch": 4.362694300518135, + "grad_norm": 6.65625, + "learning_rate": 6.373056994818654e-06, + "loss": 0.5117, + "step": 842 + }, + { + "epoch": 4.362694300518135, + "eval_accuracy": 0.7010050251256281, + "eval_loss": 0.6279051303863525, + "eval_runtime": 23.6594, + "eval_samples_per_second": 16.822, + "eval_steps_per_second": 2.113, + "step": 842 + }, + { + "epoch": 4.367875647668393, + "grad_norm": 6.03125, + "learning_rate": 6.321243523316063e-06, + "loss": 0.4023, + "step": 843 + }, + { + "epoch": 4.367875647668393, + "eval_accuracy": 0.7035175879396985, + "eval_loss": 0.628258466720581, + "eval_runtime": 23.7132, + "eval_samples_per_second": 16.784, + "eval_steps_per_second": 2.109, + "step": 843 + }, + { + "epoch": 4.373056994818652, + "grad_norm": 5.6875, + "learning_rate": 6.269430051813471e-06, + "loss": 0.4805, + "step": 844 + }, + { + "epoch": 4.373056994818652, + "eval_accuracy": 0.6934673366834171, + "eval_loss": 0.6280229091644287, + "eval_runtime": 23.6775, + "eval_samples_per_second": 16.809, + "eval_steps_per_second": 2.112, + "step": 844 + }, + { + "epoch": 4.3782383419689115, + "grad_norm": 12.4375, + "learning_rate": 6.217616580310881e-06, + "loss": 1.0547, + "step": 845 + }, + { + "epoch": 4.3782383419689115, + "eval_accuracy": 0.6934673366834171, + "eval_loss": 0.6285136342048645, + "eval_runtime": 23.7162, + "eval_samples_per_second": 16.782, + "eval_steps_per_second": 2.108, + "step": 845 + }, + { + "epoch": 4.383419689119171, + "grad_norm": 9.6875, + "learning_rate": 6.165803108808291e-06, + "loss": 0.8047, + "step": 846 + }, + { + "epoch": 4.383419689119171, + "eval_accuracy": 0.6909547738693468, + "eval_loss": 0.6286510825157166, + "eval_runtime": 23.6887, + "eval_samples_per_second": 16.801, + "eval_steps_per_second": 2.111, + "step": 846 + }, + { + "epoch": 4.38860103626943, + "grad_norm": 7.96875, + "learning_rate": 6.1139896373057e-06, + "loss": 0.6133, + "step": 847 + }, + { + "epoch": 4.38860103626943, + "eval_accuracy": 0.7010050251256281, + "eval_loss": 0.6287491917610168, + "eval_runtime": 23.6178, + "eval_samples_per_second": 16.852, + "eval_steps_per_second": 2.117, + "step": 847 + }, + { + "epoch": 4.393782383419689, + "grad_norm": 9.0, + "learning_rate": 6.062176165803109e-06, + "loss": 0.373, + "step": 848 + }, + { + "epoch": 4.393782383419689, + "eval_accuracy": 0.7010050251256281, + "eval_loss": 0.6281603574752808, + "eval_runtime": 23.5839, + "eval_samples_per_second": 16.876, + "eval_steps_per_second": 2.12, + "step": 848 + }, + { + "epoch": 4.398963730569948, + "grad_norm": 7.625, + "learning_rate": 6.0103626943005185e-06, + "loss": 0.5703, + "step": 849 + }, + { + "epoch": 4.398963730569948, + "eval_accuracy": 0.6984924623115578, + "eval_loss": 0.6284547448158264, + "eval_runtime": 23.5672, + "eval_samples_per_second": 16.888, + "eval_steps_per_second": 2.122, + "step": 849 + }, + { + "epoch": 4.404145077720207, + "grad_norm": 11.0, + "learning_rate": 5.958549222797928e-06, + "loss": 0.2695, + "step": 850 + }, + { + "epoch": 4.404145077720207, + "eval_accuracy": 0.6959798994974874, + "eval_loss": 0.6283762454986572, + "eval_runtime": 23.5486, + "eval_samples_per_second": 16.901, + "eval_steps_per_second": 2.123, + "step": 850 + }, + { + "epoch": 4.409326424870466, + "grad_norm": 8.0625, + "learning_rate": 5.906735751295337e-06, + "loss": 0.7266, + "step": 851 + }, + { + "epoch": 4.409326424870466, + "eval_accuracy": 0.6984924623115578, + "eval_loss": 0.6281406879425049, + "eval_runtime": 23.5597, + "eval_samples_per_second": 16.893, + "eval_steps_per_second": 2.122, + "step": 851 + }, + { + "epoch": 4.414507772020725, + "grad_norm": 6.75, + "learning_rate": 5.854922279792746e-06, + "loss": 0.6289, + "step": 852 + }, + { + "epoch": 4.414507772020725, + "eval_accuracy": 0.6959798994974874, + "eval_loss": 0.628866970539093, + "eval_runtime": 23.4826, + "eval_samples_per_second": 16.949, + "eval_steps_per_second": 2.129, + "step": 852 + }, + { + "epoch": 4.419689119170984, + "grad_norm": 6.4375, + "learning_rate": 5.8031088082901555e-06, + "loss": 0.5703, + "step": 853 + }, + { + "epoch": 4.419689119170984, + "eval_accuracy": 0.6934673366834171, + "eval_loss": 0.6278855204582214, + "eval_runtime": 23.6083, + "eval_samples_per_second": 16.858, + "eval_steps_per_second": 2.118, + "step": 853 + }, + { + "epoch": 4.424870466321243, + "grad_norm": 6.40625, + "learning_rate": 5.751295336787565e-06, + "loss": 0.3672, + "step": 854 + }, + { + "epoch": 4.424870466321243, + "eval_accuracy": 0.7010050251256281, + "eval_loss": 0.6285136342048645, + "eval_runtime": 23.6534, + "eval_samples_per_second": 16.826, + "eval_steps_per_second": 2.114, + "step": 854 + }, + { + "epoch": 4.430051813471502, + "grad_norm": 8.625, + "learning_rate": 5.699481865284975e-06, + "loss": 0.7617, + "step": 855 + }, + { + "epoch": 4.430051813471502, + "eval_accuracy": 0.6984924623115578, + "eval_loss": 0.6285136342048645, + "eval_runtime": 23.6313, + "eval_samples_per_second": 16.842, + "eval_steps_per_second": 2.116, + "step": 855 + }, + { + "epoch": 4.435233160621761, + "grad_norm": 8.25, + "learning_rate": 5.647668393782383e-06, + "loss": 0.7969, + "step": 856 + }, + { + "epoch": 4.435233160621761, + "eval_accuracy": 0.6984924623115578, + "eval_loss": 0.6290436387062073, + "eval_runtime": 23.6288, + "eval_samples_per_second": 16.844, + "eval_steps_per_second": 2.116, + "step": 856 + }, + { + "epoch": 4.4404145077720205, + "grad_norm": 12.625, + "learning_rate": 5.5958549222797934e-06, + "loss": 0.5234, + "step": 857 + }, + { + "epoch": 4.4404145077720205, + "eval_accuracy": 0.6934673366834171, + "eval_loss": 0.6287491917610168, + "eval_runtime": 23.6803, + "eval_samples_per_second": 16.807, + "eval_steps_per_second": 2.111, + "step": 857 + }, + { + "epoch": 4.44559585492228, + "grad_norm": 17.625, + "learning_rate": 5.544041450777202e-06, + "loss": 1.6328, + "step": 858 + }, + { + "epoch": 4.44559585492228, + "eval_accuracy": 0.6934673366834171, + "eval_loss": 0.6287099719047546, + "eval_runtime": 23.6804, + "eval_samples_per_second": 16.807, + "eval_steps_per_second": 2.111, + "step": 858 + }, + { + "epoch": 4.450777202072539, + "grad_norm": 6.125, + "learning_rate": 5.492227979274612e-06, + "loss": 0.3965, + "step": 859 + }, + { + "epoch": 4.450777202072539, + "eval_accuracy": 0.6934673366834171, + "eval_loss": 0.6288080811500549, + "eval_runtime": 23.6428, + "eval_samples_per_second": 16.834, + "eval_steps_per_second": 2.115, + "step": 859 + }, + { + "epoch": 4.455958549222798, + "grad_norm": 7.125, + "learning_rate": 5.440414507772021e-06, + "loss": 0.4922, + "step": 860 + }, + { + "epoch": 4.455958549222798, + "eval_accuracy": 0.6959798994974874, + "eval_loss": 0.6286118030548096, + "eval_runtime": 23.635, + "eval_samples_per_second": 16.839, + "eval_steps_per_second": 2.116, + "step": 860 + }, + { + "epoch": 4.461139896373057, + "grad_norm": 6.90625, + "learning_rate": 5.3886010362694305e-06, + "loss": 0.5, + "step": 861 + }, + { + "epoch": 4.461139896373057, + "eval_accuracy": 0.6934673366834171, + "eval_loss": 0.6284351348876953, + "eval_runtime": 23.6263, + "eval_samples_per_second": 16.846, + "eval_steps_per_second": 2.116, + "step": 861 + }, + { + "epoch": 4.466321243523316, + "grad_norm": 6.84375, + "learning_rate": 5.33678756476684e-06, + "loss": 0.6016, + "step": 862 + }, + { + "epoch": 4.466321243523316, + "eval_accuracy": 0.6959798994974874, + "eval_loss": 0.6281799674034119, + "eval_runtime": 23.6726, + "eval_samples_per_second": 16.813, + "eval_steps_per_second": 2.112, + "step": 862 + }, + { + "epoch": 4.471502590673575, + "grad_norm": 7.46875, + "learning_rate": 5.284974093264249e-06, + "loss": 0.668, + "step": 863 + }, + { + "epoch": 4.471502590673575, + "eval_accuracy": 0.6984924623115578, + "eval_loss": 0.6286314129829407, + "eval_runtime": 23.6691, + "eval_samples_per_second": 16.815, + "eval_steps_per_second": 2.112, + "step": 863 + }, + { + "epoch": 4.476683937823834, + "grad_norm": 6.625, + "learning_rate": 5.233160621761658e-06, + "loss": 0.6133, + "step": 864 + }, + { + "epoch": 4.476683937823834, + "eval_accuracy": 0.6959798994974874, + "eval_loss": 0.6277481317520142, + "eval_runtime": 23.6372, + "eval_samples_per_second": 16.838, + "eval_steps_per_second": 2.115, + "step": 864 + }, + { + "epoch": 4.481865284974093, + "grad_norm": 7.0, + "learning_rate": 5.1813471502590676e-06, + "loss": 0.6992, + "step": 865 + }, + { + "epoch": 4.481865284974093, + "eval_accuracy": 0.6934673366834171, + "eval_loss": 0.6283762454986572, + "eval_runtime": 23.6371, + "eval_samples_per_second": 16.838, + "eval_steps_per_second": 2.115, + "step": 865 + }, + { + "epoch": 4.487046632124352, + "grad_norm": 6.5625, + "learning_rate": 5.129533678756477e-06, + "loss": 0.5117, + "step": 866 + }, + { + "epoch": 4.487046632124352, + "eval_accuracy": 0.6959798994974874, + "eval_loss": 0.6287491917610168, + "eval_runtime": 23.6314, + "eval_samples_per_second": 16.842, + "eval_steps_per_second": 2.116, + "step": 866 + }, + { + "epoch": 4.492227979274611, + "grad_norm": 4.4375, + "learning_rate": 5.077720207253886e-06, + "loss": 0.2656, + "step": 867 + }, + { + "epoch": 4.492227979274611, + "eval_accuracy": 0.6959798994974874, + "eval_loss": 0.6288277506828308, + "eval_runtime": 23.631, + "eval_samples_per_second": 16.842, + "eval_steps_per_second": 2.116, + "step": 867 + }, + { + "epoch": 4.4974093264248705, + "grad_norm": 4.9375, + "learning_rate": 5.025906735751295e-06, + "loss": 0.3848, + "step": 868 + }, + { + "epoch": 4.4974093264248705, + "eval_accuracy": 0.6959798994974874, + "eval_loss": 0.6286118030548096, + "eval_runtime": 23.6267, + "eval_samples_per_second": 16.845, + "eval_steps_per_second": 2.116, + "step": 868 + }, + { + "epoch": 4.5025906735751295, + "grad_norm": 6.28125, + "learning_rate": 4.974093264248705e-06, + "loss": 0.6328, + "step": 869 + }, + { + "epoch": 4.5025906735751295, + "eval_accuracy": 0.6959798994974874, + "eval_loss": 0.6286903023719788, + "eval_runtime": 23.6762, + "eval_samples_per_second": 16.81, + "eval_steps_per_second": 2.112, + "step": 869 + }, + { + "epoch": 4.507772020725389, + "grad_norm": 7.09375, + "learning_rate": 4.922279792746115e-06, + "loss": 0.4961, + "step": 870 + }, + { + "epoch": 4.507772020725389, + "eval_accuracy": 0.6934673366834171, + "eval_loss": 0.62890625, + "eval_runtime": 23.6373, + "eval_samples_per_second": 16.838, + "eval_steps_per_second": 2.115, + "step": 870 + }, + { + "epoch": 4.512953367875648, + "grad_norm": 8.625, + "learning_rate": 4.870466321243523e-06, + "loss": 0.7148, + "step": 871 + }, + { + "epoch": 4.512953367875648, + "eval_accuracy": 0.6959798994974874, + "eval_loss": 0.6278659105300903, + "eval_runtime": 23.6781, + "eval_samples_per_second": 16.809, + "eval_steps_per_second": 2.112, + "step": 871 + }, + { + "epoch": 4.518134715025907, + "grad_norm": 20.625, + "learning_rate": 4.818652849740933e-06, + "loss": 0.6172, + "step": 872 + }, + { + "epoch": 4.518134715025907, + "eval_accuracy": 0.6959798994974874, + "eval_loss": 0.6280032992362976, + "eval_runtime": 23.6359, + "eval_samples_per_second": 16.839, + "eval_steps_per_second": 2.115, + "step": 872 + }, + { + "epoch": 4.523316062176166, + "grad_norm": 6.53125, + "learning_rate": 4.766839378238342e-06, + "loss": 0.4863, + "step": 873 + }, + { + "epoch": 4.523316062176166, + "eval_accuracy": 0.6884422110552764, + "eval_loss": 0.6277481317520142, + "eval_runtime": 23.6398, + "eval_samples_per_second": 16.836, + "eval_steps_per_second": 2.115, + "step": 873 + }, + { + "epoch": 4.528497409326425, + "grad_norm": 4.40625, + "learning_rate": 4.715025906735752e-06, + "loss": 0.2754, + "step": 874 + }, + { + "epoch": 4.528497409326425, + "eval_accuracy": 0.6984924623115578, + "eval_loss": 0.6280621886253357, + "eval_runtime": 23.6789, + "eval_samples_per_second": 16.808, + "eval_steps_per_second": 2.112, + "step": 874 + }, + { + "epoch": 4.533678756476684, + "grad_norm": 8.3125, + "learning_rate": 4.663212435233161e-06, + "loss": 0.6367, + "step": 875 + }, + { + "epoch": 4.533678756476684, + "eval_accuracy": 0.6934673366834171, + "eval_loss": 0.6277481317520142, + "eval_runtime": 23.6412, + "eval_samples_per_second": 16.835, + "eval_steps_per_second": 2.115, + "step": 875 + }, + { + "epoch": 4.538860103626943, + "grad_norm": 6.71875, + "learning_rate": 4.61139896373057e-06, + "loss": 0.5195, + "step": 876 + }, + { + "epoch": 4.538860103626943, + "eval_accuracy": 0.6984924623115578, + "eval_loss": 0.6284547448158264, + "eval_runtime": 23.6805, + "eval_samples_per_second": 16.807, + "eval_steps_per_second": 2.111, + "step": 876 + }, + { + "epoch": 4.544041450777202, + "grad_norm": 7.59375, + "learning_rate": 4.55958549222798e-06, + "loss": 0.543, + "step": 877 + }, + { + "epoch": 4.544041450777202, + "eval_accuracy": 0.6909547738693468, + "eval_loss": 0.6278070211410522, + "eval_runtime": 23.687, + "eval_samples_per_second": 16.802, + "eval_steps_per_second": 2.111, + "step": 877 + }, + { + "epoch": 4.549222797927461, + "grad_norm": 5.40625, + "learning_rate": 4.507772020725389e-06, + "loss": 0.4102, + "step": 878 + }, + { + "epoch": 4.549222797927461, + "eval_accuracy": 0.7010050251256281, + "eval_loss": 0.6279444098472595, + "eval_runtime": 23.6438, + "eval_samples_per_second": 16.833, + "eval_steps_per_second": 2.115, + "step": 878 + }, + { + "epoch": 4.55440414507772, + "grad_norm": 8.125, + "learning_rate": 4.455958549222798e-06, + "loss": 0.6445, + "step": 879 + }, + { + "epoch": 4.55440414507772, + "eval_accuracy": 0.6934673366834171, + "eval_loss": 0.6281210780143738, + "eval_runtime": 23.6397, + "eval_samples_per_second": 16.836, + "eval_steps_per_second": 2.115, + "step": 879 + }, + { + "epoch": 4.5595854922279795, + "grad_norm": 7.6875, + "learning_rate": 4.404145077720207e-06, + "loss": 0.7109, + "step": 880 + }, + { + "epoch": 4.5595854922279795, + "eval_accuracy": 0.7010050251256281, + "eval_loss": 0.6279836893081665, + "eval_runtime": 23.6379, + "eval_samples_per_second": 16.837, + "eval_steps_per_second": 2.115, + "step": 880 + }, + { + "epoch": 4.564766839378239, + "grad_norm": 10.625, + "learning_rate": 4.352331606217617e-06, + "loss": 0.918, + "step": 881 + }, + { + "epoch": 4.564766839378239, + "eval_accuracy": 0.6934673366834171, + "eval_loss": 0.6284155249595642, + "eval_runtime": 23.636, + "eval_samples_per_second": 16.839, + "eval_steps_per_second": 2.115, + "step": 881 + }, + { + "epoch": 4.569948186528498, + "grad_norm": 6.125, + "learning_rate": 4.300518134715026e-06, + "loss": 0.373, + "step": 882 + }, + { + "epoch": 4.569948186528498, + "eval_accuracy": 0.6934673366834171, + "eval_loss": 0.6279444098472595, + "eval_runtime": 23.6287, + "eval_samples_per_second": 16.844, + "eval_steps_per_second": 2.116, + "step": 882 + }, + { + "epoch": 4.575129533678757, + "grad_norm": 10.0, + "learning_rate": 4.248704663212435e-06, + "loss": 0.6602, + "step": 883 + }, + { + "epoch": 4.575129533678757, + "eval_accuracy": 0.6984924623115578, + "eval_loss": 0.6287099719047546, + "eval_runtime": 23.6733, + "eval_samples_per_second": 16.812, + "eval_steps_per_second": 2.112, + "step": 883 + }, + { + "epoch": 4.580310880829016, + "grad_norm": 9.0625, + "learning_rate": 4.1968911917098444e-06, + "loss": 0.7578, + "step": 884 + }, + { + "epoch": 4.580310880829016, + "eval_accuracy": 0.6959798994974874, + "eval_loss": 0.6280032992362976, + "eval_runtime": 23.6252, + "eval_samples_per_second": 16.846, + "eval_steps_per_second": 2.116, + "step": 884 + }, + { + "epoch": 4.585492227979275, + "grad_norm": 12.375, + "learning_rate": 4.1450777202072546e-06, + "loss": 0.5859, + "step": 885 + }, + { + "epoch": 4.585492227979275, + "eval_accuracy": 0.6959798994974874, + "eval_loss": 0.6281406879425049, + "eval_runtime": 23.6613, + "eval_samples_per_second": 16.821, + "eval_steps_per_second": 2.113, + "step": 885 + }, + { + "epoch": 4.590673575129534, + "grad_norm": 4.34375, + "learning_rate": 4.093264248704663e-06, + "loss": 0.2002, + "step": 886 + }, + { + "epoch": 4.590673575129534, + "eval_accuracy": 0.6959798994974874, + "eval_loss": 0.6282192468643188, + "eval_runtime": 23.6252, + "eval_samples_per_second": 16.846, + "eval_steps_per_second": 2.116, + "step": 886 + }, + { + "epoch": 4.595854922279793, + "grad_norm": 10.75, + "learning_rate": 4.041450777202073e-06, + "loss": 0.707, + "step": 887 + }, + { + "epoch": 4.595854922279793, + "eval_accuracy": 0.7010050251256281, + "eval_loss": 0.6284547448158264, + "eval_runtime": 23.622, + "eval_samples_per_second": 16.849, + "eval_steps_per_second": 2.117, + "step": 887 + }, + { + "epoch": 4.601036269430052, + "grad_norm": 7.6875, + "learning_rate": 3.989637305699482e-06, + "loss": 0.625, + "step": 888 + }, + { + "epoch": 4.601036269430052, + "eval_accuracy": 0.6934673366834171, + "eval_loss": 0.6279640197753906, + "eval_runtime": 23.6595, + "eval_samples_per_second": 16.822, + "eval_steps_per_second": 2.113, + "step": 888 + }, + { + "epoch": 4.606217616580311, + "grad_norm": 7.46875, + "learning_rate": 3.937823834196892e-06, + "loss": 0.543, + "step": 889 + }, + { + "epoch": 4.606217616580311, + "eval_accuracy": 0.6984924623115578, + "eval_loss": 0.6282781362533569, + "eval_runtime": 23.6577, + "eval_samples_per_second": 16.823, + "eval_steps_per_second": 2.113, + "step": 889 + }, + { + "epoch": 4.61139896373057, + "grad_norm": 13.3125, + "learning_rate": 3.886010362694301e-06, + "loss": 0.4512, + "step": 890 + }, + { + "epoch": 4.61139896373057, + "eval_accuracy": 0.7035175879396985, + "eval_loss": 0.6277677416801453, + "eval_runtime": 23.6167, + "eval_samples_per_second": 16.853, + "eval_steps_per_second": 2.117, + "step": 890 + }, + { + "epoch": 4.616580310880829, + "grad_norm": 4.40625, + "learning_rate": 3.83419689119171e-06, + "loss": 0.3359, + "step": 891 + }, + { + "epoch": 4.616580310880829, + "eval_accuracy": 0.7035175879396985, + "eval_loss": 0.6278659105300903, + "eval_runtime": 23.6137, + "eval_samples_per_second": 16.855, + "eval_steps_per_second": 2.117, + "step": 891 + }, + { + "epoch": 4.6217616580310885, + "grad_norm": 7.15625, + "learning_rate": 3.7823834196891194e-06, + "loss": 0.6602, + "step": 892 + }, + { + "epoch": 4.6217616580310885, + "eval_accuracy": 0.6984924623115578, + "eval_loss": 0.6278659105300903, + "eval_runtime": 23.657, + "eval_samples_per_second": 16.824, + "eval_steps_per_second": 2.114, + "step": 892 + }, + { + "epoch": 4.626943005181348, + "grad_norm": 7.15625, + "learning_rate": 3.7305699481865283e-06, + "loss": 0.707, + "step": 893 + }, + { + "epoch": 4.626943005181348, + "eval_accuracy": 0.6959798994974874, + "eval_loss": 0.6276499629020691, + "eval_runtime": 23.6668, + "eval_samples_per_second": 16.817, + "eval_steps_per_second": 2.113, + "step": 893 + }, + { + "epoch": 4.632124352331607, + "grad_norm": 5.0, + "learning_rate": 3.678756476683938e-06, + "loss": 0.3555, + "step": 894 + }, + { + "epoch": 4.632124352331607, + "eval_accuracy": 0.6959798994974874, + "eval_loss": 0.6282192468643188, + "eval_runtime": 23.638, + "eval_samples_per_second": 16.837, + "eval_steps_per_second": 2.115, + "step": 894 + }, + { + "epoch": 4.637305699481866, + "grad_norm": 8.4375, + "learning_rate": 3.626943005181347e-06, + "loss": 0.6016, + "step": 895 + }, + { + "epoch": 4.637305699481866, + "eval_accuracy": 0.7035175879396985, + "eval_loss": 0.6277284622192383, + "eval_runtime": 23.6385, + "eval_samples_per_second": 16.837, + "eval_steps_per_second": 2.115, + "step": 895 + }, + { + "epoch": 4.642487046632124, + "grad_norm": 3.859375, + "learning_rate": 3.575129533678757e-06, + "loss": 0.1943, + "step": 896 + }, + { + "epoch": 4.642487046632124, + "eval_accuracy": 0.6959798994974874, + "eval_loss": 0.6276892423629761, + "eval_runtime": 23.636, + "eval_samples_per_second": 16.839, + "eval_steps_per_second": 2.115, + "step": 896 + }, + { + "epoch": 4.647668393782383, + "grad_norm": 6.5625, + "learning_rate": 3.5233160621761657e-06, + "loss": 0.5781, + "step": 897 + }, + { + "epoch": 4.647668393782383, + "eval_accuracy": 0.7035175879396985, + "eval_loss": 0.6281014680862427, + "eval_runtime": 23.633, + "eval_samples_per_second": 16.841, + "eval_steps_per_second": 2.116, + "step": 897 + }, + { + "epoch": 4.652849740932642, + "grad_norm": 7.28125, + "learning_rate": 3.4715025906735754e-06, + "loss": 0.5859, + "step": 898 + }, + { + "epoch": 4.652849740932642, + "eval_accuracy": 0.6934673366834171, + "eval_loss": 0.628258466720581, + "eval_runtime": 23.633, + "eval_samples_per_second": 16.841, + "eval_steps_per_second": 2.116, + "step": 898 + }, + { + "epoch": 4.658031088082901, + "grad_norm": 9.75, + "learning_rate": 3.4196891191709843e-06, + "loss": 0.7617, + "step": 899 + }, + { + "epoch": 4.658031088082901, + "eval_accuracy": 0.6984924623115578, + "eval_loss": 0.6271592378616333, + "eval_runtime": 23.6666, + "eval_samples_per_second": 16.817, + "eval_steps_per_second": 2.113, + "step": 899 + }, + { + "epoch": 4.66321243523316, + "grad_norm": 4.15625, + "learning_rate": 3.367875647668394e-06, + "loss": 0.3066, + "step": 900 + }, + { + "epoch": 4.66321243523316, + "eval_accuracy": 0.6959798994974874, + "eval_loss": 0.6280229091644287, + "eval_runtime": 23.6268, + "eval_samples_per_second": 16.845, + "eval_steps_per_second": 2.116, + "step": 900 + }, + { + "epoch": 4.668393782383419, + "grad_norm": 7.15625, + "learning_rate": 3.3160621761658032e-06, + "loss": 0.3906, + "step": 901 + }, + { + "epoch": 4.668393782383419, + "eval_accuracy": 0.7010050251256281, + "eval_loss": 0.6277284622192383, + "eval_runtime": 23.5503, + "eval_samples_per_second": 16.9, + "eval_steps_per_second": 2.123, + "step": 901 + }, + { + "epoch": 4.6735751295336785, + "grad_norm": 8.0, + "learning_rate": 3.264248704663213e-06, + "loss": 0.75, + "step": 902 + }, + { + "epoch": 4.6735751295336785, + "eval_accuracy": 0.6984924623115578, + "eval_loss": 0.6281014680862427, + "eval_runtime": 23.5116, + "eval_samples_per_second": 16.928, + "eval_steps_per_second": 2.127, + "step": 902 + }, + { + "epoch": 4.678756476683938, + "grad_norm": 4.78125, + "learning_rate": 3.2124352331606218e-06, + "loss": 0.3457, + "step": 903 + }, + { + "epoch": 4.678756476683938, + "eval_accuracy": 0.6984924623115578, + "eval_loss": 0.6280229091644287, + "eval_runtime": 23.5849, + "eval_samples_per_second": 16.875, + "eval_steps_per_second": 2.12, + "step": 903 + }, + { + "epoch": 4.683937823834197, + "grad_norm": 11.6875, + "learning_rate": 3.1606217616580314e-06, + "loss": 0.8516, + "step": 904 + }, + { + "epoch": 4.683937823834197, + "eval_accuracy": 0.6984924623115578, + "eval_loss": 0.6284351348876953, + "eval_runtime": 23.5906, + "eval_samples_per_second": 16.871, + "eval_steps_per_second": 2.119, + "step": 904 + }, + { + "epoch": 4.689119170984456, + "grad_norm": 8.9375, + "learning_rate": 3.1088082901554407e-06, + "loss": 0.7773, + "step": 905 + }, + { + "epoch": 4.689119170984456, + "eval_accuracy": 0.7010050251256281, + "eval_loss": 0.627630352973938, + "eval_runtime": 23.6405, + "eval_samples_per_second": 16.836, + "eval_steps_per_second": 2.115, + "step": 905 + }, + { + "epoch": 4.694300518134715, + "grad_norm": 10.75, + "learning_rate": 3.05699481865285e-06, + "loss": 0.9453, + "step": 906 + }, + { + "epoch": 4.694300518134715, + "eval_accuracy": 0.6959798994974874, + "eval_loss": 0.6280621886253357, + "eval_runtime": 23.624, + "eval_samples_per_second": 16.847, + "eval_steps_per_second": 2.116, + "step": 906 + }, + { + "epoch": 4.699481865284974, + "grad_norm": 7.96875, + "learning_rate": 3.0051813471502592e-06, + "loss": 0.5117, + "step": 907 + }, + { + "epoch": 4.699481865284974, + "eval_accuracy": 0.6959798994974874, + "eval_loss": 0.6279247999191284, + "eval_runtime": 23.6735, + "eval_samples_per_second": 16.812, + "eval_steps_per_second": 2.112, + "step": 907 + }, + { + "epoch": 4.704663212435233, + "grad_norm": 5.0625, + "learning_rate": 2.9533678756476685e-06, + "loss": 0.4219, + "step": 908 + }, + { + "epoch": 4.704663212435233, + "eval_accuracy": 0.7010050251256281, + "eval_loss": 0.6274732947349548, + "eval_runtime": 23.632, + "eval_samples_per_second": 16.842, + "eval_steps_per_second": 2.116, + "step": 908 + }, + { + "epoch": 4.709844559585492, + "grad_norm": 8.125, + "learning_rate": 2.9015544041450778e-06, + "loss": 0.5469, + "step": 909 + }, + { + "epoch": 4.709844559585492, + "eval_accuracy": 0.7010050251256281, + "eval_loss": 0.6273947954177856, + "eval_runtime": 23.6291, + "eval_samples_per_second": 16.844, + "eval_steps_per_second": 2.116, + "step": 909 + }, + { + "epoch": 4.715025906735751, + "grad_norm": 7.4375, + "learning_rate": 2.8497409326424875e-06, + "loss": 0.6328, + "step": 910 + }, + { + "epoch": 4.715025906735751, + "eval_accuracy": 0.7060301507537688, + "eval_loss": 0.6275125741958618, + "eval_runtime": 23.6683, + "eval_samples_per_second": 16.816, + "eval_steps_per_second": 2.113, + "step": 910 + }, + { + "epoch": 4.72020725388601, + "grad_norm": 10.625, + "learning_rate": 2.7979274611398967e-06, + "loss": 0.5469, + "step": 911 + }, + { + "epoch": 4.72020725388601, + "eval_accuracy": 0.6959798994974874, + "eval_loss": 0.627630352973938, + "eval_runtime": 23.62, + "eval_samples_per_second": 16.85, + "eval_steps_per_second": 2.117, + "step": 911 + }, + { + "epoch": 4.725388601036269, + "grad_norm": 7.34375, + "learning_rate": 2.746113989637306e-06, + "loss": 0.6523, + "step": 912 + }, + { + "epoch": 4.725388601036269, + "eval_accuracy": 0.7035175879396985, + "eval_loss": 0.627591073513031, + "eval_runtime": 23.661, + "eval_samples_per_second": 16.821, + "eval_steps_per_second": 2.113, + "step": 912 + }, + { + "epoch": 4.730569948186528, + "grad_norm": 7.4375, + "learning_rate": 2.6943005181347152e-06, + "loss": 0.5352, + "step": 913 + }, + { + "epoch": 4.730569948186528, + "eval_accuracy": 0.7035175879396985, + "eval_loss": 0.6278266310691833, + "eval_runtime": 23.6245, + "eval_samples_per_second": 16.847, + "eval_steps_per_second": 2.116, + "step": 913 + }, + { + "epoch": 4.7357512953367875, + "grad_norm": 6.90625, + "learning_rate": 2.6424870466321245e-06, + "loss": 0.625, + "step": 914 + }, + { + "epoch": 4.7357512953367875, + "eval_accuracy": 0.7010050251256281, + "eval_loss": 0.6273359060287476, + "eval_runtime": 23.6624, + "eval_samples_per_second": 16.82, + "eval_steps_per_second": 2.113, + "step": 914 + }, + { + "epoch": 4.740932642487047, + "grad_norm": 7.1875, + "learning_rate": 2.5906735751295338e-06, + "loss": 0.6055, + "step": 915 + }, + { + "epoch": 4.740932642487047, + "eval_accuracy": 0.6984924623115578, + "eval_loss": 0.6275125741958618, + "eval_runtime": 23.6263, + "eval_samples_per_second": 16.846, + "eval_steps_per_second": 2.116, + "step": 915 + }, + { + "epoch": 4.746113989637306, + "grad_norm": 8.6875, + "learning_rate": 2.538860103626943e-06, + "loss": 0.6406, + "step": 916 + }, + { + "epoch": 4.746113989637306, + "eval_accuracy": 0.6984924623115578, + "eval_loss": 0.6279051303863525, + "eval_runtime": 23.6726, + "eval_samples_per_second": 16.813, + "eval_steps_per_second": 2.112, + "step": 916 + }, + { + "epoch": 4.751295336787565, + "grad_norm": 9.25, + "learning_rate": 2.4870466321243523e-06, + "loss": 0.7031, + "step": 917 + }, + { + "epoch": 4.751295336787565, + "eval_accuracy": 0.7060301507537688, + "eval_loss": 0.6280229091644287, + "eval_runtime": 23.6294, + "eval_samples_per_second": 16.843, + "eval_steps_per_second": 2.116, + "step": 917 + }, + { + "epoch": 4.756476683937824, + "grad_norm": 6.78125, + "learning_rate": 2.4352331606217616e-06, + "loss": 0.5898, + "step": 918 + }, + { + "epoch": 4.756476683937824, + "eval_accuracy": 0.7010050251256281, + "eval_loss": 0.6279444098472595, + "eval_runtime": 23.6345, + "eval_samples_per_second": 16.84, + "eval_steps_per_second": 2.116, + "step": 918 + }, + { + "epoch": 4.761658031088083, + "grad_norm": 6.6875, + "learning_rate": 2.383419689119171e-06, + "loss": 0.5664, + "step": 919 + }, + { + "epoch": 4.761658031088083, + "eval_accuracy": 0.7010050251256281, + "eval_loss": 0.6274536848068237, + "eval_runtime": 23.6426, + "eval_samples_per_second": 16.834, + "eval_steps_per_second": 2.115, + "step": 919 + }, + { + "epoch": 4.766839378238342, + "grad_norm": 4.28125, + "learning_rate": 2.3316062176165805e-06, + "loss": 0.2383, + "step": 920 + }, + { + "epoch": 4.766839378238342, + "eval_accuracy": 0.7010050251256281, + "eval_loss": 0.6276695728302002, + "eval_runtime": 23.6831, + "eval_samples_per_second": 16.805, + "eval_steps_per_second": 2.111, + "step": 920 + }, + { + "epoch": 4.772020725388601, + "grad_norm": 11.5625, + "learning_rate": 2.27979274611399e-06, + "loss": 0.7461, + "step": 921 + }, + { + "epoch": 4.772020725388601, + "eval_accuracy": 0.6934673366834171, + "eval_loss": 0.6282192468643188, + "eval_runtime": 23.6432, + "eval_samples_per_second": 16.834, + "eval_steps_per_second": 2.115, + "step": 921 + }, + { + "epoch": 4.77720207253886, + "grad_norm": 6.8125, + "learning_rate": 2.227979274611399e-06, + "loss": 0.4531, + "step": 922 + }, + { + "epoch": 4.77720207253886, + "eval_accuracy": 0.6984924623115578, + "eval_loss": 0.6280817985534668, + "eval_runtime": 23.6887, + "eval_samples_per_second": 16.801, + "eval_steps_per_second": 2.111, + "step": 922 + }, + { + "epoch": 4.782383419689119, + "grad_norm": 6.09375, + "learning_rate": 2.1761658031088083e-06, + "loss": 0.4727, + "step": 923 + }, + { + "epoch": 4.782383419689119, + "eval_accuracy": 0.6984924623115578, + "eval_loss": 0.6276106834411621, + "eval_runtime": 23.6415, + "eval_samples_per_second": 16.835, + "eval_steps_per_second": 2.115, + "step": 923 + }, + { + "epoch": 4.787564766839378, + "grad_norm": 5.875, + "learning_rate": 2.1243523316062176e-06, + "loss": 0.459, + "step": 924 + }, + { + "epoch": 4.787564766839378, + "eval_accuracy": 0.6984924623115578, + "eval_loss": 0.6279836893081665, + "eval_runtime": 23.6741, + "eval_samples_per_second": 16.812, + "eval_steps_per_second": 2.112, + "step": 924 + }, + { + "epoch": 4.7927461139896375, + "grad_norm": 6.8125, + "learning_rate": 2.0725388601036273e-06, + "loss": 0.5352, + "step": 925 + }, + { + "epoch": 4.7927461139896375, + "eval_accuracy": 0.6934673366834171, + "eval_loss": 0.6277481317520142, + "eval_runtime": 23.6462, + "eval_samples_per_second": 16.831, + "eval_steps_per_second": 2.115, + "step": 925 + }, + { + "epoch": 4.7979274611398965, + "grad_norm": 7.8125, + "learning_rate": 2.0207253886010365e-06, + "loss": 0.5547, + "step": 926 + }, + { + "epoch": 4.7979274611398965, + "eval_accuracy": 0.7010050251256281, + "eval_loss": 0.6277088522911072, + "eval_runtime": 23.7041, + "eval_samples_per_second": 16.79, + "eval_steps_per_second": 2.109, + "step": 926 + }, + { + "epoch": 4.803108808290156, + "grad_norm": 9.375, + "learning_rate": 1.968911917098446e-06, + "loss": 0.7344, + "step": 927 + }, + { + "epoch": 4.803108808290156, + "eval_accuracy": 0.7085427135678392, + "eval_loss": 0.627551794052124, + "eval_runtime": 23.6692, + "eval_samples_per_second": 16.815, + "eval_steps_per_second": 2.112, + "step": 927 + }, + { + "epoch": 4.808290155440415, + "grad_norm": 5.84375, + "learning_rate": 1.917098445595855e-06, + "loss": 0.3691, + "step": 928 + }, + { + "epoch": 4.808290155440415, + "eval_accuracy": 0.6959798994974874, + "eval_loss": 0.627630352973938, + "eval_runtime": 23.7052, + "eval_samples_per_second": 16.79, + "eval_steps_per_second": 2.109, + "step": 928 + }, + { + "epoch": 4.813471502590674, + "grad_norm": 8.875, + "learning_rate": 1.8652849740932641e-06, + "loss": 0.8242, + "step": 929 + }, + { + "epoch": 4.813471502590674, + "eval_accuracy": 0.6984924623115578, + "eval_loss": 0.6281210780143738, + "eval_runtime": 23.6552, + "eval_samples_per_second": 16.825, + "eval_steps_per_second": 2.114, + "step": 929 + }, + { + "epoch": 4.818652849740933, + "grad_norm": 5.9375, + "learning_rate": 1.8134715025906736e-06, + "loss": 0.3906, + "step": 930 + }, + { + "epoch": 4.818652849740933, + "eval_accuracy": 0.6934673366834171, + "eval_loss": 0.6278070211410522, + "eval_runtime": 23.638, + "eval_samples_per_second": 16.837, + "eval_steps_per_second": 2.115, + "step": 930 + }, + { + "epoch": 4.823834196891192, + "grad_norm": 6.625, + "learning_rate": 1.7616580310880829e-06, + "loss": 0.4297, + "step": 931 + }, + { + "epoch": 4.823834196891192, + "eval_accuracy": 0.6959798994974874, + "eval_loss": 0.6281406879425049, + "eval_runtime": 23.639, + "eval_samples_per_second": 16.837, + "eval_steps_per_second": 2.115, + "step": 931 + }, + { + "epoch": 4.829015544041451, + "grad_norm": 6.5625, + "learning_rate": 1.7098445595854921e-06, + "loss": 0.4883, + "step": 932 + }, + { + "epoch": 4.829015544041451, + "eval_accuracy": 0.7010050251256281, + "eval_loss": 0.6272966265678406, + "eval_runtime": 23.6309, + "eval_samples_per_second": 16.842, + "eval_steps_per_second": 2.116, + "step": 932 + }, + { + "epoch": 4.83419689119171, + "grad_norm": 5.90625, + "learning_rate": 1.6580310880829016e-06, + "loss": 0.5352, + "step": 933 + }, + { + "epoch": 4.83419689119171, + "eval_accuracy": 0.6984924623115578, + "eval_loss": 0.627630352973938, + "eval_runtime": 23.637, + "eval_samples_per_second": 16.838, + "eval_steps_per_second": 2.115, + "step": 933 + }, + { + "epoch": 4.839378238341969, + "grad_norm": 5.96875, + "learning_rate": 1.6062176165803109e-06, + "loss": 0.459, + "step": 934 + }, + { + "epoch": 4.839378238341969, + "eval_accuracy": 0.6984924623115578, + "eval_loss": 0.6285725235939026, + "eval_runtime": 23.6689, + "eval_samples_per_second": 16.815, + "eval_steps_per_second": 2.112, + "step": 934 + }, + { + "epoch": 4.844559585492228, + "grad_norm": 5.0, + "learning_rate": 1.5544041450777204e-06, + "loss": 0.2891, + "step": 935 + }, + { + "epoch": 4.844559585492228, + "eval_accuracy": 0.6959798994974874, + "eval_loss": 0.6277873516082764, + "eval_runtime": 23.6843, + "eval_samples_per_second": 16.804, + "eval_steps_per_second": 2.111, + "step": 935 + }, + { + "epoch": 4.849740932642487, + "grad_norm": 7.1875, + "learning_rate": 1.5025906735751296e-06, + "loss": 0.5703, + "step": 936 + }, + { + "epoch": 4.849740932642487, + "eval_accuracy": 0.6984924623115578, + "eval_loss": 0.6276695728302002, + "eval_runtime": 23.6818, + "eval_samples_per_second": 16.806, + "eval_steps_per_second": 2.111, + "step": 936 + }, + { + "epoch": 4.8549222797927465, + "grad_norm": 7.625, + "learning_rate": 1.4507772020725389e-06, + "loss": 0.7188, + "step": 937 + }, + { + "epoch": 4.8549222797927465, + "eval_accuracy": 0.6984924623115578, + "eval_loss": 0.6277088522911072, + "eval_runtime": 23.6866, + "eval_samples_per_second": 16.803, + "eval_steps_per_second": 2.111, + "step": 937 + }, + { + "epoch": 4.860103626943005, + "grad_norm": 5.46875, + "learning_rate": 1.3989637305699484e-06, + "loss": 0.3574, + "step": 938 + }, + { + "epoch": 4.860103626943005, + "eval_accuracy": 0.7035175879396985, + "eval_loss": 0.627551794052124, + "eval_runtime": 23.6855, + "eval_samples_per_second": 16.804, + "eval_steps_per_second": 2.111, + "step": 938 + }, + { + "epoch": 4.865284974093264, + "grad_norm": 9.8125, + "learning_rate": 1.3471502590673576e-06, + "loss": 0.8359, + "step": 939 + }, + { + "epoch": 4.865284974093264, + "eval_accuracy": 0.6959798994974874, + "eval_loss": 0.627591073513031, + "eval_runtime": 23.6361, + "eval_samples_per_second": 16.839, + "eval_steps_per_second": 2.115, + "step": 939 + }, + { + "epoch": 4.870466321243523, + "grad_norm": 10.4375, + "learning_rate": 1.2953367875647669e-06, + "loss": 0.8125, + "step": 940 + }, + { + "epoch": 4.870466321243523, + "eval_accuracy": 0.6959798994974874, + "eval_loss": 0.6271985173225403, + "eval_runtime": 23.6464, + "eval_samples_per_second": 16.831, + "eval_steps_per_second": 2.114, + "step": 940 + }, + { + "epoch": 4.875647668393782, + "grad_norm": 11.1875, + "learning_rate": 1.2435233160621762e-06, + "loss": 1.0547, + "step": 941 + }, + { + "epoch": 4.875647668393782, + "eval_accuracy": 0.6984924623115578, + "eval_loss": 0.6285136342048645, + "eval_runtime": 23.689, + "eval_samples_per_second": 16.801, + "eval_steps_per_second": 2.111, + "step": 941 + }, + { + "epoch": 4.880829015544041, + "grad_norm": 9.0625, + "learning_rate": 1.1917098445595854e-06, + "loss": 0.7656, + "step": 942 + }, + { + "epoch": 4.880829015544041, + "eval_accuracy": 0.6984924623115578, + "eval_loss": 0.6273162961006165, + "eval_runtime": 23.6559, + "eval_samples_per_second": 16.825, + "eval_steps_per_second": 2.114, + "step": 942 + }, + { + "epoch": 4.8860103626943, + "grad_norm": 10.8125, + "learning_rate": 1.139896373056995e-06, + "loss": 0.793, + "step": 943 + }, + { + "epoch": 4.8860103626943, + "eval_accuracy": 0.6984924623115578, + "eval_loss": 0.6278266310691833, + "eval_runtime": 23.7034, + "eval_samples_per_second": 16.791, + "eval_steps_per_second": 2.109, + "step": 943 + }, + { + "epoch": 4.891191709844559, + "grad_norm": 7.5625, + "learning_rate": 1.0880829015544042e-06, + "loss": 0.6719, + "step": 944 + }, + { + "epoch": 4.891191709844559, + "eval_accuracy": 0.6984924623115578, + "eval_loss": 0.6280621886253357, + "eval_runtime": 23.7062, + "eval_samples_per_second": 16.789, + "eval_steps_per_second": 2.109, + "step": 944 + }, + { + "epoch": 4.896373056994818, + "grad_norm": 9.6875, + "learning_rate": 1.0362694300518136e-06, + "loss": 0.8164, + "step": 945 + }, + { + "epoch": 4.896373056994818, + "eval_accuracy": 0.7010050251256281, + "eval_loss": 0.6277284622192383, + "eval_runtime": 23.6569, + "eval_samples_per_second": 16.824, + "eval_steps_per_second": 2.114, + "step": 945 + }, + { + "epoch": 4.901554404145077, + "grad_norm": 8.9375, + "learning_rate": 9.84455958549223e-07, + "loss": 0.7578, + "step": 946 + }, + { + "epoch": 4.901554404145077, + "eval_accuracy": 0.6984924623115578, + "eval_loss": 0.6278462409973145, + "eval_runtime": 23.6412, + "eval_samples_per_second": 16.835, + "eval_steps_per_second": 2.115, + "step": 946 + }, + { + "epoch": 4.9067357512953365, + "grad_norm": 12.375, + "learning_rate": 9.326424870466321e-07, + "loss": 1.0703, + "step": 947 + }, + { + "epoch": 4.9067357512953365, + "eval_accuracy": 0.6984924623115578, + "eval_loss": 0.627630352973938, + "eval_runtime": 23.6762, + "eval_samples_per_second": 16.81, + "eval_steps_per_second": 2.112, + "step": 947 + }, + { + "epoch": 4.9119170984455955, + "grad_norm": 6.9375, + "learning_rate": 8.808290155440414e-07, + "loss": 0.6523, + "step": 948 + }, + { + "epoch": 4.9119170984455955, + "eval_accuracy": 0.7010050251256281, + "eval_loss": 0.6280817985534668, + "eval_runtime": 23.6229, + "eval_samples_per_second": 16.848, + "eval_steps_per_second": 2.117, + "step": 948 + }, + { + "epoch": 4.917098445595855, + "grad_norm": 5.6875, + "learning_rate": 8.290155440414508e-07, + "loss": 0.3516, + "step": 949 + }, + { + "epoch": 4.917098445595855, + "eval_accuracy": 0.6959798994974874, + "eval_loss": 0.6279836893081665, + "eval_runtime": 23.6614, + "eval_samples_per_second": 16.821, + "eval_steps_per_second": 2.113, + "step": 949 + }, + { + "epoch": 4.922279792746114, + "grad_norm": 8.0, + "learning_rate": 7.772020725388602e-07, + "loss": 0.7148, + "step": 950 + }, + { + "epoch": 4.922279792746114, + "eval_accuracy": 0.6984924623115578, + "eval_loss": 0.6275321841239929, + "eval_runtime": 23.6141, + "eval_samples_per_second": 16.854, + "eval_steps_per_second": 2.117, + "step": 950 + } + ], + "logging_steps": 1, + "max_steps": 965, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 50, + "total_flos": 0.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}