| { |
| "best_metric": 0.2697894275188446, |
| "best_model_checkpoint": "/mlspeech/data/yoadsnapir/models/heb_small_exp_1/checkpoint-3300", |
| "epoch": 1.966626936829559, |
| "eval_steps": 150, |
| "global_step": 3300, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.011918951132300357, |
| "grad_norm": 0.46440812945365906, |
| "learning_rate": 1.0000000000000002e-06, |
| "loss": 0.0799, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.023837902264600714, |
| "grad_norm": 0.3614272475242615, |
| "learning_rate": 2.0000000000000003e-06, |
| "loss": 0.0757, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.03575685339690107, |
| "grad_norm": 0.3306252658367157, |
| "learning_rate": 3e-06, |
| "loss": 0.0707, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.04767580452920143, |
| "grad_norm": 0.3750213384628296, |
| "learning_rate": 4.000000000000001e-06, |
| "loss": 0.0707, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.05959475566150179, |
| "grad_norm": 0.3395947515964508, |
| "learning_rate": 5e-06, |
| "loss": 0.0677, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.07151370679380215, |
| "grad_norm": 0.3402997553348541, |
| "learning_rate": 6e-06, |
| "loss": 0.0651, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.08343265792610251, |
| "grad_norm": 0.36161497235298157, |
| "learning_rate": 7e-06, |
| "loss": 0.0662, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.08939213349225268, |
| "eval_loss": 0.4555220901966095, |
| "eval_runtime": 32.3927, |
| "eval_samples_per_second": 86.439, |
| "eval_steps_per_second": 1.358, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.09535160905840286, |
| "grad_norm": 0.35821646451950073, |
| "learning_rate": 8.000000000000001e-06, |
| "loss": 0.0659, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.10727056019070322, |
| "grad_norm": 0.3810754716396332, |
| "learning_rate": 9e-06, |
| "loss": 0.0632, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.11918951132300358, |
| "grad_norm": 0.35852399468421936, |
| "learning_rate": 1e-05, |
| "loss": 0.0636, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.13110846245530394, |
| "grad_norm": 0.33330053091049194, |
| "learning_rate": 1.1000000000000001e-05, |
| "loss": 0.0632, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.1430274135876043, |
| "grad_norm": 0.3282977044582367, |
| "learning_rate": 1.2e-05, |
| "loss": 0.0617, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.15494636471990464, |
| "grad_norm": 0.32969963550567627, |
| "learning_rate": 1.3000000000000001e-05, |
| "loss": 0.0609, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.16686531585220502, |
| "grad_norm": 0.33057352900505066, |
| "learning_rate": 1.4e-05, |
| "loss": 0.0606, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.17878426698450536, |
| "grad_norm": 0.3497098982334137, |
| "learning_rate": 1.5000000000000002e-05, |
| "loss": 0.0598, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.17878426698450536, |
| "eval_loss": 0.4085025489330292, |
| "eval_runtime": 24.3982, |
| "eval_samples_per_second": 114.763, |
| "eval_steps_per_second": 1.803, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.1907032181168057, |
| "grad_norm": 0.34013646841049194, |
| "learning_rate": 1.6000000000000003e-05, |
| "loss": 0.0586, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.2026221692491061, |
| "grad_norm": 0.3030904233455658, |
| "learning_rate": 1.7e-05, |
| "loss": 0.0588, |
| "step": 340 |
| }, |
| { |
| "epoch": 0.21454112038140644, |
| "grad_norm": 0.3328603208065033, |
| "learning_rate": 1.8e-05, |
| "loss": 0.0562, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.22646007151370678, |
| "grad_norm": 0.32149162888526917, |
| "learning_rate": 1.9e-05, |
| "loss": 0.0559, |
| "step": 380 |
| }, |
| { |
| "epoch": 0.23837902264600716, |
| "grad_norm": 0.2991398572921753, |
| "learning_rate": 2e-05, |
| "loss": 0.0553, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.25029797377830754, |
| "grad_norm": 0.30644193291664124, |
| "learning_rate": 2.1000000000000002e-05, |
| "loss": 0.0556, |
| "step": 420 |
| }, |
| { |
| "epoch": 0.2622169249106079, |
| "grad_norm": 0.357546329498291, |
| "learning_rate": 2.2000000000000003e-05, |
| "loss": 0.0546, |
| "step": 440 |
| }, |
| { |
| "epoch": 0.26817640047675806, |
| "eval_loss": 0.38544225692749023, |
| "eval_runtime": 24.3319, |
| "eval_samples_per_second": 115.075, |
| "eval_steps_per_second": 1.808, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.27413587604290823, |
| "grad_norm": 0.3578217923641205, |
| "learning_rate": 2.3e-05, |
| "loss": 0.054, |
| "step": 460 |
| }, |
| { |
| "epoch": 0.2860548271752086, |
| "grad_norm": 0.34381797909736633, |
| "learning_rate": 2.4e-05, |
| "loss": 0.0553, |
| "step": 480 |
| }, |
| { |
| "epoch": 0.29797377830750893, |
| "grad_norm": 0.30176016688346863, |
| "learning_rate": 2.5e-05, |
| "loss": 0.0543, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.3098927294398093, |
| "grad_norm": 0.33324921131134033, |
| "learning_rate": 2.6000000000000002e-05, |
| "loss": 0.0544, |
| "step": 520 |
| }, |
| { |
| "epoch": 0.3218116805721097, |
| "grad_norm": 0.3484705090522766, |
| "learning_rate": 2.7000000000000002e-05, |
| "loss": 0.0531, |
| "step": 540 |
| }, |
| { |
| "epoch": 0.33373063170441003, |
| "grad_norm": 0.3282860219478607, |
| "learning_rate": 2.8e-05, |
| "loss": 0.0518, |
| "step": 560 |
| }, |
| { |
| "epoch": 0.3456495828367104, |
| "grad_norm": 0.3473268449306488, |
| "learning_rate": 2.9e-05, |
| "loss": 0.0515, |
| "step": 580 |
| }, |
| { |
| "epoch": 0.3575685339690107, |
| "grad_norm": 0.3255419433116913, |
| "learning_rate": 3.0000000000000004e-05, |
| "loss": 0.0526, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.3575685339690107, |
| "eval_loss": 0.36782801151275635, |
| "eval_runtime": 24.3536, |
| "eval_samples_per_second": 114.973, |
| "eval_steps_per_second": 1.807, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.3694874851013111, |
| "grad_norm": 0.3105810880661011, |
| "learning_rate": 3.1e-05, |
| "loss": 0.0505, |
| "step": 620 |
| }, |
| { |
| "epoch": 0.3814064362336114, |
| "grad_norm": 0.3272826373577118, |
| "learning_rate": 3.2000000000000005e-05, |
| "loss": 0.0514, |
| "step": 640 |
| }, |
| { |
| "epoch": 0.3933253873659118, |
| "grad_norm": 0.2896602749824524, |
| "learning_rate": 3.3e-05, |
| "loss": 0.0509, |
| "step": 660 |
| }, |
| { |
| "epoch": 0.4052443384982122, |
| "grad_norm": 0.2664831876754761, |
| "learning_rate": 3.4e-05, |
| "loss": 0.0508, |
| "step": 680 |
| }, |
| { |
| "epoch": 0.4171632896305125, |
| "grad_norm": 0.29491451382637024, |
| "learning_rate": 3.5000000000000004e-05, |
| "loss": 0.0503, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.42908224076281287, |
| "grad_norm": 0.3337947726249695, |
| "learning_rate": 3.6e-05, |
| "loss": 0.0508, |
| "step": 720 |
| }, |
| { |
| "epoch": 0.4410011918951132, |
| "grad_norm": 0.274883508682251, |
| "learning_rate": 3.7000000000000005e-05, |
| "loss": 0.0491, |
| "step": 740 |
| }, |
| { |
| "epoch": 0.4469606674612634, |
| "eval_loss": 0.3556332290172577, |
| "eval_runtime": 24.3629, |
| "eval_samples_per_second": 114.929, |
| "eval_steps_per_second": 1.806, |
| "step": 750 |
| }, |
| { |
| "epoch": 0.45292014302741357, |
| "grad_norm": 0.3304573595523834, |
| "learning_rate": 3.8e-05, |
| "loss": 0.0496, |
| "step": 760 |
| }, |
| { |
| "epoch": 0.464839094159714, |
| "grad_norm": 0.43425601720809937, |
| "learning_rate": 3.9e-05, |
| "loss": 0.0514, |
| "step": 780 |
| }, |
| { |
| "epoch": 0.4767580452920143, |
| "grad_norm": 0.27023833990097046, |
| "learning_rate": 4e-05, |
| "loss": 0.0493, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.48867699642431467, |
| "grad_norm": 0.30646392703056335, |
| "learning_rate": 3.981105337742088e-05, |
| "loss": 0.0479, |
| "step": 820 |
| }, |
| { |
| "epoch": 0.5005959475566151, |
| "grad_norm": 0.2886641025543213, |
| "learning_rate": 3.9622106754841764e-05, |
| "loss": 0.0477, |
| "step": 840 |
| }, |
| { |
| "epoch": 0.5125148986889154, |
| "grad_norm": 0.2744874358177185, |
| "learning_rate": 3.943316013226264e-05, |
| "loss": 0.0475, |
| "step": 860 |
| }, |
| { |
| "epoch": 0.5244338498212158, |
| "grad_norm": 0.3189797103404999, |
| "learning_rate": 3.924421350968352e-05, |
| "loss": 0.0487, |
| "step": 880 |
| }, |
| { |
| "epoch": 0.5363528009535161, |
| "grad_norm": 0.2827907204627991, |
| "learning_rate": 3.9055266887104394e-05, |
| "loss": 0.047, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.5363528009535161, |
| "eval_loss": 0.3409503102302551, |
| "eval_runtime": 24.4692, |
| "eval_samples_per_second": 114.43, |
| "eval_steps_per_second": 1.798, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.5482717520858165, |
| "grad_norm": 0.29098019003868103, |
| "learning_rate": 3.886632026452528e-05, |
| "loss": 0.0477, |
| "step": 920 |
| }, |
| { |
| "epoch": 0.5601907032181168, |
| "grad_norm": 0.34882616996765137, |
| "learning_rate": 3.8677373641946155e-05, |
| "loss": 0.0469, |
| "step": 940 |
| }, |
| { |
| "epoch": 0.5721096543504172, |
| "grad_norm": 0.27143335342407227, |
| "learning_rate": 3.848842701936703e-05, |
| "loss": 0.0463, |
| "step": 960 |
| }, |
| { |
| "epoch": 0.5840286054827175, |
| "grad_norm": 0.3700932562351227, |
| "learning_rate": 3.829948039678791e-05, |
| "loss": 0.0471, |
| "step": 980 |
| }, |
| { |
| "epoch": 0.5959475566150179, |
| "grad_norm": 0.2656194567680359, |
| "learning_rate": 3.811053377420879e-05, |
| "loss": 0.0435, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.6078665077473182, |
| "grad_norm": 0.3158736526966095, |
| "learning_rate": 3.792158715162967e-05, |
| "loss": 0.0456, |
| "step": 1020 |
| }, |
| { |
| "epoch": 0.6197854588796186, |
| "grad_norm": 0.3060922920703888, |
| "learning_rate": 3.7732640529050546e-05, |
| "loss": 0.0464, |
| "step": 1040 |
| }, |
| { |
| "epoch": 0.6257449344457687, |
| "eval_loss": 0.32704514265060425, |
| "eval_runtime": 24.2942, |
| "eval_samples_per_second": 115.254, |
| "eval_steps_per_second": 1.811, |
| "step": 1050 |
| }, |
| { |
| "epoch": 0.6317044100119189, |
| "grad_norm": 0.284631609916687, |
| "learning_rate": 3.754369390647142e-05, |
| "loss": 0.0436, |
| "step": 1060 |
| }, |
| { |
| "epoch": 0.6436233611442194, |
| "grad_norm": 0.2965853810310364, |
| "learning_rate": 3.7354747283892307e-05, |
| "loss": 0.0451, |
| "step": 1080 |
| }, |
| { |
| "epoch": 0.6555423122765197, |
| "grad_norm": 0.2642371952533722, |
| "learning_rate": 3.7165800661313183e-05, |
| "loss": 0.0447, |
| "step": 1100 |
| }, |
| { |
| "epoch": 0.6674612634088201, |
| "grad_norm": 0.2913525402545929, |
| "learning_rate": 3.697685403873406e-05, |
| "loss": 0.0438, |
| "step": 1120 |
| }, |
| { |
| "epoch": 0.6793802145411204, |
| "grad_norm": 0.9295551776885986, |
| "learning_rate": 3.678790741615494e-05, |
| "loss": 0.0454, |
| "step": 1140 |
| }, |
| { |
| "epoch": 0.6912991656734208, |
| "grad_norm": 0.2802795469760895, |
| "learning_rate": 3.659896079357582e-05, |
| "loss": 0.0437, |
| "step": 1160 |
| }, |
| { |
| "epoch": 0.7032181168057211, |
| "grad_norm": 0.2768346965312958, |
| "learning_rate": 3.64100141709967e-05, |
| "loss": 0.0436, |
| "step": 1180 |
| }, |
| { |
| "epoch": 0.7151370679380215, |
| "grad_norm": 0.32342103123664856, |
| "learning_rate": 3.6221067548417575e-05, |
| "loss": 0.0434, |
| "step": 1200 |
| }, |
| { |
| "epoch": 0.7151370679380215, |
| "eval_loss": 0.3151066303253174, |
| "eval_runtime": 24.3395, |
| "eval_samples_per_second": 115.039, |
| "eval_steps_per_second": 1.808, |
| "step": 1200 |
| }, |
| { |
| "epoch": 0.7270560190703218, |
| "grad_norm": 0.28825289011001587, |
| "learning_rate": 3.603212092583845e-05, |
| "loss": 0.0444, |
| "step": 1220 |
| }, |
| { |
| "epoch": 0.7389749702026222, |
| "grad_norm": 0.2973019778728485, |
| "learning_rate": 3.5843174303259335e-05, |
| "loss": 0.0432, |
| "step": 1240 |
| }, |
| { |
| "epoch": 0.7508939213349225, |
| "grad_norm": 0.28604206442832947, |
| "learning_rate": 3.565422768068021e-05, |
| "loss": 0.0427, |
| "step": 1260 |
| }, |
| { |
| "epoch": 0.7628128724672228, |
| "grad_norm": 0.3076627552509308, |
| "learning_rate": 3.546528105810109e-05, |
| "loss": 0.0437, |
| "step": 1280 |
| }, |
| { |
| "epoch": 0.7747318235995232, |
| "grad_norm": 0.24243097007274628, |
| "learning_rate": 3.5276334435521966e-05, |
| "loss": 0.0419, |
| "step": 1300 |
| }, |
| { |
| "epoch": 0.7866507747318237, |
| "grad_norm": 0.2624588906764984, |
| "learning_rate": 3.508738781294285e-05, |
| "loss": 0.041, |
| "step": 1320 |
| }, |
| { |
| "epoch": 0.798569725864124, |
| "grad_norm": 0.2966582477092743, |
| "learning_rate": 3.4898441190363726e-05, |
| "loss": 0.0419, |
| "step": 1340 |
| }, |
| { |
| "epoch": 0.8045292014302742, |
| "eval_loss": 0.30741235613822937, |
| "eval_runtime": 24.3327, |
| "eval_samples_per_second": 115.071, |
| "eval_steps_per_second": 1.808, |
| "step": 1350 |
| }, |
| { |
| "epoch": 0.8104886769964244, |
| "grad_norm": 0.2586236894130707, |
| "learning_rate": 3.47094945677846e-05, |
| "loss": 0.0417, |
| "step": 1360 |
| }, |
| { |
| "epoch": 0.8224076281287247, |
| "grad_norm": 0.26168954372406006, |
| "learning_rate": 3.452054794520548e-05, |
| "loss": 0.0416, |
| "step": 1380 |
| }, |
| { |
| "epoch": 0.834326579261025, |
| "grad_norm": 0.2288641631603241, |
| "learning_rate": 3.4331601322626364e-05, |
| "loss": 0.0407, |
| "step": 1400 |
| }, |
| { |
| "epoch": 0.8462455303933254, |
| "grad_norm": 0.239312082529068, |
| "learning_rate": 3.414265470004724e-05, |
| "loss": 0.0424, |
| "step": 1420 |
| }, |
| { |
| "epoch": 0.8581644815256257, |
| "grad_norm": 0.2796385884284973, |
| "learning_rate": 3.395370807746812e-05, |
| "loss": 0.0423, |
| "step": 1440 |
| }, |
| { |
| "epoch": 0.8700834326579261, |
| "grad_norm": 0.26898354291915894, |
| "learning_rate": 3.3764761454888994e-05, |
| "loss": 0.0404, |
| "step": 1460 |
| }, |
| { |
| "epoch": 0.8820023837902264, |
| "grad_norm": 0.2980300188064575, |
| "learning_rate": 3.357581483230988e-05, |
| "loss": 0.0413, |
| "step": 1480 |
| }, |
| { |
| "epoch": 0.8939213349225268, |
| "grad_norm": 0.2609230577945709, |
| "learning_rate": 3.3386868209730755e-05, |
| "loss": 0.0417, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.8939213349225268, |
| "eval_loss": 0.3016362488269806, |
| "eval_runtime": 24.39, |
| "eval_samples_per_second": 114.801, |
| "eval_steps_per_second": 1.804, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.9058402860548271, |
| "grad_norm": 0.22545361518859863, |
| "learning_rate": 3.319792158715163e-05, |
| "loss": 0.0402, |
| "step": 1520 |
| }, |
| { |
| "epoch": 0.9177592371871275, |
| "grad_norm": 0.23952241241931915, |
| "learning_rate": 3.300897496457251e-05, |
| "loss": 0.0404, |
| "step": 1540 |
| }, |
| { |
| "epoch": 0.929678188319428, |
| "grad_norm": 0.211136594414711, |
| "learning_rate": 3.282002834199339e-05, |
| "loss": 0.0394, |
| "step": 1560 |
| }, |
| { |
| "epoch": 0.9415971394517283, |
| "grad_norm": 0.2786746323108673, |
| "learning_rate": 3.263108171941427e-05, |
| "loss": 0.0405, |
| "step": 1580 |
| }, |
| { |
| "epoch": 0.9535160905840286, |
| "grad_norm": 0.27551010251045227, |
| "learning_rate": 3.2442135096835146e-05, |
| "loss": 0.0393, |
| "step": 1600 |
| }, |
| { |
| "epoch": 0.965435041716329, |
| "grad_norm": 0.23502199351787567, |
| "learning_rate": 3.225318847425602e-05, |
| "loss": 0.0397, |
| "step": 1620 |
| }, |
| { |
| "epoch": 0.9773539928486293, |
| "grad_norm": 0.22266072034835815, |
| "learning_rate": 3.2064241851676906e-05, |
| "loss": 0.0382, |
| "step": 1640 |
| }, |
| { |
| "epoch": 0.9833134684147795, |
| "eval_loss": 0.2927956283092499, |
| "eval_runtime": 24.5022, |
| "eval_samples_per_second": 114.276, |
| "eval_steps_per_second": 1.796, |
| "step": 1650 |
| }, |
| { |
| "epoch": 0.9892729439809297, |
| "grad_norm": 0.2455359548330307, |
| "learning_rate": 3.187529522909778e-05, |
| "loss": 0.0377, |
| "step": 1660 |
| }, |
| { |
| "epoch": 1.0011918951132301, |
| "grad_norm": 0.19770461320877075, |
| "learning_rate": 3.168634860651866e-05, |
| "loss": 0.0394, |
| "step": 1680 |
| }, |
| { |
| "epoch": 1.0131108462455305, |
| "grad_norm": 0.23039540648460388, |
| "learning_rate": 3.149740198393954e-05, |
| "loss": 0.0306, |
| "step": 1700 |
| }, |
| { |
| "epoch": 1.0250297973778308, |
| "grad_norm": 0.22617976367473602, |
| "learning_rate": 3.130845536136042e-05, |
| "loss": 0.0302, |
| "step": 1720 |
| }, |
| { |
| "epoch": 1.0369487485101312, |
| "grad_norm": 0.2306171953678131, |
| "learning_rate": 3.11195087387813e-05, |
| "loss": 0.0297, |
| "step": 1740 |
| }, |
| { |
| "epoch": 1.0488676996424315, |
| "grad_norm": 0.2231408953666687, |
| "learning_rate": 3.0930562116202174e-05, |
| "loss": 0.0291, |
| "step": 1760 |
| }, |
| { |
| "epoch": 1.0607866507747319, |
| "grad_norm": 0.2368210107088089, |
| "learning_rate": 3.074161549362305e-05, |
| "loss": 0.0296, |
| "step": 1780 |
| }, |
| { |
| "epoch": 1.0727056019070322, |
| "grad_norm": 0.2105601727962494, |
| "learning_rate": 3.0552668871043935e-05, |
| "loss": 0.0302, |
| "step": 1800 |
| }, |
| { |
| "epoch": 1.0727056019070322, |
| "eval_loss": 0.2897118330001831, |
| "eval_runtime": 24.5143, |
| "eval_samples_per_second": 114.219, |
| "eval_steps_per_second": 1.795, |
| "step": 1800 |
| }, |
| { |
| "epoch": 1.0846245530393326, |
| "grad_norm": 0.21115803718566895, |
| "learning_rate": 3.036372224846481e-05, |
| "loss": 0.03, |
| "step": 1820 |
| }, |
| { |
| "epoch": 1.096543504171633, |
| "grad_norm": 0.21346606314182281, |
| "learning_rate": 3.017477562588569e-05, |
| "loss": 0.0303, |
| "step": 1840 |
| }, |
| { |
| "epoch": 1.1084624553039333, |
| "grad_norm": 0.2169450968503952, |
| "learning_rate": 2.998582900330657e-05, |
| "loss": 0.0301, |
| "step": 1860 |
| }, |
| { |
| "epoch": 1.1203814064362336, |
| "grad_norm": 0.22653773427009583, |
| "learning_rate": 2.979688238072745e-05, |
| "loss": 0.03, |
| "step": 1880 |
| }, |
| { |
| "epoch": 1.132300357568534, |
| "grad_norm": 0.23219868540763855, |
| "learning_rate": 2.9607935758148326e-05, |
| "loss": 0.0299, |
| "step": 1900 |
| }, |
| { |
| "epoch": 1.1442193087008343, |
| "grad_norm": 0.19858673214912415, |
| "learning_rate": 2.9418989135569203e-05, |
| "loss": 0.0304, |
| "step": 1920 |
| }, |
| { |
| "epoch": 1.1561382598331347, |
| "grad_norm": 0.20563630759716034, |
| "learning_rate": 2.9230042512990083e-05, |
| "loss": 0.0298, |
| "step": 1940 |
| }, |
| { |
| "epoch": 1.162097735399285, |
| "eval_loss": 0.2881970703601837, |
| "eval_runtime": 24.482, |
| "eval_samples_per_second": 114.37, |
| "eval_steps_per_second": 1.797, |
| "step": 1950 |
| }, |
| { |
| "epoch": 1.168057210965435, |
| "grad_norm": 0.23171384632587433, |
| "learning_rate": 2.9041095890410963e-05, |
| "loss": 0.0305, |
| "step": 1960 |
| }, |
| { |
| "epoch": 1.1799761620977354, |
| "grad_norm": 0.22927935421466827, |
| "learning_rate": 2.885214926783184e-05, |
| "loss": 0.0293, |
| "step": 1980 |
| }, |
| { |
| "epoch": 1.1918951132300357, |
| "grad_norm": 0.22874480485916138, |
| "learning_rate": 2.8663202645252717e-05, |
| "loss": 0.0306, |
| "step": 2000 |
| }, |
| { |
| "epoch": 1.203814064362336, |
| "grad_norm": 0.2067805975675583, |
| "learning_rate": 2.8474256022673597e-05, |
| "loss": 0.0301, |
| "step": 2020 |
| }, |
| { |
| "epoch": 1.2157330154946364, |
| "grad_norm": 0.2245558649301529, |
| "learning_rate": 2.8285309400094477e-05, |
| "loss": 0.032, |
| "step": 2040 |
| }, |
| { |
| "epoch": 1.2276519666269368, |
| "grad_norm": 0.2403639554977417, |
| "learning_rate": 2.8096362777515354e-05, |
| "loss": 0.0296, |
| "step": 2060 |
| }, |
| { |
| "epoch": 1.2395709177592371, |
| "grad_norm": 0.2070910781621933, |
| "learning_rate": 2.790741615493623e-05, |
| "loss": 0.03, |
| "step": 2080 |
| }, |
| { |
| "epoch": 1.2514898688915377, |
| "grad_norm": 0.23112566769123077, |
| "learning_rate": 2.771846953235711e-05, |
| "loss": 0.0295, |
| "step": 2100 |
| }, |
| { |
| "epoch": 1.2514898688915377, |
| "eval_loss": 0.2860635817050934, |
| "eval_runtime": 24.4598, |
| "eval_samples_per_second": 114.473, |
| "eval_steps_per_second": 1.799, |
| "step": 2100 |
| }, |
| { |
| "epoch": 1.2634088200238378, |
| "grad_norm": 0.2321086972951889, |
| "learning_rate": 2.7529522909777992e-05, |
| "loss": 0.0307, |
| "step": 2120 |
| }, |
| { |
| "epoch": 1.2753277711561384, |
| "grad_norm": 0.23003174364566803, |
| "learning_rate": 2.734057628719887e-05, |
| "loss": 0.0306, |
| "step": 2140 |
| }, |
| { |
| "epoch": 1.2872467222884385, |
| "grad_norm": 0.2210853099822998, |
| "learning_rate": 2.7151629664619745e-05, |
| "loss": 0.0289, |
| "step": 2160 |
| }, |
| { |
| "epoch": 1.299165673420739, |
| "grad_norm": 0.2513985335826874, |
| "learning_rate": 2.6962683042040626e-05, |
| "loss": 0.0298, |
| "step": 2180 |
| }, |
| { |
| "epoch": 1.3110846245530392, |
| "grad_norm": 0.2379380464553833, |
| "learning_rate": 2.6773736419461506e-05, |
| "loss": 0.0301, |
| "step": 2200 |
| }, |
| { |
| "epoch": 1.3230035756853398, |
| "grad_norm": 0.21417196094989777, |
| "learning_rate": 2.6584789796882383e-05, |
| "loss": 0.0301, |
| "step": 2220 |
| }, |
| { |
| "epoch": 1.3349225268176401, |
| "grad_norm": 0.22283975780010223, |
| "learning_rate": 2.639584317430326e-05, |
| "loss": 0.0309, |
| "step": 2240 |
| }, |
| { |
| "epoch": 1.3408820023837902, |
| "eval_loss": 0.2835468649864197, |
| "eval_runtime": 24.4862, |
| "eval_samples_per_second": 114.35, |
| "eval_steps_per_second": 1.797, |
| "step": 2250 |
| }, |
| { |
| "epoch": 1.3468414779499405, |
| "grad_norm": 0.21475766599178314, |
| "learning_rate": 2.620689655172414e-05, |
| "loss": 0.0296, |
| "step": 2260 |
| }, |
| { |
| "epoch": 1.3587604290822408, |
| "grad_norm": 0.2578884959220886, |
| "learning_rate": 2.601794992914502e-05, |
| "loss": 0.0285, |
| "step": 2280 |
| }, |
| { |
| "epoch": 1.3706793802145412, |
| "grad_norm": 0.208306223154068, |
| "learning_rate": 2.5829003306565897e-05, |
| "loss": 0.0296, |
| "step": 2300 |
| }, |
| { |
| "epoch": 1.3825983313468415, |
| "grad_norm": 0.20098654925823212, |
| "learning_rate": 2.5640056683986774e-05, |
| "loss": 0.0299, |
| "step": 2320 |
| }, |
| { |
| "epoch": 1.3945172824791419, |
| "grad_norm": 0.19324277341365814, |
| "learning_rate": 2.5451110061407654e-05, |
| "loss": 0.0299, |
| "step": 2340 |
| }, |
| { |
| "epoch": 1.4064362336114422, |
| "grad_norm": 0.22991597652435303, |
| "learning_rate": 2.5262163438828534e-05, |
| "loss": 0.0283, |
| "step": 2360 |
| }, |
| { |
| "epoch": 1.4183551847437426, |
| "grad_norm": 0.20805244147777557, |
| "learning_rate": 2.507321681624941e-05, |
| "loss": 0.0294, |
| "step": 2380 |
| }, |
| { |
| "epoch": 1.430274135876043, |
| "grad_norm": 0.2217872589826584, |
| "learning_rate": 2.488427019367029e-05, |
| "loss": 0.0298, |
| "step": 2400 |
| }, |
| { |
| "epoch": 1.430274135876043, |
| "eval_loss": 0.28145191073417664, |
| "eval_runtime": 24.4822, |
| "eval_samples_per_second": 114.369, |
| "eval_steps_per_second": 1.797, |
| "step": 2400 |
| }, |
| { |
| "epoch": 1.4421930870083433, |
| "grad_norm": 0.2886907756328583, |
| "learning_rate": 2.469532357109117e-05, |
| "loss": 0.0286, |
| "step": 2420 |
| }, |
| { |
| "epoch": 1.4541120381406436, |
| "grad_norm": 0.22498397529125214, |
| "learning_rate": 2.450637694851205e-05, |
| "loss": 0.0303, |
| "step": 2440 |
| }, |
| { |
| "epoch": 1.466030989272944, |
| "grad_norm": 0.2244011014699936, |
| "learning_rate": 2.4317430325932926e-05, |
| "loss": 0.0291, |
| "step": 2460 |
| }, |
| { |
| "epoch": 1.4779499404052443, |
| "grad_norm": 0.254245400428772, |
| "learning_rate": 2.4128483703353806e-05, |
| "loss": 0.0291, |
| "step": 2480 |
| }, |
| { |
| "epoch": 1.4898688915375446, |
| "grad_norm": 0.23943567276000977, |
| "learning_rate": 2.3939537080774683e-05, |
| "loss": 0.0279, |
| "step": 2500 |
| }, |
| { |
| "epoch": 1.5017878426698452, |
| "grad_norm": 0.19281241297721863, |
| "learning_rate": 2.3750590458195563e-05, |
| "loss": 0.0284, |
| "step": 2520 |
| }, |
| { |
| "epoch": 1.5137067938021453, |
| "grad_norm": 0.1942477971315384, |
| "learning_rate": 2.356164383561644e-05, |
| "loss": 0.0286, |
| "step": 2540 |
| }, |
| { |
| "epoch": 1.5196662693682956, |
| "eval_loss": 0.2797168493270874, |
| "eval_runtime": 24.5526, |
| "eval_samples_per_second": 114.041, |
| "eval_steps_per_second": 1.792, |
| "step": 2550 |
| }, |
| { |
| "epoch": 1.525625744934446, |
| "grad_norm": 0.21547091007232666, |
| "learning_rate": 2.337269721303732e-05, |
| "loss": 0.0299, |
| "step": 2560 |
| }, |
| { |
| "epoch": 1.537544696066746, |
| "grad_norm": 0.2152203619480133, |
| "learning_rate": 2.3183750590458197e-05, |
| "loss": 0.0285, |
| "step": 2580 |
| }, |
| { |
| "epoch": 1.5494636471990466, |
| "grad_norm": 0.19925089180469513, |
| "learning_rate": 2.2994803967879077e-05, |
| "loss": 0.0282, |
| "step": 2600 |
| }, |
| { |
| "epoch": 1.5613825983313467, |
| "grad_norm": 0.21512266993522644, |
| "learning_rate": 2.2805857345299954e-05, |
| "loss": 0.0291, |
| "step": 2620 |
| }, |
| { |
| "epoch": 1.5733015494636473, |
| "grad_norm": 0.2275344282388687, |
| "learning_rate": 2.2616910722720834e-05, |
| "loss": 0.0288, |
| "step": 2640 |
| }, |
| { |
| "epoch": 1.5852205005959474, |
| "grad_norm": 0.22899560630321503, |
| "learning_rate": 2.242796410014171e-05, |
| "loss": 0.0279, |
| "step": 2660 |
| }, |
| { |
| "epoch": 1.597139451728248, |
| "grad_norm": 0.21439722180366516, |
| "learning_rate": 2.223901747756259e-05, |
| "loss": 0.0273, |
| "step": 2680 |
| }, |
| { |
| "epoch": 1.6090584028605481, |
| "grad_norm": 0.21393275260925293, |
| "learning_rate": 2.205007085498347e-05, |
| "loss": 0.0284, |
| "step": 2700 |
| }, |
| { |
| "epoch": 1.6090584028605481, |
| "eval_loss": 0.27715668082237244, |
| "eval_runtime": 24.544, |
| "eval_samples_per_second": 114.081, |
| "eval_steps_per_second": 1.793, |
| "step": 2700 |
| }, |
| { |
| "epoch": 1.6209773539928487, |
| "grad_norm": 0.22286593914031982, |
| "learning_rate": 2.186112423240435e-05, |
| "loss": 0.028, |
| "step": 2720 |
| }, |
| { |
| "epoch": 1.6328963051251488, |
| "grad_norm": 0.21552002429962158, |
| "learning_rate": 2.1672177609825225e-05, |
| "loss": 0.0282, |
| "step": 2740 |
| }, |
| { |
| "epoch": 1.6448152562574494, |
| "grad_norm": 0.20231053233146667, |
| "learning_rate": 2.1483230987246106e-05, |
| "loss": 0.0278, |
| "step": 2760 |
| }, |
| { |
| "epoch": 1.6567342073897497, |
| "grad_norm": 0.20446668565273285, |
| "learning_rate": 2.1294284364666983e-05, |
| "loss": 0.0283, |
| "step": 2780 |
| }, |
| { |
| "epoch": 1.66865315852205, |
| "grad_norm": 0.21102942526340485, |
| "learning_rate": 2.1105337742087863e-05, |
| "loss": 0.029, |
| "step": 2800 |
| }, |
| { |
| "epoch": 1.6805721096543504, |
| "grad_norm": 0.190469890832901, |
| "learning_rate": 2.091639111950874e-05, |
| "loss": 0.0286, |
| "step": 2820 |
| }, |
| { |
| "epoch": 1.6924910607866508, |
| "grad_norm": 0.2115412801504135, |
| "learning_rate": 2.072744449692962e-05, |
| "loss": 0.0278, |
| "step": 2840 |
| }, |
| { |
| "epoch": 1.698450536352801, |
| "eval_loss": 0.2754322588443756, |
| "eval_runtime": 24.6377, |
| "eval_samples_per_second": 113.647, |
| "eval_steps_per_second": 1.786, |
| "step": 2850 |
| }, |
| { |
| "epoch": 1.7044100119189511, |
| "grad_norm": 0.18578040599822998, |
| "learning_rate": 2.0538497874350497e-05, |
| "loss": 0.0284, |
| "step": 2860 |
| }, |
| { |
| "epoch": 1.7163289630512515, |
| "grad_norm": 0.23163530230522156, |
| "learning_rate": 2.0349551251771377e-05, |
| "loss": 0.0277, |
| "step": 2880 |
| }, |
| { |
| "epoch": 1.7282479141835518, |
| "grad_norm": 0.2035950869321823, |
| "learning_rate": 2.0160604629192254e-05, |
| "loss": 0.0275, |
| "step": 2900 |
| }, |
| { |
| "epoch": 1.7401668653158522, |
| "grad_norm": 0.24992471933364868, |
| "learning_rate": 1.9971658006613134e-05, |
| "loss": 0.028, |
| "step": 2920 |
| }, |
| { |
| "epoch": 1.7520858164481525, |
| "grad_norm": 0.20510628819465637, |
| "learning_rate": 1.978271138403401e-05, |
| "loss": 0.027, |
| "step": 2940 |
| }, |
| { |
| "epoch": 1.7640047675804529, |
| "grad_norm": 0.216608926653862, |
| "learning_rate": 1.959376476145489e-05, |
| "loss": 0.0283, |
| "step": 2960 |
| }, |
| { |
| "epoch": 1.7759237187127532, |
| "grad_norm": 0.20540915429592133, |
| "learning_rate": 1.9404818138875768e-05, |
| "loss": 0.0274, |
| "step": 2980 |
| }, |
| { |
| "epoch": 1.7878426698450536, |
| "grad_norm": 0.20191486179828644, |
| "learning_rate": 1.921587151629665e-05, |
| "loss": 0.0275, |
| "step": 3000 |
| }, |
| { |
| "epoch": 1.7878426698450536, |
| "eval_loss": 0.27235740423202515, |
| "eval_runtime": 24.612, |
| "eval_samples_per_second": 113.766, |
| "eval_steps_per_second": 1.788, |
| "step": 3000 |
| }, |
| { |
| "epoch": 1.7997616209773541, |
| "grad_norm": 0.24276046454906464, |
| "learning_rate": 1.9026924893717525e-05, |
| "loss": 0.028, |
| "step": 3020 |
| }, |
| { |
| "epoch": 1.8116805721096543, |
| "grad_norm": 0.21864300966262817, |
| "learning_rate": 1.8837978271138406e-05, |
| "loss": 0.0279, |
| "step": 3040 |
| }, |
| { |
| "epoch": 1.8235995232419548, |
| "grad_norm": 0.2271438091993332, |
| "learning_rate": 1.8649031648559282e-05, |
| "loss": 0.0276, |
| "step": 3060 |
| }, |
| { |
| "epoch": 1.835518474374255, |
| "grad_norm": 0.20855475962162018, |
| "learning_rate": 1.8460085025980163e-05, |
| "loss": 0.0283, |
| "step": 3080 |
| }, |
| { |
| "epoch": 1.8474374255065555, |
| "grad_norm": 0.2064722329378128, |
| "learning_rate": 1.827113840340104e-05, |
| "loss": 0.0271, |
| "step": 3100 |
| }, |
| { |
| "epoch": 1.8593563766388557, |
| "grad_norm": 0.2111743986606598, |
| "learning_rate": 1.808219178082192e-05, |
| "loss": 0.0273, |
| "step": 3120 |
| }, |
| { |
| "epoch": 1.8712753277711562, |
| "grad_norm": 0.18417872488498688, |
| "learning_rate": 1.7893245158242797e-05, |
| "loss": 0.0275, |
| "step": 3140 |
| }, |
| { |
| "epoch": 1.8772348033373063, |
| "eval_loss": 0.2706995904445648, |
| "eval_runtime": 24.5215, |
| "eval_samples_per_second": 114.186, |
| "eval_steps_per_second": 1.794, |
| "step": 3150 |
| }, |
| { |
| "epoch": 1.8831942789034564, |
| "grad_norm": 0.19550646841526031, |
| "learning_rate": 1.7704298535663677e-05, |
| "loss": 0.0275, |
| "step": 3160 |
| }, |
| { |
| "epoch": 1.895113230035757, |
| "grad_norm": 0.19912759959697723, |
| "learning_rate": 1.7515351913084554e-05, |
| "loss": 0.0267, |
| "step": 3180 |
| }, |
| { |
| "epoch": 1.907032181168057, |
| "grad_norm": 0.20110997557640076, |
| "learning_rate": 1.7326405290505434e-05, |
| "loss": 0.0281, |
| "step": 3200 |
| }, |
| { |
| "epoch": 1.9189511323003576, |
| "grad_norm": 0.20968247950077057, |
| "learning_rate": 1.713745866792631e-05, |
| "loss": 0.0271, |
| "step": 3220 |
| }, |
| { |
| "epoch": 1.930870083432658, |
| "grad_norm": 0.20576246082782745, |
| "learning_rate": 1.694851204534719e-05, |
| "loss": 0.0277, |
| "step": 3240 |
| }, |
| { |
| "epoch": 1.9427890345649583, |
| "grad_norm": 0.21499717235565186, |
| "learning_rate": 1.6759565422768068e-05, |
| "loss": 0.0264, |
| "step": 3260 |
| }, |
| { |
| "epoch": 1.9547079856972587, |
| "grad_norm": 0.23822776973247528, |
| "learning_rate": 1.6570618800188948e-05, |
| "loss": 0.027, |
| "step": 3280 |
| }, |
| { |
| "epoch": 1.966626936829559, |
| "grad_norm": 0.21049529314041138, |
| "learning_rate": 1.6381672177609825e-05, |
| "loss": 0.0259, |
| "step": 3300 |
| }, |
| { |
| "epoch": 1.966626936829559, |
| "eval_loss": 0.2697894275188446, |
| "eval_runtime": 24.6235, |
| "eval_samples_per_second": 113.713, |
| "eval_steps_per_second": 1.787, |
| "step": 3300 |
| } |
| ], |
| "logging_steps": 20, |
| "max_steps": 5034, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 3, |
| "save_steps": 300, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 1.2189847399207797e+20, |
| "train_batch_size": 16, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|