| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 3.0, |
| "eval_steps": 5624, |
| "global_step": 16872, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.0, |
| "grad_norm": 64.30284881591797, |
| "learning_rate": 2.9620853080568726e-09, |
| "loss": 3.7905, |
| "step": 1 |
| }, |
| { |
| "epoch": 0.01, |
| "grad_norm": 74.81798553466797, |
| "learning_rate": 9.478672985781992e-08, |
| "loss": 4.4137, |
| "step": 32 |
| }, |
| { |
| "epoch": 0.01, |
| "grad_norm": 101.4776840209961, |
| "learning_rate": 1.8957345971563984e-07, |
| "loss": 4.2954, |
| "step": 64 |
| }, |
| { |
| "epoch": 0.02, |
| "grad_norm": 68.84349822998047, |
| "learning_rate": 2.843601895734597e-07, |
| "loss": 3.376, |
| "step": 96 |
| }, |
| { |
| "epoch": 0.02, |
| "grad_norm": 42.47146224975586, |
| "learning_rate": 3.791469194312797e-07, |
| "loss": 1.962, |
| "step": 128 |
| }, |
| { |
| "epoch": 0.03, |
| "grad_norm": 46.63043212890625, |
| "learning_rate": 4.7393364928909956e-07, |
| "loss": 1.168, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.03, |
| "grad_norm": 2.709139347076416, |
| "learning_rate": 5.687203791469194e-07, |
| "loss": 0.4681, |
| "step": 192 |
| }, |
| { |
| "epoch": 0.04, |
| "grad_norm": 3.304001808166504, |
| "learning_rate": 6.635071090047394e-07, |
| "loss": 0.4059, |
| "step": 224 |
| }, |
| { |
| "epoch": 0.05, |
| "grad_norm": 2.5251598358154297, |
| "learning_rate": 7.582938388625594e-07, |
| "loss": 0.3999, |
| "step": 256 |
| }, |
| { |
| "epoch": 0.05, |
| "grad_norm": 1.9685628414154053, |
| "learning_rate": 8.530805687203792e-07, |
| "loss": 0.3904, |
| "step": 288 |
| }, |
| { |
| "epoch": 0.06, |
| "grad_norm": 1.7696113586425781, |
| "learning_rate": 9.478672985781991e-07, |
| "loss": 0.3769, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.06, |
| "grad_norm": 1.8694322109222412, |
| "learning_rate": 1.042654028436019e-06, |
| "loss": 0.3718, |
| "step": 352 |
| }, |
| { |
| "epoch": 0.07, |
| "grad_norm": 1.7985926866531372, |
| "learning_rate": 1.1374407582938388e-06, |
| "loss": 0.3569, |
| "step": 384 |
| }, |
| { |
| "epoch": 0.07, |
| "grad_norm": 1.9462366104125977, |
| "learning_rate": 1.2322274881516587e-06, |
| "loss": 0.3432, |
| "step": 416 |
| }, |
| { |
| "epoch": 0.08, |
| "grad_norm": 1.916548490524292, |
| "learning_rate": 1.3270142180094788e-06, |
| "loss": 0.331, |
| "step": 448 |
| }, |
| { |
| "epoch": 0.09, |
| "grad_norm": 1.2683026790618896, |
| "learning_rate": 1.4218009478672987e-06, |
| "loss": 0.334, |
| "step": 480 |
| }, |
| { |
| "epoch": 0.09, |
| "grad_norm": 1.4642925262451172, |
| "learning_rate": 1.5165876777251187e-06, |
| "loss": 0.3386, |
| "step": 512 |
| }, |
| { |
| "epoch": 0.1, |
| "grad_norm": 1.3890858888626099, |
| "learning_rate": 1.6113744075829384e-06, |
| "loss": 0.3073, |
| "step": 544 |
| }, |
| { |
| "epoch": 0.1, |
| "grad_norm": 1.4565709829330444, |
| "learning_rate": 1.7061611374407585e-06, |
| "loss": 0.3263, |
| "step": 576 |
| }, |
| { |
| "epoch": 0.11, |
| "grad_norm": 1.349706768989563, |
| "learning_rate": 1.8009478672985784e-06, |
| "loss": 0.3165, |
| "step": 608 |
| }, |
| { |
| "epoch": 0.11, |
| "grad_norm": 1.7937098741531372, |
| "learning_rate": 1.8957345971563982e-06, |
| "loss": 0.3052, |
| "step": 640 |
| }, |
| { |
| "epoch": 0.12, |
| "grad_norm": 1.616604208946228, |
| "learning_rate": 1.990521327014218e-06, |
| "loss": 0.3138, |
| "step": 672 |
| }, |
| { |
| "epoch": 0.13, |
| "grad_norm": 1.3185547590255737, |
| "learning_rate": 2.085308056872038e-06, |
| "loss": 0.297, |
| "step": 704 |
| }, |
| { |
| "epoch": 0.13, |
| "grad_norm": 1.8201861381530762, |
| "learning_rate": 2.180094786729858e-06, |
| "loss": 0.3072, |
| "step": 736 |
| }, |
| { |
| "epoch": 0.14, |
| "grad_norm": 1.1981289386749268, |
| "learning_rate": 2.2748815165876777e-06, |
| "loss": 0.2957, |
| "step": 768 |
| }, |
| { |
| "epoch": 0.14, |
| "grad_norm": 1.7006583213806152, |
| "learning_rate": 2.369668246445498e-06, |
| "loss": 0.2941, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.15, |
| "grad_norm": 2.1674447059631348, |
| "learning_rate": 2.4644549763033174e-06, |
| "loss": 0.2798, |
| "step": 832 |
| }, |
| { |
| "epoch": 0.15, |
| "grad_norm": 1.3364813327789307, |
| "learning_rate": 2.5592417061611373e-06, |
| "loss": 0.2996, |
| "step": 864 |
| }, |
| { |
| "epoch": 0.16, |
| "grad_norm": 1.2685174942016602, |
| "learning_rate": 2.6540284360189576e-06, |
| "loss": 0.3027, |
| "step": 896 |
| }, |
| { |
| "epoch": 0.17, |
| "grad_norm": 1.3952556848526, |
| "learning_rate": 2.7488151658767775e-06, |
| "loss": 0.2985, |
| "step": 928 |
| }, |
| { |
| "epoch": 0.17, |
| "grad_norm": 1.251714825630188, |
| "learning_rate": 2.8436018957345973e-06, |
| "loss": 0.2905, |
| "step": 960 |
| }, |
| { |
| "epoch": 0.18, |
| "grad_norm": 1.553152322769165, |
| "learning_rate": 2.938388625592417e-06, |
| "loss": 0.278, |
| "step": 992 |
| }, |
| { |
| "epoch": 0.18, |
| "grad_norm": 1.380920171737671, |
| "learning_rate": 3.0331753554502375e-06, |
| "loss": 0.2813, |
| "step": 1024 |
| }, |
| { |
| "epoch": 0.19, |
| "grad_norm": 1.5351643562316895, |
| "learning_rate": 3.1279620853080574e-06, |
| "loss": 0.2805, |
| "step": 1056 |
| }, |
| { |
| "epoch": 0.19, |
| "grad_norm": 1.4867887496948242, |
| "learning_rate": 3.222748815165877e-06, |
| "loss": 0.2767, |
| "step": 1088 |
| }, |
| { |
| "epoch": 0.2, |
| "grad_norm": 1.317229986190796, |
| "learning_rate": 3.3175355450236967e-06, |
| "loss": 0.2859, |
| "step": 1120 |
| }, |
| { |
| "epoch": 0.2, |
| "grad_norm": 1.8770791292190552, |
| "learning_rate": 3.412322274881517e-06, |
| "loss": 0.2875, |
| "step": 1152 |
| }, |
| { |
| "epoch": 0.21, |
| "grad_norm": 1.4476697444915771, |
| "learning_rate": 3.507109004739337e-06, |
| "loss": 0.2884, |
| "step": 1184 |
| }, |
| { |
| "epoch": 0.22, |
| "grad_norm": 1.351965069770813, |
| "learning_rate": 3.6018957345971567e-06, |
| "loss": 0.2802, |
| "step": 1216 |
| }, |
| { |
| "epoch": 0.22, |
| "grad_norm": 1.4647209644317627, |
| "learning_rate": 3.6966824644549766e-06, |
| "loss": 0.2703, |
| "step": 1248 |
| }, |
| { |
| "epoch": 0.23, |
| "grad_norm": 1.901773452758789, |
| "learning_rate": 3.7914691943127964e-06, |
| "loss": 0.2815, |
| "step": 1280 |
| }, |
| { |
| "epoch": 0.23, |
| "grad_norm": 1.139844536781311, |
| "learning_rate": 3.886255924170616e-06, |
| "loss": 0.2658, |
| "step": 1312 |
| }, |
| { |
| "epoch": 0.24, |
| "grad_norm": 1.1863261461257935, |
| "learning_rate": 3.981042654028436e-06, |
| "loss": 0.2707, |
| "step": 1344 |
| }, |
| { |
| "epoch": 0.24, |
| "grad_norm": 1.2720916271209717, |
| "learning_rate": 4.075829383886256e-06, |
| "loss": 0.2646, |
| "step": 1376 |
| }, |
| { |
| "epoch": 0.25, |
| "grad_norm": 1.6161096096038818, |
| "learning_rate": 4.170616113744076e-06, |
| "loss": 0.2748, |
| "step": 1408 |
| }, |
| { |
| "epoch": 0.26, |
| "grad_norm": 1.4303381443023682, |
| "learning_rate": 4.265402843601897e-06, |
| "loss": 0.2691, |
| "step": 1440 |
| }, |
| { |
| "epoch": 0.26, |
| "grad_norm": 1.401880145072937, |
| "learning_rate": 4.360189573459716e-06, |
| "loss": 0.2699, |
| "step": 1472 |
| }, |
| { |
| "epoch": 0.27, |
| "grad_norm": 1.3495467901229858, |
| "learning_rate": 4.4549763033175355e-06, |
| "loss": 0.2772, |
| "step": 1504 |
| }, |
| { |
| "epoch": 0.27, |
| "grad_norm": 1.3915464878082275, |
| "learning_rate": 4.549763033175355e-06, |
| "loss": 0.2752, |
| "step": 1536 |
| }, |
| { |
| "epoch": 0.28, |
| "grad_norm": 1.3412673473358154, |
| "learning_rate": 4.644549763033176e-06, |
| "loss": 0.2751, |
| "step": 1568 |
| }, |
| { |
| "epoch": 0.28, |
| "grad_norm": 1.3777296543121338, |
| "learning_rate": 4.739336492890996e-06, |
| "loss": 0.2717, |
| "step": 1600 |
| }, |
| { |
| "epoch": 0.29, |
| "grad_norm": 1.2612873315811157, |
| "learning_rate": 4.834123222748816e-06, |
| "loss": 0.2678, |
| "step": 1632 |
| }, |
| { |
| "epoch": 0.3, |
| "grad_norm": 1.2989717721939087, |
| "learning_rate": 4.928909952606635e-06, |
| "loss": 0.2778, |
| "step": 1664 |
| }, |
| { |
| "epoch": 0.3, |
| "grad_norm": 1.3525702953338623, |
| "learning_rate": 4.999996575341721e-06, |
| "loss": 0.2719, |
| "step": 1696 |
| }, |
| { |
| "epoch": 0.31, |
| "grad_norm": 1.4678899049758911, |
| "learning_rate": 4.999914384012144e-06, |
| "loss": 0.2755, |
| "step": 1728 |
| }, |
| { |
| "epoch": 0.31, |
| "grad_norm": 1.2093278169631958, |
| "learning_rate": 4.999722607745944e-06, |
| "loss": 0.2755, |
| "step": 1760 |
| }, |
| { |
| "epoch": 0.32, |
| "grad_norm": 1.4915423393249512, |
| "learning_rate": 4.999421254949728e-06, |
| "loss": 0.2686, |
| "step": 1792 |
| }, |
| { |
| "epoch": 0.32, |
| "grad_norm": 1.1101861000061035, |
| "learning_rate": 4.999010338833436e-06, |
| "loss": 0.2594, |
| "step": 1824 |
| }, |
| { |
| "epoch": 0.33, |
| "grad_norm": 1.3432806730270386, |
| "learning_rate": 4.9984898774097735e-06, |
| "loss": 0.2658, |
| "step": 1856 |
| }, |
| { |
| "epoch": 0.34, |
| "grad_norm": 1.2808105945587158, |
| "learning_rate": 4.997859893493414e-06, |
| "loss": 0.2632, |
| "step": 1888 |
| }, |
| { |
| "epoch": 0.34, |
| "grad_norm": 1.3815045356750488, |
| "learning_rate": 4.997120414700003e-06, |
| "loss": 0.2557, |
| "step": 1920 |
| }, |
| { |
| "epoch": 0.35, |
| "grad_norm": 1.4393643140792847, |
| "learning_rate": 4.996271473444944e-06, |
| "loss": 0.263, |
| "step": 1952 |
| }, |
| { |
| "epoch": 0.35, |
| "grad_norm": 1.138375163078308, |
| "learning_rate": 4.995313106941982e-06, |
| "loss": 0.2805, |
| "step": 1984 |
| }, |
| { |
| "epoch": 0.36, |
| "grad_norm": 1.6412934064865112, |
| "learning_rate": 4.994245357201568e-06, |
| "loss": 0.2641, |
| "step": 2016 |
| }, |
| { |
| "epoch": 0.36, |
| "grad_norm": 1.465922236442566, |
| "learning_rate": 4.9930682710290205e-06, |
| "loss": 0.2637, |
| "step": 2048 |
| }, |
| { |
| "epoch": 0.37, |
| "grad_norm": 1.4526797533035278, |
| "learning_rate": 4.991781900022471e-06, |
| "loss": 0.2596, |
| "step": 2080 |
| }, |
| { |
| "epoch": 0.38, |
| "grad_norm": 1.504759669303894, |
| "learning_rate": 4.990386300570607e-06, |
| "loss": 0.2633, |
| "step": 2112 |
| }, |
| { |
| "epoch": 0.38, |
| "grad_norm": 1.5599263906478882, |
| "learning_rate": 4.988881533850192e-06, |
| "loss": 0.2658, |
| "step": 2144 |
| }, |
| { |
| "epoch": 0.39, |
| "grad_norm": 1.1662814617156982, |
| "learning_rate": 4.987267665823392e-06, |
| "loss": 0.2694, |
| "step": 2176 |
| }, |
| { |
| "epoch": 0.39, |
| "grad_norm": 1.3952819108963013, |
| "learning_rate": 4.98554476723488e-06, |
| "loss": 0.2449, |
| "step": 2208 |
| }, |
| { |
| "epoch": 0.4, |
| "grad_norm": 1.2887946367263794, |
| "learning_rate": 4.983712913608736e-06, |
| "loss": 0.2651, |
| "step": 2240 |
| }, |
| { |
| "epoch": 0.4, |
| "grad_norm": 1.5893690586090088, |
| "learning_rate": 4.981772185245135e-06, |
| "loss": 0.2568, |
| "step": 2272 |
| }, |
| { |
| "epoch": 0.41, |
| "grad_norm": 1.228550672531128, |
| "learning_rate": 4.979722667216829e-06, |
| "loss": 0.2667, |
| "step": 2304 |
| }, |
| { |
| "epoch": 0.42, |
| "grad_norm": 1.2756662368774414, |
| "learning_rate": 4.977564449365415e-06, |
| "loss": 0.2508, |
| "step": 2336 |
| }, |
| { |
| "epoch": 0.42, |
| "grad_norm": 1.5225822925567627, |
| "learning_rate": 4.975297626297399e-06, |
| "loss": 0.2691, |
| "step": 2368 |
| }, |
| { |
| "epoch": 0.43, |
| "grad_norm": 1.2656946182250977, |
| "learning_rate": 4.972922297380052e-06, |
| "loss": 0.2704, |
| "step": 2400 |
| }, |
| { |
| "epoch": 0.43, |
| "grad_norm": 1.3268104791641235, |
| "learning_rate": 4.970438566737043e-06, |
| "loss": 0.2577, |
| "step": 2432 |
| }, |
| { |
| "epoch": 0.44, |
| "grad_norm": 1.5536099672317505, |
| "learning_rate": 4.96784654324389e-06, |
| "loss": 0.2578, |
| "step": 2464 |
| }, |
| { |
| "epoch": 0.44, |
| "grad_norm": 1.1516194343566895, |
| "learning_rate": 4.965146340523175e-06, |
| "loss": 0.2446, |
| "step": 2496 |
| }, |
| { |
| "epoch": 0.45, |
| "grad_norm": 1.1923089027404785, |
| "learning_rate": 4.962338076939569e-06, |
| "loss": 0.2569, |
| "step": 2528 |
| }, |
| { |
| "epoch": 0.46, |
| "grad_norm": 1.124197006225586, |
| "learning_rate": 4.959421875594643e-06, |
| "loss": 0.2625, |
| "step": 2560 |
| }, |
| { |
| "epoch": 0.46, |
| "grad_norm": 1.680388331413269, |
| "learning_rate": 4.95639786432147e-06, |
| "loss": 0.264, |
| "step": 2592 |
| }, |
| { |
| "epoch": 0.47, |
| "grad_norm": 1.3039462566375732, |
| "learning_rate": 4.953266175679023e-06, |
| "loss": 0.2624, |
| "step": 2624 |
| }, |
| { |
| "epoch": 0.47, |
| "grad_norm": 1.109054684638977, |
| "learning_rate": 4.9500269469463655e-06, |
| "loss": 0.2548, |
| "step": 2656 |
| }, |
| { |
| "epoch": 0.48, |
| "grad_norm": 1.2704750299453735, |
| "learning_rate": 4.94668032011663e-06, |
| "loss": 0.2569, |
| "step": 2688 |
| }, |
| { |
| "epoch": 0.48, |
| "grad_norm": 1.1952179670333862, |
| "learning_rate": 4.943226441890794e-06, |
| "loss": 0.2599, |
| "step": 2720 |
| }, |
| { |
| "epoch": 0.49, |
| "grad_norm": 1.2229312658309937, |
| "learning_rate": 4.939665463671255e-06, |
| "loss": 0.2577, |
| "step": 2752 |
| }, |
| { |
| "epoch": 0.5, |
| "grad_norm": 1.3956924676895142, |
| "learning_rate": 4.935997541555188e-06, |
| "loss": 0.2642, |
| "step": 2784 |
| }, |
| { |
| "epoch": 0.5, |
| "grad_norm": 1.116629958152771, |
| "learning_rate": 4.932222836327703e-06, |
| "loss": 0.2587, |
| "step": 2816 |
| }, |
| { |
| "epoch": 0.51, |
| "grad_norm": 1.1389435529708862, |
| "learning_rate": 4.928341513454801e-06, |
| "loss": 0.2566, |
| "step": 2848 |
| }, |
| { |
| "epoch": 0.51, |
| "grad_norm": 1.3800580501556396, |
| "learning_rate": 4.9243537430761155e-06, |
| "loss": 0.2579, |
| "step": 2880 |
| }, |
| { |
| "epoch": 0.52, |
| "grad_norm": 1.3852914571762085, |
| "learning_rate": 4.920259699997461e-06, |
| "loss": 0.2666, |
| "step": 2912 |
| }, |
| { |
| "epoch": 0.52, |
| "grad_norm": 1.31257963180542, |
| "learning_rate": 4.916059563683162e-06, |
| "loss": 0.2547, |
| "step": 2944 |
| }, |
| { |
| "epoch": 0.53, |
| "grad_norm": 1.599116563796997, |
| "learning_rate": 4.911753518248194e-06, |
| "loss": 0.2612, |
| "step": 2976 |
| }, |
| { |
| "epoch": 0.53, |
| "grad_norm": 1.2397140264511108, |
| "learning_rate": 4.907341752450105e-06, |
| "loss": 0.2589, |
| "step": 3008 |
| }, |
| { |
| "epoch": 0.54, |
| "grad_norm": 1.3178327083587646, |
| "learning_rate": 4.9028244596807525e-06, |
| "loss": 0.2605, |
| "step": 3040 |
| }, |
| { |
| "epoch": 0.55, |
| "grad_norm": 1.7413417100906372, |
| "learning_rate": 4.898201837957811e-06, |
| "loss": 0.2565, |
| "step": 3072 |
| }, |
| { |
| "epoch": 0.55, |
| "grad_norm": 1.314085602760315, |
| "learning_rate": 4.893474089916105e-06, |
| "loss": 0.2498, |
| "step": 3104 |
| }, |
| { |
| "epoch": 0.56, |
| "grad_norm": 1.1399492025375366, |
| "learning_rate": 4.888641422798719e-06, |
| "loss": 0.2647, |
| "step": 3136 |
| }, |
| { |
| "epoch": 0.56, |
| "grad_norm": 1.3332985639572144, |
| "learning_rate": 4.883704048447916e-06, |
| "loss": 0.2594, |
| "step": 3168 |
| }, |
| { |
| "epoch": 0.57, |
| "grad_norm": 1.3460063934326172, |
| "learning_rate": 4.87866218329585e-06, |
| "loss": 0.2571, |
| "step": 3200 |
| }, |
| { |
| "epoch": 0.57, |
| "grad_norm": 1.5006327629089355, |
| "learning_rate": 4.87351604835508e-06, |
| "loss": 0.2458, |
| "step": 3232 |
| }, |
| { |
| "epoch": 0.58, |
| "grad_norm": 1.1781283617019653, |
| "learning_rate": 4.868265869208879e-06, |
| "loss": 0.2452, |
| "step": 3264 |
| }, |
| { |
| "epoch": 0.59, |
| "grad_norm": 1.117686152458191, |
| "learning_rate": 4.862911876001348e-06, |
| "loss": 0.2469, |
| "step": 3296 |
| }, |
| { |
| "epoch": 0.59, |
| "grad_norm": 0.9969549775123596, |
| "learning_rate": 4.857454303427328e-06, |
| "loss": 0.2453, |
| "step": 3328 |
| }, |
| { |
| "epoch": 0.6, |
| "grad_norm": 1.4894945621490479, |
| "learning_rate": 4.851893390722109e-06, |
| "loss": 0.2457, |
| "step": 3360 |
| }, |
| { |
| "epoch": 0.6, |
| "grad_norm": 1.106041431427002, |
| "learning_rate": 4.846229381650946e-06, |
| "loss": 0.2474, |
| "step": 3392 |
| }, |
| { |
| "epoch": 0.61, |
| "grad_norm": 1.035601019859314, |
| "learning_rate": 4.840462524498372e-06, |
| "loss": 0.2593, |
| "step": 3424 |
| }, |
| { |
| "epoch": 0.61, |
| "grad_norm": 1.7077690362930298, |
| "learning_rate": 4.834593072057313e-06, |
| "loss": 0.2506, |
| "step": 3456 |
| }, |
| { |
| "epoch": 0.62, |
| "grad_norm": 1.1017436981201172, |
| "learning_rate": 4.8286212816180124e-06, |
| "loss": 0.2506, |
| "step": 3488 |
| }, |
| { |
| "epoch": 0.63, |
| "grad_norm": 1.2720685005187988, |
| "learning_rate": 4.8225474149567434e-06, |
| "loss": 0.2567, |
| "step": 3520 |
| }, |
| { |
| "epoch": 0.63, |
| "grad_norm": 1.328189730644226, |
| "learning_rate": 4.816371738324343e-06, |
| "loss": 0.2531, |
| "step": 3552 |
| }, |
| { |
| "epoch": 0.64, |
| "grad_norm": 1.2597825527191162, |
| "learning_rate": 4.810094522434534e-06, |
| "loss": 0.246, |
| "step": 3584 |
| }, |
| { |
| "epoch": 0.64, |
| "grad_norm": 1.244281530380249, |
| "learning_rate": 4.803716042452063e-06, |
| "loss": 0.2433, |
| "step": 3616 |
| }, |
| { |
| "epoch": 0.65, |
| "grad_norm": 1.4658986330032349, |
| "learning_rate": 4.797236577980634e-06, |
| "loss": 0.2496, |
| "step": 3648 |
| }, |
| { |
| "epoch": 0.65, |
| "grad_norm": 1.4121670722961426, |
| "learning_rate": 4.7906564130506575e-06, |
| "loss": 0.2531, |
| "step": 3680 |
| }, |
| { |
| "epoch": 0.66, |
| "grad_norm": 1.1751240491867065, |
| "learning_rate": 4.783975836106791e-06, |
| "loss": 0.2515, |
| "step": 3712 |
| }, |
| { |
| "epoch": 0.67, |
| "grad_norm": 1.2011898756027222, |
| "learning_rate": 4.777195139995308e-06, |
| "loss": 0.2453, |
| "step": 3744 |
| }, |
| { |
| "epoch": 0.67, |
| "grad_norm": 1.5764689445495605, |
| "learning_rate": 4.770314621951245e-06, |
| "loss": 0.2496, |
| "step": 3776 |
| }, |
| { |
| "epoch": 0.68, |
| "grad_norm": 1.4584077596664429, |
| "learning_rate": 4.763334583585388e-06, |
| "loss": 0.2392, |
| "step": 3808 |
| }, |
| { |
| "epoch": 0.68, |
| "grad_norm": 1.0098185539245605, |
| "learning_rate": 4.756255330871039e-06, |
| "loss": 0.2393, |
| "step": 3840 |
| }, |
| { |
| "epoch": 0.69, |
| "grad_norm": 1.3514459133148193, |
| "learning_rate": 4.749077174130609e-06, |
| "loss": 0.2572, |
| "step": 3872 |
| }, |
| { |
| "epoch": 0.69, |
| "grad_norm": 1.3888107538223267, |
| "learning_rate": 4.741800428022014e-06, |
| "loss": 0.2383, |
| "step": 3904 |
| }, |
| { |
| "epoch": 0.7, |
| "grad_norm": 1.3402737379074097, |
| "learning_rate": 4.734425411524884e-06, |
| "loss": 0.2556, |
| "step": 3936 |
| }, |
| { |
| "epoch": 0.71, |
| "grad_norm": 1.2175307273864746, |
| "learning_rate": 4.726952447926576e-06, |
| "loss": 0.2555, |
| "step": 3968 |
| }, |
| { |
| "epoch": 0.71, |
| "grad_norm": 1.386852502822876, |
| "learning_rate": 4.719381864808005e-06, |
| "loss": 0.2503, |
| "step": 4000 |
| }, |
| { |
| "epoch": 0.72, |
| "grad_norm": 1.2774380445480347, |
| "learning_rate": 4.711713994029284e-06, |
| "loss": 0.2503, |
| "step": 4032 |
| }, |
| { |
| "epoch": 0.72, |
| "grad_norm": 1.177322268486023, |
| "learning_rate": 4.703949171715179e-06, |
| "loss": 0.2574, |
| "step": 4064 |
| }, |
| { |
| "epoch": 0.73, |
| "grad_norm": 1.269942283630371, |
| "learning_rate": 4.69608773824037e-06, |
| "loss": 0.2529, |
| "step": 4096 |
| }, |
| { |
| "epoch": 0.73, |
| "grad_norm": 1.2209409475326538, |
| "learning_rate": 4.688130038214534e-06, |
| "loss": 0.2536, |
| "step": 4128 |
| }, |
| { |
| "epoch": 0.74, |
| "grad_norm": 1.4368942975997925, |
| "learning_rate": 4.6800764204672385e-06, |
| "loss": 0.2378, |
| "step": 4160 |
| }, |
| { |
| "epoch": 0.75, |
| "grad_norm": 1.6493048667907715, |
| "learning_rate": 4.671927238032651e-06, |
| "loss": 0.2538, |
| "step": 4192 |
| }, |
| { |
| "epoch": 0.75, |
| "grad_norm": 1.038549542427063, |
| "learning_rate": 4.6636828481340594e-06, |
| "loss": 0.2501, |
| "step": 4224 |
| }, |
| { |
| "epoch": 0.76, |
| "grad_norm": 1.343204379081726, |
| "learning_rate": 4.655343612168219e-06, |
| "loss": 0.251, |
| "step": 4256 |
| }, |
| { |
| "epoch": 0.76, |
| "grad_norm": 1.4020464420318604, |
| "learning_rate": 4.646909895689508e-06, |
| "loss": 0.2564, |
| "step": 4288 |
| }, |
| { |
| "epoch": 0.77, |
| "grad_norm": 1.1331307888031006, |
| "learning_rate": 4.638382068393899e-06, |
| "loss": 0.2505, |
| "step": 4320 |
| }, |
| { |
| "epoch": 0.77, |
| "grad_norm": 1.3825620412826538, |
| "learning_rate": 4.629760504102761e-06, |
| "loss": 0.2513, |
| "step": 4352 |
| }, |
| { |
| "epoch": 0.78, |
| "grad_norm": 1.310570478439331, |
| "learning_rate": 4.621045580746467e-06, |
| "loss": 0.2464, |
| "step": 4384 |
| }, |
| { |
| "epoch": 0.79, |
| "grad_norm": 1.15547776222229, |
| "learning_rate": 4.61223768034783e-06, |
| "loss": 0.2515, |
| "step": 4416 |
| }, |
| { |
| "epoch": 0.79, |
| "grad_norm": 1.340010404586792, |
| "learning_rate": 4.603337189005354e-06, |
| "loss": 0.2473, |
| "step": 4448 |
| }, |
| { |
| "epoch": 0.8, |
| "grad_norm": 1.2413158416748047, |
| "learning_rate": 4.594344496876313e-06, |
| "loss": 0.2354, |
| "step": 4480 |
| }, |
| { |
| "epoch": 0.8, |
| "grad_norm": 1.2394189834594727, |
| "learning_rate": 4.585259998159646e-06, |
| "loss": 0.2512, |
| "step": 4512 |
| }, |
| { |
| "epoch": 0.81, |
| "grad_norm": 1.2866027355194092, |
| "learning_rate": 4.576084091078677e-06, |
| "loss": 0.2364, |
| "step": 4544 |
| }, |
| { |
| "epoch": 0.81, |
| "grad_norm": 1.1080009937286377, |
| "learning_rate": 4.5668171778636585e-06, |
| "loss": 0.2432, |
| "step": 4576 |
| }, |
| { |
| "epoch": 0.82, |
| "grad_norm": 1.2469310760498047, |
| "learning_rate": 4.5574596647341414e-06, |
| "loss": 0.256, |
| "step": 4608 |
| }, |
| { |
| "epoch": 0.83, |
| "grad_norm": 1.0387507677078247, |
| "learning_rate": 4.548011961881167e-06, |
| "loss": 0.232, |
| "step": 4640 |
| }, |
| { |
| "epoch": 0.83, |
| "grad_norm": 1.2382770776748657, |
| "learning_rate": 4.538474483449286e-06, |
| "loss": 0.2552, |
| "step": 4672 |
| }, |
| { |
| "epoch": 0.84, |
| "grad_norm": 1.2282336950302124, |
| "learning_rate": 4.528847647518403e-06, |
| "loss": 0.2525, |
| "step": 4704 |
| }, |
| { |
| "epoch": 0.84, |
| "grad_norm": 1.4016482830047607, |
| "learning_rate": 4.5191318760854526e-06, |
| "loss": 0.2582, |
| "step": 4736 |
| }, |
| { |
| "epoch": 0.85, |
| "grad_norm": 1.3214083909988403, |
| "learning_rate": 4.509327595045898e-06, |
| "loss": 0.2578, |
| "step": 4768 |
| }, |
| { |
| "epoch": 0.85, |
| "grad_norm": 0.9114232063293457, |
| "learning_rate": 4.499435234175065e-06, |
| "loss": 0.2533, |
| "step": 4800 |
| }, |
| { |
| "epoch": 0.86, |
| "grad_norm": 1.172450065612793, |
| "learning_rate": 4.4894552271093e-06, |
| "loss": 0.264, |
| "step": 4832 |
| }, |
| { |
| "epoch": 0.86, |
| "grad_norm": 1.249770998954773, |
| "learning_rate": 4.4793880113269595e-06, |
| "loss": 0.2389, |
| "step": 4864 |
| }, |
| { |
| "epoch": 0.87, |
| "grad_norm": 1.0912755727767944, |
| "learning_rate": 4.469234028129241e-06, |
| "loss": 0.2456, |
| "step": 4896 |
| }, |
| { |
| "epoch": 0.88, |
| "grad_norm": 1.1503956317901611, |
| "learning_rate": 4.458993722620827e-06, |
| "loss": 0.2562, |
| "step": 4928 |
| }, |
| { |
| "epoch": 0.88, |
| "grad_norm": 1.1564654111862183, |
| "learning_rate": 4.448667543690384e-06, |
| "loss": 0.25, |
| "step": 4960 |
| }, |
| { |
| "epoch": 0.89, |
| "grad_norm": 1.271000862121582, |
| "learning_rate": 4.438255943990879e-06, |
| "loss": 0.243, |
| "step": 4992 |
| }, |
| { |
| "epoch": 0.89, |
| "grad_norm": 1.0601048469543457, |
| "learning_rate": 4.427759379919739e-06, |
| "loss": 0.2397, |
| "step": 5024 |
| }, |
| { |
| "epoch": 0.9, |
| "grad_norm": 1.214858889579773, |
| "learning_rate": 4.417178311598845e-06, |
| "loss": 0.2442, |
| "step": 5056 |
| }, |
| { |
| "epoch": 0.9, |
| "grad_norm": 1.0516908168792725, |
| "learning_rate": 4.406513202854363e-06, |
| "loss": 0.2467, |
| "step": 5088 |
| }, |
| { |
| "epoch": 0.91, |
| "grad_norm": 1.326076865196228, |
| "learning_rate": 4.3957645211964065e-06, |
| "loss": 0.2488, |
| "step": 5120 |
| }, |
| { |
| "epoch": 0.92, |
| "grad_norm": 1.173823356628418, |
| "learning_rate": 4.384932737798554e-06, |
| "loss": 0.241, |
| "step": 5152 |
| }, |
| { |
| "epoch": 0.92, |
| "grad_norm": 1.4526327848434448, |
| "learning_rate": 4.3740183274771845e-06, |
| "loss": 0.2553, |
| "step": 5184 |
| }, |
| { |
| "epoch": 0.93, |
| "grad_norm": 1.2346609830856323, |
| "learning_rate": 4.363021768670668e-06, |
| "loss": 0.242, |
| "step": 5216 |
| }, |
| { |
| "epoch": 0.93, |
| "grad_norm": 0.8957495093345642, |
| "learning_rate": 4.351943543418392e-06, |
| "loss": 0.2444, |
| "step": 5248 |
| }, |
| { |
| "epoch": 0.94, |
| "grad_norm": 1.097772479057312, |
| "learning_rate": 4.340784137339632e-06, |
| "loss": 0.2531, |
| "step": 5280 |
| }, |
| { |
| "epoch": 0.94, |
| "grad_norm": 1.1537779569625854, |
| "learning_rate": 4.329544039612264e-06, |
| "loss": 0.2507, |
| "step": 5312 |
| }, |
| { |
| "epoch": 0.95, |
| "grad_norm": 1.1922253370285034, |
| "learning_rate": 4.318223742951321e-06, |
| "loss": 0.2335, |
| "step": 5344 |
| }, |
| { |
| "epoch": 0.96, |
| "grad_norm": 1.1036819219589233, |
| "learning_rate": 4.306823743587394e-06, |
| "loss": 0.2465, |
| "step": 5376 |
| }, |
| { |
| "epoch": 0.96, |
| "grad_norm": 1.229779839515686, |
| "learning_rate": 4.295344541244879e-06, |
| "loss": 0.2403, |
| "step": 5408 |
| }, |
| { |
| "epoch": 0.97, |
| "grad_norm": 1.4036519527435303, |
| "learning_rate": 4.283786639120074e-06, |
| "loss": 0.254, |
| "step": 5440 |
| }, |
| { |
| "epoch": 0.97, |
| "grad_norm": 0.9732062816619873, |
| "learning_rate": 4.272150543859117e-06, |
| "loss": 0.2517, |
| "step": 5472 |
| }, |
| { |
| "epoch": 0.98, |
| "grad_norm": 1.3309801816940308, |
| "learning_rate": 4.260436765535784e-06, |
| "loss": 0.25, |
| "step": 5504 |
| }, |
| { |
| "epoch": 0.98, |
| "grad_norm": 1.3353493213653564, |
| "learning_rate": 4.2486458176291176e-06, |
| "loss": 0.2482, |
| "step": 5536 |
| }, |
| { |
| "epoch": 0.99, |
| "grad_norm": 1.6585358381271362, |
| "learning_rate": 4.236778217000934e-06, |
| "loss": 0.248, |
| "step": 5568 |
| }, |
| { |
| "epoch": 1.0, |
| "grad_norm": 0.9717461466789246, |
| "learning_rate": 4.224834483873152e-06, |
| "loss": 0.2366, |
| "step": 5600 |
| }, |
| { |
| "epoch": 1.0, |
| "grad_norm": 0.9571962356567383, |
| "learning_rate": 4.2128151418049976e-06, |
| "loss": 0.2404, |
| "step": 5632 |
| }, |
| { |
| "epoch": 1.01, |
| "grad_norm": 1.0692377090454102, |
| "learning_rate": 4.200720717670048e-06, |
| "loss": 0.2135, |
| "step": 5664 |
| }, |
| { |
| "epoch": 1.01, |
| "grad_norm": 1.1159001588821411, |
| "learning_rate": 4.188551741633144e-06, |
| "loss": 0.1854, |
| "step": 5696 |
| }, |
| { |
| "epoch": 1.02, |
| "grad_norm": 1.4514949321746826, |
| "learning_rate": 4.176308747127136e-06, |
| "loss": 0.2095, |
| "step": 5728 |
| }, |
| { |
| "epoch": 1.02, |
| "grad_norm": 1.4603676795959473, |
| "learning_rate": 4.1639922708295176e-06, |
| "loss": 0.2015, |
| "step": 5760 |
| }, |
| { |
| "epoch": 1.03, |
| "grad_norm": 1.1802875995635986, |
| "learning_rate": 4.151602852638888e-06, |
| "loss": 0.222, |
| "step": 5792 |
| }, |
| { |
| "epoch": 1.04, |
| "grad_norm": 1.2036052942276, |
| "learning_rate": 4.139141035651288e-06, |
| "loss": 0.2093, |
| "step": 5824 |
| }, |
| { |
| "epoch": 1.04, |
| "grad_norm": 1.1690653562545776, |
| "learning_rate": 4.126607366136395e-06, |
| "loss": 0.1925, |
| "step": 5856 |
| }, |
| { |
| "epoch": 1.05, |
| "grad_norm": 0.9996016621589661, |
| "learning_rate": 4.114002393513577e-06, |
| "loss": 0.206, |
| "step": 5888 |
| }, |
| { |
| "epoch": 1.05, |
| "grad_norm": 1.1670773029327393, |
| "learning_rate": 4.101326670327807e-06, |
| "loss": 0.2097, |
| "step": 5920 |
| }, |
| { |
| "epoch": 1.06, |
| "grad_norm": 0.8733654022216797, |
| "learning_rate": 4.0885807522254435e-06, |
| "loss": 0.2015, |
| "step": 5952 |
| }, |
| { |
| "epoch": 1.06, |
| "grad_norm": 1.2280749082565308, |
| "learning_rate": 4.075765197929872e-06, |
| "loss": 0.2108, |
| "step": 5984 |
| }, |
| { |
| "epoch": 1.07, |
| "grad_norm": 1.1926356554031372, |
| "learning_rate": 4.0628805692170105e-06, |
| "loss": 0.2047, |
| "step": 6016 |
| }, |
| { |
| "epoch": 1.08, |
| "grad_norm": 1.0048396587371826, |
| "learning_rate": 4.049927430890693e-06, |
| "loss": 0.2077, |
| "step": 6048 |
| }, |
| { |
| "epoch": 1.08, |
| "grad_norm": 1.026442050933838, |
| "learning_rate": 4.0369063507578995e-06, |
| "loss": 0.2051, |
| "step": 6080 |
| }, |
| { |
| "epoch": 1.09, |
| "grad_norm": 1.1310842037200928, |
| "learning_rate": 4.023817899603875e-06, |
| "loss": 0.2055, |
| "step": 6112 |
| }, |
| { |
| "epoch": 1.09, |
| "grad_norm": 1.1275712251663208, |
| "learning_rate": 4.010662651167106e-06, |
| "loss": 0.1965, |
| "step": 6144 |
| }, |
| { |
| "epoch": 1.1, |
| "grad_norm": 1.1789113283157349, |
| "learning_rate": 3.997441182114164e-06, |
| "loss": 0.2118, |
| "step": 6176 |
| }, |
| { |
| "epoch": 1.1, |
| "grad_norm": 1.3836599588394165, |
| "learning_rate": 3.984154072014438e-06, |
| "loss": 0.2056, |
| "step": 6208 |
| }, |
| { |
| "epoch": 1.11, |
| "grad_norm": 1.1013050079345703, |
| "learning_rate": 3.970801903314722e-06, |
| "loss": 0.2109, |
| "step": 6240 |
| }, |
| { |
| "epoch": 1.12, |
| "grad_norm": 1.1018249988555908, |
| "learning_rate": 3.957385261313685e-06, |
| "loss": 0.202, |
| "step": 6272 |
| }, |
| { |
| "epoch": 1.12, |
| "grad_norm": 1.3906185626983643, |
| "learning_rate": 3.943904734136213e-06, |
| "loss": 0.2065, |
| "step": 6304 |
| }, |
| { |
| "epoch": 1.13, |
| "grad_norm": 1.2197610139846802, |
| "learning_rate": 3.930360912707632e-06, |
| "loss": 0.2096, |
| "step": 6336 |
| }, |
| { |
| "epoch": 1.13, |
| "grad_norm": 1.0342845916748047, |
| "learning_rate": 3.916754390727795e-06, |
| "loss": 0.2024, |
| "step": 6368 |
| }, |
| { |
| "epoch": 1.14, |
| "grad_norm": 1.236260175704956, |
| "learning_rate": 3.90308576464507e-06, |
| "loss": 0.216, |
| "step": 6400 |
| }, |
| { |
| "epoch": 1.14, |
| "grad_norm": 1.3906086683273315, |
| "learning_rate": 3.889355633630186e-06, |
| "loss": 0.2153, |
| "step": 6432 |
| }, |
| { |
| "epoch": 1.15, |
| "grad_norm": 1.2441129684448242, |
| "learning_rate": 3.875564599549968e-06, |
| "loss": 0.2092, |
| "step": 6464 |
| }, |
| { |
| "epoch": 1.16, |
| "grad_norm": 1.2320338487625122, |
| "learning_rate": 3.861713266940959e-06, |
| "loss": 0.2038, |
| "step": 6496 |
| }, |
| { |
| "epoch": 1.16, |
| "grad_norm": 1.6422646045684814, |
| "learning_rate": 3.847802242982915e-06, |
| "loss": 0.205, |
| "step": 6528 |
| }, |
| { |
| "epoch": 1.17, |
| "grad_norm": 1.1179068088531494, |
| "learning_rate": 3.83383213747219e-06, |
| "loss": 0.2162, |
| "step": 6560 |
| }, |
| { |
| "epoch": 1.17, |
| "grad_norm": 1.0986745357513428, |
| "learning_rate": 3.8198035627950084e-06, |
| "loss": 0.1956, |
| "step": 6592 |
| }, |
| { |
| "epoch": 1.18, |
| "grad_norm": 1.340859055519104, |
| "learning_rate": 3.8057171339006138e-06, |
| "loss": 0.2093, |
| "step": 6624 |
| }, |
| { |
| "epoch": 1.18, |
| "grad_norm": 1.7803446054458618, |
| "learning_rate": 3.791573468274323e-06, |
| "loss": 0.2133, |
| "step": 6656 |
| }, |
| { |
| "epoch": 1.19, |
| "grad_norm": 1.022388219833374, |
| "learning_rate": 3.777373185910448e-06, |
| "loss": 0.2182, |
| "step": 6688 |
| }, |
| { |
| "epoch": 1.19, |
| "grad_norm": 1.0795223712921143, |
| "learning_rate": 3.7631169092851226e-06, |
| "loss": 0.2051, |
| "step": 6720 |
| }, |
| { |
| "epoch": 1.2, |
| "grad_norm": 1.0785856246948242, |
| "learning_rate": 3.7488052633290174e-06, |
| "loss": 0.2047, |
| "step": 6752 |
| }, |
| { |
| "epoch": 1.21, |
| "grad_norm": 1.0391508340835571, |
| "learning_rate": 3.7344388753999434e-06, |
| "loss": 0.2081, |
| "step": 6784 |
| }, |
| { |
| "epoch": 1.21, |
| "grad_norm": 1.3015925884246826, |
| "learning_rate": 3.720018375255352e-06, |
| "loss": 0.2013, |
| "step": 6816 |
| }, |
| { |
| "epoch": 1.22, |
| "grad_norm": 1.2382066249847412, |
| "learning_rate": 3.7055443950247276e-06, |
| "loss": 0.2037, |
| "step": 6848 |
| }, |
| { |
| "epoch": 1.22, |
| "grad_norm": 1.1386123895645142, |
| "learning_rate": 3.691017569181882e-06, |
| "loss": 0.2046, |
| "step": 6880 |
| }, |
| { |
| "epoch": 1.23, |
| "grad_norm": 0.9857081770896912, |
| "learning_rate": 3.6764385345171393e-06, |
| "loss": 0.207, |
| "step": 6912 |
| }, |
| { |
| "epoch": 1.23, |
| "grad_norm": 1.1276394128799438, |
| "learning_rate": 3.661807930109422e-06, |
| "loss": 0.2134, |
| "step": 6944 |
| }, |
| { |
| "epoch": 1.24, |
| "grad_norm": 1.1821982860565186, |
| "learning_rate": 3.647126397298234e-06, |
| "loss": 0.2162, |
| "step": 6976 |
| }, |
| { |
| "epoch": 1.25, |
| "grad_norm": 1.218800663948059, |
| "learning_rate": 3.632394579655555e-06, |
| "loss": 0.2023, |
| "step": 7008 |
| }, |
| { |
| "epoch": 1.25, |
| "grad_norm": 1.083310842514038, |
| "learning_rate": 3.6176131229576193e-06, |
| "loss": 0.1999, |
| "step": 7040 |
| }, |
| { |
| "epoch": 1.26, |
| "grad_norm": 1.0640002489089966, |
| "learning_rate": 3.602782675156617e-06, |
| "loss": 0.2125, |
| "step": 7072 |
| }, |
| { |
| "epoch": 1.26, |
| "grad_norm": 1.1672149896621704, |
| "learning_rate": 3.5879038863522843e-06, |
| "loss": 0.2157, |
| "step": 7104 |
| }, |
| { |
| "epoch": 1.27, |
| "grad_norm": 1.1732845306396484, |
| "learning_rate": 3.572977408763407e-06, |
| "loss": 0.2082, |
| "step": 7136 |
| }, |
| { |
| "epoch": 1.27, |
| "grad_norm": 1.1544941663742065, |
| "learning_rate": 3.5580038966992344e-06, |
| "loss": 0.2067, |
| "step": 7168 |
| }, |
| { |
| "epoch": 1.28, |
| "grad_norm": 1.2914546728134155, |
| "learning_rate": 3.5429840065307924e-06, |
| "loss": 0.2019, |
| "step": 7200 |
| }, |
| { |
| "epoch": 1.29, |
| "grad_norm": 1.0473650693893433, |
| "learning_rate": 3.527918396662115e-06, |
| "loss": 0.1952, |
| "step": 7232 |
| }, |
| { |
| "epoch": 1.29, |
| "grad_norm": 1.2211614847183228, |
| "learning_rate": 3.512807727501379e-06, |
| "loss": 0.2093, |
| "step": 7264 |
| }, |
| { |
| "epoch": 1.3, |
| "grad_norm": 1.1035760641098022, |
| "learning_rate": 3.4976526614319573e-06, |
| "loss": 0.2007, |
| "step": 7296 |
| }, |
| { |
| "epoch": 1.3, |
| "grad_norm": 1.2120308876037598, |
| "learning_rate": 3.4824538627833825e-06, |
| "loss": 0.2205, |
| "step": 7328 |
| }, |
| { |
| "epoch": 1.31, |
| "grad_norm": 0.8647122979164124, |
| "learning_rate": 3.4672119978022277e-06, |
| "loss": 0.2063, |
| "step": 7360 |
| }, |
| { |
| "epoch": 1.31, |
| "grad_norm": 1.1142189502716064, |
| "learning_rate": 3.4519277346228953e-06, |
| "loss": 0.2075, |
| "step": 7392 |
| }, |
| { |
| "epoch": 1.32, |
| "grad_norm": 1.3183207511901855, |
| "learning_rate": 3.436601743238335e-06, |
| "loss": 0.2094, |
| "step": 7424 |
| }, |
| { |
| "epoch": 1.33, |
| "grad_norm": 1.0320820808410645, |
| "learning_rate": 3.421234695470673e-06, |
| "loss": 0.2029, |
| "step": 7456 |
| }, |
| { |
| "epoch": 1.33, |
| "grad_norm": 1.3065481185913086, |
| "learning_rate": 3.4058272649417607e-06, |
| "loss": 0.2127, |
| "step": 7488 |
| }, |
| { |
| "epoch": 1.34, |
| "grad_norm": 1.209372639656067, |
| "learning_rate": 3.3903801270436465e-06, |
| "loss": 0.2015, |
| "step": 7520 |
| }, |
| { |
| "epoch": 1.34, |
| "grad_norm": 1.0177247524261475, |
| "learning_rate": 3.374893958908971e-06, |
| "loss": 0.2075, |
| "step": 7552 |
| }, |
| { |
| "epoch": 1.35, |
| "grad_norm": 1.4457709789276123, |
| "learning_rate": 3.3593694393812827e-06, |
| "loss": 0.2098, |
| "step": 7584 |
| }, |
| { |
| "epoch": 1.35, |
| "grad_norm": 1.1711078882217407, |
| "learning_rate": 3.3438072489852837e-06, |
| "loss": 0.2088, |
| "step": 7616 |
| }, |
| { |
| "epoch": 1.36, |
| "grad_norm": 1.1928409337997437, |
| "learning_rate": 3.3282080698969953e-06, |
| "loss": 0.1918, |
| "step": 7648 |
| }, |
| { |
| "epoch": 1.37, |
| "grad_norm": 0.9215808510780334, |
| "learning_rate": 3.3125725859138548e-06, |
| "loss": 0.2106, |
| "step": 7680 |
| }, |
| { |
| "epoch": 1.37, |
| "grad_norm": 1.3021633625030518, |
| "learning_rate": 3.2969014824247436e-06, |
| "loss": 0.2018, |
| "step": 7712 |
| }, |
| { |
| "epoch": 1.38, |
| "grad_norm": 1.1597398519515991, |
| "learning_rate": 3.28119544637994e-06, |
| "loss": 0.2035, |
| "step": 7744 |
| }, |
| { |
| "epoch": 1.38, |
| "grad_norm": 1.2015706300735474, |
| "learning_rate": 3.265455166261009e-06, |
| "loss": 0.2027, |
| "step": 7776 |
| }, |
| { |
| "epoch": 1.39, |
| "grad_norm": 1.1449264287948608, |
| "learning_rate": 3.2496813320506183e-06, |
| "loss": 0.2165, |
| "step": 7808 |
| }, |
| { |
| "epoch": 1.39, |
| "grad_norm": 1.1332265138626099, |
| "learning_rate": 3.2338746352022965e-06, |
| "loss": 0.2006, |
| "step": 7840 |
| }, |
| { |
| "epoch": 1.4, |
| "grad_norm": 1.430891990661621, |
| "learning_rate": 3.2180357686101226e-06, |
| "loss": 0.2102, |
| "step": 7872 |
| }, |
| { |
| "epoch": 1.41, |
| "grad_norm": 1.4063985347747803, |
| "learning_rate": 3.2021654265783505e-06, |
| "loss": 0.196, |
| "step": 7904 |
| }, |
| { |
| "epoch": 1.41, |
| "grad_norm": 1.3970558643341064, |
| "learning_rate": 3.1862643047909746e-06, |
| "loss": 0.2161, |
| "step": 7936 |
| }, |
| { |
| "epoch": 1.42, |
| "grad_norm": 1.3233983516693115, |
| "learning_rate": 3.170333100281236e-06, |
| "loss": 0.1921, |
| "step": 7968 |
| }, |
| { |
| "epoch": 1.42, |
| "grad_norm": 1.2325806617736816, |
| "learning_rate": 3.154372511401064e-06, |
| "loss": 0.2042, |
| "step": 8000 |
| }, |
| { |
| "epoch": 1.43, |
| "grad_norm": 1.0558186769485474, |
| "learning_rate": 3.1383832377904676e-06, |
| "loss": 0.2056, |
| "step": 8032 |
| }, |
| { |
| "epoch": 1.43, |
| "grad_norm": 1.189503788948059, |
| "learning_rate": 3.1223659803468653e-06, |
| "loss": 0.203, |
| "step": 8064 |
| }, |
| { |
| "epoch": 1.44, |
| "grad_norm": 1.1646627187728882, |
| "learning_rate": 3.1063214411943576e-06, |
| "loss": 0.2088, |
| "step": 8096 |
| }, |
| { |
| "epoch": 1.45, |
| "grad_norm": 1.1149977445602417, |
| "learning_rate": 3.0902503236529533e-06, |
| "loss": 0.2081, |
| "step": 8128 |
| }, |
| { |
| "epoch": 1.45, |
| "grad_norm": 1.5566740036010742, |
| "learning_rate": 3.074153332207738e-06, |
| "loss": 0.2141, |
| "step": 8160 |
| }, |
| { |
| "epoch": 1.46, |
| "grad_norm": 1.304262638092041, |
| "learning_rate": 3.058031172477992e-06, |
| "loss": 0.2006, |
| "step": 8192 |
| }, |
| { |
| "epoch": 1.46, |
| "grad_norm": 1.1247010231018066, |
| "learning_rate": 3.041884551186258e-06, |
| "loss": 0.2109, |
| "step": 8224 |
| }, |
| { |
| "epoch": 1.47, |
| "grad_norm": 1.4587311744689941, |
| "learning_rate": 3.0257141761273627e-06, |
| "loss": 0.2016, |
| "step": 8256 |
| }, |
| { |
| "epoch": 1.47, |
| "grad_norm": 1.1545838117599487, |
| "learning_rate": 3.0095207561373935e-06, |
| "loss": 0.183, |
| "step": 8288 |
| }, |
| { |
| "epoch": 1.48, |
| "grad_norm": 1.3221790790557861, |
| "learning_rate": 2.9933050010626208e-06, |
| "loss": 0.1985, |
| "step": 8320 |
| }, |
| { |
| "epoch": 1.49, |
| "grad_norm": 1.1717700958251953, |
| "learning_rate": 2.9770676217283844e-06, |
| "loss": 0.2113, |
| "step": 8352 |
| }, |
| { |
| "epoch": 1.49, |
| "grad_norm": 1.0709022283554077, |
| "learning_rate": 2.960809329907934e-06, |
| "loss": 0.2012, |
| "step": 8384 |
| }, |
| { |
| "epoch": 1.5, |
| "grad_norm": 1.4594495296478271, |
| "learning_rate": 2.944530838291229e-06, |
| "loss": 0.2039, |
| "step": 8416 |
| }, |
| { |
| "epoch": 1.5, |
| "grad_norm": 1.3135212659835815, |
| "learning_rate": 2.928232860453694e-06, |
| "loss": 0.206, |
| "step": 8448 |
| }, |
| { |
| "epoch": 1.51, |
| "grad_norm": 1.316394329071045, |
| "learning_rate": 2.911916110824945e-06, |
| "loss": 0.212, |
| "step": 8480 |
| }, |
| { |
| "epoch": 1.51, |
| "grad_norm": 1.3602452278137207, |
| "learning_rate": 2.895581304657465e-06, |
| "loss": 0.2068, |
| "step": 8512 |
| }, |
| { |
| "epoch": 1.52, |
| "grad_norm": 1.2073897123336792, |
| "learning_rate": 2.8792291579952553e-06, |
| "loss": 0.2098, |
| "step": 8544 |
| }, |
| { |
| "epoch": 1.52, |
| "grad_norm": 1.2983072996139526, |
| "learning_rate": 2.8628603876424467e-06, |
| "loss": 0.2086, |
| "step": 8576 |
| }, |
| { |
| "epoch": 1.53, |
| "grad_norm": 1.0781196355819702, |
| "learning_rate": 2.846475711131877e-06, |
| "loss": 0.201, |
| "step": 8608 |
| }, |
| { |
| "epoch": 1.54, |
| "grad_norm": 1.1917251348495483, |
| "learning_rate": 2.8300758466936366e-06, |
| "loss": 0.1982, |
| "step": 8640 |
| }, |
| { |
| "epoch": 1.54, |
| "grad_norm": 1.2894983291625977, |
| "learning_rate": 2.813661513223588e-06, |
| "loss": 0.1943, |
| "step": 8672 |
| }, |
| { |
| "epoch": 1.55, |
| "grad_norm": 1.2249202728271484, |
| "learning_rate": 2.7972334302518504e-06, |
| "loss": 0.2145, |
| "step": 8704 |
| }, |
| { |
| "epoch": 1.55, |
| "grad_norm": 1.1947064399719238, |
| "learning_rate": 2.7807923179112576e-06, |
| "loss": 0.2003, |
| "step": 8736 |
| }, |
| { |
| "epoch": 1.56, |
| "grad_norm": 1.0660251379013062, |
| "learning_rate": 2.764338896905792e-06, |
| "loss": 0.1984, |
| "step": 8768 |
| }, |
| { |
| "epoch": 1.56, |
| "grad_norm": 1.0243247747421265, |
| "learning_rate": 2.7478738884789934e-06, |
| "loss": 0.2036, |
| "step": 8800 |
| }, |
| { |
| "epoch": 1.57, |
| "grad_norm": 1.286199927330017, |
| "learning_rate": 2.731398014382341e-06, |
| "loss": 0.2027, |
| "step": 8832 |
| }, |
| { |
| "epoch": 1.58, |
| "grad_norm": 1.1617448329925537, |
| "learning_rate": 2.714911996843617e-06, |
| "loss": 0.2162, |
| "step": 8864 |
| }, |
| { |
| "epoch": 1.58, |
| "grad_norm": 1.1921496391296387, |
| "learning_rate": 2.6984165585352435e-06, |
| "loss": 0.2124, |
| "step": 8896 |
| }, |
| { |
| "epoch": 1.59, |
| "grad_norm": 1.2066140174865723, |
| "learning_rate": 2.6819124225426085e-06, |
| "loss": 0.199, |
| "step": 8928 |
| }, |
| { |
| "epoch": 1.59, |
| "grad_norm": 1.0459320545196533, |
| "learning_rate": 2.665400312332368e-06, |
| "loss": 0.2072, |
| "step": 8960 |
| }, |
| { |
| "epoch": 1.6, |
| "grad_norm": 1.2983636856079102, |
| "learning_rate": 2.648880951720729e-06, |
| "loss": 0.2024, |
| "step": 8992 |
| }, |
| { |
| "epoch": 1.6, |
| "grad_norm": 1.0876768827438354, |
| "learning_rate": 2.6323550648417267e-06, |
| "loss": 0.2143, |
| "step": 9024 |
| }, |
| { |
| "epoch": 1.61, |
| "grad_norm": 1.0047022104263306, |
| "learning_rate": 2.6158233761154744e-06, |
| "loss": 0.2043, |
| "step": 9056 |
| }, |
| { |
| "epoch": 1.62, |
| "grad_norm": 0.9878237247467041, |
| "learning_rate": 2.5992866102164146e-06, |
| "loss": 0.1991, |
| "step": 9088 |
| }, |
| { |
| "epoch": 1.62, |
| "grad_norm": 0.9894827604293823, |
| "learning_rate": 2.58274549204155e-06, |
| "loss": 0.1979, |
| "step": 9120 |
| }, |
| { |
| "epoch": 1.63, |
| "grad_norm": 0.9374232292175293, |
| "learning_rate": 2.5662007466786674e-06, |
| "loss": 0.2055, |
| "step": 9152 |
| }, |
| { |
| "epoch": 1.63, |
| "grad_norm": 1.259948492050171, |
| "learning_rate": 2.5496530993745518e-06, |
| "loss": 0.2057, |
| "step": 9184 |
| }, |
| { |
| "epoch": 1.64, |
| "grad_norm": 0.958737850189209, |
| "learning_rate": 2.533103275503197e-06, |
| "loss": 0.2029, |
| "step": 9216 |
| }, |
| { |
| "epoch": 1.64, |
| "grad_norm": 1.079717755317688, |
| "learning_rate": 2.5165520005340082e-06, |
| "loss": 0.2049, |
| "step": 9248 |
| }, |
| { |
| "epoch": 1.65, |
| "grad_norm": 1.1001982688903809, |
| "learning_rate": 2.5e-06, |
| "loss": 0.211, |
| "step": 9280 |
| }, |
| { |
| "epoch": 1.66, |
| "grad_norm": 1.2408779859542847, |
| "learning_rate": 2.4834479994659926e-06, |
| "loss": 0.2028, |
| "step": 9312 |
| }, |
| { |
| "epoch": 1.66, |
| "grad_norm": 1.0395313501358032, |
| "learning_rate": 2.4668967244968035e-06, |
| "loss": 0.1988, |
| "step": 9344 |
| }, |
| { |
| "epoch": 1.67, |
| "grad_norm": 1.0080056190490723, |
| "learning_rate": 2.4503469006254487e-06, |
| "loss": 0.1988, |
| "step": 9376 |
| }, |
| { |
| "epoch": 1.67, |
| "grad_norm": 1.4669593572616577, |
| "learning_rate": 2.4337992533213334e-06, |
| "loss": 0.1942, |
| "step": 9408 |
| }, |
| { |
| "epoch": 1.68, |
| "grad_norm": 1.3393524885177612, |
| "learning_rate": 2.4172545079584508e-06, |
| "loss": 0.1964, |
| "step": 9440 |
| }, |
| { |
| "epoch": 1.68, |
| "grad_norm": 0.9786149263381958, |
| "learning_rate": 2.4007133897835863e-06, |
| "loss": 0.1984, |
| "step": 9472 |
| }, |
| { |
| "epoch": 1.69, |
| "grad_norm": 1.1999776363372803, |
| "learning_rate": 2.3841766238845264e-06, |
| "loss": 0.2102, |
| "step": 9504 |
| }, |
| { |
| "epoch": 1.7, |
| "grad_norm": 1.3276174068450928, |
| "learning_rate": 2.367644935158274e-06, |
| "loss": 0.1941, |
| "step": 9536 |
| }, |
| { |
| "epoch": 1.7, |
| "grad_norm": 1.0124472379684448, |
| "learning_rate": 2.3511190482792713e-06, |
| "loss": 0.199, |
| "step": 9568 |
| }, |
| { |
| "epoch": 1.71, |
| "grad_norm": 1.258489966392517, |
| "learning_rate": 2.3345996876676334e-06, |
| "loss": 0.2008, |
| "step": 9600 |
| }, |
| { |
| "epoch": 1.71, |
| "grad_norm": 1.1993016004562378, |
| "learning_rate": 2.318087577457392e-06, |
| "loss": 0.2154, |
| "step": 9632 |
| }, |
| { |
| "epoch": 1.72, |
| "grad_norm": 1.250908613204956, |
| "learning_rate": 2.3015834414647573e-06, |
| "loss": 0.2068, |
| "step": 9664 |
| }, |
| { |
| "epoch": 1.72, |
| "grad_norm": 1.211915373802185, |
| "learning_rate": 2.2850880031563845e-06, |
| "loss": 0.1946, |
| "step": 9696 |
| }, |
| { |
| "epoch": 1.73, |
| "grad_norm": 1.0278340578079224, |
| "learning_rate": 2.26860198561766e-06, |
| "loss": 0.1948, |
| "step": 9728 |
| }, |
| { |
| "epoch": 1.74, |
| "grad_norm": 1.2455780506134033, |
| "learning_rate": 2.2521261115210074e-06, |
| "loss": 0.197, |
| "step": 9760 |
| }, |
| { |
| "epoch": 1.74, |
| "grad_norm": 1.2321908473968506, |
| "learning_rate": 2.2356611030942084e-06, |
| "loss": 0.2075, |
| "step": 9792 |
| }, |
| { |
| "epoch": 1.75, |
| "grad_norm": 1.0618436336517334, |
| "learning_rate": 2.219207682088743e-06, |
| "loss": 0.1931, |
| "step": 9824 |
| }, |
| { |
| "epoch": 1.75, |
| "grad_norm": 1.36842942237854, |
| "learning_rate": 2.20276656974815e-06, |
| "loss": 0.1999, |
| "step": 9856 |
| }, |
| { |
| "epoch": 1.76, |
| "grad_norm": 1.033603310585022, |
| "learning_rate": 2.186338486776412e-06, |
| "loss": 0.2028, |
| "step": 9888 |
| }, |
| { |
| "epoch": 1.76, |
| "grad_norm": 1.303781270980835, |
| "learning_rate": 2.169924153306363e-06, |
| "loss": 0.214, |
| "step": 9920 |
| }, |
| { |
| "epoch": 1.77, |
| "grad_norm": 1.2051355838775635, |
| "learning_rate": 2.153524288868124e-06, |
| "loss": 0.2091, |
| "step": 9952 |
| }, |
| { |
| "epoch": 1.78, |
| "grad_norm": 0.9946267604827881, |
| "learning_rate": 2.137139612357554e-06, |
| "loss": 0.1942, |
| "step": 9984 |
| }, |
| { |
| "epoch": 1.78, |
| "grad_norm": 1.283492088317871, |
| "learning_rate": 2.120770842004746e-06, |
| "loss": 0.1971, |
| "step": 10016 |
| }, |
| { |
| "epoch": 1.79, |
| "grad_norm": 0.9329233169555664, |
| "learning_rate": 2.1044186953425358e-06, |
| "loss": 0.203, |
| "step": 10048 |
| }, |
| { |
| "epoch": 1.79, |
| "grad_norm": 1.082767367362976, |
| "learning_rate": 2.0880838891750553e-06, |
| "loss": 0.2012, |
| "step": 10080 |
| }, |
| { |
| "epoch": 1.8, |
| "grad_norm": 1.1007740497589111, |
| "learning_rate": 2.0717671395463063e-06, |
| "loss": 0.2028, |
| "step": 10112 |
| }, |
| { |
| "epoch": 1.8, |
| "grad_norm": 1.2502014636993408, |
| "learning_rate": 2.0554691617087725e-06, |
| "loss": 0.2121, |
| "step": 10144 |
| }, |
| { |
| "epoch": 1.81, |
| "grad_norm": 1.073034405708313, |
| "learning_rate": 2.0391906700920667e-06, |
| "loss": 0.1994, |
| "step": 10176 |
| }, |
| { |
| "epoch": 1.82, |
| "grad_norm": 0.9409189820289612, |
| "learning_rate": 2.0229323782716156e-06, |
| "loss": 0.2054, |
| "step": 10208 |
| }, |
| { |
| "epoch": 1.82, |
| "grad_norm": 1.197383999824524, |
| "learning_rate": 2.0066949989373797e-06, |
| "loss": 0.1946, |
| "step": 10240 |
| }, |
| { |
| "epoch": 1.83, |
| "grad_norm": 1.261612892150879, |
| "learning_rate": 1.9904792438626074e-06, |
| "loss": 0.2038, |
| "step": 10272 |
| }, |
| { |
| "epoch": 1.83, |
| "grad_norm": 1.4839472770690918, |
| "learning_rate": 1.9742858238726377e-06, |
| "loss": 0.2067, |
| "step": 10304 |
| }, |
| { |
| "epoch": 1.84, |
| "grad_norm": 1.0103521347045898, |
| "learning_rate": 1.9581154488137425e-06, |
| "loss": 0.2104, |
| "step": 10336 |
| }, |
| { |
| "epoch": 1.84, |
| "grad_norm": 1.2776283025741577, |
| "learning_rate": 1.9419688275220085e-06, |
| "loss": 0.196, |
| "step": 10368 |
| }, |
| { |
| "epoch": 1.85, |
| "grad_norm": 1.0784910917282104, |
| "learning_rate": 1.9258466677922624e-06, |
| "loss": 0.1975, |
| "step": 10400 |
| }, |
| { |
| "epoch": 1.85, |
| "grad_norm": 0.9643808007240295, |
| "learning_rate": 1.909749676347047e-06, |
| "loss": 0.2111, |
| "step": 10432 |
| }, |
| { |
| "epoch": 1.86, |
| "grad_norm": 1.3432437181472778, |
| "learning_rate": 1.8936785588056428e-06, |
| "loss": 0.1923, |
| "step": 10464 |
| }, |
| { |
| "epoch": 1.87, |
| "grad_norm": 1.133470892906189, |
| "learning_rate": 1.8776340196531351e-06, |
| "loss": 0.2016, |
| "step": 10496 |
| }, |
| { |
| "epoch": 1.87, |
| "grad_norm": 1.0897003412246704, |
| "learning_rate": 1.8616167622095328e-06, |
| "loss": 0.193, |
| "step": 10528 |
| }, |
| { |
| "epoch": 1.88, |
| "grad_norm": 0.9629374146461487, |
| "learning_rate": 1.8456274885989374e-06, |
| "loss": 0.1937, |
| "step": 10560 |
| }, |
| { |
| "epoch": 1.88, |
| "grad_norm": 1.406630039215088, |
| "learning_rate": 1.829666899718765e-06, |
| "loss": 0.1997, |
| "step": 10592 |
| }, |
| { |
| "epoch": 1.89, |
| "grad_norm": 1.165366291999817, |
| "learning_rate": 1.8137356952090258e-06, |
| "loss": 0.1976, |
| "step": 10624 |
| }, |
| { |
| "epoch": 1.89, |
| "grad_norm": 1.2674609422683716, |
| "learning_rate": 1.7978345734216502e-06, |
| "loss": 0.1908, |
| "step": 10656 |
| }, |
| { |
| "epoch": 1.9, |
| "grad_norm": 1.2211626768112183, |
| "learning_rate": 1.7819642313898783e-06, |
| "loss": 0.1984, |
| "step": 10688 |
| }, |
| { |
| "epoch": 1.91, |
| "grad_norm": 1.1066653728485107, |
| "learning_rate": 1.766125364797704e-06, |
| "loss": 0.2035, |
| "step": 10720 |
| }, |
| { |
| "epoch": 1.91, |
| "grad_norm": 1.539157748222351, |
| "learning_rate": 1.7503186679493821e-06, |
| "loss": 0.201, |
| "step": 10752 |
| }, |
| { |
| "epoch": 1.92, |
| "grad_norm": 1.1897879838943481, |
| "learning_rate": 1.7345448337389918e-06, |
| "loss": 0.194, |
| "step": 10784 |
| }, |
| { |
| "epoch": 1.92, |
| "grad_norm": 1.162571668624878, |
| "learning_rate": 1.7188045536200604e-06, |
| "loss": 0.1899, |
| "step": 10816 |
| }, |
| { |
| "epoch": 1.93, |
| "grad_norm": 1.0616639852523804, |
| "learning_rate": 1.7030985175752574e-06, |
| "loss": 0.1978, |
| "step": 10848 |
| }, |
| { |
| "epoch": 1.93, |
| "grad_norm": 1.0391641855239868, |
| "learning_rate": 1.687427414086146e-06, |
| "loss": 0.2017, |
| "step": 10880 |
| }, |
| { |
| "epoch": 1.94, |
| "grad_norm": 1.0870113372802734, |
| "learning_rate": 1.6717919301030055e-06, |
| "loss": 0.2012, |
| "step": 10912 |
| }, |
| { |
| "epoch": 1.95, |
| "grad_norm": 1.143446922302246, |
| "learning_rate": 1.6561927510147172e-06, |
| "loss": 0.1911, |
| "step": 10944 |
| }, |
| { |
| "epoch": 1.95, |
| "grad_norm": 1.015080213546753, |
| "learning_rate": 1.6406305606187183e-06, |
| "loss": 0.198, |
| "step": 10976 |
| }, |
| { |
| "epoch": 1.96, |
| "grad_norm": 0.9722278714179993, |
| "learning_rate": 1.6251060410910301e-06, |
| "loss": 0.1862, |
| "step": 11008 |
| }, |
| { |
| "epoch": 1.96, |
| "grad_norm": 0.9311158061027527, |
| "learning_rate": 1.6096198729563539e-06, |
| "loss": 0.198, |
| "step": 11040 |
| }, |
| { |
| "epoch": 1.97, |
| "grad_norm": 1.3127020597457886, |
| "learning_rate": 1.5941727350582399e-06, |
| "loss": 0.2, |
| "step": 11072 |
| }, |
| { |
| "epoch": 1.97, |
| "grad_norm": 0.9450055956840515, |
| "learning_rate": 1.5787653045293278e-06, |
| "loss": 0.2015, |
| "step": 11104 |
| }, |
| { |
| "epoch": 1.98, |
| "grad_norm": 1.1553057432174683, |
| "learning_rate": 1.5633982567616657e-06, |
| "loss": 0.2068, |
| "step": 11136 |
| }, |
| { |
| "epoch": 1.99, |
| "grad_norm": 0.8617095351219177, |
| "learning_rate": 1.548072265377105e-06, |
| "loss": 0.2014, |
| "step": 11168 |
| }, |
| { |
| "epoch": 1.99, |
| "grad_norm": 0.9857013821601868, |
| "learning_rate": 1.532788002197773e-06, |
| "loss": 0.2031, |
| "step": 11200 |
| }, |
| { |
| "epoch": 2.0, |
| "grad_norm": 1.0158861875534058, |
| "learning_rate": 1.5175461372166177e-06, |
| "loss": 0.1941, |
| "step": 11232 |
| }, |
| { |
| "epoch": 2.0, |
| "grad_norm": 1.1121022701263428, |
| "learning_rate": 1.5023473385680438e-06, |
| "loss": 0.1708, |
| "step": 11264 |
| }, |
| { |
| "epoch": 2.01, |
| "grad_norm": 1.1300593614578247, |
| "learning_rate": 1.4871922724986215e-06, |
| "loss": 0.1504, |
| "step": 11296 |
| }, |
| { |
| "epoch": 2.01, |
| "grad_norm": 1.232246994972229, |
| "learning_rate": 1.4720816033378856e-06, |
| "loss": 0.151, |
| "step": 11328 |
| }, |
| { |
| "epoch": 2.02, |
| "grad_norm": 1.2618398666381836, |
| "learning_rate": 1.4570159934692085e-06, |
| "loss": 0.1421, |
| "step": 11360 |
| }, |
| { |
| "epoch": 2.03, |
| "grad_norm": 1.275038242340088, |
| "learning_rate": 1.4419961033007669e-06, |
| "loss": 0.1457, |
| "step": 11392 |
| }, |
| { |
| "epoch": 2.03, |
| "grad_norm": 1.079405426979065, |
| "learning_rate": 1.427022591236594e-06, |
| "loss": 0.144, |
| "step": 11424 |
| }, |
| { |
| "epoch": 2.04, |
| "grad_norm": 1.1695400476455688, |
| "learning_rate": 1.4120961136477168e-06, |
| "loss": 0.1531, |
| "step": 11456 |
| }, |
| { |
| "epoch": 2.04, |
| "grad_norm": 1.008547306060791, |
| "learning_rate": 1.3972173248433832e-06, |
| "loss": 0.1453, |
| "step": 11488 |
| }, |
| { |
| "epoch": 2.05, |
| "grad_norm": 1.0746265649795532, |
| "learning_rate": 1.3823868770423815e-06, |
| "loss": 0.1446, |
| "step": 11520 |
| }, |
| { |
| "epoch": 2.05, |
| "grad_norm": 1.1596614122390747, |
| "learning_rate": 1.3676054203444462e-06, |
| "loss": 0.1477, |
| "step": 11552 |
| }, |
| { |
| "epoch": 2.06, |
| "grad_norm": 1.1029706001281738, |
| "learning_rate": 1.3528736027017663e-06, |
| "loss": 0.1477, |
| "step": 11584 |
| }, |
| { |
| "epoch": 2.07, |
| "grad_norm": 0.9398396015167236, |
| "learning_rate": 1.3381920698905788e-06, |
| "loss": 0.1477, |
| "step": 11616 |
| }, |
| { |
| "epoch": 2.07, |
| "grad_norm": 1.0209776163101196, |
| "learning_rate": 1.3235614654828604e-06, |
| "loss": 0.1448, |
| "step": 11648 |
| }, |
| { |
| "epoch": 2.08, |
| "grad_norm": 0.9415841102600098, |
| "learning_rate": 1.3089824308181187e-06, |
| "loss": 0.1481, |
| "step": 11680 |
| }, |
| { |
| "epoch": 2.08, |
| "grad_norm": 1.088069200515747, |
| "learning_rate": 1.2944556049752726e-06, |
| "loss": 0.149, |
| "step": 11712 |
| }, |
| { |
| "epoch": 2.09, |
| "grad_norm": 1.3269786834716797, |
| "learning_rate": 1.2799816247446494e-06, |
| "loss": 0.1497, |
| "step": 11744 |
| }, |
| { |
| "epoch": 2.09, |
| "grad_norm": 0.9119545817375183, |
| "learning_rate": 1.265561124600057e-06, |
| "loss": 0.1467, |
| "step": 11776 |
| }, |
| { |
| "epoch": 2.1, |
| "grad_norm": 1.0677683353424072, |
| "learning_rate": 1.251194736670983e-06, |
| "loss": 0.1448, |
| "step": 11808 |
| }, |
| { |
| "epoch": 2.11, |
| "grad_norm": 1.0756884813308716, |
| "learning_rate": 1.2368830907148778e-06, |
| "loss": 0.1363, |
| "step": 11840 |
| }, |
| { |
| "epoch": 2.11, |
| "grad_norm": 1.0578961372375488, |
| "learning_rate": 1.2226268140895528e-06, |
| "loss": 0.1527, |
| "step": 11872 |
| }, |
| { |
| "epoch": 2.12, |
| "grad_norm": 1.0562700033187866, |
| "learning_rate": 1.2084265317256772e-06, |
| "loss": 0.1449, |
| "step": 11904 |
| }, |
| { |
| "epoch": 2.12, |
| "grad_norm": 1.0958082675933838, |
| "learning_rate": 1.1942828660993869e-06, |
| "loss": 0.1474, |
| "step": 11936 |
| }, |
| { |
| "epoch": 2.13, |
| "grad_norm": 0.9672511219978333, |
| "learning_rate": 1.1801964372049932e-06, |
| "loss": 0.1459, |
| "step": 11968 |
| }, |
| { |
| "epoch": 2.13, |
| "grad_norm": 1.1125974655151367, |
| "learning_rate": 1.1661678625278106e-06, |
| "loss": 0.1483, |
| "step": 12000 |
| }, |
| { |
| "epoch": 2.14, |
| "grad_norm": 1.227283239364624, |
| "learning_rate": 1.152197757017086e-06, |
| "loss": 0.1453, |
| "step": 12032 |
| }, |
| { |
| "epoch": 2.15, |
| "grad_norm": 1.3206896781921387, |
| "learning_rate": 1.1382867330590414e-06, |
| "loss": 0.1425, |
| "step": 12064 |
| }, |
| { |
| "epoch": 2.15, |
| "grad_norm": 0.9912922978401184, |
| "learning_rate": 1.1244354004500335e-06, |
| "loss": 0.1529, |
| "step": 12096 |
| }, |
| { |
| "epoch": 2.16, |
| "grad_norm": 0.8996345400810242, |
| "learning_rate": 1.110644366369815e-06, |
| "loss": 0.1437, |
| "step": 12128 |
| }, |
| { |
| "epoch": 2.16, |
| "grad_norm": 0.9117119312286377, |
| "learning_rate": 1.0969142353549315e-06, |
| "loss": 0.1429, |
| "step": 12160 |
| }, |
| { |
| "epoch": 2.17, |
| "grad_norm": 1.1916334629058838, |
| "learning_rate": 1.0832456092722063e-06, |
| "loss": 0.1509, |
| "step": 12192 |
| }, |
| { |
| "epoch": 2.17, |
| "grad_norm": 1.1345574855804443, |
| "learning_rate": 1.0696390872923696e-06, |
| "loss": 0.1547, |
| "step": 12224 |
| }, |
| { |
| "epoch": 2.18, |
| "grad_norm": 1.3311399221420288, |
| "learning_rate": 1.0560952658637869e-06, |
| "loss": 0.1428, |
| "step": 12256 |
| }, |
| { |
| "epoch": 2.18, |
| "grad_norm": 1.0195939540863037, |
| "learning_rate": 1.042614738686315e-06, |
| "loss": 0.1447, |
| "step": 12288 |
| }, |
| { |
| "epoch": 2.19, |
| "grad_norm": 1.1453065872192383, |
| "learning_rate": 1.029198096685278e-06, |
| "loss": 0.1384, |
| "step": 12320 |
| }, |
| { |
| "epoch": 2.2, |
| "grad_norm": 1.1899516582489014, |
| "learning_rate": 1.0158459279855632e-06, |
| "loss": 0.1433, |
| "step": 12352 |
| }, |
| { |
| "epoch": 2.2, |
| "grad_norm": 1.060065507888794, |
| "learning_rate": 1.0025588178858372e-06, |
| "loss": 0.1456, |
| "step": 12384 |
| }, |
| { |
| "epoch": 2.21, |
| "grad_norm": 1.1489146947860718, |
| "learning_rate": 9.893373488328953e-07, |
| "loss": 0.1433, |
| "step": 12416 |
| }, |
| { |
| "epoch": 2.21, |
| "grad_norm": 1.3114417791366577, |
| "learning_rate": 9.761821003961246e-07, |
| "loss": 0.1467, |
| "step": 12448 |
| }, |
| { |
| "epoch": 2.22, |
| "grad_norm": 1.3255183696746826, |
| "learning_rate": 9.630936492421005e-07, |
| "loss": 0.1463, |
| "step": 12480 |
| }, |
| { |
| "epoch": 2.22, |
| "grad_norm": 1.2642742395401, |
| "learning_rate": 9.500725691093085e-07, |
| "loss": 0.1525, |
| "step": 12512 |
| }, |
| { |
| "epoch": 2.23, |
| "grad_norm": 1.2281138896942139, |
| "learning_rate": 9.371194307829895e-07, |
| "loss": 0.1383, |
| "step": 12544 |
| }, |
| { |
| "epoch": 2.24, |
| "grad_norm": 0.9627875089645386, |
| "learning_rate": 9.242348020701295e-07, |
| "loss": 0.1642, |
| "step": 12576 |
| }, |
| { |
| "epoch": 2.24, |
| "grad_norm": 1.1187249422073364, |
| "learning_rate": 9.114192477745568e-07, |
| "loss": 0.1439, |
| "step": 12608 |
| }, |
| { |
| "epoch": 2.25, |
| "grad_norm": 1.0410840511322021, |
| "learning_rate": 8.986733296721931e-07, |
| "loss": 0.142, |
| "step": 12640 |
| }, |
| { |
| "epoch": 2.25, |
| "grad_norm": 1.2345024347305298, |
| "learning_rate": 8.859976064864235e-07, |
| "loss": 0.1512, |
| "step": 12672 |
| }, |
| { |
| "epoch": 2.26, |
| "grad_norm": 1.6558443307876587, |
| "learning_rate": 8.733926338636056e-07, |
| "loss": 0.1363, |
| "step": 12704 |
| }, |
| { |
| "epoch": 2.26, |
| "grad_norm": 1.3118538856506348, |
| "learning_rate": 8.608589643487128e-07, |
| "loss": 0.1471, |
| "step": 12736 |
| }, |
| { |
| "epoch": 2.27, |
| "grad_norm": 1.1155567169189453, |
| "learning_rate": 8.483971473611133e-07, |
| "loss": 0.1396, |
| "step": 12768 |
| }, |
| { |
| "epoch": 2.28, |
| "grad_norm": 1.0880179405212402, |
| "learning_rate": 8.360077291704821e-07, |
| "loss": 0.1413, |
| "step": 12800 |
| }, |
| { |
| "epoch": 2.28, |
| "grad_norm": 0.9752321839332581, |
| "learning_rate": 8.236912528728647e-07, |
| "loss": 0.146, |
| "step": 12832 |
| }, |
| { |
| "epoch": 2.29, |
| "grad_norm": 0.9778379201889038, |
| "learning_rate": 8.114482583668576e-07, |
| "loss": 0.1403, |
| "step": 12864 |
| }, |
| { |
| "epoch": 2.29, |
| "grad_norm": 0.8760839700698853, |
| "learning_rate": 7.99279282329952e-07, |
| "loss": 0.148, |
| "step": 12896 |
| }, |
| { |
| "epoch": 2.3, |
| "grad_norm": 1.187658667564392, |
| "learning_rate": 7.871848581950039e-07, |
| "loss": 0.132, |
| "step": 12928 |
| }, |
| { |
| "epoch": 2.3, |
| "grad_norm": 0.9668059349060059, |
| "learning_rate": 7.751655161268481e-07, |
| "loss": 0.1424, |
| "step": 12960 |
| }, |
| { |
| "epoch": 2.31, |
| "grad_norm": 1.1318392753601074, |
| "learning_rate": 7.632217829990668e-07, |
| "loss": 0.1516, |
| "step": 12992 |
| }, |
| { |
| "epoch": 2.32, |
| "grad_norm": 1.3520994186401367, |
| "learning_rate": 7.513541823708828e-07, |
| "loss": 0.1495, |
| "step": 13024 |
| }, |
| { |
| "epoch": 2.32, |
| "grad_norm": 1.3352413177490234, |
| "learning_rate": 7.395632344642173e-07, |
| "loss": 0.1446, |
| "step": 13056 |
| }, |
| { |
| "epoch": 2.33, |
| "grad_norm": 1.0273305177688599, |
| "learning_rate": 7.278494561408833e-07, |
| "loss": 0.1391, |
| "step": 13088 |
| }, |
| { |
| "epoch": 2.33, |
| "grad_norm": 1.2872681617736816, |
| "learning_rate": 7.162133608799271e-07, |
| "loss": 0.1391, |
| "step": 13120 |
| }, |
| { |
| "epoch": 2.34, |
| "grad_norm": 1.0563528537750244, |
| "learning_rate": 7.046554587551216e-07, |
| "loss": 0.1521, |
| "step": 13152 |
| }, |
| { |
| "epoch": 2.34, |
| "grad_norm": 1.1487845182418823, |
| "learning_rate": 6.931762564126074e-07, |
| "loss": 0.1411, |
| "step": 13184 |
| }, |
| { |
| "epoch": 2.35, |
| "grad_norm": 1.058159351348877, |
| "learning_rate": 6.817762570486791e-07, |
| "loss": 0.1424, |
| "step": 13216 |
| }, |
| { |
| "epoch": 2.36, |
| "grad_norm": 1.249377727508545, |
| "learning_rate": 6.704559603877367e-07, |
| "loss": 0.1448, |
| "step": 13248 |
| }, |
| { |
| "epoch": 2.36, |
| "grad_norm": 0.9334893226623535, |
| "learning_rate": 6.592158626603689e-07, |
| "loss": 0.1384, |
| "step": 13280 |
| }, |
| { |
| "epoch": 2.37, |
| "grad_norm": 1.5639148950576782, |
| "learning_rate": 6.480564565816091e-07, |
| "loss": 0.1426, |
| "step": 13312 |
| }, |
| { |
| "epoch": 2.37, |
| "grad_norm": 1.0596867799758911, |
| "learning_rate": 6.369782313293335e-07, |
| "loss": 0.1358, |
| "step": 13344 |
| }, |
| { |
| "epoch": 2.38, |
| "grad_norm": 0.8567415475845337, |
| "learning_rate": 6.259816725228158e-07, |
| "loss": 0.1465, |
| "step": 13376 |
| }, |
| { |
| "epoch": 2.38, |
| "grad_norm": 1.1086764335632324, |
| "learning_rate": 6.150672622014459e-07, |
| "loss": 0.1538, |
| "step": 13408 |
| }, |
| { |
| "epoch": 2.39, |
| "grad_norm": 1.1636631488800049, |
| "learning_rate": 6.042354788035943e-07, |
| "loss": 0.1389, |
| "step": 13440 |
| }, |
| { |
| "epoch": 2.4, |
| "grad_norm": 1.274596929550171, |
| "learning_rate": 5.934867971456384e-07, |
| "loss": 0.1464, |
| "step": 13472 |
| }, |
| { |
| "epoch": 2.4, |
| "grad_norm": 1.173563003540039, |
| "learning_rate": 5.828216884011553e-07, |
| "loss": 0.1435, |
| "step": 13504 |
| }, |
| { |
| "epoch": 2.41, |
| "grad_norm": 1.0788921117782593, |
| "learning_rate": 5.722406200802613e-07, |
| "loss": 0.145, |
| "step": 13536 |
| }, |
| { |
| "epoch": 2.41, |
| "grad_norm": 1.1613490581512451, |
| "learning_rate": 5.617440560091212e-07, |
| "loss": 0.1474, |
| "step": 13568 |
| }, |
| { |
| "epoch": 2.42, |
| "grad_norm": 0.9075080156326294, |
| "learning_rate": 5.513324563096167e-07, |
| "loss": 0.1423, |
| "step": 13600 |
| }, |
| { |
| "epoch": 2.42, |
| "grad_norm": 1.1296495199203491, |
| "learning_rate": 5.41006277379173e-07, |
| "loss": 0.1506, |
| "step": 13632 |
| }, |
| { |
| "epoch": 2.43, |
| "grad_norm": 1.2199699878692627, |
| "learning_rate": 5.307659718707603e-07, |
| "loss": 0.1459, |
| "step": 13664 |
| }, |
| { |
| "epoch": 2.44, |
| "grad_norm": 1.2415364980697632, |
| "learning_rate": 5.20611988673041e-07, |
| "loss": 0.1459, |
| "step": 13696 |
| }, |
| { |
| "epoch": 2.44, |
| "grad_norm": 0.9814534187316895, |
| "learning_rate": 5.105447728907012e-07, |
| "loss": 0.1405, |
| "step": 13728 |
| }, |
| { |
| "epoch": 2.45, |
| "grad_norm": 1.1437889337539673, |
| "learning_rate": 5.00564765824936e-07, |
| "loss": 0.147, |
| "step": 13760 |
| }, |
| { |
| "epoch": 2.45, |
| "grad_norm": 1.1459242105484009, |
| "learning_rate": 4.906724049541023e-07, |
| "loss": 0.1454, |
| "step": 13792 |
| }, |
| { |
| "epoch": 2.46, |
| "grad_norm": 1.093807578086853, |
| "learning_rate": 4.808681239145479e-07, |
| "loss": 0.1448, |
| "step": 13824 |
| }, |
| { |
| "epoch": 2.46, |
| "grad_norm": 1.1457182168960571, |
| "learning_rate": 4.711523524815978e-07, |
| "loss": 0.1391, |
| "step": 13856 |
| }, |
| { |
| "epoch": 2.47, |
| "grad_norm": 1.0422513484954834, |
| "learning_rate": 4.615255165507146e-07, |
| "loss": 0.1435, |
| "step": 13888 |
| }, |
| { |
| "epoch": 2.48, |
| "grad_norm": 1.1171213388442993, |
| "learning_rate": 4.5198803811883326e-07, |
| "loss": 0.1545, |
| "step": 13920 |
| }, |
| { |
| "epoch": 2.48, |
| "grad_norm": 1.3410863876342773, |
| "learning_rate": 4.4254033526585917e-07, |
| "loss": 0.1526, |
| "step": 13952 |
| }, |
| { |
| "epoch": 2.49, |
| "grad_norm": 0.9821498394012451, |
| "learning_rate": 4.331828221363424e-07, |
| "loss": 0.1407, |
| "step": 13984 |
| }, |
| { |
| "epoch": 2.49, |
| "grad_norm": 1.2533886432647705, |
| "learning_rate": 4.239159089213246e-07, |
| "loss": 0.1358, |
| "step": 14016 |
| }, |
| { |
| "epoch": 2.5, |
| "grad_norm": 0.9848787784576416, |
| "learning_rate": 4.147400018403544e-07, |
| "loss": 0.1449, |
| "step": 14048 |
| }, |
| { |
| "epoch": 2.5, |
| "grad_norm": 0.9092288613319397, |
| "learning_rate": 4.056555031236878e-07, |
| "loss": 0.1433, |
| "step": 14080 |
| }, |
| { |
| "epoch": 2.51, |
| "grad_norm": 1.3839354515075684, |
| "learning_rate": 3.966628109946469e-07, |
| "loss": 0.1494, |
| "step": 14112 |
| }, |
| { |
| "epoch": 2.51, |
| "grad_norm": 0.9744381904602051, |
| "learning_rate": 3.877623196521707e-07, |
| "loss": 0.1426, |
| "step": 14144 |
| }, |
| { |
| "epoch": 2.52, |
| "grad_norm": 1.165624976158142, |
| "learning_rate": 3.7895441925353356e-07, |
| "loss": 0.1418, |
| "step": 14176 |
| }, |
| { |
| "epoch": 2.53, |
| "grad_norm": 0.9751477241516113, |
| "learning_rate": 3.702394958972391e-07, |
| "loss": 0.1479, |
| "step": 14208 |
| }, |
| { |
| "epoch": 2.53, |
| "grad_norm": 1.0645439624786377, |
| "learning_rate": 3.616179316061011e-07, |
| "loss": 0.1373, |
| "step": 14240 |
| }, |
| { |
| "epoch": 2.54, |
| "grad_norm": 0.9858296513557434, |
| "learning_rate": 3.5309010431049284e-07, |
| "loss": 0.1367, |
| "step": 14272 |
| }, |
| { |
| "epoch": 2.54, |
| "grad_norm": 1.0521584749221802, |
| "learning_rate": 3.44656387831781e-07, |
| "loss": 0.1421, |
| "step": 14304 |
| }, |
| { |
| "epoch": 2.55, |
| "grad_norm": 1.2546510696411133, |
| "learning_rate": 3.363171518659408e-07, |
| "loss": 0.1384, |
| "step": 14336 |
| }, |
| { |
| "epoch": 2.55, |
| "grad_norm": 1.0795034170150757, |
| "learning_rate": 3.280727619673496e-07, |
| "loss": 0.1463, |
| "step": 14368 |
| }, |
| { |
| "epoch": 2.56, |
| "grad_norm": 1.1764500141143799, |
| "learning_rate": 3.199235795327615e-07, |
| "loss": 0.1499, |
| "step": 14400 |
| }, |
| { |
| "epoch": 2.57, |
| "grad_norm": 1.1234067678451538, |
| "learning_rate": 3.1186996178546674e-07, |
| "loss": 0.1497, |
| "step": 14432 |
| }, |
| { |
| "epoch": 2.57, |
| "grad_norm": 0.9825364947319031, |
| "learning_rate": 3.039122617596302e-07, |
| "loss": 0.1514, |
| "step": 14464 |
| }, |
| { |
| "epoch": 2.58, |
| "grad_norm": 1.263085961341858, |
| "learning_rate": 2.960508282848215e-07, |
| "loss": 0.1476, |
| "step": 14496 |
| }, |
| { |
| "epoch": 2.58, |
| "grad_norm": 1.084181308746338, |
| "learning_rate": 2.8828600597071597e-07, |
| "loss": 0.1308, |
| "step": 14528 |
| }, |
| { |
| "epoch": 2.59, |
| "grad_norm": 1.1697498559951782, |
| "learning_rate": 2.8061813519199536e-07, |
| "loss": 0.1348, |
| "step": 14560 |
| }, |
| { |
| "epoch": 2.59, |
| "grad_norm": 1.3982306718826294, |
| "learning_rate": 2.7304755207342467e-07, |
| "loss": 0.1455, |
| "step": 14592 |
| }, |
| { |
| "epoch": 2.6, |
| "grad_norm": 1.1802705526351929, |
| "learning_rate": 2.655745884751157e-07, |
| "loss": 0.1437, |
| "step": 14624 |
| }, |
| { |
| "epoch": 2.61, |
| "grad_norm": 1.0200531482696533, |
| "learning_rate": 2.581995719779856e-07, |
| "loss": 0.1394, |
| "step": 14656 |
| }, |
| { |
| "epoch": 2.61, |
| "grad_norm": 1.1693042516708374, |
| "learning_rate": 2.5092282586939187e-07, |
| "loss": 0.151, |
| "step": 14688 |
| }, |
| { |
| "epoch": 2.62, |
| "grad_norm": 1.116024374961853, |
| "learning_rate": 2.437446691289616e-07, |
| "loss": 0.1478, |
| "step": 14720 |
| }, |
| { |
| "epoch": 2.62, |
| "grad_norm": 1.05259370803833, |
| "learning_rate": 2.3666541641461231e-07, |
| "loss": 0.1436, |
| "step": 14752 |
| }, |
| { |
| "epoch": 2.63, |
| "grad_norm": 1.0545703172683716, |
| "learning_rate": 2.2968537804875485e-07, |
| "loss": 0.1379, |
| "step": 14784 |
| }, |
| { |
| "epoch": 2.63, |
| "grad_norm": 1.0618197917938232, |
| "learning_rate": 2.228048600046928e-07, |
| "loss": 0.1409, |
| "step": 14816 |
| }, |
| { |
| "epoch": 2.64, |
| "grad_norm": 1.389092206954956, |
| "learning_rate": 2.1602416389320922e-07, |
| "loss": 0.1499, |
| "step": 14848 |
| }, |
| { |
| "epoch": 2.65, |
| "grad_norm": 1.0467108488082886, |
| "learning_rate": 2.0934358694934347e-07, |
| "loss": 0.1406, |
| "step": 14880 |
| }, |
| { |
| "epoch": 2.65, |
| "grad_norm": 1.1932706832885742, |
| "learning_rate": 2.0276342201936637e-07, |
| "loss": 0.1468, |
| "step": 14912 |
| }, |
| { |
| "epoch": 2.66, |
| "grad_norm": 1.2286850214004517, |
| "learning_rate": 1.9628395754793777e-07, |
| "loss": 0.1457, |
| "step": 14944 |
| }, |
| { |
| "epoch": 2.66, |
| "grad_norm": 0.9705607891082764, |
| "learning_rate": 1.899054775654663e-07, |
| "loss": 0.1439, |
| "step": 14976 |
| }, |
| { |
| "epoch": 2.67, |
| "grad_norm": 0.9110348224639893, |
| "learning_rate": 1.8362826167565796e-07, |
| "loss": 0.1439, |
| "step": 15008 |
| }, |
| { |
| "epoch": 2.67, |
| "grad_norm": 0.9858996272087097, |
| "learning_rate": 1.774525850432568e-07, |
| "loss": 0.1528, |
| "step": 15040 |
| }, |
| { |
| "epoch": 2.68, |
| "grad_norm": 1.1253962516784668, |
| "learning_rate": 1.7137871838198817e-07, |
| "loss": 0.1408, |
| "step": 15072 |
| }, |
| { |
| "epoch": 2.69, |
| "grad_norm": 1.5971510410308838, |
| "learning_rate": 1.654069279426873e-07, |
| "loss": 0.1497, |
| "step": 15104 |
| }, |
| { |
| "epoch": 2.69, |
| "grad_norm": 0.8475412130355835, |
| "learning_rate": 1.5953747550162907e-07, |
| "loss": 0.1456, |
| "step": 15136 |
| }, |
| { |
| "epoch": 2.7, |
| "grad_norm": 0.9866968989372253, |
| "learning_rate": 1.537706183490545e-07, |
| "loss": 0.1349, |
| "step": 15168 |
| }, |
| { |
| "epoch": 2.7, |
| "grad_norm": 1.1067461967468262, |
| "learning_rate": 1.481066092778913e-07, |
| "loss": 0.1457, |
| "step": 15200 |
| }, |
| { |
| "epoch": 2.71, |
| "grad_norm": 1.1080329418182373, |
| "learning_rate": 1.4254569657267235e-07, |
| "loss": 0.146, |
| "step": 15232 |
| }, |
| { |
| "epoch": 2.71, |
| "grad_norm": 0.992157518863678, |
| "learning_rate": 1.370881239986524e-07, |
| "loss": 0.1439, |
| "step": 15264 |
| }, |
| { |
| "epoch": 2.72, |
| "grad_norm": 1.032788872718811, |
| "learning_rate": 1.3173413079112128e-07, |
| "loss": 0.1369, |
| "step": 15296 |
| }, |
| { |
| "epoch": 2.73, |
| "grad_norm": 0.9706469774246216, |
| "learning_rate": 1.264839516449204e-07, |
| "loss": 0.136, |
| "step": 15328 |
| }, |
| { |
| "epoch": 2.73, |
| "grad_norm": 1.1187324523925781, |
| "learning_rate": 1.2133781670415013e-07, |
| "loss": 0.1359, |
| "step": 15360 |
| }, |
| { |
| "epoch": 2.74, |
| "grad_norm": 1.1595239639282227, |
| "learning_rate": 1.1629595155208424e-07, |
| "loss": 0.1401, |
| "step": 15392 |
| }, |
| { |
| "epoch": 2.74, |
| "grad_norm": 1.087785243988037, |
| "learning_rate": 1.1135857720128151e-07, |
| "loss": 0.1358, |
| "step": 15424 |
| }, |
| { |
| "epoch": 2.75, |
| "grad_norm": 1.0765306949615479, |
| "learning_rate": 1.0652591008389557e-07, |
| "loss": 0.1438, |
| "step": 15456 |
| }, |
| { |
| "epoch": 2.75, |
| "grad_norm": 1.1016918420791626, |
| "learning_rate": 1.0179816204218928e-07, |
| "loss": 0.1373, |
| "step": 15488 |
| }, |
| { |
| "epoch": 2.76, |
| "grad_norm": 1.0536975860595703, |
| "learning_rate": 9.717554031924842e-08, |
| "loss": 0.1349, |
| "step": 15520 |
| }, |
| { |
| "epoch": 2.77, |
| "grad_norm": 0.8933613300323486, |
| "learning_rate": 9.265824754989467e-08, |
| "loss": 0.1316, |
| "step": 15552 |
| }, |
| { |
| "epoch": 2.77, |
| "grad_norm": 0.9983497858047485, |
| "learning_rate": 8.824648175180722e-08, |
| "loss": 0.1346, |
| "step": 15584 |
| }, |
| { |
| "epoch": 2.78, |
| "grad_norm": 1.0600674152374268, |
| "learning_rate": 8.394043631683862e-08, |
| "loss": 0.1533, |
| "step": 15616 |
| }, |
| { |
| "epoch": 2.78, |
| "grad_norm": 1.0515772104263306, |
| "learning_rate": 7.974030000253986e-08, |
| "loss": 0.139, |
| "step": 15648 |
| }, |
| { |
| "epoch": 2.79, |
| "grad_norm": 1.4163565635681152, |
| "learning_rate": 7.564625692388499e-08, |
| "loss": 0.1323, |
| "step": 15680 |
| }, |
| { |
| "epoch": 2.79, |
| "grad_norm": 1.0619480609893799, |
| "learning_rate": 7.165848654519969e-08, |
| "loss": 0.1373, |
| "step": 15712 |
| }, |
| { |
| "epoch": 2.8, |
| "grad_norm": 0.9783567786216736, |
| "learning_rate": 6.777716367229764e-08, |
| "loss": 0.1525, |
| "step": 15744 |
| }, |
| { |
| "epoch": 2.81, |
| "grad_norm": 1.0433095693588257, |
| "learning_rate": 6.400245844481262e-08, |
| "loss": 0.1409, |
| "step": 15776 |
| }, |
| { |
| "epoch": 2.81, |
| "grad_norm": 1.258354663848877, |
| "learning_rate": 6.033453632874498e-08, |
| "loss": 0.1402, |
| "step": 15808 |
| }, |
| { |
| "epoch": 2.82, |
| "grad_norm": 1.1823972463607788, |
| "learning_rate": 5.677355810920604e-08, |
| "loss": 0.1418, |
| "step": 15840 |
| }, |
| { |
| "epoch": 2.82, |
| "grad_norm": 1.2745051383972168, |
| "learning_rate": 5.3319679883370724e-08, |
| "loss": 0.1471, |
| "step": 15872 |
| }, |
| { |
| "epoch": 2.83, |
| "grad_norm": 1.238215684890747, |
| "learning_rate": 4.9973053053634365e-08, |
| "loss": 0.1426, |
| "step": 15904 |
| }, |
| { |
| "epoch": 2.83, |
| "grad_norm": 1.0394669771194458, |
| "learning_rate": 4.6733824320976674e-08, |
| "loss": 0.1335, |
| "step": 15936 |
| }, |
| { |
| "epoch": 2.84, |
| "grad_norm": 1.2872110605239868, |
| "learning_rate": 4.360213567853072e-08, |
| "loss": 0.1544, |
| "step": 15968 |
| }, |
| { |
| "epoch": 2.84, |
| "grad_norm": 1.2786180973052979, |
| "learning_rate": 4.057812440535797e-08, |
| "loss": 0.1461, |
| "step": 16000 |
| }, |
| { |
| "epoch": 2.85, |
| "grad_norm": 1.169412612915039, |
| "learning_rate": 3.766192306043165e-08, |
| "loss": 0.1413, |
| "step": 16032 |
| }, |
| { |
| "epoch": 2.86, |
| "grad_norm": 1.2436180114746094, |
| "learning_rate": 3.485365947682562e-08, |
| "loss": 0.1357, |
| "step": 16064 |
| }, |
| { |
| "epoch": 2.86, |
| "grad_norm": 1.1065387725830078, |
| "learning_rate": 3.215345675611076e-08, |
| "loss": 0.1472, |
| "step": 16096 |
| }, |
| { |
| "epoch": 2.87, |
| "grad_norm": 1.00310218334198, |
| "learning_rate": 2.9561433262957072e-08, |
| "loss": 0.1499, |
| "step": 16128 |
| }, |
| { |
| "epoch": 2.87, |
| "grad_norm": 0.9328859448432922, |
| "learning_rate": 2.7077702619948963e-08, |
| "loss": 0.1376, |
| "step": 16160 |
| }, |
| { |
| "epoch": 2.88, |
| "grad_norm": 1.122558832168579, |
| "learning_rate": 2.4702373702600868e-08, |
| "loss": 0.1461, |
| "step": 16192 |
| }, |
| { |
| "epoch": 2.88, |
| "grad_norm": 1.1374387741088867, |
| "learning_rate": 2.2435550634585522e-08, |
| "loss": 0.1427, |
| "step": 16224 |
| }, |
| { |
| "epoch": 2.89, |
| "grad_norm": 1.102001428604126, |
| "learning_rate": 2.027733278317151e-08, |
| "loss": 0.1402, |
| "step": 16256 |
| }, |
| { |
| "epoch": 2.9, |
| "grad_norm": 1.3255332708358765, |
| "learning_rate": 1.822781475486507e-08, |
| "loss": 0.1427, |
| "step": 16288 |
| }, |
| { |
| "epoch": 2.9, |
| "grad_norm": 1.1129345893859863, |
| "learning_rate": 1.628708639126425e-08, |
| "loss": 0.1443, |
| "step": 16320 |
| }, |
| { |
| "epoch": 2.91, |
| "grad_norm": 0.9628246426582336, |
| "learning_rate": 1.4455232765120397e-08, |
| "loss": 0.1425, |
| "step": 16352 |
| }, |
| { |
| "epoch": 2.91, |
| "grad_norm": 1.2334058284759521, |
| "learning_rate": 1.273233417660863e-08, |
| "loss": 0.134, |
| "step": 16384 |
| }, |
| { |
| "epoch": 2.92, |
| "grad_norm": 1.1855486631393433, |
| "learning_rate": 1.1118466149808994e-08, |
| "loss": 0.1403, |
| "step": 16416 |
| }, |
| { |
| "epoch": 2.92, |
| "grad_norm": 1.0412129163742065, |
| "learning_rate": 9.61369942939383e-09, |
| "loss": 0.1369, |
| "step": 16448 |
| }, |
| { |
| "epoch": 2.93, |
| "grad_norm": 1.2178360223770142, |
| "learning_rate": 8.218099977528871e-09, |
| "loss": 0.1346, |
| "step": 16480 |
| }, |
| { |
| "epoch": 2.94, |
| "grad_norm": 1.1358833312988281, |
| "learning_rate": 6.9317289709799896e-09, |
| "loss": 0.1504, |
| "step": 16512 |
| }, |
| { |
| "epoch": 2.94, |
| "grad_norm": 1.1772416830062866, |
| "learning_rate": 5.754642798432297e-09, |
| "loss": 0.144, |
| "step": 16544 |
| }, |
| { |
| "epoch": 2.95, |
| "grad_norm": 1.4765156507492065, |
| "learning_rate": 4.686893058018227e-09, |
| "loss": 0.1531, |
| "step": 16576 |
| }, |
| { |
| "epoch": 2.95, |
| "grad_norm": 1.0203588008880615, |
| "learning_rate": 3.728526555056289e-09, |
| "loss": 0.1439, |
| "step": 16608 |
| }, |
| { |
| "epoch": 2.96, |
| "grad_norm": 1.053261637687683, |
| "learning_rate": 2.879585299997434e-09, |
| "loss": 0.1438, |
| "step": 16640 |
| }, |
| { |
| "epoch": 2.96, |
| "grad_norm": 1.2388105392456055, |
| "learning_rate": 2.1401065065859704e-09, |
| "loss": 0.145, |
| "step": 16672 |
| }, |
| { |
| "epoch": 2.97, |
| "grad_norm": 0.9954524040222168, |
| "learning_rate": 1.5101225902267036e-09, |
| "loss": 0.147, |
| "step": 16704 |
| }, |
| { |
| "epoch": 2.98, |
| "grad_norm": 1.2384732961654663, |
| "learning_rate": 9.89661166564404e-10, |
| "loss": 0.1492, |
| "step": 16736 |
| }, |
| { |
| "epoch": 2.98, |
| "grad_norm": 0.8787427544593811, |
| "learning_rate": 5.787450502728331e-10, |
| "loss": 0.1299, |
| "step": 16768 |
| }, |
| { |
| "epoch": 2.99, |
| "grad_norm": 0.9824705719947815, |
| "learning_rate": 2.7739225405609694e-10, |
| "loss": 0.1428, |
| "step": 16800 |
| }, |
| { |
| "epoch": 2.99, |
| "grad_norm": 1.0306531190872192, |
| "learning_rate": 8.561598785705727e-11, |
| "loss": 0.1434, |
| "step": 16832 |
| }, |
| { |
| "epoch": 3.0, |
| "grad_norm": 1.0343343019485474, |
| "learning_rate": 3.424658279460591e-12, |
| "loss": 0.1521, |
| "step": 16864 |
| } |
| ], |
| "logging_steps": 32, |
| "max_steps": 16872, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 3, |
| "save_steps": 5624, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 0.0, |
| "train_batch_size": 2, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|