diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,65833 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9741942170173075, + "eval_steps": 500, + "global_step": 9400, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00010363768266141569, + "grad_norm": 1.0679150819778442, + "learning_rate": 1.3793103448275863e-07, + "loss": 0.9527, + "step": 1 + }, + { + "epoch": 0.00020727536532283138, + "grad_norm": 1.5423212051391602, + "learning_rate": 2.7586206896551726e-07, + "loss": 1.2815, + "step": 2 + }, + { + "epoch": 0.00031091304798424707, + "grad_norm": 1.3922085762023926, + "learning_rate": 4.137931034482759e-07, + "loss": 1.1755, + "step": 3 + }, + { + "epoch": 0.00041455073064566275, + "grad_norm": 1.4147642850875854, + "learning_rate": 5.517241379310345e-07, + "loss": 1.111, + "step": 4 + }, + { + "epoch": 0.0005181884133070785, + "grad_norm": 1.4427387714385986, + "learning_rate": 6.896551724137931e-07, + "loss": 1.2391, + "step": 5 + }, + { + "epoch": 0.0006218260959684941, + "grad_norm": 1.404998540878296, + "learning_rate": 8.275862068965518e-07, + "loss": 1.1594, + "step": 6 + }, + { + "epoch": 0.0007254637786299099, + "grad_norm": 1.4490731954574585, + "learning_rate": 9.655172413793103e-07, + "loss": 1.1807, + "step": 7 + }, + { + "epoch": 0.0008291014612913255, + "grad_norm": 1.4405816793441772, + "learning_rate": 1.103448275862069e-06, + "loss": 1.209, + "step": 8 + }, + { + "epoch": 0.0009327391439527412, + "grad_norm": 1.3140469789505005, + "learning_rate": 1.2413793103448277e-06, + "loss": 1.0561, + "step": 9 + }, + { + "epoch": 0.001036376826614157, + "grad_norm": 1.3970351219177246, + "learning_rate": 1.3793103448275862e-06, + "loss": 1.1147, + "step": 10 + }, + { + "epoch": 0.0011400145092755726, + "grad_norm": 1.3507342338562012, + "learning_rate": 1.517241379310345e-06, + "loss": 1.1222, + "step": 11 + }, + { + "epoch": 0.0012436521919369883, + "grad_norm": 1.404917597770691, + "learning_rate": 1.6551724137931037e-06, + "loss": 1.1796, + "step": 12 + }, + { + "epoch": 0.001347289874598404, + "grad_norm": 1.2982468605041504, + "learning_rate": 1.7931034482758622e-06, + "loss": 1.1586, + "step": 13 + }, + { + "epoch": 0.0014509275572598197, + "grad_norm": 1.3628538846969604, + "learning_rate": 1.9310344827586207e-06, + "loss": 1.1517, + "step": 14 + }, + { + "epoch": 0.0015545652399212354, + "grad_norm": 1.6343719959259033, + "learning_rate": 2.0689655172413796e-06, + "loss": 1.2808, + "step": 15 + }, + { + "epoch": 0.001658202922582651, + "grad_norm": 1.227524757385254, + "learning_rate": 2.206896551724138e-06, + "loss": 1.0468, + "step": 16 + }, + { + "epoch": 0.0017618406052440666, + "grad_norm": 1.4011520147323608, + "learning_rate": 2.3448275862068966e-06, + "loss": 1.183, + "step": 17 + }, + { + "epoch": 0.0018654782879054825, + "grad_norm": 1.3206757307052612, + "learning_rate": 2.4827586206896555e-06, + "loss": 1.1323, + "step": 18 + }, + { + "epoch": 0.001969115970566898, + "grad_norm": 1.2996641397476196, + "learning_rate": 2.6206896551724144e-06, + "loss": 1.1096, + "step": 19 + }, + { + "epoch": 0.002072753653228314, + "grad_norm": 1.3232481479644775, + "learning_rate": 2.7586206896551725e-06, + "loss": 1.1212, + "step": 20 + }, + { + "epoch": 0.0021763913358897294, + "grad_norm": 1.409572958946228, + "learning_rate": 2.8965517241379314e-06, + "loss": 1.1313, + "step": 21 + }, + { + "epoch": 0.0022800290185511453, + "grad_norm": 1.2496095895767212, + "learning_rate": 3.03448275862069e-06, + "loss": 1.0481, + "step": 22 + }, + { + "epoch": 0.002383666701212561, + "grad_norm": 1.353409767150879, + "learning_rate": 3.172413793103449e-06, + "loss": 1.1167, + "step": 23 + }, + { + "epoch": 0.0024873043838739765, + "grad_norm": 1.3337156772613525, + "learning_rate": 3.3103448275862073e-06, + "loss": 1.1315, + "step": 24 + }, + { + "epoch": 0.0025909420665353924, + "grad_norm": 1.301856517791748, + "learning_rate": 3.448275862068966e-06, + "loss": 1.0786, + "step": 25 + }, + { + "epoch": 0.002694579749196808, + "grad_norm": 1.4697136878967285, + "learning_rate": 3.5862068965517243e-06, + "loss": 1.168, + "step": 26 + }, + { + "epoch": 0.0027982174318582236, + "grad_norm": 1.3797833919525146, + "learning_rate": 3.7241379310344832e-06, + "loss": 1.1165, + "step": 27 + }, + { + "epoch": 0.0029018551145196395, + "grad_norm": 1.247714638710022, + "learning_rate": 3.862068965517241e-06, + "loss": 1.0296, + "step": 28 + }, + { + "epoch": 0.003005492797181055, + "grad_norm": 1.4584671258926392, + "learning_rate": 4.000000000000001e-06, + "loss": 1.1499, + "step": 29 + }, + { + "epoch": 0.0031091304798424708, + "grad_norm": 1.4505141973495483, + "learning_rate": 4.137931034482759e-06, + "loss": 1.1686, + "step": 30 + }, + { + "epoch": 0.0032127681625038866, + "grad_norm": 1.4163717031478882, + "learning_rate": 4.275862068965518e-06, + "loss": 1.0762, + "step": 31 + }, + { + "epoch": 0.003316405845165302, + "grad_norm": 1.2217729091644287, + "learning_rate": 4.413793103448276e-06, + "loss": 0.9751, + "step": 32 + }, + { + "epoch": 0.003420043527826718, + "grad_norm": 1.5361864566802979, + "learning_rate": 4.551724137931035e-06, + "loss": 1.1817, + "step": 33 + }, + { + "epoch": 0.0035236812104881333, + "grad_norm": 1.3760747909545898, + "learning_rate": 4.689655172413793e-06, + "loss": 1.024, + "step": 34 + }, + { + "epoch": 0.003627318893149549, + "grad_norm": 1.1844209432601929, + "learning_rate": 4.8275862068965525e-06, + "loss": 0.9327, + "step": 35 + }, + { + "epoch": 0.003730956575810965, + "grad_norm": 1.568856120109558, + "learning_rate": 4.965517241379311e-06, + "loss": 1.2155, + "step": 36 + }, + { + "epoch": 0.0038345942584723804, + "grad_norm": 1.200225591659546, + "learning_rate": 5.1034482758620695e-06, + "loss": 0.9578, + "step": 37 + }, + { + "epoch": 0.003938231941133796, + "grad_norm": 1.2568145990371704, + "learning_rate": 5.241379310344829e-06, + "loss": 0.9857, + "step": 38 + }, + { + "epoch": 0.004041869623795212, + "grad_norm": 1.3353314399719238, + "learning_rate": 5.3793103448275865e-06, + "loss": 1.1098, + "step": 39 + }, + { + "epoch": 0.004145507306456628, + "grad_norm": 1.161146640777588, + "learning_rate": 5.517241379310345e-06, + "loss": 0.9297, + "step": 40 + }, + { + "epoch": 0.004249144989118043, + "grad_norm": 1.262049913406372, + "learning_rate": 5.655172413793104e-06, + "loss": 1.0301, + "step": 41 + }, + { + "epoch": 0.004352782671779459, + "grad_norm": 1.0658643245697021, + "learning_rate": 5.793103448275863e-06, + "loss": 1.0132, + "step": 42 + }, + { + "epoch": 0.004456420354440875, + "grad_norm": 1.0373111963272095, + "learning_rate": 5.9310344827586205e-06, + "loss": 0.9424, + "step": 43 + }, + { + "epoch": 0.0045600580371022905, + "grad_norm": 1.1395219564437866, + "learning_rate": 6.06896551724138e-06, + "loss": 1.036, + "step": 44 + }, + { + "epoch": 0.004663695719763706, + "grad_norm": 0.9569352865219116, + "learning_rate": 6.206896551724138e-06, + "loss": 0.8821, + "step": 45 + }, + { + "epoch": 0.004767333402425122, + "grad_norm": 0.9801313877105713, + "learning_rate": 6.344827586206898e-06, + "loss": 0.9391, + "step": 46 + }, + { + "epoch": 0.004870971085086538, + "grad_norm": 1.0882436037063599, + "learning_rate": 6.482758620689655e-06, + "loss": 0.9922, + "step": 47 + }, + { + "epoch": 0.004974608767747953, + "grad_norm": 0.9098449945449829, + "learning_rate": 6.620689655172415e-06, + "loss": 0.8526, + "step": 48 + }, + { + "epoch": 0.0050782464504093685, + "grad_norm": 0.9299390912055969, + "learning_rate": 6.758620689655173e-06, + "loss": 0.8582, + "step": 49 + }, + { + "epoch": 0.005181884133070785, + "grad_norm": 0.8806210160255432, + "learning_rate": 6.896551724137932e-06, + "loss": 0.8892, + "step": 50 + }, + { + "epoch": 0.0052855218157322, + "grad_norm": 0.8702979683876038, + "learning_rate": 7.03448275862069e-06, + "loss": 0.8592, + "step": 51 + }, + { + "epoch": 0.005389159498393616, + "grad_norm": 0.8257914781570435, + "learning_rate": 7.172413793103449e-06, + "loss": 0.9179, + "step": 52 + }, + { + "epoch": 0.005492797181055032, + "grad_norm": 0.8620724081993103, + "learning_rate": 7.310344827586208e-06, + "loss": 0.8952, + "step": 53 + }, + { + "epoch": 0.005596434863716447, + "grad_norm": 0.8150980472564697, + "learning_rate": 7.4482758620689665e-06, + "loss": 0.7818, + "step": 54 + }, + { + "epoch": 0.005700072546377863, + "grad_norm": 0.8714101910591125, + "learning_rate": 7.586206896551724e-06, + "loss": 0.8628, + "step": 55 + }, + { + "epoch": 0.005803710229039279, + "grad_norm": 0.9289737939834595, + "learning_rate": 7.724137931034483e-06, + "loss": 0.9275, + "step": 56 + }, + { + "epoch": 0.005907347911700694, + "grad_norm": 0.7288852334022522, + "learning_rate": 7.862068965517242e-06, + "loss": 0.7724, + "step": 57 + }, + { + "epoch": 0.00601098559436211, + "grad_norm": 0.7169085741043091, + "learning_rate": 8.000000000000001e-06, + "loss": 0.7608, + "step": 58 + }, + { + "epoch": 0.006114623277023526, + "grad_norm": 0.6929631233215332, + "learning_rate": 8.137931034482759e-06, + "loss": 0.7667, + "step": 59 + }, + { + "epoch": 0.0062182609596849415, + "grad_norm": 0.789334237575531, + "learning_rate": 8.275862068965518e-06, + "loss": 0.7887, + "step": 60 + }, + { + "epoch": 0.006321898642346357, + "grad_norm": 0.7103134393692017, + "learning_rate": 8.413793103448276e-06, + "loss": 0.7329, + "step": 61 + }, + { + "epoch": 0.006425536325007773, + "grad_norm": 0.7139261960983276, + "learning_rate": 8.551724137931035e-06, + "loss": 0.722, + "step": 62 + }, + { + "epoch": 0.006529174007669189, + "grad_norm": 0.6719704866409302, + "learning_rate": 8.689655172413793e-06, + "loss": 0.6828, + "step": 63 + }, + { + "epoch": 0.006632811690330604, + "grad_norm": 0.6433077454566956, + "learning_rate": 8.827586206896552e-06, + "loss": 0.6597, + "step": 64 + }, + { + "epoch": 0.0067364493729920195, + "grad_norm": 0.5811107158660889, + "learning_rate": 8.965517241379312e-06, + "loss": 0.6593, + "step": 65 + }, + { + "epoch": 0.006840087055653436, + "grad_norm": 0.5606787800788879, + "learning_rate": 9.10344827586207e-06, + "loss": 0.7172, + "step": 66 + }, + { + "epoch": 0.006943724738314851, + "grad_norm": 0.5808541178703308, + "learning_rate": 9.241379310344829e-06, + "loss": 0.6282, + "step": 67 + }, + { + "epoch": 0.007047362420976267, + "grad_norm": 0.5700259208679199, + "learning_rate": 9.379310344827586e-06, + "loss": 0.6215, + "step": 68 + }, + { + "epoch": 0.007151000103637683, + "grad_norm": 0.6022498607635498, + "learning_rate": 9.517241379310346e-06, + "loss": 0.6481, + "step": 69 + }, + { + "epoch": 0.007254637786299098, + "grad_norm": 0.63319993019104, + "learning_rate": 9.655172413793105e-06, + "loss": 0.6874, + "step": 70 + }, + { + "epoch": 0.007358275468960514, + "grad_norm": 0.6779617071151733, + "learning_rate": 9.793103448275863e-06, + "loss": 0.6006, + "step": 71 + }, + { + "epoch": 0.00746191315162193, + "grad_norm": 0.7366542816162109, + "learning_rate": 9.931034482758622e-06, + "loss": 0.7501, + "step": 72 + }, + { + "epoch": 0.007565550834283345, + "grad_norm": 0.681352436542511, + "learning_rate": 1.006896551724138e-05, + "loss": 0.6076, + "step": 73 + }, + { + "epoch": 0.007669188516944761, + "grad_norm": 0.5272005200386047, + "learning_rate": 1.0206896551724139e-05, + "loss": 0.5571, + "step": 74 + }, + { + "epoch": 0.007772826199606177, + "grad_norm": 0.579204797744751, + "learning_rate": 1.0344827586206898e-05, + "loss": 0.6202, + "step": 75 + }, + { + "epoch": 0.007876463882267593, + "grad_norm": 0.4939001202583313, + "learning_rate": 1.0482758620689658e-05, + "loss": 0.6472, + "step": 76 + }, + { + "epoch": 0.007980101564929008, + "grad_norm": 0.4667978286743164, + "learning_rate": 1.0620689655172414e-05, + "loss": 0.6026, + "step": 77 + }, + { + "epoch": 0.008083739247590423, + "grad_norm": 0.5069308280944824, + "learning_rate": 1.0758620689655173e-05, + "loss": 0.6633, + "step": 78 + }, + { + "epoch": 0.008187376930251839, + "grad_norm": 0.39213359355926514, + "learning_rate": 1.0896551724137932e-05, + "loss": 0.534, + "step": 79 + }, + { + "epoch": 0.008291014612913256, + "grad_norm": 0.48177847266197205, + "learning_rate": 1.103448275862069e-05, + "loss": 0.5833, + "step": 80 + }, + { + "epoch": 0.008394652295574671, + "grad_norm": 0.4758414030075073, + "learning_rate": 1.117241379310345e-05, + "loss": 0.4952, + "step": 81 + }, + { + "epoch": 0.008498289978236087, + "grad_norm": 0.40138718485832214, + "learning_rate": 1.1310344827586209e-05, + "loss": 0.4742, + "step": 82 + }, + { + "epoch": 0.008601927660897502, + "grad_norm": 0.4233314096927643, + "learning_rate": 1.1448275862068966e-05, + "loss": 0.6, + "step": 83 + }, + { + "epoch": 0.008705565343558918, + "grad_norm": 0.4068356156349182, + "learning_rate": 1.1586206896551726e-05, + "loss": 0.5541, + "step": 84 + }, + { + "epoch": 0.008809203026220333, + "grad_norm": 0.4177202582359314, + "learning_rate": 1.1724137931034483e-05, + "loss": 0.4775, + "step": 85 + }, + { + "epoch": 0.00891284070888175, + "grad_norm": 0.43890196084976196, + "learning_rate": 1.1862068965517241e-05, + "loss": 0.5701, + "step": 86 + }, + { + "epoch": 0.009016478391543166, + "grad_norm": 0.4295465648174286, + "learning_rate": 1.2e-05, + "loss": 0.5267, + "step": 87 + }, + { + "epoch": 0.009120116074204581, + "grad_norm": 0.4324893057346344, + "learning_rate": 1.213793103448276e-05, + "loss": 0.5316, + "step": 88 + }, + { + "epoch": 0.009223753756865996, + "grad_norm": 0.4378964304924011, + "learning_rate": 1.2275862068965519e-05, + "loss": 0.5169, + "step": 89 + }, + { + "epoch": 0.009327391439527412, + "grad_norm": 0.46402058005332947, + "learning_rate": 1.2413793103448277e-05, + "loss": 0.5574, + "step": 90 + }, + { + "epoch": 0.009431029122188827, + "grad_norm": 0.41128864884376526, + "learning_rate": 1.2551724137931036e-05, + "loss": 0.4585, + "step": 91 + }, + { + "epoch": 0.009534666804850244, + "grad_norm": 0.4755265414714813, + "learning_rate": 1.2689655172413795e-05, + "loss": 0.544, + "step": 92 + }, + { + "epoch": 0.00963830448751166, + "grad_norm": 0.3633743226528168, + "learning_rate": 1.2827586206896551e-05, + "loss": 0.4365, + "step": 93 + }, + { + "epoch": 0.009741942170173075, + "grad_norm": 0.4546061158180237, + "learning_rate": 1.296551724137931e-05, + "loss": 0.5609, + "step": 94 + }, + { + "epoch": 0.00984557985283449, + "grad_norm": 0.43278414011001587, + "learning_rate": 1.310344827586207e-05, + "loss": 0.5665, + "step": 95 + }, + { + "epoch": 0.009949217535495906, + "grad_norm": 0.4906076192855835, + "learning_rate": 1.324137931034483e-05, + "loss": 0.5032, + "step": 96 + }, + { + "epoch": 0.010052855218157321, + "grad_norm": 0.42069289088249207, + "learning_rate": 1.3379310344827587e-05, + "loss": 0.4994, + "step": 97 + }, + { + "epoch": 0.010156492900818737, + "grad_norm": 0.4489268660545349, + "learning_rate": 1.3517241379310346e-05, + "loss": 0.4499, + "step": 98 + }, + { + "epoch": 0.010260130583480154, + "grad_norm": 0.4298715591430664, + "learning_rate": 1.3655172413793106e-05, + "loss": 0.4592, + "step": 99 + }, + { + "epoch": 0.01036376826614157, + "grad_norm": 0.6667768359184265, + "learning_rate": 1.3793103448275863e-05, + "loss": 0.557, + "step": 100 + }, + { + "epoch": 0.010467405948802985, + "grad_norm": 0.39222168922424316, + "learning_rate": 1.3931034482758621e-05, + "loss": 0.4509, + "step": 101 + }, + { + "epoch": 0.0105710436314644, + "grad_norm": 0.45276468992233276, + "learning_rate": 1.406896551724138e-05, + "loss": 0.4495, + "step": 102 + }, + { + "epoch": 0.010674681314125816, + "grad_norm": 0.531661331653595, + "learning_rate": 1.4206896551724138e-05, + "loss": 0.5465, + "step": 103 + }, + { + "epoch": 0.010778318996787231, + "grad_norm": 0.5024605989456177, + "learning_rate": 1.4344827586206897e-05, + "loss": 0.518, + "step": 104 + }, + { + "epoch": 0.010881956679448648, + "grad_norm": 0.6134429574012756, + "learning_rate": 1.4482758620689657e-05, + "loss": 0.4726, + "step": 105 + }, + { + "epoch": 0.010985594362110064, + "grad_norm": 0.41454777121543884, + "learning_rate": 1.4620689655172416e-05, + "loss": 0.4367, + "step": 106 + }, + { + "epoch": 0.01108923204477148, + "grad_norm": 0.4755084812641144, + "learning_rate": 1.4758620689655174e-05, + "loss": 0.4115, + "step": 107 + }, + { + "epoch": 0.011192869727432895, + "grad_norm": 0.37892940640449524, + "learning_rate": 1.4896551724137933e-05, + "loss": 0.4145, + "step": 108 + }, + { + "epoch": 0.01129650741009431, + "grad_norm": 0.43258216977119446, + "learning_rate": 1.503448275862069e-05, + "loss": 0.356, + "step": 109 + }, + { + "epoch": 0.011400145092755725, + "grad_norm": 0.4437129497528076, + "learning_rate": 1.5172413793103448e-05, + "loss": 0.4951, + "step": 110 + }, + { + "epoch": 0.01150378277541714, + "grad_norm": 0.3721010386943817, + "learning_rate": 1.5310344827586208e-05, + "loss": 0.4168, + "step": 111 + }, + { + "epoch": 0.011607420458078558, + "grad_norm": 0.4480922818183899, + "learning_rate": 1.5448275862068965e-05, + "loss": 0.4556, + "step": 112 + }, + { + "epoch": 0.011711058140739973, + "grad_norm": 0.3953317403793335, + "learning_rate": 1.5586206896551726e-05, + "loss": 0.3778, + "step": 113 + }, + { + "epoch": 0.011814695823401389, + "grad_norm": 0.4656274914741516, + "learning_rate": 1.5724137931034484e-05, + "loss": 0.4854, + "step": 114 + }, + { + "epoch": 0.011918333506062804, + "grad_norm": 0.4352475106716156, + "learning_rate": 1.586206896551724e-05, + "loss": 0.4269, + "step": 115 + }, + { + "epoch": 0.01202197118872422, + "grad_norm": 0.44665348529815674, + "learning_rate": 1.6000000000000003e-05, + "loss": 0.3769, + "step": 116 + }, + { + "epoch": 0.012125608871385635, + "grad_norm": 0.44178900122642517, + "learning_rate": 1.613793103448276e-05, + "loss": 0.4108, + "step": 117 + }, + { + "epoch": 0.012229246554047052, + "grad_norm": 0.45032814145088196, + "learning_rate": 1.6275862068965518e-05, + "loss": 0.4648, + "step": 118 + }, + { + "epoch": 0.012332884236708468, + "grad_norm": 0.3899349272251129, + "learning_rate": 1.6413793103448276e-05, + "loss": 0.42, + "step": 119 + }, + { + "epoch": 0.012436521919369883, + "grad_norm": 0.4548027813434601, + "learning_rate": 1.6551724137931037e-05, + "loss": 0.4473, + "step": 120 + }, + { + "epoch": 0.012540159602031298, + "grad_norm": 0.4112202227115631, + "learning_rate": 1.6689655172413794e-05, + "loss": 0.3918, + "step": 121 + }, + { + "epoch": 0.012643797284692714, + "grad_norm": 0.5322526693344116, + "learning_rate": 1.6827586206896552e-05, + "loss": 0.5122, + "step": 122 + }, + { + "epoch": 0.01274743496735413, + "grad_norm": 0.48069503903388977, + "learning_rate": 1.6965517241379313e-05, + "loss": 0.4585, + "step": 123 + }, + { + "epoch": 0.012851072650015546, + "grad_norm": 0.48351654410362244, + "learning_rate": 1.710344827586207e-05, + "loss": 0.3876, + "step": 124 + }, + { + "epoch": 0.012954710332676962, + "grad_norm": 0.4733540117740631, + "learning_rate": 1.7241379310344828e-05, + "loss": 0.4134, + "step": 125 + }, + { + "epoch": 0.013058348015338377, + "grad_norm": 0.3814505934715271, + "learning_rate": 1.7379310344827586e-05, + "loss": 0.3921, + "step": 126 + }, + { + "epoch": 0.013161985697999793, + "grad_norm": 0.45198261737823486, + "learning_rate": 1.7517241379310347e-05, + "loss": 0.4672, + "step": 127 + }, + { + "epoch": 0.013265623380661208, + "grad_norm": 0.5899551510810852, + "learning_rate": 1.7655172413793105e-05, + "loss": 0.4775, + "step": 128 + }, + { + "epoch": 0.013369261063322624, + "grad_norm": 0.4327309727668762, + "learning_rate": 1.7793103448275862e-05, + "loss": 0.3983, + "step": 129 + }, + { + "epoch": 0.013472898745984039, + "grad_norm": 0.42272329330444336, + "learning_rate": 1.7931034482758623e-05, + "loss": 0.4247, + "step": 130 + }, + { + "epoch": 0.013576536428645456, + "grad_norm": 0.4427931308746338, + "learning_rate": 1.806896551724138e-05, + "loss": 0.4422, + "step": 131 + }, + { + "epoch": 0.013680174111306872, + "grad_norm": 0.4716092050075531, + "learning_rate": 1.820689655172414e-05, + "loss": 0.4261, + "step": 132 + }, + { + "epoch": 0.013783811793968287, + "grad_norm": 0.40726834535598755, + "learning_rate": 1.8344827586206896e-05, + "loss": 0.4017, + "step": 133 + }, + { + "epoch": 0.013887449476629702, + "grad_norm": 0.3557395040988922, + "learning_rate": 1.8482758620689657e-05, + "loss": 0.3759, + "step": 134 + }, + { + "epoch": 0.013991087159291118, + "grad_norm": 0.4733648896217346, + "learning_rate": 1.8620689655172415e-05, + "loss": 0.4111, + "step": 135 + }, + { + "epoch": 0.014094724841952533, + "grad_norm": 0.3284488320350647, + "learning_rate": 1.8758620689655173e-05, + "loss": 0.3095, + "step": 136 + }, + { + "epoch": 0.01419836252461395, + "grad_norm": 0.3625854551792145, + "learning_rate": 1.8896551724137934e-05, + "loss": 0.3512, + "step": 137 + }, + { + "epoch": 0.014302000207275366, + "grad_norm": 0.4706290662288666, + "learning_rate": 1.903448275862069e-05, + "loss": 0.4833, + "step": 138 + }, + { + "epoch": 0.014405637889936781, + "grad_norm": 0.45985788106918335, + "learning_rate": 1.917241379310345e-05, + "loss": 0.3579, + "step": 139 + }, + { + "epoch": 0.014509275572598197, + "grad_norm": 0.4740069806575775, + "learning_rate": 1.931034482758621e-05, + "loss": 0.4024, + "step": 140 + }, + { + "epoch": 0.014612913255259612, + "grad_norm": 0.4517245590686798, + "learning_rate": 1.9448275862068968e-05, + "loss": 0.3534, + "step": 141 + }, + { + "epoch": 0.014716550937921027, + "grad_norm": 0.4930357336997986, + "learning_rate": 1.9586206896551725e-05, + "loss": 0.4098, + "step": 142 + }, + { + "epoch": 0.014820188620582445, + "grad_norm": 0.4142999053001404, + "learning_rate": 1.9724137931034483e-05, + "loss": 0.3796, + "step": 143 + }, + { + "epoch": 0.01492382630324386, + "grad_norm": 0.44440484046936035, + "learning_rate": 1.9862068965517244e-05, + "loss": 0.3553, + "step": 144 + }, + { + "epoch": 0.015027463985905275, + "grad_norm": 0.4276621639728546, + "learning_rate": 2e-05, + "loss": 0.374, + "step": 145 + }, + { + "epoch": 0.01513110166856669, + "grad_norm": 0.4273627698421478, + "learning_rate": 2.013793103448276e-05, + "loss": 0.4201, + "step": 146 + }, + { + "epoch": 0.015234739351228106, + "grad_norm": 0.5435613989830017, + "learning_rate": 2.027586206896552e-05, + "loss": 0.4238, + "step": 147 + }, + { + "epoch": 0.015338377033889522, + "grad_norm": 0.47127071022987366, + "learning_rate": 2.0413793103448278e-05, + "loss": 0.3729, + "step": 148 + }, + { + "epoch": 0.015442014716550937, + "grad_norm": 0.37225133180618286, + "learning_rate": 2.0551724137931036e-05, + "loss": 0.3273, + "step": 149 + }, + { + "epoch": 0.015545652399212354, + "grad_norm": 0.6549889445304871, + "learning_rate": 2.0689655172413797e-05, + "loss": 0.4219, + "step": 150 + }, + { + "epoch": 0.015649290081873768, + "grad_norm": 0.44427254796028137, + "learning_rate": 2.0827586206896554e-05, + "loss": 0.3056, + "step": 151 + }, + { + "epoch": 0.015752927764535185, + "grad_norm": 0.4055584669113159, + "learning_rate": 2.0965517241379315e-05, + "loss": 0.3699, + "step": 152 + }, + { + "epoch": 0.015856565447196602, + "grad_norm": 0.4897557497024536, + "learning_rate": 2.1103448275862073e-05, + "loss": 0.3725, + "step": 153 + }, + { + "epoch": 0.015960203129858016, + "grad_norm": 0.49200958013534546, + "learning_rate": 2.1241379310344827e-05, + "loss": 0.3634, + "step": 154 + }, + { + "epoch": 0.016063840812519433, + "grad_norm": 0.47332412004470825, + "learning_rate": 2.1379310344827585e-05, + "loss": 0.4008, + "step": 155 + }, + { + "epoch": 0.016167478495180847, + "grad_norm": 0.4750024378299713, + "learning_rate": 2.1517241379310346e-05, + "loss": 0.4689, + "step": 156 + }, + { + "epoch": 0.016271116177842264, + "grad_norm": 0.45490074157714844, + "learning_rate": 2.1655172413793104e-05, + "loss": 0.4351, + "step": 157 + }, + { + "epoch": 0.016374753860503678, + "grad_norm": 0.4558456838130951, + "learning_rate": 2.1793103448275865e-05, + "loss": 0.3582, + "step": 158 + }, + { + "epoch": 0.016478391543165095, + "grad_norm": 0.4957145154476166, + "learning_rate": 2.1931034482758622e-05, + "loss": 0.4335, + "step": 159 + }, + { + "epoch": 0.016582029225826512, + "grad_norm": 0.4751178026199341, + "learning_rate": 2.206896551724138e-05, + "loss": 0.3405, + "step": 160 + }, + { + "epoch": 0.016685666908487926, + "grad_norm": 0.4940536916255951, + "learning_rate": 2.220689655172414e-05, + "loss": 0.386, + "step": 161 + }, + { + "epoch": 0.016789304591149343, + "grad_norm": 0.525262713432312, + "learning_rate": 2.23448275862069e-05, + "loss": 0.337, + "step": 162 + }, + { + "epoch": 0.016892942273810756, + "grad_norm": 0.4709758758544922, + "learning_rate": 2.2482758620689656e-05, + "loss": 0.3818, + "step": 163 + }, + { + "epoch": 0.016996579956472174, + "grad_norm": 0.51386559009552, + "learning_rate": 2.2620689655172417e-05, + "loss": 0.3694, + "step": 164 + }, + { + "epoch": 0.01710021763913359, + "grad_norm": 0.48915234208106995, + "learning_rate": 2.2758620689655175e-05, + "loss": 0.4096, + "step": 165 + }, + { + "epoch": 0.017203855321795004, + "grad_norm": 0.5260508060455322, + "learning_rate": 2.2896551724137933e-05, + "loss": 0.3378, + "step": 166 + }, + { + "epoch": 0.01730749300445642, + "grad_norm": 0.48009803891181946, + "learning_rate": 2.3034482758620694e-05, + "loss": 0.3874, + "step": 167 + }, + { + "epoch": 0.017411130687117835, + "grad_norm": 0.5436667799949646, + "learning_rate": 2.317241379310345e-05, + "loss": 0.3528, + "step": 168 + }, + { + "epoch": 0.017514768369779252, + "grad_norm": 0.5391678810119629, + "learning_rate": 2.3310344827586212e-05, + "loss": 0.3975, + "step": 169 + }, + { + "epoch": 0.017618406052440666, + "grad_norm": 0.38910624384880066, + "learning_rate": 2.3448275862068967e-05, + "loss": 0.3708, + "step": 170 + }, + { + "epoch": 0.017722043735102083, + "grad_norm": 0.4939960241317749, + "learning_rate": 2.3586206896551724e-05, + "loss": 0.357, + "step": 171 + }, + { + "epoch": 0.0178256814177635, + "grad_norm": 0.3927547037601471, + "learning_rate": 2.3724137931034482e-05, + "loss": 0.2959, + "step": 172 + }, + { + "epoch": 0.017929319100424914, + "grad_norm": 0.45952022075653076, + "learning_rate": 2.3862068965517243e-05, + "loss": 0.3149, + "step": 173 + }, + { + "epoch": 0.01803295678308633, + "grad_norm": 0.4640410840511322, + "learning_rate": 2.4e-05, + "loss": 0.3859, + "step": 174 + }, + { + "epoch": 0.018136594465747745, + "grad_norm": 0.6159297823905945, + "learning_rate": 2.413793103448276e-05, + "loss": 0.4259, + "step": 175 + }, + { + "epoch": 0.018240232148409162, + "grad_norm": 0.5587474703788757, + "learning_rate": 2.427586206896552e-05, + "loss": 0.3207, + "step": 176 + }, + { + "epoch": 0.018343869831070576, + "grad_norm": 0.5045621395111084, + "learning_rate": 2.4413793103448277e-05, + "loss": 0.4125, + "step": 177 + }, + { + "epoch": 0.018447507513731993, + "grad_norm": 0.42282718420028687, + "learning_rate": 2.4551724137931038e-05, + "loss": 0.3426, + "step": 178 + }, + { + "epoch": 0.01855114519639341, + "grad_norm": 0.5033625960350037, + "learning_rate": 2.4689655172413796e-05, + "loss": 0.3988, + "step": 179 + }, + { + "epoch": 0.018654782879054824, + "grad_norm": 0.5493314862251282, + "learning_rate": 2.4827586206896553e-05, + "loss": 0.3396, + "step": 180 + }, + { + "epoch": 0.01875842056171624, + "grad_norm": 0.47352248430252075, + "learning_rate": 2.4965517241379314e-05, + "loss": 0.3763, + "step": 181 + }, + { + "epoch": 0.018862058244377655, + "grad_norm": 0.45625126361846924, + "learning_rate": 2.5103448275862072e-05, + "loss": 0.3829, + "step": 182 + }, + { + "epoch": 0.01896569592703907, + "grad_norm": 0.5125209093093872, + "learning_rate": 2.524137931034483e-05, + "loss": 0.3919, + "step": 183 + }, + { + "epoch": 0.01906933360970049, + "grad_norm": 0.4657573103904724, + "learning_rate": 2.537931034482759e-05, + "loss": 0.3326, + "step": 184 + }, + { + "epoch": 0.019172971292361903, + "grad_norm": 0.5020167231559753, + "learning_rate": 2.551724137931035e-05, + "loss": 0.3733, + "step": 185 + }, + { + "epoch": 0.01927660897502332, + "grad_norm": 0.48332661390304565, + "learning_rate": 2.5655172413793103e-05, + "loss": 0.3545, + "step": 186 + }, + { + "epoch": 0.019380246657684733, + "grad_norm": 0.4483455717563629, + "learning_rate": 2.5793103448275864e-05, + "loss": 0.3001, + "step": 187 + }, + { + "epoch": 0.01948388434034615, + "grad_norm": 0.4317588806152344, + "learning_rate": 2.593103448275862e-05, + "loss": 0.3211, + "step": 188 + }, + { + "epoch": 0.019587522023007564, + "grad_norm": 0.5770359635353088, + "learning_rate": 2.606896551724138e-05, + "loss": 0.3931, + "step": 189 + }, + { + "epoch": 0.01969115970566898, + "grad_norm": 0.4711754322052002, + "learning_rate": 2.620689655172414e-05, + "loss": 0.3387, + "step": 190 + }, + { + "epoch": 0.0197947973883304, + "grad_norm": 0.4772488474845886, + "learning_rate": 2.6344827586206898e-05, + "loss": 0.3786, + "step": 191 + }, + { + "epoch": 0.019898435070991812, + "grad_norm": 0.5390376448631287, + "learning_rate": 2.648275862068966e-05, + "loss": 0.359, + "step": 192 + }, + { + "epoch": 0.02000207275365323, + "grad_norm": 0.5037919878959656, + "learning_rate": 2.6620689655172416e-05, + "loss": 0.3906, + "step": 193 + }, + { + "epoch": 0.020105710436314643, + "grad_norm": 0.47281694412231445, + "learning_rate": 2.6758620689655174e-05, + "loss": 0.3429, + "step": 194 + }, + { + "epoch": 0.02020934811897606, + "grad_norm": 0.4878309369087219, + "learning_rate": 2.6896551724137935e-05, + "loss": 0.349, + "step": 195 + }, + { + "epoch": 0.020312985801637474, + "grad_norm": 0.41772618889808655, + "learning_rate": 2.7034482758620693e-05, + "loss": 0.2966, + "step": 196 + }, + { + "epoch": 0.02041662348429889, + "grad_norm": 0.5848966836929321, + "learning_rate": 2.717241379310345e-05, + "loss": 0.3791, + "step": 197 + }, + { + "epoch": 0.020520261166960308, + "grad_norm": 0.5652657151222229, + "learning_rate": 2.731034482758621e-05, + "loss": 0.4379, + "step": 198 + }, + { + "epoch": 0.020623898849621722, + "grad_norm": 0.4548835754394531, + "learning_rate": 2.744827586206897e-05, + "loss": 0.2742, + "step": 199 + }, + { + "epoch": 0.02072753653228314, + "grad_norm": 0.41999682784080505, + "learning_rate": 2.7586206896551727e-05, + "loss": 0.2811, + "step": 200 + }, + { + "epoch": 0.020831174214944553, + "grad_norm": 0.48233315348625183, + "learning_rate": 2.7724137931034488e-05, + "loss": 0.3374, + "step": 201 + }, + { + "epoch": 0.02093481189760597, + "grad_norm": 0.5580697059631348, + "learning_rate": 2.7862068965517242e-05, + "loss": 0.3705, + "step": 202 + }, + { + "epoch": 0.021038449580267383, + "grad_norm": 0.5358230471611023, + "learning_rate": 2.8e-05, + "loss": 0.3769, + "step": 203 + }, + { + "epoch": 0.0211420872629288, + "grad_norm": 0.4528452157974243, + "learning_rate": 2.813793103448276e-05, + "loss": 0.3231, + "step": 204 + }, + { + "epoch": 0.021245724945590218, + "grad_norm": 0.5074284076690674, + "learning_rate": 2.8275862068965518e-05, + "loss": 0.3532, + "step": 205 + }, + { + "epoch": 0.02134936262825163, + "grad_norm": 0.46669670939445496, + "learning_rate": 2.8413793103448276e-05, + "loss": 0.2634, + "step": 206 + }, + { + "epoch": 0.02145300031091305, + "grad_norm": 0.445211797952652, + "learning_rate": 2.8551724137931037e-05, + "loss": 0.2899, + "step": 207 + }, + { + "epoch": 0.021556637993574462, + "grad_norm": 0.49502161145210266, + "learning_rate": 2.8689655172413795e-05, + "loss": 0.3434, + "step": 208 + }, + { + "epoch": 0.02166027567623588, + "grad_norm": 0.5964809060096741, + "learning_rate": 2.8827586206896556e-05, + "loss": 0.3686, + "step": 209 + }, + { + "epoch": 0.021763913358897297, + "grad_norm": 0.45797616243362427, + "learning_rate": 2.8965517241379313e-05, + "loss": 0.3317, + "step": 210 + }, + { + "epoch": 0.02186755104155871, + "grad_norm": 0.5436007380485535, + "learning_rate": 2.910344827586207e-05, + "loss": 0.3771, + "step": 211 + }, + { + "epoch": 0.021971188724220127, + "grad_norm": 0.49826550483703613, + "learning_rate": 2.9241379310344832e-05, + "loss": 0.3123, + "step": 212 + }, + { + "epoch": 0.02207482640688154, + "grad_norm": 0.5190101861953735, + "learning_rate": 2.937931034482759e-05, + "loss": 0.3288, + "step": 213 + }, + { + "epoch": 0.02217846408954296, + "grad_norm": 0.49877676367759705, + "learning_rate": 2.9517241379310347e-05, + "loss": 0.317, + "step": 214 + }, + { + "epoch": 0.022282101772204372, + "grad_norm": 0.524618923664093, + "learning_rate": 2.965517241379311e-05, + "loss": 0.3448, + "step": 215 + }, + { + "epoch": 0.02238573945486579, + "grad_norm": 0.5132225155830383, + "learning_rate": 2.9793103448275866e-05, + "loss": 0.3199, + "step": 216 + }, + { + "epoch": 0.022489377137527206, + "grad_norm": 0.44829437136650085, + "learning_rate": 2.9931034482758624e-05, + "loss": 0.3222, + "step": 217 + }, + { + "epoch": 0.02259301482018862, + "grad_norm": 0.5427300333976746, + "learning_rate": 3.006896551724138e-05, + "loss": 0.3107, + "step": 218 + }, + { + "epoch": 0.022696652502850037, + "grad_norm": 0.49013856053352356, + "learning_rate": 3.020689655172414e-05, + "loss": 0.2805, + "step": 219 + }, + { + "epoch": 0.02280029018551145, + "grad_norm": 0.4895915985107422, + "learning_rate": 3.0344827586206897e-05, + "loss": 0.2775, + "step": 220 + }, + { + "epoch": 0.022903927868172868, + "grad_norm": 0.5539257526397705, + "learning_rate": 3.0482758620689658e-05, + "loss": 0.364, + "step": 221 + }, + { + "epoch": 0.02300756555083428, + "grad_norm": 0.5412529110908508, + "learning_rate": 3.0620689655172415e-05, + "loss": 0.331, + "step": 222 + }, + { + "epoch": 0.0231112032334957, + "grad_norm": 0.45927807688713074, + "learning_rate": 3.0758620689655176e-05, + "loss": 0.3188, + "step": 223 + }, + { + "epoch": 0.023214840916157116, + "grad_norm": 0.5479596257209778, + "learning_rate": 3.089655172413793e-05, + "loss": 0.3522, + "step": 224 + }, + { + "epoch": 0.02331847859881853, + "grad_norm": 0.5624309778213501, + "learning_rate": 3.103448275862069e-05, + "loss": 0.4032, + "step": 225 + }, + { + "epoch": 0.023422116281479947, + "grad_norm": 0.5181564688682556, + "learning_rate": 3.117241379310345e-05, + "loss": 0.3215, + "step": 226 + }, + { + "epoch": 0.02352575396414136, + "grad_norm": 0.4655615985393524, + "learning_rate": 3.131034482758621e-05, + "loss": 0.3679, + "step": 227 + }, + { + "epoch": 0.023629391646802778, + "grad_norm": 0.4179629981517792, + "learning_rate": 3.144827586206897e-05, + "loss": 0.2558, + "step": 228 + }, + { + "epoch": 0.023733029329464195, + "grad_norm": 0.4396083950996399, + "learning_rate": 3.158620689655173e-05, + "loss": 0.2876, + "step": 229 + }, + { + "epoch": 0.02383666701212561, + "grad_norm": 0.49648308753967285, + "learning_rate": 3.172413793103448e-05, + "loss": 0.2919, + "step": 230 + }, + { + "epoch": 0.023940304694787026, + "grad_norm": 0.5090090036392212, + "learning_rate": 3.1862068965517244e-05, + "loss": 0.3227, + "step": 231 + }, + { + "epoch": 0.02404394237744844, + "grad_norm": 0.5114050507545471, + "learning_rate": 3.2000000000000005e-05, + "loss": 0.3374, + "step": 232 + }, + { + "epoch": 0.024147580060109856, + "grad_norm": 0.48820868134498596, + "learning_rate": 3.2137931034482766e-05, + "loss": 0.3371, + "step": 233 + }, + { + "epoch": 0.02425121774277127, + "grad_norm": 0.5176852941513062, + "learning_rate": 3.227586206896552e-05, + "loss": 0.3017, + "step": 234 + }, + { + "epoch": 0.024354855425432687, + "grad_norm": 0.4962432086467743, + "learning_rate": 3.2413793103448275e-05, + "loss": 0.311, + "step": 235 + }, + { + "epoch": 0.024458493108094104, + "grad_norm": 0.6376422643661499, + "learning_rate": 3.2551724137931036e-05, + "loss": 0.3636, + "step": 236 + }, + { + "epoch": 0.024562130790755518, + "grad_norm": 0.5156605839729309, + "learning_rate": 3.26896551724138e-05, + "loss": 0.283, + "step": 237 + }, + { + "epoch": 0.024665768473416935, + "grad_norm": 0.5535326600074768, + "learning_rate": 3.282758620689655e-05, + "loss": 0.2782, + "step": 238 + }, + { + "epoch": 0.02476940615607835, + "grad_norm": 0.459387868642807, + "learning_rate": 3.296551724137931e-05, + "loss": 0.2749, + "step": 239 + }, + { + "epoch": 0.024873043838739766, + "grad_norm": 0.47547951340675354, + "learning_rate": 3.310344827586207e-05, + "loss": 0.2842, + "step": 240 + }, + { + "epoch": 0.02497668152140118, + "grad_norm": 0.5172399282455444, + "learning_rate": 3.324137931034483e-05, + "loss": 0.3367, + "step": 241 + }, + { + "epoch": 0.025080319204062597, + "grad_norm": 0.5787510275840759, + "learning_rate": 3.337931034482759e-05, + "loss": 0.3104, + "step": 242 + }, + { + "epoch": 0.025183956886724014, + "grad_norm": 0.5061002969741821, + "learning_rate": 3.351724137931035e-05, + "loss": 0.2925, + "step": 243 + }, + { + "epoch": 0.025287594569385428, + "grad_norm": 0.5571149587631226, + "learning_rate": 3.3655172413793104e-05, + "loss": 0.3751, + "step": 244 + }, + { + "epoch": 0.025391232252046845, + "grad_norm": 0.5382230281829834, + "learning_rate": 3.3793103448275865e-05, + "loss": 0.3197, + "step": 245 + }, + { + "epoch": 0.02549486993470826, + "grad_norm": 0.5850135087966919, + "learning_rate": 3.3931034482758626e-05, + "loss": 0.3414, + "step": 246 + }, + { + "epoch": 0.025598507617369676, + "grad_norm": 0.5176693797111511, + "learning_rate": 3.406896551724138e-05, + "loss": 0.3454, + "step": 247 + }, + { + "epoch": 0.025702145300031093, + "grad_norm": 0.5404151678085327, + "learning_rate": 3.420689655172414e-05, + "loss": 0.3133, + "step": 248 + }, + { + "epoch": 0.025805782982692507, + "grad_norm": 0.49067196249961853, + "learning_rate": 3.43448275862069e-05, + "loss": 0.2944, + "step": 249 + }, + { + "epoch": 0.025909420665353924, + "grad_norm": 0.457242876291275, + "learning_rate": 3.4482758620689657e-05, + "loss": 0.274, + "step": 250 + }, + { + "epoch": 0.026013058348015337, + "grad_norm": 0.5542387366294861, + "learning_rate": 3.462068965517242e-05, + "loss": 0.3578, + "step": 251 + }, + { + "epoch": 0.026116696030676755, + "grad_norm": 0.46230652928352356, + "learning_rate": 3.475862068965517e-05, + "loss": 0.3237, + "step": 252 + }, + { + "epoch": 0.026220333713338168, + "grad_norm": 0.5638226866722107, + "learning_rate": 3.489655172413793e-05, + "loss": 0.3196, + "step": 253 + }, + { + "epoch": 0.026323971395999585, + "grad_norm": 0.5501084923744202, + "learning_rate": 3.5034482758620694e-05, + "loss": 0.3407, + "step": 254 + }, + { + "epoch": 0.026427609078661003, + "grad_norm": 0.5448089838027954, + "learning_rate": 3.517241379310345e-05, + "loss": 0.2784, + "step": 255 + }, + { + "epoch": 0.026531246761322416, + "grad_norm": 0.4925558865070343, + "learning_rate": 3.531034482758621e-05, + "loss": 0.3386, + "step": 256 + }, + { + "epoch": 0.026634884443983833, + "grad_norm": 0.5627326369285583, + "learning_rate": 3.544827586206897e-05, + "loss": 0.3312, + "step": 257 + }, + { + "epoch": 0.026738522126645247, + "grad_norm": 0.5060065388679504, + "learning_rate": 3.5586206896551725e-05, + "loss": 0.3281, + "step": 258 + }, + { + "epoch": 0.026842159809306664, + "grad_norm": 0.6172648668289185, + "learning_rate": 3.5724137931034486e-05, + "loss": 0.3321, + "step": 259 + }, + { + "epoch": 0.026945797491968078, + "grad_norm": 0.5663166046142578, + "learning_rate": 3.586206896551725e-05, + "loss": 0.3, + "step": 260 + }, + { + "epoch": 0.027049435174629495, + "grad_norm": 0.47459420561790466, + "learning_rate": 3.6e-05, + "loss": 0.2953, + "step": 261 + }, + { + "epoch": 0.027153072857290912, + "grad_norm": 0.5799350738525391, + "learning_rate": 3.613793103448276e-05, + "loss": 0.3353, + "step": 262 + }, + { + "epoch": 0.027256710539952326, + "grad_norm": 0.5640973448753357, + "learning_rate": 3.627586206896552e-05, + "loss": 0.3178, + "step": 263 + }, + { + "epoch": 0.027360348222613743, + "grad_norm": 0.5171974301338196, + "learning_rate": 3.641379310344828e-05, + "loss": 0.3243, + "step": 264 + }, + { + "epoch": 0.027463985905275157, + "grad_norm": 0.569558322429657, + "learning_rate": 3.655172413793104e-05, + "loss": 0.3031, + "step": 265 + }, + { + "epoch": 0.027567623587936574, + "grad_norm": 0.531513512134552, + "learning_rate": 3.668965517241379e-05, + "loss": 0.3143, + "step": 266 + }, + { + "epoch": 0.02767126127059799, + "grad_norm": 0.5028888583183289, + "learning_rate": 3.6827586206896554e-05, + "loss": 0.3018, + "step": 267 + }, + { + "epoch": 0.027774898953259405, + "grad_norm": 0.5275846719741821, + "learning_rate": 3.6965517241379315e-05, + "loss": 0.3304, + "step": 268 + }, + { + "epoch": 0.027878536635920822, + "grad_norm": 0.48110881447792053, + "learning_rate": 3.710344827586207e-05, + "loss": 0.2937, + "step": 269 + }, + { + "epoch": 0.027982174318582236, + "grad_norm": 0.5279209017753601, + "learning_rate": 3.724137931034483e-05, + "loss": 0.3709, + "step": 270 + }, + { + "epoch": 0.028085812001243653, + "grad_norm": 0.5388814210891724, + "learning_rate": 3.737931034482759e-05, + "loss": 0.3539, + "step": 271 + }, + { + "epoch": 0.028189449683905066, + "grad_norm": 0.5041470527648926, + "learning_rate": 3.7517241379310345e-05, + "loss": 0.3097, + "step": 272 + }, + { + "epoch": 0.028293087366566484, + "grad_norm": 0.55646812915802, + "learning_rate": 3.7655172413793106e-05, + "loss": 0.2919, + "step": 273 + }, + { + "epoch": 0.0283967250492279, + "grad_norm": 0.5717840194702148, + "learning_rate": 3.779310344827587e-05, + "loss": 0.3896, + "step": 274 + }, + { + "epoch": 0.028500362731889314, + "grad_norm": 0.5246320366859436, + "learning_rate": 3.793103448275862e-05, + "loss": 0.374, + "step": 275 + }, + { + "epoch": 0.02860400041455073, + "grad_norm": 0.4986425042152405, + "learning_rate": 3.806896551724138e-05, + "loss": 0.3252, + "step": 276 + }, + { + "epoch": 0.028707638097212145, + "grad_norm": 0.5473276376724243, + "learning_rate": 3.8206896551724144e-05, + "loss": 0.2667, + "step": 277 + }, + { + "epoch": 0.028811275779873562, + "grad_norm": 0.6111901998519897, + "learning_rate": 3.83448275862069e-05, + "loss": 0.3327, + "step": 278 + }, + { + "epoch": 0.028914913462534976, + "grad_norm": 0.540371835231781, + "learning_rate": 3.848275862068966e-05, + "loss": 0.3404, + "step": 279 + }, + { + "epoch": 0.029018551145196393, + "grad_norm": 0.4997597336769104, + "learning_rate": 3.862068965517242e-05, + "loss": 0.3128, + "step": 280 + }, + { + "epoch": 0.02912218882785781, + "grad_norm": 0.45018187165260315, + "learning_rate": 3.8758620689655174e-05, + "loss": 0.2513, + "step": 281 + }, + { + "epoch": 0.029225826510519224, + "grad_norm": 0.4394817650318146, + "learning_rate": 3.8896551724137935e-05, + "loss": 0.2758, + "step": 282 + }, + { + "epoch": 0.02932946419318064, + "grad_norm": 0.5027971267700195, + "learning_rate": 3.903448275862069e-05, + "loss": 0.3064, + "step": 283 + }, + { + "epoch": 0.029433101875842055, + "grad_norm": 0.47926047444343567, + "learning_rate": 3.917241379310345e-05, + "loss": 0.2235, + "step": 284 + }, + { + "epoch": 0.029536739558503472, + "grad_norm": 0.4586944878101349, + "learning_rate": 3.931034482758621e-05, + "loss": 0.295, + "step": 285 + }, + { + "epoch": 0.02964037724116489, + "grad_norm": 0.6728528738021851, + "learning_rate": 3.9448275862068966e-05, + "loss": 0.3476, + "step": 286 + }, + { + "epoch": 0.029744014923826303, + "grad_norm": 0.5615842938423157, + "learning_rate": 3.958620689655173e-05, + "loss": 0.2827, + "step": 287 + }, + { + "epoch": 0.02984765260648772, + "grad_norm": 0.46119019389152527, + "learning_rate": 3.972413793103449e-05, + "loss": 0.2363, + "step": 288 + }, + { + "epoch": 0.029951290289149134, + "grad_norm": 0.4322184920310974, + "learning_rate": 3.986206896551724e-05, + "loss": 0.2375, + "step": 289 + }, + { + "epoch": 0.03005492797181055, + "grad_norm": 0.43789973855018616, + "learning_rate": 4e-05, + "loss": 0.2555, + "step": 290 + }, + { + "epoch": 0.030158565654471964, + "grad_norm": 0.47169503569602966, + "learning_rate": 3.999999887321555e-05, + "loss": 0.246, + "step": 291 + }, + { + "epoch": 0.03026220333713338, + "grad_norm": 0.415968656539917, + "learning_rate": 3.999999549286231e-05, + "loss": 0.3025, + "step": 292 + }, + { + "epoch": 0.0303658410197948, + "grad_norm": 0.4890746474266052, + "learning_rate": 3.999998985894067e-05, + "loss": 0.2951, + "step": 293 + }, + { + "epoch": 0.030469478702456212, + "grad_norm": 0.5048105716705322, + "learning_rate": 3.999998197145127e-05, + "loss": 0.3317, + "step": 294 + }, + { + "epoch": 0.03057311638511763, + "grad_norm": 0.4656011462211609, + "learning_rate": 3.999997183039498e-05, + "loss": 0.299, + "step": 295 + }, + { + "epoch": 0.030676754067779043, + "grad_norm": 0.5024938583374023, + "learning_rate": 3.999995943577297e-05, + "loss": 0.2788, + "step": 296 + }, + { + "epoch": 0.03078039175044046, + "grad_norm": 0.5081213116645813, + "learning_rate": 3.9999944787586606e-05, + "loss": 0.3068, + "step": 297 + }, + { + "epoch": 0.030884029433101874, + "grad_norm": 0.529151976108551, + "learning_rate": 3.999992788583756e-05, + "loss": 0.3062, + "step": 298 + }, + { + "epoch": 0.03098766711576329, + "grad_norm": 0.6183048486709595, + "learning_rate": 3.999990873052774e-05, + "loss": 0.3258, + "step": 299 + }, + { + "epoch": 0.03109130479842471, + "grad_norm": 0.4893452227115631, + "learning_rate": 3.999988732165928e-05, + "loss": 0.3118, + "step": 300 + }, + { + "epoch": 0.031194942481086122, + "grad_norm": 0.5209515690803528, + "learning_rate": 3.999986365923461e-05, + "loss": 0.282, + "step": 301 + }, + { + "epoch": 0.031298580163747536, + "grad_norm": 0.6380146741867065, + "learning_rate": 3.99998377432564e-05, + "loss": 0.3404, + "step": 302 + }, + { + "epoch": 0.03140221784640895, + "grad_norm": 0.607676088809967, + "learning_rate": 3.9999809573727556e-05, + "loss": 0.2871, + "step": 303 + }, + { + "epoch": 0.03150585552907037, + "grad_norm": 0.5492339134216309, + "learning_rate": 3.9999779150651266e-05, + "loss": 0.333, + "step": 304 + }, + { + "epoch": 0.03160949321173179, + "grad_norm": 0.4717906415462494, + "learning_rate": 3.9999746474030945e-05, + "loss": 0.2943, + "step": 305 + }, + { + "epoch": 0.031713130894393204, + "grad_norm": 0.5014916062355042, + "learning_rate": 3.999971154387028e-05, + "loss": 0.2634, + "step": 306 + }, + { + "epoch": 0.031816768577054615, + "grad_norm": 0.5935418605804443, + "learning_rate": 3.999967436017322e-05, + "loss": 0.2988, + "step": 307 + }, + { + "epoch": 0.03192040625971603, + "grad_norm": 0.5674479007720947, + "learning_rate": 3.9999634922943934e-05, + "loss": 0.3497, + "step": 308 + }, + { + "epoch": 0.03202404394237745, + "grad_norm": 0.4754267930984497, + "learning_rate": 3.999959323218688e-05, + "loss": 0.2656, + "step": 309 + }, + { + "epoch": 0.032127681625038866, + "grad_norm": 0.5481701493263245, + "learning_rate": 3.9999549287906746e-05, + "loss": 0.3455, + "step": 310 + }, + { + "epoch": 0.03223131930770028, + "grad_norm": 0.6132522821426392, + "learning_rate": 3.9999503090108494e-05, + "loss": 0.3413, + "step": 311 + }, + { + "epoch": 0.03233495699036169, + "grad_norm": 0.5571139454841614, + "learning_rate": 3.999945463879732e-05, + "loss": 0.3069, + "step": 312 + }, + { + "epoch": 0.03243859467302311, + "grad_norm": 0.5365187525749207, + "learning_rate": 3.999940393397869e-05, + "loss": 0.2773, + "step": 313 + }, + { + "epoch": 0.03254223235568453, + "grad_norm": 0.5804490447044373, + "learning_rate": 3.99993509756583e-05, + "loss": 0.3161, + "step": 314 + }, + { + "epoch": 0.032645870038345945, + "grad_norm": 0.5937289595603943, + "learning_rate": 3.999929576384215e-05, + "loss": 0.2953, + "step": 315 + }, + { + "epoch": 0.032749507721007355, + "grad_norm": 0.5334322452545166, + "learning_rate": 3.9999238298536436e-05, + "loss": 0.2788, + "step": 316 + }, + { + "epoch": 0.03285314540366877, + "grad_norm": 0.5063037276268005, + "learning_rate": 3.9999178579747636e-05, + "loss": 0.2907, + "step": 317 + }, + { + "epoch": 0.03295678308633019, + "grad_norm": 0.6236560344696045, + "learning_rate": 3.999911660748249e-05, + "loss": 0.3119, + "step": 318 + }, + { + "epoch": 0.03306042076899161, + "grad_norm": 0.47660860419273376, + "learning_rate": 3.999905238174797e-05, + "loss": 0.2608, + "step": 319 + }, + { + "epoch": 0.033164058451653024, + "grad_norm": 0.5512443780899048, + "learning_rate": 3.9998985902551315e-05, + "loss": 0.307, + "step": 320 + }, + { + "epoch": 0.033267696134314434, + "grad_norm": 0.6016091108322144, + "learning_rate": 3.999891716990002e-05, + "loss": 0.3115, + "step": 321 + }, + { + "epoch": 0.03337133381697585, + "grad_norm": 0.5076780319213867, + "learning_rate": 3.9998846183801826e-05, + "loss": 0.2699, + "step": 322 + }, + { + "epoch": 0.03347497149963727, + "grad_norm": 0.46604278683662415, + "learning_rate": 3.999877294426474e-05, + "loss": 0.3226, + "step": 323 + }, + { + "epoch": 0.033578609182298685, + "grad_norm": 0.5372481942176819, + "learning_rate": 3.9998697451297e-05, + "loss": 0.3307, + "step": 324 + }, + { + "epoch": 0.0336822468649601, + "grad_norm": 0.5999156832695007, + "learning_rate": 3.999861970490711e-05, + "loss": 0.2952, + "step": 325 + }, + { + "epoch": 0.03378588454762151, + "grad_norm": 0.5492348074913025, + "learning_rate": 3.999853970510386e-05, + "loss": 0.3538, + "step": 326 + }, + { + "epoch": 0.03388952223028293, + "grad_norm": 0.4700670838356018, + "learning_rate": 3.9998457451896234e-05, + "loss": 0.3182, + "step": 327 + }, + { + "epoch": 0.03399315991294435, + "grad_norm": 0.4508884847164154, + "learning_rate": 3.999837294529351e-05, + "loss": 0.2587, + "step": 328 + }, + { + "epoch": 0.034096797595605764, + "grad_norm": 0.38943636417388916, + "learning_rate": 3.9998286185305216e-05, + "loss": 0.22, + "step": 329 + }, + { + "epoch": 0.03420043527826718, + "grad_norm": 0.6094530820846558, + "learning_rate": 3.999819717194111e-05, + "loss": 0.3358, + "step": 330 + }, + { + "epoch": 0.03430407296092859, + "grad_norm": 0.5398091077804565, + "learning_rate": 3.999810590521125e-05, + "loss": 0.2796, + "step": 331 + }, + { + "epoch": 0.03440771064359001, + "grad_norm": 0.5134654641151428, + "learning_rate": 3.9998012385125896e-05, + "loss": 0.2806, + "step": 332 + }, + { + "epoch": 0.034511348326251426, + "grad_norm": 0.5822792649269104, + "learning_rate": 3.999791661169559e-05, + "loss": 0.312, + "step": 333 + }, + { + "epoch": 0.03461498600891284, + "grad_norm": 0.5909684300422668, + "learning_rate": 3.999781858493114e-05, + "loss": 0.3188, + "step": 334 + }, + { + "epoch": 0.03471862369157425, + "grad_norm": 0.4724874794483185, + "learning_rate": 3.9997718304843574e-05, + "loss": 0.2591, + "step": 335 + }, + { + "epoch": 0.03482226137423567, + "grad_norm": 0.5870285630226135, + "learning_rate": 3.9997615771444194e-05, + "loss": 0.3509, + "step": 336 + }, + { + "epoch": 0.03492589905689709, + "grad_norm": 0.5800949335098267, + "learning_rate": 3.999751098474455e-05, + "loss": 0.285, + "step": 337 + }, + { + "epoch": 0.035029536739558505, + "grad_norm": 0.4950229525566101, + "learning_rate": 3.9997403944756466e-05, + "loss": 0.2865, + "step": 338 + }, + { + "epoch": 0.03513317442221992, + "grad_norm": 0.5681800246238708, + "learning_rate": 3.999729465149199e-05, + "loss": 0.3624, + "step": 339 + }, + { + "epoch": 0.03523681210488133, + "grad_norm": 0.49301090836524963, + "learning_rate": 3.999718310496344e-05, + "loss": 0.3046, + "step": 340 + }, + { + "epoch": 0.03534044978754275, + "grad_norm": 0.4641712009906769, + "learning_rate": 3.999706930518338e-05, + "loss": 0.2779, + "step": 341 + }, + { + "epoch": 0.035444087470204166, + "grad_norm": 0.5455201268196106, + "learning_rate": 3.999695325216464e-05, + "loss": 0.338, + "step": 342 + }, + { + "epoch": 0.035547725152865584, + "grad_norm": 0.4920612573623657, + "learning_rate": 3.9996834945920286e-05, + "loss": 0.3227, + "step": 343 + }, + { + "epoch": 0.035651362835527, + "grad_norm": 0.4165387749671936, + "learning_rate": 3.999671438646366e-05, + "loss": 0.2795, + "step": 344 + }, + { + "epoch": 0.03575500051818841, + "grad_norm": 0.46413376927375793, + "learning_rate": 3.9996591573808346e-05, + "loss": 0.2857, + "step": 345 + }, + { + "epoch": 0.03585863820084983, + "grad_norm": 0.4862508773803711, + "learning_rate": 3.9996466507968175e-05, + "loss": 0.3072, + "step": 346 + }, + { + "epoch": 0.035962275883511245, + "grad_norm": 0.4505625069141388, + "learning_rate": 3.9996339188957243e-05, + "loss": 0.2708, + "step": 347 + }, + { + "epoch": 0.03606591356617266, + "grad_norm": 0.3864147961139679, + "learning_rate": 3.9996209616789897e-05, + "loss": 0.2053, + "step": 348 + }, + { + "epoch": 0.03616955124883408, + "grad_norm": 0.4509885311126709, + "learning_rate": 3.999607779148074e-05, + "loss": 0.2996, + "step": 349 + }, + { + "epoch": 0.03627318893149549, + "grad_norm": 0.4463110864162445, + "learning_rate": 3.999594371304461e-05, + "loss": 0.2519, + "step": 350 + }, + { + "epoch": 0.03637682661415691, + "grad_norm": 0.499304860830307, + "learning_rate": 3.999580738149664e-05, + "loss": 0.2424, + "step": 351 + }, + { + "epoch": 0.036480464296818324, + "grad_norm": 0.4653189480304718, + "learning_rate": 3.9995668796852174e-05, + "loss": 0.3005, + "step": 352 + }, + { + "epoch": 0.03658410197947974, + "grad_norm": 0.4830104112625122, + "learning_rate": 3.9995527959126835e-05, + "loss": 0.2373, + "step": 353 + }, + { + "epoch": 0.03668773966214115, + "grad_norm": 0.5555719137191772, + "learning_rate": 3.999538486833648e-05, + "loss": 0.3509, + "step": 354 + }, + { + "epoch": 0.03679137734480257, + "grad_norm": 0.5193206667900085, + "learning_rate": 3.999523952449725e-05, + "loss": 0.2891, + "step": 355 + }, + { + "epoch": 0.036895015027463986, + "grad_norm": 0.5097503066062927, + "learning_rate": 3.999509192762551e-05, + "loss": 0.3265, + "step": 356 + }, + { + "epoch": 0.0369986527101254, + "grad_norm": 0.4666855037212372, + "learning_rate": 3.999494207773789e-05, + "loss": 0.2682, + "step": 357 + }, + { + "epoch": 0.03710229039278682, + "grad_norm": 0.5073977112770081, + "learning_rate": 3.9994789974851285e-05, + "loss": 0.3099, + "step": 358 + }, + { + "epoch": 0.03720592807544823, + "grad_norm": 0.4599367380142212, + "learning_rate": 3.999463561898283e-05, + "loss": 0.2535, + "step": 359 + }, + { + "epoch": 0.03730956575810965, + "grad_norm": 0.5521755814552307, + "learning_rate": 3.999447901014991e-05, + "loss": 0.2691, + "step": 360 + }, + { + "epoch": 0.037413203440771065, + "grad_norm": 0.5754250288009644, + "learning_rate": 3.999432014837018e-05, + "loss": 0.2835, + "step": 361 + }, + { + "epoch": 0.03751684112343248, + "grad_norm": 0.4755854904651642, + "learning_rate": 3.9994159033661535e-05, + "loss": 0.2636, + "step": 362 + }, + { + "epoch": 0.0376204788060939, + "grad_norm": 0.49590805172920227, + "learning_rate": 3.999399566604214e-05, + "loss": 0.26, + "step": 363 + }, + { + "epoch": 0.03772411648875531, + "grad_norm": 0.5175451636314392, + "learning_rate": 3.999383004553039e-05, + "loss": 0.2751, + "step": 364 + }, + { + "epoch": 0.037827754171416726, + "grad_norm": 0.5882648825645447, + "learning_rate": 3.999366217214495e-05, + "loss": 0.3524, + "step": 365 + }, + { + "epoch": 0.03793139185407814, + "grad_norm": 0.5125890970230103, + "learning_rate": 3.9993492045904734e-05, + "loss": 0.3061, + "step": 366 + }, + { + "epoch": 0.03803502953673956, + "grad_norm": 0.5740824937820435, + "learning_rate": 3.999331966682892e-05, + "loss": 0.3479, + "step": 367 + }, + { + "epoch": 0.03813866721940098, + "grad_norm": 0.5392363667488098, + "learning_rate": 3.999314503493692e-05, + "loss": 0.2503, + "step": 368 + }, + { + "epoch": 0.03824230490206239, + "grad_norm": 0.45441433787345886, + "learning_rate": 3.9992968150248426e-05, + "loss": 0.2621, + "step": 369 + }, + { + "epoch": 0.038345942584723805, + "grad_norm": 0.5334871411323547, + "learning_rate": 3.999278901278336e-05, + "loss": 0.2924, + "step": 370 + }, + { + "epoch": 0.03844958026738522, + "grad_norm": 0.4892536401748657, + "learning_rate": 3.99926076225619e-05, + "loss": 0.2559, + "step": 371 + }, + { + "epoch": 0.03855321795004664, + "grad_norm": 0.48785465955734253, + "learning_rate": 3.9992423979604496e-05, + "loss": 0.2914, + "step": 372 + }, + { + "epoch": 0.03865685563270805, + "grad_norm": 0.443486750125885, + "learning_rate": 3.9992238083931834e-05, + "loss": 0.2435, + "step": 373 + }, + { + "epoch": 0.03876049331536947, + "grad_norm": 0.48417192697525024, + "learning_rate": 3.999204993556487e-05, + "loss": 0.2702, + "step": 374 + }, + { + "epoch": 0.038864130998030884, + "grad_norm": 0.5423489809036255, + "learning_rate": 3.99918595345248e-05, + "loss": 0.2818, + "step": 375 + }, + { + "epoch": 0.0389677686806923, + "grad_norm": 0.5185508728027344, + "learning_rate": 3.9991666880833064e-05, + "loss": 0.3319, + "step": 376 + }, + { + "epoch": 0.03907140636335372, + "grad_norm": 0.5657951235771179, + "learning_rate": 3.9991471974511384e-05, + "loss": 0.2834, + "step": 377 + }, + { + "epoch": 0.03917504404601513, + "grad_norm": 0.5035321116447449, + "learning_rate": 3.9991274815581726e-05, + "loss": 0.2788, + "step": 378 + }, + { + "epoch": 0.039278681728676546, + "grad_norm": 0.49256327748298645, + "learning_rate": 3.9991075404066296e-05, + "loss": 0.2841, + "step": 379 + }, + { + "epoch": 0.03938231941133796, + "grad_norm": 0.48294106125831604, + "learning_rate": 3.999087373998756e-05, + "loss": 0.2793, + "step": 380 + }, + { + "epoch": 0.03948595709399938, + "grad_norm": 0.4720240831375122, + "learning_rate": 3.9990669823368255e-05, + "loss": 0.2448, + "step": 381 + }, + { + "epoch": 0.0395895947766608, + "grad_norm": 0.4897540509700775, + "learning_rate": 3.999046365423134e-05, + "loss": 0.2596, + "step": 382 + }, + { + "epoch": 0.03969323245932221, + "grad_norm": 0.6128451824188232, + "learning_rate": 3.999025523260007e-05, + "loss": 0.3332, + "step": 383 + }, + { + "epoch": 0.039796870141983624, + "grad_norm": 0.47030091285705566, + "learning_rate": 3.999004455849791e-05, + "loss": 0.2902, + "step": 384 + }, + { + "epoch": 0.03990050782464504, + "grad_norm": 0.4823108911514282, + "learning_rate": 3.998983163194861e-05, + "loss": 0.3106, + "step": 385 + }, + { + "epoch": 0.04000414550730646, + "grad_norm": 0.4495014548301697, + "learning_rate": 3.998961645297614e-05, + "loss": 0.2716, + "step": 386 + }, + { + "epoch": 0.040107783189967876, + "grad_norm": 0.4260924160480499, + "learning_rate": 3.998939902160478e-05, + "loss": 0.2548, + "step": 387 + }, + { + "epoch": 0.040211420872629286, + "grad_norm": 0.5300273299217224, + "learning_rate": 3.9989179337859e-05, + "loss": 0.297, + "step": 388 + }, + { + "epoch": 0.0403150585552907, + "grad_norm": 0.4261798858642578, + "learning_rate": 3.998895740176358e-05, + "loss": 0.2258, + "step": 389 + }, + { + "epoch": 0.04041869623795212, + "grad_norm": 0.5781340003013611, + "learning_rate": 3.9988733213343506e-05, + "loss": 0.3491, + "step": 390 + }, + { + "epoch": 0.04052233392061354, + "grad_norm": 0.4649885296821594, + "learning_rate": 3.998850677262404e-05, + "loss": 0.2692, + "step": 391 + }, + { + "epoch": 0.04062597160327495, + "grad_norm": 0.4471288025379181, + "learning_rate": 3.998827807963071e-05, + "loss": 0.2811, + "step": 392 + }, + { + "epoch": 0.040729609285936365, + "grad_norm": 0.49814170598983765, + "learning_rate": 3.998804713438928e-05, + "loss": 0.3116, + "step": 393 + }, + { + "epoch": 0.04083324696859778, + "grad_norm": 0.5495662689208984, + "learning_rate": 3.998781393692577e-05, + "loss": 0.3267, + "step": 394 + }, + { + "epoch": 0.0409368846512592, + "grad_norm": 0.5408654808998108, + "learning_rate": 3.998757848726645e-05, + "loss": 0.3068, + "step": 395 + }, + { + "epoch": 0.041040522333920616, + "grad_norm": 0.5414074063301086, + "learning_rate": 3.998734078543787e-05, + "loss": 0.3334, + "step": 396 + }, + { + "epoch": 0.041144160016582026, + "grad_norm": 0.5118625164031982, + "learning_rate": 3.9987100831466794e-05, + "loss": 0.2899, + "step": 397 + }, + { + "epoch": 0.041247797699243444, + "grad_norm": 0.43798866868019104, + "learning_rate": 3.998685862538026e-05, + "loss": 0.2438, + "step": 398 + }, + { + "epoch": 0.04135143538190486, + "grad_norm": 0.4928605556488037, + "learning_rate": 3.998661416720558e-05, + "loss": 0.2591, + "step": 399 + }, + { + "epoch": 0.04145507306456628, + "grad_norm": 0.46029067039489746, + "learning_rate": 3.9986367456970274e-05, + "loss": 0.234, + "step": 400 + }, + { + "epoch": 0.041558710747227695, + "grad_norm": 0.45207706093788147, + "learning_rate": 3.9986118494702155e-05, + "loss": 0.2274, + "step": 401 + }, + { + "epoch": 0.041662348429889105, + "grad_norm": 0.4618712067604065, + "learning_rate": 3.998586728042928e-05, + "loss": 0.2565, + "step": 402 + }, + { + "epoch": 0.04176598611255052, + "grad_norm": 0.5343668460845947, + "learning_rate": 3.998561381417994e-05, + "loss": 0.2647, + "step": 403 + }, + { + "epoch": 0.04186962379521194, + "grad_norm": 0.4870312809944153, + "learning_rate": 3.998535809598271e-05, + "loss": 0.329, + "step": 404 + }, + { + "epoch": 0.04197326147787336, + "grad_norm": 0.5918148756027222, + "learning_rate": 3.998510012586639e-05, + "loss": 0.3432, + "step": 405 + }, + { + "epoch": 0.04207689916053477, + "grad_norm": 0.5239502191543579, + "learning_rate": 3.9984839903860066e-05, + "loss": 0.2599, + "step": 406 + }, + { + "epoch": 0.042180536843196184, + "grad_norm": 0.6107182502746582, + "learning_rate": 3.9984577429993044e-05, + "loss": 0.298, + "step": 407 + }, + { + "epoch": 0.0422841745258576, + "grad_norm": 0.39335134625434875, + "learning_rate": 3.99843127042949e-05, + "loss": 0.2406, + "step": 408 + }, + { + "epoch": 0.04238781220851902, + "grad_norm": 0.4815625846385956, + "learning_rate": 3.9984045726795474e-05, + "loss": 0.2469, + "step": 409 + }, + { + "epoch": 0.042491449891180436, + "grad_norm": 0.4272301495075226, + "learning_rate": 3.9983776497524835e-05, + "loss": 0.2548, + "step": 410 + }, + { + "epoch": 0.042595087573841846, + "grad_norm": 0.5317639112472534, + "learning_rate": 3.998350501651333e-05, + "loss": 0.3006, + "step": 411 + }, + { + "epoch": 0.04269872525650326, + "grad_norm": 0.43263331055641174, + "learning_rate": 3.9983231283791537e-05, + "loss": 0.2391, + "step": 412 + }, + { + "epoch": 0.04280236293916468, + "grad_norm": 0.4960077404975891, + "learning_rate": 3.998295529939031e-05, + "loss": 0.2975, + "step": 413 + }, + { + "epoch": 0.0429060006218261, + "grad_norm": 0.5747363567352295, + "learning_rate": 3.998267706334075e-05, + "loss": 0.3214, + "step": 414 + }, + { + "epoch": 0.043009638304487514, + "grad_norm": 0.4495050013065338, + "learning_rate": 3.99823965756742e-05, + "loss": 0.2738, + "step": 415 + }, + { + "epoch": 0.043113275987148925, + "grad_norm": 0.46255549788475037, + "learning_rate": 3.998211383642226e-05, + "loss": 0.2564, + "step": 416 + }, + { + "epoch": 0.04321691366981034, + "grad_norm": 0.539284348487854, + "learning_rate": 3.9981828845616804e-05, + "loss": 0.2949, + "step": 417 + }, + { + "epoch": 0.04332055135247176, + "grad_norm": 0.4439292252063751, + "learning_rate": 3.9981541603289935e-05, + "loss": 0.2582, + "step": 418 + }, + { + "epoch": 0.043424189035133176, + "grad_norm": 0.5235161781311035, + "learning_rate": 3.998125210947402e-05, + "loss": 0.3272, + "step": 419 + }, + { + "epoch": 0.04352782671779459, + "grad_norm": 0.42719945311546326, + "learning_rate": 3.9980960364201676e-05, + "loss": 0.2415, + "step": 420 + }, + { + "epoch": 0.043631464400456, + "grad_norm": 0.4765814542770386, + "learning_rate": 3.998066636750578e-05, + "loss": 0.271, + "step": 421 + }, + { + "epoch": 0.04373510208311742, + "grad_norm": 0.47974205017089844, + "learning_rate": 3.998037011941946e-05, + "loss": 0.2669, + "step": 422 + }, + { + "epoch": 0.04383873976577884, + "grad_norm": 0.44285377860069275, + "learning_rate": 3.998007161997609e-05, + "loss": 0.2327, + "step": 423 + }, + { + "epoch": 0.043942377448440255, + "grad_norm": 0.4870794415473938, + "learning_rate": 3.997977086920932e-05, + "loss": 0.2458, + "step": 424 + }, + { + "epoch": 0.044046015131101665, + "grad_norm": 0.47079092264175415, + "learning_rate": 3.997946786715302e-05, + "loss": 0.2139, + "step": 425 + }, + { + "epoch": 0.04414965281376308, + "grad_norm": 0.516667366027832, + "learning_rate": 3.9979162613841336e-05, + "loss": 0.2378, + "step": 426 + }, + { + "epoch": 0.0442532904964245, + "grad_norm": 0.5418141484260559, + "learning_rate": 3.9978855109308675e-05, + "loss": 0.272, + "step": 427 + }, + { + "epoch": 0.04435692817908592, + "grad_norm": 0.5159924030303955, + "learning_rate": 3.9978545353589675e-05, + "loss": 0.2741, + "step": 428 + }, + { + "epoch": 0.044460565861747334, + "grad_norm": 0.48079657554626465, + "learning_rate": 3.9978233346719235e-05, + "loss": 0.2834, + "step": 429 + }, + { + "epoch": 0.044564203544408744, + "grad_norm": 0.4194307327270508, + "learning_rate": 3.997791908873253e-05, + "loss": 0.2139, + "step": 430 + }, + { + "epoch": 0.04466784122707016, + "grad_norm": 0.5443658828735352, + "learning_rate": 3.997760257966495e-05, + "loss": 0.292, + "step": 431 + }, + { + "epoch": 0.04477147890973158, + "grad_norm": 0.5482102036476135, + "learning_rate": 3.997728381955217e-05, + "loss": 0.2584, + "step": 432 + }, + { + "epoch": 0.044875116592392995, + "grad_norm": 0.49964722990989685, + "learning_rate": 3.99769628084301e-05, + "loss": 0.2593, + "step": 433 + }, + { + "epoch": 0.04497875427505441, + "grad_norm": 0.45780518651008606, + "learning_rate": 3.997663954633492e-05, + "loss": 0.2778, + "step": 434 + }, + { + "epoch": 0.04508239195771582, + "grad_norm": 0.5288244485855103, + "learning_rate": 3.9976314033303056e-05, + "loss": 0.2929, + "step": 435 + }, + { + "epoch": 0.04518602964037724, + "grad_norm": 0.48790210485458374, + "learning_rate": 3.9975986269371175e-05, + "loss": 0.2738, + "step": 436 + }, + { + "epoch": 0.04528966732303866, + "grad_norm": 0.4951280355453491, + "learning_rate": 3.997565625457621e-05, + "loss": 0.3266, + "step": 437 + }, + { + "epoch": 0.045393305005700074, + "grad_norm": 0.551414966583252, + "learning_rate": 3.997532398895536e-05, + "loss": 0.322, + "step": 438 + }, + { + "epoch": 0.04549694268836149, + "grad_norm": 0.4794701039791107, + "learning_rate": 3.997498947254605e-05, + "loss": 0.2303, + "step": 439 + }, + { + "epoch": 0.0456005803710229, + "grad_norm": 0.4955127239227295, + "learning_rate": 3.997465270538597e-05, + "loss": 0.3013, + "step": 440 + }, + { + "epoch": 0.04570421805368432, + "grad_norm": 0.6322245001792908, + "learning_rate": 3.9974313687513086e-05, + "loss": 0.3439, + "step": 441 + }, + { + "epoch": 0.045807855736345736, + "grad_norm": 0.4571327865123749, + "learning_rate": 3.9973972418965586e-05, + "loss": 0.2581, + "step": 442 + }, + { + "epoch": 0.04591149341900715, + "grad_norm": 0.5030487775802612, + "learning_rate": 3.997362889978192e-05, + "loss": 0.268, + "step": 443 + }, + { + "epoch": 0.04601513110166856, + "grad_norm": 0.42191100120544434, + "learning_rate": 3.99732831300008e-05, + "loss": 0.2507, + "step": 444 + }, + { + "epoch": 0.04611876878432998, + "grad_norm": 0.5020284056663513, + "learning_rate": 3.997293510966119e-05, + "loss": 0.3243, + "step": 445 + }, + { + "epoch": 0.0462224064669914, + "grad_norm": 0.5030457973480225, + "learning_rate": 3.997258483880229e-05, + "loss": 0.2361, + "step": 446 + }, + { + "epoch": 0.046326044149652815, + "grad_norm": 0.508976936340332, + "learning_rate": 3.997223231746358e-05, + "loss": 0.2452, + "step": 447 + }, + { + "epoch": 0.04642968183231423, + "grad_norm": 0.5234203934669495, + "learning_rate": 3.997187754568479e-05, + "loss": 0.2858, + "step": 448 + }, + { + "epoch": 0.04653331951497564, + "grad_norm": 0.5409331917762756, + "learning_rate": 3.997152052350588e-05, + "loss": 0.2737, + "step": 449 + }, + { + "epoch": 0.04663695719763706, + "grad_norm": 0.48061665892601013, + "learning_rate": 3.997116125096709e-05, + "loss": 0.2485, + "step": 450 + }, + { + "epoch": 0.046740594880298476, + "grad_norm": 0.45012375712394714, + "learning_rate": 3.997079972810888e-05, + "loss": 0.2657, + "step": 451 + }, + { + "epoch": 0.046844232562959894, + "grad_norm": 0.4897686839103699, + "learning_rate": 3.997043595497201e-05, + "loss": 0.3284, + "step": 452 + }, + { + "epoch": 0.04694787024562131, + "grad_norm": 0.43526583909988403, + "learning_rate": 3.9970069931597465e-05, + "loss": 0.2704, + "step": 453 + }, + { + "epoch": 0.04705150792828272, + "grad_norm": 0.4520479738712311, + "learning_rate": 3.9969701658026484e-05, + "loss": 0.2512, + "step": 454 + }, + { + "epoch": 0.04715514561094414, + "grad_norm": 0.507448136806488, + "learning_rate": 3.996933113430056e-05, + "loss": 0.3099, + "step": 455 + }, + { + "epoch": 0.047258783293605555, + "grad_norm": 0.5489389300346375, + "learning_rate": 3.996895836046145e-05, + "loss": 0.2653, + "step": 456 + }, + { + "epoch": 0.04736242097626697, + "grad_norm": 0.5679994821548462, + "learning_rate": 3.996858333655115e-05, + "loss": 0.2939, + "step": 457 + }, + { + "epoch": 0.04746605865892839, + "grad_norm": 0.4488953649997711, + "learning_rate": 3.996820606261192e-05, + "loss": 0.2719, + "step": 458 + }, + { + "epoch": 0.0475696963415898, + "grad_norm": 0.4154788851737976, + "learning_rate": 3.9967826538686274e-05, + "loss": 0.2521, + "step": 459 + }, + { + "epoch": 0.04767333402425122, + "grad_norm": 0.49899592995643616, + "learning_rate": 3.996744476481698e-05, + "loss": 0.2997, + "step": 460 + }, + { + "epoch": 0.047776971706912634, + "grad_norm": 0.5255045890808105, + "learning_rate": 3.9967060741047045e-05, + "loss": 0.2912, + "step": 461 + }, + { + "epoch": 0.04788060938957405, + "grad_norm": 0.3737160563468933, + "learning_rate": 3.996667446741975e-05, + "loss": 0.2099, + "step": 462 + }, + { + "epoch": 0.04798424707223546, + "grad_norm": 0.45894649624824524, + "learning_rate": 3.996628594397861e-05, + "loss": 0.2822, + "step": 463 + }, + { + "epoch": 0.04808788475489688, + "grad_norm": 0.5205749273300171, + "learning_rate": 3.996589517076741e-05, + "loss": 0.2846, + "step": 464 + }, + { + "epoch": 0.048191522437558296, + "grad_norm": 0.5407760739326477, + "learning_rate": 3.9965502147830174e-05, + "loss": 0.2829, + "step": 465 + }, + { + "epoch": 0.04829516012021971, + "grad_norm": 0.5501642823219299, + "learning_rate": 3.9965106875211204e-05, + "loss": 0.2911, + "step": 466 + }, + { + "epoch": 0.04839879780288113, + "grad_norm": 0.4986078441143036, + "learning_rate": 3.9964709352955016e-05, + "loss": 0.2863, + "step": 467 + }, + { + "epoch": 0.04850243548554254, + "grad_norm": 0.4460117518901825, + "learning_rate": 3.996430958110642e-05, + "loss": 0.26, + "step": 468 + }, + { + "epoch": 0.04860607316820396, + "grad_norm": 0.4906882047653198, + "learning_rate": 3.996390755971046e-05, + "loss": 0.2729, + "step": 469 + }, + { + "epoch": 0.048709710850865374, + "grad_norm": 0.4443777799606323, + "learning_rate": 3.9963503288812424e-05, + "loss": 0.2802, + "step": 470 + }, + { + "epoch": 0.04881334853352679, + "grad_norm": 0.40381139516830444, + "learning_rate": 3.996309676845787e-05, + "loss": 0.2588, + "step": 471 + }, + { + "epoch": 0.04891698621618821, + "grad_norm": 0.5093889832496643, + "learning_rate": 3.9962687998692605e-05, + "loss": 0.2781, + "step": 472 + }, + { + "epoch": 0.04902062389884962, + "grad_norm": 0.489504873752594, + "learning_rate": 3.996227697956269e-05, + "loss": 0.2818, + "step": 473 + }, + { + "epoch": 0.049124261581511036, + "grad_norm": 0.43166133761405945, + "learning_rate": 3.996186371111444e-05, + "loss": 0.2617, + "step": 474 + }, + { + "epoch": 0.04922789926417245, + "grad_norm": 0.5207101106643677, + "learning_rate": 3.996144819339442e-05, + "loss": 0.2939, + "step": 475 + }, + { + "epoch": 0.04933153694683387, + "grad_norm": 0.4953382611274719, + "learning_rate": 3.9961030426449445e-05, + "loss": 0.2864, + "step": 476 + }, + { + "epoch": 0.04943517462949529, + "grad_norm": 0.4431532919406891, + "learning_rate": 3.996061041032659e-05, + "loss": 0.2596, + "step": 477 + }, + { + "epoch": 0.0495388123121567, + "grad_norm": 0.4658108949661255, + "learning_rate": 3.996018814507319e-05, + "loss": 0.2675, + "step": 478 + }, + { + "epoch": 0.049642449994818115, + "grad_norm": 0.5410202741622925, + "learning_rate": 3.995976363073681e-05, + "loss": 0.2785, + "step": 479 + }, + { + "epoch": 0.04974608767747953, + "grad_norm": 0.5721487402915955, + "learning_rate": 3.995933686736529e-05, + "loss": 0.2572, + "step": 480 + }, + { + "epoch": 0.04984972536014095, + "grad_norm": 0.48720431327819824, + "learning_rate": 3.995890785500673e-05, + "loss": 0.2581, + "step": 481 + }, + { + "epoch": 0.04995336304280236, + "grad_norm": 0.49117180705070496, + "learning_rate": 3.995847659370945e-05, + "loss": 0.3231, + "step": 482 + }, + { + "epoch": 0.05005700072546378, + "grad_norm": 0.4524441659450531, + "learning_rate": 3.995804308352206e-05, + "loss": 0.2857, + "step": 483 + }, + { + "epoch": 0.050160638408125194, + "grad_norm": 0.4876324534416199, + "learning_rate": 3.995760732449341e-05, + "loss": 0.2839, + "step": 484 + }, + { + "epoch": 0.05026427609078661, + "grad_norm": 0.463663786649704, + "learning_rate": 3.995716931667257e-05, + "loss": 0.2638, + "step": 485 + }, + { + "epoch": 0.05036791377344803, + "grad_norm": 0.45413050055503845, + "learning_rate": 3.995672906010893e-05, + "loss": 0.2708, + "step": 486 + }, + { + "epoch": 0.05047155145610944, + "grad_norm": 0.407673716545105, + "learning_rate": 3.995628655485208e-05, + "loss": 0.2334, + "step": 487 + }, + { + "epoch": 0.050575189138770855, + "grad_norm": 0.5084668397903442, + "learning_rate": 3.995584180095188e-05, + "loss": 0.2808, + "step": 488 + }, + { + "epoch": 0.05067882682143227, + "grad_norm": 0.49144288897514343, + "learning_rate": 3.995539479845845e-05, + "loss": 0.2631, + "step": 489 + }, + { + "epoch": 0.05078246450409369, + "grad_norm": 0.40713638067245483, + "learning_rate": 3.995494554742215e-05, + "loss": 0.2282, + "step": 490 + }, + { + "epoch": 0.05088610218675511, + "grad_norm": 0.5845853090286255, + "learning_rate": 3.995449404789361e-05, + "loss": 0.2976, + "step": 491 + }, + { + "epoch": 0.05098973986941652, + "grad_norm": 0.5178698301315308, + "learning_rate": 3.995404029992371e-05, + "loss": 0.2887, + "step": 492 + }, + { + "epoch": 0.051093377552077934, + "grad_norm": 0.5840103626251221, + "learning_rate": 3.9953584303563557e-05, + "loss": 0.2829, + "step": 493 + }, + { + "epoch": 0.05119701523473935, + "grad_norm": 0.533604085445404, + "learning_rate": 3.995312605886454e-05, + "loss": 0.2684, + "step": 494 + }, + { + "epoch": 0.05130065291740077, + "grad_norm": 0.43775051832199097, + "learning_rate": 3.995266556587831e-05, + "loss": 0.2498, + "step": 495 + }, + { + "epoch": 0.051404290600062186, + "grad_norm": 0.5094471573829651, + "learning_rate": 3.9952202824656734e-05, + "loss": 0.3014, + "step": 496 + }, + { + "epoch": 0.051507928282723596, + "grad_norm": 0.48946914076805115, + "learning_rate": 3.995173783525196e-05, + "loss": 0.2674, + "step": 497 + }, + { + "epoch": 0.05161156596538501, + "grad_norm": 0.5266274213790894, + "learning_rate": 3.995127059771638e-05, + "loss": 0.263, + "step": 498 + }, + { + "epoch": 0.05171520364804643, + "grad_norm": 0.5169990062713623, + "learning_rate": 3.995080111210265e-05, + "loss": 0.2346, + "step": 499 + }, + { + "epoch": 0.05181884133070785, + "grad_norm": 0.48766180872917175, + "learning_rate": 3.995032937846366e-05, + "loss": 0.2589, + "step": 500 + }, + { + "epoch": 0.05192247901336926, + "grad_norm": 0.5541372895240784, + "learning_rate": 3.9949855396852566e-05, + "loss": 0.2897, + "step": 501 + }, + { + "epoch": 0.052026116696030675, + "grad_norm": 0.6318419575691223, + "learning_rate": 3.994937916732278e-05, + "loss": 0.3263, + "step": 502 + }, + { + "epoch": 0.05212975437869209, + "grad_norm": 0.472393661737442, + "learning_rate": 3.994890068992797e-05, + "loss": 0.2942, + "step": 503 + }, + { + "epoch": 0.05223339206135351, + "grad_norm": 0.49356287717819214, + "learning_rate": 3.994841996472203e-05, + "loss": 0.3301, + "step": 504 + }, + { + "epoch": 0.052337029744014926, + "grad_norm": 0.44888001680374146, + "learning_rate": 3.994793699175915e-05, + "loss": 0.1962, + "step": 505 + }, + { + "epoch": 0.052440667426676336, + "grad_norm": 0.46697762608528137, + "learning_rate": 3.9947451771093736e-05, + "loss": 0.2719, + "step": 506 + }, + { + "epoch": 0.052544305109337754, + "grad_norm": 0.4748048782348633, + "learning_rate": 3.9946964302780455e-05, + "loss": 0.3056, + "step": 507 + }, + { + "epoch": 0.05264794279199917, + "grad_norm": 0.4572794735431671, + "learning_rate": 3.9946474586874255e-05, + "loss": 0.2318, + "step": 508 + }, + { + "epoch": 0.05275158047466059, + "grad_norm": 0.4336269199848175, + "learning_rate": 3.99459826234303e-05, + "loss": 0.2087, + "step": 509 + }, + { + "epoch": 0.052855218157322005, + "grad_norm": 0.5002438426017761, + "learning_rate": 3.994548841250404e-05, + "loss": 0.2842, + "step": 510 + }, + { + "epoch": 0.052958855839983415, + "grad_norm": 0.4813220500946045, + "learning_rate": 3.994499195415114e-05, + "loss": 0.285, + "step": 511 + }, + { + "epoch": 0.05306249352264483, + "grad_norm": 0.46560052037239075, + "learning_rate": 3.994449324842756e-05, + "loss": 0.2432, + "step": 512 + }, + { + "epoch": 0.05316613120530625, + "grad_norm": 0.43234068155288696, + "learning_rate": 3.994399229538948e-05, + "loss": 0.2148, + "step": 513 + }, + { + "epoch": 0.05326976888796767, + "grad_norm": 0.4758170247077942, + "learning_rate": 3.994348909509335e-05, + "loss": 0.2559, + "step": 514 + }, + { + "epoch": 0.053373406570629084, + "grad_norm": 0.48988309502601624, + "learning_rate": 3.9942983647595876e-05, + "loss": 0.2804, + "step": 515 + }, + { + "epoch": 0.053477044253290494, + "grad_norm": 0.47322186827659607, + "learning_rate": 3.994247595295401e-05, + "loss": 0.2072, + "step": 516 + }, + { + "epoch": 0.05358068193595191, + "grad_norm": 0.431432843208313, + "learning_rate": 3.994196601122495e-05, + "loss": 0.2448, + "step": 517 + }, + { + "epoch": 0.05368431961861333, + "grad_norm": 0.49172186851501465, + "learning_rate": 3.9941453822466154e-05, + "loss": 0.2353, + "step": 518 + }, + { + "epoch": 0.053787957301274746, + "grad_norm": 0.5228767395019531, + "learning_rate": 3.994093938673535e-05, + "loss": 0.2858, + "step": 519 + }, + { + "epoch": 0.053891594983936156, + "grad_norm": 0.5165749788284302, + "learning_rate": 3.994042270409049e-05, + "loss": 0.2751, + "step": 520 + }, + { + "epoch": 0.05399523266659757, + "grad_norm": 0.5928406715393066, + "learning_rate": 3.99399037745898e-05, + "loss": 0.2959, + "step": 521 + }, + { + "epoch": 0.05409887034925899, + "grad_norm": 0.4643728733062744, + "learning_rate": 3.9939382598291744e-05, + "loss": 0.2623, + "step": 522 + }, + { + "epoch": 0.05420250803192041, + "grad_norm": 0.45560747385025024, + "learning_rate": 3.993885917525506e-05, + "loss": 0.2463, + "step": 523 + }, + { + "epoch": 0.054306145714581824, + "grad_norm": 0.4651755392551422, + "learning_rate": 3.993833350553872e-05, + "loss": 0.2634, + "step": 524 + }, + { + "epoch": 0.054409783397243235, + "grad_norm": 0.5355138778686523, + "learning_rate": 3.9937805589201955e-05, + "loss": 0.2612, + "step": 525 + }, + { + "epoch": 0.05451342107990465, + "grad_norm": 0.45348060131073, + "learning_rate": 3.993727542630425e-05, + "loss": 0.2655, + "step": 526 + }, + { + "epoch": 0.05461705876256607, + "grad_norm": 0.46827277541160583, + "learning_rate": 3.993674301690534e-05, + "loss": 0.2503, + "step": 527 + }, + { + "epoch": 0.054720696445227486, + "grad_norm": 0.584631085395813, + "learning_rate": 3.993620836106522e-05, + "loss": 0.2676, + "step": 528 + }, + { + "epoch": 0.0548243341278889, + "grad_norm": 0.5135299563407898, + "learning_rate": 3.9935671458844136e-05, + "loss": 0.2998, + "step": 529 + }, + { + "epoch": 0.05492797181055031, + "grad_norm": 0.5041263103485107, + "learning_rate": 3.9935132310302576e-05, + "loss": 0.2631, + "step": 530 + }, + { + "epoch": 0.05503160949321173, + "grad_norm": 0.4767071604728699, + "learning_rate": 3.99345909155013e-05, + "loss": 0.2727, + "step": 531 + }, + { + "epoch": 0.05513524717587315, + "grad_norm": 0.4792020916938782, + "learning_rate": 3.993404727450132e-05, + "loss": 0.2791, + "step": 532 + }, + { + "epoch": 0.055238884858534565, + "grad_norm": 0.4517286419868469, + "learning_rate": 3.993350138736387e-05, + "loss": 0.2462, + "step": 533 + }, + { + "epoch": 0.05534252254119598, + "grad_norm": 0.5668163299560547, + "learning_rate": 3.993295325415047e-05, + "loss": 0.2838, + "step": 534 + }, + { + "epoch": 0.05544616022385739, + "grad_norm": 0.5690762996673584, + "learning_rate": 3.993240287492288e-05, + "loss": 0.2764, + "step": 535 + }, + { + "epoch": 0.05554979790651881, + "grad_norm": 0.5362303256988525, + "learning_rate": 3.993185024974313e-05, + "loss": 0.2808, + "step": 536 + }, + { + "epoch": 0.05565343558918023, + "grad_norm": 0.5116853713989258, + "learning_rate": 3.993129537867347e-05, + "loss": 0.2977, + "step": 537 + }, + { + "epoch": 0.055757073271841644, + "grad_norm": 0.4673402011394501, + "learning_rate": 3.993073826177644e-05, + "loss": 0.2497, + "step": 538 + }, + { + "epoch": 0.055860710954503054, + "grad_norm": 0.5054620504379272, + "learning_rate": 3.99301788991148e-05, + "loss": 0.2774, + "step": 539 + }, + { + "epoch": 0.05596434863716447, + "grad_norm": 0.5134178400039673, + "learning_rate": 3.992961729075158e-05, + "loss": 0.2455, + "step": 540 + }, + { + "epoch": 0.05606798631982589, + "grad_norm": 0.4898240268230438, + "learning_rate": 3.992905343675007e-05, + "loss": 0.2442, + "step": 541 + }, + { + "epoch": 0.056171624002487305, + "grad_norm": 0.4968765377998352, + "learning_rate": 3.99284873371738e-05, + "loss": 0.2789, + "step": 542 + }, + { + "epoch": 0.05627526168514872, + "grad_norm": 0.6002929210662842, + "learning_rate": 3.992791899208656e-05, + "loss": 0.3064, + "step": 543 + }, + { + "epoch": 0.05637889936781013, + "grad_norm": 0.46553024649620056, + "learning_rate": 3.992734840155238e-05, + "loss": 0.2779, + "step": 544 + }, + { + "epoch": 0.05648253705047155, + "grad_norm": 0.44942498207092285, + "learning_rate": 3.9926775565635555e-05, + "loss": 0.2545, + "step": 545 + }, + { + "epoch": 0.05658617473313297, + "grad_norm": 0.529929518699646, + "learning_rate": 3.9926200484400644e-05, + "loss": 0.2918, + "step": 546 + }, + { + "epoch": 0.056689812415794384, + "grad_norm": 0.5951007008552551, + "learning_rate": 3.992562315791244e-05, + "loss": 0.295, + "step": 547 + }, + { + "epoch": 0.0567934500984558, + "grad_norm": 0.5104761719703674, + "learning_rate": 3.992504358623598e-05, + "loss": 0.2625, + "step": 548 + }, + { + "epoch": 0.05689708778111721, + "grad_norm": 0.5036949515342712, + "learning_rate": 3.992446176943659e-05, + "loss": 0.3084, + "step": 549 + }, + { + "epoch": 0.05700072546377863, + "grad_norm": 0.5363962054252625, + "learning_rate": 3.992387770757983e-05, + "loss": 0.2859, + "step": 550 + }, + { + "epoch": 0.057104363146440046, + "grad_norm": 0.5645461082458496, + "learning_rate": 3.99232914007315e-05, + "loss": 0.2702, + "step": 551 + }, + { + "epoch": 0.05720800082910146, + "grad_norm": 0.4483615756034851, + "learning_rate": 3.992270284895765e-05, + "loss": 0.2359, + "step": 552 + }, + { + "epoch": 0.05731163851176288, + "grad_norm": 0.5485745668411255, + "learning_rate": 3.992211205232463e-05, + "loss": 0.3096, + "step": 553 + }, + { + "epoch": 0.05741527619442429, + "grad_norm": 0.4500156342983246, + "learning_rate": 3.992151901089899e-05, + "loss": 0.2573, + "step": 554 + }, + { + "epoch": 0.05751891387708571, + "grad_norm": 0.49787795543670654, + "learning_rate": 3.9920923724747555e-05, + "loss": 0.2604, + "step": 555 + }, + { + "epoch": 0.057622551559747125, + "grad_norm": 0.4587489366531372, + "learning_rate": 3.9920326193937405e-05, + "loss": 0.2344, + "step": 556 + }, + { + "epoch": 0.05772618924240854, + "grad_norm": 0.5138572454452515, + "learning_rate": 3.991972641853586e-05, + "loss": 0.3073, + "step": 557 + }, + { + "epoch": 0.05782982692506995, + "grad_norm": 0.5564447045326233, + "learning_rate": 3.9919124398610514e-05, + "loss": 0.3053, + "step": 558 + }, + { + "epoch": 0.05793346460773137, + "grad_norm": 0.48643192648887634, + "learning_rate": 3.991852013422919e-05, + "loss": 0.2658, + "step": 559 + }, + { + "epoch": 0.058037102290392786, + "grad_norm": 0.5111423134803772, + "learning_rate": 3.9917913625459986e-05, + "loss": 0.3184, + "step": 560 + }, + { + "epoch": 0.058140739973054203, + "grad_norm": 0.534591019153595, + "learning_rate": 3.9917304872371236e-05, + "loss": 0.255, + "step": 561 + }, + { + "epoch": 0.05824437765571562, + "grad_norm": 0.45287781953811646, + "learning_rate": 3.991669387503153e-05, + "loss": 0.2544, + "step": 562 + }, + { + "epoch": 0.05834801533837703, + "grad_norm": 0.49371427297592163, + "learning_rate": 3.991608063350973e-05, + "loss": 0.2866, + "step": 563 + }, + { + "epoch": 0.05845165302103845, + "grad_norm": 0.43072739243507385, + "learning_rate": 3.9915465147874916e-05, + "loss": 0.2397, + "step": 564 + }, + { + "epoch": 0.058555290703699865, + "grad_norm": 0.5488777756690979, + "learning_rate": 3.991484741819645e-05, + "loss": 0.2936, + "step": 565 + }, + { + "epoch": 0.05865892838636128, + "grad_norm": 0.5285367965698242, + "learning_rate": 3.9914227444543936e-05, + "loss": 0.3119, + "step": 566 + }, + { + "epoch": 0.0587625660690227, + "grad_norm": 0.48357492685317993, + "learning_rate": 3.991360522698723e-05, + "loss": 0.2333, + "step": 567 + }, + { + "epoch": 0.05886620375168411, + "grad_norm": 0.4516059160232544, + "learning_rate": 3.991298076559645e-05, + "loss": 0.257, + "step": 568 + }, + { + "epoch": 0.05896984143434553, + "grad_norm": 0.4679553508758545, + "learning_rate": 3.991235406044195e-05, + "loss": 0.2778, + "step": 569 + }, + { + "epoch": 0.059073479117006944, + "grad_norm": 0.5020044445991516, + "learning_rate": 3.991172511159434e-05, + "loss": 0.2706, + "step": 570 + }, + { + "epoch": 0.05917711679966836, + "grad_norm": 0.4471382796764374, + "learning_rate": 3.99110939191245e-05, + "loss": 0.2608, + "step": 571 + }, + { + "epoch": 0.05928075448232978, + "grad_norm": 0.48660770058631897, + "learning_rate": 3.991046048310356e-05, + "loss": 0.2749, + "step": 572 + }, + { + "epoch": 0.05938439216499119, + "grad_norm": 0.5402244329452515, + "learning_rate": 3.990982480360288e-05, + "loss": 0.3266, + "step": 573 + }, + { + "epoch": 0.059488029847652606, + "grad_norm": 0.49644187092781067, + "learning_rate": 3.9909186880694086e-05, + "loss": 0.2948, + "step": 574 + }, + { + "epoch": 0.05959166753031402, + "grad_norm": 0.47150805592536926, + "learning_rate": 3.990854671444906e-05, + "loss": 0.3078, + "step": 575 + }, + { + "epoch": 0.05969530521297544, + "grad_norm": 0.39560598134994507, + "learning_rate": 3.990790430493995e-05, + "loss": 0.1867, + "step": 576 + }, + { + "epoch": 0.05979894289563685, + "grad_norm": 0.521929919719696, + "learning_rate": 3.9907259652239125e-05, + "loss": 0.2943, + "step": 577 + }, + { + "epoch": 0.05990258057829827, + "grad_norm": 0.45911693572998047, + "learning_rate": 3.9906612756419234e-05, + "loss": 0.2251, + "step": 578 + }, + { + "epoch": 0.060006218260959684, + "grad_norm": 0.5514270067214966, + "learning_rate": 3.9905963617553154e-05, + "loss": 0.3134, + "step": 579 + }, + { + "epoch": 0.0601098559436211, + "grad_norm": 0.4795793294906616, + "learning_rate": 3.990531223571404e-05, + "loss": 0.2496, + "step": 580 + }, + { + "epoch": 0.06021349362628252, + "grad_norm": 0.48294639587402344, + "learning_rate": 3.990465861097529e-05, + "loss": 0.2785, + "step": 581 + }, + { + "epoch": 0.06031713130894393, + "grad_norm": 0.4807794690132141, + "learning_rate": 3.990400274341055e-05, + "loss": 0.29, + "step": 582 + }, + { + "epoch": 0.060420768991605346, + "grad_norm": 0.5464568138122559, + "learning_rate": 3.9903344633093724e-05, + "loss": 0.2699, + "step": 583 + }, + { + "epoch": 0.06052440667426676, + "grad_norm": 0.48656803369522095, + "learning_rate": 3.9902684280098965e-05, + "loss": 0.3006, + "step": 584 + }, + { + "epoch": 0.06062804435692818, + "grad_norm": 0.4978223443031311, + "learning_rate": 3.9902021684500677e-05, + "loss": 0.25, + "step": 585 + }, + { + "epoch": 0.0607316820395896, + "grad_norm": 0.4835212826728821, + "learning_rate": 3.990135684637352e-05, + "loss": 0.2632, + "step": 586 + }, + { + "epoch": 0.06083531972225101, + "grad_norm": 0.47485238313674927, + "learning_rate": 3.990068976579242e-05, + "loss": 0.3159, + "step": 587 + }, + { + "epoch": 0.060938957404912425, + "grad_norm": 0.4746256470680237, + "learning_rate": 3.990002044283253e-05, + "loss": 0.2545, + "step": 588 + }, + { + "epoch": 0.06104259508757384, + "grad_norm": 0.4291842579841614, + "learning_rate": 3.989934887756927e-05, + "loss": 0.2674, + "step": 589 + }, + { + "epoch": 0.06114623277023526, + "grad_norm": 0.47232916951179504, + "learning_rate": 3.989867507007831e-05, + "loss": 0.2618, + "step": 590 + }, + { + "epoch": 0.061249870452896676, + "grad_norm": 0.49870309233665466, + "learning_rate": 3.989799902043558e-05, + "loss": 0.2889, + "step": 591 + }, + { + "epoch": 0.06135350813555809, + "grad_norm": 0.5332955718040466, + "learning_rate": 3.9897320728717254e-05, + "loss": 0.3145, + "step": 592 + }, + { + "epoch": 0.061457145818219504, + "grad_norm": 0.45895615220069885, + "learning_rate": 3.9896640194999754e-05, + "loss": 0.2917, + "step": 593 + }, + { + "epoch": 0.06156078350088092, + "grad_norm": 0.5079963803291321, + "learning_rate": 3.989595741935977e-05, + "loss": 0.2516, + "step": 594 + }, + { + "epoch": 0.06166442118354234, + "grad_norm": 0.42996639013290405, + "learning_rate": 3.989527240187424e-05, + "loss": 0.2601, + "step": 595 + }, + { + "epoch": 0.06176805886620375, + "grad_norm": 0.48144781589508057, + "learning_rate": 3.989458514262034e-05, + "loss": 0.2976, + "step": 596 + }, + { + "epoch": 0.061871696548865165, + "grad_norm": 0.5348926782608032, + "learning_rate": 3.98938956416755e-05, + "loss": 0.3098, + "step": 597 + }, + { + "epoch": 0.06197533423152658, + "grad_norm": 0.46923643350601196, + "learning_rate": 3.9893203899117445e-05, + "loss": 0.2442, + "step": 598 + }, + { + "epoch": 0.062078971914188, + "grad_norm": 0.5009055733680725, + "learning_rate": 3.989250991502408e-05, + "loss": 0.2768, + "step": 599 + }, + { + "epoch": 0.06218260959684942, + "grad_norm": 0.47320249676704407, + "learning_rate": 3.989181368947363e-05, + "loss": 0.224, + "step": 600 + }, + { + "epoch": 0.06228624727951083, + "grad_norm": 0.49606916308403015, + "learning_rate": 3.989111522254453e-05, + "loss": 0.2572, + "step": 601 + }, + { + "epoch": 0.062389884962172244, + "grad_norm": 0.4603443443775177, + "learning_rate": 3.9890414514315504e-05, + "loss": 0.2611, + "step": 602 + }, + { + "epoch": 0.06249352264483366, + "grad_norm": 0.4671558141708374, + "learning_rate": 3.988971156486548e-05, + "loss": 0.2703, + "step": 603 + }, + { + "epoch": 0.06259716032749507, + "grad_norm": 0.4635378420352936, + "learning_rate": 3.988900637427367e-05, + "loss": 0.2564, + "step": 604 + }, + { + "epoch": 0.06270079801015649, + "grad_norm": 0.6459101438522339, + "learning_rate": 3.9888298942619555e-05, + "loss": 0.3151, + "step": 605 + }, + { + "epoch": 0.0628044356928179, + "grad_norm": 0.48071354627609253, + "learning_rate": 3.988758926998282e-05, + "loss": 0.2886, + "step": 606 + }, + { + "epoch": 0.06290807337547932, + "grad_norm": 0.4736165404319763, + "learning_rate": 3.988687735644345e-05, + "loss": 0.237, + "step": 607 + }, + { + "epoch": 0.06301171105814074, + "grad_norm": 0.45571112632751465, + "learning_rate": 3.988616320208165e-05, + "loss": 0.2198, + "step": 608 + }, + { + "epoch": 0.06311534874080216, + "grad_norm": 0.48835575580596924, + "learning_rate": 3.988544680697789e-05, + "loss": 0.3071, + "step": 609 + }, + { + "epoch": 0.06321898642346357, + "grad_norm": 0.49265989661216736, + "learning_rate": 3.98847281712129e-05, + "loss": 0.251, + "step": 610 + }, + { + "epoch": 0.06332262410612499, + "grad_norm": 0.5096911787986755, + "learning_rate": 3.988400729486765e-05, + "loss": 0.2774, + "step": 611 + }, + { + "epoch": 0.06342626178878641, + "grad_norm": 0.3974534273147583, + "learning_rate": 3.988328417802337e-05, + "loss": 0.2419, + "step": 612 + }, + { + "epoch": 0.06352989947144781, + "grad_norm": 0.46511921286582947, + "learning_rate": 3.988255882076154e-05, + "loss": 0.2559, + "step": 613 + }, + { + "epoch": 0.06363353715410923, + "grad_norm": 0.46322038769721985, + "learning_rate": 3.988183122316389e-05, + "loss": 0.2517, + "step": 614 + }, + { + "epoch": 0.06373717483677065, + "grad_norm": 0.4440378248691559, + "learning_rate": 3.98811013853124e-05, + "loss": 0.2646, + "step": 615 + }, + { + "epoch": 0.06384081251943206, + "grad_norm": 0.5909380912780762, + "learning_rate": 3.988036930728931e-05, + "loss": 0.3207, + "step": 616 + }, + { + "epoch": 0.06394445020209348, + "grad_norm": 0.45240291953086853, + "learning_rate": 3.9879634989177114e-05, + "loss": 0.2645, + "step": 617 + }, + { + "epoch": 0.0640480878847549, + "grad_norm": 0.47876596450805664, + "learning_rate": 3.987889843105856e-05, + "loss": 0.2787, + "step": 618 + }, + { + "epoch": 0.06415172556741632, + "grad_norm": 0.4310585558414459, + "learning_rate": 3.9878159633016624e-05, + "loss": 0.2348, + "step": 619 + }, + { + "epoch": 0.06425536325007773, + "grad_norm": 0.44966933131217957, + "learning_rate": 3.987741859513456e-05, + "loss": 0.2453, + "step": 620 + }, + { + "epoch": 0.06435900093273915, + "grad_norm": 0.45143213868141174, + "learning_rate": 3.987667531749587e-05, + "loss": 0.2788, + "step": 621 + }, + { + "epoch": 0.06446263861540057, + "grad_norm": 0.47991883754730225, + "learning_rate": 3.987592980018431e-05, + "loss": 0.2786, + "step": 622 + }, + { + "epoch": 0.06456627629806197, + "grad_norm": 0.4811300039291382, + "learning_rate": 3.987518204328387e-05, + "loss": 0.2806, + "step": 623 + }, + { + "epoch": 0.06466991398072339, + "grad_norm": 0.6241214275360107, + "learning_rate": 3.987443204687882e-05, + "loss": 0.3336, + "step": 624 + }, + { + "epoch": 0.0647735516633848, + "grad_norm": 0.6035299301147461, + "learning_rate": 3.987367981105366e-05, + "loss": 0.3272, + "step": 625 + }, + { + "epoch": 0.06487718934604622, + "grad_norm": 0.4932394325733185, + "learning_rate": 3.987292533589315e-05, + "loss": 0.2865, + "step": 626 + }, + { + "epoch": 0.06498082702870764, + "grad_norm": 0.49736925959587097, + "learning_rate": 3.9872168621482304e-05, + "loss": 0.2722, + "step": 627 + }, + { + "epoch": 0.06508446471136906, + "grad_norm": 0.4849455654621124, + "learning_rate": 3.98714096679064e-05, + "loss": 0.2399, + "step": 628 + }, + { + "epoch": 0.06518810239403047, + "grad_norm": 0.4412994980812073, + "learning_rate": 3.9870648475250944e-05, + "loss": 0.2704, + "step": 629 + }, + { + "epoch": 0.06529174007669189, + "grad_norm": 0.4405463933944702, + "learning_rate": 3.98698850436017e-05, + "loss": 0.2294, + "step": 630 + }, + { + "epoch": 0.06539537775935331, + "grad_norm": 0.40049633383750916, + "learning_rate": 3.98691193730447e-05, + "loss": 0.2128, + "step": 631 + }, + { + "epoch": 0.06549901544201471, + "grad_norm": 0.45948949456214905, + "learning_rate": 3.9868351463666213e-05, + "loss": 0.2207, + "step": 632 + }, + { + "epoch": 0.06560265312467613, + "grad_norm": 0.4937278628349304, + "learning_rate": 3.986758131555278e-05, + "loss": 0.275, + "step": 633 + }, + { + "epoch": 0.06570629080733754, + "grad_norm": 0.477566123008728, + "learning_rate": 3.9866808928791154e-05, + "loss": 0.2946, + "step": 634 + }, + { + "epoch": 0.06580992848999896, + "grad_norm": 0.514164388179779, + "learning_rate": 3.986603430346839e-05, + "loss": 0.2787, + "step": 635 + }, + { + "epoch": 0.06591356617266038, + "grad_norm": 0.4273333251476288, + "learning_rate": 3.9865257439671765e-05, + "loss": 0.22, + "step": 636 + }, + { + "epoch": 0.0660172038553218, + "grad_norm": 0.5105968713760376, + "learning_rate": 3.9864478337488817e-05, + "loss": 0.2503, + "step": 637 + }, + { + "epoch": 0.06612084153798321, + "grad_norm": 0.4005119204521179, + "learning_rate": 3.986369699700732e-05, + "loss": 0.2327, + "step": 638 + }, + { + "epoch": 0.06622447922064463, + "grad_norm": 0.5089367032051086, + "learning_rate": 3.986291341831533e-05, + "loss": 0.2827, + "step": 639 + }, + { + "epoch": 0.06632811690330605, + "grad_norm": 0.4681312143802643, + "learning_rate": 3.986212760150113e-05, + "loss": 0.2527, + "step": 640 + }, + { + "epoch": 0.06643175458596746, + "grad_norm": 0.5034751892089844, + "learning_rate": 3.986133954665327e-05, + "loss": 0.3003, + "step": 641 + }, + { + "epoch": 0.06653539226862887, + "grad_norm": 0.4600752294063568, + "learning_rate": 3.986054925386055e-05, + "loss": 0.2519, + "step": 642 + }, + { + "epoch": 0.06663902995129029, + "grad_norm": 0.48441654443740845, + "learning_rate": 3.9859756723212e-05, + "loss": 0.3041, + "step": 643 + }, + { + "epoch": 0.0667426676339517, + "grad_norm": 0.4735645651817322, + "learning_rate": 3.985896195479694e-05, + "loss": 0.2661, + "step": 644 + }, + { + "epoch": 0.06684630531661312, + "grad_norm": 0.3847654461860657, + "learning_rate": 3.985816494870492e-05, + "loss": 0.2302, + "step": 645 + }, + { + "epoch": 0.06694994299927454, + "grad_norm": 0.45513978600502014, + "learning_rate": 3.985736570502575e-05, + "loss": 0.2386, + "step": 646 + }, + { + "epoch": 0.06705358068193595, + "grad_norm": 0.4446656405925751, + "learning_rate": 3.985656422384947e-05, + "loss": 0.232, + "step": 647 + }, + { + "epoch": 0.06715721836459737, + "grad_norm": 0.473247766494751, + "learning_rate": 3.985576050526641e-05, + "loss": 0.2098, + "step": 648 + }, + { + "epoch": 0.06726085604725879, + "grad_norm": 0.5222258567810059, + "learning_rate": 3.985495454936712e-05, + "loss": 0.2534, + "step": 649 + }, + { + "epoch": 0.0673644937299202, + "grad_norm": 0.44592931866645813, + "learning_rate": 3.985414635624242e-05, + "loss": 0.2113, + "step": 650 + }, + { + "epoch": 0.06746813141258161, + "grad_norm": 0.5243046283721924, + "learning_rate": 3.9853335925983366e-05, + "loss": 0.2934, + "step": 651 + }, + { + "epoch": 0.06757176909524303, + "grad_norm": 0.45387986302375793, + "learning_rate": 3.985252325868129e-05, + "loss": 0.2599, + "step": 652 + }, + { + "epoch": 0.06767540677790444, + "grad_norm": 0.5690520405769348, + "learning_rate": 3.985170835442775e-05, + "loss": 0.2906, + "step": 653 + }, + { + "epoch": 0.06777904446056586, + "grad_norm": 0.605476975440979, + "learning_rate": 3.985089121331457e-05, + "loss": 0.302, + "step": 654 + }, + { + "epoch": 0.06788268214322728, + "grad_norm": 0.5086917877197266, + "learning_rate": 3.985007183543383e-05, + "loss": 0.3054, + "step": 655 + }, + { + "epoch": 0.0679863198258887, + "grad_norm": 0.4781462252140045, + "learning_rate": 3.9849250220877856e-05, + "loss": 0.2927, + "step": 656 + }, + { + "epoch": 0.06808995750855011, + "grad_norm": 0.44163042306900024, + "learning_rate": 3.984842636973921e-05, + "loss": 0.2179, + "step": 657 + }, + { + "epoch": 0.06819359519121153, + "grad_norm": 0.49941644072532654, + "learning_rate": 3.9847600282110755e-05, + "loss": 0.2359, + "step": 658 + }, + { + "epoch": 0.06829723287387295, + "grad_norm": 0.5164234042167664, + "learning_rate": 3.984677195808554e-05, + "loss": 0.2928, + "step": 659 + }, + { + "epoch": 0.06840087055653436, + "grad_norm": 0.5161421298980713, + "learning_rate": 3.9845941397756924e-05, + "loss": 0.3114, + "step": 660 + }, + { + "epoch": 0.06850450823919577, + "grad_norm": 0.48917636275291443, + "learning_rate": 3.9845108601218474e-05, + "loss": 0.2335, + "step": 661 + }, + { + "epoch": 0.06860814592185718, + "grad_norm": 0.44179776310920715, + "learning_rate": 3.9844273568564036e-05, + "loss": 0.2595, + "step": 662 + }, + { + "epoch": 0.0687117836045186, + "grad_norm": 0.5596027970314026, + "learning_rate": 3.98434362998877e-05, + "loss": 0.2839, + "step": 663 + }, + { + "epoch": 0.06881542128718002, + "grad_norm": 0.5134223103523254, + "learning_rate": 3.9842596795283814e-05, + "loss": 0.297, + "step": 664 + }, + { + "epoch": 0.06891905896984143, + "grad_norm": 0.4692731201648712, + "learning_rate": 3.984175505484697e-05, + "loss": 0.2729, + "step": 665 + }, + { + "epoch": 0.06902269665250285, + "grad_norm": 0.37804538011550903, + "learning_rate": 3.9840911078672003e-05, + "loss": 0.2095, + "step": 666 + }, + { + "epoch": 0.06912633433516427, + "grad_norm": 0.5090510845184326, + "learning_rate": 3.9840064866854026e-05, + "loss": 0.2432, + "step": 667 + }, + { + "epoch": 0.06922997201782569, + "grad_norm": 0.38887301087379456, + "learning_rate": 3.983921641948838e-05, + "loss": 0.1764, + "step": 668 + }, + { + "epoch": 0.0693336097004871, + "grad_norm": 0.5521963834762573, + "learning_rate": 3.9838365736670665e-05, + "loss": 0.2711, + "step": 669 + }, + { + "epoch": 0.0694372473831485, + "grad_norm": 0.6766364574432373, + "learning_rate": 3.983751281849674e-05, + "loss": 0.3268, + "step": 670 + }, + { + "epoch": 0.06954088506580992, + "grad_norm": 0.44805410504341125, + "learning_rate": 3.9836657665062704e-05, + "loss": 0.2156, + "step": 671 + }, + { + "epoch": 0.06964452274847134, + "grad_norm": 0.4524245262145996, + "learning_rate": 3.983580027646492e-05, + "loss": 0.2797, + "step": 672 + }, + { + "epoch": 0.06974816043113276, + "grad_norm": 0.4633697271347046, + "learning_rate": 3.983494065280001e-05, + "loss": 0.241, + "step": 673 + }, + { + "epoch": 0.06985179811379418, + "grad_norm": 0.4903186559677124, + "learning_rate": 3.983407879416481e-05, + "loss": 0.2816, + "step": 674 + }, + { + "epoch": 0.06995543579645559, + "grad_norm": 0.4883442223072052, + "learning_rate": 3.983321470065644e-05, + "loss": 0.238, + "step": 675 + }, + { + "epoch": 0.07005907347911701, + "grad_norm": 0.4707334637641907, + "learning_rate": 3.983234837237228e-05, + "loss": 0.2567, + "step": 676 + }, + { + "epoch": 0.07016271116177843, + "grad_norm": 0.5906460285186768, + "learning_rate": 3.983147980940993e-05, + "loss": 0.2579, + "step": 677 + }, + { + "epoch": 0.07026634884443984, + "grad_norm": 0.5003019571304321, + "learning_rate": 3.983060901186726e-05, + "loss": 0.2549, + "step": 678 + }, + { + "epoch": 0.07036998652710126, + "grad_norm": 0.5827634334564209, + "learning_rate": 3.98297359798424e-05, + "loss": 0.3284, + "step": 679 + }, + { + "epoch": 0.07047362420976266, + "grad_norm": 0.45304641127586365, + "learning_rate": 3.9828860713433705e-05, + "loss": 0.2802, + "step": 680 + }, + { + "epoch": 0.07057726189242408, + "grad_norm": 0.5432493686676025, + "learning_rate": 3.982798321273982e-05, + "loss": 0.2634, + "step": 681 + }, + { + "epoch": 0.0706808995750855, + "grad_norm": 0.5457451343536377, + "learning_rate": 3.9827103477859605e-05, + "loss": 0.2908, + "step": 682 + }, + { + "epoch": 0.07078453725774692, + "grad_norm": 0.44750288128852844, + "learning_rate": 3.9826221508892196e-05, + "loss": 0.2125, + "step": 683 + }, + { + "epoch": 0.07088817494040833, + "grad_norm": 0.5408027172088623, + "learning_rate": 3.9825337305936965e-05, + "loss": 0.2741, + "step": 684 + }, + { + "epoch": 0.07099181262306975, + "grad_norm": 0.5531283617019653, + "learning_rate": 3.982445086909354e-05, + "loss": 0.2839, + "step": 685 + }, + { + "epoch": 0.07109545030573117, + "grad_norm": 0.5220621824264526, + "learning_rate": 3.982356219846182e-05, + "loss": 0.2682, + "step": 686 + }, + { + "epoch": 0.07119908798839258, + "grad_norm": 0.5586988925933838, + "learning_rate": 3.9822671294141916e-05, + "loss": 0.2544, + "step": 687 + }, + { + "epoch": 0.071302725671054, + "grad_norm": 0.4634181261062622, + "learning_rate": 3.9821778156234236e-05, + "loss": 0.2481, + "step": 688 + }, + { + "epoch": 0.0714063633537154, + "grad_norm": 0.46268972754478455, + "learning_rate": 3.9820882784839405e-05, + "loss": 0.2382, + "step": 689 + }, + { + "epoch": 0.07151000103637682, + "grad_norm": 0.5548827052116394, + "learning_rate": 3.9819985180058314e-05, + "loss": 0.3154, + "step": 690 + }, + { + "epoch": 0.07161363871903824, + "grad_norm": 0.40516266226768494, + "learning_rate": 3.9819085341992106e-05, + "loss": 0.2322, + "step": 691 + }, + { + "epoch": 0.07171727640169966, + "grad_norm": 0.49902236461639404, + "learning_rate": 3.981818327074216e-05, + "loss": 0.2632, + "step": 692 + }, + { + "epoch": 0.07182091408436107, + "grad_norm": 0.5100747346878052, + "learning_rate": 3.9817278966410134e-05, + "loss": 0.245, + "step": 693 + }, + { + "epoch": 0.07192455176702249, + "grad_norm": 0.5476238131523132, + "learning_rate": 3.981637242909793e-05, + "loss": 0.2564, + "step": 694 + }, + { + "epoch": 0.07202818944968391, + "grad_norm": 0.5173237919807434, + "learning_rate": 3.981546365890768e-05, + "loss": 0.2933, + "step": 695 + }, + { + "epoch": 0.07213182713234532, + "grad_norm": 0.4427679777145386, + "learning_rate": 3.9814552655941784e-05, + "loss": 0.2186, + "step": 696 + }, + { + "epoch": 0.07223546481500674, + "grad_norm": 0.4230489432811737, + "learning_rate": 3.9813639420302906e-05, + "loss": 0.2543, + "step": 697 + }, + { + "epoch": 0.07233910249766816, + "grad_norm": 0.42406851053237915, + "learning_rate": 3.9812723952093936e-05, + "loss": 0.2499, + "step": 698 + }, + { + "epoch": 0.07244274018032956, + "grad_norm": 0.41877564787864685, + "learning_rate": 3.981180625141803e-05, + "loss": 0.2248, + "step": 699 + }, + { + "epoch": 0.07254637786299098, + "grad_norm": 0.5359909534454346, + "learning_rate": 3.981088631837859e-05, + "loss": 0.2902, + "step": 700 + }, + { + "epoch": 0.0726500155456524, + "grad_norm": 0.43423157930374146, + "learning_rate": 3.980996415307928e-05, + "loss": 0.2227, + "step": 701 + }, + { + "epoch": 0.07275365322831381, + "grad_norm": 0.5584090352058411, + "learning_rate": 3.980903975562401e-05, + "loss": 0.2968, + "step": 702 + }, + { + "epoch": 0.07285729091097523, + "grad_norm": 0.4519881010055542, + "learning_rate": 3.980811312611692e-05, + "loss": 0.2353, + "step": 703 + }, + { + "epoch": 0.07296092859363665, + "grad_norm": 0.5242766737937927, + "learning_rate": 3.980718426466244e-05, + "loss": 0.3233, + "step": 704 + }, + { + "epoch": 0.07306456627629807, + "grad_norm": 0.5491629838943481, + "learning_rate": 3.980625317136523e-05, + "loss": 0.3075, + "step": 705 + }, + { + "epoch": 0.07316820395895948, + "grad_norm": 0.39431241154670715, + "learning_rate": 3.980531984633021e-05, + "loss": 0.2077, + "step": 706 + }, + { + "epoch": 0.0732718416416209, + "grad_norm": 0.5462802052497864, + "learning_rate": 3.980438428966253e-05, + "loss": 0.3186, + "step": 707 + }, + { + "epoch": 0.0733754793242823, + "grad_norm": 0.5003985166549683, + "learning_rate": 3.980344650146761e-05, + "loss": 0.251, + "step": 708 + }, + { + "epoch": 0.07347911700694372, + "grad_norm": 0.4941883981227875, + "learning_rate": 3.980250648185113e-05, + "loss": 0.2518, + "step": 709 + }, + { + "epoch": 0.07358275468960514, + "grad_norm": 0.4943901002407074, + "learning_rate": 3.9801564230919006e-05, + "loss": 0.2836, + "step": 710 + }, + { + "epoch": 0.07368639237226655, + "grad_norm": 0.4397679567337036, + "learning_rate": 3.98006197487774e-05, + "loss": 0.2366, + "step": 711 + }, + { + "epoch": 0.07379003005492797, + "grad_norm": 0.46421751379966736, + "learning_rate": 3.9799673035532745e-05, + "loss": 0.2499, + "step": 712 + }, + { + "epoch": 0.07389366773758939, + "grad_norm": 0.4776816964149475, + "learning_rate": 3.9798724091291715e-05, + "loss": 0.3052, + "step": 713 + }, + { + "epoch": 0.0739973054202508, + "grad_norm": 0.506424069404602, + "learning_rate": 3.979777291616122e-05, + "loss": 0.2826, + "step": 714 + }, + { + "epoch": 0.07410094310291222, + "grad_norm": 0.47890761494636536, + "learning_rate": 3.979681951024846e-05, + "loss": 0.2883, + "step": 715 + }, + { + "epoch": 0.07420458078557364, + "grad_norm": 0.5536207556724548, + "learning_rate": 3.9795863873660846e-05, + "loss": 0.3255, + "step": 716 + }, + { + "epoch": 0.07430821846823506, + "grad_norm": 0.4948398470878601, + "learning_rate": 3.979490600650607e-05, + "loss": 0.2697, + "step": 717 + }, + { + "epoch": 0.07441185615089646, + "grad_norm": 0.4891558885574341, + "learning_rate": 3.9793945908892057e-05, + "loss": 0.303, + "step": 718 + }, + { + "epoch": 0.07451549383355788, + "grad_norm": 0.4618590474128723, + "learning_rate": 3.979298358092698e-05, + "loss": 0.2451, + "step": 719 + }, + { + "epoch": 0.0746191315162193, + "grad_norm": 0.4500598907470703, + "learning_rate": 3.9792019022719294e-05, + "loss": 0.2535, + "step": 720 + }, + { + "epoch": 0.07472276919888071, + "grad_norm": 0.5183244943618774, + "learning_rate": 3.9791052234377663e-05, + "loss": 0.2796, + "step": 721 + }, + { + "epoch": 0.07482640688154213, + "grad_norm": 0.45024576783180237, + "learning_rate": 3.979008321601104e-05, + "loss": 0.2339, + "step": 722 + }, + { + "epoch": 0.07493004456420355, + "grad_norm": 0.5016899704933167, + "learning_rate": 3.9789111967728595e-05, + "loss": 0.2972, + "step": 723 + }, + { + "epoch": 0.07503368224686496, + "grad_norm": 0.4528055489063263, + "learning_rate": 3.9788138489639786e-05, + "loss": 0.2613, + "step": 724 + }, + { + "epoch": 0.07513731992952638, + "grad_norm": 0.4231579899787903, + "learning_rate": 3.9787162781854284e-05, + "loss": 0.2259, + "step": 725 + }, + { + "epoch": 0.0752409576121878, + "grad_norm": 0.47760093212127686, + "learning_rate": 3.978618484448204e-05, + "loss": 0.2637, + "step": 726 + }, + { + "epoch": 0.0753445952948492, + "grad_norm": 0.4602471888065338, + "learning_rate": 3.978520467763325e-05, + "loss": 0.2547, + "step": 727 + }, + { + "epoch": 0.07544823297751062, + "grad_norm": 0.5210710167884827, + "learning_rate": 3.978422228141836e-05, + "loss": 0.2485, + "step": 728 + }, + { + "epoch": 0.07555187066017204, + "grad_norm": 0.47295230627059937, + "learning_rate": 3.9783237655948044e-05, + "loss": 0.2633, + "step": 729 + }, + { + "epoch": 0.07565550834283345, + "grad_norm": 0.4705461859703064, + "learning_rate": 3.9782250801333274e-05, + "loss": 0.2642, + "step": 730 + }, + { + "epoch": 0.07575914602549487, + "grad_norm": 0.4627399444580078, + "learning_rate": 3.978126171768523e-05, + "loss": 0.2571, + "step": 731 + }, + { + "epoch": 0.07586278370815629, + "grad_norm": 0.476418137550354, + "learning_rate": 3.978027040511537e-05, + "loss": 0.2702, + "step": 732 + }, + { + "epoch": 0.0759664213908177, + "grad_norm": 0.5001246333122253, + "learning_rate": 3.977927686373539e-05, + "loss": 0.2578, + "step": 733 + }, + { + "epoch": 0.07607005907347912, + "grad_norm": 0.46971946954727173, + "learning_rate": 3.977828109365724e-05, + "loss": 0.2661, + "step": 734 + }, + { + "epoch": 0.07617369675614054, + "grad_norm": 0.4575338065624237, + "learning_rate": 3.9777283094993115e-05, + "loss": 0.2778, + "step": 735 + }, + { + "epoch": 0.07627733443880196, + "grad_norm": 0.4811464846134186, + "learning_rate": 3.9776282867855475e-05, + "loss": 0.247, + "step": 736 + }, + { + "epoch": 0.07638097212146336, + "grad_norm": 0.5836530923843384, + "learning_rate": 3.9775280412357035e-05, + "loss": 0.2472, + "step": 737 + }, + { + "epoch": 0.07648460980412478, + "grad_norm": 0.4331651031970978, + "learning_rate": 3.977427572861073e-05, + "loss": 0.2376, + "step": 738 + }, + { + "epoch": 0.07658824748678619, + "grad_norm": 0.42944300174713135, + "learning_rate": 3.977326881672978e-05, + "loss": 0.2161, + "step": 739 + }, + { + "epoch": 0.07669188516944761, + "grad_norm": 0.4123460352420807, + "learning_rate": 3.977225967682764e-05, + "loss": 0.2387, + "step": 740 + }, + { + "epoch": 0.07679552285210903, + "grad_norm": 0.4910091757774353, + "learning_rate": 3.977124830901802e-05, + "loss": 0.3004, + "step": 741 + }, + { + "epoch": 0.07689916053477044, + "grad_norm": 0.552148163318634, + "learning_rate": 3.977023471341487e-05, + "loss": 0.2798, + "step": 742 + }, + { + "epoch": 0.07700279821743186, + "grad_norm": 0.4576529860496521, + "learning_rate": 3.9769218890132404e-05, + "loss": 0.2756, + "step": 743 + }, + { + "epoch": 0.07710643590009328, + "grad_norm": 0.5145983695983887, + "learning_rate": 3.9768200839285086e-05, + "loss": 0.231, + "step": 744 + }, + { + "epoch": 0.0772100735827547, + "grad_norm": 0.4392525851726532, + "learning_rate": 3.976718056098763e-05, + "loss": 0.2115, + "step": 745 + }, + { + "epoch": 0.0773137112654161, + "grad_norm": 0.465648353099823, + "learning_rate": 3.9766158055354996e-05, + "loss": 0.2376, + "step": 746 + }, + { + "epoch": 0.07741734894807752, + "grad_norm": 0.5436767935752869, + "learning_rate": 3.97651333225024e-05, + "loss": 0.2779, + "step": 747 + }, + { + "epoch": 0.07752098663073893, + "grad_norm": 0.523582935333252, + "learning_rate": 3.9764106362545305e-05, + "loss": 0.2704, + "step": 748 + }, + { + "epoch": 0.07762462431340035, + "grad_norm": 0.4481733441352844, + "learning_rate": 3.9763077175599426e-05, + "loss": 0.2684, + "step": 749 + }, + { + "epoch": 0.07772826199606177, + "grad_norm": 0.4326241612434387, + "learning_rate": 3.9762045761780734e-05, + "loss": 0.2551, + "step": 750 + }, + { + "epoch": 0.07783189967872318, + "grad_norm": 0.526773989200592, + "learning_rate": 3.9761012121205455e-05, + "loss": 0.297, + "step": 751 + }, + { + "epoch": 0.0779355373613846, + "grad_norm": 0.46273472905158997, + "learning_rate": 3.9759976253990046e-05, + "loss": 0.2507, + "step": 752 + }, + { + "epoch": 0.07803917504404602, + "grad_norm": 0.44620481133461, + "learning_rate": 3.975893816025123e-05, + "loss": 0.2264, + "step": 753 + }, + { + "epoch": 0.07814281272670744, + "grad_norm": 0.5178984999656677, + "learning_rate": 3.975789784010597e-05, + "loss": 0.2131, + "step": 754 + }, + { + "epoch": 0.07824645040936885, + "grad_norm": 0.4811955392360687, + "learning_rate": 3.97568552936715e-05, + "loss": 0.2773, + "step": 755 + }, + { + "epoch": 0.07835008809203026, + "grad_norm": 0.4600077271461487, + "learning_rate": 3.975581052106529e-05, + "loss": 0.235, + "step": 756 + }, + { + "epoch": 0.07845372577469167, + "grad_norm": 0.4501517713069916, + "learning_rate": 3.975476352240506e-05, + "loss": 0.2325, + "step": 757 + }, + { + "epoch": 0.07855736345735309, + "grad_norm": 0.4092719852924347, + "learning_rate": 3.9753714297808785e-05, + "loss": 0.2239, + "step": 758 + }, + { + "epoch": 0.07866100114001451, + "grad_norm": 0.5300100445747375, + "learning_rate": 3.975266284739469e-05, + "loss": 0.2852, + "step": 759 + }, + { + "epoch": 0.07876463882267593, + "grad_norm": 0.4446985423564911, + "learning_rate": 3.9751609171281255e-05, + "loss": 0.2648, + "step": 760 + }, + { + "epoch": 0.07886827650533734, + "grad_norm": 0.5151501297950745, + "learning_rate": 3.97505532695872e-05, + "loss": 0.257, + "step": 761 + }, + { + "epoch": 0.07897191418799876, + "grad_norm": 0.4873696565628052, + "learning_rate": 3.974949514243151e-05, + "loss": 0.2493, + "step": 762 + }, + { + "epoch": 0.07907555187066018, + "grad_norm": 0.3995228409767151, + "learning_rate": 3.9748434789933406e-05, + "loss": 0.2318, + "step": 763 + }, + { + "epoch": 0.0791791895533216, + "grad_norm": 0.5084617137908936, + "learning_rate": 3.974737221221238e-05, + "loss": 0.2919, + "step": 764 + }, + { + "epoch": 0.079282827235983, + "grad_norm": 0.539893388748169, + "learning_rate": 3.974630740938813e-05, + "loss": 0.2722, + "step": 765 + }, + { + "epoch": 0.07938646491864441, + "grad_norm": 0.466237336397171, + "learning_rate": 3.974524038158067e-05, + "loss": 0.2258, + "step": 766 + }, + { + "epoch": 0.07949010260130583, + "grad_norm": 0.5253474712371826, + "learning_rate": 3.9744171128910214e-05, + "loss": 0.3033, + "step": 767 + }, + { + "epoch": 0.07959374028396725, + "grad_norm": 0.3932199478149414, + "learning_rate": 3.974309965149725e-05, + "loss": 0.1973, + "step": 768 + }, + { + "epoch": 0.07969737796662867, + "grad_norm": 0.5772408246994019, + "learning_rate": 3.974202594946251e-05, + "loss": 0.3122, + "step": 769 + }, + { + "epoch": 0.07980101564929008, + "grad_norm": 0.5734532475471497, + "learning_rate": 3.9740950022926974e-05, + "loss": 0.2943, + "step": 770 + }, + { + "epoch": 0.0799046533319515, + "grad_norm": 0.46139439940452576, + "learning_rate": 3.973987187201188e-05, + "loss": 0.2527, + "step": 771 + }, + { + "epoch": 0.08000829101461292, + "grad_norm": 0.4981940686702728, + "learning_rate": 3.9738791496838703e-05, + "loss": 0.2318, + "step": 772 + }, + { + "epoch": 0.08011192869727433, + "grad_norm": 0.5667979121208191, + "learning_rate": 3.973770889752919e-05, + "loss": 0.2542, + "step": 773 + }, + { + "epoch": 0.08021556637993575, + "grad_norm": 0.47905823588371277, + "learning_rate": 3.973662407420532e-05, + "loss": 0.2411, + "step": 774 + }, + { + "epoch": 0.08031920406259715, + "grad_norm": 0.5183667540550232, + "learning_rate": 3.973553702698933e-05, + "loss": 0.2745, + "step": 775 + }, + { + "epoch": 0.08042284174525857, + "grad_norm": 0.48042669892311096, + "learning_rate": 3.9734447756003704e-05, + "loss": 0.2382, + "step": 776 + }, + { + "epoch": 0.08052647942791999, + "grad_norm": 0.48477572202682495, + "learning_rate": 3.973335626137119e-05, + "loss": 0.2375, + "step": 777 + }, + { + "epoch": 0.0806301171105814, + "grad_norm": 0.4594244062900543, + "learning_rate": 3.973226254321477e-05, + "loss": 0.2366, + "step": 778 + }, + { + "epoch": 0.08073375479324282, + "grad_norm": 0.5444444417953491, + "learning_rate": 3.973116660165767e-05, + "loss": 0.298, + "step": 779 + }, + { + "epoch": 0.08083739247590424, + "grad_norm": 0.5429214239120483, + "learning_rate": 3.9730068436823395e-05, + "loss": 0.2718, + "step": 780 + }, + { + "epoch": 0.08094103015856566, + "grad_norm": 0.4269948899745941, + "learning_rate": 3.972896804883568e-05, + "loss": 0.1915, + "step": 781 + }, + { + "epoch": 0.08104466784122707, + "grad_norm": 0.511663556098938, + "learning_rate": 3.972786543781852e-05, + "loss": 0.2616, + "step": 782 + }, + { + "epoch": 0.08114830552388849, + "grad_norm": 0.49156785011291504, + "learning_rate": 3.972676060389614e-05, + "loss": 0.2885, + "step": 783 + }, + { + "epoch": 0.0812519432065499, + "grad_norm": 0.4817125201225281, + "learning_rate": 3.972565354719305e-05, + "loss": 0.2804, + "step": 784 + }, + { + "epoch": 0.08135558088921131, + "grad_norm": 0.4289361238479614, + "learning_rate": 3.9724544267833975e-05, + "loss": 0.2031, + "step": 785 + }, + { + "epoch": 0.08145921857187273, + "grad_norm": 0.44129499793052673, + "learning_rate": 3.9723432765943916e-05, + "loss": 0.2282, + "step": 786 + }, + { + "epoch": 0.08156285625453415, + "grad_norm": 0.49618256092071533, + "learning_rate": 3.972231904164812e-05, + "loss": 0.283, + "step": 787 + }, + { + "epoch": 0.08166649393719556, + "grad_norm": 0.44237473607063293, + "learning_rate": 3.9721203095072066e-05, + "loss": 0.2416, + "step": 788 + }, + { + "epoch": 0.08177013161985698, + "grad_norm": 0.45676106214523315, + "learning_rate": 3.972008492634151e-05, + "loss": 0.2704, + "step": 789 + }, + { + "epoch": 0.0818737693025184, + "grad_norm": 0.45083701610565186, + "learning_rate": 3.971896453558244e-05, + "loss": 0.2541, + "step": 790 + }, + { + "epoch": 0.08197740698517982, + "grad_norm": 0.34744521975517273, + "learning_rate": 3.971784192292109e-05, + "loss": 0.166, + "step": 791 + }, + { + "epoch": 0.08208104466784123, + "grad_norm": 0.41484206914901733, + "learning_rate": 3.971671708848398e-05, + "loss": 0.2457, + "step": 792 + }, + { + "epoch": 0.08218468235050265, + "grad_norm": 0.4614809453487396, + "learning_rate": 3.971559003239782e-05, + "loss": 0.2399, + "step": 793 + }, + { + "epoch": 0.08228832003316405, + "grad_norm": 0.502249002456665, + "learning_rate": 3.971446075478964e-05, + "loss": 0.2933, + "step": 794 + }, + { + "epoch": 0.08239195771582547, + "grad_norm": 0.42531266808509827, + "learning_rate": 3.971332925578666e-05, + "loss": 0.2611, + "step": 795 + }, + { + "epoch": 0.08249559539848689, + "grad_norm": 0.4552249610424042, + "learning_rate": 3.971219553551639e-05, + "loss": 0.2785, + "step": 796 + }, + { + "epoch": 0.0825992330811483, + "grad_norm": 0.43548455834388733, + "learning_rate": 3.9711059594106566e-05, + "loss": 0.2453, + "step": 797 + }, + { + "epoch": 0.08270287076380972, + "grad_norm": 0.5675352215766907, + "learning_rate": 3.970992143168519e-05, + "loss": 0.2832, + "step": 798 + }, + { + "epoch": 0.08280650844647114, + "grad_norm": 0.48082834482192993, + "learning_rate": 3.9708781048380506e-05, + "loss": 0.2183, + "step": 799 + }, + { + "epoch": 0.08291014612913256, + "grad_norm": 0.5138378143310547, + "learning_rate": 3.9707638444321015e-05, + "loss": 0.2467, + "step": 800 + }, + { + "epoch": 0.08301378381179397, + "grad_norm": 0.5684926509857178, + "learning_rate": 3.970649361963545e-05, + "loss": 0.2779, + "step": 801 + }, + { + "epoch": 0.08311742149445539, + "grad_norm": 0.4427591562271118, + "learning_rate": 3.9705346574452825e-05, + "loss": 0.2376, + "step": 802 + }, + { + "epoch": 0.0832210591771168, + "grad_norm": 0.5298888087272644, + "learning_rate": 3.970419730890238e-05, + "loss": 0.2352, + "step": 803 + }, + { + "epoch": 0.08332469685977821, + "grad_norm": 0.42755240201950073, + "learning_rate": 3.970304582311362e-05, + "loss": 0.2489, + "step": 804 + }, + { + "epoch": 0.08342833454243963, + "grad_norm": 0.43197426199913025, + "learning_rate": 3.970189211721627e-05, + "loss": 0.2128, + "step": 805 + }, + { + "epoch": 0.08353197222510104, + "grad_norm": 0.5031106472015381, + "learning_rate": 3.9700736191340355e-05, + "loss": 0.2749, + "step": 806 + }, + { + "epoch": 0.08363560990776246, + "grad_norm": 0.4439834654331207, + "learning_rate": 3.9699578045616114e-05, + "loss": 0.2277, + "step": 807 + }, + { + "epoch": 0.08373924759042388, + "grad_norm": 0.4895908534526825, + "learning_rate": 3.9698417680174035e-05, + "loss": 0.2738, + "step": 808 + }, + { + "epoch": 0.0838428852730853, + "grad_norm": 0.47002407908439636, + "learning_rate": 3.9697255095144874e-05, + "loss": 0.2332, + "step": 809 + }, + { + "epoch": 0.08394652295574671, + "grad_norm": 0.5205109715461731, + "learning_rate": 3.9696090290659634e-05, + "loss": 0.2589, + "step": 810 + }, + { + "epoch": 0.08405016063840813, + "grad_norm": 0.5365964770317078, + "learning_rate": 3.969492326684956e-05, + "loss": 0.3035, + "step": 811 + }, + { + "epoch": 0.08415379832106953, + "grad_norm": 0.45762428641319275, + "learning_rate": 3.9693754023846136e-05, + "loss": 0.2675, + "step": 812 + }, + { + "epoch": 0.08425743600373095, + "grad_norm": 0.40482908487319946, + "learning_rate": 3.9692582561781135e-05, + "loss": 0.2117, + "step": 813 + }, + { + "epoch": 0.08436107368639237, + "grad_norm": 0.4397818446159363, + "learning_rate": 3.969140888078654e-05, + "loss": 0.2109, + "step": 814 + }, + { + "epoch": 0.08446471136905379, + "grad_norm": 0.4823589026927948, + "learning_rate": 3.96902329809946e-05, + "loss": 0.259, + "step": 815 + }, + { + "epoch": 0.0845683490517152, + "grad_norm": 0.4680706262588501, + "learning_rate": 3.968905486253782e-05, + "loss": 0.2658, + "step": 816 + }, + { + "epoch": 0.08467198673437662, + "grad_norm": 0.505746066570282, + "learning_rate": 3.968787452554894e-05, + "loss": 0.2455, + "step": 817 + }, + { + "epoch": 0.08477562441703804, + "grad_norm": 0.4473956823348999, + "learning_rate": 3.968669197016097e-05, + "loss": 0.2645, + "step": 818 + }, + { + "epoch": 0.08487926209969945, + "grad_norm": 0.47364342212677, + "learning_rate": 3.9685507196507155e-05, + "loss": 0.2785, + "step": 819 + }, + { + "epoch": 0.08498289978236087, + "grad_norm": 0.4266853928565979, + "learning_rate": 3.968432020472098e-05, + "loss": 0.2244, + "step": 820 + }, + { + "epoch": 0.08508653746502229, + "grad_norm": 0.48588618636131287, + "learning_rate": 3.968313099493622e-05, + "loss": 0.2967, + "step": 821 + }, + { + "epoch": 0.08519017514768369, + "grad_norm": 0.43978413939476013, + "learning_rate": 3.968193956728684e-05, + "loss": 0.2031, + "step": 822 + }, + { + "epoch": 0.08529381283034511, + "grad_norm": 0.39410391449928284, + "learning_rate": 3.968074592190711e-05, + "loss": 0.2318, + "step": 823 + }, + { + "epoch": 0.08539745051300653, + "grad_norm": 0.47819966077804565, + "learning_rate": 3.967955005893154e-05, + "loss": 0.2304, + "step": 824 + }, + { + "epoch": 0.08550108819566794, + "grad_norm": 0.4324469268321991, + "learning_rate": 3.967835197849485e-05, + "loss": 0.2174, + "step": 825 + }, + { + "epoch": 0.08560472587832936, + "grad_norm": 0.5045969486236572, + "learning_rate": 3.967715168073205e-05, + "loss": 0.26, + "step": 826 + }, + { + "epoch": 0.08570836356099078, + "grad_norm": 0.470830500125885, + "learning_rate": 3.967594916577838e-05, + "loss": 0.2837, + "step": 827 + }, + { + "epoch": 0.0858120012436522, + "grad_norm": 0.4859100878238678, + "learning_rate": 3.9674744433769355e-05, + "loss": 0.2184, + "step": 828 + }, + { + "epoch": 0.08591563892631361, + "grad_norm": 0.499252587556839, + "learning_rate": 3.967353748484071e-05, + "loss": 0.2311, + "step": 829 + }, + { + "epoch": 0.08601927660897503, + "grad_norm": 0.45752185583114624, + "learning_rate": 3.967232831912844e-05, + "loss": 0.252, + "step": 830 + }, + { + "epoch": 0.08612291429163643, + "grad_norm": 0.46715739369392395, + "learning_rate": 3.96711169367688e-05, + "loss": 0.2308, + "step": 831 + }, + { + "epoch": 0.08622655197429785, + "grad_norm": 0.4784639775753021, + "learning_rate": 3.966990333789828e-05, + "loss": 0.2385, + "step": 832 + }, + { + "epoch": 0.08633018965695927, + "grad_norm": 0.46856051683425903, + "learning_rate": 3.9668687522653636e-05, + "loss": 0.2428, + "step": 833 + }, + { + "epoch": 0.08643382733962068, + "grad_norm": 0.47302600741386414, + "learning_rate": 3.9667469491171856e-05, + "loss": 0.2339, + "step": 834 + }, + { + "epoch": 0.0865374650222821, + "grad_norm": 0.5023990869522095, + "learning_rate": 3.966624924359018e-05, + "loss": 0.2783, + "step": 835 + }, + { + "epoch": 0.08664110270494352, + "grad_norm": 0.5078696608543396, + "learning_rate": 3.966502678004612e-05, + "loss": 0.2787, + "step": 836 + }, + { + "epoch": 0.08674474038760493, + "grad_norm": 0.49874991178512573, + "learning_rate": 3.9663802100677404e-05, + "loss": 0.2497, + "step": 837 + }, + { + "epoch": 0.08684837807026635, + "grad_norm": 0.5799754858016968, + "learning_rate": 3.966257520562204e-05, + "loss": 0.2834, + "step": 838 + }, + { + "epoch": 0.08695201575292777, + "grad_norm": 0.5072081089019775, + "learning_rate": 3.9661346095018264e-05, + "loss": 0.3123, + "step": 839 + }, + { + "epoch": 0.08705565343558919, + "grad_norm": 0.4480048716068268, + "learning_rate": 3.966011476900458e-05, + "loss": 0.2142, + "step": 840 + }, + { + "epoch": 0.08715929111825059, + "grad_norm": 0.4509342312812805, + "learning_rate": 3.965888122771972e-05, + "loss": 0.2228, + "step": 841 + }, + { + "epoch": 0.087262928800912, + "grad_norm": 0.43247190117836, + "learning_rate": 3.965764547130269e-05, + "loss": 0.2275, + "step": 842 + }, + { + "epoch": 0.08736656648357342, + "grad_norm": 0.53459233045578, + "learning_rate": 3.9656407499892724e-05, + "loss": 0.2793, + "step": 843 + }, + { + "epoch": 0.08747020416623484, + "grad_norm": 0.4321698248386383, + "learning_rate": 3.965516731362931e-05, + "loss": 0.2133, + "step": 844 + }, + { + "epoch": 0.08757384184889626, + "grad_norm": 0.5410631895065308, + "learning_rate": 3.965392491265221e-05, + "loss": 0.2837, + "step": 845 + }, + { + "epoch": 0.08767747953155768, + "grad_norm": 0.46599480509757996, + "learning_rate": 3.965268029710139e-05, + "loss": 0.243, + "step": 846 + }, + { + "epoch": 0.08778111721421909, + "grad_norm": 0.5210885405540466, + "learning_rate": 3.9651433467117123e-05, + "loss": 0.3113, + "step": 847 + }, + { + "epoch": 0.08788475489688051, + "grad_norm": 0.46380823850631714, + "learning_rate": 3.9650184422839875e-05, + "loss": 0.2821, + "step": 848 + }, + { + "epoch": 0.08798839257954193, + "grad_norm": 0.48911863565444946, + "learning_rate": 3.9648933164410385e-05, + "loss": 0.2866, + "step": 849 + }, + { + "epoch": 0.08809203026220333, + "grad_norm": 0.4629068970680237, + "learning_rate": 3.964767969196966e-05, + "loss": 0.2962, + "step": 850 + }, + { + "epoch": 0.08819566794486475, + "grad_norm": 0.48530182242393494, + "learning_rate": 3.9646424005658925e-05, + "loss": 0.256, + "step": 851 + }, + { + "epoch": 0.08829930562752616, + "grad_norm": 0.4971450865268707, + "learning_rate": 3.9645166105619674e-05, + "loss": 0.2523, + "step": 852 + }, + { + "epoch": 0.08840294331018758, + "grad_norm": 0.41097646951675415, + "learning_rate": 3.964390599199364e-05, + "loss": 0.198, + "step": 853 + }, + { + "epoch": 0.088506580992849, + "grad_norm": 0.40204328298568726, + "learning_rate": 3.9642643664922825e-05, + "loss": 0.2282, + "step": 854 + }, + { + "epoch": 0.08861021867551042, + "grad_norm": 0.44748103618621826, + "learning_rate": 3.964137912454945e-05, + "loss": 0.2481, + "step": 855 + }, + { + "epoch": 0.08871385635817183, + "grad_norm": 0.48509109020233154, + "learning_rate": 3.9640112371016016e-05, + "loss": 0.2571, + "step": 856 + }, + { + "epoch": 0.08881749404083325, + "grad_norm": 0.42726966738700867, + "learning_rate": 3.963884340446525e-05, + "loss": 0.2374, + "step": 857 + }, + { + "epoch": 0.08892113172349467, + "grad_norm": 0.47798478603363037, + "learning_rate": 3.963757222504013e-05, + "loss": 0.2458, + "step": 858 + }, + { + "epoch": 0.08902476940615608, + "grad_norm": 0.5577030777931213, + "learning_rate": 3.9636298832883905e-05, + "loss": 0.2845, + "step": 859 + }, + { + "epoch": 0.08912840708881749, + "grad_norm": 0.45337674021720886, + "learning_rate": 3.9635023228140056e-05, + "loss": 0.2509, + "step": 860 + }, + { + "epoch": 0.0892320447714789, + "grad_norm": 0.5406741499900818, + "learning_rate": 3.963374541095231e-05, + "loss": 0.3109, + "step": 861 + }, + { + "epoch": 0.08933568245414032, + "grad_norm": 0.46739137172698975, + "learning_rate": 3.963246538146465e-05, + "loss": 0.2387, + "step": 862 + }, + { + "epoch": 0.08943932013680174, + "grad_norm": 0.4822086691856384, + "learning_rate": 3.963118313982131e-05, + "loss": 0.249, + "step": 863 + }, + { + "epoch": 0.08954295781946316, + "grad_norm": 0.4558001756668091, + "learning_rate": 3.962989868616677e-05, + "loss": 0.2376, + "step": 864 + }, + { + "epoch": 0.08964659550212457, + "grad_norm": 0.44393190741539, + "learning_rate": 3.9628612020645766e-05, + "loss": 0.2362, + "step": 865 + }, + { + "epoch": 0.08975023318478599, + "grad_norm": 0.4516092836856842, + "learning_rate": 3.9627323143403276e-05, + "loss": 0.2039, + "step": 866 + }, + { + "epoch": 0.08985387086744741, + "grad_norm": 0.49979168176651, + "learning_rate": 3.9626032054584515e-05, + "loss": 0.2927, + "step": 867 + }, + { + "epoch": 0.08995750855010883, + "grad_norm": 0.474592387676239, + "learning_rate": 3.962473875433498e-05, + "loss": 0.2919, + "step": 868 + }, + { + "epoch": 0.09006114623277023, + "grad_norm": 0.4473690688610077, + "learning_rate": 3.962344324280038e-05, + "loss": 0.2519, + "step": 869 + }, + { + "epoch": 0.09016478391543165, + "grad_norm": 0.5843013525009155, + "learning_rate": 3.962214552012671e-05, + "loss": 0.2983, + "step": 870 + }, + { + "epoch": 0.09026842159809306, + "grad_norm": 0.5144256949424744, + "learning_rate": 3.962084558646018e-05, + "loss": 0.2722, + "step": 871 + }, + { + "epoch": 0.09037205928075448, + "grad_norm": 0.5222553610801697, + "learning_rate": 3.9619543441947274e-05, + "loss": 0.2652, + "step": 872 + }, + { + "epoch": 0.0904756969634159, + "grad_norm": 0.430524080991745, + "learning_rate": 3.9618239086734716e-05, + "loss": 0.2197, + "step": 873 + }, + { + "epoch": 0.09057933464607731, + "grad_norm": 0.506688117980957, + "learning_rate": 3.961693252096947e-05, + "loss": 0.2509, + "step": 874 + }, + { + "epoch": 0.09068297232873873, + "grad_norm": 0.41722744703292847, + "learning_rate": 3.961562374479876e-05, + "loss": 0.2376, + "step": 875 + }, + { + "epoch": 0.09078661001140015, + "grad_norm": 0.4740089774131775, + "learning_rate": 3.961431275837006e-05, + "loss": 0.2546, + "step": 876 + }, + { + "epoch": 0.09089024769406157, + "grad_norm": 0.46386855840682983, + "learning_rate": 3.961299956183109e-05, + "loss": 0.2554, + "step": 877 + }, + { + "epoch": 0.09099388537672298, + "grad_norm": 0.4838809669017792, + "learning_rate": 3.9611684155329825e-05, + "loss": 0.2585, + "step": 878 + }, + { + "epoch": 0.09109752305938439, + "grad_norm": 0.4920610189437866, + "learning_rate": 3.9610366539014474e-05, + "loss": 0.2723, + "step": 879 + }, + { + "epoch": 0.0912011607420458, + "grad_norm": 0.4134841561317444, + "learning_rate": 3.96090467130335e-05, + "loss": 0.2264, + "step": 880 + }, + { + "epoch": 0.09130479842470722, + "grad_norm": 0.5652831196784973, + "learning_rate": 3.9607724677535626e-05, + "loss": 0.2446, + "step": 881 + }, + { + "epoch": 0.09140843610736864, + "grad_norm": 0.4715527892112732, + "learning_rate": 3.960640043266982e-05, + "loss": 0.2656, + "step": 882 + }, + { + "epoch": 0.09151207379003005, + "grad_norm": 0.4669806957244873, + "learning_rate": 3.960507397858529e-05, + "loss": 0.2143, + "step": 883 + }, + { + "epoch": 0.09161571147269147, + "grad_norm": 0.4482795000076294, + "learning_rate": 3.96037453154315e-05, + "loss": 0.2185, + "step": 884 + }, + { + "epoch": 0.09171934915535289, + "grad_norm": 0.4721313714981079, + "learning_rate": 3.960241444335817e-05, + "loss": 0.2793, + "step": 885 + }, + { + "epoch": 0.0918229868380143, + "grad_norm": 0.5375263094902039, + "learning_rate": 3.9601081362515245e-05, + "loss": 0.2409, + "step": 886 + }, + { + "epoch": 0.09192662452067572, + "grad_norm": 0.5323898792266846, + "learning_rate": 3.9599746073052945e-05, + "loss": 0.248, + "step": 887 + }, + { + "epoch": 0.09203026220333713, + "grad_norm": 0.4709322154521942, + "learning_rate": 3.959840857512172e-05, + "loss": 0.25, + "step": 888 + }, + { + "epoch": 0.09213389988599854, + "grad_norm": 0.4204918146133423, + "learning_rate": 3.9597068868872296e-05, + "loss": 0.2437, + "step": 889 + }, + { + "epoch": 0.09223753756865996, + "grad_norm": 0.5008912086486816, + "learning_rate": 3.9595726954455606e-05, + "loss": 0.2705, + "step": 890 + }, + { + "epoch": 0.09234117525132138, + "grad_norm": 0.4866129755973816, + "learning_rate": 3.959438283202287e-05, + "loss": 0.2598, + "step": 891 + }, + { + "epoch": 0.0924448129339828, + "grad_norm": 0.5146870613098145, + "learning_rate": 3.959303650172554e-05, + "loss": 0.2436, + "step": 892 + }, + { + "epoch": 0.09254845061664421, + "grad_norm": 0.4937049448490143, + "learning_rate": 3.959168796371531e-05, + "loss": 0.2376, + "step": 893 + }, + { + "epoch": 0.09265208829930563, + "grad_norm": 0.5434243679046631, + "learning_rate": 3.959033721814413e-05, + "loss": 0.282, + "step": 894 + }, + { + "epoch": 0.09275572598196705, + "grad_norm": 0.5378921627998352, + "learning_rate": 3.958898426516421e-05, + "loss": 0.2877, + "step": 895 + }, + { + "epoch": 0.09285936366462846, + "grad_norm": 0.5169168710708618, + "learning_rate": 3.9587629104927995e-05, + "loss": 0.2575, + "step": 896 + }, + { + "epoch": 0.09296300134728988, + "grad_norm": 0.511336624622345, + "learning_rate": 3.9586271737588184e-05, + "loss": 0.2763, + "step": 897 + }, + { + "epoch": 0.09306663902995128, + "grad_norm": 0.4890974462032318, + "learning_rate": 3.958491216329772e-05, + "loss": 0.2712, + "step": 898 + }, + { + "epoch": 0.0931702767126127, + "grad_norm": 0.47972339391708374, + "learning_rate": 3.95835503822098e-05, + "loss": 0.2379, + "step": 899 + }, + { + "epoch": 0.09327391439527412, + "grad_norm": 0.5554192662239075, + "learning_rate": 3.9582186394477864e-05, + "loss": 0.284, + "step": 900 + }, + { + "epoch": 0.09337755207793554, + "grad_norm": 0.5789952278137207, + "learning_rate": 3.958082020025561e-05, + "loss": 0.2869, + "step": 901 + }, + { + "epoch": 0.09348118976059695, + "grad_norm": 0.5500551462173462, + "learning_rate": 3.957945179969697e-05, + "loss": 0.2719, + "step": 902 + }, + { + "epoch": 0.09358482744325837, + "grad_norm": 0.5016873478889465, + "learning_rate": 3.957808119295614e-05, + "loss": 0.2527, + "step": 903 + }, + { + "epoch": 0.09368846512591979, + "grad_norm": 0.4230045676231384, + "learning_rate": 3.957670838018755e-05, + "loss": 0.21, + "step": 904 + }, + { + "epoch": 0.0937921028085812, + "grad_norm": 0.513180673122406, + "learning_rate": 3.957533336154591e-05, + "loss": 0.2474, + "step": 905 + }, + { + "epoch": 0.09389574049124262, + "grad_norm": 0.5388766527175903, + "learning_rate": 3.9573956137186124e-05, + "loss": 0.2803, + "step": 906 + }, + { + "epoch": 0.09399937817390402, + "grad_norm": 0.42283692955970764, + "learning_rate": 3.957257670726339e-05, + "loss": 0.2359, + "step": 907 + }, + { + "epoch": 0.09410301585656544, + "grad_norm": 0.47266629338264465, + "learning_rate": 3.957119507193314e-05, + "loss": 0.2384, + "step": 908 + }, + { + "epoch": 0.09420665353922686, + "grad_norm": 0.43167486786842346, + "learning_rate": 3.956981123135105e-05, + "loss": 0.2279, + "step": 909 + }, + { + "epoch": 0.09431029122188828, + "grad_norm": 0.42065706849098206, + "learning_rate": 3.956842518567305e-05, + "loss": 0.1911, + "step": 910 + }, + { + "epoch": 0.0944139289045497, + "grad_norm": 0.49886035919189453, + "learning_rate": 3.956703693505533e-05, + "loss": 0.2458, + "step": 911 + }, + { + "epoch": 0.09451756658721111, + "grad_norm": 0.5268099308013916, + "learning_rate": 3.95656464796543e-05, + "loss": 0.3251, + "step": 912 + }, + { + "epoch": 0.09462120426987253, + "grad_norm": 0.4417741596698761, + "learning_rate": 3.956425381962664e-05, + "loss": 0.2542, + "step": 913 + }, + { + "epoch": 0.09472484195253394, + "grad_norm": 0.4796222150325775, + "learning_rate": 3.956285895512928e-05, + "loss": 0.233, + "step": 914 + }, + { + "epoch": 0.09482847963519536, + "grad_norm": 0.46497902274131775, + "learning_rate": 3.956146188631937e-05, + "loss": 0.2712, + "step": 915 + }, + { + "epoch": 0.09493211731785678, + "grad_norm": 0.4230740964412689, + "learning_rate": 3.956006261335435e-05, + "loss": 0.216, + "step": 916 + }, + { + "epoch": 0.09503575500051818, + "grad_norm": 0.421990305185318, + "learning_rate": 3.9558661136391886e-05, + "loss": 0.225, + "step": 917 + }, + { + "epoch": 0.0951393926831796, + "grad_norm": 0.4670604169368744, + "learning_rate": 3.955725745558988e-05, + "loss": 0.2331, + "step": 918 + }, + { + "epoch": 0.09524303036584102, + "grad_norm": 0.5368228554725647, + "learning_rate": 3.9555851571106514e-05, + "loss": 0.2914, + "step": 919 + }, + { + "epoch": 0.09534666804850243, + "grad_norm": 0.42806532979011536, + "learning_rate": 3.955444348310019e-05, + "loss": 0.236, + "step": 920 + }, + { + "epoch": 0.09545030573116385, + "grad_norm": 0.49855703115463257, + "learning_rate": 3.9553033191729576e-05, + "loss": 0.2523, + "step": 921 + }, + { + "epoch": 0.09555394341382527, + "grad_norm": 0.41719886660575867, + "learning_rate": 3.9551620697153575e-05, + "loss": 0.2338, + "step": 922 + }, + { + "epoch": 0.09565758109648669, + "grad_norm": 0.4818481206893921, + "learning_rate": 3.955020599953135e-05, + "loss": 0.2773, + "step": 923 + }, + { + "epoch": 0.0957612187791481, + "grad_norm": 0.4772093594074249, + "learning_rate": 3.9548789099022305e-05, + "loss": 0.2894, + "step": 924 + }, + { + "epoch": 0.09586485646180952, + "grad_norm": 0.39468058943748474, + "learning_rate": 3.9547369995786084e-05, + "loss": 0.1933, + "step": 925 + }, + { + "epoch": 0.09596849414447092, + "grad_norm": 0.4253804683685303, + "learning_rate": 3.9545948689982605e-05, + "loss": 0.2506, + "step": 926 + }, + { + "epoch": 0.09607213182713234, + "grad_norm": 0.4445357620716095, + "learning_rate": 3.954452518177201e-05, + "loss": 0.2319, + "step": 927 + }, + { + "epoch": 0.09617576950979376, + "grad_norm": 0.4653438329696655, + "learning_rate": 3.954309947131471e-05, + "loss": 0.2988, + "step": 928 + }, + { + "epoch": 0.09627940719245517, + "grad_norm": 0.3784976899623871, + "learning_rate": 3.9541671558771334e-05, + "loss": 0.1875, + "step": 929 + }, + { + "epoch": 0.09638304487511659, + "grad_norm": 0.47835689783096313, + "learning_rate": 3.954024144430278e-05, + "loss": 0.2569, + "step": 930 + }, + { + "epoch": 0.09648668255777801, + "grad_norm": 0.4231659770011902, + "learning_rate": 3.95388091280702e-05, + "loss": 0.2265, + "step": 931 + }, + { + "epoch": 0.09659032024043943, + "grad_norm": 0.46786612272262573, + "learning_rate": 3.953737461023499e-05, + "loss": 0.2365, + "step": 932 + }, + { + "epoch": 0.09669395792310084, + "grad_norm": 0.4952070116996765, + "learning_rate": 3.953593789095877e-05, + "loss": 0.267, + "step": 933 + }, + { + "epoch": 0.09679759560576226, + "grad_norm": 0.4483940601348877, + "learning_rate": 3.953449897040344e-05, + "loss": 0.2318, + "step": 934 + }, + { + "epoch": 0.09690123328842368, + "grad_norm": 0.5074480772018433, + "learning_rate": 3.953305784873114e-05, + "loss": 0.2706, + "step": 935 + }, + { + "epoch": 0.09700487097108508, + "grad_norm": 0.5509966015815735, + "learning_rate": 3.9531614526104237e-05, + "loss": 0.2932, + "step": 936 + }, + { + "epoch": 0.0971085086537465, + "grad_norm": 0.3346215784549713, + "learning_rate": 3.953016900268537e-05, + "loss": 0.1715, + "step": 937 + }, + { + "epoch": 0.09721214633640791, + "grad_norm": 0.37619563937187195, + "learning_rate": 3.952872127863743e-05, + "loss": 0.1943, + "step": 938 + }, + { + "epoch": 0.09731578401906933, + "grad_norm": 0.508621096611023, + "learning_rate": 3.952727135412353e-05, + "loss": 0.2681, + "step": 939 + }, + { + "epoch": 0.09741942170173075, + "grad_norm": 0.4613504409790039, + "learning_rate": 3.9525819229307044e-05, + "loss": 0.2572, + "step": 940 + }, + { + "epoch": 0.09752305938439217, + "grad_norm": 0.554082453250885, + "learning_rate": 3.952436490435161e-05, + "loss": 0.3056, + "step": 941 + }, + { + "epoch": 0.09762669706705358, + "grad_norm": 0.4735698997974396, + "learning_rate": 3.952290837942108e-05, + "loss": 0.1899, + "step": 942 + }, + { + "epoch": 0.097730334749715, + "grad_norm": 0.4331062138080597, + "learning_rate": 3.952144965467959e-05, + "loss": 0.25, + "step": 943 + }, + { + "epoch": 0.09783397243237642, + "grad_norm": 0.4538209140300751, + "learning_rate": 3.9519988730291493e-05, + "loss": 0.2491, + "step": 944 + }, + { + "epoch": 0.09793761011503782, + "grad_norm": 0.4904068112373352, + "learning_rate": 3.9518525606421414e-05, + "loss": 0.2782, + "step": 945 + }, + { + "epoch": 0.09804124779769924, + "grad_norm": 0.5205404162406921, + "learning_rate": 3.9517060283234216e-05, + "loss": 0.2507, + "step": 946 + }, + { + "epoch": 0.09814488548036066, + "grad_norm": 0.48228219151496887, + "learning_rate": 3.9515592760895005e-05, + "loss": 0.2576, + "step": 947 + }, + { + "epoch": 0.09824852316302207, + "grad_norm": 0.4108220636844635, + "learning_rate": 3.9514123039569135e-05, + "loss": 0.233, + "step": 948 + }, + { + "epoch": 0.09835216084568349, + "grad_norm": 0.4790742099285126, + "learning_rate": 3.951265111942221e-05, + "loss": 0.2426, + "step": 949 + }, + { + "epoch": 0.0984557985283449, + "grad_norm": 0.4508419632911682, + "learning_rate": 3.95111770006201e-05, + "loss": 0.252, + "step": 950 + }, + { + "epoch": 0.09855943621100632, + "grad_norm": 0.41636496782302856, + "learning_rate": 3.95097006833289e-05, + "loss": 0.2434, + "step": 951 + }, + { + "epoch": 0.09866307389366774, + "grad_norm": 0.5083922743797302, + "learning_rate": 3.9508222167714945e-05, + "loss": 0.2878, + "step": 952 + }, + { + "epoch": 0.09876671157632916, + "grad_norm": 0.46241459250450134, + "learning_rate": 3.950674145394484e-05, + "loss": 0.2322, + "step": 953 + }, + { + "epoch": 0.09887034925899058, + "grad_norm": 0.458252489566803, + "learning_rate": 3.950525854218544e-05, + "loss": 0.2587, + "step": 954 + }, + { + "epoch": 0.09897398694165198, + "grad_norm": 0.4902607202529907, + "learning_rate": 3.950377343260383e-05, + "loss": 0.2161, + "step": 955 + }, + { + "epoch": 0.0990776246243134, + "grad_norm": 0.40840044617652893, + "learning_rate": 3.9502286125367345e-05, + "loss": 0.1812, + "step": 956 + }, + { + "epoch": 0.09918126230697481, + "grad_norm": 0.42781862616539, + "learning_rate": 3.950079662064358e-05, + "loss": 0.2268, + "step": 957 + }, + { + "epoch": 0.09928489998963623, + "grad_norm": 0.48561370372772217, + "learning_rate": 3.949930491860036e-05, + "loss": 0.2736, + "step": 958 + }, + { + "epoch": 0.09938853767229765, + "grad_norm": 0.5395079255104065, + "learning_rate": 3.949781101940578e-05, + "loss": 0.2579, + "step": 959 + }, + { + "epoch": 0.09949217535495906, + "grad_norm": 0.5001274943351746, + "learning_rate": 3.949631492322816e-05, + "loss": 0.2844, + "step": 960 + }, + { + "epoch": 0.09959581303762048, + "grad_norm": 0.5049706101417542, + "learning_rate": 3.949481663023608e-05, + "loss": 0.2715, + "step": 961 + }, + { + "epoch": 0.0996994507202819, + "grad_norm": 0.5008152723312378, + "learning_rate": 3.9493316140598376e-05, + "loss": 0.2894, + "step": 962 + }, + { + "epoch": 0.09980308840294332, + "grad_norm": 0.5027458667755127, + "learning_rate": 3.94918134544841e-05, + "loss": 0.2589, + "step": 963 + }, + { + "epoch": 0.09990672608560472, + "grad_norm": 0.5868988633155823, + "learning_rate": 3.94903085720626e-05, + "loss": 0.2717, + "step": 964 + }, + { + "epoch": 0.10001036376826614, + "grad_norm": 0.4468805193901062, + "learning_rate": 3.9488801493503414e-05, + "loss": 0.2377, + "step": 965 + }, + { + "epoch": 0.10011400145092755, + "grad_norm": 0.5798137187957764, + "learning_rate": 3.948729221897638e-05, + "loss": 0.2686, + "step": 966 + }, + { + "epoch": 0.10021763913358897, + "grad_norm": 0.5064831972122192, + "learning_rate": 3.948578074865155e-05, + "loss": 0.2471, + "step": 967 + }, + { + "epoch": 0.10032127681625039, + "grad_norm": 0.4484928846359253, + "learning_rate": 3.9484267082699236e-05, + "loss": 0.2226, + "step": 968 + }, + { + "epoch": 0.1004249144989118, + "grad_norm": 0.42684027552604675, + "learning_rate": 3.948275122129e-05, + "loss": 0.2371, + "step": 969 + }, + { + "epoch": 0.10052855218157322, + "grad_norm": 0.500918447971344, + "learning_rate": 3.948123316459464e-05, + "loss": 0.2368, + "step": 970 + }, + { + "epoch": 0.10063218986423464, + "grad_norm": 0.4330311417579651, + "learning_rate": 3.947971291278421e-05, + "loss": 0.227, + "step": 971 + }, + { + "epoch": 0.10073582754689606, + "grad_norm": 0.3878055214881897, + "learning_rate": 3.947819046603001e-05, + "loss": 0.216, + "step": 972 + }, + { + "epoch": 0.10083946522955747, + "grad_norm": 0.5194640159606934, + "learning_rate": 3.947666582450359e-05, + "loss": 0.3169, + "step": 973 + }, + { + "epoch": 0.10094310291221888, + "grad_norm": 0.4383881390094757, + "learning_rate": 3.947513898837674e-05, + "loss": 0.2236, + "step": 974 + }, + { + "epoch": 0.1010467405948803, + "grad_norm": 0.4771685004234314, + "learning_rate": 3.94736099578215e-05, + "loss": 0.2398, + "step": 975 + }, + { + "epoch": 0.10115037827754171, + "grad_norm": 0.4392111003398895, + "learning_rate": 3.9472078733010174e-05, + "loss": 0.2375, + "step": 976 + }, + { + "epoch": 0.10125401596020313, + "grad_norm": 0.502129077911377, + "learning_rate": 3.9470545314115274e-05, + "loss": 0.2515, + "step": 977 + }, + { + "epoch": 0.10135765364286455, + "grad_norm": 0.5018049478530884, + "learning_rate": 3.9469009701309605e-05, + "loss": 0.2249, + "step": 978 + }, + { + "epoch": 0.10146129132552596, + "grad_norm": 0.4413333535194397, + "learning_rate": 3.946747189476618e-05, + "loss": 0.2153, + "step": 979 + }, + { + "epoch": 0.10156492900818738, + "grad_norm": 0.5340810418128967, + "learning_rate": 3.946593189465829e-05, + "loss": 0.2898, + "step": 980 + }, + { + "epoch": 0.1016685666908488, + "grad_norm": 0.46548914909362793, + "learning_rate": 3.946438970115945e-05, + "loss": 0.2418, + "step": 981 + }, + { + "epoch": 0.10177220437351021, + "grad_norm": 0.5104466676712036, + "learning_rate": 3.9462845314443445e-05, + "loss": 0.2741, + "step": 982 + }, + { + "epoch": 0.10187584205617162, + "grad_norm": 0.4898616373538971, + "learning_rate": 3.9461298734684275e-05, + "loss": 0.2547, + "step": 983 + }, + { + "epoch": 0.10197947973883303, + "grad_norm": 0.4741784334182739, + "learning_rate": 3.9459749962056225e-05, + "loss": 0.2502, + "step": 984 + }, + { + "epoch": 0.10208311742149445, + "grad_norm": 0.5162531733512878, + "learning_rate": 3.94581989967338e-05, + "loss": 0.3015, + "step": 985 + }, + { + "epoch": 0.10218675510415587, + "grad_norm": 0.5185749530792236, + "learning_rate": 3.9456645838891755e-05, + "loss": 0.2858, + "step": 986 + }, + { + "epoch": 0.10229039278681729, + "grad_norm": 0.4627940356731415, + "learning_rate": 3.94550904887051e-05, + "loss": 0.2377, + "step": 987 + }, + { + "epoch": 0.1023940304694787, + "grad_norm": 0.5982307195663452, + "learning_rate": 3.94535329463491e-05, + "loss": 0.2988, + "step": 988 + }, + { + "epoch": 0.10249766815214012, + "grad_norm": 0.4750824570655823, + "learning_rate": 3.945197321199925e-05, + "loss": 0.2445, + "step": 989 + }, + { + "epoch": 0.10260130583480154, + "grad_norm": 0.5667946934700012, + "learning_rate": 3.945041128583129e-05, + "loss": 0.2684, + "step": 990 + }, + { + "epoch": 0.10270494351746295, + "grad_norm": 0.4947032928466797, + "learning_rate": 3.9448847168021226e-05, + "loss": 0.2445, + "step": 991 + }, + { + "epoch": 0.10280858120012437, + "grad_norm": 0.4986145496368408, + "learning_rate": 3.9447280858745295e-05, + "loss": 0.3004, + "step": 992 + }, + { + "epoch": 0.10291221888278577, + "grad_norm": 0.4144374132156372, + "learning_rate": 3.944571235817999e-05, + "loss": 0.2092, + "step": 993 + }, + { + "epoch": 0.10301585656544719, + "grad_norm": 0.5639985799789429, + "learning_rate": 3.944414166650204e-05, + "loss": 0.3377, + "step": 994 + }, + { + "epoch": 0.10311949424810861, + "grad_norm": 0.5175818204879761, + "learning_rate": 3.9442568783888443e-05, + "loss": 0.2365, + "step": 995 + }, + { + "epoch": 0.10322313193077003, + "grad_norm": 0.4092256724834442, + "learning_rate": 3.9440993710516415e-05, + "loss": 0.2572, + "step": 996 + }, + { + "epoch": 0.10332676961343144, + "grad_norm": 0.47614118456840515, + "learning_rate": 3.943941644656344e-05, + "loss": 0.2535, + "step": 997 + }, + { + "epoch": 0.10343040729609286, + "grad_norm": 0.5114443898200989, + "learning_rate": 3.9437836992207234e-05, + "loss": 0.2779, + "step": 998 + }, + { + "epoch": 0.10353404497875428, + "grad_norm": 0.4928836524486542, + "learning_rate": 3.9436255347625775e-05, + "loss": 0.2521, + "step": 999 + }, + { + "epoch": 0.1036376826614157, + "grad_norm": 0.46215057373046875, + "learning_rate": 3.9434671512997276e-05, + "loss": 0.2436, + "step": 1000 + }, + { + "epoch": 0.10374132034407711, + "grad_norm": 0.45834508538246155, + "learning_rate": 3.943308548850021e-05, + "loss": 0.249, + "step": 1001 + }, + { + "epoch": 0.10384495802673852, + "grad_norm": 0.40858447551727295, + "learning_rate": 3.943149727431327e-05, + "loss": 0.1887, + "step": 1002 + }, + { + "epoch": 0.10394859570939993, + "grad_norm": 0.5563355684280396, + "learning_rate": 3.942990687061543e-05, + "loss": 0.2852, + "step": 1003 + }, + { + "epoch": 0.10405223339206135, + "grad_norm": 0.43062031269073486, + "learning_rate": 3.942831427758589e-05, + "loss": 0.208, + "step": 1004 + }, + { + "epoch": 0.10415587107472277, + "grad_norm": 0.5552340149879456, + "learning_rate": 3.942671949540409e-05, + "loss": 0.2622, + "step": 1005 + }, + { + "epoch": 0.10425950875738418, + "grad_norm": 0.4066930413246155, + "learning_rate": 3.942512252424974e-05, + "loss": 0.1943, + "step": 1006 + }, + { + "epoch": 0.1043631464400456, + "grad_norm": 0.42987269163131714, + "learning_rate": 3.9423523364302795e-05, + "loss": 0.2548, + "step": 1007 + }, + { + "epoch": 0.10446678412270702, + "grad_norm": 0.45666632056236267, + "learning_rate": 3.942192201574341e-05, + "loss": 0.2442, + "step": 1008 + }, + { + "epoch": 0.10457042180536844, + "grad_norm": 0.5226937532424927, + "learning_rate": 3.9420318478752056e-05, + "loss": 0.2615, + "step": 1009 + }, + { + "epoch": 0.10467405948802985, + "grad_norm": 0.4175627827644348, + "learning_rate": 3.9418712753509406e-05, + "loss": 0.1772, + "step": 1010 + }, + { + "epoch": 0.10477769717069127, + "grad_norm": 0.4888598918914795, + "learning_rate": 3.941710484019639e-05, + "loss": 0.2718, + "step": 1011 + }, + { + "epoch": 0.10488133485335267, + "grad_norm": 0.4504351019859314, + "learning_rate": 3.941549473899418e-05, + "loss": 0.2557, + "step": 1012 + }, + { + "epoch": 0.10498497253601409, + "grad_norm": 0.5280157923698425, + "learning_rate": 3.941388245008421e-05, + "loss": 0.2456, + "step": 1013 + }, + { + "epoch": 0.10508861021867551, + "grad_norm": 0.4623817801475525, + "learning_rate": 3.941226797364814e-05, + "loss": 0.2398, + "step": 1014 + }, + { + "epoch": 0.10519224790133692, + "grad_norm": 0.42403942346572876, + "learning_rate": 3.94106513098679e-05, + "loss": 0.2141, + "step": 1015 + }, + { + "epoch": 0.10529588558399834, + "grad_norm": 0.483390748500824, + "learning_rate": 3.940903245892564e-05, + "loss": 0.2364, + "step": 1016 + }, + { + "epoch": 0.10539952326665976, + "grad_norm": 0.47593533992767334, + "learning_rate": 3.940741142100377e-05, + "loss": 0.2373, + "step": 1017 + }, + { + "epoch": 0.10550316094932118, + "grad_norm": 0.39513906836509705, + "learning_rate": 3.940578819628495e-05, + "loss": 0.2268, + "step": 1018 + }, + { + "epoch": 0.10560679863198259, + "grad_norm": 0.48310190439224243, + "learning_rate": 3.940416278495209e-05, + "loss": 0.257, + "step": 1019 + }, + { + "epoch": 0.10571043631464401, + "grad_norm": 0.5143681764602661, + "learning_rate": 3.940253518718833e-05, + "loss": 0.2589, + "step": 1020 + }, + { + "epoch": 0.10581407399730541, + "grad_norm": 0.5714600086212158, + "learning_rate": 3.940090540317706e-05, + "loss": 0.308, + "step": 1021 + }, + { + "epoch": 0.10591771167996683, + "grad_norm": 0.5120255947113037, + "learning_rate": 3.939927343310194e-05, + "loss": 0.3122, + "step": 1022 + }, + { + "epoch": 0.10602134936262825, + "grad_norm": 0.4649025797843933, + "learning_rate": 3.939763927714684e-05, + "loss": 0.2271, + "step": 1023 + }, + { + "epoch": 0.10612498704528966, + "grad_norm": 0.5476794242858887, + "learning_rate": 3.9396002935495895e-05, + "loss": 0.2673, + "step": 1024 + }, + { + "epoch": 0.10622862472795108, + "grad_norm": 0.5482632517814636, + "learning_rate": 3.93943644083335e-05, + "loss": 0.2655, + "step": 1025 + }, + { + "epoch": 0.1063322624106125, + "grad_norm": 0.4433247148990631, + "learning_rate": 3.939272369584427e-05, + "loss": 0.2377, + "step": 1026 + }, + { + "epoch": 0.10643590009327392, + "grad_norm": 0.4874042272567749, + "learning_rate": 3.939108079821308e-05, + "loss": 0.228, + "step": 1027 + }, + { + "epoch": 0.10653953777593533, + "grad_norm": 0.4970521926879883, + "learning_rate": 3.938943571562505e-05, + "loss": 0.2692, + "step": 1028 + }, + { + "epoch": 0.10664317545859675, + "grad_norm": 0.4594666063785553, + "learning_rate": 3.9387788448265546e-05, + "loss": 0.2255, + "step": 1029 + }, + { + "epoch": 0.10674681314125817, + "grad_norm": 0.40364277362823486, + "learning_rate": 3.9386138996320176e-05, + "loss": 0.2188, + "step": 1030 + }, + { + "epoch": 0.10685045082391957, + "grad_norm": 0.49549341201782227, + "learning_rate": 3.9384487359974806e-05, + "loss": 0.2475, + "step": 1031 + }, + { + "epoch": 0.10695408850658099, + "grad_norm": 0.4368329644203186, + "learning_rate": 3.9382833539415526e-05, + "loss": 0.2564, + "step": 1032 + }, + { + "epoch": 0.1070577261892424, + "grad_norm": 0.48317834734916687, + "learning_rate": 3.93811775348287e-05, + "loss": 0.2726, + "step": 1033 + }, + { + "epoch": 0.10716136387190382, + "grad_norm": 0.4917926490306854, + "learning_rate": 3.937951934640091e-05, + "loss": 0.2571, + "step": 1034 + }, + { + "epoch": 0.10726500155456524, + "grad_norm": 0.5338187217712402, + "learning_rate": 3.9377858974319014e-05, + "loss": 0.2776, + "step": 1035 + }, + { + "epoch": 0.10736863923722666, + "grad_norm": 0.49937704205513, + "learning_rate": 3.937619641877009e-05, + "loss": 0.2573, + "step": 1036 + }, + { + "epoch": 0.10747227691988807, + "grad_norm": 0.4757545292377472, + "learning_rate": 3.9374531679941474e-05, + "loss": 0.2601, + "step": 1037 + }, + { + "epoch": 0.10757591460254949, + "grad_norm": 0.49643903970718384, + "learning_rate": 3.937286475802075e-05, + "loss": 0.2899, + "step": 1038 + }, + { + "epoch": 0.10767955228521091, + "grad_norm": 0.4637889564037323, + "learning_rate": 3.937119565319574e-05, + "loss": 0.2456, + "step": 1039 + }, + { + "epoch": 0.10778318996787231, + "grad_norm": 0.44887062907218933, + "learning_rate": 3.936952436565451e-05, + "loss": 0.2265, + "step": 1040 + }, + { + "epoch": 0.10788682765053373, + "grad_norm": 0.44160398840904236, + "learning_rate": 3.9367850895585394e-05, + "loss": 0.2418, + "step": 1041 + }, + { + "epoch": 0.10799046533319515, + "grad_norm": 0.5461088418960571, + "learning_rate": 3.936617524317694e-05, + "loss": 0.2811, + "step": 1042 + }, + { + "epoch": 0.10809410301585656, + "grad_norm": 0.44432151317596436, + "learning_rate": 3.936449740861797e-05, + "loss": 0.233, + "step": 1043 + }, + { + "epoch": 0.10819774069851798, + "grad_norm": 0.4568310081958771, + "learning_rate": 3.936281739209752e-05, + "loss": 0.2414, + "step": 1044 + }, + { + "epoch": 0.1083013783811794, + "grad_norm": 0.47912347316741943, + "learning_rate": 3.936113519380493e-05, + "loss": 0.2377, + "step": 1045 + }, + { + "epoch": 0.10840501606384081, + "grad_norm": 0.5290972590446472, + "learning_rate": 3.9359450813929705e-05, + "loss": 0.2794, + "step": 1046 + }, + { + "epoch": 0.10850865374650223, + "grad_norm": 0.5089946389198303, + "learning_rate": 3.935776425266166e-05, + "loss": 0.2565, + "step": 1047 + }, + { + "epoch": 0.10861229142916365, + "grad_norm": 0.4862841069698334, + "learning_rate": 3.935607551019084e-05, + "loss": 0.2442, + "step": 1048 + }, + { + "epoch": 0.10871592911182507, + "grad_norm": 0.46862688660621643, + "learning_rate": 3.935438458670752e-05, + "loss": 0.2144, + "step": 1049 + }, + { + "epoch": 0.10881956679448647, + "grad_norm": 0.4151781499385834, + "learning_rate": 3.935269148240223e-05, + "loss": 0.261, + "step": 1050 + }, + { + "epoch": 0.10892320447714789, + "grad_norm": 0.5200570821762085, + "learning_rate": 3.935099619746575e-05, + "loss": 0.2695, + "step": 1051 + }, + { + "epoch": 0.1090268421598093, + "grad_norm": 0.48534032702445984, + "learning_rate": 3.934929873208909e-05, + "loss": 0.2555, + "step": 1052 + }, + { + "epoch": 0.10913047984247072, + "grad_norm": 0.4310669004917145, + "learning_rate": 3.934759908646354e-05, + "loss": 0.2691, + "step": 1053 + }, + { + "epoch": 0.10923411752513214, + "grad_norm": 0.49501046538352966, + "learning_rate": 3.934589726078059e-05, + "loss": 0.2694, + "step": 1054 + }, + { + "epoch": 0.10933775520779355, + "grad_norm": 0.537093997001648, + "learning_rate": 3.9344193255232016e-05, + "loss": 0.2443, + "step": 1055 + }, + { + "epoch": 0.10944139289045497, + "grad_norm": 0.49397650361061096, + "learning_rate": 3.934248707000982e-05, + "loss": 0.2945, + "step": 1056 + }, + { + "epoch": 0.10954503057311639, + "grad_norm": 0.5158434510231018, + "learning_rate": 3.9340778705306244e-05, + "loss": 0.2643, + "step": 1057 + }, + { + "epoch": 0.1096486682557778, + "grad_norm": 0.4754325747489929, + "learning_rate": 3.9339068161313796e-05, + "loss": 0.2584, + "step": 1058 + }, + { + "epoch": 0.10975230593843921, + "grad_norm": 0.5299050807952881, + "learning_rate": 3.9337355438225205e-05, + "loss": 0.2647, + "step": 1059 + }, + { + "epoch": 0.10985594362110063, + "grad_norm": 0.5079212188720703, + "learning_rate": 3.9335640536233465e-05, + "loss": 0.2731, + "step": 1060 + }, + { + "epoch": 0.10995958130376204, + "grad_norm": 0.4635586142539978, + "learning_rate": 3.933392345553181e-05, + "loss": 0.2567, + "step": 1061 + }, + { + "epoch": 0.11006321898642346, + "grad_norm": 0.4382088780403137, + "learning_rate": 3.933220419631371e-05, + "loss": 0.25, + "step": 1062 + }, + { + "epoch": 0.11016685666908488, + "grad_norm": 0.49184006452560425, + "learning_rate": 3.93304827587729e-05, + "loss": 0.2391, + "step": 1063 + }, + { + "epoch": 0.1102704943517463, + "grad_norm": 0.4317704141139984, + "learning_rate": 3.932875914310334e-05, + "loss": 0.2253, + "step": 1064 + }, + { + "epoch": 0.11037413203440771, + "grad_norm": 0.471828430891037, + "learning_rate": 3.9327033349499247e-05, + "loss": 0.2143, + "step": 1065 + }, + { + "epoch": 0.11047776971706913, + "grad_norm": 0.4559275805950165, + "learning_rate": 3.9325305378155076e-05, + "loss": 0.2205, + "step": 1066 + }, + { + "epoch": 0.11058140739973055, + "grad_norm": 0.45583653450012207, + "learning_rate": 3.932357522926554e-05, + "loss": 0.2311, + "step": 1067 + }, + { + "epoch": 0.11068504508239196, + "grad_norm": 0.5379915237426758, + "learning_rate": 3.932184290302559e-05, + "loss": 0.2838, + "step": 1068 + }, + { + "epoch": 0.11078868276505337, + "grad_norm": 0.4416240155696869, + "learning_rate": 3.9320108399630414e-05, + "loss": 0.2054, + "step": 1069 + }, + { + "epoch": 0.11089232044771478, + "grad_norm": 0.4274689257144928, + "learning_rate": 3.931837171927546e-05, + "loss": 0.2233, + "step": 1070 + }, + { + "epoch": 0.1109959581303762, + "grad_norm": 0.4448922872543335, + "learning_rate": 3.931663286215641e-05, + "loss": 0.263, + "step": 1071 + }, + { + "epoch": 0.11109959581303762, + "grad_norm": 0.47742193937301636, + "learning_rate": 3.93148918284692e-05, + "loss": 0.2838, + "step": 1072 + }, + { + "epoch": 0.11120323349569904, + "grad_norm": 0.454584002494812, + "learning_rate": 3.931314861841e-05, + "loss": 0.228, + "step": 1073 + }, + { + "epoch": 0.11130687117836045, + "grad_norm": 0.4988914728164673, + "learning_rate": 3.931140323217524e-05, + "loss": 0.2943, + "step": 1074 + }, + { + "epoch": 0.11141050886102187, + "grad_norm": 0.48816895484924316, + "learning_rate": 3.930965566996158e-05, + "loss": 0.3103, + "step": 1075 + }, + { + "epoch": 0.11151414654368329, + "grad_norm": 0.41996386647224426, + "learning_rate": 3.9307905931965934e-05, + "loss": 0.2142, + "step": 1076 + }, + { + "epoch": 0.1116177842263447, + "grad_norm": 0.4714778661727905, + "learning_rate": 3.9306154018385474e-05, + "loss": 0.2659, + "step": 1077 + }, + { + "epoch": 0.11172142190900611, + "grad_norm": 0.48112183809280396, + "learning_rate": 3.930439992941758e-05, + "loss": 0.2355, + "step": 1078 + }, + { + "epoch": 0.11182505959166752, + "grad_norm": 0.5149953961372375, + "learning_rate": 3.930264366525992e-05, + "loss": 0.2331, + "step": 1079 + }, + { + "epoch": 0.11192869727432894, + "grad_norm": 0.5702659487724304, + "learning_rate": 3.930088522611037e-05, + "loss": 0.2842, + "step": 1080 + }, + { + "epoch": 0.11203233495699036, + "grad_norm": 0.5287723541259766, + "learning_rate": 3.929912461216708e-05, + "loss": 0.2585, + "step": 1081 + }, + { + "epoch": 0.11213597263965178, + "grad_norm": 0.47466516494750977, + "learning_rate": 3.929736182362843e-05, + "loss": 0.2428, + "step": 1082 + }, + { + "epoch": 0.1122396103223132, + "grad_norm": 0.4626266062259674, + "learning_rate": 3.9295596860693054e-05, + "loss": 0.2083, + "step": 1083 + }, + { + "epoch": 0.11234324800497461, + "grad_norm": 0.4878407120704651, + "learning_rate": 3.929382972355981e-05, + "loss": 0.2483, + "step": 1084 + }, + { + "epoch": 0.11244688568763603, + "grad_norm": 0.4909697473049164, + "learning_rate": 3.929206041242782e-05, + "loss": 0.258, + "step": 1085 + }, + { + "epoch": 0.11255052337029745, + "grad_norm": 0.5589547753334045, + "learning_rate": 3.929028892749647e-05, + "loss": 0.2531, + "step": 1086 + }, + { + "epoch": 0.11265416105295886, + "grad_norm": 0.46808531880378723, + "learning_rate": 3.928851526896535e-05, + "loss": 0.269, + "step": 1087 + }, + { + "epoch": 0.11275779873562027, + "grad_norm": 0.5738458633422852, + "learning_rate": 3.9286739437034304e-05, + "loss": 0.2583, + "step": 1088 + }, + { + "epoch": 0.11286143641828168, + "grad_norm": 0.500787615776062, + "learning_rate": 3.928496143190344e-05, + "loss": 0.2518, + "step": 1089 + }, + { + "epoch": 0.1129650741009431, + "grad_norm": 0.5720177292823792, + "learning_rate": 3.92831812537731e-05, + "loss": 0.267, + "step": 1090 + }, + { + "epoch": 0.11306871178360452, + "grad_norm": 0.5120452046394348, + "learning_rate": 3.9281398902843875e-05, + "loss": 0.2359, + "step": 1091 + }, + { + "epoch": 0.11317234946626593, + "grad_norm": 0.4737999439239502, + "learning_rate": 3.92796143793166e-05, + "loss": 0.2387, + "step": 1092 + }, + { + "epoch": 0.11327598714892735, + "grad_norm": 0.47964850068092346, + "learning_rate": 3.927782768339235e-05, + "loss": 0.2534, + "step": 1093 + }, + { + "epoch": 0.11337962483158877, + "grad_norm": 0.5042358040809631, + "learning_rate": 3.9276038815272436e-05, + "loss": 0.2673, + "step": 1094 + }, + { + "epoch": 0.11348326251425019, + "grad_norm": 0.4781404137611389, + "learning_rate": 3.9274247775158433e-05, + "loss": 0.237, + "step": 1095 + }, + { + "epoch": 0.1135869001969116, + "grad_norm": 0.4982626736164093, + "learning_rate": 3.927245456325216e-05, + "loss": 0.2354, + "step": 1096 + }, + { + "epoch": 0.113690537879573, + "grad_norm": 0.4899817109107971, + "learning_rate": 3.9270659179755656e-05, + "loss": 0.3026, + "step": 1097 + }, + { + "epoch": 0.11379417556223442, + "grad_norm": 0.4677768349647522, + "learning_rate": 3.926886162487124e-05, + "loss": 0.206, + "step": 1098 + }, + { + "epoch": 0.11389781324489584, + "grad_norm": 0.5187677145004272, + "learning_rate": 3.926706189880145e-05, + "loss": 0.2868, + "step": 1099 + }, + { + "epoch": 0.11400145092755726, + "grad_norm": 0.42227962613105774, + "learning_rate": 3.926526000174908e-05, + "loss": 0.2685, + "step": 1100 + }, + { + "epoch": 0.11410508861021867, + "grad_norm": 0.5097355842590332, + "learning_rate": 3.926345593391715e-05, + "loss": 0.2664, + "step": 1101 + }, + { + "epoch": 0.11420872629288009, + "grad_norm": 0.39787158370018005, + "learning_rate": 3.926164969550896e-05, + "loss": 0.2331, + "step": 1102 + }, + { + "epoch": 0.11431236397554151, + "grad_norm": 0.504854142665863, + "learning_rate": 3.9259841286728024e-05, + "loss": 0.2386, + "step": 1103 + }, + { + "epoch": 0.11441600165820293, + "grad_norm": 0.5150250196456909, + "learning_rate": 3.925803070777812e-05, + "loss": 0.2853, + "step": 1104 + }, + { + "epoch": 0.11451963934086434, + "grad_norm": 0.4854704737663269, + "learning_rate": 3.9256217958863236e-05, + "loss": 0.2674, + "step": 1105 + }, + { + "epoch": 0.11462327702352576, + "grad_norm": 0.4782135486602783, + "learning_rate": 3.9254403040187664e-05, + "loss": 0.2326, + "step": 1106 + }, + { + "epoch": 0.11472691470618716, + "grad_norm": 0.5136960744857788, + "learning_rate": 3.925258595195587e-05, + "loss": 0.2808, + "step": 1107 + }, + { + "epoch": 0.11483055238884858, + "grad_norm": 0.45660775899887085, + "learning_rate": 3.9250766694372634e-05, + "loss": 0.2145, + "step": 1108 + }, + { + "epoch": 0.11493419007151, + "grad_norm": 0.4734261929988861, + "learning_rate": 3.924894526764293e-05, + "loss": 0.2327, + "step": 1109 + }, + { + "epoch": 0.11503782775417142, + "grad_norm": 0.451330304145813, + "learning_rate": 3.924712167197199e-05, + "loss": 0.2568, + "step": 1110 + }, + { + "epoch": 0.11514146543683283, + "grad_norm": 0.4574805200099945, + "learning_rate": 3.924529590756531e-05, + "loss": 0.233, + "step": 1111 + }, + { + "epoch": 0.11524510311949425, + "grad_norm": 0.468637615442276, + "learning_rate": 3.9243467974628596e-05, + "loss": 0.2596, + "step": 1112 + }, + { + "epoch": 0.11534874080215567, + "grad_norm": 0.4985135793685913, + "learning_rate": 3.924163787336783e-05, + "loss": 0.2166, + "step": 1113 + }, + { + "epoch": 0.11545237848481708, + "grad_norm": 0.5171657204627991, + "learning_rate": 3.9239805603989213e-05, + "loss": 0.2658, + "step": 1114 + }, + { + "epoch": 0.1155560161674785, + "grad_norm": 0.48922353982925415, + "learning_rate": 3.923797116669922e-05, + "loss": 0.2254, + "step": 1115 + }, + { + "epoch": 0.1156596538501399, + "grad_norm": 0.4309007227420807, + "learning_rate": 3.923613456170454e-05, + "loss": 0.2001, + "step": 1116 + }, + { + "epoch": 0.11576329153280132, + "grad_norm": 0.4351019263267517, + "learning_rate": 3.923429578921211e-05, + "loss": 0.2553, + "step": 1117 + }, + { + "epoch": 0.11586692921546274, + "grad_norm": 0.49027588963508606, + "learning_rate": 3.9232454849429144e-05, + "loss": 0.2328, + "step": 1118 + }, + { + "epoch": 0.11597056689812416, + "grad_norm": 0.42478036880493164, + "learning_rate": 3.9230611742563055e-05, + "loss": 0.2166, + "step": 1119 + }, + { + "epoch": 0.11607420458078557, + "grad_norm": 0.5433241128921509, + "learning_rate": 3.922876646882153e-05, + "loss": 0.2613, + "step": 1120 + }, + { + "epoch": 0.11617784226344699, + "grad_norm": 0.3327256739139557, + "learning_rate": 3.9226919028412494e-05, + "loss": 0.1829, + "step": 1121 + }, + { + "epoch": 0.11628147994610841, + "grad_norm": 0.5044670701026917, + "learning_rate": 3.9225069421544113e-05, + "loss": 0.2609, + "step": 1122 + }, + { + "epoch": 0.11638511762876982, + "grad_norm": 0.5583537817001343, + "learning_rate": 3.922321764842479e-05, + "loss": 0.283, + "step": 1123 + }, + { + "epoch": 0.11648875531143124, + "grad_norm": 0.44634732604026794, + "learning_rate": 3.922136370926319e-05, + "loss": 0.2372, + "step": 1124 + }, + { + "epoch": 0.11659239299409266, + "grad_norm": 0.43324288725852966, + "learning_rate": 3.92195076042682e-05, + "loss": 0.2096, + "step": 1125 + }, + { + "epoch": 0.11669603067675406, + "grad_norm": 0.4762877821922302, + "learning_rate": 3.9217649333648984e-05, + "loss": 0.2601, + "step": 1126 + }, + { + "epoch": 0.11679966835941548, + "grad_norm": 0.529367983341217, + "learning_rate": 3.92157888976149e-05, + "loss": 0.2616, + "step": 1127 + }, + { + "epoch": 0.1169033060420769, + "grad_norm": 0.4866292178630829, + "learning_rate": 3.92139262963756e-05, + "loss": 0.2193, + "step": 1128 + }, + { + "epoch": 0.11700694372473831, + "grad_norm": 0.4979463219642639, + "learning_rate": 3.921206153014096e-05, + "loss": 0.2631, + "step": 1129 + }, + { + "epoch": 0.11711058140739973, + "grad_norm": 0.5391631722450256, + "learning_rate": 3.921019459912109e-05, + "loss": 0.2552, + "step": 1130 + }, + { + "epoch": 0.11721421909006115, + "grad_norm": 0.5459678173065186, + "learning_rate": 3.920832550352635e-05, + "loss": 0.2565, + "step": 1131 + }, + { + "epoch": 0.11731785677272256, + "grad_norm": 0.5035747289657593, + "learning_rate": 3.920645424356735e-05, + "loss": 0.2511, + "step": 1132 + }, + { + "epoch": 0.11742149445538398, + "grad_norm": 0.4486197233200073, + "learning_rate": 3.920458081945495e-05, + "loss": 0.227, + "step": 1133 + }, + { + "epoch": 0.1175251321380454, + "grad_norm": 0.4556668996810913, + "learning_rate": 3.9202705231400237e-05, + "loss": 0.2207, + "step": 1134 + }, + { + "epoch": 0.1176287698207068, + "grad_norm": 0.4258916974067688, + "learning_rate": 3.920082747961455e-05, + "loss": 0.242, + "step": 1135 + }, + { + "epoch": 0.11773240750336822, + "grad_norm": 0.48589828610420227, + "learning_rate": 3.919894756430947e-05, + "loss": 0.2494, + "step": 1136 + }, + { + "epoch": 0.11783604518602964, + "grad_norm": 0.4887816309928894, + "learning_rate": 3.919706548569682e-05, + "loss": 0.2222, + "step": 1137 + }, + { + "epoch": 0.11793968286869105, + "grad_norm": 0.48513126373291016, + "learning_rate": 3.9195181243988676e-05, + "loss": 0.2251, + "step": 1138 + }, + { + "epoch": 0.11804332055135247, + "grad_norm": 0.43728193640708923, + "learning_rate": 3.919329483939735e-05, + "loss": 0.2303, + "step": 1139 + }, + { + "epoch": 0.11814695823401389, + "grad_norm": 0.5254392623901367, + "learning_rate": 3.91914062721354e-05, + "loss": 0.3175, + "step": 1140 + }, + { + "epoch": 0.1182505959166753, + "grad_norm": 0.40364351868629456, + "learning_rate": 3.918951554241562e-05, + "loss": 0.2119, + "step": 1141 + }, + { + "epoch": 0.11835423359933672, + "grad_norm": 0.46782156825065613, + "learning_rate": 3.9187622650451065e-05, + "loss": 0.2109, + "step": 1142 + }, + { + "epoch": 0.11845787128199814, + "grad_norm": 0.5325164198875427, + "learning_rate": 3.9185727596455015e-05, + "loss": 0.2793, + "step": 1143 + }, + { + "epoch": 0.11856150896465956, + "grad_norm": 0.48460426926612854, + "learning_rate": 3.9183830380641e-05, + "loss": 0.248, + "step": 1144 + }, + { + "epoch": 0.11866514664732096, + "grad_norm": 0.4963580369949341, + "learning_rate": 3.91819310032228e-05, + "loss": 0.2803, + "step": 1145 + }, + { + "epoch": 0.11876878432998238, + "grad_norm": 0.4866742789745331, + "learning_rate": 3.918002946441444e-05, + "loss": 0.2398, + "step": 1146 + }, + { + "epoch": 0.1188724220126438, + "grad_norm": 0.4291539490222931, + "learning_rate": 3.917812576443017e-05, + "loss": 0.2557, + "step": 1147 + }, + { + "epoch": 0.11897605969530521, + "grad_norm": 0.4939398765563965, + "learning_rate": 3.91762199034845e-05, + "loss": 0.2478, + "step": 1148 + }, + { + "epoch": 0.11907969737796663, + "grad_norm": 0.4453083574771881, + "learning_rate": 3.917431188179219e-05, + "loss": 0.2164, + "step": 1149 + }, + { + "epoch": 0.11918333506062805, + "grad_norm": 0.5078995227813721, + "learning_rate": 3.917240169956822e-05, + "loss": 0.265, + "step": 1150 + }, + { + "epoch": 0.11928697274328946, + "grad_norm": 0.4686465561389923, + "learning_rate": 3.9170489357027827e-05, + "loss": 0.25, + "step": 1151 + }, + { + "epoch": 0.11939061042595088, + "grad_norm": 0.40397870540618896, + "learning_rate": 3.91685748543865e-05, + "loss": 0.2002, + "step": 1152 + }, + { + "epoch": 0.1194942481086123, + "grad_norm": 0.5199308395385742, + "learning_rate": 3.916665819185995e-05, + "loss": 0.271, + "step": 1153 + }, + { + "epoch": 0.1195978857912737, + "grad_norm": 0.5083966255187988, + "learning_rate": 3.916473936966416e-05, + "loss": 0.2282, + "step": 1154 + }, + { + "epoch": 0.11970152347393512, + "grad_norm": 0.4933621287345886, + "learning_rate": 3.9162818388015324e-05, + "loss": 0.2453, + "step": 1155 + }, + { + "epoch": 0.11980516115659653, + "grad_norm": 0.4529334008693695, + "learning_rate": 3.9160895247129905e-05, + "loss": 0.2728, + "step": 1156 + }, + { + "epoch": 0.11990879883925795, + "grad_norm": 0.4741211533546448, + "learning_rate": 3.915896994722458e-05, + "loss": 0.2299, + "step": 1157 + }, + { + "epoch": 0.12001243652191937, + "grad_norm": 0.48024997115135193, + "learning_rate": 3.9157042488516325e-05, + "loss": 0.2603, + "step": 1158 + }, + { + "epoch": 0.12011607420458079, + "grad_norm": 0.501326322555542, + "learning_rate": 3.915511287122229e-05, + "loss": 0.2643, + "step": 1159 + }, + { + "epoch": 0.1202197118872422, + "grad_norm": 0.48749133944511414, + "learning_rate": 3.9153181095559924e-05, + "loss": 0.2703, + "step": 1160 + }, + { + "epoch": 0.12032334956990362, + "grad_norm": 0.505791425704956, + "learning_rate": 3.915124716174688e-05, + "loss": 0.2386, + "step": 1161 + }, + { + "epoch": 0.12042698725256504, + "grad_norm": 0.5771316885948181, + "learning_rate": 3.914931107000107e-05, + "loss": 0.248, + "step": 1162 + }, + { + "epoch": 0.12053062493522645, + "grad_norm": 0.5785459280014038, + "learning_rate": 3.914737282054067e-05, + "loss": 0.2648, + "step": 1163 + }, + { + "epoch": 0.12063426261788786, + "grad_norm": 0.4337507486343384, + "learning_rate": 3.914543241358406e-05, + "loss": 0.24, + "step": 1164 + }, + { + "epoch": 0.12073790030054928, + "grad_norm": 0.3794737756252289, + "learning_rate": 3.9143489849349886e-05, + "loss": 0.2227, + "step": 1165 + }, + { + "epoch": 0.12084153798321069, + "grad_norm": 0.5467318892478943, + "learning_rate": 3.914154512805704e-05, + "loss": 0.3101, + "step": 1166 + }, + { + "epoch": 0.12094517566587211, + "grad_norm": 0.4539571702480316, + "learning_rate": 3.9139598249924635e-05, + "loss": 0.2309, + "step": 1167 + }, + { + "epoch": 0.12104881334853353, + "grad_norm": 0.5110743641853333, + "learning_rate": 3.913764921517207e-05, + "loss": 0.2546, + "step": 1168 + }, + { + "epoch": 0.12115245103119494, + "grad_norm": 0.4412529170513153, + "learning_rate": 3.913569802401892e-05, + "loss": 0.2387, + "step": 1169 + }, + { + "epoch": 0.12125608871385636, + "grad_norm": 0.4804104268550873, + "learning_rate": 3.9133744676685075e-05, + "loss": 0.2543, + "step": 1170 + }, + { + "epoch": 0.12135972639651778, + "grad_norm": 0.4778258502483368, + "learning_rate": 3.913178917339062e-05, + "loss": 0.2648, + "step": 1171 + }, + { + "epoch": 0.1214633640791792, + "grad_norm": 0.48556414246559143, + "learning_rate": 3.912983151435591e-05, + "loss": 0.239, + "step": 1172 + }, + { + "epoch": 0.1215670017618406, + "grad_norm": 0.4241321086883545, + "learning_rate": 3.912787169980152e-05, + "loss": 0.2021, + "step": 1173 + }, + { + "epoch": 0.12167063944450202, + "grad_norm": 0.4774876832962036, + "learning_rate": 3.9125909729948276e-05, + "loss": 0.24, + "step": 1174 + }, + { + "epoch": 0.12177427712716343, + "grad_norm": 0.4637620449066162, + "learning_rate": 3.912394560501726e-05, + "loss": 0.2167, + "step": 1175 + }, + { + "epoch": 0.12187791480982485, + "grad_norm": 0.48373448848724365, + "learning_rate": 3.9121979325229784e-05, + "loss": 0.2424, + "step": 1176 + }, + { + "epoch": 0.12198155249248627, + "grad_norm": 0.49754947423934937, + "learning_rate": 3.91200108908074e-05, + "loss": 0.2698, + "step": 1177 + }, + { + "epoch": 0.12208519017514768, + "grad_norm": 0.46409887075424194, + "learning_rate": 3.911804030197191e-05, + "loss": 0.2258, + "step": 1178 + }, + { + "epoch": 0.1221888278578091, + "grad_norm": 0.56962651014328, + "learning_rate": 3.911606755894536e-05, + "loss": 0.2766, + "step": 1179 + }, + { + "epoch": 0.12229246554047052, + "grad_norm": 0.48047998547554016, + "learning_rate": 3.911409266195003e-05, + "loss": 0.2596, + "step": 1180 + }, + { + "epoch": 0.12239610322313194, + "grad_norm": 0.5867676734924316, + "learning_rate": 3.911211561120846e-05, + "loss": 0.3096, + "step": 1181 + }, + { + "epoch": 0.12249974090579335, + "grad_norm": 0.49909263849258423, + "learning_rate": 3.911013640694341e-05, + "loss": 0.2556, + "step": 1182 + }, + { + "epoch": 0.12260337858845476, + "grad_norm": 0.501254677772522, + "learning_rate": 3.910815504937789e-05, + "loss": 0.2347, + "step": 1183 + }, + { + "epoch": 0.12270701627111617, + "grad_norm": 0.5377936959266663, + "learning_rate": 3.910617153873517e-05, + "loss": 0.2529, + "step": 1184 + }, + { + "epoch": 0.12281065395377759, + "grad_norm": 0.5332674384117126, + "learning_rate": 3.910418587523874e-05, + "loss": 0.3185, + "step": 1185 + }, + { + "epoch": 0.12291429163643901, + "grad_norm": 0.4071737825870514, + "learning_rate": 3.910219805911234e-05, + "loss": 0.2396, + "step": 1186 + }, + { + "epoch": 0.12301792931910042, + "grad_norm": 0.5417530536651611, + "learning_rate": 3.910020809057997e-05, + "loss": 0.2836, + "step": 1187 + }, + { + "epoch": 0.12312156700176184, + "grad_norm": 0.44844117760658264, + "learning_rate": 3.909821596986584e-05, + "loss": 0.2133, + "step": 1188 + }, + { + "epoch": 0.12322520468442326, + "grad_norm": 0.41205480694770813, + "learning_rate": 3.909622169719442e-05, + "loss": 0.2244, + "step": 1189 + }, + { + "epoch": 0.12332884236708468, + "grad_norm": 0.49824705719947815, + "learning_rate": 3.909422527279042e-05, + "loss": 0.2742, + "step": 1190 + }, + { + "epoch": 0.1234324800497461, + "grad_norm": 0.4864721894264221, + "learning_rate": 3.909222669687881e-05, + "loss": 0.2334, + "step": 1191 + }, + { + "epoch": 0.1235361177324075, + "grad_norm": 0.3949887454509735, + "learning_rate": 3.909022596968477e-05, + "loss": 0.1864, + "step": 1192 + }, + { + "epoch": 0.12363975541506891, + "grad_norm": 0.6071784496307373, + "learning_rate": 3.908822309143374e-05, + "loss": 0.3133, + "step": 1193 + }, + { + "epoch": 0.12374339309773033, + "grad_norm": 0.4521191120147705, + "learning_rate": 3.908621806235141e-05, + "loss": 0.2416, + "step": 1194 + }, + { + "epoch": 0.12384703078039175, + "grad_norm": 0.5484431385993958, + "learning_rate": 3.9084210882663695e-05, + "loss": 0.2785, + "step": 1195 + }, + { + "epoch": 0.12395066846305317, + "grad_norm": 0.5148352980613708, + "learning_rate": 3.908220155259677e-05, + "loss": 0.2188, + "step": 1196 + }, + { + "epoch": 0.12405430614571458, + "grad_norm": 0.43146973848342896, + "learning_rate": 3.908019007237703e-05, + "loss": 0.1878, + "step": 1197 + }, + { + "epoch": 0.124157943828376, + "grad_norm": 0.5318440794944763, + "learning_rate": 3.907817644223114e-05, + "loss": 0.2551, + "step": 1198 + }, + { + "epoch": 0.12426158151103742, + "grad_norm": 0.4649069905281067, + "learning_rate": 3.9076160662385986e-05, + "loss": 0.2022, + "step": 1199 + }, + { + "epoch": 0.12436521919369883, + "grad_norm": 0.4180242717266083, + "learning_rate": 3.9074142733068704e-05, + "loss": 0.2165, + "step": 1200 + }, + { + "epoch": 0.12446885687636025, + "grad_norm": 0.5106714963912964, + "learning_rate": 3.907212265450666e-05, + "loss": 0.2603, + "step": 1201 + }, + { + "epoch": 0.12457249455902165, + "grad_norm": 0.507351815700531, + "learning_rate": 3.90701004269275e-05, + "loss": 0.284, + "step": 1202 + }, + { + "epoch": 0.12467613224168307, + "grad_norm": 0.5832152366638184, + "learning_rate": 3.906807605055906e-05, + "loss": 0.3114, + "step": 1203 + }, + { + "epoch": 0.12477976992434449, + "grad_norm": 0.5098874568939209, + "learning_rate": 3.906604952562945e-05, + "loss": 0.2527, + "step": 1204 + }, + { + "epoch": 0.1248834076070059, + "grad_norm": 0.47632232308387756, + "learning_rate": 3.9064020852367024e-05, + "loss": 0.2398, + "step": 1205 + }, + { + "epoch": 0.12498704528966732, + "grad_norm": 0.515520453453064, + "learning_rate": 3.906199003100036e-05, + "loss": 0.2738, + "step": 1206 + }, + { + "epoch": 0.12509068297232873, + "grad_norm": 0.5277115702629089, + "learning_rate": 3.905995706175829e-05, + "loss": 0.2652, + "step": 1207 + }, + { + "epoch": 0.12519432065499014, + "grad_norm": 0.44119536876678467, + "learning_rate": 3.9057921944869896e-05, + "loss": 0.2407, + "step": 1208 + }, + { + "epoch": 0.12529795833765156, + "grad_norm": 0.5089069604873657, + "learning_rate": 3.9055884680564474e-05, + "loss": 0.2819, + "step": 1209 + }, + { + "epoch": 0.12540159602031298, + "grad_norm": 0.5063955783843994, + "learning_rate": 3.9053845269071595e-05, + "loss": 0.2752, + "step": 1210 + }, + { + "epoch": 0.1255052337029744, + "grad_norm": 0.4868340790271759, + "learning_rate": 3.905180371062105e-05, + "loss": 0.272, + "step": 1211 + }, + { + "epoch": 0.1256088713856358, + "grad_norm": 0.4226343035697937, + "learning_rate": 3.9049760005442875e-05, + "loss": 0.2112, + "step": 1212 + }, + { + "epoch": 0.12571250906829723, + "grad_norm": 0.41173243522644043, + "learning_rate": 3.904771415376736e-05, + "loss": 0.2205, + "step": 1213 + }, + { + "epoch": 0.12581614675095865, + "grad_norm": 0.48055100440979004, + "learning_rate": 3.9045666155825024e-05, + "loss": 0.2855, + "step": 1214 + }, + { + "epoch": 0.12591978443362006, + "grad_norm": 0.45464372634887695, + "learning_rate": 3.904361601184663e-05, + "loss": 0.2378, + "step": 1215 + }, + { + "epoch": 0.12602342211628148, + "grad_norm": 0.5273199081420898, + "learning_rate": 3.904156372206319e-05, + "loss": 0.3178, + "step": 1216 + }, + { + "epoch": 0.1261270597989429, + "grad_norm": 0.4722169041633606, + "learning_rate": 3.903950928670595e-05, + "loss": 0.2473, + "step": 1217 + }, + { + "epoch": 0.12623069748160431, + "grad_norm": 0.4227035343647003, + "learning_rate": 3.90374527060064e-05, + "loss": 0.2393, + "step": 1218 + }, + { + "epoch": 0.12633433516426573, + "grad_norm": 0.45303425192832947, + "learning_rate": 3.9035393980196274e-05, + "loss": 0.2462, + "step": 1219 + }, + { + "epoch": 0.12643797284692715, + "grad_norm": 0.3675241768360138, + "learning_rate": 3.903333310950755e-05, + "loss": 0.1741, + "step": 1220 + }, + { + "epoch": 0.12654161052958857, + "grad_norm": 0.39168018102645874, + "learning_rate": 3.903127009417244e-05, + "loss": 0.2223, + "step": 1221 + }, + { + "epoch": 0.12664524821224998, + "grad_norm": 0.4818548858165741, + "learning_rate": 3.902920493442339e-05, + "loss": 0.2515, + "step": 1222 + }, + { + "epoch": 0.1267488858949114, + "grad_norm": 0.412105530500412, + "learning_rate": 3.9027137630493114e-05, + "loss": 0.1881, + "step": 1223 + }, + { + "epoch": 0.12685252357757282, + "grad_norm": 0.49911603331565857, + "learning_rate": 3.902506818261455e-05, + "loss": 0.28, + "step": 1224 + }, + { + "epoch": 0.12695616126023423, + "grad_norm": 0.4557046592235565, + "learning_rate": 3.902299659102088e-05, + "loss": 0.2285, + "step": 1225 + }, + { + "epoch": 0.12705979894289562, + "grad_norm": 0.4454115331172943, + "learning_rate": 3.902092285594552e-05, + "loss": 0.2487, + "step": 1226 + }, + { + "epoch": 0.12716343662555704, + "grad_norm": 0.4897434711456299, + "learning_rate": 3.9018846977622143e-05, + "loss": 0.2826, + "step": 1227 + }, + { + "epoch": 0.12726707430821846, + "grad_norm": 0.5356862545013428, + "learning_rate": 3.901676895628466e-05, + "loss": 0.304, + "step": 1228 + }, + { + "epoch": 0.12737071199087988, + "grad_norm": 0.4942310154438019, + "learning_rate": 3.9014688792167206e-05, + "loss": 0.2785, + "step": 1229 + }, + { + "epoch": 0.1274743496735413, + "grad_norm": 0.45313510298728943, + "learning_rate": 3.901260648550418e-05, + "loss": 0.2448, + "step": 1230 + }, + { + "epoch": 0.1275779873562027, + "grad_norm": 0.41935995221138, + "learning_rate": 3.901052203653021e-05, + "loss": 0.2032, + "step": 1231 + }, + { + "epoch": 0.12768162503886413, + "grad_norm": 0.3813270032405853, + "learning_rate": 3.900843544548017e-05, + "loss": 0.194, + "step": 1232 + }, + { + "epoch": 0.12778526272152554, + "grad_norm": 0.4859898090362549, + "learning_rate": 3.900634671258917e-05, + "loss": 0.2654, + "step": 1233 + }, + { + "epoch": 0.12788890040418696, + "grad_norm": 0.49495765566825867, + "learning_rate": 3.900425583809258e-05, + "loss": 0.2822, + "step": 1234 + }, + { + "epoch": 0.12799253808684838, + "grad_norm": 0.4281288683414459, + "learning_rate": 3.9002162822225975e-05, + "loss": 0.2209, + "step": 1235 + }, + { + "epoch": 0.1280961757695098, + "grad_norm": 0.4575282037258148, + "learning_rate": 3.900006766522521e-05, + "loss": 0.2194, + "step": 1236 + }, + { + "epoch": 0.1281998134521712, + "grad_norm": 0.551102340221405, + "learning_rate": 3.899797036732635e-05, + "loss": 0.2612, + "step": 1237 + }, + { + "epoch": 0.12830345113483263, + "grad_norm": 0.44362667202949524, + "learning_rate": 3.899587092876572e-05, + "loss": 0.2357, + "step": 1238 + }, + { + "epoch": 0.12840708881749405, + "grad_norm": 0.42648833990097046, + "learning_rate": 3.899376934977989e-05, + "loss": 0.2324, + "step": 1239 + }, + { + "epoch": 0.12851072650015546, + "grad_norm": 0.48824894428253174, + "learning_rate": 3.899166563060565e-05, + "loss": 0.2273, + "step": 1240 + }, + { + "epoch": 0.12861436418281688, + "grad_norm": 0.48277342319488525, + "learning_rate": 3.898955977148005e-05, + "loss": 0.2452, + "step": 1241 + }, + { + "epoch": 0.1287180018654783, + "grad_norm": 0.5302169919013977, + "learning_rate": 3.8987451772640386e-05, + "loss": 0.2873, + "step": 1242 + }, + { + "epoch": 0.12882163954813972, + "grad_norm": 0.45799848437309265, + "learning_rate": 3.898534163432416e-05, + "loss": 0.2306, + "step": 1243 + }, + { + "epoch": 0.12892527723080113, + "grad_norm": 0.42073294520378113, + "learning_rate": 3.8983229356769155e-05, + "loss": 0.2101, + "step": 1244 + }, + { + "epoch": 0.12902891491346252, + "grad_norm": 0.4514122009277344, + "learning_rate": 3.898111494021338e-05, + "loss": 0.2231, + "step": 1245 + }, + { + "epoch": 0.12913255259612394, + "grad_norm": 0.4837682545185089, + "learning_rate": 3.897899838489507e-05, + "loss": 0.2748, + "step": 1246 + }, + { + "epoch": 0.12923619027878536, + "grad_norm": 0.5559665560722351, + "learning_rate": 3.8976879691052743e-05, + "loss": 0.2748, + "step": 1247 + }, + { + "epoch": 0.12933982796144677, + "grad_norm": 0.4597959816455841, + "learning_rate": 3.89747588589251e-05, + "loss": 0.2271, + "step": 1248 + }, + { + "epoch": 0.1294434656441082, + "grad_norm": 0.45351025462150574, + "learning_rate": 3.8972635888751125e-05, + "loss": 0.2592, + "step": 1249 + }, + { + "epoch": 0.1295471033267696, + "grad_norm": 0.42491164803504944, + "learning_rate": 3.897051078077003e-05, + "loss": 0.2205, + "step": 1250 + }, + { + "epoch": 0.12965074100943103, + "grad_norm": 0.4073490798473358, + "learning_rate": 3.896838353522128e-05, + "loss": 0.2481, + "step": 1251 + }, + { + "epoch": 0.12975437869209244, + "grad_norm": 0.43321773409843445, + "learning_rate": 3.8966254152344555e-05, + "loss": 0.2368, + "step": 1252 + }, + { + "epoch": 0.12985801637475386, + "grad_norm": 0.4953174591064453, + "learning_rate": 3.89641226323798e-05, + "loss": 0.2316, + "step": 1253 + }, + { + "epoch": 0.12996165405741528, + "grad_norm": 0.487069308757782, + "learning_rate": 3.896198897556718e-05, + "loss": 0.2518, + "step": 1254 + }, + { + "epoch": 0.1300652917400767, + "grad_norm": 0.44341564178466797, + "learning_rate": 3.895985318214712e-05, + "loss": 0.2243, + "step": 1255 + }, + { + "epoch": 0.1301689294227381, + "grad_norm": 0.49199700355529785, + "learning_rate": 3.895771525236028e-05, + "loss": 0.2572, + "step": 1256 + }, + { + "epoch": 0.13027256710539953, + "grad_norm": 0.4807712733745575, + "learning_rate": 3.895557518644756e-05, + "loss": 0.2587, + "step": 1257 + }, + { + "epoch": 0.13037620478806095, + "grad_norm": 0.48649948835372925, + "learning_rate": 3.8953432984650085e-05, + "loss": 0.2514, + "step": 1258 + }, + { + "epoch": 0.13047984247072236, + "grad_norm": 0.46023765206336975, + "learning_rate": 3.895128864720925e-05, + "loss": 0.2664, + "step": 1259 + }, + { + "epoch": 0.13058348015338378, + "grad_norm": 0.45767319202423096, + "learning_rate": 3.894914217436667e-05, + "loss": 0.2184, + "step": 1260 + }, + { + "epoch": 0.1306871178360452, + "grad_norm": 0.4645031988620758, + "learning_rate": 3.894699356636421e-05, + "loss": 0.2444, + "step": 1261 + }, + { + "epoch": 0.13079075551870661, + "grad_norm": 0.556087076663971, + "learning_rate": 3.894484282344396e-05, + "loss": 0.2425, + "step": 1262 + }, + { + "epoch": 0.13089439320136803, + "grad_norm": 0.4621123969554901, + "learning_rate": 3.8942689945848284e-05, + "loss": 0.2066, + "step": 1263 + }, + { + "epoch": 0.13099803088402942, + "grad_norm": 0.5394175052642822, + "learning_rate": 3.8940534933819744e-05, + "loss": 0.2577, + "step": 1264 + }, + { + "epoch": 0.13110166856669084, + "grad_norm": 0.42368635535240173, + "learning_rate": 3.893837778760117e-05, + "loss": 0.2162, + "step": 1265 + }, + { + "epoch": 0.13120530624935225, + "grad_norm": 0.4939993917942047, + "learning_rate": 3.893621850743563e-05, + "loss": 0.2421, + "step": 1266 + }, + { + "epoch": 0.13130894393201367, + "grad_norm": 0.48513132333755493, + "learning_rate": 3.893405709356642e-05, + "loss": 0.2505, + "step": 1267 + }, + { + "epoch": 0.1314125816146751, + "grad_norm": 0.4326397180557251, + "learning_rate": 3.89318935462371e-05, + "loss": 0.1924, + "step": 1268 + }, + { + "epoch": 0.1315162192973365, + "grad_norm": 0.49334537982940674, + "learning_rate": 3.8929727865691445e-05, + "loss": 0.2633, + "step": 1269 + }, + { + "epoch": 0.13161985697999792, + "grad_norm": 0.4784174859523773, + "learning_rate": 3.892756005217347e-05, + "loss": 0.2485, + "step": 1270 + }, + { + "epoch": 0.13172349466265934, + "grad_norm": 0.5221570134162903, + "learning_rate": 3.892539010592746e-05, + "loss": 0.2777, + "step": 1271 + }, + { + "epoch": 0.13182713234532076, + "grad_norm": 0.5145605802536011, + "learning_rate": 3.892321802719791e-05, + "loss": 0.2546, + "step": 1272 + }, + { + "epoch": 0.13193077002798217, + "grad_norm": 0.5601503849029541, + "learning_rate": 3.892104381622957e-05, + "loss": 0.2766, + "step": 1273 + }, + { + "epoch": 0.1320344077106436, + "grad_norm": 0.5030035972595215, + "learning_rate": 3.891886747326743e-05, + "loss": 0.2316, + "step": 1274 + }, + { + "epoch": 0.132138045393305, + "grad_norm": 0.4209156930446625, + "learning_rate": 3.891668899855671e-05, + "loss": 0.2388, + "step": 1275 + }, + { + "epoch": 0.13224168307596643, + "grad_norm": 0.4128512740135193, + "learning_rate": 3.891450839234288e-05, + "loss": 0.1973, + "step": 1276 + }, + { + "epoch": 0.13234532075862784, + "grad_norm": 0.3734406530857086, + "learning_rate": 3.891232565487164e-05, + "loss": 0.1845, + "step": 1277 + }, + { + "epoch": 0.13244895844128926, + "grad_norm": 0.4477710425853729, + "learning_rate": 3.891014078638896e-05, + "loss": 0.2195, + "step": 1278 + }, + { + "epoch": 0.13255259612395068, + "grad_norm": 0.43623194098472595, + "learning_rate": 3.890795378714101e-05, + "loss": 0.2471, + "step": 1279 + }, + { + "epoch": 0.1326562338066121, + "grad_norm": 0.571938693523407, + "learning_rate": 3.890576465737421e-05, + "loss": 0.2825, + "step": 1280 + }, + { + "epoch": 0.1327598714892735, + "grad_norm": 0.4960414171218872, + "learning_rate": 3.890357339733524e-05, + "loss": 0.2314, + "step": 1281 + }, + { + "epoch": 0.13286350917193493, + "grad_norm": 0.4360418915748596, + "learning_rate": 3.890138000727101e-05, + "loss": 0.1954, + "step": 1282 + }, + { + "epoch": 0.13296714685459632, + "grad_norm": 0.49753016233444214, + "learning_rate": 3.8899184487428665e-05, + "loss": 0.2618, + "step": 1283 + }, + { + "epoch": 0.13307078453725774, + "grad_norm": 0.492436021566391, + "learning_rate": 3.889698683805559e-05, + "loss": 0.2932, + "step": 1284 + }, + { + "epoch": 0.13317442221991915, + "grad_norm": 0.5095851421356201, + "learning_rate": 3.889478705939941e-05, + "loss": 0.2694, + "step": 1285 + }, + { + "epoch": 0.13327805990258057, + "grad_norm": 0.47561195492744446, + "learning_rate": 3.8892585151708005e-05, + "loss": 0.2354, + "step": 1286 + }, + { + "epoch": 0.133381697585242, + "grad_norm": 0.5100952386856079, + "learning_rate": 3.8890381115229465e-05, + "loss": 0.286, + "step": 1287 + }, + { + "epoch": 0.1334853352679034, + "grad_norm": 0.3915499448776245, + "learning_rate": 3.888817495021215e-05, + "loss": 0.1861, + "step": 1288 + }, + { + "epoch": 0.13358897295056482, + "grad_norm": 0.4240892827510834, + "learning_rate": 3.8885966656904646e-05, + "loss": 0.2148, + "step": 1289 + }, + { + "epoch": 0.13369261063322624, + "grad_norm": 0.492962121963501, + "learning_rate": 3.888375623555578e-05, + "loss": 0.286, + "step": 1290 + }, + { + "epoch": 0.13379624831588766, + "grad_norm": 0.4343388080596924, + "learning_rate": 3.888154368641461e-05, + "loss": 0.2041, + "step": 1291 + }, + { + "epoch": 0.13389988599854907, + "grad_norm": 0.46965283155441284, + "learning_rate": 3.887932900973045e-05, + "loss": 0.2112, + "step": 1292 + }, + { + "epoch": 0.1340035236812105, + "grad_norm": 0.48123374581336975, + "learning_rate": 3.887711220575285e-05, + "loss": 0.2624, + "step": 1293 + }, + { + "epoch": 0.1341071613638719, + "grad_norm": 0.5123097896575928, + "learning_rate": 3.887489327473159e-05, + "loss": 0.2465, + "step": 1294 + }, + { + "epoch": 0.13421079904653332, + "grad_norm": 0.5587708353996277, + "learning_rate": 3.8872672216916696e-05, + "loss": 0.2606, + "step": 1295 + }, + { + "epoch": 0.13431443672919474, + "grad_norm": 0.5073254108428955, + "learning_rate": 3.8870449032558436e-05, + "loss": 0.2617, + "step": 1296 + }, + { + "epoch": 0.13441807441185616, + "grad_norm": 0.4759473204612732, + "learning_rate": 3.8868223721907314e-05, + "loss": 0.2218, + "step": 1297 + }, + { + "epoch": 0.13452171209451758, + "grad_norm": 0.4606642425060272, + "learning_rate": 3.886599628521407e-05, + "loss": 0.2088, + "step": 1298 + }, + { + "epoch": 0.134625349777179, + "grad_norm": 0.4292793869972229, + "learning_rate": 3.8863766722729696e-05, + "loss": 0.1981, + "step": 1299 + }, + { + "epoch": 0.1347289874598404, + "grad_norm": 0.4475545585155487, + "learning_rate": 3.8861535034705416e-05, + "loss": 0.2544, + "step": 1300 + }, + { + "epoch": 0.13483262514250183, + "grad_norm": 0.4809749722480774, + "learning_rate": 3.8859301221392686e-05, + "loss": 0.2726, + "step": 1301 + }, + { + "epoch": 0.13493626282516322, + "grad_norm": 0.5430611371994019, + "learning_rate": 3.885706528304321e-05, + "loss": 0.2832, + "step": 1302 + }, + { + "epoch": 0.13503990050782463, + "grad_norm": 0.5341582894325256, + "learning_rate": 3.885482721990893e-05, + "loss": 0.2874, + "step": 1303 + }, + { + "epoch": 0.13514353819048605, + "grad_norm": 0.4888921082019806, + "learning_rate": 3.885258703224204e-05, + "loss": 0.2659, + "step": 1304 + }, + { + "epoch": 0.13524717587314747, + "grad_norm": 0.4214054346084595, + "learning_rate": 3.8850344720294934e-05, + "loss": 0.1975, + "step": 1305 + }, + { + "epoch": 0.13535081355580889, + "grad_norm": 0.5062821507453918, + "learning_rate": 3.8848100284320295e-05, + "loss": 0.2452, + "step": 1306 + }, + { + "epoch": 0.1354544512384703, + "grad_norm": 0.4390522837638855, + "learning_rate": 3.884585372457102e-05, + "loss": 0.2504, + "step": 1307 + }, + { + "epoch": 0.13555808892113172, + "grad_norm": 0.46084293723106384, + "learning_rate": 3.8843605041300245e-05, + "loss": 0.225, + "step": 1308 + }, + { + "epoch": 0.13566172660379314, + "grad_norm": 0.40920358896255493, + "learning_rate": 3.884135423476134e-05, + "loss": 0.2164, + "step": 1309 + }, + { + "epoch": 0.13576536428645455, + "grad_norm": 0.4455513656139374, + "learning_rate": 3.8839101305207934e-05, + "loss": 0.2211, + "step": 1310 + }, + { + "epoch": 0.13586900196911597, + "grad_norm": 0.42182791233062744, + "learning_rate": 3.883684625289388e-05, + "loss": 0.2196, + "step": 1311 + }, + { + "epoch": 0.1359726396517774, + "grad_norm": 0.47031956911087036, + "learning_rate": 3.883458907807327e-05, + "loss": 0.2434, + "step": 1312 + }, + { + "epoch": 0.1360762773344388, + "grad_norm": 0.46593621373176575, + "learning_rate": 3.883232978100044e-05, + "loss": 0.253, + "step": 1313 + }, + { + "epoch": 0.13617991501710022, + "grad_norm": 0.49692028760910034, + "learning_rate": 3.883006836192997e-05, + "loss": 0.2765, + "step": 1314 + }, + { + "epoch": 0.13628355269976164, + "grad_norm": 0.5057439208030701, + "learning_rate": 3.882780482111666e-05, + "loss": 0.2238, + "step": 1315 + }, + { + "epoch": 0.13638719038242306, + "grad_norm": 0.4612136781215668, + "learning_rate": 3.882553915881558e-05, + "loss": 0.2381, + "step": 1316 + }, + { + "epoch": 0.13649082806508447, + "grad_norm": 0.5236420631408691, + "learning_rate": 3.8823271375282005e-05, + "loss": 0.2598, + "step": 1317 + }, + { + "epoch": 0.1365944657477459, + "grad_norm": 0.5677441358566284, + "learning_rate": 3.882100147077148e-05, + "loss": 0.3423, + "step": 1318 + }, + { + "epoch": 0.1366981034304073, + "grad_norm": 0.48475411534309387, + "learning_rate": 3.8818729445539765e-05, + "loss": 0.2229, + "step": 1319 + }, + { + "epoch": 0.13680174111306873, + "grad_norm": 0.5205956697463989, + "learning_rate": 3.881645529984287e-05, + "loss": 0.2704, + "step": 1320 + }, + { + "epoch": 0.13690537879573011, + "grad_norm": 0.43500372767448425, + "learning_rate": 3.881417903393704e-05, + "loss": 0.1902, + "step": 1321 + }, + { + "epoch": 0.13700901647839153, + "grad_norm": 0.46301934123039246, + "learning_rate": 3.8811900648078766e-05, + "loss": 0.2265, + "step": 1322 + }, + { + "epoch": 0.13711265416105295, + "grad_norm": 0.5650634169578552, + "learning_rate": 3.880962014252477e-05, + "loss": 0.287, + "step": 1323 + }, + { + "epoch": 0.13721629184371437, + "grad_norm": 0.59044349193573, + "learning_rate": 3.880733751753202e-05, + "loss": 0.2648, + "step": 1324 + }, + { + "epoch": 0.13731992952637578, + "grad_norm": 0.4861128628253937, + "learning_rate": 3.880505277335771e-05, + "loss": 0.278, + "step": 1325 + }, + { + "epoch": 0.1374235672090372, + "grad_norm": 0.48721516132354736, + "learning_rate": 3.880276591025929e-05, + "loss": 0.2859, + "step": 1326 + }, + { + "epoch": 0.13752720489169862, + "grad_norm": 0.4308353662490845, + "learning_rate": 3.880047692849443e-05, + "loss": 0.1972, + "step": 1327 + }, + { + "epoch": 0.13763084257436003, + "grad_norm": 0.44840529561042786, + "learning_rate": 3.8798185828321064e-05, + "loss": 0.2119, + "step": 1328 + }, + { + "epoch": 0.13773448025702145, + "grad_norm": 0.4419281780719757, + "learning_rate": 3.879589260999734e-05, + "loss": 0.2327, + "step": 1329 + }, + { + "epoch": 0.13783811793968287, + "grad_norm": 0.41590288281440735, + "learning_rate": 3.8793597273781654e-05, + "loss": 0.2311, + "step": 1330 + }, + { + "epoch": 0.1379417556223443, + "grad_norm": 0.51755690574646, + "learning_rate": 3.879129981993265e-05, + "loss": 0.2652, + "step": 1331 + }, + { + "epoch": 0.1380453933050057, + "grad_norm": 0.5011487007141113, + "learning_rate": 3.878900024870918e-05, + "loss": 0.284, + "step": 1332 + }, + { + "epoch": 0.13814903098766712, + "grad_norm": 0.4875727593898773, + "learning_rate": 3.8786698560370374e-05, + "loss": 0.266, + "step": 1333 + }, + { + "epoch": 0.13825266867032854, + "grad_norm": 0.4918694496154785, + "learning_rate": 3.878439475517558e-05, + "loss": 0.2329, + "step": 1334 + }, + { + "epoch": 0.13835630635298996, + "grad_norm": 0.49377235770225525, + "learning_rate": 3.878208883338439e-05, + "loss": 0.2486, + "step": 1335 + }, + { + "epoch": 0.13845994403565137, + "grad_norm": 0.4716956913471222, + "learning_rate": 3.8779780795256625e-05, + "loss": 0.2515, + "step": 1336 + }, + { + "epoch": 0.1385635817183128, + "grad_norm": 0.44217389822006226, + "learning_rate": 3.877747064105235e-05, + "loss": 0.2106, + "step": 1337 + }, + { + "epoch": 0.1386672194009742, + "grad_norm": 0.464129239320755, + "learning_rate": 3.877515837103188e-05, + "loss": 0.2344, + "step": 1338 + }, + { + "epoch": 0.13877085708363562, + "grad_norm": 0.5682270526885986, + "learning_rate": 3.877284398545575e-05, + "loss": 0.3267, + "step": 1339 + }, + { + "epoch": 0.138874494766297, + "grad_norm": 0.48125720024108887, + "learning_rate": 3.8770527484584735e-05, + "loss": 0.251, + "step": 1340 + }, + { + "epoch": 0.13897813244895843, + "grad_norm": 0.5513635873794556, + "learning_rate": 3.876820886867987e-05, + "loss": 0.2555, + "step": 1341 + }, + { + "epoch": 0.13908177013161985, + "grad_norm": 0.4538300037384033, + "learning_rate": 3.87658881380024e-05, + "loss": 0.2604, + "step": 1342 + }, + { + "epoch": 0.13918540781428126, + "grad_norm": 0.45465579628944397, + "learning_rate": 3.876356529281383e-05, + "loss": 0.2267, + "step": 1343 + }, + { + "epoch": 0.13928904549694268, + "grad_norm": 0.4481503963470459, + "learning_rate": 3.876124033337589e-05, + "loss": 0.2248, + "step": 1344 + }, + { + "epoch": 0.1393926831796041, + "grad_norm": 0.493288516998291, + "learning_rate": 3.875891325995056e-05, + "loss": 0.2517, + "step": 1345 + }, + { + "epoch": 0.13949632086226552, + "grad_norm": 0.5286715626716614, + "learning_rate": 3.875658407280004e-05, + "loss": 0.257, + "step": 1346 + }, + { + "epoch": 0.13959995854492693, + "grad_norm": 0.4413294196128845, + "learning_rate": 3.875425277218678e-05, + "loss": 0.2114, + "step": 1347 + }, + { + "epoch": 0.13970359622758835, + "grad_norm": 0.4887286126613617, + "learning_rate": 3.875191935837348e-05, + "loss": 0.2617, + "step": 1348 + }, + { + "epoch": 0.13980723391024977, + "grad_norm": 0.42799079418182373, + "learning_rate": 3.874958383162305e-05, + "loss": 0.1822, + "step": 1349 + }, + { + "epoch": 0.13991087159291118, + "grad_norm": 0.5083425045013428, + "learning_rate": 3.874724619219867e-05, + "loss": 0.2603, + "step": 1350 + }, + { + "epoch": 0.1400145092755726, + "grad_norm": 0.4632699489593506, + "learning_rate": 3.874490644036373e-05, + "loss": 0.2258, + "step": 1351 + }, + { + "epoch": 0.14011814695823402, + "grad_norm": 0.4966972768306732, + "learning_rate": 3.8742564576381864e-05, + "loss": 0.248, + "step": 1352 + }, + { + "epoch": 0.14022178464089544, + "grad_norm": 0.4451618194580078, + "learning_rate": 3.8740220600516964e-05, + "loss": 0.2373, + "step": 1353 + }, + { + "epoch": 0.14032542232355685, + "grad_norm": 0.46346795558929443, + "learning_rate": 3.8737874513033134e-05, + "loss": 0.2441, + "step": 1354 + }, + { + "epoch": 0.14042906000621827, + "grad_norm": 0.43191859126091003, + "learning_rate": 3.8735526314194735e-05, + "loss": 0.2372, + "step": 1355 + }, + { + "epoch": 0.1405326976888797, + "grad_norm": 0.4335680902004242, + "learning_rate": 3.873317600426636e-05, + "loss": 0.2367, + "step": 1356 + }, + { + "epoch": 0.1406363353715411, + "grad_norm": 0.49184444546699524, + "learning_rate": 3.873082358351283e-05, + "loss": 0.2548, + "step": 1357 + }, + { + "epoch": 0.14073997305420252, + "grad_norm": 0.48698222637176514, + "learning_rate": 3.8728469052199214e-05, + "loss": 0.2618, + "step": 1358 + }, + { + "epoch": 0.1408436107368639, + "grad_norm": 0.48168468475341797, + "learning_rate": 3.872611241059083e-05, + "loss": 0.2138, + "step": 1359 + }, + { + "epoch": 0.14094724841952533, + "grad_norm": 0.4432443678379059, + "learning_rate": 3.872375365895319e-05, + "loss": 0.2345, + "step": 1360 + }, + { + "epoch": 0.14105088610218675, + "grad_norm": 0.4656221866607666, + "learning_rate": 3.872139279755211e-05, + "loss": 0.2455, + "step": 1361 + }, + { + "epoch": 0.14115452378484816, + "grad_norm": 0.5141227841377258, + "learning_rate": 3.8719029826653584e-05, + "loss": 0.2426, + "step": 1362 + }, + { + "epoch": 0.14125816146750958, + "grad_norm": 0.4511367678642273, + "learning_rate": 3.8716664746523885e-05, + "loss": 0.232, + "step": 1363 + }, + { + "epoch": 0.141361799150171, + "grad_norm": 0.44538968801498413, + "learning_rate": 3.87142975574295e-05, + "loss": 0.2215, + "step": 1364 + }, + { + "epoch": 0.14146543683283241, + "grad_norm": 0.4484039843082428, + "learning_rate": 3.871192825963714e-05, + "loss": 0.2219, + "step": 1365 + }, + { + "epoch": 0.14156907451549383, + "grad_norm": 0.4748300015926361, + "learning_rate": 3.870955685341381e-05, + "loss": 0.2509, + "step": 1366 + }, + { + "epoch": 0.14167271219815525, + "grad_norm": 0.541286826133728, + "learning_rate": 3.870718333902669e-05, + "loss": 0.2251, + "step": 1367 + }, + { + "epoch": 0.14177634988081667, + "grad_norm": 0.4920642375946045, + "learning_rate": 3.8704807716743235e-05, + "loss": 0.2539, + "step": 1368 + }, + { + "epoch": 0.14187998756347808, + "grad_norm": 0.43952393531799316, + "learning_rate": 3.8702429986831124e-05, + "loss": 0.2329, + "step": 1369 + }, + { + "epoch": 0.1419836252461395, + "grad_norm": 0.48505479097366333, + "learning_rate": 3.870005014955827e-05, + "loss": 0.263, + "step": 1370 + }, + { + "epoch": 0.14208726292880092, + "grad_norm": 0.4841783940792084, + "learning_rate": 3.8697668205192843e-05, + "loss": 0.2223, + "step": 1371 + }, + { + "epoch": 0.14219090061146233, + "grad_norm": 0.4626272916793823, + "learning_rate": 3.8695284154003226e-05, + "loss": 0.2635, + "step": 1372 + }, + { + "epoch": 0.14229453829412375, + "grad_norm": 0.4150180518627167, + "learning_rate": 3.869289799625805e-05, + "loss": 0.1935, + "step": 1373 + }, + { + "epoch": 0.14239817597678517, + "grad_norm": 0.44310134649276733, + "learning_rate": 3.8690509732226186e-05, + "loss": 0.2141, + "step": 1374 + }, + { + "epoch": 0.14250181365944659, + "grad_norm": 0.4911743700504303, + "learning_rate": 3.8688119362176745e-05, + "loss": 0.212, + "step": 1375 + }, + { + "epoch": 0.142605451342108, + "grad_norm": 0.5100070834159851, + "learning_rate": 3.868572688637906e-05, + "loss": 0.2713, + "step": 1376 + }, + { + "epoch": 0.14270908902476942, + "grad_norm": 0.4721531271934509, + "learning_rate": 3.868333230510273e-05, + "loss": 0.2523, + "step": 1377 + }, + { + "epoch": 0.1428127267074308, + "grad_norm": 0.550628125667572, + "learning_rate": 3.868093561861755e-05, + "loss": 0.2788, + "step": 1378 + }, + { + "epoch": 0.14291636439009223, + "grad_norm": 0.5342934131622314, + "learning_rate": 3.867853682719358e-05, + "loss": 0.2644, + "step": 1379 + }, + { + "epoch": 0.14302000207275364, + "grad_norm": 0.47953546047210693, + "learning_rate": 3.8676135931101126e-05, + "loss": 0.2527, + "step": 1380 + }, + { + "epoch": 0.14312363975541506, + "grad_norm": 0.5149561762809753, + "learning_rate": 3.86737329306107e-05, + "loss": 0.2622, + "step": 1381 + }, + { + "epoch": 0.14322727743807648, + "grad_norm": 0.4426271915435791, + "learning_rate": 3.867132782599308e-05, + "loss": 0.2703, + "step": 1382 + }, + { + "epoch": 0.1433309151207379, + "grad_norm": 0.5571516156196594, + "learning_rate": 3.8668920617519267e-05, + "loss": 0.3005, + "step": 1383 + }, + { + "epoch": 0.1434345528033993, + "grad_norm": 0.5235035419464111, + "learning_rate": 3.86665113054605e-05, + "loss": 0.2747, + "step": 1384 + }, + { + "epoch": 0.14353819048606073, + "grad_norm": 0.4032425284385681, + "learning_rate": 3.8664099890088254e-05, + "loss": 0.2243, + "step": 1385 + }, + { + "epoch": 0.14364182816872215, + "grad_norm": 0.47944414615631104, + "learning_rate": 3.866168637167425e-05, + "loss": 0.2625, + "step": 1386 + }, + { + "epoch": 0.14374546585138356, + "grad_norm": 0.4225427210330963, + "learning_rate": 3.865927075049043e-05, + "loss": 0.2474, + "step": 1387 + }, + { + "epoch": 0.14384910353404498, + "grad_norm": 0.5522814393043518, + "learning_rate": 3.8656853026808997e-05, + "loss": 0.2607, + "step": 1388 + }, + { + "epoch": 0.1439527412167064, + "grad_norm": 0.4542481601238251, + "learning_rate": 3.8654433200902355e-05, + "loss": 0.2425, + "step": 1389 + }, + { + "epoch": 0.14405637889936782, + "grad_norm": 0.4231477677822113, + "learning_rate": 3.865201127304319e-05, + "loss": 0.2069, + "step": 1390 + }, + { + "epoch": 0.14416001658202923, + "grad_norm": 0.49372538924217224, + "learning_rate": 3.864958724350438e-05, + "loss": 0.2639, + "step": 1391 + }, + { + "epoch": 0.14426365426469065, + "grad_norm": 0.5042855739593506, + "learning_rate": 3.864716111255908e-05, + "loss": 0.296, + "step": 1392 + }, + { + "epoch": 0.14436729194735207, + "grad_norm": 0.47383350133895874, + "learning_rate": 3.864473288048065e-05, + "loss": 0.2672, + "step": 1393 + }, + { + "epoch": 0.14447092963001348, + "grad_norm": 0.4303416907787323, + "learning_rate": 3.8642302547542704e-05, + "loss": 0.1998, + "step": 1394 + }, + { + "epoch": 0.1445745673126749, + "grad_norm": 0.35755103826522827, + "learning_rate": 3.863987011401909e-05, + "loss": 0.1763, + "step": 1395 + }, + { + "epoch": 0.14467820499533632, + "grad_norm": 0.4051773250102997, + "learning_rate": 3.863743558018388e-05, + "loss": 0.2044, + "step": 1396 + }, + { + "epoch": 0.1447818426779977, + "grad_norm": 0.5011558532714844, + "learning_rate": 3.86349989463114e-05, + "loss": 0.2867, + "step": 1397 + }, + { + "epoch": 0.14488548036065912, + "grad_norm": 0.4668574631214142, + "learning_rate": 3.8632560212676215e-05, + "loss": 0.2441, + "step": 1398 + }, + { + "epoch": 0.14498911804332054, + "grad_norm": 0.43723446130752563, + "learning_rate": 3.863011937955311e-05, + "loss": 0.223, + "step": 1399 + }, + { + "epoch": 0.14509275572598196, + "grad_norm": 0.4947415888309479, + "learning_rate": 3.862767644721711e-05, + "loss": 0.2607, + "step": 1400 + }, + { + "epoch": 0.14519639340864338, + "grad_norm": 0.4283679723739624, + "learning_rate": 3.8625231415943486e-05, + "loss": 0.2271, + "step": 1401 + }, + { + "epoch": 0.1453000310913048, + "grad_norm": 0.40165016055107117, + "learning_rate": 3.8622784286007744e-05, + "loss": 0.2107, + "step": 1402 + }, + { + "epoch": 0.1454036687739662, + "grad_norm": 0.4284477233886719, + "learning_rate": 3.8620335057685616e-05, + "loss": 0.2341, + "step": 1403 + }, + { + "epoch": 0.14550730645662763, + "grad_norm": 0.44606518745422363, + "learning_rate": 3.861788373125308e-05, + "loss": 0.2221, + "step": 1404 + }, + { + "epoch": 0.14561094413928904, + "grad_norm": 0.5379354357719421, + "learning_rate": 3.861543030698634e-05, + "loss": 0.2498, + "step": 1405 + }, + { + "epoch": 0.14571458182195046, + "grad_norm": 0.4723847210407257, + "learning_rate": 3.861297478516186e-05, + "loss": 0.2483, + "step": 1406 + }, + { + "epoch": 0.14581821950461188, + "grad_norm": 0.4253641664981842, + "learning_rate": 3.861051716605631e-05, + "loss": 0.2081, + "step": 1407 + }, + { + "epoch": 0.1459218571872733, + "grad_norm": 0.42284271121025085, + "learning_rate": 3.860805744994662e-05, + "loss": 0.2198, + "step": 1408 + }, + { + "epoch": 0.1460254948699347, + "grad_norm": 0.49065491557121277, + "learning_rate": 3.860559563710994e-05, + "loss": 0.2668, + "step": 1409 + }, + { + "epoch": 0.14612913255259613, + "grad_norm": 0.4915112555027008, + "learning_rate": 3.860313172782367e-05, + "loss": 0.2572, + "step": 1410 + }, + { + "epoch": 0.14623277023525755, + "grad_norm": 0.5048714280128479, + "learning_rate": 3.8600665722365434e-05, + "loss": 0.2325, + "step": 1411 + }, + { + "epoch": 0.14633640791791896, + "grad_norm": 0.45734068751335144, + "learning_rate": 3.8598197621013106e-05, + "loss": 0.2633, + "step": 1412 + }, + { + "epoch": 0.14644004560058038, + "grad_norm": 0.44115594029426575, + "learning_rate": 3.859572742404477e-05, + "loss": 0.2272, + "step": 1413 + }, + { + "epoch": 0.1465436832832418, + "grad_norm": 0.4850781559944153, + "learning_rate": 3.859325513173878e-05, + "loss": 0.2463, + "step": 1414 + }, + { + "epoch": 0.14664732096590322, + "grad_norm": 0.4548441171646118, + "learning_rate": 3.8590780744373715e-05, + "loss": 0.2373, + "step": 1415 + }, + { + "epoch": 0.1467509586485646, + "grad_norm": 0.4415612816810608, + "learning_rate": 3.8588304262228365e-05, + "loss": 0.2392, + "step": 1416 + }, + { + "epoch": 0.14685459633122602, + "grad_norm": 0.5979070067405701, + "learning_rate": 3.858582568558179e-05, + "loss": 0.2893, + "step": 1417 + }, + { + "epoch": 0.14695823401388744, + "grad_norm": 0.5068161487579346, + "learning_rate": 3.8583345014713264e-05, + "loss": 0.2626, + "step": 1418 + }, + { + "epoch": 0.14706187169654886, + "grad_norm": 0.4130820333957672, + "learning_rate": 3.8580862249902324e-05, + "loss": 0.2448, + "step": 1419 + }, + { + "epoch": 0.14716550937921027, + "grad_norm": 0.4339570999145508, + "learning_rate": 3.85783773914287e-05, + "loss": 0.2182, + "step": 1420 + }, + { + "epoch": 0.1472691470618717, + "grad_norm": 0.4481070041656494, + "learning_rate": 3.857589043957239e-05, + "loss": 0.2051, + "step": 1421 + }, + { + "epoch": 0.1473727847445331, + "grad_norm": 0.4021722972393036, + "learning_rate": 3.8573401394613624e-05, + "loss": 0.1988, + "step": 1422 + }, + { + "epoch": 0.14747642242719453, + "grad_norm": 0.5416178107261658, + "learning_rate": 3.8570910256832866e-05, + "loss": 0.2911, + "step": 1423 + }, + { + "epoch": 0.14758006010985594, + "grad_norm": 0.4623812139034271, + "learning_rate": 3.85684170265108e-05, + "loss": 0.2437, + "step": 1424 + }, + { + "epoch": 0.14768369779251736, + "grad_norm": 0.5158494114875793, + "learning_rate": 3.856592170392838e-05, + "loss": 0.2771, + "step": 1425 + }, + { + "epoch": 0.14778733547517878, + "grad_norm": 0.3807986080646515, + "learning_rate": 3.856342428936675e-05, + "loss": 0.2136, + "step": 1426 + }, + { + "epoch": 0.1478909731578402, + "grad_norm": 0.4274327754974365, + "learning_rate": 3.856092478310734e-05, + "loss": 0.2196, + "step": 1427 + }, + { + "epoch": 0.1479946108405016, + "grad_norm": 0.46692219376564026, + "learning_rate": 3.855842318543178e-05, + "loss": 0.2327, + "step": 1428 + }, + { + "epoch": 0.14809824852316303, + "grad_norm": 0.504139244556427, + "learning_rate": 3.855591949662194e-05, + "loss": 0.2566, + "step": 1429 + }, + { + "epoch": 0.14820188620582445, + "grad_norm": 0.49943894147872925, + "learning_rate": 3.855341371695994e-05, + "loss": 0.2472, + "step": 1430 + }, + { + "epoch": 0.14830552388848586, + "grad_norm": 0.4539870321750641, + "learning_rate": 3.855090584672812e-05, + "loss": 0.2612, + "step": 1431 + }, + { + "epoch": 0.14840916157114728, + "grad_norm": 0.4134329855442047, + "learning_rate": 3.854839588620907e-05, + "loss": 0.2064, + "step": 1432 + }, + { + "epoch": 0.1485127992538087, + "grad_norm": 0.4111216068267822, + "learning_rate": 3.8545883835685606e-05, + "loss": 0.2015, + "step": 1433 + }, + { + "epoch": 0.14861643693647011, + "grad_norm": 0.4707288444042206, + "learning_rate": 3.854336969544078e-05, + "loss": 0.243, + "step": 1434 + }, + { + "epoch": 0.1487200746191315, + "grad_norm": 0.44714653491973877, + "learning_rate": 3.8540853465757885e-05, + "loss": 0.2217, + "step": 1435 + }, + { + "epoch": 0.14882371230179292, + "grad_norm": 0.5175408124923706, + "learning_rate": 3.853833514692044e-05, + "loss": 0.2918, + "step": 1436 + }, + { + "epoch": 0.14892734998445434, + "grad_norm": 0.4449577033519745, + "learning_rate": 3.853581473921221e-05, + "loss": 0.2179, + "step": 1437 + }, + { + "epoch": 0.14903098766711576, + "grad_norm": 0.45618268847465515, + "learning_rate": 3.8533292242917195e-05, + "loss": 0.2558, + "step": 1438 + }, + { + "epoch": 0.14913462534977717, + "grad_norm": 0.499709814786911, + "learning_rate": 3.8530767658319614e-05, + "loss": 0.2354, + "step": 1439 + }, + { + "epoch": 0.1492382630324386, + "grad_norm": 0.5203845500946045, + "learning_rate": 3.852824098570394e-05, + "loss": 0.2389, + "step": 1440 + }, + { + "epoch": 0.1493419007151, + "grad_norm": 0.4207056164741516, + "learning_rate": 3.852571222535487e-05, + "loss": 0.2087, + "step": 1441 + }, + { + "epoch": 0.14944553839776142, + "grad_norm": 0.494806706905365, + "learning_rate": 3.852318137755736e-05, + "loss": 0.2368, + "step": 1442 + }, + { + "epoch": 0.14954917608042284, + "grad_norm": 0.5283109545707703, + "learning_rate": 3.852064844259656e-05, + "loss": 0.2782, + "step": 1443 + }, + { + "epoch": 0.14965281376308426, + "grad_norm": 0.5322892069816589, + "learning_rate": 3.851811342075788e-05, + "loss": 0.235, + "step": 1444 + }, + { + "epoch": 0.14975645144574568, + "grad_norm": 0.48319143056869507, + "learning_rate": 3.851557631232697e-05, + "loss": 0.2362, + "step": 1445 + }, + { + "epoch": 0.1498600891284071, + "grad_norm": 0.4722559452056885, + "learning_rate": 3.851303711758971e-05, + "loss": 0.2772, + "step": 1446 + }, + { + "epoch": 0.1499637268110685, + "grad_norm": 0.447968989610672, + "learning_rate": 3.85104958368322e-05, + "loss": 0.2705, + "step": 1447 + }, + { + "epoch": 0.15006736449372993, + "grad_norm": 0.49089664220809937, + "learning_rate": 3.8507952470340794e-05, + "loss": 0.2811, + "step": 1448 + }, + { + "epoch": 0.15017100217639134, + "grad_norm": 0.45806270837783813, + "learning_rate": 3.8505407018402073e-05, + "loss": 0.2484, + "step": 1449 + }, + { + "epoch": 0.15027463985905276, + "grad_norm": 0.5139738321304321, + "learning_rate": 3.850285948130286e-05, + "loss": 0.2482, + "step": 1450 + }, + { + "epoch": 0.15037827754171418, + "grad_norm": 0.4105122685432434, + "learning_rate": 3.8500309859330205e-05, + "loss": 0.2106, + "step": 1451 + }, + { + "epoch": 0.1504819152243756, + "grad_norm": 0.41013506054878235, + "learning_rate": 3.849775815277139e-05, + "loss": 0.1953, + "step": 1452 + }, + { + "epoch": 0.150585552907037, + "grad_norm": 0.527901828289032, + "learning_rate": 3.8495204361913944e-05, + "loss": 0.2527, + "step": 1453 + }, + { + "epoch": 0.1506891905896984, + "grad_norm": 0.5156935453414917, + "learning_rate": 3.849264848704563e-05, + "loss": 0.2418, + "step": 1454 + }, + { + "epoch": 0.15079282827235982, + "grad_norm": 0.5491344928741455, + "learning_rate": 3.8490090528454415e-05, + "loss": 0.276, + "step": 1455 + }, + { + "epoch": 0.15089646595502124, + "grad_norm": 0.4275567829608917, + "learning_rate": 3.8487530486428554e-05, + "loss": 0.2141, + "step": 1456 + }, + { + "epoch": 0.15100010363768265, + "grad_norm": 0.4257248342037201, + "learning_rate": 3.84849683612565e-05, + "loss": 0.236, + "step": 1457 + }, + { + "epoch": 0.15110374132034407, + "grad_norm": 0.43031546473503113, + "learning_rate": 3.848240415322693e-05, + "loss": 0.2132, + "step": 1458 + }, + { + "epoch": 0.1512073790030055, + "grad_norm": 0.445788711309433, + "learning_rate": 3.847983786262881e-05, + "loss": 0.2503, + "step": 1459 + }, + { + "epoch": 0.1513110166856669, + "grad_norm": 0.49891895055770874, + "learning_rate": 3.847726948975128e-05, + "loss": 0.2545, + "step": 1460 + }, + { + "epoch": 0.15141465436832832, + "grad_norm": 0.4228783845901489, + "learning_rate": 3.847469903488375e-05, + "loss": 0.2335, + "step": 1461 + }, + { + "epoch": 0.15151829205098974, + "grad_norm": 0.4258784055709839, + "learning_rate": 3.847212649831585e-05, + "loss": 0.2165, + "step": 1462 + }, + { + "epoch": 0.15162192973365116, + "grad_norm": 0.5519275069236755, + "learning_rate": 3.846955188033745e-05, + "loss": 0.2555, + "step": 1463 + }, + { + "epoch": 0.15172556741631257, + "grad_norm": 0.43952473998069763, + "learning_rate": 3.846697518123866e-05, + "loss": 0.2355, + "step": 1464 + }, + { + "epoch": 0.151829205098974, + "grad_norm": 0.494721919298172, + "learning_rate": 3.8464396401309804e-05, + "loss": 0.2615, + "step": 1465 + }, + { + "epoch": 0.1519328427816354, + "grad_norm": 0.5367245078086853, + "learning_rate": 3.846181554084147e-05, + "loss": 0.2833, + "step": 1466 + }, + { + "epoch": 0.15203648046429682, + "grad_norm": 0.519638180732727, + "learning_rate": 3.845923260012446e-05, + "loss": 0.3263, + "step": 1467 + }, + { + "epoch": 0.15214011814695824, + "grad_norm": 0.4717256724834442, + "learning_rate": 3.845664757944983e-05, + "loss": 0.2725, + "step": 1468 + }, + { + "epoch": 0.15224375582961966, + "grad_norm": 0.47035694122314453, + "learning_rate": 3.845406047910883e-05, + "loss": 0.2707, + "step": 1469 + }, + { + "epoch": 0.15234739351228108, + "grad_norm": 0.5218305587768555, + "learning_rate": 3.845147129939298e-05, + "loss": 0.2632, + "step": 1470 + }, + { + "epoch": 0.1524510311949425, + "grad_norm": 0.5742754936218262, + "learning_rate": 3.844888004059403e-05, + "loss": 0.2487, + "step": 1471 + }, + { + "epoch": 0.1525546688776039, + "grad_norm": 0.4790658950805664, + "learning_rate": 3.844628670300396e-05, + "loss": 0.2402, + "step": 1472 + }, + { + "epoch": 0.1526583065602653, + "grad_norm": 0.4891185760498047, + "learning_rate": 3.844369128691497e-05, + "loss": 0.2472, + "step": 1473 + }, + { + "epoch": 0.15276194424292672, + "grad_norm": 0.4714306890964508, + "learning_rate": 3.844109379261953e-05, + "loss": 0.2062, + "step": 1474 + }, + { + "epoch": 0.15286558192558813, + "grad_norm": 0.46967020630836487, + "learning_rate": 3.84384942204103e-05, + "loss": 0.1977, + "step": 1475 + }, + { + "epoch": 0.15296921960824955, + "grad_norm": 0.46074581146240234, + "learning_rate": 3.8435892570580205e-05, + "loss": 0.2474, + "step": 1476 + }, + { + "epoch": 0.15307285729091097, + "grad_norm": 0.5243543386459351, + "learning_rate": 3.8433288843422395e-05, + "loss": 0.2718, + "step": 1477 + }, + { + "epoch": 0.15317649497357239, + "grad_norm": 0.4539581835269928, + "learning_rate": 3.8430683039230254e-05, + "loss": 0.2315, + "step": 1478 + }, + { + "epoch": 0.1532801326562338, + "grad_norm": 0.5196117162704468, + "learning_rate": 3.8428075158297404e-05, + "loss": 0.2527, + "step": 1479 + }, + { + "epoch": 0.15338377033889522, + "grad_norm": 0.482964426279068, + "learning_rate": 3.842546520091769e-05, + "loss": 0.2847, + "step": 1480 + }, + { + "epoch": 0.15348740802155664, + "grad_norm": 0.5170326828956604, + "learning_rate": 3.8422853167385195e-05, + "loss": 0.2953, + "step": 1481 + }, + { + "epoch": 0.15359104570421805, + "grad_norm": 0.48660704493522644, + "learning_rate": 3.842023905799425e-05, + "loss": 0.2518, + "step": 1482 + }, + { + "epoch": 0.15369468338687947, + "grad_norm": 0.46241337060928345, + "learning_rate": 3.8417622873039396e-05, + "loss": 0.2286, + "step": 1483 + }, + { + "epoch": 0.1537983210695409, + "grad_norm": 0.44095274806022644, + "learning_rate": 3.841500461281543e-05, + "loss": 0.2328, + "step": 1484 + }, + { + "epoch": 0.1539019587522023, + "grad_norm": 0.4544067978858948, + "learning_rate": 3.8412384277617374e-05, + "loss": 0.2414, + "step": 1485 + }, + { + "epoch": 0.15400559643486372, + "grad_norm": 0.38432058691978455, + "learning_rate": 3.8409761867740476e-05, + "loss": 0.1946, + "step": 1486 + }, + { + "epoch": 0.15410923411752514, + "grad_norm": 0.44219785928726196, + "learning_rate": 3.840713738348023e-05, + "loss": 0.232, + "step": 1487 + }, + { + "epoch": 0.15421287180018656, + "grad_norm": 0.5125858783721924, + "learning_rate": 3.840451082513236e-05, + "loss": 0.2364, + "step": 1488 + }, + { + "epoch": 0.15431650948284797, + "grad_norm": 0.5096924901008606, + "learning_rate": 3.840188219299282e-05, + "loss": 0.2639, + "step": 1489 + }, + { + "epoch": 0.1544201471655094, + "grad_norm": 0.4817996919155121, + "learning_rate": 3.83992514873578e-05, + "loss": 0.2794, + "step": 1490 + }, + { + "epoch": 0.1545237848481708, + "grad_norm": 0.46409645676612854, + "learning_rate": 3.839661870852372e-05, + "loss": 0.2414, + "step": 1491 + }, + { + "epoch": 0.1546274225308322, + "grad_norm": 0.500841498374939, + "learning_rate": 3.839398385678725e-05, + "loss": 0.231, + "step": 1492 + }, + { + "epoch": 0.15473106021349362, + "grad_norm": 0.5692920684814453, + "learning_rate": 3.839134693244527e-05, + "loss": 0.2904, + "step": 1493 + }, + { + "epoch": 0.15483469789615503, + "grad_norm": 0.49656811356544495, + "learning_rate": 3.8388707935794905e-05, + "loss": 0.2348, + "step": 1494 + }, + { + "epoch": 0.15493833557881645, + "grad_norm": 0.40897035598754883, + "learning_rate": 3.8386066867133515e-05, + "loss": 0.2214, + "step": 1495 + }, + { + "epoch": 0.15504197326147787, + "grad_norm": 0.48259782791137695, + "learning_rate": 3.83834237267587e-05, + "loss": 0.2226, + "step": 1496 + }, + { + "epoch": 0.15514561094413928, + "grad_norm": 0.532426655292511, + "learning_rate": 3.838077851496827e-05, + "loss": 0.2935, + "step": 1497 + }, + { + "epoch": 0.1552492486268007, + "grad_norm": 0.4772799611091614, + "learning_rate": 3.837813123206029e-05, + "loss": 0.2373, + "step": 1498 + }, + { + "epoch": 0.15535288630946212, + "grad_norm": 0.47948452830314636, + "learning_rate": 3.837548187833306e-05, + "loss": 0.2564, + "step": 1499 + }, + { + "epoch": 0.15545652399212354, + "grad_norm": 0.4946514070034027, + "learning_rate": 3.8372830454085095e-05, + "loss": 0.2392, + "step": 1500 + }, + { + "epoch": 0.15556016167478495, + "grad_norm": 0.4094104766845703, + "learning_rate": 3.8370176959615154e-05, + "loss": 0.1878, + "step": 1501 + }, + { + "epoch": 0.15566379935744637, + "grad_norm": 0.4557799994945526, + "learning_rate": 3.8367521395222225e-05, + "loss": 0.2277, + "step": 1502 + }, + { + "epoch": 0.1557674370401078, + "grad_norm": 0.3870164453983307, + "learning_rate": 3.8364863761205546e-05, + "loss": 0.1776, + "step": 1503 + }, + { + "epoch": 0.1558710747227692, + "grad_norm": 0.5103594660758972, + "learning_rate": 3.836220405786456e-05, + "loss": 0.2714, + "step": 1504 + }, + { + "epoch": 0.15597471240543062, + "grad_norm": 0.5301050543785095, + "learning_rate": 3.8359542285498966e-05, + "loss": 0.2546, + "step": 1505 + }, + { + "epoch": 0.15607835008809204, + "grad_norm": 0.4772056043148041, + "learning_rate": 3.8356878444408696e-05, + "loss": 0.2327, + "step": 1506 + }, + { + "epoch": 0.15618198777075346, + "grad_norm": 0.5653616786003113, + "learning_rate": 3.835421253489389e-05, + "loss": 0.2944, + "step": 1507 + }, + { + "epoch": 0.15628562545341487, + "grad_norm": 0.5195934772491455, + "learning_rate": 3.8351544557254954e-05, + "loss": 0.2897, + "step": 1508 + }, + { + "epoch": 0.1563892631360763, + "grad_norm": 0.4947530925273895, + "learning_rate": 3.83488745117925e-05, + "loss": 0.2477, + "step": 1509 + }, + { + "epoch": 0.1564929008187377, + "grad_norm": 0.519658088684082, + "learning_rate": 3.834620239880739e-05, + "loss": 0.2648, + "step": 1510 + }, + { + "epoch": 0.1565965385013991, + "grad_norm": 0.46499353647232056, + "learning_rate": 3.834352821860072e-05, + "loss": 0.2663, + "step": 1511 + }, + { + "epoch": 0.1567001761840605, + "grad_norm": 0.5055245161056519, + "learning_rate": 3.83408519714738e-05, + "loss": 0.2683, + "step": 1512 + }, + { + "epoch": 0.15680381386672193, + "grad_norm": 0.5055966377258301, + "learning_rate": 3.833817365772819e-05, + "loss": 0.2382, + "step": 1513 + }, + { + "epoch": 0.15690745154938335, + "grad_norm": 0.43308383226394653, + "learning_rate": 3.833549327766569e-05, + "loss": 0.2109, + "step": 1514 + }, + { + "epoch": 0.15701108923204476, + "grad_norm": 0.4320563077926636, + "learning_rate": 3.83328108315883e-05, + "loss": 0.201, + "step": 1515 + }, + { + "epoch": 0.15711472691470618, + "grad_norm": 0.5178250670433044, + "learning_rate": 3.833012631979829e-05, + "loss": 0.2529, + "step": 1516 + }, + { + "epoch": 0.1572183645973676, + "grad_norm": 0.4101965129375458, + "learning_rate": 3.832743974259814e-05, + "loss": 0.2078, + "step": 1517 + }, + { + "epoch": 0.15732200228002902, + "grad_norm": 0.5203193426132202, + "learning_rate": 3.832475110029056e-05, + "loss": 0.2528, + "step": 1518 + }, + { + "epoch": 0.15742563996269043, + "grad_norm": 0.4788103401660919, + "learning_rate": 3.8322060393178526e-05, + "loss": 0.2372, + "step": 1519 + }, + { + "epoch": 0.15752927764535185, + "grad_norm": 0.49796929955482483, + "learning_rate": 3.8319367621565205e-05, + "loss": 0.2327, + "step": 1520 + }, + { + "epoch": 0.15763291532801327, + "grad_norm": 0.5318668484687805, + "learning_rate": 3.831667278575402e-05, + "loss": 0.2643, + "step": 1521 + }, + { + "epoch": 0.15773655301067468, + "grad_norm": 0.4808575510978699, + "learning_rate": 3.831397588604861e-05, + "loss": 0.2411, + "step": 1522 + }, + { + "epoch": 0.1578401906933361, + "grad_norm": 0.5338442325592041, + "learning_rate": 3.8311276922752876e-05, + "loss": 0.2625, + "step": 1523 + }, + { + "epoch": 0.15794382837599752, + "grad_norm": 0.5726016759872437, + "learning_rate": 3.830857589617092e-05, + "loss": 0.2925, + "step": 1524 + }, + { + "epoch": 0.15804746605865894, + "grad_norm": 0.49443721771240234, + "learning_rate": 3.83058728066071e-05, + "loss": 0.2573, + "step": 1525 + }, + { + "epoch": 0.15815110374132035, + "grad_norm": 0.5603029131889343, + "learning_rate": 3.830316765436598e-05, + "loss": 0.2656, + "step": 1526 + }, + { + "epoch": 0.15825474142398177, + "grad_norm": 0.4532327950000763, + "learning_rate": 3.830046043975239e-05, + "loss": 0.2201, + "step": 1527 + }, + { + "epoch": 0.1583583791066432, + "grad_norm": 0.45034271478652954, + "learning_rate": 3.8297751163071356e-05, + "loss": 0.2703, + "step": 1528 + }, + { + "epoch": 0.1584620167893046, + "grad_norm": 0.49351736903190613, + "learning_rate": 3.829503982462817e-05, + "loss": 0.2187, + "step": 1529 + }, + { + "epoch": 0.158565654471966, + "grad_norm": 0.46013131737709045, + "learning_rate": 3.8292326424728344e-05, + "loss": 0.252, + "step": 1530 + }, + { + "epoch": 0.1586692921546274, + "grad_norm": 0.4857384264469147, + "learning_rate": 3.8289610963677605e-05, + "loss": 0.2406, + "step": 1531 + }, + { + "epoch": 0.15877292983728883, + "grad_norm": 0.38991162180900574, + "learning_rate": 3.8286893441781935e-05, + "loss": 0.1895, + "step": 1532 + }, + { + "epoch": 0.15887656751995025, + "grad_norm": 0.435508668422699, + "learning_rate": 3.828417385934754e-05, + "loss": 0.2135, + "step": 1533 + }, + { + "epoch": 0.15898020520261166, + "grad_norm": 0.5303035378456116, + "learning_rate": 3.828145221668086e-05, + "loss": 0.2434, + "step": 1534 + }, + { + "epoch": 0.15908384288527308, + "grad_norm": 0.46581482887268066, + "learning_rate": 3.827872851408856e-05, + "loss": 0.2462, + "step": 1535 + }, + { + "epoch": 0.1591874805679345, + "grad_norm": 0.5437631607055664, + "learning_rate": 3.827600275187755e-05, + "loss": 0.2885, + "step": 1536 + }, + { + "epoch": 0.15929111825059591, + "grad_norm": 0.45981255173683167, + "learning_rate": 3.8273274930354955e-05, + "loss": 0.2248, + "step": 1537 + }, + { + "epoch": 0.15939475593325733, + "grad_norm": 0.4518927037715912, + "learning_rate": 3.827054504982815e-05, + "loss": 0.2211, + "step": 1538 + }, + { + "epoch": 0.15949839361591875, + "grad_norm": 0.44678646326065063, + "learning_rate": 3.826781311060473e-05, + "loss": 0.2235, + "step": 1539 + }, + { + "epoch": 0.15960203129858017, + "grad_norm": 0.45804503560066223, + "learning_rate": 3.8265079112992525e-05, + "loss": 0.2477, + "step": 1540 + }, + { + "epoch": 0.15970566898124158, + "grad_norm": 0.46019431948661804, + "learning_rate": 3.82623430572996e-05, + "loss": 0.2451, + "step": 1541 + }, + { + "epoch": 0.159809306663903, + "grad_norm": 0.5183833837509155, + "learning_rate": 3.825960494383426e-05, + "loss": 0.2567, + "step": 1542 + }, + { + "epoch": 0.15991294434656442, + "grad_norm": 0.49382057785987854, + "learning_rate": 3.8256864772905006e-05, + "loss": 0.2484, + "step": 1543 + }, + { + "epoch": 0.16001658202922583, + "grad_norm": 0.4504323899745941, + "learning_rate": 3.8254122544820615e-05, + "loss": 0.2157, + "step": 1544 + }, + { + "epoch": 0.16012021971188725, + "grad_norm": 0.46211907267570496, + "learning_rate": 3.825137825989007e-05, + "loss": 0.244, + "step": 1545 + }, + { + "epoch": 0.16022385739454867, + "grad_norm": 0.4525067210197449, + "learning_rate": 3.8248631918422595e-05, + "loss": 0.2706, + "step": 1546 + }, + { + "epoch": 0.16032749507721009, + "grad_norm": 0.42060381174087524, + "learning_rate": 3.8245883520727646e-05, + "loss": 0.217, + "step": 1547 + }, + { + "epoch": 0.1604311327598715, + "grad_norm": 0.46641871333122253, + "learning_rate": 3.82431330671149e-05, + "loss": 0.2388, + "step": 1548 + }, + { + "epoch": 0.1605347704425329, + "grad_norm": 0.45688316226005554, + "learning_rate": 3.824038055789429e-05, + "loss": 0.2185, + "step": 1549 + }, + { + "epoch": 0.1606384081251943, + "grad_norm": 0.5488545298576355, + "learning_rate": 3.823762599337595e-05, + "loss": 0.2564, + "step": 1550 + }, + { + "epoch": 0.16074204580785573, + "grad_norm": 0.4734223186969757, + "learning_rate": 3.823486937387026e-05, + "loss": 0.2141, + "step": 1551 + }, + { + "epoch": 0.16084568349051714, + "grad_norm": 0.4606945514678955, + "learning_rate": 3.8232110699687836e-05, + "loss": 0.2681, + "step": 1552 + }, + { + "epoch": 0.16094932117317856, + "grad_norm": 0.49000391364097595, + "learning_rate": 3.822934997113953e-05, + "loss": 0.252, + "step": 1553 + }, + { + "epoch": 0.16105295885583998, + "grad_norm": 0.4744507074356079, + "learning_rate": 3.82265871885364e-05, + "loss": 0.248, + "step": 1554 + }, + { + "epoch": 0.1611565965385014, + "grad_norm": 0.5346882939338684, + "learning_rate": 3.822382235218975e-05, + "loss": 0.3036, + "step": 1555 + }, + { + "epoch": 0.1612602342211628, + "grad_norm": 0.4564545154571533, + "learning_rate": 3.822105546241114e-05, + "loss": 0.2214, + "step": 1556 + }, + { + "epoch": 0.16136387190382423, + "grad_norm": 0.4706862270832062, + "learning_rate": 3.821828651951232e-05, + "loss": 0.2465, + "step": 1557 + }, + { + "epoch": 0.16146750958648565, + "grad_norm": 0.41739633679389954, + "learning_rate": 3.82155155238053e-05, + "loss": 0.2372, + "step": 1558 + }, + { + "epoch": 0.16157114726914706, + "grad_norm": 0.4622024595737457, + "learning_rate": 3.82127424756023e-05, + "loss": 0.2483, + "step": 1559 + }, + { + "epoch": 0.16167478495180848, + "grad_norm": 0.42721524834632874, + "learning_rate": 3.82099673752158e-05, + "loss": 0.2631, + "step": 1560 + }, + { + "epoch": 0.1617784226344699, + "grad_norm": 0.5179228782653809, + "learning_rate": 3.8207190222958474e-05, + "loss": 0.2988, + "step": 1561 + }, + { + "epoch": 0.16188206031713132, + "grad_norm": 0.5048723816871643, + "learning_rate": 3.820441101914327e-05, + "loss": 0.2608, + "step": 1562 + }, + { + "epoch": 0.16198569799979273, + "grad_norm": 0.49215126037597656, + "learning_rate": 3.820162976408332e-05, + "loss": 0.2485, + "step": 1563 + }, + { + "epoch": 0.16208933568245415, + "grad_norm": 0.5016050934791565, + "learning_rate": 3.819884645809203e-05, + "loss": 0.2456, + "step": 1564 + }, + { + "epoch": 0.16219297336511557, + "grad_norm": 0.42429667711257935, + "learning_rate": 3.819606110148301e-05, + "loss": 0.1731, + "step": 1565 + }, + { + "epoch": 0.16229661104777698, + "grad_norm": 0.4097731411457062, + "learning_rate": 3.8193273694570105e-05, + "loss": 0.2605, + "step": 1566 + }, + { + "epoch": 0.1624002487304384, + "grad_norm": 0.4741736948490143, + "learning_rate": 3.819048423766741e-05, + "loss": 0.2363, + "step": 1567 + }, + { + "epoch": 0.1625038864130998, + "grad_norm": 0.4571591913700104, + "learning_rate": 3.818769273108923e-05, + "loss": 0.231, + "step": 1568 + }, + { + "epoch": 0.1626075240957612, + "grad_norm": 0.41287946701049805, + "learning_rate": 3.81848991751501e-05, + "loss": 0.1934, + "step": 1569 + }, + { + "epoch": 0.16271116177842262, + "grad_norm": 0.600933313369751, + "learning_rate": 3.81821035701648e-05, + "loss": 0.2542, + "step": 1570 + }, + { + "epoch": 0.16281479946108404, + "grad_norm": 0.4297773540019989, + "learning_rate": 3.817930591644834e-05, + "loss": 0.2204, + "step": 1571 + }, + { + "epoch": 0.16291843714374546, + "grad_norm": 0.43877971172332764, + "learning_rate": 3.817650621431595e-05, + "loss": 0.1925, + "step": 1572 + }, + { + "epoch": 0.16302207482640688, + "grad_norm": 0.4170320928096771, + "learning_rate": 3.817370446408309e-05, + "loss": 0.1872, + "step": 1573 + }, + { + "epoch": 0.1631257125090683, + "grad_norm": 0.5067045092582703, + "learning_rate": 3.817090066606547e-05, + "loss": 0.2353, + "step": 1574 + }, + { + "epoch": 0.1632293501917297, + "grad_norm": 0.5097575187683105, + "learning_rate": 3.8168094820579e-05, + "loss": 0.2528, + "step": 1575 + }, + { + "epoch": 0.16333298787439113, + "grad_norm": 0.49412232637405396, + "learning_rate": 3.816528692793985e-05, + "loss": 0.2501, + "step": 1576 + }, + { + "epoch": 0.16343662555705255, + "grad_norm": 0.4716198444366455, + "learning_rate": 3.816247698846441e-05, + "loss": 0.2394, + "step": 1577 + }, + { + "epoch": 0.16354026323971396, + "grad_norm": 0.5026634931564331, + "learning_rate": 3.81596650024693e-05, + "loss": 0.2521, + "step": 1578 + }, + { + "epoch": 0.16364390092237538, + "grad_norm": 0.4558325409889221, + "learning_rate": 3.815685097027137e-05, + "loss": 0.2268, + "step": 1579 + }, + { + "epoch": 0.1637475386050368, + "grad_norm": 0.4844086766242981, + "learning_rate": 3.8154034892187685e-05, + "loss": 0.242, + "step": 1580 + }, + { + "epoch": 0.1638511762876982, + "grad_norm": 0.43945661187171936, + "learning_rate": 3.8151216768535584e-05, + "loss": 0.2156, + "step": 1581 + }, + { + "epoch": 0.16395481397035963, + "grad_norm": 0.5005728006362915, + "learning_rate": 3.8148396599632585e-05, + "loss": 0.253, + "step": 1582 + }, + { + "epoch": 0.16405845165302105, + "grad_norm": 0.5019499063491821, + "learning_rate": 3.8145574385796475e-05, + "loss": 0.2484, + "step": 1583 + }, + { + "epoch": 0.16416208933568247, + "grad_norm": 0.4787789285182953, + "learning_rate": 3.8142750127345244e-05, + "loss": 0.2561, + "step": 1584 + }, + { + "epoch": 0.16426572701834388, + "grad_norm": 0.49771633744239807, + "learning_rate": 3.813992382459714e-05, + "loss": 0.218, + "step": 1585 + }, + { + "epoch": 0.1643693647010053, + "grad_norm": 0.5096080303192139, + "learning_rate": 3.8137095477870616e-05, + "loss": 0.2274, + "step": 1586 + }, + { + "epoch": 0.1644730023836667, + "grad_norm": 0.46017125248908997, + "learning_rate": 3.8134265087484364e-05, + "loss": 0.2196, + "step": 1587 + }, + { + "epoch": 0.1645766400663281, + "grad_norm": 0.42982977628707886, + "learning_rate": 3.8131432653757315e-05, + "loss": 0.2139, + "step": 1588 + }, + { + "epoch": 0.16468027774898952, + "grad_norm": 0.447560578584671, + "learning_rate": 3.812859817700862e-05, + "loss": 0.228, + "step": 1589 + }, + { + "epoch": 0.16478391543165094, + "grad_norm": 0.5123468041419983, + "learning_rate": 3.812576165755767e-05, + "loss": 0.2573, + "step": 1590 + }, + { + "epoch": 0.16488755311431236, + "grad_norm": 0.5309221148490906, + "learning_rate": 3.8122923095724064e-05, + "loss": 0.2648, + "step": 1591 + }, + { + "epoch": 0.16499119079697377, + "grad_norm": 0.4955211281776428, + "learning_rate": 3.812008249182766e-05, + "loss": 0.2401, + "step": 1592 + }, + { + "epoch": 0.1650948284796352, + "grad_norm": 0.47056087851524353, + "learning_rate": 3.811723984618853e-05, + "loss": 0.2236, + "step": 1593 + }, + { + "epoch": 0.1651984661622966, + "grad_norm": 0.49324530363082886, + "learning_rate": 3.811439515912698e-05, + "loss": 0.2231, + "step": 1594 + }, + { + "epoch": 0.16530210384495803, + "grad_norm": 0.4553638994693756, + "learning_rate": 3.811154843096354e-05, + "loss": 0.215, + "step": 1595 + }, + { + "epoch": 0.16540574152761944, + "grad_norm": 0.4878029525279999, + "learning_rate": 3.810869966201898e-05, + "loss": 0.2443, + "step": 1596 + }, + { + "epoch": 0.16550937921028086, + "grad_norm": 0.49212896823883057, + "learning_rate": 3.8105848852614286e-05, + "loss": 0.2045, + "step": 1597 + }, + { + "epoch": 0.16561301689294228, + "grad_norm": 0.42695632576942444, + "learning_rate": 3.810299600307069e-05, + "loss": 0.2011, + "step": 1598 + }, + { + "epoch": 0.1657166545756037, + "grad_norm": 0.5269631743431091, + "learning_rate": 3.810014111370966e-05, + "loss": 0.2627, + "step": 1599 + }, + { + "epoch": 0.1658202922582651, + "grad_norm": 0.48581254482269287, + "learning_rate": 3.8097284184852853e-05, + "loss": 0.2273, + "step": 1600 + }, + { + "epoch": 0.16592392994092653, + "grad_norm": 0.523431122303009, + "learning_rate": 3.80944252168222e-05, + "loss": 0.2213, + "step": 1601 + }, + { + "epoch": 0.16602756762358795, + "grad_norm": 0.5090680718421936, + "learning_rate": 3.8091564209939834e-05, + "loss": 0.2753, + "step": 1602 + }, + { + "epoch": 0.16613120530624936, + "grad_norm": 0.5586525797843933, + "learning_rate": 3.808870116452815e-05, + "loss": 0.2552, + "step": 1603 + }, + { + "epoch": 0.16623484298891078, + "grad_norm": 0.5396630764007568, + "learning_rate": 3.808583608090974e-05, + "loss": 0.2336, + "step": 1604 + }, + { + "epoch": 0.1663384806715722, + "grad_norm": 0.5063744783401489, + "learning_rate": 3.808296895940742e-05, + "loss": 0.2239, + "step": 1605 + }, + { + "epoch": 0.1664421183542336, + "grad_norm": 0.45250818133354187, + "learning_rate": 3.808009980034428e-05, + "loss": 0.244, + "step": 1606 + }, + { + "epoch": 0.166545756036895, + "grad_norm": 0.4877474904060364, + "learning_rate": 3.8077228604043595e-05, + "loss": 0.2478, + "step": 1607 + }, + { + "epoch": 0.16664939371955642, + "grad_norm": 0.460329532623291, + "learning_rate": 3.8074355370828896e-05, + "loss": 0.2195, + "step": 1608 + }, + { + "epoch": 0.16675303140221784, + "grad_norm": 0.5120121240615845, + "learning_rate": 3.807148010102393e-05, + "loss": 0.2765, + "step": 1609 + }, + { + "epoch": 0.16685666908487926, + "grad_norm": 0.40834420919418335, + "learning_rate": 3.8068602794952675e-05, + "loss": 0.2175, + "step": 1610 + }, + { + "epoch": 0.16696030676754067, + "grad_norm": 0.44246426224708557, + "learning_rate": 3.806572345293935e-05, + "loss": 0.2393, + "step": 1611 + }, + { + "epoch": 0.1670639444502021, + "grad_norm": 0.49997183680534363, + "learning_rate": 3.806284207530839e-05, + "loss": 0.2735, + "step": 1612 + }, + { + "epoch": 0.1671675821328635, + "grad_norm": 0.5130741596221924, + "learning_rate": 3.805995866238446e-05, + "loss": 0.2287, + "step": 1613 + }, + { + "epoch": 0.16727121981552492, + "grad_norm": 0.4857131242752075, + "learning_rate": 3.805707321449247e-05, + "loss": 0.2527, + "step": 1614 + }, + { + "epoch": 0.16737485749818634, + "grad_norm": 0.5433309674263, + "learning_rate": 3.8054185731957536e-05, + "loss": 0.2963, + "step": 1615 + }, + { + "epoch": 0.16747849518084776, + "grad_norm": 0.47748440504074097, + "learning_rate": 3.805129621510502e-05, + "loss": 0.1869, + "step": 1616 + }, + { + "epoch": 0.16758213286350918, + "grad_norm": 0.5478339195251465, + "learning_rate": 3.804840466426051e-05, + "loss": 0.2652, + "step": 1617 + }, + { + "epoch": 0.1676857705461706, + "grad_norm": 0.509493350982666, + "learning_rate": 3.8045511079749816e-05, + "loss": 0.243, + "step": 1618 + }, + { + "epoch": 0.167789408228832, + "grad_norm": 0.5129976868629456, + "learning_rate": 3.804261546189899e-05, + "loss": 0.2909, + "step": 1619 + }, + { + "epoch": 0.16789304591149343, + "grad_norm": 0.5227918028831482, + "learning_rate": 3.803971781103429e-05, + "loss": 0.2589, + "step": 1620 + }, + { + "epoch": 0.16799668359415484, + "grad_norm": 0.4164186418056488, + "learning_rate": 3.803681812748224e-05, + "loss": 0.2058, + "step": 1621 + }, + { + "epoch": 0.16810032127681626, + "grad_norm": 0.3985108733177185, + "learning_rate": 3.803391641156956e-05, + "loss": 0.2003, + "step": 1622 + }, + { + "epoch": 0.16820395895947768, + "grad_norm": 0.5249270796775818, + "learning_rate": 3.803101266362321e-05, + "loss": 0.2415, + "step": 1623 + }, + { + "epoch": 0.16830759664213907, + "grad_norm": 0.4701800048351288, + "learning_rate": 3.8028106883970386e-05, + "loss": 0.2596, + "step": 1624 + }, + { + "epoch": 0.16841123432480049, + "grad_norm": 0.5113517642021179, + "learning_rate": 3.80251990729385e-05, + "loss": 0.2563, + "step": 1625 + }, + { + "epoch": 0.1685148720074619, + "grad_norm": 0.5005125999450684, + "learning_rate": 3.802228923085522e-05, + "loss": 0.2691, + "step": 1626 + }, + { + "epoch": 0.16861850969012332, + "grad_norm": 0.4985792338848114, + "learning_rate": 3.801937735804838e-05, + "loss": 0.2418, + "step": 1627 + }, + { + "epoch": 0.16872214737278474, + "grad_norm": 0.4471662938594818, + "learning_rate": 3.8016463454846125e-05, + "loss": 0.213, + "step": 1628 + }, + { + "epoch": 0.16882578505544615, + "grad_norm": 0.5097355842590332, + "learning_rate": 3.801354752157678e-05, + "loss": 0.3001, + "step": 1629 + }, + { + "epoch": 0.16892942273810757, + "grad_norm": 0.466145783662796, + "learning_rate": 3.8010629558568895e-05, + "loss": 0.2276, + "step": 1630 + }, + { + "epoch": 0.169033060420769, + "grad_norm": 0.472263902425766, + "learning_rate": 3.800770956615127e-05, + "loss": 0.2542, + "step": 1631 + }, + { + "epoch": 0.1691366981034304, + "grad_norm": 0.5002596974372864, + "learning_rate": 3.800478754465292e-05, + "loss": 0.233, + "step": 1632 + }, + { + "epoch": 0.16924033578609182, + "grad_norm": 0.4746779501438141, + "learning_rate": 3.800186349440311e-05, + "loss": 0.2507, + "step": 1633 + }, + { + "epoch": 0.16934397346875324, + "grad_norm": 0.47505295276641846, + "learning_rate": 3.79989374157313e-05, + "loss": 0.2394, + "step": 1634 + }, + { + "epoch": 0.16944761115141466, + "grad_norm": 0.4893549680709839, + "learning_rate": 3.79960093089672e-05, + "loss": 0.2677, + "step": 1635 + }, + { + "epoch": 0.16955124883407607, + "grad_norm": 0.41501083970069885, + "learning_rate": 3.799307917444075e-05, + "loss": 0.2226, + "step": 1636 + }, + { + "epoch": 0.1696548865167375, + "grad_norm": 0.5570014119148254, + "learning_rate": 3.7990147012482104e-05, + "loss": 0.267, + "step": 1637 + }, + { + "epoch": 0.1697585241993989, + "grad_norm": 0.41493335366249084, + "learning_rate": 3.798721282342167e-05, + "loss": 0.241, + "step": 1638 + }, + { + "epoch": 0.16986216188206033, + "grad_norm": 0.5655845403671265, + "learning_rate": 3.7984276607590044e-05, + "loss": 0.2715, + "step": 1639 + }, + { + "epoch": 0.16996579956472174, + "grad_norm": 0.40720781683921814, + "learning_rate": 3.798133836531809e-05, + "loss": 0.1749, + "step": 1640 + }, + { + "epoch": 0.17006943724738316, + "grad_norm": 0.46933940052986145, + "learning_rate": 3.7978398096936887e-05, + "loss": 0.2353, + "step": 1641 + }, + { + "epoch": 0.17017307493004458, + "grad_norm": 0.4930836856365204, + "learning_rate": 3.797545580277773e-05, + "loss": 0.2269, + "step": 1642 + }, + { + "epoch": 0.17027671261270597, + "grad_norm": 0.4270491302013397, + "learning_rate": 3.7972511483172157e-05, + "loss": 0.2244, + "step": 1643 + }, + { + "epoch": 0.17038035029536738, + "grad_norm": 0.49993693828582764, + "learning_rate": 3.7969565138451934e-05, + "loss": 0.262, + "step": 1644 + }, + { + "epoch": 0.1704839879780288, + "grad_norm": 0.4708077609539032, + "learning_rate": 3.796661676894903e-05, + "loss": 0.221, + "step": 1645 + }, + { + "epoch": 0.17058762566069022, + "grad_norm": 0.39685335755348206, + "learning_rate": 3.79636663749957e-05, + "loss": 0.1984, + "step": 1646 + }, + { + "epoch": 0.17069126334335163, + "grad_norm": 0.5080126523971558, + "learning_rate": 3.796071395692435e-05, + "loss": 0.2375, + "step": 1647 + }, + { + "epoch": 0.17079490102601305, + "grad_norm": 0.5168275237083435, + "learning_rate": 3.7957759515067676e-05, + "loss": 0.2201, + "step": 1648 + }, + { + "epoch": 0.17089853870867447, + "grad_norm": 0.47243815660476685, + "learning_rate": 3.7954803049758584e-05, + "loss": 0.2194, + "step": 1649 + }, + { + "epoch": 0.17100217639133589, + "grad_norm": 0.4236190617084503, + "learning_rate": 3.795184456133019e-05, + "loss": 0.2056, + "step": 1650 + }, + { + "epoch": 0.1711058140739973, + "grad_norm": 0.47571811079978943, + "learning_rate": 3.794888405011586e-05, + "loss": 0.2113, + "step": 1651 + }, + { + "epoch": 0.17120945175665872, + "grad_norm": 0.4334424138069153, + "learning_rate": 3.794592151644917e-05, + "loss": 0.2364, + "step": 1652 + }, + { + "epoch": 0.17131308943932014, + "grad_norm": 0.522921085357666, + "learning_rate": 3.794295696066395e-05, + "loss": 0.2501, + "step": 1653 + }, + { + "epoch": 0.17141672712198155, + "grad_norm": 0.4549519717693329, + "learning_rate": 3.793999038309423e-05, + "loss": 0.2209, + "step": 1654 + }, + { + "epoch": 0.17152036480464297, + "grad_norm": 0.4927518963813782, + "learning_rate": 3.793702178407427e-05, + "loss": 0.2683, + "step": 1655 + }, + { + "epoch": 0.1716240024873044, + "grad_norm": 0.42857837677001953, + "learning_rate": 3.79340511639386e-05, + "loss": 0.2232, + "step": 1656 + }, + { + "epoch": 0.1717276401699658, + "grad_norm": 0.4798828661441803, + "learning_rate": 3.7931078523021906e-05, + "loss": 0.2271, + "step": 1657 + }, + { + "epoch": 0.17183127785262722, + "grad_norm": 0.48226112127304077, + "learning_rate": 3.792810386165917e-05, + "loss": 0.2419, + "step": 1658 + }, + { + "epoch": 0.17193491553528864, + "grad_norm": 0.4466378390789032, + "learning_rate": 3.792512718018555e-05, + "loss": 0.2011, + "step": 1659 + }, + { + "epoch": 0.17203855321795006, + "grad_norm": 0.41471484303474426, + "learning_rate": 3.7922148478936476e-05, + "loss": 0.2134, + "step": 1660 + }, + { + "epoch": 0.17214219090061147, + "grad_norm": 0.4517219364643097, + "learning_rate": 3.791916775824757e-05, + "loss": 0.2131, + "step": 1661 + }, + { + "epoch": 0.17224582858327286, + "grad_norm": 0.44317197799682617, + "learning_rate": 3.791618501845469e-05, + "loss": 0.2009, + "step": 1662 + }, + { + "epoch": 0.17234946626593428, + "grad_norm": 0.5028525590896606, + "learning_rate": 3.791320025989394e-05, + "loss": 0.2398, + "step": 1663 + }, + { + "epoch": 0.1724531039485957, + "grad_norm": 0.46632954478263855, + "learning_rate": 3.7910213482901625e-05, + "loss": 0.1982, + "step": 1664 + }, + { + "epoch": 0.17255674163125712, + "grad_norm": 0.5463936924934387, + "learning_rate": 3.79072246878143e-05, + "loss": 0.267, + "step": 1665 + }, + { + "epoch": 0.17266037931391853, + "grad_norm": 0.5574196577072144, + "learning_rate": 3.7904233874968737e-05, + "loss": 0.3005, + "step": 1666 + }, + { + "epoch": 0.17276401699657995, + "grad_norm": 0.5142245888710022, + "learning_rate": 3.7901241044701934e-05, + "loss": 0.2538, + "step": 1667 + }, + { + "epoch": 0.17286765467924137, + "grad_norm": 0.3882453739643097, + "learning_rate": 3.789824619735111e-05, + "loss": 0.1806, + "step": 1668 + }, + { + "epoch": 0.17297129236190278, + "grad_norm": 0.426973432302475, + "learning_rate": 3.789524933325373e-05, + "loss": 0.2028, + "step": 1669 + }, + { + "epoch": 0.1730749300445642, + "grad_norm": 0.49242454767227173, + "learning_rate": 3.789225045274748e-05, + "loss": 0.2358, + "step": 1670 + }, + { + "epoch": 0.17317856772722562, + "grad_norm": 0.43440622091293335, + "learning_rate": 3.788924955617026e-05, + "loss": 0.2029, + "step": 1671 + }, + { + "epoch": 0.17328220540988704, + "grad_norm": 0.44206124544143677, + "learning_rate": 3.7886246643860205e-05, + "loss": 0.2188, + "step": 1672 + }, + { + "epoch": 0.17338584309254845, + "grad_norm": 0.45106032490730286, + "learning_rate": 3.788324171615569e-05, + "loss": 0.2165, + "step": 1673 + }, + { + "epoch": 0.17348948077520987, + "grad_norm": 0.4641522765159607, + "learning_rate": 3.78802347733953e-05, + "loss": 0.2457, + "step": 1674 + }, + { + "epoch": 0.1735931184578713, + "grad_norm": 0.5247939229011536, + "learning_rate": 3.787722581591784e-05, + "loss": 0.2566, + "step": 1675 + }, + { + "epoch": 0.1736967561405327, + "grad_norm": 0.4361657202243805, + "learning_rate": 3.787421484406238e-05, + "loss": 0.2386, + "step": 1676 + }, + { + "epoch": 0.17380039382319412, + "grad_norm": 0.4735523760318756, + "learning_rate": 3.7871201858168165e-05, + "loss": 0.2391, + "step": 1677 + }, + { + "epoch": 0.17390403150585554, + "grad_norm": 0.48238077759742737, + "learning_rate": 3.786818685857471e-05, + "loss": 0.2358, + "step": 1678 + }, + { + "epoch": 0.17400766918851696, + "grad_norm": 0.47058871388435364, + "learning_rate": 3.786516984562174e-05, + "loss": 0.2377, + "step": 1679 + }, + { + "epoch": 0.17411130687117837, + "grad_norm": 0.429928183555603, + "learning_rate": 3.78621508196492e-05, + "loss": 0.2096, + "step": 1680 + }, + { + "epoch": 0.17421494455383976, + "grad_norm": 0.5752142667770386, + "learning_rate": 3.7859129780997274e-05, + "loss": 0.2683, + "step": 1681 + }, + { + "epoch": 0.17431858223650118, + "grad_norm": 0.45631688833236694, + "learning_rate": 3.785610673000637e-05, + "loss": 0.2206, + "step": 1682 + }, + { + "epoch": 0.1744222199191626, + "grad_norm": 0.43998152017593384, + "learning_rate": 3.7853081667017114e-05, + "loss": 0.2012, + "step": 1683 + }, + { + "epoch": 0.174525857601824, + "grad_norm": 0.40259674191474915, + "learning_rate": 3.7850054592370363e-05, + "loss": 0.2024, + "step": 1684 + }, + { + "epoch": 0.17462949528448543, + "grad_norm": 0.4920617341995239, + "learning_rate": 3.784702550640722e-05, + "loss": 0.2325, + "step": 1685 + }, + { + "epoch": 0.17473313296714685, + "grad_norm": 0.46443817019462585, + "learning_rate": 3.7843994409468984e-05, + "loss": 0.2327, + "step": 1686 + }, + { + "epoch": 0.17483677064980827, + "grad_norm": 0.48127278685569763, + "learning_rate": 3.784096130189719e-05, + "loss": 0.2373, + "step": 1687 + }, + { + "epoch": 0.17494040833246968, + "grad_norm": 0.4313235282897949, + "learning_rate": 3.783792618403362e-05, + "loss": 0.2353, + "step": 1688 + }, + { + "epoch": 0.1750440460151311, + "grad_norm": 0.4920996427536011, + "learning_rate": 3.783488905622025e-05, + "loss": 0.271, + "step": 1689 + }, + { + "epoch": 0.17514768369779252, + "grad_norm": 0.40804946422576904, + "learning_rate": 3.783184991879931e-05, + "loss": 0.2022, + "step": 1690 + }, + { + "epoch": 0.17525132138045393, + "grad_norm": 0.49815499782562256, + "learning_rate": 3.782880877211324e-05, + "loss": 0.2574, + "step": 1691 + }, + { + "epoch": 0.17535495906311535, + "grad_norm": 0.4492497444152832, + "learning_rate": 3.782576561650471e-05, + "loss": 0.2049, + "step": 1692 + }, + { + "epoch": 0.17545859674577677, + "grad_norm": 0.4622615873813629, + "learning_rate": 3.7822720452316625e-05, + "loss": 0.2345, + "step": 1693 + }, + { + "epoch": 0.17556223442843819, + "grad_norm": 0.4740571081638336, + "learning_rate": 3.781967327989211e-05, + "loss": 0.2069, + "step": 1694 + }, + { + "epoch": 0.1756658721110996, + "grad_norm": 0.5480529069900513, + "learning_rate": 3.78166240995745e-05, + "loss": 0.2657, + "step": 1695 + }, + { + "epoch": 0.17576950979376102, + "grad_norm": 0.47912469506263733, + "learning_rate": 3.781357291170739e-05, + "loss": 0.2428, + "step": 1696 + }, + { + "epoch": 0.17587314747642244, + "grad_norm": 0.4989534318447113, + "learning_rate": 3.7810519716634575e-05, + "loss": 0.2095, + "step": 1697 + }, + { + "epoch": 0.17597678515908385, + "grad_norm": 0.5266689658164978, + "learning_rate": 3.780746451470008e-05, + "loss": 0.2175, + "step": 1698 + }, + { + "epoch": 0.17608042284174527, + "grad_norm": 0.4845711886882782, + "learning_rate": 3.7804407306248177e-05, + "loss": 0.2, + "step": 1699 + }, + { + "epoch": 0.17618406052440666, + "grad_norm": 0.45821481943130493, + "learning_rate": 3.780134809162332e-05, + "loss": 0.2165, + "step": 1700 + }, + { + "epoch": 0.17628769820706808, + "grad_norm": 0.4305170476436615, + "learning_rate": 3.779828687117025e-05, + "loss": 0.202, + "step": 1701 + }, + { + "epoch": 0.1763913358897295, + "grad_norm": 0.4545448422431946, + "learning_rate": 3.7795223645233876e-05, + "loss": 0.2185, + "step": 1702 + }, + { + "epoch": 0.1764949735723909, + "grad_norm": 0.5075645446777344, + "learning_rate": 3.779215841415936e-05, + "loss": 0.2546, + "step": 1703 + }, + { + "epoch": 0.17659861125505233, + "grad_norm": 0.573638916015625, + "learning_rate": 3.77890911782921e-05, + "loss": 0.2826, + "step": 1704 + }, + { + "epoch": 0.17670224893771375, + "grad_norm": 0.5099818110466003, + "learning_rate": 3.7786021937977694e-05, + "loss": 0.2561, + "step": 1705 + }, + { + "epoch": 0.17680588662037516, + "grad_norm": 0.5081855654716492, + "learning_rate": 3.778295069356199e-05, + "loss": 0.2702, + "step": 1706 + }, + { + "epoch": 0.17690952430303658, + "grad_norm": 0.49809956550598145, + "learning_rate": 3.777987744539104e-05, + "loss": 0.2416, + "step": 1707 + }, + { + "epoch": 0.177013161985698, + "grad_norm": 0.45424211025238037, + "learning_rate": 3.7776802193811146e-05, + "loss": 0.1965, + "step": 1708 + }, + { + "epoch": 0.17711679966835941, + "grad_norm": 0.5301138162612915, + "learning_rate": 3.777372493916881e-05, + "loss": 0.2866, + "step": 1709 + }, + { + "epoch": 0.17722043735102083, + "grad_norm": 0.5104231834411621, + "learning_rate": 3.7770645681810786e-05, + "loss": 0.2571, + "step": 1710 + }, + { + "epoch": 0.17732407503368225, + "grad_norm": 0.5096403360366821, + "learning_rate": 3.776756442208402e-05, + "loss": 0.2279, + "step": 1711 + }, + { + "epoch": 0.17742771271634367, + "grad_norm": 0.46347206830978394, + "learning_rate": 3.776448116033572e-05, + "loss": 0.1851, + "step": 1712 + }, + { + "epoch": 0.17753135039900508, + "grad_norm": 0.5343012809753418, + "learning_rate": 3.77613958969133e-05, + "loss": 0.2606, + "step": 1713 + }, + { + "epoch": 0.1776349880816665, + "grad_norm": 0.5330291390419006, + "learning_rate": 3.77583086321644e-05, + "loss": 0.2886, + "step": 1714 + }, + { + "epoch": 0.17773862576432792, + "grad_norm": 0.619606614112854, + "learning_rate": 3.775521936643689e-05, + "loss": 0.2602, + "step": 1715 + }, + { + "epoch": 0.17784226344698933, + "grad_norm": 0.38278481364250183, + "learning_rate": 3.775212810007886e-05, + "loss": 0.1895, + "step": 1716 + }, + { + "epoch": 0.17794590112965075, + "grad_norm": 0.5417003035545349, + "learning_rate": 3.774903483343863e-05, + "loss": 0.2846, + "step": 1717 + }, + { + "epoch": 0.17804953881231217, + "grad_norm": 0.5265429615974426, + "learning_rate": 3.774593956686475e-05, + "loss": 0.2595, + "step": 1718 + }, + { + "epoch": 0.17815317649497356, + "grad_norm": 0.45582106709480286, + "learning_rate": 3.774284230070599e-05, + "loss": 0.2193, + "step": 1719 + }, + { + "epoch": 0.17825681417763498, + "grad_norm": 0.4111036956310272, + "learning_rate": 3.773974303531134e-05, + "loss": 0.1962, + "step": 1720 + }, + { + "epoch": 0.1783604518602964, + "grad_norm": 0.48741012811660767, + "learning_rate": 3.7736641771030015e-05, + "loss": 0.2775, + "step": 1721 + }, + { + "epoch": 0.1784640895429578, + "grad_norm": 0.4571603238582611, + "learning_rate": 3.773353850821147e-05, + "loss": 0.2303, + "step": 1722 + }, + { + "epoch": 0.17856772722561923, + "grad_norm": 0.4960167706012726, + "learning_rate": 3.773043324720537e-05, + "loss": 0.2443, + "step": 1723 + }, + { + "epoch": 0.17867136490828064, + "grad_norm": 0.49686622619628906, + "learning_rate": 3.772732598836163e-05, + "loss": 0.2539, + "step": 1724 + }, + { + "epoch": 0.17877500259094206, + "grad_norm": 0.5505495071411133, + "learning_rate": 3.772421673203034e-05, + "loss": 0.2692, + "step": 1725 + }, + { + "epoch": 0.17887864027360348, + "grad_norm": 0.5004148483276367, + "learning_rate": 3.7721105478561866e-05, + "loss": 0.2403, + "step": 1726 + }, + { + "epoch": 0.1789822779562649, + "grad_norm": 0.515392541885376, + "learning_rate": 3.771799222830677e-05, + "loss": 0.2311, + "step": 1727 + }, + { + "epoch": 0.1790859156389263, + "grad_norm": 0.5341514348983765, + "learning_rate": 3.7714876981615866e-05, + "loss": 0.2478, + "step": 1728 + }, + { + "epoch": 0.17918955332158773, + "grad_norm": 0.41764670610427856, + "learning_rate": 3.771175973884014e-05, + "loss": 0.1995, + "step": 1729 + }, + { + "epoch": 0.17929319100424915, + "grad_norm": 0.4541582763195038, + "learning_rate": 3.770864050033088e-05, + "loss": 0.2117, + "step": 1730 + }, + { + "epoch": 0.17939682868691056, + "grad_norm": 0.463843435049057, + "learning_rate": 3.770551926643953e-05, + "loss": 0.2273, + "step": 1731 + }, + { + "epoch": 0.17950046636957198, + "grad_norm": 0.45743948221206665, + "learning_rate": 3.7702396037517795e-05, + "loss": 0.2495, + "step": 1732 + }, + { + "epoch": 0.1796041040522334, + "grad_norm": 0.4244634509086609, + "learning_rate": 3.769927081391759e-05, + "loss": 0.1787, + "step": 1733 + }, + { + "epoch": 0.17970774173489482, + "grad_norm": 0.4777889549732208, + "learning_rate": 3.769614359599106e-05, + "loss": 0.2215, + "step": 1734 + }, + { + "epoch": 0.17981137941755623, + "grad_norm": 0.44900938868522644, + "learning_rate": 3.769301438409059e-05, + "loss": 0.2304, + "step": 1735 + }, + { + "epoch": 0.17991501710021765, + "grad_norm": 0.5480301380157471, + "learning_rate": 3.7689883178568755e-05, + "loss": 0.2723, + "step": 1736 + }, + { + "epoch": 0.18001865478287907, + "grad_norm": 0.47610998153686523, + "learning_rate": 3.7686749979778386e-05, + "loss": 0.2481, + "step": 1737 + }, + { + "epoch": 0.18012229246554046, + "grad_norm": 0.5337226390838623, + "learning_rate": 3.7683614788072527e-05, + "loss": 0.236, + "step": 1738 + }, + { + "epoch": 0.18022593014820187, + "grad_norm": 0.4076583981513977, + "learning_rate": 3.768047760380444e-05, + "loss": 0.2005, + "step": 1739 + }, + { + "epoch": 0.1803295678308633, + "grad_norm": 0.46692726016044617, + "learning_rate": 3.767733842732762e-05, + "loss": 0.2354, + "step": 1740 + }, + { + "epoch": 0.1804332055135247, + "grad_norm": 0.4673640727996826, + "learning_rate": 3.7674197258995785e-05, + "loss": 0.2642, + "step": 1741 + }, + { + "epoch": 0.18053684319618613, + "grad_norm": 0.47768813371658325, + "learning_rate": 3.767105409916288e-05, + "loss": 0.2261, + "step": 1742 + }, + { + "epoch": 0.18064048087884754, + "grad_norm": 0.4129585921764374, + "learning_rate": 3.7667908948183075e-05, + "loss": 0.1918, + "step": 1743 + }, + { + "epoch": 0.18074411856150896, + "grad_norm": 0.4703997075557709, + "learning_rate": 3.7664761806410744e-05, + "loss": 0.2537, + "step": 1744 + }, + { + "epoch": 0.18084775624417038, + "grad_norm": 0.5203202366828918, + "learning_rate": 3.766161267420052e-05, + "loss": 0.2438, + "step": 1745 + }, + { + "epoch": 0.1809513939268318, + "grad_norm": 0.5779595375061035, + "learning_rate": 3.765846155190723e-05, + "loss": 0.2892, + "step": 1746 + }, + { + "epoch": 0.1810550316094932, + "grad_norm": 0.6186638474464417, + "learning_rate": 3.765530843988595e-05, + "loss": 0.2868, + "step": 1747 + }, + { + "epoch": 0.18115866929215463, + "grad_norm": 0.5214221477508545, + "learning_rate": 3.765215333849196e-05, + "loss": 0.2815, + "step": 1748 + }, + { + "epoch": 0.18126230697481605, + "grad_norm": 0.4598151445388794, + "learning_rate": 3.7648996248080765e-05, + "loss": 0.2439, + "step": 1749 + }, + { + "epoch": 0.18136594465747746, + "grad_norm": 0.5000353455543518, + "learning_rate": 3.764583716900812e-05, + "loss": 0.2499, + "step": 1750 + }, + { + "epoch": 0.18146958234013888, + "grad_norm": 0.44633814692497253, + "learning_rate": 3.764267610162996e-05, + "loss": 0.2367, + "step": 1751 + }, + { + "epoch": 0.1815732200228003, + "grad_norm": 0.5109118819236755, + "learning_rate": 3.763951304630249e-05, + "loss": 0.2787, + "step": 1752 + }, + { + "epoch": 0.18167685770546171, + "grad_norm": 0.47466611862182617, + "learning_rate": 3.763634800338211e-05, + "loss": 0.218, + "step": 1753 + }, + { + "epoch": 0.18178049538812313, + "grad_norm": 0.4710037112236023, + "learning_rate": 3.763318097322546e-05, + "loss": 0.2761, + "step": 1754 + }, + { + "epoch": 0.18188413307078455, + "grad_norm": 0.4349900484085083, + "learning_rate": 3.7630011956189386e-05, + "loss": 0.1997, + "step": 1755 + }, + { + "epoch": 0.18198777075344597, + "grad_norm": 0.5574545860290527, + "learning_rate": 3.7626840952630966e-05, + "loss": 0.2889, + "step": 1756 + }, + { + "epoch": 0.18209140843610735, + "grad_norm": 0.405887246131897, + "learning_rate": 3.762366796290751e-05, + "loss": 0.1968, + "step": 1757 + }, + { + "epoch": 0.18219504611876877, + "grad_norm": 0.44126248359680176, + "learning_rate": 3.7620492987376544e-05, + "loss": 0.2313, + "step": 1758 + }, + { + "epoch": 0.1822986838014302, + "grad_norm": 0.448207288980484, + "learning_rate": 3.7617316026395824e-05, + "loss": 0.2524, + "step": 1759 + }, + { + "epoch": 0.1824023214840916, + "grad_norm": 0.4973071217536926, + "learning_rate": 3.761413708032332e-05, + "loss": 0.244, + "step": 1760 + }, + { + "epoch": 0.18250595916675302, + "grad_norm": 0.4090721607208252, + "learning_rate": 3.7610956149517235e-05, + "loss": 0.216, + "step": 1761 + }, + { + "epoch": 0.18260959684941444, + "grad_norm": 0.44438639283180237, + "learning_rate": 3.7607773234335984e-05, + "loss": 0.2264, + "step": 1762 + }, + { + "epoch": 0.18271323453207586, + "grad_norm": 0.5119732022285461, + "learning_rate": 3.760458833513821e-05, + "loss": 0.2179, + "step": 1763 + }, + { + "epoch": 0.18281687221473727, + "grad_norm": 0.4416314661502838, + "learning_rate": 3.7601401452282795e-05, + "loss": 0.2009, + "step": 1764 + }, + { + "epoch": 0.1829205098973987, + "grad_norm": 0.5023295283317566, + "learning_rate": 3.759821258612883e-05, + "loss": 0.272, + "step": 1765 + }, + { + "epoch": 0.1830241475800601, + "grad_norm": 0.4766654074192047, + "learning_rate": 3.759502173703562e-05, + "loss": 0.2325, + "step": 1766 + }, + { + "epoch": 0.18312778526272153, + "grad_norm": 0.49825185537338257, + "learning_rate": 3.7591828905362724e-05, + "loss": 0.2496, + "step": 1767 + }, + { + "epoch": 0.18323142294538294, + "grad_norm": 0.4586202800273895, + "learning_rate": 3.758863409146988e-05, + "loss": 0.2415, + "step": 1768 + }, + { + "epoch": 0.18333506062804436, + "grad_norm": 0.5108445286750793, + "learning_rate": 3.75854372957171e-05, + "loss": 0.2488, + "step": 1769 + }, + { + "epoch": 0.18343869831070578, + "grad_norm": 0.5143339037895203, + "learning_rate": 3.7582238518464576e-05, + "loss": 0.238, + "step": 1770 + }, + { + "epoch": 0.1835423359933672, + "grad_norm": 0.4804529845714569, + "learning_rate": 3.757903776007275e-05, + "loss": 0.2545, + "step": 1771 + }, + { + "epoch": 0.1836459736760286, + "grad_norm": 0.4834483861923218, + "learning_rate": 3.7575835020902275e-05, + "loss": 0.2442, + "step": 1772 + }, + { + "epoch": 0.18374961135869003, + "grad_norm": 0.462864488363266, + "learning_rate": 3.7572630301314036e-05, + "loss": 0.2142, + "step": 1773 + }, + { + "epoch": 0.18385324904135145, + "grad_norm": 0.493757963180542, + "learning_rate": 3.756942360166913e-05, + "loss": 0.2236, + "step": 1774 + }, + { + "epoch": 0.18395688672401286, + "grad_norm": 0.5044764280319214, + "learning_rate": 3.756621492232888e-05, + "loss": 0.2864, + "step": 1775 + }, + { + "epoch": 0.18406052440667425, + "grad_norm": 0.4586033225059509, + "learning_rate": 3.756300426365485e-05, + "loss": 0.2048, + "step": 1776 + }, + { + "epoch": 0.18416416208933567, + "grad_norm": 0.4786682426929474, + "learning_rate": 3.7559791626008795e-05, + "loss": 0.2639, + "step": 1777 + }, + { + "epoch": 0.1842677997719971, + "grad_norm": 0.5083273649215698, + "learning_rate": 3.755657700975272e-05, + "loss": 0.2491, + "step": 1778 + }, + { + "epoch": 0.1843714374546585, + "grad_norm": 0.46496713161468506, + "learning_rate": 3.755336041524883e-05, + "loss": 0.225, + "step": 1779 + }, + { + "epoch": 0.18447507513731992, + "grad_norm": 0.45612651109695435, + "learning_rate": 3.7550141842859586e-05, + "loss": 0.215, + "step": 1780 + }, + { + "epoch": 0.18457871281998134, + "grad_norm": 0.41664206981658936, + "learning_rate": 3.754692129294764e-05, + "loss": 0.2182, + "step": 1781 + }, + { + "epoch": 0.18468235050264276, + "grad_norm": 0.5147910714149475, + "learning_rate": 3.7543698765875873e-05, + "loss": 0.2259, + "step": 1782 + }, + { + "epoch": 0.18478598818530417, + "grad_norm": 0.4890103340148926, + "learning_rate": 3.754047426200741e-05, + "loss": 0.2464, + "step": 1783 + }, + { + "epoch": 0.1848896258679656, + "grad_norm": 0.4901570975780487, + "learning_rate": 3.753724778170557e-05, + "loss": 0.2265, + "step": 1784 + }, + { + "epoch": 0.184993263550627, + "grad_norm": 0.49182382225990295, + "learning_rate": 3.753401932533391e-05, + "loss": 0.2065, + "step": 1785 + }, + { + "epoch": 0.18509690123328842, + "grad_norm": 0.47069478034973145, + "learning_rate": 3.7530788893256217e-05, + "loss": 0.2427, + "step": 1786 + }, + { + "epoch": 0.18520053891594984, + "grad_norm": 0.4829593598842621, + "learning_rate": 3.752755648583648e-05, + "loss": 0.1938, + "step": 1787 + }, + { + "epoch": 0.18530417659861126, + "grad_norm": 0.5271267294883728, + "learning_rate": 3.752432210343893e-05, + "loss": 0.2727, + "step": 1788 + }, + { + "epoch": 0.18540781428127268, + "grad_norm": 0.49652940034866333, + "learning_rate": 3.752108574642799e-05, + "loss": 0.2396, + "step": 1789 + }, + { + "epoch": 0.1855114519639341, + "grad_norm": 0.43199917674064636, + "learning_rate": 3.7517847415168365e-05, + "loss": 0.2286, + "step": 1790 + }, + { + "epoch": 0.1856150896465955, + "grad_norm": 0.4996117055416107, + "learning_rate": 3.751460711002492e-05, + "loss": 0.2406, + "step": 1791 + }, + { + "epoch": 0.18571872732925693, + "grad_norm": 0.5022050142288208, + "learning_rate": 3.7511364831362766e-05, + "loss": 0.2317, + "step": 1792 + }, + { + "epoch": 0.18582236501191834, + "grad_norm": 0.44132325053215027, + "learning_rate": 3.750812057954725e-05, + "loss": 0.2145, + "step": 1793 + }, + { + "epoch": 0.18592600269457976, + "grad_norm": 0.5774767994880676, + "learning_rate": 3.750487435494392e-05, + "loss": 0.3222, + "step": 1794 + }, + { + "epoch": 0.18602964037724115, + "grad_norm": 0.49793311953544617, + "learning_rate": 3.7501626157918564e-05, + "loss": 0.2787, + "step": 1795 + }, + { + "epoch": 0.18613327805990257, + "grad_norm": 0.469102680683136, + "learning_rate": 3.749837598883718e-05, + "loss": 0.2617, + "step": 1796 + }, + { + "epoch": 0.18623691574256399, + "grad_norm": 0.5038089156150818, + "learning_rate": 3.7495123848065984e-05, + "loss": 0.2302, + "step": 1797 + }, + { + "epoch": 0.1863405534252254, + "grad_norm": 0.5174588561058044, + "learning_rate": 3.749186973597144e-05, + "loss": 0.2199, + "step": 1798 + }, + { + "epoch": 0.18644419110788682, + "grad_norm": 0.44706490635871887, + "learning_rate": 3.74886136529202e-05, + "loss": 0.247, + "step": 1799 + }, + { + "epoch": 0.18654782879054824, + "grad_norm": 0.5332117080688477, + "learning_rate": 3.748535559927916e-05, + "loss": 0.2707, + "step": 1800 + }, + { + "epoch": 0.18665146647320965, + "grad_norm": 0.45966261625289917, + "learning_rate": 3.748209557541543e-05, + "loss": 0.2536, + "step": 1801 + }, + { + "epoch": 0.18675510415587107, + "grad_norm": 0.4928770959377289, + "learning_rate": 3.7478833581696354e-05, + "loss": 0.2439, + "step": 1802 + }, + { + "epoch": 0.1868587418385325, + "grad_norm": 0.48167261481285095, + "learning_rate": 3.747556961848948e-05, + "loss": 0.2288, + "step": 1803 + }, + { + "epoch": 0.1869623795211939, + "grad_norm": 0.5721211433410645, + "learning_rate": 3.747230368616258e-05, + "loss": 0.2477, + "step": 1804 + }, + { + "epoch": 0.18706601720385532, + "grad_norm": 0.4811793565750122, + "learning_rate": 3.746903578508367e-05, + "loss": 0.1919, + "step": 1805 + }, + { + "epoch": 0.18716965488651674, + "grad_norm": 0.4622481167316437, + "learning_rate": 3.746576591562096e-05, + "loss": 0.2346, + "step": 1806 + }, + { + "epoch": 0.18727329256917816, + "grad_norm": 0.4700213372707367, + "learning_rate": 3.74624940781429e-05, + "loss": 0.2085, + "step": 1807 + }, + { + "epoch": 0.18737693025183957, + "grad_norm": 0.4065195918083191, + "learning_rate": 3.745922027301814e-05, + "loss": 0.1784, + "step": 1808 + }, + { + "epoch": 0.187480567934501, + "grad_norm": 0.4072580635547638, + "learning_rate": 3.74559445006156e-05, + "loss": 0.1962, + "step": 1809 + }, + { + "epoch": 0.1875842056171624, + "grad_norm": 0.4518824815750122, + "learning_rate": 3.7452666761304365e-05, + "loss": 0.2514, + "step": 1810 + }, + { + "epoch": 0.18768784329982383, + "grad_norm": 0.46154463291168213, + "learning_rate": 3.744938705545377e-05, + "loss": 0.266, + "step": 1811 + }, + { + "epoch": 0.18779148098248524, + "grad_norm": 0.4089922308921814, + "learning_rate": 3.7446105383433364e-05, + "loss": 0.2019, + "step": 1812 + }, + { + "epoch": 0.18789511866514666, + "grad_norm": 0.4270873963832855, + "learning_rate": 3.744282174561292e-05, + "loss": 0.2124, + "step": 1813 + }, + { + "epoch": 0.18799875634780805, + "grad_norm": 0.46813175082206726, + "learning_rate": 3.743953614236244e-05, + "loss": 0.2316, + "step": 1814 + }, + { + "epoch": 0.18810239403046947, + "grad_norm": 0.4075770378112793, + "learning_rate": 3.743624857405214e-05, + "loss": 0.2163, + "step": 1815 + }, + { + "epoch": 0.18820603171313088, + "grad_norm": 0.5519664287567139, + "learning_rate": 3.7432959041052455e-05, + "loss": 0.2886, + "step": 1816 + }, + { + "epoch": 0.1883096693957923, + "grad_norm": 0.4090479016304016, + "learning_rate": 3.7429667543734045e-05, + "loss": 0.1961, + "step": 1817 + }, + { + "epoch": 0.18841330707845372, + "grad_norm": 0.4551500380039215, + "learning_rate": 3.742637408246779e-05, + "loss": 0.2134, + "step": 1818 + }, + { + "epoch": 0.18851694476111513, + "grad_norm": 0.5673427581787109, + "learning_rate": 3.742307865762479e-05, + "loss": 0.2555, + "step": 1819 + }, + { + "epoch": 0.18862058244377655, + "grad_norm": 0.42178842425346375, + "learning_rate": 3.741978126957638e-05, + "loss": 0.2362, + "step": 1820 + }, + { + "epoch": 0.18872422012643797, + "grad_norm": 0.5114960074424744, + "learning_rate": 3.7416481918694094e-05, + "loss": 0.2224, + "step": 1821 + }, + { + "epoch": 0.1888278578090994, + "grad_norm": 0.44898855686187744, + "learning_rate": 3.74131806053497e-05, + "loss": 0.2132, + "step": 1822 + }, + { + "epoch": 0.1889314954917608, + "grad_norm": 0.5565721988677979, + "learning_rate": 3.740987732991518e-05, + "loss": 0.2961, + "step": 1823 + }, + { + "epoch": 0.18903513317442222, + "grad_norm": 0.5278633236885071, + "learning_rate": 3.7406572092762744e-05, + "loss": 0.2566, + "step": 1824 + }, + { + "epoch": 0.18913877085708364, + "grad_norm": 0.48794278502464294, + "learning_rate": 3.7403264894264836e-05, + "loss": 0.2649, + "step": 1825 + }, + { + "epoch": 0.18924240853974506, + "grad_norm": 0.5211032032966614, + "learning_rate": 3.739995573479408e-05, + "loss": 0.2638, + "step": 1826 + }, + { + "epoch": 0.18934604622240647, + "grad_norm": 0.5264724493026733, + "learning_rate": 3.7396644614723374e-05, + "loss": 0.2508, + "step": 1827 + }, + { + "epoch": 0.1894496839050679, + "grad_norm": 0.4915875196456909, + "learning_rate": 3.739333153442579e-05, + "loss": 0.2422, + "step": 1828 + }, + { + "epoch": 0.1895533215877293, + "grad_norm": 0.5053110718727112, + "learning_rate": 3.739001649427464e-05, + "loss": 0.2531, + "step": 1829 + }, + { + "epoch": 0.18965695927039072, + "grad_norm": 0.43746939301490784, + "learning_rate": 3.738669949464347e-05, + "loss": 0.2166, + "step": 1830 + }, + { + "epoch": 0.18976059695305214, + "grad_norm": 0.5301503539085388, + "learning_rate": 3.7383380535906033e-05, + "loss": 0.2461, + "step": 1831 + }, + { + "epoch": 0.18986423463571356, + "grad_norm": 0.4544839560985565, + "learning_rate": 3.7380059618436305e-05, + "loss": 0.2497, + "step": 1832 + }, + { + "epoch": 0.18996787231837495, + "grad_norm": 0.428805410861969, + "learning_rate": 3.7376736742608465e-05, + "loss": 0.1934, + "step": 1833 + }, + { + "epoch": 0.19007151000103636, + "grad_norm": 0.44237765669822693, + "learning_rate": 3.7373411908796944e-05, + "loss": 0.2145, + "step": 1834 + }, + { + "epoch": 0.19017514768369778, + "grad_norm": 0.5216934084892273, + "learning_rate": 3.737008511737638e-05, + "loss": 0.2676, + "step": 1835 + }, + { + "epoch": 0.1902787853663592, + "grad_norm": 0.5544553399085999, + "learning_rate": 3.736675636872162e-05, + "loss": 0.3118, + "step": 1836 + }, + { + "epoch": 0.19038242304902062, + "grad_norm": 0.4797340929508209, + "learning_rate": 3.736342566320776e-05, + "loss": 0.2353, + "step": 1837 + }, + { + "epoch": 0.19048606073168203, + "grad_norm": 0.4621637463569641, + "learning_rate": 3.736009300121009e-05, + "loss": 0.2282, + "step": 1838 + }, + { + "epoch": 0.19058969841434345, + "grad_norm": 0.45533865690231323, + "learning_rate": 3.735675838310412e-05, + "loss": 0.2282, + "step": 1839 + }, + { + "epoch": 0.19069333609700487, + "grad_norm": 0.4414368271827698, + "learning_rate": 3.7353421809265596e-05, + "loss": 0.2026, + "step": 1840 + }, + { + "epoch": 0.19079697377966628, + "grad_norm": 0.37621623277664185, + "learning_rate": 3.735008328007048e-05, + "loss": 0.1971, + "step": 1841 + }, + { + "epoch": 0.1909006114623277, + "grad_norm": 0.36020252108573914, + "learning_rate": 3.7346742795894954e-05, + "loss": 0.1844, + "step": 1842 + }, + { + "epoch": 0.19100424914498912, + "grad_norm": 0.5098097324371338, + "learning_rate": 3.734340035711541e-05, + "loss": 0.2449, + "step": 1843 + }, + { + "epoch": 0.19110788682765054, + "grad_norm": 0.4699155390262604, + "learning_rate": 3.734005596410848e-05, + "loss": 0.2441, + "step": 1844 + }, + { + "epoch": 0.19121152451031195, + "grad_norm": 0.3725084364414215, + "learning_rate": 3.7336709617251e-05, + "loss": 0.2027, + "step": 1845 + }, + { + "epoch": 0.19131516219297337, + "grad_norm": 0.49921226501464844, + "learning_rate": 3.733336131692003e-05, + "loss": 0.2866, + "step": 1846 + }, + { + "epoch": 0.1914187998756348, + "grad_norm": 0.4483483135700226, + "learning_rate": 3.733001106349285e-05, + "loss": 0.2498, + "step": 1847 + }, + { + "epoch": 0.1915224375582962, + "grad_norm": 0.43417519330978394, + "learning_rate": 3.7326658857346964e-05, + "loss": 0.2234, + "step": 1848 + }, + { + "epoch": 0.19162607524095762, + "grad_norm": 0.4553414583206177, + "learning_rate": 3.73233046988601e-05, + "loss": 0.2338, + "step": 1849 + }, + { + "epoch": 0.19172971292361904, + "grad_norm": 0.4434491991996765, + "learning_rate": 3.731994858841018e-05, + "loss": 0.2092, + "step": 1850 + }, + { + "epoch": 0.19183335060628046, + "grad_norm": 0.5110379457473755, + "learning_rate": 3.7316590526375385e-05, + "loss": 0.2763, + "step": 1851 + }, + { + "epoch": 0.19193698828894185, + "grad_norm": 0.4639647603034973, + "learning_rate": 3.731323051313409e-05, + "loss": 0.249, + "step": 1852 + }, + { + "epoch": 0.19204062597160326, + "grad_norm": 0.4629019498825073, + "learning_rate": 3.730986854906489e-05, + "loss": 0.2365, + "step": 1853 + }, + { + "epoch": 0.19214426365426468, + "grad_norm": 0.510564923286438, + "learning_rate": 3.7306504634546605e-05, + "loss": 0.2578, + "step": 1854 + }, + { + "epoch": 0.1922479013369261, + "grad_norm": 0.4567071497440338, + "learning_rate": 3.730313876995829e-05, + "loss": 0.2494, + "step": 1855 + }, + { + "epoch": 0.19235153901958751, + "grad_norm": 0.4712611436843872, + "learning_rate": 3.7299770955679196e-05, + "loss": 0.2677, + "step": 1856 + }, + { + "epoch": 0.19245517670224893, + "grad_norm": 0.49776506423950195, + "learning_rate": 3.7296401192088804e-05, + "loss": 0.2344, + "step": 1857 + }, + { + "epoch": 0.19255881438491035, + "grad_norm": 0.5162355899810791, + "learning_rate": 3.729302947956681e-05, + "loss": 0.2403, + "step": 1858 + }, + { + "epoch": 0.19266245206757177, + "grad_norm": 0.45831117033958435, + "learning_rate": 3.728965581849314e-05, + "loss": 0.2333, + "step": 1859 + }, + { + "epoch": 0.19276608975023318, + "grad_norm": 0.39789503812789917, + "learning_rate": 3.728628020924793e-05, + "loss": 0.2093, + "step": 1860 + }, + { + "epoch": 0.1928697274328946, + "grad_norm": 0.406093031167984, + "learning_rate": 3.7282902652211535e-05, + "loss": 0.1883, + "step": 1861 + }, + { + "epoch": 0.19297336511555602, + "grad_norm": 0.5074322819709778, + "learning_rate": 3.7279523147764536e-05, + "loss": 0.2556, + "step": 1862 + }, + { + "epoch": 0.19307700279821743, + "grad_norm": 0.4497910737991333, + "learning_rate": 3.727614169628773e-05, + "loss": 0.2734, + "step": 1863 + }, + { + "epoch": 0.19318064048087885, + "grad_norm": 0.49084556102752686, + "learning_rate": 3.727275829816214e-05, + "loss": 0.2526, + "step": 1864 + }, + { + "epoch": 0.19328427816354027, + "grad_norm": 0.4497647285461426, + "learning_rate": 3.7269372953768995e-05, + "loss": 0.2143, + "step": 1865 + }, + { + "epoch": 0.19338791584620169, + "grad_norm": 0.5706869959831238, + "learning_rate": 3.726598566348974e-05, + "loss": 0.2836, + "step": 1866 + }, + { + "epoch": 0.1934915535288631, + "grad_norm": 0.3616214990615845, + "learning_rate": 3.7262596427706075e-05, + "loss": 0.1878, + "step": 1867 + }, + { + "epoch": 0.19359519121152452, + "grad_norm": 0.4331905245780945, + "learning_rate": 3.725920524679987e-05, + "loss": 0.2154, + "step": 1868 + }, + { + "epoch": 0.19369882889418594, + "grad_norm": 0.3976379930973053, + "learning_rate": 3.725581212115325e-05, + "loss": 0.1836, + "step": 1869 + }, + { + "epoch": 0.19380246657684735, + "grad_norm": 0.4745163023471832, + "learning_rate": 3.725241705114855e-05, + "loss": 0.2518, + "step": 1870 + }, + { + "epoch": 0.19390610425950874, + "grad_norm": 0.498110830783844, + "learning_rate": 3.724902003716831e-05, + "loss": 0.2702, + "step": 1871 + }, + { + "epoch": 0.19400974194217016, + "grad_norm": 0.5331942439079285, + "learning_rate": 3.724562107959531e-05, + "loss": 0.269, + "step": 1872 + }, + { + "epoch": 0.19411337962483158, + "grad_norm": 0.47925272583961487, + "learning_rate": 3.724222017881253e-05, + "loss": 0.2345, + "step": 1873 + }, + { + "epoch": 0.194217017307493, + "grad_norm": 0.47513824701309204, + "learning_rate": 3.723881733520319e-05, + "loss": 0.2699, + "step": 1874 + }, + { + "epoch": 0.1943206549901544, + "grad_norm": 0.4839094579219818, + "learning_rate": 3.723541254915071e-05, + "loss": 0.251, + "step": 1875 + }, + { + "epoch": 0.19442429267281583, + "grad_norm": 0.42007312178611755, + "learning_rate": 3.723200582103874e-05, + "loss": 0.2155, + "step": 1876 + }, + { + "epoch": 0.19452793035547725, + "grad_norm": 0.5020776987075806, + "learning_rate": 3.722859715125114e-05, + "loss": 0.2374, + "step": 1877 + }, + { + "epoch": 0.19463156803813866, + "grad_norm": 0.4552202820777893, + "learning_rate": 3.722518654017199e-05, + "loss": 0.218, + "step": 1878 + }, + { + "epoch": 0.19473520572080008, + "grad_norm": 0.48936206102371216, + "learning_rate": 3.7221773988185604e-05, + "loss": 0.2722, + "step": 1879 + }, + { + "epoch": 0.1948388434034615, + "grad_norm": 0.49154961109161377, + "learning_rate": 3.721835949567649e-05, + "loss": 0.2223, + "step": 1880 + }, + { + "epoch": 0.19494248108612292, + "grad_norm": 0.5503633618354797, + "learning_rate": 3.7214943063029395e-05, + "loss": 0.2764, + "step": 1881 + }, + { + "epoch": 0.19504611876878433, + "grad_norm": 0.46910014748573303, + "learning_rate": 3.721152469062928e-05, + "loss": 0.229, + "step": 1882 + }, + { + "epoch": 0.19514975645144575, + "grad_norm": 0.4484991431236267, + "learning_rate": 3.720810437886132e-05, + "loss": 0.2392, + "step": 1883 + }, + { + "epoch": 0.19525339413410717, + "grad_norm": 0.47034701704978943, + "learning_rate": 3.7204682128110905e-05, + "loss": 0.2275, + "step": 1884 + }, + { + "epoch": 0.19535703181676858, + "grad_norm": 0.42318418622016907, + "learning_rate": 3.7201257938763656e-05, + "loss": 0.2151, + "step": 1885 + }, + { + "epoch": 0.19546066949943, + "grad_norm": 0.5113075971603394, + "learning_rate": 3.719783181120541e-05, + "loss": 0.2369, + "step": 1886 + }, + { + "epoch": 0.19556430718209142, + "grad_norm": 0.46260493993759155, + "learning_rate": 3.719440374582219e-05, + "loss": 0.2378, + "step": 1887 + }, + { + "epoch": 0.19566794486475284, + "grad_norm": 0.427514910697937, + "learning_rate": 3.71909737430003e-05, + "loss": 0.2124, + "step": 1888 + }, + { + "epoch": 0.19577158254741425, + "grad_norm": 0.5521390438079834, + "learning_rate": 3.718754180312621e-05, + "loss": 0.2562, + "step": 1889 + }, + { + "epoch": 0.19587522023007564, + "grad_norm": 0.3623069226741791, + "learning_rate": 3.718410792658663e-05, + "loss": 0.1583, + "step": 1890 + }, + { + "epoch": 0.19597885791273706, + "grad_norm": 0.4941210448741913, + "learning_rate": 3.718067211376848e-05, + "loss": 0.2501, + "step": 1891 + }, + { + "epoch": 0.19608249559539848, + "grad_norm": 0.4543443024158478, + "learning_rate": 3.717723436505891e-05, + "loss": 0.2043, + "step": 1892 + }, + { + "epoch": 0.1961861332780599, + "grad_norm": 0.42174211144447327, + "learning_rate": 3.717379468084526e-05, + "loss": 0.2133, + "step": 1893 + }, + { + "epoch": 0.1962897709607213, + "grad_norm": 0.5058109164237976, + "learning_rate": 3.7170353061515135e-05, + "loss": 0.2495, + "step": 1894 + }, + { + "epoch": 0.19639340864338273, + "grad_norm": 0.43948596715927124, + "learning_rate": 3.716690950745632e-05, + "loss": 0.2201, + "step": 1895 + }, + { + "epoch": 0.19649704632604414, + "grad_norm": 0.48528286814689636, + "learning_rate": 3.7163464019056824e-05, + "loss": 0.2503, + "step": 1896 + }, + { + "epoch": 0.19660068400870556, + "grad_norm": 0.4711983799934387, + "learning_rate": 3.7160016596704876e-05, + "loss": 0.2445, + "step": 1897 + }, + { + "epoch": 0.19670432169136698, + "grad_norm": 0.5053095817565918, + "learning_rate": 3.715656724078894e-05, + "loss": 0.2604, + "step": 1898 + }, + { + "epoch": 0.1968079593740284, + "grad_norm": 0.5295904278755188, + "learning_rate": 3.715311595169768e-05, + "loss": 0.2671, + "step": 1899 + }, + { + "epoch": 0.1969115970566898, + "grad_norm": 0.494743287563324, + "learning_rate": 3.7149662729819976e-05, + "loss": 0.2782, + "step": 1900 + }, + { + "epoch": 0.19701523473935123, + "grad_norm": 0.4124172627925873, + "learning_rate": 3.7146207575544935e-05, + "loss": 0.1894, + "step": 1901 + }, + { + "epoch": 0.19711887242201265, + "grad_norm": 0.457582950592041, + "learning_rate": 3.714275048926188e-05, + "loss": 0.2048, + "step": 1902 + }, + { + "epoch": 0.19722251010467406, + "grad_norm": 0.5982666611671448, + "learning_rate": 3.713929147136035e-05, + "loss": 0.2634, + "step": 1903 + }, + { + "epoch": 0.19732614778733548, + "grad_norm": 0.49243828654289246, + "learning_rate": 3.71358305222301e-05, + "loss": 0.237, + "step": 1904 + }, + { + "epoch": 0.1974297854699969, + "grad_norm": 0.5259522199630737, + "learning_rate": 3.7132367642261106e-05, + "loss": 0.2442, + "step": 1905 + }, + { + "epoch": 0.19753342315265832, + "grad_norm": 0.4355629086494446, + "learning_rate": 3.7128902831843554e-05, + "loss": 0.2314, + "step": 1906 + }, + { + "epoch": 0.19763706083531973, + "grad_norm": 0.4574223458766937, + "learning_rate": 3.7125436091367866e-05, + "loss": 0.2163, + "step": 1907 + }, + { + "epoch": 0.19774069851798115, + "grad_norm": 0.480646014213562, + "learning_rate": 3.712196742122466e-05, + "loss": 0.2259, + "step": 1908 + }, + { + "epoch": 0.19784433620064254, + "grad_norm": 0.5398069024085999, + "learning_rate": 3.711849682180477e-05, + "loss": 0.2917, + "step": 1909 + }, + { + "epoch": 0.19794797388330396, + "grad_norm": 0.4341377019882202, + "learning_rate": 3.711502429349928e-05, + "loss": 0.2098, + "step": 1910 + }, + { + "epoch": 0.19805161156596537, + "grad_norm": 0.47715920209884644, + "learning_rate": 3.7111549836699456e-05, + "loss": 0.2335, + "step": 1911 + }, + { + "epoch": 0.1981552492486268, + "grad_norm": 0.4509512186050415, + "learning_rate": 3.71080734517968e-05, + "loss": 0.212, + "step": 1912 + }, + { + "epoch": 0.1982588869312882, + "grad_norm": 0.438888818025589, + "learning_rate": 3.7104595139183014e-05, + "loss": 0.237, + "step": 1913 + }, + { + "epoch": 0.19836252461394963, + "grad_norm": 0.4621768593788147, + "learning_rate": 3.710111489925004e-05, + "loss": 0.2551, + "step": 1914 + }, + { + "epoch": 0.19846616229661104, + "grad_norm": 0.5049877762794495, + "learning_rate": 3.709763273239003e-05, + "loss": 0.2512, + "step": 1915 + }, + { + "epoch": 0.19856979997927246, + "grad_norm": 0.4743228554725647, + "learning_rate": 3.709414863899534e-05, + "loss": 0.2509, + "step": 1916 + }, + { + "epoch": 0.19867343766193388, + "grad_norm": 0.44488999247550964, + "learning_rate": 3.7090662619458555e-05, + "loss": 0.2225, + "step": 1917 + }, + { + "epoch": 0.1987770753445953, + "grad_norm": 0.45012685656547546, + "learning_rate": 3.708717467417248e-05, + "loss": 0.2298, + "step": 1918 + }, + { + "epoch": 0.1988807130272567, + "grad_norm": 0.5009463429450989, + "learning_rate": 3.708368480353011e-05, + "loss": 0.2378, + "step": 1919 + }, + { + "epoch": 0.19898435070991813, + "grad_norm": 0.5277925133705139, + "learning_rate": 3.70801930079247e-05, + "loss": 0.2575, + "step": 1920 + }, + { + "epoch": 0.19908798839257955, + "grad_norm": 0.4720187187194824, + "learning_rate": 3.707669928774969e-05, + "loss": 0.2611, + "step": 1921 + }, + { + "epoch": 0.19919162607524096, + "grad_norm": 0.5206968188285828, + "learning_rate": 3.7073203643398764e-05, + "loss": 0.2704, + "step": 1922 + }, + { + "epoch": 0.19929526375790238, + "grad_norm": 0.5649044513702393, + "learning_rate": 3.706970607526578e-05, + "loss": 0.2472, + "step": 1923 + }, + { + "epoch": 0.1993989014405638, + "grad_norm": 0.4891642928123474, + "learning_rate": 3.7066206583744855e-05, + "loss": 0.2403, + "step": 1924 + }, + { + "epoch": 0.19950253912322521, + "grad_norm": 0.46503427624702454, + "learning_rate": 3.70627051692303e-05, + "loss": 0.2065, + "step": 1925 + }, + { + "epoch": 0.19960617680588663, + "grad_norm": 0.567818820476532, + "learning_rate": 3.705920183211666e-05, + "loss": 0.3012, + "step": 1926 + }, + { + "epoch": 0.19970981448854805, + "grad_norm": 0.49717381596565247, + "learning_rate": 3.705569657279866e-05, + "loss": 0.2438, + "step": 1927 + }, + { + "epoch": 0.19981345217120944, + "grad_norm": 0.5408019423484802, + "learning_rate": 3.7052189391671295e-05, + "loss": 0.2561, + "step": 1928 + }, + { + "epoch": 0.19991708985387086, + "grad_norm": 0.4729301333427429, + "learning_rate": 3.704868028912974e-05, + "loss": 0.2296, + "step": 1929 + }, + { + "epoch": 0.20002072753653227, + "grad_norm": 0.4479348361492157, + "learning_rate": 3.7045169265569384e-05, + "loss": 0.2332, + "step": 1930 + }, + { + "epoch": 0.2001243652191937, + "grad_norm": 0.4986315071582794, + "learning_rate": 3.7041656321385857e-05, + "loss": 0.2766, + "step": 1931 + }, + { + "epoch": 0.2002280029018551, + "grad_norm": 0.49219274520874023, + "learning_rate": 3.7038141456974986e-05, + "loss": 0.2417, + "step": 1932 + }, + { + "epoch": 0.20033164058451652, + "grad_norm": 0.4674849212169647, + "learning_rate": 3.703462467273282e-05, + "loss": 0.2562, + "step": 1933 + }, + { + "epoch": 0.20043527826717794, + "grad_norm": 0.4261247217655182, + "learning_rate": 3.703110596905563e-05, + "loss": 0.2184, + "step": 1934 + }, + { + "epoch": 0.20053891594983936, + "grad_norm": 0.40725192427635193, + "learning_rate": 3.70275853463399e-05, + "loss": 0.1822, + "step": 1935 + }, + { + "epoch": 0.20064255363250078, + "grad_norm": 0.5302891135215759, + "learning_rate": 3.7024062804982315e-05, + "loss": 0.2622, + "step": 1936 + }, + { + "epoch": 0.2007461913151622, + "grad_norm": 0.4687095284461975, + "learning_rate": 3.70205383453798e-05, + "loss": 0.2528, + "step": 1937 + }, + { + "epoch": 0.2008498289978236, + "grad_norm": 0.43511658906936646, + "learning_rate": 3.7017011967929484e-05, + "loss": 0.2163, + "step": 1938 + }, + { + "epoch": 0.20095346668048503, + "grad_norm": 0.40954121947288513, + "learning_rate": 3.701348367302871e-05, + "loss": 0.2077, + "step": 1939 + }, + { + "epoch": 0.20105710436314644, + "grad_norm": 0.504981279373169, + "learning_rate": 3.7009953461075044e-05, + "loss": 0.2458, + "step": 1940 + }, + { + "epoch": 0.20116074204580786, + "grad_norm": 0.4793414771556854, + "learning_rate": 3.700642133246627e-05, + "loss": 0.231, + "step": 1941 + }, + { + "epoch": 0.20126437972846928, + "grad_norm": 0.5226635336875916, + "learning_rate": 3.700288728760037e-05, + "loss": 0.242, + "step": 1942 + }, + { + "epoch": 0.2013680174111307, + "grad_norm": 0.5597573518753052, + "learning_rate": 3.699935132687556e-05, + "loss": 0.3137, + "step": 1943 + }, + { + "epoch": 0.2014716550937921, + "grad_norm": 0.4148508608341217, + "learning_rate": 3.699581345069028e-05, + "loss": 0.1813, + "step": 1944 + }, + { + "epoch": 0.20157529277645353, + "grad_norm": 0.49836692214012146, + "learning_rate": 3.699227365944316e-05, + "loss": 0.2657, + "step": 1945 + }, + { + "epoch": 0.20167893045911495, + "grad_norm": 0.4882723093032837, + "learning_rate": 3.698873195353305e-05, + "loss": 0.2314, + "step": 1946 + }, + { + "epoch": 0.20178256814177634, + "grad_norm": 0.4234292507171631, + "learning_rate": 3.698518833335904e-05, + "loss": 0.2216, + "step": 1947 + }, + { + "epoch": 0.20188620582443775, + "grad_norm": 0.48601916432380676, + "learning_rate": 3.69816427993204e-05, + "loss": 0.2464, + "step": 1948 + }, + { + "epoch": 0.20198984350709917, + "grad_norm": 0.46592000126838684, + "learning_rate": 3.6978095351816656e-05, + "loss": 0.2274, + "step": 1949 + }, + { + "epoch": 0.2020934811897606, + "grad_norm": 0.4673402011394501, + "learning_rate": 3.697454599124753e-05, + "loss": 0.2211, + "step": 1950 + }, + { + "epoch": 0.202197118872422, + "grad_norm": 0.4336487948894501, + "learning_rate": 3.697099471801294e-05, + "loss": 0.2031, + "step": 1951 + }, + { + "epoch": 0.20230075655508342, + "grad_norm": 0.5105946063995361, + "learning_rate": 3.6967441532513046e-05, + "loss": 0.2742, + "step": 1952 + }, + { + "epoch": 0.20240439423774484, + "grad_norm": 0.46542996168136597, + "learning_rate": 3.696388643514822e-05, + "loss": 0.2272, + "step": 1953 + }, + { + "epoch": 0.20250803192040626, + "grad_norm": 0.48160192370414734, + "learning_rate": 3.696032942631904e-05, + "loss": 0.2484, + "step": 1954 + }, + { + "epoch": 0.20261166960306767, + "grad_norm": 0.5089412927627563, + "learning_rate": 3.6956770506426304e-05, + "loss": 0.2255, + "step": 1955 + }, + { + "epoch": 0.2027153072857291, + "grad_norm": 0.5144649147987366, + "learning_rate": 3.695320967587103e-05, + "loss": 0.2596, + "step": 1956 + }, + { + "epoch": 0.2028189449683905, + "grad_norm": 0.4798566997051239, + "learning_rate": 3.6949646935054445e-05, + "loss": 0.2526, + "step": 1957 + }, + { + "epoch": 0.20292258265105192, + "grad_norm": 0.39153149724006653, + "learning_rate": 3.694608228437798e-05, + "loss": 0.1896, + "step": 1958 + }, + { + "epoch": 0.20302622033371334, + "grad_norm": 0.47644275426864624, + "learning_rate": 3.6942515724243326e-05, + "loss": 0.2485, + "step": 1959 + }, + { + "epoch": 0.20312985801637476, + "grad_norm": 0.39403602480888367, + "learning_rate": 3.693894725505232e-05, + "loss": 0.1817, + "step": 1960 + }, + { + "epoch": 0.20323349569903618, + "grad_norm": 0.48983171582221985, + "learning_rate": 3.6935376877207086e-05, + "loss": 0.232, + "step": 1961 + }, + { + "epoch": 0.2033371333816976, + "grad_norm": 0.5000522136688232, + "learning_rate": 3.69318045911099e-05, + "loss": 0.2101, + "step": 1962 + }, + { + "epoch": 0.203440771064359, + "grad_norm": 0.47150513529777527, + "learning_rate": 3.69282303971633e-05, + "loss": 0.2131, + "step": 1963 + }, + { + "epoch": 0.20354440874702043, + "grad_norm": 0.44279807806015015, + "learning_rate": 3.692465429577001e-05, + "loss": 0.2111, + "step": 1964 + }, + { + "epoch": 0.20364804642968184, + "grad_norm": 0.49045294523239136, + "learning_rate": 3.6921076287332985e-05, + "loss": 0.2326, + "step": 1965 + }, + { + "epoch": 0.20375168411234323, + "grad_norm": 0.4297841489315033, + "learning_rate": 3.691749637225539e-05, + "loss": 0.2008, + "step": 1966 + }, + { + "epoch": 0.20385532179500465, + "grad_norm": 0.51743483543396, + "learning_rate": 3.69139145509406e-05, + "loss": 0.2289, + "step": 1967 + }, + { + "epoch": 0.20395895947766607, + "grad_norm": 0.4800795316696167, + "learning_rate": 3.691033082379221e-05, + "loss": 0.2418, + "step": 1968 + }, + { + "epoch": 0.20406259716032749, + "grad_norm": 0.4997004270553589, + "learning_rate": 3.6906745191214035e-05, + "loss": 0.2497, + "step": 1969 + }, + { + "epoch": 0.2041662348429889, + "grad_norm": 0.5265177488327026, + "learning_rate": 3.690315765361009e-05, + "loss": 0.2331, + "step": 1970 + }, + { + "epoch": 0.20426987252565032, + "grad_norm": 0.4550437033176422, + "learning_rate": 3.689956821138462e-05, + "loss": 0.2309, + "step": 1971 + }, + { + "epoch": 0.20437351020831174, + "grad_norm": 0.4945697486400604, + "learning_rate": 3.689597686494208e-05, + "loss": 0.2477, + "step": 1972 + }, + { + "epoch": 0.20447714789097315, + "grad_norm": 0.44872772693634033, + "learning_rate": 3.689238361468712e-05, + "loss": 0.2023, + "step": 1973 + }, + { + "epoch": 0.20458078557363457, + "grad_norm": 0.4784891605377197, + "learning_rate": 3.6888788461024636e-05, + "loss": 0.2607, + "step": 1974 + }, + { + "epoch": 0.204684423256296, + "grad_norm": 0.3880353271961212, + "learning_rate": 3.6885191404359725e-05, + "loss": 0.2065, + "step": 1975 + }, + { + "epoch": 0.2047880609389574, + "grad_norm": 0.5710169672966003, + "learning_rate": 3.68815924450977e-05, + "loss": 0.2543, + "step": 1976 + }, + { + "epoch": 0.20489169862161882, + "grad_norm": 0.505615234375, + "learning_rate": 3.687799158364408e-05, + "loss": 0.2596, + "step": 1977 + }, + { + "epoch": 0.20499533630428024, + "grad_norm": 0.48438864946365356, + "learning_rate": 3.6874388820404604e-05, + "loss": 0.2379, + "step": 1978 + }, + { + "epoch": 0.20509897398694166, + "grad_norm": 0.4900949001312256, + "learning_rate": 3.6870784155785225e-05, + "loss": 0.245, + "step": 1979 + }, + { + "epoch": 0.20520261166960307, + "grad_norm": 0.5086212754249573, + "learning_rate": 3.686717759019212e-05, + "loss": 0.2613, + "step": 1980 + }, + { + "epoch": 0.2053062493522645, + "grad_norm": 0.495339035987854, + "learning_rate": 3.686356912403166e-05, + "loss": 0.2395, + "step": 1981 + }, + { + "epoch": 0.2054098870349259, + "grad_norm": 0.4228864908218384, + "learning_rate": 3.6859958757710444e-05, + "loss": 0.2175, + "step": 1982 + }, + { + "epoch": 0.20551352471758733, + "grad_norm": 0.5192626714706421, + "learning_rate": 3.685634649163529e-05, + "loss": 0.2428, + "step": 1983 + }, + { + "epoch": 0.20561716240024874, + "grad_norm": 0.5048412680625916, + "learning_rate": 3.6852732326213206e-05, + "loss": 0.2664, + "step": 1984 + }, + { + "epoch": 0.20572080008291013, + "grad_norm": 0.45641231536865234, + "learning_rate": 3.684911626185146e-05, + "loss": 0.2276, + "step": 1985 + }, + { + "epoch": 0.20582443776557155, + "grad_norm": 0.42920756340026855, + "learning_rate": 3.6845498298957466e-05, + "loss": 0.2067, + "step": 1986 + }, + { + "epoch": 0.20592807544823297, + "grad_norm": 0.5071409344673157, + "learning_rate": 3.684187843793892e-05, + "loss": 0.2515, + "step": 1987 + }, + { + "epoch": 0.20603171313089438, + "grad_norm": 0.4680119752883911, + "learning_rate": 3.68382566792037e-05, + "loss": 0.2415, + "step": 1988 + }, + { + "epoch": 0.2061353508135558, + "grad_norm": 0.49129483103752136, + "learning_rate": 3.6834633023159885e-05, + "loss": 0.2643, + "step": 1989 + }, + { + "epoch": 0.20623898849621722, + "grad_norm": 0.4970560669898987, + "learning_rate": 3.6831007470215785e-05, + "loss": 0.2455, + "step": 1990 + }, + { + "epoch": 0.20634262617887864, + "grad_norm": 0.4532671868801117, + "learning_rate": 3.682738002077994e-05, + "loss": 0.2252, + "step": 1991 + }, + { + "epoch": 0.20644626386154005, + "grad_norm": 0.47092416882514954, + "learning_rate": 3.6823750675261064e-05, + "loss": 0.2445, + "step": 1992 + }, + { + "epoch": 0.20654990154420147, + "grad_norm": 0.5188236832618713, + "learning_rate": 3.682011943406812e-05, + "loss": 0.2674, + "step": 1993 + }, + { + "epoch": 0.2066535392268629, + "grad_norm": 0.4372886121273041, + "learning_rate": 3.681648629761026e-05, + "loss": 0.1847, + "step": 1994 + }, + { + "epoch": 0.2067571769095243, + "grad_norm": 0.44967779517173767, + "learning_rate": 3.6812851266296866e-05, + "loss": 0.2293, + "step": 1995 + }, + { + "epoch": 0.20686081459218572, + "grad_norm": 0.515315592288971, + "learning_rate": 3.680921434053753e-05, + "loss": 0.246, + "step": 1996 + }, + { + "epoch": 0.20696445227484714, + "grad_norm": 0.49991822242736816, + "learning_rate": 3.6805575520742057e-05, + "loss": 0.2767, + "step": 1997 + }, + { + "epoch": 0.20706808995750856, + "grad_norm": 0.43403372168540955, + "learning_rate": 3.6801934807320455e-05, + "loss": 0.2123, + "step": 1998 + }, + { + "epoch": 0.20717172764016997, + "grad_norm": 0.49474579095840454, + "learning_rate": 3.679829220068296e-05, + "loss": 0.2626, + "step": 1999 + }, + { + "epoch": 0.2072753653228314, + "grad_norm": 0.45910385251045227, + "learning_rate": 3.679464770124001e-05, + "loss": 0.2441, + "step": 2000 + }, + { + "epoch": 0.2073790030054928, + "grad_norm": 0.45843109488487244, + "learning_rate": 3.679100130940227e-05, + "loss": 0.2406, + "step": 2001 + }, + { + "epoch": 0.20748264068815422, + "grad_norm": 0.478091299533844, + "learning_rate": 3.6787353025580596e-05, + "loss": 0.2225, + "step": 2002 + }, + { + "epoch": 0.20758627837081564, + "grad_norm": 0.49887460470199585, + "learning_rate": 3.678370285018608e-05, + "loss": 0.2616, + "step": 2003 + }, + { + "epoch": 0.20768991605347703, + "grad_norm": 0.4933151304721832, + "learning_rate": 3.6780050783630024e-05, + "loss": 0.2249, + "step": 2004 + }, + { + "epoch": 0.20779355373613845, + "grad_norm": 0.4972792863845825, + "learning_rate": 3.6776396826323925e-05, + "loss": 0.2304, + "step": 2005 + }, + { + "epoch": 0.20789719141879986, + "grad_norm": 0.4850703775882721, + "learning_rate": 3.6772740978679517e-05, + "loss": 0.2548, + "step": 2006 + }, + { + "epoch": 0.20800082910146128, + "grad_norm": 0.5416536927223206, + "learning_rate": 3.676908324110873e-05, + "loss": 0.2829, + "step": 2007 + }, + { + "epoch": 0.2081044667841227, + "grad_norm": 0.46809735894203186, + "learning_rate": 3.676542361402371e-05, + "loss": 0.2198, + "step": 2008 + }, + { + "epoch": 0.20820810446678412, + "grad_norm": 0.44908079504966736, + "learning_rate": 3.676176209783681e-05, + "loss": 0.2197, + "step": 2009 + }, + { + "epoch": 0.20831174214944553, + "grad_norm": 0.5226306319236755, + "learning_rate": 3.675809869296063e-05, + "loss": 0.2744, + "step": 2010 + }, + { + "epoch": 0.20841537983210695, + "grad_norm": 0.5002926588058472, + "learning_rate": 3.6754433399807925e-05, + "loss": 0.2611, + "step": 2011 + }, + { + "epoch": 0.20851901751476837, + "grad_norm": 0.5227557420730591, + "learning_rate": 3.675076621879172e-05, + "loss": 0.253, + "step": 2012 + }, + { + "epoch": 0.20862265519742978, + "grad_norm": 0.5399764180183411, + "learning_rate": 3.674709715032521e-05, + "loss": 0.2428, + "step": 2013 + }, + { + "epoch": 0.2087262928800912, + "grad_norm": 0.5577982664108276, + "learning_rate": 3.6743426194821836e-05, + "loss": 0.2532, + "step": 2014 + }, + { + "epoch": 0.20882993056275262, + "grad_norm": 0.4555790424346924, + "learning_rate": 3.6739753352695224e-05, + "loss": 0.218, + "step": 2015 + }, + { + "epoch": 0.20893356824541404, + "grad_norm": 0.4721558392047882, + "learning_rate": 3.6736078624359216e-05, + "loss": 0.206, + "step": 2016 + }, + { + "epoch": 0.20903720592807545, + "grad_norm": 0.6262646317481995, + "learning_rate": 3.6732402010227895e-05, + "loss": 0.2736, + "step": 2017 + }, + { + "epoch": 0.20914084361073687, + "grad_norm": 0.4947630763053894, + "learning_rate": 3.672872351071552e-05, + "loss": 0.229, + "step": 2018 + }, + { + "epoch": 0.2092444812933983, + "grad_norm": 0.46532395482063293, + "learning_rate": 3.6725043126236596e-05, + "loss": 0.2402, + "step": 2019 + }, + { + "epoch": 0.2093481189760597, + "grad_norm": 0.4798278212547302, + "learning_rate": 3.67213608572058e-05, + "loss": 0.2595, + "step": 2020 + }, + { + "epoch": 0.20945175665872112, + "grad_norm": 0.4924963116645813, + "learning_rate": 3.671767670403807e-05, + "loss": 0.2451, + "step": 2021 + }, + { + "epoch": 0.20955539434138254, + "grad_norm": 0.4918677806854248, + "learning_rate": 3.6713990667148507e-05, + "loss": 0.229, + "step": 2022 + }, + { + "epoch": 0.20965903202404393, + "grad_norm": 0.4604406952857971, + "learning_rate": 3.6710302746952466e-05, + "loss": 0.2251, + "step": 2023 + }, + { + "epoch": 0.20976266970670535, + "grad_norm": 0.4431096017360687, + "learning_rate": 3.670661294386548e-05, + "loss": 0.2266, + "step": 2024 + }, + { + "epoch": 0.20986630738936676, + "grad_norm": 0.49358218908309937, + "learning_rate": 3.670292125830332e-05, + "loss": 0.2289, + "step": 2025 + }, + { + "epoch": 0.20996994507202818, + "grad_norm": 0.47046852111816406, + "learning_rate": 3.669922769068196e-05, + "loss": 0.2291, + "step": 2026 + }, + { + "epoch": 0.2100735827546896, + "grad_norm": 0.601206362247467, + "learning_rate": 3.669553224141758e-05, + "loss": 0.2383, + "step": 2027 + }, + { + "epoch": 0.21017722043735101, + "grad_norm": 0.5137280225753784, + "learning_rate": 3.669183491092658e-05, + "loss": 0.2755, + "step": 2028 + }, + { + "epoch": 0.21028085812001243, + "grad_norm": 0.5034176707267761, + "learning_rate": 3.668813569962557e-05, + "loss": 0.2303, + "step": 2029 + }, + { + "epoch": 0.21038449580267385, + "grad_norm": 0.42328396439552307, + "learning_rate": 3.668443460793138e-05, + "loss": 0.1977, + "step": 2030 + }, + { + "epoch": 0.21048813348533527, + "grad_norm": 0.5424888730049133, + "learning_rate": 3.668073163626103e-05, + "loss": 0.2446, + "step": 2031 + }, + { + "epoch": 0.21059177116799668, + "grad_norm": 0.43510910868644714, + "learning_rate": 3.667702678503177e-05, + "loss": 0.2236, + "step": 2032 + }, + { + "epoch": 0.2106954088506581, + "grad_norm": 0.4465221166610718, + "learning_rate": 3.667332005466105e-05, + "loss": 0.2022, + "step": 2033 + }, + { + "epoch": 0.21079904653331952, + "grad_norm": 0.45926061272621155, + "learning_rate": 3.666961144556655e-05, + "loss": 0.2257, + "step": 2034 + }, + { + "epoch": 0.21090268421598093, + "grad_norm": 0.5046967267990112, + "learning_rate": 3.6665900958166154e-05, + "loss": 0.2671, + "step": 2035 + }, + { + "epoch": 0.21100632189864235, + "grad_norm": 0.4858223795890808, + "learning_rate": 3.6662188592877936e-05, + "loss": 0.2366, + "step": 2036 + }, + { + "epoch": 0.21110995958130377, + "grad_norm": 0.4389292299747467, + "learning_rate": 3.665847435012022e-05, + "loss": 0.2114, + "step": 2037 + }, + { + "epoch": 0.21121359726396519, + "grad_norm": 0.5166811943054199, + "learning_rate": 3.66547582303115e-05, + "loss": 0.2513, + "step": 2038 + }, + { + "epoch": 0.2113172349466266, + "grad_norm": 0.43512311577796936, + "learning_rate": 3.6651040233870514e-05, + "loss": 0.2319, + "step": 2039 + }, + { + "epoch": 0.21142087262928802, + "grad_norm": 0.5205447673797607, + "learning_rate": 3.66473203612162e-05, + "loss": 0.2404, + "step": 2040 + }, + { + "epoch": 0.21152451031194944, + "grad_norm": 0.5103970170021057, + "learning_rate": 3.66435986127677e-05, + "loss": 0.2808, + "step": 2041 + }, + { + "epoch": 0.21162814799461083, + "grad_norm": 0.47343921661376953, + "learning_rate": 3.66398749889444e-05, + "loss": 0.2526, + "step": 2042 + }, + { + "epoch": 0.21173178567727224, + "grad_norm": 0.4443468451499939, + "learning_rate": 3.663614949016584e-05, + "loss": 0.2371, + "step": 2043 + }, + { + "epoch": 0.21183542335993366, + "grad_norm": 0.4487053155899048, + "learning_rate": 3.663242211685181e-05, + "loss": 0.217, + "step": 2044 + }, + { + "epoch": 0.21193906104259508, + "grad_norm": 0.41238850355148315, + "learning_rate": 3.662869286942233e-05, + "loss": 0.1768, + "step": 2045 + }, + { + "epoch": 0.2120426987252565, + "grad_norm": 0.5283915996551514, + "learning_rate": 3.662496174829757e-05, + "loss": 0.2673, + "step": 2046 + }, + { + "epoch": 0.2121463364079179, + "grad_norm": 0.3949659466743469, + "learning_rate": 3.662122875389797e-05, + "loss": 0.1958, + "step": 2047 + }, + { + "epoch": 0.21224997409057933, + "grad_norm": 0.49384358525276184, + "learning_rate": 3.661749388664416e-05, + "loss": 0.2489, + "step": 2048 + }, + { + "epoch": 0.21235361177324075, + "grad_norm": 0.43529611825942993, + "learning_rate": 3.6613757146956964e-05, + "loss": 0.2323, + "step": 2049 + }, + { + "epoch": 0.21245724945590216, + "grad_norm": 0.46041977405548096, + "learning_rate": 3.661001853525744e-05, + "loss": 0.1857, + "step": 2050 + }, + { + "epoch": 0.21256088713856358, + "grad_norm": 0.5310932397842407, + "learning_rate": 3.660627805196685e-05, + "loss": 0.2152, + "step": 2051 + }, + { + "epoch": 0.212664524821225, + "grad_norm": 0.4441661238670349, + "learning_rate": 3.660253569750666e-05, + "loss": 0.2208, + "step": 2052 + }, + { + "epoch": 0.21276816250388642, + "grad_norm": 0.5119684338569641, + "learning_rate": 3.659879147229856e-05, + "loss": 0.197, + "step": 2053 + }, + { + "epoch": 0.21287180018654783, + "grad_norm": 0.5342808961868286, + "learning_rate": 3.659504537676444e-05, + "loss": 0.2501, + "step": 2054 + }, + { + "epoch": 0.21297543786920925, + "grad_norm": 0.453165203332901, + "learning_rate": 3.6591297411326404e-05, + "loss": 0.2131, + "step": 2055 + }, + { + "epoch": 0.21307907555187067, + "grad_norm": 0.5051000118255615, + "learning_rate": 3.6587547576406764e-05, + "loss": 0.229, + "step": 2056 + }, + { + "epoch": 0.21318271323453208, + "grad_norm": 0.49012821912765503, + "learning_rate": 3.658379587242805e-05, + "loss": 0.2067, + "step": 2057 + }, + { + "epoch": 0.2132863509171935, + "grad_norm": 0.4396739602088928, + "learning_rate": 3.6580042299813004e-05, + "loss": 0.203, + "step": 2058 + }, + { + "epoch": 0.21338998859985492, + "grad_norm": 0.5717998743057251, + "learning_rate": 3.657628685898456e-05, + "loss": 0.2327, + "step": 2059 + }, + { + "epoch": 0.21349362628251634, + "grad_norm": 0.4730100929737091, + "learning_rate": 3.657252955036588e-05, + "loss": 0.2186, + "step": 2060 + }, + { + "epoch": 0.21359726396517772, + "grad_norm": 0.5230618119239807, + "learning_rate": 3.6568770374380335e-05, + "loss": 0.2373, + "step": 2061 + }, + { + "epoch": 0.21370090164783914, + "grad_norm": 0.4315328299999237, + "learning_rate": 3.6565009331451505e-05, + "loss": 0.178, + "step": 2062 + }, + { + "epoch": 0.21380453933050056, + "grad_norm": 0.5424378514289856, + "learning_rate": 3.656124642200316e-05, + "loss": 0.2195, + "step": 2063 + }, + { + "epoch": 0.21390817701316198, + "grad_norm": 0.475759357213974, + "learning_rate": 3.6557481646459325e-05, + "loss": 0.2354, + "step": 2064 + }, + { + "epoch": 0.2140118146958234, + "grad_norm": 0.44949331879615784, + "learning_rate": 3.65537150052442e-05, + "loss": 0.2196, + "step": 2065 + }, + { + "epoch": 0.2141154523784848, + "grad_norm": 0.4517166316509247, + "learning_rate": 3.6549946498782195e-05, + "loss": 0.208, + "step": 2066 + }, + { + "epoch": 0.21421909006114623, + "grad_norm": 0.5226971507072449, + "learning_rate": 3.6546176127497954e-05, + "loss": 0.255, + "step": 2067 + }, + { + "epoch": 0.21432272774380765, + "grad_norm": 0.5534842014312744, + "learning_rate": 3.6542403891816303e-05, + "loss": 0.2933, + "step": 2068 + }, + { + "epoch": 0.21442636542646906, + "grad_norm": 0.46718254685401917, + "learning_rate": 3.65386297921623e-05, + "loss": 0.2338, + "step": 2069 + }, + { + "epoch": 0.21453000310913048, + "grad_norm": 0.5315462946891785, + "learning_rate": 3.6534853828961194e-05, + "loss": 0.248, + "step": 2070 + }, + { + "epoch": 0.2146336407917919, + "grad_norm": 0.5207595825195312, + "learning_rate": 3.653107600263846e-05, + "loss": 0.2666, + "step": 2071 + }, + { + "epoch": 0.2147372784744533, + "grad_norm": 0.4615035951137543, + "learning_rate": 3.652729631361979e-05, + "loss": 0.2273, + "step": 2072 + }, + { + "epoch": 0.21484091615711473, + "grad_norm": 0.4617545008659363, + "learning_rate": 3.652351476233106e-05, + "loss": 0.2371, + "step": 2073 + }, + { + "epoch": 0.21494455383977615, + "grad_norm": 0.5457558631896973, + "learning_rate": 3.651973134919837e-05, + "loss": 0.2653, + "step": 2074 + }, + { + "epoch": 0.21504819152243757, + "grad_norm": 0.5435750484466553, + "learning_rate": 3.651594607464804e-05, + "loss": 0.2927, + "step": 2075 + }, + { + "epoch": 0.21515182920509898, + "grad_norm": 0.4262548089027405, + "learning_rate": 3.651215893910657e-05, + "loss": 0.2037, + "step": 2076 + }, + { + "epoch": 0.2152554668877604, + "grad_norm": 0.4520435631275177, + "learning_rate": 3.650836994300071e-05, + "loss": 0.2371, + "step": 2077 + }, + { + "epoch": 0.21535910457042182, + "grad_norm": 0.4527454972267151, + "learning_rate": 3.650457908675738e-05, + "loss": 0.2498, + "step": 2078 + }, + { + "epoch": 0.21546274225308323, + "grad_norm": 0.45087677240371704, + "learning_rate": 3.650078637080374e-05, + "loss": 0.2273, + "step": 2079 + }, + { + "epoch": 0.21556637993574462, + "grad_norm": 0.49029573798179626, + "learning_rate": 3.6496991795567146e-05, + "loss": 0.2562, + "step": 2080 + }, + { + "epoch": 0.21567001761840604, + "grad_norm": 0.46473249793052673, + "learning_rate": 3.649319536147515e-05, + "loss": 0.2418, + "step": 2081 + }, + { + "epoch": 0.21577365530106746, + "grad_norm": 0.54538494348526, + "learning_rate": 3.648939706895555e-05, + "loss": 0.2899, + "step": 2082 + }, + { + "epoch": 0.21587729298372887, + "grad_norm": 0.4631831645965576, + "learning_rate": 3.648559691843632e-05, + "loss": 0.2447, + "step": 2083 + }, + { + "epoch": 0.2159809306663903, + "grad_norm": 0.43592122197151184, + "learning_rate": 3.648179491034565e-05, + "loss": 0.2054, + "step": 2084 + }, + { + "epoch": 0.2160845683490517, + "grad_norm": 0.42652252316474915, + "learning_rate": 3.647799104511195e-05, + "loss": 0.1869, + "step": 2085 + }, + { + "epoch": 0.21618820603171313, + "grad_norm": 0.46224814653396606, + "learning_rate": 3.647418532316385e-05, + "loss": 0.2235, + "step": 2086 + }, + { + "epoch": 0.21629184371437454, + "grad_norm": 0.4874723553657532, + "learning_rate": 3.6470377744930145e-05, + "loss": 0.2386, + "step": 2087 + }, + { + "epoch": 0.21639548139703596, + "grad_norm": 0.5245859622955322, + "learning_rate": 3.646656831083988e-05, + "loss": 0.2737, + "step": 2088 + }, + { + "epoch": 0.21649911907969738, + "grad_norm": 0.462100088596344, + "learning_rate": 3.64627570213223e-05, + "loss": 0.2354, + "step": 2089 + }, + { + "epoch": 0.2166027567623588, + "grad_norm": 0.5441925525665283, + "learning_rate": 3.645894387680685e-05, + "loss": 0.2697, + "step": 2090 + }, + { + "epoch": 0.2167063944450202, + "grad_norm": 0.48411786556243896, + "learning_rate": 3.6455128877723186e-05, + "loss": 0.2207, + "step": 2091 + }, + { + "epoch": 0.21681003212768163, + "grad_norm": 0.41479653120040894, + "learning_rate": 3.645131202450119e-05, + "loss": 0.1996, + "step": 2092 + }, + { + "epoch": 0.21691366981034305, + "grad_norm": 0.474609911441803, + "learning_rate": 3.6447493317570914e-05, + "loss": 0.2263, + "step": 2093 + }, + { + "epoch": 0.21701730749300446, + "grad_norm": 0.442793071269989, + "learning_rate": 3.6443672757362666e-05, + "loss": 0.2181, + "step": 2094 + }, + { + "epoch": 0.21712094517566588, + "grad_norm": 0.5405283570289612, + "learning_rate": 3.643985034430693e-05, + "loss": 0.237, + "step": 2095 + }, + { + "epoch": 0.2172245828583273, + "grad_norm": 0.5367290377616882, + "learning_rate": 3.643602607883442e-05, + "loss": 0.2752, + "step": 2096 + }, + { + "epoch": 0.21732822054098871, + "grad_norm": 0.5797774195671082, + "learning_rate": 3.643219996137604e-05, + "loss": 0.2599, + "step": 2097 + }, + { + "epoch": 0.21743185822365013, + "grad_norm": 0.49853330850601196, + "learning_rate": 3.6428371992362916e-05, + "loss": 0.2604, + "step": 2098 + }, + { + "epoch": 0.21753549590631152, + "grad_norm": 0.4744069576263428, + "learning_rate": 3.642454217222637e-05, + "loss": 0.2321, + "step": 2099 + }, + { + "epoch": 0.21763913358897294, + "grad_norm": 0.5347335338592529, + "learning_rate": 3.6420710501397944e-05, + "loss": 0.2782, + "step": 2100 + }, + { + "epoch": 0.21774277127163436, + "grad_norm": 0.4640110433101654, + "learning_rate": 3.641687698030938e-05, + "loss": 0.2306, + "step": 2101 + }, + { + "epoch": 0.21784640895429577, + "grad_norm": 0.48161885142326355, + "learning_rate": 3.641304160939265e-05, + "loss": 0.2285, + "step": 2102 + }, + { + "epoch": 0.2179500466369572, + "grad_norm": 0.5034273862838745, + "learning_rate": 3.6409204389079896e-05, + "loss": 0.2126, + "step": 2103 + }, + { + "epoch": 0.2180536843196186, + "grad_norm": 0.4994756877422333, + "learning_rate": 3.640536531980351e-05, + "loss": 0.27, + "step": 2104 + }, + { + "epoch": 0.21815732200228002, + "grad_norm": 0.44723254442214966, + "learning_rate": 3.6401524401996056e-05, + "loss": 0.2185, + "step": 2105 + }, + { + "epoch": 0.21826095968494144, + "grad_norm": 0.4936169385910034, + "learning_rate": 3.639768163609033e-05, + "loss": 0.2574, + "step": 2106 + }, + { + "epoch": 0.21836459736760286, + "grad_norm": 0.48400330543518066, + "learning_rate": 3.639383702251933e-05, + "loss": 0.2183, + "step": 2107 + }, + { + "epoch": 0.21846823505026428, + "grad_norm": 0.5214065313339233, + "learning_rate": 3.638999056171626e-05, + "loss": 0.2313, + "step": 2108 + }, + { + "epoch": 0.2185718727329257, + "grad_norm": 0.45542773604393005, + "learning_rate": 3.638614225411452e-05, + "loss": 0.2153, + "step": 2109 + }, + { + "epoch": 0.2186755104155871, + "grad_norm": 0.5218877196311951, + "learning_rate": 3.638229210014776e-05, + "loss": 0.2727, + "step": 2110 + }, + { + "epoch": 0.21877914809824853, + "grad_norm": 0.4047102928161621, + "learning_rate": 3.6378440100249785e-05, + "loss": 0.1887, + "step": 2111 + }, + { + "epoch": 0.21888278578090994, + "grad_norm": 0.4060623347759247, + "learning_rate": 3.637458625485464e-05, + "loss": 0.2128, + "step": 2112 + }, + { + "epoch": 0.21898642346357136, + "grad_norm": 0.4566936194896698, + "learning_rate": 3.637073056439657e-05, + "loss": 0.2326, + "step": 2113 + }, + { + "epoch": 0.21909006114623278, + "grad_norm": 0.39076003432273865, + "learning_rate": 3.636687302931003e-05, + "loss": 0.1933, + "step": 2114 + }, + { + "epoch": 0.2191936988288942, + "grad_norm": 0.5355175733566284, + "learning_rate": 3.636301365002968e-05, + "loss": 0.2741, + "step": 2115 + }, + { + "epoch": 0.2192973365115556, + "grad_norm": 0.5408210158348083, + "learning_rate": 3.6359152426990384e-05, + "loss": 0.2438, + "step": 2116 + }, + { + "epoch": 0.21940097419421703, + "grad_norm": 0.535264253616333, + "learning_rate": 3.6355289360627236e-05, + "loss": 0.2508, + "step": 2117 + }, + { + "epoch": 0.21950461187687842, + "grad_norm": 0.47466620802879333, + "learning_rate": 3.6351424451375494e-05, + "loss": 0.2343, + "step": 2118 + }, + { + "epoch": 0.21960824955953984, + "grad_norm": 0.5331148505210876, + "learning_rate": 3.6347557699670675e-05, + "loss": 0.2753, + "step": 2119 + }, + { + "epoch": 0.21971188724220125, + "grad_norm": 0.48955854773521423, + "learning_rate": 3.634368910594846e-05, + "loss": 0.2518, + "step": 2120 + }, + { + "epoch": 0.21981552492486267, + "grad_norm": 0.5582018494606018, + "learning_rate": 3.633981867064476e-05, + "loss": 0.2871, + "step": 2121 + }, + { + "epoch": 0.2199191626075241, + "grad_norm": 0.4639434218406677, + "learning_rate": 3.633594639419571e-05, + "loss": 0.2199, + "step": 2122 + }, + { + "epoch": 0.2200228002901855, + "grad_norm": 0.550144612789154, + "learning_rate": 3.6332072277037596e-05, + "loss": 0.3005, + "step": 2123 + }, + { + "epoch": 0.22012643797284692, + "grad_norm": 0.47822409868240356, + "learning_rate": 3.632819631960697e-05, + "loss": 0.2721, + "step": 2124 + }, + { + "epoch": 0.22023007565550834, + "grad_norm": 0.43804317712783813, + "learning_rate": 3.6324318522340576e-05, + "loss": 0.2284, + "step": 2125 + }, + { + "epoch": 0.22033371333816976, + "grad_norm": 0.4789344072341919, + "learning_rate": 3.632043888567534e-05, + "loss": 0.236, + "step": 2126 + }, + { + "epoch": 0.22043735102083117, + "grad_norm": 0.42324569821357727, + "learning_rate": 3.631655741004842e-05, + "loss": 0.212, + "step": 2127 + }, + { + "epoch": 0.2205409887034926, + "grad_norm": 0.4153405427932739, + "learning_rate": 3.631267409589717e-05, + "loss": 0.1964, + "step": 2128 + }, + { + "epoch": 0.220644626386154, + "grad_norm": 0.44336336851119995, + "learning_rate": 3.6308788943659174e-05, + "loss": 0.2468, + "step": 2129 + }, + { + "epoch": 0.22074826406881543, + "grad_norm": 0.4833187460899353, + "learning_rate": 3.6304901953772185e-05, + "loss": 0.2162, + "step": 2130 + }, + { + "epoch": 0.22085190175147684, + "grad_norm": 0.5347158312797546, + "learning_rate": 3.630101312667419e-05, + "loss": 0.2594, + "step": 2131 + }, + { + "epoch": 0.22095553943413826, + "grad_norm": 0.520388126373291, + "learning_rate": 3.629712246280338e-05, + "loss": 0.2475, + "step": 2132 + }, + { + "epoch": 0.22105917711679968, + "grad_norm": 0.4484923779964447, + "learning_rate": 3.6293229962598144e-05, + "loss": 0.2394, + "step": 2133 + }, + { + "epoch": 0.2211628147994611, + "grad_norm": 0.494242399930954, + "learning_rate": 3.6289335626497085e-05, + "loss": 0.2432, + "step": 2134 + }, + { + "epoch": 0.2212664524821225, + "grad_norm": 0.49932751059532166, + "learning_rate": 3.628543945493901e-05, + "loss": 0.2355, + "step": 2135 + }, + { + "epoch": 0.22137009016478393, + "grad_norm": 0.5083844065666199, + "learning_rate": 3.628154144836293e-05, + "loss": 0.2473, + "step": 2136 + }, + { + "epoch": 0.22147372784744532, + "grad_norm": 0.4479765295982361, + "learning_rate": 3.627764160720807e-05, + "loss": 0.2316, + "step": 2137 + }, + { + "epoch": 0.22157736553010673, + "grad_norm": 0.4882799983024597, + "learning_rate": 3.627373993191386e-05, + "loss": 0.2514, + "step": 2138 + }, + { + "epoch": 0.22168100321276815, + "grad_norm": 0.5669364929199219, + "learning_rate": 3.6269836422919933e-05, + "loss": 0.2423, + "step": 2139 + }, + { + "epoch": 0.22178464089542957, + "grad_norm": 0.5127422213554382, + "learning_rate": 3.6265931080666125e-05, + "loss": 0.2577, + "step": 2140 + }, + { + "epoch": 0.22188827857809099, + "grad_norm": 0.5250160098075867, + "learning_rate": 3.626202390559249e-05, + "loss": 0.263, + "step": 2141 + }, + { + "epoch": 0.2219919162607524, + "grad_norm": 0.4941604435443878, + "learning_rate": 3.625811489813929e-05, + "loss": 0.2477, + "step": 2142 + }, + { + "epoch": 0.22209555394341382, + "grad_norm": 0.4347946345806122, + "learning_rate": 3.6254204058746966e-05, + "loss": 0.2361, + "step": 2143 + }, + { + "epoch": 0.22219919162607524, + "grad_norm": 0.539672315120697, + "learning_rate": 3.62502913878562e-05, + "loss": 0.2687, + "step": 2144 + }, + { + "epoch": 0.22230282930873665, + "grad_norm": 0.4801972508430481, + "learning_rate": 3.624637688590787e-05, + "loss": 0.2288, + "step": 2145 + }, + { + "epoch": 0.22240646699139807, + "grad_norm": 0.4568727910518646, + "learning_rate": 3.624246055334304e-05, + "loss": 0.2154, + "step": 2146 + }, + { + "epoch": 0.2225101046740595, + "grad_norm": 0.49095943570137024, + "learning_rate": 3.6238542390603006e-05, + "loss": 0.2024, + "step": 2147 + }, + { + "epoch": 0.2226137423567209, + "grad_norm": 0.49304577708244324, + "learning_rate": 3.623462239812925e-05, + "loss": 0.2166, + "step": 2148 + }, + { + "epoch": 0.22271738003938232, + "grad_norm": 0.5479660630226135, + "learning_rate": 3.623070057636349e-05, + "loss": 0.251, + "step": 2149 + }, + { + "epoch": 0.22282101772204374, + "grad_norm": 0.4642309248447418, + "learning_rate": 3.6226776925747615e-05, + "loss": 0.24, + "step": 2150 + }, + { + "epoch": 0.22292465540470516, + "grad_norm": 0.44070860743522644, + "learning_rate": 3.622285144672375e-05, + "loss": 0.2082, + "step": 2151 + }, + { + "epoch": 0.22302829308736657, + "grad_norm": 0.519973635673523, + "learning_rate": 3.62189241397342e-05, + "loss": 0.268, + "step": 2152 + }, + { + "epoch": 0.223131930770028, + "grad_norm": 0.5418823957443237, + "learning_rate": 3.621499500522149e-05, + "loss": 0.2498, + "step": 2153 + }, + { + "epoch": 0.2232355684526894, + "grad_norm": 0.5058969259262085, + "learning_rate": 3.621106404362834e-05, + "loss": 0.2339, + "step": 2154 + }, + { + "epoch": 0.22333920613535083, + "grad_norm": 0.4511276185512543, + "learning_rate": 3.6207131255397705e-05, + "loss": 0.1953, + "step": 2155 + }, + { + "epoch": 0.22344284381801222, + "grad_norm": 0.5690330862998962, + "learning_rate": 3.62031966409727e-05, + "loss": 0.2619, + "step": 2156 + }, + { + "epoch": 0.22354648150067363, + "grad_norm": 0.46670278906822205, + "learning_rate": 3.6199260200796704e-05, + "loss": 0.2208, + "step": 2157 + }, + { + "epoch": 0.22365011918333505, + "grad_norm": 0.5800625085830688, + "learning_rate": 3.619532193531324e-05, + "loss": 0.2527, + "step": 2158 + }, + { + "epoch": 0.22375375686599647, + "grad_norm": 0.5150881409645081, + "learning_rate": 3.619138184496608e-05, + "loss": 0.2255, + "step": 2159 + }, + { + "epoch": 0.22385739454865788, + "grad_norm": 0.5035178661346436, + "learning_rate": 3.618743993019919e-05, + "loss": 0.2525, + "step": 2160 + }, + { + "epoch": 0.2239610322313193, + "grad_norm": 0.5059001445770264, + "learning_rate": 3.618349619145672e-05, + "loss": 0.2804, + "step": 2161 + }, + { + "epoch": 0.22406466991398072, + "grad_norm": 0.43283551931381226, + "learning_rate": 3.6179550629183065e-05, + "loss": 0.2177, + "step": 2162 + }, + { + "epoch": 0.22416830759664214, + "grad_norm": 0.46817097067832947, + "learning_rate": 3.617560324382279e-05, + "loss": 0.233, + "step": 2163 + }, + { + "epoch": 0.22427194527930355, + "grad_norm": 0.506182074546814, + "learning_rate": 3.61716540358207e-05, + "loss": 0.2184, + "step": 2164 + }, + { + "epoch": 0.22437558296196497, + "grad_norm": 0.3902835249900818, + "learning_rate": 3.6167703005621755e-05, + "loss": 0.1976, + "step": 2165 + }, + { + "epoch": 0.2244792206446264, + "grad_norm": 0.4224104881286621, + "learning_rate": 3.6163750153671175e-05, + "loss": 0.198, + "step": 2166 + }, + { + "epoch": 0.2245828583272878, + "grad_norm": 0.5203019380569458, + "learning_rate": 3.615979548041436e-05, + "loss": 0.2684, + "step": 2167 + }, + { + "epoch": 0.22468649600994922, + "grad_norm": 0.41954994201660156, + "learning_rate": 3.615583898629691e-05, + "loss": 0.1831, + "step": 2168 + }, + { + "epoch": 0.22479013369261064, + "grad_norm": 0.4721447825431824, + "learning_rate": 3.615188067176464e-05, + "loss": 0.2176, + "step": 2169 + }, + { + "epoch": 0.22489377137527206, + "grad_norm": 0.44744497537612915, + "learning_rate": 3.6147920537263554e-05, + "loss": 0.2144, + "step": 2170 + }, + { + "epoch": 0.22499740905793347, + "grad_norm": 0.5009974241256714, + "learning_rate": 3.6143958583239894e-05, + "loss": 0.2345, + "step": 2171 + }, + { + "epoch": 0.2251010467405949, + "grad_norm": 0.48787763714790344, + "learning_rate": 3.6139994810140075e-05, + "loss": 0.2087, + "step": 2172 + }, + { + "epoch": 0.2252046844232563, + "grad_norm": 0.5042424201965332, + "learning_rate": 3.6136029218410725e-05, + "loss": 0.2421, + "step": 2173 + }, + { + "epoch": 0.22530832210591772, + "grad_norm": 0.4651280641555786, + "learning_rate": 3.6132061808498694e-05, + "loss": 0.2095, + "step": 2174 + }, + { + "epoch": 0.2254119597885791, + "grad_norm": 0.47553327679634094, + "learning_rate": 3.6128092580851016e-05, + "loss": 0.2376, + "step": 2175 + }, + { + "epoch": 0.22551559747124053, + "grad_norm": 0.5473289489746094, + "learning_rate": 3.612412153591493e-05, + "loss": 0.2601, + "step": 2176 + }, + { + "epoch": 0.22561923515390195, + "grad_norm": 0.5122581124305725, + "learning_rate": 3.61201486741379e-05, + "loss": 0.2436, + "step": 2177 + }, + { + "epoch": 0.22572287283656337, + "grad_norm": 0.5065480470657349, + "learning_rate": 3.6116173995967575e-05, + "loss": 0.2909, + "step": 2178 + }, + { + "epoch": 0.22582651051922478, + "grad_norm": 0.42552104592323303, + "learning_rate": 3.611219750185182e-05, + "loss": 0.2091, + "step": 2179 + }, + { + "epoch": 0.2259301482018862, + "grad_norm": 0.49925532937049866, + "learning_rate": 3.61082191922387e-05, + "loss": 0.2441, + "step": 2180 + }, + { + "epoch": 0.22603378588454762, + "grad_norm": 0.5123841166496277, + "learning_rate": 3.610423906757648e-05, + "loss": 0.2403, + "step": 2181 + }, + { + "epoch": 0.22613742356720903, + "grad_norm": 0.5170781016349792, + "learning_rate": 3.610025712831363e-05, + "loss": 0.2609, + "step": 2182 + }, + { + "epoch": 0.22624106124987045, + "grad_norm": 0.49091219902038574, + "learning_rate": 3.609627337489884e-05, + "loss": 0.2492, + "step": 2183 + }, + { + "epoch": 0.22634469893253187, + "grad_norm": 0.5327954292297363, + "learning_rate": 3.609228780778099e-05, + "loss": 0.3047, + "step": 2184 + }, + { + "epoch": 0.22644833661519329, + "grad_norm": 0.5308407545089722, + "learning_rate": 3.6088300427409164e-05, + "loss": 0.2458, + "step": 2185 + }, + { + "epoch": 0.2265519742978547, + "grad_norm": 0.44684699177742004, + "learning_rate": 3.608431123423265e-05, + "loss": 0.1978, + "step": 2186 + }, + { + "epoch": 0.22665561198051612, + "grad_norm": 0.4521043300628662, + "learning_rate": 3.608032022870096e-05, + "loss": 0.2171, + "step": 2187 + }, + { + "epoch": 0.22675924966317754, + "grad_norm": 0.4902312755584717, + "learning_rate": 3.607632741126378e-05, + "loss": 0.224, + "step": 2188 + }, + { + "epoch": 0.22686288734583895, + "grad_norm": 0.4611247479915619, + "learning_rate": 3.607233278237102e-05, + "loss": 0.2333, + "step": 2189 + }, + { + "epoch": 0.22696652502850037, + "grad_norm": 0.5038001537322998, + "learning_rate": 3.606833634247278e-05, + "loss": 0.2534, + "step": 2190 + }, + { + "epoch": 0.2270701627111618, + "grad_norm": 0.489808589220047, + "learning_rate": 3.6064338092019386e-05, + "loss": 0.246, + "step": 2191 + }, + { + "epoch": 0.2271738003938232, + "grad_norm": 0.4640105962753296, + "learning_rate": 3.6060338031461346e-05, + "loss": 0.2347, + "step": 2192 + }, + { + "epoch": 0.22727743807648462, + "grad_norm": 0.5169340372085571, + "learning_rate": 3.605633616124938e-05, + "loss": 0.2775, + "step": 2193 + }, + { + "epoch": 0.227381075759146, + "grad_norm": 0.38409602642059326, + "learning_rate": 3.605233248183442e-05, + "loss": 0.1974, + "step": 2194 + }, + { + "epoch": 0.22748471344180743, + "grad_norm": 0.48459893465042114, + "learning_rate": 3.604832699366759e-05, + "loss": 0.2115, + "step": 2195 + }, + { + "epoch": 0.22758835112446885, + "grad_norm": 0.4470805525779724, + "learning_rate": 3.604431969720022e-05, + "loss": 0.2337, + "step": 2196 + }, + { + "epoch": 0.22769198880713026, + "grad_norm": 0.4740588963031769, + "learning_rate": 3.604031059288385e-05, + "loss": 0.2527, + "step": 2197 + }, + { + "epoch": 0.22779562648979168, + "grad_norm": 0.4325917363166809, + "learning_rate": 3.603629968117021e-05, + "loss": 0.2073, + "step": 2198 + }, + { + "epoch": 0.2278992641724531, + "grad_norm": 0.49139603972435, + "learning_rate": 3.603228696251126e-05, + "loss": 0.2524, + "step": 2199 + }, + { + "epoch": 0.22800290185511451, + "grad_norm": 0.39995265007019043, + "learning_rate": 3.602827243735913e-05, + "loss": 0.1868, + "step": 2200 + }, + { + "epoch": 0.22810653953777593, + "grad_norm": 0.4172254204750061, + "learning_rate": 3.6024256106166194e-05, + "loss": 0.2258, + "step": 2201 + }, + { + "epoch": 0.22821017722043735, + "grad_norm": 0.4221954643726349, + "learning_rate": 3.602023796938497e-05, + "loss": 0.2019, + "step": 2202 + }, + { + "epoch": 0.22831381490309877, + "grad_norm": 0.4521850347518921, + "learning_rate": 3.601621802746825e-05, + "loss": 0.2284, + "step": 2203 + }, + { + "epoch": 0.22841745258576018, + "grad_norm": 0.42566412687301636, + "learning_rate": 3.601219628086897e-05, + "loss": 0.1948, + "step": 2204 + }, + { + "epoch": 0.2285210902684216, + "grad_norm": 0.4874782860279083, + "learning_rate": 3.600817273004031e-05, + "loss": 0.2305, + "step": 2205 + }, + { + "epoch": 0.22862472795108302, + "grad_norm": 0.48723307251930237, + "learning_rate": 3.600414737543563e-05, + "loss": 0.2804, + "step": 2206 + }, + { + "epoch": 0.22872836563374443, + "grad_norm": 0.5111587643623352, + "learning_rate": 3.600012021750851e-05, + "loss": 0.234, + "step": 2207 + }, + { + "epoch": 0.22883200331640585, + "grad_norm": 0.42869290709495544, + "learning_rate": 3.599609125671271e-05, + "loss": 0.1748, + "step": 2208 + }, + { + "epoch": 0.22893564099906727, + "grad_norm": 0.5120351314544678, + "learning_rate": 3.599206049350222e-05, + "loss": 0.2618, + "step": 2209 + }, + { + "epoch": 0.2290392786817287, + "grad_norm": 0.4712057411670685, + "learning_rate": 3.59880279283312e-05, + "loss": 0.2117, + "step": 2210 + }, + { + "epoch": 0.2291429163643901, + "grad_norm": 0.4707184433937073, + "learning_rate": 3.5983993561654056e-05, + "loss": 0.2593, + "step": 2211 + }, + { + "epoch": 0.22924655404705152, + "grad_norm": 0.5554648041725159, + "learning_rate": 3.597995739392536e-05, + "loss": 0.2521, + "step": 2212 + }, + { + "epoch": 0.2293501917297129, + "grad_norm": 0.5168809294700623, + "learning_rate": 3.597591942559991e-05, + "loss": 0.2678, + "step": 2213 + }, + { + "epoch": 0.22945382941237433, + "grad_norm": 0.453225314617157, + "learning_rate": 3.59718796571327e-05, + "loss": 0.2176, + "step": 2214 + }, + { + "epoch": 0.22955746709503574, + "grad_norm": 0.4564521610736847, + "learning_rate": 3.596783808897891e-05, + "loss": 0.1954, + "step": 2215 + }, + { + "epoch": 0.22966110477769716, + "grad_norm": 0.4787885844707489, + "learning_rate": 3.5963794721593954e-05, + "loss": 0.2448, + "step": 2216 + }, + { + "epoch": 0.22976474246035858, + "grad_norm": 0.461749404668808, + "learning_rate": 3.595974955543341e-05, + "loss": 0.2387, + "step": 2217 + }, + { + "epoch": 0.22986838014302, + "grad_norm": 0.4586969017982483, + "learning_rate": 3.595570259095311e-05, + "loss": 0.24, + "step": 2218 + }, + { + "epoch": 0.2299720178256814, + "grad_norm": 0.4212735891342163, + "learning_rate": 3.595165382860905e-05, + "loss": 0.2298, + "step": 2219 + }, + { + "epoch": 0.23007565550834283, + "grad_norm": 0.4474669396877289, + "learning_rate": 3.594760326885742e-05, + "loss": 0.1942, + "step": 2220 + }, + { + "epoch": 0.23017929319100425, + "grad_norm": 0.467050701379776, + "learning_rate": 3.594355091215465e-05, + "loss": 0.2101, + "step": 2221 + }, + { + "epoch": 0.23028293087366566, + "grad_norm": 0.511583685874939, + "learning_rate": 3.593949675895735e-05, + "loss": 0.2294, + "step": 2222 + }, + { + "epoch": 0.23038656855632708, + "grad_norm": 0.48996490240097046, + "learning_rate": 3.5935440809722336e-05, + "loss": 0.2568, + "step": 2223 + }, + { + "epoch": 0.2304902062389885, + "grad_norm": 0.43239009380340576, + "learning_rate": 3.5931383064906617e-05, + "loss": 0.199, + "step": 2224 + }, + { + "epoch": 0.23059384392164992, + "grad_norm": 0.565646767616272, + "learning_rate": 3.5927323524967426e-05, + "loss": 0.2698, + "step": 2225 + }, + { + "epoch": 0.23069748160431133, + "grad_norm": 0.4224366843700409, + "learning_rate": 3.5923262190362176e-05, + "loss": 0.2195, + "step": 2226 + }, + { + "epoch": 0.23080111928697275, + "grad_norm": 0.44575369358062744, + "learning_rate": 3.5919199061548494e-05, + "loss": 0.2244, + "step": 2227 + }, + { + "epoch": 0.23090475696963417, + "grad_norm": 0.39572209119796753, + "learning_rate": 3.591513413898421e-05, + "loss": 0.2164, + "step": 2228 + }, + { + "epoch": 0.23100839465229558, + "grad_norm": 0.5055858492851257, + "learning_rate": 3.591106742312736e-05, + "loss": 0.2646, + "step": 2229 + }, + { + "epoch": 0.231112032334957, + "grad_norm": 0.4088776707649231, + "learning_rate": 3.590699891443616e-05, + "loss": 0.1873, + "step": 2230 + }, + { + "epoch": 0.23121567001761842, + "grad_norm": 0.4450381100177765, + "learning_rate": 3.590292861336905e-05, + "loss": 0.2417, + "step": 2231 + }, + { + "epoch": 0.2313193077002798, + "grad_norm": 0.4661766588687897, + "learning_rate": 3.589885652038466e-05, + "loss": 0.2358, + "step": 2232 + }, + { + "epoch": 0.23142294538294123, + "grad_norm": 0.5066637992858887, + "learning_rate": 3.5894782635941845e-05, + "loss": 0.2599, + "step": 2233 + }, + { + "epoch": 0.23152658306560264, + "grad_norm": 0.47971197962760925, + "learning_rate": 3.5890706960499626e-05, + "loss": 0.2382, + "step": 2234 + }, + { + "epoch": 0.23163022074826406, + "grad_norm": 0.5027540326118469, + "learning_rate": 3.588662949451724e-05, + "loss": 0.2304, + "step": 2235 + }, + { + "epoch": 0.23173385843092548, + "grad_norm": 0.5136678814888, + "learning_rate": 3.588255023845415e-05, + "loss": 0.2408, + "step": 2236 + }, + { + "epoch": 0.2318374961135869, + "grad_norm": 0.44080325961112976, + "learning_rate": 3.587846919276999e-05, + "loss": 0.239, + "step": 2237 + }, + { + "epoch": 0.2319411337962483, + "grad_norm": 0.4627954959869385, + "learning_rate": 3.58743863579246e-05, + "loss": 0.2559, + "step": 2238 + }, + { + "epoch": 0.23204477147890973, + "grad_norm": 0.46266400814056396, + "learning_rate": 3.587030173437803e-05, + "loss": 0.2562, + "step": 2239 + }, + { + "epoch": 0.23214840916157115, + "grad_norm": 0.4966365694999695, + "learning_rate": 3.586621532259053e-05, + "loss": 0.2602, + "step": 2240 + }, + { + "epoch": 0.23225204684423256, + "grad_norm": 0.3625810146331787, + "learning_rate": 3.586212712302256e-05, + "loss": 0.1887, + "step": 2241 + }, + { + "epoch": 0.23235568452689398, + "grad_norm": 0.47931361198425293, + "learning_rate": 3.5858037136134765e-05, + "loss": 0.1949, + "step": 2242 + }, + { + "epoch": 0.2324593222095554, + "grad_norm": 0.4693504571914673, + "learning_rate": 3.585394536238799e-05, + "loss": 0.2414, + "step": 2243 + }, + { + "epoch": 0.23256295989221681, + "grad_norm": 0.46216142177581787, + "learning_rate": 3.58498518022433e-05, + "loss": 0.219, + "step": 2244 + }, + { + "epoch": 0.23266659757487823, + "grad_norm": 0.5020700693130493, + "learning_rate": 3.5845756456161945e-05, + "loss": 0.245, + "step": 2245 + }, + { + "epoch": 0.23277023525753965, + "grad_norm": 0.4642801284790039, + "learning_rate": 3.584165932460539e-05, + "loss": 0.2375, + "step": 2246 + }, + { + "epoch": 0.23287387294020107, + "grad_norm": 0.48018375039100647, + "learning_rate": 3.583756040803529e-05, + "loss": 0.2736, + "step": 2247 + }, + { + "epoch": 0.23297751062286248, + "grad_norm": 0.5564702153205872, + "learning_rate": 3.5833459706913494e-05, + "loss": 0.246, + "step": 2248 + }, + { + "epoch": 0.2330811483055239, + "grad_norm": 0.45636841654777527, + "learning_rate": 3.582935722170208e-05, + "loss": 0.2145, + "step": 2249 + }, + { + "epoch": 0.23318478598818532, + "grad_norm": 0.4764099717140198, + "learning_rate": 3.5825252952863296e-05, + "loss": 0.2133, + "step": 2250 + }, + { + "epoch": 0.2332884236708467, + "grad_norm": 0.44867387413978577, + "learning_rate": 3.5821146900859615e-05, + "loss": 0.2396, + "step": 2251 + }, + { + "epoch": 0.23339206135350812, + "grad_norm": 0.43731316924095154, + "learning_rate": 3.58170390661537e-05, + "loss": 0.1834, + "step": 2252 + }, + { + "epoch": 0.23349569903616954, + "grad_norm": 0.4410570561885834, + "learning_rate": 3.58129294492084e-05, + "loss": 0.2443, + "step": 2253 + }, + { + "epoch": 0.23359933671883096, + "grad_norm": 0.4620094299316406, + "learning_rate": 3.580881805048679e-05, + "loss": 0.2463, + "step": 2254 + }, + { + "epoch": 0.23370297440149237, + "grad_norm": 0.49641790986061096, + "learning_rate": 3.580470487045215e-05, + "loss": 0.209, + "step": 2255 + }, + { + "epoch": 0.2338066120841538, + "grad_norm": 0.5519564151763916, + "learning_rate": 3.580058990956793e-05, + "loss": 0.2769, + "step": 2256 + }, + { + "epoch": 0.2339102497668152, + "grad_norm": 0.46755626797676086, + "learning_rate": 3.57964731682978e-05, + "loss": 0.2131, + "step": 2257 + }, + { + "epoch": 0.23401388744947663, + "grad_norm": 0.482364296913147, + "learning_rate": 3.579235464710563e-05, + "loss": 0.2219, + "step": 2258 + }, + { + "epoch": 0.23411752513213804, + "grad_norm": 0.44284772872924805, + "learning_rate": 3.578823434645548e-05, + "loss": 0.1814, + "step": 2259 + }, + { + "epoch": 0.23422116281479946, + "grad_norm": 0.5211065411567688, + "learning_rate": 3.578411226681164e-05, + "loss": 0.2518, + "step": 2260 + }, + { + "epoch": 0.23432480049746088, + "grad_norm": 0.4506303369998932, + "learning_rate": 3.577998840863856e-05, + "loss": 0.2337, + "step": 2261 + }, + { + "epoch": 0.2344284381801223, + "grad_norm": 0.4736618101596832, + "learning_rate": 3.5775862772400915e-05, + "loss": 0.2252, + "step": 2262 + }, + { + "epoch": 0.2345320758627837, + "grad_norm": 0.5009260177612305, + "learning_rate": 3.577173535856358e-05, + "loss": 0.2371, + "step": 2263 + }, + { + "epoch": 0.23463571354544513, + "grad_norm": 0.3928007185459137, + "learning_rate": 3.576760616759162e-05, + "loss": 0.1764, + "step": 2264 + }, + { + "epoch": 0.23473935122810655, + "grad_norm": 0.39292821288108826, + "learning_rate": 3.576347519995031e-05, + "loss": 0.1968, + "step": 2265 + }, + { + "epoch": 0.23484298891076796, + "grad_norm": 0.45694684982299805, + "learning_rate": 3.575934245610512e-05, + "loss": 0.2036, + "step": 2266 + }, + { + "epoch": 0.23494662659342938, + "grad_norm": 0.5031452775001526, + "learning_rate": 3.575520793652172e-05, + "loss": 0.2422, + "step": 2267 + }, + { + "epoch": 0.2350502642760908, + "grad_norm": 0.4028596878051758, + "learning_rate": 3.575107164166598e-05, + "loss": 0.1861, + "step": 2268 + }, + { + "epoch": 0.23515390195875222, + "grad_norm": 0.4759109914302826, + "learning_rate": 3.574693357200398e-05, + "loss": 0.2187, + "step": 2269 + }, + { + "epoch": 0.2352575396414136, + "grad_norm": 0.4903968274593353, + "learning_rate": 3.574279372800197e-05, + "loss": 0.229, + "step": 2270 + }, + { + "epoch": 0.23536117732407502, + "grad_norm": 0.5485450625419617, + "learning_rate": 3.5738652110126446e-05, + "loss": 0.2908, + "step": 2271 + }, + { + "epoch": 0.23546481500673644, + "grad_norm": 0.43799784779548645, + "learning_rate": 3.573450871884407e-05, + "loss": 0.2087, + "step": 2272 + }, + { + "epoch": 0.23556845268939786, + "grad_norm": 0.43799030780792236, + "learning_rate": 3.57303635546217e-05, + "loss": 0.195, + "step": 2273 + }, + { + "epoch": 0.23567209037205927, + "grad_norm": 0.4460390508174896, + "learning_rate": 3.572621661792643e-05, + "loss": 0.2226, + "step": 2274 + }, + { + "epoch": 0.2357757280547207, + "grad_norm": 0.5236432552337646, + "learning_rate": 3.572206790922551e-05, + "loss": 0.2997, + "step": 2275 + }, + { + "epoch": 0.2358793657373821, + "grad_norm": 0.42137596011161804, + "learning_rate": 3.571791742898642e-05, + "loss": 0.2122, + "step": 2276 + }, + { + "epoch": 0.23598300342004352, + "grad_norm": 0.5710049867630005, + "learning_rate": 3.571376517767683e-05, + "loss": 0.2646, + "step": 2277 + }, + { + "epoch": 0.23608664110270494, + "grad_norm": 0.49377962946891785, + "learning_rate": 3.5709611155764613e-05, + "loss": 0.2466, + "step": 2278 + }, + { + "epoch": 0.23619027878536636, + "grad_norm": 0.4961196184158325, + "learning_rate": 3.570545536371783e-05, + "loss": 0.2743, + "step": 2279 + }, + { + "epoch": 0.23629391646802778, + "grad_norm": 0.46566084027290344, + "learning_rate": 3.570129780200474e-05, + "loss": 0.194, + "step": 2280 + }, + { + "epoch": 0.2363975541506892, + "grad_norm": 0.5308505892753601, + "learning_rate": 3.569713847109383e-05, + "loss": 0.2664, + "step": 2281 + }, + { + "epoch": 0.2365011918333506, + "grad_norm": 0.4228411614894867, + "learning_rate": 3.569297737145376e-05, + "loss": 0.1951, + "step": 2282 + }, + { + "epoch": 0.23660482951601203, + "grad_norm": 0.44380027055740356, + "learning_rate": 3.568881450355339e-05, + "loss": 0.2404, + "step": 2283 + }, + { + "epoch": 0.23670846719867344, + "grad_norm": 0.4705795347690582, + "learning_rate": 3.56846498678618e-05, + "loss": 0.2116, + "step": 2284 + }, + { + "epoch": 0.23681210488133486, + "grad_norm": 0.4383763372898102, + "learning_rate": 3.568048346484824e-05, + "loss": 0.2554, + "step": 2285 + }, + { + "epoch": 0.23691574256399628, + "grad_norm": 0.47628912329673767, + "learning_rate": 3.5676315294982175e-05, + "loss": 0.2251, + "step": 2286 + }, + { + "epoch": 0.2370193802466577, + "grad_norm": 0.4527871608734131, + "learning_rate": 3.567214535873327e-05, + "loss": 0.2059, + "step": 2287 + }, + { + "epoch": 0.2371230179293191, + "grad_norm": 0.43125396966934204, + "learning_rate": 3.56679736565714e-05, + "loss": 0.1852, + "step": 2288 + }, + { + "epoch": 0.2372266556119805, + "grad_norm": 0.5477954149246216, + "learning_rate": 3.56638001889666e-05, + "loss": 0.2607, + "step": 2289 + }, + { + "epoch": 0.23733029329464192, + "grad_norm": 0.483190655708313, + "learning_rate": 3.5659624956389155e-05, + "loss": 0.2338, + "step": 2290 + }, + { + "epoch": 0.23743393097730334, + "grad_norm": 0.4622649848461151, + "learning_rate": 3.5655447959309515e-05, + "loss": 0.2365, + "step": 2291 + }, + { + "epoch": 0.23753756865996475, + "grad_norm": 0.506360650062561, + "learning_rate": 3.5651269198198334e-05, + "loss": 0.2201, + "step": 2292 + }, + { + "epoch": 0.23764120634262617, + "grad_norm": 0.6054961681365967, + "learning_rate": 3.564708867352646e-05, + "loss": 0.2303, + "step": 2293 + }, + { + "epoch": 0.2377448440252876, + "grad_norm": 0.4164169728755951, + "learning_rate": 3.564290638576497e-05, + "loss": 0.1778, + "step": 2294 + }, + { + "epoch": 0.237848481707949, + "grad_norm": 0.4977959990501404, + "learning_rate": 3.56387223353851e-05, + "loss": 0.236, + "step": 2295 + }, + { + "epoch": 0.23795211939061042, + "grad_norm": 0.3724239468574524, + "learning_rate": 3.563453652285831e-05, + "loss": 0.1979, + "step": 2296 + }, + { + "epoch": 0.23805575707327184, + "grad_norm": 0.42812469601631165, + "learning_rate": 3.563034894865625e-05, + "loss": 0.203, + "step": 2297 + }, + { + "epoch": 0.23815939475593326, + "grad_norm": 0.49317336082458496, + "learning_rate": 3.5626159613250765e-05, + "loss": 0.2338, + "step": 2298 + }, + { + "epoch": 0.23826303243859467, + "grad_norm": 0.49684178829193115, + "learning_rate": 3.562196851711391e-05, + "loss": 0.236, + "step": 2299 + }, + { + "epoch": 0.2383666701212561, + "grad_norm": 0.4792449176311493, + "learning_rate": 3.561777566071793e-05, + "loss": 0.2391, + "step": 2300 + }, + { + "epoch": 0.2384703078039175, + "grad_norm": 0.47309496998786926, + "learning_rate": 3.5613581044535266e-05, + "loss": 0.2172, + "step": 2301 + }, + { + "epoch": 0.23857394548657893, + "grad_norm": 0.618942141532898, + "learning_rate": 3.5609384669038556e-05, + "loss": 0.2557, + "step": 2302 + }, + { + "epoch": 0.23867758316924034, + "grad_norm": 0.3872990310192108, + "learning_rate": 3.5605186534700645e-05, + "loss": 0.177, + "step": 2303 + }, + { + "epoch": 0.23878122085190176, + "grad_norm": 0.4337783753871918, + "learning_rate": 3.560098664199458e-05, + "loss": 0.1974, + "step": 2304 + }, + { + "epoch": 0.23888485853456318, + "grad_norm": 0.4320196807384491, + "learning_rate": 3.5596784991393586e-05, + "loss": 0.1936, + "step": 2305 + }, + { + "epoch": 0.2389884962172246, + "grad_norm": 0.49404609203338623, + "learning_rate": 3.559258158337112e-05, + "loss": 0.201, + "step": 2306 + }, + { + "epoch": 0.239092133899886, + "grad_norm": 0.5494398474693298, + "learning_rate": 3.558837641840078e-05, + "loss": 0.2736, + "step": 2307 + }, + { + "epoch": 0.2391957715825474, + "grad_norm": 0.5490972399711609, + "learning_rate": 3.558416949695643e-05, + "loss": 0.2663, + "step": 2308 + }, + { + "epoch": 0.23929940926520882, + "grad_norm": 0.45828548073768616, + "learning_rate": 3.5579960819512087e-05, + "loss": 0.2081, + "step": 2309 + }, + { + "epoch": 0.23940304694787023, + "grad_norm": 0.402240127325058, + "learning_rate": 3.557575038654198e-05, + "loss": 0.1676, + "step": 2310 + }, + { + "epoch": 0.23950668463053165, + "grad_norm": 0.5030764937400818, + "learning_rate": 3.557153819852052e-05, + "loss": 0.2264, + "step": 2311 + }, + { + "epoch": 0.23961032231319307, + "grad_norm": 0.5498928427696228, + "learning_rate": 3.556732425592235e-05, + "loss": 0.2523, + "step": 2312 + }, + { + "epoch": 0.2397139599958545, + "grad_norm": 0.45575132966041565, + "learning_rate": 3.5563108559222285e-05, + "loss": 0.2067, + "step": 2313 + }, + { + "epoch": 0.2398175976785159, + "grad_norm": 0.5032340288162231, + "learning_rate": 3.555889110889534e-05, + "loss": 0.199, + "step": 2314 + }, + { + "epoch": 0.23992123536117732, + "grad_norm": 0.4415561258792877, + "learning_rate": 3.5554671905416734e-05, + "loss": 0.2416, + "step": 2315 + }, + { + "epoch": 0.24002487304383874, + "grad_norm": 0.5081425905227661, + "learning_rate": 3.555045094926187e-05, + "loss": 0.2588, + "step": 2316 + }, + { + "epoch": 0.24012851072650016, + "grad_norm": 0.49607375264167786, + "learning_rate": 3.5546228240906374e-05, + "loss": 0.2602, + "step": 2317 + }, + { + "epoch": 0.24023214840916157, + "grad_norm": 0.46513596177101135, + "learning_rate": 3.554200378082604e-05, + "loss": 0.2349, + "step": 2318 + }, + { + "epoch": 0.240335786091823, + "grad_norm": 0.5502655506134033, + "learning_rate": 3.553777756949689e-05, + "loss": 0.2683, + "step": 2319 + }, + { + "epoch": 0.2404394237744844, + "grad_norm": 0.4852595329284668, + "learning_rate": 3.553354960739511e-05, + "loss": 0.2198, + "step": 2320 + }, + { + "epoch": 0.24054306145714582, + "grad_norm": 0.4226512908935547, + "learning_rate": 3.552931989499711e-05, + "loss": 0.2103, + "step": 2321 + }, + { + "epoch": 0.24064669913980724, + "grad_norm": 0.435518354177475, + "learning_rate": 3.552508843277949e-05, + "loss": 0.2134, + "step": 2322 + }, + { + "epoch": 0.24075033682246866, + "grad_norm": 0.4544883072376251, + "learning_rate": 3.552085522121903e-05, + "loss": 0.2135, + "step": 2323 + }, + { + "epoch": 0.24085397450513008, + "grad_norm": 0.4653942883014679, + "learning_rate": 3.5516620260792736e-05, + "loss": 0.2045, + "step": 2324 + }, + { + "epoch": 0.2409576121877915, + "grad_norm": 0.5308933854103088, + "learning_rate": 3.551238355197779e-05, + "loss": 0.2572, + "step": 2325 + }, + { + "epoch": 0.2410612498704529, + "grad_norm": 0.4636653661727905, + "learning_rate": 3.550814509525158e-05, + "loss": 0.2315, + "step": 2326 + }, + { + "epoch": 0.2411648875531143, + "grad_norm": 0.4664221704006195, + "learning_rate": 3.550390489109169e-05, + "loss": 0.242, + "step": 2327 + }, + { + "epoch": 0.24126852523577572, + "grad_norm": 0.4433833360671997, + "learning_rate": 3.54996629399759e-05, + "loss": 0.2044, + "step": 2328 + }, + { + "epoch": 0.24137216291843713, + "grad_norm": 0.47471415996551514, + "learning_rate": 3.549541924238218e-05, + "loss": 0.2377, + "step": 2329 + }, + { + "epoch": 0.24147580060109855, + "grad_norm": 0.47291579842567444, + "learning_rate": 3.549117379878872e-05, + "loss": 0.2312, + "step": 2330 + }, + { + "epoch": 0.24157943828375997, + "grad_norm": 0.5618043541908264, + "learning_rate": 3.5486926609673856e-05, + "loss": 0.2786, + "step": 2331 + }, + { + "epoch": 0.24168307596642138, + "grad_norm": 0.5181592106819153, + "learning_rate": 3.54826776755162e-05, + "loss": 0.224, + "step": 2332 + }, + { + "epoch": 0.2417867136490828, + "grad_norm": 0.4476979672908783, + "learning_rate": 3.547842699679447e-05, + "loss": 0.227, + "step": 2333 + }, + { + "epoch": 0.24189035133174422, + "grad_norm": 0.49343031644821167, + "learning_rate": 3.5474174573987664e-05, + "loss": 0.236, + "step": 2334 + }, + { + "epoch": 0.24199398901440564, + "grad_norm": 0.5434033274650574, + "learning_rate": 3.546992040757492e-05, + "loss": 0.2521, + "step": 2335 + }, + { + "epoch": 0.24209762669706705, + "grad_norm": 0.5006096959114075, + "learning_rate": 3.546566449803558e-05, + "loss": 0.2538, + "step": 2336 + }, + { + "epoch": 0.24220126437972847, + "grad_norm": 0.48902684450149536, + "learning_rate": 3.5461406845849224e-05, + "loss": 0.2282, + "step": 2337 + }, + { + "epoch": 0.2423049020623899, + "grad_norm": 0.5164515972137451, + "learning_rate": 3.545714745149557e-05, + "loss": 0.2763, + "step": 2338 + }, + { + "epoch": 0.2424085397450513, + "grad_norm": 0.5709071755409241, + "learning_rate": 3.5452886315454574e-05, + "loss": 0.2716, + "step": 2339 + }, + { + "epoch": 0.24251217742771272, + "grad_norm": 0.5185917019844055, + "learning_rate": 3.5448623438206364e-05, + "loss": 0.2517, + "step": 2340 + }, + { + "epoch": 0.24261581511037414, + "grad_norm": 0.48523247241973877, + "learning_rate": 3.544435882023129e-05, + "loss": 0.2657, + "step": 2341 + }, + { + "epoch": 0.24271945279303556, + "grad_norm": 0.4242948889732361, + "learning_rate": 3.544009246200986e-05, + "loss": 0.2054, + "step": 2342 + }, + { + "epoch": 0.24282309047569697, + "grad_norm": 0.47015151381492615, + "learning_rate": 3.543582436402283e-05, + "loss": 0.2423, + "step": 2343 + }, + { + "epoch": 0.2429267281583584, + "grad_norm": 0.4143276810646057, + "learning_rate": 3.543155452675109e-05, + "loss": 0.1662, + "step": 2344 + }, + { + "epoch": 0.2430303658410198, + "grad_norm": 0.4106583893299103, + "learning_rate": 3.542728295067578e-05, + "loss": 0.2155, + "step": 2345 + }, + { + "epoch": 0.2431340035236812, + "grad_norm": 0.4531787037849426, + "learning_rate": 3.542300963627821e-05, + "loss": 0.2699, + "step": 2346 + }, + { + "epoch": 0.24323764120634261, + "grad_norm": 0.39408352971076965, + "learning_rate": 3.541873458403989e-05, + "loss": 0.1848, + "step": 2347 + }, + { + "epoch": 0.24334127888900403, + "grad_norm": 0.46209919452667236, + "learning_rate": 3.541445779444252e-05, + "loss": 0.2232, + "step": 2348 + }, + { + "epoch": 0.24344491657166545, + "grad_norm": 0.46624454855918884, + "learning_rate": 3.541017926796802e-05, + "loss": 0.2159, + "step": 2349 + }, + { + "epoch": 0.24354855425432687, + "grad_norm": 0.5353434681892395, + "learning_rate": 3.540589900509847e-05, + "loss": 0.2553, + "step": 2350 + }, + { + "epoch": 0.24365219193698828, + "grad_norm": 0.4967138469219208, + "learning_rate": 3.540161700631617e-05, + "loss": 0.2271, + "step": 2351 + }, + { + "epoch": 0.2437558296196497, + "grad_norm": 0.5033750534057617, + "learning_rate": 3.5397333272103606e-05, + "loss": 0.2315, + "step": 2352 + }, + { + "epoch": 0.24385946730231112, + "grad_norm": 0.5195525884628296, + "learning_rate": 3.5393047802943466e-05, + "loss": 0.2609, + "step": 2353 + }, + { + "epoch": 0.24396310498497253, + "grad_norm": 0.48657095432281494, + "learning_rate": 3.538876059931862e-05, + "loss": 0.2344, + "step": 2354 + }, + { + "epoch": 0.24406674266763395, + "grad_norm": 0.45042067766189575, + "learning_rate": 3.538447166171216e-05, + "loss": 0.252, + "step": 2355 + }, + { + "epoch": 0.24417038035029537, + "grad_norm": 0.4620642066001892, + "learning_rate": 3.538018099060735e-05, + "loss": 0.2239, + "step": 2356 + }, + { + "epoch": 0.24427401803295679, + "grad_norm": 0.5294321179389954, + "learning_rate": 3.5375888586487654e-05, + "loss": 0.233, + "step": 2357 + }, + { + "epoch": 0.2443776557156182, + "grad_norm": 0.4590419828891754, + "learning_rate": 3.537159444983673e-05, + "loss": 0.2632, + "step": 2358 + }, + { + "epoch": 0.24448129339827962, + "grad_norm": 0.4740966260433197, + "learning_rate": 3.536729858113845e-05, + "loss": 0.2107, + "step": 2359 + }, + { + "epoch": 0.24458493108094104, + "grad_norm": 0.39860785007476807, + "learning_rate": 3.5363000980876845e-05, + "loss": 0.2037, + "step": 2360 + }, + { + "epoch": 0.24468856876360245, + "grad_norm": 0.5298658609390259, + "learning_rate": 3.535870164953617e-05, + "loss": 0.2321, + "step": 2361 + }, + { + "epoch": 0.24479220644626387, + "grad_norm": 0.5323640704154968, + "learning_rate": 3.535440058760088e-05, + "loss": 0.2316, + "step": 2362 + }, + { + "epoch": 0.2448958441289253, + "grad_norm": 0.459436297416687, + "learning_rate": 3.5350097795555595e-05, + "loss": 0.2277, + "step": 2363 + }, + { + "epoch": 0.2449994818115867, + "grad_norm": 0.5758830308914185, + "learning_rate": 3.5345793273885146e-05, + "loss": 0.2836, + "step": 2364 + }, + { + "epoch": 0.2451031194942481, + "grad_norm": 0.48922109603881836, + "learning_rate": 3.534148702307457e-05, + "loss": 0.2193, + "step": 2365 + }, + { + "epoch": 0.2452067571769095, + "grad_norm": 0.4887892007827759, + "learning_rate": 3.533717904360909e-05, + "loss": 0.2279, + "step": 2366 + }, + { + "epoch": 0.24531039485957093, + "grad_norm": 0.3865070939064026, + "learning_rate": 3.533286933597412e-05, + "loss": 0.1826, + "step": 2367 + }, + { + "epoch": 0.24541403254223235, + "grad_norm": 0.4107268750667572, + "learning_rate": 3.532855790065526e-05, + "loss": 0.1901, + "step": 2368 + }, + { + "epoch": 0.24551767022489376, + "grad_norm": 0.5012355446815491, + "learning_rate": 3.532424473813833e-05, + "loss": 0.2448, + "step": 2369 + }, + { + "epoch": 0.24562130790755518, + "grad_norm": 0.49892574548721313, + "learning_rate": 3.5319929848909315e-05, + "loss": 0.2523, + "step": 2370 + }, + { + "epoch": 0.2457249455902166, + "grad_norm": 0.5325135588645935, + "learning_rate": 3.531561323345442e-05, + "loss": 0.2776, + "step": 2371 + }, + { + "epoch": 0.24582858327287802, + "grad_norm": 0.544353723526001, + "learning_rate": 3.531129489226005e-05, + "loss": 0.2531, + "step": 2372 + }, + { + "epoch": 0.24593222095553943, + "grad_norm": 0.4833967089653015, + "learning_rate": 3.530697482581277e-05, + "loss": 0.2313, + "step": 2373 + }, + { + "epoch": 0.24603585863820085, + "grad_norm": 0.45267507433891296, + "learning_rate": 3.5302653034599346e-05, + "loss": 0.2006, + "step": 2374 + }, + { + "epoch": 0.24613949632086227, + "grad_norm": 0.47396013140678406, + "learning_rate": 3.5298329519106784e-05, + "loss": 0.2584, + "step": 2375 + }, + { + "epoch": 0.24624313400352368, + "grad_norm": 0.5356199741363525, + "learning_rate": 3.529400427982222e-05, + "loss": 0.2736, + "step": 2376 + }, + { + "epoch": 0.2463467716861851, + "grad_norm": 0.5099332332611084, + "learning_rate": 3.528967731723304e-05, + "loss": 0.2403, + "step": 2377 + }, + { + "epoch": 0.24645040936884652, + "grad_norm": 0.48083239793777466, + "learning_rate": 3.528534863182678e-05, + "loss": 0.2074, + "step": 2378 + }, + { + "epoch": 0.24655404705150794, + "grad_norm": 0.46469470858573914, + "learning_rate": 3.52810182240912e-05, + "loss": 0.2522, + "step": 2379 + }, + { + "epoch": 0.24665768473416935, + "grad_norm": 0.39701616764068604, + "learning_rate": 3.527668609451424e-05, + "loss": 0.1751, + "step": 2380 + }, + { + "epoch": 0.24676132241683077, + "grad_norm": 0.4343932271003723, + "learning_rate": 3.527235224358405e-05, + "loss": 0.1808, + "step": 2381 + }, + { + "epoch": 0.2468649600994922, + "grad_norm": 0.46541762351989746, + "learning_rate": 3.526801667178894e-05, + "loss": 0.2317, + "step": 2382 + }, + { + "epoch": 0.2469685977821536, + "grad_norm": 0.4486386775970459, + "learning_rate": 3.526367937961745e-05, + "loss": 0.2143, + "step": 2383 + }, + { + "epoch": 0.247072235464815, + "grad_norm": 0.5101872682571411, + "learning_rate": 3.525934036755829e-05, + "loss": 0.1991, + "step": 2384 + }, + { + "epoch": 0.2471758731474764, + "grad_norm": 0.4728240966796875, + "learning_rate": 3.525499963610038e-05, + "loss": 0.1982, + "step": 2385 + }, + { + "epoch": 0.24727951083013783, + "grad_norm": 0.47777464985847473, + "learning_rate": 3.525065718573283e-05, + "loss": 0.2096, + "step": 2386 + }, + { + "epoch": 0.24738314851279924, + "grad_norm": 0.512395977973938, + "learning_rate": 3.524631301694493e-05, + "loss": 0.2312, + "step": 2387 + }, + { + "epoch": 0.24748678619546066, + "grad_norm": 0.4622325301170349, + "learning_rate": 3.524196713022619e-05, + "loss": 0.2352, + "step": 2388 + }, + { + "epoch": 0.24759042387812208, + "grad_norm": 0.5057352781295776, + "learning_rate": 3.523761952606628e-05, + "loss": 0.2622, + "step": 2389 + }, + { + "epoch": 0.2476940615607835, + "grad_norm": 0.5191410183906555, + "learning_rate": 3.523327020495509e-05, + "loss": 0.2335, + "step": 2390 + }, + { + "epoch": 0.2477976992434449, + "grad_norm": 0.5610035061836243, + "learning_rate": 3.522891916738269e-05, + "loss": 0.2573, + "step": 2391 + }, + { + "epoch": 0.24790133692610633, + "grad_norm": 0.5406128168106079, + "learning_rate": 3.5224566413839354e-05, + "loss": 0.2566, + "step": 2392 + }, + { + "epoch": 0.24800497460876775, + "grad_norm": 0.45537373423576355, + "learning_rate": 3.522021194481555e-05, + "loss": 0.2189, + "step": 2393 + }, + { + "epoch": 0.24810861229142916, + "grad_norm": 0.4934905469417572, + "learning_rate": 3.5215855760801916e-05, + "loss": 0.21, + "step": 2394 + }, + { + "epoch": 0.24821224997409058, + "grad_norm": 0.4511372745037079, + "learning_rate": 3.521149786228931e-05, + "loss": 0.2128, + "step": 2395 + }, + { + "epoch": 0.248315887656752, + "grad_norm": 0.6283062100410461, + "learning_rate": 3.5207138249768774e-05, + "loss": 0.2992, + "step": 2396 + }, + { + "epoch": 0.24841952533941342, + "grad_norm": 0.5505823493003845, + "learning_rate": 3.520277692373154e-05, + "loss": 0.2483, + "step": 2397 + }, + { + "epoch": 0.24852316302207483, + "grad_norm": 0.504241943359375, + "learning_rate": 3.519841388466903e-05, + "loss": 0.2516, + "step": 2398 + }, + { + "epoch": 0.24862680070473625, + "grad_norm": 0.5520890355110168, + "learning_rate": 3.519404913307288e-05, + "loss": 0.2548, + "step": 2399 + }, + { + "epoch": 0.24873043838739767, + "grad_norm": 0.48844993114471436, + "learning_rate": 3.518968266943488e-05, + "loss": 0.2167, + "step": 2400 + }, + { + "epoch": 0.24883407607005908, + "grad_norm": 0.419709712266922, + "learning_rate": 3.5185314494247054e-05, + "loss": 0.2273, + "step": 2401 + }, + { + "epoch": 0.2489377137527205, + "grad_norm": 0.5344715714454651, + "learning_rate": 3.51809446080016e-05, + "loss": 0.241, + "step": 2402 + }, + { + "epoch": 0.2490413514353819, + "grad_norm": 0.46909359097480774, + "learning_rate": 3.517657301119091e-05, + "loss": 0.2167, + "step": 2403 + }, + { + "epoch": 0.2491449891180433, + "grad_norm": 0.4646020531654358, + "learning_rate": 3.5172199704307556e-05, + "loss": 0.2554, + "step": 2404 + }, + { + "epoch": 0.24924862680070473, + "grad_norm": 0.5733885169029236, + "learning_rate": 3.516782468784433e-05, + "loss": 0.2596, + "step": 2405 + }, + { + "epoch": 0.24935226448336614, + "grad_norm": 0.41124191880226135, + "learning_rate": 3.51634479622942e-05, + "loss": 0.1986, + "step": 2406 + }, + { + "epoch": 0.24945590216602756, + "grad_norm": 0.40536609292030334, + "learning_rate": 3.515906952815032e-05, + "loss": 0.1833, + "step": 2407 + }, + { + "epoch": 0.24955953984868898, + "grad_norm": 0.46964260935783386, + "learning_rate": 3.5154689385906057e-05, + "loss": 0.2116, + "step": 2408 + }, + { + "epoch": 0.2496631775313504, + "grad_norm": 0.4841628968715668, + "learning_rate": 3.515030753605495e-05, + "loss": 0.2472, + "step": 2409 + }, + { + "epoch": 0.2497668152140118, + "grad_norm": 0.431428462266922, + "learning_rate": 3.514592397909073e-05, + "loss": 0.2226, + "step": 2410 + }, + { + "epoch": 0.24987045289667323, + "grad_norm": 0.46539610624313354, + "learning_rate": 3.5141538715507365e-05, + "loss": 0.239, + "step": 2411 + }, + { + "epoch": 0.24997409057933465, + "grad_norm": 0.4847576320171356, + "learning_rate": 3.5137151745798936e-05, + "loss": 0.2361, + "step": 2412 + }, + { + "epoch": 0.25007772826199604, + "grad_norm": 0.46512630581855774, + "learning_rate": 3.513276307045979e-05, + "loss": 0.2222, + "step": 2413 + }, + { + "epoch": 0.25018136594465745, + "grad_norm": 0.5072996616363525, + "learning_rate": 3.512837268998442e-05, + "loss": 0.2377, + "step": 2414 + }, + { + "epoch": 0.25028500362731887, + "grad_norm": 0.4271259903907776, + "learning_rate": 3.512398060486753e-05, + "loss": 0.2092, + "step": 2415 + }, + { + "epoch": 0.2503886413099803, + "grad_norm": 0.43090930581092834, + "learning_rate": 3.5119586815604024e-05, + "loss": 0.1862, + "step": 2416 + }, + { + "epoch": 0.2504922789926417, + "grad_norm": 0.49372291564941406, + "learning_rate": 3.511519132268897e-05, + "loss": 0.2342, + "step": 2417 + }, + { + "epoch": 0.2505959166753031, + "grad_norm": 0.47086745500564575, + "learning_rate": 3.511079412661766e-05, + "loss": 0.2385, + "step": 2418 + }, + { + "epoch": 0.25069955435796454, + "grad_norm": 0.4185890853404999, + "learning_rate": 3.510639522788556e-05, + "loss": 0.1988, + "step": 2419 + }, + { + "epoch": 0.25080319204062596, + "grad_norm": 0.40863776206970215, + "learning_rate": 3.510199462698832e-05, + "loss": 0.2124, + "step": 2420 + }, + { + "epoch": 0.2509068297232874, + "grad_norm": 0.4368896186351776, + "learning_rate": 3.509759232442182e-05, + "loss": 0.2064, + "step": 2421 + }, + { + "epoch": 0.2510104674059488, + "grad_norm": 0.5333195924758911, + "learning_rate": 3.509318832068207e-05, + "loss": 0.2248, + "step": 2422 + }, + { + "epoch": 0.2511141050886102, + "grad_norm": 0.4488769769668579, + "learning_rate": 3.508878261626533e-05, + "loss": 0.1975, + "step": 2423 + }, + { + "epoch": 0.2512177427712716, + "grad_norm": 0.40054166316986084, + "learning_rate": 3.508437521166802e-05, + "loss": 0.1882, + "step": 2424 + }, + { + "epoch": 0.25132138045393304, + "grad_norm": 0.4375198185443878, + "learning_rate": 3.507996610738676e-05, + "loss": 0.2201, + "step": 2425 + }, + { + "epoch": 0.25142501813659446, + "grad_norm": 0.45398038625717163, + "learning_rate": 3.507555530391836e-05, + "loss": 0.2449, + "step": 2426 + }, + { + "epoch": 0.2515286558192559, + "grad_norm": 0.48056045174598694, + "learning_rate": 3.507114280175983e-05, + "loss": 0.196, + "step": 2427 + }, + { + "epoch": 0.2516322935019173, + "grad_norm": 0.3780093193054199, + "learning_rate": 3.5066728601408345e-05, + "loss": 0.1725, + "step": 2428 + }, + { + "epoch": 0.2517359311845787, + "grad_norm": 0.4994482696056366, + "learning_rate": 3.506231270336131e-05, + "loss": 0.2284, + "step": 2429 + }, + { + "epoch": 0.2518395688672401, + "grad_norm": 0.4925689995288849, + "learning_rate": 3.50578951081163e-05, + "loss": 0.2282, + "step": 2430 + }, + { + "epoch": 0.25194320654990154, + "grad_norm": 0.4943445026874542, + "learning_rate": 3.505347581617107e-05, + "loss": 0.2013, + "step": 2431 + }, + { + "epoch": 0.25204684423256296, + "grad_norm": 0.4667070508003235, + "learning_rate": 3.504905482802358e-05, + "loss": 0.2371, + "step": 2432 + }, + { + "epoch": 0.2521504819152244, + "grad_norm": 0.45808538794517517, + "learning_rate": 3.5044632144172e-05, + "loss": 0.2211, + "step": 2433 + }, + { + "epoch": 0.2522541195978858, + "grad_norm": 0.5020188093185425, + "learning_rate": 3.5040207765114646e-05, + "loss": 0.2668, + "step": 2434 + }, + { + "epoch": 0.2523577572805472, + "grad_norm": 0.5777788162231445, + "learning_rate": 3.503578169135007e-05, + "loss": 0.2702, + "step": 2435 + }, + { + "epoch": 0.25246139496320863, + "grad_norm": 0.4506339728832245, + "learning_rate": 3.5031353923376985e-05, + "loss": 0.2052, + "step": 2436 + }, + { + "epoch": 0.25256503264587005, + "grad_norm": 0.5711771845817566, + "learning_rate": 3.50269244616943e-05, + "loss": 0.2459, + "step": 2437 + }, + { + "epoch": 0.25266867032853146, + "grad_norm": 0.4558737277984619, + "learning_rate": 3.502249330680114e-05, + "loss": 0.2507, + "step": 2438 + }, + { + "epoch": 0.2527723080111929, + "grad_norm": 0.45028817653656006, + "learning_rate": 3.5018060459196774e-05, + "loss": 0.2199, + "step": 2439 + }, + { + "epoch": 0.2528759456938543, + "grad_norm": 0.46420130133628845, + "learning_rate": 3.501362591938071e-05, + "loss": 0.2087, + "step": 2440 + }, + { + "epoch": 0.2529795833765157, + "grad_norm": 0.4835816025733948, + "learning_rate": 3.500918968785261e-05, + "loss": 0.2513, + "step": 2441 + }, + { + "epoch": 0.25308322105917713, + "grad_norm": 0.5205420851707458, + "learning_rate": 3.500475176511235e-05, + "loss": 0.2436, + "step": 2442 + }, + { + "epoch": 0.25318685874183855, + "grad_norm": 0.4250602722167969, + "learning_rate": 3.500031215165999e-05, + "loss": 0.1797, + "step": 2443 + }, + { + "epoch": 0.25329049642449997, + "grad_norm": 0.5139288306236267, + "learning_rate": 3.499587084799577e-05, + "loss": 0.2537, + "step": 2444 + }, + { + "epoch": 0.2533941341071614, + "grad_norm": 0.4984401762485504, + "learning_rate": 3.499142785462014e-05, + "loss": 0.2866, + "step": 2445 + }, + { + "epoch": 0.2534977717898228, + "grad_norm": 0.5047465562820435, + "learning_rate": 3.498698317203372e-05, + "loss": 0.2601, + "step": 2446 + }, + { + "epoch": 0.2536014094724842, + "grad_norm": 0.47029536962509155, + "learning_rate": 3.498253680073733e-05, + "loss": 0.2057, + "step": 2447 + }, + { + "epoch": 0.25370504715514564, + "grad_norm": 0.458085834980011, + "learning_rate": 3.497808874123199e-05, + "loss": 0.2185, + "step": 2448 + }, + { + "epoch": 0.25380868483780705, + "grad_norm": 0.507517397403717, + "learning_rate": 3.497363899401889e-05, + "loss": 0.2218, + "step": 2449 + }, + { + "epoch": 0.25391232252046847, + "grad_norm": 0.3929824233055115, + "learning_rate": 3.496918755959943e-05, + "loss": 0.2094, + "step": 2450 + }, + { + "epoch": 0.25401596020312983, + "grad_norm": 0.4605729579925537, + "learning_rate": 3.496473443847519e-05, + "loss": 0.2345, + "step": 2451 + }, + { + "epoch": 0.25411959788579125, + "grad_norm": 0.46283668279647827, + "learning_rate": 3.4960279631147926e-05, + "loss": 0.25, + "step": 2452 + }, + { + "epoch": 0.25422323556845267, + "grad_norm": 0.5951175093650818, + "learning_rate": 3.4955823138119616e-05, + "loss": 0.2615, + "step": 2453 + }, + { + "epoch": 0.2543268732511141, + "grad_norm": 0.5442880392074585, + "learning_rate": 3.4951364959892404e-05, + "loss": 0.2556, + "step": 2454 + }, + { + "epoch": 0.2544305109337755, + "grad_norm": 0.48734351992607117, + "learning_rate": 3.494690509696863e-05, + "loss": 0.2347, + "step": 2455 + }, + { + "epoch": 0.2545341486164369, + "grad_norm": 0.4687855541706085, + "learning_rate": 3.4942443549850825e-05, + "loss": 0.2264, + "step": 2456 + }, + { + "epoch": 0.25463778629909833, + "grad_norm": 0.4448159337043762, + "learning_rate": 3.4937980319041704e-05, + "loss": 0.2161, + "step": 2457 + }, + { + "epoch": 0.25474142398175975, + "grad_norm": 0.5229969024658203, + "learning_rate": 3.493351540504419e-05, + "loss": 0.2545, + "step": 2458 + }, + { + "epoch": 0.25484506166442117, + "grad_norm": 0.48294875025749207, + "learning_rate": 3.492904880836137e-05, + "loss": 0.2283, + "step": 2459 + }, + { + "epoch": 0.2549486993470826, + "grad_norm": 0.4588148295879364, + "learning_rate": 3.492458052949654e-05, + "loss": 0.227, + "step": 2460 + }, + { + "epoch": 0.255052337029744, + "grad_norm": 0.4651046097278595, + "learning_rate": 3.492011056895318e-05, + "loss": 0.2307, + "step": 2461 + }, + { + "epoch": 0.2551559747124054, + "grad_norm": 0.44987449049949646, + "learning_rate": 3.4915638927234945e-05, + "loss": 0.1988, + "step": 2462 + }, + { + "epoch": 0.25525961239506684, + "grad_norm": 0.49422404170036316, + "learning_rate": 3.491116560484571e-05, + "loss": 0.2618, + "step": 2463 + }, + { + "epoch": 0.25536325007772825, + "grad_norm": 0.43810033798217773, + "learning_rate": 3.490669060228951e-05, + "loss": 0.209, + "step": 2464 + }, + { + "epoch": 0.25546688776038967, + "grad_norm": 0.46789655089378357, + "learning_rate": 3.4902213920070594e-05, + "loss": 0.2543, + "step": 2465 + }, + { + "epoch": 0.2555705254430511, + "grad_norm": 0.49316537380218506, + "learning_rate": 3.489773555869337e-05, + "loss": 0.2182, + "step": 2466 + }, + { + "epoch": 0.2556741631257125, + "grad_norm": 0.4785871207714081, + "learning_rate": 3.489325551866246e-05, + "loss": 0.2429, + "step": 2467 + }, + { + "epoch": 0.2557778008083739, + "grad_norm": 0.4753807485103607, + "learning_rate": 3.488877380048268e-05, + "loss": 0.2197, + "step": 2468 + }, + { + "epoch": 0.25588143849103534, + "grad_norm": 0.5222952365875244, + "learning_rate": 3.488429040465901e-05, + "loss": 0.2271, + "step": 2469 + }, + { + "epoch": 0.25598507617369676, + "grad_norm": 0.4613376557826996, + "learning_rate": 3.4879805331696636e-05, + "loss": 0.1977, + "step": 2470 + }, + { + "epoch": 0.2560887138563582, + "grad_norm": 0.47917234897613525, + "learning_rate": 3.487531858210093e-05, + "loss": 0.249, + "step": 2471 + }, + { + "epoch": 0.2561923515390196, + "grad_norm": 0.48481738567352295, + "learning_rate": 3.487083015637745e-05, + "loss": 0.2302, + "step": 2472 + }, + { + "epoch": 0.256295989221681, + "grad_norm": 0.4428340196609497, + "learning_rate": 3.486634005503194e-05, + "loss": 0.2583, + "step": 2473 + }, + { + "epoch": 0.2563996269043424, + "grad_norm": 0.49516594409942627, + "learning_rate": 3.486184827857034e-05, + "loss": 0.2256, + "step": 2474 + }, + { + "epoch": 0.25650326458700384, + "grad_norm": 0.442982017993927, + "learning_rate": 3.4857354827498785e-05, + "loss": 0.224, + "step": 2475 + }, + { + "epoch": 0.25660690226966526, + "grad_norm": 0.5688411593437195, + "learning_rate": 3.485285970232359e-05, + "loss": 0.2637, + "step": 2476 + }, + { + "epoch": 0.2567105399523267, + "grad_norm": 0.6057034134864807, + "learning_rate": 3.484836290355124e-05, + "loss": 0.2628, + "step": 2477 + }, + { + "epoch": 0.2568141776349881, + "grad_norm": 0.5497249960899353, + "learning_rate": 3.484386443168845e-05, + "loss": 0.2511, + "step": 2478 + }, + { + "epoch": 0.2569178153176495, + "grad_norm": 0.5606237053871155, + "learning_rate": 3.483936428724209e-05, + "loss": 0.2256, + "step": 2479 + }, + { + "epoch": 0.25702145300031093, + "grad_norm": 0.37174859642982483, + "learning_rate": 3.483486247071923e-05, + "loss": 0.1698, + "step": 2480 + }, + { + "epoch": 0.25712509068297235, + "grad_norm": 0.4627086818218231, + "learning_rate": 3.483035898262712e-05, + "loss": 0.2211, + "step": 2481 + }, + { + "epoch": 0.25722872836563376, + "grad_norm": 0.5031776428222656, + "learning_rate": 3.482585382347323e-05, + "loss": 0.2481, + "step": 2482 + }, + { + "epoch": 0.2573323660482952, + "grad_norm": 0.44983866810798645, + "learning_rate": 3.482134699376517e-05, + "loss": 0.2178, + "step": 2483 + }, + { + "epoch": 0.2574360037309566, + "grad_norm": 0.4599207937717438, + "learning_rate": 3.481683849401076e-05, + "loss": 0.2105, + "step": 2484 + }, + { + "epoch": 0.257539641413618, + "grad_norm": 0.503778338432312, + "learning_rate": 3.481232832471803e-05, + "loss": 0.2501, + "step": 2485 + }, + { + "epoch": 0.25764327909627943, + "grad_norm": 0.4835233688354492, + "learning_rate": 3.480781648639518e-05, + "loss": 0.2587, + "step": 2486 + }, + { + "epoch": 0.25774691677894085, + "grad_norm": 0.46209481358528137, + "learning_rate": 3.480330297955058e-05, + "loss": 0.2162, + "step": 2487 + }, + { + "epoch": 0.25785055446160227, + "grad_norm": 0.47634264826774597, + "learning_rate": 3.479878780469281e-05, + "loss": 0.2128, + "step": 2488 + }, + { + "epoch": 0.2579541921442636, + "grad_norm": 0.5128913521766663, + "learning_rate": 3.4794270962330636e-05, + "loss": 0.2432, + "step": 2489 + }, + { + "epoch": 0.25805782982692504, + "grad_norm": 0.5134532451629639, + "learning_rate": 3.478975245297301e-05, + "loss": 0.2343, + "step": 2490 + }, + { + "epoch": 0.25816146750958646, + "grad_norm": 0.47444865107536316, + "learning_rate": 3.478523227712907e-05, + "loss": 0.2099, + "step": 2491 + }, + { + "epoch": 0.2582651051922479, + "grad_norm": 0.4637894928455353, + "learning_rate": 3.478071043530814e-05, + "loss": 0.1962, + "step": 2492 + }, + { + "epoch": 0.2583687428749093, + "grad_norm": 0.4853185713291168, + "learning_rate": 3.477618692801973e-05, + "loss": 0.2556, + "step": 2493 + }, + { + "epoch": 0.2584723805575707, + "grad_norm": 0.44991862773895264, + "learning_rate": 3.4771661755773554e-05, + "loss": 0.2264, + "step": 2494 + }, + { + "epoch": 0.25857601824023213, + "grad_norm": 0.5146219730377197, + "learning_rate": 3.47671349190795e-05, + "loss": 0.2321, + "step": 2495 + }, + { + "epoch": 0.25867965592289355, + "grad_norm": 0.46559420228004456, + "learning_rate": 3.4762606418447626e-05, + "loss": 0.236, + "step": 2496 + }, + { + "epoch": 0.25878329360555496, + "grad_norm": 0.4216259717941284, + "learning_rate": 3.4758076254388215e-05, + "loss": 0.2275, + "step": 2497 + }, + { + "epoch": 0.2588869312882164, + "grad_norm": 0.526623010635376, + "learning_rate": 3.475354442741171e-05, + "loss": 0.2449, + "step": 2498 + }, + { + "epoch": 0.2589905689708778, + "grad_norm": 0.48265042901039124, + "learning_rate": 3.474901093802876e-05, + "loss": 0.2404, + "step": 2499 + }, + { + "epoch": 0.2590942066535392, + "grad_norm": 0.5185845494270325, + "learning_rate": 3.474447578675018e-05, + "loss": 0.2179, + "step": 2500 + }, + { + "epoch": 0.25919784433620063, + "grad_norm": 0.43969860672950745, + "learning_rate": 3.4739938974086995e-05, + "loss": 0.2103, + "step": 2501 + }, + { + "epoch": 0.25930148201886205, + "grad_norm": 0.5436468720436096, + "learning_rate": 3.473540050055039e-05, + "loss": 0.2532, + "step": 2502 + }, + { + "epoch": 0.25940511970152347, + "grad_norm": 0.4670475125312805, + "learning_rate": 3.473086036665177e-05, + "loss": 0.2131, + "step": 2503 + }, + { + "epoch": 0.2595087573841849, + "grad_norm": 0.5052348971366882, + "learning_rate": 3.472631857290271e-05, + "loss": 0.2488, + "step": 2504 + }, + { + "epoch": 0.2596123950668463, + "grad_norm": 0.43734100461006165, + "learning_rate": 3.472177511981496e-05, + "loss": 0.181, + "step": 2505 + }, + { + "epoch": 0.2597160327495077, + "grad_norm": 0.46347135305404663, + "learning_rate": 3.4717230007900475e-05, + "loss": 0.2301, + "step": 2506 + }, + { + "epoch": 0.25981967043216914, + "grad_norm": 0.46348029375076294, + "learning_rate": 3.4712683237671384e-05, + "loss": 0.2098, + "step": 2507 + }, + { + "epoch": 0.25992330811483055, + "grad_norm": 0.4893947243690491, + "learning_rate": 3.470813480964003e-05, + "loss": 0.2338, + "step": 2508 + }, + { + "epoch": 0.26002694579749197, + "grad_norm": 0.5009042024612427, + "learning_rate": 3.47035847243189e-05, + "loss": 0.2403, + "step": 2509 + }, + { + "epoch": 0.2601305834801534, + "grad_norm": 0.4790656864643097, + "learning_rate": 3.46990329822207e-05, + "loss": 0.2428, + "step": 2510 + }, + { + "epoch": 0.2602342211628148, + "grad_norm": 0.49752435088157654, + "learning_rate": 3.469447958385832e-05, + "loss": 0.243, + "step": 2511 + }, + { + "epoch": 0.2603378588454762, + "grad_norm": 0.43102583289146423, + "learning_rate": 3.468992452974482e-05, + "loss": 0.2117, + "step": 2512 + }, + { + "epoch": 0.26044149652813764, + "grad_norm": 0.4716110825538635, + "learning_rate": 3.468536782039346e-05, + "loss": 0.2481, + "step": 2513 + }, + { + "epoch": 0.26054513421079906, + "grad_norm": 0.4976402521133423, + "learning_rate": 3.468080945631768e-05, + "loss": 0.2461, + "step": 2514 + }, + { + "epoch": 0.2606487718934605, + "grad_norm": 0.5053207278251648, + "learning_rate": 3.467624943803112e-05, + "loss": 0.2132, + "step": 2515 + }, + { + "epoch": 0.2607524095761219, + "grad_norm": 0.4024655818939209, + "learning_rate": 3.4671687766047585e-05, + "loss": 0.1757, + "step": 2516 + }, + { + "epoch": 0.2608560472587833, + "grad_norm": 0.402815043926239, + "learning_rate": 3.4667124440881074e-05, + "loss": 0.1968, + "step": 2517 + }, + { + "epoch": 0.2609596849414447, + "grad_norm": 0.4925979673862457, + "learning_rate": 3.466255946304579e-05, + "loss": 0.2339, + "step": 2518 + }, + { + "epoch": 0.26106332262410614, + "grad_norm": 0.49440789222717285, + "learning_rate": 3.4657992833056095e-05, + "loss": 0.213, + "step": 2519 + }, + { + "epoch": 0.26116696030676756, + "grad_norm": 0.5009583234786987, + "learning_rate": 3.465342455142655e-05, + "loss": 0.2516, + "step": 2520 + }, + { + "epoch": 0.261270597989429, + "grad_norm": 0.4979705214500427, + "learning_rate": 3.464885461867191e-05, + "loss": 0.2497, + "step": 2521 + }, + { + "epoch": 0.2613742356720904, + "grad_norm": 0.45276740193367004, + "learning_rate": 3.464428303530711e-05, + "loss": 0.1933, + "step": 2522 + }, + { + "epoch": 0.2614778733547518, + "grad_norm": 0.4588010907173157, + "learning_rate": 3.4639709801847254e-05, + "loss": 0.2244, + "step": 2523 + }, + { + "epoch": 0.26158151103741323, + "grad_norm": 0.4481523931026459, + "learning_rate": 3.4635134918807656e-05, + "loss": 0.2337, + "step": 2524 + }, + { + "epoch": 0.26168514872007465, + "grad_norm": 0.4547864496707916, + "learning_rate": 3.463055838670381e-05, + "loss": 0.2262, + "step": 2525 + }, + { + "epoch": 0.26178878640273606, + "grad_norm": 0.4842788279056549, + "learning_rate": 3.4625980206051385e-05, + "loss": 0.2523, + "step": 2526 + }, + { + "epoch": 0.2618924240853974, + "grad_norm": 0.43599626421928406, + "learning_rate": 3.462140037736624e-05, + "loss": 0.1896, + "step": 2527 + }, + { + "epoch": 0.26199606176805884, + "grad_norm": 0.46129560470581055, + "learning_rate": 3.461681890116445e-05, + "loss": 0.2364, + "step": 2528 + }, + { + "epoch": 0.26209969945072026, + "grad_norm": 0.3715370297431946, + "learning_rate": 3.461223577796221e-05, + "loss": 0.1899, + "step": 2529 + }, + { + "epoch": 0.2622033371333817, + "grad_norm": 0.48308712244033813, + "learning_rate": 3.460765100827597e-05, + "loss": 0.2143, + "step": 2530 + }, + { + "epoch": 0.2623069748160431, + "grad_norm": 0.5160189270973206, + "learning_rate": 3.4603064592622315e-05, + "loss": 0.26, + "step": 2531 + }, + { + "epoch": 0.2624106124987045, + "grad_norm": 0.4867519736289978, + "learning_rate": 3.4598476531518045e-05, + "loss": 0.2412, + "step": 2532 + }, + { + "epoch": 0.2625142501813659, + "grad_norm": 0.465255469083786, + "learning_rate": 3.459388682548013e-05, + "loss": 0.2242, + "step": 2533 + }, + { + "epoch": 0.26261788786402734, + "grad_norm": 0.4549020528793335, + "learning_rate": 3.458929547502574e-05, + "loss": 0.2067, + "step": 2534 + }, + { + "epoch": 0.26272152554668876, + "grad_norm": 0.43989869952201843, + "learning_rate": 3.458470248067221e-05, + "loss": 0.2126, + "step": 2535 + }, + { + "epoch": 0.2628251632293502, + "grad_norm": 0.45250603556632996, + "learning_rate": 3.458010784293708e-05, + "loss": 0.2006, + "step": 2536 + }, + { + "epoch": 0.2629288009120116, + "grad_norm": 0.3904036581516266, + "learning_rate": 3.457551156233806e-05, + "loss": 0.1817, + "step": 2537 + }, + { + "epoch": 0.263032438594673, + "grad_norm": 0.46451374888420105, + "learning_rate": 3.457091363939306e-05, + "loss": 0.2423, + "step": 2538 + }, + { + "epoch": 0.26313607627733443, + "grad_norm": 0.4874878525733948, + "learning_rate": 3.456631407462016e-05, + "loss": 0.2326, + "step": 2539 + }, + { + "epoch": 0.26323971395999585, + "grad_norm": 0.39999979734420776, + "learning_rate": 3.4561712868537634e-05, + "loss": 0.1798, + "step": 2540 + }, + { + "epoch": 0.26334335164265726, + "grad_norm": 0.49433350563049316, + "learning_rate": 3.4557110021663934e-05, + "loss": 0.2475, + "step": 2541 + }, + { + "epoch": 0.2634469893253187, + "grad_norm": 0.4696984589099884, + "learning_rate": 3.4552505534517714e-05, + "loss": 0.185, + "step": 2542 + }, + { + "epoch": 0.2635506270079801, + "grad_norm": 0.5648089647293091, + "learning_rate": 3.4547899407617786e-05, + "loss": 0.2477, + "step": 2543 + }, + { + "epoch": 0.2636542646906415, + "grad_norm": 0.5114067792892456, + "learning_rate": 3.454329164148317e-05, + "loss": 0.221, + "step": 2544 + }, + { + "epoch": 0.26375790237330293, + "grad_norm": 0.43366900086402893, + "learning_rate": 3.453868223663306e-05, + "loss": 0.2543, + "step": 2545 + }, + { + "epoch": 0.26386154005596435, + "grad_norm": 0.563288688659668, + "learning_rate": 3.453407119358684e-05, + "loss": 0.2662, + "step": 2546 + }, + { + "epoch": 0.26396517773862577, + "grad_norm": 0.3807101845741272, + "learning_rate": 3.4529458512864064e-05, + "loss": 0.1638, + "step": 2547 + }, + { + "epoch": 0.2640688154212872, + "grad_norm": 0.4380934238433838, + "learning_rate": 3.4524844194984496e-05, + "loss": 0.225, + "step": 2548 + }, + { + "epoch": 0.2641724531039486, + "grad_norm": 0.4026784300804138, + "learning_rate": 3.4520228240468065e-05, + "loss": 0.2008, + "step": 2549 + }, + { + "epoch": 0.26427609078661, + "grad_norm": 0.3506837487220764, + "learning_rate": 3.451561064983488e-05, + "loss": 0.177, + "step": 2550 + }, + { + "epoch": 0.26437972846927144, + "grad_norm": 0.6199037432670593, + "learning_rate": 3.4510991423605254e-05, + "loss": 0.2554, + "step": 2551 + }, + { + "epoch": 0.26448336615193285, + "grad_norm": 0.48114362359046936, + "learning_rate": 3.450637056229968e-05, + "loss": 0.2453, + "step": 2552 + }, + { + "epoch": 0.26458700383459427, + "grad_norm": 0.5279095768928528, + "learning_rate": 3.450174806643881e-05, + "loss": 0.2228, + "step": 2553 + }, + { + "epoch": 0.2646906415172557, + "grad_norm": 0.47686752676963806, + "learning_rate": 3.449712393654352e-05, + "loss": 0.2405, + "step": 2554 + }, + { + "epoch": 0.2647942791999171, + "grad_norm": 0.5018643140792847, + "learning_rate": 3.449249817313484e-05, + "loss": 0.2228, + "step": 2555 + }, + { + "epoch": 0.2648979168825785, + "grad_norm": 0.558510422706604, + "learning_rate": 3.4487870776733993e-05, + "loss": 0.2629, + "step": 2556 + }, + { + "epoch": 0.26500155456523994, + "grad_norm": 0.48924878239631653, + "learning_rate": 3.448324174786239e-05, + "loss": 0.2092, + "step": 2557 + }, + { + "epoch": 0.26510519224790136, + "grad_norm": 0.4139244556427002, + "learning_rate": 3.447861108704162e-05, + "loss": 0.2089, + "step": 2558 + }, + { + "epoch": 0.2652088299305628, + "grad_norm": 0.5043182969093323, + "learning_rate": 3.447397879479346e-05, + "loss": 0.2725, + "step": 2559 + }, + { + "epoch": 0.2653124676132242, + "grad_norm": 0.4431025981903076, + "learning_rate": 3.446934487163988e-05, + "loss": 0.208, + "step": 2560 + }, + { + "epoch": 0.2654161052958856, + "grad_norm": 0.4326241910457611, + "learning_rate": 3.446470931810299e-05, + "loss": 0.2237, + "step": 2561 + }, + { + "epoch": 0.265519742978547, + "grad_norm": 0.43413740396499634, + "learning_rate": 3.4460072134705156e-05, + "loss": 0.2078, + "step": 2562 + }, + { + "epoch": 0.26562338066120844, + "grad_norm": 0.46051687002182007, + "learning_rate": 3.445543332196887e-05, + "loss": 0.1887, + "step": 2563 + }, + { + "epoch": 0.26572701834386986, + "grad_norm": 0.4019477665424347, + "learning_rate": 3.445079288041683e-05, + "loss": 0.1932, + "step": 2564 + }, + { + "epoch": 0.2658306560265312, + "grad_norm": 0.5146316289901733, + "learning_rate": 3.444615081057191e-05, + "loss": 0.2214, + "step": 2565 + }, + { + "epoch": 0.26593429370919264, + "grad_norm": 0.47542494535446167, + "learning_rate": 3.4441507112957174e-05, + "loss": 0.2559, + "step": 2566 + }, + { + "epoch": 0.26603793139185405, + "grad_norm": 0.46404582262039185, + "learning_rate": 3.443686178809587e-05, + "loss": 0.2318, + "step": 2567 + }, + { + "epoch": 0.26614156907451547, + "grad_norm": 0.5001510381698608, + "learning_rate": 3.443221483651141e-05, + "loss": 0.2377, + "step": 2568 + }, + { + "epoch": 0.2662452067571769, + "grad_norm": 0.5247422456741333, + "learning_rate": 3.4427566258727436e-05, + "loss": 0.2905, + "step": 2569 + }, + { + "epoch": 0.2663488444398383, + "grad_norm": 0.47278907895088196, + "learning_rate": 3.442291605526771e-05, + "loss": 0.2313, + "step": 2570 + }, + { + "epoch": 0.2664524821224997, + "grad_norm": 0.4855808913707733, + "learning_rate": 3.441826422665623e-05, + "loss": 0.2213, + "step": 2571 + }, + { + "epoch": 0.26655611980516114, + "grad_norm": 0.4366244077682495, + "learning_rate": 3.4413610773417144e-05, + "loss": 0.2079, + "step": 2572 + }, + { + "epoch": 0.26665975748782256, + "grad_norm": 0.4548654556274414, + "learning_rate": 3.440895569607481e-05, + "loss": 0.2035, + "step": 2573 + }, + { + "epoch": 0.266763395170484, + "grad_norm": 0.4923447072505951, + "learning_rate": 3.440429899515375e-05, + "loss": 0.2339, + "step": 2574 + }, + { + "epoch": 0.2668670328531454, + "grad_norm": 0.48065465688705444, + "learning_rate": 3.439964067117866e-05, + "loss": 0.2111, + "step": 2575 + }, + { + "epoch": 0.2669706705358068, + "grad_norm": 0.4575445353984833, + "learning_rate": 3.439498072467445e-05, + "loss": 0.1983, + "step": 2576 + }, + { + "epoch": 0.2670743082184682, + "grad_norm": 0.5345796942710876, + "learning_rate": 3.439031915616619e-05, + "loss": 0.2442, + "step": 2577 + }, + { + "epoch": 0.26717794590112964, + "grad_norm": 0.6588506698608398, + "learning_rate": 3.438565596617913e-05, + "loss": 0.2577, + "step": 2578 + }, + { + "epoch": 0.26728158358379106, + "grad_norm": 0.4015815258026123, + "learning_rate": 3.438099115523873e-05, + "loss": 0.1669, + "step": 2579 + }, + { + "epoch": 0.2673852212664525, + "grad_norm": 0.5120643377304077, + "learning_rate": 3.437632472387059e-05, + "loss": 0.2513, + "step": 2580 + }, + { + "epoch": 0.2674888589491139, + "grad_norm": 0.4354187548160553, + "learning_rate": 3.437165667260054e-05, + "loss": 0.2074, + "step": 2581 + }, + { + "epoch": 0.2675924966317753, + "grad_norm": 0.45943692326545715, + "learning_rate": 3.4366987001954555e-05, + "loss": 0.2222, + "step": 2582 + }, + { + "epoch": 0.26769613431443673, + "grad_norm": 0.49233517050743103, + "learning_rate": 3.436231571245881e-05, + "loss": 0.2487, + "step": 2583 + }, + { + "epoch": 0.26779977199709815, + "grad_norm": 0.5188155174255371, + "learning_rate": 3.435764280463965e-05, + "loss": 0.256, + "step": 2584 + }, + { + "epoch": 0.26790340967975956, + "grad_norm": 0.43278053402900696, + "learning_rate": 3.4352968279023624e-05, + "loss": 0.2005, + "step": 2585 + }, + { + "epoch": 0.268007047362421, + "grad_norm": 0.3969469368457794, + "learning_rate": 3.434829213613744e-05, + "loss": 0.204, + "step": 2586 + }, + { + "epoch": 0.2681106850450824, + "grad_norm": 0.5019869804382324, + "learning_rate": 3.434361437650801e-05, + "loss": 0.2308, + "step": 2587 + }, + { + "epoch": 0.2682143227277438, + "grad_norm": 0.5247126817703247, + "learning_rate": 3.43389350006624e-05, + "loss": 0.2222, + "step": 2588 + }, + { + "epoch": 0.26831796041040523, + "grad_norm": 0.46773356199264526, + "learning_rate": 3.433425400912789e-05, + "loss": 0.1998, + "step": 2589 + }, + { + "epoch": 0.26842159809306665, + "grad_norm": 0.4832060933113098, + "learning_rate": 3.4329571402431924e-05, + "loss": 0.2589, + "step": 2590 + }, + { + "epoch": 0.26852523577572807, + "grad_norm": 0.4738462567329407, + "learning_rate": 3.432488718110213e-05, + "loss": 0.2095, + "step": 2591 + }, + { + "epoch": 0.2686288734583895, + "grad_norm": 0.497236967086792, + "learning_rate": 3.4320201345666306e-05, + "loss": 0.2728, + "step": 2592 + }, + { + "epoch": 0.2687325111410509, + "grad_norm": 0.4942580461502075, + "learning_rate": 3.431551389665246e-05, + "loss": 0.2264, + "step": 2593 + }, + { + "epoch": 0.2688361488237123, + "grad_norm": 0.5522456169128418, + "learning_rate": 3.431082483458876e-05, + "loss": 0.2528, + "step": 2594 + }, + { + "epoch": 0.26893978650637373, + "grad_norm": 0.4271014630794525, + "learning_rate": 3.4306134160003575e-05, + "loss": 0.1912, + "step": 2595 + }, + { + "epoch": 0.26904342418903515, + "grad_norm": 0.43170085549354553, + "learning_rate": 3.430144187342542e-05, + "loss": 0.2042, + "step": 2596 + }, + { + "epoch": 0.26914706187169657, + "grad_norm": 0.48882734775543213, + "learning_rate": 3.429674797538304e-05, + "loss": 0.2515, + "step": 2597 + }, + { + "epoch": 0.269250699554358, + "grad_norm": 0.44802772998809814, + "learning_rate": 3.429205246640531e-05, + "loss": 0.2281, + "step": 2598 + }, + { + "epoch": 0.2693543372370194, + "grad_norm": 0.518919825553894, + "learning_rate": 3.428735534702133e-05, + "loss": 0.2292, + "step": 2599 + }, + { + "epoch": 0.2694579749196808, + "grad_norm": 0.41916918754577637, + "learning_rate": 3.4282656617760355e-05, + "loss": 0.2027, + "step": 2600 + }, + { + "epoch": 0.26956161260234224, + "grad_norm": 0.47483789920806885, + "learning_rate": 3.427795627915184e-05, + "loss": 0.2318, + "step": 2601 + }, + { + "epoch": 0.26966525028500365, + "grad_norm": 0.4560285806655884, + "learning_rate": 3.427325433172541e-05, + "loss": 0.2335, + "step": 2602 + }, + { + "epoch": 0.269768887967665, + "grad_norm": 0.5098150372505188, + "learning_rate": 3.426855077601086e-05, + "loss": 0.236, + "step": 2603 + }, + { + "epoch": 0.26987252565032643, + "grad_norm": 0.47620120644569397, + "learning_rate": 3.4263845612538203e-05, + "loss": 0.2085, + "step": 2604 + }, + { + "epoch": 0.26997616333298785, + "grad_norm": 0.4596398174762726, + "learning_rate": 3.425913884183759e-05, + "loss": 0.2159, + "step": 2605 + }, + { + "epoch": 0.27007980101564927, + "grad_norm": 0.6123204231262207, + "learning_rate": 3.425443046443938e-05, + "loss": 0.2241, + "step": 2606 + }, + { + "epoch": 0.2701834386983107, + "grad_norm": 0.5035721659660339, + "learning_rate": 3.42497204808741e-05, + "loss": 0.2487, + "step": 2607 + }, + { + "epoch": 0.2702870763809721, + "grad_norm": 0.47740867733955383, + "learning_rate": 3.424500889167247e-05, + "loss": 0.2217, + "step": 2608 + }, + { + "epoch": 0.2703907140636335, + "grad_norm": 0.47603389620780945, + "learning_rate": 3.424029569736538e-05, + "loss": 0.2013, + "step": 2609 + }, + { + "epoch": 0.27049435174629494, + "grad_norm": 0.5369718670845032, + "learning_rate": 3.423558089848391e-05, + "loss": 0.2376, + "step": 2610 + }, + { + "epoch": 0.27059798942895635, + "grad_norm": 0.5321739315986633, + "learning_rate": 3.423086449555931e-05, + "loss": 0.2722, + "step": 2611 + }, + { + "epoch": 0.27070162711161777, + "grad_norm": 0.47373032569885254, + "learning_rate": 3.422614648912303e-05, + "loss": 0.2063, + "step": 2612 + }, + { + "epoch": 0.2708052647942792, + "grad_norm": 0.44949063658714294, + "learning_rate": 3.4221426879706676e-05, + "loss": 0.221, + "step": 2613 + }, + { + "epoch": 0.2709089024769406, + "grad_norm": 0.4665064215660095, + "learning_rate": 3.4216705667842044e-05, + "loss": 0.1946, + "step": 2614 + }, + { + "epoch": 0.271012540159602, + "grad_norm": 0.5441843271255493, + "learning_rate": 3.421198285406112e-05, + "loss": 0.2502, + "step": 2615 + }, + { + "epoch": 0.27111617784226344, + "grad_norm": 0.45392951369285583, + "learning_rate": 3.4207258438896056e-05, + "loss": 0.2215, + "step": 2616 + }, + { + "epoch": 0.27121981552492486, + "grad_norm": 0.516017496585846, + "learning_rate": 3.4202532422879204e-05, + "loss": 0.2451, + "step": 2617 + }, + { + "epoch": 0.2713234532075863, + "grad_norm": 0.5189616680145264, + "learning_rate": 3.4197804806543076e-05, + "loss": 0.2074, + "step": 2618 + }, + { + "epoch": 0.2714270908902477, + "grad_norm": 0.5190929174423218, + "learning_rate": 3.419307559042037e-05, + "loss": 0.2542, + "step": 2619 + }, + { + "epoch": 0.2715307285729091, + "grad_norm": 0.5525068044662476, + "learning_rate": 3.4188344775043976e-05, + "loss": 0.2576, + "step": 2620 + }, + { + "epoch": 0.2716343662555705, + "grad_norm": 0.48754870891571045, + "learning_rate": 3.4183612360946945e-05, + "loss": 0.2111, + "step": 2621 + }, + { + "epoch": 0.27173800393823194, + "grad_norm": 0.4261727035045624, + "learning_rate": 3.4178878348662524e-05, + "loss": 0.2137, + "step": 2622 + }, + { + "epoch": 0.27184164162089336, + "grad_norm": 0.46469005942344666, + "learning_rate": 3.417414273872413e-05, + "loss": 0.2086, + "step": 2623 + }, + { + "epoch": 0.2719452793035548, + "grad_norm": 0.43636736273765564, + "learning_rate": 3.416940553166536e-05, + "loss": 0.2079, + "step": 2624 + }, + { + "epoch": 0.2720489169862162, + "grad_norm": 0.5391519665718079, + "learning_rate": 3.416466672802001e-05, + "loss": 0.259, + "step": 2625 + }, + { + "epoch": 0.2721525546688776, + "grad_norm": 0.47564294934272766, + "learning_rate": 3.415992632832203e-05, + "loss": 0.2077, + "step": 2626 + }, + { + "epoch": 0.27225619235153903, + "grad_norm": 0.5383174419403076, + "learning_rate": 3.415518433310556e-05, + "loss": 0.2078, + "step": 2627 + }, + { + "epoch": 0.27235983003420045, + "grad_norm": 0.4875207543373108, + "learning_rate": 3.415044074290493e-05, + "loss": 0.2087, + "step": 2628 + }, + { + "epoch": 0.27246346771686186, + "grad_norm": 0.587772011756897, + "learning_rate": 3.4145695558254635e-05, + "loss": 0.2498, + "step": 2629 + }, + { + "epoch": 0.2725671053995233, + "grad_norm": 0.48780080676078796, + "learning_rate": 3.4140948779689344e-05, + "loss": 0.2204, + "step": 2630 + }, + { + "epoch": 0.2726707430821847, + "grad_norm": 0.5223898887634277, + "learning_rate": 3.413620040774394e-05, + "loss": 0.2468, + "step": 2631 + }, + { + "epoch": 0.2727743807648461, + "grad_norm": 0.46727365255355835, + "learning_rate": 3.413145044295345e-05, + "loss": 0.2159, + "step": 2632 + }, + { + "epoch": 0.27287801844750753, + "grad_norm": 0.4932716190814972, + "learning_rate": 3.412669888585308e-05, + "loss": 0.2174, + "step": 2633 + }, + { + "epoch": 0.27298165613016895, + "grad_norm": 0.5126780867576599, + "learning_rate": 3.412194573697824e-05, + "loss": 0.2561, + "step": 2634 + }, + { + "epoch": 0.27308529381283037, + "grad_norm": 0.5749641060829163, + "learning_rate": 3.411719099686452e-05, + "loss": 0.2558, + "step": 2635 + }, + { + "epoch": 0.2731889314954918, + "grad_norm": 0.5617765188217163, + "learning_rate": 3.4112434666047656e-05, + "loss": 0.2858, + "step": 2636 + }, + { + "epoch": 0.2732925691781532, + "grad_norm": 0.5142896771430969, + "learning_rate": 3.410767674506359e-05, + "loss": 0.2607, + "step": 2637 + }, + { + "epoch": 0.2733962068608146, + "grad_norm": 0.5220364928245544, + "learning_rate": 3.410291723444845e-05, + "loss": 0.2621, + "step": 2638 + }, + { + "epoch": 0.27349984454347603, + "grad_norm": 0.5148720741271973, + "learning_rate": 3.4098156134738505e-05, + "loss": 0.2601, + "step": 2639 + }, + { + "epoch": 0.27360348222613745, + "grad_norm": 0.42664462327957153, + "learning_rate": 3.409339344647025e-05, + "loss": 0.2046, + "step": 2640 + }, + { + "epoch": 0.2737071199087988, + "grad_norm": 0.5582861304283142, + "learning_rate": 3.408862917018033e-05, + "loss": 0.2342, + "step": 2641 + }, + { + "epoch": 0.27381075759146023, + "grad_norm": 0.4054379463195801, + "learning_rate": 3.4083863306405576e-05, + "loss": 0.1905, + "step": 2642 + }, + { + "epoch": 0.27391439527412165, + "grad_norm": 0.41754868626594543, + "learning_rate": 3.4079095855683e-05, + "loss": 0.2054, + "step": 2643 + }, + { + "epoch": 0.27401803295678306, + "grad_norm": 0.4834083318710327, + "learning_rate": 3.407432681854978e-05, + "loss": 0.2179, + "step": 2644 + }, + { + "epoch": 0.2741216706394445, + "grad_norm": 0.46197500824928284, + "learning_rate": 3.4069556195543305e-05, + "loss": 0.213, + "step": 2645 + }, + { + "epoch": 0.2742253083221059, + "grad_norm": 0.47068068385124207, + "learning_rate": 3.4064783987201104e-05, + "loss": 0.2639, + "step": 2646 + }, + { + "epoch": 0.2743289460047673, + "grad_norm": 0.43616458773612976, + "learning_rate": 3.406001019406091e-05, + "loss": 0.1928, + "step": 2647 + }, + { + "epoch": 0.27443258368742873, + "grad_norm": 0.4012099504470825, + "learning_rate": 3.405523481666063e-05, + "loss": 0.177, + "step": 2648 + }, + { + "epoch": 0.27453622137009015, + "grad_norm": 0.6151448488235474, + "learning_rate": 3.405045785553832e-05, + "loss": 0.2866, + "step": 2649 + }, + { + "epoch": 0.27463985905275157, + "grad_norm": 0.5154361128807068, + "learning_rate": 3.4045679311232276e-05, + "loss": 0.2403, + "step": 2650 + }, + { + "epoch": 0.274743496735413, + "grad_norm": 0.41954943537712097, + "learning_rate": 3.404089918428092e-05, + "loss": 0.2054, + "step": 2651 + }, + { + "epoch": 0.2748471344180744, + "grad_norm": 0.5191131234169006, + "learning_rate": 3.4036117475222865e-05, + "loss": 0.2279, + "step": 2652 + }, + { + "epoch": 0.2749507721007358, + "grad_norm": 0.4807083010673523, + "learning_rate": 3.4031334184596926e-05, + "loss": 0.208, + "step": 2653 + }, + { + "epoch": 0.27505440978339724, + "grad_norm": 0.4737519919872284, + "learning_rate": 3.4026549312942046e-05, + "loss": 0.2369, + "step": 2654 + }, + { + "epoch": 0.27515804746605865, + "grad_norm": 0.4876102805137634, + "learning_rate": 3.4021762860797405e-05, + "loss": 0.2183, + "step": 2655 + }, + { + "epoch": 0.27526168514872007, + "grad_norm": 0.44601619243621826, + "learning_rate": 3.4016974828702326e-05, + "loss": 0.2141, + "step": 2656 + }, + { + "epoch": 0.2753653228313815, + "grad_norm": 0.5628261566162109, + "learning_rate": 3.40121852171963e-05, + "loss": 0.216, + "step": 2657 + }, + { + "epoch": 0.2754689605140429, + "grad_norm": 0.48517346382141113, + "learning_rate": 3.400739402681904e-05, + "loss": 0.2285, + "step": 2658 + }, + { + "epoch": 0.2755725981967043, + "grad_norm": 0.47906678915023804, + "learning_rate": 3.400260125811039e-05, + "loss": 0.2302, + "step": 2659 + }, + { + "epoch": 0.27567623587936574, + "grad_norm": 0.5147571563720703, + "learning_rate": 3.399780691161039e-05, + "loss": 0.2663, + "step": 2660 + }, + { + "epoch": 0.27577987356202716, + "grad_norm": 0.49551793932914734, + "learning_rate": 3.3993010987859275e-05, + "loss": 0.2448, + "step": 2661 + }, + { + "epoch": 0.2758835112446886, + "grad_norm": 0.5411267876625061, + "learning_rate": 3.398821348739743e-05, + "loss": 0.2527, + "step": 2662 + }, + { + "epoch": 0.27598714892735, + "grad_norm": 0.44436827301979065, + "learning_rate": 3.3983414410765445e-05, + "loss": 0.2313, + "step": 2663 + }, + { + "epoch": 0.2760907866100114, + "grad_norm": 0.5024104714393616, + "learning_rate": 3.397861375850405e-05, + "loss": 0.2151, + "step": 2664 + }, + { + "epoch": 0.2761944242926728, + "grad_norm": 0.46596401929855347, + "learning_rate": 3.397381153115419e-05, + "loss": 0.2007, + "step": 2665 + }, + { + "epoch": 0.27629806197533424, + "grad_norm": 0.5202387571334839, + "learning_rate": 3.396900772925697e-05, + "loss": 0.2144, + "step": 2666 + }, + { + "epoch": 0.27640169965799566, + "grad_norm": 0.4352811574935913, + "learning_rate": 3.396420235335367e-05, + "loss": 0.1939, + "step": 2667 + }, + { + "epoch": 0.2765053373406571, + "grad_norm": 0.47732260823249817, + "learning_rate": 3.395939540398577e-05, + "loss": 0.2285, + "step": 2668 + }, + { + "epoch": 0.2766089750233185, + "grad_norm": 0.43177780508995056, + "learning_rate": 3.395458688169489e-05, + "loss": 0.2192, + "step": 2669 + }, + { + "epoch": 0.2767126127059799, + "grad_norm": 0.4750939905643463, + "learning_rate": 3.394977678702285e-05, + "loss": 0.2262, + "step": 2670 + }, + { + "epoch": 0.2768162503886413, + "grad_norm": 0.49694186449050903, + "learning_rate": 3.3944965120511645e-05, + "loss": 0.2308, + "step": 2671 + }, + { + "epoch": 0.27691988807130274, + "grad_norm": 0.4427125155925751, + "learning_rate": 3.3940151882703446e-05, + "loss": 0.2138, + "step": 2672 + }, + { + "epoch": 0.27702352575396416, + "grad_norm": 0.4885327219963074, + "learning_rate": 3.393533707414061e-05, + "loss": 0.2419, + "step": 2673 + }, + { + "epoch": 0.2771271634366256, + "grad_norm": 0.43068984150886536, + "learning_rate": 3.393052069536566e-05, + "loss": 0.2122, + "step": 2674 + }, + { + "epoch": 0.277230801119287, + "grad_norm": 0.4368421137332916, + "learning_rate": 3.392570274692128e-05, + "loss": 0.2531, + "step": 2675 + }, + { + "epoch": 0.2773344388019484, + "grad_norm": 0.5177136063575745, + "learning_rate": 3.3920883229350375e-05, + "loss": 0.2249, + "step": 2676 + }, + { + "epoch": 0.27743807648460983, + "grad_norm": 0.5152319073677063, + "learning_rate": 3.391606214319598e-05, + "loss": 0.2245, + "step": 2677 + }, + { + "epoch": 0.27754171416727125, + "grad_norm": 0.4497338831424713, + "learning_rate": 3.3911239489001344e-05, + "loss": 0.1975, + "step": 2678 + }, + { + "epoch": 0.2776453518499326, + "grad_norm": 0.5171423554420471, + "learning_rate": 3.3906415267309855e-05, + "loss": 0.2159, + "step": 2679 + }, + { + "epoch": 0.277748989532594, + "grad_norm": 0.4115270972251892, + "learning_rate": 3.390158947866512e-05, + "loss": 0.1753, + "step": 2680 + }, + { + "epoch": 0.27785262721525544, + "grad_norm": 0.5156992077827454, + "learning_rate": 3.389676212361089e-05, + "loss": 0.2541, + "step": 2681 + }, + { + "epoch": 0.27795626489791686, + "grad_norm": 0.44904083013534546, + "learning_rate": 3.3891933202691105e-05, + "loss": 0.2106, + "step": 2682 + }, + { + "epoch": 0.2780599025805783, + "grad_norm": 0.3925611674785614, + "learning_rate": 3.388710271644989e-05, + "loss": 0.1621, + "step": 2683 + }, + { + "epoch": 0.2781635402632397, + "grad_norm": 0.4597630500793457, + "learning_rate": 3.388227066543152e-05, + "loss": 0.2062, + "step": 2684 + }, + { + "epoch": 0.2782671779459011, + "grad_norm": 0.5479700565338135, + "learning_rate": 3.3877437050180485e-05, + "loss": 0.2441, + "step": 2685 + }, + { + "epoch": 0.27837081562856253, + "grad_norm": 0.4624266028404236, + "learning_rate": 3.38726018712414e-05, + "loss": 0.2156, + "step": 2686 + }, + { + "epoch": 0.27847445331122395, + "grad_norm": 0.5356397032737732, + "learning_rate": 3.386776512915911e-05, + "loss": 0.2616, + "step": 2687 + }, + { + "epoch": 0.27857809099388536, + "grad_norm": 0.5255545973777771, + "learning_rate": 3.38629268244786e-05, + "loss": 0.2488, + "step": 2688 + }, + { + "epoch": 0.2786817286765468, + "grad_norm": 0.492130845785141, + "learning_rate": 3.385808695774505e-05, + "loss": 0.2256, + "step": 2689 + }, + { + "epoch": 0.2787853663592082, + "grad_norm": 0.5462003350257874, + "learning_rate": 3.38532455295038e-05, + "loss": 0.2542, + "step": 2690 + }, + { + "epoch": 0.2788890040418696, + "grad_norm": 0.4930932819843292, + "learning_rate": 3.384840254030039e-05, + "loss": 0.2697, + "step": 2691 + }, + { + "epoch": 0.27899264172453103, + "grad_norm": 0.48198801279067993, + "learning_rate": 3.38435579906805e-05, + "loss": 0.2177, + "step": 2692 + }, + { + "epoch": 0.27909627940719245, + "grad_norm": 0.5145705938339233, + "learning_rate": 3.383871188119001e-05, + "loss": 0.2553, + "step": 2693 + }, + { + "epoch": 0.27919991708985387, + "grad_norm": 0.4967647194862366, + "learning_rate": 3.383386421237498e-05, + "loss": 0.2098, + "step": 2694 + }, + { + "epoch": 0.2793035547725153, + "grad_norm": 0.4569028317928314, + "learning_rate": 3.382901498478164e-05, + "loss": 0.2186, + "step": 2695 + }, + { + "epoch": 0.2794071924551767, + "grad_norm": 0.5114362239837646, + "learning_rate": 3.382416419895639e-05, + "loss": 0.2598, + "step": 2696 + }, + { + "epoch": 0.2795108301378381, + "grad_norm": 0.4411246180534363, + "learning_rate": 3.3819311855445814e-05, + "loss": 0.1998, + "step": 2697 + }, + { + "epoch": 0.27961446782049953, + "grad_norm": 0.4930228292942047, + "learning_rate": 3.381445795479665e-05, + "loss": 0.2473, + "step": 2698 + }, + { + "epoch": 0.27971810550316095, + "grad_norm": 0.4813242554664612, + "learning_rate": 3.380960249755584e-05, + "loss": 0.2299, + "step": 2699 + }, + { + "epoch": 0.27982174318582237, + "grad_norm": 0.5559103488922119, + "learning_rate": 3.3804745484270496e-05, + "loss": 0.2343, + "step": 2700 + }, + { + "epoch": 0.2799253808684838, + "grad_norm": 0.4250171482563019, + "learning_rate": 3.379988691548788e-05, + "loss": 0.1892, + "step": 2701 + }, + { + "epoch": 0.2800290185511452, + "grad_norm": 0.5068527460098267, + "learning_rate": 3.379502679175547e-05, + "loss": 0.2442, + "step": 2702 + }, + { + "epoch": 0.2801326562338066, + "grad_norm": 0.48161518573760986, + "learning_rate": 3.379016511362088e-05, + "loss": 0.2018, + "step": 2703 + }, + { + "epoch": 0.28023629391646804, + "grad_norm": 0.4886307418346405, + "learning_rate": 3.378530188163192e-05, + "loss": 0.2528, + "step": 2704 + }, + { + "epoch": 0.28033993159912945, + "grad_norm": 0.43843701481819153, + "learning_rate": 3.378043709633658e-05, + "loss": 0.2159, + "step": 2705 + }, + { + "epoch": 0.28044356928179087, + "grad_norm": 0.44648537039756775, + "learning_rate": 3.3775570758283004e-05, + "loss": 0.199, + "step": 2706 + }, + { + "epoch": 0.2805472069644523, + "grad_norm": 0.3697032630443573, + "learning_rate": 3.377070286801953e-05, + "loss": 0.1692, + "step": 2707 + }, + { + "epoch": 0.2806508446471137, + "grad_norm": 0.5212875008583069, + "learning_rate": 3.3765833426094664e-05, + "loss": 0.2153, + "step": 2708 + }, + { + "epoch": 0.2807544823297751, + "grad_norm": 0.5370132327079773, + "learning_rate": 3.376096243305709e-05, + "loss": 0.2295, + "step": 2709 + }, + { + "epoch": 0.28085812001243654, + "grad_norm": 0.4295409321784973, + "learning_rate": 3.375608988945566e-05, + "loss": 0.1938, + "step": 2710 + }, + { + "epoch": 0.28096175769509796, + "grad_norm": 0.5155740976333618, + "learning_rate": 3.3751215795839405e-05, + "loss": 0.2662, + "step": 2711 + }, + { + "epoch": 0.2810653953777594, + "grad_norm": 0.498410165309906, + "learning_rate": 3.374634015275753e-05, + "loss": 0.238, + "step": 2712 + }, + { + "epoch": 0.2811690330604208, + "grad_norm": 0.4350877106189728, + "learning_rate": 3.374146296075942e-05, + "loss": 0.2165, + "step": 2713 + }, + { + "epoch": 0.2812726707430822, + "grad_norm": 0.4817065894603729, + "learning_rate": 3.373658422039461e-05, + "loss": 0.2161, + "step": 2714 + }, + { + "epoch": 0.2813763084257436, + "grad_norm": 0.4794848561286926, + "learning_rate": 3.373170393221286e-05, + "loss": 0.2226, + "step": 2715 + }, + { + "epoch": 0.28147994610840504, + "grad_norm": 0.5036630034446716, + "learning_rate": 3.372682209676406e-05, + "loss": 0.2261, + "step": 2716 + }, + { + "epoch": 0.2815835837910664, + "grad_norm": 0.49164390563964844, + "learning_rate": 3.372193871459827e-05, + "loss": 0.2334, + "step": 2717 + }, + { + "epoch": 0.2816872214737278, + "grad_norm": 0.5513508319854736, + "learning_rate": 3.371705378626577e-05, + "loss": 0.2651, + "step": 2718 + }, + { + "epoch": 0.28179085915638924, + "grad_norm": 0.4919818639755249, + "learning_rate": 3.371216731231696e-05, + "loss": 0.2358, + "step": 2719 + }, + { + "epoch": 0.28189449683905066, + "grad_norm": 0.5139323472976685, + "learning_rate": 3.370727929330246e-05, + "loss": 0.2665, + "step": 2720 + }, + { + "epoch": 0.2819981345217121, + "grad_norm": 0.45646989345550537, + "learning_rate": 3.370238972977304e-05, + "loss": 0.2199, + "step": 2721 + }, + { + "epoch": 0.2821017722043735, + "grad_norm": 0.4081878364086151, + "learning_rate": 3.369749862227965e-05, + "loss": 0.1909, + "step": 2722 + }, + { + "epoch": 0.2822054098870349, + "grad_norm": 0.41155028343200684, + "learning_rate": 3.3692605971373396e-05, + "loss": 0.1797, + "step": 2723 + }, + { + "epoch": 0.2823090475696963, + "grad_norm": 0.5415491461753845, + "learning_rate": 3.368771177760559e-05, + "loss": 0.2148, + "step": 2724 + }, + { + "epoch": 0.28241268525235774, + "grad_norm": 0.4850313365459442, + "learning_rate": 3.368281604152771e-05, + "loss": 0.2251, + "step": 2725 + }, + { + "epoch": 0.28251632293501916, + "grad_norm": 0.5246897339820862, + "learning_rate": 3.367791876369138e-05, + "loss": 0.2501, + "step": 2726 + }, + { + "epoch": 0.2826199606176806, + "grad_norm": 0.5343725681304932, + "learning_rate": 3.3673019944648425e-05, + "loss": 0.2495, + "step": 2727 + }, + { + "epoch": 0.282723598300342, + "grad_norm": 0.501419186592102, + "learning_rate": 3.366811958495084e-05, + "loss": 0.2135, + "step": 2728 + }, + { + "epoch": 0.2828272359830034, + "grad_norm": 0.5163164734840393, + "learning_rate": 3.366321768515079e-05, + "loss": 0.2465, + "step": 2729 + }, + { + "epoch": 0.28293087366566483, + "grad_norm": 0.5618857145309448, + "learning_rate": 3.36583142458006e-05, + "loss": 0.2716, + "step": 2730 + }, + { + "epoch": 0.28303451134832625, + "grad_norm": 0.4317639172077179, + "learning_rate": 3.3653409267452805e-05, + "loss": 0.2219, + "step": 2731 + }, + { + "epoch": 0.28313814903098766, + "grad_norm": 0.4637451767921448, + "learning_rate": 3.364850275066008e-05, + "loss": 0.2314, + "step": 2732 + }, + { + "epoch": 0.2832417867136491, + "grad_norm": 0.4839981198310852, + "learning_rate": 3.3643594695975275e-05, + "loss": 0.2227, + "step": 2733 + }, + { + "epoch": 0.2833454243963105, + "grad_norm": 0.43426045775413513, + "learning_rate": 3.3638685103951427e-05, + "loss": 0.1999, + "step": 2734 + }, + { + "epoch": 0.2834490620789719, + "grad_norm": 0.4142928421497345, + "learning_rate": 3.363377397514176e-05, + "loss": 0.1953, + "step": 2735 + }, + { + "epoch": 0.28355269976163333, + "grad_norm": 0.5417696237564087, + "learning_rate": 3.3628861310099615e-05, + "loss": 0.2151, + "step": 2736 + }, + { + "epoch": 0.28365633744429475, + "grad_norm": 0.49811097979545593, + "learning_rate": 3.3623947109378574e-05, + "loss": 0.2449, + "step": 2737 + }, + { + "epoch": 0.28375997512695617, + "grad_norm": 0.48678550124168396, + "learning_rate": 3.3619031373532344e-05, + "loss": 0.2261, + "step": 2738 + }, + { + "epoch": 0.2838636128096176, + "grad_norm": 0.4436796307563782, + "learning_rate": 3.3614114103114835e-05, + "loss": 0.2415, + "step": 2739 + }, + { + "epoch": 0.283967250492279, + "grad_norm": 0.46993231773376465, + "learning_rate": 3.360919529868012e-05, + "loss": 0.2432, + "step": 2740 + }, + { + "epoch": 0.2840708881749404, + "grad_norm": 0.4227633774280548, + "learning_rate": 3.3604274960782426e-05, + "loss": 0.2129, + "step": 2741 + }, + { + "epoch": 0.28417452585760183, + "grad_norm": 0.4004562795162201, + "learning_rate": 3.3599353089976184e-05, + "loss": 0.1948, + "step": 2742 + }, + { + "epoch": 0.28427816354026325, + "grad_norm": 0.4743999242782593, + "learning_rate": 3.3594429686815965e-05, + "loss": 0.2525, + "step": 2743 + }, + { + "epoch": 0.28438180122292467, + "grad_norm": 0.46449390053749084, + "learning_rate": 3.358950475185655e-05, + "loss": 0.2396, + "step": 2744 + }, + { + "epoch": 0.2844854389055861, + "grad_norm": 0.497114360332489, + "learning_rate": 3.3584578285652866e-05, + "loss": 0.2229, + "step": 2745 + }, + { + "epoch": 0.2845890765882475, + "grad_norm": 0.519870936870575, + "learning_rate": 3.357965028876001e-05, + "loss": 0.305, + "step": 2746 + }, + { + "epoch": 0.2846927142709089, + "grad_norm": 0.46930116415023804, + "learning_rate": 3.357472076173328e-05, + "loss": 0.2255, + "step": 2747 + }, + { + "epoch": 0.28479635195357034, + "grad_norm": 0.48047900199890137, + "learning_rate": 3.35697897051281e-05, + "loss": 0.2303, + "step": 2748 + }, + { + "epoch": 0.28489998963623175, + "grad_norm": 0.430600106716156, + "learning_rate": 3.356485711950013e-05, + "loss": 0.2214, + "step": 2749 + }, + { + "epoch": 0.28500362731889317, + "grad_norm": 0.403071790933609, + "learning_rate": 3.355992300540514e-05, + "loss": 0.1785, + "step": 2750 + }, + { + "epoch": 0.2851072650015546, + "grad_norm": 0.5231764316558838, + "learning_rate": 3.355498736339911e-05, + "loss": 0.2488, + "step": 2751 + }, + { + "epoch": 0.285210902684216, + "grad_norm": 0.39844778180122375, + "learning_rate": 3.355005019403817e-05, + "loss": 0.1972, + "step": 2752 + }, + { + "epoch": 0.2853145403668774, + "grad_norm": 0.4338219463825226, + "learning_rate": 3.3545111497878636e-05, + "loss": 0.2196, + "step": 2753 + }, + { + "epoch": 0.28541817804953884, + "grad_norm": 0.4960634112358093, + "learning_rate": 3.3540171275477e-05, + "loss": 0.2427, + "step": 2754 + }, + { + "epoch": 0.2855218157322002, + "grad_norm": 0.4221111238002777, + "learning_rate": 3.353522952738991e-05, + "loss": 0.197, + "step": 2755 + }, + { + "epoch": 0.2856254534148616, + "grad_norm": 0.5171911120414734, + "learning_rate": 3.353028625417419e-05, + "loss": 0.2553, + "step": 2756 + }, + { + "epoch": 0.28572909109752304, + "grad_norm": 0.4602873921394348, + "learning_rate": 3.352534145638687e-05, + "loss": 0.213, + "step": 2757 + }, + { + "epoch": 0.28583272878018445, + "grad_norm": 0.5187010765075684, + "learning_rate": 3.352039513458508e-05, + "loss": 0.2479, + "step": 2758 + }, + { + "epoch": 0.28593636646284587, + "grad_norm": 0.4925081431865692, + "learning_rate": 3.351544728932619e-05, + "loss": 0.2646, + "step": 2759 + }, + { + "epoch": 0.2860400041455073, + "grad_norm": 0.44228821992874146, + "learning_rate": 3.3510497921167706e-05, + "loss": 0.1664, + "step": 2760 + }, + { + "epoch": 0.2861436418281687, + "grad_norm": 0.4919978976249695, + "learning_rate": 3.350554703066733e-05, + "loss": 0.2295, + "step": 2761 + }, + { + "epoch": 0.2862472795108301, + "grad_norm": 0.512550950050354, + "learning_rate": 3.350059461838291e-05, + "loss": 0.207, + "step": 2762 + }, + { + "epoch": 0.28635091719349154, + "grad_norm": 0.5739670991897583, + "learning_rate": 3.3495640684872455e-05, + "loss": 0.2877, + "step": 2763 + }, + { + "epoch": 0.28645455487615296, + "grad_norm": 0.5250087976455688, + "learning_rate": 3.34906852306942e-05, + "loss": 0.2482, + "step": 2764 + }, + { + "epoch": 0.2865581925588144, + "grad_norm": 0.5244482159614563, + "learning_rate": 3.348572825640651e-05, + "loss": 0.2481, + "step": 2765 + }, + { + "epoch": 0.2866618302414758, + "grad_norm": 0.46387696266174316, + "learning_rate": 3.3480769762567914e-05, + "loss": 0.2042, + "step": 2766 + }, + { + "epoch": 0.2867654679241372, + "grad_norm": 0.5388164520263672, + "learning_rate": 3.3475809749737146e-05, + "loss": 0.2556, + "step": 2767 + }, + { + "epoch": 0.2868691056067986, + "grad_norm": 0.3879207372665405, + "learning_rate": 3.3470848218473076e-05, + "loss": 0.1961, + "step": 2768 + }, + { + "epoch": 0.28697274328946004, + "grad_norm": 0.5269706845283508, + "learning_rate": 3.3465885169334774e-05, + "loss": 0.2097, + "step": 2769 + }, + { + "epoch": 0.28707638097212146, + "grad_norm": 0.5072423219680786, + "learning_rate": 3.3460920602881465e-05, + "loss": 0.2636, + "step": 2770 + }, + { + "epoch": 0.2871800186547829, + "grad_norm": 0.44804468750953674, + "learning_rate": 3.345595451967254e-05, + "loss": 0.2273, + "step": 2771 + }, + { + "epoch": 0.2872836563374443, + "grad_norm": 0.4465629458427429, + "learning_rate": 3.345098692026759e-05, + "loss": 0.2041, + "step": 2772 + }, + { + "epoch": 0.2873872940201057, + "grad_norm": 0.3973003029823303, + "learning_rate": 3.344601780522634e-05, + "loss": 0.1755, + "step": 2773 + }, + { + "epoch": 0.2874909317027671, + "grad_norm": 0.4714430272579193, + "learning_rate": 3.34410471751087e-05, + "loss": 0.246, + "step": 2774 + }, + { + "epoch": 0.28759456938542854, + "grad_norm": 0.46732309460639954, + "learning_rate": 3.343607503047476e-05, + "loss": 0.1942, + "step": 2775 + }, + { + "epoch": 0.28769820706808996, + "grad_norm": 0.5006055235862732, + "learning_rate": 3.343110137188478e-05, + "loss": 0.2429, + "step": 2776 + }, + { + "epoch": 0.2878018447507514, + "grad_norm": 0.48675471544265747, + "learning_rate": 3.342612619989917e-05, + "loss": 0.2485, + "step": 2777 + }, + { + "epoch": 0.2879054824334128, + "grad_norm": 0.44414421916007996, + "learning_rate": 3.342114951507854e-05, + "loss": 0.2069, + "step": 2778 + }, + { + "epoch": 0.2880091201160742, + "grad_norm": 0.4877408742904663, + "learning_rate": 3.341617131798364e-05, + "loss": 0.2289, + "step": 2779 + }, + { + "epoch": 0.28811275779873563, + "grad_norm": 0.5503265261650085, + "learning_rate": 3.34111916091754e-05, + "loss": 0.2699, + "step": 2780 + }, + { + "epoch": 0.28821639548139705, + "grad_norm": 0.4621533453464508, + "learning_rate": 3.340621038921495e-05, + "loss": 0.2018, + "step": 2781 + }, + { + "epoch": 0.28832003316405846, + "grad_norm": 0.5180187225341797, + "learning_rate": 3.3401227658663555e-05, + "loss": 0.2269, + "step": 2782 + }, + { + "epoch": 0.2884236708467199, + "grad_norm": 0.4518931210041046, + "learning_rate": 3.339624341808266e-05, + "loss": 0.2024, + "step": 2783 + }, + { + "epoch": 0.2885273085293813, + "grad_norm": 0.5057093501091003, + "learning_rate": 3.3391257668033875e-05, + "loss": 0.2671, + "step": 2784 + }, + { + "epoch": 0.2886309462120427, + "grad_norm": 0.45429331064224243, + "learning_rate": 3.3386270409078994e-05, + "loss": 0.2417, + "step": 2785 + }, + { + "epoch": 0.28873458389470413, + "grad_norm": 0.4282505512237549, + "learning_rate": 3.338128164177998e-05, + "loss": 0.1975, + "step": 2786 + }, + { + "epoch": 0.28883822157736555, + "grad_norm": 0.4234720766544342, + "learning_rate": 3.337629136669894e-05, + "loss": 0.186, + "step": 2787 + }, + { + "epoch": 0.28894185926002697, + "grad_norm": 0.5359960794448853, + "learning_rate": 3.337129958439819e-05, + "loss": 0.2675, + "step": 2788 + }, + { + "epoch": 0.2890454969426884, + "grad_norm": 0.47228848934173584, + "learning_rate": 3.3366306295440195e-05, + "loss": 0.2325, + "step": 2789 + }, + { + "epoch": 0.2891491346253498, + "grad_norm": 0.4502311944961548, + "learning_rate": 3.336131150038758e-05, + "loss": 0.2237, + "step": 2790 + }, + { + "epoch": 0.2892527723080112, + "grad_norm": 0.4958683252334595, + "learning_rate": 3.335631519980315e-05, + "loss": 0.248, + "step": 2791 + }, + { + "epoch": 0.28935640999067264, + "grad_norm": 0.47601819038391113, + "learning_rate": 3.335131739424989e-05, + "loss": 0.2319, + "step": 2792 + }, + { + "epoch": 0.289460047673334, + "grad_norm": 0.4568394124507904, + "learning_rate": 3.3346318084290944e-05, + "loss": 0.2319, + "step": 2793 + }, + { + "epoch": 0.2895636853559954, + "grad_norm": 0.4676230847835541, + "learning_rate": 3.334131727048962e-05, + "loss": 0.1982, + "step": 2794 + }, + { + "epoch": 0.28966732303865683, + "grad_norm": 0.537865400314331, + "learning_rate": 3.333631495340941e-05, + "loss": 0.257, + "step": 2795 + }, + { + "epoch": 0.28977096072131825, + "grad_norm": 0.45650193095207214, + "learning_rate": 3.333131113361396e-05, + "loss": 0.2095, + "step": 2796 + }, + { + "epoch": 0.28987459840397967, + "grad_norm": 0.4981147348880768, + "learning_rate": 3.332630581166709e-05, + "loss": 0.2283, + "step": 2797 + }, + { + "epoch": 0.2899782360866411, + "grad_norm": 0.4698350131511688, + "learning_rate": 3.3321298988132804e-05, + "loss": 0.2071, + "step": 2798 + }, + { + "epoch": 0.2900818737693025, + "grad_norm": 0.40696632862091064, + "learning_rate": 3.331629066357526e-05, + "loss": 0.2053, + "step": 2799 + }, + { + "epoch": 0.2901855114519639, + "grad_norm": 0.5765353441238403, + "learning_rate": 3.3311280838558775e-05, + "loss": 0.2077, + "step": 2800 + }, + { + "epoch": 0.29028914913462533, + "grad_norm": 0.41738903522491455, + "learning_rate": 3.3306269513647866e-05, + "loss": 0.2314, + "step": 2801 + }, + { + "epoch": 0.29039278681728675, + "grad_norm": 0.4574735164642334, + "learning_rate": 3.330125668940718e-05, + "loss": 0.2015, + "step": 2802 + }, + { + "epoch": 0.29049642449994817, + "grad_norm": 0.5367337465286255, + "learning_rate": 3.329624236640158e-05, + "loss": 0.2695, + "step": 2803 + }, + { + "epoch": 0.2906000621826096, + "grad_norm": 0.49216893315315247, + "learning_rate": 3.329122654519606e-05, + "loss": 0.2188, + "step": 2804 + }, + { + "epoch": 0.290703699865271, + "grad_norm": 0.48801082372665405, + "learning_rate": 3.328620922635579e-05, + "loss": 0.216, + "step": 2805 + }, + { + "epoch": 0.2908073375479324, + "grad_norm": 0.4400445222854614, + "learning_rate": 3.328119041044611e-05, + "loss": 0.205, + "step": 2806 + }, + { + "epoch": 0.29091097523059384, + "grad_norm": 0.45788559317588806, + "learning_rate": 3.3276170098032554e-05, + "loss": 0.2079, + "step": 2807 + }, + { + "epoch": 0.29101461291325526, + "grad_norm": 0.4800960421562195, + "learning_rate": 3.327114828968079e-05, + "loss": 0.208, + "step": 2808 + }, + { + "epoch": 0.29111825059591667, + "grad_norm": 0.48112618923187256, + "learning_rate": 3.326612498595666e-05, + "loss": 0.2366, + "step": 2809 + }, + { + "epoch": 0.2912218882785781, + "grad_norm": 0.514846920967102, + "learning_rate": 3.326110018742619e-05, + "loss": 0.2296, + "step": 2810 + }, + { + "epoch": 0.2913255259612395, + "grad_norm": 0.534676194190979, + "learning_rate": 3.325607389465557e-05, + "loss": 0.2436, + "step": 2811 + }, + { + "epoch": 0.2914291636439009, + "grad_norm": 0.5063111782073975, + "learning_rate": 3.3251046108211146e-05, + "loss": 0.2508, + "step": 2812 + }, + { + "epoch": 0.29153280132656234, + "grad_norm": 0.4884932041168213, + "learning_rate": 3.324601682865945e-05, + "loss": 0.1963, + "step": 2813 + }, + { + "epoch": 0.29163643900922376, + "grad_norm": 0.4822821617126465, + "learning_rate": 3.324098605656716e-05, + "loss": 0.2566, + "step": 2814 + }, + { + "epoch": 0.2917400766918852, + "grad_norm": 0.5030156970024109, + "learning_rate": 3.323595379250116e-05, + "loss": 0.2319, + "step": 2815 + }, + { + "epoch": 0.2918437143745466, + "grad_norm": 0.5082794427871704, + "learning_rate": 3.3230920037028454e-05, + "loss": 0.2419, + "step": 2816 + }, + { + "epoch": 0.291947352057208, + "grad_norm": 0.5310083627700806, + "learning_rate": 3.322588479071624e-05, + "loss": 0.2438, + "step": 2817 + }, + { + "epoch": 0.2920509897398694, + "grad_norm": 0.5204180479049683, + "learning_rate": 3.3220848054131894e-05, + "loss": 0.2373, + "step": 2818 + }, + { + "epoch": 0.29215462742253084, + "grad_norm": 0.5026124119758606, + "learning_rate": 3.321580982784294e-05, + "loss": 0.2799, + "step": 2819 + }, + { + "epoch": 0.29225826510519226, + "grad_norm": 0.48803022503852844, + "learning_rate": 3.321077011241708e-05, + "loss": 0.2551, + "step": 2820 + }, + { + "epoch": 0.2923619027878537, + "grad_norm": 0.5668957829475403, + "learning_rate": 3.3205728908422185e-05, + "loss": 0.2575, + "step": 2821 + }, + { + "epoch": 0.2924655404705151, + "grad_norm": 0.468801885843277, + "learning_rate": 3.320068621642627e-05, + "loss": 0.2291, + "step": 2822 + }, + { + "epoch": 0.2925691781531765, + "grad_norm": 0.4148104190826416, + "learning_rate": 3.3195642036997565e-05, + "loss": 0.2065, + "step": 2823 + }, + { + "epoch": 0.29267281583583793, + "grad_norm": 0.4796377420425415, + "learning_rate": 3.319059637070443e-05, + "loss": 0.2096, + "step": 2824 + }, + { + "epoch": 0.29277645351849935, + "grad_norm": 0.46540844440460205, + "learning_rate": 3.31855492181154e-05, + "loss": 0.2421, + "step": 2825 + }, + { + "epoch": 0.29288009120116076, + "grad_norm": 0.5077078938484192, + "learning_rate": 3.3180500579799174e-05, + "loss": 0.2703, + "step": 2826 + }, + { + "epoch": 0.2929837288838222, + "grad_norm": 0.4614660441875458, + "learning_rate": 3.317545045632464e-05, + "loss": 0.1994, + "step": 2827 + }, + { + "epoch": 0.2930873665664836, + "grad_norm": 0.46856820583343506, + "learning_rate": 3.3170398848260824e-05, + "loss": 0.2523, + "step": 2828 + }, + { + "epoch": 0.293191004249145, + "grad_norm": 0.40745294094085693, + "learning_rate": 3.316534575617694e-05, + "loss": 0.1807, + "step": 2829 + }, + { + "epoch": 0.29329464193180643, + "grad_norm": 0.47301098704338074, + "learning_rate": 3.316029118064237e-05, + "loss": 0.2208, + "step": 2830 + }, + { + "epoch": 0.2933982796144678, + "grad_norm": 0.49375954270362854, + "learning_rate": 3.3155235122226644e-05, + "loss": 0.2289, + "step": 2831 + }, + { + "epoch": 0.2935019172971292, + "grad_norm": 0.4718109667301178, + "learning_rate": 3.315017758149947e-05, + "loss": 0.2461, + "step": 2832 + }, + { + "epoch": 0.29360555497979063, + "grad_norm": 0.45232123136520386, + "learning_rate": 3.314511855903074e-05, + "loss": 0.1911, + "step": 2833 + }, + { + "epoch": 0.29370919266245205, + "grad_norm": 0.46630412340164185, + "learning_rate": 3.314005805539047e-05, + "loss": 0.2235, + "step": 2834 + }, + { + "epoch": 0.29381283034511346, + "grad_norm": 0.4936429262161255, + "learning_rate": 3.313499607114889e-05, + "loss": 0.2478, + "step": 2835 + }, + { + "epoch": 0.2939164680277749, + "grad_norm": 0.5034680366516113, + "learning_rate": 3.312993260687638e-05, + "loss": 0.2093, + "step": 2836 + }, + { + "epoch": 0.2940201057104363, + "grad_norm": 0.5205070972442627, + "learning_rate": 3.3124867663143465e-05, + "loss": 0.2508, + "step": 2837 + }, + { + "epoch": 0.2941237433930977, + "grad_norm": 0.47330421209335327, + "learning_rate": 3.311980124052087e-05, + "loss": 0.2285, + "step": 2838 + }, + { + "epoch": 0.29422738107575913, + "grad_norm": 0.6019695997238159, + "learning_rate": 3.3114733339579466e-05, + "loss": 0.2709, + "step": 2839 + }, + { + "epoch": 0.29433101875842055, + "grad_norm": 0.543933093547821, + "learning_rate": 3.3109663960890294e-05, + "loss": 0.2559, + "step": 2840 + }, + { + "epoch": 0.29443465644108197, + "grad_norm": 0.5584891438484192, + "learning_rate": 3.3104593105024566e-05, + "loss": 0.2622, + "step": 2841 + }, + { + "epoch": 0.2945382941237434, + "grad_norm": 0.48917511105537415, + "learning_rate": 3.309952077255366e-05, + "loss": 0.2082, + "step": 2842 + }, + { + "epoch": 0.2946419318064048, + "grad_norm": 0.4935635030269623, + "learning_rate": 3.309444696404912e-05, + "loss": 0.2342, + "step": 2843 + }, + { + "epoch": 0.2947455694890662, + "grad_norm": 0.4728579819202423, + "learning_rate": 3.308937168008265e-05, + "loss": 0.2032, + "step": 2844 + }, + { + "epoch": 0.29484920717172763, + "grad_norm": 0.44754189252853394, + "learning_rate": 3.308429492122612e-05, + "loss": 0.1801, + "step": 2845 + }, + { + "epoch": 0.29495284485438905, + "grad_norm": 0.508368730545044, + "learning_rate": 3.307921668805158e-05, + "loss": 0.1915, + "step": 2846 + }, + { + "epoch": 0.29505648253705047, + "grad_norm": 0.5484712719917297, + "learning_rate": 3.307413698113125e-05, + "loss": 0.2658, + "step": 2847 + }, + { + "epoch": 0.2951601202197119, + "grad_norm": 0.4943729043006897, + "learning_rate": 3.306905580103747e-05, + "loss": 0.2126, + "step": 2848 + }, + { + "epoch": 0.2952637579023733, + "grad_norm": 0.4991382360458374, + "learning_rate": 3.306397314834281e-05, + "loss": 0.2014, + "step": 2849 + }, + { + "epoch": 0.2953673955850347, + "grad_norm": 0.4928804636001587, + "learning_rate": 3.305888902361996e-05, + "loss": 0.2488, + "step": 2850 + }, + { + "epoch": 0.29547103326769614, + "grad_norm": 0.5445026159286499, + "learning_rate": 3.3053803427441794e-05, + "loss": 0.2528, + "step": 2851 + }, + { + "epoch": 0.29557467095035755, + "grad_norm": 0.3906521797180176, + "learning_rate": 3.304871636038136e-05, + "loss": 0.1731, + "step": 2852 + }, + { + "epoch": 0.29567830863301897, + "grad_norm": 0.45666688680648804, + "learning_rate": 3.3043627823011835e-05, + "loss": 0.2324, + "step": 2853 + }, + { + "epoch": 0.2957819463156804, + "grad_norm": 0.39771783351898193, + "learning_rate": 3.3038537815906614e-05, + "loss": 0.1834, + "step": 2854 + }, + { + "epoch": 0.2958855839983418, + "grad_norm": 0.4375212788581848, + "learning_rate": 3.303344633963922e-05, + "loss": 0.1887, + "step": 2855 + }, + { + "epoch": 0.2959892216810032, + "grad_norm": 0.5007089376449585, + "learning_rate": 3.3028353394783356e-05, + "loss": 0.2079, + "step": 2856 + }, + { + "epoch": 0.29609285936366464, + "grad_norm": 0.4184623956680298, + "learning_rate": 3.302325898191287e-05, + "loss": 0.1885, + "step": 2857 + }, + { + "epoch": 0.29619649704632606, + "grad_norm": 0.4855022430419922, + "learning_rate": 3.3018163101601826e-05, + "loss": 0.2266, + "step": 2858 + }, + { + "epoch": 0.2963001347289875, + "grad_norm": 0.38218221068382263, + "learning_rate": 3.301306575442439e-05, + "loss": 0.1572, + "step": 2859 + }, + { + "epoch": 0.2964037724116489, + "grad_norm": 0.5013300180435181, + "learning_rate": 3.3007966940954935e-05, + "loss": 0.2348, + "step": 2860 + }, + { + "epoch": 0.2965074100943103, + "grad_norm": 0.4487498104572296, + "learning_rate": 3.300286666176799e-05, + "loss": 0.2037, + "step": 2861 + }, + { + "epoch": 0.2966110477769717, + "grad_norm": 0.49756789207458496, + "learning_rate": 3.2997764917438244e-05, + "loss": 0.2194, + "step": 2862 + }, + { + "epoch": 0.29671468545963314, + "grad_norm": 0.4958988428115845, + "learning_rate": 3.299266170854055e-05, + "loss": 0.1784, + "step": 2863 + }, + { + "epoch": 0.29681832314229456, + "grad_norm": 0.5324316620826721, + "learning_rate": 3.298755703564993e-05, + "loss": 0.2172, + "step": 2864 + }, + { + "epoch": 0.296921960824956, + "grad_norm": 0.4497959017753601, + "learning_rate": 3.298245089934158e-05, + "loss": 0.1974, + "step": 2865 + }, + { + "epoch": 0.2970255985076174, + "grad_norm": 0.4527725577354431, + "learning_rate": 3.297734330019083e-05, + "loss": 0.1927, + "step": 2866 + }, + { + "epoch": 0.2971292361902788, + "grad_norm": 0.49358245730400085, + "learning_rate": 3.2972234238773216e-05, + "loss": 0.2326, + "step": 2867 + }, + { + "epoch": 0.29723287387294023, + "grad_norm": 0.4673818349838257, + "learning_rate": 3.296712371566442e-05, + "loss": 0.2101, + "step": 2868 + }, + { + "epoch": 0.2973365115556016, + "grad_norm": 0.8467883467674255, + "learning_rate": 3.296201173144028e-05, + "loss": 0.2568, + "step": 2869 + }, + { + "epoch": 0.297440149238263, + "grad_norm": 0.5352072715759277, + "learning_rate": 3.295689828667681e-05, + "loss": 0.2206, + "step": 2870 + }, + { + "epoch": 0.2975437869209244, + "grad_norm": 0.5007523894309998, + "learning_rate": 3.2951783381950174e-05, + "loss": 0.2206, + "step": 2871 + }, + { + "epoch": 0.29764742460358584, + "grad_norm": 0.515024721622467, + "learning_rate": 3.294666701783673e-05, + "loss": 0.2372, + "step": 2872 + }, + { + "epoch": 0.29775106228624726, + "grad_norm": 0.5650159120559692, + "learning_rate": 3.2941549194912964e-05, + "loss": 0.2429, + "step": 2873 + }, + { + "epoch": 0.2978546999689087, + "grad_norm": 0.33148810267448425, + "learning_rate": 3.293642991375556e-05, + "loss": 0.1549, + "step": 2874 + }, + { + "epoch": 0.2979583376515701, + "grad_norm": 0.6188404560089111, + "learning_rate": 3.293130917494134e-05, + "loss": 0.2466, + "step": 2875 + }, + { + "epoch": 0.2980619753342315, + "grad_norm": 0.451937735080719, + "learning_rate": 3.29261869790473e-05, + "loss": 0.184, + "step": 2876 + }, + { + "epoch": 0.2981656130168929, + "grad_norm": 0.47771209478378296, + "learning_rate": 3.292106332665061e-05, + "loss": 0.2318, + "step": 2877 + }, + { + "epoch": 0.29826925069955434, + "grad_norm": 0.4931339919567108, + "learning_rate": 3.291593821832859e-05, + "loss": 0.2409, + "step": 2878 + }, + { + "epoch": 0.29837288838221576, + "grad_norm": 0.44261494278907776, + "learning_rate": 3.2910811654658734e-05, + "loss": 0.2072, + "step": 2879 + }, + { + "epoch": 0.2984765260648772, + "grad_norm": 0.5541775226593018, + "learning_rate": 3.2905683636218684e-05, + "loss": 0.2444, + "step": 2880 + }, + { + "epoch": 0.2985801637475386, + "grad_norm": 0.46059805154800415, + "learning_rate": 3.290055416358627e-05, + "loss": 0.2344, + "step": 2881 + }, + { + "epoch": 0.2986838014302, + "grad_norm": 0.47794246673583984, + "learning_rate": 3.2895423237339465e-05, + "loss": 0.2564, + "step": 2882 + }, + { + "epoch": 0.29878743911286143, + "grad_norm": 0.46349892020225525, + "learning_rate": 3.2890290858056415e-05, + "loss": 0.2233, + "step": 2883 + }, + { + "epoch": 0.29889107679552285, + "grad_norm": 0.40743401646614075, + "learning_rate": 3.288515702631543e-05, + "loss": 0.2031, + "step": 2884 + }, + { + "epoch": 0.29899471447818426, + "grad_norm": 0.5463640689849854, + "learning_rate": 3.288002174269498e-05, + "loss": 0.2295, + "step": 2885 + }, + { + "epoch": 0.2990983521608457, + "grad_norm": 0.46196120977401733, + "learning_rate": 3.28748850077737e-05, + "loss": 0.2328, + "step": 2886 + }, + { + "epoch": 0.2992019898435071, + "grad_norm": 0.48436087369918823, + "learning_rate": 3.28697468221304e-05, + "loss": 0.2364, + "step": 2887 + }, + { + "epoch": 0.2993056275261685, + "grad_norm": 0.49515339732170105, + "learning_rate": 3.2864607186344026e-05, + "loss": 0.2263, + "step": 2888 + }, + { + "epoch": 0.29940926520882993, + "grad_norm": 0.49403858184814453, + "learning_rate": 3.2859466100993723e-05, + "loss": 0.2266, + "step": 2889 + }, + { + "epoch": 0.29951290289149135, + "grad_norm": 0.47844377160072327, + "learning_rate": 3.2854323566658765e-05, + "loss": 0.2178, + "step": 2890 + }, + { + "epoch": 0.29961654057415277, + "grad_norm": 0.5542422533035278, + "learning_rate": 3.28491795839186e-05, + "loss": 0.24, + "step": 2891 + }, + { + "epoch": 0.2997201782568142, + "grad_norm": 0.48089051246643066, + "learning_rate": 3.284403415335287e-05, + "loss": 0.2399, + "step": 2892 + }, + { + "epoch": 0.2998238159394756, + "grad_norm": 0.44947952032089233, + "learning_rate": 3.283888727554133e-05, + "loss": 0.2078, + "step": 2893 + }, + { + "epoch": 0.299927453622137, + "grad_norm": 0.47643810510635376, + "learning_rate": 3.283373895106393e-05, + "loss": 0.225, + "step": 2894 + }, + { + "epoch": 0.30003109130479844, + "grad_norm": 0.4375708997249603, + "learning_rate": 3.282858918050078e-05, + "loss": 0.2326, + "step": 2895 + }, + { + "epoch": 0.30013472898745985, + "grad_norm": 0.42022502422332764, + "learning_rate": 3.282343796443214e-05, + "loss": 0.2018, + "step": 2896 + }, + { + "epoch": 0.30023836667012127, + "grad_norm": 0.47073331475257874, + "learning_rate": 3.2818285303438436e-05, + "loss": 0.2165, + "step": 2897 + }, + { + "epoch": 0.3003420043527827, + "grad_norm": 0.41679564118385315, + "learning_rate": 3.281313119810028e-05, + "loss": 0.2027, + "step": 2898 + }, + { + "epoch": 0.3004456420354441, + "grad_norm": 0.459945410490036, + "learning_rate": 3.2807975648998426e-05, + "loss": 0.2193, + "step": 2899 + }, + { + "epoch": 0.3005492797181055, + "grad_norm": 0.5290501713752747, + "learning_rate": 3.280281865671378e-05, + "loss": 0.2539, + "step": 2900 + }, + { + "epoch": 0.30065291740076694, + "grad_norm": 0.45214882493019104, + "learning_rate": 3.279766022182742e-05, + "loss": 0.1958, + "step": 2901 + }, + { + "epoch": 0.30075655508342836, + "grad_norm": 0.48252326250076294, + "learning_rate": 3.279250034492061e-05, + "loss": 0.2137, + "step": 2902 + }, + { + "epoch": 0.3008601927660898, + "grad_norm": 0.47179168462753296, + "learning_rate": 3.278733902657475e-05, + "loss": 0.2163, + "step": 2903 + }, + { + "epoch": 0.3009638304487512, + "grad_norm": 0.49783962965011597, + "learning_rate": 3.2782176267371405e-05, + "loss": 0.2305, + "step": 2904 + }, + { + "epoch": 0.3010674681314126, + "grad_norm": 0.5070213079452515, + "learning_rate": 3.277701206789231e-05, + "loss": 0.2238, + "step": 2905 + }, + { + "epoch": 0.301171105814074, + "grad_norm": 0.534799337387085, + "learning_rate": 3.2771846428719346e-05, + "loss": 0.2196, + "step": 2906 + }, + { + "epoch": 0.3012747434967354, + "grad_norm": 0.519961953163147, + "learning_rate": 3.276667935043459e-05, + "loss": 0.2191, + "step": 2907 + }, + { + "epoch": 0.3013783811793968, + "grad_norm": 0.48025771975517273, + "learning_rate": 3.276151083362025e-05, + "loss": 0.2283, + "step": 2908 + }, + { + "epoch": 0.3014820188620582, + "grad_norm": 0.4781363606452942, + "learning_rate": 3.2756340878858705e-05, + "loss": 0.2372, + "step": 2909 + }, + { + "epoch": 0.30158565654471964, + "grad_norm": 0.5782521367073059, + "learning_rate": 3.27511694867325e-05, + "loss": 0.2735, + "step": 2910 + }, + { + "epoch": 0.30168929422738106, + "grad_norm": 0.47211775183677673, + "learning_rate": 3.2745996657824344e-05, + "loss": 0.2132, + "step": 2911 + }, + { + "epoch": 0.3017929319100425, + "grad_norm": 0.5187753438949585, + "learning_rate": 3.27408223927171e-05, + "loss": 0.2434, + "step": 2912 + }, + { + "epoch": 0.3018965695927039, + "grad_norm": 0.4415639042854309, + "learning_rate": 3.273564669199379e-05, + "loss": 0.2159, + "step": 2913 + }, + { + "epoch": 0.3020002072753653, + "grad_norm": 0.3430590033531189, + "learning_rate": 3.273046955623761e-05, + "loss": 0.1583, + "step": 2914 + }, + { + "epoch": 0.3021038449580267, + "grad_norm": 0.5086635947227478, + "learning_rate": 3.272529098603191e-05, + "loss": 0.2336, + "step": 2915 + }, + { + "epoch": 0.30220748264068814, + "grad_norm": 0.5063747763633728, + "learning_rate": 3.272011098196019e-05, + "loss": 0.2242, + "step": 2916 + }, + { + "epoch": 0.30231112032334956, + "grad_norm": 0.4871509075164795, + "learning_rate": 3.271492954460616e-05, + "loss": 0.2192, + "step": 2917 + }, + { + "epoch": 0.302414758006011, + "grad_norm": 0.49528932571411133, + "learning_rate": 3.270974667455363e-05, + "loss": 0.2385, + "step": 2918 + }, + { + "epoch": 0.3025183956886724, + "grad_norm": 0.5143001675605774, + "learning_rate": 3.27045623723866e-05, + "loss": 0.2266, + "step": 2919 + }, + { + "epoch": 0.3026220333713338, + "grad_norm": 0.4750981032848358, + "learning_rate": 3.269937663868923e-05, + "loss": 0.2625, + "step": 2920 + }, + { + "epoch": 0.3027256710539952, + "grad_norm": 0.4782578945159912, + "learning_rate": 3.269418947404584e-05, + "loss": 0.2223, + "step": 2921 + }, + { + "epoch": 0.30282930873665664, + "grad_norm": 0.5503315329551697, + "learning_rate": 3.268900087904092e-05, + "loss": 0.2176, + "step": 2922 + }, + { + "epoch": 0.30293294641931806, + "grad_norm": 0.447047621011734, + "learning_rate": 3.26838108542591e-05, + "loss": 0.201, + "step": 2923 + }, + { + "epoch": 0.3030365841019795, + "grad_norm": 0.46707892417907715, + "learning_rate": 3.2678619400285194e-05, + "loss": 0.2134, + "step": 2924 + }, + { + "epoch": 0.3031402217846409, + "grad_norm": 0.42201143503189087, + "learning_rate": 3.267342651770416e-05, + "loss": 0.1949, + "step": 2925 + }, + { + "epoch": 0.3032438594673023, + "grad_norm": 0.4469546675682068, + "learning_rate": 3.266823220710113e-05, + "loss": 0.2177, + "step": 2926 + }, + { + "epoch": 0.30334749714996373, + "grad_norm": 0.5175836086273193, + "learning_rate": 3.266303646906138e-05, + "loss": 0.2585, + "step": 2927 + }, + { + "epoch": 0.30345113483262515, + "grad_norm": 0.44420644640922546, + "learning_rate": 3.2657839304170376e-05, + "loss": 0.2004, + "step": 2928 + }, + { + "epoch": 0.30355477251528656, + "grad_norm": 0.589979350566864, + "learning_rate": 3.2652640713013716e-05, + "loss": 0.2362, + "step": 2929 + }, + { + "epoch": 0.303658410197948, + "grad_norm": 0.4705340266227722, + "learning_rate": 3.264744069617716e-05, + "loss": 0.2222, + "step": 2930 + }, + { + "epoch": 0.3037620478806094, + "grad_norm": 0.47983983159065247, + "learning_rate": 3.2642239254246654e-05, + "loss": 0.2291, + "step": 2931 + }, + { + "epoch": 0.3038656855632708, + "grad_norm": 0.3722575306892395, + "learning_rate": 3.263703638780828e-05, + "loss": 0.1571, + "step": 2932 + }, + { + "epoch": 0.30396932324593223, + "grad_norm": 0.5379013419151306, + "learning_rate": 3.26318320974483e-05, + "loss": 0.2528, + "step": 2933 + }, + { + "epoch": 0.30407296092859365, + "grad_norm": 0.5010048151016235, + "learning_rate": 3.2626626383753096e-05, + "loss": 0.234, + "step": 2934 + }, + { + "epoch": 0.30417659861125507, + "grad_norm": 0.4466943144798279, + "learning_rate": 3.262141924730928e-05, + "loss": 0.2191, + "step": 2935 + }, + { + "epoch": 0.3042802362939165, + "grad_norm": 0.45711278915405273, + "learning_rate": 3.261621068870355e-05, + "loss": 0.2232, + "step": 2936 + }, + { + "epoch": 0.3043838739765779, + "grad_norm": 0.4415625333786011, + "learning_rate": 3.2611000708522816e-05, + "loss": 0.1871, + "step": 2937 + }, + { + "epoch": 0.3044875116592393, + "grad_norm": 0.505708634853363, + "learning_rate": 3.260578930735413e-05, + "loss": 0.2555, + "step": 2938 + }, + { + "epoch": 0.30459114934190074, + "grad_norm": 0.4457675516605377, + "learning_rate": 3.26005764857847e-05, + "loss": 0.2082, + "step": 2939 + }, + { + "epoch": 0.30469478702456215, + "grad_norm": 0.4445909857749939, + "learning_rate": 3.259536224440189e-05, + "loss": 0.1989, + "step": 2940 + }, + { + "epoch": 0.30479842470722357, + "grad_norm": 0.5568691492080688, + "learning_rate": 3.259014658379325e-05, + "loss": 0.2802, + "step": 2941 + }, + { + "epoch": 0.304902062389885, + "grad_norm": 0.47887614369392395, + "learning_rate": 3.258492950454647e-05, + "loss": 0.2461, + "step": 2942 + }, + { + "epoch": 0.3050057000725464, + "grad_norm": 0.4918515384197235, + "learning_rate": 3.257971100724939e-05, + "loss": 0.234, + "step": 2943 + }, + { + "epoch": 0.3051093377552078, + "grad_norm": 0.5362921953201294, + "learning_rate": 3.2574491092490035e-05, + "loss": 0.2481, + "step": 2944 + }, + { + "epoch": 0.3052129754378692, + "grad_norm": 0.6173157095909119, + "learning_rate": 3.256926976085656e-05, + "loss": 0.2721, + "step": 2945 + }, + { + "epoch": 0.3053166131205306, + "grad_norm": 0.47653764486312866, + "learning_rate": 3.2564047012937314e-05, + "loss": 0.2253, + "step": 2946 + }, + { + "epoch": 0.305420250803192, + "grad_norm": 0.4073546826839447, + "learning_rate": 3.255882284932078e-05, + "loss": 0.1797, + "step": 2947 + }, + { + "epoch": 0.30552388848585343, + "grad_norm": 0.4936966598033905, + "learning_rate": 3.2553597270595617e-05, + "loss": 0.2383, + "step": 2948 + }, + { + "epoch": 0.30562752616851485, + "grad_norm": 0.4709612727165222, + "learning_rate": 3.2548370277350625e-05, + "loss": 0.2253, + "step": 2949 + }, + { + "epoch": 0.30573116385117627, + "grad_norm": 0.46805763244628906, + "learning_rate": 3.254314187017477e-05, + "loss": 0.1926, + "step": 2950 + }, + { + "epoch": 0.3058348015338377, + "grad_norm": 0.4419136047363281, + "learning_rate": 3.2537912049657197e-05, + "loss": 0.1716, + "step": 2951 + }, + { + "epoch": 0.3059384392164991, + "grad_norm": 0.4995832145214081, + "learning_rate": 3.253268081638718e-05, + "loss": 0.2273, + "step": 2952 + }, + { + "epoch": 0.3060420768991605, + "grad_norm": 0.47150254249572754, + "learning_rate": 3.2527448170954174e-05, + "loss": 0.239, + "step": 2953 + }, + { + "epoch": 0.30614571458182194, + "grad_norm": 0.48097914457321167, + "learning_rate": 3.2522214113947775e-05, + "loss": 0.2356, + "step": 2954 + }, + { + "epoch": 0.30624935226448335, + "grad_norm": 0.48609909415245056, + "learning_rate": 3.251697864595777e-05, + "loss": 0.2218, + "step": 2955 + }, + { + "epoch": 0.30635298994714477, + "grad_norm": 0.5264389514923096, + "learning_rate": 3.2511741767574055e-05, + "loss": 0.2461, + "step": 2956 + }, + { + "epoch": 0.3064566276298062, + "grad_norm": 0.46607768535614014, + "learning_rate": 3.250650347938673e-05, + "loss": 0.1884, + "step": 2957 + }, + { + "epoch": 0.3065602653124676, + "grad_norm": 0.45885488390922546, + "learning_rate": 3.250126378198604e-05, + "loss": 0.2234, + "step": 2958 + }, + { + "epoch": 0.306663902995129, + "grad_norm": 0.5328109860420227, + "learning_rate": 3.249602267596238e-05, + "loss": 0.2408, + "step": 2959 + }, + { + "epoch": 0.30676754067779044, + "grad_norm": 0.5008374452590942, + "learning_rate": 3.249078016190631e-05, + "loss": 0.2543, + "step": 2960 + }, + { + "epoch": 0.30687117836045186, + "grad_norm": 0.5086766481399536, + "learning_rate": 3.248553624040855e-05, + "loss": 0.2036, + "step": 2961 + }, + { + "epoch": 0.3069748160431133, + "grad_norm": 0.5206964015960693, + "learning_rate": 3.248029091205997e-05, + "loss": 0.1785, + "step": 2962 + }, + { + "epoch": 0.3070784537257747, + "grad_norm": 0.4729783535003662, + "learning_rate": 3.247504417745162e-05, + "loss": 0.2269, + "step": 2963 + }, + { + "epoch": 0.3071820914084361, + "grad_norm": 0.5227089524269104, + "learning_rate": 3.246979603717467e-05, + "loss": 0.2451, + "step": 2964 + }, + { + "epoch": 0.3072857290910975, + "grad_norm": 0.4764210879802704, + "learning_rate": 3.24645464918205e-05, + "loss": 0.2012, + "step": 2965 + }, + { + "epoch": 0.30738936677375894, + "grad_norm": 0.4624175429344177, + "learning_rate": 3.245929554198061e-05, + "loss": 0.228, + "step": 2966 + }, + { + "epoch": 0.30749300445642036, + "grad_norm": 0.4398234784603119, + "learning_rate": 3.245404318824665e-05, + "loss": 0.2152, + "step": 2967 + }, + { + "epoch": 0.3075966421390818, + "grad_norm": 0.6094075441360474, + "learning_rate": 3.2448789431210484e-05, + "loss": 0.2587, + "step": 2968 + }, + { + "epoch": 0.3077002798217432, + "grad_norm": 0.5074710249900818, + "learning_rate": 3.2443534271464066e-05, + "loss": 0.2235, + "step": 2969 + }, + { + "epoch": 0.3078039175044046, + "grad_norm": 0.5189184546470642, + "learning_rate": 3.2438277709599556e-05, + "loss": 0.225, + "step": 2970 + }, + { + "epoch": 0.30790755518706603, + "grad_norm": 0.458265095949173, + "learning_rate": 3.243301974620924e-05, + "loss": 0.2228, + "step": 2971 + }, + { + "epoch": 0.30801119286972745, + "grad_norm": 0.47438445687294006, + "learning_rate": 3.242776038188559e-05, + "loss": 0.2019, + "step": 2972 + }, + { + "epoch": 0.30811483055238886, + "grad_norm": 0.4507431387901306, + "learning_rate": 3.242249961722122e-05, + "loss": 0.1969, + "step": 2973 + }, + { + "epoch": 0.3082184682350503, + "grad_norm": 0.45547452569007874, + "learning_rate": 3.2417237452808906e-05, + "loss": 0.2172, + "step": 2974 + }, + { + "epoch": 0.3083221059177117, + "grad_norm": 0.4992049038410187, + "learning_rate": 3.2411973889241575e-05, + "loss": 0.2085, + "step": 2975 + }, + { + "epoch": 0.3084257436003731, + "grad_norm": 0.5008047223091125, + "learning_rate": 3.240670892711233e-05, + "loss": 0.2157, + "step": 2976 + }, + { + "epoch": 0.30852938128303453, + "grad_norm": 0.5114787817001343, + "learning_rate": 3.24014425670144e-05, + "loss": 0.2332, + "step": 2977 + }, + { + "epoch": 0.30863301896569595, + "grad_norm": 0.5216313600540161, + "learning_rate": 3.2396174809541204e-05, + "loss": 0.2363, + "step": 2978 + }, + { + "epoch": 0.30873665664835737, + "grad_norm": 0.45231184363365173, + "learning_rate": 3.23909056552863e-05, + "loss": 0.1808, + "step": 2979 + }, + { + "epoch": 0.3088402943310188, + "grad_norm": 0.4798497259616852, + "learning_rate": 3.238563510484341e-05, + "loss": 0.2297, + "step": 2980 + }, + { + "epoch": 0.3089439320136802, + "grad_norm": 0.5364487767219543, + "learning_rate": 3.2380363158806404e-05, + "loss": 0.2599, + "step": 2981 + }, + { + "epoch": 0.3090475696963416, + "grad_norm": 0.5060192346572876, + "learning_rate": 3.237508981776933e-05, + "loss": 0.2063, + "step": 2982 + }, + { + "epoch": 0.309151207379003, + "grad_norm": 0.4609908163547516, + "learning_rate": 3.2369815082326375e-05, + "loss": 0.2181, + "step": 2983 + }, + { + "epoch": 0.3092548450616644, + "grad_norm": 0.4750822186470032, + "learning_rate": 3.236453895307188e-05, + "loss": 0.224, + "step": 2984 + }, + { + "epoch": 0.3093584827443258, + "grad_norm": 0.5447022914886475, + "learning_rate": 3.235926143060036e-05, + "loss": 0.2667, + "step": 2985 + }, + { + "epoch": 0.30946212042698723, + "grad_norm": 0.4615114629268646, + "learning_rate": 3.2353982515506474e-05, + "loss": 0.2105, + "step": 2986 + }, + { + "epoch": 0.30956575810964865, + "grad_norm": 0.5125647783279419, + "learning_rate": 3.234870220838504e-05, + "loss": 0.2268, + "step": 2987 + }, + { + "epoch": 0.30966939579231006, + "grad_norm": 0.42592471837997437, + "learning_rate": 3.234342050983104e-05, + "loss": 0.1967, + "step": 2988 + }, + { + "epoch": 0.3097730334749715, + "grad_norm": 0.4169655442237854, + "learning_rate": 3.2338137420439605e-05, + "loss": 0.1783, + "step": 2989 + }, + { + "epoch": 0.3098766711576329, + "grad_norm": 0.4965342879295349, + "learning_rate": 3.2332852940806026e-05, + "loss": 0.2288, + "step": 2990 + }, + { + "epoch": 0.3099803088402943, + "grad_norm": 0.5323944091796875, + "learning_rate": 3.232756707152575e-05, + "loss": 0.2666, + "step": 2991 + }, + { + "epoch": 0.31008394652295573, + "grad_norm": 0.44411584734916687, + "learning_rate": 3.232227981319438e-05, + "loss": 0.1984, + "step": 2992 + }, + { + "epoch": 0.31018758420561715, + "grad_norm": 0.4794510006904602, + "learning_rate": 3.231699116640768e-05, + "loss": 0.2346, + "step": 2993 + }, + { + "epoch": 0.31029122188827857, + "grad_norm": 0.5558497309684753, + "learning_rate": 3.2311701131761545e-05, + "loss": 0.2594, + "step": 2994 + }, + { + "epoch": 0.31039485957094, + "grad_norm": 0.5508059859275818, + "learning_rate": 3.230640970985208e-05, + "loss": 0.2549, + "step": 2995 + }, + { + "epoch": 0.3104984972536014, + "grad_norm": 0.47948360443115234, + "learning_rate": 3.23011169012755e-05, + "loss": 0.2273, + "step": 2996 + }, + { + "epoch": 0.3106021349362628, + "grad_norm": 0.4396754801273346, + "learning_rate": 3.229582270662819e-05, + "loss": 0.2065, + "step": 2997 + }, + { + "epoch": 0.31070577261892424, + "grad_norm": 0.5111278891563416, + "learning_rate": 3.229052712650669e-05, + "loss": 0.2326, + "step": 2998 + }, + { + "epoch": 0.31080941030158565, + "grad_norm": 0.536485493183136, + "learning_rate": 3.228523016150769e-05, + "loss": 0.2356, + "step": 2999 + }, + { + "epoch": 0.31091304798424707, + "grad_norm": 0.4807487726211548, + "learning_rate": 3.2279931812228066e-05, + "loss": 0.1892, + "step": 3000 + }, + { + "epoch": 0.3110166856669085, + "grad_norm": 0.476895272731781, + "learning_rate": 3.2274632079264806e-05, + "loss": 0.2202, + "step": 3001 + }, + { + "epoch": 0.3111203233495699, + "grad_norm": 0.5374165773391724, + "learning_rate": 3.226933096321509e-05, + "loss": 0.2163, + "step": 3002 + }, + { + "epoch": 0.3112239610322313, + "grad_norm": 0.4013517200946808, + "learning_rate": 3.2264028464676235e-05, + "loss": 0.1641, + "step": 3003 + }, + { + "epoch": 0.31132759871489274, + "grad_norm": 0.45430639386177063, + "learning_rate": 3.2258724584245714e-05, + "loss": 0.2065, + "step": 3004 + }, + { + "epoch": 0.31143123639755416, + "grad_norm": 0.5279176831245422, + "learning_rate": 3.225341932252117e-05, + "loss": 0.2276, + "step": 3005 + }, + { + "epoch": 0.3115348740802156, + "grad_norm": 0.433173805475235, + "learning_rate": 3.224811268010037e-05, + "loss": 0.2105, + "step": 3006 + }, + { + "epoch": 0.311638511762877, + "grad_norm": 0.4613426923751831, + "learning_rate": 3.224280465758129e-05, + "loss": 0.2328, + "step": 3007 + }, + { + "epoch": 0.3117421494455384, + "grad_norm": 0.5462455153465271, + "learning_rate": 3.2237495255562e-05, + "loss": 0.2742, + "step": 3008 + }, + { + "epoch": 0.3118457871281998, + "grad_norm": 0.45191264152526855, + "learning_rate": 3.223218447464078e-05, + "loss": 0.2205, + "step": 3009 + }, + { + "epoch": 0.31194942481086124, + "grad_norm": 0.42294684052467346, + "learning_rate": 3.222687231541602e-05, + "loss": 0.1812, + "step": 3010 + }, + { + "epoch": 0.31205306249352266, + "grad_norm": 0.4990582764148712, + "learning_rate": 3.2221558778486306e-05, + "loss": 0.2395, + "step": 3011 + }, + { + "epoch": 0.3121567001761841, + "grad_norm": 0.4808881878852844, + "learning_rate": 3.221624386445034e-05, + "loss": 0.2356, + "step": 3012 + }, + { + "epoch": 0.3122603378588455, + "grad_norm": 0.5097794532775879, + "learning_rate": 3.2210927573907005e-05, + "loss": 0.2568, + "step": 3013 + }, + { + "epoch": 0.3123639755415069, + "grad_norm": 0.47635650634765625, + "learning_rate": 3.220560990745533e-05, + "loss": 0.2498, + "step": 3014 + }, + { + "epoch": 0.31246761322416833, + "grad_norm": 0.5116894841194153, + "learning_rate": 3.220029086569451e-05, + "loss": 0.2193, + "step": 3015 + }, + { + "epoch": 0.31257125090682975, + "grad_norm": 0.43708619475364685, + "learning_rate": 3.219497044922387e-05, + "loss": 0.2245, + "step": 3016 + }, + { + "epoch": 0.31267488858949116, + "grad_norm": 0.4694969058036804, + "learning_rate": 3.218964865864293e-05, + "loss": 0.2264, + "step": 3017 + }, + { + "epoch": 0.3127785262721526, + "grad_norm": 0.5043584108352661, + "learning_rate": 3.2184325494551324e-05, + "loss": 0.2223, + "step": 3018 + }, + { + "epoch": 0.312882163954814, + "grad_norm": 0.5066149830818176, + "learning_rate": 3.2179000957548864e-05, + "loss": 0.2381, + "step": 3019 + }, + { + "epoch": 0.3129858016374754, + "grad_norm": 0.46746236085891724, + "learning_rate": 3.217367504823551e-05, + "loss": 0.2025, + "step": 3020 + }, + { + "epoch": 0.3130894393201368, + "grad_norm": 0.5089457631111145, + "learning_rate": 3.216834776721137e-05, + "loss": 0.2606, + "step": 3021 + }, + { + "epoch": 0.3131930770027982, + "grad_norm": 0.4705091714859009, + "learning_rate": 3.2163019115076726e-05, + "loss": 0.2394, + "step": 3022 + }, + { + "epoch": 0.3132967146854596, + "grad_norm": 0.46200108528137207, + "learning_rate": 3.215768909243199e-05, + "loss": 0.2027, + "step": 3023 + }, + { + "epoch": 0.313400352368121, + "grad_norm": 0.5282918214797974, + "learning_rate": 3.215235769987775e-05, + "loss": 0.2104, + "step": 3024 + }, + { + "epoch": 0.31350399005078244, + "grad_norm": 0.4366593360900879, + "learning_rate": 3.2147024938014736e-05, + "loss": 0.2224, + "step": 3025 + }, + { + "epoch": 0.31360762773344386, + "grad_norm": 0.5153645277023315, + "learning_rate": 3.214169080744383e-05, + "loss": 0.2207, + "step": 3026 + }, + { + "epoch": 0.3137112654161053, + "grad_norm": 0.5471642017364502, + "learning_rate": 3.2136355308766084e-05, + "loss": 0.2624, + "step": 3027 + }, + { + "epoch": 0.3138149030987667, + "grad_norm": 0.5268349051475525, + "learning_rate": 3.213101844258269e-05, + "loss": 0.2451, + "step": 3028 + }, + { + "epoch": 0.3139185407814281, + "grad_norm": 0.43142902851104736, + "learning_rate": 3.2125680209494994e-05, + "loss": 0.2066, + "step": 3029 + }, + { + "epoch": 0.31402217846408953, + "grad_norm": 0.5392974019050598, + "learning_rate": 3.21203406101045e-05, + "loss": 0.2476, + "step": 3030 + }, + { + "epoch": 0.31412581614675095, + "grad_norm": 0.46632081270217896, + "learning_rate": 3.211499964501286e-05, + "loss": 0.2199, + "step": 3031 + }, + { + "epoch": 0.31422945382941236, + "grad_norm": 0.42282339930534363, + "learning_rate": 3.21096573148219e-05, + "loss": 0.1972, + "step": 3032 + }, + { + "epoch": 0.3143330915120738, + "grad_norm": 0.4383908212184906, + "learning_rate": 3.210431362013358e-05, + "loss": 0.1942, + "step": 3033 + }, + { + "epoch": 0.3144367291947352, + "grad_norm": 0.4615064263343811, + "learning_rate": 3.2098968561550024e-05, + "loss": 0.186, + "step": 3034 + }, + { + "epoch": 0.3145403668773966, + "grad_norm": 0.4781564474105835, + "learning_rate": 3.209362213967349e-05, + "loss": 0.1935, + "step": 3035 + }, + { + "epoch": 0.31464400456005803, + "grad_norm": 0.48859652876853943, + "learning_rate": 3.208827435510642e-05, + "loss": 0.2017, + "step": 3036 + }, + { + "epoch": 0.31474764224271945, + "grad_norm": 0.5409640669822693, + "learning_rate": 3.208292520845138e-05, + "loss": 0.2363, + "step": 3037 + }, + { + "epoch": 0.31485127992538087, + "grad_norm": 0.5108470320701599, + "learning_rate": 3.2077574700311115e-05, + "loss": 0.2529, + "step": 3038 + }, + { + "epoch": 0.3149549176080423, + "grad_norm": 0.48213520646095276, + "learning_rate": 3.20722228312885e-05, + "loss": 0.1981, + "step": 3039 + }, + { + "epoch": 0.3150585552907037, + "grad_norm": 0.45971915125846863, + "learning_rate": 3.206686960198659e-05, + "loss": 0.2123, + "step": 3040 + }, + { + "epoch": 0.3151621929733651, + "grad_norm": 0.5029280185699463, + "learning_rate": 3.206151501300857e-05, + "loss": 0.2282, + "step": 3041 + }, + { + "epoch": 0.31526583065602654, + "grad_norm": 0.4633747935295105, + "learning_rate": 3.205615906495779e-05, + "loss": 0.2132, + "step": 3042 + }, + { + "epoch": 0.31536946833868795, + "grad_norm": 0.49723488092422485, + "learning_rate": 3.2050801758437744e-05, + "loss": 0.2267, + "step": 3043 + }, + { + "epoch": 0.31547310602134937, + "grad_norm": 0.4490548372268677, + "learning_rate": 3.204544309405209e-05, + "loss": 0.2058, + "step": 3044 + }, + { + "epoch": 0.3155767437040108, + "grad_norm": 0.4246513247489929, + "learning_rate": 3.204008307240464e-05, + "loss": 0.174, + "step": 3045 + }, + { + "epoch": 0.3156803813866722, + "grad_norm": 0.5136212706565857, + "learning_rate": 3.203472169409934e-05, + "loss": 0.2301, + "step": 3046 + }, + { + "epoch": 0.3157840190693336, + "grad_norm": 0.5009952783584595, + "learning_rate": 3.202935895974031e-05, + "loss": 0.2162, + "step": 3047 + }, + { + "epoch": 0.31588765675199504, + "grad_norm": 0.49895063042640686, + "learning_rate": 3.202399486993181e-05, + "loss": 0.2342, + "step": 3048 + }, + { + "epoch": 0.31599129443465646, + "grad_norm": 0.5760830044746399, + "learning_rate": 3.2018629425278266e-05, + "loss": 0.2478, + "step": 3049 + }, + { + "epoch": 0.3160949321173179, + "grad_norm": 0.48224908113479614, + "learning_rate": 3.201326262638423e-05, + "loss": 0.2174, + "step": 3050 + }, + { + "epoch": 0.3161985697999793, + "grad_norm": 0.5179287791252136, + "learning_rate": 3.200789447385445e-05, + "loss": 0.2583, + "step": 3051 + }, + { + "epoch": 0.3163022074826407, + "grad_norm": 0.46274638175964355, + "learning_rate": 3.200252496829378e-05, + "loss": 0.2115, + "step": 3052 + }, + { + "epoch": 0.3164058451653021, + "grad_norm": 0.4526350796222687, + "learning_rate": 3.199715411030726e-05, + "loss": 0.1978, + "step": 3053 + }, + { + "epoch": 0.31650948284796354, + "grad_norm": 0.5232929587364197, + "learning_rate": 3.1991781900500054e-05, + "loss": 0.2694, + "step": 3054 + }, + { + "epoch": 0.31661312053062496, + "grad_norm": 0.5103795528411865, + "learning_rate": 3.1986408339477515e-05, + "loss": 0.222, + "step": 3055 + }, + { + "epoch": 0.3167167582132864, + "grad_norm": 0.4008198082447052, + "learning_rate": 3.198103342784511e-05, + "loss": 0.1697, + "step": 3056 + }, + { + "epoch": 0.3168203958959478, + "grad_norm": 0.45790600776672363, + "learning_rate": 3.1975657166208486e-05, + "loss": 0.2185, + "step": 3057 + }, + { + "epoch": 0.3169240335786092, + "grad_norm": 0.5262759923934937, + "learning_rate": 3.197027955517343e-05, + "loss": 0.2439, + "step": 3058 + }, + { + "epoch": 0.31702767126127057, + "grad_norm": 0.47552964091300964, + "learning_rate": 3.196490059534588e-05, + "loss": 0.2011, + "step": 3059 + }, + { + "epoch": 0.317131308943932, + "grad_norm": 0.4066140353679657, + "learning_rate": 3.195952028733193e-05, + "loss": 0.1874, + "step": 3060 + }, + { + "epoch": 0.3172349466265934, + "grad_norm": 0.49806711077690125, + "learning_rate": 3.1954138631737836e-05, + "loss": 0.2417, + "step": 3061 + }, + { + "epoch": 0.3173385843092548, + "grad_norm": 0.49791496992111206, + "learning_rate": 3.194875562916997e-05, + "loss": 0.2377, + "step": 3062 + }, + { + "epoch": 0.31744222199191624, + "grad_norm": 0.5316081047058105, + "learning_rate": 3.1943371280234905e-05, + "loss": 0.2532, + "step": 3063 + }, + { + "epoch": 0.31754585967457766, + "grad_norm": 0.48708289861679077, + "learning_rate": 3.193798558553933e-05, + "loss": 0.2109, + "step": 3064 + }, + { + "epoch": 0.3176494973572391, + "grad_norm": 0.5030457377433777, + "learning_rate": 3.193259854569009e-05, + "loss": 0.2195, + "step": 3065 + }, + { + "epoch": 0.3177531350399005, + "grad_norm": 0.4297417104244232, + "learning_rate": 3.19272101612942e-05, + "loss": 0.1726, + "step": 3066 + }, + { + "epoch": 0.3178567727225619, + "grad_norm": 0.5806198120117188, + "learning_rate": 3.192182043295881e-05, + "loss": 0.2898, + "step": 3067 + }, + { + "epoch": 0.3179604104052233, + "grad_norm": 0.49950316548347473, + "learning_rate": 3.191642936129122e-05, + "loss": 0.2381, + "step": 3068 + }, + { + "epoch": 0.31806404808788474, + "grad_norm": 0.5107114315032959, + "learning_rate": 3.1911036946898896e-05, + "loss": 0.2421, + "step": 3069 + }, + { + "epoch": 0.31816768577054616, + "grad_norm": 0.5184438228607178, + "learning_rate": 3.190564319038945e-05, + "loss": 0.2508, + "step": 3070 + }, + { + "epoch": 0.3182713234532076, + "grad_norm": 0.504603385925293, + "learning_rate": 3.190024809237064e-05, + "loss": 0.223, + "step": 3071 + }, + { + "epoch": 0.318374961135869, + "grad_norm": 0.43039676547050476, + "learning_rate": 3.189485165345037e-05, + "loss": 0.1966, + "step": 3072 + }, + { + "epoch": 0.3184785988185304, + "grad_norm": 0.43356984853744507, + "learning_rate": 3.188945387423671e-05, + "loss": 0.1816, + "step": 3073 + }, + { + "epoch": 0.31858223650119183, + "grad_norm": 0.439496248960495, + "learning_rate": 3.188405475533786e-05, + "loss": 0.1978, + "step": 3074 + }, + { + "epoch": 0.31868587418385325, + "grad_norm": 0.5059715509414673, + "learning_rate": 3.1878654297362196e-05, + "loss": 0.2591, + "step": 3075 + }, + { + "epoch": 0.31878951186651466, + "grad_norm": 0.4735623002052307, + "learning_rate": 3.187325250091824e-05, + "loss": 0.2131, + "step": 3076 + }, + { + "epoch": 0.3188931495491761, + "grad_norm": 0.4958748519420624, + "learning_rate": 3.1867849366614644e-05, + "loss": 0.2159, + "step": 3077 + }, + { + "epoch": 0.3189967872318375, + "grad_norm": 0.48477983474731445, + "learning_rate": 3.186244489506024e-05, + "loss": 0.2237, + "step": 3078 + }, + { + "epoch": 0.3191004249144989, + "grad_norm": 0.4658282399177551, + "learning_rate": 3.185703908686397e-05, + "loss": 0.2029, + "step": 3079 + }, + { + "epoch": 0.31920406259716033, + "grad_norm": 0.4622492492198944, + "learning_rate": 3.185163194263497e-05, + "loss": 0.2376, + "step": 3080 + }, + { + "epoch": 0.31930770027982175, + "grad_norm": 0.4980875849723816, + "learning_rate": 3.184622346298252e-05, + "loss": 0.2247, + "step": 3081 + }, + { + "epoch": 0.31941133796248317, + "grad_norm": 0.5297892689704895, + "learning_rate": 3.1840813648516015e-05, + "loss": 0.2165, + "step": 3082 + }, + { + "epoch": 0.3195149756451446, + "grad_norm": 0.5445452332496643, + "learning_rate": 3.183540249984504e-05, + "loss": 0.2828, + "step": 3083 + }, + { + "epoch": 0.319618613327806, + "grad_norm": 0.4954063892364502, + "learning_rate": 3.1829990017579306e-05, + "loss": 0.2117, + "step": 3084 + }, + { + "epoch": 0.3197222510104674, + "grad_norm": 0.4327438473701477, + "learning_rate": 3.182457620232868e-05, + "loss": 0.2227, + "step": 3085 + }, + { + "epoch": 0.31982588869312883, + "grad_norm": 0.4633021950721741, + "learning_rate": 3.18191610547032e-05, + "loss": 0.2006, + "step": 3086 + }, + { + "epoch": 0.31992952637579025, + "grad_norm": 0.43436625599861145, + "learning_rate": 3.181374457531303e-05, + "loss": 0.195, + "step": 3087 + }, + { + "epoch": 0.32003316405845167, + "grad_norm": 0.5254188179969788, + "learning_rate": 3.180832676476848e-05, + "loss": 0.238, + "step": 3088 + }, + { + "epoch": 0.3201368017411131, + "grad_norm": 0.4394558370113373, + "learning_rate": 3.180290762368002e-05, + "loss": 0.2024, + "step": 3089 + }, + { + "epoch": 0.3202404394237745, + "grad_norm": 0.4608902335166931, + "learning_rate": 3.179748715265828e-05, + "loss": 0.1795, + "step": 3090 + }, + { + "epoch": 0.3203440771064359, + "grad_norm": 0.4589252173900604, + "learning_rate": 3.179206535231403e-05, + "loss": 0.1954, + "step": 3091 + }, + { + "epoch": 0.32044771478909734, + "grad_norm": 0.5103592276573181, + "learning_rate": 3.1786642223258186e-05, + "loss": 0.2091, + "step": 3092 + }, + { + "epoch": 0.32055135247175875, + "grad_norm": 0.45052146911621094, + "learning_rate": 3.178121776610182e-05, + "loss": 0.1921, + "step": 3093 + }, + { + "epoch": 0.32065499015442017, + "grad_norm": 0.49646496772766113, + "learning_rate": 3.177579198145615e-05, + "loss": 0.2066, + "step": 3094 + }, + { + "epoch": 0.3207586278370816, + "grad_norm": 0.4916592836380005, + "learning_rate": 3.177036486993255e-05, + "loss": 0.2183, + "step": 3095 + }, + { + "epoch": 0.320862265519743, + "grad_norm": 0.5783033967018127, + "learning_rate": 3.1764936432142525e-05, + "loss": 0.2631, + "step": 3096 + }, + { + "epoch": 0.32096590320240437, + "grad_norm": 0.4790973663330078, + "learning_rate": 3.175950666869776e-05, + "loss": 0.2235, + "step": 3097 + }, + { + "epoch": 0.3210695408850658, + "grad_norm": 1.142399549484253, + "learning_rate": 3.1754075580210054e-05, + "loss": 0.2191, + "step": 3098 + }, + { + "epoch": 0.3211731785677272, + "grad_norm": 0.5207923054695129, + "learning_rate": 3.174864316729139e-05, + "loss": 0.2476, + "step": 3099 + }, + { + "epoch": 0.3212768162503886, + "grad_norm": 0.5381038784980774, + "learning_rate": 3.1743209430553883e-05, + "loss": 0.2596, + "step": 3100 + }, + { + "epoch": 0.32138045393305004, + "grad_norm": 0.5408893823623657, + "learning_rate": 3.173777437060978e-05, + "loss": 0.2685, + "step": 3101 + }, + { + "epoch": 0.32148409161571145, + "grad_norm": 0.521721601486206, + "learning_rate": 3.173233798807152e-05, + "loss": 0.2394, + "step": 3102 + }, + { + "epoch": 0.32158772929837287, + "grad_norm": 0.43176308274269104, + "learning_rate": 3.172690028355165e-05, + "loss": 0.2122, + "step": 3103 + }, + { + "epoch": 0.3216913669810343, + "grad_norm": 0.431378036737442, + "learning_rate": 3.172146125766288e-05, + "loss": 0.1887, + "step": 3104 + }, + { + "epoch": 0.3217950046636957, + "grad_norm": 0.47518616914749146, + "learning_rate": 3.171602091101808e-05, + "loss": 0.257, + "step": 3105 + }, + { + "epoch": 0.3218986423463571, + "grad_norm": 0.46573805809020996, + "learning_rate": 3.171057924423026e-05, + "loss": 0.2172, + "step": 3106 + }, + { + "epoch": 0.32200228002901854, + "grad_norm": 0.4244065582752228, + "learning_rate": 3.170513625791257e-05, + "loss": 0.1887, + "step": 3107 + }, + { + "epoch": 0.32210591771167996, + "grad_norm": 0.45914992690086365, + "learning_rate": 3.169969195267833e-05, + "loss": 0.2254, + "step": 3108 + }, + { + "epoch": 0.3222095553943414, + "grad_norm": 0.4996054768562317, + "learning_rate": 3.169424632914098e-05, + "loss": 0.2468, + "step": 3109 + }, + { + "epoch": 0.3223131930770028, + "grad_norm": 0.49699288606643677, + "learning_rate": 3.168879938791413e-05, + "loss": 0.2291, + "step": 3110 + }, + { + "epoch": 0.3224168307596642, + "grad_norm": 0.4879097640514374, + "learning_rate": 3.1683351129611547e-05, + "loss": 0.2367, + "step": 3111 + }, + { + "epoch": 0.3225204684423256, + "grad_norm": 0.46923696994781494, + "learning_rate": 3.1677901554847116e-05, + "loss": 0.2089, + "step": 3112 + }, + { + "epoch": 0.32262410612498704, + "grad_norm": 0.46386706829071045, + "learning_rate": 3.16724506642349e-05, + "loss": 0.2316, + "step": 3113 + }, + { + "epoch": 0.32272774380764846, + "grad_norm": 0.4610229432582855, + "learning_rate": 3.166699845838907e-05, + "loss": 0.2175, + "step": 3114 + }, + { + "epoch": 0.3228313814903099, + "grad_norm": 0.5416106581687927, + "learning_rate": 3.1661544937923996e-05, + "loss": 0.2118, + "step": 3115 + }, + { + "epoch": 0.3229350191729713, + "grad_norm": 0.5286328196525574, + "learning_rate": 3.1656090103454174e-05, + "loss": 0.2406, + "step": 3116 + }, + { + "epoch": 0.3230386568556327, + "grad_norm": 0.5512104630470276, + "learning_rate": 3.1650633955594235e-05, + "loss": 0.2189, + "step": 3117 + }, + { + "epoch": 0.32314229453829413, + "grad_norm": 0.4454720914363861, + "learning_rate": 3.164517649495898e-05, + "loss": 0.2077, + "step": 3118 + }, + { + "epoch": 0.32324593222095555, + "grad_norm": 0.5066326260566711, + "learning_rate": 3.163971772216333e-05, + "loss": 0.2053, + "step": 3119 + }, + { + "epoch": 0.32334956990361696, + "grad_norm": 0.4619614779949188, + "learning_rate": 3.163425763782238e-05, + "loss": 0.2108, + "step": 3120 + }, + { + "epoch": 0.3234532075862784, + "grad_norm": 0.5279788374900818, + "learning_rate": 3.1628796242551374e-05, + "loss": 0.2304, + "step": 3121 + }, + { + "epoch": 0.3235568452689398, + "grad_norm": 0.509197473526001, + "learning_rate": 3.1623333536965684e-05, + "loss": 0.2144, + "step": 3122 + }, + { + "epoch": 0.3236604829516012, + "grad_norm": 0.5262678861618042, + "learning_rate": 3.161786952168083e-05, + "loss": 0.2838, + "step": 3123 + }, + { + "epoch": 0.32376412063426263, + "grad_norm": 0.4825746715068817, + "learning_rate": 3.161240419731251e-05, + "loss": 0.2476, + "step": 3124 + }, + { + "epoch": 0.32386775831692405, + "grad_norm": 0.4855850636959076, + "learning_rate": 3.160693756447654e-05, + "loss": 0.2653, + "step": 3125 + }, + { + "epoch": 0.32397139599958547, + "grad_norm": 0.4569106698036194, + "learning_rate": 3.160146962378887e-05, + "loss": 0.2241, + "step": 3126 + }, + { + "epoch": 0.3240750336822469, + "grad_norm": 0.47725579142570496, + "learning_rate": 3.159600037586565e-05, + "loss": 0.2526, + "step": 3127 + }, + { + "epoch": 0.3241786713649083, + "grad_norm": 0.4275054633617401, + "learning_rate": 3.1590529821323134e-05, + "loss": 0.1906, + "step": 3128 + }, + { + "epoch": 0.3242823090475697, + "grad_norm": 0.472245991230011, + "learning_rate": 3.1585057960777735e-05, + "loss": 0.2525, + "step": 3129 + }, + { + "epoch": 0.32438594673023113, + "grad_norm": 0.4678281843662262, + "learning_rate": 3.1579584794846015e-05, + "loss": 0.2226, + "step": 3130 + }, + { + "epoch": 0.32448958441289255, + "grad_norm": 0.4702921211719513, + "learning_rate": 3.1574110324144676e-05, + "loss": 0.1996, + "step": 3131 + }, + { + "epoch": 0.32459322209555397, + "grad_norm": 0.4830780029296875, + "learning_rate": 3.156863454929059e-05, + "loss": 0.2251, + "step": 3132 + }, + { + "epoch": 0.3246968597782154, + "grad_norm": 0.4437926411628723, + "learning_rate": 3.156315747090073e-05, + "loss": 0.1927, + "step": 3133 + }, + { + "epoch": 0.3248004974608768, + "grad_norm": 0.5422351956367493, + "learning_rate": 3.1557679089592274e-05, + "loss": 0.222, + "step": 3134 + }, + { + "epoch": 0.32490413514353816, + "grad_norm": 0.48913538455963135, + "learning_rate": 3.15521994059825e-05, + "loss": 0.2249, + "step": 3135 + }, + { + "epoch": 0.3250077728261996, + "grad_norm": 0.43056854605674744, + "learning_rate": 3.154671842068886e-05, + "loss": 0.1881, + "step": 3136 + }, + { + "epoch": 0.325111410508861, + "grad_norm": 0.41883689165115356, + "learning_rate": 3.154123613432893e-05, + "loss": 0.1765, + "step": 3137 + }, + { + "epoch": 0.3252150481915224, + "grad_norm": 0.5446802973747253, + "learning_rate": 3.1535752547520456e-05, + "loss": 0.2483, + "step": 3138 + }, + { + "epoch": 0.32531868587418383, + "grad_norm": 0.4917711615562439, + "learning_rate": 3.153026766088132e-05, + "loss": 0.2339, + "step": 3139 + }, + { + "epoch": 0.32542232355684525, + "grad_norm": 0.5216474533081055, + "learning_rate": 3.152478147502954e-05, + "loss": 0.259, + "step": 3140 + }, + { + "epoch": 0.32552596123950667, + "grad_norm": 0.4385351240634918, + "learning_rate": 3.1519293990583316e-05, + "loss": 0.1905, + "step": 3141 + }, + { + "epoch": 0.3256295989221681, + "grad_norm": 0.527709424495697, + "learning_rate": 3.151380520816094e-05, + "loss": 0.2384, + "step": 3142 + }, + { + "epoch": 0.3257332366048295, + "grad_norm": 0.5104259252548218, + "learning_rate": 3.15083151283809e-05, + "loss": 0.2303, + "step": 3143 + }, + { + "epoch": 0.3258368742874909, + "grad_norm": 0.45804575085639954, + "learning_rate": 3.150282375186179e-05, + "loss": 0.2083, + "step": 3144 + }, + { + "epoch": 0.32594051197015234, + "grad_norm": 0.5373900532722473, + "learning_rate": 3.149733107922239e-05, + "loss": 0.2249, + "step": 3145 + }, + { + "epoch": 0.32604414965281375, + "grad_norm": 0.4487946927547455, + "learning_rate": 3.149183711108159e-05, + "loss": 0.1957, + "step": 3146 + }, + { + "epoch": 0.32614778733547517, + "grad_norm": 0.48644623160362244, + "learning_rate": 3.148634184805845e-05, + "loss": 0.2174, + "step": 3147 + }, + { + "epoch": 0.3262514250181366, + "grad_norm": 0.536426305770874, + "learning_rate": 3.1480845290772176e-05, + "loss": 0.2566, + "step": 3148 + }, + { + "epoch": 0.326355062700798, + "grad_norm": 0.4516526758670807, + "learning_rate": 3.147534743984209e-05, + "loss": 0.2121, + "step": 3149 + }, + { + "epoch": 0.3264587003834594, + "grad_norm": 0.5031577348709106, + "learning_rate": 3.146984829588769e-05, + "loss": 0.2263, + "step": 3150 + }, + { + "epoch": 0.32656233806612084, + "grad_norm": 0.4382554590702057, + "learning_rate": 3.146434785952863e-05, + "loss": 0.2149, + "step": 3151 + }, + { + "epoch": 0.32666597574878226, + "grad_norm": 0.5372516512870789, + "learning_rate": 3.1458846131384666e-05, + "loss": 0.2253, + "step": 3152 + }, + { + "epoch": 0.3267696134314437, + "grad_norm": 0.530376672744751, + "learning_rate": 3.145334311207574e-05, + "loss": 0.2312, + "step": 3153 + }, + { + "epoch": 0.3268732511141051, + "grad_norm": 0.4759053587913513, + "learning_rate": 3.144783880222191e-05, + "loss": 0.2315, + "step": 3154 + }, + { + "epoch": 0.3269768887967665, + "grad_norm": 0.4735720753669739, + "learning_rate": 3.1442333202443394e-05, + "loss": 0.2465, + "step": 3155 + }, + { + "epoch": 0.3270805264794279, + "grad_norm": 0.525758683681488, + "learning_rate": 3.1436826313360565e-05, + "loss": 0.2394, + "step": 3156 + }, + { + "epoch": 0.32718416416208934, + "grad_norm": 0.5047840476036072, + "learning_rate": 3.1431318135593936e-05, + "loss": 0.2349, + "step": 3157 + }, + { + "epoch": 0.32728780184475076, + "grad_norm": 0.4203908443450928, + "learning_rate": 3.142580866976414e-05, + "loss": 0.193, + "step": 3158 + }, + { + "epoch": 0.3273914395274122, + "grad_norm": 0.442714124917984, + "learning_rate": 3.142029791649198e-05, + "loss": 0.1932, + "step": 3159 + }, + { + "epoch": 0.3274950772100736, + "grad_norm": 0.5398364663124084, + "learning_rate": 3.1414785876398416e-05, + "loss": 0.2462, + "step": 3160 + }, + { + "epoch": 0.327598714892735, + "grad_norm": 0.5328658819198608, + "learning_rate": 3.140927255010452e-05, + "loss": 0.2018, + "step": 3161 + }, + { + "epoch": 0.3277023525753964, + "grad_norm": 0.5347991585731506, + "learning_rate": 3.140375793823152e-05, + "loss": 0.2357, + "step": 3162 + }, + { + "epoch": 0.32780599025805784, + "grad_norm": 0.40971866250038147, + "learning_rate": 3.139824204140082e-05, + "loss": 0.1614, + "step": 3163 + }, + { + "epoch": 0.32790962794071926, + "grad_norm": 0.4941691756248474, + "learning_rate": 3.139272486023391e-05, + "loss": 0.2036, + "step": 3164 + }, + { + "epoch": 0.3280132656233807, + "grad_norm": 0.42997175455093384, + "learning_rate": 3.1387206395352486e-05, + "loss": 0.2139, + "step": 3165 + }, + { + "epoch": 0.3281169033060421, + "grad_norm": 0.531755805015564, + "learning_rate": 3.138168664737833e-05, + "loss": 0.2309, + "step": 3166 + }, + { + "epoch": 0.3282205409887035, + "grad_norm": 0.5963094830513, + "learning_rate": 3.137616561693343e-05, + "loss": 0.2677, + "step": 3167 + }, + { + "epoch": 0.32832417867136493, + "grad_norm": 0.4682312309741974, + "learning_rate": 3.137064330463987e-05, + "loss": 0.2123, + "step": 3168 + }, + { + "epoch": 0.32842781635402635, + "grad_norm": 0.5144062638282776, + "learning_rate": 3.13651197111199e-05, + "loss": 0.2315, + "step": 3169 + }, + { + "epoch": 0.32853145403668776, + "grad_norm": 0.5011259317398071, + "learning_rate": 3.1359594836995906e-05, + "loss": 0.219, + "step": 3170 + }, + { + "epoch": 0.3286350917193492, + "grad_norm": 0.508209228515625, + "learning_rate": 3.135406868289042e-05, + "loss": 0.258, + "step": 3171 + }, + { + "epoch": 0.3287387294020106, + "grad_norm": 0.5223318338394165, + "learning_rate": 3.134854124942613e-05, + "loss": 0.2294, + "step": 3172 + }, + { + "epoch": 0.32884236708467196, + "grad_norm": 0.3559339642524719, + "learning_rate": 3.134301253722585e-05, + "loss": 0.1636, + "step": 3173 + }, + { + "epoch": 0.3289460047673334, + "grad_norm": 0.4399315118789673, + "learning_rate": 3.133748254691256e-05, + "loss": 0.2027, + "step": 3174 + }, + { + "epoch": 0.3290496424499948, + "grad_norm": 0.48089268803596497, + "learning_rate": 3.1331951279109354e-05, + "loss": 0.2416, + "step": 3175 + }, + { + "epoch": 0.3291532801326562, + "grad_norm": 0.5257017016410828, + "learning_rate": 3.1326418734439495e-05, + "loss": 0.243, + "step": 3176 + }, + { + "epoch": 0.32925691781531763, + "grad_norm": 0.46509110927581787, + "learning_rate": 3.132088491352638e-05, + "loss": 0.2278, + "step": 3177 + }, + { + "epoch": 0.32936055549797905, + "grad_norm": 0.5531212091445923, + "learning_rate": 3.131534981699355e-05, + "loss": 0.2315, + "step": 3178 + }, + { + "epoch": 0.32946419318064046, + "grad_norm": 0.43978351354599, + "learning_rate": 3.1309813445464695e-05, + "loss": 0.2014, + "step": 3179 + }, + { + "epoch": 0.3295678308633019, + "grad_norm": 0.5329275131225586, + "learning_rate": 3.1304275799563645e-05, + "loss": 0.2421, + "step": 3180 + }, + { + "epoch": 0.3296714685459633, + "grad_norm": 0.47121936082839966, + "learning_rate": 3.1298736879914364e-05, + "loss": 0.2161, + "step": 3181 + }, + { + "epoch": 0.3297751062286247, + "grad_norm": 0.4731251001358032, + "learning_rate": 3.1293196687140973e-05, + "loss": 0.2332, + "step": 3182 + }, + { + "epoch": 0.32987874391128613, + "grad_norm": 0.4779028594493866, + "learning_rate": 3.128765522186774e-05, + "loss": 0.1887, + "step": 3183 + }, + { + "epoch": 0.32998238159394755, + "grad_norm": 0.45643532276153564, + "learning_rate": 3.1282112484719066e-05, + "loss": 0.21, + "step": 3184 + }, + { + "epoch": 0.33008601927660897, + "grad_norm": 0.4478624761104584, + "learning_rate": 3.1276568476319495e-05, + "loss": 0.2212, + "step": 3185 + }, + { + "epoch": 0.3301896569592704, + "grad_norm": 0.5598623752593994, + "learning_rate": 3.127102319729372e-05, + "loss": 0.2548, + "step": 3186 + }, + { + "epoch": 0.3302932946419318, + "grad_norm": 0.492170512676239, + "learning_rate": 3.1265476648266565e-05, + "loss": 0.2141, + "step": 3187 + }, + { + "epoch": 0.3303969323245932, + "grad_norm": 0.475373238325119, + "learning_rate": 3.125992882986302e-05, + "loss": 0.2155, + "step": 3188 + }, + { + "epoch": 0.33050057000725463, + "grad_norm": 0.4140654504299164, + "learning_rate": 3.1254379742708195e-05, + "loss": 0.2101, + "step": 3189 + }, + { + "epoch": 0.33060420768991605, + "grad_norm": 0.4968665838241577, + "learning_rate": 3.124882938742736e-05, + "loss": 0.2455, + "step": 3190 + }, + { + "epoch": 0.33070784537257747, + "grad_norm": 0.49836060404777527, + "learning_rate": 3.1243277764645905e-05, + "loss": 0.2396, + "step": 3191 + }, + { + "epoch": 0.3308114830552389, + "grad_norm": 0.373451828956604, + "learning_rate": 3.1237724874989405e-05, + "loss": 0.1591, + "step": 3192 + }, + { + "epoch": 0.3309151207379003, + "grad_norm": 0.5305958390235901, + "learning_rate": 3.1232170719083525e-05, + "loss": 0.2475, + "step": 3193 + }, + { + "epoch": 0.3310187584205617, + "grad_norm": 0.40864086151123047, + "learning_rate": 3.1226615297554114e-05, + "loss": 0.187, + "step": 3194 + }, + { + "epoch": 0.33112239610322314, + "grad_norm": 0.43606048822402954, + "learning_rate": 3.122105861102714e-05, + "loss": 0.2194, + "step": 3195 + }, + { + "epoch": 0.33122603378588455, + "grad_norm": 0.4913009703159332, + "learning_rate": 3.121550066012873e-05, + "loss": 0.2425, + "step": 3196 + }, + { + "epoch": 0.33132967146854597, + "grad_norm": 0.4447884261608124, + "learning_rate": 3.120994144548513e-05, + "loss": 0.2283, + "step": 3197 + }, + { + "epoch": 0.3314333091512074, + "grad_norm": 0.4598451256752014, + "learning_rate": 3.120438096772277e-05, + "loss": 0.2035, + "step": 3198 + }, + { + "epoch": 0.3315369468338688, + "grad_norm": 0.4184786379337311, + "learning_rate": 3.1198819227468166e-05, + "loss": 0.2052, + "step": 3199 + }, + { + "epoch": 0.3316405845165302, + "grad_norm": 0.517906665802002, + "learning_rate": 3.1193256225348025e-05, + "loss": 0.2555, + "step": 3200 + }, + { + "epoch": 0.33174422219919164, + "grad_norm": 0.5278466939926147, + "learning_rate": 3.1187691961989184e-05, + "loss": 0.2461, + "step": 3201 + }, + { + "epoch": 0.33184785988185306, + "grad_norm": 0.3784679174423218, + "learning_rate": 3.118212643801859e-05, + "loss": 0.1559, + "step": 3202 + }, + { + "epoch": 0.3319514975645145, + "grad_norm": 0.47298792004585266, + "learning_rate": 3.1176559654063375e-05, + "loss": 0.2083, + "step": 3203 + }, + { + "epoch": 0.3320551352471759, + "grad_norm": 0.48570263385772705, + "learning_rate": 3.1170991610750795e-05, + "loss": 0.2348, + "step": 3204 + }, + { + "epoch": 0.3321587729298373, + "grad_norm": 0.5153487920761108, + "learning_rate": 3.116542230870824e-05, + "loss": 0.2424, + "step": 3205 + }, + { + "epoch": 0.3322624106124987, + "grad_norm": 0.45193734765052795, + "learning_rate": 3.1159851748563265e-05, + "loss": 0.2048, + "step": 3206 + }, + { + "epoch": 0.33236604829516014, + "grad_norm": 0.5584096312522888, + "learning_rate": 3.115427993094354e-05, + "loss": 0.256, + "step": 3207 + }, + { + "epoch": 0.33246968597782156, + "grad_norm": 0.4924525320529938, + "learning_rate": 3.114870685647688e-05, + "loss": 0.2356, + "step": 3208 + }, + { + "epoch": 0.332573323660483, + "grad_norm": 0.4525347948074341, + "learning_rate": 3.1143132525791275e-05, + "loss": 0.1851, + "step": 3209 + }, + { + "epoch": 0.3326769613431444, + "grad_norm": 0.4745256304740906, + "learning_rate": 3.113755693951482e-05, + "loss": 0.1991, + "step": 3210 + }, + { + "epoch": 0.33278059902580576, + "grad_norm": 0.40173184871673584, + "learning_rate": 3.113198009827576e-05, + "loss": 0.1902, + "step": 3211 + }, + { + "epoch": 0.3328842367084672, + "grad_norm": 0.5420322418212891, + "learning_rate": 3.1126402002702495e-05, + "loss": 0.2449, + "step": 3212 + }, + { + "epoch": 0.3329878743911286, + "grad_norm": 0.5138987898826599, + "learning_rate": 3.112082265342354e-05, + "loss": 0.233, + "step": 3213 + }, + { + "epoch": 0.33309151207379, + "grad_norm": 0.49910151958465576, + "learning_rate": 3.1115242051067574e-05, + "loss": 0.2059, + "step": 3214 + }, + { + "epoch": 0.3331951497564514, + "grad_norm": 0.5475865602493286, + "learning_rate": 3.110966019626342e-05, + "loss": 0.2524, + "step": 3215 + }, + { + "epoch": 0.33329878743911284, + "grad_norm": 0.4959619641304016, + "learning_rate": 3.1104077089640016e-05, + "loss": 0.2182, + "step": 3216 + }, + { + "epoch": 0.33340242512177426, + "grad_norm": 0.5604843497276306, + "learning_rate": 3.109849273182648e-05, + "loss": 0.2404, + "step": 3217 + }, + { + "epoch": 0.3335060628044357, + "grad_norm": 0.6018540859222412, + "learning_rate": 3.1092907123452024e-05, + "loss": 0.2359, + "step": 3218 + }, + { + "epoch": 0.3336097004870971, + "grad_norm": 0.4417458772659302, + "learning_rate": 3.108732026514604e-05, + "loss": 0.1685, + "step": 3219 + }, + { + "epoch": 0.3337133381697585, + "grad_norm": 0.4794652760028839, + "learning_rate": 3.108173215753805e-05, + "loss": 0.2495, + "step": 3220 + }, + { + "epoch": 0.33381697585241993, + "grad_norm": 0.5345839858055115, + "learning_rate": 3.10761428012577e-05, + "loss": 0.2436, + "step": 3221 + }, + { + "epoch": 0.33392061353508135, + "grad_norm": 0.5043509602546692, + "learning_rate": 3.1070552196934803e-05, + "loss": 0.2336, + "step": 3222 + }, + { + "epoch": 0.33402425121774276, + "grad_norm": 0.5134027004241943, + "learning_rate": 3.106496034519929e-05, + "loss": 0.2336, + "step": 3223 + }, + { + "epoch": 0.3341278889004042, + "grad_norm": 0.5828744769096375, + "learning_rate": 3.105936724668125e-05, + "loss": 0.2761, + "step": 3224 + }, + { + "epoch": 0.3342315265830656, + "grad_norm": 0.5718686580657959, + "learning_rate": 3.10537729020109e-05, + "loss": 0.2309, + "step": 3225 + }, + { + "epoch": 0.334335164265727, + "grad_norm": 0.45800167322158813, + "learning_rate": 3.10481773118186e-05, + "loss": 0.2076, + "step": 3226 + }, + { + "epoch": 0.33443880194838843, + "grad_norm": 0.45891281962394714, + "learning_rate": 3.104258047673486e-05, + "loss": 0.1819, + "step": 3227 + }, + { + "epoch": 0.33454243963104985, + "grad_norm": 0.4926428198814392, + "learning_rate": 3.103698239739031e-05, + "loss": 0.1926, + "step": 3228 + }, + { + "epoch": 0.33464607731371127, + "grad_norm": 0.5176846385002136, + "learning_rate": 3.103138307441575e-05, + "loss": 0.2335, + "step": 3229 + }, + { + "epoch": 0.3347497149963727, + "grad_norm": 0.509946882724762, + "learning_rate": 3.102578250844209e-05, + "loss": 0.2154, + "step": 3230 + }, + { + "epoch": 0.3348533526790341, + "grad_norm": 0.5066658854484558, + "learning_rate": 3.1020180700100395e-05, + "loss": 0.2191, + "step": 3231 + }, + { + "epoch": 0.3349569903616955, + "grad_norm": 0.4816665053367615, + "learning_rate": 3.101457765002187e-05, + "loss": 0.2324, + "step": 3232 + }, + { + "epoch": 0.33506062804435693, + "grad_norm": 0.4046016335487366, + "learning_rate": 3.100897335883786e-05, + "loss": 0.177, + "step": 3233 + }, + { + "epoch": 0.33516426572701835, + "grad_norm": 0.38172265887260437, + "learning_rate": 3.100336782717984e-05, + "loss": 0.1846, + "step": 3234 + }, + { + "epoch": 0.33526790340967977, + "grad_norm": 0.45876002311706543, + "learning_rate": 3.099776105567945e-05, + "loss": 0.2034, + "step": 3235 + }, + { + "epoch": 0.3353715410923412, + "grad_norm": 0.49584850668907166, + "learning_rate": 3.099215304496843e-05, + "loss": 0.2165, + "step": 3236 + }, + { + "epoch": 0.3354751787750026, + "grad_norm": 0.5085937976837158, + "learning_rate": 3.09865437956787e-05, + "loss": 0.2458, + "step": 3237 + }, + { + "epoch": 0.335578816457664, + "grad_norm": 0.49745145440101624, + "learning_rate": 3.0980933308442295e-05, + "loss": 0.2399, + "step": 3238 + }, + { + "epoch": 0.33568245414032544, + "grad_norm": 0.48892679810523987, + "learning_rate": 3.097532158389139e-05, + "loss": 0.2472, + "step": 3239 + }, + { + "epoch": 0.33578609182298685, + "grad_norm": 0.5717560648918152, + "learning_rate": 3.09697086226583e-05, + "loss": 0.2519, + "step": 3240 + }, + { + "epoch": 0.33588972950564827, + "grad_norm": 0.42260032892227173, + "learning_rate": 3.0964094425375515e-05, + "loss": 0.1985, + "step": 3241 + }, + { + "epoch": 0.3359933671883097, + "grad_norm": 0.38656240701675415, + "learning_rate": 3.0958478992675606e-05, + "loss": 0.1557, + "step": 3242 + }, + { + "epoch": 0.3360970048709711, + "grad_norm": 0.49424460530281067, + "learning_rate": 3.095286232519131e-05, + "loss": 0.273, + "step": 3243 + }, + { + "epoch": 0.3362006425536325, + "grad_norm": 0.49864912033081055, + "learning_rate": 3.0947244423555526e-05, + "loss": 0.2151, + "step": 3244 + }, + { + "epoch": 0.33630428023629394, + "grad_norm": 0.4592277407646179, + "learning_rate": 3.094162528840126e-05, + "loss": 0.2061, + "step": 3245 + }, + { + "epoch": 0.33640791791895536, + "grad_norm": 0.4824981987476349, + "learning_rate": 3.0936004920361654e-05, + "loss": 0.2311, + "step": 3246 + }, + { + "epoch": 0.3365115556016168, + "grad_norm": 0.43453484773635864, + "learning_rate": 3.0930383320070025e-05, + "loss": 0.1828, + "step": 3247 + }, + { + "epoch": 0.33661519328427814, + "grad_norm": 0.5099527835845947, + "learning_rate": 3.092476048815979e-05, + "loss": 0.2207, + "step": 3248 + }, + { + "epoch": 0.33671883096693955, + "grad_norm": 0.4967183768749237, + "learning_rate": 3.0919136425264525e-05, + "loss": 0.2227, + "step": 3249 + }, + { + "epoch": 0.33682246864960097, + "grad_norm": 0.46992024779319763, + "learning_rate": 3.0913511132017943e-05, + "loss": 0.2024, + "step": 3250 + }, + { + "epoch": 0.3369261063322624, + "grad_norm": 0.47221264243125916, + "learning_rate": 3.090788460905389e-05, + "loss": 0.2158, + "step": 3251 + }, + { + "epoch": 0.3370297440149238, + "grad_norm": 0.4187302887439728, + "learning_rate": 3.090225685700636e-05, + "loss": 0.1696, + "step": 3252 + }, + { + "epoch": 0.3371333816975852, + "grad_norm": 0.4217827618122101, + "learning_rate": 3.089662787650947e-05, + "loss": 0.1975, + "step": 3253 + }, + { + "epoch": 0.33723701938024664, + "grad_norm": 0.3924787640571594, + "learning_rate": 3.089099766819749e-05, + "loss": 0.1776, + "step": 3254 + }, + { + "epoch": 0.33734065706290806, + "grad_norm": 0.5434726476669312, + "learning_rate": 3.088536623270483e-05, + "loss": 0.2242, + "step": 3255 + }, + { + "epoch": 0.3374442947455695, + "grad_norm": 0.5202988982200623, + "learning_rate": 3.0879733570666024e-05, + "loss": 0.2304, + "step": 3256 + }, + { + "epoch": 0.3375479324282309, + "grad_norm": 0.5428398251533508, + "learning_rate": 3.0874099682715745e-05, + "loss": 0.2248, + "step": 3257 + }, + { + "epoch": 0.3376515701108923, + "grad_norm": 0.5170606970787048, + "learning_rate": 3.086846456948882e-05, + "loss": 0.2164, + "step": 3258 + }, + { + "epoch": 0.3377552077935537, + "grad_norm": 0.4656817317008972, + "learning_rate": 3.0862828231620206e-05, + "loss": 0.2134, + "step": 3259 + }, + { + "epoch": 0.33785884547621514, + "grad_norm": 0.5413587093353271, + "learning_rate": 3.085719066974499e-05, + "loss": 0.2403, + "step": 3260 + }, + { + "epoch": 0.33796248315887656, + "grad_norm": 0.565420925617218, + "learning_rate": 3.0851551884498414e-05, + "loss": 0.2482, + "step": 3261 + }, + { + "epoch": 0.338066120841538, + "grad_norm": 0.5669689178466797, + "learning_rate": 3.084591187651583e-05, + "loss": 0.2584, + "step": 3262 + }, + { + "epoch": 0.3381697585241994, + "grad_norm": 0.4945541024208069, + "learning_rate": 3.0840270646432765e-05, + "loss": 0.2163, + "step": 3263 + }, + { + "epoch": 0.3382733962068608, + "grad_norm": 0.5486595630645752, + "learning_rate": 3.0834628194884854e-05, + "loss": 0.2259, + "step": 3264 + }, + { + "epoch": 0.3383770338895222, + "grad_norm": 0.4185464680194855, + "learning_rate": 3.0828984522507875e-05, + "loss": 0.1834, + "step": 3265 + }, + { + "epoch": 0.33848067157218364, + "grad_norm": 0.49174776673316956, + "learning_rate": 3.082333962993776e-05, + "loss": 0.2183, + "step": 3266 + }, + { + "epoch": 0.33858430925484506, + "grad_norm": 0.5619238615036011, + "learning_rate": 3.0817693517810555e-05, + "loss": 0.2336, + "step": 3267 + }, + { + "epoch": 0.3386879469375065, + "grad_norm": 0.4658437669277191, + "learning_rate": 3.081204618676246e-05, + "loss": 0.2035, + "step": 3268 + }, + { + "epoch": 0.3387915846201679, + "grad_norm": 0.4304681420326233, + "learning_rate": 3.0806397637429815e-05, + "loss": 0.1782, + "step": 3269 + }, + { + "epoch": 0.3388952223028293, + "grad_norm": 0.49248623847961426, + "learning_rate": 3.0800747870449085e-05, + "loss": 0.1977, + "step": 3270 + }, + { + "epoch": 0.33899885998549073, + "grad_norm": 0.44981497526168823, + "learning_rate": 3.0795096886456864e-05, + "loss": 0.2195, + "step": 3271 + }, + { + "epoch": 0.33910249766815215, + "grad_norm": 0.5155792832374573, + "learning_rate": 3.078944468608992e-05, + "loss": 0.2303, + "step": 3272 + }, + { + "epoch": 0.33920613535081356, + "grad_norm": 0.5373932123184204, + "learning_rate": 3.078379126998511e-05, + "loss": 0.235, + "step": 3273 + }, + { + "epoch": 0.339309773033475, + "grad_norm": 0.5162752270698547, + "learning_rate": 3.077813663877946e-05, + "loss": 0.2134, + "step": 3274 + }, + { + "epoch": 0.3394134107161364, + "grad_norm": 0.48236146569252014, + "learning_rate": 3.077248079311015e-05, + "loss": 0.2218, + "step": 3275 + }, + { + "epoch": 0.3395170483987978, + "grad_norm": 0.5287761688232422, + "learning_rate": 3.076682373361443e-05, + "loss": 0.273, + "step": 3276 + }, + { + "epoch": 0.33962068608145923, + "grad_norm": 0.4486253261566162, + "learning_rate": 3.076116546092975e-05, + "loss": 0.2095, + "step": 3277 + }, + { + "epoch": 0.33972432376412065, + "grad_norm": 0.4924604296684265, + "learning_rate": 3.075550597569369e-05, + "loss": 0.2117, + "step": 3278 + }, + { + "epoch": 0.33982796144678207, + "grad_norm": 0.45519301295280457, + "learning_rate": 3.074984527854392e-05, + "loss": 0.1761, + "step": 3279 + }, + { + "epoch": 0.3399315991294435, + "grad_norm": 0.5044865608215332, + "learning_rate": 3.074418337011831e-05, + "loss": 0.1913, + "step": 3280 + }, + { + "epoch": 0.3400352368121049, + "grad_norm": 0.5022374391555786, + "learning_rate": 3.073852025105481e-05, + "loss": 0.2132, + "step": 3281 + }, + { + "epoch": 0.3401388744947663, + "grad_norm": 0.4834980070590973, + "learning_rate": 3.073285592199154e-05, + "loss": 0.2091, + "step": 3282 + }, + { + "epoch": 0.34024251217742774, + "grad_norm": 0.5449108481407166, + "learning_rate": 3.072719038356675e-05, + "loss": 0.2529, + "step": 3283 + }, + { + "epoch": 0.34034614986008915, + "grad_norm": 0.535485029220581, + "learning_rate": 3.072152363641883e-05, + "loss": 0.2137, + "step": 3284 + }, + { + "epoch": 0.34044978754275057, + "grad_norm": 0.5348849296569824, + "learning_rate": 3.0715855681186294e-05, + "loss": 0.2221, + "step": 3285 + }, + { + "epoch": 0.34055342522541193, + "grad_norm": 0.48995643854141235, + "learning_rate": 3.0710186518507794e-05, + "loss": 0.2252, + "step": 3286 + }, + { + "epoch": 0.34065706290807335, + "grad_norm": 0.5089548230171204, + "learning_rate": 3.0704516149022126e-05, + "loss": 0.2416, + "step": 3287 + }, + { + "epoch": 0.34076070059073477, + "grad_norm": 0.4678729176521301, + "learning_rate": 3.069884457336822e-05, + "loss": 0.208, + "step": 3288 + }, + { + "epoch": 0.3408643382733962, + "grad_norm": 0.47132188081741333, + "learning_rate": 3.069317179218513e-05, + "loss": 0.1738, + "step": 3289 + }, + { + "epoch": 0.3409679759560576, + "grad_norm": 0.46246013045310974, + "learning_rate": 3.068749780611208e-05, + "loss": 0.2065, + "step": 3290 + }, + { + "epoch": 0.341071613638719, + "grad_norm": 0.5273382067680359, + "learning_rate": 3.068182261578839e-05, + "loss": 0.2094, + "step": 3291 + }, + { + "epoch": 0.34117525132138043, + "grad_norm": 0.47908997535705566, + "learning_rate": 3.067614622185352e-05, + "loss": 0.2143, + "step": 3292 + }, + { + "epoch": 0.34127888900404185, + "grad_norm": 0.45752736926078796, + "learning_rate": 3.06704686249471e-05, + "loss": 0.1816, + "step": 3293 + }, + { + "epoch": 0.34138252668670327, + "grad_norm": 0.42608192563056946, + "learning_rate": 3.066478982570886e-05, + "loss": 0.1813, + "step": 3294 + }, + { + "epoch": 0.3414861643693647, + "grad_norm": 0.5494682788848877, + "learning_rate": 3.065910982477868e-05, + "loss": 0.2782, + "step": 3295 + }, + { + "epoch": 0.3415898020520261, + "grad_norm": 0.5171986818313599, + "learning_rate": 3.065342862279658e-05, + "loss": 0.2454, + "step": 3296 + }, + { + "epoch": 0.3416934397346875, + "grad_norm": 0.4978666305541992, + "learning_rate": 3.06477462204027e-05, + "loss": 0.2121, + "step": 3297 + }, + { + "epoch": 0.34179707741734894, + "grad_norm": 0.4874645471572876, + "learning_rate": 3.0642062618237326e-05, + "loss": 0.2193, + "step": 3298 + }, + { + "epoch": 0.34190071510001036, + "grad_norm": 0.46120017766952515, + "learning_rate": 3.063637781694088e-05, + "loss": 0.2243, + "step": 3299 + }, + { + "epoch": 0.34200435278267177, + "grad_norm": 0.5550026893615723, + "learning_rate": 3.063069181715392e-05, + "loss": 0.224, + "step": 3300 + }, + { + "epoch": 0.3421079904653332, + "grad_norm": 0.47999265789985657, + "learning_rate": 3.0625004619517136e-05, + "loss": 0.2073, + "step": 3301 + }, + { + "epoch": 0.3422116281479946, + "grad_norm": 0.5649874806404114, + "learning_rate": 3.061931622467134e-05, + "loss": 0.2269, + "step": 3302 + }, + { + "epoch": 0.342315265830656, + "grad_norm": 0.4317651093006134, + "learning_rate": 3.0613626633257504e-05, + "loss": 0.1858, + "step": 3303 + }, + { + "epoch": 0.34241890351331744, + "grad_norm": 0.4310738444328308, + "learning_rate": 3.060793584591671e-05, + "loss": 0.1749, + "step": 3304 + }, + { + "epoch": 0.34252254119597886, + "grad_norm": 0.45287322998046875, + "learning_rate": 3.060224386329021e-05, + "loss": 0.1899, + "step": 3305 + }, + { + "epoch": 0.3426261788786403, + "grad_norm": 0.577049970626831, + "learning_rate": 3.059655068601934e-05, + "loss": 0.2766, + "step": 3306 + }, + { + "epoch": 0.3427298165613017, + "grad_norm": 0.47695785760879517, + "learning_rate": 3.059085631474562e-05, + "loss": 0.2051, + "step": 3307 + }, + { + "epoch": 0.3428334542439631, + "grad_norm": 0.4532393515110016, + "learning_rate": 3.0585160750110664e-05, + "loss": 0.1965, + "step": 3308 + }, + { + "epoch": 0.3429370919266245, + "grad_norm": 0.5475507974624634, + "learning_rate": 3.057946399275626e-05, + "loss": 0.2345, + "step": 3309 + }, + { + "epoch": 0.34304072960928594, + "grad_norm": 0.4509216845035553, + "learning_rate": 3.0573766043324294e-05, + "loss": 0.1935, + "step": 3310 + }, + { + "epoch": 0.34314436729194736, + "grad_norm": 0.5102453231811523, + "learning_rate": 3.056806690245681e-05, + "loss": 0.2255, + "step": 3311 + }, + { + "epoch": 0.3432480049746088, + "grad_norm": 0.5190200805664062, + "learning_rate": 3.056236657079597e-05, + "loss": 0.2351, + "step": 3312 + }, + { + "epoch": 0.3433516426572702, + "grad_norm": 0.4880451261997223, + "learning_rate": 3.0556665048984094e-05, + "loss": 0.2048, + "step": 3313 + }, + { + "epoch": 0.3434552803399316, + "grad_norm": 0.48179447650909424, + "learning_rate": 3.05509623376636e-05, + "loss": 0.2381, + "step": 3314 + }, + { + "epoch": 0.34355891802259303, + "grad_norm": 0.5229012966156006, + "learning_rate": 3.054525843747708e-05, + "loss": 0.2506, + "step": 3315 + }, + { + "epoch": 0.34366255570525445, + "grad_norm": 0.5210829377174377, + "learning_rate": 3.053955334906723e-05, + "loss": 0.2234, + "step": 3316 + }, + { + "epoch": 0.34376619338791586, + "grad_norm": 0.5315191745758057, + "learning_rate": 3.053384707307689e-05, + "loss": 0.246, + "step": 3317 + }, + { + "epoch": 0.3438698310705773, + "grad_norm": 0.5583591461181641, + "learning_rate": 3.052813961014904e-05, + "loss": 0.2363, + "step": 3318 + }, + { + "epoch": 0.3439734687532387, + "grad_norm": 0.4921298623085022, + "learning_rate": 3.0522430960926786e-05, + "loss": 0.2049, + "step": 3319 + }, + { + "epoch": 0.3440771064359001, + "grad_norm": 0.5304951667785645, + "learning_rate": 3.051672112605337e-05, + "loss": 0.2364, + "step": 3320 + }, + { + "epoch": 0.34418074411856153, + "grad_norm": 0.4428289234638214, + "learning_rate": 3.051101010617216e-05, + "loss": 0.2182, + "step": 3321 + }, + { + "epoch": 0.34428438180122295, + "grad_norm": 0.5289511680603027, + "learning_rate": 3.0505297901926672e-05, + "loss": 0.265, + "step": 3322 + }, + { + "epoch": 0.34438801948388437, + "grad_norm": 0.45246875286102295, + "learning_rate": 3.0499584513960553e-05, + "loss": 0.1823, + "step": 3323 + }, + { + "epoch": 0.34449165716654573, + "grad_norm": 0.5081989169120789, + "learning_rate": 3.0493869942917563e-05, + "loss": 0.2345, + "step": 3324 + }, + { + "epoch": 0.34459529484920715, + "grad_norm": 0.4332667589187622, + "learning_rate": 3.0488154189441627e-05, + "loss": 0.1824, + "step": 3325 + }, + { + "epoch": 0.34469893253186856, + "grad_norm": 0.5207022428512573, + "learning_rate": 3.0482437254176785e-05, + "loss": 0.1947, + "step": 3326 + }, + { + "epoch": 0.34480257021453, + "grad_norm": 0.5865523815155029, + "learning_rate": 3.04767191377672e-05, + "loss": 0.262, + "step": 3327 + }, + { + "epoch": 0.3449062078971914, + "grad_norm": 0.5545551776885986, + "learning_rate": 3.04709998408572e-05, + "loss": 0.2542, + "step": 3328 + }, + { + "epoch": 0.3450098455798528, + "grad_norm": 0.49805060029029846, + "learning_rate": 3.0465279364091204e-05, + "loss": 0.2334, + "step": 3329 + }, + { + "epoch": 0.34511348326251423, + "grad_norm": 0.4513069689273834, + "learning_rate": 3.0459557708113806e-05, + "loss": 0.1891, + "step": 3330 + }, + { + "epoch": 0.34521712094517565, + "grad_norm": 0.5274832248687744, + "learning_rate": 3.0453834873569703e-05, + "loss": 0.2066, + "step": 3331 + }, + { + "epoch": 0.34532075862783707, + "grad_norm": 0.5223729610443115, + "learning_rate": 3.0448110861103735e-05, + "loss": 0.2185, + "step": 3332 + }, + { + "epoch": 0.3454243963104985, + "grad_norm": 0.5173854827880859, + "learning_rate": 3.0442385671360876e-05, + "loss": 0.2362, + "step": 3333 + }, + { + "epoch": 0.3455280339931599, + "grad_norm": 0.5483968257904053, + "learning_rate": 3.043665930498623e-05, + "loss": 0.2287, + "step": 3334 + }, + { + "epoch": 0.3456316716758213, + "grad_norm": 0.549764096736908, + "learning_rate": 3.0430931762625052e-05, + "loss": 0.2352, + "step": 3335 + }, + { + "epoch": 0.34573530935848273, + "grad_norm": 0.5403971672058105, + "learning_rate": 3.0425203044922687e-05, + "loss": 0.2391, + "step": 3336 + }, + { + "epoch": 0.34583894704114415, + "grad_norm": 0.48038166761398315, + "learning_rate": 3.041947315252465e-05, + "loss": 0.2254, + "step": 3337 + }, + { + "epoch": 0.34594258472380557, + "grad_norm": 0.5125417113304138, + "learning_rate": 3.0413742086076577e-05, + "loss": 0.2023, + "step": 3338 + }, + { + "epoch": 0.346046222406467, + "grad_norm": 0.4226140081882477, + "learning_rate": 3.040800984622423e-05, + "loss": 0.1708, + "step": 3339 + }, + { + "epoch": 0.3461498600891284, + "grad_norm": 0.4800489842891693, + "learning_rate": 3.040227643361352e-05, + "loss": 0.2031, + "step": 3340 + }, + { + "epoch": 0.3462534977717898, + "grad_norm": 0.5459068417549133, + "learning_rate": 3.0396541848890472e-05, + "loss": 0.2332, + "step": 3341 + }, + { + "epoch": 0.34635713545445124, + "grad_norm": 0.470639169216156, + "learning_rate": 3.039080609270124e-05, + "loss": 0.2205, + "step": 3342 + }, + { + "epoch": 0.34646077313711265, + "grad_norm": 0.5183528065681458, + "learning_rate": 3.0385069165692137e-05, + "loss": 0.2205, + "step": 3343 + }, + { + "epoch": 0.34656441081977407, + "grad_norm": 0.583718478679657, + "learning_rate": 3.0379331068509587e-05, + "loss": 0.2338, + "step": 3344 + }, + { + "epoch": 0.3466680485024355, + "grad_norm": 0.4664159417152405, + "learning_rate": 3.0373591801800147e-05, + "loss": 0.2279, + "step": 3345 + }, + { + "epoch": 0.3467716861850969, + "grad_norm": 0.4871425926685333, + "learning_rate": 3.0367851366210507e-05, + "loss": 0.22, + "step": 3346 + }, + { + "epoch": 0.3468753238677583, + "grad_norm": 0.4640686511993408, + "learning_rate": 3.0362109762387488e-05, + "loss": 0.2021, + "step": 3347 + }, + { + "epoch": 0.34697896155041974, + "grad_norm": 0.4763514995574951, + "learning_rate": 3.0356366990978055e-05, + "loss": 0.2174, + "step": 3348 + }, + { + "epoch": 0.34708259923308116, + "grad_norm": 0.5456576347351074, + "learning_rate": 3.0350623052629284e-05, + "loss": 0.2332, + "step": 3349 + }, + { + "epoch": 0.3471862369157426, + "grad_norm": 0.4503636658191681, + "learning_rate": 3.0344877947988397e-05, + "loss": 0.2002, + "step": 3350 + }, + { + "epoch": 0.347289874598404, + "grad_norm": 0.47983184456825256, + "learning_rate": 3.0339131677702754e-05, + "loss": 0.22, + "step": 3351 + }, + { + "epoch": 0.3473935122810654, + "grad_norm": 0.6034876108169556, + "learning_rate": 3.033338424241982e-05, + "loss": 0.259, + "step": 3352 + }, + { + "epoch": 0.3474971499637268, + "grad_norm": 0.521963357925415, + "learning_rate": 3.0327635642787208e-05, + "loss": 0.2344, + "step": 3353 + }, + { + "epoch": 0.34760078764638824, + "grad_norm": 0.5032604932785034, + "learning_rate": 3.0321885879452673e-05, + "loss": 0.1985, + "step": 3354 + }, + { + "epoch": 0.34770442532904966, + "grad_norm": 0.5303429365158081, + "learning_rate": 3.0316134953064083e-05, + "loss": 0.2356, + "step": 3355 + }, + { + "epoch": 0.3478080630117111, + "grad_norm": 0.46349650621414185, + "learning_rate": 3.0310382864269442e-05, + "loss": 0.1997, + "step": 3356 + }, + { + "epoch": 0.3479117006943725, + "grad_norm": 0.5277966260910034, + "learning_rate": 3.0304629613716882e-05, + "loss": 0.2755, + "step": 3357 + }, + { + "epoch": 0.3480153383770339, + "grad_norm": 0.5392859578132629, + "learning_rate": 3.029887520205469e-05, + "loss": 0.2278, + "step": 3358 + }, + { + "epoch": 0.34811897605969533, + "grad_norm": 0.5537436008453369, + "learning_rate": 3.0293119629931235e-05, + "loss": 0.2596, + "step": 3359 + }, + { + "epoch": 0.34822261374235675, + "grad_norm": 0.5064177513122559, + "learning_rate": 3.0287362897995068e-05, + "loss": 0.2116, + "step": 3360 + }, + { + "epoch": 0.34832625142501816, + "grad_norm": 0.4268822968006134, + "learning_rate": 3.0281605006894837e-05, + "loss": 0.2007, + "step": 3361 + }, + { + "epoch": 0.3484298891076795, + "grad_norm": 0.5377349257469177, + "learning_rate": 3.027584595727934e-05, + "loss": 0.2513, + "step": 3362 + }, + { + "epoch": 0.34853352679034094, + "grad_norm": 0.5493626594543457, + "learning_rate": 3.02700857497975e-05, + "loss": 0.2513, + "step": 3363 + }, + { + "epoch": 0.34863716447300236, + "grad_norm": 0.4258686900138855, + "learning_rate": 3.0264324385098356e-05, + "loss": 0.1931, + "step": 3364 + }, + { + "epoch": 0.3487408021556638, + "grad_norm": 0.5182408094406128, + "learning_rate": 3.0258561863831103e-05, + "loss": 0.2278, + "step": 3365 + }, + { + "epoch": 0.3488444398383252, + "grad_norm": 0.5387588143348694, + "learning_rate": 3.025279818664504e-05, + "loss": 0.2599, + "step": 3366 + }, + { + "epoch": 0.3489480775209866, + "grad_norm": 0.4380408525466919, + "learning_rate": 3.024703335418962e-05, + "loss": 0.1948, + "step": 3367 + }, + { + "epoch": 0.349051715203648, + "grad_norm": 0.42744994163513184, + "learning_rate": 3.0241267367114404e-05, + "loss": 0.2135, + "step": 3368 + }, + { + "epoch": 0.34915535288630944, + "grad_norm": 0.5398333072662354, + "learning_rate": 3.0235500226069105e-05, + "loss": 0.2239, + "step": 3369 + }, + { + "epoch": 0.34925899056897086, + "grad_norm": 0.40965038537979126, + "learning_rate": 3.0229731931703558e-05, + "loss": 0.172, + "step": 3370 + }, + { + "epoch": 0.3493626282516323, + "grad_norm": 0.4280138611793518, + "learning_rate": 3.022396248466771e-05, + "loss": 0.1964, + "step": 3371 + }, + { + "epoch": 0.3494662659342937, + "grad_norm": 0.5184552073478699, + "learning_rate": 3.0218191885611666e-05, + "loss": 0.2325, + "step": 3372 + }, + { + "epoch": 0.3495699036169551, + "grad_norm": 0.4650012254714966, + "learning_rate": 3.0212420135185652e-05, + "loss": 0.2152, + "step": 3373 + }, + { + "epoch": 0.34967354129961653, + "grad_norm": 0.4905893802642822, + "learning_rate": 3.0206647234040006e-05, + "loss": 0.2161, + "step": 3374 + }, + { + "epoch": 0.34977717898227795, + "grad_norm": 0.4558361768722534, + "learning_rate": 3.0200873182825218e-05, + "loss": 0.1786, + "step": 3375 + }, + { + "epoch": 0.34988081666493936, + "grad_norm": 0.5681077837944031, + "learning_rate": 3.019509798219189e-05, + "loss": 0.2663, + "step": 3376 + }, + { + "epoch": 0.3499844543476008, + "grad_norm": 0.5157054662704468, + "learning_rate": 3.018932163279078e-05, + "loss": 0.2384, + "step": 3377 + }, + { + "epoch": 0.3500880920302622, + "grad_norm": 0.49062496423721313, + "learning_rate": 3.0183544135272744e-05, + "loss": 0.2436, + "step": 3378 + }, + { + "epoch": 0.3501917297129236, + "grad_norm": 0.4667421579360962, + "learning_rate": 3.017776549028879e-05, + "loss": 0.209, + "step": 3379 + }, + { + "epoch": 0.35029536739558503, + "grad_norm": 0.5137354135513306, + "learning_rate": 3.0171985698490034e-05, + "loss": 0.226, + "step": 3380 + }, + { + "epoch": 0.35039900507824645, + "grad_norm": 0.47683942317962646, + "learning_rate": 3.016620476052774e-05, + "loss": 0.1956, + "step": 3381 + }, + { + "epoch": 0.35050264276090787, + "grad_norm": 0.5153138637542725, + "learning_rate": 3.0160422677053306e-05, + "loss": 0.22, + "step": 3382 + }, + { + "epoch": 0.3506062804435693, + "grad_norm": 0.5153190493583679, + "learning_rate": 3.0154639448718242e-05, + "loss": 0.2453, + "step": 3383 + }, + { + "epoch": 0.3507099181262307, + "grad_norm": 0.47236472368240356, + "learning_rate": 3.014885507617418e-05, + "loss": 0.2101, + "step": 3384 + }, + { + "epoch": 0.3508135558088921, + "grad_norm": 0.4625147879123688, + "learning_rate": 3.0143069560072914e-05, + "loss": 0.2132, + "step": 3385 + }, + { + "epoch": 0.35091719349155354, + "grad_norm": 0.46439385414123535, + "learning_rate": 3.0137282901066332e-05, + "loss": 0.2111, + "step": 3386 + }, + { + "epoch": 0.35102083117421495, + "grad_norm": 0.4748516082763672, + "learning_rate": 3.0131495099806472e-05, + "loss": 0.2143, + "step": 3387 + }, + { + "epoch": 0.35112446885687637, + "grad_norm": 0.5125780701637268, + "learning_rate": 3.012570615694549e-05, + "loss": 0.2532, + "step": 3388 + }, + { + "epoch": 0.3512281065395378, + "grad_norm": 0.4637886583805084, + "learning_rate": 3.011991607313569e-05, + "loss": 0.2181, + "step": 3389 + }, + { + "epoch": 0.3513317442221992, + "grad_norm": 0.45637691020965576, + "learning_rate": 3.0114124849029474e-05, + "loss": 0.2108, + "step": 3390 + }, + { + "epoch": 0.3514353819048606, + "grad_norm": 0.4881954789161682, + "learning_rate": 3.0108332485279387e-05, + "loss": 0.2281, + "step": 3391 + }, + { + "epoch": 0.35153901958752204, + "grad_norm": 0.4588933289051056, + "learning_rate": 3.0102538982538116e-05, + "loss": 0.219, + "step": 3392 + }, + { + "epoch": 0.35164265727018346, + "grad_norm": 0.5305696725845337, + "learning_rate": 3.0096744341458452e-05, + "loss": 0.2512, + "step": 3393 + }, + { + "epoch": 0.3517462949528449, + "grad_norm": 0.5035028457641602, + "learning_rate": 3.0090948562693336e-05, + "loss": 0.2561, + "step": 3394 + }, + { + "epoch": 0.3518499326355063, + "grad_norm": 0.39127054810523987, + "learning_rate": 3.0085151646895823e-05, + "loss": 0.1747, + "step": 3395 + }, + { + "epoch": 0.3519535703181677, + "grad_norm": 0.46729522943496704, + "learning_rate": 3.007935359471909e-05, + "loss": 0.2006, + "step": 3396 + }, + { + "epoch": 0.3520572080008291, + "grad_norm": 0.419251024723053, + "learning_rate": 3.0073554406816474e-05, + "loss": 0.1925, + "step": 3397 + }, + { + "epoch": 0.35216084568349054, + "grad_norm": 0.46076175570487976, + "learning_rate": 3.0067754083841406e-05, + "loss": 0.2069, + "step": 3398 + }, + { + "epoch": 0.35226448336615196, + "grad_norm": 0.4998854696750641, + "learning_rate": 3.0061952626447458e-05, + "loss": 0.2189, + "step": 3399 + }, + { + "epoch": 0.3523681210488133, + "grad_norm": 0.4538286328315735, + "learning_rate": 3.0056150035288323e-05, + "loss": 0.2224, + "step": 3400 + }, + { + "epoch": 0.35247175873147474, + "grad_norm": 0.5140456557273865, + "learning_rate": 3.0050346311017842e-05, + "loss": 0.2425, + "step": 3401 + }, + { + "epoch": 0.35257539641413616, + "grad_norm": 0.40988877415657043, + "learning_rate": 3.004454145428996e-05, + "loss": 0.1665, + "step": 3402 + }, + { + "epoch": 0.3526790340967976, + "grad_norm": 0.43955564498901367, + "learning_rate": 3.003873546575876e-05, + "loss": 0.1954, + "step": 3403 + }, + { + "epoch": 0.352782671779459, + "grad_norm": 0.44053974747657776, + "learning_rate": 3.0032928346078453e-05, + "loss": 0.183, + "step": 3404 + }, + { + "epoch": 0.3528863094621204, + "grad_norm": 0.4559771418571472, + "learning_rate": 3.0027120095903378e-05, + "loss": 0.206, + "step": 3405 + }, + { + "epoch": 0.3529899471447818, + "grad_norm": 0.4658268988132477, + "learning_rate": 3.0021310715887996e-05, + "loss": 0.2101, + "step": 3406 + }, + { + "epoch": 0.35309358482744324, + "grad_norm": 0.44966891407966614, + "learning_rate": 3.0015500206686906e-05, + "loss": 0.2125, + "step": 3407 + }, + { + "epoch": 0.35319722251010466, + "grad_norm": 0.48129624128341675, + "learning_rate": 3.0009688568954818e-05, + "loss": 0.2433, + "step": 3408 + }, + { + "epoch": 0.3533008601927661, + "grad_norm": 0.4340304434299469, + "learning_rate": 3.0003875803346577e-05, + "loss": 0.2029, + "step": 3409 + }, + { + "epoch": 0.3534044978754275, + "grad_norm": 0.4602864384651184, + "learning_rate": 2.9998061910517172e-05, + "loss": 0.219, + "step": 3410 + }, + { + "epoch": 0.3535081355580889, + "grad_norm": 0.5180659890174866, + "learning_rate": 2.999224689112169e-05, + "loss": 0.1932, + "step": 3411 + }, + { + "epoch": 0.3536117732407503, + "grad_norm": 0.42697539925575256, + "learning_rate": 2.998643074581536e-05, + "loss": 0.189, + "step": 3412 + }, + { + "epoch": 0.35371541092341174, + "grad_norm": 0.46669483184814453, + "learning_rate": 2.9980613475253535e-05, + "loss": 0.2007, + "step": 3413 + }, + { + "epoch": 0.35381904860607316, + "grad_norm": 0.6057338714599609, + "learning_rate": 2.9974795080091708e-05, + "loss": 0.2793, + "step": 3414 + }, + { + "epoch": 0.3539226862887346, + "grad_norm": 0.5159379243850708, + "learning_rate": 2.996897556098547e-05, + "loss": 0.2276, + "step": 3415 + }, + { + "epoch": 0.354026323971396, + "grad_norm": 0.5511611700057983, + "learning_rate": 2.996315491859056e-05, + "loss": 0.2427, + "step": 3416 + }, + { + "epoch": 0.3541299616540574, + "grad_norm": 0.4959019124507904, + "learning_rate": 2.9957333153562847e-05, + "loss": 0.2244, + "step": 3417 + }, + { + "epoch": 0.35423359933671883, + "grad_norm": 0.5330846905708313, + "learning_rate": 2.9951510266558314e-05, + "loss": 0.2392, + "step": 3418 + }, + { + "epoch": 0.35433723701938025, + "grad_norm": 0.49244412779808044, + "learning_rate": 2.9945686258233073e-05, + "loss": 0.2299, + "step": 3419 + }, + { + "epoch": 0.35444087470204166, + "grad_norm": 0.47341403365135193, + "learning_rate": 2.9939861129243368e-05, + "loss": 0.1775, + "step": 3420 + }, + { + "epoch": 0.3545445123847031, + "grad_norm": 0.5534030199050903, + "learning_rate": 2.9934034880245554e-05, + "loss": 0.2475, + "step": 3421 + }, + { + "epoch": 0.3546481500673645, + "grad_norm": 0.488478422164917, + "learning_rate": 2.992820751189614e-05, + "loss": 0.2421, + "step": 3422 + }, + { + "epoch": 0.3547517877500259, + "grad_norm": 0.5435716509819031, + "learning_rate": 2.992237902485174e-05, + "loss": 0.2221, + "step": 3423 + }, + { + "epoch": 0.35485542543268733, + "grad_norm": 0.4245526194572449, + "learning_rate": 2.9916549419769086e-05, + "loss": 0.1846, + "step": 3424 + }, + { + "epoch": 0.35495906311534875, + "grad_norm": 0.5004781484603882, + "learning_rate": 2.991071869730507e-05, + "loss": 0.2062, + "step": 3425 + }, + { + "epoch": 0.35506270079801017, + "grad_norm": 0.5690580010414124, + "learning_rate": 2.990488685811667e-05, + "loss": 0.2647, + "step": 3426 + }, + { + "epoch": 0.3551663384806716, + "grad_norm": 0.5165721774101257, + "learning_rate": 2.989905390286102e-05, + "loss": 0.2444, + "step": 3427 + }, + { + "epoch": 0.355269976163333, + "grad_norm": 0.5091362595558167, + "learning_rate": 2.989321983219536e-05, + "loss": 0.2333, + "step": 3428 + }, + { + "epoch": 0.3553736138459944, + "grad_norm": 0.49818992614746094, + "learning_rate": 2.988738464677707e-05, + "loss": 0.2068, + "step": 3429 + }, + { + "epoch": 0.35547725152865584, + "grad_norm": 0.4385213553905487, + "learning_rate": 2.9881548347263654e-05, + "loss": 0.2095, + "step": 3430 + }, + { + "epoch": 0.35558088921131725, + "grad_norm": 0.5639091730117798, + "learning_rate": 2.9875710934312723e-05, + "loss": 0.2368, + "step": 3431 + }, + { + "epoch": 0.35568452689397867, + "grad_norm": 0.5173773765563965, + "learning_rate": 2.986987240858204e-05, + "loss": 0.2211, + "step": 3432 + }, + { + "epoch": 0.3557881645766401, + "grad_norm": 0.4789592921733856, + "learning_rate": 2.986403277072948e-05, + "loss": 0.2244, + "step": 3433 + }, + { + "epoch": 0.3558918022593015, + "grad_norm": 0.5333912372589111, + "learning_rate": 2.985819202141304e-05, + "loss": 0.2436, + "step": 3434 + }, + { + "epoch": 0.3559954399419629, + "grad_norm": 0.5361971259117126, + "learning_rate": 2.9852350161290844e-05, + "loss": 0.2124, + "step": 3435 + }, + { + "epoch": 0.35609907762462434, + "grad_norm": 0.5212494134902954, + "learning_rate": 2.984650719102115e-05, + "loss": 0.2428, + "step": 3436 + }, + { + "epoch": 0.35620271530728576, + "grad_norm": 0.5393432974815369, + "learning_rate": 2.9840663111262334e-05, + "loss": 0.2282, + "step": 3437 + }, + { + "epoch": 0.3563063529899471, + "grad_norm": 0.506872296333313, + "learning_rate": 2.9834817922672892e-05, + "loss": 0.2369, + "step": 3438 + }, + { + "epoch": 0.35640999067260853, + "grad_norm": 0.4239386022090912, + "learning_rate": 2.9828971625911465e-05, + "loss": 0.1886, + "step": 3439 + }, + { + "epoch": 0.35651362835526995, + "grad_norm": 0.5943200588226318, + "learning_rate": 2.9823124221636784e-05, + "loss": 0.2464, + "step": 3440 + }, + { + "epoch": 0.35661726603793137, + "grad_norm": 0.4862738847732544, + "learning_rate": 2.9817275710507735e-05, + "loss": 0.2484, + "step": 3441 + }, + { + "epoch": 0.3567209037205928, + "grad_norm": 0.483430415391922, + "learning_rate": 2.981142609318333e-05, + "loss": 0.2233, + "step": 3442 + }, + { + "epoch": 0.3568245414032542, + "grad_norm": 0.45114192366600037, + "learning_rate": 2.980557537032268e-05, + "loss": 0.1913, + "step": 3443 + }, + { + "epoch": 0.3569281790859156, + "grad_norm": 0.510878324508667, + "learning_rate": 2.979972354258504e-05, + "loss": 0.2194, + "step": 3444 + }, + { + "epoch": 0.35703181676857704, + "grad_norm": 0.4664299190044403, + "learning_rate": 2.979387061062978e-05, + "loss": 0.2104, + "step": 3445 + }, + { + "epoch": 0.35713545445123845, + "grad_norm": 0.5411281585693359, + "learning_rate": 2.9788016575116412e-05, + "loss": 0.2701, + "step": 3446 + }, + { + "epoch": 0.35723909213389987, + "grad_norm": 0.41982153058052063, + "learning_rate": 2.978216143670455e-05, + "loss": 0.1865, + "step": 3447 + }, + { + "epoch": 0.3573427298165613, + "grad_norm": 0.45480507612228394, + "learning_rate": 2.977630519605394e-05, + "loss": 0.2287, + "step": 3448 + }, + { + "epoch": 0.3574463674992227, + "grad_norm": 0.4503597319126129, + "learning_rate": 2.9770447853824468e-05, + "loss": 0.2002, + "step": 3449 + }, + { + "epoch": 0.3575500051818841, + "grad_norm": 0.5262497067451477, + "learning_rate": 2.976458941067611e-05, + "loss": 0.2392, + "step": 3450 + }, + { + "epoch": 0.35765364286454554, + "grad_norm": 0.42646539211273193, + "learning_rate": 2.9758729867268998e-05, + "loss": 0.2063, + "step": 3451 + }, + { + "epoch": 0.35775728054720696, + "grad_norm": 0.5226529240608215, + "learning_rate": 2.975286922426338e-05, + "loss": 0.2364, + "step": 3452 + }, + { + "epoch": 0.3578609182298684, + "grad_norm": 0.49958691000938416, + "learning_rate": 2.9747007482319616e-05, + "loss": 0.231, + "step": 3453 + }, + { + "epoch": 0.3579645559125298, + "grad_norm": 0.44578859210014343, + "learning_rate": 2.9741144642098204e-05, + "loss": 0.2012, + "step": 3454 + }, + { + "epoch": 0.3580681935951912, + "grad_norm": 0.49792999029159546, + "learning_rate": 2.9735280704259755e-05, + "loss": 0.2201, + "step": 3455 + }, + { + "epoch": 0.3581718312778526, + "grad_norm": 0.4345203638076782, + "learning_rate": 2.9729415669465004e-05, + "loss": 0.1937, + "step": 3456 + }, + { + "epoch": 0.35827546896051404, + "grad_norm": 0.48358073830604553, + "learning_rate": 2.9723549538374824e-05, + "loss": 0.2582, + "step": 3457 + }, + { + "epoch": 0.35837910664317546, + "grad_norm": 0.488992303609848, + "learning_rate": 2.9717682311650204e-05, + "loss": 0.2291, + "step": 3458 + }, + { + "epoch": 0.3584827443258369, + "grad_norm": 0.5303345918655396, + "learning_rate": 2.9711813989952242e-05, + "loss": 0.2248, + "step": 3459 + }, + { + "epoch": 0.3585863820084983, + "grad_norm": 0.4908701181411743, + "learning_rate": 2.9705944573942173e-05, + "loss": 0.2136, + "step": 3460 + }, + { + "epoch": 0.3586900196911597, + "grad_norm": 0.4816449284553528, + "learning_rate": 2.9700074064281367e-05, + "loss": 0.2352, + "step": 3461 + }, + { + "epoch": 0.35879365737382113, + "grad_norm": 0.4632469713687897, + "learning_rate": 2.9694202461631282e-05, + "loss": 0.2046, + "step": 3462 + }, + { + "epoch": 0.35889729505648255, + "grad_norm": 0.49483367800712585, + "learning_rate": 2.968832976665354e-05, + "loss": 0.233, + "step": 3463 + }, + { + "epoch": 0.35900093273914396, + "grad_norm": 0.45997875928878784, + "learning_rate": 2.9682455980009862e-05, + "loss": 0.1865, + "step": 3464 + }, + { + "epoch": 0.3591045704218054, + "grad_norm": 0.449470192193985, + "learning_rate": 2.96765811023621e-05, + "loss": 0.1967, + "step": 3465 + }, + { + "epoch": 0.3592082081044668, + "grad_norm": 0.5391585230827332, + "learning_rate": 2.9670705134372216e-05, + "loss": 0.2325, + "step": 3466 + }, + { + "epoch": 0.3593118457871282, + "grad_norm": 0.4918924570083618, + "learning_rate": 2.9664828076702307e-05, + "loss": 0.2164, + "step": 3467 + }, + { + "epoch": 0.35941548346978963, + "grad_norm": 0.5079565048217773, + "learning_rate": 2.9658949930014606e-05, + "loss": 0.2176, + "step": 3468 + }, + { + "epoch": 0.35951912115245105, + "grad_norm": 0.44536030292510986, + "learning_rate": 2.9653070694971435e-05, + "loss": 0.1739, + "step": 3469 + }, + { + "epoch": 0.35962275883511247, + "grad_norm": 0.44883498549461365, + "learning_rate": 2.9647190372235265e-05, + "loss": 0.1945, + "step": 3470 + }, + { + "epoch": 0.3597263965177739, + "grad_norm": 0.510392427444458, + "learning_rate": 2.9641308962468676e-05, + "loss": 0.2237, + "step": 3471 + }, + { + "epoch": 0.3598300342004353, + "grad_norm": 0.47959601879119873, + "learning_rate": 2.9635426466334384e-05, + "loss": 0.1916, + "step": 3472 + }, + { + "epoch": 0.3599336718830967, + "grad_norm": 0.5199118256568909, + "learning_rate": 2.962954288449522e-05, + "loss": 0.2417, + "step": 3473 + }, + { + "epoch": 0.36003730956575813, + "grad_norm": 0.5270677208900452, + "learning_rate": 2.9623658217614132e-05, + "loss": 0.2059, + "step": 3474 + }, + { + "epoch": 0.36014094724841955, + "grad_norm": 0.5287850499153137, + "learning_rate": 2.9617772466354192e-05, + "loss": 0.2272, + "step": 3475 + }, + { + "epoch": 0.3602445849310809, + "grad_norm": 0.481442928314209, + "learning_rate": 2.9611885631378602e-05, + "loss": 0.2097, + "step": 3476 + }, + { + "epoch": 0.36034822261374233, + "grad_norm": 0.4829895794391632, + "learning_rate": 2.9605997713350686e-05, + "loss": 0.2042, + "step": 3477 + }, + { + "epoch": 0.36045186029640375, + "grad_norm": 0.49132823944091797, + "learning_rate": 2.960010871293388e-05, + "loss": 0.2404, + "step": 3478 + }, + { + "epoch": 0.36055549797906516, + "grad_norm": 0.4517565369606018, + "learning_rate": 2.9594218630791746e-05, + "loss": 0.1784, + "step": 3479 + }, + { + "epoch": 0.3606591356617266, + "grad_norm": 0.5756237506866455, + "learning_rate": 2.958832746758797e-05, + "loss": 0.2601, + "step": 3480 + }, + { + "epoch": 0.360762773344388, + "grad_norm": 0.47717320919036865, + "learning_rate": 2.9582435223986363e-05, + "loss": 0.218, + "step": 3481 + }, + { + "epoch": 0.3608664110270494, + "grad_norm": 0.515692412853241, + "learning_rate": 2.9576541900650847e-05, + "loss": 0.2357, + "step": 3482 + }, + { + "epoch": 0.36097004870971083, + "grad_norm": 0.5456742644309998, + "learning_rate": 2.9570647498245484e-05, + "loss": 0.2441, + "step": 3483 + }, + { + "epoch": 0.36107368639237225, + "grad_norm": 0.5447889566421509, + "learning_rate": 2.9564752017434432e-05, + "loss": 0.2426, + "step": 3484 + }, + { + "epoch": 0.36117732407503367, + "grad_norm": 0.5068838596343994, + "learning_rate": 2.955885545888199e-05, + "loss": 0.2359, + "step": 3485 + }, + { + "epoch": 0.3612809617576951, + "grad_norm": 0.5069774389266968, + "learning_rate": 2.955295782325258e-05, + "loss": 0.2234, + "step": 3486 + }, + { + "epoch": 0.3613845994403565, + "grad_norm": 0.48465102910995483, + "learning_rate": 2.954705911121073e-05, + "loss": 0.2047, + "step": 3487 + }, + { + "epoch": 0.3614882371230179, + "grad_norm": 0.44714057445526123, + "learning_rate": 2.95411593234211e-05, + "loss": 0.1955, + "step": 3488 + }, + { + "epoch": 0.36159187480567934, + "grad_norm": 0.46095696091651917, + "learning_rate": 2.9535258460548473e-05, + "loss": 0.1973, + "step": 3489 + }, + { + "epoch": 0.36169551248834075, + "grad_norm": 0.5460710525512695, + "learning_rate": 2.9529356523257742e-05, + "loss": 0.2626, + "step": 3490 + }, + { + "epoch": 0.36179915017100217, + "grad_norm": 0.49723753333091736, + "learning_rate": 2.952345351221393e-05, + "loss": 0.2202, + "step": 3491 + }, + { + "epoch": 0.3619027878536636, + "grad_norm": 0.476315975189209, + "learning_rate": 2.9517549428082185e-05, + "loss": 0.2361, + "step": 3492 + }, + { + "epoch": 0.362006425536325, + "grad_norm": 0.4768473207950592, + "learning_rate": 2.951164427152777e-05, + "loss": 0.2027, + "step": 3493 + }, + { + "epoch": 0.3621100632189864, + "grad_norm": 0.49800312519073486, + "learning_rate": 2.9505738043216053e-05, + "loss": 0.2222, + "step": 3494 + }, + { + "epoch": 0.36221370090164784, + "grad_norm": 0.48882049322128296, + "learning_rate": 2.9499830743812557e-05, + "loss": 0.215, + "step": 3495 + }, + { + "epoch": 0.36231733858430926, + "grad_norm": 0.5254577994346619, + "learning_rate": 2.94939223739829e-05, + "loss": 0.2355, + "step": 3496 + }, + { + "epoch": 0.3624209762669707, + "grad_norm": 0.46470046043395996, + "learning_rate": 2.9488012934392828e-05, + "loss": 0.2513, + "step": 3497 + }, + { + "epoch": 0.3625246139496321, + "grad_norm": 0.5480517148971558, + "learning_rate": 2.948210242570821e-05, + "loss": 0.2706, + "step": 3498 + }, + { + "epoch": 0.3626282516322935, + "grad_norm": 0.4826633930206299, + "learning_rate": 2.9476190848595032e-05, + "loss": 0.2083, + "step": 3499 + }, + { + "epoch": 0.3627318893149549, + "grad_norm": 0.45155906677246094, + "learning_rate": 2.947027820371939e-05, + "loss": 0.2236, + "step": 3500 + }, + { + "epoch": 0.36283552699761634, + "grad_norm": 0.5023136138916016, + "learning_rate": 2.946436449174753e-05, + "loss": 0.2225, + "step": 3501 + }, + { + "epoch": 0.36293916468027776, + "grad_norm": 0.44159698486328125, + "learning_rate": 2.9458449713345795e-05, + "loss": 0.1906, + "step": 3502 + }, + { + "epoch": 0.3630428023629392, + "grad_norm": 0.5016050934791565, + "learning_rate": 2.9452533869180643e-05, + "loss": 0.2195, + "step": 3503 + }, + { + "epoch": 0.3631464400456006, + "grad_norm": 0.4987817704677582, + "learning_rate": 2.944661695991867e-05, + "loss": 0.2113, + "step": 3504 + }, + { + "epoch": 0.363250077728262, + "grad_norm": 0.4627358615398407, + "learning_rate": 2.9440698986226585e-05, + "loss": 0.2135, + "step": 3505 + }, + { + "epoch": 0.36335371541092343, + "grad_norm": 0.45628219842910767, + "learning_rate": 2.9434779948771214e-05, + "loss": 0.1765, + "step": 3506 + }, + { + "epoch": 0.36345735309358485, + "grad_norm": 0.43228745460510254, + "learning_rate": 2.9428859848219505e-05, + "loss": 0.1921, + "step": 3507 + }, + { + "epoch": 0.36356099077624626, + "grad_norm": 0.47669291496276855, + "learning_rate": 2.9422938685238524e-05, + "loss": 0.2206, + "step": 3508 + }, + { + "epoch": 0.3636646284589077, + "grad_norm": 0.5110993385314941, + "learning_rate": 2.9417016460495466e-05, + "loss": 0.2518, + "step": 3509 + }, + { + "epoch": 0.3637682661415691, + "grad_norm": 0.5036219358444214, + "learning_rate": 2.9411093174657622e-05, + "loss": 0.2165, + "step": 3510 + }, + { + "epoch": 0.3638719038242305, + "grad_norm": 0.4140499234199524, + "learning_rate": 2.9405168828392436e-05, + "loss": 0.1599, + "step": 3511 + }, + { + "epoch": 0.36397554150689193, + "grad_norm": 0.5629992485046387, + "learning_rate": 2.9399243422367445e-05, + "loss": 0.2612, + "step": 3512 + }, + { + "epoch": 0.36407917918955335, + "grad_norm": 0.46416953206062317, + "learning_rate": 2.9393316957250317e-05, + "loss": 0.1849, + "step": 3513 + }, + { + "epoch": 0.3641828168722147, + "grad_norm": 0.5125628113746643, + "learning_rate": 2.9387389433708837e-05, + "loss": 0.2172, + "step": 3514 + }, + { + "epoch": 0.3642864545548761, + "grad_norm": 0.5483174920082092, + "learning_rate": 2.9381460852410906e-05, + "loss": 0.2493, + "step": 3515 + }, + { + "epoch": 0.36439009223753754, + "grad_norm": 0.5044114589691162, + "learning_rate": 2.9375531214024553e-05, + "loss": 0.2402, + "step": 3516 + }, + { + "epoch": 0.36449372992019896, + "grad_norm": 0.4661901593208313, + "learning_rate": 2.9369600519217916e-05, + "loss": 0.1922, + "step": 3517 + }, + { + "epoch": 0.3645973676028604, + "grad_norm": 0.5807094573974609, + "learning_rate": 2.9363668768659263e-05, + "loss": 0.2717, + "step": 3518 + }, + { + "epoch": 0.3647010052855218, + "grad_norm": 0.5298463106155396, + "learning_rate": 2.9357735963016963e-05, + "loss": 0.1951, + "step": 3519 + }, + { + "epoch": 0.3648046429681832, + "grad_norm": 0.5299417972564697, + "learning_rate": 2.935180210295952e-05, + "loss": 0.2032, + "step": 3520 + }, + { + "epoch": 0.36490828065084463, + "grad_norm": 0.5053216814994812, + "learning_rate": 2.9345867189155562e-05, + "loss": 0.2035, + "step": 3521 + }, + { + "epoch": 0.36501191833350605, + "grad_norm": 0.48312923312187195, + "learning_rate": 2.933993122227381e-05, + "loss": 0.206, + "step": 3522 + }, + { + "epoch": 0.36511555601616746, + "grad_norm": 0.5141869783401489, + "learning_rate": 2.933399420298313e-05, + "loss": 0.222, + "step": 3523 + }, + { + "epoch": 0.3652191936988289, + "grad_norm": 0.4666402339935303, + "learning_rate": 2.932805613195249e-05, + "loss": 0.1896, + "step": 3524 + }, + { + "epoch": 0.3653228313814903, + "grad_norm": 0.4823445975780487, + "learning_rate": 2.9322117009850988e-05, + "loss": 0.1735, + "step": 3525 + }, + { + "epoch": 0.3654264690641517, + "grad_norm": 0.5427972078323364, + "learning_rate": 2.9316176837347834e-05, + "loss": 0.2418, + "step": 3526 + }, + { + "epoch": 0.36553010674681313, + "grad_norm": 0.4606572091579437, + "learning_rate": 2.9310235615112357e-05, + "loss": 0.1913, + "step": 3527 + }, + { + "epoch": 0.36563374442947455, + "grad_norm": 0.3660353422164917, + "learning_rate": 2.9304293343814005e-05, + "loss": 0.1392, + "step": 3528 + }, + { + "epoch": 0.36573738211213597, + "grad_norm": 0.5093293190002441, + "learning_rate": 2.929835002412234e-05, + "loss": 0.2118, + "step": 3529 + }, + { + "epoch": 0.3658410197947974, + "grad_norm": 0.5066865682601929, + "learning_rate": 2.9292405656707044e-05, + "loss": 0.2008, + "step": 3530 + }, + { + "epoch": 0.3659446574774588, + "grad_norm": 0.6015675663948059, + "learning_rate": 2.928646024223793e-05, + "loss": 0.2406, + "step": 3531 + }, + { + "epoch": 0.3660482951601202, + "grad_norm": 0.6050959229469299, + "learning_rate": 2.9280513781384913e-05, + "loss": 0.2522, + "step": 3532 + }, + { + "epoch": 0.36615193284278164, + "grad_norm": 0.4754674434661865, + "learning_rate": 2.9274566274818027e-05, + "loss": 0.2329, + "step": 3533 + }, + { + "epoch": 0.36625557052544305, + "grad_norm": 0.527370274066925, + "learning_rate": 2.9268617723207433e-05, + "loss": 0.2376, + "step": 3534 + }, + { + "epoch": 0.36635920820810447, + "grad_norm": 0.4930630624294281, + "learning_rate": 2.92626681272234e-05, + "loss": 0.1882, + "step": 3535 + }, + { + "epoch": 0.3664628458907659, + "grad_norm": 0.5467690825462341, + "learning_rate": 2.9256717487536322e-05, + "loss": 0.2391, + "step": 3536 + }, + { + "epoch": 0.3665664835734273, + "grad_norm": 0.5535871386528015, + "learning_rate": 2.9250765804816712e-05, + "loss": 0.264, + "step": 3537 + }, + { + "epoch": 0.3666701212560887, + "grad_norm": 0.4704132676124573, + "learning_rate": 2.9244813079735186e-05, + "loss": 0.2198, + "step": 3538 + }, + { + "epoch": 0.36677375893875014, + "grad_norm": 0.4724007248878479, + "learning_rate": 2.9238859312962496e-05, + "loss": 0.2065, + "step": 3539 + }, + { + "epoch": 0.36687739662141156, + "grad_norm": 0.5545615553855896, + "learning_rate": 2.9232904505169498e-05, + "loss": 0.2557, + "step": 3540 + }, + { + "epoch": 0.366981034304073, + "grad_norm": 0.4997697174549103, + "learning_rate": 2.9226948657027178e-05, + "loss": 0.2299, + "step": 3541 + }, + { + "epoch": 0.3670846719867344, + "grad_norm": 0.4856959581375122, + "learning_rate": 2.9220991769206617e-05, + "loss": 0.1989, + "step": 3542 + }, + { + "epoch": 0.3671883096693958, + "grad_norm": 0.5277553796768188, + "learning_rate": 2.9215033842379048e-05, + "loss": 0.2166, + "step": 3543 + }, + { + "epoch": 0.3672919473520572, + "grad_norm": 0.5107161998748779, + "learning_rate": 2.920907487721579e-05, + "loss": 0.2279, + "step": 3544 + }, + { + "epoch": 0.36739558503471864, + "grad_norm": 0.44258975982666016, + "learning_rate": 2.920311487438828e-05, + "loss": 0.1818, + "step": 3545 + }, + { + "epoch": 0.36749922271738006, + "grad_norm": 0.4603618085384369, + "learning_rate": 2.9197153834568106e-05, + "loss": 0.1815, + "step": 3546 + }, + { + "epoch": 0.3676028604000415, + "grad_norm": 0.47472718358039856, + "learning_rate": 2.919119175842693e-05, + "loss": 0.2362, + "step": 3547 + }, + { + "epoch": 0.3677064980827029, + "grad_norm": 0.4437291920185089, + "learning_rate": 2.9185228646636553e-05, + "loss": 0.1845, + "step": 3548 + }, + { + "epoch": 0.3678101357653643, + "grad_norm": 0.4785867929458618, + "learning_rate": 2.9179264499868893e-05, + "loss": 0.2044, + "step": 3549 + }, + { + "epoch": 0.3679137734480257, + "grad_norm": 0.5831817388534546, + "learning_rate": 2.917329931879598e-05, + "loss": 0.261, + "step": 3550 + }, + { + "epoch": 0.36801741113068714, + "grad_norm": 0.45431435108184814, + "learning_rate": 2.9167333104089956e-05, + "loss": 0.2025, + "step": 3551 + }, + { + "epoch": 0.3681210488133485, + "grad_norm": 0.5137087106704712, + "learning_rate": 2.916136585642309e-05, + "loss": 0.208, + "step": 3552 + }, + { + "epoch": 0.3682246864960099, + "grad_norm": 0.5459833741188049, + "learning_rate": 2.9155397576467765e-05, + "loss": 0.2807, + "step": 3553 + }, + { + "epoch": 0.36832832417867134, + "grad_norm": 0.470268189907074, + "learning_rate": 2.9149428264896465e-05, + "loss": 0.2268, + "step": 3554 + }, + { + "epoch": 0.36843196186133276, + "grad_norm": 0.48998451232910156, + "learning_rate": 2.9143457922381816e-05, + "loss": 0.2062, + "step": 3555 + }, + { + "epoch": 0.3685355995439942, + "grad_norm": 0.5100763440132141, + "learning_rate": 2.9137486549596544e-05, + "loss": 0.2345, + "step": 3556 + }, + { + "epoch": 0.3686392372266556, + "grad_norm": 0.5126616954803467, + "learning_rate": 2.913151414721349e-05, + "loss": 0.2055, + "step": 3557 + }, + { + "epoch": 0.368742874909317, + "grad_norm": 0.43402886390686035, + "learning_rate": 2.9125540715905613e-05, + "loss": 0.1838, + "step": 3558 + }, + { + "epoch": 0.3688465125919784, + "grad_norm": 0.4938167631626129, + "learning_rate": 2.9119566256346e-05, + "loss": 0.2162, + "step": 3559 + }, + { + "epoch": 0.36895015027463984, + "grad_norm": 0.4344816207885742, + "learning_rate": 2.911359076920784e-05, + "loss": 0.1651, + "step": 3560 + }, + { + "epoch": 0.36905378795730126, + "grad_norm": 0.4878370463848114, + "learning_rate": 2.910761425516443e-05, + "loss": 0.2244, + "step": 3561 + }, + { + "epoch": 0.3691574256399627, + "grad_norm": 0.5052449107170105, + "learning_rate": 2.9101636714889215e-05, + "loss": 0.2288, + "step": 3562 + }, + { + "epoch": 0.3692610633226241, + "grad_norm": 0.5431941151618958, + "learning_rate": 2.9095658149055713e-05, + "loss": 0.2603, + "step": 3563 + }, + { + "epoch": 0.3693647010052855, + "grad_norm": 0.5375214219093323, + "learning_rate": 2.9089678558337593e-05, + "loss": 0.2502, + "step": 3564 + }, + { + "epoch": 0.36946833868794693, + "grad_norm": 0.5288280844688416, + "learning_rate": 2.908369794340863e-05, + "loss": 0.2508, + "step": 3565 + }, + { + "epoch": 0.36957197637060835, + "grad_norm": 0.5204939842224121, + "learning_rate": 2.9077716304942698e-05, + "loss": 0.2162, + "step": 3566 + }, + { + "epoch": 0.36967561405326976, + "grad_norm": 0.5446228981018066, + "learning_rate": 2.9071733643613806e-05, + "loss": 0.2541, + "step": 3567 + }, + { + "epoch": 0.3697792517359312, + "grad_norm": 0.5477862358093262, + "learning_rate": 2.9065749960096066e-05, + "loss": 0.2338, + "step": 3568 + }, + { + "epoch": 0.3698828894185926, + "grad_norm": 0.5047724843025208, + "learning_rate": 2.9059765255063718e-05, + "loss": 0.2153, + "step": 3569 + }, + { + "epoch": 0.369986527101254, + "grad_norm": 0.571573793888092, + "learning_rate": 2.90537795291911e-05, + "loss": 0.2785, + "step": 3570 + }, + { + "epoch": 0.37009016478391543, + "grad_norm": 0.4648495018482208, + "learning_rate": 2.9047792783152685e-05, + "loss": 0.2054, + "step": 3571 + }, + { + "epoch": 0.37019380246657685, + "grad_norm": 0.5595817565917969, + "learning_rate": 2.9041805017623043e-05, + "loss": 0.2432, + "step": 3572 + }, + { + "epoch": 0.37029744014923827, + "grad_norm": 0.4277222752571106, + "learning_rate": 2.9035816233276866e-05, + "loss": 0.1885, + "step": 3573 + }, + { + "epoch": 0.3704010778318997, + "grad_norm": 0.41399163007736206, + "learning_rate": 2.902982643078896e-05, + "loss": 0.1749, + "step": 3574 + }, + { + "epoch": 0.3705047155145611, + "grad_norm": 0.49881187081336975, + "learning_rate": 2.9023835610834253e-05, + "loss": 0.2281, + "step": 3575 + }, + { + "epoch": 0.3706083531972225, + "grad_norm": 0.5957393646240234, + "learning_rate": 2.9017843774087774e-05, + "loss": 0.2337, + "step": 3576 + }, + { + "epoch": 0.37071199087988393, + "grad_norm": 0.5654463171958923, + "learning_rate": 2.901185092122468e-05, + "loss": 0.2728, + "step": 3577 + }, + { + "epoch": 0.37081562856254535, + "grad_norm": 0.4537357985973358, + "learning_rate": 2.9005857052920232e-05, + "loss": 0.2025, + "step": 3578 + }, + { + "epoch": 0.37091926624520677, + "grad_norm": 0.5320644378662109, + "learning_rate": 2.8999862169849807e-05, + "loss": 0.2283, + "step": 3579 + }, + { + "epoch": 0.3710229039278682, + "grad_norm": 0.5118852257728577, + "learning_rate": 2.8993866272688905e-05, + "loss": 0.2241, + "step": 3580 + }, + { + "epoch": 0.3711265416105296, + "grad_norm": 0.48598912358283997, + "learning_rate": 2.898786936211314e-05, + "loss": 0.2328, + "step": 3581 + }, + { + "epoch": 0.371230179293191, + "grad_norm": 0.44688498973846436, + "learning_rate": 2.898187143879822e-05, + "loss": 0.1865, + "step": 3582 + }, + { + "epoch": 0.37133381697585244, + "grad_norm": 0.5546681880950928, + "learning_rate": 2.897587250341999e-05, + "loss": 0.2309, + "step": 3583 + }, + { + "epoch": 0.37143745465851385, + "grad_norm": 0.48288416862487793, + "learning_rate": 2.89698725566544e-05, + "loss": 0.2094, + "step": 3584 + }, + { + "epoch": 0.37154109234117527, + "grad_norm": 0.47836169600486755, + "learning_rate": 2.8963871599177517e-05, + "loss": 0.2109, + "step": 3585 + }, + { + "epoch": 0.3716447300238367, + "grad_norm": 0.4424706995487213, + "learning_rate": 2.8957869631665514e-05, + "loss": 0.1797, + "step": 3586 + }, + { + "epoch": 0.3717483677064981, + "grad_norm": 0.48061931133270264, + "learning_rate": 2.8951866654794683e-05, + "loss": 0.2098, + "step": 3587 + }, + { + "epoch": 0.3718520053891595, + "grad_norm": 0.4864051043987274, + "learning_rate": 2.8945862669241443e-05, + "loss": 0.1919, + "step": 3588 + }, + { + "epoch": 0.37195564307182094, + "grad_norm": 0.4833623170852661, + "learning_rate": 2.893985767568229e-05, + "loss": 0.2074, + "step": 3589 + }, + { + "epoch": 0.3720592807544823, + "grad_norm": 0.49619123339653015, + "learning_rate": 2.8933851674793883e-05, + "loss": 0.1877, + "step": 3590 + }, + { + "epoch": 0.3721629184371437, + "grad_norm": 0.4423529803752899, + "learning_rate": 2.8927844667252953e-05, + "loss": 0.1977, + "step": 3591 + }, + { + "epoch": 0.37226655611980514, + "grad_norm": 0.38620245456695557, + "learning_rate": 2.8921836653736366e-05, + "loss": 0.1848, + "step": 3592 + }, + { + "epoch": 0.37237019380246655, + "grad_norm": 0.4467632472515106, + "learning_rate": 2.8915827634921094e-05, + "loss": 0.1845, + "step": 3593 + }, + { + "epoch": 0.37247383148512797, + "grad_norm": 0.4751870930194855, + "learning_rate": 2.890981761148422e-05, + "loss": 0.2199, + "step": 3594 + }, + { + "epoch": 0.3725774691677894, + "grad_norm": 0.466667115688324, + "learning_rate": 2.890380658410295e-05, + "loss": 0.2033, + "step": 3595 + }, + { + "epoch": 0.3726811068504508, + "grad_norm": 0.48820653557777405, + "learning_rate": 2.8897794553454597e-05, + "loss": 0.2083, + "step": 3596 + }, + { + "epoch": 0.3727847445331122, + "grad_norm": 0.5090116262435913, + "learning_rate": 2.889178152021659e-05, + "loss": 0.2137, + "step": 3597 + }, + { + "epoch": 0.37288838221577364, + "grad_norm": 0.5207107663154602, + "learning_rate": 2.888576748506646e-05, + "loss": 0.1894, + "step": 3598 + }, + { + "epoch": 0.37299201989843506, + "grad_norm": 0.5283442139625549, + "learning_rate": 2.8879752448681856e-05, + "loss": 0.2255, + "step": 3599 + }, + { + "epoch": 0.3730956575810965, + "grad_norm": 0.49328505992889404, + "learning_rate": 2.8873736411740557e-05, + "loss": 0.2099, + "step": 3600 + }, + { + "epoch": 0.3731992952637579, + "grad_norm": 0.4782813489437103, + "learning_rate": 2.886771937492043e-05, + "loss": 0.1914, + "step": 3601 + }, + { + "epoch": 0.3733029329464193, + "grad_norm": 0.45772305130958557, + "learning_rate": 2.8861701338899472e-05, + "loss": 0.1888, + "step": 3602 + }, + { + "epoch": 0.3734065706290807, + "grad_norm": 0.5250973105430603, + "learning_rate": 2.8855682304355787e-05, + "loss": 0.187, + "step": 3603 + }, + { + "epoch": 0.37351020831174214, + "grad_norm": 0.48250705003738403, + "learning_rate": 2.884966227196758e-05, + "loss": 0.1926, + "step": 3604 + }, + { + "epoch": 0.37361384599440356, + "grad_norm": 0.5074548125267029, + "learning_rate": 2.8843641242413184e-05, + "loss": 0.2024, + "step": 3605 + }, + { + "epoch": 0.373717483677065, + "grad_norm": 0.516728937625885, + "learning_rate": 2.8837619216371045e-05, + "loss": 0.2338, + "step": 3606 + }, + { + "epoch": 0.3738211213597264, + "grad_norm": 0.4710516333580017, + "learning_rate": 2.8831596194519713e-05, + "loss": 0.2226, + "step": 3607 + }, + { + "epoch": 0.3739247590423878, + "grad_norm": 0.5935936570167542, + "learning_rate": 2.8825572177537846e-05, + "loss": 0.2288, + "step": 3608 + }, + { + "epoch": 0.37402839672504923, + "grad_norm": 0.46532249450683594, + "learning_rate": 2.8819547166104228e-05, + "loss": 0.2031, + "step": 3609 + }, + { + "epoch": 0.37413203440771065, + "grad_norm": 0.49024295806884766, + "learning_rate": 2.8813521160897742e-05, + "loss": 0.2221, + "step": 3610 + }, + { + "epoch": 0.37423567209037206, + "grad_norm": 0.46530261635780334, + "learning_rate": 2.8807494162597395e-05, + "loss": 0.198, + "step": 3611 + }, + { + "epoch": 0.3743393097730335, + "grad_norm": 0.5083104372024536, + "learning_rate": 2.8801466171882296e-05, + "loss": 0.2171, + "step": 3612 + }, + { + "epoch": 0.3744429474556949, + "grad_norm": 0.534168541431427, + "learning_rate": 2.8795437189431675e-05, + "loss": 0.2295, + "step": 3613 + }, + { + "epoch": 0.3745465851383563, + "grad_norm": 0.5137151479721069, + "learning_rate": 2.8789407215924855e-05, + "loss": 0.2035, + "step": 3614 + }, + { + "epoch": 0.37465022282101773, + "grad_norm": 0.5426977872848511, + "learning_rate": 2.87833762520413e-05, + "loss": 0.2551, + "step": 3615 + }, + { + "epoch": 0.37475386050367915, + "grad_norm": 0.5401124358177185, + "learning_rate": 2.877734429846057e-05, + "loss": 0.214, + "step": 3616 + }, + { + "epoch": 0.37485749818634057, + "grad_norm": 0.46718594431877136, + "learning_rate": 2.8771311355862323e-05, + "loss": 0.2072, + "step": 3617 + }, + { + "epoch": 0.374961135869002, + "grad_norm": 0.5239135026931763, + "learning_rate": 2.876527742492634e-05, + "loss": 0.2267, + "step": 3618 + }, + { + "epoch": 0.3750647735516634, + "grad_norm": 0.4801141321659088, + "learning_rate": 2.8759242506332534e-05, + "loss": 0.2098, + "step": 3619 + }, + { + "epoch": 0.3751684112343248, + "grad_norm": 0.5632691979408264, + "learning_rate": 2.875320660076089e-05, + "loss": 0.2429, + "step": 3620 + }, + { + "epoch": 0.37527204891698623, + "grad_norm": 0.48936334252357483, + "learning_rate": 2.8747169708891537e-05, + "loss": 0.2224, + "step": 3621 + }, + { + "epoch": 0.37537568659964765, + "grad_norm": 0.47907862067222595, + "learning_rate": 2.87411318314047e-05, + "loss": 0.191, + "step": 3622 + }, + { + "epoch": 0.37547932428230907, + "grad_norm": 0.5278570055961609, + "learning_rate": 2.873509296898072e-05, + "loss": 0.2606, + "step": 3623 + }, + { + "epoch": 0.3755829619649705, + "grad_norm": 0.5191770792007446, + "learning_rate": 2.8729053122300032e-05, + "loss": 0.2244, + "step": 3624 + }, + { + "epoch": 0.3756865996476319, + "grad_norm": 0.4299468994140625, + "learning_rate": 2.8723012292043223e-05, + "loss": 0.2104, + "step": 3625 + }, + { + "epoch": 0.3757902373302933, + "grad_norm": 0.5098922848701477, + "learning_rate": 2.871697047889094e-05, + "loss": 0.2499, + "step": 3626 + }, + { + "epoch": 0.37589387501295474, + "grad_norm": 0.44818735122680664, + "learning_rate": 2.871092768352397e-05, + "loss": 0.1935, + "step": 3627 + }, + { + "epoch": 0.3759975126956161, + "grad_norm": 0.5166599154472351, + "learning_rate": 2.8704883906623216e-05, + "loss": 0.2431, + "step": 3628 + }, + { + "epoch": 0.3761011503782775, + "grad_norm": 0.43848398327827454, + "learning_rate": 2.8698839148869676e-05, + "loss": 0.1817, + "step": 3629 + }, + { + "epoch": 0.37620478806093893, + "grad_norm": 0.48116207122802734, + "learning_rate": 2.869279341094446e-05, + "loss": 0.2319, + "step": 3630 + }, + { + "epoch": 0.37630842574360035, + "grad_norm": 0.4673698842525482, + "learning_rate": 2.86867466935288e-05, + "loss": 0.1873, + "step": 3631 + }, + { + "epoch": 0.37641206342626177, + "grad_norm": 0.4763447344303131, + "learning_rate": 2.8680698997304025e-05, + "loss": 0.2004, + "step": 3632 + }, + { + "epoch": 0.3765157011089232, + "grad_norm": 0.42831769585609436, + "learning_rate": 2.867465032295158e-05, + "loss": 0.1795, + "step": 3633 + }, + { + "epoch": 0.3766193387915846, + "grad_norm": 0.5147861838340759, + "learning_rate": 2.866860067115302e-05, + "loss": 0.2013, + "step": 3634 + }, + { + "epoch": 0.376722976474246, + "grad_norm": 0.5274387001991272, + "learning_rate": 2.8662550042590015e-05, + "loss": 0.2056, + "step": 3635 + }, + { + "epoch": 0.37682661415690744, + "grad_norm": 0.5126632452011108, + "learning_rate": 2.8656498437944335e-05, + "loss": 0.2297, + "step": 3636 + }, + { + "epoch": 0.37693025183956885, + "grad_norm": 0.49158743023872375, + "learning_rate": 2.865044585789787e-05, + "loss": 0.2318, + "step": 3637 + }, + { + "epoch": 0.37703388952223027, + "grad_norm": 0.5416737794876099, + "learning_rate": 2.8644392303132612e-05, + "loss": 0.2301, + "step": 3638 + }, + { + "epoch": 0.3771375272048917, + "grad_norm": 0.44698041677474976, + "learning_rate": 2.8638337774330667e-05, + "loss": 0.2062, + "step": 3639 + }, + { + "epoch": 0.3772411648875531, + "grad_norm": 0.4664641320705414, + "learning_rate": 2.863228227217425e-05, + "loss": 0.2039, + "step": 3640 + }, + { + "epoch": 0.3773448025702145, + "grad_norm": 0.4857037365436554, + "learning_rate": 2.8626225797345692e-05, + "loss": 0.2384, + "step": 3641 + }, + { + "epoch": 0.37744844025287594, + "grad_norm": 0.5228758454322815, + "learning_rate": 2.8620168350527414e-05, + "loss": 0.2612, + "step": 3642 + }, + { + "epoch": 0.37755207793553736, + "grad_norm": 0.4399373531341553, + "learning_rate": 2.8614109932401968e-05, + "loss": 0.1923, + "step": 3643 + }, + { + "epoch": 0.3776557156181988, + "grad_norm": 0.5993676781654358, + "learning_rate": 2.8608050543652005e-05, + "loss": 0.2368, + "step": 3644 + }, + { + "epoch": 0.3777593533008602, + "grad_norm": 0.5373175740242004, + "learning_rate": 2.8601990184960287e-05, + "loss": 0.245, + "step": 3645 + }, + { + "epoch": 0.3778629909835216, + "grad_norm": 0.5086439251899719, + "learning_rate": 2.859592885700969e-05, + "loss": 0.2329, + "step": 3646 + }, + { + "epoch": 0.377966628666183, + "grad_norm": 0.4674811363220215, + "learning_rate": 2.8589866560483188e-05, + "loss": 0.1995, + "step": 3647 + }, + { + "epoch": 0.37807026634884444, + "grad_norm": 0.5111047625541687, + "learning_rate": 2.858380329606388e-05, + "loss": 0.2598, + "step": 3648 + }, + { + "epoch": 0.37817390403150586, + "grad_norm": 0.4779401421546936, + "learning_rate": 2.8577739064434954e-05, + "loss": 0.2097, + "step": 3649 + }, + { + "epoch": 0.3782775417141673, + "grad_norm": 0.47433412075042725, + "learning_rate": 2.857167386627973e-05, + "loss": 0.2137, + "step": 3650 + }, + { + "epoch": 0.3783811793968287, + "grad_norm": 0.4639725685119629, + "learning_rate": 2.8565607702281622e-05, + "loss": 0.1845, + "step": 3651 + }, + { + "epoch": 0.3784848170794901, + "grad_norm": 0.5112288594245911, + "learning_rate": 2.8559540573124152e-05, + "loss": 0.2398, + "step": 3652 + }, + { + "epoch": 0.3785884547621515, + "grad_norm": 0.5371864438056946, + "learning_rate": 2.8553472479490952e-05, + "loss": 0.2088, + "step": 3653 + }, + { + "epoch": 0.37869209244481294, + "grad_norm": 0.5942397117614746, + "learning_rate": 2.8547403422065776e-05, + "loss": 0.2508, + "step": 3654 + }, + { + "epoch": 0.37879573012747436, + "grad_norm": 0.5302644371986389, + "learning_rate": 2.8541333401532463e-05, + "loss": 0.2158, + "step": 3655 + }, + { + "epoch": 0.3788993678101358, + "grad_norm": 0.489822655916214, + "learning_rate": 2.8535262418574982e-05, + "loss": 0.237, + "step": 3656 + }, + { + "epoch": 0.3790030054927972, + "grad_norm": 0.47960638999938965, + "learning_rate": 2.85291904738774e-05, + "loss": 0.224, + "step": 3657 + }, + { + "epoch": 0.3791066431754586, + "grad_norm": 0.41798311471939087, + "learning_rate": 2.852311756812389e-05, + "loss": 0.1702, + "step": 3658 + }, + { + "epoch": 0.37921028085812003, + "grad_norm": 0.5734483599662781, + "learning_rate": 2.851704370199875e-05, + "loss": 0.2361, + "step": 3659 + }, + { + "epoch": 0.37931391854078145, + "grad_norm": 0.48896273970603943, + "learning_rate": 2.8510968876186365e-05, + "loss": 0.2099, + "step": 3660 + }, + { + "epoch": 0.37941755622344286, + "grad_norm": 0.5119735598564148, + "learning_rate": 2.8504893091371236e-05, + "loss": 0.2296, + "step": 3661 + }, + { + "epoch": 0.3795211939061043, + "grad_norm": 0.44356822967529297, + "learning_rate": 2.849881634823797e-05, + "loss": 0.1887, + "step": 3662 + }, + { + "epoch": 0.3796248315887657, + "grad_norm": 0.47616955637931824, + "learning_rate": 2.84927386474713e-05, + "loss": 0.2442, + "step": 3663 + }, + { + "epoch": 0.3797284692714271, + "grad_norm": 0.5216185450553894, + "learning_rate": 2.8486659989756034e-05, + "loss": 0.2451, + "step": 3664 + }, + { + "epoch": 0.37983210695408853, + "grad_norm": 0.5075971484184265, + "learning_rate": 2.848058037577711e-05, + "loss": 0.2533, + "step": 3665 + }, + { + "epoch": 0.3799357446367499, + "grad_norm": 0.5356259346008301, + "learning_rate": 2.8474499806219577e-05, + "loss": 0.2233, + "step": 3666 + }, + { + "epoch": 0.3800393823194113, + "grad_norm": 0.36403751373291016, + "learning_rate": 2.8468418281768586e-05, + "loss": 0.1487, + "step": 3667 + }, + { + "epoch": 0.38014302000207273, + "grad_norm": 0.5536927580833435, + "learning_rate": 2.8462335803109372e-05, + "loss": 0.2205, + "step": 3668 + }, + { + "epoch": 0.38024665768473415, + "grad_norm": 0.5602768063545227, + "learning_rate": 2.8456252370927324e-05, + "loss": 0.2484, + "step": 3669 + }, + { + "epoch": 0.38035029536739556, + "grad_norm": 0.44447940587997437, + "learning_rate": 2.845016798590791e-05, + "loss": 0.2046, + "step": 3670 + }, + { + "epoch": 0.380453933050057, + "grad_norm": 0.4588824212551117, + "learning_rate": 2.8444082648736695e-05, + "loss": 0.1836, + "step": 3671 + }, + { + "epoch": 0.3805575707327184, + "grad_norm": 0.5337849259376526, + "learning_rate": 2.843799636009937e-05, + "loss": 0.2089, + "step": 3672 + }, + { + "epoch": 0.3806612084153798, + "grad_norm": 0.5306573510169983, + "learning_rate": 2.843190912068174e-05, + "loss": 0.2089, + "step": 3673 + }, + { + "epoch": 0.38076484609804123, + "grad_norm": 0.42665427923202515, + "learning_rate": 2.8425820931169695e-05, + "loss": 0.1583, + "step": 3674 + }, + { + "epoch": 0.38086848378070265, + "grad_norm": 0.47262972593307495, + "learning_rate": 2.8419731792249248e-05, + "loss": 0.2146, + "step": 3675 + }, + { + "epoch": 0.38097212146336407, + "grad_norm": 0.5057385563850403, + "learning_rate": 2.8413641704606516e-05, + "loss": 0.2437, + "step": 3676 + }, + { + "epoch": 0.3810757591460255, + "grad_norm": 0.5412306785583496, + "learning_rate": 2.8407550668927708e-05, + "loss": 0.2116, + "step": 3677 + }, + { + "epoch": 0.3811793968286869, + "grad_norm": 0.4526110887527466, + "learning_rate": 2.8401458685899153e-05, + "loss": 0.1855, + "step": 3678 + }, + { + "epoch": 0.3812830345113483, + "grad_norm": 0.4231882393360138, + "learning_rate": 2.8395365756207312e-05, + "loss": 0.173, + "step": 3679 + }, + { + "epoch": 0.38138667219400973, + "grad_norm": 0.4655570685863495, + "learning_rate": 2.8389271880538694e-05, + "loss": 0.1903, + "step": 3680 + }, + { + "epoch": 0.38149030987667115, + "grad_norm": 0.5213018655776978, + "learning_rate": 2.8383177059579972e-05, + "loss": 0.2384, + "step": 3681 + }, + { + "epoch": 0.38159394755933257, + "grad_norm": 0.5192651748657227, + "learning_rate": 2.8377081294017883e-05, + "loss": 0.2217, + "step": 3682 + }, + { + "epoch": 0.381697585241994, + "grad_norm": 0.5380532145500183, + "learning_rate": 2.83709845845393e-05, + "loss": 0.2111, + "step": 3683 + }, + { + "epoch": 0.3818012229246554, + "grad_norm": 0.5495359301567078, + "learning_rate": 2.8364886931831183e-05, + "loss": 0.2514, + "step": 3684 + }, + { + "epoch": 0.3819048606073168, + "grad_norm": 0.5004047155380249, + "learning_rate": 2.8358788336580618e-05, + "loss": 0.2099, + "step": 3685 + }, + { + "epoch": 0.38200849828997824, + "grad_norm": 0.4904569089412689, + "learning_rate": 2.8352688799474776e-05, + "loss": 0.1872, + "step": 3686 + }, + { + "epoch": 0.38211213597263965, + "grad_norm": 0.44963860511779785, + "learning_rate": 2.834658832120094e-05, + "loss": 0.1746, + "step": 3687 + }, + { + "epoch": 0.38221577365530107, + "grad_norm": 0.5347903966903687, + "learning_rate": 2.8340486902446506e-05, + "loss": 0.2204, + "step": 3688 + }, + { + "epoch": 0.3823194113379625, + "grad_norm": 0.5351092219352722, + "learning_rate": 2.8334384543898975e-05, + "loss": 0.1985, + "step": 3689 + }, + { + "epoch": 0.3824230490206239, + "grad_norm": 0.4588063359260559, + "learning_rate": 2.832828124624595e-05, + "loss": 0.1982, + "step": 3690 + }, + { + "epoch": 0.3825266867032853, + "grad_norm": 0.501385509967804, + "learning_rate": 2.832217701017514e-05, + "loss": 0.1994, + "step": 3691 + }, + { + "epoch": 0.38263032438594674, + "grad_norm": 0.6098371744155884, + "learning_rate": 2.8316071836374365e-05, + "loss": 0.2754, + "step": 3692 + }, + { + "epoch": 0.38273396206860816, + "grad_norm": 0.5000957250595093, + "learning_rate": 2.8309965725531535e-05, + "loss": 0.2122, + "step": 3693 + }, + { + "epoch": 0.3828375997512696, + "grad_norm": 0.5229092240333557, + "learning_rate": 2.830385867833469e-05, + "loss": 0.2474, + "step": 3694 + }, + { + "epoch": 0.382941237433931, + "grad_norm": 0.5045500993728638, + "learning_rate": 2.8297750695471965e-05, + "loss": 0.2237, + "step": 3695 + }, + { + "epoch": 0.3830448751165924, + "grad_norm": 0.46148043870925903, + "learning_rate": 2.829164177763158e-05, + "loss": 0.2063, + "step": 3696 + }, + { + "epoch": 0.3831485127992538, + "grad_norm": 0.5285515785217285, + "learning_rate": 2.8285531925501897e-05, + "loss": 0.2431, + "step": 3697 + }, + { + "epoch": 0.38325215048191524, + "grad_norm": 0.5902262926101685, + "learning_rate": 2.827942113977135e-05, + "loss": 0.2401, + "step": 3698 + }, + { + "epoch": 0.38335578816457666, + "grad_norm": 0.489153653383255, + "learning_rate": 2.8273309421128502e-05, + "loss": 0.1986, + "step": 3699 + }, + { + "epoch": 0.3834594258472381, + "grad_norm": 0.5225285887718201, + "learning_rate": 2.826719677026201e-05, + "loss": 0.2199, + "step": 3700 + }, + { + "epoch": 0.3835630635298995, + "grad_norm": 0.4897618889808655, + "learning_rate": 2.8261083187860635e-05, + "loss": 0.1975, + "step": 3701 + }, + { + "epoch": 0.3836667012125609, + "grad_norm": 0.5175366401672363, + "learning_rate": 2.8254968674613254e-05, + "loss": 0.2548, + "step": 3702 + }, + { + "epoch": 0.38377033889522233, + "grad_norm": 0.4286321997642517, + "learning_rate": 2.8248853231208832e-05, + "loss": 0.1868, + "step": 3703 + }, + { + "epoch": 0.3838739765778837, + "grad_norm": 0.49945926666259766, + "learning_rate": 2.8242736858336455e-05, + "loss": 0.2267, + "step": 3704 + }, + { + "epoch": 0.3839776142605451, + "grad_norm": 0.5295497179031372, + "learning_rate": 2.8236619556685298e-05, + "loss": 0.2116, + "step": 3705 + }, + { + "epoch": 0.3840812519432065, + "grad_norm": 0.4920368194580078, + "learning_rate": 2.823050132694465e-05, + "loss": 0.1964, + "step": 3706 + }, + { + "epoch": 0.38418488962586794, + "grad_norm": 0.49020951986312866, + "learning_rate": 2.8224382169803913e-05, + "loss": 0.2093, + "step": 3707 + }, + { + "epoch": 0.38428852730852936, + "grad_norm": 0.4164317846298218, + "learning_rate": 2.8218262085952573e-05, + "loss": 0.1682, + "step": 3708 + }, + { + "epoch": 0.3843921649911908, + "grad_norm": 0.4984332323074341, + "learning_rate": 2.8212141076080244e-05, + "loss": 0.2156, + "step": 3709 + }, + { + "epoch": 0.3844958026738522, + "grad_norm": 0.6256229281425476, + "learning_rate": 2.820601914087662e-05, + "loss": 0.2604, + "step": 3710 + }, + { + "epoch": 0.3845994403565136, + "grad_norm": 0.478753924369812, + "learning_rate": 2.8199896281031522e-05, + "loss": 0.1884, + "step": 3711 + }, + { + "epoch": 0.38470307803917503, + "grad_norm": 0.5280382633209229, + "learning_rate": 2.819377249723485e-05, + "loss": 0.2263, + "step": 3712 + }, + { + "epoch": 0.38480671572183645, + "grad_norm": 0.42487865686416626, + "learning_rate": 2.818764779017663e-05, + "loss": 0.1919, + "step": 3713 + }, + { + "epoch": 0.38491035340449786, + "grad_norm": 0.5221870541572571, + "learning_rate": 2.818152216054699e-05, + "loss": 0.203, + "step": 3714 + }, + { + "epoch": 0.3850139910871593, + "grad_norm": 0.45968756079673767, + "learning_rate": 2.8175395609036148e-05, + "loss": 0.206, + "step": 3715 + }, + { + "epoch": 0.3851176287698207, + "grad_norm": 0.4523147940635681, + "learning_rate": 2.816926813633444e-05, + "loss": 0.1907, + "step": 3716 + }, + { + "epoch": 0.3852212664524821, + "grad_norm": 0.46949905157089233, + "learning_rate": 2.81631397431323e-05, + "loss": 0.2099, + "step": 3717 + }, + { + "epoch": 0.38532490413514353, + "grad_norm": 0.4426126480102539, + "learning_rate": 2.8157010430120257e-05, + "loss": 0.177, + "step": 3718 + }, + { + "epoch": 0.38542854181780495, + "grad_norm": 0.553290069103241, + "learning_rate": 2.8150880197988958e-05, + "loss": 0.2324, + "step": 3719 + }, + { + "epoch": 0.38553217950046637, + "grad_norm": 0.4237673282623291, + "learning_rate": 2.8144749047429155e-05, + "loss": 0.1808, + "step": 3720 + }, + { + "epoch": 0.3856358171831278, + "grad_norm": 0.4610205590724945, + "learning_rate": 2.813861697913169e-05, + "loss": 0.2064, + "step": 3721 + }, + { + "epoch": 0.3857394548657892, + "grad_norm": 0.513278603553772, + "learning_rate": 2.8132483993787513e-05, + "loss": 0.2378, + "step": 3722 + }, + { + "epoch": 0.3858430925484506, + "grad_norm": 0.5233225226402283, + "learning_rate": 2.8126350092087683e-05, + "loss": 0.2396, + "step": 3723 + }, + { + "epoch": 0.38594673023111203, + "grad_norm": 0.4505366086959839, + "learning_rate": 2.812021527472336e-05, + "loss": 0.1931, + "step": 3724 + }, + { + "epoch": 0.38605036791377345, + "grad_norm": 0.5511522889137268, + "learning_rate": 2.81140795423858e-05, + "loss": 0.2346, + "step": 3725 + }, + { + "epoch": 0.38615400559643487, + "grad_norm": 0.4193154275417328, + "learning_rate": 2.8107942895766372e-05, + "loss": 0.2101, + "step": 3726 + }, + { + "epoch": 0.3862576432790963, + "grad_norm": 0.42316773533821106, + "learning_rate": 2.8101805335556543e-05, + "loss": 0.1671, + "step": 3727 + }, + { + "epoch": 0.3863612809617577, + "grad_norm": 0.48060035705566406, + "learning_rate": 2.8095666862447876e-05, + "loss": 0.209, + "step": 3728 + }, + { + "epoch": 0.3864649186444191, + "grad_norm": 0.44975948333740234, + "learning_rate": 2.808952747713206e-05, + "loss": 0.2219, + "step": 3729 + }, + { + "epoch": 0.38656855632708054, + "grad_norm": 0.4692697525024414, + "learning_rate": 2.8083387180300864e-05, + "loss": 0.2145, + "step": 3730 + }, + { + "epoch": 0.38667219400974195, + "grad_norm": 0.5550007224082947, + "learning_rate": 2.807724597264616e-05, + "loss": 0.2177, + "step": 3731 + }, + { + "epoch": 0.38677583169240337, + "grad_norm": 0.46553054451942444, + "learning_rate": 2.8071103854859943e-05, + "loss": 0.1989, + "step": 3732 + }, + { + "epoch": 0.3868794693750648, + "grad_norm": 0.5102365016937256, + "learning_rate": 2.8064960827634284e-05, + "loss": 0.2313, + "step": 3733 + }, + { + "epoch": 0.3869831070577262, + "grad_norm": 0.48375582695007324, + "learning_rate": 2.805881689166138e-05, + "loss": 0.2148, + "step": 3734 + }, + { + "epoch": 0.3870867447403876, + "grad_norm": 0.50575190782547, + "learning_rate": 2.8052672047633514e-05, + "loss": 0.2292, + "step": 3735 + }, + { + "epoch": 0.38719038242304904, + "grad_norm": 0.5282059907913208, + "learning_rate": 2.804652629624309e-05, + "loss": 0.2265, + "step": 3736 + }, + { + "epoch": 0.38729402010571046, + "grad_norm": 0.5234432220458984, + "learning_rate": 2.804037963818258e-05, + "loss": 0.205, + "step": 3737 + }, + { + "epoch": 0.3873976577883719, + "grad_norm": 0.5600778460502625, + "learning_rate": 2.8034232074144586e-05, + "loss": 0.2626, + "step": 3738 + }, + { + "epoch": 0.3875012954710333, + "grad_norm": 0.41027340292930603, + "learning_rate": 2.8028083604821827e-05, + "loss": 0.1776, + "step": 3739 + }, + { + "epoch": 0.3876049331536947, + "grad_norm": 0.5438401699066162, + "learning_rate": 2.802193423090708e-05, + "loss": 0.2424, + "step": 3740 + }, + { + "epoch": 0.3877085708363561, + "grad_norm": 0.459451824426651, + "learning_rate": 2.8015783953093253e-05, + "loss": 0.1835, + "step": 3741 + }, + { + "epoch": 0.3878122085190175, + "grad_norm": 0.46863898634910583, + "learning_rate": 2.8009632772073348e-05, + "loss": 0.2197, + "step": 3742 + }, + { + "epoch": 0.3879158462016789, + "grad_norm": 0.4693146347999573, + "learning_rate": 2.800348068854048e-05, + "loss": 0.2177, + "step": 3743 + }, + { + "epoch": 0.3880194838843403, + "grad_norm": 0.6115514636039734, + "learning_rate": 2.7997327703187848e-05, + "loss": 0.272, + "step": 3744 + }, + { + "epoch": 0.38812312156700174, + "grad_norm": 0.5041795969009399, + "learning_rate": 2.7991173816708765e-05, + "loss": 0.2129, + "step": 3745 + }, + { + "epoch": 0.38822675924966316, + "grad_norm": 0.5383294820785522, + "learning_rate": 2.7985019029796636e-05, + "loss": 0.2416, + "step": 3746 + }, + { + "epoch": 0.3883303969323246, + "grad_norm": 0.6601837873458862, + "learning_rate": 2.7978863343144973e-05, + "loss": 0.2338, + "step": 3747 + }, + { + "epoch": 0.388434034614986, + "grad_norm": 0.5024908781051636, + "learning_rate": 2.7972706757447392e-05, + "loss": 0.2098, + "step": 3748 + }, + { + "epoch": 0.3885376722976474, + "grad_norm": 0.43663129210472107, + "learning_rate": 2.796654927339761e-05, + "loss": 0.1831, + "step": 3749 + }, + { + "epoch": 0.3886413099803088, + "grad_norm": 0.4988679885864258, + "learning_rate": 2.796039089168944e-05, + "loss": 0.1949, + "step": 3750 + }, + { + "epoch": 0.38874494766297024, + "grad_norm": 0.4759289026260376, + "learning_rate": 2.79542316130168e-05, + "loss": 0.2208, + "step": 3751 + }, + { + "epoch": 0.38884858534563166, + "grad_norm": 0.558282196521759, + "learning_rate": 2.7948071438073702e-05, + "loss": 0.2515, + "step": 3752 + }, + { + "epoch": 0.3889522230282931, + "grad_norm": 0.3603500425815582, + "learning_rate": 2.7941910367554276e-05, + "loss": 0.1413, + "step": 3753 + }, + { + "epoch": 0.3890558607109545, + "grad_norm": 0.5307490229606628, + "learning_rate": 2.7935748402152733e-05, + "loss": 0.1958, + "step": 3754 + }, + { + "epoch": 0.3891594983936159, + "grad_norm": 0.49905791878700256, + "learning_rate": 2.7929585542563404e-05, + "loss": 0.1986, + "step": 3755 + }, + { + "epoch": 0.3892631360762773, + "grad_norm": 0.5282691717147827, + "learning_rate": 2.7923421789480692e-05, + "loss": 0.1965, + "step": 3756 + }, + { + "epoch": 0.38936677375893874, + "grad_norm": 0.48384660482406616, + "learning_rate": 2.791725714359913e-05, + "loss": 0.1925, + "step": 3757 + }, + { + "epoch": 0.38947041144160016, + "grad_norm": 0.49548470973968506, + "learning_rate": 2.7911091605613348e-05, + "loss": 0.2268, + "step": 3758 + }, + { + "epoch": 0.3895740491242616, + "grad_norm": 0.48279035091400146, + "learning_rate": 2.7904925176218055e-05, + "loss": 0.2361, + "step": 3759 + }, + { + "epoch": 0.389677686806923, + "grad_norm": 0.4718702733516693, + "learning_rate": 2.7898757856108086e-05, + "loss": 0.1889, + "step": 3760 + }, + { + "epoch": 0.3897813244895844, + "grad_norm": 0.5636996030807495, + "learning_rate": 2.789258964597836e-05, + "loss": 0.2656, + "step": 3761 + }, + { + "epoch": 0.38988496217224583, + "grad_norm": 0.41767802834510803, + "learning_rate": 2.78864205465239e-05, + "loss": 0.1772, + "step": 3762 + }, + { + "epoch": 0.38998859985490725, + "grad_norm": 0.4735535681247711, + "learning_rate": 2.788025055843983e-05, + "loss": 0.2161, + "step": 3763 + }, + { + "epoch": 0.39009223753756866, + "grad_norm": 0.5146605968475342, + "learning_rate": 2.787407968242138e-05, + "loss": 0.214, + "step": 3764 + }, + { + "epoch": 0.3901958752202301, + "grad_norm": 0.4984552264213562, + "learning_rate": 2.7867907919163878e-05, + "loss": 0.2188, + "step": 3765 + }, + { + "epoch": 0.3902995129028915, + "grad_norm": 0.5831129550933838, + "learning_rate": 2.786173526936274e-05, + "loss": 0.2528, + "step": 3766 + }, + { + "epoch": 0.3904031505855529, + "grad_norm": 0.4989360272884369, + "learning_rate": 2.7855561733713486e-05, + "loss": 0.2426, + "step": 3767 + }, + { + "epoch": 0.39050678826821433, + "grad_norm": 0.40938708186149597, + "learning_rate": 2.7849387312911754e-05, + "loss": 0.1791, + "step": 3768 + }, + { + "epoch": 0.39061042595087575, + "grad_norm": 0.5959457159042358, + "learning_rate": 2.784321200765326e-05, + "loss": 0.2583, + "step": 3769 + }, + { + "epoch": 0.39071406363353717, + "grad_norm": 0.4509630501270294, + "learning_rate": 2.7837035818633827e-05, + "loss": 0.2004, + "step": 3770 + }, + { + "epoch": 0.3908177013161986, + "grad_norm": 0.47014522552490234, + "learning_rate": 2.7830858746549388e-05, + "loss": 0.2221, + "step": 3771 + }, + { + "epoch": 0.39092133899886, + "grad_norm": 0.5484637022018433, + "learning_rate": 2.7824680792095945e-05, + "loss": 0.2175, + "step": 3772 + }, + { + "epoch": 0.3910249766815214, + "grad_norm": 0.48253583908081055, + "learning_rate": 2.7818501955969642e-05, + "loss": 0.2088, + "step": 3773 + }, + { + "epoch": 0.39112861436418284, + "grad_norm": 0.4862441122531891, + "learning_rate": 2.78123222388667e-05, + "loss": 0.2095, + "step": 3774 + }, + { + "epoch": 0.39123225204684425, + "grad_norm": 0.45672932267189026, + "learning_rate": 2.7806141641483425e-05, + "loss": 0.1935, + "step": 3775 + }, + { + "epoch": 0.39133588972950567, + "grad_norm": 0.5105577707290649, + "learning_rate": 2.7799960164516243e-05, + "loss": 0.2201, + "step": 3776 + }, + { + "epoch": 0.3914395274121671, + "grad_norm": 0.47937294840812683, + "learning_rate": 2.7793777808661676e-05, + "loss": 0.1911, + "step": 3777 + }, + { + "epoch": 0.3915431650948285, + "grad_norm": 0.566051185131073, + "learning_rate": 2.7787594574616345e-05, + "loss": 0.2338, + "step": 3778 + }, + { + "epoch": 0.3916468027774899, + "grad_norm": 0.5488920211791992, + "learning_rate": 2.7781410463076963e-05, + "loss": 0.2653, + "step": 3779 + }, + { + "epoch": 0.3917504404601513, + "grad_norm": 0.48996976017951965, + "learning_rate": 2.7775225474740347e-05, + "loss": 0.2128, + "step": 3780 + }, + { + "epoch": 0.3918540781428127, + "grad_norm": 0.5063322186470032, + "learning_rate": 2.7769039610303408e-05, + "loss": 0.2321, + "step": 3781 + }, + { + "epoch": 0.3919577158254741, + "grad_norm": 0.5853592157363892, + "learning_rate": 2.776285287046316e-05, + "loss": 0.2537, + "step": 3782 + }, + { + "epoch": 0.39206135350813553, + "grad_norm": 0.5853617191314697, + "learning_rate": 2.775666525591673e-05, + "loss": 0.257, + "step": 3783 + }, + { + "epoch": 0.39216499119079695, + "grad_norm": 0.5316409468650818, + "learning_rate": 2.77504767673613e-05, + "loss": 0.1986, + "step": 3784 + }, + { + "epoch": 0.39226862887345837, + "grad_norm": 0.4623914062976837, + "learning_rate": 2.774428740549421e-05, + "loss": 0.2091, + "step": 3785 + }, + { + "epoch": 0.3923722665561198, + "grad_norm": 0.5352954268455505, + "learning_rate": 2.7738097171012848e-05, + "loss": 0.2586, + "step": 3786 + }, + { + "epoch": 0.3924759042387812, + "grad_norm": 0.4515346884727478, + "learning_rate": 2.773190606461473e-05, + "loss": 0.1974, + "step": 3787 + }, + { + "epoch": 0.3925795419214426, + "grad_norm": 0.519649088382721, + "learning_rate": 2.772571408699745e-05, + "loss": 0.2108, + "step": 3788 + }, + { + "epoch": 0.39268317960410404, + "grad_norm": 0.4203835427761078, + "learning_rate": 2.771952123885872e-05, + "loss": 0.1778, + "step": 3789 + }, + { + "epoch": 0.39278681728676546, + "grad_norm": 0.47701048851013184, + "learning_rate": 2.771332752089634e-05, + "loss": 0.2384, + "step": 3790 + }, + { + "epoch": 0.39289045496942687, + "grad_norm": 0.4731229543685913, + "learning_rate": 2.7707132933808202e-05, + "loss": 0.1945, + "step": 3791 + }, + { + "epoch": 0.3929940926520883, + "grad_norm": 0.5297321677207947, + "learning_rate": 2.77009374782923e-05, + "loss": 0.226, + "step": 3792 + }, + { + "epoch": 0.3930977303347497, + "grad_norm": 0.45654892921447754, + "learning_rate": 2.769474115504674e-05, + "loss": 0.2059, + "step": 3793 + }, + { + "epoch": 0.3932013680174111, + "grad_norm": 0.45342886447906494, + "learning_rate": 2.7688543964769716e-05, + "loss": 0.2228, + "step": 3794 + }, + { + "epoch": 0.39330500570007254, + "grad_norm": 0.49522092938423157, + "learning_rate": 2.7682345908159497e-05, + "loss": 0.231, + "step": 3795 + }, + { + "epoch": 0.39340864338273396, + "grad_norm": 0.5499382019042969, + "learning_rate": 2.76761469859145e-05, + "loss": 0.2165, + "step": 3796 + }, + { + "epoch": 0.3935122810653954, + "grad_norm": 0.48214516043663025, + "learning_rate": 2.7669947198733177e-05, + "loss": 0.1901, + "step": 3797 + }, + { + "epoch": 0.3936159187480568, + "grad_norm": 0.48165568709373474, + "learning_rate": 2.7663746547314134e-05, + "loss": 0.2256, + "step": 3798 + }, + { + "epoch": 0.3937195564307182, + "grad_norm": 0.5463171601295471, + "learning_rate": 2.7657545032356042e-05, + "loss": 0.2254, + "step": 3799 + }, + { + "epoch": 0.3938231941133796, + "grad_norm": 0.5542256832122803, + "learning_rate": 2.765134265455768e-05, + "loss": 0.2026, + "step": 3800 + }, + { + "epoch": 0.39392683179604104, + "grad_norm": 0.4160597026348114, + "learning_rate": 2.7645139414617922e-05, + "loss": 0.1652, + "step": 3801 + }, + { + "epoch": 0.39403046947870246, + "grad_norm": 0.5020577907562256, + "learning_rate": 2.7638935313235738e-05, + "loss": 0.2038, + "step": 3802 + }, + { + "epoch": 0.3941341071613639, + "grad_norm": 0.46063679456710815, + "learning_rate": 2.76327303511102e-05, + "loss": 0.2035, + "step": 3803 + }, + { + "epoch": 0.3942377448440253, + "grad_norm": 0.6740379333496094, + "learning_rate": 2.762652452894047e-05, + "loss": 0.2884, + "step": 3804 + }, + { + "epoch": 0.3943413825266867, + "grad_norm": 0.487909734249115, + "learning_rate": 2.7620317847425808e-05, + "loss": 0.2064, + "step": 3805 + }, + { + "epoch": 0.39444502020934813, + "grad_norm": 0.49684950709342957, + "learning_rate": 2.7614110307265587e-05, + "loss": 0.2161, + "step": 3806 + }, + { + "epoch": 0.39454865789200955, + "grad_norm": 0.5438411831855774, + "learning_rate": 2.7607901909159243e-05, + "loss": 0.2406, + "step": 3807 + }, + { + "epoch": 0.39465229557467096, + "grad_norm": 0.49316468834877014, + "learning_rate": 2.7601692653806343e-05, + "loss": 0.2279, + "step": 3808 + }, + { + "epoch": 0.3947559332573324, + "grad_norm": 0.49097537994384766, + "learning_rate": 2.7595482541906534e-05, + "loss": 0.2197, + "step": 3809 + }, + { + "epoch": 0.3948595709399938, + "grad_norm": 0.5558735728263855, + "learning_rate": 2.758927157415956e-05, + "loss": 0.2393, + "step": 3810 + }, + { + "epoch": 0.3949632086226552, + "grad_norm": 0.45270583033561707, + "learning_rate": 2.7583059751265256e-05, + "loss": 0.2114, + "step": 3811 + }, + { + "epoch": 0.39506684630531663, + "grad_norm": 0.4534200429916382, + "learning_rate": 2.7576847073923572e-05, + "loss": 0.1899, + "step": 3812 + }, + { + "epoch": 0.39517048398797805, + "grad_norm": 0.5005395412445068, + "learning_rate": 2.757063354283454e-05, + "loss": 0.2217, + "step": 3813 + }, + { + "epoch": 0.39527412167063947, + "grad_norm": 0.47601640224456787, + "learning_rate": 2.7564419158698282e-05, + "loss": 0.1935, + "step": 3814 + }, + { + "epoch": 0.3953777593533009, + "grad_norm": 0.4823366701602936, + "learning_rate": 2.7558203922215044e-05, + "loss": 0.2128, + "step": 3815 + }, + { + "epoch": 0.3954813970359623, + "grad_norm": 0.4116901457309723, + "learning_rate": 2.7551987834085125e-05, + "loss": 0.1867, + "step": 3816 + }, + { + "epoch": 0.3955850347186237, + "grad_norm": 0.45607835054397583, + "learning_rate": 2.7545770895008962e-05, + "loss": 0.1936, + "step": 3817 + }, + { + "epoch": 0.3956886724012851, + "grad_norm": 0.46392399072647095, + "learning_rate": 2.7539553105687063e-05, + "loss": 0.2034, + "step": 3818 + }, + { + "epoch": 0.3957923100839465, + "grad_norm": 0.5502959489822388, + "learning_rate": 2.7533334466820046e-05, + "loss": 0.2184, + "step": 3819 + }, + { + "epoch": 0.3958959477666079, + "grad_norm": 0.49488842487335205, + "learning_rate": 2.752711497910861e-05, + "loss": 0.2121, + "step": 3820 + }, + { + "epoch": 0.39599958544926933, + "grad_norm": 0.49059224128723145, + "learning_rate": 2.7520894643253554e-05, + "loss": 0.2081, + "step": 3821 + }, + { + "epoch": 0.39610322313193075, + "grad_norm": 0.5765684843063354, + "learning_rate": 2.7514673459955786e-05, + "loss": 0.2582, + "step": 3822 + }, + { + "epoch": 0.39620686081459217, + "grad_norm": 0.44499170780181885, + "learning_rate": 2.750845142991629e-05, + "loss": 0.1702, + "step": 3823 + }, + { + "epoch": 0.3963104984972536, + "grad_norm": 0.49501338601112366, + "learning_rate": 2.750222855383616e-05, + "loss": 0.1933, + "step": 3824 + }, + { + "epoch": 0.396414136179915, + "grad_norm": 0.5264593362808228, + "learning_rate": 2.7496004832416584e-05, + "loss": 0.2337, + "step": 3825 + }, + { + "epoch": 0.3965177738625764, + "grad_norm": 0.5584926009178162, + "learning_rate": 2.7489780266358835e-05, + "loss": 0.2621, + "step": 3826 + }, + { + "epoch": 0.39662141154523783, + "grad_norm": 0.47269389033317566, + "learning_rate": 2.7483554856364282e-05, + "loss": 0.2089, + "step": 3827 + }, + { + "epoch": 0.39672504922789925, + "grad_norm": 0.5708098411560059, + "learning_rate": 2.7477328603134413e-05, + "loss": 0.235, + "step": 3828 + }, + { + "epoch": 0.39682868691056067, + "grad_norm": 0.5021374821662903, + "learning_rate": 2.7471101507370768e-05, + "loss": 0.2116, + "step": 3829 + }, + { + "epoch": 0.3969323245932221, + "grad_norm": 0.45472949743270874, + "learning_rate": 2.746487356977503e-05, + "loss": 0.1767, + "step": 3830 + }, + { + "epoch": 0.3970359622758835, + "grad_norm": 0.4058346748352051, + "learning_rate": 2.7458644791048937e-05, + "loss": 0.172, + "step": 3831 + }, + { + "epoch": 0.3971395999585449, + "grad_norm": 0.5224340558052063, + "learning_rate": 2.745241517189434e-05, + "loss": 0.2334, + "step": 3832 + }, + { + "epoch": 0.39724323764120634, + "grad_norm": 0.550030529499054, + "learning_rate": 2.744618471301319e-05, + "loss": 0.2282, + "step": 3833 + }, + { + "epoch": 0.39734687532386775, + "grad_norm": 0.4936904013156891, + "learning_rate": 2.7439953415107527e-05, + "loss": 0.2152, + "step": 3834 + }, + { + "epoch": 0.39745051300652917, + "grad_norm": 0.5040317177772522, + "learning_rate": 2.7433721278879474e-05, + "loss": 0.2241, + "step": 3835 + }, + { + "epoch": 0.3975541506891906, + "grad_norm": 0.47307634353637695, + "learning_rate": 2.7427488305031264e-05, + "loss": 0.219, + "step": 3836 + }, + { + "epoch": 0.397657788371852, + "grad_norm": 0.46557918190956116, + "learning_rate": 2.7421254494265218e-05, + "loss": 0.2127, + "step": 3837 + }, + { + "epoch": 0.3977614260545134, + "grad_norm": 0.5010596513748169, + "learning_rate": 2.741501984728375e-05, + "loss": 0.2423, + "step": 3838 + }, + { + "epoch": 0.39786506373717484, + "grad_norm": 0.6242858171463013, + "learning_rate": 2.7408784364789373e-05, + "loss": 0.2613, + "step": 3839 + }, + { + "epoch": 0.39796870141983626, + "grad_norm": 0.5589104294776917, + "learning_rate": 2.7402548047484693e-05, + "loss": 0.2325, + "step": 3840 + }, + { + "epoch": 0.3980723391024977, + "grad_norm": 0.5010468363761902, + "learning_rate": 2.7396310896072412e-05, + "loss": 0.2008, + "step": 3841 + }, + { + "epoch": 0.3981759767851591, + "grad_norm": 0.4926588833332062, + "learning_rate": 2.73900729112553e-05, + "loss": 0.1933, + "step": 3842 + }, + { + "epoch": 0.3982796144678205, + "grad_norm": 0.5381539463996887, + "learning_rate": 2.7383834093736278e-05, + "loss": 0.2323, + "step": 3843 + }, + { + "epoch": 0.3983832521504819, + "grad_norm": 0.5139055252075195, + "learning_rate": 2.7377594444218298e-05, + "loss": 0.202, + "step": 3844 + }, + { + "epoch": 0.39848688983314334, + "grad_norm": 0.5416883826255798, + "learning_rate": 2.737135396340445e-05, + "loss": 0.2483, + "step": 3845 + }, + { + "epoch": 0.39859052751580476, + "grad_norm": 0.5522505640983582, + "learning_rate": 2.7365112651997895e-05, + "loss": 0.2305, + "step": 3846 + }, + { + "epoch": 0.3986941651984662, + "grad_norm": 0.5479319095611572, + "learning_rate": 2.7358870510701895e-05, + "loss": 0.2176, + "step": 3847 + }, + { + "epoch": 0.3987978028811276, + "grad_norm": 0.4639412462711334, + "learning_rate": 2.7352627540219806e-05, + "loss": 0.184, + "step": 3848 + }, + { + "epoch": 0.398901440563789, + "grad_norm": 0.45885124802589417, + "learning_rate": 2.7346383741255076e-05, + "loss": 0.1719, + "step": 3849 + }, + { + "epoch": 0.39900507824645043, + "grad_norm": 0.5005857348442078, + "learning_rate": 2.734013911451125e-05, + "loss": 0.2172, + "step": 3850 + }, + { + "epoch": 0.39910871592911185, + "grad_norm": 0.5310930609703064, + "learning_rate": 2.733389366069195e-05, + "loss": 0.2308, + "step": 3851 + }, + { + "epoch": 0.39921235361177326, + "grad_norm": 0.5748088955879211, + "learning_rate": 2.732764738050092e-05, + "loss": 0.2393, + "step": 3852 + }, + { + "epoch": 0.3993159912944347, + "grad_norm": 0.5303531885147095, + "learning_rate": 2.7321400274641972e-05, + "loss": 0.23, + "step": 3853 + }, + { + "epoch": 0.3994196289770961, + "grad_norm": 0.5156660079956055, + "learning_rate": 2.7315152343819026e-05, + "loss": 0.2149, + "step": 3854 + }, + { + "epoch": 0.3995232666597575, + "grad_norm": 0.5107412338256836, + "learning_rate": 2.730890358873608e-05, + "loss": 0.2374, + "step": 3855 + }, + { + "epoch": 0.3996269043424189, + "grad_norm": 0.5499140620231628, + "learning_rate": 2.730265401009724e-05, + "loss": 0.2443, + "step": 3856 + }, + { + "epoch": 0.3997305420250803, + "grad_norm": 0.5674372315406799, + "learning_rate": 2.7296403608606698e-05, + "loss": 0.2273, + "step": 3857 + }, + { + "epoch": 0.3998341797077417, + "grad_norm": 0.5294408798217773, + "learning_rate": 2.7290152384968743e-05, + "loss": 0.2283, + "step": 3858 + }, + { + "epoch": 0.3999378173904031, + "grad_norm": 0.4700827896595001, + "learning_rate": 2.7283900339887753e-05, + "loss": 0.1761, + "step": 3859 + }, + { + "epoch": 0.40004145507306454, + "grad_norm": 0.489352822303772, + "learning_rate": 2.727764747406819e-05, + "loss": 0.1984, + "step": 3860 + }, + { + "epoch": 0.40014509275572596, + "grad_norm": 0.47893935441970825, + "learning_rate": 2.7271393788214622e-05, + "loss": 0.1767, + "step": 3861 + }, + { + "epoch": 0.4002487304383874, + "grad_norm": 0.5101784467697144, + "learning_rate": 2.7265139283031713e-05, + "loss": 0.2046, + "step": 3862 + }, + { + "epoch": 0.4003523681210488, + "grad_norm": 0.5397927165031433, + "learning_rate": 2.7258883959224197e-05, + "loss": 0.235, + "step": 3863 + }, + { + "epoch": 0.4004560058037102, + "grad_norm": 0.4748551845550537, + "learning_rate": 2.7252627817496923e-05, + "loss": 0.1911, + "step": 3864 + }, + { + "epoch": 0.40055964348637163, + "grad_norm": 0.5310261845588684, + "learning_rate": 2.7246370858554816e-05, + "loss": 0.2191, + "step": 3865 + }, + { + "epoch": 0.40066328116903305, + "grad_norm": 0.5089344382286072, + "learning_rate": 2.7240113083102913e-05, + "loss": 0.2, + "step": 3866 + }, + { + "epoch": 0.40076691885169446, + "grad_norm": 0.6126011610031128, + "learning_rate": 2.7233854491846314e-05, + "loss": 0.2525, + "step": 3867 + }, + { + "epoch": 0.4008705565343559, + "grad_norm": 0.5655479431152344, + "learning_rate": 2.7227595085490242e-05, + "loss": 0.2657, + "step": 3868 + }, + { + "epoch": 0.4009741942170173, + "grad_norm": 0.5203585624694824, + "learning_rate": 2.7221334864739994e-05, + "loss": 0.2292, + "step": 3869 + }, + { + "epoch": 0.4010778318996787, + "grad_norm": 0.5850130319595337, + "learning_rate": 2.721507383030096e-05, + "loss": 0.23, + "step": 3870 + }, + { + "epoch": 0.40118146958234013, + "grad_norm": 0.5276358127593994, + "learning_rate": 2.7208811982878614e-05, + "loss": 0.2176, + "step": 3871 + }, + { + "epoch": 0.40128510726500155, + "grad_norm": 0.5064187049865723, + "learning_rate": 2.7202549323178543e-05, + "loss": 0.2187, + "step": 3872 + }, + { + "epoch": 0.40138874494766297, + "grad_norm": 0.5020982623100281, + "learning_rate": 2.7196285851906417e-05, + "loss": 0.1937, + "step": 3873 + }, + { + "epoch": 0.4014923826303244, + "grad_norm": 0.5079458355903625, + "learning_rate": 2.719002156976798e-05, + "loss": 0.2184, + "step": 3874 + }, + { + "epoch": 0.4015960203129858, + "grad_norm": 0.5517920851707458, + "learning_rate": 2.7183756477469096e-05, + "loss": 0.2227, + "step": 3875 + }, + { + "epoch": 0.4016996579956472, + "grad_norm": 0.48392802476882935, + "learning_rate": 2.7177490575715695e-05, + "loss": 0.1766, + "step": 3876 + }, + { + "epoch": 0.40180329567830864, + "grad_norm": 0.48515447974205017, + "learning_rate": 2.7171223865213816e-05, + "loss": 0.1998, + "step": 3877 + }, + { + "epoch": 0.40190693336097005, + "grad_norm": 0.5205907821655273, + "learning_rate": 2.716495634666958e-05, + "loss": 0.2373, + "step": 3878 + }, + { + "epoch": 0.40201057104363147, + "grad_norm": 0.4959084987640381, + "learning_rate": 2.7158688020789202e-05, + "loss": 0.2178, + "step": 3879 + }, + { + "epoch": 0.4021142087262929, + "grad_norm": 0.47091081738471985, + "learning_rate": 2.7152418888278983e-05, + "loss": 0.1983, + "step": 3880 + }, + { + "epoch": 0.4022178464089543, + "grad_norm": 0.5432823300361633, + "learning_rate": 2.714614894984532e-05, + "loss": 0.2555, + "step": 3881 + }, + { + "epoch": 0.4023214840916157, + "grad_norm": 0.5162708759307861, + "learning_rate": 2.7139878206194708e-05, + "loss": 0.2116, + "step": 3882 + }, + { + "epoch": 0.40242512177427714, + "grad_norm": 0.5271724462509155, + "learning_rate": 2.7133606658033717e-05, + "loss": 0.21, + "step": 3883 + }, + { + "epoch": 0.40252875945693856, + "grad_norm": 0.4762440621852875, + "learning_rate": 2.7127334306069016e-05, + "loss": 0.2041, + "step": 3884 + }, + { + "epoch": 0.4026323971396, + "grad_norm": 0.4821658134460449, + "learning_rate": 2.712106115100737e-05, + "loss": 0.2078, + "step": 3885 + }, + { + "epoch": 0.4027360348222614, + "grad_norm": 0.46634823083877563, + "learning_rate": 2.7114787193555615e-05, + "loss": 0.1904, + "step": 3886 + }, + { + "epoch": 0.4028396725049228, + "grad_norm": 0.519838273525238, + "learning_rate": 2.71085124344207e-05, + "loss": 0.2123, + "step": 3887 + }, + { + "epoch": 0.4029433101875842, + "grad_norm": 0.531387209892273, + "learning_rate": 2.7102236874309666e-05, + "loss": 0.2175, + "step": 3888 + }, + { + "epoch": 0.40304694787024564, + "grad_norm": 0.5151052474975586, + "learning_rate": 2.709596051392961e-05, + "loss": 0.2335, + "step": 3889 + }, + { + "epoch": 0.40315058555290706, + "grad_norm": 0.5302076935768127, + "learning_rate": 2.708968335398776e-05, + "loss": 0.1885, + "step": 3890 + }, + { + "epoch": 0.4032542232355685, + "grad_norm": 0.4122977554798126, + "learning_rate": 2.708340539519141e-05, + "loss": 0.1672, + "step": 3891 + }, + { + "epoch": 0.4033578609182299, + "grad_norm": 0.5019770860671997, + "learning_rate": 2.707712663824795e-05, + "loss": 0.2152, + "step": 3892 + }, + { + "epoch": 0.4034614986008913, + "grad_norm": 0.5264571309089661, + "learning_rate": 2.7070847083864858e-05, + "loss": 0.2132, + "step": 3893 + }, + { + "epoch": 0.4035651362835527, + "grad_norm": 0.489394873380661, + "learning_rate": 2.706456673274972e-05, + "loss": 0.2037, + "step": 3894 + }, + { + "epoch": 0.4036687739662141, + "grad_norm": 0.5119697451591492, + "learning_rate": 2.7058285585610174e-05, + "loss": 0.2347, + "step": 3895 + }, + { + "epoch": 0.4037724116488755, + "grad_norm": 0.5221233367919922, + "learning_rate": 2.7052003643153976e-05, + "loss": 0.2272, + "step": 3896 + }, + { + "epoch": 0.4038760493315369, + "grad_norm": 0.5005158185958862, + "learning_rate": 2.704572090608898e-05, + "loss": 0.2381, + "step": 3897 + }, + { + "epoch": 0.40397968701419834, + "grad_norm": 0.4670834243297577, + "learning_rate": 2.7039437375123108e-05, + "loss": 0.2344, + "step": 3898 + }, + { + "epoch": 0.40408332469685976, + "grad_norm": 0.5773009657859802, + "learning_rate": 2.703315305096437e-05, + "loss": 0.2493, + "step": 3899 + }, + { + "epoch": 0.4041869623795212, + "grad_norm": 0.592467188835144, + "learning_rate": 2.702686793432088e-05, + "loss": 0.2573, + "step": 3900 + }, + { + "epoch": 0.4042906000621826, + "grad_norm": 0.5141617059707642, + "learning_rate": 2.7020582025900832e-05, + "loss": 0.2319, + "step": 3901 + }, + { + "epoch": 0.404394237744844, + "grad_norm": 0.4802696406841278, + "learning_rate": 2.7014295326412514e-05, + "loss": 0.2043, + "step": 3902 + }, + { + "epoch": 0.4044978754275054, + "grad_norm": 0.5270540714263916, + "learning_rate": 2.7008007836564307e-05, + "loss": 0.2206, + "step": 3903 + }, + { + "epoch": 0.40460151311016684, + "grad_norm": 0.4769273102283478, + "learning_rate": 2.7001719557064673e-05, + "loss": 0.2162, + "step": 3904 + }, + { + "epoch": 0.40470515079282826, + "grad_norm": 0.5118731260299683, + "learning_rate": 2.699543048862216e-05, + "loss": 0.2191, + "step": 3905 + }, + { + "epoch": 0.4048087884754897, + "grad_norm": 0.4943947196006775, + "learning_rate": 2.6989140631945412e-05, + "loss": 0.228, + "step": 3906 + }, + { + "epoch": 0.4049124261581511, + "grad_norm": 0.40463706851005554, + "learning_rate": 2.6982849987743168e-05, + "loss": 0.1749, + "step": 3907 + }, + { + "epoch": 0.4050160638408125, + "grad_norm": 0.47197356820106506, + "learning_rate": 2.697655855672424e-05, + "loss": 0.2002, + "step": 3908 + }, + { + "epoch": 0.40511970152347393, + "grad_norm": 0.47983160614967346, + "learning_rate": 2.697026633959754e-05, + "loss": 0.1981, + "step": 3909 + }, + { + "epoch": 0.40522333920613535, + "grad_norm": 0.5490193963050842, + "learning_rate": 2.6963973337072066e-05, + "loss": 0.2429, + "step": 3910 + }, + { + "epoch": 0.40532697688879676, + "grad_norm": 0.46993234753608704, + "learning_rate": 2.6957679549856893e-05, + "loss": 0.1988, + "step": 3911 + }, + { + "epoch": 0.4054306145714582, + "grad_norm": 0.5280964970588684, + "learning_rate": 2.695138497866121e-05, + "loss": 0.2176, + "step": 3912 + }, + { + "epoch": 0.4055342522541196, + "grad_norm": 0.5112079381942749, + "learning_rate": 2.694508962419428e-05, + "loss": 0.2213, + "step": 3913 + }, + { + "epoch": 0.405637889936781, + "grad_norm": 0.45652469992637634, + "learning_rate": 2.693879348716544e-05, + "loss": 0.221, + "step": 3914 + }, + { + "epoch": 0.40574152761944243, + "grad_norm": 0.45350202918052673, + "learning_rate": 2.6932496568284138e-05, + "loss": 0.1938, + "step": 3915 + }, + { + "epoch": 0.40584516530210385, + "grad_norm": 0.43645814061164856, + "learning_rate": 2.69261988682599e-05, + "loss": 0.1972, + "step": 3916 + }, + { + "epoch": 0.40594880298476527, + "grad_norm": 0.40848255157470703, + "learning_rate": 2.691990038780234e-05, + "loss": 0.1787, + "step": 3917 + }, + { + "epoch": 0.4060524406674267, + "grad_norm": 0.4718957245349884, + "learning_rate": 2.691360112762116e-05, + "loss": 0.1987, + "step": 3918 + }, + { + "epoch": 0.4061560783500881, + "grad_norm": 0.5582277178764343, + "learning_rate": 2.6907301088426155e-05, + "loss": 0.2202, + "step": 3919 + }, + { + "epoch": 0.4062597160327495, + "grad_norm": 0.46594712138175964, + "learning_rate": 2.6901000270927204e-05, + "loss": 0.2082, + "step": 3920 + }, + { + "epoch": 0.40636335371541094, + "grad_norm": 0.48121878504753113, + "learning_rate": 2.689469867583426e-05, + "loss": 0.2238, + "step": 3921 + }, + { + "epoch": 0.40646699139807235, + "grad_norm": 0.5775296092033386, + "learning_rate": 2.68883963038574e-05, + "loss": 0.263, + "step": 3922 + }, + { + "epoch": 0.40657062908073377, + "grad_norm": 0.4708368182182312, + "learning_rate": 2.6882093155706742e-05, + "loss": 0.2085, + "step": 3923 + }, + { + "epoch": 0.4066742667633952, + "grad_norm": 0.4810127317905426, + "learning_rate": 2.687578923209253e-05, + "loss": 0.2149, + "step": 3924 + }, + { + "epoch": 0.4067779044460566, + "grad_norm": 0.45860347151756287, + "learning_rate": 2.686948453372508e-05, + "loss": 0.1914, + "step": 3925 + }, + { + "epoch": 0.406881542128718, + "grad_norm": 0.5676863789558411, + "learning_rate": 2.6863179061314784e-05, + "loss": 0.218, + "step": 3926 + }, + { + "epoch": 0.40698517981137944, + "grad_norm": 0.44273561239242554, + "learning_rate": 2.6856872815572145e-05, + "loss": 0.1738, + "step": 3927 + }, + { + "epoch": 0.40708881749404086, + "grad_norm": 0.5740095376968384, + "learning_rate": 2.6850565797207733e-05, + "loss": 0.2254, + "step": 3928 + }, + { + "epoch": 0.4071924551767023, + "grad_norm": 0.5443901419639587, + "learning_rate": 2.684425800693222e-05, + "loss": 0.239, + "step": 3929 + }, + { + "epoch": 0.4072960928593637, + "grad_norm": 0.43100705742836, + "learning_rate": 2.6837949445456355e-05, + "loss": 0.1679, + "step": 3930 + }, + { + "epoch": 0.4073997305420251, + "grad_norm": 0.4696880280971527, + "learning_rate": 2.6831640113490965e-05, + "loss": 0.1976, + "step": 3931 + }, + { + "epoch": 0.40750336822468647, + "grad_norm": 0.5075644254684448, + "learning_rate": 2.6825330011747003e-05, + "loss": 0.1958, + "step": 3932 + }, + { + "epoch": 0.4076070059073479, + "grad_norm": 0.4977028965950012, + "learning_rate": 2.6819019140935458e-05, + "loss": 0.1925, + "step": 3933 + }, + { + "epoch": 0.4077106435900093, + "grad_norm": 0.5052193999290466, + "learning_rate": 2.6812707501767438e-05, + "loss": 0.2208, + "step": 3934 + }, + { + "epoch": 0.4078142812726707, + "grad_norm": 0.4574637711048126, + "learning_rate": 2.6806395094954126e-05, + "loss": 0.1852, + "step": 3935 + }, + { + "epoch": 0.40791791895533214, + "grad_norm": 0.550905704498291, + "learning_rate": 2.6800081921206795e-05, + "loss": 0.2398, + "step": 3936 + }, + { + "epoch": 0.40802155663799355, + "grad_norm": 0.4931041896343231, + "learning_rate": 2.6793767981236807e-05, + "loss": 0.2163, + "step": 3937 + }, + { + "epoch": 0.40812519432065497, + "grad_norm": 0.5394712686538696, + "learning_rate": 2.6787453275755603e-05, + "loss": 0.2329, + "step": 3938 + }, + { + "epoch": 0.4082288320033164, + "grad_norm": 0.5719344019889832, + "learning_rate": 2.6781137805474714e-05, + "loss": 0.2535, + "step": 3939 + }, + { + "epoch": 0.4083324696859778, + "grad_norm": 0.4687522053718567, + "learning_rate": 2.6774821571105758e-05, + "loss": 0.1757, + "step": 3940 + }, + { + "epoch": 0.4084361073686392, + "grad_norm": 0.46701177954673767, + "learning_rate": 2.6768504573360438e-05, + "loss": 0.1978, + "step": 3941 + }, + { + "epoch": 0.40853974505130064, + "grad_norm": 0.4797942340373993, + "learning_rate": 2.6762186812950548e-05, + "loss": 0.2064, + "step": 3942 + }, + { + "epoch": 0.40864338273396206, + "grad_norm": 0.5513239502906799, + "learning_rate": 2.6755868290587956e-05, + "loss": 0.2065, + "step": 3943 + }, + { + "epoch": 0.4087470204166235, + "grad_norm": 0.48958635330200195, + "learning_rate": 2.6749549006984633e-05, + "loss": 0.2184, + "step": 3944 + }, + { + "epoch": 0.4088506580992849, + "grad_norm": 0.5532996654510498, + "learning_rate": 2.674322896285262e-05, + "loss": 0.2361, + "step": 3945 + }, + { + "epoch": 0.4089542957819463, + "grad_norm": 0.5579491257667542, + "learning_rate": 2.673690815890404e-05, + "loss": 0.2286, + "step": 3946 + }, + { + "epoch": 0.4090579334646077, + "grad_norm": 0.4894980192184448, + "learning_rate": 2.6730586595851127e-05, + "loss": 0.2012, + "step": 3947 + }, + { + "epoch": 0.40916157114726914, + "grad_norm": 0.4395800232887268, + "learning_rate": 2.6724264274406185e-05, + "loss": 0.202, + "step": 3948 + }, + { + "epoch": 0.40926520882993056, + "grad_norm": 0.4964560270309448, + "learning_rate": 2.6717941195281595e-05, + "loss": 0.1948, + "step": 3949 + }, + { + "epoch": 0.409368846512592, + "grad_norm": 0.5479037165641785, + "learning_rate": 2.6711617359189827e-05, + "loss": 0.2029, + "step": 3950 + }, + { + "epoch": 0.4094724841952534, + "grad_norm": 0.5029044151306152, + "learning_rate": 2.6705292766843455e-05, + "loss": 0.2172, + "step": 3951 + }, + { + "epoch": 0.4095761218779148, + "grad_norm": 0.5871431231498718, + "learning_rate": 2.6698967418955116e-05, + "loss": 0.2156, + "step": 3952 + }, + { + "epoch": 0.40967975956057623, + "grad_norm": 0.4270620048046112, + "learning_rate": 2.669264131623754e-05, + "loss": 0.1749, + "step": 3953 + }, + { + "epoch": 0.40978339724323765, + "grad_norm": 0.4759542644023895, + "learning_rate": 2.668631445940355e-05, + "loss": 0.2114, + "step": 3954 + }, + { + "epoch": 0.40988703492589906, + "grad_norm": 0.4727574288845062, + "learning_rate": 2.6679986849166034e-05, + "loss": 0.2146, + "step": 3955 + }, + { + "epoch": 0.4099906726085605, + "grad_norm": 0.47743672132492065, + "learning_rate": 2.6673658486237986e-05, + "loss": 0.2077, + "step": 3956 + }, + { + "epoch": 0.4100943102912219, + "grad_norm": 0.542067289352417, + "learning_rate": 2.6667329371332482e-05, + "loss": 0.2316, + "step": 3957 + }, + { + "epoch": 0.4101979479738833, + "grad_norm": 0.4978281557559967, + "learning_rate": 2.6660999505162662e-05, + "loss": 0.2187, + "step": 3958 + }, + { + "epoch": 0.41030158565654473, + "grad_norm": 0.5522135496139526, + "learning_rate": 2.6654668888441776e-05, + "loss": 0.2458, + "step": 3959 + }, + { + "epoch": 0.41040522333920615, + "grad_norm": 0.5353384613990784, + "learning_rate": 2.664833752188314e-05, + "loss": 0.2263, + "step": 3960 + }, + { + "epoch": 0.41050886102186757, + "grad_norm": 0.4794256389141083, + "learning_rate": 2.6642005406200166e-05, + "loss": 0.2042, + "step": 3961 + }, + { + "epoch": 0.410612498704529, + "grad_norm": 0.515170693397522, + "learning_rate": 2.663567254210635e-05, + "loss": 0.2041, + "step": 3962 + }, + { + "epoch": 0.4107161363871904, + "grad_norm": 0.6219731569290161, + "learning_rate": 2.662933893031527e-05, + "loss": 0.2692, + "step": 3963 + }, + { + "epoch": 0.4108197740698518, + "grad_norm": 0.5235690474510193, + "learning_rate": 2.6623004571540584e-05, + "loss": 0.2223, + "step": 3964 + }, + { + "epoch": 0.41092341175251323, + "grad_norm": 0.5011698007583618, + "learning_rate": 2.6616669466496037e-05, + "loss": 0.194, + "step": 3965 + }, + { + "epoch": 0.41102704943517465, + "grad_norm": 0.5136545300483704, + "learning_rate": 2.661033361589546e-05, + "loss": 0.1901, + "step": 3966 + }, + { + "epoch": 0.41113068711783607, + "grad_norm": 0.5623643398284912, + "learning_rate": 2.6603997020452773e-05, + "loss": 0.2327, + "step": 3967 + }, + { + "epoch": 0.4112343248004975, + "grad_norm": 0.5664448142051697, + "learning_rate": 2.659765968088196e-05, + "loss": 0.2145, + "step": 3968 + }, + { + "epoch": 0.4113379624831589, + "grad_norm": 0.4725498855113983, + "learning_rate": 2.659132159789711e-05, + "loss": 0.1775, + "step": 3969 + }, + { + "epoch": 0.41144160016582026, + "grad_norm": 0.5574590563774109, + "learning_rate": 2.6584982772212394e-05, + "loss": 0.2156, + "step": 3970 + }, + { + "epoch": 0.4115452378484817, + "grad_norm": 0.46027258038520813, + "learning_rate": 2.6578643204542052e-05, + "loss": 0.1852, + "step": 3971 + }, + { + "epoch": 0.4116488755311431, + "grad_norm": 0.5373114943504333, + "learning_rate": 2.6572302895600422e-05, + "loss": 0.2292, + "step": 3972 + }, + { + "epoch": 0.4117525132138045, + "grad_norm": 0.5287101864814758, + "learning_rate": 2.6565961846101922e-05, + "loss": 0.2131, + "step": 3973 + }, + { + "epoch": 0.41185615089646593, + "grad_norm": 0.5218322277069092, + "learning_rate": 2.6559620056761044e-05, + "loss": 0.1849, + "step": 3974 + }, + { + "epoch": 0.41195978857912735, + "grad_norm": 0.5134084820747375, + "learning_rate": 2.655327752829237e-05, + "loss": 0.2233, + "step": 3975 + }, + { + "epoch": 0.41206342626178877, + "grad_norm": 0.47732385993003845, + "learning_rate": 2.6546934261410588e-05, + "loss": 0.1842, + "step": 3976 + }, + { + "epoch": 0.4121670639444502, + "grad_norm": 0.499118447303772, + "learning_rate": 2.654059025683042e-05, + "loss": 0.2001, + "step": 3977 + }, + { + "epoch": 0.4122707016271116, + "grad_norm": 0.5397217869758606, + "learning_rate": 2.6534245515266707e-05, + "loss": 0.2192, + "step": 3978 + }, + { + "epoch": 0.412374339309773, + "grad_norm": 0.47455599904060364, + "learning_rate": 2.6527900037434368e-05, + "loss": 0.1985, + "step": 3979 + }, + { + "epoch": 0.41247797699243444, + "grad_norm": 0.49492162466049194, + "learning_rate": 2.65215538240484e-05, + "loss": 0.1935, + "step": 3980 + }, + { + "epoch": 0.41258161467509585, + "grad_norm": 0.446444571018219, + "learning_rate": 2.651520687582389e-05, + "loss": 0.1995, + "step": 3981 + }, + { + "epoch": 0.41268525235775727, + "grad_norm": 0.4676581919193268, + "learning_rate": 2.6508859193475994e-05, + "loss": 0.1677, + "step": 3982 + }, + { + "epoch": 0.4127888900404187, + "grad_norm": 0.47199270129203796, + "learning_rate": 2.6502510777719967e-05, + "loss": 0.2, + "step": 3983 + }, + { + "epoch": 0.4128925277230801, + "grad_norm": 0.4684051275253296, + "learning_rate": 2.649616162927113e-05, + "loss": 0.1958, + "step": 3984 + }, + { + "epoch": 0.4129961654057415, + "grad_norm": 0.5107256770133972, + "learning_rate": 2.6489811748844897e-05, + "loss": 0.2066, + "step": 3985 + }, + { + "epoch": 0.41309980308840294, + "grad_norm": 0.5172587633132935, + "learning_rate": 2.6483461137156767e-05, + "loss": 0.1881, + "step": 3986 + }, + { + "epoch": 0.41320344077106436, + "grad_norm": 0.5457533597946167, + "learning_rate": 2.6477109794922317e-05, + "loss": 0.2704, + "step": 3987 + }, + { + "epoch": 0.4133070784537258, + "grad_norm": 0.4835330843925476, + "learning_rate": 2.6470757722857195e-05, + "loss": 0.1947, + "step": 3988 + }, + { + "epoch": 0.4134107161363872, + "grad_norm": 0.5259234309196472, + "learning_rate": 2.6464404921677168e-05, + "loss": 0.2229, + "step": 3989 + }, + { + "epoch": 0.4135143538190486, + "grad_norm": 0.5861340761184692, + "learning_rate": 2.645805139209803e-05, + "loss": 0.2424, + "step": 3990 + }, + { + "epoch": 0.41361799150171, + "grad_norm": 0.48513346910476685, + "learning_rate": 2.64516971348357e-05, + "loss": 0.173, + "step": 3991 + }, + { + "epoch": 0.41372162918437144, + "grad_norm": 0.4559204578399658, + "learning_rate": 2.6445342150606175e-05, + "loss": 0.1826, + "step": 3992 + }, + { + "epoch": 0.41382526686703286, + "grad_norm": 0.5155350565910339, + "learning_rate": 2.6438986440125513e-05, + "loss": 0.2136, + "step": 3993 + }, + { + "epoch": 0.4139289045496943, + "grad_norm": 0.6065822839736938, + "learning_rate": 2.6432630004109862e-05, + "loss": 0.2694, + "step": 3994 + }, + { + "epoch": 0.4140325422323557, + "grad_norm": 0.4986894726753235, + "learning_rate": 2.6426272843275467e-05, + "loss": 0.24, + "step": 3995 + }, + { + "epoch": 0.4141361799150171, + "grad_norm": 0.5501571297645569, + "learning_rate": 2.641991495833864e-05, + "loss": 0.2292, + "step": 3996 + }, + { + "epoch": 0.41423981759767853, + "grad_norm": 0.5160378813743591, + "learning_rate": 2.6413556350015773e-05, + "loss": 0.191, + "step": 3997 + }, + { + "epoch": 0.41434345528033995, + "grad_norm": 0.47425827383995056, + "learning_rate": 2.6407197019023346e-05, + "loss": 0.2144, + "step": 3998 + }, + { + "epoch": 0.41444709296300136, + "grad_norm": 0.5496262311935425, + "learning_rate": 2.6400836966077924e-05, + "loss": 0.234, + "step": 3999 + }, + { + "epoch": 0.4145507306456628, + "grad_norm": 0.5443497896194458, + "learning_rate": 2.639447619189613e-05, + "loss": 0.1911, + "step": 4000 + }, + { + "epoch": 0.4146543683283242, + "grad_norm": 0.4492267668247223, + "learning_rate": 2.6388114697194717e-05, + "loss": 0.1755, + "step": 4001 + }, + { + "epoch": 0.4147580060109856, + "grad_norm": 0.4667634963989258, + "learning_rate": 2.638175248269046e-05, + "loss": 0.1822, + "step": 4002 + }, + { + "epoch": 0.41486164369364703, + "grad_norm": 0.48646092414855957, + "learning_rate": 2.6375389549100253e-05, + "loss": 0.2004, + "step": 4003 + }, + { + "epoch": 0.41496528137630845, + "grad_norm": 0.46625807881355286, + "learning_rate": 2.6369025897141065e-05, + "loss": 0.2038, + "step": 4004 + }, + { + "epoch": 0.41506891905896987, + "grad_norm": 0.46255430579185486, + "learning_rate": 2.6362661527529935e-05, + "loss": 0.2021, + "step": 4005 + }, + { + "epoch": 0.4151725567416313, + "grad_norm": 0.5445932745933533, + "learning_rate": 2.6356296440984003e-05, + "loss": 0.2351, + "step": 4006 + }, + { + "epoch": 0.4152761944242927, + "grad_norm": 0.5121250748634338, + "learning_rate": 2.6349930638220463e-05, + "loss": 0.193, + "step": 4007 + }, + { + "epoch": 0.41537983210695406, + "grad_norm": 0.48741304874420166, + "learning_rate": 2.6343564119956617e-05, + "loss": 0.2042, + "step": 4008 + }, + { + "epoch": 0.4154834697896155, + "grad_norm": 0.4352017641067505, + "learning_rate": 2.6337196886909823e-05, + "loss": 0.1824, + "step": 4009 + }, + { + "epoch": 0.4155871074722769, + "grad_norm": 0.5594912171363831, + "learning_rate": 2.633082893979753e-05, + "loss": 0.242, + "step": 4010 + }, + { + "epoch": 0.4156907451549383, + "grad_norm": 0.5285469889640808, + "learning_rate": 2.6324460279337282e-05, + "loss": 0.2176, + "step": 4011 + }, + { + "epoch": 0.41579438283759973, + "grad_norm": 0.4648481607437134, + "learning_rate": 2.6318090906246677e-05, + "loss": 0.1913, + "step": 4012 + }, + { + "epoch": 0.41589802052026115, + "grad_norm": 0.4458993673324585, + "learning_rate": 2.631172082124341e-05, + "loss": 0.2024, + "step": 4013 + }, + { + "epoch": 0.41600165820292256, + "grad_norm": 0.5384868383407593, + "learning_rate": 2.6305350025045257e-05, + "loss": 0.235, + "step": 4014 + }, + { + "epoch": 0.416105295885584, + "grad_norm": 0.45828738808631897, + "learning_rate": 2.629897851837006e-05, + "loss": 0.2207, + "step": 4015 + }, + { + "epoch": 0.4162089335682454, + "grad_norm": 0.4959215223789215, + "learning_rate": 2.6292606301935752e-05, + "loss": 0.2176, + "step": 4016 + }, + { + "epoch": 0.4163125712509068, + "grad_norm": 0.49704524874687195, + "learning_rate": 2.628623337646036e-05, + "loss": 0.2098, + "step": 4017 + }, + { + "epoch": 0.41641620893356823, + "grad_norm": 0.4785382151603699, + "learning_rate": 2.6279859742661954e-05, + "loss": 0.2164, + "step": 4018 + }, + { + "epoch": 0.41651984661622965, + "grad_norm": 0.5414184331893921, + "learning_rate": 2.627348540125872e-05, + "loss": 0.2271, + "step": 4019 + }, + { + "epoch": 0.41662348429889107, + "grad_norm": 0.45238590240478516, + "learning_rate": 2.6267110352968894e-05, + "loss": 0.1869, + "step": 4020 + }, + { + "epoch": 0.4167271219815525, + "grad_norm": 0.49890977144241333, + "learning_rate": 2.626073459851082e-05, + "loss": 0.2091, + "step": 4021 + }, + { + "epoch": 0.4168307596642139, + "grad_norm": 0.5310577154159546, + "learning_rate": 2.625435813860291e-05, + "loss": 0.249, + "step": 4022 + }, + { + "epoch": 0.4169343973468753, + "grad_norm": 0.5554194450378418, + "learning_rate": 2.6247980973963642e-05, + "loss": 0.2236, + "step": 4023 + }, + { + "epoch": 0.41703803502953674, + "grad_norm": 0.5059272646903992, + "learning_rate": 2.6241603105311594e-05, + "loss": 0.2097, + "step": 4024 + }, + { + "epoch": 0.41714167271219815, + "grad_norm": 0.522890567779541, + "learning_rate": 2.6235224533365403e-05, + "loss": 0.2178, + "step": 4025 + }, + { + "epoch": 0.41724531039485957, + "grad_norm": 0.5513536930084229, + "learning_rate": 2.622884525884381e-05, + "loss": 0.2594, + "step": 4026 + }, + { + "epoch": 0.417348948077521, + "grad_norm": 0.4362293481826782, + "learning_rate": 2.622246528246562e-05, + "loss": 0.2008, + "step": 4027 + }, + { + "epoch": 0.4174525857601824, + "grad_norm": 0.4287848174571991, + "learning_rate": 2.6216084604949715e-05, + "loss": 0.1771, + "step": 4028 + }, + { + "epoch": 0.4175562234428438, + "grad_norm": 0.4799305200576782, + "learning_rate": 2.6209703227015054e-05, + "loss": 0.1934, + "step": 4029 + }, + { + "epoch": 0.41765986112550524, + "grad_norm": 0.5041770339012146, + "learning_rate": 2.6203321149380685e-05, + "loss": 0.2348, + "step": 4030 + }, + { + "epoch": 0.41776349880816666, + "grad_norm": 0.4590016007423401, + "learning_rate": 2.6196938372765736e-05, + "loss": 0.1825, + "step": 4031 + }, + { + "epoch": 0.4178671364908281, + "grad_norm": 0.4430880546569824, + "learning_rate": 2.6190554897889406e-05, + "loss": 0.1719, + "step": 4032 + }, + { + "epoch": 0.4179707741734895, + "grad_norm": 0.5018654465675354, + "learning_rate": 2.6184170725470974e-05, + "loss": 0.2149, + "step": 4033 + }, + { + "epoch": 0.4180744118561509, + "grad_norm": 0.4524014890193939, + "learning_rate": 2.6177785856229795e-05, + "loss": 0.2038, + "step": 4034 + }, + { + "epoch": 0.4181780495388123, + "grad_norm": 0.4818570017814636, + "learning_rate": 2.6171400290885305e-05, + "loss": 0.1997, + "step": 4035 + }, + { + "epoch": 0.41828168722147374, + "grad_norm": 0.5182315111160278, + "learning_rate": 2.6165014030157032e-05, + "loss": 0.199, + "step": 4036 + }, + { + "epoch": 0.41838532490413516, + "grad_norm": 0.509839653968811, + "learning_rate": 2.6158627074764564e-05, + "loss": 0.199, + "step": 4037 + }, + { + "epoch": 0.4184889625867966, + "grad_norm": 0.5443018078804016, + "learning_rate": 2.6152239425427563e-05, + "loss": 0.2108, + "step": 4038 + }, + { + "epoch": 0.418592600269458, + "grad_norm": 0.5486571788787842, + "learning_rate": 2.6145851082865788e-05, + "loss": 0.2207, + "step": 4039 + }, + { + "epoch": 0.4186962379521194, + "grad_norm": 0.48583099246025085, + "learning_rate": 2.6139462047799067e-05, + "loss": 0.22, + "step": 4040 + }, + { + "epoch": 0.4187998756347808, + "grad_norm": 0.5152609348297119, + "learning_rate": 2.613307232094731e-05, + "loss": 0.226, + "step": 4041 + }, + { + "epoch": 0.41890351331744224, + "grad_norm": 0.558648943901062, + "learning_rate": 2.612668190303049e-05, + "loss": 0.2448, + "step": 4042 + }, + { + "epoch": 0.41900715100010366, + "grad_norm": 0.4209331274032593, + "learning_rate": 2.6120290794768694e-05, + "loss": 0.1843, + "step": 4043 + }, + { + "epoch": 0.4191107886827651, + "grad_norm": 0.5400482416152954, + "learning_rate": 2.611389899688203e-05, + "loss": 0.2258, + "step": 4044 + }, + { + "epoch": 0.4192144263654265, + "grad_norm": 0.4837553799152374, + "learning_rate": 2.6107506510090735e-05, + "loss": 0.2076, + "step": 4045 + }, + { + "epoch": 0.41931806404808786, + "grad_norm": 0.4797718822956085, + "learning_rate": 2.6101113335115106e-05, + "loss": 0.2159, + "step": 4046 + }, + { + "epoch": 0.4194217017307493, + "grad_norm": 0.5106173157691956, + "learning_rate": 2.6094719472675506e-05, + "loss": 0.2231, + "step": 4047 + }, + { + "epoch": 0.4195253394134107, + "grad_norm": 0.4600318968296051, + "learning_rate": 2.608832492349239e-05, + "loss": 0.1843, + "step": 4048 + }, + { + "epoch": 0.4196289770960721, + "grad_norm": 0.5094060897827148, + "learning_rate": 2.608192968828629e-05, + "loss": 0.1999, + "step": 4049 + }, + { + "epoch": 0.4197326147787335, + "grad_norm": 0.425560861825943, + "learning_rate": 2.60755337677778e-05, + "loss": 0.1894, + "step": 4050 + }, + { + "epoch": 0.41983625246139494, + "grad_norm": 0.516232430934906, + "learning_rate": 2.6069137162687614e-05, + "loss": 0.2042, + "step": 4051 + }, + { + "epoch": 0.41993989014405636, + "grad_norm": 0.48487234115600586, + "learning_rate": 2.606273987373649e-05, + "loss": 0.2229, + "step": 4052 + }, + { + "epoch": 0.4200435278267178, + "grad_norm": 0.5296849012374878, + "learning_rate": 2.6056341901645263e-05, + "loss": 0.2178, + "step": 4053 + }, + { + "epoch": 0.4201471655093792, + "grad_norm": 0.5496042370796204, + "learning_rate": 2.6049943247134836e-05, + "loss": 0.2556, + "step": 4054 + }, + { + "epoch": 0.4202508031920406, + "grad_norm": 0.5170057415962219, + "learning_rate": 2.6043543910926214e-05, + "loss": 0.2248, + "step": 4055 + }, + { + "epoch": 0.42035444087470203, + "grad_norm": 0.5637809038162231, + "learning_rate": 2.603714389374046e-05, + "loss": 0.2415, + "step": 4056 + }, + { + "epoch": 0.42045807855736345, + "grad_norm": 0.4687712788581848, + "learning_rate": 2.6030743196298716e-05, + "loss": 0.2121, + "step": 4057 + }, + { + "epoch": 0.42056171624002486, + "grad_norm": 0.5689849257469177, + "learning_rate": 2.6024341819322203e-05, + "loss": 0.258, + "step": 4058 + }, + { + "epoch": 0.4206653539226863, + "grad_norm": 0.5509251952171326, + "learning_rate": 2.601793976353222e-05, + "loss": 0.2148, + "step": 4059 + }, + { + "epoch": 0.4207689916053477, + "grad_norm": 0.586294412612915, + "learning_rate": 2.6011537029650135e-05, + "loss": 0.2386, + "step": 4060 + }, + { + "epoch": 0.4208726292880091, + "grad_norm": 0.49938663840293884, + "learning_rate": 2.600513361839741e-05, + "loss": 0.19, + "step": 4061 + }, + { + "epoch": 0.42097626697067053, + "grad_norm": 0.3958568871021271, + "learning_rate": 2.5998729530495564e-05, + "loss": 0.1808, + "step": 4062 + }, + { + "epoch": 0.42107990465333195, + "grad_norm": 0.5225630402565002, + "learning_rate": 2.5992324766666194e-05, + "loss": 0.2262, + "step": 4063 + }, + { + "epoch": 0.42118354233599337, + "grad_norm": 0.5840221643447876, + "learning_rate": 2.598591932763099e-05, + "loss": 0.2594, + "step": 4064 + }, + { + "epoch": 0.4212871800186548, + "grad_norm": 0.44575127959251404, + "learning_rate": 2.5979513214111697e-05, + "loss": 0.1738, + "step": 4065 + }, + { + "epoch": 0.4213908177013162, + "grad_norm": 0.5635486841201782, + "learning_rate": 2.5973106426830148e-05, + "loss": 0.2284, + "step": 4066 + }, + { + "epoch": 0.4214944553839776, + "grad_norm": 0.4605032205581665, + "learning_rate": 2.5966698966508257e-05, + "loss": 0.1794, + "step": 4067 + }, + { + "epoch": 0.42159809306663903, + "grad_norm": 0.4816349446773529, + "learning_rate": 2.5960290833868004e-05, + "loss": 0.1917, + "step": 4068 + }, + { + "epoch": 0.42170173074930045, + "grad_norm": 0.5689332485198975, + "learning_rate": 2.595388202963144e-05, + "loss": 0.2646, + "step": 4069 + }, + { + "epoch": 0.42180536843196187, + "grad_norm": 0.3935214877128601, + "learning_rate": 2.5947472554520702e-05, + "loss": 0.1494, + "step": 4070 + }, + { + "epoch": 0.4219090061146233, + "grad_norm": 0.536814272403717, + "learning_rate": 2.5941062409258013e-05, + "loss": 0.232, + "step": 4071 + }, + { + "epoch": 0.4220126437972847, + "grad_norm": 0.43292945623397827, + "learning_rate": 2.593465159456564e-05, + "loss": 0.1705, + "step": 4072 + }, + { + "epoch": 0.4221162814799461, + "grad_norm": 0.44623786211013794, + "learning_rate": 2.5928240111165952e-05, + "loss": 0.2061, + "step": 4073 + }, + { + "epoch": 0.42221991916260754, + "grad_norm": 0.4869650900363922, + "learning_rate": 2.592182795978138e-05, + "loss": 0.2111, + "step": 4074 + }, + { + "epoch": 0.42232355684526895, + "grad_norm": 0.5473789572715759, + "learning_rate": 2.5915415141134443e-05, + "loss": 0.2491, + "step": 4075 + }, + { + "epoch": 0.42242719452793037, + "grad_norm": 0.5232890248298645, + "learning_rate": 2.5909001655947723e-05, + "loss": 0.2273, + "step": 4076 + }, + { + "epoch": 0.4225308322105918, + "grad_norm": 0.47419649362564087, + "learning_rate": 2.590258750494388e-05, + "loss": 0.1944, + "step": 4077 + }, + { + "epoch": 0.4226344698932532, + "grad_norm": 0.5294226408004761, + "learning_rate": 2.589617268884566e-05, + "loss": 0.2216, + "step": 4078 + }, + { + "epoch": 0.4227381075759146, + "grad_norm": 0.5194813013076782, + "learning_rate": 2.5889757208375858e-05, + "loss": 0.1979, + "step": 4079 + }, + { + "epoch": 0.42284174525857604, + "grad_norm": 0.4927207827568054, + "learning_rate": 2.588334106425738e-05, + "loss": 0.2037, + "step": 4080 + }, + { + "epoch": 0.42294538294123746, + "grad_norm": 0.48588693141937256, + "learning_rate": 2.587692425721317e-05, + "loss": 0.2071, + "step": 4081 + }, + { + "epoch": 0.4230490206238989, + "grad_norm": 0.5707266330718994, + "learning_rate": 2.5870506787966272e-05, + "loss": 0.2475, + "step": 4082 + }, + { + "epoch": 0.4231526583065603, + "grad_norm": 0.43110227584838867, + "learning_rate": 2.5864088657239795e-05, + "loss": 0.1909, + "step": 4083 + }, + { + "epoch": 0.42325629598922165, + "grad_norm": 0.453917533159256, + "learning_rate": 2.585766986575692e-05, + "loss": 0.1965, + "step": 4084 + }, + { + "epoch": 0.42335993367188307, + "grad_norm": 0.4615586996078491, + "learning_rate": 2.5851250414240915e-05, + "loss": 0.176, + "step": 4085 + }, + { + "epoch": 0.4234635713545445, + "grad_norm": 0.5152726173400879, + "learning_rate": 2.5844830303415105e-05, + "loss": 0.183, + "step": 4086 + }, + { + "epoch": 0.4235672090372059, + "grad_norm": 0.5132525563240051, + "learning_rate": 2.5838409534002907e-05, + "loss": 0.2253, + "step": 4087 + }, + { + "epoch": 0.4236708467198673, + "grad_norm": 0.5109318494796753, + "learning_rate": 2.58319881067278e-05, + "loss": 0.2057, + "step": 4088 + }, + { + "epoch": 0.42377448440252874, + "grad_norm": 0.4559604227542877, + "learning_rate": 2.5825566022313327e-05, + "loss": 0.1812, + "step": 4089 + }, + { + "epoch": 0.42387812208519016, + "grad_norm": 0.4541739523410797, + "learning_rate": 2.581914328148315e-05, + "loss": 0.1889, + "step": 4090 + }, + { + "epoch": 0.4239817597678516, + "grad_norm": 0.6262094974517822, + "learning_rate": 2.581271988496094e-05, + "loss": 0.2568, + "step": 4091 + }, + { + "epoch": 0.424085397450513, + "grad_norm": 0.5760964751243591, + "learning_rate": 2.5806295833470493e-05, + "loss": 0.2161, + "step": 4092 + }, + { + "epoch": 0.4241890351331744, + "grad_norm": 0.5587107539176941, + "learning_rate": 2.5799871127735663e-05, + "loss": 0.2343, + "step": 4093 + }, + { + "epoch": 0.4242926728158358, + "grad_norm": 0.4467000663280487, + "learning_rate": 2.579344576848036e-05, + "loss": 0.1805, + "step": 4094 + }, + { + "epoch": 0.42439631049849724, + "grad_norm": 0.5288705825805664, + "learning_rate": 2.57870197564286e-05, + "loss": 0.2407, + "step": 4095 + }, + { + "epoch": 0.42449994818115866, + "grad_norm": 0.5100021958351135, + "learning_rate": 2.5780593092304452e-05, + "loss": 0.2091, + "step": 4096 + }, + { + "epoch": 0.4246035858638201, + "grad_norm": 0.4132768213748932, + "learning_rate": 2.5774165776832058e-05, + "loss": 0.1696, + "step": 4097 + }, + { + "epoch": 0.4247072235464815, + "grad_norm": 0.5586245656013489, + "learning_rate": 2.5767737810735636e-05, + "loss": 0.2356, + "step": 4098 + }, + { + "epoch": 0.4248108612291429, + "grad_norm": 0.5645946264266968, + "learning_rate": 2.5761309194739486e-05, + "loss": 0.2492, + "step": 4099 + }, + { + "epoch": 0.42491449891180433, + "grad_norm": 0.4798961281776428, + "learning_rate": 2.575487992956798e-05, + "loss": 0.1909, + "step": 4100 + }, + { + "epoch": 0.42501813659446575, + "grad_norm": 0.5436182618141174, + "learning_rate": 2.574845001594554e-05, + "loss": 0.2442, + "step": 4101 + }, + { + "epoch": 0.42512177427712716, + "grad_norm": 0.5537167191505432, + "learning_rate": 2.5742019454596688e-05, + "loss": 0.2371, + "step": 4102 + }, + { + "epoch": 0.4252254119597886, + "grad_norm": 0.4842146039009094, + "learning_rate": 2.573558824624602e-05, + "loss": 0.1927, + "step": 4103 + }, + { + "epoch": 0.42532904964245, + "grad_norm": 0.48249712586402893, + "learning_rate": 2.5729156391618172e-05, + "loss": 0.1799, + "step": 4104 + }, + { + "epoch": 0.4254326873251114, + "grad_norm": 0.5177933573722839, + "learning_rate": 2.5722723891437894e-05, + "loss": 0.2161, + "step": 4105 + }, + { + "epoch": 0.42553632500777283, + "grad_norm": 0.5344215035438538, + "learning_rate": 2.571629074642999e-05, + "loss": 0.2228, + "step": 4106 + }, + { + "epoch": 0.42563996269043425, + "grad_norm": 0.5359126925468445, + "learning_rate": 2.5709856957319323e-05, + "loss": 0.2086, + "step": 4107 + }, + { + "epoch": 0.42574360037309567, + "grad_norm": 0.45025259256362915, + "learning_rate": 2.570342252483085e-05, + "loss": 0.1903, + "step": 4108 + }, + { + "epoch": 0.4258472380557571, + "grad_norm": 0.4615389108657837, + "learning_rate": 2.5696987449689594e-05, + "loss": 0.1903, + "step": 4109 + }, + { + "epoch": 0.4259508757384185, + "grad_norm": 0.5702045559883118, + "learning_rate": 2.569055173262065e-05, + "loss": 0.2017, + "step": 4110 + }, + { + "epoch": 0.4260545134210799, + "grad_norm": 0.4639468193054199, + "learning_rate": 2.5684115374349184e-05, + "loss": 0.1703, + "step": 4111 + }, + { + "epoch": 0.42615815110374133, + "grad_norm": 0.4748694598674774, + "learning_rate": 2.5677678375600436e-05, + "loss": 0.1906, + "step": 4112 + }, + { + "epoch": 0.42626178878640275, + "grad_norm": 0.5352087616920471, + "learning_rate": 2.567124073709971e-05, + "loss": 0.2314, + "step": 4113 + }, + { + "epoch": 0.42636542646906417, + "grad_norm": 0.5662137866020203, + "learning_rate": 2.566480245957239e-05, + "loss": 0.2385, + "step": 4114 + }, + { + "epoch": 0.4264690641517256, + "grad_norm": 0.495313435792923, + "learning_rate": 2.5658363543743944e-05, + "loss": 0.2215, + "step": 4115 + }, + { + "epoch": 0.426572701834387, + "grad_norm": 0.4843652844429016, + "learning_rate": 2.5651923990339884e-05, + "loss": 0.1973, + "step": 4116 + }, + { + "epoch": 0.4266763395170484, + "grad_norm": 0.5149955153465271, + "learning_rate": 2.5645483800085815e-05, + "loss": 0.1962, + "step": 4117 + }, + { + "epoch": 0.42677997719970984, + "grad_norm": 0.41277220845222473, + "learning_rate": 2.563904297370741e-05, + "loss": 0.1654, + "step": 4118 + }, + { + "epoch": 0.42688361488237125, + "grad_norm": 0.47097402811050415, + "learning_rate": 2.5632601511930405e-05, + "loss": 0.1941, + "step": 4119 + }, + { + "epoch": 0.42698725256503267, + "grad_norm": 0.390764981508255, + "learning_rate": 2.562615941548062e-05, + "loss": 0.1509, + "step": 4120 + }, + { + "epoch": 0.4270908902476941, + "grad_norm": 0.5125734210014343, + "learning_rate": 2.561971668508394e-05, + "loss": 0.208, + "step": 4121 + }, + { + "epoch": 0.42719452793035545, + "grad_norm": 0.5046721696853638, + "learning_rate": 2.5613273321466324e-05, + "loss": 0.1955, + "step": 4122 + }, + { + "epoch": 0.42729816561301687, + "grad_norm": 0.5146350860595703, + "learning_rate": 2.5606829325353788e-05, + "loss": 0.202, + "step": 4123 + }, + { + "epoch": 0.4274018032956783, + "grad_norm": 0.5878974795341492, + "learning_rate": 2.560038469747244e-05, + "loss": 0.2564, + "step": 4124 + }, + { + "epoch": 0.4275054409783397, + "grad_norm": 0.5669657588005066, + "learning_rate": 2.5593939438548455e-05, + "loss": 0.2733, + "step": 4125 + }, + { + "epoch": 0.4276090786610011, + "grad_norm": 0.5380009412765503, + "learning_rate": 2.558749354930807e-05, + "loss": 0.199, + "step": 4126 + }, + { + "epoch": 0.42771271634366254, + "grad_norm": 0.4734484553337097, + "learning_rate": 2.558104703047759e-05, + "loss": 0.2015, + "step": 4127 + }, + { + "epoch": 0.42781635402632395, + "grad_norm": 0.43804338574409485, + "learning_rate": 2.5574599882783417e-05, + "loss": 0.2008, + "step": 4128 + }, + { + "epoch": 0.42791999170898537, + "grad_norm": 0.507434070110321, + "learning_rate": 2.5568152106951986e-05, + "loss": 0.1931, + "step": 4129 + }, + { + "epoch": 0.4280236293916468, + "grad_norm": 0.5040130019187927, + "learning_rate": 2.5561703703709837e-05, + "loss": 0.2176, + "step": 4130 + }, + { + "epoch": 0.4281272670743082, + "grad_norm": 0.5868910551071167, + "learning_rate": 2.555525467378356e-05, + "loss": 0.264, + "step": 4131 + }, + { + "epoch": 0.4282309047569696, + "grad_norm": 0.5386139750480652, + "learning_rate": 2.554880501789982e-05, + "loss": 0.2438, + "step": 4132 + }, + { + "epoch": 0.42833454243963104, + "grad_norm": 0.5160743594169617, + "learning_rate": 2.554235473678536e-05, + "loss": 0.2014, + "step": 4133 + }, + { + "epoch": 0.42843818012229246, + "grad_norm": 0.4884987771511078, + "learning_rate": 2.553590383116698e-05, + "loss": 0.2044, + "step": 4134 + }, + { + "epoch": 0.4285418178049539, + "grad_norm": 0.5191545486450195, + "learning_rate": 2.5529452301771563e-05, + "loss": 0.2101, + "step": 4135 + }, + { + "epoch": 0.4286454554876153, + "grad_norm": 0.495727002620697, + "learning_rate": 2.5523000149326053e-05, + "loss": 0.1947, + "step": 4136 + }, + { + "epoch": 0.4287490931702767, + "grad_norm": 0.4700315296649933, + "learning_rate": 2.551654737455748e-05, + "loss": 0.1862, + "step": 4137 + }, + { + "epoch": 0.4288527308529381, + "grad_norm": 0.4800029397010803, + "learning_rate": 2.5510093978192922e-05, + "loss": 0.1945, + "step": 4138 + }, + { + "epoch": 0.42895636853559954, + "grad_norm": 0.4926530718803406, + "learning_rate": 2.5503639960959534e-05, + "loss": 0.2111, + "step": 4139 + }, + { + "epoch": 0.42906000621826096, + "grad_norm": 0.4847051203250885, + "learning_rate": 2.5497185323584556e-05, + "loss": 0.1836, + "step": 4140 + }, + { + "epoch": 0.4291636439009224, + "grad_norm": 0.4954644739627838, + "learning_rate": 2.5490730066795282e-05, + "loss": 0.194, + "step": 4141 + }, + { + "epoch": 0.4292672815835838, + "grad_norm": 0.5430189967155457, + "learning_rate": 2.548427419131908e-05, + "loss": 0.208, + "step": 4142 + }, + { + "epoch": 0.4293709192662452, + "grad_norm": 0.5013940334320068, + "learning_rate": 2.5477817697883383e-05, + "loss": 0.2248, + "step": 4143 + }, + { + "epoch": 0.4294745569489066, + "grad_norm": 0.4881177544593811, + "learning_rate": 2.5471360587215706e-05, + "loss": 0.2098, + "step": 4144 + }, + { + "epoch": 0.42957819463156804, + "grad_norm": 0.439849317073822, + "learning_rate": 2.546490286004362e-05, + "loss": 0.1732, + "step": 4145 + }, + { + "epoch": 0.42968183231422946, + "grad_norm": 0.5770775675773621, + "learning_rate": 2.5458444517094777e-05, + "loss": 0.2279, + "step": 4146 + }, + { + "epoch": 0.4297854699968909, + "grad_norm": 0.4989546239376068, + "learning_rate": 2.5451985559096903e-05, + "loss": 0.2004, + "step": 4147 + }, + { + "epoch": 0.4298891076795523, + "grad_norm": 0.5036958456039429, + "learning_rate": 2.5445525986777755e-05, + "loss": 0.2017, + "step": 4148 + }, + { + "epoch": 0.4299927453622137, + "grad_norm": 0.562789797782898, + "learning_rate": 2.5439065800865206e-05, + "loss": 0.2267, + "step": 4149 + }, + { + "epoch": 0.43009638304487513, + "grad_norm": 0.5571634769439697, + "learning_rate": 2.543260500208719e-05, + "loss": 0.2299, + "step": 4150 + }, + { + "epoch": 0.43020002072753655, + "grad_norm": 0.6189036965370178, + "learning_rate": 2.5426143591171678e-05, + "loss": 0.2534, + "step": 4151 + }, + { + "epoch": 0.43030365841019796, + "grad_norm": 0.5095949172973633, + "learning_rate": 2.5419681568846742e-05, + "loss": 0.2192, + "step": 4152 + }, + { + "epoch": 0.4304072960928594, + "grad_norm": 0.4679073691368103, + "learning_rate": 2.5413218935840508e-05, + "loss": 0.2048, + "step": 4153 + }, + { + "epoch": 0.4305109337755208, + "grad_norm": 0.5005534887313843, + "learning_rate": 2.5406755692881183e-05, + "loss": 0.2092, + "step": 4154 + }, + { + "epoch": 0.4306145714581822, + "grad_norm": 0.5270726084709167, + "learning_rate": 2.5400291840697032e-05, + "loss": 0.2268, + "step": 4155 + }, + { + "epoch": 0.43071820914084363, + "grad_norm": 0.49191877245903015, + "learning_rate": 2.5393827380016397e-05, + "loss": 0.241, + "step": 4156 + }, + { + "epoch": 0.43082184682350505, + "grad_norm": 0.5938627123832703, + "learning_rate": 2.538736231156767e-05, + "loss": 0.247, + "step": 4157 + }, + { + "epoch": 0.43092548450616647, + "grad_norm": 0.4768686592578888, + "learning_rate": 2.538089663607933e-05, + "loss": 0.1919, + "step": 4158 + }, + { + "epoch": 0.4310291221888279, + "grad_norm": 0.4200623035430908, + "learning_rate": 2.5374430354279934e-05, + "loss": 0.1469, + "step": 4159 + }, + { + "epoch": 0.43113275987148925, + "grad_norm": 0.4872795343399048, + "learning_rate": 2.5367963466898073e-05, + "loss": 0.218, + "step": 4160 + }, + { + "epoch": 0.43123639755415066, + "grad_norm": 0.5187094211578369, + "learning_rate": 2.536149597466243e-05, + "loss": 0.2321, + "step": 4161 + }, + { + "epoch": 0.4313400352368121, + "grad_norm": 0.5779088139533997, + "learning_rate": 2.5355027878301756e-05, + "loss": 0.2624, + "step": 4162 + }, + { + "epoch": 0.4314436729194735, + "grad_norm": 0.48629242181777954, + "learning_rate": 2.5348559178544866e-05, + "loss": 0.1804, + "step": 4163 + }, + { + "epoch": 0.4315473106021349, + "grad_norm": 0.3519076704978943, + "learning_rate": 2.5342089876120647e-05, + "loss": 0.1498, + "step": 4164 + }, + { + "epoch": 0.43165094828479633, + "grad_norm": 0.40314781665802, + "learning_rate": 2.533561997175804e-05, + "loss": 0.1749, + "step": 4165 + }, + { + "epoch": 0.43175458596745775, + "grad_norm": 0.5147646069526672, + "learning_rate": 2.5329149466186075e-05, + "loss": 0.2372, + "step": 4166 + }, + { + "epoch": 0.43185822365011917, + "grad_norm": 0.4711988568305969, + "learning_rate": 2.532267836013383e-05, + "loss": 0.1937, + "step": 4167 + }, + { + "epoch": 0.4319618613327806, + "grad_norm": 0.5647103786468506, + "learning_rate": 2.531620665433045e-05, + "loss": 0.2275, + "step": 4168 + }, + { + "epoch": 0.432065499015442, + "grad_norm": 0.4511049687862396, + "learning_rate": 2.5309734349505183e-05, + "loss": 0.19, + "step": 4169 + }, + { + "epoch": 0.4321691366981034, + "grad_norm": 0.5962859988212585, + "learning_rate": 2.53032614463873e-05, + "loss": 0.2334, + "step": 4170 + }, + { + "epoch": 0.43227277438076483, + "grad_norm": 0.5469899773597717, + "learning_rate": 2.5296787945706162e-05, + "loss": 0.2063, + "step": 4171 + }, + { + "epoch": 0.43237641206342625, + "grad_norm": 0.47863686084747314, + "learning_rate": 2.5290313848191193e-05, + "loss": 0.192, + "step": 4172 + }, + { + "epoch": 0.43248004974608767, + "grad_norm": 0.6556797623634338, + "learning_rate": 2.5283839154571878e-05, + "loss": 0.2346, + "step": 4173 + }, + { + "epoch": 0.4325836874287491, + "grad_norm": 0.5168253183364868, + "learning_rate": 2.5277363865577783e-05, + "loss": 0.2092, + "step": 4174 + }, + { + "epoch": 0.4326873251114105, + "grad_norm": 0.5221461653709412, + "learning_rate": 2.5270887981938533e-05, + "loss": 0.2228, + "step": 4175 + }, + { + "epoch": 0.4327909627940719, + "grad_norm": 0.5210689306259155, + "learning_rate": 2.5264411504383822e-05, + "loss": 0.2144, + "step": 4176 + }, + { + "epoch": 0.43289460047673334, + "grad_norm": 0.5394599437713623, + "learning_rate": 2.5257934433643404e-05, + "loss": 0.2284, + "step": 4177 + }, + { + "epoch": 0.43299823815939475, + "grad_norm": 0.5580217242240906, + "learning_rate": 2.5251456770447105e-05, + "loss": 0.2218, + "step": 4178 + }, + { + "epoch": 0.43310187584205617, + "grad_norm": 0.5175135135650635, + "learning_rate": 2.5244978515524824e-05, + "loss": 0.222, + "step": 4179 + }, + { + "epoch": 0.4332055135247176, + "grad_norm": 0.47659069299697876, + "learning_rate": 2.523849966960651e-05, + "loss": 0.2125, + "step": 4180 + }, + { + "epoch": 0.433309151207379, + "grad_norm": 0.4961530864238739, + "learning_rate": 2.5232020233422202e-05, + "loss": 0.226, + "step": 4181 + }, + { + "epoch": 0.4334127888900404, + "grad_norm": 0.5616767406463623, + "learning_rate": 2.5225540207701996e-05, + "loss": 0.2508, + "step": 4182 + }, + { + "epoch": 0.43351642657270184, + "grad_norm": 0.5131274461746216, + "learning_rate": 2.5219059593176026e-05, + "loss": 0.1833, + "step": 4183 + }, + { + "epoch": 0.43362006425536326, + "grad_norm": 0.5015151500701904, + "learning_rate": 2.5212578390574542e-05, + "loss": 0.2318, + "step": 4184 + }, + { + "epoch": 0.4337237019380247, + "grad_norm": 0.5001304745674133, + "learning_rate": 2.5206096600627832e-05, + "loss": 0.2145, + "step": 4185 + }, + { + "epoch": 0.4338273396206861, + "grad_norm": 0.44869324564933777, + "learning_rate": 2.519961422406625e-05, + "loss": 0.2037, + "step": 4186 + }, + { + "epoch": 0.4339309773033475, + "grad_norm": 0.5340890884399414, + "learning_rate": 2.5193131261620213e-05, + "loss": 0.2216, + "step": 4187 + }, + { + "epoch": 0.4340346149860089, + "grad_norm": 0.4965475797653198, + "learning_rate": 2.518664771402022e-05, + "loss": 0.2003, + "step": 4188 + }, + { + "epoch": 0.43413825266867034, + "grad_norm": 0.45350730419158936, + "learning_rate": 2.5180163581996828e-05, + "loss": 0.1868, + "step": 4189 + }, + { + "epoch": 0.43424189035133176, + "grad_norm": 0.4761691987514496, + "learning_rate": 2.5173678866280655e-05, + "loss": 0.231, + "step": 4190 + }, + { + "epoch": 0.4343455280339932, + "grad_norm": 0.4665203094482422, + "learning_rate": 2.5167193567602395e-05, + "loss": 0.189, + "step": 4191 + }, + { + "epoch": 0.4344491657166546, + "grad_norm": 0.5481677055358887, + "learning_rate": 2.5160707686692796e-05, + "loss": 0.2258, + "step": 4192 + }, + { + "epoch": 0.434552803399316, + "grad_norm": 0.5124077796936035, + "learning_rate": 2.5154221224282664e-05, + "loss": 0.1925, + "step": 4193 + }, + { + "epoch": 0.43465644108197743, + "grad_norm": 0.45199984312057495, + "learning_rate": 2.5147734181102915e-05, + "loss": 0.1949, + "step": 4194 + }, + { + "epoch": 0.43476007876463885, + "grad_norm": 0.434619277715683, + "learning_rate": 2.5141246557884466e-05, + "loss": 0.1763, + "step": 4195 + }, + { + "epoch": 0.43486371644730026, + "grad_norm": 0.4808880388736725, + "learning_rate": 2.5134758355358355e-05, + "loss": 0.1883, + "step": 4196 + }, + { + "epoch": 0.4349673541299617, + "grad_norm": 0.5743508338928223, + "learning_rate": 2.5128269574255653e-05, + "loss": 0.2515, + "step": 4197 + }, + { + "epoch": 0.43507099181262304, + "grad_norm": 0.5212010741233826, + "learning_rate": 2.5121780215307507e-05, + "loss": 0.1886, + "step": 4198 + }, + { + "epoch": 0.43517462949528446, + "grad_norm": 0.5839496850967407, + "learning_rate": 2.511529027924513e-05, + "loss": 0.2768, + "step": 4199 + }, + { + "epoch": 0.4352782671779459, + "grad_norm": 0.48613330721855164, + "learning_rate": 2.5108799766799794e-05, + "loss": 0.1711, + "step": 4200 + }, + { + "epoch": 0.4353819048606073, + "grad_norm": 0.47015008330345154, + "learning_rate": 2.5102308678702842e-05, + "loss": 0.2118, + "step": 4201 + }, + { + "epoch": 0.4354855425432687, + "grad_norm": 0.45157095789909363, + "learning_rate": 2.5095817015685686e-05, + "loss": 0.1939, + "step": 4202 + }, + { + "epoch": 0.43558918022593013, + "grad_norm": 0.5309513807296753, + "learning_rate": 2.508932477847978e-05, + "loss": 0.2386, + "step": 4203 + }, + { + "epoch": 0.43569281790859155, + "grad_norm": 0.5956526398658752, + "learning_rate": 2.5082831967816676e-05, + "loss": 0.2582, + "step": 4204 + }, + { + "epoch": 0.43579645559125296, + "grad_norm": 0.47897085547447205, + "learning_rate": 2.5076338584427963e-05, + "loss": 0.1902, + "step": 4205 + }, + { + "epoch": 0.4359000932739144, + "grad_norm": 0.4405311346054077, + "learning_rate": 2.5069844629045314e-05, + "loss": 0.1887, + "step": 4206 + }, + { + "epoch": 0.4360037309565758, + "grad_norm": 0.5245810747146606, + "learning_rate": 2.5063350102400454e-05, + "loss": 0.2111, + "step": 4207 + }, + { + "epoch": 0.4361073686392372, + "grad_norm": 0.4980141222476959, + "learning_rate": 2.505685500522517e-05, + "loss": 0.1955, + "step": 4208 + }, + { + "epoch": 0.43621100632189863, + "grad_norm": 0.5519107580184937, + "learning_rate": 2.505035933825133e-05, + "loss": 0.2498, + "step": 4209 + }, + { + "epoch": 0.43631464400456005, + "grad_norm": 0.5352892875671387, + "learning_rate": 2.5043863102210854e-05, + "loss": 0.2226, + "step": 4210 + }, + { + "epoch": 0.43641828168722147, + "grad_norm": 0.4760989546775818, + "learning_rate": 2.5037366297835716e-05, + "loss": 0.2011, + "step": 4211 + }, + { + "epoch": 0.4365219193698829, + "grad_norm": 0.48700448870658875, + "learning_rate": 2.5030868925857976e-05, + "loss": 0.1922, + "step": 4212 + }, + { + "epoch": 0.4366255570525443, + "grad_norm": 0.5479152202606201, + "learning_rate": 2.5024370987009748e-05, + "loss": 0.2453, + "step": 4213 + }, + { + "epoch": 0.4367291947352057, + "grad_norm": 0.4179893136024475, + "learning_rate": 2.5017872482023208e-05, + "loss": 0.1989, + "step": 4214 + }, + { + "epoch": 0.43683283241786713, + "grad_norm": 0.5843135714530945, + "learning_rate": 2.5011373411630598e-05, + "loss": 0.2674, + "step": 4215 + }, + { + "epoch": 0.43693647010052855, + "grad_norm": 0.5585675239562988, + "learning_rate": 2.500487377656422e-05, + "loss": 0.2292, + "step": 4216 + }, + { + "epoch": 0.43704010778318997, + "grad_norm": 0.508930504322052, + "learning_rate": 2.4998373577556446e-05, + "loss": 0.2291, + "step": 4217 + }, + { + "epoch": 0.4371437454658514, + "grad_norm": 0.5432911515235901, + "learning_rate": 2.4991872815339706e-05, + "loss": 0.2018, + "step": 4218 + }, + { + "epoch": 0.4372473831485128, + "grad_norm": 0.5012953877449036, + "learning_rate": 2.4985371490646505e-05, + "loss": 0.1923, + "step": 4219 + }, + { + "epoch": 0.4373510208311742, + "grad_norm": 0.4885612726211548, + "learning_rate": 2.4978869604209385e-05, + "loss": 0.1867, + "step": 4220 + }, + { + "epoch": 0.43745465851383564, + "grad_norm": 0.5111250877380371, + "learning_rate": 2.4972367156760982e-05, + "loss": 0.2209, + "step": 4221 + }, + { + "epoch": 0.43755829619649705, + "grad_norm": 0.49524879455566406, + "learning_rate": 2.4965864149033972e-05, + "loss": 0.2193, + "step": 4222 + }, + { + "epoch": 0.43766193387915847, + "grad_norm": 0.5377824306488037, + "learning_rate": 2.4959360581761118e-05, + "loss": 0.2349, + "step": 4223 + }, + { + "epoch": 0.4377655715618199, + "grad_norm": 0.44770196080207825, + "learning_rate": 2.4952856455675214e-05, + "loss": 0.1717, + "step": 4224 + }, + { + "epoch": 0.4378692092444813, + "grad_norm": 0.6824713349342346, + "learning_rate": 2.4946351771509153e-05, + "loss": 0.2685, + "step": 4225 + }, + { + "epoch": 0.4379728469271427, + "grad_norm": 0.533047080039978, + "learning_rate": 2.4939846529995858e-05, + "loss": 0.2252, + "step": 4226 + }, + { + "epoch": 0.43807648460980414, + "grad_norm": 0.5649407505989075, + "learning_rate": 2.4933340731868342e-05, + "loss": 0.2244, + "step": 4227 + }, + { + "epoch": 0.43818012229246556, + "grad_norm": 0.5743054151535034, + "learning_rate": 2.4926834377859646e-05, + "loss": 0.2235, + "step": 4228 + }, + { + "epoch": 0.438283759975127, + "grad_norm": 0.5511937141418457, + "learning_rate": 2.4920327468702927e-05, + "loss": 0.2072, + "step": 4229 + }, + { + "epoch": 0.4383873976577884, + "grad_norm": 0.5833871364593506, + "learning_rate": 2.4913820005131353e-05, + "loss": 0.2507, + "step": 4230 + }, + { + "epoch": 0.4384910353404498, + "grad_norm": 0.49012407660484314, + "learning_rate": 2.4907311987878177e-05, + "loss": 0.2389, + "step": 4231 + }, + { + "epoch": 0.4385946730231112, + "grad_norm": 0.5859277844429016, + "learning_rate": 2.4900803417676715e-05, + "loss": 0.258, + "step": 4232 + }, + { + "epoch": 0.43869831070577264, + "grad_norm": 0.55899977684021, + "learning_rate": 2.4894294295260344e-05, + "loss": 0.2351, + "step": 4233 + }, + { + "epoch": 0.43880194838843406, + "grad_norm": 0.4472266733646393, + "learning_rate": 2.4887784621362498e-05, + "loss": 0.1898, + "step": 4234 + }, + { + "epoch": 0.4389055860710955, + "grad_norm": 0.4899350702762604, + "learning_rate": 2.4881274396716687e-05, + "loss": 0.2168, + "step": 4235 + }, + { + "epoch": 0.43900922375375684, + "grad_norm": 0.4696301221847534, + "learning_rate": 2.4874763622056453e-05, + "loss": 0.1883, + "step": 4236 + }, + { + "epoch": 0.43911286143641826, + "grad_norm": 0.45876142382621765, + "learning_rate": 2.4868252298115437e-05, + "loss": 0.1895, + "step": 4237 + }, + { + "epoch": 0.4392164991190797, + "grad_norm": 0.6082130074501038, + "learning_rate": 2.4861740425627323e-05, + "loss": 0.2643, + "step": 4238 + }, + { + "epoch": 0.4393201368017411, + "grad_norm": 0.5665858387947083, + "learning_rate": 2.4855228005325854e-05, + "loss": 0.2428, + "step": 4239 + }, + { + "epoch": 0.4394237744844025, + "grad_norm": 0.45706334710121155, + "learning_rate": 2.4848715037944836e-05, + "loss": 0.1914, + "step": 4240 + }, + { + "epoch": 0.4395274121670639, + "grad_norm": 0.4469106197357178, + "learning_rate": 2.484220152421815e-05, + "loss": 0.1838, + "step": 4241 + }, + { + "epoch": 0.43963104984972534, + "grad_norm": 0.5052451491355896, + "learning_rate": 2.483568746487972e-05, + "loss": 0.2364, + "step": 4242 + }, + { + "epoch": 0.43973468753238676, + "grad_norm": 0.5797638893127441, + "learning_rate": 2.482917286066355e-05, + "loss": 0.2248, + "step": 4243 + }, + { + "epoch": 0.4398383252150482, + "grad_norm": 0.5379804968833923, + "learning_rate": 2.482265771230368e-05, + "loss": 0.192, + "step": 4244 + }, + { + "epoch": 0.4399419628977096, + "grad_norm": 0.5726395845413208, + "learning_rate": 2.481614202053425e-05, + "loss": 0.2318, + "step": 4245 + }, + { + "epoch": 0.440045600580371, + "grad_norm": 0.5437195301055908, + "learning_rate": 2.4809625786089413e-05, + "loss": 0.2094, + "step": 4246 + }, + { + "epoch": 0.4401492382630324, + "grad_norm": 0.48460930585861206, + "learning_rate": 2.4803109009703417e-05, + "loss": 0.1963, + "step": 4247 + }, + { + "epoch": 0.44025287594569384, + "grad_norm": 0.5928154587745667, + "learning_rate": 2.479659169211057e-05, + "loss": 0.2648, + "step": 4248 + }, + { + "epoch": 0.44035651362835526, + "grad_norm": 0.5197213292121887, + "learning_rate": 2.4790073834045226e-05, + "loss": 0.2283, + "step": 4249 + }, + { + "epoch": 0.4404601513110167, + "grad_norm": 0.5223854780197144, + "learning_rate": 2.478355543624181e-05, + "loss": 0.199, + "step": 4250 + }, + { + "epoch": 0.4405637889936781, + "grad_norm": 0.5380105376243591, + "learning_rate": 2.4777036499434805e-05, + "loss": 0.2177, + "step": 4251 + }, + { + "epoch": 0.4406674266763395, + "grad_norm": 0.44573792815208435, + "learning_rate": 2.477051702435875e-05, + "loss": 0.1869, + "step": 4252 + }, + { + "epoch": 0.44077106435900093, + "grad_norm": 0.5541878938674927, + "learning_rate": 2.4763997011748253e-05, + "loss": 0.1807, + "step": 4253 + }, + { + "epoch": 0.44087470204166235, + "grad_norm": 0.43598732352256775, + "learning_rate": 2.4757476462337985e-05, + "loss": 0.1723, + "step": 4254 + }, + { + "epoch": 0.44097833972432376, + "grad_norm": 0.5411472320556641, + "learning_rate": 2.4750955376862655e-05, + "loss": 0.2145, + "step": 4255 + }, + { + "epoch": 0.4410819774069852, + "grad_norm": 0.43212613463401794, + "learning_rate": 2.4744433756057062e-05, + "loss": 0.1869, + "step": 4256 + }, + { + "epoch": 0.4411856150896466, + "grad_norm": 0.560360312461853, + "learning_rate": 2.473791160065605e-05, + "loss": 0.2129, + "step": 4257 + }, + { + "epoch": 0.441289252772308, + "grad_norm": 0.5539674162864685, + "learning_rate": 2.473138891139452e-05, + "loss": 0.2323, + "step": 4258 + }, + { + "epoch": 0.44139289045496943, + "grad_norm": 0.5044907927513123, + "learning_rate": 2.472486568900745e-05, + "loss": 0.205, + "step": 4259 + }, + { + "epoch": 0.44149652813763085, + "grad_norm": 0.4743616282939911, + "learning_rate": 2.4718341934229852e-05, + "loss": 0.1676, + "step": 4260 + }, + { + "epoch": 0.44160016582029227, + "grad_norm": 0.47938790917396545, + "learning_rate": 2.4711817647796828e-05, + "loss": 0.1853, + "step": 4261 + }, + { + "epoch": 0.4417038035029537, + "grad_norm": 0.5426909327507019, + "learning_rate": 2.470529283044351e-05, + "loss": 0.2362, + "step": 4262 + }, + { + "epoch": 0.4418074411856151, + "grad_norm": 0.5705090165138245, + "learning_rate": 2.469876748290511e-05, + "loss": 0.2277, + "step": 4263 + }, + { + "epoch": 0.4419110788682765, + "grad_norm": 0.492149293422699, + "learning_rate": 2.4692241605916897e-05, + "loss": 0.2039, + "step": 4264 + }, + { + "epoch": 0.44201471655093794, + "grad_norm": 0.5022493004798889, + "learning_rate": 2.468571520021419e-05, + "loss": 0.193, + "step": 4265 + }, + { + "epoch": 0.44211835423359935, + "grad_norm": 0.5307921171188354, + "learning_rate": 2.467918826653238e-05, + "loss": 0.2117, + "step": 4266 + }, + { + "epoch": 0.44222199191626077, + "grad_norm": 0.5519681572914124, + "learning_rate": 2.4672660805606913e-05, + "loss": 0.2201, + "step": 4267 + }, + { + "epoch": 0.4423256295989222, + "grad_norm": 0.5887792706489563, + "learning_rate": 2.466613281817329e-05, + "loss": 0.2721, + "step": 4268 + }, + { + "epoch": 0.4424292672815836, + "grad_norm": 0.596578061580658, + "learning_rate": 2.4659604304967068e-05, + "loss": 0.2425, + "step": 4269 + }, + { + "epoch": 0.442532904964245, + "grad_norm": 0.5199716091156006, + "learning_rate": 2.4653075266723886e-05, + "loss": 0.1963, + "step": 4270 + }, + { + "epoch": 0.44263654264690644, + "grad_norm": 0.46791988611221313, + "learning_rate": 2.4646545704179413e-05, + "loss": 0.1851, + "step": 4271 + }, + { + "epoch": 0.44274018032956786, + "grad_norm": 0.4838850796222687, + "learning_rate": 2.4640015618069386e-05, + "loss": 0.2125, + "step": 4272 + }, + { + "epoch": 0.4428438180122293, + "grad_norm": 0.5285201072692871, + "learning_rate": 2.4633485009129622e-05, + "loss": 0.2239, + "step": 4273 + }, + { + "epoch": 0.44294745569489063, + "grad_norm": 0.5395921468734741, + "learning_rate": 2.4626953878095968e-05, + "loss": 0.2252, + "step": 4274 + }, + { + "epoch": 0.44305109337755205, + "grad_norm": 0.5135499238967896, + "learning_rate": 2.4620422225704342e-05, + "loss": 0.2168, + "step": 4275 + }, + { + "epoch": 0.44315473106021347, + "grad_norm": 0.5286493897438049, + "learning_rate": 2.4613890052690722e-05, + "loss": 0.2413, + "step": 4276 + }, + { + "epoch": 0.4432583687428749, + "grad_norm": 0.5408833622932434, + "learning_rate": 2.4607357359791146e-05, + "loss": 0.2342, + "step": 4277 + }, + { + "epoch": 0.4433620064255363, + "grad_norm": 0.620570719242096, + "learning_rate": 2.4600824147741698e-05, + "loss": 0.2632, + "step": 4278 + }, + { + "epoch": 0.4434656441081977, + "grad_norm": 0.5322298407554626, + "learning_rate": 2.4594290417278542e-05, + "loss": 0.2383, + "step": 4279 + }, + { + "epoch": 0.44356928179085914, + "grad_norm": 0.43045082688331604, + "learning_rate": 2.458775616913789e-05, + "loss": 0.1629, + "step": 4280 + }, + { + "epoch": 0.44367291947352056, + "grad_norm": 0.6002359390258789, + "learning_rate": 2.4581221404055992e-05, + "loss": 0.2368, + "step": 4281 + }, + { + "epoch": 0.44377655715618197, + "grad_norm": 0.44602683186531067, + "learning_rate": 2.4574686122769195e-05, + "loss": 0.1859, + "step": 4282 + }, + { + "epoch": 0.4438801948388434, + "grad_norm": 0.5169301629066467, + "learning_rate": 2.4568150326013877e-05, + "loss": 0.1988, + "step": 4283 + }, + { + "epoch": 0.4439838325215048, + "grad_norm": 0.5686281323432922, + "learning_rate": 2.456161401452648e-05, + "loss": 0.2601, + "step": 4284 + }, + { + "epoch": 0.4440874702041662, + "grad_norm": 0.5369881391525269, + "learning_rate": 2.455507718904351e-05, + "loss": 0.227, + "step": 4285 + }, + { + "epoch": 0.44419110788682764, + "grad_norm": 0.5094032883644104, + "learning_rate": 2.4548539850301523e-05, + "loss": 0.2082, + "step": 4286 + }, + { + "epoch": 0.44429474556948906, + "grad_norm": 0.5631538033485413, + "learning_rate": 2.4542001999037125e-05, + "loss": 0.2334, + "step": 4287 + }, + { + "epoch": 0.4443983832521505, + "grad_norm": 0.4589706361293793, + "learning_rate": 2.4535463635987012e-05, + "loss": 0.1805, + "step": 4288 + }, + { + "epoch": 0.4445020209348119, + "grad_norm": 0.5049294233322144, + "learning_rate": 2.4528924761887915e-05, + "loss": 0.2093, + "step": 4289 + }, + { + "epoch": 0.4446056586174733, + "grad_norm": 0.5298270583152771, + "learning_rate": 2.4522385377476607e-05, + "loss": 0.2439, + "step": 4290 + }, + { + "epoch": 0.4447092963001347, + "grad_norm": 0.49855804443359375, + "learning_rate": 2.4515845483489943e-05, + "loss": 0.2057, + "step": 4291 + }, + { + "epoch": 0.44481293398279614, + "grad_norm": 0.4899182617664337, + "learning_rate": 2.4509305080664834e-05, + "loss": 0.1818, + "step": 4292 + }, + { + "epoch": 0.44491657166545756, + "grad_norm": 0.45728784799575806, + "learning_rate": 2.4502764169738237e-05, + "loss": 0.1793, + "step": 4293 + }, + { + "epoch": 0.445020209348119, + "grad_norm": 0.534850001335144, + "learning_rate": 2.449622275144717e-05, + "loss": 0.2487, + "step": 4294 + }, + { + "epoch": 0.4451238470307804, + "grad_norm": 0.5148128867149353, + "learning_rate": 2.448968082652872e-05, + "loss": 0.2385, + "step": 4295 + }, + { + "epoch": 0.4452274847134418, + "grad_norm": 0.5814290642738342, + "learning_rate": 2.4483138395720013e-05, + "loss": 0.2136, + "step": 4296 + }, + { + "epoch": 0.44533112239610323, + "grad_norm": 0.46989554166793823, + "learning_rate": 2.4476595459758234e-05, + "loss": 0.1806, + "step": 4297 + }, + { + "epoch": 0.44543476007876465, + "grad_norm": 0.5008494853973389, + "learning_rate": 2.4470052019380646e-05, + "loss": 0.1966, + "step": 4298 + }, + { + "epoch": 0.44553839776142606, + "grad_norm": 0.5500649809837341, + "learning_rate": 2.446350807532454e-05, + "loss": 0.259, + "step": 4299 + }, + { + "epoch": 0.4456420354440875, + "grad_norm": 0.46479493379592896, + "learning_rate": 2.4456963628327284e-05, + "loss": 0.2102, + "step": 4300 + }, + { + "epoch": 0.4457456731267489, + "grad_norm": 0.4430640935897827, + "learning_rate": 2.445041867912629e-05, + "loss": 0.1848, + "step": 4301 + }, + { + "epoch": 0.4458493108094103, + "grad_norm": 0.48857590556144714, + "learning_rate": 2.4443873228459044e-05, + "loss": 0.2187, + "step": 4302 + }, + { + "epoch": 0.44595294849207173, + "grad_norm": 0.5656534433364868, + "learning_rate": 2.443732727706307e-05, + "loss": 0.2404, + "step": 4303 + }, + { + "epoch": 0.44605658617473315, + "grad_norm": 0.5321723818778992, + "learning_rate": 2.4430780825675952e-05, + "loss": 0.235, + "step": 4304 + }, + { + "epoch": 0.44616022385739457, + "grad_norm": 0.5956742763519287, + "learning_rate": 2.4424233875035344e-05, + "loss": 0.2373, + "step": 4305 + }, + { + "epoch": 0.446263861540056, + "grad_norm": 0.43731212615966797, + "learning_rate": 2.441768642587894e-05, + "loss": 0.1862, + "step": 4306 + }, + { + "epoch": 0.4463674992227174, + "grad_norm": 0.45064225792884827, + "learning_rate": 2.4411138478944488e-05, + "loss": 0.2012, + "step": 4307 + }, + { + "epoch": 0.4464711369053788, + "grad_norm": 0.5171825885772705, + "learning_rate": 2.4404590034969822e-05, + "loss": 0.2055, + "step": 4308 + }, + { + "epoch": 0.44657477458804024, + "grad_norm": 0.5186307430267334, + "learning_rate": 2.439804109469279e-05, + "loss": 0.2207, + "step": 4309 + }, + { + "epoch": 0.44667841227070165, + "grad_norm": 0.4361405074596405, + "learning_rate": 2.4391491658851324e-05, + "loss": 0.1691, + "step": 4310 + }, + { + "epoch": 0.44678204995336307, + "grad_norm": 0.5209653973579407, + "learning_rate": 2.4384941728183406e-05, + "loss": 0.2176, + "step": 4311 + }, + { + "epoch": 0.44688568763602443, + "grad_norm": 0.49713316559791565, + "learning_rate": 2.4378391303427072e-05, + "loss": 0.1889, + "step": 4312 + }, + { + "epoch": 0.44698932531868585, + "grad_norm": 0.46346545219421387, + "learning_rate": 2.4371840385320413e-05, + "loss": 0.1594, + "step": 4313 + }, + { + "epoch": 0.44709296300134727, + "grad_norm": 0.4363242983818054, + "learning_rate": 2.436528897460158e-05, + "loss": 0.181, + "step": 4314 + }, + { + "epoch": 0.4471966006840087, + "grad_norm": 0.5040505528450012, + "learning_rate": 2.4358737072008763e-05, + "loss": 0.2105, + "step": 4315 + }, + { + "epoch": 0.4473002383666701, + "grad_norm": 0.5211902856826782, + "learning_rate": 2.435218467828023e-05, + "loss": 0.1993, + "step": 4316 + }, + { + "epoch": 0.4474038760493315, + "grad_norm": 0.5485498309135437, + "learning_rate": 2.4345631794154297e-05, + "loss": 0.2324, + "step": 4317 + }, + { + "epoch": 0.44750751373199293, + "grad_norm": 0.4832819700241089, + "learning_rate": 2.4339078420369325e-05, + "loss": 0.1938, + "step": 4318 + }, + { + "epoch": 0.44761115141465435, + "grad_norm": 0.4788733720779419, + "learning_rate": 2.433252455766374e-05, + "loss": 0.1766, + "step": 4319 + }, + { + "epoch": 0.44771478909731577, + "grad_norm": 0.5498034358024597, + "learning_rate": 2.4325970206776028e-05, + "loss": 0.2261, + "step": 4320 + }, + { + "epoch": 0.4478184267799772, + "grad_norm": 0.5819903016090393, + "learning_rate": 2.431941536844472e-05, + "loss": 0.2342, + "step": 4321 + }, + { + "epoch": 0.4479220644626386, + "grad_norm": 0.41093122959136963, + "learning_rate": 2.431286004340839e-05, + "loss": 0.1647, + "step": 4322 + }, + { + "epoch": 0.4480257021453, + "grad_norm": 0.5445446968078613, + "learning_rate": 2.4306304232405707e-05, + "loss": 0.2403, + "step": 4323 + }, + { + "epoch": 0.44812933982796144, + "grad_norm": 0.4773034453392029, + "learning_rate": 2.4299747936175354e-05, + "loss": 0.1937, + "step": 4324 + }, + { + "epoch": 0.44823297751062285, + "grad_norm": 0.6398991942405701, + "learning_rate": 2.4293191155456087e-05, + "loss": 0.2661, + "step": 4325 + }, + { + "epoch": 0.44833661519328427, + "grad_norm": 0.46146780252456665, + "learning_rate": 2.428663389098672e-05, + "loss": 0.1862, + "step": 4326 + }, + { + "epoch": 0.4484402528759457, + "grad_norm": 0.5381836891174316, + "learning_rate": 2.4280076143506103e-05, + "loss": 0.2242, + "step": 4327 + }, + { + "epoch": 0.4485438905586071, + "grad_norm": 0.5368882417678833, + "learning_rate": 2.427351791375316e-05, + "loss": 0.2284, + "step": 4328 + }, + { + "epoch": 0.4486475282412685, + "grad_norm": 0.5917689204216003, + "learning_rate": 2.4266959202466862e-05, + "loss": 0.2463, + "step": 4329 + }, + { + "epoch": 0.44875116592392994, + "grad_norm": 0.5041471123695374, + "learning_rate": 2.426040001038624e-05, + "loss": 0.2225, + "step": 4330 + }, + { + "epoch": 0.44885480360659136, + "grad_norm": 0.5714327096939087, + "learning_rate": 2.4253840338250364e-05, + "loss": 0.2419, + "step": 4331 + }, + { + "epoch": 0.4489584412892528, + "grad_norm": 0.5183489918708801, + "learning_rate": 2.4247280186798364e-05, + "loss": 0.2156, + "step": 4332 + }, + { + "epoch": 0.4490620789719142, + "grad_norm": 0.68091881275177, + "learning_rate": 2.4240719556769446e-05, + "loss": 0.2074, + "step": 4333 + }, + { + "epoch": 0.4491657166545756, + "grad_norm": 0.4969347417354584, + "learning_rate": 2.4234158448902835e-05, + "loss": 0.2056, + "step": 4334 + }, + { + "epoch": 0.449269354337237, + "grad_norm": 0.512744128704071, + "learning_rate": 2.4227596863937835e-05, + "loss": 0.1999, + "step": 4335 + }, + { + "epoch": 0.44937299201989844, + "grad_norm": 0.49277830123901367, + "learning_rate": 2.422103480261379e-05, + "loss": 0.1888, + "step": 4336 + }, + { + "epoch": 0.44947662970255986, + "grad_norm": 0.5266940593719482, + "learning_rate": 2.4214472265670105e-05, + "loss": 0.2017, + "step": 4337 + }, + { + "epoch": 0.4495802673852213, + "grad_norm": 0.5924473404884338, + "learning_rate": 2.420790925384624e-05, + "loss": 0.2185, + "step": 4338 + }, + { + "epoch": 0.4496839050678827, + "grad_norm": 0.505537748336792, + "learning_rate": 2.4201345767881697e-05, + "loss": 0.1918, + "step": 4339 + }, + { + "epoch": 0.4497875427505441, + "grad_norm": 0.5462311506271362, + "learning_rate": 2.4194781808516047e-05, + "loss": 0.2503, + "step": 4340 + }, + { + "epoch": 0.44989118043320553, + "grad_norm": 0.5938394665718079, + "learning_rate": 2.41882173764889e-05, + "loss": 0.2578, + "step": 4341 + }, + { + "epoch": 0.44999481811586695, + "grad_norm": 0.5128920674324036, + "learning_rate": 2.4181652472539937e-05, + "loss": 0.2014, + "step": 4342 + }, + { + "epoch": 0.45009845579852836, + "grad_norm": 0.5374717712402344, + "learning_rate": 2.417508709740887e-05, + "loss": 0.2188, + "step": 4343 + }, + { + "epoch": 0.4502020934811898, + "grad_norm": 0.5328230261802673, + "learning_rate": 2.4168521251835477e-05, + "loss": 0.2324, + "step": 4344 + }, + { + "epoch": 0.4503057311638512, + "grad_norm": 0.4726838171482086, + "learning_rate": 2.416195493655959e-05, + "loss": 0.1806, + "step": 4345 + }, + { + "epoch": 0.4504093688465126, + "grad_norm": 0.4431437849998474, + "learning_rate": 2.4155388152321094e-05, + "loss": 0.1856, + "step": 4346 + }, + { + "epoch": 0.45051300652917403, + "grad_norm": 0.4972296357154846, + "learning_rate": 2.414882089985992e-05, + "loss": 0.1903, + "step": 4347 + }, + { + "epoch": 0.45061664421183545, + "grad_norm": 0.45771318674087524, + "learning_rate": 2.414225317991605e-05, + "loss": 0.1796, + "step": 4348 + }, + { + "epoch": 0.45072028189449687, + "grad_norm": 0.4780783951282501, + "learning_rate": 2.4135684993229546e-05, + "loss": 0.1904, + "step": 4349 + }, + { + "epoch": 0.4508239195771582, + "grad_norm": 0.4782392382621765, + "learning_rate": 2.4129116340540472e-05, + "loss": 0.2062, + "step": 4350 + }, + { + "epoch": 0.45092755725981964, + "grad_norm": 0.4502820670604706, + "learning_rate": 2.4122547222588986e-05, + "loss": 0.192, + "step": 4351 + }, + { + "epoch": 0.45103119494248106, + "grad_norm": 0.5763320922851562, + "learning_rate": 2.41159776401153e-05, + "loss": 0.2177, + "step": 4352 + }, + { + "epoch": 0.4511348326251425, + "grad_norm": 0.5172169804573059, + "learning_rate": 2.410940759385964e-05, + "loss": 0.2476, + "step": 4353 + }, + { + "epoch": 0.4512384703078039, + "grad_norm": 0.48860692977905273, + "learning_rate": 2.410283708456233e-05, + "loss": 0.1816, + "step": 4354 + }, + { + "epoch": 0.4513421079904653, + "grad_norm": 0.4798571765422821, + "learning_rate": 2.4096266112963707e-05, + "loss": 0.2029, + "step": 4355 + }, + { + "epoch": 0.45144574567312673, + "grad_norm": 0.5416128039360046, + "learning_rate": 2.408969467980419e-05, + "loss": 0.2245, + "step": 4356 + }, + { + "epoch": 0.45154938335578815, + "grad_norm": 0.4884701371192932, + "learning_rate": 2.4083122785824236e-05, + "loss": 0.1841, + "step": 4357 + }, + { + "epoch": 0.45165302103844956, + "grad_norm": 0.5009334683418274, + "learning_rate": 2.407655043176435e-05, + "loss": 0.2266, + "step": 4358 + }, + { + "epoch": 0.451756658721111, + "grad_norm": 0.49989357590675354, + "learning_rate": 2.4069977618365106e-05, + "loss": 0.1884, + "step": 4359 + }, + { + "epoch": 0.4518602964037724, + "grad_norm": 0.5729777216911316, + "learning_rate": 2.4063404346367102e-05, + "loss": 0.2186, + "step": 4360 + }, + { + "epoch": 0.4519639340864338, + "grad_norm": 0.5348790287971497, + "learning_rate": 2.4056830616511015e-05, + "loss": 0.2289, + "step": 4361 + }, + { + "epoch": 0.45206757176909523, + "grad_norm": 0.53462153673172, + "learning_rate": 2.4050256429537565e-05, + "loss": 0.2166, + "step": 4362 + }, + { + "epoch": 0.45217120945175665, + "grad_norm": 0.5014505386352539, + "learning_rate": 2.404368178618751e-05, + "loss": 0.1993, + "step": 4363 + }, + { + "epoch": 0.45227484713441807, + "grad_norm": 0.5104119181632996, + "learning_rate": 2.4037106687201683e-05, + "loss": 0.2259, + "step": 4364 + }, + { + "epoch": 0.4523784848170795, + "grad_norm": 0.5958729386329651, + "learning_rate": 2.4030531133320947e-05, + "loss": 0.2364, + "step": 4365 + }, + { + "epoch": 0.4524821224997409, + "grad_norm": 0.5871717929840088, + "learning_rate": 2.4023955125286228e-05, + "loss": 0.2327, + "step": 4366 + }, + { + "epoch": 0.4525857601824023, + "grad_norm": 0.4634658396244049, + "learning_rate": 2.40173786638385e-05, + "loss": 0.1715, + "step": 4367 + }, + { + "epoch": 0.45268939786506374, + "grad_norm": 0.5972458124160767, + "learning_rate": 2.40108017497188e-05, + "loss": 0.2327, + "step": 4368 + }, + { + "epoch": 0.45279303554772515, + "grad_norm": 0.5323976874351501, + "learning_rate": 2.4004224383668183e-05, + "loss": 0.2112, + "step": 4369 + }, + { + "epoch": 0.45289667323038657, + "grad_norm": 0.47007569670677185, + "learning_rate": 2.399764656642779e-05, + "loss": 0.2046, + "step": 4370 + }, + { + "epoch": 0.453000310913048, + "grad_norm": 0.5574849843978882, + "learning_rate": 2.3991068298738794e-05, + "loss": 0.2419, + "step": 4371 + }, + { + "epoch": 0.4531039485957094, + "grad_norm": 0.5175203680992126, + "learning_rate": 2.398448958134243e-05, + "loss": 0.2094, + "step": 4372 + }, + { + "epoch": 0.4532075862783708, + "grad_norm": 0.476266473531723, + "learning_rate": 2.397791041497997e-05, + "loss": 0.2042, + "step": 4373 + }, + { + "epoch": 0.45331122396103224, + "grad_norm": 0.41861820220947266, + "learning_rate": 2.3971330800392753e-05, + "loss": 0.1562, + "step": 4374 + }, + { + "epoch": 0.45341486164369366, + "grad_norm": 0.5280885696411133, + "learning_rate": 2.3964750738322155e-05, + "loss": 0.1882, + "step": 4375 + }, + { + "epoch": 0.4535184993263551, + "grad_norm": 0.6168330311775208, + "learning_rate": 2.39581702295096e-05, + "loss": 0.2421, + "step": 4376 + }, + { + "epoch": 0.4536221370090165, + "grad_norm": 0.4856494963169098, + "learning_rate": 2.3951589274696586e-05, + "loss": 0.1916, + "step": 4377 + }, + { + "epoch": 0.4537257746916779, + "grad_norm": 0.5814061164855957, + "learning_rate": 2.394500787462463e-05, + "loss": 0.2429, + "step": 4378 + }, + { + "epoch": 0.4538294123743393, + "grad_norm": 0.4881944954395294, + "learning_rate": 2.393842603003532e-05, + "loss": 0.2185, + "step": 4379 + }, + { + "epoch": 0.45393305005700074, + "grad_norm": 0.515495240688324, + "learning_rate": 2.3931843741670283e-05, + "loss": 0.1946, + "step": 4380 + }, + { + "epoch": 0.45403668773966216, + "grad_norm": 0.5544822216033936, + "learning_rate": 2.3925261010271212e-05, + "loss": 0.2752, + "step": 4381 + }, + { + "epoch": 0.4541403254223236, + "grad_norm": 0.44878914952278137, + "learning_rate": 2.3918677836579828e-05, + "loss": 0.1865, + "step": 4382 + }, + { + "epoch": 0.454243963104985, + "grad_norm": 0.4595224857330322, + "learning_rate": 2.391209422133792e-05, + "loss": 0.1835, + "step": 4383 + }, + { + "epoch": 0.4543476007876464, + "grad_norm": 0.5050191283226013, + "learning_rate": 2.3905510165287317e-05, + "loss": 0.2167, + "step": 4384 + }, + { + "epoch": 0.45445123847030783, + "grad_norm": 0.497149258852005, + "learning_rate": 2.38989256691699e-05, + "loss": 0.1899, + "step": 4385 + }, + { + "epoch": 0.45455487615296924, + "grad_norm": 0.5135137438774109, + "learning_rate": 2.3892340733727594e-05, + "loss": 0.2113, + "step": 4386 + }, + { + "epoch": 0.45465851383563066, + "grad_norm": 0.546467661857605, + "learning_rate": 2.3885755359702395e-05, + "loss": 0.2358, + "step": 4387 + }, + { + "epoch": 0.454762151518292, + "grad_norm": 0.46558383107185364, + "learning_rate": 2.387916954783631e-05, + "loss": 0.1788, + "step": 4388 + }, + { + "epoch": 0.45486578920095344, + "grad_norm": 0.49247148633003235, + "learning_rate": 2.387258329887144e-05, + "loss": 0.1819, + "step": 4389 + }, + { + "epoch": 0.45496942688361486, + "grad_norm": 0.5415985584259033, + "learning_rate": 2.3865996613549905e-05, + "loss": 0.2116, + "step": 4390 + }, + { + "epoch": 0.4550730645662763, + "grad_norm": 0.5160948038101196, + "learning_rate": 2.3859409492613873e-05, + "loss": 0.2166, + "step": 4391 + }, + { + "epoch": 0.4551767022489377, + "grad_norm": 0.5398045182228088, + "learning_rate": 2.3852821936805582e-05, + "loss": 0.2496, + "step": 4392 + }, + { + "epoch": 0.4552803399315991, + "grad_norm": 0.46893683075904846, + "learning_rate": 2.384623394686731e-05, + "loss": 0.2064, + "step": 4393 + }, + { + "epoch": 0.4553839776142605, + "grad_norm": 0.5869868397712708, + "learning_rate": 2.3839645523541376e-05, + "loss": 0.2697, + "step": 4394 + }, + { + "epoch": 0.45548761529692194, + "grad_norm": 0.45383915305137634, + "learning_rate": 2.3833056667570146e-05, + "loss": 0.1766, + "step": 4395 + }, + { + "epoch": 0.45559125297958336, + "grad_norm": 0.48213571310043335, + "learning_rate": 2.382646737969605e-05, + "loss": 0.1922, + "step": 4396 + }, + { + "epoch": 0.4556948906622448, + "grad_norm": 0.5315578579902649, + "learning_rate": 2.381987766066156e-05, + "loss": 0.2304, + "step": 4397 + }, + { + "epoch": 0.4557985283449062, + "grad_norm": 0.5228009223937988, + "learning_rate": 2.3813287511209194e-05, + "loss": 0.2411, + "step": 4398 + }, + { + "epoch": 0.4559021660275676, + "grad_norm": 0.6200061440467834, + "learning_rate": 2.3806696932081516e-05, + "loss": 0.2588, + "step": 4399 + }, + { + "epoch": 0.45600580371022903, + "grad_norm": 0.5222203731536865, + "learning_rate": 2.3800105924021154e-05, + "loss": 0.2328, + "step": 4400 + }, + { + "epoch": 0.45610944139289045, + "grad_norm": 0.5595455765724182, + "learning_rate": 2.3793514487770753e-05, + "loss": 0.2555, + "step": 4401 + }, + { + "epoch": 0.45621307907555186, + "grad_norm": 0.4543464779853821, + "learning_rate": 2.378692262407304e-05, + "loss": 0.1815, + "step": 4402 + }, + { + "epoch": 0.4563167167582133, + "grad_norm": 0.522218644618988, + "learning_rate": 2.378033033367078e-05, + "loss": 0.2272, + "step": 4403 + }, + { + "epoch": 0.4564203544408747, + "grad_norm": 0.5159570574760437, + "learning_rate": 2.377373761730677e-05, + "loss": 0.2134, + "step": 4404 + }, + { + "epoch": 0.4565239921235361, + "grad_norm": 0.49490806460380554, + "learning_rate": 2.376714447572387e-05, + "loss": 0.2054, + "step": 4405 + }, + { + "epoch": 0.45662762980619753, + "grad_norm": 0.4948660433292389, + "learning_rate": 2.3760550909664987e-05, + "loss": 0.2206, + "step": 4406 + }, + { + "epoch": 0.45673126748885895, + "grad_norm": 0.4976104199886322, + "learning_rate": 2.3753956919873074e-05, + "loss": 0.1837, + "step": 4407 + }, + { + "epoch": 0.45683490517152037, + "grad_norm": 0.510616660118103, + "learning_rate": 2.3747362507091126e-05, + "loss": 0.2338, + "step": 4408 + }, + { + "epoch": 0.4569385428541818, + "grad_norm": 0.6493490934371948, + "learning_rate": 2.3740767672062206e-05, + "loss": 0.2459, + "step": 4409 + }, + { + "epoch": 0.4570421805368432, + "grad_norm": 0.533977746963501, + "learning_rate": 2.3734172415529394e-05, + "loss": 0.2021, + "step": 4410 + }, + { + "epoch": 0.4571458182195046, + "grad_norm": 0.5535910129547119, + "learning_rate": 2.3727576738235838e-05, + "loss": 0.2244, + "step": 4411 + }, + { + "epoch": 0.45724945590216604, + "grad_norm": 0.5072131752967834, + "learning_rate": 2.3720980640924733e-05, + "loss": 0.2006, + "step": 4412 + }, + { + "epoch": 0.45735309358482745, + "grad_norm": 0.5280940532684326, + "learning_rate": 2.371438412433931e-05, + "loss": 0.2347, + "step": 4413 + }, + { + "epoch": 0.45745673126748887, + "grad_norm": 0.5479831099510193, + "learning_rate": 2.370778718922286e-05, + "loss": 0.2098, + "step": 4414 + }, + { + "epoch": 0.4575603689501503, + "grad_norm": 0.47882604598999023, + "learning_rate": 2.3701189836318715e-05, + "loss": 0.184, + "step": 4415 + }, + { + "epoch": 0.4576640066328117, + "grad_norm": 0.5942421555519104, + "learning_rate": 2.369459206637025e-05, + "loss": 0.2398, + "step": 4416 + }, + { + "epoch": 0.4577676443154731, + "grad_norm": 0.483982652425766, + "learning_rate": 2.3687993880120895e-05, + "loss": 0.2383, + "step": 4417 + }, + { + "epoch": 0.45787128199813454, + "grad_norm": 0.5605577826499939, + "learning_rate": 2.3681395278314125e-05, + "loss": 0.2347, + "step": 4418 + }, + { + "epoch": 0.45797491968079596, + "grad_norm": 0.49620258808135986, + "learning_rate": 2.3674796261693456e-05, + "loss": 0.1894, + "step": 4419 + }, + { + "epoch": 0.4580785573634574, + "grad_norm": 0.5180416107177734, + "learning_rate": 2.366819683100246e-05, + "loss": 0.213, + "step": 4420 + }, + { + "epoch": 0.4581821950461188, + "grad_norm": 0.5405642986297607, + "learning_rate": 2.366159698698474e-05, + "loss": 0.1836, + "step": 4421 + }, + { + "epoch": 0.4582858327287802, + "grad_norm": 0.5185651183128357, + "learning_rate": 2.365499673038397e-05, + "loss": 0.2256, + "step": 4422 + }, + { + "epoch": 0.4583894704114416, + "grad_norm": 0.5593555569648743, + "learning_rate": 2.364839606194385e-05, + "loss": 0.2403, + "step": 4423 + }, + { + "epoch": 0.45849310809410304, + "grad_norm": 0.49372783303260803, + "learning_rate": 2.3641794982408133e-05, + "loss": 0.2145, + "step": 4424 + }, + { + "epoch": 0.45859674577676446, + "grad_norm": 0.4593977630138397, + "learning_rate": 2.3635193492520617e-05, + "loss": 0.1916, + "step": 4425 + }, + { + "epoch": 0.4587003834594258, + "grad_norm": 0.5412962436676025, + "learning_rate": 2.362859159302515e-05, + "loss": 0.2149, + "step": 4426 + }, + { + "epoch": 0.45880402114208724, + "grad_norm": 0.4642784595489502, + "learning_rate": 2.3621989284665617e-05, + "loss": 0.191, + "step": 4427 + }, + { + "epoch": 0.45890765882474865, + "grad_norm": 0.4705636203289032, + "learning_rate": 2.3615386568185973e-05, + "loss": 0.1854, + "step": 4428 + }, + { + "epoch": 0.45901129650741007, + "grad_norm": 0.5116401314735413, + "learning_rate": 2.3608783444330184e-05, + "loss": 0.2054, + "step": 4429 + }, + { + "epoch": 0.4591149341900715, + "grad_norm": 0.4768635332584381, + "learning_rate": 2.3602179913842286e-05, + "loss": 0.1735, + "step": 4430 + }, + { + "epoch": 0.4592185718727329, + "grad_norm": 0.524996280670166, + "learning_rate": 2.3595575977466355e-05, + "loss": 0.2116, + "step": 4431 + }, + { + "epoch": 0.4593222095553943, + "grad_norm": 0.5416904091835022, + "learning_rate": 2.3588971635946517e-05, + "loss": 0.2347, + "step": 4432 + }, + { + "epoch": 0.45942584723805574, + "grad_norm": 0.5637031197547913, + "learning_rate": 2.358236689002693e-05, + "loss": 0.2178, + "step": 4433 + }, + { + "epoch": 0.45952948492071716, + "grad_norm": 0.5045896172523499, + "learning_rate": 2.357576174045181e-05, + "loss": 0.2282, + "step": 4434 + }, + { + "epoch": 0.4596331226033786, + "grad_norm": 0.48037102818489075, + "learning_rate": 2.3569156187965418e-05, + "loss": 0.1931, + "step": 4435 + }, + { + "epoch": 0.45973676028604, + "grad_norm": 0.4950512945652008, + "learning_rate": 2.3562550233312054e-05, + "loss": 0.1801, + "step": 4436 + }, + { + "epoch": 0.4598403979687014, + "grad_norm": 0.5815441608428955, + "learning_rate": 2.355594387723607e-05, + "loss": 0.2448, + "step": 4437 + }, + { + "epoch": 0.4599440356513628, + "grad_norm": 0.5947690010070801, + "learning_rate": 2.3549337120481858e-05, + "loss": 0.2311, + "step": 4438 + }, + { + "epoch": 0.46004767333402424, + "grad_norm": 0.5021811723709106, + "learning_rate": 2.3542729963793854e-05, + "loss": 0.2134, + "step": 4439 + }, + { + "epoch": 0.46015131101668566, + "grad_norm": 0.5300816893577576, + "learning_rate": 2.353612240791655e-05, + "loss": 0.2169, + "step": 4440 + }, + { + "epoch": 0.4602549486993471, + "grad_norm": 0.5285887718200684, + "learning_rate": 2.3529514453594465e-05, + "loss": 0.2223, + "step": 4441 + }, + { + "epoch": 0.4603585863820085, + "grad_norm": 0.4838586747646332, + "learning_rate": 2.3522906101572174e-05, + "loss": 0.2023, + "step": 4442 + }, + { + "epoch": 0.4604622240646699, + "grad_norm": 0.4455113708972931, + "learning_rate": 2.351629735259431e-05, + "loss": 0.2087, + "step": 4443 + }, + { + "epoch": 0.46056586174733133, + "grad_norm": 0.6215282082557678, + "learning_rate": 2.350968820740552e-05, + "loss": 0.2205, + "step": 4444 + }, + { + "epoch": 0.46066949942999275, + "grad_norm": 0.5159878730773926, + "learning_rate": 2.3503078666750518e-05, + "loss": 0.2092, + "step": 4445 + }, + { + "epoch": 0.46077313711265416, + "grad_norm": 0.5394431948661804, + "learning_rate": 2.349646873137406e-05, + "loss": 0.2548, + "step": 4446 + }, + { + "epoch": 0.4608767747953156, + "grad_norm": 0.5611411333084106, + "learning_rate": 2.348985840202094e-05, + "loss": 0.2058, + "step": 4447 + }, + { + "epoch": 0.460980412477977, + "grad_norm": 0.5420775413513184, + "learning_rate": 2.3483247679436004e-05, + "loss": 0.2025, + "step": 4448 + }, + { + "epoch": 0.4610840501606384, + "grad_norm": 0.5555642247200012, + "learning_rate": 2.3476636564364128e-05, + "loss": 0.2105, + "step": 4449 + }, + { + "epoch": 0.46118768784329983, + "grad_norm": 0.48131993412971497, + "learning_rate": 2.3470025057550253e-05, + "loss": 0.1676, + "step": 4450 + }, + { + "epoch": 0.46129132552596125, + "grad_norm": 0.5748143196105957, + "learning_rate": 2.346341315973935e-05, + "loss": 0.2159, + "step": 4451 + }, + { + "epoch": 0.46139496320862267, + "grad_norm": 0.4542897343635559, + "learning_rate": 2.3456800871676428e-05, + "loss": 0.1623, + "step": 4452 + }, + { + "epoch": 0.4614986008912841, + "grad_norm": 0.532109260559082, + "learning_rate": 2.345018819410657e-05, + "loss": 0.228, + "step": 4453 + }, + { + "epoch": 0.4616022385739455, + "grad_norm": 0.49552032351493835, + "learning_rate": 2.344357512777486e-05, + "loss": 0.1974, + "step": 4454 + }, + { + "epoch": 0.4617058762566069, + "grad_norm": 0.5018864274024963, + "learning_rate": 2.3436961673426456e-05, + "loss": 0.2239, + "step": 4455 + }, + { + "epoch": 0.46180951393926833, + "grad_norm": 0.6501008868217468, + "learning_rate": 2.3430347831806565e-05, + "loss": 0.2513, + "step": 4456 + }, + { + "epoch": 0.46191315162192975, + "grad_norm": 0.5242273211479187, + "learning_rate": 2.3423733603660406e-05, + "loss": 0.196, + "step": 4457 + }, + { + "epoch": 0.46201678930459117, + "grad_norm": 0.4985031485557556, + "learning_rate": 2.3417118989733265e-05, + "loss": 0.2189, + "step": 4458 + }, + { + "epoch": 0.4621204269872526, + "grad_norm": 0.4776060879230499, + "learning_rate": 2.3410503990770468e-05, + "loss": 0.1906, + "step": 4459 + }, + { + "epoch": 0.462224064669914, + "grad_norm": 0.5783148407936096, + "learning_rate": 2.3403888607517385e-05, + "loss": 0.235, + "step": 4460 + }, + { + "epoch": 0.4623277023525754, + "grad_norm": 0.5404558777809143, + "learning_rate": 2.3397272840719425e-05, + "loss": 0.2144, + "step": 4461 + }, + { + "epoch": 0.46243134003523684, + "grad_norm": 0.5489241480827332, + "learning_rate": 2.339065669112204e-05, + "loss": 0.2439, + "step": 4462 + }, + { + "epoch": 0.46253497771789825, + "grad_norm": 0.5944549441337585, + "learning_rate": 2.3384040159470738e-05, + "loss": 0.2608, + "step": 4463 + }, + { + "epoch": 0.4626386154005596, + "grad_norm": 0.5333137512207031, + "learning_rate": 2.3377423246511047e-05, + "loss": 0.2113, + "step": 4464 + }, + { + "epoch": 0.46274225308322103, + "grad_norm": 0.5300441980361938, + "learning_rate": 2.3370805952988546e-05, + "loss": 0.2222, + "step": 4465 + }, + { + "epoch": 0.46284589076588245, + "grad_norm": 0.4752691984176636, + "learning_rate": 2.3364188279648886e-05, + "loss": 0.1833, + "step": 4466 + }, + { + "epoch": 0.46294952844854387, + "grad_norm": 0.5182095766067505, + "learning_rate": 2.3357570227237712e-05, + "loss": 0.2192, + "step": 4467 + }, + { + "epoch": 0.4630531661312053, + "grad_norm": 0.49979159235954285, + "learning_rate": 2.3350951796500744e-05, + "loss": 0.2226, + "step": 4468 + }, + { + "epoch": 0.4631568038138667, + "grad_norm": 0.5155590772628784, + "learning_rate": 2.334433298818374e-05, + "loss": 0.2073, + "step": 4469 + }, + { + "epoch": 0.4632604414965281, + "grad_norm": 0.5436052680015564, + "learning_rate": 2.3337713803032487e-05, + "loss": 0.1981, + "step": 4470 + }, + { + "epoch": 0.46336407917918954, + "grad_norm": 0.5005167126655579, + "learning_rate": 2.3331094241792834e-05, + "loss": 0.1916, + "step": 4471 + }, + { + "epoch": 0.46346771686185095, + "grad_norm": 0.5140410661697388, + "learning_rate": 2.3324474305210666e-05, + "loss": 0.1918, + "step": 4472 + }, + { + "epoch": 0.46357135454451237, + "grad_norm": 0.5105440616607666, + "learning_rate": 2.3317853994031897e-05, + "loss": 0.2205, + "step": 4473 + }, + { + "epoch": 0.4636749922271738, + "grad_norm": 0.47799333930015564, + "learning_rate": 2.3311233309002493e-05, + "loss": 0.2034, + "step": 4474 + }, + { + "epoch": 0.4637786299098352, + "grad_norm": 0.48624324798583984, + "learning_rate": 2.3304612250868472e-05, + "loss": 0.1824, + "step": 4475 + }, + { + "epoch": 0.4638822675924966, + "grad_norm": 0.4912174344062805, + "learning_rate": 2.329799082037588e-05, + "loss": 0.2126, + "step": 4476 + }, + { + "epoch": 0.46398590527515804, + "grad_norm": 0.5111244916915894, + "learning_rate": 2.329136901827081e-05, + "loss": 0.1873, + "step": 4477 + }, + { + "epoch": 0.46408954295781946, + "grad_norm": 0.5195016264915466, + "learning_rate": 2.3284746845299396e-05, + "loss": 0.2154, + "step": 4478 + }, + { + "epoch": 0.4641931806404809, + "grad_norm": 0.5187585949897766, + "learning_rate": 2.3278124302207812e-05, + "loss": 0.2148, + "step": 4479 + }, + { + "epoch": 0.4642968183231423, + "grad_norm": 0.5739063024520874, + "learning_rate": 2.3271501389742273e-05, + "loss": 0.2576, + "step": 4480 + }, + { + "epoch": 0.4644004560058037, + "grad_norm": 0.44460976123809814, + "learning_rate": 2.3264878108649046e-05, + "loss": 0.1872, + "step": 4481 + }, + { + "epoch": 0.4645040936884651, + "grad_norm": 0.5364735126495361, + "learning_rate": 2.3258254459674438e-05, + "loss": 0.222, + "step": 4482 + }, + { + "epoch": 0.46460773137112654, + "grad_norm": 0.5735208988189697, + "learning_rate": 2.3251630443564773e-05, + "loss": 0.2206, + "step": 4483 + }, + { + "epoch": 0.46471136905378796, + "grad_norm": 0.5087382197380066, + "learning_rate": 2.3245006061066446e-05, + "loss": 0.2137, + "step": 4484 + }, + { + "epoch": 0.4648150067364494, + "grad_norm": 0.5853786468505859, + "learning_rate": 2.323838131292588e-05, + "loss": 0.2365, + "step": 4485 + }, + { + "epoch": 0.4649186444191108, + "grad_norm": 0.5093985199928284, + "learning_rate": 2.323175619988954e-05, + "loss": 0.1947, + "step": 4486 + }, + { + "epoch": 0.4650222821017722, + "grad_norm": 0.5494992733001709, + "learning_rate": 2.322513072270394e-05, + "loss": 0.2159, + "step": 4487 + }, + { + "epoch": 0.46512591978443363, + "grad_norm": 0.4877338111400604, + "learning_rate": 2.3218504882115624e-05, + "loss": 0.1989, + "step": 4488 + }, + { + "epoch": 0.46522955746709505, + "grad_norm": 0.4691953659057617, + "learning_rate": 2.321187867887118e-05, + "loss": 0.167, + "step": 4489 + }, + { + "epoch": 0.46533319514975646, + "grad_norm": 0.5337307453155518, + "learning_rate": 2.3205252113717234e-05, + "loss": 0.2049, + "step": 4490 + }, + { + "epoch": 0.4654368328324179, + "grad_norm": 0.5148242712020874, + "learning_rate": 2.3198625187400473e-05, + "loss": 0.218, + "step": 4491 + }, + { + "epoch": 0.4655404705150793, + "grad_norm": 0.5255232453346252, + "learning_rate": 2.3191997900667588e-05, + "loss": 0.1783, + "step": 4492 + }, + { + "epoch": 0.4656441081977407, + "grad_norm": 0.6174560189247131, + "learning_rate": 2.3185370254265343e-05, + "loss": 0.2203, + "step": 4493 + }, + { + "epoch": 0.46574774588040213, + "grad_norm": 0.39398008584976196, + "learning_rate": 2.3178742248940534e-05, + "loss": 0.1436, + "step": 4494 + }, + { + "epoch": 0.46585138356306355, + "grad_norm": 0.5350342392921448, + "learning_rate": 2.317211388543999e-05, + "loss": 0.2203, + "step": 4495 + }, + { + "epoch": 0.46595502124572497, + "grad_norm": 0.5608664751052856, + "learning_rate": 2.3165485164510582e-05, + "loss": 0.1973, + "step": 4496 + }, + { + "epoch": 0.4660586589283864, + "grad_norm": 0.5720195770263672, + "learning_rate": 2.3158856086899223e-05, + "loss": 0.2351, + "step": 4497 + }, + { + "epoch": 0.4661622966110478, + "grad_norm": 0.42739763855934143, + "learning_rate": 2.315222665335288e-05, + "loss": 0.183, + "step": 4498 + }, + { + "epoch": 0.4662659342937092, + "grad_norm": 0.537451446056366, + "learning_rate": 2.3145596864618534e-05, + "loss": 0.2143, + "step": 4499 + }, + { + "epoch": 0.46636957197637063, + "grad_norm": 0.554932713508606, + "learning_rate": 2.3138966721443213e-05, + "loss": 0.2185, + "step": 4500 + }, + { + "epoch": 0.46647320965903205, + "grad_norm": 0.5975545048713684, + "learning_rate": 2.3132336224574015e-05, + "loss": 0.2352, + "step": 4501 + }, + { + "epoch": 0.4665768473416934, + "grad_norm": 0.604669451713562, + "learning_rate": 2.3125705374758037e-05, + "loss": 0.2296, + "step": 4502 + }, + { + "epoch": 0.46668048502435483, + "grad_norm": 0.493513286113739, + "learning_rate": 2.3119074172742435e-05, + "loss": 0.1878, + "step": 4503 + }, + { + "epoch": 0.46678412270701625, + "grad_norm": 0.5445199608802795, + "learning_rate": 2.3112442619274408e-05, + "loss": 0.2184, + "step": 4504 + }, + { + "epoch": 0.46688776038967766, + "grad_norm": 0.5227720737457275, + "learning_rate": 2.3105810715101175e-05, + "loss": 0.203, + "step": 4505 + }, + { + "epoch": 0.4669913980723391, + "grad_norm": 0.5246191024780273, + "learning_rate": 2.3099178460970025e-05, + "loss": 0.1861, + "step": 4506 + }, + { + "epoch": 0.4670950357550005, + "grad_norm": 0.5684474110603333, + "learning_rate": 2.3092545857628265e-05, + "loss": 0.2275, + "step": 4507 + }, + { + "epoch": 0.4671986734376619, + "grad_norm": 0.49848929047584534, + "learning_rate": 2.3085912905823246e-05, + "loss": 0.18, + "step": 4508 + }, + { + "epoch": 0.46730231112032333, + "grad_norm": 0.5276935696601868, + "learning_rate": 2.3079279606302355e-05, + "loss": 0.1834, + "step": 4509 + }, + { + "epoch": 0.46740594880298475, + "grad_norm": 0.5134925842285156, + "learning_rate": 2.3072645959813026e-05, + "loss": 0.1973, + "step": 4510 + }, + { + "epoch": 0.46750958648564617, + "grad_norm": 0.5260006785392761, + "learning_rate": 2.3066011967102723e-05, + "loss": 0.2145, + "step": 4511 + }, + { + "epoch": 0.4676132241683076, + "grad_norm": 0.511784553527832, + "learning_rate": 2.3059377628918963e-05, + "loss": 0.2012, + "step": 4512 + }, + { + "epoch": 0.467716861850969, + "grad_norm": 0.5256336331367493, + "learning_rate": 2.3052742946009284e-05, + "loss": 0.2149, + "step": 4513 + }, + { + "epoch": 0.4678204995336304, + "grad_norm": 0.5943604111671448, + "learning_rate": 2.3046107919121284e-05, + "loss": 0.2157, + "step": 4514 + }, + { + "epoch": 0.46792413721629184, + "grad_norm": 0.47138938307762146, + "learning_rate": 2.3039472549002567e-05, + "loss": 0.1846, + "step": 4515 + }, + { + "epoch": 0.46802777489895325, + "grad_norm": 0.4793859124183655, + "learning_rate": 2.3032836836400816e-05, + "loss": 0.1732, + "step": 4516 + }, + { + "epoch": 0.46813141258161467, + "grad_norm": 0.47348594665527344, + "learning_rate": 2.3026200782063724e-05, + "loss": 0.2121, + "step": 4517 + }, + { + "epoch": 0.4682350502642761, + "grad_norm": 0.49051177501678467, + "learning_rate": 2.301956438673903e-05, + "loss": 0.2176, + "step": 4518 + }, + { + "epoch": 0.4683386879469375, + "grad_norm": 0.49050796031951904, + "learning_rate": 2.301292765117451e-05, + "loss": 0.1984, + "step": 4519 + }, + { + "epoch": 0.4684423256295989, + "grad_norm": 0.48881155252456665, + "learning_rate": 2.3006290576117993e-05, + "loss": 0.1835, + "step": 4520 + }, + { + "epoch": 0.46854596331226034, + "grad_norm": 0.5071969628334045, + "learning_rate": 2.2999653162317324e-05, + "loss": 0.2173, + "step": 4521 + }, + { + "epoch": 0.46864960099492176, + "grad_norm": 0.5467783808708191, + "learning_rate": 2.29930154105204e-05, + "loss": 0.2094, + "step": 4522 + }, + { + "epoch": 0.4687532386775832, + "grad_norm": 0.4916337728500366, + "learning_rate": 2.298637732147516e-05, + "loss": 0.1885, + "step": 4523 + }, + { + "epoch": 0.4688568763602446, + "grad_norm": 0.5077255964279175, + "learning_rate": 2.2979738895929557e-05, + "loss": 0.2174, + "step": 4524 + }, + { + "epoch": 0.468960514042906, + "grad_norm": 0.5012960433959961, + "learning_rate": 2.2973100134631606e-05, + "loss": 0.2026, + "step": 4525 + }, + { + "epoch": 0.4690641517255674, + "grad_norm": 0.4907589256763458, + "learning_rate": 2.2966461038329363e-05, + "loss": 0.1988, + "step": 4526 + }, + { + "epoch": 0.46916778940822884, + "grad_norm": 0.4194316565990448, + "learning_rate": 2.295982160777089e-05, + "loss": 0.1829, + "step": 4527 + }, + { + "epoch": 0.46927142709089026, + "grad_norm": 0.5632020235061646, + "learning_rate": 2.295318184370433e-05, + "loss": 0.2158, + "step": 4528 + }, + { + "epoch": 0.4693750647735517, + "grad_norm": 0.6058095693588257, + "learning_rate": 2.294654174687782e-05, + "loss": 0.2683, + "step": 4529 + }, + { + "epoch": 0.4694787024562131, + "grad_norm": 0.5311865210533142, + "learning_rate": 2.2939901318039574e-05, + "loss": 0.2035, + "step": 4530 + }, + { + "epoch": 0.4695823401388745, + "grad_norm": 0.4692043960094452, + "learning_rate": 2.2933260557937817e-05, + "loss": 0.2069, + "step": 4531 + }, + { + "epoch": 0.4696859778215359, + "grad_norm": 0.5152689218521118, + "learning_rate": 2.292661946732082e-05, + "loss": 0.234, + "step": 4532 + }, + { + "epoch": 0.46978961550419734, + "grad_norm": 0.4731234312057495, + "learning_rate": 2.291997804693689e-05, + "loss": 0.1788, + "step": 4533 + }, + { + "epoch": 0.46989325318685876, + "grad_norm": 0.4742664694786072, + "learning_rate": 2.291333629753437e-05, + "loss": 0.1968, + "step": 4534 + }, + { + "epoch": 0.4699968908695202, + "grad_norm": 0.46158522367477417, + "learning_rate": 2.290669421986165e-05, + "loss": 0.1797, + "step": 4535 + }, + { + "epoch": 0.4701005285521816, + "grad_norm": 0.5073385834693909, + "learning_rate": 2.290005181466714e-05, + "loss": 0.217, + "step": 4536 + }, + { + "epoch": 0.470204166234843, + "grad_norm": 0.6219344735145569, + "learning_rate": 2.2893409082699304e-05, + "loss": 0.256, + "step": 4537 + }, + { + "epoch": 0.47030780391750443, + "grad_norm": 0.5234584212303162, + "learning_rate": 2.2886766024706626e-05, + "loss": 0.1764, + "step": 4538 + }, + { + "epoch": 0.47041144160016585, + "grad_norm": 0.4554639160633087, + "learning_rate": 2.2880122641437642e-05, + "loss": 0.1643, + "step": 4539 + }, + { + "epoch": 0.4705150792828272, + "grad_norm": 0.4849519729614258, + "learning_rate": 2.2873478933640918e-05, + "loss": 0.2102, + "step": 4540 + }, + { + "epoch": 0.4706187169654886, + "grad_norm": 0.5174660682678223, + "learning_rate": 2.286683490206505e-05, + "loss": 0.2159, + "step": 4541 + }, + { + "epoch": 0.47072235464815004, + "grad_norm": 0.48189786076545715, + "learning_rate": 2.286019054745869e-05, + "loss": 0.1991, + "step": 4542 + }, + { + "epoch": 0.47082599233081146, + "grad_norm": 0.5426444411277771, + "learning_rate": 2.2853545870570496e-05, + "loss": 0.2493, + "step": 4543 + }, + { + "epoch": 0.4709296300134729, + "grad_norm": 0.5645356178283691, + "learning_rate": 2.2846900872149188e-05, + "loss": 0.2221, + "step": 4544 + }, + { + "epoch": 0.4710332676961343, + "grad_norm": 0.45435675978660583, + "learning_rate": 2.2840255552943527e-05, + "loss": 0.1806, + "step": 4545 + }, + { + "epoch": 0.4711369053787957, + "grad_norm": 0.5457807183265686, + "learning_rate": 2.2833609913702276e-05, + "loss": 0.1892, + "step": 4546 + }, + { + "epoch": 0.47124054306145713, + "grad_norm": 0.551639199256897, + "learning_rate": 2.2826963955174266e-05, + "loss": 0.1931, + "step": 4547 + }, + { + "epoch": 0.47134418074411855, + "grad_norm": 0.4076830744743347, + "learning_rate": 2.282031767810836e-05, + "loss": 0.1541, + "step": 4548 + }, + { + "epoch": 0.47144781842677996, + "grad_norm": 0.4333003759384155, + "learning_rate": 2.281367108325343e-05, + "loss": 0.1656, + "step": 4549 + }, + { + "epoch": 0.4715514561094414, + "grad_norm": 0.48861321806907654, + "learning_rate": 2.2807024171358424e-05, + "loss": 0.194, + "step": 4550 + }, + { + "epoch": 0.4716550937921028, + "grad_norm": 0.4885448217391968, + "learning_rate": 2.28003769431723e-05, + "loss": 0.2106, + "step": 4551 + }, + { + "epoch": 0.4717587314747642, + "grad_norm": 0.5283402800559998, + "learning_rate": 2.2793729399444052e-05, + "loss": 0.2272, + "step": 4552 + }, + { + "epoch": 0.47186236915742563, + "grad_norm": 0.48492392897605896, + "learning_rate": 2.2787081540922716e-05, + "loss": 0.1947, + "step": 4553 + }, + { + "epoch": 0.47196600684008705, + "grad_norm": 0.4583858847618103, + "learning_rate": 2.2780433368357366e-05, + "loss": 0.1907, + "step": 4554 + }, + { + "epoch": 0.47206964452274847, + "grad_norm": 0.5057693123817444, + "learning_rate": 2.2773784882497104e-05, + "loss": 0.1971, + "step": 4555 + }, + { + "epoch": 0.4721732822054099, + "grad_norm": 0.5850982666015625, + "learning_rate": 2.2767136084091076e-05, + "loss": 0.2276, + "step": 4556 + }, + { + "epoch": 0.4722769198880713, + "grad_norm": 0.5325038433074951, + "learning_rate": 2.2760486973888452e-05, + "loss": 0.2018, + "step": 4557 + }, + { + "epoch": 0.4723805575707327, + "grad_norm": 0.5394957065582275, + "learning_rate": 2.275383755263846e-05, + "loss": 0.223, + "step": 4558 + }, + { + "epoch": 0.47248419525339413, + "grad_norm": 0.4816633462905884, + "learning_rate": 2.274718782109032e-05, + "loss": 0.2213, + "step": 4559 + }, + { + "epoch": 0.47258783293605555, + "grad_norm": 0.48285600543022156, + "learning_rate": 2.274053777999333e-05, + "loss": 0.1915, + "step": 4560 + }, + { + "epoch": 0.47269147061871697, + "grad_norm": 0.5504255890846252, + "learning_rate": 2.273388743009681e-05, + "loss": 0.2267, + "step": 4561 + }, + { + "epoch": 0.4727951083013784, + "grad_norm": 0.5120184421539307, + "learning_rate": 2.2727236772150095e-05, + "loss": 0.2017, + "step": 4562 + }, + { + "epoch": 0.4728987459840398, + "grad_norm": 0.5613784193992615, + "learning_rate": 2.2720585806902582e-05, + "loss": 0.2139, + "step": 4563 + }, + { + "epoch": 0.4730023836667012, + "grad_norm": 0.6070305109024048, + "learning_rate": 2.2713934535103692e-05, + "loss": 0.2626, + "step": 4564 + }, + { + "epoch": 0.47310602134936264, + "grad_norm": 0.5143283009529114, + "learning_rate": 2.2707282957502875e-05, + "loss": 0.1937, + "step": 4565 + }, + { + "epoch": 0.47320965903202405, + "grad_norm": 0.545512318611145, + "learning_rate": 2.2700631074849624e-05, + "loss": 0.218, + "step": 4566 + }, + { + "epoch": 0.47331329671468547, + "grad_norm": 0.5820977091789246, + "learning_rate": 2.2693978887893467e-05, + "loss": 0.2178, + "step": 4567 + }, + { + "epoch": 0.4734169343973469, + "grad_norm": 0.6015923023223877, + "learning_rate": 2.2687326397383952e-05, + "loss": 0.2258, + "step": 4568 + }, + { + "epoch": 0.4735205720800083, + "grad_norm": 0.5356124043464661, + "learning_rate": 2.2680673604070675e-05, + "loss": 0.2095, + "step": 4569 + }, + { + "epoch": 0.4736242097626697, + "grad_norm": 0.43398767709732056, + "learning_rate": 2.2674020508703266e-05, + "loss": 0.1709, + "step": 4570 + }, + { + "epoch": 0.47372784744533114, + "grad_norm": 0.5021622180938721, + "learning_rate": 2.2667367112031382e-05, + "loss": 0.1984, + "step": 4571 + }, + { + "epoch": 0.47383148512799256, + "grad_norm": 0.4682827293872833, + "learning_rate": 2.2660713414804713e-05, + "loss": 0.1612, + "step": 4572 + }, + { + "epoch": 0.473935122810654, + "grad_norm": 0.530903697013855, + "learning_rate": 2.2654059417773e-05, + "loss": 0.22, + "step": 4573 + }, + { + "epoch": 0.4740387604933154, + "grad_norm": 0.5349231362342834, + "learning_rate": 2.2647405121685996e-05, + "loss": 0.2349, + "step": 4574 + }, + { + "epoch": 0.4741423981759768, + "grad_norm": 0.5128585696220398, + "learning_rate": 2.2640750527293495e-05, + "loss": 0.2182, + "step": 4575 + }, + { + "epoch": 0.4742460358586382, + "grad_norm": 0.5996513962745667, + "learning_rate": 2.2634095635345333e-05, + "loss": 0.2495, + "step": 4576 + }, + { + "epoch": 0.47434967354129964, + "grad_norm": 0.5359048843383789, + "learning_rate": 2.262744044659137e-05, + "loss": 0.2267, + "step": 4577 + }, + { + "epoch": 0.474453311223961, + "grad_norm": 0.5653430223464966, + "learning_rate": 2.2620784961781502e-05, + "loss": 0.2158, + "step": 4578 + }, + { + "epoch": 0.4745569489066224, + "grad_norm": 0.5262527465820312, + "learning_rate": 2.261412918166565e-05, + "loss": 0.2212, + "step": 4579 + }, + { + "epoch": 0.47466058658928384, + "grad_norm": 0.4911423623561859, + "learning_rate": 2.2607473106993796e-05, + "loss": 0.2193, + "step": 4580 + }, + { + "epoch": 0.47476422427194526, + "grad_norm": 0.5154918432235718, + "learning_rate": 2.2600816738515924e-05, + "loss": 0.1967, + "step": 4581 + }, + { + "epoch": 0.4748678619546067, + "grad_norm": 0.4786593019962311, + "learning_rate": 2.2594160076982063e-05, + "loss": 0.1907, + "step": 4582 + }, + { + "epoch": 0.4749714996372681, + "grad_norm": 0.6081719398498535, + "learning_rate": 2.2587503123142282e-05, + "loss": 0.2068, + "step": 4583 + }, + { + "epoch": 0.4750751373199295, + "grad_norm": 0.5504110455513, + "learning_rate": 2.2580845877746662e-05, + "loss": 0.2328, + "step": 4584 + }, + { + "epoch": 0.4751787750025909, + "grad_norm": 0.5487823486328125, + "learning_rate": 2.2574188341545343e-05, + "loss": 0.2313, + "step": 4585 + }, + { + "epoch": 0.47528241268525234, + "grad_norm": 0.49996358156204224, + "learning_rate": 2.256753051528849e-05, + "loss": 0.1872, + "step": 4586 + }, + { + "epoch": 0.47538605036791376, + "grad_norm": 0.5519025325775146, + "learning_rate": 2.2560872399726286e-05, + "loss": 0.2212, + "step": 4587 + }, + { + "epoch": 0.4754896880505752, + "grad_norm": 0.5117783546447754, + "learning_rate": 2.255421399560896e-05, + "loss": 0.2141, + "step": 4588 + }, + { + "epoch": 0.4755933257332366, + "grad_norm": 0.5507332682609558, + "learning_rate": 2.2547555303686774e-05, + "loss": 0.2443, + "step": 4589 + }, + { + "epoch": 0.475696963415898, + "grad_norm": 0.5101597309112549, + "learning_rate": 2.2540896324710015e-05, + "loss": 0.2159, + "step": 4590 + }, + { + "epoch": 0.47580060109855943, + "grad_norm": 0.5492226481437683, + "learning_rate": 2.2534237059429004e-05, + "loss": 0.2243, + "step": 4591 + }, + { + "epoch": 0.47590423878122085, + "grad_norm": 0.4963025450706482, + "learning_rate": 2.2527577508594107e-05, + "loss": 0.1767, + "step": 4592 + }, + { + "epoch": 0.47600787646388226, + "grad_norm": 0.5667779445648193, + "learning_rate": 2.2520917672955706e-05, + "loss": 0.2069, + "step": 4593 + }, + { + "epoch": 0.4761115141465437, + "grad_norm": 0.5749843120574951, + "learning_rate": 2.2514257553264213e-05, + "loss": 0.2216, + "step": 4594 + }, + { + "epoch": 0.4762151518292051, + "grad_norm": 0.5432445406913757, + "learning_rate": 2.25075971502701e-05, + "loss": 0.2001, + "step": 4595 + }, + { + "epoch": 0.4763187895118665, + "grad_norm": 0.43292057514190674, + "learning_rate": 2.2500936464723825e-05, + "loss": 0.1596, + "step": 4596 + }, + { + "epoch": 0.47642242719452793, + "grad_norm": 0.4687357246875763, + "learning_rate": 2.2494275497375925e-05, + "loss": 0.1798, + "step": 4597 + }, + { + "epoch": 0.47652606487718935, + "grad_norm": 0.41464748978614807, + "learning_rate": 2.2487614248976932e-05, + "loss": 0.1577, + "step": 4598 + }, + { + "epoch": 0.47662970255985077, + "grad_norm": 0.5634739995002747, + "learning_rate": 2.2480952720277437e-05, + "loss": 0.2338, + "step": 4599 + }, + { + "epoch": 0.4767333402425122, + "grad_norm": 0.6048978567123413, + "learning_rate": 2.247429091202805e-05, + "loss": 0.2468, + "step": 4600 + }, + { + "epoch": 0.4768369779251736, + "grad_norm": 0.42859944701194763, + "learning_rate": 2.2467628824979402e-05, + "loss": 0.17, + "step": 4601 + }, + { + "epoch": 0.476940615607835, + "grad_norm": 0.5351769328117371, + "learning_rate": 2.2460966459882184e-05, + "loss": 0.2032, + "step": 4602 + }, + { + "epoch": 0.47704425329049643, + "grad_norm": 0.4877007305622101, + "learning_rate": 2.245430381748708e-05, + "loss": 0.1612, + "step": 4603 + }, + { + "epoch": 0.47714789097315785, + "grad_norm": 0.5551847219467163, + "learning_rate": 2.244764089854484e-05, + "loss": 0.2231, + "step": 4604 + }, + { + "epoch": 0.47725152865581927, + "grad_norm": 0.46438121795654297, + "learning_rate": 2.2440977703806237e-05, + "loss": 0.1837, + "step": 4605 + }, + { + "epoch": 0.4773551663384807, + "grad_norm": 0.48726150393486023, + "learning_rate": 2.2434314234022052e-05, + "loss": 0.1802, + "step": 4606 + }, + { + "epoch": 0.4774588040211421, + "grad_norm": 0.5019385814666748, + "learning_rate": 2.2427650489943124e-05, + "loss": 0.2204, + "step": 4607 + }, + { + "epoch": 0.4775624417038035, + "grad_norm": 0.4917573034763336, + "learning_rate": 2.2420986472320312e-05, + "loss": 0.1845, + "step": 4608 + }, + { + "epoch": 0.47766607938646494, + "grad_norm": 0.5490579605102539, + "learning_rate": 2.241432218190451e-05, + "loss": 0.2184, + "step": 4609 + }, + { + "epoch": 0.47776971706912635, + "grad_norm": 0.5504197478294373, + "learning_rate": 2.2407657619446637e-05, + "loss": 0.2286, + "step": 4610 + }, + { + "epoch": 0.47787335475178777, + "grad_norm": 0.5806339383125305, + "learning_rate": 2.240099278569765e-05, + "loss": 0.2198, + "step": 4611 + }, + { + "epoch": 0.4779769924344492, + "grad_norm": 0.5220992565155029, + "learning_rate": 2.2394327681408527e-05, + "loss": 0.2153, + "step": 4612 + }, + { + "epoch": 0.4780806301171106, + "grad_norm": 0.5559518337249756, + "learning_rate": 2.238766230733028e-05, + "loss": 0.2228, + "step": 4613 + }, + { + "epoch": 0.478184267799772, + "grad_norm": 0.5041811466217041, + "learning_rate": 2.2380996664213957e-05, + "loss": 0.2144, + "step": 4614 + }, + { + "epoch": 0.47828790548243344, + "grad_norm": 0.5417464375495911, + "learning_rate": 2.237433075281063e-05, + "loss": 0.2393, + "step": 4615 + }, + { + "epoch": 0.4783915431650948, + "grad_norm": 0.4931691884994507, + "learning_rate": 2.2367664573871406e-05, + "loss": 0.2006, + "step": 4616 + }, + { + "epoch": 0.4784951808477562, + "grad_norm": 0.523485541343689, + "learning_rate": 2.2360998128147417e-05, + "loss": 0.2208, + "step": 4617 + }, + { + "epoch": 0.47859881853041764, + "grad_norm": 0.4347810447216034, + "learning_rate": 2.2354331416389835e-05, + "loss": 0.1571, + "step": 4618 + }, + { + "epoch": 0.47870245621307905, + "grad_norm": 0.5515587329864502, + "learning_rate": 2.2347664439349838e-05, + "loss": 0.2146, + "step": 4619 + }, + { + "epoch": 0.47880609389574047, + "grad_norm": 0.48896217346191406, + "learning_rate": 2.234099719777867e-05, + "loss": 0.2021, + "step": 4620 + }, + { + "epoch": 0.4789097315784019, + "grad_norm": 0.5830408334732056, + "learning_rate": 2.2334329692427577e-05, + "loss": 0.2272, + "step": 4621 + }, + { + "epoch": 0.4790133692610633, + "grad_norm": 0.5266159772872925, + "learning_rate": 2.2327661924047842e-05, + "loss": 0.2095, + "step": 4622 + }, + { + "epoch": 0.4791170069437247, + "grad_norm": 0.5474720597267151, + "learning_rate": 2.2320993893390775e-05, + "loss": 0.2496, + "step": 4623 + }, + { + "epoch": 0.47922064462638614, + "grad_norm": 0.5088273882865906, + "learning_rate": 2.231432560120773e-05, + "loss": 0.1993, + "step": 4624 + }, + { + "epoch": 0.47932428230904756, + "grad_norm": 0.5491158366203308, + "learning_rate": 2.230765704825007e-05, + "loss": 0.2022, + "step": 4625 + }, + { + "epoch": 0.479427919991709, + "grad_norm": 0.5160688757896423, + "learning_rate": 2.23009882352692e-05, + "loss": 0.2078, + "step": 4626 + }, + { + "epoch": 0.4795315576743704, + "grad_norm": 0.4869052469730377, + "learning_rate": 2.229431916301656e-05, + "loss": 0.1796, + "step": 4627 + }, + { + "epoch": 0.4796351953570318, + "grad_norm": 0.5505785346031189, + "learning_rate": 2.22876498322436e-05, + "loss": 0.2223, + "step": 4628 + }, + { + "epoch": 0.4797388330396932, + "grad_norm": 0.45815008878707886, + "learning_rate": 2.228098024370181e-05, + "loss": 0.1812, + "step": 4629 + }, + { + "epoch": 0.47984247072235464, + "grad_norm": 0.5766584277153015, + "learning_rate": 2.2274310398142713e-05, + "loss": 0.2521, + "step": 4630 + }, + { + "epoch": 0.47994610840501606, + "grad_norm": 0.5113991498947144, + "learning_rate": 2.2267640296317857e-05, + "loss": 0.2069, + "step": 4631 + }, + { + "epoch": 0.4800497460876775, + "grad_norm": 0.5927762985229492, + "learning_rate": 2.2260969938978815e-05, + "loss": 0.243, + "step": 4632 + }, + { + "epoch": 0.4801533837703389, + "grad_norm": 0.5017601251602173, + "learning_rate": 2.22542993268772e-05, + "loss": 0.1858, + "step": 4633 + }, + { + "epoch": 0.4802570214530003, + "grad_norm": 0.5273293256759644, + "learning_rate": 2.224762846076464e-05, + "loss": 0.2179, + "step": 4634 + }, + { + "epoch": 0.4803606591356617, + "grad_norm": 0.4704946279525757, + "learning_rate": 2.2240957341392796e-05, + "loss": 0.1698, + "step": 4635 + }, + { + "epoch": 0.48046429681832314, + "grad_norm": 0.47459760308265686, + "learning_rate": 2.2234285969513363e-05, + "loss": 0.1876, + "step": 4636 + }, + { + "epoch": 0.48056793450098456, + "grad_norm": 0.5367644429206848, + "learning_rate": 2.2227614345878066e-05, + "loss": 0.1927, + "step": 4637 + }, + { + "epoch": 0.480671572183646, + "grad_norm": 0.5465733408927917, + "learning_rate": 2.2220942471238636e-05, + "loss": 0.211, + "step": 4638 + }, + { + "epoch": 0.4807752098663074, + "grad_norm": 0.5373432040214539, + "learning_rate": 2.2214270346346866e-05, + "loss": 0.225, + "step": 4639 + }, + { + "epoch": 0.4808788475489688, + "grad_norm": 0.5162397027015686, + "learning_rate": 2.220759797195456e-05, + "loss": 0.1835, + "step": 4640 + }, + { + "epoch": 0.48098248523163023, + "grad_norm": 0.4821995794773102, + "learning_rate": 2.2200925348813542e-05, + "loss": 0.1701, + "step": 4641 + }, + { + "epoch": 0.48108612291429165, + "grad_norm": 0.57627272605896, + "learning_rate": 2.2194252477675676e-05, + "loss": 0.2357, + "step": 4642 + }, + { + "epoch": 0.48118976059695306, + "grad_norm": 0.6950919032096863, + "learning_rate": 2.218757935929286e-05, + "loss": 0.2545, + "step": 4643 + }, + { + "epoch": 0.4812933982796145, + "grad_norm": 0.4768347442150116, + "learning_rate": 2.2180905994416992e-05, + "loss": 0.2094, + "step": 4644 + }, + { + "epoch": 0.4813970359622759, + "grad_norm": 0.5636467933654785, + "learning_rate": 2.2174232383800033e-05, + "loss": 0.2087, + "step": 4645 + }, + { + "epoch": 0.4815006736449373, + "grad_norm": 0.5050401091575623, + "learning_rate": 2.216755852819395e-05, + "loss": 0.1834, + "step": 4646 + }, + { + "epoch": 0.48160431132759873, + "grad_norm": 0.6181920170783997, + "learning_rate": 2.2160884428350737e-05, + "loss": 0.2612, + "step": 4647 + }, + { + "epoch": 0.48170794901026015, + "grad_norm": 0.5703398585319519, + "learning_rate": 2.2154210085022426e-05, + "loss": 0.2377, + "step": 4648 + }, + { + "epoch": 0.48181158669292157, + "grad_norm": 0.5547765493392944, + "learning_rate": 2.2147535498961075e-05, + "loss": 0.2058, + "step": 4649 + }, + { + "epoch": 0.481915224375583, + "grad_norm": 0.5765746831893921, + "learning_rate": 2.2140860670918762e-05, + "loss": 0.2426, + "step": 4650 + }, + { + "epoch": 0.4820188620582444, + "grad_norm": 0.5647233724594116, + "learning_rate": 2.2134185601647595e-05, + "loss": 0.1993, + "step": 4651 + }, + { + "epoch": 0.4821224997409058, + "grad_norm": 0.5586739778518677, + "learning_rate": 2.212751029189971e-05, + "loss": 0.2334, + "step": 4652 + }, + { + "epoch": 0.48222613742356724, + "grad_norm": 0.5199017524719238, + "learning_rate": 2.212083474242727e-05, + "loss": 0.1745, + "step": 4653 + }, + { + "epoch": 0.4823297751062286, + "grad_norm": 0.5173331499099731, + "learning_rate": 2.211415895398248e-05, + "loss": 0.2134, + "step": 4654 + }, + { + "epoch": 0.48243341278889, + "grad_norm": 0.6455654501914978, + "learning_rate": 2.2107482927317534e-05, + "loss": 0.203, + "step": 4655 + }, + { + "epoch": 0.48253705047155143, + "grad_norm": 0.4340267479419708, + "learning_rate": 2.21008066631847e-05, + "loss": 0.1797, + "step": 4656 + }, + { + "epoch": 0.48264068815421285, + "grad_norm": 0.5833914279937744, + "learning_rate": 2.2094130162336227e-05, + "loss": 0.2338, + "step": 4657 + }, + { + "epoch": 0.48274432583687427, + "grad_norm": 0.5767900347709656, + "learning_rate": 2.2087453425524426e-05, + "loss": 0.2228, + "step": 4658 + }, + { + "epoch": 0.4828479635195357, + "grad_norm": 0.5963987708091736, + "learning_rate": 2.2080776453501614e-05, + "loss": 0.2438, + "step": 4659 + }, + { + "epoch": 0.4829516012021971, + "grad_norm": 0.46724367141723633, + "learning_rate": 2.2074099247020147e-05, + "loss": 0.1832, + "step": 4660 + }, + { + "epoch": 0.4830552388848585, + "grad_norm": 0.5257183909416199, + "learning_rate": 2.2067421806832403e-05, + "loss": 0.2144, + "step": 4661 + }, + { + "epoch": 0.48315887656751993, + "grad_norm": 0.6028289198875427, + "learning_rate": 2.206074413369079e-05, + "loss": 0.2536, + "step": 4662 + }, + { + "epoch": 0.48326251425018135, + "grad_norm": 0.5243949890136719, + "learning_rate": 2.205406622834772e-05, + "loss": 0.1765, + "step": 4663 + }, + { + "epoch": 0.48336615193284277, + "grad_norm": 0.4694502353668213, + "learning_rate": 2.2047388091555665e-05, + "loss": 0.1839, + "step": 4664 + }, + { + "epoch": 0.4834697896155042, + "grad_norm": 0.5306639671325684, + "learning_rate": 2.2040709724067106e-05, + "loss": 0.2186, + "step": 4665 + }, + { + "epoch": 0.4835734272981656, + "grad_norm": 0.4971131384372711, + "learning_rate": 2.2034031126634544e-05, + "loss": 0.1806, + "step": 4666 + }, + { + "epoch": 0.483677064980827, + "grad_norm": 0.6058434844017029, + "learning_rate": 2.202735230001052e-05, + "loss": 0.2814, + "step": 4667 + }, + { + "epoch": 0.48378070266348844, + "grad_norm": 0.4897499978542328, + "learning_rate": 2.2020673244947587e-05, + "loss": 0.1726, + "step": 4668 + }, + { + "epoch": 0.48388434034614985, + "grad_norm": 0.43752723932266235, + "learning_rate": 2.2013993962198336e-05, + "loss": 0.1812, + "step": 4669 + }, + { + "epoch": 0.48398797802881127, + "grad_norm": 0.4837397038936615, + "learning_rate": 2.2007314452515375e-05, + "loss": 0.2176, + "step": 4670 + }, + { + "epoch": 0.4840916157114727, + "grad_norm": 0.5185476541519165, + "learning_rate": 2.2000634716651338e-05, + "loss": 0.203, + "step": 4671 + }, + { + "epoch": 0.4841952533941341, + "grad_norm": 0.46402788162231445, + "learning_rate": 2.19939547553589e-05, + "loss": 0.177, + "step": 4672 + }, + { + "epoch": 0.4842988910767955, + "grad_norm": 0.5397825241088867, + "learning_rate": 2.1987274569390727e-05, + "loss": 0.237, + "step": 4673 + }, + { + "epoch": 0.48440252875945694, + "grad_norm": 0.5343857407569885, + "learning_rate": 2.1980594159499558e-05, + "loss": 0.2077, + "step": 4674 + }, + { + "epoch": 0.48450616644211836, + "grad_norm": 0.500300943851471, + "learning_rate": 2.1973913526438114e-05, + "loss": 0.219, + "step": 4675 + }, + { + "epoch": 0.4846098041247798, + "grad_norm": 0.5813903212547302, + "learning_rate": 2.1967232670959158e-05, + "loss": 0.2583, + "step": 4676 + }, + { + "epoch": 0.4847134418074412, + "grad_norm": 0.5086579918861389, + "learning_rate": 2.196055159381549e-05, + "loss": 0.2018, + "step": 4677 + }, + { + "epoch": 0.4848170794901026, + "grad_norm": 0.5221996307373047, + "learning_rate": 2.195387029575991e-05, + "loss": 0.2156, + "step": 4678 + }, + { + "epoch": 0.484920717172764, + "grad_norm": 0.5296904444694519, + "learning_rate": 2.194718877754526e-05, + "loss": 0.2459, + "step": 4679 + }, + { + "epoch": 0.48502435485542544, + "grad_norm": 0.5555968880653381, + "learning_rate": 2.1940507039924414e-05, + "loss": 0.2386, + "step": 4680 + }, + { + "epoch": 0.48512799253808686, + "grad_norm": 0.4701452851295471, + "learning_rate": 2.193382508365025e-05, + "loss": 0.1761, + "step": 4681 + }, + { + "epoch": 0.4852316302207483, + "grad_norm": 0.5224292874336243, + "learning_rate": 2.192714290947568e-05, + "loss": 0.2258, + "step": 4682 + }, + { + "epoch": 0.4853352679034097, + "grad_norm": 0.5685223340988159, + "learning_rate": 2.1920460518153637e-05, + "loss": 0.2386, + "step": 4683 + }, + { + "epoch": 0.4854389055860711, + "grad_norm": 0.6110420227050781, + "learning_rate": 2.1913777910437094e-05, + "loss": 0.2472, + "step": 4684 + }, + { + "epoch": 0.48554254326873253, + "grad_norm": 0.523505687713623, + "learning_rate": 2.190709508707903e-05, + "loss": 0.2076, + "step": 4685 + }, + { + "epoch": 0.48564618095139395, + "grad_norm": 0.4906814694404602, + "learning_rate": 2.1900412048832456e-05, + "loss": 0.1953, + "step": 4686 + }, + { + "epoch": 0.48574981863405536, + "grad_norm": 0.49548250436782837, + "learning_rate": 2.189372879645041e-05, + "loss": 0.2112, + "step": 4687 + }, + { + "epoch": 0.4858534563167168, + "grad_norm": 0.5095117688179016, + "learning_rate": 2.1887045330685937e-05, + "loss": 0.1941, + "step": 4688 + }, + { + "epoch": 0.4859570939993782, + "grad_norm": 0.521227240562439, + "learning_rate": 2.188036165229214e-05, + "loss": 0.2176, + "step": 4689 + }, + { + "epoch": 0.4860607316820396, + "grad_norm": 0.5012630224227905, + "learning_rate": 2.1873677762022116e-05, + "loss": 0.2102, + "step": 4690 + }, + { + "epoch": 0.48616436936470103, + "grad_norm": 0.5226165056228638, + "learning_rate": 2.186699366062899e-05, + "loss": 0.2343, + "step": 4691 + }, + { + "epoch": 0.4862680070473624, + "grad_norm": 0.5168052315711975, + "learning_rate": 2.186030934886592e-05, + "loss": 0.1981, + "step": 4692 + }, + { + "epoch": 0.4863716447300238, + "grad_norm": 0.5914785265922546, + "learning_rate": 2.1853624827486082e-05, + "loss": 0.2423, + "step": 4693 + }, + { + "epoch": 0.48647528241268523, + "grad_norm": 0.508635938167572, + "learning_rate": 2.1846940097242684e-05, + "loss": 0.1999, + "step": 4694 + }, + { + "epoch": 0.48657892009534665, + "grad_norm": 0.5382908582687378, + "learning_rate": 2.1840255158888946e-05, + "loss": 0.2426, + "step": 4695 + }, + { + "epoch": 0.48668255777800806, + "grad_norm": 0.5902379751205444, + "learning_rate": 2.1833570013178117e-05, + "loss": 0.223, + "step": 4696 + }, + { + "epoch": 0.4867861954606695, + "grad_norm": 0.4909026622772217, + "learning_rate": 2.1826884660863473e-05, + "loss": 0.1949, + "step": 4697 + }, + { + "epoch": 0.4868898331433309, + "grad_norm": 0.5593119859695435, + "learning_rate": 2.18201991026983e-05, + "loss": 0.2712, + "step": 4698 + }, + { + "epoch": 0.4869934708259923, + "grad_norm": 0.5627800822257996, + "learning_rate": 2.1813513339435924e-05, + "loss": 0.2411, + "step": 4699 + }, + { + "epoch": 0.48709710850865373, + "grad_norm": 0.5743441581726074, + "learning_rate": 2.1806827371829686e-05, + "loss": 0.2438, + "step": 4700 + }, + { + "epoch": 0.48720074619131515, + "grad_norm": 0.554901659488678, + "learning_rate": 2.1800141200632944e-05, + "loss": 0.2311, + "step": 4701 + }, + { + "epoch": 0.48730438387397657, + "grad_norm": 0.5914807319641113, + "learning_rate": 2.1793454826599092e-05, + "loss": 0.2139, + "step": 4702 + }, + { + "epoch": 0.487408021556638, + "grad_norm": 0.5732871294021606, + "learning_rate": 2.178676825048154e-05, + "loss": 0.207, + "step": 4703 + }, + { + "epoch": 0.4875116592392994, + "grad_norm": 0.5167139768600464, + "learning_rate": 2.1780081473033715e-05, + "loss": 0.2267, + "step": 4704 + }, + { + "epoch": 0.4876152969219608, + "grad_norm": 0.5026474595069885, + "learning_rate": 2.177339449500908e-05, + "loss": 0.1783, + "step": 4705 + }, + { + "epoch": 0.48771893460462223, + "grad_norm": 0.5291749835014343, + "learning_rate": 2.1766707317161115e-05, + "loss": 0.2097, + "step": 4706 + }, + { + "epoch": 0.48782257228728365, + "grad_norm": 0.5648260116577148, + "learning_rate": 2.176001994024331e-05, + "loss": 0.2037, + "step": 4707 + }, + { + "epoch": 0.48792620996994507, + "grad_norm": 0.6325567960739136, + "learning_rate": 2.175333236500919e-05, + "loss": 0.2531, + "step": 4708 + }, + { + "epoch": 0.4880298476526065, + "grad_norm": 0.46111971139907837, + "learning_rate": 2.1746644592212314e-05, + "loss": 0.179, + "step": 4709 + }, + { + "epoch": 0.4881334853352679, + "grad_norm": 0.4900364279747009, + "learning_rate": 2.173995662260623e-05, + "loss": 0.2037, + "step": 4710 + }, + { + "epoch": 0.4882371230179293, + "grad_norm": 0.5328616499900818, + "learning_rate": 2.1733268456944544e-05, + "loss": 0.2163, + "step": 4711 + }, + { + "epoch": 0.48834076070059074, + "grad_norm": 0.5089332461357117, + "learning_rate": 2.1726580095980866e-05, + "loss": 0.1893, + "step": 4712 + }, + { + "epoch": 0.48844439838325215, + "grad_norm": 0.5162228345870972, + "learning_rate": 2.1719891540468824e-05, + "loss": 0.1971, + "step": 4713 + }, + { + "epoch": 0.48854803606591357, + "grad_norm": 0.4371989667415619, + "learning_rate": 2.1713202791162074e-05, + "loss": 0.1675, + "step": 4714 + }, + { + "epoch": 0.488651673748575, + "grad_norm": 0.528220534324646, + "learning_rate": 2.17065138488143e-05, + "loss": 0.2311, + "step": 4715 + }, + { + "epoch": 0.4887553114312364, + "grad_norm": 0.5761193633079529, + "learning_rate": 2.16998247141792e-05, + "loss": 0.2424, + "step": 4716 + }, + { + "epoch": 0.4888589491138978, + "grad_norm": 0.5048497319221497, + "learning_rate": 2.1693135388010492e-05, + "loss": 0.2086, + "step": 4717 + }, + { + "epoch": 0.48896258679655924, + "grad_norm": 0.4758215546607971, + "learning_rate": 2.1686445871061913e-05, + "loss": 0.1793, + "step": 4718 + }, + { + "epoch": 0.48906622447922066, + "grad_norm": 0.5638699531555176, + "learning_rate": 2.1679756164087248e-05, + "loss": 0.2315, + "step": 4719 + }, + { + "epoch": 0.4891698621618821, + "grad_norm": 0.4771273136138916, + "learning_rate": 2.1673066267840266e-05, + "loss": 0.1924, + "step": 4720 + }, + { + "epoch": 0.4892734998445435, + "grad_norm": 0.5214678645133972, + "learning_rate": 2.166637618307477e-05, + "loss": 0.2128, + "step": 4721 + }, + { + "epoch": 0.4893771375272049, + "grad_norm": 0.4893636703491211, + "learning_rate": 2.165968591054461e-05, + "loss": 0.1814, + "step": 4722 + }, + { + "epoch": 0.4894807752098663, + "grad_norm": 0.46817266941070557, + "learning_rate": 2.1652995451003608e-05, + "loss": 0.1749, + "step": 4723 + }, + { + "epoch": 0.48958441289252774, + "grad_norm": 0.5227934122085571, + "learning_rate": 2.164630480520565e-05, + "loss": 0.2133, + "step": 4724 + }, + { + "epoch": 0.48968805057518916, + "grad_norm": 0.5478593111038208, + "learning_rate": 2.1639613973904633e-05, + "loss": 0.2459, + "step": 4725 + }, + { + "epoch": 0.4897916882578506, + "grad_norm": 0.5256202220916748, + "learning_rate": 2.163292295785446e-05, + "loss": 0.1603, + "step": 4726 + }, + { + "epoch": 0.489895325940512, + "grad_norm": 0.6194120645523071, + "learning_rate": 2.162623175780906e-05, + "loss": 0.2218, + "step": 4727 + }, + { + "epoch": 0.4899989636231734, + "grad_norm": 0.5352566242218018, + "learning_rate": 2.16195403745224e-05, + "loss": 0.1922, + "step": 4728 + }, + { + "epoch": 0.49010260130583483, + "grad_norm": 0.5198310613632202, + "learning_rate": 2.1612848808748446e-05, + "loss": 0.2033, + "step": 4729 + }, + { + "epoch": 0.4902062389884962, + "grad_norm": 0.5337640643119812, + "learning_rate": 2.1606157061241196e-05, + "loss": 0.218, + "step": 4730 + }, + { + "epoch": 0.4903098766711576, + "grad_norm": 0.42674437165260315, + "learning_rate": 2.1599465132754664e-05, + "loss": 0.1643, + "step": 4731 + }, + { + "epoch": 0.490413514353819, + "grad_norm": 0.4258442521095276, + "learning_rate": 2.159277302404289e-05, + "loss": 0.1576, + "step": 4732 + }, + { + "epoch": 0.49051715203648044, + "grad_norm": 0.5152009129524231, + "learning_rate": 2.1586080735859926e-05, + "loss": 0.2069, + "step": 4733 + }, + { + "epoch": 0.49062078971914186, + "grad_norm": 0.5394965410232544, + "learning_rate": 2.1579388268959848e-05, + "loss": 0.2046, + "step": 4734 + }, + { + "epoch": 0.4907244274018033, + "grad_norm": 0.5877050757408142, + "learning_rate": 2.1572695624096763e-05, + "loss": 0.2356, + "step": 4735 + }, + { + "epoch": 0.4908280650844647, + "grad_norm": 0.5340604782104492, + "learning_rate": 2.1566002802024776e-05, + "loss": 0.2014, + "step": 4736 + }, + { + "epoch": 0.4909317027671261, + "grad_norm": 0.4920124113559723, + "learning_rate": 2.1559309803498022e-05, + "loss": 0.1995, + "step": 4737 + }, + { + "epoch": 0.4910353404497875, + "grad_norm": 0.4382156729698181, + "learning_rate": 2.1552616629270668e-05, + "loss": 0.1734, + "step": 4738 + }, + { + "epoch": 0.49113897813244894, + "grad_norm": 0.5280501246452332, + "learning_rate": 2.1545923280096885e-05, + "loss": 0.1988, + "step": 4739 + }, + { + "epoch": 0.49124261581511036, + "grad_norm": 0.5084026455879211, + "learning_rate": 2.1539229756730868e-05, + "loss": 0.185, + "step": 4740 + }, + { + "epoch": 0.4913462534977718, + "grad_norm": 0.457512229681015, + "learning_rate": 2.1532536059926842e-05, + "loss": 0.1759, + "step": 4741 + }, + { + "epoch": 0.4914498911804332, + "grad_norm": 0.5790635943412781, + "learning_rate": 2.1525842190439022e-05, + "loss": 0.2617, + "step": 4742 + }, + { + "epoch": 0.4915535288630946, + "grad_norm": 0.5027420520782471, + "learning_rate": 2.1519148149021688e-05, + "loss": 0.1973, + "step": 4743 + }, + { + "epoch": 0.49165716654575603, + "grad_norm": 0.6793406009674072, + "learning_rate": 2.15124539364291e-05, + "loss": 0.2529, + "step": 4744 + }, + { + "epoch": 0.49176080422841745, + "grad_norm": 0.5920283198356628, + "learning_rate": 2.1505759553415554e-05, + "loss": 0.2527, + "step": 4745 + }, + { + "epoch": 0.49186444191107886, + "grad_norm": 0.4970380365848541, + "learning_rate": 2.1499065000735357e-05, + "loss": 0.1747, + "step": 4746 + }, + { + "epoch": 0.4919680795937403, + "grad_norm": 0.543004035949707, + "learning_rate": 2.1492370279142848e-05, + "loss": 0.2162, + "step": 4747 + }, + { + "epoch": 0.4920717172764017, + "grad_norm": 0.5673389434814453, + "learning_rate": 2.1485675389392377e-05, + "loss": 0.25, + "step": 4748 + }, + { + "epoch": 0.4921753549590631, + "grad_norm": 0.5720213055610657, + "learning_rate": 2.147898033223831e-05, + "loss": 0.208, + "step": 4749 + }, + { + "epoch": 0.49227899264172453, + "grad_norm": 0.6086804866790771, + "learning_rate": 2.1472285108435044e-05, + "loss": 0.2467, + "step": 4750 + }, + { + "epoch": 0.49238263032438595, + "grad_norm": 0.5073210597038269, + "learning_rate": 2.146558971873698e-05, + "loss": 0.2411, + "step": 4751 + }, + { + "epoch": 0.49248626800704737, + "grad_norm": 0.4711146652698517, + "learning_rate": 2.1458894163898537e-05, + "loss": 0.1911, + "step": 4752 + }, + { + "epoch": 0.4925899056897088, + "grad_norm": 0.5544717907905579, + "learning_rate": 2.1452198444674176e-05, + "loss": 0.206, + "step": 4753 + }, + { + "epoch": 0.4926935433723702, + "grad_norm": 0.5063883662223816, + "learning_rate": 2.144550256181835e-05, + "loss": 0.2161, + "step": 4754 + }, + { + "epoch": 0.4927971810550316, + "grad_norm": 0.5890721082687378, + "learning_rate": 2.143880651608554e-05, + "loss": 0.2086, + "step": 4755 + }, + { + "epoch": 0.49290081873769304, + "grad_norm": 0.5515021681785583, + "learning_rate": 2.1432110308230246e-05, + "loss": 0.2401, + "step": 4756 + }, + { + "epoch": 0.49300445642035445, + "grad_norm": 0.390842080116272, + "learning_rate": 2.1425413939006995e-05, + "loss": 0.1599, + "step": 4757 + }, + { + "epoch": 0.49310809410301587, + "grad_norm": 0.5897576808929443, + "learning_rate": 2.1418717409170312e-05, + "loss": 0.2546, + "step": 4758 + }, + { + "epoch": 0.4932117317856773, + "grad_norm": 0.5300201177597046, + "learning_rate": 2.1412020719474758e-05, + "loss": 0.2265, + "step": 4759 + }, + { + "epoch": 0.4933153694683387, + "grad_norm": 0.4570264220237732, + "learning_rate": 2.140532387067491e-05, + "loss": 0.1888, + "step": 4760 + }, + { + "epoch": 0.4934190071510001, + "grad_norm": 0.5168429017066956, + "learning_rate": 2.139862686352535e-05, + "loss": 0.2056, + "step": 4761 + }, + { + "epoch": 0.49352264483366154, + "grad_norm": 0.49334126710891724, + "learning_rate": 2.139192969878068e-05, + "loss": 0.1755, + "step": 4762 + }, + { + "epoch": 0.49362628251632296, + "grad_norm": 0.5192726850509644, + "learning_rate": 2.138523237719555e-05, + "loss": 0.1771, + "step": 4763 + }, + { + "epoch": 0.4937299201989844, + "grad_norm": 0.4422583281993866, + "learning_rate": 2.137853489952458e-05, + "loss": 0.1702, + "step": 4764 + }, + { + "epoch": 0.4938335578816458, + "grad_norm": 0.47275489568710327, + "learning_rate": 2.1371837266522443e-05, + "loss": 0.1853, + "step": 4765 + }, + { + "epoch": 0.4939371955643072, + "grad_norm": 0.4642319083213806, + "learning_rate": 2.1365139478943816e-05, + "loss": 0.1811, + "step": 4766 + }, + { + "epoch": 0.4940408332469686, + "grad_norm": 0.5717727541923523, + "learning_rate": 2.1358441537543393e-05, + "loss": 0.2215, + "step": 4767 + }, + { + "epoch": 0.49414447092963, + "grad_norm": 0.48989662528038025, + "learning_rate": 2.135174344307589e-05, + "loss": 0.1697, + "step": 4768 + }, + { + "epoch": 0.4942481086122914, + "grad_norm": 0.5562463998794556, + "learning_rate": 2.1345045196296036e-05, + "loss": 0.2133, + "step": 4769 + }, + { + "epoch": 0.4943517462949528, + "grad_norm": 0.5160841941833496, + "learning_rate": 2.1338346797958584e-05, + "loss": 0.2158, + "step": 4770 + }, + { + "epoch": 0.49445538397761424, + "grad_norm": 0.5394311547279358, + "learning_rate": 2.1331648248818292e-05, + "loss": 0.2265, + "step": 4771 + }, + { + "epoch": 0.49455902166027566, + "grad_norm": 0.5124887824058533, + "learning_rate": 2.1324949549629946e-05, + "loss": 0.2185, + "step": 4772 + }, + { + "epoch": 0.49466265934293707, + "grad_norm": 0.42647621035575867, + "learning_rate": 2.1318250701148343e-05, + "loss": 0.1415, + "step": 4773 + }, + { + "epoch": 0.4947662970255985, + "grad_norm": 0.5553656816482544, + "learning_rate": 2.1311551704128298e-05, + "loss": 0.2234, + "step": 4774 + }, + { + "epoch": 0.4948699347082599, + "grad_norm": 0.5774763226509094, + "learning_rate": 2.1304852559324653e-05, + "loss": 0.245, + "step": 4775 + }, + { + "epoch": 0.4949735723909213, + "grad_norm": 0.5323425531387329, + "learning_rate": 2.129815326749225e-05, + "loss": 0.2211, + "step": 4776 + }, + { + "epoch": 0.49507721007358274, + "grad_norm": 0.5304404497146606, + "learning_rate": 2.1291453829385947e-05, + "loss": 0.214, + "step": 4777 + }, + { + "epoch": 0.49518084775624416, + "grad_norm": 0.5394851565361023, + "learning_rate": 2.1284754245760637e-05, + "loss": 0.1973, + "step": 4778 + }, + { + "epoch": 0.4952844854389056, + "grad_norm": 0.5241143703460693, + "learning_rate": 2.127805451737122e-05, + "loss": 0.1916, + "step": 4779 + }, + { + "epoch": 0.495388123121567, + "grad_norm": 0.48268353939056396, + "learning_rate": 2.1271354644972603e-05, + "loss": 0.1924, + "step": 4780 + }, + { + "epoch": 0.4954917608042284, + "grad_norm": 0.5393835306167603, + "learning_rate": 2.126465462931972e-05, + "loss": 0.2122, + "step": 4781 + }, + { + "epoch": 0.4955953984868898, + "grad_norm": 0.4567290246486664, + "learning_rate": 2.1257954471167525e-05, + "loss": 0.1814, + "step": 4782 + }, + { + "epoch": 0.49569903616955124, + "grad_norm": 0.5594385862350464, + "learning_rate": 2.1251254171270972e-05, + "loss": 0.2462, + "step": 4783 + }, + { + "epoch": 0.49580267385221266, + "grad_norm": 0.604749858379364, + "learning_rate": 2.124455373038504e-05, + "loss": 0.2055, + "step": 4784 + }, + { + "epoch": 0.4959063115348741, + "grad_norm": 0.5307607650756836, + "learning_rate": 2.123785314926474e-05, + "loss": 0.2186, + "step": 4785 + }, + { + "epoch": 0.4960099492175355, + "grad_norm": 0.5244830846786499, + "learning_rate": 2.123115242866506e-05, + "loss": 0.2084, + "step": 4786 + }, + { + "epoch": 0.4961135869001969, + "grad_norm": 0.5358648300170898, + "learning_rate": 2.122445156934104e-05, + "loss": 0.217, + "step": 4787 + }, + { + "epoch": 0.49621722458285833, + "grad_norm": 0.6167570948600769, + "learning_rate": 2.1217750572047725e-05, + "loss": 0.2607, + "step": 4788 + }, + { + "epoch": 0.49632086226551975, + "grad_norm": 0.5059612393379211, + "learning_rate": 2.1211049437540166e-05, + "loss": 0.1914, + "step": 4789 + }, + { + "epoch": 0.49642449994818116, + "grad_norm": 0.5213268995285034, + "learning_rate": 2.120434816657344e-05, + "loss": 0.1881, + "step": 4790 + }, + { + "epoch": 0.4965281376308426, + "grad_norm": 0.4669734239578247, + "learning_rate": 2.119764675990263e-05, + "loss": 0.1719, + "step": 4791 + }, + { + "epoch": 0.496631775313504, + "grad_norm": 0.5827397108078003, + "learning_rate": 2.119094521828285e-05, + "loss": 0.2356, + "step": 4792 + }, + { + "epoch": 0.4967354129961654, + "grad_norm": 0.5616671442985535, + "learning_rate": 2.1184243542469214e-05, + "loss": 0.218, + "step": 4793 + }, + { + "epoch": 0.49683905067882683, + "grad_norm": 0.5867148637771606, + "learning_rate": 2.1177541733216853e-05, + "loss": 0.2114, + "step": 4794 + }, + { + "epoch": 0.49694268836148825, + "grad_norm": 0.4665326774120331, + "learning_rate": 2.1170839791280928e-05, + "loss": 0.1659, + "step": 4795 + }, + { + "epoch": 0.49704632604414967, + "grad_norm": 0.5649195313453674, + "learning_rate": 2.116413771741659e-05, + "loss": 0.2069, + "step": 4796 + }, + { + "epoch": 0.4971499637268111, + "grad_norm": 0.5897424817085266, + "learning_rate": 2.115743551237902e-05, + "loss": 0.1988, + "step": 4797 + }, + { + "epoch": 0.4972536014094725, + "grad_norm": 0.6335086822509766, + "learning_rate": 2.115073317692342e-05, + "loss": 0.2381, + "step": 4798 + }, + { + "epoch": 0.4973572390921339, + "grad_norm": 0.5553082227706909, + "learning_rate": 2.1144030711804996e-05, + "loss": 0.2303, + "step": 4799 + }, + { + "epoch": 0.49746087677479534, + "grad_norm": 0.4504654109477997, + "learning_rate": 2.1137328117778967e-05, + "loss": 0.1665, + "step": 4800 + }, + { + "epoch": 0.49756451445745675, + "grad_norm": 0.5283129215240479, + "learning_rate": 2.113062539560058e-05, + "loss": 0.215, + "step": 4801 + }, + { + "epoch": 0.49766815214011817, + "grad_norm": 0.5294463038444519, + "learning_rate": 2.112392254602507e-05, + "loss": 0.2278, + "step": 4802 + }, + { + "epoch": 0.4977717898227796, + "grad_norm": 0.579262912273407, + "learning_rate": 2.1117219569807717e-05, + "loss": 0.2302, + "step": 4803 + }, + { + "epoch": 0.497875427505441, + "grad_norm": 0.5535164475440979, + "learning_rate": 2.111051646770381e-05, + "loss": 0.2272, + "step": 4804 + }, + { + "epoch": 0.4979790651881024, + "grad_norm": 0.5065291523933411, + "learning_rate": 2.1103813240468624e-05, + "loss": 0.199, + "step": 4805 + }, + { + "epoch": 0.4980827028707638, + "grad_norm": 0.4622606635093689, + "learning_rate": 2.109710988885748e-05, + "loss": 0.1854, + "step": 4806 + }, + { + "epoch": 0.4981863405534252, + "grad_norm": 0.5690516829490662, + "learning_rate": 2.10904064136257e-05, + "loss": 0.2311, + "step": 4807 + }, + { + "epoch": 0.4982899782360866, + "grad_norm": 0.5897866487503052, + "learning_rate": 2.108370281552862e-05, + "loss": 0.2058, + "step": 4808 + }, + { + "epoch": 0.49839361591874803, + "grad_norm": 0.5769507884979248, + "learning_rate": 2.107699909532159e-05, + "loss": 0.2183, + "step": 4809 + }, + { + "epoch": 0.49849725360140945, + "grad_norm": 0.5400382876396179, + "learning_rate": 2.107029525375998e-05, + "loss": 0.1919, + "step": 4810 + }, + { + "epoch": 0.49860089128407087, + "grad_norm": 0.48948460817337036, + "learning_rate": 2.1063591291599167e-05, + "loss": 0.2164, + "step": 4811 + }, + { + "epoch": 0.4987045289667323, + "grad_norm": 0.49134010076522827, + "learning_rate": 2.105688720959453e-05, + "loss": 0.218, + "step": 4812 + }, + { + "epoch": 0.4988081666493937, + "grad_norm": 0.5676142573356628, + "learning_rate": 2.1050183008501487e-05, + "loss": 0.2142, + "step": 4813 + }, + { + "epoch": 0.4989118043320551, + "grad_norm": 0.46498799324035645, + "learning_rate": 2.1043478689075464e-05, + "loss": 0.1828, + "step": 4814 + }, + { + "epoch": 0.49901544201471654, + "grad_norm": 0.5404950380325317, + "learning_rate": 2.103677425207188e-05, + "loss": 0.2091, + "step": 4815 + }, + { + "epoch": 0.49911907969737795, + "grad_norm": 0.5214618444442749, + "learning_rate": 2.103006969824618e-05, + "loss": 0.2306, + "step": 4816 + }, + { + "epoch": 0.49922271738003937, + "grad_norm": 0.6194533109664917, + "learning_rate": 2.1023365028353835e-05, + "loss": 0.235, + "step": 4817 + }, + { + "epoch": 0.4993263550627008, + "grad_norm": 0.649368941783905, + "learning_rate": 2.101666024315031e-05, + "loss": 0.2216, + "step": 4818 + }, + { + "epoch": 0.4994299927453622, + "grad_norm": 0.49496763944625854, + "learning_rate": 2.1009955343391084e-05, + "loss": 0.1852, + "step": 4819 + }, + { + "epoch": 0.4995336304280236, + "grad_norm": 0.5390393137931824, + "learning_rate": 2.1003250329831664e-05, + "loss": 0.2087, + "step": 4820 + }, + { + "epoch": 0.49963726811068504, + "grad_norm": 0.5321680307388306, + "learning_rate": 2.0996545203227556e-05, + "loss": 0.2018, + "step": 4821 + }, + { + "epoch": 0.49974090579334646, + "grad_norm": 0.6124078035354614, + "learning_rate": 2.0989839964334275e-05, + "loss": 0.2178, + "step": 4822 + }, + { + "epoch": 0.4998445434760079, + "grad_norm": 0.5372506380081177, + "learning_rate": 2.0983134613907378e-05, + "loss": 0.2017, + "step": 4823 + }, + { + "epoch": 0.4999481811586693, + "grad_norm": 0.586346447467804, + "learning_rate": 2.0976429152702392e-05, + "loss": 0.2346, + "step": 4824 + }, + { + "epoch": 0.5000518188413307, + "grad_norm": 0.5241424441337585, + "learning_rate": 2.096972358147489e-05, + "loss": 0.1981, + "step": 4825 + }, + { + "epoch": 0.5001554565239921, + "grad_norm": 0.5134982466697693, + "learning_rate": 2.0963017900980444e-05, + "loss": 0.2092, + "step": 4826 + }, + { + "epoch": 0.5002590942066535, + "grad_norm": 0.5294827818870544, + "learning_rate": 2.0956312111974636e-05, + "loss": 0.2074, + "step": 4827 + }, + { + "epoch": 0.5003627318893149, + "grad_norm": 0.5249479413032532, + "learning_rate": 2.094960621521307e-05, + "loss": 0.2131, + "step": 4828 + }, + { + "epoch": 0.5004663695719763, + "grad_norm": 0.5820158123970032, + "learning_rate": 2.0942900211451352e-05, + "loss": 0.2111, + "step": 4829 + }, + { + "epoch": 0.5005700072546377, + "grad_norm": 0.5495398044586182, + "learning_rate": 2.09361941014451e-05, + "loss": 0.2015, + "step": 4830 + }, + { + "epoch": 0.5006736449372992, + "grad_norm": 0.5136843323707581, + "learning_rate": 2.0929487885949945e-05, + "loss": 0.2091, + "step": 4831 + }, + { + "epoch": 0.5007772826199606, + "grad_norm": 0.5137192606925964, + "learning_rate": 2.0922781565721552e-05, + "loss": 0.172, + "step": 4832 + }, + { + "epoch": 0.500880920302622, + "grad_norm": 0.5403234958648682, + "learning_rate": 2.0916075141515563e-05, + "loss": 0.2079, + "step": 4833 + }, + { + "epoch": 0.5009845579852834, + "grad_norm": 0.5703449845314026, + "learning_rate": 2.090936861408765e-05, + "loss": 0.2085, + "step": 4834 + }, + { + "epoch": 0.5010881956679448, + "grad_norm": 0.4938044548034668, + "learning_rate": 2.0902661984193493e-05, + "loss": 0.1959, + "step": 4835 + }, + { + "epoch": 0.5011918333506062, + "grad_norm": 0.4589805603027344, + "learning_rate": 2.0895955252588787e-05, + "loss": 0.1679, + "step": 4836 + }, + { + "epoch": 0.5012954710332677, + "grad_norm": 0.5060716867446899, + "learning_rate": 2.088924842002924e-05, + "loss": 0.1942, + "step": 4837 + }, + { + "epoch": 0.5013991087159291, + "grad_norm": 0.5256392359733582, + "learning_rate": 2.088254148727056e-05, + "loss": 0.1835, + "step": 4838 + }, + { + "epoch": 0.5015027463985905, + "grad_norm": 0.5400627255439758, + "learning_rate": 2.0875834455068485e-05, + "loss": 0.1998, + "step": 4839 + }, + { + "epoch": 0.5016063840812519, + "grad_norm": 0.5017799139022827, + "learning_rate": 2.0869127324178734e-05, + "loss": 0.1944, + "step": 4840 + }, + { + "epoch": 0.5017100217639133, + "grad_norm": 0.5077838897705078, + "learning_rate": 2.086242009535707e-05, + "loss": 0.189, + "step": 4841 + }, + { + "epoch": 0.5018136594465747, + "grad_norm": 0.5228158831596375, + "learning_rate": 2.0855712769359257e-05, + "loss": 0.2039, + "step": 4842 + }, + { + "epoch": 0.5019172971292362, + "grad_norm": 0.5608428716659546, + "learning_rate": 2.0849005346941053e-05, + "loss": 0.2434, + "step": 4843 + }, + { + "epoch": 0.5020209348118976, + "grad_norm": 0.4849930703639984, + "learning_rate": 2.084229782885825e-05, + "loss": 0.1997, + "step": 4844 + }, + { + "epoch": 0.502124572494559, + "grad_norm": 0.7088670134544373, + "learning_rate": 2.0835590215866638e-05, + "loss": 0.2212, + "step": 4845 + }, + { + "epoch": 0.5022282101772204, + "grad_norm": 0.4910408854484558, + "learning_rate": 2.0828882508722012e-05, + "loss": 0.2057, + "step": 4846 + }, + { + "epoch": 0.5023318478598818, + "grad_norm": 0.4826197028160095, + "learning_rate": 2.08221747081802e-05, + "loss": 0.2006, + "step": 4847 + }, + { + "epoch": 0.5024354855425432, + "grad_norm": 0.5076550841331482, + "learning_rate": 2.0815466814997017e-05, + "loss": 0.2186, + "step": 4848 + }, + { + "epoch": 0.5025391232252047, + "grad_norm": 0.473873496055603, + "learning_rate": 2.0808758829928304e-05, + "loss": 0.1932, + "step": 4849 + }, + { + "epoch": 0.5026427609078661, + "grad_norm": 0.5110844969749451, + "learning_rate": 2.08020507537299e-05, + "loss": 0.2079, + "step": 4850 + }, + { + "epoch": 0.5027463985905275, + "grad_norm": 0.4520585536956787, + "learning_rate": 2.0795342587157664e-05, + "loss": 0.1579, + "step": 4851 + }, + { + "epoch": 0.5028500362731889, + "grad_norm": 0.5274686217308044, + "learning_rate": 2.0788634330967464e-05, + "loss": 0.1953, + "step": 4852 + }, + { + "epoch": 0.5029536739558503, + "grad_norm": 0.637174129486084, + "learning_rate": 2.078192598591517e-05, + "loss": 0.2671, + "step": 4853 + }, + { + "epoch": 0.5030573116385118, + "grad_norm": 0.5289623141288757, + "learning_rate": 2.0775217552756673e-05, + "loss": 0.1988, + "step": 4854 + }, + { + "epoch": 0.5031609493211732, + "grad_norm": 0.5649394392967224, + "learning_rate": 2.076850903224787e-05, + "loss": 0.2036, + "step": 4855 + }, + { + "epoch": 0.5032645870038346, + "grad_norm": 0.5184451937675476, + "learning_rate": 2.076180042514466e-05, + "loss": 0.2003, + "step": 4856 + }, + { + "epoch": 0.503368224686496, + "grad_norm": 0.6143597364425659, + "learning_rate": 2.0755091732202963e-05, + "loss": 0.2641, + "step": 4857 + }, + { + "epoch": 0.5034718623691574, + "grad_norm": 0.5865727066993713, + "learning_rate": 2.074838295417871e-05, + "loss": 0.2279, + "step": 4858 + }, + { + "epoch": 0.5035755000518188, + "grad_norm": 0.5569704174995422, + "learning_rate": 2.074167409182782e-05, + "loss": 0.2323, + "step": 4859 + }, + { + "epoch": 0.5036791377344803, + "grad_norm": 0.5546584129333496, + "learning_rate": 2.0734965145906248e-05, + "loss": 0.2227, + "step": 4860 + }, + { + "epoch": 0.5037827754171417, + "grad_norm": 0.5088528990745544, + "learning_rate": 2.0728256117169948e-05, + "loss": 0.2144, + "step": 4861 + }, + { + "epoch": 0.5038864130998031, + "grad_norm": 0.4553952217102051, + "learning_rate": 2.0721547006374882e-05, + "loss": 0.1935, + "step": 4862 + }, + { + "epoch": 0.5039900507824645, + "grad_norm": 0.5757675766944885, + "learning_rate": 2.071483781427702e-05, + "loss": 0.2233, + "step": 4863 + }, + { + "epoch": 0.5040936884651259, + "grad_norm": 0.4998389482498169, + "learning_rate": 2.0708128541632347e-05, + "loss": 0.1866, + "step": 4864 + }, + { + "epoch": 0.5041973261477873, + "grad_norm": 0.5182422995567322, + "learning_rate": 2.070141918919685e-05, + "loss": 0.1961, + "step": 4865 + }, + { + "epoch": 0.5043009638304488, + "grad_norm": 0.5532435774803162, + "learning_rate": 2.069470975772652e-05, + "loss": 0.2189, + "step": 4866 + }, + { + "epoch": 0.5044046015131102, + "grad_norm": 0.5550752282142639, + "learning_rate": 2.0688000247977385e-05, + "loss": 0.2251, + "step": 4867 + }, + { + "epoch": 0.5045082391957716, + "grad_norm": 0.6300921440124512, + "learning_rate": 2.0681290660705446e-05, + "loss": 0.2397, + "step": 4868 + }, + { + "epoch": 0.504611876878433, + "grad_norm": 0.535825788974762, + "learning_rate": 2.0674580996666734e-05, + "loss": 0.2173, + "step": 4869 + }, + { + "epoch": 0.5047155145610944, + "grad_norm": 0.561592698097229, + "learning_rate": 2.0667871256617288e-05, + "loss": 0.2028, + "step": 4870 + }, + { + "epoch": 0.5048191522437558, + "grad_norm": 0.4686198830604553, + "learning_rate": 2.0661161441313145e-05, + "loss": 0.1866, + "step": 4871 + }, + { + "epoch": 0.5049227899264173, + "grad_norm": 0.6049419045448303, + "learning_rate": 2.0654451551510358e-05, + "loss": 0.2358, + "step": 4872 + }, + { + "epoch": 0.5050264276090787, + "grad_norm": 0.5065613389015198, + "learning_rate": 2.064774158796499e-05, + "loss": 0.2327, + "step": 4873 + }, + { + "epoch": 0.5051300652917401, + "grad_norm": 0.5832587480545044, + "learning_rate": 2.064103155143311e-05, + "loss": 0.198, + "step": 4874 + }, + { + "epoch": 0.5052337029744015, + "grad_norm": 0.6004157662391663, + "learning_rate": 2.0634321442670786e-05, + "loss": 0.2501, + "step": 4875 + }, + { + "epoch": 0.5053373406570629, + "grad_norm": 0.5657427310943604, + "learning_rate": 2.0627611262434103e-05, + "loss": 0.1991, + "step": 4876 + }, + { + "epoch": 0.5054409783397243, + "grad_norm": 0.45743632316589355, + "learning_rate": 2.0620901011479167e-05, + "loss": 0.1834, + "step": 4877 + }, + { + "epoch": 0.5055446160223858, + "grad_norm": 0.58927983045578, + "learning_rate": 2.0614190690562065e-05, + "loss": 0.2683, + "step": 4878 + }, + { + "epoch": 0.5056482537050472, + "grad_norm": 0.5818038582801819, + "learning_rate": 2.0607480300438914e-05, + "loss": 0.2057, + "step": 4879 + }, + { + "epoch": 0.5057518913877086, + "grad_norm": 0.5848224759101868, + "learning_rate": 2.0600769841865832e-05, + "loss": 0.2057, + "step": 4880 + }, + { + "epoch": 0.50585552907037, + "grad_norm": 0.5434708595275879, + "learning_rate": 2.0594059315598924e-05, + "loss": 0.2042, + "step": 4881 + }, + { + "epoch": 0.5059591667530314, + "grad_norm": 0.6042613387107849, + "learning_rate": 2.0587348722394346e-05, + "loss": 0.2285, + "step": 4882 + }, + { + "epoch": 0.5060628044356928, + "grad_norm": 0.5077903866767883, + "learning_rate": 2.0580638063008228e-05, + "loss": 0.1994, + "step": 4883 + }, + { + "epoch": 0.5061664421183543, + "grad_norm": 0.5937612652778625, + "learning_rate": 2.0573927338196712e-05, + "loss": 0.2173, + "step": 4884 + }, + { + "epoch": 0.5062700798010157, + "grad_norm": 0.6044044494628906, + "learning_rate": 2.056721654871596e-05, + "loss": 0.2195, + "step": 4885 + }, + { + "epoch": 0.5063737174836771, + "grad_norm": 0.5159188508987427, + "learning_rate": 2.056050569532212e-05, + "loss": 0.2136, + "step": 4886 + }, + { + "epoch": 0.5064773551663385, + "grad_norm": 0.5349753499031067, + "learning_rate": 2.0553794778771375e-05, + "loss": 0.206, + "step": 4887 + }, + { + "epoch": 0.5065809928489999, + "grad_norm": 0.650302529335022, + "learning_rate": 2.05470837998199e-05, + "loss": 0.2485, + "step": 4888 + }, + { + "epoch": 0.5066846305316614, + "grad_norm": 0.5740201473236084, + "learning_rate": 2.0540372759223865e-05, + "loss": 0.2163, + "step": 4889 + }, + { + "epoch": 0.5067882682143228, + "grad_norm": 0.49569568037986755, + "learning_rate": 2.053366165773947e-05, + "loss": 0.1884, + "step": 4890 + }, + { + "epoch": 0.5068919058969842, + "grad_norm": 0.602543830871582, + "learning_rate": 2.052695049612291e-05, + "loss": 0.2305, + "step": 4891 + }, + { + "epoch": 0.5069955435796456, + "grad_norm": 0.6395046710968018, + "learning_rate": 2.0520239275130386e-05, + "loss": 0.236, + "step": 4892 + }, + { + "epoch": 0.507099181262307, + "grad_norm": 0.5917165875434875, + "learning_rate": 2.051352799551811e-05, + "loss": 0.2195, + "step": 4893 + }, + { + "epoch": 0.5072028189449684, + "grad_norm": 0.5440151691436768, + "learning_rate": 2.05068166580423e-05, + "loss": 0.2175, + "step": 4894 + }, + { + "epoch": 0.5073064566276299, + "grad_norm": 0.5186762809753418, + "learning_rate": 2.0500105263459177e-05, + "loss": 0.2083, + "step": 4895 + }, + { + "epoch": 0.5074100943102913, + "grad_norm": 0.47229626774787903, + "learning_rate": 2.0493393812524967e-05, + "loss": 0.1562, + "step": 4896 + }, + { + "epoch": 0.5075137319929527, + "grad_norm": 0.5724924802780151, + "learning_rate": 2.048668230599591e-05, + "loss": 0.2029, + "step": 4897 + }, + { + "epoch": 0.5076173696756141, + "grad_norm": 0.48794323205947876, + "learning_rate": 2.0479970744628245e-05, + "loss": 0.2231, + "step": 4898 + }, + { + "epoch": 0.5077210073582755, + "grad_norm": 0.5196564793586731, + "learning_rate": 2.047325912917823e-05, + "loss": 0.1771, + "step": 4899 + }, + { + "epoch": 0.5078246450409369, + "grad_norm": 0.6032145023345947, + "learning_rate": 2.0466547460402105e-05, + "loss": 0.2086, + "step": 4900 + }, + { + "epoch": 0.5079282827235982, + "grad_norm": 0.5414867401123047, + "learning_rate": 2.0459835739056134e-05, + "loss": 0.2299, + "step": 4901 + }, + { + "epoch": 0.5080319204062597, + "grad_norm": 0.602116584777832, + "learning_rate": 2.0453123965896598e-05, + "loss": 0.2339, + "step": 4902 + }, + { + "epoch": 0.5081355580889211, + "grad_norm": 0.5289559364318848, + "learning_rate": 2.0446412141679756e-05, + "loss": 0.2001, + "step": 4903 + }, + { + "epoch": 0.5082391957715825, + "grad_norm": 0.506097137928009, + "learning_rate": 2.043970026716188e-05, + "loss": 0.1866, + "step": 4904 + }, + { + "epoch": 0.5083428334542439, + "grad_norm": 0.4521408677101135, + "learning_rate": 2.0432988343099267e-05, + "loss": 0.1532, + "step": 4905 + }, + { + "epoch": 0.5084464711369053, + "grad_norm": 0.5555333495140076, + "learning_rate": 2.0426276370248197e-05, + "loss": 0.2099, + "step": 4906 + }, + { + "epoch": 0.5085501088195667, + "grad_norm": 0.5497279763221741, + "learning_rate": 2.041956434936497e-05, + "loss": 0.2062, + "step": 4907 + }, + { + "epoch": 0.5086537465022282, + "grad_norm": 0.5221918225288391, + "learning_rate": 2.041285228120589e-05, + "loss": 0.1963, + "step": 4908 + }, + { + "epoch": 0.5087573841848896, + "grad_norm": 0.5457268953323364, + "learning_rate": 2.0406140166527247e-05, + "loss": 0.189, + "step": 4909 + }, + { + "epoch": 0.508861021867551, + "grad_norm": 0.5438562631607056, + "learning_rate": 2.039942800608537e-05, + "loss": 0.196, + "step": 4910 + }, + { + "epoch": 0.5089646595502124, + "grad_norm": 0.5521365404129028, + "learning_rate": 2.039271580063656e-05, + "loss": 0.2085, + "step": 4911 + }, + { + "epoch": 0.5090682972328738, + "grad_norm": 0.6124582290649414, + "learning_rate": 2.0386003550937147e-05, + "loss": 0.2398, + "step": 4912 + }, + { + "epoch": 0.5091719349155353, + "grad_norm": 0.49621763825416565, + "learning_rate": 2.037929125774345e-05, + "loss": 0.1739, + "step": 4913 + }, + { + "epoch": 0.5092755725981967, + "grad_norm": 0.48987457156181335, + "learning_rate": 2.037257892181181e-05, + "loss": 0.1832, + "step": 4914 + }, + { + "epoch": 0.5093792102808581, + "grad_norm": 0.5281900763511658, + "learning_rate": 2.0365866543898556e-05, + "loss": 0.1738, + "step": 4915 + }, + { + "epoch": 0.5094828479635195, + "grad_norm": 0.4487219452857971, + "learning_rate": 2.0359154124760022e-05, + "loss": 0.1562, + "step": 4916 + }, + { + "epoch": 0.5095864856461809, + "grad_norm": 0.5459572076797485, + "learning_rate": 2.035244166515256e-05, + "loss": 0.221, + "step": 4917 + }, + { + "epoch": 0.5096901233288423, + "grad_norm": 0.590134859085083, + "learning_rate": 2.0345729165832527e-05, + "loss": 0.2269, + "step": 4918 + }, + { + "epoch": 0.5097937610115038, + "grad_norm": 0.5557520389556885, + "learning_rate": 2.033901662755626e-05, + "loss": 0.1803, + "step": 4919 + }, + { + "epoch": 0.5098973986941652, + "grad_norm": 0.5433508157730103, + "learning_rate": 2.0332304051080135e-05, + "loss": 0.2154, + "step": 4920 + }, + { + "epoch": 0.5100010363768266, + "grad_norm": 0.6076089143753052, + "learning_rate": 2.03255914371605e-05, + "loss": 0.2399, + "step": 4921 + }, + { + "epoch": 0.510104674059488, + "grad_norm": 0.6198269128799438, + "learning_rate": 2.0318878786553727e-05, + "loss": 0.2298, + "step": 4922 + }, + { + "epoch": 0.5102083117421494, + "grad_norm": 0.5292469263076782, + "learning_rate": 2.0312166100016192e-05, + "loss": 0.1892, + "step": 4923 + }, + { + "epoch": 0.5103119494248108, + "grad_norm": 0.6244973540306091, + "learning_rate": 2.030545337830427e-05, + "loss": 0.2101, + "step": 4924 + }, + { + "epoch": 0.5104155871074723, + "grad_norm": 0.5299911499023438, + "learning_rate": 2.0298740622174328e-05, + "loss": 0.2013, + "step": 4925 + }, + { + "epoch": 0.5105192247901337, + "grad_norm": 0.4421478509902954, + "learning_rate": 2.029202783238276e-05, + "loss": 0.1414, + "step": 4926 + }, + { + "epoch": 0.5106228624727951, + "grad_norm": 0.43288445472717285, + "learning_rate": 2.0285315009685952e-05, + "loss": 0.1628, + "step": 4927 + }, + { + "epoch": 0.5107265001554565, + "grad_norm": 0.6298075914382935, + "learning_rate": 2.027860215484029e-05, + "loss": 0.2313, + "step": 4928 + }, + { + "epoch": 0.5108301378381179, + "grad_norm": 0.4587681293487549, + "learning_rate": 2.027188926860217e-05, + "loss": 0.1725, + "step": 4929 + }, + { + "epoch": 0.5109337755207793, + "grad_norm": 0.4807582497596741, + "learning_rate": 2.026517635172799e-05, + "loss": 0.1722, + "step": 4930 + }, + { + "epoch": 0.5110374132034408, + "grad_norm": 0.591253399848938, + "learning_rate": 2.0258463404974155e-05, + "loss": 0.2413, + "step": 4931 + }, + { + "epoch": 0.5111410508861022, + "grad_norm": 0.557607889175415, + "learning_rate": 2.0251750429097057e-05, + "loss": 0.1867, + "step": 4932 + }, + { + "epoch": 0.5112446885687636, + "grad_norm": 0.520668089389801, + "learning_rate": 2.0245037424853117e-05, + "loss": 0.2009, + "step": 4933 + }, + { + "epoch": 0.511348326251425, + "grad_norm": 0.5009155869483948, + "learning_rate": 2.0238324392998745e-05, + "loss": 0.2092, + "step": 4934 + }, + { + "epoch": 0.5114519639340864, + "grad_norm": 0.5666335225105286, + "learning_rate": 2.0231611334290343e-05, + "loss": 0.2012, + "step": 4935 + }, + { + "epoch": 0.5115556016167478, + "grad_norm": 0.519422173500061, + "learning_rate": 2.022489824948434e-05, + "loss": 0.1971, + "step": 4936 + }, + { + "epoch": 0.5116592392994093, + "grad_norm": 0.5896724462509155, + "learning_rate": 2.0218185139337155e-05, + "loss": 0.2373, + "step": 4937 + }, + { + "epoch": 0.5117628769820707, + "grad_norm": 0.5998659133911133, + "learning_rate": 2.02114720046052e-05, + "loss": 0.2382, + "step": 4938 + }, + { + "epoch": 0.5118665146647321, + "grad_norm": 0.49710676074028015, + "learning_rate": 2.0204758846044912e-05, + "loss": 0.1944, + "step": 4939 + }, + { + "epoch": 0.5119701523473935, + "grad_norm": 0.5756406784057617, + "learning_rate": 2.0198045664412717e-05, + "loss": 0.2207, + "step": 4940 + }, + { + "epoch": 0.5120737900300549, + "grad_norm": 0.4823175072669983, + "learning_rate": 2.0191332460465042e-05, + "loss": 0.1776, + "step": 4941 + }, + { + "epoch": 0.5121774277127163, + "grad_norm": 0.5221631526947021, + "learning_rate": 2.0184619234958322e-05, + "loss": 0.2037, + "step": 4942 + }, + { + "epoch": 0.5122810653953778, + "grad_norm": 0.5067155361175537, + "learning_rate": 2.0177905988649e-05, + "loss": 0.1981, + "step": 4943 + }, + { + "epoch": 0.5123847030780392, + "grad_norm": 0.6370543837547302, + "learning_rate": 2.0171192722293504e-05, + "loss": 0.2487, + "step": 4944 + }, + { + "epoch": 0.5124883407607006, + "grad_norm": 0.5774009823799133, + "learning_rate": 2.0164479436648272e-05, + "loss": 0.2187, + "step": 4945 + }, + { + "epoch": 0.512591978443362, + "grad_norm": 0.5051264762878418, + "learning_rate": 2.0157766132469762e-05, + "loss": 0.1969, + "step": 4946 + }, + { + "epoch": 0.5126956161260234, + "grad_norm": 0.5043640732765198, + "learning_rate": 2.0151052810514405e-05, + "loss": 0.176, + "step": 4947 + }, + { + "epoch": 0.5127992538086849, + "grad_norm": 0.581299901008606, + "learning_rate": 2.014433947153865e-05, + "loss": 0.2074, + "step": 4948 + }, + { + "epoch": 0.5129028914913463, + "grad_norm": 0.5512892603874207, + "learning_rate": 2.013762611629895e-05, + "loss": 0.2097, + "step": 4949 + }, + { + "epoch": 0.5130065291740077, + "grad_norm": 0.5397634506225586, + "learning_rate": 2.0130912745551753e-05, + "loss": 0.21, + "step": 4950 + }, + { + "epoch": 0.5131101668566691, + "grad_norm": 0.5209311842918396, + "learning_rate": 2.0124199360053516e-05, + "loss": 0.2005, + "step": 4951 + }, + { + "epoch": 0.5132138045393305, + "grad_norm": 0.5762450098991394, + "learning_rate": 2.0117485960560682e-05, + "loss": 0.2271, + "step": 4952 + }, + { + "epoch": 0.5133174422219919, + "grad_norm": 0.503059446811676, + "learning_rate": 2.011077254782972e-05, + "loss": 0.1871, + "step": 4953 + }, + { + "epoch": 0.5134210799046534, + "grad_norm": 0.42786842584609985, + "learning_rate": 2.0104059122617073e-05, + "loss": 0.1904, + "step": 4954 + }, + { + "epoch": 0.5135247175873148, + "grad_norm": 0.4788164496421814, + "learning_rate": 2.009734568567921e-05, + "loss": 0.192, + "step": 4955 + }, + { + "epoch": 0.5136283552699762, + "grad_norm": 0.4950418174266815, + "learning_rate": 2.009063223777258e-05, + "loss": 0.2189, + "step": 4956 + }, + { + "epoch": 0.5137319929526376, + "grad_norm": 0.5276430249214172, + "learning_rate": 2.0083918779653653e-05, + "loss": 0.2255, + "step": 4957 + }, + { + "epoch": 0.513835630635299, + "grad_norm": 0.4317634403705597, + "learning_rate": 2.007720531207889e-05, + "loss": 0.1522, + "step": 4958 + }, + { + "epoch": 0.5139392683179604, + "grad_norm": 0.5495805740356445, + "learning_rate": 2.0070491835804752e-05, + "loss": 0.2207, + "step": 4959 + }, + { + "epoch": 0.5140429060006219, + "grad_norm": 0.5706672668457031, + "learning_rate": 2.00637783515877e-05, + "loss": 0.2179, + "step": 4960 + }, + { + "epoch": 0.5141465436832833, + "grad_norm": 0.5287724137306213, + "learning_rate": 2.0057064860184207e-05, + "loss": 0.1862, + "step": 4961 + }, + { + "epoch": 0.5142501813659447, + "grad_norm": 0.5140873193740845, + "learning_rate": 2.0050351362350737e-05, + "loss": 0.2104, + "step": 4962 + }, + { + "epoch": 0.5143538190486061, + "grad_norm": 0.5113810896873474, + "learning_rate": 2.0043637858843748e-05, + "loss": 0.208, + "step": 4963 + }, + { + "epoch": 0.5144574567312675, + "grad_norm": 0.5623809099197388, + "learning_rate": 2.0036924350419716e-05, + "loss": 0.2047, + "step": 4964 + }, + { + "epoch": 0.5145610944139289, + "grad_norm": 0.5720166563987732, + "learning_rate": 2.0030210837835105e-05, + "loss": 0.2262, + "step": 4965 + }, + { + "epoch": 0.5146647320965904, + "grad_norm": 0.5529698133468628, + "learning_rate": 2.002349732184638e-05, + "loss": 0.1992, + "step": 4966 + }, + { + "epoch": 0.5147683697792518, + "grad_norm": 0.5903406143188477, + "learning_rate": 2.0016783803210015e-05, + "loss": 0.2305, + "step": 4967 + }, + { + "epoch": 0.5148720074619132, + "grad_norm": 0.5520504117012024, + "learning_rate": 2.001007028268248e-05, + "loss": 0.2056, + "step": 4968 + }, + { + "epoch": 0.5149756451445746, + "grad_norm": 0.5906986594200134, + "learning_rate": 2.000335676102024e-05, + "loss": 0.2187, + "step": 4969 + }, + { + "epoch": 0.515079282827236, + "grad_norm": 0.4421522319316864, + "learning_rate": 1.9996643238979763e-05, + "loss": 0.1622, + "step": 4970 + }, + { + "epoch": 0.5151829205098974, + "grad_norm": 0.5289708375930786, + "learning_rate": 1.9989929717317528e-05, + "loss": 0.2081, + "step": 4971 + }, + { + "epoch": 0.5152865581925589, + "grad_norm": 0.5530785322189331, + "learning_rate": 1.9983216196789988e-05, + "loss": 0.2322, + "step": 4972 + }, + { + "epoch": 0.5153901958752203, + "grad_norm": 0.5963031649589539, + "learning_rate": 1.9976502678153622e-05, + "loss": 0.2172, + "step": 4973 + }, + { + "epoch": 0.5154938335578817, + "grad_norm": 0.5122589468955994, + "learning_rate": 1.9969789162164905e-05, + "loss": 0.2029, + "step": 4974 + }, + { + "epoch": 0.5155974712405431, + "grad_norm": 0.5265605449676514, + "learning_rate": 1.996307564958029e-05, + "loss": 0.1863, + "step": 4975 + }, + { + "epoch": 0.5157011089232045, + "grad_norm": 0.5558157563209534, + "learning_rate": 1.9956362141156262e-05, + "loss": 0.1968, + "step": 4976 + }, + { + "epoch": 0.5158047466058658, + "grad_norm": 0.43986696004867554, + "learning_rate": 1.994964863764927e-05, + "loss": 0.1801, + "step": 4977 + }, + { + "epoch": 0.5159083842885273, + "grad_norm": 0.45596033334732056, + "learning_rate": 1.9942935139815796e-05, + "loss": 0.1739, + "step": 4978 + }, + { + "epoch": 0.5160120219711887, + "grad_norm": 0.621364951133728, + "learning_rate": 1.9936221648412305e-05, + "loss": 0.2441, + "step": 4979 + }, + { + "epoch": 0.5161156596538501, + "grad_norm": 0.49341389536857605, + "learning_rate": 1.992950816419525e-05, + "loss": 0.1733, + "step": 4980 + }, + { + "epoch": 0.5162192973365115, + "grad_norm": 0.4995219111442566, + "learning_rate": 1.9922794687921117e-05, + "loss": 0.1743, + "step": 4981 + }, + { + "epoch": 0.5163229350191729, + "grad_norm": 0.5646112561225891, + "learning_rate": 1.9916081220346353e-05, + "loss": 0.2044, + "step": 4982 + }, + { + "epoch": 0.5164265727018343, + "grad_norm": 0.5320212841033936, + "learning_rate": 1.990936776222742e-05, + "loss": 0.2062, + "step": 4983 + }, + { + "epoch": 0.5165302103844958, + "grad_norm": 0.5387780666351318, + "learning_rate": 1.99026543143208e-05, + "loss": 0.2139, + "step": 4984 + }, + { + "epoch": 0.5166338480671572, + "grad_norm": 0.5501453280448914, + "learning_rate": 1.9895940877382934e-05, + "loss": 0.215, + "step": 4985 + }, + { + "epoch": 0.5167374857498186, + "grad_norm": 0.5094042420387268, + "learning_rate": 1.9889227452170294e-05, + "loss": 0.2023, + "step": 4986 + }, + { + "epoch": 0.51684112343248, + "grad_norm": 0.5256874561309814, + "learning_rate": 1.9882514039439324e-05, + "loss": 0.2131, + "step": 4987 + }, + { + "epoch": 0.5169447611151414, + "grad_norm": 0.5167512893676758, + "learning_rate": 1.9875800639946487e-05, + "loss": 0.2212, + "step": 4988 + }, + { + "epoch": 0.5170483987978028, + "grad_norm": 0.4585399925708771, + "learning_rate": 1.986908725444825e-05, + "loss": 0.1731, + "step": 4989 + }, + { + "epoch": 0.5171520364804643, + "grad_norm": 0.5621468424797058, + "learning_rate": 1.9862373883701055e-05, + "loss": 0.218, + "step": 4990 + }, + { + "epoch": 0.5172556741631257, + "grad_norm": 0.5321661233901978, + "learning_rate": 1.9855660528461356e-05, + "loss": 0.1796, + "step": 4991 + }, + { + "epoch": 0.5173593118457871, + "grad_norm": 0.4230225682258606, + "learning_rate": 1.98489471894856e-05, + "loss": 0.1497, + "step": 4992 + }, + { + "epoch": 0.5174629495284485, + "grad_norm": 0.4817984402179718, + "learning_rate": 1.984223386753024e-05, + "loss": 0.1789, + "step": 4993 + }, + { + "epoch": 0.5175665872111099, + "grad_norm": 0.47356268763542175, + "learning_rate": 1.9835520563351735e-05, + "loss": 0.1957, + "step": 4994 + }, + { + "epoch": 0.5176702248937713, + "grad_norm": 0.5884582996368408, + "learning_rate": 1.9828807277706502e-05, + "loss": 0.2165, + "step": 4995 + }, + { + "epoch": 0.5177738625764328, + "grad_norm": 0.545630156993866, + "learning_rate": 1.982209401135101e-05, + "loss": 0.1977, + "step": 4996 + }, + { + "epoch": 0.5178775002590942, + "grad_norm": 0.5935811996459961, + "learning_rate": 1.981538076504168e-05, + "loss": 0.2542, + "step": 4997 + }, + { + "epoch": 0.5179811379417556, + "grad_norm": 0.49267682433128357, + "learning_rate": 1.980866753953496e-05, + "loss": 0.2011, + "step": 4998 + }, + { + "epoch": 0.518084775624417, + "grad_norm": 0.5562503337860107, + "learning_rate": 1.980195433558729e-05, + "loss": 0.2326, + "step": 4999 + }, + { + "epoch": 0.5181884133070784, + "grad_norm": 0.5674566626548767, + "learning_rate": 1.9795241153955094e-05, + "loss": 0.215, + "step": 5000 + }, + { + "epoch": 0.5182920509897398, + "grad_norm": 0.5795115232467651, + "learning_rate": 1.978852799539481e-05, + "loss": 0.226, + "step": 5001 + }, + { + "epoch": 0.5183956886724013, + "grad_norm": 0.44766589999198914, + "learning_rate": 1.9781814860662855e-05, + "loss": 0.1581, + "step": 5002 + }, + { + "epoch": 0.5184993263550627, + "grad_norm": 0.4725109338760376, + "learning_rate": 1.9775101750515663e-05, + "loss": 0.1696, + "step": 5003 + }, + { + "epoch": 0.5186029640377241, + "grad_norm": 0.5291849374771118, + "learning_rate": 1.9768388665709664e-05, + "loss": 0.1988, + "step": 5004 + }, + { + "epoch": 0.5187066017203855, + "grad_norm": 0.5688635110855103, + "learning_rate": 1.976167560700126e-05, + "loss": 0.2089, + "step": 5005 + }, + { + "epoch": 0.5188102394030469, + "grad_norm": 0.4801862835884094, + "learning_rate": 1.9754962575146887e-05, + "loss": 0.1639, + "step": 5006 + }, + { + "epoch": 0.5189138770857084, + "grad_norm": 0.45741963386535645, + "learning_rate": 1.974824957090295e-05, + "loss": 0.1668, + "step": 5007 + }, + { + "epoch": 0.5190175147683698, + "grad_norm": 0.5641513466835022, + "learning_rate": 1.974153659502585e-05, + "loss": 0.2191, + "step": 5008 + }, + { + "epoch": 0.5191211524510312, + "grad_norm": 0.5678064823150635, + "learning_rate": 1.9734823648272013e-05, + "loss": 0.2029, + "step": 5009 + }, + { + "epoch": 0.5192247901336926, + "grad_norm": 0.5019567608833313, + "learning_rate": 1.9728110731397835e-05, + "loss": 0.2027, + "step": 5010 + }, + { + "epoch": 0.519328427816354, + "grad_norm": 0.536777675151825, + "learning_rate": 1.972139784515972e-05, + "loss": 0.209, + "step": 5011 + }, + { + "epoch": 0.5194320654990154, + "grad_norm": 0.5042787194252014, + "learning_rate": 1.971468499031405e-05, + "loss": 0.1956, + "step": 5012 + }, + { + "epoch": 0.5195357031816769, + "grad_norm": 0.5100334286689758, + "learning_rate": 1.970797216761724e-05, + "loss": 0.2024, + "step": 5013 + }, + { + "epoch": 0.5196393408643383, + "grad_norm": 0.5792742967605591, + "learning_rate": 1.970125937782568e-05, + "loss": 0.2186, + "step": 5014 + }, + { + "epoch": 0.5197429785469997, + "grad_norm": 0.5526664853096008, + "learning_rate": 1.9694546621695737e-05, + "loss": 0.2045, + "step": 5015 + }, + { + "epoch": 0.5198466162296611, + "grad_norm": 0.48117056488990784, + "learning_rate": 1.9687833899983818e-05, + "loss": 0.1734, + "step": 5016 + }, + { + "epoch": 0.5199502539123225, + "grad_norm": 0.5357035994529724, + "learning_rate": 1.9681121213446276e-05, + "loss": 0.1905, + "step": 5017 + }, + { + "epoch": 0.5200538915949839, + "grad_norm": 0.636917769908905, + "learning_rate": 1.96744085628395e-05, + "loss": 0.2659, + "step": 5018 + }, + { + "epoch": 0.5201575292776454, + "grad_norm": 0.5372515916824341, + "learning_rate": 1.9667695948919875e-05, + "loss": 0.2142, + "step": 5019 + }, + { + "epoch": 0.5202611669603068, + "grad_norm": 0.5247703194618225, + "learning_rate": 1.966098337244374e-05, + "loss": 0.1983, + "step": 5020 + }, + { + "epoch": 0.5203648046429682, + "grad_norm": 0.663495659828186, + "learning_rate": 1.9654270834167483e-05, + "loss": 0.2582, + "step": 5021 + }, + { + "epoch": 0.5204684423256296, + "grad_norm": 0.5289007425308228, + "learning_rate": 1.9647558334847445e-05, + "loss": 0.192, + "step": 5022 + }, + { + "epoch": 0.520572080008291, + "grad_norm": 0.4809255003929138, + "learning_rate": 1.964084587523998e-05, + "loss": 0.1857, + "step": 5023 + }, + { + "epoch": 0.5206757176909524, + "grad_norm": 0.5267800688743591, + "learning_rate": 1.9634133456101458e-05, + "loss": 0.2173, + "step": 5024 + }, + { + "epoch": 0.5207793553736139, + "grad_norm": 0.5737069249153137, + "learning_rate": 1.9627421078188197e-05, + "loss": 0.2565, + "step": 5025 + }, + { + "epoch": 0.5208829930562753, + "grad_norm": 0.6062821745872498, + "learning_rate": 1.962070874225656e-05, + "loss": 0.2093, + "step": 5026 + }, + { + "epoch": 0.5209866307389367, + "grad_norm": 0.5270495414733887, + "learning_rate": 1.961399644906286e-05, + "loss": 0.2017, + "step": 5027 + }, + { + "epoch": 0.5210902684215981, + "grad_norm": 0.5432182550430298, + "learning_rate": 1.9607284199363445e-05, + "loss": 0.2218, + "step": 5028 + }, + { + "epoch": 0.5211939061042595, + "grad_norm": 0.7014081478118896, + "learning_rate": 1.960057199391464e-05, + "loss": 0.2633, + "step": 5029 + }, + { + "epoch": 0.521297543786921, + "grad_norm": 0.5988870859146118, + "learning_rate": 1.9593859833472756e-05, + "loss": 0.1928, + "step": 5030 + }, + { + "epoch": 0.5214011814695824, + "grad_norm": 0.5661015510559082, + "learning_rate": 1.958714771879412e-05, + "loss": 0.2194, + "step": 5031 + }, + { + "epoch": 0.5215048191522438, + "grad_norm": 0.5360023975372314, + "learning_rate": 1.958043565063504e-05, + "loss": 0.2002, + "step": 5032 + }, + { + "epoch": 0.5216084568349052, + "grad_norm": 0.5409901738166809, + "learning_rate": 1.9573723629751803e-05, + "loss": 0.2131, + "step": 5033 + }, + { + "epoch": 0.5217120945175666, + "grad_norm": 0.5607601404190063, + "learning_rate": 1.956701165690074e-05, + "loss": 0.2015, + "step": 5034 + }, + { + "epoch": 0.521815732200228, + "grad_norm": 0.529923677444458, + "learning_rate": 1.9560299732838127e-05, + "loss": 0.1902, + "step": 5035 + }, + { + "epoch": 0.5219193698828895, + "grad_norm": 0.6019048094749451, + "learning_rate": 1.9553587858320257e-05, + "loss": 0.2162, + "step": 5036 + }, + { + "epoch": 0.5220230075655509, + "grad_norm": 0.5528945922851562, + "learning_rate": 1.954687603410341e-05, + "loss": 0.243, + "step": 5037 + }, + { + "epoch": 0.5221266452482123, + "grad_norm": 0.5756701827049255, + "learning_rate": 1.9540164260943866e-05, + "loss": 0.1943, + "step": 5038 + }, + { + "epoch": 0.5222302829308737, + "grad_norm": 0.5087042450904846, + "learning_rate": 1.9533452539597905e-05, + "loss": 0.2166, + "step": 5039 + }, + { + "epoch": 0.5223339206135351, + "grad_norm": 0.5744715929031372, + "learning_rate": 1.9526740870821776e-05, + "loss": 0.2086, + "step": 5040 + }, + { + "epoch": 0.5224375582961965, + "grad_norm": 0.5722135305404663, + "learning_rate": 1.952002925537176e-05, + "loss": 0.2113, + "step": 5041 + }, + { + "epoch": 0.522541195978858, + "grad_norm": 0.5168009400367737, + "learning_rate": 1.9513317694004097e-05, + "loss": 0.1892, + "step": 5042 + }, + { + "epoch": 0.5226448336615194, + "grad_norm": 0.5232700109481812, + "learning_rate": 1.9506606187475036e-05, + "loss": 0.1743, + "step": 5043 + }, + { + "epoch": 0.5227484713441808, + "grad_norm": 0.6535675525665283, + "learning_rate": 1.9499894736540833e-05, + "loss": 0.2438, + "step": 5044 + }, + { + "epoch": 0.5228521090268422, + "grad_norm": 0.6077508926391602, + "learning_rate": 1.9493183341957706e-05, + "loss": 0.1878, + "step": 5045 + }, + { + "epoch": 0.5229557467095036, + "grad_norm": 0.5761801600456238, + "learning_rate": 1.94864720044819e-05, + "loss": 0.1892, + "step": 5046 + }, + { + "epoch": 0.523059384392165, + "grad_norm": 0.5046095848083496, + "learning_rate": 1.9479760724869617e-05, + "loss": 0.1862, + "step": 5047 + }, + { + "epoch": 0.5231630220748265, + "grad_norm": 0.514988362789154, + "learning_rate": 1.9473049503877094e-05, + "loss": 0.1951, + "step": 5048 + }, + { + "epoch": 0.5232666597574879, + "grad_norm": 0.5308780074119568, + "learning_rate": 1.946633834226054e-05, + "loss": 0.2073, + "step": 5049 + }, + { + "epoch": 0.5233702974401493, + "grad_norm": 0.5171186327934265, + "learning_rate": 1.9459627240776142e-05, + "loss": 0.1954, + "step": 5050 + }, + { + "epoch": 0.5234739351228107, + "grad_norm": 0.5829634070396423, + "learning_rate": 1.9452916200180115e-05, + "loss": 0.209, + "step": 5051 + }, + { + "epoch": 0.5235775728054721, + "grad_norm": 0.585152804851532, + "learning_rate": 1.9446205221228628e-05, + "loss": 0.2127, + "step": 5052 + }, + { + "epoch": 0.5236812104881334, + "grad_norm": 0.499165415763855, + "learning_rate": 1.9439494304677883e-05, + "loss": 0.1915, + "step": 5053 + }, + { + "epoch": 0.5237848481707948, + "grad_norm": 0.5298867225646973, + "learning_rate": 1.9432783451284055e-05, + "loss": 0.1859, + "step": 5054 + }, + { + "epoch": 0.5238884858534563, + "grad_norm": 0.4951799213886261, + "learning_rate": 1.942607266180329e-05, + "loss": 0.1751, + "step": 5055 + }, + { + "epoch": 0.5239921235361177, + "grad_norm": 0.6185844540596008, + "learning_rate": 1.9419361936991782e-05, + "loss": 0.2251, + "step": 5056 + }, + { + "epoch": 0.5240957612187791, + "grad_norm": 0.5453723669052124, + "learning_rate": 1.941265127760566e-05, + "loss": 0.2042, + "step": 5057 + }, + { + "epoch": 0.5241993989014405, + "grad_norm": 0.5065323710441589, + "learning_rate": 1.9405940684401076e-05, + "loss": 0.1834, + "step": 5058 + }, + { + "epoch": 0.5243030365841019, + "grad_norm": 0.6081438660621643, + "learning_rate": 1.9399230158134178e-05, + "loss": 0.2117, + "step": 5059 + }, + { + "epoch": 0.5244066742667634, + "grad_norm": 0.547488272190094, + "learning_rate": 1.9392519699561092e-05, + "loss": 0.2205, + "step": 5060 + }, + { + "epoch": 0.5245103119494248, + "grad_norm": 0.4661538600921631, + "learning_rate": 1.9385809309437945e-05, + "loss": 0.1676, + "step": 5061 + }, + { + "epoch": 0.5246139496320862, + "grad_norm": 0.5201432108879089, + "learning_rate": 1.937909898852084e-05, + "loss": 0.1842, + "step": 5062 + }, + { + "epoch": 0.5247175873147476, + "grad_norm": 0.5604529976844788, + "learning_rate": 1.93723887375659e-05, + "loss": 0.1932, + "step": 5063 + }, + { + "epoch": 0.524821224997409, + "grad_norm": 0.5652872920036316, + "learning_rate": 1.9365678557329227e-05, + "loss": 0.2091, + "step": 5064 + }, + { + "epoch": 0.5249248626800704, + "grad_norm": 0.5462526082992554, + "learning_rate": 1.93589684485669e-05, + "loss": 0.193, + "step": 5065 + }, + { + "epoch": 0.5250285003627319, + "grad_norm": 0.576276421546936, + "learning_rate": 1.9352258412035016e-05, + "loss": 0.2424, + "step": 5066 + }, + { + "epoch": 0.5251321380453933, + "grad_norm": 0.5367346405982971, + "learning_rate": 1.9345548448489645e-05, + "loss": 0.1786, + "step": 5067 + }, + { + "epoch": 0.5252357757280547, + "grad_norm": 0.6185262203216553, + "learning_rate": 1.9338838558686858e-05, + "loss": 0.2227, + "step": 5068 + }, + { + "epoch": 0.5253394134107161, + "grad_norm": 0.5650673508644104, + "learning_rate": 1.9332128743382715e-05, + "loss": 0.2209, + "step": 5069 + }, + { + "epoch": 0.5254430510933775, + "grad_norm": 0.5670083165168762, + "learning_rate": 1.932541900333327e-05, + "loss": 0.2333, + "step": 5070 + }, + { + "epoch": 0.5255466887760389, + "grad_norm": 0.5780977010726929, + "learning_rate": 1.9318709339294554e-05, + "loss": 0.2025, + "step": 5071 + }, + { + "epoch": 0.5256503264587004, + "grad_norm": 0.6275104284286499, + "learning_rate": 1.931199975202262e-05, + "loss": 0.2034, + "step": 5072 + }, + { + "epoch": 0.5257539641413618, + "grad_norm": 0.5302552580833435, + "learning_rate": 1.9305290242273482e-05, + "loss": 0.1911, + "step": 5073 + }, + { + "epoch": 0.5258576018240232, + "grad_norm": 0.6476036310195923, + "learning_rate": 1.9298580810803163e-05, + "loss": 0.2388, + "step": 5074 + }, + { + "epoch": 0.5259612395066846, + "grad_norm": 0.551716685295105, + "learning_rate": 1.929187145836766e-05, + "loss": 0.2417, + "step": 5075 + }, + { + "epoch": 0.526064877189346, + "grad_norm": 0.5433430671691895, + "learning_rate": 1.928516218572298e-05, + "loss": 0.2182, + "step": 5076 + }, + { + "epoch": 0.5261685148720074, + "grad_norm": 0.5775318145751953, + "learning_rate": 1.927845299362512e-05, + "loss": 0.2168, + "step": 5077 + }, + { + "epoch": 0.5262721525546689, + "grad_norm": 0.48974794149398804, + "learning_rate": 1.9271743882830052e-05, + "loss": 0.1999, + "step": 5078 + }, + { + "epoch": 0.5263757902373303, + "grad_norm": 0.6543878316879272, + "learning_rate": 1.926503485409376e-05, + "loss": 0.2193, + "step": 5079 + }, + { + "epoch": 0.5264794279199917, + "grad_norm": 0.5222949385643005, + "learning_rate": 1.9258325908172185e-05, + "loss": 0.2047, + "step": 5080 + }, + { + "epoch": 0.5265830656026531, + "grad_norm": 0.5059442520141602, + "learning_rate": 1.9251617045821295e-05, + "loss": 0.2218, + "step": 5081 + }, + { + "epoch": 0.5266867032853145, + "grad_norm": 0.5379745364189148, + "learning_rate": 1.9244908267797043e-05, + "loss": 0.2186, + "step": 5082 + }, + { + "epoch": 0.526790340967976, + "grad_norm": 0.5693245530128479, + "learning_rate": 1.9238199574855344e-05, + "loss": 0.2163, + "step": 5083 + }, + { + "epoch": 0.5268939786506374, + "grad_norm": 0.44062313437461853, + "learning_rate": 1.9231490967752137e-05, + "loss": 0.1705, + "step": 5084 + }, + { + "epoch": 0.5269976163332988, + "grad_norm": 0.5416108965873718, + "learning_rate": 1.9224782447243334e-05, + "loss": 0.2132, + "step": 5085 + }, + { + "epoch": 0.5271012540159602, + "grad_norm": 0.5123291015625, + "learning_rate": 1.921807401408483e-05, + "loss": 0.2008, + "step": 5086 + }, + { + "epoch": 0.5272048916986216, + "grad_norm": 0.5766613483428955, + "learning_rate": 1.9211365669032546e-05, + "loss": 0.2038, + "step": 5087 + }, + { + "epoch": 0.527308529381283, + "grad_norm": 0.5707951188087463, + "learning_rate": 1.9204657412842342e-05, + "loss": 0.2128, + "step": 5088 + }, + { + "epoch": 0.5274121670639444, + "grad_norm": 0.5377147197723389, + "learning_rate": 1.9197949246270112e-05, + "loss": 0.1951, + "step": 5089 + }, + { + "epoch": 0.5275158047466059, + "grad_norm": 0.5024455189704895, + "learning_rate": 1.9191241170071702e-05, + "loss": 0.1794, + "step": 5090 + }, + { + "epoch": 0.5276194424292673, + "grad_norm": 0.5326787233352661, + "learning_rate": 1.9184533185002986e-05, + "loss": 0.1962, + "step": 5091 + }, + { + "epoch": 0.5277230801119287, + "grad_norm": 0.5894343852996826, + "learning_rate": 1.917782529181981e-05, + "loss": 0.2277, + "step": 5092 + }, + { + "epoch": 0.5278267177945901, + "grad_norm": 0.5485954284667969, + "learning_rate": 1.917111749127799e-05, + "loss": 0.1965, + "step": 5093 + }, + { + "epoch": 0.5279303554772515, + "grad_norm": 0.534939706325531, + "learning_rate": 1.9164409784133372e-05, + "loss": 0.211, + "step": 5094 + }, + { + "epoch": 0.528033993159913, + "grad_norm": 0.5456961989402771, + "learning_rate": 1.9157702171141757e-05, + "loss": 0.1976, + "step": 5095 + }, + { + "epoch": 0.5281376308425744, + "grad_norm": 0.5366075038909912, + "learning_rate": 1.9150994653058947e-05, + "loss": 0.2125, + "step": 5096 + }, + { + "epoch": 0.5282412685252358, + "grad_norm": 0.5909968018531799, + "learning_rate": 1.914428723064075e-05, + "loss": 0.2123, + "step": 5097 + }, + { + "epoch": 0.5283449062078972, + "grad_norm": 0.64056795835495, + "learning_rate": 1.913757990464293e-05, + "loss": 0.2349, + "step": 5098 + }, + { + "epoch": 0.5284485438905586, + "grad_norm": 0.5149229168891907, + "learning_rate": 1.9130872675821273e-05, + "loss": 0.1872, + "step": 5099 + }, + { + "epoch": 0.52855218157322, + "grad_norm": 0.5525230765342712, + "learning_rate": 1.912416554493152e-05, + "loss": 0.2375, + "step": 5100 + }, + { + "epoch": 0.5286558192558815, + "grad_norm": 0.5269282460212708, + "learning_rate": 1.9117458512729443e-05, + "loss": 0.1978, + "step": 5101 + }, + { + "epoch": 0.5287594569385429, + "grad_norm": 0.5171878337860107, + "learning_rate": 1.9110751579970767e-05, + "loss": 0.1845, + "step": 5102 + }, + { + "epoch": 0.5288630946212043, + "grad_norm": 0.5259308815002441, + "learning_rate": 1.9104044747411213e-05, + "loss": 0.1994, + "step": 5103 + }, + { + "epoch": 0.5289667323038657, + "grad_norm": 0.49962759017944336, + "learning_rate": 1.9097338015806514e-05, + "loss": 0.1928, + "step": 5104 + }, + { + "epoch": 0.5290703699865271, + "grad_norm": 0.5970175862312317, + "learning_rate": 1.9090631385912356e-05, + "loss": 0.2264, + "step": 5105 + }, + { + "epoch": 0.5291740076691885, + "grad_norm": 0.5184756517410278, + "learning_rate": 1.908392485848444e-05, + "loss": 0.181, + "step": 5106 + }, + { + "epoch": 0.52927764535185, + "grad_norm": 0.5002952814102173, + "learning_rate": 1.9077218434278454e-05, + "loss": 0.2004, + "step": 5107 + }, + { + "epoch": 0.5293812830345114, + "grad_norm": 0.5318555235862732, + "learning_rate": 1.9070512114050055e-05, + "loss": 0.2001, + "step": 5108 + }, + { + "epoch": 0.5294849207171728, + "grad_norm": 0.6008946299552917, + "learning_rate": 1.906380589855491e-05, + "loss": 0.2246, + "step": 5109 + }, + { + "epoch": 0.5295885583998342, + "grad_norm": 0.6363059282302856, + "learning_rate": 1.9057099788548658e-05, + "loss": 0.2458, + "step": 5110 + }, + { + "epoch": 0.5296921960824956, + "grad_norm": 0.5909748673439026, + "learning_rate": 1.9050393784786933e-05, + "loss": 0.2314, + "step": 5111 + }, + { + "epoch": 0.529795833765157, + "grad_norm": 0.6305270791053772, + "learning_rate": 1.9043687888025367e-05, + "loss": 0.2594, + "step": 5112 + }, + { + "epoch": 0.5298994714478185, + "grad_norm": 0.6209995746612549, + "learning_rate": 1.903698209901956e-05, + "loss": 0.2149, + "step": 5113 + }, + { + "epoch": 0.5300031091304799, + "grad_norm": 0.6202288269996643, + "learning_rate": 1.903027641852512e-05, + "loss": 0.2201, + "step": 5114 + }, + { + "epoch": 0.5301067468131413, + "grad_norm": 0.6097816824913025, + "learning_rate": 1.902357084729761e-05, + "loss": 0.209, + "step": 5115 + }, + { + "epoch": 0.5302103844958027, + "grad_norm": 0.6084482073783875, + "learning_rate": 1.901686538609263e-05, + "loss": 0.2025, + "step": 5116 + }, + { + "epoch": 0.5303140221784641, + "grad_norm": 0.6160656809806824, + "learning_rate": 1.901016003566573e-05, + "loss": 0.2042, + "step": 5117 + }, + { + "epoch": 0.5304176598611255, + "grad_norm": 0.5739774107933044, + "learning_rate": 1.900345479677245e-05, + "loss": 0.203, + "step": 5118 + }, + { + "epoch": 0.530521297543787, + "grad_norm": 0.5758394598960876, + "learning_rate": 1.8996749670168342e-05, + "loss": 0.221, + "step": 5119 + }, + { + "epoch": 0.5306249352264484, + "grad_norm": 0.5090250372886658, + "learning_rate": 1.8990044656608922e-05, + "loss": 0.2106, + "step": 5120 + }, + { + "epoch": 0.5307285729091098, + "grad_norm": 0.5946542024612427, + "learning_rate": 1.8983339756849693e-05, + "loss": 0.2208, + "step": 5121 + }, + { + "epoch": 0.5308322105917712, + "grad_norm": 0.5539801120758057, + "learning_rate": 1.8976634971646168e-05, + "loss": 0.2032, + "step": 5122 + }, + { + "epoch": 0.5309358482744326, + "grad_norm": 0.6142858266830444, + "learning_rate": 1.8969930301753822e-05, + "loss": 0.2313, + "step": 5123 + }, + { + "epoch": 0.531039485957094, + "grad_norm": 0.5650430917739868, + "learning_rate": 1.896322574792813e-05, + "loss": 0.2256, + "step": 5124 + }, + { + "epoch": 0.5311431236397555, + "grad_norm": 0.4985640347003937, + "learning_rate": 1.895652131092454e-05, + "loss": 0.1825, + "step": 5125 + }, + { + "epoch": 0.5312467613224169, + "grad_norm": 0.5422653555870056, + "learning_rate": 1.8949816991498512e-05, + "loss": 0.201, + "step": 5126 + }, + { + "epoch": 0.5313503990050783, + "grad_norm": 0.4722491204738617, + "learning_rate": 1.894311279040548e-05, + "loss": 0.18, + "step": 5127 + }, + { + "epoch": 0.5314540366877397, + "grad_norm": 0.5422795414924622, + "learning_rate": 1.8936408708400843e-05, + "loss": 0.1689, + "step": 5128 + }, + { + "epoch": 0.531557674370401, + "grad_norm": 0.5806071162223816, + "learning_rate": 1.8929704746240028e-05, + "loss": 0.2322, + "step": 5129 + }, + { + "epoch": 0.5316613120530624, + "grad_norm": 0.5612425208091736, + "learning_rate": 1.8923000904678413e-05, + "loss": 0.2202, + "step": 5130 + }, + { + "epoch": 0.5317649497357239, + "grad_norm": 0.6409311294555664, + "learning_rate": 1.891629718447138e-05, + "loss": 0.2545, + "step": 5131 + }, + { + "epoch": 0.5318685874183853, + "grad_norm": 0.507507860660553, + "learning_rate": 1.8909593586374306e-05, + "loss": 0.1902, + "step": 5132 + }, + { + "epoch": 0.5319722251010467, + "grad_norm": 0.5384594202041626, + "learning_rate": 1.8902890111142524e-05, + "loss": 0.2057, + "step": 5133 + }, + { + "epoch": 0.5320758627837081, + "grad_norm": 0.5002520680427551, + "learning_rate": 1.8896186759531386e-05, + "loss": 0.1977, + "step": 5134 + }, + { + "epoch": 0.5321795004663695, + "grad_norm": 0.5171283483505249, + "learning_rate": 1.8889483532296198e-05, + "loss": 0.1795, + "step": 5135 + }, + { + "epoch": 0.5322831381490309, + "grad_norm": 0.4970911145210266, + "learning_rate": 1.8882780430192283e-05, + "loss": 0.1786, + "step": 5136 + }, + { + "epoch": 0.5323867758316924, + "grad_norm": 0.5327664613723755, + "learning_rate": 1.8876077453974936e-05, + "loss": 0.2345, + "step": 5137 + }, + { + "epoch": 0.5324904135143538, + "grad_norm": 0.5078004002571106, + "learning_rate": 1.886937460439943e-05, + "loss": 0.1826, + "step": 5138 + }, + { + "epoch": 0.5325940511970152, + "grad_norm": 0.5058934092521667, + "learning_rate": 1.886267188222104e-05, + "loss": 0.2089, + "step": 5139 + }, + { + "epoch": 0.5326976888796766, + "grad_norm": 0.5866641998291016, + "learning_rate": 1.885596928819501e-05, + "loss": 0.2104, + "step": 5140 + }, + { + "epoch": 0.532801326562338, + "grad_norm": 0.5487980842590332, + "learning_rate": 1.8849266823076578e-05, + "loss": 0.2334, + "step": 5141 + }, + { + "epoch": 0.5329049642449994, + "grad_norm": 0.6039403080940247, + "learning_rate": 1.8842564487620987e-05, + "loss": 0.248, + "step": 5142 + }, + { + "epoch": 0.5330086019276609, + "grad_norm": 0.6887938380241394, + "learning_rate": 1.8835862282583418e-05, + "loss": 0.2647, + "step": 5143 + }, + { + "epoch": 0.5331122396103223, + "grad_norm": 0.542961835861206, + "learning_rate": 1.8829160208719082e-05, + "loss": 0.2161, + "step": 5144 + }, + { + "epoch": 0.5332158772929837, + "grad_norm": 0.5670207738876343, + "learning_rate": 1.8822458266783154e-05, + "loss": 0.1922, + "step": 5145 + }, + { + "epoch": 0.5333195149756451, + "grad_norm": 0.5435192584991455, + "learning_rate": 1.8815756457530786e-05, + "loss": 0.194, + "step": 5146 + }, + { + "epoch": 0.5334231526583065, + "grad_norm": 0.4792133569717407, + "learning_rate": 1.8809054781717156e-05, + "loss": 0.199, + "step": 5147 + }, + { + "epoch": 0.533526790340968, + "grad_norm": 0.5294674634933472, + "learning_rate": 1.8802353240097374e-05, + "loss": 0.2228, + "step": 5148 + }, + { + "epoch": 0.5336304280236294, + "grad_norm": 0.5679932832717896, + "learning_rate": 1.8795651833426572e-05, + "loss": 0.2322, + "step": 5149 + }, + { + "epoch": 0.5337340657062908, + "grad_norm": 0.5157346725463867, + "learning_rate": 1.878895056245984e-05, + "loss": 0.1722, + "step": 5150 + }, + { + "epoch": 0.5338377033889522, + "grad_norm": 0.5690714120864868, + "learning_rate": 1.8782249427952282e-05, + "loss": 0.2167, + "step": 5151 + }, + { + "epoch": 0.5339413410716136, + "grad_norm": 0.5080891847610474, + "learning_rate": 1.877554843065897e-05, + "loss": 0.1814, + "step": 5152 + }, + { + "epoch": 0.534044978754275, + "grad_norm": 0.5569474101066589, + "learning_rate": 1.8768847571334945e-05, + "loss": 0.2148, + "step": 5153 + }, + { + "epoch": 0.5341486164369365, + "grad_norm": 0.5498877763748169, + "learning_rate": 1.876214685073527e-05, + "loss": 0.2354, + "step": 5154 + }, + { + "epoch": 0.5342522541195979, + "grad_norm": 0.6328274011611938, + "learning_rate": 1.8755446269614964e-05, + "loss": 0.2635, + "step": 5155 + }, + { + "epoch": 0.5343558918022593, + "grad_norm": 0.5649670362472534, + "learning_rate": 1.874874582872903e-05, + "loss": 0.2419, + "step": 5156 + }, + { + "epoch": 0.5344595294849207, + "grad_norm": 0.6114292144775391, + "learning_rate": 1.8742045528832482e-05, + "loss": 0.2163, + "step": 5157 + }, + { + "epoch": 0.5345631671675821, + "grad_norm": 0.47934606671333313, + "learning_rate": 1.8735345370680283e-05, + "loss": 0.183, + "step": 5158 + }, + { + "epoch": 0.5346668048502435, + "grad_norm": 0.5587397217750549, + "learning_rate": 1.8728645355027407e-05, + "loss": 0.2233, + "step": 5159 + }, + { + "epoch": 0.534770442532905, + "grad_norm": 0.556676983833313, + "learning_rate": 1.8721945482628786e-05, + "loss": 0.2177, + "step": 5160 + }, + { + "epoch": 0.5348740802155664, + "grad_norm": 0.4846968352794647, + "learning_rate": 1.8715245754239363e-05, + "loss": 0.2065, + "step": 5161 + }, + { + "epoch": 0.5349777178982278, + "grad_norm": 0.6382477283477783, + "learning_rate": 1.870854617061406e-05, + "loss": 0.2509, + "step": 5162 + }, + { + "epoch": 0.5350813555808892, + "grad_norm": 0.5473085045814514, + "learning_rate": 1.8701846732507757e-05, + "loss": 0.2239, + "step": 5163 + }, + { + "epoch": 0.5351849932635506, + "grad_norm": 0.5322921276092529, + "learning_rate": 1.8695147440675354e-05, + "loss": 0.1842, + "step": 5164 + }, + { + "epoch": 0.535288630946212, + "grad_norm": 0.5844441652297974, + "learning_rate": 1.8688448295871705e-05, + "loss": 0.2158, + "step": 5165 + }, + { + "epoch": 0.5353922686288735, + "grad_norm": 0.49183139204978943, + "learning_rate": 1.868174929885166e-05, + "loss": 0.1852, + "step": 5166 + }, + { + "epoch": 0.5354959063115349, + "grad_norm": 0.659501850605011, + "learning_rate": 1.867505045037006e-05, + "loss": 0.2578, + "step": 5167 + }, + { + "epoch": 0.5355995439941963, + "grad_norm": 0.5893154740333557, + "learning_rate": 1.8668351751181715e-05, + "loss": 0.2354, + "step": 5168 + }, + { + "epoch": 0.5357031816768577, + "grad_norm": 0.5923011302947998, + "learning_rate": 1.8661653202041427e-05, + "loss": 0.2168, + "step": 5169 + }, + { + "epoch": 0.5358068193595191, + "grad_norm": 0.5813918709754944, + "learning_rate": 1.8654954803703967e-05, + "loss": 0.2229, + "step": 5170 + }, + { + "epoch": 0.5359104570421805, + "grad_norm": 0.5587177276611328, + "learning_rate": 1.8648256556924114e-05, + "loss": 0.2057, + "step": 5171 + }, + { + "epoch": 0.536014094724842, + "grad_norm": 0.5316544771194458, + "learning_rate": 1.8641558462456614e-05, + "loss": 0.219, + "step": 5172 + }, + { + "epoch": 0.5361177324075034, + "grad_norm": 0.47134703397750854, + "learning_rate": 1.8634860521056187e-05, + "loss": 0.1603, + "step": 5173 + }, + { + "epoch": 0.5362213700901648, + "grad_norm": 0.5119736194610596, + "learning_rate": 1.8628162733477567e-05, + "loss": 0.2112, + "step": 5174 + }, + { + "epoch": 0.5363250077728262, + "grad_norm": 0.5992642641067505, + "learning_rate": 1.8621465100475426e-05, + "loss": 0.2282, + "step": 5175 + }, + { + "epoch": 0.5364286454554876, + "grad_norm": 0.5826268792152405, + "learning_rate": 1.8614767622804457e-05, + "loss": 0.2159, + "step": 5176 + }, + { + "epoch": 0.536532283138149, + "grad_norm": 0.4886619448661804, + "learning_rate": 1.8608070301219323e-05, + "loss": 0.1942, + "step": 5177 + }, + { + "epoch": 0.5366359208208105, + "grad_norm": 0.5100725293159485, + "learning_rate": 1.8601373136474657e-05, + "loss": 0.1664, + "step": 5178 + }, + { + "epoch": 0.5367395585034719, + "grad_norm": 0.5713315606117249, + "learning_rate": 1.85946761293251e-05, + "loss": 0.2196, + "step": 5179 + }, + { + "epoch": 0.5368431961861333, + "grad_norm": 0.5622671246528625, + "learning_rate": 1.8587979280525245e-05, + "loss": 0.1861, + "step": 5180 + }, + { + "epoch": 0.5369468338687947, + "grad_norm": 0.5729225873947144, + "learning_rate": 1.8581282590829687e-05, + "loss": 0.178, + "step": 5181 + }, + { + "epoch": 0.5370504715514561, + "grad_norm": 0.5069655179977417, + "learning_rate": 1.857458606099301e-05, + "loss": 0.1662, + "step": 5182 + }, + { + "epoch": 0.5371541092341175, + "grad_norm": 0.5307016968727112, + "learning_rate": 1.8567889691769757e-05, + "loss": 0.1914, + "step": 5183 + }, + { + "epoch": 0.537257746916779, + "grad_norm": 0.5231932997703552, + "learning_rate": 1.856119348391447e-05, + "loss": 0.2088, + "step": 5184 + }, + { + "epoch": 0.5373613845994404, + "grad_norm": 0.49470946192741394, + "learning_rate": 1.8554497438181655e-05, + "loss": 0.1688, + "step": 5185 + }, + { + "epoch": 0.5374650222821018, + "grad_norm": 0.5611957311630249, + "learning_rate": 1.8547801555325827e-05, + "loss": 0.2162, + "step": 5186 + }, + { + "epoch": 0.5375686599647632, + "grad_norm": 0.5838019251823425, + "learning_rate": 1.854110583610147e-05, + "loss": 0.188, + "step": 5187 + }, + { + "epoch": 0.5376722976474246, + "grad_norm": 0.5590904355049133, + "learning_rate": 1.8534410281263027e-05, + "loss": 0.2025, + "step": 5188 + }, + { + "epoch": 0.537775935330086, + "grad_norm": 0.6141221523284912, + "learning_rate": 1.8527714891564963e-05, + "loss": 0.2027, + "step": 5189 + }, + { + "epoch": 0.5378795730127475, + "grad_norm": 0.6489647030830383, + "learning_rate": 1.8521019667761697e-05, + "loss": 0.241, + "step": 5190 + }, + { + "epoch": 0.5379832106954089, + "grad_norm": 0.5584114193916321, + "learning_rate": 1.8514324610607626e-05, + "loss": 0.2239, + "step": 5191 + }, + { + "epoch": 0.5380868483780703, + "grad_norm": 0.5955229997634888, + "learning_rate": 1.8507629720857156e-05, + "loss": 0.2114, + "step": 5192 + }, + { + "epoch": 0.5381904860607317, + "grad_norm": 0.5925705432891846, + "learning_rate": 1.850093499926465e-05, + "loss": 0.1965, + "step": 5193 + }, + { + "epoch": 0.5382941237433931, + "grad_norm": 0.556430995464325, + "learning_rate": 1.849424044658446e-05, + "loss": 0.1813, + "step": 5194 + }, + { + "epoch": 0.5383977614260546, + "grad_norm": 0.5698986053466797, + "learning_rate": 1.8487546063570905e-05, + "loss": 0.1943, + "step": 5195 + }, + { + "epoch": 0.538501399108716, + "grad_norm": 0.5680100917816162, + "learning_rate": 1.8480851850978315e-05, + "loss": 0.201, + "step": 5196 + }, + { + "epoch": 0.5386050367913774, + "grad_norm": 0.5762329697608948, + "learning_rate": 1.847415780956098e-05, + "loss": 0.2255, + "step": 5197 + }, + { + "epoch": 0.5387086744740388, + "grad_norm": 0.6530503630638123, + "learning_rate": 1.8467463940073165e-05, + "loss": 0.2322, + "step": 5198 + }, + { + "epoch": 0.5388123121567002, + "grad_norm": 0.5293810367584229, + "learning_rate": 1.8460770243269135e-05, + "loss": 0.1851, + "step": 5199 + }, + { + "epoch": 0.5389159498393616, + "grad_norm": 0.6074128746986389, + "learning_rate": 1.8454076719903122e-05, + "loss": 0.2146, + "step": 5200 + }, + { + "epoch": 0.5390195875220231, + "grad_norm": 0.602114737033844, + "learning_rate": 1.8447383370729335e-05, + "loss": 0.2105, + "step": 5201 + }, + { + "epoch": 0.5391232252046845, + "grad_norm": 0.6234201788902283, + "learning_rate": 1.844069019650198e-05, + "loss": 0.2281, + "step": 5202 + }, + { + "epoch": 0.5392268628873459, + "grad_norm": 0.6111294627189636, + "learning_rate": 1.8433997197975234e-05, + "loss": 0.2109, + "step": 5203 + }, + { + "epoch": 0.5393305005700073, + "grad_norm": 0.5196408629417419, + "learning_rate": 1.8427304375903247e-05, + "loss": 0.2038, + "step": 5204 + }, + { + "epoch": 0.5394341382526686, + "grad_norm": 0.4035626947879791, + "learning_rate": 1.8420611731040155e-05, + "loss": 0.1288, + "step": 5205 + }, + { + "epoch": 0.53953777593533, + "grad_norm": 0.6713810563087463, + "learning_rate": 1.8413919264140078e-05, + "loss": 0.2453, + "step": 5206 + }, + { + "epoch": 0.5396414136179915, + "grad_norm": 0.5893852114677429, + "learning_rate": 1.8407226975957118e-05, + "loss": 0.2025, + "step": 5207 + }, + { + "epoch": 0.5397450513006529, + "grad_norm": 0.53298020362854, + "learning_rate": 1.840053486724534e-05, + "loss": 0.1801, + "step": 5208 + }, + { + "epoch": 0.5398486889833143, + "grad_norm": 0.5477540493011475, + "learning_rate": 1.8393842938758814e-05, + "loss": 0.1831, + "step": 5209 + }, + { + "epoch": 0.5399523266659757, + "grad_norm": 0.5426579117774963, + "learning_rate": 1.838715119125156e-05, + "loss": 0.2084, + "step": 5210 + }, + { + "epoch": 0.5400559643486371, + "grad_norm": 0.47737252712249756, + "learning_rate": 1.8380459625477605e-05, + "loss": 0.1755, + "step": 5211 + }, + { + "epoch": 0.5401596020312985, + "grad_norm": 0.6123090982437134, + "learning_rate": 1.8373768242190947e-05, + "loss": 0.2052, + "step": 5212 + }, + { + "epoch": 0.54026323971396, + "grad_norm": 0.5149741172790527, + "learning_rate": 1.8367077042145547e-05, + "loss": 0.1853, + "step": 5213 + }, + { + "epoch": 0.5403668773966214, + "grad_norm": 0.580742597579956, + "learning_rate": 1.8360386026095377e-05, + "loss": 0.2322, + "step": 5214 + }, + { + "epoch": 0.5404705150792828, + "grad_norm": 0.5247928500175476, + "learning_rate": 1.8353695194794355e-05, + "loss": 0.1819, + "step": 5215 + }, + { + "epoch": 0.5405741527619442, + "grad_norm": 0.5545816421508789, + "learning_rate": 1.8347004548996395e-05, + "loss": 0.1981, + "step": 5216 + }, + { + "epoch": 0.5406777904446056, + "grad_norm": 0.5632359385490417, + "learning_rate": 1.8340314089455403e-05, + "loss": 0.1988, + "step": 5217 + }, + { + "epoch": 0.540781428127267, + "grad_norm": 0.5110858082771301, + "learning_rate": 1.8333623816925232e-05, + "loss": 0.1861, + "step": 5218 + }, + { + "epoch": 0.5408850658099285, + "grad_norm": 0.563378095626831, + "learning_rate": 1.8326933732159748e-05, + "loss": 0.2071, + "step": 5219 + }, + { + "epoch": 0.5409887034925899, + "grad_norm": 0.6319681406021118, + "learning_rate": 1.8320243835912755e-05, + "loss": 0.2227, + "step": 5220 + }, + { + "epoch": 0.5410923411752513, + "grad_norm": 0.7164770364761353, + "learning_rate": 1.8313554128938084e-05, + "loss": 0.2571, + "step": 5221 + }, + { + "epoch": 0.5411959788579127, + "grad_norm": 0.5762877464294434, + "learning_rate": 1.8306864611989518e-05, + "loss": 0.2073, + "step": 5222 + }, + { + "epoch": 0.5412996165405741, + "grad_norm": 0.570988118648529, + "learning_rate": 1.8300175285820802e-05, + "loss": 0.2105, + "step": 5223 + }, + { + "epoch": 0.5414032542232355, + "grad_norm": 0.54582279920578, + "learning_rate": 1.8293486151185703e-05, + "loss": 0.2258, + "step": 5224 + }, + { + "epoch": 0.541506891905897, + "grad_norm": 0.598730206489563, + "learning_rate": 1.828679720883793e-05, + "loss": 0.2315, + "step": 5225 + }, + { + "epoch": 0.5416105295885584, + "grad_norm": 0.5679086446762085, + "learning_rate": 1.828010845953118e-05, + "loss": 0.1977, + "step": 5226 + }, + { + "epoch": 0.5417141672712198, + "grad_norm": 0.5190294981002808, + "learning_rate": 1.827341990401914e-05, + "loss": 0.1964, + "step": 5227 + }, + { + "epoch": 0.5418178049538812, + "grad_norm": 0.5730674266815186, + "learning_rate": 1.826673154305546e-05, + "loss": 0.2332, + "step": 5228 + }, + { + "epoch": 0.5419214426365426, + "grad_norm": 0.5738920569419861, + "learning_rate": 1.8260043377393777e-05, + "loss": 0.1991, + "step": 5229 + }, + { + "epoch": 0.542025080319204, + "grad_norm": 0.648719847202301, + "learning_rate": 1.8253355407787693e-05, + "loss": 0.2623, + "step": 5230 + }, + { + "epoch": 0.5421287180018655, + "grad_norm": 0.5778685808181763, + "learning_rate": 1.8246667634990815e-05, + "loss": 0.2092, + "step": 5231 + }, + { + "epoch": 0.5422323556845269, + "grad_norm": 0.47101718187332153, + "learning_rate": 1.82399800597567e-05, + "loss": 0.1834, + "step": 5232 + }, + { + "epoch": 0.5423359933671883, + "grad_norm": 0.587877631187439, + "learning_rate": 1.8233292682838892e-05, + "loss": 0.226, + "step": 5233 + }, + { + "epoch": 0.5424396310498497, + "grad_norm": 0.571196436882019, + "learning_rate": 1.822660550499093e-05, + "loss": 0.1996, + "step": 5234 + }, + { + "epoch": 0.5425432687325111, + "grad_norm": 0.5070984363555908, + "learning_rate": 1.821991852696629e-05, + "loss": 0.1936, + "step": 5235 + }, + { + "epoch": 0.5426469064151725, + "grad_norm": 0.49533402919769287, + "learning_rate": 1.821323174951846e-05, + "loss": 0.1849, + "step": 5236 + }, + { + "epoch": 0.542750544097834, + "grad_norm": 0.44407081604003906, + "learning_rate": 1.8206545173400915e-05, + "loss": 0.1468, + "step": 5237 + }, + { + "epoch": 0.5428541817804954, + "grad_norm": 0.5772473812103271, + "learning_rate": 1.819985879936706e-05, + "loss": 0.1975, + "step": 5238 + }, + { + "epoch": 0.5429578194631568, + "grad_norm": 0.5305479168891907, + "learning_rate": 1.8193172628170324e-05, + "loss": 0.1822, + "step": 5239 + }, + { + "epoch": 0.5430614571458182, + "grad_norm": 0.576831579208374, + "learning_rate": 1.8186486660564083e-05, + "loss": 0.2108, + "step": 5240 + }, + { + "epoch": 0.5431650948284796, + "grad_norm": 0.5997173190116882, + "learning_rate": 1.81798008973017e-05, + "loss": 0.2474, + "step": 5241 + }, + { + "epoch": 0.543268732511141, + "grad_norm": 0.5615498423576355, + "learning_rate": 1.8173115339136537e-05, + "loss": 0.2051, + "step": 5242 + }, + { + "epoch": 0.5433723701938025, + "grad_norm": 0.6059595942497253, + "learning_rate": 1.8166429986821887e-05, + "loss": 0.2629, + "step": 5243 + }, + { + "epoch": 0.5434760078764639, + "grad_norm": 0.5712944865226746, + "learning_rate": 1.8159744841111064e-05, + "loss": 0.2219, + "step": 5244 + }, + { + "epoch": 0.5435796455591253, + "grad_norm": 0.5239284634590149, + "learning_rate": 1.8153059902757322e-05, + "loss": 0.1881, + "step": 5245 + }, + { + "epoch": 0.5436832832417867, + "grad_norm": 0.5493417978286743, + "learning_rate": 1.814637517251392e-05, + "loss": 0.1873, + "step": 5246 + }, + { + "epoch": 0.5437869209244481, + "grad_norm": 0.47680315375328064, + "learning_rate": 1.8139690651134092e-05, + "loss": 0.1967, + "step": 5247 + }, + { + "epoch": 0.5438905586071096, + "grad_norm": 0.6329566836357117, + "learning_rate": 1.813300633937102e-05, + "loss": 0.201, + "step": 5248 + }, + { + "epoch": 0.543994196289771, + "grad_norm": 0.5699524879455566, + "learning_rate": 1.8126322237977894e-05, + "loss": 0.2109, + "step": 5249 + }, + { + "epoch": 0.5440978339724324, + "grad_norm": 0.6076616644859314, + "learning_rate": 1.8119638347707868e-05, + "loss": 0.2299, + "step": 5250 + }, + { + "epoch": 0.5442014716550938, + "grad_norm": 0.5323348045349121, + "learning_rate": 1.811295466931406e-05, + "loss": 0.1677, + "step": 5251 + }, + { + "epoch": 0.5443051093377552, + "grad_norm": 0.5809974670410156, + "learning_rate": 1.81062712035496e-05, + "loss": 0.2257, + "step": 5252 + }, + { + "epoch": 0.5444087470204166, + "grad_norm": 0.5623114109039307, + "learning_rate": 1.809958795116755e-05, + "loss": 0.2039, + "step": 5253 + }, + { + "epoch": 0.5445123847030781, + "grad_norm": 0.6232031583786011, + "learning_rate": 1.809290491292098e-05, + "loss": 0.2525, + "step": 5254 + }, + { + "epoch": 0.5446160223857395, + "grad_norm": 0.5744406580924988, + "learning_rate": 1.808622208956291e-05, + "loss": 0.2111, + "step": 5255 + }, + { + "epoch": 0.5447196600684009, + "grad_norm": 0.5415107011795044, + "learning_rate": 1.8079539481846366e-05, + "loss": 0.2234, + "step": 5256 + }, + { + "epoch": 0.5448232977510623, + "grad_norm": 0.673896312713623, + "learning_rate": 1.8072857090524332e-05, + "loss": 0.2452, + "step": 5257 + }, + { + "epoch": 0.5449269354337237, + "grad_norm": 0.5377049446105957, + "learning_rate": 1.806617491634976e-05, + "loss": 0.2092, + "step": 5258 + }, + { + "epoch": 0.5450305731163851, + "grad_norm": 0.5719529986381531, + "learning_rate": 1.8059492960075593e-05, + "loss": 0.1924, + "step": 5259 + }, + { + "epoch": 0.5451342107990466, + "grad_norm": 0.5697689652442932, + "learning_rate": 1.8052811222454743e-05, + "loss": 0.2074, + "step": 5260 + }, + { + "epoch": 0.545237848481708, + "grad_norm": 0.5704839825630188, + "learning_rate": 1.804612970424009e-05, + "loss": 0.1868, + "step": 5261 + }, + { + "epoch": 0.5453414861643694, + "grad_norm": 0.5120481848716736, + "learning_rate": 1.803944840618452e-05, + "loss": 0.1946, + "step": 5262 + }, + { + "epoch": 0.5454451238470308, + "grad_norm": 0.561516523361206, + "learning_rate": 1.8032767329040846e-05, + "loss": 0.2016, + "step": 5263 + }, + { + "epoch": 0.5455487615296922, + "grad_norm": 0.5484476089477539, + "learning_rate": 1.802608647356189e-05, + "loss": 0.1962, + "step": 5264 + }, + { + "epoch": 0.5456523992123536, + "grad_norm": 0.5292932987213135, + "learning_rate": 1.8019405840500446e-05, + "loss": 0.1673, + "step": 5265 + }, + { + "epoch": 0.5457560368950151, + "grad_norm": 0.539897620677948, + "learning_rate": 1.8012725430609273e-05, + "loss": 0.1865, + "step": 5266 + }, + { + "epoch": 0.5458596745776765, + "grad_norm": 0.5219693183898926, + "learning_rate": 1.800604524464111e-05, + "loss": 0.2064, + "step": 5267 + }, + { + "epoch": 0.5459633122603379, + "grad_norm": 0.5294369459152222, + "learning_rate": 1.7999365283348665e-05, + "loss": 0.198, + "step": 5268 + }, + { + "epoch": 0.5460669499429993, + "grad_norm": 0.49916645884513855, + "learning_rate": 1.7992685547484628e-05, + "loss": 0.1724, + "step": 5269 + }, + { + "epoch": 0.5461705876256607, + "grad_norm": 0.5305742621421814, + "learning_rate": 1.7986006037801674e-05, + "loss": 0.1846, + "step": 5270 + }, + { + "epoch": 0.5462742253083221, + "grad_norm": 0.6302502751350403, + "learning_rate": 1.797932675505242e-05, + "loss": 0.2534, + "step": 5271 + }, + { + "epoch": 0.5463778629909836, + "grad_norm": 0.496896892786026, + "learning_rate": 1.797264769998949e-05, + "loss": 0.2217, + "step": 5272 + }, + { + "epoch": 0.546481500673645, + "grad_norm": 0.5490161776542664, + "learning_rate": 1.796596887336546e-05, + "loss": 0.1957, + "step": 5273 + }, + { + "epoch": 0.5465851383563064, + "grad_norm": 0.517518937587738, + "learning_rate": 1.79592902759329e-05, + "loss": 0.197, + "step": 5274 + }, + { + "epoch": 0.5466887760389678, + "grad_norm": 0.5890717506408691, + "learning_rate": 1.7952611908444342e-05, + "loss": 0.1834, + "step": 5275 + }, + { + "epoch": 0.5467924137216292, + "grad_norm": 0.528845489025116, + "learning_rate": 1.794593377165228e-05, + "loss": 0.2173, + "step": 5276 + }, + { + "epoch": 0.5468960514042907, + "grad_norm": 0.6080582141876221, + "learning_rate": 1.793925586630922e-05, + "loss": 0.2091, + "step": 5277 + }, + { + "epoch": 0.5469996890869521, + "grad_norm": 0.5256290435791016, + "learning_rate": 1.79325781931676e-05, + "loss": 0.173, + "step": 5278 + }, + { + "epoch": 0.5471033267696135, + "grad_norm": 0.5938299894332886, + "learning_rate": 1.7925900752979853e-05, + "loss": 0.2164, + "step": 5279 + }, + { + "epoch": 0.5472069644522749, + "grad_norm": 0.6256065368652344, + "learning_rate": 1.791922354649839e-05, + "loss": 0.2082, + "step": 5280 + }, + { + "epoch": 0.5473106021349362, + "grad_norm": 0.562027633190155, + "learning_rate": 1.791254657447558e-05, + "loss": 0.186, + "step": 5281 + }, + { + "epoch": 0.5474142398175976, + "grad_norm": 0.6371752023696899, + "learning_rate": 1.7905869837663783e-05, + "loss": 0.2397, + "step": 5282 + }, + { + "epoch": 0.547517877500259, + "grad_norm": 0.5378360152244568, + "learning_rate": 1.7899193336815307e-05, + "loss": 0.1959, + "step": 5283 + }, + { + "epoch": 0.5476215151829205, + "grad_norm": 0.5654333233833313, + "learning_rate": 1.7892517072682466e-05, + "loss": 0.2047, + "step": 5284 + }, + { + "epoch": 0.5477251528655819, + "grad_norm": 0.49719518423080444, + "learning_rate": 1.7885841046017528e-05, + "loss": 0.1789, + "step": 5285 + }, + { + "epoch": 0.5478287905482433, + "grad_norm": 0.5758084058761597, + "learning_rate": 1.7879165257572725e-05, + "loss": 0.2246, + "step": 5286 + }, + { + "epoch": 0.5479324282309047, + "grad_norm": 0.5711327791213989, + "learning_rate": 1.7872489708100295e-05, + "loss": 0.2078, + "step": 5287 + }, + { + "epoch": 0.5480360659135661, + "grad_norm": 0.6247648000717163, + "learning_rate": 1.786581439835241e-05, + "loss": 0.2448, + "step": 5288 + }, + { + "epoch": 0.5481397035962275, + "grad_norm": 0.6398104429244995, + "learning_rate": 1.785913932908124e-05, + "loss": 0.2248, + "step": 5289 + }, + { + "epoch": 0.548243341278889, + "grad_norm": 0.602163553237915, + "learning_rate": 1.785246450103893e-05, + "loss": 0.2323, + "step": 5290 + }, + { + "epoch": 0.5483469789615504, + "grad_norm": 0.6218096017837524, + "learning_rate": 1.7845789914977578e-05, + "loss": 0.2218, + "step": 5291 + }, + { + "epoch": 0.5484506166442118, + "grad_norm": 0.5533711314201355, + "learning_rate": 1.783911557164927e-05, + "loss": 0.1929, + "step": 5292 + }, + { + "epoch": 0.5485542543268732, + "grad_norm": 0.5798667669296265, + "learning_rate": 1.7832441471806053e-05, + "loss": 0.1912, + "step": 5293 + }, + { + "epoch": 0.5486578920095346, + "grad_norm": 0.6059601306915283, + "learning_rate": 1.782576761619997e-05, + "loss": 0.2237, + "step": 5294 + }, + { + "epoch": 0.548761529692196, + "grad_norm": 0.553433895111084, + "learning_rate": 1.781909400558301e-05, + "loss": 0.1725, + "step": 5295 + }, + { + "epoch": 0.5488651673748575, + "grad_norm": 0.5430698990821838, + "learning_rate": 1.7812420640707143e-05, + "loss": 0.2093, + "step": 5296 + }, + { + "epoch": 0.5489688050575189, + "grad_norm": 0.5995080471038818, + "learning_rate": 1.780574752232433e-05, + "loss": 0.221, + "step": 5297 + }, + { + "epoch": 0.5490724427401803, + "grad_norm": 0.46209654211997986, + "learning_rate": 1.779907465118646e-05, + "loss": 0.1817, + "step": 5298 + }, + { + "epoch": 0.5491760804228417, + "grad_norm": 0.5681026577949524, + "learning_rate": 1.7792402028045442e-05, + "loss": 0.2409, + "step": 5299 + }, + { + "epoch": 0.5492797181055031, + "grad_norm": 0.5082581043243408, + "learning_rate": 1.7785729653653137e-05, + "loss": 0.1704, + "step": 5300 + }, + { + "epoch": 0.5493833557881646, + "grad_norm": 0.559217631816864, + "learning_rate": 1.7779057528761364e-05, + "loss": 0.2277, + "step": 5301 + }, + { + "epoch": 0.549486993470826, + "grad_norm": 0.5087469220161438, + "learning_rate": 1.7772385654121947e-05, + "loss": 0.1993, + "step": 5302 + }, + { + "epoch": 0.5495906311534874, + "grad_norm": 0.5892218947410583, + "learning_rate": 1.7765714030486644e-05, + "loss": 0.2039, + "step": 5303 + }, + { + "epoch": 0.5496942688361488, + "grad_norm": 0.5468608736991882, + "learning_rate": 1.7759042658607208e-05, + "loss": 0.1872, + "step": 5304 + }, + { + "epoch": 0.5497979065188102, + "grad_norm": 0.4923431873321533, + "learning_rate": 1.7752371539235367e-05, + "loss": 0.1761, + "step": 5305 + }, + { + "epoch": 0.5499015442014716, + "grad_norm": 0.5435743927955627, + "learning_rate": 1.7745700673122804e-05, + "loss": 0.1834, + "step": 5306 + }, + { + "epoch": 0.550005181884133, + "grad_norm": 0.5914965271949768, + "learning_rate": 1.773903006102119e-05, + "loss": 0.2169, + "step": 5307 + }, + { + "epoch": 0.5501088195667945, + "grad_norm": 0.6041461825370789, + "learning_rate": 1.7732359703682146e-05, + "loss": 0.2351, + "step": 5308 + }, + { + "epoch": 0.5502124572494559, + "grad_norm": 0.5261842012405396, + "learning_rate": 1.772568960185729e-05, + "loss": 0.186, + "step": 5309 + }, + { + "epoch": 0.5503160949321173, + "grad_norm": 0.5709972977638245, + "learning_rate": 1.7719019756298198e-05, + "loss": 0.1804, + "step": 5310 + }, + { + "epoch": 0.5504197326147787, + "grad_norm": 0.5214843153953552, + "learning_rate": 1.7712350167756407e-05, + "loss": 0.2023, + "step": 5311 + }, + { + "epoch": 0.5505233702974401, + "grad_norm": 0.6243929266929626, + "learning_rate": 1.7705680836983448e-05, + "loss": 0.2634, + "step": 5312 + }, + { + "epoch": 0.5506270079801016, + "grad_norm": 0.5082367062568665, + "learning_rate": 1.7699011764730806e-05, + "loss": 0.1727, + "step": 5313 + }, + { + "epoch": 0.550730645662763, + "grad_norm": 0.604770302772522, + "learning_rate": 1.769234295174993e-05, + "loss": 0.2468, + "step": 5314 + }, + { + "epoch": 0.5508342833454244, + "grad_norm": 0.5542752146720886, + "learning_rate": 1.7685674398792278e-05, + "loss": 0.22, + "step": 5315 + }, + { + "epoch": 0.5509379210280858, + "grad_norm": 0.6486345529556274, + "learning_rate": 1.7679006106609228e-05, + "loss": 0.2442, + "step": 5316 + }, + { + "epoch": 0.5510415587107472, + "grad_norm": 0.6120084524154663, + "learning_rate": 1.767233807595217e-05, + "loss": 0.2233, + "step": 5317 + }, + { + "epoch": 0.5511451963934086, + "grad_norm": 0.5620784163475037, + "learning_rate": 1.766567030757243e-05, + "loss": 0.2261, + "step": 5318 + }, + { + "epoch": 0.5512488340760701, + "grad_norm": 0.6042924523353577, + "learning_rate": 1.7659002802221334e-05, + "loss": 0.2238, + "step": 5319 + }, + { + "epoch": 0.5513524717587315, + "grad_norm": 0.59671950340271, + "learning_rate": 1.7652335560650165e-05, + "loss": 0.221, + "step": 5320 + }, + { + "epoch": 0.5514561094413929, + "grad_norm": 0.6434497237205505, + "learning_rate": 1.764566858361017e-05, + "loss": 0.2577, + "step": 5321 + }, + { + "epoch": 0.5515597471240543, + "grad_norm": 0.5868349671363831, + "learning_rate": 1.763900187185259e-05, + "loss": 0.2109, + "step": 5322 + }, + { + "epoch": 0.5516633848067157, + "grad_norm": 0.6528464555740356, + "learning_rate": 1.76323354261286e-05, + "loss": 0.2481, + "step": 5323 + }, + { + "epoch": 0.5517670224893771, + "grad_norm": 0.5790517926216125, + "learning_rate": 1.7625669247189372e-05, + "loss": 0.2097, + "step": 5324 + }, + { + "epoch": 0.5518706601720386, + "grad_norm": 0.5709297060966492, + "learning_rate": 1.7619003335786053e-05, + "loss": 0.2235, + "step": 5325 + }, + { + "epoch": 0.5519742978547, + "grad_norm": 0.5696895122528076, + "learning_rate": 1.7612337692669726e-05, + "loss": 0.2164, + "step": 5326 + }, + { + "epoch": 0.5520779355373614, + "grad_norm": 0.5986068248748779, + "learning_rate": 1.7605672318591484e-05, + "loss": 0.2088, + "step": 5327 + }, + { + "epoch": 0.5521815732200228, + "grad_norm": 0.6396403908729553, + "learning_rate": 1.7599007214302356e-05, + "loss": 0.2216, + "step": 5328 + }, + { + "epoch": 0.5522852109026842, + "grad_norm": 0.49083516001701355, + "learning_rate": 1.7592342380553366e-05, + "loss": 0.1829, + "step": 5329 + }, + { + "epoch": 0.5523888485853456, + "grad_norm": 0.5620934367179871, + "learning_rate": 1.7585677818095493e-05, + "loss": 0.2067, + "step": 5330 + }, + { + "epoch": 0.5524924862680071, + "grad_norm": 0.5595473051071167, + "learning_rate": 1.7579013527679694e-05, + "loss": 0.195, + "step": 5331 + }, + { + "epoch": 0.5525961239506685, + "grad_norm": 0.48822200298309326, + "learning_rate": 1.7572349510056886e-05, + "loss": 0.1652, + "step": 5332 + }, + { + "epoch": 0.5526997616333299, + "grad_norm": 0.48362472653388977, + "learning_rate": 1.7565685765977955e-05, + "loss": 0.1778, + "step": 5333 + }, + { + "epoch": 0.5528033993159913, + "grad_norm": 0.5342475771903992, + "learning_rate": 1.7559022296193774e-05, + "loss": 0.2133, + "step": 5334 + }, + { + "epoch": 0.5529070369986527, + "grad_norm": 0.5649202466011047, + "learning_rate": 1.7552359101455166e-05, + "loss": 0.1966, + "step": 5335 + }, + { + "epoch": 0.5530106746813142, + "grad_norm": 0.5567264556884766, + "learning_rate": 1.7545696182512923e-05, + "loss": 0.1973, + "step": 5336 + }, + { + "epoch": 0.5531143123639756, + "grad_norm": 0.5125313401222229, + "learning_rate": 1.753903354011783e-05, + "loss": 0.1939, + "step": 5337 + }, + { + "epoch": 0.553217950046637, + "grad_norm": 0.5587508082389832, + "learning_rate": 1.7532371175020604e-05, + "loss": 0.2031, + "step": 5338 + }, + { + "epoch": 0.5533215877292984, + "grad_norm": 0.4835485816001892, + "learning_rate": 1.7525709087971953e-05, + "loss": 0.1942, + "step": 5339 + }, + { + "epoch": 0.5534252254119598, + "grad_norm": 0.4491787552833557, + "learning_rate": 1.7519047279722566e-05, + "loss": 0.156, + "step": 5340 + }, + { + "epoch": 0.5535288630946212, + "grad_norm": 0.555193305015564, + "learning_rate": 1.751238575102307e-05, + "loss": 0.1826, + "step": 5341 + }, + { + "epoch": 0.5536325007772827, + "grad_norm": 0.48197105526924133, + "learning_rate": 1.750572450262409e-05, + "loss": 0.1616, + "step": 5342 + }, + { + "epoch": 0.5537361384599441, + "grad_norm": 0.5574571490287781, + "learning_rate": 1.7499063535276178e-05, + "loss": 0.244, + "step": 5343 + }, + { + "epoch": 0.5538397761426055, + "grad_norm": 0.5440523624420166, + "learning_rate": 1.749240284972991e-05, + "loss": 0.203, + "step": 5344 + }, + { + "epoch": 0.5539434138252669, + "grad_norm": 0.6209940314292908, + "learning_rate": 1.748574244673579e-05, + "loss": 0.2292, + "step": 5345 + }, + { + "epoch": 0.5540470515079283, + "grad_norm": 0.6357861161231995, + "learning_rate": 1.7479082327044297e-05, + "loss": 0.2167, + "step": 5346 + }, + { + "epoch": 0.5541506891905897, + "grad_norm": 0.6537289023399353, + "learning_rate": 1.74724224914059e-05, + "loss": 0.2441, + "step": 5347 + }, + { + "epoch": 0.5542543268732512, + "grad_norm": 0.5783520936965942, + "learning_rate": 1.7465762940571e-05, + "loss": 0.2096, + "step": 5348 + }, + { + "epoch": 0.5543579645559126, + "grad_norm": 0.548708975315094, + "learning_rate": 1.745910367528999e-05, + "loss": 0.1975, + "step": 5349 + }, + { + "epoch": 0.554461602238574, + "grad_norm": 0.5915510654449463, + "learning_rate": 1.7452444696313232e-05, + "loss": 0.2071, + "step": 5350 + }, + { + "epoch": 0.5545652399212354, + "grad_norm": 0.3948794901371002, + "learning_rate": 1.7445786004391046e-05, + "loss": 0.1414, + "step": 5351 + }, + { + "epoch": 0.5546688776038968, + "grad_norm": 0.5463305711746216, + "learning_rate": 1.743912760027372e-05, + "loss": 0.2018, + "step": 5352 + }, + { + "epoch": 0.5547725152865582, + "grad_norm": 0.5292659401893616, + "learning_rate": 1.7432469484711516e-05, + "loss": 0.1838, + "step": 5353 + }, + { + "epoch": 0.5548761529692197, + "grad_norm": 0.6309893131256104, + "learning_rate": 1.7425811658454657e-05, + "loss": 0.2153, + "step": 5354 + }, + { + "epoch": 0.5549797906518811, + "grad_norm": 0.49494248628616333, + "learning_rate": 1.7419154122253344e-05, + "loss": 0.1536, + "step": 5355 + }, + { + "epoch": 0.5550834283345425, + "grad_norm": 0.5081691145896912, + "learning_rate": 1.741249687685772e-05, + "loss": 0.2007, + "step": 5356 + }, + { + "epoch": 0.5551870660172038, + "grad_norm": 0.5499346852302551, + "learning_rate": 1.740583992301794e-05, + "loss": 0.2021, + "step": 5357 + }, + { + "epoch": 0.5552907036998652, + "grad_norm": 0.5672243237495422, + "learning_rate": 1.7399183261484083e-05, + "loss": 0.2332, + "step": 5358 + }, + { + "epoch": 0.5553943413825266, + "grad_norm": 0.5117417573928833, + "learning_rate": 1.7392526893006204e-05, + "loss": 0.1965, + "step": 5359 + }, + { + "epoch": 0.555497979065188, + "grad_norm": 0.5445107221603394, + "learning_rate": 1.7385870818334353e-05, + "loss": 0.1574, + "step": 5360 + }, + { + "epoch": 0.5556016167478495, + "grad_norm": 0.5121918320655823, + "learning_rate": 1.7379215038218505e-05, + "loss": 0.1841, + "step": 5361 + }, + { + "epoch": 0.5557052544305109, + "grad_norm": 0.5723550915718079, + "learning_rate": 1.737255955340864e-05, + "loss": 0.2184, + "step": 5362 + }, + { + "epoch": 0.5558088921131723, + "grad_norm": 0.5024679899215698, + "learning_rate": 1.7365904364654677e-05, + "loss": 0.1718, + "step": 5363 + }, + { + "epoch": 0.5559125297958337, + "grad_norm": 0.5417926907539368, + "learning_rate": 1.7359249472706508e-05, + "loss": 0.1767, + "step": 5364 + }, + { + "epoch": 0.5560161674784951, + "grad_norm": 0.5689263343811035, + "learning_rate": 1.7352594878314014e-05, + "loss": 0.1896, + "step": 5365 + }, + { + "epoch": 0.5561198051611566, + "grad_norm": 0.580989420413971, + "learning_rate": 1.7345940582227004e-05, + "loss": 0.2157, + "step": 5366 + }, + { + "epoch": 0.556223442843818, + "grad_norm": 0.5768171548843384, + "learning_rate": 1.7339286585195294e-05, + "loss": 0.1891, + "step": 5367 + }, + { + "epoch": 0.5563270805264794, + "grad_norm": 0.5534247159957886, + "learning_rate": 1.7332632887968625e-05, + "loss": 0.2009, + "step": 5368 + }, + { + "epoch": 0.5564307182091408, + "grad_norm": 0.5858224630355835, + "learning_rate": 1.732597949129674e-05, + "loss": 0.2313, + "step": 5369 + }, + { + "epoch": 0.5565343558918022, + "grad_norm": 0.6130474209785461, + "learning_rate": 1.7319326395929335e-05, + "loss": 0.2484, + "step": 5370 + }, + { + "epoch": 0.5566379935744636, + "grad_norm": 0.5742282867431641, + "learning_rate": 1.7312673602616055e-05, + "loss": 0.2073, + "step": 5371 + }, + { + "epoch": 0.5567416312571251, + "grad_norm": 0.5607818961143494, + "learning_rate": 1.7306021112106543e-05, + "loss": 0.2023, + "step": 5372 + }, + { + "epoch": 0.5568452689397865, + "grad_norm": 0.5325602889060974, + "learning_rate": 1.729936892515038e-05, + "loss": 0.1775, + "step": 5373 + }, + { + "epoch": 0.5569489066224479, + "grad_norm": 0.6108415722846985, + "learning_rate": 1.7292717042497125e-05, + "loss": 0.2389, + "step": 5374 + }, + { + "epoch": 0.5570525443051093, + "grad_norm": 0.4758548438549042, + "learning_rate": 1.7286065464896315e-05, + "loss": 0.1564, + "step": 5375 + }, + { + "epoch": 0.5571561819877707, + "grad_norm": 0.5790165662765503, + "learning_rate": 1.7279414193097424e-05, + "loss": 0.218, + "step": 5376 + }, + { + "epoch": 0.5572598196704321, + "grad_norm": 0.603882908821106, + "learning_rate": 1.7272763227849915e-05, + "loss": 0.2127, + "step": 5377 + }, + { + "epoch": 0.5573634573530936, + "grad_norm": 0.5412938594818115, + "learning_rate": 1.7266112569903198e-05, + "loss": 0.1934, + "step": 5378 + }, + { + "epoch": 0.557467095035755, + "grad_norm": 0.5319254398345947, + "learning_rate": 1.7259462220006673e-05, + "loss": 0.2102, + "step": 5379 + }, + { + "epoch": 0.5575707327184164, + "grad_norm": 0.534984827041626, + "learning_rate": 1.7252812178909687e-05, + "loss": 0.2199, + "step": 5380 + }, + { + "epoch": 0.5576743704010778, + "grad_norm": 0.5318877100944519, + "learning_rate": 1.7246162447361546e-05, + "loss": 0.1978, + "step": 5381 + }, + { + "epoch": 0.5577780080837392, + "grad_norm": 0.6203436255455017, + "learning_rate": 1.723951302611155e-05, + "loss": 0.2205, + "step": 5382 + }, + { + "epoch": 0.5578816457664006, + "grad_norm": 0.5534346699714661, + "learning_rate": 1.723286391590893e-05, + "loss": 0.2086, + "step": 5383 + }, + { + "epoch": 0.5579852834490621, + "grad_norm": 0.5063457489013672, + "learning_rate": 1.7226215117502896e-05, + "loss": 0.1778, + "step": 5384 + }, + { + "epoch": 0.5580889211317235, + "grad_norm": 0.6248680949211121, + "learning_rate": 1.721956663164264e-05, + "loss": 0.2315, + "step": 5385 + }, + { + "epoch": 0.5581925588143849, + "grad_norm": 0.5708314776420593, + "learning_rate": 1.721291845907729e-05, + "loss": 0.2203, + "step": 5386 + }, + { + "epoch": 0.5582961964970463, + "grad_norm": 0.6492812633514404, + "learning_rate": 1.720627060055596e-05, + "loss": 0.2406, + "step": 5387 + }, + { + "epoch": 0.5583998341797077, + "grad_norm": 0.507900059223175, + "learning_rate": 1.7199623056827708e-05, + "loss": 0.1866, + "step": 5388 + }, + { + "epoch": 0.5585034718623691, + "grad_norm": 0.6005551815032959, + "learning_rate": 1.719297582864158e-05, + "loss": 0.2196, + "step": 5389 + }, + { + "epoch": 0.5586071095450306, + "grad_norm": 0.649582028388977, + "learning_rate": 1.7186328916746576e-05, + "loss": 0.2067, + "step": 5390 + }, + { + "epoch": 0.558710747227692, + "grad_norm": 0.5855095982551575, + "learning_rate": 1.7179682321891648e-05, + "loss": 0.2202, + "step": 5391 + }, + { + "epoch": 0.5588143849103534, + "grad_norm": 0.5517435073852539, + "learning_rate": 1.717303604482574e-05, + "loss": 0.194, + "step": 5392 + }, + { + "epoch": 0.5589180225930148, + "grad_norm": 0.5487895607948303, + "learning_rate": 1.7166390086297727e-05, + "loss": 0.1985, + "step": 5393 + }, + { + "epoch": 0.5590216602756762, + "grad_norm": 0.5209961533546448, + "learning_rate": 1.715974444705648e-05, + "loss": 0.1841, + "step": 5394 + }, + { + "epoch": 0.5591252979583377, + "grad_norm": 0.5808537602424622, + "learning_rate": 1.7153099127850815e-05, + "loss": 0.229, + "step": 5395 + }, + { + "epoch": 0.5592289356409991, + "grad_norm": 0.5422407984733582, + "learning_rate": 1.7146454129429508e-05, + "loss": 0.1929, + "step": 5396 + }, + { + "epoch": 0.5593325733236605, + "grad_norm": 0.672896146774292, + "learning_rate": 1.7139809452541324e-05, + "loss": 0.2492, + "step": 5397 + }, + { + "epoch": 0.5594362110063219, + "grad_norm": 0.5069525837898254, + "learning_rate": 1.7133165097934957e-05, + "loss": 0.1646, + "step": 5398 + }, + { + "epoch": 0.5595398486889833, + "grad_norm": 0.7111231088638306, + "learning_rate": 1.7126521066359085e-05, + "loss": 0.1945, + "step": 5399 + }, + { + "epoch": 0.5596434863716447, + "grad_norm": 0.5727432370185852, + "learning_rate": 1.711987735856236e-05, + "loss": 0.2158, + "step": 5400 + }, + { + "epoch": 0.5597471240543062, + "grad_norm": 0.5655161738395691, + "learning_rate": 1.7113233975293377e-05, + "loss": 0.2227, + "step": 5401 + }, + { + "epoch": 0.5598507617369676, + "grad_norm": 0.4536275267601013, + "learning_rate": 1.7106590917300706e-05, + "loss": 0.1658, + "step": 5402 + }, + { + "epoch": 0.559954399419629, + "grad_norm": 0.5846953392028809, + "learning_rate": 1.7099948185332862e-05, + "loss": 0.1957, + "step": 5403 + }, + { + "epoch": 0.5600580371022904, + "grad_norm": 0.49920523166656494, + "learning_rate": 1.7093305780138352e-05, + "loss": 0.1929, + "step": 5404 + }, + { + "epoch": 0.5601616747849518, + "grad_norm": 0.5249096155166626, + "learning_rate": 1.7086663702465635e-05, + "loss": 0.1501, + "step": 5405 + }, + { + "epoch": 0.5602653124676132, + "grad_norm": 0.5586709380149841, + "learning_rate": 1.7080021953063112e-05, + "loss": 0.1983, + "step": 5406 + }, + { + "epoch": 0.5603689501502747, + "grad_norm": 0.569900393486023, + "learning_rate": 1.7073380532679187e-05, + "loss": 0.2374, + "step": 5407 + }, + { + "epoch": 0.5604725878329361, + "grad_norm": 0.6065160036087036, + "learning_rate": 1.706673944206219e-05, + "loss": 0.2185, + "step": 5408 + }, + { + "epoch": 0.5605762255155975, + "grad_norm": 0.5849648118019104, + "learning_rate": 1.7060098681960426e-05, + "loss": 0.1944, + "step": 5409 + }, + { + "epoch": 0.5606798631982589, + "grad_norm": 0.4929162263870239, + "learning_rate": 1.7053458253122183e-05, + "loss": 0.1848, + "step": 5410 + }, + { + "epoch": 0.5607835008809203, + "grad_norm": 0.503531277179718, + "learning_rate": 1.7046818156295678e-05, + "loss": 0.1887, + "step": 5411 + }, + { + "epoch": 0.5608871385635817, + "grad_norm": 0.6490597128868103, + "learning_rate": 1.7040178392229116e-05, + "loss": 0.2554, + "step": 5412 + }, + { + "epoch": 0.5609907762462432, + "grad_norm": 0.5584279894828796, + "learning_rate": 1.7033538961670647e-05, + "loss": 0.2067, + "step": 5413 + }, + { + "epoch": 0.5610944139289046, + "grad_norm": 0.478782057762146, + "learning_rate": 1.7026899865368397e-05, + "loss": 0.1619, + "step": 5414 + }, + { + "epoch": 0.561198051611566, + "grad_norm": 0.6137062907218933, + "learning_rate": 1.7020261104070453e-05, + "loss": 0.2265, + "step": 5415 + }, + { + "epoch": 0.5613016892942274, + "grad_norm": 0.49538955092430115, + "learning_rate": 1.701362267852485e-05, + "loss": 0.1764, + "step": 5416 + }, + { + "epoch": 0.5614053269768888, + "grad_norm": 0.5120887756347656, + "learning_rate": 1.7006984589479604e-05, + "loss": 0.2008, + "step": 5417 + }, + { + "epoch": 0.5615089646595502, + "grad_norm": 0.6444122791290283, + "learning_rate": 1.7000346837682682e-05, + "loss": 0.2251, + "step": 5418 + }, + { + "epoch": 0.5616126023422117, + "grad_norm": 0.5767812728881836, + "learning_rate": 1.699370942388201e-05, + "loss": 0.2123, + "step": 5419 + }, + { + "epoch": 0.5617162400248731, + "grad_norm": 0.4728251099586487, + "learning_rate": 1.6987072348825493e-05, + "loss": 0.1823, + "step": 5420 + }, + { + "epoch": 0.5618198777075345, + "grad_norm": 0.7038987874984741, + "learning_rate": 1.6980435613260978e-05, + "loss": 0.2367, + "step": 5421 + }, + { + "epoch": 0.5619235153901959, + "grad_norm": 0.6106839776039124, + "learning_rate": 1.697379921793629e-05, + "loss": 0.2225, + "step": 5422 + }, + { + "epoch": 0.5620271530728573, + "grad_norm": 0.5902169346809387, + "learning_rate": 1.696716316359919e-05, + "loss": 0.2348, + "step": 5423 + }, + { + "epoch": 0.5621307907555187, + "grad_norm": 0.5676903128623962, + "learning_rate": 1.6960527450997436e-05, + "loss": 0.1769, + "step": 5424 + }, + { + "epoch": 0.5622344284381802, + "grad_norm": 0.5985881090164185, + "learning_rate": 1.695389208087873e-05, + "loss": 0.2165, + "step": 5425 + }, + { + "epoch": 0.5623380661208416, + "grad_norm": 0.5767608880996704, + "learning_rate": 1.6947257053990722e-05, + "loss": 0.1941, + "step": 5426 + }, + { + "epoch": 0.562441703803503, + "grad_norm": 0.49438780546188354, + "learning_rate": 1.6940622371081047e-05, + "loss": 0.2138, + "step": 5427 + }, + { + "epoch": 0.5625453414861644, + "grad_norm": 0.5235041975975037, + "learning_rate": 1.693398803289728e-05, + "loss": 0.1915, + "step": 5428 + }, + { + "epoch": 0.5626489791688258, + "grad_norm": 0.43641653656959534, + "learning_rate": 1.692735404018698e-05, + "loss": 0.1612, + "step": 5429 + }, + { + "epoch": 0.5627526168514873, + "grad_norm": 0.4511115550994873, + "learning_rate": 1.6920720393697655e-05, + "loss": 0.163, + "step": 5430 + }, + { + "epoch": 0.5628562545341487, + "grad_norm": 0.5550640821456909, + "learning_rate": 1.6914087094176758e-05, + "loss": 0.204, + "step": 5431 + }, + { + "epoch": 0.5629598922168101, + "grad_norm": 0.5841968655586243, + "learning_rate": 1.6907454142371742e-05, + "loss": 0.2113, + "step": 5432 + }, + { + "epoch": 0.5630635298994714, + "grad_norm": 0.6049975156784058, + "learning_rate": 1.6900821539029982e-05, + "loss": 0.2142, + "step": 5433 + }, + { + "epoch": 0.5631671675821328, + "grad_norm": 0.5049476623535156, + "learning_rate": 1.6894189284898825e-05, + "loss": 0.1679, + "step": 5434 + }, + { + "epoch": 0.5632708052647942, + "grad_norm": 0.5981543064117432, + "learning_rate": 1.6887557380725602e-05, + "loss": 0.2469, + "step": 5435 + }, + { + "epoch": 0.5633744429474556, + "grad_norm": 0.663126528263092, + "learning_rate": 1.6880925827257572e-05, + "loss": 0.2443, + "step": 5436 + }, + { + "epoch": 0.5634780806301171, + "grad_norm": 0.5187920928001404, + "learning_rate": 1.6874294625241973e-05, + "loss": 0.195, + "step": 5437 + }, + { + "epoch": 0.5635817183127785, + "grad_norm": 0.5009486675262451, + "learning_rate": 1.686766377542599e-05, + "loss": 0.1967, + "step": 5438 + }, + { + "epoch": 0.5636853559954399, + "grad_norm": 0.5059811472892761, + "learning_rate": 1.6861033278556787e-05, + "loss": 0.2093, + "step": 5439 + }, + { + "epoch": 0.5637889936781013, + "grad_norm": 0.6117614507675171, + "learning_rate": 1.685440313538148e-05, + "loss": 0.2318, + "step": 5440 + }, + { + "epoch": 0.5638926313607627, + "grad_norm": 0.5326939225196838, + "learning_rate": 1.6847773346647123e-05, + "loss": 0.1856, + "step": 5441 + }, + { + "epoch": 0.5639962690434241, + "grad_norm": 0.5146560072898865, + "learning_rate": 1.684114391310078e-05, + "loss": 0.2088, + "step": 5442 + }, + { + "epoch": 0.5640999067260856, + "grad_norm": 0.45405811071395874, + "learning_rate": 1.6834514835489428e-05, + "loss": 0.1723, + "step": 5443 + }, + { + "epoch": 0.564203544408747, + "grad_norm": 0.5213394165039062, + "learning_rate": 1.6827886114560014e-05, + "loss": 0.1969, + "step": 5444 + }, + { + "epoch": 0.5643071820914084, + "grad_norm": 0.5760708451271057, + "learning_rate": 1.6821257751059473e-05, + "loss": 0.2074, + "step": 5445 + }, + { + "epoch": 0.5644108197740698, + "grad_norm": 0.5615007281303406, + "learning_rate": 1.681462974573466e-05, + "loss": 0.187, + "step": 5446 + }, + { + "epoch": 0.5645144574567312, + "grad_norm": 0.5178715586662292, + "learning_rate": 1.6808002099332422e-05, + "loss": 0.1816, + "step": 5447 + }, + { + "epoch": 0.5646180951393927, + "grad_norm": 0.5387221574783325, + "learning_rate": 1.6801374812599537e-05, + "loss": 0.1927, + "step": 5448 + }, + { + "epoch": 0.5647217328220541, + "grad_norm": 0.5163358449935913, + "learning_rate": 1.679474788628277e-05, + "loss": 0.1766, + "step": 5449 + }, + { + "epoch": 0.5648253705047155, + "grad_norm": 0.5913562178611755, + "learning_rate": 1.6788121321128832e-05, + "loss": 0.2392, + "step": 5450 + }, + { + "epoch": 0.5649290081873769, + "grad_norm": 0.49062827229499817, + "learning_rate": 1.6781495117884382e-05, + "loss": 0.1978, + "step": 5451 + }, + { + "epoch": 0.5650326458700383, + "grad_norm": 0.6228064894676208, + "learning_rate": 1.6774869277296065e-05, + "loss": 0.247, + "step": 5452 + }, + { + "epoch": 0.5651362835526997, + "grad_norm": 0.645623505115509, + "learning_rate": 1.6768243800110464e-05, + "loss": 0.2396, + "step": 5453 + }, + { + "epoch": 0.5652399212353612, + "grad_norm": 0.5212389230728149, + "learning_rate": 1.676161868707412e-05, + "loss": 0.2024, + "step": 5454 + }, + { + "epoch": 0.5653435589180226, + "grad_norm": 0.6496005654335022, + "learning_rate": 1.6754993938933564e-05, + "loss": 0.2437, + "step": 5455 + }, + { + "epoch": 0.565447196600684, + "grad_norm": 0.5416711568832397, + "learning_rate": 1.6748369556435234e-05, + "loss": 0.1886, + "step": 5456 + }, + { + "epoch": 0.5655508342833454, + "grad_norm": 0.5800312161445618, + "learning_rate": 1.6741745540325572e-05, + "loss": 0.2158, + "step": 5457 + }, + { + "epoch": 0.5656544719660068, + "grad_norm": 0.5040571093559265, + "learning_rate": 1.6735121891350957e-05, + "loss": 0.1883, + "step": 5458 + }, + { + "epoch": 0.5657581096486682, + "grad_norm": 0.5947726368904114, + "learning_rate": 1.672849861025773e-05, + "loss": 0.2403, + "step": 5459 + }, + { + "epoch": 0.5658617473313297, + "grad_norm": 0.5437408685684204, + "learning_rate": 1.6721875697792198e-05, + "loss": 0.211, + "step": 5460 + }, + { + "epoch": 0.5659653850139911, + "grad_norm": 0.6372669339179993, + "learning_rate": 1.6715253154700614e-05, + "loss": 0.2235, + "step": 5461 + }, + { + "epoch": 0.5660690226966525, + "grad_norm": 0.5936444401741028, + "learning_rate": 1.6708630981729194e-05, + "loss": 0.1839, + "step": 5462 + }, + { + "epoch": 0.5661726603793139, + "grad_norm": 0.5205757021903992, + "learning_rate": 1.6702009179624123e-05, + "loss": 0.1911, + "step": 5463 + }, + { + "epoch": 0.5662762980619753, + "grad_norm": 0.5825114846229553, + "learning_rate": 1.669538774913153e-05, + "loss": 0.2063, + "step": 5464 + }, + { + "epoch": 0.5663799357446367, + "grad_norm": 0.580242931842804, + "learning_rate": 1.6688766690997514e-05, + "loss": 0.2016, + "step": 5465 + }, + { + "epoch": 0.5664835734272982, + "grad_norm": 0.508276104927063, + "learning_rate": 1.668214600596811e-05, + "loss": 0.1754, + "step": 5466 + }, + { + "epoch": 0.5665872111099596, + "grad_norm": 0.566452145576477, + "learning_rate": 1.667552569478934e-05, + "loss": 0.2086, + "step": 5467 + }, + { + "epoch": 0.566690848792621, + "grad_norm": 0.5992406606674194, + "learning_rate": 1.6668905758207173e-05, + "loss": 0.2269, + "step": 5468 + }, + { + "epoch": 0.5667944864752824, + "grad_norm": 0.5830585360527039, + "learning_rate": 1.6662286196967517e-05, + "loss": 0.207, + "step": 5469 + }, + { + "epoch": 0.5668981241579438, + "grad_norm": 0.5889588594436646, + "learning_rate": 1.665566701181627e-05, + "loss": 0.2168, + "step": 5470 + }, + { + "epoch": 0.5670017618406052, + "grad_norm": 0.5479435920715332, + "learning_rate": 1.6649048203499263e-05, + "loss": 0.2006, + "step": 5471 + }, + { + "epoch": 0.5671053995232667, + "grad_norm": 0.5586081147193909, + "learning_rate": 1.664242977276229e-05, + "loss": 0.2087, + "step": 5472 + }, + { + "epoch": 0.5672090372059281, + "grad_norm": 0.5428041219711304, + "learning_rate": 1.6635811720351124e-05, + "loss": 0.2113, + "step": 5473 + }, + { + "epoch": 0.5673126748885895, + "grad_norm": 0.523199737071991, + "learning_rate": 1.6629194047011454e-05, + "loss": 0.184, + "step": 5474 + }, + { + "epoch": 0.5674163125712509, + "grad_norm": 0.5387806296348572, + "learning_rate": 1.6622576753488963e-05, + "loss": 0.2036, + "step": 5475 + }, + { + "epoch": 0.5675199502539123, + "grad_norm": 0.5509522557258606, + "learning_rate": 1.661595984052927e-05, + "loss": 0.2163, + "step": 5476 + }, + { + "epoch": 0.5676235879365737, + "grad_norm": 0.4755285382270813, + "learning_rate": 1.660934330887796e-05, + "loss": 0.1716, + "step": 5477 + }, + { + "epoch": 0.5677272256192352, + "grad_norm": 0.5920076370239258, + "learning_rate": 1.660272715928058e-05, + "loss": 0.2038, + "step": 5478 + }, + { + "epoch": 0.5678308633018966, + "grad_norm": 0.5493197441101074, + "learning_rate": 1.6596111392482618e-05, + "loss": 0.2052, + "step": 5479 + }, + { + "epoch": 0.567934500984558, + "grad_norm": 0.6421695947647095, + "learning_rate": 1.658949600922954e-05, + "loss": 0.2539, + "step": 5480 + }, + { + "epoch": 0.5680381386672194, + "grad_norm": 0.6136369109153748, + "learning_rate": 1.658288101026674e-05, + "loss": 0.2309, + "step": 5481 + }, + { + "epoch": 0.5681417763498808, + "grad_norm": 0.563614547252655, + "learning_rate": 1.6576266396339597e-05, + "loss": 0.1861, + "step": 5482 + }, + { + "epoch": 0.5682454140325423, + "grad_norm": 0.5318019390106201, + "learning_rate": 1.6569652168193442e-05, + "loss": 0.1826, + "step": 5483 + }, + { + "epoch": 0.5683490517152037, + "grad_norm": 0.5460665225982666, + "learning_rate": 1.6563038326573544e-05, + "loss": 0.1832, + "step": 5484 + }, + { + "epoch": 0.5684526893978651, + "grad_norm": 0.6247243285179138, + "learning_rate": 1.655642487222515e-05, + "loss": 0.2252, + "step": 5485 + }, + { + "epoch": 0.5685563270805265, + "grad_norm": 0.671765148639679, + "learning_rate": 1.6549811805893437e-05, + "loss": 0.235, + "step": 5486 + }, + { + "epoch": 0.5686599647631879, + "grad_norm": 0.5407508015632629, + "learning_rate": 1.654319912832357e-05, + "loss": 0.2001, + "step": 5487 + }, + { + "epoch": 0.5687636024458493, + "grad_norm": 0.5194950103759766, + "learning_rate": 1.6536586840260657e-05, + "loss": 0.1991, + "step": 5488 + }, + { + "epoch": 0.5688672401285108, + "grad_norm": 0.581840991973877, + "learning_rate": 1.652997494244975e-05, + "loss": 0.1809, + "step": 5489 + }, + { + "epoch": 0.5689708778111722, + "grad_norm": 0.5414415597915649, + "learning_rate": 1.652336343563588e-05, + "loss": 0.2264, + "step": 5490 + }, + { + "epoch": 0.5690745154938336, + "grad_norm": 0.6323450207710266, + "learning_rate": 1.6516752320564003e-05, + "loss": 0.2452, + "step": 5491 + }, + { + "epoch": 0.569178153176495, + "grad_norm": 0.6471281051635742, + "learning_rate": 1.6510141597979062e-05, + "loss": 0.2658, + "step": 5492 + }, + { + "epoch": 0.5692817908591564, + "grad_norm": 0.5367169976234436, + "learning_rate": 1.650353126862595e-05, + "loss": 0.2028, + "step": 5493 + }, + { + "epoch": 0.5693854285418178, + "grad_norm": 0.47116124629974365, + "learning_rate": 1.6496921333249486e-05, + "loss": 0.158, + "step": 5494 + }, + { + "epoch": 0.5694890662244793, + "grad_norm": 0.6376896500587463, + "learning_rate": 1.6490311792594486e-05, + "loss": 0.2416, + "step": 5495 + }, + { + "epoch": 0.5695927039071407, + "grad_norm": 0.5647479295730591, + "learning_rate": 1.64837026474057e-05, + "loss": 0.1938, + "step": 5496 + }, + { + "epoch": 0.5696963415898021, + "grad_norm": 0.5366935729980469, + "learning_rate": 1.6477093898427826e-05, + "loss": 0.2007, + "step": 5497 + }, + { + "epoch": 0.5697999792724635, + "grad_norm": 0.5869206786155701, + "learning_rate": 1.6470485546405545e-05, + "loss": 0.1958, + "step": 5498 + }, + { + "epoch": 0.5699036169551249, + "grad_norm": 0.6000187993049622, + "learning_rate": 1.646387759208346e-05, + "loss": 0.2337, + "step": 5499 + }, + { + "epoch": 0.5700072546377863, + "grad_norm": 0.6028838753700256, + "learning_rate": 1.6457270036206153e-05, + "loss": 0.2222, + "step": 5500 + }, + { + "epoch": 0.5701108923204478, + "grad_norm": 0.5079191327095032, + "learning_rate": 1.6450662879518146e-05, + "loss": 0.1862, + "step": 5501 + }, + { + "epoch": 0.5702145300031092, + "grad_norm": 0.5666100978851318, + "learning_rate": 1.644405612276393e-05, + "loss": 0.2222, + "step": 5502 + }, + { + "epoch": 0.5703181676857706, + "grad_norm": 0.5046568512916565, + "learning_rate": 1.6437449766687952e-05, + "loss": 0.1666, + "step": 5503 + }, + { + "epoch": 0.570421805368432, + "grad_norm": 0.5350242853164673, + "learning_rate": 1.6430843812034582e-05, + "loss": 0.1935, + "step": 5504 + }, + { + "epoch": 0.5705254430510934, + "grad_norm": 0.5835912823677063, + "learning_rate": 1.6424238259548197e-05, + "loss": 0.2177, + "step": 5505 + }, + { + "epoch": 0.5706290807337548, + "grad_norm": 0.6207568049430847, + "learning_rate": 1.6417633109973076e-05, + "loss": 0.2086, + "step": 5506 + }, + { + "epoch": 0.5707327184164163, + "grad_norm": 0.4970071017742157, + "learning_rate": 1.6411028364053486e-05, + "loss": 0.1772, + "step": 5507 + }, + { + "epoch": 0.5708363560990777, + "grad_norm": 0.484801709651947, + "learning_rate": 1.640442402253365e-05, + "loss": 0.1792, + "step": 5508 + }, + { + "epoch": 0.570939993781739, + "grad_norm": 0.497266948223114, + "learning_rate": 1.639782008615772e-05, + "loss": 0.1754, + "step": 5509 + }, + { + "epoch": 0.5710436314644004, + "grad_norm": 0.696564257144928, + "learning_rate": 1.6391216555669826e-05, + "loss": 0.2451, + "step": 5510 + }, + { + "epoch": 0.5711472691470618, + "grad_norm": 0.5477309823036194, + "learning_rate": 1.6384613431814033e-05, + "loss": 0.1897, + "step": 5511 + }, + { + "epoch": 0.5712509068297232, + "grad_norm": 0.4967729151248932, + "learning_rate": 1.6378010715334383e-05, + "loss": 0.1624, + "step": 5512 + }, + { + "epoch": 0.5713545445123847, + "grad_norm": 0.6100444197654724, + "learning_rate": 1.637140840697486e-05, + "loss": 0.2225, + "step": 5513 + }, + { + "epoch": 0.5714581821950461, + "grad_norm": 0.5622698664665222, + "learning_rate": 1.6364806507479386e-05, + "loss": 0.1817, + "step": 5514 + }, + { + "epoch": 0.5715618198777075, + "grad_norm": 0.5402734875679016, + "learning_rate": 1.6358205017591874e-05, + "loss": 0.1982, + "step": 5515 + }, + { + "epoch": 0.5716654575603689, + "grad_norm": 0.6594802737236023, + "learning_rate": 1.6351603938056157e-05, + "loss": 0.2517, + "step": 5516 + }, + { + "epoch": 0.5717690952430303, + "grad_norm": 0.5295108556747437, + "learning_rate": 1.634500326961603e-05, + "loss": 0.2083, + "step": 5517 + }, + { + "epoch": 0.5718727329256917, + "grad_norm": 0.5516922473907471, + "learning_rate": 1.633840301301527e-05, + "loss": 0.1915, + "step": 5518 + }, + { + "epoch": 0.5719763706083532, + "grad_norm": 0.6079210638999939, + "learning_rate": 1.6331803168997547e-05, + "loss": 0.2179, + "step": 5519 + }, + { + "epoch": 0.5720800082910146, + "grad_norm": 0.5783078670501709, + "learning_rate": 1.632520373830655e-05, + "loss": 0.2124, + "step": 5520 + }, + { + "epoch": 0.572183645973676, + "grad_norm": 0.6089596748352051, + "learning_rate": 1.6318604721685882e-05, + "loss": 0.1942, + "step": 5521 + }, + { + "epoch": 0.5722872836563374, + "grad_norm": 0.6430178880691528, + "learning_rate": 1.6312006119879105e-05, + "loss": 0.2204, + "step": 5522 + }, + { + "epoch": 0.5723909213389988, + "grad_norm": 0.5460180044174194, + "learning_rate": 1.6305407933629754e-05, + "loss": 0.1975, + "step": 5523 + }, + { + "epoch": 0.5724945590216602, + "grad_norm": 0.571719229221344, + "learning_rate": 1.6298810163681292e-05, + "loss": 0.1972, + "step": 5524 + }, + { + "epoch": 0.5725981967043217, + "grad_norm": 0.5407332181930542, + "learning_rate": 1.6292212810777148e-05, + "loss": 0.1786, + "step": 5525 + }, + { + "epoch": 0.5727018343869831, + "grad_norm": 0.5866890549659729, + "learning_rate": 1.6285615875660696e-05, + "loss": 0.2088, + "step": 5526 + }, + { + "epoch": 0.5728054720696445, + "grad_norm": 0.6215521693229675, + "learning_rate": 1.627901935907527e-05, + "loss": 0.1896, + "step": 5527 + }, + { + "epoch": 0.5729091097523059, + "grad_norm": 0.4777623414993286, + "learning_rate": 1.627242326176417e-05, + "loss": 0.1722, + "step": 5528 + }, + { + "epoch": 0.5730127474349673, + "grad_norm": 0.4880693852901459, + "learning_rate": 1.626582758447061e-05, + "loss": 0.1898, + "step": 5529 + }, + { + "epoch": 0.5731163851176287, + "grad_norm": 0.6543561220169067, + "learning_rate": 1.62592323279378e-05, + "loss": 0.2323, + "step": 5530 + }, + { + "epoch": 0.5732200228002902, + "grad_norm": 0.6428809762001038, + "learning_rate": 1.6252637492908877e-05, + "loss": 0.2266, + "step": 5531 + }, + { + "epoch": 0.5733236604829516, + "grad_norm": 0.5870211124420166, + "learning_rate": 1.624604308012693e-05, + "loss": 0.1892, + "step": 5532 + }, + { + "epoch": 0.573427298165613, + "grad_norm": 0.5597742795944214, + "learning_rate": 1.623944909033502e-05, + "loss": 0.1971, + "step": 5533 + }, + { + "epoch": 0.5735309358482744, + "grad_norm": 0.6064586639404297, + "learning_rate": 1.6232855524276137e-05, + "loss": 0.2119, + "step": 5534 + }, + { + "epoch": 0.5736345735309358, + "grad_norm": 0.6230360865592957, + "learning_rate": 1.622626238269324e-05, + "loss": 0.236, + "step": 5535 + }, + { + "epoch": 0.5737382112135972, + "grad_norm": 0.6390331983566284, + "learning_rate": 1.6219669666329224e-05, + "loss": 0.2296, + "step": 5536 + }, + { + "epoch": 0.5738418488962587, + "grad_norm": 0.551810622215271, + "learning_rate": 1.6213077375926957e-05, + "loss": 0.1991, + "step": 5537 + }, + { + "epoch": 0.5739454865789201, + "grad_norm": 0.5981395244598389, + "learning_rate": 1.620648551222925e-05, + "loss": 0.2252, + "step": 5538 + }, + { + "epoch": 0.5740491242615815, + "grad_norm": 0.5947065353393555, + "learning_rate": 1.619989407597885e-05, + "loss": 0.2526, + "step": 5539 + }, + { + "epoch": 0.5741527619442429, + "grad_norm": 0.4955461323261261, + "learning_rate": 1.6193303067918488e-05, + "loss": 0.1816, + "step": 5540 + }, + { + "epoch": 0.5742563996269043, + "grad_norm": 0.579858124256134, + "learning_rate": 1.618671248879081e-05, + "loss": 0.1989, + "step": 5541 + }, + { + "epoch": 0.5743600373095658, + "grad_norm": 0.49859556555747986, + "learning_rate": 1.618012233933844e-05, + "loss": 0.1659, + "step": 5542 + }, + { + "epoch": 0.5744636749922272, + "grad_norm": 0.5783235430717468, + "learning_rate": 1.6173532620303953e-05, + "loss": 0.2121, + "step": 5543 + }, + { + "epoch": 0.5745673126748886, + "grad_norm": 0.6743443608283997, + "learning_rate": 1.616694333242986e-05, + "loss": 0.2295, + "step": 5544 + }, + { + "epoch": 0.57467095035755, + "grad_norm": 0.5629667639732361, + "learning_rate": 1.6160354476458638e-05, + "loss": 0.2156, + "step": 5545 + }, + { + "epoch": 0.5747745880402114, + "grad_norm": 0.5217868089675903, + "learning_rate": 1.6153766053132694e-05, + "loss": 0.2063, + "step": 5546 + }, + { + "epoch": 0.5748782257228728, + "grad_norm": 0.5237048864364624, + "learning_rate": 1.6147178063194418e-05, + "loss": 0.1993, + "step": 5547 + }, + { + "epoch": 0.5749818634055343, + "grad_norm": 0.48419585824012756, + "learning_rate": 1.614059050738613e-05, + "loss": 0.1416, + "step": 5548 + }, + { + "epoch": 0.5750855010881957, + "grad_norm": 0.6031510829925537, + "learning_rate": 1.61340033864501e-05, + "loss": 0.221, + "step": 5549 + }, + { + "epoch": 0.5751891387708571, + "grad_norm": 0.5714973211288452, + "learning_rate": 1.6127416701128572e-05, + "loss": 0.1976, + "step": 5550 + }, + { + "epoch": 0.5752927764535185, + "grad_norm": 0.7110316157341003, + "learning_rate": 1.6120830452163692e-05, + "loss": 0.2463, + "step": 5551 + }, + { + "epoch": 0.5753964141361799, + "grad_norm": 0.6350976824760437, + "learning_rate": 1.6114244640297615e-05, + "loss": 0.2138, + "step": 5552 + }, + { + "epoch": 0.5755000518188413, + "grad_norm": 0.6358919739723206, + "learning_rate": 1.6107659266272413e-05, + "loss": 0.197, + "step": 5553 + }, + { + "epoch": 0.5756036895015028, + "grad_norm": 0.5224093794822693, + "learning_rate": 1.6101074330830106e-05, + "loss": 0.1934, + "step": 5554 + }, + { + "epoch": 0.5757073271841642, + "grad_norm": 0.5277766585350037, + "learning_rate": 1.609448983471269e-05, + "loss": 0.1736, + "step": 5555 + }, + { + "epoch": 0.5758109648668256, + "grad_norm": 0.5051241517066956, + "learning_rate": 1.6087905778662087e-05, + "loss": 0.1814, + "step": 5556 + }, + { + "epoch": 0.575914602549487, + "grad_norm": 0.5881791114807129, + "learning_rate": 1.6081322163420172e-05, + "loss": 0.1983, + "step": 5557 + }, + { + "epoch": 0.5760182402321484, + "grad_norm": 0.613420844078064, + "learning_rate": 1.6074738989728794e-05, + "loss": 0.2434, + "step": 5558 + }, + { + "epoch": 0.5761218779148098, + "grad_norm": 0.6119860410690308, + "learning_rate": 1.606815625832972e-05, + "loss": 0.2256, + "step": 5559 + }, + { + "epoch": 0.5762255155974713, + "grad_norm": 0.6397984623908997, + "learning_rate": 1.6061573969964694e-05, + "loss": 0.2369, + "step": 5560 + }, + { + "epoch": 0.5763291532801327, + "grad_norm": 0.6807885766029358, + "learning_rate": 1.6054992125375377e-05, + "loss": 0.2651, + "step": 5561 + }, + { + "epoch": 0.5764327909627941, + "grad_norm": 0.5684066414833069, + "learning_rate": 1.6048410725303424e-05, + "loss": 0.1837, + "step": 5562 + }, + { + "epoch": 0.5765364286454555, + "grad_norm": 0.5673686861991882, + "learning_rate": 1.6041829770490407e-05, + "loss": 0.1845, + "step": 5563 + }, + { + "epoch": 0.5766400663281169, + "grad_norm": 0.6042220592498779, + "learning_rate": 1.6035249261677852e-05, + "loss": 0.2123, + "step": 5564 + }, + { + "epoch": 0.5767437040107783, + "grad_norm": 0.6854713559150696, + "learning_rate": 1.6028669199607258e-05, + "loss": 0.211, + "step": 5565 + }, + { + "epoch": 0.5768473416934398, + "grad_norm": 0.48943090438842773, + "learning_rate": 1.6022089585020036e-05, + "loss": 0.16, + "step": 5566 + }, + { + "epoch": 0.5769509793761012, + "grad_norm": 0.5919265151023865, + "learning_rate": 1.6015510418657574e-05, + "loss": 0.2261, + "step": 5567 + }, + { + "epoch": 0.5770546170587626, + "grad_norm": 0.5791772603988647, + "learning_rate": 1.600893170126121e-05, + "loss": 0.2395, + "step": 5568 + }, + { + "epoch": 0.577158254741424, + "grad_norm": 0.6177813410758972, + "learning_rate": 1.6002353433572214e-05, + "loss": 0.2158, + "step": 5569 + }, + { + "epoch": 0.5772618924240854, + "grad_norm": 0.5023506283760071, + "learning_rate": 1.5995775616331827e-05, + "loss": 0.1814, + "step": 5570 + }, + { + "epoch": 0.5773655301067468, + "grad_norm": 0.594092071056366, + "learning_rate": 1.5989198250281208e-05, + "loss": 0.2052, + "step": 5571 + }, + { + "epoch": 0.5774691677894083, + "grad_norm": 0.5519578456878662, + "learning_rate": 1.5982621336161497e-05, + "loss": 0.1876, + "step": 5572 + }, + { + "epoch": 0.5775728054720697, + "grad_norm": 0.6383631229400635, + "learning_rate": 1.5976044874713775e-05, + "loss": 0.2333, + "step": 5573 + }, + { + "epoch": 0.5776764431547311, + "grad_norm": 0.5924476385116577, + "learning_rate": 1.5969468866679056e-05, + "loss": 0.2161, + "step": 5574 + }, + { + "epoch": 0.5777800808373925, + "grad_norm": 0.5061485767364502, + "learning_rate": 1.5962893312798324e-05, + "loss": 0.1701, + "step": 5575 + }, + { + "epoch": 0.5778837185200539, + "grad_norm": 0.5206407308578491, + "learning_rate": 1.5956318213812493e-05, + "loss": 0.1783, + "step": 5576 + }, + { + "epoch": 0.5779873562027154, + "grad_norm": 0.5351812839508057, + "learning_rate": 1.5949743570462438e-05, + "loss": 0.183, + "step": 5577 + }, + { + "epoch": 0.5780909938853768, + "grad_norm": 0.6140007972717285, + "learning_rate": 1.594316938348899e-05, + "loss": 0.2215, + "step": 5578 + }, + { + "epoch": 0.5781946315680382, + "grad_norm": 0.6508222222328186, + "learning_rate": 1.59365956536329e-05, + "loss": 0.2327, + "step": 5579 + }, + { + "epoch": 0.5782982692506996, + "grad_norm": 0.46037814021110535, + "learning_rate": 1.5930022381634908e-05, + "loss": 0.1498, + "step": 5580 + }, + { + "epoch": 0.578401906933361, + "grad_norm": 0.5092030763626099, + "learning_rate": 1.5923449568235656e-05, + "loss": 0.1948, + "step": 5581 + }, + { + "epoch": 0.5785055446160224, + "grad_norm": 0.5261053442955017, + "learning_rate": 1.5916877214175768e-05, + "loss": 0.1641, + "step": 5582 + }, + { + "epoch": 0.5786091822986839, + "grad_norm": 0.5427508354187012, + "learning_rate": 1.5910305320195814e-05, + "loss": 0.209, + "step": 5583 + }, + { + "epoch": 0.5787128199813453, + "grad_norm": 0.5705498456954956, + "learning_rate": 1.59037338870363e-05, + "loss": 0.2181, + "step": 5584 + }, + { + "epoch": 0.5788164576640066, + "grad_norm": 0.5972639918327332, + "learning_rate": 1.589716291543768e-05, + "loss": 0.2035, + "step": 5585 + }, + { + "epoch": 0.578920095346668, + "grad_norm": 0.6710713505744934, + "learning_rate": 1.5890592406140363e-05, + "loss": 0.261, + "step": 5586 + }, + { + "epoch": 0.5790237330293294, + "grad_norm": 0.45358237624168396, + "learning_rate": 1.588402235988471e-05, + "loss": 0.1604, + "step": 5587 + }, + { + "epoch": 0.5791273707119908, + "grad_norm": 0.5760126113891602, + "learning_rate": 1.5877452777411017e-05, + "loss": 0.1777, + "step": 5588 + }, + { + "epoch": 0.5792310083946522, + "grad_norm": 0.6019428968429565, + "learning_rate": 1.587088365945953e-05, + "loss": 0.2076, + "step": 5589 + }, + { + "epoch": 0.5793346460773137, + "grad_norm": 0.6227850317955017, + "learning_rate": 1.5864315006770467e-05, + "loss": 0.221, + "step": 5590 + }, + { + "epoch": 0.5794382837599751, + "grad_norm": 0.6790367960929871, + "learning_rate": 1.5857746820083952e-05, + "loss": 0.2219, + "step": 5591 + }, + { + "epoch": 0.5795419214426365, + "grad_norm": 0.603782594203949, + "learning_rate": 1.5851179100140085e-05, + "loss": 0.2325, + "step": 5592 + }, + { + "epoch": 0.5796455591252979, + "grad_norm": 0.5080618858337402, + "learning_rate": 1.5844611847678912e-05, + "loss": 0.1768, + "step": 5593 + }, + { + "epoch": 0.5797491968079593, + "grad_norm": 0.6221045851707458, + "learning_rate": 1.5838045063440413e-05, + "loss": 0.2104, + "step": 5594 + }, + { + "epoch": 0.5798528344906207, + "grad_norm": 0.45891180634498596, + "learning_rate": 1.583147874816453e-05, + "loss": 0.1379, + "step": 5595 + }, + { + "epoch": 0.5799564721732822, + "grad_norm": 0.4983937740325928, + "learning_rate": 1.5824912902591134e-05, + "loss": 0.1699, + "step": 5596 + }, + { + "epoch": 0.5800601098559436, + "grad_norm": 0.6115486025810242, + "learning_rate": 1.5818347527460067e-05, + "loss": 0.2125, + "step": 5597 + }, + { + "epoch": 0.580163747538605, + "grad_norm": 0.6062911152839661, + "learning_rate": 1.5811782623511104e-05, + "loss": 0.2391, + "step": 5598 + }, + { + "epoch": 0.5802673852212664, + "grad_norm": 0.5856295228004456, + "learning_rate": 1.5805218191483956e-05, + "loss": 0.1943, + "step": 5599 + }, + { + "epoch": 0.5803710229039278, + "grad_norm": 0.6322320699691772, + "learning_rate": 1.579865423211831e-05, + "loss": 0.2269, + "step": 5600 + }, + { + "epoch": 0.5804746605865893, + "grad_norm": 0.5917003154754639, + "learning_rate": 1.5792090746153766e-05, + "loss": 0.2041, + "step": 5601 + }, + { + "epoch": 0.5805782982692507, + "grad_norm": 0.6539716124534607, + "learning_rate": 1.5785527734329895e-05, + "loss": 0.2067, + "step": 5602 + }, + { + "epoch": 0.5806819359519121, + "grad_norm": 0.584199845790863, + "learning_rate": 1.5778965197386216e-05, + "loss": 0.232, + "step": 5603 + }, + { + "epoch": 0.5807855736345735, + "grad_norm": 0.6193050146102905, + "learning_rate": 1.5772403136062172e-05, + "loss": 0.2086, + "step": 5604 + }, + { + "epoch": 0.5808892113172349, + "grad_norm": 0.6499961018562317, + "learning_rate": 1.5765841551097172e-05, + "loss": 0.2251, + "step": 5605 + }, + { + "epoch": 0.5809928489998963, + "grad_norm": 0.5342284440994263, + "learning_rate": 1.5759280443230558e-05, + "loss": 0.1961, + "step": 5606 + }, + { + "epoch": 0.5810964866825578, + "grad_norm": 0.5079427361488342, + "learning_rate": 1.5752719813201636e-05, + "loss": 0.1861, + "step": 5607 + }, + { + "epoch": 0.5812001243652192, + "grad_norm": 0.6445949077606201, + "learning_rate": 1.5746159661749646e-05, + "loss": 0.2146, + "step": 5608 + }, + { + "epoch": 0.5813037620478806, + "grad_norm": 0.574851930141449, + "learning_rate": 1.5739599989613764e-05, + "loss": 0.2117, + "step": 5609 + }, + { + "epoch": 0.581407399730542, + "grad_norm": 0.5927166938781738, + "learning_rate": 1.5733040797533148e-05, + "loss": 0.2057, + "step": 5610 + }, + { + "epoch": 0.5815110374132034, + "grad_norm": 0.5667980313301086, + "learning_rate": 1.5726482086246846e-05, + "loss": 0.2046, + "step": 5611 + }, + { + "epoch": 0.5816146750958648, + "grad_norm": 0.5506219863891602, + "learning_rate": 1.57199238564939e-05, + "loss": 0.2038, + "step": 5612 + }, + { + "epoch": 0.5817183127785263, + "grad_norm": 0.5867335200309753, + "learning_rate": 1.5713366109013294e-05, + "loss": 0.178, + "step": 5613 + }, + { + "epoch": 0.5818219504611877, + "grad_norm": 0.5505114793777466, + "learning_rate": 1.5706808844543916e-05, + "loss": 0.1969, + "step": 5614 + }, + { + "epoch": 0.5819255881438491, + "grad_norm": 0.6018428206443787, + "learning_rate": 1.5700252063824652e-05, + "loss": 0.2279, + "step": 5615 + }, + { + "epoch": 0.5820292258265105, + "grad_norm": 0.5699183344841003, + "learning_rate": 1.5693695767594303e-05, + "loss": 0.1997, + "step": 5616 + }, + { + "epoch": 0.5821328635091719, + "grad_norm": 0.5802676677703857, + "learning_rate": 1.5687139956591608e-05, + "loss": 0.2091, + "step": 5617 + }, + { + "epoch": 0.5822365011918333, + "grad_norm": 0.5912351012229919, + "learning_rate": 1.568058463155529e-05, + "loss": 0.1978, + "step": 5618 + }, + { + "epoch": 0.5823401388744948, + "grad_norm": 0.5182409882545471, + "learning_rate": 1.567402979322398e-05, + "loss": 0.1968, + "step": 5619 + }, + { + "epoch": 0.5824437765571562, + "grad_norm": 0.553266704082489, + "learning_rate": 1.566747544233627e-05, + "loss": 0.1964, + "step": 5620 + }, + { + "epoch": 0.5825474142398176, + "grad_norm": 0.6378732323646545, + "learning_rate": 1.5660921579630682e-05, + "loss": 0.2161, + "step": 5621 + }, + { + "epoch": 0.582651051922479, + "grad_norm": 0.6760129928588867, + "learning_rate": 1.5654368205845713e-05, + "loss": 0.2399, + "step": 5622 + }, + { + "epoch": 0.5827546896051404, + "grad_norm": 0.6148195862770081, + "learning_rate": 1.564781532171978e-05, + "loss": 0.2071, + "step": 5623 + }, + { + "epoch": 0.5828583272878018, + "grad_norm": 0.6314781904220581, + "learning_rate": 1.5641262927991243e-05, + "loss": 0.212, + "step": 5624 + }, + { + "epoch": 0.5829619649704633, + "grad_norm": 0.5955851674079895, + "learning_rate": 1.5634711025398433e-05, + "loss": 0.1934, + "step": 5625 + }, + { + "epoch": 0.5830656026531247, + "grad_norm": 0.6427017450332642, + "learning_rate": 1.562815961467959e-05, + "loss": 0.2414, + "step": 5626 + }, + { + "epoch": 0.5831692403357861, + "grad_norm": 0.5245332717895508, + "learning_rate": 1.562160869657293e-05, + "loss": 0.1862, + "step": 5627 + }, + { + "epoch": 0.5832728780184475, + "grad_norm": 0.570083737373352, + "learning_rate": 1.5615058271816597e-05, + "loss": 0.2063, + "step": 5628 + }, + { + "epoch": 0.5833765157011089, + "grad_norm": 0.5223228335380554, + "learning_rate": 1.560850834114868e-05, + "loss": 0.196, + "step": 5629 + }, + { + "epoch": 0.5834801533837704, + "grad_norm": 0.5994619131088257, + "learning_rate": 1.560195890530722e-05, + "loss": 0.2046, + "step": 5630 + }, + { + "epoch": 0.5835837910664318, + "grad_norm": 0.5086055397987366, + "learning_rate": 1.5595409965030188e-05, + "loss": 0.174, + "step": 5631 + }, + { + "epoch": 0.5836874287490932, + "grad_norm": 0.5901955366134644, + "learning_rate": 1.5588861521055515e-05, + "loss": 0.2171, + "step": 5632 + }, + { + "epoch": 0.5837910664317546, + "grad_norm": 0.5532594323158264, + "learning_rate": 1.5582313574121073e-05, + "loss": 0.1983, + "step": 5633 + }, + { + "epoch": 0.583894704114416, + "grad_norm": 0.5771147012710571, + "learning_rate": 1.5575766124964662e-05, + "loss": 0.216, + "step": 5634 + }, + { + "epoch": 0.5839983417970774, + "grad_norm": 0.5754862427711487, + "learning_rate": 1.5569219174324055e-05, + "loss": 0.2245, + "step": 5635 + }, + { + "epoch": 0.5841019794797389, + "grad_norm": 0.5546948313713074, + "learning_rate": 1.556267272293694e-05, + "loss": 0.1975, + "step": 5636 + }, + { + "epoch": 0.5842056171624003, + "grad_norm": 0.5417519211769104, + "learning_rate": 1.555612677154096e-05, + "loss": 0.2128, + "step": 5637 + }, + { + "epoch": 0.5843092548450617, + "grad_norm": 0.4762413799762726, + "learning_rate": 1.5549581320873715e-05, + "loss": 0.1572, + "step": 5638 + }, + { + "epoch": 0.5844128925277231, + "grad_norm": 0.5316885113716125, + "learning_rate": 1.5543036371672723e-05, + "loss": 0.1968, + "step": 5639 + }, + { + "epoch": 0.5845165302103845, + "grad_norm": 0.5961049199104309, + "learning_rate": 1.553649192467547e-05, + "loss": 0.2024, + "step": 5640 + }, + { + "epoch": 0.5846201678930459, + "grad_norm": 0.5178672671318054, + "learning_rate": 1.552994798061936e-05, + "loss": 0.1809, + "step": 5641 + }, + { + "epoch": 0.5847238055757074, + "grad_norm": 0.5495697259902954, + "learning_rate": 1.552340454024177e-05, + "loss": 0.191, + "step": 5642 + }, + { + "epoch": 0.5848274432583688, + "grad_norm": 0.5312830209732056, + "learning_rate": 1.5516861604279997e-05, + "loss": 0.1858, + "step": 5643 + }, + { + "epoch": 0.5849310809410302, + "grad_norm": 0.5942406058311462, + "learning_rate": 1.5510319173471285e-05, + "loss": 0.2073, + "step": 5644 + }, + { + "epoch": 0.5850347186236916, + "grad_norm": 0.5381253957748413, + "learning_rate": 1.5503777248552836e-05, + "loss": 0.1863, + "step": 5645 + }, + { + "epoch": 0.585138356306353, + "grad_norm": 0.6405827403068542, + "learning_rate": 1.5497235830261766e-05, + "loss": 0.1973, + "step": 5646 + }, + { + "epoch": 0.5852419939890144, + "grad_norm": 0.5816875696182251, + "learning_rate": 1.5490694919335172e-05, + "loss": 0.2108, + "step": 5647 + }, + { + "epoch": 0.5853456316716759, + "grad_norm": 0.43517184257507324, + "learning_rate": 1.5484154516510063e-05, + "loss": 0.1403, + "step": 5648 + }, + { + "epoch": 0.5854492693543373, + "grad_norm": 0.620516836643219, + "learning_rate": 1.54776146225234e-05, + "loss": 0.1912, + "step": 5649 + }, + { + "epoch": 0.5855529070369987, + "grad_norm": 0.5462132096290588, + "learning_rate": 1.5471075238112098e-05, + "loss": 0.1708, + "step": 5650 + }, + { + "epoch": 0.5856565447196601, + "grad_norm": 0.603030800819397, + "learning_rate": 1.546453636401299e-05, + "loss": 0.2087, + "step": 5651 + }, + { + "epoch": 0.5857601824023215, + "grad_norm": 0.5474585294723511, + "learning_rate": 1.545799800096287e-05, + "loss": 0.2027, + "step": 5652 + }, + { + "epoch": 0.585863820084983, + "grad_norm": 0.6245854496955872, + "learning_rate": 1.545146014969849e-05, + "loss": 0.2468, + "step": 5653 + }, + { + "epoch": 0.5859674577676444, + "grad_norm": 0.5252482295036316, + "learning_rate": 1.5444922810956498e-05, + "loss": 0.1801, + "step": 5654 + }, + { + "epoch": 0.5860710954503058, + "grad_norm": 0.6183892488479614, + "learning_rate": 1.5438385985473523e-05, + "loss": 0.2425, + "step": 5655 + }, + { + "epoch": 0.5861747331329672, + "grad_norm": 0.6712120175361633, + "learning_rate": 1.543184967398613e-05, + "loss": 0.2037, + "step": 5656 + }, + { + "epoch": 0.5862783708156286, + "grad_norm": 0.5625413060188293, + "learning_rate": 1.542531387723081e-05, + "loss": 0.1956, + "step": 5657 + }, + { + "epoch": 0.58638200849829, + "grad_norm": 0.5676254630088806, + "learning_rate": 1.5418778595944015e-05, + "loss": 0.1947, + "step": 5658 + }, + { + "epoch": 0.5864856461809514, + "grad_norm": 0.533459484577179, + "learning_rate": 1.5412243830862118e-05, + "loss": 0.2088, + "step": 5659 + }, + { + "epoch": 0.5865892838636129, + "grad_norm": 0.7574283480644226, + "learning_rate": 1.540570958272146e-05, + "loss": 0.247, + "step": 5660 + }, + { + "epoch": 0.5866929215462742, + "grad_norm": 0.6113412976264954, + "learning_rate": 1.539917585225831e-05, + "loss": 0.2037, + "step": 5661 + }, + { + "epoch": 0.5867965592289356, + "grad_norm": 0.5184718370437622, + "learning_rate": 1.5392642640208857e-05, + "loss": 0.2112, + "step": 5662 + }, + { + "epoch": 0.586900196911597, + "grad_norm": 0.5778669118881226, + "learning_rate": 1.5386109947309284e-05, + "loss": 0.2067, + "step": 5663 + }, + { + "epoch": 0.5870038345942584, + "grad_norm": 0.5942280888557434, + "learning_rate": 1.5379577774295665e-05, + "loss": 0.2127, + "step": 5664 + }, + { + "epoch": 0.5871074722769198, + "grad_norm": 0.539800226688385, + "learning_rate": 1.5373046121904032e-05, + "loss": 0.1769, + "step": 5665 + }, + { + "epoch": 0.5872111099595813, + "grad_norm": 0.6362435817718506, + "learning_rate": 1.536651499087038e-05, + "loss": 0.2244, + "step": 5666 + }, + { + "epoch": 0.5873147476422427, + "grad_norm": 0.5899614095687866, + "learning_rate": 1.5359984381930613e-05, + "loss": 0.2061, + "step": 5667 + }, + { + "epoch": 0.5874183853249041, + "grad_norm": 0.588805079460144, + "learning_rate": 1.5353454295820594e-05, + "loss": 0.1829, + "step": 5668 + }, + { + "epoch": 0.5875220230075655, + "grad_norm": 0.6606040000915527, + "learning_rate": 1.5346924733276117e-05, + "loss": 0.2479, + "step": 5669 + }, + { + "epoch": 0.5876256606902269, + "grad_norm": 0.5283950567245483, + "learning_rate": 1.534039569503293e-05, + "loss": 0.188, + "step": 5670 + }, + { + "epoch": 0.5877292983728883, + "grad_norm": 0.5973883271217346, + "learning_rate": 1.5333867181826717e-05, + "loss": 0.1903, + "step": 5671 + }, + { + "epoch": 0.5878329360555498, + "grad_norm": 0.5801593065261841, + "learning_rate": 1.5327339194393087e-05, + "loss": 0.1868, + "step": 5672 + }, + { + "epoch": 0.5879365737382112, + "grad_norm": 0.5748353600502014, + "learning_rate": 1.5320811733467626e-05, + "loss": 0.2009, + "step": 5673 + }, + { + "epoch": 0.5880402114208726, + "grad_norm": 0.5203397274017334, + "learning_rate": 1.5314284799785815e-05, + "loss": 0.1895, + "step": 5674 + }, + { + "epoch": 0.588143849103534, + "grad_norm": 0.5716513991355896, + "learning_rate": 1.5307758394083103e-05, + "loss": 0.2138, + "step": 5675 + }, + { + "epoch": 0.5882474867861954, + "grad_norm": 0.5933395028114319, + "learning_rate": 1.53012325170949e-05, + "loss": 0.2089, + "step": 5676 + }, + { + "epoch": 0.5883511244688568, + "grad_norm": 0.7028188109397888, + "learning_rate": 1.5294707169556494e-05, + "loss": 0.2341, + "step": 5677 + }, + { + "epoch": 0.5884547621515183, + "grad_norm": 0.8074510097503662, + "learning_rate": 1.5288182352203182e-05, + "loss": 0.1966, + "step": 5678 + }, + { + "epoch": 0.5885583998341797, + "grad_norm": 0.4657239019870758, + "learning_rate": 1.528165806577015e-05, + "loss": 0.1423, + "step": 5679 + }, + { + "epoch": 0.5886620375168411, + "grad_norm": 0.5782691240310669, + "learning_rate": 1.5275134310992553e-05, + "loss": 0.216, + "step": 5680 + }, + { + "epoch": 0.5887656751995025, + "grad_norm": 0.641411304473877, + "learning_rate": 1.5268611088605482e-05, + "loss": 0.2132, + "step": 5681 + }, + { + "epoch": 0.5888693128821639, + "grad_norm": 0.5617420077323914, + "learning_rate": 1.5262088399343954e-05, + "loss": 0.2034, + "step": 5682 + }, + { + "epoch": 0.5889729505648253, + "grad_norm": 0.6933878660202026, + "learning_rate": 1.5255566243942945e-05, + "loss": 0.2358, + "step": 5683 + }, + { + "epoch": 0.5890765882474868, + "grad_norm": 0.5039753317832947, + "learning_rate": 1.524904462313735e-05, + "loss": 0.1609, + "step": 5684 + }, + { + "epoch": 0.5891802259301482, + "grad_norm": 0.6344572305679321, + "learning_rate": 1.5242523537662023e-05, + "loss": 0.216, + "step": 5685 + }, + { + "epoch": 0.5892838636128096, + "grad_norm": 0.5435722470283508, + "learning_rate": 1.5236002988251752e-05, + "loss": 0.2057, + "step": 5686 + }, + { + "epoch": 0.589387501295471, + "grad_norm": 0.6607723236083984, + "learning_rate": 1.5229482975641252e-05, + "loss": 0.2446, + "step": 5687 + }, + { + "epoch": 0.5894911389781324, + "grad_norm": 0.5871069431304932, + "learning_rate": 1.5222963500565201e-05, + "loss": 0.2199, + "step": 5688 + }, + { + "epoch": 0.5895947766607939, + "grad_norm": 0.642362117767334, + "learning_rate": 1.5216444563758195e-05, + "loss": 0.2033, + "step": 5689 + }, + { + "epoch": 0.5896984143434553, + "grad_norm": 0.547693133354187, + "learning_rate": 1.5209926165954772e-05, + "loss": 0.183, + "step": 5690 + }, + { + "epoch": 0.5898020520261167, + "grad_norm": 0.5935719609260559, + "learning_rate": 1.5203408307889433e-05, + "loss": 0.1924, + "step": 5691 + }, + { + "epoch": 0.5899056897087781, + "grad_norm": 0.5445742011070251, + "learning_rate": 1.5196890990296584e-05, + "loss": 0.1783, + "step": 5692 + }, + { + "epoch": 0.5900093273914395, + "grad_norm": 0.6331069469451904, + "learning_rate": 1.5190374213910597e-05, + "loss": 0.2173, + "step": 5693 + }, + { + "epoch": 0.5901129650741009, + "grad_norm": 0.5657593011856079, + "learning_rate": 1.5183857979465757e-05, + "loss": 0.1942, + "step": 5694 + }, + { + "epoch": 0.5902166027567624, + "grad_norm": 0.5342247486114502, + "learning_rate": 1.5177342287696317e-05, + "loss": 0.1977, + "step": 5695 + }, + { + "epoch": 0.5903202404394238, + "grad_norm": 0.5558289885520935, + "learning_rate": 1.5170827139336457e-05, + "loss": 0.1896, + "step": 5696 + }, + { + "epoch": 0.5904238781220852, + "grad_norm": 0.6776431202888489, + "learning_rate": 1.5164312535120279e-05, + "loss": 0.2219, + "step": 5697 + }, + { + "epoch": 0.5905275158047466, + "grad_norm": 0.5458036065101624, + "learning_rate": 1.5157798475781856e-05, + "loss": 0.2065, + "step": 5698 + }, + { + "epoch": 0.590631153487408, + "grad_norm": 0.5801975131034851, + "learning_rate": 1.5151284962055168e-05, + "loss": 0.2235, + "step": 5699 + }, + { + "epoch": 0.5907347911700694, + "grad_norm": 0.5337779521942139, + "learning_rate": 1.514477199467415e-05, + "loss": 0.198, + "step": 5700 + }, + { + "epoch": 0.5908384288527309, + "grad_norm": 0.6479519009590149, + "learning_rate": 1.5138259574372684e-05, + "loss": 0.2172, + "step": 5701 + }, + { + "epoch": 0.5909420665353923, + "grad_norm": 0.5114519000053406, + "learning_rate": 1.5131747701884566e-05, + "loss": 0.1868, + "step": 5702 + }, + { + "epoch": 0.5910457042180537, + "grad_norm": 0.5659390091896057, + "learning_rate": 1.5125236377943555e-05, + "loss": 0.1897, + "step": 5703 + }, + { + "epoch": 0.5911493419007151, + "grad_norm": 0.6139878630638123, + "learning_rate": 1.5118725603283321e-05, + "loss": 0.2211, + "step": 5704 + }, + { + "epoch": 0.5912529795833765, + "grad_norm": 0.5344733595848083, + "learning_rate": 1.5112215378637505e-05, + "loss": 0.1686, + "step": 5705 + }, + { + "epoch": 0.5913566172660379, + "grad_norm": 0.5037564039230347, + "learning_rate": 1.5105705704739664e-05, + "loss": 0.165, + "step": 5706 + }, + { + "epoch": 0.5914602549486994, + "grad_norm": 0.5492294430732727, + "learning_rate": 1.5099196582323293e-05, + "loss": 0.205, + "step": 5707 + }, + { + "epoch": 0.5915638926313608, + "grad_norm": 0.5443542003631592, + "learning_rate": 1.5092688012121835e-05, + "loss": 0.1677, + "step": 5708 + }, + { + "epoch": 0.5916675303140222, + "grad_norm": 0.5195833444595337, + "learning_rate": 1.5086179994868656e-05, + "loss": 0.1945, + "step": 5709 + }, + { + "epoch": 0.5917711679966836, + "grad_norm": 0.6017307639122009, + "learning_rate": 1.5079672531297078e-05, + "loss": 0.2087, + "step": 5710 + }, + { + "epoch": 0.591874805679345, + "grad_norm": 0.5943590998649597, + "learning_rate": 1.5073165622140358e-05, + "loss": 0.2028, + "step": 5711 + }, + { + "epoch": 0.5919784433620064, + "grad_norm": 0.5091972351074219, + "learning_rate": 1.5066659268131666e-05, + "loss": 0.1927, + "step": 5712 + }, + { + "epoch": 0.5920820810446679, + "grad_norm": 0.5621009469032288, + "learning_rate": 1.5060153470004149e-05, + "loss": 0.1909, + "step": 5713 + }, + { + "epoch": 0.5921857187273293, + "grad_norm": 0.5365742444992065, + "learning_rate": 1.5053648228490857e-05, + "loss": 0.2044, + "step": 5714 + }, + { + "epoch": 0.5922893564099907, + "grad_norm": 0.6904767751693726, + "learning_rate": 1.5047143544324784e-05, + "loss": 0.2657, + "step": 5715 + }, + { + "epoch": 0.5923929940926521, + "grad_norm": 0.6271598935127258, + "learning_rate": 1.5040639418238889e-05, + "loss": 0.2233, + "step": 5716 + }, + { + "epoch": 0.5924966317753135, + "grad_norm": 0.624140739440918, + "learning_rate": 1.5034135850966033e-05, + "loss": 0.2019, + "step": 5717 + }, + { + "epoch": 0.592600269457975, + "grad_norm": 0.6414351463317871, + "learning_rate": 1.502763284323903e-05, + "loss": 0.2285, + "step": 5718 + }, + { + "epoch": 0.5927039071406364, + "grad_norm": 0.6503438949584961, + "learning_rate": 1.502113039579062e-05, + "loss": 0.216, + "step": 5719 + }, + { + "epoch": 0.5928075448232978, + "grad_norm": 0.6282237768173218, + "learning_rate": 1.5014628509353503e-05, + "loss": 0.2206, + "step": 5720 + }, + { + "epoch": 0.5929111825059592, + "grad_norm": 0.5512993931770325, + "learning_rate": 1.50081271846603e-05, + "loss": 0.1927, + "step": 5721 + }, + { + "epoch": 0.5930148201886206, + "grad_norm": 0.6197196841239929, + "learning_rate": 1.5001626422443556e-05, + "loss": 0.2021, + "step": 5722 + }, + { + "epoch": 0.593118457871282, + "grad_norm": 0.5854577422142029, + "learning_rate": 1.4995126223435788e-05, + "loss": 0.2068, + "step": 5723 + }, + { + "epoch": 0.5932220955539435, + "grad_norm": 0.5938782095909119, + "learning_rate": 1.498862658836941e-05, + "loss": 0.1997, + "step": 5724 + }, + { + "epoch": 0.5933257332366049, + "grad_norm": 0.5628288984298706, + "learning_rate": 1.4982127517976794e-05, + "loss": 0.1953, + "step": 5725 + }, + { + "epoch": 0.5934293709192663, + "grad_norm": 0.5366216897964478, + "learning_rate": 1.4975629012990255e-05, + "loss": 0.1715, + "step": 5726 + }, + { + "epoch": 0.5935330086019277, + "grad_norm": 0.6630962491035461, + "learning_rate": 1.4969131074142027e-05, + "loss": 0.2667, + "step": 5727 + }, + { + "epoch": 0.5936366462845891, + "grad_norm": 0.5116687417030334, + "learning_rate": 1.4962633702164296e-05, + "loss": 0.1781, + "step": 5728 + }, + { + "epoch": 0.5937402839672505, + "grad_norm": 0.5852224230766296, + "learning_rate": 1.4956136897789155e-05, + "loss": 0.1788, + "step": 5729 + }, + { + "epoch": 0.593843921649912, + "grad_norm": 0.6093877553939819, + "learning_rate": 1.4949640661748674e-05, + "loss": 0.2172, + "step": 5730 + }, + { + "epoch": 0.5939475593325734, + "grad_norm": 0.518648087978363, + "learning_rate": 1.4943144994774836e-05, + "loss": 0.1722, + "step": 5731 + }, + { + "epoch": 0.5940511970152348, + "grad_norm": 0.5264543890953064, + "learning_rate": 1.4936649897599548e-05, + "loss": 0.182, + "step": 5732 + }, + { + "epoch": 0.5941548346978962, + "grad_norm": 0.6355027556419373, + "learning_rate": 1.4930155370954693e-05, + "loss": 0.2237, + "step": 5733 + }, + { + "epoch": 0.5942584723805576, + "grad_norm": 0.6642647981643677, + "learning_rate": 1.4923661415572039e-05, + "loss": 0.2471, + "step": 5734 + }, + { + "epoch": 0.594362110063219, + "grad_norm": 0.5539326667785645, + "learning_rate": 1.4917168032183326e-05, + "loss": 0.173, + "step": 5735 + }, + { + "epoch": 0.5944657477458805, + "grad_norm": 0.600437581539154, + "learning_rate": 1.4910675221520228e-05, + "loss": 0.2121, + "step": 5736 + }, + { + "epoch": 0.5945693854285418, + "grad_norm": 0.6293091177940369, + "learning_rate": 1.4904182984314321e-05, + "loss": 0.2145, + "step": 5737 + }, + { + "epoch": 0.5946730231112032, + "grad_norm": 0.5350037813186646, + "learning_rate": 1.4897691321297164e-05, + "loss": 0.1654, + "step": 5738 + }, + { + "epoch": 0.5947766607938646, + "grad_norm": 0.516567051410675, + "learning_rate": 1.4891200233200214e-05, + "loss": 0.1768, + "step": 5739 + }, + { + "epoch": 0.594880298476526, + "grad_norm": 0.512410581111908, + "learning_rate": 1.4884709720754873e-05, + "loss": 0.1932, + "step": 5740 + }, + { + "epoch": 0.5949839361591874, + "grad_norm": 0.5908318161964417, + "learning_rate": 1.4878219784692501e-05, + "loss": 0.2061, + "step": 5741 + }, + { + "epoch": 0.5950875738418488, + "grad_norm": 0.590922474861145, + "learning_rate": 1.4871730425744352e-05, + "loss": 0.2305, + "step": 5742 + }, + { + "epoch": 0.5951912115245103, + "grad_norm": 0.6016734838485718, + "learning_rate": 1.4865241644641655e-05, + "loss": 0.1954, + "step": 5743 + }, + { + "epoch": 0.5952948492071717, + "grad_norm": 0.5654884576797485, + "learning_rate": 1.4858753442115537e-05, + "loss": 0.2019, + "step": 5744 + }, + { + "epoch": 0.5953984868898331, + "grad_norm": 0.6041907668113708, + "learning_rate": 1.4852265818897094e-05, + "loss": 0.2211, + "step": 5745 + }, + { + "epoch": 0.5955021245724945, + "grad_norm": 0.5785591006278992, + "learning_rate": 1.484577877571734e-05, + "loss": 0.1885, + "step": 5746 + }, + { + "epoch": 0.5956057622551559, + "grad_norm": 0.610321044921875, + "learning_rate": 1.4839292313307211e-05, + "loss": 0.2192, + "step": 5747 + }, + { + "epoch": 0.5957093999378174, + "grad_norm": 0.5311514735221863, + "learning_rate": 1.4832806432397613e-05, + "loss": 0.1803, + "step": 5748 + }, + { + "epoch": 0.5958130376204788, + "grad_norm": 0.6609798669815063, + "learning_rate": 1.4826321133719348e-05, + "loss": 0.2282, + "step": 5749 + }, + { + "epoch": 0.5959166753031402, + "grad_norm": 0.5810403823852539, + "learning_rate": 1.481983641800317e-05, + "loss": 0.174, + "step": 5750 + }, + { + "epoch": 0.5960203129858016, + "grad_norm": 0.5086999535560608, + "learning_rate": 1.4813352285979783e-05, + "loss": 0.1838, + "step": 5751 + }, + { + "epoch": 0.596123950668463, + "grad_norm": 0.507858157157898, + "learning_rate": 1.4806868738379792e-05, + "loss": 0.1733, + "step": 5752 + }, + { + "epoch": 0.5962275883511244, + "grad_norm": 0.4975156784057617, + "learning_rate": 1.4800385775933763e-05, + "loss": 0.1727, + "step": 5753 + }, + { + "epoch": 0.5963312260337859, + "grad_norm": 0.6003329753875732, + "learning_rate": 1.4793903399372171e-05, + "loss": 0.2041, + "step": 5754 + }, + { + "epoch": 0.5964348637164473, + "grad_norm": 0.6466993689537048, + "learning_rate": 1.4787421609425458e-05, + "loss": 0.2228, + "step": 5755 + }, + { + "epoch": 0.5965385013991087, + "grad_norm": 0.5515438318252563, + "learning_rate": 1.478094040682398e-05, + "loss": 0.1908, + "step": 5756 + }, + { + "epoch": 0.5966421390817701, + "grad_norm": 0.5963743329048157, + "learning_rate": 1.4774459792298012e-05, + "loss": 0.2199, + "step": 5757 + }, + { + "epoch": 0.5967457767644315, + "grad_norm": 0.6550231575965881, + "learning_rate": 1.4767979766577803e-05, + "loss": 0.2265, + "step": 5758 + }, + { + "epoch": 0.5968494144470929, + "grad_norm": 0.6037734150886536, + "learning_rate": 1.4761500330393493e-05, + "loss": 0.2229, + "step": 5759 + }, + { + "epoch": 0.5969530521297544, + "grad_norm": 0.6222856044769287, + "learning_rate": 1.4755021484475181e-05, + "loss": 0.2153, + "step": 5760 + }, + { + "epoch": 0.5970566898124158, + "grad_norm": 0.5862886905670166, + "learning_rate": 1.4748543229552904e-05, + "loss": 0.1949, + "step": 5761 + }, + { + "epoch": 0.5971603274950772, + "grad_norm": 0.5452234148979187, + "learning_rate": 1.4742065566356605e-05, + "loss": 0.1907, + "step": 5762 + }, + { + "epoch": 0.5972639651777386, + "grad_norm": 0.5806859135627747, + "learning_rate": 1.4735588495616188e-05, + "loss": 0.2052, + "step": 5763 + }, + { + "epoch": 0.5973676028604, + "grad_norm": 0.5556889772415161, + "learning_rate": 1.4729112018061469e-05, + "loss": 0.1936, + "step": 5764 + }, + { + "epoch": 0.5974712405430614, + "grad_norm": 0.48853084444999695, + "learning_rate": 1.4722636134422218e-05, + "loss": 0.1507, + "step": 5765 + }, + { + "epoch": 0.5975748782257229, + "grad_norm": 0.47970500588417053, + "learning_rate": 1.4716160845428129e-05, + "loss": 0.1691, + "step": 5766 + }, + { + "epoch": 0.5976785159083843, + "grad_norm": 0.6103032231330872, + "learning_rate": 1.470968615180881e-05, + "loss": 0.2244, + "step": 5767 + }, + { + "epoch": 0.5977821535910457, + "grad_norm": 0.6266652941703796, + "learning_rate": 1.470321205429385e-05, + "loss": 0.1797, + "step": 5768 + }, + { + "epoch": 0.5978857912737071, + "grad_norm": 0.6148141026496887, + "learning_rate": 1.4696738553612706e-05, + "loss": 0.2012, + "step": 5769 + }, + { + "epoch": 0.5979894289563685, + "grad_norm": 0.5959713459014893, + "learning_rate": 1.469026565049482e-05, + "loss": 0.1835, + "step": 5770 + }, + { + "epoch": 0.59809306663903, + "grad_norm": 0.6304055452346802, + "learning_rate": 1.4683793345669552e-05, + "loss": 0.2218, + "step": 5771 + }, + { + "epoch": 0.5981967043216914, + "grad_norm": 0.6706244945526123, + "learning_rate": 1.4677321639866178e-05, + "loss": 0.2184, + "step": 5772 + }, + { + "epoch": 0.5983003420043528, + "grad_norm": 0.5483915209770203, + "learning_rate": 1.4670850533813936e-05, + "loss": 0.185, + "step": 5773 + }, + { + "epoch": 0.5984039796870142, + "grad_norm": 0.626857340335846, + "learning_rate": 1.4664380028241967e-05, + "loss": 0.213, + "step": 5774 + }, + { + "epoch": 0.5985076173696756, + "grad_norm": 0.5656337738037109, + "learning_rate": 1.4657910123879356e-05, + "loss": 0.1883, + "step": 5775 + }, + { + "epoch": 0.598611255052337, + "grad_norm": 0.46662232279777527, + "learning_rate": 1.4651440821455137e-05, + "loss": 0.1711, + "step": 5776 + }, + { + "epoch": 0.5987148927349984, + "grad_norm": 0.5397834777832031, + "learning_rate": 1.464497212169825e-05, + "loss": 0.1711, + "step": 5777 + }, + { + "epoch": 0.5988185304176599, + "grad_norm": 0.4762875735759735, + "learning_rate": 1.463850402533758e-05, + "loss": 0.1724, + "step": 5778 + }, + { + "epoch": 0.5989221681003213, + "grad_norm": 0.5345758199691772, + "learning_rate": 1.4632036533101937e-05, + "loss": 0.1922, + "step": 5779 + }, + { + "epoch": 0.5990258057829827, + "grad_norm": 0.6376749873161316, + "learning_rate": 1.4625569645720075e-05, + "loss": 0.2209, + "step": 5780 + }, + { + "epoch": 0.5991294434656441, + "grad_norm": 0.6170158386230469, + "learning_rate": 1.4619103363920674e-05, + "loss": 0.2231, + "step": 5781 + }, + { + "epoch": 0.5992330811483055, + "grad_norm": 0.6733803749084473, + "learning_rate": 1.4612637688432334e-05, + "loss": 0.2256, + "step": 5782 + }, + { + "epoch": 0.599336718830967, + "grad_norm": 0.5524088144302368, + "learning_rate": 1.4606172619983614e-05, + "loss": 0.1688, + "step": 5783 + }, + { + "epoch": 0.5994403565136284, + "grad_norm": 0.5725765228271484, + "learning_rate": 1.4599708159302972e-05, + "loss": 0.1957, + "step": 5784 + }, + { + "epoch": 0.5995439941962898, + "grad_norm": 0.5375611186027527, + "learning_rate": 1.4593244307118817e-05, + "loss": 0.1875, + "step": 5785 + }, + { + "epoch": 0.5996476318789512, + "grad_norm": 0.6030101180076599, + "learning_rate": 1.4586781064159497e-05, + "loss": 0.2153, + "step": 5786 + }, + { + "epoch": 0.5997512695616126, + "grad_norm": 0.5640246868133545, + "learning_rate": 1.4580318431153266e-05, + "loss": 0.1789, + "step": 5787 + }, + { + "epoch": 0.599854907244274, + "grad_norm": 0.6239410042762756, + "learning_rate": 1.4573856408828335e-05, + "loss": 0.2141, + "step": 5788 + }, + { + "epoch": 0.5999585449269355, + "grad_norm": 0.6547892093658447, + "learning_rate": 1.4567394997912821e-05, + "loss": 0.2006, + "step": 5789 + }, + { + "epoch": 0.6000621826095969, + "grad_norm": 0.6406012773513794, + "learning_rate": 1.4560934199134792e-05, + "loss": 0.2158, + "step": 5790 + }, + { + "epoch": 0.6001658202922583, + "grad_norm": 0.49104514718055725, + "learning_rate": 1.4554474013222254e-05, + "loss": 0.1527, + "step": 5791 + }, + { + "epoch": 0.6002694579749197, + "grad_norm": 0.5826954245567322, + "learning_rate": 1.4548014440903106e-05, + "loss": 0.1849, + "step": 5792 + }, + { + "epoch": 0.6003730956575811, + "grad_norm": 0.6551445126533508, + "learning_rate": 1.4541555482905225e-05, + "loss": 0.219, + "step": 5793 + }, + { + "epoch": 0.6004767333402425, + "grad_norm": 0.5907532572746277, + "learning_rate": 1.4535097139956383e-05, + "loss": 0.2236, + "step": 5794 + }, + { + "epoch": 0.600580371022904, + "grad_norm": 0.5432443022727966, + "learning_rate": 1.4528639412784295e-05, + "loss": 0.1802, + "step": 5795 + }, + { + "epoch": 0.6006840087055654, + "grad_norm": 0.6686699390411377, + "learning_rate": 1.452218230211662e-05, + "loss": 0.2265, + "step": 5796 + }, + { + "epoch": 0.6007876463882268, + "grad_norm": 0.536210834980011, + "learning_rate": 1.4515725808680927e-05, + "loss": 0.1792, + "step": 5797 + }, + { + "epoch": 0.6008912840708882, + "grad_norm": 0.6930983662605286, + "learning_rate": 1.4509269933204726e-05, + "loss": 0.2277, + "step": 5798 + }, + { + "epoch": 0.6009949217535496, + "grad_norm": 0.5104745626449585, + "learning_rate": 1.450281467641545e-05, + "loss": 0.1933, + "step": 5799 + }, + { + "epoch": 0.601098559436211, + "grad_norm": 0.6219746470451355, + "learning_rate": 1.4496360039040466e-05, + "loss": 0.2294, + "step": 5800 + }, + { + "epoch": 0.6012021971188725, + "grad_norm": 0.6207789182662964, + "learning_rate": 1.4489906021807088e-05, + "loss": 0.2242, + "step": 5801 + }, + { + "epoch": 0.6013058348015339, + "grad_norm": 0.5910596251487732, + "learning_rate": 1.4483452625442526e-05, + "loss": 0.2022, + "step": 5802 + }, + { + "epoch": 0.6014094724841953, + "grad_norm": 0.5521644353866577, + "learning_rate": 1.4476999850673953e-05, + "loss": 0.2039, + "step": 5803 + }, + { + "epoch": 0.6015131101668567, + "grad_norm": 0.6113677024841309, + "learning_rate": 1.4470547698228444e-05, + "loss": 0.2237, + "step": 5804 + }, + { + "epoch": 0.6016167478495181, + "grad_norm": 0.8434403538703918, + "learning_rate": 1.4464096168833026e-05, + "loss": 0.2313, + "step": 5805 + }, + { + "epoch": 0.6017203855321795, + "grad_norm": 0.7482733130455017, + "learning_rate": 1.4457645263214653e-05, + "loss": 0.2591, + "step": 5806 + }, + { + "epoch": 0.601824023214841, + "grad_norm": 0.6481677293777466, + "learning_rate": 1.4451194982100182e-05, + "loss": 0.2041, + "step": 5807 + }, + { + "epoch": 0.6019276608975024, + "grad_norm": 0.5924575328826904, + "learning_rate": 1.4444745326216448e-05, + "loss": 0.2071, + "step": 5808 + }, + { + "epoch": 0.6020312985801638, + "grad_norm": 0.6279072165489197, + "learning_rate": 1.4438296296290171e-05, + "loss": 0.2094, + "step": 5809 + }, + { + "epoch": 0.6021349362628252, + "grad_norm": 0.5018296241760254, + "learning_rate": 1.4431847893048014e-05, + "loss": 0.1717, + "step": 5810 + }, + { + "epoch": 0.6022385739454866, + "grad_norm": 0.5699949264526367, + "learning_rate": 1.4425400117216591e-05, + "loss": 0.1707, + "step": 5811 + }, + { + "epoch": 0.602342211628148, + "grad_norm": 0.743048906326294, + "learning_rate": 1.4418952969522413e-05, + "loss": 0.2423, + "step": 5812 + }, + { + "epoch": 0.6024458493108094, + "grad_norm": 0.5955899357795715, + "learning_rate": 1.4412506450691943e-05, + "loss": 0.2163, + "step": 5813 + }, + { + "epoch": 0.6025494869934708, + "grad_norm": 0.572127103805542, + "learning_rate": 1.4406060561451552e-05, + "loss": 0.1935, + "step": 5814 + }, + { + "epoch": 0.6026531246761322, + "grad_norm": 0.562829315662384, + "learning_rate": 1.4399615302527563e-05, + "loss": 0.1898, + "step": 5815 + }, + { + "epoch": 0.6027567623587936, + "grad_norm": 0.5324281454086304, + "learning_rate": 1.4393170674646219e-05, + "loss": 0.1956, + "step": 5816 + }, + { + "epoch": 0.602860400041455, + "grad_norm": 0.5725294947624207, + "learning_rate": 1.4386726678533683e-05, + "loss": 0.2052, + "step": 5817 + }, + { + "epoch": 0.6029640377241164, + "grad_norm": 0.6350352168083191, + "learning_rate": 1.4380283314916064e-05, + "loss": 0.2063, + "step": 5818 + }, + { + "epoch": 0.6030676754067779, + "grad_norm": 0.6516824960708618, + "learning_rate": 1.4373840584519382e-05, + "loss": 0.2174, + "step": 5819 + }, + { + "epoch": 0.6031713130894393, + "grad_norm": 0.6768869161605835, + "learning_rate": 1.4367398488069593e-05, + "loss": 0.2007, + "step": 5820 + }, + { + "epoch": 0.6032749507721007, + "grad_norm": 0.5802131295204163, + "learning_rate": 1.4360957026292597e-05, + "loss": 0.1979, + "step": 5821 + }, + { + "epoch": 0.6033785884547621, + "grad_norm": 0.5170549154281616, + "learning_rate": 1.4354516199914189e-05, + "loss": 0.1807, + "step": 5822 + }, + { + "epoch": 0.6034822261374235, + "grad_norm": 0.6196373105049133, + "learning_rate": 1.4348076009660126e-05, + "loss": 0.1965, + "step": 5823 + }, + { + "epoch": 0.603585863820085, + "grad_norm": 0.5476102828979492, + "learning_rate": 1.4341636456256062e-05, + "loss": 0.168, + "step": 5824 + }, + { + "epoch": 0.6036895015027464, + "grad_norm": 0.4828929603099823, + "learning_rate": 1.4335197540427611e-05, + "loss": 0.1573, + "step": 5825 + }, + { + "epoch": 0.6037931391854078, + "grad_norm": 0.5340254902839661, + "learning_rate": 1.4328759262900301e-05, + "loss": 0.1683, + "step": 5826 + }, + { + "epoch": 0.6038967768680692, + "grad_norm": 0.5734612941741943, + "learning_rate": 1.432232162439957e-05, + "loss": 0.1818, + "step": 5827 + }, + { + "epoch": 0.6040004145507306, + "grad_norm": 0.5522999167442322, + "learning_rate": 1.4315884625650823e-05, + "loss": 0.2115, + "step": 5828 + }, + { + "epoch": 0.604104052233392, + "grad_norm": 0.6274425387382507, + "learning_rate": 1.4309448267379353e-05, + "loss": 0.2546, + "step": 5829 + }, + { + "epoch": 0.6042076899160534, + "grad_norm": 0.5536311268806458, + "learning_rate": 1.4303012550310404e-05, + "loss": 0.1809, + "step": 5830 + }, + { + "epoch": 0.6043113275987149, + "grad_norm": 0.6251480579376221, + "learning_rate": 1.4296577475169158e-05, + "loss": 0.2058, + "step": 5831 + }, + { + "epoch": 0.6044149652813763, + "grad_norm": 0.653142511844635, + "learning_rate": 1.4290143042680682e-05, + "loss": 0.21, + "step": 5832 + }, + { + "epoch": 0.6045186029640377, + "grad_norm": 0.5902639031410217, + "learning_rate": 1.4283709253570022e-05, + "loss": 0.1963, + "step": 5833 + }, + { + "epoch": 0.6046222406466991, + "grad_norm": 0.534339427947998, + "learning_rate": 1.427727610856211e-05, + "loss": 0.1939, + "step": 5834 + }, + { + "epoch": 0.6047258783293605, + "grad_norm": 0.6114777326583862, + "learning_rate": 1.4270843608381828e-05, + "loss": 0.207, + "step": 5835 + }, + { + "epoch": 0.604829516012022, + "grad_norm": 0.6344853639602661, + "learning_rate": 1.4264411753753991e-05, + "loss": 0.196, + "step": 5836 + }, + { + "epoch": 0.6049331536946834, + "grad_norm": 0.6889345049858093, + "learning_rate": 1.4257980545403315e-05, + "loss": 0.2376, + "step": 5837 + }, + { + "epoch": 0.6050367913773448, + "grad_norm": 0.5750885009765625, + "learning_rate": 1.4251549984054472e-05, + "loss": 0.1835, + "step": 5838 + }, + { + "epoch": 0.6051404290600062, + "grad_norm": 0.6389616131782532, + "learning_rate": 1.424512007043203e-05, + "loss": 0.2091, + "step": 5839 + }, + { + "epoch": 0.6052440667426676, + "grad_norm": 0.5323380827903748, + "learning_rate": 1.4238690805260515e-05, + "loss": 0.1784, + "step": 5840 + }, + { + "epoch": 0.605347704425329, + "grad_norm": 0.5227212905883789, + "learning_rate": 1.4232262189264373e-05, + "loss": 0.1749, + "step": 5841 + }, + { + "epoch": 0.6054513421079905, + "grad_norm": 0.5386372208595276, + "learning_rate": 1.4225834223167948e-05, + "loss": 0.1762, + "step": 5842 + }, + { + "epoch": 0.6055549797906519, + "grad_norm": 0.509102463722229, + "learning_rate": 1.421940690769556e-05, + "loss": 0.1793, + "step": 5843 + }, + { + "epoch": 0.6056586174733133, + "grad_norm": 0.5998203158378601, + "learning_rate": 1.4212980243571406e-05, + "loss": 0.2048, + "step": 5844 + }, + { + "epoch": 0.6057622551559747, + "grad_norm": 0.6335312128067017, + "learning_rate": 1.4206554231519642e-05, + "loss": 0.2481, + "step": 5845 + }, + { + "epoch": 0.6058658928386361, + "grad_norm": 0.5277844071388245, + "learning_rate": 1.4200128872264347e-05, + "loss": 0.1876, + "step": 5846 + }, + { + "epoch": 0.6059695305212975, + "grad_norm": 0.5580236911773682, + "learning_rate": 1.4193704166529512e-05, + "loss": 0.2033, + "step": 5847 + }, + { + "epoch": 0.606073168203959, + "grad_norm": 0.621285617351532, + "learning_rate": 1.4187280115039062e-05, + "loss": 0.1903, + "step": 5848 + }, + { + "epoch": 0.6061768058866204, + "grad_norm": 0.5097457766532898, + "learning_rate": 1.4180856718516858e-05, + "loss": 0.1686, + "step": 5849 + }, + { + "epoch": 0.6062804435692818, + "grad_norm": 0.6255810856819153, + "learning_rate": 1.417443397768667e-05, + "loss": 0.2329, + "step": 5850 + }, + { + "epoch": 0.6063840812519432, + "grad_norm": 0.5889554619789124, + "learning_rate": 1.416801189327221e-05, + "loss": 0.2081, + "step": 5851 + }, + { + "epoch": 0.6064877189346046, + "grad_norm": 0.6353864669799805, + "learning_rate": 1.4161590465997095e-05, + "loss": 0.2118, + "step": 5852 + }, + { + "epoch": 0.606591356617266, + "grad_norm": 0.5470232367515564, + "learning_rate": 1.4155169696584895e-05, + "loss": 0.1968, + "step": 5853 + }, + { + "epoch": 0.6066949942999275, + "grad_norm": 0.6519717574119568, + "learning_rate": 1.4148749585759092e-05, + "loss": 0.2176, + "step": 5854 + }, + { + "epoch": 0.6067986319825889, + "grad_norm": 0.6173345446586609, + "learning_rate": 1.4142330134243081e-05, + "loss": 0.2298, + "step": 5855 + }, + { + "epoch": 0.6069022696652503, + "grad_norm": 0.5106127262115479, + "learning_rate": 1.4135911342760213e-05, + "loss": 0.1628, + "step": 5856 + }, + { + "epoch": 0.6070059073479117, + "grad_norm": 0.5532249212265015, + "learning_rate": 1.4129493212033736e-05, + "loss": 0.1945, + "step": 5857 + }, + { + "epoch": 0.6071095450305731, + "grad_norm": 0.5892948508262634, + "learning_rate": 1.412307574278683e-05, + "loss": 0.1949, + "step": 5858 + }, + { + "epoch": 0.6072131827132345, + "grad_norm": 0.4962332844734192, + "learning_rate": 1.4116658935742625e-05, + "loss": 0.1676, + "step": 5859 + }, + { + "epoch": 0.607316820395896, + "grad_norm": 0.5226655602455139, + "learning_rate": 1.411024279162414e-05, + "loss": 0.1594, + "step": 5860 + }, + { + "epoch": 0.6074204580785574, + "grad_norm": 0.6553589701652527, + "learning_rate": 1.4103827311154347e-05, + "loss": 0.2408, + "step": 5861 + }, + { + "epoch": 0.6075240957612188, + "grad_norm": 0.480268269777298, + "learning_rate": 1.4097412495056122e-05, + "loss": 0.1564, + "step": 5862 + }, + { + "epoch": 0.6076277334438802, + "grad_norm": 0.6084377765655518, + "learning_rate": 1.4090998344052277e-05, + "loss": 0.2365, + "step": 5863 + }, + { + "epoch": 0.6077313711265416, + "grad_norm": 0.5667449831962585, + "learning_rate": 1.408458485886556e-05, + "loss": 0.21, + "step": 5864 + }, + { + "epoch": 0.607835008809203, + "grad_norm": 0.5660369396209717, + "learning_rate": 1.4078172040218622e-05, + "loss": 0.1853, + "step": 5865 + }, + { + "epoch": 0.6079386464918645, + "grad_norm": 0.5079784393310547, + "learning_rate": 1.407175988883406e-05, + "loss": 0.1748, + "step": 5866 + }, + { + "epoch": 0.6080422841745259, + "grad_norm": 0.6271408200263977, + "learning_rate": 1.4065348405434365e-05, + "loss": 0.2038, + "step": 5867 + }, + { + "epoch": 0.6081459218571873, + "grad_norm": 0.5546168684959412, + "learning_rate": 1.4058937590741995e-05, + "loss": 0.1919, + "step": 5868 + }, + { + "epoch": 0.6082495595398487, + "grad_norm": 0.628492534160614, + "learning_rate": 1.40525274454793e-05, + "loss": 0.2464, + "step": 5869 + }, + { + "epoch": 0.6083531972225101, + "grad_norm": 0.5692856907844543, + "learning_rate": 1.4046117970368562e-05, + "loss": 0.2058, + "step": 5870 + }, + { + "epoch": 0.6084568349051716, + "grad_norm": 0.5562016367912292, + "learning_rate": 1.4039709166132006e-05, + "loss": 0.173, + "step": 5871 + }, + { + "epoch": 0.608560472587833, + "grad_norm": 0.5617178678512573, + "learning_rate": 1.4033301033491748e-05, + "loss": 0.1998, + "step": 5872 + }, + { + "epoch": 0.6086641102704944, + "grad_norm": 0.6305001974105835, + "learning_rate": 1.402689357316985e-05, + "loss": 0.208, + "step": 5873 + }, + { + "epoch": 0.6087677479531558, + "grad_norm": 0.6569817662239075, + "learning_rate": 1.402048678588831e-05, + "loss": 0.2387, + "step": 5874 + }, + { + "epoch": 0.6088713856358172, + "grad_norm": 0.6421927809715271, + "learning_rate": 1.401408067236902e-05, + "loss": 0.2147, + "step": 5875 + }, + { + "epoch": 0.6089750233184786, + "grad_norm": 0.5628452301025391, + "learning_rate": 1.4007675233333812e-05, + "loss": 0.1741, + "step": 5876 + }, + { + "epoch": 0.60907866100114, + "grad_norm": 0.5609790086746216, + "learning_rate": 1.4001270469504442e-05, + "loss": 0.2206, + "step": 5877 + }, + { + "epoch": 0.6091822986838015, + "grad_norm": 0.5732092261314392, + "learning_rate": 1.3994866381602592e-05, + "loss": 0.197, + "step": 5878 + }, + { + "epoch": 0.6092859363664629, + "grad_norm": 0.5437310338020325, + "learning_rate": 1.3988462970349868e-05, + "loss": 0.1677, + "step": 5879 + }, + { + "epoch": 0.6093895740491243, + "grad_norm": 0.528835654258728, + "learning_rate": 1.398206023646778e-05, + "loss": 0.1728, + "step": 5880 + }, + { + "epoch": 0.6094932117317857, + "grad_norm": 0.6198895573616028, + "learning_rate": 1.39756581806778e-05, + "loss": 0.1956, + "step": 5881 + }, + { + "epoch": 0.6095968494144471, + "grad_norm": 0.6650860905647278, + "learning_rate": 1.3969256803701288e-05, + "loss": 0.2185, + "step": 5882 + }, + { + "epoch": 0.6097004870971086, + "grad_norm": 0.6388286352157593, + "learning_rate": 1.396285610625954e-05, + "loss": 0.2208, + "step": 5883 + }, + { + "epoch": 0.60980412477977, + "grad_norm": 0.5759437680244446, + "learning_rate": 1.3956456089073789e-05, + "loss": 0.1941, + "step": 5884 + }, + { + "epoch": 0.6099077624624314, + "grad_norm": 0.6277048587799072, + "learning_rate": 1.3950056752865166e-05, + "loss": 0.2193, + "step": 5885 + }, + { + "epoch": 0.6100114001450928, + "grad_norm": 0.6908484697341919, + "learning_rate": 1.3943658098354749e-05, + "loss": 0.2166, + "step": 5886 + }, + { + "epoch": 0.6101150378277542, + "grad_norm": 0.6081604957580566, + "learning_rate": 1.3937260126263512e-05, + "loss": 0.2151, + "step": 5887 + }, + { + "epoch": 0.6102186755104156, + "grad_norm": 0.6466516256332397, + "learning_rate": 1.3930862837312384e-05, + "loss": 0.2133, + "step": 5888 + }, + { + "epoch": 0.610322313193077, + "grad_norm": 0.6081497669219971, + "learning_rate": 1.3924466232222203e-05, + "loss": 0.2237, + "step": 5889 + }, + { + "epoch": 0.6104259508757384, + "grad_norm": 0.5361385345458984, + "learning_rate": 1.3918070311713714e-05, + "loss": 0.1824, + "step": 5890 + }, + { + "epoch": 0.6105295885583998, + "grad_norm": 0.6183144450187683, + "learning_rate": 1.3911675076507613e-05, + "loss": 0.203, + "step": 5891 + }, + { + "epoch": 0.6106332262410612, + "grad_norm": 0.5001223087310791, + "learning_rate": 1.3905280527324499e-05, + "loss": 0.1663, + "step": 5892 + }, + { + "epoch": 0.6107368639237226, + "grad_norm": 0.6348176598548889, + "learning_rate": 1.3898886664884894e-05, + "loss": 0.2415, + "step": 5893 + }, + { + "epoch": 0.610840501606384, + "grad_norm": 0.5159345865249634, + "learning_rate": 1.389249348990927e-05, + "loss": 0.1831, + "step": 5894 + }, + { + "epoch": 0.6109441392890455, + "grad_norm": 0.6143372654914856, + "learning_rate": 1.388610100311797e-05, + "loss": 0.1734, + "step": 5895 + }, + { + "epoch": 0.6110477769717069, + "grad_norm": 0.6014468669891357, + "learning_rate": 1.3879709205231318e-05, + "loss": 0.2024, + "step": 5896 + }, + { + "epoch": 0.6111514146543683, + "grad_norm": 0.6340698003768921, + "learning_rate": 1.387331809696951e-05, + "loss": 0.2011, + "step": 5897 + }, + { + "epoch": 0.6112550523370297, + "grad_norm": 0.5582529902458191, + "learning_rate": 1.3866927679052693e-05, + "loss": 0.1802, + "step": 5898 + }, + { + "epoch": 0.6113586900196911, + "grad_norm": 0.601702868938446, + "learning_rate": 1.386053795220094e-05, + "loss": 0.1888, + "step": 5899 + }, + { + "epoch": 0.6114623277023525, + "grad_norm": 0.5560715198516846, + "learning_rate": 1.3854148917134219e-05, + "loss": 0.1572, + "step": 5900 + }, + { + "epoch": 0.611565965385014, + "grad_norm": 0.5951459407806396, + "learning_rate": 1.3847760574572449e-05, + "loss": 0.1912, + "step": 5901 + }, + { + "epoch": 0.6116696030676754, + "grad_norm": 0.5832486152648926, + "learning_rate": 1.3841372925235446e-05, + "loss": 0.1851, + "step": 5902 + }, + { + "epoch": 0.6117732407503368, + "grad_norm": 0.5868365168571472, + "learning_rate": 1.3834985969842971e-05, + "loss": 0.2079, + "step": 5903 + }, + { + "epoch": 0.6118768784329982, + "grad_norm": 0.6246257424354553, + "learning_rate": 1.3828599709114698e-05, + "loss": 0.1944, + "step": 5904 + }, + { + "epoch": 0.6119805161156596, + "grad_norm": 0.6336971521377563, + "learning_rate": 1.382221414377021e-05, + "loss": 0.2636, + "step": 5905 + }, + { + "epoch": 0.612084153798321, + "grad_norm": 0.5235512256622314, + "learning_rate": 1.3815829274529036e-05, + "loss": 0.1764, + "step": 5906 + }, + { + "epoch": 0.6121877914809825, + "grad_norm": 0.5604624152183533, + "learning_rate": 1.3809445102110601e-05, + "loss": 0.1885, + "step": 5907 + }, + { + "epoch": 0.6122914291636439, + "grad_norm": 0.6259495615959167, + "learning_rate": 1.3803061627234264e-05, + "loss": 0.1987, + "step": 5908 + }, + { + "epoch": 0.6123950668463053, + "grad_norm": 0.567039430141449, + "learning_rate": 1.3796678850619318e-05, + "loss": 0.1948, + "step": 5909 + }, + { + "epoch": 0.6124987045289667, + "grad_norm": 0.510911762714386, + "learning_rate": 1.3790296772984952e-05, + "loss": 0.1761, + "step": 5910 + }, + { + "epoch": 0.6126023422116281, + "grad_norm": 0.5724498629570007, + "learning_rate": 1.3783915395050298e-05, + "loss": 0.186, + "step": 5911 + }, + { + "epoch": 0.6127059798942895, + "grad_norm": 0.7346729040145874, + "learning_rate": 1.3777534717534384e-05, + "loss": 0.2581, + "step": 5912 + }, + { + "epoch": 0.612809617576951, + "grad_norm": 0.7468560338020325, + "learning_rate": 1.3771154741156189e-05, + "loss": 0.2128, + "step": 5913 + }, + { + "epoch": 0.6129132552596124, + "grad_norm": 0.6244396567344666, + "learning_rate": 1.3764775466634602e-05, + "loss": 0.2212, + "step": 5914 + }, + { + "epoch": 0.6130168929422738, + "grad_norm": 0.6232682466506958, + "learning_rate": 1.3758396894688411e-05, + "loss": 0.229, + "step": 5915 + }, + { + "epoch": 0.6131205306249352, + "grad_norm": 0.7059705257415771, + "learning_rate": 1.3752019026036365e-05, + "loss": 0.2301, + "step": 5916 + }, + { + "epoch": 0.6132241683075966, + "grad_norm": 0.5770846605300903, + "learning_rate": 1.37456418613971e-05, + "loss": 0.2041, + "step": 5917 + }, + { + "epoch": 0.613327805990258, + "grad_norm": 0.5559613704681396, + "learning_rate": 1.3739265401489177e-05, + "loss": 0.1936, + "step": 5918 + }, + { + "epoch": 0.6134314436729195, + "grad_norm": 0.6115998029708862, + "learning_rate": 1.373288964703111e-05, + "loss": 0.2183, + "step": 5919 + }, + { + "epoch": 0.6135350813555809, + "grad_norm": 0.5869366526603699, + "learning_rate": 1.372651459874129e-05, + "loss": 0.1974, + "step": 5920 + }, + { + "epoch": 0.6136387190382423, + "grad_norm": 0.6263245940208435, + "learning_rate": 1.3720140257338054e-05, + "loss": 0.211, + "step": 5921 + }, + { + "epoch": 0.6137423567209037, + "grad_norm": 0.6086539626121521, + "learning_rate": 1.3713766623539648e-05, + "loss": 0.2089, + "step": 5922 + }, + { + "epoch": 0.6138459944035651, + "grad_norm": 0.5775662660598755, + "learning_rate": 1.3707393698064246e-05, + "loss": 0.179, + "step": 5923 + }, + { + "epoch": 0.6139496320862265, + "grad_norm": 0.4994238615036011, + "learning_rate": 1.370102148162995e-05, + "loss": 0.1706, + "step": 5924 + }, + { + "epoch": 0.614053269768888, + "grad_norm": 0.5579550266265869, + "learning_rate": 1.369464997495475e-05, + "loss": 0.1878, + "step": 5925 + }, + { + "epoch": 0.6141569074515494, + "grad_norm": 0.6892876029014587, + "learning_rate": 1.36882791787566e-05, + "loss": 0.2255, + "step": 5926 + }, + { + "epoch": 0.6142605451342108, + "grad_norm": 0.5811253786087036, + "learning_rate": 1.368190909375333e-05, + "loss": 0.1869, + "step": 5927 + }, + { + "epoch": 0.6143641828168722, + "grad_norm": 0.5586794018745422, + "learning_rate": 1.3675539720662724e-05, + "loss": 0.1835, + "step": 5928 + }, + { + "epoch": 0.6144678204995336, + "grad_norm": 0.5717200636863708, + "learning_rate": 1.3669171060202477e-05, + "loss": 0.2098, + "step": 5929 + }, + { + "epoch": 0.614571458182195, + "grad_norm": 0.5924586057662964, + "learning_rate": 1.3662803113090183e-05, + "loss": 0.2235, + "step": 5930 + }, + { + "epoch": 0.6146750958648565, + "grad_norm": 0.6300281882286072, + "learning_rate": 1.3656435880043393e-05, + "loss": 0.191, + "step": 5931 + }, + { + "epoch": 0.6147787335475179, + "grad_norm": 0.4872299134731293, + "learning_rate": 1.365006936177954e-05, + "loss": 0.1658, + "step": 5932 + }, + { + "epoch": 0.6148823712301793, + "grad_norm": 0.6179653406143188, + "learning_rate": 1.3643703559016e-05, + "loss": 0.2167, + "step": 5933 + }, + { + "epoch": 0.6149860089128407, + "grad_norm": 0.6873209476470947, + "learning_rate": 1.3637338472470068e-05, + "loss": 0.2234, + "step": 5934 + }, + { + "epoch": 0.6150896465955021, + "grad_norm": 0.562300980091095, + "learning_rate": 1.363097410285894e-05, + "loss": 0.1928, + "step": 5935 + }, + { + "epoch": 0.6151932842781636, + "grad_norm": 0.5228613615036011, + "learning_rate": 1.3624610450899755e-05, + "loss": 0.1785, + "step": 5936 + }, + { + "epoch": 0.615296921960825, + "grad_norm": 0.6604247689247131, + "learning_rate": 1.3618247517309548e-05, + "loss": 0.2466, + "step": 5937 + }, + { + "epoch": 0.6154005596434864, + "grad_norm": 0.6046388149261475, + "learning_rate": 1.3611885302805291e-05, + "loss": 0.2204, + "step": 5938 + }, + { + "epoch": 0.6155041973261478, + "grad_norm": 0.5771061182022095, + "learning_rate": 1.3605523808103874e-05, + "loss": 0.1655, + "step": 5939 + }, + { + "epoch": 0.6156078350088092, + "grad_norm": 0.5569134950637817, + "learning_rate": 1.3599163033922083e-05, + "loss": 0.1799, + "step": 5940 + }, + { + "epoch": 0.6157114726914706, + "grad_norm": 0.5640320777893066, + "learning_rate": 1.3592802980976661e-05, + "loss": 0.1726, + "step": 5941 + }, + { + "epoch": 0.6158151103741321, + "grad_norm": 0.640559732913971, + "learning_rate": 1.3586443649984234e-05, + "loss": 0.2414, + "step": 5942 + }, + { + "epoch": 0.6159187480567935, + "grad_norm": 0.5472736954689026, + "learning_rate": 1.3580085041661364e-05, + "loss": 0.1822, + "step": 5943 + }, + { + "epoch": 0.6160223857394549, + "grad_norm": 0.6231314539909363, + "learning_rate": 1.3573727156724537e-05, + "loss": 0.2318, + "step": 5944 + }, + { + "epoch": 0.6161260234221163, + "grad_norm": 0.4999285936355591, + "learning_rate": 1.3567369995890141e-05, + "loss": 0.1651, + "step": 5945 + }, + { + "epoch": 0.6162296611047777, + "grad_norm": 0.6263250112533569, + "learning_rate": 1.35610135598745e-05, + "loss": 0.2065, + "step": 5946 + }, + { + "epoch": 0.6163332987874391, + "grad_norm": 0.6203282475471497, + "learning_rate": 1.3554657849393831e-05, + "loss": 0.1995, + "step": 5947 + }, + { + "epoch": 0.6164369364701006, + "grad_norm": 0.5617152452468872, + "learning_rate": 1.35483028651643e-05, + "loss": 0.1854, + "step": 5948 + }, + { + "epoch": 0.616540574152762, + "grad_norm": 0.7309468388557434, + "learning_rate": 1.354194860790198e-05, + "loss": 0.251, + "step": 5949 + }, + { + "epoch": 0.6166442118354234, + "grad_norm": 0.5707843899726868, + "learning_rate": 1.353559507832284e-05, + "loss": 0.211, + "step": 5950 + }, + { + "epoch": 0.6167478495180848, + "grad_norm": 0.6699648499488831, + "learning_rate": 1.3529242277142806e-05, + "loss": 0.2132, + "step": 5951 + }, + { + "epoch": 0.6168514872007462, + "grad_norm": 0.631347119808197, + "learning_rate": 1.3522890205077693e-05, + "loss": 0.2241, + "step": 5952 + }, + { + "epoch": 0.6169551248834076, + "grad_norm": 0.5974149703979492, + "learning_rate": 1.3516538862843236e-05, + "loss": 0.1831, + "step": 5953 + }, + { + "epoch": 0.6170587625660691, + "grad_norm": 0.6694416403770447, + "learning_rate": 1.351018825115511e-05, + "loss": 0.2164, + "step": 5954 + }, + { + "epoch": 0.6171624002487305, + "grad_norm": 0.5892156958580017, + "learning_rate": 1.3503838370728879e-05, + "loss": 0.2071, + "step": 5955 + }, + { + "epoch": 0.6172660379313919, + "grad_norm": 0.6549639701843262, + "learning_rate": 1.3497489222280045e-05, + "loss": 0.2303, + "step": 5956 + }, + { + "epoch": 0.6173696756140533, + "grad_norm": 0.6274462938308716, + "learning_rate": 1.3491140806524013e-05, + "loss": 0.2158, + "step": 5957 + }, + { + "epoch": 0.6174733132967147, + "grad_norm": 0.6224632263183594, + "learning_rate": 1.3484793124176112e-05, + "loss": 0.2081, + "step": 5958 + }, + { + "epoch": 0.6175769509793761, + "grad_norm": 0.5738040804862976, + "learning_rate": 1.3478446175951603e-05, + "loss": 0.1823, + "step": 5959 + }, + { + "epoch": 0.6176805886620376, + "grad_norm": 0.5787317156791687, + "learning_rate": 1.3472099962565639e-05, + "loss": 0.2163, + "step": 5960 + }, + { + "epoch": 0.617784226344699, + "grad_norm": 0.6176537275314331, + "learning_rate": 1.3465754484733305e-05, + "loss": 0.2005, + "step": 5961 + }, + { + "epoch": 0.6178878640273604, + "grad_norm": 0.5757036805152893, + "learning_rate": 1.345940974316959e-05, + "loss": 0.1916, + "step": 5962 + }, + { + "epoch": 0.6179915017100218, + "grad_norm": 0.4411177635192871, + "learning_rate": 1.3453065738589422e-05, + "loss": 0.1488, + "step": 5963 + }, + { + "epoch": 0.6180951393926832, + "grad_norm": 0.6579338908195496, + "learning_rate": 1.3446722471707632e-05, + "loss": 0.2305, + "step": 5964 + }, + { + "epoch": 0.6181987770753445, + "grad_norm": 0.6656500101089478, + "learning_rate": 1.344037994323896e-05, + "loss": 0.2265, + "step": 5965 + }, + { + "epoch": 0.618302414758006, + "grad_norm": 0.6222073435783386, + "learning_rate": 1.3434038153898086e-05, + "loss": 0.2064, + "step": 5966 + }, + { + "epoch": 0.6184060524406674, + "grad_norm": 0.6472442746162415, + "learning_rate": 1.3427697104399583e-05, + "loss": 0.2306, + "step": 5967 + }, + { + "epoch": 0.6185096901233288, + "grad_norm": 0.617219865322113, + "learning_rate": 1.3421356795457948e-05, + "loss": 0.2312, + "step": 5968 + }, + { + "epoch": 0.6186133278059902, + "grad_norm": 0.5383247137069702, + "learning_rate": 1.3415017227787613e-05, + "loss": 0.179, + "step": 5969 + }, + { + "epoch": 0.6187169654886516, + "grad_norm": 0.5395634770393372, + "learning_rate": 1.3408678402102892e-05, + "loss": 0.1796, + "step": 5970 + }, + { + "epoch": 0.618820603171313, + "grad_norm": 0.5641382932662964, + "learning_rate": 1.340234031911805e-05, + "loss": 0.1987, + "step": 5971 + }, + { + "epoch": 0.6189242408539745, + "grad_norm": 0.49503156542778015, + "learning_rate": 1.3396002979547236e-05, + "loss": 0.1857, + "step": 5972 + }, + { + "epoch": 0.6190278785366359, + "grad_norm": 0.5869657397270203, + "learning_rate": 1.3389666384104544e-05, + "loss": 0.2176, + "step": 5973 + }, + { + "epoch": 0.6191315162192973, + "grad_norm": 0.582115113735199, + "learning_rate": 1.3383330533503971e-05, + "loss": 0.2017, + "step": 5974 + }, + { + "epoch": 0.6192351539019587, + "grad_norm": 0.5133143663406372, + "learning_rate": 1.3376995428459421e-05, + "loss": 0.1893, + "step": 5975 + }, + { + "epoch": 0.6193387915846201, + "grad_norm": 0.5100739598274231, + "learning_rate": 1.3370661069684739e-05, + "loss": 0.1676, + "step": 5976 + }, + { + "epoch": 0.6194424292672815, + "grad_norm": 0.5148884057998657, + "learning_rate": 1.3364327457893654e-05, + "loss": 0.1661, + "step": 5977 + }, + { + "epoch": 0.619546066949943, + "grad_norm": 0.6616286635398865, + "learning_rate": 1.3357994593799834e-05, + "loss": 0.2332, + "step": 5978 + }, + { + "epoch": 0.6196497046326044, + "grad_norm": 0.6580483913421631, + "learning_rate": 1.3351662478116867e-05, + "loss": 0.2324, + "step": 5979 + }, + { + "epoch": 0.6197533423152658, + "grad_norm": 0.5478088855743408, + "learning_rate": 1.3345331111558233e-05, + "loss": 0.1849, + "step": 5980 + }, + { + "epoch": 0.6198569799979272, + "grad_norm": 0.5173262357711792, + "learning_rate": 1.3339000494837348e-05, + "loss": 0.1809, + "step": 5981 + }, + { + "epoch": 0.6199606176805886, + "grad_norm": 0.6629555225372314, + "learning_rate": 1.3332670628667523e-05, + "loss": 0.2339, + "step": 5982 + }, + { + "epoch": 0.62006425536325, + "grad_norm": 0.5579218864440918, + "learning_rate": 1.3326341513762014e-05, + "loss": 0.1951, + "step": 5983 + }, + { + "epoch": 0.6201678930459115, + "grad_norm": 0.608124315738678, + "learning_rate": 1.3320013150833971e-05, + "loss": 0.1826, + "step": 5984 + }, + { + "epoch": 0.6202715307285729, + "grad_norm": 0.6063987016677856, + "learning_rate": 1.3313685540596452e-05, + "loss": 0.181, + "step": 5985 + }, + { + "epoch": 0.6203751684112343, + "grad_norm": 0.5893003344535828, + "learning_rate": 1.3307358683762469e-05, + "loss": 0.2025, + "step": 5986 + }, + { + "epoch": 0.6204788060938957, + "grad_norm": 0.6283261775970459, + "learning_rate": 1.3301032581044887e-05, + "loss": 0.1977, + "step": 5987 + }, + { + "epoch": 0.6205824437765571, + "grad_norm": 0.6476390361785889, + "learning_rate": 1.3294707233156548e-05, + "loss": 0.2452, + "step": 5988 + }, + { + "epoch": 0.6206860814592186, + "grad_norm": 0.6753994226455688, + "learning_rate": 1.3288382640810183e-05, + "loss": 0.2337, + "step": 5989 + }, + { + "epoch": 0.62078971914188, + "grad_norm": 0.5716283321380615, + "learning_rate": 1.3282058804718415e-05, + "loss": 0.1922, + "step": 5990 + }, + { + "epoch": 0.6208933568245414, + "grad_norm": 0.5066370964050293, + "learning_rate": 1.3275735725593827e-05, + "loss": 0.179, + "step": 5991 + }, + { + "epoch": 0.6209969945072028, + "grad_norm": 0.598387598991394, + "learning_rate": 1.3269413404148878e-05, + "loss": 0.2116, + "step": 5992 + }, + { + "epoch": 0.6211006321898642, + "grad_norm": 0.5515682101249695, + "learning_rate": 1.3263091841095963e-05, + "loss": 0.186, + "step": 5993 + }, + { + "epoch": 0.6212042698725256, + "grad_norm": 0.578323483467102, + "learning_rate": 1.3256771037147393e-05, + "loss": 0.185, + "step": 5994 + }, + { + "epoch": 0.6213079075551871, + "grad_norm": 0.6118210554122925, + "learning_rate": 1.3250450993015375e-05, + "loss": 0.2163, + "step": 5995 + }, + { + "epoch": 0.6214115452378485, + "grad_norm": 0.6113288998603821, + "learning_rate": 1.324413170941205e-05, + "loss": 0.2106, + "step": 5996 + }, + { + "epoch": 0.6215151829205099, + "grad_norm": 0.4464210569858551, + "learning_rate": 1.323781318704946e-05, + "loss": 0.1524, + "step": 5997 + }, + { + "epoch": 0.6216188206031713, + "grad_norm": 0.5403082966804504, + "learning_rate": 1.3231495426639565e-05, + "loss": 0.1801, + "step": 5998 + }, + { + "epoch": 0.6217224582858327, + "grad_norm": 0.5802052617073059, + "learning_rate": 1.3225178428894252e-05, + "loss": 0.2067, + "step": 5999 + }, + { + "epoch": 0.6218260959684941, + "grad_norm": 0.6188898086547852, + "learning_rate": 1.3218862194525293e-05, + "loss": 0.2286, + "step": 6000 + }, + { + "epoch": 0.6219297336511556, + "grad_norm": 0.6729816794395447, + "learning_rate": 1.3212546724244407e-05, + "loss": 0.2219, + "step": 6001 + }, + { + "epoch": 0.622033371333817, + "grad_norm": 0.5771058797836304, + "learning_rate": 1.3206232018763201e-05, + "loss": 0.2049, + "step": 6002 + }, + { + "epoch": 0.6221370090164784, + "grad_norm": 0.5784742832183838, + "learning_rate": 1.3199918078793205e-05, + "loss": 0.1952, + "step": 6003 + }, + { + "epoch": 0.6222406466991398, + "grad_norm": 0.5525607466697693, + "learning_rate": 1.3193604905045881e-05, + "loss": 0.1839, + "step": 6004 + }, + { + "epoch": 0.6223442843818012, + "grad_norm": 0.587228000164032, + "learning_rate": 1.3187292498232567e-05, + "loss": 0.2023, + "step": 6005 + }, + { + "epoch": 0.6224479220644626, + "grad_norm": 0.6367173790931702, + "learning_rate": 1.3180980859064549e-05, + "loss": 0.1923, + "step": 6006 + }, + { + "epoch": 0.6225515597471241, + "grad_norm": 0.5229629874229431, + "learning_rate": 1.3174669988253002e-05, + "loss": 0.1719, + "step": 6007 + }, + { + "epoch": 0.6226551974297855, + "grad_norm": 0.572173535823822, + "learning_rate": 1.3168359886509032e-05, + "loss": 0.1868, + "step": 6008 + }, + { + "epoch": 0.6227588351124469, + "grad_norm": 0.5453316569328308, + "learning_rate": 1.3162050554543655e-05, + "loss": 0.2008, + "step": 6009 + }, + { + "epoch": 0.6228624727951083, + "grad_norm": 0.5589512586593628, + "learning_rate": 1.3155741993067782e-05, + "loss": 0.1805, + "step": 6010 + }, + { + "epoch": 0.6229661104777697, + "grad_norm": 0.582014799118042, + "learning_rate": 1.3149434202792272e-05, + "loss": 0.199, + "step": 6011 + }, + { + "epoch": 0.6230697481604311, + "grad_norm": 0.5078872442245483, + "learning_rate": 1.3143127184427863e-05, + "loss": 0.1789, + "step": 6012 + }, + { + "epoch": 0.6231733858430926, + "grad_norm": 0.5509869456291199, + "learning_rate": 1.3136820938685218e-05, + "loss": 0.1822, + "step": 6013 + }, + { + "epoch": 0.623277023525754, + "grad_norm": 0.619751513004303, + "learning_rate": 1.3130515466274929e-05, + "loss": 0.1909, + "step": 6014 + }, + { + "epoch": 0.6233806612084154, + "grad_norm": 0.5621147751808167, + "learning_rate": 1.3124210767907472e-05, + "loss": 0.1961, + "step": 6015 + }, + { + "epoch": 0.6234842988910768, + "grad_norm": 0.686894953250885, + "learning_rate": 1.3117906844293265e-05, + "loss": 0.2095, + "step": 6016 + }, + { + "epoch": 0.6235879365737382, + "grad_norm": 0.6219806671142578, + "learning_rate": 1.3111603696142608e-05, + "loss": 0.2168, + "step": 6017 + }, + { + "epoch": 0.6236915742563997, + "grad_norm": 0.5394052267074585, + "learning_rate": 1.310530132416574e-05, + "loss": 0.1844, + "step": 6018 + }, + { + "epoch": 0.6237952119390611, + "grad_norm": 0.5847477316856384, + "learning_rate": 1.3098999729072808e-05, + "loss": 0.2056, + "step": 6019 + }, + { + "epoch": 0.6238988496217225, + "grad_norm": 0.572024405002594, + "learning_rate": 1.3092698911573851e-05, + "loss": 0.2052, + "step": 6020 + }, + { + "epoch": 0.6240024873043839, + "grad_norm": 0.6376693844795227, + "learning_rate": 1.308639887237885e-05, + "loss": 0.2321, + "step": 6021 + }, + { + "epoch": 0.6241061249870453, + "grad_norm": 0.6629868745803833, + "learning_rate": 1.3080099612197668e-05, + "loss": 0.2018, + "step": 6022 + }, + { + "epoch": 0.6242097626697067, + "grad_norm": 0.577773928642273, + "learning_rate": 1.3073801131740104e-05, + "loss": 0.1968, + "step": 6023 + }, + { + "epoch": 0.6243134003523682, + "grad_norm": 0.6478099226951599, + "learning_rate": 1.306750343171587e-05, + "loss": 0.2023, + "step": 6024 + }, + { + "epoch": 0.6244170380350296, + "grad_norm": 0.6318678259849548, + "learning_rate": 1.3061206512834566e-05, + "loss": 0.2024, + "step": 6025 + }, + { + "epoch": 0.624520675717691, + "grad_norm": 0.5699331760406494, + "learning_rate": 1.305491037580573e-05, + "loss": 0.1704, + "step": 6026 + }, + { + "epoch": 0.6246243134003524, + "grad_norm": 0.4767056405544281, + "learning_rate": 1.3048615021338793e-05, + "loss": 0.1338, + "step": 6027 + }, + { + "epoch": 0.6247279510830138, + "grad_norm": 0.5985897779464722, + "learning_rate": 1.3042320450143107e-05, + "loss": 0.186, + "step": 6028 + }, + { + "epoch": 0.6248315887656752, + "grad_norm": 0.5831640958786011, + "learning_rate": 1.3036026662927945e-05, + "loss": 0.1772, + "step": 6029 + }, + { + "epoch": 0.6249352264483367, + "grad_norm": 0.5923576951026917, + "learning_rate": 1.3029733660402468e-05, + "loss": 0.2073, + "step": 6030 + }, + { + "epoch": 0.6250388641309981, + "grad_norm": 0.5854648351669312, + "learning_rate": 1.3023441443275769e-05, + "loss": 0.1938, + "step": 6031 + }, + { + "epoch": 0.6251425018136595, + "grad_norm": 0.6780679821968079, + "learning_rate": 1.3017150012256839e-05, + "loss": 0.2207, + "step": 6032 + }, + { + "epoch": 0.6252461394963209, + "grad_norm": 0.6159749031066895, + "learning_rate": 1.301085936805459e-05, + "loss": 0.2198, + "step": 6033 + }, + { + "epoch": 0.6253497771789823, + "grad_norm": 0.5998734831809998, + "learning_rate": 1.3004569511377847e-05, + "loss": 0.1759, + "step": 6034 + }, + { + "epoch": 0.6254534148616437, + "grad_norm": 0.5629125237464905, + "learning_rate": 1.2998280442935332e-05, + "loss": 0.1934, + "step": 6035 + }, + { + "epoch": 0.6255570525443052, + "grad_norm": 0.6259785890579224, + "learning_rate": 1.2991992163435698e-05, + "loss": 0.2059, + "step": 6036 + }, + { + "epoch": 0.6256606902269666, + "grad_norm": 0.6138043403625488, + "learning_rate": 1.2985704673587489e-05, + "loss": 0.1864, + "step": 6037 + }, + { + "epoch": 0.625764327909628, + "grad_norm": 0.5702097415924072, + "learning_rate": 1.2979417974099171e-05, + "loss": 0.2021, + "step": 6038 + }, + { + "epoch": 0.6258679655922894, + "grad_norm": 0.5695834755897522, + "learning_rate": 1.297313206567913e-05, + "loss": 0.2073, + "step": 6039 + }, + { + "epoch": 0.6259716032749508, + "grad_norm": 0.5235719680786133, + "learning_rate": 1.2966846949035638e-05, + "loss": 0.1831, + "step": 6040 + }, + { + "epoch": 0.6260752409576121, + "grad_norm": 0.7065122723579407, + "learning_rate": 1.2960562624876905e-05, + "loss": 0.2351, + "step": 6041 + }, + { + "epoch": 0.6261788786402736, + "grad_norm": 0.5279736518859863, + "learning_rate": 1.2954279093911022e-05, + "loss": 0.1734, + "step": 6042 + }, + { + "epoch": 0.626282516322935, + "grad_norm": 0.6119756698608398, + "learning_rate": 1.2947996356846022e-05, + "loss": 0.1967, + "step": 6043 + }, + { + "epoch": 0.6263861540055964, + "grad_norm": 0.5212472081184387, + "learning_rate": 1.2941714414389836e-05, + "loss": 0.1729, + "step": 6044 + }, + { + "epoch": 0.6264897916882578, + "grad_norm": 0.5788500308990479, + "learning_rate": 1.2935433267250291e-05, + "loss": 0.2174, + "step": 6045 + }, + { + "epoch": 0.6265934293709192, + "grad_norm": 0.6433359384536743, + "learning_rate": 1.2929152916135142e-05, + "loss": 0.2228, + "step": 6046 + }, + { + "epoch": 0.6266970670535806, + "grad_norm": 0.5505836606025696, + "learning_rate": 1.292287336175206e-05, + "loss": 0.1746, + "step": 6047 + }, + { + "epoch": 0.626800704736242, + "grad_norm": 0.6313563585281372, + "learning_rate": 1.2916594604808595e-05, + "loss": 0.205, + "step": 6048 + }, + { + "epoch": 0.6269043424189035, + "grad_norm": 0.6382592916488647, + "learning_rate": 1.291031664601225e-05, + "loss": 0.2262, + "step": 6049 + }, + { + "epoch": 0.6270079801015649, + "grad_norm": 0.5864508748054504, + "learning_rate": 1.2904039486070393e-05, + "loss": 0.2127, + "step": 6050 + }, + { + "epoch": 0.6271116177842263, + "grad_norm": 0.6004242897033691, + "learning_rate": 1.2897763125690337e-05, + "loss": 0.2149, + "step": 6051 + }, + { + "epoch": 0.6272152554668877, + "grad_norm": 0.6075720191001892, + "learning_rate": 1.2891487565579301e-05, + "loss": 0.2105, + "step": 6052 + }, + { + "epoch": 0.6273188931495491, + "grad_norm": 0.6497997641563416, + "learning_rate": 1.2885212806444386e-05, + "loss": 0.2003, + "step": 6053 + }, + { + "epoch": 0.6274225308322106, + "grad_norm": 0.6276025176048279, + "learning_rate": 1.287893884899264e-05, + "loss": 0.2091, + "step": 6054 + }, + { + "epoch": 0.627526168514872, + "grad_norm": 0.6251934766769409, + "learning_rate": 1.2872665693930989e-05, + "loss": 0.2007, + "step": 6055 + }, + { + "epoch": 0.6276298061975334, + "grad_norm": 0.5782637596130371, + "learning_rate": 1.2866393341966283e-05, + "loss": 0.2139, + "step": 6056 + }, + { + "epoch": 0.6277334438801948, + "grad_norm": 0.5191376209259033, + "learning_rate": 1.2860121793805295e-05, + "loss": 0.1795, + "step": 6057 + }, + { + "epoch": 0.6278370815628562, + "grad_norm": 0.6319491267204285, + "learning_rate": 1.2853851050154683e-05, + "loss": 0.2185, + "step": 6058 + }, + { + "epoch": 0.6279407192455176, + "grad_norm": 0.5615255236625671, + "learning_rate": 1.2847581111721029e-05, + "loss": 0.1724, + "step": 6059 + }, + { + "epoch": 0.6280443569281791, + "grad_norm": 0.6250758171081543, + "learning_rate": 1.2841311979210804e-05, + "loss": 0.2187, + "step": 6060 + }, + { + "epoch": 0.6281479946108405, + "grad_norm": 0.5791369676589966, + "learning_rate": 1.2835043653330423e-05, + "loss": 0.1838, + "step": 6061 + }, + { + "epoch": 0.6282516322935019, + "grad_norm": 0.6014963984489441, + "learning_rate": 1.2828776134786192e-05, + "loss": 0.2082, + "step": 6062 + }, + { + "epoch": 0.6283552699761633, + "grad_norm": 0.6349720358848572, + "learning_rate": 1.2822509424284308e-05, + "loss": 0.1994, + "step": 6063 + }, + { + "epoch": 0.6284589076588247, + "grad_norm": 0.5179983377456665, + "learning_rate": 1.281624352253091e-05, + "loss": 0.1738, + "step": 6064 + }, + { + "epoch": 0.6285625453414861, + "grad_norm": 0.5795208811759949, + "learning_rate": 1.2809978430232024e-05, + "loss": 0.1772, + "step": 6065 + }, + { + "epoch": 0.6286661830241476, + "grad_norm": 0.5753578543663025, + "learning_rate": 1.2803714148093586e-05, + "loss": 0.186, + "step": 6066 + }, + { + "epoch": 0.628769820706809, + "grad_norm": 0.5059354305267334, + "learning_rate": 1.2797450676821459e-05, + "loss": 0.1857, + "step": 6067 + }, + { + "epoch": 0.6288734583894704, + "grad_norm": 0.5674526691436768, + "learning_rate": 1.279118801712139e-05, + "loss": 0.2047, + "step": 6068 + }, + { + "epoch": 0.6289770960721318, + "grad_norm": 0.5886660814285278, + "learning_rate": 1.2784926169699053e-05, + "loss": 0.189, + "step": 6069 + }, + { + "epoch": 0.6290807337547932, + "grad_norm": 0.5672776103019714, + "learning_rate": 1.2778665135260009e-05, + "loss": 0.2002, + "step": 6070 + }, + { + "epoch": 0.6291843714374546, + "grad_norm": 0.6054661273956299, + "learning_rate": 1.277240491450976e-05, + "loss": 0.2121, + "step": 6071 + }, + { + "epoch": 0.6292880091201161, + "grad_norm": 0.6700482964515686, + "learning_rate": 1.2766145508153689e-05, + "loss": 0.2335, + "step": 6072 + }, + { + "epoch": 0.6293916468027775, + "grad_norm": 0.5708560943603516, + "learning_rate": 1.275988691689709e-05, + "loss": 0.1988, + "step": 6073 + }, + { + "epoch": 0.6294952844854389, + "grad_norm": 0.5475946068763733, + "learning_rate": 1.2753629141445187e-05, + "loss": 0.1719, + "step": 6074 + }, + { + "epoch": 0.6295989221681003, + "grad_norm": 0.6204192638397217, + "learning_rate": 1.2747372182503085e-05, + "loss": 0.1957, + "step": 6075 + }, + { + "epoch": 0.6297025598507617, + "grad_norm": 0.4806893765926361, + "learning_rate": 1.2741116040775805e-05, + "loss": 0.1546, + "step": 6076 + }, + { + "epoch": 0.6298061975334232, + "grad_norm": 0.562664270401001, + "learning_rate": 1.2734860716968295e-05, + "loss": 0.1696, + "step": 6077 + }, + { + "epoch": 0.6299098352160846, + "grad_norm": 0.6378372311592102, + "learning_rate": 1.2728606211785381e-05, + "loss": 0.2234, + "step": 6078 + }, + { + "epoch": 0.630013472898746, + "grad_norm": 0.6317660212516785, + "learning_rate": 1.2722352525931818e-05, + "loss": 0.1851, + "step": 6079 + }, + { + "epoch": 0.6301171105814074, + "grad_norm": 0.6436264514923096, + "learning_rate": 1.2716099660112253e-05, + "loss": 0.2365, + "step": 6080 + }, + { + "epoch": 0.6302207482640688, + "grad_norm": 0.529187798500061, + "learning_rate": 1.2709847615031258e-05, + "loss": 0.1745, + "step": 6081 + }, + { + "epoch": 0.6303243859467302, + "grad_norm": 0.587397575378418, + "learning_rate": 1.2703596391393305e-05, + "loss": 0.1912, + "step": 6082 + }, + { + "epoch": 0.6304280236293917, + "grad_norm": 0.597055971622467, + "learning_rate": 1.2697345989902765e-05, + "loss": 0.2151, + "step": 6083 + }, + { + "epoch": 0.6305316613120531, + "grad_norm": 0.6372289657592773, + "learning_rate": 1.2691096411263932e-05, + "loss": 0.1959, + "step": 6084 + }, + { + "epoch": 0.6306352989947145, + "grad_norm": 0.6131249666213989, + "learning_rate": 1.2684847656180982e-05, + "loss": 0.1865, + "step": 6085 + }, + { + "epoch": 0.6307389366773759, + "grad_norm": 0.5528261065483093, + "learning_rate": 1.2678599725358031e-05, + "loss": 0.1794, + "step": 6086 + }, + { + "epoch": 0.6308425743600373, + "grad_norm": 0.5730780959129333, + "learning_rate": 1.2672352619499089e-05, + "loss": 0.1803, + "step": 6087 + }, + { + "epoch": 0.6309462120426987, + "grad_norm": 0.6822916865348816, + "learning_rate": 1.2666106339308053e-05, + "loss": 0.2319, + "step": 6088 + }, + { + "epoch": 0.6310498497253602, + "grad_norm": 0.6828444600105286, + "learning_rate": 1.2659860885488761e-05, + "loss": 0.2448, + "step": 6089 + }, + { + "epoch": 0.6311534874080216, + "grad_norm": 0.6281768083572388, + "learning_rate": 1.2653616258744929e-05, + "loss": 0.1941, + "step": 6090 + }, + { + "epoch": 0.631257125090683, + "grad_norm": 0.5856252312660217, + "learning_rate": 1.2647372459780196e-05, + "loss": 0.1912, + "step": 6091 + }, + { + "epoch": 0.6313607627733444, + "grad_norm": 0.5877519249916077, + "learning_rate": 1.264112948929811e-05, + "loss": 0.2097, + "step": 6092 + }, + { + "epoch": 0.6314644004560058, + "grad_norm": 0.6103312969207764, + "learning_rate": 1.263488734800211e-05, + "loss": 0.2068, + "step": 6093 + }, + { + "epoch": 0.6315680381386672, + "grad_norm": 0.5803897380828857, + "learning_rate": 1.2628646036595558e-05, + "loss": 0.2086, + "step": 6094 + }, + { + "epoch": 0.6316716758213287, + "grad_norm": 0.5573654770851135, + "learning_rate": 1.2622405555781705e-05, + "loss": 0.1879, + "step": 6095 + }, + { + "epoch": 0.6317753135039901, + "grad_norm": 0.4737562835216522, + "learning_rate": 1.2616165906263729e-05, + "loss": 0.1428, + "step": 6096 + }, + { + "epoch": 0.6318789511866515, + "grad_norm": 0.6127933263778687, + "learning_rate": 1.2609927088744702e-05, + "loss": 0.2075, + "step": 6097 + }, + { + "epoch": 0.6319825888693129, + "grad_norm": 0.5848706364631653, + "learning_rate": 1.2603689103927596e-05, + "loss": 0.225, + "step": 6098 + }, + { + "epoch": 0.6320862265519743, + "grad_norm": 0.6001607775688171, + "learning_rate": 1.2597451952515312e-05, + "loss": 0.2076, + "step": 6099 + }, + { + "epoch": 0.6321898642346357, + "grad_norm": 0.5365419983863831, + "learning_rate": 1.2591215635210632e-05, + "loss": 0.1666, + "step": 6100 + }, + { + "epoch": 0.6322935019172972, + "grad_norm": 0.5268027186393738, + "learning_rate": 1.2584980152716253e-05, + "loss": 0.1595, + "step": 6101 + }, + { + "epoch": 0.6323971395999586, + "grad_norm": 0.4245770573616028, + "learning_rate": 1.2578745505734789e-05, + "loss": 0.1364, + "step": 6102 + }, + { + "epoch": 0.63250077728262, + "grad_norm": 0.6496620774269104, + "learning_rate": 1.2572511694968741e-05, + "loss": 0.2125, + "step": 6103 + }, + { + "epoch": 0.6326044149652814, + "grad_norm": 0.5363298654556274, + "learning_rate": 1.2566278721120536e-05, + "loss": 0.1786, + "step": 6104 + }, + { + "epoch": 0.6327080526479428, + "grad_norm": 0.6401138305664062, + "learning_rate": 1.2560046584892478e-05, + "loss": 0.2154, + "step": 6105 + }, + { + "epoch": 0.6328116903306042, + "grad_norm": 0.5801215767860413, + "learning_rate": 1.255381528698681e-05, + "loss": 0.1994, + "step": 6106 + }, + { + "epoch": 0.6329153280132657, + "grad_norm": 0.5988919734954834, + "learning_rate": 1.2547584828105665e-05, + "loss": 0.2116, + "step": 6107 + }, + { + "epoch": 0.6330189656959271, + "grad_norm": 0.6742331981658936, + "learning_rate": 1.2541355208951066e-05, + "loss": 0.2405, + "step": 6108 + }, + { + "epoch": 0.6331226033785885, + "grad_norm": 0.5302967429161072, + "learning_rate": 1.2535126430224977e-05, + "loss": 0.1704, + "step": 6109 + }, + { + "epoch": 0.6332262410612499, + "grad_norm": 0.530825138092041, + "learning_rate": 1.2528898492629233e-05, + "loss": 0.1832, + "step": 6110 + }, + { + "epoch": 0.6333298787439113, + "grad_norm": 0.6048321723937988, + "learning_rate": 1.2522671396865592e-05, + "loss": 0.199, + "step": 6111 + }, + { + "epoch": 0.6334335164265728, + "grad_norm": 0.6834918260574341, + "learning_rate": 1.2516445143635723e-05, + "loss": 0.244, + "step": 6112 + }, + { + "epoch": 0.6335371541092342, + "grad_norm": 0.49825942516326904, + "learning_rate": 1.2510219733641171e-05, + "loss": 0.168, + "step": 6113 + }, + { + "epoch": 0.6336407917918956, + "grad_norm": 0.5733488202095032, + "learning_rate": 1.2503995167583423e-05, + "loss": 0.1786, + "step": 6114 + }, + { + "epoch": 0.633744429474557, + "grad_norm": 0.5585787296295166, + "learning_rate": 1.2497771446163842e-05, + "loss": 0.1995, + "step": 6115 + }, + { + "epoch": 0.6338480671572184, + "grad_norm": 0.5882747769355774, + "learning_rate": 1.249154857008371e-05, + "loss": 0.188, + "step": 6116 + }, + { + "epoch": 0.6339517048398797, + "grad_norm": 0.5983887314796448, + "learning_rate": 1.2485326540044223e-05, + "loss": 0.1747, + "step": 6117 + }, + { + "epoch": 0.6340553425225411, + "grad_norm": 0.6462018489837646, + "learning_rate": 1.2479105356746453e-05, + "loss": 0.2252, + "step": 6118 + }, + { + "epoch": 0.6341589802052026, + "grad_norm": 0.6256545186042786, + "learning_rate": 1.2472885020891403e-05, + "loss": 0.2436, + "step": 6119 + }, + { + "epoch": 0.634262617887864, + "grad_norm": 0.5571950078010559, + "learning_rate": 1.246666553317996e-05, + "loss": 0.2127, + "step": 6120 + }, + { + "epoch": 0.6343662555705254, + "grad_norm": 0.6304576396942139, + "learning_rate": 1.2460446894312938e-05, + "loss": 0.196, + "step": 6121 + }, + { + "epoch": 0.6344698932531868, + "grad_norm": 0.5518038272857666, + "learning_rate": 1.2454229104991046e-05, + "loss": 0.1952, + "step": 6122 + }, + { + "epoch": 0.6345735309358482, + "grad_norm": 0.5594738125801086, + "learning_rate": 1.2448012165914879e-05, + "loss": 0.1723, + "step": 6123 + }, + { + "epoch": 0.6346771686185096, + "grad_norm": 0.5120114684104919, + "learning_rate": 1.244179607778497e-05, + "loss": 0.1828, + "step": 6124 + }, + { + "epoch": 0.6347808063011711, + "grad_norm": 0.615667462348938, + "learning_rate": 1.2435580841301723e-05, + "loss": 0.2202, + "step": 6125 + }, + { + "epoch": 0.6348844439838325, + "grad_norm": 0.6665404438972473, + "learning_rate": 1.2429366457165464e-05, + "loss": 0.2206, + "step": 6126 + }, + { + "epoch": 0.6349880816664939, + "grad_norm": 0.6501171588897705, + "learning_rate": 1.2423152926076434e-05, + "loss": 0.191, + "step": 6127 + }, + { + "epoch": 0.6350917193491553, + "grad_norm": 0.5853009819984436, + "learning_rate": 1.2416940248734748e-05, + "loss": 0.2092, + "step": 6128 + }, + { + "epoch": 0.6351953570318167, + "grad_norm": 0.6276318430900574, + "learning_rate": 1.2410728425840452e-05, + "loss": 0.2164, + "step": 6129 + }, + { + "epoch": 0.6352989947144781, + "grad_norm": 0.6532617211341858, + "learning_rate": 1.2404517458093471e-05, + "loss": 0.2066, + "step": 6130 + }, + { + "epoch": 0.6354026323971396, + "grad_norm": 0.6746910810470581, + "learning_rate": 1.2398307346193659e-05, + "loss": 0.2314, + "step": 6131 + }, + { + "epoch": 0.635506270079801, + "grad_norm": 0.6130415797233582, + "learning_rate": 1.239209809084076e-05, + "loss": 0.2115, + "step": 6132 + }, + { + "epoch": 0.6356099077624624, + "grad_norm": 0.6588493585586548, + "learning_rate": 1.2385889692734416e-05, + "loss": 0.2204, + "step": 6133 + }, + { + "epoch": 0.6357135454451238, + "grad_norm": 0.5292779207229614, + "learning_rate": 1.2379682152574195e-05, + "loss": 0.2022, + "step": 6134 + }, + { + "epoch": 0.6358171831277852, + "grad_norm": 0.4803861975669861, + "learning_rate": 1.2373475471059538e-05, + "loss": 0.1608, + "step": 6135 + }, + { + "epoch": 0.6359208208104467, + "grad_norm": 0.5866460204124451, + "learning_rate": 1.2367269648889804e-05, + "loss": 0.199, + "step": 6136 + }, + { + "epoch": 0.6360244584931081, + "grad_norm": 0.6466445326805115, + "learning_rate": 1.2361064686764265e-05, + "loss": 0.2038, + "step": 6137 + }, + { + "epoch": 0.6361280961757695, + "grad_norm": 0.5443708300590515, + "learning_rate": 1.2354860585382084e-05, + "loss": 0.1699, + "step": 6138 + }, + { + "epoch": 0.6362317338584309, + "grad_norm": 0.638749361038208, + "learning_rate": 1.234865734544233e-05, + "loss": 0.1992, + "step": 6139 + }, + { + "epoch": 0.6363353715410923, + "grad_norm": 0.5929224491119385, + "learning_rate": 1.2342454967643964e-05, + "loss": 0.1935, + "step": 6140 + }, + { + "epoch": 0.6364390092237537, + "grad_norm": 0.6019617915153503, + "learning_rate": 1.2336253452685871e-05, + "loss": 0.2156, + "step": 6141 + }, + { + "epoch": 0.6365426469064152, + "grad_norm": 0.6148455142974854, + "learning_rate": 1.2330052801266832e-05, + "loss": 0.1975, + "step": 6142 + }, + { + "epoch": 0.6366462845890766, + "grad_norm": 0.598908543586731, + "learning_rate": 1.232385301408551e-05, + "loss": 0.2119, + "step": 6143 + }, + { + "epoch": 0.636749922271738, + "grad_norm": 0.6222452521324158, + "learning_rate": 1.231765409184051e-05, + "loss": 0.1918, + "step": 6144 + }, + { + "epoch": 0.6368535599543994, + "grad_norm": 0.6636112928390503, + "learning_rate": 1.2311456035230291e-05, + "loss": 0.194, + "step": 6145 + }, + { + "epoch": 0.6369571976370608, + "grad_norm": 0.5806705951690674, + "learning_rate": 1.230525884495326e-05, + "loss": 0.2072, + "step": 6146 + }, + { + "epoch": 0.6370608353197222, + "grad_norm": 0.6016364097595215, + "learning_rate": 1.2299062521707702e-05, + "loss": 0.2144, + "step": 6147 + }, + { + "epoch": 0.6371644730023837, + "grad_norm": 0.7070096135139465, + "learning_rate": 1.2292867066191803e-05, + "loss": 0.2558, + "step": 6148 + }, + { + "epoch": 0.6372681106850451, + "grad_norm": 0.6243141889572144, + "learning_rate": 1.2286672479103672e-05, + "loss": 0.1832, + "step": 6149 + }, + { + "epoch": 0.6373717483677065, + "grad_norm": 0.5644826292991638, + "learning_rate": 1.2280478761141286e-05, + "loss": 0.2061, + "step": 6150 + }, + { + "epoch": 0.6374753860503679, + "grad_norm": 0.5745707154273987, + "learning_rate": 1.2274285913002553e-05, + "loss": 0.2043, + "step": 6151 + }, + { + "epoch": 0.6375790237330293, + "grad_norm": 0.5755109190940857, + "learning_rate": 1.226809393538528e-05, + "loss": 0.1898, + "step": 6152 + }, + { + "epoch": 0.6376826614156907, + "grad_norm": 0.5627861618995667, + "learning_rate": 1.2261902828987155e-05, + "loss": 0.1917, + "step": 6153 + }, + { + "epoch": 0.6377862990983522, + "grad_norm": 0.5213609933853149, + "learning_rate": 1.22557125945058e-05, + "loss": 0.1626, + "step": 6154 + }, + { + "epoch": 0.6378899367810136, + "grad_norm": 0.5159831047058105, + "learning_rate": 1.22495232326387e-05, + "loss": 0.173, + "step": 6155 + }, + { + "epoch": 0.637993574463675, + "grad_norm": 0.5197933316230774, + "learning_rate": 1.224333474408328e-05, + "loss": 0.1644, + "step": 6156 + }, + { + "epoch": 0.6380972121463364, + "grad_norm": 0.6517233848571777, + "learning_rate": 1.2237147129536844e-05, + "loss": 0.2025, + "step": 6157 + }, + { + "epoch": 0.6382008498289978, + "grad_norm": 0.5894943475723267, + "learning_rate": 1.2230960389696595e-05, + "loss": 0.1898, + "step": 6158 + }, + { + "epoch": 0.6383044875116592, + "grad_norm": 0.6199367046356201, + "learning_rate": 1.2224774525259661e-05, + "loss": 0.2153, + "step": 6159 + }, + { + "epoch": 0.6384081251943207, + "grad_norm": 0.552657425403595, + "learning_rate": 1.2218589536923045e-05, + "loss": 0.1642, + "step": 6160 + }, + { + "epoch": 0.6385117628769821, + "grad_norm": 0.5651284456253052, + "learning_rate": 1.2212405425383655e-05, + "loss": 0.2049, + "step": 6161 + }, + { + "epoch": 0.6386154005596435, + "grad_norm": 0.6005743741989136, + "learning_rate": 1.2206222191338326e-05, + "loss": 0.1984, + "step": 6162 + }, + { + "epoch": 0.6387190382423049, + "grad_norm": 0.5854420065879822, + "learning_rate": 1.220003983548376e-05, + "loss": 0.2186, + "step": 6163 + }, + { + "epoch": 0.6388226759249663, + "grad_norm": 0.6057052612304688, + "learning_rate": 1.2193858358516585e-05, + "loss": 0.1816, + "step": 6164 + }, + { + "epoch": 0.6389263136076277, + "grad_norm": 0.5955823659896851, + "learning_rate": 1.2187677761133307e-05, + "loss": 0.2141, + "step": 6165 + }, + { + "epoch": 0.6390299512902892, + "grad_norm": 0.5140160322189331, + "learning_rate": 1.2181498044030358e-05, + "loss": 0.1802, + "step": 6166 + }, + { + "epoch": 0.6391335889729506, + "grad_norm": 0.4712127447128296, + "learning_rate": 1.217531920790406e-05, + "loss": 0.1374, + "step": 6167 + }, + { + "epoch": 0.639237226655612, + "grad_norm": 0.5993489027023315, + "learning_rate": 1.216914125345062e-05, + "loss": 0.202, + "step": 6168 + }, + { + "epoch": 0.6393408643382734, + "grad_norm": 0.6108299493789673, + "learning_rate": 1.2162964181366181e-05, + "loss": 0.1944, + "step": 6169 + }, + { + "epoch": 0.6394445020209348, + "grad_norm": 0.49651244282722473, + "learning_rate": 1.215678799234675e-05, + "loss": 0.1705, + "step": 6170 + }, + { + "epoch": 0.6395481397035963, + "grad_norm": 0.5781722068786621, + "learning_rate": 1.215061268708825e-05, + "loss": 0.1894, + "step": 6171 + }, + { + "epoch": 0.6396517773862577, + "grad_norm": 0.5969220995903015, + "learning_rate": 1.214443826628652e-05, + "loss": 0.2042, + "step": 6172 + }, + { + "epoch": 0.6397554150689191, + "grad_norm": 0.5631805658340454, + "learning_rate": 1.213826473063727e-05, + "loss": 0.2022, + "step": 6173 + }, + { + "epoch": 0.6398590527515805, + "grad_norm": 0.5786588191986084, + "learning_rate": 1.213209208083613e-05, + "loss": 0.2157, + "step": 6174 + }, + { + "epoch": 0.6399626904342419, + "grad_norm": 0.5629664659500122, + "learning_rate": 1.2125920317578625e-05, + "loss": 0.2047, + "step": 6175 + }, + { + "epoch": 0.6400663281169033, + "grad_norm": 0.6054370403289795, + "learning_rate": 1.2119749441560172e-05, + "loss": 0.203, + "step": 6176 + }, + { + "epoch": 0.6401699657995648, + "grad_norm": 0.6195040941238403, + "learning_rate": 1.211357945347611e-05, + "loss": 0.191, + "step": 6177 + }, + { + "epoch": 0.6402736034822262, + "grad_norm": 0.6448860168457031, + "learning_rate": 1.210741035402165e-05, + "loss": 0.2079, + "step": 6178 + }, + { + "epoch": 0.6403772411648876, + "grad_norm": 0.535200297832489, + "learning_rate": 1.2101242143891928e-05, + "loss": 0.1747, + "step": 6179 + }, + { + "epoch": 0.640480878847549, + "grad_norm": 0.5435940027236938, + "learning_rate": 1.2095074823781951e-05, + "loss": 0.1863, + "step": 6180 + }, + { + "epoch": 0.6405845165302104, + "grad_norm": 0.5571568608283997, + "learning_rate": 1.208890839438666e-05, + "loss": 0.1791, + "step": 6181 + }, + { + "epoch": 0.6406881542128718, + "grad_norm": 0.568340003490448, + "learning_rate": 1.2082742856400878e-05, + "loss": 0.206, + "step": 6182 + }, + { + "epoch": 0.6407917918955333, + "grad_norm": 0.598192572593689, + "learning_rate": 1.2076578210519316e-05, + "loss": 0.1983, + "step": 6183 + }, + { + "epoch": 0.6408954295781947, + "grad_norm": 0.6520498394966125, + "learning_rate": 1.207041445743661e-05, + "loss": 0.2388, + "step": 6184 + }, + { + "epoch": 0.6409990672608561, + "grad_norm": 0.6946018934249878, + "learning_rate": 1.2064251597847272e-05, + "loss": 0.2306, + "step": 6185 + }, + { + "epoch": 0.6411027049435175, + "grad_norm": 0.6726908683776855, + "learning_rate": 1.2058089632445724e-05, + "loss": 0.2149, + "step": 6186 + }, + { + "epoch": 0.6412063426261789, + "grad_norm": 0.5651692748069763, + "learning_rate": 1.2051928561926301e-05, + "loss": 0.1764, + "step": 6187 + }, + { + "epoch": 0.6413099803088403, + "grad_norm": 0.5629836916923523, + "learning_rate": 1.2045768386983206e-05, + "loss": 0.1805, + "step": 6188 + }, + { + "epoch": 0.6414136179915018, + "grad_norm": 0.5897392630577087, + "learning_rate": 1.2039609108310567e-05, + "loss": 0.1846, + "step": 6189 + }, + { + "epoch": 0.6415172556741632, + "grad_norm": 0.5797823071479797, + "learning_rate": 1.2033450726602393e-05, + "loss": 0.1657, + "step": 6190 + }, + { + "epoch": 0.6416208933568246, + "grad_norm": 0.5978760123252869, + "learning_rate": 1.2027293242552611e-05, + "loss": 0.1973, + "step": 6191 + }, + { + "epoch": 0.641724531039486, + "grad_norm": 0.5614368915557861, + "learning_rate": 1.2021136656855034e-05, + "loss": 0.1924, + "step": 6192 + }, + { + "epoch": 0.6418281687221473, + "grad_norm": 0.548904299736023, + "learning_rate": 1.2014980970203372e-05, + "loss": 0.1604, + "step": 6193 + }, + { + "epoch": 0.6419318064048087, + "grad_norm": 0.5700724720954895, + "learning_rate": 1.2008826183291244e-05, + "loss": 0.1799, + "step": 6194 + }, + { + "epoch": 0.6420354440874702, + "grad_norm": 0.5496739745140076, + "learning_rate": 1.2002672296812157e-05, + "loss": 0.2177, + "step": 6195 + }, + { + "epoch": 0.6421390817701316, + "grad_norm": 0.5980311632156372, + "learning_rate": 1.199651931145952e-05, + "loss": 0.1943, + "step": 6196 + }, + { + "epoch": 0.642242719452793, + "grad_norm": 0.587797999382019, + "learning_rate": 1.1990367227926653e-05, + "loss": 0.1994, + "step": 6197 + }, + { + "epoch": 0.6423463571354544, + "grad_norm": 0.6709038615226746, + "learning_rate": 1.1984216046906752e-05, + "loss": 0.2178, + "step": 6198 + }, + { + "epoch": 0.6424499948181158, + "grad_norm": 0.741703987121582, + "learning_rate": 1.197806576909293e-05, + "loss": 0.2407, + "step": 6199 + }, + { + "epoch": 0.6425536325007772, + "grad_norm": 0.6239121556282043, + "learning_rate": 1.1971916395178178e-05, + "loss": 0.1947, + "step": 6200 + }, + { + "epoch": 0.6426572701834387, + "grad_norm": 0.4833756685256958, + "learning_rate": 1.1965767925855412e-05, + "loss": 0.1475, + "step": 6201 + }, + { + "epoch": 0.6427609078661001, + "grad_norm": 0.5679334402084351, + "learning_rate": 1.195962036181743e-05, + "loss": 0.1836, + "step": 6202 + }, + { + "epoch": 0.6428645455487615, + "grad_norm": 0.7786445021629333, + "learning_rate": 1.1953473703756919e-05, + "loss": 0.2027, + "step": 6203 + }, + { + "epoch": 0.6429681832314229, + "grad_norm": 0.6195889115333557, + "learning_rate": 1.1947327952366492e-05, + "loss": 0.1896, + "step": 6204 + }, + { + "epoch": 0.6430718209140843, + "grad_norm": 0.6255584359169006, + "learning_rate": 1.1941183108338623e-05, + "loss": 0.2115, + "step": 6205 + }, + { + "epoch": 0.6431754585967457, + "grad_norm": 0.5589773654937744, + "learning_rate": 1.1935039172365714e-05, + "loss": 0.1666, + "step": 6206 + }, + { + "epoch": 0.6432790962794072, + "grad_norm": 0.6697091460227966, + "learning_rate": 1.1928896145140066e-05, + "loss": 0.2285, + "step": 6207 + }, + { + "epoch": 0.6433827339620686, + "grad_norm": 0.5722152590751648, + "learning_rate": 1.1922754027353843e-05, + "loss": 0.1814, + "step": 6208 + }, + { + "epoch": 0.64348637164473, + "grad_norm": 0.6211981177330017, + "learning_rate": 1.1916612819699145e-05, + "loss": 0.1856, + "step": 6209 + }, + { + "epoch": 0.6435900093273914, + "grad_norm": 0.6125348210334778, + "learning_rate": 1.1910472522867947e-05, + "loss": 0.1893, + "step": 6210 + }, + { + "epoch": 0.6436936470100528, + "grad_norm": 0.5103516578674316, + "learning_rate": 1.1904333137552124e-05, + "loss": 0.1666, + "step": 6211 + }, + { + "epoch": 0.6437972846927142, + "grad_norm": 0.5360734462738037, + "learning_rate": 1.1898194664443468e-05, + "loss": 0.1575, + "step": 6212 + }, + { + "epoch": 0.6439009223753757, + "grad_norm": 0.6586069464683533, + "learning_rate": 1.1892057104233637e-05, + "loss": 0.1877, + "step": 6213 + }, + { + "epoch": 0.6440045600580371, + "grad_norm": 0.6260287165641785, + "learning_rate": 1.188592045761421e-05, + "loss": 0.166, + "step": 6214 + }, + { + "epoch": 0.6441081977406985, + "grad_norm": 0.5675740838050842, + "learning_rate": 1.1879784725276646e-05, + "loss": 0.1914, + "step": 6215 + }, + { + "epoch": 0.6442118354233599, + "grad_norm": 0.6457982659339905, + "learning_rate": 1.1873649907912319e-05, + "loss": 0.2159, + "step": 6216 + }, + { + "epoch": 0.6443154731060213, + "grad_norm": 0.6307981610298157, + "learning_rate": 1.1867516006212494e-05, + "loss": 0.2285, + "step": 6217 + }, + { + "epoch": 0.6444191107886827, + "grad_norm": 0.5788914561271667, + "learning_rate": 1.1861383020868313e-05, + "loss": 0.1898, + "step": 6218 + }, + { + "epoch": 0.6445227484713442, + "grad_norm": 0.607032835483551, + "learning_rate": 1.1855250952570852e-05, + "loss": 0.1993, + "step": 6219 + }, + { + "epoch": 0.6446263861540056, + "grad_norm": 0.7031238079071045, + "learning_rate": 1.1849119802011047e-05, + "loss": 0.2415, + "step": 6220 + }, + { + "epoch": 0.644730023836667, + "grad_norm": 0.5608206987380981, + "learning_rate": 1.1842989569879748e-05, + "loss": 0.197, + "step": 6221 + }, + { + "epoch": 0.6448336615193284, + "grad_norm": 0.6588431000709534, + "learning_rate": 1.1836860256867712e-05, + "loss": 0.203, + "step": 6222 + }, + { + "epoch": 0.6449372992019898, + "grad_norm": 0.5253265500068665, + "learning_rate": 1.1830731863665567e-05, + "loss": 0.1723, + "step": 6223 + }, + { + "epoch": 0.6450409368846513, + "grad_norm": 0.6054903864860535, + "learning_rate": 1.1824604390963864e-05, + "loss": 0.1933, + "step": 6224 + }, + { + "epoch": 0.6451445745673127, + "grad_norm": 0.6375058889389038, + "learning_rate": 1.1818477839453015e-05, + "loss": 0.203, + "step": 6225 + }, + { + "epoch": 0.6452482122499741, + "grad_norm": 0.6265483498573303, + "learning_rate": 1.1812352209823374e-05, + "loss": 0.2128, + "step": 6226 + }, + { + "epoch": 0.6453518499326355, + "grad_norm": 0.583483874797821, + "learning_rate": 1.1806227502765162e-05, + "loss": 0.1917, + "step": 6227 + }, + { + "epoch": 0.6454554876152969, + "grad_norm": 0.6172074675559998, + "learning_rate": 1.1800103718968488e-05, + "loss": 0.2075, + "step": 6228 + }, + { + "epoch": 0.6455591252979583, + "grad_norm": 0.6154376864433289, + "learning_rate": 1.1793980859123387e-05, + "loss": 0.1963, + "step": 6229 + }, + { + "epoch": 0.6456627629806198, + "grad_norm": 0.6489928960800171, + "learning_rate": 1.1787858923919764e-05, + "loss": 0.1812, + "step": 6230 + }, + { + "epoch": 0.6457664006632812, + "grad_norm": 0.6327508091926575, + "learning_rate": 1.1781737914047425e-05, + "loss": 0.2166, + "step": 6231 + }, + { + "epoch": 0.6458700383459426, + "grad_norm": 0.6692586541175842, + "learning_rate": 1.1775617830196092e-05, + "loss": 0.2319, + "step": 6232 + }, + { + "epoch": 0.645973676028604, + "grad_norm": 0.5764483213424683, + "learning_rate": 1.1769498673055352e-05, + "loss": 0.1785, + "step": 6233 + }, + { + "epoch": 0.6460773137112654, + "grad_norm": 0.6731470227241516, + "learning_rate": 1.1763380443314714e-05, + "loss": 0.237, + "step": 6234 + }, + { + "epoch": 0.6461809513939268, + "grad_norm": 0.6515440940856934, + "learning_rate": 1.1757263141663552e-05, + "loss": 0.2232, + "step": 6235 + }, + { + "epoch": 0.6462845890765883, + "grad_norm": 0.5058755278587341, + "learning_rate": 1.1751146768791171e-05, + "loss": 0.151, + "step": 6236 + }, + { + "epoch": 0.6463882267592497, + "grad_norm": 0.6155193448066711, + "learning_rate": 1.1745031325386753e-05, + "loss": 0.1943, + "step": 6237 + }, + { + "epoch": 0.6464918644419111, + "grad_norm": 0.6133295893669128, + "learning_rate": 1.1738916812139367e-05, + "loss": 0.2002, + "step": 6238 + }, + { + "epoch": 0.6465955021245725, + "grad_norm": 0.7166955471038818, + "learning_rate": 1.173280322973799e-05, + "loss": 0.2125, + "step": 6239 + }, + { + "epoch": 0.6466991398072339, + "grad_norm": 0.6078068017959595, + "learning_rate": 1.1726690578871503e-05, + "loss": 0.2031, + "step": 6240 + }, + { + "epoch": 0.6468027774898953, + "grad_norm": 0.6322109699249268, + "learning_rate": 1.1720578860228656e-05, + "loss": 0.2118, + "step": 6241 + }, + { + "epoch": 0.6469064151725568, + "grad_norm": 0.6122873425483704, + "learning_rate": 1.1714468074498115e-05, + "loss": 0.2086, + "step": 6242 + }, + { + "epoch": 0.6470100528552182, + "grad_norm": 0.6217548251152039, + "learning_rate": 1.1708358222368424e-05, + "loss": 0.2045, + "step": 6243 + }, + { + "epoch": 0.6471136905378796, + "grad_norm": 0.6710754632949829, + "learning_rate": 1.1702249304528042e-05, + "loss": 0.2115, + "step": 6244 + }, + { + "epoch": 0.647217328220541, + "grad_norm": 0.6420841813087463, + "learning_rate": 1.1696141321665312e-05, + "loss": 0.2321, + "step": 6245 + }, + { + "epoch": 0.6473209659032024, + "grad_norm": 0.6284909248352051, + "learning_rate": 1.1690034274468465e-05, + "loss": 0.2028, + "step": 6246 + }, + { + "epoch": 0.6474246035858638, + "grad_norm": 0.6745210289955139, + "learning_rate": 1.1683928163625644e-05, + "loss": 0.2271, + "step": 6247 + }, + { + "epoch": 0.6475282412685253, + "grad_norm": 0.5823943018913269, + "learning_rate": 1.1677822989824867e-05, + "loss": 0.1933, + "step": 6248 + }, + { + "epoch": 0.6476318789511867, + "grad_norm": 0.5115127563476562, + "learning_rate": 1.1671718753754053e-05, + "loss": 0.1693, + "step": 6249 + }, + { + "epoch": 0.6477355166338481, + "grad_norm": 0.5595300197601318, + "learning_rate": 1.1665615456101031e-05, + "loss": 0.1855, + "step": 6250 + }, + { + "epoch": 0.6478391543165095, + "grad_norm": 0.6594432592391968, + "learning_rate": 1.1659513097553496e-05, + "loss": 0.2192, + "step": 6251 + }, + { + "epoch": 0.6479427919991709, + "grad_norm": 0.5352497696876526, + "learning_rate": 1.1653411678799067e-05, + "loss": 0.1705, + "step": 6252 + }, + { + "epoch": 0.6480464296818323, + "grad_norm": 0.6237494349479675, + "learning_rate": 1.1647311200525238e-05, + "loss": 0.215, + "step": 6253 + }, + { + "epoch": 0.6481500673644938, + "grad_norm": 0.5473231077194214, + "learning_rate": 1.1641211663419387e-05, + "loss": 0.1566, + "step": 6254 + }, + { + "epoch": 0.6482537050471552, + "grad_norm": 0.5947620868682861, + "learning_rate": 1.1635113068168819e-05, + "loss": 0.2016, + "step": 6255 + }, + { + "epoch": 0.6483573427298166, + "grad_norm": 0.6945819854736328, + "learning_rate": 1.16290154154607e-05, + "loss": 0.2172, + "step": 6256 + }, + { + "epoch": 0.648460980412478, + "grad_norm": 0.5550529360771179, + "learning_rate": 1.162291870598212e-05, + "loss": 0.199, + "step": 6257 + }, + { + "epoch": 0.6485646180951394, + "grad_norm": 0.5681254863739014, + "learning_rate": 1.1616822940420031e-05, + "loss": 0.1788, + "step": 6258 + }, + { + "epoch": 0.6486682557778009, + "grad_norm": 0.6118218898773193, + "learning_rate": 1.1610728119461307e-05, + "loss": 0.1859, + "step": 6259 + }, + { + "epoch": 0.6487718934604623, + "grad_norm": 0.6234940886497498, + "learning_rate": 1.1604634243792698e-05, + "loss": 0.1804, + "step": 6260 + }, + { + "epoch": 0.6488755311431237, + "grad_norm": 0.4990558624267578, + "learning_rate": 1.1598541314100843e-05, + "loss": 0.2122, + "step": 6261 + }, + { + "epoch": 0.6489791688257851, + "grad_norm": 0.6863871216773987, + "learning_rate": 1.1592449331072302e-05, + "loss": 0.2144, + "step": 6262 + }, + { + "epoch": 0.6490828065084465, + "grad_norm": 0.5716148614883423, + "learning_rate": 1.1586358295393492e-05, + "loss": 0.1828, + "step": 6263 + }, + { + "epoch": 0.6491864441911079, + "grad_norm": 0.6025523543357849, + "learning_rate": 1.1580268207750748e-05, + "loss": 0.2148, + "step": 6264 + }, + { + "epoch": 0.6492900818737694, + "grad_norm": 0.6130708456039429, + "learning_rate": 1.1574179068830305e-05, + "loss": 0.1948, + "step": 6265 + }, + { + "epoch": 0.6493937195564308, + "grad_norm": 0.5691248178482056, + "learning_rate": 1.1568090879318263e-05, + "loss": 0.1868, + "step": 6266 + }, + { + "epoch": 0.6494973572390922, + "grad_norm": 0.6407581567764282, + "learning_rate": 1.1562003639900637e-05, + "loss": 0.2172, + "step": 6267 + }, + { + "epoch": 0.6496009949217536, + "grad_norm": 0.6883420348167419, + "learning_rate": 1.1555917351263313e-05, + "loss": 0.2487, + "step": 6268 + }, + { + "epoch": 0.6497046326044149, + "grad_norm": 0.5731263160705566, + "learning_rate": 1.1549832014092096e-05, + "loss": 0.1926, + "step": 6269 + }, + { + "epoch": 0.6498082702870763, + "grad_norm": 0.5525518655776978, + "learning_rate": 1.1543747629072677e-05, + "loss": 0.1879, + "step": 6270 + }, + { + "epoch": 0.6499119079697377, + "grad_norm": 0.591450572013855, + "learning_rate": 1.1537664196890633e-05, + "loss": 0.2096, + "step": 6271 + }, + { + "epoch": 0.6500155456523992, + "grad_norm": 0.5698865056037903, + "learning_rate": 1.1531581718231431e-05, + "loss": 0.1797, + "step": 6272 + }, + { + "epoch": 0.6501191833350606, + "grad_norm": 0.4984050393104553, + "learning_rate": 1.1525500193780428e-05, + "loss": 0.1639, + "step": 6273 + }, + { + "epoch": 0.650222821017722, + "grad_norm": 0.6695106029510498, + "learning_rate": 1.151941962422289e-05, + "loss": 0.2214, + "step": 6274 + }, + { + "epoch": 0.6503264587003834, + "grad_norm": 0.5672677755355835, + "learning_rate": 1.1513340010243974e-05, + "loss": 0.1743, + "step": 6275 + }, + { + "epoch": 0.6504300963830448, + "grad_norm": 0.6967812180519104, + "learning_rate": 1.1507261352528705e-05, + "loss": 0.2428, + "step": 6276 + }, + { + "epoch": 0.6505337340657062, + "grad_norm": 0.6740899682044983, + "learning_rate": 1.1501183651762032e-05, + "loss": 0.2171, + "step": 6277 + }, + { + "epoch": 0.6506373717483677, + "grad_norm": 0.6313149333000183, + "learning_rate": 1.1495106908628772e-05, + "loss": 0.2304, + "step": 6278 + }, + { + "epoch": 0.6507410094310291, + "grad_norm": 0.625226616859436, + "learning_rate": 1.1489031123813636e-05, + "loss": 0.22, + "step": 6279 + }, + { + "epoch": 0.6508446471136905, + "grad_norm": 0.5863350629806519, + "learning_rate": 1.1482956298001256e-05, + "loss": 0.1794, + "step": 6280 + }, + { + "epoch": 0.6509482847963519, + "grad_norm": 0.5141674876213074, + "learning_rate": 1.1476882431876107e-05, + "loss": 0.211, + "step": 6281 + }, + { + "epoch": 0.6510519224790133, + "grad_norm": 0.595971941947937, + "learning_rate": 1.1470809526122606e-05, + "loss": 0.1766, + "step": 6282 + }, + { + "epoch": 0.6511555601616748, + "grad_norm": 0.5330041646957397, + "learning_rate": 1.1464737581425021e-05, + "loss": 0.1635, + "step": 6283 + }, + { + "epoch": 0.6512591978443362, + "grad_norm": 0.5961182713508606, + "learning_rate": 1.1458666598467542e-05, + "loss": 0.2467, + "step": 6284 + }, + { + "epoch": 0.6513628355269976, + "grad_norm": 0.6333070993423462, + "learning_rate": 1.1452596577934236e-05, + "loss": 0.2263, + "step": 6285 + }, + { + "epoch": 0.651466473209659, + "grad_norm": 0.5714941024780273, + "learning_rate": 1.1446527520509053e-05, + "loss": 0.1853, + "step": 6286 + }, + { + "epoch": 0.6515701108923204, + "grad_norm": 0.5562633872032166, + "learning_rate": 1.1440459426875858e-05, + "loss": 0.1808, + "step": 6287 + }, + { + "epoch": 0.6516737485749818, + "grad_norm": 0.5774336457252502, + "learning_rate": 1.1434392297718383e-05, + "loss": 0.1897, + "step": 6288 + }, + { + "epoch": 0.6517773862576433, + "grad_norm": 0.5281375646591187, + "learning_rate": 1.1428326133720264e-05, + "loss": 0.1773, + "step": 6289 + }, + { + "epoch": 0.6518810239403047, + "grad_norm": 0.7557573914527893, + "learning_rate": 1.1422260935565044e-05, + "loss": 0.2354, + "step": 6290 + }, + { + "epoch": 0.6519846616229661, + "grad_norm": 0.6803430318832397, + "learning_rate": 1.1416196703936126e-05, + "loss": 0.2208, + "step": 6291 + }, + { + "epoch": 0.6520882993056275, + "grad_norm": 0.6010298132896423, + "learning_rate": 1.1410133439516819e-05, + "loss": 0.214, + "step": 6292 + }, + { + "epoch": 0.6521919369882889, + "grad_norm": 0.5949862003326416, + "learning_rate": 1.1404071142990316e-05, + "loss": 0.21, + "step": 6293 + }, + { + "epoch": 0.6522955746709503, + "grad_norm": 0.5815996527671814, + "learning_rate": 1.1398009815039713e-05, + "loss": 0.1684, + "step": 6294 + }, + { + "epoch": 0.6523992123536118, + "grad_norm": 0.559094250202179, + "learning_rate": 1.1391949456348002e-05, + "loss": 0.1721, + "step": 6295 + }, + { + "epoch": 0.6525028500362732, + "grad_norm": 0.6896238923072815, + "learning_rate": 1.1385890067598035e-05, + "loss": 0.2419, + "step": 6296 + }, + { + "epoch": 0.6526064877189346, + "grad_norm": 0.6087605357170105, + "learning_rate": 1.1379831649472593e-05, + "loss": 0.203, + "step": 6297 + }, + { + "epoch": 0.652710125401596, + "grad_norm": 0.5834180116653442, + "learning_rate": 1.1373774202654318e-05, + "loss": 0.2032, + "step": 6298 + }, + { + "epoch": 0.6528137630842574, + "grad_norm": 0.5780649781227112, + "learning_rate": 1.136771772782575e-05, + "loss": 0.1786, + "step": 6299 + }, + { + "epoch": 0.6529174007669188, + "grad_norm": 0.6673455238342285, + "learning_rate": 1.1361662225669336e-05, + "loss": 0.2009, + "step": 6300 + }, + { + "epoch": 0.6530210384495803, + "grad_norm": 0.6683764457702637, + "learning_rate": 1.1355607696867388e-05, + "loss": 0.2237, + "step": 6301 + }, + { + "epoch": 0.6531246761322417, + "grad_norm": 0.6092632412910461, + "learning_rate": 1.1349554142102135e-05, + "loss": 0.2161, + "step": 6302 + }, + { + "epoch": 0.6532283138149031, + "grad_norm": 0.6827420592308044, + "learning_rate": 1.1343501562055672e-05, + "loss": 0.2094, + "step": 6303 + }, + { + "epoch": 0.6533319514975645, + "grad_norm": 0.7059791088104248, + "learning_rate": 1.133744995740999e-05, + "loss": 0.2332, + "step": 6304 + }, + { + "epoch": 0.6534355891802259, + "grad_norm": 0.6687110662460327, + "learning_rate": 1.1331399328846988e-05, + "loss": 0.2179, + "step": 6305 + }, + { + "epoch": 0.6535392268628873, + "grad_norm": 0.5980477333068848, + "learning_rate": 1.1325349677048423e-05, + "loss": 0.1849, + "step": 6306 + }, + { + "epoch": 0.6536428645455488, + "grad_norm": 0.5202463269233704, + "learning_rate": 1.1319301002695983e-05, + "loss": 0.1744, + "step": 6307 + }, + { + "epoch": 0.6537465022282102, + "grad_norm": 0.611396849155426, + "learning_rate": 1.1313253306471203e-05, + "loss": 0.2171, + "step": 6308 + }, + { + "epoch": 0.6538501399108716, + "grad_norm": 0.5297112464904785, + "learning_rate": 1.1307206589055543e-05, + "loss": 0.185, + "step": 6309 + }, + { + "epoch": 0.653953777593533, + "grad_norm": 0.5845366716384888, + "learning_rate": 1.1301160851130332e-05, + "loss": 0.2043, + "step": 6310 + }, + { + "epoch": 0.6540574152761944, + "grad_norm": 0.6273592114448547, + "learning_rate": 1.1295116093376789e-05, + "loss": 0.1966, + "step": 6311 + }, + { + "epoch": 0.6541610529588558, + "grad_norm": 0.5722415447235107, + "learning_rate": 1.1289072316476038e-05, + "loss": 0.1606, + "step": 6312 + }, + { + "epoch": 0.6542646906415173, + "grad_norm": 0.6014687418937683, + "learning_rate": 1.1283029521109068e-05, + "loss": 0.2056, + "step": 6313 + }, + { + "epoch": 0.6543683283241787, + "grad_norm": 0.5814545750617981, + "learning_rate": 1.1276987707956781e-05, + "loss": 0.1998, + "step": 6314 + }, + { + "epoch": 0.6544719660068401, + "grad_norm": 0.5836402773857117, + "learning_rate": 1.1270946877699966e-05, + "loss": 0.2036, + "step": 6315 + }, + { + "epoch": 0.6545756036895015, + "grad_norm": 0.4817843735218048, + "learning_rate": 1.126490703101929e-05, + "loss": 0.1646, + "step": 6316 + }, + { + "epoch": 0.6546792413721629, + "grad_norm": 0.6447229385375977, + "learning_rate": 1.1258868168595309e-05, + "loss": 0.2061, + "step": 6317 + }, + { + "epoch": 0.6547828790548244, + "grad_norm": 0.626266360282898, + "learning_rate": 1.1252830291108467e-05, + "loss": 0.1843, + "step": 6318 + }, + { + "epoch": 0.6548865167374858, + "grad_norm": 0.6417117714881897, + "learning_rate": 1.1246793399239108e-05, + "loss": 0.2096, + "step": 6319 + }, + { + "epoch": 0.6549901544201472, + "grad_norm": 0.6152032017707825, + "learning_rate": 1.1240757493667473e-05, + "loss": 0.1958, + "step": 6320 + }, + { + "epoch": 0.6550937921028086, + "grad_norm": 0.5442467927932739, + "learning_rate": 1.1234722575073658e-05, + "loss": 0.1809, + "step": 6321 + }, + { + "epoch": 0.65519742978547, + "grad_norm": 0.5608288645744324, + "learning_rate": 1.1228688644137686e-05, + "loss": 0.1761, + "step": 6322 + }, + { + "epoch": 0.6553010674681314, + "grad_norm": 0.6397298574447632, + "learning_rate": 1.1222655701539442e-05, + "loss": 0.225, + "step": 6323 + }, + { + "epoch": 0.6554047051507929, + "grad_norm": 0.5038039684295654, + "learning_rate": 1.12166237479587e-05, + "loss": 0.1614, + "step": 6324 + }, + { + "epoch": 0.6555083428334543, + "grad_norm": 0.7007007598876953, + "learning_rate": 1.1210592784075147e-05, + "loss": 0.2412, + "step": 6325 + }, + { + "epoch": 0.6556119805161157, + "grad_norm": 0.6230477094650269, + "learning_rate": 1.1204562810568328e-05, + "loss": 0.2036, + "step": 6326 + }, + { + "epoch": 0.6557156181987771, + "grad_norm": 0.5754637718200684, + "learning_rate": 1.1198533828117709e-05, + "loss": 0.1917, + "step": 6327 + }, + { + "epoch": 0.6558192558814385, + "grad_norm": 0.6380010843276978, + "learning_rate": 1.1192505837402608e-05, + "loss": 0.2174, + "step": 6328 + }, + { + "epoch": 0.6559228935640999, + "grad_norm": 0.6515641808509827, + "learning_rate": 1.1186478839102264e-05, + "loss": 0.1706, + "step": 6329 + }, + { + "epoch": 0.6560265312467614, + "grad_norm": 0.7134867310523987, + "learning_rate": 1.1180452833895786e-05, + "loss": 0.2298, + "step": 6330 + }, + { + "epoch": 0.6561301689294228, + "grad_norm": 0.5794904828071594, + "learning_rate": 1.117442782246216e-05, + "loss": 0.1696, + "step": 6331 + }, + { + "epoch": 0.6562338066120842, + "grad_norm": 0.5649944543838501, + "learning_rate": 1.1168403805480299e-05, + "loss": 0.1748, + "step": 6332 + }, + { + "epoch": 0.6563374442947456, + "grad_norm": 0.6089136600494385, + "learning_rate": 1.1162380783628959e-05, + "loss": 0.1737, + "step": 6333 + }, + { + "epoch": 0.656441081977407, + "grad_norm": 0.6334869861602783, + "learning_rate": 1.1156358757586823e-05, + "loss": 0.2071, + "step": 6334 + }, + { + "epoch": 0.6565447196600684, + "grad_norm": 0.5529614090919495, + "learning_rate": 1.1150337728032431e-05, + "loss": 0.1934, + "step": 6335 + }, + { + "epoch": 0.6566483573427299, + "grad_norm": 0.6286069750785828, + "learning_rate": 1.1144317695644222e-05, + "loss": 0.2047, + "step": 6336 + }, + { + "epoch": 0.6567519950253913, + "grad_norm": 0.605652391910553, + "learning_rate": 1.1138298661100536e-05, + "loss": 0.2188, + "step": 6337 + }, + { + "epoch": 0.6568556327080527, + "grad_norm": 0.6238192915916443, + "learning_rate": 1.1132280625079572e-05, + "loss": 0.2033, + "step": 6338 + }, + { + "epoch": 0.6569592703907141, + "grad_norm": 0.4853861629962921, + "learning_rate": 1.1126263588259443e-05, + "loss": 0.1712, + "step": 6339 + }, + { + "epoch": 0.6570629080733755, + "grad_norm": 0.6667852401733398, + "learning_rate": 1.1120247551318149e-05, + "loss": 0.2145, + "step": 6340 + }, + { + "epoch": 0.657166545756037, + "grad_norm": 0.6394658088684082, + "learning_rate": 1.1114232514933553e-05, + "loss": 0.2086, + "step": 6341 + }, + { + "epoch": 0.6572701834386984, + "grad_norm": 0.6247240900993347, + "learning_rate": 1.1108218479783423e-05, + "loss": 0.2268, + "step": 6342 + }, + { + "epoch": 0.6573738211213598, + "grad_norm": 0.5785343050956726, + "learning_rate": 1.110220544654541e-05, + "loss": 0.17, + "step": 6343 + }, + { + "epoch": 0.6574774588040212, + "grad_norm": 0.5689795017242432, + "learning_rate": 1.109619341589705e-05, + "loss": 0.1695, + "step": 6344 + }, + { + "epoch": 0.6575810964866825, + "grad_norm": 0.5147643089294434, + "learning_rate": 1.1090182388515785e-05, + "loss": 0.1636, + "step": 6345 + }, + { + "epoch": 0.6576847341693439, + "grad_norm": 0.6086384654045105, + "learning_rate": 1.108417236507891e-05, + "loss": 0.2108, + "step": 6346 + }, + { + "epoch": 0.6577883718520053, + "grad_norm": 0.6293398141860962, + "learning_rate": 1.1078163346263642e-05, + "loss": 0.201, + "step": 6347 + }, + { + "epoch": 0.6578920095346668, + "grad_norm": 0.642492949962616, + "learning_rate": 1.1072155332747055e-05, + "loss": 0.2068, + "step": 6348 + }, + { + "epoch": 0.6579956472173282, + "grad_norm": 0.5426843166351318, + "learning_rate": 1.106614832520612e-05, + "loss": 0.1698, + "step": 6349 + }, + { + "epoch": 0.6580992848999896, + "grad_norm": 0.5720564723014832, + "learning_rate": 1.1060142324317714e-05, + "loss": 0.1949, + "step": 6350 + }, + { + "epoch": 0.658202922582651, + "grad_norm": 0.5899260640144348, + "learning_rate": 1.1054137330758566e-05, + "loss": 0.1784, + "step": 6351 + }, + { + "epoch": 0.6583065602653124, + "grad_norm": 0.5442241430282593, + "learning_rate": 1.1048133345205324e-05, + "loss": 0.2007, + "step": 6352 + }, + { + "epoch": 0.6584101979479738, + "grad_norm": 0.6020263433456421, + "learning_rate": 1.1042130368334489e-05, + "loss": 0.199, + "step": 6353 + }, + { + "epoch": 0.6585138356306353, + "grad_norm": 0.620783805847168, + "learning_rate": 1.1036128400822492e-05, + "loss": 0.1957, + "step": 6354 + }, + { + "epoch": 0.6586174733132967, + "grad_norm": 0.5608299970626831, + "learning_rate": 1.1030127443345608e-05, + "loss": 0.1779, + "step": 6355 + }, + { + "epoch": 0.6587211109959581, + "grad_norm": 0.6411195993423462, + "learning_rate": 1.1024127496580015e-05, + "loss": 0.2047, + "step": 6356 + }, + { + "epoch": 0.6588247486786195, + "grad_norm": 0.6850147247314453, + "learning_rate": 1.1018128561201788e-05, + "loss": 0.2488, + "step": 6357 + }, + { + "epoch": 0.6589283863612809, + "grad_norm": 0.6264161467552185, + "learning_rate": 1.1012130637886866e-05, + "loss": 0.1941, + "step": 6358 + }, + { + "epoch": 0.6590320240439423, + "grad_norm": 0.5913993716239929, + "learning_rate": 1.1006133727311093e-05, + "loss": 0.1888, + "step": 6359 + }, + { + "epoch": 0.6591356617266038, + "grad_norm": 0.603714108467102, + "learning_rate": 1.1000137830150194e-05, + "loss": 0.1881, + "step": 6360 + }, + { + "epoch": 0.6592392994092652, + "grad_norm": 0.5576220154762268, + "learning_rate": 1.0994142947079775e-05, + "loss": 0.1861, + "step": 6361 + }, + { + "epoch": 0.6593429370919266, + "grad_norm": 0.7011269330978394, + "learning_rate": 1.0988149078775333e-05, + "loss": 0.2525, + "step": 6362 + }, + { + "epoch": 0.659446574774588, + "grad_norm": 0.670717179775238, + "learning_rate": 1.0982156225912233e-05, + "loss": 0.1932, + "step": 6363 + }, + { + "epoch": 0.6595502124572494, + "grad_norm": 0.6681691408157349, + "learning_rate": 1.0976164389165749e-05, + "loss": 0.2259, + "step": 6364 + }, + { + "epoch": 0.6596538501399108, + "grad_norm": 0.5890450477600098, + "learning_rate": 1.0970173569211045e-05, + "loss": 0.1853, + "step": 6365 + }, + { + "epoch": 0.6597574878225723, + "grad_norm": 0.6007232666015625, + "learning_rate": 1.0964183766723142e-05, + "loss": 0.2006, + "step": 6366 + }, + { + "epoch": 0.6598611255052337, + "grad_norm": 0.5991565585136414, + "learning_rate": 1.095819498237697e-05, + "loss": 0.1848, + "step": 6367 + }, + { + "epoch": 0.6599647631878951, + "grad_norm": 0.5873733758926392, + "learning_rate": 1.0952207216847322e-05, + "loss": 0.194, + "step": 6368 + }, + { + "epoch": 0.6600684008705565, + "grad_norm": 0.6535905599594116, + "learning_rate": 1.09462204708089e-05, + "loss": 0.2136, + "step": 6369 + }, + { + "epoch": 0.6601720385532179, + "grad_norm": 0.585079550743103, + "learning_rate": 1.0940234744936289e-05, + "loss": 0.1893, + "step": 6370 + }, + { + "epoch": 0.6602756762358793, + "grad_norm": 0.6181477308273315, + "learning_rate": 1.0934250039903933e-05, + "loss": 0.1949, + "step": 6371 + }, + { + "epoch": 0.6603793139185408, + "grad_norm": 0.5763558745384216, + "learning_rate": 1.0928266356386199e-05, + "loss": 0.1837, + "step": 6372 + }, + { + "epoch": 0.6604829516012022, + "grad_norm": 0.49055740237236023, + "learning_rate": 1.092228369505731e-05, + "loss": 0.145, + "step": 6373 + }, + { + "epoch": 0.6605865892838636, + "grad_norm": 0.6213757991790771, + "learning_rate": 1.0916302056591377e-05, + "loss": 0.2113, + "step": 6374 + }, + { + "epoch": 0.660690226966525, + "grad_norm": 0.6518123149871826, + "learning_rate": 1.091032144166241e-05, + "loss": 0.2039, + "step": 6375 + }, + { + "epoch": 0.6607938646491864, + "grad_norm": 0.5271971821784973, + "learning_rate": 1.0904341850944288e-05, + "loss": 0.1671, + "step": 6376 + }, + { + "epoch": 0.6608975023318479, + "grad_norm": 0.6460587382316589, + "learning_rate": 1.0898363285110796e-05, + "loss": 0.2075, + "step": 6377 + }, + { + "epoch": 0.6610011400145093, + "grad_norm": 0.5846865773200989, + "learning_rate": 1.0892385744835573e-05, + "loss": 0.2127, + "step": 6378 + }, + { + "epoch": 0.6611047776971707, + "grad_norm": 0.5889509916305542, + "learning_rate": 1.088640923079217e-05, + "loss": 0.1931, + "step": 6379 + }, + { + "epoch": 0.6612084153798321, + "grad_norm": 0.6355632543563843, + "learning_rate": 1.0880433743654008e-05, + "loss": 0.1969, + "step": 6380 + }, + { + "epoch": 0.6613120530624935, + "grad_norm": 0.6751269102096558, + "learning_rate": 1.087445928409439e-05, + "loss": 0.1973, + "step": 6381 + }, + { + "epoch": 0.6614156907451549, + "grad_norm": 0.6042639017105103, + "learning_rate": 1.0868485852786522e-05, + "loss": 0.1817, + "step": 6382 + }, + { + "epoch": 0.6615193284278164, + "grad_norm": 0.5439656972885132, + "learning_rate": 1.0862513450403463e-05, + "loss": 0.1766, + "step": 6383 + }, + { + "epoch": 0.6616229661104778, + "grad_norm": 0.554614245891571, + "learning_rate": 1.0856542077618184e-05, + "loss": 0.1997, + "step": 6384 + }, + { + "epoch": 0.6617266037931392, + "grad_norm": 0.5925115942955017, + "learning_rate": 1.0850571735103537e-05, + "loss": 0.1999, + "step": 6385 + }, + { + "epoch": 0.6618302414758006, + "grad_norm": 0.643631100654602, + "learning_rate": 1.0844602423532247e-05, + "loss": 0.2104, + "step": 6386 + }, + { + "epoch": 0.661933879158462, + "grad_norm": 0.6977851986885071, + "learning_rate": 1.0838634143576922e-05, + "loss": 0.2173, + "step": 6387 + }, + { + "epoch": 0.6620375168411234, + "grad_norm": 0.6471225023269653, + "learning_rate": 1.0832666895910051e-05, + "loss": 0.2138, + "step": 6388 + }, + { + "epoch": 0.6621411545237849, + "grad_norm": 0.6244288682937622, + "learning_rate": 1.0826700681204026e-05, + "loss": 0.1965, + "step": 6389 + }, + { + "epoch": 0.6622447922064463, + "grad_norm": 0.533596396446228, + "learning_rate": 1.0820735500131112e-05, + "loss": 0.1782, + "step": 6390 + }, + { + "epoch": 0.6623484298891077, + "grad_norm": 0.6266582012176514, + "learning_rate": 1.0814771353363447e-05, + "loss": 0.2079, + "step": 6391 + }, + { + "epoch": 0.6624520675717691, + "grad_norm": 0.6816765666007996, + "learning_rate": 1.0808808241573082e-05, + "loss": 0.2015, + "step": 6392 + }, + { + "epoch": 0.6625557052544305, + "grad_norm": 0.668770968914032, + "learning_rate": 1.0802846165431901e-05, + "loss": 0.2086, + "step": 6393 + }, + { + "epoch": 0.6626593429370919, + "grad_norm": 0.5263325572013855, + "learning_rate": 1.0796885125611719e-05, + "loss": 0.1932, + "step": 6394 + }, + { + "epoch": 0.6627629806197534, + "grad_norm": 0.6744509339332581, + "learning_rate": 1.079092512278422e-05, + "loss": 0.1999, + "step": 6395 + }, + { + "epoch": 0.6628666183024148, + "grad_norm": 0.6415967345237732, + "learning_rate": 1.0784966157620956e-05, + "loss": 0.204, + "step": 6396 + }, + { + "epoch": 0.6629702559850762, + "grad_norm": 0.6224689483642578, + "learning_rate": 1.0779008230793386e-05, + "loss": 0.2143, + "step": 6397 + }, + { + "epoch": 0.6630738936677376, + "grad_norm": 0.6195698380470276, + "learning_rate": 1.0773051342972835e-05, + "loss": 0.2197, + "step": 6398 + }, + { + "epoch": 0.663177531350399, + "grad_norm": 0.6115604043006897, + "learning_rate": 1.0767095494830509e-05, + "loss": 0.1848, + "step": 6399 + }, + { + "epoch": 0.6632811690330604, + "grad_norm": 0.5525495409965515, + "learning_rate": 1.0761140687037516e-05, + "loss": 0.1694, + "step": 6400 + }, + { + "epoch": 0.6633848067157219, + "grad_norm": 0.6176031827926636, + "learning_rate": 1.0755186920264819e-05, + "loss": 0.22, + "step": 6401 + }, + { + "epoch": 0.6634884443983833, + "grad_norm": 0.5892438292503357, + "learning_rate": 1.07492341951833e-05, + "loss": 0.1728, + "step": 6402 + }, + { + "epoch": 0.6635920820810447, + "grad_norm": 0.6732398867607117, + "learning_rate": 1.0743282512463681e-05, + "loss": 0.2228, + "step": 6403 + }, + { + "epoch": 0.6636957197637061, + "grad_norm": 0.618508517742157, + "learning_rate": 1.0737331872776606e-05, + "loss": 0.1928, + "step": 6404 + }, + { + "epoch": 0.6637993574463675, + "grad_norm": 0.6269073486328125, + "learning_rate": 1.0731382276792579e-05, + "loss": 0.227, + "step": 6405 + }, + { + "epoch": 0.663902995129029, + "grad_norm": 0.6141325235366821, + "learning_rate": 1.0725433725181977e-05, + "loss": 0.1951, + "step": 6406 + }, + { + "epoch": 0.6640066328116904, + "grad_norm": 0.6289651989936829, + "learning_rate": 1.0719486218615099e-05, + "loss": 0.1731, + "step": 6407 + }, + { + "epoch": 0.6641102704943518, + "grad_norm": 0.57537442445755, + "learning_rate": 1.0713539757762073e-05, + "loss": 0.181, + "step": 6408 + }, + { + "epoch": 0.6642139081770132, + "grad_norm": 0.5934721827507019, + "learning_rate": 1.0707594343292956e-05, + "loss": 0.1923, + "step": 6409 + }, + { + "epoch": 0.6643175458596746, + "grad_norm": 0.5678470730781555, + "learning_rate": 1.0701649975877668e-05, + "loss": 0.1672, + "step": 6410 + }, + { + "epoch": 0.664421183542336, + "grad_norm": 0.5607133507728577, + "learning_rate": 1.0695706656186005e-05, + "loss": 0.1695, + "step": 6411 + }, + { + "epoch": 0.6645248212249975, + "grad_norm": 0.47020161151885986, + "learning_rate": 1.0689764384887655e-05, + "loss": 0.1408, + "step": 6412 + }, + { + "epoch": 0.6646284589076589, + "grad_norm": 0.6394020318984985, + "learning_rate": 1.0683823162652171e-05, + "loss": 0.2396, + "step": 6413 + }, + { + "epoch": 0.6647320965903203, + "grad_norm": 0.5689187049865723, + "learning_rate": 1.067788299014901e-05, + "loss": 0.1671, + "step": 6414 + }, + { + "epoch": 0.6648357342729817, + "grad_norm": 0.5286722779273987, + "learning_rate": 1.0671943868047514e-05, + "loss": 0.1927, + "step": 6415 + }, + { + "epoch": 0.6649393719556431, + "grad_norm": 0.7043396830558777, + "learning_rate": 1.0666005797016874e-05, + "loss": 0.2081, + "step": 6416 + }, + { + "epoch": 0.6650430096383045, + "grad_norm": 0.5633207559585571, + "learning_rate": 1.0660068777726196e-05, + "loss": 0.174, + "step": 6417 + }, + { + "epoch": 0.665146647320966, + "grad_norm": 0.5071108937263489, + "learning_rate": 1.0654132810844452e-05, + "loss": 0.1632, + "step": 6418 + }, + { + "epoch": 0.6652502850036274, + "grad_norm": 0.626396119594574, + "learning_rate": 1.0648197897040483e-05, + "loss": 0.2045, + "step": 6419 + }, + { + "epoch": 0.6653539226862888, + "grad_norm": 0.6252375245094299, + "learning_rate": 1.0642264036983045e-05, + "loss": 0.1788, + "step": 6420 + }, + { + "epoch": 0.6654575603689501, + "grad_norm": 0.6734023094177246, + "learning_rate": 1.0636331231340744e-05, + "loss": 0.2031, + "step": 6421 + }, + { + "epoch": 0.6655611980516115, + "grad_norm": 0.565576434135437, + "learning_rate": 1.0630399480782087e-05, + "loss": 0.1719, + "step": 6422 + }, + { + "epoch": 0.6656648357342729, + "grad_norm": 0.5246537327766418, + "learning_rate": 1.0624468785975448e-05, + "loss": 0.1728, + "step": 6423 + }, + { + "epoch": 0.6657684734169343, + "grad_norm": 0.5331704616546631, + "learning_rate": 1.0618539147589098e-05, + "loss": 0.1855, + "step": 6424 + }, + { + "epoch": 0.6658721110995958, + "grad_norm": 0.6282480955123901, + "learning_rate": 1.0612610566291171e-05, + "loss": 0.1903, + "step": 6425 + }, + { + "epoch": 0.6659757487822572, + "grad_norm": 0.5429293513298035, + "learning_rate": 1.0606683042749687e-05, + "loss": 0.1755, + "step": 6426 + }, + { + "epoch": 0.6660793864649186, + "grad_norm": 0.6085336208343506, + "learning_rate": 1.0600756577632563e-05, + "loss": 0.1785, + "step": 6427 + }, + { + "epoch": 0.66618302414758, + "grad_norm": 0.6111792922019958, + "learning_rate": 1.059483117160757e-05, + "loss": 0.2024, + "step": 6428 + }, + { + "epoch": 0.6662866618302414, + "grad_norm": 0.590161919593811, + "learning_rate": 1.0588906825342384e-05, + "loss": 0.192, + "step": 6429 + }, + { + "epoch": 0.6663902995129029, + "grad_norm": 0.6143799424171448, + "learning_rate": 1.058298353950455e-05, + "loss": 0.1819, + "step": 6430 + }, + { + "epoch": 0.6664939371955643, + "grad_norm": 0.644903302192688, + "learning_rate": 1.0577061314761483e-05, + "loss": 0.186, + "step": 6431 + }, + { + "epoch": 0.6665975748782257, + "grad_norm": 0.5767917037010193, + "learning_rate": 1.0571140151780498e-05, + "loss": 0.1574, + "step": 6432 + }, + { + "epoch": 0.6667012125608871, + "grad_norm": 0.5911566615104675, + "learning_rate": 1.0565220051228793e-05, + "loss": 0.2091, + "step": 6433 + }, + { + "epoch": 0.6668048502435485, + "grad_norm": 0.5595060586929321, + "learning_rate": 1.0559301013773418e-05, + "loss": 0.1742, + "step": 6434 + }, + { + "epoch": 0.6669084879262099, + "grad_norm": 0.6926925778388977, + "learning_rate": 1.0553383040081333e-05, + "loss": 0.2148, + "step": 6435 + }, + { + "epoch": 0.6670121256088714, + "grad_norm": 0.607693076133728, + "learning_rate": 1.0547466130819365e-05, + "loss": 0.2077, + "step": 6436 + }, + { + "epoch": 0.6671157632915328, + "grad_norm": 0.6666292548179626, + "learning_rate": 1.0541550286654212e-05, + "loss": 0.2025, + "step": 6437 + }, + { + "epoch": 0.6672194009741942, + "grad_norm": 0.6411061882972717, + "learning_rate": 1.0535635508252478e-05, + "loss": 0.2155, + "step": 6438 + }, + { + "epoch": 0.6673230386568556, + "grad_norm": 0.5984596610069275, + "learning_rate": 1.052972179628061e-05, + "loss": 0.1794, + "step": 6439 + }, + { + "epoch": 0.667426676339517, + "grad_norm": 0.5402892231941223, + "learning_rate": 1.052380915140498e-05, + "loss": 0.1639, + "step": 6440 + }, + { + "epoch": 0.6675303140221784, + "grad_norm": 0.6519803404808044, + "learning_rate": 1.0517897574291794e-05, + "loss": 0.2258, + "step": 6441 + }, + { + "epoch": 0.6676339517048399, + "grad_norm": 0.6968314051628113, + "learning_rate": 1.0511987065607179e-05, + "loss": 0.2165, + "step": 6442 + }, + { + "epoch": 0.6677375893875013, + "grad_norm": 0.6417881846427917, + "learning_rate": 1.0506077626017111e-05, + "loss": 0.2114, + "step": 6443 + }, + { + "epoch": 0.6678412270701627, + "grad_norm": 0.660434365272522, + "learning_rate": 1.0500169256187448e-05, + "loss": 0.22, + "step": 6444 + }, + { + "epoch": 0.6679448647528241, + "grad_norm": 0.6588543653488159, + "learning_rate": 1.0494261956783954e-05, + "loss": 0.2094, + "step": 6445 + }, + { + "epoch": 0.6680485024354855, + "grad_norm": 0.5505183339118958, + "learning_rate": 1.048835572847224e-05, + "loss": 0.1897, + "step": 6446 + }, + { + "epoch": 0.6681521401181469, + "grad_norm": 0.6822441220283508, + "learning_rate": 1.0482450571917813e-05, + "loss": 0.2243, + "step": 6447 + }, + { + "epoch": 0.6682557778008084, + "grad_norm": 0.5411374568939209, + "learning_rate": 1.047654648778607e-05, + "loss": 0.1799, + "step": 6448 + }, + { + "epoch": 0.6683594154834698, + "grad_norm": 0.6397514343261719, + "learning_rate": 1.0470643476742266e-05, + "loss": 0.2116, + "step": 6449 + }, + { + "epoch": 0.6684630531661312, + "grad_norm": 0.6044232845306396, + "learning_rate": 1.0464741539451539e-05, + "loss": 0.185, + "step": 6450 + }, + { + "epoch": 0.6685666908487926, + "grad_norm": 0.6306686401367188, + "learning_rate": 1.0458840676578905e-05, + "loss": 0.2067, + "step": 6451 + }, + { + "epoch": 0.668670328531454, + "grad_norm": 0.6507785320281982, + "learning_rate": 1.0452940888789272e-05, + "loss": 0.2045, + "step": 6452 + }, + { + "epoch": 0.6687739662141154, + "grad_norm": 0.7334423661231995, + "learning_rate": 1.0447042176747426e-05, + "loss": 0.2334, + "step": 6453 + }, + { + "epoch": 0.6688776038967769, + "grad_norm": 0.517721951007843, + "learning_rate": 1.0441144541118007e-05, + "loss": 0.1535, + "step": 6454 + }, + { + "epoch": 0.6689812415794383, + "grad_norm": 0.7203530669212341, + "learning_rate": 1.043524798256558e-05, + "loss": 0.2481, + "step": 6455 + }, + { + "epoch": 0.6690848792620997, + "grad_norm": 0.6525432467460632, + "learning_rate": 1.0429352501754526e-05, + "loss": 0.1722, + "step": 6456 + }, + { + "epoch": 0.6691885169447611, + "grad_norm": 0.6704126596450806, + "learning_rate": 1.0423458099349153e-05, + "loss": 0.22, + "step": 6457 + }, + { + "epoch": 0.6692921546274225, + "grad_norm": 0.6624952554702759, + "learning_rate": 1.0417564776013644e-05, + "loss": 0.2111, + "step": 6458 + }, + { + "epoch": 0.669395792310084, + "grad_norm": 0.7517199516296387, + "learning_rate": 1.0411672532412031e-05, + "loss": 0.2311, + "step": 6459 + }, + { + "epoch": 0.6694994299927454, + "grad_norm": 0.5905138254165649, + "learning_rate": 1.040578136920826e-05, + "loss": 0.1839, + "step": 6460 + }, + { + "epoch": 0.6696030676754068, + "grad_norm": 0.6565616726875305, + "learning_rate": 1.0399891287066129e-05, + "loss": 0.2347, + "step": 6461 + }, + { + "epoch": 0.6697067053580682, + "grad_norm": 0.6401223540306091, + "learning_rate": 1.0394002286649317e-05, + "loss": 0.2026, + "step": 6462 + }, + { + "epoch": 0.6698103430407296, + "grad_norm": 0.6125038266181946, + "learning_rate": 1.0388114368621403e-05, + "loss": 0.1979, + "step": 6463 + }, + { + "epoch": 0.669913980723391, + "grad_norm": 0.6319140195846558, + "learning_rate": 1.038222753364581e-05, + "loss": 0.2092, + "step": 6464 + }, + { + "epoch": 0.6700176184060525, + "grad_norm": 0.5588834881782532, + "learning_rate": 1.0376341782385876e-05, + "loss": 0.1929, + "step": 6465 + }, + { + "epoch": 0.6701212560887139, + "grad_norm": 0.5903143286705017, + "learning_rate": 1.0370457115504781e-05, + "loss": 0.202, + "step": 6466 + }, + { + "epoch": 0.6702248937713753, + "grad_norm": 0.5594356060028076, + "learning_rate": 1.0364573533665619e-05, + "loss": 0.1906, + "step": 6467 + }, + { + "epoch": 0.6703285314540367, + "grad_norm": 0.6221143007278442, + "learning_rate": 1.0358691037531332e-05, + "loss": 0.1818, + "step": 6468 + }, + { + "epoch": 0.6704321691366981, + "grad_norm": 0.5643758773803711, + "learning_rate": 1.0352809627764743e-05, + "loss": 0.1787, + "step": 6469 + }, + { + "epoch": 0.6705358068193595, + "grad_norm": 0.5540712475776672, + "learning_rate": 1.0346929305028577e-05, + "loss": 0.1868, + "step": 6470 + }, + { + "epoch": 0.670639444502021, + "grad_norm": 0.6116195917129517, + "learning_rate": 1.0341050069985401e-05, + "loss": 0.1716, + "step": 6471 + }, + { + "epoch": 0.6707430821846824, + "grad_norm": 0.594440758228302, + "learning_rate": 1.033517192329769e-05, + "loss": 0.1982, + "step": 6472 + }, + { + "epoch": 0.6708467198673438, + "grad_norm": 0.6060428023338318, + "learning_rate": 1.0329294865627788e-05, + "loss": 0.201, + "step": 6473 + }, + { + "epoch": 0.6709503575500052, + "grad_norm": 0.6356008648872375, + "learning_rate": 1.0323418897637909e-05, + "loss": 0.2222, + "step": 6474 + }, + { + "epoch": 0.6710539952326666, + "grad_norm": 0.6097822189331055, + "learning_rate": 1.0317544019990145e-05, + "loss": 0.2058, + "step": 6475 + }, + { + "epoch": 0.671157632915328, + "grad_norm": 0.6528740525245667, + "learning_rate": 1.0311670233346462e-05, + "loss": 0.1942, + "step": 6476 + }, + { + "epoch": 0.6712612705979895, + "grad_norm": 0.7413235902786255, + "learning_rate": 1.0305797538368716e-05, + "loss": 0.1769, + "step": 6477 + }, + { + "epoch": 0.6713649082806509, + "grad_norm": 0.5531677007675171, + "learning_rate": 1.0299925935718641e-05, + "loss": 0.1661, + "step": 6478 + }, + { + "epoch": 0.6714685459633123, + "grad_norm": 0.5698585510253906, + "learning_rate": 1.0294055426057827e-05, + "loss": 0.1855, + "step": 6479 + }, + { + "epoch": 0.6715721836459737, + "grad_norm": 0.6898641586303711, + "learning_rate": 1.0288186010047765e-05, + "loss": 0.226, + "step": 6480 + }, + { + "epoch": 0.6716758213286351, + "grad_norm": 0.6154083013534546, + "learning_rate": 1.0282317688349805e-05, + "loss": 0.1725, + "step": 6481 + }, + { + "epoch": 0.6717794590112965, + "grad_norm": 0.5920049548149109, + "learning_rate": 1.0276450461625176e-05, + "loss": 0.1629, + "step": 6482 + }, + { + "epoch": 0.671883096693958, + "grad_norm": 0.5286540985107422, + "learning_rate": 1.0270584330535001e-05, + "loss": 0.1745, + "step": 6483 + }, + { + "epoch": 0.6719867343766194, + "grad_norm": 0.6335064768791199, + "learning_rate": 1.0264719295740251e-05, + "loss": 0.2205, + "step": 6484 + }, + { + "epoch": 0.6720903720592808, + "grad_norm": 0.6364689469337463, + "learning_rate": 1.0258855357901805e-05, + "loss": 0.1982, + "step": 6485 + }, + { + "epoch": 0.6721940097419422, + "grad_norm": 0.5329858660697937, + "learning_rate": 1.0252992517680384e-05, + "loss": 0.1636, + "step": 6486 + }, + { + "epoch": 0.6722976474246036, + "grad_norm": 0.6173647046089172, + "learning_rate": 1.0247130775736625e-05, + "loss": 0.2033, + "step": 6487 + }, + { + "epoch": 0.672401285107265, + "grad_norm": 0.6465008854866028, + "learning_rate": 1.0241270132731007e-05, + "loss": 0.2266, + "step": 6488 + }, + { + "epoch": 0.6725049227899265, + "grad_norm": 0.6485345363616943, + "learning_rate": 1.0235410589323892e-05, + "loss": 0.1974, + "step": 6489 + }, + { + "epoch": 0.6726085604725879, + "grad_norm": 0.6480139493942261, + "learning_rate": 1.0229552146175543e-05, + "loss": 0.2248, + "step": 6490 + }, + { + "epoch": 0.6727121981552493, + "grad_norm": 0.5608558654785156, + "learning_rate": 1.022369480394606e-05, + "loss": 0.1878, + "step": 6491 + }, + { + "epoch": 0.6728158358379107, + "grad_norm": 0.6115498542785645, + "learning_rate": 1.0217838563295456e-05, + "loss": 0.1926, + "step": 6492 + }, + { + "epoch": 0.6729194735205721, + "grad_norm": 0.6429728269577026, + "learning_rate": 1.0211983424883598e-05, + "loss": 0.2051, + "step": 6493 + }, + { + "epoch": 0.6730231112032335, + "grad_norm": 0.6617724895477295, + "learning_rate": 1.0206129389370222e-05, + "loss": 0.2178, + "step": 6494 + }, + { + "epoch": 0.673126748885895, + "grad_norm": 0.6400306820869446, + "learning_rate": 1.020027645741497e-05, + "loss": 0.2006, + "step": 6495 + }, + { + "epoch": 0.6732303865685563, + "grad_norm": 0.5858351588249207, + "learning_rate": 1.0194424629677328e-05, + "loss": 0.1879, + "step": 6496 + }, + { + "epoch": 0.6733340242512177, + "grad_norm": 0.6285557150840759, + "learning_rate": 1.0188573906816672e-05, + "loss": 0.186, + "step": 6497 + }, + { + "epoch": 0.6734376619338791, + "grad_norm": 0.5560044050216675, + "learning_rate": 1.0182724289492265e-05, + "loss": 0.1782, + "step": 6498 + }, + { + "epoch": 0.6735412996165405, + "grad_norm": 0.5952905416488647, + "learning_rate": 1.0176875778363225e-05, + "loss": 0.2016, + "step": 6499 + }, + { + "epoch": 0.6736449372992019, + "grad_norm": 0.5803484320640564, + "learning_rate": 1.0171028374088552e-05, + "loss": 0.1996, + "step": 6500 + }, + { + "epoch": 0.6737485749818634, + "grad_norm": 0.7529601454734802, + "learning_rate": 1.0165182077327111e-05, + "loss": 0.228, + "step": 6501 + }, + { + "epoch": 0.6738522126645248, + "grad_norm": 0.6083478331565857, + "learning_rate": 1.015933688873767e-05, + "loss": 0.203, + "step": 6502 + }, + { + "epoch": 0.6739558503471862, + "grad_norm": 0.6588034629821777, + "learning_rate": 1.0153492808978855e-05, + "loss": 0.1929, + "step": 6503 + }, + { + "epoch": 0.6740594880298476, + "grad_norm": 0.7018873691558838, + "learning_rate": 1.0147649838709154e-05, + "loss": 0.2466, + "step": 6504 + }, + { + "epoch": 0.674163125712509, + "grad_norm": 0.614115297794342, + "learning_rate": 1.0141807978586967e-05, + "loss": 0.1956, + "step": 6505 + }, + { + "epoch": 0.6742667633951704, + "grad_norm": 0.5698854923248291, + "learning_rate": 1.0135967229270527e-05, + "loss": 0.173, + "step": 6506 + }, + { + "epoch": 0.6743704010778319, + "grad_norm": 0.5453628897666931, + "learning_rate": 1.013012759141796e-05, + "loss": 0.1696, + "step": 6507 + }, + { + "epoch": 0.6744740387604933, + "grad_norm": 0.6856065988540649, + "learning_rate": 1.012428906568728e-05, + "loss": 0.2373, + "step": 6508 + }, + { + "epoch": 0.6745776764431547, + "grad_norm": 0.6497329473495483, + "learning_rate": 1.011845165273635e-05, + "loss": 0.2016, + "step": 6509 + }, + { + "epoch": 0.6746813141258161, + "grad_norm": 0.6395732164382935, + "learning_rate": 1.0112615353222934e-05, + "loss": 0.2267, + "step": 6510 + }, + { + "epoch": 0.6747849518084775, + "grad_norm": 0.6444329619407654, + "learning_rate": 1.0106780167804642e-05, + "loss": 0.1958, + "step": 6511 + }, + { + "epoch": 0.674888589491139, + "grad_norm": 0.5601248145103455, + "learning_rate": 1.0100946097138988e-05, + "loss": 0.1757, + "step": 6512 + }, + { + "epoch": 0.6749922271738004, + "grad_norm": 0.7075749635696411, + "learning_rate": 1.009511314188334e-05, + "loss": 0.2391, + "step": 6513 + }, + { + "epoch": 0.6750958648564618, + "grad_norm": 0.8617693781852722, + "learning_rate": 1.0089281302694938e-05, + "loss": 0.2913, + "step": 6514 + }, + { + "epoch": 0.6751995025391232, + "grad_norm": 0.6507748961448669, + "learning_rate": 1.008345058023092e-05, + "loss": 0.178, + "step": 6515 + }, + { + "epoch": 0.6753031402217846, + "grad_norm": 0.6890247464179993, + "learning_rate": 1.0077620975148266e-05, + "loss": 0.2301, + "step": 6516 + }, + { + "epoch": 0.675406777904446, + "grad_norm": 0.6728772521018982, + "learning_rate": 1.0071792488103858e-05, + "loss": 0.2165, + "step": 6517 + }, + { + "epoch": 0.6755104155871074, + "grad_norm": 0.5938656330108643, + "learning_rate": 1.0065965119754444e-05, + "loss": 0.2102, + "step": 6518 + }, + { + "epoch": 0.6756140532697689, + "grad_norm": 0.6155257821083069, + "learning_rate": 1.0060138870756639e-05, + "loss": 0.1723, + "step": 6519 + }, + { + "epoch": 0.6757176909524303, + "grad_norm": 0.6233095526695251, + "learning_rate": 1.0054313741766935e-05, + "loss": 0.1971, + "step": 6520 + }, + { + "epoch": 0.6758213286350917, + "grad_norm": 0.63090980052948, + "learning_rate": 1.0048489733441689e-05, + "loss": 0.1919, + "step": 6521 + }, + { + "epoch": 0.6759249663177531, + "grad_norm": 0.6411224603652954, + "learning_rate": 1.0042666846437151e-05, + "loss": 0.2079, + "step": 6522 + }, + { + "epoch": 0.6760286040004145, + "grad_norm": 0.5687317252159119, + "learning_rate": 1.0036845081409441e-05, + "loss": 0.1825, + "step": 6523 + }, + { + "epoch": 0.676132241683076, + "grad_norm": 0.625381350517273, + "learning_rate": 1.003102443901454e-05, + "loss": 0.1768, + "step": 6524 + }, + { + "epoch": 0.6762358793657374, + "grad_norm": 0.6624096632003784, + "learning_rate": 1.0025204919908304e-05, + "loss": 0.2164, + "step": 6525 + }, + { + "epoch": 0.6763395170483988, + "grad_norm": 0.7490399479866028, + "learning_rate": 1.0019386524746468e-05, + "loss": 0.2266, + "step": 6526 + }, + { + "epoch": 0.6764431547310602, + "grad_norm": 0.608364462852478, + "learning_rate": 1.0013569254184644e-05, + "loss": 0.2014, + "step": 6527 + }, + { + "epoch": 0.6765467924137216, + "grad_norm": 0.4916069209575653, + "learning_rate": 1.0007753108878315e-05, + "loss": 0.156, + "step": 6528 + }, + { + "epoch": 0.676650430096383, + "grad_norm": 0.7098358273506165, + "learning_rate": 1.000193808948283e-05, + "loss": 0.2104, + "step": 6529 + }, + { + "epoch": 0.6767540677790445, + "grad_norm": 0.642331600189209, + "learning_rate": 9.996124196653425e-06, + "loss": 0.2237, + "step": 6530 + }, + { + "epoch": 0.6768577054617059, + "grad_norm": 0.6876441240310669, + "learning_rate": 9.990311431045192e-06, + "loss": 0.2116, + "step": 6531 + }, + { + "epoch": 0.6769613431443673, + "grad_norm": 0.6571114659309387, + "learning_rate": 9.984499793313101e-06, + "loss": 0.2079, + "step": 6532 + }, + { + "epoch": 0.6770649808270287, + "grad_norm": 0.588650107383728, + "learning_rate": 9.978689284112011e-06, + "loss": 0.2035, + "step": 6533 + }, + { + "epoch": 0.6771686185096901, + "grad_norm": 0.5409191250801086, + "learning_rate": 9.972879904096627e-06, + "loss": 0.1662, + "step": 6534 + }, + { + "epoch": 0.6772722561923515, + "grad_norm": 0.6306576728820801, + "learning_rate": 9.967071653921553e-06, + "loss": 0.2059, + "step": 6535 + }, + { + "epoch": 0.677375893875013, + "grad_norm": 0.520565927028656, + "learning_rate": 9.961264534241244e-06, + "loss": 0.1732, + "step": 6536 + }, + { + "epoch": 0.6774795315576744, + "grad_norm": 0.6069191098213196, + "learning_rate": 9.955458545710048e-06, + "loss": 0.1992, + "step": 6537 + }, + { + "epoch": 0.6775831692403358, + "grad_norm": 0.4916059076786041, + "learning_rate": 9.949653688982168e-06, + "loss": 0.1414, + "step": 6538 + }, + { + "epoch": 0.6776868069229972, + "grad_norm": 0.7133325338363647, + "learning_rate": 9.94384996471168e-06, + "loss": 0.2446, + "step": 6539 + }, + { + "epoch": 0.6777904446056586, + "grad_norm": 0.6343425512313843, + "learning_rate": 9.938047373552554e-06, + "loss": 0.1779, + "step": 6540 + }, + { + "epoch": 0.67789408228832, + "grad_norm": 0.6749465465545654, + "learning_rate": 9.932245916158599e-06, + "loss": 0.2319, + "step": 6541 + }, + { + "epoch": 0.6779977199709815, + "grad_norm": 0.6080639958381653, + "learning_rate": 9.926445593183524e-06, + "loss": 0.2151, + "step": 6542 + }, + { + "epoch": 0.6781013576536429, + "grad_norm": 0.7060633301734924, + "learning_rate": 9.920646405280907e-06, + "loss": 0.2051, + "step": 6543 + }, + { + "epoch": 0.6782049953363043, + "grad_norm": 0.5446678996086121, + "learning_rate": 9.914848353104185e-06, + "loss": 0.1879, + "step": 6544 + }, + { + "epoch": 0.6783086330189657, + "grad_norm": 0.5900073051452637, + "learning_rate": 9.909051437306674e-06, + "loss": 0.2155, + "step": 6545 + }, + { + "epoch": 0.6784122707016271, + "grad_norm": 0.5862210392951965, + "learning_rate": 9.903255658541551e-06, + "loss": 0.1696, + "step": 6546 + }, + { + "epoch": 0.6785159083842885, + "grad_norm": 0.6753283143043518, + "learning_rate": 9.897461017461885e-06, + "loss": 0.216, + "step": 6547 + }, + { + "epoch": 0.67861954606695, + "grad_norm": 0.613251268863678, + "learning_rate": 9.891667514720616e-06, + "loss": 0.1948, + "step": 6548 + }, + { + "epoch": 0.6787231837496114, + "grad_norm": 0.5268702507019043, + "learning_rate": 9.885875150970527e-06, + "loss": 0.15, + "step": 6549 + }, + { + "epoch": 0.6788268214322728, + "grad_norm": 0.6268615126609802, + "learning_rate": 9.880083926864321e-06, + "loss": 0.2068, + "step": 6550 + }, + { + "epoch": 0.6789304591149342, + "grad_norm": 0.6250087022781372, + "learning_rate": 9.874293843054512e-06, + "loss": 0.2036, + "step": 6551 + }, + { + "epoch": 0.6790340967975956, + "grad_norm": 0.4842534363269806, + "learning_rate": 9.86850490019353e-06, + "loss": 0.1598, + "step": 6552 + }, + { + "epoch": 0.679137734480257, + "grad_norm": 0.9720544219017029, + "learning_rate": 9.862717098933675e-06, + "loss": 0.1766, + "step": 6553 + }, + { + "epoch": 0.6792413721629185, + "grad_norm": 0.6002216935157776, + "learning_rate": 9.85693043992709e-06, + "loss": 0.1951, + "step": 6554 + }, + { + "epoch": 0.6793450098455799, + "grad_norm": 0.5289440751075745, + "learning_rate": 9.851144923825823e-06, + "loss": 0.1799, + "step": 6555 + }, + { + "epoch": 0.6794486475282413, + "grad_norm": 0.6132556200027466, + "learning_rate": 9.845360551281771e-06, + "loss": 0.201, + "step": 6556 + }, + { + "epoch": 0.6795522852109027, + "grad_norm": 0.659072756767273, + "learning_rate": 9.839577322946697e-06, + "loss": 0.2324, + "step": 6557 + }, + { + "epoch": 0.6796559228935641, + "grad_norm": 0.5505070090293884, + "learning_rate": 9.833795239472264e-06, + "loss": 0.1905, + "step": 6558 + }, + { + "epoch": 0.6797595605762256, + "grad_norm": 0.5536336898803711, + "learning_rate": 9.82801430150997e-06, + "loss": 0.1771, + "step": 6559 + }, + { + "epoch": 0.679863198258887, + "grad_norm": 0.6265289187431335, + "learning_rate": 9.82223450971122e-06, + "loss": 0.2106, + "step": 6560 + }, + { + "epoch": 0.6799668359415484, + "grad_norm": 0.5902706980705261, + "learning_rate": 9.816455864727259e-06, + "loss": 0.205, + "step": 6561 + }, + { + "epoch": 0.6800704736242098, + "grad_norm": 0.5661330223083496, + "learning_rate": 9.810678367209227e-06, + "loss": 0.1659, + "step": 6562 + }, + { + "epoch": 0.6801741113068712, + "grad_norm": 0.6367207169532776, + "learning_rate": 9.804902017808116e-06, + "loss": 0.2077, + "step": 6563 + }, + { + "epoch": 0.6802777489895326, + "grad_norm": 0.7023079991340637, + "learning_rate": 9.799126817174789e-06, + "loss": 0.217, + "step": 6564 + }, + { + "epoch": 0.680381386672194, + "grad_norm": 0.6935189962387085, + "learning_rate": 9.793352765960004e-06, + "loss": 0.2127, + "step": 6565 + }, + { + "epoch": 0.6804850243548555, + "grad_norm": 0.5774267315864563, + "learning_rate": 9.787579864814354e-06, + "loss": 0.185, + "step": 6566 + }, + { + "epoch": 0.6805886620375169, + "grad_norm": 0.5946171879768372, + "learning_rate": 9.781808114388329e-06, + "loss": 0.1977, + "step": 6567 + }, + { + "epoch": 0.6806922997201783, + "grad_norm": 0.6323895454406738, + "learning_rate": 9.776037515332291e-06, + "loss": 0.2236, + "step": 6568 + }, + { + "epoch": 0.6807959374028397, + "grad_norm": 0.5324143767356873, + "learning_rate": 9.77026806829645e-06, + "loss": 0.174, + "step": 6569 + }, + { + "epoch": 0.6808995750855011, + "grad_norm": 0.6232479810714722, + "learning_rate": 9.764499773930902e-06, + "loss": 0.1951, + "step": 6570 + }, + { + "epoch": 0.6810032127681626, + "grad_norm": 0.6016594767570496, + "learning_rate": 9.7587326328856e-06, + "loss": 0.1881, + "step": 6571 + }, + { + "epoch": 0.6811068504508239, + "grad_norm": 0.6569531559944153, + "learning_rate": 9.752966645810384e-06, + "loss": 0.204, + "step": 6572 + }, + { + "epoch": 0.6812104881334853, + "grad_norm": 0.6725072264671326, + "learning_rate": 9.747201813354965e-06, + "loss": 0.2137, + "step": 6573 + }, + { + "epoch": 0.6813141258161467, + "grad_norm": 0.5521656274795532, + "learning_rate": 9.741438136168902e-06, + "loss": 0.1794, + "step": 6574 + }, + { + "epoch": 0.6814177634988081, + "grad_norm": 0.5544648766517639, + "learning_rate": 9.735675614901648e-06, + "loss": 0.1838, + "step": 6575 + }, + { + "epoch": 0.6815214011814695, + "grad_norm": 0.650189995765686, + "learning_rate": 9.729914250202507e-06, + "loss": 0.2, + "step": 6576 + }, + { + "epoch": 0.681625038864131, + "grad_norm": 0.4788687229156494, + "learning_rate": 9.724154042720659e-06, + "loss": 0.145, + "step": 6577 + }, + { + "epoch": 0.6817286765467924, + "grad_norm": 0.5694932341575623, + "learning_rate": 9.718394993105167e-06, + "loss": 0.193, + "step": 6578 + }, + { + "epoch": 0.6818323142294538, + "grad_norm": 0.6239579319953918, + "learning_rate": 9.712637102004936e-06, + "loss": 0.1809, + "step": 6579 + }, + { + "epoch": 0.6819359519121152, + "grad_norm": 0.6551296710968018, + "learning_rate": 9.70688037006877e-06, + "loss": 0.2346, + "step": 6580 + }, + { + "epoch": 0.6820395895947766, + "grad_norm": 0.6136123538017273, + "learning_rate": 9.701124797945318e-06, + "loss": 0.199, + "step": 6581 + }, + { + "epoch": 0.682143227277438, + "grad_norm": 0.5647212266921997, + "learning_rate": 9.695370386283121e-06, + "loss": 0.1813, + "step": 6582 + }, + { + "epoch": 0.6822468649600995, + "grad_norm": 0.6047530174255371, + "learning_rate": 9.689617135730566e-06, + "loss": 0.1863, + "step": 6583 + }, + { + "epoch": 0.6823505026427609, + "grad_norm": 0.5722807049751282, + "learning_rate": 9.683865046935923e-06, + "loss": 0.1696, + "step": 6584 + }, + { + "epoch": 0.6824541403254223, + "grad_norm": 0.597629189491272, + "learning_rate": 9.678114120547333e-06, + "loss": 0.2195, + "step": 6585 + }, + { + "epoch": 0.6825577780080837, + "grad_norm": 0.6136654019355774, + "learning_rate": 9.672364357212793e-06, + "loss": 0.2049, + "step": 6586 + }, + { + "epoch": 0.6826614156907451, + "grad_norm": 0.7166646718978882, + "learning_rate": 9.666615757580189e-06, + "loss": 0.2406, + "step": 6587 + }, + { + "epoch": 0.6827650533734065, + "grad_norm": 0.6847596168518066, + "learning_rate": 9.66086832229726e-06, + "loss": 0.2443, + "step": 6588 + }, + { + "epoch": 0.682868691056068, + "grad_norm": 0.7182942628860474, + "learning_rate": 9.655122052011604e-06, + "loss": 0.2095, + "step": 6589 + }, + { + "epoch": 0.6829723287387294, + "grad_norm": 0.6858102083206177, + "learning_rate": 9.649376947370724e-06, + "loss": 0.2029, + "step": 6590 + }, + { + "epoch": 0.6830759664213908, + "grad_norm": 0.6318065524101257, + "learning_rate": 9.643633009021952e-06, + "loss": 0.217, + "step": 6591 + }, + { + "epoch": 0.6831796041040522, + "grad_norm": 0.5490069389343262, + "learning_rate": 9.637890237612512e-06, + "loss": 0.1983, + "step": 6592 + }, + { + "epoch": 0.6832832417867136, + "grad_norm": 0.6092766523361206, + "learning_rate": 9.6321486337895e-06, + "loss": 0.2206, + "step": 6593 + }, + { + "epoch": 0.683386879469375, + "grad_norm": 0.6065627336502075, + "learning_rate": 9.626408198199864e-06, + "loss": 0.1804, + "step": 6594 + }, + { + "epoch": 0.6834905171520365, + "grad_norm": 0.5605596303939819, + "learning_rate": 9.620668931490425e-06, + "loss": 0.1733, + "step": 6595 + }, + { + "epoch": 0.6835941548346979, + "grad_norm": 0.6768938899040222, + "learning_rate": 9.614930834307867e-06, + "loss": 0.2123, + "step": 6596 + }, + { + "epoch": 0.6836977925173593, + "grad_norm": 0.6282684803009033, + "learning_rate": 9.609193907298762e-06, + "loss": 0.1992, + "step": 6597 + }, + { + "epoch": 0.6838014302000207, + "grad_norm": 0.512245774269104, + "learning_rate": 9.60345815110954e-06, + "loss": 0.1934, + "step": 6598 + }, + { + "epoch": 0.6839050678826821, + "grad_norm": 0.6321278214454651, + "learning_rate": 9.597723566386484e-06, + "loss": 0.1997, + "step": 6599 + }, + { + "epoch": 0.6840087055653435, + "grad_norm": 0.56236732006073, + "learning_rate": 9.591990153775774e-06, + "loss": 0.1812, + "step": 6600 + }, + { + "epoch": 0.684112343248005, + "grad_norm": 0.5921213626861572, + "learning_rate": 9.586257913923433e-06, + "loss": 0.1733, + "step": 6601 + }, + { + "epoch": 0.6842159809306664, + "grad_norm": 0.6034637093544006, + "learning_rate": 9.580526847475356e-06, + "loss": 0.2052, + "step": 6602 + }, + { + "epoch": 0.6843196186133278, + "grad_norm": 0.6185294389724731, + "learning_rate": 9.574796955077323e-06, + "loss": 0.2122, + "step": 6603 + }, + { + "epoch": 0.6844232562959892, + "grad_norm": 0.6712292432785034, + "learning_rate": 9.569068237374955e-06, + "loss": 0.1815, + "step": 6604 + }, + { + "epoch": 0.6845268939786506, + "grad_norm": 0.6129352450370789, + "learning_rate": 9.563340695013772e-06, + "loss": 0.211, + "step": 6605 + }, + { + "epoch": 0.684630531661312, + "grad_norm": 0.6321895718574524, + "learning_rate": 9.557614328639127e-06, + "loss": 0.1953, + "step": 6606 + }, + { + "epoch": 0.6847341693439735, + "grad_norm": 0.48475220799446106, + "learning_rate": 9.551889138896274e-06, + "loss": 0.1606, + "step": 6607 + }, + { + "epoch": 0.6848378070266349, + "grad_norm": 0.5537219643592834, + "learning_rate": 9.546165126430309e-06, + "loss": 0.1823, + "step": 6608 + }, + { + "epoch": 0.6849414447092963, + "grad_norm": 0.8098316192626953, + "learning_rate": 9.540442291886201e-06, + "loss": 0.2273, + "step": 6609 + }, + { + "epoch": 0.6850450823919577, + "grad_norm": 0.6700147390365601, + "learning_rate": 9.534720635908803e-06, + "loss": 0.2132, + "step": 6610 + }, + { + "epoch": 0.6851487200746191, + "grad_norm": 0.6605697274208069, + "learning_rate": 9.529000159142806e-06, + "loss": 0.2098, + "step": 6611 + }, + { + "epoch": 0.6852523577572806, + "grad_norm": 0.5277429223060608, + "learning_rate": 9.523280862232795e-06, + "loss": 0.1619, + "step": 6612 + }, + { + "epoch": 0.685355995439942, + "grad_norm": 0.6423800587654114, + "learning_rate": 9.517562745823228e-06, + "loss": 0.1979, + "step": 6613 + }, + { + "epoch": 0.6854596331226034, + "grad_norm": 0.5679594874382019, + "learning_rate": 9.511845810558376e-06, + "loss": 0.1915, + "step": 6614 + }, + { + "epoch": 0.6855632708052648, + "grad_norm": 0.6598084568977356, + "learning_rate": 9.506130057082444e-06, + "loss": 0.2111, + "step": 6615 + }, + { + "epoch": 0.6856669084879262, + "grad_norm": 0.6002801656723022, + "learning_rate": 9.500415486039456e-06, + "loss": 0.1962, + "step": 6616 + }, + { + "epoch": 0.6857705461705876, + "grad_norm": 0.6224452257156372, + "learning_rate": 9.49470209807333e-06, + "loss": 0.2008, + "step": 6617 + }, + { + "epoch": 0.685874183853249, + "grad_norm": 0.5566638708114624, + "learning_rate": 9.488989893827847e-06, + "loss": 0.1935, + "step": 6618 + }, + { + "epoch": 0.6859778215359105, + "grad_norm": 0.6481515169143677, + "learning_rate": 9.483278873946643e-06, + "loss": 0.2004, + "step": 6619 + }, + { + "epoch": 0.6860814592185719, + "grad_norm": 0.5335357785224915, + "learning_rate": 9.477569039073227e-06, + "loss": 0.1573, + "step": 6620 + }, + { + "epoch": 0.6861850969012333, + "grad_norm": 0.6524526476860046, + "learning_rate": 9.471860389850967e-06, + "loss": 0.2323, + "step": 6621 + }, + { + "epoch": 0.6862887345838947, + "grad_norm": 0.6136525273323059, + "learning_rate": 9.46615292692311e-06, + "loss": 0.1873, + "step": 6622 + }, + { + "epoch": 0.6863923722665561, + "grad_norm": 0.6165292859077454, + "learning_rate": 9.460446650932778e-06, + "loss": 0.1827, + "step": 6623 + }, + { + "epoch": 0.6864960099492176, + "grad_norm": 0.6506378054618835, + "learning_rate": 9.454741562522922e-06, + "loss": 0.1998, + "step": 6624 + }, + { + "epoch": 0.686599647631879, + "grad_norm": 0.6071855425834656, + "learning_rate": 9.449037662336405e-06, + "loss": 0.2088, + "step": 6625 + }, + { + "epoch": 0.6867032853145404, + "grad_norm": 0.6445494294166565, + "learning_rate": 9.44333495101592e-06, + "loss": 0.186, + "step": 6626 + }, + { + "epoch": 0.6868069229972018, + "grad_norm": 0.6092303991317749, + "learning_rate": 9.437633429204033e-06, + "loss": 0.1988, + "step": 6627 + }, + { + "epoch": 0.6869105606798632, + "grad_norm": 0.5655484795570374, + "learning_rate": 9.431933097543202e-06, + "loss": 0.1717, + "step": 6628 + }, + { + "epoch": 0.6870141983625246, + "grad_norm": 0.6163622736930847, + "learning_rate": 9.426233956675712e-06, + "loss": 0.179, + "step": 6629 + }, + { + "epoch": 0.6871178360451861, + "grad_norm": 0.5814242959022522, + "learning_rate": 9.420536007243744e-06, + "loss": 0.1849, + "step": 6630 + }, + { + "epoch": 0.6872214737278475, + "grad_norm": 0.5335648059844971, + "learning_rate": 9.414839249889338e-06, + "loss": 0.1771, + "step": 6631 + }, + { + "epoch": 0.6873251114105089, + "grad_norm": 0.6043835282325745, + "learning_rate": 9.409143685254391e-06, + "loss": 0.1998, + "step": 6632 + }, + { + "epoch": 0.6874287490931703, + "grad_norm": 0.6903834939002991, + "learning_rate": 9.403449313980671e-06, + "loss": 0.2307, + "step": 6633 + }, + { + "epoch": 0.6875323867758317, + "grad_norm": 0.6415154933929443, + "learning_rate": 9.397756136709801e-06, + "loss": 0.1853, + "step": 6634 + }, + { + "epoch": 0.6876360244584931, + "grad_norm": 0.6361954808235168, + "learning_rate": 9.392064154083288e-06, + "loss": 0.2021, + "step": 6635 + }, + { + "epoch": 0.6877396621411546, + "grad_norm": 0.6217038631439209, + "learning_rate": 9.386373366742505e-06, + "loss": 0.2018, + "step": 6636 + }, + { + "epoch": 0.687843299823816, + "grad_norm": 0.5349450707435608, + "learning_rate": 9.380683775328662e-06, + "loss": 0.1548, + "step": 6637 + }, + { + "epoch": 0.6879469375064774, + "grad_norm": 0.657341718673706, + "learning_rate": 9.374995380482872e-06, + "loss": 0.2109, + "step": 6638 + }, + { + "epoch": 0.6880505751891388, + "grad_norm": 0.6744164824485779, + "learning_rate": 9.369308182846086e-06, + "loss": 0.179, + "step": 6639 + }, + { + "epoch": 0.6881542128718002, + "grad_norm": 0.5822855234146118, + "learning_rate": 9.36362218305912e-06, + "loss": 0.1793, + "step": 6640 + }, + { + "epoch": 0.6882578505544616, + "grad_norm": 0.6837253570556641, + "learning_rate": 9.35793738176268e-06, + "loss": 0.2267, + "step": 6641 + }, + { + "epoch": 0.6883614882371231, + "grad_norm": 0.7134829163551331, + "learning_rate": 9.352253779597305e-06, + "loss": 0.2496, + "step": 6642 + }, + { + "epoch": 0.6884651259197845, + "grad_norm": 0.542172908782959, + "learning_rate": 9.346571377203428e-06, + "loss": 0.1805, + "step": 6643 + }, + { + "epoch": 0.6885687636024459, + "grad_norm": 0.634016752243042, + "learning_rate": 9.34089017522132e-06, + "loss": 0.1763, + "step": 6644 + }, + { + "epoch": 0.6886724012851073, + "grad_norm": 0.6878634095191956, + "learning_rate": 9.335210174291145e-06, + "loss": 0.2227, + "step": 6645 + }, + { + "epoch": 0.6887760389677687, + "grad_norm": 0.6338194608688354, + "learning_rate": 9.32953137505291e-06, + "loss": 0.2074, + "step": 6646 + }, + { + "epoch": 0.6888796766504302, + "grad_norm": 0.588844895362854, + "learning_rate": 9.32385377814648e-06, + "loss": 0.1972, + "step": 6647 + }, + { + "epoch": 0.6889833143330915, + "grad_norm": 0.6829007863998413, + "learning_rate": 9.318177384211621e-06, + "loss": 0.2387, + "step": 6648 + }, + { + "epoch": 0.6890869520157529, + "grad_norm": 0.601935625076294, + "learning_rate": 9.312502193887922e-06, + "loss": 0.1743, + "step": 6649 + }, + { + "epoch": 0.6891905896984143, + "grad_norm": 0.6067055463790894, + "learning_rate": 9.30682820781487e-06, + "loss": 0.1696, + "step": 6650 + }, + { + "epoch": 0.6892942273810757, + "grad_norm": 0.518624484539032, + "learning_rate": 9.301155426631792e-06, + "loss": 0.1541, + "step": 6651 + }, + { + "epoch": 0.6893978650637371, + "grad_norm": 0.6755282282829285, + "learning_rate": 9.29548385097788e-06, + "loss": 0.2102, + "step": 6652 + }, + { + "epoch": 0.6895015027463985, + "grad_norm": 0.6593364477157593, + "learning_rate": 9.289813481492216e-06, + "loss": 0.1932, + "step": 6653 + }, + { + "epoch": 0.68960514042906, + "grad_norm": 0.6965281963348389, + "learning_rate": 9.28414431881371e-06, + "loss": 0.2365, + "step": 6654 + }, + { + "epoch": 0.6897087781117214, + "grad_norm": 0.6095872521400452, + "learning_rate": 9.278476363581166e-06, + "loss": 0.2224, + "step": 6655 + }, + { + "epoch": 0.6898124157943828, + "grad_norm": 0.6280148029327393, + "learning_rate": 9.272809616433245e-06, + "loss": 0.1917, + "step": 6656 + }, + { + "epoch": 0.6899160534770442, + "grad_norm": 0.6921609044075012, + "learning_rate": 9.267144078008462e-06, + "loss": 0.1953, + "step": 6657 + }, + { + "epoch": 0.6900196911597056, + "grad_norm": 0.7111772894859314, + "learning_rate": 9.261479748945199e-06, + "loss": 0.2258, + "step": 6658 + }, + { + "epoch": 0.690123328842367, + "grad_norm": 0.6491702198982239, + "learning_rate": 9.255816629881698e-06, + "loss": 0.187, + "step": 6659 + }, + { + "epoch": 0.6902269665250285, + "grad_norm": 0.6895001530647278, + "learning_rate": 9.250154721456075e-06, + "loss": 0.2074, + "step": 6660 + }, + { + "epoch": 0.6903306042076899, + "grad_norm": 0.6911149621009827, + "learning_rate": 9.244494024306315e-06, + "loss": 0.1962, + "step": 6661 + }, + { + "epoch": 0.6904342418903513, + "grad_norm": 0.5192302465438843, + "learning_rate": 9.23883453907024e-06, + "loss": 0.1676, + "step": 6662 + }, + { + "epoch": 0.6905378795730127, + "grad_norm": 0.6603487133979797, + "learning_rate": 9.233176266385571e-06, + "loss": 0.2041, + "step": 6663 + }, + { + "epoch": 0.6906415172556741, + "grad_norm": 0.5469902157783508, + "learning_rate": 9.22751920688986e-06, + "loss": 0.1585, + "step": 6664 + }, + { + "epoch": 0.6907451549383355, + "grad_norm": 0.6095395088195801, + "learning_rate": 9.221863361220534e-06, + "loss": 0.2082, + "step": 6665 + }, + { + "epoch": 0.690848792620997, + "grad_norm": 0.7023966312408447, + "learning_rate": 9.216208730014895e-06, + "loss": 0.2116, + "step": 6666 + }, + { + "epoch": 0.6909524303036584, + "grad_norm": 0.5786901116371155, + "learning_rate": 9.210555313910086e-06, + "loss": 0.1756, + "step": 6667 + }, + { + "epoch": 0.6910560679863198, + "grad_norm": 0.6502581238746643, + "learning_rate": 9.20490311354314e-06, + "loss": 0.1929, + "step": 6668 + }, + { + "epoch": 0.6911597056689812, + "grad_norm": 0.6279124617576599, + "learning_rate": 9.199252129550922e-06, + "loss": 0.1784, + "step": 6669 + }, + { + "epoch": 0.6912633433516426, + "grad_norm": 0.6859745383262634, + "learning_rate": 9.193602362570188e-06, + "loss": 0.2056, + "step": 6670 + }, + { + "epoch": 0.691366981034304, + "grad_norm": 0.6891130208969116, + "learning_rate": 9.187953813237544e-06, + "loss": 0.2161, + "step": 6671 + }, + { + "epoch": 0.6914706187169655, + "grad_norm": 0.685141921043396, + "learning_rate": 9.18230648218945e-06, + "loss": 0.2127, + "step": 6672 + }, + { + "epoch": 0.6915742563996269, + "grad_norm": 0.5688890814781189, + "learning_rate": 9.17666037006225e-06, + "loss": 0.1718, + "step": 6673 + }, + { + "epoch": 0.6916778940822883, + "grad_norm": 0.6026402115821838, + "learning_rate": 9.171015477492129e-06, + "loss": 0.1706, + "step": 6674 + }, + { + "epoch": 0.6917815317649497, + "grad_norm": 0.6500517725944519, + "learning_rate": 9.165371805115151e-06, + "loss": 0.1848, + "step": 6675 + }, + { + "epoch": 0.6918851694476111, + "grad_norm": 0.5920248627662659, + "learning_rate": 9.159729353567248e-06, + "loss": 0.2026, + "step": 6676 + }, + { + "epoch": 0.6919888071302726, + "grad_norm": 0.48986637592315674, + "learning_rate": 9.154088123484175e-06, + "loss": 0.1551, + "step": 6677 + }, + { + "epoch": 0.692092444812934, + "grad_norm": 0.6007040739059448, + "learning_rate": 9.1484481155016e-06, + "loss": 0.1839, + "step": 6678 + }, + { + "epoch": 0.6921960824955954, + "grad_norm": 0.4834362268447876, + "learning_rate": 9.142809330255015e-06, + "loss": 0.1471, + "step": 6679 + }, + { + "epoch": 0.6922997201782568, + "grad_norm": 0.5714542269706726, + "learning_rate": 9.137171768379796e-06, + "loss": 0.1876, + "step": 6680 + }, + { + "epoch": 0.6924033578609182, + "grad_norm": 0.5275331139564514, + "learning_rate": 9.131535430511185e-06, + "loss": 0.1654, + "step": 6681 + }, + { + "epoch": 0.6925069955435796, + "grad_norm": 0.6573442816734314, + "learning_rate": 9.125900317284265e-06, + "loss": 0.211, + "step": 6682 + }, + { + "epoch": 0.6926106332262411, + "grad_norm": 0.722928524017334, + "learning_rate": 9.120266429333993e-06, + "loss": 0.2449, + "step": 6683 + }, + { + "epoch": 0.6927142709089025, + "grad_norm": 0.6090424656867981, + "learning_rate": 9.114633767295178e-06, + "loss": 0.2074, + "step": 6684 + }, + { + "epoch": 0.6928179085915639, + "grad_norm": 0.6295527219772339, + "learning_rate": 9.10900233180251e-06, + "loss": 0.1865, + "step": 6685 + }, + { + "epoch": 0.6929215462742253, + "grad_norm": 0.664747953414917, + "learning_rate": 9.103372123490538e-06, + "loss": 0.2115, + "step": 6686 + }, + { + "epoch": 0.6930251839568867, + "grad_norm": 0.6330031156539917, + "learning_rate": 9.097743142993644e-06, + "loss": 0.2085, + "step": 6687 + }, + { + "epoch": 0.6931288216395481, + "grad_norm": 0.6022166013717651, + "learning_rate": 9.092115390946117e-06, + "loss": 0.2151, + "step": 6688 + }, + { + "epoch": 0.6932324593222096, + "grad_norm": 0.6728755831718445, + "learning_rate": 9.086488867982068e-06, + "loss": 0.2074, + "step": 6689 + }, + { + "epoch": 0.693336097004871, + "grad_norm": 0.5417996644973755, + "learning_rate": 9.08086357473548e-06, + "loss": 0.1755, + "step": 6690 + }, + { + "epoch": 0.6934397346875324, + "grad_norm": 0.547081708908081, + "learning_rate": 9.075239511840222e-06, + "loss": 0.1851, + "step": 6691 + }, + { + "epoch": 0.6935433723701938, + "grad_norm": 0.5020447373390198, + "learning_rate": 9.069616679929982e-06, + "loss": 0.1557, + "step": 6692 + }, + { + "epoch": 0.6936470100528552, + "grad_norm": 0.6151479482650757, + "learning_rate": 9.063995079638352e-06, + "loss": 0.1753, + "step": 6693 + }, + { + "epoch": 0.6937506477355166, + "grad_norm": 0.637016773223877, + "learning_rate": 9.058374711598747e-06, + "loss": 0.2052, + "step": 6694 + }, + { + "epoch": 0.6938542854181781, + "grad_norm": 0.5873505473136902, + "learning_rate": 9.052755576444479e-06, + "loss": 0.1735, + "step": 6695 + }, + { + "epoch": 0.6939579231008395, + "grad_norm": 0.5946987271308899, + "learning_rate": 9.047137674808694e-06, + "loss": 0.2131, + "step": 6696 + }, + { + "epoch": 0.6940615607835009, + "grad_norm": 0.5622970461845398, + "learning_rate": 9.041521007324403e-06, + "loss": 0.1803, + "step": 6697 + }, + { + "epoch": 0.6941651984661623, + "grad_norm": 0.5434853434562683, + "learning_rate": 9.035905574624496e-06, + "loss": 0.1699, + "step": 6698 + }, + { + "epoch": 0.6942688361488237, + "grad_norm": 0.599409818649292, + "learning_rate": 9.030291377341698e-06, + "loss": 0.1842, + "step": 6699 + }, + { + "epoch": 0.6943724738314851, + "grad_norm": 0.589805006980896, + "learning_rate": 9.024678416108615e-06, + "loss": 0.1921, + "step": 6700 + }, + { + "epoch": 0.6944761115141466, + "grad_norm": 0.6243669390678406, + "learning_rate": 9.019066691557714e-06, + "loss": 0.2022, + "step": 6701 + }, + { + "epoch": 0.694579749196808, + "grad_norm": 0.5878159403800964, + "learning_rate": 9.013456204321307e-06, + "loss": 0.1865, + "step": 6702 + }, + { + "epoch": 0.6946833868794694, + "grad_norm": 0.565056562423706, + "learning_rate": 9.007846955031575e-06, + "loss": 0.1897, + "step": 6703 + }, + { + "epoch": 0.6947870245621308, + "grad_norm": 0.48694732785224915, + "learning_rate": 9.002238944320555e-06, + "loss": 0.1539, + "step": 6704 + }, + { + "epoch": 0.6948906622447922, + "grad_norm": 0.5932483077049255, + "learning_rate": 8.996632172820155e-06, + "loss": 0.1718, + "step": 6705 + }, + { + "epoch": 0.6949942999274537, + "grad_norm": 0.6488050818443298, + "learning_rate": 8.991026641162144e-06, + "loss": 0.1962, + "step": 6706 + }, + { + "epoch": 0.6950979376101151, + "grad_norm": 0.6209344863891602, + "learning_rate": 8.985422349978127e-06, + "loss": 0.1948, + "step": 6707 + }, + { + "epoch": 0.6952015752927765, + "grad_norm": 0.5865502953529358, + "learning_rate": 8.979819299899615e-06, + "loss": 0.1798, + "step": 6708 + }, + { + "epoch": 0.6953052129754379, + "grad_norm": 0.6794852018356323, + "learning_rate": 8.974217491557916e-06, + "loss": 0.2036, + "step": 6709 + }, + { + "epoch": 0.6954088506580993, + "grad_norm": 0.6519509553909302, + "learning_rate": 8.968616925584253e-06, + "loss": 0.2042, + "step": 6710 + }, + { + "epoch": 0.6955124883407607, + "grad_norm": 0.6843174695968628, + "learning_rate": 8.963017602609691e-06, + "loss": 0.2087, + "step": 6711 + }, + { + "epoch": 0.6956161260234222, + "grad_norm": 0.8070350289344788, + "learning_rate": 8.957419523265142e-06, + "loss": 0.2181, + "step": 6712 + }, + { + "epoch": 0.6957197637060836, + "grad_norm": 0.5602341890335083, + "learning_rate": 8.951822688181405e-06, + "loss": 0.1616, + "step": 6713 + }, + { + "epoch": 0.695823401388745, + "grad_norm": 0.6749635934829712, + "learning_rate": 8.946227097989108e-06, + "loss": 0.1948, + "step": 6714 + }, + { + "epoch": 0.6959270390714064, + "grad_norm": 0.680906355381012, + "learning_rate": 8.940632753318755e-06, + "loss": 0.2078, + "step": 6715 + }, + { + "epoch": 0.6960306767540678, + "grad_norm": 0.6567285656929016, + "learning_rate": 8.935039654800714e-06, + "loss": 0.224, + "step": 6716 + }, + { + "epoch": 0.6961343144367292, + "grad_norm": 0.5188596844673157, + "learning_rate": 8.929447803065202e-06, + "loss": 0.1859, + "step": 6717 + }, + { + "epoch": 0.6962379521193907, + "grad_norm": 0.6843985319137573, + "learning_rate": 8.923857198742305e-06, + "loss": 0.2187, + "step": 6718 + }, + { + "epoch": 0.6963415898020521, + "grad_norm": 0.6555632948875427, + "learning_rate": 8.918267842461955e-06, + "loss": 0.2246, + "step": 6719 + }, + { + "epoch": 0.6964452274847135, + "grad_norm": 0.563077986240387, + "learning_rate": 8.912679734853963e-06, + "loss": 0.1897, + "step": 6720 + }, + { + "epoch": 0.6965488651673749, + "grad_norm": 0.6430466175079346, + "learning_rate": 8.907092876547984e-06, + "loss": 0.1791, + "step": 6721 + }, + { + "epoch": 0.6966525028500363, + "grad_norm": 0.6822545528411865, + "learning_rate": 8.90150726817353e-06, + "loss": 0.2179, + "step": 6722 + }, + { + "epoch": 0.6967561405326977, + "grad_norm": 0.6235038638114929, + "learning_rate": 8.89592291035999e-06, + "loss": 0.208, + "step": 6723 + }, + { + "epoch": 0.696859778215359, + "grad_norm": 0.6741113662719727, + "learning_rate": 8.890339803736587e-06, + "loss": 0.2042, + "step": 6724 + }, + { + "epoch": 0.6969634158980205, + "grad_norm": 0.6065617799758911, + "learning_rate": 8.884757948932426e-06, + "loss": 0.1959, + "step": 6725 + }, + { + "epoch": 0.6970670535806819, + "grad_norm": 0.6889532208442688, + "learning_rate": 8.879177346576466e-06, + "loss": 0.2158, + "step": 6726 + }, + { + "epoch": 0.6971706912633433, + "grad_norm": 0.5668037533760071, + "learning_rate": 8.873597997297516e-06, + "loss": 0.1659, + "step": 6727 + }, + { + "epoch": 0.6972743289460047, + "grad_norm": 0.5899565815925598, + "learning_rate": 8.868019901724248e-06, + "loss": 0.1864, + "step": 6728 + }, + { + "epoch": 0.6973779666286661, + "grad_norm": 0.6486213803291321, + "learning_rate": 8.862443060485184e-06, + "loss": 0.2049, + "step": 6729 + }, + { + "epoch": 0.6974816043113276, + "grad_norm": 0.597908079624176, + "learning_rate": 8.856867474208724e-06, + "loss": 0.1649, + "step": 6730 + }, + { + "epoch": 0.697585241993989, + "grad_norm": 0.6301078796386719, + "learning_rate": 8.851293143523118e-06, + "loss": 0.1797, + "step": 6731 + }, + { + "epoch": 0.6976888796766504, + "grad_norm": 0.5568992495536804, + "learning_rate": 8.845720069056465e-06, + "loss": 0.1658, + "step": 6732 + }, + { + "epoch": 0.6977925173593118, + "grad_norm": 0.6819639801979065, + "learning_rate": 8.840148251436741e-06, + "loss": 0.2158, + "step": 6733 + }, + { + "epoch": 0.6978961550419732, + "grad_norm": 0.5686694979667664, + "learning_rate": 8.834577691291768e-06, + "loss": 0.1897, + "step": 6734 + }, + { + "epoch": 0.6979997927246346, + "grad_norm": 0.5983203053474426, + "learning_rate": 8.829008389249212e-06, + "loss": 0.1684, + "step": 6735 + }, + { + "epoch": 0.698103430407296, + "grad_norm": 0.5979479551315308, + "learning_rate": 8.823440345936633e-06, + "loss": 0.1926, + "step": 6736 + }, + { + "epoch": 0.6982070680899575, + "grad_norm": 0.6044715046882629, + "learning_rate": 8.817873561981416e-06, + "loss": 0.1793, + "step": 6737 + }, + { + "epoch": 0.6983107057726189, + "grad_norm": 0.5939434170722961, + "learning_rate": 8.812308038010828e-06, + "loss": 0.1777, + "step": 6738 + }, + { + "epoch": 0.6984143434552803, + "grad_norm": 0.6867238879203796, + "learning_rate": 8.80674377465198e-06, + "loss": 0.2102, + "step": 6739 + }, + { + "epoch": 0.6985179811379417, + "grad_norm": 0.6145426630973816, + "learning_rate": 8.801180772531836e-06, + "loss": 0.2017, + "step": 6740 + }, + { + "epoch": 0.6986216188206031, + "grad_norm": 0.6226168274879456, + "learning_rate": 8.79561903227724e-06, + "loss": 0.2076, + "step": 6741 + }, + { + "epoch": 0.6987252565032646, + "grad_norm": 0.567448079586029, + "learning_rate": 8.790058554514868e-06, + "loss": 0.1658, + "step": 6742 + }, + { + "epoch": 0.698828894185926, + "grad_norm": 0.7417593598365784, + "learning_rate": 8.78449933987128e-06, + "loss": 0.2276, + "step": 6743 + }, + { + "epoch": 0.6989325318685874, + "grad_norm": 0.6055402755737305, + "learning_rate": 8.778941388972861e-06, + "loss": 0.2017, + "step": 6744 + }, + { + "epoch": 0.6990361695512488, + "grad_norm": 0.56300288438797, + "learning_rate": 8.773384702445893e-06, + "loss": 0.1638, + "step": 6745 + }, + { + "epoch": 0.6991398072339102, + "grad_norm": 0.6405076384544373, + "learning_rate": 8.767829280916485e-06, + "loss": 0.2077, + "step": 6746 + }, + { + "epoch": 0.6992434449165716, + "grad_norm": 0.6768667697906494, + "learning_rate": 8.762275125010602e-06, + "loss": 0.2175, + "step": 6747 + }, + { + "epoch": 0.6993470825992331, + "grad_norm": 0.6095232367515564, + "learning_rate": 8.756722235354099e-06, + "loss": 0.2166, + "step": 6748 + }, + { + "epoch": 0.6994507202818945, + "grad_norm": 0.6138764023780823, + "learning_rate": 8.751170612572648e-06, + "loss": 0.1859, + "step": 6749 + }, + { + "epoch": 0.6995543579645559, + "grad_norm": 0.7089101076126099, + "learning_rate": 8.745620257291805e-06, + "loss": 0.2385, + "step": 6750 + }, + { + "epoch": 0.6996579956472173, + "grad_norm": 0.599244236946106, + "learning_rate": 8.740071170136986e-06, + "loss": 0.1867, + "step": 6751 + }, + { + "epoch": 0.6997616333298787, + "grad_norm": 0.7416609525680542, + "learning_rate": 8.734523351733442e-06, + "loss": 0.2266, + "step": 6752 + }, + { + "epoch": 0.6998652710125401, + "grad_norm": 0.5874851942062378, + "learning_rate": 8.728976802706293e-06, + "loss": 0.1634, + "step": 6753 + }, + { + "epoch": 0.6999689086952016, + "grad_norm": 0.6908164024353027, + "learning_rate": 8.72343152368051e-06, + "loss": 0.2111, + "step": 6754 + }, + { + "epoch": 0.700072546377863, + "grad_norm": 0.5676965713500977, + "learning_rate": 8.717887515280934e-06, + "loss": 0.1673, + "step": 6755 + }, + { + "epoch": 0.7001761840605244, + "grad_norm": 0.745006263256073, + "learning_rate": 8.712344778132262e-06, + "loss": 0.2086, + "step": 6756 + }, + { + "epoch": 0.7002798217431858, + "grad_norm": 0.6484946608543396, + "learning_rate": 8.706803312859025e-06, + "loss": 0.2198, + "step": 6757 + }, + { + "epoch": 0.7003834594258472, + "grad_norm": 0.6044569611549377, + "learning_rate": 8.701263120085643e-06, + "loss": 0.1877, + "step": 6758 + }, + { + "epoch": 0.7004870971085086, + "grad_norm": 0.6353676319122314, + "learning_rate": 8.695724200436369e-06, + "loss": 0.2428, + "step": 6759 + }, + { + "epoch": 0.7005907347911701, + "grad_norm": 0.6398935317993164, + "learning_rate": 8.690186554535312e-06, + "loss": 0.1975, + "step": 6760 + }, + { + "epoch": 0.7006943724738315, + "grad_norm": 0.5691301822662354, + "learning_rate": 8.684650183006457e-06, + "loss": 0.1759, + "step": 6761 + }, + { + "epoch": 0.7007980101564929, + "grad_norm": 0.6124591827392578, + "learning_rate": 8.679115086473625e-06, + "loss": 0.1891, + "step": 6762 + }, + { + "epoch": 0.7009016478391543, + "grad_norm": 0.6126314401626587, + "learning_rate": 8.673581265560513e-06, + "loss": 0.1912, + "step": 6763 + }, + { + "epoch": 0.7010052855218157, + "grad_norm": 0.6804085373878479, + "learning_rate": 8.66804872089065e-06, + "loss": 0.1997, + "step": 6764 + }, + { + "epoch": 0.7011089232044772, + "grad_norm": 0.5831896066665649, + "learning_rate": 8.662517453087446e-06, + "loss": 0.2051, + "step": 6765 + }, + { + "epoch": 0.7012125608871386, + "grad_norm": 0.6971094608306885, + "learning_rate": 8.656987462774153e-06, + "loss": 0.2093, + "step": 6766 + }, + { + "epoch": 0.7013161985698, + "grad_norm": 0.593682050704956, + "learning_rate": 8.651458750573874e-06, + "loss": 0.172, + "step": 6767 + }, + { + "epoch": 0.7014198362524614, + "grad_norm": 0.6307997107505798, + "learning_rate": 8.645931317109585e-06, + "loss": 0.1916, + "step": 6768 + }, + { + "epoch": 0.7015234739351228, + "grad_norm": 0.7226161956787109, + "learning_rate": 8.6404051630041e-06, + "loss": 0.215, + "step": 6769 + }, + { + "epoch": 0.7016271116177842, + "grad_norm": 0.583858072757721, + "learning_rate": 8.634880288880102e-06, + "loss": 0.1554, + "step": 6770 + }, + { + "epoch": 0.7017307493004457, + "grad_norm": 0.6536171436309814, + "learning_rate": 8.62935669536014e-06, + "loss": 0.22, + "step": 6771 + }, + { + "epoch": 0.7018343869831071, + "grad_norm": 0.6236019730567932, + "learning_rate": 8.623834383066576e-06, + "loss": 0.1963, + "step": 6772 + }, + { + "epoch": 0.7019380246657685, + "grad_norm": 0.6093345880508423, + "learning_rate": 8.618313352621675e-06, + "loss": 0.1875, + "step": 6773 + }, + { + "epoch": 0.7020416623484299, + "grad_norm": 0.6570085287094116, + "learning_rate": 8.612793604647525e-06, + "loss": 0.1845, + "step": 6774 + }, + { + "epoch": 0.7021453000310913, + "grad_norm": 0.5571804642677307, + "learning_rate": 8.607275139766089e-06, + "loss": 0.1607, + "step": 6775 + }, + { + "epoch": 0.7022489377137527, + "grad_norm": 0.5573575496673584, + "learning_rate": 8.60175795859919e-06, + "loss": 0.1622, + "step": 6776 + }, + { + "epoch": 0.7023525753964142, + "grad_norm": 0.5488677024841309, + "learning_rate": 8.596242061768482e-06, + "loss": 0.1491, + "step": 6777 + }, + { + "epoch": 0.7024562130790756, + "grad_norm": 0.6735754013061523, + "learning_rate": 8.590727449895495e-06, + "loss": 0.1831, + "step": 6778 + }, + { + "epoch": 0.702559850761737, + "grad_norm": 0.5725656151771545, + "learning_rate": 8.585214123601593e-06, + "loss": 0.1875, + "step": 6779 + }, + { + "epoch": 0.7026634884443984, + "grad_norm": 0.6627886295318604, + "learning_rate": 8.579702083508018e-06, + "loss": 0.2066, + "step": 6780 + }, + { + "epoch": 0.7027671261270598, + "grad_norm": 0.5476593375205994, + "learning_rate": 8.574191330235868e-06, + "loss": 0.1565, + "step": 6781 + }, + { + "epoch": 0.7028707638097212, + "grad_norm": 0.6509799957275391, + "learning_rate": 8.56868186440607e-06, + "loss": 0.1828, + "step": 6782 + }, + { + "epoch": 0.7029744014923827, + "grad_norm": 0.6372617483139038, + "learning_rate": 8.563173686639436e-06, + "loss": 0.2039, + "step": 6783 + }, + { + "epoch": 0.7030780391750441, + "grad_norm": 0.6228369474411011, + "learning_rate": 8.557666797556612e-06, + "loss": 0.2162, + "step": 6784 + }, + { + "epoch": 0.7031816768577055, + "grad_norm": 0.5669928789138794, + "learning_rate": 8.5521611977781e-06, + "loss": 0.1807, + "step": 6785 + }, + { + "epoch": 0.7032853145403669, + "grad_norm": 1.0194318294525146, + "learning_rate": 8.546656887924275e-06, + "loss": 0.1848, + "step": 6786 + }, + { + "epoch": 0.7033889522230283, + "grad_norm": 0.675552487373352, + "learning_rate": 8.541153868615337e-06, + "loss": 0.198, + "step": 6787 + }, + { + "epoch": 0.7034925899056897, + "grad_norm": 0.6050292253494263, + "learning_rate": 8.535652140471377e-06, + "loss": 0.1967, + "step": 6788 + }, + { + "epoch": 0.7035962275883512, + "grad_norm": 0.6931806206703186, + "learning_rate": 8.530151704112307e-06, + "loss": 0.2266, + "step": 6789 + }, + { + "epoch": 0.7036998652710126, + "grad_norm": 0.6126047372817993, + "learning_rate": 8.524652560157918e-06, + "loss": 0.1797, + "step": 6790 + }, + { + "epoch": 0.703803502953674, + "grad_norm": 0.6635711193084717, + "learning_rate": 8.51915470922784e-06, + "loss": 0.2182, + "step": 6791 + }, + { + "epoch": 0.7039071406363354, + "grad_norm": 0.5589820742607117, + "learning_rate": 8.513658151941552e-06, + "loss": 0.1763, + "step": 6792 + }, + { + "epoch": 0.7040107783189968, + "grad_norm": 0.6231314539909363, + "learning_rate": 8.508162888918419e-06, + "loss": 0.1962, + "step": 6793 + }, + { + "epoch": 0.7041144160016582, + "grad_norm": 0.6989869475364685, + "learning_rate": 8.50266892077762e-06, + "loss": 0.2078, + "step": 6794 + }, + { + "epoch": 0.7042180536843197, + "grad_norm": 0.6447907090187073, + "learning_rate": 8.497176248138212e-06, + "loss": 0.1918, + "step": 6795 + }, + { + "epoch": 0.7043216913669811, + "grad_norm": 0.6743923425674438, + "learning_rate": 8.49168487161911e-06, + "loss": 0.2248, + "step": 6796 + }, + { + "epoch": 0.7044253290496425, + "grad_norm": 0.7061793208122253, + "learning_rate": 8.486194791839068e-06, + "loss": 0.2346, + "step": 6797 + }, + { + "epoch": 0.7045289667323039, + "grad_norm": 0.5488268136978149, + "learning_rate": 8.480706009416697e-06, + "loss": 0.182, + "step": 6798 + }, + { + "epoch": 0.7046326044149653, + "grad_norm": 0.5685331225395203, + "learning_rate": 8.475218524970459e-06, + "loss": 0.2149, + "step": 6799 + }, + { + "epoch": 0.7047362420976266, + "grad_norm": 0.8025184273719788, + "learning_rate": 8.469732339118684e-06, + "loss": 0.2606, + "step": 6800 + }, + { + "epoch": 0.7048398797802881, + "grad_norm": 0.6012006998062134, + "learning_rate": 8.46424745247955e-06, + "loss": 0.1907, + "step": 6801 + }, + { + "epoch": 0.7049435174629495, + "grad_norm": 0.6444013714790344, + "learning_rate": 8.45876386567108e-06, + "loss": 0.1938, + "step": 6802 + }, + { + "epoch": 0.7050471551456109, + "grad_norm": 0.6376757025718689, + "learning_rate": 8.453281579311156e-06, + "loss": 0.203, + "step": 6803 + }, + { + "epoch": 0.7051507928282723, + "grad_norm": 0.7025614976882935, + "learning_rate": 8.44780059401751e-06, + "loss": 0.2266, + "step": 6804 + }, + { + "epoch": 0.7052544305109337, + "grad_norm": 0.6612352728843689, + "learning_rate": 8.44232091040773e-06, + "loss": 0.2203, + "step": 6805 + }, + { + "epoch": 0.7053580681935951, + "grad_norm": 0.5750066637992859, + "learning_rate": 8.436842529099275e-06, + "loss": 0.178, + "step": 6806 + }, + { + "epoch": 0.7054617058762566, + "grad_norm": 0.4683450162410736, + "learning_rate": 8.431365450709419e-06, + "loss": 0.1454, + "step": 6807 + }, + { + "epoch": 0.705565343558918, + "grad_norm": 0.5653798580169678, + "learning_rate": 8.425889675855327e-06, + "loss": 0.1553, + "step": 6808 + }, + { + "epoch": 0.7056689812415794, + "grad_norm": 0.5899336338043213, + "learning_rate": 8.420415205153996e-06, + "loss": 0.1706, + "step": 6809 + }, + { + "epoch": 0.7057726189242408, + "grad_norm": 0.6194468140602112, + "learning_rate": 8.414942039222268e-06, + "loss": 0.1942, + "step": 6810 + }, + { + "epoch": 0.7058762566069022, + "grad_norm": 0.6131260395050049, + "learning_rate": 8.409470178676873e-06, + "loss": 0.1838, + "step": 6811 + }, + { + "epoch": 0.7059798942895636, + "grad_norm": 0.6971158385276794, + "learning_rate": 8.403999624134352e-06, + "loss": 0.2318, + "step": 6812 + }, + { + "epoch": 0.7060835319722251, + "grad_norm": 0.5297203063964844, + "learning_rate": 8.398530376211133e-06, + "loss": 0.1457, + "step": 6813 + }, + { + "epoch": 0.7061871696548865, + "grad_norm": 0.6779531836509705, + "learning_rate": 8.393062435523471e-06, + "loss": 0.2145, + "step": 6814 + }, + { + "epoch": 0.7062908073375479, + "grad_norm": 0.6087656021118164, + "learning_rate": 8.387595802687497e-06, + "loss": 0.1806, + "step": 6815 + }, + { + "epoch": 0.7063944450202093, + "grad_norm": 0.5969190001487732, + "learning_rate": 8.382130478319174e-06, + "loss": 0.1882, + "step": 6816 + }, + { + "epoch": 0.7064980827028707, + "grad_norm": 0.6744171977043152, + "learning_rate": 8.376666463034324e-06, + "loss": 0.2035, + "step": 6817 + }, + { + "epoch": 0.7066017203855322, + "grad_norm": 0.6547932028770447, + "learning_rate": 8.371203757448634e-06, + "loss": 0.1902, + "step": 6818 + }, + { + "epoch": 0.7067053580681936, + "grad_norm": 0.49832016229629517, + "learning_rate": 8.36574236217762e-06, + "loss": 0.1507, + "step": 6819 + }, + { + "epoch": 0.706808995750855, + "grad_norm": 0.5831357836723328, + "learning_rate": 8.36028227783667e-06, + "loss": 0.1786, + "step": 6820 + }, + { + "epoch": 0.7069126334335164, + "grad_norm": 0.6416530013084412, + "learning_rate": 8.354823505041028e-06, + "loss": 0.2137, + "step": 6821 + }, + { + "epoch": 0.7070162711161778, + "grad_norm": 0.6294683218002319, + "learning_rate": 8.349366044405769e-06, + "loss": 0.2052, + "step": 6822 + }, + { + "epoch": 0.7071199087988392, + "grad_norm": 0.6354984045028687, + "learning_rate": 8.343909896545827e-06, + "loss": 0.1817, + "step": 6823 + }, + { + "epoch": 0.7072235464815007, + "grad_norm": 0.5182710289955139, + "learning_rate": 8.338455062076006e-06, + "loss": 0.1478, + "step": 6824 + }, + { + "epoch": 0.7073271841641621, + "grad_norm": 0.6225818991661072, + "learning_rate": 8.33300154161093e-06, + "loss": 0.2003, + "step": 6825 + }, + { + "epoch": 0.7074308218468235, + "grad_norm": 0.4340483844280243, + "learning_rate": 8.327549335765112e-06, + "loss": 0.131, + "step": 6826 + }, + { + "epoch": 0.7075344595294849, + "grad_norm": 0.7447500228881836, + "learning_rate": 8.322098445152884e-06, + "loss": 0.2315, + "step": 6827 + }, + { + "epoch": 0.7076380972121463, + "grad_norm": 0.6974065899848938, + "learning_rate": 8.316648870388455e-06, + "loss": 0.1915, + "step": 6828 + }, + { + "epoch": 0.7077417348948077, + "grad_norm": 0.5116464495658875, + "learning_rate": 8.31120061208587e-06, + "loss": 0.143, + "step": 6829 + }, + { + "epoch": 0.7078453725774692, + "grad_norm": 0.5923205614089966, + "learning_rate": 8.305753670859023e-06, + "loss": 0.2009, + "step": 6830 + }, + { + "epoch": 0.7079490102601306, + "grad_norm": 0.6276652216911316, + "learning_rate": 8.300308047321679e-06, + "loss": 0.1875, + "step": 6831 + }, + { + "epoch": 0.708052647942792, + "grad_norm": 0.6970288753509521, + "learning_rate": 8.294863742087432e-06, + "loss": 0.2212, + "step": 6832 + }, + { + "epoch": 0.7081562856254534, + "grad_norm": 0.5120554566383362, + "learning_rate": 8.289420755769738e-06, + "loss": 0.148, + "step": 6833 + }, + { + "epoch": 0.7082599233081148, + "grad_norm": 0.6152527332305908, + "learning_rate": 8.283979088981929e-06, + "loss": 0.1847, + "step": 6834 + }, + { + "epoch": 0.7083635609907762, + "grad_norm": 0.662187933921814, + "learning_rate": 8.278538742337125e-06, + "loss": 0.2004, + "step": 6835 + }, + { + "epoch": 0.7084671986734377, + "grad_norm": 0.6494444012641907, + "learning_rate": 8.273099716448362e-06, + "loss": 0.2184, + "step": 6836 + }, + { + "epoch": 0.7085708363560991, + "grad_norm": 0.6606298089027405, + "learning_rate": 8.267662011928485e-06, + "loss": 0.195, + "step": 6837 + }, + { + "epoch": 0.7086744740387605, + "grad_norm": 0.6092416048049927, + "learning_rate": 8.262225629390217e-06, + "loss": 0.2103, + "step": 6838 + }, + { + "epoch": 0.7087781117214219, + "grad_norm": 0.5877014398574829, + "learning_rate": 8.256790569446123e-06, + "loss": 0.1852, + "step": 6839 + }, + { + "epoch": 0.7088817494040833, + "grad_norm": 0.6736811995506287, + "learning_rate": 8.251356832708615e-06, + "loss": 0.2091, + "step": 6840 + }, + { + "epoch": 0.7089853870867447, + "grad_norm": 0.6421195268630981, + "learning_rate": 8.245924419789953e-06, + "loss": 0.1842, + "step": 6841 + }, + { + "epoch": 0.7090890247694062, + "grad_norm": 0.6364426016807556, + "learning_rate": 8.240493331302249e-06, + "loss": 0.1877, + "step": 6842 + }, + { + "epoch": 0.7091926624520676, + "grad_norm": 0.6009476184844971, + "learning_rate": 8.235063567857475e-06, + "loss": 0.1594, + "step": 6843 + }, + { + "epoch": 0.709296300134729, + "grad_norm": 0.6266801953315735, + "learning_rate": 8.22963513006746e-06, + "loss": 0.2032, + "step": 6844 + }, + { + "epoch": 0.7093999378173904, + "grad_norm": 0.6310063004493713, + "learning_rate": 8.224208018543849e-06, + "loss": 0.2144, + "step": 6845 + }, + { + "epoch": 0.7095035755000518, + "grad_norm": 0.5849006175994873, + "learning_rate": 8.218782233898183e-06, + "loss": 0.2095, + "step": 6846 + }, + { + "epoch": 0.7096072131827132, + "grad_norm": 0.6484225988388062, + "learning_rate": 8.213357776741819e-06, + "loss": 0.1889, + "step": 6847 + }, + { + "epoch": 0.7097108508653747, + "grad_norm": 0.7030849456787109, + "learning_rate": 8.207934647685972e-06, + "loss": 0.2273, + "step": 6848 + }, + { + "epoch": 0.7098144885480361, + "grad_norm": 0.645413339138031, + "learning_rate": 8.202512847341724e-06, + "loss": 0.2285, + "step": 6849 + }, + { + "epoch": 0.7099181262306975, + "grad_norm": 0.5695993900299072, + "learning_rate": 8.197092376319984e-06, + "loss": 0.1934, + "step": 6850 + }, + { + "epoch": 0.7100217639133589, + "grad_norm": 0.6163286566734314, + "learning_rate": 8.191673235231532e-06, + "loss": 0.1681, + "step": 6851 + }, + { + "epoch": 0.7101254015960203, + "grad_norm": 0.5936559438705444, + "learning_rate": 8.186255424686975e-06, + "loss": 0.1889, + "step": 6852 + }, + { + "epoch": 0.7102290392786818, + "grad_norm": 0.6233243942260742, + "learning_rate": 8.180838945296803e-06, + "loss": 0.1745, + "step": 6853 + }, + { + "epoch": 0.7103326769613432, + "grad_norm": 0.6084573268890381, + "learning_rate": 8.175423797671322e-06, + "loss": 0.2057, + "step": 6854 + }, + { + "epoch": 0.7104363146440046, + "grad_norm": 0.5781868100166321, + "learning_rate": 8.170009982420699e-06, + "loss": 0.1724, + "step": 6855 + }, + { + "epoch": 0.710539952326666, + "grad_norm": 0.6366261839866638, + "learning_rate": 8.16459750015497e-06, + "loss": 0.1877, + "step": 6856 + }, + { + "epoch": 0.7106435900093274, + "grad_norm": 0.6529333591461182, + "learning_rate": 8.159186351483987e-06, + "loss": 0.2034, + "step": 6857 + }, + { + "epoch": 0.7107472276919888, + "grad_norm": 0.5903447866439819, + "learning_rate": 8.153776537017482e-06, + "loss": 0.2054, + "step": 6858 + }, + { + "epoch": 0.7108508653746503, + "grad_norm": 0.6202003359794617, + "learning_rate": 8.148368057365026e-06, + "loss": 0.1741, + "step": 6859 + }, + { + "epoch": 0.7109545030573117, + "grad_norm": 0.662723183631897, + "learning_rate": 8.142960913136036e-06, + "loss": 0.2178, + "step": 6860 + }, + { + "epoch": 0.7110581407399731, + "grad_norm": 0.6324710845947266, + "learning_rate": 8.137555104939776e-06, + "loss": 0.2129, + "step": 6861 + }, + { + "epoch": 0.7111617784226345, + "grad_norm": 0.5313311219215393, + "learning_rate": 8.132150633385359e-06, + "loss": 0.1504, + "step": 6862 + }, + { + "epoch": 0.7112654161052959, + "grad_norm": 0.6284851431846619, + "learning_rate": 8.12674749908176e-06, + "loss": 0.1995, + "step": 6863 + }, + { + "epoch": 0.7113690537879573, + "grad_norm": 0.8839542865753174, + "learning_rate": 8.121345702637804e-06, + "loss": 0.2332, + "step": 6864 + }, + { + "epoch": 0.7114726914706188, + "grad_norm": 0.6175948977470398, + "learning_rate": 8.115945244662148e-06, + "loss": 0.2128, + "step": 6865 + }, + { + "epoch": 0.7115763291532802, + "grad_norm": 0.6142639517784119, + "learning_rate": 8.110546125763305e-06, + "loss": 0.2062, + "step": 6866 + }, + { + "epoch": 0.7116799668359416, + "grad_norm": 0.7615811824798584, + "learning_rate": 8.105148346549638e-06, + "loss": 0.2388, + "step": 6867 + }, + { + "epoch": 0.711783604518603, + "grad_norm": 0.678072452545166, + "learning_rate": 8.099751907629363e-06, + "loss": 0.2006, + "step": 6868 + }, + { + "epoch": 0.7118872422012644, + "grad_norm": 0.7236624360084534, + "learning_rate": 8.094356809610554e-06, + "loss": 0.2101, + "step": 6869 + }, + { + "epoch": 0.7119908798839258, + "grad_norm": 0.5722697377204895, + "learning_rate": 8.0889630531011e-06, + "loss": 0.1766, + "step": 6870 + }, + { + "epoch": 0.7120945175665873, + "grad_norm": 0.6453374624252319, + "learning_rate": 8.083570638708785e-06, + "loss": 0.2015, + "step": 6871 + }, + { + "epoch": 0.7121981552492487, + "grad_norm": 0.717676043510437, + "learning_rate": 8.078179567041201e-06, + "loss": 0.2209, + "step": 6872 + }, + { + "epoch": 0.7123017929319101, + "grad_norm": 0.6053109169006348, + "learning_rate": 8.072789838705805e-06, + "loss": 0.1759, + "step": 6873 + }, + { + "epoch": 0.7124054306145715, + "grad_norm": 0.6611320376396179, + "learning_rate": 8.067401454309917e-06, + "loss": 0.21, + "step": 6874 + }, + { + "epoch": 0.7125090682972329, + "grad_norm": 0.6562949419021606, + "learning_rate": 8.062014414460677e-06, + "loss": 0.2028, + "step": 6875 + }, + { + "epoch": 0.7126127059798942, + "grad_norm": 0.6381715536117554, + "learning_rate": 8.056628719765103e-06, + "loss": 0.2144, + "step": 6876 + }, + { + "epoch": 0.7127163436625557, + "grad_norm": 0.5687152147293091, + "learning_rate": 8.051244370830029e-06, + "loss": 0.1689, + "step": 6877 + }, + { + "epoch": 0.7128199813452171, + "grad_norm": 0.6299018263816833, + "learning_rate": 8.045861368262172e-06, + "loss": 0.1954, + "step": 6878 + }, + { + "epoch": 0.7129236190278785, + "grad_norm": 0.6874618530273438, + "learning_rate": 8.040479712668071e-06, + "loss": 0.2333, + "step": 6879 + }, + { + "epoch": 0.7130272567105399, + "grad_norm": 0.5801635384559631, + "learning_rate": 8.035099404654122e-06, + "loss": 0.1564, + "step": 6880 + }, + { + "epoch": 0.7131308943932013, + "grad_norm": 0.6684380173683167, + "learning_rate": 8.029720444826576e-06, + "loss": 0.2082, + "step": 6881 + }, + { + "epoch": 0.7132345320758627, + "grad_norm": 0.6474950909614563, + "learning_rate": 8.024342833791517e-06, + "loss": 0.2106, + "step": 6882 + }, + { + "epoch": 0.7133381697585242, + "grad_norm": 0.6856906414031982, + "learning_rate": 8.018966572154889e-06, + "loss": 0.199, + "step": 6883 + }, + { + "epoch": 0.7134418074411856, + "grad_norm": 0.6510111689567566, + "learning_rate": 8.013591660522494e-06, + "loss": 0.2001, + "step": 6884 + }, + { + "epoch": 0.713545445123847, + "grad_norm": 0.5542777180671692, + "learning_rate": 8.008218099499952e-06, + "loss": 0.1561, + "step": 6885 + }, + { + "epoch": 0.7136490828065084, + "grad_norm": 0.6461411714553833, + "learning_rate": 8.002845889692756e-06, + "loss": 0.1928, + "step": 6886 + }, + { + "epoch": 0.7137527204891698, + "grad_norm": 0.6065050959587097, + "learning_rate": 7.997475031706228e-06, + "loss": 0.1815, + "step": 6887 + }, + { + "epoch": 0.7138563581718312, + "grad_norm": 0.5979995131492615, + "learning_rate": 7.992105526145555e-06, + "loss": 0.1972, + "step": 6888 + }, + { + "epoch": 0.7139599958544927, + "grad_norm": 0.6340582966804504, + "learning_rate": 7.98673737361577e-06, + "loss": 0.1953, + "step": 6889 + }, + { + "epoch": 0.7140636335371541, + "grad_norm": 0.614409327507019, + "learning_rate": 7.981370574721739e-06, + "loss": 0.1865, + "step": 6890 + }, + { + "epoch": 0.7141672712198155, + "grad_norm": 0.7286944389343262, + "learning_rate": 7.976005130068192e-06, + "loss": 0.2428, + "step": 6891 + }, + { + "epoch": 0.7142709089024769, + "grad_norm": 0.6045624017715454, + "learning_rate": 7.970641040259696e-06, + "loss": 0.2026, + "step": 6892 + }, + { + "epoch": 0.7143745465851383, + "grad_norm": 0.7129000425338745, + "learning_rate": 7.965278305900661e-06, + "loss": 0.2338, + "step": 6893 + }, + { + "epoch": 0.7144781842677997, + "grad_norm": 0.6371878981590271, + "learning_rate": 7.959916927595366e-06, + "loss": 0.1773, + "step": 6894 + }, + { + "epoch": 0.7145818219504612, + "grad_norm": 0.5102376341819763, + "learning_rate": 7.954556905947909e-06, + "loss": 0.1615, + "step": 6895 + }, + { + "epoch": 0.7146854596331226, + "grad_norm": 1.1365267038345337, + "learning_rate": 7.94919824156226e-06, + "loss": 0.2256, + "step": 6896 + }, + { + "epoch": 0.714789097315784, + "grad_norm": 0.6125771403312683, + "learning_rate": 7.94384093504222e-06, + "loss": 0.1686, + "step": 6897 + }, + { + "epoch": 0.7148927349984454, + "grad_norm": 0.7069416046142578, + "learning_rate": 7.938484986991435e-06, + "loss": 0.2137, + "step": 6898 + }, + { + "epoch": 0.7149963726811068, + "grad_norm": 0.7099876403808594, + "learning_rate": 7.933130398013419e-06, + "loss": 0.1804, + "step": 6899 + }, + { + "epoch": 0.7151000103637682, + "grad_norm": 0.5490723252296448, + "learning_rate": 7.927777168711503e-06, + "loss": 0.1791, + "step": 6900 + }, + { + "epoch": 0.7152036480464297, + "grad_norm": 0.6424699425697327, + "learning_rate": 7.922425299688895e-06, + "loss": 0.1828, + "step": 6901 + }, + { + "epoch": 0.7153072857290911, + "grad_norm": 0.6973537802696228, + "learning_rate": 7.917074791548625e-06, + "loss": 0.2145, + "step": 6902 + }, + { + "epoch": 0.7154109234117525, + "grad_norm": 0.6093841791152954, + "learning_rate": 7.91172564489359e-06, + "loss": 0.1884, + "step": 6903 + }, + { + "epoch": 0.7155145610944139, + "grad_norm": 0.6583380699157715, + "learning_rate": 7.90637786032652e-06, + "loss": 0.2057, + "step": 6904 + }, + { + "epoch": 0.7156181987770753, + "grad_norm": 0.6568527817726135, + "learning_rate": 7.901031438449982e-06, + "loss": 0.1879, + "step": 6905 + }, + { + "epoch": 0.7157218364597367, + "grad_norm": 0.6096395254135132, + "learning_rate": 7.895686379866423e-06, + "loss": 0.2051, + "step": 6906 + }, + { + "epoch": 0.7158254741423982, + "grad_norm": 0.5662840604782104, + "learning_rate": 7.890342685178098e-06, + "loss": 0.1826, + "step": 6907 + }, + { + "epoch": 0.7159291118250596, + "grad_norm": 0.6252114772796631, + "learning_rate": 7.885000354987136e-06, + "loss": 0.1981, + "step": 6908 + }, + { + "epoch": 0.716032749507721, + "grad_norm": 0.662355899810791, + "learning_rate": 7.879659389895506e-06, + "loss": 0.2038, + "step": 6909 + }, + { + "epoch": 0.7161363871903824, + "grad_norm": 0.6738420724868774, + "learning_rate": 7.874319790505016e-06, + "loss": 0.2336, + "step": 6910 + }, + { + "epoch": 0.7162400248730438, + "grad_norm": 0.5899892449378967, + "learning_rate": 7.86898155741732e-06, + "loss": 0.1656, + "step": 6911 + }, + { + "epoch": 0.7163436625557053, + "grad_norm": 0.6204176545143127, + "learning_rate": 7.863644691233921e-06, + "loss": 0.2036, + "step": 6912 + }, + { + "epoch": 0.7164473002383667, + "grad_norm": 0.5711419582366943, + "learning_rate": 7.858309192556168e-06, + "loss": 0.1938, + "step": 6913 + }, + { + "epoch": 0.7165509379210281, + "grad_norm": 0.7031733393669128, + "learning_rate": 7.852975061985269e-06, + "loss": 0.1909, + "step": 6914 + }, + { + "epoch": 0.7166545756036895, + "grad_norm": 0.669396162033081, + "learning_rate": 7.847642300122251e-06, + "loss": 0.1892, + "step": 6915 + }, + { + "epoch": 0.7167582132863509, + "grad_norm": 0.6639227271080017, + "learning_rate": 7.842310907568014e-06, + "loss": 0.1982, + "step": 6916 + }, + { + "epoch": 0.7168618509690123, + "grad_norm": 0.6344635486602783, + "learning_rate": 7.836980884923282e-06, + "loss": 0.1984, + "step": 6917 + }, + { + "epoch": 0.7169654886516738, + "grad_norm": 0.5545706152915955, + "learning_rate": 7.831652232788632e-06, + "loss": 0.1731, + "step": 6918 + }, + { + "epoch": 0.7170691263343352, + "grad_norm": 0.6619438529014587, + "learning_rate": 7.8263249517645e-06, + "loss": 0.1947, + "step": 6919 + }, + { + "epoch": 0.7171727640169966, + "grad_norm": 0.7299826145172119, + "learning_rate": 7.820999042451139e-06, + "loss": 0.2295, + "step": 6920 + }, + { + "epoch": 0.717276401699658, + "grad_norm": 0.5815308690071106, + "learning_rate": 7.81567450544868e-06, + "loss": 0.1882, + "step": 6921 + }, + { + "epoch": 0.7173800393823194, + "grad_norm": 0.6410979628562927, + "learning_rate": 7.81035134135707e-06, + "loss": 0.1999, + "step": 6922 + }, + { + "epoch": 0.7174836770649808, + "grad_norm": 0.7028884291648865, + "learning_rate": 7.805029550776128e-06, + "loss": 0.2004, + "step": 6923 + }, + { + "epoch": 0.7175873147476423, + "grad_norm": 0.6730342507362366, + "learning_rate": 7.799709134305502e-06, + "loss": 0.2102, + "step": 6924 + }, + { + "epoch": 0.7176909524303037, + "grad_norm": 0.6692425012588501, + "learning_rate": 7.794390092544674e-06, + "loss": 0.2261, + "step": 6925 + }, + { + "epoch": 0.7177945901129651, + "grad_norm": 0.6351549625396729, + "learning_rate": 7.789072426093007e-06, + "loss": 0.1822, + "step": 6926 + }, + { + "epoch": 0.7178982277956265, + "grad_norm": 0.6907915472984314, + "learning_rate": 7.783756135549669e-06, + "loss": 0.2574, + "step": 6927 + }, + { + "epoch": 0.7180018654782879, + "grad_norm": 0.5755670666694641, + "learning_rate": 7.778441221513704e-06, + "loss": 0.1686, + "step": 6928 + }, + { + "epoch": 0.7181055031609493, + "grad_norm": 0.6563580632209778, + "learning_rate": 7.773127684583985e-06, + "loss": 0.1904, + "step": 6929 + }, + { + "epoch": 0.7182091408436108, + "grad_norm": 0.6980963349342346, + "learning_rate": 7.767815525359224e-06, + "loss": 0.2022, + "step": 6930 + }, + { + "epoch": 0.7183127785262722, + "grad_norm": 0.6036818027496338, + "learning_rate": 7.762504744438002e-06, + "loss": 0.1904, + "step": 6931 + }, + { + "epoch": 0.7184164162089336, + "grad_norm": 0.6479178071022034, + "learning_rate": 7.757195342418716e-06, + "loss": 0.1899, + "step": 6932 + }, + { + "epoch": 0.718520053891595, + "grad_norm": 0.6769992113113403, + "learning_rate": 7.751887319899625e-06, + "loss": 0.2049, + "step": 6933 + }, + { + "epoch": 0.7186236915742564, + "grad_norm": 0.6470862030982971, + "learning_rate": 7.746580677478837e-06, + "loss": 0.206, + "step": 6934 + }, + { + "epoch": 0.7187273292569178, + "grad_norm": 0.6667788624763489, + "learning_rate": 7.74127541575429e-06, + "loss": 0.2166, + "step": 6935 + }, + { + "epoch": 0.7188309669395793, + "grad_norm": 0.7218932509422302, + "learning_rate": 7.735971535323775e-06, + "loss": 0.2256, + "step": 6936 + }, + { + "epoch": 0.7189346046222407, + "grad_norm": 0.6005328893661499, + "learning_rate": 7.730669036784915e-06, + "loss": 0.185, + "step": 6937 + }, + { + "epoch": 0.7190382423049021, + "grad_norm": 0.6119803786277771, + "learning_rate": 7.725367920735194e-06, + "loss": 0.1941, + "step": 6938 + }, + { + "epoch": 0.7191418799875635, + "grad_norm": 0.6969441771507263, + "learning_rate": 7.72006818777194e-06, + "loss": 0.2169, + "step": 6939 + }, + { + "epoch": 0.7192455176702249, + "grad_norm": 0.5583266615867615, + "learning_rate": 7.714769838492309e-06, + "loss": 0.1492, + "step": 6940 + }, + { + "epoch": 0.7193491553528863, + "grad_norm": 0.6057022213935852, + "learning_rate": 7.70947287349332e-06, + "loss": 0.1932, + "step": 6941 + }, + { + "epoch": 0.7194527930355478, + "grad_norm": 0.580872118473053, + "learning_rate": 7.704177293371822e-06, + "loss": 0.185, + "step": 6942 + }, + { + "epoch": 0.7195564307182092, + "grad_norm": 0.6409164071083069, + "learning_rate": 7.698883098724506e-06, + "loss": 0.1777, + "step": 6943 + }, + { + "epoch": 0.7196600684008706, + "grad_norm": 0.7127074599266052, + "learning_rate": 7.693590290147925e-06, + "loss": 0.2104, + "step": 6944 + }, + { + "epoch": 0.719763706083532, + "grad_norm": 0.5302404165267944, + "learning_rate": 7.688298868238454e-06, + "loss": 0.1679, + "step": 6945 + }, + { + "epoch": 0.7198673437661934, + "grad_norm": 0.5907058715820312, + "learning_rate": 7.683008833592336e-06, + "loss": 0.1893, + "step": 6946 + }, + { + "epoch": 0.7199709814488549, + "grad_norm": 0.6386611461639404, + "learning_rate": 7.677720186805626e-06, + "loss": 0.2356, + "step": 6947 + }, + { + "epoch": 0.7200746191315163, + "grad_norm": 0.7018064856529236, + "learning_rate": 7.672432928474258e-06, + "loss": 0.2337, + "step": 6948 + }, + { + "epoch": 0.7201782568141777, + "grad_norm": 0.6732597947120667, + "learning_rate": 7.667147059193984e-06, + "loss": 0.221, + "step": 6949 + }, + { + "epoch": 0.7202818944968391, + "grad_norm": 0.48719078302383423, + "learning_rate": 7.6618625795604e-06, + "loss": 0.1384, + "step": 6950 + }, + { + "epoch": 0.7203855321795005, + "grad_norm": 0.669743537902832, + "learning_rate": 7.656579490168967e-06, + "loss": 0.2023, + "step": 6951 + }, + { + "epoch": 0.7204891698621618, + "grad_norm": 0.7527252435684204, + "learning_rate": 7.651297791614964e-06, + "loss": 0.206, + "step": 6952 + }, + { + "epoch": 0.7205928075448232, + "grad_norm": 0.6796308159828186, + "learning_rate": 7.64601748449353e-06, + "loss": 0.195, + "step": 6953 + }, + { + "epoch": 0.7206964452274847, + "grad_norm": 0.6304457783699036, + "learning_rate": 7.640738569399645e-06, + "loss": 0.1813, + "step": 6954 + }, + { + "epoch": 0.7208000829101461, + "grad_norm": 0.5981799960136414, + "learning_rate": 7.635461046928127e-06, + "loss": 0.1854, + "step": 6955 + }, + { + "epoch": 0.7209037205928075, + "grad_norm": 0.6973567605018616, + "learning_rate": 7.630184917673638e-06, + "loss": 0.2322, + "step": 6956 + }, + { + "epoch": 0.7210073582754689, + "grad_norm": 0.6168457269668579, + "learning_rate": 7.624910182230674e-06, + "loss": 0.2044, + "step": 6957 + }, + { + "epoch": 0.7211109959581303, + "grad_norm": 0.7069376707077026, + "learning_rate": 7.619636841193594e-06, + "loss": 0.2197, + "step": 6958 + }, + { + "epoch": 0.7212146336407917, + "grad_norm": 0.6199985146522522, + "learning_rate": 7.614364895156597e-06, + "loss": 0.1872, + "step": 6959 + }, + { + "epoch": 0.7213182713234532, + "grad_norm": 0.633570671081543, + "learning_rate": 7.609094344713708e-06, + "loss": 0.1903, + "step": 6960 + }, + { + "epoch": 0.7214219090061146, + "grad_norm": 0.5866010189056396, + "learning_rate": 7.603825190458809e-06, + "loss": 0.1844, + "step": 6961 + }, + { + "epoch": 0.721525546688776, + "grad_norm": 0.6975796222686768, + "learning_rate": 7.598557432985607e-06, + "loss": 0.2257, + "step": 6962 + }, + { + "epoch": 0.7216291843714374, + "grad_norm": 0.575537383556366, + "learning_rate": 7.5932910728876766e-06, + "loss": 0.1636, + "step": 6963 + }, + { + "epoch": 0.7217328220540988, + "grad_norm": 0.5930981040000916, + "learning_rate": 7.588026110758428e-06, + "loss": 0.2089, + "step": 6964 + }, + { + "epoch": 0.7218364597367602, + "grad_norm": 0.5862255096435547, + "learning_rate": 7.5827625471910985e-06, + "loss": 0.2131, + "step": 6965 + }, + { + "epoch": 0.7219400974194217, + "grad_norm": 0.5902138948440552, + "learning_rate": 7.5775003827787864e-06, + "loss": 0.1991, + "step": 6966 + }, + { + "epoch": 0.7220437351020831, + "grad_norm": 0.6987681984901428, + "learning_rate": 7.5722396181144185e-06, + "loss": 0.2177, + "step": 6967 + }, + { + "epoch": 0.7221473727847445, + "grad_norm": 0.6114003658294678, + "learning_rate": 7.566980253790768e-06, + "loss": 0.1965, + "step": 6968 + }, + { + "epoch": 0.7222510104674059, + "grad_norm": 0.5686897039413452, + "learning_rate": 7.5617222904004595e-06, + "loss": 0.1715, + "step": 6969 + }, + { + "epoch": 0.7223546481500673, + "grad_norm": 0.6705672740936279, + "learning_rate": 7.55646572853594e-06, + "loss": 0.199, + "step": 6970 + }, + { + "epoch": 0.7224582858327288, + "grad_norm": 0.5653578042984009, + "learning_rate": 7.551210568789526e-06, + "loss": 0.1635, + "step": 6971 + }, + { + "epoch": 0.7225619235153902, + "grad_norm": 0.6964936256408691, + "learning_rate": 7.545956811753348e-06, + "loss": 0.191, + "step": 6972 + }, + { + "epoch": 0.7226655611980516, + "grad_norm": 0.7278429865837097, + "learning_rate": 7.540704458019401e-06, + "loss": 0.2348, + "step": 6973 + }, + { + "epoch": 0.722769198880713, + "grad_norm": 0.5915662050247192, + "learning_rate": 7.535453508179509e-06, + "loss": 0.2003, + "step": 6974 + }, + { + "epoch": 0.7228728365633744, + "grad_norm": 0.6096627712249756, + "learning_rate": 7.530203962825331e-06, + "loss": 0.179, + "step": 6975 + }, + { + "epoch": 0.7229764742460358, + "grad_norm": 0.5854135155677795, + "learning_rate": 7.5249558225483945e-06, + "loss": 0.1851, + "step": 6976 + }, + { + "epoch": 0.7230801119286973, + "grad_norm": 0.5830105543136597, + "learning_rate": 7.519709087940034e-06, + "loss": 0.1764, + "step": 6977 + }, + { + "epoch": 0.7231837496113587, + "grad_norm": 0.6320511102676392, + "learning_rate": 7.514463759591453e-06, + "loss": 0.1989, + "step": 6978 + }, + { + "epoch": 0.7232873872940201, + "grad_norm": 0.5508729219436646, + "learning_rate": 7.509219838093693e-06, + "loss": 0.1655, + "step": 6979 + }, + { + "epoch": 0.7233910249766815, + "grad_norm": 0.5589364767074585, + "learning_rate": 7.503977324037626e-06, + "loss": 0.1898, + "step": 6980 + }, + { + "epoch": 0.7234946626593429, + "grad_norm": 0.731992781162262, + "learning_rate": 7.4987362180139665e-06, + "loss": 0.2017, + "step": 6981 + }, + { + "epoch": 0.7235983003420043, + "grad_norm": 0.6651973724365234, + "learning_rate": 7.49349652061327e-06, + "loss": 0.1773, + "step": 6982 + }, + { + "epoch": 0.7237019380246658, + "grad_norm": 0.6765840649604797, + "learning_rate": 7.488258232425947e-06, + "loss": 0.2017, + "step": 6983 + }, + { + "epoch": 0.7238055757073272, + "grad_norm": 0.6248261332511902, + "learning_rate": 7.483021354042239e-06, + "loss": 0.1862, + "step": 6984 + }, + { + "epoch": 0.7239092133899886, + "grad_norm": 0.6525418162345886, + "learning_rate": 7.477785886052223e-06, + "loss": 0.1857, + "step": 6985 + }, + { + "epoch": 0.72401285107265, + "grad_norm": 0.6958816647529602, + "learning_rate": 7.472551829045833e-06, + "loss": 0.1915, + "step": 6986 + }, + { + "epoch": 0.7241164887553114, + "grad_norm": 0.624822735786438, + "learning_rate": 7.467319183612827e-06, + "loss": 0.1936, + "step": 6987 + }, + { + "epoch": 0.7242201264379728, + "grad_norm": 0.7277296781539917, + "learning_rate": 7.462087950342809e-06, + "loss": 0.2439, + "step": 6988 + }, + { + "epoch": 0.7243237641206343, + "grad_norm": 0.6706949472427368, + "learning_rate": 7.456858129825235e-06, + "loss": 0.224, + "step": 6989 + }, + { + "epoch": 0.7244274018032957, + "grad_norm": 0.5695304274559021, + "learning_rate": 7.45162972264938e-06, + "loss": 0.1608, + "step": 6990 + }, + { + "epoch": 0.7245310394859571, + "grad_norm": 0.7456833720207214, + "learning_rate": 7.446402729404392e-06, + "loss": 0.2379, + "step": 6991 + }, + { + "epoch": 0.7246346771686185, + "grad_norm": 0.6324352025985718, + "learning_rate": 7.441177150679226e-06, + "loss": 0.2016, + "step": 6992 + }, + { + "epoch": 0.7247383148512799, + "grad_norm": 0.6055063009262085, + "learning_rate": 7.435952987062691e-06, + "loss": 0.1911, + "step": 6993 + }, + { + "epoch": 0.7248419525339413, + "grad_norm": 0.6443060040473938, + "learning_rate": 7.430730239143449e-06, + "loss": 0.1936, + "step": 6994 + }, + { + "epoch": 0.7249455902166028, + "grad_norm": 0.6803500056266785, + "learning_rate": 7.425508907509975e-06, + "loss": 0.2094, + "step": 6995 + }, + { + "epoch": 0.7250492278992642, + "grad_norm": 0.6133655309677124, + "learning_rate": 7.420288992750619e-06, + "loss": 0.1851, + "step": 6996 + }, + { + "epoch": 0.7251528655819256, + "grad_norm": 0.6188987493515015, + "learning_rate": 7.415070495453536e-06, + "loss": 0.2265, + "step": 6997 + }, + { + "epoch": 0.725256503264587, + "grad_norm": 0.5657846927642822, + "learning_rate": 7.4098534162067535e-06, + "loss": 0.1798, + "step": 6998 + }, + { + "epoch": 0.7253601409472484, + "grad_norm": 0.6191845536231995, + "learning_rate": 7.404637755598116e-06, + "loss": 0.1956, + "step": 6999 + }, + { + "epoch": 0.7254637786299099, + "grad_norm": 0.5805138349533081, + "learning_rate": 7.399423514215309e-06, + "loss": 0.17, + "step": 7000 + }, + { + "epoch": 0.7255674163125713, + "grad_norm": 0.6225676536560059, + "learning_rate": 7.39421069264588e-06, + "loss": 0.2, + "step": 7001 + }, + { + "epoch": 0.7256710539952327, + "grad_norm": 0.6446588635444641, + "learning_rate": 7.388999291477186e-06, + "loss": 0.1967, + "step": 7002 + }, + { + "epoch": 0.7257746916778941, + "grad_norm": 0.67559814453125, + "learning_rate": 7.38378931129645e-06, + "loss": 0.1686, + "step": 7003 + }, + { + "epoch": 0.7258783293605555, + "grad_norm": 0.6291429400444031, + "learning_rate": 7.378580752690727e-06, + "loss": 0.1794, + "step": 7004 + }, + { + "epoch": 0.7259819670432169, + "grad_norm": 0.5955010056495667, + "learning_rate": 7.373373616246904e-06, + "loss": 0.2062, + "step": 7005 + }, + { + "epoch": 0.7260856047258784, + "grad_norm": 0.6775494813919067, + "learning_rate": 7.368167902551715e-06, + "loss": 0.2031, + "step": 7006 + }, + { + "epoch": 0.7261892424085398, + "grad_norm": 0.6713017821311951, + "learning_rate": 7.362963612191723e-06, + "loss": 0.1774, + "step": 7007 + }, + { + "epoch": 0.7262928800912012, + "grad_norm": 0.6503675580024719, + "learning_rate": 7.357760745753346e-06, + "loss": 0.2129, + "step": 7008 + }, + { + "epoch": 0.7263965177738626, + "grad_norm": 0.6224893927574158, + "learning_rate": 7.352559303822842e-06, + "loss": 0.1953, + "step": 7009 + }, + { + "epoch": 0.726500155456524, + "grad_norm": 0.5616812109947205, + "learning_rate": 7.347359286986289e-06, + "loss": 0.1639, + "step": 7010 + }, + { + "epoch": 0.7266037931391854, + "grad_norm": 0.6985262632369995, + "learning_rate": 7.342160695829627e-06, + "loss": 0.1931, + "step": 7011 + }, + { + "epoch": 0.7267074308218469, + "grad_norm": 0.6394574046134949, + "learning_rate": 7.336963530938623e-06, + "loss": 0.1868, + "step": 7012 + }, + { + "epoch": 0.7268110685045083, + "grad_norm": 0.6743451952934265, + "learning_rate": 7.331767792898878e-06, + "loss": 0.2028, + "step": 7013 + }, + { + "epoch": 0.7269147061871697, + "grad_norm": 0.614748477935791, + "learning_rate": 7.326573482295849e-06, + "loss": 0.167, + "step": 7014 + }, + { + "epoch": 0.7270183438698311, + "grad_norm": 0.6770737171173096, + "learning_rate": 7.321380599714813e-06, + "loss": 0.2017, + "step": 7015 + }, + { + "epoch": 0.7271219815524925, + "grad_norm": 0.5676266551017761, + "learning_rate": 7.3161891457409085e-06, + "loss": 0.1755, + "step": 7016 + }, + { + "epoch": 0.7272256192351539, + "grad_norm": 0.6676115393638611, + "learning_rate": 7.310999120959085e-06, + "loss": 0.1915, + "step": 7017 + }, + { + "epoch": 0.7273292569178154, + "grad_norm": 0.6348340511322021, + "learning_rate": 7.305810525954167e-06, + "loss": 0.1791, + "step": 7018 + }, + { + "epoch": 0.7274328946004768, + "grad_norm": 0.5726019144058228, + "learning_rate": 7.300623361310781e-06, + "loss": 0.1801, + "step": 7019 + }, + { + "epoch": 0.7275365322831382, + "grad_norm": 0.506493330001831, + "learning_rate": 7.295437627613407e-06, + "loss": 0.151, + "step": 7020 + }, + { + "epoch": 0.7276401699657996, + "grad_norm": 0.5711618661880493, + "learning_rate": 7.290253325446372e-06, + "loss": 0.169, + "step": 7021 + }, + { + "epoch": 0.727743807648461, + "grad_norm": 0.620705783367157, + "learning_rate": 7.2850704553938415e-06, + "loss": 0.1716, + "step": 7022 + }, + { + "epoch": 0.7278474453311224, + "grad_norm": 0.5504769086837769, + "learning_rate": 7.279889018039806e-06, + "loss": 0.156, + "step": 7023 + }, + { + "epoch": 0.7279510830137839, + "grad_norm": 0.6168975234031677, + "learning_rate": 7.274709013968102e-06, + "loss": 0.219, + "step": 7024 + }, + { + "epoch": 0.7280547206964453, + "grad_norm": 0.6001594066619873, + "learning_rate": 7.269530443762398e-06, + "loss": 0.1716, + "step": 7025 + }, + { + "epoch": 0.7281583583791067, + "grad_norm": 0.7288575172424316, + "learning_rate": 7.264353308006214e-06, + "loss": 0.2085, + "step": 7026 + }, + { + "epoch": 0.7282619960617681, + "grad_norm": 0.5633076429367065, + "learning_rate": 7.259177607282908e-06, + "loss": 0.1604, + "step": 7027 + }, + { + "epoch": 0.7283656337444294, + "grad_norm": 0.642590343952179, + "learning_rate": 7.254003342175658e-06, + "loss": 0.2107, + "step": 7028 + }, + { + "epoch": 0.7284692714270908, + "grad_norm": 0.6819926500320435, + "learning_rate": 7.2488305132675e-06, + "loss": 0.2183, + "step": 7029 + }, + { + "epoch": 0.7285729091097523, + "grad_norm": 0.7595263719558716, + "learning_rate": 7.2436591211412995e-06, + "loss": 0.2345, + "step": 7030 + }, + { + "epoch": 0.7286765467924137, + "grad_norm": 0.6355629563331604, + "learning_rate": 7.238489166379754e-06, + "loss": 0.186, + "step": 7031 + }, + { + "epoch": 0.7287801844750751, + "grad_norm": 0.6492575407028198, + "learning_rate": 7.233320649565416e-06, + "loss": 0.1813, + "step": 7032 + }, + { + "epoch": 0.7288838221577365, + "grad_norm": 0.6751068830490112, + "learning_rate": 7.228153571280656e-06, + "loss": 0.2214, + "step": 7033 + }, + { + "epoch": 0.7289874598403979, + "grad_norm": 0.6983469724655151, + "learning_rate": 7.222987932107704e-06, + "loss": 0.2329, + "step": 7034 + }, + { + "epoch": 0.7290910975230593, + "grad_norm": 0.6801983118057251, + "learning_rate": 7.217823732628602e-06, + "loss": 0.1896, + "step": 7035 + }, + { + "epoch": 0.7291947352057208, + "grad_norm": 0.7078520059585571, + "learning_rate": 7.212660973425258e-06, + "loss": 0.2178, + "step": 7036 + }, + { + "epoch": 0.7292983728883822, + "grad_norm": 0.7132784724235535, + "learning_rate": 7.207499655079398e-06, + "loss": 0.2036, + "step": 7037 + }, + { + "epoch": 0.7294020105710436, + "grad_norm": 0.718744158744812, + "learning_rate": 7.202339778172583e-06, + "loss": 0.201, + "step": 7038 + }, + { + "epoch": 0.729505648253705, + "grad_norm": 0.7349317669868469, + "learning_rate": 7.197181343286233e-06, + "loss": 0.2098, + "step": 7039 + }, + { + "epoch": 0.7296092859363664, + "grad_norm": 0.6733152270317078, + "learning_rate": 7.192024351001583e-06, + "loss": 0.1987, + "step": 7040 + }, + { + "epoch": 0.7297129236190278, + "grad_norm": 0.6271103024482727, + "learning_rate": 7.186868801899715e-06, + "loss": 0.1924, + "step": 7041 + }, + { + "epoch": 0.7298165613016893, + "grad_norm": 0.6496624946594238, + "learning_rate": 7.181714696561561e-06, + "loss": 0.2, + "step": 7042 + }, + { + "epoch": 0.7299201989843507, + "grad_norm": 0.5096017718315125, + "learning_rate": 7.176562035567869e-06, + "loss": 0.1585, + "step": 7043 + }, + { + "epoch": 0.7300238366670121, + "grad_norm": 0.5929197072982788, + "learning_rate": 7.171410819499234e-06, + "loss": 0.1789, + "step": 7044 + }, + { + "epoch": 0.7301274743496735, + "grad_norm": 0.6660709977149963, + "learning_rate": 7.166261048936076e-06, + "loss": 0.2199, + "step": 7045 + }, + { + "epoch": 0.7302311120323349, + "grad_norm": 0.6655041575431824, + "learning_rate": 7.161112724458672e-06, + "loss": 0.1909, + "step": 7046 + }, + { + "epoch": 0.7303347497149963, + "grad_norm": 0.6861178278923035, + "learning_rate": 7.155965846647137e-06, + "loss": 0.1877, + "step": 7047 + }, + { + "epoch": 0.7304383873976578, + "grad_norm": 0.7006655931472778, + "learning_rate": 7.150820416081396e-06, + "loss": 0.2136, + "step": 7048 + }, + { + "epoch": 0.7305420250803192, + "grad_norm": 0.6472157835960388, + "learning_rate": 7.145676433341242e-06, + "loss": 0.1865, + "step": 7049 + }, + { + "epoch": 0.7306456627629806, + "grad_norm": 0.5800744295120239, + "learning_rate": 7.140533899006286e-06, + "loss": 0.1847, + "step": 7050 + }, + { + "epoch": 0.730749300445642, + "grad_norm": 0.6264989972114563, + "learning_rate": 7.135392813655972e-06, + "loss": 0.1887, + "step": 7051 + }, + { + "epoch": 0.7308529381283034, + "grad_norm": 0.7023571729660034, + "learning_rate": 7.130253177869606e-06, + "loss": 0.2213, + "step": 7052 + }, + { + "epoch": 0.7309565758109648, + "grad_norm": 0.5341815948486328, + "learning_rate": 7.125114992226298e-06, + "loss": 0.1723, + "step": 7053 + }, + { + "epoch": 0.7310602134936263, + "grad_norm": 0.5917410850524902, + "learning_rate": 7.119978257305025e-06, + "loss": 0.1953, + "step": 7054 + }, + { + "epoch": 0.7311638511762877, + "grad_norm": 0.6590427756309509, + "learning_rate": 7.11484297368458e-06, + "loss": 0.1937, + "step": 7055 + }, + { + "epoch": 0.7312674888589491, + "grad_norm": 0.6602505445480347, + "learning_rate": 7.10970914194359e-06, + "loss": 0.1754, + "step": 7056 + }, + { + "epoch": 0.7313711265416105, + "grad_norm": 0.6671847701072693, + "learning_rate": 7.104576762660544e-06, + "loss": 0.1826, + "step": 7057 + }, + { + "epoch": 0.7314747642242719, + "grad_norm": 0.6499585509300232, + "learning_rate": 7.099445836413734e-06, + "loss": 0.1776, + "step": 7058 + }, + { + "epoch": 0.7315784019069334, + "grad_norm": 0.5618710517883301, + "learning_rate": 7.094316363781322e-06, + "loss": 0.1652, + "step": 7059 + }, + { + "epoch": 0.7316820395895948, + "grad_norm": 0.7040053606033325, + "learning_rate": 7.0891883453412715e-06, + "loss": 0.1812, + "step": 7060 + }, + { + "epoch": 0.7317856772722562, + "grad_norm": 0.7040082812309265, + "learning_rate": 7.084061781671414e-06, + "loss": 0.2095, + "step": 7061 + }, + { + "epoch": 0.7318893149549176, + "grad_norm": 0.5922701358795166, + "learning_rate": 7.078936673349397e-06, + "loss": 0.1669, + "step": 7062 + }, + { + "epoch": 0.731992952637579, + "grad_norm": 0.603987991809845, + "learning_rate": 7.073813020952702e-06, + "loss": 0.196, + "step": 7063 + }, + { + "epoch": 0.7320965903202404, + "grad_norm": 0.5776903033256531, + "learning_rate": 7.06869082505867e-06, + "loss": 0.1604, + "step": 7064 + }, + { + "epoch": 0.7322002280029019, + "grad_norm": 0.6143998503684998, + "learning_rate": 7.063570086244447e-06, + "loss": 0.1959, + "step": 7065 + }, + { + "epoch": 0.7323038656855633, + "grad_norm": 0.6320601105690002, + "learning_rate": 7.058450805087036e-06, + "loss": 0.2035, + "step": 7066 + }, + { + "epoch": 0.7324075033682247, + "grad_norm": 0.6088545918464661, + "learning_rate": 7.053332982163277e-06, + "loss": 0.1554, + "step": 7067 + }, + { + "epoch": 0.7325111410508861, + "grad_norm": 0.7791801691055298, + "learning_rate": 7.048216618049832e-06, + "loss": 0.2042, + "step": 7068 + }, + { + "epoch": 0.7326147787335475, + "grad_norm": 0.5860157608985901, + "learning_rate": 7.043101713323204e-06, + "loss": 0.174, + "step": 7069 + }, + { + "epoch": 0.7327184164162089, + "grad_norm": 0.5433716177940369, + "learning_rate": 7.037988268559726e-06, + "loss": 0.169, + "step": 7070 + }, + { + "epoch": 0.7328220540988704, + "grad_norm": 0.6011751294136047, + "learning_rate": 7.032876284335582e-06, + "loss": 0.1832, + "step": 7071 + }, + { + "epoch": 0.7329256917815318, + "grad_norm": 0.6410551071166992, + "learning_rate": 7.027765761226783e-06, + "loss": 0.1924, + "step": 7072 + }, + { + "epoch": 0.7330293294641932, + "grad_norm": 0.6368006467819214, + "learning_rate": 7.022656699809169e-06, + "loss": 0.2275, + "step": 7073 + }, + { + "epoch": 0.7331329671468546, + "grad_norm": 0.66558837890625, + "learning_rate": 7.017549100658432e-06, + "loss": 0.1969, + "step": 7074 + }, + { + "epoch": 0.733236604829516, + "grad_norm": 0.6814399361610413, + "learning_rate": 7.012442964350079e-06, + "loss": 0.1954, + "step": 7075 + }, + { + "epoch": 0.7333402425121774, + "grad_norm": 0.6535932421684265, + "learning_rate": 7.007338291459456e-06, + "loss": 0.2077, + "step": 7076 + }, + { + "epoch": 0.7334438801948389, + "grad_norm": 0.5728932619094849, + "learning_rate": 7.002235082561764e-06, + "loss": 0.1578, + "step": 7077 + }, + { + "epoch": 0.7335475178775003, + "grad_norm": 0.7541806101799011, + "learning_rate": 6.9971333382320115e-06, + "loss": 0.2565, + "step": 7078 + }, + { + "epoch": 0.7336511555601617, + "grad_norm": 0.6706312894821167, + "learning_rate": 6.992033059045067e-06, + "loss": 0.1976, + "step": 7079 + }, + { + "epoch": 0.7337547932428231, + "grad_norm": 0.6851133704185486, + "learning_rate": 6.986934245575609e-06, + "loss": 0.2066, + "step": 7080 + }, + { + "epoch": 0.7338584309254845, + "grad_norm": 0.6354007124900818, + "learning_rate": 6.98183689839818e-06, + "loss": 0.1962, + "step": 7081 + }, + { + "epoch": 0.733962068608146, + "grad_norm": 0.662876546382904, + "learning_rate": 6.97674101808713e-06, + "loss": 0.192, + "step": 7082 + }, + { + "epoch": 0.7340657062908074, + "grad_norm": 0.6014705300331116, + "learning_rate": 6.9716466052166505e-06, + "loss": 0.1826, + "step": 7083 + }, + { + "epoch": 0.7341693439734688, + "grad_norm": 0.7361433506011963, + "learning_rate": 6.9665536603607864e-06, + "loss": 0.2282, + "step": 7084 + }, + { + "epoch": 0.7342729816561302, + "grad_norm": 0.631420373916626, + "learning_rate": 6.961462184093388e-06, + "loss": 0.1959, + "step": 7085 + }, + { + "epoch": 0.7343766193387916, + "grad_norm": 0.6514199376106262, + "learning_rate": 6.956372176988169e-06, + "loss": 0.1865, + "step": 7086 + }, + { + "epoch": 0.734480257021453, + "grad_norm": 0.5663927793502808, + "learning_rate": 6.951283639618654e-06, + "loss": 0.1761, + "step": 7087 + }, + { + "epoch": 0.7345838947041144, + "grad_norm": 0.789225697517395, + "learning_rate": 6.946196572558208e-06, + "loss": 0.2245, + "step": 7088 + }, + { + "epoch": 0.7346875323867759, + "grad_norm": 0.716204047203064, + "learning_rate": 6.941110976380048e-06, + "loss": 0.2109, + "step": 7089 + }, + { + "epoch": 0.7347911700694373, + "grad_norm": 0.626390814781189, + "learning_rate": 6.936026851657196e-06, + "loss": 0.2, + "step": 7090 + }, + { + "epoch": 0.7348948077520987, + "grad_norm": 0.558161199092865, + "learning_rate": 6.930944198962528e-06, + "loss": 0.1667, + "step": 7091 + }, + { + "epoch": 0.7349984454347601, + "grad_norm": 0.7186371088027954, + "learning_rate": 6.925863018868759e-06, + "loss": 0.1815, + "step": 7092 + }, + { + "epoch": 0.7351020831174215, + "grad_norm": 0.6502755880355835, + "learning_rate": 6.920783311948423e-06, + "loss": 0.2085, + "step": 7093 + }, + { + "epoch": 0.735205720800083, + "grad_norm": 0.6628555059432983, + "learning_rate": 6.915705078773889e-06, + "loss": 0.199, + "step": 7094 + }, + { + "epoch": 0.7353093584827444, + "grad_norm": 0.6222200989723206, + "learning_rate": 6.910628319917361e-06, + "loss": 0.1827, + "step": 7095 + }, + { + "epoch": 0.7354129961654058, + "grad_norm": 0.6655079126358032, + "learning_rate": 6.905553035950885e-06, + "loss": 0.2189, + "step": 7096 + }, + { + "epoch": 0.7355166338480672, + "grad_norm": 0.7366286516189575, + "learning_rate": 6.9004792274463436e-06, + "loss": 0.2256, + "step": 7097 + }, + { + "epoch": 0.7356202715307286, + "grad_norm": 0.6794492602348328, + "learning_rate": 6.895406894975434e-06, + "loss": 0.2237, + "step": 7098 + }, + { + "epoch": 0.73572390921339, + "grad_norm": 0.6098574995994568, + "learning_rate": 6.890336039109711e-06, + "loss": 0.1884, + "step": 7099 + }, + { + "epoch": 0.7358275468960515, + "grad_norm": 0.6938572525978088, + "learning_rate": 6.885266660420542e-06, + "loss": 0.2077, + "step": 7100 + }, + { + "epoch": 0.7359311845787129, + "grad_norm": 0.6158689856529236, + "learning_rate": 6.880198759479133e-06, + "loss": 0.1525, + "step": 7101 + }, + { + "epoch": 0.7360348222613743, + "grad_norm": 0.721074104309082, + "learning_rate": 6.87513233685654e-06, + "loss": 0.2077, + "step": 7102 + }, + { + "epoch": 0.7361384599440357, + "grad_norm": 0.6900755167007446, + "learning_rate": 6.870067393123625e-06, + "loss": 0.19, + "step": 7103 + }, + { + "epoch": 0.736242097626697, + "grad_norm": 0.6101871728897095, + "learning_rate": 6.865003928851111e-06, + "loss": 0.1671, + "step": 7104 + }, + { + "epoch": 0.7363457353093584, + "grad_norm": 0.8352978229522705, + "learning_rate": 6.85994194460953e-06, + "loss": 0.2456, + "step": 7105 + }, + { + "epoch": 0.7364493729920198, + "grad_norm": 0.6034935712814331, + "learning_rate": 6.85488144096927e-06, + "loss": 0.2044, + "step": 7106 + }, + { + "epoch": 0.7365530106746813, + "grad_norm": 0.698615550994873, + "learning_rate": 6.849822418500534e-06, + "loss": 0.2079, + "step": 7107 + }, + { + "epoch": 0.7366566483573427, + "grad_norm": 0.6203604340553284, + "learning_rate": 6.84476487777336e-06, + "loss": 0.1676, + "step": 7108 + }, + { + "epoch": 0.7367602860400041, + "grad_norm": 0.7435559034347534, + "learning_rate": 6.839708819357636e-06, + "loss": 0.2041, + "step": 7109 + }, + { + "epoch": 0.7368639237226655, + "grad_norm": 0.6135737895965576, + "learning_rate": 6.834654243823058e-06, + "loss": 0.1708, + "step": 7110 + }, + { + "epoch": 0.7369675614053269, + "grad_norm": 0.6980074644088745, + "learning_rate": 6.829601151739174e-06, + "loss": 0.2352, + "step": 7111 + }, + { + "epoch": 0.7370711990879883, + "grad_norm": 0.6273593902587891, + "learning_rate": 6.8245495436753625e-06, + "loss": 0.1904, + "step": 7112 + }, + { + "epoch": 0.7371748367706498, + "grad_norm": 0.6526858806610107, + "learning_rate": 6.81949942020083e-06, + "loss": 0.1946, + "step": 7113 + }, + { + "epoch": 0.7372784744533112, + "grad_norm": 0.6693414449691772, + "learning_rate": 6.814450781884611e-06, + "loss": 0.1951, + "step": 7114 + }, + { + "epoch": 0.7373821121359726, + "grad_norm": 0.8420910239219666, + "learning_rate": 6.809403629295575e-06, + "loss": 0.2187, + "step": 7115 + }, + { + "epoch": 0.737485749818634, + "grad_norm": 0.8213813304901123, + "learning_rate": 6.804357963002432e-06, + "loss": 0.2387, + "step": 7116 + }, + { + "epoch": 0.7375893875012954, + "grad_norm": 0.7364427447319031, + "learning_rate": 6.7993137835737284e-06, + "loss": 0.2172, + "step": 7117 + }, + { + "epoch": 0.7376930251839569, + "grad_norm": 0.6254062652587891, + "learning_rate": 6.794271091577826e-06, + "loss": 0.1975, + "step": 7118 + }, + { + "epoch": 0.7377966628666183, + "grad_norm": 0.5966928005218506, + "learning_rate": 6.789229887582931e-06, + "loss": 0.171, + "step": 7119 + }, + { + "epoch": 0.7379003005492797, + "grad_norm": 0.6676751971244812, + "learning_rate": 6.784190172157066e-06, + "loss": 0.1861, + "step": 7120 + }, + { + "epoch": 0.7380039382319411, + "grad_norm": 0.7175436615943909, + "learning_rate": 6.7791519458681096e-06, + "loss": 0.2538, + "step": 7121 + }, + { + "epoch": 0.7381075759146025, + "grad_norm": 0.6358978748321533, + "learning_rate": 6.774115209283765e-06, + "loss": 0.2022, + "step": 7122 + }, + { + "epoch": 0.7382112135972639, + "grad_norm": 0.5605407357215881, + "learning_rate": 6.769079962971552e-06, + "loss": 0.1718, + "step": 7123 + }, + { + "epoch": 0.7383148512799254, + "grad_norm": 0.568594753742218, + "learning_rate": 6.764046207498849e-06, + "loss": 0.1717, + "step": 7124 + }, + { + "epoch": 0.7384184889625868, + "grad_norm": 0.5866584181785583, + "learning_rate": 6.759013943432844e-06, + "loss": 0.1749, + "step": 7125 + }, + { + "epoch": 0.7385221266452482, + "grad_norm": 0.6093940734863281, + "learning_rate": 6.7539831713405565e-06, + "loss": 0.1746, + "step": 7126 + }, + { + "epoch": 0.7386257643279096, + "grad_norm": 0.6412978768348694, + "learning_rate": 6.748953891788861e-06, + "loss": 0.1921, + "step": 7127 + }, + { + "epoch": 0.738729402010571, + "grad_norm": 0.6676449179649353, + "learning_rate": 6.743926105344434e-06, + "loss": 0.1936, + "step": 7128 + }, + { + "epoch": 0.7388330396932324, + "grad_norm": 0.7243309617042542, + "learning_rate": 6.738899812573814e-06, + "loss": 0.2387, + "step": 7129 + }, + { + "epoch": 0.7389366773758939, + "grad_norm": 0.6230798959732056, + "learning_rate": 6.73387501404334e-06, + "loss": 0.1818, + "step": 7130 + }, + { + "epoch": 0.7390403150585553, + "grad_norm": 0.6703595519065857, + "learning_rate": 6.7288517103192175e-06, + "loss": 0.1998, + "step": 7131 + }, + { + "epoch": 0.7391439527412167, + "grad_norm": 0.6641132235527039, + "learning_rate": 6.723829901967451e-06, + "loss": 0.1881, + "step": 7132 + }, + { + "epoch": 0.7392475904238781, + "grad_norm": 0.65935218334198, + "learning_rate": 6.718809589553885e-06, + "loss": 0.2021, + "step": 7133 + }, + { + "epoch": 0.7393512281065395, + "grad_norm": 0.6337850093841553, + "learning_rate": 6.713790773644218e-06, + "loss": 0.2061, + "step": 7134 + }, + { + "epoch": 0.7394548657892009, + "grad_norm": 0.8226786255836487, + "learning_rate": 6.708773454803945e-06, + "loss": 0.2009, + "step": 7135 + }, + { + "epoch": 0.7395585034718624, + "grad_norm": 0.7300819158554077, + "learning_rate": 6.703757633598418e-06, + "loss": 0.2217, + "step": 7136 + }, + { + "epoch": 0.7396621411545238, + "grad_norm": 0.586715817451477, + "learning_rate": 6.698743310592817e-06, + "loss": 0.1743, + "step": 7137 + }, + { + "epoch": 0.7397657788371852, + "grad_norm": 0.6823568940162659, + "learning_rate": 6.693730486352144e-06, + "loss": 0.1799, + "step": 7138 + }, + { + "epoch": 0.7398694165198466, + "grad_norm": 0.598688542842865, + "learning_rate": 6.6887191614412325e-06, + "loss": 0.1852, + "step": 7139 + }, + { + "epoch": 0.739973054202508, + "grad_norm": 0.6224052906036377, + "learning_rate": 6.683709336424748e-06, + "loss": 0.1757, + "step": 7140 + }, + { + "epoch": 0.7400766918851694, + "grad_norm": 0.7828407287597656, + "learning_rate": 6.6787010118671945e-06, + "loss": 0.2367, + "step": 7141 + }, + { + "epoch": 0.7401803295678309, + "grad_norm": 0.595071017742157, + "learning_rate": 6.673694188332911e-06, + "loss": 0.1754, + "step": 7142 + }, + { + "epoch": 0.7402839672504923, + "grad_norm": 0.6053313612937927, + "learning_rate": 6.6686888663860415e-06, + "loss": 0.1788, + "step": 7143 + }, + { + "epoch": 0.7403876049331537, + "grad_norm": 0.6300621032714844, + "learning_rate": 6.6636850465906e-06, + "loss": 0.2062, + "step": 7144 + }, + { + "epoch": 0.7404912426158151, + "grad_norm": 0.552588939666748, + "learning_rate": 6.658682729510384e-06, + "loss": 0.1553, + "step": 7145 + }, + { + "epoch": 0.7405948802984765, + "grad_norm": 0.5638970136642456, + "learning_rate": 6.653681915709058e-06, + "loss": 0.163, + "step": 7146 + }, + { + "epoch": 0.740698517981138, + "grad_norm": 0.7059263586997986, + "learning_rate": 6.648682605750112e-06, + "loss": 0.208, + "step": 7147 + }, + { + "epoch": 0.7408021556637994, + "grad_norm": 0.6765500903129578, + "learning_rate": 6.64368480019685e-06, + "loss": 0.226, + "step": 7148 + }, + { + "epoch": 0.7409057933464608, + "grad_norm": 0.6322110295295715, + "learning_rate": 6.638688499612426e-06, + "loss": 0.1794, + "step": 7149 + }, + { + "epoch": 0.7410094310291222, + "grad_norm": 0.6688316464424133, + "learning_rate": 6.633693704559814e-06, + "loss": 0.1942, + "step": 7150 + }, + { + "epoch": 0.7411130687117836, + "grad_norm": 0.7907984852790833, + "learning_rate": 6.628700415601809e-06, + "loss": 0.2294, + "step": 7151 + }, + { + "epoch": 0.741216706394445, + "grad_norm": 0.5939874053001404, + "learning_rate": 6.623708633301063e-06, + "loss": 0.1794, + "step": 7152 + }, + { + "epoch": 0.7413203440771065, + "grad_norm": 0.5933582782745361, + "learning_rate": 6.618718358220027e-06, + "loss": 0.1885, + "step": 7153 + }, + { + "epoch": 0.7414239817597679, + "grad_norm": 0.5784906148910522, + "learning_rate": 6.61372959092101e-06, + "loss": 0.1943, + "step": 7154 + }, + { + "epoch": 0.7415276194424293, + "grad_norm": 0.628635048866272, + "learning_rate": 6.608742331966127e-06, + "loss": 0.1953, + "step": 7155 + }, + { + "epoch": 0.7416312571250907, + "grad_norm": 0.6248700618743896, + "learning_rate": 6.603756581917349e-06, + "loss": 0.1832, + "step": 7156 + }, + { + "epoch": 0.7417348948077521, + "grad_norm": 0.572599470615387, + "learning_rate": 6.598772341336455e-06, + "loss": 0.176, + "step": 7157 + }, + { + "epoch": 0.7418385324904135, + "grad_norm": 0.735245406627655, + "learning_rate": 6.593789610785053e-06, + "loss": 0.2286, + "step": 7158 + }, + { + "epoch": 0.741942170173075, + "grad_norm": 0.6545553803443909, + "learning_rate": 6.588808390824604e-06, + "loss": 0.1877, + "step": 7159 + }, + { + "epoch": 0.7420458078557364, + "grad_norm": 0.6569068431854248, + "learning_rate": 6.583828682016371e-06, + "loss": 0.1726, + "step": 7160 + }, + { + "epoch": 0.7421494455383978, + "grad_norm": 0.6712062954902649, + "learning_rate": 6.578850484921466e-06, + "loss": 0.2028, + "step": 7161 + }, + { + "epoch": 0.7422530832210592, + "grad_norm": 0.6517971754074097, + "learning_rate": 6.573873800100832e-06, + "loss": 0.1797, + "step": 7162 + }, + { + "epoch": 0.7423567209037206, + "grad_norm": 0.5719771981239319, + "learning_rate": 6.568898628115226e-06, + "loss": 0.194, + "step": 7163 + }, + { + "epoch": 0.742460358586382, + "grad_norm": 0.6975439786911011, + "learning_rate": 6.563924969525244e-06, + "loss": 0.1985, + "step": 7164 + }, + { + "epoch": 0.7425639962690435, + "grad_norm": 0.6007943153381348, + "learning_rate": 6.5589528248913025e-06, + "loss": 0.1948, + "step": 7165 + }, + { + "epoch": 0.7426676339517049, + "grad_norm": 0.6068305969238281, + "learning_rate": 6.553982194773663e-06, + "loss": 0.1751, + "step": 7166 + }, + { + "epoch": 0.7427712716343663, + "grad_norm": 0.5859012603759766, + "learning_rate": 6.549013079732413e-06, + "loss": 0.173, + "step": 7167 + }, + { + "epoch": 0.7428749093170277, + "grad_norm": 0.7530663013458252, + "learning_rate": 6.544045480327455e-06, + "loss": 0.2106, + "step": 7168 + }, + { + "epoch": 0.7429785469996891, + "grad_norm": 0.6807491779327393, + "learning_rate": 6.539079397118539e-06, + "loss": 0.1839, + "step": 7169 + }, + { + "epoch": 0.7430821846823505, + "grad_norm": 0.7634007334709167, + "learning_rate": 6.534114830665232e-06, + "loss": 0.2349, + "step": 7170 + }, + { + "epoch": 0.743185822365012, + "grad_norm": 0.7185533046722412, + "learning_rate": 6.529151781526926e-06, + "loss": 0.2298, + "step": 7171 + }, + { + "epoch": 0.7432894600476734, + "grad_norm": 0.6481119990348816, + "learning_rate": 6.524190250262863e-06, + "loss": 0.1949, + "step": 7172 + }, + { + "epoch": 0.7433930977303348, + "grad_norm": 0.6291430592536926, + "learning_rate": 6.519230237432088e-06, + "loss": 0.2016, + "step": 7173 + }, + { + "epoch": 0.7434967354129962, + "grad_norm": 0.874639630317688, + "learning_rate": 6.514271743593499e-06, + "loss": 0.2434, + "step": 7174 + }, + { + "epoch": 0.7436003730956576, + "grad_norm": 0.5795329213142395, + "learning_rate": 6.509314769305801e-06, + "loss": 0.1647, + "step": 7175 + }, + { + "epoch": 0.743704010778319, + "grad_norm": 0.6141259074211121, + "learning_rate": 6.504359315127549e-06, + "loss": 0.22, + "step": 7176 + }, + { + "epoch": 0.7438076484609805, + "grad_norm": 0.5499668121337891, + "learning_rate": 6.49940538161711e-06, + "loss": 0.1762, + "step": 7177 + }, + { + "epoch": 0.7439112861436419, + "grad_norm": 0.5772035717964172, + "learning_rate": 6.494452969332678e-06, + "loss": 0.1637, + "step": 7178 + }, + { + "epoch": 0.7440149238263033, + "grad_norm": 0.678733766078949, + "learning_rate": 6.489502078832297e-06, + "loss": 0.2116, + "step": 7179 + }, + { + "epoch": 0.7441185615089646, + "grad_norm": 0.6421101093292236, + "learning_rate": 6.484552710673815e-06, + "loss": 0.2189, + "step": 7180 + }, + { + "epoch": 0.744222199191626, + "grad_norm": 0.6648598313331604, + "learning_rate": 6.479604865414928e-06, + "loss": 0.1842, + "step": 7181 + }, + { + "epoch": 0.7443258368742874, + "grad_norm": 0.6394937634468079, + "learning_rate": 6.474658543613146e-06, + "loss": 0.2011, + "step": 7182 + }, + { + "epoch": 0.7444294745569489, + "grad_norm": 0.5273407697677612, + "learning_rate": 6.46971374582581e-06, + "loss": 0.1534, + "step": 7183 + }, + { + "epoch": 0.7445331122396103, + "grad_norm": 0.6797968745231628, + "learning_rate": 6.4647704726101e-06, + "loss": 0.2042, + "step": 7184 + }, + { + "epoch": 0.7446367499222717, + "grad_norm": 0.6618654131889343, + "learning_rate": 6.459828724523007e-06, + "loss": 0.1849, + "step": 7185 + }, + { + "epoch": 0.7447403876049331, + "grad_norm": 0.6694583892822266, + "learning_rate": 6.4548885021213635e-06, + "loss": 0.2022, + "step": 7186 + }, + { + "epoch": 0.7448440252875945, + "grad_norm": 0.6789039969444275, + "learning_rate": 6.449949805961835e-06, + "loss": 0.1922, + "step": 7187 + }, + { + "epoch": 0.7449476629702559, + "grad_norm": 0.714964747428894, + "learning_rate": 6.445012636600898e-06, + "loss": 0.1984, + "step": 7188 + }, + { + "epoch": 0.7450513006529174, + "grad_norm": 0.6054500937461853, + "learning_rate": 6.440076994594866e-06, + "loss": 0.1852, + "step": 7189 + }, + { + "epoch": 0.7451549383355788, + "grad_norm": 0.669039785861969, + "learning_rate": 6.435142880499874e-06, + "loss": 0.1992, + "step": 7190 + }, + { + "epoch": 0.7452585760182402, + "grad_norm": 0.6293997168540955, + "learning_rate": 6.430210294871893e-06, + "loss": 0.21, + "step": 7191 + }, + { + "epoch": 0.7453622137009016, + "grad_norm": 0.6848145723342896, + "learning_rate": 6.4252792382667285e-06, + "loss": 0.1994, + "step": 7192 + }, + { + "epoch": 0.745465851383563, + "grad_norm": 0.6127070188522339, + "learning_rate": 6.420349711239988e-06, + "loss": 0.1897, + "step": 7193 + }, + { + "epoch": 0.7455694890662244, + "grad_norm": 0.6485124826431274, + "learning_rate": 6.415421714347141e-06, + "loss": 0.1981, + "step": 7194 + }, + { + "epoch": 0.7456731267488859, + "grad_norm": 0.6168133616447449, + "learning_rate": 6.410495248143458e-06, + "loss": 0.1741, + "step": 7195 + }, + { + "epoch": 0.7457767644315473, + "grad_norm": 0.659416913986206, + "learning_rate": 6.405570313184038e-06, + "loss": 0.239, + "step": 7196 + }, + { + "epoch": 0.7458804021142087, + "grad_norm": 0.6403185129165649, + "learning_rate": 6.400646910023829e-06, + "loss": 0.2017, + "step": 7197 + }, + { + "epoch": 0.7459840397968701, + "grad_norm": 0.7637109160423279, + "learning_rate": 6.395725039217579e-06, + "loss": 0.2671, + "step": 7198 + }, + { + "epoch": 0.7460876774795315, + "grad_norm": 0.5837101936340332, + "learning_rate": 6.39080470131989e-06, + "loss": 0.1724, + "step": 7199 + }, + { + "epoch": 0.746191315162193, + "grad_norm": 0.5817883014678955, + "learning_rate": 6.3858858968851645e-06, + "loss": 0.1936, + "step": 7200 + }, + { + "epoch": 0.7462949528448544, + "grad_norm": 0.6717552542686462, + "learning_rate": 6.380968626467659e-06, + "loss": 0.2043, + "step": 7201 + }, + { + "epoch": 0.7463985905275158, + "grad_norm": 0.5266337394714355, + "learning_rate": 6.376052890621438e-06, + "loss": 0.1498, + "step": 7202 + }, + { + "epoch": 0.7465022282101772, + "grad_norm": 0.5839206576347351, + "learning_rate": 6.371138689900392e-06, + "loss": 0.1862, + "step": 7203 + }, + { + "epoch": 0.7466058658928386, + "grad_norm": 0.7016838192939758, + "learning_rate": 6.3662260248582575e-06, + "loss": 0.1997, + "step": 7204 + }, + { + "epoch": 0.7467095035755, + "grad_norm": 0.8019984364509583, + "learning_rate": 6.361314896048574e-06, + "loss": 0.2547, + "step": 7205 + }, + { + "epoch": 0.7468131412581615, + "grad_norm": 0.6328192353248596, + "learning_rate": 6.356405304024726e-06, + "loss": 0.1839, + "step": 7206 + }, + { + "epoch": 0.7469167789408229, + "grad_norm": 0.5738548636436462, + "learning_rate": 6.351497249339933e-06, + "loss": 0.139, + "step": 7207 + }, + { + "epoch": 0.7470204166234843, + "grad_norm": 0.6084631085395813, + "learning_rate": 6.3465907325471996e-06, + "loss": 0.1768, + "step": 7208 + }, + { + "epoch": 0.7471240543061457, + "grad_norm": 0.6383935213088989, + "learning_rate": 6.341685754199405e-06, + "loss": 0.1846, + "step": 7209 + }, + { + "epoch": 0.7472276919888071, + "grad_norm": 0.6913857460021973, + "learning_rate": 6.336782314849219e-06, + "loss": 0.2225, + "step": 7210 + }, + { + "epoch": 0.7473313296714685, + "grad_norm": 0.7214654684066772, + "learning_rate": 6.331880415049163e-06, + "loss": 0.1921, + "step": 7211 + }, + { + "epoch": 0.74743496735413, + "grad_norm": 0.6223048567771912, + "learning_rate": 6.326980055351581e-06, + "loss": 0.1955, + "step": 7212 + }, + { + "epoch": 0.7475386050367914, + "grad_norm": 0.5999380350112915, + "learning_rate": 6.32208123630863e-06, + "loss": 0.1961, + "step": 7213 + }, + { + "epoch": 0.7476422427194528, + "grad_norm": 0.7001415491104126, + "learning_rate": 6.317183958472297e-06, + "loss": 0.2018, + "step": 7214 + }, + { + "epoch": 0.7477458804021142, + "grad_norm": 0.5766058564186096, + "learning_rate": 6.312288222394414e-06, + "loss": 0.1626, + "step": 7215 + }, + { + "epoch": 0.7478495180847756, + "grad_norm": 0.5628561973571777, + "learning_rate": 6.307394028626605e-06, + "loss": 0.1717, + "step": 7216 + }, + { + "epoch": 0.747953155767437, + "grad_norm": 0.7716298699378967, + "learning_rate": 6.3025013777203605e-06, + "loss": 0.2195, + "step": 7217 + }, + { + "epoch": 0.7480567934500985, + "grad_norm": 0.7538269758224487, + "learning_rate": 6.2976102702269615e-06, + "loss": 0.1996, + "step": 7218 + }, + { + "epoch": 0.7481604311327599, + "grad_norm": 0.5600654482841492, + "learning_rate": 6.292720706697541e-06, + "loss": 0.1617, + "step": 7219 + }, + { + "epoch": 0.7482640688154213, + "grad_norm": 0.6885689496994019, + "learning_rate": 6.287832687683046e-06, + "loss": 0.2133, + "step": 7220 + }, + { + "epoch": 0.7483677064980827, + "grad_norm": 0.5769020318984985, + "learning_rate": 6.28294621373424e-06, + "loss": 0.1793, + "step": 7221 + }, + { + "epoch": 0.7484713441807441, + "grad_norm": 0.689033567905426, + "learning_rate": 6.278061285401735e-06, + "loss": 0.1862, + "step": 7222 + }, + { + "epoch": 0.7485749818634055, + "grad_norm": 0.6020885109901428, + "learning_rate": 6.27317790323595e-06, + "loss": 0.1927, + "step": 7223 + }, + { + "epoch": 0.748678619546067, + "grad_norm": 0.6004186868667603, + "learning_rate": 6.268296067787138e-06, + "loss": 0.1753, + "step": 7224 + }, + { + "epoch": 0.7487822572287284, + "grad_norm": 0.615755021572113, + "learning_rate": 6.263415779605386e-06, + "loss": 0.1849, + "step": 7225 + }, + { + "epoch": 0.7488858949113898, + "grad_norm": 0.562086284160614, + "learning_rate": 6.258537039240591e-06, + "loss": 0.167, + "step": 7226 + }, + { + "epoch": 0.7489895325940512, + "grad_norm": 0.6605566143989563, + "learning_rate": 6.253659847242479e-06, + "loss": 0.1977, + "step": 7227 + }, + { + "epoch": 0.7490931702767126, + "grad_norm": 0.623592734336853, + "learning_rate": 6.248784204160601e-06, + "loss": 0.1716, + "step": 7228 + }, + { + "epoch": 0.749196807959374, + "grad_norm": 0.545217752456665, + "learning_rate": 6.2439101105443424e-06, + "loss": 0.1843, + "step": 7229 + }, + { + "epoch": 0.7493004456420355, + "grad_norm": 0.6777985692024231, + "learning_rate": 6.239037566942914e-06, + "loss": 0.2135, + "step": 7230 + }, + { + "epoch": 0.7494040833246969, + "grad_norm": 0.5926411747932434, + "learning_rate": 6.234166573905336e-06, + "loss": 0.1834, + "step": 7231 + }, + { + "epoch": 0.7495077210073583, + "grad_norm": 0.5609111189842224, + "learning_rate": 6.229297131980474e-06, + "loss": 0.1676, + "step": 7232 + }, + { + "epoch": 0.7496113586900197, + "grad_norm": 0.6948944330215454, + "learning_rate": 6.224429241717003e-06, + "loss": 0.2067, + "step": 7233 + }, + { + "epoch": 0.7497149963726811, + "grad_norm": 0.5710058808326721, + "learning_rate": 6.219562903663425e-06, + "loss": 0.18, + "step": 7234 + }, + { + "epoch": 0.7498186340553425, + "grad_norm": 0.6364341974258423, + "learning_rate": 6.214698118368085e-06, + "loss": 0.1769, + "step": 7235 + }, + { + "epoch": 0.749922271738004, + "grad_norm": 0.6758280992507935, + "learning_rate": 6.209834886379123e-06, + "loss": 0.1917, + "step": 7236 + }, + { + "epoch": 0.7500259094206654, + "grad_norm": 0.7250998616218567, + "learning_rate": 6.204973208244536e-06, + "loss": 0.2122, + "step": 7237 + }, + { + "epoch": 0.7501295471033268, + "grad_norm": 0.5818528532981873, + "learning_rate": 6.200113084512116e-06, + "loss": 0.1641, + "step": 7238 + }, + { + "epoch": 0.7502331847859882, + "grad_norm": 0.7421915531158447, + "learning_rate": 6.19525451572951e-06, + "loss": 0.2021, + "step": 7239 + }, + { + "epoch": 0.7503368224686496, + "grad_norm": 0.6421279907226562, + "learning_rate": 6.1903975024441634e-06, + "loss": 0.198, + "step": 7240 + }, + { + "epoch": 0.750440460151311, + "grad_norm": 0.6554982662200928, + "learning_rate": 6.185542045203352e-06, + "loss": 0.2247, + "step": 7241 + }, + { + "epoch": 0.7505440978339725, + "grad_norm": 0.5850462317466736, + "learning_rate": 6.180688144554194e-06, + "loss": 0.1705, + "step": 7242 + }, + { + "epoch": 0.7506477355166339, + "grad_norm": 0.6490334868431091, + "learning_rate": 6.175835801043608e-06, + "loss": 0.1991, + "step": 7243 + }, + { + "epoch": 0.7507513731992953, + "grad_norm": 0.7006539106369019, + "learning_rate": 6.17098501521836e-06, + "loss": 0.2035, + "step": 7244 + }, + { + "epoch": 0.7508550108819567, + "grad_norm": 0.6657992601394653, + "learning_rate": 6.166135787625023e-06, + "loss": 0.1977, + "step": 7245 + }, + { + "epoch": 0.7509586485646181, + "grad_norm": 0.619107723236084, + "learning_rate": 6.161288118809994e-06, + "loss": 0.1919, + "step": 7246 + }, + { + "epoch": 0.7510622862472796, + "grad_norm": 0.810283899307251, + "learning_rate": 6.156442009319512e-06, + "loss": 0.2356, + "step": 7247 + }, + { + "epoch": 0.751165923929941, + "grad_norm": 0.5877438187599182, + "learning_rate": 6.151597459699621e-06, + "loss": 0.1847, + "step": 7248 + }, + { + "epoch": 0.7512695616126024, + "grad_norm": 0.6159684658050537, + "learning_rate": 6.1467544704961965e-06, + "loss": 0.1944, + "step": 7249 + }, + { + "epoch": 0.7513731992952638, + "grad_norm": 0.8051716685295105, + "learning_rate": 6.141913042254952e-06, + "loss": 0.2004, + "step": 7250 + }, + { + "epoch": 0.7514768369779252, + "grad_norm": 0.7064564228057861, + "learning_rate": 6.137073175521402e-06, + "loss": 0.2126, + "step": 7251 + }, + { + "epoch": 0.7515804746605866, + "grad_norm": 0.6129339933395386, + "learning_rate": 6.132234870840899e-06, + "loss": 0.1987, + "step": 7252 + }, + { + "epoch": 0.7516841123432481, + "grad_norm": 0.5935209393501282, + "learning_rate": 6.1273981287586035e-06, + "loss": 0.1574, + "step": 7253 + }, + { + "epoch": 0.7517877500259095, + "grad_norm": 0.7329012751579285, + "learning_rate": 6.122562949819522e-06, + "loss": 0.2455, + "step": 7254 + }, + { + "epoch": 0.7518913877085709, + "grad_norm": 0.6233592629432678, + "learning_rate": 6.117729334568481e-06, + "loss": 0.205, + "step": 7255 + }, + { + "epoch": 0.7519950253912322, + "grad_norm": 0.5638620257377625, + "learning_rate": 6.112897283550112e-06, + "loss": 0.1628, + "step": 7256 + }, + { + "epoch": 0.7520986630738936, + "grad_norm": 0.6088778376579285, + "learning_rate": 6.108066797308896e-06, + "loss": 0.1974, + "step": 7257 + }, + { + "epoch": 0.752202300756555, + "grad_norm": 0.7371406555175781, + "learning_rate": 6.103237876389117e-06, + "loss": 0.1761, + "step": 7258 + }, + { + "epoch": 0.7523059384392164, + "grad_norm": 0.5529123544692993, + "learning_rate": 6.098410521334883e-06, + "loss": 0.163, + "step": 7259 + }, + { + "epoch": 0.7524095761218779, + "grad_norm": 0.6525443196296692, + "learning_rate": 6.09358473269015e-06, + "loss": 0.2138, + "step": 7260 + }, + { + "epoch": 0.7525132138045393, + "grad_norm": 0.669493556022644, + "learning_rate": 6.088760510998664e-06, + "loss": 0.2048, + "step": 7261 + }, + { + "epoch": 0.7526168514872007, + "grad_norm": 0.6533212661743164, + "learning_rate": 6.083937856804025e-06, + "loss": 0.1936, + "step": 7262 + }, + { + "epoch": 0.7527204891698621, + "grad_norm": 0.6517473459243774, + "learning_rate": 6.079116770649629e-06, + "loss": 0.198, + "step": 7263 + }, + { + "epoch": 0.7528241268525235, + "grad_norm": 0.724342942237854, + "learning_rate": 6.074297253078723e-06, + "loss": 0.2223, + "step": 7264 + }, + { + "epoch": 0.752927764535185, + "grad_norm": 0.6567179560661316, + "learning_rate": 6.069479304634352e-06, + "loss": 0.1875, + "step": 7265 + }, + { + "epoch": 0.7530314022178464, + "grad_norm": 0.7255767583847046, + "learning_rate": 6.06466292585939e-06, + "loss": 0.2116, + "step": 7266 + }, + { + "epoch": 0.7531350399005078, + "grad_norm": 0.6283934712409973, + "learning_rate": 6.059848117296556e-06, + "loss": 0.1815, + "step": 7267 + }, + { + "epoch": 0.7532386775831692, + "grad_norm": 0.7235457897186279, + "learning_rate": 6.05503487948836e-06, + "loss": 0.1972, + "step": 7268 + }, + { + "epoch": 0.7533423152658306, + "grad_norm": 0.5651537179946899, + "learning_rate": 6.050223212977153e-06, + "loss": 0.161, + "step": 7269 + }, + { + "epoch": 0.753445952948492, + "grad_norm": 0.7023213505744934, + "learning_rate": 6.045413118305123e-06, + "loss": 0.2197, + "step": 7270 + }, + { + "epoch": 0.7535495906311535, + "grad_norm": 0.7070270776748657, + "learning_rate": 6.0406045960142365e-06, + "loss": 0.2125, + "step": 7271 + }, + { + "epoch": 0.7536532283138149, + "grad_norm": 0.6896175146102905, + "learning_rate": 6.035797646646331e-06, + "loss": 0.2296, + "step": 7272 + }, + { + "epoch": 0.7537568659964763, + "grad_norm": 0.705349326133728, + "learning_rate": 6.030992270743032e-06, + "loss": 0.2146, + "step": 7273 + }, + { + "epoch": 0.7538605036791377, + "grad_norm": 0.49893879890441895, + "learning_rate": 6.026188468845811e-06, + "loss": 0.1506, + "step": 7274 + }, + { + "epoch": 0.7539641413617991, + "grad_norm": 0.5738864541053772, + "learning_rate": 6.021386241495955e-06, + "loss": 0.1677, + "step": 7275 + }, + { + "epoch": 0.7540677790444605, + "grad_norm": 0.6488167643547058, + "learning_rate": 6.016585589234567e-06, + "loss": 0.2096, + "step": 7276 + }, + { + "epoch": 0.754171416727122, + "grad_norm": 0.6203997731208801, + "learning_rate": 6.011786512602576e-06, + "loss": 0.1919, + "step": 7277 + }, + { + "epoch": 0.7542750544097834, + "grad_norm": 0.578915536403656, + "learning_rate": 6.006989012140732e-06, + "loss": 0.1701, + "step": 7278 + }, + { + "epoch": 0.7543786920924448, + "grad_norm": 0.5318428874015808, + "learning_rate": 6.002193088389612e-06, + "loss": 0.1654, + "step": 7279 + }, + { + "epoch": 0.7544823297751062, + "grad_norm": 0.5900943875312805, + "learning_rate": 5.997398741889619e-06, + "loss": 0.1857, + "step": 7280 + }, + { + "epoch": 0.7545859674577676, + "grad_norm": 0.6281424760818481, + "learning_rate": 5.992605973180965e-06, + "loss": 0.192, + "step": 7281 + }, + { + "epoch": 0.754689605140429, + "grad_norm": 0.6875169277191162, + "learning_rate": 5.987814782803702e-06, + "loss": 0.2051, + "step": 7282 + }, + { + "epoch": 0.7547932428230905, + "grad_norm": 0.7132881879806519, + "learning_rate": 5.983025171297685e-06, + "loss": 0.2304, + "step": 7283 + }, + { + "epoch": 0.7548968805057519, + "grad_norm": 0.7771185636520386, + "learning_rate": 5.978237139202596e-06, + "loss": 0.2364, + "step": 7284 + }, + { + "epoch": 0.7550005181884133, + "grad_norm": 0.637414276599884, + "learning_rate": 5.973450687057956e-06, + "loss": 0.2289, + "step": 7285 + }, + { + "epoch": 0.7551041558710747, + "grad_norm": 0.6778030395507812, + "learning_rate": 5.9686658154030804e-06, + "loss": 0.1916, + "step": 7286 + }, + { + "epoch": 0.7552077935537361, + "grad_norm": 0.5655515789985657, + "learning_rate": 5.963882524777136e-06, + "loss": 0.1663, + "step": 7287 + }, + { + "epoch": 0.7553114312363975, + "grad_norm": 0.6484770774841309, + "learning_rate": 5.959100815719083e-06, + "loss": 0.1996, + "step": 7288 + }, + { + "epoch": 0.755415068919059, + "grad_norm": 0.6523261070251465, + "learning_rate": 5.954320688767727e-06, + "loss": 0.1783, + "step": 7289 + }, + { + "epoch": 0.7555187066017204, + "grad_norm": 0.706913948059082, + "learning_rate": 5.949542144461684e-06, + "loss": 0.2012, + "step": 7290 + }, + { + "epoch": 0.7556223442843818, + "grad_norm": 0.7174582481384277, + "learning_rate": 5.944765183339383e-06, + "loss": 0.1925, + "step": 7291 + }, + { + "epoch": 0.7557259819670432, + "grad_norm": 0.6907774209976196, + "learning_rate": 5.9399898059390996e-06, + "loss": 0.2149, + "step": 7292 + }, + { + "epoch": 0.7558296196497046, + "grad_norm": 0.7346753478050232, + "learning_rate": 5.935216012798899e-06, + "loss": 0.2104, + "step": 7293 + }, + { + "epoch": 0.755933257332366, + "grad_norm": 0.6349897980690002, + "learning_rate": 5.930443804456696e-06, + "loss": 0.1915, + "step": 7294 + }, + { + "epoch": 0.7560368950150275, + "grad_norm": 0.5483682155609131, + "learning_rate": 5.925673181450217e-06, + "loss": 0.1488, + "step": 7295 + }, + { + "epoch": 0.7561405326976889, + "grad_norm": 0.6724481582641602, + "learning_rate": 5.920904144317008e-06, + "loss": 0.2061, + "step": 7296 + }, + { + "epoch": 0.7562441703803503, + "grad_norm": 0.6132513284683228, + "learning_rate": 5.916136693594434e-06, + "loss": 0.1948, + "step": 7297 + }, + { + "epoch": 0.7563478080630117, + "grad_norm": 0.5399127006530762, + "learning_rate": 5.911370829819676e-06, + "loss": 0.1638, + "step": 7298 + }, + { + "epoch": 0.7564514457456731, + "grad_norm": 0.6391360759735107, + "learning_rate": 5.906606553529752e-06, + "loss": 0.1918, + "step": 7299 + }, + { + "epoch": 0.7565550834283346, + "grad_norm": 0.6599064469337463, + "learning_rate": 5.901843865261499e-06, + "loss": 0.1923, + "step": 7300 + }, + { + "epoch": 0.756658721110996, + "grad_norm": 0.7922466993331909, + "learning_rate": 5.897082765551556e-06, + "loss": 0.2055, + "step": 7301 + }, + { + "epoch": 0.7567623587936574, + "grad_norm": 0.6952490210533142, + "learning_rate": 5.892323254936419e-06, + "loss": 0.2004, + "step": 7302 + }, + { + "epoch": 0.7568659964763188, + "grad_norm": 0.6422750949859619, + "learning_rate": 5.887565333952352e-06, + "loss": 0.1778, + "step": 7303 + }, + { + "epoch": 0.7569696341589802, + "grad_norm": 0.6436856985092163, + "learning_rate": 5.882809003135486e-06, + "loss": 0.1764, + "step": 7304 + }, + { + "epoch": 0.7570732718416416, + "grad_norm": 0.6292835474014282, + "learning_rate": 5.87805426302176e-06, + "loss": 0.1754, + "step": 7305 + }, + { + "epoch": 0.757176909524303, + "grad_norm": 0.7755428552627563, + "learning_rate": 5.8733011141469236e-06, + "loss": 0.2428, + "step": 7306 + }, + { + "epoch": 0.7572805472069645, + "grad_norm": 0.6775556802749634, + "learning_rate": 5.868549557046561e-06, + "loss": 0.1965, + "step": 7307 + }, + { + "epoch": 0.7573841848896259, + "grad_norm": 0.7172718048095703, + "learning_rate": 5.863799592256067e-06, + "loss": 0.1977, + "step": 7308 + }, + { + "epoch": 0.7574878225722873, + "grad_norm": 0.5608749985694885, + "learning_rate": 5.8590512203106544e-06, + "loss": 0.1709, + "step": 7309 + }, + { + "epoch": 0.7575914602549487, + "grad_norm": 0.6475331783294678, + "learning_rate": 5.854304441745373e-06, + "loss": 0.2118, + "step": 7310 + }, + { + "epoch": 0.7576950979376101, + "grad_norm": 0.6698943376541138, + "learning_rate": 5.8495592570950724e-06, + "loss": 0.204, + "step": 7311 + }, + { + "epoch": 0.7577987356202716, + "grad_norm": 0.6197548508644104, + "learning_rate": 5.844815666894443e-06, + "loss": 0.1903, + "step": 7312 + }, + { + "epoch": 0.757902373302933, + "grad_norm": 0.6404499411582947, + "learning_rate": 5.840073671677973e-06, + "loss": 0.1973, + "step": 7313 + }, + { + "epoch": 0.7580060109855944, + "grad_norm": 0.6370285749435425, + "learning_rate": 5.835333271979995e-06, + "loss": 0.1978, + "step": 7314 + }, + { + "epoch": 0.7581096486682558, + "grad_norm": 0.6697351336479187, + "learning_rate": 5.830594468334647e-06, + "loss": 0.2058, + "step": 7315 + }, + { + "epoch": 0.7582132863509172, + "grad_norm": 0.6191688179969788, + "learning_rate": 5.82585726127588e-06, + "loss": 0.1983, + "step": 7316 + }, + { + "epoch": 0.7583169240335786, + "grad_norm": 0.6752298474311829, + "learning_rate": 5.821121651337489e-06, + "loss": 0.1862, + "step": 7317 + }, + { + "epoch": 0.7584205617162401, + "grad_norm": 0.6014376878738403, + "learning_rate": 5.81638763905306e-06, + "loss": 0.1798, + "step": 7318 + }, + { + "epoch": 0.7585241993989015, + "grad_norm": 0.7816725373268127, + "learning_rate": 5.8116552249560274e-06, + "loss": 0.24, + "step": 7319 + }, + { + "epoch": 0.7586278370815629, + "grad_norm": 0.6555365324020386, + "learning_rate": 5.806924409579631e-06, + "loss": 0.207, + "step": 7320 + }, + { + "epoch": 0.7587314747642243, + "grad_norm": 0.6532207727432251, + "learning_rate": 5.80219519345693e-06, + "loss": 0.1918, + "step": 7321 + }, + { + "epoch": 0.7588351124468857, + "grad_norm": 0.7684751152992249, + "learning_rate": 5.797467577120803e-06, + "loss": 0.2297, + "step": 7322 + }, + { + "epoch": 0.7589387501295471, + "grad_norm": 0.6303255558013916, + "learning_rate": 5.792741561103945e-06, + "loss": 0.1956, + "step": 7323 + }, + { + "epoch": 0.7590423878122086, + "grad_norm": 0.5961942076683044, + "learning_rate": 5.788017145938882e-06, + "loss": 0.1808, + "step": 7324 + }, + { + "epoch": 0.75914602549487, + "grad_norm": 0.6447713375091553, + "learning_rate": 5.783294332157963e-06, + "loss": 0.2017, + "step": 7325 + }, + { + "epoch": 0.7592496631775314, + "grad_norm": 0.725047767162323, + "learning_rate": 5.778573120293329e-06, + "loss": 0.2063, + "step": 7326 + }, + { + "epoch": 0.7593533008601928, + "grad_norm": 0.5291570425033569, + "learning_rate": 5.773853510876975e-06, + "loss": 0.1796, + "step": 7327 + }, + { + "epoch": 0.7594569385428542, + "grad_norm": 0.5901037454605103, + "learning_rate": 5.769135504440693e-06, + "loss": 0.1886, + "step": 7328 + }, + { + "epoch": 0.7595605762255156, + "grad_norm": 0.6665926575660706, + "learning_rate": 5.764419101516095e-06, + "loss": 0.2019, + "step": 7329 + }, + { + "epoch": 0.7596642139081771, + "grad_norm": 0.6632572412490845, + "learning_rate": 5.759704302634626e-06, + "loss": 0.2079, + "step": 7330 + }, + { + "epoch": 0.7597678515908385, + "grad_norm": 0.5482375025749207, + "learning_rate": 5.754991108327534e-06, + "loss": 0.1617, + "step": 7331 + }, + { + "epoch": 0.7598714892734998, + "grad_norm": 0.6653674840927124, + "learning_rate": 5.750279519125908e-06, + "loss": 0.187, + "step": 7332 + }, + { + "epoch": 0.7599751269561612, + "grad_norm": 0.562900185585022, + "learning_rate": 5.7455695355606314e-06, + "loss": 0.1761, + "step": 7333 + }, + { + "epoch": 0.7600787646388226, + "grad_norm": 0.6961675882339478, + "learning_rate": 5.740861158162416e-06, + "loss": 0.2146, + "step": 7334 + }, + { + "epoch": 0.760182402321484, + "grad_norm": 0.5686554312705994, + "learning_rate": 5.736154387461805e-06, + "loss": 0.1669, + "step": 7335 + }, + { + "epoch": 0.7602860400041455, + "grad_norm": 0.6474700570106506, + "learning_rate": 5.731449223989138e-06, + "loss": 0.2088, + "step": 7336 + }, + { + "epoch": 0.7603896776868069, + "grad_norm": 0.6459776759147644, + "learning_rate": 5.726745668274598e-06, + "loss": 0.2023, + "step": 7337 + }, + { + "epoch": 0.7604933153694683, + "grad_norm": 0.7052670121192932, + "learning_rate": 5.72204372084816e-06, + "loss": 0.2519, + "step": 7338 + }, + { + "epoch": 0.7605969530521297, + "grad_norm": 0.5499739646911621, + "learning_rate": 5.717343382239649e-06, + "loss": 0.1594, + "step": 7339 + }, + { + "epoch": 0.7607005907347911, + "grad_norm": 0.6012383103370667, + "learning_rate": 5.7126446529786804e-06, + "loss": 0.1812, + "step": 7340 + }, + { + "epoch": 0.7608042284174525, + "grad_norm": 0.7203385829925537, + "learning_rate": 5.707947533594698e-06, + "loss": 0.2274, + "step": 7341 + }, + { + "epoch": 0.760907866100114, + "grad_norm": 0.6299017667770386, + "learning_rate": 5.703252024616974e-06, + "loss": 0.1962, + "step": 7342 + }, + { + "epoch": 0.7610115037827754, + "grad_norm": 0.6651351451873779, + "learning_rate": 5.698558126574583e-06, + "loss": 0.2004, + "step": 7343 + }, + { + "epoch": 0.7611151414654368, + "grad_norm": 0.6428758502006531, + "learning_rate": 5.6938658399964285e-06, + "loss": 0.1912, + "step": 7344 + }, + { + "epoch": 0.7612187791480982, + "grad_norm": 0.5440841913223267, + "learning_rate": 5.6891751654112405e-06, + "loss": 0.1646, + "step": 7345 + }, + { + "epoch": 0.7613224168307596, + "grad_norm": 0.5122158527374268, + "learning_rate": 5.6844861033475466e-06, + "loss": 0.1455, + "step": 7346 + }, + { + "epoch": 0.761426054513421, + "grad_norm": 0.5525471568107605, + "learning_rate": 5.679798654333704e-06, + "loss": 0.1597, + "step": 7347 + }, + { + "epoch": 0.7615296921960825, + "grad_norm": 0.7224408388137817, + "learning_rate": 5.6751128188978835e-06, + "loss": 0.2429, + "step": 7348 + }, + { + "epoch": 0.7616333298787439, + "grad_norm": 0.6339262127876282, + "learning_rate": 5.670428597568081e-06, + "loss": 0.2082, + "step": 7349 + }, + { + "epoch": 0.7617369675614053, + "grad_norm": 0.6057512164115906, + "learning_rate": 5.665745990872114e-06, + "loss": 0.1794, + "step": 7350 + }, + { + "epoch": 0.7618406052440667, + "grad_norm": 0.7479916214942932, + "learning_rate": 5.6610649993376e-06, + "loss": 0.2254, + "step": 7351 + }, + { + "epoch": 0.7619442429267281, + "grad_norm": 0.5429776310920715, + "learning_rate": 5.656385623491998e-06, + "loss": 0.1486, + "step": 7352 + }, + { + "epoch": 0.7620478806093895, + "grad_norm": 0.5628483891487122, + "learning_rate": 5.651707863862566e-06, + "loss": 0.1827, + "step": 7353 + }, + { + "epoch": 0.762151518292051, + "grad_norm": 0.6502082943916321, + "learning_rate": 5.647031720976382e-06, + "loss": 0.1867, + "step": 7354 + }, + { + "epoch": 0.7622551559747124, + "grad_norm": 0.6297394633293152, + "learning_rate": 5.642357195360355e-06, + "loss": 0.2034, + "step": 7355 + }, + { + "epoch": 0.7623587936573738, + "grad_norm": 0.6706293821334839, + "learning_rate": 5.637684287541196e-06, + "loss": 0.195, + "step": 7356 + }, + { + "epoch": 0.7624624313400352, + "grad_norm": 0.6522697806358337, + "learning_rate": 5.633012998045451e-06, + "loss": 0.2099, + "step": 7357 + }, + { + "epoch": 0.7625660690226966, + "grad_norm": 0.6541911959648132, + "learning_rate": 5.628343327399462e-06, + "loss": 0.1754, + "step": 7358 + }, + { + "epoch": 0.762669706705358, + "grad_norm": 0.6433303952217102, + "learning_rate": 5.62367527612941e-06, + "loss": 0.1875, + "step": 7359 + }, + { + "epoch": 0.7627733443880195, + "grad_norm": 0.6429468393325806, + "learning_rate": 5.61900884476128e-06, + "loss": 0.1947, + "step": 7360 + }, + { + "epoch": 0.7628769820706809, + "grad_norm": 0.6449114084243774, + "learning_rate": 5.614344033820871e-06, + "loss": 0.1945, + "step": 7361 + }, + { + "epoch": 0.7629806197533423, + "grad_norm": 0.6627293825149536, + "learning_rate": 5.60968084383382e-06, + "loss": 0.2207, + "step": 7362 + }, + { + "epoch": 0.7630842574360037, + "grad_norm": 0.7052158713340759, + "learning_rate": 5.6050192753255565e-06, + "loss": 0.2095, + "step": 7363 + }, + { + "epoch": 0.7631878951186651, + "grad_norm": 0.6348506212234497, + "learning_rate": 5.600359328821341e-06, + "loss": 0.19, + "step": 7364 + }, + { + "epoch": 0.7632915328013266, + "grad_norm": 0.5358766317367554, + "learning_rate": 5.595701004846266e-06, + "loss": 0.1686, + "step": 7365 + }, + { + "epoch": 0.763395170483988, + "grad_norm": 0.7228176593780518, + "learning_rate": 5.591044303925197e-06, + "loss": 0.2052, + "step": 7366 + }, + { + "epoch": 0.7634988081666494, + "grad_norm": 0.5297076106071472, + "learning_rate": 5.586389226582862e-06, + "loss": 0.1641, + "step": 7367 + }, + { + "epoch": 0.7636024458493108, + "grad_norm": 0.6142310500144958, + "learning_rate": 5.581735773343777e-06, + "loss": 0.172, + "step": 7368 + }, + { + "epoch": 0.7637060835319722, + "grad_norm": 0.5850446224212646, + "learning_rate": 5.5770839447322936e-06, + "loss": 0.1708, + "step": 7369 + }, + { + "epoch": 0.7638097212146336, + "grad_norm": 0.6489421725273132, + "learning_rate": 5.572433741272574e-06, + "loss": 0.1959, + "step": 7370 + }, + { + "epoch": 0.7639133588972951, + "grad_norm": 0.7610423564910889, + "learning_rate": 5.567785163488592e-06, + "loss": 0.2226, + "step": 7371 + }, + { + "epoch": 0.7640169965799565, + "grad_norm": 0.5759432911872864, + "learning_rate": 5.563138211904144e-06, + "loss": 0.1486, + "step": 7372 + }, + { + "epoch": 0.7641206342626179, + "grad_norm": 0.6531688570976257, + "learning_rate": 5.558492887042832e-06, + "loss": 0.1952, + "step": 7373 + }, + { + "epoch": 0.7642242719452793, + "grad_norm": 0.6584306955337524, + "learning_rate": 5.5538491894280935e-06, + "loss": 0.1974, + "step": 7374 + }, + { + "epoch": 0.7643279096279407, + "grad_norm": 0.6911411881446838, + "learning_rate": 5.549207119583177e-06, + "loss": 0.209, + "step": 7375 + }, + { + "epoch": 0.7644315473106021, + "grad_norm": 0.711522102355957, + "learning_rate": 5.544566678031132e-06, + "loss": 0.2089, + "step": 7376 + }, + { + "epoch": 0.7645351849932636, + "grad_norm": 0.6881641745567322, + "learning_rate": 5.539927865294848e-06, + "loss": 0.2114, + "step": 7377 + }, + { + "epoch": 0.764638822675925, + "grad_norm": 0.5821783542633057, + "learning_rate": 5.535290681897014e-06, + "loss": 0.1677, + "step": 7378 + }, + { + "epoch": 0.7647424603585864, + "grad_norm": 0.5881667733192444, + "learning_rate": 5.530655128360134e-06, + "loss": 0.1903, + "step": 7379 + }, + { + "epoch": 0.7648460980412478, + "grad_norm": 0.6320826411247253, + "learning_rate": 5.526021205206546e-06, + "loss": 0.209, + "step": 7380 + }, + { + "epoch": 0.7649497357239092, + "grad_norm": 0.619613528251648, + "learning_rate": 5.521388912958383e-06, + "loss": 0.1741, + "step": 7381 + }, + { + "epoch": 0.7650533734065706, + "grad_norm": 0.7030311226844788, + "learning_rate": 5.5167582521376175e-06, + "loss": 0.1982, + "step": 7382 + }, + { + "epoch": 0.7651570110892321, + "grad_norm": 0.6498759388923645, + "learning_rate": 5.51212922326601e-06, + "loss": 0.1697, + "step": 7383 + }, + { + "epoch": 0.7652606487718935, + "grad_norm": 0.5602517127990723, + "learning_rate": 5.507501826865164e-06, + "loss": 0.1753, + "step": 7384 + }, + { + "epoch": 0.7653642864545549, + "grad_norm": 0.6065822839736938, + "learning_rate": 5.502876063456486e-06, + "loss": 0.1891, + "step": 7385 + }, + { + "epoch": 0.7654679241372163, + "grad_norm": 0.5793746113777161, + "learning_rate": 5.498251933561189e-06, + "loss": 0.1856, + "step": 7386 + }, + { + "epoch": 0.7655715618198777, + "grad_norm": 0.6203474998474121, + "learning_rate": 5.4936294377003296e-06, + "loss": 0.1801, + "step": 7387 + }, + { + "epoch": 0.7656751995025391, + "grad_norm": 0.6482757925987244, + "learning_rate": 5.489008576394745e-06, + "loss": 0.1831, + "step": 7388 + }, + { + "epoch": 0.7657788371852006, + "grad_norm": 0.7696741819381714, + "learning_rate": 5.484389350165118e-06, + "loss": 0.2658, + "step": 7389 + }, + { + "epoch": 0.765882474867862, + "grad_norm": 0.6601531505584717, + "learning_rate": 5.47977175953194e-06, + "loss": 0.1854, + "step": 7390 + }, + { + "epoch": 0.7659861125505234, + "grad_norm": 0.5935264229774475, + "learning_rate": 5.4751558050155085e-06, + "loss": 0.1804, + "step": 7391 + }, + { + "epoch": 0.7660897502331848, + "grad_norm": 0.5694248676300049, + "learning_rate": 5.470541487135941e-06, + "loss": 0.1706, + "step": 7392 + }, + { + "epoch": 0.7661933879158462, + "grad_norm": 0.70427405834198, + "learning_rate": 5.465928806413166e-06, + "loss": 0.1906, + "step": 7393 + }, + { + "epoch": 0.7662970255985077, + "grad_norm": 0.6865828633308411, + "learning_rate": 5.4613177633669405e-06, + "loss": 0.2231, + "step": 7394 + }, + { + "epoch": 0.7664006632811691, + "grad_norm": 0.6764995455741882, + "learning_rate": 5.456708358516833e-06, + "loss": 0.2044, + "step": 7395 + }, + { + "epoch": 0.7665043009638305, + "grad_norm": 0.6275007724761963, + "learning_rate": 5.452100592382221e-06, + "loss": 0.1497, + "step": 7396 + }, + { + "epoch": 0.7666079386464919, + "grad_norm": 0.662211537361145, + "learning_rate": 5.447494465482299e-06, + "loss": 0.1907, + "step": 7397 + }, + { + "epoch": 0.7667115763291533, + "grad_norm": 0.7616379261016846, + "learning_rate": 5.442889978336072e-06, + "loss": 0.1956, + "step": 7398 + }, + { + "epoch": 0.7668152140118147, + "grad_norm": 0.7183577418327332, + "learning_rate": 5.438287131462372e-06, + "loss": 0.2269, + "step": 7399 + }, + { + "epoch": 0.7669188516944762, + "grad_norm": 0.597819983959198, + "learning_rate": 5.433685925379848e-06, + "loss": 0.154, + "step": 7400 + }, + { + "epoch": 0.7670224893771376, + "grad_norm": 0.6873559951782227, + "learning_rate": 5.429086360606946e-06, + "loss": 0.1902, + "step": 7401 + }, + { + "epoch": 0.767126127059799, + "grad_norm": 0.6474601626396179, + "learning_rate": 5.424488437661946e-06, + "loss": 0.2061, + "step": 7402 + }, + { + "epoch": 0.7672297647424604, + "grad_norm": 0.6201249361038208, + "learning_rate": 5.419892157062929e-06, + "loss": 0.2112, + "step": 7403 + }, + { + "epoch": 0.7673334024251218, + "grad_norm": 0.7287440896034241, + "learning_rate": 5.4152975193277955e-06, + "loss": 0.2156, + "step": 7404 + }, + { + "epoch": 0.7674370401077832, + "grad_norm": 0.6518657803535461, + "learning_rate": 5.4107045249742705e-06, + "loss": 0.1786, + "step": 7405 + }, + { + "epoch": 0.7675406777904447, + "grad_norm": 1.0298590660095215, + "learning_rate": 5.406113174519874e-06, + "loss": 0.2331, + "step": 7406 + }, + { + "epoch": 0.7676443154731061, + "grad_norm": 0.6306723952293396, + "learning_rate": 5.401523468481957e-06, + "loss": 0.1963, + "step": 7407 + }, + { + "epoch": 0.7677479531557674, + "grad_norm": 0.662980854511261, + "learning_rate": 5.3969354073776905e-06, + "loss": 0.213, + "step": 7408 + }, + { + "epoch": 0.7678515908384288, + "grad_norm": 0.5138009190559387, + "learning_rate": 5.392348991724039e-06, + "loss": 0.1588, + "step": 7409 + }, + { + "epoch": 0.7679552285210902, + "grad_norm": 0.6134780645370483, + "learning_rate": 5.387764222037797e-06, + "loss": 0.162, + "step": 7410 + }, + { + "epoch": 0.7680588662037516, + "grad_norm": 0.6860098838806152, + "learning_rate": 5.383181098835559e-06, + "loss": 0.1942, + "step": 7411 + }, + { + "epoch": 0.768162503886413, + "grad_norm": 0.7752522230148315, + "learning_rate": 5.378599622633754e-06, + "loss": 0.27, + "step": 7412 + }, + { + "epoch": 0.7682661415690745, + "grad_norm": 0.7609741687774658, + "learning_rate": 5.374019793948619e-06, + "loss": 0.2123, + "step": 7413 + }, + { + "epoch": 0.7683697792517359, + "grad_norm": 0.567396879196167, + "learning_rate": 5.369441613296191e-06, + "loss": 0.1608, + "step": 7414 + }, + { + "epoch": 0.7684734169343973, + "grad_norm": 0.6172008514404297, + "learning_rate": 5.3648650811923476e-06, + "loss": 0.1783, + "step": 7415 + }, + { + "epoch": 0.7685770546170587, + "grad_norm": 0.6998493075370789, + "learning_rate": 5.3602901981527535e-06, + "loss": 0.2033, + "step": 7416 + }, + { + "epoch": 0.7686806922997201, + "grad_norm": 0.6385000348091125, + "learning_rate": 5.355716964692896e-06, + "loss": 0.1916, + "step": 7417 + }, + { + "epoch": 0.7687843299823816, + "grad_norm": 0.6977570652961731, + "learning_rate": 5.351145381328091e-06, + "loss": 0.1962, + "step": 7418 + }, + { + "epoch": 0.768887967665043, + "grad_norm": 0.8362807631492615, + "learning_rate": 5.346575448573448e-06, + "loss": 0.2188, + "step": 7419 + }, + { + "epoch": 0.7689916053477044, + "grad_norm": 0.6531651020050049, + "learning_rate": 5.34200716694391e-06, + "loss": 0.2103, + "step": 7420 + }, + { + "epoch": 0.7690952430303658, + "grad_norm": 0.7263716459274292, + "learning_rate": 5.337440536954213e-06, + "loss": 0.224, + "step": 7421 + }, + { + "epoch": 0.7691988807130272, + "grad_norm": 0.6424431204795837, + "learning_rate": 5.332875559118926e-06, + "loss": 0.1877, + "step": 7422 + }, + { + "epoch": 0.7693025183956886, + "grad_norm": 0.5936616063117981, + "learning_rate": 5.328312233952424e-06, + "loss": 0.1721, + "step": 7423 + }, + { + "epoch": 0.7694061560783501, + "grad_norm": 0.6746377348899841, + "learning_rate": 5.323750561968883e-06, + "loss": 0.1984, + "step": 7424 + }, + { + "epoch": 0.7695097937610115, + "grad_norm": 0.6120424866676331, + "learning_rate": 5.319190543682322e-06, + "loss": 0.1611, + "step": 7425 + }, + { + "epoch": 0.7696134314436729, + "grad_norm": 0.6708462834358215, + "learning_rate": 5.314632179606542e-06, + "loss": 0.1872, + "step": 7426 + }, + { + "epoch": 0.7697170691263343, + "grad_norm": 0.5981742739677429, + "learning_rate": 5.31007547025518e-06, + "loss": 0.2012, + "step": 7427 + }, + { + "epoch": 0.7698207068089957, + "grad_norm": 0.6955262422561646, + "learning_rate": 5.30552041614169e-06, + "loss": 0.2084, + "step": 7428 + }, + { + "epoch": 0.7699243444916571, + "grad_norm": 0.6701587438583374, + "learning_rate": 5.300967017779304e-06, + "loss": 0.2103, + "step": 7429 + }, + { + "epoch": 0.7700279821743186, + "grad_norm": 0.620948851108551, + "learning_rate": 5.296415275681108e-06, + "loss": 0.188, + "step": 7430 + }, + { + "epoch": 0.77013161985698, + "grad_norm": 0.578031599521637, + "learning_rate": 5.291865190359979e-06, + "loss": 0.1507, + "step": 7431 + }, + { + "epoch": 0.7702352575396414, + "grad_norm": 0.6619542837142944, + "learning_rate": 5.287316762328614e-06, + "loss": 0.1747, + "step": 7432 + }, + { + "epoch": 0.7703388952223028, + "grad_norm": 0.6650355458259583, + "learning_rate": 5.282769992099532e-06, + "loss": 0.1927, + "step": 7433 + }, + { + "epoch": 0.7704425329049642, + "grad_norm": 0.6092379689216614, + "learning_rate": 5.27822488018505e-06, + "loss": 0.1978, + "step": 7434 + }, + { + "epoch": 0.7705461705876256, + "grad_norm": 0.6268170475959778, + "learning_rate": 5.273681427097302e-06, + "loss": 0.1838, + "step": 7435 + }, + { + "epoch": 0.7706498082702871, + "grad_norm": 0.6930217146873474, + "learning_rate": 5.269139633348231e-06, + "loss": 0.232, + "step": 7436 + }, + { + "epoch": 0.7707534459529485, + "grad_norm": 0.6758466958999634, + "learning_rate": 5.264599499449607e-06, + "loss": 0.1875, + "step": 7437 + }, + { + "epoch": 0.7708570836356099, + "grad_norm": 0.5889658331871033, + "learning_rate": 5.260061025913013e-06, + "loss": 0.1626, + "step": 7438 + }, + { + "epoch": 0.7709607213182713, + "grad_norm": 0.6119863390922546, + "learning_rate": 5.255524213249821e-06, + "loss": 0.1937, + "step": 7439 + }, + { + "epoch": 0.7710643590009327, + "grad_norm": 0.6818948984146118, + "learning_rate": 5.2509890619712455e-06, + "loss": 0.198, + "step": 7440 + }, + { + "epoch": 0.7711679966835941, + "grad_norm": 0.597781777381897, + "learning_rate": 5.246455572588296e-06, + "loss": 0.1981, + "step": 7441 + }, + { + "epoch": 0.7712716343662556, + "grad_norm": 0.6981128454208374, + "learning_rate": 5.24192374561179e-06, + "loss": 0.2089, + "step": 7442 + }, + { + "epoch": 0.771375272048917, + "grad_norm": 0.6379117965698242, + "learning_rate": 5.237393581552381e-06, + "loss": 0.171, + "step": 7443 + }, + { + "epoch": 0.7714789097315784, + "grad_norm": 0.6575020551681519, + "learning_rate": 5.23286508092051e-06, + "loss": 0.1762, + "step": 7444 + }, + { + "epoch": 0.7715825474142398, + "grad_norm": 0.7529862523078918, + "learning_rate": 5.22833824422645e-06, + "loss": 0.2192, + "step": 7445 + }, + { + "epoch": 0.7716861850969012, + "grad_norm": 0.7869521975517273, + "learning_rate": 5.223813071980268e-06, + "loss": 0.2304, + "step": 7446 + }, + { + "epoch": 0.7717898227795627, + "grad_norm": 0.6265051960945129, + "learning_rate": 5.219289564691865e-06, + "loss": 0.1852, + "step": 7447 + }, + { + "epoch": 0.7718934604622241, + "grad_norm": 0.6529081463813782, + "learning_rate": 5.21476772287094e-06, + "loss": 0.1881, + "step": 7448 + }, + { + "epoch": 0.7719970981448855, + "grad_norm": 0.5538001656532288, + "learning_rate": 5.210247547026994e-06, + "loss": 0.1643, + "step": 7449 + }, + { + "epoch": 0.7721007358275469, + "grad_norm": 0.6321317553520203, + "learning_rate": 5.205729037669369e-06, + "loss": 0.1948, + "step": 7450 + }, + { + "epoch": 0.7722043735102083, + "grad_norm": 0.7393016815185547, + "learning_rate": 5.201212195307195e-06, + "loss": 0.2235, + "step": 7451 + }, + { + "epoch": 0.7723080111928697, + "grad_norm": 0.7127220034599304, + "learning_rate": 5.196697020449422e-06, + "loss": 0.1997, + "step": 7452 + }, + { + "epoch": 0.7724116488755312, + "grad_norm": 0.6728219389915466, + "learning_rate": 5.1921835136048246e-06, + "loss": 0.1959, + "step": 7453 + }, + { + "epoch": 0.7725152865581926, + "grad_norm": 0.6634606719017029, + "learning_rate": 5.187671675281969e-06, + "loss": 0.1965, + "step": 7454 + }, + { + "epoch": 0.772618924240854, + "grad_norm": 0.6769964694976807, + "learning_rate": 5.183161505989245e-06, + "loss": 0.2009, + "step": 7455 + }, + { + "epoch": 0.7727225619235154, + "grad_norm": 0.7333029508590698, + "learning_rate": 5.178653006234842e-06, + "loss": 0.2321, + "step": 7456 + }, + { + "epoch": 0.7728261996061768, + "grad_norm": 0.6659340858459473, + "learning_rate": 5.174146176526777e-06, + "loss": 0.2039, + "step": 7457 + }, + { + "epoch": 0.7729298372888382, + "grad_norm": 0.5503813028335571, + "learning_rate": 5.16964101737288e-06, + "loss": 0.1678, + "step": 7458 + }, + { + "epoch": 0.7730334749714997, + "grad_norm": 0.7269548177719116, + "learning_rate": 5.165137529280773e-06, + "loss": 0.2382, + "step": 7459 + }, + { + "epoch": 0.7731371126541611, + "grad_norm": 0.5385704040527344, + "learning_rate": 5.16063571275792e-06, + "loss": 0.1917, + "step": 7460 + }, + { + "epoch": 0.7732407503368225, + "grad_norm": 0.5735783576965332, + "learning_rate": 5.1561355683115556e-06, + "loss": 0.1565, + "step": 7461 + }, + { + "epoch": 0.7733443880194839, + "grad_norm": 0.5853539705276489, + "learning_rate": 5.151637096448759e-06, + "loss": 0.1584, + "step": 7462 + }, + { + "epoch": 0.7734480257021453, + "grad_norm": 0.7648028135299683, + "learning_rate": 5.147140297676419e-06, + "loss": 0.2353, + "step": 7463 + }, + { + "epoch": 0.7735516633848067, + "grad_norm": 0.6179225444793701, + "learning_rate": 5.142645172501213e-06, + "loss": 0.1658, + "step": 7464 + }, + { + "epoch": 0.7736553010674682, + "grad_norm": 0.6266987323760986, + "learning_rate": 5.138151721429661e-06, + "loss": 0.1843, + "step": 7465 + }, + { + "epoch": 0.7737589387501296, + "grad_norm": 0.6973028779029846, + "learning_rate": 5.133659944968068e-06, + "loss": 0.1981, + "step": 7466 + }, + { + "epoch": 0.773862576432791, + "grad_norm": 0.6442803740501404, + "learning_rate": 5.129169843622559e-06, + "loss": 0.1957, + "step": 7467 + }, + { + "epoch": 0.7739662141154524, + "grad_norm": 0.6411221027374268, + "learning_rate": 5.124681417899078e-06, + "loss": 0.1768, + "step": 7468 + }, + { + "epoch": 0.7740698517981138, + "grad_norm": 0.5441047549247742, + "learning_rate": 5.120194668303367e-06, + "loss": 0.149, + "step": 7469 + }, + { + "epoch": 0.7741734894807752, + "grad_norm": 0.6542971730232239, + "learning_rate": 5.1157095953409945e-06, + "loss": 0.1912, + "step": 7470 + }, + { + "epoch": 0.7742771271634367, + "grad_norm": 0.6243970394134521, + "learning_rate": 5.1112261995173204e-06, + "loss": 0.1683, + "step": 7471 + }, + { + "epoch": 0.7743807648460981, + "grad_norm": 0.6158422231674194, + "learning_rate": 5.106744481337538e-06, + "loss": 0.1964, + "step": 7472 + }, + { + "epoch": 0.7744844025287595, + "grad_norm": 0.6128589510917664, + "learning_rate": 5.102264441306637e-06, + "loss": 0.1793, + "step": 7473 + }, + { + "epoch": 0.7745880402114209, + "grad_norm": 0.7475075721740723, + "learning_rate": 5.097786079929414e-06, + "loss": 0.2269, + "step": 7474 + }, + { + "epoch": 0.7746916778940823, + "grad_norm": 0.639706015586853, + "learning_rate": 5.093309397710496e-06, + "loss": 0.1853, + "step": 7475 + }, + { + "epoch": 0.7747953155767437, + "grad_norm": 0.656455397605896, + "learning_rate": 5.088834395154294e-06, + "loss": 0.2121, + "step": 7476 + }, + { + "epoch": 0.7748989532594052, + "grad_norm": 0.6708592772483826, + "learning_rate": 5.084361072765054e-06, + "loss": 0.2067, + "step": 7477 + }, + { + "epoch": 0.7750025909420666, + "grad_norm": 0.7351187467575073, + "learning_rate": 5.079889431046827e-06, + "loss": 0.2239, + "step": 7478 + }, + { + "epoch": 0.775106228624728, + "grad_norm": 0.6682601571083069, + "learning_rate": 5.0754194705034665e-06, + "loss": 0.1995, + "step": 7479 + }, + { + "epoch": 0.7752098663073894, + "grad_norm": 0.5769458413124084, + "learning_rate": 5.070951191638638e-06, + "loss": 0.1914, + "step": 7480 + }, + { + "epoch": 0.7753135039900508, + "grad_norm": 0.549268901348114, + "learning_rate": 5.066484594955816e-06, + "loss": 0.155, + "step": 7481 + }, + { + "epoch": 0.7754171416727123, + "grad_norm": 0.7133857011795044, + "learning_rate": 5.062019680958297e-06, + "loss": 0.2038, + "step": 7482 + }, + { + "epoch": 0.7755207793553737, + "grad_norm": 0.6121452450752258, + "learning_rate": 5.057556450149181e-06, + "loss": 0.1537, + "step": 7483 + }, + { + "epoch": 0.775624417038035, + "grad_norm": 0.762610137462616, + "learning_rate": 5.053094903031372e-06, + "loss": 0.2088, + "step": 7484 + }, + { + "epoch": 0.7757280547206964, + "grad_norm": 0.7283558249473572, + "learning_rate": 5.0486350401076014e-06, + "loss": 0.2135, + "step": 7485 + }, + { + "epoch": 0.7758316924033578, + "grad_norm": 0.7834716439247131, + "learning_rate": 5.04417686188039e-06, + "loss": 0.2105, + "step": 7486 + }, + { + "epoch": 0.7759353300860192, + "grad_norm": 0.7361102104187012, + "learning_rate": 5.039720368852075e-06, + "loss": 0.1951, + "step": 7487 + }, + { + "epoch": 0.7760389677686806, + "grad_norm": 0.5629470348358154, + "learning_rate": 5.035265561524818e-06, + "loss": 0.1637, + "step": 7488 + }, + { + "epoch": 0.7761426054513421, + "grad_norm": 0.6459644436836243, + "learning_rate": 5.030812440400567e-06, + "loss": 0.1788, + "step": 7489 + }, + { + "epoch": 0.7762462431340035, + "grad_norm": 0.5973713994026184, + "learning_rate": 5.026361005981109e-06, + "loss": 0.161, + "step": 7490 + }, + { + "epoch": 0.7763498808166649, + "grad_norm": 0.6738793849945068, + "learning_rate": 5.021911258768015e-06, + "loss": 0.1773, + "step": 7491 + }, + { + "epoch": 0.7764535184993263, + "grad_norm": 0.6055402159690857, + "learning_rate": 5.017463199262671e-06, + "loss": 0.1879, + "step": 7492 + }, + { + "epoch": 0.7765571561819877, + "grad_norm": 0.6321949362754822, + "learning_rate": 5.013016827966289e-06, + "loss": 0.1795, + "step": 7493 + }, + { + "epoch": 0.7766607938646491, + "grad_norm": 0.6103180646896362, + "learning_rate": 5.008572145379866e-06, + "loss": 0.1861, + "step": 7494 + }, + { + "epoch": 0.7767644315473106, + "grad_norm": 0.6383140087127686, + "learning_rate": 5.004129152004236e-06, + "loss": 0.1591, + "step": 7495 + }, + { + "epoch": 0.776868069229972, + "grad_norm": 0.6704176068305969, + "learning_rate": 4.999687848340013e-06, + "loss": 0.1914, + "step": 7496 + }, + { + "epoch": 0.7769717069126334, + "grad_norm": 0.7036976218223572, + "learning_rate": 4.995248234887655e-06, + "loss": 0.1977, + "step": 7497 + }, + { + "epoch": 0.7770753445952948, + "grad_norm": 0.6711276173591614, + "learning_rate": 4.990810312147398e-06, + "loss": 0.1765, + "step": 7498 + }, + { + "epoch": 0.7771789822779562, + "grad_norm": 0.6839596629142761, + "learning_rate": 4.9863740806192965e-06, + "loss": 0.2104, + "step": 7499 + }, + { + "epoch": 0.7772826199606176, + "grad_norm": 0.6389532685279846, + "learning_rate": 4.9819395408032316e-06, + "loss": 0.198, + "step": 7500 + }, + { + "epoch": 0.7773862576432791, + "grad_norm": 0.6310150027275085, + "learning_rate": 4.977506693198868e-06, + "loss": 0.1597, + "step": 7501 + }, + { + "epoch": 0.7774898953259405, + "grad_norm": 0.6443967223167419, + "learning_rate": 4.973075538305696e-06, + "loss": 0.1648, + "step": 7502 + }, + { + "epoch": 0.7775935330086019, + "grad_norm": 0.7315575480461121, + "learning_rate": 4.968646076623018e-06, + "loss": 0.2289, + "step": 7503 + }, + { + "epoch": 0.7776971706912633, + "grad_norm": 0.6594371795654297, + "learning_rate": 4.964218308649933e-06, + "loss": 0.189, + "step": 7504 + }, + { + "epoch": 0.7778008083739247, + "grad_norm": 0.6741247177124023, + "learning_rate": 4.959792234885357e-06, + "loss": 0.1891, + "step": 7505 + }, + { + "epoch": 0.7779044460565862, + "grad_norm": 0.5423347353935242, + "learning_rate": 4.955367855828006e-06, + "loss": 0.174, + "step": 7506 + }, + { + "epoch": 0.7780080837392476, + "grad_norm": 0.6483452916145325, + "learning_rate": 4.9509451719764155e-06, + "loss": 0.1826, + "step": 7507 + }, + { + "epoch": 0.778111721421909, + "grad_norm": 0.6669498085975647, + "learning_rate": 4.946524183828935e-06, + "loss": 0.1766, + "step": 7508 + }, + { + "epoch": 0.7782153591045704, + "grad_norm": 0.7897505164146423, + "learning_rate": 4.942104891883706e-06, + "loss": 0.1816, + "step": 7509 + }, + { + "epoch": 0.7783189967872318, + "grad_norm": 0.6134946346282959, + "learning_rate": 4.9376872966386915e-06, + "loss": 0.1867, + "step": 7510 + }, + { + "epoch": 0.7784226344698932, + "grad_norm": 0.6003133654594421, + "learning_rate": 4.933271398591659e-06, + "loss": 0.1704, + "step": 7511 + }, + { + "epoch": 0.7785262721525547, + "grad_norm": 0.6122907400131226, + "learning_rate": 4.928857198240178e-06, + "loss": 0.1665, + "step": 7512 + }, + { + "epoch": 0.7786299098352161, + "grad_norm": 0.6975659132003784, + "learning_rate": 4.924444696081645e-06, + "loss": 0.1831, + "step": 7513 + }, + { + "epoch": 0.7787335475178775, + "grad_norm": 0.5816099643707275, + "learning_rate": 4.9200338926132426e-06, + "loss": 0.1692, + "step": 7514 + }, + { + "epoch": 0.7788371852005389, + "grad_norm": 0.7342694401741028, + "learning_rate": 4.915624788331985e-06, + "loss": 0.1865, + "step": 7515 + }, + { + "epoch": 0.7789408228832003, + "grad_norm": 0.6069790720939636, + "learning_rate": 4.911217383734672e-06, + "loss": 0.1633, + "step": 7516 + }, + { + "epoch": 0.7790444605658617, + "grad_norm": 0.728117048740387, + "learning_rate": 4.906811679317933e-06, + "loss": 0.2204, + "step": 7517 + }, + { + "epoch": 0.7791480982485232, + "grad_norm": 0.6618308424949646, + "learning_rate": 4.902407675578191e-06, + "loss": 0.1931, + "step": 7518 + }, + { + "epoch": 0.7792517359311846, + "grad_norm": 0.6637287139892578, + "learning_rate": 4.898005373011676e-06, + "loss": 0.1927, + "step": 7519 + }, + { + "epoch": 0.779355373613846, + "grad_norm": 0.6428270936012268, + "learning_rate": 4.893604772114446e-06, + "loss": 0.183, + "step": 7520 + }, + { + "epoch": 0.7794590112965074, + "grad_norm": 0.5997044444084167, + "learning_rate": 4.8892058733823415e-06, + "loss": 0.1783, + "step": 7521 + }, + { + "epoch": 0.7795626489791688, + "grad_norm": 0.703879177570343, + "learning_rate": 4.884808677311028e-06, + "loss": 0.199, + "step": 7522 + }, + { + "epoch": 0.7796662866618302, + "grad_norm": 0.7539674043655396, + "learning_rate": 4.880413184395989e-06, + "loss": 0.1822, + "step": 7523 + }, + { + "epoch": 0.7797699243444917, + "grad_norm": 0.7416384816169739, + "learning_rate": 4.876019395132474e-06, + "loss": 0.23, + "step": 7524 + }, + { + "epoch": 0.7798735620271531, + "grad_norm": 0.6457820534706116, + "learning_rate": 4.871627310015592e-06, + "loss": 0.1892, + "step": 7525 + }, + { + "epoch": 0.7799771997098145, + "grad_norm": 0.6704725027084351, + "learning_rate": 4.8672369295402175e-06, + "loss": 0.1884, + "step": 7526 + }, + { + "epoch": 0.7800808373924759, + "grad_norm": 0.643944501876831, + "learning_rate": 4.862848254201065e-06, + "loss": 0.1914, + "step": 7527 + }, + { + "epoch": 0.7801844750751373, + "grad_norm": 0.6778848767280579, + "learning_rate": 4.8584612844926436e-06, + "loss": 0.2133, + "step": 7528 + }, + { + "epoch": 0.7802881127577987, + "grad_norm": 0.6219654083251953, + "learning_rate": 4.854076020909268e-06, + "loss": 0.1786, + "step": 7529 + }, + { + "epoch": 0.7803917504404602, + "grad_norm": 0.6912334561347961, + "learning_rate": 4.8496924639450616e-06, + "loss": 0.201, + "step": 7530 + }, + { + "epoch": 0.7804953881231216, + "grad_norm": 0.596788227558136, + "learning_rate": 4.84531061409395e-06, + "loss": 0.1809, + "step": 7531 + }, + { + "epoch": 0.780599025805783, + "grad_norm": 0.6440489888191223, + "learning_rate": 4.8409304718496806e-06, + "loss": 0.1877, + "step": 7532 + }, + { + "epoch": 0.7807026634884444, + "grad_norm": 0.6101676821708679, + "learning_rate": 4.836552037705806e-06, + "loss": 0.1767, + "step": 7533 + }, + { + "epoch": 0.7808063011711058, + "grad_norm": 0.5853814482688904, + "learning_rate": 4.832175312155671e-06, + "loss": 0.1783, + "step": 7534 + }, + { + "epoch": 0.7809099388537672, + "grad_norm": 0.7949321866035461, + "learning_rate": 4.8278002956924466e-06, + "loss": 0.2089, + "step": 7535 + }, + { + "epoch": 0.7810135765364287, + "grad_norm": 0.7004521489143372, + "learning_rate": 4.8234269888091015e-06, + "loss": 0.199, + "step": 7536 + }, + { + "epoch": 0.7811172142190901, + "grad_norm": 0.8060917258262634, + "learning_rate": 4.819055391998404e-06, + "loss": 0.2149, + "step": 7537 + }, + { + "epoch": 0.7812208519017515, + "grad_norm": 0.6604312062263489, + "learning_rate": 4.814685505752951e-06, + "loss": 0.194, + "step": 7538 + }, + { + "epoch": 0.7813244895844129, + "grad_norm": 0.6711355447769165, + "learning_rate": 4.810317330565124e-06, + "loss": 0.1831, + "step": 7539 + }, + { + "epoch": 0.7814281272670743, + "grad_norm": 0.624002993106842, + "learning_rate": 4.805950866927134e-06, + "loss": 0.1685, + "step": 7540 + }, + { + "epoch": 0.7815317649497358, + "grad_norm": 0.694841206073761, + "learning_rate": 4.801586115330974e-06, + "loss": 0.1975, + "step": 7541 + }, + { + "epoch": 0.7816354026323972, + "grad_norm": 0.6798537969589233, + "learning_rate": 4.7972230762684695e-06, + "loss": 0.2092, + "step": 7542 + }, + { + "epoch": 0.7817390403150586, + "grad_norm": 0.7163605093955994, + "learning_rate": 4.792861750231235e-06, + "loss": 0.2637, + "step": 7543 + }, + { + "epoch": 0.78184267799772, + "grad_norm": 0.6866286396980286, + "learning_rate": 4.788502137710696e-06, + "loss": 0.2164, + "step": 7544 + }, + { + "epoch": 0.7819463156803814, + "grad_norm": 0.6416062116622925, + "learning_rate": 4.784144239198092e-06, + "loss": 0.1825, + "step": 7545 + }, + { + "epoch": 0.7820499533630428, + "grad_norm": 0.6997154355049133, + "learning_rate": 4.779788055184456e-06, + "loss": 0.2044, + "step": 7546 + }, + { + "epoch": 0.7821535910457043, + "grad_norm": 0.7446556091308594, + "learning_rate": 4.775433586160643e-06, + "loss": 0.2441, + "step": 7547 + }, + { + "epoch": 0.7822572287283657, + "grad_norm": 0.5584850907325745, + "learning_rate": 4.7710808326173115e-06, + "loss": 0.1662, + "step": 7548 + }, + { + "epoch": 0.7823608664110271, + "grad_norm": 0.6249669194221497, + "learning_rate": 4.7667297950449175e-06, + "loss": 0.1936, + "step": 7549 + }, + { + "epoch": 0.7824645040936885, + "grad_norm": 0.7454187273979187, + "learning_rate": 4.7623804739337294e-06, + "loss": 0.2051, + "step": 7550 + }, + { + "epoch": 0.7825681417763499, + "grad_norm": 0.735968828201294, + "learning_rate": 4.7580328697738185e-06, + "loss": 0.2207, + "step": 7551 + }, + { + "epoch": 0.7826717794590113, + "grad_norm": 0.6438806653022766, + "learning_rate": 4.753686983055068e-06, + "loss": 0.2113, + "step": 7552 + }, + { + "epoch": 0.7827754171416728, + "grad_norm": 0.7040858268737793, + "learning_rate": 4.749342814267175e-06, + "loss": 0.1905, + "step": 7553 + }, + { + "epoch": 0.7828790548243342, + "grad_norm": 0.7186905145645142, + "learning_rate": 4.7450003638996236e-06, + "loss": 0.2102, + "step": 7554 + }, + { + "epoch": 0.7829826925069956, + "grad_norm": 0.6388292908668518, + "learning_rate": 4.740659632441718e-06, + "loss": 0.1748, + "step": 7555 + }, + { + "epoch": 0.783086330189657, + "grad_norm": 0.7200673222541809, + "learning_rate": 4.736320620382557e-06, + "loss": 0.194, + "step": 7556 + }, + { + "epoch": 0.7831899678723184, + "grad_norm": 0.7050800323486328, + "learning_rate": 4.731983328211063e-06, + "loss": 0.2493, + "step": 7557 + }, + { + "epoch": 0.7832936055549798, + "grad_norm": 0.6410065293312073, + "learning_rate": 4.727647756415959e-06, + "loss": 0.1901, + "step": 7558 + }, + { + "epoch": 0.7833972432376413, + "grad_norm": 0.5777173638343811, + "learning_rate": 4.723313905485756e-06, + "loss": 0.145, + "step": 7559 + }, + { + "epoch": 0.7835008809203026, + "grad_norm": 0.6110885143280029, + "learning_rate": 4.718981775908802e-06, + "loss": 0.1884, + "step": 7560 + }, + { + "epoch": 0.783604518602964, + "grad_norm": 0.6367966532707214, + "learning_rate": 4.714651368173224e-06, + "loss": 0.1869, + "step": 7561 + }, + { + "epoch": 0.7837081562856254, + "grad_norm": 0.6776068210601807, + "learning_rate": 4.710322682766966e-06, + "loss": 0.1777, + "step": 7562 + }, + { + "epoch": 0.7838117939682868, + "grad_norm": 0.7195240259170532, + "learning_rate": 4.705995720177783e-06, + "loss": 0.2436, + "step": 7563 + }, + { + "epoch": 0.7839154316509482, + "grad_norm": 0.6103599071502686, + "learning_rate": 4.7016704808932215e-06, + "loss": 0.1693, + "step": 7564 + }, + { + "epoch": 0.7840190693336097, + "grad_norm": 0.5551150441169739, + "learning_rate": 4.697346965400655e-06, + "loss": 0.1532, + "step": 7565 + }, + { + "epoch": 0.7841227070162711, + "grad_norm": 0.7115580439567566, + "learning_rate": 4.69302517418724e-06, + "loss": 0.1867, + "step": 7566 + }, + { + "epoch": 0.7842263446989325, + "grad_norm": 0.6921216249465942, + "learning_rate": 4.688705107739957e-06, + "loss": 0.2031, + "step": 7567 + }, + { + "epoch": 0.7843299823815939, + "grad_norm": 0.667380690574646, + "learning_rate": 4.684386766545581e-06, + "loss": 0.1955, + "step": 7568 + }, + { + "epoch": 0.7844336200642553, + "grad_norm": 0.603564441204071, + "learning_rate": 4.68007015109069e-06, + "loss": 0.1552, + "step": 7569 + }, + { + "epoch": 0.7845372577469167, + "grad_norm": 0.6818835139274597, + "learning_rate": 4.675755261861683e-06, + "loss": 0.1894, + "step": 7570 + }, + { + "epoch": 0.7846408954295782, + "grad_norm": 0.6953232288360596, + "learning_rate": 4.671442099344748e-06, + "loss": 0.2172, + "step": 7571 + }, + { + "epoch": 0.7847445331122396, + "grad_norm": 0.5935460329055786, + "learning_rate": 4.667130664025887e-06, + "loss": 0.1555, + "step": 7572 + }, + { + "epoch": 0.784848170794901, + "grad_norm": 0.541191577911377, + "learning_rate": 4.662820956390914e-06, + "loss": 0.1599, + "step": 7573 + }, + { + "epoch": 0.7849518084775624, + "grad_norm": 0.6485776305198669, + "learning_rate": 4.6585129769254335e-06, + "loss": 0.1955, + "step": 7574 + }, + { + "epoch": 0.7850554461602238, + "grad_norm": 0.7501851320266724, + "learning_rate": 4.6542067261148605e-06, + "loss": 0.1685, + "step": 7575 + }, + { + "epoch": 0.7851590838428852, + "grad_norm": 0.7224445939064026, + "learning_rate": 4.6499022044444146e-06, + "loss": 0.1905, + "step": 7576 + }, + { + "epoch": 0.7852627215255467, + "grad_norm": 0.7130233645439148, + "learning_rate": 4.6455994123991245e-06, + "loss": 0.1917, + "step": 7577 + }, + { + "epoch": 0.7853663592082081, + "grad_norm": 0.6433622241020203, + "learning_rate": 4.641298350463829e-06, + "loss": 0.195, + "step": 7578 + }, + { + "epoch": 0.7854699968908695, + "grad_norm": 0.5959939956665039, + "learning_rate": 4.636999019123156e-06, + "loss": 0.1559, + "step": 7579 + }, + { + "epoch": 0.7855736345735309, + "grad_norm": 0.597389280796051, + "learning_rate": 4.632701418861556e-06, + "loss": 0.1695, + "step": 7580 + }, + { + "epoch": 0.7856772722561923, + "grad_norm": 0.669829249382019, + "learning_rate": 4.628405550163271e-06, + "loss": 0.2049, + "step": 7581 + }, + { + "epoch": 0.7857809099388537, + "grad_norm": 0.7399320602416992, + "learning_rate": 4.624111413512347e-06, + "loss": 0.2242, + "step": 7582 + }, + { + "epoch": 0.7858845476215152, + "grad_norm": 0.6723091006278992, + "learning_rate": 4.619819009392652e-06, + "loss": 0.1893, + "step": 7583 + }, + { + "epoch": 0.7859881853041766, + "grad_norm": 0.757306694984436, + "learning_rate": 4.615528338287838e-06, + "loss": 0.2211, + "step": 7584 + }, + { + "epoch": 0.786091822986838, + "grad_norm": 0.5871422290802002, + "learning_rate": 4.61123940068138e-06, + "loss": 0.1739, + "step": 7585 + }, + { + "epoch": 0.7861954606694994, + "grad_norm": 0.7001006603240967, + "learning_rate": 4.606952197056545e-06, + "loss": 0.1978, + "step": 7586 + }, + { + "epoch": 0.7862990983521608, + "grad_norm": 0.6934093832969666, + "learning_rate": 4.602666727896401e-06, + "loss": 0.2132, + "step": 7587 + }, + { + "epoch": 0.7864027360348222, + "grad_norm": 0.5585488677024841, + "learning_rate": 4.598382993683839e-06, + "loss": 0.1583, + "step": 7588 + }, + { + "epoch": 0.7865063737174837, + "grad_norm": 0.6785788536071777, + "learning_rate": 4.594100994901536e-06, + "loss": 0.1813, + "step": 7589 + }, + { + "epoch": 0.7866100114001451, + "grad_norm": 0.6068393588066101, + "learning_rate": 4.589820732031986e-06, + "loss": 0.1642, + "step": 7590 + }, + { + "epoch": 0.7867136490828065, + "grad_norm": 0.6990689039230347, + "learning_rate": 4.585542205557478e-06, + "loss": 0.1972, + "step": 7591 + }, + { + "epoch": 0.7868172867654679, + "grad_norm": 0.6085687279701233, + "learning_rate": 4.581265415960117e-06, + "loss": 0.1692, + "step": 7592 + }, + { + "epoch": 0.7869209244481293, + "grad_norm": 0.7180781960487366, + "learning_rate": 4.5769903637217985e-06, + "loss": 0.2019, + "step": 7593 + }, + { + "epoch": 0.7870245621307908, + "grad_norm": 0.6923770308494568, + "learning_rate": 4.5727170493242245e-06, + "loss": 0.1699, + "step": 7594 + }, + { + "epoch": 0.7871281998134522, + "grad_norm": 0.5988040566444397, + "learning_rate": 4.5684454732489195e-06, + "loss": 0.1571, + "step": 7595 + }, + { + "epoch": 0.7872318374961136, + "grad_norm": 0.6364946365356445, + "learning_rate": 4.564175635977181e-06, + "loss": 0.1648, + "step": 7596 + }, + { + "epoch": 0.787335475178775, + "grad_norm": 0.7644514441490173, + "learning_rate": 4.559907537990138e-06, + "loss": 0.2072, + "step": 7597 + }, + { + "epoch": 0.7874391128614364, + "grad_norm": 0.7390966415405273, + "learning_rate": 4.555641179768718e-06, + "loss": 0.2181, + "step": 7598 + }, + { + "epoch": 0.7875427505440978, + "grad_norm": 0.670900285243988, + "learning_rate": 4.551376561793641e-06, + "loss": 0.2063, + "step": 7599 + }, + { + "epoch": 0.7876463882267593, + "grad_norm": 0.6904001832008362, + "learning_rate": 4.547113684545437e-06, + "loss": 0.1957, + "step": 7600 + }, + { + "epoch": 0.7877500259094207, + "grad_norm": 0.6743488311767578, + "learning_rate": 4.542852548504435e-06, + "loss": 0.1843, + "step": 7601 + }, + { + "epoch": 0.7878536635920821, + "grad_norm": 0.7440134882926941, + "learning_rate": 4.538593154150779e-06, + "loss": 0.2007, + "step": 7602 + }, + { + "epoch": 0.7879573012747435, + "grad_norm": 0.6265433430671692, + "learning_rate": 4.534335501964417e-06, + "loss": 0.1843, + "step": 7603 + }, + { + "epoch": 0.7880609389574049, + "grad_norm": 0.6931333541870117, + "learning_rate": 4.530079592425083e-06, + "loss": 0.223, + "step": 7604 + }, + { + "epoch": 0.7881645766400663, + "grad_norm": 0.5931916236877441, + "learning_rate": 4.52582542601234e-06, + "loss": 0.1664, + "step": 7605 + }, + { + "epoch": 0.7882682143227278, + "grad_norm": 0.7090798020362854, + "learning_rate": 4.5215730032055305e-06, + "loss": 0.2137, + "step": 7606 + }, + { + "epoch": 0.7883718520053892, + "grad_norm": 0.6874864101409912, + "learning_rate": 4.517322324483808e-06, + "loss": 0.196, + "step": 7607 + }, + { + "epoch": 0.7884754896880506, + "grad_norm": 0.5806185603141785, + "learning_rate": 4.5130733903261435e-06, + "loss": 0.1846, + "step": 7608 + }, + { + "epoch": 0.788579127370712, + "grad_norm": 0.5684027671813965, + "learning_rate": 4.508826201211289e-06, + "loss": 0.143, + "step": 7609 + }, + { + "epoch": 0.7886827650533734, + "grad_norm": 0.6393711566925049, + "learning_rate": 4.504580757617818e-06, + "loss": 0.1733, + "step": 7610 + }, + { + "epoch": 0.7887864027360348, + "grad_norm": 0.617420494556427, + "learning_rate": 4.5003370600241024e-06, + "loss": 0.1706, + "step": 7611 + }, + { + "epoch": 0.7888900404186963, + "grad_norm": 0.6285762786865234, + "learning_rate": 4.496095108908314e-06, + "loss": 0.1777, + "step": 7612 + }, + { + "epoch": 0.7889936781013577, + "grad_norm": 0.6379365921020508, + "learning_rate": 4.491854904748425e-06, + "loss": 0.1945, + "step": 7613 + }, + { + "epoch": 0.7890973157840191, + "grad_norm": 0.7149636745452881, + "learning_rate": 4.487616448022214e-06, + "loss": 0.2048, + "step": 7614 + }, + { + "epoch": 0.7892009534666805, + "grad_norm": 0.7371107339859009, + "learning_rate": 4.483379739207268e-06, + "loss": 0.2181, + "step": 7615 + }, + { + "epoch": 0.7893045911493419, + "grad_norm": 0.81303870677948, + "learning_rate": 4.479144778780975e-06, + "loss": 0.2209, + "step": 7616 + }, + { + "epoch": 0.7894082288320033, + "grad_norm": 0.6847606897354126, + "learning_rate": 4.474911567220521e-06, + "loss": 0.1824, + "step": 7617 + }, + { + "epoch": 0.7895118665146648, + "grad_norm": 0.7516778707504272, + "learning_rate": 4.470680105002898e-06, + "loss": 0.2122, + "step": 7618 + }, + { + "epoch": 0.7896155041973262, + "grad_norm": 0.6628775596618652, + "learning_rate": 4.466450392604895e-06, + "loss": 0.1991, + "step": 7619 + }, + { + "epoch": 0.7897191418799876, + "grad_norm": 0.7115910053253174, + "learning_rate": 4.462222430503116e-06, + "loss": 0.1812, + "step": 7620 + }, + { + "epoch": 0.789822779562649, + "grad_norm": 0.6615688800811768, + "learning_rate": 4.457996219173961e-06, + "loss": 0.2134, + "step": 7621 + }, + { + "epoch": 0.7899264172453104, + "grad_norm": 0.7453076839447021, + "learning_rate": 4.453771759093628e-06, + "loss": 0.2128, + "step": 7622 + }, + { + "epoch": 0.7900300549279718, + "grad_norm": 0.6784555912017822, + "learning_rate": 4.4495490507381335e-06, + "loss": 0.1785, + "step": 7623 + }, + { + "epoch": 0.7901336926106333, + "grad_norm": 0.700274646282196, + "learning_rate": 4.445328094583276e-06, + "loss": 0.2061, + "step": 7624 + }, + { + "epoch": 0.7902373302932947, + "grad_norm": 0.5862249135971069, + "learning_rate": 4.441108891104664e-06, + "loss": 0.1685, + "step": 7625 + }, + { + "epoch": 0.7903409679759561, + "grad_norm": 0.7702421545982361, + "learning_rate": 4.4368914407777195e-06, + "loss": 0.2497, + "step": 7626 + }, + { + "epoch": 0.7904446056586175, + "grad_norm": 0.6641459465026855, + "learning_rate": 4.43267574407765e-06, + "loss": 0.1881, + "step": 7627 + }, + { + "epoch": 0.7905482433412789, + "grad_norm": 0.6433275938034058, + "learning_rate": 4.428461801479485e-06, + "loss": 0.1791, + "step": 7628 + }, + { + "epoch": 0.7906518810239404, + "grad_norm": 0.5558257699012756, + "learning_rate": 4.424249613458029e-06, + "loss": 0.1568, + "step": 7629 + }, + { + "epoch": 0.7907555187066018, + "grad_norm": 0.5986382961273193, + "learning_rate": 4.420039180487921e-06, + "loss": 0.1797, + "step": 7630 + }, + { + "epoch": 0.7908591563892632, + "grad_norm": 0.8127989768981934, + "learning_rate": 4.415830503043577e-06, + "loss": 0.2045, + "step": 7631 + }, + { + "epoch": 0.7909627940719246, + "grad_norm": 0.6428434252738953, + "learning_rate": 4.41162358159922e-06, + "loss": 0.186, + "step": 7632 + }, + { + "epoch": 0.791066431754586, + "grad_norm": 0.6171351075172424, + "learning_rate": 4.4074184166288926e-06, + "loss": 0.1645, + "step": 7633 + }, + { + "epoch": 0.7911700694372474, + "grad_norm": 0.5491045713424683, + "learning_rate": 4.4032150086064145e-06, + "loss": 0.1743, + "step": 7634 + }, + { + "epoch": 0.7912737071199089, + "grad_norm": 0.7326172590255737, + "learning_rate": 4.399013358005422e-06, + "loss": 0.2124, + "step": 7635 + }, + { + "epoch": 0.7913773448025702, + "grad_norm": 0.5941773653030396, + "learning_rate": 4.3948134652993566e-06, + "loss": 0.1853, + "step": 7636 + }, + { + "epoch": 0.7914809824852316, + "grad_norm": 0.7196014523506165, + "learning_rate": 4.390615330961452e-06, + "loss": 0.2059, + "step": 7637 + }, + { + "epoch": 0.791584620167893, + "grad_norm": 0.5386014580726624, + "learning_rate": 4.386418955464746e-06, + "loss": 0.1561, + "step": 7638 + }, + { + "epoch": 0.7916882578505544, + "grad_norm": 0.6675041913986206, + "learning_rate": 4.382224339282078e-06, + "loss": 0.196, + "step": 7639 + }, + { + "epoch": 0.7917918955332158, + "grad_norm": 0.6638970971107483, + "learning_rate": 4.3780314828860895e-06, + "loss": 0.1982, + "step": 7640 + }, + { + "epoch": 0.7918955332158772, + "grad_norm": 0.8089138269424438, + "learning_rate": 4.3738403867492355e-06, + "loss": 0.2275, + "step": 7641 + }, + { + "epoch": 0.7919991708985387, + "grad_norm": 0.7028430700302124, + "learning_rate": 4.369651051343748e-06, + "loss": 0.1993, + "step": 7642 + }, + { + "epoch": 0.7921028085812001, + "grad_norm": 0.6406500339508057, + "learning_rate": 4.365463477141691e-06, + "loss": 0.1906, + "step": 7643 + }, + { + "epoch": 0.7922064462638615, + "grad_norm": 0.6021590828895569, + "learning_rate": 4.361277664614902e-06, + "loss": 0.1731, + "step": 7644 + }, + { + "epoch": 0.7923100839465229, + "grad_norm": 0.6423798203468323, + "learning_rate": 4.357093614235033e-06, + "loss": 0.1857, + "step": 7645 + }, + { + "epoch": 0.7924137216291843, + "grad_norm": 0.6370554566383362, + "learning_rate": 4.3529113264735415e-06, + "loss": 0.2192, + "step": 7646 + }, + { + "epoch": 0.7925173593118457, + "grad_norm": 0.7619375586509705, + "learning_rate": 4.3487308018016724e-06, + "loss": 0.2125, + "step": 7647 + }, + { + "epoch": 0.7926209969945072, + "grad_norm": 0.776597797870636, + "learning_rate": 4.344552040690491e-06, + "loss": 0.2391, + "step": 7648 + }, + { + "epoch": 0.7927246346771686, + "grad_norm": 0.6406593918800354, + "learning_rate": 4.340375043610849e-06, + "loss": 0.2062, + "step": 7649 + }, + { + "epoch": 0.79282827235983, + "grad_norm": 0.555220365524292, + "learning_rate": 4.336199811033399e-06, + "loss": 0.1435, + "step": 7650 + }, + { + "epoch": 0.7929319100424914, + "grad_norm": 0.6545402407646179, + "learning_rate": 4.33202634342861e-06, + "loss": 0.1921, + "step": 7651 + }, + { + "epoch": 0.7930355477251528, + "grad_norm": 0.6345674395561218, + "learning_rate": 4.327854641266731e-06, + "loss": 0.1772, + "step": 7652 + }, + { + "epoch": 0.7931391854078143, + "grad_norm": 0.7767759561538696, + "learning_rate": 4.323684705017832e-06, + "loss": 0.2258, + "step": 7653 + }, + { + "epoch": 0.7932428230904757, + "grad_norm": 0.671420156955719, + "learning_rate": 4.3195165351517665e-06, + "loss": 0.1911, + "step": 7654 + }, + { + "epoch": 0.7933464607731371, + "grad_norm": 0.6630171537399292, + "learning_rate": 4.31535013213821e-06, + "loss": 0.1895, + "step": 7655 + }, + { + "epoch": 0.7934500984557985, + "grad_norm": 0.6785514950752258, + "learning_rate": 4.311185496446615e-06, + "loss": 0.197, + "step": 7656 + }, + { + "epoch": 0.7935537361384599, + "grad_norm": 0.6731833219528198, + "learning_rate": 4.307022628546245e-06, + "loss": 0.1895, + "step": 7657 + }, + { + "epoch": 0.7936573738211213, + "grad_norm": 0.6627957820892334, + "learning_rate": 4.302861528906175e-06, + "loss": 0.1866, + "step": 7658 + }, + { + "epoch": 0.7937610115037828, + "grad_norm": 0.6604629158973694, + "learning_rate": 4.2987021979952614e-06, + "loss": 0.1938, + "step": 7659 + }, + { + "epoch": 0.7938646491864442, + "grad_norm": 0.64192795753479, + "learning_rate": 4.294544636282176e-06, + "loss": 0.1723, + "step": 7660 + }, + { + "epoch": 0.7939682868691056, + "grad_norm": 0.7034128904342651, + "learning_rate": 4.290388844235393e-06, + "loss": 0.2055, + "step": 7661 + }, + { + "epoch": 0.794071924551767, + "grad_norm": 0.653624951839447, + "learning_rate": 4.286234822323172e-06, + "loss": 0.198, + "step": 7662 + }, + { + "epoch": 0.7941755622344284, + "grad_norm": 0.5965669751167297, + "learning_rate": 4.282082571013586e-06, + "loss": 0.1825, + "step": 7663 + }, + { + "epoch": 0.7942791999170898, + "grad_norm": 0.5858683586120605, + "learning_rate": 4.277932090774495e-06, + "loss": 0.1663, + "step": 7664 + }, + { + "epoch": 0.7943828375997513, + "grad_norm": 0.6321005821228027, + "learning_rate": 4.2737833820735755e-06, + "loss": 0.1959, + "step": 7665 + }, + { + "epoch": 0.7944864752824127, + "grad_norm": 0.7521500587463379, + "learning_rate": 4.269636445378302e-06, + "loss": 0.2037, + "step": 7666 + }, + { + "epoch": 0.7945901129650741, + "grad_norm": 0.6142362356185913, + "learning_rate": 4.265491281155938e-06, + "loss": 0.1817, + "step": 7667 + }, + { + "epoch": 0.7946937506477355, + "grad_norm": 0.6033626198768616, + "learning_rate": 4.261347889873559e-06, + "loss": 0.1917, + "step": 7668 + }, + { + "epoch": 0.7947973883303969, + "grad_norm": 0.6412879228591919, + "learning_rate": 4.257206271998035e-06, + "loss": 0.2057, + "step": 7669 + }, + { + "epoch": 0.7949010260130583, + "grad_norm": 0.6569292545318604, + "learning_rate": 4.2530664279960306e-06, + "loss": 0.174, + "step": 7670 + }, + { + "epoch": 0.7950046636957198, + "grad_norm": 0.6312701106071472, + "learning_rate": 4.248928358334028e-06, + "loss": 0.1919, + "step": 7671 + }, + { + "epoch": 0.7951083013783812, + "grad_norm": 0.538118302822113, + "learning_rate": 4.244792063478285e-06, + "loss": 0.1808, + "step": 7672 + }, + { + "epoch": 0.7952119390610426, + "grad_norm": 0.6475589275360107, + "learning_rate": 4.240657543894886e-06, + "loss": 0.1569, + "step": 7673 + }, + { + "epoch": 0.795315576743704, + "grad_norm": 0.5423773527145386, + "learning_rate": 4.236524800049693e-06, + "loss": 0.1594, + "step": 7674 + }, + { + "epoch": 0.7954192144263654, + "grad_norm": 0.6256198883056641, + "learning_rate": 4.232393832408386e-06, + "loss": 0.1882, + "step": 7675 + }, + { + "epoch": 0.7955228521090268, + "grad_norm": 0.6854049563407898, + "learning_rate": 4.228264641436428e-06, + "loss": 0.1953, + "step": 7676 + }, + { + "epoch": 0.7956264897916883, + "grad_norm": 0.5907077193260193, + "learning_rate": 4.22413722759909e-06, + "loss": 0.1858, + "step": 7677 + }, + { + "epoch": 0.7957301274743497, + "grad_norm": 0.8190000653266907, + "learning_rate": 4.220011591361451e-06, + "loss": 0.2183, + "step": 7678 + }, + { + "epoch": 0.7958337651570111, + "grad_norm": 0.8084255456924438, + "learning_rate": 4.215887733188367e-06, + "loss": 0.2142, + "step": 7679 + }, + { + "epoch": 0.7959374028396725, + "grad_norm": 0.6585685610771179, + "learning_rate": 4.211765653544524e-06, + "loss": 0.1927, + "step": 7680 + }, + { + "epoch": 0.7960410405223339, + "grad_norm": 0.6803631782531738, + "learning_rate": 4.2076453528943824e-06, + "loss": 0.1931, + "step": 7681 + }, + { + "epoch": 0.7961446782049953, + "grad_norm": 0.616976261138916, + "learning_rate": 4.203526831702207e-06, + "loss": 0.1928, + "step": 7682 + }, + { + "epoch": 0.7962483158876568, + "grad_norm": 0.6163725256919861, + "learning_rate": 4.199410090432079e-06, + "loss": 0.1834, + "step": 7683 + }, + { + "epoch": 0.7963519535703182, + "grad_norm": 0.6457942724227905, + "learning_rate": 4.195295129547854e-06, + "loss": 0.2003, + "step": 7684 + }, + { + "epoch": 0.7964555912529796, + "grad_norm": 0.5381678342819214, + "learning_rate": 4.191181949513206e-06, + "loss": 0.1674, + "step": 7685 + }, + { + "epoch": 0.796559228935641, + "grad_norm": 0.7502049207687378, + "learning_rate": 4.187070550791603e-06, + "loss": 0.2316, + "step": 7686 + }, + { + "epoch": 0.7966628666183024, + "grad_norm": 0.6284844875335693, + "learning_rate": 4.182960933846311e-06, + "loss": 0.1994, + "step": 7687 + }, + { + "epoch": 0.7967665043009639, + "grad_norm": 0.6811497807502747, + "learning_rate": 4.178853099140392e-06, + "loss": 0.2126, + "step": 7688 + }, + { + "epoch": 0.7968701419836253, + "grad_norm": 0.6516885757446289, + "learning_rate": 4.174747047136707e-06, + "loss": 0.1746, + "step": 7689 + }, + { + "epoch": 0.7969737796662867, + "grad_norm": 0.6377501487731934, + "learning_rate": 4.170642778297922e-06, + "loss": 0.2046, + "step": 7690 + }, + { + "epoch": 0.7970774173489481, + "grad_norm": 0.5635381937026978, + "learning_rate": 4.166540293086509e-06, + "loss": 0.151, + "step": 7691 + }, + { + "epoch": 0.7971810550316095, + "grad_norm": 0.5966200828552246, + "learning_rate": 4.162439591964716e-06, + "loss": 0.16, + "step": 7692 + }, + { + "epoch": 0.7972846927142709, + "grad_norm": 0.6098965406417847, + "learning_rate": 4.158340675394614e-06, + "loss": 0.1992, + "step": 7693 + }, + { + "epoch": 0.7973883303969324, + "grad_norm": 0.6058703660964966, + "learning_rate": 4.154243543838059e-06, + "loss": 0.1873, + "step": 7694 + }, + { + "epoch": 0.7974919680795938, + "grad_norm": 0.7015929222106934, + "learning_rate": 4.150148197756705e-06, + "loss": 0.2142, + "step": 7695 + }, + { + "epoch": 0.7975956057622552, + "grad_norm": 0.7512447834014893, + "learning_rate": 4.146054637612016e-06, + "loss": 0.2191, + "step": 7696 + }, + { + "epoch": 0.7976992434449166, + "grad_norm": 0.7451758980751038, + "learning_rate": 4.141962863865241e-06, + "loss": 0.1941, + "step": 7697 + }, + { + "epoch": 0.797802881127578, + "grad_norm": 0.6696723103523254, + "learning_rate": 4.137872876977445e-06, + "loss": 0.1771, + "step": 7698 + }, + { + "epoch": 0.7979065188102394, + "grad_norm": 0.6065865755081177, + "learning_rate": 4.13378467740947e-06, + "loss": 0.2062, + "step": 7699 + }, + { + "epoch": 0.7980101564929009, + "grad_norm": 0.7292668223381042, + "learning_rate": 4.129698265621975e-06, + "loss": 0.2209, + "step": 7700 + }, + { + "epoch": 0.7981137941755623, + "grad_norm": 0.6175323128700256, + "learning_rate": 4.12561364207541e-06, + "loss": 0.1771, + "step": 7701 + }, + { + "epoch": 0.7982174318582237, + "grad_norm": 0.6026126146316528, + "learning_rate": 4.1215308072300185e-06, + "loss": 0.167, + "step": 7702 + }, + { + "epoch": 0.7983210695408851, + "grad_norm": 0.6252270340919495, + "learning_rate": 4.117449761545858e-06, + "loss": 0.1731, + "step": 7703 + }, + { + "epoch": 0.7984247072235465, + "grad_norm": 0.6011770963668823, + "learning_rate": 4.113370505482761e-06, + "loss": 0.1856, + "step": 7704 + }, + { + "epoch": 0.7985283449062079, + "grad_norm": 0.7582604885101318, + "learning_rate": 4.109293039500379e-06, + "loss": 0.2264, + "step": 7705 + }, + { + "epoch": 0.7986319825888694, + "grad_norm": 0.6858969926834106, + "learning_rate": 4.105217364058161e-06, + "loss": 0.2149, + "step": 7706 + }, + { + "epoch": 0.7987356202715308, + "grad_norm": 0.5900029540061951, + "learning_rate": 4.101143479615342e-06, + "loss": 0.1788, + "step": 7707 + }, + { + "epoch": 0.7988392579541922, + "grad_norm": 0.6448624730110168, + "learning_rate": 4.097071386630959e-06, + "loss": 0.1975, + "step": 7708 + }, + { + "epoch": 0.7989428956368536, + "grad_norm": 0.6842593550682068, + "learning_rate": 4.093001085563848e-06, + "loss": 0.2097, + "step": 7709 + }, + { + "epoch": 0.799046533319515, + "grad_norm": 0.6258800625801086, + "learning_rate": 4.088932576872644e-06, + "loss": 0.1821, + "step": 7710 + }, + { + "epoch": 0.7991501710021764, + "grad_norm": 0.7252703309059143, + "learning_rate": 4.08486586101579e-06, + "loss": 0.2068, + "step": 7711 + }, + { + "epoch": 0.7992538086848378, + "grad_norm": 0.6990664601325989, + "learning_rate": 4.08080093845151e-06, + "loss": 0.1987, + "step": 7712 + }, + { + "epoch": 0.7993574463674992, + "grad_norm": 0.5749051570892334, + "learning_rate": 4.076737809637832e-06, + "loss": 0.1504, + "step": 7713 + }, + { + "epoch": 0.7994610840501606, + "grad_norm": 0.7165976762771606, + "learning_rate": 4.07267647503258e-06, + "loss": 0.1921, + "step": 7714 + }, + { + "epoch": 0.799564721732822, + "grad_norm": 0.6567685008049011, + "learning_rate": 4.068616935093383e-06, + "loss": 0.1897, + "step": 7715 + }, + { + "epoch": 0.7996683594154834, + "grad_norm": 0.601642906665802, + "learning_rate": 4.0645591902776705e-06, + "loss": 0.15, + "step": 7716 + }, + { + "epoch": 0.7997719970981448, + "grad_norm": 0.6340222954750061, + "learning_rate": 4.06050324104265e-06, + "loss": 0.1769, + "step": 7717 + }, + { + "epoch": 0.7998756347808063, + "grad_norm": 0.6264651417732239, + "learning_rate": 4.056449087845351e-06, + "loss": 0.1734, + "step": 7718 + }, + { + "epoch": 0.7999792724634677, + "grad_norm": 0.6766581535339355, + "learning_rate": 4.052396731142585e-06, + "loss": 0.1894, + "step": 7719 + }, + { + "epoch": 0.8000829101461291, + "grad_norm": 0.5773683786392212, + "learning_rate": 4.048346171390958e-06, + "loss": 0.1882, + "step": 7720 + }, + { + "epoch": 0.8001865478287905, + "grad_norm": 0.5931955575942993, + "learning_rate": 4.044297409046893e-06, + "loss": 0.1559, + "step": 7721 + }, + { + "epoch": 0.8002901855114519, + "grad_norm": 0.654396116733551, + "learning_rate": 4.040250444566587e-06, + "loss": 0.2006, + "step": 7722 + }, + { + "epoch": 0.8003938231941133, + "grad_norm": 0.6663067936897278, + "learning_rate": 4.036205278406056e-06, + "loss": 0.1625, + "step": 7723 + }, + { + "epoch": 0.8004974608767748, + "grad_norm": 0.6567176580429077, + "learning_rate": 4.032161911021093e-06, + "loss": 0.1865, + "step": 7724 + }, + { + "epoch": 0.8006010985594362, + "grad_norm": 0.7086308002471924, + "learning_rate": 4.0281203428673074e-06, + "loss": 0.1984, + "step": 7725 + }, + { + "epoch": 0.8007047362420976, + "grad_norm": 0.7191071510314941, + "learning_rate": 4.024080574400095e-06, + "loss": 0.201, + "step": 7726 + }, + { + "epoch": 0.800808373924759, + "grad_norm": 0.6508558392524719, + "learning_rate": 4.020042606074641e-06, + "loss": 0.2075, + "step": 7727 + }, + { + "epoch": 0.8009120116074204, + "grad_norm": 0.5657883286476135, + "learning_rate": 4.016006438345952e-06, + "loss": 0.1825, + "step": 7728 + }, + { + "epoch": 0.8010156492900818, + "grad_norm": 0.6412180066108704, + "learning_rate": 4.0119720716688036e-06, + "loss": 0.1856, + "step": 7729 + }, + { + "epoch": 0.8011192869727433, + "grad_norm": 0.5706683993339539, + "learning_rate": 4.007939506497789e-06, + "loss": 0.1576, + "step": 7730 + }, + { + "epoch": 0.8012229246554047, + "grad_norm": 0.6363986730575562, + "learning_rate": 4.003908743287295e-06, + "loss": 0.1971, + "step": 7731 + }, + { + "epoch": 0.8013265623380661, + "grad_norm": 0.7020382285118103, + "learning_rate": 3.999879782491498e-06, + "loss": 0.2137, + "step": 7732 + }, + { + "epoch": 0.8014302000207275, + "grad_norm": 0.6399512887001038, + "learning_rate": 3.995852624564373e-06, + "loss": 0.1558, + "step": 7733 + }, + { + "epoch": 0.8015338377033889, + "grad_norm": 0.5676230192184448, + "learning_rate": 3.991827269959692e-06, + "loss": 0.1692, + "step": 7734 + }, + { + "epoch": 0.8016374753860503, + "grad_norm": 0.6501866579055786, + "learning_rate": 3.987803719131029e-06, + "loss": 0.2023, + "step": 7735 + }, + { + "epoch": 0.8017411130687118, + "grad_norm": 0.7282348871231079, + "learning_rate": 3.983781972531755e-06, + "loss": 0.2117, + "step": 7736 + }, + { + "epoch": 0.8018447507513732, + "grad_norm": 0.6642242670059204, + "learning_rate": 3.9797620306150265e-06, + "loss": 0.1705, + "step": 7737 + }, + { + "epoch": 0.8019483884340346, + "grad_norm": 0.714993953704834, + "learning_rate": 3.975743893833821e-06, + "loss": 0.1853, + "step": 7738 + }, + { + "epoch": 0.802052026116696, + "grad_norm": 0.7452964186668396, + "learning_rate": 3.9717275626408705e-06, + "loss": 0.1921, + "step": 7739 + }, + { + "epoch": 0.8021556637993574, + "grad_norm": 0.6581090092658997, + "learning_rate": 3.9677130374887404e-06, + "loss": 0.1727, + "step": 7740 + }, + { + "epoch": 0.8022593014820188, + "grad_norm": 0.7347173690795898, + "learning_rate": 3.96370031882979e-06, + "loss": 0.2425, + "step": 7741 + }, + { + "epoch": 0.8023629391646803, + "grad_norm": 0.6474437117576599, + "learning_rate": 3.959689407116154e-06, + "loss": 0.1671, + "step": 7742 + }, + { + "epoch": 0.8024665768473417, + "grad_norm": 0.7023115158081055, + "learning_rate": 3.955680302799785e-06, + "loss": 0.2162, + "step": 7743 + }, + { + "epoch": 0.8025702145300031, + "grad_norm": 0.661474347114563, + "learning_rate": 3.951673006332417e-06, + "loss": 0.1961, + "step": 7744 + }, + { + "epoch": 0.8026738522126645, + "grad_norm": 0.77756667137146, + "learning_rate": 3.9476675181655835e-06, + "loss": 0.2247, + "step": 7745 + }, + { + "epoch": 0.8027774898953259, + "grad_norm": 0.7199652791023254, + "learning_rate": 3.943663838750624e-06, + "loss": 0.2, + "step": 7746 + }, + { + "epoch": 0.8028811275779874, + "grad_norm": 0.6598642468452454, + "learning_rate": 3.939661968538657e-06, + "loss": 0.181, + "step": 7747 + }, + { + "epoch": 0.8029847652606488, + "grad_norm": 0.6574103832244873, + "learning_rate": 3.935661907980621e-06, + "loss": 0.2025, + "step": 7748 + }, + { + "epoch": 0.8030884029433102, + "grad_norm": 0.5850088596343994, + "learning_rate": 3.931663657527223e-06, + "loss": 0.1783, + "step": 7749 + }, + { + "epoch": 0.8031920406259716, + "grad_norm": 0.6657868027687073, + "learning_rate": 3.92766721762899e-06, + "loss": 0.1982, + "step": 7750 + }, + { + "epoch": 0.803295678308633, + "grad_norm": 0.6957859396934509, + "learning_rate": 3.9236725887362295e-06, + "loss": 0.2032, + "step": 7751 + }, + { + "epoch": 0.8033993159912944, + "grad_norm": 0.7359391450881958, + "learning_rate": 3.919679771299045e-06, + "loss": 0.1726, + "step": 7752 + }, + { + "epoch": 0.8035029536739559, + "grad_norm": 0.740073025226593, + "learning_rate": 3.915688765767354e-06, + "loss": 0.1758, + "step": 7753 + }, + { + "epoch": 0.8036065913566173, + "grad_norm": 0.7247598767280579, + "learning_rate": 3.911699572590841e-06, + "loss": 0.1948, + "step": 7754 + }, + { + "epoch": 0.8037102290392787, + "grad_norm": 0.6437425017356873, + "learning_rate": 3.907712192219013e-06, + "loss": 0.1864, + "step": 7755 + }, + { + "epoch": 0.8038138667219401, + "grad_norm": 0.742752194404602, + "learning_rate": 3.903726625101163e-06, + "loss": 0.185, + "step": 7756 + }, + { + "epoch": 0.8039175044046015, + "grad_norm": 0.7389211058616638, + "learning_rate": 3.899742871686374e-06, + "loss": 0.2026, + "step": 7757 + }, + { + "epoch": 0.8040211420872629, + "grad_norm": 0.6474356651306152, + "learning_rate": 3.895760932423531e-06, + "loss": 0.1708, + "step": 7758 + }, + { + "epoch": 0.8041247797699244, + "grad_norm": 0.6608010530471802, + "learning_rate": 3.891780807761307e-06, + "loss": 0.1722, + "step": 7759 + }, + { + "epoch": 0.8042284174525858, + "grad_norm": 0.7372478246688843, + "learning_rate": 3.887802498148181e-06, + "loss": 0.2083, + "step": 7760 + }, + { + "epoch": 0.8043320551352472, + "grad_norm": 0.7452009916305542, + "learning_rate": 3.883826004032427e-06, + "loss": 0.2486, + "step": 7761 + }, + { + "epoch": 0.8044356928179086, + "grad_norm": 0.5467132329940796, + "learning_rate": 3.879851325862101e-06, + "loss": 0.1624, + "step": 7762 + }, + { + "epoch": 0.80453933050057, + "grad_norm": 0.6250606775283813, + "learning_rate": 3.875878464085072e-06, + "loss": 0.1832, + "step": 7763 + }, + { + "epoch": 0.8046429681832314, + "grad_norm": 0.7398848533630371, + "learning_rate": 3.871907419148995e-06, + "loss": 0.2039, + "step": 7764 + }, + { + "epoch": 0.8047466058658929, + "grad_norm": 0.6963582634925842, + "learning_rate": 3.8679381915013105e-06, + "loss": 0.1991, + "step": 7765 + }, + { + "epoch": 0.8048502435485543, + "grad_norm": 0.7243926525115967, + "learning_rate": 3.863970781589279e-06, + "loss": 0.2111, + "step": 7766 + }, + { + "epoch": 0.8049538812312157, + "grad_norm": 0.646825909614563, + "learning_rate": 3.860005189859932e-06, + "loss": 0.1942, + "step": 7767 + }, + { + "epoch": 0.8050575189138771, + "grad_norm": 0.6852625012397766, + "learning_rate": 3.856041416760115e-06, + "loss": 0.1953, + "step": 7768 + }, + { + "epoch": 0.8051611565965385, + "grad_norm": 0.5806519985198975, + "learning_rate": 3.852079462736446e-06, + "loss": 0.1767, + "step": 7769 + }, + { + "epoch": 0.8052647942792, + "grad_norm": 0.6389683485031128, + "learning_rate": 3.848119328235369e-06, + "loss": 0.1805, + "step": 7770 + }, + { + "epoch": 0.8053684319618614, + "grad_norm": 0.7592096328735352, + "learning_rate": 3.844161013703099e-06, + "loss": 0.1841, + "step": 7771 + }, + { + "epoch": 0.8054720696445228, + "grad_norm": 0.7471230626106262, + "learning_rate": 3.840204519585644e-06, + "loss": 0.2049, + "step": 7772 + }, + { + "epoch": 0.8055757073271842, + "grad_norm": 0.664825439453125, + "learning_rate": 3.836249846328828e-06, + "loss": 0.2012, + "step": 7773 + }, + { + "epoch": 0.8056793450098456, + "grad_norm": 0.7081588506698608, + "learning_rate": 3.832296994378248e-06, + "loss": 0.1837, + "step": 7774 + }, + { + "epoch": 0.805782982692507, + "grad_norm": 0.6164419054985046, + "learning_rate": 3.828345964179314e-06, + "loss": 0.1563, + "step": 7775 + }, + { + "epoch": 0.8058866203751684, + "grad_norm": 0.6512451767921448, + "learning_rate": 3.824396756177218e-06, + "loss": 0.1932, + "step": 7776 + }, + { + "epoch": 0.8059902580578299, + "grad_norm": 0.6552871465682983, + "learning_rate": 3.820449370816943e-06, + "loss": 0.1938, + "step": 7777 + }, + { + "epoch": 0.8060938957404913, + "grad_norm": 0.5573453307151794, + "learning_rate": 3.816503808543288e-06, + "loss": 0.1378, + "step": 7778 + }, + { + "epoch": 0.8061975334231527, + "grad_norm": 0.7078878283500671, + "learning_rate": 3.8125600698008214e-06, + "loss": 0.2211, + "step": 7779 + }, + { + "epoch": 0.8063011711058141, + "grad_norm": 0.7048439383506775, + "learning_rate": 3.808618155033921e-06, + "loss": 0.1942, + "step": 7780 + }, + { + "epoch": 0.8064048087884755, + "grad_norm": 0.6537505388259888, + "learning_rate": 3.8046780646867644e-06, + "loss": 0.1889, + "step": 7781 + }, + { + "epoch": 0.806508446471137, + "grad_norm": 0.6082873344421387, + "learning_rate": 3.8007397992033056e-06, + "loss": 0.1664, + "step": 7782 + }, + { + "epoch": 0.8066120841537984, + "grad_norm": 0.7674204111099243, + "learning_rate": 3.7968033590273035e-06, + "loss": 0.2166, + "step": 7783 + }, + { + "epoch": 0.8067157218364598, + "grad_norm": 0.6718268394470215, + "learning_rate": 3.792868744602305e-06, + "loss": 0.1989, + "step": 7784 + }, + { + "epoch": 0.8068193595191212, + "grad_norm": 0.7679089903831482, + "learning_rate": 3.7889359563716643e-06, + "loss": 0.2455, + "step": 7785 + }, + { + "epoch": 0.8069229972017826, + "grad_norm": 0.5761622786521912, + "learning_rate": 3.7850049947785227e-06, + "loss": 0.1587, + "step": 7786 + }, + { + "epoch": 0.807026634884444, + "grad_norm": 0.7585948705673218, + "learning_rate": 3.781075860265806e-06, + "loss": 0.176, + "step": 7787 + }, + { + "epoch": 0.8071302725671053, + "grad_norm": 0.784820020198822, + "learning_rate": 3.777148553276255e-06, + "loss": 0.2499, + "step": 7788 + }, + { + "epoch": 0.8072339102497668, + "grad_norm": 0.6705400347709656, + "learning_rate": 3.7732230742523855e-06, + "loss": 0.201, + "step": 7789 + }, + { + "epoch": 0.8073375479324282, + "grad_norm": 0.6821495294570923, + "learning_rate": 3.76929942363651e-06, + "loss": 0.198, + "step": 7790 + }, + { + "epoch": 0.8074411856150896, + "grad_norm": 0.6199593544006348, + "learning_rate": 3.76537760187075e-06, + "loss": 0.1606, + "step": 7791 + }, + { + "epoch": 0.807544823297751, + "grad_norm": 0.6515820622444153, + "learning_rate": 3.761457609396999e-06, + "loss": 0.1781, + "step": 7792 + }, + { + "epoch": 0.8076484609804124, + "grad_norm": 0.5457937121391296, + "learning_rate": 3.7575394466569657e-06, + "loss": 0.1743, + "step": 7793 + }, + { + "epoch": 0.8077520986630738, + "grad_norm": 0.6231228709220886, + "learning_rate": 3.7536231140921353e-06, + "loss": 0.1745, + "step": 7794 + }, + { + "epoch": 0.8078557363457353, + "grad_norm": 0.7281748056411743, + "learning_rate": 3.749708612143801e-06, + "loss": 0.2469, + "step": 7795 + }, + { + "epoch": 0.8079593740283967, + "grad_norm": 0.5639762878417969, + "learning_rate": 3.7457959412530365e-06, + "loss": 0.1465, + "step": 7796 + }, + { + "epoch": 0.8080630117110581, + "grad_norm": 0.64995276927948, + "learning_rate": 3.741885101860716e-06, + "loss": 0.1749, + "step": 7797 + }, + { + "epoch": 0.8081666493937195, + "grad_norm": 0.9467796087265015, + "learning_rate": 3.737976094407505e-06, + "loss": 0.2079, + "step": 7798 + }, + { + "epoch": 0.8082702870763809, + "grad_norm": 0.6556515693664551, + "learning_rate": 3.7340689193338754e-06, + "loss": 0.19, + "step": 7799 + }, + { + "epoch": 0.8083739247590424, + "grad_norm": 0.6055012941360474, + "learning_rate": 3.730163577080068e-06, + "loss": 0.1661, + "step": 7800 + }, + { + "epoch": 0.8084775624417038, + "grad_norm": 0.7720165848731995, + "learning_rate": 3.7262600680861494e-06, + "loss": 0.2375, + "step": 7801 + }, + { + "epoch": 0.8085812001243652, + "grad_norm": 0.7421008944511414, + "learning_rate": 3.722358392791936e-06, + "loss": 0.2197, + "step": 7802 + }, + { + "epoch": 0.8086848378070266, + "grad_norm": 0.7144314646720886, + "learning_rate": 3.718458551637074e-06, + "loss": 0.1834, + "step": 7803 + }, + { + "epoch": 0.808788475489688, + "grad_norm": 0.6831815242767334, + "learning_rate": 3.714560545060999e-06, + "loss": 0.1838, + "step": 7804 + }, + { + "epoch": 0.8088921131723494, + "grad_norm": 0.646991491317749, + "learning_rate": 3.710664373502919e-06, + "loss": 0.1667, + "step": 7805 + }, + { + "epoch": 0.8089957508550109, + "grad_norm": 0.6269602179527283, + "learning_rate": 3.706770037401861e-06, + "loss": 0.1731, + "step": 7806 + }, + { + "epoch": 0.8090993885376723, + "grad_norm": 0.6706764101982117, + "learning_rate": 3.7028775371966276e-06, + "loss": 0.1996, + "step": 7807 + }, + { + "epoch": 0.8092030262203337, + "grad_norm": 0.667586088180542, + "learning_rate": 3.6989868733258114e-06, + "loss": 0.1834, + "step": 7808 + }, + { + "epoch": 0.8093066639029951, + "grad_norm": 0.7090936899185181, + "learning_rate": 3.695098046227821e-06, + "loss": 0.2133, + "step": 7809 + }, + { + "epoch": 0.8094103015856565, + "grad_norm": 0.7052953243255615, + "learning_rate": 3.691211056340831e-06, + "loss": 0.1951, + "step": 7810 + }, + { + "epoch": 0.8095139392683179, + "grad_norm": 0.7060131430625916, + "learning_rate": 3.687325904102832e-06, + "loss": 0.2182, + "step": 7811 + }, + { + "epoch": 0.8096175769509794, + "grad_norm": 0.5730564594268799, + "learning_rate": 3.6834425899515847e-06, + "loss": 0.1709, + "step": 7812 + }, + { + "epoch": 0.8097212146336408, + "grad_norm": 0.7492680549621582, + "learning_rate": 3.67956111432467e-06, + "loss": 0.2341, + "step": 7813 + }, + { + "epoch": 0.8098248523163022, + "grad_norm": 0.6759665012359619, + "learning_rate": 3.675681477659436e-06, + "loss": 0.1985, + "step": 7814 + }, + { + "epoch": 0.8099284899989636, + "grad_norm": 0.5997726321220398, + "learning_rate": 3.6718036803930313e-06, + "loss": 0.1793, + "step": 7815 + }, + { + "epoch": 0.810032127681625, + "grad_norm": 0.6804363131523132, + "learning_rate": 3.667927722962412e-06, + "loss": 0.2198, + "step": 7816 + }, + { + "epoch": 0.8101357653642864, + "grad_norm": 0.727741539478302, + "learning_rate": 3.664053605804301e-06, + "loss": 0.1963, + "step": 7817 + }, + { + "epoch": 0.8102394030469479, + "grad_norm": 0.5082893371582031, + "learning_rate": 3.6601813293552368e-06, + "loss": 0.1407, + "step": 7818 + }, + { + "epoch": 0.8103430407296093, + "grad_norm": 0.5630528330802917, + "learning_rate": 3.6563108940515445e-06, + "loss": 0.1542, + "step": 7819 + }, + { + "epoch": 0.8104466784122707, + "grad_norm": 0.623420000076294, + "learning_rate": 3.652442300329333e-06, + "loss": 0.1824, + "step": 7820 + }, + { + "epoch": 0.8105503160949321, + "grad_norm": 0.7584496736526489, + "learning_rate": 3.648575548624511e-06, + "loss": 0.2128, + "step": 7821 + }, + { + "epoch": 0.8106539537775935, + "grad_norm": 0.5935616493225098, + "learning_rate": 3.6447106393727705e-06, + "loss": 0.1523, + "step": 7822 + }, + { + "epoch": 0.810757591460255, + "grad_norm": 0.603014349937439, + "learning_rate": 3.6408475730096117e-06, + "loss": 0.1584, + "step": 7823 + }, + { + "epoch": 0.8108612291429164, + "grad_norm": 0.7835105657577515, + "learning_rate": 3.636986349970324e-06, + "loss": 0.2411, + "step": 7824 + }, + { + "epoch": 0.8109648668255778, + "grad_norm": 0.6582568883895874, + "learning_rate": 3.633126970689971e-06, + "loss": 0.1735, + "step": 7825 + }, + { + "epoch": 0.8110685045082392, + "grad_norm": 0.8031266927719116, + "learning_rate": 3.629269435603433e-06, + "loss": 0.2094, + "step": 7826 + }, + { + "epoch": 0.8111721421909006, + "grad_norm": 0.5696879029273987, + "learning_rate": 3.6254137451453676e-06, + "loss": 0.1548, + "step": 7827 + }, + { + "epoch": 0.811275779873562, + "grad_norm": 0.629696249961853, + "learning_rate": 3.62155989975022e-06, + "loss": 0.2036, + "step": 7828 + }, + { + "epoch": 0.8113794175562234, + "grad_norm": 0.6651034355163574, + "learning_rate": 3.61770789985225e-06, + "loss": 0.1996, + "step": 7829 + }, + { + "epoch": 0.8114830552388849, + "grad_norm": 0.6993162631988525, + "learning_rate": 3.6138577458854783e-06, + "loss": 0.2038, + "step": 7830 + }, + { + "epoch": 0.8115866929215463, + "grad_norm": 0.7502244114875793, + "learning_rate": 3.6100094382837504e-06, + "loss": 0.2078, + "step": 7831 + }, + { + "epoch": 0.8116903306042077, + "grad_norm": 0.7379708290100098, + "learning_rate": 3.606162977480674e-06, + "loss": 0.2197, + "step": 7832 + }, + { + "epoch": 0.8117939682868691, + "grad_norm": 0.725213348865509, + "learning_rate": 3.6023183639096736e-06, + "loss": 0.2164, + "step": 7833 + }, + { + "epoch": 0.8118976059695305, + "grad_norm": 0.4685988128185272, + "learning_rate": 3.59847559800395e-06, + "loss": 0.1492, + "step": 7834 + }, + { + "epoch": 0.812001243652192, + "grad_norm": 0.7826396226882935, + "learning_rate": 3.5946346801964937e-06, + "loss": 0.2228, + "step": 7835 + }, + { + "epoch": 0.8121048813348534, + "grad_norm": 0.659755527973175, + "learning_rate": 3.590795610920106e-06, + "loss": 0.181, + "step": 7836 + }, + { + "epoch": 0.8122085190175148, + "grad_norm": 0.7572306990623474, + "learning_rate": 3.5869583906073537e-06, + "loss": 0.1664, + "step": 7837 + }, + { + "epoch": 0.8123121567001762, + "grad_norm": 0.70071941614151, + "learning_rate": 3.5831230196906196e-06, + "loss": 0.2061, + "step": 7838 + }, + { + "epoch": 0.8124157943828376, + "grad_norm": 0.7053723335266113, + "learning_rate": 3.5792894986020634e-06, + "loss": 0.2201, + "step": 7839 + }, + { + "epoch": 0.812519432065499, + "grad_norm": 0.5322561264038086, + "learning_rate": 3.575457827773636e-06, + "loss": 0.1807, + "step": 7840 + }, + { + "epoch": 0.8126230697481605, + "grad_norm": 0.6401653289794922, + "learning_rate": 3.5716280076370936e-06, + "loss": 0.1678, + "step": 7841 + }, + { + "epoch": 0.8127267074308219, + "grad_norm": 0.7975711226463318, + "learning_rate": 3.567800038623963e-06, + "loss": 0.2015, + "step": 7842 + }, + { + "epoch": 0.8128303451134833, + "grad_norm": 0.6188352108001709, + "learning_rate": 3.563973921165578e-06, + "loss": 0.1668, + "step": 7843 + }, + { + "epoch": 0.8129339827961447, + "grad_norm": 0.686998724937439, + "learning_rate": 3.560149655693068e-06, + "loss": 0.2124, + "step": 7844 + }, + { + "epoch": 0.8130376204788061, + "grad_norm": 0.5393928289413452, + "learning_rate": 3.5563272426373386e-06, + "loss": 0.1506, + "step": 7845 + }, + { + "epoch": 0.8131412581614675, + "grad_norm": 0.7458584904670715, + "learning_rate": 3.552506682429093e-06, + "loss": 0.2071, + "step": 7846 + }, + { + "epoch": 0.813244895844129, + "grad_norm": 0.6960498690605164, + "learning_rate": 3.548687975498821e-06, + "loss": 0.194, + "step": 7847 + }, + { + "epoch": 0.8133485335267904, + "grad_norm": 0.6655008792877197, + "learning_rate": 3.544871122276816e-06, + "loss": 0.182, + "step": 7848 + }, + { + "epoch": 0.8134521712094518, + "grad_norm": 0.601629912853241, + "learning_rate": 3.541056123193156e-06, + "loss": 0.1485, + "step": 7849 + }, + { + "epoch": 0.8135558088921132, + "grad_norm": 0.6167109608650208, + "learning_rate": 3.5372429786777018e-06, + "loss": 0.1899, + "step": 7850 + }, + { + "epoch": 0.8136594465747746, + "grad_norm": 0.6821781992912292, + "learning_rate": 3.5334316891601206e-06, + "loss": 0.2029, + "step": 7851 + }, + { + "epoch": 0.813763084257436, + "grad_norm": 0.6972765326499939, + "learning_rate": 3.52962225506986e-06, + "loss": 0.1923, + "step": 7852 + }, + { + "epoch": 0.8138667219400975, + "grad_norm": 0.5665042400360107, + "learning_rate": 3.5258146768361546e-06, + "loss": 0.1436, + "step": 7853 + }, + { + "epoch": 0.8139703596227589, + "grad_norm": 0.6780118942260742, + "learning_rate": 3.5220089548880475e-06, + "loss": 0.182, + "step": 7854 + }, + { + "epoch": 0.8140739973054203, + "grad_norm": 0.7073417901992798, + "learning_rate": 3.5182050896543497e-06, + "loss": 0.1855, + "step": 7855 + }, + { + "epoch": 0.8141776349880817, + "grad_norm": 0.7430309653282166, + "learning_rate": 3.5144030815636867e-06, + "loss": 0.1915, + "step": 7856 + }, + { + "epoch": 0.8142812726707431, + "grad_norm": 0.624646782875061, + "learning_rate": 3.510602931044451e-06, + "loss": 0.1819, + "step": 7857 + }, + { + "epoch": 0.8143849103534045, + "grad_norm": 0.6942505240440369, + "learning_rate": 3.50680463852485e-06, + "loss": 0.1867, + "step": 7858 + }, + { + "epoch": 0.814488548036066, + "grad_norm": 0.6526444554328918, + "learning_rate": 3.503008204432863e-06, + "loss": 0.1955, + "step": 7859 + }, + { + "epoch": 0.8145921857187274, + "grad_norm": 0.6996036171913147, + "learning_rate": 3.499213629196263e-06, + "loss": 0.2007, + "step": 7860 + }, + { + "epoch": 0.8146958234013888, + "grad_norm": 0.725845992565155, + "learning_rate": 3.495420913242622e-06, + "loss": 0.2015, + "step": 7861 + }, + { + "epoch": 0.8147994610840502, + "grad_norm": 0.7196703553199768, + "learning_rate": 3.4916300569992934e-06, + "loss": 0.223, + "step": 7862 + }, + { + "epoch": 0.8149030987667116, + "grad_norm": 0.6106156706809998, + "learning_rate": 3.4878410608934264e-06, + "loss": 0.174, + "step": 7863 + }, + { + "epoch": 0.8150067364493729, + "grad_norm": 0.5675254464149475, + "learning_rate": 3.4840539253519645e-06, + "loss": 0.1835, + "step": 7864 + }, + { + "epoch": 0.8151103741320344, + "grad_norm": 0.707380473613739, + "learning_rate": 3.4802686508016302e-06, + "loss": 0.1977, + "step": 7865 + }, + { + "epoch": 0.8152140118146958, + "grad_norm": 0.6617516279220581, + "learning_rate": 3.4764852376689475e-06, + "loss": 0.182, + "step": 7866 + }, + { + "epoch": 0.8153176494973572, + "grad_norm": 0.7335023283958435, + "learning_rate": 3.4727036863802143e-06, + "loss": 0.2243, + "step": 7867 + }, + { + "epoch": 0.8154212871800186, + "grad_norm": 0.5932573080062866, + "learning_rate": 3.4689239973615374e-06, + "loss": 0.1748, + "step": 7868 + }, + { + "epoch": 0.81552492486268, + "grad_norm": 0.6992368698120117, + "learning_rate": 3.4651461710388135e-06, + "loss": 0.1935, + "step": 7869 + }, + { + "epoch": 0.8156285625453414, + "grad_norm": 0.6213841438293457, + "learning_rate": 3.461370207837713e-06, + "loss": 0.181, + "step": 7870 + }, + { + "epoch": 0.8157322002280029, + "grad_norm": 0.6884294152259827, + "learning_rate": 3.4575961081837096e-06, + "loss": 0.1792, + "step": 7871 + }, + { + "epoch": 0.8158358379106643, + "grad_norm": 0.6872044801712036, + "learning_rate": 3.4538238725020555e-06, + "loss": 0.2066, + "step": 7872 + }, + { + "epoch": 0.8159394755933257, + "grad_norm": 0.6185559630393982, + "learning_rate": 3.4500535012178048e-06, + "loss": 0.1691, + "step": 7873 + }, + { + "epoch": 0.8160431132759871, + "grad_norm": 0.6163897514343262, + "learning_rate": 3.446284994755804e-06, + "loss": 0.1921, + "step": 7874 + }, + { + "epoch": 0.8161467509586485, + "grad_norm": 0.8531190156936646, + "learning_rate": 3.4425183535406713e-06, + "loss": 0.2188, + "step": 7875 + }, + { + "epoch": 0.8162503886413099, + "grad_norm": 0.7438344955444336, + "learning_rate": 3.4387535779968363e-06, + "loss": 0.1949, + "step": 7876 + }, + { + "epoch": 0.8163540263239714, + "grad_norm": 0.7845487594604492, + "learning_rate": 3.4349906685485036e-06, + "loss": 0.1977, + "step": 7877 + }, + { + "epoch": 0.8164576640066328, + "grad_norm": 0.7111387252807617, + "learning_rate": 3.4312296256196674e-06, + "loss": 0.1862, + "step": 7878 + }, + { + "epoch": 0.8165613016892942, + "grad_norm": 0.6945541501045227, + "learning_rate": 3.427470449634125e-06, + "loss": 0.2017, + "step": 7879 + }, + { + "epoch": 0.8166649393719556, + "grad_norm": 0.654155433177948, + "learning_rate": 3.4237131410154436e-06, + "loss": 0.1817, + "step": 7880 + }, + { + "epoch": 0.816768577054617, + "grad_norm": 0.6121152639389038, + "learning_rate": 3.4199577001870043e-06, + "loss": 0.2015, + "step": 7881 + }, + { + "epoch": 0.8168722147372784, + "grad_norm": 0.7844396829605103, + "learning_rate": 3.416204127571949e-06, + "loss": 0.2014, + "step": 7882 + }, + { + "epoch": 0.8169758524199399, + "grad_norm": 0.7182947993278503, + "learning_rate": 3.41245242359324e-06, + "loss": 0.2143, + "step": 7883 + }, + { + "epoch": 0.8170794901026013, + "grad_norm": 0.7850946187973022, + "learning_rate": 3.408702588673605e-06, + "loss": 0.2084, + "step": 7884 + }, + { + "epoch": 0.8171831277852627, + "grad_norm": 0.604456901550293, + "learning_rate": 3.4049546232355677e-06, + "loss": 0.1536, + "step": 7885 + }, + { + "epoch": 0.8172867654679241, + "grad_norm": 0.6011174321174622, + "learning_rate": 3.4012085277014494e-06, + "loss": 0.152, + "step": 7886 + }, + { + "epoch": 0.8173904031505855, + "grad_norm": 0.6505752801895142, + "learning_rate": 3.397464302493345e-06, + "loss": 0.1617, + "step": 7887 + }, + { + "epoch": 0.817494040833247, + "grad_norm": 0.7308200597763062, + "learning_rate": 3.393721948033155e-06, + "loss": 0.2055, + "step": 7888 + }, + { + "epoch": 0.8175976785159084, + "grad_norm": 0.6321290135383606, + "learning_rate": 3.3899814647425644e-06, + "loss": 0.1746, + "step": 7889 + }, + { + "epoch": 0.8177013161985698, + "grad_norm": 0.5009555220603943, + "learning_rate": 3.3862428530430426e-06, + "loss": 0.1329, + "step": 7890 + }, + { + "epoch": 0.8178049538812312, + "grad_norm": 0.6724569797515869, + "learning_rate": 3.3825061133558502e-06, + "loss": 0.1645, + "step": 7891 + }, + { + "epoch": 0.8179085915638926, + "grad_norm": 0.7115445733070374, + "learning_rate": 3.3787712461020305e-06, + "loss": 0.2052, + "step": 7892 + }, + { + "epoch": 0.818012229246554, + "grad_norm": 0.6832154393196106, + "learning_rate": 3.3750382517024294e-06, + "loss": 0.1735, + "step": 7893 + }, + { + "epoch": 0.8181158669292155, + "grad_norm": 0.6225373148918152, + "learning_rate": 3.371307130577679e-06, + "loss": 0.1836, + "step": 7894 + }, + { + "epoch": 0.8182195046118769, + "grad_norm": 0.7177269458770752, + "learning_rate": 3.3675778831481877e-06, + "loss": 0.1943, + "step": 7895 + }, + { + "epoch": 0.8183231422945383, + "grad_norm": 0.7160996198654175, + "learning_rate": 3.3638505098341725e-06, + "loss": 0.2182, + "step": 7896 + }, + { + "epoch": 0.8184267799771997, + "grad_norm": 0.5422918200492859, + "learning_rate": 3.3601250110556107e-06, + "loss": 0.145, + "step": 7897 + }, + { + "epoch": 0.8185304176598611, + "grad_norm": 0.6439123153686523, + "learning_rate": 3.3564013872322977e-06, + "loss": 0.1561, + "step": 7898 + }, + { + "epoch": 0.8186340553425225, + "grad_norm": 0.7110884189605713, + "learning_rate": 3.3526796387838067e-06, + "loss": 0.1909, + "step": 7899 + }, + { + "epoch": 0.818737693025184, + "grad_norm": 0.6405838131904602, + "learning_rate": 3.3489597661294893e-06, + "loss": 0.1769, + "step": 7900 + }, + { + "epoch": 0.8188413307078454, + "grad_norm": 0.6757362484931946, + "learning_rate": 3.3452417696885077e-06, + "loss": 0.1868, + "step": 7901 + }, + { + "epoch": 0.8189449683905068, + "grad_norm": 0.6399852633476257, + "learning_rate": 3.341525649879791e-06, + "loss": 0.2003, + "step": 7902 + }, + { + "epoch": 0.8190486060731682, + "grad_norm": 0.7883408665657043, + "learning_rate": 3.3378114071220647e-06, + "loss": 0.1885, + "step": 7903 + }, + { + "epoch": 0.8191522437558296, + "grad_norm": 0.764289379119873, + "learning_rate": 3.3340990418338516e-06, + "loss": 0.2051, + "step": 7904 + }, + { + "epoch": 0.819255881438491, + "grad_norm": 0.5843377709388733, + "learning_rate": 3.330388554433448e-06, + "loss": 0.1742, + "step": 7905 + }, + { + "epoch": 0.8193595191211525, + "grad_norm": 0.6539674997329712, + "learning_rate": 3.326679945338951e-06, + "loss": 0.1888, + "step": 7906 + }, + { + "epoch": 0.8194631568038139, + "grad_norm": 0.6555024981498718, + "learning_rate": 3.3229732149682347e-06, + "loss": 0.1789, + "step": 7907 + }, + { + "epoch": 0.8195667944864753, + "grad_norm": 0.7149132490158081, + "learning_rate": 3.3192683637389768e-06, + "loss": 0.2211, + "step": 7908 + }, + { + "epoch": 0.8196704321691367, + "grad_norm": 0.6635371446609497, + "learning_rate": 3.315565392068627e-06, + "loss": 0.1692, + "step": 7909 + }, + { + "epoch": 0.8197740698517981, + "grad_norm": 0.6320151090621948, + "learning_rate": 3.31186430037443e-06, + "loss": 0.1634, + "step": 7910 + }, + { + "epoch": 0.8198777075344595, + "grad_norm": 0.7512708306312561, + "learning_rate": 3.3081650890734253e-06, + "loss": 0.2088, + "step": 7911 + }, + { + "epoch": 0.819981345217121, + "grad_norm": 0.6643633246421814, + "learning_rate": 3.3044677585824237e-06, + "loss": 0.1926, + "step": 7912 + }, + { + "epoch": 0.8200849828997824, + "grad_norm": 0.6350557208061218, + "learning_rate": 3.300772309318043e-06, + "loss": 0.1898, + "step": 7913 + }, + { + "epoch": 0.8201886205824438, + "grad_norm": 0.6989490389823914, + "learning_rate": 3.297078741696684e-06, + "loss": 0.2359, + "step": 7914 + }, + { + "epoch": 0.8202922582651052, + "grad_norm": 0.6762931942939758, + "learning_rate": 3.293387056134527e-06, + "loss": 0.2028, + "step": 7915 + }, + { + "epoch": 0.8203958959477666, + "grad_norm": 0.8357157111167908, + "learning_rate": 3.2896972530475458e-06, + "loss": 0.2159, + "step": 7916 + }, + { + "epoch": 0.820499533630428, + "grad_norm": 0.5877515077590942, + "learning_rate": 3.2860093328514963e-06, + "loss": 0.1595, + "step": 7917 + }, + { + "epoch": 0.8206031713130895, + "grad_norm": 0.6448560953140259, + "learning_rate": 3.282323295961933e-06, + "loss": 0.1867, + "step": 7918 + }, + { + "epoch": 0.8207068089957509, + "grad_norm": 0.7989858984947205, + "learning_rate": 3.2786391427941977e-06, + "loss": 0.2033, + "step": 7919 + }, + { + "epoch": 0.8208104466784123, + "grad_norm": 0.6952617764472961, + "learning_rate": 3.2749568737634064e-06, + "loss": 0.1862, + "step": 7920 + }, + { + "epoch": 0.8209140843610737, + "grad_norm": 0.7309961915016174, + "learning_rate": 3.271276489284478e-06, + "loss": 0.2101, + "step": 7921 + }, + { + "epoch": 0.8210177220437351, + "grad_norm": 0.7257064580917358, + "learning_rate": 3.2675979897721087e-06, + "loss": 0.1971, + "step": 7922 + }, + { + "epoch": 0.8211213597263965, + "grad_norm": 0.6392828226089478, + "learning_rate": 3.2639213756407837e-06, + "loss": 0.2122, + "step": 7923 + }, + { + "epoch": 0.821224997409058, + "grad_norm": 0.6919106245040894, + "learning_rate": 3.2602466473047854e-06, + "loss": 0.2003, + "step": 7924 + }, + { + "epoch": 0.8213286350917194, + "grad_norm": 0.6594109535217285, + "learning_rate": 3.256573805178167e-06, + "loss": 0.179, + "step": 7925 + }, + { + "epoch": 0.8214322727743808, + "grad_norm": 0.5921263694763184, + "learning_rate": 3.2529028496747904e-06, + "loss": 0.1603, + "step": 7926 + }, + { + "epoch": 0.8215359104570422, + "grad_norm": 0.6132636070251465, + "learning_rate": 3.249233781208281e-06, + "loss": 0.1844, + "step": 7927 + }, + { + "epoch": 0.8216395481397036, + "grad_norm": 0.7435948848724365, + "learning_rate": 3.245566600192074e-06, + "loss": 0.2056, + "step": 7928 + }, + { + "epoch": 0.821743185822365, + "grad_norm": 0.6977766156196594, + "learning_rate": 3.241901307039379e-06, + "loss": 0.2141, + "step": 7929 + }, + { + "epoch": 0.8218468235050265, + "grad_norm": 0.7409849762916565, + "learning_rate": 3.2382379021631883e-06, + "loss": 0.2091, + "step": 7930 + }, + { + "epoch": 0.8219504611876879, + "grad_norm": 0.5388767719268799, + "learning_rate": 3.234576385976298e-06, + "loss": 0.1455, + "step": 7931 + }, + { + "epoch": 0.8220540988703493, + "grad_norm": 0.6420896053314209, + "learning_rate": 3.2309167588912738e-06, + "loss": 0.1823, + "step": 7932 + }, + { + "epoch": 0.8221577365530107, + "grad_norm": 0.6220948696136475, + "learning_rate": 3.227259021320486e-06, + "loss": 0.1838, + "step": 7933 + }, + { + "epoch": 0.8222613742356721, + "grad_norm": 0.608130693435669, + "learning_rate": 3.2236031736760775e-06, + "loss": 0.1688, + "step": 7934 + }, + { + "epoch": 0.8223650119183336, + "grad_norm": 0.6391450762748718, + "learning_rate": 3.21994921636998e-06, + "loss": 0.1903, + "step": 7935 + }, + { + "epoch": 0.822468649600995, + "grad_norm": 0.652980387210846, + "learning_rate": 3.216297149813923e-06, + "loss": 0.1884, + "step": 7936 + }, + { + "epoch": 0.8225722872836564, + "grad_norm": 0.711746335029602, + "learning_rate": 3.2126469744194087e-06, + "loss": 0.2326, + "step": 7937 + }, + { + "epoch": 0.8226759249663178, + "grad_norm": 0.6753913164138794, + "learning_rate": 3.208998690597738e-06, + "loss": 0.1629, + "step": 7938 + }, + { + "epoch": 0.8227795626489792, + "grad_norm": 0.6291230916976929, + "learning_rate": 3.2053522987599963e-06, + "loss": 0.1576, + "step": 7939 + }, + { + "epoch": 0.8228832003316405, + "grad_norm": 0.6426824927330017, + "learning_rate": 3.2017077993170485e-06, + "loss": 0.1681, + "step": 7940 + }, + { + "epoch": 0.822986838014302, + "grad_norm": 0.6569961905479431, + "learning_rate": 3.1980651926795558e-06, + "loss": 0.1742, + "step": 7941 + }, + { + "epoch": 0.8230904756969634, + "grad_norm": 0.7171782851219177, + "learning_rate": 3.1944244792579493e-06, + "loss": 0.1868, + "step": 7942 + }, + { + "epoch": 0.8231941133796248, + "grad_norm": 0.7539594769477844, + "learning_rate": 3.1907856594624696e-06, + "loss": 0.2004, + "step": 7943 + }, + { + "epoch": 0.8232977510622862, + "grad_norm": 0.684523344039917, + "learning_rate": 3.187148733703138e-06, + "loss": 0.2053, + "step": 7944 + }, + { + "epoch": 0.8234013887449476, + "grad_norm": 0.6421189308166504, + "learning_rate": 3.1835137023897445e-06, + "loss": 0.1817, + "step": 7945 + }, + { + "epoch": 0.823505026427609, + "grad_norm": 0.6755841970443726, + "learning_rate": 3.17988056593189e-06, + "loss": 0.1837, + "step": 7946 + }, + { + "epoch": 0.8236086641102704, + "grad_norm": 0.7096763849258423, + "learning_rate": 3.1762493247389447e-06, + "loss": 0.1852, + "step": 7947 + }, + { + "epoch": 0.8237123017929319, + "grad_norm": 0.7008394598960876, + "learning_rate": 3.1726199792200685e-06, + "loss": 0.1991, + "step": 7948 + }, + { + "epoch": 0.8238159394755933, + "grad_norm": 0.6629871129989624, + "learning_rate": 3.1689925297842208e-06, + "loss": 0.1864, + "step": 7949 + }, + { + "epoch": 0.8239195771582547, + "grad_norm": 0.5846853852272034, + "learning_rate": 3.1653669768401253e-06, + "loss": 0.1494, + "step": 7950 + }, + { + "epoch": 0.8240232148409161, + "grad_norm": 0.7168512344360352, + "learning_rate": 3.161743320796311e-06, + "loss": 0.2545, + "step": 7951 + }, + { + "epoch": 0.8241268525235775, + "grad_norm": 0.6414791345596313, + "learning_rate": 3.1581215620610804e-06, + "loss": 0.1835, + "step": 7952 + }, + { + "epoch": 0.824230490206239, + "grad_norm": 0.7068555355072021, + "learning_rate": 3.154501701042538e-06, + "loss": 0.1911, + "step": 7953 + }, + { + "epoch": 0.8243341278889004, + "grad_norm": 0.5438865423202515, + "learning_rate": 3.150883738148556e-06, + "loss": 0.155, + "step": 7954 + }, + { + "epoch": 0.8244377655715618, + "grad_norm": 0.6644114255905151, + "learning_rate": 3.1472676737867956e-06, + "loss": 0.1718, + "step": 7955 + }, + { + "epoch": 0.8245414032542232, + "grad_norm": 0.5944012999534607, + "learning_rate": 3.1436535083647214e-06, + "loss": 0.1748, + "step": 7956 + }, + { + "epoch": 0.8246450409368846, + "grad_norm": 0.6920223832130432, + "learning_rate": 3.14004124228956e-06, + "loss": 0.2383, + "step": 7957 + }, + { + "epoch": 0.824748678619546, + "grad_norm": 0.6960580945014954, + "learning_rate": 3.1364308759683438e-06, + "loss": 0.2038, + "step": 7958 + }, + { + "epoch": 0.8248523163022075, + "grad_norm": 0.5951129794120789, + "learning_rate": 3.1328224098078917e-06, + "loss": 0.1752, + "step": 7959 + }, + { + "epoch": 0.8249559539848689, + "grad_norm": 0.635200023651123, + "learning_rate": 3.129215844214779e-06, + "loss": 0.176, + "step": 7960 + }, + { + "epoch": 0.8250595916675303, + "grad_norm": 0.7616642713546753, + "learning_rate": 3.1256111795954046e-06, + "loss": 0.2122, + "step": 7961 + }, + { + "epoch": 0.8251632293501917, + "grad_norm": 0.705881655216217, + "learning_rate": 3.122008416355924e-06, + "loss": 0.2105, + "step": 7962 + }, + { + "epoch": 0.8252668670328531, + "grad_norm": 0.7588737607002258, + "learning_rate": 3.1184075549023007e-06, + "loss": 0.2225, + "step": 7963 + }, + { + "epoch": 0.8253705047155145, + "grad_norm": 0.7803189754486084, + "learning_rate": 3.114808595640273e-06, + "loss": 0.2452, + "step": 7964 + }, + { + "epoch": 0.825474142398176, + "grad_norm": 0.6398329138755798, + "learning_rate": 3.111211538975365e-06, + "loss": 0.1879, + "step": 7965 + }, + { + "epoch": 0.8255777800808374, + "grad_norm": 0.6574432849884033, + "learning_rate": 3.107616385312888e-06, + "loss": 0.1746, + "step": 7966 + }, + { + "epoch": 0.8256814177634988, + "grad_norm": 0.7401537299156189, + "learning_rate": 3.104023135057932e-06, + "loss": 0.2021, + "step": 7967 + }, + { + "epoch": 0.8257850554461602, + "grad_norm": 0.7985172867774963, + "learning_rate": 3.1004317886153835e-06, + "loss": 0.2037, + "step": 7968 + }, + { + "epoch": 0.8258886931288216, + "grad_norm": 0.6925119757652283, + "learning_rate": 3.0968423463899145e-06, + "loss": 0.1943, + "step": 7969 + }, + { + "epoch": 0.825992330811483, + "grad_norm": 0.7429485321044922, + "learning_rate": 3.0932548087859683e-06, + "loss": 0.1971, + "step": 7970 + }, + { + "epoch": 0.8260959684941445, + "grad_norm": 0.733308732509613, + "learning_rate": 3.0896691762077923e-06, + "loss": 0.2351, + "step": 7971 + }, + { + "epoch": 0.8261996061768059, + "grad_norm": 0.7723768353462219, + "learning_rate": 3.0860854490594084e-06, + "loss": 0.1963, + "step": 7972 + }, + { + "epoch": 0.8263032438594673, + "grad_norm": 0.6854905486106873, + "learning_rate": 3.0825036277446176e-06, + "loss": 0.2001, + "step": 7973 + }, + { + "epoch": 0.8264068815421287, + "grad_norm": 0.7836577296257019, + "learning_rate": 3.0789237126670214e-06, + "loss": 0.2203, + "step": 7974 + }, + { + "epoch": 0.8265105192247901, + "grad_norm": 0.6477953195571899, + "learning_rate": 3.075345704229995e-06, + "loss": 0.1941, + "step": 7975 + }, + { + "epoch": 0.8266141569074515, + "grad_norm": 0.6857738494873047, + "learning_rate": 3.0717696028367093e-06, + "loss": 0.2041, + "step": 7976 + }, + { + "epoch": 0.826717794590113, + "grad_norm": 0.6048509478569031, + "learning_rate": 3.0681954088901024e-06, + "loss": 0.1582, + "step": 7977 + }, + { + "epoch": 0.8268214322727744, + "grad_norm": 0.8324301242828369, + "learning_rate": 3.0646231227929224e-06, + "loss": 0.2352, + "step": 7978 + }, + { + "epoch": 0.8269250699554358, + "grad_norm": 0.7217226624488831, + "learning_rate": 3.061052744947681e-06, + "loss": 0.2169, + "step": 7979 + }, + { + "epoch": 0.8270287076380972, + "grad_norm": 0.7014948725700378, + "learning_rate": 3.0574842757566814e-06, + "loss": 0.1883, + "step": 7980 + }, + { + "epoch": 0.8271323453207586, + "grad_norm": 0.7494576573371887, + "learning_rate": 3.053917715622019e-06, + "loss": 0.2175, + "step": 7981 + }, + { + "epoch": 0.82723598300342, + "grad_norm": 0.6612431406974792, + "learning_rate": 3.0503530649455616e-06, + "loss": 0.1752, + "step": 7982 + }, + { + "epoch": 0.8273396206860815, + "grad_norm": 0.710117518901825, + "learning_rate": 3.046790324128972e-06, + "loss": 0.2034, + "step": 7983 + }, + { + "epoch": 0.8274432583687429, + "grad_norm": 0.5640646815299988, + "learning_rate": 3.0432294935736985e-06, + "loss": 0.1702, + "step": 7984 + }, + { + "epoch": 0.8275468960514043, + "grad_norm": 0.7199845910072327, + "learning_rate": 3.0396705736809664e-06, + "loss": 0.2295, + "step": 7985 + }, + { + "epoch": 0.8276505337340657, + "grad_norm": 0.658679187297821, + "learning_rate": 3.0361135648517883e-06, + "loss": 0.1973, + "step": 7986 + }, + { + "epoch": 0.8277541714167271, + "grad_norm": 0.7033202648162842, + "learning_rate": 3.032558467486959e-06, + "loss": 0.2307, + "step": 7987 + }, + { + "epoch": 0.8278578090993886, + "grad_norm": 0.7593150734901428, + "learning_rate": 3.0290052819870654e-06, + "loss": 0.2021, + "step": 7988 + }, + { + "epoch": 0.82796144678205, + "grad_norm": 0.6131226420402527, + "learning_rate": 3.0254540087524775e-06, + "loss": 0.2034, + "step": 7989 + }, + { + "epoch": 0.8280650844647114, + "grad_norm": 0.6799296140670776, + "learning_rate": 3.0219046481833404e-06, + "loss": 0.1865, + "step": 7990 + }, + { + "epoch": 0.8281687221473728, + "grad_norm": 0.7688307166099548, + "learning_rate": 3.0183572006796045e-06, + "loss": 0.223, + "step": 7991 + }, + { + "epoch": 0.8282723598300342, + "grad_norm": 0.6909458041191101, + "learning_rate": 3.014811666640971e-06, + "loss": 0.1901, + "step": 7992 + }, + { + "epoch": 0.8283759975126956, + "grad_norm": 0.8143532276153564, + "learning_rate": 3.011268046466955e-06, + "loss": 0.217, + "step": 7993 + }, + { + "epoch": 0.8284796351953571, + "grad_norm": 0.773020327091217, + "learning_rate": 3.007726340556851e-06, + "loss": 0.2167, + "step": 7994 + }, + { + "epoch": 0.8285832728780185, + "grad_norm": 0.6222478151321411, + "learning_rate": 3.004186549309722e-06, + "loss": 0.1789, + "step": 7995 + }, + { + "epoch": 0.8286869105606799, + "grad_norm": 0.7233225703239441, + "learning_rate": 3.00064867312444e-06, + "loss": 0.2182, + "step": 7996 + }, + { + "epoch": 0.8287905482433413, + "grad_norm": 0.7089139819145203, + "learning_rate": 2.997112712399637e-06, + "loss": 0.2036, + "step": 7997 + }, + { + "epoch": 0.8288941859260027, + "grad_norm": 0.5846203565597534, + "learning_rate": 2.9935786675337365e-06, + "loss": 0.1738, + "step": 7998 + }, + { + "epoch": 0.8289978236086641, + "grad_norm": 0.7119741439819336, + "learning_rate": 2.9900465389249623e-06, + "loss": 0.2005, + "step": 7999 + }, + { + "epoch": 0.8291014612913256, + "grad_norm": 0.6737663149833679, + "learning_rate": 2.986516326971294e-06, + "loss": 0.183, + "step": 8000 + }, + { + "epoch": 0.829205098973987, + "grad_norm": 0.6113407015800476, + "learning_rate": 2.9829880320705196e-06, + "loss": 0.1536, + "step": 8001 + }, + { + "epoch": 0.8293087366566484, + "grad_norm": 0.7092680931091309, + "learning_rate": 2.979461654620206e-06, + "loss": 0.1975, + "step": 8002 + }, + { + "epoch": 0.8294123743393098, + "grad_norm": 0.6716724634170532, + "learning_rate": 2.975937195017693e-06, + "loss": 0.1731, + "step": 8003 + }, + { + "epoch": 0.8295160120219712, + "grad_norm": 0.6372599601745605, + "learning_rate": 2.9724146536601116e-06, + "loss": 0.1856, + "step": 8004 + }, + { + "epoch": 0.8296196497046326, + "grad_norm": 0.5934385061264038, + "learning_rate": 2.9688940309443738e-06, + "loss": 0.168, + "step": 8005 + }, + { + "epoch": 0.8297232873872941, + "grad_norm": 0.48493868112564087, + "learning_rate": 2.965375327267179e-06, + "loss": 0.1335, + "step": 8006 + }, + { + "epoch": 0.8298269250699555, + "grad_norm": 0.722053587436676, + "learning_rate": 2.961858543025018e-06, + "loss": 0.2245, + "step": 8007 + }, + { + "epoch": 0.8299305627526169, + "grad_norm": 0.7954843640327454, + "learning_rate": 2.9583436786141463e-06, + "loss": 0.2197, + "step": 8008 + }, + { + "epoch": 0.8300342004352783, + "grad_norm": 0.607681393623352, + "learning_rate": 2.9548307344306205e-06, + "loss": 0.1742, + "step": 8009 + }, + { + "epoch": 0.8301378381179397, + "grad_norm": 0.7497290372848511, + "learning_rate": 2.9513197108702706e-06, + "loss": 0.1911, + "step": 8010 + }, + { + "epoch": 0.8302414758006011, + "grad_norm": 0.7454660534858704, + "learning_rate": 2.947810608328707e-06, + "loss": 0.2072, + "step": 8011 + }, + { + "epoch": 0.8303451134832626, + "grad_norm": 0.634694516658783, + "learning_rate": 2.944303427201343e-06, + "loss": 0.1618, + "step": 8012 + }, + { + "epoch": 0.830448751165924, + "grad_norm": 0.7154065370559692, + "learning_rate": 2.9407981678833496e-06, + "loss": 0.2029, + "step": 8013 + }, + { + "epoch": 0.8305523888485854, + "grad_norm": 0.6388426423072815, + "learning_rate": 2.9372948307697034e-06, + "loss": 0.2012, + "step": 8014 + }, + { + "epoch": 0.8306560265312468, + "grad_norm": 0.69815593957901, + "learning_rate": 2.9337934162551462e-06, + "loss": 0.2191, + "step": 8015 + }, + { + "epoch": 0.8307596642139081, + "grad_norm": 0.6910235285758972, + "learning_rate": 2.9302939247342244e-06, + "loss": 0.2123, + "step": 8016 + }, + { + "epoch": 0.8308633018965695, + "grad_norm": 0.5625707507133484, + "learning_rate": 2.9267963566012447e-06, + "loss": 0.146, + "step": 8017 + }, + { + "epoch": 0.830966939579231, + "grad_norm": 0.66923588514328, + "learning_rate": 2.9233007122503076e-06, + "loss": 0.1868, + "step": 8018 + }, + { + "epoch": 0.8310705772618924, + "grad_norm": 0.6398361325263977, + "learning_rate": 2.9198069920753045e-06, + "loss": 0.2055, + "step": 8019 + }, + { + "epoch": 0.8311742149445538, + "grad_norm": 0.6225539445877075, + "learning_rate": 2.916315196469892e-06, + "loss": 0.1683, + "step": 8020 + }, + { + "epoch": 0.8312778526272152, + "grad_norm": 0.7289894223213196, + "learning_rate": 2.9128253258275285e-06, + "loss": 0.2036, + "step": 8021 + }, + { + "epoch": 0.8313814903098766, + "grad_norm": 0.8791557550430298, + "learning_rate": 2.9093373805414526e-06, + "loss": 0.2305, + "step": 8022 + }, + { + "epoch": 0.831485127992538, + "grad_norm": 0.695125162601471, + "learning_rate": 2.9058513610046634e-06, + "loss": 0.1801, + "step": 8023 + }, + { + "epoch": 0.8315887656751995, + "grad_norm": 0.6579564809799194, + "learning_rate": 2.9023672676099733e-06, + "loss": 0.1957, + "step": 8024 + }, + { + "epoch": 0.8316924033578609, + "grad_norm": 0.5032559633255005, + "learning_rate": 2.8988851007499575e-06, + "loss": 0.1511, + "step": 8025 + }, + { + "epoch": 0.8317960410405223, + "grad_norm": 0.7749326229095459, + "learning_rate": 2.8954048608169837e-06, + "loss": 0.22, + "step": 8026 + }, + { + "epoch": 0.8318996787231837, + "grad_norm": 0.6643040776252747, + "learning_rate": 2.8919265482032057e-06, + "loss": 0.1976, + "step": 8027 + }, + { + "epoch": 0.8320033164058451, + "grad_norm": 0.6258031129837036, + "learning_rate": 2.888450163300549e-06, + "loss": 0.1713, + "step": 8028 + }, + { + "epoch": 0.8321069540885065, + "grad_norm": 0.6977447271347046, + "learning_rate": 2.884975706500728e-06, + "loss": 0.2078, + "step": 8029 + }, + { + "epoch": 0.832210591771168, + "grad_norm": 0.6385695338249207, + "learning_rate": 2.8815031781952328e-06, + "loss": 0.1869, + "step": 8030 + }, + { + "epoch": 0.8323142294538294, + "grad_norm": 0.7600287199020386, + "learning_rate": 2.8780325787753494e-06, + "loss": 0.2049, + "step": 8031 + }, + { + "epoch": 0.8324178671364908, + "grad_norm": 0.6887364983558655, + "learning_rate": 2.874563908632142e-06, + "loss": 0.2005, + "step": 8032 + }, + { + "epoch": 0.8325215048191522, + "grad_norm": 0.7222094535827637, + "learning_rate": 2.8710971681564472e-06, + "loss": 0.2162, + "step": 8033 + }, + { + "epoch": 0.8326251425018136, + "grad_norm": 0.5581437945365906, + "learning_rate": 2.867632357738901e-06, + "loss": 0.1525, + "step": 8034 + }, + { + "epoch": 0.832728780184475, + "grad_norm": 0.6178992986679077, + "learning_rate": 2.864169477769907e-06, + "loss": 0.1748, + "step": 8035 + }, + { + "epoch": 0.8328324178671365, + "grad_norm": 0.674506664276123, + "learning_rate": 2.860708528639653e-06, + "loss": 0.1973, + "step": 8036 + }, + { + "epoch": 0.8329360555497979, + "grad_norm": 0.640842854976654, + "learning_rate": 2.857249510738125e-06, + "loss": 0.1906, + "step": 8037 + }, + { + "epoch": 0.8330396932324593, + "grad_norm": 0.5568889379501343, + "learning_rate": 2.8537924244550686e-06, + "loss": 0.1765, + "step": 8038 + }, + { + "epoch": 0.8331433309151207, + "grad_norm": 0.703639566898346, + "learning_rate": 2.8503372701800304e-06, + "loss": 0.2035, + "step": 8039 + }, + { + "epoch": 0.8332469685977821, + "grad_norm": 0.7148422598838806, + "learning_rate": 2.846884048302325e-06, + "loss": 0.1781, + "step": 8040 + }, + { + "epoch": 0.8333506062804436, + "grad_norm": 0.5351859331130981, + "learning_rate": 2.8434327592110646e-06, + "loss": 0.1772, + "step": 8041 + }, + { + "epoch": 0.833454243963105, + "grad_norm": 0.6493597030639648, + "learning_rate": 2.83998340329513e-06, + "loss": 0.1941, + "step": 8042 + }, + { + "epoch": 0.8335578816457664, + "grad_norm": 0.8238438963890076, + "learning_rate": 2.836535980943187e-06, + "loss": 0.2018, + "step": 8043 + }, + { + "epoch": 0.8336615193284278, + "grad_norm": 0.7464231252670288, + "learning_rate": 2.833090492543691e-06, + "loss": 0.2039, + "step": 8044 + }, + { + "epoch": 0.8337651570110892, + "grad_norm": 0.7440434098243713, + "learning_rate": 2.829646938484869e-06, + "loss": 0.1981, + "step": 8045 + }, + { + "epoch": 0.8338687946937506, + "grad_norm": 0.6757184863090515, + "learning_rate": 2.8262053191547377e-06, + "loss": 0.177, + "step": 8046 + }, + { + "epoch": 0.833972432376412, + "grad_norm": 0.5503365397453308, + "learning_rate": 2.822765634941098e-06, + "loss": 0.134, + "step": 8047 + }, + { + "epoch": 0.8340760700590735, + "grad_norm": 0.7327800393104553, + "learning_rate": 2.819327886231524e-06, + "loss": 0.1837, + "step": 8048 + }, + { + "epoch": 0.8341797077417349, + "grad_norm": 0.7348129153251648, + "learning_rate": 2.8158920734133753e-06, + "loss": 0.2082, + "step": 8049 + }, + { + "epoch": 0.8342833454243963, + "grad_norm": 0.7318658232688904, + "learning_rate": 2.812458196873791e-06, + "loss": 0.1945, + "step": 8050 + }, + { + "epoch": 0.8343869831070577, + "grad_norm": 0.6062182784080505, + "learning_rate": 2.8090262569996984e-06, + "loss": 0.1756, + "step": 8051 + }, + { + "epoch": 0.8344906207897191, + "grad_norm": 0.6037781238555908, + "learning_rate": 2.805596254177807e-06, + "loss": 0.1927, + "step": 8052 + }, + { + "epoch": 0.8345942584723806, + "grad_norm": 0.631915271282196, + "learning_rate": 2.8021681887945964e-06, + "loss": 0.1818, + "step": 8053 + }, + { + "epoch": 0.834697896155042, + "grad_norm": 0.8001081943511963, + "learning_rate": 2.79874206123635e-06, + "loss": 0.2223, + "step": 8054 + }, + { + "epoch": 0.8348015338377034, + "grad_norm": 0.7133634686470032, + "learning_rate": 2.795317871889098e-06, + "loss": 0.1844, + "step": 8055 + }, + { + "epoch": 0.8349051715203648, + "grad_norm": 0.8070812821388245, + "learning_rate": 2.7918956211386826e-06, + "loss": 0.211, + "step": 8056 + }, + { + "epoch": 0.8350088092030262, + "grad_norm": 0.6299300789833069, + "learning_rate": 2.788475309370724e-06, + "loss": 0.1743, + "step": 8057 + }, + { + "epoch": 0.8351124468856876, + "grad_norm": 0.7091789841651917, + "learning_rate": 2.7850569369706048e-06, + "loss": 0.2124, + "step": 8058 + }, + { + "epoch": 0.8352160845683491, + "grad_norm": 0.5357885956764221, + "learning_rate": 2.781640504323515e-06, + "loss": 0.1677, + "step": 8059 + }, + { + "epoch": 0.8353197222510105, + "grad_norm": 0.6380164623260498, + "learning_rate": 2.7782260118144065e-06, + "loss": 0.2014, + "step": 8060 + }, + { + "epoch": 0.8354233599336719, + "grad_norm": 0.7222257256507874, + "learning_rate": 2.774813459828016e-06, + "loss": 0.194, + "step": 8061 + }, + { + "epoch": 0.8355269976163333, + "grad_norm": 0.689274787902832, + "learning_rate": 2.771402848748872e-06, + "loss": 0.1961, + "step": 8062 + }, + { + "epoch": 0.8356306352989947, + "grad_norm": 0.6324693560600281, + "learning_rate": 2.767994178961266e-06, + "loss": 0.18, + "step": 8063 + }, + { + "epoch": 0.8357342729816561, + "grad_norm": 0.671479344367981, + "learning_rate": 2.7645874508492943e-06, + "loss": 0.1977, + "step": 8064 + }, + { + "epoch": 0.8358379106643176, + "grad_norm": 0.7867868542671204, + "learning_rate": 2.761182664796811e-06, + "loss": 0.2011, + "step": 8065 + }, + { + "epoch": 0.835941548346979, + "grad_norm": 0.7570082545280457, + "learning_rate": 2.7577798211874717e-06, + "loss": 0.2166, + "step": 8066 + }, + { + "epoch": 0.8360451860296404, + "grad_norm": 0.6764487624168396, + "learning_rate": 2.7543789204046967e-06, + "loss": 0.1951, + "step": 8067 + }, + { + "epoch": 0.8361488237123018, + "grad_norm": 0.5519558191299438, + "learning_rate": 2.7509799628316923e-06, + "loss": 0.153, + "step": 8068 + }, + { + "epoch": 0.8362524613949632, + "grad_norm": 0.680759072303772, + "learning_rate": 2.747582948851457e-06, + "loss": 0.2039, + "step": 8069 + }, + { + "epoch": 0.8363560990776246, + "grad_norm": 0.658055305480957, + "learning_rate": 2.7441878788467515e-06, + "loss": 0.1988, + "step": 8070 + }, + { + "epoch": 0.8364597367602861, + "grad_norm": 0.7835912108421326, + "learning_rate": 2.740794753200131e-06, + "loss": 0.1886, + "step": 8071 + }, + { + "epoch": 0.8365633744429475, + "grad_norm": 0.7117691040039062, + "learning_rate": 2.7374035722939307e-06, + "loss": 0.2061, + "step": 8072 + }, + { + "epoch": 0.8366670121256089, + "grad_norm": 0.8212706446647644, + "learning_rate": 2.7340143365102623e-06, + "loss": 0.2299, + "step": 8073 + }, + { + "epoch": 0.8367706498082703, + "grad_norm": 0.6776049733161926, + "learning_rate": 2.7306270462310158e-06, + "loss": 0.2003, + "step": 8074 + }, + { + "epoch": 0.8368742874909317, + "grad_norm": 0.62154221534729, + "learning_rate": 2.7272417018378662e-06, + "loss": 0.17, + "step": 8075 + }, + { + "epoch": 0.8369779251735932, + "grad_norm": 0.5986471772193909, + "learning_rate": 2.72385830371227e-06, + "loss": 0.1653, + "step": 8076 + }, + { + "epoch": 0.8370815628562546, + "grad_norm": 0.7299423813819885, + "learning_rate": 2.7204768522354675e-06, + "loss": 0.2454, + "step": 8077 + }, + { + "epoch": 0.837185200538916, + "grad_norm": 0.6664876937866211, + "learning_rate": 2.7170973477884666e-06, + "loss": 0.1904, + "step": 8078 + }, + { + "epoch": 0.8372888382215774, + "grad_norm": 0.6690018773078918, + "learning_rate": 2.7137197907520763e-06, + "loss": 0.1751, + "step": 8079 + }, + { + "epoch": 0.8373924759042388, + "grad_norm": 0.5193243622779846, + "learning_rate": 2.7103441815068656e-06, + "loss": 0.1532, + "step": 8080 + }, + { + "epoch": 0.8374961135869002, + "grad_norm": 0.760511577129364, + "learning_rate": 2.706970520433192e-06, + "loss": 0.2175, + "step": 8081 + }, + { + "epoch": 0.8375997512695617, + "grad_norm": 0.6827983856201172, + "learning_rate": 2.703598807911203e-06, + "loss": 0.1964, + "step": 8082 + }, + { + "epoch": 0.8377033889522231, + "grad_norm": 0.566498875617981, + "learning_rate": 2.7002290443208056e-06, + "loss": 0.1586, + "step": 8083 + }, + { + "epoch": 0.8378070266348845, + "grad_norm": 0.619435727596283, + "learning_rate": 2.696861230041714e-06, + "loss": 0.1744, + "step": 8084 + }, + { + "epoch": 0.8379106643175459, + "grad_norm": 0.6570318937301636, + "learning_rate": 2.693495365453398e-06, + "loss": 0.2029, + "step": 8085 + }, + { + "epoch": 0.8380143020002073, + "grad_norm": 0.7254251837730408, + "learning_rate": 2.6901314509351183e-06, + "loss": 0.203, + "step": 8086 + }, + { + "epoch": 0.8381179396828687, + "grad_norm": 0.7361481785774231, + "learning_rate": 2.6867694868659213e-06, + "loss": 0.2305, + "step": 8087 + }, + { + "epoch": 0.8382215773655302, + "grad_norm": 0.5852012038230896, + "learning_rate": 2.6834094736246207e-06, + "loss": 0.1895, + "step": 8088 + }, + { + "epoch": 0.8383252150481916, + "grad_norm": 0.5840774178504944, + "learning_rate": 2.680051411589826e-06, + "loss": 0.1664, + "step": 8089 + }, + { + "epoch": 0.838428852730853, + "grad_norm": 0.6098830699920654, + "learning_rate": 2.676695301139909e-06, + "loss": 0.1635, + "step": 8090 + }, + { + "epoch": 0.8385324904135144, + "grad_norm": 0.6075127720832825, + "learning_rate": 2.6733411426530385e-06, + "loss": 0.1635, + "step": 8091 + }, + { + "epoch": 0.8386361280961757, + "grad_norm": 0.6864191293716431, + "learning_rate": 2.669988936507155e-06, + "loss": 0.1931, + "step": 8092 + }, + { + "epoch": 0.8387397657788371, + "grad_norm": 0.6276586055755615, + "learning_rate": 2.666638683079974e-06, + "loss": 0.1773, + "step": 8093 + }, + { + "epoch": 0.8388434034614985, + "grad_norm": 0.7612707018852234, + "learning_rate": 2.6632903827490063e-06, + "loss": 0.2042, + "step": 8094 + }, + { + "epoch": 0.83894704114416, + "grad_norm": 0.8222978711128235, + "learning_rate": 2.659944035891522e-06, + "loss": 0.2243, + "step": 8095 + }, + { + "epoch": 0.8390506788268214, + "grad_norm": 0.5825135707855225, + "learning_rate": 2.6565996428845873e-06, + "loss": 0.1806, + "step": 8096 + }, + { + "epoch": 0.8391543165094828, + "grad_norm": 0.6076071858406067, + "learning_rate": 2.653257204105051e-06, + "loss": 0.1851, + "step": 8097 + }, + { + "epoch": 0.8392579541921442, + "grad_norm": 0.6137374639511108, + "learning_rate": 2.6499167199295263e-06, + "loss": 0.159, + "step": 8098 + }, + { + "epoch": 0.8393615918748056, + "grad_norm": 0.5743726491928101, + "learning_rate": 2.6465781907344124e-06, + "loss": 0.1445, + "step": 8099 + }, + { + "epoch": 0.839465229557467, + "grad_norm": 0.722928524017334, + "learning_rate": 2.6432416168958887e-06, + "loss": 0.1987, + "step": 8100 + }, + { + "epoch": 0.8395688672401285, + "grad_norm": 0.6331915259361267, + "learning_rate": 2.6399069987899163e-06, + "loss": 0.1782, + "step": 8101 + }, + { + "epoch": 0.8396725049227899, + "grad_norm": 0.5965705513954163, + "learning_rate": 2.6365743367922434e-06, + "loss": 0.1649, + "step": 8102 + }, + { + "epoch": 0.8397761426054513, + "grad_norm": 0.7161065340042114, + "learning_rate": 2.633243631278375e-06, + "loss": 0.1848, + "step": 8103 + }, + { + "epoch": 0.8398797802881127, + "grad_norm": 0.7084481120109558, + "learning_rate": 2.6299148826236233e-06, + "loss": 0.216, + "step": 8104 + }, + { + "epoch": 0.8399834179707741, + "grad_norm": 0.6854572892189026, + "learning_rate": 2.626588091203062e-06, + "loss": 0.2289, + "step": 8105 + }, + { + "epoch": 0.8400870556534356, + "grad_norm": 0.7259882092475891, + "learning_rate": 2.6232632573915397e-06, + "loss": 0.2186, + "step": 8106 + }, + { + "epoch": 0.840190693336097, + "grad_norm": 0.7211769819259644, + "learning_rate": 2.619940381563706e-06, + "loss": 0.1835, + "step": 8107 + }, + { + "epoch": 0.8402943310187584, + "grad_norm": 0.6280680298805237, + "learning_rate": 2.616619464093968e-06, + "loss": 0.1744, + "step": 8108 + }, + { + "epoch": 0.8403979687014198, + "grad_norm": 0.708250105381012, + "learning_rate": 2.6133005053565306e-06, + "loss": 0.1952, + "step": 8109 + }, + { + "epoch": 0.8405016063840812, + "grad_norm": 0.7434312701225281, + "learning_rate": 2.60998350572536e-06, + "loss": 0.1949, + "step": 8110 + }, + { + "epoch": 0.8406052440667426, + "grad_norm": 0.7382619380950928, + "learning_rate": 2.606668465574218e-06, + "loss": 0.1763, + "step": 8111 + }, + { + "epoch": 0.8407088817494041, + "grad_norm": 0.607403576374054, + "learning_rate": 2.6033553852766356e-06, + "loss": 0.1779, + "step": 8112 + }, + { + "epoch": 0.8408125194320655, + "grad_norm": 0.6638832092285156, + "learning_rate": 2.600044265205921e-06, + "loss": 0.1804, + "step": 8113 + }, + { + "epoch": 0.8409161571147269, + "grad_norm": 0.702063262462616, + "learning_rate": 2.596735105735173e-06, + "loss": 0.1919, + "step": 8114 + }, + { + "epoch": 0.8410197947973883, + "grad_norm": 0.6962888836860657, + "learning_rate": 2.5934279072372558e-06, + "loss": 0.1829, + "step": 8115 + }, + { + "epoch": 0.8411234324800497, + "grad_norm": 0.6257210969924927, + "learning_rate": 2.5901226700848226e-06, + "loss": 0.1742, + "step": 8116 + }, + { + "epoch": 0.8412270701627111, + "grad_norm": 0.7082609534263611, + "learning_rate": 2.5868193946503106e-06, + "loss": 0.188, + "step": 8117 + }, + { + "epoch": 0.8413307078453726, + "grad_norm": 0.5426973104476929, + "learning_rate": 2.583518081305911e-06, + "loss": 0.1474, + "step": 8118 + }, + { + "epoch": 0.841434345528034, + "grad_norm": 0.6663376688957214, + "learning_rate": 2.580218730423627e-06, + "loss": 0.1948, + "step": 8119 + }, + { + "epoch": 0.8415379832106954, + "grad_norm": 0.6552466154098511, + "learning_rate": 2.5769213423752093e-06, + "loss": 0.1686, + "step": 8120 + }, + { + "epoch": 0.8416416208933568, + "grad_norm": 0.7427363991737366, + "learning_rate": 2.573625917532212e-06, + "loss": 0.2328, + "step": 8121 + }, + { + "epoch": 0.8417452585760182, + "grad_norm": 0.7532801032066345, + "learning_rate": 2.5703324562659605e-06, + "loss": 0.2163, + "step": 8122 + }, + { + "epoch": 0.8418488962586796, + "grad_norm": 0.6454771757125854, + "learning_rate": 2.567040958947551e-06, + "loss": 0.1798, + "step": 8123 + }, + { + "epoch": 0.8419525339413411, + "grad_norm": 0.6346676349639893, + "learning_rate": 2.563751425947869e-06, + "loss": 0.1686, + "step": 8124 + }, + { + "epoch": 0.8420561716240025, + "grad_norm": 0.679057776927948, + "learning_rate": 2.560463857637565e-06, + "loss": 0.2044, + "step": 8125 + }, + { + "epoch": 0.8421598093066639, + "grad_norm": 0.8455599546432495, + "learning_rate": 2.5571782543870826e-06, + "loss": 0.2225, + "step": 8126 + }, + { + "epoch": 0.8422634469893253, + "grad_norm": 0.6194790005683899, + "learning_rate": 2.5538946165666457e-06, + "loss": 0.158, + "step": 8127 + }, + { + "epoch": 0.8423670846719867, + "grad_norm": 0.6403823494911194, + "learning_rate": 2.550612944546238e-06, + "loss": 0.1891, + "step": 8128 + }, + { + "epoch": 0.8424707223546481, + "grad_norm": 0.6203027367591858, + "learning_rate": 2.5473332386956417e-06, + "loss": 0.1667, + "step": 8129 + }, + { + "epoch": 0.8425743600373096, + "grad_norm": 0.6994303464889526, + "learning_rate": 2.544055499384406e-06, + "loss": 0.1789, + "step": 8130 + }, + { + "epoch": 0.842677997719971, + "grad_norm": 0.6344105005264282, + "learning_rate": 2.5407797269818546e-06, + "loss": 0.1967, + "step": 8131 + }, + { + "epoch": 0.8427816354026324, + "grad_norm": 0.7821642756462097, + "learning_rate": 2.5375059218571084e-06, + "loss": 0.217, + "step": 8132 + }, + { + "epoch": 0.8428852730852938, + "grad_norm": 0.6653828024864197, + "learning_rate": 2.5342340843790458e-06, + "loss": 0.1609, + "step": 8133 + }, + { + "epoch": 0.8429889107679552, + "grad_norm": 0.6741253733634949, + "learning_rate": 2.5309642149163384e-06, + "loss": 0.1892, + "step": 8134 + }, + { + "epoch": 0.8430925484506167, + "grad_norm": 0.7713050246238708, + "learning_rate": 2.527696313837422e-06, + "loss": 0.205, + "step": 8135 + }, + { + "epoch": 0.8431961861332781, + "grad_norm": 0.7960719466209412, + "learning_rate": 2.52443038151053e-06, + "loss": 0.2105, + "step": 8136 + }, + { + "epoch": 0.8432998238159395, + "grad_norm": 0.7390683889389038, + "learning_rate": 2.5211664183036554e-06, + "loss": 0.1933, + "step": 8137 + }, + { + "epoch": 0.8434034614986009, + "grad_norm": 0.6969931721687317, + "learning_rate": 2.517904424584574e-06, + "loss": 0.2129, + "step": 8138 + }, + { + "epoch": 0.8435070991812623, + "grad_norm": 0.626531720161438, + "learning_rate": 2.5146444007208493e-06, + "loss": 0.1689, + "step": 8139 + }, + { + "epoch": 0.8436107368639237, + "grad_norm": 0.760380744934082, + "learning_rate": 2.5113863470798074e-06, + "loss": 0.2185, + "step": 8140 + }, + { + "epoch": 0.8437143745465852, + "grad_norm": 0.8247089385986328, + "learning_rate": 2.5081302640285656e-06, + "loss": 0.2483, + "step": 8141 + }, + { + "epoch": 0.8438180122292466, + "grad_norm": 0.5384553074836731, + "learning_rate": 2.504876151934017e-06, + "loss": 0.1601, + "step": 8142 + }, + { + "epoch": 0.843921649911908, + "grad_norm": 0.6652536392211914, + "learning_rate": 2.501624011162829e-06, + "loss": 0.2186, + "step": 8143 + }, + { + "epoch": 0.8440252875945694, + "grad_norm": 0.7298545241355896, + "learning_rate": 2.498373842081443e-06, + "loss": 0.2247, + "step": 8144 + }, + { + "epoch": 0.8441289252772308, + "grad_norm": 0.726304829120636, + "learning_rate": 2.4951256450560844e-06, + "loss": 0.2118, + "step": 8145 + }, + { + "epoch": 0.8442325629598922, + "grad_norm": 0.6961712837219238, + "learning_rate": 2.491879420452754e-06, + "loss": 0.1772, + "step": 8146 + }, + { + "epoch": 0.8443362006425537, + "grad_norm": 0.7808390259742737, + "learning_rate": 2.488635168637239e-06, + "loss": 0.2232, + "step": 8147 + }, + { + "epoch": 0.8444398383252151, + "grad_norm": 0.705258309841156, + "learning_rate": 2.485392889975091e-06, + "loss": 0.1916, + "step": 8148 + }, + { + "epoch": 0.8445434760078765, + "grad_norm": 0.7543384432792664, + "learning_rate": 2.4821525848316454e-06, + "loss": 0.2208, + "step": 8149 + }, + { + "epoch": 0.8446471136905379, + "grad_norm": 0.7452192306518555, + "learning_rate": 2.478914253572011e-06, + "loss": 0.2098, + "step": 8150 + }, + { + "epoch": 0.8447507513731993, + "grad_norm": 0.8277132511138916, + "learning_rate": 2.4756778965610794e-06, + "loss": 0.2144, + "step": 8151 + }, + { + "epoch": 0.8448543890558607, + "grad_norm": 0.553561270236969, + "learning_rate": 2.472443514163525e-06, + "loss": 0.1629, + "step": 8152 + }, + { + "epoch": 0.8449580267385222, + "grad_norm": 0.694131076335907, + "learning_rate": 2.469211106743785e-06, + "loss": 0.1864, + "step": 8153 + }, + { + "epoch": 0.8450616644211836, + "grad_norm": 0.6504413485527039, + "learning_rate": 2.465980674666091e-06, + "loss": 0.1743, + "step": 8154 + }, + { + "epoch": 0.845165302103845, + "grad_norm": 0.6953386664390564, + "learning_rate": 2.462752218294435e-06, + "loss": 0.1974, + "step": 8155 + }, + { + "epoch": 0.8452689397865064, + "grad_norm": 0.7989981770515442, + "learning_rate": 2.4595257379925943e-06, + "loss": 0.2132, + "step": 8156 + }, + { + "epoch": 0.8453725774691678, + "grad_norm": 0.6267284750938416, + "learning_rate": 2.45630123412413e-06, + "loss": 0.1682, + "step": 8157 + }, + { + "epoch": 0.8454762151518292, + "grad_norm": 0.6220939755439758, + "learning_rate": 2.4530787070523655e-06, + "loss": 0.1936, + "step": 8158 + }, + { + "epoch": 0.8455798528344907, + "grad_norm": 0.6248716711997986, + "learning_rate": 2.4498581571404188e-06, + "loss": 0.1933, + "step": 8159 + }, + { + "epoch": 0.8456834905171521, + "grad_norm": 0.6560169458389282, + "learning_rate": 2.446639584751169e-06, + "loss": 0.1932, + "step": 8160 + }, + { + "epoch": 0.8457871281998135, + "grad_norm": 0.723879873752594, + "learning_rate": 2.4434229902472882e-06, + "loss": 0.2124, + "step": 8161 + }, + { + "epoch": 0.8458907658824749, + "grad_norm": 0.671961784362793, + "learning_rate": 2.440208373991213e-06, + "loss": 0.2075, + "step": 8162 + }, + { + "epoch": 0.8459944035651363, + "grad_norm": 0.7564287185668945, + "learning_rate": 2.4369957363451557e-06, + "loss": 0.2031, + "step": 8163 + }, + { + "epoch": 0.8460980412477977, + "grad_norm": 0.7248294353485107, + "learning_rate": 2.4337850776711223e-06, + "loss": 0.209, + "step": 8164 + }, + { + "epoch": 0.8462016789304592, + "grad_norm": 0.663909375667572, + "learning_rate": 2.430576398330873e-06, + "loss": 0.2041, + "step": 8165 + }, + { + "epoch": 0.8463053166131206, + "grad_norm": 0.647461473941803, + "learning_rate": 2.427369698685964e-06, + "loss": 0.1846, + "step": 8166 + }, + { + "epoch": 0.846408954295782, + "grad_norm": 0.5828126072883606, + "learning_rate": 2.424164979097725e-06, + "loss": 0.1643, + "step": 8167 + }, + { + "epoch": 0.8465125919784433, + "grad_norm": 0.690223753452301, + "learning_rate": 2.420962239927254e-06, + "loss": 0.19, + "step": 8168 + }, + { + "epoch": 0.8466162296611047, + "grad_norm": 0.6529852151870728, + "learning_rate": 2.41776148153543e-06, + "loss": 0.197, + "step": 8169 + }, + { + "epoch": 0.8467198673437661, + "grad_norm": 0.6087262630462646, + "learning_rate": 2.4145627042829056e-06, + "loss": 0.1779, + "step": 8170 + }, + { + "epoch": 0.8468235050264276, + "grad_norm": 0.6241489052772522, + "learning_rate": 2.411365908530119e-06, + "loss": 0.1634, + "step": 8171 + }, + { + "epoch": 0.846927142709089, + "grad_norm": 0.6478610634803772, + "learning_rate": 2.408171094637284e-06, + "loss": 0.1907, + "step": 8172 + }, + { + "epoch": 0.8470307803917504, + "grad_norm": 0.7067448496818542, + "learning_rate": 2.404978262964379e-06, + "loss": 0.1986, + "step": 8173 + }, + { + "epoch": 0.8471344180744118, + "grad_norm": 0.6290725469589233, + "learning_rate": 2.401787413871175e-06, + "loss": 0.167, + "step": 8174 + }, + { + "epoch": 0.8472380557570732, + "grad_norm": 0.6874917149543762, + "learning_rate": 2.398598547717208e-06, + "loss": 0.216, + "step": 8175 + }, + { + "epoch": 0.8473416934397346, + "grad_norm": 0.6628933548927307, + "learning_rate": 2.3954116648617907e-06, + "loss": 0.1737, + "step": 8176 + }, + { + "epoch": 0.8474453311223961, + "grad_norm": 0.7646678686141968, + "learning_rate": 2.3922267656640253e-06, + "loss": 0.2003, + "step": 8177 + }, + { + "epoch": 0.8475489688050575, + "grad_norm": 0.7109581232070923, + "learning_rate": 2.3890438504827706e-06, + "loss": 0.2019, + "step": 8178 + }, + { + "epoch": 0.8476526064877189, + "grad_norm": 0.628286600112915, + "learning_rate": 2.3858629196766846e-06, + "loss": 0.1878, + "step": 8179 + }, + { + "epoch": 0.8477562441703803, + "grad_norm": 0.5861483812332153, + "learning_rate": 2.382683973604181e-06, + "loss": 0.1746, + "step": 8180 + }, + { + "epoch": 0.8478598818530417, + "grad_norm": 0.631664514541626, + "learning_rate": 2.3795070126234563e-06, + "loss": 0.1521, + "step": 8181 + }, + { + "epoch": 0.8479635195357031, + "grad_norm": 0.7326377630233765, + "learning_rate": 2.376332037092495e-06, + "loss": 0.2062, + "step": 8182 + }, + { + "epoch": 0.8480671572183646, + "grad_norm": 0.7119858860969543, + "learning_rate": 2.373159047369038e-06, + "loss": 0.1766, + "step": 8183 + }, + { + "epoch": 0.848170794901026, + "grad_norm": 0.6483974456787109, + "learning_rate": 2.3699880438106225e-06, + "loss": 0.1695, + "step": 8184 + }, + { + "epoch": 0.8482744325836874, + "grad_norm": 0.5860996842384338, + "learning_rate": 2.366819026774545e-06, + "loss": 0.1443, + "step": 8185 + }, + { + "epoch": 0.8483780702663488, + "grad_norm": 0.6049185991287231, + "learning_rate": 2.3636519966178905e-06, + "loss": 0.1847, + "step": 8186 + }, + { + "epoch": 0.8484817079490102, + "grad_norm": 0.7595677375793457, + "learning_rate": 2.360486953697516e-06, + "loss": 0.2154, + "step": 8187 + }, + { + "epoch": 0.8485853456316717, + "grad_norm": 0.6683211326599121, + "learning_rate": 2.3573238983700432e-06, + "loss": 0.206, + "step": 8188 + }, + { + "epoch": 0.8486889833143331, + "grad_norm": 0.7454718351364136, + "learning_rate": 2.3541628309918886e-06, + "loss": 0.2157, + "step": 8189 + }, + { + "epoch": 0.8487926209969945, + "grad_norm": 0.7679634094238281, + "learning_rate": 2.3510037519192385e-06, + "loss": 0.2123, + "step": 8190 + }, + { + "epoch": 0.8488962586796559, + "grad_norm": 0.6262506246566772, + "learning_rate": 2.3478466615080465e-06, + "loss": 0.1947, + "step": 8191 + }, + { + "epoch": 0.8489998963623173, + "grad_norm": 0.7192820310592651, + "learning_rate": 2.3446915601140564e-06, + "loss": 0.2036, + "step": 8192 + }, + { + "epoch": 0.8491035340449787, + "grad_norm": 0.7609272599220276, + "learning_rate": 2.341538448092775e-06, + "loss": 0.1991, + "step": 8193 + }, + { + "epoch": 0.8492071717276402, + "grad_norm": 0.6904714107513428, + "learning_rate": 2.3383873257994848e-06, + "loss": 0.1854, + "step": 8194 + }, + { + "epoch": 0.8493108094103016, + "grad_norm": 0.741290807723999, + "learning_rate": 2.3352381935892622e-06, + "loss": 0.2169, + "step": 8195 + }, + { + "epoch": 0.849414447092963, + "grad_norm": 0.5596386790275574, + "learning_rate": 2.3320910518169335e-06, + "loss": 0.1678, + "step": 8196 + }, + { + "epoch": 0.8495180847756244, + "grad_norm": 0.7157734036445618, + "learning_rate": 2.328945900837125e-06, + "loss": 0.2256, + "step": 8197 + }, + { + "epoch": 0.8496217224582858, + "grad_norm": 0.6879149675369263, + "learning_rate": 2.3258027410042173e-06, + "loss": 0.2025, + "step": 8198 + }, + { + "epoch": 0.8497253601409472, + "grad_norm": 0.7603225708007812, + "learning_rate": 2.3226615726723868e-06, + "loss": 0.212, + "step": 8199 + }, + { + "epoch": 0.8498289978236087, + "grad_norm": 0.6839963793754578, + "learning_rate": 2.3195223961955705e-06, + "loss": 0.2131, + "step": 8200 + }, + { + "epoch": 0.8499326355062701, + "grad_norm": 0.5889342427253723, + "learning_rate": 2.316385211927479e-06, + "loss": 0.163, + "step": 8201 + }, + { + "epoch": 0.8500362731889315, + "grad_norm": 0.6352614164352417, + "learning_rate": 2.313250020221618e-06, + "loss": 0.1916, + "step": 8202 + }, + { + "epoch": 0.8501399108715929, + "grad_norm": 0.6570771336555481, + "learning_rate": 2.3101168214312474e-06, + "loss": 0.155, + "step": 8203 + }, + { + "epoch": 0.8502435485542543, + "grad_norm": 0.79152911901474, + "learning_rate": 2.3069856159094115e-06, + "loss": 0.222, + "step": 8204 + }, + { + "epoch": 0.8503471862369157, + "grad_norm": 0.7477522492408752, + "learning_rate": 2.3038564040089374e-06, + "loss": 0.2124, + "step": 8205 + }, + { + "epoch": 0.8504508239195772, + "grad_norm": 0.7140838503837585, + "learning_rate": 2.3007291860824155e-06, + "loss": 0.1901, + "step": 8206 + }, + { + "epoch": 0.8505544616022386, + "grad_norm": 0.5891298651695251, + "learning_rate": 2.2976039624822133e-06, + "loss": 0.1695, + "step": 8207 + }, + { + "epoch": 0.8506580992849, + "grad_norm": 0.6833799481391907, + "learning_rate": 2.2944807335604733e-06, + "loss": 0.191, + "step": 8208 + }, + { + "epoch": 0.8507617369675614, + "grad_norm": 0.6844556331634521, + "learning_rate": 2.2913594996691212e-06, + "loss": 0.1878, + "step": 8209 + }, + { + "epoch": 0.8508653746502228, + "grad_norm": 0.6452999711036682, + "learning_rate": 2.2882402611598574e-06, + "loss": 0.1819, + "step": 8210 + }, + { + "epoch": 0.8509690123328842, + "grad_norm": 0.6083611845970154, + "learning_rate": 2.2851230183841453e-06, + "loss": 0.1653, + "step": 8211 + }, + { + "epoch": 0.8510726500155457, + "grad_norm": 0.6736093759536743, + "learning_rate": 2.2820077716932353e-06, + "loss": 0.1728, + "step": 8212 + }, + { + "epoch": 0.8511762876982071, + "grad_norm": 0.6951460838317871, + "learning_rate": 2.27889452143814e-06, + "loss": 0.1975, + "step": 8213 + }, + { + "epoch": 0.8512799253808685, + "grad_norm": 1.0119082927703857, + "learning_rate": 2.2757832679696645e-06, + "loss": 0.2236, + "step": 8214 + }, + { + "epoch": 0.8513835630635299, + "grad_norm": 0.7682976126670837, + "learning_rate": 2.2726740116383805e-06, + "loss": 0.2365, + "step": 8215 + }, + { + "epoch": 0.8514872007461913, + "grad_norm": 0.7260084748268127, + "learning_rate": 2.2695667527946253e-06, + "loss": 0.2021, + "step": 8216 + }, + { + "epoch": 0.8515908384288527, + "grad_norm": 0.663433313369751, + "learning_rate": 2.2664614917885318e-06, + "loss": 0.1833, + "step": 8217 + }, + { + "epoch": 0.8516944761115142, + "grad_norm": 0.6920781135559082, + "learning_rate": 2.263358228969992e-06, + "loss": 0.1774, + "step": 8218 + }, + { + "epoch": 0.8517981137941756, + "grad_norm": 0.6206676959991455, + "learning_rate": 2.2602569646886674e-06, + "loss": 0.1613, + "step": 8219 + }, + { + "epoch": 0.851901751476837, + "grad_norm": 0.7389729619026184, + "learning_rate": 2.257157699294017e-06, + "loss": 0.2059, + "step": 8220 + }, + { + "epoch": 0.8520053891594984, + "grad_norm": 0.6792731285095215, + "learning_rate": 2.25406043313525e-06, + "loss": 0.1891, + "step": 8221 + }, + { + "epoch": 0.8521090268421598, + "grad_norm": 0.6999104022979736, + "learning_rate": 2.2509651665613717e-06, + "loss": 0.2218, + "step": 8222 + }, + { + "epoch": 0.8522126645248213, + "grad_norm": 0.714362621307373, + "learning_rate": 2.2478718999211436e-06, + "loss": 0.1854, + "step": 8223 + }, + { + "epoch": 0.8523163022074827, + "grad_norm": 0.8368086218833923, + "learning_rate": 2.244780633563115e-06, + "loss": 0.2328, + "step": 8224 + }, + { + "epoch": 0.8524199398901441, + "grad_norm": 0.6363280415534973, + "learning_rate": 2.2416913678356054e-06, + "loss": 0.1777, + "step": 8225 + }, + { + "epoch": 0.8525235775728055, + "grad_norm": 0.6111525893211365, + "learning_rate": 2.2386041030867034e-06, + "loss": 0.1913, + "step": 8226 + }, + { + "epoch": 0.8526272152554669, + "grad_norm": 0.760571300983429, + "learning_rate": 2.2355188396642833e-06, + "loss": 0.2093, + "step": 8227 + }, + { + "epoch": 0.8527308529381283, + "grad_norm": 0.7413966059684753, + "learning_rate": 2.232435577915981e-06, + "loss": 0.2058, + "step": 8228 + }, + { + "epoch": 0.8528344906207898, + "grad_norm": 0.6896591782569885, + "learning_rate": 2.2293543181892186e-06, + "loss": 0.1985, + "step": 8229 + }, + { + "epoch": 0.8529381283034512, + "grad_norm": 0.6558910608291626, + "learning_rate": 2.226275060831189e-06, + "loss": 0.1944, + "step": 8230 + }, + { + "epoch": 0.8530417659861126, + "grad_norm": 0.6779370903968811, + "learning_rate": 2.223197806188857e-06, + "loss": 0.1768, + "step": 8231 + }, + { + "epoch": 0.853145403668774, + "grad_norm": 0.5688185095787048, + "learning_rate": 2.2201225546089612e-06, + "loss": 0.1503, + "step": 8232 + }, + { + "epoch": 0.8532490413514354, + "grad_norm": 0.7514565587043762, + "learning_rate": 2.2170493064380126e-06, + "loss": 0.2315, + "step": 8233 + }, + { + "epoch": 0.8533526790340968, + "grad_norm": 0.7149367332458496, + "learning_rate": 2.213978062022304e-06, + "loss": 0.1942, + "step": 8234 + }, + { + "epoch": 0.8534563167167583, + "grad_norm": 0.6362113356590271, + "learning_rate": 2.2109088217079023e-06, + "loss": 0.1959, + "step": 8235 + }, + { + "epoch": 0.8535599543994197, + "grad_norm": 0.6998996138572693, + "learning_rate": 2.2078415858406377e-06, + "loss": 0.196, + "step": 8236 + }, + { + "epoch": 0.8536635920820811, + "grad_norm": 0.6976892352104187, + "learning_rate": 2.2047763547661295e-06, + "loss": 0.1936, + "step": 8237 + }, + { + "epoch": 0.8537672297647425, + "grad_norm": 0.6329091787338257, + "learning_rate": 2.2017131288297567e-06, + "loss": 0.1782, + "step": 8238 + }, + { + "epoch": 0.8538708674474039, + "grad_norm": 0.6380835771560669, + "learning_rate": 2.1986519083766767e-06, + "loss": 0.1838, + "step": 8239 + }, + { + "epoch": 0.8539745051300653, + "grad_norm": 0.702772855758667, + "learning_rate": 2.1955926937518314e-06, + "loss": 0.1531, + "step": 8240 + }, + { + "epoch": 0.8540781428127268, + "grad_norm": 0.5946405529975891, + "learning_rate": 2.1925354852999204e-06, + "loss": 0.1662, + "step": 8241 + }, + { + "epoch": 0.8541817804953882, + "grad_norm": 0.7317295670509338, + "learning_rate": 2.1894802833654305e-06, + "loss": 0.2033, + "step": 8242 + }, + { + "epoch": 0.8542854181780496, + "grad_norm": 0.6746308207511902, + "learning_rate": 2.1864270882926176e-06, + "loss": 0.1898, + "step": 8243 + }, + { + "epoch": 0.8543890558607109, + "grad_norm": 0.6947521567344666, + "learning_rate": 2.183375900425504e-06, + "loss": 0.1915, + "step": 8244 + }, + { + "epoch": 0.8544926935433723, + "grad_norm": 0.6422945857048035, + "learning_rate": 2.1803267201079015e-06, + "loss": 0.1886, + "step": 8245 + }, + { + "epoch": 0.8545963312260337, + "grad_norm": 0.6722944378852844, + "learning_rate": 2.1772795476833776e-06, + "loss": 0.217, + "step": 8246 + }, + { + "epoch": 0.8546999689086952, + "grad_norm": 0.731451690196991, + "learning_rate": 2.174234383495293e-06, + "loss": 0.1954, + "step": 8247 + }, + { + "epoch": 0.8548036065913566, + "grad_norm": 0.5368337035179138, + "learning_rate": 2.171191227886764e-06, + "loss": 0.1504, + "step": 8248 + }, + { + "epoch": 0.854907244274018, + "grad_norm": 0.7373712062835693, + "learning_rate": 2.168150081200697e-06, + "loss": 0.1878, + "step": 8249 + }, + { + "epoch": 0.8550108819566794, + "grad_norm": 0.6935915350914001, + "learning_rate": 2.165110943779756e-06, + "loss": 0.1821, + "step": 8250 + }, + { + "epoch": 0.8551145196393408, + "grad_norm": 0.7318151593208313, + "learning_rate": 2.162073815966388e-06, + "loss": 0.2027, + "step": 8251 + }, + { + "epoch": 0.8552181573220022, + "grad_norm": 0.7722030878067017, + "learning_rate": 2.1590386981028154e-06, + "loss": 0.1945, + "step": 8252 + }, + { + "epoch": 0.8553217950046637, + "grad_norm": 0.640789270401001, + "learning_rate": 2.1560055905310227e-06, + "loss": 0.1789, + "step": 8253 + }, + { + "epoch": 0.8554254326873251, + "grad_norm": 0.6491336226463318, + "learning_rate": 2.1529744935927834e-06, + "loss": 0.1898, + "step": 8254 + }, + { + "epoch": 0.8555290703699865, + "grad_norm": 0.7431808710098267, + "learning_rate": 2.1499454076296365e-06, + "loss": 0.2145, + "step": 8255 + }, + { + "epoch": 0.8556327080526479, + "grad_norm": 0.747519850730896, + "learning_rate": 2.1469183329828925e-06, + "loss": 0.2015, + "step": 8256 + }, + { + "epoch": 0.8557363457353093, + "grad_norm": 0.7901133894920349, + "learning_rate": 2.14389326999364e-06, + "loss": 0.2204, + "step": 8257 + }, + { + "epoch": 0.8558399834179707, + "grad_norm": 0.6684821844100952, + "learning_rate": 2.140870219002731e-06, + "loss": 0.197, + "step": 8258 + }, + { + "epoch": 0.8559436211006322, + "grad_norm": 0.6593709588050842, + "learning_rate": 2.137849180350802e-06, + "loss": 0.1911, + "step": 8259 + }, + { + "epoch": 0.8560472587832936, + "grad_norm": 0.7188991904258728, + "learning_rate": 2.1348301543782648e-06, + "loss": 0.2288, + "step": 8260 + }, + { + "epoch": 0.856150896465955, + "grad_norm": 0.683214008808136, + "learning_rate": 2.1318131414252895e-06, + "loss": 0.185, + "step": 8261 + }, + { + "epoch": 0.8562545341486164, + "grad_norm": 0.6181463599205017, + "learning_rate": 2.128798141831836e-06, + "loss": 0.1876, + "step": 8262 + }, + { + "epoch": 0.8563581718312778, + "grad_norm": 0.6907486319541931, + "learning_rate": 2.1257851559376296e-06, + "loss": 0.1847, + "step": 8263 + }, + { + "epoch": 0.8564618095139392, + "grad_norm": 0.6637782454490662, + "learning_rate": 2.122774184082159e-06, + "loss": 0.1711, + "step": 8264 + }, + { + "epoch": 0.8565654471966007, + "grad_norm": 0.6911178827285767, + "learning_rate": 2.1197652266047064e-06, + "loss": 0.2138, + "step": 8265 + }, + { + "epoch": 0.8566690848792621, + "grad_norm": 0.5903525948524475, + "learning_rate": 2.116758283844311e-06, + "loss": 0.1618, + "step": 8266 + }, + { + "epoch": 0.8567727225619235, + "grad_norm": 0.6123262047767639, + "learning_rate": 2.1137533561397937e-06, + "loss": 0.1849, + "step": 8267 + }, + { + "epoch": 0.8568763602445849, + "grad_norm": 0.5878732800483704, + "learning_rate": 2.110750443829741e-06, + "loss": 0.1721, + "step": 8268 + }, + { + "epoch": 0.8569799979272463, + "grad_norm": 0.7839241623878479, + "learning_rate": 2.1077495472525242e-06, + "loss": 0.2296, + "step": 8269 + }, + { + "epoch": 0.8570836356099077, + "grad_norm": 0.6753618717193604, + "learning_rate": 2.1047506667462713e-06, + "loss": 0.1854, + "step": 8270 + }, + { + "epoch": 0.8571872732925692, + "grad_norm": 0.751511812210083, + "learning_rate": 2.1017538026488936e-06, + "loss": 0.1914, + "step": 8271 + }, + { + "epoch": 0.8572909109752306, + "grad_norm": 0.639025866985321, + "learning_rate": 2.098758955298077e-06, + "loss": 0.1722, + "step": 8272 + }, + { + "epoch": 0.857394548657892, + "grad_norm": 0.7590596079826355, + "learning_rate": 2.095766125031269e-06, + "loss": 0.2329, + "step": 8273 + }, + { + "epoch": 0.8574981863405534, + "grad_norm": 0.7018405795097351, + "learning_rate": 2.0927753121857043e-06, + "loss": 0.2226, + "step": 8274 + }, + { + "epoch": 0.8576018240232148, + "grad_norm": 0.6705461144447327, + "learning_rate": 2.08978651709838e-06, + "loss": 0.2002, + "step": 8275 + }, + { + "epoch": 0.8577054617058762, + "grad_norm": 0.681593120098114, + "learning_rate": 2.0867997401060667e-06, + "loss": 0.2266, + "step": 8276 + }, + { + "epoch": 0.8578090993885377, + "grad_norm": 0.590961217880249, + "learning_rate": 2.083814981545316e-06, + "loss": 0.2027, + "step": 8277 + }, + { + "epoch": 0.8579127370711991, + "grad_norm": 0.6383353471755981, + "learning_rate": 2.080832241752437e-06, + "loss": 0.1774, + "step": 8278 + }, + { + "epoch": 0.8580163747538605, + "grad_norm": 0.6680395007133484, + "learning_rate": 2.077851521063525e-06, + "loss": 0.1708, + "step": 8279 + }, + { + "epoch": 0.8581200124365219, + "grad_norm": 0.8208906650543213, + "learning_rate": 2.0748728198144484e-06, + "loss": 0.1869, + "step": 8280 + }, + { + "epoch": 0.8582236501191833, + "grad_norm": 0.6549916863441467, + "learning_rate": 2.0718961383408365e-06, + "loss": 0.1796, + "step": 8281 + }, + { + "epoch": 0.8583272878018448, + "grad_norm": 0.6959506273269653, + "learning_rate": 2.0689214769780962e-06, + "loss": 0.1727, + "step": 8282 + }, + { + "epoch": 0.8584309254845062, + "grad_norm": 0.8060963153839111, + "learning_rate": 2.0659488360614087e-06, + "loss": 0.2413, + "step": 8283 + }, + { + "epoch": 0.8585345631671676, + "grad_norm": 0.7632854580879211, + "learning_rate": 2.062978215925726e-06, + "loss": 0.1939, + "step": 8284 + }, + { + "epoch": 0.858638200849829, + "grad_norm": 0.661284327507019, + "learning_rate": 2.060009616905778e-06, + "loss": 0.1853, + "step": 8285 + }, + { + "epoch": 0.8587418385324904, + "grad_norm": 0.608495831489563, + "learning_rate": 2.057043039336053e-06, + "loss": 0.1729, + "step": 8286 + }, + { + "epoch": 0.8588454762151518, + "grad_norm": 0.7137344479560852, + "learning_rate": 2.0540784835508322e-06, + "loss": 0.1857, + "step": 8287 + }, + { + "epoch": 0.8589491138978133, + "grad_norm": 0.6977378726005554, + "learning_rate": 2.051115949884148e-06, + "loss": 0.1964, + "step": 8288 + }, + { + "epoch": 0.8590527515804747, + "grad_norm": 0.7782437205314636, + "learning_rate": 2.0481554386698142e-06, + "loss": 0.2284, + "step": 8289 + }, + { + "epoch": 0.8591563892631361, + "grad_norm": 0.6119537353515625, + "learning_rate": 2.0451969502414214e-06, + "loss": 0.1852, + "step": 8290 + }, + { + "epoch": 0.8592600269457975, + "grad_norm": 0.7304680347442627, + "learning_rate": 2.0422404849323207e-06, + "loss": 0.2084, + "step": 8291 + }, + { + "epoch": 0.8593636646284589, + "grad_norm": 0.6950478553771973, + "learning_rate": 2.0392860430756523e-06, + "loss": 0.1838, + "step": 8292 + }, + { + "epoch": 0.8594673023111203, + "grad_norm": 0.663688600063324, + "learning_rate": 2.0363336250043074e-06, + "loss": 0.1667, + "step": 8293 + }, + { + "epoch": 0.8595709399937818, + "grad_norm": 0.7153915166854858, + "learning_rate": 2.033383231050967e-06, + "loss": 0.1977, + "step": 8294 + }, + { + "epoch": 0.8596745776764432, + "grad_norm": 0.7251477837562561, + "learning_rate": 2.0304348615480763e-06, + "loss": 0.2058, + "step": 8295 + }, + { + "epoch": 0.8597782153591046, + "grad_norm": 0.6031265258789062, + "learning_rate": 2.0274885168278467e-06, + "loss": 0.1914, + "step": 8296 + }, + { + "epoch": 0.859881853041766, + "grad_norm": 0.735448956489563, + "learning_rate": 2.024544197222276e-06, + "loss": 0.2079, + "step": 8297 + }, + { + "epoch": 0.8599854907244274, + "grad_norm": 0.7274251580238342, + "learning_rate": 2.021601903063117e-06, + "loss": 0.189, + "step": 8298 + }, + { + "epoch": 0.8600891284070888, + "grad_norm": 0.7757635712623596, + "learning_rate": 2.0186616346819087e-06, + "loss": 0.2079, + "step": 8299 + }, + { + "epoch": 0.8601927660897503, + "grad_norm": 0.5823425650596619, + "learning_rate": 2.015723392409958e-06, + "loss": 0.1876, + "step": 8300 + }, + { + "epoch": 0.8602964037724117, + "grad_norm": 0.704903244972229, + "learning_rate": 2.012787176578339e-06, + "loss": 0.2228, + "step": 8301 + }, + { + "epoch": 0.8604000414550731, + "grad_norm": 0.7526519894599915, + "learning_rate": 2.0098529875178995e-06, + "loss": 0.2367, + "step": 8302 + }, + { + "epoch": 0.8605036791377345, + "grad_norm": 0.6222978234291077, + "learning_rate": 2.0069208255592555e-06, + "loss": 0.1892, + "step": 8303 + }, + { + "epoch": 0.8606073168203959, + "grad_norm": 0.7311579585075378, + "learning_rate": 2.0039906910328e-06, + "loss": 0.2006, + "step": 8304 + }, + { + "epoch": 0.8607109545030573, + "grad_norm": 0.6790103316307068, + "learning_rate": 2.0010625842687047e-06, + "loss": 0.1814, + "step": 8305 + }, + { + "epoch": 0.8608145921857188, + "grad_norm": 0.6187705397605896, + "learning_rate": 1.998136505596895e-06, + "loss": 0.1786, + "step": 8306 + }, + { + "epoch": 0.8609182298683802, + "grad_norm": 0.6673712134361267, + "learning_rate": 1.995212455347082e-06, + "loss": 0.1717, + "step": 8307 + }, + { + "epoch": 0.8610218675510416, + "grad_norm": 0.8776508569717407, + "learning_rate": 1.992290433848736e-06, + "loss": 0.2063, + "step": 8308 + }, + { + "epoch": 0.861125505233703, + "grad_norm": 0.7525877952575684, + "learning_rate": 1.9893704414311086e-06, + "loss": 0.2112, + "step": 8309 + }, + { + "epoch": 0.8612291429163644, + "grad_norm": 0.6398442983627319, + "learning_rate": 1.9864524784232265e-06, + "loss": 0.1998, + "step": 8310 + }, + { + "epoch": 0.8613327805990258, + "grad_norm": 0.6990947723388672, + "learning_rate": 1.9835365451538725e-06, + "loss": 0.2018, + "step": 8311 + }, + { + "epoch": 0.8614364182816873, + "grad_norm": 0.6815956830978394, + "learning_rate": 1.9806226419516195e-06, + "loss": 0.2087, + "step": 8312 + }, + { + "epoch": 0.8615400559643487, + "grad_norm": 0.7535561323165894, + "learning_rate": 1.977710769144794e-06, + "loss": 0.2075, + "step": 8313 + }, + { + "epoch": 0.8616436936470101, + "grad_norm": 0.6954166293144226, + "learning_rate": 1.9748009270614978e-06, + "loss": 0.1772, + "step": 8314 + }, + { + "epoch": 0.8617473313296715, + "grad_norm": 0.7022972702980042, + "learning_rate": 1.9718931160296175e-06, + "loss": 0.2064, + "step": 8315 + }, + { + "epoch": 0.8618509690123329, + "grad_norm": 0.6478790044784546, + "learning_rate": 1.9689873363767907e-06, + "loss": 0.1998, + "step": 8316 + }, + { + "epoch": 0.8619546066949944, + "grad_norm": 0.6146255135536194, + "learning_rate": 1.966083588430445e-06, + "loss": 0.1938, + "step": 8317 + }, + { + "epoch": 0.8620582443776558, + "grad_norm": 0.7748193144798279, + "learning_rate": 1.9631818725177654e-06, + "loss": 0.1849, + "step": 8318 + }, + { + "epoch": 0.8621618820603172, + "grad_norm": 0.5730516314506531, + "learning_rate": 1.9602821889657144e-06, + "loss": 0.1722, + "step": 8319 + }, + { + "epoch": 0.8622655197429785, + "grad_norm": 0.643845796585083, + "learning_rate": 1.9573845381010236e-06, + "loss": 0.1853, + "step": 8320 + }, + { + "epoch": 0.8623691574256399, + "grad_norm": 0.7794548869132996, + "learning_rate": 1.9544889202501925e-06, + "loss": 0.1981, + "step": 8321 + }, + { + "epoch": 0.8624727951083013, + "grad_norm": 0.6954991221427917, + "learning_rate": 1.9515953357395e-06, + "loss": 0.2042, + "step": 8322 + }, + { + "epoch": 0.8625764327909627, + "grad_norm": 0.7107481360435486, + "learning_rate": 1.9487037848949854e-06, + "loss": 0.2207, + "step": 8323 + }, + { + "epoch": 0.8626800704736242, + "grad_norm": 0.7179362177848816, + "learning_rate": 1.9458142680424674e-06, + "loss": 0.2147, + "step": 8324 + }, + { + "epoch": 0.8627837081562856, + "grad_norm": 0.7268587350845337, + "learning_rate": 1.942926785507535e-06, + "loss": 0.2107, + "step": 8325 + }, + { + "epoch": 0.862887345838947, + "grad_norm": 0.678972601890564, + "learning_rate": 1.9400413376155414e-06, + "loss": 0.166, + "step": 8326 + }, + { + "epoch": 0.8629909835216084, + "grad_norm": 0.6394493579864502, + "learning_rate": 1.9371579246916173e-06, + "loss": 0.1717, + "step": 8327 + }, + { + "epoch": 0.8630946212042698, + "grad_norm": 0.6306682825088501, + "learning_rate": 1.934276547060654e-06, + "loss": 0.1982, + "step": 8328 + }, + { + "epoch": 0.8631982588869312, + "grad_norm": 0.6881734728813171, + "learning_rate": 1.931397205047325e-06, + "loss": 0.1803, + "step": 8329 + }, + { + "epoch": 0.8633018965695927, + "grad_norm": 0.6395617723464966, + "learning_rate": 1.9285198989760757e-06, + "loss": 0.1674, + "step": 8330 + }, + { + "epoch": 0.8634055342522541, + "grad_norm": 0.7378185987472534, + "learning_rate": 1.925644629171106e-06, + "loss": 0.2131, + "step": 8331 + }, + { + "epoch": 0.8635091719349155, + "grad_norm": 0.6664524674415588, + "learning_rate": 1.9227713959564066e-06, + "loss": 0.1983, + "step": 8332 + }, + { + "epoch": 0.8636128096175769, + "grad_norm": 0.5630848407745361, + "learning_rate": 1.9199001996557263e-06, + "loss": 0.1648, + "step": 8333 + }, + { + "epoch": 0.8637164473002383, + "grad_norm": 0.7485938668251038, + "learning_rate": 1.917031040592581e-06, + "loss": 0.1823, + "step": 8334 + }, + { + "epoch": 0.8638200849828997, + "grad_norm": 0.6663389801979065, + "learning_rate": 1.9141639190902707e-06, + "loss": 0.1965, + "step": 8335 + }, + { + "epoch": 0.8639237226655612, + "grad_norm": 0.6991190314292908, + "learning_rate": 1.911298835471851e-06, + "loss": 0.2125, + "step": 8336 + }, + { + "epoch": 0.8640273603482226, + "grad_norm": 0.6290115714073181, + "learning_rate": 1.9084357900601637e-06, + "loss": 0.1882, + "step": 8337 + }, + { + "epoch": 0.864130998030884, + "grad_norm": 0.5512939691543579, + "learning_rate": 1.9055747831778082e-06, + "loss": 0.1605, + "step": 8338 + }, + { + "epoch": 0.8642346357135454, + "grad_norm": 0.7532790303230286, + "learning_rate": 1.9027158151471537e-06, + "loss": 0.2065, + "step": 8339 + }, + { + "epoch": 0.8643382733962068, + "grad_norm": 0.7798289656639099, + "learning_rate": 1.899858886290351e-06, + "loss": 0.1994, + "step": 8340 + }, + { + "epoch": 0.8644419110788683, + "grad_norm": 0.660067081451416, + "learning_rate": 1.8970039969293096e-06, + "loss": 0.1888, + "step": 8341 + }, + { + "epoch": 0.8645455487615297, + "grad_norm": 0.7333917021751404, + "learning_rate": 1.8941511473857188e-06, + "loss": 0.212, + "step": 8342 + }, + { + "epoch": 0.8646491864441911, + "grad_norm": 0.7680668830871582, + "learning_rate": 1.8913003379810258e-06, + "loss": 0.1958, + "step": 8343 + }, + { + "epoch": 0.8647528241268525, + "grad_norm": 0.688458263874054, + "learning_rate": 1.8884515690364669e-06, + "loss": 0.1967, + "step": 8344 + }, + { + "epoch": 0.8648564618095139, + "grad_norm": 0.6076627969741821, + "learning_rate": 1.8856048408730277e-06, + "loss": 0.1723, + "step": 8345 + }, + { + "epoch": 0.8649600994921753, + "grad_norm": 0.6366537809371948, + "learning_rate": 1.8827601538114736e-06, + "loss": 0.1806, + "step": 8346 + }, + { + "epoch": 0.8650637371748368, + "grad_norm": 0.6349666118621826, + "learning_rate": 1.8799175081723443e-06, + "loss": 0.1895, + "step": 8347 + }, + { + "epoch": 0.8651673748574982, + "grad_norm": 0.6724454760551453, + "learning_rate": 1.8770769042759384e-06, + "loss": 0.1907, + "step": 8348 + }, + { + "epoch": 0.8652710125401596, + "grad_norm": 0.7153465747833252, + "learning_rate": 1.8742383424423339e-06, + "loss": 0.1836, + "step": 8349 + }, + { + "epoch": 0.865374650222821, + "grad_norm": 0.6909909844398499, + "learning_rate": 1.8714018229913812e-06, + "loss": 0.2226, + "step": 8350 + }, + { + "epoch": 0.8654782879054824, + "grad_norm": 0.6756448745727539, + "learning_rate": 1.8685673462426867e-06, + "loss": 0.1931, + "step": 8351 + }, + { + "epoch": 0.8655819255881438, + "grad_norm": 0.6229551434516907, + "learning_rate": 1.8657349125156398e-06, + "loss": 0.1787, + "step": 8352 + }, + { + "epoch": 0.8656855632708053, + "grad_norm": 0.8107147812843323, + "learning_rate": 1.8629045221293894e-06, + "loss": 0.2089, + "step": 8353 + }, + { + "epoch": 0.8657892009534667, + "grad_norm": 0.730259358882904, + "learning_rate": 1.860076175402863e-06, + "loss": 0.1814, + "step": 8354 + }, + { + "epoch": 0.8658928386361281, + "grad_norm": 0.7431645393371582, + "learning_rate": 1.857249872654756e-06, + "loss": 0.2158, + "step": 8355 + }, + { + "epoch": 0.8659964763187895, + "grad_norm": 0.6153815388679504, + "learning_rate": 1.8544256142035278e-06, + "loss": 0.1675, + "step": 8356 + }, + { + "epoch": 0.8661001140014509, + "grad_norm": 0.6440610289573669, + "learning_rate": 1.8516034003674167e-06, + "loss": 0.165, + "step": 8357 + }, + { + "epoch": 0.8662037516841123, + "grad_norm": 0.5951587557792664, + "learning_rate": 1.8487832314644216e-06, + "loss": 0.1589, + "step": 8358 + }, + { + "epoch": 0.8663073893667738, + "grad_norm": 0.6662946939468384, + "learning_rate": 1.845965107812313e-06, + "loss": 0.1968, + "step": 8359 + }, + { + "epoch": 0.8664110270494352, + "grad_norm": 0.6845201253890991, + "learning_rate": 1.8431490297286369e-06, + "loss": 0.1838, + "step": 8360 + }, + { + "epoch": 0.8665146647320966, + "grad_norm": 0.7018457651138306, + "learning_rate": 1.8403349975307017e-06, + "loss": 0.1808, + "step": 8361 + }, + { + "epoch": 0.866618302414758, + "grad_norm": 0.6310449838638306, + "learning_rate": 1.8375230115355913e-06, + "loss": 0.159, + "step": 8362 + }, + { + "epoch": 0.8667219400974194, + "grad_norm": 0.7740344405174255, + "learning_rate": 1.8347130720601503e-06, + "loss": 0.2258, + "step": 8363 + }, + { + "epoch": 0.8668255777800808, + "grad_norm": 0.6985799074172974, + "learning_rate": 1.8319051794210053e-06, + "loss": 0.1968, + "step": 8364 + }, + { + "epoch": 0.8669292154627423, + "grad_norm": 0.7374464273452759, + "learning_rate": 1.8290993339345408e-06, + "loss": 0.2033, + "step": 8365 + }, + { + "epoch": 0.8670328531454037, + "grad_norm": 0.6944618225097656, + "learning_rate": 1.8262955359169155e-06, + "loss": 0.1908, + "step": 8366 + }, + { + "epoch": 0.8671364908280651, + "grad_norm": 0.8125670552253723, + "learning_rate": 1.8234937856840585e-06, + "loss": 0.2245, + "step": 8367 + }, + { + "epoch": 0.8672401285107265, + "grad_norm": 0.7239158153533936, + "learning_rate": 1.820694083551664e-06, + "loss": 0.1842, + "step": 8368 + }, + { + "epoch": 0.8673437661933879, + "grad_norm": 0.6269294023513794, + "learning_rate": 1.817896429835202e-06, + "loss": 0.213, + "step": 8369 + }, + { + "epoch": 0.8674474038760493, + "grad_norm": 0.6639854907989502, + "learning_rate": 1.815100824849907e-06, + "loss": 0.1927, + "step": 8370 + }, + { + "epoch": 0.8675510415587108, + "grad_norm": 0.6192017793655396, + "learning_rate": 1.812307268910778e-06, + "loss": 0.1647, + "step": 8371 + }, + { + "epoch": 0.8676546792413722, + "grad_norm": 0.6484594941139221, + "learning_rate": 1.8095157623325943e-06, + "loss": 0.1913, + "step": 8372 + }, + { + "epoch": 0.8677583169240336, + "grad_norm": 0.7138566970825195, + "learning_rate": 1.8067263054298955e-06, + "loss": 0.1758, + "step": 8373 + }, + { + "epoch": 0.867861954606695, + "grad_norm": 0.6029613018035889, + "learning_rate": 1.8039388985169947e-06, + "loss": 0.1631, + "step": 8374 + }, + { + "epoch": 0.8679655922893564, + "grad_norm": 0.651297390460968, + "learning_rate": 1.801153541907974e-06, + "loss": 0.1831, + "step": 8375 + }, + { + "epoch": 0.8680692299720179, + "grad_norm": 0.6598191261291504, + "learning_rate": 1.7983702359166844e-06, + "loss": 0.1793, + "step": 8376 + }, + { + "epoch": 0.8681728676546793, + "grad_norm": 0.6889581680297852, + "learning_rate": 1.7955889808567396e-06, + "loss": 0.1924, + "step": 8377 + }, + { + "epoch": 0.8682765053373407, + "grad_norm": 0.6279385089874268, + "learning_rate": 1.7928097770415264e-06, + "loss": 0.1672, + "step": 8378 + }, + { + "epoch": 0.8683801430200021, + "grad_norm": 0.5347639918327332, + "learning_rate": 1.7900326247842036e-06, + "loss": 0.1342, + "step": 8379 + }, + { + "epoch": 0.8684837807026635, + "grad_norm": 0.6175400018692017, + "learning_rate": 1.787257524397701e-06, + "loss": 0.1555, + "step": 8380 + }, + { + "epoch": 0.8685874183853249, + "grad_norm": 0.6695175170898438, + "learning_rate": 1.7844844761947033e-06, + "loss": 0.1776, + "step": 8381 + }, + { + "epoch": 0.8686910560679864, + "grad_norm": 0.7033392786979675, + "learning_rate": 1.7817134804876835e-06, + "loss": 0.2009, + "step": 8382 + }, + { + "epoch": 0.8687946937506478, + "grad_norm": 0.691206693649292, + "learning_rate": 1.778944537588867e-06, + "loss": 0.1606, + "step": 8383 + }, + { + "epoch": 0.8688983314333092, + "grad_norm": 0.7595584988594055, + "learning_rate": 1.7761776478102511e-06, + "loss": 0.2042, + "step": 8384 + }, + { + "epoch": 0.8690019691159706, + "grad_norm": 0.6974121928215027, + "learning_rate": 1.7734128114636106e-06, + "loss": 0.1898, + "step": 8385 + }, + { + "epoch": 0.869105606798632, + "grad_norm": 0.6632170677185059, + "learning_rate": 1.7706500288604788e-06, + "loss": 0.1832, + "step": 8386 + }, + { + "epoch": 0.8692092444812934, + "grad_norm": 0.6973819136619568, + "learning_rate": 1.7678893003121644e-06, + "loss": 0.1884, + "step": 8387 + }, + { + "epoch": 0.8693128821639549, + "grad_norm": 0.6757559776306152, + "learning_rate": 1.765130626129743e-06, + "loss": 0.191, + "step": 8388 + }, + { + "epoch": 0.8694165198466163, + "grad_norm": 0.5859283208847046, + "learning_rate": 1.7623740066240568e-06, + "loss": 0.1687, + "step": 8389 + }, + { + "epoch": 0.8695201575292777, + "grad_norm": 0.65776127576828, + "learning_rate": 1.7596194421057178e-06, + "loss": 0.1918, + "step": 8390 + }, + { + "epoch": 0.8696237952119391, + "grad_norm": 0.689621090888977, + "learning_rate": 1.7568669328850996e-06, + "loss": 0.1998, + "step": 8391 + }, + { + "epoch": 0.8697274328946005, + "grad_norm": 0.88386470079422, + "learning_rate": 1.7541164792723565e-06, + "loss": 0.2358, + "step": 8392 + }, + { + "epoch": 0.8698310705772619, + "grad_norm": 0.7360150218009949, + "learning_rate": 1.751368081577407e-06, + "loss": 0.214, + "step": 8393 + }, + { + "epoch": 0.8699347082599234, + "grad_norm": 0.7591801285743713, + "learning_rate": 1.7486217401099326e-06, + "loss": 0.1993, + "step": 8394 + }, + { + "epoch": 0.8700383459425848, + "grad_norm": 0.5601509809494019, + "learning_rate": 1.74587745517939e-06, + "loss": 0.1616, + "step": 8395 + }, + { + "epoch": 0.8701419836252461, + "grad_norm": 0.6487360596656799, + "learning_rate": 1.7431352270950008e-06, + "loss": 0.173, + "step": 8396 + }, + { + "epoch": 0.8702456213079075, + "grad_norm": 0.7577373385429382, + "learning_rate": 1.740395056165749e-06, + "loss": 0.1838, + "step": 8397 + }, + { + "epoch": 0.8703492589905689, + "grad_norm": 0.7360081672668457, + "learning_rate": 1.737656942700401e-06, + "loss": 0.2132, + "step": 8398 + }, + { + "epoch": 0.8704528966732303, + "grad_norm": 0.6201830506324768, + "learning_rate": 1.7349208870074763e-06, + "loss": 0.1699, + "step": 8399 + }, + { + "epoch": 0.8705565343558918, + "grad_norm": 0.6153380274772644, + "learning_rate": 1.7321868893952754e-06, + "loss": 0.1726, + "step": 8400 + }, + { + "epoch": 0.8706601720385532, + "grad_norm": 0.5483555793762207, + "learning_rate": 1.729454950171856e-06, + "loss": 0.1651, + "step": 8401 + }, + { + "epoch": 0.8707638097212146, + "grad_norm": 0.6861063241958618, + "learning_rate": 1.72672506964505e-06, + "loss": 0.1513, + "step": 8402 + }, + { + "epoch": 0.870867447403876, + "grad_norm": 0.6212663054466248, + "learning_rate": 1.72399724812246e-06, + "loss": 0.1682, + "step": 8403 + }, + { + "epoch": 0.8709710850865374, + "grad_norm": 0.612116813659668, + "learning_rate": 1.7212714859114442e-06, + "loss": 0.1855, + "step": 8404 + }, + { + "epoch": 0.8710747227691988, + "grad_norm": 0.6737285256385803, + "learning_rate": 1.7185477833191467e-06, + "loss": 0.1908, + "step": 8405 + }, + { + "epoch": 0.8711783604518603, + "grad_norm": 0.7784683108329773, + "learning_rate": 1.7158261406524635e-06, + "loss": 0.2532, + "step": 8406 + }, + { + "epoch": 0.8712819981345217, + "grad_norm": 0.7392608523368835, + "learning_rate": 1.7131065582180695e-06, + "loss": 0.1981, + "step": 8407 + }, + { + "epoch": 0.8713856358171831, + "grad_norm": 0.6000805497169495, + "learning_rate": 1.710389036322402e-06, + "loss": 0.1531, + "step": 8408 + }, + { + "epoch": 0.8714892734998445, + "grad_norm": 0.7079887986183167, + "learning_rate": 1.7076735752716622e-06, + "loss": 0.1919, + "step": 8409 + }, + { + "epoch": 0.8715929111825059, + "grad_norm": 0.6999341249465942, + "learning_rate": 1.7049601753718325e-06, + "loss": 0.1987, + "step": 8410 + }, + { + "epoch": 0.8716965488651673, + "grad_norm": 0.6581019163131714, + "learning_rate": 1.7022488369286462e-06, + "loss": 0.1993, + "step": 8411 + }, + { + "epoch": 0.8718001865478288, + "grad_norm": 0.7524918913841248, + "learning_rate": 1.699539560247616e-06, + "loss": 0.2153, + "step": 8412 + }, + { + "epoch": 0.8719038242304902, + "grad_norm": 0.6387086510658264, + "learning_rate": 1.696832345634023e-06, + "loss": 0.1677, + "step": 8413 + }, + { + "epoch": 0.8720074619131516, + "grad_norm": 0.8131616711616516, + "learning_rate": 1.6941271933929071e-06, + "loss": 0.199, + "step": 8414 + }, + { + "epoch": 0.872111099595813, + "grad_norm": 0.7414273619651794, + "learning_rate": 1.6914241038290846e-06, + "loss": 0.2105, + "step": 8415 + }, + { + "epoch": 0.8722147372784744, + "grad_norm": 0.6521408557891846, + "learning_rate": 1.6887230772471276e-06, + "loss": 0.1887, + "step": 8416 + }, + { + "epoch": 0.8723183749611358, + "grad_norm": 0.7166591286659241, + "learning_rate": 1.6860241139513899e-06, + "loss": 0.2254, + "step": 8417 + }, + { + "epoch": 0.8724220126437973, + "grad_norm": 0.6408938765525818, + "learning_rate": 1.6833272142459888e-06, + "loss": 0.1757, + "step": 8418 + }, + { + "epoch": 0.8725256503264587, + "grad_norm": 0.6115769743919373, + "learning_rate": 1.680632378434799e-06, + "loss": 0.1787, + "step": 8419 + }, + { + "epoch": 0.8726292880091201, + "grad_norm": 0.7922074794769287, + "learning_rate": 1.6779396068214792e-06, + "loss": 0.2271, + "step": 8420 + }, + { + "epoch": 0.8727329256917815, + "grad_norm": 0.8125144243240356, + "learning_rate": 1.6752488997094408e-06, + "loss": 0.228, + "step": 8421 + }, + { + "epoch": 0.8728365633744429, + "grad_norm": 0.6652587056159973, + "learning_rate": 1.6725602574018695e-06, + "loss": 0.1765, + "step": 8422 + }, + { + "epoch": 0.8729402010571043, + "grad_norm": 0.7473057508468628, + "learning_rate": 1.6698736802017191e-06, + "loss": 0.1948, + "step": 8423 + }, + { + "epoch": 0.8730438387397658, + "grad_norm": 0.6176879405975342, + "learning_rate": 1.6671891684117048e-06, + "loss": 0.1811, + "step": 8424 + }, + { + "epoch": 0.8731474764224272, + "grad_norm": 0.6850529909133911, + "learning_rate": 1.6645067223343181e-06, + "loss": 0.189, + "step": 8425 + }, + { + "epoch": 0.8732511141050886, + "grad_norm": 0.6224367618560791, + "learning_rate": 1.6618263422718084e-06, + "loss": 0.1704, + "step": 8426 + }, + { + "epoch": 0.87335475178775, + "grad_norm": 0.6955450177192688, + "learning_rate": 1.659148028526203e-06, + "loss": 0.1871, + "step": 8427 + }, + { + "epoch": 0.8734583894704114, + "grad_norm": 0.7510151863098145, + "learning_rate": 1.656471781399287e-06, + "loss": 0.2142, + "step": 8428 + }, + { + "epoch": 0.8735620271530729, + "grad_norm": 0.6863204836845398, + "learning_rate": 1.6537976011926105e-06, + "loss": 0.1914, + "step": 8429 + }, + { + "epoch": 0.8736656648357343, + "grad_norm": 0.7114805579185486, + "learning_rate": 1.6511254882075056e-06, + "loss": 0.1905, + "step": 8430 + }, + { + "epoch": 0.8737693025183957, + "grad_norm": 0.6145648956298828, + "learning_rate": 1.6484554427450516e-06, + "loss": 0.1829, + "step": 8431 + }, + { + "epoch": 0.8738729402010571, + "grad_norm": 0.6042888164520264, + "learning_rate": 1.6457874651061145e-06, + "loss": 0.19, + "step": 8432 + }, + { + "epoch": 0.8739765778837185, + "grad_norm": 0.7755526304244995, + "learning_rate": 1.6431215555913138e-06, + "loss": 0.2108, + "step": 8433 + }, + { + "epoch": 0.8740802155663799, + "grad_norm": 0.7109092473983765, + "learning_rate": 1.6404577145010358e-06, + "loss": 0.1888, + "step": 8434 + }, + { + "epoch": 0.8741838532490414, + "grad_norm": 0.7340528964996338, + "learning_rate": 1.637795942135445e-06, + "loss": 0.1662, + "step": 8435 + }, + { + "epoch": 0.8742874909317028, + "grad_norm": 0.7330647110939026, + "learning_rate": 1.635136238794459e-06, + "loss": 0.2035, + "step": 8436 + }, + { + "epoch": 0.8743911286143642, + "grad_norm": 0.6903119087219238, + "learning_rate": 1.6324786047777763e-06, + "loss": 0.1828, + "step": 8437 + }, + { + "epoch": 0.8744947662970256, + "grad_norm": 0.669479489326477, + "learning_rate": 1.6298230403848526e-06, + "loss": 0.2263, + "step": 8438 + }, + { + "epoch": 0.874598403979687, + "grad_norm": 0.683139979839325, + "learning_rate": 1.6271695459149106e-06, + "loss": 0.2082, + "step": 8439 + }, + { + "epoch": 0.8747020416623484, + "grad_norm": 0.6867363452911377, + "learning_rate": 1.6245181216669447e-06, + "loss": 0.2003, + "step": 8440 + }, + { + "epoch": 0.8748056793450099, + "grad_norm": 0.6809399724006653, + "learning_rate": 1.6218687679397072e-06, + "loss": 0.1851, + "step": 8441 + }, + { + "epoch": 0.8749093170276713, + "grad_norm": 0.6889615058898926, + "learning_rate": 1.6192214850317277e-06, + "loss": 0.1993, + "step": 8442 + }, + { + "epoch": 0.8750129547103327, + "grad_norm": 0.6928166151046753, + "learning_rate": 1.6165762732413037e-06, + "loss": 0.1885, + "step": 8443 + }, + { + "epoch": 0.8751165923929941, + "grad_norm": 0.520487904548645, + "learning_rate": 1.6139331328664809e-06, + "loss": 0.1578, + "step": 8444 + }, + { + "epoch": 0.8752202300756555, + "grad_norm": 0.6969999074935913, + "learning_rate": 1.6112920642050967e-06, + "loss": 0.1966, + "step": 8445 + }, + { + "epoch": 0.8753238677583169, + "grad_norm": 0.6650992631912231, + "learning_rate": 1.608653067554735e-06, + "loss": 0.1963, + "step": 8446 + }, + { + "epoch": 0.8754275054409784, + "grad_norm": 0.6840571761131287, + "learning_rate": 1.606016143212754e-06, + "loss": 0.1662, + "step": 8447 + }, + { + "epoch": 0.8755311431236398, + "grad_norm": 0.6494219303131104, + "learning_rate": 1.6033812914762826e-06, + "loss": 0.1947, + "step": 8448 + }, + { + "epoch": 0.8756347808063012, + "grad_norm": 0.7240989804267883, + "learning_rate": 1.6007485126422051e-06, + "loss": 0.2117, + "step": 8449 + }, + { + "epoch": 0.8757384184889626, + "grad_norm": 0.6862815022468567, + "learning_rate": 1.5981178070071868e-06, + "loss": 0.1972, + "step": 8450 + }, + { + "epoch": 0.875842056171624, + "grad_norm": 0.7563923597335815, + "learning_rate": 1.595489174867646e-06, + "loss": 0.2292, + "step": 8451 + }, + { + "epoch": 0.8759456938542854, + "grad_norm": 0.7190267443656921, + "learning_rate": 1.5928626165197768e-06, + "loss": 0.1881, + "step": 8452 + }, + { + "epoch": 0.8760493315369469, + "grad_norm": 0.6157662272453308, + "learning_rate": 1.590238132259534e-06, + "loss": 0.1751, + "step": 8453 + }, + { + "epoch": 0.8761529692196083, + "grad_norm": 0.6653298139572144, + "learning_rate": 1.587615722382634e-06, + "loss": 0.1936, + "step": 8454 + }, + { + "epoch": 0.8762566069022697, + "grad_norm": 0.6411898136138916, + "learning_rate": 1.584995387184578e-06, + "loss": 0.1855, + "step": 8455 + }, + { + "epoch": 0.8763602445849311, + "grad_norm": 0.6861348748207092, + "learning_rate": 1.5823771269606102e-06, + "loss": 0.1738, + "step": 8456 + }, + { + "epoch": 0.8764638822675925, + "grad_norm": 0.6964961290359497, + "learning_rate": 1.579760942005759e-06, + "loss": 0.2199, + "step": 8457 + }, + { + "epoch": 0.876567519950254, + "grad_norm": 0.6873329281806946, + "learning_rate": 1.577146832614811e-06, + "loss": 0.1788, + "step": 8458 + }, + { + "epoch": 0.8766711576329154, + "grad_norm": 0.6251301765441895, + "learning_rate": 1.5745347990823213e-06, + "loss": 0.1615, + "step": 8459 + }, + { + "epoch": 0.8767747953155768, + "grad_norm": 0.8414469957351685, + "learning_rate": 1.5719248417026057e-06, + "loss": 0.2304, + "step": 8460 + }, + { + "epoch": 0.8768784329982382, + "grad_norm": 0.6537505388259888, + "learning_rate": 1.5693169607697489e-06, + "loss": 0.1712, + "step": 8461 + }, + { + "epoch": 0.8769820706808996, + "grad_norm": 0.797130823135376, + "learning_rate": 1.5667111565776049e-06, + "loss": 0.2095, + "step": 8462 + }, + { + "epoch": 0.877085708363561, + "grad_norm": 0.718692421913147, + "learning_rate": 1.5641074294197988e-06, + "loss": 0.19, + "step": 8463 + }, + { + "epoch": 0.8771893460462225, + "grad_norm": 0.6466525793075562, + "learning_rate": 1.561505779589707e-06, + "loss": 0.1863, + "step": 8464 + }, + { + "epoch": 0.8772929837288839, + "grad_norm": 0.6893818974494934, + "learning_rate": 1.5589062073804796e-06, + "loss": 0.1938, + "step": 8465 + }, + { + "epoch": 0.8773966214115453, + "grad_norm": 0.7317748665809631, + "learning_rate": 1.5563087130850307e-06, + "loss": 0.2374, + "step": 8466 + }, + { + "epoch": 0.8775002590942067, + "grad_norm": 0.7286064624786377, + "learning_rate": 1.5537132969960466e-06, + "loss": 0.2192, + "step": 8467 + }, + { + "epoch": 0.8776038967768681, + "grad_norm": 0.655089259147644, + "learning_rate": 1.5511199594059733e-06, + "loss": 0.1532, + "step": 8468 + }, + { + "epoch": 0.8777075344595295, + "grad_norm": 0.7922578454017639, + "learning_rate": 1.5485287006070238e-06, + "loss": 0.2014, + "step": 8469 + }, + { + "epoch": 0.877811172142191, + "grad_norm": 0.6886960864067078, + "learning_rate": 1.5459395208911776e-06, + "loss": 0.1911, + "step": 8470 + }, + { + "epoch": 0.8779148098248524, + "grad_norm": 0.6597391963005066, + "learning_rate": 1.5433524205501793e-06, + "loss": 0.1979, + "step": 8471 + }, + { + "epoch": 0.8780184475075137, + "grad_norm": 0.7308657169342041, + "learning_rate": 1.5407673998755358e-06, + "loss": 0.1626, + "step": 8472 + }, + { + "epoch": 0.8781220851901751, + "grad_norm": 0.5964559316635132, + "learning_rate": 1.5381844591585294e-06, + "loss": 0.1551, + "step": 8473 + }, + { + "epoch": 0.8782257228728365, + "grad_norm": 0.5970776081085205, + "learning_rate": 1.5356035986901962e-06, + "loss": 0.1964, + "step": 8474 + }, + { + "epoch": 0.8783293605554979, + "grad_norm": 0.8105737566947937, + "learning_rate": 1.5330248187613484e-06, + "loss": 0.2208, + "step": 8475 + }, + { + "epoch": 0.8784329982381593, + "grad_norm": 0.6745266914367676, + "learning_rate": 1.5304481196625531e-06, + "loss": 0.1824, + "step": 8476 + }, + { + "epoch": 0.8785366359208208, + "grad_norm": 0.8474743962287903, + "learning_rate": 1.527873501684156e-06, + "loss": 0.2178, + "step": 8477 + }, + { + "epoch": 0.8786402736034822, + "grad_norm": 0.6158428192138672, + "learning_rate": 1.5253009651162564e-06, + "loss": 0.1684, + "step": 8478 + }, + { + "epoch": 0.8787439112861436, + "grad_norm": 0.6813104748725891, + "learning_rate": 1.5227305102487223e-06, + "loss": 0.1848, + "step": 8479 + }, + { + "epoch": 0.878847548968805, + "grad_norm": 0.5705843567848206, + "learning_rate": 1.520162137371195e-06, + "loss": 0.1832, + "step": 8480 + }, + { + "epoch": 0.8789511866514664, + "grad_norm": 0.6192941069602966, + "learning_rate": 1.5175958467730656e-06, + "loss": 0.1557, + "step": 8481 + }, + { + "epoch": 0.8790548243341278, + "grad_norm": 0.6419438719749451, + "learning_rate": 1.5150316387435049e-06, + "loss": 0.1714, + "step": 8482 + }, + { + "epoch": 0.8791584620167893, + "grad_norm": 0.5894890427589417, + "learning_rate": 1.5124695135714463e-06, + "loss": 0.1507, + "step": 8483 + }, + { + "epoch": 0.8792620996994507, + "grad_norm": 0.7742616534233093, + "learning_rate": 1.5099094715455852e-06, + "loss": 0.1856, + "step": 8484 + }, + { + "epoch": 0.8793657373821121, + "grad_norm": 0.6059996485710144, + "learning_rate": 1.5073515129543802e-06, + "loss": 0.1659, + "step": 8485 + }, + { + "epoch": 0.8794693750647735, + "grad_norm": 0.7219932079315186, + "learning_rate": 1.5047956380860584e-06, + "loss": 0.2266, + "step": 8486 + }, + { + "epoch": 0.8795730127474349, + "grad_norm": 0.6520408987998962, + "learning_rate": 1.5022418472286094e-06, + "loss": 0.1831, + "step": 8487 + }, + { + "epoch": 0.8796766504300964, + "grad_norm": 0.5570911765098572, + "learning_rate": 1.4996901406697983e-06, + "loss": 0.1414, + "step": 8488 + }, + { + "epoch": 0.8797802881127578, + "grad_norm": 0.6972008943557739, + "learning_rate": 1.49714051869714e-06, + "loss": 0.1907, + "step": 8489 + }, + { + "epoch": 0.8798839257954192, + "grad_norm": 0.6869020462036133, + "learning_rate": 1.4945929815979332e-06, + "loss": 0.1736, + "step": 8490 + }, + { + "epoch": 0.8799875634780806, + "grad_norm": 0.7016257643699646, + "learning_rate": 1.492047529659213e-06, + "loss": 0.1684, + "step": 8491 + }, + { + "epoch": 0.880091201160742, + "grad_norm": 0.7112381458282471, + "learning_rate": 1.4895041631678053e-06, + "loss": 0.1909, + "step": 8492 + }, + { + "epoch": 0.8801948388434034, + "grad_norm": 0.7013789415359497, + "learning_rate": 1.4869628824102989e-06, + "loss": 0.2006, + "step": 8493 + }, + { + "epoch": 0.8802984765260649, + "grad_norm": 0.6262712478637695, + "learning_rate": 1.4844236876730312e-06, + "loss": 0.1677, + "step": 8494 + }, + { + "epoch": 0.8804021142087263, + "grad_norm": 0.6590498685836792, + "learning_rate": 1.4818865792421221e-06, + "loss": 0.1858, + "step": 8495 + }, + { + "epoch": 0.8805057518913877, + "grad_norm": 0.7231387495994568, + "learning_rate": 1.4793515574034478e-06, + "loss": 0.2059, + "step": 8496 + }, + { + "epoch": 0.8806093895740491, + "grad_norm": 0.6526983380317688, + "learning_rate": 1.4768186224426463e-06, + "loss": 0.1733, + "step": 8497 + }, + { + "epoch": 0.8807130272567105, + "grad_norm": 0.7078770399093628, + "learning_rate": 1.474287774645129e-06, + "loss": 0.1852, + "step": 8498 + }, + { + "epoch": 0.8808166649393719, + "grad_norm": 0.7307820320129395, + "learning_rate": 1.4717590142960637e-06, + "loss": 0.2009, + "step": 8499 + }, + { + "epoch": 0.8809203026220334, + "grad_norm": 0.7521584630012512, + "learning_rate": 1.4692323416803934e-06, + "loss": 0.1977, + "step": 8500 + }, + { + "epoch": 0.8810239403046948, + "grad_norm": 0.6136340498924255, + "learning_rate": 1.4667077570828125e-06, + "loss": 0.165, + "step": 8501 + }, + { + "epoch": 0.8811275779873562, + "grad_norm": 0.6525123119354248, + "learning_rate": 1.464185260787796e-06, + "loss": 0.2126, + "step": 8502 + }, + { + "epoch": 0.8812312156700176, + "grad_norm": 0.666297435760498, + "learning_rate": 1.4616648530795673e-06, + "loss": 0.1823, + "step": 8503 + }, + { + "epoch": 0.881334853352679, + "grad_norm": 0.68965744972229, + "learning_rate": 1.4591465342421218e-06, + "loss": 0.1814, + "step": 8504 + }, + { + "epoch": 0.8814384910353404, + "grad_norm": 0.7001585960388184, + "learning_rate": 1.4566303045592279e-06, + "loss": 0.1772, + "step": 8505 + }, + { + "epoch": 0.8815421287180019, + "grad_norm": 0.7614684104919434, + "learning_rate": 1.4541161643144008e-06, + "loss": 0.2056, + "step": 8506 + }, + { + "epoch": 0.8816457664006633, + "grad_norm": 0.6306325793266296, + "learning_rate": 1.451604113790932e-06, + "loss": 0.1705, + "step": 8507 + }, + { + "epoch": 0.8817494040833247, + "grad_norm": 0.7202686071395874, + "learning_rate": 1.449094153271884e-06, + "loss": 0.2256, + "step": 8508 + }, + { + "epoch": 0.8818530417659861, + "grad_norm": 0.6471310257911682, + "learning_rate": 1.4465862830400678e-06, + "loss": 0.1681, + "step": 8509 + }, + { + "epoch": 0.8819566794486475, + "grad_norm": 0.7523260712623596, + "learning_rate": 1.444080503378067e-06, + "loss": 0.199, + "step": 8510 + }, + { + "epoch": 0.882060317131309, + "grad_norm": 0.7514868378639221, + "learning_rate": 1.4415768145682264e-06, + "loss": 0.1917, + "step": 8511 + }, + { + "epoch": 0.8821639548139704, + "grad_norm": 0.5885258316993713, + "learning_rate": 1.4390752168926603e-06, + "loss": 0.1863, + "step": 8512 + }, + { + "epoch": 0.8822675924966318, + "grad_norm": 0.6411988139152527, + "learning_rate": 1.4365757106332479e-06, + "loss": 0.1728, + "step": 8513 + }, + { + "epoch": 0.8823712301792932, + "grad_norm": 0.6003291606903076, + "learning_rate": 1.4340782960716238e-06, + "loss": 0.159, + "step": 8514 + }, + { + "epoch": 0.8824748678619546, + "grad_norm": 0.647544801235199, + "learning_rate": 1.4315829734892006e-06, + "loss": 0.1769, + "step": 8515 + }, + { + "epoch": 0.882578505544616, + "grad_norm": 0.6379469633102417, + "learning_rate": 1.4290897431671424e-06, + "loss": 0.1869, + "step": 8516 + }, + { + "epoch": 0.8826821432272774, + "grad_norm": 0.8017239570617676, + "learning_rate": 1.4265986053863802e-06, + "loss": 0.2074, + "step": 8517 + }, + { + "epoch": 0.8827857809099389, + "grad_norm": 0.6173701882362366, + "learning_rate": 1.4241095604276157e-06, + "loss": 0.1819, + "step": 8518 + }, + { + "epoch": 0.8828894185926003, + "grad_norm": 0.7021307349205017, + "learning_rate": 1.421622608571307e-06, + "loss": 0.1928, + "step": 8519 + }, + { + "epoch": 0.8829930562752617, + "grad_norm": 0.7378525733947754, + "learning_rate": 1.4191377500976856e-06, + "loss": 0.2155, + "step": 8520 + }, + { + "epoch": 0.8830966939579231, + "grad_norm": 0.5751429796218872, + "learning_rate": 1.416654985286734e-06, + "loss": 0.1536, + "step": 8521 + }, + { + "epoch": 0.8832003316405845, + "grad_norm": 0.6245465874671936, + "learning_rate": 1.4141743144182153e-06, + "loss": 0.1699, + "step": 8522 + }, + { + "epoch": 0.883303969323246, + "grad_norm": 0.6487674713134766, + "learning_rate": 1.4116957377716412e-06, + "loss": 0.1718, + "step": 8523 + }, + { + "epoch": 0.8834076070059074, + "grad_norm": 0.5731277465820312, + "learning_rate": 1.4092192556262907e-06, + "loss": 0.1552, + "step": 8524 + }, + { + "epoch": 0.8835112446885688, + "grad_norm": 0.7612659335136414, + "learning_rate": 1.4067448682612207e-06, + "loss": 0.2067, + "step": 8525 + }, + { + "epoch": 0.8836148823712302, + "grad_norm": 0.5277737975120544, + "learning_rate": 1.40427257595523e-06, + "loss": 0.1608, + "step": 8526 + }, + { + "epoch": 0.8837185200538916, + "grad_norm": 0.6964701414108276, + "learning_rate": 1.4018023789869005e-06, + "loss": 0.1774, + "step": 8527 + }, + { + "epoch": 0.883822157736553, + "grad_norm": 0.6501127481460571, + "learning_rate": 1.3993342776345698e-06, + "loss": 0.1771, + "step": 8528 + }, + { + "epoch": 0.8839257954192145, + "grad_norm": 0.6661208868026733, + "learning_rate": 1.3968682721763328e-06, + "loss": 0.1711, + "step": 8529 + }, + { + "epoch": 0.8840294331018759, + "grad_norm": 0.6030734181404114, + "learning_rate": 1.3944043628900627e-06, + "loss": 0.1518, + "step": 8530 + }, + { + "epoch": 0.8841330707845373, + "grad_norm": 0.726662278175354, + "learning_rate": 1.3919425500533823e-06, + "loss": 0.189, + "step": 8531 + }, + { + "epoch": 0.8842367084671987, + "grad_norm": 0.7446136474609375, + "learning_rate": 1.3894828339436894e-06, + "loss": 0.1774, + "step": 8532 + }, + { + "epoch": 0.8843403461498601, + "grad_norm": 0.6524977684020996, + "learning_rate": 1.3870252148381446e-06, + "loss": 0.188, + "step": 8533 + }, + { + "epoch": 0.8844439838325215, + "grad_norm": 0.7023906707763672, + "learning_rate": 1.3845696930136621e-06, + "loss": 0.1999, + "step": 8534 + }, + { + "epoch": 0.884547621515183, + "grad_norm": 0.5903967618942261, + "learning_rate": 1.3821162687469291e-06, + "loss": 0.1639, + "step": 8535 + }, + { + "epoch": 0.8846512591978444, + "grad_norm": 0.6685062646865845, + "learning_rate": 1.3796649423143915e-06, + "loss": 0.2118, + "step": 8536 + }, + { + "epoch": 0.8847548968805058, + "grad_norm": 0.7017149329185486, + "learning_rate": 1.3772157139922593e-06, + "loss": 0.2078, + "step": 8537 + }, + { + "epoch": 0.8848585345631672, + "grad_norm": 0.6549966335296631, + "learning_rate": 1.374768584056516e-06, + "loss": 0.1869, + "step": 8538 + }, + { + "epoch": 0.8849621722458286, + "grad_norm": 0.6312375068664551, + "learning_rate": 1.3723235527828904e-06, + "loss": 0.1654, + "step": 8539 + }, + { + "epoch": 0.88506580992849, + "grad_norm": 0.5591091513633728, + "learning_rate": 1.369880620446895e-06, + "loss": 0.1516, + "step": 8540 + }, + { + "epoch": 0.8851694476111515, + "grad_norm": 0.6782501935958862, + "learning_rate": 1.3674397873237877e-06, + "loss": 0.1706, + "step": 8541 + }, + { + "epoch": 0.8852730852938129, + "grad_norm": 0.7043976187705994, + "learning_rate": 1.3650010536885994e-06, + "loss": 0.1916, + "step": 8542 + }, + { + "epoch": 0.8853767229764743, + "grad_norm": 0.650366485118866, + "learning_rate": 1.3625644198161259e-06, + "loss": 0.1798, + "step": 8543 + }, + { + "epoch": 0.8854803606591357, + "grad_norm": 0.6807155013084412, + "learning_rate": 1.3601298859809165e-06, + "loss": 0.2027, + "step": 8544 + }, + { + "epoch": 0.8855839983417971, + "grad_norm": 0.7555018067359924, + "learning_rate": 1.3576974524573006e-06, + "loss": 0.211, + "step": 8545 + }, + { + "epoch": 0.8856876360244585, + "grad_norm": 0.6486994028091431, + "learning_rate": 1.3552671195193523e-06, + "loss": 0.1975, + "step": 8546 + }, + { + "epoch": 0.88579127370712, + "grad_norm": 0.7745081782341003, + "learning_rate": 1.3528388874409238e-06, + "loss": 0.2216, + "step": 8547 + }, + { + "epoch": 0.8858949113897813, + "grad_norm": 0.7181711792945862, + "learning_rate": 1.3504127564956205e-06, + "loss": 0.1998, + "step": 8548 + }, + { + "epoch": 0.8859985490724427, + "grad_norm": 0.7709293961524963, + "learning_rate": 1.3479887269568148e-06, + "loss": 0.2232, + "step": 8549 + }, + { + "epoch": 0.8861021867551041, + "grad_norm": 0.8563987016677856, + "learning_rate": 1.3455667990976483e-06, + "loss": 0.222, + "step": 8550 + }, + { + "epoch": 0.8862058244377655, + "grad_norm": 0.7017079591751099, + "learning_rate": 1.3431469731910118e-06, + "loss": 0.1925, + "step": 8551 + }, + { + "epoch": 0.8863094621204269, + "grad_norm": 0.6531928777694702, + "learning_rate": 1.3407292495095715e-06, + "loss": 0.1829, + "step": 8552 + }, + { + "epoch": 0.8864130998030884, + "grad_norm": 0.6812769174575806, + "learning_rate": 1.3383136283257581e-06, + "loss": 0.1918, + "step": 8553 + }, + { + "epoch": 0.8865167374857498, + "grad_norm": 0.560120701789856, + "learning_rate": 1.3359001099117518e-06, + "loss": 0.1385, + "step": 8554 + }, + { + "epoch": 0.8866203751684112, + "grad_norm": 0.6926707625389099, + "learning_rate": 1.3334886945395086e-06, + "loss": 0.2024, + "step": 8555 + }, + { + "epoch": 0.8867240128510726, + "grad_norm": 0.6750642657279968, + "learning_rate": 1.3310793824807378e-06, + "loss": 0.1806, + "step": 8556 + }, + { + "epoch": 0.886827650533734, + "grad_norm": 0.6305122971534729, + "learning_rate": 1.328672174006922e-06, + "loss": 0.1703, + "step": 8557 + }, + { + "epoch": 0.8869312882163954, + "grad_norm": 0.5967231392860413, + "learning_rate": 1.326267069389302e-06, + "loss": 0.1607, + "step": 8558 + }, + { + "epoch": 0.8870349258990569, + "grad_norm": 0.7030181288719177, + "learning_rate": 1.3238640688988814e-06, + "loss": 0.2134, + "step": 8559 + }, + { + "epoch": 0.8871385635817183, + "grad_norm": 0.7760520577430725, + "learning_rate": 1.321463172806423e-06, + "loss": 0.2166, + "step": 8560 + }, + { + "epoch": 0.8872422012643797, + "grad_norm": 0.7813576459884644, + "learning_rate": 1.3190643813824577e-06, + "loss": 0.1998, + "step": 8561 + }, + { + "epoch": 0.8873458389470411, + "grad_norm": 0.7017430067062378, + "learning_rate": 1.3166676948972757e-06, + "loss": 0.1894, + "step": 8562 + }, + { + "epoch": 0.8874494766297025, + "grad_norm": 0.720665454864502, + "learning_rate": 1.3142731136209364e-06, + "loss": 0.1956, + "step": 8563 + }, + { + "epoch": 0.8875531143123639, + "grad_norm": 0.7103095054626465, + "learning_rate": 1.311880637823255e-06, + "loss": 0.1927, + "step": 8564 + }, + { + "epoch": 0.8876567519950254, + "grad_norm": 0.7657372355461121, + "learning_rate": 1.309490267773812e-06, + "loss": 0.1948, + "step": 8565 + }, + { + "epoch": 0.8877603896776868, + "grad_norm": 0.6733664274215698, + "learning_rate": 1.3071020037419535e-06, + "loss": 0.1827, + "step": 8566 + }, + { + "epoch": 0.8878640273603482, + "grad_norm": 0.7801433205604553, + "learning_rate": 1.304715845996778e-06, + "loss": 0.2261, + "step": 8567 + }, + { + "epoch": 0.8879676650430096, + "grad_norm": 0.6368528604507446, + "learning_rate": 1.3023317948071611e-06, + "loss": 0.1702, + "step": 8568 + }, + { + "epoch": 0.888071302725671, + "grad_norm": 0.5935059785842896, + "learning_rate": 1.2999498504417286e-06, + "loss": 0.158, + "step": 8569 + }, + { + "epoch": 0.8881749404083324, + "grad_norm": 0.6271074414253235, + "learning_rate": 1.2975700131688806e-06, + "loss": 0.1565, + "step": 8570 + }, + { + "epoch": 0.8882785780909939, + "grad_norm": 0.7890324592590332, + "learning_rate": 1.2951922832567676e-06, + "loss": 0.2071, + "step": 8571 + }, + { + "epoch": 0.8883822157736553, + "grad_norm": 0.7705354690551758, + "learning_rate": 1.292816660973315e-06, + "loss": 0.2464, + "step": 8572 + }, + { + "epoch": 0.8884858534563167, + "grad_norm": 0.6207772493362427, + "learning_rate": 1.2904431465861978e-06, + "loss": 0.1653, + "step": 8573 + }, + { + "epoch": 0.8885894911389781, + "grad_norm": 0.7003123760223389, + "learning_rate": 1.2880717403628596e-06, + "loss": 0.1639, + "step": 8574 + }, + { + "epoch": 0.8886931288216395, + "grad_norm": 0.5598231554031372, + "learning_rate": 1.2857024425705133e-06, + "loss": 0.1635, + "step": 8575 + }, + { + "epoch": 0.888796766504301, + "grad_norm": 0.5815957188606262, + "learning_rate": 1.2833352534761212e-06, + "loss": 0.168, + "step": 8576 + }, + { + "epoch": 0.8889004041869624, + "grad_norm": 0.6298869848251343, + "learning_rate": 1.2809701733464164e-06, + "loss": 0.1857, + "step": 8577 + }, + { + "epoch": 0.8890040418696238, + "grad_norm": 0.6543485522270203, + "learning_rate": 1.2786072024478945e-06, + "loss": 0.1884, + "step": 8578 + }, + { + "epoch": 0.8891076795522852, + "grad_norm": 0.5738658308982849, + "learning_rate": 1.2762463410468117e-06, + "loss": 0.163, + "step": 8579 + }, + { + "epoch": 0.8892113172349466, + "grad_norm": 0.7424209117889404, + "learning_rate": 1.2738875894091817e-06, + "loss": 0.2165, + "step": 8580 + }, + { + "epoch": 0.889314954917608, + "grad_norm": 0.6845530867576599, + "learning_rate": 1.271530947800792e-06, + "loss": 0.1887, + "step": 8581 + }, + { + "epoch": 0.8894185926002695, + "grad_norm": 0.7086684703826904, + "learning_rate": 1.2691764164871768e-06, + "loss": 0.2111, + "step": 8582 + }, + { + "epoch": 0.8895222302829309, + "grad_norm": 0.6318870186805725, + "learning_rate": 1.266823995733648e-06, + "loss": 0.1869, + "step": 8583 + }, + { + "epoch": 0.8896258679655923, + "grad_norm": 0.7254533171653748, + "learning_rate": 1.2644736858052675e-06, + "loss": 0.208, + "step": 8584 + }, + { + "epoch": 0.8897295056482537, + "grad_norm": 0.7029841542243958, + "learning_rate": 1.2621254869668698e-06, + "loss": 0.1879, + "step": 8585 + }, + { + "epoch": 0.8898331433309151, + "grad_norm": 0.7651259303092957, + "learning_rate": 1.259779399483043e-06, + "loss": 0.1963, + "step": 8586 + }, + { + "epoch": 0.8899367810135765, + "grad_norm": 0.6780407428741455, + "learning_rate": 1.2574354236181408e-06, + "loss": 0.2059, + "step": 8587 + }, + { + "epoch": 0.890040418696238, + "grad_norm": 0.6573833227157593, + "learning_rate": 1.25509355963628e-06, + "loss": 0.2028, + "step": 8588 + }, + { + "epoch": 0.8901440563788994, + "grad_norm": 0.5772253274917603, + "learning_rate": 1.2527538078013346e-06, + "loss": 0.1696, + "step": 8589 + }, + { + "epoch": 0.8902476940615608, + "grad_norm": 0.6660610437393188, + "learning_rate": 1.2504161683769512e-06, + "loss": 0.1676, + "step": 8590 + }, + { + "epoch": 0.8903513317442222, + "grad_norm": 0.708368718624115, + "learning_rate": 1.2480806416265256e-06, + "loss": 0.197, + "step": 8591 + }, + { + "epoch": 0.8904549694268836, + "grad_norm": 0.6154781579971313, + "learning_rate": 1.245747227813221e-06, + "loss": 0.1903, + "step": 8592 + }, + { + "epoch": 0.890558607109545, + "grad_norm": 0.778201699256897, + "learning_rate": 1.2434159271999668e-06, + "loss": 0.212, + "step": 8593 + }, + { + "epoch": 0.8906622447922065, + "grad_norm": 0.7690391540527344, + "learning_rate": 1.2410867400494464e-06, + "loss": 0.205, + "step": 8594 + }, + { + "epoch": 0.8907658824748679, + "grad_norm": 0.7358372211456299, + "learning_rate": 1.2387596666241097e-06, + "loss": 0.199, + "step": 8595 + }, + { + "epoch": 0.8908695201575293, + "grad_norm": 0.6339231133460999, + "learning_rate": 1.2364347071861716e-06, + "loss": 0.1822, + "step": 8596 + }, + { + "epoch": 0.8909731578401907, + "grad_norm": 0.624960720539093, + "learning_rate": 1.2341118619976023e-06, + "loss": 0.1765, + "step": 8597 + }, + { + "epoch": 0.8910767955228521, + "grad_norm": 0.6983635425567627, + "learning_rate": 1.2317911313201369e-06, + "loss": 0.1782, + "step": 8598 + }, + { + "epoch": 0.8911804332055135, + "grad_norm": 0.607162594795227, + "learning_rate": 1.2294725154152687e-06, + "loss": 0.1706, + "step": 8599 + }, + { + "epoch": 0.891284070888175, + "grad_norm": 0.6569267511367798, + "learning_rate": 1.2271560145442574e-06, + "loss": 0.1998, + "step": 8600 + }, + { + "epoch": 0.8913877085708364, + "grad_norm": 0.5935954451560974, + "learning_rate": 1.2248416289681253e-06, + "loss": 0.1625, + "step": 8601 + }, + { + "epoch": 0.8914913462534978, + "grad_norm": 0.5493476390838623, + "learning_rate": 1.2225293589476506e-06, + "loss": 0.1439, + "step": 8602 + }, + { + "epoch": 0.8915949839361592, + "grad_norm": 0.6254029273986816, + "learning_rate": 1.2202192047433802e-06, + "loss": 0.172, + "step": 8603 + }, + { + "epoch": 0.8916986216188206, + "grad_norm": 0.5982598066329956, + "learning_rate": 1.2179111666156152e-06, + "loss": 0.1627, + "step": 8604 + }, + { + "epoch": 0.891802259301482, + "grad_norm": 0.6403316259384155, + "learning_rate": 1.2156052448244204e-06, + "loss": 0.1734, + "step": 8605 + }, + { + "epoch": 0.8919058969841435, + "grad_norm": 0.6913343071937561, + "learning_rate": 1.2133014396296283e-06, + "loss": 0.1981, + "step": 8606 + }, + { + "epoch": 0.8920095346668049, + "grad_norm": 0.7176704406738281, + "learning_rate": 1.2109997512908245e-06, + "loss": 0.2002, + "step": 8607 + }, + { + "epoch": 0.8921131723494663, + "grad_norm": 0.703881561756134, + "learning_rate": 1.2087001800673615e-06, + "loss": 0.1836, + "step": 8608 + }, + { + "epoch": 0.8922168100321277, + "grad_norm": 0.738377034664154, + "learning_rate": 1.2064027262183475e-06, + "loss": 0.2089, + "step": 8609 + }, + { + "epoch": 0.8923204477147891, + "grad_norm": 0.6741861701011658, + "learning_rate": 1.2041073900026624e-06, + "loss": 0.1878, + "step": 8610 + }, + { + "epoch": 0.8924240853974506, + "grad_norm": 0.9033978581428528, + "learning_rate": 1.201814171678939e-06, + "loss": 0.2101, + "step": 8611 + }, + { + "epoch": 0.892527723080112, + "grad_norm": 0.7610499262809753, + "learning_rate": 1.1995230715055684e-06, + "loss": 0.2188, + "step": 8612 + }, + { + "epoch": 0.8926313607627734, + "grad_norm": 0.6552374362945557, + "learning_rate": 1.1972340897407153e-06, + "loss": 0.1904, + "step": 8613 + }, + { + "epoch": 0.8927349984454348, + "grad_norm": 0.6682619452476501, + "learning_rate": 1.1949472266422913e-06, + "loss": 0.1852, + "step": 8614 + }, + { + "epoch": 0.8928386361280962, + "grad_norm": 0.6607064604759216, + "learning_rate": 1.1926624824679833e-06, + "loss": 0.1904, + "step": 8615 + }, + { + "epoch": 0.8929422738107576, + "grad_norm": 0.7705369591712952, + "learning_rate": 1.1903798574752346e-06, + "loss": 0.1992, + "step": 8616 + }, + { + "epoch": 0.893045911493419, + "grad_norm": 0.6280906796455383, + "learning_rate": 1.188099351921237e-06, + "loss": 0.1838, + "step": 8617 + }, + { + "epoch": 0.8931495491760805, + "grad_norm": 0.6903097033500671, + "learning_rate": 1.185820966062965e-06, + "loss": 0.1842, + "step": 8618 + }, + { + "epoch": 0.8932531868587419, + "grad_norm": 0.8103142976760864, + "learning_rate": 1.1835447001571353e-06, + "loss": 0.2249, + "step": 8619 + }, + { + "epoch": 0.8933568245414033, + "grad_norm": 0.7311362624168396, + "learning_rate": 1.1812705544602387e-06, + "loss": 0.1945, + "step": 8620 + }, + { + "epoch": 0.8934604622240647, + "grad_norm": 0.6058357954025269, + "learning_rate": 1.1789985292285233e-06, + "loss": 0.153, + "step": 8621 + }, + { + "epoch": 0.8935640999067261, + "grad_norm": 0.5987060070037842, + "learning_rate": 1.1767286247179976e-06, + "loss": 0.1663, + "step": 8622 + }, + { + "epoch": 0.8936677375893876, + "grad_norm": 0.6946430206298828, + "learning_rate": 1.1744608411844282e-06, + "loss": 0.1893, + "step": 8623 + }, + { + "epoch": 0.8937713752720489, + "grad_norm": 0.7491544485092163, + "learning_rate": 1.172195178883344e-06, + "loss": 0.2054, + "step": 8624 + }, + { + "epoch": 0.8938750129547103, + "grad_norm": 0.6955428123474121, + "learning_rate": 1.1699316380700388e-06, + "loss": 0.1676, + "step": 8625 + }, + { + "epoch": 0.8939786506373717, + "grad_norm": 0.6969360709190369, + "learning_rate": 1.1676702189995681e-06, + "loss": 0.1758, + "step": 8626 + }, + { + "epoch": 0.8940822883200331, + "grad_norm": 0.6750574111938477, + "learning_rate": 1.1654109219267373e-06, + "loss": 0.1787, + "step": 8627 + }, + { + "epoch": 0.8941859260026945, + "grad_norm": 0.6682292819023132, + "learning_rate": 1.163153747106127e-06, + "loss": 0.1799, + "step": 8628 + }, + { + "epoch": 0.894289563685356, + "grad_norm": 0.7346889972686768, + "learning_rate": 1.160898694792072e-06, + "loss": 0.2096, + "step": 8629 + }, + { + "epoch": 0.8943932013680174, + "grad_norm": 0.8192048072814941, + "learning_rate": 1.158645765238664e-06, + "loss": 0.213, + "step": 8630 + }, + { + "epoch": 0.8944968390506788, + "grad_norm": 0.669141411781311, + "learning_rate": 1.1563949586997625e-06, + "loss": 0.1865, + "step": 8631 + }, + { + "epoch": 0.8946004767333402, + "grad_norm": 0.7453798055648804, + "learning_rate": 1.1541462754289823e-06, + "loss": 0.1989, + "step": 8632 + }, + { + "epoch": 0.8947041144160016, + "grad_norm": 0.6614513397216797, + "learning_rate": 1.1518997156797053e-06, + "loss": 0.1701, + "step": 8633 + }, + { + "epoch": 0.894807752098663, + "grad_norm": 0.6962074637413025, + "learning_rate": 1.1496552797050665e-06, + "loss": 0.1856, + "step": 8634 + }, + { + "epoch": 0.8949113897813245, + "grad_norm": 0.7701820135116577, + "learning_rate": 1.1474129677579703e-06, + "loss": 0.2305, + "step": 8635 + }, + { + "epoch": 0.8950150274639859, + "grad_norm": 0.601924479007721, + "learning_rate": 1.1451727800910728e-06, + "loss": 0.18, + "step": 8636 + }, + { + "epoch": 0.8951186651466473, + "grad_norm": 0.7435898184776306, + "learning_rate": 1.1429347169567938e-06, + "loss": 0.1789, + "step": 8637 + }, + { + "epoch": 0.8952223028293087, + "grad_norm": 0.7584136724472046, + "learning_rate": 1.1406987786073209e-06, + "loss": 0.2202, + "step": 8638 + }, + { + "epoch": 0.8953259405119701, + "grad_norm": 0.6906249523162842, + "learning_rate": 1.1384649652945877e-06, + "loss": 0.1893, + "step": 8639 + }, + { + "epoch": 0.8954295781946315, + "grad_norm": 0.6184383630752563, + "learning_rate": 1.1362332772703021e-06, + "loss": 0.1861, + "step": 8640 + }, + { + "epoch": 0.895533215877293, + "grad_norm": 0.6365854740142822, + "learning_rate": 1.1340037147859295e-06, + "loss": 0.1636, + "step": 8641 + }, + { + "epoch": 0.8956368535599544, + "grad_norm": 0.7137528657913208, + "learning_rate": 1.131776278092691e-06, + "loss": 0.1808, + "step": 8642 + }, + { + "epoch": 0.8957404912426158, + "grad_norm": 0.7200307250022888, + "learning_rate": 1.1295509674415683e-06, + "loss": 0.2088, + "step": 8643 + }, + { + "epoch": 0.8958441289252772, + "grad_norm": 0.7620137333869934, + "learning_rate": 1.1273277830833075e-06, + "loss": 0.2023, + "step": 8644 + }, + { + "epoch": 0.8959477666079386, + "grad_norm": 0.5705604553222656, + "learning_rate": 1.1251067252684122e-06, + "loss": 0.1473, + "step": 8645 + }, + { + "epoch": 0.8960514042906, + "grad_norm": 0.8270102739334106, + "learning_rate": 1.122887794247154e-06, + "loss": 0.2148, + "step": 8646 + }, + { + "epoch": 0.8961550419732615, + "grad_norm": 0.7535790205001831, + "learning_rate": 1.1206709902695502e-06, + "loss": 0.1974, + "step": 8647 + }, + { + "epoch": 0.8962586796559229, + "grad_norm": 0.6767138838768005, + "learning_rate": 1.1184563135853965e-06, + "loss": 0.1893, + "step": 8648 + }, + { + "epoch": 0.8963623173385843, + "grad_norm": 0.860120952129364, + "learning_rate": 1.1162437644442291e-06, + "loss": 0.2046, + "step": 8649 + }, + { + "epoch": 0.8964659550212457, + "grad_norm": 0.5672703385353088, + "learning_rate": 1.1140333430953577e-06, + "loss": 0.1682, + "step": 8650 + }, + { + "epoch": 0.8965695927039071, + "grad_norm": 0.7149715423583984, + "learning_rate": 1.1118250497878535e-06, + "loss": 0.1976, + "step": 8651 + }, + { + "epoch": 0.8966732303865685, + "grad_norm": 0.7060601115226746, + "learning_rate": 1.1096188847705357e-06, + "loss": 0.2031, + "step": 8652 + }, + { + "epoch": 0.89677686806923, + "grad_norm": 0.7362112402915955, + "learning_rate": 1.1074148482920011e-06, + "loss": 0.2101, + "step": 8653 + }, + { + "epoch": 0.8968805057518914, + "grad_norm": 0.9833632707595825, + "learning_rate": 1.105212940600593e-06, + "loss": 0.1849, + "step": 8654 + }, + { + "epoch": 0.8969841434345528, + "grad_norm": 0.7174100279808044, + "learning_rate": 1.1030131619444128e-06, + "loss": 0.1928, + "step": 8655 + }, + { + "epoch": 0.8970877811172142, + "grad_norm": 0.559037446975708, + "learning_rate": 1.1008155125713382e-06, + "loss": 0.165, + "step": 8656 + }, + { + "epoch": 0.8971914187998756, + "grad_norm": 0.7384814023971558, + "learning_rate": 1.098619992728991e-06, + "loss": 0.2041, + "step": 8657 + }, + { + "epoch": 0.897295056482537, + "grad_norm": 0.7134201526641846, + "learning_rate": 1.0964266026647596e-06, + "loss": 0.2359, + "step": 8658 + }, + { + "epoch": 0.8973986941651985, + "grad_norm": 0.8288852572441101, + "learning_rate": 1.0942353426257934e-06, + "loss": 0.2553, + "step": 8659 + }, + { + "epoch": 0.8975023318478599, + "grad_norm": 0.6036843657493591, + "learning_rate": 1.0920462128589992e-06, + "loss": 0.168, + "step": 8660 + }, + { + "epoch": 0.8976059695305213, + "grad_norm": 0.7506285309791565, + "learning_rate": 1.0898592136110465e-06, + "loss": 0.1689, + "step": 8661 + }, + { + "epoch": 0.8977096072131827, + "grad_norm": 0.6949458122253418, + "learning_rate": 1.0876743451283555e-06, + "loss": 0.2195, + "step": 8662 + }, + { + "epoch": 0.8978132448958441, + "grad_norm": 0.5990825295448303, + "learning_rate": 1.0854916076571254e-06, + "loss": 0.1691, + "step": 8663 + }, + { + "epoch": 0.8979168825785055, + "grad_norm": 0.7362884879112244, + "learning_rate": 1.0833110014432945e-06, + "loss": 0.1726, + "step": 8664 + }, + { + "epoch": 0.898020520261167, + "grad_norm": 0.5977020263671875, + "learning_rate": 1.0811325267325712e-06, + "loss": 0.1707, + "step": 8665 + }, + { + "epoch": 0.8981241579438284, + "grad_norm": 0.6358360052108765, + "learning_rate": 1.07895618377043e-06, + "loss": 0.1798, + "step": 8666 + }, + { + "epoch": 0.8982277956264898, + "grad_norm": 0.7800115346908569, + "learning_rate": 1.0767819728020924e-06, + "loss": 0.2185, + "step": 8667 + }, + { + "epoch": 0.8983314333091512, + "grad_norm": 0.6631283760070801, + "learning_rate": 1.0746098940725447e-06, + "loss": 0.1629, + "step": 8668 + }, + { + "epoch": 0.8984350709918126, + "grad_norm": 0.6898950934410095, + "learning_rate": 1.0724399478265312e-06, + "loss": 0.1972, + "step": 8669 + }, + { + "epoch": 0.898538708674474, + "grad_norm": 0.7569713592529297, + "learning_rate": 1.0702721343085608e-06, + "loss": 0.2137, + "step": 8670 + }, + { + "epoch": 0.8986423463571355, + "grad_norm": 0.6086417436599731, + "learning_rate": 1.0681064537629027e-06, + "loss": 0.1691, + "step": 8671 + }, + { + "epoch": 0.8987459840397969, + "grad_norm": 0.6909456253051758, + "learning_rate": 1.065942906433577e-06, + "loss": 0.1838, + "step": 8672 + }, + { + "epoch": 0.8988496217224583, + "grad_norm": 0.6393236517906189, + "learning_rate": 1.063781492564373e-06, + "loss": 0.1817, + "step": 8673 + }, + { + "epoch": 0.8989532594051197, + "grad_norm": 0.8174936175346375, + "learning_rate": 1.061622212398834e-06, + "loss": 0.23, + "step": 8674 + }, + { + "epoch": 0.8990568970877811, + "grad_norm": 0.6729822754859924, + "learning_rate": 1.0594650661802608e-06, + "loss": 0.1935, + "step": 8675 + }, + { + "epoch": 0.8991605347704426, + "grad_norm": 0.6885797381401062, + "learning_rate": 1.0573100541517234e-06, + "loss": 0.1822, + "step": 8676 + }, + { + "epoch": 0.899264172453104, + "grad_norm": 0.5672880411148071, + "learning_rate": 1.0551571765560388e-06, + "loss": 0.1614, + "step": 8677 + }, + { + "epoch": 0.8993678101357654, + "grad_norm": 0.6624981164932251, + "learning_rate": 1.0530064336357948e-06, + "loss": 0.1921, + "step": 8678 + }, + { + "epoch": 0.8994714478184268, + "grad_norm": 0.6525665521621704, + "learning_rate": 1.0508578256333335e-06, + "loss": 0.1975, + "step": 8679 + }, + { + "epoch": 0.8995750855010882, + "grad_norm": 0.6125645637512207, + "learning_rate": 1.0487113527907522e-06, + "loss": 0.1559, + "step": 8680 + }, + { + "epoch": 0.8996787231837496, + "grad_norm": 0.6657199859619141, + "learning_rate": 1.0465670153499196e-06, + "loss": 0.1928, + "step": 8681 + }, + { + "epoch": 0.8997823608664111, + "grad_norm": 0.7960999608039856, + "learning_rate": 1.044424813552447e-06, + "loss": 0.2421, + "step": 8682 + }, + { + "epoch": 0.8998859985490725, + "grad_norm": 0.6669391989707947, + "learning_rate": 1.0422847476397235e-06, + "loss": 0.1811, + "step": 8683 + }, + { + "epoch": 0.8999896362317339, + "grad_norm": 0.7245315909385681, + "learning_rate": 1.0401468178528829e-06, + "loss": 0.1971, + "step": 8684 + }, + { + "epoch": 0.9000932739143953, + "grad_norm": 0.6361146569252014, + "learning_rate": 1.0380110244328256e-06, + "loss": 0.1836, + "step": 8685 + }, + { + "epoch": 0.9001969115970567, + "grad_norm": 0.806462287902832, + "learning_rate": 1.0358773676202105e-06, + "loss": 0.2434, + "step": 8686 + }, + { + "epoch": 0.9003005492797181, + "grad_norm": 0.6648523807525635, + "learning_rate": 1.0337458476554497e-06, + "loss": 0.1886, + "step": 8687 + }, + { + "epoch": 0.9004041869623796, + "grad_norm": 0.6641026735305786, + "learning_rate": 1.0316164647787263e-06, + "loss": 0.1828, + "step": 8688 + }, + { + "epoch": 0.900507824645041, + "grad_norm": 0.7108498215675354, + "learning_rate": 1.0294892192299688e-06, + "loss": 0.19, + "step": 8689 + }, + { + "epoch": 0.9006114623277024, + "grad_norm": 0.6778939962387085, + "learning_rate": 1.0273641112488787e-06, + "loss": 0.1905, + "step": 8690 + }, + { + "epoch": 0.9007151000103638, + "grad_norm": 0.5897559523582458, + "learning_rate": 1.0252411410749063e-06, + "loss": 0.156, + "step": 8691 + }, + { + "epoch": 0.9008187376930252, + "grad_norm": 0.8290709257125854, + "learning_rate": 1.0231203089472674e-06, + "loss": 0.2357, + "step": 8692 + }, + { + "epoch": 0.9009223753756866, + "grad_norm": 0.7354229092597961, + "learning_rate": 1.0210016151049329e-06, + "loss": 0.1803, + "step": 8693 + }, + { + "epoch": 0.9010260130583481, + "grad_norm": 0.8105208873748779, + "learning_rate": 1.0188850597866272e-06, + "loss": 0.2522, + "step": 8694 + }, + { + "epoch": 0.9011296507410095, + "grad_norm": 0.5526991486549377, + "learning_rate": 1.0167706432308487e-06, + "loss": 0.1541, + "step": 8695 + }, + { + "epoch": 0.9012332884236709, + "grad_norm": 0.7528881430625916, + "learning_rate": 1.0146583656758468e-06, + "loss": 0.2092, + "step": 8696 + }, + { + "epoch": 0.9013369261063323, + "grad_norm": 0.7546420693397522, + "learning_rate": 1.0125482273596222e-06, + "loss": 0.2322, + "step": 8697 + }, + { + "epoch": 0.9014405637889937, + "grad_norm": 0.685276985168457, + "learning_rate": 1.010440228519951e-06, + "loss": 0.1759, + "step": 8698 + }, + { + "epoch": 0.9015442014716551, + "grad_norm": 0.6832001209259033, + "learning_rate": 1.0083343693943548e-06, + "loss": 0.1734, + "step": 8699 + }, + { + "epoch": 0.9016478391543165, + "grad_norm": 0.6144412755966187, + "learning_rate": 1.006230650220117e-06, + "loss": 0.1697, + "step": 8700 + }, + { + "epoch": 0.9017514768369779, + "grad_norm": 0.8056115508079529, + "learning_rate": 1.0041290712342833e-06, + "loss": 0.2224, + "step": 8701 + }, + { + "epoch": 0.9018551145196393, + "grad_norm": 0.832954466342926, + "learning_rate": 1.0020296326736557e-06, + "loss": 0.2105, + "step": 8702 + }, + { + "epoch": 0.9019587522023007, + "grad_norm": 0.6306507587432861, + "learning_rate": 9.999323347747981e-07, + "loss": 0.1892, + "step": 8703 + }, + { + "epoch": 0.9020623898849621, + "grad_norm": 0.7022785544395447, + "learning_rate": 9.97837177774026e-07, + "loss": 0.1928, + "step": 8704 + }, + { + "epoch": 0.9021660275676235, + "grad_norm": 0.6971010565757751, + "learning_rate": 9.95744161907426e-07, + "loss": 0.1981, + "step": 8705 + }, + { + "epoch": 0.902269665250285, + "grad_norm": 0.7028632164001465, + "learning_rate": 9.936532874108296e-07, + "loss": 0.1925, + "step": 8706 + }, + { + "epoch": 0.9023733029329464, + "grad_norm": 0.7240568995475769, + "learning_rate": 9.915645545198304e-07, + "loss": 0.2021, + "step": 8707 + }, + { + "epoch": 0.9024769406156078, + "grad_norm": 0.747568666934967, + "learning_rate": 9.894779634697937e-07, + "loss": 0.1929, + "step": 8708 + }, + { + "epoch": 0.9025805782982692, + "grad_norm": 0.6814967393875122, + "learning_rate": 9.873935144958224e-07, + "loss": 0.1784, + "step": 8709 + }, + { + "epoch": 0.9026842159809306, + "grad_norm": 0.614301860332489, + "learning_rate": 9.853112078327954e-07, + "loss": 0.1796, + "step": 8710 + }, + { + "epoch": 0.902787853663592, + "grad_norm": 0.7195767164230347, + "learning_rate": 9.832310437153469e-07, + "loss": 0.2036, + "step": 8711 + }, + { + "epoch": 0.9028914913462535, + "grad_norm": 0.6737602949142456, + "learning_rate": 9.811530223778587e-07, + "loss": 0.1664, + "step": 8712 + }, + { + "epoch": 0.9029951290289149, + "grad_norm": 0.6514472961425781, + "learning_rate": 9.790771440544856e-07, + "loss": 0.1883, + "step": 8713 + }, + { + "epoch": 0.9030987667115763, + "grad_norm": 0.7326091527938843, + "learning_rate": 9.770034089791269e-07, + "loss": 0.191, + "step": 8714 + }, + { + "epoch": 0.9032024043942377, + "grad_norm": 0.7871467471122742, + "learning_rate": 9.749318173854515e-07, + "loss": 0.2391, + "step": 8715 + }, + { + "epoch": 0.9033060420768991, + "grad_norm": 0.6911875605583191, + "learning_rate": 9.728623695068885e-07, + "loss": 0.2012, + "step": 8716 + }, + { + "epoch": 0.9034096797595605, + "grad_norm": 0.7176652550697327, + "learning_rate": 9.707950655766152e-07, + "loss": 0.192, + "step": 8717 + }, + { + "epoch": 0.903513317442222, + "grad_norm": 0.5769132971763611, + "learning_rate": 9.687299058275723e-07, + "loss": 0.1668, + "step": 8718 + }, + { + "epoch": 0.9036169551248834, + "grad_norm": 0.7209014296531677, + "learning_rate": 9.666668904924558e-07, + "loss": 0.209, + "step": 8719 + }, + { + "epoch": 0.9037205928075448, + "grad_norm": 0.7249887585639954, + "learning_rate": 9.646060198037267e-07, + "loss": 0.2065, + "step": 8720 + }, + { + "epoch": 0.9038242304902062, + "grad_norm": 0.7001287937164307, + "learning_rate": 9.625472939936031e-07, + "loss": 0.208, + "step": 8721 + }, + { + "epoch": 0.9039278681728676, + "grad_norm": 0.7644567489624023, + "learning_rate": 9.604907132940511e-07, + "loss": 0.2086, + "step": 8722 + }, + { + "epoch": 0.904031505855529, + "grad_norm": 0.7459152340888977, + "learning_rate": 9.58436277936814e-07, + "loss": 0.2362, + "step": 8723 + }, + { + "epoch": 0.9041351435381905, + "grad_norm": 0.590069055557251, + "learning_rate": 9.563839881533754e-07, + "loss": 0.167, + "step": 8724 + }, + { + "epoch": 0.9042387812208519, + "grad_norm": 0.7091349959373474, + "learning_rate": 9.543338441749816e-07, + "loss": 0.1824, + "step": 8725 + }, + { + "epoch": 0.9043424189035133, + "grad_norm": 0.7616497874259949, + "learning_rate": 9.522858462326456e-07, + "loss": 0.2033, + "step": 8726 + }, + { + "epoch": 0.9044460565861747, + "grad_norm": 0.7694166302680969, + "learning_rate": 9.502399945571272e-07, + "loss": 0.199, + "step": 8727 + }, + { + "epoch": 0.9045496942688361, + "grad_norm": 0.6686285138130188, + "learning_rate": 9.481962893789575e-07, + "loss": 0.1821, + "step": 8728 + }, + { + "epoch": 0.9046533319514976, + "grad_norm": 0.5675410032272339, + "learning_rate": 9.461547309284103e-07, + "loss": 0.1629, + "step": 8729 + }, + { + "epoch": 0.904756969634159, + "grad_norm": 0.7331852912902832, + "learning_rate": 9.441153194355301e-07, + "loss": 0.2246, + "step": 8730 + }, + { + "epoch": 0.9048606073168204, + "grad_norm": 0.6197007298469543, + "learning_rate": 9.420780551301134e-07, + "loss": 0.1641, + "step": 8731 + }, + { + "epoch": 0.9049642449994818, + "grad_norm": 0.6497044563293457, + "learning_rate": 9.40042938241712e-07, + "loss": 0.1727, + "step": 8732 + }, + { + "epoch": 0.9050678826821432, + "grad_norm": 0.7364713549613953, + "learning_rate": 9.380099689996447e-07, + "loss": 0.2017, + "step": 8733 + }, + { + "epoch": 0.9051715203648046, + "grad_norm": 0.6770288348197937, + "learning_rate": 9.359791476329793e-07, + "loss": 0.1878, + "step": 8734 + }, + { + "epoch": 0.905275158047466, + "grad_norm": 0.679532527923584, + "learning_rate": 9.339504743705508e-07, + "loss": 0.2033, + "step": 8735 + }, + { + "epoch": 0.9053787957301275, + "grad_norm": 0.6726140975952148, + "learning_rate": 9.319239494409427e-07, + "loss": 0.1659, + "step": 8736 + }, + { + "epoch": 0.9054824334127889, + "grad_norm": 0.6465378403663635, + "learning_rate": 9.298995730725058e-07, + "loss": 0.1983, + "step": 8737 + }, + { + "epoch": 0.9055860710954503, + "grad_norm": 0.7215073108673096, + "learning_rate": 9.278773454933376e-07, + "loss": 0.2055, + "step": 8738 + }, + { + "epoch": 0.9056897087781117, + "grad_norm": 0.7202661037445068, + "learning_rate": 9.258572669313004e-07, + "loss": 0.1934, + "step": 8739 + }, + { + "epoch": 0.9057933464607731, + "grad_norm": 0.7599125504493713, + "learning_rate": 9.238393376140142e-07, + "loss": 0.1996, + "step": 8740 + }, + { + "epoch": 0.9058969841434346, + "grad_norm": 0.813451886177063, + "learning_rate": 9.218235577688594e-07, + "loss": 0.2303, + "step": 8741 + }, + { + "epoch": 0.906000621826096, + "grad_norm": 0.858730137348175, + "learning_rate": 9.198099276229699e-07, + "loss": 0.2127, + "step": 8742 + }, + { + "epoch": 0.9061042595087574, + "grad_norm": 0.6611201167106628, + "learning_rate": 9.177984474032353e-07, + "loss": 0.1684, + "step": 8743 + }, + { + "epoch": 0.9062078971914188, + "grad_norm": 0.614651620388031, + "learning_rate": 9.157891173363075e-07, + "loss": 0.158, + "step": 8744 + }, + { + "epoch": 0.9063115348740802, + "grad_norm": 0.7337183952331543, + "learning_rate": 9.137819376485924e-07, + "loss": 0.1859, + "step": 8745 + }, + { + "epoch": 0.9064151725567416, + "grad_norm": 0.6367558240890503, + "learning_rate": 9.11776908566262e-07, + "loss": 0.1739, + "step": 8746 + }, + { + "epoch": 0.9065188102394031, + "grad_norm": 0.7510764598846436, + "learning_rate": 9.097740303152336e-07, + "loss": 0.2334, + "step": 8747 + }, + { + "epoch": 0.9066224479220645, + "grad_norm": 0.7323877215385437, + "learning_rate": 9.077733031211955e-07, + "loss": 0.213, + "step": 8748 + }, + { + "epoch": 0.9067260856047259, + "grad_norm": 0.6575753092765808, + "learning_rate": 9.057747272095807e-07, + "loss": 0.1793, + "step": 8749 + }, + { + "epoch": 0.9068297232873873, + "grad_norm": 0.6251068711280823, + "learning_rate": 9.037783028055847e-07, + "loss": 0.1631, + "step": 8750 + }, + { + "epoch": 0.9069333609700487, + "grad_norm": 0.6162066459655762, + "learning_rate": 9.017840301341651e-07, + "loss": 0.1867, + "step": 8751 + }, + { + "epoch": 0.9070369986527101, + "grad_norm": 0.6379433274269104, + "learning_rate": 8.997919094200314e-07, + "loss": 0.1753, + "step": 8752 + }, + { + "epoch": 0.9071406363353716, + "grad_norm": 0.6354124546051025, + "learning_rate": 8.978019408876548e-07, + "loss": 0.1817, + "step": 8753 + }, + { + "epoch": 0.907244274018033, + "grad_norm": 0.6392737627029419, + "learning_rate": 8.958141247612606e-07, + "loss": 0.1569, + "step": 8754 + }, + { + "epoch": 0.9073479117006944, + "grad_norm": 0.5502351522445679, + "learning_rate": 8.938284612648318e-07, + "loss": 0.1339, + "step": 8755 + }, + { + "epoch": 0.9074515493833558, + "grad_norm": 0.7764959931373596, + "learning_rate": 8.918449506221138e-07, + "loss": 0.2219, + "step": 8756 + }, + { + "epoch": 0.9075551870660172, + "grad_norm": 0.7255973815917969, + "learning_rate": 8.898635930565991e-07, + "loss": 0.1985, + "step": 8757 + }, + { + "epoch": 0.9076588247486786, + "grad_norm": 0.6193172931671143, + "learning_rate": 8.878843887915489e-07, + "loss": 0.1817, + "step": 8758 + }, + { + "epoch": 0.9077624624313401, + "grad_norm": 0.7121764421463013, + "learning_rate": 8.859073380499739e-07, + "loss": 0.1974, + "step": 8759 + }, + { + "epoch": 0.9078661001140015, + "grad_norm": 0.7299541234970093, + "learning_rate": 8.839324410546468e-07, + "loss": 0.2213, + "step": 8760 + }, + { + "epoch": 0.9079697377966629, + "grad_norm": 0.7306295037269592, + "learning_rate": 8.819596980280964e-07, + "loss": 0.2208, + "step": 8761 + }, + { + "epoch": 0.9080733754793243, + "grad_norm": 0.6126157641410828, + "learning_rate": 8.799891091926094e-07, + "loss": 0.1538, + "step": 8762 + }, + { + "epoch": 0.9081770131619857, + "grad_norm": 0.769790768623352, + "learning_rate": 8.780206747702258e-07, + "loss": 0.2081, + "step": 8763 + }, + { + "epoch": 0.9082806508446472, + "grad_norm": 0.7183113098144531, + "learning_rate": 8.760543949827461e-07, + "loss": 0.1936, + "step": 8764 + }, + { + "epoch": 0.9083842885273086, + "grad_norm": 0.6745706796646118, + "learning_rate": 8.740902700517284e-07, + "loss": 0.1883, + "step": 8765 + }, + { + "epoch": 0.90848792620997, + "grad_norm": 0.79713374376297, + "learning_rate": 8.721283001984871e-07, + "loss": 0.2036, + "step": 8766 + }, + { + "epoch": 0.9085915638926314, + "grad_norm": 0.8247671723365784, + "learning_rate": 8.70168485644094e-07, + "loss": 0.207, + "step": 8767 + }, + { + "epoch": 0.9086952015752928, + "grad_norm": 0.6058986783027649, + "learning_rate": 8.682108266093792e-07, + "loss": 0.1597, + "step": 8768 + }, + { + "epoch": 0.9087988392579542, + "grad_norm": 0.6964036226272583, + "learning_rate": 8.662553233149284e-07, + "loss": 0.1965, + "step": 8769 + }, + { + "epoch": 0.9089024769406157, + "grad_norm": 0.6855599880218506, + "learning_rate": 8.643019759810811e-07, + "loss": 0.2013, + "step": 8770 + }, + { + "epoch": 0.9090061146232771, + "grad_norm": 0.7175893187522888, + "learning_rate": 8.623507848279433e-07, + "loss": 0.18, + "step": 8771 + }, + { + "epoch": 0.9091097523059385, + "grad_norm": 0.7140921354293823, + "learning_rate": 8.604017500753659e-07, + "loss": 0.2082, + "step": 8772 + }, + { + "epoch": 0.9092133899885999, + "grad_norm": 0.7414721250534058, + "learning_rate": 8.584548719429664e-07, + "loss": 0.2212, + "step": 8773 + }, + { + "epoch": 0.9093170276712613, + "grad_norm": 0.7165692448616028, + "learning_rate": 8.565101506501206e-07, + "loss": 0.2057, + "step": 8774 + }, + { + "epoch": 0.9094206653539227, + "grad_norm": 0.5552352070808411, + "learning_rate": 8.545675864159464e-07, + "loss": 0.1537, + "step": 8775 + }, + { + "epoch": 0.909524303036584, + "grad_norm": 0.746362030506134, + "learning_rate": 8.526271794593377e-07, + "loss": 0.21, + "step": 8776 + }, + { + "epoch": 0.9096279407192455, + "grad_norm": 0.7297185659408569, + "learning_rate": 8.506889299989307e-07, + "loss": 0.2053, + "step": 8777 + }, + { + "epoch": 0.9097315784019069, + "grad_norm": 0.7221811413764954, + "learning_rate": 8.487528382531263e-07, + "loss": 0.1842, + "step": 8778 + }, + { + "epoch": 0.9098352160845683, + "grad_norm": 0.7709521651268005, + "learning_rate": 8.468189044400832e-07, + "loss": 0.2198, + "step": 8779 + }, + { + "epoch": 0.9099388537672297, + "grad_norm": 0.6700122356414795, + "learning_rate": 8.448871287777116e-07, + "loss": 0.1561, + "step": 8780 + }, + { + "epoch": 0.9100424914498911, + "grad_norm": 0.7160411477088928, + "learning_rate": 8.429575114836819e-07, + "loss": 0.1951, + "step": 8781 + }, + { + "epoch": 0.9101461291325526, + "grad_norm": 0.6808386445045471, + "learning_rate": 8.410300527754178e-07, + "loss": 0.171, + "step": 8782 + }, + { + "epoch": 0.910249766815214, + "grad_norm": 0.730686604976654, + "learning_rate": 8.391047528701035e-07, + "loss": 0.1973, + "step": 8783 + }, + { + "epoch": 0.9103534044978754, + "grad_norm": 0.6939843893051147, + "learning_rate": 8.37181611984681e-07, + "loss": 0.1772, + "step": 8784 + }, + { + "epoch": 0.9104570421805368, + "grad_norm": 0.6927918195724487, + "learning_rate": 8.352606303358435e-07, + "loss": 0.2027, + "step": 8785 + }, + { + "epoch": 0.9105606798631982, + "grad_norm": 0.7357034087181091, + "learning_rate": 8.33341808140049e-07, + "loss": 0.232, + "step": 8786 + }, + { + "epoch": 0.9106643175458596, + "grad_norm": 0.592159628868103, + "learning_rate": 8.314251456135047e-07, + "loss": 0.1668, + "step": 8787 + }, + { + "epoch": 0.910767955228521, + "grad_norm": 0.6434322595596313, + "learning_rate": 8.295106429721733e-07, + "loss": 0.1891, + "step": 8788 + }, + { + "epoch": 0.9108715929111825, + "grad_norm": 0.7842497229576111, + "learning_rate": 8.275983004317845e-07, + "loss": 0.2292, + "step": 8789 + }, + { + "epoch": 0.9109752305938439, + "grad_norm": 0.682867169380188, + "learning_rate": 8.256881182078125e-07, + "loss": 0.2043, + "step": 8790 + }, + { + "epoch": 0.9110788682765053, + "grad_norm": 0.7165136933326721, + "learning_rate": 8.237800965154985e-07, + "loss": 0.2061, + "step": 8791 + }, + { + "epoch": 0.9111825059591667, + "grad_norm": 0.7450674176216125, + "learning_rate": 8.218742355698306e-07, + "loss": 0.2129, + "step": 8792 + }, + { + "epoch": 0.9112861436418281, + "grad_norm": 0.7600066065788269, + "learning_rate": 8.199705355855637e-07, + "loss": 0.2152, + "step": 8793 + }, + { + "epoch": 0.9113897813244896, + "grad_norm": 0.6899908185005188, + "learning_rate": 8.180689967772016e-07, + "loss": 0.1928, + "step": 8794 + }, + { + "epoch": 0.911493419007151, + "grad_norm": 0.7446398138999939, + "learning_rate": 8.16169619359004e-07, + "loss": 0.1994, + "step": 8795 + }, + { + "epoch": 0.9115970566898124, + "grad_norm": 0.6524834632873535, + "learning_rate": 8.142724035449934e-07, + "loss": 0.1942, + "step": 8796 + }, + { + "epoch": 0.9117006943724738, + "grad_norm": 0.804182231426239, + "learning_rate": 8.123773495489406e-07, + "loss": 0.1911, + "step": 8797 + }, + { + "epoch": 0.9118043320551352, + "grad_norm": 0.677555501461029, + "learning_rate": 8.104844575843795e-07, + "loss": 0.1789, + "step": 8798 + }, + { + "epoch": 0.9119079697377966, + "grad_norm": 0.6828414797782898, + "learning_rate": 8.085937278646039e-07, + "loss": 0.1907, + "step": 8799 + }, + { + "epoch": 0.9120116074204581, + "grad_norm": 0.5415399074554443, + "learning_rate": 8.067051606026521e-07, + "loss": 0.1591, + "step": 8800 + }, + { + "epoch": 0.9121152451031195, + "grad_norm": 0.7139943838119507, + "learning_rate": 8.048187560113274e-07, + "loss": 0.1868, + "step": 8801 + }, + { + "epoch": 0.9122188827857809, + "grad_norm": 0.8067030906677246, + "learning_rate": 8.029345143031819e-07, + "loss": 0.2089, + "step": 8802 + }, + { + "epoch": 0.9123225204684423, + "grad_norm": 0.6702319979667664, + "learning_rate": 8.010524356905325e-07, + "loss": 0.1661, + "step": 8803 + }, + { + "epoch": 0.9124261581511037, + "grad_norm": 0.786279559135437, + "learning_rate": 7.99172520385454e-07, + "loss": 0.2033, + "step": 8804 + }, + { + "epoch": 0.9125297958337651, + "grad_norm": 0.6434553861618042, + "learning_rate": 7.972947685997634e-07, + "loss": 0.1687, + "step": 8805 + }, + { + "epoch": 0.9126334335164266, + "grad_norm": 0.729469358921051, + "learning_rate": 7.954191805450518e-07, + "loss": 0.1963, + "step": 8806 + }, + { + "epoch": 0.912737071199088, + "grad_norm": 0.8612606525421143, + "learning_rate": 7.935457564326477e-07, + "loss": 0.2483, + "step": 8807 + }, + { + "epoch": 0.9128407088817494, + "grad_norm": 0.6381658911705017, + "learning_rate": 7.916744964736511e-07, + "loss": 0.1866, + "step": 8808 + }, + { + "epoch": 0.9129443465644108, + "grad_norm": 0.7000229954719543, + "learning_rate": 7.898054008789157e-07, + "loss": 0.1798, + "step": 8809 + }, + { + "epoch": 0.9130479842470722, + "grad_norm": 0.6891587972640991, + "learning_rate": 7.87938469859042e-07, + "loss": 0.1968, + "step": 8810 + }, + { + "epoch": 0.9131516219297336, + "grad_norm": 0.7172576785087585, + "learning_rate": 7.860737036243993e-07, + "loss": 0.1819, + "step": 8811 + }, + { + "epoch": 0.9132552596123951, + "grad_norm": 0.5555979013442993, + "learning_rate": 7.842111023851018e-07, + "loss": 0.1501, + "step": 8812 + }, + { + "epoch": 0.9133588972950565, + "grad_norm": 0.7300063967704773, + "learning_rate": 7.823506663510239e-07, + "loss": 0.2363, + "step": 8813 + }, + { + "epoch": 0.9134625349777179, + "grad_norm": 0.645727276802063, + "learning_rate": 7.804923957318001e-07, + "loss": 0.1643, + "step": 8814 + }, + { + "epoch": 0.9135661726603793, + "grad_norm": 0.6470401883125305, + "learning_rate": 7.786362907368139e-07, + "loss": 0.1829, + "step": 8815 + }, + { + "epoch": 0.9136698103430407, + "grad_norm": 0.6089012622833252, + "learning_rate": 7.767823515752116e-07, + "loss": 0.1669, + "step": 8816 + }, + { + "epoch": 0.9137734480257022, + "grad_norm": 0.6825041770935059, + "learning_rate": 7.749305784558902e-07, + "loss": 0.1641, + "step": 8817 + }, + { + "epoch": 0.9138770857083636, + "grad_norm": 0.7475860714912415, + "learning_rate": 7.730809715875076e-07, + "loss": 0.2253, + "step": 8818 + }, + { + "epoch": 0.913980723391025, + "grad_norm": 0.6520041227340698, + "learning_rate": 7.712335311784703e-07, + "loss": 0.1733, + "step": 8819 + }, + { + "epoch": 0.9140843610736864, + "grad_norm": 0.6163454055786133, + "learning_rate": 7.693882574369471e-07, + "loss": 0.1588, + "step": 8820 + }, + { + "epoch": 0.9141879987563478, + "grad_norm": 0.7530646324157715, + "learning_rate": 7.675451505708609e-07, + "loss": 0.2205, + "step": 8821 + }, + { + "epoch": 0.9142916364390092, + "grad_norm": 0.6425605416297913, + "learning_rate": 7.657042107878898e-07, + "loss": 0.1807, + "step": 8822 + }, + { + "epoch": 0.9143952741216707, + "grad_norm": 0.831058919429779, + "learning_rate": 7.638654382954657e-07, + "loss": 0.2212, + "step": 8823 + }, + { + "epoch": 0.9144989118043321, + "grad_norm": 0.7559433579444885, + "learning_rate": 7.62028833300783e-07, + "loss": 0.2217, + "step": 8824 + }, + { + "epoch": 0.9146025494869935, + "grad_norm": 0.736668050289154, + "learning_rate": 7.601943960107871e-07, + "loss": 0.226, + "step": 8825 + }, + { + "epoch": 0.9147061871696549, + "grad_norm": 0.7217003107070923, + "learning_rate": 7.583621266321773e-07, + "loss": 0.1864, + "step": 8826 + }, + { + "epoch": 0.9148098248523163, + "grad_norm": 0.6183484196662903, + "learning_rate": 7.565320253714082e-07, + "loss": 0.1827, + "step": 8827 + }, + { + "epoch": 0.9149134625349777, + "grad_norm": 0.7267236709594727, + "learning_rate": 7.547040924346948e-07, + "loss": 0.2064, + "step": 8828 + }, + { + "epoch": 0.9150171002176392, + "grad_norm": 0.7210378050804138, + "learning_rate": 7.528783280280127e-07, + "loss": 0.214, + "step": 8829 + }, + { + "epoch": 0.9151207379003006, + "grad_norm": 0.6646870970726013, + "learning_rate": 7.510547323570749e-07, + "loss": 0.1879, + "step": 8830 + }, + { + "epoch": 0.915224375582962, + "grad_norm": 0.7031145691871643, + "learning_rate": 7.492333056273704e-07, + "loss": 0.1817, + "step": 8831 + }, + { + "epoch": 0.9153280132656234, + "grad_norm": 0.6843001842498779, + "learning_rate": 7.474140480441305e-07, + "loss": 0.2295, + "step": 8832 + }, + { + "epoch": 0.9154316509482848, + "grad_norm": 0.6840508580207825, + "learning_rate": 7.455969598123447e-07, + "loss": 0.2066, + "step": 8833 + }, + { + "epoch": 0.9155352886309462, + "grad_norm": 0.7510146498680115, + "learning_rate": 7.437820411367669e-07, + "loss": 0.2307, + "step": 8834 + }, + { + "epoch": 0.9156389263136077, + "grad_norm": 0.6502298712730408, + "learning_rate": 7.419692922218891e-07, + "loss": 0.2034, + "step": 8835 + }, + { + "epoch": 0.9157425639962691, + "grad_norm": 0.6547967195510864, + "learning_rate": 7.401587132719767e-07, + "loss": 0.1581, + "step": 8836 + }, + { + "epoch": 0.9158462016789305, + "grad_norm": 0.6777070164680481, + "learning_rate": 7.383503044910423e-07, + "loss": 0.1929, + "step": 8837 + }, + { + "epoch": 0.9159498393615919, + "grad_norm": 0.6916144490242004, + "learning_rate": 7.365440660828472e-07, + "loss": 0.1757, + "step": 8838 + }, + { + "epoch": 0.9160534770442533, + "grad_norm": 0.6567953824996948, + "learning_rate": 7.347399982509263e-07, + "loss": 0.2006, + "step": 8839 + }, + { + "epoch": 0.9161571147269147, + "grad_norm": 0.6022465825080872, + "learning_rate": 7.329381011985504e-07, + "loss": 0.1526, + "step": 8840 + }, + { + "epoch": 0.9162607524095762, + "grad_norm": 0.7050966024398804, + "learning_rate": 7.311383751287616e-07, + "loss": 0.1838, + "step": 8841 + }, + { + "epoch": 0.9163643900922376, + "grad_norm": 0.7759564518928528, + "learning_rate": 7.293408202443441e-07, + "loss": 0.2183, + "step": 8842 + }, + { + "epoch": 0.916468027774899, + "grad_norm": 0.6298695206642151, + "learning_rate": 7.275454367478474e-07, + "loss": 0.1592, + "step": 8843 + }, + { + "epoch": 0.9165716654575604, + "grad_norm": 0.6032199263572693, + "learning_rate": 7.257522248415716e-07, + "loss": 0.1811, + "step": 8844 + }, + { + "epoch": 0.9166753031402218, + "grad_norm": 0.6469445824623108, + "learning_rate": 7.239611847275707e-07, + "loss": 0.1816, + "step": 8845 + }, + { + "epoch": 0.9167789408228832, + "grad_norm": 0.6629596948623657, + "learning_rate": 7.221723166076611e-07, + "loss": 0.1962, + "step": 8846 + }, + { + "epoch": 0.9168825785055447, + "grad_norm": 0.6826691627502441, + "learning_rate": 7.203856206834037e-07, + "loss": 0.2113, + "step": 8847 + }, + { + "epoch": 0.9169862161882061, + "grad_norm": 0.80489182472229, + "learning_rate": 7.186010971561241e-07, + "loss": 0.1662, + "step": 8848 + }, + { + "epoch": 0.9170898538708675, + "grad_norm": 0.5962115526199341, + "learning_rate": 7.168187462269016e-07, + "loss": 0.1684, + "step": 8849 + }, + { + "epoch": 0.9171934915535289, + "grad_norm": 0.6177490949630737, + "learning_rate": 7.150385680965666e-07, + "loss": 0.1645, + "step": 8850 + }, + { + "epoch": 0.9172971292361903, + "grad_norm": 0.5791274309158325, + "learning_rate": 7.132605629657052e-07, + "loss": 0.1671, + "step": 8851 + }, + { + "epoch": 0.9174007669188516, + "grad_norm": 0.9155428409576416, + "learning_rate": 7.114847310346617e-07, + "loss": 0.2238, + "step": 8852 + }, + { + "epoch": 0.9175044046015131, + "grad_norm": 0.694946825504303, + "learning_rate": 7.097110725035339e-07, + "loss": 0.201, + "step": 8853 + }, + { + "epoch": 0.9176080422841745, + "grad_norm": 0.5938006639480591, + "learning_rate": 7.079395875721751e-07, + "loss": 0.1707, + "step": 8854 + }, + { + "epoch": 0.9177116799668359, + "grad_norm": 0.6835440993309021, + "learning_rate": 7.061702764401945e-07, + "loss": 0.1877, + "step": 8855 + }, + { + "epoch": 0.9178153176494973, + "grad_norm": 0.6363372206687927, + "learning_rate": 7.04403139306955e-07, + "loss": 0.1741, + "step": 8856 + }, + { + "epoch": 0.9179189553321587, + "grad_norm": 0.634978711605072, + "learning_rate": 7.026381763715729e-07, + "loss": 0.1802, + "step": 8857 + }, + { + "epoch": 0.9180225930148201, + "grad_norm": 0.5975295901298523, + "learning_rate": 7.008753878329222e-07, + "loss": 0.1777, + "step": 8858 + }, + { + "epoch": 0.9181262306974816, + "grad_norm": 0.7016348242759705, + "learning_rate": 6.991147738896331e-07, + "loss": 0.2119, + "step": 8859 + }, + { + "epoch": 0.918229868380143, + "grad_norm": 0.6460996866226196, + "learning_rate": 6.973563347400869e-07, + "loss": 0.1849, + "step": 8860 + }, + { + "epoch": 0.9183335060628044, + "grad_norm": 0.6963163614273071, + "learning_rate": 6.956000705824228e-07, + "loss": 0.1901, + "step": 8861 + }, + { + "epoch": 0.9184371437454658, + "grad_norm": 0.7260379195213318, + "learning_rate": 6.938459816145316e-07, + "loss": 0.201, + "step": 8862 + }, + { + "epoch": 0.9185407814281272, + "grad_norm": 0.5716794729232788, + "learning_rate": 6.92094068034066e-07, + "loss": 0.1861, + "step": 8863 + }, + { + "epoch": 0.9186444191107886, + "grad_norm": 0.7157220244407654, + "learning_rate": 6.90344330038426e-07, + "loss": 0.1921, + "step": 8864 + }, + { + "epoch": 0.9187480567934501, + "grad_norm": 0.6244716048240662, + "learning_rate": 6.885967678247652e-07, + "loss": 0.1564, + "step": 8865 + }, + { + "epoch": 0.9188516944761115, + "grad_norm": 0.6170865893363953, + "learning_rate": 6.868513815900057e-07, + "loss": 0.1619, + "step": 8866 + }, + { + "epoch": 0.9189553321587729, + "grad_norm": 0.7579802870750427, + "learning_rate": 6.851081715308061e-07, + "loss": 0.1691, + "step": 8867 + }, + { + "epoch": 0.9190589698414343, + "grad_norm": 0.6972897052764893, + "learning_rate": 6.833671378435913e-07, + "loss": 0.1976, + "step": 8868 + }, + { + "epoch": 0.9191626075240957, + "grad_norm": 0.8163078427314758, + "learning_rate": 6.816282807245444e-07, + "loss": 0.2038, + "step": 8869 + }, + { + "epoch": 0.9192662452067571, + "grad_norm": 0.7421157956123352, + "learning_rate": 6.798916003695888e-07, + "loss": 0.1971, + "step": 8870 + }, + { + "epoch": 0.9193698828894186, + "grad_norm": 0.6653786301612854, + "learning_rate": 6.781570969744145e-07, + "loss": 0.1626, + "step": 8871 + }, + { + "epoch": 0.91947352057208, + "grad_norm": 0.7527048587799072, + "learning_rate": 6.764247707344606e-07, + "loss": 0.1949, + "step": 8872 + }, + { + "epoch": 0.9195771582547414, + "grad_norm": 0.6733422875404358, + "learning_rate": 6.746946218449224e-07, + "loss": 0.1754, + "step": 8873 + }, + { + "epoch": 0.9196807959374028, + "grad_norm": 0.7400389909744263, + "learning_rate": 6.729666505007571e-07, + "loss": 0.202, + "step": 8874 + }, + { + "epoch": 0.9197844336200642, + "grad_norm": 0.791100263595581, + "learning_rate": 6.712408568966644e-07, + "loss": 0.2167, + "step": 8875 + }, + { + "epoch": 0.9198880713027257, + "grad_norm": 0.6577385663986206, + "learning_rate": 6.695172412271044e-07, + "loss": 0.1858, + "step": 8876 + }, + { + "epoch": 0.9199917089853871, + "grad_norm": 0.720878005027771, + "learning_rate": 6.677958036862908e-07, + "loss": 0.2019, + "step": 8877 + }, + { + "epoch": 0.9200953466680485, + "grad_norm": 0.7390361428260803, + "learning_rate": 6.660765444681927e-07, + "loss": 0.2165, + "step": 8878 + }, + { + "epoch": 0.9201989843507099, + "grad_norm": 0.7230072617530823, + "learning_rate": 6.643594637665374e-07, + "loss": 0.1955, + "step": 8879 + }, + { + "epoch": 0.9203026220333713, + "grad_norm": 0.6402953863143921, + "learning_rate": 6.626445617747968e-07, + "loss": 0.1666, + "step": 8880 + }, + { + "epoch": 0.9204062597160327, + "grad_norm": 0.5999537706375122, + "learning_rate": 6.609318386862096e-07, + "loss": 0.1816, + "step": 8881 + }, + { + "epoch": 0.9205098973986942, + "grad_norm": 0.7926232814788818, + "learning_rate": 6.592212946937571e-07, + "loss": 0.1697, + "step": 8882 + }, + { + "epoch": 0.9206135350813556, + "grad_norm": 0.6867687702178955, + "learning_rate": 6.575129299901828e-07, + "loss": 0.1815, + "step": 8883 + }, + { + "epoch": 0.920717172764017, + "grad_norm": 0.5919355750083923, + "learning_rate": 6.558067447679861e-07, + "loss": 0.1497, + "step": 8884 + }, + { + "epoch": 0.9208208104466784, + "grad_norm": 0.7224149703979492, + "learning_rate": 6.541027392194111e-07, + "loss": 0.1787, + "step": 8885 + }, + { + "epoch": 0.9209244481293398, + "grad_norm": 0.6357755064964294, + "learning_rate": 6.524009135364684e-07, + "loss": 0.1604, + "step": 8886 + }, + { + "epoch": 0.9210280858120012, + "grad_norm": 0.6600928902626038, + "learning_rate": 6.507012679109115e-07, + "loss": 0.1786, + "step": 8887 + }, + { + "epoch": 0.9211317234946627, + "grad_norm": 0.6421642303466797, + "learning_rate": 6.490038025342604e-07, + "loss": 0.1625, + "step": 8888 + }, + { + "epoch": 0.9212353611773241, + "grad_norm": 0.6637979745864868, + "learning_rate": 6.473085175977778e-07, + "loss": 0.1932, + "step": 8889 + }, + { + "epoch": 0.9213389988599855, + "grad_norm": 0.7803764343261719, + "learning_rate": 6.456154132924841e-07, + "loss": 0.2317, + "step": 8890 + }, + { + "epoch": 0.9214426365426469, + "grad_norm": 0.5398008227348328, + "learning_rate": 6.439244898091623e-07, + "loss": 0.1584, + "step": 8891 + }, + { + "epoch": 0.9215462742253083, + "grad_norm": 0.6695163249969482, + "learning_rate": 6.422357473383378e-07, + "loss": 0.1805, + "step": 8892 + }, + { + "epoch": 0.9216499119079697, + "grad_norm": 0.7654611468315125, + "learning_rate": 6.40549186070294e-07, + "loss": 0.2207, + "step": 8893 + }, + { + "epoch": 0.9217535495906312, + "grad_norm": 0.5898266434669495, + "learning_rate": 6.388648061950786e-07, + "loss": 0.1428, + "step": 8894 + }, + { + "epoch": 0.9218571872732926, + "grad_norm": 0.7077470421791077, + "learning_rate": 6.371826079024778e-07, + "loss": 0.1897, + "step": 8895 + }, + { + "epoch": 0.921960824955954, + "grad_norm": 0.7756674885749817, + "learning_rate": 6.355025913820401e-07, + "loss": 0.2364, + "step": 8896 + }, + { + "epoch": 0.9220644626386154, + "grad_norm": 0.7032382488250732, + "learning_rate": 6.338247568230671e-07, + "loss": 0.2081, + "step": 8897 + }, + { + "epoch": 0.9221681003212768, + "grad_norm": 0.5878440141677856, + "learning_rate": 6.321491044146145e-07, + "loss": 0.1781, + "step": 8898 + }, + { + "epoch": 0.9222717380039382, + "grad_norm": 0.6283981800079346, + "learning_rate": 6.304756343454954e-07, + "loss": 0.1764, + "step": 8899 + }, + { + "epoch": 0.9223753756865997, + "grad_norm": 0.6028159856796265, + "learning_rate": 6.288043468042704e-07, + "loss": 0.1623, + "step": 8900 + }, + { + "epoch": 0.9224790133692611, + "grad_norm": 0.7289518117904663, + "learning_rate": 6.271352419792576e-07, + "loss": 0.1888, + "step": 8901 + }, + { + "epoch": 0.9225826510519225, + "grad_norm": 0.7628673315048218, + "learning_rate": 6.25468320058531e-07, + "loss": 0.2095, + "step": 8902 + }, + { + "epoch": 0.9226862887345839, + "grad_norm": 0.6608951091766357, + "learning_rate": 6.238035812299137e-07, + "loss": 0.1904, + "step": 8903 + }, + { + "epoch": 0.9227899264172453, + "grad_norm": 0.7164463996887207, + "learning_rate": 6.22141025680989e-07, + "loss": 0.1777, + "step": 8904 + }, + { + "epoch": 0.9228935640999067, + "grad_norm": 0.7240995168685913, + "learning_rate": 6.204806535990893e-07, + "loss": 0.1771, + "step": 8905 + }, + { + "epoch": 0.9229972017825682, + "grad_norm": 0.5761838555335999, + "learning_rate": 6.188224651713071e-07, + "loss": 0.1562, + "step": 8906 + }, + { + "epoch": 0.9231008394652296, + "grad_norm": 0.7067749500274658, + "learning_rate": 6.171664605844796e-07, + "loss": 0.1994, + "step": 8907 + }, + { + "epoch": 0.923204477147891, + "grad_norm": 0.7407668828964233, + "learning_rate": 6.155126400252021e-07, + "loss": 0.1908, + "step": 8908 + }, + { + "epoch": 0.9233081148305524, + "grad_norm": 0.8542502522468567, + "learning_rate": 6.138610036798276e-07, + "loss": 0.2135, + "step": 8909 + }, + { + "epoch": 0.9234117525132138, + "grad_norm": 0.5649498105049133, + "learning_rate": 6.122115517344585e-07, + "loss": 0.1675, + "step": 8910 + }, + { + "epoch": 0.9235153901958753, + "grad_norm": 0.729465663433075, + "learning_rate": 6.105642843749526e-07, + "loss": 0.209, + "step": 8911 + }, + { + "epoch": 0.9236190278785367, + "grad_norm": 0.7549067139625549, + "learning_rate": 6.089192017869217e-07, + "loss": 0.2098, + "step": 8912 + }, + { + "epoch": 0.9237226655611981, + "grad_norm": 0.6583209037780762, + "learning_rate": 6.072763041557328e-07, + "loss": 0.188, + "step": 8913 + }, + { + "epoch": 0.9238263032438595, + "grad_norm": 0.7007696628570557, + "learning_rate": 6.056355916665024e-07, + "loss": 0.1745, + "step": 8914 + }, + { + "epoch": 0.9239299409265209, + "grad_norm": 0.7478169798851013, + "learning_rate": 6.039970645041027e-07, + "loss": 0.2256, + "step": 8915 + }, + { + "epoch": 0.9240335786091823, + "grad_norm": 0.6510449051856995, + "learning_rate": 6.023607228531659e-07, + "loss": 0.1608, + "step": 8916 + }, + { + "epoch": 0.9241372162918438, + "grad_norm": 0.7374513745307922, + "learning_rate": 6.007265668980644e-07, + "loss": 0.2035, + "step": 8917 + }, + { + "epoch": 0.9242408539745052, + "grad_norm": 0.8026915192604065, + "learning_rate": 5.990945968229378e-07, + "loss": 0.2051, + "step": 8918 + }, + { + "epoch": 0.9243444916571666, + "grad_norm": 0.6376965045928955, + "learning_rate": 5.974648128116744e-07, + "loss": 0.162, + "step": 8919 + }, + { + "epoch": 0.924448129339828, + "grad_norm": 0.7291052341461182, + "learning_rate": 5.958372150479141e-07, + "loss": 0.1945, + "step": 8920 + }, + { + "epoch": 0.9245517670224894, + "grad_norm": 0.7327719330787659, + "learning_rate": 5.942118037150524e-07, + "loss": 0.1804, + "step": 8921 + }, + { + "epoch": 0.9246554047051508, + "grad_norm": 0.650361180305481, + "learning_rate": 5.92588578996236e-07, + "loss": 0.1703, + "step": 8922 + }, + { + "epoch": 0.9247590423878123, + "grad_norm": 0.572733998298645, + "learning_rate": 5.909675410743676e-07, + "loss": 0.1423, + "step": 8923 + }, + { + "epoch": 0.9248626800704737, + "grad_norm": 0.7012404799461365, + "learning_rate": 5.893486901321077e-07, + "loss": 0.1902, + "step": 8924 + }, + { + "epoch": 0.9249663177531351, + "grad_norm": 0.7306863069534302, + "learning_rate": 5.877320263518615e-07, + "loss": 0.1976, + "step": 8925 + }, + { + "epoch": 0.9250699554357965, + "grad_norm": 0.7554032206535339, + "learning_rate": 5.861175499157945e-07, + "loss": 0.2146, + "step": 8926 + }, + { + "epoch": 0.9251735931184579, + "grad_norm": 0.5923937559127808, + "learning_rate": 5.845052610058232e-07, + "loss": 0.1641, + "step": 8927 + }, + { + "epoch": 0.9252772308011192, + "grad_norm": 0.6130524277687073, + "learning_rate": 5.828951598036137e-07, + "loss": 0.1876, + "step": 8928 + }, + { + "epoch": 0.9253808684837806, + "grad_norm": 0.6826179027557373, + "learning_rate": 5.812872464905984e-07, + "loss": 0.1976, + "step": 8929 + }, + { + "epoch": 0.9254845061664421, + "grad_norm": 0.7306415438652039, + "learning_rate": 5.796815212479434e-07, + "loss": 0.2122, + "step": 8930 + }, + { + "epoch": 0.9255881438491035, + "grad_norm": 0.7335067391395569, + "learning_rate": 5.780779842565887e-07, + "loss": 0.1904, + "step": 8931 + }, + { + "epoch": 0.9256917815317649, + "grad_norm": 0.7055259943008423, + "learning_rate": 5.764766356972163e-07, + "loss": 0.2108, + "step": 8932 + }, + { + "epoch": 0.9257954192144263, + "grad_norm": 0.812872052192688, + "learning_rate": 5.748774757502573e-07, + "loss": 0.2261, + "step": 8933 + }, + { + "epoch": 0.9258990568970877, + "grad_norm": 0.5800080299377441, + "learning_rate": 5.732805045959122e-07, + "loss": 0.1649, + "step": 8934 + }, + { + "epoch": 0.9260026945797492, + "grad_norm": 0.7482984066009521, + "learning_rate": 5.71685722414117e-07, + "loss": 0.2178, + "step": 8935 + }, + { + "epoch": 0.9261063322624106, + "grad_norm": 0.6158731579780579, + "learning_rate": 5.700931293845746e-07, + "loss": 0.1653, + "step": 8936 + }, + { + "epoch": 0.926209969945072, + "grad_norm": 0.6006136536598206, + "learning_rate": 5.685027256867326e-07, + "loss": 0.158, + "step": 8937 + }, + { + "epoch": 0.9263136076277334, + "grad_norm": 0.6906245946884155, + "learning_rate": 5.669145114997987e-07, + "loss": 0.1747, + "step": 8938 + }, + { + "epoch": 0.9264172453103948, + "grad_norm": 0.7390557527542114, + "learning_rate": 5.653284870027276e-07, + "loss": 0.2019, + "step": 8939 + }, + { + "epoch": 0.9265208829930562, + "grad_norm": 0.7188860774040222, + "learning_rate": 5.637446523742274e-07, + "loss": 0.2142, + "step": 8940 + }, + { + "epoch": 0.9266245206757177, + "grad_norm": 0.44092702865600586, + "learning_rate": 5.621630077927709e-07, + "loss": 0.1086, + "step": 8941 + }, + { + "epoch": 0.9267281583583791, + "grad_norm": 0.5864990949630737, + "learning_rate": 5.605835534365644e-07, + "loss": 0.1586, + "step": 8942 + }, + { + "epoch": 0.9268317960410405, + "grad_norm": 0.6084216237068176, + "learning_rate": 5.590062894835857e-07, + "loss": 0.1601, + "step": 8943 + }, + { + "epoch": 0.9269354337237019, + "grad_norm": 0.7542744278907776, + "learning_rate": 5.574312161115591e-07, + "loss": 0.1936, + "step": 8944 + }, + { + "epoch": 0.9270390714063633, + "grad_norm": 0.6386050581932068, + "learning_rate": 5.558583334979584e-07, + "loss": 0.1715, + "step": 8945 + }, + { + "epoch": 0.9271427090890247, + "grad_norm": 0.7549616694450378, + "learning_rate": 5.542876418200149e-07, + "loss": 0.2269, + "step": 8946 + }, + { + "epoch": 0.9272463467716862, + "grad_norm": 0.6269014477729797, + "learning_rate": 5.527191412547095e-07, + "loss": 0.1779, + "step": 8947 + }, + { + "epoch": 0.9273499844543476, + "grad_norm": 0.6783638000488281, + "learning_rate": 5.511528319787784e-07, + "loss": 0.1994, + "step": 8948 + }, + { + "epoch": 0.927453622137009, + "grad_norm": 0.6666401028633118, + "learning_rate": 5.49588714168714e-07, + "loss": 0.1861, + "step": 8949 + }, + { + "epoch": 0.9275572598196704, + "grad_norm": 0.6111328601837158, + "learning_rate": 5.480267880007572e-07, + "loss": 0.1851, + "step": 8950 + }, + { + "epoch": 0.9276608975023318, + "grad_norm": 0.6061242818832397, + "learning_rate": 5.464670536509031e-07, + "loss": 0.1717, + "step": 8951 + }, + { + "epoch": 0.9277645351849932, + "grad_norm": 0.6167566180229187, + "learning_rate": 5.449095112949021e-07, + "loss": 0.1655, + "step": 8952 + }, + { + "epoch": 0.9278681728676547, + "grad_norm": 0.6780339479446411, + "learning_rate": 5.433541611082493e-07, + "loss": 0.2067, + "step": 8953 + }, + { + "epoch": 0.9279718105503161, + "grad_norm": 0.6359187960624695, + "learning_rate": 5.418010032662091e-07, + "loss": 0.1675, + "step": 8954 + }, + { + "epoch": 0.9280754482329775, + "grad_norm": 0.7773694396018982, + "learning_rate": 5.402500379437792e-07, + "loss": 0.2221, + "step": 8955 + }, + { + "epoch": 0.9281790859156389, + "grad_norm": 0.5864004492759705, + "learning_rate": 5.387012653157264e-07, + "loss": 0.163, + "step": 8956 + }, + { + "epoch": 0.9282827235983003, + "grad_norm": 0.7232480645179749, + "learning_rate": 5.371546855565601e-07, + "loss": 0.1983, + "step": 8957 + }, + { + "epoch": 0.9283863612809617, + "grad_norm": 0.8004658818244934, + "learning_rate": 5.356102988405498e-07, + "loss": 0.2239, + "step": 8958 + }, + { + "epoch": 0.9284899989636232, + "grad_norm": 0.7216724753379822, + "learning_rate": 5.340681053417141e-07, + "loss": 0.1964, + "step": 8959 + }, + { + "epoch": 0.9285936366462846, + "grad_norm": 0.7647455334663391, + "learning_rate": 5.325281052338227e-07, + "loss": 0.2242, + "step": 8960 + }, + { + "epoch": 0.928697274328946, + "grad_norm": 0.5940206050872803, + "learning_rate": 5.309902986904015e-07, + "loss": 0.1849, + "step": 8961 + }, + { + "epoch": 0.9288009120116074, + "grad_norm": 0.6256440281867981, + "learning_rate": 5.294546858847271e-07, + "loss": 0.17, + "step": 8962 + }, + { + "epoch": 0.9289045496942688, + "grad_norm": 0.7074966430664062, + "learning_rate": 5.279212669898326e-07, + "loss": 0.197, + "step": 8963 + }, + { + "epoch": 0.9290081873769303, + "grad_norm": 0.6957035660743713, + "learning_rate": 5.263900421785017e-07, + "loss": 0.2053, + "step": 8964 + }, + { + "epoch": 0.9291118250595917, + "grad_norm": 0.5913291573524475, + "learning_rate": 5.248610116232633e-07, + "loss": 0.1513, + "step": 8965 + }, + { + "epoch": 0.9292154627422531, + "grad_norm": 0.6100219488143921, + "learning_rate": 5.233341754964172e-07, + "loss": 0.1573, + "step": 8966 + }, + { + "epoch": 0.9293191004249145, + "grad_norm": 0.7508609294891357, + "learning_rate": 5.218095339699947e-07, + "loss": 0.2029, + "step": 8967 + }, + { + "epoch": 0.9294227381075759, + "grad_norm": 0.57771235704422, + "learning_rate": 5.202870872157939e-07, + "loss": 0.1745, + "step": 8968 + }, + { + "epoch": 0.9295263757902373, + "grad_norm": 0.829882800579071, + "learning_rate": 5.187668354053666e-07, + "loss": 0.2003, + "step": 8969 + }, + { + "epoch": 0.9296300134728988, + "grad_norm": 0.6365578770637512, + "learning_rate": 5.172487787100066e-07, + "loss": 0.1669, + "step": 8970 + }, + { + "epoch": 0.9297336511555602, + "grad_norm": 0.6566683053970337, + "learning_rate": 5.157329173007663e-07, + "loss": 0.1639, + "step": 8971 + }, + { + "epoch": 0.9298372888382216, + "grad_norm": 0.7249619364738464, + "learning_rate": 5.142192513484534e-07, + "loss": 0.1855, + "step": 8972 + }, + { + "epoch": 0.929940926520883, + "grad_norm": 0.7078553438186646, + "learning_rate": 5.127077810236225e-07, + "loss": 0.1881, + "step": 8973 + }, + { + "epoch": 0.9300445642035444, + "grad_norm": 0.5799292325973511, + "learning_rate": 5.111985064965864e-07, + "loss": 0.1479, + "step": 8974 + }, + { + "epoch": 0.9301482018862058, + "grad_norm": 0.6730943918228149, + "learning_rate": 5.096914279374066e-07, + "loss": 0.1947, + "step": 8975 + }, + { + "epoch": 0.9302518395688673, + "grad_norm": 0.5914413928985596, + "learning_rate": 5.081865455158985e-07, + "loss": 0.1675, + "step": 8976 + }, + { + "epoch": 0.9303554772515287, + "grad_norm": 0.6787246465682983, + "learning_rate": 5.06683859401631e-07, + "loss": 0.204, + "step": 8977 + }, + { + "epoch": 0.9304591149341901, + "grad_norm": 0.6746557950973511, + "learning_rate": 5.051833697639197e-07, + "loss": 0.1957, + "step": 8978 + }, + { + "epoch": 0.9305627526168515, + "grad_norm": 0.6631671786308289, + "learning_rate": 5.036850767718448e-07, + "loss": 0.1639, + "step": 8979 + }, + { + "epoch": 0.9306663902995129, + "grad_norm": 0.6939098238945007, + "learning_rate": 5.021889805942248e-07, + "loss": 0.185, + "step": 8980 + }, + { + "epoch": 0.9307700279821743, + "grad_norm": 0.80235755443573, + "learning_rate": 5.006950813996403e-07, + "loss": 0.219, + "step": 8981 + }, + { + "epoch": 0.9308736656648358, + "grad_norm": 0.6713320016860962, + "learning_rate": 4.992033793564255e-07, + "loss": 0.1947, + "step": 8982 + }, + { + "epoch": 0.9309773033474972, + "grad_norm": 0.6114919781684875, + "learning_rate": 4.977138746326571e-07, + "loss": 0.1707, + "step": 8983 + }, + { + "epoch": 0.9310809410301586, + "grad_norm": 0.6712514162063599, + "learning_rate": 4.962265673961742e-07, + "loss": 0.1758, + "step": 8984 + }, + { + "epoch": 0.93118457871282, + "grad_norm": 0.6477010250091553, + "learning_rate": 4.947414578145604e-07, + "loss": 0.1921, + "step": 8985 + }, + { + "epoch": 0.9312882163954814, + "grad_norm": 0.6463591456413269, + "learning_rate": 4.932585460551576e-07, + "loss": 0.1665, + "step": 8986 + }, + { + "epoch": 0.9313918540781428, + "grad_norm": 0.6120611429214478, + "learning_rate": 4.917778322850586e-07, + "loss": 0.1445, + "step": 8987 + }, + { + "epoch": 0.9314954917608043, + "grad_norm": 0.6259697675704956, + "learning_rate": 4.902993166711056e-07, + "loss": 0.1939, + "step": 8988 + }, + { + "epoch": 0.9315991294434657, + "grad_norm": 0.6382227540016174, + "learning_rate": 4.888229993799009e-07, + "loss": 0.1686, + "step": 8989 + }, + { + "epoch": 0.9317027671261271, + "grad_norm": 0.688135027885437, + "learning_rate": 4.873488805777893e-07, + "loss": 0.1923, + "step": 8990 + }, + { + "epoch": 0.9318064048087885, + "grad_norm": 0.7672633528709412, + "learning_rate": 4.858769604308689e-07, + "loss": 0.2028, + "step": 8991 + }, + { + "epoch": 0.9319100424914499, + "grad_norm": 0.735131561756134, + "learning_rate": 4.844072391050003e-07, + "loss": 0.2298, + "step": 8992 + }, + { + "epoch": 0.9320136801741113, + "grad_norm": 0.6174731254577637, + "learning_rate": 4.829397167657867e-07, + "loss": 0.1749, + "step": 8993 + }, + { + "epoch": 0.9321173178567728, + "grad_norm": 0.6736615300178528, + "learning_rate": 4.814743935785848e-07, + "loss": 0.1787, + "step": 8994 + }, + { + "epoch": 0.9322209555394342, + "grad_norm": 0.6681990623474121, + "learning_rate": 4.800112697085068e-07, + "loss": 0.1808, + "step": 8995 + }, + { + "epoch": 0.9323245932220956, + "grad_norm": 0.7692642211914062, + "learning_rate": 4.785503453204143e-07, + "loss": 0.1991, + "step": 8996 + }, + { + "epoch": 0.932428230904757, + "grad_norm": 0.5919527411460876, + "learning_rate": 4.770916205789222e-07, + "loss": 0.1672, + "step": 8997 + }, + { + "epoch": 0.9325318685874184, + "grad_norm": 0.621511697769165, + "learning_rate": 4.756350956483968e-07, + "loss": 0.1641, + "step": 8998 + }, + { + "epoch": 0.9326355062700799, + "grad_norm": 0.6952032446861267, + "learning_rate": 4.74180770692958e-07, + "loss": 0.1866, + "step": 8999 + }, + { + "epoch": 0.9327391439527413, + "grad_norm": 0.576474130153656, + "learning_rate": 4.727286458764768e-07, + "loss": 0.1555, + "step": 9000 + }, + { + "epoch": 0.9328427816354027, + "grad_norm": 0.8230246901512146, + "learning_rate": 4.712787213625758e-07, + "loss": 0.2268, + "step": 9001 + }, + { + "epoch": 0.9329464193180641, + "grad_norm": 0.630755603313446, + "learning_rate": 4.698309973146309e-07, + "loss": 0.158, + "step": 9002 + }, + { + "epoch": 0.9330500570007255, + "grad_norm": 0.7077135443687439, + "learning_rate": 4.683854738957694e-07, + "loss": 0.2077, + "step": 9003 + }, + { + "epoch": 0.9331536946833868, + "grad_norm": 0.7478125095367432, + "learning_rate": 4.669421512688699e-07, + "loss": 0.2247, + "step": 9004 + }, + { + "epoch": 0.9332573323660482, + "grad_norm": 0.6382695436477661, + "learning_rate": 4.655010295965623e-07, + "loss": 0.1805, + "step": 9005 + }, + { + "epoch": 0.9333609700487097, + "grad_norm": 0.6977059245109558, + "learning_rate": 4.6406210904123226e-07, + "loss": 0.1851, + "step": 9006 + }, + { + "epoch": 0.9334646077313711, + "grad_norm": 0.7372855544090271, + "learning_rate": 4.626253897650146e-07, + "loss": 0.1846, + "step": 9007 + }, + { + "epoch": 0.9335682454140325, + "grad_norm": 0.5672875046730042, + "learning_rate": 4.611908719297997e-07, + "loss": 0.1489, + "step": 9008 + }, + { + "epoch": 0.9336718830966939, + "grad_norm": 0.7301895022392273, + "learning_rate": 4.597585556972206e-07, + "loss": 0.206, + "step": 9009 + }, + { + "epoch": 0.9337755207793553, + "grad_norm": 0.7008638978004456, + "learning_rate": 4.583284412286726e-07, + "loss": 0.2102, + "step": 9010 + }, + { + "epoch": 0.9338791584620167, + "grad_norm": 0.5648622512817383, + "learning_rate": 4.5690052868529564e-07, + "loss": 0.1679, + "step": 9011 + }, + { + "epoch": 0.9339827961446782, + "grad_norm": 0.6398313045501709, + "learning_rate": 4.5547481822799e-07, + "loss": 0.1742, + "step": 9012 + }, + { + "epoch": 0.9340864338273396, + "grad_norm": 0.6302163600921631, + "learning_rate": 4.54051310017396e-07, + "loss": 0.1678, + "step": 9013 + }, + { + "epoch": 0.934190071510001, + "grad_norm": 0.7005049586296082, + "learning_rate": 4.5263000421391866e-07, + "loss": 0.2095, + "step": 9014 + }, + { + "epoch": 0.9342937091926624, + "grad_norm": 0.7499439716339111, + "learning_rate": 4.5121090097770547e-07, + "loss": 0.2092, + "step": 9015 + }, + { + "epoch": 0.9343973468753238, + "grad_norm": 0.7579978108406067, + "learning_rate": 4.497940004686552e-07, + "loss": 0.2138, + "step": 9016 + }, + { + "epoch": 0.9345009845579852, + "grad_norm": 0.6810645461082458, + "learning_rate": 4.4837930284642896e-07, + "loss": 0.1777, + "step": 9017 + }, + { + "epoch": 0.9346046222406467, + "grad_norm": 0.6928239464759827, + "learning_rate": 4.4696680827042813e-07, + "loss": 0.1789, + "step": 9018 + }, + { + "epoch": 0.9347082599233081, + "grad_norm": 0.7871753573417664, + "learning_rate": 4.4555651689981214e-07, + "loss": 0.2459, + "step": 9019 + }, + { + "epoch": 0.9348118976059695, + "grad_norm": 0.6925694942474365, + "learning_rate": 4.441484288934872e-07, + "loss": 0.1903, + "step": 9020 + }, + { + "epoch": 0.9349155352886309, + "grad_norm": 0.6814939975738525, + "learning_rate": 4.427425444101219e-07, + "loss": 0.1729, + "step": 9021 + }, + { + "epoch": 0.9350191729712923, + "grad_norm": 0.7239201068878174, + "learning_rate": 4.413388636081206e-07, + "loss": 0.1968, + "step": 9022 + }, + { + "epoch": 0.9351228106539538, + "grad_norm": 0.6216186285018921, + "learning_rate": 4.3993738664565245e-07, + "loss": 0.1728, + "step": 9023 + }, + { + "epoch": 0.9352264483366152, + "grad_norm": 0.7584130764007568, + "learning_rate": 4.3853811368063326e-07, + "loss": 0.2498, + "step": 9024 + }, + { + "epoch": 0.9353300860192766, + "grad_norm": 0.671943187713623, + "learning_rate": 4.3714104487073027e-07, + "loss": 0.1835, + "step": 9025 + }, + { + "epoch": 0.935433723701938, + "grad_norm": 0.7655714154243469, + "learning_rate": 4.3574618037336427e-07, + "loss": 0.2028, + "step": 9026 + }, + { + "epoch": 0.9355373613845994, + "grad_norm": 0.7326068878173828, + "learning_rate": 4.34353520345705e-07, + "loss": 0.2027, + "step": 9027 + }, + { + "epoch": 0.9356409990672608, + "grad_norm": 0.6733595132827759, + "learning_rate": 4.3296306494467587e-07, + "loss": 0.1714, + "step": 9028 + }, + { + "epoch": 0.9357446367499223, + "grad_norm": 0.654135525226593, + "learning_rate": 4.3157481432694936e-07, + "loss": 0.1662, + "step": 9029 + }, + { + "epoch": 0.9358482744325837, + "grad_norm": 0.7005829811096191, + "learning_rate": 4.3018876864895365e-07, + "loss": 0.2, + "step": 9030 + }, + { + "epoch": 0.9359519121152451, + "grad_norm": 0.7137699127197266, + "learning_rate": 4.28804928066866e-07, + "loss": 0.2008, + "step": 9031 + }, + { + "epoch": 0.9360555497979065, + "grad_norm": 0.6908015012741089, + "learning_rate": 4.274232927366151e-07, + "loss": 0.2008, + "step": 9032 + }, + { + "epoch": 0.9361591874805679, + "grad_norm": 0.7445901036262512, + "learning_rate": 4.26043862813883e-07, + "loss": 0.2115, + "step": 9033 + }, + { + "epoch": 0.9362628251632293, + "grad_norm": 0.6784663796424866, + "learning_rate": 4.2466663845409874e-07, + "loss": 0.2025, + "step": 9034 + }, + { + "epoch": 0.9363664628458908, + "grad_norm": 0.8212520480155945, + "learning_rate": 4.2329161981244703e-07, + "loss": 0.2295, + "step": 9035 + }, + { + "epoch": 0.9364701005285522, + "grad_norm": 0.665834903717041, + "learning_rate": 4.2191880704386177e-07, + "loss": 0.1906, + "step": 9036 + }, + { + "epoch": 0.9365737382112136, + "grad_norm": 0.5903333425521851, + "learning_rate": 4.205482003030326e-07, + "loss": 0.1417, + "step": 9037 + }, + { + "epoch": 0.936677375893875, + "grad_norm": 0.7341218590736389, + "learning_rate": 4.191797997443936e-07, + "loss": 0.1764, + "step": 9038 + }, + { + "epoch": 0.9367810135765364, + "grad_norm": 0.6420240998268127, + "learning_rate": 4.178136055221371e-07, + "loss": 0.1736, + "step": 9039 + }, + { + "epoch": 0.9368846512591978, + "grad_norm": 0.6921041011810303, + "learning_rate": 4.1644961779020444e-07, + "loss": 0.1772, + "step": 9040 + }, + { + "epoch": 0.9369882889418593, + "grad_norm": 0.7865322232246399, + "learning_rate": 4.1508783670228145e-07, + "loss": 0.2269, + "step": 9041 + }, + { + "epoch": 0.9370919266245207, + "grad_norm": 0.6674950122833252, + "learning_rate": 4.137282624118188e-07, + "loss": 0.2126, + "step": 9042 + }, + { + "epoch": 0.9371955643071821, + "grad_norm": 0.6780579686164856, + "learning_rate": 4.123708950720073e-07, + "loss": 0.1804, + "step": 9043 + }, + { + "epoch": 0.9372992019898435, + "grad_norm": 0.7245308756828308, + "learning_rate": 4.110157348357935e-07, + "loss": 0.1784, + "step": 9044 + }, + { + "epoch": 0.9374028396725049, + "grad_norm": 0.8370404839515686, + "learning_rate": 4.0966278185587296e-07, + "loss": 0.2782, + "step": 9045 + }, + { + "epoch": 0.9375064773551663, + "grad_norm": 0.5746482014656067, + "learning_rate": 4.0831203628469927e-07, + "loss": 0.1499, + "step": 9046 + }, + { + "epoch": 0.9376101150378278, + "grad_norm": 0.5932947397232056, + "learning_rate": 4.069634982744708e-07, + "loss": 0.1518, + "step": 9047 + }, + { + "epoch": 0.9377137527204892, + "grad_norm": 0.6348161697387695, + "learning_rate": 4.056171679771326e-07, + "loss": 0.161, + "step": 9048 + }, + { + "epoch": 0.9378173904031506, + "grad_norm": 0.6963269114494324, + "learning_rate": 4.0427304554439664e-07, + "loss": 0.1708, + "step": 9049 + }, + { + "epoch": 0.937921028085812, + "grad_norm": 0.7186452746391296, + "learning_rate": 4.029311311277084e-07, + "loss": 0.1884, + "step": 9050 + }, + { + "epoch": 0.9380246657684734, + "grad_norm": 0.7195757627487183, + "learning_rate": 4.01591424878276e-07, + "loss": 0.1871, + "step": 9051 + }, + { + "epoch": 0.9381283034511348, + "grad_norm": 0.7170066237449646, + "learning_rate": 4.0025392694705843e-07, + "loss": 0.1975, + "step": 9052 + }, + { + "epoch": 0.9382319411337963, + "grad_norm": 0.7198824286460876, + "learning_rate": 3.9891863748475754e-07, + "loss": 0.188, + "step": 9053 + }, + { + "epoch": 0.9383355788164577, + "grad_norm": 0.7737149000167847, + "learning_rate": 3.9758555664183517e-07, + "loss": 0.2208, + "step": 9054 + }, + { + "epoch": 0.9384392164991191, + "grad_norm": 0.7000073790550232, + "learning_rate": 3.9625468456850005e-07, + "loss": 0.1748, + "step": 9055 + }, + { + "epoch": 0.9385428541817805, + "grad_norm": 0.678845226764679, + "learning_rate": 3.9492602141470995e-07, + "loss": 0.1802, + "step": 9056 + }, + { + "epoch": 0.9386464918644419, + "grad_norm": 0.6686621904373169, + "learning_rate": 3.935995673301829e-07, + "loss": 0.1957, + "step": 9057 + }, + { + "epoch": 0.9387501295471034, + "grad_norm": 0.6434568166732788, + "learning_rate": 3.9227532246437495e-07, + "loss": 0.1681, + "step": 9058 + }, + { + "epoch": 0.9388537672297648, + "grad_norm": 0.8343382477760315, + "learning_rate": 3.9095328696650446e-07, + "loss": 0.2164, + "step": 9059 + }, + { + "epoch": 0.9389574049124262, + "grad_norm": 0.7999582886695862, + "learning_rate": 3.8963346098553457e-07, + "loss": 0.2098, + "step": 9060 + }, + { + "epoch": 0.9390610425950876, + "grad_norm": 0.7141200304031372, + "learning_rate": 3.883158446701796e-07, + "loss": 0.1948, + "step": 9061 + }, + { + "epoch": 0.939164680277749, + "grad_norm": 0.7116166353225708, + "learning_rate": 3.8700043816890966e-07, + "loss": 0.1903, + "step": 9062 + }, + { + "epoch": 0.9392683179604104, + "grad_norm": 0.6133732199668884, + "learning_rate": 3.8568724162994174e-07, + "loss": 0.1553, + "step": 9063 + }, + { + "epoch": 0.9393719556430719, + "grad_norm": 0.6451056003570557, + "learning_rate": 3.843762552012442e-07, + "loss": 0.1747, + "step": 9064 + }, + { + "epoch": 0.9394755933257333, + "grad_norm": 0.5624836087226868, + "learning_rate": 3.8306747903053666e-07, + "loss": 0.1739, + "step": 9065 + }, + { + "epoch": 0.9395792310083947, + "grad_norm": 0.6717318296432495, + "learning_rate": 3.8176091326528995e-07, + "loss": 0.1789, + "step": 9066 + }, + { + "epoch": 0.9396828686910561, + "grad_norm": 0.5935096740722656, + "learning_rate": 3.804565580527286e-07, + "loss": 0.1773, + "step": 9067 + }, + { + "epoch": 0.9397865063737175, + "grad_norm": 0.6754716038703918, + "learning_rate": 3.7915441353982174e-07, + "loss": 0.1854, + "step": 9068 + }, + { + "epoch": 0.9398901440563789, + "grad_norm": 0.6990920305252075, + "learning_rate": 3.7785447987329415e-07, + "loss": 0.1883, + "step": 9069 + }, + { + "epoch": 0.9399937817390404, + "grad_norm": 0.7798400521278381, + "learning_rate": 3.765567571996198e-07, + "loss": 0.2196, + "step": 9070 + }, + { + "epoch": 0.9400974194217018, + "grad_norm": 0.7619076371192932, + "learning_rate": 3.752612456650262e-07, + "loss": 0.2451, + "step": 9071 + }, + { + "epoch": 0.9402010571043632, + "grad_norm": 0.6839298009872437, + "learning_rate": 3.7396794541548985e-07, + "loss": 0.1684, + "step": 9072 + }, + { + "epoch": 0.9403046947870246, + "grad_norm": 0.7317507863044739, + "learning_rate": 3.72676856596732e-07, + "loss": 0.188, + "step": 9073 + }, + { + "epoch": 0.940408332469686, + "grad_norm": 0.7622932195663452, + "learning_rate": 3.713879793542385e-07, + "loss": 0.2338, + "step": 9074 + }, + { + "epoch": 0.9405119701523474, + "grad_norm": 0.5636216998100281, + "learning_rate": 3.70101313833231e-07, + "loss": 0.1524, + "step": 9075 + }, + { + "epoch": 0.9406156078350089, + "grad_norm": 0.6244856715202332, + "learning_rate": 3.688168601786912e-07, + "loss": 0.1589, + "step": 9076 + }, + { + "epoch": 0.9407192455176703, + "grad_norm": 0.7944729328155518, + "learning_rate": 3.6753461853535455e-07, + "loss": 0.1947, + "step": 9077 + }, + { + "epoch": 0.9408228832003317, + "grad_norm": 0.7152156829833984, + "learning_rate": 3.662545890476965e-07, + "loss": 0.205, + "step": 9078 + }, + { + "epoch": 0.9409265208829931, + "grad_norm": 0.7420468330383301, + "learning_rate": 3.6497677185995064e-07, + "loss": 0.1886, + "step": 9079 + }, + { + "epoch": 0.9410301585656544, + "grad_norm": 0.6091653108596802, + "learning_rate": 3.6370116711609725e-07, + "loss": 0.1668, + "step": 9080 + }, + { + "epoch": 0.9411337962483158, + "grad_norm": 0.7298439741134644, + "learning_rate": 3.6242777495987036e-07, + "loss": 0.1966, + "step": 9081 + }, + { + "epoch": 0.9412374339309773, + "grad_norm": 0.6698196530342102, + "learning_rate": 3.6115659553475733e-07, + "loss": 0.1659, + "step": 9082 + }, + { + "epoch": 0.9413410716136387, + "grad_norm": 0.7139626741409302, + "learning_rate": 3.598876289839881e-07, + "loss": 0.1842, + "step": 9083 + }, + { + "epoch": 0.9414447092963001, + "grad_norm": 0.6268137097358704, + "learning_rate": 3.5862087545055267e-07, + "loss": 0.1794, + "step": 9084 + }, + { + "epoch": 0.9415483469789615, + "grad_norm": 0.7238689064979553, + "learning_rate": 3.5735633507717917e-07, + "loss": 0.1939, + "step": 9085 + }, + { + "epoch": 0.9416519846616229, + "grad_norm": 0.6313364505767822, + "learning_rate": 3.560940080063602e-07, + "loss": 0.1668, + "step": 9086 + }, + { + "epoch": 0.9417556223442843, + "grad_norm": 0.7056628465652466, + "learning_rate": 3.548338943803331e-07, + "loss": 0.1791, + "step": 9087 + }, + { + "epoch": 0.9418592600269458, + "grad_norm": 0.6397473216056824, + "learning_rate": 3.5357599434108216e-07, + "loss": 0.1679, + "step": 9088 + }, + { + "epoch": 0.9419628977096072, + "grad_norm": 0.6657278537750244, + "learning_rate": 3.523203080303472e-07, + "loss": 0.1768, + "step": 9089 + }, + { + "epoch": 0.9420665353922686, + "grad_norm": 0.7117383480072021, + "learning_rate": 3.510668355896196e-07, + "loss": 0.1996, + "step": 9090 + }, + { + "epoch": 0.94217017307493, + "grad_norm": 0.6991865038871765, + "learning_rate": 3.4981557716013305e-07, + "loss": 0.1769, + "step": 9091 + }, + { + "epoch": 0.9422738107575914, + "grad_norm": 0.658190131187439, + "learning_rate": 3.4856653288288353e-07, + "loss": 0.1916, + "step": 9092 + }, + { + "epoch": 0.9423774484402528, + "grad_norm": 0.7647689580917358, + "learning_rate": 3.473197028986053e-07, + "loss": 0.1954, + "step": 9093 + }, + { + "epoch": 0.9424810861229143, + "grad_norm": 0.8443564176559448, + "learning_rate": 3.460750873477925e-07, + "loss": 0.1755, + "step": 9094 + }, + { + "epoch": 0.9425847238055757, + "grad_norm": 0.6932904720306396, + "learning_rate": 3.4483268637068634e-07, + "loss": 0.1827, + "step": 9095 + }, + { + "epoch": 0.9426883614882371, + "grad_norm": 0.7962846755981445, + "learning_rate": 3.435925001072815e-07, + "loss": 0.2141, + "step": 9096 + }, + { + "epoch": 0.9427919991708985, + "grad_norm": 0.6723485589027405, + "learning_rate": 3.423545286973151e-07, + "loss": 0.1673, + "step": 9097 + }, + { + "epoch": 0.9428956368535599, + "grad_norm": 0.6464245319366455, + "learning_rate": 3.4111877228028e-07, + "loss": 0.1705, + "step": 9098 + }, + { + "epoch": 0.9429992745362213, + "grad_norm": 0.6785967350006104, + "learning_rate": 3.398852309954248e-07, + "loss": 0.1905, + "step": 9099 + }, + { + "epoch": 0.9431029122188828, + "grad_norm": 0.680761456489563, + "learning_rate": 3.3865390498173835e-07, + "loss": 0.1936, + "step": 9100 + }, + { + "epoch": 0.9432065499015442, + "grad_norm": 0.7931050658226013, + "learning_rate": 3.374247943779629e-07, + "loss": 0.1745, + "step": 9101 + }, + { + "epoch": 0.9433101875842056, + "grad_norm": 0.722449004650116, + "learning_rate": 3.3619789932259896e-07, + "loss": 0.1806, + "step": 9102 + }, + { + "epoch": 0.943413825266867, + "grad_norm": 0.7283592820167542, + "learning_rate": 3.349732199538891e-07, + "loss": 0.1982, + "step": 9103 + }, + { + "epoch": 0.9435174629495284, + "grad_norm": 0.5951731204986572, + "learning_rate": 3.337507564098252e-07, + "loss": 0.1577, + "step": 9104 + }, + { + "epoch": 0.9436211006321898, + "grad_norm": 0.7373857498168945, + "learning_rate": 3.3253050882815276e-07, + "loss": 0.1905, + "step": 9105 + }, + { + "epoch": 0.9437247383148513, + "grad_norm": 0.665081262588501, + "learning_rate": 3.313124773463683e-07, + "loss": 0.202, + "step": 9106 + }, + { + "epoch": 0.9438283759975127, + "grad_norm": 0.6457472443580627, + "learning_rate": 3.3009666210171985e-07, + "loss": 0.1814, + "step": 9107 + }, + { + "epoch": 0.9439320136801741, + "grad_norm": 0.740166425704956, + "learning_rate": 3.288830632312023e-07, + "loss": 0.2053, + "step": 9108 + }, + { + "epoch": 0.9440356513628355, + "grad_norm": 0.6799501776695251, + "learning_rate": 3.276716808715619e-07, + "loss": 0.223, + "step": 9109 + }, + { + "epoch": 0.9441392890454969, + "grad_norm": 0.7502972483634949, + "learning_rate": 3.2646251515929597e-07, + "loss": 0.205, + "step": 9110 + }, + { + "epoch": 0.9442429267281583, + "grad_norm": 0.7048971652984619, + "learning_rate": 3.252555662306489e-07, + "loss": 0.1915, + "step": 9111 + }, + { + "epoch": 0.9443465644108198, + "grad_norm": 0.6879097819328308, + "learning_rate": 3.240508342216209e-07, + "loss": 0.194, + "step": 9112 + }, + { + "epoch": 0.9444502020934812, + "grad_norm": 0.6178298592567444, + "learning_rate": 3.2284831926795877e-07, + "loss": 0.1754, + "step": 9113 + }, + { + "epoch": 0.9445538397761426, + "grad_norm": 0.7785412669181824, + "learning_rate": 3.2164802150515874e-07, + "loss": 0.2021, + "step": 9114 + }, + { + "epoch": 0.944657477458804, + "grad_norm": 0.7239789366722107, + "learning_rate": 3.204499410684658e-07, + "loss": 0.1903, + "step": 9115 + }, + { + "epoch": 0.9447611151414654, + "grad_norm": 0.7297715544700623, + "learning_rate": 3.1925407809288545e-07, + "loss": 0.2184, + "step": 9116 + }, + { + "epoch": 0.9448647528241269, + "grad_norm": 0.8004382848739624, + "learning_rate": 3.180604327131609e-07, + "loss": 0.2201, + "step": 9117 + }, + { + "epoch": 0.9449683905067883, + "grad_norm": 0.6330424547195435, + "learning_rate": 3.1686900506378904e-07, + "loss": 0.1734, + "step": 9118 + }, + { + "epoch": 0.9450720281894497, + "grad_norm": 0.7204705476760864, + "learning_rate": 3.1567979527902025e-07, + "loss": 0.2118, + "step": 9119 + }, + { + "epoch": 0.9451756658721111, + "grad_norm": 0.6637647747993469, + "learning_rate": 3.144928034928496e-07, + "loss": 0.1751, + "step": 9120 + }, + { + "epoch": 0.9452793035547725, + "grad_norm": 0.6790656447410583, + "learning_rate": 3.133080298390323e-07, + "loss": 0.1743, + "step": 9121 + }, + { + "epoch": 0.9453829412374339, + "grad_norm": 0.6014872193336487, + "learning_rate": 3.121254744510616e-07, + "loss": 0.172, + "step": 9122 + }, + { + "epoch": 0.9454865789200954, + "grad_norm": 0.8160472512245178, + "learning_rate": 3.1094513746218634e-07, + "loss": 0.2306, + "step": 9123 + }, + { + "epoch": 0.9455902166027568, + "grad_norm": 0.7487515807151794, + "learning_rate": 3.0976701900540474e-07, + "loss": 0.216, + "step": 9124 + }, + { + "epoch": 0.9456938542854182, + "grad_norm": 0.7319645285606384, + "learning_rate": 3.085911192134683e-07, + "loss": 0.1976, + "step": 9125 + }, + { + "epoch": 0.9457974919680796, + "grad_norm": 0.6307010054588318, + "learning_rate": 3.0741743821887104e-07, + "loss": 0.1757, + "step": 9126 + }, + { + "epoch": 0.945901129650741, + "grad_norm": 0.5927242636680603, + "learning_rate": 3.062459761538672e-07, + "loss": 0.168, + "step": 9127 + }, + { + "epoch": 0.9460047673334024, + "grad_norm": 0.9647350311279297, + "learning_rate": 3.0507673315045114e-07, + "loss": 0.2184, + "step": 9128 + }, + { + "epoch": 0.9461084050160639, + "grad_norm": 0.8228685855865479, + "learning_rate": 3.039097093403731e-07, + "loss": 0.2137, + "step": 9129 + }, + { + "epoch": 0.9462120426987253, + "grad_norm": 0.7404673099517822, + "learning_rate": 3.0274490485512785e-07, + "loss": 0.2161, + "step": 9130 + }, + { + "epoch": 0.9463156803813867, + "grad_norm": 0.6658329963684082, + "learning_rate": 3.015823198259682e-07, + "loss": 0.1994, + "step": 9131 + }, + { + "epoch": 0.9464193180640481, + "grad_norm": 0.6319071054458618, + "learning_rate": 3.0042195438389156e-07, + "loss": 0.1669, + "step": 9132 + }, + { + "epoch": 0.9465229557467095, + "grad_norm": 0.8151889443397522, + "learning_rate": 2.9926380865964664e-07, + "loss": 0.2278, + "step": 9133 + }, + { + "epoch": 0.9466265934293709, + "grad_norm": 0.652004063129425, + "learning_rate": 2.9810788278372915e-07, + "loss": 0.1911, + "step": 9134 + }, + { + "epoch": 0.9467302311120324, + "grad_norm": 0.6238911747932434, + "learning_rate": 2.969541768863882e-07, + "loss": 0.2012, + "step": 9135 + }, + { + "epoch": 0.9468338687946938, + "grad_norm": 0.6931841969490051, + "learning_rate": 2.9580269109762193e-07, + "loss": 0.1979, + "step": 9136 + }, + { + "epoch": 0.9469375064773552, + "grad_norm": 0.7002904415130615, + "learning_rate": 2.9465342554717777e-07, + "loss": 0.1809, + "step": 9137 + }, + { + "epoch": 0.9470411441600166, + "grad_norm": 0.706741988658905, + "learning_rate": 2.9350638036455216e-07, + "loss": 0.1801, + "step": 9138 + }, + { + "epoch": 0.947144781842678, + "grad_norm": 0.7300030589103699, + "learning_rate": 2.9236155567899493e-07, + "loss": 0.2023, + "step": 9139 + }, + { + "epoch": 0.9472484195253394, + "grad_norm": 0.8258287906646729, + "learning_rate": 2.9121895161949855e-07, + "loss": 0.1999, + "step": 9140 + }, + { + "epoch": 0.9473520572080009, + "grad_norm": 0.6586878299713135, + "learning_rate": 2.900785683148155e-07, + "loss": 0.2066, + "step": 9141 + }, + { + "epoch": 0.9474556948906623, + "grad_norm": 0.5997200012207031, + "learning_rate": 2.8894040589344086e-07, + "loss": 0.17, + "step": 9142 + }, + { + "epoch": 0.9475593325733237, + "grad_norm": 0.6866727471351624, + "learning_rate": 2.8780446448361644e-07, + "loss": 0.226, + "step": 9143 + }, + { + "epoch": 0.9476629702559851, + "grad_norm": 0.6648144721984863, + "learning_rate": 2.8667074421334426e-07, + "loss": 0.1901, + "step": 9144 + }, + { + "epoch": 0.9477666079386465, + "grad_norm": 0.6004384756088257, + "learning_rate": 2.8553924521036446e-07, + "loss": 0.1911, + "step": 9145 + }, + { + "epoch": 0.947870245621308, + "grad_norm": 0.6397583484649658, + "learning_rate": 2.844099676021772e-07, + "loss": 0.2107, + "step": 9146 + }, + { + "epoch": 0.9479738833039694, + "grad_norm": 0.7320306301116943, + "learning_rate": 2.832829115160296e-07, + "loss": 0.1859, + "step": 9147 + }, + { + "epoch": 0.9480775209866308, + "grad_norm": 0.717109203338623, + "learning_rate": 2.8215807707891117e-07, + "loss": 0.2056, + "step": 9148 + }, + { + "epoch": 0.9481811586692922, + "grad_norm": 0.6429559588432312, + "learning_rate": 2.810354644175672e-07, + "loss": 0.1857, + "step": 9149 + }, + { + "epoch": 0.9482847963519536, + "grad_norm": 0.6143239140510559, + "learning_rate": 2.799150736584944e-07, + "loss": 0.1653, + "step": 9150 + }, + { + "epoch": 0.948388434034615, + "grad_norm": 0.6657137870788574, + "learning_rate": 2.787969049279338e-07, + "loss": 0.1894, + "step": 9151 + }, + { + "epoch": 0.9484920717172765, + "grad_norm": 0.7038910984992981, + "learning_rate": 2.776809583518847e-07, + "loss": 0.1777, + "step": 9152 + }, + { + "epoch": 0.9485957093999379, + "grad_norm": 0.7006789445877075, + "learning_rate": 2.7656723405608435e-07, + "loss": 0.1983, + "step": 9153 + }, + { + "epoch": 0.9486993470825993, + "grad_norm": 0.7506330609321594, + "learning_rate": 2.754557321660278e-07, + "loss": 0.2015, + "step": 9154 + }, + { + "epoch": 0.9488029847652607, + "grad_norm": 0.73332279920578, + "learning_rate": 2.7434645280695507e-07, + "loss": 0.1957, + "step": 9155 + }, + { + "epoch": 0.948906622447922, + "grad_norm": 0.6071427464485168, + "learning_rate": 2.732393961038615e-07, + "loss": 0.1673, + "step": 9156 + }, + { + "epoch": 0.9490102601305834, + "grad_norm": 0.5476712584495544, + "learning_rate": 2.7213456218148747e-07, + "loss": 0.1428, + "step": 9157 + }, + { + "epoch": 0.9491138978132448, + "grad_norm": 0.7443254590034485, + "learning_rate": 2.7103195116432e-07, + "loss": 0.1875, + "step": 9158 + }, + { + "epoch": 0.9492175354959063, + "grad_norm": 0.723513126373291, + "learning_rate": 2.699315631766064e-07, + "loss": 0.2015, + "step": 9159 + }, + { + "epoch": 0.9493211731785677, + "grad_norm": 0.6531522274017334, + "learning_rate": 2.6883339834233413e-07, + "loss": 0.1771, + "step": 9160 + }, + { + "epoch": 0.9494248108612291, + "grad_norm": 0.790747880935669, + "learning_rate": 2.6773745678523975e-07, + "loss": 0.2109, + "step": 9161 + }, + { + "epoch": 0.9495284485438905, + "grad_norm": 0.6858984231948853, + "learning_rate": 2.666437386288156e-07, + "loss": 0.1845, + "step": 9162 + }, + { + "epoch": 0.9496320862265519, + "grad_norm": 0.7284145951271057, + "learning_rate": 2.6555224399629654e-07, + "loss": 0.2084, + "step": 9163 + }, + { + "epoch": 0.9497357239092133, + "grad_norm": 0.6425133943557739, + "learning_rate": 2.6446297301067294e-07, + "loss": 0.1689, + "step": 9164 + }, + { + "epoch": 0.9498393615918748, + "grad_norm": 0.6820522546768188, + "learning_rate": 2.633759257946844e-07, + "loss": 0.1882, + "step": 9165 + }, + { + "epoch": 0.9499429992745362, + "grad_norm": 0.68821120262146, + "learning_rate": 2.6229110247081525e-07, + "loss": 0.1756, + "step": 9166 + }, + { + "epoch": 0.9500466369571976, + "grad_norm": 0.7660084962844849, + "learning_rate": 2.61208503161301e-07, + "loss": 0.2169, + "step": 9167 + }, + { + "epoch": 0.950150274639859, + "grad_norm": 0.7064154148101807, + "learning_rate": 2.601281279881285e-07, + "loss": 0.1985, + "step": 9168 + }, + { + "epoch": 0.9502539123225204, + "grad_norm": 0.6654224395751953, + "learning_rate": 2.590499770730293e-07, + "loss": 0.1806, + "step": 9169 + }, + { + "epoch": 0.9503575500051819, + "grad_norm": 0.6362894177436829, + "learning_rate": 2.5797405053749503e-07, + "loss": 0.1939, + "step": 9170 + }, + { + "epoch": 0.9504611876878433, + "grad_norm": 0.5990746021270752, + "learning_rate": 2.5690034850275325e-07, + "loss": 0.1821, + "step": 9171 + }, + { + "epoch": 0.9505648253705047, + "grad_norm": 0.6387087106704712, + "learning_rate": 2.5582887108978937e-07, + "loss": 0.1815, + "step": 9172 + }, + { + "epoch": 0.9506684630531661, + "grad_norm": 0.7194986343383789, + "learning_rate": 2.547596184193357e-07, + "loss": 0.1952, + "step": 9173 + }, + { + "epoch": 0.9507721007358275, + "grad_norm": 0.7512621283531189, + "learning_rate": 2.5369259061187147e-07, + "loss": 0.1851, + "step": 9174 + }, + { + "epoch": 0.9508757384184889, + "grad_norm": 0.6478919386863708, + "learning_rate": 2.5262778778763373e-07, + "loss": 0.2025, + "step": 9175 + }, + { + "epoch": 0.9509793761011504, + "grad_norm": 0.7147660851478577, + "learning_rate": 2.515652100665955e-07, + "loss": 0.188, + "step": 9176 + }, + { + "epoch": 0.9510830137838118, + "grad_norm": 0.6657465696334839, + "learning_rate": 2.5050485756849205e-07, + "loss": 0.2053, + "step": 9177 + }, + { + "epoch": 0.9511866514664732, + "grad_norm": 0.7419307231903076, + "learning_rate": 2.4944673041279896e-07, + "loss": 0.1912, + "step": 9178 + }, + { + "epoch": 0.9512902891491346, + "grad_norm": 0.7255851030349731, + "learning_rate": 2.483908287187453e-07, + "loss": 0.2018, + "step": 9179 + }, + { + "epoch": 0.951393926831796, + "grad_norm": 0.6799176931381226, + "learning_rate": 2.473371526053092e-07, + "loss": 0.2009, + "step": 9180 + }, + { + "epoch": 0.9514975645144574, + "grad_norm": 0.6410163640975952, + "learning_rate": 2.462857021912157e-07, + "loss": 0.1848, + "step": 9181 + }, + { + "epoch": 0.9516012021971189, + "grad_norm": 0.605385422706604, + "learning_rate": 2.452364775949434e-07, + "loss": 0.1717, + "step": 9182 + }, + { + "epoch": 0.9517048398797803, + "grad_norm": 0.6804389357566833, + "learning_rate": 2.441894789347132e-07, + "loss": 0.1711, + "step": 9183 + }, + { + "epoch": 0.9518084775624417, + "grad_norm": 0.6403427720069885, + "learning_rate": 2.4314470632850417e-07, + "loss": 0.1905, + "step": 9184 + }, + { + "epoch": 0.9519121152451031, + "grad_norm": 0.5662232637405396, + "learning_rate": 2.421021598940354e-07, + "loss": 0.1414, + "step": 9185 + }, + { + "epoch": 0.9520157529277645, + "grad_norm": 0.5852274298667908, + "learning_rate": 2.4106183974877963e-07, + "loss": 0.1705, + "step": 9186 + }, + { + "epoch": 0.9521193906104259, + "grad_norm": 0.7288892269134521, + "learning_rate": 2.4002374600996305e-07, + "loss": 0.2011, + "step": 9187 + }, + { + "epoch": 0.9522230282930874, + "grad_norm": 0.6094224452972412, + "learning_rate": 2.3898787879454986e-07, + "loss": 0.1471, + "step": 9188 + }, + { + "epoch": 0.9523266659757488, + "grad_norm": 0.6364356875419617, + "learning_rate": 2.3795423821926457e-07, + "loss": 0.1697, + "step": 9189 + }, + { + "epoch": 0.9524303036584102, + "grad_norm": 0.787897527217865, + "learning_rate": 2.3692282440057613e-07, + "loss": 0.2307, + "step": 9190 + }, + { + "epoch": 0.9525339413410716, + "grad_norm": 0.8060945868492126, + "learning_rate": 2.3589363745470273e-07, + "loss": 0.2175, + "step": 9191 + }, + { + "epoch": 0.952637579023733, + "grad_norm": 0.6362557411193848, + "learning_rate": 2.3486667749760716e-07, + "loss": 0.1504, + "step": 9192 + }, + { + "epoch": 0.9527412167063944, + "grad_norm": 0.6807636618614197, + "learning_rate": 2.338419446450102e-07, + "loss": 0.194, + "step": 9193 + }, + { + "epoch": 0.9528448543890559, + "grad_norm": 0.8208065629005432, + "learning_rate": 2.3281943901237504e-07, + "loss": 0.2368, + "step": 9194 + }, + { + "epoch": 0.9529484920717173, + "grad_norm": 0.5947349667549133, + "learning_rate": 2.3179916071491838e-07, + "loss": 0.1703, + "step": 9195 + }, + { + "epoch": 0.9530521297543787, + "grad_norm": 0.7142931222915649, + "learning_rate": 2.307811098676016e-07, + "loss": 0.2065, + "step": 9196 + }, + { + "epoch": 0.9531557674370401, + "grad_norm": 0.7377387881278992, + "learning_rate": 2.2976528658513743e-07, + "loss": 0.2239, + "step": 9197 + }, + { + "epoch": 0.9532594051197015, + "grad_norm": 0.6919092535972595, + "learning_rate": 2.2875169098198758e-07, + "loss": 0.1961, + "step": 9198 + }, + { + "epoch": 0.953363042802363, + "grad_norm": 0.638052761554718, + "learning_rate": 2.2774032317236073e-07, + "loss": 0.1716, + "step": 9199 + }, + { + "epoch": 0.9534666804850244, + "grad_norm": 0.7168495655059814, + "learning_rate": 2.2673118327021904e-07, + "loss": 0.1944, + "step": 9200 + }, + { + "epoch": 0.9535703181676858, + "grad_norm": 0.7718733549118042, + "learning_rate": 2.2572427138926934e-07, + "loss": 0.2063, + "step": 9201 + }, + { + "epoch": 0.9536739558503472, + "grad_norm": 0.5611803531646729, + "learning_rate": 2.2471958764296974e-07, + "loss": 0.1399, + "step": 9202 + }, + { + "epoch": 0.9537775935330086, + "grad_norm": 0.7074396014213562, + "learning_rate": 2.2371713214452306e-07, + "loss": 0.2244, + "step": 9203 + }, + { + "epoch": 0.95388123121567, + "grad_norm": 0.730533242225647, + "learning_rate": 2.2271690500689003e-07, + "loss": 0.2261, + "step": 9204 + }, + { + "epoch": 0.9539848688983315, + "grad_norm": 0.6675938963890076, + "learning_rate": 2.217189063427716e-07, + "loss": 0.2112, + "step": 9205 + }, + { + "epoch": 0.9540885065809929, + "grad_norm": 0.7420927286148071, + "learning_rate": 2.2072313626461783e-07, + "loss": 0.204, + "step": 9206 + }, + { + "epoch": 0.9541921442636543, + "grad_norm": 0.609803318977356, + "learning_rate": 2.1972959488463674e-07, + "loss": 0.181, + "step": 9207 + }, + { + "epoch": 0.9542957819463157, + "grad_norm": 0.6916093230247498, + "learning_rate": 2.1873828231477433e-07, + "loss": 0.1855, + "step": 9208 + }, + { + "epoch": 0.9543994196289771, + "grad_norm": 0.5898705720901489, + "learning_rate": 2.1774919866673016e-07, + "loss": 0.1625, + "step": 9209 + }, + { + "epoch": 0.9545030573116385, + "grad_norm": 0.6615305542945862, + "learning_rate": 2.167623440519573e-07, + "loss": 0.1782, + "step": 9210 + }, + { + "epoch": 0.9546066949943, + "grad_norm": 0.7574900984764099, + "learning_rate": 2.1577771858164897e-07, + "loss": 0.2053, + "step": 9211 + }, + { + "epoch": 0.9547103326769614, + "grad_norm": 0.7126626968383789, + "learning_rate": 2.1479532236675427e-07, + "loss": 0.1804, + "step": 9212 + }, + { + "epoch": 0.9548139703596228, + "grad_norm": 0.6488138437271118, + "learning_rate": 2.1381515551796239e-07, + "loss": 0.16, + "step": 9213 + }, + { + "epoch": 0.9549176080422842, + "grad_norm": 0.7436590194702148, + "learning_rate": 2.1283721814572057e-07, + "loss": 0.1717, + "step": 9214 + }, + { + "epoch": 0.9550212457249456, + "grad_norm": 0.7559380531311035, + "learning_rate": 2.1186151036022285e-07, + "loss": 0.1938, + "step": 9215 + }, + { + "epoch": 0.955124883407607, + "grad_norm": 0.6089216470718384, + "learning_rate": 2.108880322714102e-07, + "loss": 0.2057, + "step": 9216 + }, + { + "epoch": 0.9552285210902685, + "grad_norm": 0.721402645111084, + "learning_rate": 2.099167839889682e-07, + "loss": 0.2071, + "step": 9217 + }, + { + "epoch": 0.9553321587729299, + "grad_norm": 1.3151590824127197, + "learning_rate": 2.0894776562234043e-07, + "loss": 0.1693, + "step": 9218 + }, + { + "epoch": 0.9554357964555913, + "grad_norm": 0.6875127553939819, + "learning_rate": 2.0798097728071065e-07, + "loss": 0.19, + "step": 9219 + }, + { + "epoch": 0.9555394341382527, + "grad_norm": 0.6574918031692505, + "learning_rate": 2.070164190730206e-07, + "loss": 0.1833, + "step": 9220 + }, + { + "epoch": 0.9556430718209141, + "grad_norm": 0.6254417300224304, + "learning_rate": 2.0605409110794782e-07, + "loss": 0.1849, + "step": 9221 + }, + { + "epoch": 0.9557467095035755, + "grad_norm": 0.850127100944519, + "learning_rate": 2.0509399349393223e-07, + "loss": 0.2251, + "step": 9222 + }, + { + "epoch": 0.955850347186237, + "grad_norm": 0.7196301221847534, + "learning_rate": 2.0413612633915393e-07, + "loss": 0.1976, + "step": 9223 + }, + { + "epoch": 0.9559539848688984, + "grad_norm": 0.824651837348938, + "learning_rate": 2.0318048975154213e-07, + "loss": 0.2216, + "step": 9224 + }, + { + "epoch": 0.9560576225515598, + "grad_norm": 0.8376520276069641, + "learning_rate": 2.0222708383877965e-07, + "loss": 0.2255, + "step": 9225 + }, + { + "epoch": 0.9561612602342212, + "grad_norm": 0.663469672203064, + "learning_rate": 2.0127590870829162e-07, + "loss": 0.1942, + "step": 9226 + }, + { + "epoch": 0.9562648979168826, + "grad_norm": 0.6651266813278198, + "learning_rate": 2.0032696446725897e-07, + "loss": 0.1909, + "step": 9227 + }, + { + "epoch": 0.956368535599544, + "grad_norm": 0.746058464050293, + "learning_rate": 1.9938025122260064e-07, + "loss": 0.2136, + "step": 9228 + }, + { + "epoch": 0.9564721732822055, + "grad_norm": 0.6924718022346497, + "learning_rate": 1.9843576908099792e-07, + "loss": 0.2021, + "step": 9229 + }, + { + "epoch": 0.9565758109648669, + "grad_norm": 0.5991740822792053, + "learning_rate": 1.9749351814887018e-07, + "loss": 0.1677, + "step": 9230 + }, + { + "epoch": 0.9566794486475283, + "grad_norm": 0.6867091655731201, + "learning_rate": 1.9655349853238803e-07, + "loss": 0.1728, + "step": 9231 + }, + { + "epoch": 0.9567830863301896, + "grad_norm": 0.6148860454559326, + "learning_rate": 1.956157103374756e-07, + "loss": 0.1728, + "step": 9232 + }, + { + "epoch": 0.956886724012851, + "grad_norm": 0.5756577253341675, + "learning_rate": 1.9468015366979508e-07, + "loss": 0.1648, + "step": 9233 + }, + { + "epoch": 0.9569903616955124, + "grad_norm": 0.7107852101325989, + "learning_rate": 1.9374682863476658e-07, + "loss": 0.1957, + "step": 9234 + }, + { + "epoch": 0.9570939993781739, + "grad_norm": 0.6376127004623413, + "learning_rate": 1.9281573533755927e-07, + "loss": 0.1805, + "step": 9235 + }, + { + "epoch": 0.9571976370608353, + "grad_norm": 0.680597722530365, + "learning_rate": 1.9188687388308258e-07, + "loss": 0.2029, + "step": 9236 + }, + { + "epoch": 0.9573012747434967, + "grad_norm": 0.6080154180526733, + "learning_rate": 1.9096024437600168e-07, + "loss": 0.1682, + "step": 9237 + }, + { + "epoch": 0.9574049124261581, + "grad_norm": 0.583561897277832, + "learning_rate": 1.900358469207242e-07, + "loss": 0.1859, + "step": 9238 + }, + { + "epoch": 0.9575085501088195, + "grad_norm": 0.6905496120452881, + "learning_rate": 1.891136816214134e-07, + "loss": 0.1795, + "step": 9239 + }, + { + "epoch": 0.9576121877914809, + "grad_norm": 0.7289076447486877, + "learning_rate": 1.8819374858197515e-07, + "loss": 0.2005, + "step": 9240 + }, + { + "epoch": 0.9577158254741424, + "grad_norm": 0.6302348971366882, + "learning_rate": 1.8727604790606868e-07, + "loss": 0.1881, + "step": 9241 + }, + { + "epoch": 0.9578194631568038, + "grad_norm": 0.6139100193977356, + "learning_rate": 1.863605796971002e-07, + "loss": 0.1731, + "step": 9242 + }, + { + "epoch": 0.9579231008394652, + "grad_norm": 0.8011465072631836, + "learning_rate": 1.8544734405821608e-07, + "loss": 0.2258, + "step": 9243 + }, + { + "epoch": 0.9580267385221266, + "grad_norm": 0.7786160111427307, + "learning_rate": 1.845363410923251e-07, + "loss": 0.2141, + "step": 9244 + }, + { + "epoch": 0.958130376204788, + "grad_norm": 0.6988627314567566, + "learning_rate": 1.8362757090207628e-07, + "loss": 0.1819, + "step": 9245 + }, + { + "epoch": 0.9582340138874494, + "grad_norm": 0.7249823808670044, + "learning_rate": 1.8272103358986549e-07, + "loss": 0.2156, + "step": 9246 + }, + { + "epoch": 0.9583376515701109, + "grad_norm": 0.7648645043373108, + "learning_rate": 1.818167292578421e-07, + "loss": 0.2203, + "step": 9247 + }, + { + "epoch": 0.9584412892527723, + "grad_norm": 0.692196786403656, + "learning_rate": 1.8091465800790465e-07, + "loss": 0.1896, + "step": 9248 + }, + { + "epoch": 0.9585449269354337, + "grad_norm": 0.7493067383766174, + "learning_rate": 1.8001481994169178e-07, + "loss": 0.2059, + "step": 9249 + }, + { + "epoch": 0.9586485646180951, + "grad_norm": 0.8199605941772461, + "learning_rate": 1.79117215160598e-07, + "loss": 0.2293, + "step": 9250 + }, + { + "epoch": 0.9587522023007565, + "grad_norm": 0.7317061424255371, + "learning_rate": 1.7822184376576456e-07, + "loss": 0.2183, + "step": 9251 + }, + { + "epoch": 0.958855839983418, + "grad_norm": 0.6175105571746826, + "learning_rate": 1.7732870585808194e-07, + "loss": 0.1434, + "step": 9252 + }, + { + "epoch": 0.9589594776660794, + "grad_norm": 0.6819245219230652, + "learning_rate": 1.764378015381829e-07, + "loss": 0.2049, + "step": 9253 + }, + { + "epoch": 0.9590631153487408, + "grad_norm": 0.6690202355384827, + "learning_rate": 1.7554913090645832e-07, + "loss": 0.1815, + "step": 9254 + }, + { + "epoch": 0.9591667530314022, + "grad_norm": 0.7080608010292053, + "learning_rate": 1.7466269406304136e-07, + "loss": 0.1971, + "step": 9255 + }, + { + "epoch": 0.9592703907140636, + "grad_norm": 0.6241846680641174, + "learning_rate": 1.7377849110780997e-07, + "loss": 0.1851, + "step": 9256 + }, + { + "epoch": 0.959374028396725, + "grad_norm": 0.723426878452301, + "learning_rate": 1.7289652214039775e-07, + "loss": 0.183, + "step": 9257 + }, + { + "epoch": 0.9594776660793864, + "grad_norm": 0.7585938572883606, + "learning_rate": 1.7201678726018522e-07, + "loss": 0.2243, + "step": 9258 + }, + { + "epoch": 0.9595813037620479, + "grad_norm": 0.6271798610687256, + "learning_rate": 1.711392865662953e-07, + "loss": 0.1819, + "step": 9259 + }, + { + "epoch": 0.9596849414447093, + "grad_norm": 0.699207067489624, + "learning_rate": 1.702640201576089e-07, + "loss": 0.197, + "step": 9260 + }, + { + "epoch": 0.9597885791273707, + "grad_norm": 0.6738049983978271, + "learning_rate": 1.6939098813274713e-07, + "loss": 0.2064, + "step": 9261 + }, + { + "epoch": 0.9598922168100321, + "grad_norm": 0.5655495524406433, + "learning_rate": 1.6852019059008019e-07, + "loss": 0.1767, + "step": 9262 + }, + { + "epoch": 0.9599958544926935, + "grad_norm": 0.6773292422294617, + "learning_rate": 1.6765162762772957e-07, + "loss": 0.2361, + "step": 9263 + }, + { + "epoch": 0.960099492175355, + "grad_norm": 0.7021222114562988, + "learning_rate": 1.6678529934356148e-07, + "loss": 0.1996, + "step": 9264 + }, + { + "epoch": 0.9602031298580164, + "grad_norm": 0.7449448704719543, + "learning_rate": 1.6592120583519778e-07, + "loss": 0.2057, + "step": 9265 + }, + { + "epoch": 0.9603067675406778, + "grad_norm": 0.6998816728591919, + "learning_rate": 1.6505934719999839e-07, + "loss": 0.1839, + "step": 9266 + }, + { + "epoch": 0.9604104052233392, + "grad_norm": 0.6950123310089111, + "learning_rate": 1.6419972353507895e-07, + "loss": 0.177, + "step": 9267 + }, + { + "epoch": 0.9605140429060006, + "grad_norm": 0.7720685005187988, + "learning_rate": 1.6334233493729757e-07, + "loss": 0.2048, + "step": 9268 + }, + { + "epoch": 0.960617680588662, + "grad_norm": 0.6883362531661987, + "learning_rate": 1.624871815032658e-07, + "loss": 0.2034, + "step": 9269 + }, + { + "epoch": 0.9607213182713235, + "grad_norm": 0.6283169388771057, + "learning_rate": 1.616342633293422e-07, + "loss": 0.1621, + "step": 9270 + }, + { + "epoch": 0.9608249559539849, + "grad_norm": 0.6474786400794983, + "learning_rate": 1.6078358051162757e-07, + "loss": 0.1752, + "step": 9271 + }, + { + "epoch": 0.9609285936366463, + "grad_norm": 0.7147000432014465, + "learning_rate": 1.5993513314598085e-07, + "loss": 0.2124, + "step": 9272 + }, + { + "epoch": 0.9610322313193077, + "grad_norm": 0.8779515624046326, + "learning_rate": 1.590889213279989e-07, + "loss": 0.2072, + "step": 9273 + }, + { + "epoch": 0.9611358690019691, + "grad_norm": 0.5219985246658325, + "learning_rate": 1.5824494515303658e-07, + "loss": 0.1388, + "step": 9274 + }, + { + "epoch": 0.9612395066846305, + "grad_norm": 0.6944804191589355, + "learning_rate": 1.5740320471618885e-07, + "loss": 0.1865, + "step": 9275 + }, + { + "epoch": 0.961343144367292, + "grad_norm": 0.5461245775222778, + "learning_rate": 1.5656370011229994e-07, + "loss": 0.158, + "step": 9276 + }, + { + "epoch": 0.9614467820499534, + "grad_norm": 0.7172948122024536, + "learning_rate": 1.5572643143596744e-07, + "loss": 0.2155, + "step": 9277 + }, + { + "epoch": 0.9615504197326148, + "grad_norm": 0.6956398487091064, + "learning_rate": 1.548913987815315e-07, + "loss": 0.167, + "step": 9278 + }, + { + "epoch": 0.9616540574152762, + "grad_norm": 0.739754855632782, + "learning_rate": 1.5405860224308345e-07, + "loss": 0.1697, + "step": 9279 + }, + { + "epoch": 0.9617576950979376, + "grad_norm": 0.5874171257019043, + "learning_rate": 1.532280419144616e-07, + "loss": 0.1471, + "step": 9280 + }, + { + "epoch": 0.961861332780599, + "grad_norm": 0.6367971301078796, + "learning_rate": 1.5239971788925113e-07, + "loss": 0.1815, + "step": 9281 + }, + { + "epoch": 0.9619649704632605, + "grad_norm": 0.7605910897254944, + "learning_rate": 1.515736302607884e-07, + "loss": 0.2044, + "step": 9282 + }, + { + "epoch": 0.9620686081459219, + "grad_norm": 0.6360923647880554, + "learning_rate": 1.5074977912215016e-07, + "loss": 0.1852, + "step": 9283 + }, + { + "epoch": 0.9621722458285833, + "grad_norm": 0.6246673464775085, + "learning_rate": 1.499281645661732e-07, + "loss": 0.1708, + "step": 9284 + }, + { + "epoch": 0.9622758835112447, + "grad_norm": 0.7202011346817017, + "learning_rate": 1.4910878668543238e-07, + "loss": 0.1855, + "step": 9285 + }, + { + "epoch": 0.9623795211939061, + "grad_norm": 0.5647118091583252, + "learning_rate": 1.4829164557225607e-07, + "loss": 0.1536, + "step": 9286 + }, + { + "epoch": 0.9624831588765675, + "grad_norm": 0.6627938151359558, + "learning_rate": 1.4747674131871502e-07, + "loss": 0.1647, + "step": 9287 + }, + { + "epoch": 0.962586796559229, + "grad_norm": 0.6464048624038696, + "learning_rate": 1.4666407401663586e-07, + "loss": 0.1842, + "step": 9288 + }, + { + "epoch": 0.9626904342418904, + "grad_norm": 0.6384124755859375, + "learning_rate": 1.4585364375758304e-07, + "loss": 0.1662, + "step": 9289 + }, + { + "epoch": 0.9627940719245518, + "grad_norm": 0.5953946709632874, + "learning_rate": 1.4504545063287912e-07, + "loss": 0.164, + "step": 9290 + }, + { + "epoch": 0.9628977096072132, + "grad_norm": 0.9141512513160706, + "learning_rate": 1.44239494733589e-07, + "loss": 0.1747, + "step": 9291 + }, + { + "epoch": 0.9630013472898746, + "grad_norm": 0.7162027955055237, + "learning_rate": 1.4343577615052672e-07, + "loss": 0.2083, + "step": 9292 + }, + { + "epoch": 0.963104984972536, + "grad_norm": 0.7092261910438538, + "learning_rate": 1.4263429497425318e-07, + "loss": 0.1882, + "step": 9293 + }, + { + "epoch": 0.9632086226551975, + "grad_norm": 0.7536466717720032, + "learning_rate": 1.4183505129507835e-07, + "loss": 0.1833, + "step": 9294 + }, + { + "epoch": 0.9633122603378589, + "grad_norm": 0.5375593304634094, + "learning_rate": 1.4103804520305908e-07, + "loss": 0.1514, + "step": 9295 + }, + { + "epoch": 0.9634158980205203, + "grad_norm": 0.7242726683616638, + "learning_rate": 1.4024327678800353e-07, + "loss": 0.1717, + "step": 9296 + }, + { + "epoch": 0.9635195357031817, + "grad_norm": 0.7065474987030029, + "learning_rate": 1.394507461394623e-07, + "loss": 0.1905, + "step": 9297 + }, + { + "epoch": 0.9636231733858431, + "grad_norm": 0.7065313458442688, + "learning_rate": 1.3866045334673506e-07, + "loss": 0.1847, + "step": 9298 + }, + { + "epoch": 0.9637268110685046, + "grad_norm": 0.597445547580719, + "learning_rate": 1.37872398498875e-07, + "loss": 0.1622, + "step": 9299 + }, + { + "epoch": 0.963830448751166, + "grad_norm": 0.7109730243682861, + "learning_rate": 1.3708658168467558e-07, + "loss": 0.2074, + "step": 9300 + }, + { + "epoch": 0.9639340864338274, + "grad_norm": 0.7618610858917236, + "learning_rate": 1.3630300299268374e-07, + "loss": 0.1903, + "step": 9301 + }, + { + "epoch": 0.9640377241164888, + "grad_norm": 0.6909206509590149, + "learning_rate": 1.3552166251119103e-07, + "loss": 0.2081, + "step": 9302 + }, + { + "epoch": 0.9641413617991502, + "grad_norm": 0.6721199750900269, + "learning_rate": 1.3474256032823596e-07, + "loss": 0.1874, + "step": 9303 + }, + { + "epoch": 0.9642449994818116, + "grad_norm": 0.5766302347183228, + "learning_rate": 1.3396569653160829e-07, + "loss": 0.1534, + "step": 9304 + }, + { + "epoch": 0.964348637164473, + "grad_norm": 0.605042576789856, + "learning_rate": 1.3319107120884467e-07, + "loss": 0.1516, + "step": 9305 + }, + { + "epoch": 0.9644522748471345, + "grad_norm": 0.6952525973320007, + "learning_rate": 1.324186844472264e-07, + "loss": 0.1817, + "step": 9306 + }, + { + "epoch": 0.9645559125297959, + "grad_norm": 0.7488659620285034, + "learning_rate": 1.3164853633378826e-07, + "loss": 0.2072, + "step": 9307 + }, + { + "epoch": 0.9646595502124572, + "grad_norm": 0.7539721131324768, + "learning_rate": 1.3088062695530312e-07, + "loss": 0.2186, + "step": 9308 + }, + { + "epoch": 0.9647631878951186, + "grad_norm": 0.8046135902404785, + "learning_rate": 1.3011495639830395e-07, + "loss": 0.2327, + "step": 9309 + }, + { + "epoch": 0.96486682557778, + "grad_norm": 0.7128415107727051, + "learning_rate": 1.2935152474906398e-07, + "loss": 0.193, + "step": 9310 + }, + { + "epoch": 0.9649704632604414, + "grad_norm": 0.6780071258544922, + "learning_rate": 1.285903320936055e-07, + "loss": 0.2047, + "step": 9311 + }, + { + "epoch": 0.9650741009431029, + "grad_norm": 0.8416036367416382, + "learning_rate": 1.2783137851769544e-07, + "loss": 0.2005, + "step": 9312 + }, + { + "epoch": 0.9651777386257643, + "grad_norm": 0.6061697602272034, + "learning_rate": 1.270746641068543e-07, + "loss": 0.15, + "step": 9313 + }, + { + "epoch": 0.9652813763084257, + "grad_norm": 0.7234962582588196, + "learning_rate": 1.2632018894634725e-07, + "loss": 0.1989, + "step": 9314 + }, + { + "epoch": 0.9653850139910871, + "grad_norm": 0.7064085006713867, + "learning_rate": 1.2556795312118619e-07, + "loss": 0.1927, + "step": 9315 + }, + { + "epoch": 0.9654886516737485, + "grad_norm": 0.7468860149383545, + "learning_rate": 1.248179567161345e-07, + "loss": 0.1868, + "step": 9316 + }, + { + "epoch": 0.96559228935641, + "grad_norm": 0.5981663465499878, + "learning_rate": 1.240701998156979e-07, + "loss": 0.1849, + "step": 9317 + }, + { + "epoch": 0.9656959270390714, + "grad_norm": 0.6494339108467102, + "learning_rate": 1.2332468250413343e-07, + "loss": 0.1836, + "step": 9318 + }, + { + "epoch": 0.9657995647217328, + "grad_norm": 0.6810412406921387, + "learning_rate": 1.22581404865445e-07, + "loss": 0.1801, + "step": 9319 + }, + { + "epoch": 0.9659032024043942, + "grad_norm": 0.74522465467453, + "learning_rate": 1.2184036698338343e-07, + "loss": 0.2059, + "step": 9320 + }, + { + "epoch": 0.9660068400870556, + "grad_norm": 0.7757186889648438, + "learning_rate": 1.2110156894144852e-07, + "loss": 0.2175, + "step": 9321 + }, + { + "epoch": 0.966110477769717, + "grad_norm": 0.675933837890625, + "learning_rate": 1.203650108228871e-07, + "loss": 0.1979, + "step": 9322 + }, + { + "epoch": 0.9662141154523785, + "grad_norm": 0.57355135679245, + "learning_rate": 1.196306927106905e-07, + "loss": 0.1434, + "step": 9323 + }, + { + "epoch": 0.9663177531350399, + "grad_norm": 0.6624523401260376, + "learning_rate": 1.1889861468760589e-07, + "loss": 0.169, + "step": 9324 + }, + { + "epoch": 0.9664213908177013, + "grad_norm": 0.698999285697937, + "learning_rate": 1.1816877683611838e-07, + "loss": 0.2011, + "step": 9325 + }, + { + "epoch": 0.9665250285003627, + "grad_norm": 0.6993268728256226, + "learning_rate": 1.1744117923846443e-07, + "loss": 0.2027, + "step": 9326 + }, + { + "epoch": 0.9666286661830241, + "grad_norm": 0.69150710105896, + "learning_rate": 1.16715821976634e-07, + "loss": 0.1873, + "step": 9327 + }, + { + "epoch": 0.9667323038656855, + "grad_norm": 0.6487718820571899, + "learning_rate": 1.1599270513235283e-07, + "loss": 0.1739, + "step": 9328 + }, + { + "epoch": 0.966835941548347, + "grad_norm": 0.6418184041976929, + "learning_rate": 1.1527182878710241e-07, + "loss": 0.1768, + "step": 9329 + }, + { + "epoch": 0.9669395792310084, + "grad_norm": 0.7726601958274841, + "learning_rate": 1.1455319302211332e-07, + "loss": 0.2201, + "step": 9330 + }, + { + "epoch": 0.9670432169136698, + "grad_norm": 0.6095311641693115, + "learning_rate": 1.1383679791835633e-07, + "loss": 0.1785, + "step": 9331 + }, + { + "epoch": 0.9671468545963312, + "grad_norm": 0.6452474594116211, + "learning_rate": 1.13122643556558e-07, + "loss": 0.1712, + "step": 9332 + }, + { + "epoch": 0.9672504922789926, + "grad_norm": 0.728452205657959, + "learning_rate": 1.1241073001718283e-07, + "loss": 0.1975, + "step": 9333 + }, + { + "epoch": 0.967354129961654, + "grad_norm": 0.679295539855957, + "learning_rate": 1.1170105738045112e-07, + "loss": 0.1911, + "step": 9334 + }, + { + "epoch": 0.9674577676443155, + "grad_norm": 0.716543436050415, + "learning_rate": 1.1099362572632777e-07, + "loss": 0.2033, + "step": 9335 + }, + { + "epoch": 0.9675614053269769, + "grad_norm": 0.5981671214103699, + "learning_rate": 1.1028843513452236e-07, + "loss": 0.1659, + "step": 9336 + }, + { + "epoch": 0.9676650430096383, + "grad_norm": 0.6800544857978821, + "learning_rate": 1.0958548568450245e-07, + "loss": 0.1971, + "step": 9337 + }, + { + "epoch": 0.9677686806922997, + "grad_norm": 0.6196473836898804, + "learning_rate": 1.0888477745546466e-07, + "loss": 0.1876, + "step": 9338 + }, + { + "epoch": 0.9678723183749611, + "grad_norm": 0.6917783617973328, + "learning_rate": 1.081863105263703e-07, + "loss": 0.1862, + "step": 9339 + }, + { + "epoch": 0.9679759560576225, + "grad_norm": 0.622767448425293, + "learning_rate": 1.0749008497592083e-07, + "loss": 0.1601, + "step": 9340 + }, + { + "epoch": 0.968079593740284, + "grad_norm": 0.6830424666404724, + "learning_rate": 1.0679610088256242e-07, + "loss": 0.1952, + "step": 9341 + }, + { + "epoch": 0.9681832314229454, + "grad_norm": 0.706273078918457, + "learning_rate": 1.0610435832449917e-07, + "loss": 0.1878, + "step": 9342 + }, + { + "epoch": 0.9682868691056068, + "grad_norm": 0.7518766522407532, + "learning_rate": 1.0541485737966872e-07, + "loss": 0.1948, + "step": 9343 + }, + { + "epoch": 0.9683905067882682, + "grad_norm": 0.7455720901489258, + "learning_rate": 1.047275981257645e-07, + "loss": 0.2139, + "step": 9344 + }, + { + "epoch": 0.9684941444709296, + "grad_norm": 0.746716320514679, + "learning_rate": 1.0404258064022898e-07, + "loss": 0.1952, + "step": 9345 + }, + { + "epoch": 0.968597782153591, + "grad_norm": 0.6383845210075378, + "learning_rate": 1.033598050002449e-07, + "loss": 0.1509, + "step": 9346 + }, + { + "epoch": 0.9687014198362525, + "grad_norm": 0.7649304270744324, + "learning_rate": 1.0267927128274846e-07, + "loss": 0.2389, + "step": 9347 + }, + { + "epoch": 0.9688050575189139, + "grad_norm": 0.5000459551811218, + "learning_rate": 1.0200097956442057e-07, + "loss": 0.1464, + "step": 9348 + }, + { + "epoch": 0.9689086952015753, + "grad_norm": 0.7772543430328369, + "learning_rate": 1.0132492992168896e-07, + "loss": 0.2005, + "step": 9349 + }, + { + "epoch": 0.9690123328842367, + "grad_norm": 0.5948247313499451, + "learning_rate": 1.0065112243073494e-07, + "loss": 0.1756, + "step": 9350 + }, + { + "epoch": 0.9691159705668981, + "grad_norm": 0.7647456526756287, + "learning_rate": 9.997955716747553e-08, + "loss": 0.1956, + "step": 9351 + }, + { + "epoch": 0.9692196082495595, + "grad_norm": 0.5967316031455994, + "learning_rate": 9.931023420758356e-08, + "loss": 0.1497, + "step": 9352 + }, + { + "epoch": 0.969323245932221, + "grad_norm": 0.7856627702713013, + "learning_rate": 9.864315362648091e-08, + "loss": 0.2095, + "step": 9353 + }, + { + "epoch": 0.9694268836148824, + "grad_norm": 0.6720486283302307, + "learning_rate": 9.797831549932745e-08, + "loss": 0.1724, + "step": 9354 + }, + { + "epoch": 0.9695305212975438, + "grad_norm": 0.68314129114151, + "learning_rate": 9.731571990104105e-08, + "loss": 0.2019, + "step": 9355 + }, + { + "epoch": 0.9696341589802052, + "grad_norm": 0.7092345952987671, + "learning_rate": 9.665536690627974e-08, + "loss": 0.1744, + "step": 9356 + }, + { + "epoch": 0.9697377966628666, + "grad_norm": 0.6779714822769165, + "learning_rate": 9.599725658945069e-08, + "loss": 0.1733, + "step": 9357 + }, + { + "epoch": 0.969841434345528, + "grad_norm": 0.690662145614624, + "learning_rate": 9.534138902471235e-08, + "loss": 0.1796, + "step": 9358 + }, + { + "epoch": 0.9699450720281895, + "grad_norm": 0.9036988019943237, + "learning_rate": 9.468776428596115e-08, + "loss": 0.1887, + "step": 9359 + }, + { + "epoch": 0.9700487097108509, + "grad_norm": 0.6716302633285522, + "learning_rate": 9.403638244684931e-08, + "loss": 0.1672, + "step": 9360 + }, + { + "epoch": 0.9701523473935123, + "grad_norm": 0.7448993921279907, + "learning_rate": 9.338724358077367e-08, + "loss": 0.2256, + "step": 9361 + }, + { + "epoch": 0.9702559850761737, + "grad_norm": 0.7266402840614319, + "learning_rate": 9.274034776087793e-08, + "loss": 0.1953, + "step": 9362 + }, + { + "epoch": 0.9703596227588351, + "grad_norm": 0.6478952169418335, + "learning_rate": 9.209569506005489e-08, + "loss": 0.1765, + "step": 9363 + }, + { + "epoch": 0.9704632604414966, + "grad_norm": 0.7265430688858032, + "learning_rate": 9.14532855509398e-08, + "loss": 0.2049, + "step": 9364 + }, + { + "epoch": 0.970566898124158, + "grad_norm": 0.6949917674064636, + "learning_rate": 9.081311930591919e-08, + "loss": 0.215, + "step": 9365 + }, + { + "epoch": 0.9706705358068194, + "grad_norm": 0.8174939155578613, + "learning_rate": 9.017519639712868e-08, + "loss": 0.2426, + "step": 9366 + }, + { + "epoch": 0.9707741734894808, + "grad_norm": 0.75638347864151, + "learning_rate": 8.953951689644636e-08, + "loss": 0.2089, + "step": 9367 + }, + { + "epoch": 0.9708778111721422, + "grad_norm": 0.745216429233551, + "learning_rate": 8.890608087549934e-08, + "loss": 0.2288, + "step": 9368 + }, + { + "epoch": 0.9709814488548036, + "grad_norm": 0.6507091522216797, + "learning_rate": 8.82748884056639e-08, + "loss": 0.179, + "step": 9369 + }, + { + "epoch": 0.9710850865374651, + "grad_norm": 0.659831166267395, + "learning_rate": 8.764593955805866e-08, + "loss": 0.198, + "step": 9370 + }, + { + "epoch": 0.9711887242201265, + "grad_norm": 0.6593323349952698, + "learning_rate": 8.701923440355586e-08, + "loss": 0.1899, + "step": 9371 + }, + { + "epoch": 0.9712923619027879, + "grad_norm": 0.791563093662262, + "learning_rate": 8.63947730127701e-08, + "loss": 0.1864, + "step": 9372 + }, + { + "epoch": 0.9713959995854493, + "grad_norm": 0.6524644494056702, + "learning_rate": 8.577255545606511e-08, + "loss": 0.1968, + "step": 9373 + }, + { + "epoch": 0.9714996372681107, + "grad_norm": 0.6397282481193542, + "learning_rate": 8.515258180355146e-08, + "loss": 0.1862, + "step": 9374 + }, + { + "epoch": 0.9716032749507721, + "grad_norm": 0.6900851726531982, + "learning_rate": 8.453485212508661e-08, + "loss": 0.1816, + "step": 9375 + }, + { + "epoch": 0.9717069126334336, + "grad_norm": 0.7994019985198975, + "learning_rate": 8.391936649027488e-08, + "loss": 0.2319, + "step": 9376 + }, + { + "epoch": 0.971810550316095, + "grad_norm": 0.7205654382705688, + "learning_rate": 8.330612496846968e-08, + "loss": 0.1788, + "step": 9377 + }, + { + "epoch": 0.9719141879987564, + "grad_norm": 0.6512675881385803, + "learning_rate": 8.269512762876908e-08, + "loss": 0.1773, + "step": 9378 + }, + { + "epoch": 0.9720178256814178, + "grad_norm": 0.6505001783370972, + "learning_rate": 8.2086374540018e-08, + "loss": 0.1936, + "step": 9379 + }, + { + "epoch": 0.9721214633640792, + "grad_norm": 0.6861594319343567, + "learning_rate": 8.147986577081269e-08, + "loss": 0.1831, + "step": 9380 + }, + { + "epoch": 0.9722251010467406, + "grad_norm": 0.6237919926643372, + "learning_rate": 8.087560138949179e-08, + "loss": 0.1648, + "step": 9381 + }, + { + "epoch": 0.9723287387294021, + "grad_norm": 0.7278616428375244, + "learning_rate": 8.027358146414311e-08, + "loss": 0.1742, + "step": 9382 + }, + { + "epoch": 0.9724323764120635, + "grad_norm": 0.7543025016784668, + "learning_rate": 7.967380606260344e-08, + "loss": 0.2103, + "step": 9383 + }, + { + "epoch": 0.9725360140947248, + "grad_norm": 0.5872170925140381, + "learning_rate": 7.907627525244988e-08, + "loss": 0.1625, + "step": 9384 + }, + { + "epoch": 0.9726396517773862, + "grad_norm": 0.6460816264152527, + "learning_rate": 7.84809891010152e-08, + "loss": 0.2087, + "step": 9385 + }, + { + "epoch": 0.9727432894600476, + "grad_norm": 0.6365247368812561, + "learning_rate": 7.788794767537244e-08, + "loss": 0.1757, + "step": 9386 + }, + { + "epoch": 0.972846927142709, + "grad_norm": 0.6571633815765381, + "learning_rate": 7.729715104234814e-08, + "loss": 0.195, + "step": 9387 + }, + { + "epoch": 0.9729505648253705, + "grad_norm": 0.6919662356376648, + "learning_rate": 7.670859926851126e-08, + "loss": 0.2203, + "step": 9388 + }, + { + "epoch": 0.9730542025080319, + "grad_norm": 0.7071418762207031, + "learning_rate": 7.612229242017543e-08, + "loss": 0.1816, + "step": 9389 + }, + { + "epoch": 0.9731578401906933, + "grad_norm": 0.8264790773391724, + "learning_rate": 7.553823056340781e-08, + "loss": 0.1992, + "step": 9390 + }, + { + "epoch": 0.9732614778733547, + "grad_norm": 0.6832471489906311, + "learning_rate": 7.495641376402019e-08, + "loss": 0.2038, + "step": 9391 + }, + { + "epoch": 0.9733651155560161, + "grad_norm": 0.7595140933990479, + "learning_rate": 7.4376842087569e-08, + "loss": 0.2317, + "step": 9392 + }, + { + "epoch": 0.9734687532386775, + "grad_norm": 0.7731542587280273, + "learning_rate": 7.379951559936205e-08, + "loss": 0.2163, + "step": 9393 + }, + { + "epoch": 0.973572390921339, + "grad_norm": 0.8025881052017212, + "learning_rate": 7.322443436444949e-08, + "loss": 0.1935, + "step": 9394 + }, + { + "epoch": 0.9736760286040004, + "grad_norm": 0.7015080451965332, + "learning_rate": 7.265159844762837e-08, + "loss": 0.2051, + "step": 9395 + }, + { + "epoch": 0.9737796662866618, + "grad_norm": 0.6422966718673706, + "learning_rate": 7.20810079134493e-08, + "loss": 0.1644, + "step": 9396 + }, + { + "epoch": 0.9738833039693232, + "grad_norm": 0.6956278681755066, + "learning_rate": 7.151266282620306e-08, + "loss": 0.2061, + "step": 9397 + }, + { + "epoch": 0.9739869416519846, + "grad_norm": 0.7768704891204834, + "learning_rate": 7.094656324993177e-08, + "loss": 0.2021, + "step": 9398 + }, + { + "epoch": 0.974090579334646, + "grad_norm": 0.670602560043335, + "learning_rate": 7.038270924841995e-08, + "loss": 0.2034, + "step": 9399 + }, + { + "epoch": 0.9741942170173075, + "grad_norm": 0.6462625861167908, + "learning_rate": 6.982110088520566e-08, + "loss": 0.1871, + "step": 9400 + } + ], + "logging_steps": 1.0, + "max_steps": 9649, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 50, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.184630033364665e+22, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}