diff --git "a/checkpoint-5700/trainer_state.json" "b/checkpoint-5700/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-5700/trainer_state.json" @@ -0,0 +1,20135 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.8779360800924143, + "eval_steps": 300, + "global_step": 5700, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00030804774740084714, + "grad_norm": 1.6617624759674072, + "learning_rate": 9.999997658238351e-05, + "loss": 1.4818, + "step": 2 + }, + { + "epoch": 0.0006160954948016943, + "grad_norm": 0.8753126263618469, + "learning_rate": 9.9999906329556e-05, + "loss": 1.3083, + "step": 4 + }, + { + "epoch": 0.0009241432422025414, + "grad_norm": 1.0605000257492065, + "learning_rate": 9.999978924158326e-05, + "loss": 1.5628, + "step": 6 + }, + { + "epoch": 0.0012321909896033886, + "grad_norm": 0.9619163870811462, + "learning_rate": 9.999962531857496e-05, + "loss": 1.3115, + "step": 8 + }, + { + "epoch": 0.0015402387370042356, + "grad_norm": 2.4611310958862305, + "learning_rate": 9.999941456068467e-05, + "loss": 1.3254, + "step": 10 + }, + { + "epoch": 0.0018482864844050829, + "grad_norm": 1.5140900611877441, + "learning_rate": 9.999915696810979e-05, + "loss": 1.7101, + "step": 12 + }, + { + "epoch": 0.00215633423180593, + "grad_norm": 0.9794526696205139, + "learning_rate": 9.999885254109161e-05, + "loss": 1.439, + "step": 14 + }, + { + "epoch": 0.002464381979206777, + "grad_norm": 1.0064067840576172, + "learning_rate": 9.99985012799153e-05, + "loss": 1.3107, + "step": 16 + }, + { + "epoch": 0.002772429726607624, + "grad_norm": 1.075746774673462, + "learning_rate": 9.999810318490988e-05, + "loss": 1.8331, + "step": 18 + }, + { + "epoch": 0.0030804774740084712, + "grad_norm": 0.9585892558097839, + "learning_rate": 9.999765825644824e-05, + "loss": 1.3932, + "step": 20 + }, + { + "epoch": 0.0033885252214093183, + "grad_norm": 0.7582125067710876, + "learning_rate": 9.999716649494715e-05, + "loss": 1.1928, + "step": 22 + }, + { + "epoch": 0.0036965729688101657, + "grad_norm": 0.7594268918037415, + "learning_rate": 9.999662790086726e-05, + "loss": 1.3234, + "step": 24 + }, + { + "epoch": 0.004004620716211013, + "grad_norm": 0.8150789141654968, + "learning_rate": 9.999604247471306e-05, + "loss": 1.3775, + "step": 26 + }, + { + "epoch": 0.00431266846361186, + "grad_norm": 0.7926575541496277, + "learning_rate": 9.999541021703292e-05, + "loss": 1.2174, + "step": 28 + }, + { + "epoch": 0.004620716211012707, + "grad_norm": 0.9279683232307434, + "learning_rate": 9.999473112841908e-05, + "loss": 1.32, + "step": 30 + }, + { + "epoch": 0.004928763958413554, + "grad_norm": 0.7790488004684448, + "learning_rate": 9.999400520950766e-05, + "loss": 1.4515, + "step": 32 + }, + { + "epoch": 0.005236811705814401, + "grad_norm": 1.0332311391830444, + "learning_rate": 9.99932324609786e-05, + "loss": 1.3014, + "step": 34 + }, + { + "epoch": 0.005544859453215248, + "grad_norm": 0.9211418628692627, + "learning_rate": 9.999241288355577e-05, + "loss": 1.2203, + "step": 36 + }, + { + "epoch": 0.005852907200616096, + "grad_norm": 0.708522379398346, + "learning_rate": 9.999154647800686e-05, + "loss": 1.2018, + "step": 38 + }, + { + "epoch": 0.0061609549480169425, + "grad_norm": 1.3211787939071655, + "learning_rate": 9.999063324514344e-05, + "loss": 0.9897, + "step": 40 + }, + { + "epoch": 0.00646900269541779, + "grad_norm": 0.7706666588783264, + "learning_rate": 9.998967318582092e-05, + "loss": 1.2864, + "step": 42 + }, + { + "epoch": 0.0067770504428186365, + "grad_norm": 0.857083797454834, + "learning_rate": 9.998866630093861e-05, + "loss": 1.1717, + "step": 44 + }, + { + "epoch": 0.007085098190219484, + "grad_norm": 0.5936506390571594, + "learning_rate": 9.998761259143967e-05, + "loss": 1.1459, + "step": 46 + }, + { + "epoch": 0.0073931459376203315, + "grad_norm": 0.6077166795730591, + "learning_rate": 9.99865120583111e-05, + "loss": 1.2088, + "step": 48 + }, + { + "epoch": 0.007701193685021178, + "grad_norm": 2.3327157497406006, + "learning_rate": 9.998536470258378e-05, + "loss": 1.9314, + "step": 50 + }, + { + "epoch": 0.008009241432422026, + "grad_norm": 0.8980337381362915, + "learning_rate": 9.998417052533244e-05, + "loss": 1.1304, + "step": 52 + }, + { + "epoch": 0.008317289179822872, + "grad_norm": 0.7357211709022522, + "learning_rate": 9.998292952767569e-05, + "loss": 1.1089, + "step": 54 + }, + { + "epoch": 0.00862533692722372, + "grad_norm": 0.7480604648590088, + "learning_rate": 9.998164171077595e-05, + "loss": 1.3085, + "step": 56 + }, + { + "epoch": 0.008933384674624567, + "grad_norm": 0.7059953808784485, + "learning_rate": 9.998030707583953e-05, + "loss": 1.2016, + "step": 58 + }, + { + "epoch": 0.009241432422025414, + "grad_norm": 0.8867676258087158, + "learning_rate": 9.99789256241166e-05, + "loss": 1.263, + "step": 60 + }, + { + "epoch": 0.00954948016942626, + "grad_norm": 0.5447019338607788, + "learning_rate": 9.997749735690117e-05, + "loss": 1.1203, + "step": 62 + }, + { + "epoch": 0.009857527916827109, + "grad_norm": 1.0171029567718506, + "learning_rate": 9.997602227553112e-05, + "loss": 1.2449, + "step": 64 + }, + { + "epoch": 0.010165575664227955, + "grad_norm": 0.6860587000846863, + "learning_rate": 9.997450038138811e-05, + "loss": 1.1124, + "step": 66 + }, + { + "epoch": 0.010473623411628802, + "grad_norm": 0.7293751835823059, + "learning_rate": 9.997293167589778e-05, + "loss": 1.2182, + "step": 68 + }, + { + "epoch": 0.01078167115902965, + "grad_norm": 0.8685497045516968, + "learning_rate": 9.997131616052949e-05, + "loss": 1.3284, + "step": 70 + }, + { + "epoch": 0.011089718906430497, + "grad_norm": 0.662895917892456, + "learning_rate": 9.99696538367965e-05, + "loss": 1.0932, + "step": 72 + }, + { + "epoch": 0.011397766653831343, + "grad_norm": 0.8935220241546631, + "learning_rate": 9.996794470625597e-05, + "loss": 1.4038, + "step": 74 + }, + { + "epoch": 0.011705814401232192, + "grad_norm": 0.6189644932746887, + "learning_rate": 9.996618877050878e-05, + "loss": 1.1177, + "step": 76 + }, + { + "epoch": 0.012013862148633038, + "grad_norm": 0.861652135848999, + "learning_rate": 9.996438603119978e-05, + "loss": 1.2163, + "step": 78 + }, + { + "epoch": 0.012321909896033885, + "grad_norm": 0.6359105110168457, + "learning_rate": 9.996253649001759e-05, + "loss": 1.0015, + "step": 80 + }, + { + "epoch": 0.012629957643434732, + "grad_norm": 0.8358225226402283, + "learning_rate": 9.996064014869466e-05, + "loss": 1.0248, + "step": 82 + }, + { + "epoch": 0.01293800539083558, + "grad_norm": 1.0179784297943115, + "learning_rate": 9.995869700900732e-05, + "loss": 1.2234, + "step": 84 + }, + { + "epoch": 0.013246053138236426, + "grad_norm": 0.7293205261230469, + "learning_rate": 9.995670707277571e-05, + "loss": 1.2366, + "step": 86 + }, + { + "epoch": 0.013554100885637273, + "grad_norm": 0.5179591178894043, + "learning_rate": 9.995467034186383e-05, + "loss": 1.0338, + "step": 88 + }, + { + "epoch": 0.013862148633038121, + "grad_norm": 0.6510319113731384, + "learning_rate": 9.995258681817948e-05, + "loss": 1.1327, + "step": 90 + }, + { + "epoch": 0.014170196380438968, + "grad_norm": 0.6923282146453857, + "learning_rate": 9.995045650367432e-05, + "loss": 1.3055, + "step": 92 + }, + { + "epoch": 0.014478244127839815, + "grad_norm": 0.7597566843032837, + "learning_rate": 9.994827940034379e-05, + "loss": 1.5757, + "step": 94 + }, + { + "epoch": 0.014786291875240663, + "grad_norm": 0.7768942713737488, + "learning_rate": 9.994605551022724e-05, + "loss": 1.4926, + "step": 96 + }, + { + "epoch": 0.01509433962264151, + "grad_norm": 0.7350413203239441, + "learning_rate": 9.994378483540778e-05, + "loss": 1.2505, + "step": 98 + }, + { + "epoch": 0.015402387370042356, + "grad_norm": 0.7640051245689392, + "learning_rate": 9.994146737801235e-05, + "loss": 1.2688, + "step": 100 + }, + { + "epoch": 0.015710435117443203, + "grad_norm": 4.0233635902404785, + "learning_rate": 9.993910314021172e-05, + "loss": 2.1171, + "step": 102 + }, + { + "epoch": 0.01601848286484405, + "grad_norm": 0.6492887139320374, + "learning_rate": 9.99366921242205e-05, + "loss": 1.0807, + "step": 104 + }, + { + "epoch": 0.0163265306122449, + "grad_norm": 0.7625781893730164, + "learning_rate": 9.99342343322971e-05, + "loss": 1.0053, + "step": 106 + }, + { + "epoch": 0.016634578359645744, + "grad_norm": 2.026451587677002, + "learning_rate": 9.993172976674374e-05, + "loss": 1.36, + "step": 108 + }, + { + "epoch": 0.016942626107046593, + "grad_norm": 0.490477979183197, + "learning_rate": 9.992917842990645e-05, + "loss": 0.8772, + "step": 110 + }, + { + "epoch": 0.01725067385444744, + "grad_norm": 0.7898009419441223, + "learning_rate": 9.99265803241751e-05, + "loss": 1.0414, + "step": 112 + }, + { + "epoch": 0.017558721601848286, + "grad_norm": 1.5593082904815674, + "learning_rate": 9.992393545198332e-05, + "loss": 2.5199, + "step": 114 + }, + { + "epoch": 0.017866769349249134, + "grad_norm": 0.9973122477531433, + "learning_rate": 9.992124381580859e-05, + "loss": 1.1275, + "step": 116 + }, + { + "epoch": 0.01817481709664998, + "grad_norm": 0.5887067914009094, + "learning_rate": 9.991850541817219e-05, + "loss": 0.9591, + "step": 118 + }, + { + "epoch": 0.018482864844050827, + "grad_norm": 0.6366308927536011, + "learning_rate": 9.991572026163916e-05, + "loss": 1.0398, + "step": 120 + }, + { + "epoch": 0.018790912591451676, + "grad_norm": 1.0203287601470947, + "learning_rate": 9.991288834881839e-05, + "loss": 1.4016, + "step": 122 + }, + { + "epoch": 0.01909896033885252, + "grad_norm": 0.9442655444145203, + "learning_rate": 9.991000968236255e-05, + "loss": 1.2937, + "step": 124 + }, + { + "epoch": 0.01940700808625337, + "grad_norm": 1.4322446584701538, + "learning_rate": 9.990708426496808e-05, + "loss": 1.6756, + "step": 126 + }, + { + "epoch": 0.019715055833654217, + "grad_norm": 0.6030606627464294, + "learning_rate": 9.990411209937524e-05, + "loss": 1.2115, + "step": 128 + }, + { + "epoch": 0.020023103581055062, + "grad_norm": 0.7634426951408386, + "learning_rate": 9.990109318836809e-05, + "loss": 1.1997, + "step": 130 + }, + { + "epoch": 0.02033115132845591, + "grad_norm": 0.6622510552406311, + "learning_rate": 9.989802753477443e-05, + "loss": 1.3292, + "step": 132 + }, + { + "epoch": 0.02063919907585676, + "grad_norm": 0.7911828756332397, + "learning_rate": 9.989491514146589e-05, + "loss": 1.1769, + "step": 134 + }, + { + "epoch": 0.020947246823257604, + "grad_norm": 0.8692740201950073, + "learning_rate": 9.989175601135786e-05, + "loss": 1.2349, + "step": 136 + }, + { + "epoch": 0.021255294570658452, + "grad_norm": 0.5944163203239441, + "learning_rate": 9.988855014740951e-05, + "loss": 1.1596, + "step": 138 + }, + { + "epoch": 0.0215633423180593, + "grad_norm": 0.9593337774276733, + "learning_rate": 9.988529755262379e-05, + "loss": 1.1337, + "step": 140 + }, + { + "epoch": 0.021871390065460145, + "grad_norm": 0.7589074373245239, + "learning_rate": 9.988199823004741e-05, + "loss": 1.1172, + "step": 142 + }, + { + "epoch": 0.022179437812860994, + "grad_norm": 0.6502323746681213, + "learning_rate": 9.987865218277088e-05, + "loss": 1.1022, + "step": 144 + }, + { + "epoch": 0.022487485560261842, + "grad_norm": 0.8099236488342285, + "learning_rate": 9.987525941392844e-05, + "loss": 1.1416, + "step": 146 + }, + { + "epoch": 0.022795533307662687, + "grad_norm": 0.9074066877365112, + "learning_rate": 9.987181992669812e-05, + "loss": 1.3035, + "step": 148 + }, + { + "epoch": 0.023103581055063535, + "grad_norm": 0.7978343963623047, + "learning_rate": 9.98683337243017e-05, + "loss": 0.9997, + "step": 150 + }, + { + "epoch": 0.023411628802464383, + "grad_norm": 0.5891391634941101, + "learning_rate": 9.986480081000474e-05, + "loss": 0.9937, + "step": 152 + }, + { + "epoch": 0.02371967654986523, + "grad_norm": 0.747765302658081, + "learning_rate": 9.986122118711651e-05, + "loss": 1.0666, + "step": 154 + }, + { + "epoch": 0.024027724297266077, + "grad_norm": 0.7870962619781494, + "learning_rate": 9.985759485899009e-05, + "loss": 1.0771, + "step": 156 + }, + { + "epoch": 0.024335772044666925, + "grad_norm": 0.7572788596153259, + "learning_rate": 9.985392182902225e-05, + "loss": 1.2335, + "step": 158 + }, + { + "epoch": 0.02464381979206777, + "grad_norm": 1.1162936687469482, + "learning_rate": 9.985020210065353e-05, + "loss": 1.1569, + "step": 160 + }, + { + "epoch": 0.024951867539468618, + "grad_norm": 0.7760798335075378, + "learning_rate": 9.984643567736824e-05, + "loss": 1.5558, + "step": 162 + }, + { + "epoch": 0.025259915286869463, + "grad_norm": 0.7158915996551514, + "learning_rate": 9.984262256269441e-05, + "loss": 1.1885, + "step": 164 + }, + { + "epoch": 0.02556796303427031, + "grad_norm": 0.9121390581130981, + "learning_rate": 9.983876276020378e-05, + "loss": 1.3927, + "step": 166 + }, + { + "epoch": 0.02587601078167116, + "grad_norm": 0.5669654607772827, + "learning_rate": 9.983485627351187e-05, + "loss": 1.0516, + "step": 168 + }, + { + "epoch": 0.026184058529072005, + "grad_norm": 0.816312313079834, + "learning_rate": 9.983090310627787e-05, + "loss": 1.1956, + "step": 170 + }, + { + "epoch": 0.026492106276472853, + "grad_norm": 0.5462418794631958, + "learning_rate": 9.982690326220477e-05, + "loss": 1.2407, + "step": 172 + }, + { + "epoch": 0.0268001540238737, + "grad_norm": 0.9155438542366028, + "learning_rate": 9.98228567450392e-05, + "loss": 1.0894, + "step": 174 + }, + { + "epoch": 0.027108201771274546, + "grad_norm": 0.6453492045402527, + "learning_rate": 9.98187635585716e-05, + "loss": 1.1709, + "step": 176 + }, + { + "epoch": 0.027416249518675394, + "grad_norm": 0.8367354869842529, + "learning_rate": 9.981462370663604e-05, + "loss": 1.122, + "step": 178 + }, + { + "epoch": 0.027724297266076243, + "grad_norm": 0.657660961151123, + "learning_rate": 9.981043719311034e-05, + "loss": 1.3573, + "step": 180 + }, + { + "epoch": 0.028032345013477088, + "grad_norm": 0.7809942960739136, + "learning_rate": 9.980620402191603e-05, + "loss": 1.174, + "step": 182 + }, + { + "epoch": 0.028340392760877936, + "grad_norm": 0.8295182585716248, + "learning_rate": 9.980192419701837e-05, + "loss": 0.9466, + "step": 184 + }, + { + "epoch": 0.028648440508278784, + "grad_norm": 0.8026232719421387, + "learning_rate": 9.979759772242625e-05, + "loss": 1.0066, + "step": 186 + }, + { + "epoch": 0.02895648825567963, + "grad_norm": 0.714900016784668, + "learning_rate": 9.979322460219234e-05, + "loss": 1.3128, + "step": 188 + }, + { + "epoch": 0.029264536003080478, + "grad_norm": 0.8300663232803345, + "learning_rate": 9.978880484041292e-05, + "loss": 1.356, + "step": 190 + }, + { + "epoch": 0.029572583750481326, + "grad_norm": 1.3307452201843262, + "learning_rate": 9.978433844122804e-05, + "loss": 1.5442, + "step": 192 + }, + { + "epoch": 0.02988063149788217, + "grad_norm": 0.8927571773529053, + "learning_rate": 9.977982540882136e-05, + "loss": 1.1138, + "step": 194 + }, + { + "epoch": 0.03018867924528302, + "grad_norm": 0.7937953472137451, + "learning_rate": 9.977526574742028e-05, + "loss": 1.2531, + "step": 196 + }, + { + "epoch": 0.030496726992683867, + "grad_norm": 0.9369279742240906, + "learning_rate": 9.977065946129586e-05, + "loss": 1.1859, + "step": 198 + }, + { + "epoch": 0.030804774740084712, + "grad_norm": 0.643165647983551, + "learning_rate": 9.976600655476283e-05, + "loss": 1.1124, + "step": 200 + }, + { + "epoch": 0.03111282248748556, + "grad_norm": 0.632799506187439, + "learning_rate": 9.976130703217956e-05, + "loss": 2.1622, + "step": 202 + }, + { + "epoch": 0.031420870234886406, + "grad_norm": 1.6191530227661133, + "learning_rate": 9.975656089794816e-05, + "loss": 1.1623, + "step": 204 + }, + { + "epoch": 0.031728917982287254, + "grad_norm": 0.765177309513092, + "learning_rate": 9.975176815651431e-05, + "loss": 1.1794, + "step": 206 + }, + { + "epoch": 0.0320369657296881, + "grad_norm": 0.8509706258773804, + "learning_rate": 9.974692881236743e-05, + "loss": 1.093, + "step": 208 + }, + { + "epoch": 0.03234501347708895, + "grad_norm": 0.6735479831695557, + "learning_rate": 9.974204287004055e-05, + "loss": 1.1755, + "step": 210 + }, + { + "epoch": 0.0326530612244898, + "grad_norm": 0.8011558055877686, + "learning_rate": 9.973711033411034e-05, + "loss": 1.2142, + "step": 212 + }, + { + "epoch": 0.03296110897189064, + "grad_norm": 0.6858620643615723, + "learning_rate": 9.973213120919714e-05, + "loss": 1.0759, + "step": 214 + }, + { + "epoch": 0.03326915671929149, + "grad_norm": 0.9899487495422363, + "learning_rate": 9.97271054999649e-05, + "loss": 1.3075, + "step": 216 + }, + { + "epoch": 0.03357720446669234, + "grad_norm": 0.5964149236679077, + "learning_rate": 9.972203321112126e-05, + "loss": 1.0864, + "step": 218 + }, + { + "epoch": 0.033885252214093185, + "grad_norm": 1.0063085556030273, + "learning_rate": 9.971691434741742e-05, + "loss": 1.0993, + "step": 220 + }, + { + "epoch": 0.034193299961494034, + "grad_norm": 0.8755286335945129, + "learning_rate": 9.971174891364827e-05, + "loss": 1.1378, + "step": 222 + }, + { + "epoch": 0.03450134770889488, + "grad_norm": 3.5582456588745117, + "learning_rate": 9.970653691465229e-05, + "loss": 2.7598, + "step": 224 + }, + { + "epoch": 0.03480939545629572, + "grad_norm": 0.8937438130378723, + "learning_rate": 9.970127835531158e-05, + "loss": 1.0757, + "step": 226 + }, + { + "epoch": 0.03511744320369657, + "grad_norm": 0.6416354179382324, + "learning_rate": 9.969597324055187e-05, + "loss": 1.2164, + "step": 228 + }, + { + "epoch": 0.03542549095109742, + "grad_norm": 1.0077422857284546, + "learning_rate": 9.969062157534246e-05, + "loss": 1.0344, + "step": 230 + }, + { + "epoch": 0.03573353869849827, + "grad_norm": 0.7546489238739014, + "learning_rate": 9.96852233646963e-05, + "loss": 1.0326, + "step": 232 + }, + { + "epoch": 0.03604158644589912, + "grad_norm": 0.7511939406394958, + "learning_rate": 9.967977861366991e-05, + "loss": 1.3572, + "step": 234 + }, + { + "epoch": 0.03634963419329996, + "grad_norm": 0.7937233448028564, + "learning_rate": 9.967428732736341e-05, + "loss": 1.2514, + "step": 236 + }, + { + "epoch": 0.036657681940700806, + "grad_norm": 0.7672502398490906, + "learning_rate": 9.966874951092053e-05, + "loss": 1.2304, + "step": 238 + }, + { + "epoch": 0.036965729688101655, + "grad_norm": 0.8722102046012878, + "learning_rate": 9.966316516952854e-05, + "loss": 1.1674, + "step": 240 + }, + { + "epoch": 0.0372737774355025, + "grad_norm": 0.9118186831474304, + "learning_rate": 9.965753430841835e-05, + "loss": 1.3118, + "step": 242 + }, + { + "epoch": 0.03758182518290335, + "grad_norm": 0.6866461038589478, + "learning_rate": 9.96518569328644e-05, + "loss": 0.9958, + "step": 244 + }, + { + "epoch": 0.0378898729303042, + "grad_norm": 0.7743217945098877, + "learning_rate": 9.964613304818472e-05, + "loss": 1.1417, + "step": 246 + }, + { + "epoch": 0.03819792067770504, + "grad_norm": 0.7421185374259949, + "learning_rate": 9.964036265974089e-05, + "loss": 1.2372, + "step": 248 + }, + { + "epoch": 0.03850596842510589, + "grad_norm": 0.75697922706604, + "learning_rate": 9.963454577293808e-05, + "loss": 1.0522, + "step": 250 + }, + { + "epoch": 0.03881401617250674, + "grad_norm": 0.600669264793396, + "learning_rate": 9.962868239322495e-05, + "loss": 1.1678, + "step": 252 + }, + { + "epoch": 0.039122063919907586, + "grad_norm": 1.034199833869934, + "learning_rate": 9.96227725260938e-05, + "loss": 2.1076, + "step": 254 + }, + { + "epoch": 0.039430111667308435, + "grad_norm": 0.606637716293335, + "learning_rate": 9.96168161770804e-05, + "loss": 0.8878, + "step": 256 + }, + { + "epoch": 0.03973815941470928, + "grad_norm": 0.7937979102134705, + "learning_rate": 9.961081335176412e-05, + "loss": 2.2967, + "step": 258 + }, + { + "epoch": 0.040046207162110124, + "grad_norm": 0.6674078702926636, + "learning_rate": 9.960476405576782e-05, + "loss": 1.1733, + "step": 260 + }, + { + "epoch": 0.04035425490951097, + "grad_norm": 0.6720229983329773, + "learning_rate": 9.959866829475789e-05, + "loss": 1.1175, + "step": 262 + }, + { + "epoch": 0.04066230265691182, + "grad_norm": 0.6526698470115662, + "learning_rate": 9.959252607444427e-05, + "loss": 1.1886, + "step": 264 + }, + { + "epoch": 0.04097035040431267, + "grad_norm": 0.5717160701751709, + "learning_rate": 9.958633740058042e-05, + "loss": 0.9741, + "step": 266 + }, + { + "epoch": 0.04127839815171352, + "grad_norm": 0.6778659820556641, + "learning_rate": 9.958010227896329e-05, + "loss": 1.0269, + "step": 268 + }, + { + "epoch": 0.041586445899114366, + "grad_norm": 0.8337912559509277, + "learning_rate": 9.957382071543332e-05, + "loss": 1.1228, + "step": 270 + }, + { + "epoch": 0.04189449364651521, + "grad_norm": 0.8081030249595642, + "learning_rate": 9.95674927158745e-05, + "loss": 1.398, + "step": 272 + }, + { + "epoch": 0.042202541393916056, + "grad_norm": 0.6906352639198303, + "learning_rate": 9.956111828621432e-05, + "loss": 0.9677, + "step": 274 + }, + { + "epoch": 0.042510589141316904, + "grad_norm": 0.5956444144248962, + "learning_rate": 9.955469743242372e-05, + "loss": 1.0459, + "step": 276 + }, + { + "epoch": 0.04281863688871775, + "grad_norm": 0.5646885633468628, + "learning_rate": 9.954823016051713e-05, + "loss": 1.4604, + "step": 278 + }, + { + "epoch": 0.0431266846361186, + "grad_norm": 0.5150172710418701, + "learning_rate": 9.95417164765525e-05, + "loss": 0.9466, + "step": 280 + }, + { + "epoch": 0.04343473238351944, + "grad_norm": 0.6873899102210999, + "learning_rate": 9.95351563866312e-05, + "loss": 1.1715, + "step": 282 + }, + { + "epoch": 0.04374278013092029, + "grad_norm": 0.7087437510490417, + "learning_rate": 9.952854989689812e-05, + "loss": 1.2993, + "step": 284 + }, + { + "epoch": 0.04405082787832114, + "grad_norm": 0.7204945087432861, + "learning_rate": 9.952189701354158e-05, + "loss": 1.0131, + "step": 286 + }, + { + "epoch": 0.04435887562572199, + "grad_norm": 0.8233432769775391, + "learning_rate": 9.951519774279334e-05, + "loss": 1.1578, + "step": 288 + }, + { + "epoch": 0.044666923373122835, + "grad_norm": 1.0435971021652222, + "learning_rate": 9.95084520909287e-05, + "loss": 1.3136, + "step": 290 + }, + { + "epoch": 0.044974971120523684, + "grad_norm": 0.7032255530357361, + "learning_rate": 9.950166006426629e-05, + "loss": 0.9537, + "step": 292 + }, + { + "epoch": 0.045283018867924525, + "grad_norm": 0.8625809550285339, + "learning_rate": 9.949482166916826e-05, + "loss": 1.0124, + "step": 294 + }, + { + "epoch": 0.045591066615325374, + "grad_norm": 0.9952526688575745, + "learning_rate": 9.948793691204014e-05, + "loss": 1.304, + "step": 296 + }, + { + "epoch": 0.04589911436272622, + "grad_norm": 0.49537548422813416, + "learning_rate": 9.948100579933095e-05, + "loss": 0.8679, + "step": 298 + }, + { + "epoch": 0.04620716211012707, + "grad_norm": 0.7977579236030579, + "learning_rate": 9.947402833753307e-05, + "loss": 1.1701, + "step": 300 + }, + { + "epoch": 0.04620716211012707, + "eval_loss": 2.4681503772735596, + "eval_runtime": 746.0395, + "eval_samples_per_second": 2.681, + "eval_steps_per_second": 0.67, + "step": 300 + }, + { + "epoch": 0.04651520985752792, + "grad_norm": 0.5484630465507507, + "learning_rate": 9.946700453318234e-05, + "loss": 0.9929, + "step": 302 + }, + { + "epoch": 0.04682325760492877, + "grad_norm": 0.768426239490509, + "learning_rate": 9.945993439285797e-05, + "loss": 1.1075, + "step": 304 + }, + { + "epoch": 0.04713130535232961, + "grad_norm": 0.7085797786712646, + "learning_rate": 9.945281792318259e-05, + "loss": 1.2756, + "step": 306 + }, + { + "epoch": 0.04743935309973046, + "grad_norm": 0.5655617117881775, + "learning_rate": 9.944565513082227e-05, + "loss": 1.8454, + "step": 308 + }, + { + "epoch": 0.047747400847131305, + "grad_norm": 1.111232042312622, + "learning_rate": 9.943844602248638e-05, + "loss": 1.3407, + "step": 310 + }, + { + "epoch": 0.04805544859453215, + "grad_norm": 0.8340556025505066, + "learning_rate": 9.943119060492776e-05, + "loss": 2.5626, + "step": 312 + }, + { + "epoch": 0.048363496341933, + "grad_norm": 0.6861330270767212, + "learning_rate": 9.942388888494258e-05, + "loss": 1.1323, + "step": 314 + }, + { + "epoch": 0.04867154408933385, + "grad_norm": 0.9066111445426941, + "learning_rate": 9.94165408693704e-05, + "loss": 1.1893, + "step": 316 + }, + { + "epoch": 0.04897959183673469, + "grad_norm": 0.8065815567970276, + "learning_rate": 9.940914656509414e-05, + "loss": 1.1128, + "step": 318 + }, + { + "epoch": 0.04928763958413554, + "grad_norm": 0.7174093127250671, + "learning_rate": 9.940170597904006e-05, + "loss": 1.0627, + "step": 320 + }, + { + "epoch": 0.04959568733153639, + "grad_norm": 0.8688803315162659, + "learning_rate": 9.939421911817783e-05, + "loss": 2.0269, + "step": 322 + }, + { + "epoch": 0.049903735078937236, + "grad_norm": 0.8032212853431702, + "learning_rate": 9.93866859895204e-05, + "loss": 1.1595, + "step": 324 + }, + { + "epoch": 0.050211782826338085, + "grad_norm": 0.8701591491699219, + "learning_rate": 9.937910660012408e-05, + "loss": 1.41, + "step": 326 + }, + { + "epoch": 0.050519830573738926, + "grad_norm": 0.8329717516899109, + "learning_rate": 9.937148095708855e-05, + "loss": 1.5113, + "step": 328 + }, + { + "epoch": 0.050827878321139774, + "grad_norm": 0.6211066246032715, + "learning_rate": 9.936380906755676e-05, + "loss": 1.1042, + "step": 330 + }, + { + "epoch": 0.05113592606854062, + "grad_norm": 0.7419208288192749, + "learning_rate": 9.935609093871502e-05, + "loss": 2.0661, + "step": 332 + }, + { + "epoch": 0.05144397381594147, + "grad_norm": 0.7876706719398499, + "learning_rate": 9.934832657779291e-05, + "loss": 1.0094, + "step": 334 + }, + { + "epoch": 0.05175202156334232, + "grad_norm": 0.9068336486816406, + "learning_rate": 9.934051599206339e-05, + "loss": 1.0648, + "step": 336 + }, + { + "epoch": 0.05206006931074317, + "grad_norm": 0.7844152450561523, + "learning_rate": 9.933265918884262e-05, + "loss": 1.26, + "step": 338 + }, + { + "epoch": 0.05236811705814401, + "grad_norm": 0.7152935266494751, + "learning_rate": 9.932475617549016e-05, + "loss": 1.1199, + "step": 340 + }, + { + "epoch": 0.05267616480554486, + "grad_norm": 0.8740458488464355, + "learning_rate": 9.931680695940873e-05, + "loss": 1.2573, + "step": 342 + }, + { + "epoch": 0.052984212552945706, + "grad_norm": 0.8880050778388977, + "learning_rate": 9.930881154804446e-05, + "loss": 1.3006, + "step": 344 + }, + { + "epoch": 0.053292260300346554, + "grad_norm": 0.7322596311569214, + "learning_rate": 9.930076994888666e-05, + "loss": 1.1697, + "step": 346 + }, + { + "epoch": 0.0536003080477474, + "grad_norm": 0.64304518699646, + "learning_rate": 9.929268216946794e-05, + "loss": 1.2965, + "step": 348 + }, + { + "epoch": 0.05390835579514825, + "grad_norm": 0.706308126449585, + "learning_rate": 9.928454821736414e-05, + "loss": 1.1261, + "step": 350 + }, + { + "epoch": 0.05421640354254909, + "grad_norm": 0.6644442677497864, + "learning_rate": 9.927636810019441e-05, + "loss": 1.3967, + "step": 352 + }, + { + "epoch": 0.05452445128994994, + "grad_norm": 0.636601984500885, + "learning_rate": 9.926814182562108e-05, + "loss": 0.9961, + "step": 354 + }, + { + "epoch": 0.05483249903735079, + "grad_norm": 0.7085344195365906, + "learning_rate": 9.925986940134975e-05, + "loss": 0.9479, + "step": 356 + }, + { + "epoch": 0.05514054678475164, + "grad_norm": 0.8576806783676147, + "learning_rate": 9.925155083512922e-05, + "loss": 1.6484, + "step": 358 + }, + { + "epoch": 0.055448594532152486, + "grad_norm": 0.7348802089691162, + "learning_rate": 9.924318613475156e-05, + "loss": 2.0183, + "step": 360 + }, + { + "epoch": 0.055756642279553334, + "grad_norm": 0.6207509636878967, + "learning_rate": 9.923477530805199e-05, + "loss": 1.1584, + "step": 362 + }, + { + "epoch": 0.056064690026954175, + "grad_norm": 0.7382807731628418, + "learning_rate": 9.9226318362909e-05, + "loss": 1.4034, + "step": 364 + }, + { + "epoch": 0.056372737774355024, + "grad_norm": 0.6476181745529175, + "learning_rate": 9.921781530724421e-05, + "loss": 1.0029, + "step": 366 + }, + { + "epoch": 0.05668078552175587, + "grad_norm": 0.7093887329101562, + "learning_rate": 9.920926614902253e-05, + "loss": 1.9992, + "step": 368 + }, + { + "epoch": 0.05698883326915672, + "grad_norm": 0.7479656934738159, + "learning_rate": 9.920067089625194e-05, + "loss": 1.1022, + "step": 370 + }, + { + "epoch": 0.05729688101655757, + "grad_norm": 0.6984477043151855, + "learning_rate": 9.919202955698367e-05, + "loss": 1.2939, + "step": 372 + }, + { + "epoch": 0.05760492876395841, + "grad_norm": 0.6800373792648315, + "learning_rate": 9.918334213931214e-05, + "loss": 1.0686, + "step": 374 + }, + { + "epoch": 0.05791297651135926, + "grad_norm": 1.0920754671096802, + "learning_rate": 9.917460865137485e-05, + "loss": 1.1059, + "step": 376 + }, + { + "epoch": 0.05822102425876011, + "grad_norm": 0.7536227107048035, + "learning_rate": 9.916582910135252e-05, + "loss": 1.0106, + "step": 378 + }, + { + "epoch": 0.058529072006160955, + "grad_norm": 0.850918173789978, + "learning_rate": 9.915700349746898e-05, + "loss": 1.1953, + "step": 380 + }, + { + "epoch": 0.0588371197535618, + "grad_norm": 0.6495062112808228, + "learning_rate": 9.914813184799123e-05, + "loss": 1.0393, + "step": 382 + }, + { + "epoch": 0.05914516750096265, + "grad_norm": 0.6886342763900757, + "learning_rate": 9.913921416122937e-05, + "loss": 1.0439, + "step": 384 + }, + { + "epoch": 0.05945321524836349, + "grad_norm": 0.7080234885215759, + "learning_rate": 9.913025044553666e-05, + "loss": 0.9485, + "step": 386 + }, + { + "epoch": 0.05976126299576434, + "grad_norm": 0.5946139693260193, + "learning_rate": 9.912124070930943e-05, + "loss": 1.0163, + "step": 388 + }, + { + "epoch": 0.06006931074316519, + "grad_norm": 0.8572615385055542, + "learning_rate": 9.911218496098717e-05, + "loss": 1.3743, + "step": 390 + }, + { + "epoch": 0.06037735849056604, + "grad_norm": 0.7151904106140137, + "learning_rate": 9.91030832090524e-05, + "loss": 1.2767, + "step": 392 + }, + { + "epoch": 0.06068540623796689, + "grad_norm": 0.7724493145942688, + "learning_rate": 9.909393546203082e-05, + "loss": 1.1237, + "step": 394 + }, + { + "epoch": 0.060993453985367735, + "grad_norm": 1.2182974815368652, + "learning_rate": 9.908474172849114e-05, + "loss": 1.1877, + "step": 396 + }, + { + "epoch": 0.061301501732768576, + "grad_norm": 0.5870667695999146, + "learning_rate": 9.907550201704519e-05, + "loss": 1.1604, + "step": 398 + }, + { + "epoch": 0.061609549480169425, + "grad_norm": 0.6542190909385681, + "learning_rate": 9.906621633634782e-05, + "loss": 1.1505, + "step": 400 + }, + { + "epoch": 0.06191759722757027, + "grad_norm": 0.7185912132263184, + "learning_rate": 9.9056884695097e-05, + "loss": 1.1226, + "step": 402 + }, + { + "epoch": 0.06222564497497112, + "grad_norm": 0.802648663520813, + "learning_rate": 9.90475071020337e-05, + "loss": 1.075, + "step": 404 + }, + { + "epoch": 0.06253369272237197, + "grad_norm": 0.6152924299240112, + "learning_rate": 9.903808356594199e-05, + "loss": 1.1796, + "step": 406 + }, + { + "epoch": 0.06284174046977281, + "grad_norm": 0.8860695362091064, + "learning_rate": 9.90286140956489e-05, + "loss": 1.1769, + "step": 408 + }, + { + "epoch": 0.06314978821717367, + "grad_norm": 0.7648366093635559, + "learning_rate": 9.901909870002455e-05, + "loss": 1.1125, + "step": 410 + }, + { + "epoch": 0.06345783596457451, + "grad_norm": 0.708521842956543, + "learning_rate": 9.900953738798205e-05, + "loss": 1.9526, + "step": 412 + }, + { + "epoch": 0.06376588371197536, + "grad_norm": 0.5266025066375732, + "learning_rate": 9.899993016847753e-05, + "loss": 1.1469, + "step": 414 + }, + { + "epoch": 0.0640739314593762, + "grad_norm": 0.5720192193984985, + "learning_rate": 9.899027705051011e-05, + "loss": 1.0054, + "step": 416 + }, + { + "epoch": 0.06438197920677705, + "grad_norm": 0.7001132965087891, + "learning_rate": 9.89805780431219e-05, + "loss": 0.9696, + "step": 418 + }, + { + "epoch": 0.0646900269541779, + "grad_norm": 0.6034771800041199, + "learning_rate": 9.897083315539803e-05, + "loss": 1.0963, + "step": 420 + }, + { + "epoch": 0.06499807470157874, + "grad_norm": 0.6032156944274902, + "learning_rate": 9.896104239646658e-05, + "loss": 2.0431, + "step": 422 + }, + { + "epoch": 0.0653061224489796, + "grad_norm": 0.8947567939758301, + "learning_rate": 9.895120577549858e-05, + "loss": 1.1282, + "step": 424 + }, + { + "epoch": 0.06561417019638044, + "grad_norm": 0.6361309289932251, + "learning_rate": 9.894132330170805e-05, + "loss": 1.2933, + "step": 426 + }, + { + "epoch": 0.06592221794378128, + "grad_norm": 0.8116483092308044, + "learning_rate": 9.893139498435194e-05, + "loss": 1.1211, + "step": 428 + }, + { + "epoch": 0.06623026569118214, + "grad_norm": 0.594086766242981, + "learning_rate": 9.892142083273017e-05, + "loss": 0.8969, + "step": 430 + }, + { + "epoch": 0.06653831343858298, + "grad_norm": 0.45241397619247437, + "learning_rate": 9.891140085618555e-05, + "loss": 0.9593, + "step": 432 + }, + { + "epoch": 0.06684636118598383, + "grad_norm": 0.726063072681427, + "learning_rate": 9.890133506410386e-05, + "loss": 1.1369, + "step": 434 + }, + { + "epoch": 0.06715440893338467, + "grad_norm": 0.8245907425880432, + "learning_rate": 9.889122346591377e-05, + "loss": 1.1889, + "step": 436 + }, + { + "epoch": 0.06746245668078552, + "grad_norm": 0.5453060269355774, + "learning_rate": 9.888106607108687e-05, + "loss": 0.9628, + "step": 438 + }, + { + "epoch": 0.06777050442818637, + "grad_norm": 0.7849345803260803, + "learning_rate": 9.88708628891376e-05, + "loss": 1.1588, + "step": 440 + }, + { + "epoch": 0.06807855217558721, + "grad_norm": 0.8772358894348145, + "learning_rate": 9.886061392962336e-05, + "loss": 0.9983, + "step": 442 + }, + { + "epoch": 0.06838659992298807, + "grad_norm": 0.7637527585029602, + "learning_rate": 9.88503192021444e-05, + "loss": 1.2101, + "step": 444 + }, + { + "epoch": 0.06869464767038891, + "grad_norm": 0.9543507695198059, + "learning_rate": 9.883997871634383e-05, + "loss": 1.135, + "step": 446 + }, + { + "epoch": 0.06900269541778976, + "grad_norm": 0.6680512428283691, + "learning_rate": 9.882959248190764e-05, + "loss": 1.1381, + "step": 448 + }, + { + "epoch": 0.0693107431651906, + "grad_norm": 0.6533584594726562, + "learning_rate": 9.881916050856464e-05, + "loss": 0.9652, + "step": 450 + }, + { + "epoch": 0.06961879091259145, + "grad_norm": 0.9764670133590698, + "learning_rate": 9.880868280608654e-05, + "loss": 0.9112, + "step": 452 + }, + { + "epoch": 0.0699268386599923, + "grad_norm": 0.7887918949127197, + "learning_rate": 9.879815938428783e-05, + "loss": 1.0959, + "step": 454 + }, + { + "epoch": 0.07023488640739314, + "grad_norm": 0.723955512046814, + "learning_rate": 9.878759025302586e-05, + "loss": 1.118, + "step": 456 + }, + { + "epoch": 0.070542934154794, + "grad_norm": 0.8257218599319458, + "learning_rate": 9.877697542220078e-05, + "loss": 1.2431, + "step": 458 + }, + { + "epoch": 0.07085098190219484, + "grad_norm": 0.6132649779319763, + "learning_rate": 9.876631490175555e-05, + "loss": 0.9923, + "step": 460 + }, + { + "epoch": 0.07115902964959568, + "grad_norm": 0.9241247177124023, + "learning_rate": 9.875560870167594e-05, + "loss": 1.2972, + "step": 462 + }, + { + "epoch": 0.07146707739699654, + "grad_norm": 0.6281841993331909, + "learning_rate": 9.874485683199048e-05, + "loss": 0.9348, + "step": 464 + }, + { + "epoch": 0.07177512514439738, + "grad_norm": 0.6570271253585815, + "learning_rate": 9.87340593027705e-05, + "loss": 1.0261, + "step": 466 + }, + { + "epoch": 0.07208317289179823, + "grad_norm": 0.6641860008239746, + "learning_rate": 9.872321612413012e-05, + "loss": 1.0926, + "step": 468 + }, + { + "epoch": 0.07239122063919907, + "grad_norm": 0.6074772477149963, + "learning_rate": 9.871232730622618e-05, + "loss": 1.2501, + "step": 470 + }, + { + "epoch": 0.07269926838659992, + "grad_norm": 0.8022063970565796, + "learning_rate": 9.870139285925826e-05, + "loss": 1.1449, + "step": 472 + }, + { + "epoch": 0.07300731613400077, + "grad_norm": 0.783936619758606, + "learning_rate": 9.869041279346874e-05, + "loss": 1.1048, + "step": 474 + }, + { + "epoch": 0.07331536388140161, + "grad_norm": 0.8866376876831055, + "learning_rate": 9.867938711914269e-05, + "loss": 2.1867, + "step": 476 + }, + { + "epoch": 0.07362341162880247, + "grad_norm": 0.7495025992393494, + "learning_rate": 9.866831584660791e-05, + "loss": 1.0887, + "step": 478 + }, + { + "epoch": 0.07393145937620331, + "grad_norm": 0.5893321633338928, + "learning_rate": 9.86571989862349e-05, + "loss": 0.9163, + "step": 480 + }, + { + "epoch": 0.07423950712360416, + "grad_norm": 0.5532320737838745, + "learning_rate": 9.864603654843692e-05, + "loss": 1.0393, + "step": 482 + }, + { + "epoch": 0.074547554871005, + "grad_norm": 0.6310046911239624, + "learning_rate": 9.863482854366983e-05, + "loss": 1.244, + "step": 484 + }, + { + "epoch": 0.07485560261840585, + "grad_norm": 0.8252720832824707, + "learning_rate": 9.862357498243223e-05, + "loss": 1.3308, + "step": 486 + }, + { + "epoch": 0.0751636503658067, + "grad_norm": 0.820950984954834, + "learning_rate": 9.861227587526539e-05, + "loss": 1.222, + "step": 488 + }, + { + "epoch": 0.07547169811320754, + "grad_norm": 0.8616042733192444, + "learning_rate": 9.860093123275325e-05, + "loss": 1.0324, + "step": 490 + }, + { + "epoch": 0.0757797458606084, + "grad_norm": 0.68876051902771, + "learning_rate": 9.858954106552236e-05, + "loss": 1.3133, + "step": 492 + }, + { + "epoch": 0.07608779360800924, + "grad_norm": 0.66021329164505, + "learning_rate": 9.857810538424195e-05, + "loss": 0.9999, + "step": 494 + }, + { + "epoch": 0.07639584135541008, + "grad_norm": 0.5837455987930298, + "learning_rate": 9.85666241996239e-05, + "loss": 1.0609, + "step": 496 + }, + { + "epoch": 0.07670388910281094, + "grad_norm": 0.676024317741394, + "learning_rate": 9.855509752242267e-05, + "loss": 1.0629, + "step": 498 + }, + { + "epoch": 0.07701193685021178, + "grad_norm": 0.8846476674079895, + "learning_rate": 9.854352536343534e-05, + "loss": 1.3576, + "step": 500 + }, + { + "epoch": 0.07731998459761263, + "grad_norm": 0.6220236420631409, + "learning_rate": 9.853190773350164e-05, + "loss": 1.2044, + "step": 502 + }, + { + "epoch": 0.07762803234501348, + "grad_norm": 0.7261460423469543, + "learning_rate": 9.852024464350382e-05, + "loss": 1.1052, + "step": 504 + }, + { + "epoch": 0.07793608009241433, + "grad_norm": 0.660500168800354, + "learning_rate": 9.850853610436679e-05, + "loss": 1.1389, + "step": 506 + }, + { + "epoch": 0.07824412783981517, + "grad_norm": 0.7806474566459656, + "learning_rate": 9.849678212705796e-05, + "loss": 1.0534, + "step": 508 + }, + { + "epoch": 0.07855217558721601, + "grad_norm": 0.9472790956497192, + "learning_rate": 9.848498272258735e-05, + "loss": 1.0524, + "step": 510 + }, + { + "epoch": 0.07886022333461687, + "grad_norm": 0.5467221736907959, + "learning_rate": 9.847313790200751e-05, + "loss": 1.1608, + "step": 512 + }, + { + "epoch": 0.07916827108201771, + "grad_norm": 0.6035184860229492, + "learning_rate": 9.846124767641354e-05, + "loss": 0.9941, + "step": 514 + }, + { + "epoch": 0.07947631882941857, + "grad_norm": 0.46280941367149353, + "learning_rate": 9.844931205694308e-05, + "loss": 0.879, + "step": 516 + }, + { + "epoch": 0.07978436657681941, + "grad_norm": 0.8107156753540039, + "learning_rate": 9.843733105477628e-05, + "loss": 1.3569, + "step": 518 + }, + { + "epoch": 0.08009241432422025, + "grad_norm": 0.6657843589782715, + "learning_rate": 9.842530468113578e-05, + "loss": 0.9241, + "step": 520 + }, + { + "epoch": 0.0804004620716211, + "grad_norm": 0.6296514868736267, + "learning_rate": 9.841323294728675e-05, + "loss": 1.3311, + "step": 522 + }, + { + "epoch": 0.08070850981902195, + "grad_norm": 0.8303132057189941, + "learning_rate": 9.840111586453686e-05, + "loss": 1.178, + "step": 524 + }, + { + "epoch": 0.0810165575664228, + "grad_norm": 0.7254791855812073, + "learning_rate": 9.838895344423621e-05, + "loss": 1.0316, + "step": 526 + }, + { + "epoch": 0.08132460531382364, + "grad_norm": 0.8639194369316101, + "learning_rate": 9.837674569777742e-05, + "loss": 1.1379, + "step": 528 + }, + { + "epoch": 0.08163265306122448, + "grad_norm": 0.7651161551475525, + "learning_rate": 9.836449263659551e-05, + "loss": 1.3298, + "step": 530 + }, + { + "epoch": 0.08194070080862534, + "grad_norm": 0.6473088264465332, + "learning_rate": 9.835219427216801e-05, + "loss": 1.1758, + "step": 532 + }, + { + "epoch": 0.08224874855602618, + "grad_norm": 0.7255507111549377, + "learning_rate": 9.833985061601485e-05, + "loss": 1.2247, + "step": 534 + }, + { + "epoch": 0.08255679630342704, + "grad_norm": 0.6490479707717896, + "learning_rate": 9.832746167969837e-05, + "loss": 0.9711, + "step": 536 + }, + { + "epoch": 0.08286484405082788, + "grad_norm": 0.6560536026954651, + "learning_rate": 9.831502747482337e-05, + "loss": 1.0156, + "step": 538 + }, + { + "epoch": 0.08317289179822873, + "grad_norm": 0.6243906021118164, + "learning_rate": 9.830254801303702e-05, + "loss": 1.4298, + "step": 540 + }, + { + "epoch": 0.08348093954562957, + "grad_norm": 0.849922239780426, + "learning_rate": 9.829002330602888e-05, + "loss": 1.2591, + "step": 542 + }, + { + "epoch": 0.08378898729303041, + "grad_norm": 0.8038306832313538, + "learning_rate": 9.827745336553092e-05, + "loss": 1.1405, + "step": 544 + }, + { + "epoch": 0.08409703504043127, + "grad_norm": 0.7083746194839478, + "learning_rate": 9.826483820331743e-05, + "loss": 1.1005, + "step": 546 + }, + { + "epoch": 0.08440508278783211, + "grad_norm": 0.643138587474823, + "learning_rate": 9.825217783120513e-05, + "loss": 0.8967, + "step": 548 + }, + { + "epoch": 0.08471313053523297, + "grad_norm": 0.5439552068710327, + "learning_rate": 9.823947226105302e-05, + "loss": 1.805, + "step": 550 + }, + { + "epoch": 0.08502117828263381, + "grad_norm": 1.0784913301467896, + "learning_rate": 9.822672150476249e-05, + "loss": 1.3968, + "step": 552 + }, + { + "epoch": 0.08532922603003465, + "grad_norm": 0.6794597506523132, + "learning_rate": 9.82139255742772e-05, + "loss": 0.9096, + "step": 554 + }, + { + "epoch": 0.0856372737774355, + "grad_norm": 0.7618829607963562, + "learning_rate": 9.820108448158319e-05, + "loss": 1.2315, + "step": 556 + }, + { + "epoch": 0.08594532152483635, + "grad_norm": 0.6738618612289429, + "learning_rate": 9.818819823870876e-05, + "loss": 0.9266, + "step": 558 + }, + { + "epoch": 0.0862533692722372, + "grad_norm": 0.6213587522506714, + "learning_rate": 9.817526685772452e-05, + "loss": 0.8314, + "step": 560 + }, + { + "epoch": 0.08656141701963804, + "grad_norm": 0.6162586808204651, + "learning_rate": 9.816229035074334e-05, + "loss": 0.9922, + "step": 562 + }, + { + "epoch": 0.08686946476703888, + "grad_norm": 0.9549304246902466, + "learning_rate": 9.814926872992038e-05, + "loss": 1.0369, + "step": 564 + }, + { + "epoch": 0.08717751251443974, + "grad_norm": 0.919792890548706, + "learning_rate": 9.813620200745307e-05, + "loss": 1.1811, + "step": 566 + }, + { + "epoch": 0.08748556026184058, + "grad_norm": 0.6474389433860779, + "learning_rate": 9.812309019558103e-05, + "loss": 0.9927, + "step": 568 + }, + { + "epoch": 0.08779360800924144, + "grad_norm": 0.7442201375961304, + "learning_rate": 9.81099333065862e-05, + "loss": 1.2592, + "step": 570 + }, + { + "epoch": 0.08810165575664228, + "grad_norm": 0.7177843451499939, + "learning_rate": 9.809673135279268e-05, + "loss": 1.1623, + "step": 572 + }, + { + "epoch": 0.08840970350404313, + "grad_norm": 0.6520625352859497, + "learning_rate": 9.80834843465668e-05, + "loss": 0.9876, + "step": 574 + }, + { + "epoch": 0.08871775125144397, + "grad_norm": 0.796640157699585, + "learning_rate": 9.807019230031708e-05, + "loss": 1.2678, + "step": 576 + }, + { + "epoch": 0.08902579899884482, + "grad_norm": 0.6173284649848938, + "learning_rate": 9.805685522649428e-05, + "loss": 0.9507, + "step": 578 + }, + { + "epoch": 0.08933384674624567, + "grad_norm": 0.6731142401695251, + "learning_rate": 9.804347313759126e-05, + "loss": 1.2456, + "step": 580 + }, + { + "epoch": 0.08964189449364651, + "grad_norm": 0.821448802947998, + "learning_rate": 9.80300460461431e-05, + "loss": 1.1822, + "step": 582 + }, + { + "epoch": 0.08994994224104737, + "grad_norm": 0.7288122773170471, + "learning_rate": 9.801657396472702e-05, + "loss": 2.3747, + "step": 584 + }, + { + "epoch": 0.09025798998844821, + "grad_norm": 0.6716973781585693, + "learning_rate": 9.800305690596238e-05, + "loss": 1.1216, + "step": 586 + }, + { + "epoch": 0.09056603773584905, + "grad_norm": 0.627241849899292, + "learning_rate": 9.798949488251068e-05, + "loss": 1.2473, + "step": 588 + }, + { + "epoch": 0.0908740854832499, + "grad_norm": 0.7980815172195435, + "learning_rate": 9.797588790707551e-05, + "loss": 1.0989, + "step": 590 + }, + { + "epoch": 0.09118213323065075, + "grad_norm": 0.893456220626831, + "learning_rate": 9.796223599240262e-05, + "loss": 1.3478, + "step": 592 + }, + { + "epoch": 0.0914901809780516, + "grad_norm": 0.8465490341186523, + "learning_rate": 9.794853915127978e-05, + "loss": 1.0439, + "step": 594 + }, + { + "epoch": 0.09179822872545244, + "grad_norm": 0.7354926466941833, + "learning_rate": 9.793479739653692e-05, + "loss": 1.5374, + "step": 596 + }, + { + "epoch": 0.0921062764728533, + "grad_norm": 0.7580827474594116, + "learning_rate": 9.792101074104598e-05, + "loss": 1.0673, + "step": 598 + }, + { + "epoch": 0.09241432422025414, + "grad_norm": 0.7354786396026611, + "learning_rate": 9.790717919772102e-05, + "loss": 1.0498, + "step": 600 + }, + { + "epoch": 0.09241432422025414, + "eval_loss": 2.5175342559814453, + "eval_runtime": 736.9022, + "eval_samples_per_second": 2.714, + "eval_steps_per_second": 0.679, + "step": 600 + }, + { + "epoch": 0.09272237196765498, + "grad_norm": 0.9065431952476501, + "learning_rate": 9.789330277951807e-05, + "loss": 1.0872, + "step": 602 + }, + { + "epoch": 0.09303041971505584, + "grad_norm": 0.5320457816123962, + "learning_rate": 9.787938149943525e-05, + "loss": 3.4212, + "step": 604 + }, + { + "epoch": 0.09333846746245668, + "grad_norm": 0.693964421749115, + "learning_rate": 9.78654153705127e-05, + "loss": 1.0635, + "step": 606 + }, + { + "epoch": 0.09364651520985753, + "grad_norm": 0.7583566904067993, + "learning_rate": 9.785140440583256e-05, + "loss": 1.0579, + "step": 608 + }, + { + "epoch": 0.09395456295725838, + "grad_norm": 0.5443295240402222, + "learning_rate": 9.783734861851895e-05, + "loss": 2.0752, + "step": 610 + }, + { + "epoch": 0.09426261070465922, + "grad_norm": 0.7155397534370422, + "learning_rate": 9.7823248021738e-05, + "loss": 0.9966, + "step": 612 + }, + { + "epoch": 0.09457065845206007, + "grad_norm": 0.6753432154655457, + "learning_rate": 9.780910262869779e-05, + "loss": 1.8116, + "step": 614 + }, + { + "epoch": 0.09487870619946091, + "grad_norm": 0.647979199886322, + "learning_rate": 9.77949124526484e-05, + "loss": 0.8303, + "step": 616 + }, + { + "epoch": 0.09518675394686177, + "grad_norm": 0.7311884164810181, + "learning_rate": 9.77806775068818e-05, + "loss": 1.2308, + "step": 618 + }, + { + "epoch": 0.09549480169426261, + "grad_norm": 0.8863371014595032, + "learning_rate": 9.776639780473198e-05, + "loss": 1.2158, + "step": 620 + }, + { + "epoch": 0.09580284944166345, + "grad_norm": 0.8108947277069092, + "learning_rate": 9.775207335957476e-05, + "loss": 1.157, + "step": 622 + }, + { + "epoch": 0.0961108971890643, + "grad_norm": 0.790622353553772, + "learning_rate": 9.773770418482792e-05, + "loss": 0.9097, + "step": 624 + }, + { + "epoch": 0.09641894493646515, + "grad_norm": 1.0200419425964355, + "learning_rate": 9.772329029395116e-05, + "loss": 1.3464, + "step": 626 + }, + { + "epoch": 0.096726992683866, + "grad_norm": 0.7055392265319824, + "learning_rate": 9.770883170044603e-05, + "loss": 1.2125, + "step": 628 + }, + { + "epoch": 0.09703504043126684, + "grad_norm": 0.5598780512809753, + "learning_rate": 9.769432841785593e-05, + "loss": 1.185, + "step": 630 + }, + { + "epoch": 0.0973430881786677, + "grad_norm": 0.6309000849723816, + "learning_rate": 9.767978045976618e-05, + "loss": 1.229, + "step": 632 + }, + { + "epoch": 0.09765113592606854, + "grad_norm": 0.7420125007629395, + "learning_rate": 9.766518783980393e-05, + "loss": 1.0318, + "step": 634 + }, + { + "epoch": 0.09795918367346938, + "grad_norm": 0.7199630737304688, + "learning_rate": 9.765055057163813e-05, + "loss": 1.2673, + "step": 636 + }, + { + "epoch": 0.09826723142087024, + "grad_norm": 0.76442950963974, + "learning_rate": 9.763586866897959e-05, + "loss": 1.4397, + "step": 638 + }, + { + "epoch": 0.09857527916827108, + "grad_norm": 0.8405647873878479, + "learning_rate": 9.762114214558092e-05, + "loss": 0.9605, + "step": 640 + }, + { + "epoch": 0.09888332691567193, + "grad_norm": 0.5300808548927307, + "learning_rate": 9.76063710152365e-05, + "loss": 1.0489, + "step": 642 + }, + { + "epoch": 0.09919137466307278, + "grad_norm": 0.7019151449203491, + "learning_rate": 9.759155529178256e-05, + "loss": 1.0425, + "step": 644 + }, + { + "epoch": 0.09949942241047362, + "grad_norm": 0.6637164950370789, + "learning_rate": 9.757669498909701e-05, + "loss": 1.5364, + "step": 646 + }, + { + "epoch": 0.09980747015787447, + "grad_norm": 0.5824893712997437, + "learning_rate": 9.756179012109961e-05, + "loss": 1.8955, + "step": 648 + }, + { + "epoch": 0.10011551790527531, + "grad_norm": 0.7904444932937622, + "learning_rate": 9.754684070175178e-05, + "loss": 1.2757, + "step": 650 + }, + { + "epoch": 0.10042356565267617, + "grad_norm": 0.8260906338691711, + "learning_rate": 9.753184674505672e-05, + "loss": 1.0885, + "step": 652 + }, + { + "epoch": 0.10073161340007701, + "grad_norm": 0.7655699253082275, + "learning_rate": 9.751680826505935e-05, + "loss": 1.3115, + "step": 654 + }, + { + "epoch": 0.10103966114747785, + "grad_norm": 0.9455386400222778, + "learning_rate": 9.750172527584628e-05, + "loss": 1.1202, + "step": 656 + }, + { + "epoch": 0.10134770889487871, + "grad_norm": 0.6057466864585876, + "learning_rate": 9.748659779154583e-05, + "loss": 1.3526, + "step": 658 + }, + { + "epoch": 0.10165575664227955, + "grad_norm": 0.5363863706588745, + "learning_rate": 9.747142582632795e-05, + "loss": 0.8425, + "step": 660 + }, + { + "epoch": 0.1019638043896804, + "grad_norm": 0.6003315448760986, + "learning_rate": 9.745620939440433e-05, + "loss": 1.3913, + "step": 662 + }, + { + "epoch": 0.10227185213708125, + "grad_norm": 0.6269535422325134, + "learning_rate": 9.744094851002825e-05, + "loss": 1.0703, + "step": 664 + }, + { + "epoch": 0.1025798998844821, + "grad_norm": 1.0587024688720703, + "learning_rate": 9.742564318749465e-05, + "loss": 1.2466, + "step": 666 + }, + { + "epoch": 0.10288794763188294, + "grad_norm": 0.966397762298584, + "learning_rate": 9.741029344114011e-05, + "loss": 1.1399, + "step": 668 + }, + { + "epoch": 0.10319599537928378, + "grad_norm": 0.8659431338310242, + "learning_rate": 9.73948992853428e-05, + "loss": 1.2475, + "step": 670 + }, + { + "epoch": 0.10350404312668464, + "grad_norm": 0.6009111404418945, + "learning_rate": 9.737946073452249e-05, + "loss": 0.9601, + "step": 672 + }, + { + "epoch": 0.10381209087408548, + "grad_norm": 0.7170124650001526, + "learning_rate": 9.736397780314056e-05, + "loss": 1.0488, + "step": 674 + }, + { + "epoch": 0.10412013862148634, + "grad_norm": 0.9177563190460205, + "learning_rate": 9.734845050569994e-05, + "loss": 1.3932, + "step": 676 + }, + { + "epoch": 0.10442818636888718, + "grad_norm": 0.5799642205238342, + "learning_rate": 9.733287885674512e-05, + "loss": 2.0052, + "step": 678 + }, + { + "epoch": 0.10473623411628802, + "grad_norm": 0.6074812412261963, + "learning_rate": 9.731726287086211e-05, + "loss": 1.152, + "step": 680 + }, + { + "epoch": 0.10504428186368887, + "grad_norm": 0.673645555973053, + "learning_rate": 9.730160256267853e-05, + "loss": 1.3506, + "step": 682 + }, + { + "epoch": 0.10535232961108972, + "grad_norm": 0.8519735336303711, + "learning_rate": 9.728589794686342e-05, + "loss": 1.2018, + "step": 684 + }, + { + "epoch": 0.10566037735849057, + "grad_norm": 0.935734748840332, + "learning_rate": 9.727014903812736e-05, + "loss": 1.2358, + "step": 686 + }, + { + "epoch": 0.10596842510589141, + "grad_norm": 0.7728151082992554, + "learning_rate": 9.725435585122249e-05, + "loss": 2.1438, + "step": 688 + }, + { + "epoch": 0.10627647285329227, + "grad_norm": 0.6737799644470215, + "learning_rate": 9.72385184009423e-05, + "loss": 1.0234, + "step": 690 + }, + { + "epoch": 0.10658452060069311, + "grad_norm": 0.7550671696662903, + "learning_rate": 9.722263670212181e-05, + "loss": 1.1886, + "step": 692 + }, + { + "epoch": 0.10689256834809395, + "grad_norm": 0.4803139269351959, + "learning_rate": 9.72067107696375e-05, + "loss": 1.1422, + "step": 694 + }, + { + "epoch": 0.1072006160954948, + "grad_norm": 0.7486079931259155, + "learning_rate": 9.719074061840726e-05, + "loss": 0.998, + "step": 696 + }, + { + "epoch": 0.10750866384289565, + "grad_norm": 0.7024034261703491, + "learning_rate": 9.717472626339041e-05, + "loss": 1.1791, + "step": 698 + }, + { + "epoch": 0.1078167115902965, + "grad_norm": 0.7519829273223877, + "learning_rate": 9.715866771958766e-05, + "loss": 1.3896, + "step": 700 + }, + { + "epoch": 0.10812475933769734, + "grad_norm": 0.8522893190383911, + "learning_rate": 9.714256500204112e-05, + "loss": 0.981, + "step": 702 + }, + { + "epoch": 0.10843280708509818, + "grad_norm": 0.6957355737686157, + "learning_rate": 9.71264181258343e-05, + "loss": 0.9747, + "step": 704 + }, + { + "epoch": 0.10874085483249904, + "grad_norm": 0.6734793186187744, + "learning_rate": 9.711022710609204e-05, + "loss": 1.1019, + "step": 706 + }, + { + "epoch": 0.10904890257989988, + "grad_norm": 1.0916528701782227, + "learning_rate": 9.709399195798055e-05, + "loss": 2.5579, + "step": 708 + }, + { + "epoch": 0.10935695032730074, + "grad_norm": 0.662269651889801, + "learning_rate": 9.707771269670736e-05, + "loss": 1.076, + "step": 710 + }, + { + "epoch": 0.10966499807470158, + "grad_norm": 0.6294137835502625, + "learning_rate": 9.706138933752134e-05, + "loss": 0.9947, + "step": 712 + }, + { + "epoch": 0.10997304582210242, + "grad_norm": 0.7161690592765808, + "learning_rate": 9.704502189571262e-05, + "loss": 1.3204, + "step": 714 + }, + { + "epoch": 0.11028109356950327, + "grad_norm": 0.7409554719924927, + "learning_rate": 9.702861038661273e-05, + "loss": 1.0797, + "step": 716 + }, + { + "epoch": 0.11058914131690412, + "grad_norm": 0.5623186826705933, + "learning_rate": 9.701215482559436e-05, + "loss": 1.0801, + "step": 718 + }, + { + "epoch": 0.11089718906430497, + "grad_norm": 0.6551181674003601, + "learning_rate": 9.699565522807151e-05, + "loss": 0.9734, + "step": 720 + }, + { + "epoch": 0.11120523681170581, + "grad_norm": 0.9996515512466431, + "learning_rate": 9.697911160949944e-05, + "loss": 1.109, + "step": 722 + }, + { + "epoch": 0.11151328455910667, + "grad_norm": 0.6170802712440491, + "learning_rate": 9.696252398537462e-05, + "loss": 1.0989, + "step": 724 + }, + { + "epoch": 0.11182133230650751, + "grad_norm": 0.8524353504180908, + "learning_rate": 9.69458923712348e-05, + "loss": 1.8914, + "step": 726 + }, + { + "epoch": 0.11212938005390835, + "grad_norm": 0.5866247415542603, + "learning_rate": 9.692921678265883e-05, + "loss": 1.0485, + "step": 728 + }, + { + "epoch": 0.1124374278013092, + "grad_norm": 0.7652002573013306, + "learning_rate": 9.691249723526683e-05, + "loss": 1.0219, + "step": 730 + }, + { + "epoch": 0.11274547554871005, + "grad_norm": 0.8469516038894653, + "learning_rate": 9.689573374472011e-05, + "loss": 1.249, + "step": 732 + }, + { + "epoch": 0.1130535232961109, + "grad_norm": 0.9973403811454773, + "learning_rate": 9.687892632672109e-05, + "loss": 1.088, + "step": 734 + }, + { + "epoch": 0.11336157104351174, + "grad_norm": 0.7604619264602661, + "learning_rate": 9.686207499701334e-05, + "loss": 1.1157, + "step": 736 + }, + { + "epoch": 0.11366961879091259, + "grad_norm": 0.7274996638298035, + "learning_rate": 9.684517977138159e-05, + "loss": 0.9415, + "step": 738 + }, + { + "epoch": 0.11397766653831344, + "grad_norm": 0.6756743788719177, + "learning_rate": 9.682824066565168e-05, + "loss": 0.9793, + "step": 740 + }, + { + "epoch": 0.11428571428571428, + "grad_norm": 0.9891169667243958, + "learning_rate": 9.681125769569056e-05, + "loss": 0.965, + "step": 742 + }, + { + "epoch": 0.11459376203311514, + "grad_norm": 0.8296619653701782, + "learning_rate": 9.679423087740625e-05, + "loss": 1.2816, + "step": 744 + }, + { + "epoch": 0.11490180978051598, + "grad_norm": 0.6557073593139648, + "learning_rate": 9.677716022674783e-05, + "loss": 1.0373, + "step": 746 + }, + { + "epoch": 0.11520985752791682, + "grad_norm": 0.7231959700584412, + "learning_rate": 9.676004575970547e-05, + "loss": 1.0382, + "step": 748 + }, + { + "epoch": 0.11551790527531768, + "grad_norm": 0.6279606223106384, + "learning_rate": 9.67428874923104e-05, + "loss": 1.2397, + "step": 750 + }, + { + "epoch": 0.11582595302271852, + "grad_norm": 0.7001874446868896, + "learning_rate": 9.67256854406348e-05, + "loss": 1.1748, + "step": 752 + }, + { + "epoch": 0.11613400077011937, + "grad_norm": 0.829440712928772, + "learning_rate": 9.670843962079194e-05, + "loss": 0.9886, + "step": 754 + }, + { + "epoch": 0.11644204851752021, + "grad_norm": 0.7666531205177307, + "learning_rate": 9.669115004893606e-05, + "loss": 0.9669, + "step": 756 + }, + { + "epoch": 0.11675009626492107, + "grad_norm": 0.578072190284729, + "learning_rate": 9.667381674126238e-05, + "loss": 2.1085, + "step": 758 + }, + { + "epoch": 0.11705814401232191, + "grad_norm": 0.9283096194267273, + "learning_rate": 9.665643971400709e-05, + "loss": 2.3508, + "step": 760 + }, + { + "epoch": 0.11736619175972275, + "grad_norm": 0.7107291221618652, + "learning_rate": 9.663901898344732e-05, + "loss": 0.9454, + "step": 762 + }, + { + "epoch": 0.1176742395071236, + "grad_norm": 0.7699968218803406, + "learning_rate": 9.662155456590116e-05, + "loss": 1.0085, + "step": 764 + }, + { + "epoch": 0.11798228725452445, + "grad_norm": 0.688955545425415, + "learning_rate": 9.660404647772763e-05, + "loss": 0.9659, + "step": 766 + }, + { + "epoch": 0.1182903350019253, + "grad_norm": 0.7579169273376465, + "learning_rate": 9.65864947353266e-05, + "loss": 1.2011, + "step": 768 + }, + { + "epoch": 0.11859838274932614, + "grad_norm": 0.6933251023292542, + "learning_rate": 9.656889935513889e-05, + "loss": 1.0996, + "step": 770 + }, + { + "epoch": 0.11890643049672699, + "grad_norm": 0.8795177340507507, + "learning_rate": 9.655126035364617e-05, + "loss": 1.0225, + "step": 772 + }, + { + "epoch": 0.11921447824412784, + "grad_norm": 0.731309711933136, + "learning_rate": 9.6533577747371e-05, + "loss": 1.0972, + "step": 774 + }, + { + "epoch": 0.11952252599152868, + "grad_norm": 0.5974037051200867, + "learning_rate": 9.651585155287671e-05, + "loss": 0.9594, + "step": 776 + }, + { + "epoch": 0.11983057373892954, + "grad_norm": 0.869696855545044, + "learning_rate": 9.649808178676755e-05, + "loss": 1.07, + "step": 778 + }, + { + "epoch": 0.12013862148633038, + "grad_norm": 0.5604158043861389, + "learning_rate": 9.648026846568853e-05, + "loss": 1.7745, + "step": 780 + }, + { + "epoch": 0.12044666923373124, + "grad_norm": 0.8833852410316467, + "learning_rate": 9.646241160632547e-05, + "loss": 2.3886, + "step": 782 + }, + { + "epoch": 0.12075471698113208, + "grad_norm": 0.6206603050231934, + "learning_rate": 9.644451122540496e-05, + "loss": 0.9324, + "step": 784 + }, + { + "epoch": 0.12106276472853292, + "grad_norm": 0.8402155041694641, + "learning_rate": 9.642656733969439e-05, + "loss": 1.0746, + "step": 786 + }, + { + "epoch": 0.12137081247593377, + "grad_norm": 0.8293070197105408, + "learning_rate": 9.640857996600188e-05, + "loss": 2.127, + "step": 788 + }, + { + "epoch": 0.12167886022333461, + "grad_norm": 0.6803492903709412, + "learning_rate": 9.639054912117628e-05, + "loss": 0.9553, + "step": 790 + }, + { + "epoch": 0.12198690797073547, + "grad_norm": 0.817302942276001, + "learning_rate": 9.637247482210716e-05, + "loss": 1.1407, + "step": 792 + }, + { + "epoch": 0.12229495571813631, + "grad_norm": 0.7530799508094788, + "learning_rate": 9.635435708572482e-05, + "loss": 0.9892, + "step": 794 + }, + { + "epoch": 0.12260300346553715, + "grad_norm": 0.6724453568458557, + "learning_rate": 9.63361959290002e-05, + "loss": 1.0774, + "step": 796 + }, + { + "epoch": 0.12291105121293801, + "grad_norm": 0.7561922073364258, + "learning_rate": 9.631799136894498e-05, + "loss": 1.0483, + "step": 798 + }, + { + "epoch": 0.12321909896033885, + "grad_norm": 0.7706962823867798, + "learning_rate": 9.629974342261142e-05, + "loss": 2.1088, + "step": 800 + }, + { + "epoch": 0.1235271467077397, + "grad_norm": 1.036582350730896, + "learning_rate": 9.628145210709245e-05, + "loss": 1.1936, + "step": 802 + }, + { + "epoch": 0.12383519445514055, + "grad_norm": 0.9285162091255188, + "learning_rate": 9.626311743952167e-05, + "loss": 1.2731, + "step": 804 + }, + { + "epoch": 0.12414324220254139, + "grad_norm": 0.8123614192008972, + "learning_rate": 9.624473943707321e-05, + "loss": 1.0282, + "step": 806 + }, + { + "epoch": 0.12445128994994224, + "grad_norm": 0.6215323805809021, + "learning_rate": 9.622631811696187e-05, + "loss": 1.0445, + "step": 808 + }, + { + "epoch": 0.12475933769734308, + "grad_norm": 0.6055769324302673, + "learning_rate": 9.620785349644296e-05, + "loss": 0.9747, + "step": 810 + }, + { + "epoch": 0.12506738544474394, + "grad_norm": 0.5595365166664124, + "learning_rate": 9.618934559281237e-05, + "loss": 1.0832, + "step": 812 + }, + { + "epoch": 0.12537543319214478, + "grad_norm": 0.7242071628570557, + "learning_rate": 9.617079442340656e-05, + "loss": 1.161, + "step": 814 + }, + { + "epoch": 0.12568348093954562, + "grad_norm": 0.7436997294425964, + "learning_rate": 9.615220000560248e-05, + "loss": 1.0146, + "step": 816 + }, + { + "epoch": 0.12599152868694646, + "grad_norm": 0.569721519947052, + "learning_rate": 9.613356235681762e-05, + "loss": 0.9365, + "step": 818 + }, + { + "epoch": 0.12629957643434733, + "grad_norm": 0.9639449715614319, + "learning_rate": 9.611488149450995e-05, + "loss": 0.9866, + "step": 820 + }, + { + "epoch": 0.12660762418174817, + "grad_norm": 0.7013627886772156, + "learning_rate": 9.60961574361779e-05, + "loss": 1.0161, + "step": 822 + }, + { + "epoch": 0.12691567192914902, + "grad_norm": 0.735083281993866, + "learning_rate": 9.607739019936042e-05, + "loss": 1.1411, + "step": 824 + }, + { + "epoch": 0.12722371967654986, + "grad_norm": 0.6909815669059753, + "learning_rate": 9.605857980163684e-05, + "loss": 0.9881, + "step": 826 + }, + { + "epoch": 0.12753176742395073, + "grad_norm": 0.7418811321258545, + "learning_rate": 9.603972626062696e-05, + "loss": 1.2712, + "step": 828 + }, + { + "epoch": 0.12783981517135157, + "grad_norm": 0.5920977592468262, + "learning_rate": 9.602082959399098e-05, + "loss": 1.0488, + "step": 830 + }, + { + "epoch": 0.1281478629187524, + "grad_norm": 0.7561302781105042, + "learning_rate": 9.600188981942947e-05, + "loss": 1.1267, + "step": 832 + }, + { + "epoch": 0.12845591066615325, + "grad_norm": 0.7126810550689697, + "learning_rate": 9.598290695468346e-05, + "loss": 0.9252, + "step": 834 + }, + { + "epoch": 0.1287639584135541, + "grad_norm": 0.7497100234031677, + "learning_rate": 9.596388101753422e-05, + "loss": 2.0032, + "step": 836 + }, + { + "epoch": 0.12907200616095496, + "grad_norm": 0.6821487545967102, + "learning_rate": 9.594481202580349e-05, + "loss": 1.6951, + "step": 838 + }, + { + "epoch": 0.1293800539083558, + "grad_norm": 0.8109884858131409, + "learning_rate": 9.592569999735325e-05, + "loss": 1.0516, + "step": 840 + }, + { + "epoch": 0.12968810165575664, + "grad_norm": 0.6355708837509155, + "learning_rate": 9.590654495008586e-05, + "loss": 0.9621, + "step": 842 + }, + { + "epoch": 0.12999614940315748, + "grad_norm": 0.574379563331604, + "learning_rate": 9.58873469019439e-05, + "loss": 1.0012, + "step": 844 + }, + { + "epoch": 0.13030419715055833, + "grad_norm": 0.8498492240905762, + "learning_rate": 9.58681058709103e-05, + "loss": 1.2151, + "step": 846 + }, + { + "epoch": 0.1306122448979592, + "grad_norm": 1.3300247192382812, + "learning_rate": 9.584882187500822e-05, + "loss": 1.2963, + "step": 848 + }, + { + "epoch": 0.13092029264536004, + "grad_norm": 0.6958180069923401, + "learning_rate": 9.582949493230104e-05, + "loss": 1.8582, + "step": 850 + }, + { + "epoch": 0.13122834039276088, + "grad_norm": 0.7322615385055542, + "learning_rate": 9.581012506089243e-05, + "loss": 1.1858, + "step": 852 + }, + { + "epoch": 0.13153638814016172, + "grad_norm": 0.6359802484512329, + "learning_rate": 9.579071227892625e-05, + "loss": 2.001, + "step": 854 + }, + { + "epoch": 0.13184443588756256, + "grad_norm": 0.9646987318992615, + "learning_rate": 9.577125660458649e-05, + "loss": 1.5951, + "step": 856 + }, + { + "epoch": 0.13215248363496343, + "grad_norm": 0.5302859544754028, + "learning_rate": 9.575175805609741e-05, + "loss": 1.0739, + "step": 858 + }, + { + "epoch": 0.13246053138236427, + "grad_norm": 1.0131480693817139, + "learning_rate": 9.57322166517234e-05, + "loss": 1.2304, + "step": 860 + }, + { + "epoch": 0.1327685791297651, + "grad_norm": 0.762199878692627, + "learning_rate": 9.571263240976897e-05, + "loss": 1.0024, + "step": 862 + }, + { + "epoch": 0.13307662687716595, + "grad_norm": 0.6496574282646179, + "learning_rate": 9.569300534857875e-05, + "loss": 1.2726, + "step": 864 + }, + { + "epoch": 0.1333846746245668, + "grad_norm": 0.5563488602638245, + "learning_rate": 9.567333548653753e-05, + "loss": 1.2358, + "step": 866 + }, + { + "epoch": 0.13369272237196766, + "grad_norm": 0.6338052153587341, + "learning_rate": 9.565362284207016e-05, + "loss": 1.1254, + "step": 868 + }, + { + "epoch": 0.1340007701193685, + "grad_norm": 0.8334317207336426, + "learning_rate": 9.563386743364156e-05, + "loss": 1.1189, + "step": 870 + }, + { + "epoch": 0.13430881786676935, + "grad_norm": 0.6131289601325989, + "learning_rate": 9.561406927975669e-05, + "loss": 0.9274, + "step": 872 + }, + { + "epoch": 0.1346168656141702, + "grad_norm": 0.5694435238838196, + "learning_rate": 9.559422839896061e-05, + "loss": 1.1181, + "step": 874 + }, + { + "epoch": 0.13492491336157103, + "grad_norm": 1.0695499181747437, + "learning_rate": 9.557434480983833e-05, + "loss": 1.318, + "step": 876 + }, + { + "epoch": 0.1352329611089719, + "grad_norm": 0.6700896620750427, + "learning_rate": 9.555441853101494e-05, + "loss": 0.9522, + "step": 878 + }, + { + "epoch": 0.13554100885637274, + "grad_norm": 0.7727726697921753, + "learning_rate": 9.553444958115545e-05, + "loss": 1.0807, + "step": 880 + }, + { + "epoch": 0.13584905660377358, + "grad_norm": 0.7434378266334534, + "learning_rate": 9.551443797896487e-05, + "loss": 1.0798, + "step": 882 + }, + { + "epoch": 0.13615710435117442, + "grad_norm": 0.7013871073722839, + "learning_rate": 9.549438374318818e-05, + "loss": 1.3833, + "step": 884 + }, + { + "epoch": 0.13646515209857527, + "grad_norm": 0.5864063501358032, + "learning_rate": 9.547428689261024e-05, + "loss": 1.2405, + "step": 886 + }, + { + "epoch": 0.13677319984597613, + "grad_norm": 0.6030333638191223, + "learning_rate": 9.54541474460559e-05, + "loss": 1.056, + "step": 888 + }, + { + "epoch": 0.13708124759337698, + "grad_norm": 0.8067970275878906, + "learning_rate": 9.543396542238986e-05, + "loss": 1.1467, + "step": 890 + }, + { + "epoch": 0.13738929534077782, + "grad_norm": 0.7694763541221619, + "learning_rate": 9.541374084051673e-05, + "loss": 1.2326, + "step": 892 + }, + { + "epoch": 0.13769734308817866, + "grad_norm": 0.9057612419128418, + "learning_rate": 9.539347371938093e-05, + "loss": 1.1664, + "step": 894 + }, + { + "epoch": 0.13800539083557953, + "grad_norm": 0.9390200972557068, + "learning_rate": 9.537316407796681e-05, + "loss": 1.0607, + "step": 896 + }, + { + "epoch": 0.13831343858298037, + "grad_norm": 0.7872670292854309, + "learning_rate": 9.535281193529849e-05, + "loss": 1.8178, + "step": 898 + }, + { + "epoch": 0.1386214863303812, + "grad_norm": 0.7058428525924683, + "learning_rate": 9.53324173104399e-05, + "loss": 0.9798, + "step": 900 + }, + { + "epoch": 0.1386214863303812, + "eval_loss": 2.5482165813446045, + "eval_runtime": 736.9952, + "eval_samples_per_second": 2.714, + "eval_steps_per_second": 0.678, + "step": 900 + }, + { + "epoch": 0.13892953407778205, + "grad_norm": 0.9759823083877563, + "learning_rate": 9.531198022249479e-05, + "loss": 1.473, + "step": 902 + }, + { + "epoch": 0.1392375818251829, + "grad_norm": 1.039426326751709, + "learning_rate": 9.52915006906067e-05, + "loss": 1.4844, + "step": 904 + }, + { + "epoch": 0.13954562957258376, + "grad_norm": 0.5784148573875427, + "learning_rate": 9.527097873395887e-05, + "loss": 1.0574, + "step": 906 + }, + { + "epoch": 0.1398536773199846, + "grad_norm": 0.7337266802787781, + "learning_rate": 9.525041437177433e-05, + "loss": 1.1303, + "step": 908 + }, + { + "epoch": 0.14016172506738545, + "grad_norm": 0.6352843642234802, + "learning_rate": 9.522980762331582e-05, + "loss": 1.2027, + "step": 910 + }, + { + "epoch": 0.1404697728147863, + "grad_norm": 0.6290647983551025, + "learning_rate": 9.520915850788575e-05, + "loss": 2.3878, + "step": 912 + }, + { + "epoch": 0.14077782056218713, + "grad_norm": 0.8372937440872192, + "learning_rate": 9.518846704482627e-05, + "loss": 1.0772, + "step": 914 + }, + { + "epoch": 0.141085868309588, + "grad_norm": 0.826770544052124, + "learning_rate": 9.516773325351915e-05, + "loss": 1.3271, + "step": 916 + }, + { + "epoch": 0.14139391605698884, + "grad_norm": 0.6032495498657227, + "learning_rate": 9.514695715338585e-05, + "loss": 1.001, + "step": 918 + }, + { + "epoch": 0.14170196380438968, + "grad_norm": 0.8128840923309326, + "learning_rate": 9.512613876388742e-05, + "loss": 1.1132, + "step": 920 + }, + { + "epoch": 0.14201001155179052, + "grad_norm": 0.718402087688446, + "learning_rate": 9.510527810452455e-05, + "loss": 1.1008, + "step": 922 + }, + { + "epoch": 0.14231805929919136, + "grad_norm": 0.9620109796524048, + "learning_rate": 9.508437519483753e-05, + "loss": 1.0307, + "step": 924 + }, + { + "epoch": 0.14262610704659223, + "grad_norm": 0.7499354481697083, + "learning_rate": 9.506343005440618e-05, + "loss": 0.918, + "step": 926 + }, + { + "epoch": 0.14293415479399307, + "grad_norm": 0.6020653247833252, + "learning_rate": 9.504244270284994e-05, + "loss": 1.8571, + "step": 928 + }, + { + "epoch": 0.14324220254139391, + "grad_norm": 0.718852162361145, + "learning_rate": 9.502141315982776e-05, + "loss": 1.2873, + "step": 930 + }, + { + "epoch": 0.14355025028879476, + "grad_norm": 0.8073559403419495, + "learning_rate": 9.50003414450381e-05, + "loss": 0.8581, + "step": 932 + }, + { + "epoch": 0.1438582980361956, + "grad_norm": 0.6343957185745239, + "learning_rate": 9.497922757821894e-05, + "loss": 1.145, + "step": 934 + }, + { + "epoch": 0.14416634578359647, + "grad_norm": 0.9298047423362732, + "learning_rate": 9.495807157914771e-05, + "loss": 1.3353, + "step": 936 + }, + { + "epoch": 0.1444743935309973, + "grad_norm": 0.7584412097930908, + "learning_rate": 9.493687346764137e-05, + "loss": 1.1033, + "step": 938 + }, + { + "epoch": 0.14478244127839815, + "grad_norm": 0.5845352411270142, + "learning_rate": 9.491563326355628e-05, + "loss": 0.7995, + "step": 940 + }, + { + "epoch": 0.145090489025799, + "grad_norm": 0.6179757714271545, + "learning_rate": 9.489435098678823e-05, + "loss": 1.1081, + "step": 942 + }, + { + "epoch": 0.14539853677319983, + "grad_norm": 0.7248619198799133, + "learning_rate": 9.487302665727243e-05, + "loss": 1.4268, + "step": 944 + }, + { + "epoch": 0.1457065845206007, + "grad_norm": 0.7348257899284363, + "learning_rate": 9.485166029498348e-05, + "loss": 1.1433, + "step": 946 + }, + { + "epoch": 0.14601463226800154, + "grad_norm": 0.711986780166626, + "learning_rate": 9.483025191993535e-05, + "loss": 1.2964, + "step": 948 + }, + { + "epoch": 0.14632268001540238, + "grad_norm": 0.6216188073158264, + "learning_rate": 9.480880155218136e-05, + "loss": 1.4536, + "step": 950 + }, + { + "epoch": 0.14663072776280323, + "grad_norm": 0.8497354984283447, + "learning_rate": 9.478730921181419e-05, + "loss": 1.3397, + "step": 952 + }, + { + "epoch": 0.1469387755102041, + "grad_norm": 0.5710319876670837, + "learning_rate": 9.476577491896579e-05, + "loss": 0.8536, + "step": 954 + }, + { + "epoch": 0.14724682325760494, + "grad_norm": 0.6521850228309631, + "learning_rate": 9.474419869380745e-05, + "loss": 0.872, + "step": 956 + }, + { + "epoch": 0.14755487100500578, + "grad_norm": 0.6547313928604126, + "learning_rate": 9.472258055654971e-05, + "loss": 0.8257, + "step": 958 + }, + { + "epoch": 0.14786291875240662, + "grad_norm": 0.737887978553772, + "learning_rate": 9.47009205274424e-05, + "loss": 0.8285, + "step": 960 + }, + { + "epoch": 0.14817096649980746, + "grad_norm": 0.6487706303596497, + "learning_rate": 9.467921862677454e-05, + "loss": 1.2221, + "step": 962 + }, + { + "epoch": 0.14847901424720833, + "grad_norm": 0.6891322135925293, + "learning_rate": 9.46574748748744e-05, + "loss": 0.9617, + "step": 964 + }, + { + "epoch": 0.14878706199460917, + "grad_norm": 0.6219374537467957, + "learning_rate": 9.463568929210949e-05, + "loss": 1.0844, + "step": 966 + }, + { + "epoch": 0.14909510974201, + "grad_norm": 0.6913350820541382, + "learning_rate": 9.461386189888643e-05, + "loss": 0.9804, + "step": 968 + }, + { + "epoch": 0.14940315748941085, + "grad_norm": 0.78050297498703, + "learning_rate": 9.459199271565107e-05, + "loss": 1.1167, + "step": 970 + }, + { + "epoch": 0.1497112052368117, + "grad_norm": 0.589933454990387, + "learning_rate": 9.457008176288837e-05, + "loss": 1.1252, + "step": 972 + }, + { + "epoch": 0.15001925298421256, + "grad_norm": 0.7892959713935852, + "learning_rate": 9.45481290611224e-05, + "loss": 1.1474, + "step": 974 + }, + { + "epoch": 0.1503273007316134, + "grad_norm": 0.7642476558685303, + "learning_rate": 9.452613463091637e-05, + "loss": 1.0619, + "step": 976 + }, + { + "epoch": 0.15063534847901425, + "grad_norm": 0.5432103872299194, + "learning_rate": 9.450409849287258e-05, + "loss": 1.1147, + "step": 978 + }, + { + "epoch": 0.1509433962264151, + "grad_norm": 1.7676953077316284, + "learning_rate": 9.448202066763237e-05, + "loss": 1.0471, + "step": 980 + }, + { + "epoch": 0.15125144397381593, + "grad_norm": 0.5658864378929138, + "learning_rate": 9.445990117587614e-05, + "loss": 1.003, + "step": 982 + }, + { + "epoch": 0.1515594917212168, + "grad_norm": 0.6045776605606079, + "learning_rate": 9.443774003832332e-05, + "loss": 1.1408, + "step": 984 + }, + { + "epoch": 0.15186753946861764, + "grad_norm": 0.6377173662185669, + "learning_rate": 9.441553727573236e-05, + "loss": 1.0693, + "step": 986 + }, + { + "epoch": 0.15217558721601848, + "grad_norm": 0.8608678579330444, + "learning_rate": 9.439329290890068e-05, + "loss": 1.1497, + "step": 988 + }, + { + "epoch": 0.15248363496341932, + "grad_norm": 0.5447673797607422, + "learning_rate": 9.437100695866469e-05, + "loss": 0.9368, + "step": 990 + }, + { + "epoch": 0.15279168271082016, + "grad_norm": 0.6498827934265137, + "learning_rate": 9.434867944589973e-05, + "loss": 0.986, + "step": 992 + }, + { + "epoch": 0.15309973045822103, + "grad_norm": 0.686646044254303, + "learning_rate": 9.432631039152011e-05, + "loss": 0.9154, + "step": 994 + }, + { + "epoch": 0.15340777820562188, + "grad_norm": 0.7855954170227051, + "learning_rate": 9.430389981647901e-05, + "loss": 0.9782, + "step": 996 + }, + { + "epoch": 0.15371582595302272, + "grad_norm": 0.6959761381149292, + "learning_rate": 9.428144774176852e-05, + "loss": 1.2808, + "step": 998 + }, + { + "epoch": 0.15402387370042356, + "grad_norm": 0.6581859588623047, + "learning_rate": 9.425895418841961e-05, + "loss": 1.1032, + "step": 1000 + }, + { + "epoch": 0.1543319214478244, + "grad_norm": 0.9970901012420654, + "learning_rate": 9.42364191775021e-05, + "loss": 1.0597, + "step": 1002 + }, + { + "epoch": 0.15463996919522527, + "grad_norm": 0.6671077013015747, + "learning_rate": 9.421384273012463e-05, + "loss": 0.9503, + "step": 1004 + }, + { + "epoch": 0.1549480169426261, + "grad_norm": 0.6460065841674805, + "learning_rate": 9.419122486743466e-05, + "loss": 0.933, + "step": 1006 + }, + { + "epoch": 0.15525606469002695, + "grad_norm": 0.9560624361038208, + "learning_rate": 9.416856561061846e-05, + "loss": 1.2742, + "step": 1008 + }, + { + "epoch": 0.1555641124374278, + "grad_norm": 0.6850244402885437, + "learning_rate": 9.414586498090106e-05, + "loss": 1.0752, + "step": 1010 + }, + { + "epoch": 0.15587216018482866, + "grad_norm": 0.5755826830863953, + "learning_rate": 9.412312299954622e-05, + "loss": 1.0276, + "step": 1012 + }, + { + "epoch": 0.1561802079322295, + "grad_norm": 0.5836648941040039, + "learning_rate": 9.41003396878565e-05, + "loss": 0.9003, + "step": 1014 + }, + { + "epoch": 0.15648825567963034, + "grad_norm": 0.5940999984741211, + "learning_rate": 9.40775150671731e-05, + "loss": 0.9651, + "step": 1016 + }, + { + "epoch": 0.1567963034270312, + "grad_norm": 0.8461489081382751, + "learning_rate": 9.405464915887598e-05, + "loss": 1.0209, + "step": 1018 + }, + { + "epoch": 0.15710435117443203, + "grad_norm": 0.660050630569458, + "learning_rate": 9.403174198438372e-05, + "loss": 1.0409, + "step": 1020 + }, + { + "epoch": 0.1574123989218329, + "grad_norm": 1.0249272584915161, + "learning_rate": 9.400879356515357e-05, + "loss": 1.3634, + "step": 1022 + }, + { + "epoch": 0.15772044666923374, + "grad_norm": 0.6267766952514648, + "learning_rate": 9.398580392268145e-05, + "loss": 1.211, + "step": 1024 + }, + { + "epoch": 0.15802849441663458, + "grad_norm": 0.7840976119041443, + "learning_rate": 9.396277307850184e-05, + "loss": 1.1994, + "step": 1026 + }, + { + "epoch": 0.15833654216403542, + "grad_norm": 0.6230096817016602, + "learning_rate": 9.393970105418786e-05, + "loss": 0.9114, + "step": 1028 + }, + { + "epoch": 0.15864458991143626, + "grad_norm": 1.0648771524429321, + "learning_rate": 9.391658787135115e-05, + "loss": 1.0579, + "step": 1030 + }, + { + "epoch": 0.15895263765883713, + "grad_norm": 0.6591848134994507, + "learning_rate": 9.389343355164198e-05, + "loss": 1.2073, + "step": 1032 + }, + { + "epoch": 0.15926068540623797, + "grad_norm": 0.9383640885353088, + "learning_rate": 9.387023811674909e-05, + "loss": 1.2236, + "step": 1034 + }, + { + "epoch": 0.15956873315363881, + "grad_norm": 0.8099064230918884, + "learning_rate": 9.384700158839972e-05, + "loss": 0.8945, + "step": 1036 + }, + { + "epoch": 0.15987678090103966, + "grad_norm": 0.6530327796936035, + "learning_rate": 9.382372398835969e-05, + "loss": 1.0963, + "step": 1038 + }, + { + "epoch": 0.1601848286484405, + "grad_norm": 0.6211308836936951, + "learning_rate": 9.380040533843319e-05, + "loss": 1.0149, + "step": 1040 + }, + { + "epoch": 0.16049287639584137, + "grad_norm": 0.7817839980125427, + "learning_rate": 9.377704566046295e-05, + "loss": 1.1448, + "step": 1042 + }, + { + "epoch": 0.1608009241432422, + "grad_norm": 0.7905937433242798, + "learning_rate": 9.375364497633006e-05, + "loss": 1.153, + "step": 1044 + }, + { + "epoch": 0.16110897189064305, + "grad_norm": 0.8386938571929932, + "learning_rate": 9.373020330795403e-05, + "loss": 1.0107, + "step": 1046 + }, + { + "epoch": 0.1614170196380439, + "grad_norm": 0.7169632315635681, + "learning_rate": 9.370672067729284e-05, + "loss": 1.3401, + "step": 1048 + }, + { + "epoch": 0.16172506738544473, + "grad_norm": 0.808017909526825, + "learning_rate": 9.368319710634273e-05, + "loss": 1.0624, + "step": 1050 + }, + { + "epoch": 0.1620331151328456, + "grad_norm": 0.7317773699760437, + "learning_rate": 9.365963261713835e-05, + "loss": 1.0429, + "step": 1052 + }, + { + "epoch": 0.16234116288024644, + "grad_norm": 0.7492151260375977, + "learning_rate": 9.363602723175268e-05, + "loss": 1.078, + "step": 1054 + }, + { + "epoch": 0.16264921062764728, + "grad_norm": 0.8238731622695923, + "learning_rate": 9.361238097229699e-05, + "loss": 1.2244, + "step": 1056 + }, + { + "epoch": 0.16295725837504813, + "grad_norm": 0.899804949760437, + "learning_rate": 9.358869386092084e-05, + "loss": 1.2607, + "step": 1058 + }, + { + "epoch": 0.16326530612244897, + "grad_norm": 0.6087053418159485, + "learning_rate": 9.356496591981204e-05, + "loss": 1.0546, + "step": 1060 + }, + { + "epoch": 0.16357335386984984, + "grad_norm": 0.574735164642334, + "learning_rate": 9.354119717119669e-05, + "loss": 1.1068, + "step": 1062 + }, + { + "epoch": 0.16388140161725068, + "grad_norm": 0.7853001356124878, + "learning_rate": 9.351738763733906e-05, + "loss": 1.0626, + "step": 1064 + }, + { + "epoch": 0.16418944936465152, + "grad_norm": 0.7452929019927979, + "learning_rate": 9.349353734054167e-05, + "loss": 1.0014, + "step": 1066 + }, + { + "epoch": 0.16449749711205236, + "grad_norm": 0.5956066250801086, + "learning_rate": 9.346964630314521e-05, + "loss": 1.1041, + "step": 1068 + }, + { + "epoch": 0.16480554485945323, + "grad_norm": 1.5090982913970947, + "learning_rate": 9.344571454752851e-05, + "loss": 1.0974, + "step": 1070 + }, + { + "epoch": 0.16511359260685407, + "grad_norm": 0.7556977272033691, + "learning_rate": 9.342174209610857e-05, + "loss": 1.0473, + "step": 1072 + }, + { + "epoch": 0.1654216403542549, + "grad_norm": 0.5461626052856445, + "learning_rate": 9.339772897134049e-05, + "loss": 1.2671, + "step": 1074 + }, + { + "epoch": 0.16572968810165575, + "grad_norm": 0.7559049725532532, + "learning_rate": 9.337367519571748e-05, + "loss": 1.1524, + "step": 1076 + }, + { + "epoch": 0.1660377358490566, + "grad_norm": 1.1034940481185913, + "learning_rate": 9.334958079177081e-05, + "loss": 1.058, + "step": 1078 + }, + { + "epoch": 0.16634578359645746, + "grad_norm": 0.6188117861747742, + "learning_rate": 9.332544578206985e-05, + "loss": 1.1278, + "step": 1080 + }, + { + "epoch": 0.1666538313438583, + "grad_norm": 0.6941109895706177, + "learning_rate": 9.330127018922194e-05, + "loss": 1.0709, + "step": 1082 + }, + { + "epoch": 0.16696187909125915, + "grad_norm": 0.6960075497627258, + "learning_rate": 9.327705403587248e-05, + "loss": 0.9336, + "step": 1084 + }, + { + "epoch": 0.16726992683866, + "grad_norm": 0.9475935101509094, + "learning_rate": 9.325279734470488e-05, + "loss": 1.0948, + "step": 1086 + }, + { + "epoch": 0.16757797458606083, + "grad_norm": 0.9592021107673645, + "learning_rate": 9.322850013844046e-05, + "loss": 2.709, + "step": 1088 + }, + { + "epoch": 0.1678860223334617, + "grad_norm": 0.6552063226699829, + "learning_rate": 9.320416243983856e-05, + "loss": 0.8764, + "step": 1090 + }, + { + "epoch": 0.16819407008086254, + "grad_norm": 0.8499904870986938, + "learning_rate": 9.317978427169638e-05, + "loss": 1.2129, + "step": 1092 + }, + { + "epoch": 0.16850211782826338, + "grad_norm": 0.6355566382408142, + "learning_rate": 9.31553656568491e-05, + "loss": 0.941, + "step": 1094 + }, + { + "epoch": 0.16881016557566422, + "grad_norm": 0.6042637228965759, + "learning_rate": 9.313090661816972e-05, + "loss": 1.2016, + "step": 1096 + }, + { + "epoch": 0.16911821332306506, + "grad_norm": 0.6587199568748474, + "learning_rate": 9.310640717856915e-05, + "loss": 0.9431, + "step": 1098 + }, + { + "epoch": 0.16942626107046593, + "grad_norm": 0.6389713883399963, + "learning_rate": 9.308186736099614e-05, + "loss": 2.0506, + "step": 1100 + }, + { + "epoch": 0.16973430881786677, + "grad_norm": 0.8145257234573364, + "learning_rate": 9.305728718843723e-05, + "loss": 1.0665, + "step": 1102 + }, + { + "epoch": 0.17004235656526762, + "grad_norm": 0.8680522441864014, + "learning_rate": 9.303266668391679e-05, + "loss": 1.1634, + "step": 1104 + }, + { + "epoch": 0.17035040431266846, + "grad_norm": 0.9924330115318298, + "learning_rate": 9.300800587049696e-05, + "loss": 1.0959, + "step": 1106 + }, + { + "epoch": 0.1706584520600693, + "grad_norm": 0.5335946083068848, + "learning_rate": 9.298330477127763e-05, + "loss": 1.9391, + "step": 1108 + }, + { + "epoch": 0.17096649980747017, + "grad_norm": 0.6439701914787292, + "learning_rate": 9.295856340939648e-05, + "loss": 1.1721, + "step": 1110 + }, + { + "epoch": 0.171274547554871, + "grad_norm": 0.9099013209342957, + "learning_rate": 9.293378180802878e-05, + "loss": 1.2054, + "step": 1112 + }, + { + "epoch": 0.17158259530227185, + "grad_norm": 0.5536646842956543, + "learning_rate": 9.290895999038765e-05, + "loss": 0.8748, + "step": 1114 + }, + { + "epoch": 0.1718906430496727, + "grad_norm": 0.7243412733078003, + "learning_rate": 9.288409797972375e-05, + "loss": 0.9013, + "step": 1116 + }, + { + "epoch": 0.17219869079707353, + "grad_norm": 0.5222441554069519, + "learning_rate": 9.285919579932548e-05, + "loss": 0.9313, + "step": 1118 + }, + { + "epoch": 0.1725067385444744, + "grad_norm": 0.7990663647651672, + "learning_rate": 9.28342534725188e-05, + "loss": 2.3069, + "step": 1120 + }, + { + "epoch": 0.17281478629187524, + "grad_norm": 1.0527921915054321, + "learning_rate": 9.280927102266729e-05, + "loss": 0.9213, + "step": 1122 + }, + { + "epoch": 0.17312283403927609, + "grad_norm": 0.8192021250724792, + "learning_rate": 9.278424847317217e-05, + "loss": 2.6465, + "step": 1124 + }, + { + "epoch": 0.17343088178667693, + "grad_norm": 0.7312417030334473, + "learning_rate": 9.275918584747216e-05, + "loss": 1.1035, + "step": 1126 + }, + { + "epoch": 0.17373892953407777, + "grad_norm": 0.7595751881599426, + "learning_rate": 9.273408316904353e-05, + "loss": 1.0814, + "step": 1128 + }, + { + "epoch": 0.17404697728147864, + "grad_norm": 0.6808615326881409, + "learning_rate": 9.270894046140009e-05, + "loss": 1.6817, + "step": 1130 + }, + { + "epoch": 0.17435502502887948, + "grad_norm": 0.7840389013290405, + "learning_rate": 9.268375774809312e-05, + "loss": 1.3194, + "step": 1132 + }, + { + "epoch": 0.17466307277628032, + "grad_norm": 0.665709376335144, + "learning_rate": 9.265853505271139e-05, + "loss": 1.0137, + "step": 1134 + }, + { + "epoch": 0.17497112052368116, + "grad_norm": 0.7553781270980835, + "learning_rate": 9.26332723988811e-05, + "loss": 0.9437, + "step": 1136 + }, + { + "epoch": 0.17527916827108203, + "grad_norm": 0.6285457015037537, + "learning_rate": 9.260796981026591e-05, + "loss": 0.9462, + "step": 1138 + }, + { + "epoch": 0.17558721601848287, + "grad_norm": 0.6842612624168396, + "learning_rate": 9.258262731056688e-05, + "loss": 1.334, + "step": 1140 + }, + { + "epoch": 0.1758952637658837, + "grad_norm": 0.8658350110054016, + "learning_rate": 9.255724492352245e-05, + "loss": 1.1117, + "step": 1142 + }, + { + "epoch": 0.17620331151328456, + "grad_norm": 0.6949405074119568, + "learning_rate": 9.25318226729084e-05, + "loss": 1.0724, + "step": 1144 + }, + { + "epoch": 0.1765113592606854, + "grad_norm": 0.8136616349220276, + "learning_rate": 9.250636058253788e-05, + "loss": 1.1257, + "step": 1146 + }, + { + "epoch": 0.17681940700808627, + "grad_norm": 0.7518190741539001, + "learning_rate": 9.248085867626136e-05, + "loss": 0.912, + "step": 1148 + }, + { + "epoch": 0.1771274547554871, + "grad_norm": 0.783767819404602, + "learning_rate": 9.245531697796656e-05, + "loss": 1.1135, + "step": 1150 + }, + { + "epoch": 0.17743550250288795, + "grad_norm": 0.637515127658844, + "learning_rate": 9.242973551157857e-05, + "loss": 1.7548, + "step": 1152 + }, + { + "epoch": 0.1777435502502888, + "grad_norm": 0.6867687702178955, + "learning_rate": 9.24041143010596e-05, + "loss": 0.9194, + "step": 1154 + }, + { + "epoch": 0.17805159799768963, + "grad_norm": 0.647327721118927, + "learning_rate": 9.23784533704092e-05, + "loss": 0.9544, + "step": 1156 + }, + { + "epoch": 0.1783596457450905, + "grad_norm": 0.8152113556861877, + "learning_rate": 9.235275274366406e-05, + "loss": 1.0048, + "step": 1158 + }, + { + "epoch": 0.17866769349249134, + "grad_norm": 0.5865814685821533, + "learning_rate": 9.23270124448981e-05, + "loss": 0.9702, + "step": 1160 + }, + { + "epoch": 0.17897574123989218, + "grad_norm": 0.7674340009689331, + "learning_rate": 9.230123249822236e-05, + "loss": 1.5428, + "step": 1162 + }, + { + "epoch": 0.17928378898729302, + "grad_norm": 1.6844772100448608, + "learning_rate": 9.227541292778504e-05, + "loss": 1.0052, + "step": 1164 + }, + { + "epoch": 0.17959183673469387, + "grad_norm": 0.8579050898551941, + "learning_rate": 9.224955375777147e-05, + "loss": 1.1531, + "step": 1166 + }, + { + "epoch": 0.17989988448209474, + "grad_norm": 0.7544540762901306, + "learning_rate": 9.222365501240402e-05, + "loss": 1.3026, + "step": 1168 + }, + { + "epoch": 0.18020793222949558, + "grad_norm": 0.7298200130462646, + "learning_rate": 9.21977167159422e-05, + "loss": 1.0986, + "step": 1170 + }, + { + "epoch": 0.18051597997689642, + "grad_norm": 0.5601779222488403, + "learning_rate": 9.21717388926825e-05, + "loss": 1.082, + "step": 1172 + }, + { + "epoch": 0.18082402772429726, + "grad_norm": 0.6664190888404846, + "learning_rate": 9.214572156695849e-05, + "loss": 1.7701, + "step": 1174 + }, + { + "epoch": 0.1811320754716981, + "grad_norm": 0.8289921283721924, + "learning_rate": 9.211966476314072e-05, + "loss": 2.7131, + "step": 1176 + }, + { + "epoch": 0.18144012321909897, + "grad_norm": 0.6665297746658325, + "learning_rate": 9.209356850563672e-05, + "loss": 1.0969, + "step": 1178 + }, + { + "epoch": 0.1817481709664998, + "grad_norm": 0.5994783043861389, + "learning_rate": 9.206743281889097e-05, + "loss": 1.1694, + "step": 1180 + }, + { + "epoch": 0.18205621871390065, + "grad_norm": 0.6745316982269287, + "learning_rate": 9.204125772738488e-05, + "loss": 1.4434, + "step": 1182 + }, + { + "epoch": 0.1823642664613015, + "grad_norm": 0.683441162109375, + "learning_rate": 9.20150432556368e-05, + "loss": 1.0758, + "step": 1184 + }, + { + "epoch": 0.18267231420870234, + "grad_norm": 0.7067748308181763, + "learning_rate": 9.198878942820195e-05, + "loss": 1.0938, + "step": 1186 + }, + { + "epoch": 0.1829803619561032, + "grad_norm": 0.845294713973999, + "learning_rate": 9.196249626967237e-05, + "loss": 1.2249, + "step": 1188 + }, + { + "epoch": 0.18328840970350405, + "grad_norm": 0.9288262128829956, + "learning_rate": 9.193616380467704e-05, + "loss": 1.1201, + "step": 1190 + }, + { + "epoch": 0.1835964574509049, + "grad_norm": 0.6859925985336304, + "learning_rate": 9.190979205788169e-05, + "loss": 1.2275, + "step": 1192 + }, + { + "epoch": 0.18390450519830573, + "grad_norm": 0.6356547474861145, + "learning_rate": 9.188338105398882e-05, + "loss": 0.9563, + "step": 1194 + }, + { + "epoch": 0.1842125529457066, + "grad_norm": 0.6628222465515137, + "learning_rate": 9.185693081773777e-05, + "loss": 0.9248, + "step": 1196 + }, + { + "epoch": 0.18452060069310744, + "grad_norm": 0.8962293267250061, + "learning_rate": 9.183044137390461e-05, + "loss": 1.9412, + "step": 1198 + }, + { + "epoch": 0.18482864844050828, + "grad_norm": 0.673896849155426, + "learning_rate": 9.18039127473021e-05, + "loss": 1.0603, + "step": 1200 + }, + { + "epoch": 0.18482864844050828, + "eval_loss": 2.5936074256896973, + "eval_runtime": 736.9969, + "eval_samples_per_second": 2.714, + "eval_steps_per_second": 0.678, + "step": 1200 + }, + { + "epoch": 0.18513669618790912, + "grad_norm": 0.5971192121505737, + "learning_rate": 9.177734496277975e-05, + "loss": 0.9434, + "step": 1202 + }, + { + "epoch": 0.18544474393530996, + "grad_norm": 0.867962658405304, + "learning_rate": 9.175073804522371e-05, + "loss": 1.242, + "step": 1204 + }, + { + "epoch": 0.18575279168271083, + "grad_norm": 0.6973360776901245, + "learning_rate": 9.17240920195568e-05, + "loss": 1.0988, + "step": 1206 + }, + { + "epoch": 0.18606083943011167, + "grad_norm": 0.6780734658241272, + "learning_rate": 9.169740691073852e-05, + "loss": 1.1157, + "step": 1208 + }, + { + "epoch": 0.18636888717751252, + "grad_norm": 0.9438827633857727, + "learning_rate": 9.167068274376487e-05, + "loss": 1.1604, + "step": 1210 + }, + { + "epoch": 0.18667693492491336, + "grad_norm": 0.7693531513214111, + "learning_rate": 9.164391954366855e-05, + "loss": 1.1541, + "step": 1212 + }, + { + "epoch": 0.1869849826723142, + "grad_norm": 0.5881041884422302, + "learning_rate": 9.161711733551877e-05, + "loss": 2.0453, + "step": 1214 + }, + { + "epoch": 0.18729303041971507, + "grad_norm": 0.5809153914451599, + "learning_rate": 9.159027614442126e-05, + "loss": 2.0997, + "step": 1216 + }, + { + "epoch": 0.1876010781671159, + "grad_norm": 0.7402828335762024, + "learning_rate": 9.15633959955183e-05, + "loss": 1.0956, + "step": 1218 + }, + { + "epoch": 0.18790912591451675, + "grad_norm": 0.7167702913284302, + "learning_rate": 9.153647691398866e-05, + "loss": 3.4171, + "step": 1220 + }, + { + "epoch": 0.1882171736619176, + "grad_norm": 0.9422446489334106, + "learning_rate": 9.150951892504754e-05, + "loss": 1.1465, + "step": 1222 + }, + { + "epoch": 0.18852522140931843, + "grad_norm": 0.4820753037929535, + "learning_rate": 9.148252205394665e-05, + "loss": 1.045, + "step": 1224 + }, + { + "epoch": 0.1888332691567193, + "grad_norm": 0.768250048160553, + "learning_rate": 9.145548632597408e-05, + "loss": 1.2842, + "step": 1226 + }, + { + "epoch": 0.18914131690412014, + "grad_norm": 0.7467925548553467, + "learning_rate": 9.142841176645429e-05, + "loss": 1.2167, + "step": 1228 + }, + { + "epoch": 0.18944936465152099, + "grad_norm": 0.8527848720550537, + "learning_rate": 9.140129840074818e-05, + "loss": 0.9865, + "step": 1230 + }, + { + "epoch": 0.18975741239892183, + "grad_norm": 1.1066113710403442, + "learning_rate": 9.137414625425295e-05, + "loss": 1.2248, + "step": 1232 + }, + { + "epoch": 0.19006546014632267, + "grad_norm": 1.0038797855377197, + "learning_rate": 9.134695535240216e-05, + "loss": 1.1177, + "step": 1234 + }, + { + "epoch": 0.19037350789372354, + "grad_norm": 0.6442514061927795, + "learning_rate": 9.131972572066563e-05, + "loss": 1.1056, + "step": 1236 + }, + { + "epoch": 0.19068155564112438, + "grad_norm": 0.5913024544715881, + "learning_rate": 9.12924573845495e-05, + "loss": 1.052, + "step": 1238 + }, + { + "epoch": 0.19098960338852522, + "grad_norm": 0.7442435622215271, + "learning_rate": 9.126515036959613e-05, + "loss": 1.023, + "step": 1240 + }, + { + "epoch": 0.19129765113592606, + "grad_norm": 0.7399488687515259, + "learning_rate": 9.123780470138415e-05, + "loss": 1.4022, + "step": 1242 + }, + { + "epoch": 0.1916056988833269, + "grad_norm": 0.6702150106430054, + "learning_rate": 9.121042040552836e-05, + "loss": 0.973, + "step": 1244 + }, + { + "epoch": 0.19191374663072777, + "grad_norm": 0.6792922019958496, + "learning_rate": 9.118299750767976e-05, + "loss": 1.0224, + "step": 1246 + }, + { + "epoch": 0.1922217943781286, + "grad_norm": 0.7627959847450256, + "learning_rate": 9.115553603352551e-05, + "loss": 0.9449, + "step": 1248 + }, + { + "epoch": 0.19252984212552945, + "grad_norm": 0.7185088992118835, + "learning_rate": 9.11280360087889e-05, + "loss": 0.9539, + "step": 1250 + }, + { + "epoch": 0.1928378898729303, + "grad_norm": 0.6071786880493164, + "learning_rate": 9.110049745922933e-05, + "loss": 1.1379, + "step": 1252 + }, + { + "epoch": 0.19314593762033117, + "grad_norm": 0.6094626784324646, + "learning_rate": 9.107292041064229e-05, + "loss": 1.1505, + "step": 1254 + }, + { + "epoch": 0.193453985367732, + "grad_norm": 0.6313731670379639, + "learning_rate": 9.104530488885932e-05, + "loss": 0.9058, + "step": 1256 + }, + { + "epoch": 0.19376203311513285, + "grad_norm": 0.6403363943099976, + "learning_rate": 9.1017650919748e-05, + "loss": 1.212, + "step": 1258 + }, + { + "epoch": 0.1940700808625337, + "grad_norm": 0.6836332678794861, + "learning_rate": 9.098995852921197e-05, + "loss": 1.1166, + "step": 1260 + }, + { + "epoch": 0.19437812860993453, + "grad_norm": 0.5408475995063782, + "learning_rate": 9.09622277431908e-05, + "loss": 1.2248, + "step": 1262 + }, + { + "epoch": 0.1946861763573354, + "grad_norm": 0.586330771446228, + "learning_rate": 9.093445858766004e-05, + "loss": 0.8889, + "step": 1264 + }, + { + "epoch": 0.19499422410473624, + "grad_norm": 0.7370517253875732, + "learning_rate": 9.090665108863118e-05, + "loss": 1.1122, + "step": 1266 + }, + { + "epoch": 0.19530227185213708, + "grad_norm": 1.0544092655181885, + "learning_rate": 9.087880527215167e-05, + "loss": 1.2945, + "step": 1268 + }, + { + "epoch": 0.19561031959953792, + "grad_norm": 0.7124229073524475, + "learning_rate": 9.085092116430479e-05, + "loss": 1.0174, + "step": 1270 + }, + { + "epoch": 0.19591836734693877, + "grad_norm": 0.6399815678596497, + "learning_rate": 9.08229987912097e-05, + "loss": 1.2766, + "step": 1272 + }, + { + "epoch": 0.19622641509433963, + "grad_norm": 0.6686626076698303, + "learning_rate": 9.079503817902144e-05, + "loss": 0.9428, + "step": 1274 + }, + { + "epoch": 0.19653446284174048, + "grad_norm": 0.9779509902000427, + "learning_rate": 9.076703935393083e-05, + "loss": 1.1812, + "step": 1276 + }, + { + "epoch": 0.19684251058914132, + "grad_norm": 0.6352618336677551, + "learning_rate": 9.073900234216452e-05, + "loss": 1.0626, + "step": 1278 + }, + { + "epoch": 0.19715055833654216, + "grad_norm": 0.6762117147445679, + "learning_rate": 9.07109271699849e-05, + "loss": 1.2162, + "step": 1280 + }, + { + "epoch": 0.197458606083943, + "grad_norm": 0.6376475095748901, + "learning_rate": 9.06828138636901e-05, + "loss": 0.8033, + "step": 1282 + }, + { + "epoch": 0.19776665383134387, + "grad_norm": 0.6817375421524048, + "learning_rate": 9.065466244961402e-05, + "loss": 1.1285, + "step": 1284 + }, + { + "epoch": 0.1980747015787447, + "grad_norm": 0.7756999731063843, + "learning_rate": 9.062647295412619e-05, + "loss": 0.9947, + "step": 1286 + }, + { + "epoch": 0.19838274932614555, + "grad_norm": 0.9223238825798035, + "learning_rate": 9.059824540363183e-05, + "loss": 1.036, + "step": 1288 + }, + { + "epoch": 0.1986907970735464, + "grad_norm": 0.7431936860084534, + "learning_rate": 9.056997982457185e-05, + "loss": 1.1404, + "step": 1290 + }, + { + "epoch": 0.19899884482094723, + "grad_norm": 0.7931793928146362, + "learning_rate": 9.054167624342275e-05, + "loss": 1.1076, + "step": 1292 + }, + { + "epoch": 0.1993068925683481, + "grad_norm": 0.5251017808914185, + "learning_rate": 9.05133346866966e-05, + "loss": 0.92, + "step": 1294 + }, + { + "epoch": 0.19961494031574895, + "grad_norm": 0.5351060628890991, + "learning_rate": 9.048495518094109e-05, + "loss": 1.1359, + "step": 1296 + }, + { + "epoch": 0.1999229880631498, + "grad_norm": 0.7550303936004639, + "learning_rate": 9.045653775273942e-05, + "loss": 1.1408, + "step": 1298 + }, + { + "epoch": 0.20023103581055063, + "grad_norm": 0.7095838189125061, + "learning_rate": 9.042808242871035e-05, + "loss": 1.0531, + "step": 1300 + }, + { + "epoch": 0.20053908355795147, + "grad_norm": 0.5679411888122559, + "learning_rate": 9.039958923550808e-05, + "loss": 0.9117, + "step": 1302 + }, + { + "epoch": 0.20084713130535234, + "grad_norm": 0.7613952159881592, + "learning_rate": 9.037105819982234e-05, + "loss": 1.1077, + "step": 1304 + }, + { + "epoch": 0.20115517905275318, + "grad_norm": 0.5979565978050232, + "learning_rate": 9.03424893483783e-05, + "loss": 1.0161, + "step": 1306 + }, + { + "epoch": 0.20146322680015402, + "grad_norm": 0.7908322811126709, + "learning_rate": 9.03138827079365e-05, + "loss": 1.2111, + "step": 1308 + }, + { + "epoch": 0.20177127454755486, + "grad_norm": 0.7367496490478516, + "learning_rate": 9.028523830529295e-05, + "loss": 0.923, + "step": 1310 + }, + { + "epoch": 0.2020793222949557, + "grad_norm": 0.6930207014083862, + "learning_rate": 9.025655616727895e-05, + "loss": 0.9931, + "step": 1312 + }, + { + "epoch": 0.20238737004235657, + "grad_norm": 0.8138192296028137, + "learning_rate": 9.022783632076122e-05, + "loss": 1.1191, + "step": 1314 + }, + { + "epoch": 0.20269541778975741, + "grad_norm": 0.7855730652809143, + "learning_rate": 9.019907879264179e-05, + "loss": 1.2692, + "step": 1316 + }, + { + "epoch": 0.20300346553715826, + "grad_norm": 0.5641903281211853, + "learning_rate": 9.017028360985794e-05, + "loss": 1.5539, + "step": 1318 + }, + { + "epoch": 0.2033115132845591, + "grad_norm": 0.8672736883163452, + "learning_rate": 9.014145079938228e-05, + "loss": 1.1793, + "step": 1320 + }, + { + "epoch": 0.20361956103195997, + "grad_norm": 0.7759377360343933, + "learning_rate": 9.01125803882226e-05, + "loss": 1.2328, + "step": 1322 + }, + { + "epoch": 0.2039276087793608, + "grad_norm": 0.8218508958816528, + "learning_rate": 9.008367240342198e-05, + "loss": 1.0023, + "step": 1324 + }, + { + "epoch": 0.20423565652676165, + "grad_norm": 0.7159973382949829, + "learning_rate": 9.005472687205867e-05, + "loss": 1.7163, + "step": 1326 + }, + { + "epoch": 0.2045437042741625, + "grad_norm": 0.7014909386634827, + "learning_rate": 9.002574382124604e-05, + "loss": 1.1984, + "step": 1328 + }, + { + "epoch": 0.20485175202156333, + "grad_norm": 0.9051117897033691, + "learning_rate": 8.999672327813271e-05, + "loss": 1.1805, + "step": 1330 + }, + { + "epoch": 0.2051597997689642, + "grad_norm": 0.6193578839302063, + "learning_rate": 8.99676652699023e-05, + "loss": 0.9635, + "step": 1332 + }, + { + "epoch": 0.20546784751636504, + "grad_norm": 0.8723487257957458, + "learning_rate": 8.993856982377362e-05, + "loss": 1.2039, + "step": 1334 + }, + { + "epoch": 0.20577589526376588, + "grad_norm": 0.7481098771095276, + "learning_rate": 8.990943696700049e-05, + "loss": 1.097, + "step": 1336 + }, + { + "epoch": 0.20608394301116673, + "grad_norm": 0.8442493677139282, + "learning_rate": 8.988026672687182e-05, + "loss": 1.2301, + "step": 1338 + }, + { + "epoch": 0.20639199075856757, + "grad_norm": 0.6437552571296692, + "learning_rate": 8.985105913071148e-05, + "loss": 2.1001, + "step": 1340 + }, + { + "epoch": 0.20670003850596844, + "grad_norm": 0.6734112501144409, + "learning_rate": 8.982181420587836e-05, + "loss": 1.1627, + "step": 1342 + }, + { + "epoch": 0.20700808625336928, + "grad_norm": 0.7252438068389893, + "learning_rate": 8.979253197976633e-05, + "loss": 0.97, + "step": 1344 + }, + { + "epoch": 0.20731613400077012, + "grad_norm": 0.670896053314209, + "learning_rate": 8.976321247980419e-05, + "loss": 1.7838, + "step": 1346 + }, + { + "epoch": 0.20762418174817096, + "grad_norm": 0.7382272481918335, + "learning_rate": 8.973385573345566e-05, + "loss": 2.5687, + "step": 1348 + }, + { + "epoch": 0.2079322294955718, + "grad_norm": 0.6926072835922241, + "learning_rate": 8.970446176821933e-05, + "loss": 0.9489, + "step": 1350 + }, + { + "epoch": 0.20824027724297267, + "grad_norm": 0.6148782968521118, + "learning_rate": 8.967503061162865e-05, + "loss": 0.753, + "step": 1352 + }, + { + "epoch": 0.2085483249903735, + "grad_norm": 0.8001696467399597, + "learning_rate": 8.964556229125194e-05, + "loss": 1.9705, + "step": 1354 + }, + { + "epoch": 0.20885637273777435, + "grad_norm": 0.7240262627601624, + "learning_rate": 8.961605683469232e-05, + "loss": 0.986, + "step": 1356 + }, + { + "epoch": 0.2091644204851752, + "grad_norm": 0.7085553407669067, + "learning_rate": 8.958651426958767e-05, + "loss": 1.1624, + "step": 1358 + }, + { + "epoch": 0.20947246823257604, + "grad_norm": 0.6068323850631714, + "learning_rate": 8.955693462361065e-05, + "loss": 1.0409, + "step": 1360 + }, + { + "epoch": 0.2097805159799769, + "grad_norm": 0.6230590343475342, + "learning_rate": 8.952731792446865e-05, + "loss": 0.9571, + "step": 1362 + }, + { + "epoch": 0.21008856372737775, + "grad_norm": 0.6568693518638611, + "learning_rate": 8.949766419990379e-05, + "loss": 0.9922, + "step": 1364 + }, + { + "epoch": 0.2103966114747786, + "grad_norm": 0.7212346196174622, + "learning_rate": 8.946797347769284e-05, + "loss": 1.2634, + "step": 1366 + }, + { + "epoch": 0.21070465922217943, + "grad_norm": 0.7108849287033081, + "learning_rate": 8.943824578564724e-05, + "loss": 1.1369, + "step": 1368 + }, + { + "epoch": 0.21101270696958027, + "grad_norm": 1.3569531440734863, + "learning_rate": 8.940848115161307e-05, + "loss": 1.1873, + "step": 1370 + }, + { + "epoch": 0.21132075471698114, + "grad_norm": 0.6778193116188049, + "learning_rate": 8.937867960347095e-05, + "loss": 1.0359, + "step": 1372 + }, + { + "epoch": 0.21162880246438198, + "grad_norm": 0.6958675384521484, + "learning_rate": 8.93488411691362e-05, + "loss": 1.8977, + "step": 1374 + }, + { + "epoch": 0.21193685021178282, + "grad_norm": 0.7475863695144653, + "learning_rate": 8.931896587655857e-05, + "loss": 1.4979, + "step": 1376 + }, + { + "epoch": 0.21224489795918366, + "grad_norm": 0.7854467034339905, + "learning_rate": 8.92890537537224e-05, + "loss": 1.1155, + "step": 1378 + }, + { + "epoch": 0.21255294570658453, + "grad_norm": 0.8435900211334229, + "learning_rate": 8.925910482864652e-05, + "loss": 1.1637, + "step": 1380 + }, + { + "epoch": 0.21286099345398538, + "grad_norm": 0.6406680941581726, + "learning_rate": 8.922911912938422e-05, + "loss": 1.8449, + "step": 1382 + }, + { + "epoch": 0.21316904120138622, + "grad_norm": 0.8102788329124451, + "learning_rate": 8.919909668402325e-05, + "loss": 0.9947, + "step": 1384 + }, + { + "epoch": 0.21347708894878706, + "grad_norm": 0.7807202935218811, + "learning_rate": 8.916903752068578e-05, + "loss": 1.0053, + "step": 1386 + }, + { + "epoch": 0.2137851366961879, + "grad_norm": 0.7852822542190552, + "learning_rate": 8.913894166752835e-05, + "loss": 0.9945, + "step": 1388 + }, + { + "epoch": 0.21409318444358877, + "grad_norm": 0.6369873881340027, + "learning_rate": 8.910880915274191e-05, + "loss": 0.9943, + "step": 1390 + }, + { + "epoch": 0.2144012321909896, + "grad_norm": 0.568693220615387, + "learning_rate": 8.90786400045517e-05, + "loss": 1.1122, + "step": 1392 + }, + { + "epoch": 0.21470927993839045, + "grad_norm": 0.678931474685669, + "learning_rate": 8.904843425121733e-05, + "loss": 1.0143, + "step": 1394 + }, + { + "epoch": 0.2150173276857913, + "grad_norm": 0.6750305891036987, + "learning_rate": 8.901819192103266e-05, + "loss": 0.8868, + "step": 1396 + }, + { + "epoch": 0.21532537543319213, + "grad_norm": 0.7823706269264221, + "learning_rate": 8.898791304232581e-05, + "loss": 1.0879, + "step": 1398 + }, + { + "epoch": 0.215633423180593, + "grad_norm": 0.6973862051963806, + "learning_rate": 8.895759764345914e-05, + "loss": 0.9778, + "step": 1400 + }, + { + "epoch": 0.21594147092799384, + "grad_norm": 0.5516627430915833, + "learning_rate": 8.892724575282927e-05, + "loss": 1.6965, + "step": 1402 + }, + { + "epoch": 0.2162495186753947, + "grad_norm": 0.5122855305671692, + "learning_rate": 8.889685739886691e-05, + "loss": 0.9705, + "step": 1404 + }, + { + "epoch": 0.21655756642279553, + "grad_norm": 0.7519564032554626, + "learning_rate": 8.886643261003697e-05, + "loss": 1.0991, + "step": 1406 + }, + { + "epoch": 0.21686561417019637, + "grad_norm": 0.6111971139907837, + "learning_rate": 8.883597141483854e-05, + "loss": 0.8201, + "step": 1408 + }, + { + "epoch": 0.21717366191759724, + "grad_norm": 0.945604145526886, + "learning_rate": 8.880547384180473e-05, + "loss": 1.2554, + "step": 1410 + }, + { + "epoch": 0.21748170966499808, + "grad_norm": 0.6552610397338867, + "learning_rate": 8.877493991950276e-05, + "loss": 2.228, + "step": 1412 + }, + { + "epoch": 0.21778975741239892, + "grad_norm": 0.7612242102622986, + "learning_rate": 8.87443696765339e-05, + "loss": 0.9426, + "step": 1414 + }, + { + "epoch": 0.21809780515979976, + "grad_norm": 0.8174354434013367, + "learning_rate": 8.871376314153344e-05, + "loss": 1.4622, + "step": 1416 + }, + { + "epoch": 0.2184058529072006, + "grad_norm": 0.7058425545692444, + "learning_rate": 8.868312034317067e-05, + "loss": 1.4126, + "step": 1418 + }, + { + "epoch": 0.21871390065460147, + "grad_norm": 0.7839975953102112, + "learning_rate": 8.865244131014883e-05, + "loss": 0.9144, + "step": 1420 + }, + { + "epoch": 0.21902194840200231, + "grad_norm": 0.7724378108978271, + "learning_rate": 8.862172607120512e-05, + "loss": 0.8829, + "step": 1422 + }, + { + "epoch": 0.21932999614940316, + "grad_norm": 0.8645272254943848, + "learning_rate": 8.859097465511064e-05, + "loss": 1.1721, + "step": 1424 + }, + { + "epoch": 0.219638043896804, + "grad_norm": 0.7928360104560852, + "learning_rate": 8.85601870906704e-05, + "loss": 0.8773, + "step": 1426 + }, + { + "epoch": 0.21994609164420484, + "grad_norm": 0.6146901249885559, + "learning_rate": 8.852936340672324e-05, + "loss": 0.9543, + "step": 1428 + }, + { + "epoch": 0.2202541393916057, + "grad_norm": 0.7110276818275452, + "learning_rate": 8.849850363214186e-05, + "loss": 1.2455, + "step": 1430 + }, + { + "epoch": 0.22056218713900655, + "grad_norm": 0.7560061812400818, + "learning_rate": 8.846760779583274e-05, + "loss": 1.1127, + "step": 1432 + }, + { + "epoch": 0.2208702348864074, + "grad_norm": 0.8137415647506714, + "learning_rate": 8.843667592673616e-05, + "loss": 1.1296, + "step": 1434 + }, + { + "epoch": 0.22117828263380823, + "grad_norm": 0.9150519371032715, + "learning_rate": 8.840570805382617e-05, + "loss": 2.6254, + "step": 1436 + }, + { + "epoch": 0.2214863303812091, + "grad_norm": 0.5578750371932983, + "learning_rate": 8.837470420611048e-05, + "loss": 1.0047, + "step": 1438 + }, + { + "epoch": 0.22179437812860994, + "grad_norm": 0.6566774845123291, + "learning_rate": 8.834366441263056e-05, + "loss": 0.9647, + "step": 1440 + }, + { + "epoch": 0.22210242587601078, + "grad_norm": 0.7494180202484131, + "learning_rate": 8.831258870246154e-05, + "loss": 0.8414, + "step": 1442 + }, + { + "epoch": 0.22241047362341163, + "grad_norm": 0.47362321615219116, + "learning_rate": 8.828147710471217e-05, + "loss": 0.9061, + "step": 1444 + }, + { + "epoch": 0.22271852137081247, + "grad_norm": 0.8754947781562805, + "learning_rate": 8.825032964852482e-05, + "loss": 2.4502, + "step": 1446 + }, + { + "epoch": 0.22302656911821334, + "grad_norm": 0.6758772134780884, + "learning_rate": 8.821914636307547e-05, + "loss": 0.9356, + "step": 1448 + }, + { + "epoch": 0.22333461686561418, + "grad_norm": 0.8317671418190002, + "learning_rate": 8.818792727757363e-05, + "loss": 1.0184, + "step": 1450 + }, + { + "epoch": 0.22364266461301502, + "grad_norm": 0.6676676273345947, + "learning_rate": 8.81566724212624e-05, + "loss": 2.2253, + "step": 1452 + }, + { + "epoch": 0.22395071236041586, + "grad_norm": 0.7731775045394897, + "learning_rate": 8.812538182341832e-05, + "loss": 1.1849, + "step": 1454 + }, + { + "epoch": 0.2242587601078167, + "grad_norm": 0.715045690536499, + "learning_rate": 8.809405551335143e-05, + "loss": 0.9324, + "step": 1456 + }, + { + "epoch": 0.22456680785521757, + "grad_norm": 0.8740540742874146, + "learning_rate": 8.806269352040527e-05, + "loss": 1.2127, + "step": 1458 + }, + { + "epoch": 0.2248748556026184, + "grad_norm": 0.760273277759552, + "learning_rate": 8.803129587395673e-05, + "loss": 1.9512, + "step": 1460 + }, + { + "epoch": 0.22518290335001925, + "grad_norm": 0.6543803811073303, + "learning_rate": 8.799986260341615e-05, + "loss": 0.9039, + "step": 1462 + }, + { + "epoch": 0.2254909510974201, + "grad_norm": 0.5931572914123535, + "learning_rate": 8.796839373822721e-05, + "loss": 1.1781, + "step": 1464 + }, + { + "epoch": 0.22579899884482094, + "grad_norm": 0.7610328197479248, + "learning_rate": 8.793688930786694e-05, + "loss": 1.0437, + "step": 1466 + }, + { + "epoch": 0.2261070465922218, + "grad_norm": 1.0328961610794067, + "learning_rate": 8.790534934184569e-05, + "loss": 1.1108, + "step": 1468 + }, + { + "epoch": 0.22641509433962265, + "grad_norm": 0.7776150703430176, + "learning_rate": 8.787377386970712e-05, + "loss": 1.0043, + "step": 1470 + }, + { + "epoch": 0.2267231420870235, + "grad_norm": 0.6809033751487732, + "learning_rate": 8.784216292102807e-05, + "loss": 0.9907, + "step": 1472 + }, + { + "epoch": 0.22703118983442433, + "grad_norm": 0.7455710768699646, + "learning_rate": 8.781051652541872e-05, + "loss": 0.9955, + "step": 1474 + }, + { + "epoch": 0.22733923758182517, + "grad_norm": 0.8081583976745605, + "learning_rate": 8.777883471252235e-05, + "loss": 1.1366, + "step": 1476 + }, + { + "epoch": 0.22764728532922604, + "grad_norm": 0.8436789512634277, + "learning_rate": 8.774711751201547e-05, + "loss": 2.0628, + "step": 1478 + }, + { + "epoch": 0.22795533307662688, + "grad_norm": 0.7141880393028259, + "learning_rate": 8.771536495360776e-05, + "loss": 1.0375, + "step": 1480 + }, + { + "epoch": 0.22826338082402772, + "grad_norm": 0.709006667137146, + "learning_rate": 8.768357706704196e-05, + "loss": 1.0356, + "step": 1482 + }, + { + "epoch": 0.22857142857142856, + "grad_norm": 0.6896849870681763, + "learning_rate": 8.765175388209395e-05, + "loss": 0.8338, + "step": 1484 + }, + { + "epoch": 0.2288794763188294, + "grad_norm": 0.6931160092353821, + "learning_rate": 8.761989542857263e-05, + "loss": 1.0104, + "step": 1486 + }, + { + "epoch": 0.22918752406623027, + "grad_norm": 0.6755717396736145, + "learning_rate": 8.758800173631998e-05, + "loss": 0.8858, + "step": 1488 + }, + { + "epoch": 0.22949557181363112, + "grad_norm": 0.8900005221366882, + "learning_rate": 8.755607283521097e-05, + "loss": 1.0734, + "step": 1490 + }, + { + "epoch": 0.22980361956103196, + "grad_norm": 0.852319598197937, + "learning_rate": 8.752410875515353e-05, + "loss": 0.9932, + "step": 1492 + }, + { + "epoch": 0.2301116673084328, + "grad_norm": 0.7591366171836853, + "learning_rate": 8.74921095260886e-05, + "loss": 1.2182, + "step": 1494 + }, + { + "epoch": 0.23041971505583364, + "grad_norm": 0.7753162980079651, + "learning_rate": 8.746007517798999e-05, + "loss": 0.9069, + "step": 1496 + }, + { + "epoch": 0.2307277628032345, + "grad_norm": 0.7818796038627625, + "learning_rate": 8.742800574086443e-05, + "loss": 1.5481, + "step": 1498 + }, + { + "epoch": 0.23103581055063535, + "grad_norm": 0.5833514332771301, + "learning_rate": 8.739590124475148e-05, + "loss": 0.9812, + "step": 1500 + }, + { + "epoch": 0.23103581055063535, + "eval_loss": 2.57389497756958, + "eval_runtime": 736.2798, + "eval_samples_per_second": 2.716, + "eval_steps_per_second": 0.679, + "step": 1500 + }, + { + "epoch": 0.2313438582980362, + "grad_norm": 0.5849636793136597, + "learning_rate": 8.73637617197236e-05, + "loss": 0.9969, + "step": 1502 + }, + { + "epoch": 0.23165190604543703, + "grad_norm": 0.683225154876709, + "learning_rate": 8.733158719588603e-05, + "loss": 1.9737, + "step": 1504 + }, + { + "epoch": 0.2319599537928379, + "grad_norm": 0.6674225330352783, + "learning_rate": 8.729937770337677e-05, + "loss": 1.041, + "step": 1506 + }, + { + "epoch": 0.23226800154023874, + "grad_norm": 0.9080517292022705, + "learning_rate": 8.726713327236666e-05, + "loss": 1.1746, + "step": 1508 + }, + { + "epoch": 0.23257604928763959, + "grad_norm": 0.7534533143043518, + "learning_rate": 8.723485393305915e-05, + "loss": 0.9784, + "step": 1510 + }, + { + "epoch": 0.23288409703504043, + "grad_norm": 1.0462878942489624, + "learning_rate": 8.720253971569047e-05, + "loss": 1.1182, + "step": 1512 + }, + { + "epoch": 0.23319214478244127, + "grad_norm": 0.6824343800544739, + "learning_rate": 8.71701906505295e-05, + "loss": 1.0664, + "step": 1514 + }, + { + "epoch": 0.23350019252984214, + "grad_norm": 0.7235549688339233, + "learning_rate": 8.713780676787777e-05, + "loss": 0.9804, + "step": 1516 + }, + { + "epoch": 0.23380824027724298, + "grad_norm": 0.7442678809165955, + "learning_rate": 8.710538809806939e-05, + "loss": 1.1463, + "step": 1518 + }, + { + "epoch": 0.23411628802464382, + "grad_norm": 0.47301387786865234, + "learning_rate": 8.707293467147109e-05, + "loss": 0.9718, + "step": 1520 + }, + { + "epoch": 0.23442433577204466, + "grad_norm": 0.5563234090805054, + "learning_rate": 8.704044651848215e-05, + "loss": 1.0898, + "step": 1522 + }, + { + "epoch": 0.2347323835194455, + "grad_norm": 0.6864919066429138, + "learning_rate": 8.700792366953436e-05, + "loss": 1.087, + "step": 1524 + }, + { + "epoch": 0.23504043126684637, + "grad_norm": 0.537579357624054, + "learning_rate": 8.697536615509206e-05, + "loss": 0.935, + "step": 1526 + }, + { + "epoch": 0.2353484790142472, + "grad_norm": 0.6670091152191162, + "learning_rate": 8.694277400565198e-05, + "loss": 1.3778, + "step": 1528 + }, + { + "epoch": 0.23565652676164806, + "grad_norm": 0.8170010447502136, + "learning_rate": 8.691014725174337e-05, + "loss": 1.0208, + "step": 1530 + }, + { + "epoch": 0.2359645745090489, + "grad_norm": 0.677753746509552, + "learning_rate": 8.687748592392785e-05, + "loss": 0.9473, + "step": 1532 + }, + { + "epoch": 0.23627262225644974, + "grad_norm": 0.8103045225143433, + "learning_rate": 8.684479005279944e-05, + "loss": 1.124, + "step": 1534 + }, + { + "epoch": 0.2365806700038506, + "grad_norm": 0.5518203377723694, + "learning_rate": 8.681205966898451e-05, + "loss": 0.9217, + "step": 1536 + }, + { + "epoch": 0.23688871775125145, + "grad_norm": 0.7895511388778687, + "learning_rate": 8.677929480314177e-05, + "loss": 0.912, + "step": 1538 + }, + { + "epoch": 0.2371967654986523, + "grad_norm": 0.7995219230651855, + "learning_rate": 8.674649548596221e-05, + "loss": 1.0235, + "step": 1540 + }, + { + "epoch": 0.23750481324605313, + "grad_norm": 0.5603652596473694, + "learning_rate": 8.671366174816913e-05, + "loss": 0.8075, + "step": 1542 + }, + { + "epoch": 0.23781286099345397, + "grad_norm": 0.5374646782875061, + "learning_rate": 8.668079362051802e-05, + "loss": 0.9074, + "step": 1544 + }, + { + "epoch": 0.23812090874085484, + "grad_norm": 0.6922785043716431, + "learning_rate": 8.664789113379661e-05, + "loss": 1.0318, + "step": 1546 + }, + { + "epoch": 0.23842895648825568, + "grad_norm": 0.9918892979621887, + "learning_rate": 8.661495431882483e-05, + "loss": 1.0848, + "step": 1548 + }, + { + "epoch": 0.23873700423565652, + "grad_norm": 0.7347520589828491, + "learning_rate": 8.658198320645473e-05, + "loss": 1.0083, + "step": 1550 + }, + { + "epoch": 0.23904505198305737, + "grad_norm": 0.7262596487998962, + "learning_rate": 8.654897782757051e-05, + "loss": 1.5047, + "step": 1552 + }, + { + "epoch": 0.2393530997304582, + "grad_norm": 0.73700350522995, + "learning_rate": 8.651593821308847e-05, + "loss": 1.1478, + "step": 1554 + }, + { + "epoch": 0.23966114747785908, + "grad_norm": 0.5896843671798706, + "learning_rate": 8.648286439395697e-05, + "loss": 0.9431, + "step": 1556 + }, + { + "epoch": 0.23996919522525992, + "grad_norm": 0.6611576080322266, + "learning_rate": 8.644975640115639e-05, + "loss": 2.0998, + "step": 1558 + }, + { + "epoch": 0.24027724297266076, + "grad_norm": 0.7349005937576294, + "learning_rate": 8.641661426569916e-05, + "loss": 1.1122, + "step": 1560 + }, + { + "epoch": 0.2405852907200616, + "grad_norm": 0.9728613495826721, + "learning_rate": 8.638343801862967e-05, + "loss": 1.2116, + "step": 1562 + }, + { + "epoch": 0.24089333846746247, + "grad_norm": 0.6161354780197144, + "learning_rate": 8.635022769102428e-05, + "loss": 0.9882, + "step": 1564 + }, + { + "epoch": 0.2412013862148633, + "grad_norm": 0.89566570520401, + "learning_rate": 8.631698331399123e-05, + "loss": 1.0898, + "step": 1566 + }, + { + "epoch": 0.24150943396226415, + "grad_norm": 0.5751842260360718, + "learning_rate": 8.628370491867068e-05, + "loss": 0.8537, + "step": 1568 + }, + { + "epoch": 0.241817481709665, + "grad_norm": 0.7686675786972046, + "learning_rate": 8.62503925362347e-05, + "loss": 0.9874, + "step": 1570 + }, + { + "epoch": 0.24212552945706584, + "grad_norm": 0.6984812021255493, + "learning_rate": 8.621704619788711e-05, + "loss": 0.8566, + "step": 1572 + }, + { + "epoch": 0.2424335772044667, + "grad_norm": 0.7423837780952454, + "learning_rate": 8.61836659348636e-05, + "loss": 1.1464, + "step": 1574 + }, + { + "epoch": 0.24274162495186755, + "grad_norm": 0.6439533829689026, + "learning_rate": 8.615025177843163e-05, + "loss": 1.0957, + "step": 1576 + }, + { + "epoch": 0.2430496726992684, + "grad_norm": 0.5935307145118713, + "learning_rate": 8.611680375989038e-05, + "loss": 0.9401, + "step": 1578 + }, + { + "epoch": 0.24335772044666923, + "grad_norm": 0.7962810397148132, + "learning_rate": 8.608332191057076e-05, + "loss": 1.0961, + "step": 1580 + }, + { + "epoch": 0.24366576819407007, + "grad_norm": 0.6294207572937012, + "learning_rate": 8.604980626183536e-05, + "loss": 1.0722, + "step": 1582 + }, + { + "epoch": 0.24397381594147094, + "grad_norm": 0.8451805710792542, + "learning_rate": 8.60162568450785e-05, + "loss": 1.2739, + "step": 1584 + }, + { + "epoch": 0.24428186368887178, + "grad_norm": 0.6558196544647217, + "learning_rate": 8.598267369172603e-05, + "loss": 0.9931, + "step": 1586 + }, + { + "epoch": 0.24458991143627262, + "grad_norm": 0.6440199017524719, + "learning_rate": 8.594905683323544e-05, + "loss": 0.908, + "step": 1588 + }, + { + "epoch": 0.24489795918367346, + "grad_norm": 0.8013085722923279, + "learning_rate": 8.591540630109583e-05, + "loss": 0.9742, + "step": 1590 + }, + { + "epoch": 0.2452060069310743, + "grad_norm": 0.727178156375885, + "learning_rate": 8.588172212682779e-05, + "loss": 2.4691, + "step": 1592 + }, + { + "epoch": 0.24551405467847517, + "grad_norm": 0.9094061851501465, + "learning_rate": 8.584800434198346e-05, + "loss": 1.0531, + "step": 1594 + }, + { + "epoch": 0.24582210242587602, + "grad_norm": 0.6858007311820984, + "learning_rate": 8.581425297814641e-05, + "loss": 1.0649, + "step": 1596 + }, + { + "epoch": 0.24613015017327686, + "grad_norm": 0.5976001024246216, + "learning_rate": 8.578046806693174e-05, + "loss": 0.7737, + "step": 1598 + }, + { + "epoch": 0.2464381979206777, + "grad_norm": 0.6907030344009399, + "learning_rate": 8.57466496399859e-05, + "loss": 0.8375, + "step": 1600 + }, + { + "epoch": 0.24674624566807854, + "grad_norm": 0.7412716746330261, + "learning_rate": 8.571279772898681e-05, + "loss": 1.0501, + "step": 1602 + }, + { + "epoch": 0.2470542934154794, + "grad_norm": 0.7598106861114502, + "learning_rate": 8.567891236564368e-05, + "loss": 1.006, + "step": 1604 + }, + { + "epoch": 0.24736234116288025, + "grad_norm": 0.7194213271141052, + "learning_rate": 8.56449935816971e-05, + "loss": 0.9603, + "step": 1606 + }, + { + "epoch": 0.2476703889102811, + "grad_norm": 0.6847167611122131, + "learning_rate": 8.561104140891894e-05, + "loss": 1.1455, + "step": 1608 + }, + { + "epoch": 0.24797843665768193, + "grad_norm": 0.6343832612037659, + "learning_rate": 8.557705587911238e-05, + "loss": 0.8958, + "step": 1610 + }, + { + "epoch": 0.24828648440508277, + "grad_norm": 0.6502009630203247, + "learning_rate": 8.55430370241118e-05, + "loss": 0.9663, + "step": 1612 + }, + { + "epoch": 0.24859453215248364, + "grad_norm": 0.9204524159431458, + "learning_rate": 8.550898487578282e-05, + "loss": 1.0136, + "step": 1614 + }, + { + "epoch": 0.24890257989988449, + "grad_norm": 0.6391148567199707, + "learning_rate": 8.547489946602227e-05, + "loss": 1.0722, + "step": 1616 + }, + { + "epoch": 0.24921062764728533, + "grad_norm": 0.6553990840911865, + "learning_rate": 8.54407808267581e-05, + "loss": 0.9506, + "step": 1618 + }, + { + "epoch": 0.24951867539468617, + "grad_norm": 0.6417995691299438, + "learning_rate": 8.54066289899494e-05, + "loss": 1.0027, + "step": 1620 + }, + { + "epoch": 0.24982672314208704, + "grad_norm": 0.7358739972114563, + "learning_rate": 8.537244398758636e-05, + "loss": 0.9905, + "step": 1622 + }, + { + "epoch": 0.2501347708894879, + "grad_norm": 0.5125988721847534, + "learning_rate": 8.533822585169022e-05, + "loss": 1.0122, + "step": 1624 + }, + { + "epoch": 0.2504428186368887, + "grad_norm": 0.5452527403831482, + "learning_rate": 8.530397461431325e-05, + "loss": 0.8484, + "step": 1626 + }, + { + "epoch": 0.25075086638428956, + "grad_norm": 0.8188121914863586, + "learning_rate": 8.526969030753879e-05, + "loss": 0.8833, + "step": 1628 + }, + { + "epoch": 0.2510589141316904, + "grad_norm": 0.9002227187156677, + "learning_rate": 8.523537296348107e-05, + "loss": 1.1187, + "step": 1630 + }, + { + "epoch": 0.25136696187909124, + "grad_norm": 0.6643141508102417, + "learning_rate": 8.520102261428534e-05, + "loss": 0.9072, + "step": 1632 + }, + { + "epoch": 0.2516750096264921, + "grad_norm": 0.9124694466590881, + "learning_rate": 8.516663929212769e-05, + "loss": 1.2232, + "step": 1634 + }, + { + "epoch": 0.2519830573738929, + "grad_norm": 0.6632004380226135, + "learning_rate": 8.513222302921517e-05, + "loss": 1.0628, + "step": 1636 + }, + { + "epoch": 0.2522911051212938, + "grad_norm": 0.5880236625671387, + "learning_rate": 8.509777385778565e-05, + "loss": 0.9547, + "step": 1638 + }, + { + "epoch": 0.25259915286869467, + "grad_norm": 0.9797520041465759, + "learning_rate": 8.506329181010781e-05, + "loss": 1.1099, + "step": 1640 + }, + { + "epoch": 0.2529072006160955, + "grad_norm": 0.6555163860321045, + "learning_rate": 8.502877691848117e-05, + "loss": 0.8764, + "step": 1642 + }, + { + "epoch": 0.25321524836349635, + "grad_norm": 0.7387675046920776, + "learning_rate": 8.499422921523596e-05, + "loss": 1.1055, + "step": 1644 + }, + { + "epoch": 0.2535232961108972, + "grad_norm": 1.114042043685913, + "learning_rate": 8.495964873273322e-05, + "loss": 1.285, + "step": 1646 + }, + { + "epoch": 0.25383134385829803, + "grad_norm": 0.9613330364227295, + "learning_rate": 8.492503550336462e-05, + "loss": 1.951, + "step": 1648 + }, + { + "epoch": 0.25413939160569887, + "grad_norm": 0.7614074945449829, + "learning_rate": 8.489038955955251e-05, + "loss": 1.0745, + "step": 1650 + }, + { + "epoch": 0.2544474393530997, + "grad_norm": 0.7193354964256287, + "learning_rate": 8.485571093374995e-05, + "loss": 0.9635, + "step": 1652 + }, + { + "epoch": 0.25475548710050056, + "grad_norm": 0.7330077290534973, + "learning_rate": 8.482099965844056e-05, + "loss": 1.1106, + "step": 1654 + }, + { + "epoch": 0.25506353484790145, + "grad_norm": 0.8675753474235535, + "learning_rate": 8.478625576613853e-05, + "loss": 1.1381, + "step": 1656 + }, + { + "epoch": 0.2553715825953023, + "grad_norm": 0.7048096060752869, + "learning_rate": 8.475147928938866e-05, + "loss": 1.2692, + "step": 1658 + }, + { + "epoch": 0.25567963034270313, + "grad_norm": 0.9633030891418457, + "learning_rate": 8.471667026076621e-05, + "loss": 1.1013, + "step": 1660 + }, + { + "epoch": 0.255987678090104, + "grad_norm": 0.5854815244674683, + "learning_rate": 8.468182871287695e-05, + "loss": 0.8917, + "step": 1662 + }, + { + "epoch": 0.2562957258375048, + "grad_norm": 0.7640011310577393, + "learning_rate": 8.464695467835718e-05, + "loss": 0.9966, + "step": 1664 + }, + { + "epoch": 0.25660377358490566, + "grad_norm": 0.6665043234825134, + "learning_rate": 8.461204818987349e-05, + "loss": 1.7742, + "step": 1666 + }, + { + "epoch": 0.2569118213323065, + "grad_norm": 11.362093925476074, + "learning_rate": 8.457710928012301e-05, + "loss": 1.056, + "step": 1668 + }, + { + "epoch": 0.25721986907970734, + "grad_norm": 0.998767077922821, + "learning_rate": 8.454213798183317e-05, + "loss": 1.1667, + "step": 1670 + }, + { + "epoch": 0.2575279168271082, + "grad_norm": 0.7688530683517456, + "learning_rate": 8.450713432776172e-05, + "loss": 1.0143, + "step": 1672 + }, + { + "epoch": 0.257835964574509, + "grad_norm": 0.8430312275886536, + "learning_rate": 8.447209835069678e-05, + "loss": 0.857, + "step": 1674 + }, + { + "epoch": 0.2581440123219099, + "grad_norm": 0.6447070837020874, + "learning_rate": 8.443703008345669e-05, + "loss": 1.1301, + "step": 1676 + }, + { + "epoch": 0.25845206006931076, + "grad_norm": 0.8711054921150208, + "learning_rate": 8.440192955889006e-05, + "loss": 1.0479, + "step": 1678 + }, + { + "epoch": 0.2587601078167116, + "grad_norm": 0.7437043786048889, + "learning_rate": 8.436679680987571e-05, + "loss": 1.3005, + "step": 1680 + }, + { + "epoch": 0.25906815556411245, + "grad_norm": 0.9241888523101807, + "learning_rate": 8.433163186932268e-05, + "loss": 0.9172, + "step": 1682 + }, + { + "epoch": 0.2593762033115133, + "grad_norm": 2.8876569271087646, + "learning_rate": 8.429643477017011e-05, + "loss": 0.811, + "step": 1684 + }, + { + "epoch": 0.25968425105891413, + "grad_norm": 0.9509261846542358, + "learning_rate": 8.42612055453873e-05, + "loss": 0.9527, + "step": 1686 + }, + { + "epoch": 0.25999229880631497, + "grad_norm": 0.861937403678894, + "learning_rate": 8.42259442279736e-05, + "loss": 1.0687, + "step": 1688 + }, + { + "epoch": 0.2603003465537158, + "grad_norm": 0.7974975109100342, + "learning_rate": 8.419065085095849e-05, + "loss": 0.8891, + "step": 1690 + }, + { + "epoch": 0.26060839430111665, + "grad_norm": 0.6367357969284058, + "learning_rate": 8.41553254474014e-05, + "loss": 1.2569, + "step": 1692 + }, + { + "epoch": 0.2609164420485175, + "grad_norm": 0.749701738357544, + "learning_rate": 8.411996805039184e-05, + "loss": 2.3228, + "step": 1694 + }, + { + "epoch": 0.2612244897959184, + "grad_norm": 0.7712326049804688, + "learning_rate": 8.408457869304923e-05, + "loss": 1.1975, + "step": 1696 + }, + { + "epoch": 0.26153253754331923, + "grad_norm": 0.7463422417640686, + "learning_rate": 8.404915740852292e-05, + "loss": 0.9153, + "step": 1698 + }, + { + "epoch": 0.2618405852907201, + "grad_norm": 0.6337231397628784, + "learning_rate": 8.401370422999224e-05, + "loss": 0.8179, + "step": 1700 + }, + { + "epoch": 0.2621486330381209, + "grad_norm": 0.7018243670463562, + "learning_rate": 8.397821919066632e-05, + "loss": 2.1112, + "step": 1702 + }, + { + "epoch": 0.26245668078552176, + "grad_norm": 0.6725606918334961, + "learning_rate": 8.394270232378419e-05, + "loss": 0.9945, + "step": 1704 + }, + { + "epoch": 0.2627647285329226, + "grad_norm": 0.8441318869590759, + "learning_rate": 8.390715366261461e-05, + "loss": 1.2714, + "step": 1706 + }, + { + "epoch": 0.26307277628032344, + "grad_norm": 1.5008949041366577, + "learning_rate": 8.387157324045623e-05, + "loss": 1.0337, + "step": 1708 + }, + { + "epoch": 0.2633808240277243, + "grad_norm": 0.5705763697624207, + "learning_rate": 8.383596109063736e-05, + "loss": 0.9527, + "step": 1710 + }, + { + "epoch": 0.2636888717751251, + "grad_norm": 0.7076048851013184, + "learning_rate": 8.380031724651608e-05, + "loss": 0.9782, + "step": 1712 + }, + { + "epoch": 0.263996919522526, + "grad_norm": 0.4575868546962738, + "learning_rate": 8.376464174148015e-05, + "loss": 0.8988, + "step": 1714 + }, + { + "epoch": 0.26430496726992686, + "grad_norm": 1.2648346424102783, + "learning_rate": 8.372893460894699e-05, + "loss": 1.0505, + "step": 1716 + }, + { + "epoch": 0.2646130150173277, + "grad_norm": 0.8442276120185852, + "learning_rate": 8.369319588236362e-05, + "loss": 1.4117, + "step": 1718 + }, + { + "epoch": 0.26492106276472854, + "grad_norm": 0.6854913830757141, + "learning_rate": 8.365742559520669e-05, + "loss": 0.8767, + "step": 1720 + }, + { + "epoch": 0.2652291105121294, + "grad_norm": 0.7954949140548706, + "learning_rate": 8.362162378098234e-05, + "loss": 1.1106, + "step": 1722 + }, + { + "epoch": 0.2655371582595302, + "grad_norm": 0.9195547103881836, + "learning_rate": 8.358579047322639e-05, + "loss": 0.9618, + "step": 1724 + }, + { + "epoch": 0.26584520600693107, + "grad_norm": 0.8742767572402954, + "learning_rate": 8.3549925705504e-05, + "loss": 1.1815, + "step": 1726 + }, + { + "epoch": 0.2661532537543319, + "grad_norm": 0.7814459204673767, + "learning_rate": 8.351402951140988e-05, + "loss": 1.2934, + "step": 1728 + }, + { + "epoch": 0.26646130150173275, + "grad_norm": 0.8386595249176025, + "learning_rate": 8.347810192456815e-05, + "loss": 0.9509, + "step": 1730 + }, + { + "epoch": 0.2667693492491336, + "grad_norm": 0.7231822609901428, + "learning_rate": 8.344214297863237e-05, + "loss": 1.1595, + "step": 1732 + }, + { + "epoch": 0.2670773969965345, + "grad_norm": 0.6814515590667725, + "learning_rate": 8.340615270728545e-05, + "loss": 1.0287, + "step": 1734 + }, + { + "epoch": 0.26738544474393533, + "grad_norm": 0.6391722559928894, + "learning_rate": 8.337013114423962e-05, + "loss": 1.2981, + "step": 1736 + }, + { + "epoch": 0.26769349249133617, + "grad_norm": 0.658207893371582, + "learning_rate": 8.333407832323647e-05, + "loss": 1.2991, + "step": 1738 + }, + { + "epoch": 0.268001540238737, + "grad_norm": 0.48568904399871826, + "learning_rate": 8.329799427804683e-05, + "loss": 1.9648, + "step": 1740 + }, + { + "epoch": 0.26830958798613785, + "grad_norm": 0.5600195527076721, + "learning_rate": 8.326187904247083e-05, + "loss": 0.8692, + "step": 1742 + }, + { + "epoch": 0.2686176357335387, + "grad_norm": 0.760915219783783, + "learning_rate": 8.322573265033773e-05, + "loss": 0.9963, + "step": 1744 + }, + { + "epoch": 0.26892568348093954, + "grad_norm": 0.6481974720954895, + "learning_rate": 8.318955513550604e-05, + "loss": 0.79, + "step": 1746 + }, + { + "epoch": 0.2692337312283404, + "grad_norm": 0.5259435176849365, + "learning_rate": 8.315334653186343e-05, + "loss": 0.9599, + "step": 1748 + }, + { + "epoch": 0.2695417789757412, + "grad_norm": 0.967396080493927, + "learning_rate": 8.311710687332665e-05, + "loss": 1.147, + "step": 1750 + }, + { + "epoch": 0.26984982672314206, + "grad_norm": 0.7488315105438232, + "learning_rate": 8.308083619384154e-05, + "loss": 1.0591, + "step": 1752 + }, + { + "epoch": 0.27015787447054296, + "grad_norm": 1.1080056428909302, + "learning_rate": 8.304453452738305e-05, + "loss": 1.2517, + "step": 1754 + }, + { + "epoch": 0.2704659222179438, + "grad_norm": 0.7297698855400085, + "learning_rate": 8.300820190795508e-05, + "loss": 1.1062, + "step": 1756 + }, + { + "epoch": 0.27077396996534464, + "grad_norm": 0.7065379619598389, + "learning_rate": 8.297183836959062e-05, + "loss": 0.9981, + "step": 1758 + }, + { + "epoch": 0.2710820177127455, + "grad_norm": 0.6783261895179749, + "learning_rate": 8.293544394635149e-05, + "loss": 0.9019, + "step": 1760 + }, + { + "epoch": 0.2713900654601463, + "grad_norm": 0.7251535654067993, + "learning_rate": 8.289901867232858e-05, + "loss": 0.9765, + "step": 1762 + }, + { + "epoch": 0.27169811320754716, + "grad_norm": 0.8736194968223572, + "learning_rate": 8.286256258164158e-05, + "loss": 1.1514, + "step": 1764 + }, + { + "epoch": 0.272006160954948, + "grad_norm": 0.6842576861381531, + "learning_rate": 8.28260757084391e-05, + "loss": 1.1712, + "step": 1766 + }, + { + "epoch": 0.27231420870234885, + "grad_norm": 1.0661685466766357, + "learning_rate": 8.278955808689856e-05, + "loss": 1.2, + "step": 1768 + }, + { + "epoch": 0.2726222564497497, + "grad_norm": 0.8276075720787048, + "learning_rate": 8.275300975122618e-05, + "loss": 1.0937, + "step": 1770 + }, + { + "epoch": 0.27293030419715053, + "grad_norm": 0.8004717826843262, + "learning_rate": 8.271643073565695e-05, + "loss": 2.487, + "step": 1772 + }, + { + "epoch": 0.2732383519445514, + "grad_norm": 0.6467103362083435, + "learning_rate": 8.267982107445463e-05, + "loss": 1.0008, + "step": 1774 + }, + { + "epoch": 0.27354639969195227, + "grad_norm": 0.7134955525398254, + "learning_rate": 8.264318080191162e-05, + "loss": 2.025, + "step": 1776 + }, + { + "epoch": 0.2738544474393531, + "grad_norm": 0.8171098828315735, + "learning_rate": 8.260650995234907e-05, + "loss": 1.265, + "step": 1778 + }, + { + "epoch": 0.27416249518675395, + "grad_norm": 0.6574395298957825, + "learning_rate": 8.256980856011672e-05, + "loss": 0.9239, + "step": 1780 + }, + { + "epoch": 0.2744705429341548, + "grad_norm": 0.5392150282859802, + "learning_rate": 8.253307665959293e-05, + "loss": 0.8495, + "step": 1782 + }, + { + "epoch": 0.27477859068155563, + "grad_norm": 0.7107676267623901, + "learning_rate": 8.249631428518465e-05, + "loss": 1.0611, + "step": 1784 + }, + { + "epoch": 0.2750866384289565, + "grad_norm": 0.7121739387512207, + "learning_rate": 8.245952147132736e-05, + "loss": 1.0221, + "step": 1786 + }, + { + "epoch": 0.2753946861763573, + "grad_norm": 0.7235463857650757, + "learning_rate": 8.242269825248509e-05, + "loss": 1.0936, + "step": 1788 + }, + { + "epoch": 0.27570273392375816, + "grad_norm": 0.8050673604011536, + "learning_rate": 8.238584466315027e-05, + "loss": 0.974, + "step": 1790 + }, + { + "epoch": 0.27601078167115906, + "grad_norm": 0.8345509171485901, + "learning_rate": 8.234896073784389e-05, + "loss": 1.991, + "step": 1792 + }, + { + "epoch": 0.2763188294185599, + "grad_norm": 0.6287657022476196, + "learning_rate": 8.231204651111524e-05, + "loss": 1.3361, + "step": 1794 + }, + { + "epoch": 0.27662687716596074, + "grad_norm": 0.6475083827972412, + "learning_rate": 8.227510201754207e-05, + "loss": 1.0319, + "step": 1796 + }, + { + "epoch": 0.2769349249133616, + "grad_norm": 0.4755045771598816, + "learning_rate": 8.223812729173045e-05, + "loss": 2.33, + "step": 1798 + }, + { + "epoch": 0.2772429726607624, + "grad_norm": 0.7601513266563416, + "learning_rate": 8.22011223683148e-05, + "loss": 0.7241, + "step": 1800 + }, + { + "epoch": 0.2772429726607624, + "eval_loss": 2.5164382457733154, + "eval_runtime": 736.9947, + "eval_samples_per_second": 2.714, + "eval_steps_per_second": 0.678, + "step": 1800 + }, + { + "epoch": 0.27755102040816326, + "grad_norm": 0.7659019827842712, + "learning_rate": 8.216408728195779e-05, + "loss": 1.0013, + "step": 1802 + }, + { + "epoch": 0.2778590681555641, + "grad_norm": 0.7980189323425293, + "learning_rate": 8.212702206735036e-05, + "loss": 1.0552, + "step": 1804 + }, + { + "epoch": 0.27816711590296495, + "grad_norm": 0.7149158120155334, + "learning_rate": 8.208992675921166e-05, + "loss": 0.9471, + "step": 1806 + }, + { + "epoch": 0.2784751636503658, + "grad_norm": 0.7278897166252136, + "learning_rate": 8.205280139228906e-05, + "loss": 1.0755, + "step": 1808 + }, + { + "epoch": 0.27878321139776663, + "grad_norm": 0.79888516664505, + "learning_rate": 8.201564600135803e-05, + "loss": 1.0658, + "step": 1810 + }, + { + "epoch": 0.2790912591451675, + "grad_norm": 0.929144561290741, + "learning_rate": 8.197846062122223e-05, + "loss": 1.0584, + "step": 1812 + }, + { + "epoch": 0.27939930689256837, + "grad_norm": 0.8886906504631042, + "learning_rate": 8.194124528671337e-05, + "loss": 1.0032, + "step": 1814 + }, + { + "epoch": 0.2797073546399692, + "grad_norm": 0.8825163245201111, + "learning_rate": 8.190400003269121e-05, + "loss": 0.7641, + "step": 1816 + }, + { + "epoch": 0.28001540238737005, + "grad_norm": 0.9231512546539307, + "learning_rate": 8.186672489404359e-05, + "loss": 1.1552, + "step": 1818 + }, + { + "epoch": 0.2803234501347709, + "grad_norm": 0.7428817749023438, + "learning_rate": 8.182941990568626e-05, + "loss": 1.3535, + "step": 1820 + }, + { + "epoch": 0.28063149788217173, + "grad_norm": 0.6336200833320618, + "learning_rate": 8.179208510256302e-05, + "loss": 0.9099, + "step": 1822 + }, + { + "epoch": 0.2809395456295726, + "grad_norm": 0.8053500056266785, + "learning_rate": 8.175472051964552e-05, + "loss": 1.0487, + "step": 1824 + }, + { + "epoch": 0.2812475933769734, + "grad_norm": 1.5604395866394043, + "learning_rate": 8.171732619193336e-05, + "loss": 0.9007, + "step": 1826 + }, + { + "epoch": 0.28155564112437426, + "grad_norm": 1.509132981300354, + "learning_rate": 8.167990215445395e-05, + "loss": 1.5257, + "step": 1828 + }, + { + "epoch": 0.2818636888717751, + "grad_norm": 0.6117202639579773, + "learning_rate": 8.164244844226261e-05, + "loss": 1.0918, + "step": 1830 + }, + { + "epoch": 0.282171736619176, + "grad_norm": 0.6877596378326416, + "learning_rate": 8.160496509044238e-05, + "loss": 1.0748, + "step": 1832 + }, + { + "epoch": 0.28247978436657684, + "grad_norm": 0.7765544652938843, + "learning_rate": 8.156745213410407e-05, + "loss": 1.1675, + "step": 1834 + }, + { + "epoch": 0.2827878321139777, + "grad_norm": 0.6919207572937012, + "learning_rate": 8.152990960838628e-05, + "loss": 0.981, + "step": 1836 + }, + { + "epoch": 0.2830958798613785, + "grad_norm": 0.7913166284561157, + "learning_rate": 8.149233754845525e-05, + "loss": 0.9699, + "step": 1838 + }, + { + "epoch": 0.28340392760877936, + "grad_norm": 0.7056479454040527, + "learning_rate": 8.145473598950489e-05, + "loss": 1.3239, + "step": 1840 + }, + { + "epoch": 0.2837119753561802, + "grad_norm": 0.8406484723091125, + "learning_rate": 8.141710496675679e-05, + "loss": 1.8384, + "step": 1842 + }, + { + "epoch": 0.28402002310358104, + "grad_norm": 0.518136739730835, + "learning_rate": 8.137944451546007e-05, + "loss": 0.752, + "step": 1844 + }, + { + "epoch": 0.2843280708509819, + "grad_norm": 0.8231174945831299, + "learning_rate": 8.134175467089146e-05, + "loss": 1.0535, + "step": 1846 + }, + { + "epoch": 0.2846361185983827, + "grad_norm": 0.5989777445793152, + "learning_rate": 8.130403546835523e-05, + "loss": 0.7501, + "step": 1848 + }, + { + "epoch": 0.2849441663457836, + "grad_norm": 0.7494628429412842, + "learning_rate": 8.12662869431831e-05, + "loss": 0.8713, + "step": 1850 + }, + { + "epoch": 0.28525221409318446, + "grad_norm": 0.6568158864974976, + "learning_rate": 8.122850913073433e-05, + "loss": 0.925, + "step": 1852 + }, + { + "epoch": 0.2855602618405853, + "grad_norm": 0.7191964983940125, + "learning_rate": 8.119070206639554e-05, + "loss": 1.434, + "step": 1854 + }, + { + "epoch": 0.28586830958798615, + "grad_norm": 0.6533187627792358, + "learning_rate": 8.115286578558081e-05, + "loss": 0.9723, + "step": 1856 + }, + { + "epoch": 0.286176357335387, + "grad_norm": 0.6065565943717957, + "learning_rate": 8.111500032373153e-05, + "loss": 0.9806, + "step": 1858 + }, + { + "epoch": 0.28648440508278783, + "grad_norm": 0.5518906116485596, + "learning_rate": 8.107710571631648e-05, + "loss": 1.0744, + "step": 1860 + }, + { + "epoch": 0.28679245283018867, + "grad_norm": 0.9548470973968506, + "learning_rate": 8.10391819988317e-05, + "loss": 1.1006, + "step": 1862 + }, + { + "epoch": 0.2871005005775895, + "grad_norm": 0.636380136013031, + "learning_rate": 8.100122920680052e-05, + "loss": 1.3742, + "step": 1864 + }, + { + "epoch": 0.28740854832499035, + "grad_norm": 0.8006538152694702, + "learning_rate": 8.09632473757735e-05, + "loss": 0.7724, + "step": 1866 + }, + { + "epoch": 0.2877165960723912, + "grad_norm": 0.7312654852867126, + "learning_rate": 8.09252365413284e-05, + "loss": 0.9995, + "step": 1868 + }, + { + "epoch": 0.2880246438197921, + "grad_norm": 0.8602042198181152, + "learning_rate": 8.088719673907013e-05, + "loss": 1.32, + "step": 1870 + }, + { + "epoch": 0.28833269156719293, + "grad_norm": 0.8657503128051758, + "learning_rate": 8.084912800463076e-05, + "loss": 1.339, + "step": 1872 + }, + { + "epoch": 0.2886407393145938, + "grad_norm": 0.6329157948493958, + "learning_rate": 8.081103037366944e-05, + "loss": 1.0352, + "step": 1874 + }, + { + "epoch": 0.2889487870619946, + "grad_norm": 0.6768485903739929, + "learning_rate": 8.077290388187243e-05, + "loss": 1.0781, + "step": 1876 + }, + { + "epoch": 0.28925683480939546, + "grad_norm": 0.687986433506012, + "learning_rate": 8.073474856495296e-05, + "loss": 1.0905, + "step": 1878 + }, + { + "epoch": 0.2895648825567963, + "grad_norm": 0.7147440910339355, + "learning_rate": 8.06965644586513e-05, + "loss": 0.9371, + "step": 1880 + }, + { + "epoch": 0.28987293030419714, + "grad_norm": 0.7411069273948669, + "learning_rate": 8.06583515987347e-05, + "loss": 0.9088, + "step": 1882 + }, + { + "epoch": 0.290180978051598, + "grad_norm": 0.5603320598602295, + "learning_rate": 8.06201100209973e-05, + "loss": 0.9143, + "step": 1884 + }, + { + "epoch": 0.2904890257989988, + "grad_norm": 0.75848388671875, + "learning_rate": 8.058183976126018e-05, + "loss": 1.1381, + "step": 1886 + }, + { + "epoch": 0.29079707354639966, + "grad_norm": 0.753108561038971, + "learning_rate": 8.054354085537126e-05, + "loss": 1.8448, + "step": 1888 + }, + { + "epoch": 0.29110512129380056, + "grad_norm": 0.6318671107292175, + "learning_rate": 8.05052133392053e-05, + "loss": 1.0209, + "step": 1890 + }, + { + "epoch": 0.2914131690412014, + "grad_norm": 0.7125961780548096, + "learning_rate": 8.046685724866387e-05, + "loss": 0.8734, + "step": 1892 + }, + { + "epoch": 0.29172121678860224, + "grad_norm": 0.7472694516181946, + "learning_rate": 8.042847261967531e-05, + "loss": 1.0617, + "step": 1894 + }, + { + "epoch": 0.2920292645360031, + "grad_norm": 0.6471079587936401, + "learning_rate": 8.039005948819467e-05, + "loss": 0.8885, + "step": 1896 + }, + { + "epoch": 0.2923373122834039, + "grad_norm": 0.8556898832321167, + "learning_rate": 8.03516178902037e-05, + "loss": 0.9897, + "step": 1898 + }, + { + "epoch": 0.29264536003080477, + "grad_norm": 0.6888934373855591, + "learning_rate": 8.031314786171083e-05, + "loss": 0.9881, + "step": 1900 + }, + { + "epoch": 0.2929534077782056, + "grad_norm": 1.0639433860778809, + "learning_rate": 8.027464943875113e-05, + "loss": 1.2831, + "step": 1902 + }, + { + "epoch": 0.29326145552560645, + "grad_norm": 0.7208219766616821, + "learning_rate": 8.023612265738624e-05, + "loss": 1.0755, + "step": 1904 + }, + { + "epoch": 0.2935695032730073, + "grad_norm": 0.6373528242111206, + "learning_rate": 8.019756755370437e-05, + "loss": 1.0752, + "step": 1906 + }, + { + "epoch": 0.2938775510204082, + "grad_norm": 0.9459060430526733, + "learning_rate": 8.015898416382026e-05, + "loss": 1.0827, + "step": 1908 + }, + { + "epoch": 0.29418559876780903, + "grad_norm": 0.8489103317260742, + "learning_rate": 8.012037252387518e-05, + "loss": 0.888, + "step": 1910 + }, + { + "epoch": 0.2944936465152099, + "grad_norm": 0.685509979724884, + "learning_rate": 8.00817326700368e-05, + "loss": 0.952, + "step": 1912 + }, + { + "epoch": 0.2948016942626107, + "grad_norm": 0.8169518113136292, + "learning_rate": 8.004306463849927e-05, + "loss": 0.8875, + "step": 1914 + }, + { + "epoch": 0.29510974201001156, + "grad_norm": 0.604669451713562, + "learning_rate": 8.000436846548314e-05, + "loss": 0.9481, + "step": 1916 + }, + { + "epoch": 0.2954177897574124, + "grad_norm": 0.7678325772285461, + "learning_rate": 7.996564418723522e-05, + "loss": 1.3087, + "step": 1918 + }, + { + "epoch": 0.29572583750481324, + "grad_norm": 0.7700533270835876, + "learning_rate": 7.99268918400288e-05, + "loss": 0.9033, + "step": 1920 + }, + { + "epoch": 0.2960338852522141, + "grad_norm": 0.5778908729553223, + "learning_rate": 7.988811146016336e-05, + "loss": 0.9179, + "step": 1922 + }, + { + "epoch": 0.2963419329996149, + "grad_norm": 0.5745398998260498, + "learning_rate": 7.984930308396464e-05, + "loss": 0.8747, + "step": 1924 + }, + { + "epoch": 0.29664998074701576, + "grad_norm": 0.9561242461204529, + "learning_rate": 7.981046674778462e-05, + "loss": 1.2659, + "step": 1926 + }, + { + "epoch": 0.29695802849441666, + "grad_norm": 0.8658321499824524, + "learning_rate": 7.977160248800152e-05, + "loss": 0.9769, + "step": 1928 + }, + { + "epoch": 0.2972660762418175, + "grad_norm": 0.8289058804512024, + "learning_rate": 7.973271034101966e-05, + "loss": 1.0655, + "step": 1930 + }, + { + "epoch": 0.29757412398921834, + "grad_norm": 0.802101194858551, + "learning_rate": 7.969379034326949e-05, + "loss": 1.1368, + "step": 1932 + }, + { + "epoch": 0.2978821717366192, + "grad_norm": 0.8443379402160645, + "learning_rate": 7.965484253120754e-05, + "loss": 1.0558, + "step": 1934 + }, + { + "epoch": 0.29819021948402, + "grad_norm": 0.7214971780776978, + "learning_rate": 7.961586694131643e-05, + "loss": 1.4491, + "step": 1936 + }, + { + "epoch": 0.29849826723142087, + "grad_norm": 0.7081072926521301, + "learning_rate": 7.957686361010475e-05, + "loss": 0.9587, + "step": 1938 + }, + { + "epoch": 0.2988063149788217, + "grad_norm": 0.6165660619735718, + "learning_rate": 7.953783257410713e-05, + "loss": 0.9157, + "step": 1940 + }, + { + "epoch": 0.29911436272622255, + "grad_norm": 0.7237483263015747, + "learning_rate": 7.94987738698841e-05, + "loss": 1.1032, + "step": 1942 + }, + { + "epoch": 0.2994224104736234, + "grad_norm": 0.90876704454422, + "learning_rate": 7.945968753402216e-05, + "loss": 1.1303, + "step": 1944 + }, + { + "epoch": 0.29973045822102423, + "grad_norm": 0.6631532907485962, + "learning_rate": 7.942057360313361e-05, + "loss": 1.3403, + "step": 1946 + }, + { + "epoch": 0.30003850596842513, + "grad_norm": 0.691525936126709, + "learning_rate": 7.938143211385672e-05, + "loss": 1.2834, + "step": 1948 + }, + { + "epoch": 0.30034655371582597, + "grad_norm": 0.8195240497589111, + "learning_rate": 7.934226310285543e-05, + "loss": 1.1512, + "step": 1950 + }, + { + "epoch": 0.3006546014632268, + "grad_norm": 0.6079697608947754, + "learning_rate": 7.930306660681961e-05, + "loss": 0.9393, + "step": 1952 + }, + { + "epoch": 0.30096264921062765, + "grad_norm": 0.6469001173973083, + "learning_rate": 7.926384266246477e-05, + "loss": 1.0557, + "step": 1954 + }, + { + "epoch": 0.3012706969580285, + "grad_norm": 0.5937241315841675, + "learning_rate": 7.922459130653213e-05, + "loss": 1.1243, + "step": 1956 + }, + { + "epoch": 0.30157874470542934, + "grad_norm": 0.7131739258766174, + "learning_rate": 7.918531257578865e-05, + "loss": 0.8834, + "step": 1958 + }, + { + "epoch": 0.3018867924528302, + "grad_norm": 0.8835099339485168, + "learning_rate": 7.914600650702691e-05, + "loss": 2.3994, + "step": 1960 + }, + { + "epoch": 0.302194840200231, + "grad_norm": 0.6994355916976929, + "learning_rate": 7.910667313706506e-05, + "loss": 0.9667, + "step": 1962 + }, + { + "epoch": 0.30250288794763186, + "grad_norm": 0.6864457130432129, + "learning_rate": 7.906731250274687e-05, + "loss": 0.9741, + "step": 1964 + }, + { + "epoch": 0.30281093569503276, + "grad_norm": 0.5470225214958191, + "learning_rate": 7.902792464094163e-05, + "loss": 0.9114, + "step": 1966 + }, + { + "epoch": 0.3031189834424336, + "grad_norm": 0.7735422253608704, + "learning_rate": 7.898850958854412e-05, + "loss": 1.099, + "step": 1968 + }, + { + "epoch": 0.30342703118983444, + "grad_norm": 0.7568907737731934, + "learning_rate": 7.89490673824746e-05, + "loss": 0.9845, + "step": 1970 + }, + { + "epoch": 0.3037350789372353, + "grad_norm": 0.6680666208267212, + "learning_rate": 7.890959805967879e-05, + "loss": 1.0772, + "step": 1972 + }, + { + "epoch": 0.3040431266846361, + "grad_norm": 0.6780060529708862, + "learning_rate": 7.887010165712778e-05, + "loss": 0.9617, + "step": 1974 + }, + { + "epoch": 0.30435117443203696, + "grad_norm": 0.5781919360160828, + "learning_rate": 7.883057821181803e-05, + "loss": 0.9055, + "step": 1976 + }, + { + "epoch": 0.3046592221794378, + "grad_norm": 0.8196890354156494, + "learning_rate": 7.879102776077131e-05, + "loss": 1.0094, + "step": 1978 + }, + { + "epoch": 0.30496726992683865, + "grad_norm": 0.9342995285987854, + "learning_rate": 7.875145034103479e-05, + "loss": 1.1311, + "step": 1980 + }, + { + "epoch": 0.3052753176742395, + "grad_norm": 0.8569959998130798, + "learning_rate": 7.871184598968073e-05, + "loss": 1.0934, + "step": 1982 + }, + { + "epoch": 0.30558336542164033, + "grad_norm": 0.6865665912628174, + "learning_rate": 7.867221474380677e-05, + "loss": 0.9035, + "step": 1984 + }, + { + "epoch": 0.3058914131690412, + "grad_norm": 1.1311630010604858, + "learning_rate": 7.863255664053566e-05, + "loss": 1.0764, + "step": 1986 + }, + { + "epoch": 0.30619946091644207, + "grad_norm": 0.7756600975990295, + "learning_rate": 7.859287171701534e-05, + "loss": 1.0406, + "step": 1988 + }, + { + "epoch": 0.3065075086638429, + "grad_norm": 0.6949130296707153, + "learning_rate": 7.855316001041886e-05, + "loss": 0.8968, + "step": 1990 + }, + { + "epoch": 0.30681555641124375, + "grad_norm": 0.6985813975334167, + "learning_rate": 7.851342155794434e-05, + "loss": 1.0365, + "step": 1992 + }, + { + "epoch": 0.3071236041586446, + "grad_norm": 0.4934740364551544, + "learning_rate": 7.847365639681501e-05, + "loss": 0.9828, + "step": 1994 + }, + { + "epoch": 0.30743165190604543, + "grad_norm": 0.7085347771644592, + "learning_rate": 7.843386456427905e-05, + "loss": 0.9736, + "step": 1996 + }, + { + "epoch": 0.3077396996534463, + "grad_norm": 0.6113731265068054, + "learning_rate": 7.839404609760969e-05, + "loss": 1.1267, + "step": 1998 + }, + { + "epoch": 0.3080477474008471, + "grad_norm": 0.720635712146759, + "learning_rate": 7.835420103410504e-05, + "loss": 0.9767, + "step": 2000 + }, + { + "epoch": 0.30835579514824796, + "grad_norm": 1.2781391143798828, + "learning_rate": 7.831432941108818e-05, + "loss": 0.9985, + "step": 2002 + }, + { + "epoch": 0.3086638428956488, + "grad_norm": 0.6825604438781738, + "learning_rate": 7.827443126590701e-05, + "loss": 1.0275, + "step": 2004 + }, + { + "epoch": 0.3089718906430497, + "grad_norm": 0.9183567762374878, + "learning_rate": 7.823450663593435e-05, + "loss": 1.1826, + "step": 2006 + }, + { + "epoch": 0.30927993839045054, + "grad_norm": 0.8156041502952576, + "learning_rate": 7.819455555856777e-05, + "loss": 0.824, + "step": 2008 + }, + { + "epoch": 0.3095879861378514, + "grad_norm": 0.7667585611343384, + "learning_rate": 7.815457807122962e-05, + "loss": 0.9641, + "step": 2010 + }, + { + "epoch": 0.3098960338852522, + "grad_norm": 0.7487335801124573, + "learning_rate": 7.8114574211367e-05, + "loss": 1.1025, + "step": 2012 + }, + { + "epoch": 0.31020408163265306, + "grad_norm": 0.7451789975166321, + "learning_rate": 7.807454401645174e-05, + "loss": 0.8889, + "step": 2014 + }, + { + "epoch": 0.3105121293800539, + "grad_norm": 0.7183794975280762, + "learning_rate": 7.80344875239803e-05, + "loss": 0.9154, + "step": 2016 + }, + { + "epoch": 0.31082017712745474, + "grad_norm": 0.5591393709182739, + "learning_rate": 7.799440477147376e-05, + "loss": 1.2368, + "step": 2018 + }, + { + "epoch": 0.3111282248748556, + "grad_norm": 0.7356299757957458, + "learning_rate": 7.795429579647781e-05, + "loss": 1.0495, + "step": 2020 + }, + { + "epoch": 0.3114362726222564, + "grad_norm": 0.8593969345092773, + "learning_rate": 7.791416063656277e-05, + "loss": 1.0617, + "step": 2022 + }, + { + "epoch": 0.3117443203696573, + "grad_norm": 0.8033467531204224, + "learning_rate": 7.787399932932337e-05, + "loss": 1.9187, + "step": 2024 + }, + { + "epoch": 0.31205236811705817, + "grad_norm": 0.7294034361839294, + "learning_rate": 7.783381191237895e-05, + "loss": 0.9414, + "step": 2026 + }, + { + "epoch": 0.312360415864459, + "grad_norm": 0.7499777674674988, + "learning_rate": 7.779359842337321e-05, + "loss": 0.9633, + "step": 2028 + }, + { + "epoch": 0.31266846361185985, + "grad_norm": 0.9022547006607056, + "learning_rate": 7.775335889997435e-05, + "loss": 1.0092, + "step": 2030 + }, + { + "epoch": 0.3129765113592607, + "grad_norm": 0.5967230200767517, + "learning_rate": 7.771309337987487e-05, + "loss": 1.0805, + "step": 2032 + }, + { + "epoch": 0.31328455910666153, + "grad_norm": 0.5884339809417725, + "learning_rate": 7.76728019007917e-05, + "loss": 1.0213, + "step": 2034 + }, + { + "epoch": 0.3135926068540624, + "grad_norm": 0.7071999311447144, + "learning_rate": 7.763248450046605e-05, + "loss": 1.0096, + "step": 2036 + }, + { + "epoch": 0.3139006546014632, + "grad_norm": 0.7165465950965881, + "learning_rate": 7.759214121666343e-05, + "loss": 2.1081, + "step": 2038 + }, + { + "epoch": 0.31420870234886406, + "grad_norm": 0.6906457543373108, + "learning_rate": 7.755177208717356e-05, + "loss": 1.1053, + "step": 2040 + }, + { + "epoch": 0.3145167500962649, + "grad_norm": 0.7363939881324768, + "learning_rate": 7.75113771498104e-05, + "loss": 0.8927, + "step": 2042 + }, + { + "epoch": 0.3148247978436658, + "grad_norm": 0.6364469528198242, + "learning_rate": 7.747095644241209e-05, + "loss": 0.823, + "step": 2044 + }, + { + "epoch": 0.31513284559106663, + "grad_norm": 0.6343603134155273, + "learning_rate": 7.743051000284087e-05, + "loss": 2.1096, + "step": 2046 + }, + { + "epoch": 0.3154408933384675, + "grad_norm": 0.7235949039459229, + "learning_rate": 7.739003786898314e-05, + "loss": 1.0984, + "step": 2048 + }, + { + "epoch": 0.3157489410858683, + "grad_norm": 0.7274911403656006, + "learning_rate": 7.734954007874931e-05, + "loss": 1.1339, + "step": 2050 + }, + { + "epoch": 0.31605698883326916, + "grad_norm": 0.7011663913726807, + "learning_rate": 7.730901667007384e-05, + "loss": 1.0374, + "step": 2052 + }, + { + "epoch": 0.31636503658067, + "grad_norm": 0.6921355724334717, + "learning_rate": 7.726846768091523e-05, + "loss": 1.2023, + "step": 2054 + }, + { + "epoch": 0.31667308432807084, + "grad_norm": 1.0013788938522339, + "learning_rate": 7.722789314925589e-05, + "loss": 1.2035, + "step": 2056 + }, + { + "epoch": 0.3169811320754717, + "grad_norm": 0.7735009789466858, + "learning_rate": 7.718729311310215e-05, + "loss": 0.948, + "step": 2058 + }, + { + "epoch": 0.3172891798228725, + "grad_norm": 0.9819526076316833, + "learning_rate": 7.71466676104843e-05, + "loss": 1.1703, + "step": 2060 + }, + { + "epoch": 0.31759722757027337, + "grad_norm": 0.7479535937309265, + "learning_rate": 7.71060166794564e-05, + "loss": 0.9304, + "step": 2062 + }, + { + "epoch": 0.31790527531767426, + "grad_norm": 0.6823731660842896, + "learning_rate": 7.70653403580964e-05, + "loss": 0.9511, + "step": 2064 + }, + { + "epoch": 0.3182133230650751, + "grad_norm": 0.8437545895576477, + "learning_rate": 7.702463868450596e-05, + "loss": 0.9972, + "step": 2066 + }, + { + "epoch": 0.31852137081247595, + "grad_norm": 1.0784193277359009, + "learning_rate": 7.698391169681055e-05, + "loss": 0.9541, + "step": 2068 + }, + { + "epoch": 0.3188294185598768, + "grad_norm": 0.7519109845161438, + "learning_rate": 7.694315943315933e-05, + "loss": 1.0247, + "step": 2070 + }, + { + "epoch": 0.31913746630727763, + "grad_norm": 0.9027669429779053, + "learning_rate": 7.690238193172511e-05, + "loss": 0.9814, + "step": 2072 + }, + { + "epoch": 0.31944551405467847, + "grad_norm": 0.704380989074707, + "learning_rate": 7.686157923070442e-05, + "loss": 1.0821, + "step": 2074 + }, + { + "epoch": 0.3197535618020793, + "grad_norm": 0.7209782004356384, + "learning_rate": 7.68207513683173e-05, + "loss": 1.1074, + "step": 2076 + }, + { + "epoch": 0.32006160954948015, + "grad_norm": 0.9780352115631104, + "learning_rate": 7.677989838280739e-05, + "loss": 0.995, + "step": 2078 + }, + { + "epoch": 0.320369657296881, + "grad_norm": 0.6577726602554321, + "learning_rate": 7.673902031244189e-05, + "loss": 0.8098, + "step": 2080 + }, + { + "epoch": 0.3206777050442819, + "grad_norm": 0.8020167350769043, + "learning_rate": 7.669811719551149e-05, + "loss": 1.0673, + "step": 2082 + }, + { + "epoch": 0.32098575279168273, + "grad_norm": 0.5862478613853455, + "learning_rate": 7.665718907033031e-05, + "loss": 0.9923, + "step": 2084 + }, + { + "epoch": 0.3212938005390836, + "grad_norm": 0.6254997849464417, + "learning_rate": 7.661623597523592e-05, + "loss": 1.0672, + "step": 2086 + }, + { + "epoch": 0.3216018482864844, + "grad_norm": 0.6177172064781189, + "learning_rate": 7.657525794858926e-05, + "loss": 1.9986, + "step": 2088 + }, + { + "epoch": 0.32190989603388526, + "grad_norm": 0.7840576171875, + "learning_rate": 7.653425502877469e-05, + "loss": 1.025, + "step": 2090 + }, + { + "epoch": 0.3222179437812861, + "grad_norm": 0.7192623019218445, + "learning_rate": 7.649322725419977e-05, + "loss": 0.9782, + "step": 2092 + }, + { + "epoch": 0.32252599152868694, + "grad_norm": 0.7550196051597595, + "learning_rate": 7.645217466329546e-05, + "loss": 1.0586, + "step": 2094 + }, + { + "epoch": 0.3228340392760878, + "grad_norm": 0.8093752264976501, + "learning_rate": 7.641109729451588e-05, + "loss": 1.2799, + "step": 2096 + }, + { + "epoch": 0.3231420870234886, + "grad_norm": 0.6246629953384399, + "learning_rate": 7.636999518633841e-05, + "loss": 1.1575, + "step": 2098 + }, + { + "epoch": 0.32345013477088946, + "grad_norm": 0.9272355437278748, + "learning_rate": 7.632886837726359e-05, + "loss": 1.0441, + "step": 2100 + }, + { + "epoch": 0.32345013477088946, + "eval_loss": 2.473278045654297, + "eval_runtime": 736.1878, + "eval_samples_per_second": 2.717, + "eval_steps_per_second": 0.679, + "step": 2100 + }, + { + "epoch": 0.32375818251829036, + "grad_norm": 0.7700662612915039, + "learning_rate": 7.628771690581508e-05, + "loss": 1.0005, + "step": 2102 + }, + { + "epoch": 0.3240662302656912, + "grad_norm": 0.9488754272460938, + "learning_rate": 7.624654081053966e-05, + "loss": 1.0601, + "step": 2104 + }, + { + "epoch": 0.32437427801309204, + "grad_norm": 0.566880464553833, + "learning_rate": 7.620534013000716e-05, + "loss": 0.8569, + "step": 2106 + }, + { + "epoch": 0.3246823257604929, + "grad_norm": 0.8669172525405884, + "learning_rate": 7.616411490281048e-05, + "loss": 1.013, + "step": 2108 + }, + { + "epoch": 0.3249903735078937, + "grad_norm": 0.9163511991500854, + "learning_rate": 7.612286516756544e-05, + "loss": 0.9347, + "step": 2110 + }, + { + "epoch": 0.32529842125529457, + "grad_norm": 0.5301980376243591, + "learning_rate": 7.60815909629109e-05, + "loss": 0.8109, + "step": 2112 + }, + { + "epoch": 0.3256064690026954, + "grad_norm": 0.6251680254936218, + "learning_rate": 7.604029232750858e-05, + "loss": 1.0528, + "step": 2114 + }, + { + "epoch": 0.32591451675009625, + "grad_norm": 0.7380292415618896, + "learning_rate": 7.599896930004309e-05, + "loss": 1.1757, + "step": 2116 + }, + { + "epoch": 0.3262225644974971, + "grad_norm": 0.6980268955230713, + "learning_rate": 7.595762191922192e-05, + "loss": 1.5307, + "step": 2118 + }, + { + "epoch": 0.32653061224489793, + "grad_norm": 0.580763041973114, + "learning_rate": 7.591625022377537e-05, + "loss": 0.9236, + "step": 2120 + }, + { + "epoch": 0.32683865999229883, + "grad_norm": 0.6827017664909363, + "learning_rate": 7.587485425245648e-05, + "loss": 1.0231, + "step": 2122 + }, + { + "epoch": 0.32714670773969967, + "grad_norm": 0.7438657283782959, + "learning_rate": 7.583343404404104e-05, + "loss": 1.1546, + "step": 2124 + }, + { + "epoch": 0.3274547554871005, + "grad_norm": 0.6554436683654785, + "learning_rate": 7.579198963732756e-05, + "loss": 0.9518, + "step": 2126 + }, + { + "epoch": 0.32776280323450135, + "grad_norm": 0.6709740161895752, + "learning_rate": 7.575052107113722e-05, + "loss": 1.3103, + "step": 2128 + }, + { + "epoch": 0.3280708509819022, + "grad_norm": 0.6320585012435913, + "learning_rate": 7.570902838431382e-05, + "loss": 1.0456, + "step": 2130 + }, + { + "epoch": 0.32837889872930304, + "grad_norm": 0.6538925170898438, + "learning_rate": 7.566751161572372e-05, + "loss": 0.9174, + "step": 2132 + }, + { + "epoch": 0.3286869464767039, + "grad_norm": 0.6235430836677551, + "learning_rate": 7.562597080425592e-05, + "loss": 0.9084, + "step": 2134 + }, + { + "epoch": 0.3289949942241047, + "grad_norm": 0.6424623131752014, + "learning_rate": 7.558440598882185e-05, + "loss": 0.9133, + "step": 2136 + }, + { + "epoch": 0.32930304197150556, + "grad_norm": 0.8305116295814514, + "learning_rate": 7.554281720835549e-05, + "loss": 1.0305, + "step": 2138 + }, + { + "epoch": 0.32961108971890646, + "grad_norm": 0.7598525285720825, + "learning_rate": 7.550120450181324e-05, + "loss": 0.8613, + "step": 2140 + }, + { + "epoch": 0.3299191374663073, + "grad_norm": 0.9990943670272827, + "learning_rate": 7.545956790817391e-05, + "loss": 0.7471, + "step": 2142 + }, + { + "epoch": 0.33022718521370814, + "grad_norm": 0.5743617415428162, + "learning_rate": 7.54179074664387e-05, + "loss": 0.8861, + "step": 2144 + }, + { + "epoch": 0.330535232961109, + "grad_norm": 0.6631594300270081, + "learning_rate": 7.537622321563114e-05, + "loss": 1.039, + "step": 2146 + }, + { + "epoch": 0.3308432807085098, + "grad_norm": 0.7509470582008362, + "learning_rate": 7.533451519479704e-05, + "loss": 1.086, + "step": 2148 + }, + { + "epoch": 0.33115132845591067, + "grad_norm": 0.7622755765914917, + "learning_rate": 7.529278344300452e-05, + "loss": 1.1007, + "step": 2150 + }, + { + "epoch": 0.3314593762033115, + "grad_norm": 0.5702837109565735, + "learning_rate": 7.525102799934392e-05, + "loss": 0.9004, + "step": 2152 + }, + { + "epoch": 0.33176742395071235, + "grad_norm": 0.6059790849685669, + "learning_rate": 7.52092489029277e-05, + "loss": 0.9381, + "step": 2154 + }, + { + "epoch": 0.3320754716981132, + "grad_norm": 0.9663304686546326, + "learning_rate": 7.51674461928906e-05, + "loss": 1.4107, + "step": 2156 + }, + { + "epoch": 0.33238351944551403, + "grad_norm": 0.641952633857727, + "learning_rate": 7.512561990838937e-05, + "loss": 1.3133, + "step": 2158 + }, + { + "epoch": 0.3326915671929149, + "grad_norm": 0.7226306200027466, + "learning_rate": 7.508377008860294e-05, + "loss": 1.0616, + "step": 2160 + }, + { + "epoch": 0.33299961494031577, + "grad_norm": 0.6401228904724121, + "learning_rate": 7.504189677273217e-05, + "loss": 1.0341, + "step": 2162 + }, + { + "epoch": 0.3333076626877166, + "grad_norm": 0.9536916017532349, + "learning_rate": 7.500000000000001e-05, + "loss": 0.8679, + "step": 2164 + }, + { + "epoch": 0.33361571043511745, + "grad_norm": 0.9569481015205383, + "learning_rate": 7.495807980965137e-05, + "loss": 0.9996, + "step": 2166 + }, + { + "epoch": 0.3339237581825183, + "grad_norm": 0.6950173377990723, + "learning_rate": 7.491613624095307e-05, + "loss": 1.0641, + "step": 2168 + }, + { + "epoch": 0.33423180592991913, + "grad_norm": 0.730665385723114, + "learning_rate": 7.487416933319389e-05, + "loss": 0.9797, + "step": 2170 + }, + { + "epoch": 0.33453985367732, + "grad_norm": 0.7115023732185364, + "learning_rate": 7.483217912568437e-05, + "loss": 1.022, + "step": 2172 + }, + { + "epoch": 0.3348479014247208, + "grad_norm": 0.6663245558738708, + "learning_rate": 7.479016565775697e-05, + "loss": 1.0022, + "step": 2174 + }, + { + "epoch": 0.33515594917212166, + "grad_norm": 0.7663930654525757, + "learning_rate": 7.474812896876588e-05, + "loss": 1.0008, + "step": 2176 + }, + { + "epoch": 0.3354639969195225, + "grad_norm": 0.5541350245475769, + "learning_rate": 7.47060690980871e-05, + "loss": 1.1907, + "step": 2178 + }, + { + "epoch": 0.3357720446669234, + "grad_norm": 0.654722273349762, + "learning_rate": 7.466398608511826e-05, + "loss": 0.9735, + "step": 2180 + }, + { + "epoch": 0.33608009241432424, + "grad_norm": 0.6130832433700562, + "learning_rate": 7.462187996927873e-05, + "loss": 1.0976, + "step": 2182 + }, + { + "epoch": 0.3363881401617251, + "grad_norm": 0.822195827960968, + "learning_rate": 7.457975079000954e-05, + "loss": 0.9524, + "step": 2184 + }, + { + "epoch": 0.3366961879091259, + "grad_norm": 0.6369136571884155, + "learning_rate": 7.453759858677324e-05, + "loss": 1.3212, + "step": 2186 + }, + { + "epoch": 0.33700423565652676, + "grad_norm": 0.7983855605125427, + "learning_rate": 7.449542339905401e-05, + "loss": 0.8971, + "step": 2188 + }, + { + "epoch": 0.3373122834039276, + "grad_norm": 0.6447656154632568, + "learning_rate": 7.445322526635756e-05, + "loss": 0.9881, + "step": 2190 + }, + { + "epoch": 0.33762033115132845, + "grad_norm": 0.7861150503158569, + "learning_rate": 7.441100422821107e-05, + "loss": 1.8758, + "step": 2192 + }, + { + "epoch": 0.3379283788987293, + "grad_norm": 0.6880084872245789, + "learning_rate": 7.436876032416317e-05, + "loss": 0.799, + "step": 2194 + }, + { + "epoch": 0.33823642664613013, + "grad_norm": 0.742363691329956, + "learning_rate": 7.432649359378393e-05, + "loss": 0.8949, + "step": 2196 + }, + { + "epoch": 0.33854447439353097, + "grad_norm": 0.7017818689346313, + "learning_rate": 7.42842040766648e-05, + "loss": 0.9626, + "step": 2198 + }, + { + "epoch": 0.33885252214093187, + "grad_norm": 0.6897172927856445, + "learning_rate": 7.424189181241856e-05, + "loss": 1.1507, + "step": 2200 + }, + { + "epoch": 0.3391605698883327, + "grad_norm": 0.6582860946655273, + "learning_rate": 7.419955684067929e-05, + "loss": 0.822, + "step": 2202 + }, + { + "epoch": 0.33946861763573355, + "grad_norm": 0.7101274728775024, + "learning_rate": 7.41571992011024e-05, + "loss": 1.0288, + "step": 2204 + }, + { + "epoch": 0.3397766653831344, + "grad_norm": 0.702109158039093, + "learning_rate": 7.411481893336446e-05, + "loss": 2.2937, + "step": 2206 + }, + { + "epoch": 0.34008471313053523, + "grad_norm": 0.756034255027771, + "learning_rate": 7.407241607716326e-05, + "loss": 0.9363, + "step": 2208 + }, + { + "epoch": 0.3403927608779361, + "grad_norm": 0.6211479902267456, + "learning_rate": 7.402999067221777e-05, + "loss": 0.7543, + "step": 2210 + }, + { + "epoch": 0.3407008086253369, + "grad_norm": 0.8365891575813293, + "learning_rate": 7.398754275826801e-05, + "loss": 1.1156, + "step": 2212 + }, + { + "epoch": 0.34100885637273776, + "grad_norm": 0.824151873588562, + "learning_rate": 7.394507237507522e-05, + "loss": 1.0464, + "step": 2214 + }, + { + "epoch": 0.3413169041201386, + "grad_norm": 0.9308121204376221, + "learning_rate": 7.390257956242154e-05, + "loss": 1.1058, + "step": 2216 + }, + { + "epoch": 0.3416249518675395, + "grad_norm": 0.6541286110877991, + "learning_rate": 7.386006436011026e-05, + "loss": 1.2529, + "step": 2218 + }, + { + "epoch": 0.34193299961494034, + "grad_norm": 0.611873209476471, + "learning_rate": 7.381752680796547e-05, + "loss": 0.7557, + "step": 2220 + }, + { + "epoch": 0.3422410473623412, + "grad_norm": 0.8098903894424438, + "learning_rate": 7.377496694583237e-05, + "loss": 2.2155, + "step": 2222 + }, + { + "epoch": 0.342549095109742, + "grad_norm": 0.6516982316970825, + "learning_rate": 7.373238481357696e-05, + "loss": 0.9124, + "step": 2224 + }, + { + "epoch": 0.34285714285714286, + "grad_norm": 0.5909575819969177, + "learning_rate": 7.36897804510861e-05, + "loss": 1.0173, + "step": 2226 + }, + { + "epoch": 0.3431651906045437, + "grad_norm": 0.6586790084838867, + "learning_rate": 7.364715389826752e-05, + "loss": 1.126, + "step": 2228 + }, + { + "epoch": 0.34347323835194454, + "grad_norm": 0.8317116498947144, + "learning_rate": 7.360450519504972e-05, + "loss": 1.014, + "step": 2230 + }, + { + "epoch": 0.3437812860993454, + "grad_norm": 0.7233576774597168, + "learning_rate": 7.35618343813819e-05, + "loss": 1.0308, + "step": 2232 + }, + { + "epoch": 0.3440893338467462, + "grad_norm": 0.8280635476112366, + "learning_rate": 7.351914149723404e-05, + "loss": 0.9603, + "step": 2234 + }, + { + "epoch": 0.34439738159414707, + "grad_norm": 0.6424171924591064, + "learning_rate": 7.347642658259675e-05, + "loss": 1.0437, + "step": 2236 + }, + { + "epoch": 0.34470542934154796, + "grad_norm": 0.6725434064865112, + "learning_rate": 7.343368967748129e-05, + "loss": 0.9262, + "step": 2238 + }, + { + "epoch": 0.3450134770889488, + "grad_norm": 0.8361773490905762, + "learning_rate": 7.339093082191953e-05, + "loss": 1.1557, + "step": 2240 + }, + { + "epoch": 0.34532152483634965, + "grad_norm": 0.9322282075881958, + "learning_rate": 7.334815005596387e-05, + "loss": 0.9511, + "step": 2242 + }, + { + "epoch": 0.3456295725837505, + "grad_norm": 0.5928962230682373, + "learning_rate": 7.330534741968729e-05, + "loss": 0.9221, + "step": 2244 + }, + { + "epoch": 0.34593762033115133, + "grad_norm": 0.8262036442756653, + "learning_rate": 7.326252295318318e-05, + "loss": 1.0763, + "step": 2246 + }, + { + "epoch": 0.34624566807855217, + "grad_norm": 0.6837438941001892, + "learning_rate": 7.321967669656545e-05, + "loss": 1.0927, + "step": 2248 + }, + { + "epoch": 0.346553715825953, + "grad_norm": 0.9353243708610535, + "learning_rate": 7.317680868996833e-05, + "loss": 0.9783, + "step": 2250 + }, + { + "epoch": 0.34686176357335385, + "grad_norm": 0.9607541561126709, + "learning_rate": 7.313391897354654e-05, + "loss": 1.081, + "step": 2252 + }, + { + "epoch": 0.3471698113207547, + "grad_norm": 0.6798626184463501, + "learning_rate": 7.309100758747506e-05, + "loss": 0.9822, + "step": 2254 + }, + { + "epoch": 0.34747785906815554, + "grad_norm": 1.0110447406768799, + "learning_rate": 7.304807457194918e-05, + "loss": 0.959, + "step": 2256 + }, + { + "epoch": 0.34778590681555643, + "grad_norm": 0.9492419958114624, + "learning_rate": 7.300511996718447e-05, + "loss": 1.0298, + "step": 2258 + }, + { + "epoch": 0.3480939545629573, + "grad_norm": 0.5809617638587952, + "learning_rate": 7.29621438134167e-05, + "loss": 0.8646, + "step": 2260 + }, + { + "epoch": 0.3484020023103581, + "grad_norm": 0.9347992539405823, + "learning_rate": 7.29191461509018e-05, + "loss": 1.047, + "step": 2262 + }, + { + "epoch": 0.34871005005775896, + "grad_norm": 0.6787614226341248, + "learning_rate": 7.287612701991595e-05, + "loss": 0.9601, + "step": 2264 + }, + { + "epoch": 0.3490180978051598, + "grad_norm": 0.655129611492157, + "learning_rate": 7.28330864607553e-05, + "loss": 0.9365, + "step": 2266 + }, + { + "epoch": 0.34932614555256064, + "grad_norm": 0.732175350189209, + "learning_rate": 7.27900245137362e-05, + "loss": 1.0254, + "step": 2268 + }, + { + "epoch": 0.3496341932999615, + "grad_norm": 0.7303794026374817, + "learning_rate": 7.274694121919495e-05, + "loss": 2.1755, + "step": 2270 + }, + { + "epoch": 0.3499422410473623, + "grad_norm": 0.852927565574646, + "learning_rate": 7.270383661748786e-05, + "loss": 1.173, + "step": 2272 + }, + { + "epoch": 0.35025028879476316, + "grad_norm": 0.9458498954772949, + "learning_rate": 7.266071074899124e-05, + "loss": 1.1105, + "step": 2274 + }, + { + "epoch": 0.35055833654216406, + "grad_norm": 1.054835557937622, + "learning_rate": 7.261756365410126e-05, + "loss": 1.0068, + "step": 2276 + }, + { + "epoch": 0.3508663842895649, + "grad_norm": 0.8554360270500183, + "learning_rate": 7.257439537323403e-05, + "loss": 1.1411, + "step": 2278 + }, + { + "epoch": 0.35117443203696574, + "grad_norm": 0.9421424269676208, + "learning_rate": 7.253120594682547e-05, + "loss": 1.161, + "step": 2280 + }, + { + "epoch": 0.3514824797843666, + "grad_norm": 0.5921900272369385, + "learning_rate": 7.24879954153313e-05, + "loss": 1.893, + "step": 2282 + }, + { + "epoch": 0.3517905275317674, + "grad_norm": 0.8463044762611389, + "learning_rate": 7.244476381922708e-05, + "loss": 1.1132, + "step": 2284 + }, + { + "epoch": 0.35209857527916827, + "grad_norm": 0.6275113224983215, + "learning_rate": 7.240151119900797e-05, + "loss": 1.1475, + "step": 2286 + }, + { + "epoch": 0.3524066230265691, + "grad_norm": 0.8620054125785828, + "learning_rate": 7.2358237595189e-05, + "loss": 1.212, + "step": 2288 + }, + { + "epoch": 0.35271467077396995, + "grad_norm": 0.8541956543922424, + "learning_rate": 7.231494304830465e-05, + "loss": 1.0452, + "step": 2290 + }, + { + "epoch": 0.3530227185213708, + "grad_norm": 0.5204923748970032, + "learning_rate": 7.227162759890919e-05, + "loss": 0.8911, + "step": 2292 + }, + { + "epoch": 0.35333076626877163, + "grad_norm": 0.5744819045066833, + "learning_rate": 7.22282912875764e-05, + "loss": 2.2718, + "step": 2294 + }, + { + "epoch": 0.35363881401617253, + "grad_norm": 0.6028558015823364, + "learning_rate": 7.218493415489956e-05, + "loss": 1.2403, + "step": 2296 + }, + { + "epoch": 0.3539468617635734, + "grad_norm": 0.8307321071624756, + "learning_rate": 7.214155624149156e-05, + "loss": 1.0185, + "step": 2298 + }, + { + "epoch": 0.3542549095109742, + "grad_norm": 1.0472615957260132, + "learning_rate": 7.209815758798464e-05, + "loss": 1.0723, + "step": 2300 + }, + { + "epoch": 0.35456295725837506, + "grad_norm": 0.7444417476654053, + "learning_rate": 7.205473823503057e-05, + "loss": 0.942, + "step": 2302 + }, + { + "epoch": 0.3548710050057759, + "grad_norm": 0.681360125541687, + "learning_rate": 7.201129822330041e-05, + "loss": 1.0413, + "step": 2304 + }, + { + "epoch": 0.35517905275317674, + "grad_norm": 0.7587233185768127, + "learning_rate": 7.196783759348465e-05, + "loss": 0.931, + "step": 2306 + }, + { + "epoch": 0.3554871005005776, + "grad_norm": 0.5804754495620728, + "learning_rate": 7.192435638629307e-05, + "loss": 1.9602, + "step": 2308 + }, + { + "epoch": 0.3557951482479784, + "grad_norm": 0.5681557059288025, + "learning_rate": 7.18808546424547e-05, + "loss": 0.8623, + "step": 2310 + }, + { + "epoch": 0.35610319599537926, + "grad_norm": 0.9095214009284973, + "learning_rate": 7.183733240271784e-05, + "loss": 1.2142, + "step": 2312 + }, + { + "epoch": 0.3564112437427801, + "grad_norm": 0.8182647228240967, + "learning_rate": 7.179378970784997e-05, + "loss": 1.1915, + "step": 2314 + }, + { + "epoch": 0.356719291490181, + "grad_norm": 0.8019623160362244, + "learning_rate": 7.175022659863773e-05, + "loss": 1.843, + "step": 2316 + }, + { + "epoch": 0.35702733923758184, + "grad_norm": 0.7972326874732971, + "learning_rate": 7.17066431158869e-05, + "loss": 1.0202, + "step": 2318 + }, + { + "epoch": 0.3573353869849827, + "grad_norm": 0.5969523787498474, + "learning_rate": 7.166303930042233e-05, + "loss": 2.6547, + "step": 2320 + }, + { + "epoch": 0.3576434347323835, + "grad_norm": 1.3465994596481323, + "learning_rate": 7.16194151930879e-05, + "loss": 1.0466, + "step": 2322 + }, + { + "epoch": 0.35795148247978437, + "grad_norm": 0.8386397957801819, + "learning_rate": 7.157577083474653e-05, + "loss": 1.1265, + "step": 2324 + }, + { + "epoch": 0.3582595302271852, + "grad_norm": 0.675894558429718, + "learning_rate": 7.153210626628007e-05, + "loss": 1.0571, + "step": 2326 + }, + { + "epoch": 0.35856757797458605, + "grad_norm": 0.656844973564148, + "learning_rate": 7.148842152858938e-05, + "loss": 1.0948, + "step": 2328 + }, + { + "epoch": 0.3588756257219869, + "grad_norm": 0.665457546710968, + "learning_rate": 7.144471666259409e-05, + "loss": 1.0663, + "step": 2330 + }, + { + "epoch": 0.35918367346938773, + "grad_norm": 0.7601264119148254, + "learning_rate": 7.140099170923281e-05, + "loss": 0.9879, + "step": 2332 + }, + { + "epoch": 0.35949172121678863, + "grad_norm": 1.0152552127838135, + "learning_rate": 7.135724670946288e-05, + "loss": 1.0867, + "step": 2334 + }, + { + "epoch": 0.35979976896418947, + "grad_norm": 0.9408657550811768, + "learning_rate": 7.131348170426042e-05, + "loss": 1.0168, + "step": 2336 + }, + { + "epoch": 0.3601078167115903, + "grad_norm": 1.0523954629898071, + "learning_rate": 7.126969673462037e-05, + "loss": 1.1912, + "step": 2338 + }, + { + "epoch": 0.36041586445899115, + "grad_norm": 0.9186645150184631, + "learning_rate": 7.122589184155626e-05, + "loss": 1.1382, + "step": 2340 + }, + { + "epoch": 0.360723912206392, + "grad_norm": 0.7351142168045044, + "learning_rate": 7.118206706610038e-05, + "loss": 0.9642, + "step": 2342 + }, + { + "epoch": 0.36103195995379284, + "grad_norm": 0.714341938495636, + "learning_rate": 7.113822244930357e-05, + "loss": 0.9253, + "step": 2344 + }, + { + "epoch": 0.3613400077011937, + "grad_norm": 0.7643159627914429, + "learning_rate": 7.109435803223531e-05, + "loss": 1.0953, + "step": 2346 + }, + { + "epoch": 0.3616480554485945, + "grad_norm": 0.8597730398178101, + "learning_rate": 7.105047385598359e-05, + "loss": 1.0206, + "step": 2348 + }, + { + "epoch": 0.36195610319599536, + "grad_norm": 0.6703843474388123, + "learning_rate": 7.100656996165493e-05, + "loss": 0.919, + "step": 2350 + }, + { + "epoch": 0.3622641509433962, + "grad_norm": 0.8356922268867493, + "learning_rate": 7.096264639037431e-05, + "loss": 0.9216, + "step": 2352 + }, + { + "epoch": 0.3625721986907971, + "grad_norm": 0.7385930418968201, + "learning_rate": 7.091870318328515e-05, + "loss": 1.3654, + "step": 2354 + }, + { + "epoch": 0.36288024643819794, + "grad_norm": 0.6730544567108154, + "learning_rate": 7.087474038154924e-05, + "loss": 2.8678, + "step": 2356 + }, + { + "epoch": 0.3631882941855988, + "grad_norm": 0.7496978044509888, + "learning_rate": 7.083075802634675e-05, + "loss": 0.9139, + "step": 2358 + }, + { + "epoch": 0.3634963419329996, + "grad_norm": 0.8838191628456116, + "learning_rate": 7.078675615887618e-05, + "loss": 0.9811, + "step": 2360 + }, + { + "epoch": 0.36380438968040046, + "grad_norm": 0.642116129398346, + "learning_rate": 7.074273482035424e-05, + "loss": 1.0675, + "step": 2362 + }, + { + "epoch": 0.3641124374278013, + "grad_norm": 0.6837906837463379, + "learning_rate": 7.069869405201595e-05, + "loss": 1.0903, + "step": 2364 + }, + { + "epoch": 0.36442048517520215, + "grad_norm": 0.7109867334365845, + "learning_rate": 7.065463389511449e-05, + "loss": 0.9847, + "step": 2366 + }, + { + "epoch": 0.364728532922603, + "grad_norm": 0.5639575719833374, + "learning_rate": 7.061055439092126e-05, + "loss": 0.865, + "step": 2368 + }, + { + "epoch": 0.36503658067000383, + "grad_norm": 0.7767153382301331, + "learning_rate": 7.056645558072565e-05, + "loss": 1.1613, + "step": 2370 + }, + { + "epoch": 0.36534462841740467, + "grad_norm": 0.7072895169258118, + "learning_rate": 7.052233750583532e-05, + "loss": 1.0345, + "step": 2372 + }, + { + "epoch": 0.36565267616480557, + "grad_norm": 0.7484140992164612, + "learning_rate": 7.047820020757579e-05, + "loss": 0.905, + "step": 2374 + }, + { + "epoch": 0.3659607239122064, + "grad_norm": 0.7483952641487122, + "learning_rate": 7.043404372729072e-05, + "loss": 1.2131, + "step": 2376 + }, + { + "epoch": 0.36626877165960725, + "grad_norm": 0.7827897071838379, + "learning_rate": 7.03898681063417e-05, + "loss": 1.0051, + "step": 2378 + }, + { + "epoch": 0.3665768194070081, + "grad_norm": 0.6883440613746643, + "learning_rate": 7.034567338610819e-05, + "loss": 1.1551, + "step": 2380 + }, + { + "epoch": 0.36688486715440893, + "grad_norm": 0.8326652646064758, + "learning_rate": 7.030145960798764e-05, + "loss": 0.8967, + "step": 2382 + }, + { + "epoch": 0.3671929149018098, + "grad_norm": 0.776249349117279, + "learning_rate": 7.025722681339528e-05, + "loss": 1.0518, + "step": 2384 + }, + { + "epoch": 0.3675009626492106, + "grad_norm": 0.5774699449539185, + "learning_rate": 7.021297504376418e-05, + "loss": 0.9455, + "step": 2386 + }, + { + "epoch": 0.36780901039661146, + "grad_norm": 0.7364729642868042, + "learning_rate": 7.016870434054517e-05, + "loss": 0.8656, + "step": 2388 + }, + { + "epoch": 0.3681170581440123, + "grad_norm": 0.7618663311004639, + "learning_rate": 7.012441474520683e-05, + "loss": 1.2443, + "step": 2390 + }, + { + "epoch": 0.3684251058914132, + "grad_norm": 0.6586353182792664, + "learning_rate": 7.008010629923544e-05, + "loss": 1.1953, + "step": 2392 + }, + { + "epoch": 0.36873315363881404, + "grad_norm": 0.6616129279136658, + "learning_rate": 7.003577904413492e-05, + "loss": 1.0623, + "step": 2394 + }, + { + "epoch": 0.3690412013862149, + "grad_norm": 0.8203920125961304, + "learning_rate": 6.999143302142681e-05, + "loss": 1.0939, + "step": 2396 + }, + { + "epoch": 0.3693492491336157, + "grad_norm": 0.6637328863143921, + "learning_rate": 6.994706827265024e-05, + "loss": 1.0425, + "step": 2398 + }, + { + "epoch": 0.36965729688101656, + "grad_norm": 0.7866489887237549, + "learning_rate": 6.990268483936189e-05, + "loss": 0.989, + "step": 2400 + }, + { + "epoch": 0.36965729688101656, + "eval_loss": 2.5227484703063965, + "eval_runtime": 737.0745, + "eval_samples_per_second": 2.713, + "eval_steps_per_second": 0.678, + "step": 2400 + }, + { + "epoch": 0.3699653446284174, + "grad_norm": 0.7644304633140564, + "learning_rate": 6.98582827631359e-05, + "loss": 2.0393, + "step": 2402 + }, + { + "epoch": 0.37027339237581824, + "grad_norm": 0.7707456350326538, + "learning_rate": 6.981386208556394e-05, + "loss": 0.9427, + "step": 2404 + }, + { + "epoch": 0.3705814401232191, + "grad_norm": 0.7313803434371948, + "learning_rate": 6.9769422848255e-05, + "loss": 1.0733, + "step": 2406 + }, + { + "epoch": 0.3708894878706199, + "grad_norm": 0.7003641128540039, + "learning_rate": 6.972496509283562e-05, + "loss": 1.4143, + "step": 2408 + }, + { + "epoch": 0.37119753561802077, + "grad_norm": 0.7405939102172852, + "learning_rate": 6.96804888609495e-05, + "loss": 1.2404, + "step": 2410 + }, + { + "epoch": 0.37150558336542167, + "grad_norm": 0.4707174003124237, + "learning_rate": 6.963599419425777e-05, + "loss": 0.8943, + "step": 2412 + }, + { + "epoch": 0.3718136311128225, + "grad_norm": 0.955906331539154, + "learning_rate": 6.959148113443879e-05, + "loss": 1.1045, + "step": 2414 + }, + { + "epoch": 0.37212167886022335, + "grad_norm": 0.674000084400177, + "learning_rate": 6.954694972318816e-05, + "loss": 0.9863, + "step": 2416 + }, + { + "epoch": 0.3724297266076242, + "grad_norm": 0.9632213115692139, + "learning_rate": 6.950240000221862e-05, + "loss": 0.8621, + "step": 2418 + }, + { + "epoch": 0.37273777435502503, + "grad_norm": 0.9839867353439331, + "learning_rate": 6.945783201326015e-05, + "loss": 1.1482, + "step": 2420 + }, + { + "epoch": 0.3730458221024259, + "grad_norm": 0.6643813848495483, + "learning_rate": 6.941324579805977e-05, + "loss": 0.8935, + "step": 2422 + }, + { + "epoch": 0.3733538698498267, + "grad_norm": 0.7968766689300537, + "learning_rate": 6.936864139838158e-05, + "loss": 1.2712, + "step": 2424 + }, + { + "epoch": 0.37366191759722756, + "grad_norm": 0.7045233845710754, + "learning_rate": 6.932401885600678e-05, + "loss": 1.0575, + "step": 2426 + }, + { + "epoch": 0.3739699653446284, + "grad_norm": 0.7721226811408997, + "learning_rate": 6.927937821273344e-05, + "loss": 1.0979, + "step": 2428 + }, + { + "epoch": 0.37427801309202924, + "grad_norm": 0.775489330291748, + "learning_rate": 6.923471951037672e-05, + "loss": 1.2188, + "step": 2430 + }, + { + "epoch": 0.37458606083943013, + "grad_norm": 0.6731612086296082, + "learning_rate": 6.919004279076862e-05, + "loss": 1.0237, + "step": 2432 + }, + { + "epoch": 0.374894108586831, + "grad_norm": 1.1068634986877441, + "learning_rate": 6.914534809575802e-05, + "loss": 0.9837, + "step": 2434 + }, + { + "epoch": 0.3752021563342318, + "grad_norm": 0.683112382888794, + "learning_rate": 6.910063546721064e-05, + "loss": 1.0354, + "step": 2436 + }, + { + "epoch": 0.37551020408163266, + "grad_norm": 0.7034465670585632, + "learning_rate": 6.905590494700905e-05, + "loss": 1.2002, + "step": 2438 + }, + { + "epoch": 0.3758182518290335, + "grad_norm": 0.862964928150177, + "learning_rate": 6.901115657705246e-05, + "loss": 0.8784, + "step": 2440 + }, + { + "epoch": 0.37612629957643434, + "grad_norm": 0.6790128350257874, + "learning_rate": 6.896639039925697e-05, + "loss": 0.9937, + "step": 2442 + }, + { + "epoch": 0.3764343473238352, + "grad_norm": 1.4234968423843384, + "learning_rate": 6.892160645555521e-05, + "loss": 0.9885, + "step": 2444 + }, + { + "epoch": 0.376742395071236, + "grad_norm": 0.7562160491943359, + "learning_rate": 6.88768047878965e-05, + "loss": 1.0742, + "step": 2446 + }, + { + "epoch": 0.37705044281863687, + "grad_norm": 0.6968784928321838, + "learning_rate": 6.883198543824681e-05, + "loss": 1.1895, + "step": 2448 + }, + { + "epoch": 0.37735849056603776, + "grad_norm": 0.760648787021637, + "learning_rate": 6.87871484485886e-05, + "loss": 1.0247, + "step": 2450 + }, + { + "epoch": 0.3776665383134386, + "grad_norm": 0.7358818650245667, + "learning_rate": 6.874229386092092e-05, + "loss": 1.0636, + "step": 2452 + }, + { + "epoch": 0.37797458606083945, + "grad_norm": 0.6471631526947021, + "learning_rate": 6.869742171725924e-05, + "loss": 0.915, + "step": 2454 + }, + { + "epoch": 0.3782826338082403, + "grad_norm": 0.7820512652397156, + "learning_rate": 6.865253205963555e-05, + "loss": 0.944, + "step": 2456 + }, + { + "epoch": 0.37859068155564113, + "grad_norm": 0.7044534683227539, + "learning_rate": 6.860762493009814e-05, + "loss": 1.0403, + "step": 2458 + }, + { + "epoch": 0.37889872930304197, + "grad_norm": 0.8854067325592041, + "learning_rate": 6.856270037071176e-05, + "loss": 1.0855, + "step": 2460 + }, + { + "epoch": 0.3792067770504428, + "grad_norm": 0.6846293210983276, + "learning_rate": 6.851775842355746e-05, + "loss": 1.2417, + "step": 2462 + }, + { + "epoch": 0.37951482479784365, + "grad_norm": 0.7980126738548279, + "learning_rate": 6.847279913073255e-05, + "loss": 1.2656, + "step": 2464 + }, + { + "epoch": 0.3798228725452445, + "grad_norm": 1.008276104927063, + "learning_rate": 6.842782253435065e-05, + "loss": 1.2209, + "step": 2466 + }, + { + "epoch": 0.38013092029264534, + "grad_norm": 0.6689044237136841, + "learning_rate": 6.838282867654149e-05, + "loss": 1.6123, + "step": 2468 + }, + { + "epoch": 0.38043896804004623, + "grad_norm": 0.9234145879745483, + "learning_rate": 6.833781759945107e-05, + "loss": 1.2141, + "step": 2470 + }, + { + "epoch": 0.3807470157874471, + "grad_norm": 0.6782144904136658, + "learning_rate": 6.829278934524146e-05, + "loss": 1.1363, + "step": 2472 + }, + { + "epoch": 0.3810550635348479, + "grad_norm": 0.569631814956665, + "learning_rate": 6.824774395609085e-05, + "loss": 1.0716, + "step": 2474 + }, + { + "epoch": 0.38136311128224876, + "grad_norm": 0.5350009799003601, + "learning_rate": 6.820268147419344e-05, + "loss": 1.2837, + "step": 2476 + }, + { + "epoch": 0.3816711590296496, + "grad_norm": 0.6322665810585022, + "learning_rate": 6.815760194175949e-05, + "loss": 1.1355, + "step": 2478 + }, + { + "epoch": 0.38197920677705044, + "grad_norm": 0.9357727766036987, + "learning_rate": 6.811250540101517e-05, + "loss": 2.2662, + "step": 2480 + }, + { + "epoch": 0.3822872545244513, + "grad_norm": 0.9705163240432739, + "learning_rate": 6.806739189420269e-05, + "loss": 1.028, + "step": 2482 + }, + { + "epoch": 0.3825953022718521, + "grad_norm": 0.8012816905975342, + "learning_rate": 6.802226146358001e-05, + "loss": 0.9716, + "step": 2484 + }, + { + "epoch": 0.38290335001925296, + "grad_norm": 0.7949168682098389, + "learning_rate": 6.797711415142105e-05, + "loss": 1.0279, + "step": 2486 + }, + { + "epoch": 0.3832113977666538, + "grad_norm": 0.7826275825500488, + "learning_rate": 6.793195000001551e-05, + "loss": 1.3541, + "step": 2488 + }, + { + "epoch": 0.3835194455140547, + "grad_norm": 0.8680853247642517, + "learning_rate": 6.788676905166884e-05, + "loss": 1.2099, + "step": 2490 + }, + { + "epoch": 0.38382749326145554, + "grad_norm": 0.8269588351249695, + "learning_rate": 6.784157134870228e-05, + "loss": 0.8367, + "step": 2492 + }, + { + "epoch": 0.3841355410088564, + "grad_norm": 0.8308361172676086, + "learning_rate": 6.779635693345268e-05, + "loss": 0.9333, + "step": 2494 + }, + { + "epoch": 0.3844435887562572, + "grad_norm": 0.6732942461967468, + "learning_rate": 6.775112584827266e-05, + "loss": 0.8547, + "step": 2496 + }, + { + "epoch": 0.38475163650365807, + "grad_norm": 0.7414329051971436, + "learning_rate": 6.77058781355303e-05, + "loss": 1.1987, + "step": 2498 + }, + { + "epoch": 0.3850596842510589, + "grad_norm": 0.7550074458122253, + "learning_rate": 6.766061383760943e-05, + "loss": 1.1309, + "step": 2500 + }, + { + "epoch": 0.38536773199845975, + "grad_norm": 0.6689410209655762, + "learning_rate": 6.761533299690927e-05, + "loss": 0.8807, + "step": 2502 + }, + { + "epoch": 0.3856757797458606, + "grad_norm": 0.7514511942863464, + "learning_rate": 6.757003565584463e-05, + "loss": 0.8829, + "step": 2504 + }, + { + "epoch": 0.38598382749326143, + "grad_norm": 0.6438677310943604, + "learning_rate": 6.752472185684573e-05, + "loss": 0.8669, + "step": 2506 + }, + { + "epoch": 0.38629187524066233, + "grad_norm": 0.9370463490486145, + "learning_rate": 6.747939164235819e-05, + "loss": 0.9831, + "step": 2508 + }, + { + "epoch": 0.38659992298806317, + "grad_norm": 1.0750024318695068, + "learning_rate": 6.743404505484308e-05, + "loss": 1.2794, + "step": 2510 + }, + { + "epoch": 0.386907970735464, + "grad_norm": 0.8463871479034424, + "learning_rate": 6.738868213677671e-05, + "loss": 2.0352, + "step": 2512 + }, + { + "epoch": 0.38721601848286485, + "grad_norm": 0.8012046813964844, + "learning_rate": 6.734330293065079e-05, + "loss": 0.9807, + "step": 2514 + }, + { + "epoch": 0.3875240662302657, + "grad_norm": 0.9211841225624084, + "learning_rate": 6.729790747897219e-05, + "loss": 1.1453, + "step": 2516 + }, + { + "epoch": 0.38783211397766654, + "grad_norm": 0.9312162399291992, + "learning_rate": 6.725249582426306e-05, + "loss": 1.1544, + "step": 2518 + }, + { + "epoch": 0.3881401617250674, + "grad_norm": 0.5721031427383423, + "learning_rate": 6.72070680090607e-05, + "loss": 1.2422, + "step": 2520 + }, + { + "epoch": 0.3884482094724682, + "grad_norm": 0.8398732542991638, + "learning_rate": 6.716162407591757e-05, + "loss": 0.9103, + "step": 2522 + }, + { + "epoch": 0.38875625721986906, + "grad_norm": 1.1913796663284302, + "learning_rate": 6.711616406740121e-05, + "loss": 1.1755, + "step": 2524 + }, + { + "epoch": 0.3890643049672699, + "grad_norm": 0.6675463914871216, + "learning_rate": 6.707068802609421e-05, + "loss": 0.9846, + "step": 2526 + }, + { + "epoch": 0.3893723527146708, + "grad_norm": 0.6696200370788574, + "learning_rate": 6.70251959945942e-05, + "loss": 0.922, + "step": 2528 + }, + { + "epoch": 0.38968040046207164, + "grad_norm": 0.7850413918495178, + "learning_rate": 6.697968801551378e-05, + "loss": 0.9998, + "step": 2530 + }, + { + "epoch": 0.3899884482094725, + "grad_norm": 0.7170053124427795, + "learning_rate": 6.693416413148045e-05, + "loss": 0.9936, + "step": 2532 + }, + { + "epoch": 0.3902964959568733, + "grad_norm": 0.7705442309379578, + "learning_rate": 6.68886243851367e-05, + "loss": 0.9031, + "step": 2534 + }, + { + "epoch": 0.39060454370427417, + "grad_norm": 0.7424684166908264, + "learning_rate": 6.684306881913982e-05, + "loss": 1.0102, + "step": 2536 + }, + { + "epoch": 0.390912591451675, + "grad_norm": 0.7100473642349243, + "learning_rate": 6.679749747616186e-05, + "loss": 0.8898, + "step": 2538 + }, + { + "epoch": 0.39122063919907585, + "grad_norm": 0.97356116771698, + "learning_rate": 6.675191039888978e-05, + "loss": 0.863, + "step": 2540 + }, + { + "epoch": 0.3915286869464767, + "grad_norm": 0.8761134743690491, + "learning_rate": 6.670630763002514e-05, + "loss": 1.0494, + "step": 2542 + }, + { + "epoch": 0.39183673469387753, + "grad_norm": 0.8845568895339966, + "learning_rate": 6.666068921228433e-05, + "loss": 1.2686, + "step": 2544 + }, + { + "epoch": 0.3921447824412784, + "grad_norm": 0.6904786229133606, + "learning_rate": 6.66150551883983e-05, + "loss": 0.9315, + "step": 2546 + }, + { + "epoch": 0.39245283018867927, + "grad_norm": 0.6124597191810608, + "learning_rate": 6.656940560111267e-05, + "loss": 1.2245, + "step": 2548 + }, + { + "epoch": 0.3927608779360801, + "grad_norm": 0.860092282295227, + "learning_rate": 6.65237404931876e-05, + "loss": 1.1557, + "step": 2550 + }, + { + "epoch": 0.39306892568348095, + "grad_norm": 0.7460459470748901, + "learning_rate": 6.647805990739782e-05, + "loss": 0.9544, + "step": 2552 + }, + { + "epoch": 0.3933769734308818, + "grad_norm": 0.7506148815155029, + "learning_rate": 6.643236388653255e-05, + "loss": 1.2874, + "step": 2554 + }, + { + "epoch": 0.39368502117828263, + "grad_norm": 0.6355990767478943, + "learning_rate": 6.638665247339546e-05, + "loss": 0.9506, + "step": 2556 + }, + { + "epoch": 0.3939930689256835, + "grad_norm": 0.693425714969635, + "learning_rate": 6.634092571080465e-05, + "loss": 0.8779, + "step": 2558 + }, + { + "epoch": 0.3943011166730843, + "grad_norm": 0.8063914775848389, + "learning_rate": 6.629518364159259e-05, + "loss": 1.1405, + "step": 2560 + }, + { + "epoch": 0.39460916442048516, + "grad_norm": 0.6337122321128845, + "learning_rate": 6.624942630860607e-05, + "loss": 1.1013, + "step": 2562 + }, + { + "epoch": 0.394917212167886, + "grad_norm": 0.9229851961135864, + "learning_rate": 6.620365375470623e-05, + "loss": 0.9953, + "step": 2564 + }, + { + "epoch": 0.39522525991528684, + "grad_norm": 0.7974163889884949, + "learning_rate": 6.615786602276843e-05, + "loss": 0.9866, + "step": 2566 + }, + { + "epoch": 0.39553330766268774, + "grad_norm": 0.7080705165863037, + "learning_rate": 6.611206315568223e-05, + "loss": 0.9946, + "step": 2568 + }, + { + "epoch": 0.3958413554100886, + "grad_norm": 0.7690320611000061, + "learning_rate": 6.606624519635138e-05, + "loss": 0.9264, + "step": 2570 + }, + { + "epoch": 0.3961494031574894, + "grad_norm": 0.8104195594787598, + "learning_rate": 6.602041218769383e-05, + "loss": 1.0719, + "step": 2572 + }, + { + "epoch": 0.39645745090489026, + "grad_norm": 0.5908204317092896, + "learning_rate": 6.597456417264151e-05, + "loss": 0.9732, + "step": 2574 + }, + { + "epoch": 0.3967654986522911, + "grad_norm": 0.6712586283683777, + "learning_rate": 6.592870119414052e-05, + "loss": 1.9431, + "step": 2576 + }, + { + "epoch": 0.39707354639969195, + "grad_norm": 0.8229988813400269, + "learning_rate": 6.588282329515089e-05, + "loss": 0.9665, + "step": 2578 + }, + { + "epoch": 0.3973815941470928, + "grad_norm": 0.6816299557685852, + "learning_rate": 6.583693051864668e-05, + "loss": 0.6664, + "step": 2580 + }, + { + "epoch": 0.39768964189449363, + "grad_norm": 0.6721845269203186, + "learning_rate": 6.579102290761586e-05, + "loss": 0.9116, + "step": 2582 + }, + { + "epoch": 0.39799768964189447, + "grad_norm": 0.5877856016159058, + "learning_rate": 6.57451005050603e-05, + "loss": 0.9351, + "step": 2584 + }, + { + "epoch": 0.39830573738929537, + "grad_norm": 0.7047834396362305, + "learning_rate": 6.569916335399576e-05, + "loss": 0.9426, + "step": 2586 + }, + { + "epoch": 0.3986137851366962, + "grad_norm": 0.7781816720962524, + "learning_rate": 6.565321149745174e-05, + "loss": 1.0048, + "step": 2588 + }, + { + "epoch": 0.39892183288409705, + "grad_norm": 0.6085528135299683, + "learning_rate": 6.560724497847159e-05, + "loss": 0.8135, + "step": 2590 + }, + { + "epoch": 0.3992298806314979, + "grad_norm": 0.8343983888626099, + "learning_rate": 6.556126384011233e-05, + "loss": 1.1167, + "step": 2592 + }, + { + "epoch": 0.39953792837889873, + "grad_norm": 0.7468252182006836, + "learning_rate": 6.551526812544474e-05, + "loss": 1.3543, + "step": 2594 + }, + { + "epoch": 0.3998459761262996, + "grad_norm": 0.7748050093650818, + "learning_rate": 6.546925787755321e-05, + "loss": 1.1425, + "step": 2596 + }, + { + "epoch": 0.4001540238737004, + "grad_norm": 0.7804863452911377, + "learning_rate": 6.542323313953574e-05, + "loss": 1.0779, + "step": 2598 + }, + { + "epoch": 0.40046207162110126, + "grad_norm": 0.9270213842391968, + "learning_rate": 6.537719395450391e-05, + "loss": 0.9406, + "step": 2600 + }, + { + "epoch": 0.4007701193685021, + "grad_norm": 0.6129812002182007, + "learning_rate": 6.533114036558287e-05, + "loss": 0.9975, + "step": 2602 + }, + { + "epoch": 0.40107816711590294, + "grad_norm": 0.7297736406326294, + "learning_rate": 6.528507241591121e-05, + "loss": 2.0731, + "step": 2604 + }, + { + "epoch": 0.40138621486330384, + "grad_norm": 0.7494378089904785, + "learning_rate": 6.523899014864102e-05, + "loss": 1.9419, + "step": 2606 + }, + { + "epoch": 0.4016942626107047, + "grad_norm": 0.7798280119895935, + "learning_rate": 6.519289360693774e-05, + "loss": 0.9775, + "step": 2608 + }, + { + "epoch": 0.4020023103581055, + "grad_norm": 0.8234038352966309, + "learning_rate": 6.514678283398022e-05, + "loss": 1.6277, + "step": 2610 + }, + { + "epoch": 0.40231035810550636, + "grad_norm": 0.7135938405990601, + "learning_rate": 6.510065787296064e-05, + "loss": 0.8109, + "step": 2612 + }, + { + "epoch": 0.4026184058529072, + "grad_norm": 0.8007364869117737, + "learning_rate": 6.505451876708448e-05, + "loss": 1.1473, + "step": 2614 + }, + { + "epoch": 0.40292645360030804, + "grad_norm": 0.8691009879112244, + "learning_rate": 6.500836555957046e-05, + "loss": 1.1948, + "step": 2616 + }, + { + "epoch": 0.4032345013477089, + "grad_norm": 0.985650360584259, + "learning_rate": 6.496219829365048e-05, + "loss": 1.1655, + "step": 2618 + }, + { + "epoch": 0.4035425490951097, + "grad_norm": 0.8488809466362, + "learning_rate": 6.491601701256966e-05, + "loss": 1.0303, + "step": 2620 + }, + { + "epoch": 0.40385059684251057, + "grad_norm": 0.7097958922386169, + "learning_rate": 6.486982175958618e-05, + "loss": 1.1143, + "step": 2622 + }, + { + "epoch": 0.4041586445899114, + "grad_norm": 0.6954580545425415, + "learning_rate": 6.482361257797138e-05, + "loss": 0.9396, + "step": 2624 + }, + { + "epoch": 0.4044666923373123, + "grad_norm": 0.671699583530426, + "learning_rate": 6.477738951100961e-05, + "loss": 1.1556, + "step": 2626 + }, + { + "epoch": 0.40477474008471315, + "grad_norm": 0.6676186323165894, + "learning_rate": 6.473115260199823e-05, + "loss": 1.0157, + "step": 2628 + }, + { + "epoch": 0.405082787832114, + "grad_norm": 0.6902966499328613, + "learning_rate": 6.468490189424759e-05, + "loss": 1.0301, + "step": 2630 + }, + { + "epoch": 0.40539083557951483, + "grad_norm": 0.7361807227134705, + "learning_rate": 6.46386374310809e-05, + "loss": 0.9511, + "step": 2632 + }, + { + "epoch": 0.40569888332691567, + "grad_norm": 0.6098953485488892, + "learning_rate": 6.459235925583433e-05, + "loss": 1.0431, + "step": 2634 + }, + { + "epoch": 0.4060069310743165, + "grad_norm": 0.9778101444244385, + "learning_rate": 6.454606741185686e-05, + "loss": 1.0221, + "step": 2636 + }, + { + "epoch": 0.40631497882171735, + "grad_norm": 0.7961477041244507, + "learning_rate": 6.449976194251026e-05, + "loss": 0.8727, + "step": 2638 + }, + { + "epoch": 0.4066230265691182, + "grad_norm": 0.6664597988128662, + "learning_rate": 6.44534428911691e-05, + "loss": 1.0133, + "step": 2640 + }, + { + "epoch": 0.40693107431651904, + "grad_norm": 0.5414862632751465, + "learning_rate": 6.440711030122063e-05, + "loss": 0.8452, + "step": 2642 + }, + { + "epoch": 0.40723912206391993, + "grad_norm": 0.6674546599388123, + "learning_rate": 6.43607642160648e-05, + "loss": 0.9969, + "step": 2644 + }, + { + "epoch": 0.4075471698113208, + "grad_norm": 1.1050533056259155, + "learning_rate": 6.431440467911424e-05, + "loss": 1.2907, + "step": 2646 + }, + { + "epoch": 0.4078552175587216, + "grad_norm": 0.7038127779960632, + "learning_rate": 6.426803173379412e-05, + "loss": 1.2864, + "step": 2648 + }, + { + "epoch": 0.40816326530612246, + "grad_norm": 1.0234121084213257, + "learning_rate": 6.422164542354219e-05, + "loss": 1.075, + "step": 2650 + }, + { + "epoch": 0.4084713130535233, + "grad_norm": 0.7568143010139465, + "learning_rate": 6.417524579180873e-05, + "loss": 1.0523, + "step": 2652 + }, + { + "epoch": 0.40877936080092414, + "grad_norm": 1.0208611488342285, + "learning_rate": 6.412883288205647e-05, + "loss": 1.9574, + "step": 2654 + }, + { + "epoch": 0.409087408548325, + "grad_norm": 6.761949062347412, + "learning_rate": 6.408240673776065e-05, + "loss": 1.047, + "step": 2656 + }, + { + "epoch": 0.4093954562957258, + "grad_norm": 0.6190847754478455, + "learning_rate": 6.40359674024088e-05, + "loss": 1.1624, + "step": 2658 + }, + { + "epoch": 0.40970350404312667, + "grad_norm": 0.5744146704673767, + "learning_rate": 6.398951491950089e-05, + "loss": 1.1138, + "step": 2660 + }, + { + "epoch": 0.4100115517905275, + "grad_norm": 0.7018343210220337, + "learning_rate": 6.394304933254916e-05, + "loss": 1.5961, + "step": 2662 + }, + { + "epoch": 0.4103195995379284, + "grad_norm": 0.7297990918159485, + "learning_rate": 6.389657068507819e-05, + "loss": 0.7845, + "step": 2664 + }, + { + "epoch": 0.41062764728532924, + "grad_norm": 0.4957539439201355, + "learning_rate": 6.385007902062467e-05, + "loss": 0.9743, + "step": 2666 + }, + { + "epoch": 0.4109356950327301, + "grad_norm": 0.6580387353897095, + "learning_rate": 6.380357438273763e-05, + "loss": 0.9583, + "step": 2668 + }, + { + "epoch": 0.4112437427801309, + "grad_norm": 0.7626873850822449, + "learning_rate": 6.375705681497813e-05, + "loss": 1.061, + "step": 2670 + }, + { + "epoch": 0.41155179052753177, + "grad_norm": 0.7961906790733337, + "learning_rate": 6.371052636091942e-05, + "loss": 1.9489, + "step": 2672 + }, + { + "epoch": 0.4118598382749326, + "grad_norm": 0.9162095189094543, + "learning_rate": 6.366398306414679e-05, + "loss": 1.2871, + "step": 2674 + }, + { + "epoch": 0.41216788602233345, + "grad_norm": 0.7108187675476074, + "learning_rate": 6.361742696825755e-05, + "loss": 0.992, + "step": 2676 + }, + { + "epoch": 0.4124759337697343, + "grad_norm": 0.7019790410995483, + "learning_rate": 6.357085811686103e-05, + "loss": 1.1067, + "step": 2678 + }, + { + "epoch": 0.41278398151713513, + "grad_norm": 0.6845753192901611, + "learning_rate": 6.352427655357848e-05, + "loss": 0.979, + "step": 2680 + }, + { + "epoch": 0.413092029264536, + "grad_norm": 0.5338165760040283, + "learning_rate": 6.347768232204305e-05, + "loss": 1.0374, + "step": 2682 + }, + { + "epoch": 0.4134000770119369, + "grad_norm": 0.6664561033248901, + "learning_rate": 6.343107546589982e-05, + "loss": 0.8928, + "step": 2684 + }, + { + "epoch": 0.4137081247593377, + "grad_norm": 0.8769541382789612, + "learning_rate": 6.33844560288056e-05, + "loss": 1.0479, + "step": 2686 + }, + { + "epoch": 0.41401617250673856, + "grad_norm": 0.86333167552948, + "learning_rate": 6.333782405442904e-05, + "loss": 0.8223, + "step": 2688 + }, + { + "epoch": 0.4143242202541394, + "grad_norm": 0.7412071228027344, + "learning_rate": 6.329117958645058e-05, + "loss": 1.0054, + "step": 2690 + }, + { + "epoch": 0.41463226800154024, + "grad_norm": 0.6570004820823669, + "learning_rate": 6.324452266856225e-05, + "loss": 0.8969, + "step": 2692 + }, + { + "epoch": 0.4149403157489411, + "grad_norm": 0.5948647856712341, + "learning_rate": 6.319785334446783e-05, + "loss": 0.8415, + "step": 2694 + }, + { + "epoch": 0.4152483634963419, + "grad_norm": 0.6360468864440918, + "learning_rate": 6.315117165788268e-05, + "loss": 0.9539, + "step": 2696 + }, + { + "epoch": 0.41555641124374276, + "grad_norm": 0.5860743522644043, + "learning_rate": 6.310447765253376e-05, + "loss": 0.8201, + "step": 2698 + }, + { + "epoch": 0.4158644589911436, + "grad_norm": 0.8867302536964417, + "learning_rate": 6.30577713721596e-05, + "loss": 2.5665, + "step": 2700 + }, + { + "epoch": 0.4158644589911436, + "eval_loss": 2.4475715160369873, + "eval_runtime": 737.1021, + "eval_samples_per_second": 2.713, + "eval_steps_per_second": 0.678, + "step": 2700 + }, + { + "epoch": 0.4161725067385445, + "grad_norm": 0.9580145478248596, + "learning_rate": 6.301105286051013e-05, + "loss": 0.9133, + "step": 2702 + }, + { + "epoch": 0.41648055448594534, + "grad_norm": 0.7150710225105286, + "learning_rate": 6.296432216134682e-05, + "loss": 0.868, + "step": 2704 + }, + { + "epoch": 0.4167886022333462, + "grad_norm": 0.761711835861206, + "learning_rate": 6.291757931844254e-05, + "loss": 1.1175, + "step": 2706 + }, + { + "epoch": 0.417096649980747, + "grad_norm": 0.8936048746109009, + "learning_rate": 6.287082437558151e-05, + "loss": 1.0394, + "step": 2708 + }, + { + "epoch": 0.41740469772814787, + "grad_norm": 0.8021292090415955, + "learning_rate": 6.282405737655933e-05, + "loss": 1.5893, + "step": 2710 + }, + { + "epoch": 0.4177127454755487, + "grad_norm": 0.7052441835403442, + "learning_rate": 6.277727836518286e-05, + "loss": 0.7865, + "step": 2712 + }, + { + "epoch": 0.41802079322294955, + "grad_norm": 0.9390805959701538, + "learning_rate": 6.27304873852702e-05, + "loss": 1.0504, + "step": 2714 + }, + { + "epoch": 0.4183288409703504, + "grad_norm": 0.8395248055458069, + "learning_rate": 6.268368448065069e-05, + "loss": 0.9347, + "step": 2716 + }, + { + "epoch": 0.41863688871775123, + "grad_norm": 0.6875936985015869, + "learning_rate": 6.263686969516483e-05, + "loss": 1.0526, + "step": 2718 + }, + { + "epoch": 0.4189449364651521, + "grad_norm": 0.6971736550331116, + "learning_rate": 6.259004307266426e-05, + "loss": 1.1244, + "step": 2720 + }, + { + "epoch": 0.41925298421255297, + "grad_norm": 0.7761056423187256, + "learning_rate": 6.254320465701166e-05, + "loss": 0.908, + "step": 2722 + }, + { + "epoch": 0.4195610319599538, + "grad_norm": 0.76072096824646, + "learning_rate": 6.249635449208085e-05, + "loss": 0.848, + "step": 2724 + }, + { + "epoch": 0.41986907970735465, + "grad_norm": 0.7384543418884277, + "learning_rate": 6.244949262175654e-05, + "loss": 1.108, + "step": 2726 + }, + { + "epoch": 0.4201771274547555, + "grad_norm": 0.9263654947280884, + "learning_rate": 6.240261908993447e-05, + "loss": 0.9928, + "step": 2728 + }, + { + "epoch": 0.42048517520215634, + "grad_norm": 0.5557862520217896, + "learning_rate": 6.235573394052134e-05, + "loss": 0.8398, + "step": 2730 + }, + { + "epoch": 0.4207932229495572, + "grad_norm": 0.7384987473487854, + "learning_rate": 6.230883721743462e-05, + "loss": 1.2297, + "step": 2732 + }, + { + "epoch": 0.421101270696958, + "grad_norm": 0.6595007181167603, + "learning_rate": 6.226192896460277e-05, + "loss": 0.8903, + "step": 2734 + }, + { + "epoch": 0.42140931844435886, + "grad_norm": 0.7695440649986267, + "learning_rate": 6.221500922596488e-05, + "loss": 0.9697, + "step": 2736 + }, + { + "epoch": 0.4217173661917597, + "grad_norm": 0.933242917060852, + "learning_rate": 6.216807804547097e-05, + "loss": 1.0735, + "step": 2738 + }, + { + "epoch": 0.42202541393916054, + "grad_norm": 0.8982356786727905, + "learning_rate": 6.212113546708165e-05, + "loss": 0.7696, + "step": 2740 + }, + { + "epoch": 0.42233346168656144, + "grad_norm": 0.8741735816001892, + "learning_rate": 6.207418153476824e-05, + "loss": 1.334, + "step": 2742 + }, + { + "epoch": 0.4226415094339623, + "grad_norm": 0.8034881353378296, + "learning_rate": 6.202721629251278e-05, + "loss": 2.425, + "step": 2744 + }, + { + "epoch": 0.4229495571813631, + "grad_norm": 0.582089364528656, + "learning_rate": 6.198023978430774e-05, + "loss": 0.9689, + "step": 2746 + }, + { + "epoch": 0.42325760492876396, + "grad_norm": 0.6534414887428284, + "learning_rate": 6.193325205415629e-05, + "loss": 0.9952, + "step": 2748 + }, + { + "epoch": 0.4235656526761648, + "grad_norm": 0.7840991616249084, + "learning_rate": 6.188625314607201e-05, + "loss": 0.963, + "step": 2750 + }, + { + "epoch": 0.42387370042356565, + "grad_norm": 0.7565934062004089, + "learning_rate": 6.183924310407905e-05, + "loss": 0.8481, + "step": 2752 + }, + { + "epoch": 0.4241817481709665, + "grad_norm": 0.7571336627006531, + "learning_rate": 6.17922219722119e-05, + "loss": 0.9408, + "step": 2754 + }, + { + "epoch": 0.42448979591836733, + "grad_norm": 0.8127564787864685, + "learning_rate": 6.17451897945155e-05, + "loss": 0.93, + "step": 2756 + }, + { + "epoch": 0.42479784366576817, + "grad_norm": 1.051659345626831, + "learning_rate": 6.169814661504509e-05, + "loss": 1.2283, + "step": 2758 + }, + { + "epoch": 0.42510589141316907, + "grad_norm": 0.8676214814186096, + "learning_rate": 6.165109247786624e-05, + "loss": 1.3913, + "step": 2760 + }, + { + "epoch": 0.4254139391605699, + "grad_norm": 0.7785351872444153, + "learning_rate": 6.160402742705477e-05, + "loss": 0.9559, + "step": 2762 + }, + { + "epoch": 0.42572198690797075, + "grad_norm": 0.7920562624931335, + "learning_rate": 6.155695150669675e-05, + "loss": 0.9236, + "step": 2764 + }, + { + "epoch": 0.4260300346553716, + "grad_norm": 0.6295596957206726, + "learning_rate": 6.150986476088841e-05, + "loss": 1.2528, + "step": 2766 + }, + { + "epoch": 0.42633808240277243, + "grad_norm": 0.6867069602012634, + "learning_rate": 6.14627672337361e-05, + "loss": 1.0019, + "step": 2768 + }, + { + "epoch": 0.4266461301501733, + "grad_norm": 0.7173376083374023, + "learning_rate": 6.141565896935633e-05, + "loss": 1.2387, + "step": 2770 + }, + { + "epoch": 0.4269541778975741, + "grad_norm": 0.7977665662765503, + "learning_rate": 6.13685400118756e-05, + "loss": 1.277, + "step": 2772 + }, + { + "epoch": 0.42726222564497496, + "grad_norm": 0.784441351890564, + "learning_rate": 6.13214104054305e-05, + "loss": 0.885, + "step": 2774 + }, + { + "epoch": 0.4275702733923758, + "grad_norm": 1.0195475816726685, + "learning_rate": 6.127427019416748e-05, + "loss": 1.2057, + "step": 2776 + }, + { + "epoch": 0.42787832113977664, + "grad_norm": 1.036000370979309, + "learning_rate": 6.122711942224308e-05, + "loss": 1.1531, + "step": 2778 + }, + { + "epoch": 0.42818636888717754, + "grad_norm": 0.7378907799720764, + "learning_rate": 6.117995813382357e-05, + "loss": 1.0969, + "step": 2780 + }, + { + "epoch": 0.4284944166345784, + "grad_norm": 0.8123480081558228, + "learning_rate": 6.113278637308519e-05, + "loss": 1.0515, + "step": 2782 + }, + { + "epoch": 0.4288024643819792, + "grad_norm": 0.8694273829460144, + "learning_rate": 6.108560418421397e-05, + "loss": 0.7927, + "step": 2784 + }, + { + "epoch": 0.42911051212938006, + "grad_norm": 0.7682204842567444, + "learning_rate": 6.103841161140564e-05, + "loss": 0.9275, + "step": 2786 + }, + { + "epoch": 0.4294185598767809, + "grad_norm": 0.8520436882972717, + "learning_rate": 6.099120869886573e-05, + "loss": 1.2366, + "step": 2788 + }, + { + "epoch": 0.42972660762418174, + "grad_norm": 0.9056306481361389, + "learning_rate": 6.0943995490809403e-05, + "loss": 1.9261, + "step": 2790 + }, + { + "epoch": 0.4300346553715826, + "grad_norm": 0.8186244368553162, + "learning_rate": 6.0896772031461514e-05, + "loss": 2.4318, + "step": 2792 + }, + { + "epoch": 0.4303427031189834, + "grad_norm": 0.9430123567581177, + "learning_rate": 6.08495383650565e-05, + "loss": 1.4405, + "step": 2794 + }, + { + "epoch": 0.43065075086638427, + "grad_norm": 0.7463430166244507, + "learning_rate": 6.0802294535838344e-05, + "loss": 1.1768, + "step": 2796 + }, + { + "epoch": 0.4309587986137851, + "grad_norm": 0.7996833324432373, + "learning_rate": 6.0755040588060565e-05, + "loss": 0.9411, + "step": 2798 + }, + { + "epoch": 0.431266846361186, + "grad_norm": 0.6117491722106934, + "learning_rate": 6.070777656598615e-05, + "loss": 0.9498, + "step": 2800 + }, + { + "epoch": 0.43157489410858685, + "grad_norm": 0.6472463607788086, + "learning_rate": 6.066050251388754e-05, + "loss": 0.9428, + "step": 2802 + }, + { + "epoch": 0.4318829418559877, + "grad_norm": 1.8328849077224731, + "learning_rate": 6.061321847604655e-05, + "loss": 0.8237, + "step": 2804 + }, + { + "epoch": 0.43219098960338853, + "grad_norm": 0.6823046207427979, + "learning_rate": 6.0565924496754366e-05, + "loss": 0.9588, + "step": 2806 + }, + { + "epoch": 0.4324990373507894, + "grad_norm": 0.6461644172668457, + "learning_rate": 6.0518620620311475e-05, + "loss": 0.851, + "step": 2808 + }, + { + "epoch": 0.4328070850981902, + "grad_norm": 0.7930482625961304, + "learning_rate": 6.0471306891027637e-05, + "loss": 0.9376, + "step": 2810 + }, + { + "epoch": 0.43311513284559106, + "grad_norm": 0.5953826308250427, + "learning_rate": 6.0423983353221836e-05, + "loss": 1.0449, + "step": 2812 + }, + { + "epoch": 0.4334231805929919, + "grad_norm": 0.8456257581710815, + "learning_rate": 6.037665005122228e-05, + "loss": 0.8364, + "step": 2814 + }, + { + "epoch": 0.43373122834039274, + "grad_norm": 0.6633259654045105, + "learning_rate": 6.032930702936626e-05, + "loss": 0.9747, + "step": 2816 + }, + { + "epoch": 0.43403927608779364, + "grad_norm": 0.7752382755279541, + "learning_rate": 6.0281954332000226e-05, + "loss": 1.7662, + "step": 2818 + }, + { + "epoch": 0.4343473238351945, + "grad_norm": 0.766176164150238, + "learning_rate": 6.023459200347964e-05, + "loss": 1.1785, + "step": 2820 + }, + { + "epoch": 0.4346553715825953, + "grad_norm": 0.698911726474762, + "learning_rate": 6.018722008816905e-05, + "loss": 1.3769, + "step": 2822 + }, + { + "epoch": 0.43496341932999616, + "grad_norm": 0.6189733743667603, + "learning_rate": 6.013983863044195e-05, + "loss": 0.8285, + "step": 2824 + }, + { + "epoch": 0.435271467077397, + "grad_norm": 0.9354870915412903, + "learning_rate": 6.009244767468074e-05, + "loss": 0.8931, + "step": 2826 + }, + { + "epoch": 0.43557951482479784, + "grad_norm": 0.8708924651145935, + "learning_rate": 6.004504726527679e-05, + "loss": 0.9616, + "step": 2828 + }, + { + "epoch": 0.4358875625721987, + "grad_norm": 1.112870693206787, + "learning_rate": 5.999763744663024e-05, + "loss": 1.2036, + "step": 2830 + }, + { + "epoch": 0.4361956103195995, + "grad_norm": 0.7736754417419434, + "learning_rate": 5.9950218263150114e-05, + "loss": 1.0429, + "step": 2832 + }, + { + "epoch": 0.43650365806700037, + "grad_norm": 0.9180028438568115, + "learning_rate": 5.99027897592542e-05, + "loss": 0.9176, + "step": 2834 + }, + { + "epoch": 0.4368117058144012, + "grad_norm": 0.9358363747596741, + "learning_rate": 5.985535197936896e-05, + "loss": 1.0841, + "step": 2836 + }, + { + "epoch": 0.4371197535618021, + "grad_norm": 0.8135294318199158, + "learning_rate": 5.9807904967929605e-05, + "loss": 1.0238, + "step": 2838 + }, + { + "epoch": 0.43742780130920295, + "grad_norm": 0.8550823330879211, + "learning_rate": 5.976044876937997e-05, + "loss": 1.2206, + "step": 2840 + }, + { + "epoch": 0.4377358490566038, + "grad_norm": 0.693490743637085, + "learning_rate": 5.9712983428172494e-05, + "loss": 0.9696, + "step": 2842 + }, + { + "epoch": 0.43804389680400463, + "grad_norm": 0.7726758718490601, + "learning_rate": 5.9665508988768185e-05, + "loss": 0.9961, + "step": 2844 + }, + { + "epoch": 0.43835194455140547, + "grad_norm": 0.6240848302841187, + "learning_rate": 5.961802549563658e-05, + "loss": 0.9851, + "step": 2846 + }, + { + "epoch": 0.4386599922988063, + "grad_norm": 0.8528410196304321, + "learning_rate": 5.957053299325566e-05, + "loss": 0.9969, + "step": 2848 + }, + { + "epoch": 0.43896804004620715, + "grad_norm": 0.8000697493553162, + "learning_rate": 5.952303152611191e-05, + "loss": 1.1812, + "step": 2850 + }, + { + "epoch": 0.439276087793608, + "grad_norm": 0.6206786036491394, + "learning_rate": 5.947552113870013e-05, + "loss": 1.0304, + "step": 2852 + }, + { + "epoch": 0.43958413554100884, + "grad_norm": 0.8891565203666687, + "learning_rate": 5.942800187552359e-05, + "loss": 1.0588, + "step": 2854 + }, + { + "epoch": 0.4398921832884097, + "grad_norm": 0.584948718547821, + "learning_rate": 5.938047378109373e-05, + "loss": 1.0238, + "step": 2856 + }, + { + "epoch": 0.4402002310358106, + "grad_norm": 0.502342164516449, + "learning_rate": 5.93329368999304e-05, + "loss": 1.6083, + "step": 2858 + }, + { + "epoch": 0.4405082787832114, + "grad_norm": 0.7828280925750732, + "learning_rate": 5.9285391276561565e-05, + "loss": 1.109, + "step": 2860 + }, + { + "epoch": 0.44081632653061226, + "grad_norm": 0.5425410270690918, + "learning_rate": 5.9237836955523484e-05, + "loss": 1.9821, + "step": 2862 + }, + { + "epoch": 0.4411243742780131, + "grad_norm": 0.864776611328125, + "learning_rate": 5.9190273981360454e-05, + "loss": 2.4012, + "step": 2864 + }, + { + "epoch": 0.44143242202541394, + "grad_norm": 0.77806156873703, + "learning_rate": 5.9142702398624985e-05, + "loss": 1.0941, + "step": 2866 + }, + { + "epoch": 0.4417404697728148, + "grad_norm": 0.6679258942604065, + "learning_rate": 5.909512225187759e-05, + "loss": 0.8178, + "step": 2868 + }, + { + "epoch": 0.4420485175202156, + "grad_norm": 1.0282930135726929, + "learning_rate": 5.9047533585686776e-05, + "loss": 0.9111, + "step": 2870 + }, + { + "epoch": 0.44235656526761646, + "grad_norm": 0.9571512341499329, + "learning_rate": 5.8999936444629125e-05, + "loss": 0.9596, + "step": 2872 + }, + { + "epoch": 0.4426646130150173, + "grad_norm": 0.5731912851333618, + "learning_rate": 5.895233087328904e-05, + "loss": 0.7965, + "step": 2874 + }, + { + "epoch": 0.4429726607624182, + "grad_norm": 0.8772318959236145, + "learning_rate": 5.890471691625894e-05, + "loss": 1.019, + "step": 2876 + }, + { + "epoch": 0.44328070850981904, + "grad_norm": 0.5373838543891907, + "learning_rate": 5.8857094618138996e-05, + "loss": 1.4404, + "step": 2878 + }, + { + "epoch": 0.4435887562572199, + "grad_norm": 1.4746814966201782, + "learning_rate": 5.8809464023537265e-05, + "loss": 1.1947, + "step": 2880 + }, + { + "epoch": 0.4438968040046207, + "grad_norm": 0.8003684878349304, + "learning_rate": 5.876182517706954e-05, + "loss": 2.4831, + "step": 2882 + }, + { + "epoch": 0.44420485175202157, + "grad_norm": 0.7508583068847656, + "learning_rate": 5.8714178123359345e-05, + "loss": 0.97, + "step": 2884 + }, + { + "epoch": 0.4445128994994224, + "grad_norm": 0.7142212986946106, + "learning_rate": 5.8666522907037905e-05, + "loss": 0.9413, + "step": 2886 + }, + { + "epoch": 0.44482094724682325, + "grad_norm": 0.8980920910835266, + "learning_rate": 5.8618859572744065e-05, + "loss": 2.0577, + "step": 2888 + }, + { + "epoch": 0.4451289949942241, + "grad_norm": 0.6517657041549683, + "learning_rate": 5.8571188165124316e-05, + "loss": 0.8379, + "step": 2890 + }, + { + "epoch": 0.44543704274162493, + "grad_norm": 0.7201358675956726, + "learning_rate": 5.852350872883267e-05, + "loss": 2.1127, + "step": 2892 + }, + { + "epoch": 0.4457450904890258, + "grad_norm": 0.876204252243042, + "learning_rate": 5.847582130853068e-05, + "loss": 0.9175, + "step": 2894 + }, + { + "epoch": 0.44605313823642667, + "grad_norm": 0.7286810278892517, + "learning_rate": 5.842812594888737e-05, + "loss": 1.0932, + "step": 2896 + }, + { + "epoch": 0.4463611859838275, + "grad_norm": 0.9662947654724121, + "learning_rate": 5.838042269457924e-05, + "loss": 0.9577, + "step": 2898 + }, + { + "epoch": 0.44666923373122835, + "grad_norm": 0.8888316750526428, + "learning_rate": 5.83327115902901e-05, + "loss": 1.2033, + "step": 2900 + }, + { + "epoch": 0.4469772814786292, + "grad_norm": 0.6747696399688721, + "learning_rate": 5.8284992680711204e-05, + "loss": 1.0227, + "step": 2902 + }, + { + "epoch": 0.44728532922603004, + "grad_norm": 0.6601919531822205, + "learning_rate": 5.8237266010541046e-05, + "loss": 0.8986, + "step": 2904 + }, + { + "epoch": 0.4475933769734309, + "grad_norm": 0.6246869564056396, + "learning_rate": 5.818953162448545e-05, + "loss": 0.94, + "step": 2906 + }, + { + "epoch": 0.4479014247208317, + "grad_norm": 0.7867985963821411, + "learning_rate": 5.814178956725742e-05, + "loss": 0.8662, + "step": 2908 + }, + { + "epoch": 0.44820947246823256, + "grad_norm": 0.7554641366004944, + "learning_rate": 5.8094039883577164e-05, + "loss": 1.655, + "step": 2910 + }, + { + "epoch": 0.4485175202156334, + "grad_norm": 0.7745111584663391, + "learning_rate": 5.804628261817204e-05, + "loss": 0.9167, + "step": 2912 + }, + { + "epoch": 0.44882556796303424, + "grad_norm": 0.5894488096237183, + "learning_rate": 5.79985178157765e-05, + "loss": 0.8354, + "step": 2914 + }, + { + "epoch": 0.44913361571043514, + "grad_norm": 0.7678777575492859, + "learning_rate": 5.7950745521132044e-05, + "loss": 0.8144, + "step": 2916 + }, + { + "epoch": 0.449441663457836, + "grad_norm": 0.6224768757820129, + "learning_rate": 5.7902965778987215e-05, + "loss": 0.8566, + "step": 2918 + }, + { + "epoch": 0.4497497112052368, + "grad_norm": 0.9957372546195984, + "learning_rate": 5.785517863409752e-05, + "loss": 0.9116, + "step": 2920 + }, + { + "epoch": 0.45005775895263767, + "grad_norm": 0.722830057144165, + "learning_rate": 5.7807384131225395e-05, + "loss": 0.9513, + "step": 2922 + }, + { + "epoch": 0.4503658067000385, + "grad_norm": 0.7828933000564575, + "learning_rate": 5.775958231514018e-05, + "loss": 0.9646, + "step": 2924 + }, + { + "epoch": 0.45067385444743935, + "grad_norm": 0.511208176612854, + "learning_rate": 5.771177323061806e-05, + "loss": 0.8199, + "step": 2926 + }, + { + "epoch": 0.4509819021948402, + "grad_norm": 0.8397185206413269, + "learning_rate": 5.766395692244202e-05, + "loss": 1.0869, + "step": 2928 + }, + { + "epoch": 0.45128994994224103, + "grad_norm": 0.7995164394378662, + "learning_rate": 5.761613343540182e-05, + "loss": 1.1447, + "step": 2930 + }, + { + "epoch": 0.4515979976896419, + "grad_norm": 0.8521782159805298, + "learning_rate": 5.756830281429395e-05, + "loss": 0.9796, + "step": 2932 + }, + { + "epoch": 0.45190604543704277, + "grad_norm": 0.8117061257362366, + "learning_rate": 5.752046510392156e-05, + "loss": 0.8261, + "step": 2934 + }, + { + "epoch": 0.4522140931844436, + "grad_norm": 0.7211443781852722, + "learning_rate": 5.747262034909446e-05, + "loss": 1.2989, + "step": 2936 + }, + { + "epoch": 0.45252214093184445, + "grad_norm": 1.1251598596572876, + "learning_rate": 5.7424768594629094e-05, + "loss": 1.1287, + "step": 2938 + }, + { + "epoch": 0.4528301886792453, + "grad_norm": 0.7493430376052856, + "learning_rate": 5.737690988534836e-05, + "loss": 0.9913, + "step": 2940 + }, + { + "epoch": 0.45313823642664613, + "grad_norm": 1.022343635559082, + "learning_rate": 5.732904426608179e-05, + "loss": 1.0708, + "step": 2942 + }, + { + "epoch": 0.453446284174047, + "grad_norm": 0.7727839946746826, + "learning_rate": 5.728117178166528e-05, + "loss": 0.9274, + "step": 2944 + }, + { + "epoch": 0.4537543319214478, + "grad_norm": 1.04244065284729, + "learning_rate": 5.7233292476941245e-05, + "loss": 1.0758, + "step": 2946 + }, + { + "epoch": 0.45406237966884866, + "grad_norm": 0.7736064791679382, + "learning_rate": 5.7185406396758445e-05, + "loss": 0.9561, + "step": 2948 + }, + { + "epoch": 0.4543704274162495, + "grad_norm": 0.9154314398765564, + "learning_rate": 5.7137513585972e-05, + "loss": 1.0392, + "step": 2950 + }, + { + "epoch": 0.45467847516365034, + "grad_norm": 0.8307647109031677, + "learning_rate": 5.708961408944333e-05, + "loss": 0.837, + "step": 2952 + }, + { + "epoch": 0.45498652291105124, + "grad_norm": 0.7958799600601196, + "learning_rate": 5.704170795204009e-05, + "loss": 1.1606, + "step": 2954 + }, + { + "epoch": 0.4552945706584521, + "grad_norm": 0.8393613696098328, + "learning_rate": 5.6993795218636215e-05, + "loss": 0.9094, + "step": 2956 + }, + { + "epoch": 0.4556026184058529, + "grad_norm": 0.6367685794830322, + "learning_rate": 5.694587593411176e-05, + "loss": 0.9539, + "step": 2958 + }, + { + "epoch": 0.45591066615325376, + "grad_norm": 0.774171769618988, + "learning_rate": 5.689795014335296e-05, + "loss": 1.0237, + "step": 2960 + }, + { + "epoch": 0.4562187139006546, + "grad_norm": 0.5494070649147034, + "learning_rate": 5.6850017891252125e-05, + "loss": 2.4835, + "step": 2962 + }, + { + "epoch": 0.45652676164805545, + "grad_norm": 0.764487624168396, + "learning_rate": 5.6802079222707614e-05, + "loss": 1.0823, + "step": 2964 + }, + { + "epoch": 0.4568348093954563, + "grad_norm": 0.6190123558044434, + "learning_rate": 5.67541341826238e-05, + "loss": 0.8227, + "step": 2966 + }, + { + "epoch": 0.45714285714285713, + "grad_norm": 0.7573233842849731, + "learning_rate": 5.6706182815911026e-05, + "loss": 0.9032, + "step": 2968 + }, + { + "epoch": 0.45745090489025797, + "grad_norm": 0.7121224999427795, + "learning_rate": 5.665822516748557e-05, + "loss": 0.9701, + "step": 2970 + }, + { + "epoch": 0.4577589526376588, + "grad_norm": 0.8833084106445312, + "learning_rate": 5.661026128226956e-05, + "loss": 1.0514, + "step": 2972 + }, + { + "epoch": 0.4580670003850597, + "grad_norm": 0.7853366136550903, + "learning_rate": 5.656229120519102e-05, + "loss": 0.7792, + "step": 2974 + }, + { + "epoch": 0.45837504813246055, + "grad_norm": 0.7289166450500488, + "learning_rate": 5.651431498118372e-05, + "loss": 0.9208, + "step": 2976 + }, + { + "epoch": 0.4586830958798614, + "grad_norm": 0.7399483323097229, + "learning_rate": 5.6466332655187235e-05, + "loss": 0.8391, + "step": 2978 + }, + { + "epoch": 0.45899114362726223, + "grad_norm": 0.6576483249664307, + "learning_rate": 5.6418344272146816e-05, + "loss": 1.1026, + "step": 2980 + }, + { + "epoch": 0.4592991913746631, + "grad_norm": 0.9064686894416809, + "learning_rate": 5.6370349877013426e-05, + "loss": 0.9908, + "step": 2982 + }, + { + "epoch": 0.4596072391220639, + "grad_norm": 0.6922678351402283, + "learning_rate": 5.632234951474361e-05, + "loss": 1.0612, + "step": 2984 + }, + { + "epoch": 0.45991528686946476, + "grad_norm": 0.6031603217124939, + "learning_rate": 5.6274343230299566e-05, + "loss": 1.0215, + "step": 2986 + }, + { + "epoch": 0.4602233346168656, + "grad_norm": 0.7492870092391968, + "learning_rate": 5.622633106864895e-05, + "loss": 0.8476, + "step": 2988 + }, + { + "epoch": 0.46053138236426644, + "grad_norm": 0.9854443669319153, + "learning_rate": 5.617831307476503e-05, + "loss": 1.1229, + "step": 2990 + }, + { + "epoch": 0.4608394301116673, + "grad_norm": 0.8583337068557739, + "learning_rate": 5.613028929362647e-05, + "loss": 0.9634, + "step": 2992 + }, + { + "epoch": 0.4611474778590682, + "grad_norm": 0.7198197841644287, + "learning_rate": 5.6082259770217363e-05, + "loss": 0.931, + "step": 2994 + }, + { + "epoch": 0.461455525606469, + "grad_norm": 0.8717345595359802, + "learning_rate": 5.603422454952719e-05, + "loss": 1.1268, + "step": 2996 + }, + { + "epoch": 0.46176357335386986, + "grad_norm": 0.558197557926178, + "learning_rate": 5.598618367655075e-05, + "loss": 0.8507, + "step": 2998 + }, + { + "epoch": 0.4620716211012707, + "grad_norm": 0.8464396595954895, + "learning_rate": 5.593813719628819e-05, + "loss": 1.0359, + "step": 3000 + }, + { + "epoch": 0.4620716211012707, + "eval_loss": 2.401945114135742, + "eval_runtime": 736.2006, + "eval_samples_per_second": 2.717, + "eval_steps_per_second": 0.679, + "step": 3000 + }, + { + "epoch": 0.46237966884867154, + "grad_norm": 0.8869822025299072, + "learning_rate": 5.589008515374484e-05, + "loss": 1.1223, + "step": 3002 + }, + { + "epoch": 0.4626877165960724, + "grad_norm": 0.6646120548248291, + "learning_rate": 5.584202759393128e-05, + "loss": 0.9921, + "step": 3004 + }, + { + "epoch": 0.4629957643434732, + "grad_norm": 0.8584650158882141, + "learning_rate": 5.5793964561863256e-05, + "loss": 1.1436, + "step": 3006 + }, + { + "epoch": 0.46330381209087407, + "grad_norm": 0.6617639660835266, + "learning_rate": 5.5745896102561636e-05, + "loss": 0.8618, + "step": 3008 + }, + { + "epoch": 0.4636118598382749, + "grad_norm": 0.825599193572998, + "learning_rate": 5.569782226105236e-05, + "loss": 0.9942, + "step": 3010 + }, + { + "epoch": 0.4639199075856758, + "grad_norm": 0.6756584644317627, + "learning_rate": 5.564974308236642e-05, + "loss": 0.9885, + "step": 3012 + }, + { + "epoch": 0.46422795533307665, + "grad_norm": 0.6235724687576294, + "learning_rate": 5.560165861153982e-05, + "loss": 0.7758, + "step": 3014 + }, + { + "epoch": 0.4645360030804775, + "grad_norm": 1.003424882888794, + "learning_rate": 5.555356889361349e-05, + "loss": 0.7478, + "step": 3016 + }, + { + "epoch": 0.46484405082787833, + "grad_norm": 0.6220825910568237, + "learning_rate": 5.55054739736333e-05, + "loss": 2.3584, + "step": 3018 + }, + { + "epoch": 0.46515209857527917, + "grad_norm": 0.7365529537200928, + "learning_rate": 5.545737389664999e-05, + "loss": 1.0472, + "step": 3020 + }, + { + "epoch": 0.46546014632268, + "grad_norm": 1.023934006690979, + "learning_rate": 5.540926870771913e-05, + "loss": 0.9631, + "step": 3022 + }, + { + "epoch": 0.46576819407008085, + "grad_norm": 1.0248138904571533, + "learning_rate": 5.536115845190105e-05, + "loss": 1.0431, + "step": 3024 + }, + { + "epoch": 0.4660762418174817, + "grad_norm": 0.6595494747161865, + "learning_rate": 5.531304317426089e-05, + "loss": 0.9056, + "step": 3026 + }, + { + "epoch": 0.46638428956488254, + "grad_norm": 0.7172659039497375, + "learning_rate": 5.526492291986841e-05, + "loss": 1.0834, + "step": 3028 + }, + { + "epoch": 0.4666923373122834, + "grad_norm": 0.8179621696472168, + "learning_rate": 5.521679773379812e-05, + "loss": 0.9538, + "step": 3030 + }, + { + "epoch": 0.4670003850596843, + "grad_norm": 0.6643562912940979, + "learning_rate": 5.516866766112908e-05, + "loss": 0.9234, + "step": 3032 + }, + { + "epoch": 0.4673084328070851, + "grad_norm": 0.8943853974342346, + "learning_rate": 5.5120532746944955e-05, + "loss": 1.2507, + "step": 3034 + }, + { + "epoch": 0.46761648055448596, + "grad_norm": 0.8801318407058716, + "learning_rate": 5.507239303633396e-05, + "loss": 1.0128, + "step": 3036 + }, + { + "epoch": 0.4679245283018868, + "grad_norm": 1.0031734704971313, + "learning_rate": 5.502424857438876e-05, + "loss": 1.0573, + "step": 3038 + }, + { + "epoch": 0.46823257604928764, + "grad_norm": 0.686630129814148, + "learning_rate": 5.4976099406206516e-05, + "loss": 0.9245, + "step": 3040 + }, + { + "epoch": 0.4685406237966885, + "grad_norm": 0.4664287269115448, + "learning_rate": 5.492794557688877e-05, + "loss": 1.6719, + "step": 3042 + }, + { + "epoch": 0.4688486715440893, + "grad_norm": 0.868158221244812, + "learning_rate": 5.487978713154144e-05, + "loss": 1.6452, + "step": 3044 + }, + { + "epoch": 0.46915671929149017, + "grad_norm": 0.7039262056350708, + "learning_rate": 5.483162411527477e-05, + "loss": 0.9206, + "step": 3046 + }, + { + "epoch": 0.469464767038891, + "grad_norm": 0.7789177894592285, + "learning_rate": 5.4783456573203283e-05, + "loss": 0.9925, + "step": 3048 + }, + { + "epoch": 0.46977281478629185, + "grad_norm": 0.8716445565223694, + "learning_rate": 5.473528455044572e-05, + "loss": 1.0072, + "step": 3050 + }, + { + "epoch": 0.47008086253369274, + "grad_norm": 0.786055862903595, + "learning_rate": 5.4687108092125074e-05, + "loss": 1.162, + "step": 3052 + }, + { + "epoch": 0.4703889102810936, + "grad_norm": 0.9024592638015747, + "learning_rate": 5.463892724336843e-05, + "loss": 1.053, + "step": 3054 + }, + { + "epoch": 0.4706969580284944, + "grad_norm": 1.0428143739700317, + "learning_rate": 5.459074204930703e-05, + "loss": 0.9698, + "step": 3056 + }, + { + "epoch": 0.47100500577589527, + "grad_norm": 0.6836503148078918, + "learning_rate": 5.454255255507615e-05, + "loss": 1.0836, + "step": 3058 + }, + { + "epoch": 0.4713130535232961, + "grad_norm": 0.7136745452880859, + "learning_rate": 5.449435880581513e-05, + "loss": 0.9677, + "step": 3060 + }, + { + "epoch": 0.47162110127069695, + "grad_norm": 0.9150458574295044, + "learning_rate": 5.444616084666729e-05, + "loss": 0.9073, + "step": 3062 + }, + { + "epoch": 0.4719291490180978, + "grad_norm": 0.7028246521949768, + "learning_rate": 5.439795872277985e-05, + "loss": 1.0398, + "step": 3064 + }, + { + "epoch": 0.47223719676549863, + "grad_norm": 1.0035873651504517, + "learning_rate": 5.4349752479304e-05, + "loss": 1.3181, + "step": 3066 + }, + { + "epoch": 0.4725452445128995, + "grad_norm": 0.91478031873703, + "learning_rate": 5.430154216139471e-05, + "loss": 1.049, + "step": 3068 + }, + { + "epoch": 0.4728532922603004, + "grad_norm": 0.8790503740310669, + "learning_rate": 5.425332781421085e-05, + "loss": 0.8075, + "step": 3070 + }, + { + "epoch": 0.4731613400077012, + "grad_norm": 1.371443271636963, + "learning_rate": 5.4205109482915017e-05, + "loss": 1.3958, + "step": 3072 + }, + { + "epoch": 0.47346938775510206, + "grad_norm": 0.630165159702301, + "learning_rate": 5.4156887212673535e-05, + "loss": 1.0174, + "step": 3074 + }, + { + "epoch": 0.4737774355025029, + "grad_norm": 0.5171129107475281, + "learning_rate": 5.410866104865643e-05, + "loss": 0.9023, + "step": 3076 + }, + { + "epoch": 0.47408548324990374, + "grad_norm": 0.7049612998962402, + "learning_rate": 5.4060431036037376e-05, + "loss": 0.778, + "step": 3078 + }, + { + "epoch": 0.4743935309973046, + "grad_norm": 0.6293292045593262, + "learning_rate": 5.401219721999364e-05, + "loss": 0.9847, + "step": 3080 + }, + { + "epoch": 0.4747015787447054, + "grad_norm": 0.6555522084236145, + "learning_rate": 5.3963959645706085e-05, + "loss": 1.0141, + "step": 3082 + }, + { + "epoch": 0.47500962649210626, + "grad_norm": 0.8215299248695374, + "learning_rate": 5.3915718358359066e-05, + "loss": 1.0369, + "step": 3084 + }, + { + "epoch": 0.4753176742395071, + "grad_norm": 0.8922412395477295, + "learning_rate": 5.386747340314041e-05, + "loss": 0.9832, + "step": 3086 + }, + { + "epoch": 0.47562572198690795, + "grad_norm": 0.6807952523231506, + "learning_rate": 5.38192248252414e-05, + "loss": 0.9139, + "step": 3088 + }, + { + "epoch": 0.47593376973430884, + "grad_norm": 0.6609580516815186, + "learning_rate": 5.37709726698567e-05, + "loss": 1.0296, + "step": 3090 + }, + { + "epoch": 0.4762418174817097, + "grad_norm": 1.3963803052902222, + "learning_rate": 5.372271698218433e-05, + "loss": 1.0484, + "step": 3092 + }, + { + "epoch": 0.4765498652291105, + "grad_norm": 0.6297670602798462, + "learning_rate": 5.367445780742559e-05, + "loss": 1.0407, + "step": 3094 + }, + { + "epoch": 0.47685791297651137, + "grad_norm": 0.622728705406189, + "learning_rate": 5.362619519078514e-05, + "loss": 0.9035, + "step": 3096 + }, + { + "epoch": 0.4771659607239122, + "grad_norm": 0.5931330919265747, + "learning_rate": 5.3577929177470757e-05, + "loss": 0.8357, + "step": 3098 + }, + { + "epoch": 0.47747400847131305, + "grad_norm": 0.9229997992515564, + "learning_rate": 5.352965981269342e-05, + "loss": 0.9726, + "step": 3100 + }, + { + "epoch": 0.4777820562187139, + "grad_norm": 0.7359765768051147, + "learning_rate": 5.348138714166731e-05, + "loss": 1.0161, + "step": 3102 + }, + { + "epoch": 0.47809010396611473, + "grad_norm": 0.929198682308197, + "learning_rate": 5.343311120960962e-05, + "loss": 0.8967, + "step": 3104 + }, + { + "epoch": 0.4783981517135156, + "grad_norm": 0.8329547047615051, + "learning_rate": 5.33848320617407e-05, + "loss": 1.2662, + "step": 3106 + }, + { + "epoch": 0.4787061994609164, + "grad_norm": 0.9614522457122803, + "learning_rate": 5.333654974328378e-05, + "loss": 1.0578, + "step": 3108 + }, + { + "epoch": 0.4790142472083173, + "grad_norm": 0.6592017412185669, + "learning_rate": 5.3288264299465196e-05, + "loss": 1.0484, + "step": 3110 + }, + { + "epoch": 0.47932229495571815, + "grad_norm": 0.906784176826477, + "learning_rate": 5.3239975775514097e-05, + "loss": 1.086, + "step": 3112 + }, + { + "epoch": 0.479630342703119, + "grad_norm": 0.7839928269386292, + "learning_rate": 5.319168421666261e-05, + "loss": 0.8187, + "step": 3114 + }, + { + "epoch": 0.47993839045051984, + "grad_norm": 0.756097137928009, + "learning_rate": 5.314338966814564e-05, + "loss": 0.9537, + "step": 3116 + }, + { + "epoch": 0.4802464381979207, + "grad_norm": 0.846994936466217, + "learning_rate": 5.309509217520092e-05, + "loss": 1.0448, + "step": 3118 + }, + { + "epoch": 0.4805544859453215, + "grad_norm": 0.5622255802154541, + "learning_rate": 5.304679178306894e-05, + "loss": 0.8573, + "step": 3120 + }, + { + "epoch": 0.48086253369272236, + "grad_norm": 0.6583067774772644, + "learning_rate": 5.2998488536992906e-05, + "loss": 0.9072, + "step": 3122 + }, + { + "epoch": 0.4811705814401232, + "grad_norm": 0.6179800629615784, + "learning_rate": 5.295018248221868e-05, + "loss": 1.9247, + "step": 3124 + }, + { + "epoch": 0.48147862918752404, + "grad_norm": 0.759346604347229, + "learning_rate": 5.290187366399478e-05, + "loss": 0.8023, + "step": 3126 + }, + { + "epoch": 0.48178667693492494, + "grad_norm": 1.0747575759887695, + "learning_rate": 5.285356212757231e-05, + "loss": 0.8941, + "step": 3128 + }, + { + "epoch": 0.4820947246823258, + "grad_norm": 0.7229118943214417, + "learning_rate": 5.280524791820488e-05, + "loss": 0.8908, + "step": 3130 + }, + { + "epoch": 0.4824027724297266, + "grad_norm": 0.9550780653953552, + "learning_rate": 5.275693108114868e-05, + "loss": 0.8559, + "step": 3132 + }, + { + "epoch": 0.48271082017712746, + "grad_norm": 0.9661011099815369, + "learning_rate": 5.2708611661662256e-05, + "loss": 1.0929, + "step": 3134 + }, + { + "epoch": 0.4830188679245283, + "grad_norm": 1.5283000469207764, + "learning_rate": 5.2660289705006696e-05, + "loss": 0.9719, + "step": 3136 + }, + { + "epoch": 0.48332691567192915, + "grad_norm": 0.9643043279647827, + "learning_rate": 5.261196525644535e-05, + "loss": 1.1199, + "step": 3138 + }, + { + "epoch": 0.48363496341933, + "grad_norm": 0.7876790761947632, + "learning_rate": 5.2563638361244004e-05, + "loss": 0.8679, + "step": 3140 + }, + { + "epoch": 0.48394301116673083, + "grad_norm": 0.7540126442909241, + "learning_rate": 5.251530906467065e-05, + "loss": 0.9442, + "step": 3142 + }, + { + "epoch": 0.48425105891413167, + "grad_norm": 0.7319709062576294, + "learning_rate": 5.2466977411995567e-05, + "loss": 0.8821, + "step": 3144 + }, + { + "epoch": 0.4845591066615325, + "grad_norm": 0.8169276118278503, + "learning_rate": 5.2418643448491265e-05, + "loss": 1.0041, + "step": 3146 + }, + { + "epoch": 0.4848671544089334, + "grad_norm": 0.8061478137969971, + "learning_rate": 5.237030721943236e-05, + "loss": 0.9777, + "step": 3148 + }, + { + "epoch": 0.48517520215633425, + "grad_norm": 0.8491753339767456, + "learning_rate": 5.2321968770095654e-05, + "loss": 1.2202, + "step": 3150 + }, + { + "epoch": 0.4854832499037351, + "grad_norm": 0.7268081307411194, + "learning_rate": 5.2273628145759954e-05, + "loss": 0.8325, + "step": 3152 + }, + { + "epoch": 0.48579129765113593, + "grad_norm": 1.062333583831787, + "learning_rate": 5.2225285391706194e-05, + "loss": 0.768, + "step": 3154 + }, + { + "epoch": 0.4860993453985368, + "grad_norm": 0.8420694470405579, + "learning_rate": 5.217694055321724e-05, + "loss": 1.969, + "step": 3156 + }, + { + "epoch": 0.4864073931459376, + "grad_norm": 0.6028314828872681, + "learning_rate": 5.212859367557793e-05, + "loss": 0.8924, + "step": 3158 + }, + { + "epoch": 0.48671544089333846, + "grad_norm": 0.5836811065673828, + "learning_rate": 5.2080244804075e-05, + "loss": 1.2632, + "step": 3160 + }, + { + "epoch": 0.4870234886407393, + "grad_norm": 0.6890228390693665, + "learning_rate": 5.203189398399707e-05, + "loss": 1.0361, + "step": 3162 + }, + { + "epoch": 0.48733153638814014, + "grad_norm": 0.8328240513801575, + "learning_rate": 5.1983541260634586e-05, + "loss": 1.0121, + "step": 3164 + }, + { + "epoch": 0.487639584135541, + "grad_norm": 0.7056050896644592, + "learning_rate": 5.1935186679279745e-05, + "loss": 0.9742, + "step": 3166 + }, + { + "epoch": 0.4879476318829419, + "grad_norm": 0.6008009314537048, + "learning_rate": 5.188683028522654e-05, + "loss": 0.8679, + "step": 3168 + }, + { + "epoch": 0.4882556796303427, + "grad_norm": 0.766815185546875, + "learning_rate": 5.183847212377061e-05, + "loss": 0.8381, + "step": 3170 + }, + { + "epoch": 0.48856372737774356, + "grad_norm": 0.7284615635871887, + "learning_rate": 5.179011224020928e-05, + "loss": 0.8723, + "step": 3172 + }, + { + "epoch": 0.4888717751251444, + "grad_norm": 0.7973135709762573, + "learning_rate": 5.174175067984145e-05, + "loss": 0.8574, + "step": 3174 + }, + { + "epoch": 0.48917982287254524, + "grad_norm": 0.8865051865577698, + "learning_rate": 5.169338748796767e-05, + "loss": 1.1238, + "step": 3176 + }, + { + "epoch": 0.4894878706199461, + "grad_norm": 0.6385417580604553, + "learning_rate": 5.164502270988992e-05, + "loss": 0.7434, + "step": 3178 + }, + { + "epoch": 0.4897959183673469, + "grad_norm": 0.7411341071128845, + "learning_rate": 5.1596656390911756e-05, + "loss": 0.9213, + "step": 3180 + }, + { + "epoch": 0.49010396611474777, + "grad_norm": 0.49342554807662964, + "learning_rate": 5.15482885763381e-05, + "loss": 0.8866, + "step": 3182 + }, + { + "epoch": 0.4904120138621486, + "grad_norm": 0.8545454144477844, + "learning_rate": 5.149991931147531e-05, + "loss": 0.9304, + "step": 3184 + }, + { + "epoch": 0.4907200616095495, + "grad_norm": 0.9464917778968811, + "learning_rate": 5.145154864163114e-05, + "loss": 1.0994, + "step": 3186 + }, + { + "epoch": 0.49102810935695035, + "grad_norm": 0.7000471949577332, + "learning_rate": 5.140317661211457e-05, + "loss": 1.1924, + "step": 3188 + }, + { + "epoch": 0.4913361571043512, + "grad_norm": 0.6230635643005371, + "learning_rate": 5.135480326823594e-05, + "loss": 0.9787, + "step": 3190 + }, + { + "epoch": 0.49164420485175203, + "grad_norm": 0.5892027020454407, + "learning_rate": 5.130642865530676e-05, + "loss": 0.9198, + "step": 3192 + }, + { + "epoch": 0.4919522525991529, + "grad_norm": 0.6804258823394775, + "learning_rate": 5.12580528186398e-05, + "loss": 1.0212, + "step": 3194 + }, + { + "epoch": 0.4922603003465537, + "grad_norm": 0.5627198219299316, + "learning_rate": 5.1209675803548875e-05, + "loss": 0.8279, + "step": 3196 + }, + { + "epoch": 0.49256834809395456, + "grad_norm": 0.6895137429237366, + "learning_rate": 5.116129765534899e-05, + "loss": 0.8793, + "step": 3198 + }, + { + "epoch": 0.4928763958413554, + "grad_norm": 0.6826330423355103, + "learning_rate": 5.111291841935619e-05, + "loss": 1.0857, + "step": 3200 + }, + { + "epoch": 0.49318444358875624, + "grad_norm": 1.6944973468780518, + "learning_rate": 5.106453814088753e-05, + "loss": 1.0824, + "step": 3202 + }, + { + "epoch": 0.4934924913361571, + "grad_norm": 0.7384228706359863, + "learning_rate": 5.101615686526102e-05, + "loss": 2.2589, + "step": 3204 + }, + { + "epoch": 0.493800539083558, + "grad_norm": 0.6678666472434998, + "learning_rate": 5.096777463779565e-05, + "loss": 1.6213, + "step": 3206 + }, + { + "epoch": 0.4941085868309588, + "grad_norm": 0.9313806891441345, + "learning_rate": 5.091939150381127e-05, + "loss": 0.8571, + "step": 3208 + }, + { + "epoch": 0.49441663457835966, + "grad_norm": 0.7525261044502258, + "learning_rate": 5.087100750862857e-05, + "loss": 1.0053, + "step": 3210 + }, + { + "epoch": 0.4947246823257605, + "grad_norm": 0.7191951274871826, + "learning_rate": 5.082262269756909e-05, + "loss": 0.961, + "step": 3212 + }, + { + "epoch": 0.49503273007316134, + "grad_norm": 0.5987869501113892, + "learning_rate": 5.0774237115955084e-05, + "loss": 0.9717, + "step": 3214 + }, + { + "epoch": 0.4953407778205622, + "grad_norm": 0.7908967137336731, + "learning_rate": 5.072585080910958e-05, + "loss": 0.7216, + "step": 3216 + }, + { + "epoch": 0.495648825567963, + "grad_norm": 0.8970842957496643, + "learning_rate": 5.067746382235622e-05, + "loss": 0.9381, + "step": 3218 + }, + { + "epoch": 0.49595687331536387, + "grad_norm": 1.010067105293274, + "learning_rate": 5.0629076201019364e-05, + "loss": 0.9412, + "step": 3220 + }, + { + "epoch": 0.4962649210627647, + "grad_norm": 0.6909099221229553, + "learning_rate": 5.058068799042387e-05, + "loss": 0.7825, + "step": 3222 + }, + { + "epoch": 0.49657296881016555, + "grad_norm": 0.8686575293540955, + "learning_rate": 5.053229923589526e-05, + "loss": 0.8715, + "step": 3224 + }, + { + "epoch": 0.49688101655756645, + "grad_norm": 0.6410868167877197, + "learning_rate": 5.048390998275947e-05, + "loss": 0.8294, + "step": 3226 + }, + { + "epoch": 0.4971890643049673, + "grad_norm": 0.7121363878250122, + "learning_rate": 5.043552027634293e-05, + "loss": 1.1581, + "step": 3228 + }, + { + "epoch": 0.49749711205236813, + "grad_norm": 0.8368136882781982, + "learning_rate": 5.0387130161972526e-05, + "loss": 1.0451, + "step": 3230 + }, + { + "epoch": 0.49780515979976897, + "grad_norm": 0.6324896216392517, + "learning_rate": 5.0338739684975486e-05, + "loss": 0.9246, + "step": 3232 + }, + { + "epoch": 0.4981132075471698, + "grad_norm": 0.6032724976539612, + "learning_rate": 5.029034889067943e-05, + "loss": 0.9274, + "step": 3234 + }, + { + "epoch": 0.49842125529457065, + "grad_norm": 0.8420709371566772, + "learning_rate": 5.024195782441219e-05, + "loss": 0.9483, + "step": 3236 + }, + { + "epoch": 0.4987293030419715, + "grad_norm": 0.6967646479606628, + "learning_rate": 5.0193566531501946e-05, + "loss": 1.0132, + "step": 3238 + }, + { + "epoch": 0.49903735078937234, + "grad_norm": 0.6845428347587585, + "learning_rate": 5.014517505727702e-05, + "loss": 1.0773, + "step": 3240 + }, + { + "epoch": 0.4993453985367732, + "grad_norm": 0.6973444223403931, + "learning_rate": 5.0096783447065946e-05, + "loss": 0.9261, + "step": 3242 + }, + { + "epoch": 0.4996534462841741, + "grad_norm": 1.047443151473999, + "learning_rate": 5.004839174619736e-05, + "loss": 1.0449, + "step": 3244 + }, + { + "epoch": 0.4999614940315749, + "grad_norm": 0.8661796450614929, + "learning_rate": 5e-05, + "loss": 0.9442, + "step": 3246 + }, + { + "epoch": 0.5002695417789758, + "grad_norm": 0.9085307121276855, + "learning_rate": 4.995160825380265e-05, + "loss": 0.9839, + "step": 3248 + }, + { + "epoch": 0.5005775895263765, + "grad_norm": 0.7589093446731567, + "learning_rate": 4.990321655293406e-05, + "loss": 1.0097, + "step": 3250 + }, + { + "epoch": 0.5008856372737774, + "grad_norm": 0.6354397535324097, + "learning_rate": 4.985482494272299e-05, + "loss": 0.9261, + "step": 3252 + }, + { + "epoch": 0.5011936850211783, + "grad_norm": 1.0207045078277588, + "learning_rate": 4.980643346849807e-05, + "loss": 1.218, + "step": 3254 + }, + { + "epoch": 0.5015017327685791, + "grad_norm": 0.7316938638687134, + "learning_rate": 4.9758042175587824e-05, + "loss": 0.9952, + "step": 3256 + }, + { + "epoch": 0.50180978051598, + "grad_norm": 0.7260974645614624, + "learning_rate": 4.9709651109320575e-05, + "loss": 1.122, + "step": 3258 + }, + { + "epoch": 0.5021178282633808, + "grad_norm": 0.6711363792419434, + "learning_rate": 4.966126031502452e-05, + "loss": 1.4948, + "step": 3260 + }, + { + "epoch": 0.5024258760107817, + "grad_norm": 1.114310622215271, + "learning_rate": 4.9612869838027485e-05, + "loss": 1.2719, + "step": 3262 + }, + { + "epoch": 0.5027339237581825, + "grad_norm": 0.6233516335487366, + "learning_rate": 4.9564479723657075e-05, + "loss": 0.9378, + "step": 3264 + }, + { + "epoch": 0.5030419715055834, + "grad_norm": 0.9802983403205872, + "learning_rate": 4.951609001724054e-05, + "loss": 1.0383, + "step": 3266 + }, + { + "epoch": 0.5033500192529842, + "grad_norm": 0.9117656350135803, + "learning_rate": 4.9467700764104756e-05, + "loss": 1.0586, + "step": 3268 + }, + { + "epoch": 0.5036580670003851, + "grad_norm": 0.836900532245636, + "learning_rate": 4.941931200957612e-05, + "loss": 1.4957, + "step": 3270 + }, + { + "epoch": 0.5039661147477859, + "grad_norm": 0.8332207798957825, + "learning_rate": 4.937092379898065e-05, + "loss": 1.1056, + "step": 3272 + }, + { + "epoch": 0.5042741624951868, + "grad_norm": 0.8060922026634216, + "learning_rate": 4.9322536177643794e-05, + "loss": 1.0183, + "step": 3274 + }, + { + "epoch": 0.5045822102425876, + "grad_norm": 0.8477973937988281, + "learning_rate": 4.927414919089045e-05, + "loss": 1.0103, + "step": 3276 + }, + { + "epoch": 0.5048902579899884, + "grad_norm": 2.138157606124878, + "learning_rate": 4.922576288404492e-05, + "loss": 0.9738, + "step": 3278 + }, + { + "epoch": 0.5051983057373893, + "grad_norm": 0.9014928936958313, + "learning_rate": 4.917737730243093e-05, + "loss": 0.9046, + "step": 3280 + }, + { + "epoch": 0.5055063534847901, + "grad_norm": 0.8116535544395447, + "learning_rate": 4.912899249137145e-05, + "loss": 0.8913, + "step": 3282 + }, + { + "epoch": 0.505814401232191, + "grad_norm": 0.7429752349853516, + "learning_rate": 4.908060849618875e-05, + "loss": 0.9569, + "step": 3284 + }, + { + "epoch": 0.5061224489795918, + "grad_norm": 0.6032552123069763, + "learning_rate": 4.9032225362204356e-05, + "loss": 0.8054, + "step": 3286 + }, + { + "epoch": 0.5064304967269927, + "grad_norm": 0.8589087724685669, + "learning_rate": 4.898384313473899e-05, + "loss": 2.1833, + "step": 3288 + }, + { + "epoch": 0.5067385444743935, + "grad_norm": 0.8798043727874756, + "learning_rate": 4.893546185911247e-05, + "loss": 0.9825, + "step": 3290 + }, + { + "epoch": 0.5070465922217944, + "grad_norm": 0.7908830046653748, + "learning_rate": 4.888708158064381e-05, + "loss": 1.2025, + "step": 3292 + }, + { + "epoch": 0.5073546399691953, + "grad_norm": 0.9762030243873596, + "learning_rate": 4.8838702344651014e-05, + "loss": 1.0325, + "step": 3294 + }, + { + "epoch": 0.5076626877165961, + "grad_norm": 0.8435834646224976, + "learning_rate": 4.879032419645114e-05, + "loss": 0.9938, + "step": 3296 + }, + { + "epoch": 0.507970735463997, + "grad_norm": 0.754014253616333, + "learning_rate": 4.8741947181360213e-05, + "loss": 0.9847, + "step": 3298 + }, + { + "epoch": 0.5082787832113977, + "grad_norm": 0.9299498796463013, + "learning_rate": 4.869357134469325e-05, + "loss": 0.8945, + "step": 3300 + }, + { + "epoch": 0.5082787832113977, + "eval_loss": 2.3888819217681885, + "eval_runtime": 736.18, + "eval_samples_per_second": 2.717, + "eval_steps_per_second": 0.679, + "step": 3300 + }, + { + "epoch": 0.5085868309587986, + "grad_norm": 0.606257975101471, + "learning_rate": 4.864519673176408e-05, + "loss": 1.0335, + "step": 3302 + }, + { + "epoch": 0.5088948787061994, + "grad_norm": 0.8606283068656921, + "learning_rate": 4.8596823387885435e-05, + "loss": 1.1664, + "step": 3304 + }, + { + "epoch": 0.5092029264536003, + "grad_norm": 0.657559335231781, + "learning_rate": 4.8548451358368876e-05, + "loss": 0.8417, + "step": 3306 + }, + { + "epoch": 0.5095109742010011, + "grad_norm": 0.7792297005653381, + "learning_rate": 4.8500080688524696e-05, + "loss": 1.0581, + "step": 3308 + }, + { + "epoch": 0.509819021948402, + "grad_norm": 0.8086965084075928, + "learning_rate": 4.8451711423661905e-05, + "loss": 1.0145, + "step": 3310 + }, + { + "epoch": 0.5101270696958029, + "grad_norm": 0.7681042551994324, + "learning_rate": 4.8403343609088255e-05, + "loss": 1.1651, + "step": 3312 + }, + { + "epoch": 0.5104351174432037, + "grad_norm": 0.6708490252494812, + "learning_rate": 4.835497729011009e-05, + "loss": 2.3072, + "step": 3314 + }, + { + "epoch": 0.5107431651906046, + "grad_norm": 0.6013543009757996, + "learning_rate": 4.830661251203235e-05, + "loss": 1.064, + "step": 3316 + }, + { + "epoch": 0.5110512129380054, + "grad_norm": 0.9775047898292542, + "learning_rate": 4.825824932015855e-05, + "loss": 2.4686, + "step": 3318 + }, + { + "epoch": 0.5113592606854063, + "grad_norm": 0.551460325717926, + "learning_rate": 4.820988775979074e-05, + "loss": 0.8143, + "step": 3320 + }, + { + "epoch": 0.511667308432807, + "grad_norm": 0.9195789694786072, + "learning_rate": 4.816152787622941e-05, + "loss": 0.9829, + "step": 3322 + }, + { + "epoch": 0.511975356180208, + "grad_norm": 1.0899276733398438, + "learning_rate": 4.811316971477346e-05, + "loss": 1.2838, + "step": 3324 + }, + { + "epoch": 0.5122834039276087, + "grad_norm": 0.8045056462287903, + "learning_rate": 4.806481332072027e-05, + "loss": 1.0896, + "step": 3326 + }, + { + "epoch": 0.5125914516750096, + "grad_norm": 0.6057413220405579, + "learning_rate": 4.801645873936543e-05, + "loss": 1.0034, + "step": 3328 + }, + { + "epoch": 0.5128994994224104, + "grad_norm": 0.8745119571685791, + "learning_rate": 4.796810601600293e-05, + "loss": 1.0275, + "step": 3330 + }, + { + "epoch": 0.5132075471698113, + "grad_norm": 0.6802326440811157, + "learning_rate": 4.7919755195925014e-05, + "loss": 1.0014, + "step": 3332 + }, + { + "epoch": 0.5135155949172122, + "grad_norm": 0.7547101378440857, + "learning_rate": 4.787140632442208e-05, + "loss": 1.0106, + "step": 3334 + }, + { + "epoch": 0.513823642664613, + "grad_norm": 0.883599579334259, + "learning_rate": 4.782305944678277e-05, + "loss": 0.9676, + "step": 3336 + }, + { + "epoch": 0.5141316904120139, + "grad_norm": 0.9065552353858948, + "learning_rate": 4.7774714608293804e-05, + "loss": 1.0767, + "step": 3338 + }, + { + "epoch": 0.5144397381594147, + "grad_norm": 0.8647933006286621, + "learning_rate": 4.772637185424005e-05, + "loss": 1.0344, + "step": 3340 + }, + { + "epoch": 0.5147477859068156, + "grad_norm": 0.7071940302848816, + "learning_rate": 4.767803122990437e-05, + "loss": 1.0686, + "step": 3342 + }, + { + "epoch": 0.5150558336542164, + "grad_norm": 0.70546954870224, + "learning_rate": 4.762969278056765e-05, + "loss": 0.9587, + "step": 3344 + }, + { + "epoch": 0.5153638814016173, + "grad_norm": 0.7491280436515808, + "learning_rate": 4.758135655150875e-05, + "loss": 0.9854, + "step": 3346 + }, + { + "epoch": 0.515671929149018, + "grad_norm": 0.647142767906189, + "learning_rate": 4.7533022588004445e-05, + "loss": 0.8884, + "step": 3348 + }, + { + "epoch": 0.515979976896419, + "grad_norm": 0.8309057354927063, + "learning_rate": 4.748469093532936e-05, + "loss": 0.8062, + "step": 3350 + }, + { + "epoch": 0.5162880246438198, + "grad_norm": 0.6286987066268921, + "learning_rate": 4.743636163875601e-05, + "loss": 0.8235, + "step": 3352 + }, + { + "epoch": 0.5165960723912206, + "grad_norm": 0.6637413501739502, + "learning_rate": 4.738803474355466e-05, + "loss": 0.9696, + "step": 3354 + }, + { + "epoch": 0.5169041201386215, + "grad_norm": 0.8996312618255615, + "learning_rate": 4.733971029499333e-05, + "loss": 0.9661, + "step": 3356 + }, + { + "epoch": 0.5172121678860223, + "grad_norm": 0.6931231021881104, + "learning_rate": 4.729138833833774e-05, + "loss": 1.0826, + "step": 3358 + }, + { + "epoch": 0.5175202156334232, + "grad_norm": 0.8037751317024231, + "learning_rate": 4.724306891885134e-05, + "loss": 1.4519, + "step": 3360 + }, + { + "epoch": 0.517828263380824, + "grad_norm": 0.6060133576393127, + "learning_rate": 4.719475208179513e-05, + "loss": 0.9653, + "step": 3362 + }, + { + "epoch": 0.5181363111282249, + "grad_norm": 0.8463549017906189, + "learning_rate": 4.7146437872427694e-05, + "loss": 0.8843, + "step": 3364 + }, + { + "epoch": 0.5184443588756257, + "grad_norm": 0.6483150124549866, + "learning_rate": 4.7098126336005224e-05, + "loss": 0.7722, + "step": 3366 + }, + { + "epoch": 0.5187524066230266, + "grad_norm": 1.063178300857544, + "learning_rate": 4.7049817517781325e-05, + "loss": 0.9337, + "step": 3368 + }, + { + "epoch": 0.5190604543704275, + "grad_norm": 0.760132372379303, + "learning_rate": 4.700151146300711e-05, + "loss": 1.0166, + "step": 3370 + }, + { + "epoch": 0.5193685021178283, + "grad_norm": 0.6914016008377075, + "learning_rate": 4.6953208216931065e-05, + "loss": 1.8584, + "step": 3372 + }, + { + "epoch": 0.5196765498652292, + "grad_norm": 0.6557642817497253, + "learning_rate": 4.690490782479909e-05, + "loss": 0.8962, + "step": 3374 + }, + { + "epoch": 0.5199845976126299, + "grad_norm": 0.7154399156570435, + "learning_rate": 4.685661033185437e-05, + "loss": 1.5047, + "step": 3376 + }, + { + "epoch": 0.5202926453600308, + "grad_norm": 0.6547550559043884, + "learning_rate": 4.6808315783337396e-05, + "loss": 0.9173, + "step": 3378 + }, + { + "epoch": 0.5206006931074316, + "grad_norm": 1.0237712860107422, + "learning_rate": 4.6760024224485915e-05, + "loss": 0.9651, + "step": 3380 + }, + { + "epoch": 0.5209087408548325, + "grad_norm": 0.8532687425613403, + "learning_rate": 4.671173570053483e-05, + "loss": 0.7217, + "step": 3382 + }, + { + "epoch": 0.5212167886022333, + "grad_norm": 0.9705612063407898, + "learning_rate": 4.6663450256716226e-05, + "loss": 1.9736, + "step": 3384 + }, + { + "epoch": 0.5215248363496342, + "grad_norm": 0.9786109924316406, + "learning_rate": 4.661516793825932e-05, + "loss": 0.9056, + "step": 3386 + }, + { + "epoch": 0.521832884097035, + "grad_norm": 0.6195076704025269, + "learning_rate": 4.656688879039039e-05, + "loss": 0.8693, + "step": 3388 + }, + { + "epoch": 0.5221409318444359, + "grad_norm": 1.7733471393585205, + "learning_rate": 4.651861285833272e-05, + "loss": 1.181, + "step": 3390 + }, + { + "epoch": 0.5224489795918368, + "grad_norm": 0.632230818271637, + "learning_rate": 4.647034018730658e-05, + "loss": 0.8432, + "step": 3392 + }, + { + "epoch": 0.5227570273392376, + "grad_norm": 0.6315904855728149, + "learning_rate": 4.6422070822529255e-05, + "loss": 0.9303, + "step": 3394 + }, + { + "epoch": 0.5230650750866385, + "grad_norm": 0.7267144918441772, + "learning_rate": 4.6373804809214875e-05, + "loss": 0.9445, + "step": 3396 + }, + { + "epoch": 0.5233731228340393, + "grad_norm": 0.8266833424568176, + "learning_rate": 4.63255421925744e-05, + "loss": 1.0778, + "step": 3398 + }, + { + "epoch": 0.5236811705814401, + "grad_norm": 0.9495794773101807, + "learning_rate": 4.627728301781569e-05, + "loss": 1.3003, + "step": 3400 + }, + { + "epoch": 0.5239892183288409, + "grad_norm": 0.7062100768089294, + "learning_rate": 4.6229027330143324e-05, + "loss": 0.9354, + "step": 3402 + }, + { + "epoch": 0.5242972660762418, + "grad_norm": 0.838807225227356, + "learning_rate": 4.61807751747586e-05, + "loss": 0.8519, + "step": 3404 + }, + { + "epoch": 0.5246053138236426, + "grad_norm": 0.7427268028259277, + "learning_rate": 4.61325265968596e-05, + "loss": 0.8379, + "step": 3406 + }, + { + "epoch": 0.5249133615710435, + "grad_norm": 0.6703910231590271, + "learning_rate": 4.6084281641640946e-05, + "loss": 0.9903, + "step": 3408 + }, + { + "epoch": 0.5252214093184444, + "grad_norm": 0.7218496203422546, + "learning_rate": 4.603604035429393e-05, + "loss": 0.8798, + "step": 3410 + }, + { + "epoch": 0.5255294570658452, + "grad_norm": 0.9973131418228149, + "learning_rate": 4.598780278000637e-05, + "loss": 1.1471, + "step": 3412 + }, + { + "epoch": 0.5258375048132461, + "grad_norm": 1.0085009336471558, + "learning_rate": 4.593956896396264e-05, + "loss": 1.0237, + "step": 3414 + }, + { + "epoch": 0.5261455525606469, + "grad_norm": 0.7579819560050964, + "learning_rate": 4.589133895134359e-05, + "loss": 1.035, + "step": 3416 + }, + { + "epoch": 0.5264536003080478, + "grad_norm": 0.8549184203147888, + "learning_rate": 4.584311278732647e-05, + "loss": 0.9305, + "step": 3418 + }, + { + "epoch": 0.5267616480554486, + "grad_norm": 1.1725854873657227, + "learning_rate": 4.5794890517084995e-05, + "loss": 1.007, + "step": 3420 + }, + { + "epoch": 0.5270696958028495, + "grad_norm": 0.8996204733848572, + "learning_rate": 4.574667218578915e-05, + "loss": 0.9192, + "step": 3422 + }, + { + "epoch": 0.5273777435502502, + "grad_norm": 0.8384501338005066, + "learning_rate": 4.5698457838605287e-05, + "loss": 0.9124, + "step": 3424 + }, + { + "epoch": 0.5276857912976511, + "grad_norm": 0.702349066734314, + "learning_rate": 4.565024752069601e-05, + "loss": 0.9621, + "step": 3426 + }, + { + "epoch": 0.527993839045052, + "grad_norm": 0.8768123388290405, + "learning_rate": 4.560204127722016e-05, + "loss": 0.9297, + "step": 3428 + }, + { + "epoch": 0.5283018867924528, + "grad_norm": 0.8436578512191772, + "learning_rate": 4.555383915333273e-05, + "loss": 1.4854, + "step": 3430 + }, + { + "epoch": 0.5286099345398537, + "grad_norm": 0.7559348344802856, + "learning_rate": 4.550564119418487e-05, + "loss": 0.9969, + "step": 3432 + }, + { + "epoch": 0.5289179822872545, + "grad_norm": 0.6650515198707581, + "learning_rate": 4.5457447444923854e-05, + "loss": 0.9929, + "step": 3434 + }, + { + "epoch": 0.5292260300346554, + "grad_norm": 0.9616847038269043, + "learning_rate": 4.540925795069299e-05, + "loss": 1.2402, + "step": 3436 + }, + { + "epoch": 0.5295340777820562, + "grad_norm": 0.7548688054084778, + "learning_rate": 4.536107275663157e-05, + "loss": 0.9014, + "step": 3438 + }, + { + "epoch": 0.5298421255294571, + "grad_norm": 0.7742545008659363, + "learning_rate": 4.531289190787493e-05, + "loss": 0.8877, + "step": 3440 + }, + { + "epoch": 0.5301501732768579, + "grad_norm": 0.7939615845680237, + "learning_rate": 4.526471544955429e-05, + "loss": 1.3333, + "step": 3442 + }, + { + "epoch": 0.5304582210242588, + "grad_norm": 0.7492351531982422, + "learning_rate": 4.521654342679672e-05, + "loss": 1.0736, + "step": 3444 + }, + { + "epoch": 0.5307662687716596, + "grad_norm": 0.6150861382484436, + "learning_rate": 4.516837588472524e-05, + "loss": 0.8605, + "step": 3446 + }, + { + "epoch": 0.5310743165190605, + "grad_norm": 0.8641752004623413, + "learning_rate": 4.5120212868458566e-05, + "loss": 0.9416, + "step": 3448 + }, + { + "epoch": 0.5313823642664613, + "grad_norm": 0.9148188233375549, + "learning_rate": 4.507205442311125e-05, + "loss": 0.8779, + "step": 3450 + }, + { + "epoch": 0.5316904120138621, + "grad_norm": 0.8277890682220459, + "learning_rate": 4.502390059379349e-05, + "loss": 0.9237, + "step": 3452 + }, + { + "epoch": 0.531998459761263, + "grad_norm": 0.8538454174995422, + "learning_rate": 4.497575142561125e-05, + "loss": 1.0778, + "step": 3454 + }, + { + "epoch": 0.5323065075086638, + "grad_norm": 0.7834024429321289, + "learning_rate": 4.492760696366606e-05, + "loss": 1.4203, + "step": 3456 + }, + { + "epoch": 0.5326145552560647, + "grad_norm": 0.6071308851242065, + "learning_rate": 4.487946725305504e-05, + "loss": 1.1278, + "step": 3458 + }, + { + "epoch": 0.5329226030034655, + "grad_norm": 0.6906324028968811, + "learning_rate": 4.483133233887093e-05, + "loss": 1.7408, + "step": 3460 + }, + { + "epoch": 0.5332306507508664, + "grad_norm": 0.5732936263084412, + "learning_rate": 4.478320226620189e-05, + "loss": 1.0157, + "step": 3462 + }, + { + "epoch": 0.5335386984982672, + "grad_norm": 0.913801372051239, + "learning_rate": 4.473507708013158e-05, + "loss": 1.2833, + "step": 3464 + }, + { + "epoch": 0.5338467462456681, + "grad_norm": 0.8521076440811157, + "learning_rate": 4.4686956825739115e-05, + "loss": 1.0119, + "step": 3466 + }, + { + "epoch": 0.534154793993069, + "grad_norm": 0.8757444620132446, + "learning_rate": 4.4638841548098956e-05, + "loss": 0.8007, + "step": 3468 + }, + { + "epoch": 0.5344628417404698, + "grad_norm": 0.6936830282211304, + "learning_rate": 4.459073129228089e-05, + "loss": 0.8151, + "step": 3470 + }, + { + "epoch": 0.5347708894878707, + "grad_norm": 0.7347168922424316, + "learning_rate": 4.4542626103350014e-05, + "loss": 0.9071, + "step": 3472 + }, + { + "epoch": 0.5350789372352714, + "grad_norm": 0.87520432472229, + "learning_rate": 4.449452602636671e-05, + "loss": 1.0179, + "step": 3474 + }, + { + "epoch": 0.5353869849826723, + "grad_norm": 0.7437530159950256, + "learning_rate": 4.444643110638653e-05, + "loss": 1.0384, + "step": 3476 + }, + { + "epoch": 0.5356950327300731, + "grad_norm": 0.7023616433143616, + "learning_rate": 4.439834138846019e-05, + "loss": 0.9684, + "step": 3478 + }, + { + "epoch": 0.536003080477474, + "grad_norm": 0.8628860116004944, + "learning_rate": 4.4350256917633585e-05, + "loss": 1.6387, + "step": 3480 + }, + { + "epoch": 0.5363111282248748, + "grad_norm": 0.8111205697059631, + "learning_rate": 4.4302177738947655e-05, + "loss": 0.836, + "step": 3482 + }, + { + "epoch": 0.5366191759722757, + "grad_norm": 0.6044906377792358, + "learning_rate": 4.425410389743839e-05, + "loss": 1.9801, + "step": 3484 + }, + { + "epoch": 0.5369272237196766, + "grad_norm": 0.7423304915428162, + "learning_rate": 4.420603543813675e-05, + "loss": 0.8639, + "step": 3486 + }, + { + "epoch": 0.5372352714670774, + "grad_norm": 0.7321537733078003, + "learning_rate": 4.415797240606872e-05, + "loss": 0.86, + "step": 3488 + }, + { + "epoch": 0.5375433192144783, + "grad_norm": 0.7974177598953247, + "learning_rate": 4.410991484625518e-05, + "loss": 0.9389, + "step": 3490 + }, + { + "epoch": 0.5378513669618791, + "grad_norm": 0.9002341628074646, + "learning_rate": 4.4061862803711815e-05, + "loss": 1.0236, + "step": 3492 + }, + { + "epoch": 0.53815941470928, + "grad_norm": 1.1152660846710205, + "learning_rate": 4.401381632344926e-05, + "loss": 0.9229, + "step": 3494 + }, + { + "epoch": 0.5384674624566808, + "grad_norm": 0.7044309973716736, + "learning_rate": 4.3965775450472826e-05, + "loss": 0.9132, + "step": 3496 + }, + { + "epoch": 0.5387755102040817, + "grad_norm": 0.5965349674224854, + "learning_rate": 4.391774022978264e-05, + "loss": 0.9993, + "step": 3498 + }, + { + "epoch": 0.5390835579514824, + "grad_norm": 0.7283958196640015, + "learning_rate": 4.386971070637354e-05, + "loss": 1.0205, + "step": 3500 + }, + { + "epoch": 0.5393916056988833, + "grad_norm": 0.9296109676361084, + "learning_rate": 4.382168692523498e-05, + "loss": 1.0551, + "step": 3502 + }, + { + "epoch": 0.5396996534462841, + "grad_norm": 0.6661383509635925, + "learning_rate": 4.3773668931351055e-05, + "loss": 0.9869, + "step": 3504 + }, + { + "epoch": 0.540007701193685, + "grad_norm": 1.1538796424865723, + "learning_rate": 4.372565676970045e-05, + "loss": 0.9673, + "step": 3506 + }, + { + "epoch": 0.5403157489410859, + "grad_norm": 0.9146564602851868, + "learning_rate": 4.367765048525641e-05, + "loss": 1.1098, + "step": 3508 + }, + { + "epoch": 0.5406237966884867, + "grad_norm": 0.8921990394592285, + "learning_rate": 4.362965012298659e-05, + "loss": 0.9715, + "step": 3510 + }, + { + "epoch": 0.5409318444358876, + "grad_norm": 0.7014362812042236, + "learning_rate": 4.358165572785318e-05, + "loss": 0.841, + "step": 3512 + }, + { + "epoch": 0.5412398921832884, + "grad_norm": 0.8201600313186646, + "learning_rate": 4.353366734481277e-05, + "loss": 1.0627, + "step": 3514 + }, + { + "epoch": 0.5415479399306893, + "grad_norm": 0.8277712464332581, + "learning_rate": 4.348568501881629e-05, + "loss": 1.1129, + "step": 3516 + }, + { + "epoch": 0.5418559876780901, + "grad_norm": 0.787347674369812, + "learning_rate": 4.343770879480899e-05, + "loss": 1.1029, + "step": 3518 + }, + { + "epoch": 0.542164035425491, + "grad_norm": 0.9938316345214844, + "learning_rate": 4.338973871773045e-05, + "loss": 0.9947, + "step": 3520 + }, + { + "epoch": 0.5424720831728918, + "grad_norm": 0.9085752367973328, + "learning_rate": 4.3341774832514445e-05, + "loss": 0.9601, + "step": 3522 + }, + { + "epoch": 0.5427801309202926, + "grad_norm": 0.5847015976905823, + "learning_rate": 4.329381718408899e-05, + "loss": 0.7997, + "step": 3524 + }, + { + "epoch": 0.5430881786676935, + "grad_norm": 0.853076159954071, + "learning_rate": 4.3245865817376206e-05, + "loss": 0.9325, + "step": 3526 + }, + { + "epoch": 0.5433962264150943, + "grad_norm": 0.6862278580665588, + "learning_rate": 4.319792077729239e-05, + "loss": 0.9502, + "step": 3528 + }, + { + "epoch": 0.5437042741624952, + "grad_norm": 1.0851012468338013, + "learning_rate": 4.314998210874789e-05, + "loss": 0.8269, + "step": 3530 + }, + { + "epoch": 0.544012321909896, + "grad_norm": 0.8321728110313416, + "learning_rate": 4.310204985664703e-05, + "loss": 1.1325, + "step": 3532 + }, + { + "epoch": 0.5443203696572969, + "grad_norm": 0.6708217859268188, + "learning_rate": 4.3054124065888244e-05, + "loss": 1.1693, + "step": 3534 + }, + { + "epoch": 0.5446284174046977, + "grad_norm": 0.8871041536331177, + "learning_rate": 4.3006204781363803e-05, + "loss": 0.8531, + "step": 3536 + }, + { + "epoch": 0.5449364651520986, + "grad_norm": 0.9859650135040283, + "learning_rate": 4.295829204795991e-05, + "loss": 1.1135, + "step": 3538 + }, + { + "epoch": 0.5452445128994994, + "grad_norm": 0.8064723014831543, + "learning_rate": 4.291038591055668e-05, + "loss": 0.9088, + "step": 3540 + }, + { + "epoch": 0.5455525606469003, + "grad_norm": 1.2011666297912598, + "learning_rate": 4.286248641402801e-05, + "loss": 1.0337, + "step": 3542 + }, + { + "epoch": 0.5458606083943011, + "grad_norm": 0.8110551238059998, + "learning_rate": 4.281459360324156e-05, + "loss": 0.8991, + "step": 3544 + }, + { + "epoch": 0.546168656141702, + "grad_norm": 0.8566579818725586, + "learning_rate": 4.276670752305875e-05, + "loss": 0.9251, + "step": 3546 + }, + { + "epoch": 0.5464767038891029, + "grad_norm": 0.7969164848327637, + "learning_rate": 4.2718828218334734e-05, + "loss": 1.1559, + "step": 3548 + }, + { + "epoch": 0.5467847516365036, + "grad_norm": 0.6673331260681152, + "learning_rate": 4.267095573391824e-05, + "loss": 1.0338, + "step": 3550 + }, + { + "epoch": 0.5470927993839045, + "grad_norm": 0.9492660760879517, + "learning_rate": 4.262309011465164e-05, + "loss": 1.0716, + "step": 3552 + }, + { + "epoch": 0.5474008471313053, + "grad_norm": 0.845156729221344, + "learning_rate": 4.257523140537092e-05, + "loss": 1.0976, + "step": 3554 + }, + { + "epoch": 0.5477088948787062, + "grad_norm": 0.5479189157485962, + "learning_rate": 4.252737965090554e-05, + "loss": 0.9575, + "step": 3556 + }, + { + "epoch": 0.548016942626107, + "grad_norm": 0.703624427318573, + "learning_rate": 4.2479534896078444e-05, + "loss": 0.833, + "step": 3558 + }, + { + "epoch": 0.5483249903735079, + "grad_norm": 0.9007683992385864, + "learning_rate": 4.243169718570606e-05, + "loss": 0.8989, + "step": 3560 + }, + { + "epoch": 0.5486330381209087, + "grad_norm": 0.9709227681159973, + "learning_rate": 4.2383866564598186e-05, + "loss": 1.1332, + "step": 3562 + }, + { + "epoch": 0.5489410858683096, + "grad_norm": 0.6905830502510071, + "learning_rate": 4.2336043077557996e-05, + "loss": 1.0249, + "step": 3564 + }, + { + "epoch": 0.5492491336157105, + "grad_norm": 1.027398943901062, + "learning_rate": 4.2288226769381944e-05, + "loss": 0.9233, + "step": 3566 + }, + { + "epoch": 0.5495571813631113, + "grad_norm": 0.7679511904716492, + "learning_rate": 4.2240417684859826e-05, + "loss": 1.0616, + "step": 3568 + }, + { + "epoch": 0.5498652291105122, + "grad_norm": 0.7321463227272034, + "learning_rate": 4.2192615868774624e-05, + "loss": 0.9422, + "step": 3570 + }, + { + "epoch": 0.550173276857913, + "grad_norm": 0.6400331258773804, + "learning_rate": 4.214482136590248e-05, + "loss": 0.8484, + "step": 3572 + }, + { + "epoch": 0.5504813246053138, + "grad_norm": 0.9933096766471863, + "learning_rate": 4.20970342210128e-05, + "loss": 1.9918, + "step": 3574 + }, + { + "epoch": 0.5507893723527146, + "grad_norm": 0.9462111592292786, + "learning_rate": 4.2049254478867974e-05, + "loss": 1.0123, + "step": 3576 + }, + { + "epoch": 0.5510974201001155, + "grad_norm": 0.804648756980896, + "learning_rate": 4.2001482184223505e-05, + "loss": 1.8671, + "step": 3578 + }, + { + "epoch": 0.5514054678475163, + "grad_norm": 0.7638084292411804, + "learning_rate": 4.195371738182796e-05, + "loss": 0.9511, + "step": 3580 + }, + { + "epoch": 0.5517135155949172, + "grad_norm": 0.6705355048179626, + "learning_rate": 4.190596011642285e-05, + "loss": 1.1035, + "step": 3582 + }, + { + "epoch": 0.5520215633423181, + "grad_norm": 0.732434093952179, + "learning_rate": 4.185821043274259e-05, + "loss": 1.0185, + "step": 3584 + }, + { + "epoch": 0.5523296110897189, + "grad_norm": 0.712803304195404, + "learning_rate": 4.181046837551455e-05, + "loss": 0.926, + "step": 3586 + }, + { + "epoch": 0.5526376588371198, + "grad_norm": 0.7874958515167236, + "learning_rate": 4.1762733989458965e-05, + "loss": 1.0391, + "step": 3588 + }, + { + "epoch": 0.5529457065845206, + "grad_norm": 0.5894497632980347, + "learning_rate": 4.1715007319288814e-05, + "loss": 0.8582, + "step": 3590 + }, + { + "epoch": 0.5532537543319215, + "grad_norm": 0.6469842195510864, + "learning_rate": 4.1667288409709905e-05, + "loss": 1.0132, + "step": 3592 + }, + { + "epoch": 0.5535618020793223, + "grad_norm": 0.8849762082099915, + "learning_rate": 4.1619577305420776e-05, + "loss": 1.0479, + "step": 3594 + }, + { + "epoch": 0.5538698498267232, + "grad_norm": 0.704217255115509, + "learning_rate": 4.157187405111264e-05, + "loss": 1.0849, + "step": 3596 + }, + { + "epoch": 0.554177897574124, + "grad_norm": 0.8714843392372131, + "learning_rate": 4.152417869146935e-05, + "loss": 1.185, + "step": 3598 + }, + { + "epoch": 0.5544859453215248, + "grad_norm": 0.7708948254585266, + "learning_rate": 4.147649127116735e-05, + "loss": 0.8242, + "step": 3600 + }, + { + "epoch": 0.5544859453215248, + "eval_loss": 2.3939669132232666, + "eval_runtime": 737.0426, + "eval_samples_per_second": 2.714, + "eval_steps_per_second": 0.678, + "step": 3600 + }, + { + "epoch": 0.5547939930689256, + "grad_norm": 0.7352039217948914, + "learning_rate": 4.14288118348757e-05, + "loss": 1.7142, + "step": 3602 + }, + { + "epoch": 0.5551020408163265, + "grad_norm": 0.5873389840126038, + "learning_rate": 4.138114042725596e-05, + "loss": 1.1663, + "step": 3604 + }, + { + "epoch": 0.5554100885637274, + "grad_norm": 0.686549961566925, + "learning_rate": 4.1333477092962114e-05, + "loss": 0.8772, + "step": 3606 + }, + { + "epoch": 0.5557181363111282, + "grad_norm": 0.8575116395950317, + "learning_rate": 4.128582187664066e-05, + "loss": 0.9958, + "step": 3608 + }, + { + "epoch": 0.5560261840585291, + "grad_norm": 0.658591628074646, + "learning_rate": 4.123817482293047e-05, + "loss": 0.8492, + "step": 3610 + }, + { + "epoch": 0.5563342318059299, + "grad_norm": 0.7009797096252441, + "learning_rate": 4.1190535976462726e-05, + "loss": 0.9576, + "step": 3612 + }, + { + "epoch": 0.5566422795533308, + "grad_norm": 1.0068048238754272, + "learning_rate": 4.114290538186101e-05, + "loss": 1.1333, + "step": 3614 + }, + { + "epoch": 0.5569503273007316, + "grad_norm": 0.6233420968055725, + "learning_rate": 4.109528308374108e-05, + "loss": 1.3391, + "step": 3616 + }, + { + "epoch": 0.5572583750481325, + "grad_norm": 0.5594993233680725, + "learning_rate": 4.104766912671098e-05, + "loss": 0.8397, + "step": 3618 + }, + { + "epoch": 0.5575664227955333, + "grad_norm": 0.9816888570785522, + "learning_rate": 4.1000063555370894e-05, + "loss": 1.0325, + "step": 3620 + }, + { + "epoch": 0.5578744705429342, + "grad_norm": 0.9001654386520386, + "learning_rate": 4.0952466414313235e-05, + "loss": 0.9348, + "step": 3622 + }, + { + "epoch": 0.558182518290335, + "grad_norm": 0.9082402586936951, + "learning_rate": 4.0904877748122436e-05, + "loss": 1.1301, + "step": 3624 + }, + { + "epoch": 0.5584905660377358, + "grad_norm": 0.8761422038078308, + "learning_rate": 4.085729760137501e-05, + "loss": 0.8416, + "step": 3626 + }, + { + "epoch": 0.5587986137851367, + "grad_norm": 1.0857791900634766, + "learning_rate": 4.080972601863956e-05, + "loss": 1.8213, + "step": 3628 + }, + { + "epoch": 0.5591066615325375, + "grad_norm": 0.7211164236068726, + "learning_rate": 4.076216304447654e-05, + "loss": 1.7474, + "step": 3630 + }, + { + "epoch": 0.5594147092799384, + "grad_norm": 0.7563309073448181, + "learning_rate": 4.071460872343843e-05, + "loss": 1.8916, + "step": 3632 + }, + { + "epoch": 0.5597227570273392, + "grad_norm": 0.7382598519325256, + "learning_rate": 4.066706310006961e-05, + "loss": 0.8198, + "step": 3634 + }, + { + "epoch": 0.5600308047747401, + "grad_norm": 0.9442710280418396, + "learning_rate": 4.061952621890628e-05, + "loss": 1.0136, + "step": 3636 + }, + { + "epoch": 0.5603388525221409, + "grad_norm": 0.7352667450904846, + "learning_rate": 4.0571998124476437e-05, + "loss": 0.754, + "step": 3638 + }, + { + "epoch": 0.5606469002695418, + "grad_norm": 0.768801212310791, + "learning_rate": 4.052447886129986e-05, + "loss": 0.9408, + "step": 3640 + }, + { + "epoch": 0.5609549480169427, + "grad_norm": 0.8420480489730835, + "learning_rate": 4.047696847388811e-05, + "loss": 1.056, + "step": 3642 + }, + { + "epoch": 0.5612629957643435, + "grad_norm": 0.7617586851119995, + "learning_rate": 4.042946700674436e-05, + "loss": 0.975, + "step": 3644 + }, + { + "epoch": 0.5615710435117444, + "grad_norm": 0.7460455298423767, + "learning_rate": 4.038197450436344e-05, + "loss": 1.0729, + "step": 3646 + }, + { + "epoch": 0.5618790912591451, + "grad_norm": 0.8208043575286865, + "learning_rate": 4.0334491011231826e-05, + "loss": 0.8507, + "step": 3648 + }, + { + "epoch": 0.562187139006546, + "grad_norm": 0.7699398994445801, + "learning_rate": 4.028701657182752e-05, + "loss": 1.9133, + "step": 3650 + }, + { + "epoch": 0.5624951867539468, + "grad_norm": 0.7481504082679749, + "learning_rate": 4.0239551230620034e-05, + "loss": 1.0713, + "step": 3652 + }, + { + "epoch": 0.5628032345013477, + "grad_norm": 0.7889412641525269, + "learning_rate": 4.0192095032070406e-05, + "loss": 0.9672, + "step": 3654 + }, + { + "epoch": 0.5631112822487485, + "grad_norm": 1.0069886445999146, + "learning_rate": 4.014464802063105e-05, + "loss": 1.0111, + "step": 3656 + }, + { + "epoch": 0.5634193299961494, + "grad_norm": 0.6418771147727966, + "learning_rate": 4.009721024074583e-05, + "loss": 0.8536, + "step": 3658 + }, + { + "epoch": 0.5637273777435502, + "grad_norm": 0.5600576400756836, + "learning_rate": 4.004978173684988e-05, + "loss": 0.8057, + "step": 3660 + }, + { + "epoch": 0.5640354254909511, + "grad_norm": 0.8424544930458069, + "learning_rate": 4.000236255336978e-05, + "loss": 0.8777, + "step": 3662 + }, + { + "epoch": 0.564343473238352, + "grad_norm": 1.293593406677246, + "learning_rate": 3.995495273472323e-05, + "loss": 1.2139, + "step": 3664 + }, + { + "epoch": 0.5646515209857528, + "grad_norm": 0.7682777643203735, + "learning_rate": 3.9907552325319266e-05, + "loss": 1.5731, + "step": 3666 + }, + { + "epoch": 0.5649595687331537, + "grad_norm": 0.7142342925071716, + "learning_rate": 3.986016136955806e-05, + "loss": 0.9631, + "step": 3668 + }, + { + "epoch": 0.5652676164805545, + "grad_norm": 0.7129397988319397, + "learning_rate": 3.981277991183096e-05, + "loss": 0.7958, + "step": 3670 + }, + { + "epoch": 0.5655756642279554, + "grad_norm": 0.7072484493255615, + "learning_rate": 3.976540799652037e-05, + "loss": 0.9842, + "step": 3672 + }, + { + "epoch": 0.5658837119753561, + "grad_norm": 0.8849371075630188, + "learning_rate": 3.971804566799979e-05, + "loss": 0.8443, + "step": 3674 + }, + { + "epoch": 0.566191759722757, + "grad_norm": 0.6128670573234558, + "learning_rate": 3.967069297063376e-05, + "loss": 0.9314, + "step": 3676 + }, + { + "epoch": 0.5664998074701578, + "grad_norm": 1.0203518867492676, + "learning_rate": 3.962334994877774e-05, + "loss": 0.8948, + "step": 3678 + }, + { + "epoch": 0.5668078552175587, + "grad_norm": 1.0024428367614746, + "learning_rate": 3.957601664677816e-05, + "loss": 0.9889, + "step": 3680 + }, + { + "epoch": 0.5671159029649596, + "grad_norm": 0.6684615612030029, + "learning_rate": 3.952869310897237e-05, + "loss": 0.9205, + "step": 3682 + }, + { + "epoch": 0.5674239507123604, + "grad_norm": 0.7785738110542297, + "learning_rate": 3.948137937968854e-05, + "loss": 0.9355, + "step": 3684 + }, + { + "epoch": 0.5677319984597613, + "grad_norm": 0.8673728108406067, + "learning_rate": 3.9434075503245646e-05, + "loss": 0.8856, + "step": 3686 + }, + { + "epoch": 0.5680400462071621, + "grad_norm": 0.8597670197486877, + "learning_rate": 3.938678152395346e-05, + "loss": 0.9154, + "step": 3688 + }, + { + "epoch": 0.568348093954563, + "grad_norm": 0.7761251330375671, + "learning_rate": 3.933949748611247e-05, + "loss": 0.8788, + "step": 3690 + }, + { + "epoch": 0.5686561417019638, + "grad_norm": 0.6056908965110779, + "learning_rate": 3.929222343401385e-05, + "loss": 0.7823, + "step": 3692 + }, + { + "epoch": 0.5689641894493647, + "grad_norm": 0.6534133553504944, + "learning_rate": 3.9244959411939447e-05, + "loss": 1.9013, + "step": 3694 + }, + { + "epoch": 0.5692722371967655, + "grad_norm": 0.9937305450439453, + "learning_rate": 3.9197705464161674e-05, + "loss": 1.0152, + "step": 3696 + }, + { + "epoch": 0.5695802849441663, + "grad_norm": 0.6183207035064697, + "learning_rate": 3.915046163494351e-05, + "loss": 1.1256, + "step": 3698 + }, + { + "epoch": 0.5698883326915672, + "grad_norm": 0.8396857380867004, + "learning_rate": 3.910322796853848e-05, + "loss": 1.1151, + "step": 3700 + }, + { + "epoch": 0.570196380438968, + "grad_norm": 0.7862452268600464, + "learning_rate": 3.905600450919061e-05, + "loss": 1.0242, + "step": 3702 + }, + { + "epoch": 0.5705044281863689, + "grad_norm": 0.9082566499710083, + "learning_rate": 3.9008791301134294e-05, + "loss": 0.7974, + "step": 3704 + }, + { + "epoch": 0.5708124759337697, + "grad_norm": 0.7656323313713074, + "learning_rate": 3.8961588388594366e-05, + "loss": 0.8431, + "step": 3706 + }, + { + "epoch": 0.5711205236811706, + "grad_norm": 0.901706337928772, + "learning_rate": 3.8914395815786045e-05, + "loss": 1.4954, + "step": 3708 + }, + { + "epoch": 0.5714285714285714, + "grad_norm": 1.125427007675171, + "learning_rate": 3.886721362691481e-05, + "loss": 0.7766, + "step": 3710 + }, + { + "epoch": 0.5717366191759723, + "grad_norm": 1.0342464447021484, + "learning_rate": 3.8820041866176444e-05, + "loss": 0.9791, + "step": 3712 + }, + { + "epoch": 0.5720446669233731, + "grad_norm": 0.843343198299408, + "learning_rate": 3.877288057775694e-05, + "loss": 0.9627, + "step": 3714 + }, + { + "epoch": 0.572352714670774, + "grad_norm": 0.6604567766189575, + "learning_rate": 3.872572980583253e-05, + "loss": 0.9817, + "step": 3716 + }, + { + "epoch": 0.5726607624181748, + "grad_norm": 0.8827517032623291, + "learning_rate": 3.8678589594569535e-05, + "loss": 1.3719, + "step": 3718 + }, + { + "epoch": 0.5729688101655757, + "grad_norm": 0.7129951119422913, + "learning_rate": 3.86314599881244e-05, + "loss": 0.9466, + "step": 3720 + }, + { + "epoch": 0.5732768579129766, + "grad_norm": 0.8309202790260315, + "learning_rate": 3.858434103064368e-05, + "loss": 0.9355, + "step": 3722 + }, + { + "epoch": 0.5735849056603773, + "grad_norm": 0.8105360865592957, + "learning_rate": 3.853723276626392e-05, + "loss": 0.9133, + "step": 3724 + }, + { + "epoch": 0.5738929534077782, + "grad_norm": 0.7791650295257568, + "learning_rate": 3.84901352391116e-05, + "loss": 0.7613, + "step": 3726 + }, + { + "epoch": 0.574201001155179, + "grad_norm": 0.9629634618759155, + "learning_rate": 3.844304849330326e-05, + "loss": 1.0993, + "step": 3728 + }, + { + "epoch": 0.5745090489025799, + "grad_norm": 0.7873706817626953, + "learning_rate": 3.839597257294524e-05, + "loss": 0.8449, + "step": 3730 + }, + { + "epoch": 0.5748170966499807, + "grad_norm": 0.6659789085388184, + "learning_rate": 3.834890752213379e-05, + "loss": 0.8397, + "step": 3732 + }, + { + "epoch": 0.5751251443973816, + "grad_norm": 0.7648429870605469, + "learning_rate": 3.8301853384954924e-05, + "loss": 0.8762, + "step": 3734 + }, + { + "epoch": 0.5754331921447824, + "grad_norm": 0.9787917137145996, + "learning_rate": 3.825481020548451e-05, + "loss": 1.344, + "step": 3736 + }, + { + "epoch": 0.5757412398921833, + "grad_norm": 1.028001308441162, + "learning_rate": 3.82077780277881e-05, + "loss": 1.0085, + "step": 3738 + }, + { + "epoch": 0.5760492876395842, + "grad_norm": 0.9443414211273193, + "learning_rate": 3.816075689592095e-05, + "loss": 1.051, + "step": 3740 + }, + { + "epoch": 0.576357335386985, + "grad_norm": 0.6627349257469177, + "learning_rate": 3.811374685392799e-05, + "loss": 1.0108, + "step": 3742 + }, + { + "epoch": 0.5766653831343859, + "grad_norm": 0.7638726234436035, + "learning_rate": 3.806674794584374e-05, + "loss": 2.2769, + "step": 3744 + }, + { + "epoch": 0.5769734308817867, + "grad_norm": 0.7516840100288391, + "learning_rate": 3.8019760215692266e-05, + "loss": 0.8065, + "step": 3746 + }, + { + "epoch": 0.5772814786291875, + "grad_norm": 1.1510311365127563, + "learning_rate": 3.7972783707487234e-05, + "loss": 1.1615, + "step": 3748 + }, + { + "epoch": 0.5775895263765883, + "grad_norm": 0.8986920714378357, + "learning_rate": 3.792581846523175e-05, + "loss": 0.9316, + "step": 3750 + }, + { + "epoch": 0.5778975741239892, + "grad_norm": 0.6290449500083923, + "learning_rate": 3.787886453291837e-05, + "loss": 1.03, + "step": 3752 + }, + { + "epoch": 0.57820562187139, + "grad_norm": 0.9564099311828613, + "learning_rate": 3.7831921954529035e-05, + "loss": 1.0917, + "step": 3754 + }, + { + "epoch": 0.5785136696187909, + "grad_norm": 0.7102187275886536, + "learning_rate": 3.7784990774035124e-05, + "loss": 1.1032, + "step": 3756 + }, + { + "epoch": 0.5788217173661918, + "grad_norm": 0.6460309028625488, + "learning_rate": 3.773807103539726e-05, + "loss": 1.249, + "step": 3758 + }, + { + "epoch": 0.5791297651135926, + "grad_norm": 0.8030285239219666, + "learning_rate": 3.7691162782565383e-05, + "loss": 0.9557, + "step": 3760 + }, + { + "epoch": 0.5794378128609935, + "grad_norm": 0.9096877574920654, + "learning_rate": 3.764426605947868e-05, + "loss": 0.9874, + "step": 3762 + }, + { + "epoch": 0.5797458606083943, + "grad_norm": 0.4688047468662262, + "learning_rate": 3.7597380910065547e-05, + "loss": 0.909, + "step": 3764 + }, + { + "epoch": 0.5800539083557952, + "grad_norm": 0.9313383102416992, + "learning_rate": 3.755050737824347e-05, + "loss": 0.8925, + "step": 3766 + }, + { + "epoch": 0.580361956103196, + "grad_norm": 0.7512313723564148, + "learning_rate": 3.7503645507919174e-05, + "loss": 0.9631, + "step": 3768 + }, + { + "epoch": 0.5806700038505969, + "grad_norm": 0.8948346376419067, + "learning_rate": 3.7456795342988336e-05, + "loss": 1.3042, + "step": 3770 + }, + { + "epoch": 0.5809780515979976, + "grad_norm": 0.8193276524543762, + "learning_rate": 3.7409956927335766e-05, + "loss": 0.8325, + "step": 3772 + }, + { + "epoch": 0.5812860993453985, + "grad_norm": 0.8436985611915588, + "learning_rate": 3.736313030483517e-05, + "loss": 1.0058, + "step": 3774 + }, + { + "epoch": 0.5815941470927993, + "grad_norm": 0.9698597192764282, + "learning_rate": 3.731631551934932e-05, + "loss": 0.931, + "step": 3776 + }, + { + "epoch": 0.5819021948402002, + "grad_norm": 0.5956494808197021, + "learning_rate": 3.726951261472981e-05, + "loss": 0.6849, + "step": 3778 + }, + { + "epoch": 0.5822102425876011, + "grad_norm": 0.5870460867881775, + "learning_rate": 3.7222721634817146e-05, + "loss": 0.9, + "step": 3780 + }, + { + "epoch": 0.5825182903350019, + "grad_norm": 0.7760697603225708, + "learning_rate": 3.7175942623440684e-05, + "loss": 1.0826, + "step": 3782 + }, + { + "epoch": 0.5828263380824028, + "grad_norm": 1.0230954885482788, + "learning_rate": 3.71291756244185e-05, + "loss": 1.0201, + "step": 3784 + }, + { + "epoch": 0.5831343858298036, + "grad_norm": 0.9187666177749634, + "learning_rate": 3.7082420681557476e-05, + "loss": 1.0352, + "step": 3786 + }, + { + "epoch": 0.5834424335772045, + "grad_norm": 0.7570469975471497, + "learning_rate": 3.7035677838653195e-05, + "loss": 1.8588, + "step": 3788 + }, + { + "epoch": 0.5837504813246053, + "grad_norm": 0.7575215697288513, + "learning_rate": 3.69889471394899e-05, + "loss": 1.0532, + "step": 3790 + }, + { + "epoch": 0.5840585290720062, + "grad_norm": 0.8707351088523865, + "learning_rate": 3.694222862784043e-05, + "loss": 2.6658, + "step": 3792 + }, + { + "epoch": 0.584366576819407, + "grad_norm": 0.9095748662948608, + "learning_rate": 3.689552234746623e-05, + "loss": 1.0685, + "step": 3794 + }, + { + "epoch": 0.5846746245668079, + "grad_norm": 0.806652843952179, + "learning_rate": 3.684882834211732e-05, + "loss": 1.082, + "step": 3796 + }, + { + "epoch": 0.5849826723142088, + "grad_norm": 0.6337845921516418, + "learning_rate": 3.6802146655532185e-05, + "loss": 1.365, + "step": 3798 + }, + { + "epoch": 0.5852907200616095, + "grad_norm": 0.5820702910423279, + "learning_rate": 3.675547733143776e-05, + "loss": 1.9037, + "step": 3800 + }, + { + "epoch": 0.5855987678090104, + "grad_norm": 0.72236168384552, + "learning_rate": 3.670882041354944e-05, + "loss": 1.5893, + "step": 3802 + }, + { + "epoch": 0.5859068155564112, + "grad_norm": 0.8915034532546997, + "learning_rate": 3.666217594557097e-05, + "loss": 1.1491, + "step": 3804 + }, + { + "epoch": 0.5862148633038121, + "grad_norm": 0.972583532333374, + "learning_rate": 3.6615543971194424e-05, + "loss": 0.865, + "step": 3806 + }, + { + "epoch": 0.5865229110512129, + "grad_norm": 0.6603726744651794, + "learning_rate": 3.65689245341002e-05, + "loss": 0.8353, + "step": 3808 + }, + { + "epoch": 0.5868309587986138, + "grad_norm": 0.525361955165863, + "learning_rate": 3.652231767795695e-05, + "loss": 0.7387, + "step": 3810 + }, + { + "epoch": 0.5871390065460146, + "grad_norm": 0.8613260388374329, + "learning_rate": 3.647572344642155e-05, + "loss": 0.8483, + "step": 3812 + }, + { + "epoch": 0.5874470542934155, + "grad_norm": 0.8724876642227173, + "learning_rate": 3.6429141883138986e-05, + "loss": 1.1493, + "step": 3814 + }, + { + "epoch": 0.5877551020408164, + "grad_norm": 0.7819899916648865, + "learning_rate": 3.638257303174246e-05, + "loss": 1.0313, + "step": 3816 + }, + { + "epoch": 0.5880631497882172, + "grad_norm": 0.8212162852287292, + "learning_rate": 3.6336016935853225e-05, + "loss": 1.0326, + "step": 3818 + }, + { + "epoch": 0.5883711975356181, + "grad_norm": 0.7977713346481323, + "learning_rate": 3.628947363908058e-05, + "loss": 0.8044, + "step": 3820 + }, + { + "epoch": 0.5886792452830188, + "grad_norm": 0.7846778631210327, + "learning_rate": 3.6242943185021875e-05, + "loss": 0.8219, + "step": 3822 + }, + { + "epoch": 0.5889872930304197, + "grad_norm": 0.7150901556015015, + "learning_rate": 3.6196425617262385e-05, + "loss": 1.035, + "step": 3824 + }, + { + "epoch": 0.5892953407778205, + "grad_norm": 0.68860924243927, + "learning_rate": 3.614992097937533e-05, + "loss": 0.8661, + "step": 3826 + }, + { + "epoch": 0.5896033885252214, + "grad_norm": 0.6898394823074341, + "learning_rate": 3.610342931492182e-05, + "loss": 1.3513, + "step": 3828 + }, + { + "epoch": 0.5899114362726222, + "grad_norm": 0.6659755110740662, + "learning_rate": 3.605695066745084e-05, + "loss": 1.4634, + "step": 3830 + }, + { + "epoch": 0.5902194840200231, + "grad_norm": 0.7592305541038513, + "learning_rate": 3.601048508049913e-05, + "loss": 0.8823, + "step": 3832 + }, + { + "epoch": 0.5905275317674239, + "grad_norm": 0.761418342590332, + "learning_rate": 3.5964032597591215e-05, + "loss": 1.001, + "step": 3834 + }, + { + "epoch": 0.5908355795148248, + "grad_norm": 0.8135924935340881, + "learning_rate": 3.591759326223937e-05, + "loss": 1.2993, + "step": 3836 + }, + { + "epoch": 0.5911436272622257, + "grad_norm": 0.7134994268417358, + "learning_rate": 3.5871167117943544e-05, + "loss": 0.9866, + "step": 3838 + }, + { + "epoch": 0.5914516750096265, + "grad_norm": 0.6832489967346191, + "learning_rate": 3.582475420819129e-05, + "loss": 1.9355, + "step": 3840 + }, + { + "epoch": 0.5917597227570274, + "grad_norm": 0.6804940700531006, + "learning_rate": 3.577835457645783e-05, + "loss": 0.917, + "step": 3842 + }, + { + "epoch": 0.5920677705044282, + "grad_norm": 0.92501300573349, + "learning_rate": 3.573196826620591e-05, + "loss": 1.0095, + "step": 3844 + }, + { + "epoch": 0.5923758182518291, + "grad_norm": 0.6867468953132629, + "learning_rate": 3.5685595320885776e-05, + "loss": 0.8609, + "step": 3846 + }, + { + "epoch": 0.5926838659992298, + "grad_norm": 0.5593806505203247, + "learning_rate": 3.56392357839352e-05, + "loss": 0.7962, + "step": 3848 + }, + { + "epoch": 0.5929919137466307, + "grad_norm": 0.693376362323761, + "learning_rate": 3.5592889698779385e-05, + "loss": 0.9079, + "step": 3850 + }, + { + "epoch": 0.5932999614940315, + "grad_norm": 0.7576355338096619, + "learning_rate": 3.5546557108830925e-05, + "loss": 0.8744, + "step": 3852 + }, + { + "epoch": 0.5936080092414324, + "grad_norm": 0.7912231087684631, + "learning_rate": 3.5500238057489746e-05, + "loss": 0.9186, + "step": 3854 + }, + { + "epoch": 0.5939160569888333, + "grad_norm": 0.5722372531890869, + "learning_rate": 3.545393258814316e-05, + "loss": 0.909, + "step": 3856 + }, + { + "epoch": 0.5942241047362341, + "grad_norm": 0.537816047668457, + "learning_rate": 3.540764074416568e-05, + "loss": 0.867, + "step": 3858 + }, + { + "epoch": 0.594532152483635, + "grad_norm": 0.7094846367835999, + "learning_rate": 3.53613625689191e-05, + "loss": 0.7947, + "step": 3860 + }, + { + "epoch": 0.5948402002310358, + "grad_norm": 0.7573848366737366, + "learning_rate": 3.5315098105752434e-05, + "loss": 1.117, + "step": 3862 + }, + { + "epoch": 0.5951482479784367, + "grad_norm": 0.8237606883049011, + "learning_rate": 3.5268847398001766e-05, + "loss": 1.0502, + "step": 3864 + }, + { + "epoch": 0.5954562957258375, + "grad_norm": 0.7843457460403442, + "learning_rate": 3.52226104889904e-05, + "loss": 1.145, + "step": 3866 + }, + { + "epoch": 0.5957643434732384, + "grad_norm": 0.932285487651825, + "learning_rate": 3.5176387422028625e-05, + "loss": 1.0675, + "step": 3868 + }, + { + "epoch": 0.5960723912206392, + "grad_norm": 0.6036128997802734, + "learning_rate": 3.5130178240413833e-05, + "loss": 0.8849, + "step": 3870 + }, + { + "epoch": 0.59638043896804, + "grad_norm": 0.9221835136413574, + "learning_rate": 3.508398298743036e-05, + "loss": 0.8411, + "step": 3872 + }, + { + "epoch": 0.596688486715441, + "grad_norm": 0.7087579369544983, + "learning_rate": 3.5037801706349524e-05, + "loss": 0.9154, + "step": 3874 + }, + { + "epoch": 0.5969965344628417, + "grad_norm": 0.9598959684371948, + "learning_rate": 3.4991634440429545e-05, + "loss": 1.1822, + "step": 3876 + }, + { + "epoch": 0.5973045822102426, + "grad_norm": 0.848368763923645, + "learning_rate": 3.494548123291552e-05, + "loss": 1.7755, + "step": 3878 + }, + { + "epoch": 0.5976126299576434, + "grad_norm": 0.9857352375984192, + "learning_rate": 3.489934212703936e-05, + "loss": 1.0044, + "step": 3880 + }, + { + "epoch": 0.5979206777050443, + "grad_norm": 0.9576002359390259, + "learning_rate": 3.485321716601979e-05, + "loss": 0.8722, + "step": 3882 + }, + { + "epoch": 0.5982287254524451, + "grad_norm": 0.7168333530426025, + "learning_rate": 3.4807106393062275e-05, + "loss": 0.9099, + "step": 3884 + }, + { + "epoch": 0.598536773199846, + "grad_norm": 0.8737758994102478, + "learning_rate": 3.476100985135901e-05, + "loss": 1.0395, + "step": 3886 + }, + { + "epoch": 0.5988448209472468, + "grad_norm": 0.8918794393539429, + "learning_rate": 3.471492758408879e-05, + "loss": 1.055, + "step": 3888 + }, + { + "epoch": 0.5991528686946477, + "grad_norm": 0.8955882787704468, + "learning_rate": 3.466885963441714e-05, + "loss": 0.8754, + "step": 3890 + }, + { + "epoch": 0.5994609164420485, + "grad_norm": 0.7722766399383545, + "learning_rate": 3.462280604549611e-05, + "loss": 0.763, + "step": 3892 + }, + { + "epoch": 0.5997689641894494, + "grad_norm": 0.7882518172264099, + "learning_rate": 3.457676686046427e-05, + "loss": 0.9779, + "step": 3894 + }, + { + "epoch": 0.6000770119368503, + "grad_norm": 0.8698257803916931, + "learning_rate": 3.453074212244681e-05, + "loss": 0.8066, + "step": 3896 + }, + { + "epoch": 0.600385059684251, + "grad_norm": 0.6644619107246399, + "learning_rate": 3.4484731874555274e-05, + "loss": 0.9186, + "step": 3898 + }, + { + "epoch": 0.6006931074316519, + "grad_norm": 0.5977016091346741, + "learning_rate": 3.4438736159887665e-05, + "loss": 0.911, + "step": 3900 + }, + { + "epoch": 0.6006931074316519, + "eval_loss": 2.3850553035736084, + "eval_runtime": 736.9149, + "eval_samples_per_second": 2.714, + "eval_steps_per_second": 0.679, + "step": 3900 + }, + { + "epoch": 0.6010011551790527, + "grad_norm": 0.8254393339157104, + "learning_rate": 3.4392755021528424e-05, + "loss": 1.0326, + "step": 3902 + }, + { + "epoch": 0.6013092029264536, + "grad_norm": 0.6651079654693604, + "learning_rate": 3.434678850254827e-05, + "loss": 0.9066, + "step": 3904 + }, + { + "epoch": 0.6016172506738544, + "grad_norm": 0.8798814415931702, + "learning_rate": 3.4300836646004253e-05, + "loss": 1.0857, + "step": 3906 + }, + { + "epoch": 0.6019252984212553, + "grad_norm": 0.8257869482040405, + "learning_rate": 3.425489949493969e-05, + "loss": 0.8308, + "step": 3908 + }, + { + "epoch": 0.6022333461686561, + "grad_norm": 0.7323641180992126, + "learning_rate": 3.420897709238414e-05, + "loss": 0.91, + "step": 3910 + }, + { + "epoch": 0.602541393916057, + "grad_norm": 0.9562618732452393, + "learning_rate": 3.4163069481353334e-05, + "loss": 1.0447, + "step": 3912 + }, + { + "epoch": 0.6028494416634579, + "grad_norm": 0.6833957433700562, + "learning_rate": 3.4117176704849116e-05, + "loss": 0.9323, + "step": 3914 + }, + { + "epoch": 0.6031574894108587, + "grad_norm": 0.9771287441253662, + "learning_rate": 3.4071298805859483e-05, + "loss": 0.9317, + "step": 3916 + }, + { + "epoch": 0.6034655371582596, + "grad_norm": 1.0490721464157104, + "learning_rate": 3.4025435827358497e-05, + "loss": 0.8953, + "step": 3918 + }, + { + "epoch": 0.6037735849056604, + "grad_norm": 0.9347744584083557, + "learning_rate": 3.3979587812306196e-05, + "loss": 1.1698, + "step": 3920 + }, + { + "epoch": 0.6040816326530613, + "grad_norm": 0.8014428615570068, + "learning_rate": 3.393375480364862e-05, + "loss": 0.8904, + "step": 3922 + }, + { + "epoch": 0.604389680400462, + "grad_norm": 0.5310930609703064, + "learning_rate": 3.388793684431779e-05, + "loss": 1.7, + "step": 3924 + }, + { + "epoch": 0.6046977281478629, + "grad_norm": 0.8464880585670471, + "learning_rate": 3.38421339772316e-05, + "loss": 1.2822, + "step": 3926 + }, + { + "epoch": 0.6050057758952637, + "grad_norm": 0.6187727451324463, + "learning_rate": 3.3796346245293775e-05, + "loss": 2.4976, + "step": 3928 + }, + { + "epoch": 0.6053138236426646, + "grad_norm": 0.6648653745651245, + "learning_rate": 3.375057369139394e-05, + "loss": 0.8097, + "step": 3930 + }, + { + "epoch": 0.6056218713900655, + "grad_norm": 0.8213443756103516, + "learning_rate": 3.370481635840744e-05, + "loss": 1.0321, + "step": 3932 + }, + { + "epoch": 0.6059299191374663, + "grad_norm": 1.0013117790222168, + "learning_rate": 3.365907428919536e-05, + "loss": 0.9412, + "step": 3934 + }, + { + "epoch": 0.6062379668848672, + "grad_norm": 0.7968295812606812, + "learning_rate": 3.361334752660456e-05, + "loss": 0.8192, + "step": 3936 + }, + { + "epoch": 0.606546014632268, + "grad_norm": 0.6209965944290161, + "learning_rate": 3.356763611346747e-05, + "loss": 0.9079, + "step": 3938 + }, + { + "epoch": 0.6068540623796689, + "grad_norm": 0.7653431296348572, + "learning_rate": 3.352194009260221e-05, + "loss": 1.0268, + "step": 3940 + }, + { + "epoch": 0.6071621101270697, + "grad_norm": 0.830397367477417, + "learning_rate": 3.3476259506812404e-05, + "loss": 0.9273, + "step": 3942 + }, + { + "epoch": 0.6074701578744706, + "grad_norm": 0.6765947937965393, + "learning_rate": 3.343059439888735e-05, + "loss": 0.9284, + "step": 3944 + }, + { + "epoch": 0.6077782056218713, + "grad_norm": 0.8520391583442688, + "learning_rate": 3.33849448116017e-05, + "loss": 0.98, + "step": 3946 + }, + { + "epoch": 0.6080862533692722, + "grad_norm": 0.8554842472076416, + "learning_rate": 3.3339310787715665e-05, + "loss": 0.907, + "step": 3948 + }, + { + "epoch": 0.608394301116673, + "grad_norm": 1.0883864164352417, + "learning_rate": 3.329369236997486e-05, + "loss": 0.8451, + "step": 3950 + }, + { + "epoch": 0.6087023488640739, + "grad_norm": 0.65107661485672, + "learning_rate": 3.324808960111024e-05, + "loss": 1.0974, + "step": 3952 + }, + { + "epoch": 0.6090103966114748, + "grad_norm": 0.8793396949768066, + "learning_rate": 3.320250252383814e-05, + "loss": 0.9018, + "step": 3954 + }, + { + "epoch": 0.6093184443588756, + "grad_norm": 0.9697176218032837, + "learning_rate": 3.3156931180860195e-05, + "loss": 0.8161, + "step": 3956 + }, + { + "epoch": 0.6096264921062765, + "grad_norm": 0.8237152099609375, + "learning_rate": 3.3111375614863305e-05, + "loss": 1.2178, + "step": 3958 + }, + { + "epoch": 0.6099345398536773, + "grad_norm": 0.8094208240509033, + "learning_rate": 3.306583586851956e-05, + "loss": 1.0218, + "step": 3960 + }, + { + "epoch": 0.6102425876010782, + "grad_norm": 0.7797203063964844, + "learning_rate": 3.302031198448624e-05, + "loss": 0.8513, + "step": 3962 + }, + { + "epoch": 0.610550635348479, + "grad_norm": 0.7247787714004517, + "learning_rate": 3.297480400540581e-05, + "loss": 0.9219, + "step": 3964 + }, + { + "epoch": 0.6108586830958799, + "grad_norm": 0.7033014893531799, + "learning_rate": 3.292931197390581e-05, + "loss": 0.683, + "step": 3966 + }, + { + "epoch": 0.6111667308432807, + "grad_norm": 0.6294431090354919, + "learning_rate": 3.288383593259881e-05, + "loss": 0.9691, + "step": 3968 + }, + { + "epoch": 0.6114747785906816, + "grad_norm": 0.8306741118431091, + "learning_rate": 3.283837592408244e-05, + "loss": 1.1146, + "step": 3970 + }, + { + "epoch": 0.6117828263380825, + "grad_norm": 0.8149116039276123, + "learning_rate": 3.279293199093931e-05, + "loss": 1.1058, + "step": 3972 + }, + { + "epoch": 0.6120908740854832, + "grad_norm": 0.7836449146270752, + "learning_rate": 3.274750417573694e-05, + "loss": 1.1746, + "step": 3974 + }, + { + "epoch": 0.6123989218328841, + "grad_norm": 0.836641788482666, + "learning_rate": 3.270209252102782e-05, + "loss": 0.7761, + "step": 3976 + }, + { + "epoch": 0.6127069695802849, + "grad_norm": 0.9818074703216553, + "learning_rate": 3.2656697069349224e-05, + "loss": 1.2522, + "step": 3978 + }, + { + "epoch": 0.6130150173276858, + "grad_norm": 0.775067150592804, + "learning_rate": 3.26113178632233e-05, + "loss": 0.8516, + "step": 3980 + }, + { + "epoch": 0.6133230650750866, + "grad_norm": 1.0951143503189087, + "learning_rate": 3.2565954945156924e-05, + "loss": 1.0875, + "step": 3982 + }, + { + "epoch": 0.6136311128224875, + "grad_norm": 0.8564605712890625, + "learning_rate": 3.252060835764181e-05, + "loss": 1.164, + "step": 3984 + }, + { + "epoch": 0.6139391605698883, + "grad_norm": 0.7555924654006958, + "learning_rate": 3.2475278143154284e-05, + "loss": 0.8967, + "step": 3986 + }, + { + "epoch": 0.6142472083172892, + "grad_norm": 0.7195894718170166, + "learning_rate": 3.242996434415537e-05, + "loss": 0.8661, + "step": 3988 + }, + { + "epoch": 0.6145552560646901, + "grad_norm": 0.8980828523635864, + "learning_rate": 3.2384667003090727e-05, + "loss": 0.9645, + "step": 3990 + }, + { + "epoch": 0.6148633038120909, + "grad_norm": 0.6523622274398804, + "learning_rate": 3.233938616239058e-05, + "loss": 1.0817, + "step": 3992 + }, + { + "epoch": 0.6151713515594918, + "grad_norm": 0.997121274471283, + "learning_rate": 3.229412186446969e-05, + "loss": 0.9931, + "step": 3994 + }, + { + "epoch": 0.6154793993068925, + "grad_norm": 0.986936628818512, + "learning_rate": 3.2248874151727356e-05, + "loss": 0.8143, + "step": 3996 + }, + { + "epoch": 0.6157874470542934, + "grad_norm": 1.0862888097763062, + "learning_rate": 3.2203643066547315e-05, + "loss": 0.9638, + "step": 3998 + }, + { + "epoch": 0.6160954948016942, + "grad_norm": 0.6776353716850281, + "learning_rate": 3.215842865129773e-05, + "loss": 1.4069, + "step": 4000 + }, + { + "epoch": 0.6164035425490951, + "grad_norm": 0.6925904154777527, + "learning_rate": 3.2113230948331154e-05, + "loss": 0.9874, + "step": 4002 + }, + { + "epoch": 0.6167115902964959, + "grad_norm": 6.114987850189209, + "learning_rate": 3.20680499999845e-05, + "loss": 1.1663, + "step": 4004 + }, + { + "epoch": 0.6170196380438968, + "grad_norm": 0.7539072632789612, + "learning_rate": 3.2022885848578966e-05, + "loss": 1.1407, + "step": 4006 + }, + { + "epoch": 0.6173276857912976, + "grad_norm": 0.5863259434700012, + "learning_rate": 3.197773853642e-05, + "loss": 0.9743, + "step": 4008 + }, + { + "epoch": 0.6176357335386985, + "grad_norm": 0.9714782238006592, + "learning_rate": 3.193260810579733e-05, + "loss": 0.8682, + "step": 4010 + }, + { + "epoch": 0.6179437812860994, + "grad_norm": 1.0604935884475708, + "learning_rate": 3.188749459898482e-05, + "loss": 0.874, + "step": 4012 + }, + { + "epoch": 0.6182518290335002, + "grad_norm": 0.8278828263282776, + "learning_rate": 3.184239805824052e-05, + "loss": 1.1128, + "step": 4014 + }, + { + "epoch": 0.6185598767809011, + "grad_norm": 0.7398038506507874, + "learning_rate": 3.1797318525806575e-05, + "loss": 0.8065, + "step": 4016 + }, + { + "epoch": 0.6188679245283019, + "grad_norm": 0.6247692704200745, + "learning_rate": 3.175225604390917e-05, + "loss": 0.8993, + "step": 4018 + }, + { + "epoch": 0.6191759722757028, + "grad_norm": 0.7333313226699829, + "learning_rate": 3.1707210654758556e-05, + "loss": 0.7988, + "step": 4020 + }, + { + "epoch": 0.6194840200231035, + "grad_norm": 0.9650359153747559, + "learning_rate": 3.166218240054893e-05, + "loss": 0.9056, + "step": 4022 + }, + { + "epoch": 0.6197920677705044, + "grad_norm": 1.0183436870574951, + "learning_rate": 3.161717132345852e-05, + "loss": 0.8781, + "step": 4024 + }, + { + "epoch": 0.6201001155179052, + "grad_norm": 0.8829627633094788, + "learning_rate": 3.157217746564937e-05, + "loss": 0.9361, + "step": 4026 + }, + { + "epoch": 0.6204081632653061, + "grad_norm": 0.8895233273506165, + "learning_rate": 3.1527200869267446e-05, + "loss": 1.0811, + "step": 4028 + }, + { + "epoch": 0.620716211012707, + "grad_norm": 0.6992722749710083, + "learning_rate": 3.148224157644256e-05, + "loss": 2.264, + "step": 4030 + }, + { + "epoch": 0.6210242587601078, + "grad_norm": 0.630937933921814, + "learning_rate": 3.143729962928825e-05, + "loss": 2.1182, + "step": 4032 + }, + { + "epoch": 0.6213323065075087, + "grad_norm": 0.8037968873977661, + "learning_rate": 3.139237506990188e-05, + "loss": 0.8633, + "step": 4034 + }, + { + "epoch": 0.6216403542549095, + "grad_norm": 0.9234476685523987, + "learning_rate": 3.1347467940364466e-05, + "loss": 1.0269, + "step": 4036 + }, + { + "epoch": 0.6219484020023104, + "grad_norm": 0.7384040951728821, + "learning_rate": 3.1302578282740764e-05, + "loss": 1.135, + "step": 4038 + }, + { + "epoch": 0.6222564497497112, + "grad_norm": 0.5950331091880798, + "learning_rate": 3.125770613907909e-05, + "loss": 0.9479, + "step": 4040 + }, + { + "epoch": 0.6225644974971121, + "grad_norm": 0.9147387742996216, + "learning_rate": 3.1212851551411394e-05, + "loss": 0.8846, + "step": 4042 + }, + { + "epoch": 0.6228725452445129, + "grad_norm": 0.782964825630188, + "learning_rate": 3.1168014561753195e-05, + "loss": 0.9488, + "step": 4044 + }, + { + "epoch": 0.6231805929919138, + "grad_norm": 0.8128480911254883, + "learning_rate": 3.1123195212103515e-05, + "loss": 0.9245, + "step": 4046 + }, + { + "epoch": 0.6234886407393146, + "grad_norm": 0.9903732538223267, + "learning_rate": 3.1078393544444804e-05, + "loss": 1.4895, + "step": 4048 + }, + { + "epoch": 0.6237966884867154, + "grad_norm": 0.6795881986618042, + "learning_rate": 3.103360960074304e-05, + "loss": 0.9809, + "step": 4050 + }, + { + "epoch": 0.6241047362341163, + "grad_norm": 0.6919244527816772, + "learning_rate": 3.098884342294753e-05, + "loss": 1.0142, + "step": 4052 + }, + { + "epoch": 0.6244127839815171, + "grad_norm": 0.6578916907310486, + "learning_rate": 3.0944095052990985e-05, + "loss": 1.4133, + "step": 4054 + }, + { + "epoch": 0.624720831728918, + "grad_norm": 0.7969669103622437, + "learning_rate": 3.089936453278937e-05, + "loss": 0.8925, + "step": 4056 + }, + { + "epoch": 0.6250288794763188, + "grad_norm": 0.9621204733848572, + "learning_rate": 3.0854651904241993e-05, + "loss": 0.9851, + "step": 4058 + }, + { + "epoch": 0.6253369272237197, + "grad_norm": 0.7093731164932251, + "learning_rate": 3.08099572092314e-05, + "loss": 1.0454, + "step": 4060 + }, + { + "epoch": 0.6256449749711205, + "grad_norm": 0.9264700412750244, + "learning_rate": 3.076528048962327e-05, + "loss": 0.814, + "step": 4062 + }, + { + "epoch": 0.6259530227185214, + "grad_norm": 0.9630382657051086, + "learning_rate": 3.072062178726657e-05, + "loss": 1.0488, + "step": 4064 + }, + { + "epoch": 0.6262610704659222, + "grad_norm": 0.7568457126617432, + "learning_rate": 3.067598114399325e-05, + "loss": 1.015, + "step": 4066 + }, + { + "epoch": 0.6265691182133231, + "grad_norm": 0.6907071471214294, + "learning_rate": 3.063135860161842e-05, + "loss": 0.8186, + "step": 4068 + }, + { + "epoch": 0.626877165960724, + "grad_norm": 1.03848397731781, + "learning_rate": 3.0586754201940235e-05, + "loss": 0.8725, + "step": 4070 + }, + { + "epoch": 0.6271852137081247, + "grad_norm": 0.7759397625923157, + "learning_rate": 3.054216798673987e-05, + "loss": 0.9653, + "step": 4072 + }, + { + "epoch": 0.6274932614555256, + "grad_norm": 0.544243335723877, + "learning_rate": 3.049759999778139e-05, + "loss": 0.9639, + "step": 4074 + }, + { + "epoch": 0.6278013092029264, + "grad_norm": 0.8743525147438049, + "learning_rate": 3.0453050276811856e-05, + "loss": 0.8889, + "step": 4076 + }, + { + "epoch": 0.6281093569503273, + "grad_norm": 0.8232191205024719, + "learning_rate": 3.0408518865561225e-05, + "loss": 1.1389, + "step": 4078 + }, + { + "epoch": 0.6284174046977281, + "grad_norm": 0.8645479679107666, + "learning_rate": 3.0364005805742246e-05, + "loss": 1.0798, + "step": 4080 + }, + { + "epoch": 0.628725452445129, + "grad_norm": 0.8836220502853394, + "learning_rate": 3.0319511139050504e-05, + "loss": 0.9966, + "step": 4082 + }, + { + "epoch": 0.6290335001925298, + "grad_norm": 0.9457396268844604, + "learning_rate": 3.0275034907164396e-05, + "loss": 0.9485, + "step": 4084 + }, + { + "epoch": 0.6293415479399307, + "grad_norm": 0.8273984789848328, + "learning_rate": 3.0230577151745006e-05, + "loss": 1.7574, + "step": 4086 + }, + { + "epoch": 0.6296495956873316, + "grad_norm": 0.6231035590171814, + "learning_rate": 3.0186137914436085e-05, + "loss": 1.0173, + "step": 4088 + }, + { + "epoch": 0.6299576434347324, + "grad_norm": 0.7283669710159302, + "learning_rate": 3.014171723686411e-05, + "loss": 0.9442, + "step": 4090 + }, + { + "epoch": 0.6302656911821333, + "grad_norm": 1.0157694816589355, + "learning_rate": 3.009731516063813e-05, + "loss": 1.0788, + "step": 4092 + }, + { + "epoch": 0.6305737389295341, + "grad_norm": 0.7202984094619751, + "learning_rate": 3.0052931727349777e-05, + "loss": 0.9958, + "step": 4094 + }, + { + "epoch": 0.630881786676935, + "grad_norm": 0.699341356754303, + "learning_rate": 3.0008566978573206e-05, + "loss": 0.8117, + "step": 4096 + }, + { + "epoch": 0.6311898344243357, + "grad_norm": 0.7673215866088867, + "learning_rate": 2.9964220955865095e-05, + "loss": 0.7943, + "step": 4098 + }, + { + "epoch": 0.6314978821717366, + "grad_norm": 0.9850109815597534, + "learning_rate": 2.9919893700764566e-05, + "loss": 0.9798, + "step": 4100 + }, + { + "epoch": 0.6318059299191374, + "grad_norm": 0.8283570408821106, + "learning_rate": 2.9875585254793163e-05, + "loss": 1.0675, + "step": 4102 + }, + { + "epoch": 0.6321139776665383, + "grad_norm": 0.7977241277694702, + "learning_rate": 2.9831295659454838e-05, + "loss": 0.9345, + "step": 4104 + }, + { + "epoch": 0.6324220254139392, + "grad_norm": 1.0206035375595093, + "learning_rate": 2.9787024956235837e-05, + "loss": 0.9714, + "step": 4106 + }, + { + "epoch": 0.63273007316134, + "grad_norm": 0.7831523418426514, + "learning_rate": 2.974277318660472e-05, + "loss": 0.9212, + "step": 4108 + }, + { + "epoch": 0.6330381209087409, + "grad_norm": 0.7216141223907471, + "learning_rate": 2.9698540392012364e-05, + "loss": 0.7925, + "step": 4110 + }, + { + "epoch": 0.6333461686561417, + "grad_norm": 0.9296200275421143, + "learning_rate": 2.965432661389182e-05, + "loss": 0.8214, + "step": 4112 + }, + { + "epoch": 0.6336542164035426, + "grad_norm": 0.9630807638168335, + "learning_rate": 2.9610131893658328e-05, + "loss": 1.0262, + "step": 4114 + }, + { + "epoch": 0.6339622641509434, + "grad_norm": 0.7839917540550232, + "learning_rate": 2.9565956272709282e-05, + "loss": 1.5632, + "step": 4116 + }, + { + "epoch": 0.6342703118983443, + "grad_norm": 0.9187052845954895, + "learning_rate": 2.952179979242422e-05, + "loss": 0.9697, + "step": 4118 + }, + { + "epoch": 0.634578359645745, + "grad_norm": 1.2686012983322144, + "learning_rate": 2.9477662494164703e-05, + "loss": 1.8759, + "step": 4120 + }, + { + "epoch": 0.634886407393146, + "grad_norm": 1.2480783462524414, + "learning_rate": 2.943354441927434e-05, + "loss": 1.0542, + "step": 4122 + }, + { + "epoch": 0.6351944551405467, + "grad_norm": 0.7378079295158386, + "learning_rate": 2.938944560907876e-05, + "loss": 0.7757, + "step": 4124 + }, + { + "epoch": 0.6355025028879476, + "grad_norm": 0.715215802192688, + "learning_rate": 2.9345366104885514e-05, + "loss": 1.0343, + "step": 4126 + }, + { + "epoch": 0.6358105506353485, + "grad_norm": 1.0192413330078125, + "learning_rate": 2.930130594798405e-05, + "loss": 1.103, + "step": 4128 + }, + { + "epoch": 0.6361185983827493, + "grad_norm": 0.7151640057563782, + "learning_rate": 2.9257265179645764e-05, + "loss": 1.0364, + "step": 4130 + }, + { + "epoch": 0.6364266461301502, + "grad_norm": 1.1779733896255493, + "learning_rate": 2.921324384112384e-05, + "loss": 0.9568, + "step": 4132 + }, + { + "epoch": 0.636734693877551, + "grad_norm": 0.9525445103645325, + "learning_rate": 2.916924197365325e-05, + "loss": 1.1174, + "step": 4134 + }, + { + "epoch": 0.6370427416249519, + "grad_norm": 0.894616961479187, + "learning_rate": 2.9125259618450768e-05, + "loss": 1.0455, + "step": 4136 + }, + { + "epoch": 0.6373507893723527, + "grad_norm": 0.9701914191246033, + "learning_rate": 2.9081296816714864e-05, + "loss": 0.9364, + "step": 4138 + }, + { + "epoch": 0.6376588371197536, + "grad_norm": 0.7523389458656311, + "learning_rate": 2.9037353609625695e-05, + "loss": 0.8925, + "step": 4140 + }, + { + "epoch": 0.6379668848671544, + "grad_norm": 0.6569038033485413, + "learning_rate": 2.899343003834508e-05, + "loss": 0.8933, + "step": 4142 + }, + { + "epoch": 0.6382749326145553, + "grad_norm": 1.0741729736328125, + "learning_rate": 2.894952614401642e-05, + "loss": 0.9418, + "step": 4144 + }, + { + "epoch": 0.6385829803619562, + "grad_norm": 0.872235894203186, + "learning_rate": 2.8905641967764706e-05, + "loss": 1.0024, + "step": 4146 + }, + { + "epoch": 0.6388910281093569, + "grad_norm": 0.6244677901268005, + "learning_rate": 2.8861777550696444e-05, + "loss": 1.1566, + "step": 4148 + }, + { + "epoch": 0.6391990758567578, + "grad_norm": 0.7942041754722595, + "learning_rate": 2.8817932933899637e-05, + "loss": 0.9897, + "step": 4150 + }, + { + "epoch": 0.6395071236041586, + "grad_norm": 0.8112989664077759, + "learning_rate": 2.877410815844376e-05, + "loss": 0.9111, + "step": 4152 + }, + { + "epoch": 0.6398151713515595, + "grad_norm": 0.6369266510009766, + "learning_rate": 2.8730303265379654e-05, + "loss": 0.9752, + "step": 4154 + }, + { + "epoch": 0.6401232190989603, + "grad_norm": 0.8934615850448608, + "learning_rate": 2.8686518295739595e-05, + "loss": 0.887, + "step": 4156 + }, + { + "epoch": 0.6404312668463612, + "grad_norm": 0.8280193209648132, + "learning_rate": 2.864275329053715e-05, + "loss": 2.0003, + "step": 4158 + }, + { + "epoch": 0.640739314593762, + "grad_norm": 0.8070312738418579, + "learning_rate": 2.8599008290767204e-05, + "loss": 0.8533, + "step": 4160 + }, + { + "epoch": 0.6410473623411629, + "grad_norm": 0.9471308588981628, + "learning_rate": 2.8555283337405892e-05, + "loss": 1.0428, + "step": 4162 + }, + { + "epoch": 0.6413554100885638, + "grad_norm": 0.8286849856376648, + "learning_rate": 2.8511578471410637e-05, + "loss": 0.8051, + "step": 4164 + }, + { + "epoch": 0.6416634578359646, + "grad_norm": 0.6716766953468323, + "learning_rate": 2.846789373371993e-05, + "loss": 1.174, + "step": 4166 + }, + { + "epoch": 0.6419715055833655, + "grad_norm": 1.3844709396362305, + "learning_rate": 2.842422916525349e-05, + "loss": 0.9692, + "step": 4168 + }, + { + "epoch": 0.6422795533307663, + "grad_norm": 0.569908857345581, + "learning_rate": 2.8380584806912104e-05, + "loss": 3.4552, + "step": 4170 + }, + { + "epoch": 0.6425876010781671, + "grad_norm": 0.8689776062965393, + "learning_rate": 2.8336960699577698e-05, + "loss": 0.87, + "step": 4172 + }, + { + "epoch": 0.6428956488255679, + "grad_norm": 0.8701666593551636, + "learning_rate": 2.829335688411312e-05, + "loss": 0.9399, + "step": 4174 + }, + { + "epoch": 0.6432036965729688, + "grad_norm": 0.7730369567871094, + "learning_rate": 2.8249773401362267e-05, + "loss": 1.0089, + "step": 4176 + }, + { + "epoch": 0.6435117443203696, + "grad_norm": 0.8679829239845276, + "learning_rate": 2.820621029215003e-05, + "loss": 0.8542, + "step": 4178 + }, + { + "epoch": 0.6438197920677705, + "grad_norm": 0.6362239122390747, + "learning_rate": 2.8162667597282176e-05, + "loss": 1.0184, + "step": 4180 + }, + { + "epoch": 0.6441278398151713, + "grad_norm": 0.887014627456665, + "learning_rate": 2.8119145357545295e-05, + "loss": 0.986, + "step": 4182 + }, + { + "epoch": 0.6444358875625722, + "grad_norm": 0.6491660475730896, + "learning_rate": 2.8075643613706938e-05, + "loss": 0.9471, + "step": 4184 + }, + { + "epoch": 0.6447439353099731, + "grad_norm": 0.9777161478996277, + "learning_rate": 2.8032162406515372e-05, + "loss": 1.2648, + "step": 4186 + }, + { + "epoch": 0.6450519830573739, + "grad_norm": 0.8161064982414246, + "learning_rate": 2.7988701776699612e-05, + "loss": 0.8511, + "step": 4188 + }, + { + "epoch": 0.6453600308047748, + "grad_norm": 0.8995189070701599, + "learning_rate": 2.7945261764969442e-05, + "loss": 0.9348, + "step": 4190 + }, + { + "epoch": 0.6456680785521756, + "grad_norm": 0.7900372743606567, + "learning_rate": 2.7901842412015355e-05, + "loss": 1.3994, + "step": 4192 + }, + { + "epoch": 0.6459761262995765, + "grad_norm": 0.9890312552452087, + "learning_rate": 2.785844375850847e-05, + "loss": 1.2731, + "step": 4194 + }, + { + "epoch": 0.6462841740469772, + "grad_norm": 0.77409827709198, + "learning_rate": 2.7815065845100436e-05, + "loss": 0.8889, + "step": 4196 + }, + { + "epoch": 0.6465922217943781, + "grad_norm": 0.5893883109092712, + "learning_rate": 2.7771708712423615e-05, + "loss": 0.8937, + "step": 4198 + }, + { + "epoch": 0.6469002695417789, + "grad_norm": 0.7560586333274841, + "learning_rate": 2.7728372401090806e-05, + "loss": 0.9525, + "step": 4200 + }, + { + "epoch": 0.6469002695417789, + "eval_loss": 2.357358932495117, + "eval_runtime": 737.0158, + "eval_samples_per_second": 2.714, + "eval_steps_per_second": 0.678, + "step": 4200 + }, + { + "epoch": 0.6472083172891798, + "grad_norm": 0.5760608911514282, + "learning_rate": 2.7685056951695354e-05, + "loss": 0.7893, + "step": 4202 + }, + { + "epoch": 0.6475163650365807, + "grad_norm": 0.5902154445648193, + "learning_rate": 2.764176240481102e-05, + "loss": 1.0523, + "step": 4204 + }, + { + "epoch": 0.6478244127839815, + "grad_norm": 1.1871650218963623, + "learning_rate": 2.7598488800992018e-05, + "loss": 1.0079, + "step": 4206 + }, + { + "epoch": 0.6481324605313824, + "grad_norm": 1.000658631324768, + "learning_rate": 2.7555236180772937e-05, + "loss": 0.9369, + "step": 4208 + }, + { + "epoch": 0.6484405082787832, + "grad_norm": 0.6931748986244202, + "learning_rate": 2.7512004584668694e-05, + "loss": 0.9145, + "step": 4210 + }, + { + "epoch": 0.6487485560261841, + "grad_norm": 0.9558520317077637, + "learning_rate": 2.7468794053174547e-05, + "loss": 0.9502, + "step": 4212 + }, + { + "epoch": 0.6490566037735849, + "grad_norm": 0.6980428099632263, + "learning_rate": 2.7425604626765988e-05, + "loss": 0.7277, + "step": 4214 + }, + { + "epoch": 0.6493646515209858, + "grad_norm": 0.8523775935173035, + "learning_rate": 2.7382436345898754e-05, + "loss": 0.9375, + "step": 4216 + }, + { + "epoch": 0.6496726992683866, + "grad_norm": 0.8459200263023376, + "learning_rate": 2.7339289251008782e-05, + "loss": 0.8459, + "step": 4218 + }, + { + "epoch": 0.6499807470157875, + "grad_norm": 1.007840871810913, + "learning_rate": 2.729616338251215e-05, + "loss": 0.9507, + "step": 4220 + }, + { + "epoch": 0.6502887947631883, + "grad_norm": 0.9047834277153015, + "learning_rate": 2.7253058780805064e-05, + "loss": 0.8881, + "step": 4222 + }, + { + "epoch": 0.6505968425105891, + "grad_norm": 0.7920469641685486, + "learning_rate": 2.7209975486263807e-05, + "loss": 1.1688, + "step": 4224 + }, + { + "epoch": 0.65090489025799, + "grad_norm": 0.6454511284828186, + "learning_rate": 2.71669135392447e-05, + "loss": 2.2615, + "step": 4226 + }, + { + "epoch": 0.6512129380053908, + "grad_norm": 0.7499375343322754, + "learning_rate": 2.7123872980084066e-05, + "loss": 1.1278, + "step": 4228 + }, + { + "epoch": 0.6515209857527917, + "grad_norm": 0.9970037937164307, + "learning_rate": 2.7080853849098198e-05, + "loss": 0.8617, + "step": 4230 + }, + { + "epoch": 0.6518290335001925, + "grad_norm": 0.6305007934570312, + "learning_rate": 2.703785618658332e-05, + "loss": 0.7885, + "step": 4232 + }, + { + "epoch": 0.6521370812475934, + "grad_norm": 0.6080676317214966, + "learning_rate": 2.699488003281554e-05, + "loss": 0.8886, + "step": 4234 + }, + { + "epoch": 0.6524451289949942, + "grad_norm": 0.5880206227302551, + "learning_rate": 2.6951925428050807e-05, + "loss": 0.9416, + "step": 4236 + }, + { + "epoch": 0.6527531767423951, + "grad_norm": 0.8096851110458374, + "learning_rate": 2.6908992412524948e-05, + "loss": 0.9169, + "step": 4238 + }, + { + "epoch": 0.6530612244897959, + "grad_norm": 0.9902066588401794, + "learning_rate": 2.686608102645347e-05, + "loss": 1.0152, + "step": 4240 + }, + { + "epoch": 0.6533692722371968, + "grad_norm": 0.7456113696098328, + "learning_rate": 2.682319131003166e-05, + "loss": 1.0577, + "step": 4242 + }, + { + "epoch": 0.6536773199845977, + "grad_norm": 0.7236586809158325, + "learning_rate": 2.6780323303434586e-05, + "loss": 0.8771, + "step": 4244 + }, + { + "epoch": 0.6539853677319984, + "grad_norm": 1.0047794580459595, + "learning_rate": 2.673747704681684e-05, + "loss": 1.178, + "step": 4246 + }, + { + "epoch": 0.6542934154793993, + "grad_norm": 0.9127473831176758, + "learning_rate": 2.669465258031273e-05, + "loss": 1.2571, + "step": 4248 + }, + { + "epoch": 0.6546014632268001, + "grad_norm": 1.1883794069290161, + "learning_rate": 2.6651849944036118e-05, + "loss": 1.1044, + "step": 4250 + }, + { + "epoch": 0.654909510974201, + "grad_norm": 0.613161027431488, + "learning_rate": 2.6609069178080486e-05, + "loss": 0.7232, + "step": 4252 + }, + { + "epoch": 0.6552175587216018, + "grad_norm": 0.7150982618331909, + "learning_rate": 2.656631032251873e-05, + "loss": 1.0834, + "step": 4254 + }, + { + "epoch": 0.6555256064690027, + "grad_norm": 0.6977151036262512, + "learning_rate": 2.6523573417403258e-05, + "loss": 0.8445, + "step": 4256 + }, + { + "epoch": 0.6558336542164035, + "grad_norm": 0.626096248626709, + "learning_rate": 2.648085850276597e-05, + "loss": 0.8876, + "step": 4258 + }, + { + "epoch": 0.6561417019638044, + "grad_norm": 0.8646566867828369, + "learning_rate": 2.6438165618618127e-05, + "loss": 1.1347, + "step": 4260 + }, + { + "epoch": 0.6564497497112053, + "grad_norm": 0.8802438378334045, + "learning_rate": 2.6395494804950316e-05, + "loss": 1.6441, + "step": 4262 + }, + { + "epoch": 0.6567577974586061, + "grad_norm": 0.8071707487106323, + "learning_rate": 2.6352846101732474e-05, + "loss": 1.0018, + "step": 4264 + }, + { + "epoch": 0.657065845206007, + "grad_norm": 0.8037858605384827, + "learning_rate": 2.6310219548913917e-05, + "loss": 0.962, + "step": 4266 + }, + { + "epoch": 0.6573738929534078, + "grad_norm": 0.5976040959358215, + "learning_rate": 2.6267615186423068e-05, + "loss": 0.853, + "step": 4268 + }, + { + "epoch": 0.6576819407008087, + "grad_norm": 1.2724300622940063, + "learning_rate": 2.6225033054167626e-05, + "loss": 1.9666, + "step": 4270 + }, + { + "epoch": 0.6579899884482094, + "grad_norm": 0.8453805446624756, + "learning_rate": 2.6182473192034524e-05, + "loss": 1.0949, + "step": 4272 + }, + { + "epoch": 0.6582980361956103, + "grad_norm": 0.9200473427772522, + "learning_rate": 2.613993563988978e-05, + "loss": 0.8304, + "step": 4274 + }, + { + "epoch": 0.6586060839430111, + "grad_norm": 0.8198157548904419, + "learning_rate": 2.6097420437578447e-05, + "loss": 0.7959, + "step": 4276 + }, + { + "epoch": 0.658914131690412, + "grad_norm": 0.6957106590270996, + "learning_rate": 2.6054927624924785e-05, + "loss": 0.7985, + "step": 4278 + }, + { + "epoch": 0.6592221794378129, + "grad_norm": 0.6710758805274963, + "learning_rate": 2.6012457241731986e-05, + "loss": 0.9831, + "step": 4280 + }, + { + "epoch": 0.6595302271852137, + "grad_norm": 1.1137224435806274, + "learning_rate": 2.5970009327782274e-05, + "loss": 1.033, + "step": 4282 + }, + { + "epoch": 0.6598382749326146, + "grad_norm": 0.8313379287719727, + "learning_rate": 2.592758392283675e-05, + "loss": 0.9137, + "step": 4284 + }, + { + "epoch": 0.6601463226800154, + "grad_norm": 0.7043817043304443, + "learning_rate": 2.5885181066635545e-05, + "loss": 1.1218, + "step": 4286 + }, + { + "epoch": 0.6604543704274163, + "grad_norm": 0.8582731485366821, + "learning_rate": 2.58428007988976e-05, + "loss": 1.0096, + "step": 4288 + }, + { + "epoch": 0.6607624181748171, + "grad_norm": 0.7957725524902344, + "learning_rate": 2.5800443159320696e-05, + "loss": 0.9304, + "step": 4290 + }, + { + "epoch": 0.661070465922218, + "grad_norm": 0.9089280962944031, + "learning_rate": 2.575810818758145e-05, + "loss": 1.0874, + "step": 4292 + }, + { + "epoch": 0.6613785136696188, + "grad_norm": 0.9526920318603516, + "learning_rate": 2.5715795923335205e-05, + "loss": 1.1148, + "step": 4294 + }, + { + "epoch": 0.6616865614170196, + "grad_norm": 0.8966755867004395, + "learning_rate": 2.5673506406216074e-05, + "loss": 0.8414, + "step": 4296 + }, + { + "epoch": 0.6619946091644204, + "grad_norm": 0.9842838644981384, + "learning_rate": 2.5631239675836838e-05, + "loss": 1.0398, + "step": 4298 + }, + { + "epoch": 0.6623026569118213, + "grad_norm": 0.7387715578079224, + "learning_rate": 2.5588995771788942e-05, + "loss": 1.1329, + "step": 4300 + }, + { + "epoch": 0.6626107046592222, + "grad_norm": 0.8699015974998474, + "learning_rate": 2.5546774733642442e-05, + "loss": 1.213, + "step": 4302 + }, + { + "epoch": 0.662918752406623, + "grad_norm": 0.7089219689369202, + "learning_rate": 2.5504576600945994e-05, + "loss": 1.0877, + "step": 4304 + }, + { + "epoch": 0.6632268001540239, + "grad_norm": 0.8745829463005066, + "learning_rate": 2.5462401413226766e-05, + "loss": 0.9235, + "step": 4306 + }, + { + "epoch": 0.6635348479014247, + "grad_norm": 0.7542290091514587, + "learning_rate": 2.542024920999047e-05, + "loss": 1.0047, + "step": 4308 + }, + { + "epoch": 0.6638428956488256, + "grad_norm": 0.7538411617279053, + "learning_rate": 2.5378120030721263e-05, + "loss": 1.0855, + "step": 4310 + }, + { + "epoch": 0.6641509433962264, + "grad_norm": 0.7625778317451477, + "learning_rate": 2.533601391488175e-05, + "loss": 1.8407, + "step": 4312 + }, + { + "epoch": 0.6644589911436273, + "grad_norm": 0.8806657791137695, + "learning_rate": 2.529393090191292e-05, + "loss": 1.089, + "step": 4314 + }, + { + "epoch": 0.6647670388910281, + "grad_norm": 0.6660029888153076, + "learning_rate": 2.5251871031234108e-05, + "loss": 1.1389, + "step": 4316 + }, + { + "epoch": 0.665075086638429, + "grad_norm": 0.8225176930427551, + "learning_rate": 2.5209834342243042e-05, + "loss": 0.7636, + "step": 4318 + }, + { + "epoch": 0.6653831343858299, + "grad_norm": 0.6568019986152649, + "learning_rate": 2.516782087431565e-05, + "loss": 1.2996, + "step": 4320 + }, + { + "epoch": 0.6656911821332306, + "grad_norm": 0.860137403011322, + "learning_rate": 2.5125830666806137e-05, + "loss": 0.9067, + "step": 4322 + }, + { + "epoch": 0.6659992298806315, + "grad_norm": 0.7698874473571777, + "learning_rate": 2.5083863759046943e-05, + "loss": 1.3507, + "step": 4324 + }, + { + "epoch": 0.6663072776280323, + "grad_norm": 0.8295789957046509, + "learning_rate": 2.5041920190348655e-05, + "loss": 1.0864, + "step": 4326 + }, + { + "epoch": 0.6666153253754332, + "grad_norm": 0.7729510068893433, + "learning_rate": 2.500000000000001e-05, + "loss": 0.9469, + "step": 4328 + }, + { + "epoch": 0.666923373122834, + "grad_norm": 0.859638512134552, + "learning_rate": 2.4958103227267836e-05, + "loss": 2.426, + "step": 4330 + }, + { + "epoch": 0.6672314208702349, + "grad_norm": 0.67510586977005, + "learning_rate": 2.4916229911397083e-05, + "loss": 1.0348, + "step": 4332 + }, + { + "epoch": 0.6675394686176357, + "grad_norm": 0.6896673440933228, + "learning_rate": 2.4874380091610627e-05, + "loss": 1.0989, + "step": 4334 + }, + { + "epoch": 0.6678475163650366, + "grad_norm": 0.8158189058303833, + "learning_rate": 2.4832553807109392e-05, + "loss": 1.0081, + "step": 4336 + }, + { + "epoch": 0.6681555641124374, + "grad_norm": 0.8096099495887756, + "learning_rate": 2.479075109707229e-05, + "loss": 0.8653, + "step": 4338 + }, + { + "epoch": 0.6684636118598383, + "grad_norm": 0.8060958385467529, + "learning_rate": 2.474897200065611e-05, + "loss": 0.9234, + "step": 4340 + }, + { + "epoch": 0.6687716596072392, + "grad_norm": 1.3544212579727173, + "learning_rate": 2.47072165569955e-05, + "loss": 1.6383, + "step": 4342 + }, + { + "epoch": 0.66907970735464, + "grad_norm": 0.7652481198310852, + "learning_rate": 2.466548480520296e-05, + "loss": 0.9152, + "step": 4344 + }, + { + "epoch": 0.6693877551020408, + "grad_norm": 1.2091152667999268, + "learning_rate": 2.4623776784368868e-05, + "loss": 2.234, + "step": 4346 + }, + { + "epoch": 0.6696958028494416, + "grad_norm": 0.8425005078315735, + "learning_rate": 2.4582092533561325e-05, + "loss": 0.9222, + "step": 4348 + }, + { + "epoch": 0.6700038505968425, + "grad_norm": 0.8576799631118774, + "learning_rate": 2.4540432091826087e-05, + "loss": 1.0903, + "step": 4350 + }, + { + "epoch": 0.6703118983442433, + "grad_norm": 0.7403795719146729, + "learning_rate": 2.449879549818676e-05, + "loss": 0.8604, + "step": 4352 + }, + { + "epoch": 0.6706199460916442, + "grad_norm": 0.845689058303833, + "learning_rate": 2.445718279164453e-05, + "loss": 1.425, + "step": 4354 + }, + { + "epoch": 0.670927993839045, + "grad_norm": 0.9343613386154175, + "learning_rate": 2.441559401117815e-05, + "loss": 0.9845, + "step": 4356 + }, + { + "epoch": 0.6712360415864459, + "grad_norm": 0.7838619351387024, + "learning_rate": 2.4374029195744093e-05, + "loss": 0.8254, + "step": 4358 + }, + { + "epoch": 0.6715440893338468, + "grad_norm": 0.8020996451377869, + "learning_rate": 2.433248838427628e-05, + "loss": 0.9853, + "step": 4360 + }, + { + "epoch": 0.6718521370812476, + "grad_norm": 0.9349923729896545, + "learning_rate": 2.4290971615686215e-05, + "loss": 0.9348, + "step": 4362 + }, + { + "epoch": 0.6721601848286485, + "grad_norm": 0.5520917773246765, + "learning_rate": 2.424947892886279e-05, + "loss": 1.0036, + "step": 4364 + }, + { + "epoch": 0.6724682325760493, + "grad_norm": 0.8372582793235779, + "learning_rate": 2.4208010362672444e-05, + "loss": 0.9508, + "step": 4366 + }, + { + "epoch": 0.6727762803234502, + "grad_norm": 0.8463919162750244, + "learning_rate": 2.4166565955958976e-05, + "loss": 1.0411, + "step": 4368 + }, + { + "epoch": 0.673084328070851, + "grad_norm": 0.9879681468009949, + "learning_rate": 2.4125145747543537e-05, + "loss": 0.9868, + "step": 4370 + }, + { + "epoch": 0.6733923758182518, + "grad_norm": 0.8925761580467224, + "learning_rate": 2.408374977622464e-05, + "loss": 1.018, + "step": 4372 + }, + { + "epoch": 0.6737004235656526, + "grad_norm": 0.9484021067619324, + "learning_rate": 2.404237808077808e-05, + "loss": 0.978, + "step": 4374 + }, + { + "epoch": 0.6740084713130535, + "grad_norm": 0.8627564907073975, + "learning_rate": 2.4001030699956916e-05, + "loss": 0.8258, + "step": 4376 + }, + { + "epoch": 0.6743165190604544, + "grad_norm": 0.8027501702308655, + "learning_rate": 2.3959707672491437e-05, + "loss": 0.9057, + "step": 4378 + }, + { + "epoch": 0.6746245668078552, + "grad_norm": 0.9469018578529358, + "learning_rate": 2.3918409037089112e-05, + "loss": 1.0835, + "step": 4380 + }, + { + "epoch": 0.6749326145552561, + "grad_norm": 0.8287208676338196, + "learning_rate": 2.3877134832434567e-05, + "loss": 0.882, + "step": 4382 + }, + { + "epoch": 0.6752406623026569, + "grad_norm": 0.6991237998008728, + "learning_rate": 2.3835885097189535e-05, + "loss": 0.9511, + "step": 4384 + }, + { + "epoch": 0.6755487100500578, + "grad_norm": 0.9674092531204224, + "learning_rate": 2.3794659869992848e-05, + "loss": 1.1107, + "step": 4386 + }, + { + "epoch": 0.6758567577974586, + "grad_norm": 0.846439003944397, + "learning_rate": 2.375345918946036e-05, + "loss": 0.736, + "step": 4388 + }, + { + "epoch": 0.6761648055448595, + "grad_norm": 0.6624884009361267, + "learning_rate": 2.3712283094184934e-05, + "loss": 1.1075, + "step": 4390 + }, + { + "epoch": 0.6764728532922603, + "grad_norm": 0.5450026988983154, + "learning_rate": 2.3671131622736427e-05, + "loss": 0.7622, + "step": 4392 + }, + { + "epoch": 0.6767809010396612, + "grad_norm": 0.9617735147476196, + "learning_rate": 2.36300048136616e-05, + "loss": 1.0834, + "step": 4394 + }, + { + "epoch": 0.6770889487870619, + "grad_norm": 0.9630662798881531, + "learning_rate": 2.358890270548413e-05, + "loss": 2.0419, + "step": 4396 + }, + { + "epoch": 0.6773969965344628, + "grad_norm": 0.7190920114517212, + "learning_rate": 2.3547825336704555e-05, + "loss": 1.226, + "step": 4398 + }, + { + "epoch": 0.6777050442818637, + "grad_norm": 0.7295611500740051, + "learning_rate": 2.3506772745800238e-05, + "loss": 0.7479, + "step": 4400 + }, + { + "epoch": 0.6780130920292645, + "grad_norm": 0.7448195219039917, + "learning_rate": 2.3465744971225333e-05, + "loss": 0.9424, + "step": 4402 + }, + { + "epoch": 0.6783211397766654, + "grad_norm": 0.897513747215271, + "learning_rate": 2.342474205141073e-05, + "loss": 0.9566, + "step": 4404 + }, + { + "epoch": 0.6786291875240662, + "grad_norm": 0.7958559393882751, + "learning_rate": 2.3383764024764105e-05, + "loss": 0.8318, + "step": 4406 + }, + { + "epoch": 0.6789372352714671, + "grad_norm": 0.6669363379478455, + "learning_rate": 2.3342810929669712e-05, + "loss": 0.7176, + "step": 4408 + }, + { + "epoch": 0.6792452830188679, + "grad_norm": 0.6453452110290527, + "learning_rate": 2.330188280448851e-05, + "loss": 1.4984, + "step": 4410 + }, + { + "epoch": 0.6795533307662688, + "grad_norm": 0.7844343781471252, + "learning_rate": 2.326097968755812e-05, + "loss": 0.9918, + "step": 4412 + }, + { + "epoch": 0.6798613785136696, + "grad_norm": 0.7631637454032898, + "learning_rate": 2.322010161719263e-05, + "loss": 2.3614, + "step": 4414 + }, + { + "epoch": 0.6801694262610705, + "grad_norm": 0.7821967005729675, + "learning_rate": 2.3179248631682726e-05, + "loss": 1.7651, + "step": 4416 + }, + { + "epoch": 0.6804774740084714, + "grad_norm": 0.754566490650177, + "learning_rate": 2.3138420769295577e-05, + "loss": 0.964, + "step": 4418 + }, + { + "epoch": 0.6807855217558721, + "grad_norm": 0.737114667892456, + "learning_rate": 2.309761806827489e-05, + "loss": 0.7734, + "step": 4420 + }, + { + "epoch": 0.681093569503273, + "grad_norm": 0.9395533800125122, + "learning_rate": 2.3056840566840688e-05, + "loss": 0.9731, + "step": 4422 + }, + { + "epoch": 0.6814016172506738, + "grad_norm": 0.6122387051582336, + "learning_rate": 2.301608830318945e-05, + "loss": 0.8247, + "step": 4424 + }, + { + "epoch": 0.6817096649980747, + "grad_norm": 0.6384727358818054, + "learning_rate": 2.2975361315494037e-05, + "loss": 0.7968, + "step": 4426 + }, + { + "epoch": 0.6820177127454755, + "grad_norm": 0.8586402535438538, + "learning_rate": 2.293465964190362e-05, + "loss": 1.0213, + "step": 4428 + }, + { + "epoch": 0.6823257604928764, + "grad_norm": 0.8942074775695801, + "learning_rate": 2.2893983320543588e-05, + "loss": 0.9089, + "step": 4430 + }, + { + "epoch": 0.6826338082402772, + "grad_norm": 0.9709880352020264, + "learning_rate": 2.2853332389515698e-05, + "loss": 1.2399, + "step": 4432 + }, + { + "epoch": 0.6829418559876781, + "grad_norm": 0.8261923789978027, + "learning_rate": 2.281270688689784e-05, + "loss": 0.8675, + "step": 4434 + }, + { + "epoch": 0.683249903735079, + "grad_norm": 0.6313756704330444, + "learning_rate": 2.2772106850744136e-05, + "loss": 1.0011, + "step": 4436 + }, + { + "epoch": 0.6835579514824798, + "grad_norm": 0.818619966506958, + "learning_rate": 2.2731532319084774e-05, + "loss": 0.7433, + "step": 4438 + }, + { + "epoch": 0.6838659992298807, + "grad_norm": 0.8863322138786316, + "learning_rate": 2.2690983329926157e-05, + "loss": 0.9186, + "step": 4440 + }, + { + "epoch": 0.6841740469772815, + "grad_norm": 1.1261119842529297, + "learning_rate": 2.2650459921250723e-05, + "loss": 1.0289, + "step": 4442 + }, + { + "epoch": 0.6844820947246824, + "grad_norm": 0.790084958076477, + "learning_rate": 2.2609962131016872e-05, + "loss": 1.8737, + "step": 4444 + }, + { + "epoch": 0.6847901424720831, + "grad_norm": 0.6505240797996521, + "learning_rate": 2.2569489997159127e-05, + "loss": 0.8129, + "step": 4446 + }, + { + "epoch": 0.685098190219484, + "grad_norm": 0.7163221836090088, + "learning_rate": 2.2529043557587913e-05, + "loss": 1.1333, + "step": 4448 + }, + { + "epoch": 0.6854062379668848, + "grad_norm": 0.7663728594779968, + "learning_rate": 2.24886228501896e-05, + "loss": 0.8635, + "step": 4450 + }, + { + "epoch": 0.6857142857142857, + "grad_norm": 0.6637657880783081, + "learning_rate": 2.244822791282645e-05, + "loss": 0.7793, + "step": 4452 + }, + { + "epoch": 0.6860223334616865, + "grad_norm": 0.8549447655677795, + "learning_rate": 2.2407858783336576e-05, + "loss": 0.7412, + "step": 4454 + }, + { + "epoch": 0.6863303812090874, + "grad_norm": 0.7615169286727905, + "learning_rate": 2.2367515499533954e-05, + "loss": 1.2909, + "step": 4456 + }, + { + "epoch": 0.6866384289564883, + "grad_norm": 1.0264763832092285, + "learning_rate": 2.2327198099208307e-05, + "loss": 2.6198, + "step": 4458 + }, + { + "epoch": 0.6869464767038891, + "grad_norm": 1.1352583169937134, + "learning_rate": 2.228690662012514e-05, + "loss": 1.6127, + "step": 4460 + }, + { + "epoch": 0.68725452445129, + "grad_norm": 0.912841260433197, + "learning_rate": 2.2246641100025667e-05, + "loss": 1.0174, + "step": 4462 + }, + { + "epoch": 0.6875625721986908, + "grad_norm": 0.7334194779396057, + "learning_rate": 2.220640157662679e-05, + "loss": 2.2198, + "step": 4464 + }, + { + "epoch": 0.6878706199460917, + "grad_norm": 0.5997360944747925, + "learning_rate": 2.2166188087621054e-05, + "loss": 0.8775, + "step": 4466 + }, + { + "epoch": 0.6881786676934925, + "grad_norm": 0.8840118050575256, + "learning_rate": 2.2126000670676627e-05, + "loss": 0.7739, + "step": 4468 + }, + { + "epoch": 0.6884867154408933, + "grad_norm": 0.9735852479934692, + "learning_rate": 2.2085839363437244e-05, + "loss": 0.8629, + "step": 4470 + }, + { + "epoch": 0.6887947631882941, + "grad_norm": 0.5354695320129395, + "learning_rate": 2.2045704203522192e-05, + "loss": 1.7126, + "step": 4472 + }, + { + "epoch": 0.689102810935695, + "grad_norm": 0.7044001817703247, + "learning_rate": 2.2005595228526265e-05, + "loss": 2.1124, + "step": 4474 + }, + { + "epoch": 0.6894108586830959, + "grad_norm": 1.0412694215774536, + "learning_rate": 2.1965512476019724e-05, + "loss": 1.3892, + "step": 4476 + }, + { + "epoch": 0.6897189064304967, + "grad_norm": 0.6908366680145264, + "learning_rate": 2.1925455983548264e-05, + "loss": 0.9544, + "step": 4478 + }, + { + "epoch": 0.6900269541778976, + "grad_norm": 0.7264195680618286, + "learning_rate": 2.1885425788633e-05, + "loss": 1.0133, + "step": 4480 + }, + { + "epoch": 0.6903350019252984, + "grad_norm": 0.9055928587913513, + "learning_rate": 2.1845421928770393e-05, + "loss": 0.9205, + "step": 4482 + }, + { + "epoch": 0.6906430496726993, + "grad_norm": 0.5684531331062317, + "learning_rate": 2.1805444441432234e-05, + "loss": 0.8977, + "step": 4484 + }, + { + "epoch": 0.6909510974201001, + "grad_norm": 0.9478229880332947, + "learning_rate": 2.1765493364065665e-05, + "loss": 0.9969, + "step": 4486 + }, + { + "epoch": 0.691259145167501, + "grad_norm": 0.8816278576850891, + "learning_rate": 2.1725568734093e-05, + "loss": 0.9541, + "step": 4488 + }, + { + "epoch": 0.6915671929149018, + "grad_norm": 0.7542193531990051, + "learning_rate": 2.1685670588911843e-05, + "loss": 0.9754, + "step": 4490 + }, + { + "epoch": 0.6918752406623027, + "grad_norm": 0.7939161062240601, + "learning_rate": 2.1645798965894953e-05, + "loss": 1.3108, + "step": 4492 + }, + { + "epoch": 0.6921832884097036, + "grad_norm": 0.9942681193351746, + "learning_rate": 2.1605953902390326e-05, + "loss": 1.076, + "step": 4494 + }, + { + "epoch": 0.6924913361571043, + "grad_norm": 0.7257193326950073, + "learning_rate": 2.1566135435720954e-05, + "loss": 0.8966, + "step": 4496 + }, + { + "epoch": 0.6927993839045052, + "grad_norm": 0.8092132806777954, + "learning_rate": 2.1526343603184984e-05, + "loss": 1.0302, + "step": 4498 + }, + { + "epoch": 0.693107431651906, + "grad_norm": 0.7978146076202393, + "learning_rate": 2.1486578442055672e-05, + "loss": 0.8256, + "step": 4500 + }, + { + "epoch": 0.693107431651906, + "eval_loss": 2.392524003982544, + "eval_runtime": 736.2048, + "eval_samples_per_second": 2.717, + "eval_steps_per_second": 0.679, + "step": 4500 + }, + { + "epoch": 0.6934154793993069, + "grad_norm": 0.7591894865036011, + "learning_rate": 2.1446839989581165e-05, + "loss": 1.0619, + "step": 4502 + }, + { + "epoch": 0.6937235271467077, + "grad_norm": 0.8062347173690796, + "learning_rate": 2.1407128282984662e-05, + "loss": 0.8574, + "step": 4504 + }, + { + "epoch": 0.6940315748941086, + "grad_norm": 0.843224823474884, + "learning_rate": 2.136744335946434e-05, + "loss": 1.0387, + "step": 4506 + }, + { + "epoch": 0.6943396226415094, + "grad_norm": 0.7246967554092407, + "learning_rate": 2.132778525619325e-05, + "loss": 0.956, + "step": 4508 + }, + { + "epoch": 0.6946476703889103, + "grad_norm": 0.8044782280921936, + "learning_rate": 2.128815401031929e-05, + "loss": 1.076, + "step": 4510 + }, + { + "epoch": 0.6949557181363111, + "grad_norm": 0.8453338742256165, + "learning_rate": 2.124854965896522e-05, + "loss": 0.9274, + "step": 4512 + }, + { + "epoch": 0.695263765883712, + "grad_norm": 0.7545700073242188, + "learning_rate": 2.1208972239228674e-05, + "loss": 0.9612, + "step": 4514 + }, + { + "epoch": 0.6955718136311129, + "grad_norm": 0.7990420460700989, + "learning_rate": 2.1169421788181998e-05, + "loss": 1.0143, + "step": 4516 + }, + { + "epoch": 0.6958798613785137, + "grad_norm": 0.6616374850273132, + "learning_rate": 2.112989834287223e-05, + "loss": 1.4766, + "step": 4518 + }, + { + "epoch": 0.6961879091259146, + "grad_norm": 0.9233276844024658, + "learning_rate": 2.1090401940321212e-05, + "loss": 1.2026, + "step": 4520 + }, + { + "epoch": 0.6964959568733153, + "grad_norm": 0.7011730074882507, + "learning_rate": 2.1050932617525406e-05, + "loss": 1.139, + "step": 4522 + }, + { + "epoch": 0.6968040046207162, + "grad_norm": 1.0608782768249512, + "learning_rate": 2.1011490411455893e-05, + "loss": 0.9284, + "step": 4524 + }, + { + "epoch": 0.697112052368117, + "grad_norm": 0.7545628547668457, + "learning_rate": 2.0972075359058378e-05, + "loss": 0.8357, + "step": 4526 + }, + { + "epoch": 0.6974201001155179, + "grad_norm": 0.8437629342079163, + "learning_rate": 2.0932687497253133e-05, + "loss": 0.87, + "step": 4528 + }, + { + "epoch": 0.6977281478629187, + "grad_norm": 0.836249828338623, + "learning_rate": 2.0893326862934957e-05, + "loss": 0.83, + "step": 4530 + }, + { + "epoch": 0.6980361956103196, + "grad_norm": 0.8689998984336853, + "learning_rate": 2.0853993492973102e-05, + "loss": 1.1421, + "step": 4532 + }, + { + "epoch": 0.6983442433577205, + "grad_norm": 0.8435835242271423, + "learning_rate": 2.081468742421135e-05, + "loss": 1.7831, + "step": 4534 + }, + { + "epoch": 0.6986522911051213, + "grad_norm": 0.8725165128707886, + "learning_rate": 2.077540869346788e-05, + "loss": 0.9537, + "step": 4536 + }, + { + "epoch": 0.6989603388525222, + "grad_norm": 0.9655625820159912, + "learning_rate": 2.073615733753525e-05, + "loss": 1.0886, + "step": 4538 + }, + { + "epoch": 0.699268386599923, + "grad_norm": 0.9760432839393616, + "learning_rate": 2.0696933393180397e-05, + "loss": 1.1564, + "step": 4540 + }, + { + "epoch": 0.6995764343473239, + "grad_norm": 0.8607039451599121, + "learning_rate": 2.0657736897144564e-05, + "loss": 0.8723, + "step": 4542 + }, + { + "epoch": 0.6998844820947246, + "grad_norm": 0.7545386552810669, + "learning_rate": 2.0618567886143297e-05, + "loss": 2.6288, + "step": 4544 + }, + { + "epoch": 0.7001925298421255, + "grad_norm": 0.6643165946006775, + "learning_rate": 2.057942639686639e-05, + "loss": 0.8196, + "step": 4546 + }, + { + "epoch": 0.7005005775895263, + "grad_norm": 0.8222883939743042, + "learning_rate": 2.0540312465977863e-05, + "loss": 0.7392, + "step": 4548 + }, + { + "epoch": 0.7008086253369272, + "grad_norm": 0.9492761492729187, + "learning_rate": 2.050122613011591e-05, + "loss": 1.0447, + "step": 4550 + }, + { + "epoch": 0.7011166730843281, + "grad_norm": 0.6157514452934265, + "learning_rate": 2.046216742589288e-05, + "loss": 0.7543, + "step": 4552 + }, + { + "epoch": 0.7014247208317289, + "grad_norm": 0.5949583053588867, + "learning_rate": 2.042313638989526e-05, + "loss": 1.6896, + "step": 4554 + }, + { + "epoch": 0.7017327685791298, + "grad_norm": 0.8937816023826599, + "learning_rate": 2.0384133058683585e-05, + "loss": 1.0241, + "step": 4556 + }, + { + "epoch": 0.7020408163265306, + "grad_norm": 0.8714761137962341, + "learning_rate": 2.034515746879247e-05, + "loss": 1.918, + "step": 4558 + }, + { + "epoch": 0.7023488640739315, + "grad_norm": 0.6842667460441589, + "learning_rate": 2.0306209656730523e-05, + "loss": 0.8398, + "step": 4560 + }, + { + "epoch": 0.7026569118213323, + "grad_norm": 0.8343141674995422, + "learning_rate": 2.026728965898035e-05, + "loss": 0.8353, + "step": 4562 + }, + { + "epoch": 0.7029649595687332, + "grad_norm": 0.9512957334518433, + "learning_rate": 2.0228397511998463e-05, + "loss": 0.8459, + "step": 4564 + }, + { + "epoch": 0.703273007316134, + "grad_norm": 0.8050121068954468, + "learning_rate": 2.0189533252215387e-05, + "loss": 0.9695, + "step": 4566 + }, + { + "epoch": 0.7035810550635349, + "grad_norm": 0.784359335899353, + "learning_rate": 2.0150696916035388e-05, + "loss": 0.9633, + "step": 4568 + }, + { + "epoch": 0.7038891028109356, + "grad_norm": 0.8005040884017944, + "learning_rate": 2.011188853983667e-05, + "loss": 0.8726, + "step": 4570 + }, + { + "epoch": 0.7041971505583365, + "grad_norm": 0.8692052960395813, + "learning_rate": 2.0073108159971193e-05, + "loss": 2.3908, + "step": 4572 + }, + { + "epoch": 0.7045051983057374, + "grad_norm": 0.8023272156715393, + "learning_rate": 2.003435581276479e-05, + "loss": 0.8949, + "step": 4574 + }, + { + "epoch": 0.7048132460531382, + "grad_norm": 0.7615310549736023, + "learning_rate": 1.999563153451689e-05, + "loss": 1.2118, + "step": 4576 + }, + { + "epoch": 0.7051212938005391, + "grad_norm": 1.0422707796096802, + "learning_rate": 1.9956935361500717e-05, + "loss": 1.0129, + "step": 4578 + }, + { + "epoch": 0.7054293415479399, + "grad_norm": 0.907596230506897, + "learning_rate": 1.991826732996319e-05, + "loss": 0.9199, + "step": 4580 + }, + { + "epoch": 0.7057373892953408, + "grad_norm": 1.3769018650054932, + "learning_rate": 1.987962747612484e-05, + "loss": 1.0924, + "step": 4582 + }, + { + "epoch": 0.7060454370427416, + "grad_norm": 0.8509035110473633, + "learning_rate": 1.9841015836179734e-05, + "loss": 1.9078, + "step": 4584 + }, + { + "epoch": 0.7063534847901425, + "grad_norm": 1.1849422454833984, + "learning_rate": 1.980243244629564e-05, + "loss": 0.9282, + "step": 4586 + }, + { + "epoch": 0.7066615325375433, + "grad_norm": 1.1115151643753052, + "learning_rate": 1.9763877342613785e-05, + "loss": 0.9941, + "step": 4588 + }, + { + "epoch": 0.7069695802849442, + "grad_norm": 0.684323251247406, + "learning_rate": 1.972535056124889e-05, + "loss": 2.0089, + "step": 4590 + }, + { + "epoch": 0.7072776280323451, + "grad_norm": 0.6070319414138794, + "learning_rate": 1.9686852138289162e-05, + "loss": 0.8668, + "step": 4592 + }, + { + "epoch": 0.7075856757797458, + "grad_norm": 0.5560675263404846, + "learning_rate": 1.9648382109796304e-05, + "loss": 0.9991, + "step": 4594 + }, + { + "epoch": 0.7078937235271467, + "grad_norm": 1.0245397090911865, + "learning_rate": 1.9609940511805353e-05, + "loss": 1.0232, + "step": 4596 + }, + { + "epoch": 0.7082017712745475, + "grad_norm": 1.1413053274154663, + "learning_rate": 1.957152738032469e-05, + "loss": 1.6201, + "step": 4598 + }, + { + "epoch": 0.7085098190219484, + "grad_norm": 0.8900070190429688, + "learning_rate": 1.9533142751336126e-05, + "loss": 0.7824, + "step": 4600 + }, + { + "epoch": 0.7088178667693492, + "grad_norm": 0.9296131730079651, + "learning_rate": 1.9494786660794702e-05, + "loss": 0.9638, + "step": 4602 + }, + { + "epoch": 0.7091259145167501, + "grad_norm": 0.848101019859314, + "learning_rate": 1.9456459144628765e-05, + "loss": 1.0311, + "step": 4604 + }, + { + "epoch": 0.7094339622641509, + "grad_norm": 0.7903195023536682, + "learning_rate": 1.941816023873983e-05, + "loss": 1.0481, + "step": 4606 + }, + { + "epoch": 0.7097420100115518, + "grad_norm": 0.7192170023918152, + "learning_rate": 1.9379889979002704e-05, + "loss": 0.8418, + "step": 4608 + }, + { + "epoch": 0.7100500577589527, + "grad_norm": 0.8225452303886414, + "learning_rate": 1.9341648401265307e-05, + "loss": 0.9304, + "step": 4610 + }, + { + "epoch": 0.7103581055063535, + "grad_norm": 0.8312641382217407, + "learning_rate": 1.9303435541348695e-05, + "loss": 1.0101, + "step": 4612 + }, + { + "epoch": 0.7106661532537544, + "grad_norm": 0.8498125076293945, + "learning_rate": 1.9265251435047044e-05, + "loss": 0.9582, + "step": 4614 + }, + { + "epoch": 0.7109742010011552, + "grad_norm": 0.6809362173080444, + "learning_rate": 1.922709611812758e-05, + "loss": 1.0944, + "step": 4616 + }, + { + "epoch": 0.7112822487485561, + "grad_norm": 0.7999347448348999, + "learning_rate": 1.918896962633056e-05, + "loss": 1.1626, + "step": 4618 + }, + { + "epoch": 0.7115902964959568, + "grad_norm": 0.6605939865112305, + "learning_rate": 1.915087199536925e-05, + "loss": 1.0525, + "step": 4620 + }, + { + "epoch": 0.7118983442433577, + "grad_norm": 0.6720040440559387, + "learning_rate": 1.9112803260929884e-05, + "loss": 1.1791, + "step": 4622 + }, + { + "epoch": 0.7122063919907585, + "grad_norm": 0.9006309509277344, + "learning_rate": 1.907476345867162e-05, + "loss": 0.9723, + "step": 4624 + }, + { + "epoch": 0.7125144397381594, + "grad_norm": 0.7742815613746643, + "learning_rate": 1.9036752624226506e-05, + "loss": 0.7847, + "step": 4626 + }, + { + "epoch": 0.7128224874855602, + "grad_norm": 0.7233114838600159, + "learning_rate": 1.899877079319949e-05, + "loss": 2.2617, + "step": 4628 + }, + { + "epoch": 0.7131305352329611, + "grad_norm": 0.6334021091461182, + "learning_rate": 1.8960818001168308e-05, + "loss": 0.9077, + "step": 4630 + }, + { + "epoch": 0.713438582980362, + "grad_norm": 1.047119140625, + "learning_rate": 1.8922894283683533e-05, + "loss": 1.5837, + "step": 4632 + }, + { + "epoch": 0.7137466307277628, + "grad_norm": 0.7595541477203369, + "learning_rate": 1.8884999676268476e-05, + "loss": 1.6919, + "step": 4634 + }, + { + "epoch": 0.7140546784751637, + "grad_norm": 0.6998812556266785, + "learning_rate": 1.8847134214419205e-05, + "loss": 1.0882, + "step": 4636 + }, + { + "epoch": 0.7143627262225645, + "grad_norm": 0.9410682916641235, + "learning_rate": 1.8809297933604446e-05, + "loss": 0.8349, + "step": 4638 + }, + { + "epoch": 0.7146707739699654, + "grad_norm": 0.8228951692581177, + "learning_rate": 1.8771490869265686e-05, + "loss": 0.978, + "step": 4640 + }, + { + "epoch": 0.7149788217173662, + "grad_norm": 0.7356462478637695, + "learning_rate": 1.8733713056816905e-05, + "loss": 1.5331, + "step": 4642 + }, + { + "epoch": 0.715286869464767, + "grad_norm": 0.7669535279273987, + "learning_rate": 1.869596453164479e-05, + "loss": 0.9988, + "step": 4644 + }, + { + "epoch": 0.7155949172121678, + "grad_norm": 0.7517581582069397, + "learning_rate": 1.8658245329108553e-05, + "loss": 0.8652, + "step": 4646 + }, + { + "epoch": 0.7159029649595687, + "grad_norm": 0.8972408771514893, + "learning_rate": 1.862055548453995e-05, + "loss": 1.0223, + "step": 4648 + }, + { + "epoch": 0.7162110127069696, + "grad_norm": 0.7768507599830627, + "learning_rate": 1.8582895033243232e-05, + "loss": 0.7865, + "step": 4650 + }, + { + "epoch": 0.7165190604543704, + "grad_norm": 0.831098735332489, + "learning_rate": 1.8545264010495106e-05, + "loss": 1.0744, + "step": 4652 + }, + { + "epoch": 0.7168271082017713, + "grad_norm": 0.7342033982276917, + "learning_rate": 1.8507662451544772e-05, + "loss": 0.9346, + "step": 4654 + }, + { + "epoch": 0.7171351559491721, + "grad_norm": 0.8174619674682617, + "learning_rate": 1.8470090391613737e-05, + "loss": 0.8898, + "step": 4656 + }, + { + "epoch": 0.717443203696573, + "grad_norm": 0.7861392498016357, + "learning_rate": 1.8432547865895926e-05, + "loss": 0.9919, + "step": 4658 + }, + { + "epoch": 0.7177512514439738, + "grad_norm": 0.8522654175758362, + "learning_rate": 1.839503490955763e-05, + "loss": 0.9136, + "step": 4660 + }, + { + "epoch": 0.7180592991913747, + "grad_norm": 1.045430302619934, + "learning_rate": 1.8357551557737407e-05, + "loss": 1.2186, + "step": 4662 + }, + { + "epoch": 0.7183673469387755, + "grad_norm": 0.9019323587417603, + "learning_rate": 1.8320097845546058e-05, + "loss": 0.9851, + "step": 4664 + }, + { + "epoch": 0.7186753946861764, + "grad_norm": 0.724032998085022, + "learning_rate": 1.8282673808066653e-05, + "loss": 2.2028, + "step": 4666 + }, + { + "epoch": 0.7189834424335773, + "grad_norm": 0.8869044184684753, + "learning_rate": 1.8245279480354504e-05, + "loss": 0.9707, + "step": 4668 + }, + { + "epoch": 0.719291490180978, + "grad_norm": 0.9228594303131104, + "learning_rate": 1.8207914897437005e-05, + "loss": 0.9238, + "step": 4670 + }, + { + "epoch": 0.7195995379283789, + "grad_norm": 0.7796034216880798, + "learning_rate": 1.8170580094313738e-05, + "loss": 1.0092, + "step": 4672 + }, + { + "epoch": 0.7199075856757797, + "grad_norm": 0.6384996175765991, + "learning_rate": 1.813327510595642e-05, + "loss": 1.9974, + "step": 4674 + }, + { + "epoch": 0.7202156334231806, + "grad_norm": 0.6626761555671692, + "learning_rate": 1.80959999673088e-05, + "loss": 0.7973, + "step": 4676 + }, + { + "epoch": 0.7205236811705814, + "grad_norm": 0.8023983240127563, + "learning_rate": 1.8058754713286636e-05, + "loss": 1.0762, + "step": 4678 + }, + { + "epoch": 0.7208317289179823, + "grad_norm": 1.1368682384490967, + "learning_rate": 1.802153937877777e-05, + "loss": 0.8167, + "step": 4680 + }, + { + "epoch": 0.7211397766653831, + "grad_norm": 1.2227729558944702, + "learning_rate": 1.7984353998641973e-05, + "loss": 1.1872, + "step": 4682 + }, + { + "epoch": 0.721447824412784, + "grad_norm": 0.8481996059417725, + "learning_rate": 1.794719860771097e-05, + "loss": 0.9309, + "step": 4684 + }, + { + "epoch": 0.7217558721601848, + "grad_norm": 0.708870530128479, + "learning_rate": 1.7910073240788346e-05, + "loss": 1.3142, + "step": 4686 + }, + { + "epoch": 0.7220639199075857, + "grad_norm": 0.9485694169998169, + "learning_rate": 1.787297793264965e-05, + "loss": 0.9818, + "step": 4688 + }, + { + "epoch": 0.7223719676549866, + "grad_norm": 0.9983515739440918, + "learning_rate": 1.7835912718042212e-05, + "loss": 0.934, + "step": 4690 + }, + { + "epoch": 0.7226800154023874, + "grad_norm": 0.9369632005691528, + "learning_rate": 1.7798877631685202e-05, + "loss": 0.9286, + "step": 4692 + }, + { + "epoch": 0.7229880631497883, + "grad_norm": 0.730244517326355, + "learning_rate": 1.776187270826955e-05, + "loss": 0.9192, + "step": 4694 + }, + { + "epoch": 0.723296110897189, + "grad_norm": 0.5943278670310974, + "learning_rate": 1.7724897982457946e-05, + "loss": 0.8079, + "step": 4696 + }, + { + "epoch": 0.7236041586445899, + "grad_norm": 0.88278728723526, + "learning_rate": 1.7687953488884773e-05, + "loss": 0.8272, + "step": 4698 + }, + { + "epoch": 0.7239122063919907, + "grad_norm": 0.6160874962806702, + "learning_rate": 1.7651039262156126e-05, + "loss": 0.9233, + "step": 4700 + }, + { + "epoch": 0.7242202541393916, + "grad_norm": 0.5485923886299133, + "learning_rate": 1.761415533684973e-05, + "loss": 0.6803, + "step": 4702 + }, + { + "epoch": 0.7245283018867924, + "grad_norm": 0.7884142398834229, + "learning_rate": 1.7577301747514922e-05, + "loss": 0.9881, + "step": 4704 + }, + { + "epoch": 0.7248363496341933, + "grad_norm": 0.7269513010978699, + "learning_rate": 1.7540478528672645e-05, + "loss": 0.909, + "step": 4706 + }, + { + "epoch": 0.7251443973815942, + "grad_norm": 0.8242033123970032, + "learning_rate": 1.750368571481536e-05, + "loss": 0.7748, + "step": 4708 + }, + { + "epoch": 0.725452445128995, + "grad_norm": 0.6146319508552551, + "learning_rate": 1.7466923340407088e-05, + "loss": 1.8569, + "step": 4710 + }, + { + "epoch": 0.7257604928763959, + "grad_norm": 0.825018048286438, + "learning_rate": 1.7430191439883298e-05, + "loss": 0.9397, + "step": 4712 + }, + { + "epoch": 0.7260685406237967, + "grad_norm": 1.081148624420166, + "learning_rate": 1.7393490047650944e-05, + "loss": 0.9502, + "step": 4714 + }, + { + "epoch": 0.7263765883711976, + "grad_norm": 0.7197492718696594, + "learning_rate": 1.735681919808839e-05, + "loss": 0.9597, + "step": 4716 + }, + { + "epoch": 0.7266846361185983, + "grad_norm": 0.6531423330307007, + "learning_rate": 1.7320178925545387e-05, + "loss": 0.7852, + "step": 4718 + }, + { + "epoch": 0.7269926838659992, + "grad_norm": 0.6706268191337585, + "learning_rate": 1.728356926434306e-05, + "loss": 0.9987, + "step": 4720 + }, + { + "epoch": 0.7273007316134, + "grad_norm": 0.7522306442260742, + "learning_rate": 1.724699024877383e-05, + "loss": 1.0304, + "step": 4722 + }, + { + "epoch": 0.7276087793608009, + "grad_norm": 0.8848979473114014, + "learning_rate": 1.721044191310145e-05, + "loss": 0.9923, + "step": 4724 + }, + { + "epoch": 0.7279168271082018, + "grad_norm": 0.7735690474510193, + "learning_rate": 1.71739242915609e-05, + "loss": 2.0829, + "step": 4726 + }, + { + "epoch": 0.7282248748556026, + "grad_norm": 0.6652183532714844, + "learning_rate": 1.713743741835842e-05, + "loss": 1.0008, + "step": 4728 + }, + { + "epoch": 0.7285329226030035, + "grad_norm": 0.9963399171829224, + "learning_rate": 1.710098132767143e-05, + "loss": 0.9404, + "step": 4730 + }, + { + "epoch": 0.7288409703504043, + "grad_norm": 0.647809624671936, + "learning_rate": 1.70645560536485e-05, + "loss": 1.0353, + "step": 4732 + }, + { + "epoch": 0.7291490180978052, + "grad_norm": 0.8186082243919373, + "learning_rate": 1.7028161630409405e-05, + "loss": 1.0937, + "step": 4734 + }, + { + "epoch": 0.729457065845206, + "grad_norm": 0.7456851005554199, + "learning_rate": 1.699179809204493e-05, + "loss": 1.1491, + "step": 4736 + }, + { + "epoch": 0.7297651135926069, + "grad_norm": 0.9909383058547974, + "learning_rate": 1.6955465472616973e-05, + "loss": 0.9543, + "step": 4738 + }, + { + "epoch": 0.7300731613400077, + "grad_norm": 0.7538608908653259, + "learning_rate": 1.6919163806158455e-05, + "loss": 0.7093, + "step": 4740 + }, + { + "epoch": 0.7303812090874086, + "grad_norm": 0.6593775153160095, + "learning_rate": 1.6882893126673372e-05, + "loss": 0.7995, + "step": 4742 + }, + { + "epoch": 0.7306892568348093, + "grad_norm": 0.8437679409980774, + "learning_rate": 1.6846653468136588e-05, + "loss": 0.9412, + "step": 4744 + }, + { + "epoch": 0.7309973045822102, + "grad_norm": 1.0077937841415405, + "learning_rate": 1.681044486449395e-05, + "loss": 0.8965, + "step": 4746 + }, + { + "epoch": 0.7313053523296111, + "grad_norm": 1.157778024673462, + "learning_rate": 1.6774267349662274e-05, + "loss": 1.1927, + "step": 4748 + }, + { + "epoch": 0.7316134000770119, + "grad_norm": 1.2045066356658936, + "learning_rate": 1.6738120957529198e-05, + "loss": 0.9724, + "step": 4750 + }, + { + "epoch": 0.7319214478244128, + "grad_norm": 0.8141582608222961, + "learning_rate": 1.670200572195316e-05, + "loss": 1.0615, + "step": 4752 + }, + { + "epoch": 0.7322294955718136, + "grad_norm": 0.6798232793807983, + "learning_rate": 1.6665921676763536e-05, + "loss": 1.8187, + "step": 4754 + }, + { + "epoch": 0.7325375433192145, + "grad_norm": 0.8349108695983887, + "learning_rate": 1.6629868855760406e-05, + "loss": 1.0343, + "step": 4756 + }, + { + "epoch": 0.7328455910666153, + "grad_norm": 0.6482349038124084, + "learning_rate": 1.6593847292714582e-05, + "loss": 0.9633, + "step": 4758 + }, + { + "epoch": 0.7331536388140162, + "grad_norm": 0.6155962944030762, + "learning_rate": 1.655785702136764e-05, + "loss": 0.8955, + "step": 4760 + }, + { + "epoch": 0.733461686561417, + "grad_norm": 1.1165225505828857, + "learning_rate": 1.6521898075431858e-05, + "loss": 1.091, + "step": 4762 + }, + { + "epoch": 0.7337697343088179, + "grad_norm": 0.5706340670585632, + "learning_rate": 1.648597048859015e-05, + "loss": 0.8817, + "step": 4764 + }, + { + "epoch": 0.7340777820562188, + "grad_norm": 0.8465478420257568, + "learning_rate": 1.645007429449601e-05, + "loss": 1.3064, + "step": 4766 + }, + { + "epoch": 0.7343858298036195, + "grad_norm": 0.8700278997421265, + "learning_rate": 1.6414209526773616e-05, + "loss": 1.0423, + "step": 4768 + }, + { + "epoch": 0.7346938775510204, + "grad_norm": 0.6647862792015076, + "learning_rate": 1.6378376219017648e-05, + "loss": 0.81, + "step": 4770 + }, + { + "epoch": 0.7350019252984212, + "grad_norm": 0.5894257426261902, + "learning_rate": 1.6342574404793326e-05, + "loss": 1.8308, + "step": 4772 + }, + { + "epoch": 0.7353099730458221, + "grad_norm": 0.8121969103813171, + "learning_rate": 1.630680411763639e-05, + "loss": 1.0954, + "step": 4774 + }, + { + "epoch": 0.7356180207932229, + "grad_norm": 0.8044182658195496, + "learning_rate": 1.6271065391053013e-05, + "loss": 0.9867, + "step": 4776 + }, + { + "epoch": 0.7359260685406238, + "grad_norm": 0.7796005606651306, + "learning_rate": 1.623535825851985e-05, + "loss": 0.9733, + "step": 4778 + }, + { + "epoch": 0.7362341162880246, + "grad_norm": 0.6308910846710205, + "learning_rate": 1.6199682753483926e-05, + "loss": 1.0157, + "step": 4780 + }, + { + "epoch": 0.7365421640354255, + "grad_norm": 0.8022951483726501, + "learning_rate": 1.6164038909362656e-05, + "loss": 1.0141, + "step": 4782 + }, + { + "epoch": 0.7368502117828264, + "grad_norm": 0.6543718576431274, + "learning_rate": 1.6128426759543792e-05, + "loss": 1.0184, + "step": 4784 + }, + { + "epoch": 0.7371582595302272, + "grad_norm": 0.6891601085662842, + "learning_rate": 1.60928463373854e-05, + "loss": 0.875, + "step": 4786 + }, + { + "epoch": 0.7374663072776281, + "grad_norm": 0.9390774965286255, + "learning_rate": 1.6057297676215832e-05, + "loss": 1.0448, + "step": 4788 + }, + { + "epoch": 0.7377743550250289, + "grad_norm": 0.9003983736038208, + "learning_rate": 1.602178080933368e-05, + "loss": 1.0639, + "step": 4790 + }, + { + "epoch": 0.7380824027724298, + "grad_norm": 0.7824626564979553, + "learning_rate": 1.5986295770007765e-05, + "loss": 1.0726, + "step": 4792 + }, + { + "epoch": 0.7383904505198305, + "grad_norm": 0.7979996800422668, + "learning_rate": 1.5950842591477084e-05, + "loss": 0.8944, + "step": 4794 + }, + { + "epoch": 0.7386984982672314, + "grad_norm": 1.148526906967163, + "learning_rate": 1.591542130695079e-05, + "loss": 1.0637, + "step": 4796 + }, + { + "epoch": 0.7390065460146322, + "grad_norm": 0.7048614025115967, + "learning_rate": 1.588003194960817e-05, + "loss": 0.7536, + "step": 4798 + }, + { + "epoch": 0.7393145937620331, + "grad_norm": 0.6544722318649292, + "learning_rate": 1.584467455259861e-05, + "loss": 0.8258, + "step": 4800 + }, + { + "epoch": 0.7393145937620331, + "eval_loss": 2.364302396774292, + "eval_runtime": 737.0622, + "eval_samples_per_second": 2.713, + "eval_steps_per_second": 0.678, + "step": 4800 + }, + { + "epoch": 0.7396226415094339, + "grad_norm": 0.7770758867263794, + "learning_rate": 1.580934914904153e-05, + "loss": 0.903, + "step": 4802 + }, + { + "epoch": 0.7399306892568348, + "grad_norm": 0.7111145853996277, + "learning_rate": 1.5774055772026407e-05, + "loss": 0.8476, + "step": 4804 + }, + { + "epoch": 0.7402387370042357, + "grad_norm": 0.6309933066368103, + "learning_rate": 1.5738794454612703e-05, + "loss": 0.8852, + "step": 4806 + }, + { + "epoch": 0.7405467847516365, + "grad_norm": 0.6851494908332825, + "learning_rate": 1.5703565229829902e-05, + "loss": 0.9008, + "step": 4808 + }, + { + "epoch": 0.7408548324990374, + "grad_norm": 0.9459416270256042, + "learning_rate": 1.566836813067733e-05, + "loss": 0.968, + "step": 4810 + }, + { + "epoch": 0.7411628802464382, + "grad_norm": 0.8359102010726929, + "learning_rate": 1.563320319012428e-05, + "loss": 0.9852, + "step": 4812 + }, + { + "epoch": 0.7414709279938391, + "grad_norm": 0.7026764750480652, + "learning_rate": 1.5598070441109965e-05, + "loss": 0.8269, + "step": 4814 + }, + { + "epoch": 0.7417789757412399, + "grad_norm": 0.7479056715965271, + "learning_rate": 1.5562969916543336e-05, + "loss": 0.8847, + "step": 4816 + }, + { + "epoch": 0.7420870234886408, + "grad_norm": 0.9053459167480469, + "learning_rate": 1.552790164930324e-05, + "loss": 0.7555, + "step": 4818 + }, + { + "epoch": 0.7423950712360415, + "grad_norm": 0.7266227006912231, + "learning_rate": 1.5492865672238276e-05, + "loss": 1.1028, + "step": 4820 + }, + { + "epoch": 0.7427031189834424, + "grad_norm": 0.8516717553138733, + "learning_rate": 1.5457862018166847e-05, + "loss": 1.2673, + "step": 4822 + }, + { + "epoch": 0.7430111667308433, + "grad_norm": 0.6059942245483398, + "learning_rate": 1.5422890719877e-05, + "loss": 0.8081, + "step": 4824 + }, + { + "epoch": 0.7433192144782441, + "grad_norm": 0.8077753186225891, + "learning_rate": 1.53879518101265e-05, + "loss": 0.9809, + "step": 4826 + }, + { + "epoch": 0.743627262225645, + "grad_norm": 0.7996209859848022, + "learning_rate": 1.535304532164283e-05, + "loss": 0.8094, + "step": 4828 + }, + { + "epoch": 0.7439353099730458, + "grad_norm": 0.7710662484169006, + "learning_rate": 1.531817128712305e-05, + "loss": 1.0587, + "step": 4830 + }, + { + "epoch": 0.7442433577204467, + "grad_norm": 0.792667031288147, + "learning_rate": 1.5283329739233808e-05, + "loss": 0.9356, + "step": 4832 + }, + { + "epoch": 0.7445514054678475, + "grad_norm": 0.5613619089126587, + "learning_rate": 1.5248520710611347e-05, + "loss": 0.8864, + "step": 4834 + }, + { + "epoch": 0.7448594532152484, + "grad_norm": 0.9855058789253235, + "learning_rate": 1.5213744233861465e-05, + "loss": 0.9429, + "step": 4836 + }, + { + "epoch": 0.7451675009626492, + "grad_norm": 0.7374126315116882, + "learning_rate": 1.5179000341559463e-05, + "loss": 0.8766, + "step": 4838 + }, + { + "epoch": 0.7454755487100501, + "grad_norm": 0.9540923237800598, + "learning_rate": 1.5144289066250045e-05, + "loss": 0.9494, + "step": 4840 + }, + { + "epoch": 0.745783596457451, + "grad_norm": 0.9879464507102966, + "learning_rate": 1.5109610440447486e-05, + "loss": 0.8697, + "step": 4842 + }, + { + "epoch": 0.7460916442048517, + "grad_norm": 0.9068211913108826, + "learning_rate": 1.5074964496635407e-05, + "loss": 1.1991, + "step": 4844 + }, + { + "epoch": 0.7463996919522526, + "grad_norm": 0.9240580797195435, + "learning_rate": 1.5040351267266783e-05, + "loss": 0.8796, + "step": 4846 + }, + { + "epoch": 0.7467077396996534, + "grad_norm": 0.7835018038749695, + "learning_rate": 1.5005770784764034e-05, + "loss": 0.914, + "step": 4848 + }, + { + "epoch": 0.7470157874470543, + "grad_norm": 0.9912784695625305, + "learning_rate": 1.4971223081518837e-05, + "loss": 1.184, + "step": 4850 + }, + { + "epoch": 0.7473238351944551, + "grad_norm": 0.8975262641906738, + "learning_rate": 1.4936708189892212e-05, + "loss": 0.8886, + "step": 4852 + }, + { + "epoch": 0.747631882941856, + "grad_norm": 0.7419629096984863, + "learning_rate": 1.4902226142214366e-05, + "loss": 1.5564, + "step": 4854 + }, + { + "epoch": 0.7479399306892568, + "grad_norm": 0.6214430332183838, + "learning_rate": 1.4867776970784836e-05, + "loss": 0.9269, + "step": 4856 + }, + { + "epoch": 0.7482479784366577, + "grad_norm": 0.6790211796760559, + "learning_rate": 1.4833360707872319e-05, + "loss": 0.9317, + "step": 4858 + }, + { + "epoch": 0.7485560261840585, + "grad_norm": 0.8692981600761414, + "learning_rate": 1.479897738571468e-05, + "loss": 0.9814, + "step": 4860 + }, + { + "epoch": 0.7488640739314594, + "grad_norm": 0.8824407458305359, + "learning_rate": 1.4764627036518936e-05, + "loss": 0.9181, + "step": 4862 + }, + { + "epoch": 0.7491721216788603, + "grad_norm": 1.0911660194396973, + "learning_rate": 1.473030969246122e-05, + "loss": 0.9681, + "step": 4864 + }, + { + "epoch": 0.7494801694262611, + "grad_norm": 0.7026270627975464, + "learning_rate": 1.4696025385686752e-05, + "loss": 0.9831, + "step": 4866 + }, + { + "epoch": 0.749788217173662, + "grad_norm": 0.7922086119651794, + "learning_rate": 1.4661774148309799e-05, + "loss": 0.8616, + "step": 4868 + }, + { + "epoch": 0.7500962649210627, + "grad_norm": 0.8266775012016296, + "learning_rate": 1.462755601241365e-05, + "loss": 0.8314, + "step": 4870 + }, + { + "epoch": 0.7504043126684636, + "grad_norm": 0.9757092595100403, + "learning_rate": 1.4593371010050606e-05, + "loss": 1.2532, + "step": 4872 + }, + { + "epoch": 0.7507123604158644, + "grad_norm": 0.6591874957084656, + "learning_rate": 1.45592191732419e-05, + "loss": 1.854, + "step": 4874 + }, + { + "epoch": 0.7510204081632653, + "grad_norm": 0.7547690868377686, + "learning_rate": 1.4525100533977731e-05, + "loss": 0.9278, + "step": 4876 + }, + { + "epoch": 0.7513284559106661, + "grad_norm": 0.8740079998970032, + "learning_rate": 1.4491015124217184e-05, + "loss": 1.0041, + "step": 4878 + }, + { + "epoch": 0.751636503658067, + "grad_norm": 0.9281182885169983, + "learning_rate": 1.4456962975888216e-05, + "loss": 0.9211, + "step": 4880 + }, + { + "epoch": 0.7519445514054679, + "grad_norm": 0.6817439198493958, + "learning_rate": 1.4422944120887638e-05, + "loss": 0.875, + "step": 4882 + }, + { + "epoch": 0.7522525991528687, + "grad_norm": 0.749985933303833, + "learning_rate": 1.438895859108107e-05, + "loss": 0.9464, + "step": 4884 + }, + { + "epoch": 0.7525606469002696, + "grad_norm": 0.9303197264671326, + "learning_rate": 1.4355006418302896e-05, + "loss": 1.1307, + "step": 4886 + }, + { + "epoch": 0.7528686946476704, + "grad_norm": 0.6419119238853455, + "learning_rate": 1.4321087634356329e-05, + "loss": 0.8779, + "step": 4888 + }, + { + "epoch": 0.7531767423950713, + "grad_norm": 0.8447157144546509, + "learning_rate": 1.4287202271013196e-05, + "loss": 0.7976, + "step": 4890 + }, + { + "epoch": 0.753484790142472, + "grad_norm": 0.9394419193267822, + "learning_rate": 1.4253350360014095e-05, + "loss": 1.0733, + "step": 4892 + }, + { + "epoch": 0.753792837889873, + "grad_norm": 0.9550667405128479, + "learning_rate": 1.4219531933068259e-05, + "loss": 1.4181, + "step": 4894 + }, + { + "epoch": 0.7541008856372737, + "grad_norm": 0.79128497838974, + "learning_rate": 1.4185747021853601e-05, + "loss": 1.0558, + "step": 4896 + }, + { + "epoch": 0.7544089333846746, + "grad_norm": 0.8191699981689453, + "learning_rate": 1.4151995658016565e-05, + "loss": 0.8652, + "step": 4898 + }, + { + "epoch": 0.7547169811320755, + "grad_norm": 0.921259880065918, + "learning_rate": 1.4118277873172208e-05, + "loss": 1.017, + "step": 4900 + }, + { + "epoch": 0.7550250288794763, + "grad_norm": 0.7527545094490051, + "learning_rate": 1.4084593698904186e-05, + "loss": 1.068, + "step": 4902 + }, + { + "epoch": 0.7553330766268772, + "grad_norm": 0.7523015737533569, + "learning_rate": 1.4050943166764569e-05, + "loss": 0.9896, + "step": 4904 + }, + { + "epoch": 0.755641124374278, + "grad_norm": 1.0874977111816406, + "learning_rate": 1.4017326308273975e-05, + "loss": 0.9821, + "step": 4906 + }, + { + "epoch": 0.7559491721216789, + "grad_norm": 0.6856980323791504, + "learning_rate": 1.3983743154921503e-05, + "loss": 2.4016, + "step": 4908 + }, + { + "epoch": 0.7562572198690797, + "grad_norm": 1.0397601127624512, + "learning_rate": 1.3950193738164646e-05, + "loss": 0.7798, + "step": 4910 + }, + { + "epoch": 0.7565652676164806, + "grad_norm": 0.7780575156211853, + "learning_rate": 1.3916678089429264e-05, + "loss": 0.8409, + "step": 4912 + }, + { + "epoch": 0.7568733153638814, + "grad_norm": 0.7275823354721069, + "learning_rate": 1.3883196240109631e-05, + "loss": 0.9809, + "step": 4914 + }, + { + "epoch": 0.7571813631112823, + "grad_norm": 0.7291345596313477, + "learning_rate": 1.3849748221568371e-05, + "loss": 0.8507, + "step": 4916 + }, + { + "epoch": 0.757489410858683, + "grad_norm": 0.8503779768943787, + "learning_rate": 1.381633406513641e-05, + "loss": 0.8459, + "step": 4918 + }, + { + "epoch": 0.7577974586060839, + "grad_norm": 1.0080485343933105, + "learning_rate": 1.378295380211289e-05, + "loss": 0.82, + "step": 4920 + }, + { + "epoch": 0.7581055063534848, + "grad_norm": 1.0013842582702637, + "learning_rate": 1.3749607463765308e-05, + "loss": 2.36, + "step": 4922 + }, + { + "epoch": 0.7584135541008856, + "grad_norm": 0.6906487345695496, + "learning_rate": 1.3716295081329316e-05, + "loss": 0.9252, + "step": 4924 + }, + { + "epoch": 0.7587216018482865, + "grad_norm": 0.819780170917511, + "learning_rate": 1.3683016686008799e-05, + "loss": 1.4594, + "step": 4926 + }, + { + "epoch": 0.7590296495956873, + "grad_norm": 0.7055020928382874, + "learning_rate": 1.3649772308975733e-05, + "loss": 1.0051, + "step": 4928 + }, + { + "epoch": 0.7593376973430882, + "grad_norm": 0.6653702855110168, + "learning_rate": 1.3616561981370329e-05, + "loss": 0.6855, + "step": 4930 + }, + { + "epoch": 0.759645745090489, + "grad_norm": 0.7256321310997009, + "learning_rate": 1.3583385734300858e-05, + "loss": 1.0082, + "step": 4932 + }, + { + "epoch": 0.7599537928378899, + "grad_norm": 0.7310523986816406, + "learning_rate": 1.3550243598843615e-05, + "loss": 0.9595, + "step": 4934 + }, + { + "epoch": 0.7602618405852907, + "grad_norm": 0.8937221765518188, + "learning_rate": 1.3517135606043047e-05, + "loss": 1.0844, + "step": 4936 + }, + { + "epoch": 0.7605698883326916, + "grad_norm": 0.6377236843109131, + "learning_rate": 1.348406178691154e-05, + "loss": 1.0395, + "step": 4938 + }, + { + "epoch": 0.7608779360800925, + "grad_norm": 1.0823004245758057, + "learning_rate": 1.3451022172429495e-05, + "loss": 1.2769, + "step": 4940 + }, + { + "epoch": 0.7611859838274933, + "grad_norm": 0.7745267152786255, + "learning_rate": 1.341801679354528e-05, + "loss": 0.7941, + "step": 4942 + }, + { + "epoch": 0.7614940315748941, + "grad_norm": 0.5816026926040649, + "learning_rate": 1.338504568117518e-05, + "loss": 0.9029, + "step": 4944 + }, + { + "epoch": 0.7618020793222949, + "grad_norm": 0.9577437043190002, + "learning_rate": 1.3352108866203394e-05, + "loss": 0.8861, + "step": 4946 + }, + { + "epoch": 0.7621101270696958, + "grad_norm": 0.7536446452140808, + "learning_rate": 1.3319206379481991e-05, + "loss": 1.1366, + "step": 4948 + }, + { + "epoch": 0.7624181748170966, + "grad_norm": 0.7559531927108765, + "learning_rate": 1.3286338251830882e-05, + "loss": 1.6357, + "step": 4950 + }, + { + "epoch": 0.7627262225644975, + "grad_norm": 0.8585458993911743, + "learning_rate": 1.3253504514037796e-05, + "loss": 0.9577, + "step": 4952 + }, + { + "epoch": 0.7630342703118983, + "grad_norm": 0.9138365387916565, + "learning_rate": 1.3220705196858247e-05, + "loss": 0.9647, + "step": 4954 + }, + { + "epoch": 0.7633423180592992, + "grad_norm": 0.7139084339141846, + "learning_rate": 1.3187940331015503e-05, + "loss": 1.2702, + "step": 4956 + }, + { + "epoch": 0.7636503658067001, + "grad_norm": 1.0149308443069458, + "learning_rate": 1.3155209947200575e-05, + "loss": 1.8406, + "step": 4958 + }, + { + "epoch": 0.7639584135541009, + "grad_norm": 0.6941062808036804, + "learning_rate": 1.3122514076072163e-05, + "loss": 0.8376, + "step": 4960 + }, + { + "epoch": 0.7642664613015018, + "grad_norm": 0.7763035297393799, + "learning_rate": 1.3089852748256642e-05, + "loss": 0.9862, + "step": 4962 + }, + { + "epoch": 0.7645745090489026, + "grad_norm": 0.9273561239242554, + "learning_rate": 1.3057225994348027e-05, + "loss": 1.0633, + "step": 4964 + }, + { + "epoch": 0.7648825567963035, + "grad_norm": 0.8585821390151978, + "learning_rate": 1.3024633844907958e-05, + "loss": 1.0902, + "step": 4966 + }, + { + "epoch": 0.7651906045437042, + "grad_norm": 0.8621907234191895, + "learning_rate": 1.2992076330465642e-05, + "loss": 0.9008, + "step": 4968 + }, + { + "epoch": 0.7654986522911051, + "grad_norm": 0.6912165880203247, + "learning_rate": 1.2959553481517866e-05, + "loss": 0.8129, + "step": 4970 + }, + { + "epoch": 0.7658067000385059, + "grad_norm": 0.8399752378463745, + "learning_rate": 1.2927065328528926e-05, + "loss": 1.6187, + "step": 4972 + }, + { + "epoch": 0.7661147477859068, + "grad_norm": 0.8446044921875, + "learning_rate": 1.2894611901930615e-05, + "loss": 0.8109, + "step": 4974 + }, + { + "epoch": 0.7664227955333076, + "grad_norm": 0.8228721022605896, + "learning_rate": 1.286219323212225e-05, + "loss": 0.9516, + "step": 4976 + }, + { + "epoch": 0.7667308432807085, + "grad_norm": 0.9982367157936096, + "learning_rate": 1.2829809349470512e-05, + "loss": 1.016, + "step": 4978 + }, + { + "epoch": 0.7670388910281094, + "grad_norm": 0.764540433883667, + "learning_rate": 1.2797460284309532e-05, + "loss": 0.7849, + "step": 4980 + }, + { + "epoch": 0.7673469387755102, + "grad_norm": 0.718386173248291, + "learning_rate": 1.2765146066940853e-05, + "loss": 1.0434, + "step": 4982 + }, + { + "epoch": 0.7676549865229111, + "grad_norm": 0.8509320616722107, + "learning_rate": 1.2732866727633364e-05, + "loss": 1.1764, + "step": 4984 + }, + { + "epoch": 0.7679630342703119, + "grad_norm": 0.9963598251342773, + "learning_rate": 1.2700622296623239e-05, + "loss": 1.0726, + "step": 4986 + }, + { + "epoch": 0.7682710820177128, + "grad_norm": 1.1380006074905396, + "learning_rate": 1.2668412804113983e-05, + "loss": 1.0589, + "step": 4988 + }, + { + "epoch": 0.7685791297651136, + "grad_norm": 0.7528254985809326, + "learning_rate": 1.263623828027642e-05, + "loss": 0.8949, + "step": 4990 + }, + { + "epoch": 0.7688871775125145, + "grad_norm": 0.8158612251281738, + "learning_rate": 1.260409875524854e-05, + "loss": 0.8413, + "step": 4992 + }, + { + "epoch": 0.7691952252599152, + "grad_norm": 0.8937954306602478, + "learning_rate": 1.2571994259135583e-05, + "loss": 0.95, + "step": 4994 + }, + { + "epoch": 0.7695032730073161, + "grad_norm": 0.8776180148124695, + "learning_rate": 1.2539924822010007e-05, + "loss": 0.9603, + "step": 4996 + }, + { + "epoch": 0.769811320754717, + "grad_norm": 0.6999343633651733, + "learning_rate": 1.250789047391141e-05, + "loss": 0.8799, + "step": 4998 + }, + { + "epoch": 0.7701193685021178, + "grad_norm": 0.9434583187103271, + "learning_rate": 1.247589124484646e-05, + "loss": 0.8455, + "step": 5000 + }, + { + "epoch": 0.7704274162495187, + "grad_norm": 1.0791411399841309, + "learning_rate": 1.2443927164789037e-05, + "loss": 1.0607, + "step": 5002 + }, + { + "epoch": 0.7707354639969195, + "grad_norm": 0.6646117568016052, + "learning_rate": 1.241199826368003e-05, + "loss": 1.2331, + "step": 5004 + }, + { + "epoch": 0.7710435117443204, + "grad_norm": 0.898343026638031, + "learning_rate": 1.2380104571427398e-05, + "loss": 1.0122, + "step": 5006 + }, + { + "epoch": 0.7713515594917212, + "grad_norm": 0.7997497320175171, + "learning_rate": 1.2348246117906065e-05, + "loss": 1.6624, + "step": 5008 + }, + { + "epoch": 0.7716596072391221, + "grad_norm": 0.9120794534683228, + "learning_rate": 1.2316422932958044e-05, + "loss": 1.2284, + "step": 5010 + }, + { + "epoch": 0.7719676549865229, + "grad_norm": 0.7209826707839966, + "learning_rate": 1.2284635046392245e-05, + "loss": 0.8711, + "step": 5012 + }, + { + "epoch": 0.7722757027339238, + "grad_norm": 1.0334336757659912, + "learning_rate": 1.2252882487984529e-05, + "loss": 0.9622, + "step": 5014 + }, + { + "epoch": 0.7725837504813247, + "grad_norm": 0.8697283864021301, + "learning_rate": 1.222116528747766e-05, + "loss": 0.8894, + "step": 5016 + }, + { + "epoch": 0.7728917982287254, + "grad_norm": 1.0194828510284424, + "learning_rate": 1.2189483474581292e-05, + "loss": 1.0679, + "step": 5018 + }, + { + "epoch": 0.7731998459761263, + "grad_norm": 0.833803117275238, + "learning_rate": 1.2157837078971928e-05, + "loss": 0.8022, + "step": 5020 + }, + { + "epoch": 0.7735078937235271, + "grad_norm": 0.8861052393913269, + "learning_rate": 1.2126226130292895e-05, + "loss": 0.841, + "step": 5022 + }, + { + "epoch": 0.773815941470928, + "grad_norm": 0.9703818559646606, + "learning_rate": 1.209465065815431e-05, + "loss": 1.0202, + "step": 5024 + }, + { + "epoch": 0.7741239892183288, + "grad_norm": 0.5357424020767212, + "learning_rate": 1.2063110692133068e-05, + "loss": 0.9335, + "step": 5026 + }, + { + "epoch": 0.7744320369657297, + "grad_norm": 1.0410465002059937, + "learning_rate": 1.2031606261772805e-05, + "loss": 0.8673, + "step": 5028 + }, + { + "epoch": 0.7747400847131305, + "grad_norm": 0.9500507712364197, + "learning_rate": 1.200013739658386e-05, + "loss": 0.9811, + "step": 5030 + }, + { + "epoch": 0.7750481324605314, + "grad_norm": 0.7785813212394714, + "learning_rate": 1.1968704126043279e-05, + "loss": 0.9822, + "step": 5032 + }, + { + "epoch": 0.7753561802079322, + "grad_norm": 0.7565167546272278, + "learning_rate": 1.193730647959474e-05, + "loss": 0.7924, + "step": 5034 + }, + { + "epoch": 0.7756642279553331, + "grad_norm": 0.9191146492958069, + "learning_rate": 1.1905944486648568e-05, + "loss": 1.0759, + "step": 5036 + }, + { + "epoch": 0.775972275702734, + "grad_norm": 0.9619866013526917, + "learning_rate": 1.1874618176581693e-05, + "loss": 0.9241, + "step": 5038 + }, + { + "epoch": 0.7762803234501348, + "grad_norm": 0.9265114665031433, + "learning_rate": 1.1843327578737612e-05, + "loss": 0.7815, + "step": 5040 + }, + { + "epoch": 0.7765883711975357, + "grad_norm": 1.0196962356567383, + "learning_rate": 1.1812072722426371e-05, + "loss": 1.0626, + "step": 5042 + }, + { + "epoch": 0.7768964189449364, + "grad_norm": 0.9532772302627563, + "learning_rate": 1.1780853636924543e-05, + "loss": 0.8243, + "step": 5044 + }, + { + "epoch": 0.7772044666923373, + "grad_norm": 1.0061920881271362, + "learning_rate": 1.1749670351475195e-05, + "loss": 1.7845, + "step": 5046 + }, + { + "epoch": 0.7775125144397381, + "grad_norm": 0.7302465438842773, + "learning_rate": 1.1718522895287848e-05, + "loss": 0.8505, + "step": 5048 + }, + { + "epoch": 0.777820562187139, + "grad_norm": 0.8662822842597961, + "learning_rate": 1.1687411297538469e-05, + "loss": 0.9611, + "step": 5050 + }, + { + "epoch": 0.7781286099345398, + "grad_norm": 0.704204261302948, + "learning_rate": 1.1656335587369444e-05, + "loss": 0.9925, + "step": 5052 + }, + { + "epoch": 0.7784366576819407, + "grad_norm": 0.6102928519248962, + "learning_rate": 1.1625295793889512e-05, + "loss": 1.2709, + "step": 5054 + }, + { + "epoch": 0.7787447054293416, + "grad_norm": 0.7031947374343872, + "learning_rate": 1.1594291946173846e-05, + "loss": 2.1372, + "step": 5056 + }, + { + "epoch": 0.7790527531767424, + "grad_norm": 1.1103585958480835, + "learning_rate": 1.1563324073263843e-05, + "loss": 0.9965, + "step": 5058 + }, + { + "epoch": 0.7793608009241433, + "grad_norm": 0.8092791438102722, + "learning_rate": 1.1532392204167275e-05, + "loss": 1.0353, + "step": 5060 + }, + { + "epoch": 0.7796688486715441, + "grad_norm": 0.9561004042625427, + "learning_rate": 1.1501496367858144e-05, + "loss": 1.0016, + "step": 5062 + }, + { + "epoch": 0.779976896418945, + "grad_norm": 0.6417694091796875, + "learning_rate": 1.1470636593276779e-05, + "loss": 0.8021, + "step": 5064 + }, + { + "epoch": 0.7802849441663458, + "grad_norm": 0.8332743644714355, + "learning_rate": 1.1439812909329616e-05, + "loss": 1.1154, + "step": 5066 + }, + { + "epoch": 0.7805929919137466, + "grad_norm": 0.95404452085495, + "learning_rate": 1.1409025344889362e-05, + "loss": 1.0442, + "step": 5068 + }, + { + "epoch": 0.7809010396611474, + "grad_norm": 0.6904376149177551, + "learning_rate": 1.1378273928794886e-05, + "loss": 0.9646, + "step": 5070 + }, + { + "epoch": 0.7812090874085483, + "grad_norm": 1.0550509691238403, + "learning_rate": 1.134755868985119e-05, + "loss": 0.8689, + "step": 5072 + }, + { + "epoch": 0.7815171351559492, + "grad_norm": 0.7711645364761353, + "learning_rate": 1.1316879656829338e-05, + "loss": 0.985, + "step": 5074 + }, + { + "epoch": 0.78182518290335, + "grad_norm": 1.0338762998580933, + "learning_rate": 1.128623685846656e-05, + "loss": 1.0949, + "step": 5076 + }, + { + "epoch": 0.7821332306507509, + "grad_norm": 0.6583460569381714, + "learning_rate": 1.1255630323466116e-05, + "loss": 0.8089, + "step": 5078 + }, + { + "epoch": 0.7824412783981517, + "grad_norm": 0.9409406185150146, + "learning_rate": 1.1225060080497257e-05, + "loss": 0.9767, + "step": 5080 + }, + { + "epoch": 0.7827493261455526, + "grad_norm": 0.9006635546684265, + "learning_rate": 1.1194526158195274e-05, + "loss": 0.7231, + "step": 5082 + }, + { + "epoch": 0.7830573738929534, + "grad_norm": 1.1432090997695923, + "learning_rate": 1.1164028585161456e-05, + "loss": 0.9457, + "step": 5084 + }, + { + "epoch": 0.7833654216403543, + "grad_norm": 0.4902731478214264, + "learning_rate": 1.1133567389963035e-05, + "loss": 0.6747, + "step": 5086 + }, + { + "epoch": 0.7836734693877551, + "grad_norm": 1.0363097190856934, + "learning_rate": 1.1103142601133098e-05, + "loss": 0.955, + "step": 5088 + }, + { + "epoch": 0.783981517135156, + "grad_norm": 0.7746341228485107, + "learning_rate": 1.107275424717074e-05, + "loss": 0.7925, + "step": 5090 + }, + { + "epoch": 0.7842895648825567, + "grad_norm": 0.8868885040283203, + "learning_rate": 1.1042402356540853e-05, + "loss": 0.8253, + "step": 5092 + }, + { + "epoch": 0.7845976126299576, + "grad_norm": 1.0761430263519287, + "learning_rate": 1.1012086957674194e-05, + "loss": 1.0306, + "step": 5094 + }, + { + "epoch": 0.7849056603773585, + "grad_norm": 0.7536218762397766, + "learning_rate": 1.0981808078967348e-05, + "loss": 0.9471, + "step": 5096 + }, + { + "epoch": 0.7852137081247593, + "grad_norm": 0.8555169105529785, + "learning_rate": 1.0951565748782666e-05, + "loss": 2.3108, + "step": 5098 + }, + { + "epoch": 0.7855217558721602, + "grad_norm": 0.698821485042572, + "learning_rate": 1.092135999544831e-05, + "loss": 1.2321, + "step": 5100 + }, + { + "epoch": 0.7855217558721602, + "eval_loss": 2.402956485748291, + "eval_runtime": 737.0859, + "eval_samples_per_second": 2.713, + "eval_steps_per_second": 0.678, + "step": 5100 + }, + { + "epoch": 0.785829803619561, + "grad_norm": 0.6485735774040222, + "learning_rate": 1.0891190847258093e-05, + "loss": 0.7757, + "step": 5102 + }, + { + "epoch": 0.7861378513669619, + "grad_norm": 0.811083972454071, + "learning_rate": 1.0861058332471652e-05, + "loss": 1.1812, + "step": 5104 + }, + { + "epoch": 0.7864458991143627, + "grad_norm": 0.7928170561790466, + "learning_rate": 1.0830962479314226e-05, + "loss": 0.8711, + "step": 5106 + }, + { + "epoch": 0.7867539468617636, + "grad_norm": 0.771323025226593, + "learning_rate": 1.0800903315976757e-05, + "loss": 0.9764, + "step": 5108 + }, + { + "epoch": 0.7870619946091644, + "grad_norm": 0.9954501390457153, + "learning_rate": 1.077088087061579e-05, + "loss": 0.9129, + "step": 5110 + }, + { + "epoch": 0.7873700423565653, + "grad_norm": 0.6200007796287537, + "learning_rate": 1.0740895171353493e-05, + "loss": 1.8826, + "step": 5112 + }, + { + "epoch": 0.7876780901039662, + "grad_norm": 1.0519543886184692, + "learning_rate": 1.0710946246277615e-05, + "loss": 0.9654, + "step": 5114 + }, + { + "epoch": 0.787986137851367, + "grad_norm": 0.6422151327133179, + "learning_rate": 1.0681034123441447e-05, + "loss": 1.0095, + "step": 5116 + }, + { + "epoch": 0.7882941855987678, + "grad_norm": 0.7397940158843994, + "learning_rate": 1.0651158830863816e-05, + "loss": 0.8926, + "step": 5118 + }, + { + "epoch": 0.7886022333461686, + "grad_norm": 0.7962545156478882, + "learning_rate": 1.0621320396529056e-05, + "loss": 0.904, + "step": 5120 + }, + { + "epoch": 0.7889102810935695, + "grad_norm": 0.7353315949440002, + "learning_rate": 1.0591518848386956e-05, + "loss": 0.8093, + "step": 5122 + }, + { + "epoch": 0.7892183288409703, + "grad_norm": 0.8774969577789307, + "learning_rate": 1.0561754214352765e-05, + "loss": 1.0146, + "step": 5124 + }, + { + "epoch": 0.7895263765883712, + "grad_norm": 0.759262204170227, + "learning_rate": 1.0532026522307164e-05, + "loss": 1.0412, + "step": 5126 + }, + { + "epoch": 0.789834424335772, + "grad_norm": 0.972540020942688, + "learning_rate": 1.0502335800096214e-05, + "loss": 1.0528, + "step": 5128 + }, + { + "epoch": 0.7901424720831729, + "grad_norm": 0.7988440990447998, + "learning_rate": 1.0472682075531354e-05, + "loss": 0.9716, + "step": 5130 + }, + { + "epoch": 0.7904505198305737, + "grad_norm": 0.6502975821495056, + "learning_rate": 1.0443065376389366e-05, + "loss": 0.9217, + "step": 5132 + }, + { + "epoch": 0.7907585675779746, + "grad_norm": 0.9882317185401917, + "learning_rate": 1.0413485730412337e-05, + "loss": 1.5967, + "step": 5134 + }, + { + "epoch": 0.7910666153253755, + "grad_norm": 0.7147364020347595, + "learning_rate": 1.0383943165307697e-05, + "loss": 1.0296, + "step": 5136 + }, + { + "epoch": 0.7913746630727763, + "grad_norm": 0.9587258100509644, + "learning_rate": 1.0354437708748071e-05, + "loss": 1.0206, + "step": 5138 + }, + { + "epoch": 0.7916827108201772, + "grad_norm": 0.8416325449943542, + "learning_rate": 1.0324969388371364e-05, + "loss": 0.9132, + "step": 5140 + }, + { + "epoch": 0.791990758567578, + "grad_norm": 0.7498589754104614, + "learning_rate": 1.0295538231780677e-05, + "loss": 2.1829, + "step": 5142 + }, + { + "epoch": 0.7922988063149788, + "grad_norm": 0.7145204544067383, + "learning_rate": 1.0266144266544353e-05, + "loss": 0.8647, + "step": 5144 + }, + { + "epoch": 0.7926068540623796, + "grad_norm": 0.7027596831321716, + "learning_rate": 1.0236787520195812e-05, + "loss": 1.0984, + "step": 5146 + }, + { + "epoch": 0.7929149018097805, + "grad_norm": 0.6741682291030884, + "learning_rate": 1.0207468020233663e-05, + "loss": 0.9404, + "step": 5148 + }, + { + "epoch": 0.7932229495571813, + "grad_norm": 1.5651164054870605, + "learning_rate": 1.0178185794121643e-05, + "loss": 0.8988, + "step": 5150 + }, + { + "epoch": 0.7935309973045822, + "grad_norm": 0.9429001212120056, + "learning_rate": 1.0148940869288543e-05, + "loss": 0.883, + "step": 5152 + }, + { + "epoch": 0.7938390450519831, + "grad_norm": 0.6833556294441223, + "learning_rate": 1.01197332731282e-05, + "loss": 1.4627, + "step": 5154 + }, + { + "epoch": 0.7941470927993839, + "grad_norm": 0.6978062391281128, + "learning_rate": 1.0090563032999506e-05, + "loss": 2.135, + "step": 5156 + }, + { + "epoch": 0.7944551405467848, + "grad_norm": 1.2680683135986328, + "learning_rate": 1.0061430176226394e-05, + "loss": 1.1484, + "step": 5158 + }, + { + "epoch": 0.7947631882941856, + "grad_norm": 0.7495437264442444, + "learning_rate": 1.0032334730097715e-05, + "loss": 0.9135, + "step": 5160 + }, + { + "epoch": 0.7950712360415865, + "grad_norm": 1.005279541015625, + "learning_rate": 1.00032767218673e-05, + "loss": 1.0571, + "step": 5162 + }, + { + "epoch": 0.7953792837889873, + "grad_norm": 0.7408681511878967, + "learning_rate": 9.974256178753954e-06, + "loss": 0.8986, + "step": 5164 + }, + { + "epoch": 0.7956873315363882, + "grad_norm": 0.7218614816665649, + "learning_rate": 9.945273127941358e-06, + "loss": 0.8026, + "step": 5166 + }, + { + "epoch": 0.7959953792837889, + "grad_norm": 0.8716273903846741, + "learning_rate": 9.916327596578018e-06, + "loss": 1.7818, + "step": 5168 + }, + { + "epoch": 0.7963034270311898, + "grad_norm": 0.7888768315315247, + "learning_rate": 9.887419611777405e-06, + "loss": 1.3094, + "step": 5170 + }, + { + "epoch": 0.7966114747785907, + "grad_norm": 0.8038328289985657, + "learning_rate": 9.858549200617734e-06, + "loss": 0.8248, + "step": 5172 + }, + { + "epoch": 0.7969195225259915, + "grad_norm": 0.8979176878929138, + "learning_rate": 9.829716390142073e-06, + "loss": 1.0524, + "step": 5174 + }, + { + "epoch": 0.7972275702733924, + "grad_norm": 1.2263667583465576, + "learning_rate": 9.800921207358216e-06, + "loss": 0.9444, + "step": 5176 + }, + { + "epoch": 0.7975356180207932, + "grad_norm": 0.8946297764778137, + "learning_rate": 9.772163679238778e-06, + "loss": 2.0287, + "step": 5178 + }, + { + "epoch": 0.7978436657681941, + "grad_norm": 0.8920976519584656, + "learning_rate": 9.743443832721055e-06, + "loss": 0.7952, + "step": 5180 + }, + { + "epoch": 0.7981517135155949, + "grad_norm": 1.0788079500198364, + "learning_rate": 9.714761694707069e-06, + "loss": 1.0469, + "step": 5182 + }, + { + "epoch": 0.7984597612629958, + "grad_norm": 1.0200272798538208, + "learning_rate": 9.686117292063501e-06, + "loss": 0.8722, + "step": 5184 + }, + { + "epoch": 0.7987678090103966, + "grad_norm": 1.0531045198440552, + "learning_rate": 9.657510651621709e-06, + "loss": 0.9411, + "step": 5186 + }, + { + "epoch": 0.7990758567577975, + "grad_norm": 0.6479781270027161, + "learning_rate": 9.628941800177654e-06, + "loss": 0.916, + "step": 5188 + }, + { + "epoch": 0.7993839045051983, + "grad_norm": 0.8514047265052795, + "learning_rate": 9.600410764491924e-06, + "loss": 1.149, + "step": 5190 + }, + { + "epoch": 0.7996919522525991, + "grad_norm": 0.7890281081199646, + "learning_rate": 9.571917571289662e-06, + "loss": 0.8978, + "step": 5192 + }, + { + "epoch": 0.8, + "grad_norm": 0.9271743297576904, + "learning_rate": 9.543462247260586e-06, + "loss": 1.556, + "step": 5194 + }, + { + "epoch": 0.8003080477474008, + "grad_norm": 0.7708587646484375, + "learning_rate": 9.515044819058922e-06, + "loss": 1.6046, + "step": 5196 + }, + { + "epoch": 0.8006160954948017, + "grad_norm": 0.8407145142555237, + "learning_rate": 9.48666531330341e-06, + "loss": 0.825, + "step": 5198 + }, + { + "epoch": 0.8009241432422025, + "grad_norm": 0.6378217339515686, + "learning_rate": 9.458323756577264e-06, + "loss": 0.691, + "step": 5200 + }, + { + "epoch": 0.8012321909896034, + "grad_norm": 0.6951934099197388, + "learning_rate": 9.430020175428156e-06, + "loss": 0.8766, + "step": 5202 + }, + { + "epoch": 0.8015402387370042, + "grad_norm": 0.9855650067329407, + "learning_rate": 9.40175459636818e-06, + "loss": 0.9587, + "step": 5204 + }, + { + "epoch": 0.8018482864844051, + "grad_norm": 0.6208633184432983, + "learning_rate": 9.37352704587383e-06, + "loss": 2.1109, + "step": 5206 + }, + { + "epoch": 0.8021563342318059, + "grad_norm": 0.7674615383148193, + "learning_rate": 9.345337550385985e-06, + "loss": 0.8624, + "step": 5208 + }, + { + "epoch": 0.8024643819792068, + "grad_norm": 0.8252739906311035, + "learning_rate": 9.317186136309901e-06, + "loss": 0.9765, + "step": 5210 + }, + { + "epoch": 0.8027724297266077, + "grad_norm": 0.7470067143440247, + "learning_rate": 9.28907283001511e-06, + "loss": 1.4624, + "step": 5212 + }, + { + "epoch": 0.8030804774740085, + "grad_norm": 0.634072482585907, + "learning_rate": 9.260997657835486e-06, + "loss": 0.6964, + "step": 5214 + }, + { + "epoch": 0.8033885252214094, + "grad_norm": 0.9666215777397156, + "learning_rate": 9.232960646069171e-06, + "loss": 0.9129, + "step": 5216 + }, + { + "epoch": 0.8036965729688101, + "grad_norm": 0.8223074674606323, + "learning_rate": 9.204961820978569e-06, + "loss": 0.7595, + "step": 5218 + }, + { + "epoch": 0.804004620716211, + "grad_norm": 0.7416442632675171, + "learning_rate": 9.17700120879031e-06, + "loss": 2.8862, + "step": 5220 + }, + { + "epoch": 0.8043126684636118, + "grad_norm": 0.6753026843070984, + "learning_rate": 9.149078835695213e-06, + "loss": 1.1582, + "step": 5222 + }, + { + "epoch": 0.8046207162110127, + "grad_norm": 0.8889574408531189, + "learning_rate": 9.121194727848337e-06, + "loss": 1.0037, + "step": 5224 + }, + { + "epoch": 0.8049287639584135, + "grad_norm": 1.0888804197311401, + "learning_rate": 9.093348911368816e-06, + "loss": 1.1738, + "step": 5226 + }, + { + "epoch": 0.8052368117058144, + "grad_norm": 0.7588722109794617, + "learning_rate": 9.065541412339956e-06, + "loss": 1.1241, + "step": 5228 + }, + { + "epoch": 0.8055448594532153, + "grad_norm": 0.9247668385505676, + "learning_rate": 9.037772256809195e-06, + "loss": 1.1129, + "step": 5230 + }, + { + "epoch": 0.8058529072006161, + "grad_norm": 0.7385468482971191, + "learning_rate": 9.010041470788033e-06, + "loss": 1.0471, + "step": 5232 + }, + { + "epoch": 0.806160954948017, + "grad_norm": 0.6662497520446777, + "learning_rate": 8.982349080252e-06, + "loss": 0.9419, + "step": 5234 + }, + { + "epoch": 0.8064690026954178, + "grad_norm": 0.9361608028411865, + "learning_rate": 8.954695111140688e-06, + "loss": 0.9675, + "step": 5236 + }, + { + "epoch": 0.8067770504428187, + "grad_norm": 0.6301150918006897, + "learning_rate": 8.927079589357722e-06, + "loss": 0.8097, + "step": 5238 + }, + { + "epoch": 0.8070850981902195, + "grad_norm": 0.7306633591651917, + "learning_rate": 8.899502540770688e-06, + "loss": 0.9882, + "step": 5240 + }, + { + "epoch": 0.8073931459376203, + "grad_norm": 1.0911128520965576, + "learning_rate": 8.871963991211107e-06, + "loss": 0.9689, + "step": 5242 + }, + { + "epoch": 0.8077011936850211, + "grad_norm": 0.9000625610351562, + "learning_rate": 8.844463966474491e-06, + "loss": 1.0501, + "step": 5244 + }, + { + "epoch": 0.808009241432422, + "grad_norm": 0.8635241389274597, + "learning_rate": 8.81700249232026e-06, + "loss": 0.8923, + "step": 5246 + }, + { + "epoch": 0.8083172891798228, + "grad_norm": 0.5929491519927979, + "learning_rate": 8.789579594471648e-06, + "loss": 0.8787, + "step": 5248 + }, + { + "epoch": 0.8086253369272237, + "grad_norm": 0.6980977058410645, + "learning_rate": 8.762195298615855e-06, + "loss": 0.8485, + "step": 5250 + }, + { + "epoch": 0.8089333846746246, + "grad_norm": 0.7201548218727112, + "learning_rate": 8.734849630403873e-06, + "loss": 0.9347, + "step": 5252 + }, + { + "epoch": 0.8092414324220254, + "grad_norm": 0.7645498514175415, + "learning_rate": 8.70754261545052e-06, + "loss": 0.8769, + "step": 5254 + }, + { + "epoch": 0.8095494801694263, + "grad_norm": 0.5888391137123108, + "learning_rate": 8.680274279334372e-06, + "loss": 0.7781, + "step": 5256 + }, + { + "epoch": 0.8098575279168271, + "grad_norm": 1.011031150817871, + "learning_rate": 8.653044647597846e-06, + "loss": 0.95, + "step": 5258 + }, + { + "epoch": 0.810165575664228, + "grad_norm": 0.564937949180603, + "learning_rate": 8.625853745747048e-06, + "loss": 0.8399, + "step": 5260 + }, + { + "epoch": 0.8104736234116288, + "grad_norm": 0.9204162359237671, + "learning_rate": 8.598701599251818e-06, + "loss": 1.1062, + "step": 5262 + }, + { + "epoch": 0.8107816711590297, + "grad_norm": 0.6450652480125427, + "learning_rate": 8.571588233545713e-06, + "loss": 2.0054, + "step": 5264 + }, + { + "epoch": 0.8110897189064304, + "grad_norm": 1.0156514644622803, + "learning_rate": 8.544513674025934e-06, + "loss": 1.0376, + "step": 5266 + }, + { + "epoch": 0.8113977666538313, + "grad_norm": 1.0980149507522583, + "learning_rate": 8.517477946053353e-06, + "loss": 0.8962, + "step": 5268 + }, + { + "epoch": 0.8117058144012322, + "grad_norm": 1.1988979578018188, + "learning_rate": 8.49048107495246e-06, + "loss": 1.1929, + "step": 5270 + }, + { + "epoch": 0.812013862148633, + "grad_norm": 0.8038528561592102, + "learning_rate": 8.46352308601136e-06, + "loss": 1.0203, + "step": 5272 + }, + { + "epoch": 0.8123219098960339, + "grad_norm": 0.7679852247238159, + "learning_rate": 8.436604004481713e-06, + "loss": 1.1832, + "step": 5274 + }, + { + "epoch": 0.8126299576434347, + "grad_norm": 0.8102967143058777, + "learning_rate": 8.409723855578756e-06, + "loss": 2.2757, + "step": 5276 + }, + { + "epoch": 0.8129380053908356, + "grad_norm": 0.5498160123825073, + "learning_rate": 8.382882664481245e-06, + "loss": 0.7996, + "step": 5278 + }, + { + "epoch": 0.8132460531382364, + "grad_norm": 0.8951212763786316, + "learning_rate": 8.35608045633145e-06, + "loss": 0.9904, + "step": 5280 + }, + { + "epoch": 0.8135541008856373, + "grad_norm": 0.9070693254470825, + "learning_rate": 8.32931725623513e-06, + "loss": 2.2744, + "step": 5282 + }, + { + "epoch": 0.8138621486330381, + "grad_norm": 0.9921269416809082, + "learning_rate": 8.302593089261496e-06, + "loss": 0.9546, + "step": 5284 + }, + { + "epoch": 0.814170196380439, + "grad_norm": 0.9248214364051819, + "learning_rate": 8.275907980443199e-06, + "loss": 0.9497, + "step": 5286 + }, + { + "epoch": 0.8144782441278399, + "grad_norm": 0.9269345998764038, + "learning_rate": 8.2492619547763e-06, + "loss": 1.0542, + "step": 5288 + }, + { + "epoch": 0.8147862918752407, + "grad_norm": 0.7099084854125977, + "learning_rate": 8.222655037220261e-06, + "loss": 2.1591, + "step": 5290 + }, + { + "epoch": 0.8150943396226416, + "grad_norm": 0.8812754154205322, + "learning_rate": 8.19608725269791e-06, + "loss": 0.8864, + "step": 5292 + }, + { + "epoch": 0.8154023873700423, + "grad_norm": 0.8371664881706238, + "learning_rate": 8.169558626095403e-06, + "loss": 1.7353, + "step": 5294 + }, + { + "epoch": 0.8157104351174432, + "grad_norm": 0.7795840501785278, + "learning_rate": 8.143069182262226e-06, + "loss": 0.9172, + "step": 5296 + }, + { + "epoch": 0.816018482864844, + "grad_norm": 0.6335655450820923, + "learning_rate": 8.116618946011195e-06, + "loss": 1.0627, + "step": 5298 + }, + { + "epoch": 0.8163265306122449, + "grad_norm": 0.7013536691665649, + "learning_rate": 8.090207942118333e-06, + "loss": 0.9354, + "step": 5300 + }, + { + "epoch": 0.8166345783596457, + "grad_norm": 0.7837877869606018, + "learning_rate": 8.063836195322954e-06, + "loss": 1.2341, + "step": 5302 + }, + { + "epoch": 0.8169426261070466, + "grad_norm": 0.7043695449829102, + "learning_rate": 8.037503730327634e-06, + "loss": 0.9237, + "step": 5304 + }, + { + "epoch": 0.8172506738544474, + "grad_norm": 0.6496650576591492, + "learning_rate": 8.011210571798073e-06, + "loss": 0.9217, + "step": 5306 + }, + { + "epoch": 0.8175587216018483, + "grad_norm": 0.8028094172477722, + "learning_rate": 7.984956744363208e-06, + "loss": 1.028, + "step": 5308 + }, + { + "epoch": 0.8178667693492492, + "grad_norm": 0.9909071922302246, + "learning_rate": 7.958742272615117e-06, + "loss": 0.9005, + "step": 5310 + }, + { + "epoch": 0.81817481709665, + "grad_norm": 0.7143129110336304, + "learning_rate": 7.932567181109052e-06, + "loss": 0.8295, + "step": 5312 + }, + { + "epoch": 0.8184828648440509, + "grad_norm": 0.6799386739730835, + "learning_rate": 7.906431494363298e-06, + "loss": 1.0627, + "step": 5314 + }, + { + "epoch": 0.8187909125914516, + "grad_norm": 0.8612110018730164, + "learning_rate": 7.880335236859283e-06, + "loss": 0.8716, + "step": 5316 + }, + { + "epoch": 0.8190989603388525, + "grad_norm": 1.0764349699020386, + "learning_rate": 7.854278433041512e-06, + "loss": 1.0203, + "step": 5318 + }, + { + "epoch": 0.8194070080862533, + "grad_norm": 0.8197991847991943, + "learning_rate": 7.82826110731752e-06, + "loss": 0.8948, + "step": 5320 + }, + { + "epoch": 0.8197150558336542, + "grad_norm": 0.8864220976829529, + "learning_rate": 7.802283284057815e-06, + "loss": 0.8376, + "step": 5322 + }, + { + "epoch": 0.820023103581055, + "grad_norm": 0.7360268235206604, + "learning_rate": 7.776344987595985e-06, + "loss": 0.9808, + "step": 5324 + }, + { + "epoch": 0.8203311513284559, + "grad_norm": 0.7204895615577698, + "learning_rate": 7.750446242228543e-06, + "loss": 1.1138, + "step": 5326 + }, + { + "epoch": 0.8206391990758568, + "grad_norm": 0.814656138420105, + "learning_rate": 7.724587072214973e-06, + "loss": 0.8013, + "step": 5328 + }, + { + "epoch": 0.8209472468232576, + "grad_norm": 0.8463807106018066, + "learning_rate": 7.698767501777644e-06, + "loss": 0.9006, + "step": 5330 + }, + { + "epoch": 0.8212552945706585, + "grad_norm": 0.6860989928245544, + "learning_rate": 7.672987555101907e-06, + "loss": 0.9272, + "step": 5332 + }, + { + "epoch": 0.8215633423180593, + "grad_norm": 0.7387297749519348, + "learning_rate": 7.647247256335955e-06, + "loss": 1.1595, + "step": 5334 + }, + { + "epoch": 0.8218713900654602, + "grad_norm": 0.8675945401191711, + "learning_rate": 7.621546629590814e-06, + "loss": 0.9349, + "step": 5336 + }, + { + "epoch": 0.822179437812861, + "grad_norm": 0.6692162156105042, + "learning_rate": 7.595885698940408e-06, + "loss": 1.5799, + "step": 5338 + }, + { + "epoch": 0.8224874855602619, + "grad_norm": 0.8025321364402771, + "learning_rate": 7.570264488421447e-06, + "loss": 1.1254, + "step": 5340 + }, + { + "epoch": 0.8227955333076626, + "grad_norm": 0.8939410448074341, + "learning_rate": 7.544683022033439e-06, + "loss": 0.8886, + "step": 5342 + }, + { + "epoch": 0.8231035810550635, + "grad_norm": 0.9006834030151367, + "learning_rate": 7.519141323738654e-06, + "loss": 1.1788, + "step": 5344 + }, + { + "epoch": 0.8234116288024644, + "grad_norm": 0.690070390701294, + "learning_rate": 7.493639417462122e-06, + "loss": 1.1011, + "step": 5346 + }, + { + "epoch": 0.8237196765498652, + "grad_norm": 0.633353054523468, + "learning_rate": 7.468177327091608e-06, + "loss": 1.5389, + "step": 5348 + }, + { + "epoch": 0.8240277242972661, + "grad_norm": 0.6479977965354919, + "learning_rate": 7.442755076477559e-06, + "loss": 0.9762, + "step": 5350 + }, + { + "epoch": 0.8243357720446669, + "grad_norm": 0.7376688718795776, + "learning_rate": 7.417372689433122e-06, + "loss": 0.8483, + "step": 5352 + }, + { + "epoch": 0.8246438197920678, + "grad_norm": 0.644631028175354, + "learning_rate": 7.3920301897340945e-06, + "loss": 0.7513, + "step": 5354 + }, + { + "epoch": 0.8249518675394686, + "grad_norm": 0.8042016625404358, + "learning_rate": 7.366727601118911e-06, + "loss": 0.9001, + "step": 5356 + }, + { + "epoch": 0.8252599152868695, + "grad_norm": 0.8431175947189331, + "learning_rate": 7.341464947288629e-06, + "loss": 0.7852, + "step": 5358 + }, + { + "epoch": 0.8255679630342703, + "grad_norm": 0.7508544921875, + "learning_rate": 7.3162422519068966e-06, + "loss": 1.2253, + "step": 5360 + }, + { + "epoch": 0.8258760107816712, + "grad_norm": 0.6164590120315552, + "learning_rate": 7.29105953859992e-06, + "loss": 0.8867, + "step": 5362 + }, + { + "epoch": 0.826184058529072, + "grad_norm": 0.8672991991043091, + "learning_rate": 7.265916830956471e-06, + "loss": 1.1461, + "step": 5364 + }, + { + "epoch": 0.8264921062764728, + "grad_norm": 0.6648744344711304, + "learning_rate": 7.240814152527842e-06, + "loss": 1.064, + "step": 5366 + }, + { + "epoch": 0.8268001540238737, + "grad_norm": 0.943226158618927, + "learning_rate": 7.21575152682783e-06, + "loss": 0.8295, + "step": 5368 + }, + { + "epoch": 0.8271082017712745, + "grad_norm": 0.7448985576629639, + "learning_rate": 7.190728977332706e-06, + "loss": 0.8785, + "step": 5370 + }, + { + "epoch": 0.8274162495186754, + "grad_norm": 1.0283031463623047, + "learning_rate": 7.165746527481215e-06, + "loss": 0.9735, + "step": 5372 + }, + { + "epoch": 0.8277242972660762, + "grad_norm": 0.8894911408424377, + "learning_rate": 7.140804200674528e-06, + "loss": 0.8816, + "step": 5374 + }, + { + "epoch": 0.8280323450134771, + "grad_norm": 0.8363602161407471, + "learning_rate": 7.115902020276239e-06, + "loss": 0.8805, + "step": 5376 + }, + { + "epoch": 0.8283403927608779, + "grad_norm": 0.7517054080963135, + "learning_rate": 7.09104000961236e-06, + "loss": 0.8632, + "step": 5378 + }, + { + "epoch": 0.8286484405082788, + "grad_norm": 0.8066534996032715, + "learning_rate": 7.066218191971219e-06, + "loss": 0.8924, + "step": 5380 + }, + { + "epoch": 0.8289564882556796, + "grad_norm": 0.8637999892234802, + "learning_rate": 7.04143659060354e-06, + "loss": 1.0192, + "step": 5382 + }, + { + "epoch": 0.8292645360030805, + "grad_norm": 1.3805404901504517, + "learning_rate": 7.016695228722358e-06, + "loss": 1.1149, + "step": 5384 + }, + { + "epoch": 0.8295725837504814, + "grad_norm": 0.7475231289863586, + "learning_rate": 6.991994129503054e-06, + "loss": 0.9157, + "step": 5386 + }, + { + "epoch": 0.8298806314978822, + "grad_norm": 0.9382774233818054, + "learning_rate": 6.967333316083225e-06, + "loss": 1.1835, + "step": 5388 + }, + { + "epoch": 0.8301886792452831, + "grad_norm": 0.6275261640548706, + "learning_rate": 6.942712811562773e-06, + "loss": 0.9794, + "step": 5390 + }, + { + "epoch": 0.8304967269926838, + "grad_norm": 0.7447370290756226, + "learning_rate": 6.918132639003877e-06, + "loss": 0.9704, + "step": 5392 + }, + { + "epoch": 0.8308047747400847, + "grad_norm": 0.5516697764396667, + "learning_rate": 6.893592821430856e-06, + "loss": 0.9636, + "step": 5394 + }, + { + "epoch": 0.8311128224874855, + "grad_norm": 0.7253577709197998, + "learning_rate": 6.869093381830277e-06, + "loss": 0.7356, + "step": 5396 + }, + { + "epoch": 0.8314208702348864, + "grad_norm": 0.678250789642334, + "learning_rate": 6.844634343150902e-06, + "loss": 0.8708, + "step": 5398 + }, + { + "epoch": 0.8317289179822872, + "grad_norm": 0.6933665871620178, + "learning_rate": 6.820215728303625e-06, + "loss": 1.0718, + "step": 5400 + }, + { + "epoch": 0.8317289179822872, + "eval_loss": 2.4066405296325684, + "eval_runtime": 736.1016, + "eval_samples_per_second": 2.717, + "eval_steps_per_second": 0.679, + "step": 5400 + }, + { + "epoch": 0.8320369657296881, + "grad_norm": 0.6631490588188171, + "learning_rate": 6.795837560161456e-06, + "loss": 0.9066, + "step": 5402 + }, + { + "epoch": 0.832345013477089, + "grad_norm": 0.9897297024726868, + "learning_rate": 6.771499861559538e-06, + "loss": 1.3744, + "step": 5404 + }, + { + "epoch": 0.8326530612244898, + "grad_norm": 0.7145732641220093, + "learning_rate": 6.747202655295126e-06, + "loss": 0.705, + "step": 5406 + }, + { + "epoch": 0.8329611089718907, + "grad_norm": 0.8757843375205994, + "learning_rate": 6.722945964127525e-06, + "loss": 0.9422, + "step": 5408 + }, + { + "epoch": 0.8332691567192915, + "grad_norm": 0.7739924192428589, + "learning_rate": 6.698729810778065e-06, + "loss": 0.7416, + "step": 5410 + }, + { + "epoch": 0.8335772044666924, + "grad_norm": 0.7572183012962341, + "learning_rate": 6.674554217930162e-06, + "loss": 0.8495, + "step": 5412 + }, + { + "epoch": 0.8338852522140932, + "grad_norm": 0.725858747959137, + "learning_rate": 6.650419208229186e-06, + "loss": 0.9817, + "step": 5414 + }, + { + "epoch": 0.834193299961494, + "grad_norm": 0.7561565041542053, + "learning_rate": 6.626324804282525e-06, + "loss": 0.8207, + "step": 5416 + }, + { + "epoch": 0.8345013477088948, + "grad_norm": 0.695875883102417, + "learning_rate": 6.6022710286595064e-06, + "loss": 0.8971, + "step": 5418 + }, + { + "epoch": 0.8348093954562957, + "grad_norm": 0.8448840379714966, + "learning_rate": 6.578257903891427e-06, + "loss": 0.7577, + "step": 5420 + }, + { + "epoch": 0.8351174432036965, + "grad_norm": 0.7509380578994751, + "learning_rate": 6.554285452471498e-06, + "loss": 1.0915, + "step": 5422 + }, + { + "epoch": 0.8354254909510974, + "grad_norm": 0.9362658262252808, + "learning_rate": 6.530353696854791e-06, + "loss": 1.0626, + "step": 5424 + }, + { + "epoch": 0.8357335386984983, + "grad_norm": 0.5367106795310974, + "learning_rate": 6.506462659458329e-06, + "loss": 1.9706, + "step": 5426 + }, + { + "epoch": 0.8360415864458991, + "grad_norm": 1.0008156299591064, + "learning_rate": 6.482612362660945e-06, + "loss": 0.9507, + "step": 5428 + }, + { + "epoch": 0.8363496341933, + "grad_norm": 0.7898880243301392, + "learning_rate": 6.458802828803323e-06, + "loss": 0.9741, + "step": 5430 + }, + { + "epoch": 0.8366576819407008, + "grad_norm": 0.719279944896698, + "learning_rate": 6.435034080187969e-06, + "loss": 1.0709, + "step": 5432 + }, + { + "epoch": 0.8369657296881017, + "grad_norm": 0.6773216128349304, + "learning_rate": 6.411306139079176e-06, + "loss": 0.8413, + "step": 5434 + }, + { + "epoch": 0.8372737774355025, + "grad_norm": 0.6237614750862122, + "learning_rate": 6.387619027703018e-06, + "loss": 0.9458, + "step": 5436 + }, + { + "epoch": 0.8375818251829034, + "grad_norm": 0.7203165292739868, + "learning_rate": 6.3639727682473225e-06, + "loss": 0.9038, + "step": 5438 + }, + { + "epoch": 0.8378898729303041, + "grad_norm": 0.7765570282936096, + "learning_rate": 6.34036738286165e-06, + "loss": 1.0212, + "step": 5440 + }, + { + "epoch": 0.838197920677705, + "grad_norm": 0.8942344784736633, + "learning_rate": 6.316802893657275e-06, + "loss": 2.2842, + "step": 5442 + }, + { + "epoch": 0.8385059684251059, + "grad_norm": 1.1004189252853394, + "learning_rate": 6.293279322707169e-06, + "loss": 0.8196, + "step": 5444 + }, + { + "epoch": 0.8388140161725067, + "grad_norm": 0.819892168045044, + "learning_rate": 6.269796692045965e-06, + "loss": 0.8763, + "step": 5446 + }, + { + "epoch": 0.8391220639199076, + "grad_norm": 1.0760273933410645, + "learning_rate": 6.246355023669958e-06, + "loss": 1.3298, + "step": 5448 + }, + { + "epoch": 0.8394301116673084, + "grad_norm": 0.7805618047714233, + "learning_rate": 6.222954339537063e-06, + "loss": 0.9231, + "step": 5450 + }, + { + "epoch": 0.8397381594147093, + "grad_norm": 1.1943398714065552, + "learning_rate": 6.19959466156681e-06, + "loss": 1.0681, + "step": 5452 + }, + { + "epoch": 0.8400462071621101, + "grad_norm": 0.6588060259819031, + "learning_rate": 6.17627601164032e-06, + "loss": 0.8829, + "step": 5454 + }, + { + "epoch": 0.840354254909511, + "grad_norm": 0.9226628541946411, + "learning_rate": 6.152998411600269e-06, + "loss": 0.7163, + "step": 5456 + }, + { + "epoch": 0.8406623026569118, + "grad_norm": 0.7645328044891357, + "learning_rate": 6.1297618832509285e-06, + "loss": 0.7472, + "step": 5458 + }, + { + "epoch": 0.8409703504043127, + "grad_norm": 0.7715820670127869, + "learning_rate": 6.106566448358025e-06, + "loss": 1.4482, + "step": 5460 + }, + { + "epoch": 0.8412783981517136, + "grad_norm": 0.5981425046920776, + "learning_rate": 6.083412128648847e-06, + "loss": 0.7663, + "step": 5462 + }, + { + "epoch": 0.8415864458991144, + "grad_norm": 0.7702302932739258, + "learning_rate": 6.060298945812143e-06, + "loss": 1.0621, + "step": 5464 + }, + { + "epoch": 0.8418944936465153, + "grad_norm": 0.5986312627792358, + "learning_rate": 6.037226921498168e-06, + "loss": 1.6585, + "step": 5466 + }, + { + "epoch": 0.842202541393916, + "grad_norm": 0.7191674113273621, + "learning_rate": 6.014196077318562e-06, + "loss": 0.8664, + "step": 5468 + }, + { + "epoch": 0.8425105891413169, + "grad_norm": 1.0426437854766846, + "learning_rate": 5.9912064348464305e-06, + "loss": 1.0364, + "step": 5470 + }, + { + "epoch": 0.8428186368887177, + "grad_norm": 0.9384727478027344, + "learning_rate": 5.96825801561629e-06, + "loss": 1.0487, + "step": 5472 + }, + { + "epoch": 0.8431266846361186, + "grad_norm": 0.7399026155471802, + "learning_rate": 5.945350841124037e-06, + "loss": 0.8254, + "step": 5474 + }, + { + "epoch": 0.8434347323835194, + "grad_norm": 0.7573807835578918, + "learning_rate": 5.922484932826899e-06, + "loss": 0.9785, + "step": 5476 + }, + { + "epoch": 0.8437427801309203, + "grad_norm": 0.6942792534828186, + "learning_rate": 5.8996603121435065e-06, + "loss": 1.0023, + "step": 5478 + }, + { + "epoch": 0.8440508278783211, + "grad_norm": 0.7304031252861023, + "learning_rate": 5.8768770004537894e-06, + "loss": 1.9075, + "step": 5480 + }, + { + "epoch": 0.844358875625722, + "grad_norm": 0.8708906769752502, + "learning_rate": 5.854135019098961e-06, + "loss": 0.8669, + "step": 5482 + }, + { + "epoch": 0.8446669233731229, + "grad_norm": 0.7126341462135315, + "learning_rate": 5.831434389381546e-06, + "loss": 0.925, + "step": 5484 + }, + { + "epoch": 0.8449749711205237, + "grad_norm": 0.8131961226463318, + "learning_rate": 5.808775132565341e-06, + "loss": 0.8507, + "step": 5486 + }, + { + "epoch": 0.8452830188679246, + "grad_norm": 0.852538526058197, + "learning_rate": 5.786157269875386e-06, + "loss": 0.7647, + "step": 5488 + }, + { + "epoch": 0.8455910666153253, + "grad_norm": 0.785875141620636, + "learning_rate": 5.763580822497905e-06, + "loss": 0.7627, + "step": 5490 + }, + { + "epoch": 0.8458991143627262, + "grad_norm": 0.9879255294799805, + "learning_rate": 5.741045811580387e-06, + "loss": 0.9807, + "step": 5492 + }, + { + "epoch": 0.846207162110127, + "grad_norm": 0.8181496262550354, + "learning_rate": 5.71855225823148e-06, + "loss": 0.926, + "step": 5494 + }, + { + "epoch": 0.8465152098575279, + "grad_norm": 1.0856611728668213, + "learning_rate": 5.696100183521002e-06, + "loss": 1.0308, + "step": 5496 + }, + { + "epoch": 0.8468232576049287, + "grad_norm": 0.5956221222877502, + "learning_rate": 5.673689608479893e-06, + "loss": 1.0491, + "step": 5498 + }, + { + "epoch": 0.8471313053523296, + "grad_norm": 0.8387410044670105, + "learning_rate": 5.65132055410027e-06, + "loss": 1.1071, + "step": 5500 + }, + { + "epoch": 0.8474393530997305, + "grad_norm": 0.8523972034454346, + "learning_rate": 5.628993041335334e-06, + "loss": 0.8855, + "step": 5502 + }, + { + "epoch": 0.8477474008471313, + "grad_norm": 0.5948216915130615, + "learning_rate": 5.606707091099334e-06, + "loss": 0.8292, + "step": 5504 + }, + { + "epoch": 0.8480554485945322, + "grad_norm": 0.8574666976928711, + "learning_rate": 5.584462724267653e-06, + "loss": 1.0905, + "step": 5506 + }, + { + "epoch": 0.848363496341933, + "grad_norm": 0.92282634973526, + "learning_rate": 5.562259961676691e-06, + "loss": 0.8807, + "step": 5508 + }, + { + "epoch": 0.8486715440893339, + "grad_norm": 0.5856600999832153, + "learning_rate": 5.540098824123874e-06, + "loss": 0.7359, + "step": 5510 + }, + { + "epoch": 0.8489795918367347, + "grad_norm": 0.7432599067687988, + "learning_rate": 5.51797933236764e-06, + "loss": 0.8005, + "step": 5512 + }, + { + "epoch": 0.8492876395841356, + "grad_norm": 0.8326647281646729, + "learning_rate": 5.495901507127427e-06, + "loss": 0.8521, + "step": 5514 + }, + { + "epoch": 0.8495956873315363, + "grad_norm": 0.9699840545654297, + "learning_rate": 5.47386536908363e-06, + "loss": 1.0648, + "step": 5516 + }, + { + "epoch": 0.8499037350789372, + "grad_norm": 0.6920575499534607, + "learning_rate": 5.451870938877607e-06, + "loss": 1.1668, + "step": 5518 + }, + { + "epoch": 0.8502117828263381, + "grad_norm": 0.8997227549552917, + "learning_rate": 5.429918237111642e-06, + "loss": 2.2459, + "step": 5520 + }, + { + "epoch": 0.8505198305737389, + "grad_norm": 0.7757606506347656, + "learning_rate": 5.408007284348931e-06, + "loss": 0.8366, + "step": 5522 + }, + { + "epoch": 0.8508278783211398, + "grad_norm": 0.7504420876502991, + "learning_rate": 5.386138101113569e-06, + "loss": 0.8136, + "step": 5524 + }, + { + "epoch": 0.8511359260685406, + "grad_norm": 0.6319246292114258, + "learning_rate": 5.364310707890518e-06, + "loss": 1.1945, + "step": 5526 + }, + { + "epoch": 0.8514439738159415, + "grad_norm": 0.7972545623779297, + "learning_rate": 5.342525125125603e-06, + "loss": 1.0876, + "step": 5528 + }, + { + "epoch": 0.8517520215633423, + "grad_norm": 0.7912672162055969, + "learning_rate": 5.320781373225481e-06, + "loss": 1.05, + "step": 5530 + }, + { + "epoch": 0.8520600693107432, + "grad_norm": 0.8491456508636475, + "learning_rate": 5.299079472557622e-06, + "loss": 1.2363, + "step": 5532 + }, + { + "epoch": 0.852368117058144, + "grad_norm": 0.8818978071212769, + "learning_rate": 5.277419443450293e-06, + "loss": 0.8654, + "step": 5534 + }, + { + "epoch": 0.8526761648055449, + "grad_norm": 0.7931317687034607, + "learning_rate": 5.255801306192559e-06, + "loss": 0.8854, + "step": 5536 + }, + { + "epoch": 0.8529842125529457, + "grad_norm": 1.018884301185608, + "learning_rate": 5.234225081034216e-06, + "loss": 0.9068, + "step": 5538 + }, + { + "epoch": 0.8532922603003466, + "grad_norm": 0.6841465830802917, + "learning_rate": 5.21269078818582e-06, + "loss": 0.8391, + "step": 5540 + }, + { + "epoch": 0.8536003080477474, + "grad_norm": 0.8705657720565796, + "learning_rate": 5.191198447818646e-06, + "loss": 1.657, + "step": 5542 + }, + { + "epoch": 0.8539083557951482, + "grad_norm": 0.827742874622345, + "learning_rate": 5.169748080064651e-06, + "loss": 0.9083, + "step": 5544 + }, + { + "epoch": 0.8542164035425491, + "grad_norm": 0.5416191816329956, + "learning_rate": 5.1483397050165365e-06, + "loss": 0.8433, + "step": 5546 + }, + { + "epoch": 0.8545244512899499, + "grad_norm": 0.8019801378250122, + "learning_rate": 5.126973342727587e-06, + "loss": 0.9913, + "step": 5548 + }, + { + "epoch": 0.8548324990373508, + "grad_norm": 0.7593039274215698, + "learning_rate": 5.105649013211777e-06, + "loss": 0.8214, + "step": 5550 + }, + { + "epoch": 0.8551405467847516, + "grad_norm": 1.0784088373184204, + "learning_rate": 5.0843667364437246e-06, + "loss": 1.0835, + "step": 5552 + }, + { + "epoch": 0.8554485945321525, + "grad_norm": 0.7842487096786499, + "learning_rate": 5.063126532358642e-06, + "loss": 0.715, + "step": 5554 + }, + { + "epoch": 0.8557566422795533, + "grad_norm": 0.9213837385177612, + "learning_rate": 5.041928420852299e-06, + "loss": 0.974, + "step": 5556 + }, + { + "epoch": 0.8560646900269542, + "grad_norm": 0.7843840718269348, + "learning_rate": 5.020772421781073e-06, + "loss": 0.8333, + "step": 5558 + }, + { + "epoch": 0.8563727377743551, + "grad_norm": 0.6879072785377502, + "learning_rate": 4.999658554961917e-06, + "loss": 0.7875, + "step": 5560 + }, + { + "epoch": 0.8566807855217559, + "grad_norm": 0.9496925473213196, + "learning_rate": 4.97858684017225e-06, + "loss": 0.9994, + "step": 5562 + }, + { + "epoch": 0.8569888332691568, + "grad_norm": 0.7569987177848816, + "learning_rate": 4.957557297150056e-06, + "loss": 0.878, + "step": 5564 + }, + { + "epoch": 0.8572968810165575, + "grad_norm": 0.7940292954444885, + "learning_rate": 4.936569945593817e-06, + "loss": 0.7878, + "step": 5566 + }, + { + "epoch": 0.8576049287639584, + "grad_norm": 1.0187393426895142, + "learning_rate": 4.915624805162489e-06, + "loss": 0.9715, + "step": 5568 + }, + { + "epoch": 0.8579129765113592, + "grad_norm": 0.5202600955963135, + "learning_rate": 4.894721895475452e-06, + "loss": 0.94, + "step": 5570 + }, + { + "epoch": 0.8582210242587601, + "grad_norm": 0.9687458872795105, + "learning_rate": 4.873861236112587e-06, + "loss": 0.9434, + "step": 5572 + }, + { + "epoch": 0.8585290720061609, + "grad_norm": 0.7784386277198792, + "learning_rate": 4.853042846614159e-06, + "loss": 0.9892, + "step": 5574 + }, + { + "epoch": 0.8588371197535618, + "grad_norm": 0.98586106300354, + "learning_rate": 4.832266746480862e-06, + "loss": 1.1127, + "step": 5576 + }, + { + "epoch": 0.8591451675009627, + "grad_norm": 0.7746574282646179, + "learning_rate": 4.811532955173742e-06, + "loss": 0.9111, + "step": 5578 + }, + { + "epoch": 0.8594532152483635, + "grad_norm": 0.850456178188324, + "learning_rate": 4.790841492114256e-06, + "loss": 0.9865, + "step": 5580 + }, + { + "epoch": 0.8597612629957644, + "grad_norm": 0.6116123199462891, + "learning_rate": 4.770192376684196e-06, + "loss": 0.9438, + "step": 5582 + }, + { + "epoch": 0.8600693107431652, + "grad_norm": 0.9664915204048157, + "learning_rate": 4.749585628225678e-06, + "loss": 0.9979, + "step": 5584 + }, + { + "epoch": 0.8603773584905661, + "grad_norm": 0.6345783472061157, + "learning_rate": 4.729021266041139e-06, + "loss": 1.0397, + "step": 5586 + }, + { + "epoch": 0.8606854062379669, + "grad_norm": 0.9745763540267944, + "learning_rate": 4.70849930939331e-06, + "loss": 1.0937, + "step": 5588 + }, + { + "epoch": 0.8609934539853678, + "grad_norm": 1.003058910369873, + "learning_rate": 4.688019777505214e-06, + "loss": 1.1315, + "step": 5590 + }, + { + "epoch": 0.8613015017327685, + "grad_norm": 0.6395254731178284, + "learning_rate": 4.667582689560113e-06, + "loss": 0.8876, + "step": 5592 + }, + { + "epoch": 0.8616095494801694, + "grad_norm": 0.6765961647033691, + "learning_rate": 4.64718806470153e-06, + "loss": 1.0269, + "step": 5594 + }, + { + "epoch": 0.8619175972275702, + "grad_norm": 0.6591092944145203, + "learning_rate": 4.626835922033201e-06, + "loss": 1.0236, + "step": 5596 + }, + { + "epoch": 0.8622256449749711, + "grad_norm": 0.7652791738510132, + "learning_rate": 4.606526280619072e-06, + "loss": 0.9287, + "step": 5598 + }, + { + "epoch": 0.862533692722372, + "grad_norm": 0.8166400790214539, + "learning_rate": 4.586259159483286e-06, + "loss": 0.817, + "step": 5600 + }, + { + "epoch": 0.8628417404697728, + "grad_norm": 0.8276070952415466, + "learning_rate": 4.56603457761014e-06, + "loss": 2.0614, + "step": 5602 + }, + { + "epoch": 0.8631497882171737, + "grad_norm": 0.8190547823905945, + "learning_rate": 4.545852553944102e-06, + "loss": 1.0747, + "step": 5604 + }, + { + "epoch": 0.8634578359645745, + "grad_norm": 0.9670174717903137, + "learning_rate": 4.525713107389762e-06, + "loss": 1.0557, + "step": 5606 + }, + { + "epoch": 0.8637658837119754, + "grad_norm": 0.7336003184318542, + "learning_rate": 4.505616256811835e-06, + "loss": 0.8337, + "step": 5608 + }, + { + "epoch": 0.8640739314593762, + "grad_norm": 0.8529759049415588, + "learning_rate": 4.485562021035133e-06, + "loss": 0.9215, + "step": 5610 + }, + { + "epoch": 0.8643819792067771, + "grad_norm": 0.7147994637489319, + "learning_rate": 4.465550418844561e-06, + "loss": 0.7897, + "step": 5612 + }, + { + "epoch": 0.8646900269541778, + "grad_norm": 0.9193140268325806, + "learning_rate": 4.445581468985066e-06, + "loss": 0.8149, + "step": 5614 + }, + { + "epoch": 0.8649980747015787, + "grad_norm": 0.6203229427337646, + "learning_rate": 4.425655190161671e-06, + "loss": 0.9411, + "step": 5616 + }, + { + "epoch": 0.8653061224489796, + "grad_norm": 1.2721714973449707, + "learning_rate": 4.405771601039399e-06, + "loss": 0.9375, + "step": 5618 + }, + { + "epoch": 0.8656141701963804, + "grad_norm": 0.9241076707839966, + "learning_rate": 4.385930720243314e-06, + "loss": 0.8362, + "step": 5620 + }, + { + "epoch": 0.8659222179437813, + "grad_norm": 0.859778106212616, + "learning_rate": 4.366132566358455e-06, + "loss": 1.2562, + "step": 5622 + }, + { + "epoch": 0.8662302656911821, + "grad_norm": 0.7657763957977295, + "learning_rate": 4.346377157929838e-06, + "loss": 0.8539, + "step": 5624 + }, + { + "epoch": 0.866538313438583, + "grad_norm": 0.7467623353004456, + "learning_rate": 4.32666451346247e-06, + "loss": 1.0287, + "step": 5626 + }, + { + "epoch": 0.8668463611859838, + "grad_norm": 1.0547900199890137, + "learning_rate": 4.306994651421253e-06, + "loss": 1.0037, + "step": 5628 + }, + { + "epoch": 0.8671544089333847, + "grad_norm": 0.7862194180488586, + "learning_rate": 4.287367590231045e-06, + "loss": 0.7485, + "step": 5630 + }, + { + "epoch": 0.8674624566807855, + "grad_norm": 0.6208391785621643, + "learning_rate": 4.2677833482766e-06, + "loss": 1.0326, + "step": 5632 + }, + { + "epoch": 0.8677705044281864, + "grad_norm": 0.8460115790367126, + "learning_rate": 4.248241943902592e-06, + "loss": 1.4913, + "step": 5634 + }, + { + "epoch": 0.8680785521755873, + "grad_norm": 0.7300686240196228, + "learning_rate": 4.2287433954135205e-06, + "loss": 0.978, + "step": 5636 + }, + { + "epoch": 0.8683865999229881, + "grad_norm": 0.7100600004196167, + "learning_rate": 4.2092877210737684e-06, + "loss": 1.0427, + "step": 5638 + }, + { + "epoch": 0.868694647670389, + "grad_norm": 0.7860296368598938, + "learning_rate": 4.189874939107574e-06, + "loss": 0.8127, + "step": 5640 + }, + { + "epoch": 0.8690026954177897, + "grad_norm": 0.9475097060203552, + "learning_rate": 4.170505067698977e-06, + "loss": 0.9262, + "step": 5642 + }, + { + "epoch": 0.8693107431651906, + "grad_norm": 0.6762198805809021, + "learning_rate": 4.1511781249918e-06, + "loss": 1.0889, + "step": 5644 + }, + { + "epoch": 0.8696187909125914, + "grad_norm": 1.0616062879562378, + "learning_rate": 4.131894129089709e-06, + "loss": 0.8648, + "step": 5646 + }, + { + "epoch": 0.8699268386599923, + "grad_norm": 0.8382073640823364, + "learning_rate": 4.112653098056113e-06, + "loss": 0.9439, + "step": 5648 + }, + { + "epoch": 0.8702348864073931, + "grad_norm": 0.7744151949882507, + "learning_rate": 4.0934550499141575e-06, + "loss": 0.8389, + "step": 5650 + }, + { + "epoch": 0.870542934154794, + "grad_norm": 0.9079608917236328, + "learning_rate": 4.074300002646742e-06, + "loss": 1.0208, + "step": 5652 + }, + { + "epoch": 0.8708509819021948, + "grad_norm": 0.7319706678390503, + "learning_rate": 4.055187974196511e-06, + "loss": 0.8994, + "step": 5654 + }, + { + "epoch": 0.8711590296495957, + "grad_norm": 1.06355881690979, + "learning_rate": 4.036118982465787e-06, + "loss": 1.0155, + "step": 5656 + }, + { + "epoch": 0.8714670773969966, + "grad_norm": 0.8588384389877319, + "learning_rate": 4.01709304531655e-06, + "loss": 1.2075, + "step": 5658 + }, + { + "epoch": 0.8717751251443974, + "grad_norm": 1.1078176498413086, + "learning_rate": 3.998110180570525e-06, + "loss": 0.8002, + "step": 5660 + }, + { + "epoch": 0.8720831728917983, + "grad_norm": 0.8839870691299438, + "learning_rate": 3.979170406009031e-06, + "loss": 1.2371, + "step": 5662 + }, + { + "epoch": 0.872391220639199, + "grad_norm": 1.0431289672851562, + "learning_rate": 3.960273739373044e-06, + "loss": 1.0638, + "step": 5664 + }, + { + "epoch": 0.8726992683866, + "grad_norm": 0.6601695418357849, + "learning_rate": 3.941420198363166e-06, + "loss": 0.9727, + "step": 5666 + }, + { + "epoch": 0.8730073161340007, + "grad_norm": 0.8522267937660217, + "learning_rate": 3.922609800639587e-06, + "loss": 0.7752, + "step": 5668 + }, + { + "epoch": 0.8733153638814016, + "grad_norm": 0.9073500633239746, + "learning_rate": 3.903842563822102e-06, + "loss": 0.8728, + "step": 5670 + }, + { + "epoch": 0.8736234116288024, + "grad_norm": 0.6048290133476257, + "learning_rate": 3.885118505490065e-06, + "loss": 0.8422, + "step": 5672 + }, + { + "epoch": 0.8739314593762033, + "grad_norm": 0.5503186583518982, + "learning_rate": 3.866437643182391e-06, + "loss": 0.8766, + "step": 5674 + }, + { + "epoch": 0.8742395071236042, + "grad_norm": 0.9213413000106812, + "learning_rate": 3.847799994397527e-06, + "loss": 1.0147, + "step": 5676 + }, + { + "epoch": 0.874547554871005, + "grad_norm": 0.7165212035179138, + "learning_rate": 3.829205576593448e-06, + "loss": 1.0482, + "step": 5678 + }, + { + "epoch": 0.8748556026184059, + "grad_norm": 0.979213297367096, + "learning_rate": 3.810654407187636e-06, + "loss": 0.9725, + "step": 5680 + }, + { + "epoch": 0.8751636503658067, + "grad_norm": 0.9179180860519409, + "learning_rate": 3.7921465035570537e-06, + "loss": 1.2204, + "step": 5682 + }, + { + "epoch": 0.8754716981132076, + "grad_norm": 0.6514227986335754, + "learning_rate": 3.773681883038138e-06, + "loss": 2.1307, + "step": 5684 + }, + { + "epoch": 0.8757797458606084, + "grad_norm": 1.0250710248947144, + "learning_rate": 3.755260562926788e-06, + "loss": 0.9711, + "step": 5686 + }, + { + "epoch": 0.8760877936080093, + "grad_norm": 0.9408641457557678, + "learning_rate": 3.7368825604783386e-06, + "loss": 1.7764, + "step": 5688 + }, + { + "epoch": 0.87639584135541, + "grad_norm": 0.9162575006484985, + "learning_rate": 3.7185478929075536e-06, + "loss": 0.6678, + "step": 5690 + }, + { + "epoch": 0.8767038891028109, + "grad_norm": 0.7940691709518433, + "learning_rate": 3.7002565773886e-06, + "loss": 0.8517, + "step": 5692 + }, + { + "epoch": 0.8770119368502118, + "grad_norm": 0.8024888038635254, + "learning_rate": 3.6820086310550395e-06, + "loss": 0.9191, + "step": 5694 + }, + { + "epoch": 0.8773199845976126, + "grad_norm": 0.7842996716499329, + "learning_rate": 3.6638040709998044e-06, + "loss": 1.1716, + "step": 5696 + }, + { + "epoch": 0.8776280323450135, + "grad_norm": 0.6878358721733093, + "learning_rate": 3.6456429142751823e-06, + "loss": 0.7303, + "step": 5698 + }, + { + "epoch": 0.8779360800924143, + "grad_norm": 0.792102038860321, + "learning_rate": 3.6275251778928487e-06, + "loss": 1.0849, + "step": 5700 + }, + { + "epoch": 0.8779360800924143, + "eval_loss": 2.3906445503234863, + "eval_runtime": 737.0453, + "eval_samples_per_second": 2.714, + "eval_steps_per_second": 0.678, + "step": 5700 + } + ], + "logging_steps": 2, + "max_steps": 6492, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 150, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 5.193565032834662e+18, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}