{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.1, "eval_steps": 100, "global_step": 100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.001, "grad_norm": 0.4723578691482544, "learning_rate": 5.000000000000001e-07, "loss": 15.0023, "loss/crossentropy": 2.8106061220169067, "loss/hidden": 0.0, "loss/logits": 0.15208163857460022, "loss/reg": 14.850236892700195, "step": 1 }, { "epoch": 0.002, "grad_norm": 0.3746773898601532, "learning_rate": 1.0000000000000002e-06, "loss": 14.8245, "loss/crossentropy": 2.88076388835907, "loss/hidden": 0.0, "loss/logits": 0.13159187883138657, "loss/reg": 14.692875862121582, "step": 2 }, { "epoch": 0.003, "grad_norm": 0.39371100068092346, "learning_rate": 1.5e-06, "loss": 14.6298, "loss/crossentropy": 2.8186020851135254, "loss/hidden": 0.0, "loss/logits": 0.13105076551437378, "loss/reg": 14.4987154006958, "step": 3 }, { "epoch": 0.004, "grad_norm": 0.4312513470649719, "learning_rate": 2.0000000000000003e-06, "loss": 14.4644, "loss/crossentropy": 2.561935782432556, "loss/hidden": 0.0, "loss/logits": 0.15662836283445358, "loss/reg": 14.307746887207031, "step": 4 }, { "epoch": 0.005, "grad_norm": 0.5712147951126099, "learning_rate": 2.5e-06, "loss": 14.2884, "loss/crossentropy": 2.6744261980056763, "loss/hidden": 0.0, "loss/logits": 0.16902200877666473, "loss/reg": 14.119378089904785, "step": 5 }, { "epoch": 0.006, "grad_norm": 0.4991922080516815, "learning_rate": 3e-06, "loss": 14.0796, "loss/crossentropy": 2.541142702102661, "loss/hidden": 0.0, "loss/logits": 0.14546513557434082, "loss/reg": 13.934137344360352, "step": 6 }, { "epoch": 0.007, "grad_norm": 0.394466370344162, "learning_rate": 3.5000000000000004e-06, "loss": 13.8978, "loss/crossentropy": 2.863157868385315, "loss/hidden": 0.0, "loss/logits": 0.14624819159507751, "loss/reg": 13.751592636108398, "step": 7 }, { "epoch": 0.008, "grad_norm": 0.5288362503051758, "learning_rate": 4.000000000000001e-06, "loss": 13.7514, "loss/crossentropy": 2.53192400932312, "loss/hidden": 0.0, "loss/logits": 0.17936921864748, "loss/reg": 13.571989059448242, "step": 8 }, { "epoch": 0.009, "grad_norm": 0.3661612570285797, "learning_rate": 4.5e-06, "loss": 13.5249, "loss/crossentropy": 2.6582940816879272, "loss/hidden": 0.0, "loss/logits": 0.13002116978168488, "loss/reg": 13.39488697052002, "step": 9 }, { "epoch": 0.01, "grad_norm": 0.37344738841056824, "learning_rate": 5e-06, "loss": 13.3611, "loss/crossentropy": 2.8080869913101196, "loss/hidden": 0.0, "loss/logits": 0.1404452547430992, "loss/reg": 13.220650672912598, "step": 10 }, { "epoch": 0.011, "grad_norm": 0.47446200251579285, "learning_rate": 5.500000000000001e-06, "loss": 13.2185, "loss/crossentropy": 2.7942490577697754, "loss/hidden": 0.0, "loss/logits": 0.16937557607889175, "loss/reg": 13.049126625061035, "step": 11 }, { "epoch": 0.012, "grad_norm": 0.41823914647102356, "learning_rate": 6e-06, "loss": 13.0431, "loss/crossentropy": 2.7883923053741455, "loss/hidden": 0.0, "loss/logits": 0.16317125409841537, "loss/reg": 12.879945755004883, "step": 12 }, { "epoch": 0.013, "grad_norm": 0.3810077905654907, "learning_rate": 6.5000000000000004e-06, "loss": 12.8458, "loss/crossentropy": 2.5887070894241333, "loss/hidden": 0.0, "loss/logits": 0.13234061002731323, "loss/reg": 12.713478088378906, "step": 13 }, { "epoch": 0.014, "grad_norm": 0.861165463924408, "learning_rate": 7.000000000000001e-06, "loss": 12.7646, "loss/crossentropy": 3.0489625930786133, "loss/hidden": 0.0, "loss/logits": 0.2151956632733345, "loss/reg": 12.549367904663086, "step": 14 }, { "epoch": 0.015, "grad_norm": 0.3999025225639343, "learning_rate": 7.5e-06, "loss": 12.5428, "loss/crossentropy": 2.6218740940093994, "loss/hidden": 0.0, "loss/logits": 0.15504448860883713, "loss/reg": 12.38780403137207, "step": 15 }, { "epoch": 0.016, "grad_norm": 0.3485129177570343, "grad_norm_var": 0.015782494207215392, "learning_rate": 8.000000000000001e-06, "loss": 12.3624, "loss/crossentropy": 2.704634428024292, "loss/hidden": 0.0, "loss/logits": 0.1335841417312622, "loss/reg": 12.228797912597656, "step": 16 }, { "epoch": 0.017, "grad_norm": 0.394564151763916, "grad_norm_var": 0.015986270113443057, "learning_rate": 8.500000000000002e-06, "loss": 12.2458, "loss/crossentropy": 2.6254637241363525, "loss/hidden": 0.0, "loss/logits": 0.17384596914052963, "loss/reg": 12.071925163269043, "step": 17 }, { "epoch": 0.018, "grad_norm": 0.359417200088501, "grad_norm_var": 0.016155457953818328, "learning_rate": 9e-06, "loss": 12.0449, "loss/crossentropy": 2.6955989599227905, "loss/hidden": 0.0, "loss/logits": 0.12739497795701027, "loss/reg": 11.917520523071289, "step": 18 }, { "epoch": 0.019, "grad_norm": 0.4362143278121948, "grad_norm_var": 0.01595094581098607, "learning_rate": 9.5e-06, "loss": 11.9337, "loss/crossentropy": 2.6590176820755005, "loss/hidden": 0.0, "loss/logits": 0.16834121942520142, "loss/reg": 11.765388488769531, "step": 19 }, { "epoch": 0.02, "grad_norm": 0.3776922821998596, "grad_norm_var": 0.01628110467363616, "learning_rate": 1e-05, "loss": 11.7405, "loss/crossentropy": 2.9381775856018066, "loss/hidden": 0.0, "loss/logits": 0.12479494512081146, "loss/reg": 11.615675926208496, "step": 20 }, { "epoch": 0.021, "grad_norm": 0.3638933598995209, "grad_norm_var": 0.01558998159507059, "learning_rate": 1.05e-05, "loss": 11.5966, "loss/crossentropy": 2.737419009208679, "loss/hidden": 0.0, "loss/logits": 0.12864574417471886, "loss/reg": 11.467905044555664, "step": 21 }, { "epoch": 0.022, "grad_norm": 0.3599083125591278, "grad_norm_var": 0.01563029096619477, "learning_rate": 1.1000000000000001e-05, "loss": 11.4504, "loss/crossentropy": 2.680889368057251, "loss/hidden": 0.0, "loss/logits": 0.1280709058046341, "loss/reg": 11.322328567504883, "step": 22 }, { "epoch": 0.023, "grad_norm": 0.5118765830993652, "grad_norm_var": 0.015976795681526535, "learning_rate": 1.1500000000000002e-05, "loss": 11.3452, "loss/crossentropy": 2.6692968606948853, "loss/hidden": 0.0, "loss/logits": 0.16634425520896912, "loss/reg": 11.178874969482422, "step": 23 }, { "epoch": 0.024, "grad_norm": 0.3334847688674927, "grad_norm_var": 0.015910143486327633, "learning_rate": 1.2e-05, "loss": 11.1616, "loss/crossentropy": 2.7403935194015503, "loss/hidden": 0.0, "loss/logits": 0.12420090287923813, "loss/reg": 11.037442207336426, "step": 24 }, { "epoch": 0.025, "grad_norm": 0.3828985095024109, "grad_norm_var": 0.015801931574252005, "learning_rate": 1.25e-05, "loss": 11.0467, "loss/crossentropy": 2.796410322189331, "loss/hidden": 0.0, "loss/logits": 0.14872785657644272, "loss/reg": 10.897927284240723, "step": 25 }, { "epoch": 0.026, "grad_norm": 0.434479683637619, "grad_norm_var": 0.015627081016672508, "learning_rate": 1.3000000000000001e-05, "loss": 10.9099, "loss/crossentropy": 2.726790428161621, "loss/hidden": 0.0, "loss/logits": 0.14949394017457962, "loss/reg": 10.760404586791992, "step": 26 }, { "epoch": 0.027, "grad_norm": 0.3825664520263672, "grad_norm_var": 0.015577720555919354, "learning_rate": 1.3500000000000001e-05, "loss": 10.761, "loss/crossentropy": 2.6250252723693848, "loss/hidden": 0.0, "loss/logits": 0.13614310324192047, "loss/reg": 10.624829292297363, "step": 27 }, { "epoch": 0.028, "grad_norm": 0.3759184777736664, "grad_norm_var": 0.015708703781819332, "learning_rate": 1.4000000000000001e-05, "loss": 10.6382, "loss/crossentropy": 2.835649013519287, "loss/hidden": 0.0, "loss/logits": 0.1468915417790413, "loss/reg": 10.491329193115234, "step": 28 }, { "epoch": 0.029, "grad_norm": 0.38468286395072937, "grad_norm_var": 0.015690946589658594, "learning_rate": 1.45e-05, "loss": 10.4996, "loss/crossentropy": 2.5807005167007446, "loss/hidden": 0.0, "loss/logits": 0.14014270156621933, "loss/reg": 10.359500885009766, "step": 29 }, { "epoch": 0.03, "grad_norm": 0.4535682201385498, "grad_norm_var": 0.002055153692638223, "learning_rate": 1.5e-05, "loss": 10.3735, "loss/crossentropy": 2.547928214073181, "loss/hidden": 0.0, "loss/logits": 0.14383359998464584, "loss/reg": 10.229703903198242, "step": 30 }, { "epoch": 0.031, "grad_norm": 0.38543379306793213, "grad_norm_var": 0.0020563179121854346, "learning_rate": 1.55e-05, "loss": 10.2439, "loss/crossentropy": 2.79032039642334, "loss/hidden": 0.0, "loss/logits": 0.1421559825539589, "loss/reg": 10.101773262023926, "step": 31 }, { "epoch": 0.032, "grad_norm": 0.33202216029167175, "grad_norm_var": 0.0021707343468757136, "learning_rate": 1.6000000000000003e-05, "loss": 10.0965, "loss/crossentropy": 2.7922052145004272, "loss/hidden": 0.0, "loss/logits": 0.12068676576018333, "loss/reg": 9.97581672668457, "step": 32 }, { "epoch": 0.033, "grad_norm": 0.39245933294296265, "grad_norm_var": 0.0021702323626646702, "learning_rate": 1.65e-05, "loss": 9.9865, "loss/crossentropy": 2.733828544616699, "loss/hidden": 0.0, "loss/logits": 0.1354268342256546, "loss/reg": 9.851117134094238, "step": 33 }, { "epoch": 0.034, "grad_norm": 0.38694247603416443, "grad_norm_var": 0.0020992626690788385, "learning_rate": 1.7000000000000003e-05, "loss": 9.8885, "loss/crossentropy": 2.777322292327881, "loss/hidden": 0.0, "loss/logits": 0.1599966138601303, "loss/reg": 9.7284574508667, "step": 34 }, { "epoch": 0.035, "grad_norm": 0.35561245679855347, "grad_norm_var": 0.0020449413010129036, "learning_rate": 1.75e-05, "loss": 9.7496, "loss/crossentropy": 2.7148683071136475, "loss/hidden": 0.0, "loss/logits": 0.14201582968235016, "loss/reg": 9.607542991638184, "step": 35 }, { "epoch": 0.036, "grad_norm": 0.4006306231021881, "grad_norm_var": 0.002045261355702799, "learning_rate": 1.8e-05, "loss": 9.635, "loss/crossentropy": 2.637976288795471, "loss/hidden": 0.0, "loss/logits": 0.14642605930566788, "loss/reg": 9.488553047180176, "step": 36 }, { "epoch": 0.037, "grad_norm": 0.41898205876350403, "grad_norm_var": 0.002044839434195215, "learning_rate": 1.85e-05, "loss": 9.5202, "loss/crossentropy": 2.777597665786743, "loss/hidden": 0.0, "loss/logits": 0.14908844977617264, "loss/reg": 9.371098518371582, "step": 37 }, { "epoch": 0.038, "grad_norm": 0.3961292803287506, "grad_norm_var": 0.001965975366123729, "learning_rate": 1.9e-05, "loss": 9.4067, "loss/crossentropy": 2.9033197164535522, "loss/hidden": 0.0, "loss/logits": 0.15159177035093307, "loss/reg": 9.255077362060547, "step": 38 }, { "epoch": 0.039, "grad_norm": 0.38804033398628235, "grad_norm_var": 0.0010025647229673696, "learning_rate": 1.9500000000000003e-05, "loss": 9.2802, "loss/crossentropy": 2.889930486679077, "loss/hidden": 0.0, "loss/logits": 0.1394604668021202, "loss/reg": 9.14071273803711, "step": 39 }, { "epoch": 0.04, "grad_norm": 0.42919155955314636, "grad_norm_var": 0.0008826965462539841, "learning_rate": 2e-05, "loss": 9.1743, "loss/crossentropy": 2.4006484746932983, "loss/hidden": 0.0, "loss/logits": 0.14666535705327988, "loss/reg": 9.027677536010742, "step": 40 }, { "epoch": 0.041, "grad_norm": 0.37885668873786926, "grad_norm_var": 0.0008895506586849048, "learning_rate": 2.05e-05, "loss": 9.0467, "loss/crossentropy": 2.7493255138397217, "loss/hidden": 0.0, "loss/logits": 0.13013121858239174, "loss/reg": 8.916520118713379, "step": 41 }, { "epoch": 0.042, "grad_norm": 0.5932357907295227, "grad_norm_var": 0.0033328458836597336, "learning_rate": 2.1e-05, "loss": 8.97, "loss/crossentropy": 2.6117889881134033, "loss/hidden": 0.0, "loss/logits": 0.16315819323062897, "loss/reg": 8.80688190460205, "step": 42 }, { "epoch": 0.043, "grad_norm": 0.3747418224811554, "grad_norm_var": 0.0033583994321603233, "learning_rate": 2.15e-05, "loss": 8.8396, "loss/crossentropy": 2.663694739341736, "loss/hidden": 0.0, "loss/logits": 0.14052008837461472, "loss/reg": 8.69911003112793, "step": 43 }, { "epoch": 0.044, "grad_norm": 0.4256257116794586, "grad_norm_var": 0.0033339815653853837, "learning_rate": 2.2000000000000003e-05, "loss": 8.726, "loss/crossentropy": 2.7881768941879272, "loss/hidden": 0.0, "loss/logits": 0.13326141238212585, "loss/reg": 8.592697143554688, "step": 44 }, { "epoch": 0.045, "grad_norm": 0.4122328460216522, "grad_norm_var": 0.0033030786394142473, "learning_rate": 2.25e-05, "loss": 8.6263, "loss/crossentropy": 2.7951114177703857, "loss/hidden": 0.0, "loss/logits": 0.13837838172912598, "loss/reg": 8.487950325012207, "step": 45 }, { "epoch": 0.046, "grad_norm": 0.4931789040565491, "grad_norm_var": 0.0036432243285688614, "learning_rate": 2.3000000000000003e-05, "loss": 8.5405, "loss/crossentropy": 2.80547297000885, "loss/hidden": 0.0, "loss/logits": 0.15613804012537003, "loss/reg": 8.384378433227539, "step": 46 }, { "epoch": 0.047, "grad_norm": 0.4260822832584381, "grad_norm_var": 0.003612225968443994, "learning_rate": 2.35e-05, "loss": 8.4325, "loss/crossentropy": 2.7571998834609985, "loss/hidden": 0.0, "loss/logits": 0.15075545758008957, "loss/reg": 8.281789779663086, "step": 47 }, { "epoch": 0.048, "grad_norm": 0.38472291827201843, "grad_norm_var": 0.003218571473485099, "learning_rate": 2.4e-05, "loss": 8.3219, "loss/crossentropy": 2.6643882989883423, "loss/hidden": 0.0, "loss/logits": 0.14136053621768951, "loss/reg": 8.180567741394043, "step": 48 }, { "epoch": 0.049, "grad_norm": 0.36624062061309814, "grad_norm_var": 0.0033439747229166835, "learning_rate": 2.45e-05, "loss": 8.2178, "loss/crossentropy": 2.737678050994873, "loss/hidden": 0.0, "loss/logits": 0.13707132637500763, "loss/reg": 8.080729484558105, "step": 49 }, { "epoch": 0.05, "grad_norm": 0.3760506212711334, "grad_norm_var": 0.0033912685784647087, "learning_rate": 2.5e-05, "loss": 8.137, "loss/crossentropy": 2.7842084169387817, "loss/hidden": 0.0, "loss/logits": 0.15427638590335846, "loss/reg": 7.982727527618408, "step": 50 }, { "epoch": 0.051, "grad_norm": 0.4842914044857025, "grad_norm_var": 0.0034291612008964874, "learning_rate": 2.5500000000000003e-05, "loss": 8.0328, "loss/crossentropy": 2.4711248874664307, "loss/hidden": 0.0, "loss/logits": 0.14703373610973358, "loss/reg": 7.885744094848633, "step": 51 }, { "epoch": 0.052, "grad_norm": 0.35167771577835083, "grad_norm_var": 0.0037168779577402794, "learning_rate": 2.6000000000000002e-05, "loss": 7.9218, "loss/crossentropy": 2.688042402267456, "loss/hidden": 0.0, "loss/logits": 0.13181188702583313, "loss/reg": 7.789944648742676, "step": 52 }, { "epoch": 0.053, "grad_norm": 0.4046782851219177, "grad_norm_var": 0.0037291369976557537, "learning_rate": 2.6500000000000004e-05, "loss": 7.8506, "loss/crossentropy": 2.6470447778701782, "loss/hidden": 0.0, "loss/logits": 0.1550150215625763, "loss/reg": 7.695549011230469, "step": 53 }, { "epoch": 0.054, "grad_norm": 0.36790555715560913, "grad_norm_var": 0.003860515189158183, "learning_rate": 2.7000000000000002e-05, "loss": 7.7397, "loss/crossentropy": 2.7709513902664185, "loss/hidden": 0.0, "loss/logits": 0.13687237352132797, "loss/reg": 7.6028337478637695, "step": 54 }, { "epoch": 0.055, "grad_norm": 0.378646582365036, "grad_norm_var": 0.003901108788218351, "learning_rate": 2.7500000000000004e-05, "loss": 7.6525, "loss/crossentropy": 2.8750112056732178, "loss/hidden": 0.0, "loss/logits": 0.14087973535060883, "loss/reg": 7.511648178100586, "step": 55 }, { "epoch": 0.056, "grad_norm": 0.43194806575775146, "grad_norm_var": 0.0039066305166497416, "learning_rate": 2.8000000000000003e-05, "loss": 7.5885, "loss/crossentropy": 2.6936086416244507, "loss/hidden": 0.0, "loss/logits": 0.16746176779270172, "loss/reg": 7.421041965484619, "step": 56 }, { "epoch": 0.057, "grad_norm": 0.4012855291366577, "grad_norm_var": 0.0038280935965925374, "learning_rate": 2.8499999999999998e-05, "loss": 7.4798, "loss/crossentropy": 2.8553980588912964, "loss/hidden": 0.0, "loss/logits": 0.14783543348312378, "loss/reg": 7.331946849822998, "step": 57 }, { "epoch": 0.058, "grad_norm": 0.41179144382476807, "grad_norm_var": 0.0016229469351226081, "learning_rate": 2.9e-05, "loss": 7.3917, "loss/crossentropy": 2.90153706073761, "loss/hidden": 0.0, "loss/logits": 0.1475769728422165, "loss/reg": 7.24411153793335, "step": 58 }, { "epoch": 0.059, "grad_norm": 0.5224902033805847, "grad_norm_var": 0.0023775492652975813, "learning_rate": 2.95e-05, "loss": 7.3131, "loss/crossentropy": 2.922032952308655, "loss/hidden": 0.0, "loss/logits": 0.15628121048212051, "loss/reg": 7.156867027282715, "step": 59 }, { "epoch": 0.06, "grad_norm": 0.511249840259552, "grad_norm_var": 0.0029578979489786493, "learning_rate": 3e-05, "loss": 7.2356, "loss/crossentropy": 2.7479801177978516, "loss/hidden": 0.0, "loss/logits": 0.1649666205048561, "loss/reg": 7.070590019226074, "step": 60 }, { "epoch": 0.061, "grad_norm": 0.4054379165172577, "grad_norm_var": 0.0029680738800097915, "learning_rate": 3.05e-05, "loss": 7.138, "loss/crossentropy": 2.8455255031585693, "loss/hidden": 0.0, "loss/logits": 0.15279172360897064, "loss/reg": 6.985229015350342, "step": 61 }, { "epoch": 0.062, "grad_norm": 0.3967909812927246, "grad_norm_var": 0.002606398157824093, "learning_rate": 3.1e-05, "loss": 7.045, "loss/crossentropy": 2.5253665447235107, "loss/hidden": 0.0, "loss/logits": 0.14376042783260345, "loss/reg": 6.901230335235596, "step": 62 }, { "epoch": 0.063, "grad_norm": 0.5105597376823425, "grad_norm_var": 0.0031904242194133975, "learning_rate": 3.15e-05, "loss": 7.0051, "loss/crossentropy": 2.6825212240219116, "loss/hidden": 0.0, "loss/logits": 0.18690404295921326, "loss/reg": 6.818210124969482, "step": 63 }, { "epoch": 0.064, "grad_norm": 0.4659242630004883, "grad_norm_var": 0.0032302192085850684, "learning_rate": 3.2000000000000005e-05, "loss": 6.9076, "loss/crossentropy": 2.6204874515533447, "loss/hidden": 0.0, "loss/logits": 0.17028357833623886, "loss/reg": 6.737268447875977, "step": 64 }, { "epoch": 0.065, "grad_norm": 0.3556163012981415, "grad_norm_var": 0.003319357356775484, "learning_rate": 3.2500000000000004e-05, "loss": 6.79, "loss/crossentropy": 2.6547772884368896, "loss/hidden": 0.0, "loss/logits": 0.13286586478352547, "loss/reg": 6.657088756561279, "step": 65 }, { "epoch": 0.066, "grad_norm": 0.4854835867881775, "grad_norm_var": 0.003375179781292544, "learning_rate": 3.3e-05, "loss": 6.758, "loss/crossentropy": 2.817094564437866, "loss/hidden": 0.0, "loss/logits": 0.18038344383239746, "loss/reg": 6.577615261077881, "step": 66 }, { "epoch": 0.067, "grad_norm": 0.4216950833797455, "grad_norm_var": 0.0031699615767635542, "learning_rate": 3.35e-05, "loss": 6.6588, "loss/crossentropy": 2.872538924217224, "loss/hidden": 0.0, "loss/logits": 0.15885479748249054, "loss/reg": 6.499909400939941, "step": 67 }, { "epoch": 0.068, "grad_norm": 0.4400080442428589, "grad_norm_var": 0.0027769945370051136, "learning_rate": 3.4000000000000007e-05, "loss": 6.5852, "loss/crossentropy": 2.6969038248062134, "loss/hidden": 0.0, "loss/logits": 0.16181904822587967, "loss/reg": 6.423398971557617, "step": 68 }, { "epoch": 0.069, "grad_norm": 0.4272904396057129, "grad_norm_var": 0.0027266697361371774, "learning_rate": 3.45e-05, "loss": 6.5089, "loss/crossentropy": 2.7056760787963867, "loss/hidden": 0.0, "loss/logits": 0.16134114563465118, "loss/reg": 6.347527503967285, "step": 69 }, { "epoch": 0.07, "grad_norm": 0.4215809404850006, "grad_norm_var": 0.0024381335593683163, "learning_rate": 3.5e-05, "loss": 6.4248, "loss/crossentropy": 2.822075366973877, "loss/hidden": 0.0, "loss/logits": 0.15255100280046463, "loss/reg": 6.2722673416137695, "step": 70 }, { "epoch": 0.071, "grad_norm": 0.3756145238876343, "grad_norm_var": 0.0024621927937988008, "learning_rate": 3.55e-05, "loss": 6.3343, "loss/crossentropy": 2.7529423236846924, "loss/hidden": 0.0, "loss/logits": 0.1365201622247696, "loss/reg": 6.1977925300598145, "step": 71 }, { "epoch": 0.072, "grad_norm": 0.4465688467025757, "grad_norm_var": 0.002466586095849488, "learning_rate": 3.6e-05, "loss": 6.2908, "loss/crossentropy": 2.8122498989105225, "loss/hidden": 0.0, "loss/logits": 0.16586530208587646, "loss/reg": 6.124953746795654, "step": 72 }, { "epoch": 0.073, "grad_norm": 0.5681706070899963, "grad_norm_var": 0.0034022813413523423, "learning_rate": 3.65e-05, "loss": 6.2178, "loss/crossentropy": 2.8980711698532104, "loss/hidden": 0.0, "loss/logits": 0.16514715552330017, "loss/reg": 6.052603244781494, "step": 73 }, { "epoch": 0.074, "grad_norm": 0.48894357681274414, "grad_norm_var": 0.0034029444248247385, "learning_rate": 3.7e-05, "loss": 6.1569, "loss/crossentropy": 2.7240917682647705, "loss/hidden": 0.0, "loss/logits": 0.17589756846427917, "loss/reg": 5.980965614318848, "step": 74 }, { "epoch": 0.075, "grad_norm": 0.4453263580799103, "grad_norm_var": 0.003057192832421729, "learning_rate": 3.7500000000000003e-05, "loss": 6.0806, "loss/crossentropy": 2.880316376686096, "loss/hidden": 0.0, "loss/logits": 0.17018750309944153, "loss/reg": 5.910387992858887, "step": 75 }, { "epoch": 0.076, "grad_norm": 0.4072200357913971, "grad_norm_var": 0.002854757019651292, "learning_rate": 3.8e-05, "loss": 5.9831, "loss/crossentropy": 2.8352543115615845, "loss/hidden": 0.0, "loss/logits": 0.14264734089374542, "loss/reg": 5.840408802032471, "step": 76 }, { "epoch": 0.077, "grad_norm": 0.47089695930480957, "grad_norm_var": 0.002808781993245483, "learning_rate": 3.85e-05, "loss": 5.9645, "loss/crossentropy": 2.9140390157699585, "loss/hidden": 0.0, "loss/logits": 0.19280433654785156, "loss/reg": 5.771743297576904, "step": 77 }, { "epoch": 0.078, "grad_norm": 0.3908418118953705, "grad_norm_var": 0.0028496157710373726, "learning_rate": 3.9000000000000006e-05, "loss": 5.8542, "loss/crossentropy": 2.70191752910614, "loss/hidden": 0.0, "loss/logits": 0.15121394395828247, "loss/reg": 5.7030134201049805, "step": 78 }, { "epoch": 0.079, "grad_norm": 0.417324423789978, "grad_norm_var": 0.002579272338799754, "learning_rate": 3.9500000000000005e-05, "loss": 5.7921, "loss/crossentropy": 2.8772886991500854, "loss/hidden": 0.0, "loss/logits": 0.15610723197460175, "loss/reg": 5.6359782218933105, "step": 79 }, { "epoch": 0.08, "grad_norm": 0.4123746156692505, "grad_norm_var": 0.002568267863573048, "learning_rate": 4e-05, "loss": 5.7333, "loss/crossentropy": 2.785880208015442, "loss/hidden": 0.0, "loss/logits": 0.16348882019519806, "loss/reg": 5.569836616516113, "step": 80 }, { "epoch": 0.081, "grad_norm": 0.4967520833015442, "grad_norm_var": 0.002301783549342551, "learning_rate": 4.05e-05, "loss": 5.6758, "loss/crossentropy": 2.7999967336654663, "loss/hidden": 0.0, "loss/logits": 0.17136523872613907, "loss/reg": 5.504480361938477, "step": 81 }, { "epoch": 0.082, "grad_norm": 0.5052318572998047, "grad_norm_var": 0.002433398774934281, "learning_rate": 4.1e-05, "loss": 5.6235, "loss/crossentropy": 2.745284676551819, "loss/hidden": 0.0, "loss/logits": 0.183761365711689, "loss/reg": 5.439704418182373, "step": 82 }, { "epoch": 0.083, "grad_norm": 0.42788586020469666, "grad_norm_var": 0.0024157402006901862, "learning_rate": 4.15e-05, "loss": 5.5482, "loss/crossentropy": 2.996077299118042, "loss/hidden": 0.0, "loss/logits": 0.1716163083910942, "loss/reg": 5.37656831741333, "step": 83 }, { "epoch": 0.084, "grad_norm": 0.4045878052711487, "grad_norm_var": 0.002524230641886934, "learning_rate": 4.2e-05, "loss": 5.4639, "loss/crossentropy": 2.611970067024231, "loss/hidden": 0.0, "loss/logits": 0.14972157776355743, "loss/reg": 5.314174175262451, "step": 84 }, { "epoch": 0.085, "grad_norm": 0.41384124755859375, "grad_norm_var": 0.0025657923048212758, "learning_rate": 4.25e-05, "loss": 5.4087, "loss/crossentropy": 2.7544474601745605, "loss/hidden": 0.0, "loss/logits": 0.15563561022281647, "loss/reg": 5.253114223480225, "step": 85 }, { "epoch": 0.086, "grad_norm": 0.4230954945087433, "grad_norm_var": 0.002561545150143992, "learning_rate": 4.3e-05, "loss": 5.3742, "loss/crossentropy": 2.8086984157562256, "loss/hidden": 0.0, "loss/logits": 0.18146374821662903, "loss/reg": 5.192752361297607, "step": 86 }, { "epoch": 0.087, "grad_norm": 0.45918789505958557, "grad_norm_var": 0.0022425431957913728, "learning_rate": 4.35e-05, "loss": 5.3138, "loss/crossentropy": 2.785672187805176, "loss/hidden": 0.0, "loss/logits": 0.1815495491027832, "loss/reg": 5.132205486297607, "step": 87 }, { "epoch": 0.088, "grad_norm": 0.4096454083919525, "grad_norm_var": 0.002337951427575909, "learning_rate": 4.4000000000000006e-05, "loss": 5.2231, "loss/crossentropy": 2.7667384147644043, "loss/hidden": 0.0, "loss/logits": 0.1498737782239914, "loss/reg": 5.073270797729492, "step": 88 }, { "epoch": 0.089, "grad_norm": 0.4818902313709259, "grad_norm_var": 0.001401593034965017, "learning_rate": 4.4500000000000004e-05, "loss": 5.1976, "loss/crossentropy": 2.722651481628418, "loss/hidden": 0.0, "loss/logits": 0.18228119611740112, "loss/reg": 5.015347480773926, "step": 89 }, { "epoch": 0.09, "grad_norm": 0.4635304808616638, "grad_norm_var": 0.0012793023910875926, "learning_rate": 4.5e-05, "loss": 5.1364, "loss/crossentropy": 2.7336219549179077, "loss/hidden": 0.0, "loss/logits": 0.1788763403892517, "loss/reg": 4.95751953125, "step": 90 }, { "epoch": 0.091, "grad_norm": 0.46776527166366577, "grad_norm_var": 0.0013286457514422071, "learning_rate": 4.55e-05, "loss": 5.089, "loss/crossentropy": 2.818411946296692, "loss/hidden": 0.0, "loss/logits": 0.18763671815395355, "loss/reg": 4.901313781738281, "step": 91 }, { "epoch": 0.092, "grad_norm": 0.4979424774646759, "grad_norm_var": 0.0014374124356233827, "learning_rate": 4.600000000000001e-05, "loss": 5.033, "loss/crossentropy": 2.780390739440918, "loss/hidden": 0.0, "loss/logits": 0.18797770142555237, "loss/reg": 4.844989776611328, "step": 92 }, { "epoch": 0.093, "grad_norm": 0.476500004529953, "grad_norm_var": 0.0014576571842103666, "learning_rate": 4.6500000000000005e-05, "loss": 4.9601, "loss/crossentropy": 2.873763084411621, "loss/hidden": 0.0, "loss/logits": 0.16978690773248672, "loss/reg": 4.790344715118408, "step": 93 }, { "epoch": 0.094, "grad_norm": 0.4781922996044159, "grad_norm_var": 0.0012831022874166228, "learning_rate": 4.7e-05, "loss": 4.9191, "loss/crossentropy": 2.7947566509246826, "loss/hidden": 0.0, "loss/logits": 0.1818721517920494, "loss/reg": 4.737210273742676, "step": 94 }, { "epoch": 0.095, "grad_norm": 0.4668790102005005, "grad_norm_var": 0.0012059221432147993, "learning_rate": 4.75e-05, "loss": 4.8722, "loss/crossentropy": 2.975098729133606, "loss/hidden": 0.0, "loss/logits": 0.18810376524925232, "loss/reg": 4.684102535247803, "step": 95 }, { "epoch": 0.096, "grad_norm": 0.6337518095970154, "grad_norm_var": 0.003000960526301677, "learning_rate": 4.8e-05, "loss": 4.8243, "loss/crossentropy": 2.8563435077667236, "loss/hidden": 0.0, "loss/logits": 0.19358298182487488, "loss/reg": 4.630751132965088, "step": 96 }, { "epoch": 0.097, "grad_norm": 0.5349352955818176, "grad_norm_var": 0.003232518858686584, "learning_rate": 4.85e-05, "loss": 4.7556, "loss/crossentropy": 2.8064088821411133, "loss/hidden": 0.0, "loss/logits": 0.1775345504283905, "loss/reg": 4.578096866607666, "step": 97 }, { "epoch": 0.098, "grad_norm": 0.4641897976398468, "grad_norm_var": 0.003153502010794122, "learning_rate": 4.9e-05, "loss": 4.7093, "loss/crossentropy": 2.737215995788574, "loss/hidden": 0.0, "loss/logits": 0.18364109843969345, "loss/reg": 4.525658130645752, "step": 98 }, { "epoch": 0.099, "grad_norm": 0.4986944794654846, "grad_norm_var": 0.003078809549519083, "learning_rate": 4.9500000000000004e-05, "loss": 4.6582, "loss/crossentropy": 2.7446107864379883, "loss/hidden": 0.0, "loss/logits": 0.1834591180086136, "loss/reg": 4.474703311920166, "step": 99 }, { "epoch": 0.1, "grad_norm": 0.5496240854263306, "grad_norm_var": 0.0030625509543914593, "learning_rate": 5e-05, "loss": 4.6055, "loss/crossentropy": 2.8407260179519653, "loss/hidden": 0.0, "loss/logits": 0.18116910755634308, "loss/reg": 4.424361228942871, "step": 100 } ], "logging_steps": 1, "max_steps": 1000, "num_input_tokens_seen": 0, "num_train_epochs": 9223372036854775807, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": true, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.57623446257664e+16, "train_batch_size": 4, "trial_name": null, "trial_params": null }