{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.0625, "eval_steps": 500, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.000125, "grad_norm": 2.377527952194214, "learning_rate": 1.0000000000000002e-06, "loss": 1.2768, "loss/crossentropy": 2.697097063064575, "loss/hidden": 1.1171875, "loss/logits": 0.15893849730491638, "loss/reg": 6.247002602322027e-05, "step": 1 }, { "epoch": 0.00025, "grad_norm": 4.216994762420654, "learning_rate": 2.0000000000000003e-06, "loss": 1.3752, "loss/crossentropy": 3.101844310760498, "loss/hidden": 1.1796875, "loss/logits": 0.1949012577533722, "loss/reg": 6.247002602322027e-05, "step": 2 }, { "epoch": 0.000375, "grad_norm": 2.3287529945373535, "learning_rate": 3e-06, "loss": 1.2785, "loss/crossentropy": 2.63712477684021, "loss/hidden": 1.09375, "loss/logits": 0.18410107493400574, "loss/reg": 6.246996053960174e-05, "step": 3 }, { "epoch": 0.0005, "grad_norm": 5.415231227874756, "learning_rate": 4.000000000000001e-06, "loss": 1.4285, "loss/crossentropy": 2.5702285766601562, "loss/hidden": 1.265625, "loss/logits": 0.16228657960891724, "loss/reg": 6.246980774449185e-05, "step": 4 }, { "epoch": 0.000625, "grad_norm": 4.888370513916016, "learning_rate": 5e-06, "loss": 1.5121, "loss/crossentropy": 2.439383029937744, "loss/hidden": 1.3125, "loss/logits": 0.19899356365203857, "loss/reg": 6.24695821898058e-05, "step": 5 }, { "epoch": 0.00075, "grad_norm": 2.608705997467041, "learning_rate": 6e-06, "loss": 1.293, "loss/crossentropy": 2.668699026107788, "loss/hidden": 1.109375, "loss/logits": 0.18298496305942535, "loss/reg": 6.246933480724692e-05, "step": 6 }, { "epoch": 0.000875, "grad_norm": 2.8447623252868652, "learning_rate": 7.000000000000001e-06, "loss": 1.5339, "loss/crossentropy": 2.5219366550445557, "loss/hidden": 1.296875, "loss/logits": 0.2364223599433899, "loss/reg": 6.246914563234895e-05, "step": 7 }, { "epoch": 0.001, "grad_norm": 3.7877628803253174, "learning_rate": 8.000000000000001e-06, "loss": 1.8218, "loss/crossentropy": 2.1927688121795654, "loss/hidden": 1.5546875, "loss/logits": 0.2664879262447357, "loss/reg": 6.246889097383246e-05, "step": 8 }, { "epoch": 0.001125, "grad_norm": 2.988516330718994, "learning_rate": 9e-06, "loss": 1.7373, "loss/crossentropy": 2.3826897144317627, "loss/hidden": 1.421875, "loss/logits": 0.314752995967865, "loss/reg": 6.246858538361266e-05, "step": 9 }, { "epoch": 0.00125, "grad_norm": 2.143723726272583, "learning_rate": 1e-05, "loss": 1.405, "loss/crossentropy": 2.2246415615081787, "loss/hidden": 1.234375, "loss/logits": 0.16997714340686798, "loss/reg": 6.246842531254515e-05, "step": 10 }, { "epoch": 0.001375, "grad_norm": 2.4413657188415527, "learning_rate": 1.1000000000000001e-05, "loss": 1.4206, "loss/crossentropy": 2.4612021446228027, "loss/hidden": 1.1796875, "loss/logits": 0.24033024907112122, "loss/reg": 6.246819975785911e-05, "step": 11 }, { "epoch": 0.0015, "grad_norm": 2.483156204223633, "learning_rate": 1.2e-05, "loss": 1.6449, "loss/crossentropy": 2.2882771492004395, "loss/hidden": 1.4140625, "loss/logits": 0.23023059964179993, "loss/reg": 6.246790871955454e-05, "step": 12 }, { "epoch": 0.001625, "grad_norm": 2.7368147373199463, "learning_rate": 1.3000000000000001e-05, "loss": 1.4981, "loss/crossentropy": 2.6942052841186523, "loss/hidden": 1.265625, "loss/logits": 0.23185348510742188, "loss/reg": 6.24675813014619e-05, "step": 13 }, { "epoch": 0.00175, "grad_norm": 5.189184665679932, "learning_rate": 1.4000000000000001e-05, "loss": 1.946, "loss/crossentropy": 2.3771214485168457, "loss/hidden": 1.625, "loss/logits": 0.320385217666626, "loss/reg": 6.246678822208196e-05, "step": 14 }, { "epoch": 0.001875, "grad_norm": 2.305589437484741, "learning_rate": 1.5e-05, "loss": 1.4982, "loss/crossentropy": 2.7562549114227295, "loss/hidden": 1.25, "loss/logits": 0.2476150244474411, "loss/reg": 6.246620614547282e-05, "step": 15 }, { "epoch": 0.002, "grad_norm": 2.3378520011901855, "grad_norm_var": 1.2675163586822178, "learning_rate": 1.6000000000000003e-05, "loss": 1.3302, "loss/crossentropy": 2.445441961288452, "loss/hidden": 1.125, "loss/logits": 0.20453599095344543, "loss/reg": 6.246585689950734e-05, "step": 16 }, { "epoch": 0.002125, "grad_norm": 1.7903435230255127, "grad_norm_var": 1.3529406709866008, "learning_rate": 1.7000000000000003e-05, "loss": 1.1333, "loss/crossentropy": 2.323503017425537, "loss/hidden": 0.984375, "loss/logits": 0.14828170835971832, "loss/reg": 6.246510747587308e-05, "step": 17 }, { "epoch": 0.00225, "grad_norm": 3.363795518875122, "grad_norm_var": 1.277817936381435, "learning_rate": 1.8e-05, "loss": 1.7292, "loss/crossentropy": 2.6075525283813477, "loss/hidden": 1.46875, "loss/logits": 0.25987327098846436, "loss/reg": 6.24642925686203e-05, "step": 18 }, { "epoch": 0.002375, "grad_norm": 2.162050724029541, "grad_norm_var": 1.2967721886362786, "learning_rate": 1.9e-05, "loss": 1.3146, "loss/crossentropy": 2.570558786392212, "loss/hidden": 1.125, "loss/logits": 0.18898281455039978, "loss/reg": 6.246323027880862e-05, "step": 19 }, { "epoch": 0.0025, "grad_norm": 2.147024393081665, "grad_norm_var": 0.9523869945360727, "learning_rate": 2e-05, "loss": 1.3484, "loss/crossentropy": 2.6676244735717773, "loss/hidden": 1.1484375, "loss/logits": 0.19929195940494537, "loss/reg": 6.246233533602208e-05, "step": 20 }, { "epoch": 0.002625, "grad_norm": 2.0668728351593018, "grad_norm_var": 0.6976603751830339, "learning_rate": 2.1e-05, "loss": 1.1929, "loss/crossentropy": 2.401143789291382, "loss/hidden": 1.03125, "loss/logits": 0.1610003113746643, "loss/reg": 6.246144039323553e-05, "step": 21 }, { "epoch": 0.00275, "grad_norm": 2.8019566535949707, "grad_norm_var": 0.6973240463492516, "learning_rate": 2.2000000000000003e-05, "loss": 1.419, "loss/crossentropy": 2.627523183822632, "loss/hidden": 1.203125, "loss/logits": 0.2152642011642456, "loss/reg": 6.246032717172056e-05, "step": 22 }, { "epoch": 0.002875, "grad_norm": 3.8118937015533447, "grad_norm_var": 0.7713008187193999, "learning_rate": 2.3000000000000003e-05, "loss": 1.4284, "loss/crossentropy": 2.7227890491485596, "loss/hidden": 1.1640625, "loss/logits": 0.2637593150138855, "loss/reg": 6.245896656764671e-05, "step": 23 }, { "epoch": 0.003, "grad_norm": 2.1418018341064453, "grad_norm_var": 0.7205284729945551, "learning_rate": 2.4e-05, "loss": 1.3002, "loss/crossentropy": 2.545552968978882, "loss/hidden": 1.1328125, "loss/logits": 0.16680249571800232, "loss/reg": 6.245774420676753e-05, "step": 24 }, { "epoch": 0.003125, "grad_norm": 3.5331156253814697, "grad_norm_var": 0.7613226543465996, "learning_rate": 2.5e-05, "loss": 1.3224, "loss/crossentropy": 2.2371270656585693, "loss/hidden": 1.15625, "loss/logits": 0.16548338532447815, "loss/reg": 6.245705299079418e-05, "step": 25 }, { "epoch": 0.00325, "grad_norm": 1.9795947074890137, "grad_norm_var": 0.7755306597344306, "learning_rate": 2.6000000000000002e-05, "loss": 1.3209, "loss/crossentropy": 2.7113037109375, "loss/hidden": 1.1328125, "loss/logits": 0.18742361664772034, "loss/reg": 6.245569966267794e-05, "step": 26 }, { "epoch": 0.003375, "grad_norm": 2.6044108867645264, "grad_norm_var": 0.7714440385524235, "learning_rate": 2.7000000000000002e-05, "loss": 1.4566, "loss/crossentropy": 2.6034419536590576, "loss/hidden": 1.2265625, "loss/logits": 0.22937631607055664, "loss/reg": 6.245376425795257e-05, "step": 27 }, { "epoch": 0.0035, "grad_norm": 2.48085355758667, "grad_norm_var": 0.7715158471256792, "learning_rate": 2.8000000000000003e-05, "loss": 1.4579, "loss/crossentropy": 2.5794363021850586, "loss/hidden": 1.2421875, "loss/logits": 0.21509718894958496, "loss/reg": 6.245166878215969e-05, "step": 28 }, { "epoch": 0.003625, "grad_norm": 3.0413854122161865, "grad_norm_var": 0.7781660489700184, "learning_rate": 2.9e-05, "loss": 1.6102, "loss/crossentropy": 2.4173922538757324, "loss/hidden": 1.375, "loss/logits": 0.23455965518951416, "loss/reg": 6.244902033358812e-05, "step": 29 }, { "epoch": 0.00375, "grad_norm": 2.1076390743255615, "grad_norm_var": 0.36324525064493024, "learning_rate": 3e-05, "loss": 1.0735, "loss/crossentropy": 2.4064886569976807, "loss/hidden": 0.9453125, "loss/logits": 0.12752822041511536, "loss/reg": 6.244838004931808e-05, "step": 30 }, { "epoch": 0.003875, "grad_norm": 2.5296630859375, "grad_norm_var": 0.359312391151574, "learning_rate": 3.1e-05, "loss": 1.3467, "loss/crossentropy": 2.61391544342041, "loss/hidden": 1.15625, "loss/logits": 0.18978667259216309, "loss/reg": 6.244736141525209e-05, "step": 31 }, { "epoch": 0.004, "grad_norm": 2.123671054840088, "grad_norm_var": 0.3684168280400947, "learning_rate": 3.2000000000000005e-05, "loss": 1.2191, "loss/crossentropy": 2.6056668758392334, "loss/hidden": 1.0546875, "loss/logits": 0.16381201148033142, "loss/reg": 6.244605174288154e-05, "step": 32 }, { "epoch": 0.004125, "grad_norm": 3.685770034790039, "grad_norm_var": 0.4027733703548923, "learning_rate": 3.3e-05, "loss": 1.6794, "loss/crossentropy": 2.519561290740967, "loss/hidden": 1.3828125, "loss/logits": 0.29592496156692505, "loss/reg": 6.24443418928422e-05, "step": 33 }, { "epoch": 0.00425, "grad_norm": 1.9660468101501465, "grad_norm_var": 0.393966226946808, "learning_rate": 3.4000000000000007e-05, "loss": 1.3395, "loss/crossentropy": 2.638051986694336, "loss/hidden": 1.15625, "loss/logits": 0.18261724710464478, "loss/reg": 6.244314135983586e-05, "step": 34 }, { "epoch": 0.004375, "grad_norm": 2.3111677169799805, "grad_norm_var": 0.38716579449971367, "learning_rate": 3.5e-05, "loss": 1.3501, "loss/crossentropy": 2.599940776824951, "loss/hidden": 1.15625, "loss/logits": 0.19327056407928467, "loss/reg": 6.244215182960033e-05, "step": 35 }, { "epoch": 0.0045, "grad_norm": 2.5357542037963867, "grad_norm_var": 0.3739975607775089, "learning_rate": 3.6e-05, "loss": 1.287, "loss/crossentropy": 2.9884798526763916, "loss/hidden": 1.1171875, "loss/logits": 0.16922441124916077, "loss/reg": 6.244022370083258e-05, "step": 36 }, { "epoch": 0.004625, "grad_norm": 1.7781621217727661, "grad_norm_var": 0.40002233468076764, "learning_rate": 3.7e-05, "loss": 1.074, "loss/crossentropy": 2.669071674346924, "loss/hidden": 0.93359375, "loss/logits": 0.13981276750564575, "loss/reg": 6.243858661036938e-05, "step": 37 }, { "epoch": 0.00475, "grad_norm": 24.6973819732666, "grad_norm_var": 30.983207545217457, "learning_rate": 3.8e-05, "loss": 1.3637, "loss/crossentropy": 2.482579469680786, "loss/hidden": 1.1953125, "loss/logits": 0.16777344048023224, "loss/reg": 6.243725511012599e-05, "step": 38 }, { "epoch": 0.004875, "grad_norm": 2.5728342533111572, "grad_norm_var": 31.103302953089262, "learning_rate": 3.9000000000000006e-05, "loss": 1.3424, "loss/crossentropy": 2.2785422801971436, "loss/hidden": 1.171875, "loss/logits": 0.16988611221313477, "loss/reg": 6.243555981200188e-05, "step": 39 }, { "epoch": 0.005, "grad_norm": 1.7385622262954712, "grad_norm_var": 31.206951393275006, "learning_rate": 4e-05, "loss": 1.077, "loss/crossentropy": 2.7017714977264404, "loss/hidden": 0.9453125, "loss/logits": 0.13102804124355316, "loss/reg": 6.243350071599707e-05, "step": 40 }, { "epoch": 0.005125, "grad_norm": 2.455116033554077, "grad_norm_var": 31.325901099338942, "learning_rate": 4.1e-05, "loss": 1.178, "loss/crossentropy": 2.6521873474121094, "loss/hidden": 1.015625, "loss/logits": 0.16170336306095123, "loss/reg": 6.243147072382271e-05, "step": 41 }, { "epoch": 0.00525, "grad_norm": 3.0441935062408447, "grad_norm_var": 31.14003983168487, "learning_rate": 4.2e-05, "loss": 1.488, "loss/crossentropy": 2.5000290870666504, "loss/hidden": 1.265625, "loss/logits": 0.2217317819595337, "loss/reg": 6.24291569693014e-05, "step": 42 }, { "epoch": 0.005375, "grad_norm": 2.6227200031280518, "grad_norm_var": 31.137008952861066, "learning_rate": 4.3e-05, "loss": 1.3106, "loss/crossentropy": 2.6832528114318848, "loss/hidden": 1.1171875, "loss/logits": 0.19276997447013855, "loss/reg": 6.242711242521182e-05, "step": 43 }, { "epoch": 0.0055, "grad_norm": 2.9194633960723877, "grad_norm_var": 31.06863081080745, "learning_rate": 4.4000000000000006e-05, "loss": 1.5396, "loss/crossentropy": 2.483938455581665, "loss/hidden": 1.3046875, "loss/logits": 0.23424991965293884, "loss/reg": 6.242513336474076e-05, "step": 44 }, { "epoch": 0.005625, "grad_norm": 2.2491037845611572, "grad_norm_var": 31.196778907875057, "learning_rate": 4.5e-05, "loss": 1.2321, "loss/crossentropy": 2.9735186100006104, "loss/hidden": 1.0625, "loss/logits": 0.1689363420009613, "loss/reg": 6.242344534257427e-05, "step": 45 }, { "epoch": 0.00575, "grad_norm": 2.687225103378296, "grad_norm_var": 31.084396554405373, "learning_rate": 4.600000000000001e-05, "loss": 1.2443, "loss/crossentropy": 2.913846254348755, "loss/hidden": 1.0625, "loss/logits": 0.18112678825855255, "loss/reg": 6.242193921934813e-05, "step": 46 }, { "epoch": 0.005875, "grad_norm": 2.3648312091827393, "grad_norm_var": 31.1155476706496, "learning_rate": 4.7e-05, "loss": 1.2044, "loss/crossentropy": 2.374119520187378, "loss/hidden": 1.046875, "loss/logits": 0.15688437223434448, "loss/reg": 6.242006929824129e-05, "step": 47 }, { "epoch": 0.006, "grad_norm": 1.896540880203247, "grad_norm_var": 31.171339818602494, "learning_rate": 4.8e-05, "loss": 1.238, "loss/crossentropy": 2.613962173461914, "loss/hidden": 1.0546875, "loss/logits": 0.1826920211315155, "loss/reg": 6.24187450739555e-05, "step": 48 }, { "epoch": 0.006125, "grad_norm": 1.7585434913635254, "grad_norm_var": 31.44447201393312, "learning_rate": 4.9e-05, "loss": 1.1411, "loss/crossentropy": 2.5672757625579834, "loss/hidden": 1.0, "loss/logits": 0.14043202996253967, "loss/reg": 6.241785740712658e-05, "step": 49 }, { "epoch": 0.00625, "grad_norm": 1.8257592916488647, "grad_norm_var": 31.47860052328912, "learning_rate": 5e-05, "loss": 1.2643, "loss/crossentropy": 2.4829366207122803, "loss/hidden": 1.0859375, "loss/logits": 0.1777852475643158, "loss/reg": 6.2416227592621e-05, "step": 50 }, { "epoch": 0.006375, "grad_norm": 1.9530550241470337, "grad_norm_var": 31.553698309541367, "learning_rate": 5.1000000000000006e-05, "loss": 1.1787, "loss/crossentropy": 2.501922369003296, "loss/hidden": 1.015625, "loss/logits": 0.16241338849067688, "loss/reg": 6.241373193915933e-05, "step": 51 }, { "epoch": 0.0065, "grad_norm": 2.366898536682129, "grad_norm_var": 31.58155048439878, "learning_rate": 5.2000000000000004e-05, "loss": 1.476, "loss/crossentropy": 2.557314872741699, "loss/hidden": 1.234375, "loss/logits": 0.24098029732704163, "loss/reg": 6.241213122848421e-05, "step": 52 }, { "epoch": 0.006625, "grad_norm": 2.139944553375244, "grad_norm_var": 31.497838767117898, "learning_rate": 5.300000000000001e-05, "loss": 1.3057, "loss/crossentropy": 2.5664379596710205, "loss/hidden": 1.125, "loss/logits": 0.18005570769309998, "loss/reg": 6.241026130737737e-05, "step": 53 }, { "epoch": 0.00675, "grad_norm": 2.2614963054656982, "grad_norm_var": 0.16298419379227144, "learning_rate": 5.4000000000000005e-05, "loss": 1.2081, "loss/crossentropy": 2.5651533603668213, "loss/hidden": 1.046875, "loss/logits": 0.1606135070323944, "loss/reg": 6.240784568944946e-05, "step": 54 }, { "epoch": 0.006875, "grad_norm": 1.88372802734375, "grad_norm_var": 0.16791840248250048, "learning_rate": 5.500000000000001e-05, "loss": 1.2037, "loss/crossentropy": 2.0431623458862305, "loss/hidden": 1.0703125, "loss/logits": 0.13271506130695343, "loss/reg": 6.240410584723577e-05, "step": 55 }, { "epoch": 0.007, "grad_norm": 1.7579172849655151, "grad_norm_var": 0.16659499666655736, "learning_rate": 5.6000000000000006e-05, "loss": 1.0787, "loss/crossentropy": 2.5805883407592773, "loss/hidden": 0.94140625, "loss/logits": 0.13670633733272552, "loss/reg": 6.240163202164695e-05, "step": 56 }, { "epoch": 0.007125, "grad_norm": 2.740758180618286, "grad_norm_var": 0.17906241043444873, "learning_rate": 5.6999999999999996e-05, "loss": 1.2499, "loss/crossentropy": 2.821078062057495, "loss/hidden": 1.0859375, "loss/logits": 0.16337308287620544, "loss/reg": 6.239958747755736e-05, "step": 57 }, { "epoch": 0.00725, "grad_norm": 3.3393216133117676, "grad_norm_var": 0.21459676497742203, "learning_rate": 5.8e-05, "loss": 1.5094, "loss/crossentropy": 2.6574273109436035, "loss/hidden": 1.2265625, "loss/logits": 0.2822623550891876, "loss/reg": 6.239775393623859e-05, "step": 58 }, { "epoch": 0.007375, "grad_norm": 2.1151742935180664, "grad_norm_var": 0.20871929877454623, "learning_rate": 5.9e-05, "loss": 1.31, "loss/crossentropy": 2.28176212310791, "loss/hidden": 1.125, "loss/logits": 0.18433833122253418, "loss/reg": 6.239649519557133e-05, "step": 59 }, { "epoch": 0.0075, "grad_norm": 1.9203850030899048, "grad_norm_var": 0.18408730894700795, "learning_rate": 6e-05, "loss": 1.2862, "loss/crossentropy": 2.319091558456421, "loss/hidden": 1.09375, "loss/logits": 0.1918697953224182, "loss/reg": 6.239335925783962e-05, "step": 60 }, { "epoch": 0.007625, "grad_norm": 2.689425230026245, "grad_norm_var": 0.1988651894699956, "learning_rate": 6.1e-05, "loss": 1.2077, "loss/crossentropy": 2.396440029144287, "loss/hidden": 1.0546875, "loss/logits": 0.1523526906967163, "loss/reg": 6.239157664822415e-05, "step": 61 }, { "epoch": 0.00775, "grad_norm": 2.0848548412323, "grad_norm_var": 0.184926237897677, "learning_rate": 6.2e-05, "loss": 1.1889, "loss/crossentropy": 2.375331401824951, "loss/hidden": 1.03125, "loss/logits": 0.15707406401634216, "loss/reg": 6.238814967218786e-05, "step": 62 }, { "epoch": 0.007875, "grad_norm": 1.9770179986953735, "grad_norm_var": 0.18547542502594508, "learning_rate": 6.3e-05, "loss": 1.1255, "loss/crossentropy": 2.5883288383483887, "loss/hidden": 0.984375, "loss/logits": 0.14046350121498108, "loss/reg": 6.238514470169321e-05, "step": 63 }, { "epoch": 0.008, "grad_norm": 1.9654349088668823, "grad_norm_var": 0.1832653842408547, "learning_rate": 6.400000000000001e-05, "loss": 1.1315, "loss/crossentropy": 2.6122260093688965, "loss/hidden": 0.9765625, "loss/logits": 0.1543133556842804, "loss/reg": 6.238299101823941e-05, "step": 64 }, { "epoch": 0.008125, "grad_norm": 2.110621690750122, "grad_norm_var": 0.1715223081433841, "learning_rate": 6.500000000000001e-05, "loss": 1.1513, "loss/crossentropy": 2.3829517364501953, "loss/hidden": 1.0, "loss/logits": 0.15063607692718506, "loss/reg": 6.237896013772115e-05, "step": 65 }, { "epoch": 0.00825, "grad_norm": 3.1477179527282715, "grad_norm_var": 0.21553302023151552, "learning_rate": 6.6e-05, "loss": 1.4659, "loss/crossentropy": 2.2805211544036865, "loss/hidden": 1.2421875, "loss/logits": 0.22310970723628998, "loss/reg": 6.237393972696736e-05, "step": 66 }, { "epoch": 0.008375, "grad_norm": 2.482203722000122, "grad_norm_var": 0.21008166056666275, "learning_rate": 6.7e-05, "loss": 1.0839, "loss/crossentropy": 2.982119560241699, "loss/hidden": 0.94140625, "loss/logits": 0.14186254143714905, "loss/reg": 6.236990884644911e-05, "step": 67 }, { "epoch": 0.0085, "grad_norm": 2.198028087615967, "grad_norm_var": 0.21061508280485744, "learning_rate": 6.800000000000001e-05, "loss": 1.2007, "loss/crossentropy": 2.725332498550415, "loss/hidden": 1.0390625, "loss/logits": 0.1610267162322998, "loss/reg": 6.236397166503593e-05, "step": 68 }, { "epoch": 0.008625, "grad_norm": 1.9412530660629272, "grad_norm_var": 0.21734592747188602, "learning_rate": 6.9e-05, "loss": 1.1269, "loss/crossentropy": 2.682379722595215, "loss/hidden": 0.984375, "loss/logits": 0.14185243844985962, "loss/reg": 6.235777982510626e-05, "step": 69 }, { "epoch": 0.00875, "grad_norm": 2.223443031311035, "grad_norm_var": 0.21757323137186588, "learning_rate": 7e-05, "loss": 1.3663, "loss/crossentropy": 2.6186935901641846, "loss/hidden": 1.1640625, "loss/logits": 0.2016535997390747, "loss/reg": 6.23530286247842e-05, "step": 70 }, { "epoch": 0.008875, "grad_norm": 3.4456241130828857, "grad_norm_var": 0.28625219910078287, "learning_rate": 7.1e-05, "loss": 1.6214, "loss/crossentropy": 2.054266929626465, "loss/hidden": 1.421875, "loss/logits": 0.19887767732143402, "loss/reg": 6.234741158550605e-05, "step": 71 }, { "epoch": 0.009, "grad_norm": 1.9013352394104004, "grad_norm_var": 0.27557130255187207, "learning_rate": 7.2e-05, "loss": 1.1365, "loss/crossentropy": 2.422841787338257, "loss/hidden": 0.9765625, "loss/logits": 0.15926527976989746, "loss/reg": 6.234211468836293e-05, "step": 72 }, { "epoch": 0.009125, "grad_norm": 2.4032697677612305, "grad_norm_var": 0.267026183625853, "learning_rate": 7.3e-05, "loss": 1.4414, "loss/crossentropy": 2.4159440994262695, "loss/hidden": 1.21875, "loss/logits": 0.22204136848449707, "loss/reg": 6.233662861632183e-05, "step": 73 }, { "epoch": 0.00925, "grad_norm": 1.915128231048584, "grad_norm_var": 0.21002777018266153, "learning_rate": 7.4e-05, "loss": 1.2439, "loss/crossentropy": 2.587275505065918, "loss/hidden": 1.0625, "loss/logits": 0.1807810664176941, "loss/reg": 6.232755549717695e-05, "step": 74 }, { "epoch": 0.009375, "grad_norm": 3.4048879146575928, "grad_norm_var": 0.28520435687560547, "learning_rate": 7.500000000000001e-05, "loss": 1.2774, "loss/crossentropy": 2.6182703971862793, "loss/hidden": 1.125, "loss/logits": 0.15172982215881348, "loss/reg": 6.231923180166632e-05, "step": 75 }, { "epoch": 0.0095, "grad_norm": 2.3605074882507324, "grad_norm_var": 0.27132747056331724, "learning_rate": 7.6e-05, "loss": 1.1409, "loss/crossentropy": 2.6013262271881104, "loss/hidden": 0.98828125, "loss/logits": 0.151985764503479, "loss/reg": 6.231063889572397e-05, "step": 76 }, { "epoch": 0.009625, "grad_norm": 2.6056039333343506, "grad_norm_var": 0.2684276793201585, "learning_rate": 7.7e-05, "loss": 1.1, "loss/crossentropy": 2.534158945083618, "loss/hidden": 0.94921875, "loss/logits": 0.1501779407262802, "loss/reg": 6.230256258277223e-05, "step": 77 }, { "epoch": 0.00975, "grad_norm": 1.7923972606658936, "grad_norm_var": 0.285494251958092, "learning_rate": 7.800000000000001e-05, "loss": 1.1471, "loss/crossentropy": 2.3036601543426514, "loss/hidden": 0.98828125, "loss/logits": 0.15817409753799438, "loss/reg": 6.229766586329788e-05, "step": 78 }, { "epoch": 0.009875, "grad_norm": 2.0376312732696533, "grad_norm_var": 0.2825708803585835, "learning_rate": 7.900000000000001e-05, "loss": 1.2985, "loss/crossentropy": 2.5548579692840576, "loss/hidden": 1.140625, "loss/logits": 0.1572834551334381, "loss/reg": 6.229063728824258e-05, "step": 79 }, { "epoch": 0.01, "grad_norm": 2.998662233352661, "grad_norm_var": 0.29342903010298654, "learning_rate": 8e-05, "loss": 1.5504, "loss/crossentropy": 2.4098215103149414, "loss/hidden": 1.3046875, "loss/logits": 0.24512597918510437, "loss/reg": 6.22822335571982e-05, "step": 80 }, { "epoch": 0.010125, "grad_norm": 2.103449583053589, "grad_norm_var": 0.29374293883859787, "learning_rate": 8.1e-05, "loss": 1.2985, "loss/crossentropy": 2.380378484725952, "loss/hidden": 1.125, "loss/logits": 0.17282900214195251, "loss/reg": 6.227292760740966e-05, "step": 81 }, { "epoch": 0.01025, "grad_norm": 2.6376256942749023, "grad_norm_var": 0.2615363410208279, "learning_rate": 8.2e-05, "loss": 1.266, "loss/crossentropy": 2.4291374683380127, "loss/hidden": 1.1015625, "loss/logits": 0.16384728252887726, "loss/reg": 6.226752884685993e-05, "step": 82 }, { "epoch": 0.010375, "grad_norm": 2.0763561725616455, "grad_norm_var": 0.2675552215302521, "learning_rate": 8.3e-05, "loss": 1.1733, "loss/crossentropy": 2.423896312713623, "loss/hidden": 1.015625, "loss/logits": 0.15705125033855438, "loss/reg": 6.225931429071352e-05, "step": 83 }, { "epoch": 0.0105, "grad_norm": 4.398110866546631, "grad_norm_var": 0.5173355174320988, "learning_rate": 8.4e-05, "loss": 1.5654, "loss/crossentropy": 2.230816602706909, "loss/hidden": 1.296875, "loss/logits": 0.26791903376579285, "loss/reg": 6.225006654858589e-05, "step": 84 }, { "epoch": 0.010625, "grad_norm": 2.7163784503936768, "grad_norm_var": 0.4955558090734691, "learning_rate": 8.5e-05, "loss": 1.2008, "loss/crossentropy": 2.1671087741851807, "loss/hidden": 1.0546875, "loss/logits": 0.145525261759758, "loss/reg": 6.224414391908795e-05, "step": 85 }, { "epoch": 0.01075, "grad_norm": 1.9465394020080566, "grad_norm_var": 0.5129132822581631, "learning_rate": 8.6e-05, "loss": 1.0109, "loss/crossentropy": 2.218550443649292, "loss/hidden": 0.90234375, "loss/logits": 0.10795612633228302, "loss/reg": 6.22385778115131e-05, "step": 86 }, { "epoch": 0.010875, "grad_norm": 5.668015956878662, "grad_norm_var": 1.0880389746416426, "learning_rate": 8.7e-05, "loss": 1.2925, "loss/crossentropy": 2.360995292663574, "loss/hidden": 1.1484375, "loss/logits": 0.1434704214334488, "loss/reg": 6.223141826922074e-05, "step": 87 }, { "epoch": 0.011, "grad_norm": 3.4049394130706787, "grad_norm_var": 1.0721571012465496, "learning_rate": 8.800000000000001e-05, "loss": 1.6353, "loss/crossentropy": 1.9898579120635986, "loss/hidden": 1.3828125, "loss/logits": 0.25186440348625183, "loss/reg": 6.222462252480909e-05, "step": 88 }, { "epoch": 0.011125, "grad_norm": 1.885895013809204, "grad_norm_var": 1.1148297312339375, "learning_rate": 8.900000000000001e-05, "loss": 1.0561, "loss/crossentropy": 2.670912027359009, "loss/hidden": 0.92578125, "loss/logits": 0.12972213327884674, "loss/reg": 6.221828516572714e-05, "step": 89 }, { "epoch": 0.01125, "grad_norm": 1.886960506439209, "grad_norm_var": 1.118003608268531, "learning_rate": 9e-05, "loss": 1.1335, "loss/crossentropy": 2.5691866874694824, "loss/hidden": 0.97265625, "loss/logits": 0.16021151840686798, "loss/reg": 6.221193325472996e-05, "step": 90 }, { "epoch": 0.011375, "grad_norm": 3.117880344390869, "grad_norm_var": 1.0979090394478965, "learning_rate": 9.1e-05, "loss": 1.3175, "loss/crossentropy": 2.7383711338043213, "loss/hidden": 1.140625, "loss/logits": 0.1762513369321823, "loss/reg": 6.220516661414877e-05, "step": 91 }, { "epoch": 0.0115, "grad_norm": 2.5928220748901367, "grad_norm_var": 1.0899203711980436, "learning_rate": 9.200000000000001e-05, "loss": 1.3898, "loss/crossentropy": 2.255321741104126, "loss/hidden": 1.171875, "loss/logits": 0.21727776527404785, "loss/reg": 6.2199542298913e-05, "step": 92 }, { "epoch": 0.011625, "grad_norm": 2.5842387676239014, "grad_norm_var": 1.09033696415262, "learning_rate": 9.300000000000001e-05, "loss": 1.3599, "loss/crossentropy": 2.7780256271362305, "loss/hidden": 1.15625, "loss/logits": 0.203078031539917, "loss/reg": 6.219152419362217e-05, "step": 93 }, { "epoch": 0.01175, "grad_norm": 2.497912645339966, "grad_norm_var": 1.032260222561935, "learning_rate": 9.4e-05, "loss": 1.2791, "loss/crossentropy": 2.0482513904571533, "loss/hidden": 1.109375, "loss/logits": 0.16910339891910553, "loss/reg": 6.218066846486181e-05, "step": 94 }, { "epoch": 0.011875, "grad_norm": 2.1033713817596436, "grad_norm_var": 1.0259829914817806, "learning_rate": 9.5e-05, "loss": 1.0875, "loss/crossentropy": 2.427816152572632, "loss/hidden": 0.94921875, "loss/logits": 0.13770164549350739, "loss/reg": 6.21745057287626e-05, "step": 95 }, { "epoch": 0.012, "grad_norm": 2.063559055328369, "grad_norm_var": 1.0544556100156115, "learning_rate": 9.6e-05, "loss": 1.217, "loss/crossentropy": 2.498270034790039, "loss/hidden": 1.046875, "loss/logits": 0.16950619220733643, "loss/reg": 6.216309702722356e-05, "step": 96 }, { "epoch": 0.012125, "grad_norm": 2.3693654537200928, "grad_norm_var": 1.036651450071012, "learning_rate": 9.7e-05, "loss": 1.2016, "loss/crossentropy": 2.8368701934814453, "loss/hidden": 1.0390625, "loss/logits": 0.16189493238925934, "loss/reg": 6.215785833774135e-05, "step": 97 }, { "epoch": 0.01225, "grad_norm": 2.2980258464813232, "grad_norm_var": 1.0488061784492646, "learning_rate": 9.8e-05, "loss": 1.5249, "loss/crossentropy": 2.194488525390625, "loss/hidden": 1.2421875, "loss/logits": 0.2820858359336853, "loss/reg": 6.215048051672056e-05, "step": 98 }, { "epoch": 0.012375, "grad_norm": 3.147524833679199, "grad_norm_var": 1.0277853179901806, "learning_rate": 9.900000000000001e-05, "loss": 1.7374, "loss/crossentropy": 2.7856016159057617, "loss/hidden": 1.4609375, "loss/logits": 0.27581536769866943, "loss/reg": 6.214459426701069e-05, "step": 99 }, { "epoch": 0.0125, "grad_norm": 2.1317031383514404, "grad_norm_var": 0.8636563030021608, "learning_rate": 0.0001, "loss": 1.3633, "loss/crossentropy": 2.282402753829956, "loss/hidden": 1.1484375, "loss/logits": 0.2142634242773056, "loss/reg": 6.213640881469473e-05, "step": 100 }, { "epoch": 0.012625, "grad_norm": 2.2720911502838135, "grad_norm_var": 0.8721171319962743, "learning_rate": 0.0001, "loss": 1.2405, "loss/crossentropy": 2.8501064777374268, "loss/hidden": 1.0625, "loss/logits": 0.17741592228412628, "loss/reg": 6.21288490947336e-05, "step": 101 }, { "epoch": 0.01275, "grad_norm": 2.879110097885132, "grad_norm_var": 0.8423375514351165, "learning_rate": 0.0001, "loss": 1.3486, "loss/crossentropy": 2.4649596214294434, "loss/hidden": 1.171875, "loss/logits": 0.1761254221200943, "loss/reg": 6.211963773239404e-05, "step": 102 }, { "epoch": 0.012875, "grad_norm": 2.2214345932006836, "grad_norm_var": 0.2123174305005847, "learning_rate": 0.0001, "loss": 1.1049, "loss/crossentropy": 2.513540029525757, "loss/hidden": 0.96484375, "loss/logits": 0.13943374156951904, "loss/reg": 6.21131548541598e-05, "step": 103 }, { "epoch": 0.013, "grad_norm": 1.9674383401870728, "grad_norm_var": 0.16151448650877043, "learning_rate": 0.0001, "loss": 1.2055, "loss/crossentropy": 2.4960575103759766, "loss/hidden": 1.03125, "loss/logits": 0.17365112900733948, "loss/reg": 6.210394349182025e-05, "step": 104 }, { "epoch": 0.013125, "grad_norm": 2.152989387512207, "grad_norm_var": 0.1485118756217919, "learning_rate": 0.0001, "loss": 1.3728, "loss/crossentropy": 2.651463508605957, "loss/hidden": 1.1796875, "loss/logits": 0.1924474835395813, "loss/reg": 6.209702405612916e-05, "step": 105 }, { "epoch": 0.01325, "grad_norm": 2.591555118560791, "grad_norm_var": 0.13200909593287988, "learning_rate": 0.0001, "loss": 1.5933, "loss/crossentropy": 2.1848952770233154, "loss/hidden": 1.375, "loss/logits": 0.21770122647285461, "loss/reg": 6.208720878930762e-05, "step": 106 }, { "epoch": 0.013375, "grad_norm": 2.205780029296875, "grad_norm_var": 0.10119294371901374, "learning_rate": 0.0001, "loss": 0.9785, "loss/crossentropy": 2.4988999366760254, "loss/hidden": 0.8671875, "loss/logits": 0.1106652021408081, "loss/reg": 6.207643309608102e-05, "step": 107 }, { "epoch": 0.0135, "grad_norm": 2.427882671356201, "grad_norm_var": 0.09821140867718908, "learning_rate": 0.0001, "loss": 1.2968, "loss/crossentropy": 2.5072600841522217, "loss/hidden": 1.09375, "loss/logits": 0.20241403579711914, "loss/reg": 6.206895341165364e-05, "step": 108 }, { "epoch": 0.013625, "grad_norm": 2.4435040950775146, "grad_norm_var": 0.09542213222792188, "learning_rate": 0.0001, "loss": 1.2803, "loss/crossentropy": 2.2629339694976807, "loss/hidden": 1.1015625, "loss/logits": 0.17810457944869995, "loss/reg": 6.205752288224176e-05, "step": 109 }, { "epoch": 0.01375, "grad_norm": 2.9938735961914062, "grad_norm_var": 0.11986086275213564, "learning_rate": 0.0001, "loss": 1.2708, "loss/crossentropy": 2.5084388256073, "loss/hidden": 1.09375, "loss/logits": 0.1764756739139557, "loss/reg": 6.204319652169943e-05, "step": 110 }, { "epoch": 0.013875, "grad_norm": 2.499802827835083, "grad_norm_var": 0.11443625726480532, "learning_rate": 0.0001, "loss": 1.3281, "loss/crossentropy": 2.342087507247925, "loss/hidden": 1.15625, "loss/logits": 0.17120838165283203, "loss/reg": 6.20328210061416e-05, "step": 111 }, { "epoch": 0.014, "grad_norm": 3.28193736076355, "grad_norm_var": 0.149862047644675, "learning_rate": 0.0001, "loss": 1.3891, "loss/crossentropy": 2.396040916442871, "loss/hidden": 1.1953125, "loss/logits": 0.193180650472641, "loss/reg": 6.202506483532488e-05, "step": 112 }, { "epoch": 0.014125, "grad_norm": 2.2074780464172363, "grad_norm_var": 0.15416329735346365, "learning_rate": 0.0001, "loss": 1.2137, "loss/crossentropy": 2.501718759536743, "loss/hidden": 1.0546875, "loss/logits": 0.1583903729915619, "loss/reg": 6.201667565619573e-05, "step": 113 }, { "epoch": 0.01425, "grad_norm": 2.888498306274414, "grad_norm_var": 0.1614203311265588, "learning_rate": 0.0001, "loss": 1.3498, "loss/crossentropy": 3.097370147705078, "loss/hidden": 1.15625, "loss/logits": 0.19293376803398132, "loss/reg": 6.200573989190161e-05, "step": 114 }, { "epoch": 0.014375, "grad_norm": 2.385442018508911, "grad_norm_var": 0.1339080451651928, "learning_rate": 0.0001, "loss": 1.3415, "loss/crossentropy": 2.4950473308563232, "loss/hidden": 1.15625, "loss/logits": 0.18464481830596924, "loss/reg": 6.199457857292145e-05, "step": 115 }, { "epoch": 0.0145, "grad_norm": 3.3269190788269043, "grad_norm_var": 0.16897616880053803, "learning_rate": 0.0001, "loss": 1.6405, "loss/crossentropy": 2.19484806060791, "loss/hidden": 1.3828125, "loss/logits": 0.2570968270301819, "loss/reg": 6.198590563144535e-05, "step": 116 }, { "epoch": 0.014625, "grad_norm": 2.2415361404418945, "grad_norm_var": 0.17015290356553733, "learning_rate": 0.0001, "loss": 1.2381, "loss/crossentropy": 2.540816068649292, "loss/hidden": 1.0625, "loss/logits": 0.1749531626701355, "loss/reg": 6.197726906975731e-05, "step": 117 }, { "epoch": 0.01475, "grad_norm": 2.397615671157837, "grad_norm_var": 0.1631737555736056, "learning_rate": 0.0001, "loss": 1.2192, "loss/crossentropy": 2.6213266849517822, "loss/hidden": 1.0546875, "loss/logits": 0.16386428475379944, "loss/reg": 6.197066250024363e-05, "step": 118 }, { "epoch": 0.014875, "grad_norm": 2.75325345993042, "grad_norm_var": 0.16006220619054398, "learning_rate": 0.0001, "loss": 1.5693, "loss/crossentropy": 2.3850035667419434, "loss/hidden": 1.34375, "loss/logits": 0.2249460369348526, "loss/reg": 6.196285539772362e-05, "step": 119 }, { "epoch": 0.015, "grad_norm": 2.675480842590332, "grad_norm_var": 0.13660137165245084, "learning_rate": 0.0001, "loss": 1.299, "loss/crossentropy": 2.380896806716919, "loss/hidden": 1.125, "loss/logits": 0.17339974641799927, "loss/reg": 6.195474998094141e-05, "step": 120 }, { "epoch": 0.015125, "grad_norm": 2.611541509628296, "grad_norm_var": 0.12289609882195597, "learning_rate": 0.0001, "loss": 1.2924, "loss/crossentropy": 2.7064404487609863, "loss/hidden": 1.109375, "loss/logits": 0.18236055970191956, "loss/reg": 6.194705929374322e-05, "step": 121 }, { "epoch": 0.01525, "grad_norm": 2.3449323177337646, "grad_norm_var": 0.12765774775469155, "learning_rate": 0.0001, "loss": 1.2957, "loss/crossentropy": 2.5846447944641113, "loss/hidden": 1.1171875, "loss/logits": 0.17786133289337158, "loss/reg": 6.193818262545392e-05, "step": 122 }, { "epoch": 0.015375, "grad_norm": 2.1001734733581543, "grad_norm_var": 0.13398098136615483, "learning_rate": 0.0001, "loss": 1.1704, "loss/crossentropy": 2.504185676574707, "loss/hidden": 1.015625, "loss/logits": 0.15416675806045532, "loss/reg": 6.192670116433874e-05, "step": 123 }, { "epoch": 0.0155, "grad_norm": 2.365839719772339, "grad_norm_var": 0.13563497966163046, "learning_rate": 0.0001, "loss": 1.3773, "loss/crossentropy": 2.3259832859039307, "loss/hidden": 1.171875, "loss/logits": 0.20480972528457642, "loss/reg": 6.19165730313398e-05, "step": 124 }, { "epoch": 0.015625, "grad_norm": 2.1480026245117188, "grad_norm_var": 0.1470561705316013, "learning_rate": 0.0001, "loss": 1.2768, "loss/crossentropy": 2.288093090057373, "loss/hidden": 1.109375, "loss/logits": 0.16683252155780792, "loss/reg": 6.19063139311038e-05, "step": 125 }, { "epoch": 0.01575, "grad_norm": 2.2346343994140625, "grad_norm_var": 0.14082182611320845, "learning_rate": 0.0001, "loss": 1.1441, "loss/crossentropy": 2.6062135696411133, "loss/hidden": 1.0, "loss/logits": 0.14351129531860352, "loss/reg": 6.189729174366221e-05, "step": 126 }, { "epoch": 0.015875, "grad_norm": 3.187627077102661, "grad_norm_var": 0.16771827237098264, "learning_rate": 0.0001, "loss": 1.4505, "loss/crossentropy": 2.3607077598571777, "loss/hidden": 1.2265625, "loss/logits": 0.22327345609664917, "loss/reg": 6.189044506754726e-05, "step": 127 }, { "epoch": 0.016, "grad_norm": 2.1208789348602295, "grad_norm_var": 0.1420574537193353, "learning_rate": 0.0001, "loss": 1.1414, "loss/crossentropy": 2.408287286758423, "loss/hidden": 1.0, "loss/logits": 0.14076298475265503, "loss/reg": 6.188445695443079e-05, "step": 128 }, { "epoch": 0.016125, "grad_norm": 2.4475457668304443, "grad_norm_var": 0.13631644029428572, "learning_rate": 0.0001, "loss": 1.2863, "loss/crossentropy": 2.4705042839050293, "loss/hidden": 1.1171875, "loss/logits": 0.16846278309822083, "loss/reg": 6.187462713569403e-05, "step": 129 }, { "epoch": 0.01625, "grad_norm": 2.3132476806640625, "grad_norm_var": 0.128302854564951, "learning_rate": 0.0001, "loss": 1.2265, "loss/crossentropy": 2.323221445083618, "loss/hidden": 1.0625, "loss/logits": 0.16340406239032745, "loss/reg": 6.18634803686291e-05, "step": 130 }, { "epoch": 0.016375, "grad_norm": 2.6015546321868896, "grad_norm_var": 0.12854282273958592, "learning_rate": 0.0001, "loss": 1.0946, "loss/crossentropy": 2.554730176925659, "loss/hidden": 0.9609375, "loss/logits": 0.13307343423366547, "loss/reg": 6.185180245665833e-05, "step": 131 }, { "epoch": 0.0165, "grad_norm": 2.040545701980591, "grad_norm_var": 0.08874970269449302, "learning_rate": 0.0001, "loss": 1.1715, "loss/crossentropy": 2.6177141666412354, "loss/hidden": 1.0078125, "loss/logits": 0.163020521402359, "loss/reg": 6.184292578836903e-05, "step": 132 }, { "epoch": 0.016625, "grad_norm": 2.4451427459716797, "grad_norm_var": 0.08672588329890019, "learning_rate": 0.0001, "loss": 1.2794, "loss/crossentropy": 2.6671459674835205, "loss/hidden": 1.109375, "loss/logits": 0.16941678524017334, "loss/reg": 6.18349076830782e-05, "step": 133 }, { "epoch": 0.01675, "grad_norm": 2.5730879306793213, "grad_norm_var": 0.08802712142174655, "learning_rate": 0.0001, "loss": 1.356, "loss/crossentropy": 2.483858585357666, "loss/hidden": 1.171875, "loss/logits": 0.1835438758134842, "loss/reg": 6.182605284266174e-05, "step": 134 }, { "epoch": 0.016875, "grad_norm": 2.996643543243408, "grad_norm_var": 0.10205043083370029, "learning_rate": 0.0001, "loss": 1.5067, "loss/crossentropy": 2.267930507659912, "loss/hidden": 1.3046875, "loss/logits": 0.20140591263771057, "loss/reg": 6.181577919051051e-05, "step": 135 }, { "epoch": 0.017, "grad_norm": 2.2333881855010986, "grad_norm_var": 0.10100001995976887, "learning_rate": 0.0001, "loss": 1.23, "loss/crossentropy": 2.552584648132324, "loss/hidden": 1.0546875, "loss/logits": 0.17466390132904053, "loss/reg": 6.180404307087883e-05, "step": 136 }, { "epoch": 0.017125, "grad_norm": 2.476086378097534, "grad_norm_var": 0.09873795942098601, "learning_rate": 0.0001, "loss": 1.2347, "loss/crossentropy": 2.2955551147460938, "loss/hidden": 1.09375, "loss/logits": 0.1402929574251175, "loss/reg": 6.179526099003851e-05, "step": 137 }, { "epoch": 0.01725, "grad_norm": 2.9701859951019287, "grad_norm_var": 0.11738609069977789, "learning_rate": 0.0001, "loss": 1.1041, "loss/crossentropy": 2.4560158252716064, "loss/hidden": 0.97265625, "loss/logits": 0.1307787150144577, "loss/reg": 6.178120383992791e-05, "step": 138 }, { "epoch": 0.017375, "grad_norm": 2.151567220687866, "grad_norm_var": 0.11513060923898569, "learning_rate": 0.0001, "loss": 1.1406, "loss/crossentropy": 2.6192235946655273, "loss/hidden": 0.98828125, "loss/logits": 0.15172292292118073, "loss/reg": 6.176753231557086e-05, "step": 139 }, { "epoch": 0.0175, "grad_norm": 2.0209085941314697, "grad_norm_var": 0.1267419293205286, "learning_rate": 0.0001, "loss": 1.0928, "loss/crossentropy": 2.6628799438476562, "loss/hidden": 0.94921875, "loss/logits": 0.14296585321426392, "loss/reg": 6.175567978061736e-05, "step": 140 }, { "epoch": 0.017625, "grad_norm": 3.458299398422241, "grad_norm_var": 0.18389511336323494, "learning_rate": 0.0001, "loss": 1.3966, "loss/crossentropy": 2.885798692703247, "loss/hidden": 1.171875, "loss/logits": 0.22411209344863892, "loss/reg": 6.174653390189633e-05, "step": 141 }, { "epoch": 0.01775, "grad_norm": 2.608558177947998, "grad_norm_var": 0.17855808227350187, "learning_rate": 0.0001, "loss": 1.1734, "loss/crossentropy": 2.2689590454101562, "loss/hidden": 1.0234375, "loss/logits": 0.1493585705757141, "loss/reg": 6.1732207541354e-05, "step": 142 }, { "epoch": 0.017875, "grad_norm": 2.7264318466186523, "grad_norm_var": 0.1520478077633771, "learning_rate": 0.0001, "loss": 1.2868, "loss/crossentropy": 2.3888814449310303, "loss/hidden": 1.1171875, "loss/logits": 0.16896918416023254, "loss/reg": 6.172260327730328e-05, "step": 143 }, { "epoch": 0.018, "grad_norm": 2.4999561309814453, "grad_norm_var": 0.14128539295791806, "learning_rate": 0.0001, "loss": 1.3804, "loss/crossentropy": 2.442732572555542, "loss/hidden": 1.1875, "loss/logits": 0.19230639934539795, "loss/reg": 6.171311542857438e-05, "step": 144 }, { "epoch": 0.018125, "grad_norm": 3.084848642349243, "grad_norm_var": 0.1592220375940921, "learning_rate": 0.0001, "loss": 1.5124, "loss/crossentropy": 2.6801810264587402, "loss/hidden": 1.2421875, "loss/logits": 0.2696050703525543, "loss/reg": 6.170615233713761e-05, "step": 145 }, { "epoch": 0.01825, "grad_norm": 3.0833539962768555, "grad_norm_var": 0.16940866671487811, "learning_rate": 0.0001, "loss": 1.294, "loss/crossentropy": 2.434020519256592, "loss/hidden": 1.140625, "loss/logits": 0.15272179245948792, "loss/reg": 6.170049164211378e-05, "step": 146 }, { "epoch": 0.018375, "grad_norm": 2.2046446800231934, "grad_norm_var": 0.18039814292173043, "learning_rate": 0.0001, "loss": 1.1769, "loss/crossentropy": 2.5624289512634277, "loss/hidden": 1.015625, "loss/logits": 0.160653755068779, "loss/reg": 6.169131665956229e-05, "step": 147 }, { "epoch": 0.0185, "grad_norm": 1.9920902252197266, "grad_norm_var": 0.18414873169562326, "learning_rate": 0.0001, "loss": 1.1186, "loss/crossentropy": 2.709728479385376, "loss/hidden": 0.96875, "loss/logits": 0.1492651402950287, "loss/reg": 6.168704567244276e-05, "step": 148 }, { "epoch": 0.018625, "grad_norm": 2.7053756713867188, "grad_norm_var": 0.18317033653553666, "learning_rate": 0.0001, "loss": 1.2849, "loss/crossentropy": 2.594032049179077, "loss/hidden": 1.09375, "loss/logits": 0.1905450075864792, "loss/reg": 6.168089748825878e-05, "step": 149 }, { "epoch": 0.01875, "grad_norm": 2.1234872341156006, "grad_norm_var": 0.1981121598309187, "learning_rate": 0.0001, "loss": 1.2526, "loss/crossentropy": 2.5880792140960693, "loss/hidden": 1.0703125, "loss/logits": 0.18171370029449463, "loss/reg": 6.167205719975755e-05, "step": 150 }, { "epoch": 0.018875, "grad_norm": 2.4820902347564697, "grad_norm_var": 0.18631464898325945, "learning_rate": 0.0001, "loss": 1.1869, "loss/crossentropy": 2.2422618865966797, "loss/hidden": 1.0234375, "loss/logits": 0.16288068890571594, "loss/reg": 6.166584353195503e-05, "step": 151 }, { "epoch": 0.019, "grad_norm": 2.5669338703155518, "grad_norm_var": 0.17912821539433874, "learning_rate": 0.0001, "loss": 1.0968, "loss/crossentropy": 2.5655312538146973, "loss/hidden": 0.953125, "loss/logits": 0.1430792212486267, "loss/reg": 6.165904778754339e-05, "step": 152 }, { "epoch": 0.019125, "grad_norm": 2.191638469696045, "grad_norm_var": 0.18782946638749062, "learning_rate": 0.0001, "loss": 1.297, "loss/crossentropy": 2.3935883045196533, "loss/hidden": 1.109375, "loss/logits": 0.18698745965957642, "loss/reg": 6.165434024296701e-05, "step": 153 }, { "epoch": 0.01925, "grad_norm": 1.9139376878738403, "grad_norm_var": 0.19900155234911943, "learning_rate": 0.0001, "loss": 1.1497, "loss/crossentropy": 2.5978732109069824, "loss/hidden": 0.99609375, "loss/logits": 0.1530168354511261, "loss/reg": 6.164138176245615e-05, "step": 154 }, { "epoch": 0.019375, "grad_norm": 2.061805486679077, "grad_norm_var": 0.20353621009625153, "learning_rate": 0.0001, "loss": 1.034, "loss/crossentropy": 2.29733943939209, "loss/hidden": 0.91015625, "loss/logits": 0.12318030744791031, "loss/reg": 6.162770296214148e-05, "step": 155 }, { "epoch": 0.0195, "grad_norm": 2.686328649520874, "grad_norm_var": 0.19023239802865194, "learning_rate": 0.0001, "loss": 1.4235, "loss/crossentropy": 2.2928433418273926, "loss/hidden": 1.2265625, "loss/logits": 0.19631928205490112, "loss/reg": 6.16170436842367e-05, "step": 156 }, { "epoch": 0.019625, "grad_norm": 2.6863300800323486, "grad_norm_var": 0.13134889378527811, "learning_rate": 0.0001, "loss": 1.4147, "loss/crossentropy": 2.289113759994507, "loss/hidden": 1.21875, "loss/logits": 0.19536322355270386, "loss/reg": 6.160605698823929e-05, "step": 157 }, { "epoch": 0.01975, "grad_norm": 3.7774782180786133, "grad_norm_var": 0.2373896188726722, "learning_rate": 0.0001, "loss": 1.3606, "loss/crossentropy": 2.4960098266601562, "loss/hidden": 1.171875, "loss/logits": 0.18812544643878937, "loss/reg": 6.159812619443983e-05, "step": 158 }, { "epoch": 0.019875, "grad_norm": 2.5556654930114746, "grad_norm_var": 0.23517615853210802, "learning_rate": 0.0001, "loss": 1.1015, "loss/crossentropy": 2.4794013500213623, "loss/hidden": 0.9609375, "loss/logits": 0.1399209052324295, "loss/reg": 6.158895121188834e-05, "step": 159 }, { "epoch": 0.02, "grad_norm": 2.3351266384124756, "grad_norm_var": 0.23772124659223212, "learning_rate": 0.0001, "loss": 1.1072, "loss/crossentropy": 2.402188301086426, "loss/hidden": 0.96484375, "loss/logits": 0.14173097908496857, "loss/reg": 6.158249016152695e-05, "step": 160 }, { "epoch": 0.020125, "grad_norm": 2.319366455078125, "grad_norm_var": 0.21752957054554395, "learning_rate": 0.0001, "loss": 1.1774, "loss/crossentropy": 2.1729917526245117, "loss/hidden": 1.0234375, "loss/logits": 0.15335121750831604, "loss/reg": 6.157202733447775e-05, "step": 161 }, { "epoch": 0.02025, "grad_norm": 2.0917341709136963, "grad_norm_var": 0.19926011430610652, "learning_rate": 0.0001, "loss": 1.2443, "loss/crossentropy": 2.276581048965454, "loss/hidden": 1.0859375, "loss/logits": 0.1577274203300476, "loss/reg": 6.156737799756229e-05, "step": 162 }, { "epoch": 0.020375, "grad_norm": 4.31035041809082, "grad_norm_var": 0.41637723338655513, "learning_rate": 0.0001, "loss": 1.8974, "loss/crossentropy": 2.6449058055877686, "loss/hidden": 1.5625, "loss/logits": 0.33430173993110657, "loss/reg": 6.156211748020723e-05, "step": 163 }, { "epoch": 0.0205, "grad_norm": 2.145301342010498, "grad_norm_var": 0.4064476055559296, "learning_rate": 0.0001, "loss": 1.2636, "loss/crossentropy": 2.613586664199829, "loss/hidden": 1.078125, "loss/logits": 0.1848127692937851, "loss/reg": 6.155785376904532e-05, "step": 164 }, { "epoch": 0.020625, "grad_norm": 3.6308248043060303, "grad_norm_var": 0.47796885273955964, "learning_rate": 0.0001, "loss": 1.2327, "loss/crossentropy": 2.599729537963867, "loss/hidden": 1.046875, "loss/logits": 0.1852511763572693, "loss/reg": 6.154972652439028e-05, "step": 165 }, { "epoch": 0.02075, "grad_norm": 2.812910556793213, "grad_norm_var": 0.4622733920417279, "learning_rate": 0.0001, "loss": 1.3898, "loss/crossentropy": 2.7171225547790527, "loss/hidden": 1.1875, "loss/logits": 0.20167264342308044, "loss/reg": 6.154461152618751e-05, "step": 166 }, { "epoch": 0.020875, "grad_norm": 2.4922893047332764, "grad_norm_var": 0.46203729327833537, "learning_rate": 0.0001, "loss": 1.3528, "loss/crossentropy": 2.648606777191162, "loss/hidden": 1.140625, "loss/logits": 0.21159711480140686, "loss/reg": 6.153558933874592e-05, "step": 167 }, { "epoch": 0.021, "grad_norm": 2.2380781173706055, "grad_norm_var": 0.47292652355391496, "learning_rate": 0.0001, "loss": 1.3863, "loss/crossentropy": 2.5556812286376953, "loss/hidden": 1.1796875, "loss/logits": 0.20603393018245697, "loss/reg": 6.152570131234825e-05, "step": 168 }, { "epoch": 0.021125, "grad_norm": 2.8179726600646973, "grad_norm_var": 0.4599538691877346, "learning_rate": 0.0001, "loss": 1.3315, "loss/crossentropy": 2.285341262817383, "loss/hidden": 1.140625, "loss/logits": 0.19030849635601044, "loss/reg": 6.151832349132746e-05, "step": 169 }, { "epoch": 0.02125, "grad_norm": 2.933023691177368, "grad_norm_var": 0.42080948451517297, "learning_rate": 0.0001, "loss": 1.5924, "loss/crossentropy": 2.254920482635498, "loss/hidden": 1.3828125, "loss/logits": 0.20900759100914001, "loss/reg": 6.151078559923917e-05, "step": 170 }, { "epoch": 0.021375, "grad_norm": 2.9309163093566895, "grad_norm_var": 0.38903358238886365, "learning_rate": 0.0001, "loss": 1.2104, "loss/crossentropy": 2.771516799926758, "loss/hidden": 1.0546875, "loss/logits": 0.15512725710868835, "loss/reg": 6.14999225945212e-05, "step": 171 }, { "epoch": 0.0215, "grad_norm": 2.7658286094665527, "grad_norm_var": 0.3882477326935183, "learning_rate": 0.0001, "loss": 1.2183, "loss/crossentropy": 2.565211296081543, "loss/hidden": 1.0546875, "loss/logits": 0.16297924518585205, "loss/reg": 6.149257387733087e-05, "step": 172 }, { "epoch": 0.021625, "grad_norm": 3.39176344871521, "grad_norm_var": 0.40840451933244426, "learning_rate": 0.0001, "loss": 1.3931, "loss/crossentropy": 2.4181013107299805, "loss/hidden": 1.1875, "loss/logits": 0.2049458771944046, "loss/reg": 6.148203829070553e-05, "step": 173 }, { "epoch": 0.02175, "grad_norm": 2.7971994876861572, "grad_norm_var": 0.3468190736041642, "learning_rate": 0.0001, "loss": 1.2467, "loss/crossentropy": 2.644824981689453, "loss/hidden": 1.0703125, "loss/logits": 0.17579111456871033, "loss/reg": 6.147275416878983e-05, "step": 174 }, { "epoch": 0.021875, "grad_norm": 7.143955707550049, "grad_norm_var": 1.5219747541806836, "learning_rate": 0.0001, "loss": 1.3279, "loss/crossentropy": 2.6274638175964355, "loss/hidden": 1.171875, "loss/logits": 0.15536972880363464, "loss/reg": 6.146173836896196e-05, "step": 175 }, { "epoch": 0.022, "grad_norm": 8.911324501037598, "grad_norm_var": 3.578509022301667, "learning_rate": 0.0001, "loss": 1.8863, "loss/crossentropy": 1.8980119228363037, "loss/hidden": 1.765625, "loss/logits": 0.12003660202026367, "loss/reg": 6.145203224150464e-05, "step": 176 }, { "epoch": 0.022125, "grad_norm": 2.14353609085083, "grad_norm_var": 3.6077286646662734, "learning_rate": 0.0001, "loss": 1.1573, "loss/crossentropy": 2.1538591384887695, "loss/hidden": 1.015625, "loss/logits": 0.1410439908504486, "loss/reg": 6.144325743662193e-05, "step": 177 }, { "epoch": 0.02225, "grad_norm": 4.625613212585449, "grad_norm_var": 3.542583274880191, "learning_rate": 0.0001, "loss": 1.6226, "loss/crossentropy": 2.7923362255096436, "loss/hidden": 1.375, "loss/logits": 0.24694563448429108, "loss/reg": 6.143252539914101e-05, "step": 178 }, { "epoch": 0.022375, "grad_norm": 2.543745517730713, "grad_norm_var": 3.5775446556342367, "learning_rate": 0.0001, "loss": 1.4192, "loss/crossentropy": 2.3237483501434326, "loss/hidden": 1.203125, "loss/logits": 0.21549411118030548, "loss/reg": 6.14215387031436e-05, "step": 179 }, { "epoch": 0.0225, "grad_norm": 2.3068995475769043, "grad_norm_var": 3.5495511663474453, "learning_rate": 0.0001, "loss": 1.2428, "loss/crossentropy": 2.7135560512542725, "loss/hidden": 1.0859375, "loss/logits": 0.1562565714120865, "loss/reg": 6.141421181382611e-05, "step": 180 }, { "epoch": 0.022625, "grad_norm": 3.465264081954956, "grad_norm_var": 3.5490467443763025, "learning_rate": 0.0001, "loss": 1.4771, "loss/crossentropy": 3.3183774948120117, "loss/hidden": 1.234375, "loss/logits": 0.2421126663684845, "loss/reg": 6.140418554423377e-05, "step": 181 }, { "epoch": 0.02275, "grad_norm": 2.696394205093384, "grad_norm_var": 3.5608805573030993, "learning_rate": 0.0001, "loss": 1.2269, "loss/crossentropy": 2.609964370727539, "loss/hidden": 1.0546875, "loss/logits": 0.17162814736366272, "loss/reg": 6.139430479379371e-05, "step": 182 }, { "epoch": 0.022875, "grad_norm": 2.3278727531433105, "grad_norm_var": 3.5849405900569513, "learning_rate": 0.0001, "loss": 1.0795, "loss/crossentropy": 2.753383159637451, "loss/hidden": 0.9453125, "loss/logits": 0.1335984170436859, "loss/reg": 6.138216122053564e-05, "step": 183 }, { "epoch": 0.023, "grad_norm": 2.4336531162261963, "grad_norm_var": 3.554360278579671, "learning_rate": 0.0001, "loss": 1.3948, "loss/crossentropy": 2.4162991046905518, "loss/hidden": 1.171875, "loss/logits": 0.22235547006130219, "loss/reg": 6.137174204923213e-05, "step": 184 }, { "epoch": 0.023125, "grad_norm": 2.420710802078247, "grad_norm_var": 3.601127481620784, "learning_rate": 0.0001, "loss": 1.4926, "loss/crossentropy": 2.30292010307312, "loss/hidden": 1.296875, "loss/logits": 0.19511133432388306, "loss/reg": 6.136245065135881e-05, "step": 185 }, { "epoch": 0.02325, "grad_norm": 2.727184534072876, "grad_norm_var": 3.6190579859970224, "learning_rate": 0.0001, "loss": 1.2816, "loss/crossentropy": 2.4605464935302734, "loss/hidden": 1.0703125, "loss/logits": 0.2107134908437729, "loss/reg": 6.135714647825807e-05, "step": 186 }, { "epoch": 0.023375, "grad_norm": 1.9292963743209839, "grad_norm_var": 3.754688597499932, "learning_rate": 0.0001, "loss": 1.1628, "loss/crossentropy": 2.5925047397613525, "loss/hidden": 1.0, "loss/logits": 0.16220712661743164, "loss/reg": 6.134893919806927e-05, "step": 187 }, { "epoch": 0.0235, "grad_norm": 2.1395771503448486, "grad_norm_var": 3.833355540800866, "learning_rate": 0.0001, "loss": 1.2712, "loss/crossentropy": 2.227994441986084, "loss/hidden": 1.0859375, "loss/logits": 0.18463259935379028, "loss/reg": 6.134230352472514e-05, "step": 188 }, { "epoch": 0.023625, "grad_norm": 3.552602529525757, "grad_norm_var": 3.8353265135005175, "learning_rate": 0.0001, "loss": 1.2518, "loss/crossentropy": 2.562777280807495, "loss/hidden": 1.0859375, "loss/logits": 0.16521015763282776, "loss/reg": 6.13337178947404e-05, "step": 189 }, { "epoch": 0.02375, "grad_norm": 2.766602039337158, "grad_norm_var": 3.8377842837978386, "learning_rate": 0.0001, "loss": 1.3731, "loss/crossentropy": 2.4200425148010254, "loss/hidden": 1.203125, "loss/logits": 0.1694013774394989, "loss/reg": 6.132431008154526e-05, "step": 190 }, { "epoch": 0.023875, "grad_norm": 2.403444528579712, "grad_norm_var": 2.8653780273055327, "learning_rate": 0.0001, "loss": 1.1651, "loss/crossentropy": 2.6963400840759277, "loss/hidden": 1.0078125, "loss/logits": 0.1566968709230423, "loss/reg": 6.132054841145873e-05, "step": 191 }, { "epoch": 0.024, "grad_norm": 2.0356028079986572, "grad_norm_var": 0.4806738598539164, "learning_rate": 0.0001, "loss": 1.4298, "loss/crossentropy": 2.174285650253296, "loss/hidden": 1.21875, "loss/logits": 0.21048110723495483, "loss/reg": 6.13146330579184e-05, "step": 192 }, { "epoch": 0.024125, "grad_norm": 2.501723051071167, "grad_norm_var": 0.4641524277019669, "learning_rate": 0.0001, "loss": 1.2669, "loss/crossentropy": 2.6477620601654053, "loss/hidden": 1.09375, "loss/logits": 0.17256709933280945, "loss/reg": 6.130609108367935e-05, "step": 193 }, { "epoch": 0.02425, "grad_norm": 2.8256325721740723, "grad_norm_var": 0.19964871735684203, "learning_rate": 0.0001, "loss": 1.364, "loss/crossentropy": 2.4205310344696045, "loss/hidden": 1.1875, "loss/logits": 0.17588719725608826, "loss/reg": 6.129377288743854e-05, "step": 194 }, { "epoch": 0.024375, "grad_norm": 3.715850353240967, "grad_norm_var": 0.28183777248683595, "learning_rate": 0.0001, "loss": 1.4108, "loss/crossentropy": 2.5872642993927, "loss/hidden": 1.234375, "loss/logits": 0.1758473813533783, "loss/reg": 6.128078530309722e-05, "step": 195 }, { "epoch": 0.0245, "grad_norm": 3.3498318195343018, "grad_norm_var": 0.3034271167360647, "learning_rate": 0.0001, "loss": 1.3691, "loss/crossentropy": 2.6444506645202637, "loss/hidden": 1.171875, "loss/logits": 0.19665929675102234, "loss/reg": 6.126934749772772e-05, "step": 196 }, { "epoch": 0.024625, "grad_norm": 2.0526957511901855, "grad_norm_var": 0.2850787945150557, "learning_rate": 0.0001, "loss": 1.2051, "loss/crossentropy": 2.592327117919922, "loss/hidden": 1.0390625, "loss/logits": 0.16540399193763733, "loss/reg": 6.125810614321381e-05, "step": 197 }, { "epoch": 0.02475, "grad_norm": 2.4300317764282227, "grad_norm_var": 0.28670823409057716, "learning_rate": 0.0001, "loss": 1.5286, "loss/crossentropy": 2.36305570602417, "loss/hidden": 1.2890625, "loss/logits": 0.2389371693134308, "loss/reg": 6.124811625340953e-05, "step": 198 }, { "epoch": 0.024875, "grad_norm": 2.3255856037139893, "grad_norm_var": 0.28679178178242776, "learning_rate": 0.0001, "loss": 1.1743, "loss/crossentropy": 2.0803394317626953, "loss/hidden": 1.03125, "loss/logits": 0.1424179971218109, "loss/reg": 6.124229548731819e-05, "step": 199 }, { "epoch": 0.025, "grad_norm": 2.2634005546569824, "grad_norm_var": 0.2923937566916393, "learning_rate": 0.0001, "loss": 1.2619, "loss/crossentropy": 2.427354574203491, "loss/hidden": 1.0859375, "loss/logits": 0.1753256618976593, "loss/reg": 6.123317871242762e-05, "step": 200 }, { "epoch": 0.025125, "grad_norm": 2.789698839187622, "grad_norm_var": 0.292575209213462, "learning_rate": 0.0001, "loss": 1.2794, "loss/crossentropy": 2.4137160778045654, "loss/hidden": 1.1328125, "loss/logits": 0.14599566161632538, "loss/reg": 6.122920603957027e-05, "step": 201 }, { "epoch": 0.02525, "grad_norm": 2.23150897026062, "grad_norm_var": 0.3003877767651639, "learning_rate": 0.0001, "loss": 1.2906, "loss/crossentropy": 2.502619743347168, "loss/hidden": 1.09375, "loss/logits": 0.19620737433433533, "loss/reg": 6.122409831732512e-05, "step": 202 }, { "epoch": 0.025375, "grad_norm": 3.3167238235473633, "grad_norm_var": 0.2999410613935005, "learning_rate": 0.0001, "loss": 1.4511, "loss/crossentropy": 2.5889461040496826, "loss/hidden": 1.2265625, "loss/logits": 0.2239363044500351, "loss/reg": 6.122187187429518e-05, "step": 203 }, { "epoch": 0.0255, "grad_norm": 2.5847971439361572, "grad_norm_var": 0.28091485279191464, "learning_rate": 0.0001, "loss": 1.248, "loss/crossentropy": 2.4720451831817627, "loss/hidden": 1.078125, "loss/logits": 0.16930653154850006, "loss/reg": 6.120974285295233e-05, "step": 204 }, { "epoch": 0.025625, "grad_norm": 2.071563243865967, "grad_norm_var": 0.24897236933793085, "learning_rate": 0.0001, "loss": 1.1016, "loss/crossentropy": 2.5648884773254395, "loss/hidden": 0.96875, "loss/logits": 0.13218875229358673, "loss/reg": 6.120166654000059e-05, "step": 205 }, { "epoch": 0.02575, "grad_norm": 2.9454479217529297, "grad_norm_var": 0.2548478796483238, "learning_rate": 0.0001, "loss": 1.3574, "loss/crossentropy": 2.607356309890747, "loss/hidden": 1.15625, "loss/logits": 0.20053817331790924, "loss/reg": 6.119644967839122e-05, "step": 206 }, { "epoch": 0.025875, "grad_norm": 3.396070718765259, "grad_norm_var": 0.28840087929906133, "learning_rate": 0.0001, "loss": 1.1743, "loss/crossentropy": 2.682058334350586, "loss/hidden": 1.0078125, "loss/logits": 0.16590501368045807, "loss/reg": 6.11838695476763e-05, "step": 207 }, { "epoch": 0.026, "grad_norm": 2.4477601051330566, "grad_norm_var": 0.26375613878289506, "learning_rate": 0.0001, "loss": 1.3022, "loss/crossentropy": 2.819031000137329, "loss/hidden": 1.109375, "loss/logits": 0.19222432374954224, "loss/reg": 6.117635348346084e-05, "step": 208 }, { "epoch": 0.026125, "grad_norm": 2.5916216373443604, "grad_norm_var": 0.2618484053528464, "learning_rate": 0.0001, "loss": 1.353, "loss/crossentropy": 2.529510259628296, "loss/hidden": 1.15625, "loss/logits": 0.19612029194831848, "loss/reg": 6.116151052992791e-05, "step": 209 }, { "epoch": 0.02625, "grad_norm": 2.108261823654175, "grad_norm_var": 0.28282181699858694, "learning_rate": 0.0001, "loss": 1.2782, "loss/crossentropy": 2.3222012519836426, "loss/hidden": 1.09375, "loss/logits": 0.18379396200180054, "loss/reg": 6.114997813710943e-05, "step": 210 }, { "epoch": 0.026375, "grad_norm": 2.48710560798645, "grad_norm_var": 0.20482550381518247, "learning_rate": 0.0001, "loss": 1.2718, "loss/crossentropy": 2.6183624267578125, "loss/hidden": 1.0859375, "loss/logits": 0.18522073328495026, "loss/reg": 6.114102870924398e-05, "step": 211 }, { "epoch": 0.0265, "grad_norm": 2.63779616355896, "grad_norm_var": 0.1640915083279668, "learning_rate": 0.0001, "loss": 1.3499, "loss/crossentropy": 2.391116142272949, "loss/hidden": 1.1640625, "loss/logits": 0.18524512648582458, "loss/reg": 6.112866685725749e-05, "step": 212 }, { "epoch": 0.026625, "grad_norm": 2.7476329803466797, "grad_norm_var": 0.14889028663519804, "learning_rate": 0.0001, "loss": 1.2842, "loss/crossentropy": 2.5770251750946045, "loss/hidden": 1.1171875, "loss/logits": 0.16641706228256226, "loss/reg": 6.111864786362275e-05, "step": 213 }, { "epoch": 0.02675, "grad_norm": 2.565723419189453, "grad_norm_var": 0.14722036218699916, "learning_rate": 0.0001, "loss": 1.2381, "loss/crossentropy": 2.80257248878479, "loss/hidden": 1.0546875, "loss/logits": 0.18279102444648743, "loss/reg": 6.110716640250757e-05, "step": 214 }, { "epoch": 0.026875, "grad_norm": 4.107775688171387, "grad_norm_var": 0.2818514081658729, "learning_rate": 0.0001, "loss": 1.5243, "loss/crossentropy": 2.4806065559387207, "loss/hidden": 1.3046875, "loss/logits": 0.2190462350845337, "loss/reg": 6.109999230829999e-05, "step": 215 }, { "epoch": 0.027, "grad_norm": 2.3829445838928223, "grad_norm_var": 0.27569299833046823, "learning_rate": 0.0001, "loss": 1.2079, "loss/crossentropy": 2.466684579849243, "loss/hidden": 1.046875, "loss/logits": 0.16046380996704102, "loss/reg": 6.108790694270283e-05, "step": 216 }, { "epoch": 0.027125, "grad_norm": 2.554863929748535, "grad_norm_var": 0.2767468455530223, "learning_rate": 0.0001, "loss": 1.1988, "loss/crossentropy": 2.582035541534424, "loss/hidden": 1.046875, "loss/logits": 0.15130122005939484, "loss/reg": 6.1076192650944e-05, "step": 217 }, { "epoch": 0.02725, "grad_norm": 2.7898809909820557, "grad_norm_var": 0.26145832144768877, "learning_rate": 0.0001, "loss": 1.6592, "loss/crossentropy": 2.655186414718628, "loss/hidden": 1.3984375, "loss/logits": 0.26013702154159546, "loss/reg": 6.107001536292955e-05, "step": 218 }, { "epoch": 0.027375, "grad_norm": 2.7881548404693604, "grad_norm_var": 0.2378165583524293, "learning_rate": 0.0001, "loss": 1.5451, "loss/crossentropy": 2.4413743019104004, "loss/hidden": 1.3203125, "loss/logits": 0.2241469919681549, "loss/reg": 6.106249202275649e-05, "step": 219 }, { "epoch": 0.0275, "grad_norm": 2.2896728515625, "grad_norm_var": 0.24781162791184835, "learning_rate": 0.0001, "loss": 1.2198, "loss/crossentropy": 2.4421772956848145, "loss/hidden": 1.0703125, "loss/logits": 0.14890027046203613, "loss/reg": 6.105640932219103e-05, "step": 220 }, { "epoch": 0.027625, "grad_norm": 2.324869155883789, "grad_norm_var": 0.23120432182346703, "learning_rate": 0.0001, "loss": 1.3402, "loss/crossentropy": 2.526216745376587, "loss/hidden": 1.140625, "loss/logits": 0.19898337125778198, "loss/reg": 6.10438291914761e-05, "step": 221 }, { "epoch": 0.02775, "grad_norm": 2.88158917427063, "grad_norm_var": 0.22935101127255847, "learning_rate": 0.0001, "loss": 1.372, "loss/crossentropy": 2.361729621887207, "loss/hidden": 1.15625, "loss/logits": 0.21510916948318481, "loss/reg": 6.10318202234339e-05, "step": 222 }, { "epoch": 0.027875, "grad_norm": 2.9760019779205322, "grad_norm_var": 0.20104925696453316, "learning_rate": 0.0001, "loss": 1.2925, "loss/crossentropy": 2.5573909282684326, "loss/hidden": 1.1171875, "loss/logits": 0.1747477501630783, "loss/reg": 6.1027145420666784e-05, "step": 223 }, { "epoch": 0.028, "grad_norm": 2.702091932296753, "grad_norm_var": 0.19763696198550798, "learning_rate": 0.0001, "loss": 1.3524, "loss/crossentropy": 2.717195510864258, "loss/hidden": 1.15625, "loss/logits": 0.19553202390670776, "loss/reg": 6.1014961829641834e-05, "step": 224 }, { "epoch": 0.028125, "grad_norm": 2.1232945919036865, "grad_norm_var": 0.21708226542899425, "learning_rate": 0.0001, "loss": 1.2661, "loss/crossentropy": 2.4481968879699707, "loss/hidden": 1.0859375, "loss/logits": 0.1795472800731659, "loss/reg": 6.100164682720788e-05, "step": 225 }, { "epoch": 0.02825, "grad_norm": 2.191066026687622, "grad_norm_var": 0.2114830183011783, "learning_rate": 0.0001, "loss": 1.1895, "loss/crossentropy": 2.34470534324646, "loss/hidden": 1.03125, "loss/logits": 0.15763415396213531, "loss/reg": 6.099118763813749e-05, "step": 226 }, { "epoch": 0.028375, "grad_norm": 2.3068013191223145, "grad_norm_var": 0.21765702233228598, "learning_rate": 0.0001, "loss": 1.539, "loss/crossentropy": 2.5549845695495605, "loss/hidden": 1.328125, "loss/logits": 0.21025767922401428, "loss/reg": 6.09817034273874e-05, "step": 227 }, { "epoch": 0.0285, "grad_norm": 2.890655279159546, "grad_norm_var": 0.221304562186567, "learning_rate": 0.0001, "loss": 1.5638, "loss/crossentropy": 2.2339606285095215, "loss/hidden": 1.34375, "loss/logits": 0.21939440071582794, "loss/reg": 6.096933429944329e-05, "step": 228 }, { "epoch": 0.028625, "grad_norm": 2.182521343231201, "grad_norm_var": 0.2349577927735633, "learning_rate": 0.0001, "loss": 1.2085, "loss/crossentropy": 2.641230583190918, "loss/hidden": 1.046875, "loss/logits": 0.161014586687088, "loss/reg": 6.095720891607925e-05, "step": 229 }, { "epoch": 0.02875, "grad_norm": 2.704406976699829, "grad_norm_var": 0.23499684870281476, "learning_rate": 0.0001, "loss": 1.3456, "loss/crossentropy": 2.6833486557006836, "loss/hidden": 1.15625, "loss/logits": 0.18876385688781738, "loss/reg": 6.094613127061166e-05, "step": 230 }, { "epoch": 0.028875, "grad_norm": 3.4925310611724854, "grad_norm_var": 0.13802667852219105, "learning_rate": 0.0001, "loss": 1.3709, "loss/crossentropy": 2.1604089736938477, "loss/hidden": 1.1953125, "loss/logits": 0.17500904202461243, "loss/reg": 6.093499541748315e-05, "step": 231 }, { "epoch": 0.029, "grad_norm": 2.344773530960083, "grad_norm_var": 0.13921650701028032, "learning_rate": 0.0001, "loss": 1.4725, "loss/crossentropy": 2.493307113647461, "loss/hidden": 1.25, "loss/logits": 0.22193682193756104, "loss/reg": 6.092391777201556e-05, "step": 232 }, { "epoch": 0.029125, "grad_norm": 1.8828089237213135, "grad_norm_var": 0.17117140448626647, "learning_rate": 0.0001, "loss": 1.1104, "loss/crossentropy": 2.5302743911743164, "loss/hidden": 0.9765625, "loss/logits": 0.1331850290298462, "loss/reg": 6.0912472690688446e-05, "step": 233 }, { "epoch": 0.02925, "grad_norm": 2.747770071029663, "grad_norm_var": 0.16996031408720758, "learning_rate": 0.0001, "loss": 1.1371, "loss/crossentropy": 2.4189980030059814, "loss/hidden": 0.99609375, "loss/logits": 0.14035619795322418, "loss/reg": 6.089695307309739e-05, "step": 234 }, { "epoch": 0.029375, "grad_norm": 1.8742481470108032, "grad_norm_var": 0.1933626604088189, "learning_rate": 0.0001, "loss": 1.1601, "loss/crossentropy": 2.2694003582000732, "loss/hidden": 1.015625, "loss/logits": 0.14385350048542023, "loss/reg": 6.088387090130709e-05, "step": 235 }, { "epoch": 0.0295, "grad_norm": 2.0313689708709717, "grad_norm_var": 0.20459374724346724, "learning_rate": 0.0001, "loss": 1.2446, "loss/crossentropy": 2.4902865886688232, "loss/hidden": 1.0703125, "loss/logits": 0.17369529604911804, "loss/reg": 6.086897337809205e-05, "step": 236 }, { "epoch": 0.029625, "grad_norm": 2.3882880210876465, "grad_norm_var": 0.20354561810974156, "learning_rate": 0.0001, "loss": 1.3947, "loss/crossentropy": 2.4032340049743652, "loss/hidden": 1.1875, "loss/logits": 0.20656049251556396, "loss/reg": 6.085408676881343e-05, "step": 237 }, { "epoch": 0.02975, "grad_norm": 1.7327938079833984, "grad_norm_var": 0.22490130088653987, "learning_rate": 0.0001, "loss": 1.1777, "loss/crossentropy": 2.4949777126312256, "loss/hidden": 1.015625, "loss/logits": 0.1614799201488495, "loss/reg": 6.084307824494317e-05, "step": 238 }, { "epoch": 0.029875, "grad_norm": 2.2483370304107666, "grad_norm_var": 0.20314943964483845, "learning_rate": 0.0001, "loss": 1.331, "loss/crossentropy": 2.5907418727874756, "loss/hidden": 1.1328125, "loss/logits": 0.19753864407539368, "loss/reg": 6.0828475398011506e-05, "step": 239 }, { "epoch": 0.03, "grad_norm": 2.5151193141937256, "grad_norm_var": 0.19693662117647784, "learning_rate": 0.0001, "loss": 1.2278, "loss/crossentropy": 2.6233856678009033, "loss/hidden": 1.0546875, "loss/logits": 0.1725194901227951, "loss/reg": 6.0820282669737935e-05, "step": 240 }, { "epoch": 0.030125, "grad_norm": 2.198249101638794, "grad_norm_var": 0.19498660957211478, "learning_rate": 0.0001, "loss": 1.1441, "loss/crossentropy": 2.368884563446045, "loss/hidden": 0.99609375, "loss/logits": 0.1473642736673355, "loss/reg": 6.0812566516688094e-05, "step": 241 }, { "epoch": 0.03025, "grad_norm": 2.195218563079834, "grad_norm_var": 0.1948951313244331, "learning_rate": 0.0001, "loss": 1.2993, "loss/crossentropy": 2.352041721343994, "loss/hidden": 1.1171875, "loss/logits": 0.1815069168806076, "loss/reg": 6.080829552956857e-05, "step": 242 }, { "epoch": 0.030375, "grad_norm": 2.6142425537109375, "grad_norm_var": 0.19868367561009795, "learning_rate": 0.0001, "loss": 1.3644, "loss/crossentropy": 2.497286558151245, "loss/hidden": 1.1875, "loss/logits": 0.17629210650920868, "loss/reg": 6.0799306083936244e-05, "step": 243 }, { "epoch": 0.0305, "grad_norm": 2.342033624649048, "grad_norm_var": 0.1799734399041227, "learning_rate": 0.0001, "loss": 1.1311, "loss/crossentropy": 2.5182478427886963, "loss/hidden": 0.984375, "loss/logits": 0.1461625099182129, "loss/reg": 6.078776277718134e-05, "step": 244 }, { "epoch": 0.030625, "grad_norm": 2.3943874835968018, "grad_norm_var": 0.17823371257387344, "learning_rate": 0.0001, "loss": 1.1773, "loss/crossentropy": 2.575707197189331, "loss/hidden": 1.015625, "loss/logits": 0.1610667109489441, "loss/reg": 6.078143633203581e-05, "step": 245 }, { "epoch": 0.03075, "grad_norm": 2.2752902507781982, "grad_norm_var": 0.16984605758260846, "learning_rate": 0.0001, "loss": 1.3322, "loss/crossentropy": 2.228628635406494, "loss/hidden": 1.1484375, "loss/logits": 0.18314987421035767, "loss/reg": 6.077219222788699e-05, "step": 246 }, { "epoch": 0.030875, "grad_norm": 2.1779940128326416, "grad_norm_var": 0.07406002979102144, "learning_rate": 0.0001, "loss": 1.179, "loss/crossentropy": 2.4325718879699707, "loss/hidden": 1.0078125, "loss/logits": 0.17062756419181824, "loss/reg": 6.076457793824375e-05, "step": 247 }, { "epoch": 0.031, "grad_norm": 2.031386613845825, "grad_norm_var": 0.07614130749575872, "learning_rate": 0.0001, "loss": 1.3177, "loss/crossentropy": 2.3050920963287354, "loss/hidden": 1.1328125, "loss/logits": 0.18426315486431122, "loss/reg": 6.075216515455395e-05, "step": 248 }, { "epoch": 0.031125, "grad_norm": 2.4880683422088623, "grad_norm_var": 0.07117238958467732, "learning_rate": 0.0001, "loss": 1.2617, "loss/crossentropy": 2.690160036087036, "loss/hidden": 1.0625, "loss/logits": 0.1985635757446289, "loss/reg": 6.0742688219761476e-05, "step": 249 }, { "epoch": 0.03125, "grad_norm": 2.631229877471924, "grad_norm_var": 0.06453399427719399, "learning_rate": 0.0001, "loss": 1.3072, "loss/crossentropy": 2.4459030628204346, "loss/hidden": 1.109375, "loss/logits": 0.1971898078918457, "loss/reg": 6.0733007558155805e-05, "step": 250 }, { "epoch": 0.031375, "grad_norm": 2.7028048038482666, "grad_norm_var": 0.06497512863382227, "learning_rate": 0.0001, "loss": 1.3656, "loss/crossentropy": 2.7830824851989746, "loss/hidden": 1.1796875, "loss/logits": 0.18533006310462952, "loss/reg": 6.0722686612280086e-05, "step": 251 }, { "epoch": 0.0315, "grad_norm": 3.7025880813598633, "grad_norm_var": 0.17735395269518506, "learning_rate": 0.0001, "loss": 1.2542, "loss/crossentropy": 2.4722542762756348, "loss/hidden": 1.078125, "loss/logits": 0.17551761865615845, "loss/reg": 6.0708127421094105e-05, "step": 252 }, { "epoch": 0.031625, "grad_norm": 2.1496498584747314, "grad_norm_var": 0.18175923180052275, "learning_rate": 0.0001, "loss": 1.0403, "loss/crossentropy": 2.4383487701416016, "loss/hidden": 0.91015625, "loss/logits": 0.12949630618095398, "loss/reg": 6.069323717383668e-05, "step": 253 }, { "epoch": 0.03175, "grad_norm": 3.212991237640381, "grad_norm_var": 0.18702365671043306, "learning_rate": 0.0001, "loss": 1.3555, "loss/crossentropy": 2.1896352767944336, "loss/hidden": 1.1953125, "loss/logits": 0.1595323085784912, "loss/reg": 6.067836147849448e-05, "step": 254 }, { "epoch": 0.031875, "grad_norm": 2.53044056892395, "grad_norm_var": 0.18281462084492142, "learning_rate": 0.0001, "loss": 1.2462, "loss/crossentropy": 2.8005239963531494, "loss/hidden": 1.0625, "loss/logits": 0.18304814398288727, "loss/reg": 6.0668298829114065e-05, "step": 255 }, { "epoch": 0.032, "grad_norm": 5.920226573944092, "grad_norm_var": 0.9097630014084027, "learning_rate": 0.0001, "loss": 1.9011, "loss/crossentropy": 2.2827932834625244, "loss/hidden": 1.59375, "loss/logits": 0.3067648708820343, "loss/reg": 6.0657377616735175e-05, "step": 256 }, { "epoch": 0.032125, "grad_norm": 3.144649028778076, "grad_norm_var": 0.8995354429829506, "learning_rate": 0.0001, "loss": 1.2361, "loss/crossentropy": 2.9163215160369873, "loss/hidden": 1.078125, "loss/logits": 0.15732741355895996, "loss/reg": 6.064687840989791e-05, "step": 257 }, { "epoch": 0.03225, "grad_norm": 2.677065849304199, "grad_norm_var": 0.8763431299745091, "learning_rate": 0.0001, "loss": 1.3123, "loss/crossentropy": 2.9036660194396973, "loss/hidden": 1.125, "loss/logits": 0.18664765357971191, "loss/reg": 6.0635462432401255e-05, "step": 258 }, { "epoch": 0.032375, "grad_norm": 1.9815617799758911, "grad_norm_var": 0.9180593253885627, "learning_rate": 0.0001, "loss": 1.2567, "loss/crossentropy": 2.6647751331329346, "loss/hidden": 1.0703125, "loss/logits": 0.18578888475894928, "loss/reg": 6.062128159101121e-05, "step": 259 }, { "epoch": 0.0325, "grad_norm": 2.6094260215759277, "grad_norm_var": 0.9071755924568459, "learning_rate": 0.0001, "loss": 1.4176, "loss/crossentropy": 2.9915220737457275, "loss/hidden": 1.21875, "loss/logits": 0.19824379682540894, "loss/reg": 6.060625673853792e-05, "step": 260 }, { "epoch": 0.032625, "grad_norm": 2.4859585762023926, "grad_norm_var": 0.9028772625757899, "learning_rate": 0.0001, "loss": 1.2047, "loss/crossentropy": 2.325611114501953, "loss/hidden": 1.03125, "loss/logits": 0.17281952500343323, "loss/reg": 6.0591693909373134e-05, "step": 261 }, { "epoch": 0.03275, "grad_norm": 4.910043716430664, "grad_norm_var": 1.154144117287072, "learning_rate": 0.0001, "loss": 1.2858, "loss/crossentropy": 2.568098306655884, "loss/hidden": 1.109375, "loss/logits": 0.17582398653030396, "loss/reg": 6.057979408069514e-05, "step": 262 }, { "epoch": 0.032875, "grad_norm": 2.2592694759368896, "grad_norm_var": 1.1460852387432343, "learning_rate": 0.0001, "loss": 1.3156, "loss/crossentropy": 2.5264766216278076, "loss/hidden": 1.1171875, "loss/logits": 0.19776055216789246, "loss/reg": 6.056776692275889e-05, "step": 263 }, { "epoch": 0.033, "grad_norm": 2.6964571475982666, "grad_norm_var": 1.0909556269012999, "learning_rate": 0.0001, "loss": 1.0468, "loss/crossentropy": 2.740647792816162, "loss/hidden": 0.91796875, "loss/logits": 0.12825211882591248, "loss/reg": 6.0556718381121755e-05, "step": 264 }, { "epoch": 0.033125, "grad_norm": 2.112201690673828, "grad_norm_var": 1.125761935491216, "learning_rate": 0.0001, "loss": 1.2175, "loss/crossentropy": 2.475130081176758, "loss/hidden": 1.0390625, "loss/logits": 0.1778050661087036, "loss/reg": 6.0543683503055945e-05, "step": 265 }, { "epoch": 0.03325, "grad_norm": 1.8527328968048096, "grad_norm_var": 1.2001448152569836, "learning_rate": 0.0001, "loss": 1.1913, "loss/crossentropy": 2.2017788887023926, "loss/hidden": 1.0234375, "loss/logits": 0.16727614402770996, "loss/reg": 6.053145989426412e-05, "step": 266 }, { "epoch": 0.033375, "grad_norm": 2.2294929027557373, "grad_norm_var": 1.2287526925730277, "learning_rate": 0.0001, "loss": 1.3521, "loss/crossentropy": 2.268073558807373, "loss/hidden": 1.1640625, "loss/logits": 0.18739831447601318, "loss/reg": 6.052442768123001e-05, "step": 267 }, { "epoch": 0.0335, "grad_norm": 2.185410499572754, "grad_norm_var": 1.2112062552861744, "learning_rate": 0.0001, "loss": 1.44, "loss/crossentropy": 2.390622138977051, "loss/hidden": 1.234375, "loss/logits": 0.20500804483890533, "loss/reg": 6.051711898180656e-05, "step": 268 }, { "epoch": 0.033625, "grad_norm": 2.616452693939209, "grad_norm_var": 1.1837342905938153, "learning_rate": 0.0001, "loss": 1.3338, "loss/crossentropy": 2.3374340534210205, "loss/hidden": 1.15625, "loss/logits": 0.17693625390529633, "loss/reg": 6.0506343288579956e-05, "step": 269 }, { "epoch": 0.03375, "grad_norm": 2.5214874744415283, "grad_norm_var": 1.1791403953024882, "learning_rate": 0.0001, "loss": 1.4572, "loss/crossentropy": 2.6334807872772217, "loss/hidden": 1.25, "loss/logits": 0.20655225217342377, "loss/reg": 6.0493421187857166e-05, "step": 270 }, { "epoch": 0.033875, "grad_norm": 2.3426766395568848, "grad_norm_var": 1.18798729537596, "learning_rate": 0.0001, "loss": 1.2858, "loss/crossentropy": 2.362666130065918, "loss/hidden": 1.1171875, "loss/logits": 0.16799038648605347, "loss/reg": 6.047951683285646e-05, "step": 271 }, { "epoch": 0.034, "grad_norm": 2.483227491378784, "grad_norm_var": 0.4891016266434789, "learning_rate": 0.0001, "loss": 1.4126, "loss/crossentropy": 2.6330323219299316, "loss/hidden": 1.203125, "loss/logits": 0.20882482826709747, "loss/reg": 6.046749331289902e-05, "step": 272 }, { "epoch": 0.034125, "grad_norm": 3.3453869819641113, "grad_norm_var": 0.5070205087741229, "learning_rate": 0.0001, "loss": 1.3731, "loss/crossentropy": 2.6637308597564697, "loss/hidden": 1.171875, "loss/logits": 0.20059773325920105, "loss/reg": 6.0458773077698424e-05, "step": 273 }, { "epoch": 0.03425, "grad_norm": 2.2971482276916504, "grad_norm_var": 0.5112160036914843, "learning_rate": 0.0001, "loss": 1.3516, "loss/crossentropy": 2.400428533554077, "loss/hidden": 1.1640625, "loss/logits": 0.18688717484474182, "loss/reg": 6.0452930483734235e-05, "step": 274 }, { "epoch": 0.034375, "grad_norm": 11.117164611816406, "grad_norm_var": 5.025199240890341, "learning_rate": 0.0001, "loss": 2.1956, "loss/crossentropy": 2.7653286457061768, "loss/hidden": 1.8984375, "loss/logits": 0.2965186834335327, "loss/reg": 6.045090049155988e-05, "step": 275 }, { "epoch": 0.0345, "grad_norm": 3.6517550945281982, "grad_norm_var": 5.020888752799834, "learning_rate": 0.0001, "loss": 1.4104, "loss/crossentropy": 2.8897998332977295, "loss/hidden": 1.1484375, "loss/logits": 0.26139265298843384, "loss/reg": 6.0451366152847186e-05, "step": 276 }, { "epoch": 0.034625, "grad_norm": 2.6342201232910156, "grad_norm_var": 5.008262345647254, "learning_rate": 0.0001, "loss": 1.272, "loss/crossentropy": 2.662801504135132, "loss/hidden": 1.09375, "loss/logits": 0.17764705419540405, "loss/reg": 6.0443973779911175e-05, "step": 277 }, { "epoch": 0.03475, "grad_norm": 2.613866090774536, "grad_norm_var": 4.815302301096653, "learning_rate": 0.0001, "loss": 1.3, "loss/crossentropy": 2.2599401473999023, "loss/hidden": 1.125, "loss/logits": 0.1744215488433838, "loss/reg": 6.04407032369636e-05, "step": 278 }, { "epoch": 0.034875, "grad_norm": 2.4121639728546143, "grad_norm_var": 4.800441045565859, "learning_rate": 0.0001, "loss": 1.2736, "loss/crossentropy": 2.3868885040283203, "loss/hidden": 1.109375, "loss/logits": 0.16360533237457275, "loss/reg": 6.0438182117650285e-05, "step": 279 }, { "epoch": 0.035, "grad_norm": 2.257427930831909, "grad_norm_var": 4.834324037466968, "learning_rate": 0.0001, "loss": 1.3236, "loss/crossentropy": 2.452359914779663, "loss/hidden": 1.1328125, "loss/logits": 0.19017404317855835, "loss/reg": 6.043619578122161e-05, "step": 280 }, { "epoch": 0.035125, "grad_norm": 2.3916571140289307, "grad_norm_var": 4.8045581397439525, "learning_rate": 0.0001, "loss": 1.3161, "loss/crossentropy": 2.4834201335906982, "loss/hidden": 1.109375, "loss/logits": 0.20611721277236938, "loss/reg": 6.043669054633938e-05, "step": 281 }, { "epoch": 0.03525, "grad_norm": 2.815398931503296, "grad_norm_var": 4.707581175884913, "learning_rate": 0.0001, "loss": 1.1312, "loss/crossentropy": 3.0801713466644287, "loss/hidden": 0.98828125, "loss/logits": 0.14229975640773773, "loss/reg": 6.044648034730926e-05, "step": 282 }, { "epoch": 0.035375, "grad_norm": 3.1715469360351562, "grad_norm_var": 4.651233430019207, "learning_rate": 0.0001, "loss": 1.409, "loss/crossentropy": 2.354785919189453, "loss/hidden": 1.1953125, "loss/logits": 0.21305763721466064, "loss/reg": 6.0437832871684805e-05, "step": 283 }, { "epoch": 0.0355, "grad_norm": 2.5010037422180176, "grad_norm_var": 4.615667456235268, "learning_rate": 0.0001, "loss": 1.3572, "loss/crossentropy": 2.492047071456909, "loss/hidden": 1.1640625, "loss/logits": 0.1925477683544159, "loss/reg": 6.044709516572766e-05, "step": 284 }, { "epoch": 0.035625, "grad_norm": 1.964429259300232, "grad_norm_var": 4.6928209367171645, "learning_rate": 0.0001, "loss": 1.1671, "loss/crossentropy": 2.3351125717163086, "loss/hidden": 0.99609375, "loss/logits": 0.1704423427581787, "loss/reg": 6.0453679907368496e-05, "step": 285 }, { "epoch": 0.03575, "grad_norm": 2.3656678199768066, "grad_norm_var": 4.707552916907375, "learning_rate": 0.0001, "loss": 1.5385, "loss/crossentropy": 2.4216158390045166, "loss/hidden": 1.28125, "loss/logits": 0.2566841244697571, "loss/reg": 6.0443537222454324e-05, "step": 286 }, { "epoch": 0.035875, "grad_norm": 3.140928030014038, "grad_norm_var": 4.661686527481659, "learning_rate": 0.0001, "loss": 1.3637, "loss/crossentropy": 2.8347983360290527, "loss/hidden": 1.15625, "loss/logits": 0.20682096481323242, "loss/reg": 6.043089888407849e-05, "step": 287 }, { "epoch": 0.036, "grad_norm": 2.6460797786712646, "grad_norm_var": 4.647830565858565, "learning_rate": 0.0001, "loss": 1.3928, "loss/crossentropy": 2.108215093612671, "loss/hidden": 1.2109375, "loss/logits": 0.18129181861877441, "loss/reg": 6.042820677976124e-05, "step": 288 }, { "epoch": 0.036125, "grad_norm": 2.879531145095825, "grad_norm_var": 4.652852381956769, "learning_rate": 0.0001, "loss": 1.4359, "loss/crossentropy": 2.90163516998291, "loss/hidden": 1.25, "loss/logits": 0.1853410005569458, "loss/reg": 6.042792301741429e-05, "step": 289 }, { "epoch": 0.03625, "grad_norm": 2.5701370239257812, "grad_norm_var": 4.625421100051376, "learning_rate": 0.0001, "loss": 1.3639, "loss/crossentropy": 2.6896326541900635, "loss/hidden": 1.15625, "loss/logits": 0.2070741057395935, "loss/reg": 6.0414979088818654e-05, "step": 290 }, { "epoch": 0.036375, "grad_norm": 2.988196849822998, "grad_norm_var": 0.16977142791367086, "learning_rate": 0.0001, "loss": 1.103, "loss/crossentropy": 2.8485705852508545, "loss/hidden": 0.96875, "loss/logits": 0.13362044095993042, "loss/reg": 6.0413527535274625e-05, "step": 291 }, { "epoch": 0.0365, "grad_norm": 5.9153923988342285, "grad_norm_var": 0.7809789933836029, "learning_rate": 0.0001, "loss": 1.6292, "loss/crossentropy": 2.607590436935425, "loss/hidden": 1.4375, "loss/logits": 0.19109681248664856, "loss/reg": 6.041422238922678e-05, "step": 292 }, { "epoch": 0.036625, "grad_norm": 1.932381510734558, "grad_norm_var": 0.8300136192923785, "learning_rate": 0.0001, "loss": 1.1314, "loss/crossentropy": 2.2319207191467285, "loss/hidden": 0.9921875, "loss/logits": 0.13856041431427002, "loss/reg": 6.041810775059275e-05, "step": 293 }, { "epoch": 0.03675, "grad_norm": 2.1218042373657227, "grad_norm_var": 0.8563980373093443, "learning_rate": 0.0001, "loss": 1.1898, "loss/crossentropy": 2.7033910751342773, "loss/hidden": 1.03125, "loss/logits": 0.15791726112365723, "loss/reg": 6.0404745454434305e-05, "step": 294 }, { "epoch": 0.036875, "grad_norm": 3.239748954772949, "grad_norm_var": 0.8614170936653748, "learning_rate": 0.0001, "loss": 1.6186, "loss/crossentropy": 2.3478281497955322, "loss/hidden": 1.3671875, "loss/logits": 0.2507687509059906, "loss/reg": 6.039286745362915e-05, "step": 295 }, { "epoch": 0.037, "grad_norm": 2.361431121826172, "grad_norm_var": 0.8544814148079373, "learning_rate": 0.0001, "loss": 1.2822, "loss/crossentropy": 2.4396111965179443, "loss/hidden": 1.0859375, "loss/logits": 0.19569119811058044, "loss/reg": 6.0390335420379415e-05, "step": 296 }, { "epoch": 0.037125, "grad_norm": 2.6921112537384033, "grad_norm_var": 0.8432509023111928, "learning_rate": 0.0001, "loss": 1.3584, "loss/crossentropy": 2.3235762119293213, "loss/hidden": 1.15625, "loss/logits": 0.20157676935195923, "loss/reg": 6.037576531525701e-05, "step": 297 }, { "epoch": 0.03725, "grad_norm": 2.2376601696014404, "grad_norm_var": 0.8653611900667765, "learning_rate": 0.0001, "loss": 1.3703, "loss/crossentropy": 2.441978693008423, "loss/hidden": 1.1875, "loss/logits": 0.1821848303079605, "loss/reg": 6.036146805854514e-05, "step": 298 }, { "epoch": 0.037375, "grad_norm": 2.5022082328796387, "grad_norm_var": 0.8598019948407729, "learning_rate": 0.0001, "loss": 1.2909, "loss/crossentropy": 2.4099972248077393, "loss/hidden": 1.09375, "loss/logits": 0.1965959370136261, "loss/reg": 6.035445403540507e-05, "step": 299 }, { "epoch": 0.0375, "grad_norm": 2.323599338531494, "grad_norm_var": 0.8677455500426021, "learning_rate": 0.0001, "loss": 1.2301, "loss/crossentropy": 2.714334011077881, "loss/hidden": 1.0625, "loss/logits": 0.16703477501869202, "loss/reg": 6.034153193468228e-05, "step": 300 }, { "epoch": 0.037625, "grad_norm": 2.902794361114502, "grad_norm_var": 0.8254198045813945, "learning_rate": 0.0001, "loss": 1.287, "loss/crossentropy": 2.5897319316864014, "loss/hidden": 1.1171875, "loss/logits": 0.1692187488079071, "loss/reg": 6.032464443705976e-05, "step": 301 }, { "epoch": 0.03775, "grad_norm": 2.455423355102539, "grad_norm_var": 0.8207107650276014, "learning_rate": 0.0001, "loss": 1.3118, "loss/crossentropy": 2.2553625106811523, "loss/hidden": 1.15625, "loss/logits": 0.15494795143604279, "loss/reg": 6.031416342011653e-05, "step": 302 }, { "epoch": 0.037875, "grad_norm": 2.70770001411438, "grad_norm_var": 0.8131429553718594, "learning_rate": 0.0001, "loss": 1.3645, "loss/crossentropy": 2.298628807067871, "loss/hidden": 1.1875, "loss/logits": 0.17642799019813538, "loss/reg": 6.029937867424451e-05, "step": 303 }, { "epoch": 0.038, "grad_norm": 2.4096872806549072, "grad_norm_var": 0.8208490888498592, "learning_rate": 0.0001, "loss": 1.2573, "loss/crossentropy": 2.6787161827087402, "loss/hidden": 1.078125, "loss/logits": 0.17861339449882507, "loss/reg": 6.027881318004802e-05, "step": 304 }, { "epoch": 0.038125, "grad_norm": 2.364800214767456, "grad_norm_var": 0.8295471446711137, "learning_rate": 0.0001, "loss": 1.3251, "loss/crossentropy": 2.351970911026001, "loss/hidden": 1.140625, "loss/logits": 0.18391045928001404, "loss/reg": 6.026409027981572e-05, "step": 305 }, { "epoch": 0.03825, "grad_norm": 2.0991923809051514, "grad_norm_var": 0.8536240669336511, "learning_rate": 0.0001, "loss": 1.078, "loss/crossentropy": 2.7187068462371826, "loss/hidden": 0.9453125, "loss/logits": 0.13205038011074066, "loss/reg": 6.0248257796047255e-05, "step": 306 }, { "epoch": 0.038375, "grad_norm": 2.7471582889556885, "grad_norm_var": 0.8481018158238611, "learning_rate": 0.0001, "loss": 1.4035, "loss/crossentropy": 2.1265523433685303, "loss/hidden": 1.21875, "loss/logits": 0.18416792154312134, "loss/reg": 6.0230733652133495e-05, "step": 307 }, { "epoch": 0.0385, "grad_norm": 2.2592687606811523, "grad_norm_var": 0.11041007633642194, "learning_rate": 0.0001, "loss": 1.271, "loss/crossentropy": 2.66719651222229, "loss/hidden": 1.0859375, "loss/logits": 0.184452086687088, "loss/reg": 6.0217109421500936e-05, "step": 308 }, { "epoch": 0.038625, "grad_norm": 2.2400615215301514, "grad_norm_var": 0.09468951175299385, "learning_rate": 0.0001, "loss": 1.2348, "loss/crossentropy": 2.3710193634033203, "loss/hidden": 1.0625, "loss/logits": 0.1717246174812317, "loss/reg": 6.020214277668856e-05, "step": 309 }, { "epoch": 0.03875, "grad_norm": 2.0783209800720215, "grad_norm_var": 0.09687885973874776, "learning_rate": 0.0001, "loss": 1.2085, "loss/crossentropy": 2.2699692249298096, "loss/hidden": 1.03125, "loss/logits": 0.17665645480155945, "loss/reg": 6.018438944010995e-05, "step": 310 }, { "epoch": 0.038875, "grad_norm": 2.077648162841797, "grad_norm_var": 0.06299334570375853, "learning_rate": 0.0001, "loss": 1.2169, "loss/crossentropy": 2.334127426147461, "loss/hidden": 1.0625, "loss/logits": 0.15378312766551971, "loss/reg": 6.0161146393511444e-05, "step": 311 }, { "epoch": 0.039, "grad_norm": 2.440629482269287, "grad_norm_var": 0.06293910816862744, "learning_rate": 0.0001, "loss": 1.2956, "loss/crossentropy": 2.791874408721924, "loss/hidden": 1.1171875, "loss/logits": 0.1777758002281189, "loss/reg": 6.014638711349107e-05, "step": 312 }, { "epoch": 0.039125, "grad_norm": 2.853940963745117, "grad_norm_var": 0.07069242228717272, "learning_rate": 0.0001, "loss": 1.2688, "loss/crossentropy": 2.5036516189575195, "loss/hidden": 1.0859375, "loss/logits": 0.18226328492164612, "loss/reg": 6.013087840983644e-05, "step": 313 }, { "epoch": 0.03925, "grad_norm": 3.287529230117798, "grad_norm_var": 0.11423125477930943, "learning_rate": 0.0001, "loss": 1.2435, "loss/crossentropy": 2.696265697479248, "loss/hidden": 1.0703125, "loss/logits": 0.17254707217216492, "loss/reg": 6.011854929965921e-05, "step": 314 }, { "epoch": 0.039375, "grad_norm": 3.1080963611602783, "grad_norm_var": 0.1386158794861321, "learning_rate": 0.0001, "loss": 1.473, "loss/crossentropy": 2.1882760524749756, "loss/hidden": 1.25, "loss/logits": 0.2224160134792328, "loss/reg": 6.0103353462181985e-05, "step": 315 }, { "epoch": 0.0395, "grad_norm": 2.7303977012634277, "grad_norm_var": 0.13818442385569654, "learning_rate": 0.0001, "loss": 1.4029, "loss/crossentropy": 2.361660957336426, "loss/hidden": 1.2109375, "loss/logits": 0.19139324128627777, "loss/reg": 6.008424679748714e-05, "step": 316 }, { "epoch": 0.039625, "grad_norm": 1.7651097774505615, "grad_norm_var": 0.16520987140884788, "learning_rate": 0.0001, "loss": 1.0765, "loss/crossentropy": 2.435858964920044, "loss/hidden": 0.953125, "loss/logits": 0.1227254569530487, "loss/reg": 6.007165211485699e-05, "step": 317 }, { "epoch": 0.03975, "grad_norm": 2.128772258758545, "grad_norm_var": 0.17279926669385734, "learning_rate": 0.0001, "loss": 1.1848, "loss/crossentropy": 2.334495782852173, "loss/hidden": 1.0546875, "loss/logits": 0.12953956425189972, "loss/reg": 6.005321120028384e-05, "step": 318 }, { "epoch": 0.039875, "grad_norm": 2.1308538913726807, "grad_norm_var": 0.1742483958439737, "learning_rate": 0.0001, "loss": 1.3191, "loss/crossentropy": 2.3873021602630615, "loss/hidden": 1.125, "loss/logits": 0.19348952174186707, "loss/reg": 6.0041034885216504e-05, "step": 319 }, { "epoch": 0.04, "grad_norm": 2.706742286682129, "grad_norm_var": 0.17935140917835876, "learning_rate": 0.0001, "loss": 1.4123, "loss/crossentropy": 2.5321033000946045, "loss/hidden": 1.203125, "loss/logits": 0.20852993428707123, "loss/reg": 6.002993177389726e-05, "step": 320 }, { "epoch": 0.040125, "grad_norm": 6.118154525756836, "grad_norm_var": 1.0228689502418715, "learning_rate": 0.0001, "loss": 1.7298, "loss/crossentropy": 2.457045316696167, "loss/hidden": 1.515625, "loss/logits": 0.2136228382587433, "loss/reg": 6.001694418955594e-05, "step": 321 }, { "epoch": 0.04025, "grad_norm": 3.091947317123413, "grad_norm_var": 1.0084811477178388, "learning_rate": 0.0001, "loss": 1.6635, "loss/crossentropy": 2.6943020820617676, "loss/hidden": 1.40625, "loss/logits": 0.25662127137184143, "loss/reg": 6.0004946135450155e-05, "step": 322 }, { "epoch": 0.040375, "grad_norm": 2.488391637802124, "grad_norm_var": 1.0122566583255546, "learning_rate": 0.0001, "loss": 1.2065, "loss/crossentropy": 2.646897792816162, "loss/hidden": 1.0546875, "loss/logits": 0.15123483538627625, "loss/reg": 5.9991711168549955e-05, "step": 323 }, { "epoch": 0.0405, "grad_norm": 3.0675456523895264, "grad_norm_var": 1.0035307165437406, "learning_rate": 0.0001, "loss": 1.4832, "loss/crossentropy": 2.4176406860351562, "loss/hidden": 1.2421875, "loss/logits": 0.24040505290031433, "loss/reg": 5.9981128288200125e-05, "step": 324 }, { "epoch": 0.040625, "grad_norm": 2.424546957015991, "grad_norm_var": 0.9926314451715664, "learning_rate": 0.0001, "loss": 1.07, "loss/crossentropy": 2.703134059906006, "loss/hidden": 0.94140625, "loss/logits": 0.12803316116333008, "loss/reg": 5.997138941893354e-05, "step": 325 }, { "epoch": 0.04075, "grad_norm": 2.9345507621765137, "grad_norm_var": 0.9582126623175621, "learning_rate": 0.0001, "loss": 1.4247, "loss/crossentropy": 2.8940789699554443, "loss/hidden": 1.21875, "loss/logits": 0.20539763569831848, "loss/reg": 5.996019172016531e-05, "step": 326 }, { "epoch": 0.040875, "grad_norm": 3.069572925567627, "grad_norm_var": 0.9195850402401864, "learning_rate": 0.0001, "loss": 1.3896, "loss/crossentropy": 2.4416871070861816, "loss/hidden": 1.1875, "loss/logits": 0.20154833793640137, "loss/reg": 5.9947429690510035e-05, "step": 327 }, { "epoch": 0.041, "grad_norm": 2.323606491088867, "grad_norm_var": 0.9275566292830253, "learning_rate": 0.0001, "loss": 1.2888, "loss/crossentropy": 2.811528444290161, "loss/hidden": 1.1015625, "loss/logits": 0.18662354350090027, "loss/reg": 5.9936231991741806e-05, "step": 328 }, { "epoch": 0.041125, "grad_norm": 3.1679723262786865, "grad_norm_var": 0.9322370885273564, "learning_rate": 0.0001, "loss": 1.5559, "loss/crossentropy": 2.3170981407165527, "loss/hidden": 1.3046875, "loss/logits": 0.2506353557109833, "loss/reg": 5.991987563902512e-05, "step": 329 }, { "epoch": 0.04125, "grad_norm": 2.7683303356170654, "grad_norm_var": 0.9228798875820224, "learning_rate": 0.0001, "loss": 1.3127, "loss/crossentropy": 2.51680850982666, "loss/hidden": 1.1171875, "loss/logits": 0.1949077993631363, "loss/reg": 5.990756835672073e-05, "step": 330 }, { "epoch": 0.041375, "grad_norm": 2.4825031757354736, "grad_norm_var": 0.9280253827718864, "learning_rate": 0.0001, "loss": 1.3408, "loss/crossentropy": 2.605055332183838, "loss/hidden": 1.140625, "loss/logits": 0.19955970346927643, "loss/reg": 5.989522469462827e-05, "step": 331 }, { "epoch": 0.0415, "grad_norm": 3.2399041652679443, "grad_norm_var": 0.9369785308922095, "learning_rate": 0.0001, "loss": 1.5753, "loss/crossentropy": 2.7269279956817627, "loss/hidden": 1.3515625, "loss/logits": 0.22315430641174316, "loss/reg": 5.988113844068721e-05, "step": 332 }, { "epoch": 0.041625, "grad_norm": 2.8936927318573, "grad_norm_var": 0.8504314928241191, "learning_rate": 0.0001, "loss": 1.3222, "loss/crossentropy": 2.812412738800049, "loss/hidden": 1.140625, "loss/logits": 0.1809367835521698, "loss/reg": 5.9867059462703764e-05, "step": 333 }, { "epoch": 0.04175, "grad_norm": 2.432213068008423, "grad_norm_var": 0.8233723477256942, "learning_rate": 0.0001, "loss": 1.4094, "loss/crossentropy": 2.6377694606781006, "loss/hidden": 1.203125, "loss/logits": 0.20563456416130066, "loss/reg": 5.9853711718460545e-05, "step": 334 }, { "epoch": 0.041875, "grad_norm": 2.422299861907959, "grad_norm_var": 0.7965082638815336, "learning_rate": 0.0001, "loss": 1.2328, "loss/crossentropy": 2.5352189540863037, "loss/hidden": 1.078125, "loss/logits": 0.15405428409576416, "loss/reg": 5.984482049825601e-05, "step": 335 }, { "epoch": 0.042, "grad_norm": 2.703420877456665, "grad_norm_var": 0.7966286375145801, "learning_rate": 0.0001, "loss": 1.2981, "loss/crossentropy": 2.525949716567993, "loss/hidden": 1.1015625, "loss/logits": 0.1959662139415741, "loss/reg": 5.983649680274539e-05, "step": 336 }, { "epoch": 0.042125, "grad_norm": 3.625760078430176, "grad_norm_var": 0.14094485019601447, "learning_rate": 0.0001, "loss": 1.6517, "loss/crossentropy": 1.9824917316436768, "loss/hidden": 1.3828125, "loss/logits": 0.2682979702949524, "loss/reg": 5.9825455537065864e-05, "step": 337 }, { "epoch": 0.04225, "grad_norm": 2.2066762447357178, "grad_norm_var": 0.1579467344221198, "learning_rate": 0.0001, "loss": 1.1768, "loss/crossentropy": 2.5151102542877197, "loss/hidden": 1.0078125, "loss/logits": 0.16843904554843903, "loss/reg": 5.981199865345843e-05, "step": 338 }, { "epoch": 0.042375, "grad_norm": 2.961968421936035, "grad_norm_var": 0.15445451920782696, "learning_rate": 0.0001, "loss": 1.5446, "loss/crossentropy": 2.397102117538452, "loss/hidden": 1.3046875, "loss/logits": 0.23934724926948547, "loss/reg": 5.979971319902688e-05, "step": 339 }, { "epoch": 0.0425, "grad_norm": 2.4696779251098633, "grad_norm_var": 0.15509145555751214, "learning_rate": 0.0001, "loss": 1.2907, "loss/crossentropy": 2.518648624420166, "loss/hidden": 1.1171875, "loss/logits": 0.17289261519908905, "loss/reg": 5.979237175779417e-05, "step": 340 }, { "epoch": 0.042625, "grad_norm": 2.2886741161346436, "grad_norm_var": 0.16228478040589658, "learning_rate": 0.0001, "loss": 1.2915, "loss/crossentropy": 2.4755570888519287, "loss/hidden": 1.109375, "loss/logits": 0.18152545392513275, "loss/reg": 5.978640547255054e-05, "step": 341 }, { "epoch": 0.04275, "grad_norm": 2.4154622554779053, "grad_norm_var": 0.16631279956205466, "learning_rate": 0.0001, "loss": 1.1361, "loss/crossentropy": 2.620903730392456, "loss/hidden": 0.9921875, "loss/logits": 0.1432739496231079, "loss/reg": 5.977362161502242e-05, "step": 342 }, { "epoch": 0.042875, "grad_norm": 3.9107778072357178, "grad_norm_var": 0.25008606934497735, "learning_rate": 0.0001, "loss": 1.6206, "loss/crossentropy": 3.3820858001708984, "loss/hidden": 1.40625, "loss/logits": 0.21375682950019836, "loss/reg": 5.976331885904074e-05, "step": 343 }, { "epoch": 0.043, "grad_norm": 2.2201833724975586, "grad_norm_var": 0.25690416036597197, "learning_rate": 0.0001, "loss": 1.2446, "loss/crossentropy": 2.467216730117798, "loss/hidden": 1.0625, "loss/logits": 0.18146604299545288, "loss/reg": 5.975304884486832e-05, "step": 344 }, { "epoch": 0.043125, "grad_norm": 2.1915907859802246, "grad_norm_var": 0.26377805805320803, "learning_rate": 0.0001, "loss": 1.4337, "loss/crossentropy": 2.3638522624969482, "loss/hidden": 1.203125, "loss/logits": 0.22996577620506287, "loss/reg": 5.974585292278789e-05, "step": 345 }, { "epoch": 0.04325, "grad_norm": 2.2508416175842285, "grad_norm_var": 0.27594342104869135, "learning_rate": 0.0001, "loss": 1.1804, "loss/crossentropy": 2.5332260131835938, "loss/hidden": 1.0234375, "loss/logits": 0.15640094876289368, "loss/reg": 5.973771112621762e-05, "step": 346 }, { "epoch": 0.043375, "grad_norm": 2.0090150833129883, "grad_norm_var": 0.30177518099136, "learning_rate": 0.0001, "loss": 1.2994, "loss/crossentropy": 2.511950731277466, "loss/hidden": 1.125, "loss/logits": 0.17384442687034607, "loss/reg": 5.9728798078140244e-05, "step": 347 }, { "epoch": 0.0435, "grad_norm": 2.7306134700775146, "grad_norm_var": 0.277258656834267, "learning_rate": 0.0001, "loss": 1.4199, "loss/crossentropy": 2.6389760971069336, "loss/hidden": 1.1875, "loss/logits": 0.2318291962146759, "loss/reg": 5.9719615819631144e-05, "step": 348 }, { "epoch": 0.043625, "grad_norm": 2.270148515701294, "grad_norm_var": 0.27783213891549774, "learning_rate": 0.0001, "loss": 1.5484, "loss/crossentropy": 2.312831163406372, "loss/hidden": 1.2890625, "loss/logits": 0.2587454915046692, "loss/reg": 5.970869824523106e-05, "step": 349 }, { "epoch": 0.04375, "grad_norm": 2.0988070964813232, "grad_norm_var": 0.2908751450016543, "learning_rate": 0.0001, "loss": 1.2681, "loss/crossentropy": 2.378908634185791, "loss/hidden": 1.109375, "loss/logits": 0.1581004559993744, "loss/reg": 5.970033089397475e-05, "step": 350 }, { "epoch": 0.043875, "grad_norm": 2.045546770095825, "grad_norm_var": 0.30608582246859417, "learning_rate": 0.0001, "loss": 1.1063, "loss/crossentropy": 2.4011952877044678, "loss/hidden": 0.96875, "loss/logits": 0.13691341876983643, "loss/reg": 5.969877020106651e-05, "step": 351 }, { "epoch": 0.044, "grad_norm": 2.9582409858703613, "grad_norm_var": 0.316207957574548, "learning_rate": 0.0001, "loss": 1.2072, "loss/crossentropy": 2.643101215362549, "loss/hidden": 1.046875, "loss/logits": 0.15975871682167053, "loss/reg": 5.9694295487133786e-05, "step": 352 }, { "epoch": 0.044125, "grad_norm": 2.125020742416382, "grad_norm_var": 0.23988746234485703, "learning_rate": 0.0001, "loss": 1.2268, "loss/crossentropy": 2.5923550128936768, "loss/hidden": 1.0390625, "loss/logits": 0.18714120984077454, "loss/reg": 5.968381810816936e-05, "step": 353 }, { "epoch": 0.04425, "grad_norm": 2.2348685264587402, "grad_norm_var": 0.2390334750954897, "learning_rate": 0.0001, "loss": 1.3948, "loss/crossentropy": 2.549100637435913, "loss/hidden": 1.1953125, "loss/logits": 0.198894202709198, "loss/reg": 5.9669990150723606e-05, "step": 354 }, { "epoch": 0.044375, "grad_norm": 2.6807351112365723, "grad_norm_var": 0.2247355561703434, "learning_rate": 0.0001, "loss": 1.5721, "loss/crossentropy": 2.2256884574890137, "loss/hidden": 1.3046875, "loss/logits": 0.26683151721954346, "loss/reg": 5.966486787656322e-05, "step": 355 }, { "epoch": 0.0445, "grad_norm": 3.1524059772491455, "grad_norm_var": 0.2573648537337417, "learning_rate": 0.0001, "loss": 1.5458, "loss/crossentropy": 2.4026124477386475, "loss/hidden": 1.3203125, "loss/logits": 0.22484509646892548, "loss/reg": 5.965128730167635e-05, "step": 356 }, { "epoch": 0.044625, "grad_norm": 3.806107759475708, "grad_norm_var": 0.3637951956534662, "learning_rate": 0.0001, "loss": 1.2257, "loss/crossentropy": 2.534790277481079, "loss/hidden": 1.1015625, "loss/logits": 0.12353114783763885, "loss/reg": 5.9637932281475514e-05, "step": 357 }, { "epoch": 0.04475, "grad_norm": 2.6499619483947754, "grad_norm_var": 0.36243857175732047, "learning_rate": 0.0001, "loss": 1.2577, "loss/crossentropy": 2.786536931991577, "loss/hidden": 1.078125, "loss/logits": 0.17896610498428345, "loss/reg": 5.962959403404966e-05, "step": 358 }, { "epoch": 0.044875, "grad_norm": 2.750371217727661, "grad_norm_var": 0.24122897908522703, "learning_rate": 0.0001, "loss": 1.3213, "loss/crossentropy": 2.5112698078155518, "loss/hidden": 1.125, "loss/logits": 0.19569161534309387, "loss/reg": 5.961711940472014e-05, "step": 359 }, { "epoch": 0.045, "grad_norm": 2.4145219326019287, "grad_norm_var": 0.23605635737508593, "learning_rate": 0.0001, "loss": 1.4067, "loss/crossentropy": 2.4327914714813232, "loss/hidden": 1.171875, "loss/logits": 0.23425719141960144, "loss/reg": 5.960506314295344e-05, "step": 360 }, { "epoch": 0.045125, "grad_norm": 2.7820589542388916, "grad_norm_var": 0.2317516785903725, "learning_rate": 0.0001, "loss": 1.5833, "loss/crossentropy": 2.6201419830322266, "loss/hidden": 1.3359375, "loss/logits": 0.2468121349811554, "loss/reg": 5.959635382168926e-05, "step": 361 }, { "epoch": 0.04525, "grad_norm": 3.0179331302642822, "grad_norm_var": 0.23691283979908515, "learning_rate": 0.0001, "loss": 1.3921, "loss/crossentropy": 2.728665351867676, "loss/hidden": 1.1953125, "loss/logits": 0.19617268443107605, "loss/reg": 5.958346446277574e-05, "step": 362 }, { "epoch": 0.045375, "grad_norm": 2.577760934829712, "grad_norm_var": 0.21171492452191767, "learning_rate": 0.0001, "loss": 1.3343, "loss/crossentropy": 2.4396915435791016, "loss/hidden": 1.140625, "loss/logits": 0.19306717813014984, "loss/reg": 5.957194298389368e-05, "step": 363 }, { "epoch": 0.0455, "grad_norm": 2.2478973865509033, "grad_norm_var": 0.22066793284785244, "learning_rate": 0.0001, "loss": 1.2107, "loss/crossentropy": 2.5703125, "loss/hidden": 1.046875, "loss/logits": 0.16320618987083435, "loss/reg": 5.955886445008218e-05, "step": 364 }, { "epoch": 0.045625, "grad_norm": 2.8303184509277344, "grad_norm_var": 0.21465200545435412, "learning_rate": 0.0001, "loss": 1.3184, "loss/crossentropy": 2.5793418884277344, "loss/hidden": 1.1171875, "loss/logits": 0.20061752200126648, "loss/reg": 5.954650623607449e-05, "step": 365 }, { "epoch": 0.04575, "grad_norm": 2.3407225608825684, "grad_norm_var": 0.20058607793752117, "learning_rate": 0.0001, "loss": 1.2154, "loss/crossentropy": 2.5118396282196045, "loss/hidden": 1.0546875, "loss/logits": 0.16011780500411987, "loss/reg": 5.9531517763389274e-05, "step": 366 }, { "epoch": 0.045875, "grad_norm": 2.9164462089538574, "grad_norm_var": 0.17624459628143327, "learning_rate": 0.0001, "loss": 2.3079, "loss/crossentropy": 2.530949831008911, "loss/hidden": 1.7890625, "loss/logits": 0.5182523727416992, "loss/reg": 5.9519883507164195e-05, "step": 367 }, { "epoch": 0.046, "grad_norm": 2.6031134128570557, "grad_norm_var": 0.17274354994838556, "learning_rate": 0.0001, "loss": 1.3529, "loss/crossentropy": 2.5211331844329834, "loss/hidden": 1.140625, "loss/logits": 0.21172133088111877, "loss/reg": 5.950441482127644e-05, "step": 368 }, { "epoch": 0.046125, "grad_norm": 2.2432241439819336, "grad_norm_var": 0.16462358021652007, "learning_rate": 0.0001, "loss": 1.2433, "loss/crossentropy": 2.469212055206299, "loss/hidden": 1.0625, "loss/logits": 0.18018998205661774, "loss/reg": 5.9490499552339315e-05, "step": 369 }, { "epoch": 0.04625, "grad_norm": 3.287365674972534, "grad_norm_var": 0.16815977224474163, "learning_rate": 0.0001, "loss": 1.3311, "loss/crossentropy": 2.7330899238586426, "loss/hidden": 1.140625, "loss/logits": 0.18986304104328156, "loss/reg": 5.9471924032550305e-05, "step": 370 }, { "epoch": 0.046375, "grad_norm": 2.6555063724517822, "grad_norm_var": 0.16849581874392333, "learning_rate": 0.0001, "loss": 1.3069, "loss/crossentropy": 2.4908649921417236, "loss/hidden": 1.1171875, "loss/logits": 0.1890988051891327, "loss/reg": 5.9457710449351e-05, "step": 371 }, { "epoch": 0.0465, "grad_norm": 2.2832915782928467, "grad_norm_var": 0.1710711381355336, "learning_rate": 0.0001, "loss": 1.251, "loss/crossentropy": 2.6485414505004883, "loss/hidden": 1.0859375, "loss/logits": 0.16444087028503418, "loss/reg": 5.9437123127281666e-05, "step": 372 }, { "epoch": 0.046625, "grad_norm": 1.9312299489974976, "grad_norm_var": 0.11748808484953574, "learning_rate": 0.0001, "loss": 1.3104, "loss/crossentropy": 2.4345285892486572, "loss/hidden": 1.125, "loss/logits": 0.1848057061433792, "loss/reg": 5.941649214946665e-05, "step": 373 }, { "epoch": 0.04675, "grad_norm": 2.2687668800354004, "grad_norm_var": 0.12381368567199799, "learning_rate": 0.0001, "loss": 1.2697, "loss/crossentropy": 2.514896869659424, "loss/hidden": 1.0859375, "loss/logits": 0.18321493268013, "loss/reg": 5.939120819675736e-05, "step": 374 }, { "epoch": 0.046875, "grad_norm": 2.1616384983062744, "grad_norm_var": 0.131467626574521, "learning_rate": 0.0001, "loss": 1.2405, "loss/crossentropy": 2.698112964630127, "loss/hidden": 1.0625, "loss/logits": 0.1773754358291626, "loss/reg": 5.936667002970353e-05, "step": 375 }, { "epoch": 0.047, "grad_norm": 2.6922011375427246, "grad_norm_var": 0.13182201209023336, "learning_rate": 0.0001, "loss": 1.3426, "loss/crossentropy": 2.538865327835083, "loss/hidden": 1.15625, "loss/logits": 0.1857489049434662, "loss/reg": 5.9345431509427726e-05, "step": 376 }, { "epoch": 0.047125, "grad_norm": 2.2630982398986816, "grad_norm_var": 0.13276797957838743, "learning_rate": 0.0001, "loss": 1.2869, "loss/crossentropy": 2.4644358158111572, "loss/hidden": 1.109375, "loss/logits": 0.17694343626499176, "loss/reg": 5.933275315328501e-05, "step": 377 }, { "epoch": 0.04725, "grad_norm": 2.479646682739258, "grad_norm_var": 0.11514238570119009, "learning_rate": 0.0001, "loss": 1.1618, "loss/crossentropy": 2.5582141876220703, "loss/hidden": 1.015625, "loss/logits": 0.14554372429847717, "loss/reg": 5.931046689511277e-05, "step": 378 }, { "epoch": 0.047375, "grad_norm": 2.466947317123413, "grad_norm_var": 0.114559834161389, "learning_rate": 0.0001, "loss": 1.48, "loss/crossentropy": 2.4128925800323486, "loss/hidden": 1.2421875, "loss/logits": 0.23724211752414703, "loss/reg": 5.929026156081818e-05, "step": 379 }, { "epoch": 0.0475, "grad_norm": 2.538424015045166, "grad_norm_var": 0.11086504579424972, "learning_rate": 0.0001, "loss": 1.4136, "loss/crossentropy": 2.0768887996673584, "loss/hidden": 1.234375, "loss/logits": 0.17867109179496765, "loss/reg": 5.92764736211393e-05, "step": 380 }, { "epoch": 0.047625, "grad_norm": 2.654524564743042, "grad_norm_var": 0.1049983644074643, "learning_rate": 0.0001, "loss": 1.3221, "loss/crossentropy": 2.1216413974761963, "loss/hidden": 1.15625, "loss/logits": 0.16521546244621277, "loss/reg": 5.926107769482769e-05, "step": 381 }, { "epoch": 0.04775, "grad_norm": 2.237818717956543, "grad_norm_var": 0.10766217194697697, "learning_rate": 0.0001, "loss": 1.2236, "loss/crossentropy": 2.6475207805633545, "loss/hidden": 1.0546875, "loss/logits": 0.16833502054214478, "loss/reg": 5.924178913119249e-05, "step": 382 }, { "epoch": 0.047875, "grad_norm": 2.7116799354553223, "grad_norm_var": 0.09837235459497572, "learning_rate": 0.0001, "loss": 1.3102, "loss/crossentropy": 2.6615209579467773, "loss/hidden": 1.1171875, "loss/logits": 0.1924624741077423, "loss/reg": 5.921960837440565e-05, "step": 383 }, { "epoch": 0.048, "grad_norm": 2.5439391136169434, "grad_norm_var": 0.09752047633307553, "learning_rate": 0.0001, "loss": 1.3258, "loss/crossentropy": 2.10198974609375, "loss/hidden": 1.15625, "loss/logits": 0.1689702719449997, "loss/reg": 5.919525210629217e-05, "step": 384 }, { "epoch": 0.048125, "grad_norm": 2.617921829223633, "grad_norm_var": 0.09528014676361156, "learning_rate": 0.0001, "loss": 1.61, "loss/crossentropy": 2.445833206176758, "loss/hidden": 1.328125, "loss/logits": 0.28133296966552734, "loss/reg": 5.917950693401508e-05, "step": 385 }, { "epoch": 0.04825, "grad_norm": 2.514899730682373, "grad_norm_var": 0.05015297139615639, "learning_rate": 0.0001, "loss": 1.1964, "loss/crossentropy": 2.4887778759002686, "loss/hidden": 1.0390625, "loss/logits": 0.1567072868347168, "loss/reg": 5.916162990615703e-05, "step": 386 }, { "epoch": 0.048375, "grad_norm": 2.1075565814971924, "grad_norm_var": 0.053089324895933446, "learning_rate": 0.0001, "loss": 1.0537, "loss/crossentropy": 2.4045815467834473, "loss/hidden": 0.921875, "loss/logits": 0.1312153935432434, "loss/reg": 5.914089342695661e-05, "step": 387 }, { "epoch": 0.0485, "grad_norm": 2.475404739379883, "grad_norm_var": 0.05228874002812057, "learning_rate": 0.0001, "loss": 1.3003, "loss/crossentropy": 2.591153383255005, "loss/hidden": 1.109375, "loss/logits": 0.19037862122058868, "loss/reg": 5.9116682677995414e-05, "step": 388 }, { "epoch": 0.048625, "grad_norm": 4.638079643249512, "grad_norm_var": 0.33504973194641535, "learning_rate": 0.0001, "loss": 1.7407, "loss/crossentropy": 2.992236852645874, "loss/hidden": 1.4609375, "loss/logits": 0.2792096734046936, "loss/reg": 5.9097284974996e-05, "step": 389 }, { "epoch": 0.04875, "grad_norm": 2.4662392139434814, "grad_norm_var": 0.32913998556907487, "learning_rate": 0.0001, "loss": 1.1454, "loss/crossentropy": 2.9239540100097656, "loss/hidden": 0.9921875, "loss/logits": 0.15260137617588043, "loss/reg": 5.907983722863719e-05, "step": 390 }, { "epoch": 0.048875, "grad_norm": 2.439119338989258, "grad_norm_var": 0.31780327994806234, "learning_rate": 0.0001, "loss": 1.3638, "loss/crossentropy": 2.450254440307617, "loss/hidden": 1.1640625, "loss/logits": 0.19915927946567535, "loss/reg": 5.9063841035822406e-05, "step": 391 }, { "epoch": 0.049, "grad_norm": 2.3475067615509033, "grad_norm_var": 0.3217026075593497, "learning_rate": 0.0001, "loss": 1.533, "loss/crossentropy": 2.617830753326416, "loss/hidden": 1.265625, "loss/logits": 0.26678475737571716, "loss/reg": 5.90429590374697e-05, "step": 392 }, { "epoch": 0.049125, "grad_norm": 4.364901065826416, "grad_norm_var": 0.5050899240629005, "learning_rate": 0.0001, "loss": 1.4632, "loss/crossentropy": 2.4607560634613037, "loss/hidden": 1.2421875, "loss/logits": 0.22039487957954407, "loss/reg": 5.902666089241393e-05, "step": 393 }, { "epoch": 0.04925, "grad_norm": 2.338758707046509, "grad_norm_var": 0.5109449021123245, "learning_rate": 0.0001, "loss": 1.2995, "loss/crossentropy": 2.6618576049804688, "loss/hidden": 1.1171875, "loss/logits": 0.1817541867494583, "loss/reg": 5.9010566474171355e-05, "step": 394 }, { "epoch": 0.049375, "grad_norm": 3.5642833709716797, "grad_norm_var": 0.549694181009107, "learning_rate": 0.0001, "loss": 1.3152, "loss/crossentropy": 2.300379753112793, "loss/hidden": 1.1171875, "loss/logits": 0.1974020004272461, "loss/reg": 5.8987676311517134e-05, "step": 395 }, { "epoch": 0.0495, "grad_norm": 2.1328978538513184, "grad_norm_var": 0.573308372527261, "learning_rate": 0.0001, "loss": 1.244, "loss/crossentropy": 2.4386301040649414, "loss/hidden": 1.0625, "loss/logits": 0.18090221285820007, "loss/reg": 5.8964946219930425e-05, "step": 396 }, { "epoch": 0.049625, "grad_norm": 3.0894107818603516, "grad_norm_var": 0.5790289690992334, "learning_rate": 0.0001, "loss": 1.5661, "loss/crossentropy": 2.365107297897339, "loss/hidden": 1.3671875, "loss/logits": 0.1983477920293808, "loss/reg": 5.8950212405761704e-05, "step": 397 }, { "epoch": 0.04975, "grad_norm": 3.194427967071533, "grad_norm_var": 0.566188494588774, "learning_rate": 0.0001, "loss": 1.4269, "loss/crossentropy": 2.384216547012329, "loss/hidden": 1.21875, "loss/logits": 0.20751546323299408, "loss/reg": 5.8928319049300626e-05, "step": 398 }, { "epoch": 0.049875, "grad_norm": 2.5108933448791504, "grad_norm_var": 0.5723226037333423, "learning_rate": 0.0001, "loss": 1.2127, "loss/crossentropy": 2.5466771125793457, "loss/hidden": 1.0546875, "loss/logits": 0.1574660688638687, "loss/reg": 5.891324326512404e-05, "step": 399 }, { "epoch": 0.05, "grad_norm": 2.9769773483276367, "grad_norm_var": 0.5672869916808385, "learning_rate": 0.0001, "loss": 1.3045, "loss/crossentropy": 2.7223000526428223, "loss/hidden": 1.1171875, "loss/logits": 0.18677057325839996, "loss/reg": 5.889027670491487e-05, "step": 400 }, { "epoch": 0.050125, "grad_norm": 3.31915283203125, "grad_norm_var": 0.5752734489563172, "learning_rate": 0.0001, "loss": 1.2639, "loss/crossentropy": 2.4886364936828613, "loss/hidden": 1.078125, "loss/logits": 0.1851940155029297, "loss/reg": 5.887265797355212e-05, "step": 401 }, { "epoch": 0.05025, "grad_norm": 1.8946937322616577, "grad_norm_var": 0.6315760522485537, "learning_rate": 0.0001, "loss": 1.2326, "loss/crossentropy": 2.414213180541992, "loss/hidden": 1.0703125, "loss/logits": 0.16165336966514587, "loss/reg": 5.885552673134953e-05, "step": 402 }, { "epoch": 0.050375, "grad_norm": 2.5370404720306396, "grad_norm_var": 0.5996572790739425, "learning_rate": 0.0001, "loss": 1.5079, "loss/crossentropy": 2.3421835899353027, "loss/hidden": 1.28125, "loss/logits": 0.22602099180221558, "loss/reg": 5.88419679843355e-05, "step": 403 }, { "epoch": 0.0505, "grad_norm": 2.4215445518493652, "grad_norm_var": 0.6028382899137373, "learning_rate": 0.0001, "loss": 1.4975, "loss/crossentropy": 2.7361152172088623, "loss/hidden": 1.2421875, "loss/logits": 0.25468122959136963, "loss/reg": 5.88247858104296e-05, "step": 404 }, { "epoch": 0.050625, "grad_norm": 2.049978733062744, "grad_norm_var": 0.4181645547932513, "learning_rate": 0.0001, "loss": 1.1088, "loss/crossentropy": 2.350353717803955, "loss/hidden": 0.953125, "loss/logits": 0.15509989857673645, "loss/reg": 5.880888784304261e-05, "step": 405 }, { "epoch": 0.05075, "grad_norm": 2.7967936992645264, "grad_norm_var": 0.41345734870287976, "learning_rate": 0.0001, "loss": 1.3869, "loss/crossentropy": 2.5875766277313232, "loss/hidden": 1.171875, "loss/logits": 0.21445012092590332, "loss/reg": 5.8793633797904477e-05, "step": 406 }, { "epoch": 0.050875, "grad_norm": 2.169900894165039, "grad_norm_var": 0.429098064205416, "learning_rate": 0.0001, "loss": 1.0776, "loss/crossentropy": 2.398125410079956, "loss/hidden": 0.93359375, "loss/logits": 0.14346018433570862, "loss/reg": 5.877741932636127e-05, "step": 407 }, { "epoch": 0.051, "grad_norm": 2.5045695304870605, "grad_norm_var": 0.4225916301522199, "learning_rate": 0.0001, "loss": 1.5355, "loss/crossentropy": 2.1697590351104736, "loss/hidden": 1.328125, "loss/logits": 0.20677754282951355, "loss/reg": 5.876670911675319e-05, "step": 408 }, { "epoch": 0.051125, "grad_norm": 18.23008918762207, "grad_norm_var": 15.43871781968465, "learning_rate": 0.0001, "loss": 1.4882, "loss/crossentropy": 2.602886438369751, "loss/hidden": 1.3046875, "loss/logits": 0.18289120495319366, "loss/reg": 5.874884300283156e-05, "step": 409 }, { "epoch": 0.05125, "grad_norm": 2.8436660766601562, "grad_norm_var": 15.369190103974788, "learning_rate": 0.0001, "loss": 1.3294, "loss/crossentropy": 2.4684174060821533, "loss/hidden": 1.140625, "loss/logits": 0.18818634748458862, "loss/reg": 5.873553891433403e-05, "step": 410 }, { "epoch": 0.051375, "grad_norm": 2.2729334831237793, "grad_norm_var": 15.486411427985377, "learning_rate": 0.0001, "loss": 1.2331, "loss/crossentropy": 2.550140857696533, "loss/hidden": 1.0390625, "loss/logits": 0.19348369538784027, "loss/reg": 5.8720732340589166e-05, "step": 411 }, { "epoch": 0.0515, "grad_norm": 2.5612359046936035, "grad_norm_var": 15.416427881560285, "learning_rate": 0.0001, "loss": 1.3333, "loss/crossentropy": 2.4774818420410156, "loss/hidden": 1.125, "loss/logits": 0.2077203392982483, "loss/reg": 5.8710702433018014e-05, "step": 412 }, { "epoch": 0.051625, "grad_norm": 4.02579927444458, "grad_norm_var": 15.409250289477422, "learning_rate": 0.0001, "loss": 1.507, "loss/crossentropy": 2.555722713470459, "loss/hidden": 1.3046875, "loss/logits": 0.20171231031417847, "loss/reg": 5.870195309398696e-05, "step": 413 }, { "epoch": 0.05175, "grad_norm": 2.443574905395508, "grad_norm_var": 15.489530544756628, "learning_rate": 0.0001, "loss": 1.2774, "loss/crossentropy": 2.706422805786133, "loss/hidden": 1.09375, "loss/logits": 0.1831112802028656, "loss/reg": 5.8690613514045253e-05, "step": 414 }, { "epoch": 0.051875, "grad_norm": 2.079418897628784, "grad_norm_var": 15.563674426313279, "learning_rate": 0.0001, "loss": 1.1798, "loss/crossentropy": 2.6763839721679688, "loss/hidden": 1.03125, "loss/logits": 0.14800235629081726, "loss/reg": 5.8675475884228945e-05, "step": 415 }, { "epoch": 0.052, "grad_norm": 2.7786471843719482, "grad_norm_var": 15.581826938638233, "learning_rate": 0.0001, "loss": 1.2465, "loss/crossentropy": 2.6709306240081787, "loss/hidden": 1.078125, "loss/logits": 0.1678304374217987, "loss/reg": 5.866462379344739e-05, "step": 416 }, { "epoch": 0.052125, "grad_norm": 2.770376443862915, "grad_norm_var": 15.618130403520784, "learning_rate": 0.0001, "loss": 1.3111, "loss/crossentropy": 2.646826982498169, "loss/hidden": 1.1328125, "loss/logits": 0.1777157187461853, "loss/reg": 5.865520142833702e-05, "step": 417 }, { "epoch": 0.05225, "grad_norm": 2.092414617538452, "grad_norm_var": 15.57762685735369, "learning_rate": 0.0001, "loss": 1.3353, "loss/crossentropy": 2.62361741065979, "loss/hidden": 1.15625, "loss/logits": 0.17848479747772217, "loss/reg": 5.8638761402107775e-05, "step": 418 }, { "epoch": 0.052375, "grad_norm": 2.05226731300354, "grad_norm_var": 15.656891853986265, "learning_rate": 0.0001, "loss": 1.14, "loss/crossentropy": 2.697723865509033, "loss/hidden": 0.9921875, "loss/logits": 0.14726917445659637, "loss/reg": 5.862316902494058e-05, "step": 419 }, { "epoch": 0.0525, "grad_norm": 2.6924796104431152, "grad_norm_var": 15.622310414474152, "learning_rate": 0.0001, "loss": 1.404, "loss/crossentropy": 2.601827383041382, "loss/hidden": 1.2109375, "loss/logits": 0.19246245920658112, "loss/reg": 5.860950841451995e-05, "step": 420 }, { "epoch": 0.052625, "grad_norm": 5.301983833312988, "grad_norm_var": 15.644682914862404, "learning_rate": 0.0001, "loss": 1.4862, "loss/crossentropy": 2.6217854022979736, "loss/hidden": 1.296875, "loss/logits": 0.18871337175369263, "loss/reg": 5.8600846386980265e-05, "step": 421 }, { "epoch": 0.05275, "grad_norm": 2.114091634750366, "grad_norm_var": 15.758396712898662, "learning_rate": 0.0001, "loss": 1.2033, "loss/crossentropy": 2.5663623809814453, "loss/hidden": 1.0390625, "loss/logits": 0.16362521052360535, "loss/reg": 5.8592915593180805e-05, "step": 422 }, { "epoch": 0.052875, "grad_norm": 2.757091999053955, "grad_norm_var": 15.661455859551703, "learning_rate": 0.0001, "loss": 1.1223, "loss/crossentropy": 2.4681971073150635, "loss/hidden": 0.97265625, "loss/logits": 0.14905983209609985, "loss/reg": 5.858425720361993e-05, "step": 423 }, { "epoch": 0.053, "grad_norm": 2.4524407386779785, "grad_norm_var": 15.670073831964206, "learning_rate": 0.0001, "loss": 1.2938, "loss/crossentropy": 2.4758145809173584, "loss/hidden": 1.1015625, "loss/logits": 0.19164547324180603, "loss/reg": 5.857350697624497e-05, "step": 424 }, { "epoch": 0.053125, "grad_norm": 2.3052892684936523, "grad_norm_var": 0.7038252417895506, "learning_rate": 0.0001, "loss": 1.2565, "loss/crossentropy": 2.597487211227417, "loss/hidden": 1.0703125, "loss/logits": 0.18559187650680542, "loss/reg": 5.855830750078894e-05, "step": 425 }, { "epoch": 0.05325, "grad_norm": 2.7276995182037354, "grad_norm_var": 0.7027765205874381, "learning_rate": 0.0001, "loss": 1.4141, "loss/crossentropy": 2.6818253993988037, "loss/hidden": 1.21875, "loss/logits": 0.1948131024837494, "loss/reg": 5.854442133568227e-05, "step": 426 }, { "epoch": 0.053375, "grad_norm": 1.725293517112732, "grad_norm_var": 0.7537440425638384, "learning_rate": 0.0001, "loss": 1.1664, "loss/crossentropy": 2.4244258403778076, "loss/hidden": 1.015625, "loss/logits": 0.1502000093460083, "loss/reg": 5.85384841542691e-05, "step": 427 }, { "epoch": 0.0535, "grad_norm": 2.6642932891845703, "grad_norm_var": 0.7527758186064119, "learning_rate": 0.0001, "loss": 1.5211, "loss/crossentropy": 2.1209182739257812, "loss/hidden": 1.328125, "loss/logits": 0.192403644323349, "loss/reg": 5.852692629559897e-05, "step": 428 }, { "epoch": 0.053625, "grad_norm": 2.7787868976593018, "grad_norm_var": 0.6272740663233074, "learning_rate": 0.0001, "loss": 1.3046, "loss/crossentropy": 2.3020565509796143, "loss/hidden": 1.125, "loss/logits": 0.179016575217247, "loss/reg": 5.851646346854977e-05, "step": 429 }, { "epoch": 0.05375, "grad_norm": 2.891101360321045, "grad_norm_var": 0.6299498912530666, "learning_rate": 0.0001, "loss": 1.4198, "loss/crossentropy": 2.33249568939209, "loss/hidden": 1.203125, "loss/logits": 0.21604114770889282, "loss/reg": 5.850956222275272e-05, "step": 430 }, { "epoch": 0.053875, "grad_norm": 2.7940289974212646, "grad_norm_var": 0.608789107013446, "learning_rate": 0.0001, "loss": 1.1825, "loss/crossentropy": 2.6553549766540527, "loss/hidden": 1.03125, "loss/logits": 0.15064392983913422, "loss/reg": 5.8500536397332326e-05, "step": 431 }, { "epoch": 0.054, "grad_norm": 25.06597328186035, "grad_norm_var": 31.943843646690855, "learning_rate": 0.0001, "loss": 2.4055, "loss/crossentropy": 2.7126245498657227, "loss/hidden": 2.03125, "loss/logits": 0.3736712336540222, "loss/reg": 5.849341687280685e-05, "step": 432 }, { "epoch": 0.054125, "grad_norm": 2.4612748622894287, "grad_norm_var": 32.003546233579016, "learning_rate": 0.0001, "loss": 1.4832, "loss/crossentropy": 2.6244633197784424, "loss/hidden": 1.25, "loss/logits": 0.23266229033470154, "loss/reg": 5.847978172823787e-05, "step": 433 }, { "epoch": 0.05425, "grad_norm": 2.413149356842041, "grad_norm_var": 31.926055741483236, "learning_rate": 0.0001, "loss": 1.405, "loss/crossentropy": 2.513383626937866, "loss/hidden": 1.1953125, "loss/logits": 0.2091376930475235, "loss/reg": 5.847239663125947e-05, "step": 434 }, { "epoch": 0.054375, "grad_norm": 2.1266605854034424, "grad_norm_var": 31.906339652731415, "learning_rate": 0.0001, "loss": 1.2307, "loss/crossentropy": 2.645113706588745, "loss/hidden": 1.0546875, "loss/logits": 0.17538747191429138, "loss/reg": 5.8466725022299215e-05, "step": 435 }, { "epoch": 0.0545, "grad_norm": 2.693485975265503, "grad_norm_var": 31.906153605922054, "learning_rate": 0.0001, "loss": 1.3491, "loss/crossentropy": 2.5616350173950195, "loss/hidden": 1.171875, "loss/logits": 0.1766662299633026, "loss/reg": 5.845691339345649e-05, "step": 436 }, { "epoch": 0.054625, "grad_norm": 3.594322681427002, "grad_norm_var": 31.81007436255887, "learning_rate": 0.0001, "loss": 1.4456, "loss/crossentropy": 2.320868492126465, "loss/hidden": 1.171875, "loss/logits": 0.2730950713157654, "loss/reg": 5.845166742801666e-05, "step": 437 }, { "epoch": 0.05475, "grad_norm": 2.725066900253296, "grad_norm_var": 31.681987454427826, "learning_rate": 0.0001, "loss": 1.4368, "loss/crossentropy": 2.4526007175445557, "loss/hidden": 1.21875, "loss/logits": 0.21745863556861877, "loss/reg": 5.844476982019842e-05, "step": 438 }, { "epoch": 0.054875, "grad_norm": 2.615208625793457, "grad_norm_var": 31.706966746538818, "learning_rate": 0.0001, "loss": 1.2902, "loss/crossentropy": 2.5873489379882812, "loss/hidden": 1.109375, "loss/logits": 0.18027284741401672, "loss/reg": 5.843998587806709e-05, "step": 439 }, { "epoch": 0.055, "grad_norm": 2.679504632949829, "grad_norm_var": 31.66327199965654, "learning_rate": 0.0001, "loss": 1.4142, "loss/crossentropy": 2.171384811401367, "loss/hidden": 1.21875, "loss/logits": 0.1948787271976471, "loss/reg": 5.8425270253792405e-05, "step": 440 }, { "epoch": 0.055125, "grad_norm": 2.781118869781494, "grad_norm_var": 31.56886824166385, "learning_rate": 0.0001, "loss": 1.2261, "loss/crossentropy": 2.616610050201416, "loss/hidden": 1.0625, "loss/logits": 0.16300562024116516, "loss/reg": 5.841004167450592e-05, "step": 441 }, { "epoch": 0.05525, "grad_norm": 2.8343710899353027, "grad_norm_var": 31.550828531904, "learning_rate": 0.0001, "loss": 1.6654, "loss/crossentropy": 2.254971504211426, "loss/hidden": 1.390625, "loss/logits": 0.27416497468948364, "loss/reg": 5.840086305397563e-05, "step": 442 }, { "epoch": 0.055375, "grad_norm": 2.943516254425049, "grad_norm_var": 31.26553828771242, "learning_rate": 0.0001, "loss": 1.3037, "loss/crossentropy": 2.607365131378174, "loss/hidden": 1.140625, "loss/logits": 0.16250211000442505, "loss/reg": 5.8392772189108655e-05, "step": 443 }, { "epoch": 0.0555, "grad_norm": 4.3494696617126465, "grad_norm_var": 31.11395178262311, "learning_rate": 0.0001, "loss": 1.4874, "loss/crossentropy": 2.803809642791748, "loss/hidden": 1.265625, "loss/logits": 0.22114460170269012, "loss/reg": 5.8383415307616815e-05, "step": 444 }, { "epoch": 0.055625, "grad_norm": 2.3149962425231934, "grad_norm_var": 31.21739595793184, "learning_rate": 0.0001, "loss": 1.1723, "loss/crossentropy": 2.7661781311035156, "loss/hidden": 1.015625, "loss/logits": 0.1560768485069275, "loss/reg": 5.8376208471599966e-05, "step": 445 }, { "epoch": 0.05575, "grad_norm": 2.5312862396240234, "grad_norm_var": 31.288532129977195, "learning_rate": 0.0001, "loss": 1.4583, "loss/crossentropy": 2.3608808517456055, "loss/hidden": 1.234375, "loss/logits": 0.22338923811912537, "loss/reg": 5.83621695113834e-05, "step": 446 }, { "epoch": 0.055875, "grad_norm": 2.0245697498321533, "grad_norm_var": 31.468007952235922, "learning_rate": 0.0001, "loss": 1.2537, "loss/crossentropy": 2.6646907329559326, "loss/hidden": 1.0859375, "loss/logits": 0.1671399027109146, "loss/reg": 5.835363481310196e-05, "step": 447 }, { "epoch": 0.056, "grad_norm": 4.180586338043213, "grad_norm_var": 0.4425575902395887, "learning_rate": 0.0001, "loss": 1.4287, "loss/crossentropy": 2.478865623474121, "loss/hidden": 1.1796875, "loss/logits": 0.2484455555677414, "loss/reg": 5.833926479681395e-05, "step": 448 }, { "epoch": 0.056125, "grad_norm": 2.2291383743286133, "grad_norm_var": 0.4573160813014281, "learning_rate": 0.0001, "loss": 1.2997, "loss/crossentropy": 2.244389295578003, "loss/hidden": 1.15625, "loss/logits": 0.14287710189819336, "loss/reg": 5.833054456161335e-05, "step": 449 }, { "epoch": 0.05625, "grad_norm": 2.204925060272217, "grad_norm_var": 0.47117643459253195, "learning_rate": 0.0001, "loss": 1.2876, "loss/crossentropy": 2.3469107151031494, "loss/hidden": 1.1015625, "loss/logits": 0.1854255050420761, "loss/reg": 5.832717943121679e-05, "step": 450 }, { "epoch": 0.056375, "grad_norm": 2.5266880989074707, "grad_norm_var": 0.4451698073358396, "learning_rate": 0.0001, "loss": 1.4392, "loss/crossentropy": 2.440885305404663, "loss/hidden": 1.2109375, "loss/logits": 0.22769977152347565, "loss/reg": 5.8323836128693074e-05, "step": 451 }, { "epoch": 0.0565, "grad_norm": 2.410515785217285, "grad_norm_var": 0.455202882380185, "learning_rate": 0.0001, "loss": 1.4083, "loss/crossentropy": 2.4578142166137695, "loss/hidden": 1.203125, "loss/logits": 0.20461352169513702, "loss/reg": 5.830869122291915e-05, "step": 452 }, { "epoch": 0.056625, "grad_norm": 2.0389811992645264, "grad_norm_var": 0.4435531519851603, "learning_rate": 0.0001, "loss": 1.1318, "loss/crossentropy": 2.139033317565918, "loss/hidden": 0.9921875, "loss/logits": 0.1390083134174347, "loss/reg": 5.829246947541833e-05, "step": 453 }, { "epoch": 0.05675, "grad_norm": 1.979454517364502, "grad_norm_var": 0.47698744011981165, "learning_rate": 0.0001, "loss": 1.3115, "loss/crossentropy": 2.546844005584717, "loss/hidden": 1.125, "loss/logits": 0.18587306141853333, "loss/reg": 5.8282243116991594e-05, "step": 454 }, { "epoch": 0.056875, "grad_norm": 2.0210747718811035, "grad_norm_var": 0.5030154373593951, "learning_rate": 0.0001, "loss": 1.21, "loss/crossentropy": 2.6095550060272217, "loss/hidden": 1.046875, "loss/logits": 0.16256017982959747, "loss/reg": 5.8266243286198005e-05, "step": 455 }, { "epoch": 0.057, "grad_norm": 2.0944671630859375, "grad_norm_var": 0.520400331750174, "learning_rate": 0.0001, "loss": 1.1407, "loss/crossentropy": 2.450681447982788, "loss/hidden": 0.98828125, "loss/logits": 0.15184549987316132, "loss/reg": 5.8250909205526114e-05, "step": 456 }, { "epoch": 0.057125, "grad_norm": 2.5854806900024414, "grad_norm_var": 0.5178481401493921, "learning_rate": 0.0001, "loss": 1.1308, "loss/crossentropy": 2.8090949058532715, "loss/hidden": 0.97265625, "loss/logits": 0.15754011273384094, "loss/reg": 5.8233421441400424e-05, "step": 457 }, { "epoch": 0.05725, "grad_norm": 6.832178592681885, "grad_norm_var": 1.6526915128701443, "learning_rate": 0.0001, "loss": 1.7544, "loss/crossentropy": 2.4325008392333984, "loss/hidden": 1.5625, "loss/logits": 0.1913643479347229, "loss/reg": 5.821782906423323e-05, "step": 458 }, { "epoch": 0.057375, "grad_norm": 2.4911727905273438, "grad_norm_var": 1.6585857165051416, "learning_rate": 0.0001, "loss": 1.277, "loss/crossentropy": 2.5682671070098877, "loss/hidden": 1.09375, "loss/logits": 0.18270117044448853, "loss/reg": 5.820325532113202e-05, "step": 459 }, { "epoch": 0.0575, "grad_norm": 2.2592287063598633, "grad_norm_var": 1.5000806172221008, "learning_rate": 0.0001, "loss": 1.149, "loss/crossentropy": 2.3300366401672363, "loss/hidden": 0.98828125, "loss/logits": 0.16016384959220886, "loss/reg": 5.8191151765640825e-05, "step": 460 }, { "epoch": 0.057625, "grad_norm": 2.6110737323760986, "grad_norm_var": 1.4915332961489087, "learning_rate": 0.0001, "loss": 1.4344, "loss/crossentropy": 2.560197591781616, "loss/hidden": 1.21875, "loss/logits": 0.21507461369037628, "loss/reg": 5.817634882987477e-05, "step": 461 }, { "epoch": 0.05775, "grad_norm": 2.6446752548217773, "grad_norm_var": 1.48995546498276, "learning_rate": 0.0001, "loss": 1.2381, "loss/crossentropy": 2.5068211555480957, "loss/hidden": 1.0546875, "loss/logits": 0.1828281581401825, "loss/reg": 5.816355405841023e-05, "step": 462 }, { "epoch": 0.057875, "grad_norm": 2.498300075531006, "grad_norm_var": 1.4615785550667995, "learning_rate": 0.0001, "loss": 1.3019, "loss/crossentropy": 2.3765523433685303, "loss/hidden": 1.1328125, "loss/logits": 0.16848215460777283, "loss/reg": 5.814860560349189e-05, "step": 463 }, { "epoch": 0.058, "grad_norm": 2.4674289226531982, "grad_norm_var": 1.3126372255276026, "learning_rate": 0.0001, "loss": 1.3472, "loss/crossentropy": 2.714657783508301, "loss/hidden": 1.1640625, "loss/logits": 0.18256625533103943, "loss/reg": 5.81321437493898e-05, "step": 464 }, { "epoch": 0.058125, "grad_norm": 3.7482964992523193, "grad_norm_var": 1.3780257940909062, "learning_rate": 0.0001, "loss": 1.4579, "loss/crossentropy": 2.7645256519317627, "loss/hidden": 1.2109375, "loss/logits": 0.24636635184288025, "loss/reg": 5.811548908241093e-05, "step": 465 }, { "epoch": 0.05825, "grad_norm": 3.1881492137908936, "grad_norm_var": 1.3717908440858895, "learning_rate": 0.0001, "loss": 1.2469, "loss/crossentropy": 2.6280384063720703, "loss/hidden": 1.078125, "loss/logits": 0.16818463802337646, "loss/reg": 5.8095396525459364e-05, "step": 466 }, { "epoch": 0.058375, "grad_norm": 3.4882731437683105, "grad_norm_var": 1.3977675144088226, "learning_rate": 0.0001, "loss": 1.5403, "loss/crossentropy": 1.8358429670333862, "loss/hidden": 1.3203125, "loss/logits": 0.21941694617271423, "loss/reg": 5.807522757095285e-05, "step": 467 }, { "epoch": 0.0585, "grad_norm": 2.530682325363159, "grad_norm_var": 1.391870091660969, "learning_rate": 0.0001, "loss": 1.1578, "loss/crossentropy": 2.3950142860412598, "loss/hidden": 0.99609375, "loss/logits": 0.1611400693655014, "loss/reg": 5.80518099013716e-05, "step": 468 }, { "epoch": 0.058625, "grad_norm": 3.4676575660705566, "grad_norm_var": 1.366390295617852, "learning_rate": 0.0001, "loss": 1.5162, "loss/crossentropy": 2.851280689239502, "loss/hidden": 1.234375, "loss/logits": 0.28122612833976746, "loss/reg": 5.8030982472701e-05, "step": 469 }, { "epoch": 0.05875, "grad_norm": 2.9446208477020264, "grad_norm_var": 1.302065384350945, "learning_rate": 0.0001, "loss": 1.3015, "loss/crossentropy": 2.740093469619751, "loss/hidden": 1.125, "loss/logits": 0.17590749263763428, "loss/reg": 5.800585859105922e-05, "step": 470 }, { "epoch": 0.058875, "grad_norm": 2.7597243785858154, "grad_norm_var": 1.2405377686230998, "learning_rate": 0.0001, "loss": 1.1651, "loss/crossentropy": 2.440762996673584, "loss/hidden": 1.015625, "loss/logits": 0.14888577163219452, "loss/reg": 5.7990357163362205e-05, "step": 471 }, { "epoch": 0.059, "grad_norm": 2.8147523403167725, "grad_norm_var": 1.182327943249795, "learning_rate": 0.0001, "loss": 1.3195, "loss/crossentropy": 2.5801327228546143, "loss/hidden": 1.140625, "loss/logits": 0.17824885249137878, "loss/reg": 5.7975972595158964e-05, "step": 472 }, { "epoch": 0.059125, "grad_norm": 2.4511027336120605, "grad_norm_var": 1.1923747545104257, "learning_rate": 0.0001, "loss": 1.4217, "loss/crossentropy": 2.5711913108825684, "loss/hidden": 1.203125, "loss/logits": 0.2180328667163849, "loss/reg": 5.796052937512286e-05, "step": 473 }, { "epoch": 0.05925, "grad_norm": 2.9213221073150635, "grad_norm_var": 0.1890407192544025, "learning_rate": 0.0001, "loss": 1.2735, "loss/crossentropy": 2.5805675983428955, "loss/hidden": 1.1015625, "loss/logits": 0.17132875323295593, "loss/reg": 5.794024036731571e-05, "step": 474 }, { "epoch": 0.059375, "grad_norm": 2.6587464809417725, "grad_norm_var": 0.1832162860499608, "learning_rate": 0.0001, "loss": 1.6569, "loss/crossentropy": 2.356299638748169, "loss/hidden": 1.40625, "loss/logits": 0.25005391240119934, "loss/reg": 5.791860894532874e-05, "step": 475 }, { "epoch": 0.0595, "grad_norm": 3.5978729724884033, "grad_norm_var": 0.19139826910290647, "learning_rate": 0.0001, "loss": 1.7357, "loss/crossentropy": 2.0626883506774902, "loss/hidden": 1.4765625, "loss/logits": 0.2585859000682831, "loss/reg": 5.790415525552817e-05, "step": 476 }, { "epoch": 0.059625, "grad_norm": 2.8491876125335693, "grad_norm_var": 0.18498974202791843, "learning_rate": 0.0001, "loss": 1.5276, "loss/crossentropy": 2.5583596229553223, "loss/hidden": 1.2734375, "loss/logits": 0.25358158349990845, "loss/reg": 5.788617272628471e-05, "step": 477 }, { "epoch": 0.05975, "grad_norm": 2.5821259021759033, "grad_norm_var": 0.1876924518839881, "learning_rate": 0.0001, "loss": 1.3568, "loss/crossentropy": 2.486640453338623, "loss/hidden": 1.1640625, "loss/logits": 0.19216927886009216, "loss/reg": 5.786680776509456e-05, "step": 478 }, { "epoch": 0.059875, "grad_norm": 2.877934217453003, "grad_norm_var": 0.17456917708907038, "learning_rate": 0.0001, "loss": 1.5607, "loss/crossentropy": 2.3836066722869873, "loss/hidden": 1.3203125, "loss/logits": 0.23981472849845886, "loss/reg": 5.785070243291557e-05, "step": 479 }, { "epoch": 0.06, "grad_norm": 2.3281009197235107, "grad_norm_var": 0.1849188959934999, "learning_rate": 0.0001, "loss": 1.2716, "loss/crossentropy": 2.508988380432129, "loss/hidden": 1.078125, "loss/logits": 0.19294525682926178, "loss/reg": 5.783725646324456e-05, "step": 480 }, { "epoch": 0.060125, "grad_norm": 2.8099567890167236, "grad_norm_var": 0.14013939438571937, "learning_rate": 0.0001, "loss": 1.5081, "loss/crossentropy": 2.3855881690979004, "loss/hidden": 1.25, "loss/logits": 0.2575419545173645, "loss/reg": 5.782474545412697e-05, "step": 481 }, { "epoch": 0.06025, "grad_norm": 2.9827277660369873, "grad_norm_var": 0.134662315913679, "learning_rate": 0.0001, "loss": 1.4593, "loss/crossentropy": 2.5487606525421143, "loss/hidden": 1.25, "loss/logits": 0.2087090015411377, "loss/reg": 5.7816720072878525e-05, "step": 482 }, { "epoch": 0.060375, "grad_norm": 2.306149959564209, "grad_norm_var": 0.1259770764512929, "learning_rate": 0.0001, "loss": 1.2076, "loss/crossentropy": 2.4755747318267822, "loss/hidden": 1.046875, "loss/logits": 0.16014963388442993, "loss/reg": 5.781082290923223e-05, "step": 483 }, { "epoch": 0.0605, "grad_norm": 2.4719114303588867, "grad_norm_var": 0.12834384378027816, "learning_rate": 0.0001, "loss": 1.3745, "loss/crossentropy": 2.8203346729278564, "loss/hidden": 1.171875, "loss/logits": 0.20208273828029633, "loss/reg": 5.7795077736955136e-05, "step": 484 }, { "epoch": 0.060625, "grad_norm": 2.300952911376953, "grad_norm_var": 0.10978991346620433, "learning_rate": 0.0001, "loss": 1.464, "loss/crossentropy": 2.610508680343628, "loss/hidden": 1.2265625, "loss/logits": 0.2368427813053131, "loss/reg": 5.778546983492561e-05, "step": 485 }, { "epoch": 0.06075, "grad_norm": 3.3388009071350098, "grad_norm_var": 0.13085586368501342, "learning_rate": 0.0001, "loss": 1.5116, "loss/crossentropy": 2.763427972793579, "loss/hidden": 1.296875, "loss/logits": 0.21419215202331543, "loss/reg": 5.7770797866396606e-05, "step": 486 }, { "epoch": 0.060875, "grad_norm": 2.102293014526367, "grad_norm_var": 0.1572983810037916, "learning_rate": 0.0001, "loss": 1.1595, "loss/crossentropy": 2.204011917114258, "loss/hidden": 1.0, "loss/logits": 0.158901646733284, "loss/reg": 5.7755187299335375e-05, "step": 487 }, { "epoch": 0.061, "grad_norm": 2.766934633255005, "grad_norm_var": 0.15678694409689248, "learning_rate": 0.0001, "loss": 1.4246, "loss/crossentropy": 2.537151575088501, "loss/hidden": 1.2109375, "loss/logits": 0.2130882441997528, "loss/reg": 5.774224700871855e-05, "step": 488 }, { "epoch": 0.061125, "grad_norm": 2.0001540184020996, "grad_norm_var": 0.18501104247654798, "learning_rate": 0.0001, "loss": 1.103, "loss/crossentropy": 2.3592050075531006, "loss/hidden": 0.96484375, "loss/logits": 0.13754940032958984, "loss/reg": 5.77289865759667e-05, "step": 489 }, { "epoch": 0.06125, "grad_norm": 2.3166351318359375, "grad_norm_var": 0.18848381138351228, "learning_rate": 0.0001, "loss": 1.3329, "loss/crossentropy": 2.7236411571502686, "loss/hidden": 1.15625, "loss/logits": 0.1761033535003662, "loss/reg": 5.771181167801842e-05, "step": 490 }, { "epoch": 0.061375, "grad_norm": 2.357775926589966, "grad_norm_var": 0.19351960086170053, "learning_rate": 0.0001, "loss": 1.1437, "loss/crossentropy": 2.866445779800415, "loss/hidden": 0.98828125, "loss/logits": 0.15484049916267395, "loss/reg": 5.769642666564323e-05, "step": 491 }, { "epoch": 0.0615, "grad_norm": 3.680264949798584, "grad_norm_var": 0.20463866822373877, "learning_rate": 0.0001, "loss": 1.2002, "loss/crossentropy": 3.115431308746338, "loss/hidden": 1.0390625, "loss/logits": 0.16054463386535645, "loss/reg": 5.7679084420669824e-05, "step": 492 }, { "epoch": 0.061625, "grad_norm": 2.3650856018066406, "grad_norm_var": 0.2051052996774897, "learning_rate": 0.0001, "loss": 1.1996, "loss/crossentropy": 2.6519298553466797, "loss/hidden": 1.0234375, "loss/logits": 0.17554257810115814, "loss/reg": 5.766074173152447e-05, "step": 493 }, { "epoch": 0.06175, "grad_norm": 2.7080323696136475, "grad_norm_var": 0.2058088113620099, "learning_rate": 0.0001, "loss": 1.365, "loss/crossentropy": 2.329538106918335, "loss/hidden": 1.15625, "loss/logits": 0.20815491676330566, "loss/reg": 5.764625166193582e-05, "step": 494 }, { "epoch": 0.061875, "grad_norm": 2.2859530448913574, "grad_norm_var": 0.2063347958167308, "learning_rate": 0.0001, "loss": 1.2994, "loss/crossentropy": 2.6445348262786865, "loss/hidden": 1.125, "loss/logits": 0.1738019585609436, "loss/reg": 5.763155422755517e-05, "step": 495 }, { "epoch": 0.062, "grad_norm": 2.771320343017578, "grad_norm_var": 0.20431087500909348, "learning_rate": 0.0001, "loss": 1.4714, "loss/crossentropy": 2.340728282928467, "loss/hidden": 1.2578125, "loss/logits": 0.21303007006645203, "loss/reg": 5.761897409684025e-05, "step": 496 }, { "epoch": 0.062125, "grad_norm": 3.022183656692505, "grad_norm_var": 0.21312900983479016, "learning_rate": 0.0001, "loss": 1.4858, "loss/crossentropy": 2.6772336959838867, "loss/hidden": 1.265625, "loss/logits": 0.2196260541677475, "loss/reg": 5.761081411037594e-05, "step": 497 }, { "epoch": 0.06225, "grad_norm": 13.948429107666016, "grad_norm_var": 8.27193520122967, "learning_rate": 0.0001, "loss": 1.3633, "loss/crossentropy": 2.862323760986328, "loss/hidden": 1.171875, "loss/logits": 0.19083081185817719, "loss/reg": 5.7596374972490594e-05, "step": 498 }, { "epoch": 0.062375, "grad_norm": 2.6107678413391113, "grad_norm_var": 8.237513777759569, "learning_rate": 0.0001, "loss": 1.6771, "loss/crossentropy": 2.1725099086761475, "loss/hidden": 1.40625, "loss/logits": 0.2702314555644989, "loss/reg": 5.7586628827266395e-05, "step": 499 }, { "epoch": 0.0625, "grad_norm": 2.5658040046691895, "grad_norm_var": 8.22750426778598, "learning_rate": 0.0001, "loss": 1.4381, "loss/crossentropy": 2.246595859527588, "loss/hidden": 1.25, "loss/logits": 0.18755751848220825, "loss/reg": 5.756897371611558e-05, "step": 500 } ], "logging_steps": 1, "max_steps": 8000, "num_input_tokens_seen": 0, "num_train_epochs": 9223372036854775807, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": true, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.2202930782208e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }