{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.02, "eval_steps": 1000, "global_step": 2000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 1e-05, "grad_norm": 0.37634041905403137, "learning_rate": 5e-06, "loss": 0.169, "loss/crossentropy": 2.8720462918281555, "loss/hidden": 0.0, "loss/logits": 0.16897856071591377, "loss/reg": 4.4040703773498535, "step": 1 }, { "epoch": 2e-05, "grad_norm": 0.35649582743644714, "learning_rate": 1e-05, "loss": 0.1696, "loss/crossentropy": 2.715533673763275, "loss/hidden": 0.0, "loss/logits": 0.1695844642817974, "loss/reg": 4.399058818817139, "step": 2 }, { "epoch": 3e-05, "grad_norm": 0.3591013252735138, "learning_rate": 1.5e-05, "loss": 0.1782, "loss/crossentropy": 2.6291310787200928, "loss/hidden": 0.0, "loss/logits": 0.1782267540693283, "loss/reg": 4.394084930419922, "step": 3 }, { "epoch": 4e-05, "grad_norm": 0.36401960253715515, "learning_rate": 2e-05, "loss": 0.1843, "loss/crossentropy": 2.7142109274864197, "loss/hidden": 0.0, "loss/logits": 0.1843317598104477, "loss/reg": 4.389005661010742, "step": 4 }, { "epoch": 5e-05, "grad_norm": 0.3119131922721863, "learning_rate": 2.5e-05, "loss": 0.1625, "loss/crossentropy": 2.7586326003074646, "loss/hidden": 0.0, "loss/logits": 0.1625315584242344, "loss/reg": 4.3841166496276855, "step": 5 }, { "epoch": 6e-05, "grad_norm": 0.3388400673866272, "learning_rate": 3e-05, "loss": 0.1844, "loss/crossentropy": 2.8104345202445984, "loss/hidden": 0.0, "loss/logits": 0.1844346523284912, "loss/reg": 4.3792877197265625, "step": 6 }, { "epoch": 7e-05, "grad_norm": 0.4783320426940918, "learning_rate": 3.5e-05, "loss": 0.1843, "loss/crossentropy": 2.8321655988693237, "loss/hidden": 0.0, "loss/logits": 0.18431555479764938, "loss/reg": 4.37478494644165, "step": 7 }, { "epoch": 8e-05, "grad_norm": 0.29636114835739136, "learning_rate": 4e-05, "loss": 0.1589, "loss/crossentropy": 2.6809526681900024, "loss/hidden": 0.0, "loss/logits": 0.15894119441509247, "loss/reg": 4.370139122009277, "step": 8 }, { "epoch": 9e-05, "grad_norm": 0.30071625113487244, "learning_rate": 4.5e-05, "loss": 0.1657, "loss/crossentropy": 2.6759764552116394, "loss/hidden": 0.0, "loss/logits": 0.16574353352189064, "loss/reg": 4.365106105804443, "step": 9 }, { "epoch": 0.0001, "grad_norm": 0.28883349895477295, "learning_rate": 5e-05, "loss": 0.1572, "loss/crossentropy": 2.808637499809265, "loss/hidden": 0.0, "loss/logits": 0.15719739720225334, "loss/reg": 4.360220909118652, "step": 10 }, { "epoch": 0.00011, "grad_norm": 0.28243017196655273, "learning_rate": 5e-05, "loss": 0.1426, "loss/crossentropy": 2.72423392534256, "loss/hidden": 0.0, "loss/logits": 0.14257685840129852, "loss/reg": 4.355813503265381, "step": 11 }, { "epoch": 0.00012, "grad_norm": 0.31152331829071045, "learning_rate": 5e-05, "loss": 0.147, "loss/crossentropy": 2.710044264793396, "loss/hidden": 0.0, "loss/logits": 0.14701137319207191, "loss/reg": 4.351265907287598, "step": 12 }, { "epoch": 0.00013, "grad_norm": 0.2739678919315338, "learning_rate": 5e-05, "loss": 0.1499, "loss/crossentropy": 2.7644649744033813, "loss/hidden": 0.0, "loss/logits": 0.149860430508852, "loss/reg": 4.346287727355957, "step": 13 }, { "epoch": 0.00014, "grad_norm": 0.2712353467941284, "learning_rate": 5e-05, "loss": 0.1454, "loss/crossentropy": 2.7370432019233704, "loss/hidden": 0.0, "loss/logits": 0.14539287611842155, "loss/reg": 4.340969085693359, "step": 14 }, { "epoch": 0.00015, "grad_norm": 0.2667863667011261, "learning_rate": 5e-05, "loss": 0.1403, "loss/crossentropy": 2.5638718008995056, "loss/hidden": 0.0, "loss/logits": 0.14029696956276894, "loss/reg": 4.336019515991211, "step": 15 }, { "epoch": 0.00016, "grad_norm": 0.30467212200164795, "grad_norm_var": 0.0029449483710212204, "learning_rate": 5e-05, "loss": 0.1361, "loss/crossentropy": 2.797445595264435, "loss/hidden": 0.0, "loss/logits": 0.13607431203126907, "loss/reg": 4.330692291259766, "step": 16 }, { "epoch": 0.00017, "grad_norm": 0.2617621421813965, "grad_norm_var": 0.0029635281595075556, "learning_rate": 5e-05, "loss": 0.1443, "loss/crossentropy": 2.7542406916618347, "loss/hidden": 0.0, "loss/logits": 0.14427556470036507, "loss/reg": 4.325323581695557, "step": 17 }, { "epoch": 0.00018, "grad_norm": 0.28648674488067627, "grad_norm_var": 0.0028982593896559215, "learning_rate": 5e-05, "loss": 0.1396, "loss/crossentropy": 2.674492835998535, "loss/hidden": 0.0, "loss/logits": 0.13961521908640862, "loss/reg": 4.31995153427124, "step": 18 }, { "epoch": 0.00019, "grad_norm": 0.26269060373306274, "grad_norm_var": 0.002877724259904054, "learning_rate": 5e-05, "loss": 0.141, "loss/crossentropy": 2.8323662281036377, "loss/hidden": 0.0, "loss/logits": 0.14103225618600845, "loss/reg": 4.315446853637695, "step": 19 }, { "epoch": 0.0002, "grad_norm": 0.2718074321746826, "grad_norm_var": 0.0026993307095730186, "learning_rate": 5e-05, "loss": 0.1314, "loss/crossentropy": 2.63212913274765, "loss/hidden": 0.0, "loss/logits": 0.1313977725803852, "loss/reg": 4.310704708099365, "step": 20 }, { "epoch": 0.00021, "grad_norm": 0.2430431842803955, "grad_norm_var": 0.0028911751903802204, "learning_rate": 5e-05, "loss": 0.1324, "loss/crossentropy": 2.664808928966522, "loss/hidden": 0.0, "loss/logits": 0.1324238833039999, "loss/reg": 4.305792808532715, "step": 21 }, { "epoch": 0.00022, "grad_norm": 0.24898661673069, "grad_norm_var": 0.00288514612507397, "learning_rate": 5e-05, "loss": 0.1242, "loss/crossentropy": 2.7142711877822876, "loss/hidden": 0.0, "loss/logits": 0.12423932552337646, "loss/reg": 4.300712585449219, "step": 22 }, { "epoch": 0.00023, "grad_norm": 0.3123313784599304, "grad_norm_var": 0.0004523056580034851, "learning_rate": 5e-05, "loss": 0.1321, "loss/crossentropy": 2.7829225063323975, "loss/hidden": 0.0, "loss/logits": 0.13212688639760017, "loss/reg": 4.295501232147217, "step": 23 }, { "epoch": 0.00024, "grad_norm": 0.25187963247299194, "grad_norm_var": 0.00048027979198491945, "learning_rate": 5e-05, "loss": 0.1248, "loss/crossentropy": 2.692659854888916, "loss/hidden": 0.0, "loss/logits": 0.12482420355081558, "loss/reg": 4.2908830642700195, "step": 24 }, { "epoch": 0.00025, "grad_norm": 0.2151177078485489, "grad_norm_var": 0.0006726495064564575, "learning_rate": 5e-05, "loss": 0.1232, "loss/crossentropy": 2.738182246685028, "loss/hidden": 0.0, "loss/logits": 0.1231868714094162, "loss/reg": 4.285846710205078, "step": 25 }, { "epoch": 0.00026, "grad_norm": 0.23308518528938293, "grad_norm_var": 0.0007424884519799501, "learning_rate": 5e-05, "loss": 0.1174, "loss/crossentropy": 2.555102586746216, "loss/hidden": 0.0, "loss/logits": 0.11737299524247646, "loss/reg": 4.281113147735596, "step": 26 }, { "epoch": 0.00027, "grad_norm": 0.24523235857486725, "grad_norm_var": 0.0007604384721796281, "learning_rate": 5e-05, "loss": 0.1201, "loss/crossentropy": 2.6816893815994263, "loss/hidden": 0.0, "loss/logits": 0.12014555744826794, "loss/reg": 4.2765069007873535, "step": 27 }, { "epoch": 0.00028, "grad_norm": 0.25897473096847534, "grad_norm_var": 0.0006160828367585275, "learning_rate": 5e-05, "loss": 0.1227, "loss/crossentropy": 2.7505548000335693, "loss/hidden": 0.0, "loss/logits": 0.12271320074796677, "loss/reg": 4.27158260345459, "step": 28 }, { "epoch": 0.00029, "grad_norm": 0.23087331652641296, "grad_norm_var": 0.0006691547004593392, "learning_rate": 5e-05, "loss": 0.1181, "loss/crossentropy": 2.8483291268348694, "loss/hidden": 0.0, "loss/logits": 0.11810225620865822, "loss/reg": 4.267061233520508, "step": 29 }, { "epoch": 0.0003, "grad_norm": 1.2210192680358887, "grad_norm_var": 0.05843327221954173, "learning_rate": 5e-05, "loss": 0.1723, "loss/crossentropy": 2.8535077571868896, "loss/hidden": 0.0, "loss/logits": 0.17234252952039242, "loss/reg": 4.262645244598389, "step": 30 }, { "epoch": 0.00031, "grad_norm": 0.2712586224079132, "grad_norm_var": 0.058402986662709634, "learning_rate": 5e-05, "loss": 0.1156, "loss/crossentropy": 2.6525614261627197, "loss/hidden": 0.0, "loss/logits": 0.11560441367328167, "loss/reg": 4.258092403411865, "step": 31 }, { "epoch": 0.00032, "grad_norm": 0.5226843953132629, "grad_norm_var": 0.06092943089461011, "learning_rate": 5e-05, "loss": 0.1537, "loss/crossentropy": 2.6228127479553223, "loss/hidden": 0.0, "loss/logits": 0.15369537472724915, "loss/reg": 4.253781318664551, "step": 32 }, { "epoch": 0.00033, "grad_norm": 0.35246461629867554, "grad_norm_var": 0.06057510886832484, "learning_rate": 5e-05, "loss": 0.1216, "loss/crossentropy": 2.6986429691314697, "loss/hidden": 0.0, "loss/logits": 0.12163393199443817, "loss/reg": 4.249208450317383, "step": 33 }, { "epoch": 0.00034, "grad_norm": 0.2868311405181885, "grad_norm_var": 0.060572693607631393, "learning_rate": 5e-05, "loss": 0.1215, "loss/crossentropy": 2.7423174381256104, "loss/hidden": 0.0, "loss/logits": 0.12151895463466644, "loss/reg": 4.244677543640137, "step": 34 }, { "epoch": 0.00035, "grad_norm": 0.2556142210960388, "grad_norm_var": 0.06064807497415105, "learning_rate": 5e-05, "loss": 0.1137, "loss/crossentropy": 2.7171207070350647, "loss/hidden": 0.0, "loss/logits": 0.1137176975607872, "loss/reg": 4.2399797439575195, "step": 35 }, { "epoch": 0.00036, "grad_norm": 0.2783287763595581, "grad_norm_var": 0.060592460146055585, "learning_rate": 5e-05, "loss": 0.1138, "loss/crossentropy": 2.7394094467163086, "loss/hidden": 0.0, "loss/logits": 0.11381806619465351, "loss/reg": 4.235424041748047, "step": 36 }, { "epoch": 0.00037, "grad_norm": 0.3065175712108612, "grad_norm_var": 0.06003019540430902, "learning_rate": 5e-05, "loss": 0.1235, "loss/crossentropy": 2.755502223968506, "loss/hidden": 0.0, "loss/logits": 0.12348765879869461, "loss/reg": 4.2310051918029785, "step": 37 }, { "epoch": 0.00038, "grad_norm": 0.26492562890052795, "grad_norm_var": 0.059845851287469956, "learning_rate": 5e-05, "loss": 0.1119, "loss/crossentropy": 2.8106552362442017, "loss/hidden": 0.0, "loss/logits": 0.11191634088754654, "loss/reg": 4.226707935333252, "step": 38 }, { "epoch": 0.00039, "grad_norm": 0.24673967063426971, "grad_norm_var": 0.06039341868271975, "learning_rate": 5e-05, "loss": 0.1161, "loss/crossentropy": 2.7490118741989136, "loss/hidden": 0.0, "loss/logits": 0.11609707958996296, "loss/reg": 4.222842216491699, "step": 39 }, { "epoch": 0.0004, "grad_norm": 0.2973298132419586, "grad_norm_var": 0.05998792869591778, "learning_rate": 5e-05, "loss": 0.1124, "loss/crossentropy": 2.7798808813095093, "loss/hidden": 0.0, "loss/logits": 0.11244922317564487, "loss/reg": 4.218531131744385, "step": 40 }, { "epoch": 0.00041, "grad_norm": 0.7517657279968262, "grad_norm_var": 0.06884148715130983, "learning_rate": 5e-05, "loss": 0.1545, "loss/crossentropy": 2.749855697154999, "loss/hidden": 0.0, "loss/logits": 0.15445118583738804, "loss/reg": 4.214253902435303, "step": 41 }, { "epoch": 0.00042, "grad_norm": 0.2417730987071991, "grad_norm_var": 0.06868010027414732, "learning_rate": 5e-05, "loss": 0.1099, "loss/crossentropy": 2.751042366027832, "loss/hidden": 0.0, "loss/logits": 0.1099155992269516, "loss/reg": 4.2101359367370605, "step": 42 }, { "epoch": 0.00043, "grad_norm": 0.2631951570510864, "grad_norm_var": 0.06838462807177058, "learning_rate": 5e-05, "loss": 0.1165, "loss/crossentropy": 2.7250843048095703, "loss/hidden": 0.0, "loss/logits": 0.11648696288466454, "loss/reg": 4.206397533416748, "step": 43 }, { "epoch": 0.00044, "grad_norm": 0.2518296241760254, "grad_norm_var": 0.06850134865244813, "learning_rate": 5e-05, "loss": 0.1111, "loss/crossentropy": 2.7153283953666687, "loss/hidden": 0.0, "loss/logits": 0.11108221486210823, "loss/reg": 4.201878547668457, "step": 44 }, { "epoch": 0.00045, "grad_norm": 0.24082158505916595, "grad_norm_var": 0.06831278207672915, "learning_rate": 5e-05, "loss": 0.1177, "loss/crossentropy": 2.6632660627365112, "loss/hidden": 0.0, "loss/logits": 0.11769118346273899, "loss/reg": 4.19778299331665, "step": 45 }, { "epoch": 0.00046, "grad_norm": 0.260890394449234, "grad_norm_var": 0.018048092726357542, "learning_rate": 5e-05, "loss": 0.1227, "loss/crossentropy": 2.7315176129341125, "loss/hidden": 0.0, "loss/logits": 0.12269957736134529, "loss/reg": 4.193592071533203, "step": 46 }, { "epoch": 0.00047, "grad_norm": 0.25268790125846863, "grad_norm_var": 0.018186152495949234, "learning_rate": 5e-05, "loss": 0.1178, "loss/crossentropy": 2.774504065513611, "loss/hidden": 0.0, "loss/logits": 0.11776839196681976, "loss/reg": 4.189169406890869, "step": 47 }, { "epoch": 0.00048, "grad_norm": 0.2759403884410858, "grad_norm_var": 0.015229396543742831, "learning_rate": 5e-05, "loss": 0.1289, "loss/crossentropy": 2.8515073657035828, "loss/hidden": 0.0, "loss/logits": 0.12885254248976707, "loss/reg": 4.185054779052734, "step": 48 }, { "epoch": 0.00049, "grad_norm": 0.24765782058238983, "grad_norm_var": 0.015206926335741973, "learning_rate": 5e-05, "loss": 0.1256, "loss/crossentropy": 2.7131593823432922, "loss/hidden": 0.0, "loss/logits": 0.1256290916353464, "loss/reg": 4.1810526847839355, "step": 49 }, { "epoch": 0.0005, "grad_norm": 0.3096969425678253, "grad_norm_var": 0.015214156358291781, "learning_rate": 5e-05, "loss": 0.1401, "loss/crossentropy": 2.7528311014175415, "loss/hidden": 0.0, "loss/logits": 0.14005928859114647, "loss/reg": 4.176880359649658, "step": 50 }, { "epoch": 0.00051, "grad_norm": 0.33225017786026, "grad_norm_var": 0.015162352298149247, "learning_rate": 5e-05, "loss": 0.1618, "loss/crossentropy": 2.73341304063797, "loss/hidden": 0.0, "loss/logits": 0.1618291698396206, "loss/reg": 4.173260688781738, "step": 51 }, { "epoch": 0.00052, "grad_norm": 0.33166685700416565, "grad_norm_var": 0.015176107188209845, "learning_rate": 5e-05, "loss": 0.1704, "loss/crossentropy": 2.824883460998535, "loss/hidden": 0.0, "loss/logits": 0.1703827939927578, "loss/reg": 4.168625354766846, "step": 52 }, { "epoch": 0.00053, "grad_norm": 0.4255874752998352, "grad_norm_var": 0.01609058098027729, "learning_rate": 5e-05, "loss": 0.1856, "loss/crossentropy": 2.8565452694892883, "loss/hidden": 0.0, "loss/logits": 0.18561138212680817, "loss/reg": 4.164296627044678, "step": 53 }, { "epoch": 0.00054, "grad_norm": 0.33207008242607117, "grad_norm_var": 0.015949373509081675, "learning_rate": 5e-05, "loss": 0.1762, "loss/crossentropy": 2.7211243510246277, "loss/hidden": 0.0, "loss/logits": 0.1762254200875759, "loss/reg": 4.16010856628418, "step": 54 }, { "epoch": 0.00055, "grad_norm": 0.3105420470237732, "grad_norm_var": 0.01561146008609899, "learning_rate": 5e-05, "loss": 0.172, "loss/crossentropy": 2.7821205854415894, "loss/hidden": 0.0, "loss/logits": 0.17203472182154655, "loss/reg": 4.155950546264648, "step": 55 }, { "epoch": 0.00056, "grad_norm": 0.3342844247817993, "grad_norm_var": 0.015583353488029018, "learning_rate": 5e-05, "loss": 0.1675, "loss/crossentropy": 2.783965766429901, "loss/hidden": 0.0, "loss/logits": 0.1675088219344616, "loss/reg": 4.151437759399414, "step": 56 }, { "epoch": 0.00057, "grad_norm": 0.3392151892185211, "grad_norm_var": 0.0026173613848745727, "learning_rate": 5e-05, "loss": 0.1675, "loss/crossentropy": 2.782883048057556, "loss/hidden": 0.0, "loss/logits": 0.16754426062107086, "loss/reg": 4.1469950675964355, "step": 57 }, { "epoch": 0.00058, "grad_norm": 0.46169230341911316, "grad_norm_var": 0.004024211017059094, "learning_rate": 5e-05, "loss": 0.1828, "loss/crossentropy": 2.6869139075279236, "loss/hidden": 0.0, "loss/logits": 0.18278859555721283, "loss/reg": 4.142712116241455, "step": 58 }, { "epoch": 0.00059, "grad_norm": 0.35874953866004944, "grad_norm_var": 0.00399056950783742, "learning_rate": 5e-05, "loss": 0.179, "loss/crossentropy": 2.683705747127533, "loss/hidden": 0.0, "loss/logits": 0.17896704375743866, "loss/reg": 4.138728141784668, "step": 59 }, { "epoch": 0.0006, "grad_norm": 0.3390788435935974, "grad_norm_var": 0.0037128700604173097, "learning_rate": 5e-05, "loss": 0.1824, "loss/crossentropy": 2.6724974513053894, "loss/hidden": 0.0, "loss/logits": 0.18236950412392616, "loss/reg": 4.1345534324646, "step": 60 }, { "epoch": 0.00061, "grad_norm": 0.3341596722602844, "grad_norm_var": 0.003246451116369023, "learning_rate": 5e-05, "loss": 0.1694, "loss/crossentropy": 2.956072986125946, "loss/hidden": 0.0, "loss/logits": 0.16935936734080315, "loss/reg": 4.130521774291992, "step": 61 }, { "epoch": 0.00062, "grad_norm": 0.33658263087272644, "grad_norm_var": 0.0029283974011622186, "learning_rate": 5e-05, "loss": 0.1668, "loss/crossentropy": 2.8409587144851685, "loss/hidden": 0.0, "loss/logits": 0.16678539663553238, "loss/reg": 4.126163005828857, "step": 62 }, { "epoch": 0.00063, "grad_norm": 0.33723217248916626, "grad_norm_var": 0.0024741312804299983, "learning_rate": 5e-05, "loss": 0.1856, "loss/crossentropy": 2.7388935685157776, "loss/hidden": 0.0, "loss/logits": 0.18555545806884766, "loss/reg": 4.121931552886963, "step": 63 }, { "epoch": 0.00064, "grad_norm": 0.34580445289611816, "grad_norm_var": 0.0022020224702210757, "learning_rate": 5e-05, "loss": 0.1658, "loss/crossentropy": 2.6729788780212402, "loss/hidden": 0.0, "loss/logits": 0.16578427329659462, "loss/reg": 4.117753982543945, "step": 64 }, { "epoch": 0.00065, "grad_norm": 0.33867374062538147, "grad_norm_var": 0.0015716415803633144, "learning_rate": 5e-05, "loss": 0.1643, "loss/crossentropy": 2.8432253003120422, "loss/hidden": 0.0, "loss/logits": 0.16425132378935814, "loss/reg": 4.113894939422607, "step": 65 }, { "epoch": 0.00066, "grad_norm": 0.42098623514175415, "grad_norm_var": 0.001778022217079652, "learning_rate": 5e-05, "loss": 0.2155, "loss/crossentropy": 2.6712504625320435, "loss/hidden": 0.0, "loss/logits": 0.21550852805376053, "loss/reg": 4.10945463180542, "step": 66 }, { "epoch": 0.00067, "grad_norm": 0.35403043031692505, "grad_norm_var": 0.0017418631675115888, "learning_rate": 5e-05, "loss": 0.1798, "loss/crossentropy": 2.7415149211883545, "loss/hidden": 0.0, "loss/logits": 0.1797672137618065, "loss/reg": 4.1049418449401855, "step": 67 }, { "epoch": 0.00068, "grad_norm": 0.34834232926368713, "grad_norm_var": 0.0017045350753313, "learning_rate": 5e-05, "loss": 0.1783, "loss/crossentropy": 2.6858341097831726, "loss/hidden": 0.0, "loss/logits": 0.17833665013313293, "loss/reg": 4.100775718688965, "step": 68 }, { "epoch": 0.00069, "grad_norm": 0.3541049063205719, "grad_norm_var": 0.0013731843169029498, "learning_rate": 5e-05, "loss": 0.1744, "loss/crossentropy": 2.8710713982582092, "loss/hidden": 0.0, "loss/logits": 0.1744227409362793, "loss/reg": 4.096506595611572, "step": 69 }, { "epoch": 0.0007, "grad_norm": 0.3736323118209839, "grad_norm_var": 0.0013660110363047928, "learning_rate": 5e-05, "loss": 0.1994, "loss/crossentropy": 2.858128011226654, "loss/hidden": 0.0, "loss/logits": 0.19940509647130966, "loss/reg": 4.091678142547607, "step": 70 }, { "epoch": 0.00071, "grad_norm": 0.33025625348091125, "grad_norm_var": 0.001272272953577754, "learning_rate": 5e-05, "loss": 0.1646, "loss/crossentropy": 2.692229390144348, "loss/hidden": 0.0, "loss/logits": 0.16458340734243393, "loss/reg": 4.087361812591553, "step": 71 }, { "epoch": 0.00072, "grad_norm": 0.6907688975334167, "grad_norm_var": 0.00815051878013667, "learning_rate": 5e-05, "loss": 0.1757, "loss/crossentropy": 2.886055052280426, "loss/hidden": 0.0, "loss/logits": 0.1757429726421833, "loss/reg": 4.08318567276001, "step": 72 }, { "epoch": 0.00073, "grad_norm": 0.3311053514480591, "grad_norm_var": 0.008197602515626375, "learning_rate": 5e-05, "loss": 0.1682, "loss/crossentropy": 2.704796850681305, "loss/hidden": 0.0, "loss/logits": 0.1681583784520626, "loss/reg": 4.079033374786377, "step": 73 }, { "epoch": 0.00074, "grad_norm": 0.3336332142353058, "grad_norm_var": 0.0078012237613196535, "learning_rate": 5e-05, "loss": 0.1689, "loss/crossentropy": 2.6181225776672363, "loss/hidden": 0.0, "loss/logits": 0.16892167925834656, "loss/reg": 4.074740409851074, "step": 74 }, { "epoch": 0.00075, "grad_norm": 0.33766406774520874, "grad_norm_var": 0.007861895340318493, "learning_rate": 5e-05, "loss": 0.1712, "loss/crossentropy": 2.756729245185852, "loss/hidden": 0.0, "loss/logits": 0.17122048512101173, "loss/reg": 4.070303916931152, "step": 75 }, { "epoch": 0.00076, "grad_norm": 0.34048837423324585, "grad_norm_var": 0.007856372064757134, "learning_rate": 5e-05, "loss": 0.1763, "loss/crossentropy": 2.62674218416214, "loss/hidden": 0.0, "loss/logits": 0.17628077790141106, "loss/reg": 4.065893650054932, "step": 76 }, { "epoch": 0.00077, "grad_norm": 0.3368911147117615, "grad_norm_var": 0.007844070912018693, "learning_rate": 5e-05, "loss": 0.1789, "loss/crossentropy": 2.838981509208679, "loss/hidden": 0.0, "loss/logits": 0.17892110347747803, "loss/reg": 4.061193943023682, "step": 77 }, { "epoch": 0.00078, "grad_norm": 0.2983826696872711, "grad_norm_var": 0.008102358070792626, "learning_rate": 5e-05, "loss": 0.151, "loss/crossentropy": 2.8157095909118652, "loss/hidden": 0.0, "loss/logits": 0.15098581835627556, "loss/reg": 4.05631685256958, "step": 78 }, { "epoch": 0.00079, "grad_norm": 0.34036847949028015, "grad_norm_var": 0.008090524798600873, "learning_rate": 5e-05, "loss": 0.1772, "loss/crossentropy": 2.742383122444153, "loss/hidden": 0.0, "loss/logits": 0.17718595638871193, "loss/reg": 4.051788330078125, "step": 79 }, { "epoch": 0.0008, "grad_norm": 0.3196929097175598, "grad_norm_var": 0.008207612908988405, "learning_rate": 5e-05, "loss": 0.1574, "loss/crossentropy": 2.64748877286911, "loss/hidden": 0.0, "loss/logits": 0.15740340948104858, "loss/reg": 4.046438694000244, "step": 80 }, { "epoch": 0.00081, "grad_norm": 0.3145473897457123, "grad_norm_var": 0.008330494258097032, "learning_rate": 5e-05, "loss": 0.1591, "loss/crossentropy": 2.7640033960342407, "loss/hidden": 0.0, "loss/logits": 0.15912048518657684, "loss/reg": 4.041863441467285, "step": 81 }, { "epoch": 0.00082, "grad_norm": 0.37658828496932983, "grad_norm_var": 0.008116681055328008, "learning_rate": 5e-05, "loss": 0.1783, "loss/crossentropy": 2.8226330876350403, "loss/hidden": 0.0, "loss/logits": 0.17833809927105904, "loss/reg": 4.0372796058654785, "step": 82 }, { "epoch": 0.00083, "grad_norm": 0.36421865224838257, "grad_norm_var": 0.00811331907494814, "learning_rate": 5e-05, "loss": 0.1636, "loss/crossentropy": 2.762717604637146, "loss/hidden": 0.0, "loss/logits": 0.16359057649970055, "loss/reg": 4.032177925109863, "step": 83 }, { "epoch": 0.00084, "grad_norm": 0.3138120174407959, "grad_norm_var": 0.00825034262581384, "learning_rate": 5e-05, "loss": 0.1606, "loss/crossentropy": 2.625426709651947, "loss/hidden": 0.0, "loss/logits": 0.16061001271009445, "loss/reg": 4.027446269989014, "step": 84 }, { "epoch": 0.00085, "grad_norm": 0.34441590309143066, "grad_norm_var": 0.00826351514204321, "learning_rate": 5e-05, "loss": 0.1667, "loss/crossentropy": 2.8294222950935364, "loss/hidden": 0.0, "loss/logits": 0.16673466563224792, "loss/reg": 4.022748947143555, "step": 85 }, { "epoch": 0.00086, "grad_norm": 0.316683828830719, "grad_norm_var": 0.00835627592765974, "learning_rate": 5e-05, "loss": 0.1564, "loss/crossentropy": 2.8250383734703064, "loss/hidden": 0.0, "loss/logits": 0.1564498096704483, "loss/reg": 4.017378330230713, "step": 86 }, { "epoch": 0.00087, "grad_norm": 0.3178180456161499, "grad_norm_var": 0.008407967451986308, "learning_rate": 5e-05, "loss": 0.1589, "loss/crossentropy": 2.831330358982086, "loss/hidden": 0.0, "loss/logits": 0.15890633687376976, "loss/reg": 4.012408256530762, "step": 87 }, { "epoch": 0.00088, "grad_norm": 0.33865824341773987, "grad_norm_var": 0.00038455914158520567, "learning_rate": 5e-05, "loss": 0.1665, "loss/crossentropy": 2.8202422857284546, "loss/hidden": 0.0, "loss/logits": 0.16647625714540482, "loss/reg": 4.00655460357666, "step": 88 }, { "epoch": 0.00089, "grad_norm": 0.33375900983810425, "grad_norm_var": 0.00038439593085719167, "learning_rate": 5e-05, "loss": 0.1655, "loss/crossentropy": 2.748092472553253, "loss/hidden": 0.0, "loss/logits": 0.1655096672475338, "loss/reg": 4.000852584838867, "step": 89 }, { "epoch": 0.0009, "grad_norm": 0.41060250997543335, "grad_norm_var": 0.000761403690223957, "learning_rate": 5e-05, "loss": 0.1679, "loss/crossentropy": 2.8519994616508484, "loss/hidden": 0.0, "loss/logits": 0.1679377369582653, "loss/reg": 3.9966533184051514, "step": 90 }, { "epoch": 0.00091, "grad_norm": 0.3349744379520416, "grad_norm_var": 0.0007618998964447029, "learning_rate": 5e-05, "loss": 0.1663, "loss/crossentropy": 2.8302014470100403, "loss/hidden": 0.0, "loss/logits": 0.16629018262028694, "loss/reg": 3.9916272163391113, "step": 91 }, { "epoch": 0.00092, "grad_norm": 0.40859073400497437, "grad_norm_var": 0.0010778266384652254, "learning_rate": 5e-05, "loss": 0.1631, "loss/crossentropy": 2.831357002258301, "loss/hidden": 0.0, "loss/logits": 0.16314184293150902, "loss/reg": 3.9862587451934814, "step": 92 }, { "epoch": 0.00093, "grad_norm": 0.3679395616054535, "grad_norm_var": 0.0011174436691973562, "learning_rate": 5e-05, "loss": 0.1749, "loss/crossentropy": 2.653463125228882, "loss/hidden": 0.0, "loss/logits": 0.17491210997104645, "loss/reg": 3.9809703826904297, "step": 93 }, { "epoch": 0.00094, "grad_norm": 0.33192068338394165, "grad_norm_var": 0.000984578674839117, "learning_rate": 5e-05, "loss": 0.1689, "loss/crossentropy": 2.9128816723823547, "loss/hidden": 0.0, "loss/logits": 0.16890091821551323, "loss/reg": 3.9768238067626953, "step": 94 }, { "epoch": 0.00095, "grad_norm": 0.33981162309646606, "grad_norm_var": 0.000985009641976816, "learning_rate": 5e-05, "loss": 0.1651, "loss/crossentropy": 2.8998738527297974, "loss/hidden": 0.0, "loss/logits": 0.1651129573583603, "loss/reg": 3.9723405838012695, "step": 95 }, { "epoch": 0.00096, "grad_norm": 0.31845277547836304, "grad_norm_var": 0.0009894353533322537, "learning_rate": 5e-05, "loss": 0.1566, "loss/crossentropy": 2.738618314266205, "loss/hidden": 0.0, "loss/logits": 0.15662826597690582, "loss/reg": 3.9680373668670654, "step": 96 }, { "epoch": 0.00097, "grad_norm": 0.3521839678287506, "grad_norm_var": 0.0009211371554959176, "learning_rate": 5e-05, "loss": 0.1571, "loss/crossentropy": 2.896687388420105, "loss/hidden": 0.0, "loss/logits": 0.15710216015577316, "loss/reg": 3.964097499847412, "step": 97 }, { "epoch": 0.00098, "grad_norm": 0.41529935598373413, "grad_norm_var": 0.0011615701056859014, "learning_rate": 5e-05, "loss": 0.1761, "loss/crossentropy": 2.6711183190345764, "loss/hidden": 0.0, "loss/logits": 0.176058791577816, "loss/reg": 3.959585428237915, "step": 98 }, { "epoch": 0.00099, "grad_norm": 0.3406970202922821, "grad_norm_var": 0.0011533483453351997, "learning_rate": 5e-05, "loss": 0.1755, "loss/crossentropy": 2.762200713157654, "loss/hidden": 0.0, "loss/logits": 0.17553818225860596, "loss/reg": 3.9551267623901367, "step": 99 }, { "epoch": 0.001, "grad_norm": 0.3295409083366394, "grad_norm_var": 0.0010948026927074712, "learning_rate": 5e-05, "loss": 0.1791, "loss/crossentropy": 2.666721522808075, "loss/hidden": 0.0, "loss/logits": 0.17914289608597755, "loss/reg": 3.9509167671203613, "step": 100 }, { "epoch": 0.00101, "grad_norm": 0.3429720401763916, "grad_norm_var": 0.001096024238407974, "learning_rate": 5e-05, "loss": 0.1793, "loss/crossentropy": 2.82060843706131, "loss/hidden": 0.0, "loss/logits": 0.1792576014995575, "loss/reg": 3.9469358921051025, "step": 101 }, { "epoch": 0.00102, "grad_norm": 0.3215195834636688, "grad_norm_var": 0.0010760084324249537, "learning_rate": 5e-05, "loss": 0.1632, "loss/crossentropy": 2.808405876159668, "loss/hidden": 0.0, "loss/logits": 0.16316882148385048, "loss/reg": 3.943436622619629, "step": 102 }, { "epoch": 0.00103, "grad_norm": 0.33158427476882935, "grad_norm_var": 0.0010282390377130302, "learning_rate": 5e-05, "loss": 0.1783, "loss/crossentropy": 2.8497248888015747, "loss/hidden": 0.0, "loss/logits": 0.1783306896686554, "loss/reg": 3.9394803047180176, "step": 103 }, { "epoch": 0.00104, "grad_norm": 0.3384368121623993, "grad_norm_var": 0.001028611107856688, "learning_rate": 5e-05, "loss": 0.1773, "loss/crossentropy": 2.8479551672935486, "loss/hidden": 0.0, "loss/logits": 0.17731818184256554, "loss/reg": 3.935678243637085, "step": 104 }, { "epoch": 0.00105, "grad_norm": 0.3275454342365265, "grad_norm_var": 0.0010454262321925218, "learning_rate": 5e-05, "loss": 0.172, "loss/crossentropy": 2.7240310311317444, "loss/hidden": 0.0, "loss/logits": 0.17204875499010086, "loss/reg": 3.932224750518799, "step": 105 }, { "epoch": 0.00106, "grad_norm": 0.3352244198322296, "grad_norm_var": 0.0007990449288615142, "learning_rate": 5e-05, "loss": 0.1687, "loss/crossentropy": 2.657980978488922, "loss/hidden": 0.0, "loss/logits": 0.16869833320379257, "loss/reg": 3.92889142036438, "step": 106 }, { "epoch": 0.00107, "grad_norm": 0.3195781409740448, "grad_norm_var": 0.00083658300653268, "learning_rate": 5e-05, "loss": 0.1642, "loss/crossentropy": 2.7351735830307007, "loss/hidden": 0.0, "loss/logits": 0.16421591117978096, "loss/reg": 3.9260904788970947, "step": 107 }, { "epoch": 0.00108, "grad_norm": 0.3216703534126282, "grad_norm_var": 0.0005727423089818255, "learning_rate": 5e-05, "loss": 0.1611, "loss/crossentropy": 2.835266649723053, "loss/hidden": 0.0, "loss/logits": 0.1611352562904358, "loss/reg": 3.923356533050537, "step": 108 }, { "epoch": 0.00109, "grad_norm": 0.3534785807132721, "grad_norm_var": 0.0005312635552543169, "learning_rate": 5e-05, "loss": 0.1689, "loss/crossentropy": 2.8821677565574646, "loss/hidden": 0.0, "loss/logits": 0.1689467802643776, "loss/reg": 3.9208316802978516, "step": 109 }, { "epoch": 0.0011, "grad_norm": 0.33851271867752075, "grad_norm_var": 0.0005279815580263729, "learning_rate": 5e-05, "loss": 0.171, "loss/crossentropy": 2.7201637029647827, "loss/hidden": 0.0, "loss/logits": 0.17095838487148285, "loss/reg": 3.918743133544922, "step": 110 }, { "epoch": 0.00111, "grad_norm": 0.32998839020729065, "grad_norm_var": 0.0005331548233647158, "learning_rate": 5e-05, "loss": 0.166, "loss/crossentropy": 2.6836928725242615, "loss/hidden": 0.0, "loss/logits": 0.16604754701256752, "loss/reg": 3.914886951446533, "step": 111 }, { "epoch": 0.00112, "grad_norm": 0.420744925737381, "grad_norm_var": 0.0009131281860373264, "learning_rate": 5e-05, "loss": 0.1738, "loss/crossentropy": 2.568650722503662, "loss/hidden": 0.0, "loss/logits": 0.1737859919667244, "loss/reg": 3.9106812477111816, "step": 112 }, { "epoch": 0.00113, "grad_norm": 0.3349835276603699, "grad_norm_var": 0.000914996833659265, "learning_rate": 5e-05, "loss": 0.1522, "loss/crossentropy": 2.7411792278289795, "loss/hidden": 0.0, "loss/logits": 0.15223057195544243, "loss/reg": 3.9069032669067383, "step": 113 }, { "epoch": 0.00114, "grad_norm": 0.34276068210601807, "grad_norm_var": 0.0005529241807124034, "learning_rate": 5e-05, "loss": 0.1567, "loss/crossentropy": 2.80877947807312, "loss/hidden": 0.0, "loss/logits": 0.1567244492471218, "loss/reg": 3.90332293510437, "step": 114 }, { "epoch": 0.00115, "grad_norm": 0.35375383496284485, "grad_norm_var": 0.0005659636539689298, "learning_rate": 5e-05, "loss": 0.1657, "loss/crossentropy": 2.698065936565399, "loss/hidden": 0.0, "loss/logits": 0.16574329882860184, "loss/reg": 3.8998756408691406, "step": 115 }, { "epoch": 0.00116, "grad_norm": 0.33278602361679077, "grad_norm_var": 0.0005620343134485931, "learning_rate": 5e-05, "loss": 0.1739, "loss/crossentropy": 2.7814364433288574, "loss/hidden": 0.0, "loss/logits": 0.17385346069931984, "loss/reg": 3.8964290618896484, "step": 116 }, { "epoch": 0.00117, "grad_norm": 0.35139891505241394, "grad_norm_var": 0.0005694228893132684, "learning_rate": 5e-05, "loss": 0.1701, "loss/crossentropy": 2.7721198201179504, "loss/hidden": 0.0, "loss/logits": 0.1701316274702549, "loss/reg": 3.8925936222076416, "step": 117 }, { "epoch": 0.00118, "grad_norm": 0.3708522915840149, "grad_norm_var": 0.0005942298534055627, "learning_rate": 5e-05, "loss": 0.1723, "loss/crossentropy": 2.8753750920295715, "loss/hidden": 0.0, "loss/logits": 0.17226434499025345, "loss/reg": 3.888739824295044, "step": 118 }, { "epoch": 0.00119, "grad_norm": 0.32619452476501465, "grad_norm_var": 0.0006049363247454272, "learning_rate": 5e-05, "loss": 0.1559, "loss/crossentropy": 2.792622923851013, "loss/hidden": 0.0, "loss/logits": 0.15585486218333244, "loss/reg": 3.8849120140075684, "step": 119 }, { "epoch": 0.0012, "grad_norm": 0.3160404562950134, "grad_norm_var": 0.0006517621123632485, "learning_rate": 5e-05, "loss": 0.1657, "loss/crossentropy": 2.833389937877655, "loss/hidden": 0.0, "loss/logits": 0.16574294120073318, "loss/reg": 3.8814921379089355, "step": 120 }, { "epoch": 0.00121, "grad_norm": 2.6332755088806152, "grad_norm_var": 0.328414929277446, "learning_rate": 5e-05, "loss": 0.2807, "loss/crossentropy": 2.960978329181671, "loss/hidden": 0.0, "loss/logits": 0.280683059245348, "loss/reg": 3.8778162002563477, "step": 121 }, { "epoch": 0.00122, "grad_norm": 0.39280807971954346, "grad_norm_var": 0.32746202761424736, "learning_rate": 5e-05, "loss": 0.1791, "loss/crossentropy": 2.8656354546546936, "loss/hidden": 0.0, "loss/logits": 0.17905254289507866, "loss/reg": 3.8742706775665283, "step": 122 }, { "epoch": 0.00123, "grad_norm": 0.36644095182418823, "grad_norm_var": 0.3265348837601918, "learning_rate": 5e-05, "loss": 0.1765, "loss/crossentropy": 2.776346266269684, "loss/hidden": 0.0, "loss/logits": 0.1764557734131813, "loss/reg": 3.8701822757720947, "step": 123 }, { "epoch": 0.00124, "grad_norm": 0.39717525243759155, "grad_norm_var": 0.3251678188828664, "learning_rate": 5e-05, "loss": 0.1796, "loss/crossentropy": 2.9204375743865967, "loss/hidden": 0.0, "loss/logits": 0.1796155981719494, "loss/reg": 3.866316556930542, "step": 124 }, { "epoch": 0.00125, "grad_norm": 0.366623193025589, "grad_norm_var": 0.3249260727271075, "learning_rate": 5e-05, "loss": 0.1654, "loss/crossentropy": 2.42034849524498, "loss/hidden": 0.0, "loss/logits": 0.165392205119133, "loss/reg": 3.8625807762145996, "step": 125 }, { "epoch": 0.00126, "grad_norm": 0.3638598918914795, "grad_norm_var": 0.32442588175429127, "learning_rate": 5e-05, "loss": 0.1601, "loss/crossentropy": 2.936553716659546, "loss/hidden": 0.0, "loss/logits": 0.16014225035905838, "loss/reg": 3.8585283756256104, "step": 126 }, { "epoch": 0.00127, "grad_norm": 0.3437521159648895, "grad_norm_var": 0.3241257586372512, "learning_rate": 5e-05, "loss": 0.1603, "loss/crossentropy": 2.8428520560264587, "loss/hidden": 0.0, "loss/logits": 0.16030794754624367, "loss/reg": 3.854602813720703, "step": 127 }, { "epoch": 0.00128, "grad_norm": 0.3604683578014374, "grad_norm_var": 0.3249965569466151, "learning_rate": 5e-05, "loss": 0.1688, "loss/crossentropy": 2.717309355735779, "loss/hidden": 0.0, "loss/logits": 0.1687549129128456, "loss/reg": 3.85067081451416, "step": 128 }, { "epoch": 0.00129, "grad_norm": 0.3499651849269867, "grad_norm_var": 0.32468680185211135, "learning_rate": 5e-05, "loss": 0.1748, "loss/crossentropy": 2.819560468196869, "loss/hidden": 0.0, "loss/logits": 0.17475899681448936, "loss/reg": 3.8467037677764893, "step": 129 }, { "epoch": 0.0013, "grad_norm": 0.3231496512889862, "grad_norm_var": 0.32511678466571453, "learning_rate": 5e-05, "loss": 0.1695, "loss/crossentropy": 2.5843223929405212, "loss/hidden": 0.0, "loss/logits": 0.16951489821076393, "loss/reg": 3.843282699584961, "step": 130 }, { "epoch": 0.00131, "grad_norm": 0.3588982820510864, "grad_norm_var": 0.325020330590364, "learning_rate": 5e-05, "loss": 0.169, "loss/crossentropy": 2.725651264190674, "loss/hidden": 0.0, "loss/logits": 0.16896183416247368, "loss/reg": 3.839895725250244, "step": 131 }, { "epoch": 0.00132, "grad_norm": 0.37743306159973145, "grad_norm_var": 0.32416673149153025, "learning_rate": 5e-05, "loss": 0.1833, "loss/crossentropy": 3.0410608053207397, "loss/hidden": 0.0, "loss/logits": 0.1833292953670025, "loss/reg": 3.8355963230133057, "step": 132 }, { "epoch": 0.00133, "grad_norm": 0.32988330721855164, "grad_norm_var": 0.32462166470000664, "learning_rate": 5e-05, "loss": 0.1654, "loss/crossentropy": 2.7005507349967957, "loss/hidden": 0.0, "loss/logits": 0.1653790920972824, "loss/reg": 3.831345558166504, "step": 133 }, { "epoch": 0.00134, "grad_norm": 0.35988613963127136, "grad_norm_var": 0.32481589623167567, "learning_rate": 5e-05, "loss": 0.1792, "loss/crossentropy": 2.7048683762550354, "loss/hidden": 0.0, "loss/logits": 0.17917973920702934, "loss/reg": 3.8267781734466553, "step": 134 }, { "epoch": 0.00135, "grad_norm": 0.32649827003479004, "grad_norm_var": 0.324808949416691, "learning_rate": 5e-05, "loss": 0.1642, "loss/crossentropy": 2.791461765766144, "loss/hidden": 0.0, "loss/logits": 0.16420895606279373, "loss/reg": 3.8223133087158203, "step": 135 }, { "epoch": 0.00136, "grad_norm": 0.6779212355613708, "grad_norm_var": 0.32421967313153754, "learning_rate": 5e-05, "loss": 0.2361, "loss/crossentropy": 3.063343107700348, "loss/hidden": 0.0, "loss/logits": 0.2360655590891838, "loss/reg": 3.818582057952881, "step": 136 }, { "epoch": 0.00137, "grad_norm": 0.4217770993709564, "grad_norm_var": 0.0069040846383882, "learning_rate": 5e-05, "loss": 0.1936, "loss/crossentropy": 2.8291149735450745, "loss/hidden": 0.0, "loss/logits": 0.19361505657434464, "loss/reg": 3.814713716506958, "step": 137 }, { "epoch": 0.00138, "grad_norm": 0.3183574378490448, "grad_norm_var": 0.0071460434004817905, "learning_rate": 5e-05, "loss": 0.1596, "loss/crossentropy": 2.733646512031555, "loss/hidden": 0.0, "loss/logits": 0.15959006920456886, "loss/reg": 3.8112361431121826, "step": 138 }, { "epoch": 0.00139, "grad_norm": 0.35119444131851196, "grad_norm_var": 0.007183318962822194, "learning_rate": 5e-05, "loss": 0.1706, "loss/crossentropy": 2.777931809425354, "loss/hidden": 0.0, "loss/logits": 0.17056189104914665, "loss/reg": 3.807130813598633, "step": 139 }, { "epoch": 0.0014, "grad_norm": 0.3381962478160858, "grad_norm_var": 0.007239536480815012, "learning_rate": 5e-05, "loss": 0.1651, "loss/crossentropy": 2.865752935409546, "loss/hidden": 0.0, "loss/logits": 0.16511252894997597, "loss/reg": 3.8030734062194824, "step": 140 }, { "epoch": 0.00141, "grad_norm": 0.35082533955574036, "grad_norm_var": 0.007268548808216302, "learning_rate": 5e-05, "loss": 0.1608, "loss/crossentropy": 2.734546184539795, "loss/hidden": 0.0, "loss/logits": 0.16080284118652344, "loss/reg": 3.7996251583099365, "step": 141 }, { "epoch": 0.00142, "grad_norm": 0.4269000291824341, "grad_norm_var": 0.007448472313405929, "learning_rate": 5e-05, "loss": 0.1806, "loss/crossentropy": 2.9227113127708435, "loss/hidden": 0.0, "loss/logits": 0.1805506870150566, "loss/reg": 3.7955057621002197, "step": 142 }, { "epoch": 0.00143, "grad_norm": 0.3532395660877228, "grad_norm_var": 0.0074133753520221855, "learning_rate": 5e-05, "loss": 0.1588, "loss/crossentropy": 2.9407125115394592, "loss/hidden": 0.0, "loss/logits": 0.15880529955029488, "loss/reg": 3.791508197784424, "step": 143 }, { "epoch": 0.00144, "grad_norm": 0.3449239134788513, "grad_norm_var": 0.007461781173789813, "learning_rate": 5e-05, "loss": 0.1652, "loss/crossentropy": 2.8305121660232544, "loss/hidden": 0.0, "loss/logits": 0.1652398444712162, "loss/reg": 3.7871744632720947, "step": 144 }, { "epoch": 0.00145, "grad_norm": 0.3272966742515564, "grad_norm_var": 0.007571273873210712, "learning_rate": 5e-05, "loss": 0.1714, "loss/crossentropy": 2.876939594745636, "loss/hidden": 0.0, "loss/logits": 0.17143940553069115, "loss/reg": 3.7832887172698975, "step": 145 }, { "epoch": 0.00146, "grad_norm": 0.31960922479629517, "grad_norm_var": 0.007596131782178968, "learning_rate": 5e-05, "loss": 0.1558, "loss/crossentropy": 2.7597694993019104, "loss/hidden": 0.0, "loss/logits": 0.15579523891210556, "loss/reg": 3.77976393699646, "step": 146 }, { "epoch": 0.00147, "grad_norm": 0.3329758048057556, "grad_norm_var": 0.007690076208493398, "learning_rate": 5e-05, "loss": 0.1602, "loss/crossentropy": 2.823091506958008, "loss/hidden": 0.0, "loss/logits": 0.16016652062535286, "loss/reg": 3.776364326477051, "step": 147 }, { "epoch": 0.00148, "grad_norm": 0.3245135545730591, "grad_norm_var": 0.007828939248271782, "learning_rate": 5e-05, "loss": 0.1608, "loss/crossentropy": 2.622242748737335, "loss/hidden": 0.0, "loss/logits": 0.16081608831882477, "loss/reg": 3.7724997997283936, "step": 148 }, { "epoch": 0.00149, "grad_norm": 0.3239537179470062, "grad_norm_var": 0.007862062788276463, "learning_rate": 5e-05, "loss": 0.1559, "loss/crossentropy": 2.826173484325409, "loss/hidden": 0.0, "loss/logits": 0.15591008588671684, "loss/reg": 3.7680001258850098, "step": 149 }, { "epoch": 0.0015, "grad_norm": 0.3199516534805298, "grad_norm_var": 0.00800828926831548, "learning_rate": 5e-05, "loss": 0.1705, "loss/crossentropy": 2.73406845331192, "loss/hidden": 0.0, "loss/logits": 0.17048393934965134, "loss/reg": 3.7640268802642822, "step": 150 }, { "epoch": 0.00151, "grad_norm": 0.3810157775878906, "grad_norm_var": 0.00790594146931481, "learning_rate": 5e-05, "loss": 0.1772, "loss/crossentropy": 2.746786952018738, "loss/hidden": 0.0, "loss/logits": 0.17715823650360107, "loss/reg": 3.760627508163452, "step": 151 }, { "epoch": 0.00152, "grad_norm": 0.33840498328208923, "grad_norm_var": 0.0011503711202599262, "learning_rate": 5e-05, "loss": 0.168, "loss/crossentropy": 2.7671576738357544, "loss/hidden": 0.0, "loss/logits": 0.1679898537695408, "loss/reg": 3.7571685314178467, "step": 152 }, { "epoch": 0.00153, "grad_norm": 0.35103219747543335, "grad_norm_var": 0.0007702874374444798, "learning_rate": 5e-05, "loss": 0.1683, "loss/crossentropy": 2.8394588828086853, "loss/hidden": 0.0, "loss/logits": 0.1682782731950283, "loss/reg": 3.7533957958221436, "step": 153 }, { "epoch": 0.00154, "grad_norm": 0.34948527812957764, "grad_norm_var": 0.000724837481797543, "learning_rate": 5e-05, "loss": 0.1551, "loss/crossentropy": 2.637475073337555, "loss/hidden": 0.0, "loss/logits": 0.1551469974219799, "loss/reg": 3.7496984004974365, "step": 154 }, { "epoch": 0.00155, "grad_norm": 0.32411250472068787, "grad_norm_var": 0.000751360146424022, "learning_rate": 5e-05, "loss": 0.1655, "loss/crossentropy": 2.65782767534256, "loss/hidden": 0.0, "loss/logits": 0.16551653295755386, "loss/reg": 3.7462174892425537, "step": 155 }, { "epoch": 0.00156, "grad_norm": 0.3659244775772095, "grad_norm_var": 0.0007773935392291246, "learning_rate": 5e-05, "loss": 0.1618, "loss/crossentropy": 2.8054139614105225, "loss/hidden": 0.0, "loss/logits": 0.16182733327150345, "loss/reg": 3.7422730922698975, "step": 156 }, { "epoch": 0.00157, "grad_norm": 0.3639696538448334, "grad_norm_var": 0.0007968496539047743, "learning_rate": 5e-05, "loss": 0.172, "loss/crossentropy": 2.643721103668213, "loss/hidden": 0.0, "loss/logits": 0.17196981981396675, "loss/reg": 3.7390189170837402, "step": 157 }, { "epoch": 0.00158, "grad_norm": 0.372111439704895, "grad_norm_var": 0.0003986384080602812, "learning_rate": 5e-05, "loss": 0.1752, "loss/crossentropy": 2.6860750317573547, "loss/hidden": 0.0, "loss/logits": 0.17522458359599113, "loss/reg": 3.7351748943328857, "step": 158 }, { "epoch": 0.00159, "grad_norm": 0.3412966728210449, "grad_norm_var": 0.0003916975034196302, "learning_rate": 5e-05, "loss": 0.1732, "loss/crossentropy": 2.7506829500198364, "loss/hidden": 0.0, "loss/logits": 0.17320549115538597, "loss/reg": 3.731645345687866, "step": 159 }, { "epoch": 0.0016, "grad_norm": 0.31508323550224304, "grad_norm_var": 0.0004378510847698321, "learning_rate": 5e-05, "loss": 0.1676, "loss/crossentropy": 2.672293782234192, "loss/hidden": 0.0, "loss/logits": 0.16758090257644653, "loss/reg": 3.727598190307617, "step": 160 }, { "epoch": 0.00161, "grad_norm": 0.39773106575012207, "grad_norm_var": 0.0006223116385708161, "learning_rate": 5e-05, "loss": 0.1867, "loss/crossentropy": 2.975751519203186, "loss/hidden": 0.0, "loss/logits": 0.18666821345686913, "loss/reg": 3.7237842082977295, "step": 161 }, { "epoch": 0.00162, "grad_norm": 0.3057797849178314, "grad_norm_var": 0.0006812186499233134, "learning_rate": 5e-05, "loss": 0.1511, "loss/crossentropy": 2.768982172012329, "loss/hidden": 0.0, "loss/logits": 0.15112394466996193, "loss/reg": 3.7201473712921143, "step": 162 }, { "epoch": 0.00163, "grad_norm": 0.39109617471694946, "grad_norm_var": 0.0008052929738533592, "learning_rate": 5e-05, "loss": 0.1692, "loss/crossentropy": 2.7556854486465454, "loss/hidden": 0.0, "loss/logits": 0.1692204400897026, "loss/reg": 3.715847969055176, "step": 163 }, { "epoch": 0.00164, "grad_norm": 0.3230038285255432, "grad_norm_var": 0.0008101312463145642, "learning_rate": 5e-05, "loss": 0.158, "loss/crossentropy": 2.663906216621399, "loss/hidden": 0.0, "loss/logits": 0.1579984687268734, "loss/reg": 3.712200403213501, "step": 164 }, { "epoch": 0.00165, "grad_norm": 0.32820436358451843, "grad_norm_var": 0.0007977755717131292, "learning_rate": 5e-05, "loss": 0.1535, "loss/crossentropy": 2.7556238174438477, "loss/hidden": 0.0, "loss/logits": 0.15348907560110092, "loss/reg": 3.7093729972839355, "step": 165 }, { "epoch": 0.00166, "grad_norm": 0.37247714400291443, "grad_norm_var": 0.0007736858685811421, "learning_rate": 5e-05, "loss": 0.168, "loss/crossentropy": 2.623964309692383, "loss/hidden": 0.0, "loss/logits": 0.16797634214162827, "loss/reg": 3.7055835723876953, "step": 166 }, { "epoch": 0.00167, "grad_norm": 0.31921809911727905, "grad_norm_var": 0.0007674848471050747, "learning_rate": 5e-05, "loss": 0.1618, "loss/crossentropy": 2.6233983039855957, "loss/hidden": 0.0, "loss/logits": 0.16180693730711937, "loss/reg": 3.7018704414367676, "step": 167 }, { "epoch": 0.00168, "grad_norm": 0.41518375277519226, "grad_norm_var": 0.0010434978692974088, "learning_rate": 5e-05, "loss": 0.1842, "loss/crossentropy": 2.794585347175598, "loss/hidden": 0.0, "loss/logits": 0.18423354998230934, "loss/reg": 3.6984987258911133, "step": 168 }, { "epoch": 0.00169, "grad_norm": 0.3530808985233307, "grad_norm_var": 0.0010434324942960296, "learning_rate": 5e-05, "loss": 0.1818, "loss/crossentropy": 2.725895941257477, "loss/hidden": 0.0, "loss/logits": 0.18175217881798744, "loss/reg": 3.6949737071990967, "step": 169 }, { "epoch": 0.0017, "grad_norm": 0.35729339718818665, "grad_norm_var": 0.001044250197534243, "learning_rate": 5e-05, "loss": 0.1758, "loss/crossentropy": 2.8144423365592957, "loss/hidden": 0.0, "loss/logits": 0.1758369542658329, "loss/reg": 3.6909079551696777, "step": 170 }, { "epoch": 0.00171, "grad_norm": 0.3258056044578552, "grad_norm_var": 0.0010379424391956011, "learning_rate": 5e-05, "loss": 0.1615, "loss/crossentropy": 2.6860609650611877, "loss/hidden": 0.0, "loss/logits": 0.1615053378045559, "loss/reg": 3.6872336864471436, "step": 171 }, { "epoch": 0.00172, "grad_norm": 0.3320024907588959, "grad_norm_var": 0.0010511954351829684, "learning_rate": 5e-05, "loss": 0.1669, "loss/crossentropy": 2.7618680596351624, "loss/hidden": 0.0, "loss/logits": 0.16686224937438965, "loss/reg": 3.684033155441284, "step": 172 }, { "epoch": 0.00173, "grad_norm": 0.32370057702064514, "grad_norm_var": 0.001082015111668518, "learning_rate": 5e-05, "loss": 0.1568, "loss/crossentropy": 2.8911356329917908, "loss/hidden": 0.0, "loss/logits": 0.15675026923418045, "loss/reg": 3.6797101497650146, "step": 173 }, { "epoch": 0.00174, "grad_norm": 0.3590388298034668, "grad_norm_var": 0.0010512214971074684, "learning_rate": 5e-05, "loss": 0.1608, "loss/crossentropy": 2.894763946533203, "loss/hidden": 0.0, "loss/logits": 0.16077794507145882, "loss/reg": 3.676694393157959, "step": 174 }, { "epoch": 0.00175, "grad_norm": 0.362693190574646, "grad_norm_var": 0.0010621381304175893, "learning_rate": 5e-05, "loss": 0.181, "loss/crossentropy": 2.9355967044830322, "loss/hidden": 0.0, "loss/logits": 0.18095535412430763, "loss/reg": 3.6728715896606445, "step": 175 }, { "epoch": 0.00176, "grad_norm": 0.3421201705932617, "grad_norm_var": 0.0009861454944628978, "learning_rate": 5e-05, "loss": 0.1752, "loss/crossentropy": 2.771928548812866, "loss/hidden": 0.0, "loss/logits": 0.17522436380386353, "loss/reg": 3.6698157787323, "step": 176 }, { "epoch": 0.00177, "grad_norm": 0.3921768069267273, "grad_norm_var": 0.0009531156716223066, "learning_rate": 5e-05, "loss": 0.1682, "loss/crossentropy": 2.9020140171051025, "loss/hidden": 0.0, "loss/logits": 0.16816864535212517, "loss/reg": 3.6669130325317383, "step": 177 }, { "epoch": 0.00178, "grad_norm": 0.414460688829422, "grad_norm_var": 0.0010479472090343092, "learning_rate": 5e-05, "loss": 0.1652, "loss/crossentropy": 2.871070384979248, "loss/hidden": 0.0, "loss/logits": 0.1651761755347252, "loss/reg": 3.6637353897094727, "step": 178 }, { "epoch": 0.00179, "grad_norm": 0.37821123003959656, "grad_norm_var": 0.0009996989224075473, "learning_rate": 5e-05, "loss": 0.1618, "loss/crossentropy": 2.8318552374839783, "loss/hidden": 0.0, "loss/logits": 0.16177014261484146, "loss/reg": 3.6601526737213135, "step": 179 }, { "epoch": 0.0018, "grad_norm": 0.33756861090660095, "grad_norm_var": 0.0009485554235717804, "learning_rate": 5e-05, "loss": 0.164, "loss/crossentropy": 2.7179840803146362, "loss/hidden": 0.0, "loss/logits": 0.16402991488575935, "loss/reg": 3.655977725982666, "step": 180 }, { "epoch": 0.00181, "grad_norm": 0.3508152663707733, "grad_norm_var": 0.0008934631549546879, "learning_rate": 5e-05, "loss": 0.1824, "loss/crossentropy": 2.655538856983185, "loss/hidden": 0.0, "loss/logits": 0.18240001425147057, "loss/reg": 3.6522390842437744, "step": 181 }, { "epoch": 0.00182, "grad_norm": 0.4800889194011688, "grad_norm_var": 0.0018179163356779901, "learning_rate": 5e-05, "loss": 0.1773, "loss/crossentropy": 2.9170504808425903, "loss/hidden": 0.0, "loss/logits": 0.17730093747377396, "loss/reg": 3.648420810699463, "step": 182 }, { "epoch": 0.00183, "grad_norm": 0.32715606689453125, "grad_norm_var": 0.0017731703957083382, "learning_rate": 5e-05, "loss": 0.1599, "loss/crossentropy": 2.6978230476379395, "loss/hidden": 0.0, "loss/logits": 0.15988203510642052, "loss/reg": 3.644439458847046, "step": 183 }, { "epoch": 0.00184, "grad_norm": 0.3219493329524994, "grad_norm_var": 0.0017014689354580615, "learning_rate": 5e-05, "loss": 0.1588, "loss/crossentropy": 2.772395610809326, "loss/hidden": 0.0, "loss/logits": 0.158803328871727, "loss/reg": 3.6410605907440186, "step": 184 }, { "epoch": 0.00185, "grad_norm": 0.3204100728034973, "grad_norm_var": 0.0017978203455529696, "learning_rate": 5e-05, "loss": 0.1595, "loss/crossentropy": 2.7290788292884827, "loss/hidden": 0.0, "loss/logits": 0.15948805212974548, "loss/reg": 3.637272596359253, "step": 185 }, { "epoch": 0.00186, "grad_norm": 0.34646865725517273, "grad_norm_var": 0.0018059373173852718, "learning_rate": 5e-05, "loss": 0.1723, "loss/crossentropy": 2.68435937166214, "loss/hidden": 0.0, "loss/logits": 0.17226089164614677, "loss/reg": 3.6334545612335205, "step": 186 }, { "epoch": 0.00187, "grad_norm": 0.35515356063842773, "grad_norm_var": 0.001737051018651666, "learning_rate": 5e-05, "loss": 0.1656, "loss/crossentropy": 2.8159299492836, "loss/hidden": 0.0, "loss/logits": 0.16557194665074348, "loss/reg": 3.629215717315674, "step": 187 }, { "epoch": 0.00188, "grad_norm": 0.31605055928230286, "grad_norm_var": 0.0018103786054489293, "learning_rate": 5e-05, "loss": 0.1587, "loss/crossentropy": 2.737620174884796, "loss/hidden": 0.0, "loss/logits": 0.15867746248841286, "loss/reg": 3.6252663135528564, "step": 188 }, { "epoch": 0.00189, "grad_norm": 0.3383916914463043, "grad_norm_var": 0.0017566740185558556, "learning_rate": 5e-05, "loss": 0.1621, "loss/crossentropy": 2.7829577326774597, "loss/hidden": 0.0, "loss/logits": 0.162098228931427, "loss/reg": 3.621067523956299, "step": 189 }, { "epoch": 0.0019, "grad_norm": 0.4556836783885956, "grad_norm_var": 0.0023419423247556044, "learning_rate": 5e-05, "loss": 0.1687, "loss/crossentropy": 2.9624626636505127, "loss/hidden": 0.0, "loss/logits": 0.1686898171901703, "loss/reg": 3.6163265705108643, "step": 190 }, { "epoch": 0.00191, "grad_norm": 0.3975931406021118, "grad_norm_var": 0.0024075083289669527, "learning_rate": 5e-05, "loss": 0.155, "loss/crossentropy": 2.731001079082489, "loss/hidden": 0.0, "loss/logits": 0.15501929074525833, "loss/reg": 3.6121585369110107, "step": 191 }, { "epoch": 0.00192, "grad_norm": 0.37328633666038513, "grad_norm_var": 0.002364231645687964, "learning_rate": 5e-05, "loss": 0.1683, "loss/crossentropy": 2.754942238330841, "loss/hidden": 0.0, "loss/logits": 0.16829831898212433, "loss/reg": 3.6073873043060303, "step": 192 }, { "epoch": 0.00193, "grad_norm": 0.3342723250389099, "grad_norm_var": 0.0023955576435807737, "learning_rate": 5e-05, "loss": 0.1663, "loss/crossentropy": 2.7424720525741577, "loss/hidden": 0.0, "loss/logits": 0.16633369401097298, "loss/reg": 3.6036906242370605, "step": 193 }, { "epoch": 0.00194, "grad_norm": 0.38286155462265015, "grad_norm_var": 0.002251566346172081, "learning_rate": 5e-05, "loss": 0.1652, "loss/crossentropy": 2.9778133630752563, "loss/hidden": 0.0, "loss/logits": 0.16522743180394173, "loss/reg": 3.600316286087036, "step": 194 }, { "epoch": 0.00195, "grad_norm": 0.36051952838897705, "grad_norm_var": 0.0022364206403587715, "learning_rate": 5e-05, "loss": 0.1772, "loss/crossentropy": 2.6842609643936157, "loss/hidden": 0.0, "loss/logits": 0.1771794743835926, "loss/reg": 3.596491813659668, "step": 195 }, { "epoch": 0.00196, "grad_norm": 0.3526027202606201, "grad_norm_var": 0.0022007878333510996, "learning_rate": 5e-05, "loss": 0.1561, "loss/crossentropy": 2.7837477922439575, "loss/hidden": 0.0, "loss/logits": 0.15605639293789864, "loss/reg": 3.5926032066345215, "step": 196 }, { "epoch": 0.00197, "grad_norm": 0.35895583033561707, "grad_norm_var": 0.002191344445433652, "learning_rate": 5e-05, "loss": 0.1801, "loss/crossentropy": 2.85478812456131, "loss/hidden": 0.0, "loss/logits": 0.1800978109240532, "loss/reg": 3.589280843734741, "step": 197 }, { "epoch": 0.00198, "grad_norm": 0.3372839689254761, "grad_norm_var": 0.0012524713862786308, "learning_rate": 5e-05, "loss": 0.1571, "loss/crossentropy": 2.805725872516632, "loss/hidden": 0.0, "loss/logits": 0.15710647776722908, "loss/reg": 3.5851662158966064, "step": 198 }, { "epoch": 0.00199, "grad_norm": 0.33652499318122864, "grad_norm_var": 0.0012232813247675149, "learning_rate": 5e-05, "loss": 0.1652, "loss/crossentropy": 2.657254457473755, "loss/hidden": 0.0, "loss/logits": 0.1651643067598343, "loss/reg": 3.581798553466797, "step": 199 }, { "epoch": 0.002, "grad_norm": 0.36757001280784607, "grad_norm_var": 0.001149275638629573, "learning_rate": 5e-05, "loss": 0.1756, "loss/crossentropy": 2.7496553659439087, "loss/hidden": 0.0, "loss/logits": 0.17555997148156166, "loss/reg": 3.577878475189209, "step": 200 }, { "epoch": 0.00201, "grad_norm": 0.4317435324192047, "grad_norm_var": 0.0013607474972908151, "learning_rate": 5e-05, "loss": 0.1643, "loss/crossentropy": 3.168861448764801, "loss/hidden": 0.0, "loss/logits": 0.164311021566391, "loss/reg": 3.5741024017333984, "step": 201 }, { "epoch": 0.00202, "grad_norm": 0.3569833040237427, "grad_norm_var": 0.0013412425012825579, "learning_rate": 5e-05, "loss": 0.1778, "loss/crossentropy": 2.7941558957099915, "loss/hidden": 0.0, "loss/logits": 0.17778108268976212, "loss/reg": 3.5706787109375, "step": 202 }, { "epoch": 0.00203, "grad_norm": 0.31648150086402893, "grad_norm_var": 0.0014904716039333447, "learning_rate": 5e-05, "loss": 0.156, "loss/crossentropy": 2.872058689594269, "loss/hidden": 0.0, "loss/logits": 0.1559964008629322, "loss/reg": 3.5671305656433105, "step": 203 }, { "epoch": 0.00204, "grad_norm": 0.32686129212379456, "grad_norm_var": 0.0014293085106024154, "learning_rate": 5e-05, "loss": 0.1593, "loss/crossentropy": 2.7316592931747437, "loss/hidden": 0.0, "loss/logits": 0.15925980731844902, "loss/reg": 3.5632758140563965, "step": 204 }, { "epoch": 0.00205, "grad_norm": 0.3191937506198883, "grad_norm_var": 0.001518472211395964, "learning_rate": 5e-05, "loss": 0.1527, "loss/crossentropy": 2.7802085876464844, "loss/hidden": 0.0, "loss/logits": 0.15268265083432198, "loss/reg": 3.559633493423462, "step": 205 }, { "epoch": 0.00206, "grad_norm": 0.34924882650375366, "grad_norm_var": 0.0009115629505157467, "learning_rate": 5e-05, "loss": 0.1773, "loss/crossentropy": 2.792604923248291, "loss/hidden": 0.0, "loss/logits": 0.17729893326759338, "loss/reg": 3.555882453918457, "step": 206 }, { "epoch": 0.00207, "grad_norm": 0.38204553723335266, "grad_norm_var": 0.0008412229229646054, "learning_rate": 5e-05, "loss": 0.1735, "loss/crossentropy": 2.729912281036377, "loss/hidden": 0.0, "loss/logits": 0.17347190529108047, "loss/reg": 3.551867723464966, "step": 207 }, { "epoch": 0.00208, "grad_norm": 0.316631555557251, "grad_norm_var": 0.0009067368526577339, "learning_rate": 5e-05, "loss": 0.1521, "loss/crossentropy": 2.6910020112991333, "loss/hidden": 0.0, "loss/logits": 0.15211007744073868, "loss/reg": 3.547140598297119, "step": 208 }, { "epoch": 0.00209, "grad_norm": 0.3024788200855255, "grad_norm_var": 0.0010444754089082963, "learning_rate": 5e-05, "loss": 0.1534, "loss/crossentropy": 2.6174367666244507, "loss/hidden": 0.0, "loss/logits": 0.15340904891490936, "loss/reg": 3.5430798530578613, "step": 209 }, { "epoch": 0.0021, "grad_norm": 0.31879743933677673, "grad_norm_var": 0.0010192142441715734, "learning_rate": 5e-05, "loss": 0.1644, "loss/crossentropy": 2.6434658765792847, "loss/hidden": 0.0, "loss/logits": 0.164449330419302, "loss/reg": 3.539293050765991, "step": 210 }, { "epoch": 0.00211, "grad_norm": 0.37038934230804443, "grad_norm_var": 0.0010445807718520773, "learning_rate": 5e-05, "loss": 0.1618, "loss/crossentropy": 2.7187950015068054, "loss/hidden": 0.0, "loss/logits": 0.16179471090435982, "loss/reg": 3.5359723567962646, "step": 211 }, { "epoch": 0.00212, "grad_norm": 0.3256055414676666, "grad_norm_var": 0.0010681195543044476, "learning_rate": 5e-05, "loss": 0.1634, "loss/crossentropy": 2.6802476048469543, "loss/hidden": 0.0, "loss/logits": 0.16339639574289322, "loss/reg": 3.5320651531219482, "step": 212 }, { "epoch": 0.00213, "grad_norm": 0.363210529088974, "grad_norm_var": 0.0010772816324646883, "learning_rate": 5e-05, "loss": 0.1682, "loss/crossentropy": 2.925456941127777, "loss/hidden": 0.0, "loss/logits": 0.16816257312893867, "loss/reg": 3.527592420578003, "step": 213 }, { "epoch": 0.00214, "grad_norm": 0.3341169059276581, "grad_norm_var": 0.0010811945233913268, "learning_rate": 5e-05, "loss": 0.169, "loss/crossentropy": 2.8775156140327454, "loss/hidden": 0.0, "loss/logits": 0.1689641959965229, "loss/reg": 3.5241305828094482, "step": 214 }, { "epoch": 0.00215, "grad_norm": 0.7971848249435425, "grad_norm_var": 0.013831743286372744, "learning_rate": 5e-05, "loss": 0.1898, "loss/crossentropy": 2.769020676612854, "loss/hidden": 0.0, "loss/logits": 0.18977811932563782, "loss/reg": 3.5197558403015137, "step": 215 }, { "epoch": 0.00216, "grad_norm": 0.3044687807559967, "grad_norm_var": 0.014131832632900828, "learning_rate": 5e-05, "loss": 0.1467, "loss/crossentropy": 2.792181670665741, "loss/hidden": 0.0, "loss/logits": 0.1466773971915245, "loss/reg": 3.516072988510132, "step": 216 }, { "epoch": 0.00217, "grad_norm": 0.3434732258319855, "grad_norm_var": 0.013888774653188173, "learning_rate": 5e-05, "loss": 0.1698, "loss/crossentropy": 2.577077627182007, "loss/hidden": 0.0, "loss/logits": 0.16977669671177864, "loss/reg": 3.512517213821411, "step": 217 }, { "epoch": 0.00218, "grad_norm": 0.37019920349121094, "grad_norm_var": 0.013886977393692842, "learning_rate": 5e-05, "loss": 0.1943, "loss/crossentropy": 2.722847878932953, "loss/hidden": 0.0, "loss/logits": 0.19428952783346176, "loss/reg": 3.5091969966888428, "step": 218 }, { "epoch": 0.00219, "grad_norm": 0.31637635827064514, "grad_norm_var": 0.013887658605223226, "learning_rate": 5e-05, "loss": 0.1547, "loss/crossentropy": 2.787532150745392, "loss/hidden": 0.0, "loss/logits": 0.1546883024275303, "loss/reg": 3.5059781074523926, "step": 219 }, { "epoch": 0.0022, "grad_norm": 0.368344783782959, "grad_norm_var": 0.013784165910995568, "learning_rate": 5e-05, "loss": 0.1773, "loss/crossentropy": 2.7095659971237183, "loss/hidden": 0.0, "loss/logits": 0.1773015893995762, "loss/reg": 3.502683162689209, "step": 220 }, { "epoch": 0.00221, "grad_norm": 0.3447912037372589, "grad_norm_var": 0.013659872247631084, "learning_rate": 5e-05, "loss": 0.1688, "loss/crossentropy": 2.7072474360466003, "loss/hidden": 0.0, "loss/logits": 0.1687602400779724, "loss/reg": 3.4986109733581543, "step": 221 }, { "epoch": 0.00222, "grad_norm": 0.3812227249145508, "grad_norm_var": 0.013638668912457892, "learning_rate": 5e-05, "loss": 0.1811, "loss/crossentropy": 2.8128660917282104, "loss/hidden": 0.0, "loss/logits": 0.18113631010055542, "loss/reg": 3.4947147369384766, "step": 222 }, { "epoch": 0.00223, "grad_norm": 0.339374303817749, "grad_norm_var": 0.013690814024359154, "learning_rate": 5e-05, "loss": 0.1716, "loss/crossentropy": 2.885101020336151, "loss/hidden": 0.0, "loss/logits": 0.17156245186924934, "loss/reg": 3.4905753135681152, "step": 223 }, { "epoch": 0.00224, "grad_norm": 0.3169143497943878, "grad_norm_var": 0.013688861707923295, "learning_rate": 5e-05, "loss": 0.1589, "loss/crossentropy": 2.6434147357940674, "loss/hidden": 0.0, "loss/logits": 0.15887855738401413, "loss/reg": 3.486919641494751, "step": 224 }, { "epoch": 0.00225, "grad_norm": 0.4436502456665039, "grad_norm_var": 0.013690624557478688, "learning_rate": 5e-05, "loss": 0.2037, "loss/crossentropy": 2.9042821526527405, "loss/hidden": 0.0, "loss/logits": 0.20374128222465515, "loss/reg": 3.483499765396118, "step": 225 }, { "epoch": 0.00226, "grad_norm": 0.44937804341316223, "grad_norm_var": 0.01373632101878638, "learning_rate": 5e-05, "loss": 0.1588, "loss/crossentropy": 2.79194039106369, "loss/hidden": 0.0, "loss/logits": 0.1587841510772705, "loss/reg": 3.480142593383789, "step": 226 }, { "epoch": 0.00227, "grad_norm": 0.3453376889228821, "grad_norm_var": 0.013826164241530992, "learning_rate": 5e-05, "loss": 0.1659, "loss/crossentropy": 2.7488330006599426, "loss/hidden": 0.0, "loss/logits": 0.16589100658893585, "loss/reg": 3.476062297821045, "step": 227 }, { "epoch": 0.00228, "grad_norm": 0.3845584988594055, "grad_norm_var": 0.013584549048495138, "learning_rate": 5e-05, "loss": 0.1842, "loss/crossentropy": 2.6935607194900513, "loss/hidden": 0.0, "loss/logits": 0.18416164070367813, "loss/reg": 3.4726946353912354, "step": 228 }, { "epoch": 0.00229, "grad_norm": 0.3347846567630768, "grad_norm_var": 0.013727727146734722, "learning_rate": 5e-05, "loss": 0.1767, "loss/crossentropy": 2.6182947754859924, "loss/hidden": 0.0, "loss/logits": 0.1767422929406166, "loss/reg": 3.469238519668579, "step": 229 }, { "epoch": 0.0023, "grad_norm": 0.35126739740371704, "grad_norm_var": 0.01362772883112919, "learning_rate": 5e-05, "loss": 0.1694, "loss/crossentropy": 2.8005401492118835, "loss/hidden": 0.0, "loss/logits": 0.1694028675556183, "loss/reg": 3.4662599563598633, "step": 230 }, { "epoch": 0.00231, "grad_norm": 0.37644773721694946, "grad_norm_var": 0.0016784352808341082, "learning_rate": 5e-05, "loss": 0.1677, "loss/crossentropy": 2.7537949085235596, "loss/hidden": 0.0, "loss/logits": 0.16772692278027534, "loss/reg": 3.4623701572418213, "step": 231 }, { "epoch": 0.00232, "grad_norm": 0.33086928725242615, "grad_norm_var": 0.0015241936410912834, "learning_rate": 5e-05, "loss": 0.1624, "loss/crossentropy": 2.7844293117523193, "loss/hidden": 0.0, "loss/logits": 0.1624348722398281, "loss/reg": 3.459073066711426, "step": 232 }, { "epoch": 0.00233, "grad_norm": 0.3152429461479187, "grad_norm_var": 0.0016449122438399724, "learning_rate": 5e-05, "loss": 0.1607, "loss/crossentropy": 2.5863555669784546, "loss/hidden": 0.0, "loss/logits": 0.16065017879009247, "loss/reg": 3.456038475036621, "step": 233 }, { "epoch": 0.00234, "grad_norm": 0.34679386019706726, "grad_norm_var": 0.001649030072333372, "learning_rate": 5e-05, "loss": 0.1656, "loss/crossentropy": 2.9068891406059265, "loss/hidden": 0.0, "loss/logits": 0.16555847227573395, "loss/reg": 3.452618360519409, "step": 234 }, { "epoch": 0.00235, "grad_norm": 0.36684513092041016, "grad_norm_var": 0.001520832425550959, "learning_rate": 5e-05, "loss": 0.1878, "loss/crossentropy": 2.6781840920448303, "loss/hidden": 0.0, "loss/logits": 0.18775511160492897, "loss/reg": 3.4493637084960938, "step": 235 }, { "epoch": 0.00236, "grad_norm": 0.39043235778808594, "grad_norm_var": 0.0015693055369300879, "learning_rate": 5e-05, "loss": 0.1559, "loss/crossentropy": 2.9237093925476074, "loss/hidden": 0.0, "loss/logits": 0.15592358261346817, "loss/reg": 3.446392059326172, "step": 236 }, { "epoch": 0.00237, "grad_norm": 0.3486286997795105, "grad_norm_var": 0.0015605921838873513, "learning_rate": 5e-05, "loss": 0.1524, "loss/crossentropy": 2.8276549577713013, "loss/hidden": 0.0, "loss/logits": 0.1523873247206211, "loss/reg": 3.443490505218506, "step": 237 }, { "epoch": 0.00238, "grad_norm": 0.4030380845069885, "grad_norm_var": 0.0016408419596595262, "learning_rate": 5e-05, "loss": 0.1839, "loss/crossentropy": 2.7374503016471863, "loss/hidden": 0.0, "loss/logits": 0.1839219257235527, "loss/reg": 3.440230131149292, "step": 238 }, { "epoch": 0.00239, "grad_norm": 0.3677695095539093, "grad_norm_var": 0.0015933721835237928, "learning_rate": 5e-05, "loss": 0.1725, "loss/crossentropy": 2.637487053871155, "loss/hidden": 0.0, "loss/logits": 0.1725292131304741, "loss/reg": 3.4369444847106934, "step": 239 }, { "epoch": 0.0024, "grad_norm": 0.3092736303806305, "grad_norm_var": 0.001648043714460871, "learning_rate": 5e-05, "loss": 0.1608, "loss/crossentropy": 2.785566747188568, "loss/hidden": 0.0, "loss/logits": 0.16075557842850685, "loss/reg": 3.4329705238342285, "step": 240 }, { "epoch": 0.00241, "grad_norm": 0.3242727518081665, "grad_norm_var": 0.001311046071157899, "learning_rate": 5e-05, "loss": 0.1641, "loss/crossentropy": 2.7823829650878906, "loss/hidden": 0.0, "loss/logits": 0.16410458087921143, "loss/reg": 3.429222345352173, "step": 241 }, { "epoch": 0.00242, "grad_norm": 0.3544396758079529, "grad_norm_var": 0.0007310749754719385, "learning_rate": 5e-05, "loss": 0.1742, "loss/crossentropy": 2.7899482250213623, "loss/hidden": 0.0, "loss/logits": 0.1741911694407463, "loss/reg": 3.4251747131347656, "step": 242 }, { "epoch": 0.00243, "grad_norm": 0.3156209886074066, "grad_norm_var": 0.0008171231835736463, "learning_rate": 5e-05, "loss": 0.159, "loss/crossentropy": 2.7414376735687256, "loss/hidden": 0.0, "loss/logits": 0.15898872911930084, "loss/reg": 3.421576976776123, "step": 243 }, { "epoch": 0.00244, "grad_norm": 0.3353999853134155, "grad_norm_var": 0.000749955482525048, "learning_rate": 5e-05, "loss": 0.1669, "loss/crossentropy": 2.707472503185272, "loss/hidden": 0.0, "loss/logits": 0.16693224385380745, "loss/reg": 3.417820930480957, "step": 244 }, { "epoch": 0.00245, "grad_norm": 0.32766133546829224, "grad_norm_var": 0.0007658640613261528, "learning_rate": 5e-05, "loss": 0.1761, "loss/crossentropy": 2.6950490474700928, "loss/hidden": 0.0, "loss/logits": 0.17608999833464622, "loss/reg": 3.414095640182495, "step": 245 }, { "epoch": 0.00246, "grad_norm": 0.31360548734664917, "grad_norm_var": 0.0008368534177580581, "learning_rate": 5e-05, "loss": 0.1578, "loss/crossentropy": 2.6977627873420715, "loss/hidden": 0.0, "loss/logits": 0.15783175826072693, "loss/reg": 3.409533739089966, "step": 246 }, { "epoch": 0.00247, "grad_norm": 0.35324403643608093, "grad_norm_var": 0.0007744365123312817, "learning_rate": 5e-05, "loss": 0.1688, "loss/crossentropy": 2.8509859442710876, "loss/hidden": 0.0, "loss/logits": 0.16875524446368217, "loss/reg": 3.4052798748016357, "step": 247 }, { "epoch": 0.00248, "grad_norm": 0.41796907782554626, "grad_norm_var": 0.0010967197155327421, "learning_rate": 5e-05, "loss": 0.18, "loss/crossentropy": 2.701251804828644, "loss/hidden": 0.0, "loss/logits": 0.1800428181886673, "loss/reg": 3.4006803035736084, "step": 248 }, { "epoch": 0.00249, "grad_norm": 0.33844876289367676, "grad_norm_var": 0.0010247223552569313, "learning_rate": 5e-05, "loss": 0.1737, "loss/crossentropy": 2.7646324038505554, "loss/hidden": 0.0, "loss/logits": 0.17367269843816757, "loss/reg": 3.3968873023986816, "step": 249 }, { "epoch": 0.0025, "grad_norm": 0.31011876463890076, "grad_norm_var": 0.0011285754764581786, "learning_rate": 5e-05, "loss": 0.1591, "loss/crossentropy": 2.7303661704063416, "loss/hidden": 0.0, "loss/logits": 0.15912269055843353, "loss/reg": 3.3925790786743164, "step": 250 }, { "epoch": 0.00251, "grad_norm": 0.4837491512298584, "grad_norm_var": 0.0022679356659945546, "learning_rate": 5e-05, "loss": 0.1845, "loss/crossentropy": 2.718783438205719, "loss/hidden": 0.0, "loss/logits": 0.18454211205244064, "loss/reg": 3.389193296432495, "step": 251 }, { "epoch": 0.00252, "grad_norm": 0.30302709341049194, "grad_norm_var": 0.002342444325573334, "learning_rate": 5e-05, "loss": 0.1527, "loss/crossentropy": 2.7513213753700256, "loss/hidden": 0.0, "loss/logits": 0.15272299572825432, "loss/reg": 3.384976863861084, "step": 252 }, { "epoch": 0.00253, "grad_norm": 0.3376671075820923, "grad_norm_var": 0.002352530797232196, "learning_rate": 5e-05, "loss": 0.1717, "loss/crossentropy": 2.7082377672195435, "loss/hidden": 0.0, "loss/logits": 0.1717442087829113, "loss/reg": 3.381958246231079, "step": 253 }, { "epoch": 0.00254, "grad_norm": 0.3470434546470642, "grad_norm_var": 0.00215032290339258, "learning_rate": 5e-05, "loss": 0.1751, "loss/crossentropy": 2.7747623324394226, "loss/hidden": 0.0, "loss/logits": 0.17506984621286392, "loss/reg": 3.3780975341796875, "step": 254 }, { "epoch": 0.00255, "grad_norm": 0.35893791913986206, "grad_norm_var": 0.002129806794166807, "learning_rate": 5e-05, "loss": 0.176, "loss/crossentropy": 2.6670790910720825, "loss/hidden": 0.0, "loss/logits": 0.17602670192718506, "loss/reg": 3.374431848526001, "step": 255 }, { "epoch": 0.00256, "grad_norm": 0.33274415135383606, "grad_norm_var": 0.002050384071076557, "learning_rate": 5e-05, "loss": 0.1683, "loss/crossentropy": 2.930284321308136, "loss/hidden": 0.0, "loss/logits": 0.16833152994513512, "loss/reg": 3.3709969520568848, "step": 256 }, { "epoch": 0.00257, "grad_norm": 0.3107251822948456, "grad_norm_var": 0.0021031284267367073, "learning_rate": 5e-05, "loss": 0.154, "loss/crossentropy": 2.7738651037216187, "loss/hidden": 0.0, "loss/logits": 0.15401111543178558, "loss/reg": 3.36681866645813, "step": 257 }, { "epoch": 0.00258, "grad_norm": 0.3238702118396759, "grad_norm_var": 0.00212825610345269, "learning_rate": 5e-05, "loss": 0.1485, "loss/crossentropy": 2.7926384806632996, "loss/hidden": 0.0, "loss/logits": 0.14850713685154915, "loss/reg": 3.363298177719116, "step": 258 }, { "epoch": 0.00259, "grad_norm": 0.3937188982963562, "grad_norm_var": 0.0022101531057158843, "learning_rate": 5e-05, "loss": 0.1796, "loss/crossentropy": 2.732594311237335, "loss/hidden": 0.0, "loss/logits": 0.17957409471273422, "loss/reg": 3.3592464923858643, "step": 259 }, { "epoch": 0.0026, "grad_norm": 0.35869738459587097, "grad_norm_var": 0.002201067050087302, "learning_rate": 5e-05, "loss": 0.1608, "loss/crossentropy": 2.6412158608436584, "loss/hidden": 0.0, "loss/logits": 0.1607726439833641, "loss/reg": 3.355531692504883, "step": 260 }, { "epoch": 0.00261, "grad_norm": 0.342753529548645, "grad_norm_var": 0.002168938888458849, "learning_rate": 5e-05, "loss": 0.1679, "loss/crossentropy": 2.8253002762794495, "loss/hidden": 0.0, "loss/logits": 0.16785955801606178, "loss/reg": 3.3510327339172363, "step": 261 }, { "epoch": 0.00262, "grad_norm": 0.3396557867527008, "grad_norm_var": 0.0020792270475482005, "learning_rate": 5e-05, "loss": 0.1719, "loss/crossentropy": 2.5446697473526, "loss/hidden": 0.0, "loss/logits": 0.1719457022845745, "loss/reg": 3.3462460041046143, "step": 262 }, { "epoch": 0.00263, "grad_norm": 0.326615571975708, "grad_norm_var": 0.002123647634079288, "learning_rate": 5e-05, "loss": 0.1662, "loss/crossentropy": 2.7185133695602417, "loss/hidden": 0.0, "loss/logits": 0.16621045768260956, "loss/reg": 3.342698097229004, "step": 263 }, { "epoch": 0.00264, "grad_norm": 0.372024804353714, "grad_norm_var": 0.0018490612448516057, "learning_rate": 5e-05, "loss": 0.1785, "loss/crossentropy": 2.90339195728302, "loss/hidden": 0.0, "loss/logits": 0.17848360165953636, "loss/reg": 3.3390297889709473, "step": 264 }, { "epoch": 0.00265, "grad_norm": 0.336412638425827, "grad_norm_var": 0.0018521135396843155, "learning_rate": 5e-05, "loss": 0.1685, "loss/crossentropy": 2.6998194456100464, "loss/hidden": 0.0, "loss/logits": 0.1684669330716133, "loss/reg": 3.3360044956207275, "step": 265 }, { "epoch": 0.00266, "grad_norm": 0.3179170787334442, "grad_norm_var": 0.0018158920564407192, "learning_rate": 5e-05, "loss": 0.164, "loss/crossentropy": 2.713620126247406, "loss/hidden": 0.0, "loss/logits": 0.16402245312929153, "loss/reg": 3.33297061920166, "step": 266 }, { "epoch": 0.00267, "grad_norm": 0.32180216908454895, "grad_norm_var": 0.0005475447645484169, "learning_rate": 5e-05, "loss": 0.1561, "loss/crossentropy": 2.8285736441612244, "loss/hidden": 0.0, "loss/logits": 0.15614933148026466, "loss/reg": 3.330070972442627, "step": 267 }, { "epoch": 0.00268, "grad_norm": 0.34155359864234924, "grad_norm_var": 0.00045564919377512797, "learning_rate": 5e-05, "loss": 0.1666, "loss/crossentropy": 2.698326587677002, "loss/hidden": 0.0, "loss/logits": 0.16663997247815132, "loss/reg": 3.326782464981079, "step": 268 }, { "epoch": 0.00269, "grad_norm": 0.3281239867210388, "grad_norm_var": 0.0004660702159405468, "learning_rate": 5e-05, "loss": 0.1547, "loss/crossentropy": 2.7132135033607483, "loss/hidden": 0.0, "loss/logits": 0.15473050251603127, "loss/reg": 3.3232901096343994, "step": 269 }, { "epoch": 0.0027, "grad_norm": 0.3694444000720978, "grad_norm_var": 0.0005161187813034911, "learning_rate": 5e-05, "loss": 0.1658, "loss/crossentropy": 2.943029820919037, "loss/hidden": 0.0, "loss/logits": 0.16579603031277657, "loss/reg": 3.319425344467163, "step": 270 }, { "epoch": 0.00271, "grad_norm": 0.3521305024623871, "grad_norm_var": 0.0005038113254072218, "learning_rate": 5e-05, "loss": 0.1762, "loss/crossentropy": 2.813421130180359, "loss/hidden": 0.0, "loss/logits": 0.17624986171722412, "loss/reg": 3.31587553024292, "step": 271 }, { "epoch": 0.00272, "grad_norm": 0.3419167995452881, "grad_norm_var": 0.0004980410714001496, "learning_rate": 5e-05, "loss": 0.1579, "loss/crossentropy": 2.6725985407829285, "loss/hidden": 0.0, "loss/logits": 0.1579086296260357, "loss/reg": 3.313774347305298, "step": 272 }, { "epoch": 0.00273, "grad_norm": 0.45973771810531616, "grad_norm_var": 0.001257799356739812, "learning_rate": 5e-05, "loss": 0.1806, "loss/crossentropy": 2.7593576908111572, "loss/hidden": 0.0, "loss/logits": 0.1806396320462227, "loss/reg": 3.311671257019043, "step": 273 }, { "epoch": 0.00274, "grad_norm": 0.327812522649765, "grad_norm_var": 0.00124416933097297, "learning_rate": 5e-05, "loss": 0.1544, "loss/crossentropy": 2.7368595004081726, "loss/hidden": 0.0, "loss/logits": 0.15438436716794968, "loss/reg": 3.308312177658081, "step": 274 }, { "epoch": 0.00275, "grad_norm": 0.43593037128448486, "grad_norm_var": 0.001590926391083336, "learning_rate": 5e-05, "loss": 0.1721, "loss/crossentropy": 2.8178694248199463, "loss/hidden": 0.0, "loss/logits": 0.17214355245232582, "loss/reg": 3.30526065826416, "step": 275 }, { "epoch": 0.00276, "grad_norm": 0.361247181892395, "grad_norm_var": 0.0015927484925991053, "learning_rate": 5e-05, "loss": 0.1735, "loss/crossentropy": 2.734869599342346, "loss/hidden": 0.0, "loss/logits": 0.17345493659377098, "loss/reg": 3.3015174865722656, "step": 276 }, { "epoch": 0.00277, "grad_norm": 0.3708873689174652, "grad_norm_var": 0.0015974331537993436, "learning_rate": 5e-05, "loss": 0.1628, "loss/crossentropy": 2.67022705078125, "loss/hidden": 0.0, "loss/logits": 0.16277828440070152, "loss/reg": 3.2979342937469482, "step": 277 }, { "epoch": 0.00278, "grad_norm": 0.3481086790561676, "grad_norm_var": 0.0015829700282981919, "learning_rate": 5e-05, "loss": 0.1577, "loss/crossentropy": 2.700168251991272, "loss/hidden": 0.0, "loss/logits": 0.15773406997323036, "loss/reg": 3.294177532196045, "step": 278 }, { "epoch": 0.00279, "grad_norm": 0.5134589076042175, "grad_norm_var": 0.003008442642246208, "learning_rate": 5e-05, "loss": 0.1766, "loss/crossentropy": 2.9033528566360474, "loss/hidden": 0.0, "loss/logits": 0.17655130848288536, "loss/reg": 3.2909915447235107, "step": 279 }, { "epoch": 0.0028, "grad_norm": 0.38205716013908386, "grad_norm_var": 0.0030192383608610503, "learning_rate": 5e-05, "loss": 0.1934, "loss/crossentropy": 2.7160211205482483, "loss/hidden": 0.0, "loss/logits": 0.19341961666941643, "loss/reg": 3.2878634929656982, "step": 280 }, { "epoch": 0.00281, "grad_norm": 0.3628558814525604, "grad_norm_var": 0.002947045102075485, "learning_rate": 5e-05, "loss": 0.1709, "loss/crossentropy": 2.6912715435028076, "loss/hidden": 0.0, "loss/logits": 0.17094064503908157, "loss/reg": 3.284353256225586, "step": 281 }, { "epoch": 0.00282, "grad_norm": 0.32757696509361267, "grad_norm_var": 0.002884588952342071, "learning_rate": 5e-05, "loss": 0.1702, "loss/crossentropy": 2.7818912267684937, "loss/hidden": 0.0, "loss/logits": 0.17016061395406723, "loss/reg": 3.281097412109375, "step": 282 }, { "epoch": 0.00283, "grad_norm": 0.5035110712051392, "grad_norm_var": 0.003743174506031214, "learning_rate": 5e-05, "loss": 0.1891, "loss/crossentropy": 2.8356027603149414, "loss/hidden": 0.0, "loss/logits": 0.1890563629567623, "loss/reg": 3.277557611465454, "step": 283 }, { "epoch": 0.00284, "grad_norm": 0.4021988809108734, "grad_norm_var": 0.003638735284583853, "learning_rate": 5e-05, "loss": 0.1574, "loss/crossentropy": 2.768595337867737, "loss/hidden": 0.0, "loss/logits": 0.15735788643360138, "loss/reg": 3.274083375930786, "step": 284 }, { "epoch": 0.00285, "grad_norm": 0.3557356297969818, "grad_norm_var": 0.0034707811870306284, "learning_rate": 5e-05, "loss": 0.1618, "loss/crossentropy": 2.7620763182640076, "loss/hidden": 0.0, "loss/logits": 0.16183337569236755, "loss/reg": 3.2709619998931885, "step": 285 }, { "epoch": 0.00286, "grad_norm": 0.35383257269859314, "grad_norm_var": 0.0035254991255895857, "learning_rate": 5e-05, "loss": 0.164, "loss/crossentropy": 2.7777530550956726, "loss/hidden": 0.0, "loss/logits": 0.16404272243380547, "loss/reg": 3.2678396701812744, "step": 286 }, { "epoch": 0.00287, "grad_norm": 0.3291935324668884, "grad_norm_var": 0.0036663583934404683, "learning_rate": 5e-05, "loss": 0.1545, "loss/crossentropy": 2.7828534841537476, "loss/hidden": 0.0, "loss/logits": 0.15450828149914742, "loss/reg": 3.2639055252075195, "step": 287 }, { "epoch": 0.00288, "grad_norm": 0.3174595534801483, "grad_norm_var": 0.003847509504795695, "learning_rate": 5e-05, "loss": 0.163, "loss/crossentropy": 2.7973127365112305, "loss/hidden": 0.0, "loss/logits": 0.16296324506402016, "loss/reg": 3.2608911991119385, "step": 288 }, { "epoch": 0.00289, "grad_norm": 0.3723791539669037, "grad_norm_var": 0.0034478366033269445, "learning_rate": 5e-05, "loss": 0.1757, "loss/crossentropy": 2.630415976047516, "loss/hidden": 0.0, "loss/logits": 0.17573364078998566, "loss/reg": 3.257523775100708, "step": 289 }, { "epoch": 0.0029, "grad_norm": 0.38034215569496155, "grad_norm_var": 0.003261674725795945, "learning_rate": 5e-05, "loss": 0.1689, "loss/crossentropy": 2.704579532146454, "loss/hidden": 0.0, "loss/logits": 0.16892588511109352, "loss/reg": 3.254065752029419, "step": 290 }, { "epoch": 0.00291, "grad_norm": 1.2464065551757812, "grad_norm_var": 0.05011180607704591, "learning_rate": 5e-05, "loss": 0.1921, "loss/crossentropy": 2.8595897555351257, "loss/hidden": 0.0, "loss/logits": 0.19211571291089058, "loss/reg": 3.2510294914245605, "step": 291 }, { "epoch": 0.00292, "grad_norm": 0.3307066559791565, "grad_norm_var": 0.05046209325623486, "learning_rate": 5e-05, "loss": 0.1681, "loss/crossentropy": 2.8195151686668396, "loss/hidden": 0.0, "loss/logits": 0.16810721158981323, "loss/reg": 3.2478625774383545, "step": 292 }, { "epoch": 0.00293, "grad_norm": 0.33664193749427795, "grad_norm_var": 0.05081007066127065, "learning_rate": 5e-05, "loss": 0.1602, "loss/crossentropy": 2.912789523601532, "loss/hidden": 0.0, "loss/logits": 0.16015203669667244, "loss/reg": 3.2451751232147217, "step": 293 }, { "epoch": 0.00294, "grad_norm": 0.42365437746047974, "grad_norm_var": 0.05035293502217161, "learning_rate": 5e-05, "loss": 0.1833, "loss/crossentropy": 2.7459517121315002, "loss/hidden": 0.0, "loss/logits": 0.18333038315176964, "loss/reg": 3.242023229598999, "step": 294 }, { "epoch": 0.00295, "grad_norm": 0.40840578079223633, "grad_norm_var": 0.049924464393714924, "learning_rate": 5e-05, "loss": 0.176, "loss/crossentropy": 2.778249144554138, "loss/hidden": 0.0, "loss/logits": 0.17595936357975006, "loss/reg": 3.2388389110565186, "step": 295 }, { "epoch": 0.00296, "grad_norm": 0.3591618835926056, "grad_norm_var": 0.05009460642018338, "learning_rate": 5e-05, "loss": 0.1722, "loss/crossentropy": 2.7121748328208923, "loss/hidden": 0.0, "loss/logits": 0.17217102646827698, "loss/reg": 3.2364017963409424, "step": 296 }, { "epoch": 0.00297, "grad_norm": 0.31615006923675537, "grad_norm_var": 0.05062186135785553, "learning_rate": 5e-05, "loss": 0.1488, "loss/crossentropy": 2.7933038473129272, "loss/hidden": 0.0, "loss/logits": 0.14879318699240685, "loss/reg": 3.2344272136688232, "step": 297 }, { "epoch": 0.00298, "grad_norm": 0.3586377799510956, "grad_norm_var": 0.050288172636787816, "learning_rate": 5e-05, "loss": 0.1492, "loss/crossentropy": 2.869333803653717, "loss/hidden": 0.0, "loss/logits": 0.14924299344420433, "loss/reg": 3.232086181640625, "step": 298 }, { "epoch": 0.00299, "grad_norm": 0.37798771262168884, "grad_norm_var": 0.04995309393064352, "learning_rate": 5e-05, "loss": 0.1652, "loss/crossentropy": 2.913083255290985, "loss/hidden": 0.0, "loss/logits": 0.1651969812810421, "loss/reg": 3.2296016216278076, "step": 299 }, { "epoch": 0.003, "grad_norm": 0.5914519429206848, "grad_norm_var": 0.05182304954391634, "learning_rate": 5e-05, "loss": 0.1893, "loss/crossentropy": 2.8007007241249084, "loss/hidden": 0.0, "loss/logits": 0.18929021432995796, "loss/reg": 3.2266576290130615, "step": 300 }, { "epoch": 0.00301, "grad_norm": 0.3292617201805115, "grad_norm_var": 0.05212417516215169, "learning_rate": 5e-05, "loss": 0.16, "loss/crossentropy": 2.7326099276542664, "loss/hidden": 0.0, "loss/logits": 0.15998771041631699, "loss/reg": 3.2236239910125732, "step": 301 }, { "epoch": 0.00302, "grad_norm": 0.3807355761528015, "grad_norm_var": 0.05190702763823275, "learning_rate": 5e-05, "loss": 0.1831, "loss/crossentropy": 2.7199636101722717, "loss/hidden": 0.0, "loss/logits": 0.1831417679786682, "loss/reg": 3.220425844192505, "step": 302 }, { "epoch": 0.00303, "grad_norm": 0.4008902907371521, "grad_norm_var": 0.05127743798183474, "learning_rate": 5e-05, "loss": 0.1777, "loss/crossentropy": 2.7570589184761047, "loss/hidden": 0.0, "loss/logits": 0.177694384008646, "loss/reg": 3.2171859741210938, "step": 303 }, { "epoch": 0.00304, "grad_norm": 0.35962697863578796, "grad_norm_var": 0.05073816419262332, "learning_rate": 5e-05, "loss": 0.1574, "loss/crossentropy": 2.809377670288086, "loss/hidden": 0.0, "loss/logits": 0.15741629898548126, "loss/reg": 3.2137656211853027, "step": 304 }, { "epoch": 0.00305, "grad_norm": 0.404453843832016, "grad_norm_var": 0.05053133217663517, "learning_rate": 5e-05, "loss": 0.196, "loss/crossentropy": 2.6374824047088623, "loss/hidden": 0.0, "loss/logits": 0.19603004679083824, "loss/reg": 3.2106897830963135, "step": 305 }, { "epoch": 0.00306, "grad_norm": 0.3411775231361389, "grad_norm_var": 0.05092714807133293, "learning_rate": 5e-05, "loss": 0.1701, "loss/crossentropy": 2.7098072171211243, "loss/hidden": 0.0, "loss/logits": 0.17005891352891922, "loss/reg": 3.2072975635528564, "step": 306 }, { "epoch": 0.00307, "grad_norm": 0.48913073539733887, "grad_norm_var": 0.004874772049479148, "learning_rate": 5e-05, "loss": 0.2396, "loss/crossentropy": 2.8529589772224426, "loss/hidden": 0.0, "loss/logits": 0.23958228901028633, "loss/reg": 3.2032482624053955, "step": 307 }, { "epoch": 0.00308, "grad_norm": 0.3359135389328003, "grad_norm_var": 0.004836687315538634, "learning_rate": 5e-05, "loss": 0.154, "loss/crossentropy": 2.825865149497986, "loss/hidden": 0.0, "loss/logits": 0.15401727706193924, "loss/reg": 3.2001187801361084, "step": 308 }, { "epoch": 0.00309, "grad_norm": 0.3673790693283081, "grad_norm_var": 0.004683902714770716, "learning_rate": 5e-05, "loss": 0.1701, "loss/crossentropy": 2.757752239704132, "loss/hidden": 0.0, "loss/logits": 0.1700810343027115, "loss/reg": 3.197371244430542, "step": 309 }, { "epoch": 0.0031, "grad_norm": 0.3675878643989563, "grad_norm_var": 0.004630661781797581, "learning_rate": 5e-05, "loss": 0.1742, "loss/crossentropy": 2.9055893421173096, "loss/hidden": 0.0, "loss/logits": 0.17424792051315308, "loss/reg": 3.193922758102417, "step": 310 }, { "epoch": 0.00311, "grad_norm": 0.3216918110847473, "grad_norm_var": 0.004850203191397428, "learning_rate": 5e-05, "loss": 0.1529, "loss/crossentropy": 2.7421942353248596, "loss/hidden": 0.0, "loss/logits": 0.15289029106497765, "loss/reg": 3.1913256645202637, "step": 311 }, { "epoch": 0.00312, "grad_norm": 0.30283358693122864, "grad_norm_var": 0.005214980747276743, "learning_rate": 5e-05, "loss": 0.1501, "loss/crossentropy": 2.68456107378006, "loss/hidden": 0.0, "loss/logits": 0.15012749284505844, "loss/reg": 3.188302755355835, "step": 312 }, { "epoch": 0.00313, "grad_norm": 0.3840731978416443, "grad_norm_var": 0.004944937932171186, "learning_rate": 5e-05, "loss": 0.1565, "loss/crossentropy": 2.7386457920074463, "loss/hidden": 0.0, "loss/logits": 0.15645165741443634, "loss/reg": 3.1856327056884766, "step": 313 }, { "epoch": 0.00314, "grad_norm": 0.3471456468105316, "grad_norm_var": 0.004989069609234416, "learning_rate": 5e-05, "loss": 0.1661, "loss/crossentropy": 2.8941837549209595, "loss/hidden": 0.0, "loss/logits": 0.16613885760307312, "loss/reg": 3.182285785675049, "step": 314 }, { "epoch": 0.00315, "grad_norm": 0.40083590149879456, "grad_norm_var": 0.005011503442318803, "learning_rate": 5e-05, "loss": 0.1816, "loss/crossentropy": 2.830922782421112, "loss/hidden": 0.0, "loss/logits": 0.18162691593170166, "loss/reg": 3.1795294284820557, "step": 315 }, { "epoch": 0.00316, "grad_norm": 0.40112313628196716, "grad_norm_var": 0.001979603921073251, "learning_rate": 5e-05, "loss": 0.163, "loss/crossentropy": 2.5529216527938843, "loss/hidden": 0.0, "loss/logits": 0.1629548817873001, "loss/reg": 3.1763863563537598, "step": 316 }, { "epoch": 0.00317, "grad_norm": 0.3936459720134735, "grad_norm_var": 0.001881530067811854, "learning_rate": 5e-05, "loss": 0.1851, "loss/crossentropy": 2.780943751335144, "loss/hidden": 0.0, "loss/logits": 0.18506472185254097, "loss/reg": 3.173980474472046, "step": 317 }, { "epoch": 0.00318, "grad_norm": 0.3727342486381531, "grad_norm_var": 0.0018792953911145364, "learning_rate": 5e-05, "loss": 0.1827, "loss/crossentropy": 2.76874041557312, "loss/hidden": 0.0, "loss/logits": 0.1826922371983528, "loss/reg": 3.1704628467559814, "step": 318 }, { "epoch": 0.00319, "grad_norm": 0.3470066785812378, "grad_norm_var": 0.0018703712702832478, "learning_rate": 5e-05, "loss": 0.1598, "loss/crossentropy": 2.740228831768036, "loss/hidden": 0.0, "loss/logits": 0.1597808077931404, "loss/reg": 3.1674654483795166, "step": 319 }, { "epoch": 0.0032, "grad_norm": 0.3653993010520935, "grad_norm_var": 0.0018636832816178708, "learning_rate": 5e-05, "loss": 0.1549, "loss/crossentropy": 2.883521616458893, "loss/hidden": 0.0, "loss/logits": 0.15490083023905754, "loss/reg": 3.1641883850097656, "step": 320 }, { "epoch": 0.00321, "grad_norm": 0.3510638475418091, "grad_norm_var": 0.0018064205203171017, "learning_rate": 5e-05, "loss": 0.1577, "loss/crossentropy": 2.9007150530815125, "loss/hidden": 0.0, "loss/logits": 0.15773681923747063, "loss/reg": 3.1610915660858154, "step": 321 }, { "epoch": 0.00322, "grad_norm": 0.5068875551223755, "grad_norm_var": 0.0029290004167608335, "learning_rate": 5e-05, "loss": 0.2013, "loss/crossentropy": 2.716179847717285, "loss/hidden": 0.0, "loss/logits": 0.2013130635023117, "loss/reg": 3.1571173667907715, "step": 322 }, { "epoch": 0.00323, "grad_norm": 0.40178200602531433, "grad_norm_var": 0.0021162756618779235, "learning_rate": 5e-05, "loss": 0.1809, "loss/crossentropy": 2.9381837844848633, "loss/hidden": 0.0, "loss/logits": 0.1809130534529686, "loss/reg": 3.1540093421936035, "step": 323 }, { "epoch": 0.00324, "grad_norm": 0.35252845287323, "grad_norm_var": 0.0020514948206895294, "learning_rate": 5e-05, "loss": 0.1617, "loss/crossentropy": 2.743869721889496, "loss/hidden": 0.0, "loss/logits": 0.16171807795763016, "loss/reg": 3.1503779888153076, "step": 324 }, { "epoch": 0.00325, "grad_norm": 0.36802011728286743, "grad_norm_var": 0.0020509560983741053, "learning_rate": 5e-05, "loss": 0.1722, "loss/crossentropy": 2.934039294719696, "loss/hidden": 0.0, "loss/logits": 0.17217914387583733, "loss/reg": 3.1467440128326416, "step": 325 }, { "epoch": 0.00326, "grad_norm": 0.4267319142818451, "grad_norm_var": 0.0022188398751517274, "learning_rate": 5e-05, "loss": 0.1924, "loss/crossentropy": 2.802468180656433, "loss/hidden": 0.0, "loss/logits": 0.19241869449615479, "loss/reg": 3.1427693367004395, "step": 326 }, { "epoch": 0.00327, "grad_norm": 0.34044548869132996, "grad_norm_var": 0.0021007258044081806, "learning_rate": 5e-05, "loss": 0.1522, "loss/crossentropy": 2.8443135619163513, "loss/hidden": 0.0, "loss/logits": 0.1522187888622284, "loss/reg": 3.1388471126556396, "step": 327 }, { "epoch": 0.00328, "grad_norm": 0.4276120066642761, "grad_norm_var": 0.001808451579785623, "learning_rate": 5e-05, "loss": 0.1708, "loss/crossentropy": 2.8915366530418396, "loss/hidden": 0.0, "loss/logits": 0.17084889113903046, "loss/reg": 3.134800434112549, "step": 328 }, { "epoch": 0.00329, "grad_norm": 0.3486219346523285, "grad_norm_var": 0.0018993689379833108, "learning_rate": 5e-05, "loss": 0.1687, "loss/crossentropy": 2.575106978416443, "loss/hidden": 0.0, "loss/logits": 0.16874929517507553, "loss/reg": 3.1303272247314453, "step": 329 }, { "epoch": 0.0033, "grad_norm": 0.365105539560318, "grad_norm_var": 0.0018301403367672127, "learning_rate": 5e-05, "loss": 0.1842, "loss/crossentropy": 2.6813217401504517, "loss/hidden": 0.0, "loss/logits": 0.18416454643011093, "loss/reg": 3.1269419193267822, "step": 330 }, { "epoch": 0.00331, "grad_norm": 0.5757820010185242, "grad_norm_var": 0.004098500311938921, "learning_rate": 5e-05, "loss": 0.1935, "loss/crossentropy": 2.9679067730903625, "loss/hidden": 0.0, "loss/logits": 0.19354857876896858, "loss/reg": 3.1228113174438477, "step": 331 }, { "epoch": 0.00332, "grad_norm": 0.405617356300354, "grad_norm_var": 0.004102514647771457, "learning_rate": 5e-05, "loss": 0.1716, "loss/crossentropy": 2.843691408634186, "loss/hidden": 0.0, "loss/logits": 0.1716487891972065, "loss/reg": 3.120131015777588, "step": 332 }, { "epoch": 0.00333, "grad_norm": 0.3825243413448334, "grad_norm_var": 0.004114939464605513, "learning_rate": 5e-05, "loss": 0.1692, "loss/crossentropy": 2.722069561481476, "loss/hidden": 0.0, "loss/logits": 0.16915880143642426, "loss/reg": 3.117812395095825, "step": 333 }, { "epoch": 0.00334, "grad_norm": 0.38414397835731506, "grad_norm_var": 0.004087504594686679, "learning_rate": 5e-05, "loss": 0.1678, "loss/crossentropy": 2.727014124393463, "loss/hidden": 0.0, "loss/logits": 0.1677936352789402, "loss/reg": 3.115847587585449, "step": 334 }, { "epoch": 0.00335, "grad_norm": 0.4531976580619812, "grad_norm_var": 0.0040868556651997325, "learning_rate": 5e-05, "loss": 0.1726, "loss/crossentropy": 2.5817691683769226, "loss/hidden": 0.0, "loss/logits": 0.17255331575870514, "loss/reg": 3.1140594482421875, "step": 335 }, { "epoch": 0.00336, "grad_norm": 0.33963721990585327, "grad_norm_var": 0.004259094561609115, "learning_rate": 5e-05, "loss": 0.1605, "loss/crossentropy": 2.7009602189064026, "loss/hidden": 0.0, "loss/logits": 0.16050074249505997, "loss/reg": 3.1128299236297607, "step": 336 }, { "epoch": 0.00337, "grad_norm": 0.36085280776023865, "grad_norm_var": 0.00419878945557195, "learning_rate": 5e-05, "loss": 0.1641, "loss/crossentropy": 2.7012510299682617, "loss/hidden": 0.0, "loss/logits": 0.16410352289676666, "loss/reg": 3.109898805618286, "step": 337 }, { "epoch": 0.00338, "grad_norm": 0.3331363797187805, "grad_norm_var": 0.003666565441549352, "learning_rate": 5e-05, "loss": 0.1619, "loss/crossentropy": 2.802642047405243, "loss/hidden": 0.0, "loss/logits": 0.1618719846010208, "loss/reg": 3.1067681312561035, "step": 338 }, { "epoch": 0.00339, "grad_norm": 0.41531723737716675, "grad_norm_var": 0.003696375336840474, "learning_rate": 5e-05, "loss": 0.1877, "loss/crossentropy": 2.622368335723877, "loss/hidden": 0.0, "loss/logits": 0.18770882859826088, "loss/reg": 3.103231191635132, "step": 339 }, { "epoch": 0.0034, "grad_norm": 0.3483443260192871, "grad_norm_var": 0.0037197436901762657, "learning_rate": 5e-05, "loss": 0.1603, "loss/crossentropy": 2.7523834109306335, "loss/hidden": 0.0, "loss/logits": 0.16034872457385063, "loss/reg": 3.1008808612823486, "step": 340 }, { "epoch": 0.00341, "grad_norm": 0.46117284893989563, "grad_norm_var": 0.003961845355148208, "learning_rate": 5e-05, "loss": 0.1801, "loss/crossentropy": 2.820544958114624, "loss/hidden": 0.0, "loss/logits": 0.18007208034396172, "loss/reg": 3.098954439163208, "step": 341 }, { "epoch": 0.00342, "grad_norm": 0.38020917773246765, "grad_norm_var": 0.003918987421685794, "learning_rate": 5e-05, "loss": 0.1711, "loss/crossentropy": 2.864526093006134, "loss/hidden": 0.0, "loss/logits": 0.1710633859038353, "loss/reg": 3.0957016944885254, "step": 342 }, { "epoch": 0.00343, "grad_norm": 0.3978392779827118, "grad_norm_var": 0.0037065638898653073, "learning_rate": 5e-05, "loss": 0.1782, "loss/crossentropy": 2.859494388103485, "loss/hidden": 0.0, "loss/logits": 0.1782137230038643, "loss/reg": 3.09333872795105, "step": 343 }, { "epoch": 0.00344, "grad_norm": 0.36975786089897156, "grad_norm_var": 0.0036926924317912187, "learning_rate": 5e-05, "loss": 0.1658, "loss/crossentropy": 2.7827839255332947, "loss/hidden": 0.0, "loss/logits": 0.1657763496041298, "loss/reg": 3.090590715408325, "step": 344 }, { "epoch": 0.00345, "grad_norm": 0.333897203207016, "grad_norm_var": 0.0037974520830182084, "learning_rate": 5e-05, "loss": 0.1612, "loss/crossentropy": 2.7804853320121765, "loss/hidden": 0.0, "loss/logits": 0.1611798331141472, "loss/reg": 3.0877137184143066, "step": 345 }, { "epoch": 0.00346, "grad_norm": 0.43794891238212585, "grad_norm_var": 0.0038469119530984567, "learning_rate": 5e-05, "loss": 0.1678, "loss/crossentropy": 2.7229984402656555, "loss/hidden": 0.0, "loss/logits": 0.1677897423505783, "loss/reg": 3.085155725479126, "step": 346 }, { "epoch": 0.00347, "grad_norm": 0.33257824182510376, "grad_norm_var": 0.0018017603976316725, "learning_rate": 5e-05, "loss": 0.1745, "loss/crossentropy": 2.7936434745788574, "loss/hidden": 0.0, "loss/logits": 0.17448442056775093, "loss/reg": 3.0829925537109375, "step": 347 }, { "epoch": 0.00348, "grad_norm": 0.393646240234375, "grad_norm_var": 0.001775431972661142, "learning_rate": 5e-05, "loss": 0.1647, "loss/crossentropy": 2.8590177297592163, "loss/hidden": 0.0, "loss/logits": 0.16474304348230362, "loss/reg": 3.080383062362671, "step": 348 }, { "epoch": 0.00349, "grad_norm": 0.34549105167388916, "grad_norm_var": 0.0018623256252658482, "learning_rate": 5e-05, "loss": 0.1678, "loss/crossentropy": 2.7182729840278625, "loss/hidden": 0.0, "loss/logits": 0.16776488721370697, "loss/reg": 3.0792622566223145, "step": 349 }, { "epoch": 0.0035, "grad_norm": 0.9833559393882751, "grad_norm_var": 0.024598539346201563, "learning_rate": 5e-05, "loss": 0.1911, "loss/crossentropy": 2.720784068107605, "loss/hidden": 0.0, "loss/logits": 0.19106518849730492, "loss/reg": 3.0780370235443115, "step": 350 }, { "epoch": 0.00351, "grad_norm": 0.3550430238246918, "grad_norm_var": 0.02473872020472854, "learning_rate": 5e-05, "loss": 0.1683, "loss/crossentropy": 2.7388776540756226, "loss/hidden": 0.0, "loss/logits": 0.16827991232275963, "loss/reg": 3.07700777053833, "step": 351 }, { "epoch": 0.00352, "grad_norm": 0.32236865162849426, "grad_norm_var": 0.024923428623413246, "learning_rate": 5e-05, "loss": 0.1555, "loss/crossentropy": 2.787019371986389, "loss/hidden": 0.0, "loss/logits": 0.1554781049489975, "loss/reg": 3.075169324874878, "step": 352 }, { "epoch": 0.00353, "grad_norm": 0.34089383482933044, "grad_norm_var": 0.025080939274787682, "learning_rate": 5e-05, "loss": 0.1625, "loss/crossentropy": 2.763973832130432, "loss/hidden": 0.0, "loss/logits": 0.1625489443540573, "loss/reg": 3.0732734203338623, "step": 353 }, { "epoch": 0.00354, "grad_norm": 0.35467466711997986, "grad_norm_var": 0.02489081345717287, "learning_rate": 5e-05, "loss": 0.161, "loss/crossentropy": 2.6696255207061768, "loss/hidden": 0.0, "loss/logits": 0.1610397771000862, "loss/reg": 3.071441411972046, "step": 354 }, { "epoch": 0.00355, "grad_norm": 0.3465348184108734, "grad_norm_var": 0.02514492485323772, "learning_rate": 5e-05, "loss": 0.1692, "loss/crossentropy": 2.8481903076171875, "loss/hidden": 0.0, "loss/logits": 0.16923030093312263, "loss/reg": 3.068796157836914, "step": 355 }, { "epoch": 0.00356, "grad_norm": 0.3337138891220093, "grad_norm_var": 0.025271718941650815, "learning_rate": 5e-05, "loss": 0.1675, "loss/crossentropy": 2.734935760498047, "loss/hidden": 0.0, "loss/logits": 0.1674855425953865, "loss/reg": 3.066350221633911, "step": 356 }, { "epoch": 0.00357, "grad_norm": 0.3486330509185791, "grad_norm_var": 0.02522896182872459, "learning_rate": 5e-05, "loss": 0.1751, "loss/crossentropy": 2.8282878398895264, "loss/hidden": 0.0, "loss/logits": 0.17508375644683838, "loss/reg": 3.0633251667022705, "step": 357 }, { "epoch": 0.00358, "grad_norm": 0.3714129626750946, "grad_norm_var": 0.025255292610223575, "learning_rate": 5e-05, "loss": 0.1798, "loss/crossentropy": 2.723433256149292, "loss/hidden": 0.0, "loss/logits": 0.17982058972120285, "loss/reg": 3.060797929763794, "step": 358 }, { "epoch": 0.00359, "grad_norm": 0.38819992542266846, "grad_norm_var": 0.025261289598676597, "learning_rate": 5e-05, "loss": 0.179, "loss/crossentropy": 2.5971017479896545, "loss/hidden": 0.0, "loss/logits": 0.17904112860560417, "loss/reg": 3.057884693145752, "step": 359 }, { "epoch": 0.0036, "grad_norm": 0.3948271870613098, "grad_norm_var": 0.02520822524046924, "learning_rate": 5e-05, "loss": 0.1826, "loss/crossentropy": 2.631825864315033, "loss/hidden": 0.0, "loss/logits": 0.1826026625931263, "loss/reg": 3.0550246238708496, "step": 360 }, { "epoch": 0.00361, "grad_norm": 0.46747469902038574, "grad_norm_var": 0.025164775676019657, "learning_rate": 5e-05, "loss": 0.1849, "loss/crossentropy": 2.628718376159668, "loss/hidden": 0.0, "loss/logits": 0.1848563477396965, "loss/reg": 3.052072525024414, "step": 361 }, { "epoch": 0.00362, "grad_norm": 0.9672635197639465, "grad_norm_var": 0.044838716189940266, "learning_rate": 5e-05, "loss": 0.2236, "loss/crossentropy": 2.7798518538475037, "loss/hidden": 0.0, "loss/logits": 0.22359847277402878, "loss/reg": 3.048891305923462, "step": 362 }, { "epoch": 0.00363, "grad_norm": 0.3722783029079437, "grad_norm_var": 0.04436658011174813, "learning_rate": 5e-05, "loss": 0.175, "loss/crossentropy": 2.7808294892311096, "loss/hidden": 0.0, "loss/logits": 0.17498808726668358, "loss/reg": 3.0457394123077393, "step": 363 }, { "epoch": 0.00364, "grad_norm": 0.3547132611274719, "grad_norm_var": 0.04471680473078544, "learning_rate": 5e-05, "loss": 0.1602, "loss/crossentropy": 2.7656018137931824, "loss/hidden": 0.0, "loss/logits": 0.16020696610212326, "loss/reg": 3.043097734451294, "step": 364 }, { "epoch": 0.00365, "grad_norm": 0.4774816632270813, "grad_norm_var": 0.04413484451680517, "learning_rate": 5e-05, "loss": 0.1831, "loss/crossentropy": 2.9051772356033325, "loss/hidden": 0.0, "loss/logits": 0.18311960250139236, "loss/reg": 3.0403990745544434, "step": 365 }, { "epoch": 0.00366, "grad_norm": 0.41332709789276123, "grad_norm_var": 0.0238056716485936, "learning_rate": 5e-05, "loss": 0.1705, "loss/crossentropy": 2.8529672026634216, "loss/hidden": 0.0, "loss/logits": 0.17049206793308258, "loss/reg": 3.037370204925537, "step": 366 }, { "epoch": 0.00367, "grad_norm": 0.39109355211257935, "grad_norm_var": 0.023608062717162412, "learning_rate": 5e-05, "loss": 0.1798, "loss/crossentropy": 2.796768307685852, "loss/hidden": 0.0, "loss/logits": 0.17983367666602135, "loss/reg": 3.0343563556671143, "step": 367 }, { "epoch": 0.00368, "grad_norm": 0.36531057953834534, "grad_norm_var": 0.023191193861390014, "learning_rate": 5e-05, "loss": 0.1715, "loss/crossentropy": 2.8276050686836243, "loss/hidden": 0.0, "loss/logits": 0.1715383380651474, "loss/reg": 3.031611919403076, "step": 368 }, { "epoch": 0.00369, "grad_norm": 0.33283501863479614, "grad_norm_var": 0.023278092934366657, "learning_rate": 5e-05, "loss": 0.1499, "loss/crossentropy": 2.6641258597373962, "loss/hidden": 0.0, "loss/logits": 0.14990831911563873, "loss/reg": 3.0287797451019287, "step": 369 }, { "epoch": 0.0037, "grad_norm": 0.4542810618877411, "grad_norm_var": 0.023063995994232415, "learning_rate": 5e-05, "loss": 0.1722, "loss/crossentropy": 2.7453941702842712, "loss/hidden": 0.0, "loss/logits": 0.1721569411456585, "loss/reg": 3.0258800983428955, "step": 370 }, { "epoch": 0.00371, "grad_norm": 0.3705763816833496, "grad_norm_var": 0.022852728398777448, "learning_rate": 5e-05, "loss": 0.1849, "loss/crossentropy": 2.635721504688263, "loss/hidden": 0.0, "loss/logits": 0.18491211906075478, "loss/reg": 3.0235960483551025, "step": 371 }, { "epoch": 0.00372, "grad_norm": 0.4085729420185089, "grad_norm_var": 0.022289690361487074, "learning_rate": 5e-05, "loss": 0.1871, "loss/crossentropy": 2.732766628265381, "loss/hidden": 0.0, "loss/logits": 0.18710973486304283, "loss/reg": 3.0203185081481934, "step": 372 }, { "epoch": 0.00373, "grad_norm": 0.3334612250328064, "grad_norm_var": 0.022468457594482887, "learning_rate": 5e-05, "loss": 0.1662, "loss/crossentropy": 2.783429443836212, "loss/hidden": 0.0, "loss/logits": 0.1662071831524372, "loss/reg": 3.0171730518341064, "step": 373 }, { "epoch": 0.00374, "grad_norm": 0.35536903142929077, "grad_norm_var": 0.022607616164545874, "learning_rate": 5e-05, "loss": 0.1654, "loss/crossentropy": 2.818749785423279, "loss/hidden": 0.0, "loss/logits": 0.16541225090622902, "loss/reg": 3.0140058994293213, "step": 374 }, { "epoch": 0.00375, "grad_norm": 0.348376989364624, "grad_norm_var": 0.022917750109528078, "learning_rate": 5e-05, "loss": 0.1628, "loss/crossentropy": 2.9099320769309998, "loss/hidden": 0.0, "loss/logits": 0.1627991460263729, "loss/reg": 3.010875701904297, "step": 375 }, { "epoch": 0.00376, "grad_norm": 0.3394787311553955, "grad_norm_var": 0.023335225496050292, "learning_rate": 5e-05, "loss": 0.1605, "loss/crossentropy": 2.811407744884491, "loss/hidden": 0.0, "loss/logits": 0.16051743179559708, "loss/reg": 3.0073623657226562, "step": 376 }, { "epoch": 0.00377, "grad_norm": 0.42454567551612854, "grad_norm_var": 0.02319007765550817, "learning_rate": 5e-05, "loss": 0.1645, "loss/crossentropy": 2.773725748062134, "loss/hidden": 0.0, "loss/logits": 0.16450630128383636, "loss/reg": 3.004185676574707, "step": 377 }, { "epoch": 0.00378, "grad_norm": 0.3412385582923889, "grad_norm_var": 0.001946629707866813, "learning_rate": 5e-05, "loss": 0.1742, "loss/crossentropy": 2.785709500312805, "loss/hidden": 0.0, "loss/logits": 0.17420916631817818, "loss/reg": 3.0007028579711914, "step": 378 }, { "epoch": 0.00379, "grad_norm": 0.3544103503227234, "grad_norm_var": 0.001985417588834304, "learning_rate": 5e-05, "loss": 0.1716, "loss/crossentropy": 2.670408546924591, "loss/hidden": 0.0, "loss/logits": 0.17155754566192627, "loss/reg": 2.9972825050354004, "step": 379 }, { "epoch": 0.0038, "grad_norm": 0.36286091804504395, "grad_norm_var": 0.001963109812450625, "learning_rate": 5e-05, "loss": 0.1786, "loss/crossentropy": 2.822770357131958, "loss/hidden": 0.0, "loss/logits": 0.17864727228879929, "loss/reg": 2.993536949157715, "step": 380 }, { "epoch": 0.00381, "grad_norm": 0.5003440976142883, "grad_norm_var": 0.002294225514870618, "learning_rate": 5e-05, "loss": 0.1896, "loss/crossentropy": 2.790800392627716, "loss/hidden": 0.0, "loss/logits": 0.18961890786886215, "loss/reg": 2.990283489227295, "step": 381 }, { "epoch": 0.00382, "grad_norm": 0.3698294758796692, "grad_norm_var": 0.0022250210916228584, "learning_rate": 5e-05, "loss": 0.1648, "loss/crossentropy": 2.8308547139167786, "loss/hidden": 0.0, "loss/logits": 0.16477400809526443, "loss/reg": 2.986691474914551, "step": 382 }, { "epoch": 0.00383, "grad_norm": 0.36506953835487366, "grad_norm_var": 0.0022229105132923347, "learning_rate": 5e-05, "loss": 0.1682, "loss/crossentropy": 2.744426727294922, "loss/hidden": 0.0, "loss/logits": 0.16819821670651436, "loss/reg": 2.983008861541748, "step": 383 }, { "epoch": 0.00384, "grad_norm": 0.3243113160133362, "grad_norm_var": 0.002390011819316588, "learning_rate": 5e-05, "loss": 0.1596, "loss/crossentropy": 2.89188152551651, "loss/hidden": 0.0, "loss/logits": 0.1595698669552803, "loss/reg": 2.979191541671753, "step": 384 }, { "epoch": 0.00385, "grad_norm": 0.36836785078048706, "grad_norm_var": 0.002273433106164295, "learning_rate": 5e-05, "loss": 0.1747, "loss/crossentropy": 2.902570128440857, "loss/hidden": 0.0, "loss/logits": 0.1746898777782917, "loss/reg": 2.975698947906494, "step": 385 }, { "epoch": 0.00386, "grad_norm": 0.3365325927734375, "grad_norm_var": 0.001915978849994604, "learning_rate": 5e-05, "loss": 0.1608, "loss/crossentropy": 2.766001045703888, "loss/hidden": 0.0, "loss/logits": 0.16081618145108223, "loss/reg": 2.972667694091797, "step": 386 }, { "epoch": 0.00387, "grad_norm": 0.35417604446411133, "grad_norm_var": 0.0019292530227877358, "learning_rate": 5e-05, "loss": 0.1605, "loss/crossentropy": 2.7814798951148987, "loss/hidden": 0.0, "loss/logits": 0.16046970710158348, "loss/reg": 2.969967842102051, "step": 387 }, { "epoch": 0.00388, "grad_norm": 1.9537514448165894, "grad_norm_var": 0.15952536292831518, "learning_rate": 5e-05, "loss": 0.1926, "loss/crossentropy": 2.782427728176117, "loss/hidden": 0.0, "loss/logits": 0.1925731934607029, "loss/reg": 2.9671437740325928, "step": 388 }, { "epoch": 0.00389, "grad_norm": 0.3620118498802185, "grad_norm_var": 0.15907744774636304, "learning_rate": 5e-05, "loss": 0.1714, "loss/crossentropy": 2.716952919960022, "loss/hidden": 0.0, "loss/logits": 0.17136194929480553, "loss/reg": 2.9639716148376465, "step": 389 }, { "epoch": 0.0039, "grad_norm": 0.3765539526939392, "grad_norm_var": 0.1587921781193889, "learning_rate": 5e-05, "loss": 0.1689, "loss/crossentropy": 2.8395472168922424, "loss/hidden": 0.0, "loss/logits": 0.16893662884831429, "loss/reg": 2.9617481231689453, "step": 390 }, { "epoch": 0.00391, "grad_norm": 0.39779096841812134, "grad_norm_var": 0.15815917569478716, "learning_rate": 5e-05, "loss": 0.1677, "loss/crossentropy": 2.813215434551239, "loss/hidden": 0.0, "loss/logits": 0.1676802597939968, "loss/reg": 2.958872079849243, "step": 391 }, { "epoch": 0.00392, "grad_norm": 2.267273187637329, "grad_norm_var": 0.35670344578828533, "learning_rate": 5e-05, "loss": 0.1898, "loss/crossentropy": 2.710484206676483, "loss/hidden": 0.0, "loss/logits": 0.18981827050447464, "loss/reg": 2.9562907218933105, "step": 392 }, { "epoch": 0.00393, "grad_norm": 0.4732659161090851, "grad_norm_var": 0.35576926148027355, "learning_rate": 5e-05, "loss": 0.1789, "loss/crossentropy": 2.7463297247886658, "loss/hidden": 0.0, "loss/logits": 0.17890166491270065, "loss/reg": 2.953505277633667, "step": 393 }, { "epoch": 0.00394, "grad_norm": 0.46487849950790405, "grad_norm_var": 0.3525539310677323, "learning_rate": 5e-05, "loss": 0.1737, "loss/crossentropy": 2.781617820262909, "loss/hidden": 0.0, "loss/logits": 0.17365656793117523, "loss/reg": 2.951185464859009, "step": 394 }, { "epoch": 0.00395, "grad_norm": 0.36613309383392334, "grad_norm_var": 0.352175585204308, "learning_rate": 5e-05, "loss": 0.1749, "loss/crossentropy": 2.9521047472953796, "loss/hidden": 0.0, "loss/logits": 0.1749158501625061, "loss/reg": 2.949521780014038, "step": 395 }, { "epoch": 0.00396, "grad_norm": 0.33889761567115784, "grad_norm_var": 0.35297777688562104, "learning_rate": 5e-05, "loss": 0.1611, "loss/crossentropy": 2.709399461746216, "loss/hidden": 0.0, "loss/logits": 0.161102045327425, "loss/reg": 2.94758677482605, "step": 396 }, { "epoch": 0.00397, "grad_norm": 0.37460586428642273, "grad_norm_var": 0.3556567542520952, "learning_rate": 5e-05, "loss": 0.1709, "loss/crossentropy": 2.8318939208984375, "loss/hidden": 0.0, "loss/logits": 0.1709057316184044, "loss/reg": 2.9449052810668945, "step": 397 }, { "epoch": 0.00398, "grad_norm": 0.36912715435028076, "grad_norm_var": 0.3556777153015602, "learning_rate": 5e-05, "loss": 0.1699, "loss/crossentropy": 2.7699413895606995, "loss/hidden": 0.0, "loss/logits": 0.16991987824440002, "loss/reg": 2.9416375160217285, "step": 398 }, { "epoch": 0.00399, "grad_norm": 0.4335935711860657, "grad_norm_var": 0.35388598085202433, "learning_rate": 5e-05, "loss": 0.1621, "loss/crossentropy": 2.6929262280464172, "loss/hidden": 0.0, "loss/logits": 0.16208457946777344, "loss/reg": 2.9385571479797363, "step": 399 }, { "epoch": 0.004, "grad_norm": 0.36466526985168457, "grad_norm_var": 0.35251743192284957, "learning_rate": 5e-05, "loss": 0.1603, "loss/crossentropy": 2.9028329849243164, "loss/hidden": 0.0, "loss/logits": 0.16026458516716957, "loss/reg": 2.935270071029663, "step": 400 }, { "epoch": 0.00401, "grad_norm": 0.31859657168388367, "grad_norm_var": 0.3542100800677372, "learning_rate": 5e-05, "loss": 0.155, "loss/crossentropy": 2.7707905769348145, "loss/hidden": 0.0, "loss/logits": 0.15500668808817863, "loss/reg": 2.9319961071014404, "step": 401 }, { "epoch": 0.00402, "grad_norm": 0.39714375138282776, "grad_norm_var": 0.35233479687143326, "learning_rate": 5e-05, "loss": 0.1722, "loss/crossentropy": 2.5947351455688477, "loss/hidden": 0.0, "loss/logits": 0.17224042490124702, "loss/reg": 2.929413318634033, "step": 402 }, { "epoch": 0.00403, "grad_norm": 2.267681121826172, "grad_norm_var": 0.518261838886723, "learning_rate": 5e-05, "loss": 0.2313, "loss/crossentropy": 2.703626811504364, "loss/hidden": 0.0, "loss/logits": 0.23128759488463402, "loss/reg": 2.925968647003174, "step": 403 }, { "epoch": 0.00404, "grad_norm": 0.38179340958595276, "grad_norm_var": 0.414193396025083, "learning_rate": 5e-05, "loss": 0.1852, "loss/crossentropy": 2.820598065853119, "loss/hidden": 0.0, "loss/logits": 0.18524771928787231, "loss/reg": 2.9223124980926514, "step": 404 }, { "epoch": 0.00405, "grad_norm": 0.35225149989128113, "grad_norm_var": 0.4145378570625937, "learning_rate": 5e-05, "loss": 0.1672, "loss/crossentropy": 2.761472165584564, "loss/hidden": 0.0, "loss/logits": 0.1672290712594986, "loss/reg": 2.9187073707580566, "step": 405 }, { "epoch": 0.00406, "grad_norm": 0.35987603664398193, "grad_norm_var": 0.4150999685132215, "learning_rate": 5e-05, "loss": 0.1731, "loss/crossentropy": 2.7906153202056885, "loss/hidden": 0.0, "loss/logits": 0.17313192784786224, "loss/reg": 2.915867805480957, "step": 406 }, { "epoch": 0.00407, "grad_norm": 0.36714085936546326, "grad_norm_var": 0.416068714723823, "learning_rate": 5e-05, "loss": 0.1729, "loss/crossentropy": 2.7815486192703247, "loss/hidden": 0.0, "loss/logits": 0.17289156094193459, "loss/reg": 2.9124350547790527, "step": 407 }, { "epoch": 0.00408, "grad_norm": 0.43249061703681946, "grad_norm_var": 0.22313248530400895, "learning_rate": 5e-05, "loss": 0.1886, "loss/crossentropy": 2.7781424522399902, "loss/hidden": 0.0, "loss/logits": 0.18864024803042412, "loss/reg": 2.9094510078430176, "step": 408 }, { "epoch": 0.00409, "grad_norm": 0.34418484568595886, "grad_norm_var": 0.22470081409508588, "learning_rate": 5e-05, "loss": 0.1713, "loss/crossentropy": 2.7818892002105713, "loss/hidden": 0.0, "loss/logits": 0.17131192237138748, "loss/reg": 2.9064505100250244, "step": 409 }, { "epoch": 0.0041, "grad_norm": 0.5792982578277588, "grad_norm_var": 0.2250470715469535, "learning_rate": 5e-05, "loss": 0.1925, "loss/crossentropy": 2.8059155344963074, "loss/hidden": 0.0, "loss/logits": 0.1925133354961872, "loss/reg": 2.9032788276672363, "step": 410 }, { "epoch": 0.00411, "grad_norm": 0.35917767882347107, "grad_norm_var": 0.22517699381034958, "learning_rate": 5e-05, "loss": 0.1597, "loss/crossentropy": 2.9948400259017944, "loss/hidden": 0.0, "loss/logits": 0.1597156822681427, "loss/reg": 2.900125741958618, "step": 411 }, { "epoch": 0.00412, "grad_norm": 0.32936394214630127, "grad_norm_var": 0.2253906803631866, "learning_rate": 5e-05, "loss": 0.16, "loss/crossentropy": 2.8464353680610657, "loss/hidden": 0.0, "loss/logits": 0.15996254980564117, "loss/reg": 2.897120952606201, "step": 412 }, { "epoch": 0.00413, "grad_norm": 0.3636591136455536, "grad_norm_var": 0.22558401797348096, "learning_rate": 5e-05, "loss": 0.1896, "loss/crossentropy": 2.5940242409706116, "loss/hidden": 0.0, "loss/logits": 0.18961919099092484, "loss/reg": 2.8938040733337402, "step": 413 }, { "epoch": 0.00414, "grad_norm": 0.3614409565925598, "grad_norm_var": 0.2257231161008428, "learning_rate": 5e-05, "loss": 0.1669, "loss/crossentropy": 2.9938586950302124, "loss/hidden": 0.0, "loss/logits": 0.16688520461320877, "loss/reg": 2.891237497329712, "step": 414 }, { "epoch": 0.00415, "grad_norm": 0.33793795108795166, "grad_norm_var": 0.2271517945765009, "learning_rate": 5e-05, "loss": 0.1662, "loss/crossentropy": 2.7566803693771362, "loss/hidden": 0.0, "loss/logits": 0.16617370769381523, "loss/reg": 2.888444185256958, "step": 415 }, { "epoch": 0.00416, "grad_norm": 0.33697640895843506, "grad_norm_var": 0.22768012665927795, "learning_rate": 5e-05, "loss": 0.1609, "loss/crossentropy": 2.7538956999778748, "loss/hidden": 0.0, "loss/logits": 0.1609276346862316, "loss/reg": 2.8854165077209473, "step": 416 }, { "epoch": 0.00417, "grad_norm": 0.33163169026374817, "grad_norm_var": 0.22738752034767185, "learning_rate": 5e-05, "loss": 0.1686, "loss/crossentropy": 2.716135025024414, "loss/hidden": 0.0, "loss/logits": 0.16858552396297455, "loss/reg": 2.883502244949341, "step": 417 }, { "epoch": 0.00418, "grad_norm": 0.3197973072528839, "grad_norm_var": 0.22875903165210631, "learning_rate": 5e-05, "loss": 0.1622, "loss/crossentropy": 2.8385114669799805, "loss/hidden": 0.0, "loss/logits": 0.16216163337230682, "loss/reg": 2.8822600841522217, "step": 418 }, { "epoch": 0.00419, "grad_norm": 0.3929068446159363, "grad_norm_var": 0.0038269076397950408, "learning_rate": 5e-05, "loss": 0.1747, "loss/crossentropy": 2.7871418595314026, "loss/hidden": 0.0, "loss/logits": 0.17471741139888763, "loss/reg": 2.8793692588806152, "step": 419 }, { "epoch": 0.0042, "grad_norm": 0.3870161473751068, "grad_norm_var": 0.0038355224442556198, "learning_rate": 5e-05, "loss": 0.1704, "loss/crossentropy": 2.7993595600128174, "loss/hidden": 0.0, "loss/logits": 0.1703585907816887, "loss/reg": 2.8776159286499023, "step": 420 }, { "epoch": 0.00421, "grad_norm": 0.35682201385498047, "grad_norm_var": 0.0038246732894099337, "learning_rate": 5e-05, "loss": 0.1886, "loss/crossentropy": 2.655856966972351, "loss/hidden": 0.0, "loss/logits": 0.18859218060970306, "loss/reg": 2.87612247467041, "step": 421 }, { "epoch": 0.00422, "grad_norm": 0.33115604519844055, "grad_norm_var": 0.003924500155300174, "learning_rate": 5e-05, "loss": 0.1611, "loss/crossentropy": 2.8695462942123413, "loss/hidden": 0.0, "loss/logits": 0.1611364483833313, "loss/reg": 2.873897075653076, "step": 422 }, { "epoch": 0.00423, "grad_norm": 0.4912989139556885, "grad_norm_var": 0.004829238325957341, "learning_rate": 5e-05, "loss": 0.1784, "loss/crossentropy": 2.7167177200317383, "loss/hidden": 0.0, "loss/logits": 0.17839327454566956, "loss/reg": 2.8718714714050293, "step": 423 }, { "epoch": 0.00424, "grad_norm": 0.3349898159503937, "grad_norm_var": 0.004720821391959795, "learning_rate": 5e-05, "loss": 0.1615, "loss/crossentropy": 2.7473002076148987, "loss/hidden": 0.0, "loss/logits": 0.1615452691912651, "loss/reg": 2.8705270290374756, "step": 424 }, { "epoch": 0.00425, "grad_norm": 0.3930635154247284, "grad_norm_var": 0.004686561363231038, "learning_rate": 5e-05, "loss": 0.1775, "loss/crossentropy": 2.785146713256836, "loss/hidden": 0.0, "loss/logits": 0.17748162522912025, "loss/reg": 2.868584394454956, "step": 425 }, { "epoch": 0.00426, "grad_norm": 0.3448260426521301, "grad_norm_var": 0.0017484410160554464, "learning_rate": 5e-05, "loss": 0.1758, "loss/crossentropy": 2.726165235042572, "loss/hidden": 0.0, "loss/logits": 0.17576807364821434, "loss/reg": 2.8673741817474365, "step": 426 }, { "epoch": 0.00427, "grad_norm": 0.3846610188484192, "grad_norm_var": 0.0017836724819081718, "learning_rate": 5e-05, "loss": 0.1762, "loss/crossentropy": 2.7086002230644226, "loss/hidden": 0.0, "loss/logits": 0.1762431263923645, "loss/reg": 2.8651485443115234, "step": 427 }, { "epoch": 0.00428, "grad_norm": 0.3494791090488434, "grad_norm_var": 0.0017205006490997802, "learning_rate": 5e-05, "loss": 0.1818, "loss/crossentropy": 2.7305288314819336, "loss/hidden": 0.0, "loss/logits": 0.18181117624044418, "loss/reg": 2.8629541397094727, "step": 428 }, { "epoch": 0.00429, "grad_norm": 0.3337409794330597, "grad_norm_var": 0.0017762239427149495, "learning_rate": 5e-05, "loss": 0.1661, "loss/crossentropy": 2.840167284011841, "loss/hidden": 0.0, "loss/logits": 0.1660769209265709, "loss/reg": 2.8615217208862305, "step": 429 }, { "epoch": 0.0043, "grad_norm": 0.4685284495353699, "grad_norm_var": 0.0024887722894077887, "learning_rate": 5e-05, "loss": 0.1741, "loss/crossentropy": 2.7593295574188232, "loss/hidden": 0.0, "loss/logits": 0.1740834154188633, "loss/reg": 2.860365629196167, "step": 430 }, { "epoch": 0.00431, "grad_norm": 0.35651838779449463, "grad_norm_var": 0.002434815976952542, "learning_rate": 5e-05, "loss": 0.1673, "loss/crossentropy": 2.777701735496521, "loss/hidden": 0.0, "loss/logits": 0.16725125908851624, "loss/reg": 2.858222007751465, "step": 431 }, { "epoch": 0.00432, "grad_norm": 0.34670454263687134, "grad_norm_var": 0.0023984303943363817, "learning_rate": 5e-05, "loss": 0.165, "loss/crossentropy": 2.749099850654602, "loss/hidden": 0.0, "loss/logits": 0.16504037007689476, "loss/reg": 2.855973958969116, "step": 432 }, { "epoch": 0.00433, "grad_norm": 0.3284713923931122, "grad_norm_var": 0.0024153046998328874, "learning_rate": 5e-05, "loss": 0.1521, "loss/crossentropy": 2.7869237661361694, "loss/hidden": 0.0, "loss/logits": 0.15208067372441292, "loss/reg": 2.853942632675171, "step": 433 }, { "epoch": 0.00434, "grad_norm": 0.48883649706840515, "grad_norm_var": 0.0030697262784900037, "learning_rate": 5e-05, "loss": 0.1759, "loss/crossentropy": 2.9403671622276306, "loss/hidden": 0.0, "loss/logits": 0.17589127644896507, "loss/reg": 2.851728916168213, "step": 434 }, { "epoch": 0.00435, "grad_norm": 0.36951273679733276, "grad_norm_var": 0.0030654307324534447, "learning_rate": 5e-05, "loss": 0.185, "loss/crossentropy": 2.7797312140464783, "loss/hidden": 0.0, "loss/logits": 0.1850355602800846, "loss/reg": 2.8488640785217285, "step": 435 }, { "epoch": 0.00436, "grad_norm": 0.4184967577457428, "grad_norm_var": 0.0031605906698184564, "learning_rate": 5e-05, "loss": 0.1855, "loss/crossentropy": 2.812410533428192, "loss/hidden": 0.0, "loss/logits": 0.18553681299090385, "loss/reg": 2.8465514183044434, "step": 436 }, { "epoch": 0.00437, "grad_norm": 0.4329560101032257, "grad_norm_var": 0.0032767273553133062, "learning_rate": 5e-05, "loss": 0.1784, "loss/crossentropy": 2.840768814086914, "loss/hidden": 0.0, "loss/logits": 0.17843929678201675, "loss/reg": 2.844315767288208, "step": 437 }, { "epoch": 0.00438, "grad_norm": 0.6038658022880554, "grad_norm_var": 0.005936964872234999, "learning_rate": 5e-05, "loss": 0.1949, "loss/crossentropy": 2.7183879017829895, "loss/hidden": 0.0, "loss/logits": 0.19491342082619667, "loss/reg": 2.842548131942749, "step": 438 }, { "epoch": 0.00439, "grad_norm": 0.4069391191005707, "grad_norm_var": 0.005387125873613382, "learning_rate": 5e-05, "loss": 0.1875, "loss/crossentropy": 2.7780433297157288, "loss/hidden": 0.0, "loss/logits": 0.18749799579381943, "loss/reg": 2.840106964111328, "step": 439 }, { "epoch": 0.0044, "grad_norm": 0.35941290855407715, "grad_norm_var": 0.005220523762257064, "learning_rate": 5e-05, "loss": 0.1639, "loss/crossentropy": 2.7595601081848145, "loss/hidden": 0.0, "loss/logits": 0.1638675332069397, "loss/reg": 2.8378958702087402, "step": 440 }, { "epoch": 0.00441, "grad_norm": 0.3669149875640869, "grad_norm_var": 0.005284393934492052, "learning_rate": 5e-05, "loss": 0.1714, "loss/crossentropy": 2.673116147518158, "loss/hidden": 0.0, "loss/logits": 0.1713937260210514, "loss/reg": 2.8351917266845703, "step": 441 }, { "epoch": 0.00442, "grad_norm": 0.3643859922885895, "grad_norm_var": 0.0051709546313713755, "learning_rate": 5e-05, "loss": 0.1795, "loss/crossentropy": 2.8026832342147827, "loss/hidden": 0.0, "loss/logits": 0.17951133847236633, "loss/reg": 2.8323209285736084, "step": 442 }, { "epoch": 0.00443, "grad_norm": 0.34250232577323914, "grad_norm_var": 0.005361033629373261, "learning_rate": 5e-05, "loss": 0.1729, "loss/crossentropy": 2.7829501032829285, "loss/hidden": 0.0, "loss/logits": 0.17286691814661026, "loss/reg": 2.8302509784698486, "step": 443 }, { "epoch": 0.00444, "grad_norm": 0.3323063552379608, "grad_norm_var": 0.005486165176397177, "learning_rate": 5e-05, "loss": 0.1685, "loss/crossentropy": 2.7025471329689026, "loss/hidden": 0.0, "loss/logits": 0.1684984639286995, "loss/reg": 2.827455997467041, "step": 444 }, { "epoch": 0.00445, "grad_norm": 0.35889074206352234, "grad_norm_var": 0.005320257567319386, "learning_rate": 5e-05, "loss": 0.1872, "loss/crossentropy": 2.6426368355751038, "loss/hidden": 0.0, "loss/logits": 0.18716050684452057, "loss/reg": 2.8253841400146484, "step": 445 }, { "epoch": 0.00446, "grad_norm": 0.39696604013442993, "grad_norm_var": 0.004953801905317123, "learning_rate": 5e-05, "loss": 0.1844, "loss/crossentropy": 2.6918662786483765, "loss/hidden": 0.0, "loss/logits": 0.18442435935139656, "loss/reg": 2.8228838443756104, "step": 446 }, { "epoch": 0.00447, "grad_norm": 0.3320043981075287, "grad_norm_var": 0.005107676487313561, "learning_rate": 5e-05, "loss": 0.1689, "loss/crossentropy": 2.7309769988059998, "loss/hidden": 0.0, "loss/logits": 0.16887113079428673, "loss/reg": 2.8211417198181152, "step": 447 }, { "epoch": 0.00448, "grad_norm": 0.3350951373577118, "grad_norm_var": 0.0051840048892141, "learning_rate": 5e-05, "loss": 0.1736, "loss/crossentropy": 2.7696202397346497, "loss/hidden": 0.0, "loss/logits": 0.17358100041747093, "loss/reg": 2.818211793899536, "step": 448 }, { "epoch": 0.00449, "grad_norm": 0.35995370149612427, "grad_norm_var": 0.004988316730949372, "learning_rate": 5e-05, "loss": 0.185, "loss/crossentropy": 2.7628800868988037, "loss/hidden": 0.0, "loss/logits": 0.1849852055311203, "loss/reg": 2.8163435459136963, "step": 449 }, { "epoch": 0.0045, "grad_norm": 0.3433259427547455, "grad_norm_var": 0.004429295151525636, "learning_rate": 5e-05, "loss": 0.1776, "loss/crossentropy": 2.92121821641922, "loss/hidden": 0.0, "loss/logits": 0.1775917150080204, "loss/reg": 2.8140530586242676, "step": 450 }, { "epoch": 0.00451, "grad_norm": 0.3525676131248474, "grad_norm_var": 0.004477082320186199, "learning_rate": 5e-05, "loss": 0.1765, "loss/crossentropy": 2.8223352432250977, "loss/hidden": 0.0, "loss/logits": 0.1765166036784649, "loss/reg": 2.8114964962005615, "step": 451 }, { "epoch": 0.00452, "grad_norm": 0.3349536955356598, "grad_norm_var": 0.004502986709870172, "learning_rate": 5e-05, "loss": 0.1617, "loss/crossentropy": 2.58266818523407, "loss/hidden": 0.0, "loss/logits": 0.16168920323252678, "loss/reg": 2.8082778453826904, "step": 452 }, { "epoch": 0.00453, "grad_norm": 0.3272739350795746, "grad_norm_var": 0.004404667304666754, "learning_rate": 5e-05, "loss": 0.1606, "loss/crossentropy": 2.791221022605896, "loss/hidden": 0.0, "loss/logits": 0.16057174652814865, "loss/reg": 2.8055834770202637, "step": 453 }, { "epoch": 0.00454, "grad_norm": 0.35802412033081055, "grad_norm_var": 0.0005107777789474354, "learning_rate": 5e-05, "loss": 0.1758, "loss/crossentropy": 2.856186628341675, "loss/hidden": 0.0, "loss/logits": 0.17580854520201683, "loss/reg": 2.8030307292938232, "step": 454 }, { "epoch": 0.00455, "grad_norm": 0.34605538845062256, "grad_norm_var": 0.0003165176266941239, "learning_rate": 5e-05, "loss": 0.1651, "loss/crossentropy": 2.734806716442108, "loss/hidden": 0.0, "loss/logits": 0.16513444855809212, "loss/reg": 2.7998156547546387, "step": 455 }, { "epoch": 0.00456, "grad_norm": 0.35396888852119446, "grad_norm_var": 0.0003120198180476634, "learning_rate": 5e-05, "loss": 0.1701, "loss/crossentropy": 2.8904529213905334, "loss/hidden": 0.0, "loss/logits": 0.17014532163739204, "loss/reg": 2.796231508255005, "step": 456 }, { "epoch": 0.00457, "grad_norm": 0.3613145649433136, "grad_norm_var": 0.00030159148728288546, "learning_rate": 5e-05, "loss": 0.1835, "loss/crossentropy": 2.813112735748291, "loss/hidden": 0.0, "loss/logits": 0.1834680140018463, "loss/reg": 2.7929091453552246, "step": 457 }, { "epoch": 0.00458, "grad_norm": 0.36372610926628113, "grad_norm_var": 0.00030035069871777733, "learning_rate": 5e-05, "loss": 0.167, "loss/crossentropy": 2.795239508152008, "loss/hidden": 0.0, "loss/logits": 0.1669648103415966, "loss/reg": 2.789707899093628, "step": 458 }, { "epoch": 0.00459, "grad_norm": 0.3581913113594055, "grad_norm_var": 0.00030019062479412403, "learning_rate": 5e-05, "loss": 0.165, "loss/crossentropy": 2.797567903995514, "loss/hidden": 0.0, "loss/logits": 0.16498373076319695, "loss/reg": 2.786154270172119, "step": 459 }, { "epoch": 0.0046, "grad_norm": 0.3571149408817291, "grad_norm_var": 0.00027710791712463786, "learning_rate": 5e-05, "loss": 0.161, "loss/crossentropy": 2.743883192539215, "loss/hidden": 0.0, "loss/logits": 0.16101711615920067, "loss/reg": 2.7824792861938477, "step": 460 }, { "epoch": 0.00461, "grad_norm": 0.356715589761734, "grad_norm_var": 0.0002755397827386948, "learning_rate": 5e-05, "loss": 0.1732, "loss/crossentropy": 2.724743604660034, "loss/hidden": 0.0, "loss/logits": 0.1731928214430809, "loss/reg": 2.778890371322632, "step": 461 }, { "epoch": 0.00462, "grad_norm": 0.3243059813976288, "grad_norm_var": 0.00017305590364662023, "learning_rate": 5e-05, "loss": 0.1592, "loss/crossentropy": 2.7731017470359802, "loss/hidden": 0.0, "loss/logits": 0.15923070535063744, "loss/reg": 2.775613784790039, "step": 462 }, { "epoch": 0.00463, "grad_norm": 0.3843972980976105, "grad_norm_var": 0.00023436686166613171, "learning_rate": 5e-05, "loss": 0.1776, "loss/crossentropy": 2.6426811814308167, "loss/hidden": 0.0, "loss/logits": 0.17761223763227463, "loss/reg": 2.772561550140381, "step": 463 }, { "epoch": 0.00464, "grad_norm": 0.3468632102012634, "grad_norm_var": 0.00021796986892265539, "learning_rate": 5e-05, "loss": 0.1742, "loss/crossentropy": 2.882816791534424, "loss/hidden": 0.0, "loss/logits": 0.17419364303350449, "loss/reg": 2.769129991531372, "step": 464 }, { "epoch": 0.00465, "grad_norm": 0.3967267870903015, "grad_norm_var": 0.0003424789629975648, "learning_rate": 5e-05, "loss": 0.1785, "loss/crossentropy": 2.9460648894309998, "loss/hidden": 0.0, "loss/logits": 0.1784828118979931, "loss/reg": 2.767021656036377, "step": 465 }, { "epoch": 0.00466, "grad_norm": 0.36927327513694763, "grad_norm_var": 0.0003472996962895862, "learning_rate": 5e-05, "loss": 0.1723, "loss/crossentropy": 2.7243736386299133, "loss/hidden": 0.0, "loss/logits": 0.17225344851613045, "loss/reg": 2.7637596130371094, "step": 466 }, { "epoch": 0.00467, "grad_norm": 0.6855224967002869, "grad_norm_var": 0.007136168552583877, "learning_rate": 5e-05, "loss": 0.1725, "loss/crossentropy": 2.9400131702423096, "loss/hidden": 0.0, "loss/logits": 0.17254997044801712, "loss/reg": 2.7604963779449463, "step": 467 }, { "epoch": 0.00468, "grad_norm": 0.41388872265815735, "grad_norm_var": 0.007088047286249225, "learning_rate": 5e-05, "loss": 0.1608, "loss/crossentropy": 2.7375746369361877, "loss/hidden": 0.0, "loss/logits": 0.16082635894417763, "loss/reg": 2.7572667598724365, "step": 468 }, { "epoch": 0.00469, "grad_norm": 0.3997427225112915, "grad_norm_var": 0.006892705403346755, "learning_rate": 5e-05, "loss": 0.1704, "loss/crossentropy": 2.8761582374572754, "loss/hidden": 0.0, "loss/logits": 0.17043552175164223, "loss/reg": 2.754149913787842, "step": 469 }, { "epoch": 0.0047, "grad_norm": 0.33424052596092224, "grad_norm_var": 0.007016741295476203, "learning_rate": 5e-05, "loss": 0.1652, "loss/crossentropy": 2.8255309462547302, "loss/hidden": 0.0, "loss/logits": 0.16519855335354805, "loss/reg": 2.751276731491089, "step": 470 }, { "epoch": 0.00471, "grad_norm": 0.41415977478027344, "grad_norm_var": 0.006957502567752493, "learning_rate": 5e-05, "loss": 0.1832, "loss/crossentropy": 2.676911950111389, "loss/hidden": 0.0, "loss/logits": 0.18318749964237213, "loss/reg": 2.7486090660095215, "step": 471 }, { "epoch": 0.00472, "grad_norm": 0.3688299357891083, "grad_norm_var": 0.006902369057221236, "learning_rate": 5e-05, "loss": 0.162, "loss/crossentropy": 2.6736281514167786, "loss/hidden": 0.0, "loss/logits": 0.16196409612894058, "loss/reg": 2.745225667953491, "step": 472 }, { "epoch": 0.00473, "grad_norm": 0.42543119192123413, "grad_norm_var": 0.006916738926360373, "learning_rate": 5e-05, "loss": 0.1685, "loss/crossentropy": 2.7871673703193665, "loss/hidden": 0.0, "loss/logits": 0.16850638762116432, "loss/reg": 2.7421023845672607, "step": 473 }, { "epoch": 0.00474, "grad_norm": 0.3542870581150055, "grad_norm_var": 0.006960025235757868, "learning_rate": 5e-05, "loss": 0.1695, "loss/crossentropy": 2.727401077747345, "loss/hidden": 0.0, "loss/logits": 0.1695132479071617, "loss/reg": 2.738083839416504, "step": 474 }, { "epoch": 0.00475, "grad_norm": 0.42508408427238464, "grad_norm_var": 0.006928287935252916, "learning_rate": 5e-05, "loss": 0.1967, "loss/crossentropy": 2.756391167640686, "loss/hidden": 0.0, "loss/logits": 0.19667796790599823, "loss/reg": 2.734565496444702, "step": 475 }, { "epoch": 0.00476, "grad_norm": 0.3327108919620514, "grad_norm_var": 0.007096223362361916, "learning_rate": 5e-05, "loss": 0.1698, "loss/crossentropy": 2.662286937236786, "loss/hidden": 0.0, "loss/logits": 0.16976173967123032, "loss/reg": 2.730898141860962, "step": 476 }, { "epoch": 0.00477, "grad_norm": 0.3538263142108917, "grad_norm_var": 0.007111786918880665, "learning_rate": 5e-05, "loss": 0.1728, "loss/crossentropy": 2.6600981950759888, "loss/hidden": 0.0, "loss/logits": 0.17283405736088753, "loss/reg": 2.7276227474212646, "step": 477 }, { "epoch": 0.00478, "grad_norm": 0.6719810962677002, "grad_norm_var": 0.011362604241119423, "learning_rate": 5e-05, "loss": 0.1956, "loss/crossentropy": 2.9192944169044495, "loss/hidden": 0.0, "loss/logits": 0.1955549158155918, "loss/reg": 2.7246785163879395, "step": 478 }, { "epoch": 0.00479, "grad_norm": 0.40175002813339233, "grad_norm_var": 0.011305273259017534, "learning_rate": 5e-05, "loss": 0.1707, "loss/crossentropy": 2.8099315762519836, "loss/hidden": 0.0, "loss/logits": 0.1706707924604416, "loss/reg": 2.721214771270752, "step": 479 }, { "epoch": 0.0048, "grad_norm": 0.5700014233589172, "grad_norm_var": 0.012288996380569357, "learning_rate": 5e-05, "loss": 0.175, "loss/crossentropy": 2.764845371246338, "loss/hidden": 0.0, "loss/logits": 0.17503220587968826, "loss/reg": 2.71852970123291, "step": 480 }, { "epoch": 0.00481, "grad_norm": 0.3602856993675232, "grad_norm_var": 0.012545036289332723, "learning_rate": 5e-05, "loss": 0.1675, "loss/crossentropy": 2.817295730113983, "loss/hidden": 0.0, "loss/logits": 0.16745564714074135, "loss/reg": 2.7155778408050537, "step": 481 }, { "epoch": 0.00482, "grad_norm": 0.37470605969429016, "grad_norm_var": 0.012502846327791594, "learning_rate": 5e-05, "loss": 0.1722, "loss/crossentropy": 2.7710434794425964, "loss/hidden": 0.0, "loss/logits": 0.172159094363451, "loss/reg": 2.7133727073669434, "step": 482 }, { "epoch": 0.00483, "grad_norm": 0.319488525390625, "grad_norm_var": 0.008425663660975724, "learning_rate": 5e-05, "loss": 0.1524, "loss/crossentropy": 2.783412456512451, "loss/hidden": 0.0, "loss/logits": 0.15241163223981857, "loss/reg": 2.710848331451416, "step": 483 }, { "epoch": 0.00484, "grad_norm": 0.3474343419075012, "grad_norm_var": 0.008645296689366684, "learning_rate": 5e-05, "loss": 0.1582, "loss/crossentropy": 2.8712441325187683, "loss/hidden": 0.0, "loss/logits": 0.1582440249621868, "loss/reg": 2.708759307861328, "step": 484 }, { "epoch": 0.00485, "grad_norm": 0.3881974518299103, "grad_norm_var": 0.008659215056144554, "learning_rate": 5e-05, "loss": 0.168, "loss/crossentropy": 2.7801290154457092, "loss/hidden": 0.0, "loss/logits": 0.16804108768701553, "loss/reg": 2.707090139389038, "step": 485 }, { "epoch": 0.00486, "grad_norm": 0.3865320086479187, "grad_norm_var": 0.008353144350497502, "learning_rate": 5e-05, "loss": 0.1622, "loss/crossentropy": 2.762259840965271, "loss/hidden": 0.0, "loss/logits": 0.1622123382985592, "loss/reg": 2.704341173171997, "step": 486 }, { "epoch": 0.00487, "grad_norm": 0.3601287007331848, "grad_norm_var": 0.008476237288047564, "learning_rate": 5e-05, "loss": 0.1717, "loss/crossentropy": 2.7640222311019897, "loss/hidden": 0.0, "loss/logits": 0.1716899275779724, "loss/reg": 2.702449321746826, "step": 487 }, { "epoch": 0.00488, "grad_norm": 0.3476349711418152, "grad_norm_var": 0.008599584577097493, "learning_rate": 5e-05, "loss": 0.1771, "loss/crossentropy": 2.8057321906089783, "loss/hidden": 0.0, "loss/logits": 0.1770944595336914, "loss/reg": 2.6996164321899414, "step": 488 }, { "epoch": 0.00489, "grad_norm": 0.35880380868911743, "grad_norm_var": 0.008661929013581293, "learning_rate": 5e-05, "loss": 0.1759, "loss/crossentropy": 2.7546836137771606, "loss/hidden": 0.0, "loss/logits": 0.17589908093214035, "loss/reg": 2.69681978225708, "step": 489 }, { "epoch": 0.0049, "grad_norm": 0.3216891586780548, "grad_norm_var": 0.008914221483014847, "learning_rate": 5e-05, "loss": 0.1628, "loss/crossentropy": 2.755846858024597, "loss/hidden": 0.0, "loss/logits": 0.16277150437235832, "loss/reg": 2.6948180198669434, "step": 490 }, { "epoch": 0.00491, "grad_norm": 0.3739294409751892, "grad_norm_var": 0.008872687766587003, "learning_rate": 5e-05, "loss": 0.1737, "loss/crossentropy": 2.758453607559204, "loss/hidden": 0.0, "loss/logits": 0.173715490847826, "loss/reg": 2.692713737487793, "step": 491 }, { "epoch": 0.00492, "grad_norm": 0.3633546531200409, "grad_norm_var": 0.008689872848313067, "learning_rate": 5e-05, "loss": 0.1755, "loss/crossentropy": 2.756626844406128, "loss/hidden": 0.0, "loss/logits": 0.17549088224768639, "loss/reg": 2.6911702156066895, "step": 492 }, { "epoch": 0.00493, "grad_norm": 0.4165309965610504, "grad_norm_var": 0.00860196217059566, "learning_rate": 5e-05, "loss": 0.1882, "loss/crossentropy": 2.69485205411911, "loss/hidden": 0.0, "loss/logits": 0.18822569772601128, "loss/reg": 2.6890034675598145, "step": 493 }, { "epoch": 0.00494, "grad_norm": 0.34585461020469666, "grad_norm_var": 0.0033206140596304815, "learning_rate": 5e-05, "loss": 0.1742, "loss/crossentropy": 2.73829984664917, "loss/hidden": 0.0, "loss/logits": 0.17416464537382126, "loss/reg": 2.687713623046875, "step": 494 }, { "epoch": 0.00495, "grad_norm": 0.3443280756473541, "grad_norm_var": 0.003339269529387142, "learning_rate": 5e-05, "loss": 0.1613, "loss/crossentropy": 2.745963931083679, "loss/hidden": 0.0, "loss/logits": 0.161319550126791, "loss/reg": 2.685638189315796, "step": 495 }, { "epoch": 0.00496, "grad_norm": 0.3689098656177521, "grad_norm_var": 0.000602855553897171, "learning_rate": 5e-05, "loss": 0.1735, "loss/crossentropy": 2.6744813919067383, "loss/hidden": 0.0, "loss/logits": 0.17346932739019394, "loss/reg": 2.6839358806610107, "step": 496 }, { "epoch": 0.00497, "grad_norm": 0.36457374691963196, "grad_norm_var": 0.0006035317496342747, "learning_rate": 5e-05, "loss": 0.1735, "loss/crossentropy": 2.799642562866211, "loss/hidden": 0.0, "loss/logits": 0.17352834343910217, "loss/reg": 2.683485507965088, "step": 497 }, { "epoch": 0.00498, "grad_norm": 0.35222312808036804, "grad_norm_var": 0.0005951796117876367, "learning_rate": 5e-05, "loss": 0.1742, "loss/crossentropy": 2.739416480064392, "loss/hidden": 0.0, "loss/logits": 0.1742345169186592, "loss/reg": 2.6830482482910156, "step": 498 }, { "epoch": 0.00499, "grad_norm": 0.3427422344684601, "grad_norm_var": 0.0005034448418147264, "learning_rate": 5e-05, "loss": 0.1673, "loss/crossentropy": 2.772252082824707, "loss/hidden": 0.0, "loss/logits": 0.16728588938713074, "loss/reg": 2.682189464569092, "step": 499 }, { "epoch": 0.005, "grad_norm": 0.39299577474594116, "grad_norm_var": 0.0005481683329227494, "learning_rate": 5e-05, "loss": 0.1926, "loss/crossentropy": 2.7702752351760864, "loss/hidden": 0.0, "loss/logits": 0.1926349699497223, "loss/reg": 2.6808254718780518, "step": 500 }, { "epoch": 0.00501, "grad_norm": 0.3431949019432068, "grad_norm_var": 0.0005312130675710792, "learning_rate": 5e-05, "loss": 0.1634, "loss/crossentropy": 2.7881234288215637, "loss/hidden": 0.0, "loss/logits": 0.1633942425251007, "loss/reg": 2.6798629760742188, "step": 501 }, { "epoch": 0.00502, "grad_norm": 0.36641839146614075, "grad_norm_var": 0.0004892704880637311, "learning_rate": 5e-05, "loss": 0.1787, "loss/crossentropy": 2.848407030105591, "loss/hidden": 0.0, "loss/logits": 0.17872987315058708, "loss/reg": 2.677311658859253, "step": 502 }, { "epoch": 0.00503, "grad_norm": 0.3278079330921173, "grad_norm_var": 0.000554897538649816, "learning_rate": 5e-05, "loss": 0.1587, "loss/crossentropy": 2.75662362575531, "loss/hidden": 0.0, "loss/logits": 0.15868044644594193, "loss/reg": 2.675185203552246, "step": 503 }, { "epoch": 0.00504, "grad_norm": 0.3251039683818817, "grad_norm_var": 0.0006183250665441046, "learning_rate": 5e-05, "loss": 0.1551, "loss/crossentropy": 2.731416165828705, "loss/hidden": 0.0, "loss/logits": 0.15506618097424507, "loss/reg": 2.673948287963867, "step": 504 }, { "epoch": 0.00505, "grad_norm": 0.35344070196151733, "grad_norm_var": 0.0006186746986458047, "learning_rate": 5e-05, "loss": 0.167, "loss/crossentropy": 2.740668296813965, "loss/hidden": 0.0, "loss/logits": 0.16695522889494896, "loss/reg": 2.6712498664855957, "step": 505 }, { "epoch": 0.00506, "grad_norm": 0.36658284068107605, "grad_norm_var": 0.0005366058949143918, "learning_rate": 5e-05, "loss": 0.1668, "loss/crossentropy": 2.802608013153076, "loss/hidden": 0.0, "loss/logits": 0.16682763025164604, "loss/reg": 2.669286012649536, "step": 506 }, { "epoch": 0.00507, "grad_norm": 0.4423954486846924, "grad_norm_var": 0.000963591213409624, "learning_rate": 5e-05, "loss": 0.1963, "loss/crossentropy": 2.811932861804962, "loss/hidden": 0.0, "loss/logits": 0.19632378965616226, "loss/reg": 2.666898012161255, "step": 507 }, { "epoch": 0.00508, "grad_norm": 0.3770610988140106, "grad_norm_var": 0.000975015024308116, "learning_rate": 5e-05, "loss": 0.1753, "loss/crossentropy": 2.7279282808303833, "loss/hidden": 0.0, "loss/logits": 0.17527905479073524, "loss/reg": 2.664764881134033, "step": 508 }, { "epoch": 0.00509, "grad_norm": 0.35589146614074707, "grad_norm_var": 0.0007832244440521922, "learning_rate": 5e-05, "loss": 0.168, "loss/crossentropy": 2.8977367281913757, "loss/hidden": 0.0, "loss/logits": 0.16798892244696617, "loss/reg": 2.6622438430786133, "step": 509 }, { "epoch": 0.0051, "grad_norm": 0.3419097661972046, "grad_norm_var": 0.0007919503322765919, "learning_rate": 5e-05, "loss": 0.1729, "loss/crossentropy": 2.6906025409698486, "loss/hidden": 0.0, "loss/logits": 0.172856405377388, "loss/reg": 2.6595211029052734, "step": 510 }, { "epoch": 0.00511, "grad_norm": 0.3972381353378296, "grad_norm_var": 0.0008538971282195384, "learning_rate": 5e-05, "loss": 0.1825, "loss/crossentropy": 2.762513279914856, "loss/hidden": 0.0, "loss/logits": 0.18245646730065346, "loss/reg": 2.6572320461273193, "step": 511 }, { "epoch": 0.00512, "grad_norm": 0.3489353358745575, "grad_norm_var": 0.0008648399289393501, "learning_rate": 5e-05, "loss": 0.179, "loss/crossentropy": 2.7472071647644043, "loss/hidden": 0.0, "loss/logits": 0.1789936050772667, "loss/reg": 2.6544342041015625, "step": 512 }, { "epoch": 0.00513, "grad_norm": 0.3673308491706848, "grad_norm_var": 0.0008661114894439326, "learning_rate": 5e-05, "loss": 0.1745, "loss/crossentropy": 2.69700163602829, "loss/hidden": 0.0, "loss/logits": 0.1745261810719967, "loss/reg": 2.6520497798919678, "step": 513 }, { "epoch": 0.00514, "grad_norm": 0.33870744705200195, "grad_norm_var": 0.0008961917113334199, "learning_rate": 5e-05, "loss": 0.1649, "loss/crossentropy": 2.762860357761383, "loss/hidden": 0.0, "loss/logits": 0.16494135558605194, "loss/reg": 2.6500847339630127, "step": 514 }, { "epoch": 0.00515, "grad_norm": 0.40411266684532166, "grad_norm_var": 0.0009761766654230563, "learning_rate": 5e-05, "loss": 0.1628, "loss/crossentropy": 3.01085501909256, "loss/hidden": 0.0, "loss/logits": 0.16284478455781937, "loss/reg": 2.648311138153076, "step": 515 }, { "epoch": 0.00516, "grad_norm": 0.37194308638572693, "grad_norm_var": 0.0009268939874421604, "learning_rate": 5e-05, "loss": 0.182, "loss/crossentropy": 2.721080005168915, "loss/hidden": 0.0, "loss/logits": 0.18202906847000122, "loss/reg": 2.6469640731811523, "step": 516 }, { "epoch": 0.00517, "grad_norm": 0.3380615711212158, "grad_norm_var": 0.0009429551352979477, "learning_rate": 5e-05, "loss": 0.1639, "loss/crossentropy": 2.6788495779037476, "loss/hidden": 0.0, "loss/logits": 0.16385124996304512, "loss/reg": 2.6445441246032715, "step": 517 }, { "epoch": 0.00518, "grad_norm": 0.37696361541748047, "grad_norm_var": 0.0009533986625055632, "learning_rate": 5e-05, "loss": 0.1587, "loss/crossentropy": 2.6845511198043823, "loss/hidden": 0.0, "loss/logits": 0.1586880125105381, "loss/reg": 2.6424736976623535, "step": 518 }, { "epoch": 0.00519, "grad_norm": 0.32983675599098206, "grad_norm_var": 0.0009437052369864992, "learning_rate": 5e-05, "loss": 0.1585, "loss/crossentropy": 2.5984672904014587, "loss/hidden": 0.0, "loss/logits": 0.15849433466792107, "loss/reg": 2.639796257019043, "step": 519 }, { "epoch": 0.0052, "grad_norm": 0.3439983129501343, "grad_norm_var": 0.000866215802107521, "learning_rate": 5e-05, "loss": 0.1578, "loss/crossentropy": 2.7057528495788574, "loss/hidden": 0.0, "loss/logits": 0.15775253251194954, "loss/reg": 2.636976480484009, "step": 520 }, { "epoch": 0.00521, "grad_norm": 0.4739494323730469, "grad_norm_var": 0.0015736599047053415, "learning_rate": 5e-05, "loss": 0.178, "loss/crossentropy": 2.6239394545555115, "loss/hidden": 0.0, "loss/logits": 0.17801255360245705, "loss/reg": 2.6342852115631104, "step": 521 }, { "epoch": 0.00522, "grad_norm": 0.5270029306411743, "grad_norm_var": 0.003035565907296726, "learning_rate": 5e-05, "loss": 0.1801, "loss/crossentropy": 2.890467643737793, "loss/hidden": 0.0, "loss/logits": 0.18007512018084526, "loss/reg": 2.631289005279541, "step": 522 }, { "epoch": 0.00523, "grad_norm": 0.42719003558158875, "grad_norm_var": 0.002930528350278516, "learning_rate": 5e-05, "loss": 0.1781, "loss/crossentropy": 2.9749565720558167, "loss/hidden": 0.0, "loss/logits": 0.17812742665410042, "loss/reg": 2.6284420490264893, "step": 523 }, { "epoch": 0.00524, "grad_norm": 0.37133005261421204, "grad_norm_var": 0.0029367435634455913, "learning_rate": 5e-05, "loss": 0.1597, "loss/crossentropy": 2.692670702934265, "loss/hidden": 0.0, "loss/logits": 0.1597190946340561, "loss/reg": 2.6251702308654785, "step": 524 }, { "epoch": 0.00525, "grad_norm": 0.3646347224712372, "grad_norm_var": 0.0029109098946428253, "learning_rate": 5e-05, "loss": 0.1676, "loss/crossentropy": 2.8696910738945007, "loss/hidden": 0.0, "loss/logits": 0.16764900088310242, "loss/reg": 2.621973991394043, "step": 525 }, { "epoch": 0.00526, "grad_norm": 0.3347557485103607, "grad_norm_var": 0.002953013887398237, "learning_rate": 5e-05, "loss": 0.1659, "loss/crossentropy": 2.844240427017212, "loss/hidden": 0.0, "loss/logits": 0.16587505862116814, "loss/reg": 2.6180617809295654, "step": 526 }, { "epoch": 0.00527, "grad_norm": 0.3301764726638794, "grad_norm_var": 0.003100070614909223, "learning_rate": 5e-05, "loss": 0.1554, "loss/crossentropy": 2.822225272655487, "loss/hidden": 0.0, "loss/logits": 0.15541274286806583, "loss/reg": 2.6156363487243652, "step": 527 }, { "epoch": 0.00528, "grad_norm": 0.3668423593044281, "grad_norm_var": 0.003050578439524883, "learning_rate": 5e-05, "loss": 0.1725, "loss/crossentropy": 2.873881459236145, "loss/hidden": 0.0, "loss/logits": 0.17249644920229912, "loss/reg": 2.6129000186920166, "step": 528 }, { "epoch": 0.00529, "grad_norm": 0.33062636852264404, "grad_norm_var": 0.0031927551041592986, "learning_rate": 5e-05, "loss": 0.169, "loss/crossentropy": 2.7202290296554565, "loss/hidden": 0.0, "loss/logits": 0.16901781037449837, "loss/reg": 2.6098320484161377, "step": 529 }, { "epoch": 0.0053, "grad_norm": 0.33170488476753235, "grad_norm_var": 0.003231463613689256, "learning_rate": 5e-05, "loss": 0.1708, "loss/crossentropy": 2.7543463706970215, "loss/hidden": 0.0, "loss/logits": 0.17077547311782837, "loss/reg": 2.606674909591675, "step": 530 }, { "epoch": 0.00531, "grad_norm": 0.3436318337917328, "grad_norm_var": 0.0032369737172315838, "learning_rate": 5e-05, "loss": 0.1822, "loss/crossentropy": 2.6231788992881775, "loss/hidden": 0.0, "loss/logits": 0.1821577101945877, "loss/reg": 2.603997230529785, "step": 531 }, { "epoch": 0.00532, "grad_norm": 0.33105242252349854, "grad_norm_var": 0.0033454153420392264, "learning_rate": 5e-05, "loss": 0.1661, "loss/crossentropy": 2.819184124469757, "loss/hidden": 0.0, "loss/logits": 0.16607840731739998, "loss/reg": 2.6012203693389893, "step": 532 }, { "epoch": 0.00533, "grad_norm": 0.3485148847103119, "grad_norm_var": 0.0033075767398377588, "learning_rate": 5e-05, "loss": 0.1676, "loss/crossentropy": 2.8594303727149963, "loss/hidden": 0.0, "loss/logits": 0.1676221825182438, "loss/reg": 2.5986573696136475, "step": 533 }, { "epoch": 0.00534, "grad_norm": 0.3541623651981354, "grad_norm_var": 0.003321219936842216, "learning_rate": 5e-05, "loss": 0.1742, "loss/crossentropy": 2.7382256984710693, "loss/hidden": 0.0, "loss/logits": 0.17420669272542, "loss/reg": 2.5956368446350098, "step": 534 }, { "epoch": 0.00535, "grad_norm": 0.362183541059494, "grad_norm_var": 0.003216249066768325, "learning_rate": 5e-05, "loss": 0.1732, "loss/crossentropy": 2.820302128791809, "loss/hidden": 0.0, "loss/logits": 0.1731831431388855, "loss/reg": 2.591860294342041, "step": 535 }, { "epoch": 0.00536, "grad_norm": 0.340348482131958, "grad_norm_var": 0.0032303969391706505, "learning_rate": 5e-05, "loss": 0.1677, "loss/crossentropy": 2.978896915912628, "loss/hidden": 0.0, "loss/logits": 0.16773569583892822, "loss/reg": 2.5879762172698975, "step": 536 }, { "epoch": 0.00537, "grad_norm": 0.359326034784317, "grad_norm_var": 0.002480178301492671, "learning_rate": 5e-05, "loss": 0.1767, "loss/crossentropy": 2.7645240426063538, "loss/hidden": 0.0, "loss/logits": 0.17668773606419563, "loss/reg": 2.5847809314727783, "step": 537 }, { "epoch": 0.00538, "grad_norm": 0.3420480489730835, "grad_norm_var": 0.0005976425682412671, "learning_rate": 5e-05, "loss": 0.1758, "loss/crossentropy": 2.7278724908828735, "loss/hidden": 0.0, "loss/logits": 0.17576001212000847, "loss/reg": 2.581143379211426, "step": 538 }, { "epoch": 0.00539, "grad_norm": 0.33362701535224915, "grad_norm_var": 0.00021185911019383125, "learning_rate": 5e-05, "loss": 0.1708, "loss/crossentropy": 2.6828721165657043, "loss/hidden": 0.0, "loss/logits": 0.1707863062620163, "loss/reg": 2.5769548416137695, "step": 539 }, { "epoch": 0.0054, "grad_norm": 0.6795082092285156, "grad_norm_var": 0.007165518560383773, "learning_rate": 5e-05, "loss": 0.2001, "loss/crossentropy": 2.7977577447891235, "loss/hidden": 0.0, "loss/logits": 0.20013980567455292, "loss/reg": 2.5742313861846924, "step": 540 }, { "epoch": 0.00541, "grad_norm": 0.35366278886795044, "grad_norm_var": 0.007174778628811747, "learning_rate": 5e-05, "loss": 0.1703, "loss/crossentropy": 2.859143853187561, "loss/hidden": 0.0, "loss/logits": 0.17025134339928627, "loss/reg": 2.57037353515625, "step": 541 }, { "epoch": 0.00542, "grad_norm": 0.3655552566051483, "grad_norm_var": 0.007109308326582827, "learning_rate": 5e-05, "loss": 0.1775, "loss/crossentropy": 2.8502614498138428, "loss/hidden": 0.0, "loss/logits": 0.1775321438908577, "loss/reg": 2.566716432571411, "step": 542 }, { "epoch": 0.00543, "grad_norm": 0.3574732542037964, "grad_norm_var": 0.007021635262450318, "learning_rate": 5e-05, "loss": 0.1735, "loss/crossentropy": 2.892129361629486, "loss/hidden": 0.0, "loss/logits": 0.17345865443348885, "loss/reg": 2.563842296600342, "step": 543 }, { "epoch": 0.00544, "grad_norm": 0.36598220467567444, "grad_norm_var": 0.007021902205424502, "learning_rate": 5e-05, "loss": 0.1721, "loss/crossentropy": 2.7138225436210632, "loss/hidden": 0.0, "loss/logits": 0.17211398482322693, "loss/reg": 2.5608513355255127, "step": 544 }, { "epoch": 0.00545, "grad_norm": 0.35922348499298096, "grad_norm_var": 0.0069277921155704155, "learning_rate": 5e-05, "loss": 0.1695, "loss/crossentropy": 2.8138818740844727, "loss/hidden": 0.0, "loss/logits": 0.16949571669101715, "loss/reg": 2.557931423187256, "step": 545 }, { "epoch": 0.00546, "grad_norm": 0.3538724184036255, "grad_norm_var": 0.006843838113958241, "learning_rate": 5e-05, "loss": 0.1722, "loss/crossentropy": 2.7698569893836975, "loss/hidden": 0.0, "loss/logits": 0.17218982055783272, "loss/reg": 2.5551669597625732, "step": 546 }, { "epoch": 0.00547, "grad_norm": 0.38070008158683777, "grad_norm_var": 0.006790073386325786, "learning_rate": 5e-05, "loss": 0.1843, "loss/crossentropy": 2.631078600883484, "loss/hidden": 0.0, "loss/logits": 0.1842627413570881, "loss/reg": 2.5517876148223877, "step": 547 }, { "epoch": 0.00548, "grad_norm": 0.35319533944129944, "grad_norm_var": 0.006693321782661003, "learning_rate": 5e-05, "loss": 0.16, "loss/crossentropy": 2.850399076938629, "loss/hidden": 0.0, "loss/logits": 0.15995023399591446, "loss/reg": 2.548754930496216, "step": 548 }, { "epoch": 0.00549, "grad_norm": 0.4596186578273773, "grad_norm_var": 0.0070637908733671, "learning_rate": 5e-05, "loss": 0.164, "loss/crossentropy": 2.868459641933441, "loss/hidden": 0.0, "loss/logits": 0.16395244374871254, "loss/reg": 2.5462560653686523, "step": 549 }, { "epoch": 0.0055, "grad_norm": 0.3474785387516022, "grad_norm_var": 0.007091863949161529, "learning_rate": 5e-05, "loss": 0.1641, "loss/crossentropy": 2.6114882230758667, "loss/hidden": 0.0, "loss/logits": 0.1641043722629547, "loss/reg": 2.5431270599365234, "step": 550 }, { "epoch": 0.00551, "grad_norm": 0.3570033013820648, "grad_norm_var": 0.007107306178779664, "learning_rate": 5e-05, "loss": 0.1579, "loss/crossentropy": 2.8220438957214355, "loss/hidden": 0.0, "loss/logits": 0.15791887789964676, "loss/reg": 2.539910078048706, "step": 551 }, { "epoch": 0.00552, "grad_norm": 0.32915255427360535, "grad_norm_var": 0.0071770024029156184, "learning_rate": 5e-05, "loss": 0.1612, "loss/crossentropy": 2.878856658935547, "loss/hidden": 0.0, "loss/logits": 0.16122740507125854, "loss/reg": 2.5368034839630127, "step": 552 }, { "epoch": 0.00553, "grad_norm": 0.3565903604030609, "grad_norm_var": 0.007185408405122592, "learning_rate": 5e-05, "loss": 0.1689, "loss/crossentropy": 2.705552637577057, "loss/hidden": 0.0, "loss/logits": 0.16892266646027565, "loss/reg": 2.5333409309387207, "step": 553 }, { "epoch": 0.00554, "grad_norm": 0.31767770648002625, "grad_norm_var": 0.0073488319211029345, "learning_rate": 5e-05, "loss": 0.1579, "loss/crossentropy": 2.685009717941284, "loss/hidden": 0.0, "loss/logits": 0.15788856148719788, "loss/reg": 2.5296521186828613, "step": 554 }, { "epoch": 0.00555, "grad_norm": 0.35047340393066406, "grad_norm_var": 0.0072637659398345844, "learning_rate": 5e-05, "loss": 0.1746, "loss/crossentropy": 2.5745012760162354, "loss/hidden": 0.0, "loss/logits": 0.17458590865135193, "loss/reg": 2.5270395278930664, "step": 555 }, { "epoch": 0.00556, "grad_norm": 0.3832140266895294, "grad_norm_var": 0.0009360149891549837, "learning_rate": 5e-05, "loss": 0.1676, "loss/crossentropy": 2.8551809787750244, "loss/hidden": 0.0, "loss/logits": 0.16756092011928558, "loss/reg": 2.523982048034668, "step": 556 }, { "epoch": 0.00557, "grad_norm": 0.4020329713821411, "grad_norm_var": 0.0010289291164416311, "learning_rate": 5e-05, "loss": 0.1755, "loss/crossentropy": 2.787672698497772, "loss/hidden": 0.0, "loss/logits": 0.17547398060560226, "loss/reg": 2.520615816116333, "step": 557 }, { "epoch": 0.00558, "grad_norm": 0.38412049412727356, "grad_norm_var": 0.0010519623608851428, "learning_rate": 5e-05, "loss": 0.1815, "loss/crossentropy": 2.846573293209076, "loss/hidden": 0.0, "loss/logits": 0.1815263032913208, "loss/reg": 2.518004894256592, "step": 558 }, { "epoch": 0.00559, "grad_norm": 0.3456071615219116, "grad_norm_var": 0.0010744320361522322, "learning_rate": 5e-05, "loss": 0.1746, "loss/crossentropy": 2.922893524169922, "loss/hidden": 0.0, "loss/logits": 0.17459525167942047, "loss/reg": 2.5153868198394775, "step": 559 }, { "epoch": 0.0056, "grad_norm": 0.36563733220100403, "grad_norm_var": 0.0010744113839659304, "learning_rate": 5e-05, "loss": 0.169, "loss/crossentropy": 2.7154372334480286, "loss/hidden": 0.0, "loss/logits": 0.16903281211853027, "loss/reg": 2.5126099586486816, "step": 560 }, { "epoch": 0.00561, "grad_norm": 0.3387238085269928, "grad_norm_var": 0.00111742135319511, "learning_rate": 5e-05, "loss": 0.1666, "loss/crossentropy": 2.5947054624557495, "loss/hidden": 0.0, "loss/logits": 0.16662058234214783, "loss/reg": 2.509439706802368, "step": 561 }, { "epoch": 0.00562, "grad_norm": 0.45574790239334106, "grad_norm_var": 0.0016275854789366514, "learning_rate": 5e-05, "loss": 0.1814, "loss/crossentropy": 2.961915969848633, "loss/hidden": 0.0, "loss/logits": 0.1813669353723526, "loss/reg": 2.5074241161346436, "step": 562 }, { "epoch": 0.00563, "grad_norm": 0.39113175868988037, "grad_norm_var": 0.0016486631382784092, "learning_rate": 5e-05, "loss": 0.1643, "loss/crossentropy": 2.7337673902511597, "loss/hidden": 0.0, "loss/logits": 0.164301548153162, "loss/reg": 2.5046684741973877, "step": 563 }, { "epoch": 0.00564, "grad_norm": 0.36300358176231384, "grad_norm_var": 0.0016312765518430934, "learning_rate": 5e-05, "loss": 0.1602, "loss/crossentropy": 2.713749051094055, "loss/hidden": 0.0, "loss/logits": 0.16021040827035904, "loss/reg": 2.5020864009857178, "step": 564 }, { "epoch": 0.00565, "grad_norm": 0.3250221312046051, "grad_norm_var": 0.001185749693630452, "learning_rate": 5e-05, "loss": 0.1661, "loss/crossentropy": 2.739534556865692, "loss/hidden": 0.0, "loss/logits": 0.166114691644907, "loss/reg": 2.500089645385742, "step": 565 }, { "epoch": 0.00566, "grad_norm": 0.3059675395488739, "grad_norm_var": 0.0013809527139825861, "learning_rate": 5e-05, "loss": 0.1528, "loss/crossentropy": 2.7676697373390198, "loss/hidden": 0.0, "loss/logits": 0.15275665000081062, "loss/reg": 2.497802257537842, "step": 566 }, { "epoch": 0.00567, "grad_norm": 0.41637444496154785, "grad_norm_var": 0.0015720438674995396, "learning_rate": 5e-05, "loss": 0.1899, "loss/crossentropy": 2.7852693796157837, "loss/hidden": 0.0, "loss/logits": 0.18987080082297325, "loss/reg": 2.4960269927978516, "step": 567 }, { "epoch": 0.00568, "grad_norm": 0.48216167092323303, "grad_norm_var": 0.002316091582714641, "learning_rate": 5e-05, "loss": 0.179, "loss/crossentropy": 2.919625759124756, "loss/hidden": 0.0, "loss/logits": 0.17901213094592094, "loss/reg": 2.4943020343780518, "step": 568 }, { "epoch": 0.00569, "grad_norm": 0.34773337841033936, "grad_norm_var": 0.0023415161321106623, "learning_rate": 5e-05, "loss": 0.1689, "loss/crossentropy": 2.829575002193451, "loss/hidden": 0.0, "loss/logits": 0.1688704527914524, "loss/reg": 2.4922549724578857, "step": 569 }, { "epoch": 0.0057, "grad_norm": 0.42466020584106445, "grad_norm_var": 0.0022617987789910494, "learning_rate": 5e-05, "loss": 0.2065, "loss/crossentropy": 2.847673773765564, "loss/hidden": 0.0, "loss/logits": 0.20647098124027252, "loss/reg": 2.4896240234375, "step": 570 }, { "epoch": 0.00571, "grad_norm": 0.39025840163230896, "grad_norm_var": 0.0022035635328787567, "learning_rate": 5e-05, "loss": 0.181, "loss/crossentropy": 2.9154597520828247, "loss/hidden": 0.0, "loss/logits": 0.1810290329158306, "loss/reg": 2.487513542175293, "step": 571 }, { "epoch": 0.00572, "grad_norm": 0.3611275851726532, "grad_norm_var": 0.002232206094215346, "learning_rate": 5e-05, "loss": 0.1687, "loss/crossentropy": 2.813008964061737, "loss/hidden": 0.0, "loss/logits": 0.16871189698576927, "loss/reg": 2.4849367141723633, "step": 572 }, { "epoch": 0.00573, "grad_norm": 0.37163245677948, "grad_norm_var": 0.002205551603401897, "learning_rate": 5e-05, "loss": 0.1736, "loss/crossentropy": 2.829798102378845, "loss/hidden": 0.0, "loss/logits": 0.173641849309206, "loss/reg": 2.481811046600342, "step": 573 }, { "epoch": 0.00574, "grad_norm": 0.37662971019744873, "grad_norm_var": 0.002204250880404842, "learning_rate": 5e-05, "loss": 0.1641, "loss/crossentropy": 2.786403477191925, "loss/hidden": 0.0, "loss/logits": 0.16413037478923798, "loss/reg": 2.479344606399536, "step": 574 }, { "epoch": 0.00575, "grad_norm": 0.4090428948402405, "grad_norm_var": 0.002174681113915019, "learning_rate": 5e-05, "loss": 0.1684, "loss/crossentropy": 2.685749888420105, "loss/hidden": 0.0, "loss/logits": 0.16840650886297226, "loss/reg": 2.476419448852539, "step": 575 }, { "epoch": 0.00576, "grad_norm": 0.35688483715057373, "grad_norm_var": 0.0021995018187083346, "learning_rate": 5e-05, "loss": 0.1611, "loss/crossentropy": 2.809792697429657, "loss/hidden": 0.0, "loss/logits": 0.1611488163471222, "loss/reg": 2.4737355709075928, "step": 576 }, { "epoch": 0.00577, "grad_norm": 0.38194504380226135, "grad_norm_var": 0.002065385566742454, "learning_rate": 5e-05, "loss": 0.1615, "loss/crossentropy": 2.850769340991974, "loss/hidden": 0.0, "loss/logits": 0.1615295149385929, "loss/reg": 2.4713802337646484, "step": 577 }, { "epoch": 0.00578, "grad_norm": 0.3567502200603485, "grad_norm_var": 0.001743510873329986, "learning_rate": 5e-05, "loss": 0.1689, "loss/crossentropy": 2.7103776335716248, "loss/hidden": 0.0, "loss/logits": 0.16894375160336494, "loss/reg": 2.4685709476470947, "step": 578 }, { "epoch": 0.00579, "grad_norm": 0.3396901786327362, "grad_norm_var": 0.001824115359163836, "learning_rate": 5e-05, "loss": 0.166, "loss/crossentropy": 2.7079854607582092, "loss/hidden": 0.0, "loss/logits": 0.1659584417939186, "loss/reg": 2.465658664703369, "step": 579 }, { "epoch": 0.0058, "grad_norm": 0.358395516872406, "grad_norm_var": 0.0018331543648902808, "learning_rate": 5e-05, "loss": 0.1813, "loss/crossentropy": 2.8853692412376404, "loss/hidden": 0.0, "loss/logits": 0.18133477121591568, "loss/reg": 2.4633235931396484, "step": 580 }, { "epoch": 0.00581, "grad_norm": 0.3434228301048279, "grad_norm_var": 0.0017310432323107805, "learning_rate": 5e-05, "loss": 0.1739, "loss/crossentropy": 2.666011691093445, "loss/hidden": 0.0, "loss/logits": 0.1739240102469921, "loss/reg": 2.460447072982788, "step": 581 }, { "epoch": 0.00582, "grad_norm": 0.3482820689678192, "grad_norm_var": 0.0014454775261250163, "learning_rate": 5e-05, "loss": 0.1846, "loss/crossentropy": 2.6244596242904663, "loss/hidden": 0.0, "loss/logits": 0.1845875158905983, "loss/reg": 2.457307815551758, "step": 582 }, { "epoch": 0.00583, "grad_norm": 0.36450985074043274, "grad_norm_var": 0.0013555723186838029, "learning_rate": 5e-05, "loss": 0.1741, "loss/crossentropy": 2.6909091472625732, "loss/hidden": 0.0, "loss/logits": 0.174148079007864, "loss/reg": 2.4545810222625732, "step": 583 }, { "epoch": 0.00584, "grad_norm": 0.34841907024383545, "grad_norm_var": 0.0005772011049318792, "learning_rate": 5e-05, "loss": 0.1621, "loss/crossentropy": 2.7992460131645203, "loss/hidden": 0.0, "loss/logits": 0.162076648324728, "loss/reg": 2.4516048431396484, "step": 584 }, { "epoch": 0.00585, "grad_norm": 0.36560627818107605, "grad_norm_var": 0.0005501529366056079, "learning_rate": 5e-05, "loss": 0.1612, "loss/crossentropy": 2.7556354999542236, "loss/hidden": 0.0, "loss/logits": 0.16119826585054398, "loss/reg": 2.4482715129852295, "step": 585 }, { "epoch": 0.00586, "grad_norm": 0.37393423914909363, "grad_norm_var": 0.00033166715444868193, "learning_rate": 5e-05, "loss": 0.1779, "loss/crossentropy": 2.6222774982452393, "loss/hidden": 0.0, "loss/logits": 0.1779084950685501, "loss/reg": 2.4452362060546875, "step": 586 }, { "epoch": 0.00587, "grad_norm": 0.3511587679386139, "grad_norm_var": 0.0002976648126369145, "learning_rate": 5e-05, "loss": 0.1761, "loss/crossentropy": 2.7342361211776733, "loss/hidden": 0.0, "loss/logits": 0.17614838480949402, "loss/reg": 2.441678524017334, "step": 587 }, { "epoch": 0.00588, "grad_norm": 0.33847615122795105, "grad_norm_var": 0.0003352805276915209, "learning_rate": 5e-05, "loss": 0.173, "loss/crossentropy": 2.7935328483581543, "loss/hidden": 0.0, "loss/logits": 0.17295999452471733, "loss/reg": 2.437959671020508, "step": 588 }, { "epoch": 0.00589, "grad_norm": 0.351034015417099, "grad_norm_var": 0.00033410454836428903, "learning_rate": 5e-05, "loss": 0.1775, "loss/crossentropy": 2.7590489387512207, "loss/hidden": 0.0, "loss/logits": 0.1775294505059719, "loss/reg": 2.4346938133239746, "step": 589 }, { "epoch": 0.0059, "grad_norm": 0.37800535559654236, "grad_norm_var": 0.0003372250971240794, "learning_rate": 5e-05, "loss": 0.1645, "loss/crossentropy": 2.75826096534729, "loss/hidden": 0.0, "loss/logits": 0.16445999220013618, "loss/reg": 2.4319543838500977, "step": 590 }, { "epoch": 0.00591, "grad_norm": 0.3323316276073456, "grad_norm_var": 0.0002069473145354402, "learning_rate": 5e-05, "loss": 0.1644, "loss/crossentropy": 2.963920295238495, "loss/hidden": 0.0, "loss/logits": 0.164412472397089, "loss/reg": 2.4295387268066406, "step": 591 }, { "epoch": 0.00592, "grad_norm": 0.8281128406524658, "grad_norm_var": 0.014169124282143371, "learning_rate": 5e-05, "loss": 0.2256, "loss/crossentropy": 2.9319988489151, "loss/hidden": 0.0, "loss/logits": 0.22560855001211166, "loss/reg": 2.427125930786133, "step": 592 }, { "epoch": 0.00593, "grad_norm": 0.37988972663879395, "grad_norm_var": 0.014170226758262046, "learning_rate": 5e-05, "loss": 0.1795, "loss/crossentropy": 2.92197585105896, "loss/hidden": 0.0, "loss/logits": 0.1794501654803753, "loss/reg": 2.4247610569000244, "step": 593 }, { "epoch": 0.00594, "grad_norm": 0.37449878454208374, "grad_norm_var": 0.014123355612102569, "learning_rate": 5e-05, "loss": 0.1756, "loss/crossentropy": 2.734030842781067, "loss/hidden": 0.0, "loss/logits": 0.17557094618678093, "loss/reg": 2.4224398136138916, "step": 594 }, { "epoch": 0.00595, "grad_norm": 0.3890518248081207, "grad_norm_var": 0.013970946553029018, "learning_rate": 5e-05, "loss": 0.1721, "loss/crossentropy": 2.6910988688468933, "loss/hidden": 0.0, "loss/logits": 0.17206770926713943, "loss/reg": 2.4206151962280273, "step": 595 }, { "epoch": 0.00596, "grad_norm": 0.45764538645744324, "grad_norm_var": 0.014180672563351104, "learning_rate": 5e-05, "loss": 0.1886, "loss/crossentropy": 2.706140458583832, "loss/hidden": 0.0, "loss/logits": 0.1886041909456253, "loss/reg": 2.418341636657715, "step": 596 }, { "epoch": 0.00597, "grad_norm": 0.3294787108898163, "grad_norm_var": 0.014289226884282809, "learning_rate": 5e-05, "loss": 0.1693, "loss/crossentropy": 2.772903263568878, "loss/hidden": 0.0, "loss/logits": 0.16925981268286705, "loss/reg": 2.415613889694214, "step": 597 }, { "epoch": 0.00598, "grad_norm": 0.3425086438655853, "grad_norm_var": 0.014326812953815705, "learning_rate": 5e-05, "loss": 0.17, "loss/crossentropy": 2.7024609446525574, "loss/hidden": 0.0, "loss/logits": 0.16995511576533318, "loss/reg": 2.4122676849365234, "step": 598 }, { "epoch": 0.00599, "grad_norm": 0.37222734093666077, "grad_norm_var": 0.014300147579082, "learning_rate": 5e-05, "loss": 0.1789, "loss/crossentropy": 2.8698896765708923, "loss/hidden": 0.0, "loss/logits": 0.17886632308363914, "loss/reg": 2.4099037647247314, "step": 599 }, { "epoch": 0.006, "grad_norm": 0.39135316014289856, "grad_norm_var": 0.014151428197242365, "learning_rate": 5e-05, "loss": 0.1747, "loss/crossentropy": 2.700629711151123, "loss/hidden": 0.0, "loss/logits": 0.17468373104929924, "loss/reg": 2.40794038772583, "step": 600 }, { "epoch": 0.00601, "grad_norm": 0.3728218376636505, "grad_norm_var": 0.014124279912823712, "learning_rate": 5e-05, "loss": 0.169, "loss/crossentropy": 2.794102430343628, "loss/hidden": 0.0, "loss/logits": 0.16897983103990555, "loss/reg": 2.405545711517334, "step": 601 }, { "epoch": 0.00602, "grad_norm": 0.37317147850990295, "grad_norm_var": 0.014126729018321404, "learning_rate": 5e-05, "loss": 0.1747, "loss/crossentropy": 2.6252577900886536, "loss/hidden": 0.0, "loss/logits": 0.17474820092320442, "loss/reg": 2.402970790863037, "step": 602 }, { "epoch": 0.00603, "grad_norm": 0.35492607951164246, "grad_norm_var": 0.01410428304541661, "learning_rate": 5e-05, "loss": 0.1809, "loss/crossentropy": 2.6527358889579773, "loss/hidden": 0.0, "loss/logits": 0.18093448877334595, "loss/reg": 2.4007880687713623, "step": 603 }, { "epoch": 0.00604, "grad_norm": 0.408010870218277, "grad_norm_var": 0.013856041692491945, "learning_rate": 5e-05, "loss": 0.2018, "loss/crossentropy": 2.874286651611328, "loss/hidden": 0.0, "loss/logits": 0.20177744701504707, "loss/reg": 2.3988449573516846, "step": 604 }, { "epoch": 0.00605, "grad_norm": 0.3291812837123871, "grad_norm_var": 0.014034946168994126, "learning_rate": 5e-05, "loss": 0.1614, "loss/crossentropy": 2.7926167249679565, "loss/hidden": 0.0, "loss/logits": 0.16140995919704437, "loss/reg": 2.3966500759124756, "step": 605 }, { "epoch": 0.00606, "grad_norm": 0.34659212827682495, "grad_norm_var": 0.014192203001449558, "learning_rate": 5e-05, "loss": 0.1709, "loss/crossentropy": 2.8195464611053467, "loss/hidden": 0.0, "loss/logits": 0.1709096021950245, "loss/reg": 2.394033670425415, "step": 606 }, { "epoch": 0.00607, "grad_norm": 0.32253992557525635, "grad_norm_var": 0.014285055545239086, "learning_rate": 5e-05, "loss": 0.1649, "loss/crossentropy": 2.7236337065696716, "loss/hidden": 0.0, "loss/logits": 0.16493552178144455, "loss/reg": 2.3918449878692627, "step": 607 }, { "epoch": 0.00608, "grad_norm": 0.350931316614151, "grad_norm_var": 0.0011668026056699994, "learning_rate": 5e-05, "loss": 0.1718, "loss/crossentropy": 2.800759196281433, "loss/hidden": 0.0, "loss/logits": 0.17175282910466194, "loss/reg": 2.38920521736145, "step": 608 }, { "epoch": 0.00609, "grad_norm": 0.40333986282348633, "grad_norm_var": 0.001237012928824995, "learning_rate": 5e-05, "loss": 0.2046, "loss/crossentropy": 2.7574119567871094, "loss/hidden": 0.0, "loss/logits": 0.2046247273683548, "loss/reg": 2.3865227699279785, "step": 609 }, { "epoch": 0.0061, "grad_norm": 0.3773089349269867, "grad_norm_var": 0.0012392324335123346, "learning_rate": 5e-05, "loss": 0.1641, "loss/crossentropy": 2.6313101649284363, "loss/hidden": 0.0, "loss/logits": 0.16410250216722488, "loss/reg": 2.3836612701416016, "step": 610 }, { "epoch": 0.00611, "grad_norm": 0.438357949256897, "grad_norm_var": 0.0015159779907225465, "learning_rate": 5e-05, "loss": 0.2098, "loss/crossentropy": 2.780138611793518, "loss/hidden": 0.0, "loss/logits": 0.20979087427258492, "loss/reg": 2.380405902862549, "step": 611 }, { "epoch": 0.00612, "grad_norm": 0.34121251106262207, "grad_norm_var": 0.0010515226822608785, "learning_rate": 5e-05, "loss": 0.161, "loss/crossentropy": 2.669090151786804, "loss/hidden": 0.0, "loss/logits": 0.1609921157360077, "loss/reg": 2.3776426315307617, "step": 612 }, { "epoch": 0.00613, "grad_norm": 0.36169829964637756, "grad_norm_var": 0.0009600577824135296, "learning_rate": 5e-05, "loss": 0.1734, "loss/crossentropy": 2.7925440073013306, "loss/hidden": 0.0, "loss/logits": 0.17342102900147438, "loss/reg": 2.3744184970855713, "step": 613 }, { "epoch": 0.00614, "grad_norm": 0.522160530090332, "grad_norm_var": 0.002369345725690275, "learning_rate": 5e-05, "loss": 0.1663, "loss/crossentropy": 2.698939800262451, "loss/hidden": 0.0, "loss/logits": 0.16627153754234314, "loss/reg": 2.370917320251465, "step": 614 }, { "epoch": 0.00615, "grad_norm": 0.4562234580516815, "grad_norm_var": 0.002733171284208069, "learning_rate": 5e-05, "loss": 0.1686, "loss/crossentropy": 2.8971627950668335, "loss/hidden": 0.0, "loss/logits": 0.16856613755226135, "loss/reg": 2.368067979812622, "step": 615 }, { "epoch": 0.00616, "grad_norm": 0.5767056345939636, "grad_norm_var": 0.0050531115809510415, "learning_rate": 5e-05, "loss": 0.171, "loss/crossentropy": 2.822002112865448, "loss/hidden": 0.0, "loss/logits": 0.17102698609232903, "loss/reg": 2.3653640747070312, "step": 616 }, { "epoch": 0.00617, "grad_norm": 0.3703908324241638, "grad_norm_var": 0.005060977204500819, "learning_rate": 5e-05, "loss": 0.1799, "loss/crossentropy": 2.761395037174225, "loss/hidden": 0.0, "loss/logits": 0.17990661412477493, "loss/reg": 2.362797975540161, "step": 617 }, { "epoch": 0.00618, "grad_norm": 0.44375622272491455, "grad_norm_var": 0.005159430065949637, "learning_rate": 5e-05, "loss": 0.1662, "loss/crossentropy": 2.8019450306892395, "loss/hidden": 0.0, "loss/logits": 0.1661831997334957, "loss/reg": 2.3597350120544434, "step": 618 }, { "epoch": 0.00619, "grad_norm": 0.41226035356521606, "grad_norm_var": 0.005018716701479123, "learning_rate": 5e-05, "loss": 0.1737, "loss/crossentropy": 2.837542712688446, "loss/hidden": 0.0, "loss/logits": 0.1737065464258194, "loss/reg": 2.356935977935791, "step": 619 }, { "epoch": 0.0062, "grad_norm": 0.36850520968437195, "grad_norm_var": 0.005094037089036248, "learning_rate": 5e-05, "loss": 0.1691, "loss/crossentropy": 2.872538685798645, "loss/hidden": 0.0, "loss/logits": 0.16909406706690788, "loss/reg": 2.354841709136963, "step": 620 }, { "epoch": 0.00621, "grad_norm": 0.3547448217868805, "grad_norm_var": 0.004888988248098869, "learning_rate": 5e-05, "loss": 0.1777, "loss/crossentropy": 2.727312922477722, "loss/hidden": 0.0, "loss/logits": 0.17773358151316643, "loss/reg": 2.3518083095550537, "step": 621 }, { "epoch": 0.00622, "grad_norm": 0.3340252637863159, "grad_norm_var": 0.0049932414292845895, "learning_rate": 5e-05, "loss": 0.1673, "loss/crossentropy": 2.7399535179138184, "loss/hidden": 0.0, "loss/logits": 0.16725115478038788, "loss/reg": 2.349299907684326, "step": 622 }, { "epoch": 0.00623, "grad_norm": 0.328477680683136, "grad_norm_var": 0.004932429457390519, "learning_rate": 5e-05, "loss": 0.1658, "loss/crossentropy": 2.7973376512527466, "loss/hidden": 0.0, "loss/logits": 0.1658070906996727, "loss/reg": 2.346407175064087, "step": 623 }, { "epoch": 0.00624, "grad_norm": 0.3988572061061859, "grad_norm_var": 0.004746415643168555, "learning_rate": 5e-05, "loss": 0.1808, "loss/crossentropy": 2.886197090148926, "loss/hidden": 0.0, "loss/logits": 0.18076446652412415, "loss/reg": 2.343637228012085, "step": 624 }, { "epoch": 0.00625, "grad_norm": 0.3653312921524048, "grad_norm_var": 0.0048476613679717525, "learning_rate": 5e-05, "loss": 0.1752, "loss/crossentropy": 2.6095593571662903, "loss/hidden": 0.0, "loss/logits": 0.17522242665290833, "loss/reg": 2.34155011177063, "step": 625 }, { "epoch": 0.00626, "grad_norm": 0.3519672751426697, "grad_norm_var": 0.004975031863489395, "learning_rate": 5e-05, "loss": 0.1685, "loss/crossentropy": 2.724495232105255, "loss/hidden": 0.0, "loss/logits": 0.16851425543427467, "loss/reg": 2.339081048965454, "step": 626 }, { "epoch": 0.00627, "grad_norm": 0.3507337272167206, "grad_norm_var": 0.005024779798457324, "learning_rate": 5e-05, "loss": 0.1661, "loss/crossentropy": 2.775688886642456, "loss/hidden": 0.0, "loss/logits": 0.1661130003631115, "loss/reg": 2.3362221717834473, "step": 627 }, { "epoch": 0.00628, "grad_norm": 0.35331088304519653, "grad_norm_var": 0.004945443478871292, "learning_rate": 5e-05, "loss": 0.1803, "loss/crossentropy": 2.6876689195632935, "loss/hidden": 0.0, "loss/logits": 0.18032584339380264, "loss/reg": 2.3339033126831055, "step": 628 }, { "epoch": 0.00629, "grad_norm": 0.3569658696651459, "grad_norm_var": 0.004969005818722216, "learning_rate": 5e-05, "loss": 0.1646, "loss/crossentropy": 2.87895804643631, "loss/hidden": 0.0, "loss/logits": 0.16462786123156548, "loss/reg": 2.3317511081695557, "step": 629 }, { "epoch": 0.0063, "grad_norm": 0.37102508544921875, "grad_norm_var": 0.0038649155689368443, "learning_rate": 5e-05, "loss": 0.1807, "loss/crossentropy": 2.8995742201805115, "loss/hidden": 0.0, "loss/logits": 0.1807471290230751, "loss/reg": 2.3286077976226807, "step": 630 }, { "epoch": 0.00631, "grad_norm": 0.37091144919395447, "grad_norm_var": 0.0035332975201383715, "learning_rate": 5e-05, "loss": 0.1679, "loss/crossentropy": 2.755174398422241, "loss/hidden": 0.0, "loss/logits": 0.16785116121172905, "loss/reg": 2.3263347148895264, "step": 631 }, { "epoch": 0.00632, "grad_norm": 0.3764369487762451, "grad_norm_var": 0.000834165955391919, "learning_rate": 5e-05, "loss": 0.1597, "loss/crossentropy": 2.7826634645462036, "loss/hidden": 0.0, "loss/logits": 0.1596829891204834, "loss/reg": 2.3239433765411377, "step": 632 }, { "epoch": 0.00633, "grad_norm": 0.34151408076286316, "grad_norm_var": 0.0008818179956038841, "learning_rate": 5e-05, "loss": 0.1628, "loss/crossentropy": 2.805456221103668, "loss/hidden": 0.0, "loss/logits": 0.16277796775102615, "loss/reg": 2.3215837478637695, "step": 633 }, { "epoch": 0.00634, "grad_norm": 0.7558053731918335, "grad_norm_var": 0.010143553337954326, "learning_rate": 5e-05, "loss": 0.1858, "loss/crossentropy": 2.7678999304771423, "loss/hidden": 0.0, "loss/logits": 0.18575545772910118, "loss/reg": 2.3192989826202393, "step": 634 }, { "epoch": 0.00635, "grad_norm": 0.3809748589992523, "grad_norm_var": 0.010099062255010161, "learning_rate": 5e-05, "loss": 0.1792, "loss/crossentropy": 2.86500483751297, "loss/hidden": 0.0, "loss/logits": 0.1791505441069603, "loss/reg": 2.317030906677246, "step": 635 }, { "epoch": 0.00636, "grad_norm": 0.40578746795654297, "grad_norm_var": 0.010104068412990375, "learning_rate": 5e-05, "loss": 0.1882, "loss/crossentropy": 2.8707818388938904, "loss/hidden": 0.0, "loss/logits": 0.188164584338665, "loss/reg": 2.3146989345550537, "step": 636 }, { "epoch": 0.00637, "grad_norm": 0.415227472782135, "grad_norm_var": 0.010070131470069877, "learning_rate": 5e-05, "loss": 0.1748, "loss/crossentropy": 2.831197440624237, "loss/hidden": 0.0, "loss/logits": 0.17476488277316093, "loss/reg": 2.311936616897583, "step": 637 }, { "epoch": 0.00638, "grad_norm": 0.4119730293750763, "grad_norm_var": 0.00985685373482662, "learning_rate": 5e-05, "loss": 0.1699, "loss/crossentropy": 2.6559138894081116, "loss/hidden": 0.0, "loss/logits": 0.16989587992429733, "loss/reg": 2.3090522289276123, "step": 638 }, { "epoch": 0.00639, "grad_norm": 0.3662709593772888, "grad_norm_var": 0.009606093056996168, "learning_rate": 5e-05, "loss": 0.1775, "loss/crossentropy": 2.767539858818054, "loss/hidden": 0.0, "loss/logits": 0.17754964902997017, "loss/reg": 2.3056743144989014, "step": 639 }, { "epoch": 0.0064, "grad_norm": 0.38491374254226685, "grad_norm_var": 0.009617242443139995, "learning_rate": 5e-05, "loss": 0.1827, "loss/crossentropy": 2.6669586896896362, "loss/hidden": 0.0, "loss/logits": 0.18266603723168373, "loss/reg": 2.303258180618286, "step": 640 }, { "epoch": 0.00641, "grad_norm": 0.4197373390197754, "grad_norm_var": 0.009569272862985524, "learning_rate": 5e-05, "loss": 0.1778, "loss/crossentropy": 2.7964502573013306, "loss/hidden": 0.0, "loss/logits": 0.17782465368509293, "loss/reg": 2.300361156463623, "step": 641 }, { "epoch": 0.00642, "grad_norm": 0.4097757339477539, "grad_norm_var": 0.00940137989136159, "learning_rate": 5e-05, "loss": 0.1856, "loss/crossentropy": 2.735614001750946, "loss/hidden": 0.0, "loss/logits": 0.1855894774198532, "loss/reg": 2.2972419261932373, "step": 642 }, { "epoch": 0.00643, "grad_norm": 0.35904356837272644, "grad_norm_var": 0.00934616788177974, "learning_rate": 5e-05, "loss": 0.1833, "loss/crossentropy": 2.774403393268585, "loss/hidden": 0.0, "loss/logits": 0.1832551322877407, "loss/reg": 2.293954610824585, "step": 643 }, { "epoch": 0.00644, "grad_norm": 0.34157049655914307, "grad_norm_var": 0.009435664127140328, "learning_rate": 5e-05, "loss": 0.1618, "loss/crossentropy": 2.8616234064102173, "loss/hidden": 0.0, "loss/logits": 0.16181085631251335, "loss/reg": 2.2906179428100586, "step": 644 }, { "epoch": 0.00645, "grad_norm": 0.4255986213684082, "grad_norm_var": 0.009297406924193945, "learning_rate": 5e-05, "loss": 0.1853, "loss/crossentropy": 2.654071033000946, "loss/hidden": 0.0, "loss/logits": 0.18530349805951118, "loss/reg": 2.287349224090576, "step": 645 }, { "epoch": 0.00646, "grad_norm": 0.3393001854419708, "grad_norm_var": 0.009518979339113423, "learning_rate": 5e-05, "loss": 0.1665, "loss/crossentropy": 2.73319810628891, "loss/hidden": 0.0, "loss/logits": 0.16647282242774963, "loss/reg": 2.2834725379943848, "step": 646 }, { "epoch": 0.00647, "grad_norm": 0.34969252347946167, "grad_norm_var": 0.00964795505733251, "learning_rate": 5e-05, "loss": 0.1833, "loss/crossentropy": 2.7993005514144897, "loss/hidden": 0.0, "loss/logits": 0.1833389550447464, "loss/reg": 2.2799694538116455, "step": 647 }, { "epoch": 0.00648, "grad_norm": 0.35388484597206116, "grad_norm_var": 0.009766310746661707, "learning_rate": 5e-05, "loss": 0.1749, "loss/crossentropy": 2.766145169734955, "loss/hidden": 0.0, "loss/logits": 0.17488964274525642, "loss/reg": 2.277585029602051, "step": 648 }, { "epoch": 0.00649, "grad_norm": 0.5462765097618103, "grad_norm_var": 0.010685818975949597, "learning_rate": 5e-05, "loss": 0.1882, "loss/crossentropy": 2.7475533485412598, "loss/hidden": 0.0, "loss/logits": 0.18824508786201477, "loss/reg": 2.2750473022460938, "step": 649 }, { "epoch": 0.0065, "grad_norm": 0.3537692725658417, "grad_norm_var": 0.002605622083243005, "learning_rate": 5e-05, "loss": 0.1761, "loss/crossentropy": 2.756273865699768, "loss/hidden": 0.0, "loss/logits": 0.17612234875559807, "loss/reg": 2.2722957134246826, "step": 650 }, { "epoch": 0.00651, "grad_norm": 0.3770252466201782, "grad_norm_var": 0.0026121330513858157, "learning_rate": 5e-05, "loss": 0.1897, "loss/crossentropy": 2.7889973521232605, "loss/hidden": 0.0, "loss/logits": 0.1897362545132637, "loss/reg": 2.2701008319854736, "step": 651 }, { "epoch": 0.00652, "grad_norm": 0.4475138187408447, "grad_norm_var": 0.0028018836674080227, "learning_rate": 5e-05, "loss": 0.1951, "loss/crossentropy": 2.531024992465973, "loss/hidden": 0.0, "loss/logits": 0.1951226033270359, "loss/reg": 2.267695665359497, "step": 652 }, { "epoch": 0.00653, "grad_norm": 0.3947466313838959, "grad_norm_var": 0.002769718525090366, "learning_rate": 5e-05, "loss": 0.1958, "loss/crossentropy": 2.886034905910492, "loss/hidden": 0.0, "loss/logits": 0.19575949385762215, "loss/reg": 2.2648732662200928, "step": 653 }, { "epoch": 0.00654, "grad_norm": 0.3775857090950012, "grad_norm_var": 0.0027546537142078996, "learning_rate": 5e-05, "loss": 0.1746, "loss/crossentropy": 2.7190786600112915, "loss/hidden": 0.0, "loss/logits": 0.17461128905415535, "loss/reg": 2.2620925903320312, "step": 654 }, { "epoch": 0.00655, "grad_norm": 0.34534481167793274, "grad_norm_var": 0.002849399631435645, "learning_rate": 5e-05, "loss": 0.1792, "loss/crossentropy": 2.8847506046295166, "loss/hidden": 0.0, "loss/logits": 0.1791832633316517, "loss/reg": 2.2593743801116943, "step": 655 }, { "epoch": 0.00656, "grad_norm": 0.3607633411884308, "grad_norm_var": 0.0028993682580486144, "learning_rate": 5e-05, "loss": 0.1792, "loss/crossentropy": 2.815674066543579, "loss/hidden": 0.0, "loss/logits": 0.179163109511137, "loss/reg": 2.2569730281829834, "step": 656 }, { "epoch": 0.00657, "grad_norm": 0.38781270384788513, "grad_norm_var": 0.002826278400635814, "learning_rate": 5e-05, "loss": 0.1618, "loss/crossentropy": 2.6106160283088684, "loss/hidden": 0.0, "loss/logits": 0.1617795117199421, "loss/reg": 2.254523277282715, "step": 657 }, { "epoch": 0.00658, "grad_norm": 0.40386784076690674, "grad_norm_var": 0.0028094212847462165, "learning_rate": 5e-05, "loss": 0.1747, "loss/crossentropy": 2.8471227884292603, "loss/hidden": 0.0, "loss/logits": 0.17474086582660675, "loss/reg": 2.252164125442505, "step": 658 }, { "epoch": 0.00659, "grad_norm": 0.36319243907928467, "grad_norm_var": 0.002796007207749466, "learning_rate": 5e-05, "loss": 0.163, "loss/crossentropy": 2.7625906467437744, "loss/hidden": 0.0, "loss/logits": 0.16296877712011337, "loss/reg": 2.249460458755493, "step": 659 }, { "epoch": 0.0066, "grad_norm": 0.3657222092151642, "grad_norm_var": 0.00269101182172804, "learning_rate": 5e-05, "loss": 0.1777, "loss/crossentropy": 2.781547486782074, "loss/hidden": 0.0, "loss/logits": 0.1777149885892868, "loss/reg": 2.246467113494873, "step": 660 }, { "epoch": 0.00661, "grad_norm": 0.38363057374954224, "grad_norm_var": 0.0025851401210759276, "learning_rate": 5e-05, "loss": 0.1844, "loss/crossentropy": 2.82689893245697, "loss/hidden": 0.0, "loss/logits": 0.1844283789396286, "loss/reg": 2.2436530590057373, "step": 661 }, { "epoch": 0.00662, "grad_norm": 0.4096749424934387, "grad_norm_var": 0.0024716520181473594, "learning_rate": 5e-05, "loss": 0.1745, "loss/crossentropy": 2.8063756823539734, "loss/hidden": 0.0, "loss/logits": 0.17451731115579605, "loss/reg": 2.241178035736084, "step": 662 }, { "epoch": 0.00663, "grad_norm": 0.42931249737739563, "grad_norm_var": 0.0024528927297352344, "learning_rate": 5e-05, "loss": 0.186, "loss/crossentropy": 2.8724401593208313, "loss/hidden": 0.0, "loss/logits": 0.1859952136874199, "loss/reg": 2.2383124828338623, "step": 663 }, { "epoch": 0.00664, "grad_norm": 0.3530314862728119, "grad_norm_var": 0.0024574750299312478, "learning_rate": 5e-05, "loss": 0.1696, "loss/crossentropy": 2.9292226433753967, "loss/hidden": 0.0, "loss/logits": 0.16955319419503212, "loss/reg": 2.2356791496276855, "step": 664 }, { "epoch": 0.00665, "grad_norm": 0.4304611384868622, "grad_norm_var": 0.0009397736187397402, "learning_rate": 5e-05, "loss": 0.1902, "loss/crossentropy": 2.7114855647087097, "loss/hidden": 0.0, "loss/logits": 0.19023016840219498, "loss/reg": 2.2330329418182373, "step": 665 }, { "epoch": 0.00666, "grad_norm": 0.32996541261672974, "grad_norm_var": 0.0010789617804694747, "learning_rate": 5e-05, "loss": 0.158, "loss/crossentropy": 2.8920618891716003, "loss/hidden": 0.0, "loss/logits": 0.1580132134258747, "loss/reg": 2.2296440601348877, "step": 666 }, { "epoch": 0.00667, "grad_norm": 0.3874596953392029, "grad_norm_var": 0.0010747020479673205, "learning_rate": 5e-05, "loss": 0.1819, "loss/crossentropy": 2.7297377586364746, "loss/hidden": 0.0, "loss/logits": 0.18189727514982224, "loss/reg": 2.226966619491577, "step": 667 }, { "epoch": 0.00668, "grad_norm": 0.36097773909568787, "grad_norm_var": 0.000828712243429038, "learning_rate": 5e-05, "loss": 0.1726, "loss/crossentropy": 2.6433697938919067, "loss/hidden": 0.0, "loss/logits": 0.17256683483719826, "loss/reg": 2.2244138717651367, "step": 668 }, { "epoch": 0.00669, "grad_norm": 0.3509676158428192, "grad_norm_var": 0.0008637156407869958, "learning_rate": 5e-05, "loss": 0.1723, "loss/crossentropy": 2.8315157890319824, "loss/hidden": 0.0, "loss/logits": 0.17234884947538376, "loss/reg": 2.220712661743164, "step": 669 }, { "epoch": 0.0067, "grad_norm": 0.3578469157218933, "grad_norm_var": 0.0008878035089742793, "learning_rate": 5e-05, "loss": 0.1657, "loss/crossentropy": 2.788190722465515, "loss/hidden": 0.0, "loss/logits": 0.16572094336152077, "loss/reg": 2.217878818511963, "step": 670 }, { "epoch": 0.00671, "grad_norm": 0.4930081069469452, "grad_norm_var": 0.0016420680378558003, "learning_rate": 5e-05, "loss": 0.1818, "loss/crossentropy": 3.013857901096344, "loss/hidden": 0.0, "loss/logits": 0.18175816163420677, "loss/reg": 2.2148284912109375, "step": 671 }, { "epoch": 0.00672, "grad_norm": 0.36925604939460754, "grad_norm_var": 0.0016185866984450236, "learning_rate": 5e-05, "loss": 0.1642, "loss/crossentropy": 2.8940696716308594, "loss/hidden": 0.0, "loss/logits": 0.1642276532948017, "loss/reg": 2.2119295597076416, "step": 672 }, { "epoch": 0.00673, "grad_norm": 0.4327005445957184, "grad_norm_var": 0.0017552981165500747, "learning_rate": 5e-05, "loss": 0.1742, "loss/crossentropy": 2.87309467792511, "loss/hidden": 0.0, "loss/logits": 0.17423933744430542, "loss/reg": 2.209021806716919, "step": 673 }, { "epoch": 0.00674, "grad_norm": 0.738524854183197, "grad_norm_var": 0.009426579051544037, "learning_rate": 5e-05, "loss": 0.1868, "loss/crossentropy": 2.8040258288383484, "loss/hidden": 0.0, "loss/logits": 0.1867678351700306, "loss/reg": 2.2066619396209717, "step": 674 }, { "epoch": 0.00675, "grad_norm": 0.4364205002784729, "grad_norm_var": 0.009307313279513674, "learning_rate": 5e-05, "loss": 0.1796, "loss/crossentropy": 2.718536138534546, "loss/hidden": 0.0, "loss/logits": 0.17956989258527756, "loss/reg": 2.203990936279297, "step": 675 }, { "epoch": 0.00676, "grad_norm": 0.41067376732826233, "grad_norm_var": 0.009142390414932911, "learning_rate": 5e-05, "loss": 0.1642, "loss/crossentropy": 2.72940456867218, "loss/hidden": 0.0, "loss/logits": 0.16421709582209587, "loss/reg": 2.2013046741485596, "step": 676 }, { "epoch": 0.00677, "grad_norm": 0.4327182173728943, "grad_norm_var": 0.009073804614162174, "learning_rate": 5e-05, "loss": 0.1905, "loss/crossentropy": 2.7371246814727783, "loss/hidden": 0.0, "loss/logits": 0.19054419547319412, "loss/reg": 2.1991653442382812, "step": 677 }, { "epoch": 0.00678, "grad_norm": 0.3779783844947815, "grad_norm_var": 0.009181024716334075, "learning_rate": 5e-05, "loss": 0.1589, "loss/crossentropy": 2.7500953674316406, "loss/hidden": 0.0, "loss/logits": 0.15891055390238762, "loss/reg": 2.197261333465576, "step": 678 }, { "epoch": 0.00679, "grad_norm": 0.3585035502910614, "grad_norm_var": 0.009389539404841711, "learning_rate": 5e-05, "loss": 0.1836, "loss/crossentropy": 2.869826376438141, "loss/hidden": 0.0, "loss/logits": 0.18359991908073425, "loss/reg": 2.195239782333374, "step": 679 }, { "epoch": 0.0068, "grad_norm": 0.3534944951534271, "grad_norm_var": 0.009385802469305704, "learning_rate": 5e-05, "loss": 0.1689, "loss/crossentropy": 2.8476794362068176, "loss/hidden": 0.0, "loss/logits": 0.1689487136900425, "loss/reg": 2.1929402351379395, "step": 680 }, { "epoch": 0.00681, "grad_norm": 0.3718988001346588, "grad_norm_var": 0.009470130435250168, "learning_rate": 5e-05, "loss": 0.1704, "loss/crossentropy": 2.7930009365081787, "loss/hidden": 0.0, "loss/logits": 0.17038631066679955, "loss/reg": 2.19075608253479, "step": 681 }, { "epoch": 0.00682, "grad_norm": 0.4854961037635803, "grad_norm_var": 0.009319177707927173, "learning_rate": 5e-05, "loss": 0.1705, "loss/crossentropy": 2.8028470277786255, "loss/hidden": 0.0, "loss/logits": 0.17054682224988937, "loss/reg": 2.1886627674102783, "step": 682 }, { "epoch": 0.00683, "grad_norm": 0.3880312144756317, "grad_norm_var": 0.00931672834921676, "learning_rate": 5e-05, "loss": 0.1766, "loss/crossentropy": 2.963544547557831, "loss/hidden": 0.0, "loss/logits": 0.17662956938147545, "loss/reg": 2.1860859394073486, "step": 683 }, { "epoch": 0.00684, "grad_norm": 0.3488878309726715, "grad_norm_var": 0.009420855437860176, "learning_rate": 5e-05, "loss": 0.1667, "loss/crossentropy": 2.959736704826355, "loss/hidden": 0.0, "loss/logits": 0.16670886427164078, "loss/reg": 2.183668375015259, "step": 684 }, { "epoch": 0.00685, "grad_norm": 0.8154363632202148, "grad_norm_var": 0.018681551405985854, "learning_rate": 5e-05, "loss": 0.2213, "loss/crossentropy": 2.911233067512512, "loss/hidden": 0.0, "loss/logits": 0.2212524674832821, "loss/reg": 2.1813154220581055, "step": 685 }, { "epoch": 0.00686, "grad_norm": 0.4155946969985962, "grad_norm_var": 0.018194440840509217, "learning_rate": 5e-05, "loss": 0.1964, "loss/crossentropy": 2.7186298966407776, "loss/hidden": 0.0, "loss/logits": 0.19640850275754929, "loss/reg": 2.1795501708984375, "step": 686 }, { "epoch": 0.00687, "grad_norm": 0.38160914182662964, "grad_norm_var": 0.01835781299917098, "learning_rate": 5e-05, "loss": 0.1708, "loss/crossentropy": 2.7301290035247803, "loss/hidden": 0.0, "loss/logits": 0.17081937566399574, "loss/reg": 2.177623748779297, "step": 687 }, { "epoch": 0.00688, "grad_norm": 0.41628003120422363, "grad_norm_var": 0.01802219976069038, "learning_rate": 5e-05, "loss": 0.1771, "loss/crossentropy": 2.7876546382904053, "loss/hidden": 0.0, "loss/logits": 0.17706667259335518, "loss/reg": 2.175504446029663, "step": 688 }, { "epoch": 0.00689, "grad_norm": 0.4177417755126953, "grad_norm_var": 0.018066232212721724, "learning_rate": 5e-05, "loss": 0.1786, "loss/crossentropy": 2.763257145881653, "loss/hidden": 0.0, "loss/logits": 0.1785966381430626, "loss/reg": 2.173213481903076, "step": 689 }, { "epoch": 0.0069, "grad_norm": 0.3603265583515167, "grad_norm_var": 0.012296751904473697, "learning_rate": 5e-05, "loss": 0.1678, "loss/crossentropy": 2.774847447872162, "loss/hidden": 0.0, "loss/logits": 0.16783085092902184, "loss/reg": 2.1712427139282227, "step": 690 }, { "epoch": 0.00691, "grad_norm": 0.4307333827018738, "grad_norm_var": 0.01228874334383105, "learning_rate": 5e-05, "loss": 0.2034, "loss/crossentropy": 2.6488924622535706, "loss/hidden": 0.0, "loss/logits": 0.20339511707425117, "loss/reg": 2.170015573501587, "step": 691 }, { "epoch": 0.00692, "grad_norm": 0.3678703010082245, "grad_norm_var": 0.012472673417673882, "learning_rate": 5e-05, "loss": 0.1793, "loss/crossentropy": 2.8285900950431824, "loss/hidden": 0.0, "loss/logits": 0.179282795637846, "loss/reg": 2.1680784225463867, "step": 692 }, { "epoch": 0.00693, "grad_norm": 0.3516632914543152, "grad_norm_var": 0.012747599104723136, "learning_rate": 5e-05, "loss": 0.1638, "loss/crossentropy": 2.72187340259552, "loss/hidden": 0.0, "loss/logits": 0.16377655416727066, "loss/reg": 2.166708469390869, "step": 693 }, { "epoch": 0.00694, "grad_norm": 0.37773895263671875, "grad_norm_var": 0.012748787659448176, "learning_rate": 5e-05, "loss": 0.2, "loss/crossentropy": 2.5079989433288574, "loss/hidden": 0.0, "loss/logits": 0.19995050877332687, "loss/reg": 2.1644845008850098, "step": 694 }, { "epoch": 0.00695, "grad_norm": 0.33557403087615967, "grad_norm_var": 0.012954622340141124, "learning_rate": 5e-05, "loss": 0.173, "loss/crossentropy": 2.733457326889038, "loss/hidden": 0.0, "loss/logits": 0.1730196811258793, "loss/reg": 2.162649631500244, "step": 695 }, { "epoch": 0.00696, "grad_norm": 0.3414340615272522, "grad_norm_var": 0.01306044443406886, "learning_rate": 5e-05, "loss": 0.1699, "loss/crossentropy": 2.770694136619568, "loss/hidden": 0.0, "loss/logits": 0.1698729656636715, "loss/reg": 2.1605873107910156, "step": 696 }, { "epoch": 0.00697, "grad_norm": 0.39742037653923035, "grad_norm_var": 0.012961649579914787, "learning_rate": 5e-05, "loss": 0.1753, "loss/crossentropy": 2.747798502445221, "loss/hidden": 0.0, "loss/logits": 0.17528066039085388, "loss/reg": 2.158661127090454, "step": 697 }, { "epoch": 0.00698, "grad_norm": 0.4672209620475769, "grad_norm_var": 0.012809503544980934, "learning_rate": 5e-05, "loss": 0.1961, "loss/crossentropy": 2.764335811138153, "loss/hidden": 0.0, "loss/logits": 0.1961456499993801, "loss/reg": 2.157139539718628, "step": 698 }, { "epoch": 0.00699, "grad_norm": 0.40900057554244995, "grad_norm_var": 0.012766202979620484, "learning_rate": 5e-05, "loss": 0.1826, "loss/crossentropy": 2.9526583552360535, "loss/hidden": 0.0, "loss/logits": 0.1826096773147583, "loss/reg": 2.1556172370910645, "step": 699 }, { "epoch": 0.007, "grad_norm": 0.45763787627220154, "grad_norm_var": 0.01255169197725956, "learning_rate": 5e-05, "loss": 0.1845, "loss/crossentropy": 2.9059385657310486, "loss/hidden": 0.0, "loss/logits": 0.18454358726739883, "loss/reg": 2.1542842388153076, "step": 700 }, { "epoch": 0.00701, "grad_norm": 0.568651020526886, "grad_norm_var": 0.0033942912710514268, "learning_rate": 5e-05, "loss": 0.1801, "loss/crossentropy": 2.771495759487152, "loss/hidden": 0.0, "loss/logits": 0.18007055297493935, "loss/reg": 2.1522164344787598, "step": 701 }, { "epoch": 0.00702, "grad_norm": 0.3590672016143799, "grad_norm_var": 0.00352192003862181, "learning_rate": 5e-05, "loss": 0.1651, "loss/crossentropy": 2.750881016254425, "loss/hidden": 0.0, "loss/logits": 0.1650897115468979, "loss/reg": 2.1509296894073486, "step": 702 }, { "epoch": 0.00703, "grad_norm": 0.36948493123054504, "grad_norm_var": 0.0035648755964216056, "learning_rate": 5e-05, "loss": 0.1785, "loss/crossentropy": 2.8027891516685486, "loss/hidden": 0.0, "loss/logits": 0.17848948016762733, "loss/reg": 2.149231195449829, "step": 703 }, { "epoch": 0.00704, "grad_norm": 0.3613908588886261, "grad_norm_var": 0.0036467673242235915, "learning_rate": 5e-05, "loss": 0.1682, "loss/crossentropy": 2.763719141483307, "loss/hidden": 0.0, "loss/logits": 0.16823140904307365, "loss/reg": 2.1478073596954346, "step": 704 }, { "epoch": 0.00705, "grad_norm": 0.38240060210227966, "grad_norm_var": 0.003633263034560896, "learning_rate": 5e-05, "loss": 0.178, "loss/crossentropy": 2.705716133117676, "loss/hidden": 0.0, "loss/logits": 0.1780022643506527, "loss/reg": 2.1462342739105225, "step": 705 }, { "epoch": 0.00706, "grad_norm": 0.3587467074394226, "grad_norm_var": 0.0036409547879681387, "learning_rate": 5e-05, "loss": 0.1658, "loss/crossentropy": 2.7772558331489563, "loss/hidden": 0.0, "loss/logits": 0.16575098782777786, "loss/reg": 2.144062042236328, "step": 706 }, { "epoch": 0.00707, "grad_norm": 0.36025822162628174, "grad_norm_var": 0.0036250184261099458, "learning_rate": 5e-05, "loss": 0.1724, "loss/crossentropy": 2.634014904499054, "loss/hidden": 0.0, "loss/logits": 0.17235567048192024, "loss/reg": 2.1429154872894287, "step": 707 }, { "epoch": 0.00708, "grad_norm": 0.35575759410858154, "grad_norm_var": 0.0036725083584184842, "learning_rate": 5e-05, "loss": 0.1794, "loss/crossentropy": 2.6474004983901978, "loss/hidden": 0.0, "loss/logits": 0.17943605780601501, "loss/reg": 2.1421751976013184, "step": 708 }, { "epoch": 0.00709, "grad_norm": 0.3865105211734772, "grad_norm_var": 0.003566375202589933, "learning_rate": 5e-05, "loss": 0.1817, "loss/crossentropy": 2.9082140922546387, "loss/hidden": 0.0, "loss/logits": 0.18172414600849152, "loss/reg": 2.1403071880340576, "step": 709 }, { "epoch": 0.0071, "grad_norm": 0.368362694978714, "grad_norm_var": 0.003590971719305887, "learning_rate": 5e-05, "loss": 0.1711, "loss/crossentropy": 2.6766469478607178, "loss/hidden": 0.0, "loss/logits": 0.17112310975790024, "loss/reg": 2.138167142868042, "step": 710 }, { "epoch": 0.00711, "grad_norm": 0.34797176718711853, "grad_norm_var": 0.003506589552138199, "learning_rate": 5e-05, "loss": 0.163, "loss/crossentropy": 2.8545928597450256, "loss/hidden": 0.0, "loss/logits": 0.16299721226096153, "loss/reg": 2.1356112957000732, "step": 711 }, { "epoch": 0.00712, "grad_norm": 0.3511999547481537, "grad_norm_var": 0.0034451354888741254, "learning_rate": 5e-05, "loss": 0.1706, "loss/crossentropy": 2.7516467571258545, "loss/hidden": 0.0, "loss/logits": 0.1705768182873726, "loss/reg": 2.13396954536438, "step": 712 }, { "epoch": 0.00713, "grad_norm": 0.4692562520503998, "grad_norm_var": 0.0038021677070381584, "learning_rate": 5e-05, "loss": 0.1772, "loss/crossentropy": 2.7622230648994446, "loss/hidden": 0.0, "loss/logits": 0.1771526113152504, "loss/reg": 2.1316707134246826, "step": 713 }, { "epoch": 0.00714, "grad_norm": 0.3500974774360657, "grad_norm_var": 0.003583350276630167, "learning_rate": 5e-05, "loss": 0.1651, "loss/crossentropy": 2.7385149598121643, "loss/hidden": 0.0, "loss/logits": 0.16512250155210495, "loss/reg": 2.1301488876342773, "step": 714 }, { "epoch": 0.00715, "grad_norm": 0.33279696106910706, "grad_norm_var": 0.0037632620297312364, "learning_rate": 5e-05, "loss": 0.1639, "loss/crossentropy": 2.6760587096214294, "loss/hidden": 0.0, "loss/logits": 0.16387901455163956, "loss/reg": 2.128563165664673, "step": 715 }, { "epoch": 0.00716, "grad_norm": 0.36436334252357483, "grad_norm_var": 0.003418879723208453, "learning_rate": 5e-05, "loss": 0.1675, "loss/crossentropy": 2.7055559158325195, "loss/hidden": 0.0, "loss/logits": 0.16751762479543686, "loss/reg": 2.1272294521331787, "step": 716 }, { "epoch": 0.00717, "grad_norm": 0.35308849811553955, "grad_norm_var": 0.0009122804473129371, "learning_rate": 5e-05, "loss": 0.1685, "loss/crossentropy": 2.827264368534088, "loss/hidden": 0.0, "loss/logits": 0.16845671087503433, "loss/reg": 2.125559091567993, "step": 717 }, { "epoch": 0.00718, "grad_norm": 0.36609259247779846, "grad_norm_var": 0.0009080073745675876, "learning_rate": 5e-05, "loss": 0.1878, "loss/crossentropy": 2.7995529770851135, "loss/hidden": 0.0, "loss/logits": 0.18778567016124725, "loss/reg": 2.12422513961792, "step": 718 }, { "epoch": 0.00719, "grad_norm": 0.3564467430114746, "grad_norm_var": 0.0009149400496893722, "learning_rate": 5e-05, "loss": 0.1665, "loss/crossentropy": 2.848701000213623, "loss/hidden": 0.0, "loss/logits": 0.16652807220816612, "loss/reg": 2.1227705478668213, "step": 719 }, { "epoch": 0.0072, "grad_norm": 0.3523035943508148, "grad_norm_var": 0.0009263477116920882, "learning_rate": 5e-05, "loss": 0.169, "loss/crossentropy": 2.7714666724205017, "loss/hidden": 0.0, "loss/logits": 0.16896242648363113, "loss/reg": 2.1214191913604736, "step": 720 }, { "epoch": 0.00721, "grad_norm": 0.39885270595550537, "grad_norm_var": 0.0009792887842439849, "learning_rate": 5e-05, "loss": 0.1701, "loss/crossentropy": 2.835131287574768, "loss/hidden": 0.0, "loss/logits": 0.1700747236609459, "loss/reg": 2.1200404167175293, "step": 721 }, { "epoch": 0.00722, "grad_norm": 0.40293964743614197, "grad_norm_var": 0.0010526817455953927, "learning_rate": 5e-05, "loss": 0.1819, "loss/crossentropy": 2.744925618171692, "loss/hidden": 0.0, "loss/logits": 0.1818903423845768, "loss/reg": 2.1181347370147705, "step": 722 }, { "epoch": 0.00723, "grad_norm": 0.5598530769348145, "grad_norm_var": 0.0032894654306610577, "learning_rate": 5e-05, "loss": 0.1662, "loss/crossentropy": 2.983691990375519, "loss/hidden": 0.0, "loss/logits": 0.16615596786141396, "loss/reg": 2.1172993183135986, "step": 723 }, { "epoch": 0.00724, "grad_norm": 0.39669546484947205, "grad_norm_var": 0.003249640426166478, "learning_rate": 5e-05, "loss": 0.1735, "loss/crossentropy": 2.709549069404602, "loss/hidden": 0.0, "loss/logits": 0.17345992848277092, "loss/reg": 2.1152801513671875, "step": 724 }, { "epoch": 0.00725, "grad_norm": 0.35726040601730347, "grad_norm_var": 0.0032964500726321067, "learning_rate": 5e-05, "loss": 0.1624, "loss/crossentropy": 2.7148231267929077, "loss/hidden": 0.0, "loss/logits": 0.16241873800754547, "loss/reg": 2.1130924224853516, "step": 725 }, { "epoch": 0.00726, "grad_norm": 0.3927571177482605, "grad_norm_var": 0.0032861190572127997, "learning_rate": 5e-05, "loss": 0.1795, "loss/crossentropy": 2.7939482927322388, "loss/hidden": 0.0, "loss/logits": 0.1794990859925747, "loss/reg": 2.111480712890625, "step": 726 }, { "epoch": 0.00727, "grad_norm": 0.3941044807434082, "grad_norm_var": 0.0031944564404073005, "learning_rate": 5e-05, "loss": 0.1712, "loss/crossentropy": 2.834249794483185, "loss/hidden": 0.0, "loss/logits": 0.1711888276040554, "loss/reg": 2.109204053878784, "step": 727 }, { "epoch": 0.00728, "grad_norm": 0.4828793704509735, "grad_norm_var": 0.0036429198556795937, "learning_rate": 5e-05, "loss": 0.2082, "loss/crossentropy": 2.899094045162201, "loss/hidden": 0.0, "loss/logits": 0.20823358744382858, "loss/reg": 2.1061551570892334, "step": 728 }, { "epoch": 0.00729, "grad_norm": 0.3574215769767761, "grad_norm_var": 0.003326472236741973, "learning_rate": 5e-05, "loss": 0.1596, "loss/crossentropy": 2.7636680603027344, "loss/hidden": 0.0, "loss/logits": 0.15961402654647827, "loss/reg": 2.104001760482788, "step": 729 }, { "epoch": 0.0073, "grad_norm": 0.40163764357566833, "grad_norm_var": 0.003227754706050412, "learning_rate": 5e-05, "loss": 0.1797, "loss/crossentropy": 2.8588566184043884, "loss/hidden": 0.0, "loss/logits": 0.17974677309393883, "loss/reg": 2.102442741394043, "step": 730 }, { "epoch": 0.00731, "grad_norm": 0.37189754843711853, "grad_norm_var": 0.003015475193035148, "learning_rate": 5e-05, "loss": 0.1673, "loss/crossentropy": 2.826458215713501, "loss/hidden": 0.0, "loss/logits": 0.16728204488754272, "loss/reg": 2.1002049446105957, "step": 731 }, { "epoch": 0.00732, "grad_norm": 0.3587784171104431, "grad_norm_var": 0.003039707591927121, "learning_rate": 5e-05, "loss": 0.1645, "loss/crossentropy": 2.731923222541809, "loss/hidden": 0.0, "loss/logits": 0.16448039561510086, "loss/reg": 2.098315954208374, "step": 732 }, { "epoch": 0.00733, "grad_norm": 0.37631648778915405, "grad_norm_var": 0.002946915065401934, "learning_rate": 5e-05, "loss": 0.1714, "loss/crossentropy": 2.880859136581421, "loss/hidden": 0.0, "loss/logits": 0.17137856781482697, "loss/reg": 2.0962650775909424, "step": 733 }, { "epoch": 0.00734, "grad_norm": 0.3563605844974518, "grad_norm_var": 0.002990850657754888, "learning_rate": 5e-05, "loss": 0.1628, "loss/crossentropy": 2.6355279088020325, "loss/hidden": 0.0, "loss/logits": 0.16278789564967155, "loss/reg": 2.094142436981201, "step": 734 }, { "epoch": 0.00735, "grad_norm": 0.37199047207832336, "grad_norm_var": 0.0029265023383142925, "learning_rate": 5e-05, "loss": 0.1802, "loss/crossentropy": 2.769617021083832, "loss/hidden": 0.0, "loss/logits": 0.18019907549023628, "loss/reg": 2.0921125411987305, "step": 735 }, { "epoch": 0.00736, "grad_norm": 0.489103764295578, "grad_norm_var": 0.0033036264225515164, "learning_rate": 5e-05, "loss": 0.1832, "loss/crossentropy": 2.7491883039474487, "loss/hidden": 0.0, "loss/logits": 0.1831774264574051, "loss/reg": 2.0904338359832764, "step": 736 }, { "epoch": 0.00737, "grad_norm": 0.5059826970100403, "grad_norm_var": 0.003943075932518525, "learning_rate": 5e-05, "loss": 0.1817, "loss/crossentropy": 2.8231146931648254, "loss/hidden": 0.0, "loss/logits": 0.18174266442656517, "loss/reg": 2.0879714488983154, "step": 737 }, { "epoch": 0.00738, "grad_norm": 0.6662333011627197, "grad_norm_var": 0.007992879009924207, "learning_rate": 5e-05, "loss": 0.1861, "loss/crossentropy": 2.7952335476875305, "loss/hidden": 0.0, "loss/logits": 0.18612126260995865, "loss/reg": 2.0855941772460938, "step": 738 }, { "epoch": 0.00739, "grad_norm": 0.43555790185928345, "grad_norm_var": 0.006764259520141217, "learning_rate": 5e-05, "loss": 0.1823, "loss/crossentropy": 2.7390406727790833, "loss/hidden": 0.0, "loss/logits": 0.1822943352162838, "loss/reg": 2.083002805709839, "step": 739 }, { "epoch": 0.0074, "grad_norm": 0.36206063628196716, "grad_norm_var": 0.0069454028516603905, "learning_rate": 5e-05, "loss": 0.1747, "loss/crossentropy": 2.6245489716529846, "loss/hidden": 0.0, "loss/logits": 0.17474905773997307, "loss/reg": 2.0807597637176514, "step": 740 }, { "epoch": 0.00741, "grad_norm": 0.4077146649360657, "grad_norm_var": 0.006699115025232619, "learning_rate": 5e-05, "loss": 0.21, "loss/crossentropy": 2.734727144241333, "loss/hidden": 0.0, "loss/logits": 0.20996900647878647, "loss/reg": 2.0777878761291504, "step": 741 }, { "epoch": 0.00742, "grad_norm": 0.4748740792274475, "grad_norm_var": 0.0068148961293998615, "learning_rate": 5e-05, "loss": 0.1926, "loss/crossentropy": 2.757317006587982, "loss/hidden": 0.0, "loss/logits": 0.19259492680430412, "loss/reg": 2.0748398303985596, "step": 742 }, { "epoch": 0.00743, "grad_norm": 0.3738694190979004, "grad_norm_var": 0.006926021168675212, "learning_rate": 5e-05, "loss": 0.1671, "loss/crossentropy": 2.7804144620895386, "loss/hidden": 0.0, "loss/logits": 0.1671152375638485, "loss/reg": 2.072981119155884, "step": 743 }, { "epoch": 0.00744, "grad_norm": 0.4373812675476074, "grad_norm_var": 0.006701504868688938, "learning_rate": 5e-05, "loss": 0.1836, "loss/crossentropy": 2.8251866698265076, "loss/hidden": 0.0, "loss/logits": 0.183602724224329, "loss/reg": 2.069178581237793, "step": 744 }, { "epoch": 0.00745, "grad_norm": 0.41339626908302307, "grad_norm_var": 0.006417608208757027, "learning_rate": 5e-05, "loss": 0.1665, "loss/crossentropy": 2.7784698605537415, "loss/hidden": 0.0, "loss/logits": 0.16648468375205994, "loss/reg": 2.066897392272949, "step": 745 }, { "epoch": 0.00746, "grad_norm": 0.36906108260154724, "grad_norm_var": 0.0065862671267569286, "learning_rate": 5e-05, "loss": 0.1752, "loss/crossentropy": 2.7134994864463806, "loss/hidden": 0.0, "loss/logits": 0.17522963881492615, "loss/reg": 2.063711404800415, "step": 746 }, { "epoch": 0.00747, "grad_norm": 0.3699776232242584, "grad_norm_var": 0.006599620482715507, "learning_rate": 5e-05, "loss": 0.1816, "loss/crossentropy": 2.6448380947113037, "loss/hidden": 0.0, "loss/logits": 0.1815556287765503, "loss/reg": 2.0618152618408203, "step": 747 }, { "epoch": 0.00748, "grad_norm": 0.35848432779312134, "grad_norm_var": 0.006602145753337363, "learning_rate": 5e-05, "loss": 0.1759, "loss/crossentropy": 2.577029287815094, "loss/hidden": 0.0, "loss/logits": 0.1758727729320526, "loss/reg": 2.0592784881591797, "step": 748 }, { "epoch": 0.00749, "grad_norm": 0.40015411376953125, "grad_norm_var": 0.006489211309593653, "learning_rate": 5e-05, "loss": 0.2007, "loss/crossentropy": 2.7719894647598267, "loss/hidden": 0.0, "loss/logits": 0.20066174119710922, "loss/reg": 2.056396484375, "step": 749 }, { "epoch": 0.0075, "grad_norm": 0.34235846996307373, "grad_norm_var": 0.006628701391081989, "learning_rate": 5e-05, "loss": 0.1661, "loss/crossentropy": 2.8526532649993896, "loss/hidden": 0.0, "loss/logits": 0.1660567931830883, "loss/reg": 2.053225040435791, "step": 750 }, { "epoch": 0.00751, "grad_norm": 0.37578198313713074, "grad_norm_var": 0.006603490490161393, "learning_rate": 5e-05, "loss": 0.192, "loss/crossentropy": 2.803673267364502, "loss/hidden": 0.0, "loss/logits": 0.19199685007333755, "loss/reg": 2.0500073432922363, "step": 751 }, { "epoch": 0.00752, "grad_norm": 0.3724234700202942, "grad_norm_var": 0.006439587327084632, "learning_rate": 5e-05, "loss": 0.1611, "loss/crossentropy": 2.8497246503829956, "loss/hidden": 0.0, "loss/logits": 0.16111965849995613, "loss/reg": 2.0481040477752686, "step": 752 }, { "epoch": 0.00753, "grad_norm": 0.37283533811569214, "grad_norm_var": 0.005960471364599432, "learning_rate": 5e-05, "loss": 0.1816, "loss/crossentropy": 2.580562174320221, "loss/hidden": 0.0, "loss/logits": 0.18162427470088005, "loss/reg": 2.045363187789917, "step": 753 }, { "epoch": 0.00754, "grad_norm": 0.42849722504615784, "grad_norm_var": 0.0013156070888824681, "learning_rate": 5e-05, "loss": 0.1885, "loss/crossentropy": 2.7384997606277466, "loss/hidden": 0.0, "loss/logits": 0.1885378062725067, "loss/reg": 2.0420894622802734, "step": 754 }, { "epoch": 0.00755, "grad_norm": 0.3246319890022278, "grad_norm_var": 0.0014611472372319412, "learning_rate": 5e-05, "loss": 0.152, "loss/crossentropy": 2.827781558036804, "loss/hidden": 0.0, "loss/logits": 0.15203238278627396, "loss/reg": 2.0399134159088135, "step": 755 }, { "epoch": 0.00756, "grad_norm": 0.3523566722869873, "grad_norm_var": 0.0014986135555234647, "learning_rate": 5e-05, "loss": 0.1799, "loss/crossentropy": 2.7049853801727295, "loss/hidden": 0.0, "loss/logits": 0.17991740256547928, "loss/reg": 2.036095142364502, "step": 756 }, { "epoch": 0.00757, "grad_norm": 0.3352646827697754, "grad_norm_var": 0.0016155829783374783, "learning_rate": 5e-05, "loss": 0.1612, "loss/crossentropy": 2.715296685695648, "loss/hidden": 0.0, "loss/logits": 0.16122159361839294, "loss/reg": 2.0335283279418945, "step": 757 }, { "epoch": 0.00758, "grad_norm": 0.36173179745674133, "grad_norm_var": 0.001004548523534495, "learning_rate": 5e-05, "loss": 0.1768, "loss/crossentropy": 2.8797001242637634, "loss/hidden": 0.0, "loss/logits": 0.17676853761076927, "loss/reg": 2.0315983295440674, "step": 758 }, { "epoch": 0.00759, "grad_norm": 0.43379032611846924, "grad_norm_var": 0.0012258123535982288, "learning_rate": 5e-05, "loss": 0.2008, "loss/crossentropy": 2.7367305159568787, "loss/hidden": 0.0, "loss/logits": 0.20076703280210495, "loss/reg": 2.028825521469116, "step": 759 }, { "epoch": 0.0076, "grad_norm": 0.7135851979255676, "grad_norm_var": 0.008180404333396396, "learning_rate": 5e-05, "loss": 0.1988, "loss/crossentropy": 2.7777557373046875, "loss/hidden": 0.0, "loss/logits": 0.19884883239865303, "loss/reg": 2.0258262157440186, "step": 760 }, { "epoch": 0.00761, "grad_norm": 0.36141064763069153, "grad_norm_var": 0.008223674749041798, "learning_rate": 5e-05, "loss": 0.1723, "loss/crossentropy": 2.755949318408966, "loss/hidden": 0.0, "loss/logits": 0.17225057631731033, "loss/reg": 2.022468090057373, "step": 761 }, { "epoch": 0.00762, "grad_norm": 0.47610145807266235, "grad_norm_var": 0.008612084301677063, "learning_rate": 5e-05, "loss": 0.1786, "loss/crossentropy": 2.701655924320221, "loss/hidden": 0.0, "loss/logits": 0.17863870784640312, "loss/reg": 2.0194478034973145, "step": 762 }, { "epoch": 0.00763, "grad_norm": 0.35960420966148376, "grad_norm_var": 0.0086585523494028, "learning_rate": 5e-05, "loss": 0.1719, "loss/crossentropy": 2.6496411561965942, "loss/hidden": 0.0, "loss/logits": 0.17191722244024277, "loss/reg": 2.0173516273498535, "step": 763 }, { "epoch": 0.00764, "grad_norm": 0.3759422302246094, "grad_norm_var": 0.008585472348376118, "learning_rate": 5e-05, "loss": 0.1665, "loss/crossentropy": 2.7261382937431335, "loss/hidden": 0.0, "loss/logits": 0.1664697714149952, "loss/reg": 2.014254331588745, "step": 764 }, { "epoch": 0.00765, "grad_norm": 0.3791477680206299, "grad_norm_var": 0.008610251361000461, "learning_rate": 5e-05, "loss": 0.1716, "loss/crossentropy": 2.779210090637207, "loss/hidden": 0.0, "loss/logits": 0.1715676300227642, "loss/reg": 2.0109994411468506, "step": 765 }, { "epoch": 0.00766, "grad_norm": 0.37698858976364136, "grad_norm_var": 0.00842901980982322, "learning_rate": 5e-05, "loss": 0.178, "loss/crossentropy": 2.5693264603614807, "loss/hidden": 0.0, "loss/logits": 0.17796850576996803, "loss/reg": 2.007894277572632, "step": 766 }, { "epoch": 0.00767, "grad_norm": 0.324692040681839, "grad_norm_var": 0.008757168987509056, "learning_rate": 5e-05, "loss": 0.163, "loss/crossentropy": 2.790699005126953, "loss/hidden": 0.0, "loss/logits": 0.16297711431980133, "loss/reg": 2.0054080486297607, "step": 767 }, { "epoch": 0.00768, "grad_norm": 0.37725430727005005, "grad_norm_var": 0.008742918144709544, "learning_rate": 5e-05, "loss": 0.1701, "loss/crossentropy": 2.90560781955719, "loss/hidden": 0.0, "loss/logits": 0.17007537558674812, "loss/reg": 2.0026705265045166, "step": 768 }, { "epoch": 0.00769, "grad_norm": 0.3565872013568878, "grad_norm_var": 0.008812017421293783, "learning_rate": 5e-05, "loss": 0.1748, "loss/crossentropy": 2.8573551774024963, "loss/hidden": 0.0, "loss/logits": 0.1747995764017105, "loss/reg": 1.9999767541885376, "step": 769 }, { "epoch": 0.0077, "grad_norm": 0.32768821716308594, "grad_norm_var": 0.009011701837686615, "learning_rate": 5e-05, "loss": 0.1647, "loss/crossentropy": 2.7850446105003357, "loss/hidden": 0.0, "loss/logits": 0.16465429961681366, "loss/reg": 1.9974277019500732, "step": 770 }, { "epoch": 0.00771, "grad_norm": 0.34194430708885193, "grad_norm_var": 0.008880009468442519, "learning_rate": 5e-05, "loss": 0.1681, "loss/crossentropy": 2.89225697517395, "loss/hidden": 0.0, "loss/logits": 0.16806093603372574, "loss/reg": 1.9941447973251343, "step": 771 }, { "epoch": 0.00772, "grad_norm": 0.36788639426231384, "grad_norm_var": 0.008815313943155234, "learning_rate": 5e-05, "loss": 0.1824, "loss/crossentropy": 2.8710330724716187, "loss/hidden": 0.0, "loss/logits": 0.1823921650648117, "loss/reg": 1.991845965385437, "step": 772 }, { "epoch": 0.00773, "grad_norm": 0.33500465750694275, "grad_norm_var": 0.008817280025891942, "learning_rate": 5e-05, "loss": 0.1669, "loss/crossentropy": 2.7821491956710815, "loss/hidden": 0.0, "loss/logits": 0.16685106977820396, "loss/reg": 1.9901355504989624, "step": 773 }, { "epoch": 0.00774, "grad_norm": 0.33815550804138184, "grad_norm_var": 0.008946649562538052, "learning_rate": 5e-05, "loss": 0.162, "loss/crossentropy": 2.7051143050193787, "loss/hidden": 0.0, "loss/logits": 0.16199326515197754, "loss/reg": 1.9880555868148804, "step": 774 }, { "epoch": 0.00775, "grad_norm": 0.32524728775024414, "grad_norm_var": 0.009054478596347363, "learning_rate": 5e-05, "loss": 0.169, "loss/crossentropy": 2.72264701128006, "loss/hidden": 0.0, "loss/logits": 0.16902651265263557, "loss/reg": 1.986093521118164, "step": 775 }, { "epoch": 0.00776, "grad_norm": 0.34697458148002625, "grad_norm_var": 0.0013234442614042051, "learning_rate": 5e-05, "loss": 0.1691, "loss/crossentropy": 2.780848979949951, "loss/hidden": 0.0, "loss/logits": 0.1690516211092472, "loss/reg": 1.9844779968261719, "step": 776 }, { "epoch": 0.00777, "grad_norm": 0.33995282649993896, "grad_norm_var": 0.0013500864177136548, "learning_rate": 5e-05, "loss": 0.1612, "loss/crossentropy": 2.772739827632904, "loss/hidden": 0.0, "loss/logits": 0.16119593381881714, "loss/reg": 1.9825077056884766, "step": 777 }, { "epoch": 0.00778, "grad_norm": 0.35139432549476624, "grad_norm_var": 0.0003803343966673219, "learning_rate": 5e-05, "loss": 0.1668, "loss/crossentropy": 2.7008825540542603, "loss/hidden": 0.0, "loss/logits": 0.16681700944900513, "loss/reg": 1.9806305170059204, "step": 778 }, { "epoch": 0.00779, "grad_norm": 0.4588527977466583, "grad_norm_var": 0.00110283708340256, "learning_rate": 5e-05, "loss": 0.1907, "loss/crossentropy": 2.632855713367462, "loss/hidden": 0.0, "loss/logits": 0.19068260118365288, "loss/reg": 1.9793086051940918, "step": 779 }, { "epoch": 0.0078, "grad_norm": 0.3829444646835327, "grad_norm_var": 0.0011229031183707624, "learning_rate": 5e-05, "loss": 0.1875, "loss/crossentropy": 2.9350045323371887, "loss/hidden": 0.0, "loss/logits": 0.18754199519753456, "loss/reg": 1.9774302244186401, "step": 780 }, { "epoch": 0.00781, "grad_norm": 0.46253493428230286, "grad_norm_var": 0.0017907320044085833, "learning_rate": 5e-05, "loss": 0.1957, "loss/crossentropy": 2.7478776574134827, "loss/hidden": 0.0, "loss/logits": 0.19565920531749725, "loss/reg": 1.9757329225540161, "step": 781 }, { "epoch": 0.00782, "grad_norm": 0.35229969024658203, "grad_norm_var": 0.0017840355007145352, "learning_rate": 5e-05, "loss": 0.1611, "loss/crossentropy": 2.805756628513336, "loss/hidden": 0.0, "loss/logits": 0.1610955037176609, "loss/reg": 1.973933458328247, "step": 782 }, { "epoch": 0.00783, "grad_norm": 0.3324076533317566, "grad_norm_var": 0.0017495419673394963, "learning_rate": 5e-05, "loss": 0.1706, "loss/crossentropy": 2.8343148827552795, "loss/hidden": 0.0, "loss/logits": 0.1706329919397831, "loss/reg": 1.9718844890594482, "step": 783 }, { "epoch": 0.00784, "grad_norm": 0.3563413619995117, "grad_norm_var": 0.0017352353042652258, "learning_rate": 5e-05, "loss": 0.1768, "loss/crossentropy": 2.825278103351593, "loss/hidden": 0.0, "loss/logits": 0.17681827396154404, "loss/reg": 1.969612956047058, "step": 784 }, { "epoch": 0.00785, "grad_norm": 0.33560603857040405, "grad_norm_var": 0.0017751309342711038, "learning_rate": 5e-05, "loss": 0.1558, "loss/crossentropy": 2.8132280111312866, "loss/hidden": 0.0, "loss/logits": 0.1558120921254158, "loss/reg": 1.9675753116607666, "step": 785 }, { "epoch": 0.00786, "grad_norm": 0.39733415842056274, "grad_norm_var": 0.0017810049820061401, "learning_rate": 5e-05, "loss": 0.1849, "loss/crossentropy": 2.8960456252098083, "loss/hidden": 0.0, "loss/logits": 0.18493180349469185, "loss/reg": 1.965217113494873, "step": 786 }, { "epoch": 0.00787, "grad_norm": 0.561698317527771, "grad_norm_var": 0.004151387117344507, "learning_rate": 5e-05, "loss": 0.1987, "loss/crossentropy": 2.66669100522995, "loss/hidden": 0.0, "loss/logits": 0.19869648292660713, "loss/reg": 1.9629205465316772, "step": 787 }, { "epoch": 0.00788, "grad_norm": 0.35911333560943604, "grad_norm_var": 0.004167781816727311, "learning_rate": 5e-05, "loss": 0.1742, "loss/crossentropy": 2.723667323589325, "loss/hidden": 0.0, "loss/logits": 0.17424843832850456, "loss/reg": 1.9608547687530518, "step": 788 }, { "epoch": 0.00789, "grad_norm": 0.3422897160053253, "grad_norm_var": 0.004130072564222831, "learning_rate": 5e-05, "loss": 0.1683, "loss/crossentropy": 2.853653848171234, "loss/hidden": 0.0, "loss/logits": 0.16829833760857582, "loss/reg": 1.959040880203247, "step": 789 }, { "epoch": 0.0079, "grad_norm": 0.373519629240036, "grad_norm_var": 0.0040217911733014585, "learning_rate": 5e-05, "loss": 0.1697, "loss/crossentropy": 2.7469093799591064, "loss/hidden": 0.0, "loss/logits": 0.1697460599243641, "loss/reg": 1.9578640460968018, "step": 790 }, { "epoch": 0.00791, "grad_norm": 0.42586550116539, "grad_norm_var": 0.003921241787547673, "learning_rate": 5e-05, "loss": 0.1896, "loss/crossentropy": 2.9876235127449036, "loss/hidden": 0.0, "loss/logits": 0.18961479887366295, "loss/reg": 1.9558684825897217, "step": 791 }, { "epoch": 0.00792, "grad_norm": 0.34371063113212585, "grad_norm_var": 0.00393897634361432, "learning_rate": 5e-05, "loss": 0.1624, "loss/crossentropy": 2.922863006591797, "loss/hidden": 0.0, "loss/logits": 0.16239817067980766, "loss/reg": 1.9541829824447632, "step": 792 }, { "epoch": 0.00793, "grad_norm": 0.3611912727355957, "grad_norm_var": 0.0038367960883469387, "learning_rate": 5e-05, "loss": 0.1767, "loss/crossentropy": 2.751186192035675, "loss/hidden": 0.0, "loss/logits": 0.17671825364232063, "loss/reg": 1.9524297714233398, "step": 793 }, { "epoch": 0.00794, "grad_norm": 0.3787733018398285, "grad_norm_var": 0.0037525025406884736, "learning_rate": 5e-05, "loss": 0.1695, "loss/crossentropy": 2.6539193391799927, "loss/hidden": 0.0, "loss/logits": 0.16952653229236603, "loss/reg": 1.951439619064331, "step": 794 }, { "epoch": 0.00795, "grad_norm": 0.37621310353279114, "grad_norm_var": 0.003409985625982037, "learning_rate": 5e-05, "loss": 0.1827, "loss/crossentropy": 2.672878086566925, "loss/hidden": 0.0, "loss/logits": 0.1826501600444317, "loss/reg": 1.9504698514938354, "step": 795 }, { "epoch": 0.00796, "grad_norm": 0.3580264747142792, "grad_norm_var": 0.0034518512961513536, "learning_rate": 5e-05, "loss": 0.1741, "loss/crossentropy": 2.8564891815185547, "loss/hidden": 0.0, "loss/logits": 0.17409207299351692, "loss/reg": 1.9492477178573608, "step": 796 }, { "epoch": 0.00797, "grad_norm": 0.3552623689174652, "grad_norm_var": 0.0030235748866805395, "learning_rate": 5e-05, "loss": 0.1679, "loss/crossentropy": 2.9642611145973206, "loss/hidden": 0.0, "loss/logits": 0.16786304488778114, "loss/reg": 1.9484763145446777, "step": 797 }, { "epoch": 0.00798, "grad_norm": 0.37029561400413513, "grad_norm_var": 0.0029878997549970957, "learning_rate": 5e-05, "loss": 0.1837, "loss/crossentropy": 2.7581509947776794, "loss/hidden": 0.0, "loss/logits": 0.18365685641765594, "loss/reg": 1.946696400642395, "step": 798 }, { "epoch": 0.00799, "grad_norm": 0.37257152795791626, "grad_norm_var": 0.00285137706672662, "learning_rate": 5e-05, "loss": 0.1768, "loss/crossentropy": 2.766197443008423, "loss/hidden": 0.0, "loss/logits": 0.17676663026213646, "loss/reg": 1.9451103210449219, "step": 799 }, { "epoch": 0.008, "grad_norm": 0.3937225043773651, "grad_norm_var": 0.0028245897421089812, "learning_rate": 5e-05, "loss": 0.1593, "loss/crossentropy": 2.968823492527008, "loss/hidden": 0.0, "loss/logits": 0.15929469466209412, "loss/reg": 1.943403959274292, "step": 800 }, { "epoch": 0.00801, "grad_norm": 0.5229995846748352, "grad_norm_var": 0.003870799660257873, "learning_rate": 5e-05, "loss": 0.1904, "loss/crossentropy": 2.5733524560928345, "loss/hidden": 0.0, "loss/logits": 0.19040565192699432, "loss/reg": 1.9423651695251465, "step": 801 }, { "epoch": 0.00802, "grad_norm": 0.4087795913219452, "grad_norm_var": 0.003885163701405114, "learning_rate": 5e-05, "loss": 0.2011, "loss/crossentropy": 2.643693685531616, "loss/hidden": 0.0, "loss/logits": 0.20110392943024635, "loss/reg": 1.941137433052063, "step": 802 }, { "epoch": 0.00803, "grad_norm": 0.369555801153183, "grad_norm_var": 0.0018963737991296507, "learning_rate": 5e-05, "loss": 0.1768, "loss/crossentropy": 2.702915072441101, "loss/hidden": 0.0, "loss/logits": 0.17682579904794693, "loss/reg": 1.9396127462387085, "step": 803 }, { "epoch": 0.00804, "grad_norm": 0.3822772204875946, "grad_norm_var": 0.001859244513831604, "learning_rate": 5e-05, "loss": 0.1674, "loss/crossentropy": 2.8051819801330566, "loss/hidden": 0.0, "loss/logits": 0.1674252152442932, "loss/reg": 1.9382424354553223, "step": 804 }, { "epoch": 0.00805, "grad_norm": 0.42195388674736023, "grad_norm_var": 0.0018187903132861672, "learning_rate": 5e-05, "loss": 0.1982, "loss/crossentropy": 2.7962412238121033, "loss/hidden": 0.0, "loss/logits": 0.19819854572415352, "loss/reg": 1.9367636442184448, "step": 805 }, { "epoch": 0.00806, "grad_norm": 0.39215224981307983, "grad_norm_var": 0.001803471303692028, "learning_rate": 5e-05, "loss": 0.1862, "loss/crossentropy": 2.838093400001526, "loss/hidden": 0.0, "loss/logits": 0.1862441450357437, "loss/reg": 1.9356895685195923, "step": 806 }, { "epoch": 0.00807, "grad_norm": 0.36561474204063416, "grad_norm_var": 0.0017388941933010808, "learning_rate": 5e-05, "loss": 0.1659, "loss/crossentropy": 2.8486820459365845, "loss/hidden": 0.0, "loss/logits": 0.16586757823824883, "loss/reg": 1.9336965084075928, "step": 807 }, { "epoch": 0.00808, "grad_norm": 0.3940856456756592, "grad_norm_var": 0.0016146705961780842, "learning_rate": 5e-05, "loss": 0.1695, "loss/crossentropy": 2.7590547800064087, "loss/hidden": 0.0, "loss/logits": 0.16953302919864655, "loss/reg": 1.9318287372589111, "step": 808 }, { "epoch": 0.00809, "grad_norm": 0.37031009793281555, "grad_norm_var": 0.0015860965038246484, "learning_rate": 5e-05, "loss": 0.1663, "loss/crossentropy": 2.803991198539734, "loss/hidden": 0.0, "loss/logits": 0.16627426072955132, "loss/reg": 1.9298937320709229, "step": 809 }, { "epoch": 0.0081, "grad_norm": 0.36467787623405457, "grad_norm_var": 0.001618743456786816, "learning_rate": 5e-05, "loss": 0.1798, "loss/crossentropy": 2.7522680163383484, "loss/hidden": 0.0, "loss/logits": 0.17983945459127426, "loss/reg": 1.928220272064209, "step": 810 }, { "epoch": 0.00811, "grad_norm": 0.644191324710846, "grad_norm_var": 0.005662418748027209, "learning_rate": 5e-05, "loss": 0.1823, "loss/crossentropy": 2.9207261204719543, "loss/hidden": 0.0, "loss/logits": 0.18232716247439384, "loss/reg": 1.9264674186706543, "step": 811 }, { "epoch": 0.00812, "grad_norm": 0.4135313034057617, "grad_norm_var": 0.00550433789682554, "learning_rate": 5e-05, "loss": 0.1754, "loss/crossentropy": 2.78128319978714, "loss/hidden": 0.0, "loss/logits": 0.17538663744926453, "loss/reg": 1.9246680736541748, "step": 812 }, { "epoch": 0.00813, "grad_norm": 0.44594907760620117, "grad_norm_var": 0.005370096537218135, "learning_rate": 5e-05, "loss": 0.1898, "loss/crossentropy": 2.7069836258888245, "loss/hidden": 0.0, "loss/logits": 0.18982965499162674, "loss/reg": 1.9229440689086914, "step": 813 }, { "epoch": 0.00814, "grad_norm": 0.41460415720939636, "grad_norm_var": 0.005231401879877403, "learning_rate": 5e-05, "loss": 0.1965, "loss/crossentropy": 2.658607244491577, "loss/hidden": 0.0, "loss/logits": 0.1964995227754116, "loss/reg": 1.9211541414260864, "step": 814 }, { "epoch": 0.00815, "grad_norm": 0.40847840905189514, "grad_norm_var": 0.0050977892227572616, "learning_rate": 5e-05, "loss": 0.1865, "loss/crossentropy": 2.789508819580078, "loss/hidden": 0.0, "loss/logits": 0.18652214854955673, "loss/reg": 1.91935396194458, "step": 815 }, { "epoch": 0.00816, "grad_norm": 0.39475539326667786, "grad_norm_var": 0.005094298258556392, "learning_rate": 5e-05, "loss": 0.1876, "loss/crossentropy": 2.771743655204773, "loss/hidden": 0.0, "loss/logits": 0.18758049979805946, "loss/reg": 1.9176177978515625, "step": 816 }, { "epoch": 0.00817, "grad_norm": 0.3788897395133972, "grad_norm_var": 0.004405869730473085, "learning_rate": 5e-05, "loss": 0.1765, "loss/crossentropy": 2.807315766811371, "loss/hidden": 0.0, "loss/logits": 0.17645375058054924, "loss/reg": 1.9157112836837769, "step": 817 }, { "epoch": 0.00818, "grad_norm": 0.3527612090110779, "grad_norm_var": 0.004615691680188668, "learning_rate": 5e-05, "loss": 0.1644, "loss/crossentropy": 2.8354954719543457, "loss/hidden": 0.0, "loss/logits": 0.16440149024128914, "loss/reg": 1.914186716079712, "step": 818 }, { "epoch": 0.00819, "grad_norm": 0.45434367656707764, "grad_norm_var": 0.004640431192599914, "learning_rate": 5e-05, "loss": 0.2004, "loss/crossentropy": 2.7823927998542786, "loss/hidden": 0.0, "loss/logits": 0.20038331300020218, "loss/reg": 1.9128350019454956, "step": 819 }, { "epoch": 0.0082, "grad_norm": 0.4440554082393646, "grad_norm_var": 0.004630750512825665, "learning_rate": 5e-05, "loss": 0.198, "loss/crossentropy": 2.7826399207115173, "loss/hidden": 0.0, "loss/logits": 0.19803617522120476, "loss/reg": 1.9112329483032227, "step": 820 }, { "epoch": 0.00821, "grad_norm": 0.8357638120651245, "grad_norm_var": 0.015646654980531947, "learning_rate": 5e-05, "loss": 0.2236, "loss/crossentropy": 2.855618476867676, "loss/hidden": 0.0, "loss/logits": 0.22355607897043228, "loss/reg": 1.9094618558883667, "step": 821 }, { "epoch": 0.00822, "grad_norm": 0.369484543800354, "grad_norm_var": 0.01582983572015086, "learning_rate": 5e-05, "loss": 0.1688, "loss/crossentropy": 2.7591440081596375, "loss/hidden": 0.0, "loss/logits": 0.16877064853906631, "loss/reg": 1.9078381061553955, "step": 822 }, { "epoch": 0.00823, "grad_norm": 0.3682078421115875, "grad_norm_var": 0.015804289096973827, "learning_rate": 5e-05, "loss": 0.1661, "loss/crossentropy": 2.6714991331100464, "loss/hidden": 0.0, "loss/logits": 0.16609660163521767, "loss/reg": 1.9066871404647827, "step": 823 }, { "epoch": 0.00824, "grad_norm": 0.4925972521305084, "grad_norm_var": 0.015796176553567597, "learning_rate": 5e-05, "loss": 0.1983, "loss/crossentropy": 2.690047025680542, "loss/hidden": 0.0, "loss/logits": 0.19832593947649002, "loss/reg": 1.9051308631896973, "step": 824 }, { "epoch": 0.00825, "grad_norm": 0.4004671275615692, "grad_norm_var": 0.01554450060197241, "learning_rate": 5e-05, "loss": 0.1867, "loss/crossentropy": 3.0430662631988525, "loss/hidden": 0.0, "loss/logits": 0.18668686598539352, "loss/reg": 1.9037506580352783, "step": 825 }, { "epoch": 0.00826, "grad_norm": 0.40644222497940063, "grad_norm_var": 0.01518439463368978, "learning_rate": 5e-05, "loss": 0.1992, "loss/crossentropy": 2.856709599494934, "loss/hidden": 0.0, "loss/logits": 0.19924000278115273, "loss/reg": 1.9035279750823975, "step": 826 }, { "epoch": 0.00827, "grad_norm": 0.387662410736084, "grad_norm_var": 0.012707668169475368, "learning_rate": 5e-05, "loss": 0.171, "loss/crossentropy": 2.8205041885375977, "loss/hidden": 0.0, "loss/logits": 0.17101648449897766, "loss/reg": 1.9019997119903564, "step": 827 }, { "epoch": 0.00828, "grad_norm": 0.4229760766029358, "grad_norm_var": 0.012685578660426963, "learning_rate": 5e-05, "loss": 0.1705, "loss/crossentropy": 2.8298428058624268, "loss/hidden": 0.0, "loss/logits": 0.1704978421330452, "loss/reg": 1.9013198614120483, "step": 828 }, { "epoch": 0.00829, "grad_norm": 0.4192207455635071, "grad_norm_var": 0.012695092968616347, "learning_rate": 5e-05, "loss": 0.1699, "loss/crossentropy": 2.8572763800621033, "loss/hidden": 0.0, "loss/logits": 0.16986168175935745, "loss/reg": 1.9007604122161865, "step": 829 }, { "epoch": 0.0083, "grad_norm": 0.3887600898742676, "grad_norm_var": 0.012805118489640084, "learning_rate": 5e-05, "loss": 0.202, "loss/crossentropy": 2.786255419254303, "loss/hidden": 0.0, "loss/logits": 0.20195355266332626, "loss/reg": 1.9000667333602905, "step": 830 }, { "epoch": 0.00831, "grad_norm": 0.48604434728622437, "grad_norm_var": 0.012929568590754843, "learning_rate": 5e-05, "loss": 0.1961, "loss/crossentropy": 2.845152735710144, "loss/hidden": 0.0, "loss/logits": 0.19609695672988892, "loss/reg": 1.899495005607605, "step": 831 }, { "epoch": 0.00832, "grad_norm": 0.38712021708488464, "grad_norm_var": 0.012976881832390848, "learning_rate": 5e-05, "loss": 0.1889, "loss/crossentropy": 2.7007412910461426, "loss/hidden": 0.0, "loss/logits": 0.1888689175248146, "loss/reg": 1.8983198404312134, "step": 832 }, { "epoch": 0.00833, "grad_norm": 0.3749590814113617, "grad_norm_var": 0.013008393945473115, "learning_rate": 5e-05, "loss": 0.1785, "loss/crossentropy": 2.607687532901764, "loss/hidden": 0.0, "loss/logits": 0.1784559190273285, "loss/reg": 1.897759199142456, "step": 833 }, { "epoch": 0.00834, "grad_norm": 0.35202544927597046, "grad_norm_var": 0.013016684761580717, "learning_rate": 5e-05, "loss": 0.1707, "loss/crossentropy": 2.7777926325798035, "loss/hidden": 0.0, "loss/logits": 0.17071311548352242, "loss/reg": 1.8961632251739502, "step": 834 }, { "epoch": 0.00835, "grad_norm": 0.3441760540008545, "grad_norm_var": 0.013518763280912912, "learning_rate": 5e-05, "loss": 0.1615, "loss/crossentropy": 2.835566759109497, "loss/hidden": 0.0, "loss/logits": 0.1615000143647194, "loss/reg": 1.8950566053390503, "step": 835 }, { "epoch": 0.00836, "grad_norm": 0.362005352973938, "grad_norm_var": 0.013785734718565416, "learning_rate": 5e-05, "loss": 0.1631, "loss/crossentropy": 2.818268299102783, "loss/hidden": 0.0, "loss/logits": 0.16310900822281837, "loss/reg": 1.8935774564743042, "step": 836 }, { "epoch": 0.00837, "grad_norm": 0.3725143074989319, "grad_norm_var": 0.0018186987426010584, "learning_rate": 5e-05, "loss": 0.1929, "loss/crossentropy": 2.763745427131653, "loss/hidden": 0.0, "loss/logits": 0.192863829433918, "loss/reg": 1.8919721841812134, "step": 837 }, { "epoch": 0.00838, "grad_norm": 0.39604651927948, "grad_norm_var": 0.0017691837659245461, "learning_rate": 5e-05, "loss": 0.1847, "loss/crossentropy": 2.729005455970764, "loss/hidden": 0.0, "loss/logits": 0.18469301983714104, "loss/reg": 1.8908017873764038, "step": 838 }, { "epoch": 0.00839, "grad_norm": 0.39175912737846375, "grad_norm_var": 0.001711627371570279, "learning_rate": 5e-05, "loss": 0.1841, "loss/crossentropy": 2.8563897609710693, "loss/hidden": 0.0, "loss/logits": 0.18407713249325752, "loss/reg": 1.8892946243286133, "step": 839 }, { "epoch": 0.0084, "grad_norm": 0.3497207462787628, "grad_norm_var": 0.0012053613127933737, "learning_rate": 5e-05, "loss": 0.1657, "loss/crossentropy": 2.9635773301124573, "loss/hidden": 0.0, "loss/logits": 0.1656595915555954, "loss/reg": 1.8886394500732422, "step": 840 }, { "epoch": 0.00841, "grad_norm": 0.36070436239242554, "grad_norm_var": 0.0012493146014174172, "learning_rate": 5e-05, "loss": 0.1659, "loss/crossentropy": 2.905772030353546, "loss/hidden": 0.0, "loss/logits": 0.16585366800427437, "loss/reg": 1.8883072137832642, "step": 841 }, { "epoch": 0.00842, "grad_norm": 0.5194427371025085, "grad_norm_var": 0.002330769361460483, "learning_rate": 5e-05, "loss": 0.1903, "loss/crossentropy": 2.914414703845978, "loss/hidden": 0.0, "loss/logits": 0.1903173327445984, "loss/reg": 1.8872804641723633, "step": 842 }, { "epoch": 0.00843, "grad_norm": 0.3658028841018677, "grad_norm_var": 0.0023811347132161485, "learning_rate": 5e-05, "loss": 0.1722, "loss/crossentropy": 2.8683270812034607, "loss/hidden": 0.0, "loss/logits": 0.17216329649090767, "loss/reg": 1.8860183954238892, "step": 843 }, { "epoch": 0.00844, "grad_norm": 0.3355120122432709, "grad_norm_var": 0.0025135271396979795, "learning_rate": 5e-05, "loss": 0.166, "loss/crossentropy": 2.7672330141067505, "loss/hidden": 0.0, "loss/logits": 0.16596197709441185, "loss/reg": 1.8848904371261597, "step": 844 }, { "epoch": 0.00845, "grad_norm": 0.45907968282699585, "grad_norm_var": 0.002779472587279837, "learning_rate": 5e-05, "loss": 0.1881, "loss/crossentropy": 2.79125440120697, "loss/hidden": 0.0, "loss/logits": 0.1881270818412304, "loss/reg": 1.8831804990768433, "step": 845 }, { "epoch": 0.00846, "grad_norm": 0.3753393888473511, "grad_norm_var": 0.0027935829770950843, "learning_rate": 5e-05, "loss": 0.1801, "loss/crossentropy": 2.753562033176422, "loss/hidden": 0.0, "loss/logits": 0.18013736233115196, "loss/reg": 1.8817567825317383, "step": 846 }, { "epoch": 0.00847, "grad_norm": 0.41996800899505615, "grad_norm_var": 0.002216029114339835, "learning_rate": 5e-05, "loss": 0.1871, "loss/crossentropy": 2.8630106449127197, "loss/hidden": 0.0, "loss/logits": 0.18711163103580475, "loss/reg": 1.8805315494537354, "step": 847 }, { "epoch": 0.00848, "grad_norm": 0.40139615535736084, "grad_norm_var": 0.0022320677834542836, "learning_rate": 5e-05, "loss": 0.182, "loss/crossentropy": 2.8663435578346252, "loss/hidden": 0.0, "loss/logits": 0.18201814219355583, "loss/reg": 1.8795160055160522, "step": 848 }, { "epoch": 0.00849, "grad_norm": 0.44251078367233276, "grad_norm_var": 0.0024153193390402117, "learning_rate": 5e-05, "loss": 0.1721, "loss/crossentropy": 2.8493316173553467, "loss/hidden": 0.0, "loss/logits": 0.1721041165292263, "loss/reg": 1.8777539730072021, "step": 849 }, { "epoch": 0.0085, "grad_norm": 0.39363330602645874, "grad_norm_var": 0.00231007314672006, "learning_rate": 5e-05, "loss": 0.1933, "loss/crossentropy": 2.8225064277648926, "loss/hidden": 0.0, "loss/logits": 0.19327203929424286, "loss/reg": 1.8762940168380737, "step": 850 }, { "epoch": 0.00851, "grad_norm": 0.3834942579269409, "grad_norm_var": 0.0021502092497398652, "learning_rate": 5e-05, "loss": 0.1919, "loss/crossentropy": 2.7274433970451355, "loss/hidden": 0.0, "loss/logits": 0.19194044917821884, "loss/reg": 1.8752055168151855, "step": 851 }, { "epoch": 0.00852, "grad_norm": 0.36249086260795593, "grad_norm_var": 0.0021480519578248518, "learning_rate": 5e-05, "loss": 0.1759, "loss/crossentropy": 2.725217640399933, "loss/hidden": 0.0, "loss/logits": 0.17594841867685318, "loss/reg": 1.8736830949783325, "step": 852 }, { "epoch": 0.00853, "grad_norm": 0.3869001269340515, "grad_norm_var": 0.002116727725912885, "learning_rate": 5e-05, "loss": 0.1823, "loss/crossentropy": 2.738001227378845, "loss/hidden": 0.0, "loss/logits": 0.1822870336472988, "loss/reg": 1.8728076219558716, "step": 853 }, { "epoch": 0.00854, "grad_norm": 0.3708244860172272, "grad_norm_var": 0.002157970353941248, "learning_rate": 5e-05, "loss": 0.1668, "loss/crossentropy": 2.788993239402771, "loss/hidden": 0.0, "loss/logits": 0.16682880371809006, "loss/reg": 1.8720353841781616, "step": 854 }, { "epoch": 0.00855, "grad_norm": 0.372335821390152, "grad_norm_var": 0.002189712517136307, "learning_rate": 5e-05, "loss": 0.1718, "loss/crossentropy": 2.813368082046509, "loss/hidden": 0.0, "loss/logits": 0.17184938862919807, "loss/reg": 1.8711519241333008, "step": 855 }, { "epoch": 0.00856, "grad_norm": 0.35767483711242676, "grad_norm_var": 0.0021470276840197164, "learning_rate": 5e-05, "loss": 0.175, "loss/crossentropy": 2.779674708843231, "loss/hidden": 0.0, "loss/logits": 0.17495984584093094, "loss/reg": 1.869391679763794, "step": 856 }, { "epoch": 0.00857, "grad_norm": 0.3517981767654419, "grad_norm_var": 0.002191754274186038, "learning_rate": 5e-05, "loss": 0.1688, "loss/crossentropy": 2.7776423692703247, "loss/hidden": 0.0, "loss/logits": 0.16876182705163956, "loss/reg": 1.868484616279602, "step": 857 }, { "epoch": 0.00858, "grad_norm": 0.8127824664115906, "grad_norm_var": 0.012490247842596114, "learning_rate": 5e-05, "loss": 0.2224, "loss/crossentropy": 2.9876713156700134, "loss/hidden": 0.0, "loss/logits": 0.2223958522081375, "loss/reg": 1.8681334257125854, "step": 858 }, { "epoch": 0.00859, "grad_norm": 0.4339921474456787, "grad_norm_var": 0.012361098720851383, "learning_rate": 5e-05, "loss": 0.1752, "loss/crossentropy": 2.8306267857551575, "loss/hidden": 0.0, "loss/logits": 0.17517483979463577, "loss/reg": 1.8669747114181519, "step": 859 }, { "epoch": 0.0086, "grad_norm": 0.5807726383209229, "grad_norm_var": 0.013480947234538975, "learning_rate": 5e-05, "loss": 0.1787, "loss/crossentropy": 2.731403112411499, "loss/hidden": 0.0, "loss/logits": 0.17869474738836288, "loss/reg": 1.865167498588562, "step": 860 }, { "epoch": 0.00861, "grad_norm": 0.377247154712677, "grad_norm_var": 0.013599237642109623, "learning_rate": 5e-05, "loss": 0.1714, "loss/crossentropy": 2.8269473910331726, "loss/hidden": 0.0, "loss/logits": 0.1713988333940506, "loss/reg": 1.8640793561935425, "step": 861 }, { "epoch": 0.00862, "grad_norm": 0.37849879264831543, "grad_norm_var": 0.013578332002263205, "learning_rate": 5e-05, "loss": 0.169, "loss/crossentropy": 2.8433790802955627, "loss/hidden": 0.0, "loss/logits": 0.16895778477191925, "loss/reg": 1.8629158735275269, "step": 862 }, { "epoch": 0.00863, "grad_norm": 0.4124751091003418, "grad_norm_var": 0.013588511645486826, "learning_rate": 5e-05, "loss": 0.1803, "loss/crossentropy": 2.7584823966026306, "loss/hidden": 0.0, "loss/logits": 0.18026690557599068, "loss/reg": 1.8609492778778076, "step": 863 }, { "epoch": 0.00864, "grad_norm": 0.37336814403533936, "grad_norm_var": 0.013730216300815038, "learning_rate": 5e-05, "loss": 0.1707, "loss/crossentropy": 2.7589592933654785, "loss/hidden": 0.0, "loss/logits": 0.17070752009749413, "loss/reg": 1.8597759008407593, "step": 864 }, { "epoch": 0.00865, "grad_norm": 0.8337989449501038, "grad_norm_var": 0.02424293784323857, "learning_rate": 5e-05, "loss": 0.1781, "loss/crossentropy": 2.8036633133888245, "loss/hidden": 0.0, "loss/logits": 0.17813356593251228, "loss/reg": 1.8581993579864502, "step": 865 }, { "epoch": 0.00866, "grad_norm": 0.35601744055747986, "grad_norm_var": 0.02460846166740538, "learning_rate": 5e-05, "loss": 0.1759, "loss/crossentropy": 2.715932846069336, "loss/hidden": 0.0, "loss/logits": 0.17593519389629364, "loss/reg": 1.8565593957901, "step": 866 }, { "epoch": 0.00867, "grad_norm": 0.425502747297287, "grad_norm_var": 0.02436568774250706, "learning_rate": 5e-05, "loss": 0.1821, "loss/crossentropy": 2.7275202870368958, "loss/hidden": 0.0, "loss/logits": 0.18210354447364807, "loss/reg": 1.8554532527923584, "step": 867 }, { "epoch": 0.00868, "grad_norm": 0.3844553232192993, "grad_norm_var": 0.024142035969486622, "learning_rate": 5e-05, "loss": 0.1796, "loss/crossentropy": 2.8464134335517883, "loss/hidden": 0.0, "loss/logits": 0.1795903705060482, "loss/reg": 1.855208396911621, "step": 868 }, { "epoch": 0.00869, "grad_norm": 0.35618311166763306, "grad_norm_var": 0.02446160042257303, "learning_rate": 5e-05, "loss": 0.1758, "loss/crossentropy": 2.87707781791687, "loss/hidden": 0.0, "loss/logits": 0.17583919316530228, "loss/reg": 1.8549201488494873, "step": 869 }, { "epoch": 0.0087, "grad_norm": 0.41672077775001526, "grad_norm_var": 0.024117258377413187, "learning_rate": 5e-05, "loss": 0.1759, "loss/crossentropy": 2.71548855304718, "loss/hidden": 0.0, "loss/logits": 0.1758672632277012, "loss/reg": 1.8541263341903687, "step": 870 }, { "epoch": 0.00871, "grad_norm": 0.3646162450313568, "grad_norm_var": 0.024202440513241764, "learning_rate": 5e-05, "loss": 0.186, "loss/crossentropy": 2.786548674106598, "loss/hidden": 0.0, "loss/logits": 0.18596061319112778, "loss/reg": 1.8524823188781738, "step": 871 }, { "epoch": 0.00872, "grad_norm": 0.37939974665641785, "grad_norm_var": 0.023961625350842352, "learning_rate": 5e-05, "loss": 0.1748, "loss/crossentropy": 2.835165321826935, "loss/hidden": 0.0, "loss/logits": 0.17476912215352058, "loss/reg": 1.8506335020065308, "step": 872 }, { "epoch": 0.00873, "grad_norm": 0.3887036442756653, "grad_norm_var": 0.023551954015331347, "learning_rate": 5e-05, "loss": 0.1789, "loss/crossentropy": 2.732525408267975, "loss/hidden": 0.0, "loss/logits": 0.17892278358340263, "loss/reg": 1.849176287651062, "step": 873 }, { "epoch": 0.00874, "grad_norm": 0.387320876121521, "grad_norm_var": 0.014549813961372993, "learning_rate": 5e-05, "loss": 0.1941, "loss/crossentropy": 2.792769968509674, "loss/hidden": 0.0, "loss/logits": 0.19409611076116562, "loss/reg": 1.848083734512329, "step": 874 }, { "epoch": 0.00875, "grad_norm": 0.3818178176879883, "grad_norm_var": 0.014678730624869341, "learning_rate": 5e-05, "loss": 0.1865, "loss/crossentropy": 2.762765884399414, "loss/hidden": 0.0, "loss/logits": 0.18646146729588509, "loss/reg": 1.8465189933776855, "step": 875 }, { "epoch": 0.00876, "grad_norm": 0.36014363169670105, "grad_norm_var": 0.013132955726787365, "learning_rate": 5e-05, "loss": 0.1847, "loss/crossentropy": 2.8113619089126587, "loss/hidden": 0.0, "loss/logits": 0.1847200095653534, "loss/reg": 1.8454153537750244, "step": 876 }, { "epoch": 0.00877, "grad_norm": 0.3916279077529907, "grad_norm_var": 0.013081129963419124, "learning_rate": 5e-05, "loss": 0.181, "loss/crossentropy": 2.767706513404846, "loss/hidden": 0.0, "loss/logits": 0.18097594752907753, "loss/reg": 1.8445773124694824, "step": 877 }, { "epoch": 0.00878, "grad_norm": 0.38396528363227844, "grad_norm_var": 0.013058641234249413, "learning_rate": 5e-05, "loss": 0.1799, "loss/crossentropy": 2.750400483608246, "loss/hidden": 0.0, "loss/logits": 0.179927259683609, "loss/reg": 1.8441030979156494, "step": 878 }, { "epoch": 0.00879, "grad_norm": 0.3784838020801544, "grad_norm_var": 0.013129867131250705, "learning_rate": 5e-05, "loss": 0.1783, "loss/crossentropy": 2.6576608419418335, "loss/hidden": 0.0, "loss/logits": 0.1783200539648533, "loss/reg": 1.842759370803833, "step": 879 }, { "epoch": 0.0088, "grad_norm": 0.3373940885066986, "grad_norm_var": 0.013387093786405535, "learning_rate": 5e-05, "loss": 0.1665, "loss/crossentropy": 2.7494055032730103, "loss/hidden": 0.0, "loss/logits": 0.16646360978484154, "loss/reg": 1.8418775796890259, "step": 880 }, { "epoch": 0.00881, "grad_norm": 0.3743676543235779, "grad_norm_var": 0.000488954453456858, "learning_rate": 5e-05, "loss": 0.1718, "loss/crossentropy": 2.7373632192611694, "loss/hidden": 0.0, "loss/logits": 0.17177791520953178, "loss/reg": 1.8405792713165283, "step": 881 }, { "epoch": 0.00882, "grad_norm": 0.36398422718048096, "grad_norm_var": 0.0004683277690547882, "learning_rate": 5e-05, "loss": 0.1713, "loss/crossentropy": 2.689941644668579, "loss/hidden": 0.0, "loss/logits": 0.1712586209177971, "loss/reg": 1.8396137952804565, "step": 882 }, { "epoch": 0.00883, "grad_norm": 0.36932700872421265, "grad_norm_var": 0.0003222525763987627, "learning_rate": 5e-05, "loss": 0.1738, "loss/crossentropy": 3.0094715356826782, "loss/hidden": 0.0, "loss/logits": 0.17375321686267853, "loss/reg": 1.8387012481689453, "step": 883 }, { "epoch": 0.00884, "grad_norm": 0.37647050619125366, "grad_norm_var": 0.0003174026053568609, "learning_rate": 5e-05, "loss": 0.1734, "loss/crossentropy": 2.675420820713043, "loss/hidden": 0.0, "loss/logits": 0.17337032034993172, "loss/reg": 1.8372994661331177, "step": 884 }, { "epoch": 0.00885, "grad_norm": 0.3657122850418091, "grad_norm_var": 0.0002983341146215642, "learning_rate": 5e-05, "loss": 0.175, "loss/crossentropy": 2.785289704799652, "loss/hidden": 0.0, "loss/logits": 0.1750231385231018, "loss/reg": 1.836666464805603, "step": 885 }, { "epoch": 0.00886, "grad_norm": 0.3565351963043213, "grad_norm_var": 0.00019998832643003023, "learning_rate": 5e-05, "loss": 0.1611, "loss/crossentropy": 2.67407763004303, "loss/hidden": 0.0, "loss/logits": 0.16111686453223228, "loss/reg": 1.8361395597457886, "step": 886 }, { "epoch": 0.00887, "grad_norm": 0.38317063450813293, "grad_norm_var": 0.00020202125672466782, "learning_rate": 5e-05, "loss": 0.1793, "loss/crossentropy": 2.7103012204170227, "loss/hidden": 0.0, "loss/logits": 0.17928585410118103, "loss/reg": 1.8344452381134033, "step": 887 }, { "epoch": 0.00888, "grad_norm": 0.39307963848114014, "grad_norm_var": 0.00022420215532017082, "learning_rate": 5e-05, "loss": 0.1899, "loss/crossentropy": 2.7159000635147095, "loss/hidden": 0.0, "loss/logits": 0.18990719318389893, "loss/reg": 1.8328790664672852, "step": 888 }, { "epoch": 0.00889, "grad_norm": 0.35862287878990173, "grad_norm_var": 0.00022381402201028245, "learning_rate": 5e-05, "loss": 0.1834, "loss/crossentropy": 2.847196877002716, "loss/hidden": 0.0, "loss/logits": 0.18342823907732964, "loss/reg": 1.8318103551864624, "step": 889 }, { "epoch": 0.0089, "grad_norm": 0.3539126515388489, "grad_norm_var": 0.0002281156381275314, "learning_rate": 5e-05, "loss": 0.1777, "loss/crossentropy": 2.853213608264923, "loss/hidden": 0.0, "loss/logits": 0.1777319796383381, "loss/reg": 1.829829454421997, "step": 890 }, { "epoch": 0.00891, "grad_norm": 0.41561535000801086, "grad_norm_var": 0.000350336348254295, "learning_rate": 5e-05, "loss": 0.1895, "loss/crossentropy": 2.6837574243545532, "loss/hidden": 0.0, "loss/logits": 0.18954132869839668, "loss/reg": 1.8281813859939575, "step": 891 }, { "epoch": 0.00892, "grad_norm": 0.3593007028102875, "grad_norm_var": 0.00035178644306217054, "learning_rate": 5e-05, "loss": 0.1639, "loss/crossentropy": 2.7398064136505127, "loss/hidden": 0.0, "loss/logits": 0.163859985768795, "loss/reg": 1.8272664546966553, "step": 892 }, { "epoch": 0.00893, "grad_norm": 0.3928586542606354, "grad_norm_var": 0.00035500389449958367, "learning_rate": 5e-05, "loss": 0.187, "loss/crossentropy": 2.7214816212654114, "loss/hidden": 0.0, "loss/logits": 0.18696707114577293, "loss/reg": 1.8260765075683594, "step": 893 }, { "epoch": 0.00894, "grad_norm": 0.38060957193374634, "grad_norm_var": 0.00035065611870696014, "learning_rate": 5e-05, "loss": 0.1816, "loss/crossentropy": 2.8307188153266907, "loss/hidden": 0.0, "loss/logits": 0.18159236386418343, "loss/reg": 1.8246755599975586, "step": 894 }, { "epoch": 0.00895, "grad_norm": 0.34957438707351685, "grad_norm_var": 0.0003796919232549693, "learning_rate": 5e-05, "loss": 0.1801, "loss/crossentropy": 2.7387137413024902, "loss/hidden": 0.0, "loss/logits": 0.18012140691280365, "loss/reg": 1.8235573768615723, "step": 895 }, { "epoch": 0.00896, "grad_norm": 0.3666534721851349, "grad_norm_var": 0.00030342620785123544, "learning_rate": 5e-05, "loss": 0.2036, "loss/crossentropy": 2.758805215358734, "loss/hidden": 0.0, "loss/logits": 0.20357270538806915, "loss/reg": 1.8219776153564453, "step": 896 }, { "epoch": 0.00897, "grad_norm": 0.5404136180877686, "grad_norm_var": 0.0020682628614343557, "learning_rate": 5e-05, "loss": 0.1933, "loss/crossentropy": 2.970340132713318, "loss/hidden": 0.0, "loss/logits": 0.19332898035645485, "loss/reg": 1.8206608295440674, "step": 897 }, { "epoch": 0.00898, "grad_norm": 0.3982648551464081, "grad_norm_var": 0.0020554109287465, "learning_rate": 5e-05, "loss": 0.1768, "loss/crossentropy": 2.6868785619735718, "loss/hidden": 0.0, "loss/logits": 0.1768476814031601, "loss/reg": 1.8192402124404907, "step": 898 }, { "epoch": 0.00899, "grad_norm": 0.38741791248321533, "grad_norm_var": 0.002038042531465663, "learning_rate": 5e-05, "loss": 0.1808, "loss/crossentropy": 2.816374719142914, "loss/hidden": 0.0, "loss/logits": 0.18077386170625687, "loss/reg": 1.8179749250411987, "step": 899 }, { "epoch": 0.009, "grad_norm": 0.3847026526927948, "grad_norm_var": 0.0020316665201686695, "learning_rate": 5e-05, "loss": 0.1982, "loss/crossentropy": 2.8102923035621643, "loss/hidden": 0.0, "loss/logits": 0.19821672514081, "loss/reg": 1.8163453340530396, "step": 900 }, { "epoch": 0.00901, "grad_norm": 0.3916863203048706, "grad_norm_var": 0.0020013109603508254, "learning_rate": 5e-05, "loss": 0.1778, "loss/crossentropy": 2.883694589138031, "loss/hidden": 0.0, "loss/logits": 0.17775952070951462, "loss/reg": 1.8147519826889038, "step": 901 }, { "epoch": 0.00902, "grad_norm": 0.36217865347862244, "grad_norm_var": 0.001979417665481912, "learning_rate": 5e-05, "loss": 0.1669, "loss/crossentropy": 2.8100743293762207, "loss/hidden": 0.0, "loss/logits": 0.1668529324233532, "loss/reg": 1.8133151531219482, "step": 902 }, { "epoch": 0.00903, "grad_norm": 0.42102572321891785, "grad_norm_var": 0.00204143104015622, "learning_rate": 5e-05, "loss": 0.181, "loss/crossentropy": 2.8006924986839294, "loss/hidden": 0.0, "loss/logits": 0.18102795258164406, "loss/reg": 1.8111281394958496, "step": 903 }, { "epoch": 0.00904, "grad_norm": 0.388874351978302, "grad_norm_var": 0.002041367346731493, "learning_rate": 5e-05, "loss": 0.1818, "loss/crossentropy": 2.8004772067070007, "loss/hidden": 0.0, "loss/logits": 0.18177441507577896, "loss/reg": 1.8097094297409058, "step": 904 }, { "epoch": 0.00905, "grad_norm": 0.3800460696220398, "grad_norm_var": 0.0019783346485648203, "learning_rate": 5e-05, "loss": 0.176, "loss/crossentropy": 2.693452537059784, "loss/hidden": 0.0, "loss/logits": 0.1760004386305809, "loss/reg": 1.8086622953414917, "step": 905 }, { "epoch": 0.00906, "grad_norm": 0.32960033416748047, "grad_norm_var": 0.0021389732007735863, "learning_rate": 5e-05, "loss": 0.1641, "loss/crossentropy": 2.7406028509140015, "loss/hidden": 0.0, "loss/logits": 0.1641414910554886, "loss/reg": 1.8070775270462036, "step": 906 }, { "epoch": 0.00907, "grad_norm": 0.41262900829315186, "grad_norm_var": 0.002129550660359725, "learning_rate": 5e-05, "loss": 0.1639, "loss/crossentropy": 2.815558433532715, "loss/hidden": 0.0, "loss/logits": 0.1639426201581955, "loss/reg": 1.80557119846344, "step": 907 }, { "epoch": 0.00908, "grad_norm": 0.34168651700019836, "grad_norm_var": 0.0022218976438497353, "learning_rate": 5e-05, "loss": 0.1689, "loss/crossentropy": 2.7727773189544678, "loss/hidden": 0.0, "loss/logits": 0.16886158660054207, "loss/reg": 1.8040578365325928, "step": 908 }, { "epoch": 0.00909, "grad_norm": 0.3481311500072479, "grad_norm_var": 0.0023254939668475396, "learning_rate": 5e-05, "loss": 0.1723, "loss/crossentropy": 2.7662405967712402, "loss/hidden": 0.0, "loss/logits": 0.17232706770300865, "loss/reg": 1.802317500114441, "step": 909 }, { "epoch": 0.0091, "grad_norm": 0.34673434495925903, "grad_norm_var": 0.0024236772610501315, "learning_rate": 5e-05, "loss": 0.1721, "loss/crossentropy": 2.7547109723091125, "loss/hidden": 0.0, "loss/logits": 0.17211218550801277, "loss/reg": 1.8002270460128784, "step": 910 }, { "epoch": 0.00911, "grad_norm": 0.358116090297699, "grad_norm_var": 0.002388630196925971, "learning_rate": 5e-05, "loss": 0.1778, "loss/crossentropy": 2.8174885511398315, "loss/hidden": 0.0, "loss/logits": 0.17777465283870697, "loss/reg": 1.7986141443252563, "step": 911 }, { "epoch": 0.00912, "grad_norm": 0.37328216433525085, "grad_norm_var": 0.0023752628687049343, "learning_rate": 5e-05, "loss": 0.1846, "loss/crossentropy": 2.7423484921455383, "loss/hidden": 0.0, "loss/logits": 0.18462468683719635, "loss/reg": 1.7969509363174438, "step": 912 }, { "epoch": 0.00913, "grad_norm": 0.4073435366153717, "grad_norm_var": 0.000729848525378903, "learning_rate": 5e-05, "loss": 0.1656, "loss/crossentropy": 2.6962223649024963, "loss/hidden": 0.0, "loss/logits": 0.16563431546092033, "loss/reg": 1.7949342727661133, "step": 913 }, { "epoch": 0.00914, "grad_norm": 0.37720003724098206, "grad_norm_var": 0.0006978068548327905, "learning_rate": 5e-05, "loss": 0.1853, "loss/crossentropy": 2.710608184337616, "loss/hidden": 0.0, "loss/logits": 0.18529681861400604, "loss/reg": 1.7920844554901123, "step": 914 }, { "epoch": 0.00915, "grad_norm": 0.36010050773620605, "grad_norm_var": 0.0007016424011595597, "learning_rate": 5e-05, "loss": 0.1833, "loss/crossentropy": 2.8862733840942383, "loss/hidden": 0.0, "loss/logits": 0.1833462007343769, "loss/reg": 1.7895822525024414, "step": 915 }, { "epoch": 0.00916, "grad_norm": 0.35757148265838623, "grad_norm_var": 0.0007087821480995478, "learning_rate": 5e-05, "loss": 0.1741, "loss/crossentropy": 2.736583173274994, "loss/hidden": 0.0, "loss/logits": 0.1740923710167408, "loss/reg": 1.7870407104492188, "step": 916 }, { "epoch": 0.00917, "grad_norm": 0.38085147738456726, "grad_norm_var": 0.0006880592910958772, "learning_rate": 5e-05, "loss": 0.1758, "loss/crossentropy": 2.722678780555725, "loss/hidden": 0.0, "loss/logits": 0.17577889189124107, "loss/reg": 1.78484308719635, "step": 917 }, { "epoch": 0.00918, "grad_norm": 0.3618144989013672, "grad_norm_var": 0.000688524329092799, "learning_rate": 5e-05, "loss": 0.1729, "loss/crossentropy": 2.7107303738594055, "loss/hidden": 0.0, "loss/logits": 0.17288268730044365, "loss/reg": 1.781936764717102, "step": 918 }, { "epoch": 0.00919, "grad_norm": 0.4562152326107025, "grad_norm_var": 0.000997994245971834, "learning_rate": 5e-05, "loss": 0.209, "loss/crossentropy": 2.6911511421203613, "loss/hidden": 0.0, "loss/logits": 0.20904593169689178, "loss/reg": 1.7793291807174683, "step": 919 }, { "epoch": 0.0092, "grad_norm": 0.7892553806304932, "grad_norm_var": 0.011823798595284762, "learning_rate": 5e-05, "loss": 0.2098, "loss/crossentropy": 2.8024474382400513, "loss/hidden": 0.0, "loss/logits": 0.20984026044607162, "loss/reg": 1.7767542600631714, "step": 920 }, { "epoch": 0.00921, "grad_norm": 0.4952187240123749, "grad_norm_var": 0.012365066103201613, "learning_rate": 5e-05, "loss": 0.2215, "loss/crossentropy": 2.836692988872528, "loss/hidden": 0.0, "loss/logits": 0.22146976366639137, "loss/reg": 1.77475905418396, "step": 921 }, { "epoch": 0.00922, "grad_norm": 0.4500190317630768, "grad_norm_var": 0.01204494814009713, "learning_rate": 5e-05, "loss": 0.1957, "loss/crossentropy": 2.7740437984466553, "loss/hidden": 0.0, "loss/logits": 0.1957194283604622, "loss/reg": 1.7725263833999634, "step": 922 }, { "epoch": 0.00923, "grad_norm": 0.4018624424934387, "grad_norm_var": 0.012053458598524087, "learning_rate": 5e-05, "loss": 0.1985, "loss/crossentropy": 2.783832609653473, "loss/hidden": 0.0, "loss/logits": 0.19849245250225067, "loss/reg": 1.7706068754196167, "step": 923 }, { "epoch": 0.00924, "grad_norm": 0.4053579866886139, "grad_norm_var": 0.01170279735803306, "learning_rate": 5e-05, "loss": 0.1847, "loss/crossentropy": 2.8584959506988525, "loss/hidden": 0.0, "loss/logits": 0.1846720688045025, "loss/reg": 1.7687995433807373, "step": 924 }, { "epoch": 0.00925, "grad_norm": 0.4355542063713074, "grad_norm_var": 0.011379840002585932, "learning_rate": 5e-05, "loss": 0.1911, "loss/crossentropy": 2.8072018027305603, "loss/hidden": 0.0, "loss/logits": 0.19105902686715126, "loss/reg": 1.766356348991394, "step": 925 }, { "epoch": 0.00926, "grad_norm": 0.41985246539115906, "grad_norm_var": 0.010977469936536095, "learning_rate": 5e-05, "loss": 0.1772, "loss/crossentropy": 2.825522303581238, "loss/hidden": 0.0, "loss/logits": 0.17719140276312828, "loss/reg": 1.764008641242981, "step": 926 }, { "epoch": 0.00927, "grad_norm": 0.4020366370677948, "grad_norm_var": 0.010695516965112247, "learning_rate": 5e-05, "loss": 0.168, "loss/crossentropy": 2.946666181087494, "loss/hidden": 0.0, "loss/logits": 0.16798604279756546, "loss/reg": 1.7619134187698364, "step": 927 }, { "epoch": 0.00928, "grad_norm": 0.4333237111568451, "grad_norm_var": 0.01047000612816995, "learning_rate": 5e-05, "loss": 0.1732, "loss/crossentropy": 2.904057264328003, "loss/hidden": 0.0, "loss/logits": 0.1731964722275734, "loss/reg": 1.7594822645187378, "step": 928 }, { "epoch": 0.00929, "grad_norm": 0.44914835691452026, "grad_norm_var": 0.010434282299518182, "learning_rate": 5e-05, "loss": 0.1822, "loss/crossentropy": 2.8881112933158875, "loss/hidden": 0.0, "loss/logits": 0.1821521669626236, "loss/reg": 1.757930874824524, "step": 929 }, { "epoch": 0.0093, "grad_norm": 0.8063428401947021, "grad_norm_var": 0.018582235883409348, "learning_rate": 5e-05, "loss": 0.2108, "loss/crossentropy": 3.000428855419159, "loss/hidden": 0.0, "loss/logits": 0.21083774790167809, "loss/reg": 1.756495714187622, "step": 930 }, { "epoch": 0.00931, "grad_norm": 0.37262633442878723, "grad_norm_var": 0.018420551139004416, "learning_rate": 5e-05, "loss": 0.1727, "loss/crossentropy": 2.899652659893036, "loss/hidden": 0.0, "loss/logits": 0.1727372407913208, "loss/reg": 1.755653977394104, "step": 931 }, { "epoch": 0.00932, "grad_norm": 0.44574448466300964, "grad_norm_var": 0.017660345874116586, "learning_rate": 5e-05, "loss": 0.1771, "loss/crossentropy": 2.7930703163146973, "loss/hidden": 0.0, "loss/logits": 0.17705539613962173, "loss/reg": 1.7541528940200806, "step": 932 }, { "epoch": 0.00933, "grad_norm": 0.4381559193134308, "grad_norm_var": 0.017191491981390843, "learning_rate": 5e-05, "loss": 0.1928, "loss/crossentropy": 3.009516716003418, "loss/hidden": 0.0, "loss/logits": 0.19276633486151695, "loss/reg": 1.7523828744888306, "step": 933 }, { "epoch": 0.00934, "grad_norm": 0.37119948863983154, "grad_norm_var": 0.01705829482260827, "learning_rate": 5e-05, "loss": 0.1784, "loss/crossentropy": 2.7011741995811462, "loss/hidden": 0.0, "loss/logits": 0.17841476574540138, "loss/reg": 1.750117540359497, "step": 934 }, { "epoch": 0.00935, "grad_norm": 0.38776126503944397, "grad_norm_var": 0.017506596591750172, "learning_rate": 5e-05, "loss": 0.1765, "loss/crossentropy": 2.755903959274292, "loss/hidden": 0.0, "loss/logits": 0.17649077624082565, "loss/reg": 1.7485483884811401, "step": 935 }, { "epoch": 0.00936, "grad_norm": 0.3909890949726105, "grad_norm_var": 0.010412048008983836, "learning_rate": 5e-05, "loss": 0.1718, "loss/crossentropy": 2.7981409430503845, "loss/hidden": 0.0, "loss/logits": 0.1718210205435753, "loss/reg": 1.7470717430114746, "step": 936 }, { "epoch": 0.00937, "grad_norm": 0.4047463536262512, "grad_norm_var": 0.010306674977021387, "learning_rate": 5e-05, "loss": 0.1944, "loss/crossentropy": 2.653374135494232, "loss/hidden": 0.0, "loss/logits": 0.1944383941590786, "loss/reg": 1.7457630634307861, "step": 937 }, { "epoch": 0.00938, "grad_norm": 0.4641042947769165, "grad_norm_var": 0.010340857957444612, "learning_rate": 5e-05, "loss": 0.1933, "loss/crossentropy": 3.054188370704651, "loss/hidden": 0.0, "loss/logits": 0.19333792477846146, "loss/reg": 1.7441859245300293, "step": 938 }, { "epoch": 0.00939, "grad_norm": 0.463642418384552, "grad_norm_var": 0.01027101724812616, "learning_rate": 5e-05, "loss": 0.1906, "loss/crossentropy": 2.7980846166610718, "loss/hidden": 0.0, "loss/logits": 0.19064636901021004, "loss/reg": 1.7424683570861816, "step": 939 }, { "epoch": 0.0094, "grad_norm": 0.3763858675956726, "grad_norm_var": 0.010469512228889604, "learning_rate": 5e-05, "loss": 0.1817, "loss/crossentropy": 2.8554503321647644, "loss/hidden": 0.0, "loss/logits": 0.1817203275859356, "loss/reg": 1.7405589818954468, "step": 940 }, { "epoch": 0.00941, "grad_norm": 0.41792160272598267, "grad_norm_var": 0.010502572032980106, "learning_rate": 5e-05, "loss": 0.2081, "loss/crossentropy": 2.7464479207992554, "loss/hidden": 0.0, "loss/logits": 0.20806986466050148, "loss/reg": 1.7390387058258057, "step": 941 }, { "epoch": 0.00942, "grad_norm": 0.405609130859375, "grad_norm_var": 0.010553986517832181, "learning_rate": 5e-05, "loss": 0.1923, "loss/crossentropy": 2.9190812706947327, "loss/hidden": 0.0, "loss/logits": 0.1922554075717926, "loss/reg": 1.737121343612671, "step": 942 }, { "epoch": 0.00943, "grad_norm": 0.5186859369277954, "grad_norm_var": 0.010823950074372254, "learning_rate": 5e-05, "loss": 0.1892, "loss/crossentropy": 2.846211016178131, "loss/hidden": 0.0, "loss/logits": 0.1892014741897583, "loss/reg": 1.7357187271118164, "step": 943 }, { "epoch": 0.00944, "grad_norm": 0.3441806137561798, "grad_norm_var": 0.011478989118617007, "learning_rate": 5e-05, "loss": 0.1701, "loss/crossentropy": 2.8220054507255554, "loss/hidden": 0.0, "loss/logits": 0.1700589321553707, "loss/reg": 1.733402967453003, "step": 944 }, { "epoch": 0.00945, "grad_norm": 0.37400367856025696, "grad_norm_var": 0.011751047533430632, "learning_rate": 5e-05, "loss": 0.179, "loss/crossentropy": 2.7692030668258667, "loss/hidden": 0.0, "loss/logits": 0.17900892347097397, "loss/reg": 1.7316250801086426, "step": 945 }, { "epoch": 0.00946, "grad_norm": 0.4089336395263672, "grad_norm_var": 0.0020184395330867097, "learning_rate": 5e-05, "loss": 0.1982, "loss/crossentropy": 2.891884744167328, "loss/hidden": 0.0, "loss/logits": 0.19816706702113152, "loss/reg": 1.7298880815505981, "step": 946 }, { "epoch": 0.00947, "grad_norm": 0.36752966046333313, "grad_norm_var": 0.002046509202798789, "learning_rate": 5e-05, "loss": 0.1843, "loss/crossentropy": 2.858055591583252, "loss/hidden": 0.0, "loss/logits": 0.18428384885191917, "loss/reg": 1.7284300327301025, "step": 947 }, { "epoch": 0.00948, "grad_norm": 0.36644455790519714, "grad_norm_var": 0.002074549092487384, "learning_rate": 5e-05, "loss": 0.1714, "loss/crossentropy": 2.848255932331085, "loss/hidden": 0.0, "loss/logits": 0.17144014686346054, "loss/reg": 1.7270002365112305, "step": 948 }, { "epoch": 0.00949, "grad_norm": 0.5910805463790894, "grad_norm_var": 0.004186356490923812, "learning_rate": 5e-05, "loss": 0.23, "loss/crossentropy": 2.7590108513832092, "loss/hidden": 0.0, "loss/logits": 0.22996008396148682, "loss/reg": 1.7256983518600464, "step": 949 }, { "epoch": 0.0095, "grad_norm": 0.35803112387657166, "grad_norm_var": 0.00427554901524122, "learning_rate": 5e-05, "loss": 0.1684, "loss/crossentropy": 2.7760064005851746, "loss/hidden": 0.0, "loss/logits": 0.16840620338916779, "loss/reg": 1.723679542541504, "step": 950 }, { "epoch": 0.00951, "grad_norm": 0.412681519985199, "grad_norm_var": 0.004223846207916952, "learning_rate": 5e-05, "loss": 0.2018, "loss/crossentropy": 2.9045740365982056, "loss/hidden": 0.0, "loss/logits": 0.20182525366544724, "loss/reg": 1.72231125831604, "step": 951 }, { "epoch": 0.00952, "grad_norm": 0.4021626114845276, "grad_norm_var": 0.004193552649382352, "learning_rate": 5e-05, "loss": 0.1848, "loss/crossentropy": 2.6521793007850647, "loss/hidden": 0.0, "loss/logits": 0.1848319098353386, "loss/reg": 1.720641851425171, "step": 952 }, { "epoch": 0.00953, "grad_norm": 0.3750251233577728, "grad_norm_var": 0.00429834750938114, "learning_rate": 5e-05, "loss": 0.1791, "loss/crossentropy": 2.7560397386550903, "loss/hidden": 0.0, "loss/logits": 0.17908834293484688, "loss/reg": 1.7182477712631226, "step": 953 }, { "epoch": 0.00954, "grad_norm": 0.5893900990486145, "grad_norm_var": 0.006092951728716223, "learning_rate": 5e-05, "loss": 0.2129, "loss/crossentropy": 2.835801601409912, "loss/hidden": 0.0, "loss/logits": 0.21293479949235916, "loss/reg": 1.716722011566162, "step": 954 }, { "epoch": 0.00955, "grad_norm": 0.40877264738082886, "grad_norm_var": 0.005985476808116985, "learning_rate": 5e-05, "loss": 0.1938, "loss/crossentropy": 2.689119517803192, "loss/hidden": 0.0, "loss/logits": 0.19383220747113228, "loss/reg": 1.714568853378296, "step": 955 }, { "epoch": 0.00956, "grad_norm": 0.38810843229293823, "grad_norm_var": 0.005926205055061354, "learning_rate": 5e-05, "loss": 0.1705, "loss/crossentropy": 2.948507070541382, "loss/hidden": 0.0, "loss/logits": 0.1705201156437397, "loss/reg": 1.7119203805923462, "step": 956 }, { "epoch": 0.00957, "grad_norm": 0.4206679165363312, "grad_norm_var": 0.005925719462670757, "learning_rate": 5e-05, "loss": 0.1807, "loss/crossentropy": 2.78257417678833, "loss/hidden": 0.0, "loss/logits": 0.18074193224310875, "loss/reg": 1.7095727920532227, "step": 957 }, { "epoch": 0.00958, "grad_norm": 0.3933105766773224, "grad_norm_var": 0.005959929971731507, "learning_rate": 5e-05, "loss": 0.2045, "loss/crossentropy": 2.7225964665412903, "loss/hidden": 0.0, "loss/logits": 0.2044883407652378, "loss/reg": 1.707101583480835, "step": 958 }, { "epoch": 0.00959, "grad_norm": 0.3582659661769867, "grad_norm_var": 0.005456189170996354, "learning_rate": 5e-05, "loss": 0.1603, "loss/crossentropy": 2.7268422842025757, "loss/hidden": 0.0, "loss/logits": 0.1602596789598465, "loss/reg": 1.7055177688598633, "step": 959 }, { "epoch": 0.0096, "grad_norm": 0.397594153881073, "grad_norm_var": 0.0051663773874797295, "learning_rate": 5e-05, "loss": 0.1733, "loss/crossentropy": 2.877332389354706, "loss/hidden": 0.0, "loss/logits": 0.1733493208885193, "loss/reg": 1.7030569314956665, "step": 960 }, { "epoch": 0.00961, "grad_norm": 0.4981625974178314, "grad_norm_var": 0.005480135764708397, "learning_rate": 5e-05, "loss": 0.1826, "loss/crossentropy": 2.656112492084503, "loss/hidden": 0.0, "loss/logits": 0.18259770050644875, "loss/reg": 1.7019816637039185, "step": 961 }, { "epoch": 0.00962, "grad_norm": 0.937751054763794, "grad_norm_var": 0.022106629800203694, "learning_rate": 5e-05, "loss": 0.2083, "loss/crossentropy": 2.907736301422119, "loss/hidden": 0.0, "loss/logits": 0.20831404626369476, "loss/reg": 1.700728416442871, "step": 962 }, { "epoch": 0.00963, "grad_norm": 0.3895174264907837, "grad_norm_var": 0.021883161578962466, "learning_rate": 5e-05, "loss": 0.1803, "loss/crossentropy": 2.845858633518219, "loss/hidden": 0.0, "loss/logits": 0.18031802773475647, "loss/reg": 1.6990742683410645, "step": 963 }, { "epoch": 0.00964, "grad_norm": 0.34548690915107727, "grad_norm_var": 0.02215928485241057, "learning_rate": 5e-05, "loss": 0.1645, "loss/crossentropy": 2.8244311213493347, "loss/hidden": 0.0, "loss/logits": 0.1645219847559929, "loss/reg": 1.697798252105713, "step": 964 }, { "epoch": 0.00965, "grad_norm": 0.36824318766593933, "grad_norm_var": 0.021193656582446117, "learning_rate": 5e-05, "loss": 0.1717, "loss/crossentropy": 2.884181797504425, "loss/hidden": 0.0, "loss/logits": 0.17166699841618538, "loss/reg": 1.6957753896713257, "step": 965 }, { "epoch": 0.00966, "grad_norm": 0.37774839997291565, "grad_norm_var": 0.021001939954339834, "learning_rate": 5e-05, "loss": 0.1824, "loss/crossentropy": 3.025804340839386, "loss/hidden": 0.0, "loss/logits": 0.18241329863667488, "loss/reg": 1.6944239139556885, "step": 966 }, { "epoch": 0.00967, "grad_norm": 0.36408743262290955, "grad_norm_var": 0.02133579751543382, "learning_rate": 5e-05, "loss": 0.1816, "loss/crossentropy": 2.6597015261650085, "loss/hidden": 0.0, "loss/logits": 0.1815933845937252, "loss/reg": 1.692252516746521, "step": 967 }, { "epoch": 0.00968, "grad_norm": 0.34311729669570923, "grad_norm_var": 0.02183892884845392, "learning_rate": 5e-05, "loss": 0.1662, "loss/crossentropy": 2.845684826374054, "loss/hidden": 0.0, "loss/logits": 0.1661607250571251, "loss/reg": 1.6907639503479004, "step": 968 }, { "epoch": 0.00969, "grad_norm": 0.38303908705711365, "grad_norm_var": 0.021779175231247044, "learning_rate": 5e-05, "loss": 0.1693, "loss/crossentropy": 2.558404505252838, "loss/hidden": 0.0, "loss/logits": 0.16925161331892014, "loss/reg": 1.6893569231033325, "step": 969 }, { "epoch": 0.0097, "grad_norm": 0.3850949704647064, "grad_norm_var": 0.02018777587365078, "learning_rate": 5e-05, "loss": 0.1685, "loss/crossentropy": 2.890751600265503, "loss/hidden": 0.0, "loss/logits": 0.1684512346982956, "loss/reg": 1.688266634941101, "step": 970 }, { "epoch": 0.00971, "grad_norm": 0.4068422317504883, "grad_norm_var": 0.020191525445637973, "learning_rate": 5e-05, "loss": 0.1857, "loss/crossentropy": 2.707846701145172, "loss/hidden": 0.0, "loss/logits": 0.18569114059209824, "loss/reg": 1.6867531538009644, "step": 971 }, { "epoch": 0.00972, "grad_norm": 0.3924512565135956, "grad_norm_var": 0.020172897207266394, "learning_rate": 5e-05, "loss": 0.1829, "loss/crossentropy": 2.848098576068878, "loss/hidden": 0.0, "loss/logits": 0.1828712299466133, "loss/reg": 1.6852294206619263, "step": 972 }, { "epoch": 0.00973, "grad_norm": 0.3714575469493866, "grad_norm_var": 0.020336838096992275, "learning_rate": 5e-05, "loss": 0.1773, "loss/crossentropy": 2.8703532814979553, "loss/hidden": 0.0, "loss/logits": 0.17731666564941406, "loss/reg": 1.6838881969451904, "step": 973 }, { "epoch": 0.00974, "grad_norm": 0.35195186734199524, "grad_norm_var": 0.020588227081264298, "learning_rate": 5e-05, "loss": 0.1862, "loss/crossentropy": 2.863659620285034, "loss/hidden": 0.0, "loss/logits": 0.18621815741062164, "loss/reg": 1.6821165084838867, "step": 974 }, { "epoch": 0.00975, "grad_norm": 0.441755086183548, "grad_norm_var": 0.02037088575085001, "learning_rate": 5e-05, "loss": 0.1933, "loss/crossentropy": 2.810901939868927, "loss/hidden": 0.0, "loss/logits": 0.19334488362073898, "loss/reg": 1.6802574396133423, "step": 975 }, { "epoch": 0.00976, "grad_norm": 0.40233367681503296, "grad_norm_var": 0.02035677589008348, "learning_rate": 5e-05, "loss": 0.1983, "loss/crossentropy": 2.681654691696167, "loss/hidden": 0.0, "loss/logits": 0.19833911955356598, "loss/reg": 1.6781818866729736, "step": 976 }, { "epoch": 0.00977, "grad_norm": 0.6531580686569214, "grad_norm_var": 0.023423138566671976, "learning_rate": 5e-05, "loss": 0.2212, "loss/crossentropy": 2.82851505279541, "loss/hidden": 0.0, "loss/logits": 0.2212192267179489, "loss/reg": 1.6763266324996948, "step": 977 }, { "epoch": 0.00978, "grad_norm": 0.3646674156188965, "grad_norm_var": 0.005314392422501734, "learning_rate": 5e-05, "loss": 0.1715, "loss/crossentropy": 2.7788134813308716, "loss/hidden": 0.0, "loss/logits": 0.17147252708673477, "loss/reg": 1.674770712852478, "step": 978 }, { "epoch": 0.00979, "grad_norm": 0.40374529361724854, "grad_norm_var": 0.005314159555871933, "learning_rate": 5e-05, "loss": 0.2092, "loss/crossentropy": 2.7746172547340393, "loss/hidden": 0.0, "loss/logits": 0.2092289738357067, "loss/reg": 1.6723932027816772, "step": 979 }, { "epoch": 0.0098, "grad_norm": 0.3737621009349823, "grad_norm_var": 0.005169172562247167, "learning_rate": 5e-05, "loss": 0.1847, "loss/crossentropy": 2.871635138988495, "loss/hidden": 0.0, "loss/logits": 0.18473126366734505, "loss/reg": 1.6707124710083008, "step": 980 }, { "epoch": 0.00981, "grad_norm": 0.37383797764778137, "grad_norm_var": 0.005148210609649088, "learning_rate": 5e-05, "loss": 0.1733, "loss/crossentropy": 2.760922133922577, "loss/hidden": 0.0, "loss/logits": 0.1733292043209076, "loss/reg": 1.6693061590194702, "step": 981 }, { "epoch": 0.00982, "grad_norm": 0.38922467827796936, "grad_norm_var": 0.00512344066750369, "learning_rate": 5e-05, "loss": 0.164, "loss/crossentropy": 2.8191832304000854, "loss/hidden": 0.0, "loss/logits": 0.16395087912678719, "loss/reg": 1.6671026945114136, "step": 982 }, { "epoch": 0.00983, "grad_norm": 0.40670332312583923, "grad_norm_var": 0.0050327015332547465, "learning_rate": 5e-05, "loss": 0.1892, "loss/crossentropy": 2.847275197505951, "loss/hidden": 0.0, "loss/logits": 0.18915896490216255, "loss/reg": 1.6655491590499878, "step": 983 }, { "epoch": 0.00984, "grad_norm": 0.3739645183086395, "grad_norm_var": 0.004847126969690196, "learning_rate": 5e-05, "loss": 0.194, "loss/crossentropy": 2.817361056804657, "loss/hidden": 0.0, "loss/logits": 0.1940479725599289, "loss/reg": 1.664434552192688, "step": 984 }, { "epoch": 0.00985, "grad_norm": 0.36827903985977173, "grad_norm_var": 0.00490322302848593, "learning_rate": 5e-05, "loss": 0.1793, "loss/crossentropy": 2.9278652667999268, "loss/hidden": 0.0, "loss/logits": 0.17926159501075745, "loss/reg": 1.6631444692611694, "step": 985 }, { "epoch": 0.00986, "grad_norm": 0.36838048696517944, "grad_norm_var": 0.0049621510753778035, "learning_rate": 5e-05, "loss": 0.182, "loss/crossentropy": 2.625900149345398, "loss/hidden": 0.0, "loss/logits": 0.1820085123181343, "loss/reg": 1.6615486145019531, "step": 986 }, { "epoch": 0.00987, "grad_norm": 0.406107097864151, "grad_norm_var": 0.00496177464005331, "learning_rate": 5e-05, "loss": 0.165, "loss/crossentropy": 2.6364856362342834, "loss/hidden": 0.0, "loss/logits": 0.16496483236551285, "loss/reg": 1.6604608297348022, "step": 987 }, { "epoch": 0.00988, "grad_norm": 0.3886563777923584, "grad_norm_var": 0.004967815483619401, "learning_rate": 5e-05, "loss": 0.1803, "loss/crossentropy": 2.796413004398346, "loss/hidden": 0.0, "loss/logits": 0.1803182028234005, "loss/reg": 1.6593427658081055, "step": 988 }, { "epoch": 0.00989, "grad_norm": 0.35161423683166504, "grad_norm_var": 0.005074223354079936, "learning_rate": 5e-05, "loss": 0.1644, "loss/crossentropy": 2.8155667185783386, "loss/hidden": 0.0, "loss/logits": 0.16441339999437332, "loss/reg": 1.6583104133605957, "step": 989 }, { "epoch": 0.0099, "grad_norm": 0.39407941699028015, "grad_norm_var": 0.0049088886087087355, "learning_rate": 5e-05, "loss": 0.1928, "loss/crossentropy": 2.8455575704574585, "loss/hidden": 0.0, "loss/logits": 0.19284628704190254, "loss/reg": 1.658367395401001, "step": 990 }, { "epoch": 0.00991, "grad_norm": 0.3695094883441925, "grad_norm_var": 0.004869171230278472, "learning_rate": 5e-05, "loss": 0.1814, "loss/crossentropy": 2.80289888381958, "loss/hidden": 0.0, "loss/logits": 0.18142832443118095, "loss/reg": 1.6575336456298828, "step": 991 }, { "epoch": 0.00992, "grad_norm": 0.3505973815917969, "grad_norm_var": 0.005015199761620141, "learning_rate": 5e-05, "loss": 0.1769, "loss/crossentropy": 2.731611430644989, "loss/hidden": 0.0, "loss/logits": 0.17688723653554916, "loss/reg": 1.6567095518112183, "step": 992 }, { "epoch": 0.00993, "grad_norm": 0.38008397817611694, "grad_norm_var": 0.0003133497280889556, "learning_rate": 5e-05, "loss": 0.1836, "loss/crossentropy": 2.7869952917099, "loss/hidden": 0.0, "loss/logits": 0.18361864984035492, "loss/reg": 1.6552574634552002, "step": 993 }, { "epoch": 0.00994, "grad_norm": 0.38647469878196716, "grad_norm_var": 0.00030154116815576856, "learning_rate": 5e-05, "loss": 0.1794, "loss/crossentropy": 2.744201898574829, "loss/hidden": 0.0, "loss/logits": 0.17939525097608566, "loss/reg": 1.6545374393463135, "step": 994 }, { "epoch": 0.00995, "grad_norm": 0.3995093107223511, "grad_norm_var": 0.0002894285610608412, "learning_rate": 5e-05, "loss": 0.1916, "loss/crossentropy": 2.694726526737213, "loss/hidden": 0.0, "loss/logits": 0.1916041001677513, "loss/reg": 1.6533586978912354, "step": 995 }, { "epoch": 0.00996, "grad_norm": 0.34584900736808777, "grad_norm_var": 0.0003615231269390488, "learning_rate": 5e-05, "loss": 0.1739, "loss/crossentropy": 2.8016315698623657, "loss/hidden": 0.0, "loss/logits": 0.1739257462322712, "loss/reg": 1.651719570159912, "step": 996 }, { "epoch": 0.00997, "grad_norm": 0.3925288915634155, "grad_norm_var": 0.0003722265532580001, "learning_rate": 5e-05, "loss": 0.177, "loss/crossentropy": 2.897447168827057, "loss/hidden": 0.0, "loss/logits": 0.17699377238750458, "loss/reg": 1.6504392623901367, "step": 997 }, { "epoch": 0.00998, "grad_norm": 0.39880403876304626, "grad_norm_var": 0.0003904176090236522, "learning_rate": 5e-05, "loss": 0.1919, "loss/crossentropy": 2.8275578022003174, "loss/hidden": 0.0, "loss/logits": 0.19189641624689102, "loss/reg": 1.6491367816925049, "step": 998 }, { "epoch": 0.00999, "grad_norm": 0.4310808479785919, "grad_norm_var": 0.0005141220319849537, "learning_rate": 5e-05, "loss": 0.2298, "loss/crossentropy": 2.7063609957695007, "loss/hidden": 0.0, "loss/logits": 0.22976921498775482, "loss/reg": 1.6471576690673828, "step": 999 }, { "epoch": 0.01, "grad_norm": 0.3714313507080078, "grad_norm_var": 0.0005171003041950173, "learning_rate": 5e-05, "loss": 0.1901, "loss/crossentropy": 2.7168938517570496, "loss/hidden": 0.0, "loss/logits": 0.19012651592493057, "loss/reg": 1.6457953453063965, "step": 1000 }, { "epoch": 0.01001, "grad_norm": 0.3641223907470703, "grad_norm_var": 0.0005254723456020552, "learning_rate": 5e-05, "loss": 0.1885, "loss/crossentropy": 2.753402054309845, "loss/hidden": 0.0, "loss/logits": 0.1885070614516735, "loss/reg": 1.6438895463943481, "step": 1001 }, { "epoch": 0.01002, "grad_norm": 0.39603546261787415, "grad_norm_var": 0.0005260879240772306, "learning_rate": 5e-05, "loss": 0.1886, "loss/crossentropy": 2.837618350982666, "loss/hidden": 0.0, "loss/logits": 0.18864833936095238, "loss/reg": 1.642066240310669, "step": 1002 }, { "epoch": 0.01003, "grad_norm": 0.48641237616539, "grad_norm_var": 0.0011775773272432843, "learning_rate": 5e-05, "loss": 0.2391, "loss/crossentropy": 2.831344962120056, "loss/hidden": 0.0, "loss/logits": 0.23907097056508064, "loss/reg": 1.6403310298919678, "step": 1003 }, { "epoch": 0.01004, "grad_norm": 0.3798205256462097, "grad_norm_var": 0.0011815944076354482, "learning_rate": 5e-05, "loss": 0.1869, "loss/crossentropy": 2.7991966605186462, "loss/hidden": 0.0, "loss/logits": 0.18689077720046043, "loss/reg": 1.6383352279663086, "step": 1004 }, { "epoch": 0.01005, "grad_norm": 0.3784593641757965, "grad_norm_var": 0.0010986458368820136, "learning_rate": 5e-05, "loss": 0.1839, "loss/crossentropy": 2.8769296407699585, "loss/hidden": 0.0, "loss/logits": 0.1838611364364624, "loss/reg": 1.6359633207321167, "step": 1005 }, { "epoch": 0.01006, "grad_norm": 0.3823866844177246, "grad_norm_var": 0.001099349676319091, "learning_rate": 5e-05, "loss": 0.1884, "loss/crossentropy": 2.7005032896995544, "loss/hidden": 0.0, "loss/logits": 0.18838313221931458, "loss/reg": 1.6344705820083618, "step": 1006 }, { "epoch": 0.01007, "grad_norm": 0.3799179494380951, "grad_norm_var": 0.0010800167815802877, "learning_rate": 5e-05, "loss": 0.1908, "loss/crossentropy": 2.6673877239227295, "loss/hidden": 0.0, "loss/logits": 0.1907733455300331, "loss/reg": 1.6336002349853516, "step": 1007 }, { "epoch": 0.01008, "grad_norm": 0.37894824147224426, "grad_norm_var": 0.0009852009444313561, "learning_rate": 5e-05, "loss": 0.1828, "loss/crossentropy": 2.7754920721054077, "loss/hidden": 0.0, "loss/logits": 0.18283528462052345, "loss/reg": 1.6325920820236206, "step": 1008 }, { "epoch": 0.01009, "grad_norm": 0.39319896697998047, "grad_norm_var": 0.0009773145681171713, "learning_rate": 5e-05, "loss": 0.1865, "loss/crossentropy": 2.8970988988876343, "loss/hidden": 0.0, "loss/logits": 0.18647681921720505, "loss/reg": 1.631390929222107, "step": 1009 }, { "epoch": 0.0101, "grad_norm": 0.38153842091560364, "grad_norm_var": 0.0009821853173486716, "learning_rate": 5e-05, "loss": 0.1824, "loss/crossentropy": 2.763012409210205, "loss/hidden": 0.0, "loss/logits": 0.18243329226970673, "loss/reg": 1.6309009790420532, "step": 1010 }, { "epoch": 0.01011, "grad_norm": 0.3883878290653229, "grad_norm_var": 0.0009776724027208417, "learning_rate": 5e-05, "loss": 0.1901, "loss/crossentropy": 2.765687584877014, "loss/hidden": 0.0, "loss/logits": 0.19011493027210236, "loss/reg": 1.6300084590911865, "step": 1011 }, { "epoch": 0.01012, "grad_norm": 0.35583510994911194, "grad_norm_var": 0.0009243763684662879, "learning_rate": 5e-05, "loss": 0.1764, "loss/crossentropy": 2.749864339828491, "loss/hidden": 0.0, "loss/logits": 0.1763710305094719, "loss/reg": 1.6288458108901978, "step": 1012 }, { "epoch": 0.01013, "grad_norm": 0.38475698232650757, "grad_norm_var": 0.0009267555768795195, "learning_rate": 5e-05, "loss": 0.1884, "loss/crossentropy": 2.7161130905151367, "loss/hidden": 0.0, "loss/logits": 0.18835009634494781, "loss/reg": 1.6285719871520996, "step": 1013 }, { "epoch": 0.01014, "grad_norm": 0.37661120295524597, "grad_norm_var": 0.000933546249842306, "learning_rate": 5e-05, "loss": 0.1657, "loss/crossentropy": 2.6909996271133423, "loss/hidden": 0.0, "loss/logits": 0.16574236378073692, "loss/reg": 1.6282925605773926, "step": 1014 }, { "epoch": 0.01015, "grad_norm": 0.3704264163970947, "grad_norm_var": 0.0008256614127205612, "learning_rate": 5e-05, "loss": 0.1765, "loss/crossentropy": 2.783431828022003, "loss/hidden": 0.0, "loss/logits": 0.1764850728213787, "loss/reg": 1.627402663230896, "step": 1015 }, { "epoch": 0.01016, "grad_norm": 0.37588080763816833, "grad_norm_var": 0.0008185416610935044, "learning_rate": 5e-05, "loss": 0.1946, "loss/crossentropy": 2.8895158171653748, "loss/hidden": 0.0, "loss/logits": 0.19460226222872734, "loss/reg": 1.6271944046020508, "step": 1016 }, { "epoch": 0.01017, "grad_norm": 0.4015043377876282, "grad_norm_var": 0.0007978520380527006, "learning_rate": 5e-05, "loss": 0.1888, "loss/crossentropy": 2.9011647701263428, "loss/hidden": 0.0, "loss/logits": 0.18881165981292725, "loss/reg": 1.62646484375, "step": 1017 }, { "epoch": 0.01018, "grad_norm": 0.3869231641292572, "grad_norm_var": 0.0007934398262748814, "learning_rate": 5e-05, "loss": 0.1775, "loss/crossentropy": 2.685133635997772, "loss/hidden": 0.0, "loss/logits": 0.17748012766242027, "loss/reg": 1.62509286403656, "step": 1018 }, { "epoch": 0.01019, "grad_norm": 0.43097683787345886, "grad_norm_var": 0.00025487289950493577, "learning_rate": 5e-05, "loss": 0.194, "loss/crossentropy": 2.7974401712417603, "loss/hidden": 0.0, "loss/logits": 0.19396745413541794, "loss/reg": 1.624069094657898, "step": 1019 }, { "epoch": 0.0102, "grad_norm": 0.3837282061576843, "grad_norm_var": 0.0002535984477039547, "learning_rate": 5e-05, "loss": 0.18, "loss/crossentropy": 2.7660019397735596, "loss/hidden": 0.0, "loss/logits": 0.17998040840029716, "loss/reg": 1.6229311227798462, "step": 1020 }, { "epoch": 0.01021, "grad_norm": 0.36976560950279236, "grad_norm_var": 0.00026514185975165343, "learning_rate": 5e-05, "loss": 0.1684, "loss/crossentropy": 2.773696482181549, "loss/hidden": 0.0, "loss/logits": 0.16836534813046455, "loss/reg": 1.6225528717041016, "step": 1021 }, { "epoch": 0.01022, "grad_norm": 0.3894343674182892, "grad_norm_var": 0.00026691892163717516, "learning_rate": 5e-05, "loss": 0.2622, "loss/crossentropy": 2.7716140151023865, "loss/hidden": 0.0, "loss/logits": 0.2621819078922272, "loss/reg": 1.6223206520080566, "step": 1022 }, { "epoch": 0.01023, "grad_norm": 0.38731515407562256, "grad_norm_var": 0.0002660763662075608, "learning_rate": 5e-05, "loss": 0.1896, "loss/crossentropy": 2.721261143684387, "loss/hidden": 0.0, "loss/logits": 0.18962380290031433, "loss/reg": 1.6217230558395386, "step": 1023 }, { "epoch": 0.01024, "grad_norm": 0.3601418733596802, "grad_norm_var": 0.00030260891980304536, "learning_rate": 5e-05, "loss": 0.1769, "loss/crossentropy": 2.8548576831817627, "loss/hidden": 0.0, "loss/logits": 0.17688385397195816, "loss/reg": 1.621361494064331, "step": 1024 }, { "epoch": 0.01025, "grad_norm": 0.37833818793296814, "grad_norm_var": 0.00029724636529409785, "learning_rate": 5e-05, "loss": 0.1717, "loss/crossentropy": 2.7824737429618835, "loss/hidden": 0.0, "loss/logits": 0.17165284976363182, "loss/reg": 1.620347499847412, "step": 1025 }, { "epoch": 0.01026, "grad_norm": 0.40456339716911316, "grad_norm_var": 0.000327128476702739, "learning_rate": 5e-05, "loss": 0.1958, "loss/crossentropy": 2.7456098794937134, "loss/hidden": 0.0, "loss/logits": 0.19577478617429733, "loss/reg": 1.6197152137756348, "step": 1026 }, { "epoch": 0.01027, "grad_norm": 0.36613523960113525, "grad_norm_var": 0.0003451676569719416, "learning_rate": 5e-05, "loss": 0.1794, "loss/crossentropy": 2.7014692425727844, "loss/hidden": 0.0, "loss/logits": 0.1794487200677395, "loss/reg": 1.6191424131393433, "step": 1027 }, { "epoch": 0.01028, "grad_norm": 0.36154961585998535, "grad_norm_var": 0.00032678045604246365, "learning_rate": 5e-05, "loss": 0.1773, "loss/crossentropy": 2.8211179971694946, "loss/hidden": 0.0, "loss/logits": 0.17734427005052567, "loss/reg": 1.6176239252090454, "step": 1028 }, { "epoch": 0.01029, "grad_norm": 0.40533965826034546, "grad_norm_var": 0.00035807133543167187, "learning_rate": 5e-05, "loss": 0.1997, "loss/crossentropy": 2.5644471049308777, "loss/hidden": 0.0, "loss/logits": 0.19972549006342888, "loss/reg": 1.6164883375167847, "step": 1029 }, { "epoch": 0.0103, "grad_norm": 0.3332688510417938, "grad_norm_var": 0.0005198547791486281, "learning_rate": 5e-05, "loss": 0.1704, "loss/crossentropy": 2.6996293663978577, "loss/hidden": 0.0, "loss/logits": 0.1703544519841671, "loss/reg": 1.6148288249969482, "step": 1030 }, { "epoch": 0.01031, "grad_norm": 0.42555445432662964, "grad_norm_var": 0.0006278098210961592, "learning_rate": 5e-05, "loss": 0.1963, "loss/crossentropy": 2.763964354991913, "loss/hidden": 0.0, "loss/logits": 0.19628288969397545, "loss/reg": 1.6132444143295288, "step": 1031 }, { "epoch": 0.01032, "grad_norm": 0.38525158166885376, "grad_norm_var": 0.0006218714027041216, "learning_rate": 5e-05, "loss": 0.1826, "loss/crossentropy": 2.6709354519844055, "loss/hidden": 0.0, "loss/logits": 0.18258166313171387, "loss/reg": 1.612375259399414, "step": 1032 }, { "epoch": 0.01033, "grad_norm": 0.37334632873535156, "grad_norm_var": 0.0006117594391554903, "learning_rate": 5e-05, "loss": 0.1799, "loss/crossentropy": 2.780134439468384, "loss/hidden": 0.0, "loss/logits": 0.1798795834183693, "loss/reg": 1.6112585067749023, "step": 1033 }, { "epoch": 0.01034, "grad_norm": 0.3481215536594391, "grad_norm_var": 0.0006899686053054983, "learning_rate": 5e-05, "loss": 0.1677, "loss/crossentropy": 2.7880823612213135, "loss/hidden": 0.0, "loss/logits": 0.16774233058094978, "loss/reg": 1.6096283197402954, "step": 1034 }, { "epoch": 0.01035, "grad_norm": 0.401376873254776, "grad_norm_var": 0.0005491717474448839, "learning_rate": 5e-05, "loss": 0.1946, "loss/crossentropy": 2.7819212675094604, "loss/hidden": 0.0, "loss/logits": 0.19464891031384468, "loss/reg": 1.6077492237091064, "step": 1035 }, { "epoch": 0.01036, "grad_norm": 0.4044601619243622, "grad_norm_var": 0.0005875103191944693, "learning_rate": 5e-05, "loss": 0.193, "loss/crossentropy": 2.76057767868042, "loss/hidden": 0.0, "loss/logits": 0.1929696425795555, "loss/reg": 1.6054394245147705, "step": 1036 }, { "epoch": 0.01037, "grad_norm": 0.3640334904193878, "grad_norm_var": 0.0005980528349471677, "learning_rate": 5e-05, "loss": 0.173, "loss/crossentropy": 2.805960953235626, "loss/hidden": 0.0, "loss/logits": 0.17302041500806808, "loss/reg": 1.603266716003418, "step": 1037 }, { "epoch": 0.01038, "grad_norm": 0.34832075238227844, "grad_norm_var": 0.0006548009377475091, "learning_rate": 5e-05, "loss": 0.1659, "loss/crossentropy": 2.9117356538772583, "loss/hidden": 0.0, "loss/logits": 0.1658683866262436, "loss/reg": 1.6007298231124878, "step": 1038 }, { "epoch": 0.01039, "grad_norm": 0.35593798756599426, "grad_norm_var": 0.0006771319252449855, "learning_rate": 5e-05, "loss": 0.1783, "loss/crossentropy": 2.8520063757896423, "loss/hidden": 0.0, "loss/logits": 0.17831408604979515, "loss/reg": 1.5984032154083252, "step": 1039 }, { "epoch": 0.0104, "grad_norm": 0.44101372361183167, "grad_norm_var": 0.0009150763472360879, "learning_rate": 5e-05, "loss": 0.1763, "loss/crossentropy": 2.9714547991752625, "loss/hidden": 0.0, "loss/logits": 0.1763400174677372, "loss/reg": 1.596476435661316, "step": 1040 }, { "epoch": 0.01041, "grad_norm": 0.42450451850891113, "grad_norm_var": 0.001031664270957929, "learning_rate": 5e-05, "loss": 0.1857, "loss/crossentropy": 2.8061989545822144, "loss/hidden": 0.0, "loss/logits": 0.18570463731884956, "loss/reg": 1.5949974060058594, "step": 1041 }, { "epoch": 0.01042, "grad_norm": 1.413458228111267, "grad_norm_var": 0.06742490936139979, "learning_rate": 5e-05, "loss": 0.2472, "loss/crossentropy": 2.941382944583893, "loss/hidden": 0.0, "loss/logits": 0.24719487875699997, "loss/reg": 1.5933181047439575, "step": 1042 }, { "epoch": 0.01043, "grad_norm": 0.4077235460281372, "grad_norm_var": 0.0670847180936582, "learning_rate": 5e-05, "loss": 0.1743, "loss/crossentropy": 2.744387209415436, "loss/hidden": 0.0, "loss/logits": 0.17426170781254768, "loss/reg": 1.5918529033660889, "step": 1043 }, { "epoch": 0.01044, "grad_norm": 0.3899991810321808, "grad_norm_var": 0.06680138514417872, "learning_rate": 5e-05, "loss": 0.169, "loss/crossentropy": 2.9295618534088135, "loss/hidden": 0.0, "loss/logits": 0.169007558375597, "loss/reg": 1.5900723934173584, "step": 1044 }, { "epoch": 0.01045, "grad_norm": 0.4866078794002533, "grad_norm_var": 0.06671553563036674, "learning_rate": 5e-05, "loss": 0.2114, "loss/crossentropy": 2.7827881574630737, "loss/hidden": 0.0, "loss/logits": 0.2113601267337799, "loss/reg": 1.5889488458633423, "step": 1045 }, { "epoch": 0.01046, "grad_norm": 0.39279693365097046, "grad_norm_var": 0.0659594213964073, "learning_rate": 5e-05, "loss": 0.1807, "loss/crossentropy": 2.780573546886444, "loss/hidden": 0.0, "loss/logits": 0.1807180792093277, "loss/reg": 1.5870665311813354, "step": 1046 }, { "epoch": 0.01047, "grad_norm": 0.3671790659427643, "grad_norm_var": 0.06644172437070375, "learning_rate": 5e-05, "loss": 0.1795, "loss/crossentropy": 2.8943958282470703, "loss/hidden": 0.0, "loss/logits": 0.17946722730994225, "loss/reg": 1.5854285955429077, "step": 1047 }, { "epoch": 0.01048, "grad_norm": 0.37834271788597107, "grad_norm_var": 0.06651034798313456, "learning_rate": 5e-05, "loss": 0.1716, "loss/crossentropy": 2.7288926243782043, "loss/hidden": 0.0, "loss/logits": 0.1715567633509636, "loss/reg": 1.584210991859436, "step": 1048 }, { "epoch": 0.01049, "grad_norm": 0.37263625860214233, "grad_norm_var": 0.06651821205251345, "learning_rate": 5e-05, "loss": 0.1738, "loss/crossentropy": 2.5842645168304443, "loss/hidden": 0.0, "loss/logits": 0.17380548641085625, "loss/reg": 1.5832701921463013, "step": 1049 }, { "epoch": 0.0105, "grad_norm": 0.357949823141098, "grad_norm_var": 0.06638283943495621, "learning_rate": 5e-05, "loss": 0.1762, "loss/crossentropy": 2.8539857864379883, "loss/hidden": 0.0, "loss/logits": 0.17616372555494308, "loss/reg": 1.5817757844924927, "step": 1050 }, { "epoch": 0.01051, "grad_norm": 0.40757089853286743, "grad_norm_var": 0.06633959192563718, "learning_rate": 5e-05, "loss": 0.1901, "loss/crossentropy": 2.877175509929657, "loss/hidden": 0.0, "loss/logits": 0.19005050137639046, "loss/reg": 1.580580472946167, "step": 1051 }, { "epoch": 0.01052, "grad_norm": 0.43088629841804504, "grad_norm_var": 0.06619799704120433, "learning_rate": 5e-05, "loss": 0.2072, "loss/crossentropy": 2.6631829738616943, "loss/hidden": 0.0, "loss/logits": 0.20722678676247597, "loss/reg": 1.5795587301254272, "step": 1052 }, { "epoch": 0.01053, "grad_norm": 2.6514923572540283, "grad_norm_var": 0.3643590351017664, "learning_rate": 5e-05, "loss": 0.2386, "loss/crossentropy": 2.8652138113975525, "loss/hidden": 0.0, "loss/logits": 0.23861064016819, "loss/reg": 1.578200101852417, "step": 1053 }, { "epoch": 0.01054, "grad_norm": 0.4709310531616211, "grad_norm_var": 0.3611571581594747, "learning_rate": 5e-05, "loss": 0.184, "loss/crossentropy": 2.751032590866089, "loss/hidden": 0.0, "loss/logits": 0.18403061106801033, "loss/reg": 1.5764774084091187, "step": 1054 }, { "epoch": 0.01055, "grad_norm": 0.542866051197052, "grad_norm_var": 0.3570259510737727, "learning_rate": 5e-05, "loss": 0.2072, "loss/crossentropy": 2.771138370037079, "loss/hidden": 0.0, "loss/logits": 0.20715508237481117, "loss/reg": 1.5751893520355225, "step": 1055 }, { "epoch": 0.01056, "grad_norm": 0.4095610976219177, "grad_norm_var": 0.35784257490682114, "learning_rate": 5e-05, "loss": 0.1829, "loss/crossentropy": 2.6514610052108765, "loss/hidden": 0.0, "loss/logits": 0.18285758048295975, "loss/reg": 1.5734416246414185, "step": 1056 }, { "epoch": 0.01057, "grad_norm": 0.45324698090553284, "grad_norm_var": 0.35714871626115285, "learning_rate": 5e-05, "loss": 0.2123, "loss/crossentropy": 2.73183411359787, "loss/hidden": 0.0, "loss/logits": 0.21226082369685173, "loss/reg": 1.5717664957046509, "step": 1057 }, { "epoch": 0.01058, "grad_norm": 0.42001500725746155, "grad_norm_var": 0.3138407253297512, "learning_rate": 5e-05, "loss": 0.1883, "loss/crossentropy": 2.7152724266052246, "loss/hidden": 0.0, "loss/logits": 0.1883089914917946, "loss/reg": 1.5704617500305176, "step": 1058 }, { "epoch": 0.01059, "grad_norm": 0.3464891314506531, "grad_norm_var": 0.31530804811188473, "learning_rate": 5e-05, "loss": 0.1642, "loss/crossentropy": 2.7491345405578613, "loss/hidden": 0.0, "loss/logits": 0.16423293575644493, "loss/reg": 1.569446325302124, "step": 1059 }, { "epoch": 0.0106, "grad_norm": 0.3669905960559845, "grad_norm_var": 0.3158470526355899, "learning_rate": 5e-05, "loss": 0.1673, "loss/crossentropy": 2.836626708507538, "loss/hidden": 0.0, "loss/logits": 0.16727466136217117, "loss/reg": 1.5682600736618042, "step": 1060 }, { "epoch": 0.01061, "grad_norm": 0.39248570799827576, "grad_norm_var": 0.31723986653993524, "learning_rate": 5e-05, "loss": 0.1709, "loss/crossentropy": 2.951793611049652, "loss/hidden": 0.0, "loss/logits": 0.17091352492570877, "loss/reg": 1.5677317380905151, "step": 1061 }, { "epoch": 0.01062, "grad_norm": 0.42078810930252075, "grad_norm_var": 0.31671112367648746, "learning_rate": 5e-05, "loss": 0.1945, "loss/crossentropy": 2.748759090900421, "loss/hidden": 0.0, "loss/logits": 0.19453415274620056, "loss/reg": 1.5668708086013794, "step": 1062 }, { "epoch": 0.01063, "grad_norm": 0.37992992997169495, "grad_norm_var": 0.3164115915761645, "learning_rate": 5e-05, "loss": 0.1878, "loss/crossentropy": 2.6824185848236084, "loss/hidden": 0.0, "loss/logits": 0.18784139305353165, "loss/reg": 1.567291259765625, "step": 1063 }, { "epoch": 0.01064, "grad_norm": 0.3837074935436249, "grad_norm_var": 0.3162905057857987, "learning_rate": 5e-05, "loss": 0.1761, "loss/crossentropy": 2.7311676144599915, "loss/hidden": 0.0, "loss/logits": 0.176094900816679, "loss/reg": 1.566008448600769, "step": 1064 }, { "epoch": 0.01065, "grad_norm": 0.3652944266796112, "grad_norm_var": 0.3164679597230645, "learning_rate": 5e-05, "loss": 0.1734, "loss/crossentropy": 2.7913513779640198, "loss/hidden": 0.0, "loss/logits": 0.17336497828364372, "loss/reg": 1.5644934177398682, "step": 1065 }, { "epoch": 0.01066, "grad_norm": 0.5269585847854614, "grad_norm_var": 0.3139251636385244, "learning_rate": 5e-05, "loss": 0.1845, "loss/crossentropy": 2.6032761335372925, "loss/hidden": 0.0, "loss/logits": 0.18454618379473686, "loss/reg": 1.5634944438934326, "step": 1066 }, { "epoch": 0.01067, "grad_norm": 0.4014894664287567, "grad_norm_var": 0.3140515403632015, "learning_rate": 5e-05, "loss": 0.188, "loss/crossentropy": 2.719841182231903, "loss/hidden": 0.0, "loss/logits": 0.18803178519010544, "loss/reg": 1.5624383687973022, "step": 1067 }, { "epoch": 0.01068, "grad_norm": 0.345795601606369, "grad_norm_var": 0.31597113808328114, "learning_rate": 5e-05, "loss": 0.1734, "loss/crossentropy": 2.8608756065368652, "loss/hidden": 0.0, "loss/logits": 0.17335866764187813, "loss/reg": 1.5619043111801147, "step": 1068 }, { "epoch": 0.01069, "grad_norm": 0.403914213180542, "grad_norm_var": 0.0033892069415853284, "learning_rate": 5e-05, "loss": 0.1908, "loss/crossentropy": 2.7109196186065674, "loss/hidden": 0.0, "loss/logits": 0.1907677985727787, "loss/reg": 1.5616743564605713, "step": 1069 }, { "epoch": 0.0107, "grad_norm": 0.408088743686676, "grad_norm_var": 0.0031623901529525987, "learning_rate": 5e-05, "loss": 0.2084, "loss/crossentropy": 2.7172587513923645, "loss/hidden": 0.0, "loss/logits": 0.20841724425554276, "loss/reg": 1.560802936553955, "step": 1070 }, { "epoch": 0.01071, "grad_norm": 0.36463382840156555, "grad_norm_var": 0.002001661703932278, "learning_rate": 5e-05, "loss": 0.1717, "loss/crossentropy": 2.778719425201416, "loss/hidden": 0.0, "loss/logits": 0.1717045158147812, "loss/reg": 1.560097336769104, "step": 1071 }, { "epoch": 0.01072, "grad_norm": 0.37026646733283997, "grad_norm_var": 0.0020445979916204338, "learning_rate": 5e-05, "loss": 0.1748, "loss/crossentropy": 2.7089297771453857, "loss/hidden": 0.0, "loss/logits": 0.17478087916970253, "loss/reg": 1.5588798522949219, "step": 1072 }, { "epoch": 0.01073, "grad_norm": 0.4527875781059265, "grad_norm_var": 0.0020411585504943965, "learning_rate": 5e-05, "loss": 0.1905, "loss/crossentropy": 2.9299193620681763, "loss/hidden": 0.0, "loss/logits": 0.19047147780656815, "loss/reg": 1.5580799579620361, "step": 1073 }, { "epoch": 0.01074, "grad_norm": 0.4295300841331482, "grad_norm_var": 0.0020762032373007446, "learning_rate": 5e-05, "loss": 0.1935, "loss/crossentropy": 2.6749280095100403, "loss/hidden": 0.0, "loss/logits": 0.19354819506406784, "loss/reg": 1.5568370819091797, "step": 1074 }, { "epoch": 0.01075, "grad_norm": 0.37927335500717163, "grad_norm_var": 0.0019206305721111705, "learning_rate": 5e-05, "loss": 0.1866, "loss/crossentropy": 2.7454851865768433, "loss/hidden": 0.0, "loss/logits": 0.1866455115377903, "loss/reg": 1.5555474758148193, "step": 1075 }, { "epoch": 0.01076, "grad_norm": 0.37798190116882324, "grad_norm_var": 0.0018805443791561534, "learning_rate": 5e-05, "loss": 0.1654, "loss/crossentropy": 2.839569091796875, "loss/hidden": 0.0, "loss/logits": 0.1654018685221672, "loss/reg": 1.5542937517166138, "step": 1076 }, { "epoch": 0.01077, "grad_norm": 0.40603601932525635, "grad_norm_var": 0.0018781135855993687, "learning_rate": 5e-05, "loss": 0.1884, "loss/crossentropy": 2.769132375717163, "loss/hidden": 0.0, "loss/logits": 0.18842901661992073, "loss/reg": 1.5535392761230469, "step": 1077 }, { "epoch": 0.01078, "grad_norm": 0.38742178678512573, "grad_norm_var": 0.001859793659604044, "learning_rate": 5e-05, "loss": 0.1827, "loss/crossentropy": 2.616896152496338, "loss/hidden": 0.0, "loss/logits": 0.18272512406110764, "loss/reg": 1.5521759986877441, "step": 1078 }, { "epoch": 0.01079, "grad_norm": 3.135983467102051, "grad_norm_var": 0.4696119388561969, "learning_rate": 5e-05, "loss": 0.3609, "loss/crossentropy": 2.867876887321472, "loss/hidden": 0.0, "loss/logits": 0.36090877279639244, "loss/reg": 1.5513793230056763, "step": 1079 }, { "epoch": 0.0108, "grad_norm": 0.44463542103767395, "grad_norm_var": 0.46832083359345933, "learning_rate": 5e-05, "loss": 0.1889, "loss/crossentropy": 2.739173710346222, "loss/hidden": 0.0, "loss/logits": 0.18892118334770203, "loss/reg": 1.5504870414733887, "step": 1080 }, { "epoch": 0.01081, "grad_norm": 0.45169124007225037, "grad_norm_var": 0.4663715745961762, "learning_rate": 5e-05, "loss": 0.1868, "loss/crossentropy": 2.576621174812317, "loss/hidden": 0.0, "loss/logits": 0.18683869391679764, "loss/reg": 1.5502347946166992, "step": 1081 }, { "epoch": 0.01082, "grad_norm": 0.4132075905799866, "grad_norm_var": 0.46799089854198317, "learning_rate": 5e-05, "loss": 0.1763, "loss/crossentropy": 2.822961449623108, "loss/hidden": 0.0, "loss/logits": 0.17634045705199242, "loss/reg": 1.5503305196762085, "step": 1082 }, { "epoch": 0.01083, "grad_norm": 0.4182119369506836, "grad_norm_var": 0.4676253053735462, "learning_rate": 5e-05, "loss": 0.1947, "loss/crossentropy": 2.9488189220428467, "loss/hidden": 0.0, "loss/logits": 0.19468581303954124, "loss/reg": 1.5501155853271484, "step": 1083 }, { "epoch": 0.01084, "grad_norm": 0.41536945104599, "grad_norm_var": 0.4658077316127261, "learning_rate": 5e-05, "loss": 0.1691, "loss/crossentropy": 2.7678072452545166, "loss/hidden": 0.0, "loss/logits": 0.16905947774648666, "loss/reg": 1.549827218055725, "step": 1084 }, { "epoch": 0.01085, "grad_norm": 0.42519059777259827, "grad_norm_var": 0.4653402127084352, "learning_rate": 5e-05, "loss": 0.1772, "loss/crossentropy": 2.7813703417778015, "loss/hidden": 0.0, "loss/logits": 0.17720388248562813, "loss/reg": 1.550193428993225, "step": 1085 }, { "epoch": 0.01086, "grad_norm": 0.4168826937675476, "grad_norm_var": 0.4651434528428754, "learning_rate": 5e-05, "loss": 0.2069, "loss/crossentropy": 2.845678389072418, "loss/hidden": 0.0, "loss/logits": 0.20686748251318932, "loss/reg": 1.54852294921875, "step": 1086 }, { "epoch": 0.01087, "grad_norm": 0.3667008578777313, "grad_norm_var": 0.4650842073091175, "learning_rate": 5e-05, "loss": 0.1663, "loss/crossentropy": 2.8423004746437073, "loss/hidden": 0.0, "loss/logits": 0.16627563163638115, "loss/reg": 1.5476669073104858, "step": 1087 }, { "epoch": 0.01088, "grad_norm": 0.3929131329059601, "grad_norm_var": 0.46448085164061514, "learning_rate": 5e-05, "loss": 0.1842, "loss/crossentropy": 2.74026757478714, "loss/hidden": 0.0, "loss/logits": 0.1841907612979412, "loss/reg": 1.5473967790603638, "step": 1088 }, { "epoch": 0.01089, "grad_norm": 0.39987713098526, "grad_norm_var": 0.4655681808252753, "learning_rate": 5e-05, "loss": 0.1762, "loss/crossentropy": 2.771194040775299, "loss/hidden": 0.0, "loss/logits": 0.17620433494448662, "loss/reg": 1.5467705726623535, "step": 1089 }, { "epoch": 0.0109, "grad_norm": 0.3864990472793579, "grad_norm_var": 0.46654038035843726, "learning_rate": 5e-05, "loss": 0.1849, "loss/crossentropy": 2.73794162273407, "loss/hidden": 0.0, "loss/logits": 0.1848638951778412, "loss/reg": 1.5455354452133179, "step": 1090 }, { "epoch": 0.01091, "grad_norm": 0.36454349756240845, "grad_norm_var": 0.46694053852503253, "learning_rate": 5e-05, "loss": 0.1934, "loss/crossentropy": 2.7609499096870422, "loss/hidden": 0.0, "loss/logits": 0.19338013604283333, "loss/reg": 1.5440956354141235, "step": 1091 }, { "epoch": 0.01092, "grad_norm": 0.44408488273620605, "grad_norm_var": 0.46547544141069996, "learning_rate": 5e-05, "loss": 0.2098, "loss/crossentropy": 2.9235315918922424, "loss/hidden": 0.0, "loss/logits": 0.20981372147798538, "loss/reg": 1.5432744026184082, "step": 1092 }, { "epoch": 0.01093, "grad_norm": 0.5225197672843933, "grad_norm_var": 0.4636320430634635, "learning_rate": 5e-05, "loss": 0.2165, "loss/crossentropy": 2.971177875995636, "loss/hidden": 0.0, "loss/logits": 0.21645646914839745, "loss/reg": 1.542582392692566, "step": 1093 }, { "epoch": 0.01094, "grad_norm": 0.4295833706855774, "grad_norm_var": 0.4626234072637335, "learning_rate": 5e-05, "loss": 0.1917, "loss/crossentropy": 2.743496000766754, "loss/hidden": 0.0, "loss/logits": 0.19165712594985962, "loss/reg": 1.541102409362793, "step": 1094 }, { "epoch": 0.01095, "grad_norm": 0.3936365842819214, "grad_norm_var": 0.0014465937708503795, "learning_rate": 5e-05, "loss": 0.182, "loss/crossentropy": 2.81677907705307, "loss/hidden": 0.0, "loss/logits": 0.18201437965035439, "loss/reg": 1.5404126644134521, "step": 1095 }, { "epoch": 0.01096, "grad_norm": 0.5806572437286377, "grad_norm_var": 0.0030888115382388857, "learning_rate": 5e-05, "loss": 0.1851, "loss/crossentropy": 2.8792348504066467, "loss/hidden": 0.0, "loss/logits": 0.18511545285582542, "loss/reg": 1.5391901731491089, "step": 1096 }, { "epoch": 0.01097, "grad_norm": 0.40794965624809265, "grad_norm_var": 0.003060587668769073, "learning_rate": 5e-05, "loss": 0.1907, "loss/crossentropy": 2.9066116213798523, "loss/hidden": 0.0, "loss/logits": 0.19067668542265892, "loss/reg": 1.5390186309814453, "step": 1097 }, { "epoch": 0.01098, "grad_norm": 0.3634330630302429, "grad_norm_var": 0.003284496285873558, "learning_rate": 5e-05, "loss": 0.161, "loss/crossentropy": 2.7170445919036865, "loss/hidden": 0.0, "loss/logits": 0.1610131524503231, "loss/reg": 1.5397425889968872, "step": 1098 }, { "epoch": 0.01099, "grad_norm": 1.1740427017211914, "grad_norm_var": 0.03875858693632671, "learning_rate": 5e-05, "loss": 0.2375, "loss/crossentropy": 2.8482487201690674, "loss/hidden": 0.0, "loss/logits": 0.23750562220811844, "loss/reg": 1.539298415184021, "step": 1099 }, { "epoch": 0.011, "grad_norm": 0.41955143213272095, "grad_norm_var": 0.038730476788456265, "learning_rate": 5e-05, "loss": 0.1936, "loss/crossentropy": 2.771632492542267, "loss/hidden": 0.0, "loss/logits": 0.19357017427682877, "loss/reg": 1.5384517908096313, "step": 1100 }, { "epoch": 0.01101, "grad_norm": 0.44815731048583984, "grad_norm_var": 0.03863233892448896, "learning_rate": 5e-05, "loss": 0.1825, "loss/crossentropy": 2.8682021498680115, "loss/hidden": 0.0, "loss/logits": 0.18247045576572418, "loss/reg": 1.5383305549621582, "step": 1101 }, { "epoch": 0.01102, "grad_norm": 0.8878780007362366, "grad_norm_var": 0.049196589116129716, "learning_rate": 5e-05, "loss": 0.2108, "loss/crossentropy": 2.742558717727661, "loss/hidden": 0.0, "loss/logits": 0.21081242337822914, "loss/reg": 1.5376973152160645, "step": 1102 }, { "epoch": 0.01103, "grad_norm": 0.45139309763908386, "grad_norm_var": 0.04815231816326545, "learning_rate": 5e-05, "loss": 0.1853, "loss/crossentropy": 2.6859241724014282, "loss/hidden": 0.0, "loss/logits": 0.18528878688812256, "loss/reg": 1.5371124744415283, "step": 1103 }, { "epoch": 0.01104, "grad_norm": 0.542457103729248, "grad_norm_var": 0.04733165822270831, "learning_rate": 5e-05, "loss": 0.2534, "loss/crossentropy": 3.040016829967499, "loss/hidden": 0.0, "loss/logits": 0.2534272372722626, "loss/reg": 1.53706693649292, "step": 1104 }, { "epoch": 0.01105, "grad_norm": 0.45765799283981323, "grad_norm_var": 0.04666483176769951, "learning_rate": 5e-05, "loss": 0.1717, "loss/crossentropy": 2.705473482608795, "loss/hidden": 0.0, "loss/logits": 0.17168359830975533, "loss/reg": 1.5363619327545166, "step": 1105 }, { "epoch": 0.01106, "grad_norm": 0.47550585865974426, "grad_norm_var": 0.04560972358215081, "learning_rate": 5e-05, "loss": 0.1882, "loss/crossentropy": 2.8135305643081665, "loss/hidden": 0.0, "loss/logits": 0.18824224174022675, "loss/reg": 1.5352301597595215, "step": 1106 }, { "epoch": 0.01107, "grad_norm": 0.45044830441474915, "grad_norm_var": 0.044259536577999074, "learning_rate": 5e-05, "loss": 0.1995, "loss/crossentropy": 2.8964192271232605, "loss/hidden": 0.0, "loss/logits": 0.19954011589288712, "loss/reg": 1.535237431526184, "step": 1107 }, { "epoch": 0.01108, "grad_norm": 0.38536059856414795, "grad_norm_var": 0.045132585802010065, "learning_rate": 5e-05, "loss": 0.1859, "loss/crossentropy": 2.7900161743164062, "loss/hidden": 0.0, "loss/logits": 0.18588732928037643, "loss/reg": 1.5349178314208984, "step": 1108 }, { "epoch": 0.01109, "grad_norm": 0.3790653944015503, "grad_norm_var": 0.04645454606829357, "learning_rate": 5e-05, "loss": 0.1858, "loss/crossentropy": 2.7974360585212708, "loss/hidden": 0.0, "loss/logits": 0.1857653297483921, "loss/reg": 1.535470962524414, "step": 1109 }, { "epoch": 0.0111, "grad_norm": 0.4460708796977997, "grad_norm_var": 0.046282830384225, "learning_rate": 5e-05, "loss": 0.1936, "loss/crossentropy": 2.8131036162376404, "loss/hidden": 0.0, "loss/logits": 0.19361238926649094, "loss/reg": 1.5345796346664429, "step": 1110 }, { "epoch": 0.01111, "grad_norm": 0.39801692962646484, "grad_norm_var": 0.046212298527668116, "learning_rate": 5e-05, "loss": 0.173, "loss/crossentropy": 2.8013834953308105, "loss/hidden": 0.0, "loss/logits": 0.1730150803923607, "loss/reg": 1.5339577198028564, "step": 1111 }, { "epoch": 0.01112, "grad_norm": 0.37219998240470886, "grad_norm_var": 0.047151327489262096, "learning_rate": 5e-05, "loss": 0.173, "loss/crossentropy": 2.745340883731842, "loss/hidden": 0.0, "loss/logits": 0.17297760024666786, "loss/reg": 1.532616376876831, "step": 1112 }, { "epoch": 0.01113, "grad_norm": 0.37448444962501526, "grad_norm_var": 0.04764855990328396, "learning_rate": 5e-05, "loss": 0.1745, "loss/crossentropy": 2.7970882058143616, "loss/hidden": 0.0, "loss/logits": 0.1744941510260105, "loss/reg": 1.5317273139953613, "step": 1113 }, { "epoch": 0.01114, "grad_norm": 0.44670793414115906, "grad_norm_var": 0.04654778230486344, "learning_rate": 5e-05, "loss": 0.1897, "loss/crossentropy": 2.8545820713043213, "loss/hidden": 0.0, "loss/logits": 0.18966153636574745, "loss/reg": 1.5309053659439087, "step": 1114 }, { "epoch": 0.01115, "grad_norm": 0.46504896879196167, "grad_norm_var": 0.014889839873678161, "learning_rate": 5e-05, "loss": 0.2429, "loss/crossentropy": 2.6743342876434326, "loss/hidden": 0.0, "loss/logits": 0.24294951558113098, "loss/reg": 1.5288500785827637, "step": 1115 }, { "epoch": 0.01116, "grad_norm": 0.38851630687713623, "grad_norm_var": 0.015127761548291678, "learning_rate": 5e-05, "loss": 0.1782, "loss/crossentropy": 2.8051797747612, "loss/hidden": 0.0, "loss/logits": 0.1781875565648079, "loss/reg": 1.5276798009872437, "step": 1116 }, { "epoch": 0.01117, "grad_norm": 0.4078308045864105, "grad_norm_var": 0.015296091420591058, "learning_rate": 5e-05, "loss": 0.2051, "loss/crossentropy": 2.6146674156188965, "loss/hidden": 0.0, "loss/logits": 0.20513415709137917, "loss/reg": 1.5253890752792358, "step": 1117 }, { "epoch": 0.01118, "grad_norm": 0.40175458788871765, "grad_norm_var": 0.0022052748110299113, "learning_rate": 5e-05, "loss": 0.1816, "loss/crossentropy": 2.786243498325348, "loss/hidden": 0.0, "loss/logits": 0.18161213025450706, "loss/reg": 1.5237782001495361, "step": 1118 }, { "epoch": 0.01119, "grad_norm": 0.38154929876327515, "grad_norm_var": 0.00228912119924131, "learning_rate": 5e-05, "loss": 0.1814, "loss/crossentropy": 2.8448697328567505, "loss/hidden": 0.0, "loss/logits": 0.18143780902028084, "loss/reg": 1.522615671157837, "step": 1119 }, { "epoch": 0.0112, "grad_norm": 0.38408970832824707, "grad_norm_var": 0.0013403912284101576, "learning_rate": 5e-05, "loss": 0.1835, "loss/crossentropy": 2.674954891204834, "loss/hidden": 0.0, "loss/logits": 0.18346881866455078, "loss/reg": 1.5216854810714722, "step": 1120 }, { "epoch": 0.01121, "grad_norm": 0.3568238317966461, "grad_norm_var": 0.0013807554136454217, "learning_rate": 5e-05, "loss": 0.1846, "loss/crossentropy": 2.753901481628418, "loss/hidden": 0.0, "loss/logits": 0.18456534296274185, "loss/reg": 1.5205482244491577, "step": 1121 }, { "epoch": 0.01122, "grad_norm": 0.4012138545513153, "grad_norm_var": 0.0010480325632392688, "learning_rate": 5e-05, "loss": 0.1946, "loss/crossentropy": 2.6712945103645325, "loss/hidden": 0.0, "loss/logits": 0.19459592550992966, "loss/reg": 1.5195826292037964, "step": 1122 }, { "epoch": 0.01123, "grad_norm": 0.3645152449607849, "grad_norm_var": 0.0009595980710018252, "learning_rate": 5e-05, "loss": 0.1879, "loss/crossentropy": 2.7605273723602295, "loss/hidden": 0.0, "loss/logits": 0.18793095648288727, "loss/reg": 1.5193746089935303, "step": 1123 }, { "epoch": 0.01124, "grad_norm": 0.35984060168266296, "grad_norm_var": 0.0010401730322851521, "learning_rate": 5e-05, "loss": 0.1741, "loss/crossentropy": 2.870726466178894, "loss/hidden": 0.0, "loss/logits": 0.1740589663386345, "loss/reg": 1.5188905000686646, "step": 1124 }, { "epoch": 0.01125, "grad_norm": 0.3941318690776825, "grad_norm_var": 0.0010213796255876299, "learning_rate": 5e-05, "loss": 0.1802, "loss/crossentropy": 2.7815005779266357, "loss/hidden": 0.0, "loss/logits": 0.1801726222038269, "loss/reg": 1.5177239179611206, "step": 1125 }, { "epoch": 0.01126, "grad_norm": 0.4047865569591522, "grad_norm_var": 0.0008546231628673814, "learning_rate": 5e-05, "loss": 0.2063, "loss/crossentropy": 2.715599477291107, "loss/hidden": 0.0, "loss/logits": 0.20628444477915764, "loss/reg": 1.5171293020248413, "step": 1126 }, { "epoch": 0.01127, "grad_norm": 0.41338419914245605, "grad_norm_var": 0.0008779320407387677, "learning_rate": 5e-05, "loss": 0.2112, "loss/crossentropy": 2.9712833166122437, "loss/hidden": 0.0, "loss/logits": 0.2111729383468628, "loss/reg": 1.5161170959472656, "step": 1127 }, { "epoch": 0.01128, "grad_norm": 0.3533566892147064, "grad_norm_var": 0.0009569173440450385, "learning_rate": 5e-05, "loss": 0.1822, "loss/crossentropy": 2.844532549381256, "loss/hidden": 0.0, "loss/logits": 0.1821518912911415, "loss/reg": 1.5148218870162964, "step": 1128 }, { "epoch": 0.01129, "grad_norm": 0.36923155188560486, "grad_norm_var": 0.000972049210964802, "learning_rate": 5e-05, "loss": 0.184, "loss/crossentropy": 2.8003381490707397, "loss/hidden": 0.0, "loss/logits": 0.18395673483610153, "loss/reg": 1.5138037204742432, "step": 1129 }, { "epoch": 0.0113, "grad_norm": 0.38235801458358765, "grad_norm_var": 0.0007726070702099741, "learning_rate": 5e-05, "loss": 0.188, "loss/crossentropy": 2.837286412715912, "loss/hidden": 0.0, "loss/logits": 0.1879820078611374, "loss/reg": 1.512952208518982, "step": 1130 }, { "epoch": 0.01131, "grad_norm": 0.37186941504478455, "grad_norm_var": 0.00037387253486032, "learning_rate": 5e-05, "loss": 0.1758, "loss/crossentropy": 2.737375319004059, "loss/hidden": 0.0, "loss/logits": 0.1758098341524601, "loss/reg": 1.5121604204177856, "step": 1131 }, { "epoch": 0.01132, "grad_norm": 0.36942365765571594, "grad_norm_var": 0.00038376674257047566, "learning_rate": 5e-05, "loss": 0.1963, "loss/crossentropy": 2.8107948303222656, "loss/hidden": 0.0, "loss/logits": 0.19633925706148148, "loss/reg": 1.5109307765960693, "step": 1132 }, { "epoch": 0.01133, "grad_norm": 0.36502739787101746, "grad_norm_var": 0.00035233925543644756, "learning_rate": 5e-05, "loss": 0.192, "loss/crossentropy": 2.739868938922882, "loss/hidden": 0.0, "loss/logits": 0.19196948036551476, "loss/reg": 1.5099393129348755, "step": 1133 }, { "epoch": 0.01134, "grad_norm": 0.35947197675704956, "grad_norm_var": 0.0003390916400412743, "learning_rate": 5e-05, "loss": 0.1733, "loss/crossentropy": 2.9228984713554382, "loss/hidden": 0.0, "loss/logits": 0.17327600717544556, "loss/reg": 1.5089576244354248, "step": 1134 }, { "epoch": 0.01135, "grad_norm": 0.4201592803001404, "grad_norm_var": 0.00045598006875781453, "learning_rate": 5e-05, "loss": 0.2055, "loss/crossentropy": 2.709896743297577, "loss/hidden": 0.0, "loss/logits": 0.2055364064872265, "loss/reg": 1.5074372291564941, "step": 1135 }, { "epoch": 0.01136, "grad_norm": 0.374734103679657, "grad_norm_var": 0.0004555446863156713, "learning_rate": 5e-05, "loss": 0.1795, "loss/crossentropy": 2.7715643644332886, "loss/hidden": 0.0, "loss/logits": 0.17949137836694717, "loss/reg": 1.5065659284591675, "step": 1136 }, { "epoch": 0.01137, "grad_norm": 0.3729066550731659, "grad_norm_var": 0.00042464881057899067, "learning_rate": 5e-05, "loss": 0.1762, "loss/crossentropy": 2.7452287673950195, "loss/hidden": 0.0, "loss/logits": 0.17615839838981628, "loss/reg": 1.5052706003189087, "step": 1137 }, { "epoch": 0.01138, "grad_norm": 0.38408100605010986, "grad_norm_var": 0.0003940218106961842, "learning_rate": 5e-05, "loss": 0.1801, "loss/crossentropy": 2.8566765189170837, "loss/hidden": 0.0, "loss/logits": 0.18010596185922623, "loss/reg": 1.5037782192230225, "step": 1138 }, { "epoch": 0.01139, "grad_norm": 0.3781294524669647, "grad_norm_var": 0.00037984854208149933, "learning_rate": 5e-05, "loss": 0.1634, "loss/crossentropy": 2.7707727551460266, "loss/hidden": 0.0, "loss/logits": 0.16339509561657906, "loss/reg": 1.5025368928909302, "step": 1139 }, { "epoch": 0.0114, "grad_norm": 0.4109313189983368, "grad_norm_var": 0.00040868822139816053, "learning_rate": 5e-05, "loss": 0.1847, "loss/crossentropy": 2.883831739425659, "loss/hidden": 0.0, "loss/logits": 0.18474670499563217, "loss/reg": 1.50146484375, "step": 1140 }, { "epoch": 0.01141, "grad_norm": 0.3971245288848877, "grad_norm_var": 0.0004137900008258755, "learning_rate": 5e-05, "loss": 0.1926, "loss/crossentropy": 2.7795395851135254, "loss/hidden": 0.0, "loss/logits": 0.19261505082249641, "loss/reg": 1.5002492666244507, "step": 1141 }, { "epoch": 0.01142, "grad_norm": 0.4321688115596771, "grad_norm_var": 0.0005404274556179349, "learning_rate": 5e-05, "loss": 0.1898, "loss/crossentropy": 2.7451062202453613, "loss/hidden": 0.0, "loss/logits": 0.18978586420416832, "loss/reg": 1.4995501041412354, "step": 1142 }, { "epoch": 0.01143, "grad_norm": 0.4023013710975647, "grad_norm_var": 0.0005056395743542516, "learning_rate": 5e-05, "loss": 0.1771, "loss/crossentropy": 2.6546586751937866, "loss/hidden": 0.0, "loss/logits": 0.1770654208958149, "loss/reg": 1.4985374212265015, "step": 1143 }, { "epoch": 0.01144, "grad_norm": 0.4330555200576782, "grad_norm_var": 0.0005774834396144494, "learning_rate": 5e-05, "loss": 0.1908, "loss/crossentropy": 2.648827910423279, "loss/hidden": 0.0, "loss/logits": 0.1907619796693325, "loss/reg": 1.4983007907867432, "step": 1144 }, { "epoch": 0.01145, "grad_norm": 0.38340097665786743, "grad_norm_var": 0.0005528051964884484, "learning_rate": 5e-05, "loss": 0.1832, "loss/crossentropy": 2.761549949645996, "loss/hidden": 0.0, "loss/logits": 0.18321385234594345, "loss/reg": 1.4974865913391113, "step": 1145 }, { "epoch": 0.01146, "grad_norm": 0.3774440586566925, "grad_norm_var": 0.0005592043924350865, "learning_rate": 5e-05, "loss": 0.1877, "loss/crossentropy": 2.8717733025550842, "loss/hidden": 0.0, "loss/logits": 0.1877019703388214, "loss/reg": 1.4972758293151855, "step": 1146 }, { "epoch": 0.01147, "grad_norm": 0.3859359323978424, "grad_norm_var": 0.0005384773779493794, "learning_rate": 5e-05, "loss": 0.1838, "loss/crossentropy": 2.9210824966430664, "loss/hidden": 0.0, "loss/logits": 0.18384063616394997, "loss/reg": 1.4961239099502563, "step": 1147 }, { "epoch": 0.01148, "grad_norm": 0.3788585066795349, "grad_norm_var": 0.0005176612581260268, "learning_rate": 5e-05, "loss": 0.175, "loss/crossentropy": 2.8554866313934326, "loss/hidden": 0.0, "loss/logits": 0.17496761679649353, "loss/reg": 1.4948080778121948, "step": 1148 }, { "epoch": 0.01149, "grad_norm": 0.381953626871109, "grad_norm_var": 0.0004769895308362693, "learning_rate": 5e-05, "loss": 0.1828, "loss/crossentropy": 2.760767877101898, "loss/hidden": 0.0, "loss/logits": 0.182776290923357, "loss/reg": 1.49321448802948, "step": 1149 }, { "epoch": 0.0115, "grad_norm": 0.41392236948013306, "grad_norm_var": 0.0004258390348976608, "learning_rate": 5e-05, "loss": 0.1842, "loss/crossentropy": 2.617997944355011, "loss/hidden": 0.0, "loss/logits": 0.1842118427157402, "loss/reg": 1.4924064874649048, "step": 1150 }, { "epoch": 0.01151, "grad_norm": 0.40094637870788574, "grad_norm_var": 0.0003855969394288672, "learning_rate": 5e-05, "loss": 0.1935, "loss/crossentropy": 2.6930890679359436, "loss/hidden": 0.0, "loss/logits": 0.1934753768146038, "loss/reg": 1.4911280870437622, "step": 1151 }, { "epoch": 0.01152, "grad_norm": 0.4077228009700775, "grad_norm_var": 0.00036780126123923116, "learning_rate": 5e-05, "loss": 0.1885, "loss/crossentropy": 2.696079134941101, "loss/hidden": 0.0, "loss/logits": 0.18851438537240028, "loss/reg": 1.4894925355911255, "step": 1152 }, { "epoch": 0.01153, "grad_norm": 0.37663567066192627, "grad_norm_var": 0.0003570365498350214, "learning_rate": 5e-05, "loss": 0.186, "loss/crossentropy": 2.7406232357025146, "loss/hidden": 0.0, "loss/logits": 0.18599893897771835, "loss/reg": 1.4883166551589966, "step": 1153 }, { "epoch": 0.01154, "grad_norm": 0.39306846261024475, "grad_norm_var": 0.00034715706505394905, "learning_rate": 5e-05, "loss": 0.1793, "loss/crossentropy": 2.8033538460731506, "loss/hidden": 0.0, "loss/logits": 0.17927935346961021, "loss/reg": 1.4874869585037231, "step": 1154 }, { "epoch": 0.01155, "grad_norm": 0.5115792155265808, "grad_norm_var": 0.0011226610795350292, "learning_rate": 5e-05, "loss": 0.1898, "loss/crossentropy": 2.8153198957443237, "loss/hidden": 0.0, "loss/logits": 0.18982412666082382, "loss/reg": 1.4867078065872192, "step": 1155 }, { "epoch": 0.01156, "grad_norm": 0.3997136354446411, "grad_norm_var": 0.0011223134316026655, "learning_rate": 5e-05, "loss": 0.1839, "loss/crossentropy": 2.676534593105316, "loss/hidden": 0.0, "loss/logits": 0.18394503742456436, "loss/reg": 1.4859753847122192, "step": 1156 }, { "epoch": 0.01157, "grad_norm": 0.40472909808158875, "grad_norm_var": 0.0011182066388159624, "learning_rate": 5e-05, "loss": 0.2042, "loss/crossentropy": 2.7133376002311707, "loss/hidden": 0.0, "loss/logits": 0.20424646511673927, "loss/reg": 1.4849092960357666, "step": 1157 }, { "epoch": 0.01158, "grad_norm": 0.3610299527645111, "grad_norm_var": 0.0011788388166517098, "learning_rate": 5e-05, "loss": 0.1769, "loss/crossentropy": 2.7634962797164917, "loss/hidden": 0.0, "loss/logits": 0.17686418071389198, "loss/reg": 1.484142541885376, "step": 1158 }, { "epoch": 0.01159, "grad_norm": 0.4668003022670746, "grad_norm_var": 0.001452027449821891, "learning_rate": 5e-05, "loss": 0.1714, "loss/crossentropy": 2.6544492840766907, "loss/hidden": 0.0, "loss/logits": 0.17144346237182617, "loss/reg": 1.4833974838256836, "step": 1159 }, { "epoch": 0.0116, "grad_norm": 0.38263368606567383, "grad_norm_var": 0.0014209642141947163, "learning_rate": 5e-05, "loss": 0.1681, "loss/crossentropy": 2.801379144191742, "loss/hidden": 0.0, "loss/logits": 0.1680724062025547, "loss/reg": 1.4821968078613281, "step": 1160 }, { "epoch": 0.01161, "grad_norm": 0.46218374371528625, "grad_norm_var": 0.0016172066414785117, "learning_rate": 5e-05, "loss": 0.1866, "loss/crossentropy": 2.8082536458969116, "loss/hidden": 0.0, "loss/logits": 0.18662720918655396, "loss/reg": 1.4810516834259033, "step": 1161 }, { "epoch": 0.01162, "grad_norm": 0.40168115496635437, "grad_norm_var": 0.001559790115608121, "learning_rate": 5e-05, "loss": 0.1734, "loss/crossentropy": 2.7967772483825684, "loss/hidden": 0.0, "loss/logits": 0.17344212904572487, "loss/reg": 1.4801340103149414, "step": 1162 }, { "epoch": 0.01163, "grad_norm": 0.4014493227005005, "grad_norm_var": 0.0015290129465419374, "learning_rate": 5e-05, "loss": 0.1975, "loss/crossentropy": 2.7243821024894714, "loss/hidden": 0.0, "loss/logits": 0.19749024882912636, "loss/reg": 1.479145884513855, "step": 1163 }, { "epoch": 0.01164, "grad_norm": 0.3888718783855438, "grad_norm_var": 0.001494961513700081, "learning_rate": 5e-05, "loss": 0.1816, "loss/crossentropy": 2.7566738724708557, "loss/hidden": 0.0, "loss/logits": 0.18155549839138985, "loss/reg": 1.4781523942947388, "step": 1164 }, { "epoch": 0.01165, "grad_norm": 0.3523938059806824, "grad_norm_var": 0.0016588613416890366, "learning_rate": 5e-05, "loss": 0.174, "loss/crossentropy": 2.7065756916999817, "loss/hidden": 0.0, "loss/logits": 0.17404457181692123, "loss/reg": 1.4770623445510864, "step": 1165 }, { "epoch": 0.01166, "grad_norm": 0.462964802980423, "grad_norm_var": 0.0018489885102984402, "learning_rate": 5e-05, "loss": 0.1859, "loss/crossentropy": 2.975113093852997, "loss/hidden": 0.0, "loss/logits": 0.18585091456770897, "loss/reg": 1.4751616716384888, "step": 1166 }, { "epoch": 0.01167, "grad_norm": 0.5501847863197327, "grad_norm_var": 0.0030429283606156414, "learning_rate": 5e-05, "loss": 0.2033, "loss/crossentropy": 2.8122783303260803, "loss/hidden": 0.0, "loss/logits": 0.20325875282287598, "loss/reg": 1.4739700555801392, "step": 1167 }, { "epoch": 0.01168, "grad_norm": 0.43727031350135803, "grad_norm_var": 0.0030482293912123463, "learning_rate": 5e-05, "loss": 0.1871, "loss/crossentropy": 2.6004568338394165, "loss/hidden": 0.0, "loss/logits": 0.18711896240711212, "loss/reg": 1.472339153289795, "step": 1168 }, { "epoch": 0.01169, "grad_norm": 0.5093014240264893, "grad_norm_var": 0.003344487550155887, "learning_rate": 5e-05, "loss": 0.204, "loss/crossentropy": 2.8668264746665955, "loss/hidden": 0.0, "loss/logits": 0.2039630264043808, "loss/reg": 1.471176028251648, "step": 1169 }, { "epoch": 0.0117, "grad_norm": 0.3894696831703186, "grad_norm_var": 0.003363193736657033, "learning_rate": 5e-05, "loss": 0.1803, "loss/crossentropy": 2.8249135613441467, "loss/hidden": 0.0, "loss/logits": 0.18029743060469627, "loss/reg": 1.46927011013031, "step": 1170 }, { "epoch": 0.01171, "grad_norm": 0.4055262506008148, "grad_norm_var": 0.0029145778475038226, "learning_rate": 5e-05, "loss": 0.1911, "loss/crossentropy": 2.7669249773025513, "loss/hidden": 0.0, "loss/logits": 0.1910816915333271, "loss/reg": 1.4679526090621948, "step": 1171 }, { "epoch": 0.01172, "grad_norm": 0.36875951290130615, "grad_norm_var": 0.0030726867573331244, "learning_rate": 5e-05, "loss": 0.1749, "loss/crossentropy": 2.7718464732170105, "loss/hidden": 0.0, "loss/logits": 0.17493540793657303, "loss/reg": 1.4658503532409668, "step": 1172 }, { "epoch": 0.01173, "grad_norm": 0.42354193329811096, "grad_norm_var": 0.003052543245601517, "learning_rate": 5e-05, "loss": 0.1855, "loss/crossentropy": 2.719308376312256, "loss/hidden": 0.0, "loss/logits": 0.1855236478149891, "loss/reg": 1.464747667312622, "step": 1173 }, { "epoch": 0.01174, "grad_norm": 0.39868998527526855, "grad_norm_var": 0.0028312487941498285, "learning_rate": 5e-05, "loss": 0.1683, "loss/crossentropy": 2.8142080307006836, "loss/hidden": 0.0, "loss/logits": 0.16827983036637306, "loss/reg": 1.4631402492523193, "step": 1174 }, { "epoch": 0.01175, "grad_norm": 0.3635683059692383, "grad_norm_var": 0.002923433007255775, "learning_rate": 5e-05, "loss": 0.1691, "loss/crossentropy": 2.8140381574630737, "loss/hidden": 0.0, "loss/logits": 0.16913769394159317, "loss/reg": 1.461613655090332, "step": 1175 }, { "epoch": 0.01176, "grad_norm": 0.4067941904067993, "grad_norm_var": 0.0028438749166883564, "learning_rate": 5e-05, "loss": 0.1805, "loss/crossentropy": 2.74730122089386, "loss/hidden": 0.0, "loss/logits": 0.1805243194103241, "loss/reg": 1.4598890542984009, "step": 1176 }, { "epoch": 0.01177, "grad_norm": 0.4573166072368622, "grad_norm_var": 0.002818087802214993, "learning_rate": 5e-05, "loss": 0.1958, "loss/crossentropy": 2.6282488107681274, "loss/hidden": 0.0, "loss/logits": 0.19583966955542564, "loss/reg": 1.458559513092041, "step": 1177 }, { "epoch": 0.01178, "grad_norm": 0.41611266136169434, "grad_norm_var": 0.0027961219454357995, "learning_rate": 5e-05, "loss": 0.1893, "loss/crossentropy": 2.8419344425201416, "loss/hidden": 0.0, "loss/logits": 0.1892707683146, "loss/reg": 1.4577357769012451, "step": 1178 }, { "epoch": 0.01179, "grad_norm": 0.4139103293418884, "grad_norm_var": 0.002773736915110033, "learning_rate": 5e-05, "loss": 0.1878, "loss/crossentropy": 2.8046358227729797, "loss/hidden": 0.0, "loss/logits": 0.18783502280712128, "loss/reg": 1.457064151763916, "step": 1179 }, { "epoch": 0.0118, "grad_norm": 0.39083123207092285, "grad_norm_var": 0.002765441807365839, "learning_rate": 5e-05, "loss": 0.1937, "loss/crossentropy": 2.7833199501037598, "loss/hidden": 0.0, "loss/logits": 0.1936798058450222, "loss/reg": 1.4564727544784546, "step": 1180 }, { "epoch": 0.01181, "grad_norm": 0.37335360050201416, "grad_norm_var": 0.0025993115992857365, "learning_rate": 5e-05, "loss": 0.177, "loss/crossentropy": 2.9156389832496643, "loss/hidden": 0.0, "loss/logits": 0.17697978019714355, "loss/reg": 1.4559135437011719, "step": 1181 }, { "epoch": 0.01182, "grad_norm": 0.3955034911632538, "grad_norm_var": 0.0025240464809215823, "learning_rate": 5e-05, "loss": 0.1987, "loss/crossentropy": 2.8472283482551575, "loss/hidden": 0.0, "loss/logits": 0.19867418706417084, "loss/reg": 1.4550074338912964, "step": 1182 }, { "epoch": 0.01183, "grad_norm": 0.4038008451461792, "grad_norm_var": 0.0012981508534220532, "learning_rate": 5e-05, "loss": 0.1893, "loss/crossentropy": 2.806343376636505, "loss/hidden": 0.0, "loss/logits": 0.18930381163954735, "loss/reg": 1.454715609550476, "step": 1183 }, { "epoch": 0.01184, "grad_norm": 0.5491846203804016, "grad_norm_var": 0.0024937052353730376, "learning_rate": 5e-05, "loss": 0.1811, "loss/crossentropy": 2.7931687235832214, "loss/hidden": 0.0, "loss/logits": 0.18108559772372246, "loss/reg": 1.454012155532837, "step": 1184 }, { "epoch": 0.01185, "grad_norm": 0.3679744601249695, "grad_norm_var": 0.0019952852149375476, "learning_rate": 5e-05, "loss": 0.1894, "loss/crossentropy": 2.670843780040741, "loss/hidden": 0.0, "loss/logits": 0.18943195790052414, "loss/reg": 1.4539992809295654, "step": 1185 }, { "epoch": 0.01186, "grad_norm": 0.5347173810005188, "grad_norm_var": 0.0029594091193365646, "learning_rate": 5e-05, "loss": 0.1931, "loss/crossentropy": 2.81303608417511, "loss/hidden": 0.0, "loss/logits": 0.19309942051768303, "loss/reg": 1.4531627893447876, "step": 1186 }, { "epoch": 0.01187, "grad_norm": 0.35835427045822144, "grad_norm_var": 0.003169699938894862, "learning_rate": 5e-05, "loss": 0.1811, "loss/crossentropy": 2.8339531421661377, "loss/hidden": 0.0, "loss/logits": 0.1810522824525833, "loss/reg": 1.452269196510315, "step": 1187 }, { "epoch": 0.01188, "grad_norm": 0.3567518889904022, "grad_norm_var": 0.0032509833875422464, "learning_rate": 5e-05, "loss": 0.1743, "loss/crossentropy": 2.765186607837677, "loss/hidden": 0.0, "loss/logits": 0.17427153512835503, "loss/reg": 1.450973391532898, "step": 1188 }, { "epoch": 0.01189, "grad_norm": 0.3796020746231079, "grad_norm_var": 0.0033107722836770177, "learning_rate": 5e-05, "loss": 0.177, "loss/crossentropy": 2.9006309509277344, "loss/hidden": 0.0, "loss/logits": 0.1769685558974743, "loss/reg": 1.4492751359939575, "step": 1189 }, { "epoch": 0.0119, "grad_norm": 0.44529664516448975, "grad_norm_var": 0.0033737393452274926, "learning_rate": 5e-05, "loss": 0.1789, "loss/crossentropy": 2.7190520763397217, "loss/hidden": 0.0, "loss/logits": 0.1789361834526062, "loss/reg": 1.4478099346160889, "step": 1190 }, { "epoch": 0.01191, "grad_norm": 0.3916754424571991, "grad_norm_var": 0.003236675787769731, "learning_rate": 5e-05, "loss": 0.1817, "loss/crossentropy": 2.8190470933914185, "loss/hidden": 0.0, "loss/logits": 0.18166551738977432, "loss/reg": 1.4454478025436401, "step": 1191 }, { "epoch": 0.01192, "grad_norm": 0.41403406858444214, "grad_norm_var": 0.0032319593928060424, "learning_rate": 5e-05, "loss": 0.1816, "loss/crossentropy": 2.867457687854767, "loss/hidden": 0.0, "loss/logits": 0.18155980482697487, "loss/reg": 1.4430103302001953, "step": 1192 }, { "epoch": 0.01193, "grad_norm": 0.38379180431365967, "grad_norm_var": 0.0031601439954389837, "learning_rate": 5e-05, "loss": 0.1857, "loss/crossentropy": 2.715328276157379, "loss/hidden": 0.0, "loss/logits": 0.18566227331757545, "loss/reg": 1.441053867340088, "step": 1193 }, { "epoch": 0.01194, "grad_norm": 0.37299588322639465, "grad_norm_var": 0.0032465457322637874, "learning_rate": 5e-05, "loss": 0.1708, "loss/crossentropy": 2.7711251378059387, "loss/hidden": 0.0, "loss/logits": 0.17077474668622017, "loss/reg": 1.4382448196411133, "step": 1194 }, { "epoch": 0.01195, "grad_norm": 0.40252241492271423, "grad_norm_var": 0.00324603537587758, "learning_rate": 5e-05, "loss": 0.1955, "loss/crossentropy": 2.7185396552085876, "loss/hidden": 0.0, "loss/logits": 0.19552023708820343, "loss/reg": 1.43511164188385, "step": 1195 }, { "epoch": 0.01196, "grad_norm": 0.37733253836631775, "grad_norm_var": 0.0032874685602436522, "learning_rate": 5e-05, "loss": 0.1874, "loss/crossentropy": 2.8184019327163696, "loss/hidden": 0.0, "loss/logits": 0.18738915398716927, "loss/reg": 1.4331696033477783, "step": 1196 }, { "epoch": 0.01197, "grad_norm": 0.39589473605155945, "grad_norm_var": 0.003219060852671751, "learning_rate": 5e-05, "loss": 0.1866, "loss/crossentropy": 2.7010214924812317, "loss/hidden": 0.0, "loss/logits": 0.186626598238945, "loss/reg": 1.4316699504852295, "step": 1197 }, { "epoch": 0.01198, "grad_norm": 0.4191206693649292, "grad_norm_var": 0.0032142886338114205, "learning_rate": 5e-05, "loss": 0.1919, "loss/crossentropy": 2.909880578517914, "loss/hidden": 0.0, "loss/logits": 0.19187119230628014, "loss/reg": 1.4303897619247437, "step": 1198 }, { "epoch": 0.01199, "grad_norm": 0.4035413861274719, "grad_norm_var": 0.0032144922705757157, "learning_rate": 5e-05, "loss": 0.1846, "loss/crossentropy": 2.6426594257354736, "loss/hidden": 0.0, "loss/logits": 0.1845902055501938, "loss/reg": 1.4290152788162231, "step": 1199 }, { "epoch": 0.012, "grad_norm": 0.36857399344444275, "grad_norm_var": 0.0018906405469929315, "learning_rate": 5e-05, "loss": 0.1689, "loss/crossentropy": 2.787937879562378, "loss/hidden": 0.0, "loss/logits": 0.16886601597070694, "loss/reg": 1.4279104471206665, "step": 1200 }, { "epoch": 0.01201, "grad_norm": 0.3827061653137207, "grad_norm_var": 0.0018447143939107808, "learning_rate": 5e-05, "loss": 0.184, "loss/crossentropy": 2.6812090277671814, "loss/hidden": 0.0, "loss/logits": 0.18400828912854195, "loss/reg": 1.4265459775924683, "step": 1201 }, { "epoch": 0.01202, "grad_norm": 0.3506307005882263, "grad_norm_var": 0.0006360064193142537, "learning_rate": 5e-05, "loss": 0.1719, "loss/crossentropy": 2.71059513092041, "loss/hidden": 0.0, "loss/logits": 0.17186394706368446, "loss/reg": 1.4260975122451782, "step": 1202 }, { "epoch": 0.01203, "grad_norm": 0.3920105993747711, "grad_norm_var": 0.0005752191941902721, "learning_rate": 5e-05, "loss": 0.1798, "loss/crossentropy": 2.7090110182762146, "loss/hidden": 0.0, "loss/logits": 0.1797771342098713, "loss/reg": 1.4251253604888916, "step": 1203 }, { "epoch": 0.01204, "grad_norm": 0.3974657356739044, "grad_norm_var": 0.000499526406805432, "learning_rate": 5e-05, "loss": 0.1874, "loss/crossentropy": 2.701002776622772, "loss/hidden": 0.0, "loss/logits": 0.18736432120203972, "loss/reg": 1.4243783950805664, "step": 1204 }, { "epoch": 0.01205, "grad_norm": 0.42775383591651917, "grad_norm_var": 0.0005627563087383103, "learning_rate": 5e-05, "loss": 0.2098, "loss/crossentropy": 2.734471619129181, "loss/hidden": 0.0, "loss/logits": 0.2097911797463894, "loss/reg": 1.4237735271453857, "step": 1205 }, { "epoch": 0.01206, "grad_norm": 0.40280991792678833, "grad_norm_var": 0.0003925441234761018, "learning_rate": 5e-05, "loss": 0.204, "loss/crossentropy": 2.649215042591095, "loss/hidden": 0.0, "loss/logits": 0.2039887085556984, "loss/reg": 1.4232732057571411, "step": 1206 }, { "epoch": 0.01207, "grad_norm": 0.4008637070655823, "grad_norm_var": 0.00039659149065429666, "learning_rate": 5e-05, "loss": 0.1817, "loss/crossentropy": 2.7435959577560425, "loss/hidden": 0.0, "loss/logits": 0.1816614270210266, "loss/reg": 1.42289137840271, "step": 1207 }, { "epoch": 0.01208, "grad_norm": 0.38267508149147034, "grad_norm_var": 0.0003711633927555173, "learning_rate": 5e-05, "loss": 0.1742, "loss/crossentropy": 2.8640989661216736, "loss/hidden": 0.0, "loss/logits": 0.17420271039009094, "loss/reg": 1.422437071800232, "step": 1208 }, { "epoch": 0.01209, "grad_norm": 0.3714866638183594, "grad_norm_var": 0.00039293414504885845, "learning_rate": 5e-05, "loss": 0.169, "loss/crossentropy": 2.746426820755005, "loss/hidden": 0.0, "loss/logits": 0.16898731514811516, "loss/reg": 1.42264723777771, "step": 1209 }, { "epoch": 0.0121, "grad_norm": 0.3865947723388672, "grad_norm_var": 0.00037271053118719997, "learning_rate": 5e-05, "loss": 0.1848, "loss/crossentropy": 2.849126398563385, "loss/hidden": 0.0, "loss/logits": 0.18478216230869293, "loss/reg": 1.4227102994918823, "step": 1210 }, { "epoch": 0.01211, "grad_norm": 0.46436965465545654, "grad_norm_var": 0.0007037118140789371, "learning_rate": 5e-05, "loss": 0.1964, "loss/crossentropy": 2.891853094100952, "loss/hidden": 0.0, "loss/logits": 0.1963624656200409, "loss/reg": 1.421623945236206, "step": 1211 }, { "epoch": 0.01212, "grad_norm": 0.3868556618690491, "grad_norm_var": 0.0006866427169156234, "learning_rate": 5e-05, "loss": 0.185, "loss/crossentropy": 2.7030688524246216, "loss/hidden": 0.0, "loss/logits": 0.18502762913703918, "loss/reg": 1.4205013513565063, "step": 1212 }, { "epoch": 0.01213, "grad_norm": 0.5466204285621643, "grad_norm_var": 0.002107741306228805, "learning_rate": 5e-05, "loss": 0.2321, "loss/crossentropy": 2.8207810521125793, "loss/hidden": 0.0, "loss/logits": 0.23205070197582245, "loss/reg": 1.4198737144470215, "step": 1213 }, { "epoch": 0.01214, "grad_norm": 0.36891767382621765, "grad_norm_var": 0.0021724490893943573, "learning_rate": 5e-05, "loss": 0.1751, "loss/crossentropy": 2.765327572822571, "loss/hidden": 0.0, "loss/logits": 0.17510851845145226, "loss/reg": 1.4195406436920166, "step": 1214 }, { "epoch": 0.01215, "grad_norm": 0.38599929213523865, "grad_norm_var": 0.0021883509252218604, "learning_rate": 5e-05, "loss": 0.1777, "loss/crossentropy": 2.7295534014701843, "loss/hidden": 0.0, "loss/logits": 0.1777309849858284, "loss/reg": 1.4191190004348755, "step": 1215 }, { "epoch": 0.01216, "grad_norm": 0.3923039436340332, "grad_norm_var": 0.002120883638911231, "learning_rate": 5e-05, "loss": 0.1735, "loss/crossentropy": 2.775855779647827, "loss/hidden": 0.0, "loss/logits": 0.17348914593458176, "loss/reg": 1.4182158708572388, "step": 1216 }, { "epoch": 0.01217, "grad_norm": 0.38923102617263794, "grad_norm_var": 0.0021063207621189258, "learning_rate": 5e-05, "loss": 0.1852, "loss/crossentropy": 2.6477224826812744, "loss/hidden": 0.0, "loss/logits": 0.18519454449415207, "loss/reg": 1.4171086549758911, "step": 1217 }, { "epoch": 0.01218, "grad_norm": 0.38138842582702637, "grad_norm_var": 0.0019510417841006008, "learning_rate": 5e-05, "loss": 0.189, "loss/crossentropy": 2.6555696725845337, "loss/hidden": 0.0, "loss/logits": 0.1890222169458866, "loss/reg": 1.415981650352478, "step": 1218 }, { "epoch": 0.01219, "grad_norm": 0.4084736108779907, "grad_norm_var": 0.001939832634857904, "learning_rate": 5e-05, "loss": 0.1835, "loss/crossentropy": 2.818666934967041, "loss/hidden": 0.0, "loss/logits": 0.18353594094514847, "loss/reg": 1.4153097867965698, "step": 1219 }, { "epoch": 0.0122, "grad_norm": 0.4247463345527649, "grad_norm_var": 0.0019558024315882013, "learning_rate": 5e-05, "loss": 0.1755, "loss/crossentropy": 2.7127944231033325, "loss/hidden": 0.0, "loss/logits": 0.17549164965748787, "loss/reg": 1.4142556190490723, "step": 1220 }, { "epoch": 0.01221, "grad_norm": 0.40716880559921265, "grad_norm_var": 0.0019268832744298062, "learning_rate": 5e-05, "loss": 0.1938, "loss/crossentropy": 2.643829822540283, "loss/hidden": 0.0, "loss/logits": 0.19382373616099358, "loss/reg": 1.4136812686920166, "step": 1221 }, { "epoch": 0.01222, "grad_norm": 0.35272741317749023, "grad_norm_var": 0.0021068318421432236, "learning_rate": 5e-05, "loss": 0.1718, "loss/crossentropy": 2.6937798261642456, "loss/hidden": 0.0, "loss/logits": 0.17175516858696938, "loss/reg": 1.4124233722686768, "step": 1222 }, { "epoch": 0.01223, "grad_norm": 0.3460560441017151, "grad_norm_var": 0.002311292127889427, "learning_rate": 5e-05, "loss": 0.1711, "loss/crossentropy": 2.764497935771942, "loss/hidden": 0.0, "loss/logits": 0.17111336812376976, "loss/reg": 1.4124194383621216, "step": 1223 }, { "epoch": 0.01224, "grad_norm": 0.38824766874313354, "grad_norm_var": 0.00230056400932727, "learning_rate": 5e-05, "loss": 0.1841, "loss/crossentropy": 2.713408052921295, "loss/hidden": 0.0, "loss/logits": 0.18410339578986168, "loss/reg": 1.412917971611023, "step": 1224 }, { "epoch": 0.01225, "grad_norm": 0.3764731287956238, "grad_norm_var": 0.002283111285856387, "learning_rate": 5e-05, "loss": 0.1815, "loss/crossentropy": 2.8800466656684875, "loss/hidden": 0.0, "loss/logits": 0.1814701072871685, "loss/reg": 1.4119223356246948, "step": 1225 }, { "epoch": 0.01226, "grad_norm": 0.35486990213394165, "grad_norm_var": 0.0024043515928512467, "learning_rate": 5e-05, "loss": 0.1721, "loss/crossentropy": 2.864288032054901, "loss/hidden": 0.0, "loss/logits": 0.1720508709549904, "loss/reg": 1.410658359527588, "step": 1226 }, { "epoch": 0.01227, "grad_norm": 0.3798040449619293, "grad_norm_var": 0.0021075098216082667, "learning_rate": 5e-05, "loss": 0.186, "loss/crossentropy": 2.800125002861023, "loss/hidden": 0.0, "loss/logits": 0.18600793182849884, "loss/reg": 1.4104071855545044, "step": 1227 }, { "epoch": 0.01228, "grad_norm": 0.38487929105758667, "grad_norm_var": 0.002109404100500737, "learning_rate": 5e-05, "loss": 0.187, "loss/crossentropy": 2.7868515253067017, "loss/hidden": 0.0, "loss/logits": 0.18701008334755898, "loss/reg": 1.4095227718353271, "step": 1228 }, { "epoch": 0.01229, "grad_norm": 0.45511749386787415, "grad_norm_var": 0.0007584030638864074, "learning_rate": 5e-05, "loss": 0.1985, "loss/crossentropy": 2.6571977138519287, "loss/hidden": 0.0, "loss/logits": 0.19848217070102692, "loss/reg": 1.4084324836730957, "step": 1229 }, { "epoch": 0.0123, "grad_norm": 0.6594117879867554, "grad_norm_var": 0.005321544010225898, "learning_rate": 5e-05, "loss": 0.2011, "loss/crossentropy": 2.8001416325569153, "loss/hidden": 0.0, "loss/logits": 0.20106521993875504, "loss/reg": 1.4072535037994385, "step": 1230 }, { "epoch": 0.01231, "grad_norm": 0.43426990509033203, "grad_norm_var": 0.005342107314555718, "learning_rate": 5e-05, "loss": 0.2027, "loss/crossentropy": 2.7976555228233337, "loss/hidden": 0.0, "loss/logits": 0.20270870998501778, "loss/reg": 1.4066628217697144, "step": 1231 }, { "epoch": 0.01232, "grad_norm": 0.7469918131828308, "grad_norm_var": 0.01244134254394691, "learning_rate": 5e-05, "loss": 0.2375, "loss/crossentropy": 2.800569176673889, "loss/hidden": 0.0, "loss/logits": 0.23753388598561287, "loss/reg": 1.4050111770629883, "step": 1232 }, { "epoch": 0.01233, "grad_norm": 0.36666056513786316, "grad_norm_var": 0.012597725507063546, "learning_rate": 5e-05, "loss": 0.1755, "loss/crossentropy": 2.8041948676109314, "loss/hidden": 0.0, "loss/logits": 0.17551551386713982, "loss/reg": 1.4047677516937256, "step": 1233 }, { "epoch": 0.01234, "grad_norm": 0.3527641296386719, "grad_norm_var": 0.01283143182770274, "learning_rate": 5e-05, "loss": 0.1714, "loss/crossentropy": 2.6635568737983704, "loss/hidden": 0.0, "loss/logits": 0.17143995314836502, "loss/reg": 1.4030920267105103, "step": 1234 }, { "epoch": 0.01235, "grad_norm": 0.40449002385139465, "grad_norm_var": 0.012842484989278431, "learning_rate": 5e-05, "loss": 0.1751, "loss/crossentropy": 2.6857893466949463, "loss/hidden": 0.0, "loss/logits": 0.17512645572423935, "loss/reg": 1.4026522636413574, "step": 1235 }, { "epoch": 0.01236, "grad_norm": 0.3718612492084503, "grad_norm_var": 0.013034358750853499, "learning_rate": 5e-05, "loss": 0.1673, "loss/crossentropy": 2.8389678597450256, "loss/hidden": 0.0, "loss/logits": 0.16729874536395073, "loss/reg": 1.401383876800537, "step": 1236 }, { "epoch": 0.01237, "grad_norm": 0.4983327388763428, "grad_norm_var": 0.013350877741256121, "learning_rate": 5e-05, "loss": 0.2026, "loss/crossentropy": 2.7713358998298645, "loss/hidden": 0.0, "loss/logits": 0.20256582275032997, "loss/reg": 1.4005975723266602, "step": 1237 }, { "epoch": 0.01238, "grad_norm": 0.441445916891098, "grad_norm_var": 0.012933952665905564, "learning_rate": 5e-05, "loss": 0.1857, "loss/crossentropy": 2.8286274671554565, "loss/hidden": 0.0, "loss/logits": 0.18565155193209648, "loss/reg": 1.3995530605316162, "step": 1238 }, { "epoch": 0.01239, "grad_norm": 0.3901333808898926, "grad_norm_var": 0.012532041194226077, "learning_rate": 5e-05, "loss": 0.183, "loss/crossentropy": 2.911182940006256, "loss/hidden": 0.0, "loss/logits": 0.18296772241592407, "loss/reg": 1.3976836204528809, "step": 1239 }, { "epoch": 0.0124, "grad_norm": 0.39856791496276855, "grad_norm_var": 0.012470430313853701, "learning_rate": 5e-05, "loss": 0.1963, "loss/crossentropy": 2.6836057305336, "loss/hidden": 0.0, "loss/logits": 0.19625534117221832, "loss/reg": 1.396618366241455, "step": 1240 }, { "epoch": 0.01241, "grad_norm": 0.41100946068763733, "grad_norm_var": 0.012259332529219746, "learning_rate": 5e-05, "loss": 0.1932, "loss/crossentropy": 2.6196849942207336, "loss/hidden": 0.0, "loss/logits": 0.19323748722672462, "loss/reg": 1.395609736442566, "step": 1241 }, { "epoch": 0.01242, "grad_norm": 0.348229318857193, "grad_norm_var": 0.012338050864381422, "learning_rate": 5e-05, "loss": 0.1718, "loss/crossentropy": 2.775640070438385, "loss/hidden": 0.0, "loss/logits": 0.17175709828734398, "loss/reg": 1.3951023817062378, "step": 1242 }, { "epoch": 0.01243, "grad_norm": 1.2390751838684082, "grad_norm_var": 0.05155969127554597, "learning_rate": 5e-05, "loss": 0.2159, "loss/crossentropy": 2.780874252319336, "loss/hidden": 0.0, "loss/logits": 0.21593540906906128, "loss/reg": 1.3937615156173706, "step": 1243 }, { "epoch": 0.01244, "grad_norm": 0.4731004238128662, "grad_norm_var": 0.050763118391303465, "learning_rate": 5e-05, "loss": 0.1825, "loss/crossentropy": 2.669394373893738, "loss/hidden": 0.0, "loss/logits": 0.1825188212096691, "loss/reg": 1.392451524734497, "step": 1244 }, { "epoch": 0.01245, "grad_norm": 0.43558576703071594, "grad_norm_var": 0.050902455998128635, "learning_rate": 5e-05, "loss": 0.1864, "loss/crossentropy": 2.6458783745765686, "loss/hidden": 0.0, "loss/logits": 0.1863899528980255, "loss/reg": 1.3913257122039795, "step": 1245 }, { "epoch": 0.01246, "grad_norm": 0.43400633335113525, "grad_norm_var": 0.049234233763389666, "learning_rate": 5e-05, "loss": 0.183, "loss/crossentropy": 2.8432045578956604, "loss/hidden": 0.0, "loss/logits": 0.18295767530798912, "loss/reg": 1.3903818130493164, "step": 1246 }, { "epoch": 0.01247, "grad_norm": 0.43098875880241394, "grad_norm_var": 0.049256731879161936, "learning_rate": 5e-05, "loss": 0.1935, "loss/crossentropy": 2.7787997722625732, "loss/hidden": 0.0, "loss/logits": 0.19347602501511574, "loss/reg": 1.3895725011825562, "step": 1247 }, { "epoch": 0.01248, "grad_norm": 1.1000005006790161, "grad_norm_var": 0.06942585731693722, "learning_rate": 5e-05, "loss": 0.2263, "loss/crossentropy": 2.7730624675750732, "loss/hidden": 0.0, "loss/logits": 0.22629190236330032, "loss/reg": 1.3885997533798218, "step": 1248 }, { "epoch": 0.01249, "grad_norm": 0.449535071849823, "grad_norm_var": 0.06831525341155571, "learning_rate": 5e-05, "loss": 0.1994, "loss/crossentropy": 2.7426183223724365, "loss/hidden": 0.0, "loss/logits": 0.1994258612394333, "loss/reg": 1.3879303932189941, "step": 1249 }, { "epoch": 0.0125, "grad_norm": 0.3891923129558563, "grad_norm_var": 0.0676286766494714, "learning_rate": 5e-05, "loss": 0.1764, "loss/crossentropy": 2.7025153636932373, "loss/hidden": 0.0, "loss/logits": 0.1763673946261406, "loss/reg": 1.3863074779510498, "step": 1250 }, { "epoch": 0.01251, "grad_norm": 0.39398977160453796, "grad_norm_var": 0.06778814624374416, "learning_rate": 5e-05, "loss": 0.1842, "loss/crossentropy": 2.850652813911438, "loss/hidden": 0.0, "loss/logits": 0.18415121361613274, "loss/reg": 1.3853391408920288, "step": 1251 }, { "epoch": 0.01252, "grad_norm": 0.495724618434906, "grad_norm_var": 0.06641914754466918, "learning_rate": 5e-05, "loss": 0.1964, "loss/crossentropy": 2.679881453514099, "loss/hidden": 0.0, "loss/logits": 0.19643474370241165, "loss/reg": 1.3849084377288818, "step": 1252 }, { "epoch": 0.01253, "grad_norm": 0.4119850695133209, "grad_norm_var": 0.06714101490369723, "learning_rate": 5e-05, "loss": 0.1897, "loss/crossentropy": 2.750667691230774, "loss/hidden": 0.0, "loss/logits": 0.1896573342382908, "loss/reg": 1.3841875791549683, "step": 1253 }, { "epoch": 0.01254, "grad_norm": 0.41677817702293396, "grad_norm_var": 0.06742149598287875, "learning_rate": 5e-05, "loss": 0.1949, "loss/crossentropy": 2.7512380480766296, "loss/hidden": 0.0, "loss/logits": 0.1948574222624302, "loss/reg": 1.3836725950241089, "step": 1254 }, { "epoch": 0.01255, "grad_norm": 0.3882921040058136, "grad_norm_var": 0.06745202400909417, "learning_rate": 5e-05, "loss": 0.1828, "loss/crossentropy": 2.830279588699341, "loss/hidden": 0.0, "loss/logits": 0.18280091881752014, "loss/reg": 1.3836078643798828, "step": 1255 }, { "epoch": 0.01256, "grad_norm": 0.41673871874809265, "grad_norm_var": 0.06719419648756339, "learning_rate": 5e-05, "loss": 0.2026, "loss/crossentropy": 2.7475191354751587, "loss/hidden": 0.0, "loss/logits": 0.20259103178977966, "loss/reg": 1.3826485872268677, "step": 1256 }, { "epoch": 0.01257, "grad_norm": 0.585350751876831, "grad_norm_var": 0.06668494479683436, "learning_rate": 5e-05, "loss": 0.2359, "loss/crossentropy": 2.9428182244300842, "loss/hidden": 0.0, "loss/logits": 0.23591554537415504, "loss/reg": 1.381587266921997, "step": 1257 }, { "epoch": 0.01258, "grad_norm": 0.42391830682754517, "grad_norm_var": 0.06525364309366323, "learning_rate": 5e-05, "loss": 0.203, "loss/crossentropy": 2.6863314509391785, "loss/hidden": 0.0, "loss/logits": 0.20304660126566887, "loss/reg": 1.3804073333740234, "step": 1258 }, { "epoch": 0.01259, "grad_norm": 0.3636891841888428, "grad_norm_var": 0.030416591644257748, "learning_rate": 5e-05, "loss": 0.1812, "loss/crossentropy": 2.770876467227936, "loss/hidden": 0.0, "loss/logits": 0.18117986992001534, "loss/reg": 1.3794111013412476, "step": 1259 }, { "epoch": 0.0126, "grad_norm": 0.4120618999004364, "grad_norm_var": 0.030669422375767422, "learning_rate": 5e-05, "loss": 0.1856, "loss/crossentropy": 2.6826838850975037, "loss/hidden": 0.0, "loss/logits": 0.18561375886201859, "loss/reg": 1.378712773323059, "step": 1260 }, { "epoch": 0.01261, "grad_norm": 0.38935086131095886, "grad_norm_var": 0.0310259038505535, "learning_rate": 5e-05, "loss": 0.1873, "loss/crossentropy": 2.8777846097946167, "loss/hidden": 0.0, "loss/logits": 0.18731402978301048, "loss/reg": 1.3772296905517578, "step": 1261 }, { "epoch": 0.01262, "grad_norm": 0.3571130335330963, "grad_norm_var": 0.031752674237896614, "learning_rate": 5e-05, "loss": 0.1746, "loss/crossentropy": 2.8309271931648254, "loss/hidden": 0.0, "loss/logits": 0.17464854568243027, "loss/reg": 1.3755420446395874, "step": 1262 }, { "epoch": 0.01263, "grad_norm": 0.4016406536102295, "grad_norm_var": 0.0319358552762881, "learning_rate": 5e-05, "loss": 0.1905, "loss/crossentropy": 2.8556240797042847, "loss/hidden": 0.0, "loss/logits": 0.19051608070731163, "loss/reg": 1.3745484352111816, "step": 1263 }, { "epoch": 0.01264, "grad_norm": 0.4311268627643585, "grad_norm_var": 0.003017690530940461, "learning_rate": 5e-05, "loss": 0.2093, "loss/crossentropy": 2.760189950466156, "loss/hidden": 0.0, "loss/logits": 0.20933211222290993, "loss/reg": 1.3732630014419556, "step": 1264 }, { "epoch": 0.01265, "grad_norm": 0.38528862595558167, "grad_norm_var": 0.0030261360436078005, "learning_rate": 5e-05, "loss": 0.1821, "loss/crossentropy": 2.698637902736664, "loss/hidden": 0.0, "loss/logits": 0.18210280314087868, "loss/reg": 1.371398687362671, "step": 1265 }, { "epoch": 0.01266, "grad_norm": 0.35614868998527527, "grad_norm_var": 0.0032142068850269786, "learning_rate": 5e-05, "loss": 0.1788, "loss/crossentropy": 2.8088836073875427, "loss/hidden": 0.0, "loss/logits": 0.17882408946752548, "loss/reg": 1.3697328567504883, "step": 1266 }, { "epoch": 0.01267, "grad_norm": 0.36676737666130066, "grad_norm_var": 0.0033343322691377925, "learning_rate": 5e-05, "loss": 0.1819, "loss/crossentropy": 2.6373648643493652, "loss/hidden": 0.0, "loss/logits": 0.18188165128231049, "loss/reg": 1.3673834800720215, "step": 1267 }, { "epoch": 0.01268, "grad_norm": 0.39788973331451416, "grad_norm_var": 0.0028485353302448763, "learning_rate": 5e-05, "loss": 0.1929, "loss/crossentropy": 2.72048681974411, "loss/hidden": 0.0, "loss/logits": 0.19285759702324867, "loss/reg": 1.3652880191802979, "step": 1268 }, { "epoch": 0.01269, "grad_norm": 0.39289984107017517, "grad_norm_var": 0.0028573651350930524, "learning_rate": 5e-05, "loss": 0.1965, "loss/crossentropy": 2.7955026030540466, "loss/hidden": 0.0, "loss/logits": 0.19650669395923615, "loss/reg": 1.3635308742523193, "step": 1269 }, { "epoch": 0.0127, "grad_norm": 0.3635425567626953, "grad_norm_var": 0.002953132085538002, "learning_rate": 5e-05, "loss": 0.1756, "loss/crossentropy": 2.7407087683677673, "loss/hidden": 0.0, "loss/logits": 0.17556580528616905, "loss/reg": 1.3616644144058228, "step": 1270 }, { "epoch": 0.01271, "grad_norm": 0.4003201723098755, "grad_norm_var": 0.0029402084248949262, "learning_rate": 5e-05, "loss": 0.1871, "loss/crossentropy": 2.6960673928260803, "loss/hidden": 0.0, "loss/logits": 0.1870543658733368, "loss/reg": 1.3603110313415527, "step": 1271 }, { "epoch": 0.01272, "grad_norm": 0.3728436529636383, "grad_norm_var": 0.002978704676407059, "learning_rate": 5e-05, "loss": 0.1764, "loss/crossentropy": 2.7935059666633606, "loss/hidden": 0.0, "loss/logits": 0.1764380931854248, "loss/reg": 1.3583228588104248, "step": 1272 }, { "epoch": 0.01273, "grad_norm": 0.3922203779220581, "grad_norm_var": 0.0005369219153182336, "learning_rate": 5e-05, "loss": 0.1933, "loss/crossentropy": 2.9126545190811157, "loss/hidden": 0.0, "loss/logits": 0.1932995654642582, "loss/reg": 1.3570079803466797, "step": 1273 }, { "epoch": 0.01274, "grad_norm": 0.3662412762641907, "grad_norm_var": 0.0004680491238065465, "learning_rate": 5e-05, "loss": 0.1763, "loss/crossentropy": 2.8062674403190613, "loss/hidden": 0.0, "loss/logits": 0.17627891525626183, "loss/reg": 1.3558125495910645, "step": 1274 }, { "epoch": 0.01275, "grad_norm": 0.389067679643631, "grad_norm_var": 0.00043848758916393096, "learning_rate": 5e-05, "loss": 0.1952, "loss/crossentropy": 2.7338613271713257, "loss/hidden": 0.0, "loss/logits": 0.19517629593610764, "loss/reg": 1.3549078702926636, "step": 1275 }, { "epoch": 0.01276, "grad_norm": 0.45303645730018616, "grad_norm_var": 0.0006863072728836646, "learning_rate": 5e-05, "loss": 0.2122, "loss/crossentropy": 2.838892161846161, "loss/hidden": 0.0, "loss/logits": 0.21219918876886368, "loss/reg": 1.3540998697280884, "step": 1276 }, { "epoch": 0.01277, "grad_norm": 0.36253467202186584, "grad_norm_var": 0.0007280970613809353, "learning_rate": 5e-05, "loss": 0.1846, "loss/crossentropy": 2.8312386870384216, "loss/hidden": 0.0, "loss/logits": 0.1846158690750599, "loss/reg": 1.353420615196228, "step": 1277 }, { "epoch": 0.01278, "grad_norm": 0.37399822473526, "grad_norm_var": 0.0006790970538206727, "learning_rate": 5e-05, "loss": 0.1879, "loss/crossentropy": 2.8536208868026733, "loss/hidden": 0.0, "loss/logits": 0.18789278343319893, "loss/reg": 1.352396845817566, "step": 1278 }, { "epoch": 0.01279, "grad_norm": 0.3941795825958252, "grad_norm_var": 0.0006688551439072026, "learning_rate": 5e-05, "loss": 0.193, "loss/crossentropy": 2.7190252542495728, "loss/hidden": 0.0, "loss/logits": 0.1929876208305359, "loss/reg": 1.3508316278457642, "step": 1279 }, { "epoch": 0.0128, "grad_norm": 0.38698315620422363, "grad_norm_var": 0.0005331698153800672, "learning_rate": 5e-05, "loss": 0.187, "loss/crossentropy": 2.773389518260956, "loss/hidden": 0.0, "loss/logits": 0.18697837367653847, "loss/reg": 1.34941565990448, "step": 1280 }, { "epoch": 0.01281, "grad_norm": 0.43258407711982727, "grad_norm_var": 0.0006771733589316486, "learning_rate": 5e-05, "loss": 0.1964, "loss/crossentropy": 2.7498987913131714, "loss/hidden": 0.0, "loss/logits": 0.19635442271828651, "loss/reg": 1.348366379737854, "step": 1281 }, { "epoch": 0.01282, "grad_norm": 0.4235626459121704, "grad_norm_var": 0.0006787048817335125, "learning_rate": 5e-05, "loss": 0.1992, "loss/crossentropy": 2.7566969990730286, "loss/hidden": 0.0, "loss/logits": 0.1991618350148201, "loss/reg": 1.3475583791732788, "step": 1282 }, { "epoch": 0.01283, "grad_norm": 0.3940984606742859, "grad_norm_var": 0.0006341984147311871, "learning_rate": 5e-05, "loss": 0.2041, "loss/crossentropy": 2.8078003525733948, "loss/hidden": 0.0, "loss/logits": 0.20406979322433472, "loss/reg": 1.3463932275772095, "step": 1283 }, { "epoch": 0.01284, "grad_norm": 0.384031742811203, "grad_norm_var": 0.0006380904039433551, "learning_rate": 5e-05, "loss": 0.1879, "loss/crossentropy": 2.9492968916893005, "loss/hidden": 0.0, "loss/logits": 0.18790540099143982, "loss/reg": 1.3449573516845703, "step": 1284 }, { "epoch": 0.01285, "grad_norm": 0.3790985345840454, "grad_norm_var": 0.0006495060301425178, "learning_rate": 5e-05, "loss": 0.1866, "loss/crossentropy": 2.795994758605957, "loss/hidden": 0.0, "loss/logits": 0.1865733079612255, "loss/reg": 1.343949317932129, "step": 1285 }, { "epoch": 0.01286, "grad_norm": 0.3845146894454956, "grad_norm_var": 0.0005980594021735749, "learning_rate": 5e-05, "loss": 0.188, "loss/crossentropy": 2.7280075550079346, "loss/hidden": 0.0, "loss/logits": 0.18804579600691795, "loss/reg": 1.3426213264465332, "step": 1286 }, { "epoch": 0.01287, "grad_norm": 0.5705528259277344, "grad_norm_var": 0.002573541618339078, "learning_rate": 5e-05, "loss": 0.1958, "loss/crossentropy": 2.4836789965629578, "loss/hidden": 0.0, "loss/logits": 0.19577286019921303, "loss/reg": 1.3411113023757935, "step": 1287 }, { "epoch": 0.01288, "grad_norm": 0.39455586671829224, "grad_norm_var": 0.002513614459891574, "learning_rate": 5e-05, "loss": 0.1889, "loss/crossentropy": 2.7349933981895447, "loss/hidden": 0.0, "loss/logits": 0.1888500116765499, "loss/reg": 1.3391989469528198, "step": 1288 }, { "epoch": 0.01289, "grad_norm": 0.3759537637233734, "grad_norm_var": 0.0025580404579436185, "learning_rate": 5e-05, "loss": 0.1758, "loss/crossentropy": 2.812786042690277, "loss/hidden": 0.0, "loss/logits": 0.17579347640275955, "loss/reg": 1.3369227647781372, "step": 1289 }, { "epoch": 0.0129, "grad_norm": 0.40224215388298035, "grad_norm_var": 0.0024575000110041286, "learning_rate": 5e-05, "loss": 0.1939, "loss/crossentropy": 2.8621543049812317, "loss/hidden": 0.0, "loss/logits": 0.19386817887425423, "loss/reg": 1.3354421854019165, "step": 1290 }, { "epoch": 0.01291, "grad_norm": 0.3668369650840759, "grad_norm_var": 0.002539502080659517, "learning_rate": 5e-05, "loss": 0.1786, "loss/crossentropy": 2.894817292690277, "loss/hidden": 0.0, "loss/logits": 0.17856686189770699, "loss/reg": 1.333516240119934, "step": 1291 }, { "epoch": 0.01292, "grad_norm": 0.3802977502346039, "grad_norm_var": 0.0024035539250668444, "learning_rate": 5e-05, "loss": 0.1829, "loss/crossentropy": 2.795164704322815, "loss/hidden": 0.0, "loss/logits": 0.1828628107905388, "loss/reg": 1.3324875831604004, "step": 1292 }, { "epoch": 0.01293, "grad_norm": 1.061220407485962, "grad_norm_var": 0.02938838453965478, "learning_rate": 5e-05, "loss": 0.2468, "loss/crossentropy": 2.8882861137390137, "loss/hidden": 0.0, "loss/logits": 0.24676746502518654, "loss/reg": 1.331701636314392, "step": 1293 }, { "epoch": 0.01294, "grad_norm": 0.39137813448905945, "grad_norm_var": 0.029244943809875072, "learning_rate": 5e-05, "loss": 0.1857, "loss/crossentropy": 2.9083763360977173, "loss/hidden": 0.0, "loss/logits": 0.18574338406324387, "loss/reg": 1.3301653861999512, "step": 1294 }, { "epoch": 0.01295, "grad_norm": 0.3713628649711609, "grad_norm_var": 0.02943248635611706, "learning_rate": 5e-05, "loss": 0.1768, "loss/crossentropy": 2.8202906250953674, "loss/hidden": 0.0, "loss/logits": 0.17676304280757904, "loss/reg": 1.3285754919052124, "step": 1295 }, { "epoch": 0.01296, "grad_norm": 0.39226874709129333, "grad_norm_var": 0.029394258249184086, "learning_rate": 5e-05, "loss": 0.178, "loss/crossentropy": 2.8032939434051514, "loss/hidden": 0.0, "loss/logits": 0.17801367118954659, "loss/reg": 1.3272022008895874, "step": 1296 }, { "epoch": 0.01297, "grad_norm": 0.40169134736061096, "grad_norm_var": 0.02950107240310749, "learning_rate": 5e-05, "loss": 0.1853, "loss/crossentropy": 2.6777111291885376, "loss/hidden": 0.0, "loss/logits": 0.18530849367380142, "loss/reg": 1.3254737854003906, "step": 1297 }, { "epoch": 0.01298, "grad_norm": 0.408547967672348, "grad_norm_var": 0.029552281796611728, "learning_rate": 5e-05, "loss": 0.1901, "loss/crossentropy": 2.762950897216797, "loss/hidden": 0.0, "loss/logits": 0.19006695970892906, "loss/reg": 1.3238102197647095, "step": 1298 }, { "epoch": 0.01299, "grad_norm": 0.40359973907470703, "grad_norm_var": 0.029498297332966376, "learning_rate": 5e-05, "loss": 0.1799, "loss/crossentropy": 2.7999081015586853, "loss/hidden": 0.0, "loss/logits": 0.17994213849306107, "loss/reg": 1.3228790760040283, "step": 1299 }, { "epoch": 0.013, "grad_norm": 0.42354899644851685, "grad_norm_var": 0.029291732015891307, "learning_rate": 5e-05, "loss": 0.1883, "loss/crossentropy": 2.711639881134033, "loss/hidden": 0.0, "loss/logits": 0.18831219896674156, "loss/reg": 1.321997880935669, "step": 1300 }, { "epoch": 0.01301, "grad_norm": 0.37877607345581055, "grad_norm_var": 0.029294538805312784, "learning_rate": 5e-05, "loss": 0.1901, "loss/crossentropy": 2.7869099378585815, "loss/hidden": 0.0, "loss/logits": 0.19013599306344986, "loss/reg": 1.3209624290466309, "step": 1301 }, { "epoch": 0.01302, "grad_norm": 0.8780012726783752, "grad_norm_var": 0.04058730529278964, "learning_rate": 5e-05, "loss": 0.2086, "loss/crossentropy": 2.9132827520370483, "loss/hidden": 0.0, "loss/logits": 0.2086055651307106, "loss/reg": 1.320249319076538, "step": 1302 }, { "epoch": 0.01303, "grad_norm": 0.4919426441192627, "grad_norm_var": 0.03997255141455191, "learning_rate": 5e-05, "loss": 0.1929, "loss/crossentropy": 2.873347520828247, "loss/hidden": 0.0, "loss/logits": 0.19293329864740372, "loss/reg": 1.3193020820617676, "step": 1303 }, { "epoch": 0.01304, "grad_norm": 0.4724264442920685, "grad_norm_var": 0.03956677984298077, "learning_rate": 5e-05, "loss": 0.1762, "loss/crossentropy": 2.660923659801483, "loss/hidden": 0.0, "loss/logits": 0.17619645223021507, "loss/reg": 1.3196465969085693, "step": 1304 }, { "epoch": 0.01305, "grad_norm": 0.42146193981170654, "grad_norm_var": 0.03909519236833046, "learning_rate": 5e-05, "loss": 0.1923, "loss/crossentropy": 2.8432238698005676, "loss/hidden": 0.0, "loss/logits": 0.19225074350833893, "loss/reg": 1.3197365999221802, "step": 1305 }, { "epoch": 0.01306, "grad_norm": 0.44016242027282715, "grad_norm_var": 0.03880278698594292, "learning_rate": 5e-05, "loss": 0.1824, "loss/crossentropy": 2.9219031929969788, "loss/hidden": 0.0, "loss/logits": 0.1823579967021942, "loss/reg": 1.3191851377487183, "step": 1306 }, { "epoch": 0.01307, "grad_norm": 0.40534767508506775, "grad_norm_var": 0.0383132831443539, "learning_rate": 5e-05, "loss": 0.1821, "loss/crossentropy": 2.8344457149505615, "loss/hidden": 0.0, "loss/logits": 0.18211128562688828, "loss/reg": 1.3193784952163696, "step": 1307 }, { "epoch": 0.01308, "grad_norm": 0.4818474054336548, "grad_norm_var": 0.037572268534637604, "learning_rate": 5e-05, "loss": 0.1931, "loss/crossentropy": 3.001499652862549, "loss/hidden": 0.0, "loss/logits": 0.19311653822660446, "loss/reg": 1.319129467010498, "step": 1308 }, { "epoch": 0.01309, "grad_norm": 0.41809335350990295, "grad_norm_var": 0.014352758274943181, "learning_rate": 5e-05, "loss": 0.1866, "loss/crossentropy": 2.908874988555908, "loss/hidden": 0.0, "loss/logits": 0.18662643805146217, "loss/reg": 1.3184683322906494, "step": 1309 }, { "epoch": 0.0131, "grad_norm": 0.4113363027572632, "grad_norm_var": 0.01422490614724207, "learning_rate": 5e-05, "loss": 0.194, "loss/crossentropy": 2.778001129627228, "loss/hidden": 0.0, "loss/logits": 0.19404159486293793, "loss/reg": 1.3183917999267578, "step": 1310 }, { "epoch": 0.01311, "grad_norm": 0.4028130769729614, "grad_norm_var": 0.013956863128374053, "learning_rate": 5e-05, "loss": 0.1856, "loss/crossentropy": 2.892741084098816, "loss/hidden": 0.0, "loss/logits": 0.18560755625367165, "loss/reg": 1.318432092666626, "step": 1311 }, { "epoch": 0.01312, "grad_norm": 0.5507920384407043, "grad_norm_var": 0.014265137075122303, "learning_rate": 5e-05, "loss": 0.1943, "loss/crossentropy": 2.7685837745666504, "loss/hidden": 0.0, "loss/logits": 0.19431418180465698, "loss/reg": 1.3180075883865356, "step": 1312 }, { "epoch": 0.01313, "grad_norm": 0.4126599431037903, "grad_norm_var": 0.014184603572884372, "learning_rate": 5e-05, "loss": 0.1876, "loss/crossentropy": 2.769340455532074, "loss/hidden": 0.0, "loss/logits": 0.18763459101319313, "loss/reg": 1.316887378692627, "step": 1313 }, { "epoch": 0.01314, "grad_norm": 0.4310034215450287, "grad_norm_var": 0.014054329397543756, "learning_rate": 5e-05, "loss": 0.2148, "loss/crossentropy": 2.7395371794700623, "loss/hidden": 0.0, "loss/logits": 0.21481387317180634, "loss/reg": 1.316412091255188, "step": 1314 }, { "epoch": 0.01315, "grad_norm": 0.3769264221191406, "grad_norm_var": 0.01431356443074178, "learning_rate": 5e-05, "loss": 0.1931, "loss/crossentropy": 2.707428455352783, "loss/hidden": 0.0, "loss/logits": 0.19313662126660347, "loss/reg": 1.315134048461914, "step": 1315 }, { "epoch": 0.01316, "grad_norm": 0.43392711877822876, "grad_norm_var": 0.014266644976929577, "learning_rate": 5e-05, "loss": 0.1897, "loss/crossentropy": 2.646990716457367, "loss/hidden": 0.0, "loss/logits": 0.18968894705176353, "loss/reg": 1.3136744499206543, "step": 1316 }, { "epoch": 0.01317, "grad_norm": 0.42039671540260315, "grad_norm_var": 0.013907685821175575, "learning_rate": 5e-05, "loss": 0.2099, "loss/crossentropy": 2.647752821445465, "loss/hidden": 0.0, "loss/logits": 0.20992901176214218, "loss/reg": 1.3125907182693481, "step": 1317 }, { "epoch": 0.01318, "grad_norm": 0.36365634202957153, "grad_norm_var": 0.002157925123658914, "learning_rate": 5e-05, "loss": 0.1811, "loss/crossentropy": 2.752347230911255, "loss/hidden": 0.0, "loss/logits": 0.18107367679476738, "loss/reg": 1.3118373155593872, "step": 1318 }, { "epoch": 0.01319, "grad_norm": 0.4149968922138214, "grad_norm_var": 0.0019276034667143301, "learning_rate": 5e-05, "loss": 0.1831, "loss/crossentropy": 2.833215117454529, "loss/hidden": 0.0, "loss/logits": 0.18308308348059654, "loss/reg": 1.3113209009170532, "step": 1319 }, { "epoch": 0.0132, "grad_norm": 0.4174114465713501, "grad_norm_var": 0.0017954009995525433, "learning_rate": 5e-05, "loss": 0.178, "loss/crossentropy": 2.8773661851882935, "loss/hidden": 0.0, "loss/logits": 0.17799117416143417, "loss/reg": 1.3101435899734497, "step": 1320 }, { "epoch": 0.01321, "grad_norm": 0.37340742349624634, "grad_norm_var": 0.0019635318784405127, "learning_rate": 5e-05, "loss": 0.1792, "loss/crossentropy": 2.759882092475891, "loss/hidden": 0.0, "loss/logits": 0.17917311936616898, "loss/reg": 1.3096390962600708, "step": 1321 }, { "epoch": 0.01322, "grad_norm": 0.36545029282569885, "grad_norm_var": 0.0021332032625857996, "learning_rate": 5e-05, "loss": 0.1815, "loss/crossentropy": 2.7518441677093506, "loss/hidden": 0.0, "loss/logits": 0.18147645145654678, "loss/reg": 1.3092446327209473, "step": 1322 }, { "epoch": 0.01323, "grad_norm": 0.38353613018989563, "grad_norm_var": 0.0021982906675894606, "learning_rate": 5e-05, "loss": 0.1787, "loss/crossentropy": 2.895625650882721, "loss/hidden": 0.0, "loss/logits": 0.17865781486034393, "loss/reg": 1.3078157901763916, "step": 1323 }, { "epoch": 0.01324, "grad_norm": 0.3809302747249603, "grad_norm_var": 0.0019506857096142267, "learning_rate": 5e-05, "loss": 0.1827, "loss/crossentropy": 2.7681140899658203, "loss/hidden": 0.0, "loss/logits": 0.1827395148575306, "loss/reg": 1.3067607879638672, "step": 1324 }, { "epoch": 0.01325, "grad_norm": 0.45117852091789246, "grad_norm_var": 0.0020555368006153394, "learning_rate": 5e-05, "loss": 0.2006, "loss/crossentropy": 2.8935614228248596, "loss/hidden": 0.0, "loss/logits": 0.20061300694942474, "loss/reg": 1.3059515953063965, "step": 1325 }, { "epoch": 0.01326, "grad_norm": 0.3950449228286743, "grad_norm_var": 0.0020733523569008445, "learning_rate": 5e-05, "loss": 0.1918, "loss/crossentropy": 2.716562330722809, "loss/hidden": 0.0, "loss/logits": 0.19175706058740616, "loss/reg": 1.3049736022949219, "step": 1326 }, { "epoch": 0.01327, "grad_norm": 0.39624467492103577, "grad_norm_var": 0.002083116547425116, "learning_rate": 5e-05, "loss": 0.1822, "loss/crossentropy": 2.9077075123786926, "loss/hidden": 0.0, "loss/logits": 0.18224596232175827, "loss/reg": 1.3039295673370361, "step": 1327 }, { "epoch": 0.01328, "grad_norm": 0.3917173743247986, "grad_norm_var": 0.0006884956392181489, "learning_rate": 5e-05, "loss": 0.1963, "loss/crossentropy": 2.8372305631637573, "loss/hidden": 0.0, "loss/logits": 0.1963268630206585, "loss/reg": 1.3027836084365845, "step": 1328 }, { "epoch": 0.01329, "grad_norm": 0.39020252227783203, "grad_norm_var": 0.0006836971401257201, "learning_rate": 5e-05, "loss": 0.1848, "loss/crossentropy": 2.6796911358833313, "loss/hidden": 0.0, "loss/logits": 0.18476086854934692, "loss/reg": 1.3020461797714233, "step": 1329 }, { "epoch": 0.0133, "grad_norm": 0.38087961077690125, "grad_norm_var": 0.0006276855907307866, "learning_rate": 5e-05, "loss": 0.1847, "loss/crossentropy": 2.7201030254364014, "loss/hidden": 0.0, "loss/logits": 0.1847059205174446, "loss/reg": 1.300846815109253, "step": 1330 }, { "epoch": 0.01331, "grad_norm": 0.3877928555011749, "grad_norm_var": 0.0006074390999117995, "learning_rate": 5e-05, "loss": 0.1843, "loss/crossentropy": 2.839367628097534, "loss/hidden": 0.0, "loss/logits": 0.18426093831658363, "loss/reg": 1.3001853227615356, "step": 1331 }, { "epoch": 0.01332, "grad_norm": 0.3711041212081909, "grad_norm_var": 0.0005420569547143316, "learning_rate": 5e-05, "loss": 0.1696, "loss/crossentropy": 2.7172279357910156, "loss/hidden": 0.0, "loss/logits": 0.16955699026584625, "loss/reg": 1.2990162372589111, "step": 1332 }, { "epoch": 0.01333, "grad_norm": 0.6289080381393433, "grad_norm_var": 0.00402807478378359, "learning_rate": 5e-05, "loss": 0.1972, "loss/crossentropy": 2.678100347518921, "loss/hidden": 0.0, "loss/logits": 0.19720054045319557, "loss/reg": 1.2978439331054688, "step": 1333 }, { "epoch": 0.01334, "grad_norm": 0.4056476652622223, "grad_norm_var": 0.003902441977409969, "learning_rate": 5e-05, "loss": 0.1864, "loss/crossentropy": 2.7936259508132935, "loss/hidden": 0.0, "loss/logits": 0.1864231936633587, "loss/reg": 1.2967522144317627, "step": 1334 }, { "epoch": 0.01335, "grad_norm": 0.3485678732395172, "grad_norm_var": 0.004119842087168672, "learning_rate": 5e-05, "loss": 0.1709, "loss/crossentropy": 2.693207561969757, "loss/hidden": 0.0, "loss/logits": 0.1709160953760147, "loss/reg": 1.2951072454452515, "step": 1335 }, { "epoch": 0.01336, "grad_norm": 0.40611714124679565, "grad_norm_var": 0.0041079969860560875, "learning_rate": 5e-05, "loss": 0.183, "loss/crossentropy": 2.7255959510803223, "loss/hidden": 0.0, "loss/logits": 0.18296395614743233, "loss/reg": 1.294013500213623, "step": 1336 }, { "epoch": 0.01337, "grad_norm": 0.3974861204624176, "grad_norm_var": 0.004047475093204926, "learning_rate": 5e-05, "loss": 0.1847, "loss/crossentropy": 2.8238306641578674, "loss/hidden": 0.0, "loss/logits": 0.18465031683444977, "loss/reg": 1.2925174236297607, "step": 1337 }, { "epoch": 0.01338, "grad_norm": 0.4148518443107605, "grad_norm_var": 0.003939165560142958, "learning_rate": 5e-05, "loss": 0.2004, "loss/crossentropy": 2.8640605807304382, "loss/hidden": 0.0, "loss/logits": 0.20043480768799782, "loss/reg": 1.2910975217819214, "step": 1338 }, { "epoch": 0.01339, "grad_norm": 0.4171488881111145, "grad_norm_var": 0.0038995204542005596, "learning_rate": 5e-05, "loss": 0.1905, "loss/crossentropy": 2.6302874088287354, "loss/hidden": 0.0, "loss/logits": 0.19046159461140633, "loss/reg": 1.2900209426879883, "step": 1339 }, { "epoch": 0.0134, "grad_norm": 0.39216965436935425, "grad_norm_var": 0.0038634942425959514, "learning_rate": 5e-05, "loss": 0.1867, "loss/crossentropy": 2.7624495029449463, "loss/hidden": 0.0, "loss/logits": 0.18672016263008118, "loss/reg": 1.2881990671157837, "step": 1340 }, { "epoch": 0.01341, "grad_norm": 0.41747212409973145, "grad_norm_var": 0.003753668540790267, "learning_rate": 5e-05, "loss": 0.1779, "loss/crossentropy": 2.8837625980377197, "loss/hidden": 0.0, "loss/logits": 0.1778956986963749, "loss/reg": 1.2869876623153687, "step": 1341 }, { "epoch": 0.01342, "grad_norm": 0.3829328715801239, "grad_norm_var": 0.0037851070907451876, "learning_rate": 5e-05, "loss": 0.1933, "loss/crossentropy": 2.7400670051574707, "loss/hidden": 0.0, "loss/logits": 0.19326373934745789, "loss/reg": 1.2858026027679443, "step": 1342 }, { "epoch": 0.01343, "grad_norm": 0.4228571355342865, "grad_norm_var": 0.0037873835369272063, "learning_rate": 5e-05, "loss": 0.2003, "loss/crossentropy": 2.9916569590568542, "loss/hidden": 0.0, "loss/logits": 0.20030486211180687, "loss/reg": 1.2851616144180298, "step": 1343 }, { "epoch": 0.01344, "grad_norm": 0.42717915773391724, "grad_norm_var": 0.0037807597262457747, "learning_rate": 5e-05, "loss": 0.1911, "loss/crossentropy": 2.8114970922470093, "loss/hidden": 0.0, "loss/logits": 0.19114911556243896, "loss/reg": 1.2845793962478638, "step": 1344 }, { "epoch": 0.01345, "grad_norm": 0.3848547339439392, "grad_norm_var": 0.0037980591833326137, "learning_rate": 5e-05, "loss": 0.1756, "loss/crossentropy": 2.7906049489974976, "loss/hidden": 0.0, "loss/logits": 0.17564281448721886, "loss/reg": 1.2834604978561401, "step": 1345 }, { "epoch": 0.01346, "grad_norm": 0.5220564603805542, "grad_norm_var": 0.004465037808005083, "learning_rate": 5e-05, "loss": 0.1935, "loss/crossentropy": 2.8656685948371887, "loss/hidden": 0.0, "loss/logits": 0.19348178058862686, "loss/reg": 1.2817208766937256, "step": 1346 }, { "epoch": 0.01347, "grad_norm": 0.4464890658855438, "grad_norm_var": 0.004424811622567286, "learning_rate": 5e-05, "loss": 0.1937, "loss/crossentropy": 2.7914677262306213, "loss/hidden": 0.0, "loss/logits": 0.19365255907177925, "loss/reg": 1.2804713249206543, "step": 1347 }, { "epoch": 0.01348, "grad_norm": 0.3959408700466156, "grad_norm_var": 0.004287815978112041, "learning_rate": 5e-05, "loss": 0.1987, "loss/crossentropy": 2.793258845806122, "loss/hidden": 0.0, "loss/logits": 0.19866468757390976, "loss/reg": 1.2791686058044434, "step": 1348 }, { "epoch": 0.01349, "grad_norm": 0.42396214604377747, "grad_norm_var": 0.001359216418765108, "learning_rate": 5e-05, "loss": 0.1701, "loss/crossentropy": 2.9762020111083984, "loss/hidden": 0.0, "loss/logits": 0.17010829970240593, "loss/reg": 1.2777467966079712, "step": 1349 }, { "epoch": 0.0135, "grad_norm": 0.42238879203796387, "grad_norm_var": 0.0013606376487376281, "learning_rate": 5e-05, "loss": 0.1885, "loss/crossentropy": 2.819011688232422, "loss/hidden": 0.0, "loss/logits": 0.1885228455066681, "loss/reg": 1.276718258857727, "step": 1350 }, { "epoch": 0.01351, "grad_norm": 0.4367927610874176, "grad_norm_var": 0.0010785369168515246, "learning_rate": 5e-05, "loss": 0.2036, "loss/crossentropy": 2.7756308913230896, "loss/hidden": 0.0, "loss/logits": 0.2036239691078663, "loss/reg": 1.2753915786743164, "step": 1351 }, { "epoch": 0.01352, "grad_norm": 0.39300453662872314, "grad_norm_var": 0.0011125389978846934, "learning_rate": 5e-05, "loss": 0.1899, "loss/crossentropy": 2.837214469909668, "loss/hidden": 0.0, "loss/logits": 0.18992070853710175, "loss/reg": 1.2738478183746338, "step": 1352 }, { "epoch": 0.01353, "grad_norm": 0.3900686204433441, "grad_norm_var": 0.0011368585379263293, "learning_rate": 5e-05, "loss": 0.1934, "loss/crossentropy": 2.6612390875816345, "loss/hidden": 0.0, "loss/logits": 0.1934347301721573, "loss/reg": 1.2721461057662964, "step": 1353 }, { "epoch": 0.01354, "grad_norm": 0.37273284792900085, "grad_norm_var": 0.0012661753083164603, "learning_rate": 5e-05, "loss": 0.1694, "loss/crossentropy": 2.7896453142166138, "loss/hidden": 0.0, "loss/logits": 0.16944236680865288, "loss/reg": 1.2709778547286987, "step": 1354 }, { "epoch": 0.01355, "grad_norm": 0.37389302253723145, "grad_norm_var": 0.001373625563106732, "learning_rate": 5e-05, "loss": 0.1727, "loss/crossentropy": 2.920479476451874, "loss/hidden": 0.0, "loss/logits": 0.1727452278137207, "loss/reg": 1.2689437866210938, "step": 1355 }, { "epoch": 0.01356, "grad_norm": 0.42584228515625, "grad_norm_var": 0.0013518686663250771, "learning_rate": 5e-05, "loss": 0.1935, "loss/crossentropy": 2.99138343334198, "loss/hidden": 0.0, "loss/logits": 0.19352904707193375, "loss/reg": 1.2677642107009888, "step": 1356 }, { "epoch": 0.01357, "grad_norm": 0.4118824601173401, "grad_norm_var": 0.0013519076041731489, "learning_rate": 5e-05, "loss": 0.184, "loss/crossentropy": 2.8795509934425354, "loss/hidden": 0.0, "loss/logits": 0.18404123187065125, "loss/reg": 1.2662688493728638, "step": 1357 }, { "epoch": 0.01358, "grad_norm": 0.4201810657978058, "grad_norm_var": 0.0012815735880918075, "learning_rate": 5e-05, "loss": 0.2088, "loss/crossentropy": 2.915390193462372, "loss/hidden": 0.0, "loss/logits": 0.2087855376303196, "loss/reg": 1.2648541927337646, "step": 1358 }, { "epoch": 0.01359, "grad_norm": 0.48833900690078735, "grad_norm_var": 0.0016017265945368526, "learning_rate": 5e-05, "loss": 0.185, "loss/crossentropy": 2.7243436574935913, "loss/hidden": 0.0, "loss/logits": 0.18500912189483643, "loss/reg": 1.2632609605789185, "step": 1359 }, { "epoch": 0.0136, "grad_norm": 0.41109177470207214, "grad_norm_var": 0.0016045950663166645, "learning_rate": 5e-05, "loss": 0.1884, "loss/crossentropy": 2.761650860309601, "loss/hidden": 0.0, "loss/logits": 0.18839628621935844, "loss/reg": 1.261253833770752, "step": 1360 }, { "epoch": 0.01361, "grad_norm": 0.38177841901779175, "grad_norm_var": 0.0016195899755527302, "learning_rate": 5e-05, "loss": 0.1966, "loss/crossentropy": 2.728949189186096, "loss/hidden": 0.0, "loss/logits": 0.19656601920723915, "loss/reg": 1.2589702606201172, "step": 1361 }, { "epoch": 0.01362, "grad_norm": 0.4108126759529114, "grad_norm_var": 0.0008759893825818062, "learning_rate": 5e-05, "loss": 0.1738, "loss/crossentropy": 2.734529137611389, "loss/hidden": 0.0, "loss/logits": 0.1738160066306591, "loss/reg": 1.2575434446334839, "step": 1362 }, { "epoch": 0.01363, "grad_norm": 0.4372514486312866, "grad_norm_var": 0.0008398593237139059, "learning_rate": 5e-05, "loss": 0.1968, "loss/crossentropy": 2.65754634141922, "loss/hidden": 0.0, "loss/logits": 0.1968196965754032, "loss/reg": 1.2563024759292603, "step": 1363 }, { "epoch": 0.01364, "grad_norm": 0.720674991607666, "grad_norm_var": 0.006724574980634479, "learning_rate": 5e-05, "loss": 0.2195, "loss/crossentropy": 2.953807055950165, "loss/hidden": 0.0, "loss/logits": 0.2195480540394783, "loss/reg": 1.255125641822815, "step": 1364 }, { "epoch": 0.01365, "grad_norm": 0.39298033714294434, "grad_norm_var": 0.0068200160138982965, "learning_rate": 5e-05, "loss": 0.1835, "loss/crossentropy": 2.741532266139984, "loss/hidden": 0.0, "loss/logits": 0.18346120789647102, "loss/reg": 1.253417730331421, "step": 1365 }, { "epoch": 0.01366, "grad_norm": 0.42354610562324524, "grad_norm_var": 0.006818831556282904, "learning_rate": 5e-05, "loss": 0.1736, "loss/crossentropy": 2.758584976196289, "loss/hidden": 0.0, "loss/logits": 0.1735590063035488, "loss/reg": 1.251852035522461, "step": 1366 }, { "epoch": 0.01367, "grad_norm": 0.4350874722003937, "grad_norm_var": 0.006817623328532463, "learning_rate": 5e-05, "loss": 0.1933, "loss/crossentropy": 2.854390501976013, "loss/hidden": 0.0, "loss/logits": 0.19326837360858917, "loss/reg": 1.2506519556045532, "step": 1367 }, { "epoch": 0.01368, "grad_norm": 0.38499191403388977, "grad_norm_var": 0.006861772154808465, "learning_rate": 5e-05, "loss": 0.1821, "loss/crossentropy": 2.8086124658584595, "loss/hidden": 0.0, "loss/logits": 0.18208570405840874, "loss/reg": 1.2490259408950806, "step": 1368 }, { "epoch": 0.01369, "grad_norm": 0.5313814878463745, "grad_norm_var": 0.007356119875327716, "learning_rate": 5e-05, "loss": 0.2032, "loss/crossentropy": 2.9096702933311462, "loss/hidden": 0.0, "loss/logits": 0.2031627707183361, "loss/reg": 1.2478126287460327, "step": 1369 }, { "epoch": 0.0137, "grad_norm": 0.4552428722381592, "grad_norm_var": 0.007053640487362378, "learning_rate": 5e-05, "loss": 0.18, "loss/crossentropy": 2.741479814052582, "loss/hidden": 0.0, "loss/logits": 0.1799832098186016, "loss/reg": 1.2461893558502197, "step": 1370 }, { "epoch": 0.01371, "grad_norm": 0.4122316539287567, "grad_norm_var": 0.006786819829008536, "learning_rate": 5e-05, "loss": 0.1803, "loss/crossentropy": 3.0401941537857056, "loss/hidden": 0.0, "loss/logits": 0.18025413900613785, "loss/reg": 1.24508535861969, "step": 1371 }, { "epoch": 0.01372, "grad_norm": 0.3969644606113434, "grad_norm_var": 0.006918315747275635, "learning_rate": 5e-05, "loss": 0.1886, "loss/crossentropy": 2.7754027247428894, "loss/hidden": 0.0, "loss/logits": 0.18864772096276283, "loss/reg": 1.2440794706344604, "step": 1372 }, { "epoch": 0.01373, "grad_norm": 0.3795020282268524, "grad_norm_var": 0.00712532709277743, "learning_rate": 5e-05, "loss": 0.1775, "loss/crossentropy": 2.8709006309509277, "loss/hidden": 0.0, "loss/logits": 0.17753225937485695, "loss/reg": 1.2425460815429688, "step": 1373 }, { "epoch": 0.01374, "grad_norm": 0.34354275465011597, "grad_norm_var": 0.00772179540161035, "learning_rate": 5e-05, "loss": 0.1637, "loss/crossentropy": 2.771426022052765, "loss/hidden": 0.0, "loss/logits": 0.16372710466384888, "loss/reg": 1.2412630319595337, "step": 1374 }, { "epoch": 0.01375, "grad_norm": 0.39372017979621887, "grad_norm_var": 0.007644236740270749, "learning_rate": 5e-05, "loss": 0.1984, "loss/crossentropy": 2.7911269068717957, "loss/hidden": 0.0, "loss/logits": 0.19837456196546555, "loss/reg": 1.2404334545135498, "step": 1375 }, { "epoch": 0.01376, "grad_norm": 0.37725409865379333, "grad_norm_var": 0.007809791729928861, "learning_rate": 5e-05, "loss": 0.1804, "loss/crossentropy": 2.705751419067383, "loss/hidden": 0.0, "loss/logits": 0.18037205934524536, "loss/reg": 1.239854335784912, "step": 1376 }, { "epoch": 0.01377, "grad_norm": 0.3886111080646515, "grad_norm_var": 0.007768951436301293, "learning_rate": 5e-05, "loss": 0.1887, "loss/crossentropy": 2.786842703819275, "loss/hidden": 0.0, "loss/logits": 0.18867934867739677, "loss/reg": 1.2384082078933716, "step": 1377 }, { "epoch": 0.01378, "grad_norm": 0.37910887598991394, "grad_norm_var": 0.007913883052354128, "learning_rate": 5e-05, "loss": 0.1883, "loss/crossentropy": 2.793938934803009, "loss/hidden": 0.0, "loss/logits": 0.18826347962021828, "loss/reg": 1.2371619939804077, "step": 1378 }, { "epoch": 0.01379, "grad_norm": 0.3881366550922394, "grad_norm_var": 0.008005739815983074, "learning_rate": 5e-05, "loss": 0.1788, "loss/crossentropy": 2.756206750869751, "loss/hidden": 0.0, "loss/logits": 0.17876296862959862, "loss/reg": 1.2357430458068848, "step": 1379 }, { "epoch": 0.0138, "grad_norm": 0.39993518590927124, "grad_norm_var": 0.0017986913450700728, "learning_rate": 5e-05, "loss": 0.1854, "loss/crossentropy": 2.8165774941444397, "loss/hidden": 0.0, "loss/logits": 0.1853647120296955, "loss/reg": 1.2344536781311035, "step": 1380 }, { "epoch": 0.01381, "grad_norm": 0.37117767333984375, "grad_norm_var": 0.0018637489993303094, "learning_rate": 5e-05, "loss": 0.1809, "loss/crossentropy": 2.833341598510742, "loss/hidden": 0.0, "loss/logits": 0.18087629228830338, "loss/reg": 1.2332347631454468, "step": 1381 }, { "epoch": 0.01382, "grad_norm": 0.44362714886665344, "grad_norm_var": 0.0019418828305195295, "learning_rate": 5e-05, "loss": 0.206, "loss/crossentropy": 2.8209856152534485, "loss/hidden": 0.0, "loss/logits": 0.20602793619036674, "loss/reg": 1.232313632965088, "step": 1382 }, { "epoch": 0.01383, "grad_norm": 0.4029310345649719, "grad_norm_var": 0.0018776474781246222, "learning_rate": 5e-05, "loss": 0.203, "loss/crossentropy": 2.745832860469818, "loss/hidden": 0.0, "loss/logits": 0.20303602144122124, "loss/reg": 1.2314114570617676, "step": 1383 }, { "epoch": 0.01384, "grad_norm": 0.4554927945137024, "grad_norm_var": 0.002018806747643931, "learning_rate": 5e-05, "loss": 0.2238, "loss/crossentropy": 2.7095658779144287, "loss/hidden": 0.0, "loss/logits": 0.22382865101099014, "loss/reg": 1.2301923036575317, "step": 1384 }, { "epoch": 0.01385, "grad_norm": 0.4818246066570282, "grad_norm_var": 0.001353271385290237, "learning_rate": 5e-05, "loss": 0.178, "loss/crossentropy": 2.8538730144500732, "loss/hidden": 0.0, "loss/logits": 0.17802419140934944, "loss/reg": 1.2288519144058228, "step": 1385 }, { "epoch": 0.01386, "grad_norm": 0.47836628556251526, "grad_norm_var": 0.001543655778380985, "learning_rate": 5e-05, "loss": 0.2098, "loss/crossentropy": 2.9244675636291504, "loss/hidden": 0.0, "loss/logits": 0.2098112478852272, "loss/reg": 1.2279032468795776, "step": 1386 }, { "epoch": 0.01387, "grad_norm": 0.4103398323059082, "grad_norm_var": 0.0015422512386726428, "learning_rate": 5e-05, "loss": 0.2022, "loss/crossentropy": 2.692758023738861, "loss/hidden": 0.0, "loss/logits": 0.20222963392734528, "loss/reg": 1.2269119024276733, "step": 1387 }, { "epoch": 0.01388, "grad_norm": 0.4230862557888031, "grad_norm_var": 0.0015546177559936443, "learning_rate": 5e-05, "loss": 0.1805, "loss/crossentropy": 2.6759063601493835, "loss/hidden": 0.0, "loss/logits": 0.18045221269130707, "loss/reg": 1.225614070892334, "step": 1388 }, { "epoch": 0.01389, "grad_norm": 0.39890357851982117, "grad_norm_var": 0.0015062573807308984, "learning_rate": 5e-05, "loss": 0.1814, "loss/crossentropy": 2.7969236969947815, "loss/hidden": 0.0, "loss/logits": 0.18138685822486877, "loss/reg": 1.2248356342315674, "step": 1389 }, { "epoch": 0.0139, "grad_norm": 0.38309741020202637, "grad_norm_var": 0.0012614423849095946, "learning_rate": 5e-05, "loss": 0.1879, "loss/crossentropy": 2.840832471847534, "loss/hidden": 0.0, "loss/logits": 0.18787847086787224, "loss/reg": 1.2237766981124878, "step": 1390 }, { "epoch": 0.01391, "grad_norm": 0.3969389796257019, "grad_norm_var": 0.001254684277324917, "learning_rate": 5e-05, "loss": 0.1805, "loss/crossentropy": 2.8534193634986877, "loss/hidden": 0.0, "loss/logits": 0.18053820729255676, "loss/reg": 1.223007321357727, "step": 1391 }, { "epoch": 0.01392, "grad_norm": 0.35356929898262024, "grad_norm_var": 0.0013968724081272735, "learning_rate": 5e-05, "loss": 0.1766, "loss/crossentropy": 2.804259717464447, "loss/hidden": 0.0, "loss/logits": 0.1765923760831356, "loss/reg": 1.2222816944122314, "step": 1392 }, { "epoch": 0.01393, "grad_norm": 0.39217162132263184, "grad_norm_var": 0.001387654680048911, "learning_rate": 5e-05, "loss": 0.1876, "loss/crossentropy": 2.804800271987915, "loss/hidden": 0.0, "loss/logits": 0.1875942163169384, "loss/reg": 1.2213808298110962, "step": 1393 }, { "epoch": 0.01394, "grad_norm": 0.377047598361969, "grad_norm_var": 0.0013963880523254369, "learning_rate": 5e-05, "loss": 0.1973, "loss/crossentropy": 2.8182566165924072, "loss/hidden": 0.0, "loss/logits": 0.19730354100465775, "loss/reg": 1.2202363014221191, "step": 1394 }, { "epoch": 0.01395, "grad_norm": 0.3902486562728882, "grad_norm_var": 0.0013905691464131417, "learning_rate": 5e-05, "loss": 0.1893, "loss/crossentropy": 2.7491560578346252, "loss/hidden": 0.0, "loss/logits": 0.189250610768795, "loss/reg": 1.2191710472106934, "step": 1395 }, { "epoch": 0.01396, "grad_norm": 0.5830679535865784, "grad_norm_var": 0.003242805657963747, "learning_rate": 5e-05, "loss": 0.1978, "loss/crossentropy": 2.810506761074066, "loss/hidden": 0.0, "loss/logits": 0.1978430114686489, "loss/reg": 1.2181440591812134, "step": 1396 }, { "epoch": 0.01397, "grad_norm": 0.41446352005004883, "grad_norm_var": 0.0030702379351137993, "learning_rate": 5e-05, "loss": 0.1982, "loss/crossentropy": 2.829331338405609, "loss/hidden": 0.0, "loss/logits": 0.19816021993756294, "loss/reg": 1.2169166803359985, "step": 1397 }, { "epoch": 0.01398, "grad_norm": 0.4109343886375427, "grad_norm_var": 0.003051804093662188, "learning_rate": 5e-05, "loss": 0.1832, "loss/crossentropy": 2.9817265272140503, "loss/hidden": 0.0, "loss/logits": 0.18317783251404762, "loss/reg": 1.2161028385162354, "step": 1398 }, { "epoch": 0.01399, "grad_norm": 0.4030122756958008, "grad_norm_var": 0.003051597620713731, "learning_rate": 5e-05, "loss": 0.1845, "loss/crossentropy": 2.7645658254623413, "loss/hidden": 0.0, "loss/logits": 0.18450627103447914, "loss/reg": 1.2153202295303345, "step": 1399 }, { "epoch": 0.014, "grad_norm": 0.9211554527282715, "grad_norm_var": 0.018681524358094104, "learning_rate": 5e-05, "loss": 0.2265, "loss/crossentropy": 2.772430121898651, "loss/hidden": 0.0, "loss/logits": 0.22651683166623116, "loss/reg": 1.2144694328308105, "step": 1400 }, { "epoch": 0.01401, "grad_norm": 0.5132554173469543, "grad_norm_var": 0.018871863342353364, "learning_rate": 5e-05, "loss": 0.1939, "loss/crossentropy": 2.8380813002586365, "loss/hidden": 0.0, "loss/logits": 0.19385619089007378, "loss/reg": 1.2133592367172241, "step": 1401 }, { "epoch": 0.01402, "grad_norm": 0.3983222544193268, "grad_norm_var": 0.019002687433299182, "learning_rate": 5e-05, "loss": 0.1807, "loss/crossentropy": 2.7452619075775146, "loss/hidden": 0.0, "loss/logits": 0.18066129088401794, "loss/reg": 1.2125701904296875, "step": 1402 }, { "epoch": 0.01403, "grad_norm": 0.38246041536331177, "grad_norm_var": 0.01919163386322751, "learning_rate": 5e-05, "loss": 0.1766, "loss/crossentropy": 2.8391433358192444, "loss/hidden": 0.0, "loss/logits": 0.17655245959758759, "loss/reg": 1.2120070457458496, "step": 1403 }, { "epoch": 0.01404, "grad_norm": 0.41270893812179565, "grad_norm_var": 0.01923056479132349, "learning_rate": 5e-05, "loss": 0.1941, "loss/crossentropy": 2.694108784198761, "loss/hidden": 0.0, "loss/logits": 0.19414566829800606, "loss/reg": 1.211472511291504, "step": 1404 }, { "epoch": 0.01405, "grad_norm": 0.37444761395454407, "grad_norm_var": 0.0194205713803524, "learning_rate": 5e-05, "loss": 0.1884, "loss/crossentropy": 2.756447434425354, "loss/hidden": 0.0, "loss/logits": 0.188365388661623, "loss/reg": 1.2103835344314575, "step": 1405 }, { "epoch": 0.01406, "grad_norm": 0.3629455864429474, "grad_norm_var": 0.019610079451670957, "learning_rate": 5e-05, "loss": 0.1854, "loss/crossentropy": 2.8100785613059998, "loss/hidden": 0.0, "loss/logits": 0.18542328104376793, "loss/reg": 1.210111379623413, "step": 1406 }, { "epoch": 0.01407, "grad_norm": 0.36319929361343384, "grad_norm_var": 0.01988808713783746, "learning_rate": 5e-05, "loss": 0.1782, "loss/crossentropy": 2.7677817940711975, "loss/hidden": 0.0, "loss/logits": 0.1782296933233738, "loss/reg": 1.209017276763916, "step": 1407 }, { "epoch": 0.01408, "grad_norm": 0.7494403123855591, "grad_norm_var": 0.025077728825443388, "learning_rate": 5e-05, "loss": 0.2119, "loss/crossentropy": 2.7342721819877625, "loss/hidden": 0.0, "loss/logits": 0.21190783753991127, "loss/reg": 1.207706332206726, "step": 1408 }, { "epoch": 0.01409, "grad_norm": 0.359062135219574, "grad_norm_var": 0.025470202190572132, "learning_rate": 5e-05, "loss": 0.1667, "loss/crossentropy": 2.7984707355499268, "loss/hidden": 0.0, "loss/logits": 0.1667410060763359, "loss/reg": 1.2061673402786255, "step": 1409 }, { "epoch": 0.0141, "grad_norm": 0.3760697543621063, "grad_norm_var": 0.02548153168728078, "learning_rate": 5e-05, "loss": 0.1798, "loss/crossentropy": 2.8415337800979614, "loss/hidden": 0.0, "loss/logits": 0.17982684448361397, "loss/reg": 1.2050466537475586, "step": 1410 }, { "epoch": 0.01411, "grad_norm": 0.4107130467891693, "grad_norm_var": 0.025308039267595398, "learning_rate": 5e-05, "loss": 0.1743, "loss/crossentropy": 2.8946332335472107, "loss/hidden": 0.0, "loss/logits": 0.17427558451890945, "loss/reg": 1.204237699508667, "step": 1411 }, { "epoch": 0.01412, "grad_norm": 0.4082576632499695, "grad_norm_var": 0.02445911428786796, "learning_rate": 5e-05, "loss": 0.1823, "loss/crossentropy": 2.7471116185188293, "loss/hidden": 0.0, "loss/logits": 0.18230270966887474, "loss/reg": 1.2035255432128906, "step": 1412 }, { "epoch": 0.01413, "grad_norm": 0.7261360883712769, "grad_norm_var": 0.028896584983750567, "learning_rate": 5e-05, "loss": 0.2141, "loss/crossentropy": 2.7916662096977234, "loss/hidden": 0.0, "loss/logits": 0.2141283005475998, "loss/reg": 1.2025699615478516, "step": 1413 }, { "epoch": 0.01414, "grad_norm": 0.4041743576526642, "grad_norm_var": 0.028955615300985102, "learning_rate": 5e-05, "loss": 0.1847, "loss/crossentropy": 2.785338282585144, "loss/hidden": 0.0, "loss/logits": 0.18472053483128548, "loss/reg": 1.201487421989441, "step": 1414 }, { "epoch": 0.01415, "grad_norm": 0.42084982991218567, "grad_norm_var": 0.028809439139849055, "learning_rate": 5e-05, "loss": 0.178, "loss/crossentropy": 2.7464206218719482, "loss/hidden": 0.0, "loss/logits": 0.1779973767697811, "loss/reg": 1.200051188468933, "step": 1415 }, { "epoch": 0.01416, "grad_norm": 0.45047450065612793, "grad_norm_var": 0.01459023840276699, "learning_rate": 5e-05, "loss": 0.1828, "loss/crossentropy": 2.907436788082123, "loss/hidden": 0.0, "loss/logits": 0.18275701627135277, "loss/reg": 1.19915771484375, "step": 1416 }, { "epoch": 0.01417, "grad_norm": 0.4240787625312805, "grad_norm_var": 0.014270135412653876, "learning_rate": 5e-05, "loss": 0.1825, "loss/crossentropy": 2.6383658051490784, "loss/hidden": 0.0, "loss/logits": 0.1825261414051056, "loss/reg": 1.1980401277542114, "step": 1417 }, { "epoch": 0.01418, "grad_norm": 0.3851097524166107, "grad_norm_var": 0.014352634082185445, "learning_rate": 5e-05, "loss": 0.1909, "loss/crossentropy": 2.715300142765045, "loss/hidden": 0.0, "loss/logits": 0.1909404620528221, "loss/reg": 1.1972358226776123, "step": 1418 }, { "epoch": 0.01419, "grad_norm": 0.4259493350982666, "grad_norm_var": 0.014148020705458992, "learning_rate": 5e-05, "loss": 0.1954, "loss/crossentropy": 2.795480966567993, "loss/hidden": 0.0, "loss/logits": 0.19538183510303497, "loss/reg": 1.1965022087097168, "step": 1419 }, { "epoch": 0.0142, "grad_norm": 0.41580289602279663, "grad_norm_var": 0.014137009585771305, "learning_rate": 5e-05, "loss": 0.1873, "loss/crossentropy": 2.878966212272644, "loss/hidden": 0.0, "loss/logits": 0.1872658208012581, "loss/reg": 1.1956799030303955, "step": 1420 }, { "epoch": 0.01421, "grad_norm": 0.47517159581184387, "grad_norm_var": 0.013876705878671021, "learning_rate": 5e-05, "loss": 0.2072, "loss/crossentropy": 2.8808937072753906, "loss/hidden": 0.0, "loss/logits": 0.2071789838373661, "loss/reg": 1.1947969198226929, "step": 1421 }, { "epoch": 0.01422, "grad_norm": 0.3958096206188202, "grad_norm_var": 0.013574404616716058, "learning_rate": 5e-05, "loss": 0.1873, "loss/crossentropy": 2.647930860519409, "loss/hidden": 0.0, "loss/logits": 0.18728242069482803, "loss/reg": 1.194016933441162, "step": 1422 }, { "epoch": 0.01423, "grad_norm": 0.4426226317882538, "grad_norm_var": 0.01305587928614605, "learning_rate": 5e-05, "loss": 0.204, "loss/crossentropy": 2.8057883977890015, "loss/hidden": 0.0, "loss/logits": 0.2040250115096569, "loss/reg": 1.1931583881378174, "step": 1423 }, { "epoch": 0.01424, "grad_norm": 0.41137373447418213, "grad_norm_var": 0.006897930700183829, "learning_rate": 5e-05, "loss": 0.1964, "loss/crossentropy": 2.8716946840286255, "loss/hidden": 0.0, "loss/logits": 0.19638033583760262, "loss/reg": 1.1923456192016602, "step": 1424 }, { "epoch": 0.01425, "grad_norm": 0.42817750573158264, "grad_norm_var": 0.0065130178351596475, "learning_rate": 5e-05, "loss": 0.1853, "loss/crossentropy": 2.7237064242362976, "loss/hidden": 0.0, "loss/logits": 0.18527411296963692, "loss/reg": 1.1919002532958984, "step": 1425 }, { "epoch": 0.01426, "grad_norm": 0.4095504879951477, "grad_norm_var": 0.0063086320451773165, "learning_rate": 5e-05, "loss": 0.184, "loss/crossentropy": 2.836295783519745, "loss/hidden": 0.0, "loss/logits": 0.18403561040759087, "loss/reg": 1.1909234523773193, "step": 1426 }, { "epoch": 0.01427, "grad_norm": 0.4399351477622986, "grad_norm_var": 0.006249292355520449, "learning_rate": 5e-05, "loss": 0.2038, "loss/crossentropy": 2.7117483019828796, "loss/hidden": 0.0, "loss/logits": 0.2037728913128376, "loss/reg": 1.189060091972351, "step": 1427 }, { "epoch": 0.01428, "grad_norm": 0.3880847692489624, "grad_norm_var": 0.006364050570575138, "learning_rate": 5e-05, "loss": 0.1862, "loss/crossentropy": 2.7042183876037598, "loss/hidden": 0.0, "loss/logits": 0.1861833930015564, "loss/reg": 1.1879582405090332, "step": 1428 }, { "epoch": 0.01429, "grad_norm": 0.45626869797706604, "grad_norm_var": 0.0006274098049336244, "learning_rate": 5e-05, "loss": 0.2024, "loss/crossentropy": 2.824433743953705, "loss/hidden": 0.0, "loss/logits": 0.20241528376936913, "loss/reg": 1.1868696212768555, "step": 1429 }, { "epoch": 0.0143, "grad_norm": 0.4194117486476898, "grad_norm_var": 0.0006029838264533775, "learning_rate": 5e-05, "loss": 0.1963, "loss/crossentropy": 2.7539783120155334, "loss/hidden": 0.0, "loss/logits": 0.19634708017110825, "loss/reg": 1.1856297254562378, "step": 1430 }, { "epoch": 0.01431, "grad_norm": 0.3876844048500061, "grad_norm_var": 0.0006869516146238024, "learning_rate": 5e-05, "loss": 0.1954, "loss/crossentropy": 2.681014835834503, "loss/hidden": 0.0, "loss/logits": 0.19539380818605423, "loss/reg": 1.1842849254608154, "step": 1431 }, { "epoch": 0.01432, "grad_norm": 0.4049186110496521, "grad_norm_var": 0.000645033648734498, "learning_rate": 5e-05, "loss": 0.1788, "loss/crossentropy": 2.772512137889862, "loss/hidden": 0.0, "loss/logits": 0.17883709073066711, "loss/reg": 1.183428406715393, "step": 1432 }, { "epoch": 0.01433, "grad_norm": 0.4411693513393402, "grad_norm_var": 0.0006740150025339157, "learning_rate": 5e-05, "loss": 0.1939, "loss/crossentropy": 2.8390111923217773, "loss/hidden": 0.0, "loss/logits": 0.19385584443807602, "loss/reg": 1.1821532249450684, "step": 1433 }, { "epoch": 0.01434, "grad_norm": 0.37364938855171204, "grad_norm_var": 0.0007362101089197252, "learning_rate": 5e-05, "loss": 0.1847, "loss/crossentropy": 2.7289167046546936, "loss/hidden": 0.0, "loss/logits": 0.18471518531441689, "loss/reg": 1.1817139387130737, "step": 1434 }, { "epoch": 0.01435, "grad_norm": 0.3801406919956207, "grad_norm_var": 0.0008293373200224867, "learning_rate": 5e-05, "loss": 0.177, "loss/crossentropy": 2.7374696731567383, "loss/hidden": 0.0, "loss/logits": 0.17699915915727615, "loss/reg": 1.1811518669128418, "step": 1435 }, { "epoch": 0.01436, "grad_norm": 0.4569256901741028, "grad_norm_var": 0.0009292300730142089, "learning_rate": 5e-05, "loss": 0.1851, "loss/crossentropy": 2.712208926677704, "loss/hidden": 0.0, "loss/logits": 0.1851220801472664, "loss/reg": 1.1803579330444336, "step": 1436 }, { "epoch": 0.01437, "grad_norm": 0.4534268379211426, "grad_norm_var": 0.0007971731126398485, "learning_rate": 5e-05, "loss": 0.1905, "loss/crossentropy": 2.938688635826111, "loss/hidden": 0.0, "loss/logits": 0.19052105024456978, "loss/reg": 1.1800974607467651, "step": 1437 }, { "epoch": 0.01438, "grad_norm": 3.6807291507720947, "grad_norm_var": 0.6654650831373831, "learning_rate": 5e-05, "loss": 0.3041, "loss/crossentropy": 2.723634898662567, "loss/hidden": 0.0, "loss/logits": 0.3040575832128525, "loss/reg": 1.1786816120147705, "step": 1438 }, { "epoch": 0.01439, "grad_norm": 0.478336364030838, "grad_norm_var": 0.6646840673393943, "learning_rate": 5e-05, "loss": 0.1943, "loss/crossentropy": 2.604291319847107, "loss/hidden": 0.0, "loss/logits": 0.1942521370947361, "loss/reg": 1.177720546722412, "step": 1439 }, { "epoch": 0.0144, "grad_norm": 0.5175783038139343, "grad_norm_var": 0.6623552944700096, "learning_rate": 5e-05, "loss": 0.1927, "loss/crossentropy": 2.9179378747940063, "loss/hidden": 0.0, "loss/logits": 0.19269224256277084, "loss/reg": 1.1768488883972168, "step": 1440 }, { "epoch": 0.01441, "grad_norm": 0.44153159856796265, "grad_norm_var": 0.6620030812325127, "learning_rate": 5e-05, "loss": 0.2094, "loss/crossentropy": 2.699263036251068, "loss/hidden": 0.0, "loss/logits": 0.20937075465917587, "loss/reg": 1.1756783723831177, "step": 1441 }, { "epoch": 0.01442, "grad_norm": 0.456778347492218, "grad_norm_var": 0.6607348854967332, "learning_rate": 5e-05, "loss": 0.2043, "loss/crossentropy": 2.8013316988945007, "loss/hidden": 0.0, "loss/logits": 0.2043425552546978, "loss/reg": 1.174521565437317, "step": 1442 }, { "epoch": 0.01443, "grad_norm": 0.3832143247127533, "grad_norm_var": 0.6624190273713508, "learning_rate": 5e-05, "loss": 0.169, "loss/crossentropy": 2.7974475622177124, "loss/hidden": 0.0, "loss/logits": 0.16904297098517418, "loss/reg": 1.1735647916793823, "step": 1443 }, { "epoch": 0.01444, "grad_norm": 0.40240901708602905, "grad_norm_var": 0.6619650609078701, "learning_rate": 5e-05, "loss": 0.1943, "loss/crossentropy": 2.6992990374565125, "loss/hidden": 0.0, "loss/logits": 0.19426801800727844, "loss/reg": 1.1723049879074097, "step": 1444 }, { "epoch": 0.01445, "grad_norm": 0.413047194480896, "grad_norm_var": 0.6631025192839431, "learning_rate": 5e-05, "loss": 0.1708, "loss/crossentropy": 2.6653656363487244, "loss/hidden": 0.0, "loss/logits": 0.17078221589326859, "loss/reg": 1.171108603477478, "step": 1445 }, { "epoch": 0.01446, "grad_norm": 0.4184585213661194, "grad_norm_var": 0.6631294281930682, "learning_rate": 5e-05, "loss": 0.2102, "loss/crossentropy": 2.701816439628601, "loss/hidden": 0.0, "loss/logits": 0.21016155928373337, "loss/reg": 1.1695311069488525, "step": 1446 }, { "epoch": 0.01447, "grad_norm": 0.36679932475090027, "grad_norm_var": 0.6638332006424051, "learning_rate": 5e-05, "loss": 0.1826, "loss/crossentropy": 2.8065091371536255, "loss/hidden": 0.0, "loss/logits": 0.182582326233387, "loss/reg": 1.1683489084243774, "step": 1447 }, { "epoch": 0.01448, "grad_norm": 0.37978798151016235, "grad_norm_var": 0.6646245839910803, "learning_rate": 5e-05, "loss": 0.1913, "loss/crossentropy": 2.7764082551002502, "loss/hidden": 0.0, "loss/logits": 0.19127563014626503, "loss/reg": 1.1672946214675903, "step": 1448 }, { "epoch": 0.01449, "grad_norm": 0.41191771626472473, "grad_norm_var": 0.665405763465797, "learning_rate": 5e-05, "loss": 0.1907, "loss/crossentropy": 2.740322172641754, "loss/hidden": 0.0, "loss/logits": 0.1907496452331543, "loss/reg": 1.166246771812439, "step": 1449 }, { "epoch": 0.0145, "grad_norm": 0.4475890100002289, "grad_norm_var": 0.6632604096159077, "learning_rate": 5e-05, "loss": 0.2022, "loss/crossentropy": 2.8474504351615906, "loss/hidden": 0.0, "loss/logits": 0.20218346267938614, "loss/reg": 1.1651406288146973, "step": 1450 }, { "epoch": 0.01451, "grad_norm": 0.3883093595504761, "grad_norm_var": 0.6629918541871671, "learning_rate": 5e-05, "loss": 0.2002, "loss/crossentropy": 2.771294593811035, "loss/hidden": 0.0, "loss/logits": 0.20018937811255455, "loss/reg": 1.1640667915344238, "step": 1451 }, { "epoch": 0.01452, "grad_norm": 0.38779163360595703, "grad_norm_var": 0.6648956523531993, "learning_rate": 5e-05, "loss": 0.1846, "loss/crossentropy": 2.838146388530731, "loss/hidden": 0.0, "loss/logits": 0.1846405677497387, "loss/reg": 1.1632236242294312, "step": 1452 }, { "epoch": 0.01453, "grad_norm": 0.44142037630081177, "grad_norm_var": 0.6651820988867188, "learning_rate": 5e-05, "loss": 0.1992, "loss/crossentropy": 2.7226682901382446, "loss/hidden": 0.0, "loss/logits": 0.19923654943704605, "loss/reg": 1.1626334190368652, "step": 1453 }, { "epoch": 0.01454, "grad_norm": 0.4469018578529358, "grad_norm_var": 0.0016475347938865329, "learning_rate": 5e-05, "loss": 0.1943, "loss/crossentropy": 2.8452677726745605, "loss/hidden": 0.0, "loss/logits": 0.19433218985795975, "loss/reg": 1.1622459888458252, "step": 1454 }, { "epoch": 0.01455, "grad_norm": 0.4018121063709259, "grad_norm_var": 0.0014577680517240807, "learning_rate": 5e-05, "loss": 0.1998, "loss/crossentropy": 2.738975167274475, "loss/hidden": 0.0, "loss/logits": 0.19976536184549332, "loss/reg": 1.1620265245437622, "step": 1455 }, { "epoch": 0.01456, "grad_norm": 0.42509543895721436, "grad_norm_var": 0.0007777995787594194, "learning_rate": 5e-05, "loss": 0.1924, "loss/crossentropy": 2.826959192752838, "loss/hidden": 0.0, "loss/logits": 0.19238118827342987, "loss/reg": 1.161539912223816, "step": 1456 }, { "epoch": 0.01457, "grad_norm": 0.440341979265213, "grad_norm_var": 0.0007734106803447698, "learning_rate": 5e-05, "loss": 0.1978, "loss/crossentropy": 2.6413469314575195, "loss/hidden": 0.0, "loss/logits": 0.19783642515540123, "loss/reg": 1.1616432666778564, "step": 1457 }, { "epoch": 0.01458, "grad_norm": 0.39853590726852417, "grad_norm_var": 0.0006472376220099108, "learning_rate": 5e-05, "loss": 0.1848, "loss/crossentropy": 2.698549807071686, "loss/hidden": 0.0, "loss/logits": 0.18476466834545135, "loss/reg": 1.1611636877059937, "step": 1458 }, { "epoch": 0.01459, "grad_norm": 0.38865047693252563, "grad_norm_var": 0.0006299673554417813, "learning_rate": 5e-05, "loss": 0.1884, "loss/crossentropy": 2.7270365953445435, "loss/hidden": 0.0, "loss/logits": 0.18844184651970863, "loss/reg": 1.1610968112945557, "step": 1459 }, { "epoch": 0.0146, "grad_norm": 0.4249439239501953, "grad_norm_var": 0.000639110550863183, "learning_rate": 5e-05, "loss": 0.2042, "loss/crossentropy": 2.7351967096328735, "loss/hidden": 0.0, "loss/logits": 0.2041994333267212, "loss/reg": 1.1608000993728638, "step": 1460 }, { "epoch": 0.01461, "grad_norm": 0.3722969889640808, "grad_norm_var": 0.0007336083208048571, "learning_rate": 5e-05, "loss": 0.1818, "loss/crossentropy": 2.7943307161331177, "loss/hidden": 0.0, "loss/logits": 0.18180421367287636, "loss/reg": 1.1603269577026367, "step": 1461 }, { "epoch": 0.01462, "grad_norm": 0.43652695417404175, "grad_norm_var": 0.0007773033601769286, "learning_rate": 5e-05, "loss": 0.2023, "loss/crossentropy": 2.662752866744995, "loss/hidden": 0.0, "loss/logits": 0.2022755742073059, "loss/reg": 1.1591933965682983, "step": 1462 }, { "epoch": 0.01463, "grad_norm": 0.4718083441257477, "grad_norm_var": 0.0008627420285323261, "learning_rate": 5e-05, "loss": 0.1951, "loss/crossentropy": 2.9354034662246704, "loss/hidden": 0.0, "loss/logits": 0.19507525488734245, "loss/reg": 1.158470869064331, "step": 1463 }, { "epoch": 0.01464, "grad_norm": 0.385351300239563, "grad_norm_var": 0.0008374568626427566, "learning_rate": 5e-05, "loss": 0.1869, "loss/crossentropy": 2.822139620780945, "loss/hidden": 0.0, "loss/logits": 0.186939537525177, "loss/reg": 1.1580942869186401, "step": 1464 }, { "epoch": 0.01465, "grad_norm": 0.3974020779132843, "grad_norm_var": 0.0008601347897435793, "learning_rate": 5e-05, "loss": 0.2006, "loss/crossentropy": 2.843861997127533, "loss/hidden": 0.0, "loss/logits": 0.20057183876633644, "loss/reg": 1.157541275024414, "step": 1465 }, { "epoch": 0.01466, "grad_norm": 0.38286757469177246, "grad_norm_var": 0.0008486814366043779, "learning_rate": 5e-05, "loss": 0.189, "loss/crossentropy": 2.6517167687416077, "loss/hidden": 0.0, "loss/logits": 0.18896743655204773, "loss/reg": 1.1566143035888672, "step": 1466 }, { "epoch": 0.01467, "grad_norm": 0.42905157804489136, "grad_norm_var": 0.0008243923150278073, "learning_rate": 5e-05, "loss": 0.1984, "loss/crossentropy": 2.897773861885071, "loss/hidden": 0.0, "loss/logits": 0.19835495948791504, "loss/reg": 1.156038761138916, "step": 1467 }, { "epoch": 0.01468, "grad_norm": 0.3854566514492035, "grad_norm_var": 0.0008330248364320133, "learning_rate": 5e-05, "loss": 0.1803, "loss/crossentropy": 2.8033392429351807, "loss/hidden": 0.0, "loss/logits": 0.18027521297335625, "loss/reg": 1.1554698944091797, "step": 1468 }, { "epoch": 0.01469, "grad_norm": 0.4645752012729645, "grad_norm_var": 0.0009503278882020183, "learning_rate": 5e-05, "loss": 0.2117, "loss/crossentropy": 2.7153387665748596, "loss/hidden": 0.0, "loss/logits": 0.21166526898741722, "loss/reg": 1.1556744575500488, "step": 1469 }, { "epoch": 0.0147, "grad_norm": 0.3899242877960205, "grad_norm_var": 0.0009163884442397426, "learning_rate": 5e-05, "loss": 0.1912, "loss/crossentropy": 2.720719635486603, "loss/hidden": 0.0, "loss/logits": 0.19123655557632446, "loss/reg": 1.1561858654022217, "step": 1470 }, { "epoch": 0.01471, "grad_norm": 0.3779591917991638, "grad_norm_var": 0.0009848749223748934, "learning_rate": 5e-05, "loss": 0.1796, "loss/crossentropy": 2.8007051944732666, "loss/hidden": 0.0, "loss/logits": 0.1796492263674736, "loss/reg": 1.1558936834335327, "step": 1471 }, { "epoch": 0.01472, "grad_norm": 0.6328070759773254, "grad_norm_var": 0.004080776063957622, "learning_rate": 5e-05, "loss": 0.2355, "loss/crossentropy": 2.686060070991516, "loss/hidden": 0.0, "loss/logits": 0.23549868538975716, "loss/reg": 1.1561976671218872, "step": 1472 }, { "epoch": 0.01473, "grad_norm": 0.472528874874115, "grad_norm_var": 0.004217134203378577, "learning_rate": 5e-05, "loss": 0.1867, "loss/crossentropy": 2.742728590965271, "loss/hidden": 0.0, "loss/logits": 0.18668468296527863, "loss/reg": 1.1556323766708374, "step": 1473 }, { "epoch": 0.01474, "grad_norm": 0.38901230692863464, "grad_norm_var": 0.004257255456704956, "learning_rate": 5e-05, "loss": 0.1722, "loss/crossentropy": 2.911827266216278, "loss/hidden": 0.0, "loss/logits": 0.17221413552761078, "loss/reg": 1.1559501886367798, "step": 1474 }, { "epoch": 0.01475, "grad_norm": 0.4404449462890625, "grad_norm_var": 0.004173393020864262, "learning_rate": 5e-05, "loss": 0.1911, "loss/crossentropy": 2.542245864868164, "loss/hidden": 0.0, "loss/logits": 0.19110393524169922, "loss/reg": 1.1563279628753662, "step": 1475 }, { "epoch": 0.01476, "grad_norm": 0.47592392563819885, "grad_norm_var": 0.004312948871115774, "learning_rate": 5e-05, "loss": 0.2094, "loss/crossentropy": 2.9327672123908997, "loss/hidden": 0.0, "loss/logits": 0.20938289538025856, "loss/reg": 1.1564908027648926, "step": 1476 }, { "epoch": 0.01477, "grad_norm": 0.37822598218917847, "grad_norm_var": 0.004268347129857242, "learning_rate": 5e-05, "loss": 0.1757, "loss/crossentropy": 2.8230528831481934, "loss/hidden": 0.0, "loss/logits": 0.17565542086958885, "loss/reg": 1.156173586845398, "step": 1477 }, { "epoch": 0.01478, "grad_norm": 0.3917330503463745, "grad_norm_var": 0.004365919184094539, "learning_rate": 5e-05, "loss": 0.2045, "loss/crossentropy": 2.8280991911888123, "loss/hidden": 0.0, "loss/logits": 0.20451990514993668, "loss/reg": 1.1554744243621826, "step": 1478 }, { "epoch": 0.01479, "grad_norm": 0.37878331542015076, "grad_norm_var": 0.004376637666552326, "learning_rate": 5e-05, "loss": 0.1829, "loss/crossentropy": 2.8972471356391907, "loss/hidden": 0.0, "loss/logits": 0.18294430896639824, "loss/reg": 1.1554595232009888, "step": 1479 }, { "epoch": 0.0148, "grad_norm": 0.37876391410827637, "grad_norm_var": 0.004412639484719334, "learning_rate": 5e-05, "loss": 0.2315, "loss/crossentropy": 2.6865721940994263, "loss/hidden": 0.0, "loss/logits": 0.2314772792160511, "loss/reg": 1.1548503637313843, "step": 1480 }, { "epoch": 0.01481, "grad_norm": 0.39876559376716614, "grad_norm_var": 0.004408130788441378, "learning_rate": 5e-05, "loss": 0.1891, "loss/crossentropy": 2.991089105606079, "loss/hidden": 0.0, "loss/logits": 0.18908962607383728, "loss/reg": 1.153546929359436, "step": 1481 }, { "epoch": 0.01482, "grad_norm": 0.42962586879730225, "grad_norm_var": 0.004295032189139869, "learning_rate": 5e-05, "loss": 0.1799, "loss/crossentropy": 2.81504887342453, "loss/hidden": 0.0, "loss/logits": 0.17986131832003593, "loss/reg": 1.1533591747283936, "step": 1482 }, { "epoch": 0.01483, "grad_norm": 0.39817601442337036, "grad_norm_var": 0.0043414287038569055, "learning_rate": 5e-05, "loss": 0.1768, "loss/crossentropy": 2.8316676020622253, "loss/hidden": 0.0, "loss/logits": 0.1767561361193657, "loss/reg": 1.1531206369400024, "step": 1483 }, { "epoch": 0.01484, "grad_norm": 0.3925790786743164, "grad_norm_var": 0.004308073096685746, "learning_rate": 5e-05, "loss": 0.1703, "loss/crossentropy": 2.805059552192688, "loss/hidden": 0.0, "loss/logits": 0.1703021265566349, "loss/reg": 1.1523410081863403, "step": 1484 }, { "epoch": 0.01485, "grad_norm": 0.3768901228904724, "grad_norm_var": 0.004318495561248111, "learning_rate": 5e-05, "loss": 0.1814, "loss/crossentropy": 2.7650709748268127, "loss/hidden": 0.0, "loss/logits": 0.18137887865304947, "loss/reg": 1.151556134223938, "step": 1485 }, { "epoch": 0.01486, "grad_norm": 0.40476369857788086, "grad_norm_var": 0.004274959281858711, "learning_rate": 5e-05, "loss": 0.1834, "loss/crossentropy": 2.8746532201766968, "loss/hidden": 0.0, "loss/logits": 0.18340081721544266, "loss/reg": 1.1500896215438843, "step": 1486 }, { "epoch": 0.01487, "grad_norm": 0.44126570224761963, "grad_norm_var": 0.004172172160196756, "learning_rate": 5e-05, "loss": 0.1932, "loss/crossentropy": 2.694306790828705, "loss/hidden": 0.0, "loss/logits": 0.19317873939871788, "loss/reg": 1.149519443511963, "step": 1487 }, { "epoch": 0.01488, "grad_norm": 0.47749271988868713, "grad_norm_var": 0.0013509307920328006, "learning_rate": 5e-05, "loss": 0.1916, "loss/crossentropy": 2.719083070755005, "loss/hidden": 0.0, "loss/logits": 0.1916189342737198, "loss/reg": 1.1487241983413696, "step": 1488 }, { "epoch": 0.01489, "grad_norm": 0.3436330556869507, "grad_norm_var": 0.0013844778204995265, "learning_rate": 5e-05, "loss": 0.1799, "loss/crossentropy": 2.7870516180992126, "loss/hidden": 0.0, "loss/logits": 0.17992451414465904, "loss/reg": 1.1486061811447144, "step": 1489 }, { "epoch": 0.0149, "grad_norm": 0.38551199436187744, "grad_norm_var": 0.001393174193328169, "learning_rate": 5e-05, "loss": 0.1891, "loss/crossentropy": 2.8131721019744873, "loss/hidden": 0.0, "loss/logits": 0.18907897546887398, "loss/reg": 1.1475741863250732, "step": 1490 }, { "epoch": 0.01491, "grad_norm": 0.4178476333618164, "grad_norm_var": 0.001320663123918564, "learning_rate": 5e-05, "loss": 0.2016, "loss/crossentropy": 2.775062322616577, "loss/hidden": 0.0, "loss/logits": 0.20159471407532692, "loss/reg": 1.146192193031311, "step": 1491 }, { "epoch": 0.01492, "grad_norm": 0.3631218373775482, "grad_norm_var": 0.001039799575198197, "learning_rate": 5e-05, "loss": 0.1682, "loss/crossentropy": 2.776621460914612, "loss/hidden": 0.0, "loss/logits": 0.16820930317044258, "loss/reg": 1.1452136039733887, "step": 1492 }, { "epoch": 0.01493, "grad_norm": 4.2707037925720215, "grad_norm_var": 0.9380895971594498, "learning_rate": 5e-05, "loss": 0.3158, "loss/crossentropy": 2.92247998714447, "loss/hidden": 0.0, "loss/logits": 0.3158273994922638, "loss/reg": 1.1442078351974487, "step": 1493 }, { "epoch": 0.01494, "grad_norm": 0.4497520625591278, "grad_norm_var": 0.9363747553262743, "learning_rate": 5e-05, "loss": 0.1921, "loss/crossentropy": 2.8871845602989197, "loss/hidden": 0.0, "loss/logits": 0.19214729592204094, "loss/reg": 1.1430447101593018, "step": 1494 }, { "epoch": 0.01495, "grad_norm": 0.45193424820899963, "grad_norm_var": 0.9341201756923909, "learning_rate": 5e-05, "loss": 0.1944, "loss/crossentropy": 2.8296074271202087, "loss/hidden": 0.0, "loss/logits": 0.19437526538968086, "loss/reg": 1.1418877840042114, "step": 1495 }, { "epoch": 0.01496, "grad_norm": 0.4302029609680176, "grad_norm_var": 0.9324334842618291, "learning_rate": 5e-05, "loss": 0.2005, "loss/crossentropy": 2.7669071555137634, "loss/hidden": 0.0, "loss/logits": 0.2004767581820488, "loss/reg": 1.1404200792312622, "step": 1496 }, { "epoch": 0.01497, "grad_norm": 0.4347190260887146, "grad_norm_var": 0.9313002422194117, "learning_rate": 5e-05, "loss": 0.1958, "loss/crossentropy": 2.8296796679496765, "loss/hidden": 0.0, "loss/logits": 0.19576232135295868, "loss/reg": 1.139227032661438, "step": 1497 }, { "epoch": 0.01498, "grad_norm": 0.5106848478317261, "grad_norm_var": 0.9292830465830048, "learning_rate": 5e-05, "loss": 0.193, "loss/crossentropy": 2.730704605579376, "loss/hidden": 0.0, "loss/logits": 0.1929554119706154, "loss/reg": 1.1384540796279907, "step": 1498 }, { "epoch": 0.01499, "grad_norm": 0.3934985101222992, "grad_norm_var": 0.9294472871619052, "learning_rate": 5e-05, "loss": 0.1763, "loss/crossentropy": 2.88557106256485, "loss/hidden": 0.0, "loss/logits": 0.17628108337521553, "loss/reg": 1.1374356746673584, "step": 1499 }, { "epoch": 0.015, "grad_norm": 0.38453835248947144, "grad_norm_var": 0.9297369973056901, "learning_rate": 5e-05, "loss": 0.1891, "loss/crossentropy": 2.733524262905121, "loss/hidden": 0.0, "loss/logits": 0.18906502798199654, "loss/reg": 1.1368626356124878, "step": 1500 }, { "epoch": 0.01501, "grad_norm": 0.38647788763046265, "grad_norm_var": 0.9293826966113136, "learning_rate": 5e-05, "loss": 0.1832, "loss/crossentropy": 2.6723055839538574, "loss/hidden": 0.0, "loss/logits": 0.18317405134439468, "loss/reg": 1.1361547708511353, "step": 1501 }, { "epoch": 0.01502, "grad_norm": 0.38139402866363525, "grad_norm_var": 0.9302094379605873, "learning_rate": 5e-05, "loss": 0.1736, "loss/crossentropy": 2.8804500699043274, "loss/hidden": 0.0, "loss/logits": 0.1735868975520134, "loss/reg": 1.1355422735214233, "step": 1502 }, { "epoch": 0.01503, "grad_norm": 0.3744097948074341, "grad_norm_var": 0.9324178817578533, "learning_rate": 5e-05, "loss": 0.169, "loss/crossentropy": 2.929677963256836, "loss/hidden": 0.0, "loss/logits": 0.16901340708136559, "loss/reg": 1.1346811056137085, "step": 1503 }, { "epoch": 0.01504, "grad_norm": 0.3716701567173004, "grad_norm_var": 0.9356011201024083, "learning_rate": 5e-05, "loss": 0.168, "loss/crossentropy": 2.940733551979065, "loss/hidden": 0.0, "loss/logits": 0.16800947487354279, "loss/reg": 1.1341255903244019, "step": 1504 }, { "epoch": 0.01505, "grad_norm": 0.4324607849121094, "grad_norm_var": 0.9325026880056351, "learning_rate": 5e-05, "loss": 0.1995, "loss/crossentropy": 2.7044883370399475, "loss/hidden": 0.0, "loss/logits": 0.19945520162582397, "loss/reg": 1.133963942527771, "step": 1505 }, { "epoch": 0.01506, "grad_norm": 0.3936706781387329, "grad_norm_var": 0.9322164850582716, "learning_rate": 5e-05, "loss": 0.1905, "loss/crossentropy": 2.8886741995811462, "loss/hidden": 0.0, "loss/logits": 0.1905427686870098, "loss/reg": 1.133609652519226, "step": 1506 }, { "epoch": 0.01507, "grad_norm": 0.41537290811538696, "grad_norm_var": 0.9322944406545675, "learning_rate": 5e-05, "loss": 0.1973, "loss/crossentropy": 2.8340283036231995, "loss/hidden": 0.0, "loss/logits": 0.1972740888595581, "loss/reg": 1.133344054222107, "step": 1507 }, { "epoch": 0.01508, "grad_norm": 0.46224504709243774, "grad_norm_var": 0.9290801736495811, "learning_rate": 5e-05, "loss": 0.1792, "loss/crossentropy": 2.8049793243408203, "loss/hidden": 0.0, "loss/logits": 0.1792147532105446, "loss/reg": 1.133016586303711, "step": 1508 }, { "epoch": 0.01509, "grad_norm": 0.3661869168281555, "grad_norm_var": 0.001638684958712311, "learning_rate": 5e-05, "loss": 0.181, "loss/crossentropy": 2.787099599838257, "loss/hidden": 0.0, "loss/logits": 0.18104476854205132, "loss/reg": 1.1325634717941284, "step": 1509 }, { "epoch": 0.0151, "grad_norm": 0.5948194265365601, "grad_norm_var": 0.00362709916255467, "learning_rate": 5e-05, "loss": 0.2071, "loss/crossentropy": 2.9607895612716675, "loss/hidden": 0.0, "loss/logits": 0.20714980363845825, "loss/reg": 1.1316790580749512, "step": 1510 }, { "epoch": 0.01511, "grad_norm": 0.41173163056373596, "grad_norm_var": 0.003578473170561654, "learning_rate": 5e-05, "loss": 0.1857, "loss/crossentropy": 2.8691484332084656, "loss/hidden": 0.0, "loss/logits": 0.18565627187490463, "loss/reg": 1.1312055587768555, "step": 1511 }, { "epoch": 0.01512, "grad_norm": 0.4034125506877899, "grad_norm_var": 0.003592262118630174, "learning_rate": 5e-05, "loss": 0.1852, "loss/crossentropy": 2.8478543758392334, "loss/hidden": 0.0, "loss/logits": 0.18515947088599205, "loss/reg": 1.1307348012924194, "step": 1512 }, { "epoch": 0.01513, "grad_norm": 0.3978036046028137, "grad_norm_var": 0.003604153126838305, "learning_rate": 5e-05, "loss": 0.1859, "loss/crossentropy": 2.8709346055984497, "loss/hidden": 0.0, "loss/logits": 0.1858948990702629, "loss/reg": 1.1300814151763916, "step": 1513 }, { "epoch": 0.01514, "grad_norm": 0.3758530914783478, "grad_norm_var": 0.00306556512898114, "learning_rate": 5e-05, "loss": 0.1829, "loss/crossentropy": 2.767078220844269, "loss/hidden": 0.0, "loss/logits": 0.18287447094917297, "loss/reg": 1.1289632320404053, "step": 1514 }, { "epoch": 0.01515, "grad_norm": 0.37056660652160645, "grad_norm_var": 0.0031461246167071356, "learning_rate": 5e-05, "loss": 0.1714, "loss/crossentropy": 2.8232129216194153, "loss/hidden": 0.0, "loss/logits": 0.1713615171611309, "loss/reg": 1.1283029317855835, "step": 1515 }, { "epoch": 0.01516, "grad_norm": 0.4838574230670929, "grad_norm_var": 0.0034564083210849917, "learning_rate": 5e-05, "loss": 0.2098, "loss/crossentropy": 2.7439082264900208, "loss/hidden": 0.0, "loss/logits": 0.20981475710868835, "loss/reg": 1.1274932622909546, "step": 1516 }, { "epoch": 0.01517, "grad_norm": 0.4795788526535034, "grad_norm_var": 0.003658104504286123, "learning_rate": 5e-05, "loss": 0.222, "loss/crossentropy": 2.814515173435211, "loss/hidden": 0.0, "loss/logits": 0.22201483324170113, "loss/reg": 1.126589298248291, "step": 1517 }, { "epoch": 0.01518, "grad_norm": 0.43272629380226135, "grad_norm_var": 0.003560685680539777, "learning_rate": 5e-05, "loss": 0.1866, "loss/crossentropy": 2.6954237818717957, "loss/hidden": 0.0, "loss/logits": 0.1866161711513996, "loss/reg": 1.1255377531051636, "step": 1518 }, { "epoch": 0.01519, "grad_norm": 0.44345027208328247, "grad_norm_var": 0.0034122455072800612, "learning_rate": 5e-05, "loss": 0.187, "loss/crossentropy": 2.97675484418869, "loss/hidden": 0.0, "loss/logits": 0.18700629100203514, "loss/reg": 1.1250556707382202, "step": 1519 }, { "epoch": 0.0152, "grad_norm": 0.37308746576309204, "grad_norm_var": 0.003401874892602097, "learning_rate": 5e-05, "loss": 0.1712, "loss/crossentropy": 2.7208551168441772, "loss/hidden": 0.0, "loss/logits": 0.17121143266558647, "loss/reg": 1.1241092681884766, "step": 1520 }, { "epoch": 0.01521, "grad_norm": 0.38577350974082947, "grad_norm_var": 0.003505989678647703, "learning_rate": 5e-05, "loss": 0.1967, "loss/crossentropy": 2.7316006422042847, "loss/hidden": 0.0, "loss/logits": 0.19670463353395462, "loss/reg": 1.1233248710632324, "step": 1521 }, { "epoch": 0.01522, "grad_norm": 0.4153127372264862, "grad_norm_var": 0.0034466381379363717, "learning_rate": 5e-05, "loss": 0.1854, "loss/crossentropy": 2.852459490299225, "loss/hidden": 0.0, "loss/logits": 0.18538159132003784, "loss/reg": 1.122011661529541, "step": 1522 }, { "epoch": 0.01523, "grad_norm": 0.3831879794597626, "grad_norm_var": 0.0035558519997709315, "learning_rate": 5e-05, "loss": 0.1772, "loss/crossentropy": 2.785507321357727, "loss/hidden": 0.0, "loss/logits": 0.1772114671766758, "loss/reg": 1.1209535598754883, "step": 1523 }, { "epoch": 0.01524, "grad_norm": 0.47046220302581787, "grad_norm_var": 0.0036022759188557058, "learning_rate": 5e-05, "loss": 0.2032, "loss/crossentropy": 2.862886071205139, "loss/hidden": 0.0, "loss/logits": 0.2032475359737873, "loss/reg": 1.119650959968567, "step": 1524 }, { "epoch": 0.01525, "grad_norm": 0.4245082437992096, "grad_norm_var": 0.0033634452527405304, "learning_rate": 5e-05, "loss": 0.2064, "loss/crossentropy": 2.7247531414031982, "loss/hidden": 0.0, "loss/logits": 0.20639275014400482, "loss/reg": 1.1184643507003784, "step": 1525 }, { "epoch": 0.01526, "grad_norm": 0.39092564582824707, "grad_norm_var": 0.0014234374246410498, "learning_rate": 5e-05, "loss": 0.1809, "loss/crossentropy": 2.6980656385421753, "loss/hidden": 0.0, "loss/logits": 0.18094860762357712, "loss/reg": 1.1170532703399658, "step": 1526 }, { "epoch": 0.01527, "grad_norm": 0.46039342880249023, "grad_norm_var": 0.0015493220358197223, "learning_rate": 5e-05, "loss": 0.1912, "loss/crossentropy": 2.7977852821350098, "loss/hidden": 0.0, "loss/logits": 0.1911815144121647, "loss/reg": 1.1160552501678467, "step": 1527 }, { "epoch": 0.01528, "grad_norm": 0.5170849561691284, "grad_norm_var": 0.002133071464395893, "learning_rate": 5e-05, "loss": 0.2056, "loss/crossentropy": 2.8577861189842224, "loss/hidden": 0.0, "loss/logits": 0.2055775336921215, "loss/reg": 1.1147196292877197, "step": 1528 }, { "epoch": 0.01529, "grad_norm": 0.40889468789100647, "grad_norm_var": 0.0021001187915984892, "learning_rate": 5e-05, "loss": 0.1939, "loss/crossentropy": 2.659440815448761, "loss/hidden": 0.0, "loss/logits": 0.1939220428466797, "loss/reg": 1.112278938293457, "step": 1529 }, { "epoch": 0.0153, "grad_norm": 0.4347395598888397, "grad_norm_var": 0.0019232800669717184, "learning_rate": 5e-05, "loss": 0.1986, "loss/crossentropy": 2.8192054629325867, "loss/hidden": 0.0, "loss/logits": 0.1986180804669857, "loss/reg": 1.1106244325637817, "step": 1530 }, { "epoch": 0.01531, "grad_norm": 0.3992787003517151, "grad_norm_var": 0.0017485805047590564, "learning_rate": 5e-05, "loss": 0.197, "loss/crossentropy": 2.817795991897583, "loss/hidden": 0.0, "loss/logits": 0.19699057191610336, "loss/reg": 1.1089487075805664, "step": 1531 }, { "epoch": 0.01532, "grad_norm": 0.4026416540145874, "grad_norm_var": 0.0015933646211622964, "learning_rate": 5e-05, "loss": 0.1914, "loss/crossentropy": 2.862860321998596, "loss/hidden": 0.0, "loss/logits": 0.19142070785164833, "loss/reg": 1.107688307762146, "step": 1532 }, { "epoch": 0.01533, "grad_norm": 0.3913553059101105, "grad_norm_var": 0.0014540163735419368, "learning_rate": 5e-05, "loss": 0.183, "loss/crossentropy": 2.7344642281532288, "loss/hidden": 0.0, "loss/logits": 0.1830131895840168, "loss/reg": 1.1065510511398315, "step": 1533 }, { "epoch": 0.01534, "grad_norm": 0.5044860243797302, "grad_norm_var": 0.0018893563688275254, "learning_rate": 5e-05, "loss": 0.2228, "loss/crossentropy": 2.7584890127182007, "loss/hidden": 0.0, "loss/logits": 0.22278590872883797, "loss/reg": 1.1057510375976562, "step": 1534 }, { "epoch": 0.01535, "grad_norm": 0.4163862466812134, "grad_norm_var": 0.0018698157391243955, "learning_rate": 5e-05, "loss": 0.2103, "loss/crossentropy": 2.730086088180542, "loss/hidden": 0.0, "loss/logits": 0.21033897623419762, "loss/reg": 1.1047923564910889, "step": 1535 }, { "epoch": 0.01536, "grad_norm": 0.49695706367492676, "grad_norm_var": 0.00199358529955254, "learning_rate": 5e-05, "loss": 0.2009, "loss/crossentropy": 2.966780424118042, "loss/hidden": 0.0, "loss/logits": 0.20090845972299576, "loss/reg": 1.1038782596588135, "step": 1536 }, { "epoch": 0.01537, "grad_norm": 0.5590111613273621, "grad_norm_var": 0.002815411142489177, "learning_rate": 5e-05, "loss": 0.2021, "loss/crossentropy": 2.7672842741012573, "loss/hidden": 0.0, "loss/logits": 0.20211173966526985, "loss/reg": 1.101956844329834, "step": 1537 }, { "epoch": 0.01538, "grad_norm": 0.41098716855049133, "grad_norm_var": 0.0028321029196908634, "learning_rate": 5e-05, "loss": 0.1785, "loss/crossentropy": 2.83419930934906, "loss/hidden": 0.0, "loss/logits": 0.17854700610041618, "loss/reg": 1.100677490234375, "step": 1538 }, { "epoch": 0.01539, "grad_norm": 0.5386168956756592, "grad_norm_var": 0.0031240819845108885, "learning_rate": 5e-05, "loss": 0.2135, "loss/crossentropy": 2.6488757133483887, "loss/hidden": 0.0, "loss/logits": 0.21346936747431755, "loss/reg": 1.0994669198989868, "step": 1539 }, { "epoch": 0.0154, "grad_norm": 1.3307965993881226, "grad_norm_var": 0.051540649597419906, "learning_rate": 5e-05, "loss": 0.1943, "loss/crossentropy": 2.665076494216919, "loss/hidden": 0.0, "loss/logits": 0.19429544359445572, "loss/reg": 1.0974620580673218, "step": 1540 }, { "epoch": 0.01541, "grad_norm": 0.4543551504611969, "grad_norm_var": 0.05127424614667148, "learning_rate": 5e-05, "loss": 0.2127, "loss/crossentropy": 2.8328760862350464, "loss/hidden": 0.0, "loss/logits": 0.2126643992960453, "loss/reg": 1.0955594778060913, "step": 1541 }, { "epoch": 0.01542, "grad_norm": 0.45249900221824646, "grad_norm_var": 0.05055573652069357, "learning_rate": 5e-05, "loss": 0.2003, "loss/crossentropy": 2.802838683128357, "loss/hidden": 0.0, "loss/logits": 0.20028693228960037, "loss/reg": 1.0934381484985352, "step": 1542 }, { "epoch": 0.01543, "grad_norm": 0.4271480143070221, "grad_norm_var": 0.0508498280485958, "learning_rate": 5e-05, "loss": 0.1916, "loss/crossentropy": 2.7794657349586487, "loss/hidden": 0.0, "loss/logits": 0.19155221432447433, "loss/reg": 1.09139084815979, "step": 1543 }, { "epoch": 0.01544, "grad_norm": 0.4178108870983124, "grad_norm_var": 0.051359794317072924, "learning_rate": 5e-05, "loss": 0.1807, "loss/crossentropy": 2.984102964401245, "loss/hidden": 0.0, "loss/logits": 0.18070745840668678, "loss/reg": 1.0887871980667114, "step": 1544 }, { "epoch": 0.01545, "grad_norm": 0.43329110741615295, "grad_norm_var": 0.05109129627541022, "learning_rate": 5e-05, "loss": 0.1711, "loss/crossentropy": 2.7687519192695618, "loss/hidden": 0.0, "loss/logits": 0.17112785205245018, "loss/reg": 1.0872141122817993, "step": 1545 }, { "epoch": 0.01546, "grad_norm": 0.4670257866382599, "grad_norm_var": 0.050856580550541, "learning_rate": 5e-05, "loss": 0.1948, "loss/crossentropy": 2.6094987988471985, "loss/hidden": 0.0, "loss/logits": 0.19480736926198006, "loss/reg": 1.0858657360076904, "step": 1546 }, { "epoch": 0.01547, "grad_norm": 0.40323877334594727, "grad_norm_var": 0.05080099145439707, "learning_rate": 5e-05, "loss": 0.1918, "loss/crossentropy": 2.9429529309272766, "loss/hidden": 0.0, "loss/logits": 0.1918117143213749, "loss/reg": 1.0839253664016724, "step": 1547 }, { "epoch": 0.01548, "grad_norm": 0.37677061557769775, "grad_norm_var": 0.05120164181760689, "learning_rate": 5e-05, "loss": 0.184, "loss/crossentropy": 2.781135082244873, "loss/hidden": 0.0, "loss/logits": 0.18400835990905762, "loss/reg": 1.0825845003128052, "step": 1548 }, { "epoch": 0.01549, "grad_norm": 0.4067257046699524, "grad_norm_var": 0.05098341124146938, "learning_rate": 5e-05, "loss": 0.1959, "loss/crossentropy": 3.0511435866355896, "loss/hidden": 0.0, "loss/logits": 0.195915374904871, "loss/reg": 1.0807474851608276, "step": 1549 }, { "epoch": 0.0155, "grad_norm": 0.41131263971328735, "grad_norm_var": 0.05154488197435978, "learning_rate": 5e-05, "loss": 0.2111, "loss/crossentropy": 2.7555925250053406, "loss/hidden": 0.0, "loss/logits": 0.21109894663095474, "loss/reg": 1.079037070274353, "step": 1550 }, { "epoch": 0.01551, "grad_norm": 0.395988792181015, "grad_norm_var": 0.05179878503089578, "learning_rate": 5e-05, "loss": 0.2093, "loss/crossentropy": 2.9266753792762756, "loss/hidden": 0.0, "loss/logits": 0.2093418724834919, "loss/reg": 1.0777536630630493, "step": 1551 }, { "epoch": 0.01552, "grad_norm": 0.377541184425354, "grad_norm_var": 0.0527211149077442, "learning_rate": 5e-05, "loss": 0.1792, "loss/crossentropy": 2.864599108695984, "loss/hidden": 0.0, "loss/logits": 0.17924968153238297, "loss/reg": 1.0760817527770996, "step": 1552 }, { "epoch": 0.01553, "grad_norm": 0.3881519138813019, "grad_norm_var": 0.05300642886035351, "learning_rate": 5e-05, "loss": 0.1847, "loss/crossentropy": 2.7285693287849426, "loss/hidden": 0.0, "loss/logits": 0.18472957983613014, "loss/reg": 1.0742844343185425, "step": 1553 }, { "epoch": 0.01554, "grad_norm": 0.706222653388977, "grad_norm_var": 0.05570734295763023, "learning_rate": 5e-05, "loss": 0.2179, "loss/crossentropy": 2.935713231563568, "loss/hidden": 0.0, "loss/logits": 0.21790685132145882, "loss/reg": 1.0728493928909302, "step": 1554 }, { "epoch": 0.01555, "grad_norm": 0.43061429262161255, "grad_norm_var": 0.055869027275741875, "learning_rate": 5e-05, "loss": 0.1799, "loss/crossentropy": 2.8454378843307495, "loss/hidden": 0.0, "loss/logits": 0.1798604428768158, "loss/reg": 1.0717158317565918, "step": 1555 }, { "epoch": 0.01556, "grad_norm": 0.48051029443740845, "grad_norm_var": 0.006013161612129384, "learning_rate": 5e-05, "loss": 0.1964, "loss/crossentropy": 2.836775243282318, "loss/hidden": 0.0, "loss/logits": 0.19637484848499298, "loss/reg": 1.0707546472549438, "step": 1556 }, { "epoch": 0.01557, "grad_norm": 0.4592551290988922, "grad_norm_var": 0.006024481601364788, "learning_rate": 5e-05, "loss": 0.1982, "loss/crossentropy": 2.8153125047683716, "loss/hidden": 0.0, "loss/logits": 0.1982250064611435, "loss/reg": 1.0695213079452515, "step": 1557 }, { "epoch": 0.01558, "grad_norm": 0.4210123121738434, "grad_norm_var": 0.006032424887954294, "learning_rate": 5e-05, "loss": 0.1965, "loss/crossentropy": 2.8357452750205994, "loss/hidden": 0.0, "loss/logits": 0.1964869536459446, "loss/reg": 1.0685877799987793, "step": 1558 }, { "epoch": 0.01559, "grad_norm": 0.4791198968887329, "grad_norm_var": 0.00612837245298895, "learning_rate": 5e-05, "loss": 0.2072, "loss/crossentropy": 2.9406415820121765, "loss/hidden": 0.0, "loss/logits": 0.20718776062130928, "loss/reg": 1.0677986145019531, "step": 1559 }, { "epoch": 0.0156, "grad_norm": 0.401761531829834, "grad_norm_var": 0.006193905709313973, "learning_rate": 5e-05, "loss": 0.1813, "loss/crossentropy": 2.733909487724304, "loss/hidden": 0.0, "loss/logits": 0.18126443028450012, "loss/reg": 1.0671542882919312, "step": 1560 }, { "epoch": 0.01561, "grad_norm": 0.37871912121772766, "grad_norm_var": 0.006428189979391045, "learning_rate": 5e-05, "loss": 0.1826, "loss/crossentropy": 2.821140468120575, "loss/hidden": 0.0, "loss/logits": 0.18263829499483109, "loss/reg": 1.0664088726043701, "step": 1561 }, { "epoch": 0.01562, "grad_norm": 0.40654441714286804, "grad_norm_var": 0.0064106344187308505, "learning_rate": 5e-05, "loss": 0.1958, "loss/crossentropy": 2.847402274608612, "loss/hidden": 0.0, "loss/logits": 0.19583850726485252, "loss/reg": 1.0656745433807373, "step": 1562 }, { "epoch": 0.01563, "grad_norm": 0.39909881353378296, "grad_norm_var": 0.006427978041990023, "learning_rate": 5e-05, "loss": 0.187, "loss/crossentropy": 2.8001545667648315, "loss/hidden": 0.0, "loss/logits": 0.1870410367846489, "loss/reg": 1.0649704933166504, "step": 1563 }, { "epoch": 0.01564, "grad_norm": 0.44327160716056824, "grad_norm_var": 0.006210596260875845, "learning_rate": 5e-05, "loss": 0.1935, "loss/crossentropy": 2.895441710948944, "loss/hidden": 0.0, "loss/logits": 0.19354696199297905, "loss/reg": 1.0645458698272705, "step": 1564 }, { "epoch": 0.01565, "grad_norm": 0.43015867471694946, "grad_norm_var": 0.006151527259060663, "learning_rate": 5e-05, "loss": 0.2007, "loss/crossentropy": 2.6846776008605957, "loss/hidden": 0.0, "loss/logits": 0.20071979612112045, "loss/reg": 1.0638952255249023, "step": 1565 }, { "epoch": 0.01566, "grad_norm": 0.4013887643814087, "grad_norm_var": 0.006193100862394488, "learning_rate": 5e-05, "loss": 0.1911, "loss/crossentropy": 2.6647936701774597, "loss/hidden": 0.0, "loss/logits": 0.19105948507785797, "loss/reg": 1.063437819480896, "step": 1566 }, { "epoch": 0.01567, "grad_norm": 0.3776607811450958, "grad_norm_var": 0.00631544015384116, "learning_rate": 5e-05, "loss": 0.1794, "loss/crossentropy": 2.7828028202056885, "loss/hidden": 0.0, "loss/logits": 0.17943480983376503, "loss/reg": 1.0625545978546143, "step": 1567 }, { "epoch": 0.01568, "grad_norm": 0.4237598478794098, "grad_norm_var": 0.006086760813740237, "learning_rate": 5e-05, "loss": 0.2011, "loss/crossentropy": 2.6718530654907227, "loss/hidden": 0.0, "loss/logits": 0.20108015462756157, "loss/reg": 1.0613658428192139, "step": 1568 }, { "epoch": 0.01569, "grad_norm": 0.3995186686515808, "grad_norm_var": 0.006017464457152703, "learning_rate": 5e-05, "loss": 0.1917, "loss/crossentropy": 2.5245614051818848, "loss/hidden": 0.0, "loss/logits": 0.1916515864431858, "loss/reg": 1.0607917308807373, "step": 1569 }, { "epoch": 0.0157, "grad_norm": 0.38678285479545593, "grad_norm_var": 0.0010524458516556479, "learning_rate": 5e-05, "loss": 0.1844, "loss/crossentropy": 2.8043148517608643, "loss/hidden": 0.0, "loss/logits": 0.18437979742884636, "loss/reg": 1.0602530241012573, "step": 1570 }, { "epoch": 0.01571, "grad_norm": 0.40594568848609924, "grad_norm_var": 0.0010553984485499417, "learning_rate": 5e-05, "loss": 0.1904, "loss/crossentropy": 2.783930778503418, "loss/hidden": 0.0, "loss/logits": 0.19035086408257484, "loss/reg": 1.0592772960662842, "step": 1571 }, { "epoch": 0.01572, "grad_norm": 0.4098511040210724, "grad_norm_var": 0.0007823522709671089, "learning_rate": 5e-05, "loss": 0.2076, "loss/crossentropy": 2.5536091327667236, "loss/hidden": 0.0, "loss/logits": 0.20764099434018135, "loss/reg": 1.0582836866378784, "step": 1572 }, { "epoch": 0.01573, "grad_norm": 2.0963947772979736, "grad_norm_var": 0.17817707747996261, "learning_rate": 5e-05, "loss": 0.2349, "loss/crossentropy": 2.964250385761261, "loss/hidden": 0.0, "loss/logits": 0.234896432608366, "loss/reg": 1.0572166442871094, "step": 1573 }, { "epoch": 0.01574, "grad_norm": 0.45353516936302185, "grad_norm_var": 0.17782993109395742, "learning_rate": 5e-05, "loss": 0.1932, "loss/crossentropy": 2.8702264428138733, "loss/hidden": 0.0, "loss/logits": 0.19320252165198326, "loss/reg": 1.056014060974121, "step": 1574 }, { "epoch": 0.01575, "grad_norm": 0.44855397939682007, "grad_norm_var": 0.17804818136024572, "learning_rate": 5e-05, "loss": 0.1982, "loss/crossentropy": 2.949871063232422, "loss/hidden": 0.0, "loss/logits": 0.1982114426791668, "loss/reg": 1.0550678968429565, "step": 1575 }, { "epoch": 0.01576, "grad_norm": 0.4332554340362549, "grad_norm_var": 0.17762864137172196, "learning_rate": 5e-05, "loss": 0.191, "loss/crossentropy": 2.7771596908569336, "loss/hidden": 0.0, "loss/logits": 0.191011194139719, "loss/reg": 1.05426025390625, "step": 1576 }, { "epoch": 0.01577, "grad_norm": 0.38208287954330444, "grad_norm_var": 0.17756670040897735, "learning_rate": 5e-05, "loss": 0.1773, "loss/crossentropy": 2.774085760116577, "loss/hidden": 0.0, "loss/logits": 0.17731792107224464, "loss/reg": 1.0530881881713867, "step": 1577 }, { "epoch": 0.01578, "grad_norm": 0.4420539140701294, "grad_norm_var": 0.17711490965873147, "learning_rate": 5e-05, "loss": 0.21, "loss/crossentropy": 2.8334729075431824, "loss/hidden": 0.0, "loss/logits": 0.20999984815716743, "loss/reg": 1.052188515663147, "step": 1578 }, { "epoch": 0.01579, "grad_norm": 0.42621752619743347, "grad_norm_var": 0.17672070717077115, "learning_rate": 5e-05, "loss": 0.1972, "loss/crossentropy": 2.653954863548279, "loss/hidden": 0.0, "loss/logits": 0.1972290314733982, "loss/reg": 1.051479697227478, "step": 1579 }, { "epoch": 0.0158, "grad_norm": 0.4033251404762268, "grad_norm_var": 0.17724256929511586, "learning_rate": 5e-05, "loss": 0.1978, "loss/crossentropy": 2.7112594842910767, "loss/hidden": 0.0, "loss/logits": 0.19781038537621498, "loss/reg": 1.0506196022033691, "step": 1580 }, { "epoch": 0.01581, "grad_norm": 0.39284470677375793, "grad_norm_var": 0.1777767191095864, "learning_rate": 5e-05, "loss": 0.1847, "loss/crossentropy": 2.7284326553344727, "loss/hidden": 0.0, "loss/logits": 0.18470782414078712, "loss/reg": 1.0500985383987427, "step": 1581 }, { "epoch": 0.01582, "grad_norm": 0.37770310044288635, "grad_norm_var": 0.17817909777804894, "learning_rate": 5e-05, "loss": 0.1905, "loss/crossentropy": 2.775408685207367, "loss/hidden": 0.0, "loss/logits": 0.19050265476107597, "loss/reg": 1.0495445728302002, "step": 1582 }, { "epoch": 0.01583, "grad_norm": 0.4136863946914673, "grad_norm_var": 0.17759466596601922, "learning_rate": 5e-05, "loss": 0.1884, "loss/crossentropy": 2.937465488910675, "loss/hidden": 0.0, "loss/logits": 0.18836676329374313, "loss/reg": 1.0486500263214111, "step": 1583 }, { "epoch": 0.01584, "grad_norm": 0.6148908138275146, "grad_norm_var": 0.17746426467375553, "learning_rate": 5e-05, "loss": 0.1929, "loss/crossentropy": 2.843154191970825, "loss/hidden": 0.0, "loss/logits": 0.1928807981312275, "loss/reg": 1.0476218461990356, "step": 1584 }, { "epoch": 0.01585, "grad_norm": 0.44110244512557983, "grad_norm_var": 0.17684658441090229, "learning_rate": 5e-05, "loss": 0.2037, "loss/crossentropy": 2.7909162044525146, "loss/hidden": 0.0, "loss/logits": 0.2037404477596283, "loss/reg": 1.0467737913131714, "step": 1585 }, { "epoch": 0.01586, "grad_norm": 0.397225022315979, "grad_norm_var": 0.17664980315666612, "learning_rate": 5e-05, "loss": 0.1753, "loss/crossentropy": 2.7290444374084473, "loss/hidden": 0.0, "loss/logits": 0.17533109337091446, "loss/reg": 1.0455046892166138, "step": 1586 }, { "epoch": 0.01587, "grad_norm": 0.43149325251579285, "grad_norm_var": 0.17625553391335924, "learning_rate": 5e-05, "loss": 0.1832, "loss/crossentropy": 2.9028236269950867, "loss/hidden": 0.0, "loss/logits": 0.18315931409597397, "loss/reg": 1.0444459915161133, "step": 1587 }, { "epoch": 0.01588, "grad_norm": 0.5195469260215759, "grad_norm_var": 0.17517331211055512, "learning_rate": 5e-05, "loss": 0.2144, "loss/crossentropy": 2.842075824737549, "loss/hidden": 0.0, "loss/logits": 0.21435903385281563, "loss/reg": 1.043382167816162, "step": 1588 }, { "epoch": 0.01589, "grad_norm": 0.39358392357826233, "grad_norm_var": 0.0035111967362226287, "learning_rate": 5e-05, "loss": 0.1765, "loss/crossentropy": 2.796789824962616, "loss/hidden": 0.0, "loss/logits": 0.17651647701859474, "loss/reg": 1.042464256286621, "step": 1589 }, { "epoch": 0.0159, "grad_norm": 0.38947010040283203, "grad_norm_var": 0.00361531631975656, "learning_rate": 5e-05, "loss": 0.1797, "loss/crossentropy": 2.7255443334579468, "loss/hidden": 0.0, "loss/logits": 0.17971770092844963, "loss/reg": 1.0416078567504883, "step": 1590 }, { "epoch": 0.01591, "grad_norm": 0.3702358901500702, "grad_norm_var": 0.003822570496432457, "learning_rate": 5e-05, "loss": 0.1738, "loss/crossentropy": 2.6745933890342712, "loss/hidden": 0.0, "loss/logits": 0.17375827953219414, "loss/reg": 1.0408596992492676, "step": 1591 }, { "epoch": 0.01592, "grad_norm": 0.4127400815486908, "grad_norm_var": 0.0038312033002130524, "learning_rate": 5e-05, "loss": 0.1792, "loss/crossentropy": 2.825109601020813, "loss/hidden": 0.0, "loss/logits": 0.1792273335158825, "loss/reg": 1.0395222902297974, "step": 1592 }, { "epoch": 0.01593, "grad_norm": 0.44498032331466675, "grad_norm_var": 0.00371424276920929, "learning_rate": 5e-05, "loss": 0.1982, "loss/crossentropy": 2.847081184387207, "loss/hidden": 0.0, "loss/logits": 0.19815902039408684, "loss/reg": 1.038297176361084, "step": 1593 }, { "epoch": 0.01594, "grad_norm": 0.3808656930923462, "grad_norm_var": 0.0038453633106530346, "learning_rate": 5e-05, "loss": 0.1886, "loss/crossentropy": 2.8437030911445618, "loss/hidden": 0.0, "loss/logits": 0.18859212845563889, "loss/reg": 1.037431240081787, "step": 1594 }, { "epoch": 0.01595, "grad_norm": 0.4359935522079468, "grad_norm_var": 0.0038521160414555075, "learning_rate": 5e-05, "loss": 0.2084, "loss/crossentropy": 2.7890624403953552, "loss/hidden": 0.0, "loss/logits": 0.20844709128141403, "loss/reg": 1.035805344581604, "step": 1595 }, { "epoch": 0.01596, "grad_norm": 0.3994602560997009, "grad_norm_var": 0.0038648531464093713, "learning_rate": 5e-05, "loss": 0.1794, "loss/crossentropy": 2.976751923561096, "loss/hidden": 0.0, "loss/logits": 0.17941803485155106, "loss/reg": 1.034838080406189, "step": 1596 }, { "epoch": 0.01597, "grad_norm": 0.4446309208869934, "grad_norm_var": 0.0038036113876331397, "learning_rate": 5e-05, "loss": 0.2044, "loss/crossentropy": 2.6885805130004883, "loss/hidden": 0.0, "loss/logits": 0.20441588386893272, "loss/reg": 1.0337451696395874, "step": 1597 }, { "epoch": 0.01598, "grad_norm": 0.38748055696487427, "grad_norm_var": 0.0037424185106978165, "learning_rate": 5e-05, "loss": 0.1867, "loss/crossentropy": 2.676852226257324, "loss/hidden": 0.0, "loss/logits": 0.18669695407152176, "loss/reg": 1.0320935249328613, "step": 1598 }, { "epoch": 0.01599, "grad_norm": 0.3517172932624817, "grad_norm_var": 0.004115871219999908, "learning_rate": 5e-05, "loss": 0.1697, "loss/crossentropy": 2.7314975261688232, "loss/hidden": 0.0, "loss/logits": 0.16971135139465332, "loss/reg": 1.03118097782135, "step": 1599 }, { "epoch": 0.016, "grad_norm": 0.3928627669811249, "grad_norm_var": 0.001603946516322626, "learning_rate": 5e-05, "loss": 0.1882, "loss/crossentropy": 2.722111701965332, "loss/hidden": 0.0, "loss/logits": 0.1881674863398075, "loss/reg": 1.0301860570907593, "step": 1600 }, { "epoch": 0.01601, "grad_norm": 0.43568360805511475, "grad_norm_var": 0.0015848176222982625, "learning_rate": 5e-05, "loss": 0.2035, "loss/crossentropy": 2.847353994846344, "loss/hidden": 0.0, "loss/logits": 0.20354657620191574, "loss/reg": 1.0290837287902832, "step": 1601 }, { "epoch": 0.01602, "grad_norm": 0.4481942355632782, "grad_norm_var": 0.0016484863625255031, "learning_rate": 5e-05, "loss": 0.1995, "loss/crossentropy": 2.8443565368652344, "loss/hidden": 0.0, "loss/logits": 0.19950192049145699, "loss/reg": 1.0280483961105347, "step": 1602 }, { "epoch": 0.01603, "grad_norm": 0.375472754240036, "grad_norm_var": 0.0017209400432947792, "learning_rate": 5e-05, "loss": 0.1828, "loss/crossentropy": 2.907568633556366, "loss/hidden": 0.0, "loss/logits": 0.18281982839107513, "loss/reg": 1.0267391204833984, "step": 1603 }, { "epoch": 0.01604, "grad_norm": 0.43082571029663086, "grad_norm_var": 0.000933965990925841, "learning_rate": 5e-05, "loss": 0.1993, "loss/crossentropy": 2.843041956424713, "loss/hidden": 0.0, "loss/logits": 0.19929581135511398, "loss/reg": 1.0255545377731323, "step": 1604 }, { "epoch": 0.01605, "grad_norm": 0.46895185112953186, "grad_norm_var": 0.0011653483970377318, "learning_rate": 5e-05, "loss": 0.1959, "loss/crossentropy": 2.8054389357566833, "loss/hidden": 0.0, "loss/logits": 0.1959143802523613, "loss/reg": 1.024338722229004, "step": 1605 }, { "epoch": 0.01606, "grad_norm": 0.40604454278945923, "grad_norm_var": 0.0011358271508217518, "learning_rate": 5e-05, "loss": 0.1909, "loss/crossentropy": 2.6769182085990906, "loss/hidden": 0.0, "loss/logits": 0.19093864411115646, "loss/reg": 1.0232833623886108, "step": 1606 }, { "epoch": 0.01607, "grad_norm": 0.3953154683113098, "grad_norm_var": 0.0010367066058140713, "learning_rate": 5e-05, "loss": 0.1903, "loss/crossentropy": 2.784206509590149, "loss/hidden": 0.0, "loss/logits": 0.1903013363480568, "loss/reg": 1.0223585367202759, "step": 1607 }, { "epoch": 0.01608, "grad_norm": 0.4797690510749817, "grad_norm_var": 0.0013133904404337778, "learning_rate": 5e-05, "loss": 0.2239, "loss/crossentropy": 2.7858375906944275, "loss/hidden": 0.0, "loss/logits": 0.2238890789449215, "loss/reg": 1.0218397378921509, "step": 1608 }, { "epoch": 0.01609, "grad_norm": 0.39962831139564514, "grad_norm_var": 0.0012751071067100051, "learning_rate": 5e-05, "loss": 0.188, "loss/crossentropy": 2.769327938556671, "loss/hidden": 0.0, "loss/logits": 0.1880376748740673, "loss/reg": 1.0211185216903687, "step": 1609 }, { "epoch": 0.0161, "grad_norm": 0.4070293605327606, "grad_norm_var": 0.0012003623105043696, "learning_rate": 5e-05, "loss": 0.1911, "loss/crossentropy": 2.886406660079956, "loss/hidden": 0.0, "loss/logits": 0.1911228708922863, "loss/reg": 1.0199602842330933, "step": 1610 }, { "epoch": 0.01611, "grad_norm": 0.3688849210739136, "grad_norm_var": 0.001304648081967604, "learning_rate": 5e-05, "loss": 0.1884, "loss/crossentropy": 2.839527428150177, "loss/hidden": 0.0, "loss/logits": 0.188365176320076, "loss/reg": 1.0188500881195068, "step": 1611 }, { "epoch": 0.01612, "grad_norm": 0.3744886815547943, "grad_norm_var": 0.001385363352798156, "learning_rate": 5e-05, "loss": 0.1894, "loss/crossentropy": 2.902321934700012, "loss/hidden": 0.0, "loss/logits": 0.18943318352103233, "loss/reg": 1.017642617225647, "step": 1612 }, { "epoch": 0.01613, "grad_norm": 0.39889490604400635, "grad_norm_var": 0.0013075760766252942, "learning_rate": 5e-05, "loss": 0.2063, "loss/crossentropy": 2.8871779441833496, "loss/hidden": 0.0, "loss/logits": 0.20626594126224518, "loss/reg": 1.016883134841919, "step": 1613 }, { "epoch": 0.01614, "grad_norm": 0.583963930606842, "grad_norm_var": 0.00319393139732812, "learning_rate": 5e-05, "loss": 0.1922, "loss/crossentropy": 2.8867177963256836, "loss/hidden": 0.0, "loss/logits": 0.19224493950605392, "loss/reg": 1.0164117813110352, "step": 1614 }, { "epoch": 0.01615, "grad_norm": 0.46384310722351074, "grad_norm_var": 0.0029609833884467586, "learning_rate": 5e-05, "loss": 0.1865, "loss/crossentropy": 2.8498631715774536, "loss/hidden": 0.0, "loss/logits": 0.18652616068720818, "loss/reg": 1.01522958278656, "step": 1615 }, { "epoch": 0.01616, "grad_norm": 0.45459720492362976, "grad_norm_var": 0.002919291729982876, "learning_rate": 5e-05, "loss": 0.1997, "loss/crossentropy": 2.9225170612335205, "loss/hidden": 0.0, "loss/logits": 0.19967232644557953, "loss/reg": 1.0138804912567139, "step": 1616 }, { "epoch": 0.01617, "grad_norm": 0.4312615394592285, "grad_norm_var": 0.0029175898021926217, "learning_rate": 5e-05, "loss": 0.1956, "loss/crossentropy": 2.8372015953063965, "loss/hidden": 0.0, "loss/logits": 0.1956445276737213, "loss/reg": 1.0130113363265991, "step": 1617 }, { "epoch": 0.01618, "grad_norm": 0.41878917813301086, "grad_norm_var": 0.0029020530857978622, "learning_rate": 5e-05, "loss": 0.2, "loss/crossentropy": 2.7289316654205322, "loss/hidden": 0.0, "loss/logits": 0.20003606751561165, "loss/reg": 1.0116477012634277, "step": 1618 }, { "epoch": 0.01619, "grad_norm": 0.3937804698944092, "grad_norm_var": 0.002793291740125229, "learning_rate": 5e-05, "loss": 0.1809, "loss/crossentropy": 2.9862471222877502, "loss/hidden": 0.0, "loss/logits": 0.18085769563913345, "loss/reg": 1.0105339288711548, "step": 1619 }, { "epoch": 0.0162, "grad_norm": 0.38236817717552185, "grad_norm_var": 0.0029331274073497783, "learning_rate": 5e-05, "loss": 0.1781, "loss/crossentropy": 2.660866141319275, "loss/hidden": 0.0, "loss/logits": 0.17813289538025856, "loss/reg": 1.0092902183532715, "step": 1620 }, { "epoch": 0.01621, "grad_norm": 2.7290685176849365, "grad_norm_var": 0.3349158996765001, "learning_rate": 5e-05, "loss": 0.2398, "loss/crossentropy": 2.8020981550216675, "loss/hidden": 0.0, "loss/logits": 0.2397916540503502, "loss/reg": 1.0086039304733276, "step": 1621 }, { "epoch": 0.01622, "grad_norm": 0.3994118869304657, "grad_norm_var": 0.33506186009023053, "learning_rate": 5e-05, "loss": 0.18, "loss/crossentropy": 2.760943591594696, "loss/hidden": 0.0, "loss/logits": 0.1800093874335289, "loss/reg": 1.0072078704833984, "step": 1622 }, { "epoch": 0.01623, "grad_norm": 0.5013747215270996, "grad_norm_var": 0.3333290261657852, "learning_rate": 5e-05, "loss": 0.206, "loss/crossentropy": 2.761034667491913, "loss/hidden": 0.0, "loss/logits": 0.20601341128349304, "loss/reg": 1.0063005685806274, "step": 1623 }, { "epoch": 0.01624, "grad_norm": 0.5265280604362488, "grad_norm_var": 0.3328769613967595, "learning_rate": 5e-05, "loss": 0.1844, "loss/crossentropy": 2.8885090947151184, "loss/hidden": 0.0, "loss/logits": 0.18442480266094208, "loss/reg": 1.0057168006896973, "step": 1624 }, { "epoch": 0.01625, "grad_norm": 0.4721510708332062, "grad_norm_var": 0.3314893959527416, "learning_rate": 5e-05, "loss": 0.1941, "loss/crossentropy": 2.6390693187713623, "loss/hidden": 0.0, "loss/logits": 0.19406652450561523, "loss/reg": 1.0045299530029297, "step": 1625 }, { "epoch": 0.01626, "grad_norm": 0.45044615864753723, "grad_norm_var": 0.3305963341312829, "learning_rate": 5e-05, "loss": 0.1854, "loss/crossentropy": 2.8466389775276184, "loss/hidden": 0.0, "loss/logits": 0.18537183478474617, "loss/reg": 1.0036040544509888, "step": 1626 }, { "epoch": 0.01627, "grad_norm": 0.4176648259162903, "grad_norm_var": 0.3293435667823592, "learning_rate": 5e-05, "loss": 0.1729, "loss/crossentropy": 2.8753581643104553, "loss/hidden": 0.0, "loss/logits": 0.1729147806763649, "loss/reg": 1.0026119947433472, "step": 1627 }, { "epoch": 0.01628, "grad_norm": 0.45000559091567993, "grad_norm_var": 0.32755605843470553, "learning_rate": 5e-05, "loss": 0.1877, "loss/crossentropy": 2.8436471819877625, "loss/hidden": 0.0, "loss/logits": 0.18771208450198174, "loss/reg": 1.0014100074768066, "step": 1628 }, { "epoch": 0.01629, "grad_norm": 0.6647088527679443, "grad_norm_var": 0.32512335965386835, "learning_rate": 5e-05, "loss": 0.247, "loss/crossentropy": 3.064721703529358, "loss/hidden": 0.0, "loss/logits": 0.24700120091438293, "loss/reg": 1.0006461143493652, "step": 1629 }, { "epoch": 0.0163, "grad_norm": 0.4309985935688019, "grad_norm_var": 0.3270912337702334, "learning_rate": 5e-05, "loss": 0.2076, "loss/crossentropy": 2.7790536284446716, "loss/hidden": 0.0, "loss/logits": 0.20761951059103012, "loss/reg": 0.9996326565742493, "step": 1630 }, { "epoch": 0.01631, "grad_norm": 0.4092963933944702, "grad_norm_var": 0.32826153742197073, "learning_rate": 5e-05, "loss": 0.1858, "loss/crossentropy": 2.753304898738861, "loss/hidden": 0.0, "loss/logits": 0.1858477033674717, "loss/reg": 0.998843252658844, "step": 1631 }, { "epoch": 0.01632, "grad_norm": 0.43260130286216736, "grad_norm_var": 0.32870582994017805, "learning_rate": 5e-05, "loss": 0.2128, "loss/crossentropy": 2.7732598185539246, "loss/hidden": 0.0, "loss/logits": 0.2127918191254139, "loss/reg": 0.9981245994567871, "step": 1632 }, { "epoch": 0.01633, "grad_norm": 0.4002577066421509, "grad_norm_var": 0.3294403105987868, "learning_rate": 5e-05, "loss": 0.1824, "loss/crossentropy": 2.7656468749046326, "loss/hidden": 0.0, "loss/logits": 0.18241503462195396, "loss/reg": 0.996983528137207, "step": 1633 }, { "epoch": 0.01634, "grad_norm": 0.4860977530479431, "grad_norm_var": 0.3281648073013982, "learning_rate": 5e-05, "loss": 0.2161, "loss/crossentropy": 2.7581685185432434, "loss/hidden": 0.0, "loss/logits": 0.2160632722079754, "loss/reg": 0.9963746666908264, "step": 1634 }, { "epoch": 0.01635, "grad_norm": 0.7037771940231323, "grad_norm_var": 0.3257848148583186, "learning_rate": 5e-05, "loss": 0.1847, "loss/crossentropy": 2.8083826303482056, "loss/hidden": 0.0, "loss/logits": 0.1846979483962059, "loss/reg": 0.9959037899971008, "step": 1635 }, { "epoch": 0.01636, "grad_norm": 0.3976283669471741, "grad_norm_var": 0.3253239044098365, "learning_rate": 5e-05, "loss": 0.1766, "loss/crossentropy": 2.87517386674881, "loss/hidden": 0.0, "loss/logits": 0.17655937373638153, "loss/reg": 0.9958146214485168, "step": 1636 }, { "epoch": 0.01637, "grad_norm": 0.40411144495010376, "grad_norm_var": 0.008434168810514371, "learning_rate": 5e-05, "loss": 0.188, "loss/crossentropy": 2.9585282802581787, "loss/hidden": 0.0, "loss/logits": 0.18801874667406082, "loss/reg": 0.9954627156257629, "step": 1637 }, { "epoch": 0.01638, "grad_norm": 0.47144263982772827, "grad_norm_var": 0.008064267432894153, "learning_rate": 5e-05, "loss": 0.2147, "loss/crossentropy": 2.8516178727149963, "loss/hidden": 0.0, "loss/logits": 0.21468612551689148, "loss/reg": 0.9962711930274963, "step": 1638 }, { "epoch": 0.01639, "grad_norm": 0.4136725068092346, "grad_norm_var": 0.00825053359325154, "learning_rate": 5e-05, "loss": 0.1893, "loss/crossentropy": 2.710533022880554, "loss/hidden": 0.0, "loss/logits": 0.189330842345953, "loss/reg": 0.9961523413658142, "step": 1639 }, { "epoch": 0.0164, "grad_norm": 0.4156164824962616, "grad_norm_var": 0.008193946810416774, "learning_rate": 5e-05, "loss": 0.2061, "loss/crossentropy": 2.6554945707321167, "loss/hidden": 0.0, "loss/logits": 0.206108208745718, "loss/reg": 0.9952605962753296, "step": 1640 }, { "epoch": 0.01641, "grad_norm": 0.5130243897438049, "grad_norm_var": 0.008343982594008044, "learning_rate": 5e-05, "loss": 0.2016, "loss/crossentropy": 2.844529688358307, "loss/hidden": 0.0, "loss/logits": 0.2015930339694023, "loss/reg": 0.9954249858856201, "step": 1641 }, { "epoch": 0.01642, "grad_norm": 0.40717145800590515, "grad_norm_var": 0.00855270077346663, "learning_rate": 5e-05, "loss": 0.1893, "loss/crossentropy": 2.8812676668167114, "loss/hidden": 0.0, "loss/logits": 0.18932242318987846, "loss/reg": 0.9956634044647217, "step": 1642 }, { "epoch": 0.01643, "grad_norm": 0.4588758051395416, "grad_norm_var": 0.008406279557532174, "learning_rate": 5e-05, "loss": 0.2035, "loss/crossentropy": 2.959150493144989, "loss/hidden": 0.0, "loss/logits": 0.20350589975714684, "loss/reg": 0.9954665899276733, "step": 1643 }, { "epoch": 0.01644, "grad_norm": 0.41808342933654785, "grad_norm_var": 0.00853891966052895, "learning_rate": 5e-05, "loss": 0.1793, "loss/crossentropy": 2.720207452774048, "loss/hidden": 0.0, "loss/logits": 0.17933955788612366, "loss/reg": 0.9951499700546265, "step": 1644 }, { "epoch": 0.01645, "grad_norm": 0.4281888008117676, "grad_norm_var": 0.0057123534006301175, "learning_rate": 5e-05, "loss": 0.2003, "loss/crossentropy": 2.7021594643592834, "loss/hidden": 0.0, "loss/logits": 0.20027698203921318, "loss/reg": 0.9951516389846802, "step": 1645 }, { "epoch": 0.01646, "grad_norm": 0.4419175088405609, "grad_norm_var": 0.0056929746093528485, "learning_rate": 5e-05, "loss": 0.1888, "loss/crossentropy": 2.7900770902633667, "loss/hidden": 0.0, "loss/logits": 0.18883102387189865, "loss/reg": 0.9944278597831726, "step": 1646 }, { "epoch": 0.01647, "grad_norm": 0.406677782535553, "grad_norm_var": 0.005707653242778928, "learning_rate": 5e-05, "loss": 0.2018, "loss/crossentropy": 2.8403636813163757, "loss/hidden": 0.0, "loss/logits": 0.20182918012142181, "loss/reg": 0.9936374425888062, "step": 1647 }, { "epoch": 0.01648, "grad_norm": 0.39660346508026123, "grad_norm_var": 0.005871895630400322, "learning_rate": 5e-05, "loss": 0.1949, "loss/crossentropy": 2.6434635519981384, "loss/hidden": 0.0, "loss/logits": 0.19490381330251694, "loss/reg": 0.9928113222122192, "step": 1648 }, { "epoch": 0.01649, "grad_norm": 0.38837823271751404, "grad_norm_var": 0.005955855741034688, "learning_rate": 5e-05, "loss": 0.196, "loss/crossentropy": 2.7492038011550903, "loss/hidden": 0.0, "loss/logits": 0.19599515572190285, "loss/reg": 0.9914868474006653, "step": 1649 }, { "epoch": 0.0165, "grad_norm": 0.39013776183128357, "grad_norm_var": 0.006030547116352913, "learning_rate": 5e-05, "loss": 0.1913, "loss/crossentropy": 2.8989550471305847, "loss/hidden": 0.0, "loss/logits": 0.19131910800933838, "loss/reg": 0.9906550645828247, "step": 1650 }, { "epoch": 0.01651, "grad_norm": 0.46770238876342773, "grad_norm_var": 0.0012410480978832794, "learning_rate": 5e-05, "loss": 0.194, "loss/crossentropy": 3.0211783051490784, "loss/hidden": 0.0, "loss/logits": 0.19395827502012253, "loss/reg": 0.9903994798660278, "step": 1651 }, { "epoch": 0.01652, "grad_norm": 0.4251278340816498, "grad_norm_var": 0.0011835438271421083, "learning_rate": 5e-05, "loss": 0.1983, "loss/crossentropy": 2.7187276482582092, "loss/hidden": 0.0, "loss/logits": 0.1982722319662571, "loss/reg": 0.9893469214439392, "step": 1652 }, { "epoch": 0.01653, "grad_norm": 0.48329293727874756, "grad_norm_var": 0.0013240331607979549, "learning_rate": 5e-05, "loss": 0.2075, "loss/crossentropy": 2.614067316055298, "loss/hidden": 0.0, "loss/logits": 0.2074788697063923, "loss/reg": 0.988538384437561, "step": 1653 }, { "epoch": 0.01654, "grad_norm": 0.7802302241325378, "grad_norm_var": 0.008871511150981197, "learning_rate": 5e-05, "loss": 0.1867, "loss/crossentropy": 2.801050305366516, "loss/hidden": 0.0, "loss/logits": 0.18674134090542793, "loss/reg": 0.9879339933395386, "step": 1654 }, { "epoch": 0.01655, "grad_norm": 0.5053502917289734, "grad_norm_var": 0.008926244689548623, "learning_rate": 5e-05, "loss": 0.2008, "loss/crossentropy": 2.7182639241218567, "loss/hidden": 0.0, "loss/logits": 0.20081807672977448, "loss/reg": 0.9869012236595154, "step": 1655 }, { "epoch": 0.01656, "grad_norm": 0.47397318482398987, "grad_norm_var": 0.00881009549445061, "learning_rate": 5e-05, "loss": 0.1863, "loss/crossentropy": 3.0697373151779175, "loss/hidden": 0.0, "loss/logits": 0.18633192032575607, "loss/reg": 0.9862724542617798, "step": 1656 }, { "epoch": 0.01657, "grad_norm": 0.44242963194847107, "grad_norm_var": 0.008637024175784314, "learning_rate": 5e-05, "loss": 0.191, "loss/crossentropy": 2.746702015399933, "loss/hidden": 0.0, "loss/logits": 0.19096798822283745, "loss/reg": 0.9855261445045471, "step": 1657 }, { "epoch": 0.01658, "grad_norm": 0.4351765811443329, "grad_norm_var": 0.008499481917788154, "learning_rate": 5e-05, "loss": 0.19, "loss/crossentropy": 2.7543291449546814, "loss/hidden": 0.0, "loss/logits": 0.1900062970817089, "loss/reg": 0.9842270016670227, "step": 1658 }, { "epoch": 0.01659, "grad_norm": 0.4238717555999756, "grad_norm_var": 0.008576100925275236, "learning_rate": 5e-05, "loss": 0.1885, "loss/crossentropy": 2.738615930080414, "loss/hidden": 0.0, "loss/logits": 0.1884973607957363, "loss/reg": 0.9834668636322021, "step": 1659 }, { "epoch": 0.0166, "grad_norm": 0.4111272394657135, "grad_norm_var": 0.008614938397928507, "learning_rate": 5e-05, "loss": 0.1987, "loss/crossentropy": 2.6902480721473694, "loss/hidden": 0.0, "loss/logits": 0.19869986921548843, "loss/reg": 0.9826908111572266, "step": 1660 }, { "epoch": 0.01661, "grad_norm": 0.41095390915870667, "grad_norm_var": 0.008698014381044639, "learning_rate": 5e-05, "loss": 0.1866, "loss/crossentropy": 2.7943766117095947, "loss/hidden": 0.0, "loss/logits": 0.18661296740174294, "loss/reg": 0.9816150665283203, "step": 1661 }, { "epoch": 0.01662, "grad_norm": 0.41411253809928894, "grad_norm_var": 0.008795518968246081, "learning_rate": 5e-05, "loss": 0.2033, "loss/crossentropy": 2.859815239906311, "loss/hidden": 0.0, "loss/logits": 0.20330817252397537, "loss/reg": 0.9813138246536255, "step": 1662 }, { "epoch": 0.01663, "grad_norm": 0.4265795946121216, "grad_norm_var": 0.0086961695125602, "learning_rate": 5e-05, "loss": 0.1944, "loss/crossentropy": 2.9196831583976746, "loss/hidden": 0.0, "loss/logits": 0.19435005262494087, "loss/reg": 0.9806890487670898, "step": 1663 }, { "epoch": 0.01664, "grad_norm": 0.4456392526626587, "grad_norm_var": 0.008466672332987986, "learning_rate": 5e-05, "loss": 0.2035, "loss/crossentropy": 2.7248425483703613, "loss/hidden": 0.0, "loss/logits": 0.20353588461875916, "loss/reg": 0.980077862739563, "step": 1664 }, { "epoch": 0.01665, "grad_norm": 0.3930458724498749, "grad_norm_var": 0.008424857113765827, "learning_rate": 5e-05, "loss": 0.1924, "loss/crossentropy": 2.8926807045936584, "loss/hidden": 0.0, "loss/logits": 0.19242505729198456, "loss/reg": 0.979702353477478, "step": 1665 }, { "epoch": 0.01666, "grad_norm": 0.3947600722312927, "grad_norm_var": 0.008384339501580958, "learning_rate": 5e-05, "loss": 0.189, "loss/crossentropy": 2.923057436943054, "loss/hidden": 0.0, "loss/logits": 0.18904650956392288, "loss/reg": 0.9788217544555664, "step": 1666 }, { "epoch": 0.01667, "grad_norm": 0.44706106185913086, "grad_norm_var": 0.008385190103097765, "learning_rate": 5e-05, "loss": 0.204, "loss/crossentropy": 2.7078863978385925, "loss/hidden": 0.0, "loss/logits": 0.2040410153567791, "loss/reg": 0.9782266616821289, "step": 1667 }, { "epoch": 0.01668, "grad_norm": 0.39598697423934937, "grad_norm_var": 0.00856227985747096, "learning_rate": 5e-05, "loss": 0.2027, "loss/crossentropy": 2.7848398089408875, "loss/hidden": 0.0, "loss/logits": 0.20268257707357407, "loss/reg": 0.9783090949058533, "step": 1668 }, { "epoch": 0.01669, "grad_norm": 0.43456798791885376, "grad_norm_var": 0.008528310952534387, "learning_rate": 5e-05, "loss": 0.2359, "loss/crossentropy": 2.9408648014068604, "loss/hidden": 0.0, "loss/logits": 0.23594681918621063, "loss/reg": 0.9780111908912659, "step": 1669 }, { "epoch": 0.0167, "grad_norm": 0.44477957487106323, "grad_norm_var": 0.0008885970048522614, "learning_rate": 5e-05, "loss": 0.2097, "loss/crossentropy": 2.7345515489578247, "loss/hidden": 0.0, "loss/logits": 0.20972703397274017, "loss/reg": 0.9774311184883118, "step": 1670 }, { "epoch": 0.01671, "grad_norm": 0.4334873557090759, "grad_norm_var": 0.0005010059813244501, "learning_rate": 5e-05, "loss": 0.1958, "loss/crossentropy": 2.802227795124054, "loss/hidden": 0.0, "loss/logits": 0.1957888901233673, "loss/reg": 0.976824939250946, "step": 1671 }, { "epoch": 0.01672, "grad_norm": 0.44768232107162476, "grad_norm_var": 0.00037857010970889784, "learning_rate": 5e-05, "loss": 0.2204, "loss/crossentropy": 2.687469720840454, "loss/hidden": 0.0, "loss/logits": 0.22035083919763565, "loss/reg": 0.9762384295463562, "step": 1672 }, { "epoch": 0.01673, "grad_norm": 0.4268612861633301, "grad_norm_var": 0.0003577020661750141, "learning_rate": 5e-05, "loss": 0.2018, "loss/crossentropy": 2.8833935260772705, "loss/hidden": 0.0, "loss/logits": 0.2018217258155346, "loss/reg": 0.9758449792861938, "step": 1673 }, { "epoch": 0.01674, "grad_norm": 0.4269968867301941, "grad_norm_var": 0.00034980973717595453, "learning_rate": 5e-05, "loss": 0.197, "loss/crossentropy": 2.7447726726531982, "loss/hidden": 0.0, "loss/logits": 0.19696272909641266, "loss/reg": 0.9751297831535339, "step": 1674 }, { "epoch": 0.01675, "grad_norm": 0.4018579125404358, "grad_norm_var": 0.00037928433144642784, "learning_rate": 5e-05, "loss": 0.1974, "loss/crossentropy": 2.845329523086548, "loss/hidden": 0.0, "loss/logits": 0.1974216252565384, "loss/reg": 0.9747862219810486, "step": 1675 }, { "epoch": 0.01676, "grad_norm": 0.4319971203804016, "grad_norm_var": 0.0003756425543625282, "learning_rate": 5e-05, "loss": 0.1963, "loss/crossentropy": 2.912673830986023, "loss/hidden": 0.0, "loss/logits": 0.19628258049488068, "loss/reg": 0.9739908576011658, "step": 1676 }, { "epoch": 0.01677, "grad_norm": 0.42803555727005005, "grad_norm_var": 0.00036525195673638616, "learning_rate": 5e-05, "loss": 0.1863, "loss/crossentropy": 2.8631667494773865, "loss/hidden": 0.0, "loss/logits": 0.18627800792455673, "loss/reg": 0.9731059074401855, "step": 1677 }, { "epoch": 0.01678, "grad_norm": 0.42730480432510376, "grad_norm_var": 0.00035769842100891923, "learning_rate": 5e-05, "loss": 0.1918, "loss/crossentropy": 2.771612226963043, "loss/hidden": 0.0, "loss/logits": 0.19182706996798515, "loss/reg": 0.971794605255127, "step": 1678 }, { "epoch": 0.01679, "grad_norm": 0.4047600030899048, "grad_norm_var": 0.0003840668623575424, "learning_rate": 5e-05, "loss": 0.1884, "loss/crossentropy": 2.746255576610565, "loss/hidden": 0.0, "loss/logits": 0.1884135976433754, "loss/reg": 0.9709142446517944, "step": 1679 }, { "epoch": 0.0168, "grad_norm": 0.40580958127975464, "grad_norm_var": 0.000368572634361064, "learning_rate": 5e-05, "loss": 0.194, "loss/crossentropy": 2.7972524762153625, "loss/hidden": 0.0, "loss/logits": 0.19399097189307213, "loss/reg": 0.9700757265090942, "step": 1680 }, { "epoch": 0.01681, "grad_norm": 0.4170977473258972, "grad_norm_var": 0.00031327910748755924, "learning_rate": 5e-05, "loss": 0.1982, "loss/crossentropy": 2.680404782295227, "loss/hidden": 0.0, "loss/logits": 0.19815754517912865, "loss/reg": 0.9692074656486511, "step": 1681 }, { "epoch": 0.01682, "grad_norm": 0.4438115656375885, "grad_norm_var": 0.00027853475307475136, "learning_rate": 5e-05, "loss": 0.2074, "loss/crossentropy": 2.7973380088806152, "loss/hidden": 0.0, "loss/logits": 0.20738579705357552, "loss/reg": 0.9686961770057678, "step": 1682 }, { "epoch": 0.01683, "grad_norm": 0.4141625463962555, "grad_norm_var": 0.0002543706883667192, "learning_rate": 5e-05, "loss": 0.1951, "loss/crossentropy": 2.7903233766555786, "loss/hidden": 0.0, "loss/logits": 0.1950708031654358, "loss/reg": 0.9684246778488159, "step": 1683 }, { "epoch": 0.01684, "grad_norm": 0.3853682577610016, "grad_norm_var": 0.0003011857786094733, "learning_rate": 5e-05, "loss": 0.1829, "loss/crossentropy": 2.902022361755371, "loss/hidden": 0.0, "loss/logits": 0.18294353038072586, "loss/reg": 0.96816486120224, "step": 1684 }, { "epoch": 0.01685, "grad_norm": 0.4095843732357025, "grad_norm_var": 0.0003030324449647607, "learning_rate": 5e-05, "loss": 0.2011, "loss/crossentropy": 2.8587907552719116, "loss/hidden": 0.0, "loss/logits": 0.20112555101513863, "loss/reg": 0.967812180519104, "step": 1685 }, { "epoch": 0.01686, "grad_norm": 0.398264616727829, "grad_norm_var": 0.00029604972872923445, "learning_rate": 5e-05, "loss": 0.1938, "loss/crossentropy": 2.7175191044807434, "loss/hidden": 0.0, "loss/logits": 0.19377201423048973, "loss/reg": 0.9674014449119568, "step": 1686 }, { "epoch": 0.01687, "grad_norm": 0.3565382957458496, "grad_norm_var": 0.0005168949377238404, "learning_rate": 5e-05, "loss": 0.1782, "loss/crossentropy": 2.8121435046195984, "loss/hidden": 0.0, "loss/logits": 0.17816277965903282, "loss/reg": 0.9671922922134399, "step": 1687 }, { "epoch": 0.01688, "grad_norm": 0.38014495372772217, "grad_norm_var": 0.0004998676381996507, "learning_rate": 5e-05, "loss": 0.18, "loss/crossentropy": 2.8066705465316772, "loss/hidden": 0.0, "loss/logits": 0.18004292249679565, "loss/reg": 0.9661559462547302, "step": 1688 }, { "epoch": 0.01689, "grad_norm": 0.439016729593277, "grad_norm_var": 0.0005365721033507261, "learning_rate": 5e-05, "loss": 0.1977, "loss/crossentropy": 2.816479742527008, "loss/hidden": 0.0, "loss/logits": 0.19771433249115944, "loss/reg": 0.965285062789917, "step": 1689 }, { "epoch": 0.0169, "grad_norm": 0.423315167427063, "grad_norm_var": 0.0005294054421746204, "learning_rate": 5e-05, "loss": 0.2077, "loss/crossentropy": 2.815659761428833, "loss/hidden": 0.0, "loss/logits": 0.20774946361780167, "loss/reg": 0.9643948078155518, "step": 1690 }, { "epoch": 0.01691, "grad_norm": 0.4508022367954254, "grad_norm_var": 0.0006231092694258996, "learning_rate": 5e-05, "loss": 0.2144, "loss/crossentropy": 2.7999706864356995, "loss/hidden": 0.0, "loss/logits": 0.21436100453138351, "loss/reg": 0.9632377028465271, "step": 1691 }, { "epoch": 0.01692, "grad_norm": 0.3995899260044098, "grad_norm_var": 0.0006088267676019518, "learning_rate": 5e-05, "loss": 0.1911, "loss/crossentropy": 2.879646420478821, "loss/hidden": 0.0, "loss/logits": 0.19112277030944824, "loss/reg": 0.962549090385437, "step": 1692 }, { "epoch": 0.01693, "grad_norm": 0.5071508288383484, "grad_norm_var": 0.0011747166082547742, "learning_rate": 5e-05, "loss": 0.1898, "loss/crossentropy": 2.949573576450348, "loss/hidden": 0.0, "loss/logits": 0.1898249238729477, "loss/reg": 0.9616365432739258, "step": 1693 }, { "epoch": 0.01694, "grad_norm": 0.3963351845741272, "grad_norm_var": 0.0011897154306865331, "learning_rate": 5e-05, "loss": 0.199, "loss/crossentropy": 2.7994354367256165, "loss/hidden": 0.0, "loss/logits": 0.19900693371891975, "loss/reg": 0.960827648639679, "step": 1694 }, { "epoch": 0.01695, "grad_norm": 0.4103904068470001, "grad_norm_var": 0.0011843963912345878, "learning_rate": 5e-05, "loss": 0.1908, "loss/crossentropy": 2.764429032802582, "loss/hidden": 0.0, "loss/logits": 0.19081446528434753, "loss/reg": 0.9593966007232666, "step": 1695 }, { "epoch": 0.01696, "grad_norm": 0.3964170515537262, "grad_norm_var": 0.0012012147403077358, "learning_rate": 5e-05, "loss": 0.1988, "loss/crossentropy": 2.680214524269104, "loss/hidden": 0.0, "loss/logits": 0.1987549141049385, "loss/reg": 0.9580814838409424, "step": 1696 }, { "epoch": 0.01697, "grad_norm": 0.4489021897315979, "grad_norm_var": 0.001276513715549162, "learning_rate": 5e-05, "loss": 0.21, "loss/crossentropy": 2.820174813270569, "loss/hidden": 0.0, "loss/logits": 0.21000191941857338, "loss/reg": 0.9565646648406982, "step": 1697 }, { "epoch": 0.01698, "grad_norm": 0.4084279537200928, "grad_norm_var": 0.0012246727050356085, "learning_rate": 5e-05, "loss": 0.1853, "loss/crossentropy": 2.754970967769623, "loss/hidden": 0.0, "loss/logits": 0.1853071115911007, "loss/reg": 0.9545485377311707, "step": 1698 }, { "epoch": 0.01699, "grad_norm": 0.4521838128566742, "grad_norm_var": 0.0013157176445986315, "learning_rate": 5e-05, "loss": 0.1892, "loss/crossentropy": 2.8605205416679382, "loss/hidden": 0.0, "loss/logits": 0.18919217213988304, "loss/reg": 0.9533572793006897, "step": 1699 }, { "epoch": 0.017, "grad_norm": 0.40928900241851807, "grad_norm_var": 0.0012525002442726437, "learning_rate": 5e-05, "loss": 0.2029, "loss/crossentropy": 2.7299290895462036, "loss/hidden": 0.0, "loss/logits": 0.20293082669377327, "loss/reg": 0.9519543051719666, "step": 1700 }, { "epoch": 0.01701, "grad_norm": 0.4375574588775635, "grad_norm_var": 0.0012704019431191704, "learning_rate": 5e-05, "loss": 0.21, "loss/crossentropy": 2.7553977370262146, "loss/hidden": 0.0, "loss/logits": 0.21000895649194717, "loss/reg": 0.9501931667327881, "step": 1701 }, { "epoch": 0.01702, "grad_norm": 0.40013188123703003, "grad_norm_var": 0.001265296725807552, "learning_rate": 5e-05, "loss": 0.1911, "loss/crossentropy": 2.8755266666412354, "loss/hidden": 0.0, "loss/logits": 0.19107430800795555, "loss/reg": 0.9485136866569519, "step": 1702 }, { "epoch": 0.01703, "grad_norm": 0.3753577470779419, "grad_norm_var": 0.0011287875673520631, "learning_rate": 5e-05, "loss": 0.1879, "loss/crossentropy": 2.7124125957489014, "loss/hidden": 0.0, "loss/logits": 0.18789031356573105, "loss/reg": 0.9467406272888184, "step": 1703 }, { "epoch": 0.01704, "grad_norm": 0.37156128883361816, "grad_norm_var": 0.0011800800264768559, "learning_rate": 5e-05, "loss": 0.1828, "loss/crossentropy": 2.7135950922966003, "loss/hidden": 0.0, "loss/logits": 0.18276651576161385, "loss/reg": 0.9453279376029968, "step": 1704 }, { "epoch": 0.01705, "grad_norm": 0.3806215524673462, "grad_norm_var": 0.0012482685718171423, "learning_rate": 5e-05, "loss": 0.1773, "loss/crossentropy": 2.664647400379181, "loss/hidden": 0.0, "loss/logits": 0.17725708335638046, "loss/reg": 0.944482684135437, "step": 1705 }, { "epoch": 0.01706, "grad_norm": 0.38079720735549927, "grad_norm_var": 0.0013240482296425864, "learning_rate": 5e-05, "loss": 0.1882, "loss/crossentropy": 2.859362006187439, "loss/hidden": 0.0, "loss/logits": 0.18818871304392815, "loss/reg": 0.9428150653839111, "step": 1706 }, { "epoch": 0.01707, "grad_norm": 0.3702297508716583, "grad_norm_var": 0.0013354449290563766, "learning_rate": 5e-05, "loss": 0.1834, "loss/crossentropy": 2.7437954545021057, "loss/hidden": 0.0, "loss/logits": 0.1833883598446846, "loss/reg": 0.9419358372688293, "step": 1707 }, { "epoch": 0.01708, "grad_norm": 0.38292014598846436, "grad_norm_var": 0.0013738587391907587, "learning_rate": 5e-05, "loss": 0.1883, "loss/crossentropy": 2.8175190687179565, "loss/hidden": 0.0, "loss/logits": 0.18825723230838776, "loss/reg": 0.9411101341247559, "step": 1708 }, { "epoch": 0.01709, "grad_norm": 0.4336113929748535, "grad_norm_var": 0.000739829895405449, "learning_rate": 5e-05, "loss": 0.2004, "loss/crossentropy": 2.7488914132118225, "loss/hidden": 0.0, "loss/logits": 0.2003837488591671, "loss/reg": 0.9400129914283752, "step": 1709 }, { "epoch": 0.0171, "grad_norm": 0.3866313397884369, "grad_norm_var": 0.0007548829773588358, "learning_rate": 5e-05, "loss": 0.1859, "loss/crossentropy": 2.865144968032837, "loss/hidden": 0.0, "loss/logits": 0.18591173365712166, "loss/reg": 0.9393734335899353, "step": 1710 }, { "epoch": 0.01711, "grad_norm": 0.3687353730201721, "grad_norm_var": 0.0008212520908905674, "learning_rate": 5e-05, "loss": 0.1897, "loss/crossentropy": 2.923434257507324, "loss/hidden": 0.0, "loss/logits": 0.1896546706557274, "loss/reg": 0.9383866786956787, "step": 1711 }, { "epoch": 0.01712, "grad_norm": 0.38244619965553284, "grad_norm_var": 0.0008405183279570849, "learning_rate": 5e-05, "loss": 0.1866, "loss/crossentropy": 2.958771765232086, "loss/hidden": 0.0, "loss/logits": 0.18655554950237274, "loss/reg": 0.9373697638511658, "step": 1712 }, { "epoch": 0.01713, "grad_norm": 0.3985337018966675, "grad_norm_var": 0.0006662152040347369, "learning_rate": 5e-05, "loss": 0.1755, "loss/crossentropy": 2.790857195854187, "loss/hidden": 0.0, "loss/logits": 0.1754690706729889, "loss/reg": 0.9367863535881042, "step": 1713 }, { "epoch": 0.01714, "grad_norm": 0.46521270275115967, "grad_norm_var": 0.0009604061373006178, "learning_rate": 5e-05, "loss": 0.2082, "loss/crossentropy": 2.8150055408477783, "loss/hidden": 0.0, "loss/logits": 0.20824339613318443, "loss/reg": 0.9358097314834595, "step": 1714 }, { "epoch": 0.01715, "grad_norm": 0.36640942096710205, "grad_norm_var": 0.0008204419803184575, "learning_rate": 5e-05, "loss": 0.1915, "loss/crossentropy": 2.9064807295799255, "loss/hidden": 0.0, "loss/logits": 0.19147857278585434, "loss/reg": 0.9348723888397217, "step": 1715 }, { "epoch": 0.01716, "grad_norm": 0.4536788761615753, "grad_norm_var": 0.001031849466324708, "learning_rate": 5e-05, "loss": 0.196, "loss/crossentropy": 2.7744264602661133, "loss/hidden": 0.0, "loss/logits": 0.19603459164500237, "loss/reg": 0.9344622492790222, "step": 1716 }, { "epoch": 0.01717, "grad_norm": 0.37372827529907227, "grad_norm_var": 0.0009426139138331162, "learning_rate": 5e-05, "loss": 0.1807, "loss/crossentropy": 2.7083056569099426, "loss/hidden": 0.0, "loss/logits": 0.18066375702619553, "loss/reg": 0.9334821701049805, "step": 1717 }, { "epoch": 0.01718, "grad_norm": 0.3796086311340332, "grad_norm_var": 0.0009498690764029612, "learning_rate": 5e-05, "loss": 0.1873, "loss/crossentropy": 2.9045706391334534, "loss/hidden": 0.0, "loss/logits": 0.18725696206092834, "loss/reg": 0.9323367476463318, "step": 1718 }, { "epoch": 0.01719, "grad_norm": 0.3763802945613861, "grad_norm_var": 0.0009476817574635772, "learning_rate": 5e-05, "loss": 0.1874, "loss/crossentropy": 2.670018196105957, "loss/hidden": 0.0, "loss/logits": 0.1874183751642704, "loss/reg": 0.9317356944084167, "step": 1719 }, { "epoch": 0.0172, "grad_norm": 0.39076095819473267, "grad_norm_var": 0.0009185417773364161, "learning_rate": 5e-05, "loss": 0.1851, "loss/crossentropy": 2.751845359802246, "loss/hidden": 0.0, "loss/logits": 0.1850602775812149, "loss/reg": 0.9313357472419739, "step": 1720 }, { "epoch": 0.01721, "grad_norm": 0.42364755272865295, "grad_norm_var": 0.0009624046398820714, "learning_rate": 5e-05, "loss": 0.2073, "loss/crossentropy": 2.7992833256721497, "loss/hidden": 0.0, "loss/logits": 0.2073342464864254, "loss/reg": 0.9311196208000183, "step": 1721 }, { "epoch": 0.01722, "grad_norm": 0.44127926230430603, "grad_norm_var": 0.0010697798969358, "learning_rate": 5e-05, "loss": 0.219, "loss/crossentropy": 2.814840614795685, "loss/hidden": 0.0, "loss/logits": 0.21901006624102592, "loss/reg": 0.9308744668960571, "step": 1722 }, { "epoch": 0.01723, "grad_norm": 0.4107615351676941, "grad_norm_var": 0.0010136604388472008, "learning_rate": 5e-05, "loss": 0.1874, "loss/crossentropy": 2.699351191520691, "loss/hidden": 0.0, "loss/logits": 0.18741191178560257, "loss/reg": 0.9312902092933655, "step": 1723 }, { "epoch": 0.01724, "grad_norm": 0.5666653513908386, "grad_norm_var": 0.0026527682925742368, "learning_rate": 5e-05, "loss": 0.1933, "loss/crossentropy": 2.854068696498871, "loss/hidden": 0.0, "loss/logits": 0.19326234608888626, "loss/reg": 0.9310418367385864, "step": 1724 }, { "epoch": 0.01725, "grad_norm": 0.3888385593891144, "grad_norm_var": 0.0026587771173752217, "learning_rate": 5e-05, "loss": 0.1921, "loss/crossentropy": 2.8429774045944214, "loss/hidden": 0.0, "loss/logits": 0.1921473778784275, "loss/reg": 0.9303467869758606, "step": 1725 }, { "epoch": 0.01726, "grad_norm": 0.4140770435333252, "grad_norm_var": 0.002617294349685162, "learning_rate": 5e-05, "loss": 0.1912, "loss/crossentropy": 2.6299397349357605, "loss/hidden": 0.0, "loss/logits": 0.19124232232570648, "loss/reg": 0.9298058748245239, "step": 1726 }, { "epoch": 0.01727, "grad_norm": 0.3864987790584564, "grad_norm_var": 0.002533247945002985, "learning_rate": 5e-05, "loss": 0.181, "loss/crossentropy": 2.730083465576172, "loss/hidden": 0.0, "loss/logits": 0.18100566416978836, "loss/reg": 0.9297086000442505, "step": 1727 }, { "epoch": 0.01728, "grad_norm": 0.41775962710380554, "grad_norm_var": 0.0024642286621332312, "learning_rate": 5e-05, "loss": 0.1956, "loss/crossentropy": 2.6796945929527283, "loss/hidden": 0.0, "loss/logits": 0.19562242552638054, "loss/reg": 0.9300265908241272, "step": 1728 }, { "epoch": 0.01729, "grad_norm": 0.4079970121383667, "grad_norm_var": 0.0024479575636529288, "learning_rate": 5e-05, "loss": 0.1956, "loss/crossentropy": 2.7550109028816223, "loss/hidden": 0.0, "loss/logits": 0.19564897194504738, "loss/reg": 0.9302366375923157, "step": 1729 }, { "epoch": 0.0173, "grad_norm": 0.4165860116481781, "grad_norm_var": 0.0022796285006947414, "learning_rate": 5e-05, "loss": 0.1943, "loss/crossentropy": 2.813656747341156, "loss/hidden": 0.0, "loss/logits": 0.1943197064101696, "loss/reg": 0.9302929639816284, "step": 1730 }, { "epoch": 0.01731, "grad_norm": 0.43071046471595764, "grad_norm_var": 0.0021350215473328604, "learning_rate": 5e-05, "loss": 0.1972, "loss/crossentropy": 2.8851414918899536, "loss/hidden": 0.0, "loss/logits": 0.1972125954926014, "loss/reg": 0.9297398924827576, "step": 1731 }, { "epoch": 0.01732, "grad_norm": 0.3740787208080292, "grad_norm_var": 0.0021463760989535135, "learning_rate": 5e-05, "loss": 0.176, "loss/crossentropy": 2.8831735253334045, "loss/hidden": 0.0, "loss/logits": 0.17595698684453964, "loss/reg": 0.9294543862342834, "step": 1732 }, { "epoch": 0.01733, "grad_norm": 0.41656848788261414, "grad_norm_var": 0.002039838173721368, "learning_rate": 5e-05, "loss": 0.1961, "loss/crossentropy": 2.7793456315994263, "loss/hidden": 0.0, "loss/logits": 0.19613324478268623, "loss/reg": 0.9291931390762329, "step": 1733 }, { "epoch": 0.01734, "grad_norm": 0.4268653094768524, "grad_norm_var": 0.001955542062938446, "learning_rate": 5e-05, "loss": 0.199, "loss/crossentropy": 2.697963237762451, "loss/hidden": 0.0, "loss/logits": 0.19901638850569725, "loss/reg": 0.9292311668395996, "step": 1734 }, { "epoch": 0.01735, "grad_norm": 0.4020274877548218, "grad_norm_var": 0.0018540141631924782, "learning_rate": 5e-05, "loss": 0.1985, "loss/crossentropy": 2.826058506965637, "loss/hidden": 0.0, "loss/logits": 0.1985473819077015, "loss/reg": 0.9292099475860596, "step": 1735 }, { "epoch": 0.01736, "grad_norm": 0.4104580283164978, "grad_norm_var": 0.0018022734387696784, "learning_rate": 5e-05, "loss": 0.2002, "loss/crossentropy": 2.895944595336914, "loss/hidden": 0.0, "loss/logits": 0.20016875490546227, "loss/reg": 0.9295958280563354, "step": 1736 }, { "epoch": 0.01737, "grad_norm": 0.3876482844352722, "grad_norm_var": 0.0018702079285484432, "learning_rate": 5e-05, "loss": 0.1926, "loss/crossentropy": 2.8814991116523743, "loss/hidden": 0.0, "loss/logits": 0.1926160380244255, "loss/reg": 0.9298628568649292, "step": 1737 }, { "epoch": 0.01738, "grad_norm": 0.3946945369243622, "grad_norm_var": 0.001865447438802307, "learning_rate": 5e-05, "loss": 0.181, "loss/crossentropy": 2.8898362517356873, "loss/hidden": 0.0, "loss/logits": 0.18102619051933289, "loss/reg": 0.9295737147331238, "step": 1738 }, { "epoch": 0.01739, "grad_norm": 0.4132324457168579, "grad_norm_var": 0.0018641807090493641, "learning_rate": 5e-05, "loss": 0.2007, "loss/crossentropy": 2.8055079579353333, "loss/hidden": 0.0, "loss/logits": 0.2006586194038391, "loss/reg": 0.929633617401123, "step": 1739 }, { "epoch": 0.0174, "grad_norm": 0.3722969591617584, "grad_norm_var": 0.00031866605833391994, "learning_rate": 5e-05, "loss": 0.1788, "loss/crossentropy": 2.881201148033142, "loss/hidden": 0.0, "loss/logits": 0.17884038016200066, "loss/reg": 0.9294272661209106, "step": 1740 }, { "epoch": 0.01741, "grad_norm": 0.3921547830104828, "grad_norm_var": 0.0003127507684731838, "learning_rate": 5e-05, "loss": 0.1989, "loss/crossentropy": 2.870577871799469, "loss/hidden": 0.0, "loss/logits": 0.19886576384305954, "loss/reg": 0.9293291568756104, "step": 1741 }, { "epoch": 0.01742, "grad_norm": 0.4074016213417053, "grad_norm_var": 0.0003065474628230665, "learning_rate": 5e-05, "loss": 0.1887, "loss/crossentropy": 2.692419111728668, "loss/hidden": 0.0, "loss/logits": 0.188666682690382, "loss/reg": 0.9284968972206116, "step": 1742 }, { "epoch": 0.01743, "grad_norm": 0.3687678873538971, "grad_norm_var": 0.0003665339924477558, "learning_rate": 5e-05, "loss": 0.1843, "loss/crossentropy": 2.7657968401908875, "loss/hidden": 0.0, "loss/logits": 0.1842886470258236, "loss/reg": 0.9279156923294067, "step": 1743 }, { "epoch": 0.01744, "grad_norm": 0.46656665205955505, "grad_norm_var": 0.0006150264403531095, "learning_rate": 5e-05, "loss": 0.2095, "loss/crossentropy": 2.838911771774292, "loss/hidden": 0.0, "loss/logits": 0.20953642204403877, "loss/reg": 0.927099883556366, "step": 1744 }, { "epoch": 0.01745, "grad_norm": 0.39029231667518616, "grad_norm_var": 0.0006287310128329874, "learning_rate": 5e-05, "loss": 0.1956, "loss/crossentropy": 2.7523877024650574, "loss/hidden": 0.0, "loss/logits": 0.19562172889709473, "loss/reg": 0.9264803528785706, "step": 1745 }, { "epoch": 0.01746, "grad_norm": 0.3831719756126404, "grad_norm_var": 0.0006442070246771051, "learning_rate": 5e-05, "loss": 0.1905, "loss/crossentropy": 2.740496337413788, "loss/hidden": 0.0, "loss/logits": 0.19047483429312706, "loss/reg": 0.9256248474121094, "step": 1746 }, { "epoch": 0.01747, "grad_norm": 0.5675250887870789, "grad_norm_var": 0.0023322043705761336, "learning_rate": 5e-05, "loss": 0.2027, "loss/crossentropy": 2.843151092529297, "loss/hidden": 0.0, "loss/logits": 0.2027064487338066, "loss/reg": 0.9250745177268982, "step": 1747 }, { "epoch": 0.01748, "grad_norm": 0.4640829563140869, "grad_norm_var": 0.0023971129605368327, "learning_rate": 5e-05, "loss": 0.2053, "loss/crossentropy": 2.857450842857361, "loss/hidden": 0.0, "loss/logits": 0.20526857301592827, "loss/reg": 0.9246965646743774, "step": 1748 }, { "epoch": 0.01749, "grad_norm": 0.4389997124671936, "grad_norm_var": 0.002428811116496136, "learning_rate": 5e-05, "loss": 0.2078, "loss/crossentropy": 3.014944314956665, "loss/hidden": 0.0, "loss/logits": 0.20782027766108513, "loss/reg": 0.9241983890533447, "step": 1749 }, { "epoch": 0.0175, "grad_norm": 0.4219127297401428, "grad_norm_var": 0.002424415101141249, "learning_rate": 5e-05, "loss": 0.1952, "loss/crossentropy": 2.778669834136963, "loss/hidden": 0.0, "loss/logits": 0.19519609212875366, "loss/reg": 0.9234949946403503, "step": 1750 }, { "epoch": 0.01751, "grad_norm": 0.44038647413253784, "grad_norm_var": 0.0024368494019202675, "learning_rate": 5e-05, "loss": 0.2084, "loss/crossentropy": 2.752516746520996, "loss/hidden": 0.0, "loss/logits": 0.20839788019657135, "loss/reg": 0.922467052936554, "step": 1751 }, { "epoch": 0.01752, "grad_norm": 0.41221359372138977, "grad_norm_var": 0.0024348144491298704, "learning_rate": 5e-05, "loss": 0.1956, "loss/crossentropy": 2.842723250389099, "loss/hidden": 0.0, "loss/logits": 0.19562644511461258, "loss/reg": 0.9218528866767883, "step": 1752 }, { "epoch": 0.01753, "grad_norm": 0.4004841148853302, "grad_norm_var": 0.002389599515625429, "learning_rate": 5e-05, "loss": 0.1954, "loss/crossentropy": 2.939370810985565, "loss/hidden": 0.0, "loss/logits": 0.19543294981122017, "loss/reg": 0.9208647012710571, "step": 1753 }, { "epoch": 0.01754, "grad_norm": 0.4379563331604004, "grad_norm_var": 0.002355491992859593, "learning_rate": 5e-05, "loss": 0.2182, "loss/crossentropy": 2.744364857673645, "loss/hidden": 0.0, "loss/logits": 0.21822267770767212, "loss/reg": 0.9200965166091919, "step": 1754 }, { "epoch": 0.01755, "grad_norm": 0.3759958744049072, "grad_norm_var": 0.002493577858945939, "learning_rate": 5e-05, "loss": 0.1754, "loss/crossentropy": 2.886771500110626, "loss/hidden": 0.0, "loss/logits": 0.17540156841278076, "loss/reg": 0.9194830656051636, "step": 1755 }, { "epoch": 0.01756, "grad_norm": 0.4182957112789154, "grad_norm_var": 0.00232550336918698, "learning_rate": 5e-05, "loss": 0.197, "loss/crossentropy": 2.7246428728103638, "loss/hidden": 0.0, "loss/logits": 0.1970425397157669, "loss/reg": 0.919228732585907, "step": 1756 }, { "epoch": 0.01757, "grad_norm": 0.4069487452507019, "grad_norm_var": 0.0022760944225958942, "learning_rate": 5e-05, "loss": 0.1988, "loss/crossentropy": 2.910832703113556, "loss/hidden": 0.0, "loss/logits": 0.19881373643875122, "loss/reg": 0.9180617928504944, "step": 1757 }, { "epoch": 0.01758, "grad_norm": 0.4275709092617035, "grad_norm_var": 0.0022540248202831217, "learning_rate": 5e-05, "loss": 0.2148, "loss/crossentropy": 2.7431459426879883, "loss/hidden": 0.0, "loss/logits": 0.2147684395313263, "loss/reg": 0.9176238179206848, "step": 1758 }, { "epoch": 0.01759, "grad_norm": 0.39265963435173035, "grad_norm_var": 0.0021063545561460987, "learning_rate": 5e-05, "loss": 0.1873, "loss/crossentropy": 2.80772066116333, "loss/hidden": 0.0, "loss/logits": 0.18733786791563034, "loss/reg": 0.9171220064163208, "step": 1759 }, { "epoch": 0.0176, "grad_norm": 0.3848111927509308, "grad_norm_var": 0.0021016960850855507, "learning_rate": 5e-05, "loss": 0.1889, "loss/crossentropy": 2.85300612449646, "loss/hidden": 0.0, "loss/logits": 0.18891701474785805, "loss/reg": 0.9165941476821899, "step": 1760 }, { "epoch": 0.01761, "grad_norm": 0.4237481355667114, "grad_norm_var": 0.0020270584799056005, "learning_rate": 5e-05, "loss": 0.1808, "loss/crossentropy": 2.8984915018081665, "loss/hidden": 0.0, "loss/logits": 0.18082596361637115, "loss/reg": 0.9160624146461487, "step": 1761 }, { "epoch": 0.01762, "grad_norm": 0.5142870545387268, "grad_norm_var": 0.0023738048932404015, "learning_rate": 5e-05, "loss": 0.1935, "loss/crossentropy": 2.873230457305908, "loss/hidden": 0.0, "loss/logits": 0.19349532574415207, "loss/reg": 0.916169285774231, "step": 1762 }, { "epoch": 0.01763, "grad_norm": 0.4356222152709961, "grad_norm_var": 0.0010951696449246946, "learning_rate": 5e-05, "loss": 0.1864, "loss/crossentropy": 2.8136460185050964, "loss/hidden": 0.0, "loss/logits": 0.18640804663300514, "loss/reg": 0.9159582257270813, "step": 1763 }, { "epoch": 0.01764, "grad_norm": 0.393518328666687, "grad_norm_var": 0.001036296866566729, "learning_rate": 5e-05, "loss": 0.1895, "loss/crossentropy": 2.749118745326996, "loss/hidden": 0.0, "loss/logits": 0.1894579976797104, "loss/reg": 0.9152243733406067, "step": 1764 }, { "epoch": 0.01765, "grad_norm": 0.3967163562774658, "grad_norm_var": 0.0010428298323417332, "learning_rate": 5e-05, "loss": 0.1923, "loss/crossentropy": 2.7416646480560303, "loss/hidden": 0.0, "loss/logits": 0.1922891065478325, "loss/reg": 0.9147580862045288, "step": 1765 }, { "epoch": 0.01766, "grad_norm": 0.40663349628448486, "grad_norm_var": 0.0010488292205994862, "learning_rate": 5e-05, "loss": 0.1912, "loss/crossentropy": 2.7590591311454773, "loss/hidden": 0.0, "loss/logits": 0.19123412668704987, "loss/reg": 0.914356529712677, "step": 1766 }, { "epoch": 0.01767, "grad_norm": 0.4140712022781372, "grad_norm_var": 0.0010091434052932632, "learning_rate": 5e-05, "loss": 0.1919, "loss/crossentropy": 2.687725067138672, "loss/hidden": 0.0, "loss/logits": 0.191885843873024, "loss/reg": 0.9137458801269531, "step": 1767 }, { "epoch": 0.01768, "grad_norm": 0.3837500512599945, "grad_norm_var": 0.0010707176300563356, "learning_rate": 5e-05, "loss": 0.1946, "loss/crossentropy": 2.7783952951431274, "loss/hidden": 0.0, "loss/logits": 0.1945618949830532, "loss/reg": 0.9133679270744324, "step": 1768 }, { "epoch": 0.01769, "grad_norm": 0.39224839210510254, "grad_norm_var": 0.0010890483887377974, "learning_rate": 5e-05, "loss": 0.2031, "loss/crossentropy": 2.741241693496704, "loss/hidden": 0.0, "loss/logits": 0.20311570912599564, "loss/reg": 0.912524938583374, "step": 1769 }, { "epoch": 0.0177, "grad_norm": 0.3952781558036804, "grad_norm_var": 0.0010597493335637827, "learning_rate": 5e-05, "loss": 0.1776, "loss/crossentropy": 2.7046775817871094, "loss/hidden": 0.0, "loss/logits": 0.17761826515197754, "loss/reg": 0.9117524027824402, "step": 1770 }, { "epoch": 0.01771, "grad_norm": 0.44809678196907043, "grad_norm_var": 0.0010564659434679176, "learning_rate": 5e-05, "loss": 0.1885, "loss/crossentropy": 2.6977961659431458, "loss/hidden": 0.0, "loss/logits": 0.18849042057991028, "loss/reg": 0.9117720723152161, "step": 1771 }, { "epoch": 0.01772, "grad_norm": 0.4902764856815338, "grad_norm_var": 0.0014153685782453203, "learning_rate": 5e-05, "loss": 0.1989, "loss/crossentropy": 2.854245662689209, "loss/hidden": 0.0, "loss/logits": 0.19886160641908646, "loss/reg": 0.9108124375343323, "step": 1772 }, { "epoch": 0.01773, "grad_norm": 0.42389383912086487, "grad_norm_var": 0.001405770734557695, "learning_rate": 5e-05, "loss": 0.1969, "loss/crossentropy": 2.941379427909851, "loss/hidden": 0.0, "loss/logits": 0.19685113802552223, "loss/reg": 0.9101334810256958, "step": 1773 }, { "epoch": 0.01774, "grad_norm": 0.45862945914268494, "grad_norm_var": 0.0014965888956112356, "learning_rate": 5e-05, "loss": 0.1856, "loss/crossentropy": 2.79221248626709, "loss/hidden": 0.0, "loss/logits": 0.1856122799217701, "loss/reg": 0.9094204306602478, "step": 1774 }, { "epoch": 0.01775, "grad_norm": 0.4052278697490692, "grad_norm_var": 0.001457059190942837, "learning_rate": 5e-05, "loss": 0.1898, "loss/crossentropy": 2.8591490983963013, "loss/hidden": 0.0, "loss/logits": 0.18978318944573402, "loss/reg": 0.9089229106903076, "step": 1775 }, { "epoch": 0.01776, "grad_norm": 0.3891145884990692, "grad_norm_var": 0.0014363471457248366, "learning_rate": 5e-05, "loss": 0.1751, "loss/crossentropy": 2.749578833580017, "loss/hidden": 0.0, "loss/logits": 0.17506226524710655, "loss/reg": 0.9084846377372742, "step": 1776 }, { "epoch": 0.01777, "grad_norm": 0.39679303765296936, "grad_norm_var": 0.0014797685463354545, "learning_rate": 5e-05, "loss": 0.1901, "loss/crossentropy": 3.001101851463318, "loss/hidden": 0.0, "loss/logits": 0.19014138355851173, "loss/reg": 0.9078476428985596, "step": 1777 }, { "epoch": 0.01778, "grad_norm": 0.3706504702568054, "grad_norm_var": 0.0009924082079833714, "learning_rate": 5e-05, "loss": 0.1846, "loss/crossentropy": 2.7723464965820312, "loss/hidden": 0.0, "loss/logits": 0.18464886024594307, "loss/reg": 0.9068264365196228, "step": 1778 }, { "epoch": 0.01779, "grad_norm": 0.3968674838542938, "grad_norm_var": 0.0009669675906873842, "learning_rate": 5e-05, "loss": 0.1895, "loss/crossentropy": 2.761741518974304, "loss/hidden": 0.0, "loss/logits": 0.18945276364684105, "loss/reg": 0.906024694442749, "step": 1779 }, { "epoch": 0.0178, "grad_norm": 0.39632725715637207, "grad_norm_var": 0.0009612466044541615, "learning_rate": 5e-05, "loss": 0.1862, "loss/crossentropy": 2.8378600478172302, "loss/hidden": 0.0, "loss/logits": 0.1862327829003334, "loss/reg": 0.9050126075744629, "step": 1780 }, { "epoch": 0.01781, "grad_norm": 0.3713095486164093, "grad_norm_var": 0.0010475586715930519, "learning_rate": 5e-05, "loss": 0.1713, "loss/crossentropy": 2.830242097377777, "loss/hidden": 0.0, "loss/logits": 0.1712886579334736, "loss/reg": 0.9039465188980103, "step": 1781 }, { "epoch": 0.01782, "grad_norm": 0.4007900059223175, "grad_norm_var": 0.0010513013471431778, "learning_rate": 5e-05, "loss": 0.1823, "loss/crossentropy": 2.8021321892738342, "loss/hidden": 0.0, "loss/logits": 0.18226920068264008, "loss/reg": 0.9026550054550171, "step": 1782 }, { "epoch": 0.01783, "grad_norm": 0.5193232297897339, "grad_norm_var": 0.0018242062912838624, "learning_rate": 5e-05, "loss": 0.1818, "loss/crossentropy": 2.5900211334228516, "loss/hidden": 0.0, "loss/logits": 0.18178314715623856, "loss/reg": 0.9019155502319336, "step": 1783 }, { "epoch": 0.01784, "grad_norm": 0.403618723154068, "grad_norm_var": 0.0017663287180598138, "learning_rate": 5e-05, "loss": 0.189, "loss/crossentropy": 2.8122188448905945, "loss/hidden": 0.0, "loss/logits": 0.18899159505963326, "loss/reg": 0.9012361168861389, "step": 1784 }, { "epoch": 0.01785, "grad_norm": 0.4231337904930115, "grad_norm_var": 0.0017275082001659947, "learning_rate": 5e-05, "loss": 0.2196, "loss/crossentropy": 2.824800193309784, "loss/hidden": 0.0, "loss/logits": 0.21959979087114334, "loss/reg": 0.8999006748199463, "step": 1785 }, { "epoch": 0.01786, "grad_norm": 0.44012317061424255, "grad_norm_var": 0.0017168415806013914, "learning_rate": 5e-05, "loss": 0.1897, "loss/crossentropy": 2.738426446914673, "loss/hidden": 0.0, "loss/logits": 0.18970657885074615, "loss/reg": 0.8987688422203064, "step": 1786 }, { "epoch": 0.01787, "grad_norm": 0.4444918632507324, "grad_norm_var": 0.0017045747668082202, "learning_rate": 5e-05, "loss": 0.1968, "loss/crossentropy": 2.7222702503204346, "loss/hidden": 0.0, "loss/logits": 0.19678190723061562, "loss/reg": 0.8982342481613159, "step": 1787 }, { "epoch": 0.01788, "grad_norm": 0.41831323504447937, "grad_norm_var": 0.0013602734497308237, "learning_rate": 5e-05, "loss": 0.1958, "loss/crossentropy": 2.7708369493484497, "loss/hidden": 0.0, "loss/logits": 0.19575121998786926, "loss/reg": 0.896869957447052, "step": 1788 }, { "epoch": 0.01789, "grad_norm": 0.39989328384399414, "grad_norm_var": 0.0013715357724878973, "learning_rate": 5e-05, "loss": 0.1859, "loss/crossentropy": 2.7035407423973083, "loss/hidden": 0.0, "loss/logits": 0.18592968955636024, "loss/reg": 0.8959349989891052, "step": 1789 }, { "epoch": 0.0179, "grad_norm": 0.41748160123825073, "grad_norm_var": 0.0012361403251163084, "learning_rate": 5e-05, "loss": 0.1749, "loss/crossentropy": 2.8941848278045654, "loss/hidden": 0.0, "loss/logits": 0.17486931383609772, "loss/reg": 0.8951555490493774, "step": 1790 }, { "epoch": 0.01791, "grad_norm": 0.44419723749160767, "grad_norm_var": 0.0012953922793792927, "learning_rate": 5e-05, "loss": 0.2246, "loss/crossentropy": 2.8155258893966675, "loss/hidden": 0.0, "loss/logits": 0.2246222421526909, "loss/reg": 0.8940463662147522, "step": 1791 }, { "epoch": 0.01792, "grad_norm": 0.406249076128006, "grad_norm_var": 0.0012556850385435415, "learning_rate": 5e-05, "loss": 0.1927, "loss/crossentropy": 2.8289509415626526, "loss/hidden": 0.0, "loss/logits": 0.19272882491350174, "loss/reg": 0.8930648565292358, "step": 1792 }, { "epoch": 0.01793, "grad_norm": 0.3946913182735443, "grad_norm_var": 0.0012612307282537335, "learning_rate": 5e-05, "loss": 0.1943, "loss/crossentropy": 2.834330677986145, "loss/hidden": 0.0, "loss/logits": 0.1943346932530403, "loss/reg": 0.8924468159675598, "step": 1793 }, { "epoch": 0.01794, "grad_norm": 0.4092795252799988, "grad_norm_var": 0.0011236675583754518, "learning_rate": 5e-05, "loss": 0.1992, "loss/crossentropy": 2.8072381019592285, "loss/hidden": 0.0, "loss/logits": 0.19915487617254257, "loss/reg": 0.8920629620552063, "step": 1794 }, { "epoch": 0.01795, "grad_norm": 0.4286874234676361, "grad_norm_var": 0.0010977976660271338, "learning_rate": 5e-05, "loss": 0.1936, "loss/crossentropy": 2.908243238925934, "loss/hidden": 0.0, "loss/logits": 0.19364729523658752, "loss/reg": 0.8911548852920532, "step": 1795 }, { "epoch": 0.01796, "grad_norm": 0.3970155119895935, "grad_norm_var": 0.0010956668734328988, "learning_rate": 5e-05, "loss": 0.194, "loss/crossentropy": 2.6946675181388855, "loss/hidden": 0.0, "loss/logits": 0.19396164640784264, "loss/reg": 0.8905576467514038, "step": 1796 }, { "epoch": 0.01797, "grad_norm": 0.4096953570842743, "grad_norm_var": 0.0009390040878516359, "learning_rate": 5e-05, "loss": 0.1915, "loss/crossentropy": 2.8345912098884583, "loss/hidden": 0.0, "loss/logits": 0.1914876624941826, "loss/reg": 0.889485239982605, "step": 1797 }, { "epoch": 0.01798, "grad_norm": 0.3989505469799042, "grad_norm_var": 0.0009444939561368048, "learning_rate": 5e-05, "loss": 0.1952, "loss/crossentropy": 2.85462749004364, "loss/hidden": 0.0, "loss/logits": 0.19523821398615837, "loss/reg": 0.8887150883674622, "step": 1798 }, { "epoch": 0.01799, "grad_norm": 0.4110300540924072, "grad_norm_var": 0.0002750364050750983, "learning_rate": 5e-05, "loss": 0.2033, "loss/crossentropy": 2.7547779083251953, "loss/hidden": 0.0, "loss/logits": 0.2033349722623825, "loss/reg": 0.8880630135536194, "step": 1799 }, { "epoch": 0.018, "grad_norm": 0.40208929777145386, "grad_norm_var": 0.00027759083654341394, "learning_rate": 5e-05, "loss": 0.1839, "loss/crossentropy": 2.8043800592422485, "loss/hidden": 0.0, "loss/logits": 0.18389111012220383, "loss/reg": 0.8876356482505798, "step": 1800 }, { "epoch": 0.01801, "grad_norm": 0.4190218448638916, "grad_norm_var": 0.0002743705401916411, "learning_rate": 5e-05, "loss": 0.1958, "loss/crossentropy": 2.8892692923545837, "loss/hidden": 0.0, "loss/logits": 0.1957857720553875, "loss/reg": 0.8869847655296326, "step": 1801 }, { "epoch": 0.01802, "grad_norm": 0.43014124035835266, "grad_norm_var": 0.00024726162186390713, "learning_rate": 5e-05, "loss": 0.2099, "loss/crossentropy": 2.7159982323646545, "loss/hidden": 0.0, "loss/logits": 0.20992730557918549, "loss/reg": 0.8867628574371338, "step": 1802 }, { "epoch": 0.01803, "grad_norm": 0.4165356457233429, "grad_norm_var": 0.00018413420812303608, "learning_rate": 5e-05, "loss": 0.1914, "loss/crossentropy": 2.7939720153808594, "loss/hidden": 0.0, "loss/logits": 0.19136624038219452, "loss/reg": 0.8857271075248718, "step": 1803 }, { "epoch": 0.01804, "grad_norm": 0.412634015083313, "grad_norm_var": 0.00018190296511185015, "learning_rate": 5e-05, "loss": 0.1892, "loss/crossentropy": 2.863085687160492, "loss/hidden": 0.0, "loss/logits": 0.1892290711402893, "loss/reg": 0.8853285312652588, "step": 1804 }, { "epoch": 0.01805, "grad_norm": 0.4068357050418854, "grad_norm_var": 0.0001733850609784889, "learning_rate": 5e-05, "loss": 0.1868, "loss/crossentropy": 2.7323646545410156, "loss/hidden": 0.0, "loss/logits": 0.1867602802813053, "loss/reg": 0.8856411576271057, "step": 1805 }, { "epoch": 0.01806, "grad_norm": 0.4004560112953186, "grad_norm_var": 0.00018083683617112396, "learning_rate": 5e-05, "loss": 0.1831, "loss/crossentropy": 2.775236189365387, "loss/hidden": 0.0, "loss/logits": 0.18314369022846222, "loss/reg": 0.8853859901428223, "step": 1806 }, { "epoch": 0.01807, "grad_norm": 0.43723416328430176, "grad_norm_var": 0.00015371433146776543, "learning_rate": 5e-05, "loss": 0.2064, "loss/crossentropy": 2.7276336550712585, "loss/hidden": 0.0, "loss/logits": 0.20644037798047066, "loss/reg": 0.8855364918708801, "step": 1807 }, { "epoch": 0.01808, "grad_norm": 0.3960845470428467, "grad_norm_var": 0.0001669956005184307, "learning_rate": 5e-05, "loss": 0.1905, "loss/crossentropy": 2.749648630619049, "loss/hidden": 0.0, "loss/logits": 0.19049681723117828, "loss/reg": 0.8853706121444702, "step": 1808 }, { "epoch": 0.01809, "grad_norm": 0.47138044238090515, "grad_norm_var": 0.0003714023544985031, "learning_rate": 5e-05, "loss": 0.2125, "loss/crossentropy": 2.845011830329895, "loss/hidden": 0.0, "loss/logits": 0.2125193141400814, "loss/reg": 0.8859027624130249, "step": 1809 }, { "epoch": 0.0181, "grad_norm": 0.40787723660469055, "grad_norm_var": 0.00037267745666983804, "learning_rate": 5e-05, "loss": 0.201, "loss/crossentropy": 2.754331648349762, "loss/hidden": 0.0, "loss/logits": 0.20103870332241058, "loss/reg": 0.8857205510139465, "step": 1810 }, { "epoch": 0.01811, "grad_norm": 0.46917012333869934, "grad_norm_var": 0.0005470735478984542, "learning_rate": 5e-05, "loss": 0.225, "loss/crossentropy": 2.6890124678611755, "loss/hidden": 0.0, "loss/logits": 0.22502825409173965, "loss/reg": 0.8853986859321594, "step": 1811 }, { "epoch": 0.01812, "grad_norm": 0.4247317612171173, "grad_norm_var": 0.0005179641686820132, "learning_rate": 5e-05, "loss": 0.2016, "loss/crossentropy": 2.6866010427474976, "loss/hidden": 0.0, "loss/logits": 0.20160917937755585, "loss/reg": 0.8847169280052185, "step": 1812 }, { "epoch": 0.01813, "grad_norm": 0.38973426818847656, "grad_norm_var": 0.0005692725583755849, "learning_rate": 5e-05, "loss": 0.1844, "loss/crossentropy": 2.746491849422455, "loss/hidden": 0.0, "loss/logits": 0.18436771258711815, "loss/reg": 0.8842182159423828, "step": 1813 }, { "epoch": 0.01814, "grad_norm": 0.4148683547973633, "grad_norm_var": 0.0005438949840141445, "learning_rate": 5e-05, "loss": 0.2008, "loss/crossentropy": 2.878942310810089, "loss/hidden": 0.0, "loss/logits": 0.20079002529382706, "loss/reg": 0.8834556341171265, "step": 1814 }, { "epoch": 0.01815, "grad_norm": 0.4110451936721802, "grad_norm_var": 0.0005438781752580094, "learning_rate": 5e-05, "loss": 0.1863, "loss/crossentropy": 3.0287649035453796, "loss/hidden": 0.0, "loss/logits": 0.18630259484052658, "loss/reg": 0.8829677104949951, "step": 1815 }, { "epoch": 0.01816, "grad_norm": 0.3800681531429291, "grad_norm_var": 0.0006249104218365738, "learning_rate": 5e-05, "loss": 0.2033, "loss/crossentropy": 2.6700210571289062, "loss/hidden": 0.0, "loss/logits": 0.20325781777501106, "loss/reg": 0.8822290897369385, "step": 1816 }, { "epoch": 0.01817, "grad_norm": 0.4045352339744568, "grad_norm_var": 0.0006360311616276112, "learning_rate": 5e-05, "loss": 0.1842, "loss/crossentropy": 2.6943440437316895, "loss/hidden": 0.0, "loss/logits": 0.18424637615680695, "loss/reg": 0.8814281225204468, "step": 1817 }, { "epoch": 0.01818, "grad_norm": 0.38361549377441406, "grad_norm_var": 0.0006903171502589604, "learning_rate": 5e-05, "loss": 0.1903, "loss/crossentropy": 2.660310387611389, "loss/hidden": 0.0, "loss/logits": 0.190257228910923, "loss/reg": 0.8801477551460266, "step": 1818 }, { "epoch": 0.01819, "grad_norm": 0.4341121315956116, "grad_norm_var": 0.0007151567713507371, "learning_rate": 5e-05, "loss": 0.1944, "loss/crossentropy": 2.691216230392456, "loss/hidden": 0.0, "loss/logits": 0.19436386600136757, "loss/reg": 0.879374623298645, "step": 1819 }, { "epoch": 0.0182, "grad_norm": 0.41172513365745544, "grad_norm_var": 0.0007155283160471648, "learning_rate": 5e-05, "loss": 0.1961, "loss/crossentropy": 2.8783839344978333, "loss/hidden": 0.0, "loss/logits": 0.19605087116360664, "loss/reg": 0.878777027130127, "step": 1820 }, { "epoch": 0.01821, "grad_norm": 0.41848644614219666, "grad_norm_var": 0.0007109920889231902, "learning_rate": 5e-05, "loss": 0.1942, "loss/crossentropy": 2.805996596813202, "loss/hidden": 0.0, "loss/logits": 0.19415006786584854, "loss/reg": 0.8780494928359985, "step": 1821 }, { "epoch": 0.01822, "grad_norm": 0.4151057004928589, "grad_norm_var": 0.0006941503368942059, "learning_rate": 5e-05, "loss": 0.2074, "loss/crossentropy": 2.691302001476288, "loss/hidden": 0.0, "loss/logits": 0.20735519751906395, "loss/reg": 0.8774803876876831, "step": 1822 }, { "epoch": 0.01823, "grad_norm": 0.40295282006263733, "grad_norm_var": 0.0006744779437835765, "learning_rate": 5e-05, "loss": 0.1963, "loss/crossentropy": 2.7196571826934814, "loss/hidden": 0.0, "loss/logits": 0.19634438306093216, "loss/reg": 0.8762959241867065, "step": 1823 }, { "epoch": 0.01824, "grad_norm": 0.38869625329971313, "grad_norm_var": 0.0006962458575604883, "learning_rate": 5e-05, "loss": 0.1882, "loss/crossentropy": 2.771869957447052, "loss/hidden": 0.0, "loss/logits": 0.18820732459425926, "loss/reg": 0.8756154179573059, "step": 1824 }, { "epoch": 0.01825, "grad_norm": 0.38419196009635925, "grad_norm_var": 0.0005072875532496912, "learning_rate": 5e-05, "loss": 0.1916, "loss/crossentropy": 2.6809815168380737, "loss/hidden": 0.0, "loss/logits": 0.1916203871369362, "loss/reg": 0.8744083642959595, "step": 1825 }, { "epoch": 0.01826, "grad_norm": 0.4487023949623108, "grad_norm_var": 0.0006063934180459326, "learning_rate": 5e-05, "loss": 0.2178, "loss/crossentropy": 2.8156700134277344, "loss/hidden": 0.0, "loss/logits": 0.21776284649968147, "loss/reg": 0.8737536072731018, "step": 1826 }, { "epoch": 0.01827, "grad_norm": 0.3885592520236969, "grad_norm_var": 0.0003911630525487042, "learning_rate": 5e-05, "loss": 0.1871, "loss/crossentropy": 2.7764856219291687, "loss/hidden": 0.0, "loss/logits": 0.18714625388383865, "loss/reg": 0.8732184767723083, "step": 1827 }, { "epoch": 0.01828, "grad_norm": 0.4574081599712372, "grad_norm_var": 0.0005381117093431365, "learning_rate": 5e-05, "loss": 0.1966, "loss/crossentropy": 2.8399417996406555, "loss/hidden": 0.0, "loss/logits": 0.19662820547819138, "loss/reg": 0.8722001910209656, "step": 1828 }, { "epoch": 0.01829, "grad_norm": 0.4126952588558197, "grad_norm_var": 0.0005140311352538636, "learning_rate": 5e-05, "loss": 0.2029, "loss/crossentropy": 2.8371532559394836, "loss/hidden": 0.0, "loss/logits": 0.20286116003990173, "loss/reg": 0.8717215061187744, "step": 1829 }, { "epoch": 0.0183, "grad_norm": 0.44622132182121277, "grad_norm_var": 0.0005966652735205321, "learning_rate": 5e-05, "loss": 0.1875, "loss/crossentropy": 2.9045282006263733, "loss/hidden": 0.0, "loss/logits": 0.18746205791831017, "loss/reg": 0.8713307976722717, "step": 1830 }, { "epoch": 0.01831, "grad_norm": 0.42101621627807617, "grad_norm_var": 0.000601932039182614, "learning_rate": 5e-05, "loss": 0.1964, "loss/crossentropy": 2.7737737894058228, "loss/hidden": 0.0, "loss/logits": 0.19638841599225998, "loss/reg": 0.8712663650512695, "step": 1831 }, { "epoch": 0.01832, "grad_norm": 0.43446776270866394, "grad_norm_var": 0.0005525170621598444, "learning_rate": 5e-05, "loss": 0.2063, "loss/crossentropy": 2.769006133079529, "loss/hidden": 0.0, "loss/logits": 0.20632325112819672, "loss/reg": 0.8707724809646606, "step": 1832 }, { "epoch": 0.01833, "grad_norm": 0.42732134461402893, "grad_norm_var": 0.0005508020339593708, "learning_rate": 5e-05, "loss": 0.1962, "loss/crossentropy": 2.6726030111312866, "loss/hidden": 0.0, "loss/logits": 0.19617467373609543, "loss/reg": 0.8704696893692017, "step": 1833 }, { "epoch": 0.01834, "grad_norm": 0.36816877126693726, "grad_norm_var": 0.000634894013014827, "learning_rate": 5e-05, "loss": 0.1791, "loss/crossentropy": 2.8035976886749268, "loss/hidden": 0.0, "loss/logits": 0.1790780983865261, "loss/reg": 0.870456337928772, "step": 1834 }, { "epoch": 0.01835, "grad_norm": 0.4553738534450531, "grad_norm_var": 0.0007138150602069756, "learning_rate": 5e-05, "loss": 0.2036, "loss/crossentropy": 2.8905494809150696, "loss/hidden": 0.0, "loss/logits": 0.2036396525800228, "loss/reg": 0.8699305653572083, "step": 1835 }, { "epoch": 0.01836, "grad_norm": 0.40657714009284973, "grad_norm_var": 0.0007194821629816439, "learning_rate": 5e-05, "loss": 0.1953, "loss/crossentropy": 2.873349666595459, "loss/hidden": 0.0, "loss/logits": 0.19530102238059044, "loss/reg": 0.8697292804718018, "step": 1836 }, { "epoch": 0.01837, "grad_norm": 0.4543238580226898, "grad_norm_var": 0.0008056768340881389, "learning_rate": 5e-05, "loss": 0.2024, "loss/crossentropy": 3.1166374683380127, "loss/hidden": 0.0, "loss/logits": 0.20240385457873344, "loss/reg": 0.869637131690979, "step": 1837 }, { "epoch": 0.01838, "grad_norm": 0.421379953622818, "grad_norm_var": 0.000804472493204796, "learning_rate": 5e-05, "loss": 0.2083, "loss/crossentropy": 2.50061959028244, "loss/hidden": 0.0, "loss/logits": 0.20833520963788033, "loss/reg": 0.8689462542533875, "step": 1838 }, { "epoch": 0.01839, "grad_norm": 0.4087032377719879, "grad_norm_var": 0.0007935618870177426, "learning_rate": 5e-05, "loss": 0.2058, "loss/crossentropy": 2.8472816348075867, "loss/hidden": 0.0, "loss/logits": 0.2057865411043167, "loss/reg": 0.8682885766029358, "step": 1839 }, { "epoch": 0.0184, "grad_norm": 0.37850746512413025, "grad_norm_var": 0.0008428996161608304, "learning_rate": 5e-05, "loss": 0.1772, "loss/crossentropy": 2.649823546409607, "loss/hidden": 0.0, "loss/logits": 0.17720508575439453, "loss/reg": 0.8673396110534668, "step": 1840 }, { "epoch": 0.01841, "grad_norm": 0.4339388310909271, "grad_norm_var": 0.0007627055638354899, "learning_rate": 5e-05, "loss": 0.1935, "loss/crossentropy": 2.782778263092041, "loss/hidden": 0.0, "loss/logits": 0.19349564984440804, "loss/reg": 0.8672105669975281, "step": 1841 }, { "epoch": 0.01842, "grad_norm": 0.4713878333568573, "grad_norm_var": 0.0008734888219704917, "learning_rate": 5e-05, "loss": 0.2131, "loss/crossentropy": 2.9232393503189087, "loss/hidden": 0.0, "loss/logits": 0.21310868114233017, "loss/reg": 0.8665778040885925, "step": 1842 }, { "epoch": 0.01843, "grad_norm": 0.4213177263736725, "grad_norm_var": 0.0007852010018439011, "learning_rate": 5e-05, "loss": 0.1883, "loss/crossentropy": 2.7351691126823425, "loss/hidden": 0.0, "loss/logits": 0.18833507969975471, "loss/reg": 0.8655192852020264, "step": 1843 }, { "epoch": 0.01844, "grad_norm": 0.3939739465713501, "grad_norm_var": 0.0007725325420692782, "learning_rate": 5e-05, "loss": 0.194, "loss/crossentropy": 2.9118587970733643, "loss/hidden": 0.0, "loss/logits": 0.19397348538041115, "loss/reg": 0.8646129369735718, "step": 1844 }, { "epoch": 0.01845, "grad_norm": 0.47830748558044434, "grad_norm_var": 0.0009583470904952189, "learning_rate": 5e-05, "loss": 0.2142, "loss/crossentropy": 2.748432219028473, "loss/hidden": 0.0, "loss/logits": 0.21417668461799622, "loss/reg": 0.8641530871391296, "step": 1845 }, { "epoch": 0.01846, "grad_norm": 0.40475931763648987, "grad_norm_var": 0.000955724836401813, "learning_rate": 5e-05, "loss": 0.1785, "loss/crossentropy": 2.8377654552459717, "loss/hidden": 0.0, "loss/logits": 0.17845793813467026, "loss/reg": 0.8638774156570435, "step": 1846 }, { "epoch": 0.01847, "grad_norm": 0.412720263004303, "grad_norm_var": 0.00096301732033884, "learning_rate": 5e-05, "loss": 0.1763, "loss/crossentropy": 2.839793860912323, "loss/hidden": 0.0, "loss/logits": 0.17627452313899994, "loss/reg": 0.8637427091598511, "step": 1847 }, { "epoch": 0.01848, "grad_norm": 0.40244048833847046, "grad_norm_var": 0.0009790173845574976, "learning_rate": 5e-05, "loss": 0.194, "loss/crossentropy": 2.894149899482727, "loss/hidden": 0.0, "loss/logits": 0.19404540956020355, "loss/reg": 0.8632751703262329, "step": 1848 }, { "epoch": 0.01849, "grad_norm": 0.4253540635108948, "grad_norm_var": 0.0009776536425150676, "learning_rate": 5e-05, "loss": 0.1916, "loss/crossentropy": 2.7934269309043884, "loss/hidden": 0.0, "loss/logits": 0.19155260547995567, "loss/reg": 0.8631685376167297, "step": 1849 }, { "epoch": 0.0185, "grad_norm": 0.43653759360313416, "grad_norm_var": 0.0007874927555983254, "learning_rate": 5e-05, "loss": 0.2, "loss/crossentropy": 2.778549313545227, "loss/hidden": 0.0, "loss/logits": 0.2000497616827488, "loss/reg": 0.8622583150863647, "step": 1850 }, { "epoch": 0.01851, "grad_norm": 0.4531484544277191, "grad_norm_var": 0.0007788936634817945, "learning_rate": 5e-05, "loss": 0.2031, "loss/crossentropy": 2.610984146595001, "loss/hidden": 0.0, "loss/logits": 0.2030741162598133, "loss/reg": 0.8614676594734192, "step": 1851 }, { "epoch": 0.01852, "grad_norm": 0.47085142135620117, "grad_norm_var": 0.0008774013336590041, "learning_rate": 5e-05, "loss": 0.2051, "loss/crossentropy": 2.735887825489044, "loss/hidden": 0.0, "loss/logits": 0.20505616813898087, "loss/reg": 0.861035168170929, "step": 1852 }, { "epoch": 0.01853, "grad_norm": 0.4040316045284271, "grad_norm_var": 0.0008672012734844969, "learning_rate": 5e-05, "loss": 0.2319, "loss/crossentropy": 2.7442225217819214, "loss/hidden": 0.0, "loss/logits": 0.231895312666893, "loss/reg": 0.8610429167747498, "step": 1853 }, { "epoch": 0.01854, "grad_norm": 0.41742706298828125, "grad_norm_var": 0.0008706576516620286, "learning_rate": 5e-05, "loss": 0.2052, "loss/crossentropy": 2.814299166202545, "loss/hidden": 0.0, "loss/logits": 0.20518312603235245, "loss/reg": 0.8606191873550415, "step": 1854 }, { "epoch": 0.01855, "grad_norm": 0.463815838098526, "grad_norm_var": 0.000934583450987155, "learning_rate": 5e-05, "loss": 0.213, "loss/crossentropy": 2.772307276725769, "loss/hidden": 0.0, "loss/logits": 0.21297916769981384, "loss/reg": 0.8602007627487183, "step": 1855 }, { "epoch": 0.01856, "grad_norm": 0.4082344174385071, "grad_norm_var": 0.0007885627261811449, "learning_rate": 5e-05, "loss": 0.1897, "loss/crossentropy": 2.8122514486312866, "loss/hidden": 0.0, "loss/logits": 0.18972306698560715, "loss/reg": 0.86012864112854, "step": 1856 }, { "epoch": 0.01857, "grad_norm": 0.4443175792694092, "grad_norm_var": 0.0007991676930914182, "learning_rate": 5e-05, "loss": 0.2157, "loss/crossentropy": 2.6094889044761658, "loss/hidden": 0.0, "loss/logits": 0.21571212634444237, "loss/reg": 0.8605313301086426, "step": 1857 }, { "epoch": 0.01858, "grad_norm": 0.47876986861228943, "grad_norm_var": 0.0008415495263010108, "learning_rate": 5e-05, "loss": 0.2221, "loss/crossentropy": 2.6963194012641907, "loss/hidden": 0.0, "loss/logits": 0.22212516888976097, "loss/reg": 0.8600078821182251, "step": 1858 }, { "epoch": 0.01859, "grad_norm": 0.8267810940742493, "grad_norm_var": 0.01052554114220392, "learning_rate": 5e-05, "loss": 0.2183, "loss/crossentropy": 2.7884796261787415, "loss/hidden": 0.0, "loss/logits": 0.21834344044327736, "loss/reg": 0.8599786758422852, "step": 1859 }, { "epoch": 0.0186, "grad_norm": 0.5109309554100037, "grad_norm_var": 0.01038839950993009, "learning_rate": 5e-05, "loss": 0.1894, "loss/crossentropy": 2.711488723754883, "loss/hidden": 0.0, "loss/logits": 0.18943556025624275, "loss/reg": 0.8601222038269043, "step": 1860 }, { "epoch": 0.01861, "grad_norm": 0.44234102964401245, "grad_norm_var": 0.0104049609113968, "learning_rate": 5e-05, "loss": 0.1861, "loss/crossentropy": 2.85012149810791, "loss/hidden": 0.0, "loss/logits": 0.1861085630953312, "loss/reg": 0.8600141406059265, "step": 1861 }, { "epoch": 0.01862, "grad_norm": 0.4488767981529236, "grad_norm_var": 0.010186053331192524, "learning_rate": 5e-05, "loss": 0.1893, "loss/crossentropy": 2.6850525736808777, "loss/hidden": 0.0, "loss/logits": 0.18929185718297958, "loss/reg": 0.8600738644599915, "step": 1862 }, { "epoch": 0.01863, "grad_norm": 0.42296475172042847, "grad_norm_var": 0.01012064050706446, "learning_rate": 5e-05, "loss": 0.1931, "loss/crossentropy": 2.823063313961029, "loss/hidden": 0.0, "loss/logits": 0.19307823106646538, "loss/reg": 0.860470712184906, "step": 1863 }, { "epoch": 0.01864, "grad_norm": 0.4614562392234802, "grad_norm_var": 0.009837779451030537, "learning_rate": 5e-05, "loss": 0.1957, "loss/crossentropy": 2.8724761605262756, "loss/hidden": 0.0, "loss/logits": 0.19572097808122635, "loss/reg": 0.8609605431556702, "step": 1864 }, { "epoch": 0.01865, "grad_norm": 0.4670039117336273, "grad_norm_var": 0.009699710240888716, "learning_rate": 5e-05, "loss": 0.1882, "loss/crossentropy": 2.908924877643585, "loss/hidden": 0.0, "loss/logits": 0.1882306933403015, "loss/reg": 0.8610231280326843, "step": 1865 }, { "epoch": 0.01866, "grad_norm": 0.4499320387840271, "grad_norm_var": 0.009646977562170866, "learning_rate": 5e-05, "loss": 0.2121, "loss/crossentropy": 2.960861086845398, "loss/hidden": 0.0, "loss/logits": 0.21211780235171318, "loss/reg": 0.8618782162666321, "step": 1866 }, { "epoch": 0.01867, "grad_norm": 0.4010671079158783, "grad_norm_var": 0.009955610707336915, "learning_rate": 5e-05, "loss": 0.1869, "loss/crossentropy": 2.8528873920440674, "loss/hidden": 0.0, "loss/logits": 0.18694154918193817, "loss/reg": 0.8613529205322266, "step": 1867 }, { "epoch": 0.01868, "grad_norm": 0.4370492100715637, "grad_norm_var": 0.010022847689133333, "learning_rate": 5e-05, "loss": 0.2016, "loss/crossentropy": 2.8674468398094177, "loss/hidden": 0.0, "loss/logits": 0.20157770067453384, "loss/reg": 0.8607704043388367, "step": 1868 }, { "epoch": 0.01869, "grad_norm": 0.44706204533576965, "grad_norm_var": 0.009772638036635502, "learning_rate": 5e-05, "loss": 0.2073, "loss/crossentropy": 2.6901365518569946, "loss/hidden": 0.0, "loss/logits": 0.20734887197613716, "loss/reg": 0.8604735136032104, "step": 1869 }, { "epoch": 0.0187, "grad_norm": 0.4227212071418762, "grad_norm_var": 0.009736925025791706, "learning_rate": 5e-05, "loss": 0.1888, "loss/crossentropy": 2.9464325308799744, "loss/hidden": 0.0, "loss/logits": 0.18880737200379372, "loss/reg": 0.8606537580490112, "step": 1870 }, { "epoch": 0.01871, "grad_norm": 0.4298408329486847, "grad_norm_var": 0.00984085547868165, "learning_rate": 5e-05, "loss": 0.1967, "loss/crossentropy": 2.808286964893341, "loss/hidden": 0.0, "loss/logits": 0.19672463834285736, "loss/reg": 0.8605533838272095, "step": 1871 }, { "epoch": 0.01872, "grad_norm": 0.4341031014919281, "grad_norm_var": 0.009674092300272127, "learning_rate": 5e-05, "loss": 0.2119, "loss/crossentropy": 2.726570785045624, "loss/hidden": 0.0, "loss/logits": 0.21193281561136246, "loss/reg": 0.8598572611808777, "step": 1872 }, { "epoch": 0.01873, "grad_norm": 0.40797707438468933, "grad_norm_var": 0.009882653573959918, "learning_rate": 5e-05, "loss": 0.2006, "loss/crossentropy": 2.7558920979499817, "loss/hidden": 0.0, "loss/logits": 0.2006087228655815, "loss/reg": 0.8593345880508423, "step": 1873 }, { "epoch": 0.01874, "grad_norm": 0.4248046576976776, "grad_norm_var": 0.00998757024144767, "learning_rate": 5e-05, "loss": 0.2014, "loss/crossentropy": 2.8610259294509888, "loss/hidden": 0.0, "loss/logits": 0.20143526792526245, "loss/reg": 0.8589630722999573, "step": 1874 }, { "epoch": 0.01875, "grad_norm": 0.415412962436676, "grad_norm_var": 0.000703250459169195, "learning_rate": 5e-05, "loss": 0.1902, "loss/crossentropy": 2.6619481444358826, "loss/hidden": 0.0, "loss/logits": 0.1901528425514698, "loss/reg": 0.8587875962257385, "step": 1875 }, { "epoch": 0.01876, "grad_norm": 0.3922654688358307, "grad_norm_var": 0.0004447968186206778, "learning_rate": 5e-05, "loss": 0.1917, "loss/crossentropy": 2.875247359275818, "loss/hidden": 0.0, "loss/logits": 0.19171729311347008, "loss/reg": 0.8583662509918213, "step": 1876 }, { "epoch": 0.01877, "grad_norm": 0.41292887926101685, "grad_norm_var": 0.00045656488741578903, "learning_rate": 5e-05, "loss": 0.2055, "loss/crossentropy": 2.69901043176651, "loss/hidden": 0.0, "loss/logits": 0.20547962561249733, "loss/reg": 0.8579082489013672, "step": 1877 }, { "epoch": 0.01878, "grad_norm": 0.49054309725761414, "grad_norm_var": 0.0006715142851825543, "learning_rate": 5e-05, "loss": 0.2157, "loss/crossentropy": 2.7710567116737366, "loss/hidden": 0.0, "loss/logits": 0.21566850692033768, "loss/reg": 0.8571624159812927, "step": 1878 }, { "epoch": 0.01879, "grad_norm": 0.4159642457962036, "grad_norm_var": 0.0006833101582665221, "learning_rate": 5e-05, "loss": 0.2045, "loss/crossentropy": 2.7000783681869507, "loss/hidden": 0.0, "loss/logits": 0.20450333133339882, "loss/reg": 0.8566645979881287, "step": 1879 }, { "epoch": 0.0188, "grad_norm": 0.41294634342193604, "grad_norm_var": 0.0006391081317155453, "learning_rate": 5e-05, "loss": 0.1999, "loss/crossentropy": 2.711737096309662, "loss/hidden": 0.0, "loss/logits": 0.19987360760569572, "loss/reg": 0.8563292622566223, "step": 1880 }, { "epoch": 0.01881, "grad_norm": 0.3933762013912201, "grad_norm_var": 0.0006033787049420665, "learning_rate": 5e-05, "loss": 0.1932, "loss/crossentropy": 2.5067747831344604, "loss/hidden": 0.0, "loss/logits": 0.1932012103497982, "loss/reg": 0.856222927570343, "step": 1881 }, { "epoch": 0.01882, "grad_norm": 0.39141905307769775, "grad_norm_var": 0.0006169972349161612, "learning_rate": 5e-05, "loss": 0.1942, "loss/crossentropy": 2.7386473417282104, "loss/hidden": 0.0, "loss/logits": 0.19419138133525848, "loss/reg": 0.8559076189994812, "step": 1882 }, { "epoch": 0.01883, "grad_norm": 0.40779832005500793, "grad_norm_var": 0.0006023050366477952, "learning_rate": 5e-05, "loss": 0.1822, "loss/crossentropy": 2.7678736448287964, "loss/hidden": 0.0, "loss/logits": 0.18222039192914963, "loss/reg": 0.8556982278823853, "step": 1883 }, { "epoch": 0.01884, "grad_norm": 0.3862144649028778, "grad_norm_var": 0.0006551248482100055, "learning_rate": 5e-05, "loss": 0.1922, "loss/crossentropy": 2.750099837779999, "loss/hidden": 0.0, "loss/logits": 0.19219841435551643, "loss/reg": 0.8550093173980713, "step": 1884 }, { "epoch": 0.01885, "grad_norm": 0.3694680333137512, "grad_norm_var": 0.0007290592163974356, "learning_rate": 5e-05, "loss": 0.1775, "loss/crossentropy": 2.774383544921875, "loss/hidden": 0.0, "loss/logits": 0.17750544473528862, "loss/reg": 0.8547077775001526, "step": 1885 }, { "epoch": 0.01886, "grad_norm": 0.3932516574859619, "grad_norm_var": 0.0007450872750751143, "learning_rate": 5e-05, "loss": 0.1907, "loss/crossentropy": 2.850859224796295, "loss/hidden": 0.0, "loss/logits": 0.19068051874637604, "loss/reg": 0.8540486693382263, "step": 1886 }, { "epoch": 0.01887, "grad_norm": 0.4164002239704132, "grad_norm_var": 0.0007228728195628069, "learning_rate": 5e-05, "loss": 0.2016, "loss/crossentropy": 2.7217097878456116, "loss/hidden": 0.0, "loss/logits": 0.20161013677716255, "loss/reg": 0.8533588647842407, "step": 1887 }, { "epoch": 0.01888, "grad_norm": 0.4806760251522064, "grad_norm_var": 0.001006219679751382, "learning_rate": 5e-05, "loss": 0.1965, "loss/crossentropy": 2.9720823168754578, "loss/hidden": 0.0, "loss/logits": 0.19651786983013153, "loss/reg": 0.8531802892684937, "step": 1888 }, { "epoch": 0.01889, "grad_norm": 0.4182739555835724, "grad_norm_var": 0.0010056544745641364, "learning_rate": 5e-05, "loss": 0.199, "loss/crossentropy": 2.8201688528060913, "loss/hidden": 0.0, "loss/logits": 0.19896001368761063, "loss/reg": 0.852277398109436, "step": 1889 }, { "epoch": 0.0189, "grad_norm": 0.45667311549186707, "grad_norm_var": 0.0011156389935623918, "learning_rate": 5e-05, "loss": 0.202, "loss/crossentropy": 2.78415310382843, "loss/hidden": 0.0, "loss/logits": 0.20198464766144753, "loss/reg": 0.8516212105751038, "step": 1890 }, { "epoch": 0.01891, "grad_norm": 0.4377621114253998, "grad_norm_var": 0.0011455522062469323, "learning_rate": 5e-05, "loss": 0.2086, "loss/crossentropy": 2.7693448662757874, "loss/hidden": 0.0, "loss/logits": 0.20859498530626297, "loss/reg": 0.8512976169586182, "step": 1891 }, { "epoch": 0.01892, "grad_norm": 0.39116787910461426, "grad_norm_var": 0.0011492835139240007, "learning_rate": 5e-05, "loss": 0.1986, "loss/crossentropy": 2.9238383769989014, "loss/hidden": 0.0, "loss/logits": 0.19861867651343346, "loss/reg": 0.8505960702896118, "step": 1892 }, { "epoch": 0.01893, "grad_norm": 0.49259769916534424, "grad_norm_var": 0.001500831881940127, "learning_rate": 5e-05, "loss": 0.1856, "loss/crossentropy": 2.7086846828460693, "loss/hidden": 0.0, "loss/logits": 0.18557342514395714, "loss/reg": 0.8502125144004822, "step": 1893 }, { "epoch": 0.01894, "grad_norm": 0.4160771667957306, "grad_norm_var": 0.001168426734231694, "learning_rate": 5e-05, "loss": 0.1981, "loss/crossentropy": 2.7745996713638306, "loss/hidden": 0.0, "loss/logits": 0.198074109852314, "loss/reg": 0.8496638536453247, "step": 1894 }, { "epoch": 0.01895, "grad_norm": 0.41058945655822754, "grad_norm_var": 0.001171335815736004, "learning_rate": 5e-05, "loss": 0.1913, "loss/crossentropy": 2.8345044255256653, "loss/hidden": 0.0, "loss/logits": 0.1913003958761692, "loss/reg": 0.8487659692764282, "step": 1895 }, { "epoch": 0.01896, "grad_norm": 0.39137691259384155, "grad_norm_var": 0.0012125551676121801, "learning_rate": 5e-05, "loss": 0.1827, "loss/crossentropy": 2.756899118423462, "loss/hidden": 0.0, "loss/logits": 0.18267197161912918, "loss/reg": 0.8477123379707336, "step": 1896 }, { "epoch": 0.01897, "grad_norm": 0.4524717628955841, "grad_norm_var": 0.0012539780327675333, "learning_rate": 5e-05, "loss": 0.2036, "loss/crossentropy": 2.8982661962509155, "loss/hidden": 0.0, "loss/logits": 0.2035953588783741, "loss/reg": 0.8468810319900513, "step": 1897 }, { "epoch": 0.01898, "grad_norm": 0.4107264280319214, "grad_norm_var": 0.0012049521548068006, "learning_rate": 5e-05, "loss": 0.2077, "loss/crossentropy": 2.693164646625519, "loss/hidden": 0.0, "loss/logits": 0.20767628774046898, "loss/reg": 0.8459100127220154, "step": 1898 }, { "epoch": 0.01899, "grad_norm": 0.403199702501297, "grad_norm_var": 0.0012141969750022205, "learning_rate": 5e-05, "loss": 0.1958, "loss/crossentropy": 2.6866546869277954, "loss/hidden": 0.0, "loss/logits": 0.19577254354953766, "loss/reg": 0.8448077440261841, "step": 1899 }, { "epoch": 0.019, "grad_norm": 0.40245264768600464, "grad_norm_var": 0.0011565908327204018, "learning_rate": 5e-05, "loss": 0.1885, "loss/crossentropy": 2.6641695499420166, "loss/hidden": 0.0, "loss/logits": 0.18845220282673836, "loss/reg": 0.8437227010726929, "step": 1900 }, { "epoch": 0.01901, "grad_norm": 0.3748202621936798, "grad_norm_var": 0.001121286883045806, "learning_rate": 5e-05, "loss": 0.1874, "loss/crossentropy": 2.87463241815567, "loss/hidden": 0.0, "loss/logits": 0.18736360594630241, "loss/reg": 0.8430832624435425, "step": 1901 }, { "epoch": 0.01902, "grad_norm": 0.4243621528148651, "grad_norm_var": 0.0010634312725284515, "learning_rate": 5e-05, "loss": 0.213, "loss/crossentropy": 2.8850821256637573, "loss/hidden": 0.0, "loss/logits": 0.2129940651357174, "loss/reg": 0.8422893285751343, "step": 1902 }, { "epoch": 0.01903, "grad_norm": 0.375873863697052, "grad_norm_var": 0.0012056692127686082, "learning_rate": 5e-05, "loss": 0.1939, "loss/crossentropy": 2.9269325137138367, "loss/hidden": 0.0, "loss/logits": 0.19388863816857338, "loss/reg": 0.8414551019668579, "step": 1903 }, { "epoch": 0.01904, "grad_norm": 0.42073309421539307, "grad_norm_var": 0.0009548363804637934, "learning_rate": 5e-05, "loss": 0.2025, "loss/crossentropy": 2.8575509190559387, "loss/hidden": 0.0, "loss/logits": 0.20251396670937538, "loss/reg": 0.8409824371337891, "step": 1904 }, { "epoch": 0.01905, "grad_norm": 0.44542282819747925, "grad_norm_var": 0.001003894760507947, "learning_rate": 5e-05, "loss": 0.2087, "loss/crossentropy": 2.881743311882019, "loss/hidden": 0.0, "loss/logits": 0.2086678370833397, "loss/reg": 0.8402405381202698, "step": 1905 }, { "epoch": 0.01906, "grad_norm": 0.4176529049873352, "grad_norm_var": 0.0009038042833217001, "learning_rate": 5e-05, "loss": 0.2046, "loss/crossentropy": 2.762676239013672, "loss/hidden": 0.0, "loss/logits": 0.20464308559894562, "loss/reg": 0.8392151594161987, "step": 1906 }, { "epoch": 0.01907, "grad_norm": 0.4172472059726715, "grad_norm_var": 0.0008725113390647336, "learning_rate": 5e-05, "loss": 0.1949, "loss/crossentropy": 2.9665162563323975, "loss/hidden": 0.0, "loss/logits": 0.19487519562244415, "loss/reg": 0.8380366563796997, "step": 1907 }, { "epoch": 0.01908, "grad_norm": 0.4117933511734009, "grad_norm_var": 0.0008323956791387459, "learning_rate": 5e-05, "loss": 0.1829, "loss/crossentropy": 2.9393438696861267, "loss/hidden": 0.0, "loss/logits": 0.18285026401281357, "loss/reg": 0.8377097249031067, "step": 1908 }, { "epoch": 0.01909, "grad_norm": 0.4283354580402374, "grad_norm_var": 0.0004402894728716948, "learning_rate": 5e-05, "loss": 0.2004, "loss/crossentropy": 2.8670120239257812, "loss/hidden": 0.0, "loss/logits": 0.20037797465920448, "loss/reg": 0.83690345287323, "step": 1909 }, { "epoch": 0.0191, "grad_norm": 0.4141778349876404, "grad_norm_var": 0.0004396586654966228, "learning_rate": 5e-05, "loss": 0.192, "loss/crossentropy": 2.7264450788497925, "loss/hidden": 0.0, "loss/logits": 0.19202467799186707, "loss/reg": 0.8363360166549683, "step": 1910 }, { "epoch": 0.01911, "grad_norm": 0.3901000916957855, "grad_norm_var": 0.00047132750558592784, "learning_rate": 5e-05, "loss": 0.1746, "loss/crossentropy": 2.662300944328308, "loss/hidden": 0.0, "loss/logits": 0.17456327751278877, "loss/reg": 0.8360676765441895, "step": 1911 }, { "epoch": 0.01912, "grad_norm": 0.4108651280403137, "grad_norm_var": 0.00044330438000829143, "learning_rate": 5e-05, "loss": 0.1979, "loss/crossentropy": 2.7080438137054443, "loss/hidden": 0.0, "loss/logits": 0.19789596647024155, "loss/reg": 0.8352373242378235, "step": 1912 }, { "epoch": 0.01913, "grad_norm": 0.3944729268550873, "grad_norm_var": 0.0003445506227322981, "learning_rate": 5e-05, "loss": 0.1837, "loss/crossentropy": 2.5074052810668945, "loss/hidden": 0.0, "loss/logits": 0.18367857486009598, "loss/reg": 0.8342070579528809, "step": 1913 }, { "epoch": 0.01914, "grad_norm": 0.40939974784851074, "grad_norm_var": 0.0003443357351905522, "learning_rate": 5e-05, "loss": 0.1968, "loss/crossentropy": 2.710819363594055, "loss/hidden": 0.0, "loss/logits": 0.19682180508971214, "loss/reg": 0.8332958221435547, "step": 1914 }, { "epoch": 0.01915, "grad_norm": 0.7204210758209229, "grad_norm_var": 0.006396513333546554, "learning_rate": 5e-05, "loss": 0.2156, "loss/crossentropy": 2.985659956932068, "loss/hidden": 0.0, "loss/logits": 0.215598963201046, "loss/reg": 0.8321488499641418, "step": 1915 }, { "epoch": 0.01916, "grad_norm": 0.5111802220344543, "grad_norm_var": 0.006755829513090935, "learning_rate": 5e-05, "loss": 0.2083, "loss/crossentropy": 2.765591084957123, "loss/hidden": 0.0, "loss/logits": 0.20833655446767807, "loss/reg": 0.8310338258743286, "step": 1916 }, { "epoch": 0.01917, "grad_norm": 0.4084041714668274, "grad_norm_var": 0.006554926472281564, "learning_rate": 5e-05, "loss": 0.1914, "loss/crossentropy": 2.8071027994155884, "loss/hidden": 0.0, "loss/logits": 0.19143269956111908, "loss/reg": 0.8302226662635803, "step": 1917 }, { "epoch": 0.01918, "grad_norm": 0.44032594561576843, "grad_norm_var": 0.006542831349444234, "learning_rate": 5e-05, "loss": 0.2035, "loss/crossentropy": 2.7133736610412598, "loss/hidden": 0.0, "loss/logits": 0.20351077988743782, "loss/reg": 0.8293207883834839, "step": 1918 }, { "epoch": 0.01919, "grad_norm": 0.46147823333740234, "grad_norm_var": 0.00628573912805136, "learning_rate": 5e-05, "loss": 0.2017, "loss/crossentropy": 2.8348661065101624, "loss/hidden": 0.0, "loss/logits": 0.2017073966562748, "loss/reg": 0.8288859724998474, "step": 1919 }, { "epoch": 0.0192, "grad_norm": 0.3947501480579376, "grad_norm_var": 0.0064081085864480395, "learning_rate": 5e-05, "loss": 0.1891, "loss/crossentropy": 2.7925063967704773, "loss/hidden": 0.0, "loss/logits": 0.1890932358801365, "loss/reg": 0.8279058933258057, "step": 1920 }, { "epoch": 0.01921, "grad_norm": 0.4013521373271942, "grad_norm_var": 0.006510863884384721, "learning_rate": 5e-05, "loss": 0.1978, "loss/crossentropy": 2.7446338534355164, "loss/hidden": 0.0, "loss/logits": 0.19776693731546402, "loss/reg": 0.8273690342903137, "step": 1921 }, { "epoch": 0.01922, "grad_norm": 0.3815420866012573, "grad_norm_var": 0.006697539133585843, "learning_rate": 5e-05, "loss": 0.1782, "loss/crossentropy": 2.911185562610626, "loss/hidden": 0.0, "loss/logits": 0.17821616306900978, "loss/reg": 0.8266914486885071, "step": 1922 }, { "epoch": 0.01923, "grad_norm": 0.36894431710243225, "grad_norm_var": 0.006972125815576998, "learning_rate": 5e-05, "loss": 0.184, "loss/crossentropy": 2.8202965259552, "loss/hidden": 0.0, "loss/logits": 0.1839989572763443, "loss/reg": 0.826550304889679, "step": 1923 }, { "epoch": 0.01924, "grad_norm": 0.41022172570228577, "grad_norm_var": 0.006976979996265461, "learning_rate": 5e-05, "loss": 0.1841, "loss/crossentropy": 2.8062856197357178, "loss/hidden": 0.0, "loss/logits": 0.18412147462368011, "loss/reg": 0.8260629177093506, "step": 1924 }, { "epoch": 0.01925, "grad_norm": 0.42535021901130676, "grad_norm_var": 0.006979840681534692, "learning_rate": 5e-05, "loss": 0.2046, "loss/crossentropy": 2.9198508858680725, "loss/hidden": 0.0, "loss/logits": 0.20462898537516594, "loss/reg": 0.8256332874298096, "step": 1925 }, { "epoch": 0.01926, "grad_norm": 0.41190701723098755, "grad_norm_var": 0.006986145451577327, "learning_rate": 5e-05, "loss": 0.1974, "loss/crossentropy": 2.8014107942581177, "loss/hidden": 0.0, "loss/logits": 0.1974184885621071, "loss/reg": 0.8247300386428833, "step": 1926 }, { "epoch": 0.01927, "grad_norm": 0.44452086091041565, "grad_norm_var": 0.006854194152772293, "learning_rate": 5e-05, "loss": 0.2007, "loss/crossentropy": 2.822911560535431, "loss/hidden": 0.0, "loss/logits": 0.20071227848529816, "loss/reg": 0.8244222402572632, "step": 1927 }, { "epoch": 0.01928, "grad_norm": 0.5008922219276428, "grad_norm_var": 0.007044683448397379, "learning_rate": 5e-05, "loss": 0.2141, "loss/crossentropy": 2.8664467334747314, "loss/hidden": 0.0, "loss/logits": 0.2140694372355938, "loss/reg": 0.8241965174674988, "step": 1928 }, { "epoch": 0.01929, "grad_norm": 0.42727142572402954, "grad_norm_var": 0.006900477335719032, "learning_rate": 5e-05, "loss": 0.1976, "loss/crossentropy": 2.8532140851020813, "loss/hidden": 0.0, "loss/logits": 0.19760168716311455, "loss/reg": 0.8241434097290039, "step": 1929 }, { "epoch": 0.0193, "grad_norm": 0.3961251378059387, "grad_norm_var": 0.006974275889228206, "learning_rate": 5e-05, "loss": 0.1868, "loss/crossentropy": 2.774713635444641, "loss/hidden": 0.0, "loss/logits": 0.18682782351970673, "loss/reg": 0.8236058354377747, "step": 1930 }, { "epoch": 0.01931, "grad_norm": 0.4069480299949646, "grad_norm_var": 0.001564247241811408, "learning_rate": 5e-05, "loss": 0.183, "loss/crossentropy": 2.7575384974479675, "loss/hidden": 0.0, "loss/logits": 0.1830037534236908, "loss/reg": 0.8234732747077942, "step": 1931 }, { "epoch": 0.01932, "grad_norm": 0.3841603994369507, "grad_norm_var": 0.001103778174189011, "learning_rate": 5e-05, "loss": 0.1853, "loss/crossentropy": 2.786939561367035, "loss/hidden": 0.0, "loss/logits": 0.1853381060063839, "loss/reg": 0.8227251768112183, "step": 1932 }, { "epoch": 0.01933, "grad_norm": 0.42709746956825256, "grad_norm_var": 0.0011054095386222298, "learning_rate": 5e-05, "loss": 0.2276, "loss/crossentropy": 2.7064501643180847, "loss/hidden": 0.0, "loss/logits": 0.22762993723154068, "loss/reg": 0.8224155306816101, "step": 1933 }, { "epoch": 0.01934, "grad_norm": 0.4189620614051819, "grad_norm_var": 0.0010694294421293808, "learning_rate": 5e-05, "loss": 0.2046, "loss/crossentropy": 2.8003990650177, "loss/hidden": 0.0, "loss/logits": 0.20464803650975227, "loss/reg": 0.8217348456382751, "step": 1934 }, { "epoch": 0.01935, "grad_norm": 0.42320385575294495, "grad_norm_var": 0.0009306623657763987, "learning_rate": 5e-05, "loss": 0.1894, "loss/crossentropy": 2.7599529027938843, "loss/hidden": 0.0, "loss/logits": 0.18942034244537354, "loss/reg": 0.8217450976371765, "step": 1935 }, { "epoch": 0.01936, "grad_norm": 0.43891292810440063, "grad_norm_var": 0.0009394853920891936, "learning_rate": 5e-05, "loss": 0.1988, "loss/crossentropy": 2.941531538963318, "loss/hidden": 0.0, "loss/logits": 0.19884098693728447, "loss/reg": 0.8212395906448364, "step": 1936 }, { "epoch": 0.01937, "grad_norm": 0.38071680068969727, "grad_norm_var": 0.0010083631744948343, "learning_rate": 5e-05, "loss": 0.1832, "loss/crossentropy": 2.777757167816162, "loss/hidden": 0.0, "loss/logits": 0.1832246519625187, "loss/reg": 0.8206287622451782, "step": 1937 }, { "epoch": 0.01938, "grad_norm": 0.4053645431995392, "grad_norm_var": 0.0009362139371330458, "learning_rate": 5e-05, "loss": 0.1942, "loss/crossentropy": 2.6445663571357727, "loss/hidden": 0.0, "loss/logits": 0.19421947374939919, "loss/reg": 0.8199341893196106, "step": 1938 }, { "epoch": 0.01939, "grad_norm": 0.3973230719566345, "grad_norm_var": 0.0008050451379228551, "learning_rate": 5e-05, "loss": 0.1926, "loss/crossentropy": 2.7140950560569763, "loss/hidden": 0.0, "loss/logits": 0.19255831465125084, "loss/reg": 0.8188463449478149, "step": 1939 }, { "epoch": 0.0194, "grad_norm": 0.3956315219402313, "grad_norm_var": 0.0008348160456935834, "learning_rate": 5e-05, "loss": 0.1975, "loss/crossentropy": 2.704246759414673, "loss/hidden": 0.0, "loss/logits": 0.19747210666537285, "loss/reg": 0.8183398246765137, "step": 1940 }, { "epoch": 0.01941, "grad_norm": 0.4323425889015198, "grad_norm_var": 0.0008449350953672311, "learning_rate": 5e-05, "loss": 0.2033, "loss/crossentropy": 2.6787729263305664, "loss/hidden": 0.0, "loss/logits": 0.2032655067741871, "loss/reg": 0.8175671100616455, "step": 1941 }, { "epoch": 0.01942, "grad_norm": 0.42692750692367554, "grad_norm_var": 0.000846410359529115, "learning_rate": 5e-05, "loss": 0.1898, "loss/crossentropy": 2.741370141506195, "loss/hidden": 0.0, "loss/logits": 0.1897616647183895, "loss/reg": 0.8167078495025635, "step": 1942 }, { "epoch": 0.01943, "grad_norm": 0.5386472344398499, "grad_norm_var": 0.0017185548646476998, "learning_rate": 5e-05, "loss": 0.2308, "loss/crossentropy": 2.8304240703582764, "loss/hidden": 0.0, "loss/logits": 0.23077243193984032, "loss/reg": 0.8157216310501099, "step": 1943 }, { "epoch": 0.01944, "grad_norm": 0.39153966307640076, "grad_norm_var": 0.001359874314681516, "learning_rate": 5e-05, "loss": 0.1842, "loss/crossentropy": 2.911763846874237, "loss/hidden": 0.0, "loss/logits": 0.18419482558965683, "loss/reg": 0.8156632781028748, "step": 1944 }, { "epoch": 0.01945, "grad_norm": 0.46944209933280945, "grad_norm_var": 0.0015220376425207978, "learning_rate": 5e-05, "loss": 0.2035, "loss/crossentropy": 2.9459604024887085, "loss/hidden": 0.0, "loss/logits": 0.20351316034793854, "loss/reg": 0.8148277401924133, "step": 1945 }, { "epoch": 0.01946, "grad_norm": 0.5551398396492004, "grad_norm_var": 0.002578514831995271, "learning_rate": 5e-05, "loss": 0.2106, "loss/crossentropy": 2.837183356285095, "loss/hidden": 0.0, "loss/logits": 0.21064143255352974, "loss/reg": 0.8145296573638916, "step": 1946 }, { "epoch": 0.01947, "grad_norm": 0.5528098940849304, "grad_norm_var": 0.0034449012988506072, "learning_rate": 5e-05, "loss": 0.1961, "loss/crossentropy": 2.9039652347564697, "loss/hidden": 0.0, "loss/logits": 0.19607802107930183, "loss/reg": 0.8138279318809509, "step": 1947 }, { "epoch": 0.01948, "grad_norm": 0.415836900472641, "grad_norm_var": 0.0032722428995358302, "learning_rate": 5e-05, "loss": 0.2086, "loss/crossentropy": 2.6907394528388977, "loss/hidden": 0.0, "loss/logits": 0.20860682427883148, "loss/reg": 0.8135868310928345, "step": 1948 }, { "epoch": 0.01949, "grad_norm": 0.38912898302078247, "grad_norm_var": 0.00343712172172476, "learning_rate": 5e-05, "loss": 0.1984, "loss/crossentropy": 2.80822890996933, "loss/hidden": 0.0, "loss/logits": 0.19835097342729568, "loss/reg": 0.813839316368103, "step": 1949 }, { "epoch": 0.0195, "grad_norm": 0.40182438492774963, "grad_norm_var": 0.0035023975724768897, "learning_rate": 5e-05, "loss": 0.1943, "loss/crossentropy": 2.7360247373580933, "loss/hidden": 0.0, "loss/logits": 0.19426304474473, "loss/reg": 0.8130156993865967, "step": 1950 }, { "epoch": 0.01951, "grad_norm": 0.5061089992523193, "grad_norm_var": 0.0037637273327097245, "learning_rate": 5e-05, "loss": 0.2302, "loss/crossentropy": 2.810616374015808, "loss/hidden": 0.0, "loss/logits": 0.23021040111780167, "loss/reg": 0.8126420974731445, "step": 1951 }, { "epoch": 0.01952, "grad_norm": 0.40442657470703125, "grad_norm_var": 0.0038596389857129174, "learning_rate": 5e-05, "loss": 0.1996, "loss/crossentropy": 2.7186889052391052, "loss/hidden": 0.0, "loss/logits": 0.1996094360947609, "loss/reg": 0.812751293182373, "step": 1952 }, { "epoch": 0.01953, "grad_norm": 0.3843165338039398, "grad_norm_var": 0.0038312987729529585, "learning_rate": 5e-05, "loss": 0.1957, "loss/crossentropy": 2.771174371242523, "loss/hidden": 0.0, "loss/logits": 0.19567937403917313, "loss/reg": 0.8128808736801147, "step": 1953 }, { "epoch": 0.01954, "grad_norm": 0.46193453669548035, "grad_norm_var": 0.00375742651823829, "learning_rate": 5e-05, "loss": 0.1962, "loss/crossentropy": 2.7205318808555603, "loss/hidden": 0.0, "loss/logits": 0.19615139067173004, "loss/reg": 0.8125832676887512, "step": 1954 }, { "epoch": 0.01955, "grad_norm": 0.4301360249519348, "grad_norm_var": 0.003615205873543022, "learning_rate": 5e-05, "loss": 0.2023, "loss/crossentropy": 2.7779752612113953, "loss/hidden": 0.0, "loss/logits": 0.2022506631910801, "loss/reg": 0.8117827773094177, "step": 1955 }, { "epoch": 0.01956, "grad_norm": 0.4634301960468292, "grad_norm_var": 0.0034357660159029303, "learning_rate": 5e-05, "loss": 0.205, "loss/crossentropy": 2.9738866090774536, "loss/hidden": 0.0, "loss/logits": 0.20504064112901688, "loss/reg": 0.8119235038757324, "step": 1956 }, { "epoch": 0.01957, "grad_norm": 0.4130321741104126, "grad_norm_var": 0.003508395486898848, "learning_rate": 5e-05, "loss": 0.1876, "loss/crossentropy": 2.8298826217651367, "loss/hidden": 0.0, "loss/logits": 0.18758833408355713, "loss/reg": 0.8116447925567627, "step": 1957 }, { "epoch": 0.01958, "grad_norm": 0.47880375385284424, "grad_norm_var": 0.003514979627616995, "learning_rate": 5e-05, "loss": 0.2127, "loss/crossentropy": 2.865356743335724, "loss/hidden": 0.0, "loss/logits": 0.21270957589149475, "loss/reg": 0.8115543127059937, "step": 1958 }, { "epoch": 0.01959, "grad_norm": 0.5161337852478027, "grad_norm_var": 0.0032911683468492196, "learning_rate": 5e-05, "loss": 0.1939, "loss/crossentropy": 2.847059905529022, "loss/hidden": 0.0, "loss/logits": 0.19394532218575478, "loss/reg": 0.8105104565620422, "step": 1959 }, { "epoch": 0.0196, "grad_norm": 0.41269540786743164, "grad_norm_var": 0.0031482363185309096, "learning_rate": 5e-05, "loss": 0.1936, "loss/crossentropy": 2.763217329978943, "loss/hidden": 0.0, "loss/logits": 0.19357923790812492, "loss/reg": 0.8093386888504028, "step": 1960 }, { "epoch": 0.01961, "grad_norm": 0.4319096505641937, "grad_norm_var": 0.0031562494539512987, "learning_rate": 5e-05, "loss": 0.2449, "loss/crossentropy": 2.7221856117248535, "loss/hidden": 0.0, "loss/logits": 0.24492282792925835, "loss/reg": 0.8088083267211914, "step": 1961 }, { "epoch": 0.01962, "grad_norm": 0.5298008322715759, "grad_norm_var": 0.0028448906488864, "learning_rate": 5e-05, "loss": 0.2001, "loss/crossentropy": 2.8264525532722473, "loss/hidden": 0.0, "loss/logits": 0.20010977983474731, "loss/reg": 0.8084967136383057, "step": 1962 }, { "epoch": 0.01963, "grad_norm": 0.47640174627304077, "grad_norm_var": 0.0021574920282253717, "learning_rate": 5e-05, "loss": 0.228, "loss/crossentropy": 2.74677574634552, "loss/hidden": 0.0, "loss/logits": 0.2279682606458664, "loss/reg": 0.8080564141273499, "step": 1963 }, { "epoch": 0.01964, "grad_norm": 0.44032299518585205, "grad_norm_var": 0.0021005854531945625, "learning_rate": 5e-05, "loss": 0.2062, "loss/crossentropy": 2.8946619629859924, "loss/hidden": 0.0, "loss/logits": 0.2062392719089985, "loss/reg": 0.8072237968444824, "step": 1964 }, { "epoch": 0.01965, "grad_norm": 0.4359903037548065, "grad_norm_var": 0.0018807734680394244, "learning_rate": 5e-05, "loss": 0.1935, "loss/crossentropy": 2.8006544709205627, "loss/hidden": 0.0, "loss/logits": 0.1934516429901123, "loss/reg": 0.8061104416847229, "step": 1965 }, { "epoch": 0.01966, "grad_norm": 0.4103769063949585, "grad_norm_var": 0.0018313161015808082, "learning_rate": 5e-05, "loss": 0.1877, "loss/crossentropy": 2.85004460811615, "loss/hidden": 0.0, "loss/logits": 0.18770591914653778, "loss/reg": 0.805844247341156, "step": 1966 }, { "epoch": 0.01967, "grad_norm": 0.45468464493751526, "grad_norm_var": 0.0016100881394809363, "learning_rate": 5e-05, "loss": 0.2063, "loss/crossentropy": 2.8525925278663635, "loss/hidden": 0.0, "loss/logits": 0.20628943294286728, "loss/reg": 0.8054131865501404, "step": 1967 }, { "epoch": 0.01968, "grad_norm": 0.41498661041259766, "grad_norm_var": 0.0015577833495147076, "learning_rate": 5e-05, "loss": 0.1929, "loss/crossentropy": 2.7838199138641357, "loss/hidden": 0.0, "loss/logits": 0.1929318793118, "loss/reg": 0.8045769929885864, "step": 1968 }, { "epoch": 0.01969, "grad_norm": 0.4064616858959198, "grad_norm_var": 0.0014028036544412833, "learning_rate": 5e-05, "loss": 0.1996, "loss/crossentropy": 2.7728294730186462, "loss/hidden": 0.0, "loss/logits": 0.1995893009006977, "loss/reg": 0.8039410710334778, "step": 1969 }, { "epoch": 0.0197, "grad_norm": 0.3879988193511963, "grad_norm_var": 0.0016126988674289818, "learning_rate": 5e-05, "loss": 0.1848, "loss/crossentropy": 2.737799882888794, "loss/hidden": 0.0, "loss/logits": 0.18477895110845566, "loss/reg": 0.8039387464523315, "step": 1970 }, { "epoch": 0.01971, "grad_norm": 0.4101479649543762, "grad_norm_var": 0.0016744785608907168, "learning_rate": 5e-05, "loss": 0.1831, "loss/crossentropy": 2.82141375541687, "loss/hidden": 0.0, "loss/logits": 0.18309226632118225, "loss/reg": 0.80408775806427, "step": 1971 }, { "epoch": 0.01972, "grad_norm": 0.6650723814964294, "grad_norm_var": 0.004773083863224518, "learning_rate": 5e-05, "loss": 0.2239, "loss/crossentropy": 3.000373601913452, "loss/hidden": 0.0, "loss/logits": 0.22394775971770287, "loss/reg": 0.8045146465301514, "step": 1972 }, { "epoch": 0.01973, "grad_norm": 0.458319753408432, "grad_norm_var": 0.004646034076159048, "learning_rate": 5e-05, "loss": 0.1992, "loss/crossentropy": 2.6664801836013794, "loss/hidden": 0.0, "loss/logits": 0.1992279216647148, "loss/reg": 0.8046901226043701, "step": 1973 }, { "epoch": 0.01974, "grad_norm": 0.4235832989215851, "grad_norm_var": 0.004684412564852858, "learning_rate": 5e-05, "loss": 0.1941, "loss/crossentropy": 2.769956588745117, "loss/hidden": 0.0, "loss/logits": 0.1941225603222847, "loss/reg": 0.8042617440223694, "step": 1974 }, { "epoch": 0.01975, "grad_norm": 0.4769344925880432, "grad_norm_var": 0.0044592586608794465, "learning_rate": 5e-05, "loss": 0.2022, "loss/crossentropy": 2.8257822394371033, "loss/hidden": 0.0, "loss/logits": 0.2021910548210144, "loss/reg": 0.8034544587135315, "step": 1975 }, { "epoch": 0.01976, "grad_norm": 0.4848521649837494, "grad_norm_var": 0.004404308109408961, "learning_rate": 5e-05, "loss": 0.2129, "loss/crossentropy": 2.8919442892074585, "loss/hidden": 0.0, "loss/logits": 0.21290498971939087, "loss/reg": 0.8028760552406311, "step": 1976 }, { "epoch": 0.01977, "grad_norm": 0.3805828094482422, "grad_norm_var": 0.004738891169894188, "learning_rate": 5e-05, "loss": 0.1892, "loss/crossentropy": 2.8650283813476562, "loss/hidden": 0.0, "loss/logits": 0.18921462818980217, "loss/reg": 0.8020926713943481, "step": 1977 }, { "epoch": 0.01978, "grad_norm": 0.4142460227012634, "grad_norm_var": 0.004398356120446204, "learning_rate": 5e-05, "loss": 0.1878, "loss/crossentropy": 2.8711732625961304, "loss/hidden": 0.0, "loss/logits": 0.18783869594335556, "loss/reg": 0.8014591932296753, "step": 1978 }, { "epoch": 0.01979, "grad_norm": 0.44899171590805054, "grad_norm_var": 0.0043353381509027384, "learning_rate": 5e-05, "loss": 0.208, "loss/crossentropy": 2.5625649094581604, "loss/hidden": 0.0, "loss/logits": 0.2080221101641655, "loss/reg": 0.8012404441833496, "step": 1979 }, { "epoch": 0.0198, "grad_norm": 0.44338473677635193, "grad_norm_var": 0.004334179241258988, "learning_rate": 5e-05, "loss": 0.2112, "loss/crossentropy": 2.8066795468330383, "loss/hidden": 0.0, "loss/logits": 0.21121276542544365, "loss/reg": 0.8007535934448242, "step": 1980 }, { "epoch": 0.01981, "grad_norm": 0.3934018313884735, "grad_norm_var": 0.004497499997501934, "learning_rate": 5e-05, "loss": 0.1875, "loss/crossentropy": 2.74673193693161, "loss/hidden": 0.0, "loss/logits": 0.18748626857995987, "loss/reg": 0.8003087043762207, "step": 1981 }, { "epoch": 0.01982, "grad_norm": 0.47937899827957153, "grad_norm_var": 0.004502974365799129, "learning_rate": 5e-05, "loss": 0.2173, "loss/crossentropy": 2.823216140270233, "loss/hidden": 0.0, "loss/logits": 0.21725162491202354, "loss/reg": 0.8001217842102051, "step": 1982 }, { "epoch": 0.01983, "grad_norm": 0.41723868250846863, "grad_norm_var": 0.0045494442842594855, "learning_rate": 5e-05, "loss": 0.1981, "loss/crossentropy": 2.7272141575813293, "loss/hidden": 0.0, "loss/logits": 0.19813034310936928, "loss/reg": 0.8001703023910522, "step": 1983 }, { "epoch": 0.01984, "grad_norm": 0.4236186444759369, "grad_norm_var": 0.004520594879298286, "learning_rate": 5e-05, "loss": 0.2114, "loss/crossentropy": 2.7918258905410767, "loss/hidden": 0.0, "loss/logits": 0.21141138672828674, "loss/reg": 0.7996878027915955, "step": 1984 }, { "epoch": 0.01985, "grad_norm": 0.43094608187675476, "grad_norm_var": 0.004433431641809552, "learning_rate": 5e-05, "loss": 0.2094, "loss/crossentropy": 2.647144854068756, "loss/hidden": 0.0, "loss/logits": 0.20941178873181343, "loss/reg": 0.7999981045722961, "step": 1985 }, { "epoch": 0.01986, "grad_norm": 0.4323650598526001, "grad_norm_var": 0.004212350788636702, "learning_rate": 5e-05, "loss": 0.2113, "loss/crossentropy": 2.794140636920929, "loss/hidden": 0.0, "loss/logits": 0.21128448471426964, "loss/reg": 0.7999016046524048, "step": 1986 }, { "epoch": 0.01987, "grad_norm": 0.45955783128738403, "grad_norm_var": 0.004109362838493628, "learning_rate": 5e-05, "loss": 0.2124, "loss/crossentropy": 2.7951821088790894, "loss/hidden": 0.0, "loss/logits": 0.21235604584217072, "loss/reg": 0.7995927333831787, "step": 1987 }, { "epoch": 0.01988, "grad_norm": 0.4196851849555969, "grad_norm_var": 0.0009023983358114496, "learning_rate": 5e-05, "loss": 0.1901, "loss/crossentropy": 2.8231590390205383, "loss/hidden": 0.0, "loss/logits": 0.19013461470603943, "loss/reg": 0.7984586358070374, "step": 1988 }, { "epoch": 0.01989, "grad_norm": 0.49775421619415283, "grad_norm_var": 0.0011133027865477651, "learning_rate": 5e-05, "loss": 0.208, "loss/crossentropy": 3.028570830821991, "loss/hidden": 0.0, "loss/logits": 0.20801334083080292, "loss/reg": 0.7981004118919373, "step": 1989 }, { "epoch": 0.0199, "grad_norm": 0.4030906856060028, "grad_norm_var": 0.0011821039332649108, "learning_rate": 5e-05, "loss": 0.195, "loss/crossentropy": 2.868641436100006, "loss/hidden": 0.0, "loss/logits": 0.19496846944093704, "loss/reg": 0.7968870401382446, "step": 1990 }, { "epoch": 0.01991, "grad_norm": 0.5345245003700256, "grad_norm_var": 0.0016893028660055229, "learning_rate": 5e-05, "loss": 0.2065, "loss/crossentropy": 2.789147198200226, "loss/hidden": 0.0, "loss/logits": 0.20645881444215775, "loss/reg": 0.7962032556533813, "step": 1991 }, { "epoch": 0.01992, "grad_norm": 0.40988919138908386, "grad_norm_var": 0.0016069727992540472, "learning_rate": 5e-05, "loss": 0.2057, "loss/crossentropy": 2.6691997051239014, "loss/hidden": 0.0, "loss/logits": 0.20573721826076508, "loss/reg": 0.7955927848815918, "step": 1992 }, { "epoch": 0.01993, "grad_norm": 0.42493656277656555, "grad_norm_var": 0.0013975202967209408, "learning_rate": 5e-05, "loss": 0.1814, "loss/crossentropy": 2.8890662789344788, "loss/hidden": 0.0, "loss/logits": 0.18143537640571594, "loss/reg": 0.7952697277069092, "step": 1993 }, { "epoch": 0.01994, "grad_norm": 0.5122359395027161, "grad_norm_var": 0.0016668707279849186, "learning_rate": 5e-05, "loss": 0.2132, "loss/crossentropy": 2.8018561005592346, "loss/hidden": 0.0, "loss/logits": 0.21317176520824432, "loss/reg": 0.7952179312705994, "step": 1994 }, { "epoch": 0.01995, "grad_norm": 0.39479175209999084, "grad_norm_var": 0.0018265944699556594, "learning_rate": 5e-05, "loss": 0.1812, "loss/crossentropy": 2.8895358443260193, "loss/hidden": 0.0, "loss/logits": 0.18123216927051544, "loss/reg": 0.7947417497634888, "step": 1995 }, { "epoch": 0.01996, "grad_norm": 0.4241412281990051, "grad_norm_var": 0.0018469557738500933, "learning_rate": 5e-05, "loss": 0.2372, "loss/crossentropy": 2.824765920639038, "loss/hidden": 0.0, "loss/logits": 0.2372448891401291, "loss/reg": 0.7937231063842773, "step": 1996 }, { "epoch": 0.01997, "grad_norm": 0.46130889654159546, "grad_norm_var": 0.0017033186931065185, "learning_rate": 5e-05, "loss": 0.1925, "loss/crossentropy": 3.0325597524642944, "loss/hidden": 0.0, "loss/logits": 0.19254948571324348, "loss/reg": 0.7932283282279968, "step": 1997 }, { "epoch": 0.01998, "grad_norm": 0.4147680103778839, "grad_norm_var": 0.0016710034497371704, "learning_rate": 5e-05, "loss": 0.1925, "loss/crossentropy": 2.8635621070861816, "loss/hidden": 0.0, "loss/logits": 0.19247011095285416, "loss/reg": 0.7929666042327881, "step": 1998 }, { "epoch": 0.01999, "grad_norm": 0.3953905701637268, "grad_norm_var": 0.001770939335639273, "learning_rate": 5e-05, "loss": 0.1963, "loss/crossentropy": 2.6696969866752625, "loss/hidden": 0.0, "loss/logits": 0.19630739465355873, "loss/reg": 0.7929765582084656, "step": 1999 }, { "epoch": 0.02, "grad_norm": 0.46709325909614563, "grad_norm_var": 0.0017944712625772832, "learning_rate": 5e-05, "loss": 0.2193, "loss/crossentropy": 2.767913043498993, "loss/hidden": 0.0, "loss/logits": 0.21925517171621323, "loss/reg": 0.792606770992279, "step": 2000 } ], "logging_steps": 1, "max_steps": 100000, "num_input_tokens_seen": 0, "num_train_epochs": 9223372036854775807, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": true, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.030493785030656e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }