{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.029864118261908316, "eval_steps": 2000, "global_step": 2000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 1.4932059130954158e-05, "grad_norm": 0.8359375, "learning_rate": 2e-05, "loss": 1.3962, "loss/crossentropy": 2.609541177749634, "loss/dist_ce": 0.0, "loss/fcd": 1.203125, "loss/idx": 12.0, "loss/logits": 0.19302886724472046, "step": 1 }, { "epoch": 2.9864118261908317e-05, "grad_norm": 0.80859375, "learning_rate": 2e-05, "loss": 1.2844, "loss/crossentropy": 2.702785015106201, "loss/dist_ce": 0.0, "loss/fcd": 1.109375, "loss/idx": 12.0, "loss/logits": 0.1750669926404953, "step": 2 }, { "epoch": 4.4796177392862473e-05, "grad_norm": 0.74609375, "learning_rate": 2e-05, "loss": 1.2457, "loss/crossentropy": 2.620382308959961, "loss/dist_ce": 0.0, "loss/fcd": 1.078125, "loss/idx": 12.0, "loss/logits": 0.16754117608070374, "step": 3 }, { "epoch": 5.9728236523816634e-05, "grad_norm": 0.7265625, "learning_rate": 2e-05, "loss": 1.3366, "loss/crossentropy": 2.566118001937866, "loss/dist_ce": 0.0, "loss/fcd": 1.15625, "loss/idx": 12.0, "loss/logits": 0.18033114075660706, "step": 4 }, { "epoch": 7.466029565477079e-05, "grad_norm": 0.6953125, "learning_rate": 2e-05, "loss": 1.292, "loss/crossentropy": 2.561917304992676, "loss/dist_ce": 0.0, "loss/fcd": 1.1171875, "loss/idx": 12.0, "loss/logits": 0.17484982311725616, "step": 5 }, { "epoch": 8.959235478572495e-05, "grad_norm": 0.734375, "learning_rate": 2e-05, "loss": 1.315, "loss/crossentropy": 2.606935977935791, "loss/dist_ce": 0.0, "loss/fcd": 1.140625, "loss/idx": 12.0, "loss/logits": 0.17437440156936646, "step": 6 }, { "epoch": 0.00010452441391667911, "grad_norm": 0.74609375, "learning_rate": 2e-05, "loss": 1.4291, "loss/crossentropy": 2.555368661880493, "loss/dist_ce": 0.0, "loss/fcd": 1.234375, "loss/idx": 12.0, "loss/logits": 0.19471214711666107, "step": 7 }, { "epoch": 0.00011945647304763327, "grad_norm": 0.78515625, "learning_rate": 2e-05, "loss": 1.5255, "loss/crossentropy": 2.5476245880126953, "loss/dist_ce": 0.0, "loss/fcd": 1.28125, "loss/idx": 12.0, "loss/logits": 0.24422992765903473, "step": 8 }, { "epoch": 0.00013438853217858743, "grad_norm": 0.69140625, "learning_rate": 2e-05, "loss": 1.3099, "loss/crossentropy": 2.2711799144744873, "loss/dist_ce": 0.0, "loss/fcd": 1.140625, "loss/idx": 12.0, "loss/logits": 0.16932150721549988, "step": 9 }, { "epoch": 0.00014932059130954157, "grad_norm": 0.84375, "learning_rate": 2e-05, "loss": 1.3835, "loss/crossentropy": 2.5518879890441895, "loss/dist_ce": 0.0, "loss/fcd": 1.1875, "loss/idx": 12.0, "loss/logits": 0.19604898989200592, "step": 10 }, { "epoch": 0.00016425265044049575, "grad_norm": 0.828125, "learning_rate": 2e-05, "loss": 1.4655, "loss/crossentropy": 2.5716614723205566, "loss/dist_ce": 0.0, "loss/fcd": 1.2421875, "loss/idx": 12.0, "loss/logits": 0.2233092039823532, "step": 11 }, { "epoch": 0.0001791847095714499, "grad_norm": 0.71484375, "learning_rate": 2e-05, "loss": 1.3127, "loss/crossentropy": 2.6517248153686523, "loss/dist_ce": 0.0, "loss/fcd": 1.140625, "loss/idx": 12.0, "loss/logits": 0.17209960520267487, "step": 12 }, { "epoch": 0.00019411676870240407, "grad_norm": 0.7109375, "learning_rate": 2e-05, "loss": 1.403, "loss/crossentropy": 2.4482481479644775, "loss/dist_ce": 0.0, "loss/fcd": 1.2265625, "loss/idx": 12.0, "loss/logits": 0.17639976739883423, "step": 13 }, { "epoch": 0.00020904882783335821, "grad_norm": 0.73046875, "learning_rate": 2e-05, "loss": 1.2437, "loss/crossentropy": 2.5636048316955566, "loss/dist_ce": 0.0, "loss/fcd": 1.0859375, "loss/idx": 12.0, "loss/logits": 0.1577576994895935, "step": 14 }, { "epoch": 0.0002239808869643124, "grad_norm": 0.59375, "learning_rate": 2e-05, "loss": 1.2923, "loss/crossentropy": 2.559340238571167, "loss/dist_ce": 0.0, "loss/fcd": 1.1171875, "loss/idx": 12.0, "loss/logits": 0.17509810626506805, "step": 15 }, { "epoch": 0.00023891294609526653, "grad_norm": 0.6171875, "grad_norm_var": 0.005140113830566406, "learning_rate": 2e-05, "loss": 1.2483, "loss/crossentropy": 2.5928773880004883, "loss/dist_ce": 0.0, "loss/fcd": 1.078125, "loss/idx": 12.0, "loss/logits": 0.17019006609916687, "step": 16 }, { "epoch": 0.0002538450052262207, "grad_norm": 0.58984375, "grad_norm_var": 0.005712890625, "learning_rate": 2e-05, "loss": 1.3097, "loss/crossentropy": 2.7123403549194336, "loss/dist_ce": 0.0, "loss/fcd": 1.1328125, "loss/idx": 12.0, "loss/logits": 0.176863431930542, "step": 17 }, { "epoch": 0.00026877706435717485, "grad_norm": 0.58984375, "grad_norm_var": 0.006197102864583333, "learning_rate": 2e-05, "loss": 1.3289, "loss/crossentropy": 2.7460012435913086, "loss/dist_ce": 0.0, "loss/fcd": 1.140625, "loss/idx": 12.0, "loss/logits": 0.18828681111335754, "step": 18 }, { "epoch": 0.000283709123488129, "grad_norm": 1.28125, "grad_norm_var": 0.026744524637858074, "learning_rate": 2e-05, "loss": 1.6745, "loss/crossentropy": 2.6814517974853516, "loss/dist_ce": 0.0, "loss/fcd": 1.4140625, "loss/idx": 12.0, "loss/logits": 0.2604835033416748, "step": 19 }, { "epoch": 0.00029864118261908315, "grad_norm": 0.58984375, "grad_norm_var": 0.028202056884765625, "learning_rate": 2e-05, "loss": 1.2942, "loss/crossentropy": 2.8028454780578613, "loss/dist_ce": 0.0, "loss/fcd": 1.1171875, "loss/idx": 12.0, "loss/logits": 0.1770225465297699, "step": 20 }, { "epoch": 0.00031357324175003735, "grad_norm": 0.5859375, "grad_norm_var": 0.02951227823893229, "learning_rate": 2e-05, "loss": 1.3311, "loss/crossentropy": 2.590350866317749, "loss/dist_ce": 0.0, "loss/fcd": 1.1328125, "loss/idx": 12.0, "loss/logits": 0.19830407202243805, "step": 21 }, { "epoch": 0.0003285053008809915, "grad_norm": 0.6953125, "grad_norm_var": 0.029569498697916665, "learning_rate": 2e-05, "loss": 1.4609, "loss/crossentropy": 2.5552988052368164, "loss/dist_ce": 0.0, "loss/fcd": 1.2421875, "loss/idx": 12.0, "loss/logits": 0.21873216331005096, "step": 22 }, { "epoch": 0.00034343736001194564, "grad_norm": 0.7265625, "grad_norm_var": 0.029537391662597657, "learning_rate": 2e-05, "loss": 1.4856, "loss/crossentropy": 2.727858066558838, "loss/dist_ce": 0.0, "loss/fcd": 1.265625, "loss/idx": 12.0, "loss/logits": 0.21994858980178833, "step": 23 }, { "epoch": 0.0003583694191428998, "grad_norm": 0.6484375, "grad_norm_var": 0.029579671223958333, "learning_rate": 2e-05, "loss": 1.371, "loss/crossentropy": 2.3856289386749268, "loss/dist_ce": 0.0, "loss/fcd": 1.1875, "loss/idx": 12.0, "loss/logits": 0.18346479535102844, "step": 24 }, { "epoch": 0.000373301478273854, "grad_norm": 0.59375, "grad_norm_var": 0.03048089345296224, "learning_rate": 2e-05, "loss": 1.282, "loss/crossentropy": 2.530938148498535, "loss/dist_ce": 0.0, "loss/fcd": 1.125, "loss/idx": 12.0, "loss/logits": 0.15697604417800903, "step": 25 }, { "epoch": 0.00038823353740480814, "grad_norm": 0.640625, "grad_norm_var": 0.029403114318847658, "learning_rate": 2e-05, "loss": 1.4404, "loss/crossentropy": 2.4393346309661865, "loss/dist_ce": 0.0, "loss/fcd": 1.2421875, "loss/idx": 12.0, "loss/logits": 0.19825200736522675, "step": 26 }, { "epoch": 0.0004031655965357623, "grad_norm": 0.60546875, "grad_norm_var": 0.028580474853515624, "learning_rate": 2e-05, "loss": 1.3121, "loss/crossentropy": 2.767091989517212, "loss/dist_ce": 0.0, "loss/fcd": 1.1328125, "loss/idx": 12.0, "loss/logits": 0.1792472004890442, "step": 27 }, { "epoch": 0.00041809765566671643, "grad_norm": 0.671875, "grad_norm_var": 0.028508440653483073, "learning_rate": 2e-05, "loss": 1.428, "loss/crossentropy": 2.497144937515259, "loss/dist_ce": 0.0, "loss/fcd": 1.2109375, "loss/idx": 12.0, "loss/logits": 0.21709555387496948, "step": 28 }, { "epoch": 0.0004330297147976706, "grad_norm": 0.6015625, "grad_norm_var": 0.028796831766764324, "learning_rate": 2e-05, "loss": 1.2806, "loss/crossentropy": 2.503953218460083, "loss/dist_ce": 0.0, "loss/fcd": 1.09375, "loss/idx": 12.0, "loss/logits": 0.18687917292118073, "step": 29 }, { "epoch": 0.0004479617739286248, "grad_norm": 0.6875, "grad_norm_var": 0.028580729166666666, "learning_rate": 2e-05, "loss": 1.3877, "loss/crossentropy": 2.498080015182495, "loss/dist_ce": 0.0, "loss/fcd": 1.1875, "loss/idx": 12.0, "loss/logits": 0.20022732019424438, "step": 30 }, { "epoch": 0.0004628938330595789, "grad_norm": 0.66015625, "grad_norm_var": 0.02818190256754557, "learning_rate": 2e-05, "loss": 1.3267, "loss/crossentropy": 2.71108341217041, "loss/dist_ce": 0.0, "loss/fcd": 1.1484375, "loss/idx": 12.0, "loss/logits": 0.1782739758491516, "step": 31 }, { "epoch": 0.00047782589219053307, "grad_norm": 1.390625, "grad_norm_var": 0.05970350901285807, "learning_rate": 2e-05, "loss": 1.7218, "loss/crossentropy": 2.422400951385498, "loss/dist_ce": 0.0, "loss/fcd": 1.46875, "loss/idx": 12.0, "loss/logits": 0.2530236542224884, "step": 32 }, { "epoch": 0.0004927579513214872, "grad_norm": 0.67578125, "grad_norm_var": 0.058646074930826825, "learning_rate": 2e-05, "loss": 1.4255, "loss/crossentropy": 2.750814437866211, "loss/dist_ce": 0.0, "loss/fcd": 1.21875, "loss/idx": 12.0, "loss/logits": 0.20671629905700684, "step": 33 }, { "epoch": 0.0005076900104524414, "grad_norm": 1.1015625, "grad_norm_var": 0.0656005859375, "learning_rate": 2e-05, "loss": 1.3081, "loss/crossentropy": 2.8478612899780273, "loss/dist_ce": 0.0, "loss/fcd": 1.15625, "loss/idx": 12.0, "loss/logits": 0.15186628699302673, "step": 34 }, { "epoch": 0.0005226220695833955, "grad_norm": 0.69140625, "grad_norm_var": 0.04633274078369141, "learning_rate": 2e-05, "loss": 1.3061, "loss/crossentropy": 2.4926323890686035, "loss/dist_ce": 0.0, "loss/fcd": 1.1328125, "loss/idx": 12.0, "loss/logits": 0.17327454686164856, "step": 35 }, { "epoch": 0.0005375541287143497, "grad_norm": 0.8125, "grad_norm_var": 0.045481109619140626, "learning_rate": 2e-05, "loss": 1.3796, "loss/crossentropy": 2.6361165046691895, "loss/dist_ce": 0.0, "loss/fcd": 1.1875, "loss/idx": 12.0, "loss/logits": 0.1921408474445343, "step": 36 }, { "epoch": 0.0005524861878453039, "grad_norm": 0.67578125, "grad_norm_var": 0.044178199768066403, "learning_rate": 2e-05, "loss": 1.3217, "loss/crossentropy": 2.5386064052581787, "loss/dist_ce": 0.0, "loss/fcd": 1.140625, "loss/idx": 12.0, "loss/logits": 0.18107610940933228, "step": 37 }, { "epoch": 0.000567418246976258, "grad_norm": 0.5859375, "grad_norm_var": 0.045613034566243486, "learning_rate": 2e-05, "loss": 1.3732, "loss/crossentropy": 2.542595863342285, "loss/dist_ce": 0.0, "loss/fcd": 1.1796875, "loss/idx": 12.0, "loss/logits": 0.19354979693889618, "step": 38 }, { "epoch": 0.0005823503061072122, "grad_norm": 0.57421875, "grad_norm_var": 0.047247060139973956, "learning_rate": 2e-05, "loss": 1.2929, "loss/crossentropy": 2.339637279510498, "loss/dist_ce": 0.0, "loss/fcd": 1.109375, "loss/idx": 12.0, "loss/logits": 0.18351908028125763, "step": 39 }, { "epoch": 0.0005972823652381663, "grad_norm": 0.5546875, "grad_norm_var": 0.048766835530598955, "learning_rate": 2e-05, "loss": 1.2777, "loss/crossentropy": 2.626256227493286, "loss/dist_ce": 0.0, "loss/fcd": 1.1171875, "loss/idx": 12.0, "loss/logits": 0.16047844290733337, "step": 40 }, { "epoch": 0.0006122144243691205, "grad_norm": 0.6015625, "grad_norm_var": 0.048638916015625, "learning_rate": 2e-05, "loss": 1.2725, "loss/crossentropy": 2.5306503772735596, "loss/dist_ce": 0.0, "loss/fcd": 1.1015625, "loss/idx": 12.0, "loss/logits": 0.17098692059516907, "step": 41 }, { "epoch": 0.0006271464835000747, "grad_norm": 0.6328125, "grad_norm_var": 0.04872614542643229, "learning_rate": 2e-05, "loss": 1.4129, "loss/crossentropy": 2.4811081886291504, "loss/dist_ce": 0.0, "loss/fcd": 1.2109375, "loss/idx": 12.0, "loss/logits": 0.20194479823112488, "step": 42 }, { "epoch": 0.0006420785426310288, "grad_norm": 0.58203125, "grad_norm_var": 0.049119059244791666, "learning_rate": 2e-05, "loss": 1.274, "loss/crossentropy": 2.4502978324890137, "loss/dist_ce": 0.0, "loss/fcd": 1.109375, "loss/idx": 12.0, "loss/logits": 0.16458770632743835, "step": 43 }, { "epoch": 0.000657010601761983, "grad_norm": 0.69140625, "grad_norm_var": 0.04902083079020182, "learning_rate": 2e-05, "loss": 1.5054, "loss/crossentropy": 2.416804075241089, "loss/dist_ce": 0.0, "loss/fcd": 1.28125, "loss/idx": 12.0, "loss/logits": 0.22411639988422394, "step": 44 }, { "epoch": 0.0006719426608929372, "grad_norm": 0.79296875, "grad_norm_var": 0.048288726806640626, "learning_rate": 2e-05, "loss": 1.4078, "loss/crossentropy": 2.742251396179199, "loss/dist_ce": 0.0, "loss/fcd": 1.21875, "loss/idx": 12.0, "loss/logits": 0.18909800052642822, "step": 45 }, { "epoch": 0.0006868747200238913, "grad_norm": 0.70703125, "grad_norm_var": 0.04819685618082682, "learning_rate": 2e-05, "loss": 1.4589, "loss/crossentropy": 2.6130266189575195, "loss/dist_ce": 0.0, "loss/fcd": 1.2265625, "loss/idx": 12.0, "loss/logits": 0.232346311211586, "step": 46 }, { "epoch": 0.0007018067791548455, "grad_norm": 0.69140625, "grad_norm_var": 0.04795373280843099, "learning_rate": 2e-05, "loss": 1.4667, "loss/crossentropy": 2.4245617389678955, "loss/dist_ce": 0.0, "loss/fcd": 1.25, "loss/idx": 12.0, "loss/logits": 0.21665045619010925, "step": 47 }, { "epoch": 0.0007167388382857996, "grad_norm": 0.56640625, "grad_norm_var": 0.018373616536458335, "learning_rate": 2e-05, "loss": 1.3413, "loss/crossentropy": 2.6861648559570312, "loss/dist_ce": 0.0, "loss/fcd": 1.1484375, "loss/idx": 12.0, "loss/logits": 0.19289466738700867, "step": 48 }, { "epoch": 0.0007316708974167538, "grad_norm": 0.54296875, "grad_norm_var": 0.019614410400390626, "learning_rate": 2e-05, "loss": 1.2877, "loss/crossentropy": 2.713362455368042, "loss/dist_ce": 0.0, "loss/fcd": 1.1171875, "loss/idx": 12.0, "loss/logits": 0.17047566175460815, "step": 49 }, { "epoch": 0.000746602956547708, "grad_norm": 0.5234375, "grad_norm_var": 0.007645416259765625, "learning_rate": 2e-05, "loss": 1.2322, "loss/crossentropy": 2.661055564880371, "loss/dist_ce": 0.0, "loss/fcd": 1.0703125, "loss/idx": 12.0, "loss/logits": 0.1618708372116089, "step": 50 }, { "epoch": 0.0007615350156786621, "grad_norm": 0.62890625, "grad_norm_var": 0.0074541727701822914, "learning_rate": 2e-05, "loss": 1.3611, "loss/crossentropy": 2.7492740154266357, "loss/dist_ce": 0.0, "loss/fcd": 1.171875, "loss/idx": 12.0, "loss/logits": 0.1892453134059906, "step": 51 }, { "epoch": 0.0007764670748096163, "grad_norm": 0.640625, "grad_norm_var": 0.005238596598307292, "learning_rate": 2e-05, "loss": 1.4406, "loss/crossentropy": 2.341090679168701, "loss/dist_ce": 0.0, "loss/fcd": 1.25, "loss/idx": 12.0, "loss/logits": 0.19059142470359802, "step": 52 }, { "epoch": 0.0007913991339405704, "grad_norm": 0.5703125, "grad_norm_var": 0.005212847391764323, "learning_rate": 2e-05, "loss": 1.2364, "loss/crossentropy": 2.77945613861084, "loss/dist_ce": 0.0, "loss/fcd": 1.078125, "loss/idx": 12.0, "loss/logits": 0.15829287469387054, "step": 53 }, { "epoch": 0.0008063311930715246, "grad_norm": 0.6328125, "grad_norm_var": 0.0051502863566080725, "learning_rate": 2e-05, "loss": 1.2944, "loss/crossentropy": 2.8645284175872803, "loss/dist_ce": 0.0, "loss/fcd": 1.1171875, "loss/idx": 12.0, "loss/logits": 0.1772599220275879, "step": 54 }, { "epoch": 0.0008212632522024788, "grad_norm": 0.546875, "grad_norm_var": 0.0053670247395833336, "learning_rate": 2e-05, "loss": 1.3286, "loss/crossentropy": 2.2789793014526367, "loss/dist_ce": 0.0, "loss/fcd": 1.1328125, "loss/idx": 12.0, "loss/logits": 0.1957651972770691, "step": 55 }, { "epoch": 0.0008361953113334329, "grad_norm": 0.61328125, "grad_norm_var": 0.00507806142171224, "learning_rate": 2e-05, "loss": 1.4199, "loss/crossentropy": 2.734726667404175, "loss/dist_ce": 0.0, "loss/fcd": 1.203125, "loss/idx": 12.0, "loss/logits": 0.21681487560272217, "step": 56 }, { "epoch": 0.0008511273704643871, "grad_norm": 0.5859375, "grad_norm_var": 0.0051375706990559895, "learning_rate": 2e-05, "loss": 1.2676, "loss/crossentropy": 2.6685731410980225, "loss/dist_ce": 0.0, "loss/fcd": 1.1015625, "loss/idx": 12.0, "loss/logits": 0.16598857939243317, "step": 57 }, { "epoch": 0.0008660594295953411, "grad_norm": 0.62109375, "grad_norm_var": 0.005128987630208333, "learning_rate": 2e-05, "loss": 1.2632, "loss/crossentropy": 2.5432820320129395, "loss/dist_ce": 0.0, "loss/fcd": 1.1015625, "loss/idx": 12.0, "loss/logits": 0.16164365410804749, "step": 58 }, { "epoch": 0.0008809914887262953, "grad_norm": 0.671875, "grad_norm_var": 0.0051655451456705725, "learning_rate": 2e-05, "loss": 1.3446, "loss/crossentropy": 2.400588035583496, "loss/dist_ce": 0.0, "loss/fcd": 1.171875, "loss/idx": 12.0, "loss/logits": 0.17270785570144653, "step": 59 }, { "epoch": 0.0008959235478572496, "grad_norm": 0.57421875, "grad_norm_var": 0.0050129572550455725, "learning_rate": 2e-05, "loss": 1.2711, "loss/crossentropy": 2.9404215812683105, "loss/dist_ce": 0.0, "loss/fcd": 1.109375, "loss/idx": 12.0, "loss/logits": 0.16168653964996338, "step": 60 }, { "epoch": 0.0009108556069882036, "grad_norm": 0.640625, "grad_norm_var": 0.002937571207682292, "learning_rate": 2e-05, "loss": 1.343, "loss/crossentropy": 2.4893975257873535, "loss/dist_ce": 0.0, "loss/fcd": 1.1640625, "loss/idx": 12.0, "loss/logits": 0.17896610498428345, "step": 61 }, { "epoch": 0.0009257876661191578, "grad_norm": 0.5625, "grad_norm_var": 0.0023706436157226564, "learning_rate": 2e-05, "loss": 1.286, "loss/crossentropy": 2.544616460800171, "loss/dist_ce": 0.0, "loss/fcd": 1.109375, "loss/idx": 12.0, "loss/logits": 0.17664407193660736, "step": 62 }, { "epoch": 0.000940719725250112, "grad_norm": 0.62890625, "grad_norm_var": 0.0018599828084309895, "learning_rate": 2e-05, "loss": 1.3906, "loss/crossentropy": 2.5881662368774414, "loss/dist_ce": 0.0, "loss/fcd": 1.1953125, "loss/idx": 12.0, "loss/logits": 0.19525527954101562, "step": 63 }, { "epoch": 0.0009556517843810661, "grad_norm": 0.578125, "grad_norm_var": 0.0018208821614583333, "learning_rate": 2e-05, "loss": 1.1933, "loss/crossentropy": 2.7437222003936768, "loss/dist_ce": 0.0, "loss/fcd": 1.046875, "loss/idx": 12.0, "loss/logits": 0.14645323157310486, "step": 64 }, { "epoch": 0.0009705838435120203, "grad_norm": 0.6328125, "grad_norm_var": 0.0016702651977539063, "learning_rate": 2e-05, "loss": 1.4415, "loss/crossentropy": 2.6524834632873535, "loss/dist_ce": 0.0, "loss/fcd": 1.2265625, "loss/idx": 12.0, "loss/logits": 0.21492895483970642, "step": 65 }, { "epoch": 0.0009855159026429744, "grad_norm": 0.59765625, "grad_norm_var": 0.001224517822265625, "learning_rate": 2e-05, "loss": 1.3729, "loss/crossentropy": 2.853750705718994, "loss/dist_ce": 0.0, "loss/fcd": 1.171875, "loss/idx": 12.0, "loss/logits": 0.2009965479373932, "step": 66 }, { "epoch": 0.0010004479617739285, "grad_norm": 0.55078125, "grad_norm_var": 0.0013872782389322917, "learning_rate": 2e-05, "loss": 1.205, "loss/crossentropy": 2.3778915405273438, "loss/dist_ce": 0.0, "loss/fcd": 1.0625, "loss/idx": 12.0, "loss/logits": 0.1425057351589203, "step": 67 }, { "epoch": 0.0010153800209048828, "grad_norm": 0.640625, "grad_norm_var": 0.0013872782389322917, "learning_rate": 2e-05, "loss": 1.4857, "loss/crossentropy": 2.0992250442504883, "loss/dist_ce": 0.0, "loss/fcd": 1.265625, "loss/idx": 12.0, "loss/logits": 0.22008036077022552, "step": 68 }, { "epoch": 0.001030312080035837, "grad_norm": 0.57421875, "grad_norm_var": 0.0013711929321289062, "learning_rate": 2e-05, "loss": 1.3042, "loss/crossentropy": 2.557607650756836, "loss/dist_ce": 0.0, "loss/fcd": 1.125, "loss/idx": 12.0, "loss/logits": 0.17921671271324158, "step": 69 }, { "epoch": 0.001045244139166791, "grad_norm": 0.53515625, "grad_norm_var": 0.0015825907389322917, "learning_rate": 2e-05, "loss": 1.1715, "loss/crossentropy": 2.5834288597106934, "loss/dist_ce": 0.0, "loss/fcd": 1.03125, "loss/idx": 12.0, "loss/logits": 0.14024239778518677, "step": 70 }, { "epoch": 0.0010601761982977453, "grad_norm": 0.67578125, "grad_norm_var": 0.0017567316691080729, "learning_rate": 2e-05, "loss": 1.4302, "loss/crossentropy": 2.542942523956299, "loss/dist_ce": 0.0, "loss/fcd": 1.2265625, "loss/idx": 12.0, "loss/logits": 0.2036018967628479, "step": 71 }, { "epoch": 0.0010751082574286994, "grad_norm": 0.5625, "grad_norm_var": 0.0018633524576822916, "learning_rate": 2e-05, "loss": 1.252, "loss/crossentropy": 2.635038137435913, "loss/dist_ce": 0.0, "loss/fcd": 1.0859375, "loss/idx": 12.0, "loss/logits": 0.16602204740047455, "step": 72 }, { "epoch": 0.0010900403165596535, "grad_norm": 0.5859375, "grad_norm_var": 0.0018633524576822916, "learning_rate": 2e-05, "loss": 1.3169, "loss/crossentropy": 2.5402872562408447, "loss/dist_ce": 0.0, "loss/fcd": 1.1328125, "loss/idx": 12.0, "loss/logits": 0.18406565487384796, "step": 73 }, { "epoch": 0.0011049723756906078, "grad_norm": 0.5546875, "grad_norm_var": 0.001970354715983073, "learning_rate": 2e-05, "loss": 1.2864, "loss/crossentropy": 2.5613765716552734, "loss/dist_ce": 0.0, "loss/fcd": 1.109375, "loss/idx": 12.0, "loss/logits": 0.177069291472435, "step": 74 }, { "epoch": 0.001119904434821562, "grad_norm": 0.56640625, "grad_norm_var": 0.0016253153483072917, "learning_rate": 2e-05, "loss": 1.2234, "loss/crossentropy": 2.5081839561462402, "loss/dist_ce": 0.0, "loss/fcd": 1.0703125, "loss/idx": 12.0, "loss/logits": 0.15304957330226898, "step": 75 }, { "epoch": 0.001134836493952516, "grad_norm": 0.6796875, "grad_norm_var": 0.002080217997233073, "learning_rate": 2e-05, "loss": 1.3845, "loss/crossentropy": 2.4124844074249268, "loss/dist_ce": 0.0, "loss/fcd": 1.203125, "loss/idx": 12.0, "loss/logits": 0.18140918016433716, "step": 76 }, { "epoch": 0.0011497685530834703, "grad_norm": 0.87109375, "grad_norm_var": 0.006712849934895833, "learning_rate": 2e-05, "loss": 1.4476, "loss/crossentropy": 2.6826772689819336, "loss/dist_ce": 0.0, "loss/fcd": 1.21875, "loss/idx": 12.0, "loss/logits": 0.22889642417430878, "step": 77 }, { "epoch": 0.0011647006122144244, "grad_norm": 0.67578125, "grad_norm_var": 0.0067626317342122395, "learning_rate": 2e-05, "loss": 1.5034, "loss/crossentropy": 2.1851906776428223, "loss/dist_ce": 0.0, "loss/fcd": 1.3125, "loss/idx": 12.0, "loss/logits": 0.19086772203445435, "step": 78 }, { "epoch": 0.0011796326713453785, "grad_norm": 0.60546875, "grad_norm_var": 0.00676720937093099, "learning_rate": 2e-05, "loss": 1.1937, "loss/crossentropy": 2.655413866043091, "loss/dist_ce": 0.0, "loss/fcd": 1.046875, "loss/idx": 12.0, "loss/logits": 0.1467989981174469, "step": 79 }, { "epoch": 0.0011945647304763326, "grad_norm": 0.64453125, "grad_norm_var": 0.006690470377604166, "learning_rate": 2e-05, "loss": 1.3068, "loss/crossentropy": 2.6873862743377686, "loss/dist_ce": 0.0, "loss/fcd": 1.125, "loss/idx": 12.0, "loss/logits": 0.18183887004852295, "step": 80 }, { "epoch": 0.001209496789607287, "grad_norm": 0.65625, "grad_norm_var": 0.006758371988932292, "learning_rate": 2e-05, "loss": 1.334, "loss/crossentropy": 2.697080373764038, "loss/dist_ce": 0.0, "loss/fcd": 1.140625, "loss/idx": 12.0, "loss/logits": 0.19333507120609283, "step": 81 }, { "epoch": 0.001224428848738241, "grad_norm": 0.5234375, "grad_norm_var": 0.007358741760253906, "learning_rate": 2e-05, "loss": 1.19, "loss/crossentropy": 2.5790112018585205, "loss/dist_ce": 0.0, "loss/fcd": 1.046875, "loss/idx": 12.0, "loss/logits": 0.14316534996032715, "step": 82 }, { "epoch": 0.001239360907869195, "grad_norm": 0.54296875, "grad_norm_var": 0.007433509826660157, "learning_rate": 2e-05, "loss": 1.2365, "loss/crossentropy": 2.552305221557617, "loss/dist_ce": 0.0, "loss/fcd": 1.078125, "loss/idx": 12.0, "loss/logits": 0.15837864577770233, "step": 83 }, { "epoch": 0.0012542929670001494, "grad_norm": 0.7421875, "grad_norm_var": 0.008379046122233074, "learning_rate": 2e-05, "loss": 1.5673, "loss/crossentropy": 2.189481019973755, "loss/dist_ce": 0.0, "loss/fcd": 1.3515625, "loss/idx": 12.0, "loss/logits": 0.21574443578720093, "step": 84 }, { "epoch": 0.0012692250261311035, "grad_norm": 0.72265625, "grad_norm_var": 0.008755938212076823, "learning_rate": 2e-05, "loss": 1.4194, "loss/crossentropy": 2.523226261138916, "loss/dist_ce": 0.0, "loss/fcd": 1.21875, "loss/idx": 12.0, "loss/logits": 0.20060396194458008, "step": 85 }, { "epoch": 0.0012841570852620576, "grad_norm": 0.56640625, "grad_norm_var": 0.008404986063639323, "learning_rate": 2e-05, "loss": 1.3345, "loss/crossentropy": 2.6093485355377197, "loss/dist_ce": 0.0, "loss/fcd": 1.1484375, "loss/idx": 12.0, "loss/logits": 0.1860586702823639, "step": 86 }, { "epoch": 0.0012990891443930119, "grad_norm": 0.5390625, "grad_norm_var": 0.008847808837890625, "learning_rate": 2e-05, "loss": 1.2861, "loss/crossentropy": 2.5548102855682373, "loss/dist_ce": 0.0, "loss/fcd": 1.109375, "loss/idx": 12.0, "loss/logits": 0.17671170830726624, "step": 87 }, { "epoch": 0.001314021203523966, "grad_norm": 1.2265625, "grad_norm_var": 0.030658976236979166, "learning_rate": 2e-05, "loss": 1.7591, "loss/crossentropy": 2.089616298675537, "loss/dist_ce": 0.0, "loss/fcd": 1.5, "loss/idx": 12.0, "loss/logits": 0.259127676486969, "step": 88 }, { "epoch": 0.00132895326265492, "grad_norm": 0.6484375, "grad_norm_var": 0.030211385091145834, "learning_rate": 2e-05, "loss": 1.3975, "loss/crossentropy": 2.563173770904541, "loss/dist_ce": 0.0, "loss/fcd": 1.203125, "loss/idx": 12.0, "loss/logits": 0.19433674216270447, "step": 89 }, { "epoch": 0.0013438853217858744, "grad_norm": 0.60546875, "grad_norm_var": 0.029572486877441406, "learning_rate": 2e-05, "loss": 1.4763, "loss/crossentropy": 2.6154820919036865, "loss/dist_ce": 0.0, "loss/fcd": 1.2421875, "loss/idx": 12.0, "loss/logits": 0.23407219350337982, "step": 90 }, { "epoch": 0.0013588173809168285, "grad_norm": 0.62890625, "grad_norm_var": 0.02890313466389974, "learning_rate": 2e-05, "loss": 1.4736, "loss/crossentropy": 2.460665225982666, "loss/dist_ce": 0.0, "loss/fcd": 1.2265625, "loss/idx": 12.0, "loss/logits": 0.2470313310623169, "step": 91 }, { "epoch": 0.0013737494400477826, "grad_norm": 0.65625, "grad_norm_var": 0.02893822987874349, "learning_rate": 2e-05, "loss": 1.2087, "loss/crossentropy": 2.815687417984009, "loss/dist_ce": 0.0, "loss/fcd": 1.0546875, "loss/idx": 12.0, "loss/logits": 0.1540374755859375, "step": 92 }, { "epoch": 0.0013886814991787367, "grad_norm": 0.53125, "grad_norm_var": 0.027428181966145833, "learning_rate": 2e-05, "loss": 1.2891, "loss/crossentropy": 2.7470219135284424, "loss/dist_ce": 0.0, "loss/fcd": 1.109375, "loss/idx": 12.0, "loss/logits": 0.17968374490737915, "step": 93 }, { "epoch": 0.001403613558309691, "grad_norm": 0.5390625, "grad_norm_var": 0.02825819651285807, "learning_rate": 2e-05, "loss": 1.178, "loss/crossentropy": 2.582261562347412, "loss/dist_ce": 0.0, "loss/fcd": 1.0234375, "loss/idx": 12.0, "loss/logits": 0.15451796352863312, "step": 94 }, { "epoch": 0.001418545617440645, "grad_norm": 0.9609375, "grad_norm_var": 0.03410746256510417, "learning_rate": 2e-05, "loss": 1.6059, "loss/crossentropy": 2.5164883136749268, "loss/dist_ce": 0.0, "loss/fcd": 1.359375, "loss/idx": 12.0, "loss/logits": 0.24653397500514984, "step": 95 }, { "epoch": 0.0014334776765715992, "grad_norm": 0.515625, "grad_norm_var": 0.03559919993082682, "learning_rate": 2e-05, "loss": 1.1938, "loss/crossentropy": 2.423619031906128, "loss/dist_ce": 0.0, "loss/fcd": 1.0390625, "loss/idx": 12.0, "loss/logits": 0.154771625995636, "step": 96 }, { "epoch": 0.0014484097357025535, "grad_norm": 0.55859375, "grad_norm_var": 0.03628107706705729, "learning_rate": 2e-05, "loss": 1.2842, "loss/crossentropy": 2.7245681285858154, "loss/dist_ce": 0.0, "loss/fcd": 1.109375, "loss/idx": 12.0, "loss/logits": 0.17486220598220825, "step": 97 }, { "epoch": 0.0014633417948335076, "grad_norm": 0.515625, "grad_norm_var": 0.036423746744791666, "learning_rate": 2e-05, "loss": 1.2278, "loss/crossentropy": 2.596822500228882, "loss/dist_ce": 0.0, "loss/fcd": 1.0703125, "loss/idx": 12.0, "loss/logits": 0.15747135877609253, "step": 98 }, { "epoch": 0.0014782738539644616, "grad_norm": 0.58984375, "grad_norm_var": 0.035853068033854164, "learning_rate": 2e-05, "loss": 1.3496, "loss/crossentropy": 2.319052219390869, "loss/dist_ce": 0.0, "loss/fcd": 1.1484375, "loss/idx": 12.0, "loss/logits": 0.20119673013687134, "step": 99 }, { "epoch": 0.001493205913095416, "grad_norm": 0.51171875, "grad_norm_var": 0.036622047424316406, "learning_rate": 2e-05, "loss": 1.2263, "loss/crossentropy": 2.8021628856658936, "loss/dist_ce": 0.0, "loss/fcd": 1.0703125, "loss/idx": 12.0, "loss/logits": 0.15594205260276794, "step": 100 }, { "epoch": 0.00150813797222637, "grad_norm": 0.56640625, "grad_norm_var": 0.03652540842692057, "learning_rate": 2e-05, "loss": 1.3127, "loss/crossentropy": 2.776284694671631, "loss/dist_ce": 0.0, "loss/fcd": 1.125, "loss/idx": 12.0, "loss/logits": 0.1876915991306305, "step": 101 }, { "epoch": 0.0015230700313573241, "grad_norm": 0.53125, "grad_norm_var": 0.03692423502604167, "learning_rate": 2e-05, "loss": 1.2802, "loss/crossentropy": 2.548290967941284, "loss/dist_ce": 0.0, "loss/fcd": 1.109375, "loss/idx": 12.0, "loss/logits": 0.17086531221866608, "step": 102 }, { "epoch": 0.0015380020904882782, "grad_norm": 0.96484375, "grad_norm_var": 0.04293257395426432, "learning_rate": 2e-05, "loss": 1.6039, "loss/crossentropy": 2.2254953384399414, "loss/dist_ce": 0.0, "loss/fcd": 1.359375, "loss/idx": 12.0, "loss/logits": 0.24454209208488464, "step": 103 }, { "epoch": 0.0015529341496192325, "grad_norm": 0.5234375, "grad_norm_var": 0.020662371317545572, "learning_rate": 2e-05, "loss": 1.1787, "loss/crossentropy": 2.4688055515289307, "loss/dist_ce": 0.0, "loss/fcd": 1.03125, "loss/idx": 12.0, "loss/logits": 0.14745503664016724, "step": 104 }, { "epoch": 0.0015678662087501866, "grad_norm": 0.5390625, "grad_norm_var": 0.020929400126139322, "learning_rate": 2e-05, "loss": 1.3059, "loss/crossentropy": 2.3868861198425293, "loss/dist_ce": 0.0, "loss/fcd": 1.1328125, "loss/idx": 12.0, "loss/logits": 0.17307642102241516, "step": 105 }, { "epoch": 0.0015827982678811407, "grad_norm": 0.546875, "grad_norm_var": 0.02116877237955729, "learning_rate": 2e-05, "loss": 1.2415, "loss/crossentropy": 2.660839319229126, "loss/dist_ce": 0.0, "loss/fcd": 1.0703125, "loss/idx": 12.0, "loss/logits": 0.17113830149173737, "step": 106 }, { "epoch": 0.001597730327012095, "grad_norm": 0.65234375, "grad_norm_var": 0.021277872721354167, "learning_rate": 2e-05, "loss": 1.3782, "loss/crossentropy": 2.385211944580078, "loss/dist_ce": 0.0, "loss/fcd": 1.1875, "loss/idx": 12.0, "loss/logits": 0.19073891639709473, "step": 107 }, { "epoch": 0.0016126623861430491, "grad_norm": 0.58203125, "grad_norm_var": 0.021129290262858074, "learning_rate": 2e-05, "loss": 1.2911, "loss/crossentropy": 2.6263484954833984, "loss/dist_ce": 0.0, "loss/fcd": 1.109375, "loss/idx": 12.0, "loss/logits": 0.18175816535949707, "step": 108 }, { "epoch": 0.0016275944452740032, "grad_norm": 0.62890625, "grad_norm_var": 0.020806630452473957, "learning_rate": 2e-05, "loss": 1.3712, "loss/crossentropy": 2.2086172103881836, "loss/dist_ce": 0.0, "loss/fcd": 1.1953125, "loss/idx": 12.0, "loss/logits": 0.17591926455497742, "step": 109 }, { "epoch": 0.0016425265044049575, "grad_norm": 0.5390625, "grad_norm_var": 0.020806630452473957, "learning_rate": 2e-05, "loss": 1.2894, "loss/crossentropy": 2.563765048980713, "loss/dist_ce": 0.0, "loss/fcd": 1.1171875, "loss/idx": 12.0, "loss/logits": 0.1721740961074829, "step": 110 }, { "epoch": 0.0016574585635359116, "grad_norm": 0.5859375, "grad_norm_var": 0.011944325764973958, "learning_rate": 2e-05, "loss": 1.2239, "loss/crossentropy": 2.4681785106658936, "loss/dist_ce": 0.0, "loss/fcd": 1.0625, "loss/idx": 12.0, "loss/logits": 0.16138719022274017, "step": 111 }, { "epoch": 0.0016723906226668657, "grad_norm": 0.61328125, "grad_norm_var": 0.011643918355305989, "learning_rate": 2e-05, "loss": 1.3192, "loss/crossentropy": 2.641052484512329, "loss/dist_ce": 0.0, "loss/fcd": 1.1484375, "loss/idx": 12.0, "loss/logits": 0.17078858613967896, "step": 112 }, { "epoch": 0.00168732268179782, "grad_norm": 0.6640625, "grad_norm_var": 0.011889394124348958, "learning_rate": 2e-05, "loss": 1.4469, "loss/crossentropy": 2.031921863555908, "loss/dist_ce": 0.0, "loss/fcd": 1.234375, "loss/idx": 12.0, "loss/logits": 0.21249458193778992, "step": 113 }, { "epoch": 0.0017022547409287741, "grad_norm": 0.59765625, "grad_norm_var": 0.01141808827718099, "learning_rate": 2e-05, "loss": 1.3311, "loss/crossentropy": 2.430854082107544, "loss/dist_ce": 0.0, "loss/fcd": 1.1484375, "loss/idx": 12.0, "loss/logits": 0.1826540231704712, "step": 114 }, { "epoch": 0.0017171868000597282, "grad_norm": 0.671875, "grad_norm_var": 0.011702473958333333, "learning_rate": 2e-05, "loss": 1.267, "loss/crossentropy": 2.7262022495269775, "loss/dist_ce": 0.0, "loss/fcd": 1.09375, "loss/idx": 12.0, "loss/logits": 0.17326927185058594, "step": 115 }, { "epoch": 0.0017321188591906823, "grad_norm": 0.5703125, "grad_norm_var": 0.011169370015462239, "learning_rate": 2e-05, "loss": 1.1959, "loss/crossentropy": 2.7001309394836426, "loss/dist_ce": 0.0, "loss/fcd": 1.0390625, "loss/idx": 12.0, "loss/logits": 0.1568140983581543, "step": 116 }, { "epoch": 0.0017470509183216366, "grad_norm": 0.54296875, "grad_norm_var": 0.01134332021077474, "learning_rate": 2e-05, "loss": 1.273, "loss/crossentropy": 2.252134323120117, "loss/dist_ce": 0.0, "loss/fcd": 1.1015625, "loss/idx": 12.0, "loss/logits": 0.1714470088481903, "step": 117 }, { "epoch": 0.0017619829774525907, "grad_norm": 0.58984375, "grad_norm_var": 0.010945638020833334, "learning_rate": 2e-05, "loss": 1.2842, "loss/crossentropy": 2.4008445739746094, "loss/dist_ce": 0.0, "loss/fcd": 1.109375, "loss/idx": 12.0, "loss/logits": 0.17484425008296967, "step": 118 }, { "epoch": 0.0017769150365835448, "grad_norm": 0.58984375, "grad_norm_var": 0.0021565755208333335, "learning_rate": 2e-05, "loss": 1.3173, "loss/crossentropy": 2.561591863632202, "loss/dist_ce": 0.0, "loss/fcd": 1.1328125, "loss/idx": 12.0, "loss/logits": 0.18449710309505463, "step": 119 }, { "epoch": 0.001791847095714499, "grad_norm": 0.78125, "grad_norm_var": 0.0040280659993489586, "learning_rate": 2e-05, "loss": 1.4499, "loss/crossentropy": 2.298919677734375, "loss/dist_ce": 0.0, "loss/fcd": 1.2578125, "loss/idx": 12.0, "loss/logits": 0.19211535155773163, "step": 120 }, { "epoch": 0.0018067791548454532, "grad_norm": 0.59765625, "grad_norm_var": 0.0037200291951497394, "learning_rate": 2e-05, "loss": 1.3325, "loss/crossentropy": 2.5992934703826904, "loss/dist_ce": 0.0, "loss/fcd": 1.140625, "loss/idx": 12.0, "loss/logits": 0.19187986850738525, "step": 121 }, { "epoch": 0.0018217112139764073, "grad_norm": 0.52734375, "grad_norm_var": 0.003907267252604167, "learning_rate": 2e-05, "loss": 1.2822, "loss/crossentropy": 2.6393258571624756, "loss/dist_ce": 0.0, "loss/fcd": 1.109375, "loss/idx": 12.0, "loss/logits": 0.17281374335289001, "step": 122 }, { "epoch": 0.0018366432731073616, "grad_norm": 0.66015625, "grad_norm_var": 0.003956858317057292, "learning_rate": 2e-05, "loss": 1.5573, "loss/crossentropy": 2.521825075149536, "loss/dist_ce": 0.0, "loss/fcd": 1.296875, "loss/idx": 12.0, "loss/logits": 0.26045745611190796, "step": 123 }, { "epoch": 0.0018515753322383157, "grad_norm": 0.625, "grad_norm_var": 0.00391839345296224, "learning_rate": 2e-05, "loss": 1.4125, "loss/crossentropy": 2.404489278793335, "loss/dist_ce": 0.0, "loss/fcd": 1.2109375, "loss/idx": 12.0, "loss/logits": 0.2015247493982315, "step": 124 }, { "epoch": 0.0018665073913692698, "grad_norm": 0.546875, "grad_norm_var": 0.004149373372395833, "learning_rate": 2e-05, "loss": 1.3093, "loss/crossentropy": 2.4341983795166016, "loss/dist_ce": 0.0, "loss/fcd": 1.140625, "loss/idx": 12.0, "loss/logits": 0.16871654987335205, "step": 125 }, { "epoch": 0.001881439450500224, "grad_norm": 0.71484375, "grad_norm_var": 0.00450127919514974, "learning_rate": 2e-05, "loss": 1.4935, "loss/crossentropy": 2.46096134185791, "loss/dist_ce": 0.0, "loss/fcd": 1.28125, "loss/idx": 12.0, "loss/logits": 0.2122688889503479, "step": 126 }, { "epoch": 0.0018963715096311782, "grad_norm": 0.5078125, "grad_norm_var": 0.00521081288655599, "learning_rate": 2e-05, "loss": 1.1707, "loss/crossentropy": 2.6259350776672363, "loss/dist_ce": 0.0, "loss/fcd": 1.03125, "loss/idx": 12.0, "loss/logits": 0.13940490782260895, "step": 127 }, { "epoch": 0.0019113035687621323, "grad_norm": 0.59765625, "grad_norm_var": 0.005224545796712239, "learning_rate": 2e-05, "loss": 1.3569, "loss/crossentropy": 2.5117621421813965, "loss/dist_ce": 0.0, "loss/fcd": 1.1640625, "loss/idx": 12.0, "loss/logits": 0.19283828139305115, "step": 128 }, { "epoch": 0.0019262356278930864, "grad_norm": 0.53125, "grad_norm_var": 0.005397478739420573, "learning_rate": 2e-05, "loss": 1.3103, "loss/crossentropy": 2.7031896114349365, "loss/dist_ce": 0.0, "loss/fcd": 1.1328125, "loss/idx": 12.0, "loss/logits": 0.1774989813566208, "step": 129 }, { "epoch": 0.0019411676870240407, "grad_norm": 0.63671875, "grad_norm_var": 0.005463600158691406, "learning_rate": 2e-05, "loss": 1.2254, "loss/crossentropy": 2.3635990619659424, "loss/dist_ce": 0.0, "loss/fcd": 1.078125, "loss/idx": 12.0, "loss/logits": 0.14724132418632507, "step": 130 }, { "epoch": 0.0019560997461549948, "grad_norm": 0.51171875, "grad_norm_var": 0.005653889973958334, "learning_rate": 2e-05, "loss": 1.2139, "loss/crossentropy": 2.5982468128204346, "loss/dist_ce": 0.0, "loss/fcd": 1.0625, "loss/idx": 12.0, "loss/logits": 0.15138475596904755, "step": 131 }, { "epoch": 0.001971031805285949, "grad_norm": 0.55859375, "grad_norm_var": 0.0057021458943684895, "learning_rate": 2e-05, "loss": 1.2847, "loss/crossentropy": 2.7570533752441406, "loss/dist_ce": 0.0, "loss/fcd": 1.1171875, "loss/idx": 12.0, "loss/logits": 0.1674671769142151, "step": 132 }, { "epoch": 0.001985963864416903, "grad_norm": 0.66796875, "grad_norm_var": 0.005812009175618489, "learning_rate": 2e-05, "loss": 1.362, "loss/crossentropy": 2.513373374938965, "loss/dist_ce": 0.0, "loss/fcd": 1.1875, "loss/idx": 12.0, "loss/logits": 0.1745297610759735, "step": 133 }, { "epoch": 0.002000895923547857, "grad_norm": 0.5859375, "grad_norm_var": 0.0058197021484375, "learning_rate": 2e-05, "loss": 1.2976, "loss/crossentropy": 2.5316669940948486, "loss/dist_ce": 0.0, "loss/fcd": 1.125, "loss/idx": 12.0, "loss/logits": 0.1726335734128952, "step": 134 }, { "epoch": 0.0020158279826788116, "grad_norm": 0.63671875, "grad_norm_var": 0.005877685546875, "learning_rate": 2e-05, "loss": 1.395, "loss/crossentropy": 2.557969331741333, "loss/dist_ce": 0.0, "loss/fcd": 1.1953125, "loss/idx": 12.0, "loss/logits": 0.19964131712913513, "step": 135 }, { "epoch": 0.0020307600418097657, "grad_norm": 0.67578125, "grad_norm_var": 0.004100990295410156, "learning_rate": 2e-05, "loss": 1.216, "loss/crossentropy": 2.6619279384613037, "loss/dist_ce": 0.0, "loss/fcd": 1.0703125, "loss/idx": 12.0, "loss/logits": 0.14569371938705444, "step": 136 }, { "epoch": 0.0020456921009407198, "grad_norm": 0.56640625, "grad_norm_var": 0.00416711171468099, "learning_rate": 2e-05, "loss": 1.2596, "loss/crossentropy": 2.415104389190674, "loss/dist_ce": 0.0, "loss/fcd": 1.1015625, "loss/idx": 12.0, "loss/logits": 0.1580391526222229, "step": 137 }, { "epoch": 0.002060624160071674, "grad_norm": 0.640625, "grad_norm_var": 0.003918202718098959, "learning_rate": 2e-05, "loss": 1.4596, "loss/crossentropy": 2.653369903564453, "loss/dist_ce": 0.0, "loss/fcd": 1.2265625, "loss/idx": 12.0, "loss/logits": 0.2330111861228943, "step": 138 }, { "epoch": 0.002075556219202628, "grad_norm": 0.8984375, "grad_norm_var": 0.009250831604003907, "learning_rate": 2e-05, "loss": 1.4266, "loss/crossentropy": 2.440645694732666, "loss/dist_ce": 0.0, "loss/fcd": 1.21875, "loss/idx": 12.0, "loss/logits": 0.2078884392976761, "step": 139 }, { "epoch": 0.002090488278333582, "grad_norm": 0.58984375, "grad_norm_var": 0.009299468994140626, "learning_rate": 2e-05, "loss": 1.2998, "loss/crossentropy": 2.5413432121276855, "loss/dist_ce": 0.0, "loss/fcd": 1.125, "loss/idx": 12.0, "loss/logits": 0.1748320460319519, "step": 140 }, { "epoch": 0.0021054203374645366, "grad_norm": 0.6171875, "grad_norm_var": 0.008953857421875, "learning_rate": 2e-05, "loss": 1.4042, "loss/crossentropy": 2.618962287902832, "loss/dist_ce": 0.0, "loss/fcd": 1.1953125, "loss/idx": 12.0, "loss/logits": 0.20887598395347595, "step": 141 }, { "epoch": 0.0021203523965954907, "grad_norm": 0.5703125, "grad_norm_var": 0.008452796936035156, "learning_rate": 2e-05, "loss": 1.2957, "loss/crossentropy": 2.5938720703125, "loss/dist_ce": 0.0, "loss/fcd": 1.1171875, "loss/idx": 12.0, "loss/logits": 0.17847199738025665, "step": 142 }, { "epoch": 0.0021352844557264447, "grad_norm": 0.56640625, "grad_norm_var": 0.007852935791015625, "learning_rate": 2e-05, "loss": 1.2851, "loss/crossentropy": 2.5696861743927, "loss/dist_ce": 0.0, "loss/fcd": 1.1171875, "loss/idx": 12.0, "loss/logits": 0.167904332280159, "step": 143 }, { "epoch": 0.002150216514857399, "grad_norm": 0.5390625, "grad_norm_var": 0.008208656311035156, "learning_rate": 2e-05, "loss": 1.294, "loss/crossentropy": 2.61997389793396, "loss/dist_ce": 0.0, "loss/fcd": 1.1171875, "loss/idx": 12.0, "loss/logits": 0.17682784795761108, "step": 144 }, { "epoch": 0.002165148573988353, "grad_norm": 0.91796875, "grad_norm_var": 0.013388824462890626, "learning_rate": 2e-05, "loss": 1.4053, "loss/crossentropy": 2.3499011993408203, "loss/dist_ce": 0.0, "loss/fcd": 1.21875, "loss/idx": 12.0, "loss/logits": 0.18656103312969208, "step": 145 }, { "epoch": 0.002180080633119307, "grad_norm": 0.69921875, "grad_norm_var": 0.013637034098307292, "learning_rate": 2e-05, "loss": 1.4045, "loss/crossentropy": 2.7330119609832764, "loss/dist_ce": 0.0, "loss/fcd": 1.1875, "loss/idx": 12.0, "loss/logits": 0.21699491143226624, "step": 146 }, { "epoch": 0.002195012692250261, "grad_norm": 0.5625, "grad_norm_var": 0.01292870839436849, "learning_rate": 2e-05, "loss": 1.2919, "loss/crossentropy": 2.430858612060547, "loss/dist_ce": 0.0, "loss/fcd": 1.125, "loss/idx": 12.0, "loss/logits": 0.16690674424171448, "step": 147 }, { "epoch": 0.0022099447513812156, "grad_norm": 0.53125, "grad_norm_var": 0.0132843017578125, "learning_rate": 2e-05, "loss": 1.2364, "loss/crossentropy": 2.6031928062438965, "loss/dist_ce": 0.0, "loss/fcd": 1.078125, "loss/idx": 12.0, "loss/logits": 0.1582430899143219, "step": 148 }, { "epoch": 0.0022248768105121697, "grad_norm": 0.55859375, "grad_norm_var": 0.0136474609375, "learning_rate": 2e-05, "loss": 1.2916, "loss/crossentropy": 2.755666494369507, "loss/dist_ce": 0.0, "loss/fcd": 1.1171875, "loss/idx": 12.0, "loss/logits": 0.17443333566188812, "step": 149 }, { "epoch": 0.002239808869643124, "grad_norm": 0.60546875, "grad_norm_var": 0.013544146219889324, "learning_rate": 2e-05, "loss": 1.3718, "loss/crossentropy": 2.467615842819214, "loss/dist_ce": 0.0, "loss/fcd": 1.1875, "loss/idx": 12.0, "loss/logits": 0.18433909118175507, "step": 150 }, { "epoch": 0.002254740928774078, "grad_norm": 0.5390625, "grad_norm_var": 0.014130655924479167, "learning_rate": 2e-05, "loss": 1.2852, "loss/crossentropy": 2.555243730545044, "loss/dist_ce": 0.0, "loss/fcd": 1.109375, "loss/idx": 12.0, "loss/logits": 0.17583933472633362, "step": 151 }, { "epoch": 0.002269672987905032, "grad_norm": 0.6171875, "grad_norm_var": 0.013986651102701824, "learning_rate": 2e-05, "loss": 1.3463, "loss/crossentropy": 2.4775476455688477, "loss/dist_ce": 0.0, "loss/fcd": 1.15625, "loss/idx": 12.0, "loss/logits": 0.19002023339271545, "step": 152 }, { "epoch": 0.002284605047035986, "grad_norm": 0.546875, "grad_norm_var": 0.014166259765625, "learning_rate": 2e-05, "loss": 1.2389, "loss/crossentropy": 2.747436761856079, "loss/dist_ce": 0.0, "loss/fcd": 1.09375, "loss/idx": 12.0, "loss/logits": 0.14515338838100433, "step": 153 }, { "epoch": 0.0022995371061669406, "grad_norm": 0.625, "grad_norm_var": 0.014148966471354166, "learning_rate": 2e-05, "loss": 1.3508, "loss/crossentropy": 2.550513505935669, "loss/dist_ce": 0.0, "loss/fcd": 1.1484375, "loss/idx": 12.0, "loss/logits": 0.20233154296875, "step": 154 }, { "epoch": 0.0023144691652978947, "grad_norm": 0.609375, "grad_norm_var": 0.008794911702473958, "learning_rate": 2e-05, "loss": 1.3803, "loss/crossentropy": 2.761523723602295, "loss/dist_ce": 0.0, "loss/fcd": 1.1875, "loss/idx": 12.0, "loss/logits": 0.19279026985168457, "step": 155 }, { "epoch": 0.002329401224428849, "grad_norm": 0.625, "grad_norm_var": 0.008796628316243489, "learning_rate": 2e-05, "loss": 1.3714, "loss/crossentropy": 2.6067566871643066, "loss/dist_ce": 0.0, "loss/fcd": 1.1484375, "loss/idx": 12.0, "loss/logits": 0.22298991680145264, "step": 156 }, { "epoch": 0.002344333283559803, "grad_norm": 0.486328125, "grad_norm_var": 0.00970927874247233, "learning_rate": 2e-05, "loss": 1.166, "loss/crossentropy": 2.559112548828125, "loss/dist_ce": 0.0, "loss/fcd": 1.015625, "loss/idx": 12.0, "loss/logits": 0.1503329873085022, "step": 157 }, { "epoch": 0.002359265342690757, "grad_norm": 0.5546875, "grad_norm_var": 0.009786335627237956, "learning_rate": 2e-05, "loss": 1.2461, "loss/crossentropy": 2.5379300117492676, "loss/dist_ce": 0.0, "loss/fcd": 1.0859375, "loss/idx": 12.0, "loss/logits": 0.16015547513961792, "step": 158 }, { "epoch": 0.002374197401821711, "grad_norm": 0.55859375, "grad_norm_var": 0.009824101130167644, "learning_rate": 2e-05, "loss": 1.3303, "loss/crossentropy": 2.4808268547058105, "loss/dist_ce": 0.0, "loss/fcd": 1.15625, "loss/idx": 12.0, "loss/logits": 0.1740918755531311, "step": 159 }, { "epoch": 0.002389129460952665, "grad_norm": 0.478515625, "grad_norm_var": 0.01053314208984375, "learning_rate": 2e-05, "loss": 1.1762, "loss/crossentropy": 2.506295919418335, "loss/dist_ce": 0.0, "loss/fcd": 1.0234375, "loss/idx": 12.0, "loss/logits": 0.15280288457870483, "step": 160 }, { "epoch": 0.0024040615200836197, "grad_norm": 0.5703125, "grad_norm_var": 0.0031035741170247397, "learning_rate": 2e-05, "loss": 1.2835, "loss/crossentropy": 2.7988052368164062, "loss/dist_ce": 0.0, "loss/fcd": 1.109375, "loss/idx": 12.0, "loss/logits": 0.17412468791007996, "step": 161 }, { "epoch": 0.002418993579214574, "grad_norm": 0.57421875, "grad_norm_var": 0.001976458231608073, "learning_rate": 2e-05, "loss": 1.3794, "loss/crossentropy": 2.4478092193603516, "loss/dist_ce": 0.0, "loss/fcd": 1.1875, "loss/idx": 12.0, "loss/logits": 0.19193212687969208, "step": 162 }, { "epoch": 0.002433925638345528, "grad_norm": 0.52734375, "grad_norm_var": 0.0020662943522135415, "learning_rate": 2e-05, "loss": 1.2295, "loss/crossentropy": 2.6355202198028564, "loss/dist_ce": 0.0, "loss/fcd": 1.0703125, "loss/idx": 12.0, "loss/logits": 0.1592077910900116, "step": 163 }, { "epoch": 0.002448857697476482, "grad_norm": 0.5234375, "grad_norm_var": 0.0021031697591145835, "learning_rate": 2e-05, "loss": 1.2719, "loss/crossentropy": 2.6865055561065674, "loss/dist_ce": 0.0, "loss/fcd": 1.1015625, "loss/idx": 12.0, "loss/logits": 0.17034964263439178, "step": 164 }, { "epoch": 0.002463789756607436, "grad_norm": 0.625, "grad_norm_var": 0.002344195048014323, "learning_rate": 2e-05, "loss": 1.4733, "loss/crossentropy": 2.4416587352752686, "loss/dist_ce": 0.0, "loss/fcd": 1.2421875, "loss/idx": 12.0, "loss/logits": 0.23110373318195343, "step": 165 }, { "epoch": 0.00247872181573839, "grad_norm": 0.5625, "grad_norm_var": 0.0022371927897135418, "learning_rate": 2e-05, "loss": 1.4188, "loss/crossentropy": 2.4493439197540283, "loss/dist_ce": 0.0, "loss/fcd": 1.21875, "loss/idx": 12.0, "loss/logits": 0.20008787512779236, "step": 166 }, { "epoch": 0.0024936538748693447, "grad_norm": 0.55078125, "grad_norm_var": 0.002206865946451823, "learning_rate": 2e-05, "loss": 1.2253, "loss/crossentropy": 2.712056875228882, "loss/dist_ce": 0.0, "loss/fcd": 1.0625, "loss/idx": 12.0, "loss/logits": 0.16277046501636505, "step": 167 }, { "epoch": 0.002508585934000299, "grad_norm": 0.6171875, "grad_norm_var": 0.002206865946451823, "learning_rate": 2e-05, "loss": 1.4075, "loss/crossentropy": 2.426335096359253, "loss/dist_ce": 0.0, "loss/fcd": 1.1953125, "loss/idx": 12.0, "loss/logits": 0.21219667792320251, "step": 168 }, { "epoch": 0.002523517993131253, "grad_norm": 0.5859375, "grad_norm_var": 0.0022094090779622394, "learning_rate": 2e-05, "loss": 1.4624, "loss/crossentropy": 2.7482047080993652, "loss/dist_ce": 0.0, "loss/fcd": 1.234375, "loss/idx": 12.0, "loss/logits": 0.22799468040466309, "step": 169 }, { "epoch": 0.002538450052262207, "grad_norm": 0.55078125, "grad_norm_var": 0.0019810994466145835, "learning_rate": 2e-05, "loss": 1.2488, "loss/crossentropy": 2.6758551597595215, "loss/dist_ce": 0.0, "loss/fcd": 1.0859375, "loss/idx": 12.0, "loss/logits": 0.16287586092948914, "step": 170 }, { "epoch": 0.002553382111393161, "grad_norm": 0.5625, "grad_norm_var": 0.0018254597981770834, "learning_rate": 2e-05, "loss": 1.2867, "loss/crossentropy": 2.5498204231262207, "loss/dist_ce": 0.0, "loss/fcd": 1.109375, "loss/idx": 12.0, "loss/logits": 0.17730045318603516, "step": 171 }, { "epoch": 0.002568314170524115, "grad_norm": 0.53515625, "grad_norm_var": 0.001546160380045573, "learning_rate": 2e-05, "loss": 1.2392, "loss/crossentropy": 2.717747926712036, "loss/dist_ce": 0.0, "loss/fcd": 1.078125, "loss/idx": 12.0, "loss/logits": 0.16104069352149963, "step": 172 }, { "epoch": 0.0025832462296550692, "grad_norm": 0.78515625, "grad_norm_var": 0.004432789484659831, "learning_rate": 2e-05, "loss": 1.3913, "loss/crossentropy": 2.50931715965271, "loss/dist_ce": 0.0, "loss/fcd": 1.203125, "loss/idx": 12.0, "loss/logits": 0.18817588686943054, "step": 173 }, { "epoch": 0.0025981782887860238, "grad_norm": 0.5234375, "grad_norm_var": 0.0045685927073160805, "learning_rate": 2e-05, "loss": 1.2485, "loss/crossentropy": 2.446424961090088, "loss/dist_ce": 0.0, "loss/fcd": 1.0859375, "loss/idx": 12.0, "loss/logits": 0.16260743141174316, "step": 174 }, { "epoch": 0.002613110347916978, "grad_norm": 0.58984375, "grad_norm_var": 0.00457927385965983, "learning_rate": 2e-05, "loss": 1.4057, "loss/crossentropy": 2.4681448936462402, "loss/dist_ce": 0.0, "loss/fcd": 1.203125, "loss/idx": 12.0, "loss/logits": 0.20260846614837646, "step": 175 }, { "epoch": 0.002628042407047932, "grad_norm": 0.5546875, "grad_norm_var": 0.003986040751139323, "learning_rate": 2e-05, "loss": 1.2474, "loss/crossentropy": 2.8297557830810547, "loss/dist_ce": 0.0, "loss/fcd": 1.0859375, "loss/idx": 12.0, "loss/logits": 0.1614263653755188, "step": 176 }, { "epoch": 0.002642974466178886, "grad_norm": 0.55078125, "grad_norm_var": 0.0040283203125, "learning_rate": 2e-05, "loss": 1.1839, "loss/crossentropy": 2.594815731048584, "loss/dist_ce": 0.0, "loss/fcd": 1.03125, "loss/idx": 12.0, "loss/logits": 0.1526121199131012, "step": 177 }, { "epoch": 0.00265790652530984, "grad_norm": 0.51953125, "grad_norm_var": 0.004229482014973958, "learning_rate": 2e-05, "loss": 1.3533, "loss/crossentropy": 2.3881778717041016, "loss/dist_ce": 0.0, "loss/fcd": 1.1484375, "loss/idx": 12.0, "loss/logits": 0.20487645268440247, "step": 178 }, { "epoch": 0.0026728385844407942, "grad_norm": 0.5, "grad_norm_var": 0.00444176991780599, "learning_rate": 2e-05, "loss": 1.1749, "loss/crossentropy": 2.5949110984802246, "loss/dist_ce": 0.0, "loss/fcd": 1.0234375, "loss/idx": 12.0, "loss/logits": 0.15150848031044006, "step": 179 }, { "epoch": 0.0026877706435717488, "grad_norm": 0.58203125, "grad_norm_var": 0.004284413655598959, "learning_rate": 2e-05, "loss": 1.2842, "loss/crossentropy": 2.8450071811676025, "loss/dist_ce": 0.0, "loss/fcd": 1.1171875, "loss/idx": 12.0, "loss/logits": 0.16698706150054932, "step": 180 }, { "epoch": 0.002702702702702703, "grad_norm": 0.48828125, "grad_norm_var": 0.004535865783691406, "learning_rate": 2e-05, "loss": 1.2074, "loss/crossentropy": 2.5095417499542236, "loss/dist_ce": 0.0, "loss/fcd": 1.046875, "loss/idx": 12.0, "loss/logits": 0.1605302095413208, "step": 181 }, { "epoch": 0.002717634761833657, "grad_norm": 0.6015625, "grad_norm_var": 0.004612159729003906, "learning_rate": 2e-05, "loss": 1.359, "loss/crossentropy": 2.6669504642486572, "loss/dist_ce": 0.0, "loss/fcd": 1.15625, "loss/idx": 12.0, "loss/logits": 0.20277641713619232, "step": 182 }, { "epoch": 0.002732566820964611, "grad_norm": 0.5625, "grad_norm_var": 0.0045928955078125, "learning_rate": 2e-05, "loss": 1.1927, "loss/crossentropy": 2.5329155921936035, "loss/dist_ce": 0.0, "loss/fcd": 1.046875, "loss/idx": 12.0, "loss/logits": 0.14582672715187073, "step": 183 }, { "epoch": 0.002747498880095565, "grad_norm": 0.56640625, "grad_norm_var": 0.004430071512858073, "learning_rate": 2e-05, "loss": 1.354, "loss/crossentropy": 2.6010122299194336, "loss/dist_ce": 0.0, "loss/fcd": 1.15625, "loss/idx": 12.0, "loss/logits": 0.19775637984275818, "step": 184 }, { "epoch": 0.0027624309392265192, "grad_norm": 0.56640625, "grad_norm_var": 0.0044024149576822914, "learning_rate": 2e-05, "loss": 1.3048, "loss/crossentropy": 2.39243483543396, "loss/dist_ce": 0.0, "loss/fcd": 1.125, "loss/idx": 12.0, "loss/logits": 0.17982321977615356, "step": 185 }, { "epoch": 0.0027773629983574733, "grad_norm": 0.515625, "grad_norm_var": 0.004546038309733073, "learning_rate": 2e-05, "loss": 1.2813, "loss/crossentropy": 2.5203473567962646, "loss/dist_ce": 0.0, "loss/fcd": 1.1171875, "loss/idx": 12.0, "loss/logits": 0.16413094103336334, "step": 186 }, { "epoch": 0.002792295057488428, "grad_norm": 0.68359375, "grad_norm_var": 0.0054585774739583336, "learning_rate": 2e-05, "loss": 1.5839, "loss/crossentropy": 2.391411542892456, "loss/dist_ce": 0.0, "loss/fcd": 1.328125, "loss/idx": 12.0, "loss/logits": 0.2557827830314636, "step": 187 }, { "epoch": 0.002807227116619382, "grad_norm": 0.53125, "grad_norm_var": 0.00547784169514974, "learning_rate": 2e-05, "loss": 1.2742, "loss/crossentropy": 2.7311580181121826, "loss/dist_ce": 0.0, "loss/fcd": 1.1015625, "loss/idx": 12.0, "loss/logits": 0.17259860038757324, "step": 188 }, { "epoch": 0.002822159175750336, "grad_norm": 0.54296875, "grad_norm_var": 0.0021982192993164062, "learning_rate": 2e-05, "loss": 1.2556, "loss/crossentropy": 2.5843493938446045, "loss/dist_ce": 0.0, "loss/fcd": 1.0859375, "loss/idx": 12.0, "loss/logits": 0.16964314877986908, "step": 189 }, { "epoch": 0.00283709123488129, "grad_norm": 0.5390625, "grad_norm_var": 0.002147865295410156, "learning_rate": 2e-05, "loss": 1.2337, "loss/crossentropy": 2.5467610359191895, "loss/dist_ce": 0.0, "loss/fcd": 1.078125, "loss/idx": 12.0, "loss/logits": 0.15552528202533722, "step": 190 }, { "epoch": 0.002852023294012244, "grad_norm": 0.5859375, "grad_norm_var": 0.0021311442057291665, "learning_rate": 2e-05, "loss": 1.3968, "loss/crossentropy": 2.381016492843628, "loss/dist_ce": 0.0, "loss/fcd": 1.203125, "loss/idx": 12.0, "loss/logits": 0.19368363916873932, "step": 191 }, { "epoch": 0.0028669553531431983, "grad_norm": 0.55078125, "grad_norm_var": 0.002132606506347656, "learning_rate": 2e-05, "loss": 1.257, "loss/crossentropy": 2.8202872276306152, "loss/dist_ce": 0.0, "loss/fcd": 1.0859375, "loss/idx": 12.0, "loss/logits": 0.1710858792066574, "step": 192 }, { "epoch": 0.0028818874122741524, "grad_norm": 0.67578125, "grad_norm_var": 0.0030318578084309895, "learning_rate": 2e-05, "loss": 1.4907, "loss/crossentropy": 2.774198055267334, "loss/dist_ce": 0.0, "loss/fcd": 1.25, "loss/idx": 12.0, "loss/logits": 0.24069786071777344, "step": 193 }, { "epoch": 0.002896819471405107, "grad_norm": 0.6015625, "grad_norm_var": 0.0029744466145833334, "learning_rate": 2e-05, "loss": 1.2802, "loss/crossentropy": 2.622847557067871, "loss/dist_ce": 0.0, "loss/fcd": 1.1015625, "loss/idx": 12.0, "loss/logits": 0.17865772545337677, "step": 194 }, { "epoch": 0.002911751530536061, "grad_norm": 0.515625, "grad_norm_var": 0.0028472900390625, "learning_rate": 2e-05, "loss": 1.1273, "loss/crossentropy": 2.44476056098938, "loss/dist_ce": 0.0, "loss/fcd": 0.9921875, "loss/idx": 12.0, "loss/logits": 0.13513167202472687, "step": 195 }, { "epoch": 0.002926683589667015, "grad_norm": 0.6015625, "grad_norm_var": 0.002904192606608073, "learning_rate": 2e-05, "loss": 1.4153, "loss/crossentropy": 2.6610636711120605, "loss/dist_ce": 0.0, "loss/fcd": 1.203125, "loss/idx": 12.0, "loss/logits": 0.21217352151870728, "step": 196 }, { "epoch": 0.002941615648797969, "grad_norm": 0.6640625, "grad_norm_var": 0.002907053629557292, "learning_rate": 2e-05, "loss": 1.48, "loss/crossentropy": 2.434468984603882, "loss/dist_ce": 0.0, "loss/fcd": 1.28125, "loss/idx": 12.0, "loss/logits": 0.19873744249343872, "step": 197 }, { "epoch": 0.0029565477079289233, "grad_norm": 0.51171875, "grad_norm_var": 0.0031717300415039064, "learning_rate": 2e-05, "loss": 1.2033, "loss/crossentropy": 2.4664409160614014, "loss/dist_ce": 0.0, "loss/fcd": 1.0390625, "loss/idx": 12.0, "loss/logits": 0.1642196774482727, "step": 198 }, { "epoch": 0.0029714797670598774, "grad_norm": 0.58984375, "grad_norm_var": 0.0031695048014322917, "learning_rate": 2e-05, "loss": 1.3863, "loss/crossentropy": 2.6245970726013184, "loss/dist_ce": 0.0, "loss/fcd": 1.2109375, "loss/idx": 12.0, "loss/logits": 0.17531853914260864, "step": 199 }, { "epoch": 0.002986411826190832, "grad_norm": 0.546875, "grad_norm_var": 0.0032225926717122395, "learning_rate": 2e-05, "loss": 1.2205, "loss/crossentropy": 2.24330735206604, "loss/dist_ce": 0.0, "loss/fcd": 1.0859375, "loss/idx": 12.0, "loss/logits": 0.13458162546157837, "step": 200 }, { "epoch": 0.003001343885321786, "grad_norm": 0.54296875, "grad_norm_var": 0.0032882054646809896, "learning_rate": 2e-05, "loss": 1.2716, "loss/crossentropy": 2.551177978515625, "loss/dist_ce": 0.0, "loss/fcd": 1.09375, "loss/idx": 12.0, "loss/logits": 0.17783880233764648, "step": 201 }, { "epoch": 0.00301627594445274, "grad_norm": 0.53515625, "grad_norm_var": 0.0031575520833333334, "learning_rate": 2e-05, "loss": 1.1211, "loss/crossentropy": 2.748706102371216, "loss/dist_ce": 0.0, "loss/fcd": 0.984375, "loss/idx": 12.0, "loss/logits": 0.13667932152748108, "step": 202 }, { "epoch": 0.003031208003583694, "grad_norm": 0.55078125, "grad_norm_var": 0.002357737223307292, "learning_rate": 2e-05, "loss": 1.2637, "loss/crossentropy": 2.560105800628662, "loss/dist_ce": 0.0, "loss/fcd": 1.109375, "loss/idx": 12.0, "loss/logits": 0.15430858731269836, "step": 203 }, { "epoch": 0.0030461400627146483, "grad_norm": 0.53125, "grad_norm_var": 0.002357737223307292, "learning_rate": 2e-05, "loss": 1.2106, "loss/crossentropy": 2.519240140914917, "loss/dist_ce": 0.0, "loss/fcd": 1.0546875, "loss/idx": 12.0, "loss/logits": 0.15595705807209015, "step": 204 }, { "epoch": 0.0030610721218456024, "grad_norm": 0.5625, "grad_norm_var": 0.002316729227701823, "learning_rate": 2e-05, "loss": 1.2509, "loss/crossentropy": 2.8139536380767822, "loss/dist_ce": 0.0, "loss/fcd": 1.0859375, "loss/idx": 12.0, "loss/logits": 0.1649806946516037, "step": 205 }, { "epoch": 0.0030760041809765565, "grad_norm": 0.5234375, "grad_norm_var": 0.002394549051920573, "learning_rate": 2e-05, "loss": 1.2708, "loss/crossentropy": 2.5938565731048584, "loss/dist_ce": 0.0, "loss/fcd": 1.1015625, "loss/idx": 12.0, "loss/logits": 0.16919182240962982, "step": 206 }, { "epoch": 0.003090936240107511, "grad_norm": 0.59375, "grad_norm_var": 0.0024169286092122397, "learning_rate": 2e-05, "loss": 1.3906, "loss/crossentropy": 2.7333359718322754, "loss/dist_ce": 0.0, "loss/fcd": 1.1875, "loss/idx": 12.0, "loss/logits": 0.20309199392795563, "step": 207 }, { "epoch": 0.003105868299238465, "grad_norm": 0.5625, "grad_norm_var": 0.0023976643880208332, "learning_rate": 2e-05, "loss": 1.2733, "loss/crossentropy": 2.491389513015747, "loss/dist_ce": 0.0, "loss/fcd": 1.09375, "loss/idx": 12.0, "loss/logits": 0.17951218783855438, "step": 208 }, { "epoch": 0.003120800358369419, "grad_norm": 0.55859375, "grad_norm_var": 0.0015927632649739584, "learning_rate": 2e-05, "loss": 1.2968, "loss/crossentropy": 2.423560380935669, "loss/dist_ce": 0.0, "loss/fcd": 1.125, "loss/idx": 12.0, "loss/logits": 0.17177289724349976, "step": 209 }, { "epoch": 0.0031357324175003733, "grad_norm": 0.53515625, "grad_norm_var": 0.0015181859334309896, "learning_rate": 2e-05, "loss": 1.2264, "loss/crossentropy": 2.598179817199707, "loss/dist_ce": 0.0, "loss/fcd": 1.0703125, "loss/idx": 12.0, "loss/logits": 0.15610140562057495, "step": 210 }, { "epoch": 0.0031506644766313274, "grad_norm": 0.8828125, "grad_norm_var": 0.007877031962076822, "learning_rate": 2e-05, "loss": 1.3882, "loss/crossentropy": 2.521733045578003, "loss/dist_ce": 0.0, "loss/fcd": 1.1875, "loss/idx": 12.0, "loss/logits": 0.2007344365119934, "step": 211 }, { "epoch": 0.0031655965357622814, "grad_norm": 0.5625, "grad_norm_var": 0.00786431630452474, "learning_rate": 2e-05, "loss": 1.2254, "loss/crossentropy": 2.67375111579895, "loss/dist_ce": 0.0, "loss/fcd": 1.0625, "loss/idx": 12.0, "loss/logits": 0.16288092732429504, "step": 212 }, { "epoch": 0.003180528594893236, "grad_norm": 0.56640625, "grad_norm_var": 0.007344563802083333, "learning_rate": 2e-05, "loss": 1.2893, "loss/crossentropy": 2.398942470550537, "loss/dist_ce": 0.0, "loss/fcd": 1.1171875, "loss/idx": 12.0, "loss/logits": 0.17206481099128723, "step": 213 }, { "epoch": 0.00319546065402419, "grad_norm": 0.57421875, "grad_norm_var": 0.0070841471354166664, "learning_rate": 2e-05, "loss": 1.2304, "loss/crossentropy": 2.648841142654419, "loss/dist_ce": 0.0, "loss/fcd": 1.0703125, "loss/idx": 12.0, "loss/logits": 0.16006574034690857, "step": 214 }, { "epoch": 0.003210392713155144, "grad_norm": 0.60546875, "grad_norm_var": 0.007127888997395833, "learning_rate": 2e-05, "loss": 1.3215, "loss/crossentropy": 2.5586395263671875, "loss/dist_ce": 0.0, "loss/fcd": 1.140625, "loss/idx": 12.0, "loss/logits": 0.18091149628162384, "step": 215 }, { "epoch": 0.0032253247722860983, "grad_norm": 0.828125, "grad_norm_var": 0.010936482747395834, "learning_rate": 2e-05, "loss": 1.4868, "loss/crossentropy": 2.3754496574401855, "loss/dist_ce": 0.0, "loss/fcd": 1.28125, "loss/idx": 12.0, "loss/logits": 0.20558351278305054, "step": 216 }, { "epoch": 0.0032402568314170523, "grad_norm": 0.57421875, "grad_norm_var": 0.0107818603515625, "learning_rate": 2e-05, "loss": 1.2598, "loss/crossentropy": 2.624929189682007, "loss/dist_ce": 0.0, "loss/fcd": 1.1015625, "loss/idx": 12.0, "loss/logits": 0.15819445252418518, "step": 217 }, { "epoch": 0.0032551888905480064, "grad_norm": 0.55859375, "grad_norm_var": 0.010623931884765625, "learning_rate": 2e-05, "loss": 1.3027, "loss/crossentropy": 2.589963674545288, "loss/dist_ce": 0.0, "loss/fcd": 1.1328125, "loss/idx": 12.0, "loss/logits": 0.16989606618881226, "step": 218 }, { "epoch": 0.0032701209496789605, "grad_norm": 0.58203125, "grad_norm_var": 0.010487620035807292, "learning_rate": 2e-05, "loss": 1.3079, "loss/crossentropy": 2.4396564960479736, "loss/dist_ce": 0.0, "loss/fcd": 1.1328125, "loss/idx": 12.0, "loss/logits": 0.17511269450187683, "step": 219 }, { "epoch": 0.003285053008809915, "grad_norm": 0.59375, "grad_norm_var": 0.010158030192057292, "learning_rate": 2e-05, "loss": 1.3446, "loss/crossentropy": 2.4202687740325928, "loss/dist_ce": 0.0, "loss/fcd": 1.1640625, "loss/idx": 12.0, "loss/logits": 0.1805376559495926, "step": 220 }, { "epoch": 0.003299985067940869, "grad_norm": 0.5234375, "grad_norm_var": 0.010469563802083333, "learning_rate": 2e-05, "loss": 1.1811, "loss/crossentropy": 2.558259963989258, "loss/dist_ce": 0.0, "loss/fcd": 1.03125, "loss/idx": 12.0, "loss/logits": 0.14983907341957092, "step": 221 }, { "epoch": 0.0033149171270718232, "grad_norm": 0.5234375, "grad_norm_var": 0.010469563802083333, "learning_rate": 2e-05, "loss": 1.2646, "loss/crossentropy": 2.6712141036987305, "loss/dist_ce": 0.0, "loss/fcd": 1.09375, "loss/idx": 12.0, "loss/logits": 0.1708478480577469, "step": 222 }, { "epoch": 0.0033298491862027773, "grad_norm": 0.51171875, "grad_norm_var": 0.01097558339436849, "learning_rate": 2e-05, "loss": 1.2885, "loss/crossentropy": 2.7163174152374268, "loss/dist_ce": 0.0, "loss/fcd": 1.1171875, "loss/idx": 12.0, "loss/logits": 0.17126557230949402, "step": 223 }, { "epoch": 0.0033447812453337314, "grad_norm": 0.5703125, "grad_norm_var": 0.010944048563639322, "learning_rate": 2e-05, "loss": 1.2871, "loss/crossentropy": 2.553407907485962, "loss/dist_ce": 0.0, "loss/fcd": 1.1171875, "loss/idx": 12.0, "loss/logits": 0.1699165552854538, "step": 224 }, { "epoch": 0.0033597133044646855, "grad_norm": 0.54296875, "grad_norm_var": 0.011039161682128906, "learning_rate": 2e-05, "loss": 1.2835, "loss/crossentropy": 2.4799208641052246, "loss/dist_ce": 0.0, "loss/fcd": 1.125, "loss/idx": 12.0, "loss/logits": 0.15845397114753723, "step": 225 }, { "epoch": 0.00337464536359564, "grad_norm": 0.54296875, "grad_norm_var": 0.010979652404785156, "learning_rate": 2e-05, "loss": 1.2787, "loss/crossentropy": 2.745670795440674, "loss/dist_ce": 0.0, "loss/fcd": 1.109375, "loss/idx": 12.0, "loss/logits": 0.16931718587875366, "step": 226 }, { "epoch": 0.003389577422726594, "grad_norm": 0.60546875, "grad_norm_var": 0.0051971435546875, "learning_rate": 2e-05, "loss": 1.3744, "loss/crossentropy": 2.385148286819458, "loss/dist_ce": 0.0, "loss/fcd": 1.1640625, "loss/idx": 12.0, "loss/logits": 0.21031928062438965, "step": 227 }, { "epoch": 0.0034045094818575482, "grad_norm": 0.53515625, "grad_norm_var": 0.005304400126139323, "learning_rate": 2e-05, "loss": 1.3196, "loss/crossentropy": 2.319401264190674, "loss/dist_ce": 0.0, "loss/fcd": 1.1328125, "loss/idx": 12.0, "loss/logits": 0.1868358552455902, "step": 228 }, { "epoch": 0.0034194415409885023, "grad_norm": 0.5859375, "grad_norm_var": 0.005299631754557292, "learning_rate": 2e-05, "loss": 1.3091, "loss/crossentropy": 2.3205957412719727, "loss/dist_ce": 0.0, "loss/fcd": 1.140625, "loss/idx": 12.0, "loss/logits": 0.16845813393592834, "step": 229 }, { "epoch": 0.0034343736001194564, "grad_norm": 0.62109375, "grad_norm_var": 0.0054094950358072914, "learning_rate": 2e-05, "loss": 1.4095, "loss/crossentropy": 2.571542978286743, "loss/dist_ce": 0.0, "loss/fcd": 1.1953125, "loss/idx": 12.0, "loss/logits": 0.21414990723133087, "step": 230 }, { "epoch": 0.0034493056592504105, "grad_norm": 0.57421875, "grad_norm_var": 0.0053708394368489586, "learning_rate": 2e-05, "loss": 1.2948, "loss/crossentropy": 2.497636556625366, "loss/dist_ce": 0.0, "loss/fcd": 1.1171875, "loss/idx": 12.0, "loss/logits": 0.1775931417942047, "step": 231 }, { "epoch": 0.0034642377183813646, "grad_norm": 0.48828125, "grad_norm_var": 0.0013274510701497396, "learning_rate": 2e-05, "loss": 1.1676, "loss/crossentropy": 2.585477352142334, "loss/dist_ce": 0.0, "loss/fcd": 1.0234375, "loss/idx": 12.0, "loss/logits": 0.14414075016975403, "step": 232 }, { "epoch": 0.003479169777512319, "grad_norm": 0.70703125, "grad_norm_var": 0.002710914611816406, "learning_rate": 2e-05, "loss": 1.3074, "loss/crossentropy": 2.792330503463745, "loss/dist_ce": 0.0, "loss/fcd": 1.140625, "loss/idx": 12.0, "loss/logits": 0.16680563986301422, "step": 233 }, { "epoch": 0.003494101836643273, "grad_norm": 1.015625, "grad_norm_var": 0.015274810791015624, "learning_rate": 2e-05, "loss": 1.3413, "loss/crossentropy": 3.3043012619018555, "loss/dist_ce": 0.0, "loss/fcd": 1.203125, "loss/idx": 12.0, "loss/logits": 0.1381298005580902, "step": 234 }, { "epoch": 0.0035090338957742273, "grad_norm": 0.58203125, "grad_norm_var": 0.015274810791015624, "learning_rate": 2e-05, "loss": 1.2644, "loss/crossentropy": 2.5281269550323486, "loss/dist_ce": 0.0, "loss/fcd": 1.1015625, "loss/idx": 12.0, "loss/logits": 0.16281655430793762, "step": 235 }, { "epoch": 0.0035239659549051814, "grad_norm": 0.53515625, "grad_norm_var": 0.015500831604003906, "learning_rate": 2e-05, "loss": 1.2285, "loss/crossentropy": 2.3544559478759766, "loss/dist_ce": 0.0, "loss/fcd": 1.0703125, "loss/idx": 12.0, "loss/logits": 0.1581532061100006, "step": 236 }, { "epoch": 0.0035388980140361355, "grad_norm": 1.46875, "grad_norm_var": 0.0627664566040039, "learning_rate": 2e-05, "loss": 1.5838, "loss/crossentropy": 2.6616692543029785, "loss/dist_ce": 0.0, "loss/fcd": 1.40625, "loss/idx": 12.0, "loss/logits": 0.17750610411167145, "step": 237 }, { "epoch": 0.0035538300731670896, "grad_norm": 0.59375, "grad_norm_var": 0.061882972717285156, "learning_rate": 2e-05, "loss": 1.4213, "loss/crossentropy": 2.2179884910583496, "loss/dist_ce": 0.0, "loss/fcd": 1.2265625, "loss/idx": 12.0, "loss/logits": 0.1947222650051117, "step": 238 }, { "epoch": 0.003568762132298044, "grad_norm": 0.48828125, "grad_norm_var": 0.062365150451660155, "learning_rate": 2e-05, "loss": 1.1698, "loss/crossentropy": 2.5931408405303955, "loss/dist_ce": 0.0, "loss/fcd": 1.0234375, "loss/idx": 12.0, "loss/logits": 0.14633141458034515, "step": 239 }, { "epoch": 0.003583694191428998, "grad_norm": 0.80859375, "grad_norm_var": 0.06326878865559896, "learning_rate": 2e-05, "loss": 1.6428, "loss/crossentropy": 2.3654534816741943, "loss/dist_ce": 0.0, "loss/fcd": 1.359375, "loss/idx": 12.0, "loss/logits": 0.28339850902557373, "step": 240 }, { "epoch": 0.0035986262505599523, "grad_norm": 0.55078125, "grad_norm_var": 0.06314188639322917, "learning_rate": 2e-05, "loss": 1.2658, "loss/crossentropy": 2.5046465396881104, "loss/dist_ce": 0.0, "loss/fcd": 1.09375, "loss/idx": 12.0, "loss/logits": 0.1720809042453766, "step": 241 }, { "epoch": 0.0036135583096909064, "grad_norm": 0.54296875, "grad_norm_var": 0.06314188639322917, "learning_rate": 2e-05, "loss": 1.3045, "loss/crossentropy": 2.845069646835327, "loss/dist_ce": 0.0, "loss/fcd": 1.125, "loss/idx": 12.0, "loss/logits": 0.17947256565093994, "step": 242 }, { "epoch": 0.0036284903688218605, "grad_norm": 0.52734375, "grad_norm_var": 0.0641845703125, "learning_rate": 2e-05, "loss": 1.2002, "loss/crossentropy": 2.601872682571411, "loss/dist_ce": 0.0, "loss/fcd": 1.046875, "loss/idx": 12.0, "loss/logits": 0.15330010652542114, "step": 243 }, { "epoch": 0.0036434224279528146, "grad_norm": 0.7734375, "grad_norm_var": 0.06363773345947266, "learning_rate": 2e-05, "loss": 1.316, "loss/crossentropy": 2.63727068901062, "loss/dist_ce": 0.0, "loss/fcd": 1.125, "loss/idx": 12.0, "loss/logits": 0.19103199243545532, "step": 244 }, { "epoch": 0.0036583544870837687, "grad_norm": 0.53515625, "grad_norm_var": 0.0644287109375, "learning_rate": 2e-05, "loss": 1.2223, "loss/crossentropy": 2.713804006576538, "loss/dist_ce": 0.0, "loss/fcd": 1.0546875, "loss/idx": 12.0, "loss/logits": 0.16758683323860168, "step": 245 }, { "epoch": 0.003673286546214723, "grad_norm": 0.5859375, "grad_norm_var": 0.0647623062133789, "learning_rate": 2e-05, "loss": 1.2487, "loss/crossentropy": 2.487891674041748, "loss/dist_ce": 0.0, "loss/fcd": 1.0703125, "loss/idx": 12.0, "loss/logits": 0.1783592849969864, "step": 246 }, { "epoch": 0.0036882186053456773, "grad_norm": 0.546875, "grad_norm_var": 0.06517130533854167, "learning_rate": 2e-05, "loss": 1.2519, "loss/crossentropy": 2.4887685775756836, "loss/dist_ce": 0.0, "loss/fcd": 1.0859375, "loss/idx": 12.0, "loss/logits": 0.16592524945735931, "step": 247 }, { "epoch": 0.0037031506644766314, "grad_norm": 0.63671875, "grad_norm_var": 0.06291478474934896, "learning_rate": 2e-05, "loss": 1.3709, "loss/crossentropy": 2.388550043106079, "loss/dist_ce": 0.0, "loss/fcd": 1.171875, "loss/idx": 12.0, "loss/logits": 0.19902655482292175, "step": 248 }, { "epoch": 0.0037180827236075855, "grad_norm": 0.51953125, "grad_norm_var": 0.06446507771809896, "learning_rate": 2e-05, "loss": 1.1646, "loss/crossentropy": 2.6850759983062744, "loss/dist_ce": 0.0, "loss/fcd": 1.015625, "loss/idx": 12.0, "loss/logits": 0.14900657534599304, "step": 249 }, { "epoch": 0.0037330147827385396, "grad_norm": 0.55859375, "grad_norm_var": 0.056423886617024736, "learning_rate": 2e-05, "loss": 1.2667, "loss/crossentropy": 2.3784635066986084, "loss/dist_ce": 0.0, "loss/fcd": 1.109375, "loss/idx": 12.0, "loss/logits": 0.1572844386100769, "step": 250 }, { "epoch": 0.0037479468418694937, "grad_norm": 0.6171875, "grad_norm_var": 0.05622533162434896, "learning_rate": 2e-05, "loss": 1.3129, "loss/crossentropy": 2.604210376739502, "loss/dist_ce": 0.0, "loss/fcd": 1.140625, "loss/idx": 12.0, "loss/logits": 0.17225471138954163, "step": 251 }, { "epoch": 0.003762878901000448, "grad_norm": 0.51953125, "grad_norm_var": 0.056465403238932295, "learning_rate": 2e-05, "loss": 1.235, "loss/crossentropy": 2.756725549697876, "loss/dist_ce": 0.0, "loss/fcd": 1.0703125, "loss/idx": 12.0, "loss/logits": 0.16468365490436554, "step": 252 }, { "epoch": 0.0037778109601314023, "grad_norm": 0.52734375, "grad_norm_var": 0.008092689514160156, "learning_rate": 2e-05, "loss": 1.3165, "loss/crossentropy": 2.769130229949951, "loss/dist_ce": 0.0, "loss/fcd": 1.1328125, "loss/idx": 12.0, "loss/logits": 0.183657705783844, "step": 253 }, { "epoch": 0.0037927430192623564, "grad_norm": 0.5, "grad_norm_var": 0.008510780334472657, "learning_rate": 2e-05, "loss": 1.2422, "loss/crossentropy": 2.54736065864563, "loss/dist_ce": 0.0, "loss/fcd": 1.0625, "loss/idx": 12.0, "loss/logits": 0.17974743247032166, "step": 254 }, { "epoch": 0.0038076750783933105, "grad_norm": 0.4921875, "grad_norm_var": 0.008465321858723958, "learning_rate": 2e-05, "loss": 1.2686, "loss/crossentropy": 2.463906764984131, "loss/dist_ce": 0.0, "loss/fcd": 1.1015625, "loss/idx": 12.0, "loss/logits": 0.16703197360038757, "step": 255 }, { "epoch": 0.0038226071375242645, "grad_norm": 0.478515625, "grad_norm_var": 0.0051102797190348305, "learning_rate": 2e-05, "loss": 1.0878, "loss/crossentropy": 2.503507614135742, "loss/dist_ce": 0.0, "loss/fcd": 0.95703125, "loss/idx": 12.0, "loss/logits": 0.13080117106437683, "step": 256 }, { "epoch": 0.0038375391966552186, "grad_norm": 0.609375, "grad_norm_var": 0.005276219050089518, "learning_rate": 2e-05, "loss": 1.3397, "loss/crossentropy": 2.539508819580078, "loss/dist_ce": 0.0, "loss/fcd": 1.1640625, "loss/idx": 12.0, "loss/logits": 0.17561593651771545, "step": 257 }, { "epoch": 0.0038524712557861727, "grad_norm": 0.71484375, "grad_norm_var": 0.00671690305074056, "learning_rate": 2e-05, "loss": 1.3383, "loss/crossentropy": 2.8462297916412354, "loss/dist_ce": 0.0, "loss/fcd": 1.140625, "loss/idx": 12.0, "loss/logits": 0.1976788341999054, "step": 258 }, { "epoch": 0.0038674033149171273, "grad_norm": 0.53125, "grad_norm_var": 0.006694904963175456, "learning_rate": 2e-05, "loss": 1.2151, "loss/crossentropy": 2.5267205238342285, "loss/dist_ce": 0.0, "loss/fcd": 1.0625, "loss/idx": 12.0, "loss/logits": 0.15262790024280548, "step": 259 }, { "epoch": 0.0038823353740480814, "grad_norm": 0.58203125, "grad_norm_var": 0.003835026423136393, "learning_rate": 2e-05, "loss": 1.2916, "loss/crossentropy": 2.566385269165039, "loss/dist_ce": 0.0, "loss/fcd": 1.1171875, "loss/idx": 12.0, "loss/logits": 0.17439204454421997, "step": 260 }, { "epoch": 0.0038972674331790354, "grad_norm": 0.55078125, "grad_norm_var": 0.0037991682688395183, "learning_rate": 2e-05, "loss": 1.2338, "loss/crossentropy": 2.4941866397857666, "loss/dist_ce": 0.0, "loss/fcd": 1.078125, "loss/idx": 12.0, "loss/logits": 0.15566781163215637, "step": 261 }, { "epoch": 0.0039121994923099895, "grad_norm": 0.671875, "grad_norm_var": 0.004550282160441081, "learning_rate": 2e-05, "loss": 1.3631, "loss/crossentropy": 2.345531702041626, "loss/dist_ce": 0.0, "loss/fcd": 1.1875, "loss/idx": 12.0, "loss/logits": 0.17564262449741364, "step": 262 }, { "epoch": 0.003927131551440944, "grad_norm": 0.51171875, "grad_norm_var": 0.0047173659006754555, "learning_rate": 2e-05, "loss": 1.2948, "loss/crossentropy": 2.719332695007324, "loss/dist_ce": 0.0, "loss/fcd": 1.125, "loss/idx": 12.0, "loss/logits": 0.1697680652141571, "step": 263 }, { "epoch": 0.003942063610571898, "grad_norm": 0.5078125, "grad_norm_var": 0.004503361384073893, "learning_rate": 2e-05, "loss": 1.2181, "loss/crossentropy": 2.4483582973480225, "loss/dist_ce": 0.0, "loss/fcd": 1.0546875, "loss/idx": 12.0, "loss/logits": 0.1634569764137268, "step": 264 }, { "epoch": 0.003956995669702852, "grad_norm": 0.55078125, "grad_norm_var": 0.004413334528605143, "learning_rate": 2e-05, "loss": 1.3577, "loss/crossentropy": 2.466128349304199, "loss/dist_ce": 0.0, "loss/fcd": 1.15625, "loss/idx": 12.0, "loss/logits": 0.20145492255687714, "step": 265 }, { "epoch": 0.003971927728833806, "grad_norm": 0.474609375, "grad_norm_var": 0.00484460194905599, "learning_rate": 2e-05, "loss": 1.1326, "loss/crossentropy": 2.559739351272583, "loss/dist_ce": 0.0, "loss/fcd": 0.99609375, "loss/idx": 12.0, "loss/logits": 0.1364823430776596, "step": 266 }, { "epoch": 0.00398685978796476, "grad_norm": 0.6015625, "grad_norm_var": 0.004725074768066407, "learning_rate": 2e-05, "loss": 1.3285, "loss/crossentropy": 2.488020896911621, "loss/dist_ce": 0.0, "loss/fcd": 1.140625, "loss/idx": 12.0, "loss/logits": 0.18785008788108826, "step": 267 }, { "epoch": 0.004001791847095714, "grad_norm": 0.5390625, "grad_norm_var": 0.0046656290690104164, "learning_rate": 2e-05, "loss": 1.3112, "loss/crossentropy": 2.3776917457580566, "loss/dist_ce": 0.0, "loss/fcd": 1.1328125, "loss/idx": 12.0, "loss/logits": 0.17841312289237976, "step": 268 }, { "epoch": 0.004016723906226669, "grad_norm": 0.70703125, "grad_norm_var": 0.006075286865234375, "learning_rate": 2e-05, "loss": 1.4075, "loss/crossentropy": 2.6192870140075684, "loss/dist_ce": 0.0, "loss/fcd": 1.2265625, "loss/idx": 12.0, "loss/logits": 0.18098263442516327, "step": 269 }, { "epoch": 0.004031655965357623, "grad_norm": 0.5859375, "grad_norm_var": 0.005803934733072917, "learning_rate": 2e-05, "loss": 1.2378, "loss/crossentropy": 2.7433526515960693, "loss/dist_ce": 0.0, "loss/fcd": 1.0859375, "loss/idx": 12.0, "loss/logits": 0.15190985798835754, "step": 270 }, { "epoch": 0.004046588024488577, "grad_norm": 0.515625, "grad_norm_var": 0.005597178141276042, "learning_rate": 2e-05, "loss": 1.2776, "loss/crossentropy": 2.572636842727661, "loss/dist_ce": 0.0, "loss/fcd": 1.1015625, "loss/idx": 12.0, "loss/logits": 0.17604324221611023, "step": 271 }, { "epoch": 0.004061520083619531, "grad_norm": 0.55078125, "grad_norm_var": 0.0050343672434488935, "learning_rate": 2e-05, "loss": 1.3095, "loss/crossentropy": 2.5277881622314453, "loss/dist_ce": 0.0, "loss/fcd": 1.1328125, "loss/idx": 12.0, "loss/logits": 0.17667779326438904, "step": 272 }, { "epoch": 0.004076452142750485, "grad_norm": 0.54296875, "grad_norm_var": 0.005008427302042643, "learning_rate": 2e-05, "loss": 1.2538, "loss/crossentropy": 2.541118621826172, "loss/dist_ce": 0.0, "loss/fcd": 1.0859375, "loss/idx": 12.0, "loss/logits": 0.1678951382637024, "step": 273 }, { "epoch": 0.0040913842018814395, "grad_norm": 0.51171875, "grad_norm_var": 0.0036959171295166014, "learning_rate": 2e-05, "loss": 1.205, "loss/crossentropy": 2.4771127700805664, "loss/dist_ce": 0.0, "loss/fcd": 1.0390625, "loss/idx": 12.0, "loss/logits": 0.16591356694698334, "step": 274 }, { "epoch": 0.004106316261012394, "grad_norm": 0.52734375, "grad_norm_var": 0.0037110487620035807, "learning_rate": 2e-05, "loss": 1.2732, "loss/crossentropy": 2.7245373725891113, "loss/dist_ce": 0.0, "loss/fcd": 1.1015625, "loss/idx": 12.0, "loss/logits": 0.1716119647026062, "step": 275 }, { "epoch": 0.004121248320143348, "grad_norm": 0.62109375, "grad_norm_var": 0.003930393854777018, "learning_rate": 2e-05, "loss": 1.3769, "loss/crossentropy": 2.581962823867798, "loss/dist_ce": 0.0, "loss/fcd": 1.203125, "loss/idx": 12.0, "loss/logits": 0.17378033697605133, "step": 276 }, { "epoch": 0.004136180379274302, "grad_norm": 0.51953125, "grad_norm_var": 0.004032627741495768, "learning_rate": 2e-05, "loss": 1.203, "loss/crossentropy": 2.607046604156494, "loss/dist_ce": 0.0, "loss/fcd": 1.046875, "loss/idx": 12.0, "loss/logits": 0.15608051419258118, "step": 277 }, { "epoch": 0.004151112438405256, "grad_norm": 0.515625, "grad_norm_var": 0.0032010237375895184, "learning_rate": 2e-05, "loss": 1.1885, "loss/crossentropy": 2.6051206588745117, "loss/dist_ce": 0.0, "loss/fcd": 1.0390625, "loss/idx": 12.0, "loss/logits": 0.1494494080543518, "step": 278 }, { "epoch": 0.00416604449753621, "grad_norm": 0.6015625, "grad_norm_var": 0.0032595157623291015, "learning_rate": 2e-05, "loss": 1.3056, "loss/crossentropy": 2.5447115898132324, "loss/dist_ce": 0.0, "loss/fcd": 1.125, "loss/idx": 12.0, "loss/logits": 0.18058553338050842, "step": 279 }, { "epoch": 0.004180976556667164, "grad_norm": 0.99609375, "grad_norm_var": 0.015116866429646809, "learning_rate": 2e-05, "loss": 1.4477, "loss/crossentropy": 2.5050199031829834, "loss/dist_ce": 0.0, "loss/fcd": 1.28125, "loss/idx": 12.0, "loss/logits": 0.1664591133594513, "step": 280 }, { "epoch": 0.004195908615798118, "grad_norm": 0.52734375, "grad_norm_var": 0.015258391698201498, "learning_rate": 2e-05, "loss": 1.2369, "loss/crossentropy": 2.526259422302246, "loss/dist_ce": 0.0, "loss/fcd": 1.0703125, "loss/idx": 12.0, "loss/logits": 0.1665583997964859, "step": 281 }, { "epoch": 0.004210840674929073, "grad_norm": 0.5234375, "grad_norm_var": 0.014697710673014322, "learning_rate": 2e-05, "loss": 1.2211, "loss/crossentropy": 2.4373950958251953, "loss/dist_ce": 0.0, "loss/fcd": 1.0703125, "loss/idx": 12.0, "loss/logits": 0.15083222091197968, "step": 282 }, { "epoch": 0.004225772734060027, "grad_norm": 1.9375, "grad_norm_var": 0.12889601389567057, "learning_rate": 2e-05, "loss": 1.5671, "loss/crossentropy": 2.5966246128082275, "loss/dist_ce": 0.0, "loss/fcd": 1.34375, "loss/idx": 12.0, "loss/logits": 0.2233429104089737, "step": 283 }, { "epoch": 0.004240704793190981, "grad_norm": 0.5078125, "grad_norm_var": 0.12950331370035809, "learning_rate": 2e-05, "loss": 1.1653, "loss/crossentropy": 2.717890739440918, "loss/dist_ce": 0.0, "loss/fcd": 1.0234375, "loss/idx": 12.0, "loss/logits": 0.14189787209033966, "step": 284 }, { "epoch": 0.004255636852321935, "grad_norm": 0.5625, "grad_norm_var": 0.13006083170572916, "learning_rate": 2e-05, "loss": 1.2765, "loss/crossentropy": 2.6981263160705566, "loss/dist_ce": 0.0, "loss/fcd": 1.1015625, "loss/idx": 12.0, "loss/logits": 0.17491918802261353, "step": 285 }, { "epoch": 0.0042705689114528895, "grad_norm": 0.515625, "grad_norm_var": 0.13105646769205728, "learning_rate": 2e-05, "loss": 1.3008, "loss/crossentropy": 2.629808187484741, "loss/dist_ce": 0.0, "loss/fcd": 1.1328125, "loss/idx": 12.0, "loss/logits": 0.16800275444984436, "step": 286 }, { "epoch": 0.004285500970583844, "grad_norm": 0.55078125, "grad_norm_var": 0.1304814020792643, "learning_rate": 2e-05, "loss": 1.2972, "loss/crossentropy": 2.7882864475250244, "loss/dist_ce": 0.0, "loss/fcd": 1.109375, "loss/idx": 12.0, "loss/logits": 0.1878424435853958, "step": 287 }, { "epoch": 0.004300433029714798, "grad_norm": 0.51953125, "grad_norm_var": 0.1309849421183268, "learning_rate": 2e-05, "loss": 1.2366, "loss/crossentropy": 2.4925997257232666, "loss/dist_ce": 0.0, "loss/fcd": 1.0703125, "loss/idx": 12.0, "loss/logits": 0.16624397039413452, "step": 288 }, { "epoch": 0.004315365088845752, "grad_norm": 0.5625, "grad_norm_var": 0.13071695963541666, "learning_rate": 2e-05, "loss": 1.2994, "loss/crossentropy": 2.804802417755127, "loss/dist_ce": 0.0, "loss/fcd": 1.1171875, "loss/idx": 12.0, "loss/logits": 0.18225803971290588, "step": 289 }, { "epoch": 0.004330297147976706, "grad_norm": 0.515625, "grad_norm_var": 0.13064263661702474, "learning_rate": 2e-05, "loss": 1.2799, "loss/crossentropy": 2.5255799293518066, "loss/dist_ce": 0.0, "loss/fcd": 1.09375, "loss/idx": 12.0, "loss/logits": 0.1861756294965744, "step": 290 }, { "epoch": 0.00434522920710766, "grad_norm": 0.55859375, "grad_norm_var": 0.13016554514567058, "learning_rate": 2e-05, "loss": 1.3202, "loss/crossentropy": 2.419299602508545, "loss/dist_ce": 0.0, "loss/fcd": 1.140625, "loss/idx": 12.0, "loss/logits": 0.179526686668396, "step": 291 }, { "epoch": 0.004360161266238614, "grad_norm": 0.490234375, "grad_norm_var": 0.13188754717508952, "learning_rate": 2e-05, "loss": 1.1478, "loss/crossentropy": 2.4527664184570312, "loss/dist_ce": 0.0, "loss/fcd": 1.0078125, "loss/idx": 12.0, "loss/logits": 0.14002804458141327, "step": 292 }, { "epoch": 0.004375093325369568, "grad_norm": 0.58984375, "grad_norm_var": 0.13097087542215982, "learning_rate": 2e-05, "loss": 1.334, "loss/crossentropy": 2.6172571182250977, "loss/dist_ce": 0.0, "loss/fcd": 1.15625, "loss/idx": 12.0, "loss/logits": 0.17773565649986267, "step": 293 }, { "epoch": 0.004390025384500522, "grad_norm": 0.578125, "grad_norm_var": 0.130056365331014, "learning_rate": 2e-05, "loss": 1.2928, "loss/crossentropy": 2.644300699234009, "loss/dist_ce": 0.0, "loss/fcd": 1.1171875, "loss/idx": 12.0, "loss/logits": 0.17558696866035461, "step": 294 }, { "epoch": 0.004404957443631477, "grad_norm": 0.52734375, "grad_norm_var": 0.13096477190653483, "learning_rate": 2e-05, "loss": 1.2111, "loss/crossentropy": 2.701231002807617, "loss/dist_ce": 0.0, "loss/fcd": 1.0703125, "loss/idx": 12.0, "loss/logits": 0.14079666137695312, "step": 295 }, { "epoch": 0.004419889502762431, "grad_norm": 0.498046875, "grad_norm_var": 0.12374617258707682, "learning_rate": 2e-05, "loss": 1.1536, "loss/crossentropy": 2.5944371223449707, "loss/dist_ce": 0.0, "loss/fcd": 1.0078125, "loss/idx": 12.0, "loss/logits": 0.14583569765090942, "step": 296 }, { "epoch": 0.004434821561893385, "grad_norm": 1.2265625, "grad_norm_var": 0.14540328979492187, "learning_rate": 2e-05, "loss": 1.3935, "loss/crossentropy": 2.704197883605957, "loss/dist_ce": 0.0, "loss/fcd": 1.1875, "loss/idx": 12.0, "loss/logits": 0.205990731716156, "step": 297 }, { "epoch": 0.0044497536210243395, "grad_norm": 0.5859375, "grad_norm_var": 0.14445521036783854, "learning_rate": 2e-05, "loss": 1.3361, "loss/crossentropy": 2.60979962348938, "loss/dist_ce": 0.0, "loss/fcd": 1.1484375, "loss/idx": 12.0, "loss/logits": 0.18767428398132324, "step": 298 }, { "epoch": 0.0044646856801552936, "grad_norm": 0.52734375, "grad_norm_var": 0.030499712626139323, "learning_rate": 2e-05, "loss": 1.3711, "loss/crossentropy": 2.3737831115722656, "loss/dist_ce": 0.0, "loss/fcd": 1.171875, "loss/idx": 12.0, "loss/logits": 0.1992519050836563, "step": 299 }, { "epoch": 0.004479617739286248, "grad_norm": 0.5625, "grad_norm_var": 0.03014367421468099, "learning_rate": 2e-05, "loss": 1.3883, "loss/crossentropy": 2.3658077716827393, "loss/dist_ce": 0.0, "loss/fcd": 1.203125, "loss/idx": 12.0, "loss/logits": 0.18512727320194244, "step": 300 }, { "epoch": 0.004494549798417202, "grad_norm": 0.5234375, "grad_norm_var": 0.030359840393066405, "learning_rate": 2e-05, "loss": 1.2623, "loss/crossentropy": 2.541961431503296, "loss/dist_ce": 0.0, "loss/fcd": 1.0859375, "loss/idx": 12.0, "loss/logits": 0.1764090359210968, "step": 301 }, { "epoch": 0.004509481857548156, "grad_norm": 0.55859375, "grad_norm_var": 0.030087788899739582, "learning_rate": 2e-05, "loss": 1.2756, "loss/crossentropy": 2.721057891845703, "loss/dist_ce": 0.0, "loss/fcd": 1.1015625, "loss/idx": 12.0, "loss/logits": 0.17400358617305756, "step": 302 }, { "epoch": 0.00452441391667911, "grad_norm": 0.61328125, "grad_norm_var": 0.030038960774739585, "learning_rate": 2e-05, "loss": 1.3062, "loss/crossentropy": 2.519401788711548, "loss/dist_ce": 0.0, "loss/fcd": 1.140625, "loss/idx": 12.0, "loss/logits": 0.16561126708984375, "step": 303 }, { "epoch": 0.004539345975810064, "grad_norm": 0.640625, "grad_norm_var": 0.02982018788655599, "learning_rate": 2e-05, "loss": 1.2715, "loss/crossentropy": 2.8109989166259766, "loss/dist_ce": 0.0, "loss/fcd": 1.109375, "loss/idx": 12.0, "loss/logits": 0.1621253490447998, "step": 304 }, { "epoch": 0.004554278034941018, "grad_norm": 0.5546875, "grad_norm_var": 0.02986036936442057, "learning_rate": 2e-05, "loss": 1.1901, "loss/crossentropy": 2.4882760047912598, "loss/dist_ce": 0.0, "loss/fcd": 1.0390625, "loss/idx": 12.0, "loss/logits": 0.15099531412124634, "step": 305 }, { "epoch": 0.004569210094071972, "grad_norm": 0.60546875, "grad_norm_var": 0.029390970865885418, "learning_rate": 2e-05, "loss": 1.3448, "loss/crossentropy": 2.2777934074401855, "loss/dist_ce": 0.0, "loss/fcd": 1.1484375, "loss/idx": 12.0, "loss/logits": 0.19636347889900208, "step": 306 }, { "epoch": 0.004584142153202926, "grad_norm": 0.478515625, "grad_norm_var": 0.03026096026102702, "learning_rate": 2e-05, "loss": 1.1451, "loss/crossentropy": 2.3440771102905273, "loss/dist_ce": 0.0, "loss/fcd": 1.015625, "loss/idx": 12.0, "loss/logits": 0.12943994998931885, "step": 307 }, { "epoch": 0.004599074212333881, "grad_norm": 0.490234375, "grad_norm_var": 0.03026096026102702, "learning_rate": 2e-05, "loss": 1.2176, "loss/crossentropy": 2.565845251083374, "loss/dist_ce": 0.0, "loss/fcd": 1.0625, "loss/idx": 12.0, "loss/logits": 0.15513211488723755, "step": 308 }, { "epoch": 0.004614006271464835, "grad_norm": 0.78125, "grad_norm_var": 0.03235446612040202, "learning_rate": 2e-05, "loss": 1.3562, "loss/crossentropy": 2.702106475830078, "loss/dist_ce": 0.0, "loss/fcd": 1.1796875, "loss/idx": 12.0, "loss/logits": 0.17654916644096375, "step": 309 }, { "epoch": 0.0046289383305957894, "grad_norm": 0.5234375, "grad_norm_var": 0.03277014096577962, "learning_rate": 2e-05, "loss": 1.2382, "loss/crossentropy": 2.545269727706909, "loss/dist_ce": 0.0, "loss/fcd": 1.0703125, "loss/idx": 12.0, "loss/logits": 0.16790466010570526, "step": 310 }, { "epoch": 0.0046438703897267435, "grad_norm": 0.54296875, "grad_norm_var": 0.032621367772420244, "learning_rate": 2e-05, "loss": 1.2643, "loss/crossentropy": 2.53337025642395, "loss/dist_ce": 0.0, "loss/fcd": 1.09375, "loss/idx": 12.0, "loss/logits": 0.17057746648788452, "step": 311 }, { "epoch": 0.004658802448857698, "grad_norm": 0.61328125, "grad_norm_var": 0.03177642822265625, "learning_rate": 2e-05, "loss": 1.3344, "loss/crossentropy": 2.68241548538208, "loss/dist_ce": 0.0, "loss/fcd": 1.140625, "loss/idx": 12.0, "loss/logits": 0.1938180923461914, "step": 312 }, { "epoch": 0.004673734507988652, "grad_norm": 0.58984375, "grad_norm_var": 0.005132484436035156, "learning_rate": 2e-05, "loss": 1.3496, "loss/crossentropy": 2.7061564922332764, "loss/dist_ce": 0.0, "loss/fcd": 1.1484375, "loss/idx": 12.0, "loss/logits": 0.201199471950531, "step": 313 }, { "epoch": 0.004688666567119606, "grad_norm": 0.67578125, "grad_norm_var": 0.005774434407552083, "learning_rate": 2e-05, "loss": 1.4513, "loss/crossentropy": 2.6005072593688965, "loss/dist_ce": 0.0, "loss/fcd": 1.2421875, "loss/idx": 12.0, "loss/logits": 0.20914174616336823, "step": 314 }, { "epoch": 0.00470359862625056, "grad_norm": 0.546875, "grad_norm_var": 0.00566094716389974, "learning_rate": 2e-05, "loss": 1.2654, "loss/crossentropy": 2.7302968502044678, "loss/dist_ce": 0.0, "loss/fcd": 1.09375, "loss/idx": 12.0, "loss/logits": 0.1716044545173645, "step": 315 }, { "epoch": 0.004718530685381514, "grad_norm": 0.66796875, "grad_norm_var": 0.0060918172200520836, "learning_rate": 2e-05, "loss": 1.5089, "loss/crossentropy": 2.707127094268799, "loss/dist_ce": 0.0, "loss/fcd": 1.2734375, "loss/idx": 12.0, "loss/logits": 0.23551242053508759, "step": 316 }, { "epoch": 0.004733462744512468, "grad_norm": 0.5703125, "grad_norm_var": 0.005826314290364583, "learning_rate": 2e-05, "loss": 1.2823, "loss/crossentropy": 2.592078924179077, "loss/dist_ce": 0.0, "loss/fcd": 1.1171875, "loss/idx": 12.0, "loss/logits": 0.16508805751800537, "step": 317 }, { "epoch": 0.004748394803643422, "grad_norm": 0.57421875, "grad_norm_var": 0.005774434407552083, "learning_rate": 2e-05, "loss": 1.2459, "loss/crossentropy": 2.4738409519195557, "loss/dist_ce": 0.0, "loss/fcd": 1.078125, "loss/idx": 12.0, "loss/logits": 0.16779488325119019, "step": 318 }, { "epoch": 0.004763326862774376, "grad_norm": 0.5625, "grad_norm_var": 0.005790138244628906, "learning_rate": 2e-05, "loss": 1.3254, "loss/crossentropy": 2.321881055831909, "loss/dist_ce": 0.0, "loss/fcd": 1.1484375, "loss/idx": 12.0, "loss/logits": 0.1769586205482483, "step": 319 }, { "epoch": 0.00477825892190533, "grad_norm": 0.55859375, "grad_norm_var": 0.005641937255859375, "learning_rate": 2e-05, "loss": 1.2608, "loss/crossentropy": 2.6014082431793213, "loss/dist_ce": 0.0, "loss/fcd": 1.0859375, "loss/idx": 12.0, "loss/logits": 0.17482781410217285, "step": 320 }, { "epoch": 0.004793190981036285, "grad_norm": 0.66015625, "grad_norm_var": 0.005932044982910156, "learning_rate": 2e-05, "loss": 1.3777, "loss/crossentropy": 2.7242867946624756, "loss/dist_ce": 0.0, "loss/fcd": 1.171875, "loss/idx": 12.0, "loss/logits": 0.20579351484775543, "step": 321 }, { "epoch": 0.004808123040167239, "grad_norm": 0.53515625, "grad_norm_var": 0.006096839904785156, "learning_rate": 2e-05, "loss": 1.2116, "loss/crossentropy": 2.6165010929107666, "loss/dist_ce": 0.0, "loss/fcd": 1.0546875, "loss/idx": 12.0, "loss/logits": 0.1569620966911316, "step": 322 }, { "epoch": 0.0048230550992981935, "grad_norm": 0.55859375, "grad_norm_var": 0.005353275934855143, "learning_rate": 2e-05, "loss": 1.2708, "loss/crossentropy": 2.8225040435791016, "loss/dist_ce": 0.0, "loss/fcd": 1.1015625, "loss/idx": 12.0, "loss/logits": 0.16922032833099365, "step": 323 }, { "epoch": 0.004837987158429148, "grad_norm": 0.56640625, "grad_norm_var": 0.004695574442545573, "learning_rate": 2e-05, "loss": 1.2872, "loss/crossentropy": 2.497213125228882, "loss/dist_ce": 0.0, "loss/fcd": 1.125, "loss/idx": 12.0, "loss/logits": 0.16218045353889465, "step": 324 }, { "epoch": 0.004852919217560102, "grad_norm": 0.7265625, "grad_norm_var": 0.0035277684529622395, "learning_rate": 2e-05, "loss": 1.5149, "loss/crossentropy": 2.276867389678955, "loss/dist_ce": 0.0, "loss/fcd": 1.2890625, "loss/idx": 12.0, "loss/logits": 0.22580742835998535, "step": 325 }, { "epoch": 0.004867851276691056, "grad_norm": 0.52734375, "grad_norm_var": 0.0034929911295572915, "learning_rate": 2e-05, "loss": 1.2771, "loss/crossentropy": 2.719574213027954, "loss/dist_ce": 0.0, "loss/fcd": 1.09375, "loss/idx": 12.0, "loss/logits": 0.18333600461483002, "step": 326 }, { "epoch": 0.00488278333582201, "grad_norm": 0.58203125, "grad_norm_var": 0.0033315022786458335, "learning_rate": 2e-05, "loss": 1.4085, "loss/crossentropy": 2.591456413269043, "loss/dist_ce": 0.0, "loss/fcd": 1.2109375, "loss/idx": 12.0, "loss/logits": 0.1975385546684265, "step": 327 }, { "epoch": 0.004897715394952964, "grad_norm": 0.54296875, "grad_norm_var": 0.0034665425618489584, "learning_rate": 2e-05, "loss": 1.2639, "loss/crossentropy": 2.5922658443450928, "loss/dist_ce": 0.0, "loss/fcd": 1.078125, "loss/idx": 12.0, "loss/logits": 0.18573111295700073, "step": 328 }, { "epoch": 0.004912647454083918, "grad_norm": 0.52734375, "grad_norm_var": 0.003714752197265625, "learning_rate": 2e-05, "loss": 1.1678, "loss/crossentropy": 2.6114420890808105, "loss/dist_ce": 0.0, "loss/fcd": 1.015625, "loss/idx": 12.0, "loss/logits": 0.1521402895450592, "step": 329 }, { "epoch": 0.004927579513214872, "grad_norm": 0.6171875, "grad_norm_var": 0.003231239318847656, "learning_rate": 2e-05, "loss": 1.3684, "loss/crossentropy": 2.68989896774292, "loss/dist_ce": 0.0, "loss/fcd": 1.171875, "loss/idx": 12.0, "loss/logits": 0.1965211182832718, "step": 330 }, { "epoch": 0.004942511572345826, "grad_norm": 0.63671875, "grad_norm_var": 0.003305816650390625, "learning_rate": 2e-05, "loss": 1.2985, "loss/crossentropy": 2.6312994956970215, "loss/dist_ce": 0.0, "loss/fcd": 1.109375, "loss/idx": 12.0, "loss/logits": 0.18911811709403992, "step": 331 }, { "epoch": 0.00495744363147678, "grad_norm": 0.52734375, "grad_norm_var": 0.003049468994140625, "learning_rate": 2e-05, "loss": 1.2595, "loss/crossentropy": 2.5765206813812256, "loss/dist_ce": 0.0, "loss/fcd": 1.0859375, "loss/idx": 12.0, "loss/logits": 0.17355464398860931, "step": 332 }, { "epoch": 0.004972375690607734, "grad_norm": 0.57421875, "grad_norm_var": 0.0030455907185872396, "learning_rate": 2e-05, "loss": 1.463, "loss/crossentropy": 2.599970579147339, "loss/dist_ce": 0.0, "loss/fcd": 1.2421875, "loss/idx": 12.0, "loss/logits": 0.22083698213100433, "step": 333 }, { "epoch": 0.004987307749738689, "grad_norm": 0.5703125, "grad_norm_var": 0.003049468994140625, "learning_rate": 2e-05, "loss": 1.3341, "loss/crossentropy": 2.6213738918304443, "loss/dist_ce": 0.0, "loss/fcd": 1.15625, "loss/idx": 12.0, "loss/logits": 0.17781299352645874, "step": 334 }, { "epoch": 0.0050022398088696435, "grad_norm": 0.55078125, "grad_norm_var": 0.003084754943847656, "learning_rate": 2e-05, "loss": 1.2978, "loss/crossentropy": 2.6995716094970703, "loss/dist_ce": 0.0, "loss/fcd": 1.125, "loss/idx": 12.0, "loss/logits": 0.1727505922317505, "step": 335 }, { "epoch": 0.005017171868000598, "grad_norm": 0.51171875, "grad_norm_var": 0.0033487319946289063, "learning_rate": 2e-05, "loss": 1.1847, "loss/crossentropy": 2.6471781730651855, "loss/dist_ce": 0.0, "loss/fcd": 1.03125, "loss/idx": 12.0, "loss/logits": 0.15342769026756287, "step": 336 }, { "epoch": 0.005032103927131552, "grad_norm": 0.66015625, "grad_norm_var": 0.0033487319946289063, "learning_rate": 2e-05, "loss": 1.5682, "loss/crossentropy": 2.331575393676758, "loss/dist_ce": 0.0, "loss/fcd": 1.3359375, "loss/idx": 12.0, "loss/logits": 0.2322523295879364, "step": 337 }, { "epoch": 0.005047035986262506, "grad_norm": 0.51171875, "grad_norm_var": 0.0035104751586914062, "learning_rate": 2e-05, "loss": 1.2217, "loss/crossentropy": 2.6712942123413086, "loss/dist_ce": 0.0, "loss/fcd": 1.0625, "loss/idx": 12.0, "loss/logits": 0.15924036502838135, "step": 338 }, { "epoch": 0.00506196804539346, "grad_norm": 0.498046875, "grad_norm_var": 0.0038677056630452475, "learning_rate": 2e-05, "loss": 1.1757, "loss/crossentropy": 2.6050660610198975, "loss/dist_ce": 0.0, "loss/fcd": 1.015625, "loss/idx": 12.0, "loss/logits": 0.16003666818141937, "step": 339 }, { "epoch": 0.005076900104524414, "grad_norm": 0.5078125, "grad_norm_var": 0.004115660985310872, "learning_rate": 2e-05, "loss": 1.2327, "loss/crossentropy": 2.417349100112915, "loss/dist_ce": 0.0, "loss/fcd": 1.0703125, "loss/idx": 12.0, "loss/logits": 0.16236600279808044, "step": 340 }, { "epoch": 0.005091832163655368, "grad_norm": 0.57421875, "grad_norm_var": 0.0023254235585530598, "learning_rate": 2e-05, "loss": 1.3032, "loss/crossentropy": 2.8112549781799316, "loss/dist_ce": 0.0, "loss/fcd": 1.1171875, "loss/idx": 12.0, "loss/logits": 0.1859789788722992, "step": 341 }, { "epoch": 0.005106764222786322, "grad_norm": 0.625, "grad_norm_var": 0.0025288740793863933, "learning_rate": 2e-05, "loss": 1.3182, "loss/crossentropy": 2.38157057762146, "loss/dist_ce": 0.0, "loss/fcd": 1.140625, "loss/idx": 12.0, "loss/logits": 0.17755183577537537, "step": 342 }, { "epoch": 0.005121696281917276, "grad_norm": 0.61328125, "grad_norm_var": 0.0026667118072509766, "learning_rate": 2e-05, "loss": 1.2629, "loss/crossentropy": 2.5023629665374756, "loss/dist_ce": 0.0, "loss/fcd": 1.09375, "loss/idx": 12.0, "loss/logits": 0.16914424300193787, "step": 343 }, { "epoch": 0.00513662834104823, "grad_norm": 0.6328125, "grad_norm_var": 0.00290067990620931, "learning_rate": 2e-05, "loss": 1.4134, "loss/crossentropy": 2.5533506870269775, "loss/dist_ce": 0.0, "loss/fcd": 1.2109375, "loss/idx": 12.0, "loss/logits": 0.2024352252483368, "step": 344 }, { "epoch": 0.005151560400179184, "grad_norm": 0.52734375, "grad_norm_var": 0.00290067990620931, "learning_rate": 2e-05, "loss": 1.1969, "loss/crossentropy": 2.6583385467529297, "loss/dist_ce": 0.0, "loss/fcd": 1.0390625, "loss/idx": 12.0, "loss/logits": 0.15788167715072632, "step": 345 }, { "epoch": 0.0051664924593101385, "grad_norm": 0.51953125, "grad_norm_var": 0.002897500991821289, "learning_rate": 2e-05, "loss": 1.1828, "loss/crossentropy": 2.7132012844085693, "loss/dist_ce": 0.0, "loss/fcd": 1.03125, "loss/idx": 12.0, "loss/logits": 0.1515815556049347, "step": 346 }, { "epoch": 0.0051814245184410935, "grad_norm": 0.52734375, "grad_norm_var": 0.002600208918253581, "learning_rate": 2e-05, "loss": 1.1514, "loss/crossentropy": 2.7961926460266113, "loss/dist_ce": 0.0, "loss/fcd": 1.0078125, "loss/idx": 12.0, "loss/logits": 0.14358270168304443, "step": 347 }, { "epoch": 0.0051963565775720475, "grad_norm": 0.50390625, "grad_norm_var": 0.0027310530344645183, "learning_rate": 2e-05, "loss": 1.1785, "loss/crossentropy": 2.5494437217712402, "loss/dist_ce": 0.0, "loss/fcd": 1.0234375, "loss/idx": 12.0, "loss/logits": 0.15509198606014252, "step": 348 }, { "epoch": 0.005211288636703002, "grad_norm": 0.5625, "grad_norm_var": 0.002712361017862956, "learning_rate": 2e-05, "loss": 1.3236, "loss/crossentropy": 2.6538543701171875, "loss/dist_ce": 0.0, "loss/fcd": 1.125, "loss/idx": 12.0, "loss/logits": 0.1985650658607483, "step": 349 }, { "epoch": 0.005226220695833956, "grad_norm": 0.55859375, "grad_norm_var": 0.0026986281077067058, "learning_rate": 2e-05, "loss": 1.3186, "loss/crossentropy": 2.7938647270202637, "loss/dist_ce": 0.0, "loss/fcd": 1.1328125, "loss/idx": 12.0, "loss/logits": 0.18574939668178558, "step": 350 }, { "epoch": 0.00524115275496491, "grad_norm": 0.56640625, "grad_norm_var": 0.002704477310180664, "learning_rate": 2e-05, "loss": 1.2769, "loss/crossentropy": 2.6201303005218506, "loss/dist_ce": 0.0, "loss/fcd": 1.109375, "loss/idx": 12.0, "loss/logits": 0.16753770411014557, "step": 351 }, { "epoch": 0.005256084814095864, "grad_norm": 0.60546875, "grad_norm_var": 0.002696847915649414, "learning_rate": 2e-05, "loss": 1.4235, "loss/crossentropy": 2.322477102279663, "loss/dist_ce": 0.0, "loss/fcd": 1.234375, "loss/idx": 12.0, "loss/logits": 0.18908751010894775, "step": 352 }, { "epoch": 0.005271016873226818, "grad_norm": 0.59765625, "grad_norm_var": 0.0021241346995035807, "learning_rate": 2e-05, "loss": 1.3541, "loss/crossentropy": 2.4053268432617188, "loss/dist_ce": 0.0, "loss/fcd": 1.15625, "loss/idx": 12.0, "loss/logits": 0.1978444755077362, "step": 353 }, { "epoch": 0.005285948932357772, "grad_norm": 0.58203125, "grad_norm_var": 0.0019971052805582683, "learning_rate": 2e-05, "loss": 1.3755, "loss/crossentropy": 2.505030393600464, "loss/dist_ce": 0.0, "loss/fcd": 1.1875, "loss/idx": 12.0, "loss/logits": 0.18796448409557343, "step": 354 }, { "epoch": 0.005300880991488726, "grad_norm": 0.5234375, "grad_norm_var": 0.0018187840779622397, "learning_rate": 2e-05, "loss": 1.213, "loss/crossentropy": 2.673353433609009, "loss/dist_ce": 0.0, "loss/fcd": 1.046875, "loss/idx": 12.0, "loss/logits": 0.1661037802696228, "step": 355 }, { "epoch": 0.00531581305061968, "grad_norm": 1.6015625, "grad_norm_var": 0.06836236317952474, "learning_rate": 2e-05, "loss": 1.3959, "loss/crossentropy": 1.8594982624053955, "loss/dist_ce": 0.0, "loss/fcd": 1.265625, "loss/idx": 12.0, "loss/logits": 0.1302594691514969, "step": 356 }, { "epoch": 0.005330745109750634, "grad_norm": 0.69140625, "grad_norm_var": 0.06830895741780599, "learning_rate": 2e-05, "loss": 1.4121, "loss/crossentropy": 2.4870803356170654, "loss/dist_ce": 0.0, "loss/fcd": 1.21875, "loss/idx": 12.0, "loss/logits": 0.19338038563728333, "step": 357 }, { "epoch": 0.0053456771688815885, "grad_norm": 0.5625, "grad_norm_var": 0.06867720286051432, "learning_rate": 2e-05, "loss": 1.3687, "loss/crossentropy": 2.3528172969818115, "loss/dist_ce": 0.0, "loss/fcd": 1.171875, "loss/idx": 12.0, "loss/logits": 0.19687314331531525, "step": 358 }, { "epoch": 0.0053606092280125426, "grad_norm": 0.57421875, "grad_norm_var": 0.06889082590738932, "learning_rate": 2e-05, "loss": 1.4012, "loss/crossentropy": 2.2566211223602295, "loss/dist_ce": 0.0, "loss/fcd": 1.21875, "loss/idx": 12.0, "loss/logits": 0.18240945041179657, "step": 359 }, { "epoch": 0.0053755412871434975, "grad_norm": 0.54296875, "grad_norm_var": 0.06940409342447916, "learning_rate": 2e-05, "loss": 1.3009, "loss/crossentropy": 2.8327674865722656, "loss/dist_ce": 0.0, "loss/fcd": 1.109375, "loss/idx": 12.0, "loss/logits": 0.19157303869724274, "step": 360 }, { "epoch": 0.005390473346274452, "grad_norm": 0.4921875, "grad_norm_var": 0.06995283762613932, "learning_rate": 2e-05, "loss": 1.2257, "loss/crossentropy": 2.573021173477173, "loss/dist_ce": 0.0, "loss/fcd": 1.0625, "loss/idx": 12.0, "loss/logits": 0.16322672367095947, "step": 361 }, { "epoch": 0.005405405405405406, "grad_norm": 0.5546875, "grad_norm_var": 0.06953226725260417, "learning_rate": 2e-05, "loss": 1.2096, "loss/crossentropy": 2.7299983501434326, "loss/dist_ce": 0.0, "loss/fcd": 1.0546875, "loss/idx": 12.0, "loss/logits": 0.1548905074596405, "step": 362 }, { "epoch": 0.00542033746453636, "grad_norm": 0.5859375, "grad_norm_var": 0.06896101633707682, "learning_rate": 2e-05, "loss": 1.3065, "loss/crossentropy": 2.708285331726074, "loss/dist_ce": 0.0, "loss/fcd": 1.125, "loss/idx": 12.0, "loss/logits": 0.18154960870742798, "step": 363 }, { "epoch": 0.005435269523667314, "grad_norm": 0.57421875, "grad_norm_var": 0.06807295481363933, "learning_rate": 2e-05, "loss": 1.2052, "loss/crossentropy": 2.836975336074829, "loss/dist_ce": 0.0, "loss/fcd": 1.046875, "loss/idx": 12.0, "loss/logits": 0.15837103128433228, "step": 364 }, { "epoch": 0.005450201582798268, "grad_norm": 0.8125, "grad_norm_var": 0.06952966054280599, "learning_rate": 2e-05, "loss": 1.7326, "loss/crossentropy": 2.6771881580352783, "loss/dist_ce": 0.0, "loss/fcd": 1.4453125, "loss/idx": 12.0, "loss/logits": 0.28733551502227783, "step": 365 }, { "epoch": 0.005465133641929222, "grad_norm": 0.5546875, "grad_norm_var": 0.06957906087239583, "learning_rate": 2e-05, "loss": 1.3377, "loss/crossentropy": 2.6525285243988037, "loss/dist_ce": 0.0, "loss/fcd": 1.15625, "loss/idx": 12.0, "loss/logits": 0.18148328363895416, "step": 366 }, { "epoch": 0.005480065701060176, "grad_norm": 0.52734375, "grad_norm_var": 0.07011693318684896, "learning_rate": 2e-05, "loss": 1.2411, "loss/crossentropy": 2.624660015106201, "loss/dist_ce": 0.0, "loss/fcd": 1.0859375, "loss/idx": 12.0, "loss/logits": 0.1551695019006729, "step": 367 }, { "epoch": 0.00549499776019113, "grad_norm": 0.62890625, "grad_norm_var": 0.07001546223958334, "learning_rate": 2e-05, "loss": 1.2908, "loss/crossentropy": 2.51216197013855, "loss/dist_ce": 0.0, "loss/fcd": 1.125, "loss/idx": 12.0, "loss/logits": 0.16580525040626526, "step": 368 }, { "epoch": 0.005509929819322084, "grad_norm": 0.62109375, "grad_norm_var": 0.06988499959309896, "learning_rate": 2e-05, "loss": 1.3284, "loss/crossentropy": 2.7236480712890625, "loss/dist_ce": 0.0, "loss/fcd": 1.140625, "loss/idx": 12.0, "loss/logits": 0.18772506713867188, "step": 369 }, { "epoch": 0.0055248618784530384, "grad_norm": 0.58203125, "grad_norm_var": 0.06988499959309896, "learning_rate": 2e-05, "loss": 1.2295, "loss/crossentropy": 2.7294671535491943, "loss/dist_ce": 0.0, "loss/fcd": 1.0625, "loss/idx": 12.0, "loss/logits": 0.16700142621994019, "step": 370 }, { "epoch": 0.0055397939375839925, "grad_norm": 0.6015625, "grad_norm_var": 0.06892878214518229, "learning_rate": 2e-05, "loss": 1.2704, "loss/crossentropy": 2.6511926651000977, "loss/dist_ce": 0.0, "loss/fcd": 1.1015625, "loss/idx": 12.0, "loss/logits": 0.16887199878692627, "step": 371 }, { "epoch": 0.005554725996714947, "grad_norm": 0.52734375, "grad_norm_var": 0.005724016825358073, "learning_rate": 2e-05, "loss": 1.2096, "loss/crossentropy": 2.696777105331421, "loss/dist_ce": 0.0, "loss/fcd": 1.046875, "loss/idx": 12.0, "loss/logits": 0.1627081334590912, "step": 372 }, { "epoch": 0.005569658055845901, "grad_norm": 0.5703125, "grad_norm_var": 0.004996744791666666, "learning_rate": 2e-05, "loss": 1.2566, "loss/crossentropy": 2.6416056156158447, "loss/dist_ce": 0.0, "loss/fcd": 1.0859375, "loss/idx": 12.0, "loss/logits": 0.1706182360649109, "step": 373 }, { "epoch": 0.005584590114976856, "grad_norm": 0.5390625, "grad_norm_var": 0.005092112223307291, "learning_rate": 2e-05, "loss": 1.2752, "loss/crossentropy": 2.701991319656372, "loss/dist_ce": 0.0, "loss/fcd": 1.109375, "loss/idx": 12.0, "loss/logits": 0.1657867729663849, "step": 374 }, { "epoch": 0.00559952217410781, "grad_norm": 0.51171875, "grad_norm_var": 0.005389149983723958, "learning_rate": 2e-05, "loss": 1.1155, "loss/crossentropy": 2.4490652084350586, "loss/dist_ce": 0.0, "loss/fcd": 0.9765625, "loss/idx": 12.0, "loss/logits": 0.1389380842447281, "step": 375 }, { "epoch": 0.005614454233238764, "grad_norm": 0.5390625, "grad_norm_var": 0.005407651265462239, "learning_rate": 2e-05, "loss": 1.239, "loss/crossentropy": 2.5287365913391113, "loss/dist_ce": 0.0, "loss/fcd": 1.078125, "loss/idx": 12.0, "loss/logits": 0.1609053909778595, "step": 376 }, { "epoch": 0.005629386292369718, "grad_norm": 0.48828125, "grad_norm_var": 0.005452473958333333, "learning_rate": 2e-05, "loss": 1.1656, "loss/crossentropy": 2.5641028881073, "loss/dist_ce": 0.0, "loss/fcd": 1.0234375, "loss/idx": 12.0, "loss/logits": 0.1421457827091217, "step": 377 }, { "epoch": 0.005644318351500672, "grad_norm": 0.51171875, "grad_norm_var": 0.005690956115722656, "learning_rate": 2e-05, "loss": 1.2171, "loss/crossentropy": 2.590167284011841, "loss/dist_ce": 0.0, "loss/fcd": 1.0625, "loss/idx": 12.0, "loss/logits": 0.1546175181865692, "step": 378 }, { "epoch": 0.005659250410631626, "grad_norm": 0.734375, "grad_norm_var": 0.007314491271972656, "learning_rate": 2e-05, "loss": 1.3905, "loss/crossentropy": 2.8093738555908203, "loss/dist_ce": 0.0, "loss/fcd": 1.1875, "loss/idx": 12.0, "loss/logits": 0.2029891312122345, "step": 379 }, { "epoch": 0.00567418246976258, "grad_norm": 0.58203125, "grad_norm_var": 0.0073094050089518225, "learning_rate": 2e-05, "loss": 1.412, "loss/crossentropy": 2.499774217605591, "loss/dist_ce": 0.0, "loss/fcd": 1.1953125, "loss/idx": 12.0, "loss/logits": 0.21667346358299255, "step": 380 }, { "epoch": 0.005689114528893534, "grad_norm": 0.55859375, "grad_norm_var": 0.0035776774088541667, "learning_rate": 2e-05, "loss": 1.1953, "loss/crossentropy": 2.5769855976104736, "loss/dist_ce": 0.0, "loss/fcd": 1.0390625, "loss/idx": 12.0, "loss/logits": 0.15628309547901154, "step": 381 }, { "epoch": 0.005704046588024488, "grad_norm": 0.5546875, "grad_norm_var": 0.0035776774088541667, "learning_rate": 2e-05, "loss": 1.3599, "loss/crossentropy": 2.3219430446624756, "loss/dist_ce": 0.0, "loss/fcd": 1.15625, "loss/idx": 12.0, "loss/logits": 0.20361411571502686, "step": 382 }, { "epoch": 0.0057189786471554425, "grad_norm": 0.498046875, "grad_norm_var": 0.0037877241770426433, "learning_rate": 2e-05, "loss": 1.1961, "loss/crossentropy": 2.521282911300659, "loss/dist_ce": 0.0, "loss/fcd": 1.046875, "loss/idx": 12.0, "loss/logits": 0.14920538663864136, "step": 383 }, { "epoch": 0.005733910706286397, "grad_norm": 0.49609375, "grad_norm_var": 0.0037682692209879557, "learning_rate": 2e-05, "loss": 1.2406, "loss/crossentropy": 2.704392671585083, "loss/dist_ce": 0.0, "loss/fcd": 1.0859375, "loss/idx": 12.0, "loss/logits": 0.15467111766338348, "step": 384 }, { "epoch": 0.005748842765417351, "grad_norm": 0.59765625, "grad_norm_var": 0.0036030928293863933, "learning_rate": 2e-05, "loss": 1.2842, "loss/crossentropy": 2.674487829208374, "loss/dist_ce": 0.0, "loss/fcd": 1.1015625, "loss/idx": 12.0, "loss/logits": 0.18259578943252563, "step": 385 }, { "epoch": 0.005763774824548305, "grad_norm": 0.498046875, "grad_norm_var": 0.003750038146972656, "learning_rate": 2e-05, "loss": 1.1793, "loss/crossentropy": 2.782691478729248, "loss/dist_ce": 0.0, "loss/fcd": 1.03125, "loss/idx": 12.0, "loss/logits": 0.14800548553466797, "step": 386 }, { "epoch": 0.00577870688367926, "grad_norm": 0.58203125, "grad_norm_var": 0.0036410013834635418, "learning_rate": 2e-05, "loss": 1.3885, "loss/crossentropy": 2.3553826808929443, "loss/dist_ce": 0.0, "loss/fcd": 1.1953125, "loss/idx": 12.0, "loss/logits": 0.1931438148021698, "step": 387 }, { "epoch": 0.005793638942810214, "grad_norm": 0.52734375, "grad_norm_var": 0.0036410013834635418, "learning_rate": 2e-05, "loss": 1.3019, "loss/crossentropy": 2.500701427459717, "loss/dist_ce": 0.0, "loss/fcd": 1.125, "loss/idx": 12.0, "loss/logits": 0.17689158022403717, "step": 388 }, { "epoch": 0.005808571001941168, "grad_norm": 0.53125, "grad_norm_var": 0.00362701416015625, "learning_rate": 2e-05, "loss": 1.3106, "loss/crossentropy": 2.561689615249634, "loss/dist_ce": 0.0, "loss/fcd": 1.125, "loss/idx": 12.0, "loss/logits": 0.1855672150850296, "step": 389 }, { "epoch": 0.005823503061072122, "grad_norm": 0.578125, "grad_norm_var": 0.0036816914876302083, "learning_rate": 2e-05, "loss": 1.1974, "loss/crossentropy": 2.4336016178131104, "loss/dist_ce": 0.0, "loss/fcd": 1.046875, "loss/idx": 12.0, "loss/logits": 0.1505298614501953, "step": 390 }, { "epoch": 0.005838435120203076, "grad_norm": 0.54296875, "grad_norm_var": 0.003586069742838542, "learning_rate": 2e-05, "loss": 1.2446, "loss/crossentropy": 2.371649980545044, "loss/dist_ce": 0.0, "loss/fcd": 1.09375, "loss/idx": 12.0, "loss/logits": 0.15085071325302124, "step": 391 }, { "epoch": 0.00585336717933403, "grad_norm": 0.5546875, "grad_norm_var": 0.003575897216796875, "learning_rate": 2e-05, "loss": 1.2947, "loss/crossentropy": 2.6943581104278564, "loss/dist_ce": 0.0, "loss/fcd": 1.1171875, "loss/idx": 12.0, "loss/logits": 0.1775604784488678, "step": 392 }, { "epoch": 0.005868299238464984, "grad_norm": 0.703125, "grad_norm_var": 0.004628435770670573, "learning_rate": 2e-05, "loss": 1.4691, "loss/crossentropy": 2.582803726196289, "loss/dist_ce": 0.0, "loss/fcd": 1.265625, "loss/idx": 12.0, "loss/logits": 0.20347224175930023, "step": 393 }, { "epoch": 0.005883231297595938, "grad_norm": 0.5625, "grad_norm_var": 0.004424285888671875, "learning_rate": 2e-05, "loss": 1.2891, "loss/crossentropy": 2.5786473751068115, "loss/dist_ce": 0.0, "loss/fcd": 1.109375, "loss/idx": 12.0, "loss/logits": 0.17973393201828003, "step": 394 }, { "epoch": 0.0058981633567268925, "grad_norm": 0.640625, "grad_norm_var": 0.002904510498046875, "learning_rate": 2e-05, "loss": 1.2877, "loss/crossentropy": 2.0246448516845703, "loss/dist_ce": 0.0, "loss/fcd": 1.125, "loss/idx": 12.0, "loss/logits": 0.16268569231033325, "step": 395 }, { "epoch": 0.005913095415857847, "grad_norm": 0.50390625, "grad_norm_var": 0.003087615966796875, "learning_rate": 2e-05, "loss": 1.2483, "loss/crossentropy": 2.597654104232788, "loss/dist_ce": 0.0, "loss/fcd": 1.0703125, "loss/idx": 12.0, "loss/logits": 0.17802008986473083, "step": 396 }, { "epoch": 0.005928027474988801, "grad_norm": 0.53125, "grad_norm_var": 0.0031325658162434894, "learning_rate": 2e-05, "loss": 1.1191, "loss/crossentropy": 2.6132638454437256, "loss/dist_ce": 0.0, "loss/fcd": 0.9921875, "loss/idx": 12.0, "loss/logits": 0.12693801522254944, "step": 397 }, { "epoch": 0.005942959534119755, "grad_norm": 0.578125, "grad_norm_var": 0.0031615575154622395, "learning_rate": 2e-05, "loss": 1.3393, "loss/crossentropy": 2.3903872966766357, "loss/dist_ce": 0.0, "loss/fcd": 1.1640625, "loss/idx": 12.0, "loss/logits": 0.17523705959320068, "step": 398 }, { "epoch": 0.005957891593250709, "grad_norm": 0.8671875, "grad_norm_var": 0.00873411496480306, "learning_rate": 2e-05, "loss": 1.7769, "loss/crossentropy": 2.4915974140167236, "loss/dist_ce": 0.0, "loss/fcd": 1.5078125, "loss/idx": 12.0, "loss/logits": 0.2690865993499756, "step": 399 }, { "epoch": 0.005972823652381664, "grad_norm": 0.50390625, "grad_norm_var": 0.008649555842081706, "learning_rate": 2e-05, "loss": 1.2025, "loss/crossentropy": 2.4921159744262695, "loss/dist_ce": 0.0, "loss/fcd": 1.046875, "loss/idx": 12.0, "loss/logits": 0.1556488275527954, "step": 400 }, { "epoch": 0.005987755711512618, "grad_norm": 0.609375, "grad_norm_var": 0.008683506647745769, "learning_rate": 2e-05, "loss": 1.416, "loss/crossentropy": 2.8674352169036865, "loss/dist_ce": 0.0, "loss/fcd": 1.203125, "loss/idx": 12.0, "loss/logits": 0.2129102349281311, "step": 401 }, { "epoch": 0.006002687770643572, "grad_norm": 0.466796875, "grad_norm_var": 0.009094985326131184, "learning_rate": 2e-05, "loss": 1.1529, "loss/crossentropy": 2.636471748352051, "loss/dist_ce": 0.0, "loss/fcd": 1.015625, "loss/idx": 12.0, "loss/logits": 0.1373138427734375, "step": 402 }, { "epoch": 0.006017619829774526, "grad_norm": 1.5703125, "grad_norm_var": 0.07038000424702963, "learning_rate": 2e-05, "loss": 1.518, "loss/crossentropy": 2.292736530303955, "loss/dist_ce": 0.0, "loss/fcd": 1.2734375, "loss/idx": 12.0, "loss/logits": 0.24452432990074158, "step": 403 }, { "epoch": 0.00603255188890548, "grad_norm": 0.55859375, "grad_norm_var": 0.06996343930562338, "learning_rate": 2e-05, "loss": 1.3221, "loss/crossentropy": 2.7590014934539795, "loss/dist_ce": 0.0, "loss/fcd": 1.140625, "loss/idx": 12.0, "loss/logits": 0.18144939839839935, "step": 404 }, { "epoch": 0.006047483948036434, "grad_norm": 0.58203125, "grad_norm_var": 0.06936173439025879, "learning_rate": 2e-05, "loss": 1.3817, "loss/crossentropy": 2.3799400329589844, "loss/dist_ce": 0.0, "loss/fcd": 1.1953125, "loss/idx": 12.0, "loss/logits": 0.186435729265213, "step": 405 }, { "epoch": 0.006062416007167388, "grad_norm": 0.490234375, "grad_norm_var": 0.07065277099609375, "learning_rate": 2e-05, "loss": 1.2203, "loss/crossentropy": 2.47124981880188, "loss/dist_ce": 0.0, "loss/fcd": 1.0703125, "loss/idx": 12.0, "loss/logits": 0.14994587004184723, "step": 406 }, { "epoch": 0.0060773480662983425, "grad_norm": 0.53515625, "grad_norm_var": 0.07075932820638021, "learning_rate": 2e-05, "loss": 1.2415, "loss/crossentropy": 2.5655479431152344, "loss/dist_ce": 0.0, "loss/fcd": 1.078125, "loss/idx": 12.0, "loss/logits": 0.16334325075149536, "step": 407 }, { "epoch": 0.0060922801254292966, "grad_norm": 0.56640625, "grad_norm_var": 0.07063287099202474, "learning_rate": 2e-05, "loss": 1.2976, "loss/crossentropy": 2.325611114501953, "loss/dist_ce": 0.0, "loss/fcd": 1.140625, "loss/idx": 12.0, "loss/logits": 0.15702426433563232, "step": 408 }, { "epoch": 0.006107212184560251, "grad_norm": 0.46484375, "grad_norm_var": 0.07223459879557291, "learning_rate": 2e-05, "loss": 1.1735, "loss/crossentropy": 2.4934916496276855, "loss/dist_ce": 0.0, "loss/fcd": 1.0390625, "loss/idx": 12.0, "loss/logits": 0.13443033397197723, "step": 409 }, { "epoch": 0.006122144243691205, "grad_norm": 0.57421875, "grad_norm_var": 0.07214247385660807, "learning_rate": 2e-05, "loss": 1.2699, "loss/crossentropy": 2.694333791732788, "loss/dist_ce": 0.0, "loss/fcd": 1.09375, "loss/idx": 12.0, "loss/logits": 0.17613661289215088, "step": 410 }, { "epoch": 0.006137076302822159, "grad_norm": 0.5703125, "grad_norm_var": 0.07233015696207683, "learning_rate": 2e-05, "loss": 1.2064, "loss/crossentropy": 2.710723400115967, "loss/dist_ce": 0.0, "loss/fcd": 1.0546875, "loss/idx": 12.0, "loss/logits": 0.151699036359787, "step": 411 }, { "epoch": 0.006152008361953113, "grad_norm": 0.5546875, "grad_norm_var": 0.07168299357096354, "learning_rate": 2e-05, "loss": 1.3126, "loss/crossentropy": 2.4599716663360596, "loss/dist_ce": 0.0, "loss/fcd": 1.1328125, "loss/idx": 12.0, "loss/logits": 0.1798081398010254, "step": 412 }, { "epoch": 0.006166940421084068, "grad_norm": 0.5390625, "grad_norm_var": 0.07158762613932292, "learning_rate": 2e-05, "loss": 1.2299, "loss/crossentropy": 2.5145263671875, "loss/dist_ce": 0.0, "loss/fcd": 1.0703125, "loss/idx": 12.0, "loss/logits": 0.15962854027748108, "step": 413 }, { "epoch": 0.006181872480215022, "grad_norm": 0.5234375, "grad_norm_var": 0.07213058471679687, "learning_rate": 2e-05, "loss": 1.2114, "loss/crossentropy": 2.5661544799804688, "loss/dist_ce": 0.0, "loss/fcd": 1.0625, "loss/idx": 12.0, "loss/logits": 0.1489061415195465, "step": 414 }, { "epoch": 0.006196804539345976, "grad_norm": 0.515625, "grad_norm_var": 0.06843414306640624, "learning_rate": 2e-05, "loss": 1.2189, "loss/crossentropy": 2.6023993492126465, "loss/dist_ce": 0.0, "loss/fcd": 1.0546875, "loss/idx": 12.0, "loss/logits": 0.16417017579078674, "step": 415 }, { "epoch": 0.00621173659847693, "grad_norm": 0.5078125, "grad_norm_var": 0.06838423411051432, "learning_rate": 2e-05, "loss": 1.2502, "loss/crossentropy": 2.4928228855133057, "loss/dist_ce": 0.0, "loss/fcd": 1.1015625, "loss/idx": 12.0, "loss/logits": 0.14868320524692535, "step": 416 }, { "epoch": 0.006226668657607884, "grad_norm": 0.5703125, "grad_norm_var": 0.0684401830037435, "learning_rate": 2e-05, "loss": 1.3053, "loss/crossentropy": 2.4073102474212646, "loss/dist_ce": 0.0, "loss/fcd": 1.1328125, "loss/idx": 12.0, "loss/logits": 0.1725049614906311, "step": 417 }, { "epoch": 0.006241600716738838, "grad_norm": 0.5625, "grad_norm_var": 0.06732099850972494, "learning_rate": 2e-05, "loss": 1.2862, "loss/crossentropy": 2.30145525932312, "loss/dist_ce": 0.0, "loss/fcd": 1.1171875, "loss/idx": 12.0, "loss/logits": 0.1689978539943695, "step": 418 }, { "epoch": 0.0062565327758697924, "grad_norm": 0.494140625, "grad_norm_var": 0.0012425740559895834, "learning_rate": 2e-05, "loss": 1.2577, "loss/crossentropy": 2.6010894775390625, "loss/dist_ce": 0.0, "loss/fcd": 1.0859375, "loss/idx": 12.0, "loss/logits": 0.17175039649009705, "step": 419 }, { "epoch": 0.0062714648350007465, "grad_norm": 0.53125, "grad_norm_var": 0.0012145360310872396, "learning_rate": 2e-05, "loss": 1.2791, "loss/crossentropy": 2.4492437839508057, "loss/dist_ce": 0.0, "loss/fcd": 1.1015625, "loss/idx": 12.0, "loss/logits": 0.17758145928382874, "step": 420 }, { "epoch": 0.006286396894131701, "grad_norm": 0.494140625, "grad_norm_var": 0.0011623223622639974, "learning_rate": 2e-05, "loss": 1.2604, "loss/crossentropy": 2.675687313079834, "loss/dist_ce": 0.0, "loss/fcd": 1.09375, "loss/idx": 12.0, "loss/logits": 0.16660946607589722, "step": 421 }, { "epoch": 0.006301328953262655, "grad_norm": 0.63671875, "grad_norm_var": 0.0017094930013020833, "learning_rate": 2e-05, "loss": 1.3558, "loss/crossentropy": 2.114575147628784, "loss/dist_ce": 0.0, "loss/fcd": 1.203125, "loss/idx": 12.0, "loss/logits": 0.15267613530158997, "step": 422 }, { "epoch": 0.006316261012393609, "grad_norm": 0.484375, "grad_norm_var": 0.0019037246704101563, "learning_rate": 2e-05, "loss": 1.196, "loss/crossentropy": 2.609177827835083, "loss/dist_ce": 0.0, "loss/fcd": 1.0390625, "loss/idx": 12.0, "loss/logits": 0.15691204369068146, "step": 423 }, { "epoch": 0.006331193071524563, "grad_norm": 0.71484375, "grad_norm_var": 0.003865496317545573, "learning_rate": 2e-05, "loss": 1.3174, "loss/crossentropy": 2.4156365394592285, "loss/dist_ce": 0.0, "loss/fcd": 1.140625, "loss/idx": 12.0, "loss/logits": 0.17677493393421173, "step": 424 }, { "epoch": 0.006346125130655517, "grad_norm": 0.5390625, "grad_norm_var": 0.0034052530924479165, "learning_rate": 2e-05, "loss": 1.2332, "loss/crossentropy": 2.472134590148926, "loss/dist_ce": 0.0, "loss/fcd": 1.078125, "loss/idx": 12.0, "loss/logits": 0.15507778525352478, "step": 425 }, { "epoch": 0.006361057189786472, "grad_norm": 0.51953125, "grad_norm_var": 0.0034212748209635417, "learning_rate": 2e-05, "loss": 1.309, "loss/crossentropy": 2.4787662029266357, "loss/dist_ce": 0.0, "loss/fcd": 1.125, "loss/idx": 12.0, "loss/logits": 0.18404549360275269, "step": 426 }, { "epoch": 0.006375989248917426, "grad_norm": 0.51171875, "grad_norm_var": 0.003456560770670573, "learning_rate": 2e-05, "loss": 1.2076, "loss/crossentropy": 2.5816304683685303, "loss/dist_ce": 0.0, "loss/fcd": 1.0390625, "loss/idx": 12.0, "loss/logits": 0.16856208443641663, "step": 427 }, { "epoch": 0.00639092130804838, "grad_norm": 0.53515625, "grad_norm_var": 0.0034517923990885417, "learning_rate": 2e-05, "loss": 1.3818, "loss/crossentropy": 2.716796398162842, "loss/dist_ce": 0.0, "loss/fcd": 1.1640625, "loss/idx": 12.0, "loss/logits": 0.21773184835910797, "step": 428 }, { "epoch": 0.006405853367179334, "grad_norm": 0.54296875, "grad_norm_var": 0.003450965881347656, "learning_rate": 2e-05, "loss": 1.1217, "loss/crossentropy": 2.690194845199585, "loss/dist_ce": 0.0, "loss/fcd": 0.984375, "loss/idx": 12.0, "loss/logits": 0.13730208575725555, "step": 429 }, { "epoch": 0.006420785426310288, "grad_norm": 0.52734375, "grad_norm_var": 0.0034418741861979166, "learning_rate": 2e-05, "loss": 1.1929, "loss/crossentropy": 2.5792243480682373, "loss/dist_ce": 0.0, "loss/fcd": 1.046875, "loss/idx": 12.0, "loss/logits": 0.1459844559431076, "step": 430 }, { "epoch": 0.006435717485441242, "grad_norm": 0.625, "grad_norm_var": 0.0037907918294270834, "learning_rate": 2e-05, "loss": 1.4976, "loss/crossentropy": 2.277367353439331, "loss/dist_ce": 0.0, "loss/fcd": 1.265625, "loss/idx": 12.0, "loss/logits": 0.23194152116775513, "step": 431 }, { "epoch": 0.0064506495445721965, "grad_norm": 0.5546875, "grad_norm_var": 0.003665669759114583, "learning_rate": 2e-05, "loss": 1.2346, "loss/crossentropy": 2.6008970737457275, "loss/dist_ce": 0.0, "loss/fcd": 1.078125, "loss/idx": 12.0, "loss/logits": 0.15651211142539978, "step": 432 }, { "epoch": 0.006465581603703151, "grad_norm": 0.4921875, "grad_norm_var": 0.003864034016927083, "learning_rate": 2e-05, "loss": 1.1928, "loss/crossentropy": 2.4385178089141846, "loss/dist_ce": 0.0, "loss/fcd": 1.0390625, "loss/idx": 12.0, "loss/logits": 0.15374068915843964, "step": 433 }, { "epoch": 0.006480513662834105, "grad_norm": 0.53515625, "grad_norm_var": 0.0038573582967122394, "learning_rate": 2e-05, "loss": 1.2827, "loss/crossentropy": 2.591653823852539, "loss/dist_ce": 0.0, "loss/fcd": 1.1015625, "loss/idx": 12.0, "loss/logits": 0.18110871315002441, "step": 434 }, { "epoch": 0.006495445721965059, "grad_norm": 0.55078125, "grad_norm_var": 0.00366514523824056, "learning_rate": 2e-05, "loss": 1.2192, "loss/crossentropy": 2.5972487926483154, "loss/dist_ce": 0.0, "loss/fcd": 1.0703125, "loss/idx": 12.0, "loss/logits": 0.14887724816799164, "step": 435 }, { "epoch": 0.006510377781096013, "grad_norm": 0.5234375, "grad_norm_var": 0.0036881605784098307, "learning_rate": 2e-05, "loss": 1.2291, "loss/crossentropy": 2.6286871433258057, "loss/dist_ce": 0.0, "loss/fcd": 1.0703125, "loss/idx": 12.0, "loss/logits": 0.15877765417099, "step": 436 }, { "epoch": 0.006525309840226967, "grad_norm": 0.5, "grad_norm_var": 0.003647295633951823, "learning_rate": 2e-05, "loss": 1.2138, "loss/crossentropy": 2.6377251148223877, "loss/dist_ce": 0.0, "loss/fcd": 1.0546875, "loss/idx": 12.0, "loss/logits": 0.15911364555358887, "step": 437 }, { "epoch": 0.006540241899357921, "grad_norm": 0.58203125, "grad_norm_var": 0.003198687235514323, "learning_rate": 2e-05, "loss": 1.3142, "loss/crossentropy": 2.6262760162353516, "loss/dist_ce": 0.0, "loss/fcd": 1.125, "loss/idx": 12.0, "loss/logits": 0.1891767680644989, "step": 438 }, { "epoch": 0.006555173958488876, "grad_norm": 0.470703125, "grad_norm_var": 0.0033229668935139973, "learning_rate": 2e-05, "loss": 1.1893, "loss/crossentropy": 2.4748237133026123, "loss/dist_ce": 0.0, "loss/fcd": 1.0390625, "loss/idx": 12.0, "loss/logits": 0.1502346694469452, "step": 439 }, { "epoch": 0.00657010601761983, "grad_norm": 0.51953125, "grad_norm_var": 0.001291640599568685, "learning_rate": 2e-05, "loss": 1.2477, "loss/crossentropy": 2.4698257446289062, "loss/dist_ce": 0.0, "loss/fcd": 1.0859375, "loss/idx": 12.0, "loss/logits": 0.16178780794143677, "step": 440 }, { "epoch": 0.006585038076750784, "grad_norm": 0.51953125, "grad_norm_var": 0.001299905776977539, "learning_rate": 2e-05, "loss": 1.2288, "loss/crossentropy": 2.2895803451538086, "loss/dist_ce": 0.0, "loss/fcd": 1.078125, "loss/idx": 12.0, "loss/logits": 0.1506274938583374, "step": 441 }, { "epoch": 0.006599970135881738, "grad_norm": 0.59375, "grad_norm_var": 0.0015221754709879556, "learning_rate": 2e-05, "loss": 1.295, "loss/crossentropy": 2.6053385734558105, "loss/dist_ce": 0.0, "loss/fcd": 1.1171875, "loss/idx": 12.0, "loss/logits": 0.1778094321489334, "step": 442 }, { "epoch": 0.006614902195012692, "grad_norm": 0.7109375, "grad_norm_var": 0.003344456354777018, "learning_rate": 2e-05, "loss": 1.4482, "loss/crossentropy": 2.7315480709075928, "loss/dist_ce": 0.0, "loss/fcd": 1.234375, "loss/idx": 12.0, "loss/logits": 0.2138347327709198, "step": 443 }, { "epoch": 0.0066298342541436465, "grad_norm": 0.55859375, "grad_norm_var": 0.0033356825510660808, "learning_rate": 2e-05, "loss": 1.2976, "loss/crossentropy": 2.4568560123443604, "loss/dist_ce": 0.0, "loss/fcd": 1.125, "loss/idx": 12.0, "loss/logits": 0.17260998487472534, "step": 444 }, { "epoch": 0.006644766313274601, "grad_norm": 0.55078125, "grad_norm_var": 0.003331740697224935, "learning_rate": 2e-05, "loss": 1.215, "loss/crossentropy": 2.6158523559570312, "loss/dist_ce": 0.0, "loss/fcd": 1.0625, "loss/idx": 12.0, "loss/logits": 0.1525478959083557, "step": 445 }, { "epoch": 0.006659698372405555, "grad_norm": 0.56640625, "grad_norm_var": 0.0033044020334879556, "learning_rate": 2e-05, "loss": 1.1846, "loss/crossentropy": 2.301199436187744, "loss/dist_ce": 0.0, "loss/fcd": 1.0390625, "loss/idx": 12.0, "loss/logits": 0.14553159475326538, "step": 446 }, { "epoch": 0.006674630431536509, "grad_norm": 0.6171875, "grad_norm_var": 0.0032335758209228516, "learning_rate": 2e-05, "loss": 1.289, "loss/crossentropy": 2.4552204608917236, "loss/dist_ce": 0.0, "loss/fcd": 1.125, "loss/idx": 12.0, "loss/logits": 0.163978710770607, "step": 447 }, { "epoch": 0.006689562490667463, "grad_norm": 0.53125, "grad_norm_var": 0.003262186050415039, "learning_rate": 2e-05, "loss": 1.3447, "loss/crossentropy": 2.410238742828369, "loss/dist_ce": 0.0, "loss/fcd": 1.1484375, "loss/idx": 12.0, "loss/logits": 0.19623002409934998, "step": 448 }, { "epoch": 0.006704494549798417, "grad_norm": 0.5546875, "grad_norm_var": 0.0030129591623942057, "learning_rate": 2e-05, "loss": 1.2677, "loss/crossentropy": 2.524290084838867, "loss/dist_ce": 0.0, "loss/fcd": 1.1015625, "loss/idx": 12.0, "loss/logits": 0.1661645472049713, "step": 449 }, { "epoch": 0.006719426608929371, "grad_norm": 0.546875, "grad_norm_var": 0.0029900709788004557, "learning_rate": 2e-05, "loss": 1.303, "loss/crossentropy": 2.5573766231536865, "loss/dist_ce": 0.0, "loss/fcd": 1.1171875, "loss/idx": 12.0, "loss/logits": 0.18579092621803284, "step": 450 }, { "epoch": 0.006734358668060325, "grad_norm": 0.73828125, "grad_norm_var": 0.005056111017862955, "learning_rate": 2e-05, "loss": 1.4536, "loss/crossentropy": 2.5473504066467285, "loss/dist_ce": 0.0, "loss/fcd": 1.25, "loss/idx": 12.0, "loss/logits": 0.20360559225082397, "step": 451 }, { "epoch": 0.00674929072719128, "grad_norm": 0.80078125, "grad_norm_var": 0.008224980036417643, "learning_rate": 2e-05, "loss": 1.5703, "loss/crossentropy": 2.538242816925049, "loss/dist_ce": 0.0, "loss/fcd": 1.265625, "loss/idx": 12.0, "loss/logits": 0.3046456277370453, "step": 452 }, { "epoch": 0.006764222786322234, "grad_norm": 0.65234375, "grad_norm_var": 0.007947270075480144, "learning_rate": 2e-05, "loss": 1.3163, "loss/crossentropy": 2.378641128540039, "loss/dist_ce": 0.0, "loss/fcd": 1.140625, "loss/idx": 12.0, "loss/logits": 0.17571493983268738, "step": 453 }, { "epoch": 0.006779154845453188, "grad_norm": 0.66015625, "grad_norm_var": 0.008197768529256185, "learning_rate": 2e-05, "loss": 1.3884, "loss/crossentropy": 2.3066680431365967, "loss/dist_ce": 0.0, "loss/fcd": 1.1953125, "loss/idx": 12.0, "loss/logits": 0.19306373596191406, "step": 454 }, { "epoch": 0.006794086904584142, "grad_norm": 0.69140625, "grad_norm_var": 0.007452392578125, "learning_rate": 2e-05, "loss": 1.3553, "loss/crossentropy": 2.4432783126831055, "loss/dist_ce": 0.0, "loss/fcd": 1.1484375, "loss/idx": 12.0, "loss/logits": 0.20682473480701447, "step": 455 }, { "epoch": 0.0068090189637150965, "grad_norm": 0.52734375, "grad_norm_var": 0.007358551025390625, "learning_rate": 2e-05, "loss": 1.2052, "loss/crossentropy": 2.449265718460083, "loss/dist_ce": 0.0, "loss/fcd": 1.046875, "loss/idx": 12.0, "loss/logits": 0.15830518305301666, "step": 456 }, { "epoch": 0.0068239510228460505, "grad_norm": 0.6171875, "grad_norm_var": 0.006727536519368489, "learning_rate": 2e-05, "loss": 1.3472, "loss/crossentropy": 2.5877275466918945, "loss/dist_ce": 0.0, "loss/fcd": 1.1640625, "loss/idx": 12.0, "loss/logits": 0.1831488013267517, "step": 457 }, { "epoch": 0.006838883081977005, "grad_norm": 0.703125, "grad_norm_var": 0.007094256083170573, "learning_rate": 2e-05, "loss": 1.4365, "loss/crossentropy": 2.5130813121795654, "loss/dist_ce": 0.0, "loss/fcd": 1.21875, "loss/idx": 12.0, "loss/logits": 0.21774663031101227, "step": 458 }, { "epoch": 0.006853815141107959, "grad_norm": 0.6796875, "grad_norm_var": 0.006804339090983073, "learning_rate": 2e-05, "loss": 1.2911, "loss/crossentropy": 2.49009108543396, "loss/dist_ce": 0.0, "loss/fcd": 1.125, "loss/idx": 12.0, "loss/logits": 0.16606926918029785, "step": 459 }, { "epoch": 0.006868747200238913, "grad_norm": 0.53515625, "grad_norm_var": 0.007045427958170573, "learning_rate": 2e-05, "loss": 1.205, "loss/crossentropy": 2.6400084495544434, "loss/dist_ce": 0.0, "loss/fcd": 1.046875, "loss/idx": 12.0, "loss/logits": 0.15812143683433533, "step": 460 }, { "epoch": 0.006883679259369867, "grad_norm": 0.5703125, "grad_norm_var": 0.006880442301432292, "learning_rate": 2e-05, "loss": 1.2798, "loss/crossentropy": 2.443336248397827, "loss/dist_ce": 0.0, "loss/fcd": 1.09375, "loss/idx": 12.0, "loss/logits": 0.18602094054222107, "step": 461 }, { "epoch": 0.006898611318500821, "grad_norm": 0.5234375, "grad_norm_var": 0.00732873280843099, "learning_rate": 2e-05, "loss": 1.2403, "loss/crossentropy": 2.221099376678467, "loss/dist_ce": 0.0, "loss/fcd": 1.078125, "loss/idx": 12.0, "loss/logits": 0.16219985485076904, "step": 462 }, { "epoch": 0.006913543377631775, "grad_norm": 0.53125, "grad_norm_var": 0.007843462626139323, "learning_rate": 2e-05, "loss": 1.2735, "loss/crossentropy": 2.568125009536743, "loss/dist_ce": 0.0, "loss/fcd": 1.1015625, "loss/idx": 12.0, "loss/logits": 0.17195746302604675, "step": 463 }, { "epoch": 0.006928475436762729, "grad_norm": 0.6328125, "grad_norm_var": 0.007334327697753907, "learning_rate": 2e-05, "loss": 1.2505, "loss/crossentropy": 2.489778518676758, "loss/dist_ce": 0.0, "loss/fcd": 1.0859375, "loss/idx": 12.0, "loss/logits": 0.1646047830581665, "step": 464 }, { "epoch": 0.006943407495893684, "grad_norm": 0.546875, "grad_norm_var": 0.007409095764160156, "learning_rate": 2e-05, "loss": 1.299, "loss/crossentropy": 2.5794758796691895, "loss/dist_ce": 0.0, "loss/fcd": 1.125, "loss/idx": 12.0, "loss/logits": 0.1739940047264099, "step": 465 }, { "epoch": 0.006958339555024638, "grad_norm": 0.57421875, "grad_norm_var": 0.0071807861328125, "learning_rate": 2e-05, "loss": 1.3035, "loss/crossentropy": 2.4775702953338623, "loss/dist_ce": 0.0, "loss/fcd": 1.1328125, "loss/idx": 12.0, "loss/logits": 0.17069995403289795, "step": 466 }, { "epoch": 0.006973271614155592, "grad_norm": 0.494140625, "grad_norm_var": 0.007186746597290039, "learning_rate": 2e-05, "loss": 1.16, "loss/crossentropy": 2.651700258255005, "loss/dist_ce": 0.0, "loss/fcd": 1.015625, "loss/idx": 12.0, "loss/logits": 0.14432933926582336, "step": 467 }, { "epoch": 0.006988203673286546, "grad_norm": 0.53515625, "grad_norm_var": 0.004795948664347331, "learning_rate": 2e-05, "loss": 1.2376, "loss/crossentropy": 2.6250648498535156, "loss/dist_ce": 0.0, "loss/fcd": 1.0703125, "loss/idx": 12.0, "loss/logits": 0.1673288345336914, "step": 468 }, { "epoch": 0.0070031357324175005, "grad_norm": 0.5703125, "grad_norm_var": 0.004558293024698893, "learning_rate": 2e-05, "loss": 1.2862, "loss/crossentropy": 2.5528345108032227, "loss/dist_ce": 0.0, "loss/fcd": 1.109375, "loss/idx": 12.0, "loss/logits": 0.1767835021018982, "step": 469 }, { "epoch": 0.007018067791548455, "grad_norm": 0.474609375, "grad_norm_var": 0.0049010594685872395, "learning_rate": 2e-05, "loss": 1.166, "loss/crossentropy": 2.459474563598633, "loss/dist_ce": 0.0, "loss/fcd": 1.0234375, "loss/idx": 12.0, "loss/logits": 0.14254210889339447, "step": 470 }, { "epoch": 0.007032999850679409, "grad_norm": 0.47265625, "grad_norm_var": 0.0045094172159830725, "learning_rate": 2e-05, "loss": 1.1841, "loss/crossentropy": 2.6750423908233643, "loss/dist_ce": 0.0, "loss/fcd": 1.03125, "loss/idx": 12.0, "loss/logits": 0.15288898348808289, "step": 471 }, { "epoch": 0.007047931909810363, "grad_norm": 0.546875, "grad_norm_var": 0.004443613688151041, "learning_rate": 2e-05, "loss": 1.3021, "loss/crossentropy": 2.6386568546295166, "loss/dist_ce": 0.0, "loss/fcd": 1.125, "loss/idx": 12.0, "loss/logits": 0.1770569384098053, "step": 472 }, { "epoch": 0.007062863968941317, "grad_norm": 0.515625, "grad_norm_var": 0.004354349772135417, "learning_rate": 2e-05, "loss": 1.2594, "loss/crossentropy": 2.706113576889038, "loss/dist_ce": 0.0, "loss/fcd": 1.0859375, "loss/idx": 12.0, "loss/logits": 0.17347458004951477, "step": 473 }, { "epoch": 0.007077796028072271, "grad_norm": 0.62109375, "grad_norm_var": 0.003172747294108073, "learning_rate": 2e-05, "loss": 1.2548, "loss/crossentropy": 2.6007003784179688, "loss/dist_ce": 0.0, "loss/fcd": 1.0859375, "loss/idx": 12.0, "loss/logits": 0.16884616017341614, "step": 474 }, { "epoch": 0.007092728087203225, "grad_norm": 0.55859375, "grad_norm_var": 0.0020197550455729168, "learning_rate": 2e-05, "loss": 1.2828, "loss/crossentropy": 2.5858445167541504, "loss/dist_ce": 0.0, "loss/fcd": 1.1171875, "loss/idx": 12.0, "loss/logits": 0.16564837098121643, "step": 475 }, { "epoch": 0.007107660146334179, "grad_norm": 0.478515625, "grad_norm_var": 0.0022866408030192058, "learning_rate": 2e-05, "loss": 1.1638, "loss/crossentropy": 2.386579751968384, "loss/dist_ce": 0.0, "loss/fcd": 1.0234375, "loss/idx": 12.0, "loss/logits": 0.14035974442958832, "step": 476 }, { "epoch": 0.007122592205465133, "grad_norm": 0.6015625, "grad_norm_var": 0.0024722894032796223, "learning_rate": 2e-05, "loss": 1.2395, "loss/crossentropy": 2.5994484424591064, "loss/dist_ce": 0.0, "loss/fcd": 1.078125, "loss/idx": 12.0, "loss/logits": 0.1613873541355133, "step": 477 }, { "epoch": 0.007137524264596088, "grad_norm": 0.53125, "grad_norm_var": 0.0024563948313395183, "learning_rate": 2e-05, "loss": 1.3159, "loss/crossentropy": 2.616286039352417, "loss/dist_ce": 0.0, "loss/fcd": 1.140625, "loss/idx": 12.0, "loss/logits": 0.1752912998199463, "step": 478 }, { "epoch": 0.007152456323727042, "grad_norm": 0.53125, "grad_norm_var": 0.0024563948313395183, "learning_rate": 2e-05, "loss": 1.1997, "loss/crossentropy": 2.7094411849975586, "loss/dist_ce": 0.0, "loss/fcd": 1.0390625, "loss/idx": 12.0, "loss/logits": 0.16060031950473785, "step": 479 }, { "epoch": 0.007167388382857996, "grad_norm": 0.59375, "grad_norm_var": 0.0020831902821858723, "learning_rate": 2e-05, "loss": 1.2477, "loss/crossentropy": 2.761913776397705, "loss/dist_ce": 0.0, "loss/fcd": 1.0859375, "loss/idx": 12.0, "loss/logits": 0.16173386573791504, "step": 480 }, { "epoch": 0.0071823204419889505, "grad_norm": 0.56640625, "grad_norm_var": 0.0021238803863525392, "learning_rate": 2e-05, "loss": 1.2121, "loss/crossentropy": 2.754338502883911, "loss/dist_ce": 0.0, "loss/fcd": 1.0546875, "loss/idx": 12.0, "loss/logits": 0.15745031833648682, "step": 481 }, { "epoch": 0.007197252501119905, "grad_norm": 0.67578125, "grad_norm_var": 0.0032099246978759765, "learning_rate": 2e-05, "loss": 1.3736, "loss/crossentropy": 2.5661139488220215, "loss/dist_ce": 0.0, "loss/fcd": 1.1875, "loss/idx": 12.0, "loss/logits": 0.18609124422073364, "step": 482 }, { "epoch": 0.007212184560250859, "grad_norm": 0.54296875, "grad_norm_var": 0.003008460998535156, "learning_rate": 2e-05, "loss": 1.2358, "loss/crossentropy": 2.3610997200012207, "loss/dist_ce": 0.0, "loss/fcd": 1.078125, "loss/idx": 12.0, "loss/logits": 0.15763944387435913, "step": 483 }, { "epoch": 0.007227116619381813, "grad_norm": 0.76171875, "grad_norm_var": 0.00573724110921224, "learning_rate": 2e-05, "loss": 1.4967, "loss/crossentropy": 2.4400460720062256, "loss/dist_ce": 0.0, "loss/fcd": 1.28125, "loss/idx": 12.0, "loss/logits": 0.21541057527065277, "step": 484 }, { "epoch": 0.007242048678512767, "grad_norm": 0.5390625, "grad_norm_var": 0.00577691396077474, "learning_rate": 2e-05, "loss": 1.2353, "loss/crossentropy": 2.7640154361724854, "loss/dist_ce": 0.0, "loss/fcd": 1.0703125, "loss/idx": 12.0, "loss/logits": 0.16499778628349304, "step": 485 }, { "epoch": 0.007256980737643721, "grad_norm": 0.47265625, "grad_norm_var": 0.005800231297810873, "learning_rate": 2e-05, "loss": 1.1987, "loss/crossentropy": 2.694920301437378, "loss/dist_ce": 0.0, "loss/fcd": 1.0390625, "loss/idx": 12.0, "loss/logits": 0.15962320566177368, "step": 486 }, { "epoch": 0.007271912796774675, "grad_norm": 0.55078125, "grad_norm_var": 0.005239470799763998, "learning_rate": 2e-05, "loss": 1.2831, "loss/crossentropy": 2.8368988037109375, "loss/dist_ce": 0.0, "loss/fcd": 1.109375, "loss/idx": 12.0, "loss/logits": 0.17376646399497986, "step": 487 }, { "epoch": 0.007286844855905629, "grad_norm": 0.5234375, "grad_norm_var": 0.005339797337849935, "learning_rate": 2e-05, "loss": 1.2062, "loss/crossentropy": 2.394050121307373, "loss/dist_ce": 0.0, "loss/fcd": 1.0625, "loss/idx": 12.0, "loss/logits": 0.14370450377464294, "step": 488 }, { "epoch": 0.007301776915036583, "grad_norm": 0.56640625, "grad_norm_var": 0.005156310399373373, "learning_rate": 2e-05, "loss": 1.2921, "loss/crossentropy": 2.6243531703948975, "loss/dist_ce": 0.0, "loss/fcd": 1.1171875, "loss/idx": 12.0, "loss/logits": 0.17488384246826172, "step": 489 }, { "epoch": 0.007316708974167537, "grad_norm": 0.498046875, "grad_norm_var": 0.005259450276692708, "learning_rate": 2e-05, "loss": 1.2395, "loss/crossentropy": 2.6446375846862793, "loss/dist_ce": 0.0, "loss/fcd": 1.078125, "loss/idx": 12.0, "loss/logits": 0.16142112016677856, "step": 490 }, { "epoch": 0.007331641033298492, "grad_norm": 0.498046875, "grad_norm_var": 0.005516163508097331, "learning_rate": 2e-05, "loss": 1.2261, "loss/crossentropy": 2.3884522914886475, "loss/dist_ce": 0.0, "loss/fcd": 1.078125, "loss/idx": 12.0, "loss/logits": 0.14793342351913452, "step": 491 }, { "epoch": 0.007346573092429446, "grad_norm": 0.515625, "grad_norm_var": 0.00520782470703125, "learning_rate": 2e-05, "loss": 1.2395, "loss/crossentropy": 2.611057758331299, "loss/dist_ce": 0.0, "loss/fcd": 1.078125, "loss/idx": 12.0, "loss/logits": 0.16138747334480286, "step": 492 }, { "epoch": 0.0073615051515604005, "grad_norm": 0.55078125, "grad_norm_var": 0.005091285705566407, "learning_rate": 2e-05, "loss": 1.235, "loss/crossentropy": 2.760316848754883, "loss/dist_ce": 0.0, "loss/fcd": 1.0703125, "loss/idx": 12.0, "loss/logits": 0.16464349627494812, "step": 493 }, { "epoch": 0.0073764372106913546, "grad_norm": 0.5859375, "grad_norm_var": 0.005087725321451823, "learning_rate": 2e-05, "loss": 1.3381, "loss/crossentropy": 2.4796364307403564, "loss/dist_ce": 0.0, "loss/fcd": 1.140625, "loss/idx": 12.0, "loss/logits": 0.19745641946792603, "step": 494 }, { "epoch": 0.007391369269822309, "grad_norm": 0.7578125, "grad_norm_var": 0.0074035008748372395, "learning_rate": 2e-05, "loss": 1.5507, "loss/crossentropy": 2.632960796356201, "loss/dist_ce": 0.0, "loss/fcd": 1.3125, "loss/idx": 12.0, "loss/logits": 0.23823606967926025, "step": 495 }, { "epoch": 0.007406301328953263, "grad_norm": 0.578125, "grad_norm_var": 0.007379595438639323, "learning_rate": 2e-05, "loss": 1.3475, "loss/crossentropy": 2.4685165882110596, "loss/dist_ce": 0.0, "loss/fcd": 1.1796875, "loss/idx": 12.0, "loss/logits": 0.16783174872398376, "step": 496 }, { "epoch": 0.007421233388084217, "grad_norm": 0.60546875, "grad_norm_var": 0.00743554433186849, "learning_rate": 2e-05, "loss": 1.2911, "loss/crossentropy": 2.674586534500122, "loss/dist_ce": 0.0, "loss/fcd": 1.1171875, "loss/idx": 12.0, "loss/logits": 0.17387951910495758, "step": 497 }, { "epoch": 0.007436165447215171, "grad_norm": 0.53125, "grad_norm_var": 0.006826273600260417, "learning_rate": 2e-05, "loss": 1.1969, "loss/crossentropy": 2.428183078765869, "loss/dist_ce": 0.0, "loss/fcd": 1.046875, "loss/idx": 12.0, "loss/logits": 0.15000945329666138, "step": 498 }, { "epoch": 0.007451097506346125, "grad_norm": 0.466796875, "grad_norm_var": 0.007436863581339518, "learning_rate": 2e-05, "loss": 1.1538, "loss/crossentropy": 2.430645704269409, "loss/dist_ce": 0.0, "loss/fcd": 1.015625, "loss/idx": 12.0, "loss/logits": 0.13817110657691956, "step": 499 }, { "epoch": 0.007466029565477079, "grad_norm": 0.5625, "grad_norm_var": 0.004628864924112955, "learning_rate": 2e-05, "loss": 1.2676, "loss/crossentropy": 2.6222951412200928, "loss/dist_ce": 0.0, "loss/fcd": 1.1015625, "loss/idx": 12.0, "loss/logits": 0.16602060198783875, "step": 500 }, { "epoch": 0.007480961624608033, "grad_norm": 0.51953125, "grad_norm_var": 0.004681634902954102, "learning_rate": 2e-05, "loss": 1.2974, "loss/crossentropy": 2.3674845695495605, "loss/dist_ce": 0.0, "loss/fcd": 1.1328125, "loss/idx": 12.0, "loss/logits": 0.16458511352539062, "step": 501 }, { "epoch": 0.007495893683738987, "grad_norm": 0.62109375, "grad_norm_var": 0.00454875628153483, "learning_rate": 2e-05, "loss": 1.1938, "loss/crossentropy": 2.597952127456665, "loss/dist_ce": 0.0, "loss/fcd": 1.0390625, "loss/idx": 12.0, "loss/logits": 0.15470948815345764, "step": 502 }, { "epoch": 0.007510825742869941, "grad_norm": 0.5078125, "grad_norm_var": 0.004706811904907226, "learning_rate": 2e-05, "loss": 1.1595, "loss/crossentropy": 2.6619369983673096, "loss/dist_ce": 0.0, "loss/fcd": 1.0, "loss/idx": 12.0, "loss/logits": 0.15948548913002014, "step": 503 }, { "epoch": 0.007525757802000896, "grad_norm": 0.5390625, "grad_norm_var": 0.004655186335245768, "learning_rate": 2e-05, "loss": 1.2799, "loss/crossentropy": 2.4827001094818115, "loss/dist_ce": 0.0, "loss/fcd": 1.1015625, "loss/idx": 12.0, "loss/logits": 0.17829132080078125, "step": 504 }, { "epoch": 0.0075406898611318504, "grad_norm": 0.71875, "grad_norm_var": 0.006306568781534831, "learning_rate": 2e-05, "loss": 1.3244, "loss/crossentropy": 2.6411755084991455, "loss/dist_ce": 0.0, "loss/fcd": 1.1328125, "loss/idx": 12.0, "loss/logits": 0.19154754281044006, "step": 505 }, { "epoch": 0.0075556219202628045, "grad_norm": 0.53125, "grad_norm_var": 0.00607446034749349, "learning_rate": 2e-05, "loss": 1.1721, "loss/crossentropy": 2.6066651344299316, "loss/dist_ce": 0.0, "loss/fcd": 1.0234375, "loss/idx": 12.0, "loss/logits": 0.1486879289150238, "step": 506 }, { "epoch": 0.007570553979393759, "grad_norm": 1.09375, "grad_norm_var": 0.022688023249308267, "learning_rate": 2e-05, "loss": 1.8032, "loss/crossentropy": 2.4730823040008545, "loss/dist_ce": 0.0, "loss/fcd": 1.5390625, "loss/idx": 12.0, "loss/logits": 0.264101505279541, "step": 507 }, { "epoch": 0.007585486038524713, "grad_norm": 0.55078125, "grad_norm_var": 0.022344700495402017, "learning_rate": 2e-05, "loss": 1.3164, "loss/crossentropy": 2.587294101715088, "loss/dist_ce": 0.0, "loss/fcd": 1.1328125, "loss/idx": 12.0, "loss/logits": 0.18363715708255768, "step": 508 }, { "epoch": 0.007600418097655667, "grad_norm": 0.5234375, "grad_norm_var": 0.02259837786356608, "learning_rate": 2e-05, "loss": 1.186, "loss/crossentropy": 2.5303711891174316, "loss/dist_ce": 0.0, "loss/fcd": 1.0390625, "loss/idx": 12.0, "loss/logits": 0.14696934819221497, "step": 509 }, { "epoch": 0.007615350156786621, "grad_norm": 0.5390625, "grad_norm_var": 0.022860066095987955, "learning_rate": 2e-05, "loss": 1.2258, "loss/crossentropy": 2.4873058795928955, "loss/dist_ce": 0.0, "loss/fcd": 1.0625, "loss/idx": 12.0, "loss/logits": 0.16325148940086365, "step": 510 }, { "epoch": 0.007630282215917575, "grad_norm": 0.58203125, "grad_norm_var": 0.021160618464152018, "learning_rate": 2e-05, "loss": 1.3853, "loss/crossentropy": 2.499600887298584, "loss/dist_ce": 0.0, "loss/fcd": 1.1640625, "loss/idx": 12.0, "loss/logits": 0.22127383947372437, "step": 511 }, { "epoch": 0.007645214275048529, "grad_norm": 0.50390625, "grad_norm_var": 0.02164139747619629, "learning_rate": 2e-05, "loss": 1.1943, "loss/crossentropy": 2.507723569869995, "loss/dist_ce": 0.0, "loss/fcd": 1.0390625, "loss/idx": 12.0, "loss/logits": 0.1552710235118866, "step": 512 }, { "epoch": 0.007660146334179483, "grad_norm": 0.56640625, "grad_norm_var": 0.021642033259073892, "learning_rate": 2e-05, "loss": 1.2238, "loss/crossentropy": 2.786867380142212, "loss/dist_ce": 0.0, "loss/fcd": 1.0703125, "loss/idx": 12.0, "loss/logits": 0.15346887707710266, "step": 513 }, { "epoch": 0.007675078393310437, "grad_norm": 0.78125, "grad_norm_var": 0.023761987686157227, "learning_rate": 2e-05, "loss": 1.5063, "loss/crossentropy": 2.463454246520996, "loss/dist_ce": 0.0, "loss/fcd": 1.2890625, "loss/idx": 12.0, "loss/logits": 0.21725308895111084, "step": 514 }, { "epoch": 0.007690010452441391, "grad_norm": 0.546875, "grad_norm_var": 0.022735595703125, "learning_rate": 2e-05, "loss": 1.3415, "loss/crossentropy": 2.8017828464508057, "loss/dist_ce": 0.0, "loss/fcd": 1.1484375, "loss/idx": 12.0, "loss/logits": 0.19304242730140686, "step": 515 }, { "epoch": 0.0077049425115723455, "grad_norm": 0.6796875, "grad_norm_var": 0.022922515869140625, "learning_rate": 2e-05, "loss": 1.3884, "loss/crossentropy": 2.391439914703369, "loss/dist_ce": 0.0, "loss/fcd": 1.1953125, "loss/idx": 12.0, "loss/logits": 0.1931254267692566, "step": 516 }, { "epoch": 0.0077198745707032996, "grad_norm": 0.66015625, "grad_norm_var": 0.022409820556640626, "learning_rate": 2e-05, "loss": 1.3426, "loss/crossentropy": 2.592740058898926, "loss/dist_ce": 0.0, "loss/fcd": 1.171875, "loss/idx": 12.0, "loss/logits": 0.17067590355873108, "step": 517 }, { "epoch": 0.0077348066298342545, "grad_norm": 0.52734375, "grad_norm_var": 0.022965240478515624, "learning_rate": 2e-05, "loss": 1.2129, "loss/crossentropy": 2.682835340499878, "loss/dist_ce": 0.0, "loss/fcd": 1.0546875, "loss/idx": 12.0, "loss/logits": 0.15816320478916168, "step": 518 }, { "epoch": 0.007749738688965209, "grad_norm": 0.5546875, "grad_norm_var": 0.022428131103515624, "learning_rate": 2e-05, "loss": 1.3196, "loss/crossentropy": 2.841771125793457, "loss/dist_ce": 0.0, "loss/fcd": 1.125, "loss/idx": 12.0, "loss/logits": 0.19456353783607483, "step": 519 }, { "epoch": 0.007764670748096163, "grad_norm": 0.62109375, "grad_norm_var": 0.021978187561035156, "learning_rate": 2e-05, "loss": 1.3445, "loss/crossentropy": 2.2829132080078125, "loss/dist_ce": 0.0, "loss/fcd": 1.171875, "loss/idx": 12.0, "loss/logits": 0.17264382541179657, "step": 520 }, { "epoch": 0.007779602807227117, "grad_norm": 0.5625, "grad_norm_var": 0.02152551015218099, "learning_rate": 2e-05, "loss": 1.2809, "loss/crossentropy": 2.52323317527771, "loss/dist_ce": 0.0, "loss/fcd": 1.109375, "loss/idx": 12.0, "loss/logits": 0.17150172591209412, "step": 521 }, { "epoch": 0.007794534866358071, "grad_norm": 0.56640625, "grad_norm_var": 0.021214803059895832, "learning_rate": 2e-05, "loss": 1.3486, "loss/crossentropy": 2.559417724609375, "loss/dist_ce": 0.0, "loss/fcd": 1.1484375, "loss/idx": 12.0, "loss/logits": 0.20019292831420898, "step": 522 }, { "epoch": 0.007809466925489025, "grad_norm": 0.59375, "grad_norm_var": 0.005003865559895833, "learning_rate": 2e-05, "loss": 1.2866, "loss/crossentropy": 2.2705278396606445, "loss/dist_ce": 0.0, "loss/fcd": 1.1171875, "loss/idx": 12.0, "loss/logits": 0.1694566011428833, "step": 523 }, { "epoch": 0.007824398984619979, "grad_norm": 0.53515625, "grad_norm_var": 0.00509033203125, "learning_rate": 2e-05, "loss": 1.2626, "loss/crossentropy": 2.5298879146575928, "loss/dist_ce": 0.0, "loss/fcd": 1.09375, "loss/idx": 12.0, "loss/logits": 0.16886663436889648, "step": 524 }, { "epoch": 0.007839331043750933, "grad_norm": 0.62109375, "grad_norm_var": 0.00489800771077474, "learning_rate": 2e-05, "loss": 1.3763, "loss/crossentropy": 2.5087928771972656, "loss/dist_ce": 0.0, "loss/fcd": 1.1875, "loss/idx": 12.0, "loss/logits": 0.18884728848934174, "step": 525 }, { "epoch": 0.007854263102881887, "grad_norm": 0.6796875, "grad_norm_var": 0.00517724355061849, "learning_rate": 2e-05, "loss": 1.3889, "loss/crossentropy": 2.522796869277954, "loss/dist_ce": 0.0, "loss/fcd": 1.2109375, "loss/idx": 12.0, "loss/logits": 0.17794877290725708, "step": 526 }, { "epoch": 0.007869195162012841, "grad_norm": 0.57421875, "grad_norm_var": 0.0051986058553059895, "learning_rate": 2e-05, "loss": 1.1859, "loss/crossentropy": 2.575768232345581, "loss/dist_ce": 0.0, "loss/fcd": 1.03125, "loss/idx": 12.0, "loss/logits": 0.1546417772769928, "step": 527 }, { "epoch": 0.007884127221143795, "grad_norm": 1.2734375, "grad_norm_var": 0.032515462239583334, "learning_rate": 2e-05, "loss": 1.826, "loss/crossentropy": 2.2920806407928467, "loss/dist_ce": 0.0, "loss/fcd": 1.4453125, "loss/idx": 12.0, "loss/logits": 0.38067495822906494, "step": 528 }, { "epoch": 0.00789905928027475, "grad_norm": 0.54296875, "grad_norm_var": 0.03280003865559896, "learning_rate": 2e-05, "loss": 1.3219, "loss/crossentropy": 2.600083351135254, "loss/dist_ce": 0.0, "loss/fcd": 1.140625, "loss/idx": 12.0, "loss/logits": 0.18125802278518677, "step": 529 }, { "epoch": 0.007913991339405704, "grad_norm": 0.71484375, "grad_norm_var": 0.03186944325764974, "learning_rate": 2e-05, "loss": 1.35, "loss/crossentropy": 2.3968639373779297, "loss/dist_ce": 0.0, "loss/fcd": 1.1875, "loss/idx": 12.0, "loss/logits": 0.1625278890132904, "step": 530 }, { "epoch": 0.007928923398536658, "grad_norm": 0.578125, "grad_norm_var": 0.031538836161295575, "learning_rate": 2e-05, "loss": 1.327, "loss/crossentropy": 2.8377633094787598, "loss/dist_ce": 0.0, "loss/fcd": 1.1328125, "loss/idx": 12.0, "loss/logits": 0.1941969245672226, "step": 531 }, { "epoch": 0.007943855457667612, "grad_norm": 0.53515625, "grad_norm_var": 0.032133992513020834, "learning_rate": 2e-05, "loss": 1.2754, "loss/crossentropy": 2.437499523162842, "loss/dist_ce": 0.0, "loss/fcd": 1.1015625, "loss/idx": 12.0, "loss/logits": 0.17379987239837646, "step": 532 }, { "epoch": 0.007958787516798566, "grad_norm": 0.53125, "grad_norm_var": 0.032719357808430986, "learning_rate": 2e-05, "loss": 1.3285, "loss/crossentropy": 2.624368190765381, "loss/dist_ce": 0.0, "loss/fcd": 1.140625, "loss/idx": 12.0, "loss/logits": 0.18789049983024597, "step": 533 }, { "epoch": 0.00797371957592952, "grad_norm": 0.5078125, "grad_norm_var": 0.032999420166015626, "learning_rate": 2e-05, "loss": 1.1739, "loss/crossentropy": 2.702047824859619, "loss/dist_ce": 0.0, "loss/fcd": 1.03125, "loss/idx": 12.0, "loss/logits": 0.14263641834259033, "step": 534 }, { "epoch": 0.007988651635060474, "grad_norm": 0.59375, "grad_norm_var": 0.032731119791666666, "learning_rate": 2e-05, "loss": 1.3157, "loss/crossentropy": 2.7412476539611816, "loss/dist_ce": 0.0, "loss/fcd": 1.140625, "loss/idx": 12.0, "loss/logits": 0.17504537105560303, "step": 535 }, { "epoch": 0.008003583694191428, "grad_norm": 0.66796875, "grad_norm_var": 0.032831827799479164, "learning_rate": 2e-05, "loss": 1.3453, "loss/crossentropy": 2.646535634994507, "loss/dist_ce": 0.0, "loss/fcd": 1.1484375, "loss/idx": 12.0, "loss/logits": 0.19684037566184998, "step": 536 }, { "epoch": 0.008018515753322384, "grad_norm": 0.55859375, "grad_norm_var": 0.03286787668863932, "learning_rate": 2e-05, "loss": 1.3575, "loss/crossentropy": 2.4461722373962402, "loss/dist_ce": 0.0, "loss/fcd": 1.171875, "loss/idx": 12.0, "loss/logits": 0.18560636043548584, "step": 537 }, { "epoch": 0.008033447812453338, "grad_norm": 0.55078125, "grad_norm_var": 0.033014869689941405, "learning_rate": 2e-05, "loss": 1.2235, "loss/crossentropy": 2.7021944522857666, "loss/dist_ce": 0.0, "loss/fcd": 1.0625, "loss/idx": 12.0, "loss/logits": 0.16099202632904053, "step": 538 }, { "epoch": 0.008048379871584292, "grad_norm": 0.52734375, "grad_norm_var": 0.033599599202473955, "learning_rate": 2e-05, "loss": 1.1725, "loss/crossentropy": 2.6571671962738037, "loss/dist_ce": 0.0, "loss/fcd": 1.0234375, "loss/idx": 12.0, "loss/logits": 0.14905613660812378, "step": 539 }, { "epoch": 0.008063311930715246, "grad_norm": 0.5859375, "grad_norm_var": 0.03315575917561849, "learning_rate": 2e-05, "loss": 1.3608, "loss/crossentropy": 2.54116153717041, "loss/dist_ce": 0.0, "loss/fcd": 1.171875, "loss/idx": 12.0, "loss/logits": 0.18891382217407227, "step": 540 }, { "epoch": 0.0080782439898462, "grad_norm": 0.5078125, "grad_norm_var": 0.034057362874348955, "learning_rate": 2e-05, "loss": 1.2093, "loss/crossentropy": 2.552755355834961, "loss/dist_ce": 0.0, "loss/fcd": 1.046875, "loss/idx": 12.0, "loss/logits": 0.16238316893577576, "step": 541 }, { "epoch": 0.008093176048977154, "grad_norm": 0.5703125, "grad_norm_var": 0.033943430582682295, "learning_rate": 2e-05, "loss": 1.21, "loss/crossentropy": 2.474992275238037, "loss/dist_ce": 0.0, "loss/fcd": 1.046875, "loss/idx": 12.0, "loss/logits": 0.16313990950584412, "step": 542 }, { "epoch": 0.008108108108108109, "grad_norm": 0.64453125, "grad_norm_var": 0.03388163248697917, "learning_rate": 2e-05, "loss": 1.4651, "loss/crossentropy": 2.347513198852539, "loss/dist_ce": 0.0, "loss/fcd": 1.2421875, "loss/idx": 12.0, "loss/logits": 0.22293345630168915, "step": 543 }, { "epoch": 0.008123040167239063, "grad_norm": 0.51953125, "grad_norm_var": 0.003536415100097656, "learning_rate": 2e-05, "loss": 1.1549, "loss/crossentropy": 2.6457583904266357, "loss/dist_ce": 0.0, "loss/fcd": 1.015625, "loss/idx": 12.0, "loss/logits": 0.13925421237945557, "step": 544 }, { "epoch": 0.008137972226370017, "grad_norm": 0.52734375, "grad_norm_var": 0.0036101659138997396, "learning_rate": 2e-05, "loss": 1.2341, "loss/crossentropy": 2.5168657302856445, "loss/dist_ce": 0.0, "loss/fcd": 1.0703125, "loss/idx": 12.0, "loss/logits": 0.16381186246871948, "step": 545 }, { "epoch": 0.00815290428550097, "grad_norm": 0.48828125, "grad_norm_var": 0.0024449030558268228, "learning_rate": 2e-05, "loss": 1.1984, "loss/crossentropy": 2.518291711807251, "loss/dist_ce": 0.0, "loss/fcd": 1.046875, "loss/idx": 12.0, "loss/logits": 0.15153710544109344, "step": 546 }, { "epoch": 0.008167836344631925, "grad_norm": 0.65234375, "grad_norm_var": 0.003009033203125, "learning_rate": 2e-05, "loss": 1.42, "loss/crossentropy": 2.566648244857788, "loss/dist_ce": 0.0, "loss/fcd": 1.234375, "loss/idx": 12.0, "loss/logits": 0.18559721112251282, "step": 547 }, { "epoch": 0.008182768403762879, "grad_norm": 0.498046875, "grad_norm_var": 0.0032207330067952475, "learning_rate": 2e-05, "loss": 1.1543, "loss/crossentropy": 2.616701364517212, "loss/dist_ce": 0.0, "loss/fcd": 1.0078125, "loss/idx": 12.0, "loss/logits": 0.14649763703346252, "step": 548 }, { "epoch": 0.008197700462893833, "grad_norm": 0.56640625, "grad_norm_var": 0.003171523412068685, "learning_rate": 2e-05, "loss": 1.2761, "loss/crossentropy": 2.670872688293457, "loss/dist_ce": 0.0, "loss/fcd": 1.109375, "loss/idx": 12.0, "loss/logits": 0.16675950586795807, "step": 549 }, { "epoch": 0.008212632522024787, "grad_norm": 0.55859375, "grad_norm_var": 0.0029764652252197267, "learning_rate": 2e-05, "loss": 1.2667, "loss/crossentropy": 2.361363410949707, "loss/dist_ce": 0.0, "loss/fcd": 1.09375, "loss/idx": 12.0, "loss/logits": 0.172979474067688, "step": 550 }, { "epoch": 0.008227564581155741, "grad_norm": 0.5078125, "grad_norm_var": 0.0030925591786702474, "learning_rate": 2e-05, "loss": 1.3208, "loss/crossentropy": 2.6893482208251953, "loss/dist_ce": 0.0, "loss/fcd": 1.140625, "loss/idx": 12.0, "loss/logits": 0.18016394972801208, "step": 551 }, { "epoch": 0.008242496640286695, "grad_norm": 0.53515625, "grad_norm_var": 0.0022516727447509767, "learning_rate": 2e-05, "loss": 1.2538, "loss/crossentropy": 2.5788474082946777, "loss/dist_ce": 0.0, "loss/fcd": 1.0859375, "loss/idx": 12.0, "loss/logits": 0.16790160536766052, "step": 552 }, { "epoch": 0.00825742869941765, "grad_norm": 0.53515625, "grad_norm_var": 0.002258920669555664, "learning_rate": 2e-05, "loss": 1.2469, "loss/crossentropy": 2.7054810523986816, "loss/dist_ce": 0.0, "loss/fcd": 1.0859375, "loss/idx": 12.0, "loss/logits": 0.16093963384628296, "step": 553 }, { "epoch": 0.008272360758548604, "grad_norm": 0.7421875, "grad_norm_var": 0.004607884089152018, "learning_rate": 2e-05, "loss": 1.4921, "loss/crossentropy": 2.5595617294311523, "loss/dist_ce": 0.0, "loss/fcd": 1.25, "loss/idx": 12.0, "loss/logits": 0.2420843541622162, "step": 554 }, { "epoch": 0.008287292817679558, "grad_norm": 0.53125, "grad_norm_var": 0.004591608047485351, "learning_rate": 2e-05, "loss": 1.2663, "loss/crossentropy": 3.0859930515289307, "loss/dist_ce": 0.0, "loss/fcd": 1.1015625, "loss/idx": 12.0, "loss/logits": 0.16476529836654663, "step": 555 }, { "epoch": 0.008302224876810512, "grad_norm": 0.546875, "grad_norm_var": 0.004555368423461914, "learning_rate": 2e-05, "loss": 1.3437, "loss/crossentropy": 2.519860029220581, "loss/dist_ce": 0.0, "loss/fcd": 1.1484375, "loss/idx": 12.0, "loss/logits": 0.1952182948589325, "step": 556 }, { "epoch": 0.008317156935941466, "grad_norm": 0.50390625, "grad_norm_var": 0.0045825799306233725, "learning_rate": 2e-05, "loss": 1.2765, "loss/crossentropy": 2.5596110820770264, "loss/dist_ce": 0.0, "loss/fcd": 1.1015625, "loss/idx": 12.0, "loss/logits": 0.17497843503952026, "step": 557 }, { "epoch": 0.00833208899507242, "grad_norm": 0.494140625, "grad_norm_var": 0.004819997151692708, "learning_rate": 2e-05, "loss": 1.1638, "loss/crossentropy": 2.6705644130706787, "loss/dist_ce": 0.0, "loss/fcd": 1.0078125, "loss/idx": 12.0, "loss/logits": 0.1560230553150177, "step": 558 }, { "epoch": 0.008347021054203374, "grad_norm": 0.53125, "grad_norm_var": 0.004242897033691406, "learning_rate": 2e-05, "loss": 1.2259, "loss/crossentropy": 2.432647466659546, "loss/dist_ce": 0.0, "loss/fcd": 1.0703125, "loss/idx": 12.0, "loss/logits": 0.1556343138217926, "step": 559 }, { "epoch": 0.008361953113334328, "grad_norm": 0.58203125, "grad_norm_var": 0.0042652765909830725, "learning_rate": 2e-05, "loss": 1.3295, "loss/crossentropy": 2.516251564025879, "loss/dist_ce": 0.0, "loss/fcd": 1.140625, "loss/idx": 12.0, "loss/logits": 0.18889586627483368, "step": 560 }, { "epoch": 0.008376885172465282, "grad_norm": 1.0390625, "grad_norm_var": 0.019082132975260416, "learning_rate": 2e-05, "loss": 1.6007, "loss/crossentropy": 2.3152377605438232, "loss/dist_ce": 0.0, "loss/fcd": 1.34375, "loss/idx": 12.0, "loss/logits": 0.2569894790649414, "step": 561 }, { "epoch": 0.008391817231596236, "grad_norm": 0.55859375, "grad_norm_var": 0.018512217203776042, "learning_rate": 2e-05, "loss": 1.1934, "loss/crossentropy": 2.74574875831604, "loss/dist_ce": 0.0, "loss/fcd": 1.0390625, "loss/idx": 12.0, "loss/logits": 0.15431912243366241, "step": 562 }, { "epoch": 0.008406749290727192, "grad_norm": 0.494140625, "grad_norm_var": 0.01868602434794108, "learning_rate": 2e-05, "loss": 1.2298, "loss/crossentropy": 2.555695056915283, "loss/dist_ce": 0.0, "loss/fcd": 1.078125, "loss/idx": 12.0, "loss/logits": 0.1516464799642563, "step": 563 }, { "epoch": 0.008421681349858146, "grad_norm": 0.5390625, "grad_norm_var": 0.018361918131510415, "learning_rate": 2e-05, "loss": 1.1969, "loss/crossentropy": 2.6115164756774902, "loss/dist_ce": 0.0, "loss/fcd": 1.046875, "loss/idx": 12.0, "loss/logits": 0.1500345766544342, "step": 564 }, { "epoch": 0.0084366134089891, "grad_norm": 0.5859375, "grad_norm_var": 0.018352699279785157, "learning_rate": 2e-05, "loss": 1.311, "loss/crossentropy": 2.468400478363037, "loss/dist_ce": 0.0, "loss/fcd": 1.140625, "loss/idx": 12.0, "loss/logits": 0.17037324607372284, "step": 565 }, { "epoch": 0.008451545468120054, "grad_norm": 0.62890625, "grad_norm_var": 0.018457984924316405, "learning_rate": 2e-05, "loss": 1.2793, "loss/crossentropy": 2.5954904556274414, "loss/dist_ce": 0.0, "loss/fcd": 1.1015625, "loss/idx": 12.0, "loss/logits": 0.17773208022117615, "step": 566 }, { "epoch": 0.008466477527251009, "grad_norm": 0.57421875, "grad_norm_var": 0.01805267333984375, "learning_rate": 2e-05, "loss": 1.3878, "loss/crossentropy": 2.4109535217285156, "loss/dist_ce": 0.0, "loss/fcd": 1.1953125, "loss/idx": 12.0, "loss/logits": 0.19250379502773285, "step": 567 }, { "epoch": 0.008481409586381963, "grad_norm": 0.4921875, "grad_norm_var": 0.018475786844889323, "learning_rate": 2e-05, "loss": 1.1878, "loss/crossentropy": 2.3730239868164062, "loss/dist_ce": 0.0, "loss/fcd": 1.0390625, "loss/idx": 12.0, "loss/logits": 0.14876341819763184, "step": 568 }, { "epoch": 0.008496341645512917, "grad_norm": 0.5625, "grad_norm_var": 0.01833648681640625, "learning_rate": 2e-05, "loss": 1.3256, "loss/crossentropy": 2.475072145462036, "loss/dist_ce": 0.0, "loss/fcd": 1.140625, "loss/idx": 12.0, "loss/logits": 0.18494603037834167, "step": 569 }, { "epoch": 0.00851127370464387, "grad_norm": 0.5234375, "grad_norm_var": 0.016826883951822916, "learning_rate": 2e-05, "loss": 1.153, "loss/crossentropy": 2.537174940109253, "loss/dist_ce": 0.0, "loss/fcd": 1.0078125, "loss/idx": 12.0, "loss/logits": 0.1451968550682068, "step": 570 }, { "epoch": 0.008526205763774825, "grad_norm": 0.51171875, "grad_norm_var": 0.016962623596191405, "learning_rate": 2e-05, "loss": 1.2862, "loss/crossentropy": 2.5827090740203857, "loss/dist_ce": 0.0, "loss/fcd": 1.109375, "loss/idx": 12.0, "loss/logits": 0.1767972707748413, "step": 571 }, { "epoch": 0.008541137822905779, "grad_norm": 0.57421875, "grad_norm_var": 0.01691411336263021, "learning_rate": 2e-05, "loss": 1.3106, "loss/crossentropy": 2.3603527545928955, "loss/dist_ce": 0.0, "loss/fcd": 1.1328125, "loss/idx": 12.0, "loss/logits": 0.17780755460262299, "step": 572 }, { "epoch": 0.008556069882036733, "grad_norm": 0.5390625, "grad_norm_var": 0.01665948232014974, "learning_rate": 2e-05, "loss": 1.2905, "loss/crossentropy": 2.3075900077819824, "loss/dist_ce": 0.0, "loss/fcd": 1.125, "loss/idx": 12.0, "loss/logits": 0.16551288962364197, "step": 573 }, { "epoch": 0.008571001941167687, "grad_norm": 0.62109375, "grad_norm_var": 0.01626585324605306, "learning_rate": 2e-05, "loss": 1.3477, "loss/crossentropy": 2.506995439529419, "loss/dist_ce": 0.0, "loss/fcd": 1.15625, "loss/idx": 12.0, "loss/logits": 0.19144755601882935, "step": 574 }, { "epoch": 0.008585934000298641, "grad_norm": 0.71484375, "grad_norm_var": 0.01706070899963379, "learning_rate": 2e-05, "loss": 1.5414, "loss/crossentropy": 2.3521230220794678, "loss/dist_ce": 0.0, "loss/fcd": 1.3203125, "loss/idx": 12.0, "loss/logits": 0.2211102545261383, "step": 575 }, { "epoch": 0.008600866059429595, "grad_norm": 0.51953125, "grad_norm_var": 0.01742386817932129, "learning_rate": 2e-05, "loss": 1.3131, "loss/crossentropy": 2.6059999465942383, "loss/dist_ce": 0.0, "loss/fcd": 1.125, "loss/idx": 12.0, "loss/logits": 0.18809755146503448, "step": 576 }, { "epoch": 0.00861579811856055, "grad_norm": 0.59375, "grad_norm_var": 0.0032976627349853515, "learning_rate": 2e-05, "loss": 1.403, "loss/crossentropy": 2.578143835067749, "loss/dist_ce": 0.0, "loss/fcd": 1.2109375, "loss/idx": 12.0, "loss/logits": 0.19208408892154694, "step": 577 }, { "epoch": 0.008630730177691504, "grad_norm": 0.52734375, "grad_norm_var": 0.0033836205800374347, "learning_rate": 2e-05, "loss": 1.2463, "loss/crossentropy": 2.582569122314453, "loss/dist_ce": 0.0, "loss/fcd": 1.078125, "loss/idx": 12.0, "loss/logits": 0.16822180151939392, "step": 578 }, { "epoch": 0.008645662236822458, "grad_norm": 0.65234375, "grad_norm_var": 0.0035033543904622396, "learning_rate": 2e-05, "loss": 1.4421, "loss/crossentropy": 2.4856603145599365, "loss/dist_ce": 0.0, "loss/fcd": 1.21875, "loss/idx": 12.0, "loss/logits": 0.22337768971920013, "step": 579 }, { "epoch": 0.008660594295953412, "grad_norm": 0.498046875, "grad_norm_var": 0.0037914117177327475, "learning_rate": 2e-05, "loss": 1.2455, "loss/crossentropy": 2.372971534729004, "loss/dist_ce": 0.0, "loss/fcd": 1.09375, "loss/idx": 12.0, "loss/logits": 0.15174807608127594, "step": 580 }, { "epoch": 0.008675526355084366, "grad_norm": 0.53125, "grad_norm_var": 0.003861729303995768, "learning_rate": 2e-05, "loss": 1.2073, "loss/crossentropy": 2.7692155838012695, "loss/dist_ce": 0.0, "loss/fcd": 1.046875, "loss/idx": 12.0, "loss/logits": 0.1604694277048111, "step": 581 }, { "epoch": 0.00869045841421532, "grad_norm": 0.6015625, "grad_norm_var": 0.003681039810180664, "learning_rate": 2e-05, "loss": 1.2814, "loss/crossentropy": 2.6545069217681885, "loss/dist_ce": 0.0, "loss/fcd": 1.109375, "loss/idx": 12.0, "loss/logits": 0.17197799682617188, "step": 582 }, { "epoch": 0.008705390473346274, "grad_norm": 0.462890625, "grad_norm_var": 0.004316139221191406, "learning_rate": 2e-05, "loss": 1.1049, "loss/crossentropy": 2.683872938156128, "loss/dist_ce": 0.0, "loss/fcd": 0.97265625, "loss/idx": 12.0, "loss/logits": 0.13223010301589966, "step": 583 }, { "epoch": 0.008720322532477228, "grad_norm": 0.5625, "grad_norm_var": 0.004009437561035156, "learning_rate": 2e-05, "loss": 1.311, "loss/crossentropy": 2.4532418251037598, "loss/dist_ce": 0.0, "loss/fcd": 1.1328125, "loss/idx": 12.0, "loss/logits": 0.17814823985099792, "step": 584 }, { "epoch": 0.008735254591608182, "grad_norm": 0.50390625, "grad_norm_var": 0.00422210693359375, "learning_rate": 2e-05, "loss": 1.2304, "loss/crossentropy": 2.5566062927246094, "loss/dist_ce": 0.0, "loss/fcd": 1.0625, "loss/idx": 12.0, "loss/logits": 0.16788913309574127, "step": 585 }, { "epoch": 0.008750186650739136, "grad_norm": 0.53125, "grad_norm_var": 0.004189300537109375, "learning_rate": 2e-05, "loss": 1.2254, "loss/crossentropy": 2.733366012573242, "loss/dist_ce": 0.0, "loss/fcd": 1.0546875, "loss/idx": 12.0, "loss/logits": 0.17072361707687378, "step": 586 }, { "epoch": 0.00876511870987009, "grad_norm": 0.6640625, "grad_norm_var": 0.004677772521972656, "learning_rate": 2e-05, "loss": 1.2663, "loss/crossentropy": 2.689211845397949, "loss/dist_ce": 0.0, "loss/fcd": 1.0859375, "loss/idx": 12.0, "loss/logits": 0.1803634762763977, "step": 587 }, { "epoch": 0.008780050769001044, "grad_norm": 0.53125, "grad_norm_var": 0.0047609965006510414, "learning_rate": 2e-05, "loss": 1.1999, "loss/crossentropy": 2.5483739376068115, "loss/dist_ce": 0.0, "loss/fcd": 1.0390625, "loss/idx": 12.0, "loss/logits": 0.16079798340797424, "step": 588 }, { "epoch": 0.008794982828131999, "grad_norm": 0.67578125, "grad_norm_var": 0.00543969472249349, "learning_rate": 2e-05, "loss": 1.2913, "loss/crossentropy": 2.7356832027435303, "loss/dist_ce": 0.0, "loss/fcd": 1.125, "loss/idx": 12.0, "loss/logits": 0.16630901396274567, "step": 589 }, { "epoch": 0.008809914887262954, "grad_norm": 0.65234375, "grad_norm_var": 0.005695025126139323, "learning_rate": 2e-05, "loss": 1.3674, "loss/crossentropy": 2.607666015625, "loss/dist_ce": 0.0, "loss/fcd": 1.1796875, "loss/idx": 12.0, "loss/logits": 0.18775686621665955, "step": 590 }, { "epoch": 0.008824846946393908, "grad_norm": 0.494140625, "grad_norm_var": 0.004665867487589518, "learning_rate": 2e-05, "loss": 1.2201, "loss/crossentropy": 2.525197982788086, "loss/dist_ce": 0.0, "loss/fcd": 1.0625, "loss/idx": 12.0, "loss/logits": 0.15761615335941315, "step": 591 }, { "epoch": 0.008839779005524863, "grad_norm": 0.55078125, "grad_norm_var": 0.0045473575592041016, "learning_rate": 2e-05, "loss": 1.2657, "loss/crossentropy": 2.3301258087158203, "loss/dist_ce": 0.0, "loss/fcd": 1.078125, "loss/idx": 12.0, "loss/logits": 0.18756815791130066, "step": 592 }, { "epoch": 0.008854711064655817, "grad_norm": 0.5546875, "grad_norm_var": 0.004490772883097331, "learning_rate": 2e-05, "loss": 1.2597, "loss/crossentropy": 2.576165199279785, "loss/dist_ce": 0.0, "loss/fcd": 1.0859375, "loss/idx": 12.0, "loss/logits": 0.17378875613212585, "step": 593 }, { "epoch": 0.00886964312378677, "grad_norm": 0.49609375, "grad_norm_var": 0.004696766535441081, "learning_rate": 2e-05, "loss": 1.1644, "loss/crossentropy": 2.6105165481567383, "loss/dist_ce": 0.0, "loss/fcd": 1.015625, "loss/idx": 12.0, "loss/logits": 0.14877916872501373, "step": 594 }, { "epoch": 0.008884575182917725, "grad_norm": 0.59375, "grad_norm_var": 0.0041913191477457685, "learning_rate": 2e-05, "loss": 1.2399, "loss/crossentropy": 2.6493771076202393, "loss/dist_ce": 0.0, "loss/fcd": 1.078125, "loss/idx": 12.0, "loss/logits": 0.16178762912750244, "step": 595 }, { "epoch": 0.008899507242048679, "grad_norm": 0.50390625, "grad_norm_var": 0.004147783915201823, "learning_rate": 2e-05, "loss": 1.1916, "loss/crossentropy": 2.4031014442443848, "loss/dist_ce": 0.0, "loss/fcd": 1.0390625, "loss/idx": 12.0, "loss/logits": 0.15257549285888672, "step": 596 }, { "epoch": 0.008914439301179633, "grad_norm": 0.53125, "grad_norm_var": 0.004147783915201823, "learning_rate": 2e-05, "loss": 1.219, "loss/crossentropy": 2.6648566722869873, "loss/dist_ce": 0.0, "loss/fcd": 1.0546875, "loss/idx": 12.0, "loss/logits": 0.16428467631340027, "step": 597 }, { "epoch": 0.008929371360310587, "grad_norm": 0.478515625, "grad_norm_var": 0.004361073176066081, "learning_rate": 2e-05, "loss": 1.1408, "loss/crossentropy": 2.4159302711486816, "loss/dist_ce": 0.0, "loss/fcd": 0.99609375, "loss/idx": 12.0, "loss/logits": 0.14472083747386932, "step": 598 }, { "epoch": 0.008944303419441541, "grad_norm": 0.62890625, "grad_norm_var": 0.00417327880859375, "learning_rate": 2e-05, "loss": 1.3695, "loss/crossentropy": 2.6700379848480225, "loss/dist_ce": 0.0, "loss/fcd": 1.1640625, "loss/idx": 12.0, "loss/logits": 0.20542120933532715, "step": 599 }, { "epoch": 0.008959235478572495, "grad_norm": 0.55859375, "grad_norm_var": 0.004172706604003906, "learning_rate": 2e-05, "loss": 1.2161, "loss/crossentropy": 2.6093008518218994, "loss/dist_ce": 0.0, "loss/fcd": 1.0625, "loss/idx": 12.0, "loss/logits": 0.15361179411411285, "step": 600 }, { "epoch": 0.00897416753770345, "grad_norm": 0.5703125, "grad_norm_var": 0.003957621256510417, "learning_rate": 2e-05, "loss": 1.322, "loss/crossentropy": 2.6258511543273926, "loss/dist_ce": 0.0, "loss/fcd": 1.140625, "loss/idx": 12.0, "loss/logits": 0.18138790130615234, "step": 601 }, { "epoch": 0.008989099596834403, "grad_norm": 0.640625, "grad_norm_var": 0.004235331217447917, "learning_rate": 2e-05, "loss": 1.521, "loss/crossentropy": 2.5401480197906494, "loss/dist_ce": 0.0, "loss/fcd": 1.296875, "loss/idx": 12.0, "loss/logits": 0.22416627407073975, "step": 602 }, { "epoch": 0.009004031655965358, "grad_norm": 0.58203125, "grad_norm_var": 0.003630510965983073, "learning_rate": 2e-05, "loss": 1.3557, "loss/crossentropy": 2.61795973777771, "loss/dist_ce": 0.0, "loss/fcd": 1.171875, "loss/idx": 12.0, "loss/logits": 0.1838516741991043, "step": 603 }, { "epoch": 0.009018963715096312, "grad_norm": 0.55859375, "grad_norm_var": 0.0035535176595052082, "learning_rate": 2e-05, "loss": 1.2715, "loss/crossentropy": 2.5267536640167236, "loss/dist_ce": 0.0, "loss/fcd": 1.109375, "loss/idx": 12.0, "loss/logits": 0.16214627027511597, "step": 604 }, { "epoch": 0.009033895774227266, "grad_norm": 0.5703125, "grad_norm_var": 0.0027175267537434896, "learning_rate": 2e-05, "loss": 1.2625, "loss/crossentropy": 2.503329038619995, "loss/dist_ce": 0.0, "loss/fcd": 1.1015625, "loss/idx": 12.0, "loss/logits": 0.1608983874320984, "step": 605 }, { "epoch": 0.00904882783335822, "grad_norm": 0.5703125, "grad_norm_var": 0.0021313985188802084, "learning_rate": 2e-05, "loss": 1.3283, "loss/crossentropy": 2.4117980003356934, "loss/dist_ce": 0.0, "loss/fcd": 1.15625, "loss/idx": 12.0, "loss/logits": 0.1720181107521057, "step": 606 }, { "epoch": 0.009063759892489174, "grad_norm": 0.55078125, "grad_norm_var": 0.0018709659576416015, "learning_rate": 2e-05, "loss": 1.2359, "loss/crossentropy": 2.691850423812866, "loss/dist_ce": 0.0, "loss/fcd": 1.0625, "loss/idx": 12.0, "loss/logits": 0.1734330952167511, "step": 607 }, { "epoch": 0.009078691951620128, "grad_norm": 0.52734375, "grad_norm_var": 0.001930093765258789, "learning_rate": 2e-05, "loss": 1.2051, "loss/crossentropy": 2.514829635620117, "loss/dist_ce": 0.0, "loss/fcd": 1.046875, "loss/idx": 12.0, "loss/logits": 0.15824373066425323, "step": 608 }, { "epoch": 0.009093624010751082, "grad_norm": 0.52734375, "grad_norm_var": 0.0019861698150634766, "learning_rate": 2e-05, "loss": 1.2934, "loss/crossentropy": 2.523049831390381, "loss/dist_ce": 0.0, "loss/fcd": 1.1171875, "loss/idx": 12.0, "loss/logits": 0.17622219026088715, "step": 609 }, { "epoch": 0.009108556069882036, "grad_norm": 0.5078125, "grad_norm_var": 0.001901865005493164, "learning_rate": 2e-05, "loss": 1.1657, "loss/crossentropy": 2.4890053272247314, "loss/dist_ce": 0.0, "loss/fcd": 1.0234375, "loss/idx": 12.0, "loss/logits": 0.14229866862297058, "step": 610 }, { "epoch": 0.00912348812901299, "grad_norm": 0.546875, "grad_norm_var": 0.0018049716949462891, "learning_rate": 2e-05, "loss": 1.279, "loss/crossentropy": 2.4220166206359863, "loss/dist_ce": 0.0, "loss/fcd": 1.1015625, "loss/idx": 12.0, "loss/logits": 0.1774497777223587, "step": 611 }, { "epoch": 0.009138420188143944, "grad_norm": 0.53515625, "grad_norm_var": 0.0016600131988525391, "learning_rate": 2e-05, "loss": 1.2903, "loss/crossentropy": 2.6942455768585205, "loss/dist_ce": 0.0, "loss/fcd": 1.109375, "loss/idx": 12.0, "loss/logits": 0.1808762103319168, "step": 612 }, { "epoch": 0.009153352247274898, "grad_norm": 0.5234375, "grad_norm_var": 0.0016888777414957683, "learning_rate": 2e-05, "loss": 1.3349, "loss/crossentropy": 2.5151455402374268, "loss/dist_ce": 0.0, "loss/fcd": 1.15625, "loss/idx": 12.0, "loss/logits": 0.17866647243499756, "step": 613 }, { "epoch": 0.009168284306405853, "grad_norm": 0.65234375, "grad_norm_var": 0.0018091201782226562, "learning_rate": 2e-05, "loss": 1.3485, "loss/crossentropy": 2.5160040855407715, "loss/dist_ce": 0.0, "loss/fcd": 1.15625, "loss/idx": 12.0, "loss/logits": 0.19221991300582886, "step": 614 }, { "epoch": 0.009183216365536807, "grad_norm": 0.546875, "grad_norm_var": 0.0015380859375, "learning_rate": 2e-05, "loss": 1.2869, "loss/crossentropy": 2.493112087249756, "loss/dist_ce": 0.0, "loss/fcd": 1.125, "loss/idx": 12.0, "loss/logits": 0.16190239787101746, "step": 615 }, { "epoch": 0.009198148424667763, "grad_norm": 0.66015625, "grad_norm_var": 0.0021563212076822916, "learning_rate": 2e-05, "loss": 1.2114, "loss/crossentropy": 2.559445381164551, "loss/dist_ce": 0.0, "loss/fcd": 1.0625, "loss/idx": 12.0, "loss/logits": 0.14888477325439453, "step": 616 }, { "epoch": 0.009213080483798717, "grad_norm": 0.609375, "grad_norm_var": 0.002269490559895833, "learning_rate": 2e-05, "loss": 1.3424, "loss/crossentropy": 2.2312510013580322, "loss/dist_ce": 0.0, "loss/fcd": 1.1875, "loss/idx": 12.0, "loss/logits": 0.15492798388004303, "step": 617 }, { "epoch": 0.00922801254292967, "grad_norm": 0.5078125, "grad_norm_var": 0.002109527587890625, "learning_rate": 2e-05, "loss": 1.1577, "loss/crossentropy": 2.72347354888916, "loss/dist_ce": 0.0, "loss/fcd": 1.0078125, "loss/idx": 12.0, "loss/logits": 0.14986461400985718, "step": 618 }, { "epoch": 0.009242944602060625, "grad_norm": 0.51953125, "grad_norm_var": 0.0021787007649739583, "learning_rate": 2e-05, "loss": 1.3219, "loss/crossentropy": 2.683133125305176, "loss/dist_ce": 0.0, "loss/fcd": 1.140625, "loss/idx": 12.0, "loss/logits": 0.18123763799667358, "step": 619 }, { "epoch": 0.009257876661191579, "grad_norm": 0.640625, "grad_norm_var": 0.0026152928670247397, "learning_rate": 2e-05, "loss": 1.2966, "loss/crossentropy": 2.640098810195923, "loss/dist_ce": 0.0, "loss/fcd": 1.125, "loss/idx": 12.0, "loss/logits": 0.17159606516361237, "step": 620 }, { "epoch": 0.009272808720322533, "grad_norm": 0.50390625, "grad_norm_var": 0.0028195699055989583, "learning_rate": 2e-05, "loss": 1.1244, "loss/crossentropy": 2.522538661956787, "loss/dist_ce": 0.0, "loss/fcd": 0.9921875, "loss/idx": 12.0, "loss/logits": 0.13216978311538696, "step": 621 }, { "epoch": 0.009287740779453487, "grad_norm": 0.55859375, "grad_norm_var": 0.0028090794881184896, "learning_rate": 2e-05, "loss": 1.2727, "loss/crossentropy": 2.597336530685425, "loss/dist_ce": 0.0, "loss/fcd": 1.1015625, "loss/idx": 12.0, "loss/logits": 0.171115443110466, "step": 622 }, { "epoch": 0.009302672838584441, "grad_norm": 1.625, "grad_norm_var": 0.07398656209309896, "learning_rate": 2e-05, "loss": 1.4228, "loss/crossentropy": 3.3440957069396973, "loss/dist_ce": 0.0, "loss/fcd": 1.25, "loss/idx": 12.0, "loss/logits": 0.1728263944387436, "step": 623 }, { "epoch": 0.009317604897715395, "grad_norm": 0.5859375, "grad_norm_var": 0.073442014058431, "learning_rate": 2e-05, "loss": 1.2968, "loss/crossentropy": 2.5757291316986084, "loss/dist_ce": 0.0, "loss/fcd": 1.125, "loss/idx": 12.0, "loss/logits": 0.1718222051858902, "step": 624 }, { "epoch": 0.00933253695684635, "grad_norm": 0.53515625, "grad_norm_var": 0.0733407974243164, "learning_rate": 2e-05, "loss": 1.2285, "loss/crossentropy": 2.482468605041504, "loss/dist_ce": 0.0, "loss/fcd": 1.0703125, "loss/idx": 12.0, "loss/logits": 0.1582183539867401, "step": 625 }, { "epoch": 0.009347469015977303, "grad_norm": 0.5703125, "grad_norm_var": 0.0725778579711914, "learning_rate": 2e-05, "loss": 1.4247, "loss/crossentropy": 2.2671515941619873, "loss/dist_ce": 0.0, "loss/fcd": 1.203125, "loss/idx": 12.0, "loss/logits": 0.2216186374425888, "step": 626 }, { "epoch": 0.009362401075108258, "grad_norm": 0.56640625, "grad_norm_var": 0.0723785400390625, "learning_rate": 2e-05, "loss": 1.2715, "loss/crossentropy": 2.383666515350342, "loss/dist_ce": 0.0, "loss/fcd": 1.078125, "loss/idx": 12.0, "loss/logits": 0.19337865710258484, "step": 627 }, { "epoch": 0.009377333134239212, "grad_norm": 0.484375, "grad_norm_var": 0.07320753733317058, "learning_rate": 2e-05, "loss": 1.1818, "loss/crossentropy": 2.647897243499756, "loss/dist_ce": 0.0, "loss/fcd": 1.03125, "loss/idx": 12.0, "loss/logits": 0.15054702758789062, "step": 628 }, { "epoch": 0.009392265193370166, "grad_norm": 0.53515625, "grad_norm_var": 0.07304865519205729, "learning_rate": 2e-05, "loss": 1.2364, "loss/crossentropy": 2.7212166786193848, "loss/dist_ce": 0.0, "loss/fcd": 1.0703125, "loss/idx": 12.0, "loss/logits": 0.16603776812553406, "step": 629 }, { "epoch": 0.00940719725250112, "grad_norm": 0.54296875, "grad_norm_var": 0.07349014282226562, "learning_rate": 2e-05, "loss": 1.3161, "loss/crossentropy": 2.613542318344116, "loss/dist_ce": 0.0, "loss/fcd": 1.125, "loss/idx": 12.0, "loss/logits": 0.19105498492717743, "step": 630 }, { "epoch": 0.009422129311632074, "grad_norm": 0.5625, "grad_norm_var": 0.07334365844726562, "learning_rate": 2e-05, "loss": 1.2691, "loss/crossentropy": 2.55253529548645, "loss/dist_ce": 0.0, "loss/fcd": 1.1015625, "loss/idx": 12.0, "loss/logits": 0.16752402484416962, "step": 631 }, { "epoch": 0.009437061370763028, "grad_norm": 0.6640625, "grad_norm_var": 0.07336266835530598, "learning_rate": 2e-05, "loss": 1.2329, "loss/crossentropy": 2.688462734222412, "loss/dist_ce": 0.0, "loss/fcd": 1.0625, "loss/idx": 12.0, "loss/logits": 0.1703636348247528, "step": 632 }, { "epoch": 0.009451993429893982, "grad_norm": 0.55078125, "grad_norm_var": 0.07370503743489583, "learning_rate": 2e-05, "loss": 1.2271, "loss/crossentropy": 2.488166570663452, "loss/dist_ce": 0.0, "loss/fcd": 1.0625, "loss/idx": 12.0, "loss/logits": 0.16464681923389435, "step": 633 }, { "epoch": 0.009466925489024936, "grad_norm": 0.50390625, "grad_norm_var": 0.073765500386556, "learning_rate": 2e-05, "loss": 1.226, "loss/crossentropy": 2.471609115600586, "loss/dist_ce": 0.0, "loss/fcd": 1.0703125, "loss/idx": 12.0, "loss/logits": 0.155724436044693, "step": 634 }, { "epoch": 0.00948185754815589, "grad_norm": 0.546875, "grad_norm_var": 0.0734392801920573, "learning_rate": 2e-05, "loss": 1.2426, "loss/crossentropy": 2.4193055629730225, "loss/dist_ce": 0.0, "loss/fcd": 1.0859375, "loss/idx": 12.0, "loss/logits": 0.15662391483783722, "step": 635 }, { "epoch": 0.009496789607286844, "grad_norm": 0.53515625, "grad_norm_var": 0.07389418284098308, "learning_rate": 2e-05, "loss": 1.2934, "loss/crossentropy": 2.5824780464172363, "loss/dist_ce": 0.0, "loss/fcd": 1.109375, "loss/idx": 12.0, "loss/logits": 0.18399065732955933, "step": 636 }, { "epoch": 0.009511721666417798, "grad_norm": 0.61328125, "grad_norm_var": 0.07299340565999349, "learning_rate": 2e-05, "loss": 1.2094, "loss/crossentropy": 2.6305127143859863, "loss/dist_ce": 0.0, "loss/fcd": 1.0546875, "loss/idx": 12.0, "loss/logits": 0.15475600957870483, "step": 637 }, { "epoch": 0.009526653725548753, "grad_norm": 0.59375, "grad_norm_var": 0.07276509602864584, "learning_rate": 2e-05, "loss": 1.2025, "loss/crossentropy": 2.63283371925354, "loss/dist_ce": 0.0, "loss/fcd": 1.0390625, "loss/idx": 12.0, "loss/logits": 0.163466677069664, "step": 638 }, { "epoch": 0.009541585784679707, "grad_norm": 0.5, "grad_norm_var": 0.0020131429036458334, "learning_rate": 2e-05, "loss": 1.1414, "loss/crossentropy": 2.563518762588501, "loss/dist_ce": 0.0, "loss/fcd": 1.0, "loss/idx": 12.0, "loss/logits": 0.14137643575668335, "step": 639 }, { "epoch": 0.00955651784381066, "grad_norm": 0.5234375, "grad_norm_var": 0.0020050048828125, "learning_rate": 2e-05, "loss": 1.143, "loss/crossentropy": 2.7085118293762207, "loss/dist_ce": 0.0, "loss/fcd": 1.0, "loss/idx": 12.0, "loss/logits": 0.14298370480537415, "step": 640 }, { "epoch": 0.009571449902941615, "grad_norm": 0.482421875, "grad_norm_var": 0.0022955417633056642, "learning_rate": 2e-05, "loss": 1.0613, "loss/crossentropy": 2.453303098678589, "loss/dist_ce": 0.0, "loss/fcd": 0.94140625, "loss/idx": 12.0, "loss/logits": 0.11989939212799072, "step": 641 }, { "epoch": 0.00958638196207257, "grad_norm": 0.55859375, "grad_norm_var": 0.0022699832916259766, "learning_rate": 2e-05, "loss": 1.3519, "loss/crossentropy": 2.531151294708252, "loss/dist_ce": 0.0, "loss/fcd": 1.1484375, "loss/idx": 12.0, "loss/logits": 0.20343336462974548, "step": 642 }, { "epoch": 0.009601314021203525, "grad_norm": 0.5078125, "grad_norm_var": 0.0023386478424072266, "learning_rate": 2e-05, "loss": 1.226, "loss/crossentropy": 2.747368097305298, "loss/dist_ce": 0.0, "loss/fcd": 1.0625, "loss/idx": 12.0, "loss/logits": 0.1635451763868332, "step": 643 }, { "epoch": 0.009616246080334479, "grad_norm": 0.490234375, "grad_norm_var": 0.002294158935546875, "learning_rate": 2e-05, "loss": 1.1569, "loss/crossentropy": 2.6408183574676514, "loss/dist_ce": 0.0, "loss/fcd": 1.0078125, "loss/idx": 12.0, "loss/logits": 0.14904196560382843, "step": 644 }, { "epoch": 0.009631178139465433, "grad_norm": 0.578125, "grad_norm_var": 0.002356402079264323, "learning_rate": 2e-05, "loss": 1.292, "loss/crossentropy": 2.741863489151001, "loss/dist_ce": 0.0, "loss/fcd": 1.125, "loss/idx": 12.0, "loss/logits": 0.16696280241012573, "step": 645 }, { "epoch": 0.009646110198596387, "grad_norm": 0.5078125, "grad_norm_var": 0.002453104654947917, "learning_rate": 2e-05, "loss": 1.2767, "loss/crossentropy": 2.5982935428619385, "loss/dist_ce": 0.0, "loss/fcd": 1.1015625, "loss/idx": 12.0, "loss/logits": 0.17511314153671265, "step": 646 }, { "epoch": 0.009661042257727341, "grad_norm": 0.59375, "grad_norm_var": 0.0025873819986979166, "learning_rate": 2e-05, "loss": 1.3794, "loss/crossentropy": 2.221534252166748, "loss/dist_ce": 0.0, "loss/fcd": 1.1953125, "loss/idx": 12.0, "loss/logits": 0.18411913514137268, "step": 647 }, { "epoch": 0.009675974316858295, "grad_norm": 0.5078125, "grad_norm_var": 0.0016718546549479166, "learning_rate": 2e-05, "loss": 1.2579, "loss/crossentropy": 2.5770785808563232, "loss/dist_ce": 0.0, "loss/fcd": 1.0859375, "loss/idx": 12.0, "loss/logits": 0.17191347479820251, "step": 648 }, { "epoch": 0.00969090637598925, "grad_norm": 0.5390625, "grad_norm_var": 0.001659075419108073, "learning_rate": 2e-05, "loss": 1.1903, "loss/crossentropy": 2.5759243965148926, "loss/dist_ce": 0.0, "loss/fcd": 1.0390625, "loss/idx": 12.0, "loss/logits": 0.15121833980083466, "step": 649 }, { "epoch": 0.009705838435120203, "grad_norm": 0.53515625, "grad_norm_var": 0.0015848159790039062, "learning_rate": 2e-05, "loss": 1.2818, "loss/crossentropy": 2.791914224624634, "loss/dist_ce": 0.0, "loss/fcd": 1.1015625, "loss/idx": 12.0, "loss/logits": 0.18024246394634247, "step": 650 }, { "epoch": 0.009720770494251157, "grad_norm": 0.53515625, "grad_norm_var": 0.001580047607421875, "learning_rate": 2e-05, "loss": 1.3524, "loss/crossentropy": 2.622012138366699, "loss/dist_ce": 0.0, "loss/fcd": 1.1640625, "loss/idx": 12.0, "loss/logits": 0.18833482265472412, "step": 651 }, { "epoch": 0.009735702553382112, "grad_norm": 0.5703125, "grad_norm_var": 0.0016458511352539062, "learning_rate": 2e-05, "loss": 1.318, "loss/crossentropy": 2.5040524005889893, "loss/dist_ce": 0.0, "loss/fcd": 1.140625, "loss/idx": 12.0, "loss/logits": 0.1773415207862854, "step": 652 }, { "epoch": 0.009750634612513066, "grad_norm": 0.53125, "grad_norm_var": 0.001262664794921875, "learning_rate": 2e-05, "loss": 1.2424, "loss/crossentropy": 2.6082024574279785, "loss/dist_ce": 0.0, "loss/fcd": 1.078125, "loss/idx": 12.0, "loss/logits": 0.16425400972366333, "step": 653 }, { "epoch": 0.00976556667164402, "grad_norm": 0.5, "grad_norm_var": 0.001073455810546875, "learning_rate": 2e-05, "loss": 1.2584, "loss/crossentropy": 2.7121970653533936, "loss/dist_ce": 0.0, "loss/fcd": 1.0859375, "loss/idx": 12.0, "loss/logits": 0.17245107889175415, "step": 654 }, { "epoch": 0.009780498730774974, "grad_norm": 0.46484375, "grad_norm_var": 0.0012857437133789063, "learning_rate": 2e-05, "loss": 1.1393, "loss/crossentropy": 2.605868101119995, "loss/dist_ce": 0.0, "loss/fcd": 1.0, "loss/idx": 12.0, "loss/logits": 0.13928866386413574, "step": 655 }, { "epoch": 0.009795430789905928, "grad_norm": 0.62890625, "grad_norm_var": 0.00193634033203125, "learning_rate": 2e-05, "loss": 1.4061, "loss/crossentropy": 2.5328142642974854, "loss/dist_ce": 0.0, "loss/fcd": 1.2109375, "loss/idx": 12.0, "loss/logits": 0.1951315701007843, "step": 656 }, { "epoch": 0.009810362849036882, "grad_norm": 0.56640625, "grad_norm_var": 0.0018085320790608725, "learning_rate": 2e-05, "loss": 1.1623, "loss/crossentropy": 2.591285228729248, "loss/dist_ce": 0.0, "loss/fcd": 1.015625, "loss/idx": 12.0, "loss/logits": 0.1467236578464508, "step": 657 }, { "epoch": 0.009825294908167836, "grad_norm": 0.53125, "grad_norm_var": 0.0017818291982014975, "learning_rate": 2e-05, "loss": 1.2404, "loss/crossentropy": 2.678740978240967, "loss/dist_ce": 0.0, "loss/fcd": 1.078125, "loss/idx": 12.0, "loss/logits": 0.16228888928890228, "step": 658 }, { "epoch": 0.00984022696729879, "grad_norm": 0.50390625, "grad_norm_var": 0.0017978509267171225, "learning_rate": 2e-05, "loss": 1.197, "loss/crossentropy": 2.5414230823516846, "loss/dist_ce": 0.0, "loss/fcd": 1.046875, "loss/idx": 12.0, "loss/logits": 0.15012088418006897, "step": 659 }, { "epoch": 0.009855159026429744, "grad_norm": 0.62109375, "grad_norm_var": 0.0020608901977539062, "learning_rate": 2e-05, "loss": 1.369, "loss/crossentropy": 2.403024435043335, "loss/dist_ce": 0.0, "loss/fcd": 1.1875, "loss/idx": 12.0, "loss/logits": 0.18149128556251526, "step": 660 }, { "epoch": 0.009870091085560698, "grad_norm": 0.48828125, "grad_norm_var": 0.0021647135416666668, "learning_rate": 2e-05, "loss": 1.2125, "loss/crossentropy": 2.634225368499756, "loss/dist_ce": 0.0, "loss/fcd": 1.046875, "loss/idx": 12.0, "loss/logits": 0.16562089323997498, "step": 661 }, { "epoch": 0.009885023144691652, "grad_norm": 0.52734375, "grad_norm_var": 0.0021071751912434896, "learning_rate": 2e-05, "loss": 1.1779, "loss/crossentropy": 2.8136699199676514, "loss/dist_ce": 0.0, "loss/fcd": 1.015625, "loss/idx": 12.0, "loss/logits": 0.16229188442230225, "step": 662 }, { "epoch": 0.009899955203822607, "grad_norm": 0.6015625, "grad_norm_var": 0.0021666844685872396, "learning_rate": 2e-05, "loss": 1.417, "loss/crossentropy": 2.579958915710449, "loss/dist_ce": 0.0, "loss/fcd": 1.21875, "loss/idx": 12.0, "loss/logits": 0.19826620817184448, "step": 663 }, { "epoch": 0.00991488726295356, "grad_norm": 0.65625, "grad_norm_var": 0.0028914769490559896, "learning_rate": 2e-05, "loss": 1.4121, "loss/crossentropy": 2.441436529159546, "loss/dist_ce": 0.0, "loss/fcd": 1.203125, "loss/idx": 12.0, "loss/logits": 0.20899558067321777, "step": 664 }, { "epoch": 0.009929819322084515, "grad_norm": 0.51171875, "grad_norm_var": 0.0029782613118489584, "learning_rate": 2e-05, "loss": 1.2005, "loss/crossentropy": 2.5218026638031006, "loss/dist_ce": 0.0, "loss/fcd": 1.046875, "loss/idx": 12.0, "loss/logits": 0.1536444127559662, "step": 665 }, { "epoch": 0.009944751381215469, "grad_norm": 0.515625, "grad_norm_var": 0.0030364354451497396, "learning_rate": 2e-05, "loss": 1.2123, "loss/crossentropy": 2.6483314037323, "loss/dist_ce": 0.0, "loss/fcd": 1.0546875, "loss/idx": 12.0, "loss/logits": 0.15763527154922485, "step": 666 }, { "epoch": 0.009959683440346423, "grad_norm": 0.56640625, "grad_norm_var": 0.0030476252237955728, "learning_rate": 2e-05, "loss": 1.1919, "loss/crossentropy": 2.708850383758545, "loss/dist_ce": 0.0, "loss/fcd": 1.0390625, "loss/idx": 12.0, "loss/logits": 0.1528831422328949, "step": 667 }, { "epoch": 0.009974615499477379, "grad_norm": 0.5078125, "grad_norm_var": 0.003114763895670573, "learning_rate": 2e-05, "loss": 1.202, "loss/crossentropy": 2.585891008377075, "loss/dist_ce": 0.0, "loss/fcd": 1.046875, "loss/idx": 12.0, "loss/logits": 0.15510644018650055, "step": 668 }, { "epoch": 0.009989547558608333, "grad_norm": 0.50390625, "grad_norm_var": 0.003212229410807292, "learning_rate": 2e-05, "loss": 1.2312, "loss/crossentropy": 2.487285614013672, "loss/dist_ce": 0.0, "loss/fcd": 1.0703125, "loss/idx": 12.0, "loss/logits": 0.16083845496177673, "step": 669 }, { "epoch": 0.010004479617739287, "grad_norm": 0.494140625, "grad_norm_var": 0.0032483259836832683, "learning_rate": 2e-05, "loss": 1.1847, "loss/crossentropy": 2.699873924255371, "loss/dist_ce": 0.0, "loss/fcd": 1.03125, "loss/idx": 12.0, "loss/logits": 0.15343204140663147, "step": 670 }, { "epoch": 0.010019411676870241, "grad_norm": 0.578125, "grad_norm_var": 0.002868509292602539, "learning_rate": 2e-05, "loss": 1.2348, "loss/crossentropy": 2.5884616374969482, "loss/dist_ce": 0.0, "loss/fcd": 1.0546875, "loss/idx": 12.0, "loss/logits": 0.18012240529060364, "step": 671 }, { "epoch": 0.010034343736001195, "grad_norm": 0.5703125, "grad_norm_var": 0.002467966079711914, "learning_rate": 2e-05, "loss": 1.1819, "loss/crossentropy": 2.445880889892578, "loss/dist_ce": 0.0, "loss/fcd": 1.03125, "loss/idx": 12.0, "loss/logits": 0.1506287306547165, "step": 672 }, { "epoch": 0.01004927579513215, "grad_norm": 0.54296875, "grad_norm_var": 0.0024401187896728516, "learning_rate": 2e-05, "loss": 1.2579, "loss/crossentropy": 2.3222944736480713, "loss/dist_ce": 0.0, "loss/fcd": 1.1015625, "loss/idx": 12.0, "loss/logits": 0.15637820959091187, "step": 673 }, { "epoch": 0.010064207854263103, "grad_norm": 0.53125, "grad_norm_var": 0.0024401187896728516, "learning_rate": 2e-05, "loss": 1.2937, "loss/crossentropy": 2.3992764949798584, "loss/dist_ce": 0.0, "loss/fcd": 1.109375, "loss/idx": 12.0, "loss/logits": 0.1843375265598297, "step": 674 }, { "epoch": 0.010079139913394057, "grad_norm": 0.5625, "grad_norm_var": 0.0023333072662353516, "learning_rate": 2e-05, "loss": 1.3425, "loss/crossentropy": 2.4430034160614014, "loss/dist_ce": 0.0, "loss/fcd": 1.1484375, "loss/idx": 12.0, "loss/logits": 0.1940709352493286, "step": 675 }, { "epoch": 0.010094071972525012, "grad_norm": 0.56640625, "grad_norm_var": 0.0019924004872639975, "learning_rate": 2e-05, "loss": 1.2304, "loss/crossentropy": 2.6293787956237793, "loss/dist_ce": 0.0, "loss/fcd": 1.0625, "loss/idx": 12.0, "loss/logits": 0.16787387430667877, "step": 676 }, { "epoch": 0.010109004031655966, "grad_norm": 0.53125, "grad_norm_var": 0.0017811934153238933, "learning_rate": 2e-05, "loss": 1.2056, "loss/crossentropy": 2.738004207611084, "loss/dist_ce": 0.0, "loss/fcd": 1.0546875, "loss/idx": 12.0, "loss/logits": 0.15091140568256378, "step": 677 }, { "epoch": 0.01012393609078692, "grad_norm": 0.5390625, "grad_norm_var": 0.0017575422922770183, "learning_rate": 2e-05, "loss": 1.2279, "loss/crossentropy": 2.484860897064209, "loss/dist_ce": 0.0, "loss/fcd": 1.0703125, "loss/idx": 12.0, "loss/logits": 0.15762406587600708, "step": 678 }, { "epoch": 0.010138868149917874, "grad_norm": 0.4921875, "grad_norm_var": 0.0017343997955322266, "learning_rate": 2e-05, "loss": 1.2223, "loss/crossentropy": 2.5346856117248535, "loss/dist_ce": 0.0, "loss/fcd": 1.0546875, "loss/idx": 12.0, "loss/logits": 0.1675938218832016, "step": 679 }, { "epoch": 0.010153800209048828, "grad_norm": 0.54296875, "grad_norm_var": 0.0008088270823160807, "learning_rate": 2e-05, "loss": 1.2444, "loss/crossentropy": 2.5577213764190674, "loss/dist_ce": 0.0, "loss/fcd": 1.078125, "loss/idx": 12.0, "loss/logits": 0.1663036346435547, "step": 680 }, { "epoch": 0.010168732268179782, "grad_norm": 0.52734375, "grad_norm_var": 0.0007760206858317058, "learning_rate": 2e-05, "loss": 1.2781, "loss/crossentropy": 2.5874197483062744, "loss/dist_ce": 0.0, "loss/fcd": 1.09375, "loss/idx": 12.0, "loss/logits": 0.1843561828136444, "step": 681 }, { "epoch": 0.010183664327310736, "grad_norm": 0.56640625, "grad_norm_var": 0.0008008162180582683, "learning_rate": 2e-05, "loss": 1.2692, "loss/crossentropy": 2.458387613296509, "loss/dist_ce": 0.0, "loss/fcd": 1.109375, "loss/idx": 12.0, "loss/logits": 0.15981708467006683, "step": 682 }, { "epoch": 0.01019859638644169, "grad_norm": 0.65625, "grad_norm_var": 0.0016343275705973308, "learning_rate": 2e-05, "loss": 1.4047, "loss/crossentropy": 2.532681941986084, "loss/dist_ce": 0.0, "loss/fcd": 1.1953125, "loss/idx": 12.0, "loss/logits": 0.20941874384880066, "step": 683 }, { "epoch": 0.010213528445572644, "grad_norm": 0.54296875, "grad_norm_var": 0.0015393416086832681, "learning_rate": 2e-05, "loss": 1.1006, "loss/crossentropy": 2.7314085960388184, "loss/dist_ce": 0.0, "loss/fcd": 0.97265625, "loss/idx": 12.0, "loss/logits": 0.12796571850776672, "step": 684 }, { "epoch": 0.010228460504703598, "grad_norm": 0.515625, "grad_norm_var": 0.0014809767405192058, "learning_rate": 2e-05, "loss": 1.1961, "loss/crossentropy": 2.662325859069824, "loss/dist_ce": 0.0, "loss/fcd": 1.0390625, "loss/idx": 12.0, "loss/logits": 0.15699967741966248, "step": 685 }, { "epoch": 0.010243392563834552, "grad_norm": 0.62109375, "grad_norm_var": 0.0015853246053059895, "learning_rate": 2e-05, "loss": 1.3114, "loss/crossentropy": 2.2350664138793945, "loss/dist_ce": 0.0, "loss/fcd": 1.140625, "loss/idx": 12.0, "loss/logits": 0.17081522941589355, "step": 686 }, { "epoch": 0.010258324622965507, "grad_norm": 0.53125, "grad_norm_var": 0.0015807469685872396, "learning_rate": 2e-05, "loss": 1.2881, "loss/crossentropy": 2.4432411193847656, "loss/dist_ce": 0.0, "loss/fcd": 1.109375, "loss/idx": 12.0, "loss/logits": 0.17870429158210754, "step": 687 }, { "epoch": 0.01027325668209646, "grad_norm": 0.87109375, "grad_norm_var": 0.0079498291015625, "learning_rate": 2e-05, "loss": 1.5581, "loss/crossentropy": 2.6931755542755127, "loss/dist_ce": 0.0, "loss/fcd": 1.34375, "loss/idx": 12.0, "loss/logits": 0.2143464982509613, "step": 688 }, { "epoch": 0.010288188741227415, "grad_norm": 0.58984375, "grad_norm_var": 0.00791015625, "learning_rate": 2e-05, "loss": 1.298, "loss/crossentropy": 2.6877715587615967, "loss/dist_ce": 0.0, "loss/fcd": 1.125, "loss/idx": 12.0, "loss/logits": 0.1730184555053711, "step": 689 }, { "epoch": 0.010303120800358369, "grad_norm": 0.53515625, "grad_norm_var": 0.007888730367024739, "learning_rate": 2e-05, "loss": 1.2563, "loss/crossentropy": 2.4702701568603516, "loss/dist_ce": 0.0, "loss/fcd": 1.09375, "loss/idx": 12.0, "loss/logits": 0.1625899374485016, "step": 690 }, { "epoch": 0.010318052859489323, "grad_norm": 0.56640625, "grad_norm_var": 0.007883453369140625, "learning_rate": 2e-05, "loss": 1.3245, "loss/crossentropy": 2.4865882396698, "loss/dist_ce": 0.0, "loss/fcd": 1.140625, "loss/idx": 12.0, "loss/logits": 0.1839236319065094, "step": 691 }, { "epoch": 0.010332984918620277, "grad_norm": 0.498046875, "grad_norm_var": 0.008251174290974935, "learning_rate": 2e-05, "loss": 1.2138, "loss/crossentropy": 2.5186684131622314, "loss/dist_ce": 0.0, "loss/fcd": 1.0546875, "loss/idx": 12.0, "loss/logits": 0.1591419279575348, "step": 692 }, { "epoch": 0.010347916977751231, "grad_norm": 0.56640625, "grad_norm_var": 0.008144744237263997, "learning_rate": 2e-05, "loss": 1.276, "loss/crossentropy": 2.6817424297332764, "loss/dist_ce": 0.0, "loss/fcd": 1.1015625, "loss/idx": 12.0, "loss/logits": 0.17447999119758606, "step": 693 }, { "epoch": 0.010362849036882187, "grad_norm": 0.546875, "grad_norm_var": 0.008113590876261394, "learning_rate": 2e-05, "loss": 1.3738, "loss/crossentropy": 2.546079397201538, "loss/dist_ce": 0.0, "loss/fcd": 1.15625, "loss/idx": 12.0, "loss/logits": 0.21759110689163208, "step": 694 }, { "epoch": 0.010377781096013141, "grad_norm": 0.57421875, "grad_norm_var": 0.007648960749308268, "learning_rate": 2e-05, "loss": 1.2983, "loss/crossentropy": 2.4528214931488037, "loss/dist_ce": 0.0, "loss/fcd": 1.1171875, "loss/idx": 12.0, "loss/logits": 0.18111282587051392, "step": 695 }, { "epoch": 0.010392713155144095, "grad_norm": 0.5546875, "grad_norm_var": 0.007602421442667643, "learning_rate": 2e-05, "loss": 1.2616, "loss/crossentropy": 2.686127185821533, "loss/dist_ce": 0.0, "loss/fcd": 1.09375, "loss/idx": 12.0, "loss/logits": 0.16781781613826752, "step": 696 }, { "epoch": 0.01040764521427505, "grad_norm": 0.60546875, "grad_norm_var": 0.007446018854777018, "learning_rate": 2e-05, "loss": 1.3101, "loss/crossentropy": 2.54367995262146, "loss/dist_ce": 0.0, "loss/fcd": 1.140625, "loss/idx": 12.0, "loss/logits": 0.1694590151309967, "step": 697 }, { "epoch": 0.010422577273406003, "grad_norm": 0.546875, "grad_norm_var": 0.007515319188435872, "learning_rate": 2e-05, "loss": 1.2795, "loss/crossentropy": 2.6796493530273438, "loss/dist_ce": 0.0, "loss/fcd": 1.109375, "loss/idx": 12.0, "loss/logits": 0.17017018795013428, "step": 698 }, { "epoch": 0.010437509332536957, "grad_norm": 0.60546875, "grad_norm_var": 0.007178099950154623, "learning_rate": 2e-05, "loss": 1.3181, "loss/crossentropy": 2.4632835388183594, "loss/dist_ce": 0.0, "loss/fcd": 1.140625, "loss/idx": 12.0, "loss/logits": 0.1774502694606781, "step": 699 }, { "epoch": 0.010452441391667911, "grad_norm": 0.515625, "grad_norm_var": 0.007357899347941081, "learning_rate": 2e-05, "loss": 1.1805, "loss/crossentropy": 2.715266466140747, "loss/dist_ce": 0.0, "loss/fcd": 1.03125, "loss/idx": 12.0, "loss/logits": 0.14927467703819275, "step": 700 }, { "epoch": 0.010467373450798866, "grad_norm": 0.5390625, "grad_norm_var": 0.007198063532511393, "learning_rate": 2e-05, "loss": 1.2262, "loss/crossentropy": 2.36779522895813, "loss/dist_ce": 0.0, "loss/fcd": 1.0703125, "loss/idx": 12.0, "loss/logits": 0.1558821052312851, "step": 701 }, { "epoch": 0.01048230550992982, "grad_norm": 0.609375, "grad_norm_var": 0.007141224543253581, "learning_rate": 2e-05, "loss": 1.3637, "loss/crossentropy": 2.60080885887146, "loss/dist_ce": 0.0, "loss/fcd": 1.171875, "loss/idx": 12.0, "loss/logits": 0.19179855287075043, "step": 702 }, { "epoch": 0.010497237569060774, "grad_norm": 0.55078125, "grad_norm_var": 0.007042042414347331, "learning_rate": 2e-05, "loss": 1.2853, "loss/crossentropy": 2.4730658531188965, "loss/dist_ce": 0.0, "loss/fcd": 1.109375, "loss/idx": 12.0, "loss/logits": 0.1759296953678131, "step": 703 }, { "epoch": 0.010512169628191728, "grad_norm": 0.6171875, "grad_norm_var": 0.001206827163696289, "learning_rate": 2e-05, "loss": 1.4148, "loss/crossentropy": 2.071516990661621, "loss/dist_ce": 0.0, "loss/fcd": 1.21875, "loss/idx": 12.0, "loss/logits": 0.1960187554359436, "step": 704 }, { "epoch": 0.010527101687322682, "grad_norm": 0.56640625, "grad_norm_var": 0.0011599063873291016, "learning_rate": 2e-05, "loss": 1.3046, "loss/crossentropy": 2.5084221363067627, "loss/dist_ce": 0.0, "loss/fcd": 1.1328125, "loss/idx": 12.0, "loss/logits": 0.1718074232339859, "step": 705 }, { "epoch": 0.010542033746453636, "grad_norm": 0.71484375, "grad_norm_var": 0.0025256951649983723, "learning_rate": 2e-05, "loss": 1.4087, "loss/crossentropy": 2.703789234161377, "loss/dist_ce": 0.0, "loss/fcd": 1.1875, "loss/idx": 12.0, "loss/logits": 0.22122114896774292, "step": 706 }, { "epoch": 0.01055696580558459, "grad_norm": 0.59375, "grad_norm_var": 0.0025461673736572265, "learning_rate": 2e-05, "loss": 1.2491, "loss/crossentropy": 2.3329367637634277, "loss/dist_ce": 0.0, "loss/fcd": 1.09375, "loss/idx": 12.0, "loss/logits": 0.15533186495304108, "step": 707 }, { "epoch": 0.010571897864715544, "grad_norm": 0.52734375, "grad_norm_var": 0.0022979736328125, "learning_rate": 2e-05, "loss": 1.2526, "loss/crossentropy": 2.6226069927215576, "loss/dist_ce": 0.0, "loss/fcd": 1.09375, "loss/idx": 12.0, "loss/logits": 0.15883222222328186, "step": 708 }, { "epoch": 0.010586829923846498, "grad_norm": 0.5625, "grad_norm_var": 0.002304522196451823, "learning_rate": 2e-05, "loss": 1.2615, "loss/crossentropy": 2.480863332748413, "loss/dist_ce": 0.0, "loss/fcd": 1.1015625, "loss/idx": 12.0, "loss/logits": 0.15995028614997864, "step": 709 }, { "epoch": 0.010601761982977452, "grad_norm": 0.53125, "grad_norm_var": 0.002382342020670573, "learning_rate": 2e-05, "loss": 1.2916, "loss/crossentropy": 2.587437868118286, "loss/dist_ce": 0.0, "loss/fcd": 1.109375, "loss/idx": 12.0, "loss/logits": 0.1821785271167755, "step": 710 }, { "epoch": 0.010616694042108406, "grad_norm": 0.48828125, "grad_norm_var": 0.002863502502441406, "learning_rate": 2e-05, "loss": 1.2136, "loss/crossentropy": 2.6183738708496094, "loss/dist_ce": 0.0, "loss/fcd": 1.0625, "loss/idx": 12.0, "loss/logits": 0.15106935799121857, "step": 711 }, { "epoch": 0.01063162610123936, "grad_norm": 0.59765625, "grad_norm_var": 0.002887980143229167, "learning_rate": 2e-05, "loss": 1.3275, "loss/crossentropy": 2.7090811729431152, "loss/dist_ce": 0.0, "loss/fcd": 1.1484375, "loss/idx": 12.0, "loss/logits": 0.17910704016685486, "step": 712 }, { "epoch": 0.010646558160370315, "grad_norm": 0.5703125, "grad_norm_var": 0.002814165751139323, "learning_rate": 2e-05, "loss": 1.4072, "loss/crossentropy": 2.540616273880005, "loss/dist_ce": 0.0, "loss/fcd": 1.1953125, "loss/idx": 12.0, "loss/logits": 0.21185210347175598, "step": 713 }, { "epoch": 0.010661490219501269, "grad_norm": 0.59765625, "grad_norm_var": 0.002811686197916667, "learning_rate": 2e-05, "loss": 1.3452, "loss/crossentropy": 2.587371349334717, "loss/dist_ce": 0.0, "loss/fcd": 1.1640625, "loss/idx": 12.0, "loss/logits": 0.18108756840229034, "step": 714 }, { "epoch": 0.010676422278632223, "grad_norm": 0.51171875, "grad_norm_var": 0.0029703776041666665, "learning_rate": 2e-05, "loss": 1.1809, "loss/crossentropy": 2.543191909790039, "loss/dist_ce": 0.0, "loss/fcd": 1.03125, "loss/idx": 12.0, "loss/logits": 0.14962825179100037, "step": 715 }, { "epoch": 0.010691354337763177, "grad_norm": 0.53125, "grad_norm_var": 0.0028757731119791667, "learning_rate": 2e-05, "loss": 1.2791, "loss/crossentropy": 2.650752544403076, "loss/dist_ce": 0.0, "loss/fcd": 1.109375, "loss/idx": 12.0, "loss/logits": 0.16970160603523254, "step": 716 }, { "epoch": 0.010706286396894131, "grad_norm": 0.51953125, "grad_norm_var": 0.0029784520467122395, "learning_rate": 2e-05, "loss": 1.2036, "loss/crossentropy": 2.721712350845337, "loss/dist_ce": 0.0, "loss/fcd": 1.046875, "loss/idx": 12.0, "loss/logits": 0.15671955049037933, "step": 717 }, { "epoch": 0.010721218456025085, "grad_norm": 0.490234375, "grad_norm_var": 0.0032101790110270183, "learning_rate": 2e-05, "loss": 1.2164, "loss/crossentropy": 2.4478020668029785, "loss/dist_ce": 0.0, "loss/fcd": 1.0546875, "loss/idx": 12.0, "loss/logits": 0.16174601018428802, "step": 718 }, { "epoch": 0.01073615051515604, "grad_norm": 0.54296875, "grad_norm_var": 0.0032242933909098306, "learning_rate": 2e-05, "loss": 1.2431, "loss/crossentropy": 2.5908939838409424, "loss/dist_ce": 0.0, "loss/fcd": 1.078125, "loss/idx": 12.0, "loss/logits": 0.1649816781282425, "step": 719 }, { "epoch": 0.010751082574286995, "grad_norm": 0.6171875, "grad_norm_var": 0.0032242933909098306, "learning_rate": 2e-05, "loss": 1.3963, "loss/crossentropy": 2.453248977661133, "loss/dist_ce": 0.0, "loss/fcd": 1.203125, "loss/idx": 12.0, "loss/logits": 0.19315370917320251, "step": 720 }, { "epoch": 0.01076601463341795, "grad_norm": 0.828125, "grad_norm_var": 0.00772258440653483, "learning_rate": 2e-05, "loss": 1.3003, "loss/crossentropy": 2.477731704711914, "loss/dist_ce": 0.0, "loss/fcd": 1.1328125, "loss/idx": 12.0, "loss/logits": 0.16752782464027405, "step": 721 }, { "epoch": 0.010780946692548903, "grad_norm": 0.67578125, "grad_norm_var": 0.007097609837849935, "learning_rate": 2e-05, "loss": 1.4393, "loss/crossentropy": 2.5516936779022217, "loss/dist_ce": 0.0, "loss/fcd": 1.2421875, "loss/idx": 12.0, "loss/logits": 0.1970784217119217, "step": 722 }, { "epoch": 0.010795878751679857, "grad_norm": 0.62890625, "grad_norm_var": 0.007266982396443685, "learning_rate": 2e-05, "loss": 1.4367, "loss/crossentropy": 2.6101393699645996, "loss/dist_ce": 0.0, "loss/fcd": 1.2265625, "loss/idx": 12.0, "loss/logits": 0.21016988158226013, "step": 723 }, { "epoch": 0.010810810810810811, "grad_norm": 0.50390625, "grad_norm_var": 0.007454284032185872, "learning_rate": 2e-05, "loss": 1.2009, "loss/crossentropy": 2.484208822250366, "loss/dist_ce": 0.0, "loss/fcd": 1.03125, "loss/idx": 12.0, "loss/logits": 0.1696794629096985, "step": 724 }, { "epoch": 0.010825742869941766, "grad_norm": 0.5703125, "grad_norm_var": 0.0074452559153238935, "learning_rate": 2e-05, "loss": 1.2156, "loss/crossentropy": 2.6739721298217773, "loss/dist_ce": 0.0, "loss/fcd": 1.0625, "loss/idx": 12.0, "loss/logits": 0.15314337611198425, "step": 725 }, { "epoch": 0.01084067492907272, "grad_norm": 0.578125, "grad_norm_var": 0.007307163874308268, "learning_rate": 2e-05, "loss": 1.3033, "loss/crossentropy": 2.596822500228882, "loss/dist_ce": 0.0, "loss/fcd": 1.1328125, "loss/idx": 12.0, "loss/logits": 0.17045846581459045, "step": 726 }, { "epoch": 0.010855606988203674, "grad_norm": 0.57421875, "grad_norm_var": 0.0067378838857014975, "learning_rate": 2e-05, "loss": 1.2591, "loss/crossentropy": 2.6170504093170166, "loss/dist_ce": 0.0, "loss/fcd": 1.1015625, "loss/idx": 12.0, "loss/logits": 0.1575794816017151, "step": 727 }, { "epoch": 0.010870539047334628, "grad_norm": 0.5859375, "grad_norm_var": 0.00672453244527181, "learning_rate": 2e-05, "loss": 1.3929, "loss/crossentropy": 2.4130985736846924, "loss/dist_ce": 0.0, "loss/fcd": 1.1953125, "loss/idx": 12.0, "loss/logits": 0.19758152961730957, "step": 728 }, { "epoch": 0.010885471106465582, "grad_norm": 0.52734375, "grad_norm_var": 0.006911961237589518, "learning_rate": 2e-05, "loss": 1.2616, "loss/crossentropy": 2.508866786956787, "loss/dist_ce": 0.0, "loss/fcd": 1.0859375, "loss/idx": 12.0, "loss/logits": 0.1756208837032318, "step": 729 }, { "epoch": 0.010900403165596536, "grad_norm": 0.53125, "grad_norm_var": 0.007033014297485351, "learning_rate": 2e-05, "loss": 1.1935, "loss/crossentropy": 2.5010106563568115, "loss/dist_ce": 0.0, "loss/fcd": 1.0390625, "loss/idx": 12.0, "loss/logits": 0.15440748631954193, "step": 730 }, { "epoch": 0.01091533522472749, "grad_norm": 0.5234375, "grad_norm_var": 0.006941080093383789, "learning_rate": 2e-05, "loss": 1.1836, "loss/crossentropy": 2.571686029434204, "loss/dist_ce": 0.0, "loss/fcd": 1.03125, "loss/idx": 12.0, "loss/logits": 0.15230761468410492, "step": 731 }, { "epoch": 0.010930267283858444, "grad_norm": 0.5078125, "grad_norm_var": 0.007117700576782226, "learning_rate": 2e-05, "loss": 1.2025, "loss/crossentropy": 2.565176486968994, "loss/dist_ce": 0.0, "loss/fcd": 1.0546875, "loss/idx": 12.0, "loss/logits": 0.14785486459732056, "step": 732 }, { "epoch": 0.010945199342989398, "grad_norm": 0.54296875, "grad_norm_var": 0.006977701187133789, "learning_rate": 2e-05, "loss": 1.2446, "loss/crossentropy": 2.555361747741699, "loss/dist_ce": 0.0, "loss/fcd": 1.0859375, "loss/idx": 12.0, "loss/logits": 0.15867501497268677, "step": 733 }, { "epoch": 0.010960131402120352, "grad_norm": 0.58203125, "grad_norm_var": 0.006445058186848958, "learning_rate": 2e-05, "loss": 1.3327, "loss/crossentropy": 2.6520895957946777, "loss/dist_ce": 0.0, "loss/fcd": 1.140625, "loss/idx": 12.0, "loss/logits": 0.19203567504882812, "step": 734 }, { "epoch": 0.010975063461251306, "grad_norm": 0.515625, "grad_norm_var": 0.006635983784993489, "learning_rate": 2e-05, "loss": 1.1915, "loss/crossentropy": 2.4574670791625977, "loss/dist_ce": 0.0, "loss/fcd": 1.046875, "loss/idx": 12.0, "loss/logits": 0.14458885788917542, "step": 735 }, { "epoch": 0.01098999552038226, "grad_norm": 0.5546875, "grad_norm_var": 0.0065769831339518225, "learning_rate": 2e-05, "loss": 1.3282, "loss/crossentropy": 2.5529794692993164, "loss/dist_ce": 0.0, "loss/fcd": 1.15625, "loss/idx": 12.0, "loss/logits": 0.17192694544792175, "step": 736 }, { "epoch": 0.011004927579513215, "grad_norm": 0.63671875, "grad_norm_var": 0.0024553934733072915, "learning_rate": 2e-05, "loss": 1.3139, "loss/crossentropy": 2.5145184993743896, "loss/dist_ce": 0.0, "loss/fcd": 1.140625, "loss/idx": 12.0, "loss/logits": 0.17325331270694733, "step": 737 }, { "epoch": 0.011019859638644169, "grad_norm": 0.70703125, "grad_norm_var": 0.0029782613118489584, "learning_rate": 2e-05, "loss": 1.5417, "loss/crossentropy": 2.3598456382751465, "loss/dist_ce": 0.0, "loss/fcd": 1.2890625, "loss/idx": 12.0, "loss/logits": 0.2526322901248932, "step": 738 }, { "epoch": 0.011034791697775123, "grad_norm": 0.55859375, "grad_norm_var": 0.0027058919270833335, "learning_rate": 2e-05, "loss": 1.341, "loss/crossentropy": 2.66998291015625, "loss/dist_ce": 0.0, "loss/fcd": 1.140625, "loss/idx": 12.0, "loss/logits": 0.20035037398338318, "step": 739 }, { "epoch": 0.011049723756906077, "grad_norm": 0.51171875, "grad_norm_var": 0.0026486714680989585, "learning_rate": 2e-05, "loss": 1.2284, "loss/crossentropy": 2.527078628540039, "loss/dist_ce": 0.0, "loss/fcd": 1.0546875, "loss/idx": 12.0, "loss/logits": 0.17375284433364868, "step": 740 }, { "epoch": 0.011064655816037031, "grad_norm": 0.53125, "grad_norm_var": 0.0027058919270833335, "learning_rate": 2e-05, "loss": 1.2061, "loss/crossentropy": 2.663687229156494, "loss/dist_ce": 0.0, "loss/fcd": 1.0546875, "loss/idx": 12.0, "loss/logits": 0.15137803554534912, "step": 741 }, { "epoch": 0.011079587875167985, "grad_norm": 0.58203125, "grad_norm_var": 0.0027160008748372396, "learning_rate": 2e-05, "loss": 1.3247, "loss/crossentropy": 2.5827627182006836, "loss/dist_ce": 0.0, "loss/fcd": 1.140625, "loss/idx": 12.0, "loss/logits": 0.1840566098690033, "step": 742 }, { "epoch": 0.01109451993429894, "grad_norm": 0.52734375, "grad_norm_var": 0.0027694066365559896, "learning_rate": 2e-05, "loss": 1.2818, "loss/crossentropy": 2.684978723526001, "loss/dist_ce": 0.0, "loss/fcd": 1.109375, "loss/idx": 12.0, "loss/logits": 0.17245015501976013, "step": 743 }, { "epoch": 0.011109451993429893, "grad_norm": 0.53125, "grad_norm_var": 0.002751604715983073, "learning_rate": 2e-05, "loss": 1.2827, "loss/crossentropy": 2.6512203216552734, "loss/dist_ce": 0.0, "loss/fcd": 1.109375, "loss/idx": 12.0, "loss/logits": 0.17336753010749817, "step": 744 }, { "epoch": 0.011124384052560847, "grad_norm": 0.53125, "grad_norm_var": 0.0027384440104166667, "learning_rate": 2e-05, "loss": 1.2578, "loss/crossentropy": 2.4307868480682373, "loss/dist_ce": 0.0, "loss/fcd": 1.1015625, "loss/idx": 12.0, "loss/logits": 0.15619587898254395, "step": 745 }, { "epoch": 0.011139316111691801, "grad_norm": 0.51953125, "grad_norm_var": 0.002783648173014323, "learning_rate": 2e-05, "loss": 1.1904, "loss/crossentropy": 2.794666290283203, "loss/dist_ce": 0.0, "loss/fcd": 1.0390625, "loss/idx": 12.0, "loss/logits": 0.1513822376728058, "step": 746 }, { "epoch": 0.011154248170822757, "grad_norm": 0.5078125, "grad_norm_var": 0.0028624852498372396, "learning_rate": 2e-05, "loss": 1.2677, "loss/crossentropy": 2.5204267501831055, "loss/dist_ce": 0.0, "loss/fcd": 1.1015625, "loss/idx": 12.0, "loss/logits": 0.16616524755954742, "step": 747 }, { "epoch": 0.011169180229953711, "grad_norm": 0.48046875, "grad_norm_var": 0.003073883056640625, "learning_rate": 2e-05, "loss": 1.1894, "loss/crossentropy": 2.630366563796997, "loss/dist_ce": 0.0, "loss/fcd": 1.03125, "loss/idx": 12.0, "loss/logits": 0.1581723392009735, "step": 748 }, { "epoch": 0.011184112289084665, "grad_norm": 0.51953125, "grad_norm_var": 0.0031341552734375, "learning_rate": 2e-05, "loss": 1.2354, "loss/crossentropy": 2.55263090133667, "loss/dist_ce": 0.0, "loss/fcd": 1.0703125, "loss/idx": 12.0, "loss/logits": 0.16509470343589783, "step": 749 }, { "epoch": 0.01119904434821562, "grad_norm": 0.478515625, "grad_norm_var": 0.003359079360961914, "learning_rate": 2e-05, "loss": 1.1695, "loss/crossentropy": 2.5358965396881104, "loss/dist_ce": 0.0, "loss/fcd": 1.0234375, "loss/idx": 12.0, "loss/logits": 0.14609494805335999, "step": 750 }, { "epoch": 0.011213976407346574, "grad_norm": 0.5625, "grad_norm_var": 0.003323221206665039, "learning_rate": 2e-05, "loss": 1.3294, "loss/crossentropy": 2.5020415782928467, "loss/dist_ce": 0.0, "loss/fcd": 1.140625, "loss/idx": 12.0, "loss/logits": 0.18877586722373962, "step": 751 }, { "epoch": 0.011228908466477528, "grad_norm": 0.50390625, "grad_norm_var": 0.0034273624420166015, "learning_rate": 2e-05, "loss": 1.2785, "loss/crossentropy": 2.7053909301757812, "loss/dist_ce": 0.0, "loss/fcd": 1.1015625, "loss/idx": 12.0, "loss/logits": 0.17688840627670288, "step": 752 }, { "epoch": 0.011243840525608482, "grad_norm": 0.59375, "grad_norm_var": 0.0030063470204671223, "learning_rate": 2e-05, "loss": 1.3516, "loss/crossentropy": 2.440305471420288, "loss/dist_ce": 0.0, "loss/fcd": 1.15625, "loss/idx": 12.0, "loss/logits": 0.1953657865524292, "step": 753 }, { "epoch": 0.011258772584739436, "grad_norm": 0.62890625, "grad_norm_var": 0.0016521294911702475, "learning_rate": 2e-05, "loss": 1.3206, "loss/crossentropy": 2.6588921546936035, "loss/dist_ce": 0.0, "loss/fcd": 1.140625, "loss/idx": 12.0, "loss/logits": 0.17999888956546783, "step": 754 }, { "epoch": 0.01127370464387039, "grad_norm": 0.55859375, "grad_norm_var": 0.0016521294911702475, "learning_rate": 2e-05, "loss": 1.1845, "loss/crossentropy": 2.6511154174804688, "loss/dist_ce": 0.0, "loss/fcd": 1.0390625, "loss/idx": 12.0, "loss/logits": 0.14538779854774475, "step": 755 }, { "epoch": 0.011288636703001344, "grad_norm": 0.50390625, "grad_norm_var": 0.001680739720662435, "learning_rate": 2e-05, "loss": 1.2223, "loss/crossentropy": 2.579488754272461, "loss/dist_ce": 0.0, "loss/fcd": 1.0625, "loss/idx": 12.0, "loss/logits": 0.15982869267463684, "step": 756 }, { "epoch": 0.011303568762132298, "grad_norm": 0.51171875, "grad_norm_var": 0.0017144362131754558, "learning_rate": 2e-05, "loss": 1.208, "loss/crossentropy": 2.4899256229400635, "loss/dist_ce": 0.0, "loss/fcd": 1.046875, "loss/idx": 12.0, "loss/logits": 0.16107669472694397, "step": 757 }, { "epoch": 0.011318500821263252, "grad_norm": 0.5625, "grad_norm_var": 0.001612710952758789, "learning_rate": 2e-05, "loss": 1.2585, "loss/crossentropy": 2.424293279647827, "loss/dist_ce": 0.0, "loss/fcd": 1.09375, "loss/idx": 12.0, "loss/logits": 0.16471582651138306, "step": 758 }, { "epoch": 0.011333432880394206, "grad_norm": 0.58984375, "grad_norm_var": 0.0018131097157796225, "learning_rate": 2e-05, "loss": 1.2444, "loss/crossentropy": 2.6930503845214844, "loss/dist_ce": 0.0, "loss/fcd": 1.078125, "loss/idx": 12.0, "loss/logits": 0.1662687361240387, "step": 759 }, { "epoch": 0.01134836493952516, "grad_norm": 0.55859375, "grad_norm_var": 0.0018407026926676431, "learning_rate": 2e-05, "loss": 1.1926, "loss/crossentropy": 2.4682953357696533, "loss/dist_ce": 0.0, "loss/fcd": 1.046875, "loss/idx": 12.0, "loss/logits": 0.14569604396820068, "step": 760 }, { "epoch": 0.011363296998656115, "grad_norm": 0.671875, "grad_norm_var": 0.002946201960245768, "learning_rate": 2e-05, "loss": 1.3256, "loss/crossentropy": 2.677440643310547, "loss/dist_ce": 0.0, "loss/fcd": 1.140625, "loss/idx": 12.0, "loss/logits": 0.1850149929523468, "step": 761 }, { "epoch": 0.011378229057787069, "grad_norm": 0.490234375, "grad_norm_var": 0.003107134501139323, "learning_rate": 2e-05, "loss": 1.1689, "loss/crossentropy": 2.6096267700195312, "loss/dist_ce": 0.0, "loss/fcd": 1.015625, "loss/idx": 12.0, "loss/logits": 0.15331940352916718, "step": 762 }, { "epoch": 0.011393161116918023, "grad_norm": 0.515625, "grad_norm_var": 0.003072039286295573, "learning_rate": 2e-05, "loss": 1.1372, "loss/crossentropy": 2.6900992393493652, "loss/dist_ce": 0.0, "loss/fcd": 0.984375, "loss/idx": 12.0, "loss/logits": 0.15282993018627167, "step": 763 }, { "epoch": 0.011408093176048977, "grad_norm": 0.48828125, "grad_norm_var": 0.003007952372233073, "learning_rate": 2e-05, "loss": 1.2392, "loss/crossentropy": 2.6735024452209473, "loss/dist_ce": 0.0, "loss/fcd": 1.0703125, "loss/idx": 12.0, "loss/logits": 0.16892734169960022, "step": 764 }, { "epoch": 0.011423025235179931, "grad_norm": 0.53515625, "grad_norm_var": 0.0029677708943684896, "learning_rate": 2e-05, "loss": 1.2317, "loss/crossentropy": 2.434821367263794, "loss/dist_ce": 0.0, "loss/fcd": 1.0703125, "loss/idx": 12.0, "loss/logits": 0.16139961779117584, "step": 765 }, { "epoch": 0.011437957294310885, "grad_norm": 0.51953125, "grad_norm_var": 0.00269773801167806, "learning_rate": 2e-05, "loss": 1.2853, "loss/crossentropy": 2.514920234680176, "loss/dist_ce": 0.0, "loss/fcd": 1.125, "loss/idx": 12.0, "loss/logits": 0.160325825214386, "step": 766 }, { "epoch": 0.011452889353441839, "grad_norm": 0.5625, "grad_norm_var": 0.00269773801167806, "learning_rate": 2e-05, "loss": 1.256, "loss/crossentropy": 2.647106409072876, "loss/dist_ce": 0.0, "loss/fcd": 1.078125, "loss/idx": 12.0, "loss/logits": 0.17788350582122803, "step": 767 }, { "epoch": 0.011467821412572793, "grad_norm": 0.62890625, "grad_norm_var": 0.00291136105855306, "learning_rate": 2e-05, "loss": 1.2025, "loss/crossentropy": 2.546200752258301, "loss/dist_ce": 0.0, "loss/fcd": 1.0546875, "loss/idx": 12.0, "loss/logits": 0.14780209958553314, "step": 768 }, { "epoch": 0.011482753471703747, "grad_norm": 0.515625, "grad_norm_var": 0.002915175755818685, "learning_rate": 2e-05, "loss": 1.2694, "loss/crossentropy": 2.6620922088623047, "loss/dist_ce": 0.0, "loss/fcd": 1.09375, "loss/idx": 12.0, "loss/logits": 0.17561113834381104, "step": 769 }, { "epoch": 0.011497685530834701, "grad_norm": 0.55078125, "grad_norm_var": 0.0025019168853759764, "learning_rate": 2e-05, "loss": 1.2409, "loss/crossentropy": 2.733931064605713, "loss/dist_ce": 0.0, "loss/fcd": 1.0859375, "loss/idx": 12.0, "loss/logits": 0.15500226616859436, "step": 770 }, { "epoch": 0.011512617589965655, "grad_norm": 0.515625, "grad_norm_var": 0.002555068333943685, "learning_rate": 2e-05, "loss": 1.1855, "loss/crossentropy": 2.6056385040283203, "loss/dist_ce": 0.0, "loss/fcd": 1.0390625, "loss/idx": 12.0, "loss/logits": 0.14647497236728668, "step": 771 }, { "epoch": 0.01152754964909661, "grad_norm": 0.5625, "grad_norm_var": 0.002448256810506185, "learning_rate": 2e-05, "loss": 1.2866, "loss/crossentropy": 2.3778257369995117, "loss/dist_ce": 0.0, "loss/fcd": 1.125, "loss/idx": 12.0, "loss/logits": 0.1616000533103943, "step": 772 }, { "epoch": 0.011542481708227565, "grad_norm": 0.51953125, "grad_norm_var": 0.0024135430653889974, "learning_rate": 2e-05, "loss": 1.2556, "loss/crossentropy": 2.5326895713806152, "loss/dist_ce": 0.0, "loss/fcd": 1.0859375, "loss/idx": 12.0, "loss/logits": 0.1696186363697052, "step": 773 }, { "epoch": 0.01155741376735852, "grad_norm": 0.64453125, "grad_norm_var": 0.0029796441396077473, "learning_rate": 2e-05, "loss": 1.4358, "loss/crossentropy": 2.400148630142212, "loss/dist_ce": 0.0, "loss/fcd": 1.2421875, "loss/idx": 12.0, "loss/logits": 0.19359630346298218, "step": 774 }, { "epoch": 0.011572345826489474, "grad_norm": 0.60546875, "grad_norm_var": 0.003068908055623372, "learning_rate": 2e-05, "loss": 1.2743, "loss/crossentropy": 2.5815742015838623, "loss/dist_ce": 0.0, "loss/fcd": 1.109375, "loss/idx": 12.0, "loss/logits": 0.164922833442688, "step": 775 }, { "epoch": 0.011587277885620428, "grad_norm": 0.494140625, "grad_norm_var": 0.0033002217610677083, "learning_rate": 2e-05, "loss": 1.1144, "loss/crossentropy": 2.582948684692383, "loss/dist_ce": 0.0, "loss/fcd": 0.98046875, "loss/idx": 12.0, "loss/logits": 0.13390488922595978, "step": 776 }, { "epoch": 0.011602209944751382, "grad_norm": 0.78515625, "grad_norm_var": 0.005923906962076823, "learning_rate": 2e-05, "loss": 1.4428, "loss/crossentropy": 2.4833004474639893, "loss/dist_ce": 0.0, "loss/fcd": 1.234375, "loss/idx": 12.0, "loss/logits": 0.20838040113449097, "step": 777 }, { "epoch": 0.011617142003882336, "grad_norm": 0.5390625, "grad_norm_var": 0.0056294600168863935, "learning_rate": 2e-05, "loss": 1.2143, "loss/crossentropy": 2.672031879425049, "loss/dist_ce": 0.0, "loss/fcd": 1.0546875, "loss/idx": 12.0, "loss/logits": 0.15958479046821594, "step": 778 }, { "epoch": 0.01163207406301329, "grad_norm": 0.53125, "grad_norm_var": 0.0055493513743082685, "learning_rate": 2e-05, "loss": 1.2599, "loss/crossentropy": 2.485051155090332, "loss/dist_ce": 0.0, "loss/fcd": 1.0859375, "loss/idx": 12.0, "loss/logits": 0.17396780848503113, "step": 779 }, { "epoch": 0.011647006122144244, "grad_norm": 0.515625, "grad_norm_var": 0.005325937271118164, "learning_rate": 2e-05, "loss": 1.1861, "loss/crossentropy": 2.4769065380096436, "loss/dist_ce": 0.0, "loss/fcd": 1.0390625, "loss/idx": 12.0, "loss/logits": 0.14706002175807953, "step": 780 }, { "epoch": 0.011661938181275198, "grad_norm": 0.5234375, "grad_norm_var": 0.005379724502563477, "learning_rate": 2e-05, "loss": 1.2594, "loss/crossentropy": 2.6715643405914307, "loss/dist_ce": 0.0, "loss/fcd": 1.09375, "loss/idx": 12.0, "loss/logits": 0.1656656712293625, "step": 781 }, { "epoch": 0.011676870240406152, "grad_norm": 0.51171875, "grad_norm_var": 0.005429188410441081, "learning_rate": 2e-05, "loss": 1.2034, "loss/crossentropy": 2.6194474697113037, "loss/dist_ce": 0.0, "loss/fcd": 1.046875, "loss/idx": 12.0, "loss/logits": 0.1565479040145874, "step": 782 }, { "epoch": 0.011691802299537106, "grad_norm": 0.5703125, "grad_norm_var": 0.005432621637980143, "learning_rate": 2e-05, "loss": 1.2735, "loss/crossentropy": 2.634058713912964, "loss/dist_ce": 0.0, "loss/fcd": 1.109375, "loss/idx": 12.0, "loss/logits": 0.16408279538154602, "step": 783 }, { "epoch": 0.01170673435866806, "grad_norm": 0.498046875, "grad_norm_var": 0.005359141031901041, "learning_rate": 2e-05, "loss": 1.2094, "loss/crossentropy": 2.4988787174224854, "loss/dist_ce": 0.0, "loss/fcd": 1.0625, "loss/idx": 12.0, "loss/logits": 0.14692716300487518, "step": 784 }, { "epoch": 0.011721666417799015, "grad_norm": 0.482421875, "grad_norm_var": 0.005603138605753581, "learning_rate": 2e-05, "loss": 1.1987, "loss/crossentropy": 2.6358911991119385, "loss/dist_ce": 0.0, "loss/fcd": 1.046875, "loss/idx": 12.0, "loss/logits": 0.15186846256256104, "step": 785 }, { "epoch": 0.011736598476929969, "grad_norm": 0.48828125, "grad_norm_var": 0.005866607030232747, "learning_rate": 2e-05, "loss": 1.115, "loss/crossentropy": 2.591740131378174, "loss/dist_ce": 0.0, "loss/fcd": 0.98046875, "loss/idx": 12.0, "loss/logits": 0.13453420996665955, "step": 786 }, { "epoch": 0.011751530536060923, "grad_norm": 0.546875, "grad_norm_var": 0.005787769953409831, "learning_rate": 2e-05, "loss": 1.2283, "loss/crossentropy": 2.444399833679199, "loss/dist_ce": 0.0, "loss/fcd": 1.0703125, "loss/idx": 12.0, "loss/logits": 0.15801313519477844, "step": 787 }, { "epoch": 0.011766462595191877, "grad_norm": 0.5625, "grad_norm_var": 0.005787769953409831, "learning_rate": 2e-05, "loss": 1.3204, "loss/crossentropy": 2.599078416824341, "loss/dist_ce": 0.0, "loss/fcd": 1.125, "loss/idx": 12.0, "loss/logits": 0.19537828862667084, "step": 788 }, { "epoch": 0.011781394654322831, "grad_norm": 0.6875, "grad_norm_var": 0.006843042373657226, "learning_rate": 2e-05, "loss": 1.3781, "loss/crossentropy": 2.5863723754882812, "loss/dist_ce": 0.0, "loss/fcd": 1.203125, "loss/idx": 12.0, "loss/logits": 0.174960196018219, "step": 789 }, { "epoch": 0.011796326713453785, "grad_norm": 0.53515625, "grad_norm_var": 0.006381972630818685, "learning_rate": 2e-05, "loss": 1.2814, "loss/crossentropy": 2.3581089973449707, "loss/dist_ce": 0.0, "loss/fcd": 1.09375, "loss/idx": 12.0, "loss/logits": 0.18761396408081055, "step": 790 }, { "epoch": 0.011811258772584739, "grad_norm": 0.490234375, "grad_norm_var": 0.006433550516764323, "learning_rate": 2e-05, "loss": 1.2193, "loss/crossentropy": 2.3400156497955322, "loss/dist_ce": 0.0, "loss/fcd": 1.0625, "loss/idx": 12.0, "loss/logits": 0.15682953596115112, "step": 791 }, { "epoch": 0.011826190831715693, "grad_norm": 0.58984375, "grad_norm_var": 0.0063237349192301435, "learning_rate": 2e-05, "loss": 1.2358, "loss/crossentropy": 2.597322702407837, "loss/dist_ce": 0.0, "loss/fcd": 1.0703125, "loss/idx": 12.0, "loss/logits": 0.16544011235237122, "step": 792 }, { "epoch": 0.011841122890846647, "grad_norm": 0.546875, "grad_norm_var": 0.002515268325805664, "learning_rate": 2e-05, "loss": 1.255, "loss/crossentropy": 2.618713617324829, "loss/dist_ce": 0.0, "loss/fcd": 1.0859375, "loss/idx": 12.0, "loss/logits": 0.16905856132507324, "step": 793 }, { "epoch": 0.011856054949977601, "grad_norm": 0.498046875, "grad_norm_var": 0.002618408203125, "learning_rate": 2e-05, "loss": 1.2597, "loss/crossentropy": 2.586865186691284, "loss/dist_ce": 0.0, "loss/fcd": 1.09375, "loss/idx": 12.0, "loss/logits": 0.16593782603740692, "step": 794 }, { "epoch": 0.011870987009108555, "grad_norm": 0.62109375, "grad_norm_var": 0.003064409891764323, "learning_rate": 2e-05, "loss": 1.4704, "loss/crossentropy": 2.281075954437256, "loss/dist_ce": 0.0, "loss/fcd": 1.265625, "loss/idx": 12.0, "loss/logits": 0.2047278881072998, "step": 795 }, { "epoch": 0.01188591906823951, "grad_norm": 0.53125, "grad_norm_var": 0.003025245666503906, "learning_rate": 2e-05, "loss": 1.2074, "loss/crossentropy": 2.56146502494812, "loss/dist_ce": 0.0, "loss/fcd": 1.0546875, "loss/idx": 12.0, "loss/logits": 0.1527431160211563, "step": 796 }, { "epoch": 0.011900851127370464, "grad_norm": 0.6328125, "grad_norm_var": 0.003491655985514323, "learning_rate": 2e-05, "loss": 1.3282, "loss/crossentropy": 2.544595241546631, "loss/dist_ce": 0.0, "loss/fcd": 1.1484375, "loss/idx": 12.0, "loss/logits": 0.17975857853889465, "step": 797 }, { "epoch": 0.011915783186501418, "grad_norm": 0.5234375, "grad_norm_var": 0.003441111246744792, "learning_rate": 2e-05, "loss": 1.1355, "loss/crossentropy": 2.5778257846832275, "loss/dist_ce": 0.0, "loss/fcd": 0.99609375, "loss/idx": 12.0, "loss/logits": 0.1393672674894333, "step": 798 }, { "epoch": 0.011930715245632374, "grad_norm": 0.462890625, "grad_norm_var": 0.003875589370727539, "learning_rate": 2e-05, "loss": 1.1403, "loss/crossentropy": 2.7800252437591553, "loss/dist_ce": 0.0, "loss/fcd": 1.0, "loss/idx": 12.0, "loss/logits": 0.1403147280216217, "step": 799 }, { "epoch": 0.011945647304763328, "grad_norm": 0.58984375, "grad_norm_var": 0.003844960530598958, "learning_rate": 2e-05, "loss": 1.3118, "loss/crossentropy": 2.5456595420837402, "loss/dist_ce": 0.0, "loss/fcd": 1.125, "loss/idx": 12.0, "loss/logits": 0.18681630492210388, "step": 800 }, { "epoch": 0.011960579363894282, "grad_norm": 0.53125, "grad_norm_var": 0.0035584608713785807, "learning_rate": 2e-05, "loss": 1.2338, "loss/crossentropy": 2.46443247795105, "loss/dist_ce": 0.0, "loss/fcd": 1.0703125, "loss/idx": 12.0, "loss/logits": 0.16349388659000397, "step": 801 }, { "epoch": 0.011975511423025236, "grad_norm": 0.53125, "grad_norm_var": 0.0033066908518473307, "learning_rate": 2e-05, "loss": 1.2967, "loss/crossentropy": 2.6430821418762207, "loss/dist_ce": 0.0, "loss/fcd": 1.1171875, "loss/idx": 12.0, "loss/logits": 0.17946891486644745, "step": 802 }, { "epoch": 0.01199044348215619, "grad_norm": 0.48828125, "grad_norm_var": 0.0035851637522379557, "learning_rate": 2e-05, "loss": 1.1305, "loss/crossentropy": 2.51657772064209, "loss/dist_ce": 0.0, "loss/fcd": 0.9921875, "loss/idx": 12.0, "loss/logits": 0.13827310502529144, "step": 803 }, { "epoch": 0.012005375541287144, "grad_norm": 0.55078125, "grad_norm_var": 0.0035763899485270183, "learning_rate": 2e-05, "loss": 1.2621, "loss/crossentropy": 2.462827205657959, "loss/dist_ce": 0.0, "loss/fcd": 1.1015625, "loss/idx": 12.0, "loss/logits": 0.16057859361171722, "step": 804 }, { "epoch": 0.012020307600418098, "grad_norm": 0.482421875, "grad_norm_var": 0.002463213602701823, "learning_rate": 2e-05, "loss": 1.1779, "loss/crossentropy": 2.651974678039551, "loss/dist_ce": 0.0, "loss/fcd": 1.03125, "loss/idx": 12.0, "loss/logits": 0.1466044932603836, "step": 805 }, { "epoch": 0.012035239659549052, "grad_norm": 0.486328125, "grad_norm_var": 0.002629709243774414, "learning_rate": 2e-05, "loss": 1.1487, "loss/crossentropy": 2.6436069011688232, "loss/dist_ce": 0.0, "loss/fcd": 1.0, "loss/idx": 12.0, "loss/logits": 0.1487184464931488, "step": 806 }, { "epoch": 0.012050171718680006, "grad_norm": 0.5078125, "grad_norm_var": 0.002544593811035156, "learning_rate": 2e-05, "loss": 1.1571, "loss/crossentropy": 2.6559572219848633, "loss/dist_ce": 0.0, "loss/fcd": 1.0078125, "loss/idx": 12.0, "loss/logits": 0.14927107095718384, "step": 807 }, { "epoch": 0.01206510377781096, "grad_norm": 0.5703125, "grad_norm_var": 0.0024279276529947918, "learning_rate": 2e-05, "loss": 1.2641, "loss/crossentropy": 2.5706608295440674, "loss/dist_ce": 0.0, "loss/fcd": 1.1015625, "loss/idx": 12.0, "loss/logits": 0.16250476241111755, "step": 808 }, { "epoch": 0.012080035836941914, "grad_norm": 0.55859375, "grad_norm_var": 0.002455584208170573, "learning_rate": 2e-05, "loss": 1.2112, "loss/crossentropy": 2.501370906829834, "loss/dist_ce": 0.0, "loss/fcd": 1.0546875, "loss/idx": 12.0, "loss/logits": 0.15652695298194885, "step": 809 }, { "epoch": 0.012094967896072869, "grad_norm": 0.5390625, "grad_norm_var": 0.002356449762980143, "learning_rate": 2e-05, "loss": 1.2613, "loss/crossentropy": 2.7686879634857178, "loss/dist_ce": 0.0, "loss/fcd": 1.0859375, "loss/idx": 12.0, "loss/logits": 0.17538884282112122, "step": 810 }, { "epoch": 0.012109899955203823, "grad_norm": 0.5546875, "grad_norm_var": 0.0018960158030192056, "learning_rate": 2e-05, "loss": 1.2619, "loss/crossentropy": 2.6589901447296143, "loss/dist_ce": 0.0, "loss/fcd": 1.0859375, "loss/idx": 12.0, "loss/logits": 0.17594030499458313, "step": 811 }, { "epoch": 0.012124832014334777, "grad_norm": 0.578125, "grad_norm_var": 0.0020173231760660807, "learning_rate": 2e-05, "loss": 1.2311, "loss/crossentropy": 2.3903369903564453, "loss/dist_ce": 0.0, "loss/fcd": 1.0703125, "loss/idx": 12.0, "loss/logits": 0.16083644330501556, "step": 812 }, { "epoch": 0.01213976407346573, "grad_norm": 0.53515625, "grad_norm_var": 0.0013624668121337891, "learning_rate": 2e-05, "loss": 1.3336, "loss/crossentropy": 2.5818023681640625, "loss/dist_ce": 0.0, "loss/fcd": 1.140625, "loss/idx": 12.0, "loss/logits": 0.19297264516353607, "step": 813 }, { "epoch": 0.012154696132596685, "grad_norm": 0.6796875, "grad_norm_var": 0.0027383009592692057, "learning_rate": 2e-05, "loss": 1.3454, "loss/crossentropy": 2.5759518146514893, "loss/dist_ce": 0.0, "loss/fcd": 1.171875, "loss/idx": 12.0, "loss/logits": 0.17351481318473816, "step": 814 }, { "epoch": 0.012169628191727639, "grad_norm": 0.5234375, "grad_norm_var": 0.0023416519165039063, "learning_rate": 2e-05, "loss": 1.2492, "loss/crossentropy": 2.6695284843444824, "loss/dist_ce": 0.0, "loss/fcd": 1.0859375, "loss/idx": 12.0, "loss/logits": 0.16327539086341858, "step": 815 }, { "epoch": 0.012184560250858593, "grad_norm": 0.53515625, "grad_norm_var": 0.0021956761678059897, "learning_rate": 2e-05, "loss": 1.3266, "loss/crossentropy": 2.319685459136963, "loss/dist_ce": 0.0, "loss/fcd": 1.140625, "loss/idx": 12.0, "loss/logits": 0.18595829606056213, "step": 816 }, { "epoch": 0.012199492309989547, "grad_norm": 0.515625, "grad_norm_var": 0.0022307713826497395, "learning_rate": 2e-05, "loss": 1.2435, "loss/crossentropy": 2.4563002586364746, "loss/dist_ce": 0.0, "loss/fcd": 1.0859375, "loss/idx": 12.0, "loss/logits": 0.15760290622711182, "step": 817 }, { "epoch": 0.012214424369120501, "grad_norm": 0.58203125, "grad_norm_var": 0.0023340861002604167, "learning_rate": 2e-05, "loss": 1.324, "loss/crossentropy": 2.5283279418945312, "loss/dist_ce": 0.0, "loss/fcd": 1.15625, "loss/idx": 12.0, "loss/logits": 0.16777649521827698, "step": 818 }, { "epoch": 0.012229356428251455, "grad_norm": 0.515625, "grad_norm_var": 0.0021814346313476563, "learning_rate": 2e-05, "loss": 1.2184, "loss/crossentropy": 2.776594638824463, "loss/dist_ce": 0.0, "loss/fcd": 1.0625, "loss/idx": 12.0, "loss/logits": 0.15586011111736298, "step": 819 }, { "epoch": 0.01224428848738241, "grad_norm": 0.5078125, "grad_norm_var": 0.002261861165364583, "learning_rate": 2e-05, "loss": 1.2115, "loss/crossentropy": 2.5414376258850098, "loss/dist_ce": 0.0, "loss/fcd": 1.0546875, "loss/idx": 12.0, "loss/logits": 0.15676796436309814, "step": 820 }, { "epoch": 0.012259220546513364, "grad_norm": 0.49609375, "grad_norm_var": 0.0021649519602457683, "learning_rate": 2e-05, "loss": 1.2741, "loss/crossentropy": 2.4355485439300537, "loss/dist_ce": 0.0, "loss/fcd": 1.109375, "loss/idx": 12.0, "loss/logits": 0.16474318504333496, "step": 821 }, { "epoch": 0.012274152605644318, "grad_norm": 0.50390625, "grad_norm_var": 0.0020517985026041667, "learning_rate": 2e-05, "loss": 1.1965, "loss/crossentropy": 2.4327552318573, "loss/dist_ce": 0.0, "loss/fcd": 1.046875, "loss/idx": 12.0, "loss/logits": 0.1496448963880539, "step": 822 }, { "epoch": 0.012289084664775272, "grad_norm": 0.55859375, "grad_norm_var": 0.0019683202107747396, "learning_rate": 2e-05, "loss": 1.3843, "loss/crossentropy": 2.559605360031128, "loss/dist_ce": 0.0, "loss/fcd": 1.1875, "loss/idx": 12.0, "loss/logits": 0.19683653116226196, "step": 823 }, { "epoch": 0.012304016723906226, "grad_norm": 0.5234375, "grad_norm_var": 0.0019606908162434896, "learning_rate": 2e-05, "loss": 1.2149, "loss/crossentropy": 2.5897631645202637, "loss/dist_ce": 0.0, "loss/fcd": 1.0546875, "loss/idx": 12.0, "loss/logits": 0.1601862609386444, "step": 824 }, { "epoch": 0.012318948783037182, "grad_norm": 0.55859375, "grad_norm_var": 0.0019606908162434896, "learning_rate": 2e-05, "loss": 1.2356, "loss/crossentropy": 2.696868658065796, "loss/dist_ce": 0.0, "loss/fcd": 1.0703125, "loss/idx": 12.0, "loss/logits": 0.16526451706886292, "step": 825 }, { "epoch": 0.012333880842168136, "grad_norm": 0.6484375, "grad_norm_var": 0.0026336034138997396, "learning_rate": 2e-05, "loss": 1.303, "loss/crossentropy": 2.709047794342041, "loss/dist_ce": 0.0, "loss/fcd": 1.1328125, "loss/idx": 12.0, "loss/logits": 0.17016342282295227, "step": 826 }, { "epoch": 0.01234881290129909, "grad_norm": 0.54296875, "grad_norm_var": 0.0026364644368489583, "learning_rate": 2e-05, "loss": 1.2634, "loss/crossentropy": 2.4290895462036133, "loss/dist_ce": 0.0, "loss/fcd": 1.09375, "loss/idx": 12.0, "loss/logits": 0.16962262988090515, "step": 827 }, { "epoch": 0.012363744960430044, "grad_norm": 0.515625, "grad_norm_var": 0.0026486714680989585, "learning_rate": 2e-05, "loss": 1.2122, "loss/crossentropy": 2.5961177349090576, "loss/dist_ce": 0.0, "loss/fcd": 1.046875, "loss/idx": 12.0, "loss/logits": 0.16535454988479614, "step": 828 }, { "epoch": 0.012378677019560998, "grad_norm": 0.5390625, "grad_norm_var": 0.002643775939941406, "learning_rate": 2e-05, "loss": 1.2599, "loss/crossentropy": 2.601069211959839, "loss/dist_ce": 0.0, "loss/fcd": 1.1015625, "loss/idx": 12.0, "loss/logits": 0.15833136439323425, "step": 829 }, { "epoch": 0.012393609078691952, "grad_norm": 0.515625, "grad_norm_var": 0.0014154434204101563, "learning_rate": 2e-05, "loss": 1.2341, "loss/crossentropy": 2.6840217113494873, "loss/dist_ce": 0.0, "loss/fcd": 1.0703125, "loss/idx": 12.0, "loss/logits": 0.16378390789031982, "step": 830 }, { "epoch": 0.012408541137822906, "grad_norm": 0.59375, "grad_norm_var": 0.0016031265258789062, "learning_rate": 2e-05, "loss": 1.3101, "loss/crossentropy": 2.6370441913604736, "loss/dist_ce": 0.0, "loss/fcd": 1.125, "loss/idx": 12.0, "loss/logits": 0.18508955836296082, "step": 831 }, { "epoch": 0.01242347319695386, "grad_norm": 0.57421875, "grad_norm_var": 0.0016692479451497395, "learning_rate": 2e-05, "loss": 1.3561, "loss/crossentropy": 2.3607304096221924, "loss/dist_ce": 0.0, "loss/fcd": 1.1796875, "loss/idx": 12.0, "loss/logits": 0.1763923168182373, "step": 832 }, { "epoch": 0.012438405256084814, "grad_norm": 0.482421875, "grad_norm_var": 0.0018602848052978516, "learning_rate": 2e-05, "loss": 1.1611, "loss/crossentropy": 2.4794504642486572, "loss/dist_ce": 0.0, "loss/fcd": 1.0234375, "loss/idx": 12.0, "loss/logits": 0.13761460781097412, "step": 833 }, { "epoch": 0.012453337315215769, "grad_norm": 0.53515625, "grad_norm_var": 0.0017420291900634766, "learning_rate": 2e-05, "loss": 1.3138, "loss/crossentropy": 2.515113353729248, "loss/dist_ce": 0.0, "loss/fcd": 1.125, "loss/idx": 12.0, "loss/logits": 0.18878057599067688, "step": 834 }, { "epoch": 0.012468269374346723, "grad_norm": 0.5703125, "grad_norm_var": 0.0017642815907796224, "learning_rate": 2e-05, "loss": 1.3528, "loss/crossentropy": 2.7300655841827393, "loss/dist_ce": 0.0, "loss/fcd": 1.1484375, "loss/idx": 12.0, "loss/logits": 0.2043353170156479, "step": 835 }, { "epoch": 0.012483201433477677, "grad_norm": 0.50390625, "grad_norm_var": 0.001782846450805664, "learning_rate": 2e-05, "loss": 1.1872, "loss/crossentropy": 2.455660104751587, "loss/dist_ce": 0.0, "loss/fcd": 1.0390625, "loss/idx": 12.0, "loss/logits": 0.14813506603240967, "step": 836 }, { "epoch": 0.01249813349260863, "grad_norm": 0.5078125, "grad_norm_var": 0.0017206668853759766, "learning_rate": 2e-05, "loss": 1.2639, "loss/crossentropy": 2.3937854766845703, "loss/dist_ce": 0.0, "loss/fcd": 1.109375, "loss/idx": 12.0, "loss/logits": 0.1545325219631195, "step": 837 }, { "epoch": 0.012513065551739585, "grad_norm": 0.52734375, "grad_norm_var": 0.001635599136352539, "learning_rate": 2e-05, "loss": 1.201, "loss/crossentropy": 2.608152389526367, "loss/dist_ce": 0.0, "loss/fcd": 1.046875, "loss/idx": 12.0, "loss/logits": 0.15416556596755981, "step": 838 }, { "epoch": 0.012527997610870539, "grad_norm": 0.6171875, "grad_norm_var": 0.0019674777984619142, "learning_rate": 2e-05, "loss": 1.3746, "loss/crossentropy": 2.6124019622802734, "loss/dist_ce": 0.0, "loss/fcd": 1.1796875, "loss/idx": 12.0, "loss/logits": 0.19495141506195068, "step": 839 }, { "epoch": 0.012542929670001493, "grad_norm": 0.546875, "grad_norm_var": 0.0019274234771728515, "learning_rate": 2e-05, "loss": 1.145, "loss/crossentropy": 2.5301358699798584, "loss/dist_ce": 0.0, "loss/fcd": 1.0, "loss/idx": 12.0, "loss/logits": 0.14495986700057983, "step": 840 }, { "epoch": 0.012557861729132447, "grad_norm": 0.490234375, "grad_norm_var": 0.002129364013671875, "learning_rate": 2e-05, "loss": 1.0545, "loss/crossentropy": 2.5924758911132812, "loss/dist_ce": 0.0, "loss/fcd": 0.93359375, "loss/idx": 12.0, "loss/logits": 0.12090451270341873, "step": 841 }, { "epoch": 0.012572793788263401, "grad_norm": 0.6328125, "grad_norm_var": 0.001927947998046875, "learning_rate": 2e-05, "loss": 1.2721, "loss/crossentropy": 2.3995118141174316, "loss/dist_ce": 0.0, "loss/fcd": 1.109375, "loss/idx": 12.0, "loss/logits": 0.16268111765384674, "step": 842 }, { "epoch": 0.012587725847394355, "grad_norm": 0.54296875, "grad_norm_var": 0.001927947998046875, "learning_rate": 2e-05, "loss": 1.3116, "loss/crossentropy": 2.5142955780029297, "loss/dist_ce": 0.0, "loss/fcd": 1.125, "loss/idx": 12.0, "loss/logits": 0.18662039935588837, "step": 843 }, { "epoch": 0.01260265790652531, "grad_norm": 0.6015625, "grad_norm_var": 0.00207061767578125, "learning_rate": 2e-05, "loss": 1.336, "loss/crossentropy": 2.4487991333007812, "loss/dist_ce": 0.0, "loss/fcd": 1.15625, "loss/idx": 12.0, "loss/logits": 0.17975641787052155, "step": 844 }, { "epoch": 0.012617589965656264, "grad_norm": 0.56640625, "grad_norm_var": 0.002081743876139323, "learning_rate": 2e-05, "loss": 1.2271, "loss/crossentropy": 2.799164295196533, "loss/dist_ce": 0.0, "loss/fcd": 1.0546875, "loss/idx": 12.0, "loss/logits": 0.17243638634681702, "step": 845 }, { "epoch": 0.012632522024787218, "grad_norm": 0.494140625, "grad_norm_var": 0.002210601170857747, "learning_rate": 2e-05, "loss": 1.2006, "loss/crossentropy": 2.636415958404541, "loss/dist_ce": 0.0, "loss/fcd": 1.046875, "loss/idx": 12.0, "loss/logits": 0.1537102460861206, "step": 846 }, { "epoch": 0.012647454083918172, "grad_norm": 0.49609375, "grad_norm_var": 0.0022264957427978516, "learning_rate": 2e-05, "loss": 1.2061, "loss/crossentropy": 2.377129316329956, "loss/dist_ce": 0.0, "loss/fcd": 1.046875, "loss/idx": 12.0, "loss/logits": 0.15926313400268555, "step": 847 }, { "epoch": 0.012662386143049126, "grad_norm": 0.58984375, "grad_norm_var": 0.0023066043853759766, "learning_rate": 2e-05, "loss": 1.2835, "loss/crossentropy": 2.4958112239837646, "loss/dist_ce": 0.0, "loss/fcd": 1.09375, "loss/idx": 12.0, "loss/logits": 0.1897384524345398, "step": 848 }, { "epoch": 0.01267731820218008, "grad_norm": 0.49609375, "grad_norm_var": 0.0022059122721354166, "learning_rate": 2e-05, "loss": 1.2354, "loss/crossentropy": 2.732330560684204, "loss/dist_ce": 0.0, "loss/fcd": 1.0625, "loss/idx": 12.0, "loss/logits": 0.17292934656143188, "step": 849 }, { "epoch": 0.012692250261311034, "grad_norm": 0.5078125, "grad_norm_var": 0.0022882461547851563, "learning_rate": 2e-05, "loss": 1.1959, "loss/crossentropy": 2.8346798419952393, "loss/dist_ce": 0.0, "loss/fcd": 1.046875, "loss/idx": 12.0, "loss/logits": 0.14898401498794556, "step": 850 }, { "epoch": 0.01270718232044199, "grad_norm": 0.546875, "grad_norm_var": 0.002237892150878906, "learning_rate": 2e-05, "loss": 1.1314, "loss/crossentropy": 2.544158697128296, "loss/dist_ce": 0.0, "loss/fcd": 0.98828125, "loss/idx": 12.0, "loss/logits": 0.14313051104545593, "step": 851 }, { "epoch": 0.012722114379572944, "grad_norm": 0.51171875, "grad_norm_var": 0.002202288309733073, "learning_rate": 2e-05, "loss": 1.2397, "loss/crossentropy": 2.5338921546936035, "loss/dist_ce": 0.0, "loss/fcd": 1.078125, "loss/idx": 12.0, "loss/logits": 0.16162025928497314, "step": 852 }, { "epoch": 0.012737046438703898, "grad_norm": 0.671875, "grad_norm_var": 0.0031315485636393228, "learning_rate": 2e-05, "loss": 1.2927, "loss/crossentropy": 2.4311304092407227, "loss/dist_ce": 0.0, "loss/fcd": 1.125, "loss/idx": 12.0, "loss/logits": 0.16769030690193176, "step": 853 }, { "epoch": 0.012751978497834852, "grad_norm": 0.5078125, "grad_norm_var": 0.0032208760579427085, "learning_rate": 2e-05, "loss": 1.2106, "loss/crossentropy": 2.668276309967041, "loss/dist_ce": 0.0, "loss/fcd": 1.0546875, "loss/idx": 12.0, "loss/logits": 0.15591159462928772, "step": 854 }, { "epoch": 0.012766910556965806, "grad_norm": 0.515625, "grad_norm_var": 0.0029729207356770835, "learning_rate": 2e-05, "loss": 1.3015, "loss/crossentropy": 2.482598304748535, "loss/dist_ce": 0.0, "loss/fcd": 1.1171875, "loss/idx": 12.0, "loss/logits": 0.18431049585342407, "step": 855 }, { "epoch": 0.01278184261609676, "grad_norm": 0.53125, "grad_norm_var": 0.0029841105143229166, "learning_rate": 2e-05, "loss": 1.3188, "loss/crossentropy": 2.7403719425201416, "loss/dist_ce": 0.0, "loss/fcd": 1.125, "loss/idx": 12.0, "loss/logits": 0.19376035034656525, "step": 856 }, { "epoch": 0.012796774675227714, "grad_norm": 0.53515625, "grad_norm_var": 0.0027885278065999347, "learning_rate": 2e-05, "loss": 1.2216, "loss/crossentropy": 2.433129072189331, "loss/dist_ce": 0.0, "loss/fcd": 1.0546875, "loss/idx": 12.0, "loss/logits": 0.16688039898872375, "step": 857 }, { "epoch": 0.012811706734358668, "grad_norm": 0.48828125, "grad_norm_var": 0.002435668309529622, "learning_rate": 2e-05, "loss": 1.2295, "loss/crossentropy": 2.5015032291412354, "loss/dist_ce": 0.0, "loss/fcd": 1.0703125, "loss/idx": 12.0, "loss/logits": 0.15920080244541168, "step": 858 }, { "epoch": 0.012826638793489623, "grad_norm": 0.57421875, "grad_norm_var": 0.0025185743967692056, "learning_rate": 2e-05, "loss": 1.2442, "loss/crossentropy": 2.488417625427246, "loss/dist_ce": 0.0, "loss/fcd": 1.0625, "loss/idx": 12.0, "loss/logits": 0.1816791594028473, "step": 859 }, { "epoch": 0.012841570852620577, "grad_norm": 0.52734375, "grad_norm_var": 0.0022504011789957683, "learning_rate": 2e-05, "loss": 1.2024, "loss/crossentropy": 2.665573835372925, "loss/dist_ce": 0.0, "loss/fcd": 1.046875, "loss/idx": 12.0, "loss/logits": 0.15554329752922058, "step": 860 }, { "epoch": 0.01285650291175153, "grad_norm": 0.8046875, "grad_norm_var": 0.006795740127563477, "learning_rate": 2e-05, "loss": 1.4828, "loss/crossentropy": 2.3143980503082275, "loss/dist_ce": 0.0, "loss/fcd": 1.2578125, "loss/idx": 12.0, "loss/logits": 0.22494599223136902, "step": 861 }, { "epoch": 0.012871434970882485, "grad_norm": 0.578125, "grad_norm_var": 0.006611887613932292, "learning_rate": 2e-05, "loss": 1.2513, "loss/crossentropy": 2.563246965408325, "loss/dist_ce": 0.0, "loss/fcd": 1.09375, "loss/idx": 12.0, "loss/logits": 0.15756869316101074, "step": 862 }, { "epoch": 0.012886367030013439, "grad_norm": 0.5859375, "grad_norm_var": 0.00640862782796224, "learning_rate": 2e-05, "loss": 1.2763, "loss/crossentropy": 2.4165306091308594, "loss/dist_ce": 0.0, "loss/fcd": 1.1015625, "loss/idx": 12.0, "loss/logits": 0.17470136284828186, "step": 863 }, { "epoch": 0.012901299089144393, "grad_norm": 0.58203125, "grad_norm_var": 0.006382179260253906, "learning_rate": 2e-05, "loss": 1.2261, "loss/crossentropy": 2.7474277019500732, "loss/dist_ce": 0.0, "loss/fcd": 1.078125, "loss/idx": 12.0, "loss/logits": 0.1479920744895935, "step": 864 }, { "epoch": 0.012916231148275347, "grad_norm": 0.486328125, "grad_norm_var": 0.006471745173136393, "learning_rate": 2e-05, "loss": 1.1288, "loss/crossentropy": 2.628307819366455, "loss/dist_ce": 0.0, "loss/fcd": 0.98828125, "loss/idx": 12.0, "loss/logits": 0.14052993059158325, "step": 865 }, { "epoch": 0.012931163207406301, "grad_norm": 0.5703125, "grad_norm_var": 0.00628355344136556, "learning_rate": 2e-05, "loss": 1.2903, "loss/crossentropy": 2.6581015586853027, "loss/dist_ce": 0.0, "loss/fcd": 1.125, "loss/idx": 12.0, "loss/logits": 0.16525937616825104, "step": 866 }, { "epoch": 0.012946095266537255, "grad_norm": 0.51171875, "grad_norm_var": 0.00643919308980306, "learning_rate": 2e-05, "loss": 1.2729, "loss/crossentropy": 2.480175256729126, "loss/dist_ce": 0.0, "loss/fcd": 1.09375, "loss/idx": 12.0, "loss/logits": 0.17913323640823364, "step": 867 }, { "epoch": 0.01296102732566821, "grad_norm": 0.55859375, "grad_norm_var": 0.006266005833943685, "learning_rate": 2e-05, "loss": 1.1887, "loss/crossentropy": 2.5329577922821045, "loss/dist_ce": 0.0, "loss/fcd": 1.0390625, "loss/idx": 12.0, "loss/logits": 0.14967145025730133, "step": 868 }, { "epoch": 0.012975959384799163, "grad_norm": 0.5, "grad_norm_var": 0.005647770563761393, "learning_rate": 2e-05, "loss": 1.2151, "loss/crossentropy": 2.5917510986328125, "loss/dist_ce": 0.0, "loss/fcd": 1.046875, "loss/idx": 12.0, "loss/logits": 0.16825619339942932, "step": 869 }, { "epoch": 0.012990891443930118, "grad_norm": 0.55859375, "grad_norm_var": 0.005498997370402018, "learning_rate": 2e-05, "loss": 1.279, "loss/crossentropy": 2.703120231628418, "loss/dist_ce": 0.0, "loss/fcd": 1.09375, "loss/idx": 12.0, "loss/logits": 0.1852026879787445, "step": 870 }, { "epoch": 0.013005823503061072, "grad_norm": 0.6015625, "grad_norm_var": 0.005489206314086914, "learning_rate": 2e-05, "loss": 1.3381, "loss/crossentropy": 2.6077189445495605, "loss/dist_ce": 0.0, "loss/fcd": 1.15625, "loss/idx": 12.0, "loss/logits": 0.18187852203845978, "step": 871 }, { "epoch": 0.013020755562192026, "grad_norm": 0.52734375, "grad_norm_var": 0.005506245295206705, "learning_rate": 2e-05, "loss": 1.2242, "loss/crossentropy": 2.229553461074829, "loss/dist_ce": 0.0, "loss/fcd": 1.078125, "loss/idx": 12.0, "loss/logits": 0.14606034755706787, "step": 872 }, { "epoch": 0.01303568762132298, "grad_norm": 0.5078125, "grad_norm_var": 0.005650440851847331, "learning_rate": 2e-05, "loss": 1.0931, "loss/crossentropy": 2.4416704177856445, "loss/dist_ce": 0.0, "loss/fcd": 0.9609375, "loss/idx": 12.0, "loss/logits": 0.13215383887290955, "step": 873 }, { "epoch": 0.013050619680453934, "grad_norm": 0.6015625, "grad_norm_var": 0.00536650021870931, "learning_rate": 2e-05, "loss": 1.3962, "loss/crossentropy": 2.580476760864258, "loss/dist_ce": 0.0, "loss/fcd": 1.1875, "loss/idx": 12.0, "loss/logits": 0.20873740315437317, "step": 874 }, { "epoch": 0.013065551739584888, "grad_norm": 0.625, "grad_norm_var": 0.005574782689412435, "learning_rate": 2e-05, "loss": 1.4374, "loss/crossentropy": 2.3341832160949707, "loss/dist_ce": 0.0, "loss/fcd": 1.2421875, "loss/idx": 12.0, "loss/logits": 0.19524669647216797, "step": 875 }, { "epoch": 0.013080483798715842, "grad_norm": 0.546875, "grad_norm_var": 0.005486408869425456, "learning_rate": 2e-05, "loss": 1.3415, "loss/crossentropy": 2.2510344982147217, "loss/dist_ce": 0.0, "loss/fcd": 1.1484375, "loss/idx": 12.0, "loss/logits": 0.19304239749908447, "step": 876 }, { "epoch": 0.013095415857846796, "grad_norm": 0.609375, "grad_norm_var": 0.0018020470937093098, "learning_rate": 2e-05, "loss": 1.3444, "loss/crossentropy": 2.42315936088562, "loss/dist_ce": 0.0, "loss/fcd": 1.171875, "loss/idx": 12.0, "loss/logits": 0.1725073605775833, "step": 877 }, { "epoch": 0.013110347916977752, "grad_norm": 0.6484375, "grad_norm_var": 0.0022861321767171225, "learning_rate": 2e-05, "loss": 1.206, "loss/crossentropy": 2.3976566791534424, "loss/dist_ce": 0.0, "loss/fcd": 1.046875, "loss/idx": 12.0, "loss/logits": 0.1590883731842041, "step": 878 }, { "epoch": 0.013125279976108706, "grad_norm": 0.5703125, "grad_norm_var": 0.0022553602854410806, "learning_rate": 2e-05, "loss": 1.3037, "loss/crossentropy": 2.417356014251709, "loss/dist_ce": 0.0, "loss/fcd": 1.125, "loss/idx": 12.0, "loss/logits": 0.17873302102088928, "step": 879 }, { "epoch": 0.01314021203523966, "grad_norm": 0.48828125, "grad_norm_var": 0.0025651137034098308, "learning_rate": 2e-05, "loss": 1.25, "loss/crossentropy": 2.579538345336914, "loss/dist_ce": 0.0, "loss/fcd": 1.0859375, "loss/idx": 12.0, "loss/logits": 0.1640225350856781, "step": 880 }, { "epoch": 0.013155144094370614, "grad_norm": 0.6171875, "grad_norm_var": 0.002402178446451823, "learning_rate": 2e-05, "loss": 1.397, "loss/crossentropy": 2.5399506092071533, "loss/dist_ce": 0.0, "loss/fcd": 1.203125, "loss/idx": 12.0, "loss/logits": 0.1938353031873703, "step": 881 }, { "epoch": 0.013170076153501568, "grad_norm": 0.55859375, "grad_norm_var": 0.0024027506510416667, "learning_rate": 2e-05, "loss": 1.3593, "loss/crossentropy": 2.570274591445923, "loss/dist_ce": 0.0, "loss/fcd": 1.1640625, "loss/idx": 12.0, "loss/logits": 0.19524267315864563, "step": 882 }, { "epoch": 0.013185008212632522, "grad_norm": 0.57421875, "grad_norm_var": 0.0022074381510416665, "learning_rate": 2e-05, "loss": 1.2983, "loss/crossentropy": 2.8754701614379883, "loss/dist_ce": 0.0, "loss/fcd": 1.1171875, "loss/idx": 12.0, "loss/logits": 0.18115171790122986, "step": 883 }, { "epoch": 0.013199940271763477, "grad_norm": 0.56640625, "grad_norm_var": 0.002201080322265625, "learning_rate": 2e-05, "loss": 1.2878, "loss/crossentropy": 2.8438570499420166, "loss/dist_ce": 0.0, "loss/fcd": 1.1171875, "loss/idx": 12.0, "loss/logits": 0.1705954670906067, "step": 884 }, { "epoch": 0.01321487233089443, "grad_norm": 0.55078125, "grad_norm_var": 0.0018960952758789063, "learning_rate": 2e-05, "loss": 1.3113, "loss/crossentropy": 2.6516339778900146, "loss/dist_ce": 0.0, "loss/fcd": 1.125, "loss/idx": 12.0, "loss/logits": 0.1863495111465454, "step": 885 }, { "epoch": 0.013229804390025385, "grad_norm": 0.486328125, "grad_norm_var": 0.0023518721262613933, "learning_rate": 2e-05, "loss": 1.1807, "loss/crossentropy": 2.603248119354248, "loss/dist_ce": 0.0, "loss/fcd": 1.0390625, "loss/idx": 12.0, "loss/logits": 0.14165057241916656, "step": 886 }, { "epoch": 0.013244736449156339, "grad_norm": 0.55078125, "grad_norm_var": 0.002282444636027018, "learning_rate": 2e-05, "loss": 1.2735, "loss/crossentropy": 2.409811019897461, "loss/dist_ce": 0.0, "loss/fcd": 1.109375, "loss/idx": 12.0, "loss/logits": 0.16412316262722015, "step": 887 }, { "epoch": 0.013259668508287293, "grad_norm": 0.51953125, "grad_norm_var": 0.0023247877756754558, "learning_rate": 2e-05, "loss": 1.2055, "loss/crossentropy": 2.680741548538208, "loss/dist_ce": 0.0, "loss/fcd": 1.046875, "loss/idx": 12.0, "loss/logits": 0.15857717394828796, "step": 888 }, { "epoch": 0.013274600567418247, "grad_norm": 1.84375, "grad_norm_var": 0.10388995806376139, "learning_rate": 2e-05, "loss": 1.3991, "loss/crossentropy": 2.831256628036499, "loss/dist_ce": 0.0, "loss/fcd": 1.15625, "loss/idx": 12.0, "loss/logits": 0.2428184151649475, "step": 889 }, { "epoch": 0.013289532626549201, "grad_norm": 0.640625, "grad_norm_var": 0.10374690691630045, "learning_rate": 2e-05, "loss": 1.3895, "loss/crossentropy": 2.541971206665039, "loss/dist_ce": 0.0, "loss/fcd": 1.203125, "loss/idx": 12.0, "loss/logits": 0.18634910881519318, "step": 890 }, { "epoch": 0.013304464685680155, "grad_norm": 0.58203125, "grad_norm_var": 0.10400427182515462, "learning_rate": 2e-05, "loss": 1.2184, "loss/crossentropy": 2.6130967140197754, "loss/dist_ce": 0.0, "loss/fcd": 1.0546875, "loss/idx": 12.0, "loss/logits": 0.16373895108699799, "step": 891 }, { "epoch": 0.01331939674481111, "grad_norm": 0.546875, "grad_norm_var": 0.10400427182515462, "learning_rate": 2e-05, "loss": 1.19, "loss/crossentropy": 2.482786178588867, "loss/dist_ce": 0.0, "loss/fcd": 1.046875, "loss/idx": 12.0, "loss/logits": 0.14307764172554016, "step": 892 }, { "epoch": 0.013334328803942063, "grad_norm": 0.484375, "grad_norm_var": 0.10560949643452962, "learning_rate": 2e-05, "loss": 1.1326, "loss/crossentropy": 2.541658639907837, "loss/dist_ce": 0.0, "loss/fcd": 0.98828125, "loss/idx": 12.0, "loss/logits": 0.1443423479795456, "step": 893 }, { "epoch": 0.013349260863073018, "grad_norm": 0.65625, "grad_norm_var": 0.1056228478749593, "learning_rate": 2e-05, "loss": 1.3966, "loss/crossentropy": 2.303764581680298, "loss/dist_ce": 0.0, "loss/fcd": 1.203125, "loss/idx": 12.0, "loss/logits": 0.1934724748134613, "step": 894 }, { "epoch": 0.013364192922203972, "grad_norm": 0.4921875, "grad_norm_var": 0.10672783851623535, "learning_rate": 2e-05, "loss": 1.1688, "loss/crossentropy": 2.6050453186035156, "loss/dist_ce": 0.0, "loss/fcd": 1.0234375, "loss/idx": 12.0, "loss/logits": 0.1453702449798584, "step": 895 }, { "epoch": 0.013379124981334926, "grad_norm": 0.546875, "grad_norm_var": 0.10579705238342285, "learning_rate": 2e-05, "loss": 1.2725, "loss/crossentropy": 2.772411823272705, "loss/dist_ce": 0.0, "loss/fcd": 1.1015625, "loss/idx": 12.0, "loss/logits": 0.17090922594070435, "step": 896 }, { "epoch": 0.01339405704046588, "grad_norm": 0.78515625, "grad_norm_var": 0.10708196957906087, "learning_rate": 2e-05, "loss": 1.7039, "loss/crossentropy": 2.8474199771881104, "loss/dist_ce": 0.0, "loss/fcd": 1.3984375, "loss/idx": 12.0, "loss/logits": 0.30551040172576904, "step": 897 }, { "epoch": 0.013408989099596834, "grad_norm": 1.328125, "grad_norm_var": 0.13481214841206868, "learning_rate": 2e-05, "loss": 1.5814, "loss/crossentropy": 2.6117067337036133, "loss/dist_ce": 0.0, "loss/fcd": 1.3203125, "loss/idx": 12.0, "loss/logits": 0.2610923647880554, "step": 898 }, { "epoch": 0.013423921158727788, "grad_norm": 0.55859375, "grad_norm_var": 0.13508350054423016, "learning_rate": 2e-05, "loss": 1.2476, "loss/crossentropy": 2.5433382987976074, "loss/dist_ce": 0.0, "loss/fcd": 1.0859375, "loss/idx": 12.0, "loss/logits": 0.16162243485450745, "step": 899 }, { "epoch": 0.013438853217858742, "grad_norm": 0.59375, "grad_norm_var": 0.1346571445465088, "learning_rate": 2e-05, "loss": 1.3289, "loss/crossentropy": 2.3905489444732666, "loss/dist_ce": 0.0, "loss/fcd": 1.140625, "loss/idx": 12.0, "loss/logits": 0.18823018670082092, "step": 900 }, { "epoch": 0.013453785276989696, "grad_norm": 0.60546875, "grad_norm_var": 0.1337714989980062, "learning_rate": 2e-05, "loss": 1.222, "loss/crossentropy": 2.8749287128448486, "loss/dist_ce": 0.0, "loss/fcd": 1.0625, "loss/idx": 12.0, "loss/logits": 0.15945787727832794, "step": 901 }, { "epoch": 0.01346871733612065, "grad_norm": 0.73828125, "grad_norm_var": 0.13051751454671223, "learning_rate": 2e-05, "loss": 1.2667, "loss/crossentropy": 2.56249737739563, "loss/dist_ce": 0.0, "loss/fcd": 1.1015625, "loss/idx": 12.0, "loss/logits": 0.16514375805854797, "step": 902 }, { "epoch": 0.013483649395251604, "grad_norm": 0.53125, "grad_norm_var": 0.13097432454427083, "learning_rate": 2e-05, "loss": 1.3496, "loss/crossentropy": 2.5442233085632324, "loss/dist_ce": 0.0, "loss/fcd": 1.1796875, "loss/idx": 12.0, "loss/logits": 0.16989666223526, "step": 903 }, { "epoch": 0.01349858145438256, "grad_norm": 0.60546875, "grad_norm_var": 0.12918675740559896, "learning_rate": 2e-05, "loss": 1.3425, "loss/crossentropy": 2.4813945293426514, "loss/dist_ce": 0.0, "loss/fcd": 1.15625, "loss/idx": 12.0, "loss/logits": 0.18623802065849304, "step": 904 }, { "epoch": 0.013513513513513514, "grad_norm": 0.54296875, "grad_norm_var": 0.04024499257405599, "learning_rate": 2e-05, "loss": 1.1192, "loss/crossentropy": 2.650784969329834, "loss/dist_ce": 0.0, "loss/fcd": 0.98046875, "loss/idx": 12.0, "loss/logits": 0.1387377828359604, "step": 905 }, { "epoch": 0.013528445572644468, "grad_norm": 0.58203125, "grad_norm_var": 0.04045384724934896, "learning_rate": 2e-05, "loss": 1.3307, "loss/crossentropy": 2.550124168395996, "loss/dist_ce": 0.0, "loss/fcd": 1.1484375, "loss/idx": 12.0, "loss/logits": 0.1822178065776825, "step": 906 }, { "epoch": 0.013543377631775422, "grad_norm": 0.61328125, "grad_norm_var": 0.040289052327473956, "learning_rate": 2e-05, "loss": 1.4404, "loss/crossentropy": 2.7158029079437256, "loss/dist_ce": 0.0, "loss/fcd": 1.1953125, "loss/idx": 12.0, "loss/logits": 0.24511241912841797, "step": 907 }, { "epoch": 0.013558309690906377, "grad_norm": 0.5078125, "grad_norm_var": 0.0408599853515625, "learning_rate": 2e-05, "loss": 1.2551, "loss/crossentropy": 2.580396890640259, "loss/dist_ce": 0.0, "loss/fcd": 1.0859375, "loss/idx": 12.0, "loss/logits": 0.16913624107837677, "step": 908 }, { "epoch": 0.01357324175003733, "grad_norm": 0.57421875, "grad_norm_var": 0.03955122629801432, "learning_rate": 2e-05, "loss": 1.2294, "loss/crossentropy": 2.6257517337799072, "loss/dist_ce": 0.0, "loss/fcd": 1.0625, "loss/idx": 12.0, "loss/logits": 0.16689662635326385, "step": 909 }, { "epoch": 0.013588173809168285, "grad_norm": 0.61328125, "grad_norm_var": 0.039581298828125, "learning_rate": 2e-05, "loss": 1.3795, "loss/crossentropy": 2.5408027172088623, "loss/dist_ce": 0.0, "loss/fcd": 1.203125, "loss/idx": 12.0, "loss/logits": 0.1763436645269394, "step": 910 }, { "epoch": 0.013603105868299239, "grad_norm": 0.54296875, "grad_norm_var": 0.038750648498535156, "learning_rate": 2e-05, "loss": 1.2114, "loss/crossentropy": 2.5792062282562256, "loss/dist_ce": 0.0, "loss/fcd": 1.0546875, "loss/idx": 12.0, "loss/logits": 0.1567598581314087, "step": 911 }, { "epoch": 0.013618037927430193, "grad_norm": 0.5234375, "grad_norm_var": 0.039081764221191403, "learning_rate": 2e-05, "loss": 1.2693, "loss/crossentropy": 2.639925956726074, "loss/dist_ce": 0.0, "loss/fcd": 1.09375, "loss/idx": 12.0, "loss/logits": 0.17551803588867188, "step": 912 }, { "epoch": 0.013632969986561147, "grad_norm": 0.51953125, "grad_norm_var": 0.03836409250895182, "learning_rate": 2e-05, "loss": 1.2264, "loss/crossentropy": 2.6736505031585693, "loss/dist_ce": 0.0, "loss/fcd": 1.0625, "loss/idx": 12.0, "loss/logits": 0.1639370322227478, "step": 913 }, { "epoch": 0.013647902045692101, "grad_norm": 0.625, "grad_norm_var": 0.0032307306925455728, "learning_rate": 2e-05, "loss": 1.3798, "loss/crossentropy": 2.731677293777466, "loss/dist_ce": 0.0, "loss/fcd": 1.1875, "loss/idx": 12.0, "loss/logits": 0.19231237471103668, "step": 914 }, { "epoch": 0.013662834104823055, "grad_norm": 0.51171875, "grad_norm_var": 0.003500811258951823, "learning_rate": 2e-05, "loss": 1.2027, "loss/crossentropy": 2.551647424697876, "loss/dist_ce": 0.0, "loss/fcd": 1.046875, "loss/idx": 12.0, "loss/logits": 0.15584951639175415, "step": 915 }, { "epoch": 0.01367776616395401, "grad_norm": 0.53515625, "grad_norm_var": 0.0035837809244791668, "learning_rate": 2e-05, "loss": 1.2998, "loss/crossentropy": 2.5736746788024902, "loss/dist_ce": 0.0, "loss/fcd": 1.1171875, "loss/idx": 12.0, "loss/logits": 0.18263675272464752, "step": 916 }, { "epoch": 0.013692698223084963, "grad_norm": 0.50390625, "grad_norm_var": 0.003792063395182292, "learning_rate": 2e-05, "loss": 1.2474, "loss/crossentropy": 2.5645570755004883, "loss/dist_ce": 0.0, "loss/fcd": 1.0859375, "loss/idx": 12.0, "loss/logits": 0.16151148080825806, "step": 917 }, { "epoch": 0.013707630282215917, "grad_norm": 0.49609375, "grad_norm_var": 0.0019236246744791666, "learning_rate": 2e-05, "loss": 1.1903, "loss/crossentropy": 2.7008137702941895, "loss/dist_ce": 0.0, "loss/fcd": 1.0390625, "loss/idx": 12.0, "loss/logits": 0.1512683928012848, "step": 918 }, { "epoch": 0.013722562341346872, "grad_norm": 0.5625, "grad_norm_var": 0.0018992106119791667, "learning_rate": 2e-05, "loss": 1.2117, "loss/crossentropy": 2.576681613922119, "loss/dist_ce": 0.0, "loss/fcd": 1.046875, "loss/idx": 12.0, "loss/logits": 0.16480094194412231, "step": 919 }, { "epoch": 0.013737494400477826, "grad_norm": 0.62109375, "grad_norm_var": 0.0020222981770833333, "learning_rate": 2e-05, "loss": 1.2225, "loss/crossentropy": 2.2980234622955322, "loss/dist_ce": 0.0, "loss/fcd": 1.0703125, "loss/idx": 12.0, "loss/logits": 0.1522141844034195, "step": 920 }, { "epoch": 0.01375242645960878, "grad_norm": 0.490234375, "grad_norm_var": 0.0022785027821858725, "learning_rate": 2e-05, "loss": 1.1559, "loss/crossentropy": 2.6368680000305176, "loss/dist_ce": 0.0, "loss/fcd": 1.0078125, "loss/idx": 12.0, "loss/logits": 0.14811134338378906, "step": 921 }, { "epoch": 0.013767358518739734, "grad_norm": 0.58203125, "grad_norm_var": 0.0022785027821858725, "learning_rate": 2e-05, "loss": 1.2656, "loss/crossentropy": 2.3946034908294678, "loss/dist_ce": 0.0, "loss/fcd": 1.1015625, "loss/idx": 12.0, "loss/logits": 0.16408073902130127, "step": 922 }, { "epoch": 0.013782290577870688, "grad_norm": 0.58203125, "grad_norm_var": 0.0020816644032796224, "learning_rate": 2e-05, "loss": 1.2416, "loss/crossentropy": 2.6989355087280273, "loss/dist_ce": 0.0, "loss/fcd": 1.078125, "loss/idx": 12.0, "loss/logits": 0.16348567605018616, "step": 923 }, { "epoch": 0.013797222637001642, "grad_norm": 0.51171875, "grad_norm_var": 0.0020609378814697267, "learning_rate": 2e-05, "loss": 1.1568, "loss/crossentropy": 2.457051992416382, "loss/dist_ce": 0.0, "loss/fcd": 1.0078125, "loss/idx": 12.0, "loss/logits": 0.1490294486284256, "step": 924 }, { "epoch": 0.013812154696132596, "grad_norm": 0.494140625, "grad_norm_var": 0.002199745178222656, "learning_rate": 2e-05, "loss": 1.209, "loss/crossentropy": 2.6960437297821045, "loss/dist_ce": 0.0, "loss/fcd": 1.046875, "loss/idx": 12.0, "loss/logits": 0.16211289167404175, "step": 925 }, { "epoch": 0.01382708675526355, "grad_norm": 0.57421875, "grad_norm_var": 0.0019378026326497396, "learning_rate": 2e-05, "loss": 1.2168, "loss/crossentropy": 2.486149549484253, "loss/dist_ce": 0.0, "loss/fcd": 1.0625, "loss/idx": 12.0, "loss/logits": 0.1543126404285431, "step": 926 }, { "epoch": 0.013842018814394504, "grad_norm": 0.74609375, "grad_norm_var": 0.00453637440999349, "learning_rate": 2e-05, "loss": 1.2962, "loss/crossentropy": 2.531057119369507, "loss/dist_ce": 0.0, "loss/fcd": 1.109375, "loss/idx": 12.0, "loss/logits": 0.1868131458759308, "step": 927 }, { "epoch": 0.013856950873525458, "grad_norm": 0.60546875, "grad_norm_var": 0.004612477620442709, "learning_rate": 2e-05, "loss": 1.3788, "loss/crossentropy": 2.445338726043701, "loss/dist_ce": 0.0, "loss/fcd": 1.1796875, "loss/idx": 12.0, "loss/logits": 0.19916069507598877, "step": 928 }, { "epoch": 0.013871882932656412, "grad_norm": 0.55078125, "grad_norm_var": 0.004504648844401041, "learning_rate": 2e-05, "loss": 1.2241, "loss/crossentropy": 2.5794379711151123, "loss/dist_ce": 0.0, "loss/fcd": 1.0703125, "loss/idx": 12.0, "loss/logits": 0.15381918847560883, "step": 929 }, { "epoch": 0.013886814991787368, "grad_norm": 0.546875, "grad_norm_var": 0.004229990641276041, "learning_rate": 2e-05, "loss": 1.3103, "loss/crossentropy": 2.6032307147979736, "loss/dist_ce": 0.0, "loss/fcd": 1.1328125, "loss/idx": 12.0, "loss/logits": 0.17746026813983917, "step": 930 }, { "epoch": 0.013901747050918322, "grad_norm": 0.53515625, "grad_norm_var": 0.004122416178385417, "learning_rate": 2e-05, "loss": 1.2506, "loss/crossentropy": 2.637634515762329, "loss/dist_ce": 0.0, "loss/fcd": 1.09375, "loss/idx": 12.0, "loss/logits": 0.15681351721286774, "step": 931 }, { "epoch": 0.013916679110049276, "grad_norm": 0.5078125, "grad_norm_var": 0.004254595438639323, "learning_rate": 2e-05, "loss": 1.3071, "loss/crossentropy": 2.4573991298675537, "loss/dist_ce": 0.0, "loss/fcd": 1.1328125, "loss/idx": 12.0, "loss/logits": 0.17433026432991028, "step": 932 }, { "epoch": 0.01393161116918023, "grad_norm": 0.515625, "grad_norm_var": 0.004180399576822916, "learning_rate": 2e-05, "loss": 1.2987, "loss/crossentropy": 2.4106740951538086, "loss/dist_ce": 0.0, "loss/fcd": 1.125, "loss/idx": 12.0, "loss/logits": 0.1737232506275177, "step": 933 }, { "epoch": 0.013946543228311185, "grad_norm": 0.62109375, "grad_norm_var": 0.0041315714518229164, "learning_rate": 2e-05, "loss": 1.3689, "loss/crossentropy": 2.61064076423645, "loss/dist_ce": 0.0, "loss/fcd": 1.1875, "loss/idx": 12.0, "loss/logits": 0.181406170129776, "step": 934 }, { "epoch": 0.013961475287442139, "grad_norm": 0.65234375, "grad_norm_var": 0.0046009699503580725, "learning_rate": 2e-05, "loss": 1.2803, "loss/crossentropy": 2.4577136039733887, "loss/dist_ce": 0.0, "loss/fcd": 1.1171875, "loss/idx": 12.0, "loss/logits": 0.16315729916095734, "step": 935 }, { "epoch": 0.013976407346573093, "grad_norm": 0.51953125, "grad_norm_var": 0.004567909240722656, "learning_rate": 2e-05, "loss": 1.2566, "loss/crossentropy": 2.2968335151672363, "loss/dist_ce": 0.0, "loss/fcd": 1.0859375, "loss/idx": 12.0, "loss/logits": 0.1706232726573944, "step": 936 }, { "epoch": 0.013991339405704047, "grad_norm": 0.50390625, "grad_norm_var": 0.004443852106730143, "learning_rate": 2e-05, "loss": 1.162, "loss/crossentropy": 2.473719596862793, "loss/dist_ce": 0.0, "loss/fcd": 1.015625, "loss/idx": 12.0, "loss/logits": 0.146341010928154, "step": 937 }, { "epoch": 0.014006271464835001, "grad_norm": 0.51171875, "grad_norm_var": 0.004598347345987955, "learning_rate": 2e-05, "loss": 1.1695, "loss/crossentropy": 2.7954347133636475, "loss/dist_ce": 0.0, "loss/fcd": 1.0234375, "loss/idx": 12.0, "loss/logits": 0.1460927426815033, "step": 938 }, { "epoch": 0.014021203523965955, "grad_norm": 0.478515625, "grad_norm_var": 0.004979960123697917, "learning_rate": 2e-05, "loss": 1.1899, "loss/crossentropy": 2.491903781890869, "loss/dist_ce": 0.0, "loss/fcd": 1.046875, "loss/idx": 12.0, "loss/logits": 0.14299684762954712, "step": 939 }, { "epoch": 0.01403613558309691, "grad_norm": 0.60546875, "grad_norm_var": 0.004992167154947917, "learning_rate": 2e-05, "loss": 1.2752, "loss/crossentropy": 2.651737928390503, "loss/dist_ce": 0.0, "loss/fcd": 1.1171875, "loss/idx": 12.0, "loss/logits": 0.15803387761116028, "step": 940 }, { "epoch": 0.014051067642227863, "grad_norm": 0.9140625, "grad_norm_var": 0.012295007705688477, "learning_rate": 2e-05, "loss": 1.4622, "loss/crossentropy": 2.304933547973633, "loss/dist_ce": 0.0, "loss/fcd": 1.2578125, "loss/idx": 12.0, "loss/logits": 0.20437663793563843, "step": 941 }, { "epoch": 0.014065999701358817, "grad_norm": 0.5390625, "grad_norm_var": 0.012431192398071288, "learning_rate": 2e-05, "loss": 1.2685, "loss/crossentropy": 2.5132477283477783, "loss/dist_ce": 0.0, "loss/fcd": 1.1015625, "loss/idx": 12.0, "loss/logits": 0.16695216298103333, "step": 942 }, { "epoch": 0.014080931760489772, "grad_norm": 0.5, "grad_norm_var": 0.010917139053344727, "learning_rate": 2e-05, "loss": 1.1307, "loss/crossentropy": 2.515592336654663, "loss/dist_ce": 0.0, "loss/fcd": 0.9921875, "loss/idx": 12.0, "loss/logits": 0.13850846886634827, "step": 943 }, { "epoch": 0.014095863819620726, "grad_norm": 0.5, "grad_norm_var": 0.011102533340454102, "learning_rate": 2e-05, "loss": 1.2549, "loss/crossentropy": 2.392749071121216, "loss/dist_ce": 0.0, "loss/fcd": 1.09375, "loss/idx": 12.0, "loss/logits": 0.16117683053016663, "step": 944 }, { "epoch": 0.01411079587875168, "grad_norm": 0.6875, "grad_norm_var": 0.012054936091105143, "learning_rate": 2e-05, "loss": 1.4447, "loss/crossentropy": 2.619136095046997, "loss/dist_ce": 0.0, "loss/fcd": 1.234375, "loss/idx": 12.0, "loss/logits": 0.21033376455307007, "step": 945 }, { "epoch": 0.014125727937882634, "grad_norm": 0.5078125, "grad_norm_var": 0.012276824315388997, "learning_rate": 2e-05, "loss": 1.2539, "loss/crossentropy": 2.70526385307312, "loss/dist_ce": 0.0, "loss/fcd": 1.0859375, "loss/idx": 12.0, "loss/logits": 0.16797472536563873, "step": 946 }, { "epoch": 0.014140659997013588, "grad_norm": 0.5625, "grad_norm_var": 0.012201166152954102, "learning_rate": 2e-05, "loss": 1.3053, "loss/crossentropy": 2.671525239944458, "loss/dist_ce": 0.0, "loss/fcd": 1.125, "loss/idx": 12.0, "loss/logits": 0.1802533119916916, "step": 947 }, { "epoch": 0.014155592056144542, "grad_norm": 0.54296875, "grad_norm_var": 0.011984872817993163, "learning_rate": 2e-05, "loss": 1.2934, "loss/crossentropy": 2.5209715366363525, "loss/dist_ce": 0.0, "loss/fcd": 1.125, "loss/idx": 12.0, "loss/logits": 0.16836240887641907, "step": 948 }, { "epoch": 0.014170524115275496, "grad_norm": 0.546875, "grad_norm_var": 0.011808379491170248, "learning_rate": 2e-05, "loss": 1.2201, "loss/crossentropy": 2.352734327316284, "loss/dist_ce": 0.0, "loss/fcd": 1.0703125, "loss/idx": 12.0, "loss/logits": 0.14974716305732727, "step": 949 }, { "epoch": 0.01418545617440645, "grad_norm": 0.64453125, "grad_norm_var": 0.011988051732381185, "learning_rate": 2e-05, "loss": 1.3441, "loss/crossentropy": 2.5131075382232666, "loss/dist_ce": 0.0, "loss/fcd": 1.140625, "loss/idx": 12.0, "loss/logits": 0.20351985096931458, "step": 950 }, { "epoch": 0.014200388233537404, "grad_norm": 0.54296875, "grad_norm_var": 0.011623112360636394, "learning_rate": 2e-05, "loss": 1.1609, "loss/crossentropy": 2.6400680541992188, "loss/dist_ce": 0.0, "loss/fcd": 1.015625, "loss/idx": 12.0, "loss/logits": 0.14532148838043213, "step": 951 }, { "epoch": 0.014215320292668358, "grad_norm": 0.5625, "grad_norm_var": 0.011453866958618164, "learning_rate": 2e-05, "loss": 1.2853, "loss/crossentropy": 2.4889473915100098, "loss/dist_ce": 0.0, "loss/fcd": 1.1171875, "loss/idx": 12.0, "loss/logits": 0.1681395173072815, "step": 952 }, { "epoch": 0.014230252351799312, "grad_norm": 0.66015625, "grad_norm_var": 0.01156322161356608, "learning_rate": 2e-05, "loss": 1.3387, "loss/crossentropy": 2.304206609725952, "loss/dist_ce": 0.0, "loss/fcd": 1.15625, "loss/idx": 12.0, "loss/logits": 0.18247899413108826, "step": 953 }, { "epoch": 0.014245184410930267, "grad_norm": 0.62890625, "grad_norm_var": 0.011328617731730143, "learning_rate": 2e-05, "loss": 1.3366, "loss/crossentropy": 2.4242758750915527, "loss/dist_ce": 0.0, "loss/fcd": 1.15625, "loss/idx": 12.0, "loss/logits": 0.18031245470046997, "step": 954 }, { "epoch": 0.01426011647006122, "grad_norm": 0.59375, "grad_norm_var": 0.010461171468098959, "learning_rate": 2e-05, "loss": 1.2421, "loss/crossentropy": 2.681748151779175, "loss/dist_ce": 0.0, "loss/fcd": 1.0703125, "loss/idx": 12.0, "loss/logits": 0.17177164554595947, "step": 955 }, { "epoch": 0.014275048529192176, "grad_norm": 0.484375, "grad_norm_var": 0.011227862040201823, "learning_rate": 2e-05, "loss": 1.1478, "loss/crossentropy": 2.6037683486938477, "loss/dist_ce": 0.0, "loss/fcd": 1.0, "loss/idx": 12.0, "loss/logits": 0.14775672554969788, "step": 956 }, { "epoch": 0.01428998058832313, "grad_norm": 0.88671875, "grad_norm_var": 0.010088094075520833, "learning_rate": 2e-05, "loss": 1.5679, "loss/crossentropy": 2.7519948482513428, "loss/dist_ce": 0.0, "loss/fcd": 1.296875, "loss/idx": 12.0, "loss/logits": 0.2710202634334564, "step": 957 }, { "epoch": 0.014304912647454085, "grad_norm": 0.5078125, "grad_norm_var": 0.0103485107421875, "learning_rate": 2e-05, "loss": 1.2848, "loss/crossentropy": 2.3958358764648438, "loss/dist_ce": 0.0, "loss/fcd": 1.109375, "loss/idx": 12.0, "loss/logits": 0.1754007339477539, "step": 958 }, { "epoch": 0.014319844706585039, "grad_norm": 0.60546875, "grad_norm_var": 0.009848976135253906, "learning_rate": 2e-05, "loss": 1.496, "loss/crossentropy": 2.428069591522217, "loss/dist_ce": 0.0, "loss/fcd": 1.2734375, "loss/idx": 12.0, "loss/logits": 0.2225390374660492, "step": 959 }, { "epoch": 0.014334776765715993, "grad_norm": 0.5234375, "grad_norm_var": 0.009597206115722656, "learning_rate": 2e-05, "loss": 1.2105, "loss/crossentropy": 2.5576891899108887, "loss/dist_ce": 0.0, "loss/fcd": 1.0546875, "loss/idx": 12.0, "loss/logits": 0.15579423308372498, "step": 960 }, { "epoch": 0.014349708824846947, "grad_norm": 0.51953125, "grad_norm_var": 0.009244537353515625, "learning_rate": 2e-05, "loss": 1.2722, "loss/crossentropy": 2.3733532428741455, "loss/dist_ce": 0.0, "loss/fcd": 1.09375, "loss/idx": 12.0, "loss/logits": 0.17849120497703552, "step": 961 }, { "epoch": 0.014364640883977901, "grad_norm": 0.5078125, "grad_norm_var": 0.009244537353515625, "learning_rate": 2e-05, "loss": 1.1481, "loss/crossentropy": 2.5622546672821045, "loss/dist_ce": 0.0, "loss/fcd": 1.0078125, "loss/idx": 12.0, "loss/logits": 0.14027273654937744, "step": 962 }, { "epoch": 0.014379572943108855, "grad_norm": 0.6171875, "grad_norm_var": 0.009285481770833333, "learning_rate": 2e-05, "loss": 1.2864, "loss/crossentropy": 2.436474561691284, "loss/dist_ce": 0.0, "loss/fcd": 1.1171875, "loss/idx": 12.0, "loss/logits": 0.16921366751194, "step": 963 }, { "epoch": 0.01439450500223981, "grad_norm": 0.71875, "grad_norm_var": 0.010209592183430989, "learning_rate": 2e-05, "loss": 1.4983, "loss/crossentropy": 2.7202954292297363, "loss/dist_ce": 0.0, "loss/fcd": 1.2421875, "loss/idx": 12.0, "loss/logits": 0.25612711906433105, "step": 964 }, { "epoch": 0.014409437061370763, "grad_norm": 0.515625, "grad_norm_var": 0.010479164123535157, "learning_rate": 2e-05, "loss": 1.2016, "loss/crossentropy": 2.370645523071289, "loss/dist_ce": 0.0, "loss/fcd": 1.0546875, "loss/idx": 12.0, "loss/logits": 0.14687049388885498, "step": 965 }, { "epoch": 0.014424369120501717, "grad_norm": 0.52734375, "grad_norm_var": 0.010563087463378907, "learning_rate": 2e-05, "loss": 1.3283, "loss/crossentropy": 2.7037761211395264, "loss/dist_ce": 0.0, "loss/fcd": 1.140625, "loss/idx": 12.0, "loss/logits": 0.18769502639770508, "step": 966 }, { "epoch": 0.014439301179632671, "grad_norm": 0.51171875, "grad_norm_var": 0.010810279846191406, "learning_rate": 2e-05, "loss": 1.2502, "loss/crossentropy": 2.5512642860412598, "loss/dist_ce": 0.0, "loss/fcd": 1.0859375, "loss/idx": 12.0, "loss/logits": 0.16423162817955017, "step": 967 }, { "epoch": 0.014454233238763626, "grad_norm": 0.482421875, "grad_norm_var": 0.011458698908487957, "learning_rate": 2e-05, "loss": 1.173, "loss/crossentropy": 2.560576915740967, "loss/dist_ce": 0.0, "loss/fcd": 1.0234375, "loss/idx": 12.0, "loss/logits": 0.1495593786239624, "step": 968 }, { "epoch": 0.01446916529789458, "grad_norm": 0.70703125, "grad_norm_var": 0.01209270159403483, "learning_rate": 2e-05, "loss": 1.3237, "loss/crossentropy": 2.5032005310058594, "loss/dist_ce": 0.0, "loss/fcd": 1.1328125, "loss/idx": 12.0, "loss/logits": 0.1908506453037262, "step": 969 }, { "epoch": 0.014484097357025534, "grad_norm": 0.55078125, "grad_norm_var": 0.01200242042541504, "learning_rate": 2e-05, "loss": 1.1987, "loss/crossentropy": 2.6098976135253906, "loss/dist_ce": 0.0, "loss/fcd": 1.046875, "loss/idx": 12.0, "loss/logits": 0.15182408690452576, "step": 970 }, { "epoch": 0.014499029416156488, "grad_norm": 0.54296875, "grad_norm_var": 0.01206192970275879, "learning_rate": 2e-05, "loss": 1.2095, "loss/crossentropy": 2.6711175441741943, "loss/dist_ce": 0.0, "loss/fcd": 1.0546875, "loss/idx": 12.0, "loss/logits": 0.15485651791095734, "step": 971 }, { "epoch": 0.014513961475287442, "grad_norm": 0.55078125, "grad_norm_var": 0.011530160903930664, "learning_rate": 2e-05, "loss": 1.2459, "loss/crossentropy": 2.7971813678741455, "loss/dist_ce": 0.0, "loss/fcd": 1.078125, "loss/idx": 12.0, "loss/logits": 0.16782422363758087, "step": 972 }, { "epoch": 0.014528893534418396, "grad_norm": 0.52734375, "grad_norm_var": 0.004891316095987956, "learning_rate": 2e-05, "loss": 1.3771, "loss/crossentropy": 2.5039517879486084, "loss/dist_ce": 0.0, "loss/fcd": 1.1953125, "loss/idx": 12.0, "loss/logits": 0.18174386024475098, "step": 973 }, { "epoch": 0.01454382559354935, "grad_norm": 0.52734375, "grad_norm_var": 0.0047864119211832685, "learning_rate": 2e-05, "loss": 1.2518, "loss/crossentropy": 2.4862096309661865, "loss/dist_ce": 0.0, "loss/fcd": 1.078125, "loss/idx": 12.0, "loss/logits": 0.1736270785331726, "step": 974 }, { "epoch": 0.014558757652680304, "grad_norm": 0.76953125, "grad_norm_var": 0.007496754328409831, "learning_rate": 2e-05, "loss": 1.32, "loss/crossentropy": 2.335102081298828, "loss/dist_ce": 0.0, "loss/fcd": 1.1484375, "loss/idx": 12.0, "loss/logits": 0.17159795761108398, "step": 975 }, { "epoch": 0.014573689711811258, "grad_norm": 0.546875, "grad_norm_var": 0.007389561335245768, "learning_rate": 2e-05, "loss": 1.2981, "loss/crossentropy": 2.5871307849884033, "loss/dist_ce": 0.0, "loss/fcd": 1.125, "loss/idx": 12.0, "loss/logits": 0.1731010526418686, "step": 976 }, { "epoch": 0.014588621770942212, "grad_norm": 0.484375, "grad_norm_var": 0.0077042738596598305, "learning_rate": 2e-05, "loss": 1.2361, "loss/crossentropy": 2.640996217727661, "loss/dist_ce": 0.0, "loss/fcd": 1.0625, "loss/idx": 12.0, "loss/logits": 0.17356736958026886, "step": 977 }, { "epoch": 0.014603553830073166, "grad_norm": 0.54296875, "grad_norm_var": 0.007499424616495768, "learning_rate": 2e-05, "loss": 1.2752, "loss/crossentropy": 2.4583990573883057, "loss/dist_ce": 0.0, "loss/fcd": 1.109375, "loss/idx": 12.0, "loss/logits": 0.16585969924926758, "step": 978 }, { "epoch": 0.01461848588920412, "grad_norm": 0.56640625, "grad_norm_var": 0.007342386245727539, "learning_rate": 2e-05, "loss": 1.2629, "loss/crossentropy": 2.440524101257324, "loss/dist_ce": 0.0, "loss/fcd": 1.1015625, "loss/idx": 12.0, "loss/logits": 0.16134318709373474, "step": 979 }, { "epoch": 0.014633417948335075, "grad_norm": 0.50390625, "grad_norm_var": 0.00588072141011556, "learning_rate": 2e-05, "loss": 1.2182, "loss/crossentropy": 2.598440647125244, "loss/dist_ce": 0.0, "loss/fcd": 1.0625, "loss/idx": 12.0, "loss/logits": 0.15566852688789368, "step": 980 }, { "epoch": 0.014648350007466029, "grad_norm": 0.5234375, "grad_norm_var": 0.005844990412394206, "learning_rate": 2e-05, "loss": 1.1937, "loss/crossentropy": 2.516242742538452, "loss/dist_ce": 0.0, "loss/fcd": 1.0390625, "loss/idx": 12.0, "loss/logits": 0.15460515022277832, "step": 981 }, { "epoch": 0.014663282066596985, "grad_norm": 0.51953125, "grad_norm_var": 0.005876652399698893, "learning_rate": 2e-05, "loss": 1.1113, "loss/crossentropy": 2.374080181121826, "loss/dist_ce": 0.0, "loss/fcd": 0.97265625, "loss/idx": 12.0, "loss/logits": 0.13867977261543274, "step": 982 }, { "epoch": 0.014678214125727939, "grad_norm": 0.91015625, "grad_norm_var": 0.013574330012003581, "learning_rate": 2e-05, "loss": 1.4734, "loss/crossentropy": 2.396512269973755, "loss/dist_ce": 0.0, "loss/fcd": 1.265625, "loss/idx": 12.0, "loss/logits": 0.20776385068893433, "step": 983 }, { "epoch": 0.014693146184858893, "grad_norm": 0.55859375, "grad_norm_var": 0.012961260477701823, "learning_rate": 2e-05, "loss": 1.1341, "loss/crossentropy": 2.6779584884643555, "loss/dist_ce": 0.0, "loss/fcd": 0.984375, "loss/idx": 12.0, "loss/logits": 0.14977312088012695, "step": 984 }, { "epoch": 0.014708078243989847, "grad_norm": 0.51953125, "grad_norm_var": 0.012064043680826824, "learning_rate": 2e-05, "loss": 1.2131, "loss/crossentropy": 2.5011801719665527, "loss/dist_ce": 0.0, "loss/fcd": 1.0546875, "loss/idx": 12.0, "loss/logits": 0.1583985984325409, "step": 985 }, { "epoch": 0.014723010303120801, "grad_norm": 0.5234375, "grad_norm_var": 0.012186431884765625, "learning_rate": 2e-05, "loss": 1.2479, "loss/crossentropy": 2.4844136238098145, "loss/dist_ce": 0.0, "loss/fcd": 1.0859375, "loss/idx": 12.0, "loss/logits": 0.16194245219230652, "step": 986 }, { "epoch": 0.014737942362251755, "grad_norm": 0.55859375, "grad_norm_var": 0.012145741780598959, "learning_rate": 2e-05, "loss": 1.2528, "loss/crossentropy": 2.221806526184082, "loss/dist_ce": 0.0, "loss/fcd": 1.0859375, "loss/idx": 12.0, "loss/logits": 0.16683350503444672, "step": 987 }, { "epoch": 0.014752874421382709, "grad_norm": 0.58984375, "grad_norm_var": 0.0121368408203125, "learning_rate": 2e-05, "loss": 1.3223, "loss/crossentropy": 2.5378036499023438, "loss/dist_ce": 0.0, "loss/fcd": 1.125, "loss/idx": 12.0, "loss/logits": 0.19727113842964172, "step": 988 }, { "epoch": 0.014767806480513663, "grad_norm": 0.546875, "grad_norm_var": 0.012041155497233074, "learning_rate": 2e-05, "loss": 1.1958, "loss/crossentropy": 2.597865581512451, "loss/dist_ce": 0.0, "loss/fcd": 1.0390625, "loss/idx": 12.0, "loss/logits": 0.1567688286304474, "step": 989 }, { "epoch": 0.014782738539644617, "grad_norm": 0.498046875, "grad_norm_var": 0.012278858820597332, "learning_rate": 2e-05, "loss": 1.1742, "loss/crossentropy": 2.7298946380615234, "loss/dist_ce": 0.0, "loss/fcd": 1.03125, "loss/idx": 12.0, "loss/logits": 0.14295458793640137, "step": 990 }, { "epoch": 0.014797670598775571, "grad_norm": 0.546875, "grad_norm_var": 0.009531895319620768, "learning_rate": 2e-05, "loss": 1.2208, "loss/crossentropy": 2.394542932510376, "loss/dist_ce": 0.0, "loss/fcd": 1.0625, "loss/idx": 12.0, "loss/logits": 0.1583481729030609, "step": 991 }, { "epoch": 0.014812602657906525, "grad_norm": 0.50390625, "grad_norm_var": 0.009715127944946288, "learning_rate": 2e-05, "loss": 1.3119, "loss/crossentropy": 2.8174941539764404, "loss/dist_ce": 0.0, "loss/fcd": 1.109375, "loss/idx": 12.0, "loss/logits": 0.20255360007286072, "step": 992 }, { "epoch": 0.01482753471703748, "grad_norm": 0.52734375, "grad_norm_var": 0.009419997533162435, "learning_rate": 2e-05, "loss": 1.1653, "loss/crossentropy": 2.6789565086364746, "loss/dist_ce": 0.0, "loss/fcd": 1.0234375, "loss/idx": 12.0, "loss/logits": 0.14188051223754883, "step": 993 }, { "epoch": 0.014842466776168434, "grad_norm": 0.5390625, "grad_norm_var": 0.009429152806599934, "learning_rate": 2e-05, "loss": 1.2278, "loss/crossentropy": 2.6719415187835693, "loss/dist_ce": 0.0, "loss/fcd": 1.0625, "loss/idx": 12.0, "loss/logits": 0.16530798375606537, "step": 994 }, { "epoch": 0.014857398835299388, "grad_norm": 0.5, "grad_norm_var": 0.00963451067606608, "learning_rate": 2e-05, "loss": 1.1983, "loss/crossentropy": 2.6873083114624023, "loss/dist_ce": 0.0, "loss/fcd": 1.0390625, "loss/idx": 12.0, "loss/logits": 0.15920904278755188, "step": 995 }, { "epoch": 0.014872330894430342, "grad_norm": 0.55078125, "grad_norm_var": 0.009456745783487956, "learning_rate": 2e-05, "loss": 1.2986, "loss/crossentropy": 2.5818088054656982, "loss/dist_ce": 0.0, "loss/fcd": 1.1171875, "loss/idx": 12.0, "loss/logits": 0.18138398230075836, "step": 996 }, { "epoch": 0.014887262953561296, "grad_norm": 0.53515625, "grad_norm_var": 0.009412495295206706, "learning_rate": 2e-05, "loss": 1.3294, "loss/crossentropy": 2.5327281951904297, "loss/dist_ce": 0.0, "loss/fcd": 1.15625, "loss/idx": 12.0, "loss/logits": 0.1731855869293213, "step": 997 }, { "epoch": 0.01490219501269225, "grad_norm": 0.59375, "grad_norm_var": 0.009376255671183269, "learning_rate": 2e-05, "loss": 1.3438, "loss/crossentropy": 2.5216119289398193, "loss/dist_ce": 0.0, "loss/fcd": 1.15625, "loss/idx": 12.0, "loss/logits": 0.18757742643356323, "step": 998 }, { "epoch": 0.014917127071823204, "grad_norm": 0.4921875, "grad_norm_var": 0.0009270826975504557, "learning_rate": 2e-05, "loss": 1.1393, "loss/crossentropy": 2.517537832260132, "loss/dist_ce": 0.0, "loss/fcd": 1.0, "loss/idx": 12.0, "loss/logits": 0.13927598297595978, "step": 999 }, { "epoch": 0.014932059130954158, "grad_norm": 0.546875, "grad_norm_var": 0.0009011427561442057, "learning_rate": 2e-05, "loss": 1.3215, "loss/crossentropy": 2.4011945724487305, "loss/dist_ce": 0.0, "loss/fcd": 1.15625, "loss/idx": 12.0, "loss/logits": 0.16521359980106354, "step": 1000 }, { "epoch": 0.014946991190085112, "grad_norm": 0.52734375, "grad_norm_var": 0.0008880456288655598, "learning_rate": 2e-05, "loss": 1.3127, "loss/crossentropy": 2.5374038219451904, "loss/dist_ce": 0.0, "loss/fcd": 1.140625, "loss/idx": 12.0, "loss/logits": 0.17211785912513733, "step": 1001 }, { "epoch": 0.014961923249216066, "grad_norm": 0.51171875, "grad_norm_var": 0.0009166558583577473, "learning_rate": 2e-05, "loss": 1.3361, "loss/crossentropy": 2.324538230895996, "loss/dist_ce": 0.0, "loss/fcd": 1.1484375, "loss/idx": 13.0, "loss/logits": 0.1876780390739441, "step": 1002 }, { "epoch": 0.01497685530834702, "grad_norm": 1.0546875, "grad_norm_var": 0.017824538548787437, "learning_rate": 2e-05, "loss": 1.3262, "loss/crossentropy": 2.5217180252075195, "loss/dist_ce": 0.0, "loss/fcd": 1.1328125, "loss/idx": 13.0, "loss/logits": 0.19337180256843567, "step": 1003 }, { "epoch": 0.014991787367477975, "grad_norm": 0.91796875, "grad_norm_var": 0.02557371457417806, "learning_rate": 2e-05, "loss": 1.2566, "loss/crossentropy": 2.5869264602661133, "loss/dist_ce": 0.0, "loss/fcd": 1.0859375, "loss/idx": 13.0, "loss/logits": 0.17063188552856445, "step": 1004 }, { "epoch": 0.015006719426608929, "grad_norm": 1.0078125, "grad_norm_var": 0.03638443946838379, "learning_rate": 2e-05, "loss": 1.5329, "loss/crossentropy": 2.5455098152160645, "loss/dist_ce": 0.0, "loss/fcd": 1.28125, "loss/idx": 13.0, "loss/logits": 0.25163742899894714, "step": 1005 }, { "epoch": 0.015021651485739883, "grad_norm": 0.953125, "grad_norm_var": 0.042180315653483076, "learning_rate": 2e-05, "loss": 1.3301, "loss/crossentropy": 2.805210590362549, "loss/dist_ce": 0.0, "loss/fcd": 1.140625, "loss/idx": 13.0, "loss/logits": 0.18944759666919708, "step": 1006 }, { "epoch": 0.015036583544870837, "grad_norm": 0.92578125, "grad_norm_var": 0.04623209635416667, "learning_rate": 2e-05, "loss": 1.397, "loss/crossentropy": 2.673166275024414, "loss/dist_ce": 0.0, "loss/fcd": 1.1875, "loss/idx": 13.0, "loss/logits": 0.209548681974411, "step": 1007 }, { "epoch": 0.015051515604001793, "grad_norm": 0.80859375, "grad_norm_var": 0.04536921183268229, "learning_rate": 2e-05, "loss": 1.353, "loss/crossentropy": 2.5452094078063965, "loss/dist_ce": 0.0, "loss/fcd": 1.15625, "loss/idx": 13.0, "loss/logits": 0.1967710256576538, "step": 1008 }, { "epoch": 0.015066447663132747, "grad_norm": 0.796875, "grad_norm_var": 0.044171587626139326, "learning_rate": 2e-05, "loss": 1.2264, "loss/crossentropy": 2.614394426345825, "loss/dist_ce": 0.0, "loss/fcd": 1.0703125, "loss/idx": 13.0, "loss/logits": 0.15607105195522308, "step": 1009 }, { "epoch": 0.015081379722263701, "grad_norm": 0.80859375, "grad_norm_var": 0.04278971354166667, "learning_rate": 2e-05, "loss": 1.4443, "loss/crossentropy": 2.5008745193481445, "loss/dist_ce": 0.0, "loss/fcd": 1.234375, "loss/idx": 13.0, "loss/logits": 0.20992863178253174, "step": 1010 }, { "epoch": 0.015096311781394655, "grad_norm": 1.0078125, "grad_norm_var": 0.04396336873372396, "learning_rate": 2e-05, "loss": 1.6703, "loss/crossentropy": 2.8562960624694824, "loss/dist_ce": 0.0, "loss/fcd": 1.3828125, "loss/idx": 13.0, "loss/logits": 0.28753405809402466, "step": 1011 }, { "epoch": 0.015111243840525609, "grad_norm": 0.734375, "grad_norm_var": 0.04113356272379557, "learning_rate": 2e-05, "loss": 1.3257, "loss/crossentropy": 2.57487416267395, "loss/dist_ce": 0.0, "loss/fcd": 1.1328125, "loss/idx": 13.0, "loss/logits": 0.19289004802703857, "step": 1012 }, { "epoch": 0.015126175899656563, "grad_norm": 0.7109375, "grad_norm_var": 0.03770319620768229, "learning_rate": 2e-05, "loss": 1.2732, "loss/crossentropy": 2.6012637615203857, "loss/dist_ce": 0.0, "loss/fcd": 1.1015625, "loss/idx": 13.0, "loss/logits": 0.17167770862579346, "step": 1013 }, { "epoch": 0.015141107958787517, "grad_norm": 0.734375, "grad_norm_var": 0.03554255167643229, "learning_rate": 2e-05, "loss": 1.2555, "loss/crossentropy": 2.406501531600952, "loss/dist_ce": 0.0, "loss/fcd": 1.1015625, "loss/idx": 13.0, "loss/logits": 0.15391208231449127, "step": 1014 }, { "epoch": 0.015156040017918471, "grad_norm": 0.72265625, "grad_norm_var": 0.029904619852701823, "learning_rate": 2e-05, "loss": 1.2593, "loss/crossentropy": 2.6688358783721924, "loss/dist_ce": 0.0, "loss/fcd": 1.0859375, "loss/idx": 13.0, "loss/logits": 0.17337998747825623, "step": 1015 }, { "epoch": 0.015170972077049425, "grad_norm": 0.74609375, "grad_norm_var": 0.025712076822916666, "learning_rate": 2e-05, "loss": 1.36, "loss/crossentropy": 2.654153823852539, "loss/dist_ce": 0.0, "loss/fcd": 1.15625, "loss/idx": 13.0, "loss/logits": 0.20374688506126404, "step": 1016 }, { "epoch": 0.01518590413618038, "grad_norm": 0.640625, "grad_norm_var": 0.02223656972249349, "learning_rate": 2e-05, "loss": 1.34, "loss/crossentropy": 2.397961139678955, "loss/dist_ce": 0.0, "loss/fcd": 1.1484375, "loss/idx": 13.0, "loss/logits": 0.19154663383960724, "step": 1017 }, { "epoch": 0.015200836195311334, "grad_norm": 0.6640625, "grad_norm_var": 0.017473347981770835, "learning_rate": 2e-05, "loss": 1.2604, "loss/crossentropy": 2.5992560386657715, "loss/dist_ce": 0.0, "loss/fcd": 1.0859375, "loss/idx": 13.0, "loss/logits": 0.17447641491889954, "step": 1018 }, { "epoch": 0.015215768254442288, "grad_norm": 0.66015625, "grad_norm_var": 0.015232276916503907, "learning_rate": 2e-05, "loss": 1.3838, "loss/crossentropy": 2.2740230560302734, "loss/dist_ce": 0.0, "loss/fcd": 1.1875, "loss/idx": 13.0, "loss/logits": 0.19633014500141144, "step": 1019 }, { "epoch": 0.015230700313573242, "grad_norm": 0.59765625, "grad_norm_var": 0.01671288808186849, "learning_rate": 2e-05, "loss": 1.2765, "loss/crossentropy": 2.6276321411132812, "loss/dist_ce": 0.0, "loss/fcd": 1.09375, "loss/idx": 13.0, "loss/logits": 0.18275076150894165, "step": 1020 }, { "epoch": 0.015245632372704196, "grad_norm": 0.7890625, "grad_norm_var": 0.013131141662597656, "learning_rate": 2e-05, "loss": 1.5263, "loss/crossentropy": 2.5657906532287598, "loss/dist_ce": 0.0, "loss/fcd": 1.25, "loss/idx": 13.0, "loss/logits": 0.27633097767829895, "step": 1021 }, { "epoch": 0.01526056443183515, "grad_norm": 0.65234375, "grad_norm_var": 0.011393229166666666, "learning_rate": 2e-05, "loss": 1.2135, "loss/crossentropy": 2.5474112033843994, "loss/dist_ce": 0.0, "loss/fcd": 1.0625, "loss/idx": 13.0, "loss/logits": 0.1510053277015686, "step": 1022 }, { "epoch": 0.015275496490966104, "grad_norm": 0.8984375, "grad_norm_var": 0.010799090067545572, "learning_rate": 2e-05, "loss": 1.6306, "loss/crossentropy": 2.913024425506592, "loss/dist_ce": 0.0, "loss/fcd": 1.3515625, "loss/idx": 13.0, "loss/logits": 0.278994619846344, "step": 1023 }, { "epoch": 0.015290428550097058, "grad_norm": 0.59375, "grad_norm_var": 0.011956532796223959, "learning_rate": 2e-05, "loss": 1.2351, "loss/crossentropy": 2.50426983833313, "loss/dist_ce": 0.0, "loss/fcd": 1.0625, "loss/idx": 13.0, "loss/logits": 0.172644704580307, "step": 1024 }, { "epoch": 0.015305360609228012, "grad_norm": 0.77734375, "grad_norm_var": 0.011818885803222656, "learning_rate": 2e-05, "loss": 1.4706, "loss/crossentropy": 2.4272844791412354, "loss/dist_ce": 0.0, "loss/fcd": 1.265625, "loss/idx": 13.0, "loss/logits": 0.20502135157585144, "step": 1025 }, { "epoch": 0.015320292668358966, "grad_norm": 0.625, "grad_norm_var": 0.012090810139973958, "learning_rate": 2e-05, "loss": 1.3414, "loss/crossentropy": 2.1835615634918213, "loss/dist_ce": 0.0, "loss/fcd": 1.1484375, "loss/idx": 13.0, "loss/logits": 0.19297602772712708, "step": 1026 }, { "epoch": 0.01533522472748992, "grad_norm": 1.0078125, "grad_norm_var": 0.012090810139973958, "learning_rate": 2e-05, "loss": 1.4718, "loss/crossentropy": 2.999908924102783, "loss/dist_ce": 0.0, "loss/fcd": 1.2421875, "loss/idx": 13.0, "loss/logits": 0.2296452820301056, "step": 1027 }, { "epoch": 0.015350156786620875, "grad_norm": 0.6484375, "grad_norm_var": 0.012412516276041667, "learning_rate": 2e-05, "loss": 1.2357, "loss/crossentropy": 2.710797071456909, "loss/dist_ce": 0.0, "loss/fcd": 1.078125, "loss/idx": 13.0, "loss/logits": 0.15754911303520203, "step": 1028 }, { "epoch": 0.015365088845751829, "grad_norm": 0.609375, "grad_norm_var": 0.013136545817057291, "learning_rate": 2e-05, "loss": 1.264, "loss/crossentropy": 2.562701940536499, "loss/dist_ce": 0.0, "loss/fcd": 1.09375, "loss/idx": 13.0, "loss/logits": 0.17028763890266418, "step": 1029 }, { "epoch": 0.015380020904882783, "grad_norm": 0.5703125, "grad_norm_var": 0.014295450846354167, "learning_rate": 2e-05, "loss": 1.2993, "loss/crossentropy": 2.5587525367736816, "loss/dist_ce": 0.0, "loss/fcd": 1.109375, "loss/idx": 13.0, "loss/logits": 0.1899479776620865, "step": 1030 }, { "epoch": 0.015394952964013737, "grad_norm": 0.6640625, "grad_norm_var": 0.014334551493326823, "learning_rate": 2e-05, "loss": 1.2617, "loss/crossentropy": 2.8359696865081787, "loss/dist_ce": 0.0, "loss/fcd": 1.0859375, "loss/idx": 13.0, "loss/logits": 0.17577612400054932, "step": 1031 }, { "epoch": 0.015409885023144691, "grad_norm": 0.61328125, "grad_norm_var": 0.014559364318847657, "learning_rate": 2e-05, "loss": 1.2215, "loss/crossentropy": 2.6557297706604004, "loss/dist_ce": 0.0, "loss/fcd": 1.0546875, "loss/idx": 13.0, "loss/logits": 0.16680006682872772, "step": 1032 }, { "epoch": 0.015424817082275645, "grad_norm": 0.54296875, "grad_norm_var": 0.015775299072265624, "learning_rate": 2e-05, "loss": 1.1809, "loss/crossentropy": 2.598259687423706, "loss/dist_ce": 0.0, "loss/fcd": 1.0234375, "loss/idx": 13.0, "loss/logits": 0.15747103095054626, "step": 1033 }, { "epoch": 0.015439749141406599, "grad_norm": 0.59765625, "grad_norm_var": 0.01621087392171224, "learning_rate": 2e-05, "loss": 1.293, "loss/crossentropy": 2.3652303218841553, "loss/dist_ce": 0.0, "loss/fcd": 1.109375, "loss/idx": 13.0, "loss/logits": 0.18361005187034607, "step": 1034 }, { "epoch": 0.015454681200537555, "grad_norm": 0.60546875, "grad_norm_var": 0.016527748107910155, "learning_rate": 2e-05, "loss": 1.2546, "loss/crossentropy": 2.5818984508514404, "loss/dist_ce": 0.0, "loss/fcd": 1.0859375, "loss/idx": 13.0, "loss/logits": 0.16868820786476135, "step": 1035 }, { "epoch": 0.015469613259668509, "grad_norm": 0.61328125, "grad_norm_var": 0.016382789611816405, "learning_rate": 2e-05, "loss": 1.2933, "loss/crossentropy": 2.655116081237793, "loss/dist_ce": 0.0, "loss/fcd": 1.109375, "loss/idx": 13.0, "loss/logits": 0.1839730441570282, "step": 1036 }, { "epoch": 0.015484545318799463, "grad_norm": 0.546875, "grad_norm_var": 0.016382789611816405, "learning_rate": 2e-05, "loss": 1.2268, "loss/crossentropy": 2.541788339614868, "loss/dist_ce": 0.0, "loss/fcd": 1.0625, "loss/idx": 13.0, "loss/logits": 0.16428236663341522, "step": 1037 }, { "epoch": 0.015499477377930417, "grad_norm": 0.55859375, "grad_norm_var": 0.017032814025878907, "learning_rate": 2e-05, "loss": 1.264, "loss/crossentropy": 2.7263083457946777, "loss/dist_ce": 0.0, "loss/fcd": 1.0859375, "loss/idx": 13.0, "loss/logits": 0.17802877724170685, "step": 1038 }, { "epoch": 0.015514409437061371, "grad_norm": 0.546875, "grad_norm_var": 0.013324928283691407, "learning_rate": 2e-05, "loss": 1.2371, "loss/crossentropy": 2.8279428482055664, "loss/dist_ce": 0.0, "loss/fcd": 1.0546875, "loss/idx": 13.0, "loss/logits": 0.18236854672431946, "step": 1039 }, { "epoch": 0.015529341496192325, "grad_norm": 0.53125, "grad_norm_var": 0.013892555236816406, "learning_rate": 2e-05, "loss": 1.2164, "loss/crossentropy": 2.6993460655212402, "loss/dist_ce": 0.0, "loss/fcd": 1.046875, "loss/idx": 13.0, "loss/logits": 0.16949616372585297, "step": 1040 }, { "epoch": 0.01554427355532328, "grad_norm": 0.5703125, "grad_norm_var": 0.012467193603515624, "learning_rate": 2e-05, "loss": 1.2521, "loss/crossentropy": 2.532785654067993, "loss/dist_ce": 0.0, "loss/fcd": 1.0859375, "loss/idx": 13.0, "loss/logits": 0.16616228222846985, "step": 1041 }, { "epoch": 0.015559205614454234, "grad_norm": 0.6171875, "grad_norm_var": 0.012461344401041666, "learning_rate": 2e-05, "loss": 1.2867, "loss/crossentropy": 2.5695011615753174, "loss/dist_ce": 0.0, "loss/fcd": 1.109375, "loss/idx": 13.0, "loss/logits": 0.1772921234369278, "step": 1042 }, { "epoch": 0.015574137673585188, "grad_norm": 0.53515625, "grad_norm_var": 0.001683489481608073, "learning_rate": 2e-05, "loss": 1.2487, "loss/crossentropy": 2.461456537246704, "loss/dist_ce": 0.0, "loss/fcd": 1.078125, "loss/idx": 13.0, "loss/logits": 0.1706066131591797, "step": 1043 }, { "epoch": 0.015589069732716142, "grad_norm": 0.63671875, "grad_norm_var": 0.0015940348307291667, "learning_rate": 2e-05, "loss": 1.2816, "loss/crossentropy": 2.573848247528076, "loss/dist_ce": 0.0, "loss/fcd": 1.109375, "loss/idx": 13.0, "loss/logits": 0.1722354292869568, "step": 1044 }, { "epoch": 0.015604001791847096, "grad_norm": 0.5546875, "grad_norm_var": 0.001602935791015625, "learning_rate": 2e-05, "loss": 1.179, "loss/crossentropy": 2.5369319915771484, "loss/dist_ce": 0.0, "loss/fcd": 1.0234375, "loss/idx": 13.0, "loss/logits": 0.1555238664150238, "step": 1045 }, { "epoch": 0.01561893385097805, "grad_norm": 0.62890625, "grad_norm_var": 0.0017297744750976562, "learning_rate": 2e-05, "loss": 1.3623, "loss/crossentropy": 2.646678924560547, "loss/dist_ce": 0.0, "loss/fcd": 1.171875, "loss/idx": 13.0, "loss/logits": 0.19046559929847717, "step": 1046 }, { "epoch": 0.015633865910109002, "grad_norm": 0.52734375, "grad_norm_var": 0.0014605204264322917, "learning_rate": 2e-05, "loss": 1.2948, "loss/crossentropy": 2.7469944953918457, "loss/dist_ce": 0.0, "loss/fcd": 1.109375, "loss/idx": 13.0, "loss/logits": 0.18540921807289124, "step": 1047 }, { "epoch": 0.015648797969239958, "grad_norm": 0.61328125, "grad_norm_var": 0.0014605204264322917, "learning_rate": 2e-05, "loss": 1.246, "loss/crossentropy": 2.574913501739502, "loss/dist_ce": 0.0, "loss/fcd": 1.0703125, "loss/idx": 13.0, "loss/logits": 0.17573140561580658, "step": 1048 }, { "epoch": 0.015663730028370914, "grad_norm": 0.5, "grad_norm_var": 0.0017689387003580728, "learning_rate": 2e-05, "loss": 1.2004, "loss/crossentropy": 2.689748525619507, "loss/dist_ce": 0.0, "loss/fcd": 1.0390625, "loss/idx": 13.0, "loss/logits": 0.16129891574382782, "step": 1049 }, { "epoch": 0.015678662087501866, "grad_norm": 0.48828125, "grad_norm_var": 0.0021712621053059894, "learning_rate": 2e-05, "loss": 1.2207, "loss/crossentropy": 2.6240992546081543, "loss/dist_ce": 0.0, "loss/fcd": 1.0546875, "loss/idx": 13.0, "loss/logits": 0.16604174673557281, "step": 1050 }, { "epoch": 0.015693594146632822, "grad_norm": 0.58984375, "grad_norm_var": 0.0021066665649414062, "learning_rate": 2e-05, "loss": 1.3609, "loss/crossentropy": 2.4291961193084717, "loss/dist_ce": 0.0, "loss/fcd": 1.15625, "loss/idx": 13.0, "loss/logits": 0.20462797582149506, "step": 1051 }, { "epoch": 0.015708526205763775, "grad_norm": 0.53515625, "grad_norm_var": 0.0019973119099934897, "learning_rate": 2e-05, "loss": 1.1655, "loss/crossentropy": 2.6186795234680176, "loss/dist_ce": 0.0, "loss/fcd": 1.015625, "loss/idx": 13.0, "loss/logits": 0.1498570740222931, "step": 1052 }, { "epoch": 0.01572345826489473, "grad_norm": 0.515625, "grad_norm_var": 0.002118364969889323, "learning_rate": 2e-05, "loss": 1.3027, "loss/crossentropy": 2.752519369125366, "loss/dist_ce": 0.0, "loss/fcd": 1.1171875, "loss/idx": 13.0, "loss/logits": 0.18552502989768982, "step": 1053 }, { "epoch": 0.015738390324025683, "grad_norm": 0.76171875, "grad_norm_var": 0.0046772638956705725, "learning_rate": 2e-05, "loss": 1.3994, "loss/crossentropy": 2.4993913173675537, "loss/dist_ce": 0.0, "loss/fcd": 1.203125, "loss/idx": 13.0, "loss/logits": 0.1962486207485199, "step": 1054 }, { "epoch": 0.01575332238315664, "grad_norm": 0.54296875, "grad_norm_var": 0.004691314697265625, "learning_rate": 2e-05, "loss": 1.213, "loss/crossentropy": 2.584184408187866, "loss/dist_ce": 0.0, "loss/fcd": 1.046875, "loss/idx": 13.0, "loss/logits": 0.1661267727613449, "step": 1055 }, { "epoch": 0.01576825444228759, "grad_norm": 0.55078125, "grad_norm_var": 0.00460961659749349, "learning_rate": 2e-05, "loss": 1.3191, "loss/crossentropy": 2.708296775817871, "loss/dist_ce": 0.0, "loss/fcd": 1.125, "loss/idx": 13.0, "loss/logits": 0.1941191405057907, "step": 1056 }, { "epoch": 0.015783186501418547, "grad_norm": 0.5390625, "grad_norm_var": 0.004681841532389323, "learning_rate": 2e-05, "loss": 1.3431, "loss/crossentropy": 2.7722063064575195, "loss/dist_ce": 0.0, "loss/fcd": 1.140625, "loss/idx": 13.0, "loss/logits": 0.20250889658927917, "step": 1057 }, { "epoch": 0.0157981185605495, "grad_norm": 0.494140625, "grad_norm_var": 0.004871098200480143, "learning_rate": 2e-05, "loss": 1.1876, "loss/crossentropy": 2.6514158248901367, "loss/dist_ce": 0.0, "loss/fcd": 1.0234375, "loss/idx": 13.0, "loss/logits": 0.16421112418174744, "step": 1058 }, { "epoch": 0.015813050619680455, "grad_norm": 0.56640625, "grad_norm_var": 0.004814640680948893, "learning_rate": 2e-05, "loss": 1.2953, "loss/crossentropy": 2.65419864654541, "loss/dist_ce": 0.0, "loss/fcd": 1.1015625, "loss/idx": 13.0, "loss/logits": 0.19374266266822815, "step": 1059 }, { "epoch": 0.015827982678811407, "grad_norm": 0.6953125, "grad_norm_var": 0.0055871168772379555, "learning_rate": 2e-05, "loss": 1.3446, "loss/crossentropy": 2.398844003677368, "loss/dist_ce": 0.0, "loss/fcd": 1.1484375, "loss/idx": 13.0, "loss/logits": 0.19617103040218353, "step": 1060 }, { "epoch": 0.015842914737942363, "grad_norm": 0.58984375, "grad_norm_var": 0.005597416559855143, "learning_rate": 2e-05, "loss": 1.2199, "loss/crossentropy": 2.6054039001464844, "loss/dist_ce": 0.0, "loss/fcd": 1.0625, "loss/idx": 13.0, "loss/logits": 0.15744562447071075, "step": 1061 }, { "epoch": 0.015857846797073315, "grad_norm": 0.8125, "grad_norm_var": 0.009117492039998372, "learning_rate": 2e-05, "loss": 1.2721, "loss/crossentropy": 2.718179702758789, "loss/dist_ce": 0.0, "loss/fcd": 1.078125, "loss/idx": 13.0, "loss/logits": 0.1940201371908188, "step": 1062 }, { "epoch": 0.01587277885620427, "grad_norm": 0.765625, "grad_norm_var": 0.010909255345662434, "learning_rate": 2e-05, "loss": 1.422, "loss/crossentropy": 2.877493381500244, "loss/dist_ce": 0.0, "loss/fcd": 1.1875, "loss/idx": 13.0, "loss/logits": 0.2344614416360855, "step": 1063 }, { "epoch": 0.015887710915335224, "grad_norm": 0.62890625, "grad_norm_var": 0.01095732053120931, "learning_rate": 2e-05, "loss": 1.4013, "loss/crossentropy": 2.4012539386749268, "loss/dist_ce": 0.0, "loss/fcd": 1.203125, "loss/idx": 13.0, "loss/logits": 0.1981934905052185, "step": 1064 }, { "epoch": 0.01590264297446618, "grad_norm": 0.53515625, "grad_norm_var": 0.01057279904683431, "learning_rate": 2e-05, "loss": 1.2988, "loss/crossentropy": 2.4239413738250732, "loss/dist_ce": 0.0, "loss/fcd": 1.109375, "loss/idx": 13.0, "loss/logits": 0.18942078948020935, "step": 1065 }, { "epoch": 0.015917575033597132, "grad_norm": 0.62109375, "grad_norm_var": 0.009684356053670247, "learning_rate": 2e-05, "loss": 1.3502, "loss/crossentropy": 2.4800355434417725, "loss/dist_ce": 0.0, "loss/fcd": 1.1484375, "loss/idx": 13.0, "loss/logits": 0.2017608880996704, "step": 1066 }, { "epoch": 0.015932507092728088, "grad_norm": 0.5234375, "grad_norm_var": 0.010129658381144206, "learning_rate": 2e-05, "loss": 1.2782, "loss/crossentropy": 2.4825854301452637, "loss/dist_ce": 0.0, "loss/fcd": 1.109375, "loss/idx": 13.0, "loss/logits": 0.16887298226356506, "step": 1067 }, { "epoch": 0.01594743915185904, "grad_norm": 0.494140625, "grad_norm_var": 0.010615984598795572, "learning_rate": 2e-05, "loss": 1.2356, "loss/crossentropy": 2.487943649291992, "loss/dist_ce": 0.0, "loss/fcd": 1.0625, "loss/idx": 13.0, "loss/logits": 0.17307588458061218, "step": 1068 }, { "epoch": 0.015962371210989996, "grad_norm": 0.52734375, "grad_norm_var": 0.010489145914713541, "learning_rate": 2e-05, "loss": 1.2028, "loss/crossentropy": 2.282266855239868, "loss/dist_ce": 0.0, "loss/fcd": 1.046875, "loss/idx": 13.0, "loss/logits": 0.1559266895055771, "step": 1069 }, { "epoch": 0.015977303270120948, "grad_norm": 0.5078125, "grad_norm_var": 0.00914605458577474, "learning_rate": 2e-05, "loss": 1.2308, "loss/crossentropy": 2.5250964164733887, "loss/dist_ce": 0.0, "loss/fcd": 1.0546875, "loss/idx": 13.0, "loss/logits": 0.17615896463394165, "step": 1070 }, { "epoch": 0.015992235329251904, "grad_norm": 0.546875, "grad_norm_var": 0.009123992919921876, "learning_rate": 2e-05, "loss": 1.2686, "loss/crossentropy": 2.4969565868377686, "loss/dist_ce": 0.0, "loss/fcd": 1.09375, "loss/idx": 13.0, "loss/logits": 0.17482084035873413, "step": 1071 }, { "epoch": 0.016007167388382856, "grad_norm": 0.72265625, "grad_norm_var": 0.010131072998046876, "learning_rate": 2e-05, "loss": 1.4118, "loss/crossentropy": 2.3616156578063965, "loss/dist_ce": 0.0, "loss/fcd": 1.1875, "loss/idx": 13.0, "loss/logits": 0.2243403196334839, "step": 1072 }, { "epoch": 0.016022099447513812, "grad_norm": 0.6171875, "grad_norm_var": 0.009897104899088542, "learning_rate": 2e-05, "loss": 1.2862, "loss/crossentropy": 2.4683635234832764, "loss/dist_ce": 0.0, "loss/fcd": 1.109375, "loss/idx": 13.0, "loss/logits": 0.17678457498550415, "step": 1073 }, { "epoch": 0.016037031506644768, "grad_norm": 0.57421875, "grad_norm_var": 0.009135293960571288, "learning_rate": 2e-05, "loss": 1.2834, "loss/crossentropy": 2.6184275150299072, "loss/dist_ce": 0.0, "loss/fcd": 1.1015625, "loss/idx": 13.0, "loss/logits": 0.1817963421344757, "step": 1074 }, { "epoch": 0.01605196356577572, "grad_norm": 0.6015625, "grad_norm_var": 0.009017419815063477, "learning_rate": 2e-05, "loss": 1.4443, "loss/crossentropy": 2.3622615337371826, "loss/dist_ce": 0.0, "loss/fcd": 1.2421875, "loss/idx": 13.0, "loss/logits": 0.20212198793888092, "step": 1075 }, { "epoch": 0.016066895624906676, "grad_norm": 0.5078125, "grad_norm_var": 0.009087610244750976, "learning_rate": 2e-05, "loss": 1.2307, "loss/crossentropy": 2.550598382949829, "loss/dist_ce": 0.0, "loss/fcd": 1.0625, "loss/idx": 13.0, "loss/logits": 0.1681874394416809, "step": 1076 }, { "epoch": 0.01608182768403763, "grad_norm": 0.5234375, "grad_norm_var": 0.009439961115519205, "learning_rate": 2e-05, "loss": 1.2534, "loss/crossentropy": 2.5999906063079834, "loss/dist_ce": 0.0, "loss/fcd": 1.0703125, "loss/idx": 13.0, "loss/logits": 0.183104008436203, "step": 1077 }, { "epoch": 0.016096759743168584, "grad_norm": 0.546875, "grad_norm_var": 0.006123971939086914, "learning_rate": 2e-05, "loss": 1.3151, "loss/crossentropy": 2.518636703491211, "loss/dist_ce": 0.0, "loss/fcd": 1.125, "loss/idx": 13.0, "loss/logits": 0.190095454454422, "step": 1078 }, { "epoch": 0.016111691802299537, "grad_norm": 0.53515625, "grad_norm_var": 0.0036707401275634767, "learning_rate": 2e-05, "loss": 1.2221, "loss/crossentropy": 2.473445177078247, "loss/dist_ce": 0.0, "loss/fcd": 1.0625, "loss/idx": 13.0, "loss/logits": 0.15955936908721924, "step": 1079 }, { "epoch": 0.016126623861430493, "grad_norm": 0.50390625, "grad_norm_var": 0.0035547733306884764, "learning_rate": 2e-05, "loss": 1.2402, "loss/crossentropy": 2.5388095378875732, "loss/dist_ce": 0.0, "loss/fcd": 1.0703125, "loss/idx": 13.0, "loss/logits": 0.16985295712947845, "step": 1080 }, { "epoch": 0.016141555920561445, "grad_norm": 0.5625, "grad_norm_var": 0.003527180353800456, "learning_rate": 2e-05, "loss": 1.3255, "loss/crossentropy": 2.577446222305298, "loss/dist_ce": 0.0, "loss/fcd": 1.1328125, "loss/idx": 13.0, "loss/logits": 0.19264891743659973, "step": 1081 }, { "epoch": 0.0161564879796924, "grad_norm": 0.47265625, "grad_norm_var": 0.00364073117574056, "learning_rate": 2e-05, "loss": 1.1981, "loss/crossentropy": 2.561262369155884, "loss/dist_ce": 0.0, "loss/fcd": 1.0390625, "loss/idx": 13.0, "loss/logits": 0.1590115875005722, "step": 1082 }, { "epoch": 0.016171420038823353, "grad_norm": 0.58984375, "grad_norm_var": 0.0036990960439046224, "learning_rate": 2e-05, "loss": 1.3113, "loss/crossentropy": 2.3770103454589844, "loss/dist_ce": 0.0, "loss/fcd": 1.125, "loss/idx": 13.0, "loss/logits": 0.18632197380065918, "step": 1083 }, { "epoch": 0.01618635209795431, "grad_norm": 0.6796875, "grad_norm_var": 0.004416338602701823, "learning_rate": 2e-05, "loss": 1.4186, "loss/crossentropy": 2.4611380100250244, "loss/dist_ce": 0.0, "loss/fcd": 1.21875, "loss/idx": 13.0, "loss/logits": 0.19985929131507874, "step": 1084 }, { "epoch": 0.01620128415708526, "grad_norm": 0.52734375, "grad_norm_var": 0.004416338602701823, "learning_rate": 2e-05, "loss": 1.2343, "loss/crossentropy": 2.508803129196167, "loss/dist_ce": 0.0, "loss/fcd": 1.0625, "loss/idx": 13.0, "loss/logits": 0.17179152369499207, "step": 1085 }, { "epoch": 0.016216216216216217, "grad_norm": 0.7578125, "grad_norm_var": 0.00645898183186849, "learning_rate": 2e-05, "loss": 1.271, "loss/crossentropy": 2.593477964401245, "loss/dist_ce": 0.0, "loss/fcd": 1.0703125, "loss/idx": 13.0, "loss/logits": 0.20067906379699707, "step": 1086 }, { "epoch": 0.01623114827534717, "grad_norm": 0.515625, "grad_norm_var": 0.006655311584472657, "learning_rate": 2e-05, "loss": 1.1943, "loss/crossentropy": 2.566943407058716, "loss/dist_ce": 0.0, "loss/fcd": 1.03125, "loss/idx": 13.0, "loss/logits": 0.16300199925899506, "step": 1087 }, { "epoch": 0.016246080334478125, "grad_norm": 0.54296875, "grad_norm_var": 0.005193010965983073, "learning_rate": 2e-05, "loss": 1.1865, "loss/crossentropy": 2.6508474349975586, "loss/dist_ce": 0.0, "loss/fcd": 1.0234375, "loss/idx": 13.0, "loss/logits": 0.1630159616470337, "step": 1088 }, { "epoch": 0.016261012393609078, "grad_norm": 0.47265625, "grad_norm_var": 0.005515289306640625, "learning_rate": 2e-05, "loss": 1.1207, "loss/crossentropy": 2.662527561187744, "loss/dist_ce": 0.0, "loss/fcd": 0.98046875, "loss/idx": 13.0, "loss/logits": 0.14020352065563202, "step": 1089 }, { "epoch": 0.016275944452740033, "grad_norm": 0.65625, "grad_norm_var": 0.006122779846191406, "learning_rate": 2e-05, "loss": 1.2878, "loss/crossentropy": 2.4766385555267334, "loss/dist_ce": 0.0, "loss/fcd": 1.0859375, "loss/idx": 13.0, "loss/logits": 0.2018873393535614, "step": 1090 }, { "epoch": 0.016290876511870986, "grad_norm": 0.52734375, "grad_norm_var": 0.006078084309895833, "learning_rate": 2e-05, "loss": 1.3068, "loss/crossentropy": 2.465484380722046, "loss/dist_ce": 0.0, "loss/fcd": 1.125, "loss/idx": 13.0, "loss/logits": 0.18179619312286377, "step": 1091 }, { "epoch": 0.01630580857100194, "grad_norm": 0.6015625, "grad_norm_var": 0.006004842122395834, "learning_rate": 2e-05, "loss": 1.383, "loss/crossentropy": 2.4888992309570312, "loss/dist_ce": 0.0, "loss/fcd": 1.171875, "loss/idx": 13.0, "loss/logits": 0.21115002036094666, "step": 1092 }, { "epoch": 0.016320740630132894, "grad_norm": 0.66015625, "grad_norm_var": 0.006443214416503906, "learning_rate": 2e-05, "loss": 1.5633, "loss/crossentropy": 2.5339741706848145, "loss/dist_ce": 0.0, "loss/fcd": 1.3125, "loss/idx": 13.0, "loss/logits": 0.2507936358451843, "step": 1093 }, { "epoch": 0.01633567268926385, "grad_norm": 0.48046875, "grad_norm_var": 0.006941477457682292, "learning_rate": 2e-05, "loss": 1.1839, "loss/crossentropy": 2.5070247650146484, "loss/dist_ce": 0.0, "loss/fcd": 1.03125, "loss/idx": 13.0, "loss/logits": 0.1526746153831482, "step": 1094 }, { "epoch": 0.016350604748394802, "grad_norm": 0.58203125, "grad_norm_var": 0.0068743387858072914, "learning_rate": 2e-05, "loss": 1.3714, "loss/crossentropy": 2.454162359237671, "loss/dist_ce": 0.0, "loss/fcd": 1.171875, "loss/idx": 13.0, "loss/logits": 0.19956299662590027, "step": 1095 }, { "epoch": 0.016365536807525758, "grad_norm": 0.5, "grad_norm_var": 0.006910133361816406, "learning_rate": 2e-05, "loss": 1.1066, "loss/crossentropy": 2.6837570667266846, "loss/dist_ce": 0.0, "loss/fcd": 0.9609375, "loss/idx": 13.0, "loss/logits": 0.14565584063529968, "step": 1096 }, { "epoch": 0.01638046886665671, "grad_norm": 0.6015625, "grad_norm_var": 0.006963539123535156, "learning_rate": 2e-05, "loss": 1.3122, "loss/crossentropy": 2.467454433441162, "loss/dist_ce": 0.0, "loss/fcd": 1.125, "loss/idx": 13.0, "loss/logits": 0.18724094331264496, "step": 1097 }, { "epoch": 0.016395400925787666, "grad_norm": 0.65234375, "grad_norm_var": 0.006577491760253906, "learning_rate": 2e-05, "loss": 1.3914, "loss/crossentropy": 2.465182065963745, "loss/dist_ce": 0.0, "loss/fcd": 1.1953125, "loss/idx": 13.0, "loss/logits": 0.19610178470611572, "step": 1098 }, { "epoch": 0.01641033298491862, "grad_norm": 0.5703125, "grad_norm_var": 0.006586710611979167, "learning_rate": 2e-05, "loss": 1.3055, "loss/crossentropy": 2.500645637512207, "loss/dist_ce": 0.0, "loss/fcd": 1.1328125, "loss/idx": 13.0, "loss/logits": 0.17266511917114258, "step": 1099 }, { "epoch": 0.016425265044049574, "grad_norm": 0.59375, "grad_norm_var": 0.005940500895182292, "learning_rate": 2e-05, "loss": 1.3624, "loss/crossentropy": 2.6782145500183105, "loss/dist_ce": 0.0, "loss/fcd": 1.1484375, "loss/idx": 13.0, "loss/logits": 0.21395066380500793, "step": 1100 }, { "epoch": 0.01644019710318053, "grad_norm": 0.5546875, "grad_norm_var": 0.005803871154785156, "learning_rate": 2e-05, "loss": 1.2681, "loss/crossentropy": 2.5062341690063477, "loss/dist_ce": 0.0, "loss/fcd": 1.09375, "loss/idx": 13.0, "loss/logits": 0.17436693608760834, "step": 1101 }, { "epoch": 0.016455129162311483, "grad_norm": 0.53125, "grad_norm_var": 0.0036208470662434894, "learning_rate": 2e-05, "loss": 1.2351, "loss/crossentropy": 2.7024452686309814, "loss/dist_ce": 0.0, "loss/fcd": 1.0625, "loss/idx": 13.0, "loss/logits": 0.17257672548294067, "step": 1102 }, { "epoch": 0.01647006122144244, "grad_norm": 0.5546875, "grad_norm_var": 0.003458086649576823, "learning_rate": 2e-05, "loss": 1.3367, "loss/crossentropy": 2.4043080806732178, "loss/dist_ce": 0.0, "loss/fcd": 1.1484375, "loss/idx": 13.0, "loss/logits": 0.1883116364479065, "step": 1103 }, { "epoch": 0.01648499328057339, "grad_norm": 0.55078125, "grad_norm_var": 0.0034362157185872395, "learning_rate": 2e-05, "loss": 1.2919, "loss/crossentropy": 2.891212224960327, "loss/dist_ce": 0.0, "loss/fcd": 1.1015625, "loss/idx": 13.0, "loss/logits": 0.19036878645420074, "step": 1104 }, { "epoch": 0.016499925339704347, "grad_norm": 0.515625, "grad_norm_var": 0.0030047098795572915, "learning_rate": 2e-05, "loss": 1.2393, "loss/crossentropy": 2.4849252700805664, "loss/dist_ce": 0.0, "loss/fcd": 1.0703125, "loss/idx": 13.0, "loss/logits": 0.16896244883537292, "step": 1105 }, { "epoch": 0.0165148573988353, "grad_norm": 0.6953125, "grad_norm_var": 0.0035451253255208335, "learning_rate": 2e-05, "loss": 1.376, "loss/crossentropy": 2.5396435260772705, "loss/dist_ce": 0.0, "loss/fcd": 1.1875, "loss/idx": 13.0, "loss/logits": 0.18845024704933167, "step": 1106 }, { "epoch": 0.016529789457966255, "grad_norm": 0.4765625, "grad_norm_var": 0.004017066955566406, "learning_rate": 2e-05, "loss": 1.0813, "loss/crossentropy": 2.8166513442993164, "loss/dist_ce": 0.0, "loss/fcd": 0.9453125, "loss/idx": 13.0, "loss/logits": 0.13603615760803223, "step": 1107 }, { "epoch": 0.016544721517097207, "grad_norm": 0.609375, "grad_norm_var": 0.004053688049316407, "learning_rate": 2e-05, "loss": 1.3427, "loss/crossentropy": 2.6875510215759277, "loss/dist_ce": 0.0, "loss/fcd": 1.1484375, "loss/idx": 13.0, "loss/logits": 0.1942945122718811, "step": 1108 }, { "epoch": 0.016559653576228163, "grad_norm": 0.5078125, "grad_norm_var": 0.003684234619140625, "learning_rate": 2e-05, "loss": 1.2945, "loss/crossentropy": 2.7076685428619385, "loss/dist_ce": 0.0, "loss/fcd": 1.1015625, "loss/idx": 13.0, "loss/logits": 0.19290290772914886, "step": 1109 }, { "epoch": 0.016574585635359115, "grad_norm": 0.5703125, "grad_norm_var": 0.003223609924316406, "learning_rate": 2e-05, "loss": 1.2231, "loss/crossentropy": 2.6345443725585938, "loss/dist_ce": 0.0, "loss/fcd": 1.0546875, "loss/idx": 13.0, "loss/logits": 0.16839203238487244, "step": 1110 }, { "epoch": 0.01658951769449007, "grad_norm": 0.5546875, "grad_norm_var": 0.003214263916015625, "learning_rate": 2e-05, "loss": 1.305, "loss/crossentropy": 2.5353426933288574, "loss/dist_ce": 0.0, "loss/fcd": 1.125, "loss/idx": 13.0, "loss/logits": 0.17995133996009827, "step": 1111 }, { "epoch": 0.016604449753621024, "grad_norm": 0.53125, "grad_norm_var": 0.0030047098795572915, "learning_rate": 2e-05, "loss": 1.2216, "loss/crossentropy": 2.474064826965332, "loss/dist_ce": 0.0, "loss/fcd": 1.0546875, "loss/idx": 13.0, "loss/logits": 0.1669555902481079, "step": 1112 }, { "epoch": 0.01661938181275198, "grad_norm": 0.53125, "grad_norm_var": 0.0029886881510416668, "learning_rate": 2e-05, "loss": 1.2778, "loss/crossentropy": 2.3786816596984863, "loss/dist_ce": 0.0, "loss/fcd": 1.1015625, "loss/idx": 13.0, "loss/logits": 0.17628170549869537, "step": 1113 }, { "epoch": 0.01663431387188293, "grad_norm": 0.494140625, "grad_norm_var": 0.002657810846964518, "learning_rate": 2e-05, "loss": 1.2608, "loss/crossentropy": 2.5369842052459717, "loss/dist_ce": 0.0, "loss/fcd": 1.0859375, "loss/idx": 13.0, "loss/logits": 0.17481496930122375, "step": 1114 }, { "epoch": 0.016649245931013888, "grad_norm": 0.59765625, "grad_norm_var": 0.0027690728505452475, "learning_rate": 2e-05, "loss": 1.3831, "loss/crossentropy": 2.764906167984009, "loss/dist_ce": 0.0, "loss/fcd": 1.1796875, "loss/idx": 13.0, "loss/logits": 0.20345237851142883, "step": 1115 }, { "epoch": 0.01666417799014484, "grad_norm": 0.57421875, "grad_norm_var": 0.0026902357737223306, "learning_rate": 2e-05, "loss": 1.42, "loss/crossentropy": 2.3619296550750732, "loss/dist_ce": 0.0, "loss/fcd": 1.1953125, "loss/idx": 13.0, "loss/logits": 0.22464251518249512, "step": 1116 }, { "epoch": 0.016679110049275796, "grad_norm": 0.515625, "grad_norm_var": 0.0027773380279541016, "learning_rate": 2e-05, "loss": 1.2734, "loss/crossentropy": 2.6021158695220947, "loss/dist_ce": 0.0, "loss/fcd": 1.0859375, "loss/idx": 13.0, "loss/logits": 0.18749283254146576, "step": 1117 }, { "epoch": 0.016694042108406748, "grad_norm": 0.703125, "grad_norm_var": 0.004178857803344727, "learning_rate": 2e-05, "loss": 1.4517, "loss/crossentropy": 2.440187931060791, "loss/dist_ce": 0.0, "loss/fcd": 1.25, "loss/idx": 13.0, "loss/logits": 0.20173925161361694, "step": 1118 }, { "epoch": 0.016708974167537704, "grad_norm": 0.5546875, "grad_norm_var": 0.004178857803344727, "learning_rate": 2e-05, "loss": 1.4031, "loss/crossentropy": 2.343580722808838, "loss/dist_ce": 0.0, "loss/fcd": 1.1875, "loss/idx": 13.0, "loss/logits": 0.21561887860298157, "step": 1119 }, { "epoch": 0.016723906226668656, "grad_norm": 0.6328125, "grad_norm_var": 0.004483270645141602, "learning_rate": 2e-05, "loss": 1.4455, "loss/crossentropy": 2.702117681503296, "loss/dist_ce": 0.0, "loss/fcd": 1.203125, "loss/idx": 13.0, "loss/logits": 0.24238847196102142, "step": 1120 }, { "epoch": 0.016738838285799612, "grad_norm": 0.7734375, "grad_norm_var": 0.006887674331665039, "learning_rate": 2e-05, "loss": 1.5223, "loss/crossentropy": 2.4388134479522705, "loss/dist_ce": 0.0, "loss/fcd": 1.2890625, "loss/idx": 13.0, "loss/logits": 0.23323826491832733, "step": 1121 }, { "epoch": 0.016753770344930564, "grad_norm": 0.6015625, "grad_norm_var": 0.006028604507446289, "learning_rate": 2e-05, "loss": 1.3789, "loss/crossentropy": 2.9548146724700928, "loss/dist_ce": 0.0, "loss/fcd": 1.171875, "loss/idx": 13.0, "loss/logits": 0.20707201957702637, "step": 1122 }, { "epoch": 0.01676870240406152, "grad_norm": 0.62109375, "grad_norm_var": 0.005402866999308268, "learning_rate": 2e-05, "loss": 1.2119, "loss/crossentropy": 2.4389278888702393, "loss/dist_ce": 0.0, "loss/fcd": 1.0546875, "loss/idx": 13.0, "loss/logits": 0.1572415977716446, "step": 1123 }, { "epoch": 0.016783634463192473, "grad_norm": 0.640625, "grad_norm_var": 0.0055620670318603516, "learning_rate": 2e-05, "loss": 1.4007, "loss/crossentropy": 2.364082098007202, "loss/dist_ce": 0.0, "loss/fcd": 1.1953125, "loss/idx": 13.0, "loss/logits": 0.20534616708755493, "step": 1124 }, { "epoch": 0.01679856652232343, "grad_norm": 0.67578125, "grad_norm_var": 0.005534728368123372, "learning_rate": 2e-05, "loss": 1.3358, "loss/crossentropy": 2.7452070713043213, "loss/dist_ce": 0.0, "loss/fcd": 1.140625, "loss/idx": 13.0, "loss/logits": 0.19514214992523193, "step": 1125 }, { "epoch": 0.016813498581454384, "grad_norm": 0.57421875, "grad_norm_var": 0.005521122614542643, "learning_rate": 2e-05, "loss": 1.3119, "loss/crossentropy": 2.608513116836548, "loss/dist_ce": 0.0, "loss/fcd": 1.109375, "loss/idx": 13.0, "loss/logits": 0.2025601863861084, "step": 1126 }, { "epoch": 0.016828430640585337, "grad_norm": 0.546875, "grad_norm_var": 0.0055705865224202475, "learning_rate": 2e-05, "loss": 1.2455, "loss/crossentropy": 2.6770386695861816, "loss/dist_ce": 0.0, "loss/fcd": 1.0703125, "loss/idx": 13.0, "loss/logits": 0.17516326904296875, "step": 1127 }, { "epoch": 0.016843362699716292, "grad_norm": 0.6171875, "grad_norm_var": 0.0052670637766520185, "learning_rate": 2e-05, "loss": 1.2574, "loss/crossentropy": 2.6152310371398926, "loss/dist_ce": 0.0, "loss/fcd": 1.0859375, "loss/idx": 13.0, "loss/logits": 0.17150159180164337, "step": 1128 }, { "epoch": 0.016858294758847245, "grad_norm": 0.57421875, "grad_norm_var": 0.0049691359202067055, "learning_rate": 2e-05, "loss": 1.2696, "loss/crossentropy": 2.4857561588287354, "loss/dist_ce": 0.0, "loss/fcd": 1.09375, "loss/idx": 13.0, "loss/logits": 0.1758662760257721, "step": 1129 }, { "epoch": 0.0168732268179782, "grad_norm": 0.54296875, "grad_norm_var": 0.004389381408691407, "learning_rate": 2e-05, "loss": 1.2337, "loss/crossentropy": 2.617910623550415, "loss/dist_ce": 0.0, "loss/fcd": 1.0703125, "loss/idx": 13.0, "loss/logits": 0.1634291708469391, "step": 1130 }, { "epoch": 0.016888158877109153, "grad_norm": 0.5234375, "grad_norm_var": 0.004847208658854167, "learning_rate": 2e-05, "loss": 1.2272, "loss/crossentropy": 2.406134605407715, "loss/dist_ce": 0.0, "loss/fcd": 1.0625, "loss/idx": 13.0, "loss/logits": 0.16469454765319824, "step": 1131 }, { "epoch": 0.01690309093624011, "grad_norm": 0.578125, "grad_norm_var": 0.0048323949178059895, "learning_rate": 2e-05, "loss": 1.3356, "loss/crossentropy": 2.5499508380889893, "loss/dist_ce": 0.0, "loss/fcd": 1.1328125, "loss/idx": 13.0, "loss/logits": 0.2028346061706543, "step": 1132 }, { "epoch": 0.01691802299537106, "grad_norm": 0.5390625, "grad_norm_var": 0.0045882542928059895, "learning_rate": 2e-05, "loss": 1.2556, "loss/crossentropy": 2.609267234802246, "loss/dist_ce": 0.0, "loss/fcd": 1.078125, "loss/idx": 13.0, "loss/logits": 0.17749843001365662, "step": 1133 }, { "epoch": 0.016932955054502017, "grad_norm": 0.92578125, "grad_norm_var": 0.010564168294270834, "learning_rate": 2e-05, "loss": 1.5228, "loss/crossentropy": 2.679161787033081, "loss/dist_ce": 0.0, "loss/fcd": 1.25, "loss/idx": 13.0, "loss/logits": 0.2727677822113037, "step": 1134 }, { "epoch": 0.01694788711363297, "grad_norm": 0.55078125, "grad_norm_var": 0.010599199930826824, "learning_rate": 2e-05, "loss": 1.2627, "loss/crossentropy": 2.477123737335205, "loss/dist_ce": 0.0, "loss/fcd": 1.0859375, "loss/idx": 13.0, "loss/logits": 0.1767975389957428, "step": 1135 }, { "epoch": 0.016962819172763925, "grad_norm": 0.52734375, "grad_norm_var": 0.011112467447916666, "learning_rate": 2e-05, "loss": 1.1844, "loss/crossentropy": 2.6230485439300537, "loss/dist_ce": 0.0, "loss/fcd": 1.0234375, "loss/idx": 13.0, "loss/logits": 0.16095471382141113, "step": 1136 }, { "epoch": 0.016977751231894878, "grad_norm": 0.640625, "grad_norm_var": 0.009378814697265625, "learning_rate": 2e-05, "loss": 1.3407, "loss/crossentropy": 2.6676883697509766, "loss/dist_ce": 0.0, "loss/fcd": 1.140625, "loss/idx": 13.0, "loss/logits": 0.20009231567382812, "step": 1137 }, { "epoch": 0.016992683291025833, "grad_norm": 0.578125, "grad_norm_var": 0.009423828125, "learning_rate": 2e-05, "loss": 1.3611, "loss/crossentropy": 2.3190951347351074, "loss/dist_ce": 0.0, "loss/fcd": 1.171875, "loss/idx": 13.0, "loss/logits": 0.18917620182037354, "step": 1138 }, { "epoch": 0.017007615350156786, "grad_norm": 0.58984375, "grad_norm_var": 0.00941162109375, "learning_rate": 2e-05, "loss": 1.4375, "loss/crossentropy": 2.5025153160095215, "loss/dist_ce": 0.0, "loss/fcd": 1.2109375, "loss/idx": 13.0, "loss/logits": 0.22656379640102386, "step": 1139 }, { "epoch": 0.01702254740928774, "grad_norm": 0.57421875, "grad_norm_var": 0.00934136708577474, "learning_rate": 2e-05, "loss": 1.3811, "loss/crossentropy": 2.7573082447052, "loss/dist_ce": 0.0, "loss/fcd": 1.1640625, "loss/idx": 13.0, "loss/logits": 0.2170415222644806, "step": 1140 }, { "epoch": 0.017037479468418694, "grad_norm": 0.57421875, "grad_norm_var": 0.00892480214436849, "learning_rate": 2e-05, "loss": 1.2483, "loss/crossentropy": 2.612473249435425, "loss/dist_ce": 0.0, "loss/fcd": 1.0703125, "loss/idx": 13.0, "loss/logits": 0.17800605297088623, "step": 1141 }, { "epoch": 0.01705241152754965, "grad_norm": 0.72265625, "grad_norm_var": 0.00996850331624349, "learning_rate": 2e-05, "loss": 1.4418, "loss/crossentropy": 2.723935604095459, "loss/dist_ce": 0.0, "loss/fcd": 1.21875, "loss/idx": 13.0, "loss/logits": 0.22303350269794464, "step": 1142 }, { "epoch": 0.017067343586680602, "grad_norm": 0.52734375, "grad_norm_var": 0.010131581624348959, "learning_rate": 2e-05, "loss": 1.2575, "loss/crossentropy": 2.4441022872924805, "loss/dist_ce": 0.0, "loss/fcd": 1.0859375, "loss/idx": 13.0, "loss/logits": 0.17158004641532898, "step": 1143 }, { "epoch": 0.017082275645811558, "grad_norm": 0.55078125, "grad_norm_var": 0.010247230529785156, "learning_rate": 2e-05, "loss": 1.2389, "loss/crossentropy": 2.5366430282592773, "loss/dist_ce": 0.0, "loss/fcd": 1.0625, "loss/idx": 13.0, "loss/logits": 0.1764407753944397, "step": 1144 }, { "epoch": 0.01709720770494251, "grad_norm": 0.52734375, "grad_norm_var": 0.010514259338378906, "learning_rate": 2e-05, "loss": 1.2357, "loss/crossentropy": 2.7200098037719727, "loss/dist_ce": 0.0, "loss/fcd": 1.0625, "loss/idx": 13.0, "loss/logits": 0.1732417792081833, "step": 1145 }, { "epoch": 0.017112139764073466, "grad_norm": 0.6328125, "grad_norm_var": 0.010430908203125, "learning_rate": 2e-05, "loss": 1.4005, "loss/crossentropy": 2.693432331085205, "loss/dist_ce": 0.0, "loss/fcd": 1.1875, "loss/idx": 13.0, "loss/logits": 0.21298031508922577, "step": 1146 }, { "epoch": 0.01712707182320442, "grad_norm": 0.6796875, "grad_norm_var": 0.010410563151041666, "learning_rate": 2e-05, "loss": 1.4696, "loss/crossentropy": 2.30118989944458, "loss/dist_ce": 0.0, "loss/fcd": 1.265625, "loss/idx": 13.0, "loss/logits": 0.20401182770729065, "step": 1147 }, { "epoch": 0.017142003882335374, "grad_norm": 0.52734375, "grad_norm_var": 0.010770098368326823, "learning_rate": 2e-05, "loss": 1.1958, "loss/crossentropy": 2.67213773727417, "loss/dist_ce": 0.0, "loss/fcd": 1.0390625, "loss/idx": 13.0, "loss/logits": 0.15674254298210144, "step": 1148 }, { "epoch": 0.017156935941466327, "grad_norm": 0.52734375, "grad_norm_var": 0.010880533854166667, "learning_rate": 2e-05, "loss": 1.2911, "loss/crossentropy": 2.788635730743408, "loss/dist_ce": 0.0, "loss/fcd": 1.09375, "loss/idx": 13.0, "loss/logits": 0.19733943045139313, "step": 1149 }, { "epoch": 0.017171868000597282, "grad_norm": 0.671875, "grad_norm_var": 0.003999773661295573, "learning_rate": 2e-05, "loss": 1.2827, "loss/crossentropy": 2.373025894165039, "loss/dist_ce": 0.0, "loss/fcd": 1.1015625, "loss/idx": 13.0, "loss/logits": 0.18114924430847168, "step": 1150 }, { "epoch": 0.017186800059728235, "grad_norm": 0.5546875, "grad_norm_var": 0.003981526692708333, "learning_rate": 2e-05, "loss": 1.3812, "loss/crossentropy": 2.4267663955688477, "loss/dist_ce": 0.0, "loss/fcd": 1.1796875, "loss/idx": 13.0, "loss/logits": 0.2015307992696762, "step": 1151 }, { "epoch": 0.01720173211885919, "grad_norm": 0.5703125, "grad_norm_var": 0.003750038146972656, "learning_rate": 2e-05, "loss": 1.263, "loss/crossentropy": 2.414804458618164, "loss/dist_ce": 0.0, "loss/fcd": 1.078125, "loss/idx": 13.0, "loss/logits": 0.1849151998758316, "step": 1152 }, { "epoch": 0.017216664177990147, "grad_norm": 0.58984375, "grad_norm_var": 0.0035723368326822918, "learning_rate": 2e-05, "loss": 1.3073, "loss/crossentropy": 2.3469202518463135, "loss/dist_ce": 0.0, "loss/fcd": 1.1171875, "loss/idx": 13.0, "loss/logits": 0.19007420539855957, "step": 1153 }, { "epoch": 0.0172315962371211, "grad_norm": 0.7109375, "grad_norm_var": 0.004510498046875, "learning_rate": 2e-05, "loss": 1.36, "loss/crossentropy": 2.6144962310791016, "loss/dist_ce": 0.0, "loss/fcd": 1.171875, "loss/idx": 13.0, "loss/logits": 0.18814045190811157, "step": 1154 }, { "epoch": 0.017246528296252055, "grad_norm": 0.53515625, "grad_norm_var": 0.004740142822265625, "learning_rate": 2e-05, "loss": 1.1889, "loss/crossentropy": 2.7515432834625244, "loss/dist_ce": 0.0, "loss/fcd": 1.03125, "loss/idx": 13.0, "loss/logits": 0.15762561559677124, "step": 1155 }, { "epoch": 0.017261460355383007, "grad_norm": 0.48828125, "grad_norm_var": 0.005408732096354166, "learning_rate": 2e-05, "loss": 1.142, "loss/crossentropy": 2.41715407371521, "loss/dist_ce": 0.0, "loss/fcd": 1.0, "loss/idx": 13.0, "loss/logits": 0.14197902381420135, "step": 1156 }, { "epoch": 0.017276392414513963, "grad_norm": 0.5390625, "grad_norm_var": 0.005545488993326823, "learning_rate": 2e-05, "loss": 1.2009, "loss/crossentropy": 2.6549770832061768, "loss/dist_ce": 0.0, "loss/fcd": 1.03125, "loss/idx": 13.0, "loss/logits": 0.16962096095085144, "step": 1157 }, { "epoch": 0.017291324473644915, "grad_norm": 0.59765625, "grad_norm_var": 0.004223060607910156, "learning_rate": 2e-05, "loss": 1.3874, "loss/crossentropy": 2.5990960597991943, "loss/dist_ce": 0.0, "loss/fcd": 1.1875, "loss/idx": 13.0, "loss/logits": 0.1999187469482422, "step": 1158 }, { "epoch": 0.01730625653277587, "grad_norm": 0.875, "grad_norm_var": 0.009479777018229166, "learning_rate": 2e-05, "loss": 1.4493, "loss/crossentropy": 2.7729477882385254, "loss/dist_ce": 0.0, "loss/fcd": 1.2265625, "loss/idx": 13.0, "loss/logits": 0.22276878356933594, "step": 1159 }, { "epoch": 0.017321188591906823, "grad_norm": 0.58984375, "grad_norm_var": 0.009325917561848958, "learning_rate": 2e-05, "loss": 1.3001, "loss/crossentropy": 3.0190675258636475, "loss/dist_ce": 0.0, "loss/fcd": 1.125, "loss/idx": 13.0, "loss/logits": 0.17512644827365875, "step": 1160 }, { "epoch": 0.01733612065103778, "grad_norm": 0.546875, "grad_norm_var": 0.009157752990722657, "learning_rate": 2e-05, "loss": 1.2894, "loss/crossentropy": 2.6917951107025146, "loss/dist_ce": 0.0, "loss/fcd": 1.1015625, "loss/idx": 13.0, "loss/logits": 0.18787968158721924, "step": 1161 }, { "epoch": 0.01735105271016873, "grad_norm": 0.55859375, "grad_norm_var": 0.009200032552083333, "learning_rate": 2e-05, "loss": 1.396, "loss/crossentropy": 2.4003427028656006, "loss/dist_ce": 0.0, "loss/fcd": 1.1796875, "loss/idx": 13.0, "loss/logits": 0.21633100509643555, "step": 1162 }, { "epoch": 0.017365984769299687, "grad_norm": 0.494140625, "grad_norm_var": 0.009322341283162434, "learning_rate": 2e-05, "loss": 1.2178, "loss/crossentropy": 2.5328409671783447, "loss/dist_ce": 0.0, "loss/fcd": 1.046875, "loss/idx": 13.0, "loss/logits": 0.17093852162361145, "step": 1163 }, { "epoch": 0.01738091682843064, "grad_norm": 0.5, "grad_norm_var": 0.009583139419555664, "learning_rate": 2e-05, "loss": 1.2184, "loss/crossentropy": 2.6213250160217285, "loss/dist_ce": 0.0, "loss/fcd": 1.0546875, "loss/idx": 13.0, "loss/logits": 0.16373440623283386, "step": 1164 }, { "epoch": 0.017395848887561596, "grad_norm": 0.515625, "grad_norm_var": 0.009680795669555663, "learning_rate": 2e-05, "loss": 1.2133, "loss/crossentropy": 2.523061513900757, "loss/dist_ce": 0.0, "loss/fcd": 1.046875, "loss/idx": 13.0, "loss/logits": 0.1664462834596634, "step": 1165 }, { "epoch": 0.017410780946692548, "grad_norm": 0.57421875, "grad_norm_var": 0.009127664566040038, "learning_rate": 2e-05, "loss": 1.2568, "loss/crossentropy": 2.529860496520996, "loss/dist_ce": 0.0, "loss/fcd": 1.0703125, "loss/idx": 13.0, "loss/logits": 0.18652039766311646, "step": 1166 }, { "epoch": 0.017425713005823504, "grad_norm": 0.66796875, "grad_norm_var": 0.009584919611612955, "learning_rate": 2e-05, "loss": 1.3846, "loss/crossentropy": 2.6294994354248047, "loss/dist_ce": 0.0, "loss/fcd": 1.1640625, "loss/idx": 13.0, "loss/logits": 0.2205285131931305, "step": 1167 }, { "epoch": 0.017440645064954456, "grad_norm": 0.79296875, "grad_norm_var": 0.012259403864542643, "learning_rate": 2e-05, "loss": 1.5955, "loss/crossentropy": 2.7622482776641846, "loss/dist_ce": 0.0, "loss/fcd": 1.3203125, "loss/idx": 13.0, "loss/logits": 0.2751520276069641, "step": 1168 }, { "epoch": 0.017455577124085412, "grad_norm": 0.494140625, "grad_norm_var": 0.01294244130452474, "learning_rate": 2e-05, "loss": 1.3178, "loss/crossentropy": 2.5934271812438965, "loss/dist_ce": 0.0, "loss/fcd": 1.125, "loss/idx": 13.0, "loss/logits": 0.19280162453651428, "step": 1169 }, { "epoch": 0.017470509183216364, "grad_norm": 0.498046875, "grad_norm_var": 0.012414026260375976, "learning_rate": 2e-05, "loss": 1.0967, "loss/crossentropy": 2.6452291011810303, "loss/dist_ce": 0.0, "loss/fcd": 0.953125, "loss/idx": 13.0, "loss/logits": 0.14357757568359375, "step": 1170 }, { "epoch": 0.01748544124234732, "grad_norm": 0.52734375, "grad_norm_var": 0.012463744481404622, "learning_rate": 2e-05, "loss": 1.1795, "loss/crossentropy": 2.3400866985321045, "loss/dist_ce": 0.0, "loss/fcd": 1.015625, "loss/idx": 13.0, "loss/logits": 0.16387495398521423, "step": 1171 }, { "epoch": 0.017500373301478273, "grad_norm": 0.5078125, "grad_norm_var": 0.012252028783162434, "learning_rate": 2e-05, "loss": 1.2582, "loss/crossentropy": 2.6386451721191406, "loss/dist_ce": 0.0, "loss/fcd": 1.0859375, "loss/idx": 13.0, "loss/logits": 0.17226719856262207, "step": 1172 }, { "epoch": 0.01751530536060923, "grad_norm": 0.58984375, "grad_norm_var": 0.012136316299438477, "learning_rate": 2e-05, "loss": 1.3139, "loss/crossentropy": 2.514528274536133, "loss/dist_ce": 0.0, "loss/fcd": 1.125, "loss/idx": 13.0, "loss/logits": 0.18887433409690857, "step": 1173 }, { "epoch": 0.01753023741974018, "grad_norm": 0.52734375, "grad_norm_var": 0.012309122085571288, "learning_rate": 2e-05, "loss": 1.225, "loss/crossentropy": 2.5932071208953857, "loss/dist_ce": 0.0, "loss/fcd": 1.0546875, "loss/idx": 13.0, "loss/logits": 0.1703052967786789, "step": 1174 }, { "epoch": 0.017545169478871137, "grad_norm": 0.52734375, "grad_norm_var": 0.006130075454711914, "learning_rate": 2e-05, "loss": 1.1948, "loss/crossentropy": 2.6108434200286865, "loss/dist_ce": 0.0, "loss/fcd": 1.03125, "loss/idx": 13.0, "loss/logits": 0.163537859916687, "step": 1175 }, { "epoch": 0.01756010153800209, "grad_norm": 0.6171875, "grad_norm_var": 0.006296523412068685, "learning_rate": 2e-05, "loss": 1.4186, "loss/crossentropy": 2.4866180419921875, "loss/dist_ce": 0.0, "loss/fcd": 1.1875, "loss/idx": 13.0, "loss/logits": 0.23113754391670227, "step": 1176 }, { "epoch": 0.017575033597133045, "grad_norm": 0.87109375, "grad_norm_var": 0.012354516983032226, "learning_rate": 2e-05, "loss": 1.2533, "loss/crossentropy": 2.478165626525879, "loss/dist_ce": 0.0, "loss/fcd": 1.0859375, "loss/idx": 13.0, "loss/logits": 0.16738183796405792, "step": 1177 }, { "epoch": 0.017589965656263997, "grad_norm": 0.50390625, "grad_norm_var": 0.012690083185831705, "learning_rate": 2e-05, "loss": 1.2588, "loss/crossentropy": 2.6207003593444824, "loss/dist_ce": 0.0, "loss/fcd": 1.078125, "loss/idx": 13.0, "loss/logits": 0.1807083934545517, "step": 1178 }, { "epoch": 0.017604897715394953, "grad_norm": 0.53515625, "grad_norm_var": 0.012349955240885417, "learning_rate": 2e-05, "loss": 1.2262, "loss/crossentropy": 2.719006299972534, "loss/dist_ce": 0.0, "loss/fcd": 1.0546875, "loss/idx": 13.0, "loss/logits": 0.1715444028377533, "step": 1179 }, { "epoch": 0.01761982977452591, "grad_norm": 0.625, "grad_norm_var": 0.012024434407552083, "learning_rate": 2e-05, "loss": 1.3838, "loss/crossentropy": 2.394318103790283, "loss/dist_ce": 0.0, "loss/fcd": 1.1796875, "loss/idx": 13.0, "loss/logits": 0.20415380597114563, "step": 1180 }, { "epoch": 0.01763476183365686, "grad_norm": 0.5390625, "grad_norm_var": 0.011839040120442708, "learning_rate": 2e-05, "loss": 1.2821, "loss/crossentropy": 2.665447235107422, "loss/dist_ce": 0.0, "loss/fcd": 1.09375, "loss/idx": 13.0, "loss/logits": 0.18832457065582275, "step": 1181 }, { "epoch": 0.017649693892787817, "grad_norm": 0.60546875, "grad_norm_var": 0.011845143636067708, "learning_rate": 2e-05, "loss": 1.3763, "loss/crossentropy": 2.765406608581543, "loss/dist_ce": 0.0, "loss/fcd": 1.171875, "loss/idx": 13.0, "loss/logits": 0.2044430673122406, "step": 1182 }, { "epoch": 0.01766462595191877, "grad_norm": 0.62890625, "grad_norm_var": 0.01153106689453125, "learning_rate": 2e-05, "loss": 1.288, "loss/crossentropy": 2.4687106609344482, "loss/dist_ce": 0.0, "loss/fcd": 1.078125, "loss/idx": 13.0, "loss/logits": 0.20986786484718323, "step": 1183 }, { "epoch": 0.017679558011049725, "grad_norm": 0.478515625, "grad_norm_var": 0.009071842829386393, "learning_rate": 2e-05, "loss": 1.2231, "loss/crossentropy": 2.5798656940460205, "loss/dist_ce": 0.0, "loss/fcd": 1.046875, "loss/idx": 13.0, "loss/logits": 0.17622314393520355, "step": 1184 }, { "epoch": 0.017694490070180677, "grad_norm": 0.6328125, "grad_norm_var": 0.00892175038655599, "learning_rate": 2e-05, "loss": 1.1994, "loss/crossentropy": 2.4811015129089355, "loss/dist_ce": 0.0, "loss/fcd": 1.046875, "loss/idx": 13.0, "loss/logits": 0.1525367796421051, "step": 1185 }, { "epoch": 0.017709422129311633, "grad_norm": 0.60546875, "grad_norm_var": 0.008527485529581706, "learning_rate": 2e-05, "loss": 1.3831, "loss/crossentropy": 2.6285629272460938, "loss/dist_ce": 0.0, "loss/fcd": 1.171875, "loss/idx": 13.0, "loss/logits": 0.21119612455368042, "step": 1186 }, { "epoch": 0.017724354188442586, "grad_norm": 0.55078125, "grad_norm_var": 0.008389012018839518, "learning_rate": 2e-05, "loss": 1.251, "loss/crossentropy": 2.726686716079712, "loss/dist_ce": 0.0, "loss/fcd": 1.078125, "loss/idx": 13.0, "loss/logits": 0.1728990077972412, "step": 1187 }, { "epoch": 0.01773928624757354, "grad_norm": 0.5078125, "grad_norm_var": 0.008389012018839518, "learning_rate": 2e-05, "loss": 1.2366, "loss/crossentropy": 2.5716280937194824, "loss/dist_ce": 0.0, "loss/fcd": 1.0703125, "loss/idx": 13.0, "loss/logits": 0.16626042127609253, "step": 1188 }, { "epoch": 0.017754218306704494, "grad_norm": 0.5078125, "grad_norm_var": 0.008746830622355144, "learning_rate": 2e-05, "loss": 1.261, "loss/crossentropy": 2.5722556114196777, "loss/dist_ce": 0.0, "loss/fcd": 1.0859375, "loss/idx": 13.0, "loss/logits": 0.17502065002918243, "step": 1189 }, { "epoch": 0.01776915036583545, "grad_norm": 0.51171875, "grad_norm_var": 0.008869663874308268, "learning_rate": 2e-05, "loss": 1.1607, "loss/crossentropy": 2.6687557697296143, "loss/dist_ce": 0.0, "loss/fcd": 1.015625, "loss/idx": 13.0, "loss/logits": 0.145055890083313, "step": 1190 }, { "epoch": 0.017784082424966402, "grad_norm": 0.494140625, "grad_norm_var": 0.009162839253743489, "learning_rate": 2e-05, "loss": 1.1919, "loss/crossentropy": 2.54484486579895, "loss/dist_ce": 0.0, "loss/fcd": 1.03125, "loss/idx": 13.0, "loss/logits": 0.16066983342170715, "step": 1191 }, { "epoch": 0.017799014484097358, "grad_norm": 0.5546875, "grad_norm_var": 0.009063148498535156, "learning_rate": 2e-05, "loss": 1.2078, "loss/crossentropy": 2.6763405799865723, "loss/dist_ce": 0.0, "loss/fcd": 1.046875, "loss/idx": 13.0, "loss/logits": 0.16090230643749237, "step": 1192 }, { "epoch": 0.01781394654322831, "grad_norm": 0.55859375, "grad_norm_var": 0.0027053197224934894, "learning_rate": 2e-05, "loss": 1.3166, "loss/crossentropy": 2.541527509689331, "loss/dist_ce": 0.0, "loss/fcd": 1.125, "loss/idx": 13.0, "loss/logits": 0.1915903240442276, "step": 1193 }, { "epoch": 0.017828878602359266, "grad_norm": 0.61328125, "grad_norm_var": 0.0027444839477539064, "learning_rate": 2e-05, "loss": 1.2664, "loss/crossentropy": 2.8826029300689697, "loss/dist_ce": 0.0, "loss/fcd": 1.0859375, "loss/idx": 13.0, "loss/logits": 0.18048372864723206, "step": 1194 }, { "epoch": 0.01784381066149022, "grad_norm": 0.546875, "grad_norm_var": 0.002715301513671875, "learning_rate": 2e-05, "loss": 1.2014, "loss/crossentropy": 2.8553855419158936, "loss/dist_ce": 0.0, "loss/fcd": 1.0390625, "loss/idx": 13.0, "loss/logits": 0.16232401132583618, "step": 1195 }, { "epoch": 0.017858742720621174, "grad_norm": 0.58203125, "grad_norm_var": 0.002458635965983073, "learning_rate": 2e-05, "loss": 1.3419, "loss/crossentropy": 2.377991199493408, "loss/dist_ce": 0.0, "loss/fcd": 1.1640625, "loss/idx": 13.0, "loss/logits": 0.17786559462547302, "step": 1196 }, { "epoch": 0.017873674779752127, "grad_norm": 0.53125, "grad_norm_var": 0.002481524149576823, "learning_rate": 2e-05, "loss": 1.257, "loss/crossentropy": 2.6475048065185547, "loss/dist_ce": 0.0, "loss/fcd": 1.078125, "loss/idx": 13.0, "loss/logits": 0.17889858782291412, "step": 1197 }, { "epoch": 0.017888606838883082, "grad_norm": 0.60546875, "grad_norm_var": 0.002481524149576823, "learning_rate": 2e-05, "loss": 1.3621, "loss/crossentropy": 2.4759228229522705, "loss/dist_ce": 0.0, "loss/fcd": 1.15625, "loss/idx": 13.0, "loss/logits": 0.2058965563774109, "step": 1198 }, { "epoch": 0.017903538898014035, "grad_norm": 0.5, "grad_norm_var": 0.0022822062174479166, "learning_rate": 2e-05, "loss": 1.2517, "loss/crossentropy": 2.5964038372039795, "loss/dist_ce": 0.0, "loss/fcd": 1.0703125, "loss/idx": 13.0, "loss/logits": 0.18137618899345398, "step": 1199 }, { "epoch": 0.01791847095714499, "grad_norm": 0.5859375, "grad_norm_var": 0.001996342341105143, "learning_rate": 2e-05, "loss": 1.3369, "loss/crossentropy": 2.4271297454833984, "loss/dist_ce": 0.0, "loss/fcd": 1.1484375, "loss/idx": 13.0, "loss/logits": 0.18846943974494934, "step": 1200 }, { "epoch": 0.017933403016275943, "grad_norm": 0.55859375, "grad_norm_var": 0.0015759627024332682, "learning_rate": 2e-05, "loss": 1.3796, "loss/crossentropy": 2.558906316757202, "loss/dist_ce": 0.0, "loss/fcd": 1.140625, "loss/idx": 13.0, "loss/logits": 0.23895680904388428, "step": 1201 }, { "epoch": 0.0179483350754069, "grad_norm": 0.55859375, "grad_norm_var": 0.0013722578684488933, "learning_rate": 2e-05, "loss": 1.2437, "loss/crossentropy": 2.63244891166687, "loss/dist_ce": 0.0, "loss/fcd": 1.0859375, "loss/idx": 13.0, "loss/logits": 0.15778325498104095, "step": 1202 }, { "epoch": 0.01796326713453785, "grad_norm": 0.52734375, "grad_norm_var": 0.0013978163401285808, "learning_rate": 2e-05, "loss": 1.2569, "loss/crossentropy": 2.5377118587493896, "loss/dist_ce": 0.0, "loss/fcd": 1.0859375, "loss/idx": 13.0, "loss/logits": 0.17098784446716309, "step": 1203 }, { "epoch": 0.017978199193668807, "grad_norm": 0.5078125, "grad_norm_var": 0.0013978163401285808, "learning_rate": 2e-05, "loss": 1.2133, "loss/crossentropy": 2.4338786602020264, "loss/dist_ce": 0.0, "loss/fcd": 1.0625, "loss/idx": 13.0, "loss/logits": 0.15082281827926636, "step": 1204 }, { "epoch": 0.017993131252799763, "grad_norm": 0.6171875, "grad_norm_var": 0.0015811761220296224, "learning_rate": 2e-05, "loss": 1.3381, "loss/crossentropy": 2.4920754432678223, "loss/dist_ce": 0.0, "loss/fcd": 1.1328125, "loss/idx": 13.0, "loss/logits": 0.20530369877815247, "step": 1205 }, { "epoch": 0.018008063311930715, "grad_norm": 0.5078125, "grad_norm_var": 0.0016038099924723308, "learning_rate": 2e-05, "loss": 1.226, "loss/crossentropy": 2.680633783340454, "loss/dist_ce": 0.0, "loss/fcd": 1.046875, "loss/idx": 13.0, "loss/logits": 0.17915096879005432, "step": 1206 }, { "epoch": 0.01802299537106167, "grad_norm": 0.65234375, "grad_norm_var": 0.0019243876139322916, "learning_rate": 2e-05, "loss": 1.3583, "loss/crossentropy": 2.5132720470428467, "loss/dist_ce": 0.0, "loss/fcd": 1.171875, "loss/idx": 13.0, "loss/logits": 0.18643470108509064, "step": 1207 }, { "epoch": 0.018037927430192623, "grad_norm": 0.51171875, "grad_norm_var": 0.0020873387654622394, "learning_rate": 2e-05, "loss": 1.2533, "loss/crossentropy": 2.6504547595977783, "loss/dist_ce": 0.0, "loss/fcd": 1.0703125, "loss/idx": 13.0, "loss/logits": 0.18298813700675964, "step": 1208 }, { "epoch": 0.01805285948932358, "grad_norm": 0.61328125, "grad_norm_var": 0.0022617975870768228, "learning_rate": 2e-05, "loss": 1.3371, "loss/crossentropy": 2.7215096950531006, "loss/dist_ce": 0.0, "loss/fcd": 1.1328125, "loss/idx": 13.0, "loss/logits": 0.20427459478378296, "step": 1209 }, { "epoch": 0.01806779154845453, "grad_norm": 0.5390625, "grad_norm_var": 0.002115631103515625, "learning_rate": 2e-05, "loss": 1.2598, "loss/crossentropy": 2.750105857849121, "loss/dist_ce": 0.0, "loss/fcd": 1.0859375, "loss/idx": 13.0, "loss/logits": 0.17385736107826233, "step": 1210 }, { "epoch": 0.018082723607585487, "grad_norm": 0.57421875, "grad_norm_var": 0.0021178563435872394, "learning_rate": 2e-05, "loss": 1.268, "loss/crossentropy": 2.3910109996795654, "loss/dist_ce": 0.0, "loss/fcd": 1.1015625, "loss/idx": 13.0, "loss/logits": 0.16642415523529053, "step": 1211 }, { "epoch": 0.01809765566671644, "grad_norm": 0.578125, "grad_norm_var": 0.0021077473958333332, "learning_rate": 2e-05, "loss": 1.2759, "loss/crossentropy": 2.6208412647247314, "loss/dist_ce": 0.0, "loss/fcd": 1.1015625, "loss/idx": 13.0, "loss/logits": 0.17433887720108032, "step": 1212 }, { "epoch": 0.018112587725847396, "grad_norm": 0.5234375, "grad_norm_var": 0.0021420796712239582, "learning_rate": 2e-05, "loss": 1.2242, "loss/crossentropy": 2.5190320014953613, "loss/dist_ce": 0.0, "loss/fcd": 1.0546875, "loss/idx": 13.0, "loss/logits": 0.16950549185276031, "step": 1213 }, { "epoch": 0.018127519784978348, "grad_norm": 0.54296875, "grad_norm_var": 0.0020078023274739585, "learning_rate": 2e-05, "loss": 1.2889, "loss/crossentropy": 2.4932422637939453, "loss/dist_ce": 0.0, "loss/fcd": 1.109375, "loss/idx": 13.0, "loss/logits": 0.17951592803001404, "step": 1214 }, { "epoch": 0.018142451844109304, "grad_norm": 0.51171875, "grad_norm_var": 0.0019286473592122395, "learning_rate": 2e-05, "loss": 1.1917, "loss/crossentropy": 2.5093982219696045, "loss/dist_ce": 0.0, "loss/fcd": 1.03125, "loss/idx": 13.0, "loss/logits": 0.16048479080200195, "step": 1215 }, { "epoch": 0.018157383903240256, "grad_norm": 0.55859375, "grad_norm_var": 0.0018694559733072917, "learning_rate": 2e-05, "loss": 1.3799, "loss/crossentropy": 2.4472479820251465, "loss/dist_ce": 0.0, "loss/fcd": 1.1640625, "loss/idx": 13.0, "loss/logits": 0.21582560241222382, "step": 1216 }, { "epoch": 0.018172315962371212, "grad_norm": 0.5, "grad_norm_var": 0.002057329813639323, "learning_rate": 2e-05, "loss": 1.234, "loss/crossentropy": 2.574943780899048, "loss/dist_ce": 0.0, "loss/fcd": 1.0546875, "loss/idx": 13.0, "loss/logits": 0.17926263809204102, "step": 1217 }, { "epoch": 0.018187248021502164, "grad_norm": 0.58984375, "grad_norm_var": 0.002147865295410156, "learning_rate": 2e-05, "loss": 1.2759, "loss/crossentropy": 2.5302133560180664, "loss/dist_ce": 0.0, "loss/fcd": 1.109375, "loss/idx": 13.0, "loss/logits": 0.16648699343204498, "step": 1218 }, { "epoch": 0.01820218008063312, "grad_norm": 0.5703125, "grad_norm_var": 0.002113596598307292, "learning_rate": 2e-05, "loss": 1.2878, "loss/crossentropy": 2.620727777481079, "loss/dist_ce": 0.0, "loss/fcd": 1.109375, "loss/idx": 13.0, "loss/logits": 0.1784372329711914, "step": 1219 }, { "epoch": 0.018217112139764072, "grad_norm": 0.50390625, "grad_norm_var": 0.0021397272745768228, "learning_rate": 2e-05, "loss": 1.2389, "loss/crossentropy": 2.8881278038024902, "loss/dist_ce": 0.0, "loss/fcd": 1.0625, "loss/idx": 13.0, "loss/logits": 0.1763637661933899, "step": 1220 }, { "epoch": 0.018232044198895028, "grad_norm": 0.5234375, "grad_norm_var": 0.001923052469889323, "learning_rate": 2e-05, "loss": 1.1603, "loss/crossentropy": 2.626577854156494, "loss/dist_ce": 0.0, "loss/fcd": 1.0078125, "loss/idx": 13.0, "loss/logits": 0.15248607099056244, "step": 1221 }, { "epoch": 0.01824697625802598, "grad_norm": 0.51953125, "grad_norm_var": 0.0018656412760416667, "learning_rate": 2e-05, "loss": 1.3248, "loss/crossentropy": 2.4721579551696777, "loss/dist_ce": 0.0, "loss/fcd": 1.1328125, "loss/idx": 13.0, "loss/logits": 0.19197949767112732, "step": 1222 }, { "epoch": 0.018261908317156936, "grad_norm": 0.59765625, "grad_norm_var": 0.0013120015462239583, "learning_rate": 2e-05, "loss": 1.2957, "loss/crossentropy": 2.1448283195495605, "loss/dist_ce": 0.0, "loss/fcd": 1.109375, "loss/idx": 13.0, "loss/logits": 0.1863187849521637, "step": 1223 }, { "epoch": 0.01827684037628789, "grad_norm": 0.74609375, "grad_norm_var": 0.003631337483723958, "learning_rate": 2e-05, "loss": 1.3701, "loss/crossentropy": 2.505596399307251, "loss/dist_ce": 0.0, "loss/fcd": 1.1875, "loss/idx": 13.0, "loss/logits": 0.1826344132423401, "step": 1224 }, { "epoch": 0.018291772435418845, "grad_norm": 0.54296875, "grad_norm_var": 0.003459676106770833, "learning_rate": 2e-05, "loss": 1.191, "loss/crossentropy": 2.5166234970092773, "loss/dist_ce": 0.0, "loss/fcd": 1.03125, "loss/idx": 13.0, "loss/logits": 0.15974509716033936, "step": 1225 }, { "epoch": 0.018306704494549797, "grad_norm": 0.5234375, "grad_norm_var": 0.003513590494791667, "learning_rate": 2e-05, "loss": 1.2205, "loss/crossentropy": 2.7810256481170654, "loss/dist_ce": 0.0, "loss/fcd": 1.0546875, "loss/idx": 13.0, "loss/logits": 0.1658242642879486, "step": 1226 }, { "epoch": 0.018321636553680753, "grad_norm": 0.5, "grad_norm_var": 0.003683916727701823, "learning_rate": 2e-05, "loss": 1.2462, "loss/crossentropy": 2.5059738159179688, "loss/dist_ce": 0.0, "loss/fcd": 1.0703125, "loss/idx": 13.0, "loss/logits": 0.17585018277168274, "step": 1227 }, { "epoch": 0.018336568612811705, "grad_norm": 0.59375, "grad_norm_var": 0.0037535985310872396, "learning_rate": 2e-05, "loss": 1.4086, "loss/crossentropy": 2.800818681716919, "loss/dist_ce": 0.0, "loss/fcd": 1.1875, "loss/idx": 13.0, "loss/logits": 0.22110095620155334, "step": 1228 }, { "epoch": 0.01835150067194266, "grad_norm": 0.50390625, "grad_norm_var": 0.0038543701171875, "learning_rate": 2e-05, "loss": 1.1752, "loss/crossentropy": 2.5914788246154785, "loss/dist_ce": 0.0, "loss/fcd": 1.0234375, "loss/idx": 13.0, "loss/logits": 0.1517515331506729, "step": 1229 }, { "epoch": 0.018366432731073613, "grad_norm": 0.49609375, "grad_norm_var": 0.004046630859375, "learning_rate": 2e-05, "loss": 1.2081, "loss/crossentropy": 2.6223032474517822, "loss/dist_ce": 0.0, "loss/fcd": 1.0546875, "loss/idx": 13.0, "loss/logits": 0.153452530503273, "step": 1230 }, { "epoch": 0.01838136479020457, "grad_norm": 0.7109375, "grad_norm_var": 0.005541419982910157, "learning_rate": 2e-05, "loss": 1.4043, "loss/crossentropy": 2.614365339279175, "loss/dist_ce": 0.0, "loss/fcd": 1.1953125, "loss/idx": 13.0, "loss/logits": 0.2089834213256836, "step": 1231 }, { "epoch": 0.018396296849335525, "grad_norm": 0.53125, "grad_norm_var": 0.005597941080729167, "learning_rate": 2e-05, "loss": 1.2692, "loss/crossentropy": 2.5265705585479736, "loss/dist_ce": 0.0, "loss/fcd": 1.09375, "loss/idx": 13.0, "loss/logits": 0.17544037103652954, "step": 1232 }, { "epoch": 0.018411228908466477, "grad_norm": 0.63671875, "grad_norm_var": 0.005680274963378906, "learning_rate": 2e-05, "loss": 1.4872, "loss/crossentropy": 2.4311468601226807, "loss/dist_ce": 0.0, "loss/fcd": 1.2265625, "loss/idx": 13.0, "loss/logits": 0.2606537342071533, "step": 1233 }, { "epoch": 0.018426160967597433, "grad_norm": 0.65234375, "grad_norm_var": 0.006105486551920573, "learning_rate": 2e-05, "loss": 1.47, "loss/crossentropy": 2.592331647872925, "loss/dist_ce": 0.0, "loss/fcd": 1.234375, "loss/idx": 13.0, "loss/logits": 0.23566170036792755, "step": 1234 }, { "epoch": 0.018441093026728386, "grad_norm": 0.5390625, "grad_norm_var": 0.00617364247639974, "learning_rate": 2e-05, "loss": 1.3208, "loss/crossentropy": 2.7657077312469482, "loss/dist_ce": 0.0, "loss/fcd": 1.125, "loss/idx": 13.0, "loss/logits": 0.19580461084842682, "step": 1235 }, { "epoch": 0.01845602508585934, "grad_norm": 0.52734375, "grad_norm_var": 0.00600121815999349, "learning_rate": 2e-05, "loss": 1.2421, "loss/crossentropy": 2.8439533710479736, "loss/dist_ce": 0.0, "loss/fcd": 1.0546875, "loss/idx": 13.0, "loss/logits": 0.18744774162769318, "step": 1236 }, { "epoch": 0.018470957144990294, "grad_norm": 0.64453125, "grad_norm_var": 0.006141153971354166, "learning_rate": 2e-05, "loss": 1.2085, "loss/crossentropy": 3.1795899868011475, "loss/dist_ce": 0.0, "loss/fcd": 1.0625, "loss/idx": 13.0, "loss/logits": 0.14597997069358826, "step": 1237 }, { "epoch": 0.01848588920412125, "grad_norm": 0.53515625, "grad_norm_var": 0.006032307942708333, "learning_rate": 2e-05, "loss": 1.2747, "loss/crossentropy": 2.7262730598449707, "loss/dist_ce": 0.0, "loss/fcd": 1.0859375, "loss/idx": 13.0, "loss/logits": 0.1887180060148239, "step": 1238 }, { "epoch": 0.018500821263252202, "grad_norm": 0.546875, "grad_norm_var": 0.00607446034749349, "learning_rate": 2e-05, "loss": 1.3052, "loss/crossentropy": 2.5821917057037354, "loss/dist_ce": 0.0, "loss/fcd": 1.1171875, "loss/idx": 13.0, "loss/logits": 0.18802396953105927, "step": 1239 }, { "epoch": 0.018515753322383158, "grad_norm": 0.57421875, "grad_norm_var": 0.00404351552327474, "learning_rate": 2e-05, "loss": 1.3231, "loss/crossentropy": 2.5506041049957275, "loss/dist_ce": 0.0, "loss/fcd": 1.1484375, "loss/idx": 13.0, "loss/logits": 0.17467817664146423, "step": 1240 }, { "epoch": 0.01853068538151411, "grad_norm": 0.5546875, "grad_norm_var": 0.004015858968098958, "learning_rate": 2e-05, "loss": 1.2676, "loss/crossentropy": 2.6439409255981445, "loss/dist_ce": 0.0, "loss/fcd": 1.1015625, "loss/idx": 13.0, "loss/logits": 0.16604889929294586, "step": 1241 }, { "epoch": 0.018545617440645066, "grad_norm": 0.58984375, "grad_norm_var": 0.0039066950480143225, "learning_rate": 2e-05, "loss": 1.384, "loss/crossentropy": 2.2677359580993652, "loss/dist_ce": 0.0, "loss/fcd": 1.1953125, "loss/idx": 13.0, "loss/logits": 0.1886504888534546, "step": 1242 }, { "epoch": 0.01856054949977602, "grad_norm": 0.5078125, "grad_norm_var": 0.003836504618326823, "learning_rate": 2e-05, "loss": 1.2263, "loss/crossentropy": 2.5122861862182617, "loss/dist_ce": 0.0, "loss/fcd": 1.0625, "loss/idx": 13.0, "loss/logits": 0.16376549005508423, "step": 1243 }, { "epoch": 0.018575481558906974, "grad_norm": 0.53125, "grad_norm_var": 0.0038955052693684894, "learning_rate": 2e-05, "loss": 1.3596, "loss/crossentropy": 2.6457715034484863, "loss/dist_ce": 0.0, "loss/fcd": 1.140625, "loss/idx": 13.0, "loss/logits": 0.21896395087242126, "step": 1244 }, { "epoch": 0.018590413618037926, "grad_norm": 0.5234375, "grad_norm_var": 0.003753407796223958, "learning_rate": 2e-05, "loss": 1.1308, "loss/crossentropy": 2.356555700302124, "loss/dist_ce": 0.0, "loss/fcd": 0.98828125, "loss/idx": 13.0, "loss/logits": 0.14256852865219116, "step": 1245 }, { "epoch": 0.018605345677168882, "grad_norm": 0.57421875, "grad_norm_var": 0.0033770243326822916, "learning_rate": 2e-05, "loss": 1.2262, "loss/crossentropy": 2.6997029781341553, "loss/dist_ce": 0.0, "loss/fcd": 1.0546875, "loss/idx": 13.0, "loss/logits": 0.1715344786643982, "step": 1246 }, { "epoch": 0.018620277736299835, "grad_norm": 0.55859375, "grad_norm_var": 0.0020405451456705728, "learning_rate": 2e-05, "loss": 1.3046, "loss/crossentropy": 2.6006901264190674, "loss/dist_ce": 0.0, "loss/fcd": 1.125, "loss/idx": 13.0, "loss/logits": 0.1796230673789978, "step": 1247 }, { "epoch": 0.01863520979543079, "grad_norm": 0.5703125, "grad_norm_var": 0.0019642512003580728, "learning_rate": 2e-05, "loss": 1.2811, "loss/crossentropy": 2.6090171337127686, "loss/dist_ce": 0.0, "loss/fcd": 1.109375, "loss/idx": 13.0, "loss/logits": 0.1717006117105484, "step": 1248 }, { "epoch": 0.018650141854561743, "grad_norm": 0.50390625, "grad_norm_var": 0.0018259048461914062, "learning_rate": 2e-05, "loss": 1.2577, "loss/crossentropy": 2.1411545276641846, "loss/dist_ce": 0.0, "loss/fcd": 1.09375, "loss/idx": 13.0, "loss/logits": 0.16393724083900452, "step": 1249 }, { "epoch": 0.0186650739136927, "grad_norm": 0.5390625, "grad_norm_var": 0.0012082417805989584, "learning_rate": 2e-05, "loss": 1.3501, "loss/crossentropy": 2.4617106914520264, "loss/dist_ce": 0.0, "loss/fcd": 1.1640625, "loss/idx": 13.0, "loss/logits": 0.18599101901054382, "step": 1250 }, { "epoch": 0.01868000597282365, "grad_norm": 0.546875, "grad_norm_var": 0.0011993408203125, "learning_rate": 2e-05, "loss": 1.2825, "loss/crossentropy": 2.587517261505127, "loss/dist_ce": 0.0, "loss/fcd": 1.09375, "loss/idx": 13.0, "loss/logits": 0.18875601887702942, "step": 1251 }, { "epoch": 0.018694938031954607, "grad_norm": 0.6171875, "grad_norm_var": 0.0014113744099934896, "learning_rate": 2e-05, "loss": 1.2102, "loss/crossentropy": 2.5046262741088867, "loss/dist_ce": 0.0, "loss/fcd": 1.0546875, "loss/idx": 13.0, "loss/logits": 0.15552225708961487, "step": 1252 }, { "epoch": 0.01870987009108556, "grad_norm": 0.6015625, "grad_norm_var": 0.0010274251302083333, "learning_rate": 2e-05, "loss": 1.258, "loss/crossentropy": 2.6143996715545654, "loss/dist_ce": 0.0, "loss/fcd": 1.0859375, "loss/idx": 13.0, "loss/logits": 0.17202343046665192, "step": 1253 }, { "epoch": 0.018724802150216515, "grad_norm": 0.5625, "grad_norm_var": 0.001002947489420573, "learning_rate": 2e-05, "loss": 1.2749, "loss/crossentropy": 2.6341636180877686, "loss/dist_ce": 0.0, "loss/fcd": 1.1015625, "loss/idx": 13.0, "loss/logits": 0.1733606904745102, "step": 1254 }, { "epoch": 0.018739734209347467, "grad_norm": 0.625, "grad_norm_var": 0.001285235087076823, "learning_rate": 2e-05, "loss": 1.2895, "loss/crossentropy": 2.7565486431121826, "loss/dist_ce": 0.0, "loss/fcd": 1.1015625, "loss/idx": 13.0, "loss/logits": 0.1879180669784546, "step": 1255 }, { "epoch": 0.018754666268478423, "grad_norm": 0.490234375, "grad_norm_var": 0.0015811761220296224, "learning_rate": 2e-05, "loss": 1.1834, "loss/crossentropy": 2.7358415126800537, "loss/dist_ce": 0.0, "loss/fcd": 1.015625, "loss/idx": 13.0, "loss/logits": 0.1677880734205246, "step": 1256 }, { "epoch": 0.01876959832760938, "grad_norm": 0.55078125, "grad_norm_var": 0.0015828291575113933, "learning_rate": 2e-05, "loss": 1.2657, "loss/crossentropy": 2.661904811859131, "loss/dist_ce": 0.0, "loss/fcd": 1.0859375, "loss/idx": 13.0, "loss/logits": 0.1798032820224762, "step": 1257 }, { "epoch": 0.01878453038674033, "grad_norm": 0.56640625, "grad_norm_var": 0.0015107313791910806, "learning_rate": 2e-05, "loss": 1.2148, "loss/crossentropy": 2.6133949756622314, "loss/dist_ce": 0.0, "loss/fcd": 1.0625, "loss/idx": 13.0, "loss/logits": 0.1523081362247467, "step": 1258 }, { "epoch": 0.018799462445871287, "grad_norm": 0.59375, "grad_norm_var": 0.0014393965403238932, "learning_rate": 2e-05, "loss": 1.2584, "loss/crossentropy": 2.6154913902282715, "loss/dist_ce": 0.0, "loss/fcd": 1.0703125, "loss/idx": 13.0, "loss/logits": 0.18810322880744934, "step": 1259 }, { "epoch": 0.01881439450500224, "grad_norm": 0.578125, "grad_norm_var": 0.0013989607493082683, "learning_rate": 2e-05, "loss": 1.3253, "loss/crossentropy": 2.4987523555755615, "loss/dist_ce": 0.0, "loss/fcd": 1.1328125, "loss/idx": 13.0, "loss/logits": 0.19253703951835632, "step": 1260 }, { "epoch": 0.018829326564133195, "grad_norm": 0.5546875, "grad_norm_var": 0.0012967268625895183, "learning_rate": 2e-05, "loss": 1.1954, "loss/crossentropy": 2.6275758743286133, "loss/dist_ce": 0.0, "loss/fcd": 1.0390625, "loss/idx": 13.0, "loss/logits": 0.1563832014799118, "step": 1261 }, { "epoch": 0.018844258623264148, "grad_norm": 0.56640625, "grad_norm_var": 0.0012904961903889975, "learning_rate": 2e-05, "loss": 1.1805, "loss/crossentropy": 2.654968738555908, "loss/dist_ce": 0.0, "loss/fcd": 1.0234375, "loss/idx": 13.0, "loss/logits": 0.1571100354194641, "step": 1262 }, { "epoch": 0.018859190682395104, "grad_norm": 0.56640625, "grad_norm_var": 0.001288588841756185, "learning_rate": 2e-05, "loss": 1.2607, "loss/crossentropy": 2.5465691089630127, "loss/dist_ce": 0.0, "loss/fcd": 1.0859375, "loss/idx": 13.0, "loss/logits": 0.17477178573608398, "step": 1263 }, { "epoch": 0.018874122741526056, "grad_norm": 0.48046875, "grad_norm_var": 0.0017243544260660806, "learning_rate": 2e-05, "loss": 1.2487, "loss/crossentropy": 2.625317096710205, "loss/dist_ce": 0.0, "loss/fcd": 1.078125, "loss/idx": 13.0, "loss/logits": 0.17062163352966309, "step": 1264 }, { "epoch": 0.018889054800657012, "grad_norm": 0.53515625, "grad_norm_var": 0.0015559991200764973, "learning_rate": 2e-05, "loss": 1.2303, "loss/crossentropy": 2.710569143295288, "loss/dist_ce": 0.0, "loss/fcd": 1.0625, "loss/idx": 13.0, "loss/logits": 0.16783851385116577, "step": 1265 }, { "epoch": 0.018903986859787964, "grad_norm": 0.5703125, "grad_norm_var": 0.0015259901682535806, "learning_rate": 2e-05, "loss": 1.2482, "loss/crossentropy": 2.6588263511657715, "loss/dist_ce": 0.0, "loss/fcd": 1.0703125, "loss/idx": 13.0, "loss/logits": 0.1778830587863922, "step": 1266 }, { "epoch": 0.01891891891891892, "grad_norm": 0.5234375, "grad_norm_var": 0.0016102949778238932, "learning_rate": 2e-05, "loss": 1.2365, "loss/crossentropy": 2.613532304763794, "loss/dist_ce": 0.0, "loss/fcd": 1.0625, "loss/idx": 13.0, "loss/logits": 0.1740056276321411, "step": 1267 }, { "epoch": 0.018933850978049872, "grad_norm": 0.515625, "grad_norm_var": 0.0014995416005452475, "learning_rate": 2e-05, "loss": 1.2785, "loss/crossentropy": 2.453362464904785, "loss/dist_ce": 0.0, "loss/fcd": 1.1015625, "loss/idx": 13.0, "loss/logits": 0.17693351209163666, "step": 1268 }, { "epoch": 0.018948783037180828, "grad_norm": 0.53125, "grad_norm_var": 0.001372512181599935, "learning_rate": 2e-05, "loss": 1.3397, "loss/crossentropy": 2.748429775238037, "loss/dist_ce": 0.0, "loss/fcd": 1.1328125, "loss/idx": 13.0, "loss/logits": 0.20684918761253357, "step": 1269 }, { "epoch": 0.01896371509631178, "grad_norm": 0.53125, "grad_norm_var": 0.0013842105865478516, "learning_rate": 2e-05, "loss": 1.26, "loss/crossentropy": 2.556675672531128, "loss/dist_ce": 0.0, "loss/fcd": 1.0859375, "loss/idx": 13.0, "loss/logits": 0.17401845753192902, "step": 1270 }, { "epoch": 0.018978647155442736, "grad_norm": 0.5, "grad_norm_var": 0.0010892073313395181, "learning_rate": 2e-05, "loss": 1.2804, "loss/crossentropy": 2.5527727603912354, "loss/dist_ce": 0.0, "loss/fcd": 1.1015625, "loss/idx": 13.0, "loss/logits": 0.17879468202590942, "step": 1271 }, { "epoch": 0.01899357921457369, "grad_norm": 0.88671875, "grad_norm_var": 0.008236122131347657, "learning_rate": 2e-05, "loss": 1.4474, "loss/crossentropy": 2.2951905727386475, "loss/dist_ce": 0.0, "loss/fcd": 1.234375, "loss/idx": 13.0, "loss/logits": 0.21297743916511536, "step": 1272 }, { "epoch": 0.019008511273704645, "grad_norm": 0.52734375, "grad_norm_var": 0.008316993713378906, "learning_rate": 2e-05, "loss": 1.2259, "loss/crossentropy": 2.5471127033233643, "loss/dist_ce": 0.0, "loss/fcd": 1.0546875, "loss/idx": 13.0, "loss/logits": 0.1712278127670288, "step": 1273 }, { "epoch": 0.019023443332835597, "grad_norm": 0.5234375, "grad_norm_var": 0.0084197998046875, "learning_rate": 2e-05, "loss": 1.3398, "loss/crossentropy": 2.5242245197296143, "loss/dist_ce": 0.0, "loss/fcd": 1.1328125, "loss/idx": 13.0, "loss/logits": 0.20698873698711395, "step": 1274 }, { "epoch": 0.019038375391966553, "grad_norm": 0.466796875, "grad_norm_var": 0.008881616592407226, "learning_rate": 2e-05, "loss": 1.1402, "loss/crossentropy": 2.6311800479888916, "loss/dist_ce": 0.0, "loss/fcd": 0.99609375, "loss/idx": 13.0, "loss/logits": 0.14407965540885925, "step": 1275 }, { "epoch": 0.019053307451097505, "grad_norm": 0.5625, "grad_norm_var": 0.008845758438110352, "learning_rate": 2e-05, "loss": 1.3024, "loss/crossentropy": 2.553720712661743, "loss/dist_ce": 0.0, "loss/fcd": 1.109375, "loss/idx": 13.0, "loss/logits": 0.19305121898651123, "step": 1276 }, { "epoch": 0.01906823951022846, "grad_norm": 0.5390625, "grad_norm_var": 0.008856693903605143, "learning_rate": 2e-05, "loss": 1.2891, "loss/crossentropy": 2.5351758003234863, "loss/dist_ce": 0.0, "loss/fcd": 1.1015625, "loss/idx": 13.0, "loss/logits": 0.1875266134738922, "step": 1277 }, { "epoch": 0.019083171569359413, "grad_norm": 0.56640625, "grad_norm_var": 0.008856693903605143, "learning_rate": 2e-05, "loss": 1.3024, "loss/crossentropy": 2.4513752460479736, "loss/dist_ce": 0.0, "loss/fcd": 1.125, "loss/idx": 13.0, "loss/logits": 0.17735695838928223, "step": 1278 }, { "epoch": 0.01909810362849037, "grad_norm": 0.53515625, "grad_norm_var": 0.00885618527730306, "learning_rate": 2e-05, "loss": 1.2502, "loss/crossentropy": 2.5539772510528564, "loss/dist_ce": 0.0, "loss/fcd": 1.078125, "loss/idx": 13.0, "loss/logits": 0.17203769087791443, "step": 1279 }, { "epoch": 0.01911303568762132, "grad_norm": 0.56640625, "grad_norm_var": 0.008524688084920247, "learning_rate": 2e-05, "loss": 1.2212, "loss/crossentropy": 2.49832820892334, "loss/dist_ce": 0.0, "loss/fcd": 1.046875, "loss/idx": 13.0, "loss/logits": 0.17433378100395203, "step": 1280 }, { "epoch": 0.019127967746752277, "grad_norm": 0.515625, "grad_norm_var": 0.008600346247355143, "learning_rate": 2e-05, "loss": 1.2678, "loss/crossentropy": 2.638371229171753, "loss/dist_ce": 0.0, "loss/fcd": 1.09375, "loss/idx": 13.0, "loss/logits": 0.17406100034713745, "step": 1281 }, { "epoch": 0.01914289980588323, "grad_norm": 0.734375, "grad_norm_var": 0.01064311663309733, "learning_rate": 2e-05, "loss": 1.4935, "loss/crossentropy": 2.581265687942505, "loss/dist_ce": 0.0, "loss/fcd": 1.28125, "loss/idx": 13.0, "loss/logits": 0.21223483979701996, "step": 1282 }, { "epoch": 0.019157831865014185, "grad_norm": 0.5625, "grad_norm_var": 0.010526768366495768, "learning_rate": 2e-05, "loss": 1.3831, "loss/crossentropy": 2.5751612186431885, "loss/dist_ce": 0.0, "loss/fcd": 1.171875, "loss/idx": 13.0, "loss/logits": 0.21123623847961426, "step": 1283 }, { "epoch": 0.01917276392414514, "grad_norm": 0.462890625, "grad_norm_var": 0.011058489481608072, "learning_rate": 2e-05, "loss": 1.1104, "loss/crossentropy": 2.5615251064300537, "loss/dist_ce": 0.0, "loss/fcd": 0.9609375, "loss/idx": 13.0, "loss/logits": 0.14948110282421112, "step": 1284 }, { "epoch": 0.019187695983276094, "grad_norm": 0.53125, "grad_norm_var": 0.011058489481608072, "learning_rate": 2e-05, "loss": 1.2032, "loss/crossentropy": 2.5952653884887695, "loss/dist_ce": 0.0, "loss/fcd": 1.0390625, "loss/idx": 13.0, "loss/logits": 0.1641014665365219, "step": 1285 }, { "epoch": 0.01920262804240705, "grad_norm": 0.5234375, "grad_norm_var": 0.011095619201660157, "learning_rate": 2e-05, "loss": 1.2654, "loss/crossentropy": 2.237917423248291, "loss/dist_ce": 0.0, "loss/fcd": 1.0703125, "loss/idx": 13.0, "loss/logits": 0.19509923458099365, "step": 1286 }, { "epoch": 0.019217560101538002, "grad_norm": 0.5390625, "grad_norm_var": 0.010864194234212239, "learning_rate": 2e-05, "loss": 1.2028, "loss/crossentropy": 2.4912095069885254, "loss/dist_ce": 0.0, "loss/fcd": 1.046875, "loss/idx": 13.0, "loss/logits": 0.15591605007648468, "step": 1287 }, { "epoch": 0.019232492160668958, "grad_norm": 0.53515625, "grad_norm_var": 0.0035170873006184896, "learning_rate": 2e-05, "loss": 1.2284, "loss/crossentropy": 2.849947214126587, "loss/dist_ce": 0.0, "loss/fcd": 1.0546875, "loss/idx": 13.0, "loss/logits": 0.17369095981121063, "step": 1288 }, { "epoch": 0.01924742421979991, "grad_norm": 0.5546875, "grad_norm_var": 0.003505961100260417, "learning_rate": 2e-05, "loss": 1.3009, "loss/crossentropy": 2.718153953552246, "loss/dist_ce": 0.0, "loss/fcd": 1.125, "loss/idx": 13.0, "loss/logits": 0.17590981721878052, "step": 1289 }, { "epoch": 0.019262356278930866, "grad_norm": 0.52734375, "grad_norm_var": 0.0034957249959309896, "learning_rate": 2e-05, "loss": 1.2431, "loss/crossentropy": 2.539076805114746, "loss/dist_ce": 0.0, "loss/fcd": 1.0703125, "loss/idx": 13.0, "loss/logits": 0.17277245223522186, "step": 1290 }, { "epoch": 0.019277288338061818, "grad_norm": 0.578125, "grad_norm_var": 0.003107055028279622, "learning_rate": 2e-05, "loss": 1.3905, "loss/crossentropy": 2.501863479614258, "loss/dist_ce": 0.0, "loss/fcd": 1.1796875, "loss/idx": 13.0, "loss/logits": 0.21077433228492737, "step": 1291 }, { "epoch": 0.019292220397192774, "grad_norm": 0.70703125, "grad_norm_var": 0.004612588882446289, "learning_rate": 2e-05, "loss": 1.5402, "loss/crossentropy": 2.5748298168182373, "loss/dist_ce": 0.0, "loss/fcd": 1.28125, "loss/idx": 13.0, "loss/logits": 0.2589094042778015, "step": 1292 }, { "epoch": 0.019307152456323726, "grad_norm": 1.046875, "grad_norm_var": 0.01923368771870931, "learning_rate": 2e-05, "loss": 1.4531, "loss/crossentropy": 2.400190830230713, "loss/dist_ce": 0.0, "loss/fcd": 1.25, "loss/idx": 13.0, "loss/logits": 0.2031039446592331, "step": 1293 }, { "epoch": 0.019322084515454682, "grad_norm": 0.486328125, "grad_norm_var": 0.01991729736328125, "learning_rate": 2e-05, "loss": 1.1589, "loss/crossentropy": 2.452254056930542, "loss/dist_ce": 0.0, "loss/fcd": 1.0078125, "loss/idx": 13.0, "loss/logits": 0.15112650394439697, "step": 1294 }, { "epoch": 0.019337016574585635, "grad_norm": 0.484375, "grad_norm_var": 0.020435523986816407, "learning_rate": 2e-05, "loss": 1.2109, "loss/crossentropy": 2.6677145957946777, "loss/dist_ce": 0.0, "loss/fcd": 1.046875, "loss/idx": 13.0, "loss/logits": 0.1640741527080536, "step": 1295 }, { "epoch": 0.01935194863371659, "grad_norm": 0.59765625, "grad_norm_var": 0.020420265197753907, "learning_rate": 2e-05, "loss": 1.3198, "loss/crossentropy": 2.658365488052368, "loss/dist_ce": 0.0, "loss/fcd": 1.1171875, "loss/idx": 13.0, "loss/logits": 0.202656090259552, "step": 1296 }, { "epoch": 0.019366880692847543, "grad_norm": 0.609375, "grad_norm_var": 0.020081520080566406, "learning_rate": 2e-05, "loss": 1.3752, "loss/crossentropy": 2.398197650909424, "loss/dist_ce": 0.0, "loss/fcd": 1.15625, "loss/idx": 13.0, "loss/logits": 0.21891216933727264, "step": 1297 }, { "epoch": 0.0193818127519785, "grad_norm": 0.51953125, "grad_norm_var": 0.018903096516927082, "learning_rate": 2e-05, "loss": 1.2435, "loss/crossentropy": 2.6253135204315186, "loss/dist_ce": 0.0, "loss/fcd": 1.0703125, "loss/idx": 13.0, "loss/logits": 0.17318454384803772, "step": 1298 }, { "epoch": 0.01939674481110945, "grad_norm": 0.51953125, "grad_norm_var": 0.019113604227701822, "learning_rate": 2e-05, "loss": 1.2117, "loss/crossentropy": 2.595719575881958, "loss/dist_ce": 0.0, "loss/fcd": 1.046875, "loss/idx": 13.0, "loss/logits": 0.1647767871618271, "step": 1299 }, { "epoch": 0.019411676870240407, "grad_norm": 0.56640625, "grad_norm_var": 0.018216435114542642, "learning_rate": 2e-05, "loss": 1.2591, "loss/crossentropy": 2.4858176708221436, "loss/dist_ce": 0.0, "loss/fcd": 1.078125, "loss/idx": 13.0, "loss/logits": 0.1809636652469635, "step": 1300 }, { "epoch": 0.01942660892937136, "grad_norm": 0.55859375, "grad_norm_var": 0.018074909845987957, "learning_rate": 2e-05, "loss": 1.2194, "loss/crossentropy": 2.464454174041748, "loss/dist_ce": 0.0, "loss/fcd": 1.0546875, "loss/idx": 13.0, "loss/logits": 0.16469821333885193, "step": 1301 }, { "epoch": 0.019441540988502315, "grad_norm": 0.49609375, "grad_norm_var": 0.018344608942667644, "learning_rate": 2e-05, "loss": 1.1929, "loss/crossentropy": 2.5054361820220947, "loss/dist_ce": 0.0, "loss/fcd": 1.03125, "loss/idx": 13.0, "loss/logits": 0.16160087287425995, "step": 1302 }, { "epoch": 0.019456473047633267, "grad_norm": 0.53515625, "grad_norm_var": 0.01836838722229004, "learning_rate": 2e-05, "loss": 1.1693, "loss/crossentropy": 2.579584836959839, "loss/dist_ce": 0.0, "loss/fcd": 1.015625, "loss/idx": 13.0, "loss/logits": 0.15363293886184692, "step": 1303 }, { "epoch": 0.019471405106764223, "grad_norm": 0.53515625, "grad_norm_var": 0.01836838722229004, "learning_rate": 2e-05, "loss": 1.2129, "loss/crossentropy": 2.4479448795318604, "loss/dist_ce": 0.0, "loss/fcd": 1.046875, "loss/idx": 13.0, "loss/logits": 0.16602320969104767, "step": 1304 }, { "epoch": 0.019486337165895175, "grad_norm": 0.5703125, "grad_norm_var": 0.018325408299763996, "learning_rate": 2e-05, "loss": 1.334, "loss/crossentropy": 2.685575485229492, "loss/dist_ce": 0.0, "loss/fcd": 1.1484375, "loss/idx": 13.0, "loss/logits": 0.18556523323059082, "step": 1305 }, { "epoch": 0.01950126922502613, "grad_norm": 0.5625, "grad_norm_var": 0.018138869603474935, "learning_rate": 2e-05, "loss": 1.3211, "loss/crossentropy": 2.4253220558166504, "loss/dist_ce": 0.0, "loss/fcd": 1.1328125, "loss/idx": 13.0, "loss/logits": 0.18828678131103516, "step": 1306 }, { "epoch": 0.019516201284157084, "grad_norm": 0.55078125, "grad_norm_var": 0.018213637669881187, "learning_rate": 2e-05, "loss": 1.2963, "loss/crossentropy": 2.4698915481567383, "loss/dist_ce": 0.0, "loss/fcd": 1.1015625, "loss/idx": 13.0, "loss/logits": 0.1947701871395111, "step": 1307 }, { "epoch": 0.01953113334328804, "grad_norm": 0.578125, "grad_norm_var": 0.017139418919881185, "learning_rate": 2e-05, "loss": 1.3248, "loss/crossentropy": 2.5888915061950684, "loss/dist_ce": 0.0, "loss/fcd": 1.140625, "loss/idx": 13.0, "loss/logits": 0.18418516218662262, "step": 1308 }, { "epoch": 0.019546065402418992, "grad_norm": 0.5, "grad_norm_var": 0.0015004316965738932, "learning_rate": 2e-05, "loss": 1.1915, "loss/crossentropy": 2.698927640914917, "loss/dist_ce": 0.0, "loss/fcd": 1.03125, "loss/idx": 13.0, "loss/logits": 0.16026300191879272, "step": 1309 }, { "epoch": 0.019560997461549948, "grad_norm": 0.498046875, "grad_norm_var": 0.0014222304026285807, "learning_rate": 2e-05, "loss": 1.1648, "loss/crossentropy": 2.4757766723632812, "loss/dist_ce": 0.0, "loss/fcd": 1.015625, "loss/idx": 13.0, "loss/logits": 0.14918801188468933, "step": 1310 }, { "epoch": 0.019575929520680904, "grad_norm": 0.90234375, "grad_norm_var": 0.009095875422159831, "learning_rate": 2e-05, "loss": 1.5092, "loss/crossentropy": 2.956965684890747, "loss/dist_ce": 0.0, "loss/fcd": 1.25, "loss/idx": 13.0, "loss/logits": 0.2591836452484131, "step": 1311 }, { "epoch": 0.019590861579811856, "grad_norm": 0.5, "grad_norm_var": 0.009315220514933269, "learning_rate": 2e-05, "loss": 1.227, "loss/crossentropy": 2.597576379776001, "loss/dist_ce": 0.0, "loss/fcd": 1.0546875, "loss/idx": 13.0, "loss/logits": 0.17235302925109863, "step": 1312 }, { "epoch": 0.01960579363894281, "grad_norm": 0.51953125, "grad_norm_var": 0.009259653091430665, "learning_rate": 2e-05, "loss": 1.2917, "loss/crossentropy": 2.78556752204895, "loss/dist_ce": 0.0, "loss/fcd": 1.1015625, "loss/idx": 13.0, "loss/logits": 0.19013968110084534, "step": 1313 }, { "epoch": 0.019620725698073764, "grad_norm": 0.58203125, "grad_norm_var": 0.009191497166951498, "learning_rate": 2e-05, "loss": 1.3658, "loss/crossentropy": 2.502690553665161, "loss/dist_ce": 0.0, "loss/fcd": 1.171875, "loss/idx": 13.0, "loss/logits": 0.19392375648021698, "step": 1314 }, { "epoch": 0.01963565775720472, "grad_norm": 0.62109375, "grad_norm_var": 0.00927580197652181, "learning_rate": 2e-05, "loss": 1.297, "loss/crossentropy": 2.6288228034973145, "loss/dist_ce": 0.0, "loss/fcd": 1.109375, "loss/idx": 13.0, "loss/logits": 0.18761998414993286, "step": 1315 }, { "epoch": 0.019650589816335672, "grad_norm": 0.875, "grad_norm_var": 0.015192524592081705, "learning_rate": 2e-05, "loss": 1.2782, "loss/crossentropy": 2.738717794418335, "loss/dist_ce": 0.0, "loss/fcd": 1.09375, "loss/idx": 13.0, "loss/logits": 0.1844879537820816, "step": 1316 }, { "epoch": 0.019665521875466628, "grad_norm": 0.5546875, "grad_norm_var": 0.015208037694295247, "learning_rate": 2e-05, "loss": 1.2752, "loss/crossentropy": 2.660844564437866, "loss/dist_ce": 0.0, "loss/fcd": 1.09375, "loss/idx": 13.0, "loss/logits": 0.18141654133796692, "step": 1317 }, { "epoch": 0.01968045393459758, "grad_norm": 0.47265625, "grad_norm_var": 0.01552427609761556, "learning_rate": 2e-05, "loss": 1.204, "loss/crossentropy": 2.632686138153076, "loss/dist_ce": 0.0, "loss/fcd": 1.0390625, "loss/idx": 13.0, "loss/logits": 0.16492034494876862, "step": 1318 }, { "epoch": 0.019695385993728536, "grad_norm": 0.53515625, "grad_norm_var": 0.01552427609761556, "learning_rate": 2e-05, "loss": 1.1667, "loss/crossentropy": 2.642601728439331, "loss/dist_ce": 0.0, "loss/fcd": 1.015625, "loss/idx": 13.0, "loss/logits": 0.1511181890964508, "step": 1319 }, { "epoch": 0.01971031805285949, "grad_norm": 0.4609375, "grad_norm_var": 0.01636020342508952, "learning_rate": 2e-05, "loss": 1.2313, "loss/crossentropy": 2.6074163913726807, "loss/dist_ce": 0.0, "loss/fcd": 1.0546875, "loss/idx": 13.0, "loss/logits": 0.17656564712524414, "step": 1320 }, { "epoch": 0.019725250111990444, "grad_norm": 0.828125, "grad_norm_var": 0.020174519220987955, "learning_rate": 2e-05, "loss": 1.2131, "loss/crossentropy": 2.755542755126953, "loss/dist_ce": 0.0, "loss/fcd": 1.046875, "loss/idx": 13.0, "loss/logits": 0.16619285941123962, "step": 1321 }, { "epoch": 0.019740182171121397, "grad_norm": 0.61328125, "grad_norm_var": 0.020106744766235352, "learning_rate": 2e-05, "loss": 1.3149, "loss/crossentropy": 2.5030322074890137, "loss/dist_ce": 0.0, "loss/fcd": 1.125, "loss/idx": 13.0, "loss/logits": 0.18986953794956207, "step": 1322 }, { "epoch": 0.019755114230252353, "grad_norm": 0.54296875, "grad_norm_var": 0.02016129493713379, "learning_rate": 2e-05, "loss": 1.2952, "loss/crossentropy": 2.4982292652130127, "loss/dist_ce": 0.0, "loss/fcd": 1.109375, "loss/idx": 13.0, "loss/logits": 0.185785174369812, "step": 1323 }, { "epoch": 0.019770046289383305, "grad_norm": 0.56640625, "grad_norm_var": 0.020202493667602538, "learning_rate": 2e-05, "loss": 1.2813, "loss/crossentropy": 2.668501138687134, "loss/dist_ce": 0.0, "loss/fcd": 1.09375, "loss/idx": 13.0, "loss/logits": 0.1875031292438507, "step": 1324 }, { "epoch": 0.01978497834851426, "grad_norm": 0.50390625, "grad_norm_var": 0.02015226682027181, "learning_rate": 2e-05, "loss": 1.2135, "loss/crossentropy": 2.592796564102173, "loss/dist_ce": 0.0, "loss/fcd": 1.0390625, "loss/idx": 13.0, "loss/logits": 0.17439380288124084, "step": 1325 }, { "epoch": 0.019799910407645213, "grad_norm": 0.51953125, "grad_norm_var": 0.019893328348795574, "learning_rate": 2e-05, "loss": 1.3221, "loss/crossentropy": 2.619886636734009, "loss/dist_ce": 0.0, "loss/fcd": 1.125, "loss/idx": 13.0, "loss/logits": 0.19710972905158997, "step": 1326 }, { "epoch": 0.01981484246677617, "grad_norm": 0.6484375, "grad_norm_var": 0.013682047526041666, "learning_rate": 2e-05, "loss": 1.5511, "loss/crossentropy": 2.5196568965911865, "loss/dist_ce": 0.0, "loss/fcd": 1.3046875, "loss/idx": 13.0, "loss/logits": 0.2463717758655548, "step": 1327 }, { "epoch": 0.01982977452590712, "grad_norm": 0.6484375, "grad_norm_var": 0.013396962483723959, "learning_rate": 2e-05, "loss": 1.3578, "loss/crossentropy": 2.68768572807312, "loss/dist_ce": 0.0, "loss/fcd": 1.171875, "loss/idx": 13.0, "loss/logits": 0.1859176903963089, "step": 1328 }, { "epoch": 0.019844706585038077, "grad_norm": 0.46484375, "grad_norm_var": 0.014121500651041667, "learning_rate": 2e-05, "loss": 1.1804, "loss/crossentropy": 2.659552812576294, "loss/dist_ce": 0.0, "loss/fcd": 1.0234375, "loss/idx": 13.0, "loss/logits": 0.15694257616996765, "step": 1329 }, { "epoch": 0.01985963864416903, "grad_norm": 0.470703125, "grad_norm_var": 0.015012089411417644, "learning_rate": 2e-05, "loss": 1.2425, "loss/crossentropy": 2.698822498321533, "loss/dist_ce": 0.0, "loss/fcd": 1.0625, "loss/idx": 13.0, "loss/logits": 0.1799800544977188, "step": 1330 }, { "epoch": 0.019874570703299985, "grad_norm": 0.6015625, "grad_norm_var": 0.014936431248982748, "learning_rate": 2e-05, "loss": 1.3199, "loss/crossentropy": 2.2921221256256104, "loss/dist_ce": 0.0, "loss/fcd": 1.1328125, "loss/idx": 13.0, "loss/logits": 0.18708041310310364, "step": 1331 }, { "epoch": 0.019889502762430938, "grad_norm": 0.5859375, "grad_norm_var": 0.008853133519490559, "learning_rate": 2e-05, "loss": 1.319, "loss/crossentropy": 2.7646992206573486, "loss/dist_ce": 0.0, "loss/fcd": 1.125, "loss/idx": 13.0, "loss/logits": 0.19404536485671997, "step": 1332 }, { "epoch": 0.019904434821561894, "grad_norm": 0.484375, "grad_norm_var": 0.009245665868123372, "learning_rate": 2e-05, "loss": 1.2248, "loss/crossentropy": 2.6878888607025146, "loss/dist_ce": 0.0, "loss/fcd": 1.0546875, "loss/idx": 13.0, "loss/logits": 0.17010048031806946, "step": 1333 }, { "epoch": 0.019919366880692846, "grad_norm": 0.5625, "grad_norm_var": 0.008713388442993164, "learning_rate": 2e-05, "loss": 1.2177, "loss/crossentropy": 2.486751079559326, "loss/dist_ce": 0.0, "loss/fcd": 1.046875, "loss/idx": 13.0, "loss/logits": 0.17087268829345703, "step": 1334 }, { "epoch": 0.0199342989398238, "grad_norm": 0.56640625, "grad_norm_var": 0.008650827407836913, "learning_rate": 2e-05, "loss": 1.3721, "loss/crossentropy": 2.5809147357940674, "loss/dist_ce": 0.0, "loss/fcd": 1.171875, "loss/idx": 13.0, "loss/logits": 0.2001952826976776, "step": 1335 }, { "epoch": 0.019949230998954758, "grad_norm": 0.53515625, "grad_norm_var": 0.007947778701782227, "learning_rate": 2e-05, "loss": 1.2824, "loss/crossentropy": 2.8265273571014404, "loss/dist_ce": 0.0, "loss/fcd": 1.09375, "loss/idx": 13.0, "loss/logits": 0.18861952424049377, "step": 1336 }, { "epoch": 0.01996416305808571, "grad_norm": 0.72265625, "grad_norm_var": 0.005032968521118164, "learning_rate": 2e-05, "loss": 1.2615, "loss/crossentropy": 2.333455801010132, "loss/dist_ce": 0.0, "loss/fcd": 1.1015625, "loss/idx": 13.0, "loss/logits": 0.1599290519952774, "step": 1337 }, { "epoch": 0.019979095117216666, "grad_norm": 0.67578125, "grad_norm_var": 0.005680958429972331, "learning_rate": 2e-05, "loss": 1.2279, "loss/crossentropy": 2.4327051639556885, "loss/dist_ce": 0.0, "loss/fcd": 1.0625, "loss/idx": 13.0, "loss/logits": 0.16535300016403198, "step": 1338 }, { "epoch": 0.019994027176347618, "grad_norm": 0.53125, "grad_norm_var": 0.005729786554972331, "learning_rate": 2e-05, "loss": 1.202, "loss/crossentropy": 2.7616231441497803, "loss/dist_ce": 0.0, "loss/fcd": 1.0390625, "loss/idx": 13.0, "loss/logits": 0.16289827227592468, "step": 1339 }, { "epoch": 0.020008959235478574, "grad_norm": 0.609375, "grad_norm_var": 0.005836089452107747, "learning_rate": 2e-05, "loss": 1.4349, "loss/crossentropy": 2.4606475830078125, "loss/dist_ce": 0.0, "loss/fcd": 1.2109375, "loss/idx": 13.0, "loss/logits": 0.22398307919502258, "step": 1340 }, { "epoch": 0.020023891294609526, "grad_norm": 0.58203125, "grad_norm_var": 0.005522012710571289, "learning_rate": 2e-05, "loss": 1.2027, "loss/crossentropy": 2.614811897277832, "loss/dist_ce": 0.0, "loss/fcd": 1.046875, "loss/idx": 13.0, "loss/logits": 0.1558469831943512, "step": 1341 }, { "epoch": 0.020038823353740482, "grad_norm": 0.51171875, "grad_norm_var": 0.005584192276000976, "learning_rate": 2e-05, "loss": 1.15, "loss/crossentropy": 2.557891368865967, "loss/dist_ce": 0.0, "loss/fcd": 1.0, "loss/idx": 13.0, "loss/logits": 0.14996448159217834, "step": 1342 }, { "epoch": 0.020053755412871434, "grad_norm": 0.498046875, "grad_norm_var": 0.0055266698201497395, "learning_rate": 2e-05, "loss": 1.2479, "loss/crossentropy": 2.5452401638031006, "loss/dist_ce": 0.0, "loss/fcd": 1.0703125, "loss/idx": 13.0, "loss/logits": 0.17753981053829193, "step": 1343 }, { "epoch": 0.02006868747200239, "grad_norm": 0.484375, "grad_norm_var": 0.00539849599202474, "learning_rate": 2e-05, "loss": 1.1854, "loss/crossentropy": 2.5871667861938477, "loss/dist_ce": 0.0, "loss/fcd": 1.03125, "loss/idx": 13.0, "loss/logits": 0.1541636437177658, "step": 1344 }, { "epoch": 0.020083619531133343, "grad_norm": 0.58984375, "grad_norm_var": 0.004865455627441406, "learning_rate": 2e-05, "loss": 1.2847, "loss/crossentropy": 2.5764195919036865, "loss/dist_ce": 0.0, "loss/fcd": 1.109375, "loss/idx": 13.0, "loss/logits": 0.17535443603992462, "step": 1345 }, { "epoch": 0.0200985515902643, "grad_norm": 0.5390625, "grad_norm_var": 0.004314152399698893, "learning_rate": 2e-05, "loss": 1.2904, "loss/crossentropy": 2.349005699157715, "loss/dist_ce": 0.0, "loss/fcd": 1.09375, "loss/idx": 13.0, "loss/logits": 0.19662730395793915, "step": 1346 }, { "epoch": 0.02011348364939525, "grad_norm": 0.5546875, "grad_norm_var": 0.004238621393839518, "learning_rate": 2e-05, "loss": 1.2428, "loss/crossentropy": 2.576354503631592, "loss/dist_ce": 0.0, "loss/fcd": 1.0703125, "loss/idx": 13.0, "loss/logits": 0.1724541187286377, "step": 1347 }, { "epoch": 0.020128415708526207, "grad_norm": 0.5234375, "grad_norm_var": 0.004304742813110352, "learning_rate": 2e-05, "loss": 1.2087, "loss/crossentropy": 2.7604458332061768, "loss/dist_ce": 0.0, "loss/fcd": 1.0390625, "loss/idx": 13.0, "loss/logits": 0.16968321800231934, "step": 1348 }, { "epoch": 0.02014334776765716, "grad_norm": 0.55859375, "grad_norm_var": 0.00389402707417806, "learning_rate": 2e-05, "loss": 1.2885, "loss/crossentropy": 2.5020458698272705, "loss/dist_ce": 0.0, "loss/fcd": 1.109375, "loss/idx": 13.0, "loss/logits": 0.17917054891586304, "step": 1349 }, { "epoch": 0.020158279826788115, "grad_norm": 0.609375, "grad_norm_var": 0.004013808568318685, "learning_rate": 2e-05, "loss": 1.4814, "loss/crossentropy": 2.569965124130249, "loss/dist_ce": 0.0, "loss/fcd": 1.2421875, "loss/idx": 13.0, "loss/logits": 0.23924951255321503, "step": 1350 }, { "epoch": 0.020173211885919067, "grad_norm": 0.54296875, "grad_norm_var": 0.004053862889607748, "learning_rate": 2e-05, "loss": 1.2074, "loss/crossentropy": 2.6130759716033936, "loss/dist_ce": 0.0, "loss/fcd": 1.046875, "loss/idx": 13.0, "loss/logits": 0.16052843630313873, "step": 1351 }, { "epoch": 0.020188143945050023, "grad_norm": 0.546875, "grad_norm_var": 0.00401304562886556, "learning_rate": 2e-05, "loss": 1.2967, "loss/crossentropy": 2.5374364852905273, "loss/dist_ce": 0.0, "loss/fcd": 1.109375, "loss/idx": 13.0, "loss/logits": 0.18736249208450317, "step": 1352 }, { "epoch": 0.020203076004180975, "grad_norm": 0.61328125, "grad_norm_var": 0.0024981021881103514, "learning_rate": 2e-05, "loss": 1.3419, "loss/crossentropy": 2.4236462116241455, "loss/dist_ce": 0.0, "loss/fcd": 1.15625, "loss/idx": 13.0, "loss/logits": 0.18566705286502838, "step": 1353 }, { "epoch": 0.02021800806331193, "grad_norm": 0.48828125, "grad_norm_var": 0.0018175601959228515, "learning_rate": 2e-05, "loss": 1.205, "loss/crossentropy": 2.845634937286377, "loss/dist_ce": 0.0, "loss/fcd": 1.03125, "loss/idx": 13.0, "loss/logits": 0.17379263043403625, "step": 1354 }, { "epoch": 0.020232940122442884, "grad_norm": 0.4921875, "grad_norm_var": 0.002005116144816081, "learning_rate": 2e-05, "loss": 1.2024, "loss/crossentropy": 2.4425594806671143, "loss/dist_ce": 0.0, "loss/fcd": 1.0390625, "loss/idx": 13.0, "loss/logits": 0.16329693794250488, "step": 1355 }, { "epoch": 0.02024787218157384, "grad_norm": 0.515625, "grad_norm_var": 0.0017686049143473307, "learning_rate": 2e-05, "loss": 1.1883, "loss/crossentropy": 2.421473741531372, "loss/dist_ce": 0.0, "loss/fcd": 1.03125, "loss/idx": 13.0, "loss/logits": 0.15703192353248596, "step": 1356 }, { "epoch": 0.020262804240704792, "grad_norm": 0.52734375, "grad_norm_var": 0.0016537825266520181, "learning_rate": 2e-05, "loss": 1.3472, "loss/crossentropy": 2.572152853012085, "loss/dist_ce": 0.0, "loss/fcd": 1.1484375, "loss/idx": 13.0, "loss/logits": 0.19879058003425598, "step": 1357 }, { "epoch": 0.020277736299835748, "grad_norm": 1.96875, "grad_norm_var": 0.12938116391499838, "learning_rate": 2e-05, "loss": 1.7107, "loss/crossentropy": 2.3684675693511963, "loss/dist_ce": 0.0, "loss/fcd": 1.421875, "loss/idx": 13.0, "loss/logits": 0.2888346314430237, "step": 1358 }, { "epoch": 0.0202926683589667, "grad_norm": 0.48046875, "grad_norm_var": 0.12970574696858725, "learning_rate": 2e-05, "loss": 1.181, "loss/crossentropy": 2.509111166000366, "loss/dist_ce": 0.0, "loss/fcd": 1.015625, "loss/idx": 13.0, "loss/logits": 0.16538101434707642, "step": 1359 }, { "epoch": 0.020307600418097656, "grad_norm": 0.546875, "grad_norm_var": 0.12875970204671225, "learning_rate": 2e-05, "loss": 1.2878, "loss/crossentropy": 2.669523000717163, "loss/dist_ce": 0.0, "loss/fcd": 1.09375, "loss/idx": 13.0, "loss/logits": 0.1940631866455078, "step": 1360 }, { "epoch": 0.020322532477228608, "grad_norm": 0.61328125, "grad_norm_var": 0.12866509755452474, "learning_rate": 2e-05, "loss": 1.4801, "loss/crossentropy": 2.491579055786133, "loss/dist_ce": 0.0, "loss/fcd": 1.2421875, "loss/idx": 13.0, "loss/logits": 0.23791025578975677, "step": 1361 }, { "epoch": 0.020337464536359564, "grad_norm": 0.474609375, "grad_norm_var": 0.12972830136617025, "learning_rate": 2e-05, "loss": 1.193, "loss/crossentropy": 2.5316779613494873, "loss/dist_ce": 0.0, "loss/fcd": 1.03125, "loss/idx": 13.0, "loss/logits": 0.16174541413784027, "step": 1362 }, { "epoch": 0.02035239659549052, "grad_norm": 0.49609375, "grad_norm_var": 0.13051985104878744, "learning_rate": 2e-05, "loss": 1.2706, "loss/crossentropy": 2.616147518157959, "loss/dist_ce": 0.0, "loss/fcd": 1.09375, "loss/idx": 13.0, "loss/logits": 0.17682576179504395, "step": 1363 }, { "epoch": 0.020367328654621472, "grad_norm": 0.50390625, "grad_norm_var": 0.13080786069234213, "learning_rate": 2e-05, "loss": 1.266, "loss/crossentropy": 2.4946200847625732, "loss/dist_ce": 0.0, "loss/fcd": 1.09375, "loss/idx": 13.0, "loss/logits": 0.17224857211112976, "step": 1364 }, { "epoch": 0.020382260713752428, "grad_norm": 0.5625, "grad_norm_var": 0.13077492713928224, "learning_rate": 2e-05, "loss": 1.3026, "loss/crossentropy": 2.7409372329711914, "loss/dist_ce": 0.0, "loss/fcd": 1.109375, "loss/idx": 13.0, "loss/logits": 0.19320283830165863, "step": 1365 }, { "epoch": 0.02039719277288338, "grad_norm": 0.55859375, "grad_norm_var": 0.13103445370992026, "learning_rate": 2e-05, "loss": 1.2833, "loss/crossentropy": 2.5230894088745117, "loss/dist_ce": 0.0, "loss/fcd": 1.09375, "loss/idx": 13.0, "loss/logits": 0.18950234353542328, "step": 1366 }, { "epoch": 0.020412124832014336, "grad_norm": 0.734375, "grad_norm_var": 0.13133975664774578, "learning_rate": 2e-05, "loss": 1.4051, "loss/crossentropy": 2.4019763469696045, "loss/dist_ce": 0.0, "loss/fcd": 1.1953125, "loss/idx": 13.0, "loss/logits": 0.20979498326778412, "step": 1367 }, { "epoch": 0.02042705689114529, "grad_norm": 0.50390625, "grad_norm_var": 0.13194680213928223, "learning_rate": 2e-05, "loss": 1.227, "loss/crossentropy": 2.572502851486206, "loss/dist_ce": 0.0, "loss/fcd": 1.0546875, "loss/idx": 13.0, "loss/logits": 0.17235463857650757, "step": 1368 }, { "epoch": 0.020441988950276244, "grad_norm": 0.57421875, "grad_norm_var": 0.13212927182515463, "learning_rate": 2e-05, "loss": 1.3113, "loss/crossentropy": 2.350470781326294, "loss/dist_ce": 0.0, "loss/fcd": 1.140625, "loss/idx": 13.0, "loss/logits": 0.17066575586795807, "step": 1369 }, { "epoch": 0.020456921009407197, "grad_norm": 0.5703125, "grad_norm_var": 0.13102644284566242, "learning_rate": 2e-05, "loss": 1.3753, "loss/crossentropy": 2.5796074867248535, "loss/dist_ce": 0.0, "loss/fcd": 1.171875, "loss/idx": 13.0, "loss/logits": 0.20337635278701782, "step": 1370 }, { "epoch": 0.020471853068538153, "grad_norm": 0.515625, "grad_norm_var": 0.13062170346577961, "learning_rate": 2e-05, "loss": 1.1941, "loss/crossentropy": 2.5275111198425293, "loss/dist_ce": 0.0, "loss/fcd": 1.03125, "loss/idx": 13.0, "loss/logits": 0.16285260021686554, "step": 1371 }, { "epoch": 0.020486785127669105, "grad_norm": 0.470703125, "grad_norm_var": 0.13145777384440105, "learning_rate": 2e-05, "loss": 1.1291, "loss/crossentropy": 2.7132022380828857, "loss/dist_ce": 0.0, "loss/fcd": 0.97265625, "loss/idx": 13.0, "loss/logits": 0.156441330909729, "step": 1372 }, { "epoch": 0.02050171718680006, "grad_norm": 0.6015625, "grad_norm_var": 0.13077284495035807, "learning_rate": 2e-05, "loss": 1.3682, "loss/crossentropy": 2.4123904705047607, "loss/dist_ce": 0.0, "loss/fcd": 1.1796875, "loss/idx": 13.0, "loss/logits": 0.18850690126419067, "step": 1373 }, { "epoch": 0.020516649245931013, "grad_norm": 0.47265625, "grad_norm_var": 0.004807790120442708, "learning_rate": 2e-05, "loss": 1.1859, "loss/crossentropy": 2.519430160522461, "loss/dist_ce": 0.0, "loss/fcd": 1.03125, "loss/idx": 13.0, "loss/logits": 0.1546916663646698, "step": 1374 }, { "epoch": 0.02053158130506197, "grad_norm": 0.49609375, "grad_norm_var": 0.004693857828776042, "learning_rate": 2e-05, "loss": 1.1965, "loss/crossentropy": 2.733642816543579, "loss/dist_ce": 0.0, "loss/fcd": 1.03125, "loss/idx": 13.0, "loss/logits": 0.16526637971401215, "step": 1375 }, { "epoch": 0.02054651336419292, "grad_norm": 0.49609375, "grad_norm_var": 0.004831886291503907, "learning_rate": 2e-05, "loss": 1.1669, "loss/crossentropy": 2.5803768634796143, "loss/dist_ce": 0.0, "loss/fcd": 1.015625, "loss/idx": 13.0, "loss/logits": 0.1512264758348465, "step": 1376 }, { "epoch": 0.020561445423323877, "grad_norm": 0.59375, "grad_norm_var": 0.0046656290690104164, "learning_rate": 2e-05, "loss": 1.3094, "loss/crossentropy": 2.4563205242156982, "loss/dist_ce": 0.0, "loss/fcd": 1.1171875, "loss/idx": 13.0, "loss/logits": 0.1922210454940796, "step": 1377 }, { "epoch": 0.02057637748245483, "grad_norm": 0.498046875, "grad_norm_var": 0.004498545328776042, "learning_rate": 2e-05, "loss": 1.2834, "loss/crossentropy": 2.5536391735076904, "loss/dist_ce": 0.0, "loss/fcd": 1.1015625, "loss/idx": 13.0, "loss/logits": 0.18183785676956177, "step": 1378 }, { "epoch": 0.020591309541585785, "grad_norm": 0.51171875, "grad_norm_var": 0.004421234130859375, "learning_rate": 2e-05, "loss": 1.1879, "loss/crossentropy": 2.3600947856903076, "loss/dist_ce": 0.0, "loss/fcd": 1.03125, "loss/idx": 13.0, "loss/logits": 0.15669815242290497, "step": 1379 }, { "epoch": 0.020606241600716738, "grad_norm": 0.5, "grad_norm_var": 0.00444176991780599, "learning_rate": 2e-05, "loss": 1.2191, "loss/crossentropy": 2.5085413455963135, "loss/dist_ce": 0.0, "loss/fcd": 1.0546875, "loss/idx": 13.0, "loss/logits": 0.16443030536174774, "step": 1380 }, { "epoch": 0.020621173659847693, "grad_norm": 0.51953125, "grad_norm_var": 0.004435475667317708, "learning_rate": 2e-05, "loss": 1.2258, "loss/crossentropy": 2.6902709007263184, "loss/dist_ce": 0.0, "loss/fcd": 1.0625, "loss/idx": 13.0, "loss/logits": 0.16326066851615906, "step": 1381 }, { "epoch": 0.020636105718978646, "grad_norm": 0.6015625, "grad_norm_var": 0.004665565490722656, "learning_rate": 2e-05, "loss": 1.282, "loss/crossentropy": 2.8826005458831787, "loss/dist_ce": 0.0, "loss/fcd": 1.09375, "loss/idx": 13.0, "loss/logits": 0.18827125430107117, "step": 1382 }, { "epoch": 0.0206510377781096, "grad_norm": 0.59375, "grad_norm_var": 0.0022806167602539063, "learning_rate": 2e-05, "loss": 1.3672, "loss/crossentropy": 2.341332197189331, "loss/dist_ce": 0.0, "loss/fcd": 1.171875, "loss/idx": 13.0, "loss/logits": 0.19533485174179077, "step": 1383 }, { "epoch": 0.020665969837240554, "grad_norm": 0.55859375, "grad_norm_var": 0.0022592544555664062, "learning_rate": 2e-05, "loss": 1.2868, "loss/crossentropy": 2.7123072147369385, "loss/dist_ce": 0.0, "loss/fcd": 1.1171875, "loss/idx": 13.0, "loss/logits": 0.16963346302509308, "step": 1384 }, { "epoch": 0.02068090189637151, "grad_norm": 0.55859375, "grad_norm_var": 0.002194658915201823, "learning_rate": 2e-05, "loss": 1.224, "loss/crossentropy": 2.544621706008911, "loss/dist_ce": 0.0, "loss/fcd": 1.0546875, "loss/idx": 13.0, "loss/logits": 0.16929784417152405, "step": 1385 }, { "epoch": 0.020695833955502462, "grad_norm": 0.5, "grad_norm_var": 0.002171770731608073, "learning_rate": 2e-05, "loss": 1.2222, "loss/crossentropy": 2.4783518314361572, "loss/dist_ce": 0.0, "loss/fcd": 1.046875, "loss/idx": 13.0, "loss/logits": 0.1753370463848114, "step": 1386 }, { "epoch": 0.020710766014633418, "grad_norm": 0.466796875, "grad_norm_var": 0.002417739232381185, "learning_rate": 2e-05, "loss": 1.1808, "loss/crossentropy": 2.5404293537139893, "loss/dist_ce": 0.0, "loss/fcd": 1.015625, "loss/idx": 13.0, "loss/logits": 0.16517174243927002, "step": 1387 }, { "epoch": 0.020725698073764374, "grad_norm": 0.51953125, "grad_norm_var": 0.0021972020467122396, "learning_rate": 2e-05, "loss": 1.2112, "loss/crossentropy": 2.6268763542175293, "loss/dist_ce": 0.0, "loss/fcd": 1.0390625, "loss/idx": 13.0, "loss/logits": 0.17210936546325684, "step": 1388 }, { "epoch": 0.020740630132895326, "grad_norm": 0.5625, "grad_norm_var": 0.0019225438435872396, "learning_rate": 2e-05, "loss": 1.2616, "loss/crossentropy": 2.555655002593994, "loss/dist_ce": 0.0, "loss/fcd": 1.0859375, "loss/idx": 13.0, "loss/logits": 0.17567789554595947, "step": 1389 }, { "epoch": 0.020755562192026282, "grad_norm": 0.49609375, "grad_norm_var": 0.0017836888631184895, "learning_rate": 2e-05, "loss": 1.2087, "loss/crossentropy": 2.5104944705963135, "loss/dist_ce": 0.0, "loss/fcd": 1.0546875, "loss/idx": 13.0, "loss/logits": 0.15401938557624817, "step": 1390 }, { "epoch": 0.020770494251157234, "grad_norm": 0.498046875, "grad_norm_var": 0.001775217056274414, "learning_rate": 2e-05, "loss": 1.2413, "loss/crossentropy": 2.602848768234253, "loss/dist_ce": 0.0, "loss/fcd": 1.0703125, "loss/idx": 13.0, "loss/logits": 0.170992910861969, "step": 1391 }, { "epoch": 0.02078542631028819, "grad_norm": 0.54296875, "grad_norm_var": 0.001702737808227539, "learning_rate": 2e-05, "loss": 1.2999, "loss/crossentropy": 2.5092689990997314, "loss/dist_ce": 0.0, "loss/fcd": 1.109375, "loss/idx": 13.0, "loss/logits": 0.19050829112529755, "step": 1392 }, { "epoch": 0.020800358369419143, "grad_norm": 0.51171875, "grad_norm_var": 0.0014544010162353515, "learning_rate": 2e-05, "loss": 1.2414, "loss/crossentropy": 2.510206460952759, "loss/dist_ce": 0.0, "loss/fcd": 1.0546875, "loss/idx": 13.0, "loss/logits": 0.18671731650829315, "step": 1393 }, { "epoch": 0.0208152904285501, "grad_norm": 0.478515625, "grad_norm_var": 0.00155485471089681, "learning_rate": 2e-05, "loss": 1.1186, "loss/crossentropy": 2.801673650741577, "loss/dist_ce": 0.0, "loss/fcd": 0.96484375, "loss/idx": 13.0, "loss/logits": 0.15374580025672913, "step": 1394 }, { "epoch": 0.02083022248768105, "grad_norm": 0.5078125, "grad_norm_var": 0.0015633742014567057, "learning_rate": 2e-05, "loss": 1.196, "loss/crossentropy": 2.528714418411255, "loss/dist_ce": 0.0, "loss/fcd": 1.0390625, "loss/idx": 13.0, "loss/logits": 0.1569829136133194, "step": 1395 }, { "epoch": 0.020845154546812007, "grad_norm": 0.52734375, "grad_norm_var": 0.0015153090159098308, "learning_rate": 2e-05, "loss": 1.1815, "loss/crossentropy": 2.5752596855163574, "loss/dist_ce": 0.0, "loss/fcd": 1.0234375, "loss/idx": 13.0, "loss/logits": 0.15807154774665833, "step": 1396 }, { "epoch": 0.02086008660594296, "grad_norm": 0.490234375, "grad_norm_var": 0.0016009012858072917, "learning_rate": 2e-05, "loss": 1.1608, "loss/crossentropy": 2.6302576065063477, "loss/dist_ce": 0.0, "loss/fcd": 1.0078125, "loss/idx": 13.0, "loss/logits": 0.15298575162887573, "step": 1397 }, { "epoch": 0.020875018665073915, "grad_norm": 0.53515625, "grad_norm_var": 0.0012063980102539062, "learning_rate": 2e-05, "loss": 1.2726, "loss/crossentropy": 2.4865758419036865, "loss/dist_ce": 0.0, "loss/fcd": 1.1015625, "loss/idx": 13.0, "loss/logits": 0.17103740572929382, "step": 1398 }, { "epoch": 0.020889950724204867, "grad_norm": 0.55078125, "grad_norm_var": 0.0009091695149739584, "learning_rate": 2e-05, "loss": 1.2263, "loss/crossentropy": 2.63130259513855, "loss/dist_ce": 0.0, "loss/fcd": 1.0546875, "loss/idx": 13.0, "loss/logits": 0.17159323394298553, "step": 1399 }, { "epoch": 0.020904882783335823, "grad_norm": 0.46875, "grad_norm_var": 0.0009398778279622396, "learning_rate": 2e-05, "loss": 1.1366, "loss/crossentropy": 2.5764553546905518, "loss/dist_ce": 0.0, "loss/fcd": 0.98828125, "loss/idx": 13.0, "loss/logits": 0.1483464539051056, "step": 1400 }, { "epoch": 0.020919814842466775, "grad_norm": 0.478515625, "grad_norm_var": 0.0008584181467692057, "learning_rate": 2e-05, "loss": 1.2287, "loss/crossentropy": 2.4545843601226807, "loss/dist_ce": 0.0, "loss/fcd": 1.0625, "loss/idx": 13.0, "loss/logits": 0.16616512835025787, "step": 1401 }, { "epoch": 0.02093474690159773, "grad_norm": 0.58203125, "grad_norm_var": 0.0011868635813395182, "learning_rate": 2e-05, "loss": 1.3896, "loss/crossentropy": 2.566432476043701, "loss/dist_ce": 0.0, "loss/fcd": 1.1796875, "loss/idx": 13.0, "loss/logits": 0.2099205106496811, "step": 1402 }, { "epoch": 0.020949678960728683, "grad_norm": 0.5078125, "grad_norm_var": 0.0010363260904947917, "learning_rate": 2e-05, "loss": 1.1944, "loss/crossentropy": 2.694551706314087, "loss/dist_ce": 0.0, "loss/fcd": 1.0234375, "loss/idx": 13.0, "loss/logits": 0.17097516357898712, "step": 1403 }, { "epoch": 0.02096461101985964, "grad_norm": 0.52734375, "grad_norm_var": 0.001043701171875, "learning_rate": 2e-05, "loss": 1.1465, "loss/crossentropy": 2.600843906402588, "loss/dist_ce": 0.0, "loss/fcd": 1.0, "loss/idx": 13.0, "loss/logits": 0.1465233862400055, "step": 1404 }, { "epoch": 0.02097954307899059, "grad_norm": 0.59375, "grad_norm_var": 0.0012959798177083334, "learning_rate": 2e-05, "loss": 1.28, "loss/crossentropy": 2.5780487060546875, "loss/dist_ce": 0.0, "loss/fcd": 1.09375, "loss/idx": 13.0, "loss/logits": 0.18626543879508972, "step": 1405 }, { "epoch": 0.020994475138121547, "grad_norm": 0.5, "grad_norm_var": 0.001285235087076823, "learning_rate": 2e-05, "loss": 1.2806, "loss/crossentropy": 2.555168867111206, "loss/dist_ce": 0.0, "loss/fcd": 1.1015625, "loss/idx": 13.0, "loss/logits": 0.17903593182563782, "step": 1406 }, { "epoch": 0.0210094071972525, "grad_norm": 0.5703125, "grad_norm_var": 0.0014116764068603516, "learning_rate": 2e-05, "loss": 1.2721, "loss/crossentropy": 2.6439151763916016, "loss/dist_ce": 0.0, "loss/fcd": 1.1015625, "loss/idx": 13.0, "loss/logits": 0.17056259512901306, "step": 1407 }, { "epoch": 0.021024339256383456, "grad_norm": 0.4921875, "grad_norm_var": 0.0014397780100504558, "learning_rate": 2e-05, "loss": 1.192, "loss/crossentropy": 2.6460981369018555, "loss/dist_ce": 0.0, "loss/fcd": 1.03125, "loss/idx": 13.0, "loss/logits": 0.16073650121688843, "step": 1408 }, { "epoch": 0.021039271315514408, "grad_norm": 0.5390625, "grad_norm_var": 0.0014557997385660807, "learning_rate": 2e-05, "loss": 1.1734, "loss/crossentropy": 2.7225704193115234, "loss/dist_ce": 0.0, "loss/fcd": 1.0234375, "loss/idx": 13.0, "loss/logits": 0.14998149871826172, "step": 1409 }, { "epoch": 0.021054203374645364, "grad_norm": 0.486328125, "grad_norm_var": 0.00141447385152181, "learning_rate": 2e-05, "loss": 1.1947, "loss/crossentropy": 2.7134530544281006, "loss/dist_ce": 0.0, "loss/fcd": 1.0234375, "loss/idx": 13.0, "loss/logits": 0.17125816643238068, "step": 1410 }, { "epoch": 0.021069135433776316, "grad_norm": 0.65625, "grad_norm_var": 0.0025040785471598308, "learning_rate": 2e-05, "loss": 1.4376, "loss/crossentropy": 2.8304383754730225, "loss/dist_ce": 0.0, "loss/fcd": 1.234375, "loss/idx": 13.0, "loss/logits": 0.20326532423496246, "step": 1411 }, { "epoch": 0.021084067492907272, "grad_norm": 0.796875, "grad_norm_var": 0.00689098040262858, "learning_rate": 2e-05, "loss": 1.6109, "loss/crossentropy": 2.3552732467651367, "loss/dist_ce": 0.0, "loss/fcd": 1.375, "loss/idx": 13.0, "loss/logits": 0.23586499691009521, "step": 1412 }, { "epoch": 0.021098999552038224, "grad_norm": 0.486328125, "grad_norm_var": 0.0069222609202067055, "learning_rate": 2e-05, "loss": 1.1958, "loss/crossentropy": 2.665379524230957, "loss/dist_ce": 0.0, "loss/fcd": 1.03125, "loss/idx": 13.0, "loss/logits": 0.16456547379493713, "step": 1413 }, { "epoch": 0.02111393161116918, "grad_norm": 0.490234375, "grad_norm_var": 0.007126617431640625, "learning_rate": 2e-05, "loss": 1.2232, "loss/crossentropy": 2.6303813457489014, "loss/dist_ce": 0.0, "loss/fcd": 1.0546875, "loss/idx": 13.0, "loss/logits": 0.16855427622795105, "step": 1414 }, { "epoch": 0.021128863670300136, "grad_norm": 0.546875, "grad_norm_var": 0.007124773661295573, "learning_rate": 2e-05, "loss": 1.2708, "loss/crossentropy": 2.770017385482788, "loss/dist_ce": 0.0, "loss/fcd": 1.0859375, "loss/idx": 13.0, "loss/logits": 0.18481460213661194, "step": 1415 }, { "epoch": 0.02114379572943109, "grad_norm": 0.482421875, "grad_norm_var": 0.006997156143188477, "learning_rate": 2e-05, "loss": 1.2358, "loss/crossentropy": 2.4545326232910156, "loss/dist_ce": 0.0, "loss/fcd": 1.0625, "loss/idx": 13.0, "loss/logits": 0.17330729961395264, "step": 1416 }, { "epoch": 0.021158727788562044, "grad_norm": 0.609375, "grad_norm_var": 0.0068895975748697914, "learning_rate": 2e-05, "loss": 1.3533, "loss/crossentropy": 2.5513885021209717, "loss/dist_ce": 0.0, "loss/fcd": 1.1328125, "loss/idx": 13.0, "loss/logits": 0.2204749584197998, "step": 1417 }, { "epoch": 0.021173659847692997, "grad_norm": 0.515625, "grad_norm_var": 0.006918780008951823, "learning_rate": 2e-05, "loss": 1.2852, "loss/crossentropy": 2.4511773586273193, "loss/dist_ce": 0.0, "loss/fcd": 1.09375, "loss/idx": 13.0, "loss/logits": 0.19149178266525269, "step": 1418 }, { "epoch": 0.021188591906823952, "grad_norm": 0.5, "grad_norm_var": 0.006966590881347656, "learning_rate": 2e-05, "loss": 1.2128, "loss/crossentropy": 2.630352258682251, "loss/dist_ce": 0.0, "loss/fcd": 1.0390625, "loss/idx": 13.0, "loss/logits": 0.1737767457962036, "step": 1419 }, { "epoch": 0.021203523965954905, "grad_norm": 0.75, "grad_norm_var": 0.009405517578125, "learning_rate": 2e-05, "loss": 1.4607, "loss/crossentropy": 2.5818517208099365, "loss/dist_ce": 0.0, "loss/fcd": 1.2109375, "loss/idx": 13.0, "loss/logits": 0.24980100989341736, "step": 1420 }, { "epoch": 0.02121845602508586, "grad_norm": 0.5078125, "grad_norm_var": 0.009520212809244791, "learning_rate": 2e-05, "loss": 1.1809, "loss/crossentropy": 2.4227993488311768, "loss/dist_ce": 0.0, "loss/fcd": 1.0234375, "loss/idx": 13.0, "loss/logits": 0.15742167830467224, "step": 1421 }, { "epoch": 0.021233388084216813, "grad_norm": 0.50390625, "grad_norm_var": 0.009490903218587239, "learning_rate": 2e-05, "loss": 1.1791, "loss/crossentropy": 2.596496820449829, "loss/dist_ce": 0.0, "loss/fcd": 1.0234375, "loss/idx": 13.0, "loss/logits": 0.15562020242214203, "step": 1422 }, { "epoch": 0.02124832014334777, "grad_norm": 0.5234375, "grad_norm_var": 0.009553464253743489, "learning_rate": 2e-05, "loss": 1.2777, "loss/crossentropy": 2.4700193405151367, "loss/dist_ce": 0.0, "loss/fcd": 1.09375, "loss/idx": 13.0, "loss/logits": 0.18397745490074158, "step": 1423 }, { "epoch": 0.02126325220247872, "grad_norm": 0.515625, "grad_norm_var": 0.00939019521077474, "learning_rate": 2e-05, "loss": 1.2129, "loss/crossentropy": 2.675663471221924, "loss/dist_ce": 0.0, "loss/fcd": 1.0546875, "loss/idx": 13.0, "loss/logits": 0.1582183986902237, "step": 1424 }, { "epoch": 0.021278184261609677, "grad_norm": 0.640625, "grad_norm_var": 0.009793535868326823, "learning_rate": 2e-05, "loss": 1.3463, "loss/crossentropy": 2.5702874660491943, "loss/dist_ce": 0.0, "loss/fcd": 1.15625, "loss/idx": 13.0, "loss/logits": 0.19006071984767914, "step": 1425 }, { "epoch": 0.02129311632074063, "grad_norm": 0.546875, "grad_norm_var": 0.009401814142862955, "learning_rate": 2e-05, "loss": 1.4265, "loss/crossentropy": 2.700167417526245, "loss/dist_ce": 0.0, "loss/fcd": 1.2109375, "loss/idx": 13.0, "loss/logits": 0.21554800868034363, "step": 1426 }, { "epoch": 0.021308048379871585, "grad_norm": 0.48046875, "grad_norm_var": 0.009241596857706705, "learning_rate": 2e-05, "loss": 1.1925, "loss/crossentropy": 2.780790328979492, "loss/dist_ce": 0.0, "loss/fcd": 1.0234375, "loss/idx": 13.0, "loss/logits": 0.1690763682126999, "step": 1427 }, { "epoch": 0.021322980439002538, "grad_norm": 0.53125, "grad_norm_var": 0.005121469497680664, "learning_rate": 2e-05, "loss": 1.2917, "loss/crossentropy": 2.366610288619995, "loss/dist_ce": 0.0, "loss/fcd": 1.109375, "loss/idx": 13.0, "loss/logits": 0.18228915333747864, "step": 1428 }, { "epoch": 0.021337912498133493, "grad_norm": 0.515625, "grad_norm_var": 0.004967689514160156, "learning_rate": 2e-05, "loss": 1.3195, "loss/crossentropy": 2.4140758514404297, "loss/dist_ce": 0.0, "loss/fcd": 1.125, "loss/idx": 13.0, "loss/logits": 0.19454874098300934, "step": 1429 }, { "epoch": 0.021352844557264446, "grad_norm": 0.55078125, "grad_norm_var": 0.004784886042277018, "learning_rate": 2e-05, "loss": 1.2121, "loss/crossentropy": 2.3538424968719482, "loss/dist_ce": 0.0, "loss/fcd": 1.046875, "loss/idx": 13.0, "loss/logits": 0.1652664840221405, "step": 1430 }, { "epoch": 0.0213677766163954, "grad_norm": 0.71875, "grad_norm_var": 0.006673161188761393, "learning_rate": 2e-05, "loss": 1.3674, "loss/crossentropy": 2.611670970916748, "loss/dist_ce": 0.0, "loss/fcd": 1.1328125, "loss/idx": 13.0, "loss/logits": 0.23455733060836792, "step": 1431 }, { "epoch": 0.021382708675526354, "grad_norm": 0.51171875, "grad_norm_var": 0.006440226236979167, "learning_rate": 2e-05, "loss": 1.1873, "loss/crossentropy": 2.4568278789520264, "loss/dist_ce": 0.0, "loss/fcd": 1.0390625, "loss/idx": 13.0, "loss/logits": 0.14822477102279663, "step": 1432 }, { "epoch": 0.02139764073465731, "grad_norm": 0.56640625, "grad_norm_var": 0.0062590916951497395, "learning_rate": 2e-05, "loss": 1.26, "loss/crossentropy": 2.767918348312378, "loss/dist_ce": 0.0, "loss/fcd": 1.0859375, "loss/idx": 13.0, "loss/logits": 0.17406600713729858, "step": 1433 }, { "epoch": 0.021412572793788262, "grad_norm": 0.50390625, "grad_norm_var": 0.006329091389973959, "learning_rate": 2e-05, "loss": 1.2008, "loss/crossentropy": 2.643681764602661, "loss/dist_ce": 0.0, "loss/fcd": 1.03125, "loss/idx": 13.0, "loss/logits": 0.16954705119132996, "step": 1434 }, { "epoch": 0.021427504852919218, "grad_norm": 0.65234375, "grad_norm_var": 0.0066787083943684895, "learning_rate": 2e-05, "loss": 1.4312, "loss/crossentropy": 2.535278797149658, "loss/dist_ce": 0.0, "loss/fcd": 1.2109375, "loss/idx": 13.0, "loss/logits": 0.22029095888137817, "step": 1435 }, { "epoch": 0.02144243691205017, "grad_norm": 0.5390625, "grad_norm_var": 0.00422051747639974, "learning_rate": 2e-05, "loss": 1.3012, "loss/crossentropy": 2.571066379547119, "loss/dist_ce": 0.0, "loss/fcd": 1.109375, "loss/idx": 13.0, "loss/logits": 0.19182069599628448, "step": 1436 }, { "epoch": 0.021457368971181126, "grad_norm": 0.546875, "grad_norm_var": 0.004093360900878906, "learning_rate": 2e-05, "loss": 1.2651, "loss/crossentropy": 2.668915271759033, "loss/dist_ce": 0.0, "loss/fcd": 1.078125, "loss/idx": 13.0, "loss/logits": 0.18694379925727844, "step": 1437 }, { "epoch": 0.02147230103031208, "grad_norm": 0.5390625, "grad_norm_var": 0.003940582275390625, "learning_rate": 2e-05, "loss": 1.171, "loss/crossentropy": 2.5324816703796387, "loss/dist_ce": 0.0, "loss/fcd": 1.015625, "loss/idx": 13.0, "loss/logits": 0.1553664356470108, "step": 1438 }, { "epoch": 0.021487233089443034, "grad_norm": 0.50390625, "grad_norm_var": 0.004047075907389323, "learning_rate": 2e-05, "loss": 1.245, "loss/crossentropy": 2.754875421524048, "loss/dist_ce": 0.0, "loss/fcd": 1.0703125, "loss/idx": 13.0, "loss/logits": 0.17472638189792633, "step": 1439 }, { "epoch": 0.02150216514857399, "grad_norm": 0.55859375, "grad_norm_var": 0.00394287109375, "learning_rate": 2e-05, "loss": 1.2567, "loss/crossentropy": 2.51790189743042, "loss/dist_ce": 0.0, "loss/fcd": 1.078125, "loss/idx": 13.0, "loss/logits": 0.1785835325717926, "step": 1440 }, { "epoch": 0.021517097207704942, "grad_norm": 0.5546875, "grad_norm_var": 0.0034421284993489585, "learning_rate": 2e-05, "loss": 1.2437, "loss/crossentropy": 2.6603500843048096, "loss/dist_ce": 0.0, "loss/fcd": 1.078125, "loss/idx": 13.0, "loss/logits": 0.1655387580394745, "step": 1441 }, { "epoch": 0.0215320292668359, "grad_norm": 0.55078125, "grad_norm_var": 0.0034407933553059896, "learning_rate": 2e-05, "loss": 1.2036, "loss/crossentropy": 2.661207437515259, "loss/dist_ce": 0.0, "loss/fcd": 1.03125, "loss/idx": 13.0, "loss/logits": 0.17234490811824799, "step": 1442 }, { "epoch": 0.02154696132596685, "grad_norm": 0.5234375, "grad_norm_var": 0.0031491597493489582, "learning_rate": 2e-05, "loss": 1.0976, "loss/crossentropy": 2.7283310890197754, "loss/dist_ce": 0.0, "loss/fcd": 0.95703125, "loss/idx": 13.0, "loss/logits": 0.14056336879730225, "step": 1443 }, { "epoch": 0.021561893385097806, "grad_norm": 0.59765625, "grad_norm_var": 0.003221575419108073, "learning_rate": 2e-05, "loss": 1.3786, "loss/crossentropy": 2.423133134841919, "loss/dist_ce": 0.0, "loss/fcd": 1.1796875, "loss/idx": 13.0, "loss/logits": 0.19894269108772278, "step": 1444 }, { "epoch": 0.02157682544422876, "grad_norm": 0.5390625, "grad_norm_var": 0.003122393290201823, "learning_rate": 2e-05, "loss": 1.1607, "loss/crossentropy": 2.6964542865753174, "loss/dist_ce": 0.0, "loss/fcd": 1.0078125, "loss/idx": 13.0, "loss/logits": 0.15290293097496033, "step": 1445 }, { "epoch": 0.021591757503359715, "grad_norm": 0.58203125, "grad_norm_var": 0.003145790100097656, "learning_rate": 2e-05, "loss": 1.2395, "loss/crossentropy": 2.5600531101226807, "loss/dist_ce": 0.0, "loss/fcd": 1.0703125, "loss/idx": 13.0, "loss/logits": 0.16921450197696686, "step": 1446 }, { "epoch": 0.021606689562490667, "grad_norm": 0.46484375, "grad_norm_var": 0.0018605550130208334, "learning_rate": 2e-05, "loss": 1.1883, "loss/crossentropy": 2.557687282562256, "loss/dist_ce": 0.0, "loss/fcd": 1.0234375, "loss/idx": 13.0, "loss/logits": 0.16487538814544678, "step": 1447 }, { "epoch": 0.021621621621621623, "grad_norm": 0.6015625, "grad_norm_var": 0.001955604553222656, "learning_rate": 2e-05, "loss": 1.319, "loss/crossentropy": 2.5055274963378906, "loss/dist_ce": 0.0, "loss/fcd": 1.125, "loss/idx": 13.0, "loss/logits": 0.19401580095291138, "step": 1448 }, { "epoch": 0.021636553680752575, "grad_norm": 0.515625, "grad_norm_var": 0.0020159403483072918, "learning_rate": 2e-05, "loss": 1.1947, "loss/crossentropy": 2.861999988555908, "loss/dist_ce": 0.0, "loss/fcd": 1.03125, "loss/idx": 13.0, "loss/logits": 0.16344812512397766, "step": 1449 }, { "epoch": 0.02165148573988353, "grad_norm": 0.50390625, "grad_norm_var": 0.0020159403483072918, "learning_rate": 2e-05, "loss": 1.2473, "loss/crossentropy": 2.4316065311431885, "loss/dist_ce": 0.0, "loss/fcd": 1.078125, "loss/idx": 13.0, "loss/logits": 0.1691448986530304, "step": 1450 }, { "epoch": 0.021666417799014483, "grad_norm": 0.70703125, "grad_norm_var": 0.0029612223307291666, "learning_rate": 2e-05, "loss": 1.512, "loss/crossentropy": 2.4315688610076904, "loss/dist_ce": 0.0, "loss/fcd": 1.2890625, "loss/idx": 13.0, "loss/logits": 0.22292405366897583, "step": 1451 }, { "epoch": 0.02168134985814544, "grad_norm": 0.478515625, "grad_norm_var": 0.00329283078511556, "learning_rate": 2e-05, "loss": 1.1174, "loss/crossentropy": 2.7632553577423096, "loss/dist_ce": 0.0, "loss/fcd": 0.96875, "loss/idx": 13.0, "loss/logits": 0.14863690733909607, "step": 1452 }, { "epoch": 0.02169628191727639, "grad_norm": 0.55078125, "grad_norm_var": 0.0032932122548421225, "learning_rate": 2e-05, "loss": 1.3365, "loss/crossentropy": 2.5227863788604736, "loss/dist_ce": 0.0, "loss/fcd": 1.140625, "loss/idx": 13.0, "loss/logits": 0.1958889216184616, "step": 1453 }, { "epoch": 0.021711213976407347, "grad_norm": 0.58203125, "grad_norm_var": 0.003356154759724935, "learning_rate": 2e-05, "loss": 1.2874, "loss/crossentropy": 2.528759241104126, "loss/dist_ce": 0.0, "loss/fcd": 1.0859375, "loss/idx": 13.0, "loss/logits": 0.201430082321167, "step": 1454 }, { "epoch": 0.0217261460355383, "grad_norm": 0.455078125, "grad_norm_var": 0.003811136881510417, "learning_rate": 2e-05, "loss": 1.1504, "loss/crossentropy": 2.579636812210083, "loss/dist_ce": 0.0, "loss/fcd": 0.99609375, "loss/idx": 13.0, "loss/logits": 0.1543428748846054, "step": 1455 }, { "epoch": 0.021741078094669256, "grad_norm": 0.5625, "grad_norm_var": 0.0038176854451497395, "learning_rate": 2e-05, "loss": 1.293, "loss/crossentropy": 2.594700574874878, "loss/dist_ce": 0.0, "loss/fcd": 1.1015625, "loss/idx": 13.0, "loss/logits": 0.19148565828800201, "step": 1456 }, { "epoch": 0.021756010153800208, "grad_norm": 0.609375, "grad_norm_var": 0.00405267079671224, "learning_rate": 2e-05, "loss": 1.3389, "loss/crossentropy": 2.5115139484405518, "loss/dist_ce": 0.0, "loss/fcd": 1.15625, "loss/idx": 13.0, "loss/logits": 0.1826532483100891, "step": 1457 }, { "epoch": 0.021770942212931164, "grad_norm": 0.515625, "grad_norm_var": 0.004133351643880208, "learning_rate": 2e-05, "loss": 1.1826, "loss/crossentropy": 2.4750778675079346, "loss/dist_ce": 0.0, "loss/fcd": 1.03125, "loss/idx": 13.0, "loss/logits": 0.15130820870399475, "step": 1458 }, { "epoch": 0.021785874272062116, "grad_norm": 0.8125, "grad_norm_var": 0.008358256022135416, "learning_rate": 2e-05, "loss": 1.2464, "loss/crossentropy": 2.7200441360473633, "loss/dist_ce": 0.0, "loss/fcd": 1.078125, "loss/idx": 13.0, "loss/logits": 0.16831210255622864, "step": 1459 }, { "epoch": 0.021800806331193072, "grad_norm": 0.65234375, "grad_norm_var": 0.008765920003255209, "learning_rate": 2e-05, "loss": 1.3127, "loss/crossentropy": 2.4744434356689453, "loss/dist_ce": 0.0, "loss/fcd": 1.125, "loss/idx": 13.0, "loss/logits": 0.18768146634101868, "step": 1460 }, { "epoch": 0.021815738390324024, "grad_norm": 0.55859375, "grad_norm_var": 0.008707110087076824, "learning_rate": 2e-05, "loss": 1.286, "loss/crossentropy": 2.4717938899993896, "loss/dist_ce": 0.0, "loss/fcd": 1.1015625, "loss/idx": 13.0, "loss/logits": 0.18439996242523193, "step": 1461 }, { "epoch": 0.02183067044945498, "grad_norm": 0.5, "grad_norm_var": 0.009018198649088541, "learning_rate": 2e-05, "loss": 1.2192, "loss/crossentropy": 2.5739426612854004, "loss/dist_ce": 0.0, "loss/fcd": 1.0546875, "loss/idx": 13.0, "loss/logits": 0.16450907289981842, "step": 1462 }, { "epoch": 0.021845602508585932, "grad_norm": 0.50390625, "grad_norm_var": 0.008582051595052083, "learning_rate": 2e-05, "loss": 1.1912, "loss/crossentropy": 2.4712491035461426, "loss/dist_ce": 0.0, "loss/fcd": 1.0390625, "loss/idx": 13.0, "loss/logits": 0.15212152898311615, "step": 1463 }, { "epoch": 0.02186053456771689, "grad_norm": 0.54296875, "grad_norm_var": 0.008544858296712239, "learning_rate": 2e-05, "loss": 1.2548, "loss/crossentropy": 2.2979588508605957, "loss/dist_ce": 0.0, "loss/fcd": 1.078125, "loss/idx": 13.0, "loss/logits": 0.1766788214445114, "step": 1464 }, { "epoch": 0.02187546662684784, "grad_norm": 0.6640625, "grad_norm_var": 0.008931414286295573, "learning_rate": 2e-05, "loss": 1.4653, "loss/crossentropy": 2.055795669555664, "loss/dist_ce": 0.0, "loss/fcd": 1.28125, "loss/idx": 13.0, "loss/logits": 0.18409845232963562, "step": 1465 }, { "epoch": 0.021890398685978796, "grad_norm": 0.5234375, "grad_norm_var": 0.008770243326822916, "learning_rate": 2e-05, "loss": 1.1843, "loss/crossentropy": 2.5227856636047363, "loss/dist_ce": 0.0, "loss/fcd": 1.0234375, "loss/idx": 13.0, "loss/logits": 0.16089648008346558, "step": 1466 }, { "epoch": 0.021905330745109752, "grad_norm": 0.484375, "grad_norm_var": 0.007983843485514322, "learning_rate": 2e-05, "loss": 1.2299, "loss/crossentropy": 2.611480474472046, "loss/dist_ce": 0.0, "loss/fcd": 1.0546875, "loss/idx": 13.0, "loss/logits": 0.1751788854598999, "step": 1467 }, { "epoch": 0.021920262804240705, "grad_norm": 0.46875, "grad_norm_var": 0.008098840713500977, "learning_rate": 2e-05, "loss": 1.1278, "loss/crossentropy": 2.6512179374694824, "loss/dist_ce": 0.0, "loss/fcd": 0.984375, "loss/idx": 13.0, "loss/logits": 0.14338138699531555, "step": 1468 }, { "epoch": 0.02193519486337166, "grad_norm": 0.5, "grad_norm_var": 0.008333571751912435, "learning_rate": 2e-05, "loss": 1.2038, "loss/crossentropy": 2.5303971767425537, "loss/dist_ce": 0.0, "loss/fcd": 1.046875, "loss/idx": 13.0, "loss/logits": 0.15689189732074738, "step": 1469 }, { "epoch": 0.021950126922502613, "grad_norm": 0.5546875, "grad_norm_var": 0.008294407526652019, "learning_rate": 2e-05, "loss": 1.2558, "loss/crossentropy": 2.435598850250244, "loss/dist_ce": 0.0, "loss/fcd": 1.0859375, "loss/idx": 13.0, "loss/logits": 0.16985587775707245, "step": 1470 }, { "epoch": 0.02196505898163357, "grad_norm": 0.51953125, "grad_norm_var": 0.007680193583170573, "learning_rate": 2e-05, "loss": 1.264, "loss/crossentropy": 2.4546844959259033, "loss/dist_ce": 0.0, "loss/fcd": 1.09375, "loss/idx": 13.0, "loss/logits": 0.1702587753534317, "step": 1471 }, { "epoch": 0.02197999104076452, "grad_norm": 0.5078125, "grad_norm_var": 0.007854652404785157, "learning_rate": 2e-05, "loss": 1.248, "loss/crossentropy": 2.5671000480651855, "loss/dist_ce": 0.0, "loss/fcd": 1.0703125, "loss/idx": 13.0, "loss/logits": 0.17768289148807526, "step": 1472 }, { "epoch": 0.021994923099895477, "grad_norm": 0.48046875, "grad_norm_var": 0.007999420166015625, "learning_rate": 2e-05, "loss": 1.1841, "loss/crossentropy": 2.4480648040771484, "loss/dist_ce": 0.0, "loss/fcd": 1.03125, "loss/idx": 13.0, "loss/logits": 0.15280106663703918, "step": 1473 }, { "epoch": 0.02200985515902643, "grad_norm": 0.51953125, "grad_norm_var": 0.007982826232910157, "learning_rate": 2e-05, "loss": 1.2384, "loss/crossentropy": 2.707908868789673, "loss/dist_ce": 0.0, "loss/fcd": 1.0625, "loss/idx": 13.0, "loss/logits": 0.17594537138938904, "step": 1474 }, { "epoch": 0.022024787218157385, "grad_norm": 0.498046875, "grad_norm_var": 0.0031385898590087892, "learning_rate": 2e-05, "loss": 1.1188, "loss/crossentropy": 2.5222115516662598, "loss/dist_ce": 0.0, "loss/fcd": 0.97265625, "loss/idx": 13.0, "loss/logits": 0.14612950384616852, "step": 1475 }, { "epoch": 0.022039719277288337, "grad_norm": 0.5390625, "grad_norm_var": 0.0020913283030192056, "learning_rate": 2e-05, "loss": 1.087, "loss/crossentropy": 2.3494150638580322, "loss/dist_ce": 0.0, "loss/fcd": 0.95703125, "loss/idx": 13.0, "loss/logits": 0.13001090288162231, "step": 1476 }, { "epoch": 0.022054651336419293, "grad_norm": 0.498046875, "grad_norm_var": 0.002031707763671875, "learning_rate": 2e-05, "loss": 1.2642, "loss/crossentropy": 2.571301221847534, "loss/dist_ce": 0.0, "loss/fcd": 1.078125, "loss/idx": 13.0, "loss/logits": 0.18606534600257874, "step": 1477 }, { "epoch": 0.022069583395550246, "grad_norm": 0.50390625, "grad_norm_var": 0.0020227432250976562, "learning_rate": 2e-05, "loss": 1.2481, "loss/crossentropy": 2.5197813510894775, "loss/dist_ce": 0.0, "loss/fcd": 1.0703125, "loss/idx": 13.0, "loss/logits": 0.17777016758918762, "step": 1478 }, { "epoch": 0.0220845154546812, "grad_norm": 0.625, "grad_norm_var": 0.002690887451171875, "learning_rate": 2e-05, "loss": 1.2635, "loss/crossentropy": 2.594855785369873, "loss/dist_ce": 0.0, "loss/fcd": 1.0703125, "loss/idx": 13.0, "loss/logits": 0.19315627217292786, "step": 1479 }, { "epoch": 0.022099447513812154, "grad_norm": 0.59765625, "grad_norm_var": 0.00299530029296875, "learning_rate": 2e-05, "loss": 1.3524, "loss/crossentropy": 2.5978474617004395, "loss/dist_ce": 0.0, "loss/fcd": 1.1328125, "loss/idx": 13.0, "loss/logits": 0.21956896781921387, "step": 1480 }, { "epoch": 0.02211437957294311, "grad_norm": 0.58203125, "grad_norm_var": 0.0019525527954101563, "learning_rate": 2e-05, "loss": 1.3905, "loss/crossentropy": 2.5185165405273438, "loss/dist_ce": 0.0, "loss/fcd": 1.1796875, "loss/idx": 13.0, "loss/logits": 0.21082568168640137, "step": 1481 }, { "epoch": 0.022129311632074062, "grad_norm": 0.578125, "grad_norm_var": 0.0021270116170247397, "learning_rate": 2e-05, "loss": 1.2688, "loss/crossentropy": 2.6426360607147217, "loss/dist_ce": 0.0, "loss/fcd": 1.09375, "loss/idx": 13.0, "loss/logits": 0.17500849068164825, "step": 1482 }, { "epoch": 0.022144243691205018, "grad_norm": 0.498046875, "grad_norm_var": 0.0020581404368082683, "learning_rate": 2e-05, "loss": 1.2863, "loss/crossentropy": 2.5090346336364746, "loss/dist_ce": 0.0, "loss/fcd": 1.1015625, "loss/idx": 13.0, "loss/logits": 0.1846894472837448, "step": 1483 }, { "epoch": 0.02215917575033597, "grad_norm": 0.578125, "grad_norm_var": 0.00192106564839681, "learning_rate": 2e-05, "loss": 1.3025, "loss/crossentropy": 2.463310718536377, "loss/dist_ce": 0.0, "loss/fcd": 1.1171875, "loss/idx": 13.0, "loss/logits": 0.18533632159233093, "step": 1484 }, { "epoch": 0.022174107809466926, "grad_norm": 0.546875, "grad_norm_var": 0.0018318017323811849, "learning_rate": 2e-05, "loss": 1.2817, "loss/crossentropy": 2.671934127807617, "loss/dist_ce": 0.0, "loss/fcd": 1.1015625, "loss/idx": 13.0, "loss/logits": 0.180099219083786, "step": 1485 }, { "epoch": 0.02218903986859788, "grad_norm": 0.4765625, "grad_norm_var": 0.0020517826080322264, "learning_rate": 2e-05, "loss": 1.2166, "loss/crossentropy": 2.4652843475341797, "loss/dist_ce": 0.0, "loss/fcd": 1.046875, "loss/idx": 13.0, "loss/logits": 0.1697027087211609, "step": 1486 }, { "epoch": 0.022203971927728834, "grad_norm": 0.609375, "grad_norm_var": 0.002379337946573893, "learning_rate": 2e-05, "loss": 1.3923, "loss/crossentropy": 2.6956992149353027, "loss/dist_ce": 0.0, "loss/fcd": 1.1796875, "loss/idx": 13.0, "loss/logits": 0.21260452270507812, "step": 1487 }, { "epoch": 0.022218903986859787, "grad_norm": 0.47265625, "grad_norm_var": 0.0026070753733317058, "learning_rate": 2e-05, "loss": 1.1581, "loss/crossentropy": 2.682551145553589, "loss/dist_ce": 0.0, "loss/fcd": 1.0078125, "loss/idx": 13.0, "loss/logits": 0.15028738975524902, "step": 1488 }, { "epoch": 0.022233836045990742, "grad_norm": 0.6015625, "grad_norm_var": 0.002599191665649414, "learning_rate": 2e-05, "loss": 1.3929, "loss/crossentropy": 2.7902488708496094, "loss/dist_ce": 0.0, "loss/fcd": 1.1796875, "loss/idx": 13.0, "loss/logits": 0.2132531702518463, "step": 1489 }, { "epoch": 0.022248768105121695, "grad_norm": 0.5390625, "grad_norm_var": 0.002555958429972331, "learning_rate": 2e-05, "loss": 1.2911, "loss/crossentropy": 2.5189132690429688, "loss/dist_ce": 0.0, "loss/fcd": 1.109375, "loss/idx": 13.0, "loss/logits": 0.18173748254776, "step": 1490 }, { "epoch": 0.02226370016425265, "grad_norm": 0.53515625, "grad_norm_var": 0.0024022420247395834, "learning_rate": 2e-05, "loss": 1.1905, "loss/crossentropy": 2.603708028793335, "loss/dist_ce": 0.0, "loss/fcd": 1.03125, "loss/idx": 13.0, "loss/logits": 0.1592203825712204, "step": 1491 }, { "epoch": 0.022278632223383603, "grad_norm": 0.458984375, "grad_norm_var": 0.0029072920481363934, "learning_rate": 2e-05, "loss": 1.1689, "loss/crossentropy": 2.576422691345215, "loss/dist_ce": 0.0, "loss/fcd": 1.015625, "loss/idx": 13.0, "loss/logits": 0.15326352417469025, "step": 1492 }, { "epoch": 0.02229356428251456, "grad_norm": 0.53125, "grad_norm_var": 0.002773539225260417, "learning_rate": 2e-05, "loss": 1.1926, "loss/crossentropy": 2.671536684036255, "loss/dist_ce": 0.0, "loss/fcd": 1.03125, "loss/idx": 13.0, "loss/logits": 0.1613408625125885, "step": 1493 }, { "epoch": 0.022308496341645515, "grad_norm": 0.671875, "grad_norm_var": 0.0035964330037434895, "learning_rate": 2e-05, "loss": 1.3892, "loss/crossentropy": 2.2959797382354736, "loss/dist_ce": 0.0, "loss/fcd": 1.1953125, "loss/idx": 13.0, "loss/logits": 0.19386935234069824, "step": 1494 }, { "epoch": 0.022323428400776467, "grad_norm": 0.515625, "grad_norm_var": 0.003343645731608073, "learning_rate": 2e-05, "loss": 1.2218, "loss/crossentropy": 2.720688819885254, "loss/dist_ce": 0.0, "loss/fcd": 1.0546875, "loss/idx": 13.0, "loss/logits": 0.16712167859077454, "step": 1495 }, { "epoch": 0.022338360459907423, "grad_norm": 0.58203125, "grad_norm_var": 0.0032587051391601562, "learning_rate": 2e-05, "loss": 1.3552, "loss/crossentropy": 2.4977471828460693, "loss/dist_ce": 0.0, "loss/fcd": 1.171875, "loss/idx": 13.0, "loss/logits": 0.18331705033779144, "step": 1496 }, { "epoch": 0.022353292519038375, "grad_norm": 0.56640625, "grad_norm_var": 0.0032042821248372396, "learning_rate": 2e-05, "loss": 1.3051, "loss/crossentropy": 2.5807783603668213, "loss/dist_ce": 0.0, "loss/fcd": 1.125, "loss/idx": 13.0, "loss/logits": 0.18012891709804535, "step": 1497 }, { "epoch": 0.02236822457816933, "grad_norm": 0.474609375, "grad_norm_var": 0.0034527937571207684, "learning_rate": 2e-05, "loss": 1.1901, "loss/crossentropy": 2.650909900665283, "loss/dist_ce": 0.0, "loss/fcd": 1.0234375, "loss/idx": 13.0, "loss/logits": 0.16662496328353882, "step": 1498 }, { "epoch": 0.022383156637300283, "grad_norm": 0.50390625, "grad_norm_var": 0.0034212748209635417, "learning_rate": 2e-05, "loss": 1.2063, "loss/crossentropy": 2.6215226650238037, "loss/dist_ce": 0.0, "loss/fcd": 1.0390625, "loss/idx": 13.0, "loss/logits": 0.16723014414310455, "step": 1499 }, { "epoch": 0.02239808869643124, "grad_norm": 0.490234375, "grad_norm_var": 0.0034749190012613933, "learning_rate": 2e-05, "loss": 1.1455, "loss/crossentropy": 2.5625998973846436, "loss/dist_ce": 0.0, "loss/fcd": 0.99609375, "loss/idx": 13.0, "loss/logits": 0.14940068125724792, "step": 1500 }, { "epoch": 0.02241302075556219, "grad_norm": 0.5390625, "grad_norm_var": 0.003467416763305664, "learning_rate": 2e-05, "loss": 1.2824, "loss/crossentropy": 2.5618104934692383, "loss/dist_ce": 0.0, "loss/fcd": 1.1015625, "loss/idx": 13.0, "loss/logits": 0.18080656230449677, "step": 1501 }, { "epoch": 0.022427952814693147, "grad_norm": 0.56640625, "grad_norm_var": 0.0032656192779541016, "learning_rate": 2e-05, "loss": 1.3115, "loss/crossentropy": 2.55956768989563, "loss/dist_ce": 0.0, "loss/fcd": 1.109375, "loss/idx": 13.0, "loss/logits": 0.20210814476013184, "step": 1502 }, { "epoch": 0.0224428848738241, "grad_norm": 0.484375, "grad_norm_var": 0.0031048933664957683, "learning_rate": 2e-05, "loss": 1.1382, "loss/crossentropy": 2.7115917205810547, "loss/dist_ce": 0.0, "loss/fcd": 0.9921875, "loss/idx": 13.0, "loss/logits": 0.14602671563625336, "step": 1503 }, { "epoch": 0.022457816932955055, "grad_norm": 0.6875, "grad_norm_var": 0.004251845677693685, "learning_rate": 2e-05, "loss": 1.5219, "loss/crossentropy": 2.5612852573394775, "loss/dist_ce": 0.0, "loss/fcd": 1.2890625, "loss/idx": 13.0, "loss/logits": 0.23288561403751373, "step": 1504 }, { "epoch": 0.022472748992086008, "grad_norm": 0.486328125, "grad_norm_var": 0.004239654541015625, "learning_rate": 2e-05, "loss": 1.2115, "loss/crossentropy": 2.5381405353546143, "loss/dist_ce": 0.0, "loss/fcd": 1.0390625, "loss/idx": 13.0, "loss/logits": 0.17247450351715088, "step": 1505 }, { "epoch": 0.022487681051216964, "grad_norm": 0.8125, "grad_norm_var": 0.008894856770833333, "learning_rate": 2e-05, "loss": 1.3213, "loss/crossentropy": 2.6162917613983154, "loss/dist_ce": 0.0, "loss/fcd": 1.125, "loss/idx": 13.0, "loss/logits": 0.19628921151161194, "step": 1506 }, { "epoch": 0.022502613110347916, "grad_norm": 0.48828125, "grad_norm_var": 0.009166463216145834, "learning_rate": 2e-05, "loss": 1.1868, "loss/crossentropy": 2.6764116287231445, "loss/dist_ce": 0.0, "loss/fcd": 1.03125, "loss/idx": 13.0, "loss/logits": 0.15558165311813354, "step": 1507 }, { "epoch": 0.022517545169478872, "grad_norm": 0.6484375, "grad_norm_var": 0.009016911188761393, "learning_rate": 2e-05, "loss": 1.2914, "loss/crossentropy": 2.6869983673095703, "loss/dist_ce": 0.0, "loss/fcd": 1.09375, "loss/idx": 13.0, "loss/logits": 0.19766151905059814, "step": 1508 }, { "epoch": 0.022532477228609824, "grad_norm": 0.52734375, "grad_norm_var": 0.009035730361938476, "learning_rate": 2e-05, "loss": 1.19, "loss/crossentropy": 2.548644781112671, "loss/dist_ce": 0.0, "loss/fcd": 1.03125, "loss/idx": 13.0, "loss/logits": 0.15873447060585022, "step": 1509 }, { "epoch": 0.02254740928774078, "grad_norm": 0.482421875, "grad_norm_var": 0.008587074279785157, "learning_rate": 2e-05, "loss": 1.1595, "loss/crossentropy": 2.6315722465515137, "loss/dist_ce": 0.0, "loss/fcd": 1.0, "loss/idx": 13.0, "loss/logits": 0.15952712297439575, "step": 1510 }, { "epoch": 0.022562341346871732, "grad_norm": 0.85546875, "grad_norm_var": 0.014090728759765626, "learning_rate": 2e-05, "loss": 1.2821, "loss/crossentropy": 2.651576519012451, "loss/dist_ce": 0.0, "loss/fcd": 1.078125, "loss/idx": 13.0, "loss/logits": 0.2039494812488556, "step": 1511 }, { "epoch": 0.022577273406002688, "grad_norm": 0.53125, "grad_norm_var": 0.014202308654785157, "learning_rate": 2e-05, "loss": 1.2496, "loss/crossentropy": 2.5648045539855957, "loss/dist_ce": 0.0, "loss/fcd": 1.0859375, "loss/idx": 13.0, "loss/logits": 0.16364048421382904, "step": 1512 }, { "epoch": 0.02259220546513364, "grad_norm": 0.63671875, "grad_norm_var": 0.014463233947753906, "learning_rate": 2e-05, "loss": 1.1766, "loss/crossentropy": 2.465576410293579, "loss/dist_ce": 0.0, "loss/fcd": 1.015625, "loss/idx": 13.0, "loss/logits": 0.1609870195388794, "step": 1513 }, { "epoch": 0.022607137524264596, "grad_norm": 0.5390625, "grad_norm_var": 0.013852167129516601, "learning_rate": 2e-05, "loss": 1.2467, "loss/crossentropy": 2.382476568222046, "loss/dist_ce": 0.0, "loss/fcd": 1.0859375, "loss/idx": 13.0, "loss/logits": 0.1607544869184494, "step": 1514 }, { "epoch": 0.02262206958339555, "grad_norm": 0.5703125, "grad_norm_var": 0.013454421361287435, "learning_rate": 2e-05, "loss": 1.3323, "loss/crossentropy": 2.588047742843628, "loss/dist_ce": 0.0, "loss/fcd": 1.140625, "loss/idx": 13.0, "loss/logits": 0.19164502620697021, "step": 1515 }, { "epoch": 0.022637001642526505, "grad_norm": 0.546875, "grad_norm_var": 0.012946001688639323, "learning_rate": 2e-05, "loss": 1.2083, "loss/crossentropy": 2.6603829860687256, "loss/dist_ce": 0.0, "loss/fcd": 1.0390625, "loss/idx": 13.0, "loss/logits": 0.16927656531333923, "step": 1516 }, { "epoch": 0.022651933701657457, "grad_norm": 0.625, "grad_norm_var": 0.01285088857014974, "learning_rate": 2e-05, "loss": 1.2823, "loss/crossentropy": 2.647714614868164, "loss/dist_ce": 0.0, "loss/fcd": 1.1015625, "loss/idx": 13.0, "loss/logits": 0.18076732754707336, "step": 1517 }, { "epoch": 0.022666865760788413, "grad_norm": 0.48828125, "grad_norm_var": 0.013509559631347656, "learning_rate": 2e-05, "loss": 1.1729, "loss/crossentropy": 2.624861717224121, "loss/dist_ce": 0.0, "loss/fcd": 1.0078125, "loss/idx": 13.0, "loss/logits": 0.1651315689086914, "step": 1518 }, { "epoch": 0.02268179781991937, "grad_norm": 0.515625, "grad_norm_var": 0.013138262430826823, "learning_rate": 2e-05, "loss": 1.2071, "loss/crossentropy": 2.556081771850586, "loss/dist_ce": 0.0, "loss/fcd": 1.046875, "loss/idx": 13.0, "loss/logits": 0.16020020842552185, "step": 1519 }, { "epoch": 0.02269672987905032, "grad_norm": 0.51953125, "grad_norm_var": 0.012719980875651042, "learning_rate": 2e-05, "loss": 1.2024, "loss/crossentropy": 2.708319902420044, "loss/dist_ce": 0.0, "loss/fcd": 1.0390625, "loss/idx": 13.0, "loss/logits": 0.16335517168045044, "step": 1520 }, { "epoch": 0.022711661938181277, "grad_norm": 0.53515625, "grad_norm_var": 0.01226181983947754, "learning_rate": 2e-05, "loss": 1.3096, "loss/crossentropy": 2.614462375640869, "loss/dist_ce": 0.0, "loss/fcd": 1.125, "loss/idx": 13.0, "loss/logits": 0.18464729189872742, "step": 1521 }, { "epoch": 0.02272659399731223, "grad_norm": 0.60546875, "grad_norm_var": 0.008595641454060872, "learning_rate": 2e-05, "loss": 1.2894, "loss/crossentropy": 2.635050058364868, "loss/dist_ce": 0.0, "loss/fcd": 1.109375, "loss/idx": 13.0, "loss/logits": 0.1800360381603241, "step": 1522 }, { "epoch": 0.022741526056443185, "grad_norm": 0.470703125, "grad_norm_var": 0.008805783589680989, "learning_rate": 2e-05, "loss": 1.1752, "loss/crossentropy": 2.6181414127349854, "loss/dist_ce": 0.0, "loss/fcd": 1.015625, "loss/idx": 13.0, "loss/logits": 0.15956082940101624, "step": 1523 }, { "epoch": 0.022756458115574137, "grad_norm": 0.5390625, "grad_norm_var": 0.00838921864827474, "learning_rate": 2e-05, "loss": 1.2508, "loss/crossentropy": 2.428799629211426, "loss/dist_ce": 0.0, "loss/fcd": 1.0859375, "loss/idx": 13.0, "loss/logits": 0.1649072766304016, "step": 1524 }, { "epoch": 0.022771390174705093, "grad_norm": 0.64453125, "grad_norm_var": 0.00870965321858724, "learning_rate": 2e-05, "loss": 1.438, "loss/crossentropy": 2.635791778564453, "loss/dist_ce": 0.0, "loss/fcd": 1.21875, "loss/idx": 13.0, "loss/logits": 0.21926459670066833, "step": 1525 }, { "epoch": 0.022786322233836045, "grad_norm": 0.50390625, "grad_norm_var": 0.008490228652954101, "learning_rate": 2e-05, "loss": 1.2384, "loss/crossentropy": 2.5704894065856934, "loss/dist_ce": 0.0, "loss/fcd": 1.0703125, "loss/idx": 13.0, "loss/logits": 0.16805407404899597, "step": 1526 }, { "epoch": 0.022801254292967, "grad_norm": 0.52734375, "grad_norm_var": 0.0027491092681884766, "learning_rate": 2e-05, "loss": 1.2464, "loss/crossentropy": 2.7401912212371826, "loss/dist_ce": 0.0, "loss/fcd": 1.0703125, "loss/idx": 13.0, "loss/logits": 0.17604930698871613, "step": 1527 }, { "epoch": 0.022816186352097954, "grad_norm": 0.53125, "grad_norm_var": 0.0027491092681884766, "learning_rate": 2e-05, "loss": 1.3106, "loss/crossentropy": 2.4598357677459717, "loss/dist_ce": 0.0, "loss/fcd": 1.125, "loss/idx": 13.0, "loss/logits": 0.18563039600849152, "step": 1528 }, { "epoch": 0.02283111841122891, "grad_norm": 0.578125, "grad_norm_var": 0.002285623550415039, "learning_rate": 2e-05, "loss": 1.3289, "loss/crossentropy": 2.4256432056427, "loss/dist_ce": 0.0, "loss/fcd": 1.1484375, "loss/idx": 13.0, "loss/logits": 0.18043887615203857, "step": 1529 }, { "epoch": 0.022846050470359862, "grad_norm": 0.51953125, "grad_norm_var": 0.002328221003214518, "learning_rate": 2e-05, "loss": 1.2916, "loss/crossentropy": 2.513697624206543, "loss/dist_ce": 0.0, "loss/fcd": 1.109375, "loss/idx": 13.0, "loss/logits": 0.1822143942117691, "step": 1530 }, { "epoch": 0.022860982529490818, "grad_norm": 0.55859375, "grad_norm_var": 0.0022973219553629556, "learning_rate": 2e-05, "loss": 1.2614, "loss/crossentropy": 2.432072639465332, "loss/dist_ce": 0.0, "loss/fcd": 1.09375, "loss/idx": 13.0, "loss/logits": 0.1676977425813675, "step": 1531 }, { "epoch": 0.02287591458862177, "grad_norm": 0.498046875, "grad_norm_var": 0.0024296442667643228, "learning_rate": 2e-05, "loss": 1.1784, "loss/crossentropy": 2.608954668045044, "loss/dist_ce": 0.0, "loss/fcd": 1.015625, "loss/idx": 13.0, "loss/logits": 0.16277967393398285, "step": 1532 }, { "epoch": 0.022890846647752726, "grad_norm": 0.5625, "grad_norm_var": 0.0019759496053059896, "learning_rate": 2e-05, "loss": 1.3215, "loss/crossentropy": 2.744267225265503, "loss/dist_ce": 0.0, "loss/fcd": 1.125, "loss/idx": 13.0, "loss/logits": 0.19652540981769562, "step": 1533 }, { "epoch": 0.022905778706883678, "grad_norm": 0.625, "grad_norm_var": 0.0022496541341145834, "learning_rate": 2e-05, "loss": 1.3236, "loss/crossentropy": 2.448258638381958, "loss/dist_ce": 0.0, "loss/fcd": 1.1484375, "loss/idx": 13.0, "loss/logits": 0.17515279352664948, "step": 1534 }, { "epoch": 0.022920710766014634, "grad_norm": 0.474609375, "grad_norm_var": 0.0025203545888264974, "learning_rate": 2e-05, "loss": 1.1635, "loss/crossentropy": 2.6411309242248535, "loss/dist_ce": 0.0, "loss/fcd": 1.015625, "loss/idx": 13.0, "loss/logits": 0.14791785180568695, "step": 1535 }, { "epoch": 0.022935642825145586, "grad_norm": 0.52734375, "grad_norm_var": 0.00249937375386556, "learning_rate": 2e-05, "loss": 1.2836, "loss/crossentropy": 2.6703877449035645, "loss/dist_ce": 0.0, "loss/fcd": 1.1015625, "loss/idx": 13.0, "loss/logits": 0.18201038241386414, "step": 1536 }, { "epoch": 0.022950574884276542, "grad_norm": 0.59375, "grad_norm_var": 0.0026462395985921224, "learning_rate": 2e-05, "loss": 1.3684, "loss/crossentropy": 2.5531914234161377, "loss/dist_ce": 0.0, "loss/fcd": 1.1640625, "loss/idx": 13.0, "loss/logits": 0.2043484002351761, "step": 1537 }, { "epoch": 0.022965506943407495, "grad_norm": 0.54296875, "grad_norm_var": 0.002407185236612956, "learning_rate": 2e-05, "loss": 1.2455, "loss/crossentropy": 2.6327946186065674, "loss/dist_ce": 0.0, "loss/fcd": 1.0703125, "loss/idx": 13.0, "loss/logits": 0.17522411048412323, "step": 1538 }, { "epoch": 0.02298043900253845, "grad_norm": 0.5390625, "grad_norm_var": 0.0020350138346354168, "learning_rate": 2e-05, "loss": 1.2707, "loss/crossentropy": 2.820997476577759, "loss/dist_ce": 0.0, "loss/fcd": 1.0625, "loss/idx": 13.0, "loss/logits": 0.20818859338760376, "step": 1539 }, { "epoch": 0.022995371061669403, "grad_norm": 0.451171875, "grad_norm_var": 0.002620808283487956, "learning_rate": 2e-05, "loss": 1.1121, "loss/crossentropy": 2.2270283699035645, "loss/dist_ce": 0.0, "loss/fcd": 0.97265625, "loss/idx": 13.0, "loss/logits": 0.13947048783302307, "step": 1540 }, { "epoch": 0.02301030312080036, "grad_norm": 0.51953125, "grad_norm_var": 0.0018944899241129557, "learning_rate": 2e-05, "loss": 1.2607, "loss/crossentropy": 2.485645294189453, "loss/dist_ce": 0.0, "loss/fcd": 1.078125, "loss/idx": 13.0, "loss/logits": 0.1826171576976776, "step": 1541 }, { "epoch": 0.02302523517993131, "grad_norm": 0.515625, "grad_norm_var": 0.0018551985422770182, "learning_rate": 2e-05, "loss": 1.2134, "loss/crossentropy": 2.5995469093322754, "loss/dist_ce": 0.0, "loss/fcd": 1.046875, "loss/idx": 13.0, "loss/logits": 0.16654683649539948, "step": 1542 }, { "epoch": 0.023040167239062267, "grad_norm": 0.640625, "grad_norm_var": 0.002537393569946289, "learning_rate": 2e-05, "loss": 1.3908, "loss/crossentropy": 2.5561470985412598, "loss/dist_ce": 0.0, "loss/fcd": 1.1796875, "loss/idx": 13.0, "loss/logits": 0.2111516296863556, "step": 1543 }, { "epoch": 0.02305509929819322, "grad_norm": 0.57421875, "grad_norm_var": 0.002589146296183268, "learning_rate": 2e-05, "loss": 1.3671, "loss/crossentropy": 2.6751511096954346, "loss/dist_ce": 0.0, "loss/fcd": 1.1796875, "loss/idx": 13.0, "loss/logits": 0.18742230534553528, "step": 1544 }, { "epoch": 0.023070031357324175, "grad_norm": 0.470703125, "grad_norm_var": 0.0028365453084309897, "learning_rate": 2e-05, "loss": 1.1518, "loss/crossentropy": 2.600984811782837, "loss/dist_ce": 0.0, "loss/fcd": 1.0, "loss/idx": 13.0, "loss/logits": 0.15175320208072662, "step": 1545 }, { "epoch": 0.02308496341645513, "grad_norm": 0.69921875, "grad_norm_var": 0.004404131571451823, "learning_rate": 2e-05, "loss": 1.3883, "loss/crossentropy": 2.5492827892303467, "loss/dist_ce": 0.0, "loss/fcd": 1.140625, "loss/idx": 13.0, "loss/logits": 0.24766142666339874, "step": 1546 }, { "epoch": 0.023099895475586083, "grad_norm": 0.494140625, "grad_norm_var": 0.004586140314737956, "learning_rate": 2e-05, "loss": 1.2833, "loss/crossentropy": 2.368687629699707, "loss/dist_ce": 0.0, "loss/fcd": 1.09375, "loss/idx": 13.0, "loss/logits": 0.189529687166214, "step": 1547 }, { "epoch": 0.02311482753471704, "grad_norm": 0.55859375, "grad_norm_var": 0.004431915283203125, "learning_rate": 2e-05, "loss": 1.313, "loss/crossentropy": 2.526824951171875, "loss/dist_ce": 0.0, "loss/fcd": 1.125, "loss/idx": 13.0, "loss/logits": 0.18795964121818542, "step": 1548 }, { "epoch": 0.02312975959384799, "grad_norm": 0.5, "grad_norm_var": 0.004566192626953125, "learning_rate": 2e-05, "loss": 1.2229, "loss/crossentropy": 2.5661303997039795, "loss/dist_ce": 0.0, "loss/fcd": 1.046875, "loss/idx": 13.0, "loss/logits": 0.17601008713245392, "step": 1549 }, { "epoch": 0.023144691652978947, "grad_norm": 0.578125, "grad_norm_var": 0.004206085205078125, "learning_rate": 2e-05, "loss": 1.3267, "loss/crossentropy": 2.568767786026001, "loss/dist_ce": 0.0, "loss/fcd": 1.125, "loss/idx": 13.0, "loss/logits": 0.20167773962020874, "step": 1550 }, { "epoch": 0.0231596237121099, "grad_norm": 0.474609375, "grad_norm_var": 0.004206085205078125, "learning_rate": 2e-05, "loss": 1.168, "loss/crossentropy": 2.607841968536377, "loss/dist_ce": 0.0, "loss/fcd": 1.015625, "loss/idx": 13.0, "loss/logits": 0.15234646201133728, "step": 1551 }, { "epoch": 0.023174555771240855, "grad_norm": 0.63671875, "grad_norm_var": 0.004733022054036458, "learning_rate": 2e-05, "loss": 1.5019, "loss/crossentropy": 2.2986114025115967, "loss/dist_ce": 0.0, "loss/fcd": 1.2890625, "loss/idx": 13.0, "loss/logits": 0.21287450194358826, "step": 1552 }, { "epoch": 0.023189487830371808, "grad_norm": 0.609375, "grad_norm_var": 0.004840850830078125, "learning_rate": 2e-05, "loss": 1.2576, "loss/crossentropy": 2.6668930053710938, "loss/dist_ce": 0.0, "loss/fcd": 1.0859375, "loss/idx": 13.0, "loss/logits": 0.17167997360229492, "step": 1553 }, { "epoch": 0.023204419889502764, "grad_norm": 0.578125, "grad_norm_var": 0.004883766174316406, "learning_rate": 2e-05, "loss": 1.2429, "loss/crossentropy": 2.417006254196167, "loss/dist_ce": 0.0, "loss/fcd": 1.078125, "loss/idx": 13.0, "loss/logits": 0.16477948427200317, "step": 1554 }, { "epoch": 0.023219351948633716, "grad_norm": 0.5859375, "grad_norm_var": 0.004937171936035156, "learning_rate": 2e-05, "loss": 1.3243, "loss/crossentropy": 2.640455722808838, "loss/dist_ce": 0.0, "loss/fcd": 1.125, "loss/idx": 13.0, "loss/logits": 0.19927874207496643, "step": 1555 }, { "epoch": 0.02323428400776467, "grad_norm": 0.5, "grad_norm_var": 0.004407485326131185, "learning_rate": 2e-05, "loss": 1.2037, "loss/crossentropy": 2.4920125007629395, "loss/dist_ce": 0.0, "loss/fcd": 1.0390625, "loss/idx": 13.0, "loss/logits": 0.1646072268486023, "step": 1556 }, { "epoch": 0.023249216066895624, "grad_norm": 0.4921875, "grad_norm_var": 0.004596185684204101, "learning_rate": 2e-05, "loss": 1.1922, "loss/crossentropy": 2.554762601852417, "loss/dist_ce": 0.0, "loss/fcd": 1.03125, "loss/idx": 13.0, "loss/logits": 0.16096070408821106, "step": 1557 }, { "epoch": 0.02326414812602658, "grad_norm": 0.5625, "grad_norm_var": 0.004476404190063477, "learning_rate": 2e-05, "loss": 1.2629, "loss/crossentropy": 2.679831027984619, "loss/dist_ce": 0.0, "loss/fcd": 1.0859375, "loss/idx": 13.0, "loss/logits": 0.17697837948799133, "step": 1558 }, { "epoch": 0.023279080185157532, "grad_norm": 0.57421875, "grad_norm_var": 0.004035425186157226, "learning_rate": 2e-05, "loss": 1.3702, "loss/crossentropy": 2.5628817081451416, "loss/dist_ce": 0.0, "loss/fcd": 1.1796875, "loss/idx": 13.0, "loss/logits": 0.190528005361557, "step": 1559 }, { "epoch": 0.023294012244288488, "grad_norm": 0.59375, "grad_norm_var": 0.004107904434204101, "learning_rate": 2e-05, "loss": 1.3442, "loss/crossentropy": 2.408496141433716, "loss/dist_ce": 0.0, "loss/fcd": 1.1484375, "loss/idx": 13.0, "loss/logits": 0.19575239717960358, "step": 1560 }, { "epoch": 0.02330894430341944, "grad_norm": 0.515625, "grad_norm_var": 0.00371856689453125, "learning_rate": 2e-05, "loss": 1.3114, "loss/crossentropy": 2.3996641635894775, "loss/dist_ce": 0.0, "loss/fcd": 1.125, "loss/idx": 13.0, "loss/logits": 0.1863994300365448, "step": 1561 }, { "epoch": 0.023323876362550396, "grad_norm": 0.5078125, "grad_norm_var": 0.0024443944295247395, "learning_rate": 2e-05, "loss": 1.1977, "loss/crossentropy": 2.4856419563293457, "loss/dist_ce": 0.0, "loss/fcd": 1.0390625, "loss/idx": 13.0, "loss/logits": 0.15866494178771973, "step": 1562 }, { "epoch": 0.02333880842168135, "grad_norm": 0.546875, "grad_norm_var": 0.002242263158162435, "learning_rate": 2e-05, "loss": 1.3169, "loss/crossentropy": 2.454580545425415, "loss/dist_ce": 0.0, "loss/fcd": 1.1328125, "loss/idx": 13.0, "loss/logits": 0.18412557244300842, "step": 1563 }, { "epoch": 0.023353740480812304, "grad_norm": 0.58203125, "grad_norm_var": 0.0023006280263264973, "learning_rate": 2e-05, "loss": 1.2856, "loss/crossentropy": 2.640057325363159, "loss/dist_ce": 0.0, "loss/fcd": 1.09375, "loss/idx": 13.0, "loss/logits": 0.19184106588363647, "step": 1564 }, { "epoch": 0.023368672539943257, "grad_norm": 0.490234375, "grad_norm_var": 0.0023747762044270832, "learning_rate": 2e-05, "loss": 1.1696, "loss/crossentropy": 2.4749562740325928, "loss/dist_ce": 0.0, "loss/fcd": 1.015625, "loss/idx": 13.0, "loss/logits": 0.15401500463485718, "step": 1565 }, { "epoch": 0.023383604599074213, "grad_norm": 0.5, "grad_norm_var": 0.0024815877278645832, "learning_rate": 2e-05, "loss": 1.1622, "loss/crossentropy": 2.4315335750579834, "loss/dist_ce": 0.0, "loss/fcd": 1.0078125, "loss/idx": 13.0, "loss/logits": 0.15435267984867096, "step": 1566 }, { "epoch": 0.023398536658205165, "grad_norm": 0.515625, "grad_norm_var": 0.002191527684529622, "learning_rate": 2e-05, "loss": 1.2196, "loss/crossentropy": 2.7022817134857178, "loss/dist_ce": 0.0, "loss/fcd": 1.046875, "loss/idx": 13.0, "loss/logits": 0.17277126014232635, "step": 1567 }, { "epoch": 0.02341346871733612, "grad_norm": 0.51953125, "grad_norm_var": 0.00168608029683431, "learning_rate": 2e-05, "loss": 1.218, "loss/crossentropy": 2.5333712100982666, "loss/dist_ce": 0.0, "loss/fcd": 1.0546875, "loss/idx": 13.0, "loss/logits": 0.1633320152759552, "step": 1568 }, { "epoch": 0.023428400776467073, "grad_norm": 0.5703125, "grad_norm_var": 0.0014311313629150391, "learning_rate": 2e-05, "loss": 1.2387, "loss/crossentropy": 2.753187417984009, "loss/dist_ce": 0.0, "loss/fcd": 1.0703125, "loss/idx": 13.0, "loss/logits": 0.16835424304008484, "step": 1569 }, { "epoch": 0.02344333283559803, "grad_norm": 0.515625, "grad_norm_var": 0.0013548374176025391, "learning_rate": 2e-05, "loss": 1.1516, "loss/crossentropy": 2.419675588607788, "loss/dist_ce": 0.0, "loss/fcd": 1.0078125, "loss/idx": 13.0, "loss/logits": 0.14382390677928925, "step": 1570 }, { "epoch": 0.023458264894728985, "grad_norm": 0.578125, "grad_norm_var": 0.0013063907623291015, "learning_rate": 2e-05, "loss": 1.2286, "loss/crossentropy": 2.2976953983306885, "loss/dist_ce": 0.0, "loss/fcd": 1.078125, "loss/idx": 13.0, "loss/logits": 0.15043297410011292, "step": 1571 }, { "epoch": 0.023473196953859937, "grad_norm": 0.5703125, "grad_norm_var": 0.001284646987915039, "learning_rate": 2e-05, "loss": 1.3051, "loss/crossentropy": 2.31030535697937, "loss/dist_ce": 0.0, "loss/fcd": 1.125, "loss/idx": 13.0, "loss/logits": 0.18014974892139435, "step": 1572 }, { "epoch": 0.023488129012990893, "grad_norm": 0.51953125, "grad_norm_var": 0.0011582533518473307, "learning_rate": 2e-05, "loss": 1.2815, "loss/crossentropy": 2.604799270629883, "loss/dist_ce": 0.0, "loss/fcd": 1.109375, "loss/idx": 13.0, "loss/logits": 0.17212437093257904, "step": 1573 }, { "epoch": 0.023503061072121845, "grad_norm": 0.5078125, "grad_norm_var": 0.0011911869049072265, "learning_rate": 2e-05, "loss": 1.2331, "loss/crossentropy": 2.6337313652038574, "loss/dist_ce": 0.0, "loss/fcd": 1.0703125, "loss/idx": 13.0, "loss/logits": 0.162765234708786, "step": 1574 }, { "epoch": 0.0235179931312528, "grad_norm": 0.53515625, "grad_norm_var": 0.001097726821899414, "learning_rate": 2e-05, "loss": 1.3357, "loss/crossentropy": 2.5624470710754395, "loss/dist_ce": 0.0, "loss/fcd": 1.125, "loss/idx": 13.0, "loss/logits": 0.21068018674850464, "step": 1575 }, { "epoch": 0.023532925190383754, "grad_norm": 0.546875, "grad_norm_var": 0.0008711338043212891, "learning_rate": 2e-05, "loss": 1.3721, "loss/crossentropy": 2.4522528648376465, "loss/dist_ce": 0.0, "loss/fcd": 1.171875, "loss/idx": 13.0, "loss/logits": 0.20022107660770416, "step": 1576 }, { "epoch": 0.02354785724951471, "grad_norm": 0.5, "grad_norm_var": 0.0009217421213785807, "learning_rate": 2e-05, "loss": 1.2497, "loss/crossentropy": 2.6643879413604736, "loss/dist_ce": 0.0, "loss/fcd": 1.078125, "loss/idx": 13.0, "loss/logits": 0.17159616947174072, "step": 1577 }, { "epoch": 0.023562789308645662, "grad_norm": 0.52734375, "grad_norm_var": 0.0008835951487223307, "learning_rate": 2e-05, "loss": 1.2574, "loss/crossentropy": 2.673330545425415, "loss/dist_ce": 0.0, "loss/fcd": 1.078125, "loss/idx": 13.0, "loss/logits": 0.1793064922094345, "step": 1578 }, { "epoch": 0.023577721367776618, "grad_norm": 0.5390625, "grad_norm_var": 0.0008727868398030598, "learning_rate": 2e-05, "loss": 1.2391, "loss/crossentropy": 2.4731786251068115, "loss/dist_ce": 0.0, "loss/fcd": 1.0546875, "loss/idx": 13.0, "loss/logits": 0.1844569444656372, "step": 1579 }, { "epoch": 0.02359265342690757, "grad_norm": 0.7890625, "grad_norm_var": 0.004923105239868164, "learning_rate": 2e-05, "loss": 1.335, "loss/crossentropy": 2.567404270172119, "loss/dist_ce": 0.0, "loss/fcd": 1.15625, "loss/idx": 13.0, "loss/logits": 0.17879244685173035, "step": 1580 }, { "epoch": 0.023607585486038526, "grad_norm": 0.640625, "grad_norm_var": 0.005232747395833333, "learning_rate": 2e-05, "loss": 1.4227, "loss/crossentropy": 2.81486177444458, "loss/dist_ce": 0.0, "loss/fcd": 1.203125, "loss/idx": 13.0, "loss/logits": 0.21953287720680237, "step": 1581 }, { "epoch": 0.023622517545169478, "grad_norm": 0.55078125, "grad_norm_var": 0.005023638407389323, "learning_rate": 2e-05, "loss": 1.3109, "loss/crossentropy": 2.5990700721740723, "loss/dist_ce": 0.0, "loss/fcd": 1.125, "loss/idx": 13.0, "loss/logits": 0.18592441082000732, "step": 1582 }, { "epoch": 0.023637449604300434, "grad_norm": 0.50390625, "grad_norm_var": 0.005098215738932292, "learning_rate": 2e-05, "loss": 1.1859, "loss/crossentropy": 2.800184488296509, "loss/dist_ce": 0.0, "loss/fcd": 1.03125, "loss/idx": 13.0, "loss/logits": 0.15463992953300476, "step": 1583 }, { "epoch": 0.023652381663431386, "grad_norm": 0.578125, "grad_norm_var": 0.005019060770670573, "learning_rate": 2e-05, "loss": 1.3002, "loss/crossentropy": 2.5168678760528564, "loss/dist_ce": 0.0, "loss/fcd": 1.109375, "loss/idx": 13.0, "loss/logits": 0.19082492589950562, "step": 1584 }, { "epoch": 0.023667313722562342, "grad_norm": 0.59375, "grad_norm_var": 0.005083147684733073, "learning_rate": 2e-05, "loss": 1.2893, "loss/crossentropy": 2.3718113899230957, "loss/dist_ce": 0.0, "loss/fcd": 1.1015625, "loss/idx": 13.0, "loss/logits": 0.18773557245731354, "step": 1585 }, { "epoch": 0.023682245781693295, "grad_norm": 0.53515625, "grad_norm_var": 0.0049855550130208336, "learning_rate": 2e-05, "loss": 1.2704, "loss/crossentropy": 2.434890031814575, "loss/dist_ce": 0.0, "loss/fcd": 1.078125, "loss/idx": 13.0, "loss/logits": 0.19229555130004883, "step": 1586 }, { "epoch": 0.02369717784082425, "grad_norm": 0.57421875, "grad_norm_var": 0.00497887929280599, "learning_rate": 2e-05, "loss": 1.2931, "loss/crossentropy": 2.461606025695801, "loss/dist_ce": 0.0, "loss/fcd": 1.1171875, "loss/idx": 13.0, "loss/logits": 0.1758810579776764, "step": 1587 }, { "epoch": 0.023712109899955203, "grad_norm": 0.515625, "grad_norm_var": 0.005114173889160157, "learning_rate": 2e-05, "loss": 1.2502, "loss/crossentropy": 2.5047669410705566, "loss/dist_ce": 0.0, "loss/fcd": 1.0703125, "loss/idx": 13.0, "loss/logits": 0.179841548204422, "step": 1588 }, { "epoch": 0.02372704195908616, "grad_norm": 0.498046875, "grad_norm_var": 0.005258417129516602, "learning_rate": 2e-05, "loss": 1.1957, "loss/crossentropy": 2.441140651702881, "loss/dist_ce": 0.0, "loss/fcd": 1.046875, "loss/idx": 13.0, "loss/logits": 0.1488042175769806, "step": 1589 }, { "epoch": 0.02374197401821711, "grad_norm": 0.486328125, "grad_norm_var": 0.0054323832194010414, "learning_rate": 2e-05, "loss": 1.1902, "loss/crossentropy": 2.5383846759796143, "loss/dist_ce": 0.0, "loss/fcd": 1.03125, "loss/idx": 13.0, "loss/logits": 0.15891912579536438, "step": 1590 }, { "epoch": 0.023756906077348067, "grad_norm": 0.482421875, "grad_norm_var": 0.005760685602823893, "learning_rate": 2e-05, "loss": 1.2418, "loss/crossentropy": 2.5855181217193604, "loss/dist_ce": 0.0, "loss/fcd": 1.0625, "loss/idx": 13.0, "loss/logits": 0.17932924628257751, "step": 1591 }, { "epoch": 0.02377183813647902, "grad_norm": 0.53125, "grad_norm_var": 0.005790440241495768, "learning_rate": 2e-05, "loss": 1.2238, "loss/crossentropy": 2.640465259552002, "loss/dist_ce": 0.0, "loss/fcd": 1.046875, "loss/idx": 13.0, "loss/logits": 0.17696192860603333, "step": 1592 }, { "epoch": 0.023786770195609975, "grad_norm": 0.58984375, "grad_norm_var": 0.005661757787068685, "learning_rate": 2e-05, "loss": 1.3372, "loss/crossentropy": 2.1728391647338867, "loss/dist_ce": 0.0, "loss/fcd": 1.1328125, "loss/idx": 13.0, "loss/logits": 0.20438703894615173, "step": 1593 }, { "epoch": 0.023801702254740927, "grad_norm": 0.5234375, "grad_norm_var": 0.005678923924763998, "learning_rate": 2e-05, "loss": 1.2761, "loss/crossentropy": 2.580386161804199, "loss/dist_ce": 0.0, "loss/fcd": 1.09375, "loss/idx": 13.0, "loss/logits": 0.18235522508621216, "step": 1594 }, { "epoch": 0.023816634313871883, "grad_norm": 0.66796875, "grad_norm_var": 0.006388076146443685, "learning_rate": 2e-05, "loss": 1.4137, "loss/crossentropy": 2.3353309631347656, "loss/dist_ce": 0.0, "loss/fcd": 1.2109375, "loss/idx": 13.0, "loss/logits": 0.2027420699596405, "step": 1595 }, { "epoch": 0.023831566373002835, "grad_norm": 0.58984375, "grad_norm_var": 0.0029510339101155598, "learning_rate": 2e-05, "loss": 1.3433, "loss/crossentropy": 2.4447288513183594, "loss/dist_ce": 0.0, "loss/fcd": 1.1484375, "loss/idx": 13.0, "loss/logits": 0.19487924873828888, "step": 1596 }, { "epoch": 0.02384649843213379, "grad_norm": 0.6171875, "grad_norm_var": 0.002714141209920247, "learning_rate": 2e-05, "loss": 1.3034, "loss/crossentropy": 2.6335086822509766, "loss/dist_ce": 0.0, "loss/fcd": 1.125, "loss/idx": 13.0, "loss/logits": 0.17839547991752625, "step": 1597 }, { "epoch": 0.023861430491264747, "grad_norm": 0.546875, "grad_norm_var": 0.002715921401977539, "learning_rate": 2e-05, "loss": 1.2338, "loss/crossentropy": 2.8217856884002686, "loss/dist_ce": 0.0, "loss/fcd": 1.0625, "loss/idx": 13.0, "loss/logits": 0.1713387668132782, "step": 1598 }, { "epoch": 0.0238763625503957, "grad_norm": 0.57421875, "grad_norm_var": 0.0025728702545166015, "learning_rate": 2e-05, "loss": 1.3584, "loss/crossentropy": 2.3990859985351562, "loss/dist_ce": 0.0, "loss/fcd": 1.171875, "loss/idx": 13.0, "loss/logits": 0.18652434647083282, "step": 1599 }, { "epoch": 0.023891294609526655, "grad_norm": 0.48046875, "grad_norm_var": 0.002887582778930664, "learning_rate": 2e-05, "loss": 1.2107, "loss/crossentropy": 2.350006103515625, "loss/dist_ce": 0.0, "loss/fcd": 1.046875, "loss/idx": 13.0, "loss/logits": 0.16381904482841492, "step": 1600 }, { "epoch": 0.023906226668657608, "grad_norm": 0.55859375, "grad_norm_var": 0.002761697769165039, "learning_rate": 2e-05, "loss": 1.2017, "loss/crossentropy": 2.7609221935272217, "loss/dist_ce": 0.0, "loss/fcd": 1.0390625, "loss/idx": 13.0, "loss/logits": 0.16260434687137604, "step": 1601 }, { "epoch": 0.023921158727788563, "grad_norm": 0.5703125, "grad_norm_var": 0.002777719497680664, "learning_rate": 2e-05, "loss": 1.2588, "loss/crossentropy": 2.6673271656036377, "loss/dist_ce": 0.0, "loss/fcd": 1.078125, "loss/idx": 13.0, "loss/logits": 0.18064871430397034, "step": 1602 }, { "epoch": 0.023936090786919516, "grad_norm": 0.5703125, "grad_norm_var": 0.002766275405883789, "learning_rate": 2e-05, "loss": 1.3639, "loss/crossentropy": 2.7430176734924316, "loss/dist_ce": 0.0, "loss/fcd": 1.171875, "loss/idx": 13.0, "loss/logits": 0.19205442070960999, "step": 1603 }, { "epoch": 0.02395102284605047, "grad_norm": 0.494140625, "grad_norm_var": 0.0028940836588541665, "learning_rate": 2e-05, "loss": 1.2124, "loss/crossentropy": 2.53092360496521, "loss/dist_ce": 0.0, "loss/fcd": 1.046875, "loss/idx": 13.0, "loss/logits": 0.1655370444059372, "step": 1604 }, { "epoch": 0.023965954905181424, "grad_norm": 0.55078125, "grad_norm_var": 0.002710835138956706, "learning_rate": 2e-05, "loss": 1.3779, "loss/crossentropy": 2.3667619228363037, "loss/dist_ce": 0.0, "loss/fcd": 1.1796875, "loss/idx": 13.0, "loss/logits": 0.19819000363349915, "step": 1605 }, { "epoch": 0.02398088696431238, "grad_norm": 0.55078125, "grad_norm_var": 0.0024050394694010418, "learning_rate": 2e-05, "loss": 1.2546, "loss/crossentropy": 2.244283676147461, "loss/dist_ce": 0.0, "loss/fcd": 1.0859375, "loss/idx": 13.0, "loss/logits": 0.1686820089817047, "step": 1606 }, { "epoch": 0.023995819023443332, "grad_norm": 0.5625, "grad_norm_var": 0.002018594741821289, "learning_rate": 2e-05, "loss": 1.2936, "loss/crossentropy": 2.713747501373291, "loss/dist_ce": 0.0, "loss/fcd": 1.1171875, "loss/idx": 13.0, "loss/logits": 0.1764499694108963, "step": 1607 }, { "epoch": 0.024010751082574288, "grad_norm": 0.47265625, "grad_norm_var": 0.0024668216705322266, "learning_rate": 2e-05, "loss": 1.1716, "loss/crossentropy": 2.828679323196411, "loss/dist_ce": 0.0, "loss/fcd": 1.015625, "loss/idx": 13.0, "loss/logits": 0.155990868806839, "step": 1608 }, { "epoch": 0.02402568314170524, "grad_norm": 0.76171875, "grad_norm_var": 0.005054457982381185, "learning_rate": 2e-05, "loss": 1.2463, "loss/crossentropy": 2.567664623260498, "loss/dist_ce": 0.0, "loss/fcd": 1.078125, "loss/idx": 13.0, "loss/logits": 0.16816666722297668, "step": 1609 }, { "epoch": 0.024040615200836196, "grad_norm": 0.98828125, "grad_norm_var": 0.015782785415649415, "learning_rate": 2e-05, "loss": 1.342, "loss/crossentropy": 2.756133556365967, "loss/dist_ce": 0.0, "loss/fcd": 1.140625, "loss/idx": 13.0, "loss/logits": 0.20140567421913147, "step": 1610 }, { "epoch": 0.02405554725996715, "grad_norm": 0.484375, "grad_norm_var": 0.0161592960357666, "learning_rate": 2e-05, "loss": 1.2295, "loss/crossentropy": 2.6116912364959717, "loss/dist_ce": 0.0, "loss/fcd": 1.0546875, "loss/idx": 13.0, "loss/logits": 0.1748572140932083, "step": 1611 }, { "epoch": 0.024070479319098104, "grad_norm": 0.51171875, "grad_norm_var": 0.016498804092407227, "learning_rate": 2e-05, "loss": 1.2077, "loss/crossentropy": 2.7342991828918457, "loss/dist_ce": 0.0, "loss/fcd": 1.046875, "loss/idx": 13.0, "loss/logits": 0.16085676848888397, "step": 1612 }, { "epoch": 0.024085411378229057, "grad_norm": 0.57421875, "grad_norm_var": 0.016406488418579102, "learning_rate": 2e-05, "loss": 1.3091, "loss/crossentropy": 2.641632318496704, "loss/dist_ce": 0.0, "loss/fcd": 1.125, "loss/idx": 13.0, "loss/logits": 0.18407666683197021, "step": 1613 }, { "epoch": 0.024100343437360013, "grad_norm": 0.470703125, "grad_norm_var": 0.017087745666503906, "learning_rate": 2e-05, "loss": 1.1709, "loss/crossentropy": 2.5103869438171387, "loss/dist_ce": 0.0, "loss/fcd": 1.0234375, "loss/idx": 13.0, "loss/logits": 0.14749841392040253, "step": 1614 }, { "epoch": 0.024115275496490965, "grad_norm": 0.494140625, "grad_norm_var": 0.0174807071685791, "learning_rate": 2e-05, "loss": 1.242, "loss/crossentropy": 2.5742335319519043, "loss/dist_ce": 0.0, "loss/fcd": 1.0703125, "loss/idx": 13.0, "loss/logits": 0.1717085838317871, "step": 1615 }, { "epoch": 0.02413020755562192, "grad_norm": 0.546875, "grad_norm_var": 0.01697703997294108, "learning_rate": 2e-05, "loss": 1.3399, "loss/crossentropy": 2.6849160194396973, "loss/dist_ce": 0.0, "loss/fcd": 1.140625, "loss/idx": 13.0, "loss/logits": 0.19930125772953033, "step": 1616 }, { "epoch": 0.024145139614752873, "grad_norm": 0.56640625, "grad_norm_var": 0.01696623166402181, "learning_rate": 2e-05, "loss": 1.3899, "loss/crossentropy": 2.5048611164093018, "loss/dist_ce": 0.0, "loss/fcd": 1.171875, "loss/idx": 13.0, "loss/logits": 0.2179987132549286, "step": 1617 }, { "epoch": 0.02416007167388383, "grad_norm": 0.59765625, "grad_norm_var": 0.01700272560119629, "learning_rate": 2e-05, "loss": 1.3458, "loss/crossentropy": 2.225867509841919, "loss/dist_ce": 0.0, "loss/fcd": 1.171875, "loss/idx": 13.0, "loss/logits": 0.17394158244132996, "step": 1618 }, { "epoch": 0.02417500373301478, "grad_norm": 0.5546875, "grad_norm_var": 0.01702739397684733, "learning_rate": 2e-05, "loss": 1.2265, "loss/crossentropy": 2.765789747238159, "loss/dist_ce": 0.0, "loss/fcd": 1.0546875, "loss/idx": 13.0, "loss/logits": 0.17184340953826904, "step": 1619 }, { "epoch": 0.024189935792145737, "grad_norm": 0.62890625, "grad_norm_var": 0.01673018137613932, "learning_rate": 2e-05, "loss": 1.2812, "loss/crossentropy": 2.774526596069336, "loss/dist_ce": 0.0, "loss/fcd": 1.109375, "loss/idx": 13.0, "loss/logits": 0.17186947166919708, "step": 1620 }, { "epoch": 0.02420486785127669, "grad_norm": 0.57421875, "grad_norm_var": 0.016666094462076824, "learning_rate": 2e-05, "loss": 1.3168, "loss/crossentropy": 2.60170578956604, "loss/dist_ce": 0.0, "loss/fcd": 1.125, "loss/idx": 13.0, "loss/logits": 0.19177797436714172, "step": 1621 }, { "epoch": 0.024219799910407645, "grad_norm": 0.5546875, "grad_norm_var": 0.016649881998697918, "learning_rate": 2e-05, "loss": 1.2087, "loss/crossentropy": 2.606919527053833, "loss/dist_ce": 0.0, "loss/fcd": 1.0390625, "loss/idx": 13.0, "loss/logits": 0.16964177787303925, "step": 1622 }, { "epoch": 0.024234731969538598, "grad_norm": 0.625, "grad_norm_var": 0.016714986165364584, "learning_rate": 2e-05, "loss": 1.3444, "loss/crossentropy": 2.387601375579834, "loss/dist_ce": 0.0, "loss/fcd": 1.171875, "loss/idx": 13.0, "loss/logits": 0.1725083887577057, "step": 1623 }, { "epoch": 0.024249664028669553, "grad_norm": 0.5546875, "grad_norm_var": 0.01587518056233724, "learning_rate": 2e-05, "loss": 1.3219, "loss/crossentropy": 2.5789592266082764, "loss/dist_ce": 0.0, "loss/fcd": 1.125, "loss/idx": 13.0, "loss/logits": 0.19694536924362183, "step": 1624 }, { "epoch": 0.02426459608780051, "grad_norm": 0.484375, "grad_norm_var": 0.014444224039713542, "learning_rate": 2e-05, "loss": 1.2397, "loss/crossentropy": 2.5164992809295654, "loss/dist_ce": 0.0, "loss/fcd": 1.0625, "loss/idx": 13.0, "loss/logits": 0.17721965909004211, "step": 1625 }, { "epoch": 0.02427952814693146, "grad_norm": 0.625, "grad_norm_var": 0.002707354227701823, "learning_rate": 2e-05, "loss": 1.2661, "loss/crossentropy": 2.8884806632995605, "loss/dist_ce": 0.0, "loss/fcd": 1.0859375, "loss/idx": 13.0, "loss/logits": 0.18014132976531982, "step": 1626 }, { "epoch": 0.024294460206062417, "grad_norm": 0.55859375, "grad_norm_var": 0.00237274169921875, "learning_rate": 2e-05, "loss": 1.4009, "loss/crossentropy": 2.534945249557495, "loss/dist_ce": 0.0, "loss/fcd": 1.203125, "loss/idx": 13.0, "loss/logits": 0.19778071343898773, "step": 1627 }, { "epoch": 0.02430939226519337, "grad_norm": 0.5625, "grad_norm_var": 0.0022231419881184895, "learning_rate": 2e-05, "loss": 1.3571, "loss/crossentropy": 2.6048007011413574, "loss/dist_ce": 0.0, "loss/fcd": 1.1484375, "loss/idx": 13.0, "loss/logits": 0.20868733525276184, "step": 1628 }, { "epoch": 0.024324324324324326, "grad_norm": 0.76953125, "grad_norm_var": 0.004957008361816406, "learning_rate": 2e-05, "loss": 1.4905, "loss/crossentropy": 2.398721218109131, "loss/dist_ce": 0.0, "loss/fcd": 1.265625, "loss/idx": 13.0, "loss/logits": 0.22485968470573425, "step": 1629 }, { "epoch": 0.024339256383455278, "grad_norm": 0.48828125, "grad_norm_var": 0.004736566543579101, "learning_rate": 2e-05, "loss": 1.1915, "loss/crossentropy": 2.464036703109741, "loss/dist_ce": 0.0, "loss/fcd": 1.03125, "loss/idx": 13.0, "loss/logits": 0.1602393090724945, "step": 1630 }, { "epoch": 0.024354188442586234, "grad_norm": 0.5078125, "grad_norm_var": 0.004602495829264323, "learning_rate": 2e-05, "loss": 1.174, "loss/crossentropy": 2.519684076309204, "loss/dist_ce": 0.0, "loss/fcd": 1.015625, "loss/idx": 13.0, "loss/logits": 0.15833880007266998, "step": 1631 }, { "epoch": 0.024369120501717186, "grad_norm": 0.578125, "grad_norm_var": 0.004546546936035156, "learning_rate": 2e-05, "loss": 1.3098, "loss/crossentropy": 2.3867616653442383, "loss/dist_ce": 0.0, "loss/fcd": 1.125, "loss/idx": 13.0, "loss/logits": 0.18479761481285095, "step": 1632 }, { "epoch": 0.024384052560848142, "grad_norm": 0.53515625, "grad_norm_var": 0.004651323954264323, "learning_rate": 2e-05, "loss": 1.2205, "loss/crossentropy": 2.6318466663360596, "loss/dist_ce": 0.0, "loss/fcd": 1.0546875, "loss/idx": 13.0, "loss/logits": 0.16580361127853394, "step": 1633 }, { "epoch": 0.024398984619979094, "grad_norm": 0.486328125, "grad_norm_var": 0.005088917414347331, "learning_rate": 2e-05, "loss": 1.1839, "loss/crossentropy": 2.615931749343872, "loss/dist_ce": 0.0, "loss/fcd": 1.03125, "loss/idx": 13.0, "loss/logits": 0.15263916552066803, "step": 1634 }, { "epoch": 0.02441391667911005, "grad_norm": 0.5078125, "grad_norm_var": 0.005309406916300456, "learning_rate": 2e-05, "loss": 1.1888, "loss/crossentropy": 2.495640277862549, "loss/dist_ce": 0.0, "loss/fcd": 1.03125, "loss/idx": 13.0, "loss/logits": 0.15757544338703156, "step": 1635 }, { "epoch": 0.024428848738241003, "grad_norm": 0.53125, "grad_norm_var": 0.005074167251586914, "learning_rate": 2e-05, "loss": 1.1999, "loss/crossentropy": 2.633312463760376, "loss/dist_ce": 0.0, "loss/fcd": 1.03125, "loss/idx": 13.0, "loss/logits": 0.1686273217201233, "step": 1636 }, { "epoch": 0.02444378079737196, "grad_norm": 0.478515625, "grad_norm_var": 0.00545190175374349, "learning_rate": 2e-05, "loss": 1.2016, "loss/crossentropy": 2.4377596378326416, "loss/dist_ce": 0.0, "loss/fcd": 1.0390625, "loss/idx": 13.0, "loss/logits": 0.16258299350738525, "step": 1637 }, { "epoch": 0.02445871285650291, "grad_norm": 0.51953125, "grad_norm_var": 0.005521138509114583, "learning_rate": 2e-05, "loss": 1.184, "loss/crossentropy": 2.7175605297088623, "loss/dist_ce": 0.0, "loss/fcd": 1.0234375, "loss/idx": 13.0, "loss/logits": 0.16056030988693237, "step": 1638 }, { "epoch": 0.024473644915633867, "grad_norm": 0.5390625, "grad_norm_var": 0.005132293701171875, "learning_rate": 2e-05, "loss": 1.2285, "loss/crossentropy": 2.648740530014038, "loss/dist_ce": 0.0, "loss/fcd": 1.0546875, "loss/idx": 13.0, "loss/logits": 0.17380774021148682, "step": 1639 }, { "epoch": 0.02448857697476482, "grad_norm": 0.55859375, "grad_norm_var": 0.005138079325358073, "learning_rate": 2e-05, "loss": 1.2751, "loss/crossentropy": 2.3271639347076416, "loss/dist_ce": 0.0, "loss/fcd": 1.09375, "loss/idx": 13.0, "loss/logits": 0.18137691915035248, "step": 1640 }, { "epoch": 0.024503509033895775, "grad_norm": 0.482421875, "grad_norm_var": 0.005154275894165039, "learning_rate": 2e-05, "loss": 1.2039, "loss/crossentropy": 2.5689260959625244, "loss/dist_ce": 0.0, "loss/fcd": 1.0390625, "loss/idx": 13.0, "loss/logits": 0.16480699181556702, "step": 1641 }, { "epoch": 0.024518441093026727, "grad_norm": 1.40625, "grad_norm_var": 0.05157914161682129, "learning_rate": 2e-05, "loss": 1.6373, "loss/crossentropy": 2.645709991455078, "loss/dist_ce": 0.0, "loss/fcd": 1.3828125, "loss/idx": 13.0, "loss/logits": 0.25451111793518066, "step": 1642 }, { "epoch": 0.024533373152157683, "grad_norm": 0.5390625, "grad_norm_var": 0.051696125666300455, "learning_rate": 2e-05, "loss": 1.2251, "loss/crossentropy": 2.5970401763916016, "loss/dist_ce": 0.0, "loss/fcd": 1.0546875, "loss/idx": 13.0, "loss/logits": 0.17040389776229858, "step": 1643 }, { "epoch": 0.024548305211288635, "grad_norm": 0.462890625, "grad_norm_var": 0.05272318522135417, "learning_rate": 2e-05, "loss": 1.1941, "loss/crossentropy": 2.482206344604492, "loss/dist_ce": 0.0, "loss/fcd": 1.046875, "loss/idx": 13.0, "loss/logits": 0.14718057215213776, "step": 1644 }, { "epoch": 0.02456323727041959, "grad_norm": 0.54296875, "grad_norm_var": 0.050414784749348955, "learning_rate": 2e-05, "loss": 1.3091, "loss/crossentropy": 2.300069808959961, "loss/dist_ce": 0.0, "loss/fcd": 1.1171875, "loss/idx": 13.0, "loss/logits": 0.19193615019321442, "step": 1645 }, { "epoch": 0.024578169329550544, "grad_norm": 0.5390625, "grad_norm_var": 0.050004005432128906, "learning_rate": 2e-05, "loss": 1.2527, "loss/crossentropy": 2.6057968139648438, "loss/dist_ce": 0.0, "loss/fcd": 1.0703125, "loss/idx": 13.0, "loss/logits": 0.18236851692199707, "step": 1646 }, { "epoch": 0.0245931013886815, "grad_norm": 0.51953125, "grad_norm_var": 0.049906158447265626, "learning_rate": 2e-05, "loss": 1.2824, "loss/crossentropy": 2.6754231452941895, "loss/dist_ce": 0.0, "loss/fcd": 1.078125, "loss/idx": 13.0, "loss/logits": 0.2042427510023117, "step": 1647 }, { "epoch": 0.02460803344781245, "grad_norm": 0.48828125, "grad_norm_var": 0.050393104553222656, "learning_rate": 2e-05, "loss": 1.156, "loss/crossentropy": 2.59796142578125, "loss/dist_ce": 0.0, "loss/fcd": 1.0, "loss/idx": 13.0, "loss/logits": 0.156040221452713, "step": 1648 }, { "epoch": 0.024622965506943408, "grad_norm": 0.640625, "grad_norm_var": 0.05058364868164063, "learning_rate": 2e-05, "loss": 1.336, "loss/crossentropy": 2.5802266597747803, "loss/dist_ce": 0.0, "loss/fcd": 1.1484375, "loss/idx": 13.0, "loss/logits": 0.1875656247138977, "step": 1649 }, { "epoch": 0.024637897566074363, "grad_norm": 0.6328125, "grad_norm_var": 0.05014138221740723, "learning_rate": 2e-05, "loss": 1.2801, "loss/crossentropy": 2.673807382583618, "loss/dist_ce": 0.0, "loss/fcd": 1.0859375, "loss/idx": 13.0, "loss/logits": 0.19418036937713623, "step": 1650 }, { "epoch": 0.024652829625205316, "grad_norm": 0.66796875, "grad_norm_var": 0.05005796750386556, "learning_rate": 2e-05, "loss": 1.5013, "loss/crossentropy": 2.436178684234619, "loss/dist_ce": 0.0, "loss/fcd": 1.2734375, "loss/idx": 13.0, "loss/logits": 0.22788530588150024, "step": 1651 }, { "epoch": 0.02466776168433627, "grad_norm": 0.5546875, "grad_norm_var": 0.04988745053609212, "learning_rate": 2e-05, "loss": 1.3592, "loss/crossentropy": 2.245730400085449, "loss/dist_ce": 0.0, "loss/fcd": 1.1484375, "loss/idx": 13.0, "loss/logits": 0.21073272824287415, "step": 1652 }, { "epoch": 0.024682693743467224, "grad_norm": 0.51953125, "grad_norm_var": 0.04933770497639974, "learning_rate": 2e-05, "loss": 1.2983, "loss/crossentropy": 2.5788753032684326, "loss/dist_ce": 0.0, "loss/fcd": 1.109375, "loss/idx": 13.0, "loss/logits": 0.1889631152153015, "step": 1653 }, { "epoch": 0.02469762580259818, "grad_norm": 0.494140625, "grad_norm_var": 0.04965322812398275, "learning_rate": 2e-05, "loss": 1.1939, "loss/crossentropy": 2.6785736083984375, "loss/dist_ce": 0.0, "loss/fcd": 1.03125, "loss/idx": 13.0, "loss/logits": 0.16263613104820251, "step": 1654 }, { "epoch": 0.024712557861729132, "grad_norm": 0.5078125, "grad_norm_var": 0.04996501604715983, "learning_rate": 2e-05, "loss": 1.2169, "loss/crossentropy": 2.441274404525757, "loss/dist_ce": 0.0, "loss/fcd": 1.0625, "loss/idx": 13.0, "loss/logits": 0.1544494479894638, "step": 1655 }, { "epoch": 0.024727489920860088, "grad_norm": 0.55078125, "grad_norm_var": 0.05000913937886556, "learning_rate": 2e-05, "loss": 1.2835, "loss/crossentropy": 2.6980459690093994, "loss/dist_ce": 0.0, "loss/fcd": 1.1015625, "loss/idx": 13.0, "loss/logits": 0.1819465607404709, "step": 1656 }, { "epoch": 0.02474242197999104, "grad_norm": 0.53125, "grad_norm_var": 0.04941349029541016, "learning_rate": 2e-05, "loss": 1.2336, "loss/crossentropy": 2.562973976135254, "loss/dist_ce": 0.0, "loss/fcd": 1.0703125, "loss/idx": 13.0, "loss/logits": 0.1632436215877533, "step": 1657 }, { "epoch": 0.024757354039121996, "grad_norm": 0.53125, "grad_norm_var": 0.003185462951660156, "learning_rate": 2e-05, "loss": 1.2196, "loss/crossentropy": 2.669048547744751, "loss/dist_ce": 0.0, "loss/fcd": 1.046875, "loss/idx": 13.0, "loss/logits": 0.17273685336112976, "step": 1658 }, { "epoch": 0.02477228609825295, "grad_norm": 0.43359375, "grad_norm_var": 0.003966522216796875, "learning_rate": 2e-05, "loss": 1.1219, "loss/crossentropy": 2.5299949645996094, "loss/dist_ce": 0.0, "loss/fcd": 0.9765625, "loss/idx": 13.0, "loss/logits": 0.14536432921886444, "step": 1659 }, { "epoch": 0.024787218157383904, "grad_norm": 0.5390625, "grad_norm_var": 0.003560495376586914, "learning_rate": 2e-05, "loss": 1.2227, "loss/crossentropy": 2.4549379348754883, "loss/dist_ce": 0.0, "loss/fcd": 1.0546875, "loss/idx": 13.0, "loss/logits": 0.16801050305366516, "step": 1660 }, { "epoch": 0.024802150216514857, "grad_norm": 0.5, "grad_norm_var": 0.003677988052368164, "learning_rate": 2e-05, "loss": 1.1772, "loss/crossentropy": 2.661327600479126, "loss/dist_ce": 0.0, "loss/fcd": 1.0234375, "loss/idx": 13.0, "loss/logits": 0.15374130010604858, "step": 1661 }, { "epoch": 0.024817082275645812, "grad_norm": 0.484375, "grad_norm_var": 0.003876479466756185, "learning_rate": 2e-05, "loss": 1.163, "loss/crossentropy": 2.5966739654541016, "loss/dist_ce": 0.0, "loss/fcd": 1.0, "loss/idx": 13.0, "loss/logits": 0.16298674046993256, "step": 1662 }, { "epoch": 0.024832014334776765, "grad_norm": 0.50390625, "grad_norm_var": 0.003928613662719726, "learning_rate": 2e-05, "loss": 1.1938, "loss/crossentropy": 2.5892300605773926, "loss/dist_ce": 0.0, "loss/fcd": 1.0234375, "loss/idx": 13.0, "loss/logits": 0.17037644982337952, "step": 1663 }, { "epoch": 0.02484694639390772, "grad_norm": 0.60546875, "grad_norm_var": 0.004037332534790039, "learning_rate": 2e-05, "loss": 1.4414, "loss/crossentropy": 2.285784959793091, "loss/dist_ce": 0.0, "loss/fcd": 1.2421875, "loss/idx": 13.0, "loss/logits": 0.19916491210460663, "step": 1664 }, { "epoch": 0.024861878453038673, "grad_norm": 0.58984375, "grad_norm_var": 0.003541421890258789, "learning_rate": 2e-05, "loss": 1.2812, "loss/crossentropy": 2.531317949295044, "loss/dist_ce": 0.0, "loss/fcd": 1.1015625, "loss/idx": 13.0, "loss/logits": 0.1796582043170929, "step": 1665 }, { "epoch": 0.02487681051216963, "grad_norm": 0.49609375, "grad_norm_var": 0.003025166193644206, "learning_rate": 2e-05, "loss": 1.237, "loss/crossentropy": 2.617490530014038, "loss/dist_ce": 0.0, "loss/fcd": 1.0546875, "loss/idx": 13.0, "loss/logits": 0.1823294758796692, "step": 1666 }, { "epoch": 0.02489174257130058, "grad_norm": 0.47265625, "grad_norm_var": 0.0018648624420166016, "learning_rate": 2e-05, "loss": 1.2051, "loss/crossentropy": 2.699169635772705, "loss/dist_ce": 0.0, "loss/fcd": 1.0390625, "loss/idx": 13.0, "loss/logits": 0.16599290072917938, "step": 1667 }, { "epoch": 0.024906674630431537, "grad_norm": 0.60546875, "grad_norm_var": 0.0022632439931233725, "learning_rate": 2e-05, "loss": 1.391, "loss/crossentropy": 2.284773588180542, "loss/dist_ce": 0.0, "loss/fcd": 1.1796875, "loss/idx": 13.0, "loss/logits": 0.21133294701576233, "step": 1668 }, { "epoch": 0.02492160668956249, "grad_norm": 0.52734375, "grad_norm_var": 0.002263625462849935, "learning_rate": 2e-05, "loss": 1.3096, "loss/crossentropy": 2.398963212966919, "loss/dist_ce": 0.0, "loss/fcd": 1.1328125, "loss/idx": 13.0, "loss/logits": 0.17675255239009857, "step": 1669 }, { "epoch": 0.024936538748693445, "grad_norm": 0.515625, "grad_norm_var": 0.002208900451660156, "learning_rate": 2e-05, "loss": 1.2286, "loss/crossentropy": 2.7221715450286865, "loss/dist_ce": 0.0, "loss/fcd": 1.0546875, "loss/idx": 13.0, "loss/logits": 0.17392107844352722, "step": 1670 }, { "epoch": 0.024951470807824398, "grad_norm": 0.51171875, "grad_norm_var": 0.002201080322265625, "learning_rate": 2e-05, "loss": 1.2543, "loss/crossentropy": 2.7048890590667725, "loss/dist_ce": 0.0, "loss/fcd": 1.0703125, "loss/idx": 13.0, "loss/logits": 0.18397437036037445, "step": 1671 }, { "epoch": 0.024966402866955353, "grad_norm": 0.6015625, "grad_norm_var": 0.0025374730428059894, "learning_rate": 2e-05, "loss": 1.2544, "loss/crossentropy": 2.427884101867676, "loss/dist_ce": 0.0, "loss/fcd": 1.0859375, "loss/idx": 13.0, "loss/logits": 0.16849087178707123, "step": 1672 }, { "epoch": 0.024981334926086306, "grad_norm": 0.546875, "grad_norm_var": 0.0025593439737955728, "learning_rate": 2e-05, "loss": 1.2987, "loss/crossentropy": 2.7837252616882324, "loss/dist_ce": 0.0, "loss/fcd": 1.109375, "loss/idx": 13.0, "loss/logits": 0.1893191933631897, "step": 1673 }, { "epoch": 0.02499626698521726, "grad_norm": 0.5390625, "grad_norm_var": 0.002565447489420573, "learning_rate": 2e-05, "loss": 1.3511, "loss/crossentropy": 2.4340031147003174, "loss/dist_ce": 0.0, "loss/fcd": 1.140625, "loss/idx": 13.0, "loss/logits": 0.21051189303398132, "step": 1674 }, { "epoch": 0.025011199044348214, "grad_norm": 0.5078125, "grad_norm_var": 0.0019602457682291667, "learning_rate": 2e-05, "loss": 1.1157, "loss/crossentropy": 2.662384510040283, "loss/dist_ce": 0.0, "loss/fcd": 0.96875, "loss/idx": 13.0, "loss/logits": 0.1469482183456421, "step": 1675 }, { "epoch": 0.02502613110347917, "grad_norm": 0.51171875, "grad_norm_var": 0.0019891738891601564, "learning_rate": 2e-05, "loss": 1.1957, "loss/crossentropy": 2.637355327606201, "loss/dist_ce": 0.0, "loss/fcd": 1.03125, "loss/idx": 13.0, "loss/logits": 0.1644275039434433, "step": 1676 }, { "epoch": 0.025041063162610126, "grad_norm": 0.60546875, "grad_norm_var": 0.002227783203125, "learning_rate": 2e-05, "loss": 1.2562, "loss/crossentropy": 2.616171360015869, "loss/dist_ce": 0.0, "loss/fcd": 1.0859375, "loss/idx": 13.0, "loss/logits": 0.17029576003551483, "step": 1677 }, { "epoch": 0.025055995221741078, "grad_norm": 0.73046875, "grad_norm_var": 0.004218482971191406, "learning_rate": 2e-05, "loss": 1.3086, "loss/crossentropy": 2.3912839889526367, "loss/dist_ce": 0.0, "loss/fcd": 1.1171875, "loss/idx": 13.0, "loss/logits": 0.19144338369369507, "step": 1678 }, { "epoch": 0.025070927280872034, "grad_norm": 0.52734375, "grad_norm_var": 0.004094886779785156, "learning_rate": 2e-05, "loss": 1.2694, "loss/crossentropy": 2.4914817810058594, "loss/dist_ce": 0.0, "loss/fcd": 1.1015625, "loss/idx": 13.0, "loss/logits": 0.16782964766025543, "step": 1679 }, { "epoch": 0.025085859340002986, "grad_norm": 0.48046875, "grad_norm_var": 0.004245440165201823, "learning_rate": 2e-05, "loss": 1.1771, "loss/crossentropy": 2.6493070125579834, "loss/dist_ce": 0.0, "loss/fcd": 1.0234375, "loss/idx": 13.0, "loss/logits": 0.15367591381072998, "step": 1680 }, { "epoch": 0.025100791399133942, "grad_norm": 0.58203125, "grad_norm_var": 0.004205767313639323, "learning_rate": 2e-05, "loss": 1.2893, "loss/crossentropy": 2.348886013031006, "loss/dist_ce": 0.0, "loss/fcd": 1.1015625, "loss/idx": 13.0, "loss/logits": 0.18777824938297272, "step": 1681 }, { "epoch": 0.025115723458264894, "grad_norm": 0.6796875, "grad_norm_var": 0.005051422119140625, "learning_rate": 2e-05, "loss": 1.5518, "loss/crossentropy": 2.78147029876709, "loss/dist_ce": 0.0, "loss/fcd": 1.265625, "loss/idx": 13.0, "loss/logits": 0.28619956970214844, "step": 1682 }, { "epoch": 0.02513065551739585, "grad_norm": 0.59375, "grad_norm_var": 0.004572486877441407, "learning_rate": 2e-05, "loss": 1.3742, "loss/crossentropy": 2.9234964847564697, "loss/dist_ce": 0.0, "loss/fcd": 1.1875, "loss/idx": 13.0, "loss/logits": 0.18667152523994446, "step": 1683 }, { "epoch": 0.025145587576526802, "grad_norm": 0.54296875, "grad_norm_var": 0.004493141174316406, "learning_rate": 2e-05, "loss": 1.2891, "loss/crossentropy": 2.454906702041626, "loss/dist_ce": 0.0, "loss/fcd": 1.109375, "loss/idx": 13.0, "loss/logits": 0.1797175407409668, "step": 1684 }, { "epoch": 0.02516051963565776, "grad_norm": 0.5, "grad_norm_var": 0.0046689351399739586, "learning_rate": 2e-05, "loss": 1.2387, "loss/crossentropy": 2.572845458984375, "loss/dist_ce": 0.0, "loss/fcd": 1.0625, "loss/idx": 13.0, "loss/logits": 0.1761704385280609, "step": 1685 }, { "epoch": 0.02517545169478871, "grad_norm": 0.498046875, "grad_norm_var": 0.004794677098592122, "learning_rate": 2e-05, "loss": 1.1848, "loss/crossentropy": 2.493640184402466, "loss/dist_ce": 0.0, "loss/fcd": 1.03125, "loss/idx": 13.0, "loss/logits": 0.15350091457366943, "step": 1686 }, { "epoch": 0.025190383753919666, "grad_norm": 0.5625, "grad_norm_var": 0.004629373550415039, "learning_rate": 2e-05, "loss": 1.196, "loss/crossentropy": 2.6362812519073486, "loss/dist_ce": 0.0, "loss/fcd": 1.03125, "loss/idx": 13.0, "loss/logits": 0.16470015048980713, "step": 1687 }, { "epoch": 0.02520531581305062, "grad_norm": 0.53515625, "grad_norm_var": 0.004564523696899414, "learning_rate": 2e-05, "loss": 1.2056, "loss/crossentropy": 2.431504249572754, "loss/dist_ce": 0.0, "loss/fcd": 1.046875, "loss/idx": 13.0, "loss/logits": 0.15874925255775452, "step": 1688 }, { "epoch": 0.025220247872181575, "grad_norm": 0.49609375, "grad_norm_var": 0.004807519912719727, "learning_rate": 2e-05, "loss": 1.2304, "loss/crossentropy": 2.4948151111602783, "loss/dist_ce": 0.0, "loss/fcd": 1.0625, "loss/idx": 13.0, "loss/logits": 0.16791260242462158, "step": 1689 }, { "epoch": 0.025235179931312527, "grad_norm": 0.5234375, "grad_norm_var": 0.004857619603474935, "learning_rate": 2e-05, "loss": 1.2481, "loss/crossentropy": 2.4789421558380127, "loss/dist_ce": 0.0, "loss/fcd": 1.0703125, "loss/idx": 13.0, "loss/logits": 0.1777571737766266, "step": 1690 }, { "epoch": 0.025250111990443483, "grad_norm": 0.51953125, "grad_norm_var": 0.00479276974995931, "learning_rate": 2e-05, "loss": 1.2302, "loss/crossentropy": 2.2739431858062744, "loss/dist_ce": 0.0, "loss/fcd": 1.0625, "loss/idx": 13.0, "loss/logits": 0.16770566999912262, "step": 1691 }, { "epoch": 0.025265044049574435, "grad_norm": 0.515625, "grad_norm_var": 0.004770898818969726, "learning_rate": 2e-05, "loss": 1.2194, "loss/crossentropy": 2.639508008956909, "loss/dist_ce": 0.0, "loss/fcd": 1.046875, "loss/idx": 13.0, "loss/logits": 0.1725163459777832, "step": 1692 }, { "epoch": 0.02527997610870539, "grad_norm": 0.5, "grad_norm_var": 0.004767465591430664, "learning_rate": 2e-05, "loss": 1.2304, "loss/crossentropy": 2.8804821968078613, "loss/dist_ce": 0.0, "loss/fcd": 1.0546875, "loss/idx": 13.0, "loss/logits": 0.1756765991449356, "step": 1693 }, { "epoch": 0.025294908167836343, "grad_norm": 1.1796875, "grad_norm_var": 0.028237390518188476, "learning_rate": 2e-05, "loss": 1.9296, "loss/crossentropy": 2.852426290512085, "loss/dist_ce": 0.0, "loss/fcd": 1.578125, "loss/idx": 13.0, "loss/logits": 0.35142582654953003, "step": 1694 }, { "epoch": 0.0253098402269673, "grad_norm": 0.56640625, "grad_norm_var": 0.028072722752889, "learning_rate": 2e-05, "loss": 1.2908, "loss/crossentropy": 2.6262359619140625, "loss/dist_ce": 0.0, "loss/fcd": 1.1015625, "loss/idx": 13.0, "loss/logits": 0.18921533226966858, "step": 1695 }, { "epoch": 0.02532477228609825, "grad_norm": 0.53515625, "grad_norm_var": 0.02753599484761556, "learning_rate": 2e-05, "loss": 1.3023, "loss/crossentropy": 2.71830153465271, "loss/dist_ce": 0.0, "loss/fcd": 1.109375, "loss/idx": 13.0, "loss/logits": 0.19293737411499023, "step": 1696 }, { "epoch": 0.025339704345229207, "grad_norm": 0.54296875, "grad_norm_var": 0.02763708432515462, "learning_rate": 2e-05, "loss": 1.2915, "loss/crossentropy": 2.5848164558410645, "loss/dist_ce": 0.0, "loss/fcd": 1.09375, "loss/idx": 13.0, "loss/logits": 0.19779077172279358, "step": 1697 }, { "epoch": 0.02535463640436016, "grad_norm": 0.59375, "grad_norm_var": 0.02696429888407389, "learning_rate": 2e-05, "loss": 1.3879, "loss/crossentropy": 2.7005844116210938, "loss/dist_ce": 0.0, "loss/fcd": 1.1796875, "loss/idx": 13.0, "loss/logits": 0.20819774270057678, "step": 1698 }, { "epoch": 0.025369568463491116, "grad_norm": 0.47265625, "grad_norm_var": 0.02758316993713379, "learning_rate": 2e-05, "loss": 1.1186, "loss/crossentropy": 2.633463144302368, "loss/dist_ce": 0.0, "loss/fcd": 0.9765625, "loss/idx": 13.0, "loss/logits": 0.1420612782239914, "step": 1699 }, { "epoch": 0.025384500522622068, "grad_norm": 0.546875, "grad_norm_var": 0.02757121721903483, "learning_rate": 2e-05, "loss": 1.264, "loss/crossentropy": 2.642566204071045, "loss/dist_ce": 0.0, "loss/fcd": 1.078125, "loss/idx": 13.0, "loss/logits": 0.1858481764793396, "step": 1700 }, { "epoch": 0.025399432581753024, "grad_norm": 0.49609375, "grad_norm_var": 0.02760758399963379, "learning_rate": 2e-05, "loss": 1.3168, "loss/crossentropy": 2.675619125366211, "loss/dist_ce": 0.0, "loss/fcd": 1.125, "loss/idx": 13.0, "loss/logits": 0.19182080030441284, "step": 1701 }, { "epoch": 0.02541436464088398, "grad_norm": 0.609375, "grad_norm_var": 0.027347564697265625, "learning_rate": 2e-05, "loss": 1.3172, "loss/crossentropy": 2.6206746101379395, "loss/dist_ce": 0.0, "loss/fcd": 1.125, "loss/idx": 13.0, "loss/logits": 0.19216333329677582, "step": 1702 }, { "epoch": 0.025429296700014932, "grad_norm": 0.55078125, "grad_norm_var": 0.027375221252441406, "learning_rate": 2e-05, "loss": 1.264, "loss/crossentropy": 2.7116100788116455, "loss/dist_ce": 0.0, "loss/fcd": 1.0703125, "loss/idx": 13.0, "loss/logits": 0.1937001645565033, "step": 1703 }, { "epoch": 0.025444228759145888, "grad_norm": 0.53515625, "grad_norm_var": 0.027375221252441406, "learning_rate": 2e-05, "loss": 1.3182, "loss/crossentropy": 2.672400951385498, "loss/dist_ce": 0.0, "loss/fcd": 1.125, "loss/idx": 13.0, "loss/logits": 0.19316034018993378, "step": 1704 }, { "epoch": 0.02545916081827684, "grad_norm": 0.546875, "grad_norm_var": 0.027009073893229166, "learning_rate": 2e-05, "loss": 1.3287, "loss/crossentropy": 2.59317946434021, "loss/dist_ce": 0.0, "loss/fcd": 1.1328125, "loss/idx": 13.0, "loss/logits": 0.19592443108558655, "step": 1705 }, { "epoch": 0.025474092877407796, "grad_norm": 0.466796875, "grad_norm_var": 0.02761521339416504, "learning_rate": 2e-05, "loss": 1.2262, "loss/crossentropy": 2.5838565826416016, "loss/dist_ce": 0.0, "loss/fcd": 1.0625, "loss/idx": 13.0, "loss/logits": 0.16374395787715912, "step": 1706 }, { "epoch": 0.02548902493653875, "grad_norm": 0.50390625, "grad_norm_var": 0.027743132909138997, "learning_rate": 2e-05, "loss": 1.1815, "loss/crossentropy": 2.6330811977386475, "loss/dist_ce": 0.0, "loss/fcd": 1.015625, "loss/idx": 13.0, "loss/logits": 0.16590842604637146, "step": 1707 }, { "epoch": 0.025503956995669704, "grad_norm": 0.546875, "grad_norm_var": 0.02756663958231608, "learning_rate": 2e-05, "loss": 1.2692, "loss/crossentropy": 2.5513384342193604, "loss/dist_ce": 0.0, "loss/fcd": 1.1015625, "loss/idx": 13.0, "loss/logits": 0.16764463484287262, "step": 1708 }, { "epoch": 0.025518889054800657, "grad_norm": 0.50390625, "grad_norm_var": 0.027528746922810873, "learning_rate": 2e-05, "loss": 1.1706, "loss/crossentropy": 2.727013111114502, "loss/dist_ce": 0.0, "loss/fcd": 1.015625, "loss/idx": 13.0, "loss/logits": 0.15500634908676147, "step": 1709 }, { "epoch": 0.025533821113931612, "grad_norm": 0.5859375, "grad_norm_var": 0.0016778151194254557, "learning_rate": 2e-05, "loss": 1.3518, "loss/crossentropy": 2.346733570098877, "loss/dist_ce": 0.0, "loss/fcd": 1.1484375, "loss/idx": 13.0, "loss/logits": 0.20339244604110718, "step": 1710 }, { "epoch": 0.025548753173062565, "grad_norm": 0.490234375, "grad_norm_var": 0.0017491022745768229, "learning_rate": 2e-05, "loss": 1.1178, "loss/crossentropy": 2.4764509201049805, "loss/dist_ce": 0.0, "loss/fcd": 0.97265625, "loss/idx": 13.0, "loss/logits": 0.145157128572464, "step": 1711 }, { "epoch": 0.02556368523219352, "grad_norm": 0.55078125, "grad_norm_var": 0.0017689387003580728, "learning_rate": 2e-05, "loss": 1.2626, "loss/crossentropy": 2.531623125076294, "loss/dist_ce": 0.0, "loss/fcd": 1.078125, "loss/idx": 13.0, "loss/logits": 0.18448594212532043, "step": 1712 }, { "epoch": 0.025578617291324473, "grad_norm": 0.5546875, "grad_norm_var": 0.0017916361490885417, "learning_rate": 2e-05, "loss": 1.321, "loss/crossentropy": 2.4013497829437256, "loss/dist_ce": 0.0, "loss/fcd": 1.140625, "loss/idx": 13.0, "loss/logits": 0.18041828274726868, "step": 1713 }, { "epoch": 0.02559354935045543, "grad_norm": 0.53125, "grad_norm_var": 0.001543426513671875, "learning_rate": 2e-05, "loss": 1.3033, "loss/crossentropy": 2.6992998123168945, "loss/dist_ce": 0.0, "loss/fcd": 1.109375, "loss/idx": 13.0, "loss/logits": 0.1939333975315094, "step": 1714 }, { "epoch": 0.02560848140958638, "grad_norm": 0.5078125, "grad_norm_var": 0.0013483047485351562, "learning_rate": 2e-05, "loss": 1.1803, "loss/crossentropy": 2.517133951187134, "loss/dist_ce": 0.0, "loss/fcd": 1.03125, "loss/idx": 13.0, "loss/logits": 0.14903077483177185, "step": 1715 }, { "epoch": 0.025623413468717337, "grad_norm": 0.640625, "grad_norm_var": 0.002071571350097656, "learning_rate": 2e-05, "loss": 1.6195, "loss/crossentropy": 2.476125717163086, "loss/dist_ce": 0.0, "loss/fcd": 1.359375, "loss/idx": 13.0, "loss/logits": 0.26009926199913025, "step": 1716 }, { "epoch": 0.02563834552784829, "grad_norm": 0.56640625, "grad_norm_var": 0.001980018615722656, "learning_rate": 2e-05, "loss": 1.3432, "loss/crossentropy": 2.2813334465026855, "loss/dist_ce": 0.0, "loss/fcd": 1.140625, "loss/idx": 13.0, "loss/logits": 0.20258614420890808, "step": 1717 }, { "epoch": 0.025653277586979245, "grad_norm": 0.48828125, "grad_norm_var": 0.0018282572428385416, "learning_rate": 2e-05, "loss": 1.1883, "loss/crossentropy": 2.476081132888794, "loss/dist_ce": 0.0, "loss/fcd": 1.03125, "loss/idx": 13.0, "loss/logits": 0.15700972080230713, "step": 1718 }, { "epoch": 0.025668209646110197, "grad_norm": 0.62890625, "grad_norm_var": 0.002367401123046875, "learning_rate": 2e-05, "loss": 1.3278, "loss/crossentropy": 2.189779281616211, "loss/dist_ce": 0.0, "loss/fcd": 1.140625, "loss/idx": 13.0, "loss/logits": 0.1871606558561325, "step": 1719 }, { "epoch": 0.025683141705241153, "grad_norm": 0.546875, "grad_norm_var": 0.002367591857910156, "learning_rate": 2e-05, "loss": 1.1858, "loss/crossentropy": 2.363250732421875, "loss/dist_ce": 0.0, "loss/fcd": 1.03125, "loss/idx": 13.0, "loss/logits": 0.15456654131412506, "step": 1720 }, { "epoch": 0.025698073764372106, "grad_norm": 0.498046875, "grad_norm_var": 0.0024800459543863934, "learning_rate": 2e-05, "loss": 1.1912, "loss/crossentropy": 2.333819627761841, "loss/dist_ce": 0.0, "loss/fcd": 1.03125, "loss/idx": 13.0, "loss/logits": 0.15996934473514557, "step": 1721 }, { "epoch": 0.02571300582350306, "grad_norm": 0.515625, "grad_norm_var": 0.002164141337076823, "learning_rate": 2e-05, "loss": 1.2155, "loss/crossentropy": 2.6888930797576904, "loss/dist_ce": 0.0, "loss/fcd": 1.046875, "loss/idx": 13.0, "loss/logits": 0.16863149404525757, "step": 1722 }, { "epoch": 0.025727937882634014, "grad_norm": 0.53125, "grad_norm_var": 0.0020746866861979167, "learning_rate": 2e-05, "loss": 1.3424, "loss/crossentropy": 2.607393980026245, "loss/dist_ce": 0.0, "loss/fcd": 1.140625, "loss/idx": 13.0, "loss/logits": 0.2017740160226822, "step": 1723 }, { "epoch": 0.02574286994176497, "grad_norm": 0.51953125, "grad_norm_var": 0.0021071751912434896, "learning_rate": 2e-05, "loss": 1.2383, "loss/crossentropy": 2.4375128746032715, "loss/dist_ce": 0.0, "loss/fcd": 1.0703125, "loss/idx": 13.0, "loss/logits": 0.16799165308475494, "step": 1724 }, { "epoch": 0.025757802000895922, "grad_norm": 0.486328125, "grad_norm_var": 0.00221403439839681, "learning_rate": 2e-05, "loss": 1.2517, "loss/crossentropy": 2.7144973278045654, "loss/dist_ce": 0.0, "loss/fcd": 1.0703125, "loss/idx": 13.0, "loss/logits": 0.18143236637115479, "step": 1725 }, { "epoch": 0.025772734060026878, "grad_norm": 0.5546875, "grad_norm_var": 0.00208433469136556, "learning_rate": 2e-05, "loss": 1.316, "loss/crossentropy": 2.503783941268921, "loss/dist_ce": 0.0, "loss/fcd": 1.125, "loss/idx": 13.0, "loss/logits": 0.19102910161018372, "step": 1726 }, { "epoch": 0.02578766611915783, "grad_norm": 0.56640625, "grad_norm_var": 0.0019597371419270834, "learning_rate": 2e-05, "loss": 1.2821, "loss/crossentropy": 2.499298572540283, "loss/dist_ce": 0.0, "loss/fcd": 1.109375, "loss/idx": 13.0, "loss/logits": 0.17273728549480438, "step": 1727 }, { "epoch": 0.025802598178288786, "grad_norm": 0.5234375, "grad_norm_var": 0.001977984110514323, "learning_rate": 2e-05, "loss": 1.259, "loss/crossentropy": 2.5022952556610107, "loss/dist_ce": 0.0, "loss/fcd": 1.078125, "loss/idx": 13.0, "loss/logits": 0.18091173470020294, "step": 1728 }, { "epoch": 0.025817530237419742, "grad_norm": 0.5390625, "grad_norm_var": 0.0019652684529622394, "learning_rate": 2e-05, "loss": 1.3357, "loss/crossentropy": 2.4653351306915283, "loss/dist_ce": 0.0, "loss/fcd": 1.140625, "loss/idx": 13.0, "loss/logits": 0.19503894448280334, "step": 1729 }, { "epoch": 0.025832462296550694, "grad_norm": 0.546875, "grad_norm_var": 0.0019617080688476562, "learning_rate": 2e-05, "loss": 1.1965, "loss/crossentropy": 2.6985909938812256, "loss/dist_ce": 0.0, "loss/fcd": 1.03125, "loss/idx": 13.0, "loss/logits": 0.16525985300540924, "step": 1730 }, { "epoch": 0.02584739435568165, "grad_norm": 0.51953125, "grad_norm_var": 0.00191802978515625, "learning_rate": 2e-05, "loss": 1.2125, "loss/crossentropy": 2.709136486053467, "loss/dist_ce": 0.0, "loss/fcd": 1.03125, "loss/idx": 13.0, "loss/logits": 0.18129640817642212, "step": 1731 }, { "epoch": 0.025862326414812602, "grad_norm": 0.5078125, "grad_norm_var": 0.0012738545735677083, "learning_rate": 2e-05, "loss": 1.2335, "loss/crossentropy": 2.7097113132476807, "loss/dist_ce": 0.0, "loss/fcd": 1.0625, "loss/idx": 13.0, "loss/logits": 0.17096135020256042, "step": 1732 }, { "epoch": 0.025877258473943558, "grad_norm": 0.609375, "grad_norm_var": 0.001576677958170573, "learning_rate": 2e-05, "loss": 1.2684, "loss/crossentropy": 2.601806879043579, "loss/dist_ce": 0.0, "loss/fcd": 1.0859375, "loss/idx": 13.0, "loss/logits": 0.18242479860782623, "step": 1733 }, { "epoch": 0.02589219053307451, "grad_norm": 0.53125, "grad_norm_var": 0.0014165242513020833, "learning_rate": 2e-05, "loss": 1.3134, "loss/crossentropy": 2.4615402221679688, "loss/dist_ce": 0.0, "loss/fcd": 1.125, "loss/idx": 13.0, "loss/logits": 0.18838651478290558, "step": 1734 }, { "epoch": 0.025907122592205466, "grad_norm": 0.51953125, "grad_norm_var": 0.0008539835611979167, "learning_rate": 2e-05, "loss": 1.2638, "loss/crossentropy": 2.577078104019165, "loss/dist_ce": 0.0, "loss/fcd": 1.078125, "loss/idx": 13.0, "loss/logits": 0.18569207191467285, "step": 1735 }, { "epoch": 0.02592205465133642, "grad_norm": 0.5859375, "grad_norm_var": 0.0010256449381510417, "learning_rate": 2e-05, "loss": 1.1798, "loss/crossentropy": 2.5145535469055176, "loss/dist_ce": 0.0, "loss/fcd": 1.0234375, "loss/idx": 13.0, "loss/logits": 0.15635466575622559, "step": 1736 }, { "epoch": 0.025936986710467375, "grad_norm": 0.515625, "grad_norm_var": 0.0009591261545817058, "learning_rate": 2e-05, "loss": 1.2788, "loss/crossentropy": 2.691556453704834, "loss/dist_ce": 0.0, "loss/fcd": 1.1015625, "loss/idx": 13.0, "loss/logits": 0.17723232507705688, "step": 1737 }, { "epoch": 0.025951918769598327, "grad_norm": 0.609375, "grad_norm_var": 0.0012566725413004558, "learning_rate": 2e-05, "loss": 1.2817, "loss/crossentropy": 2.559390068054199, "loss/dist_ce": 0.0, "loss/fcd": 1.078125, "loss/idx": 13.0, "loss/logits": 0.20356020331382751, "step": 1738 }, { "epoch": 0.025966850828729283, "grad_norm": 0.54296875, "grad_norm_var": 0.0012490431467692058, "learning_rate": 2e-05, "loss": 1.2244, "loss/crossentropy": 2.600177526473999, "loss/dist_ce": 0.0, "loss/fcd": 1.0625, "loss/idx": 13.0, "loss/logits": 0.16185130178928375, "step": 1739 }, { "epoch": 0.025981782887860235, "grad_norm": 0.55859375, "grad_norm_var": 0.0012255191802978515, "learning_rate": 2e-05, "loss": 1.2417, "loss/crossentropy": 2.7534642219543457, "loss/dist_ce": 0.0, "loss/fcd": 1.0703125, "loss/idx": 13.0, "loss/logits": 0.17136284708976746, "step": 1740 }, { "epoch": 0.02599671494699119, "grad_norm": 0.6640625, "grad_norm_var": 0.0018142064412434895, "learning_rate": 2e-05, "loss": 1.5145, "loss/crossentropy": 2.1807971000671387, "loss/dist_ce": 0.0, "loss/fcd": 1.2734375, "loss/idx": 13.0, "loss/logits": 0.2410614937543869, "step": 1741 }, { "epoch": 0.026011647006122143, "grad_norm": 0.79296875, "grad_norm_var": 0.005324045817057292, "learning_rate": 2e-05, "loss": 1.5923, "loss/crossentropy": 2.4337072372436523, "loss/dist_ce": 0.0, "loss/fcd": 1.3515625, "loss/idx": 13.0, "loss/logits": 0.24070346355438232, "step": 1742 }, { "epoch": 0.0260265790652531, "grad_norm": 0.49609375, "grad_norm_var": 0.005674235026041667, "learning_rate": 2e-05, "loss": 1.1684, "loss/crossentropy": 2.5552825927734375, "loss/dist_ce": 0.0, "loss/fcd": 1.015625, "loss/idx": 13.0, "loss/logits": 0.15277054905891418, "step": 1743 }, { "epoch": 0.02604151112438405, "grad_norm": 0.80859375, "grad_norm_var": 0.009122657775878906, "learning_rate": 2e-05, "loss": 1.5857, "loss/crossentropy": 2.7988672256469727, "loss/dist_ce": 0.0, "loss/fcd": 1.265625, "loss/idx": 13.0, "loss/logits": 0.32007214426994324, "step": 1744 }, { "epoch": 0.026056443183515007, "grad_norm": 0.5078125, "grad_norm_var": 0.00937188466389974, "learning_rate": 2e-05, "loss": 1.1453, "loss/crossentropy": 2.5977418422698975, "loss/dist_ce": 0.0, "loss/fcd": 1.0, "loss/idx": 13.0, "loss/logits": 0.14532379806041718, "step": 1745 }, { "epoch": 0.02607137524264596, "grad_norm": 0.5546875, "grad_norm_var": 0.009338823954264323, "learning_rate": 2e-05, "loss": 1.2278, "loss/crossentropy": 2.459958076477051, "loss/dist_ce": 0.0, "loss/fcd": 1.0546875, "loss/idx": 13.0, "loss/logits": 0.1731024980545044, "step": 1746 }, { "epoch": 0.026086307301776916, "grad_norm": 0.6015625, "grad_norm_var": 0.009067789713541666, "learning_rate": 2e-05, "loss": 1.257, "loss/crossentropy": 2.615450859069824, "loss/dist_ce": 0.0, "loss/fcd": 1.0859375, "loss/idx": 13.0, "loss/logits": 0.17106476426124573, "step": 1747 }, { "epoch": 0.026101239360907868, "grad_norm": 0.51953125, "grad_norm_var": 0.008951250712076824, "learning_rate": 2e-05, "loss": 1.2121, "loss/crossentropy": 2.551990270614624, "loss/dist_ce": 0.0, "loss/fcd": 1.046875, "loss/idx": 13.0, "loss/logits": 0.1652519255876541, "step": 1748 }, { "epoch": 0.026116171420038824, "grad_norm": 0.55859375, "grad_norm_var": 0.008971913655598959, "learning_rate": 2e-05, "loss": 1.3636, "loss/crossentropy": 2.399773120880127, "loss/dist_ce": 0.0, "loss/fcd": 1.1796875, "loss/idx": 13.0, "loss/logits": 0.1839275360107422, "step": 1749 }, { "epoch": 0.026131103479169776, "grad_norm": 0.48828125, "grad_norm_var": 0.00939782460530599, "learning_rate": 2e-05, "loss": 1.1708, "loss/crossentropy": 2.5663254261016846, "loss/dist_ce": 0.0, "loss/fcd": 1.0078125, "loss/idx": 13.0, "loss/logits": 0.16302275657653809, "step": 1750 }, { "epoch": 0.026146035538300732, "grad_norm": 0.5625, "grad_norm_var": 0.009150950113932292, "learning_rate": 2e-05, "loss": 1.3825, "loss/crossentropy": 2.4493865966796875, "loss/dist_ce": 0.0, "loss/fcd": 1.1875, "loss/idx": 13.0, "loss/logits": 0.1950165033340454, "step": 1751 }, { "epoch": 0.026160967597431684, "grad_norm": 0.59765625, "grad_norm_var": 0.009160296122233073, "learning_rate": 2e-05, "loss": 1.3006, "loss/crossentropy": 2.4346399307250977, "loss/dist_ce": 0.0, "loss/fcd": 1.109375, "loss/idx": 13.0, "loss/logits": 0.1912744641304016, "step": 1752 }, { "epoch": 0.02617589965656264, "grad_norm": 0.578125, "grad_norm_var": 0.00881646474202474, "learning_rate": 2e-05, "loss": 1.2118, "loss/crossentropy": 2.5589816570281982, "loss/dist_ce": 0.0, "loss/fcd": 1.0546875, "loss/idx": 13.0, "loss/logits": 0.15713083744049072, "step": 1753 }, { "epoch": 0.026190831715693592, "grad_norm": 0.55078125, "grad_norm_var": 0.008880360921223959, "learning_rate": 2e-05, "loss": 1.3028, "loss/crossentropy": 2.65625, "loss/dist_ce": 0.0, "loss/fcd": 1.109375, "loss/idx": 13.0, "loss/logits": 0.19340017437934875, "step": 1754 }, { "epoch": 0.026205763774824548, "grad_norm": 0.55859375, "grad_norm_var": 0.008805084228515624, "learning_rate": 2e-05, "loss": 1.342, "loss/crossentropy": 2.6520872116088867, "loss/dist_ce": 0.0, "loss/fcd": 1.125, "loss/idx": 13.0, "loss/logits": 0.217010498046875, "step": 1755 }, { "epoch": 0.026220695833955504, "grad_norm": 0.5078125, "grad_norm_var": 0.00916131337483724, "learning_rate": 2e-05, "loss": 1.273, "loss/crossentropy": 2.5524790287017822, "loss/dist_ce": 0.0, "loss/fcd": 1.0859375, "loss/idx": 13.0, "loss/logits": 0.1870691478252411, "step": 1756 }, { "epoch": 0.026235627893086456, "grad_norm": 0.5, "grad_norm_var": 0.00909722646077474, "learning_rate": 2e-05, "loss": 1.2431, "loss/crossentropy": 2.7991576194763184, "loss/dist_ce": 0.0, "loss/fcd": 1.0625, "loss/idx": 13.0, "loss/logits": 0.180614173412323, "step": 1757 }, { "epoch": 0.026250559952217412, "grad_norm": 0.53125, "grad_norm_var": 0.005736287434895833, "learning_rate": 2e-05, "loss": 1.2284, "loss/crossentropy": 2.677903652191162, "loss/dist_ce": 0.0, "loss/fcd": 1.0546875, "loss/idx": 13.0, "loss/logits": 0.1736885905265808, "step": 1758 }, { "epoch": 0.026265492011348365, "grad_norm": 0.484375, "grad_norm_var": 0.00584100087483724, "learning_rate": 2e-05, "loss": 1.1874, "loss/crossentropy": 2.39847469329834, "loss/dist_ce": 0.0, "loss/fcd": 1.03125, "loss/idx": 13.0, "loss/logits": 0.15619158744812012, "step": 1759 }, { "epoch": 0.02628042407047932, "grad_norm": 0.49609375, "grad_norm_var": 0.0014566421508789063, "learning_rate": 2e-05, "loss": 1.2115, "loss/crossentropy": 2.773829936981201, "loss/dist_ce": 0.0, "loss/fcd": 1.0390625, "loss/idx": 13.0, "loss/logits": 0.172477588057518, "step": 1760 }, { "epoch": 0.026295356129610273, "grad_norm": 0.52734375, "grad_norm_var": 0.0014035542805989583, "learning_rate": 2e-05, "loss": 1.2803, "loss/crossentropy": 2.3061296939849854, "loss/dist_ce": 0.0, "loss/fcd": 1.109375, "loss/idx": 13.0, "loss/logits": 0.1709517389535904, "step": 1761 }, { "epoch": 0.02631028818874123, "grad_norm": 0.4765625, "grad_norm_var": 0.0016171773274739583, "learning_rate": 2e-05, "loss": 1.1998, "loss/crossentropy": 2.6350574493408203, "loss/dist_ce": 0.0, "loss/fcd": 1.0390625, "loss/idx": 13.0, "loss/logits": 0.16071206331253052, "step": 1762 }, { "epoch": 0.02632522024787218, "grad_norm": 0.498046875, "grad_norm_var": 0.0013501326243082683, "learning_rate": 2e-05, "loss": 1.1946, "loss/crossentropy": 2.479978322982788, "loss/dist_ce": 0.0, "loss/fcd": 1.0390625, "loss/idx": 13.0, "loss/logits": 0.155495285987854, "step": 1763 }, { "epoch": 0.026340152307003137, "grad_norm": 0.486328125, "grad_norm_var": 0.001453081766764323, "learning_rate": 2e-05, "loss": 1.1679, "loss/crossentropy": 2.6815295219421387, "loss/dist_ce": 0.0, "loss/fcd": 1.0234375, "loss/idx": 13.0, "loss/logits": 0.14445266127586365, "step": 1764 }, { "epoch": 0.02635508436613409, "grad_norm": 0.62109375, "grad_norm_var": 0.0019759496053059896, "learning_rate": 2e-05, "loss": 1.2447, "loss/crossentropy": 2.6957926750183105, "loss/dist_ce": 0.0, "loss/fcd": 1.0703125, "loss/idx": 13.0, "loss/logits": 0.17434380948543549, "step": 1765 }, { "epoch": 0.026370016425265045, "grad_norm": 0.5234375, "grad_norm_var": 0.0018620808919270833, "learning_rate": 2e-05, "loss": 1.1802, "loss/crossentropy": 2.4276416301727295, "loss/dist_ce": 0.0, "loss/fcd": 1.0234375, "loss/idx": 13.0, "loss/logits": 0.15674927830696106, "step": 1766 }, { "epoch": 0.026384948484395997, "grad_norm": 0.50390625, "grad_norm_var": 0.0018325169881184896, "learning_rate": 2e-05, "loss": 1.2002, "loss/crossentropy": 2.8092257976531982, "loss/dist_ce": 0.0, "loss/fcd": 1.0390625, "loss/idx": 13.0, "loss/logits": 0.1611369252204895, "step": 1767 }, { "epoch": 0.026399880543526953, "grad_norm": 0.5546875, "grad_norm_var": 0.001546478271484375, "learning_rate": 2e-05, "loss": 1.2647, "loss/crossentropy": 2.672966718673706, "loss/dist_ce": 0.0, "loss/fcd": 1.0859375, "loss/idx": 13.0, "loss/logits": 0.17874416708946228, "step": 1768 }, { "epoch": 0.026414812602657906, "grad_norm": 0.51171875, "grad_norm_var": 0.001350847880045573, "learning_rate": 2e-05, "loss": 1.1984, "loss/crossentropy": 2.601475715637207, "loss/dist_ce": 0.0, "loss/fcd": 1.0234375, "loss/idx": 13.0, "loss/logits": 0.1749573051929474, "step": 1769 }, { "epoch": 0.02642974466178886, "grad_norm": 0.5390625, "grad_norm_var": 0.0013125101725260417, "learning_rate": 2e-05, "loss": 1.3379, "loss/crossentropy": 2.344796895980835, "loss/dist_ce": 0.0, "loss/fcd": 1.125, "loss/idx": 13.0, "loss/logits": 0.2128661870956421, "step": 1770 }, { "epoch": 0.026444676720919814, "grad_norm": 0.4453125, "grad_norm_var": 0.0015319188435872395, "learning_rate": 2e-05, "loss": 1.1482, "loss/crossentropy": 2.720777750015259, "loss/dist_ce": 0.0, "loss/fcd": 0.9921875, "loss/idx": 13.0, "loss/logits": 0.15602290630340576, "step": 1771 }, { "epoch": 0.02645960878005077, "grad_norm": 0.4921875, "grad_norm_var": 0.0015578587849934896, "learning_rate": 2e-05, "loss": 1.1621, "loss/crossentropy": 2.65152645111084, "loss/dist_ce": 0.0, "loss/fcd": 1.0078125, "loss/idx": 13.0, "loss/logits": 0.15432672202587128, "step": 1772 }, { "epoch": 0.026474540839181722, "grad_norm": 0.51953125, "grad_norm_var": 0.0015505472819010416, "learning_rate": 2e-05, "loss": 1.2169, "loss/crossentropy": 2.48185133934021, "loss/dist_ce": 0.0, "loss/fcd": 1.046875, "loss/idx": 13.0, "loss/logits": 0.16998444497585297, "step": 1773 }, { "epoch": 0.026489472898312678, "grad_norm": 0.5703125, "grad_norm_var": 0.0017400105794270833, "learning_rate": 2e-05, "loss": 1.1641, "loss/crossentropy": 2.5464279651641846, "loss/dist_ce": 0.0, "loss/fcd": 1.0234375, "loss/idx": 13.0, "loss/logits": 0.14069680869579315, "step": 1774 }, { "epoch": 0.02650440495744363, "grad_norm": 0.6484375, "grad_norm_var": 0.002738698323567708, "learning_rate": 2e-05, "loss": 1.3625, "loss/crossentropy": 2.5577807426452637, "loss/dist_ce": 0.0, "loss/fcd": 1.171875, "loss/idx": 13.0, "loss/logits": 0.19061937928199768, "step": 1775 }, { "epoch": 0.026519337016574586, "grad_norm": 0.494140625, "grad_norm_var": 0.002746693293253581, "learning_rate": 2e-05, "loss": 1.1661, "loss/crossentropy": 2.5085034370422363, "loss/dist_ce": 0.0, "loss/fcd": 1.015625, "loss/idx": 13.0, "loss/logits": 0.15048107504844666, "step": 1776 }, { "epoch": 0.02653426907570554, "grad_norm": 0.50390625, "grad_norm_var": 0.002776066462198893, "learning_rate": 2e-05, "loss": 1.2534, "loss/crossentropy": 2.579402208328247, "loss/dist_ce": 0.0, "loss/fcd": 1.0625, "loss/idx": 13.0, "loss/logits": 0.19094473123550415, "step": 1777 }, { "epoch": 0.026549201134836494, "grad_norm": 0.52734375, "grad_norm_var": 0.0026140689849853517, "learning_rate": 2e-05, "loss": 1.1913, "loss/crossentropy": 2.696247100830078, "loss/dist_ce": 0.0, "loss/fcd": 1.03125, "loss/idx": 13.0, "loss/logits": 0.16005630791187286, "step": 1778 }, { "epoch": 0.026564133193967446, "grad_norm": 0.51171875, "grad_norm_var": 0.0025721232096354166, "learning_rate": 2e-05, "loss": 1.1651, "loss/crossentropy": 2.7263166904449463, "loss/dist_ce": 0.0, "loss/fcd": 1.0078125, "loss/idx": 13.0, "loss/logits": 0.1572725623846054, "step": 1779 }, { "epoch": 0.026579065253098402, "grad_norm": 0.494140625, "grad_norm_var": 0.002532196044921875, "learning_rate": 2e-05, "loss": 1.2075, "loss/crossentropy": 2.452711820602417, "loss/dist_ce": 0.0, "loss/fcd": 1.0390625, "loss/idx": 13.0, "loss/logits": 0.16845420002937317, "step": 1780 }, { "epoch": 0.026593997312229358, "grad_norm": 0.46484375, "grad_norm_var": 0.002135467529296875, "learning_rate": 2e-05, "loss": 1.1348, "loss/crossentropy": 2.5930423736572266, "loss/dist_ce": 0.0, "loss/fcd": 0.98046875, "loss/idx": 13.0, "loss/logits": 0.15436789393424988, "step": 1781 }, { "epoch": 0.02660892937136031, "grad_norm": 0.734375, "grad_norm_var": 0.00503997802734375, "learning_rate": 2e-05, "loss": 1.3972, "loss/crossentropy": 2.6180036067962646, "loss/dist_ce": 0.0, "loss/fcd": 1.1953125, "loss/idx": 13.0, "loss/logits": 0.2018566131591797, "step": 1782 }, { "epoch": 0.026623861430491266, "grad_norm": 0.578125, "grad_norm_var": 0.00510400136311849, "learning_rate": 2e-05, "loss": 1.3258, "loss/crossentropy": 2.556471109390259, "loss/dist_ce": 0.0, "loss/fcd": 1.1328125, "loss/idx": 13.0, "loss/logits": 0.1930120587348938, "step": 1783 }, { "epoch": 0.02663879348962222, "grad_norm": 0.494140625, "grad_norm_var": 0.005189243952433268, "learning_rate": 2e-05, "loss": 1.2076, "loss/crossentropy": 2.686593532562256, "loss/dist_ce": 0.0, "loss/fcd": 1.0390625, "loss/idx": 13.0, "loss/logits": 0.16853547096252441, "step": 1784 }, { "epoch": 0.026653725548753174, "grad_norm": 0.46875, "grad_norm_var": 0.005427026748657226, "learning_rate": 2e-05, "loss": 1.2387, "loss/crossentropy": 2.6526505947113037, "loss/dist_ce": 0.0, "loss/fcd": 1.0546875, "loss/idx": 13.0, "loss/logits": 0.18397173285484314, "step": 1785 }, { "epoch": 0.026668657607884127, "grad_norm": 0.640625, "grad_norm_var": 0.00618907610575358, "learning_rate": 2e-05, "loss": 1.3109, "loss/crossentropy": 2.4550328254699707, "loss/dist_ce": 0.0, "loss/fcd": 1.125, "loss/idx": 13.0, "loss/logits": 0.1859399974346161, "step": 1786 }, { "epoch": 0.026683589667015083, "grad_norm": 0.5703125, "grad_norm_var": 0.005641794204711914, "learning_rate": 2e-05, "loss": 1.341, "loss/crossentropy": 2.7371487617492676, "loss/dist_ce": 0.0, "loss/fcd": 1.125, "loss/idx": 13.0, "loss/logits": 0.21602407097816467, "step": 1787 }, { "epoch": 0.026698521726146035, "grad_norm": 0.66015625, "grad_norm_var": 0.006232309341430664, "learning_rate": 2e-05, "loss": 1.3813, "loss/crossentropy": 2.7988855838775635, "loss/dist_ce": 0.0, "loss/fcd": 1.171875, "loss/idx": 13.0, "loss/logits": 0.2093946784734726, "step": 1788 }, { "epoch": 0.02671345378527699, "grad_norm": 0.5234375, "grad_norm_var": 0.006214761734008789, "learning_rate": 2e-05, "loss": 1.3025, "loss/crossentropy": 2.5311191082000732, "loss/dist_ce": 0.0, "loss/fcd": 1.109375, "loss/idx": 13.0, "loss/logits": 0.1931311935186386, "step": 1789 }, { "epoch": 0.026728385844407943, "grad_norm": 0.50390625, "grad_norm_var": 0.006357431411743164, "learning_rate": 2e-05, "loss": 1.2538, "loss/crossentropy": 2.6057794094085693, "loss/dist_ce": 0.0, "loss/fcd": 1.0703125, "loss/idx": 13.0, "loss/logits": 0.18352286517620087, "step": 1790 }, { "epoch": 0.0267433179035389, "grad_norm": 0.52734375, "grad_norm_var": 0.005703083674112956, "learning_rate": 2e-05, "loss": 1.1792, "loss/crossentropy": 2.5075063705444336, "loss/dist_ce": 0.0, "loss/fcd": 1.015625, "loss/idx": 13.0, "loss/logits": 0.1635546088218689, "step": 1791 }, { "epoch": 0.02675824996266985, "grad_norm": 0.625, "grad_norm_var": 0.005910746256510417, "learning_rate": 2e-05, "loss": 1.3609, "loss/crossentropy": 2.534444570541382, "loss/dist_ce": 0.0, "loss/fcd": 1.1796875, "loss/idx": 13.0, "loss/logits": 0.18119356036186218, "step": 1792 }, { "epoch": 0.026773182021800807, "grad_norm": 0.51953125, "grad_norm_var": 0.005826314290364583, "learning_rate": 2e-05, "loss": 1.2744, "loss/crossentropy": 2.717726230621338, "loss/dist_ce": 0.0, "loss/fcd": 1.09375, "loss/idx": 13.0, "loss/logits": 0.18060529232025146, "step": 1793 }, { "epoch": 0.02678811408093176, "grad_norm": 0.58203125, "grad_norm_var": 0.005828094482421875, "learning_rate": 2e-05, "loss": 1.3391, "loss/crossentropy": 2.476660966873169, "loss/dist_ce": 0.0, "loss/fcd": 1.140625, "loss/idx": 13.0, "loss/logits": 0.1984376311302185, "step": 1794 }, { "epoch": 0.026803046140062715, "grad_norm": 0.5, "grad_norm_var": 0.005906105041503906, "learning_rate": 2e-05, "loss": 1.2207, "loss/crossentropy": 2.204763174057007, "loss/dist_ce": 0.0, "loss/fcd": 1.0546875, "loss/idx": 13.0, "loss/logits": 0.16599825024604797, "step": 1795 }, { "epoch": 0.026817978199193668, "grad_norm": 0.56640625, "grad_norm_var": 0.005642048517862956, "learning_rate": 2e-05, "loss": 1.314, "loss/crossentropy": 2.2588131427764893, "loss/dist_ce": 0.0, "loss/fcd": 1.125, "loss/idx": 13.0, "loss/logits": 0.18899735808372498, "step": 1796 }, { "epoch": 0.026832910258324624, "grad_norm": 0.546875, "grad_norm_var": 0.0050225416819254555, "learning_rate": 2e-05, "loss": 1.266, "loss/crossentropy": 2.7214598655700684, "loss/dist_ce": 0.0, "loss/fcd": 1.078125, "loss/idx": 13.0, "loss/logits": 0.1878923773765564, "step": 1797 }, { "epoch": 0.026847842317455576, "grad_norm": 0.65234375, "grad_norm_var": 0.0035912672678629558, "learning_rate": 2e-05, "loss": 1.4791, "loss/crossentropy": 2.7222535610198975, "loss/dist_ce": 0.0, "loss/fcd": 1.2265625, "loss/idx": 13.0, "loss/logits": 0.25250500440597534, "step": 1798 }, { "epoch": 0.026862774376586532, "grad_norm": 0.51953125, "grad_norm_var": 0.0036637465159098308, "learning_rate": 2e-05, "loss": 1.2162, "loss/crossentropy": 2.938828468322754, "loss/dist_ce": 0.0, "loss/fcd": 1.046875, "loss/idx": 13.0, "loss/logits": 0.16931620240211487, "step": 1799 }, { "epoch": 0.026877706435717484, "grad_norm": 0.55859375, "grad_norm_var": 0.003389422098795573, "learning_rate": 2e-05, "loss": 1.2342, "loss/crossentropy": 2.7594287395477295, "loss/dist_ce": 0.0, "loss/fcd": 1.0703125, "loss/idx": 13.0, "loss/logits": 0.16388383507728577, "step": 1800 }, { "epoch": 0.02689263849484844, "grad_norm": 0.5625, "grad_norm_var": 0.002794329325358073, "learning_rate": 2e-05, "loss": 1.3562, "loss/crossentropy": 2.6729891300201416, "loss/dist_ce": 0.0, "loss/fcd": 1.1484375, "loss/idx": 13.0, "loss/logits": 0.2077617645263672, "step": 1801 }, { "epoch": 0.026907570553979392, "grad_norm": 0.462890625, "grad_norm_var": 0.003004058202107747, "learning_rate": 2e-05, "loss": 1.1714, "loss/crossentropy": 2.5881118774414062, "loss/dist_ce": 0.0, "loss/fcd": 1.015625, "loss/idx": 13.0, "loss/logits": 0.15578031539916992, "step": 1802 }, { "epoch": 0.026922502613110348, "grad_norm": 0.74609375, "grad_norm_var": 0.005292876561482748, "learning_rate": 2e-05, "loss": 1.3937, "loss/crossentropy": 2.675889253616333, "loss/dist_ce": 0.0, "loss/fcd": 1.1953125, "loss/idx": 13.0, "loss/logits": 0.19843614101409912, "step": 1803 }, { "epoch": 0.0269374346722413, "grad_norm": 0.54296875, "grad_norm_var": 0.004680617650349935, "learning_rate": 2e-05, "loss": 1.2514, "loss/crossentropy": 2.4744293689727783, "loss/dist_ce": 0.0, "loss/fcd": 1.0703125, "loss/idx": 13.0, "loss/logits": 0.18113207817077637, "step": 1804 }, { "epoch": 0.026952366731372256, "grad_norm": 0.50390625, "grad_norm_var": 0.004796330134073893, "learning_rate": 2e-05, "loss": 1.1718, "loss/crossentropy": 2.706061363220215, "loss/dist_ce": 0.0, "loss/fcd": 1.015625, "loss/idx": 13.0, "loss/logits": 0.1561264991760254, "step": 1805 }, { "epoch": 0.02696729879050321, "grad_norm": 0.5234375, "grad_norm_var": 0.004680617650349935, "learning_rate": 2e-05, "loss": 1.1883, "loss/crossentropy": 2.48380970954895, "loss/dist_ce": 0.0, "loss/fcd": 1.03125, "loss/idx": 13.0, "loss/logits": 0.15707790851593018, "step": 1806 }, { "epoch": 0.026982230849634165, "grad_norm": 0.6484375, "grad_norm_var": 0.005090570449829102, "learning_rate": 2e-05, "loss": 1.3034, "loss/crossentropy": 2.59531569480896, "loss/dist_ce": 0.0, "loss/fcd": 1.125, "loss/idx": 13.0, "loss/logits": 0.17839841544628143, "step": 1807 }, { "epoch": 0.02699716290876512, "grad_norm": 0.5234375, "grad_norm_var": 0.004940144220987956, "learning_rate": 2e-05, "loss": 1.2338, "loss/crossentropy": 2.7297565937042236, "loss/dist_ce": 0.0, "loss/fcd": 1.046875, "loss/idx": 13.0, "loss/logits": 0.1869654804468155, "step": 1808 }, { "epoch": 0.027012094967896073, "grad_norm": 0.51171875, "grad_norm_var": 0.0049860477447509766, "learning_rate": 2e-05, "loss": 1.2296, "loss/crossentropy": 2.546668291091919, "loss/dist_ce": 0.0, "loss/fcd": 1.0625, "loss/idx": 13.0, "loss/logits": 0.16706717014312744, "step": 1809 }, { "epoch": 0.02702702702702703, "grad_norm": 0.4921875, "grad_norm_var": 0.00522001584370931, "learning_rate": 2e-05, "loss": 1.2034, "loss/crossentropy": 2.6435706615448, "loss/dist_ce": 0.0, "loss/fcd": 1.03125, "loss/idx": 13.0, "loss/logits": 0.17213952541351318, "step": 1810 }, { "epoch": 0.02704195908615798, "grad_norm": 0.53125, "grad_norm_var": 0.00505674680074056, "learning_rate": 2e-05, "loss": 1.2388, "loss/crossentropy": 2.6985273361206055, "loss/dist_ce": 0.0, "loss/fcd": 1.0625, "loss/idx": 13.0, "loss/logits": 0.17626091837882996, "step": 1811 }, { "epoch": 0.027056891145288937, "grad_norm": 0.51953125, "grad_norm_var": 0.005127700169881185, "learning_rate": 2e-05, "loss": 1.2427, "loss/crossentropy": 2.4755988121032715, "loss/dist_ce": 0.0, "loss/fcd": 1.0546875, "loss/idx": 13.0, "loss/logits": 0.18802842497825623, "step": 1812 }, { "epoch": 0.02707182320441989, "grad_norm": 0.5078125, "grad_norm_var": 0.005254220962524414, "learning_rate": 2e-05, "loss": 1.1422, "loss/crossentropy": 2.6311707496643066, "loss/dist_ce": 0.0, "loss/fcd": 0.98828125, "loss/idx": 13.0, "loss/logits": 0.1539306342601776, "step": 1813 }, { "epoch": 0.027086755263550845, "grad_norm": 0.578125, "grad_norm_var": 0.00458982785542806, "learning_rate": 2e-05, "loss": 1.2927, "loss/crossentropy": 2.8410966396331787, "loss/dist_ce": 0.0, "loss/fcd": 1.0859375, "loss/idx": 13.0, "loss/logits": 0.20679564774036407, "step": 1814 }, { "epoch": 0.027101687322681797, "grad_norm": 0.48828125, "grad_norm_var": 0.004760217666625976, "learning_rate": 2e-05, "loss": 1.2574, "loss/crossentropy": 2.6344683170318604, "loss/dist_ce": 0.0, "loss/fcd": 1.0859375, "loss/idx": 13.0, "loss/logits": 0.1714169979095459, "step": 1815 }, { "epoch": 0.027116619381812753, "grad_norm": 0.50390625, "grad_norm_var": 0.0048394362131754555, "learning_rate": 2e-05, "loss": 1.2733, "loss/crossentropy": 2.4946045875549316, "loss/dist_ce": 0.0, "loss/fcd": 1.0859375, "loss/idx": 13.0, "loss/logits": 0.18738268315792084, "step": 1816 }, { "epoch": 0.027131551440943705, "grad_norm": 0.53125, "grad_norm_var": 0.004808410008748373, "learning_rate": 2e-05, "loss": 1.1028, "loss/crossentropy": 2.759798526763916, "loss/dist_ce": 0.0, "loss/fcd": 0.94921875, "loss/idx": 13.0, "loss/logits": 0.15359053015708923, "step": 1817 }, { "epoch": 0.02714648350007466, "grad_norm": 0.5078125, "grad_norm_var": 0.004481951395670573, "learning_rate": 2e-05, "loss": 1.2177, "loss/crossentropy": 2.5690910816192627, "loss/dist_ce": 0.0, "loss/fcd": 1.0546875, "loss/idx": 13.0, "loss/logits": 0.16301162540912628, "step": 1818 }, { "epoch": 0.027161415559205614, "grad_norm": 0.5546875, "grad_norm_var": 0.001544189453125, "learning_rate": 2e-05, "loss": 1.2606, "loss/crossentropy": 2.410215377807617, "loss/dist_ce": 0.0, "loss/fcd": 1.09375, "loss/idx": 13.0, "loss/logits": 0.16685530543327332, "step": 1819 }, { "epoch": 0.02717634761833657, "grad_norm": 0.48828125, "grad_norm_var": 0.0016314188639322917, "learning_rate": 2e-05, "loss": 1.1857, "loss/crossentropy": 2.719848394393921, "loss/dist_ce": 0.0, "loss/fcd": 1.03125, "loss/idx": 13.0, "loss/logits": 0.15441551804542542, "step": 1820 }, { "epoch": 0.027191279677467522, "grad_norm": 0.56640625, "grad_norm_var": 0.0016924540201822917, "learning_rate": 2e-05, "loss": 1.2924, "loss/crossentropy": 2.547701358795166, "loss/dist_ce": 0.0, "loss/fcd": 1.09375, "loss/idx": 13.0, "loss/logits": 0.1986786127090454, "step": 1821 }, { "epoch": 0.027206211736598478, "grad_norm": 0.54296875, "grad_norm_var": 0.0016997655232747395, "learning_rate": 2e-05, "loss": 1.2618, "loss/crossentropy": 2.6193835735321045, "loss/dist_ce": 0.0, "loss/fcd": 1.0859375, "loss/idx": 13.0, "loss/logits": 0.17586740851402283, "step": 1822 }, { "epoch": 0.02722114379572943, "grad_norm": 0.65234375, "grad_norm_var": 0.0017618815104166667, "learning_rate": 2e-05, "loss": 1.2637, "loss/crossentropy": 2.512247323989868, "loss/dist_ce": 0.0, "loss/fcd": 1.078125, "loss/idx": 13.0, "loss/logits": 0.1855495274066925, "step": 1823 }, { "epoch": 0.027236075854860386, "grad_norm": 0.5078125, "grad_norm_var": 0.0017934163411458333, "learning_rate": 2e-05, "loss": 1.235, "loss/crossentropy": 2.514695167541504, "loss/dist_ce": 0.0, "loss/fcd": 1.0703125, "loss/idx": 13.0, "loss/logits": 0.16470174491405487, "step": 1824 }, { "epoch": 0.027251007913991338, "grad_norm": 0.49609375, "grad_norm_var": 0.0018473307291666666, "learning_rate": 2e-05, "loss": 1.1994, "loss/crossentropy": 2.5031635761260986, "loss/dist_ce": 0.0, "loss/fcd": 1.0390625, "loss/idx": 13.0, "loss/logits": 0.16035684943199158, "step": 1825 }, { "epoch": 0.027265939973122294, "grad_norm": 0.53515625, "grad_norm_var": 0.0017501195271809897, "learning_rate": 2e-05, "loss": 1.1901, "loss/crossentropy": 2.6170687675476074, "loss/dist_ce": 0.0, "loss/fcd": 1.03125, "loss/idx": 13.0, "loss/logits": 0.1588709056377411, "step": 1826 }, { "epoch": 0.027280872032253246, "grad_norm": 0.53515625, "grad_norm_var": 0.0017506917317708333, "learning_rate": 2e-05, "loss": 1.29, "loss/crossentropy": 2.467294216156006, "loss/dist_ce": 0.0, "loss/fcd": 1.109375, "loss/idx": 13.0, "loss/logits": 0.18067091703414917, "step": 1827 }, { "epoch": 0.027295804091384202, "grad_norm": 0.48046875, "grad_norm_var": 0.0019121805826822916, "learning_rate": 2e-05, "loss": 1.1621, "loss/crossentropy": 2.658353805541992, "loss/dist_ce": 0.0, "loss/fcd": 1.0, "loss/idx": 13.0, "loss/logits": 0.16208136081695557, "step": 1828 }, { "epoch": 0.027310736150515155, "grad_norm": 0.484375, "grad_norm_var": 0.0020151774088541666, "learning_rate": 2e-05, "loss": 1.1345, "loss/crossentropy": 2.707339286804199, "loss/dist_ce": 0.0, "loss/fcd": 0.984375, "loss/idx": 13.0, "loss/logits": 0.15014402568340302, "step": 1829 }, { "epoch": 0.02732566820964611, "grad_norm": 0.7578125, "grad_norm_var": 0.0052263895670572914, "learning_rate": 2e-05, "loss": 1.3429, "loss/crossentropy": 2.3926897048950195, "loss/dist_ce": 0.0, "loss/fcd": 1.171875, "loss/idx": 13.0, "loss/logits": 0.17105039954185486, "step": 1830 }, { "epoch": 0.027340600268777063, "grad_norm": 0.60546875, "grad_norm_var": 0.0052836100260416664, "learning_rate": 2e-05, "loss": 1.389, "loss/crossentropy": 2.5021653175354004, "loss/dist_ce": 0.0, "loss/fcd": 1.203125, "loss/idx": 13.0, "loss/logits": 0.18584418296813965, "step": 1831 }, { "epoch": 0.02735553232790802, "grad_norm": 0.52734375, "grad_norm_var": 0.005183664957682291, "learning_rate": 2e-05, "loss": 1.1209, "loss/crossentropy": 2.6762871742248535, "loss/dist_ce": 0.0, "loss/fcd": 0.97265625, "loss/idx": 13.0, "loss/logits": 0.14821532368659973, "step": 1832 }, { "epoch": 0.027370464387038974, "grad_norm": 0.55859375, "grad_norm_var": 0.0051680882771809895, "learning_rate": 2e-05, "loss": 1.2086, "loss/crossentropy": 2.7123446464538574, "loss/dist_ce": 0.0, "loss/fcd": 1.046875, "loss/idx": 13.0, "loss/logits": 0.16171371936798096, "step": 1833 }, { "epoch": 0.027385396446169927, "grad_norm": 0.5703125, "grad_norm_var": 0.005060259501139323, "learning_rate": 2e-05, "loss": 1.2813, "loss/crossentropy": 2.3558154106140137, "loss/dist_ce": 0.0, "loss/fcd": 1.1015625, "loss/idx": 13.0, "loss/logits": 0.1797207146883011, "step": 1834 }, { "epoch": 0.027400328505300883, "grad_norm": 0.5546875, "grad_norm_var": 0.005060259501139323, "learning_rate": 2e-05, "loss": 1.2775, "loss/crossentropy": 2.6045339107513428, "loss/dist_ce": 0.0, "loss/fcd": 1.1015625, "loss/idx": 13.0, "loss/logits": 0.1759113073348999, "step": 1835 }, { "epoch": 0.027415260564431835, "grad_norm": 0.51953125, "grad_norm_var": 0.0048476537068684895, "learning_rate": 2e-05, "loss": 1.0849, "loss/crossentropy": 2.608766794204712, "loss/dist_ce": 0.0, "loss/fcd": 0.94140625, "loss/idx": 13.0, "loss/logits": 0.14344725012779236, "step": 1836 }, { "epoch": 0.02743019262356279, "grad_norm": 0.4609375, "grad_norm_var": 0.005395253499348958, "learning_rate": 2e-05, "loss": 1.1483, "loss/crossentropy": 2.852187395095825, "loss/dist_ce": 0.0, "loss/fcd": 0.9921875, "loss/idx": 13.0, "loss/logits": 0.1560676395893097, "step": 1837 }, { "epoch": 0.027445124682693743, "grad_norm": 0.59765625, "grad_norm_var": 0.005535888671875, "learning_rate": 2e-05, "loss": 1.3112, "loss/crossentropy": 2.568100690841675, "loss/dist_ce": 0.0, "loss/fcd": 1.1328125, "loss/idx": 13.0, "loss/logits": 0.17836827039718628, "step": 1838 }, { "epoch": 0.0274600567418247, "grad_norm": 0.546875, "grad_norm_var": 0.004830360412597656, "learning_rate": 2e-05, "loss": 1.2512, "loss/crossentropy": 2.4580633640289307, "loss/dist_ce": 0.0, "loss/fcd": 1.0703125, "loss/idx": 13.0, "loss/logits": 0.18085670471191406, "step": 1839 }, { "epoch": 0.02747498880095565, "grad_norm": 0.51171875, "grad_norm_var": 0.004811350504557292, "learning_rate": 2e-05, "loss": 1.1555, "loss/crossentropy": 2.5179049968719482, "loss/dist_ce": 0.0, "loss/fcd": 1.015625, "loss/idx": 13.0, "loss/logits": 0.13983239233493805, "step": 1840 }, { "epoch": 0.027489920860086607, "grad_norm": 0.53515625, "grad_norm_var": 0.004644775390625, "learning_rate": 2e-05, "loss": 1.2934, "loss/crossentropy": 2.557543992996216, "loss/dist_ce": 0.0, "loss/fcd": 1.09375, "loss/idx": 13.0, "loss/logits": 0.19966451823711395, "step": 1841 }, { "epoch": 0.02750485291921756, "grad_norm": 0.5234375, "grad_norm_var": 0.004674720764160156, "learning_rate": 2e-05, "loss": 1.2074, "loss/crossentropy": 2.637477159500122, "loss/dist_ce": 0.0, "loss/fcd": 1.046875, "loss/idx": 13.0, "loss/logits": 0.16053849458694458, "step": 1842 }, { "epoch": 0.027519784978348515, "grad_norm": 0.55859375, "grad_norm_var": 0.004668617248535156, "learning_rate": 2e-05, "loss": 1.2512, "loss/crossentropy": 2.749081611633301, "loss/dist_ce": 0.0, "loss/fcd": 1.0703125, "loss/idx": 13.0, "loss/logits": 0.1809091717004776, "step": 1843 }, { "epoch": 0.027534717037479468, "grad_norm": 0.59765625, "grad_norm_var": 0.004447364807128906, "learning_rate": 2e-05, "loss": 1.2723, "loss/crossentropy": 2.6513166427612305, "loss/dist_ce": 0.0, "loss/fcd": 1.0859375, "loss/idx": 13.0, "loss/logits": 0.18633897602558136, "step": 1844 }, { "epoch": 0.027549649096610423, "grad_norm": 0.62109375, "grad_norm_var": 0.0042938232421875, "learning_rate": 2e-05, "loss": 1.2902, "loss/crossentropy": 2.4566802978515625, "loss/dist_ce": 0.0, "loss/fcd": 1.109375, "loss/idx": 13.0, "loss/logits": 0.18077588081359863, "step": 1845 }, { "epoch": 0.027564581155741376, "grad_norm": 0.46484375, "grad_norm_var": 0.0021432876586914063, "learning_rate": 2e-05, "loss": 1.1489, "loss/crossentropy": 2.5728206634521484, "loss/dist_ce": 0.0, "loss/fcd": 0.99609375, "loss/idx": 13.0, "loss/logits": 0.15283794701099396, "step": 1846 }, { "epoch": 0.02757951321487233, "grad_norm": 0.4921875, "grad_norm_var": 0.002064005533854167, "learning_rate": 2e-05, "loss": 1.1915, "loss/crossentropy": 2.5025198459625244, "loss/dist_ce": 0.0, "loss/fcd": 1.03125, "loss/idx": 13.0, "loss/logits": 0.1602761149406433, "step": 1847 }, { "epoch": 0.027594445274003284, "grad_norm": 0.478515625, "grad_norm_var": 0.002295668919881185, "learning_rate": 2e-05, "loss": 1.154, "loss/crossentropy": 2.742009401321411, "loss/dist_ce": 0.0, "loss/fcd": 1.0, "loss/idx": 13.0, "loss/logits": 0.15398138761520386, "step": 1848 }, { "epoch": 0.02760937733313424, "grad_norm": 0.484375, "grad_norm_var": 0.0024261315663655597, "learning_rate": 2e-05, "loss": 1.2092, "loss/crossentropy": 2.5691721439361572, "loss/dist_ce": 0.0, "loss/fcd": 1.03125, "loss/idx": 13.0, "loss/logits": 0.17790091037750244, "step": 1849 }, { "epoch": 0.027624309392265192, "grad_norm": 0.498046875, "grad_norm_var": 0.0023867289225260415, "learning_rate": 2e-05, "loss": 1.2136, "loss/crossentropy": 2.7507925033569336, "loss/dist_ce": 0.0, "loss/fcd": 1.0546875, "loss/idx": 13.0, "loss/logits": 0.1589614599943161, "step": 1850 }, { "epoch": 0.027639241451396148, "grad_norm": 0.482421875, "grad_norm_var": 0.002454360326131185, "learning_rate": 2e-05, "loss": 1.2141, "loss/crossentropy": 2.597674608230591, "loss/dist_ce": 0.0, "loss/fcd": 1.0390625, "loss/idx": 13.0, "loss/logits": 0.17499211430549622, "step": 1851 }, { "epoch": 0.0276541735105271, "grad_norm": 0.59375, "grad_norm_var": 0.0027611891428629557, "learning_rate": 2e-05, "loss": 1.2882, "loss/crossentropy": 2.4266016483306885, "loss/dist_ce": 0.0, "loss/fcd": 1.1015625, "loss/idx": 13.0, "loss/logits": 0.18663495779037476, "step": 1852 }, { "epoch": 0.027669105569658056, "grad_norm": 0.515625, "grad_norm_var": 0.002459446589152018, "learning_rate": 2e-05, "loss": 1.2623, "loss/crossentropy": 2.6035945415496826, "loss/dist_ce": 0.0, "loss/fcd": 1.1015625, "loss/idx": 13.0, "loss/logits": 0.1607305407524109, "step": 1853 }, { "epoch": 0.02768403762878901, "grad_norm": 0.48828125, "grad_norm_var": 0.002240482966105143, "learning_rate": 2e-05, "loss": 1.2368, "loss/crossentropy": 2.7356526851654053, "loss/dist_ce": 0.0, "loss/fcd": 1.0625, "loss/idx": 13.0, "loss/logits": 0.17425121366977692, "step": 1854 }, { "epoch": 0.027698969687919964, "grad_norm": 0.53125, "grad_norm_var": 0.0022092024485270184, "learning_rate": 2e-05, "loss": 1.1971, "loss/crossentropy": 2.591665029525757, "loss/dist_ce": 0.0, "loss/fcd": 1.03125, "loss/idx": 13.0, "loss/logits": 0.16583198308944702, "step": 1855 }, { "epoch": 0.027713901747050917, "grad_norm": 0.5703125, "grad_norm_var": 0.0023312727610270184, "learning_rate": 2e-05, "loss": 1.2682, "loss/crossentropy": 2.8954193592071533, "loss/dist_ce": 0.0, "loss/fcd": 1.0859375, "loss/idx": 13.0, "loss/logits": 0.1822158247232437, "step": 1856 }, { "epoch": 0.027728833806181873, "grad_norm": 0.55859375, "grad_norm_var": 0.002390400568644206, "learning_rate": 2e-05, "loss": 1.4086, "loss/crossentropy": 2.160271167755127, "loss/dist_ce": 0.0, "loss/fcd": 1.1953125, "loss/idx": 13.0, "loss/logits": 0.21325430274009705, "step": 1857 }, { "epoch": 0.027743765865312825, "grad_norm": 0.52734375, "grad_norm_var": 0.002388620376586914, "learning_rate": 2e-05, "loss": 1.2161, "loss/crossentropy": 2.652884006500244, "loss/dist_ce": 0.0, "loss/fcd": 1.046875, "loss/idx": 13.0, "loss/logits": 0.16925424337387085, "step": 1858 }, { "epoch": 0.02775869792444378, "grad_norm": 0.54296875, "grad_norm_var": 0.002342081069946289, "learning_rate": 2e-05, "loss": 1.3669, "loss/crossentropy": 2.712960720062256, "loss/dist_ce": 0.0, "loss/fcd": 1.171875, "loss/idx": 13.0, "loss/logits": 0.19504106044769287, "step": 1859 }, { "epoch": 0.027773629983574737, "grad_norm": 0.55078125, "grad_norm_var": 0.0020437717437744142, "learning_rate": 2e-05, "loss": 1.2549, "loss/crossentropy": 2.463047742843628, "loss/dist_ce": 0.0, "loss/fcd": 1.0859375, "loss/idx": 13.0, "loss/logits": 0.16891682147979736, "step": 1860 }, { "epoch": 0.02778856204270569, "grad_norm": 0.5390625, "grad_norm_var": 0.001413583755493164, "learning_rate": 2e-05, "loss": 1.1812, "loss/crossentropy": 2.5723025798797607, "loss/dist_ce": 0.0, "loss/fcd": 1.03125, "loss/idx": 13.0, "loss/logits": 0.14994603395462036, "step": 1861 }, { "epoch": 0.027803494101836645, "grad_norm": 0.5546875, "grad_norm_var": 0.0012585798899332683, "learning_rate": 2e-05, "loss": 1.3497, "loss/crossentropy": 2.5939555168151855, "loss/dist_ce": 0.0, "loss/fcd": 1.1484375, "loss/idx": 13.0, "loss/logits": 0.20121702551841736, "step": 1862 }, { "epoch": 0.027818426160967597, "grad_norm": 0.5703125, "grad_norm_var": 0.0012929121653238933, "learning_rate": 2e-05, "loss": 1.3497, "loss/crossentropy": 2.595493793487549, "loss/dist_ce": 0.0, "loss/fcd": 1.1484375, "loss/idx": 13.0, "loss/logits": 0.20122528076171875, "step": 1863 }, { "epoch": 0.027833358220098553, "grad_norm": 0.515625, "grad_norm_var": 0.001122283935546875, "learning_rate": 2e-05, "loss": 1.2143, "loss/crossentropy": 2.7929248809814453, "loss/dist_ce": 0.0, "loss/fcd": 1.0390625, "loss/idx": 13.0, "loss/logits": 0.17519071698188782, "step": 1864 }, { "epoch": 0.027848290279229505, "grad_norm": 0.54296875, "grad_norm_var": 0.0009592056274414062, "learning_rate": 2e-05, "loss": 1.2764, "loss/crossentropy": 2.555880069732666, "loss/dist_ce": 0.0, "loss/fcd": 1.1015625, "loss/idx": 13.0, "loss/logits": 0.17485883831977844, "step": 1865 }, { "epoch": 0.02786322233836046, "grad_norm": 0.4765625, "grad_norm_var": 0.001097853978474935, "learning_rate": 2e-05, "loss": 1.2594, "loss/crossentropy": 2.3144571781158447, "loss/dist_ce": 0.0, "loss/fcd": 1.078125, "loss/idx": 13.0, "loss/logits": 0.18126043677330017, "step": 1866 }, { "epoch": 0.027878154397491414, "grad_norm": 0.5859375, "grad_norm_var": 0.001041412353515625, "learning_rate": 2e-05, "loss": 1.2723, "loss/crossentropy": 2.8717572689056396, "loss/dist_ce": 0.0, "loss/fcd": 1.1015625, "loss/idx": 13.0, "loss/logits": 0.17073537409305573, "step": 1867 }, { "epoch": 0.02789308645662237, "grad_norm": 0.53125, "grad_norm_var": 0.0008501688639322917, "learning_rate": 2e-05, "loss": 1.2706, "loss/crossentropy": 2.798926830291748, "loss/dist_ce": 0.0, "loss/fcd": 1.078125, "loss/idx": 13.0, "loss/logits": 0.19248628616333008, "step": 1868 }, { "epoch": 0.02790801851575332, "grad_norm": 0.56640625, "grad_norm_var": 0.0008625666300455729, "learning_rate": 2e-05, "loss": 1.1766, "loss/crossentropy": 2.5379748344421387, "loss/dist_ce": 0.0, "loss/fcd": 1.015625, "loss/idx": 13.0, "loss/logits": 0.16097001731395721, "step": 1869 }, { "epoch": 0.027922950574884278, "grad_norm": 0.55078125, "grad_norm_var": 0.0006692886352539062, "learning_rate": 2e-05, "loss": 1.2328, "loss/crossentropy": 2.607471227645874, "loss/dist_ce": 0.0, "loss/fcd": 1.0546875, "loss/idx": 13.0, "loss/logits": 0.17812922596931458, "step": 1870 }, { "epoch": 0.02793788263401523, "grad_norm": 0.56640625, "grad_norm_var": 0.00068359375, "learning_rate": 2e-05, "loss": 1.233, "loss/crossentropy": 2.5563502311706543, "loss/dist_ce": 0.0, "loss/fcd": 1.0546875, "loss/idx": 13.0, "loss/logits": 0.17826440930366516, "step": 1871 }, { "epoch": 0.027952814693146186, "grad_norm": 0.62109375, "grad_norm_var": 0.0010034561157226563, "learning_rate": 2e-05, "loss": 1.3474, "loss/crossentropy": 2.455146551132202, "loss/dist_ce": 0.0, "loss/fcd": 1.1640625, "loss/idx": 13.0, "loss/logits": 0.18337559700012207, "step": 1872 }, { "epoch": 0.027967746752277138, "grad_norm": 0.498046875, "grad_norm_var": 0.0011635939280192056, "learning_rate": 2e-05, "loss": 1.1542, "loss/crossentropy": 2.446068525314331, "loss/dist_ce": 0.0, "loss/fcd": 1.0078125, "loss/idx": 13.0, "loss/logits": 0.146395742893219, "step": 1873 }, { "epoch": 0.027982678811408094, "grad_norm": 0.56640625, "grad_norm_var": 0.0011604150136311849, "learning_rate": 2e-05, "loss": 1.3081, "loss/crossentropy": 2.396256685256958, "loss/dist_ce": 0.0, "loss/fcd": 1.1015625, "loss/idx": 13.0, "loss/logits": 0.20653998851776123, "step": 1874 }, { "epoch": 0.027997610870539046, "grad_norm": 0.56640625, "grad_norm_var": 0.0011768182118733724, "learning_rate": 2e-05, "loss": 1.2419, "loss/crossentropy": 2.4543728828430176, "loss/dist_ce": 0.0, "loss/fcd": 1.0703125, "loss/idx": 13.0, "loss/logits": 0.17160436511039734, "step": 1875 }, { "epoch": 0.028012542929670002, "grad_norm": 0.53515625, "grad_norm_var": 0.0011908054351806641, "learning_rate": 2e-05, "loss": 1.2603, "loss/crossentropy": 2.67966365814209, "loss/dist_ce": 0.0, "loss/fcd": 1.0625, "loss/idx": 13.0, "loss/logits": 0.1977955400943756, "step": 1876 }, { "epoch": 0.028027474988800954, "grad_norm": 0.6875, "grad_norm_var": 0.0023673852284749348, "learning_rate": 2e-05, "loss": 1.3841, "loss/crossentropy": 2.7667315006256104, "loss/dist_ce": 0.0, "loss/fcd": 1.171875, "loss/idx": 13.0, "loss/logits": 0.21221590042114258, "step": 1877 }, { "epoch": 0.02804240704793191, "grad_norm": 0.4921875, "grad_norm_var": 0.0026430606842041014, "learning_rate": 2e-05, "loss": 1.1679, "loss/crossentropy": 2.534827470779419, "loss/dist_ce": 0.0, "loss/fcd": 1.015625, "loss/idx": 13.0, "loss/logits": 0.1522301733493805, "step": 1878 }, { "epoch": 0.028057339107062863, "grad_norm": 0.671875, "grad_norm_var": 0.0035009860992431642, "learning_rate": 2e-05, "loss": 1.426, "loss/crossentropy": 2.4057843685150146, "loss/dist_ce": 0.0, "loss/fcd": 1.2109375, "loss/idx": 13.0, "loss/logits": 0.2150222361087799, "step": 1879 }, { "epoch": 0.02807227116619382, "grad_norm": 0.59765625, "grad_norm_var": 0.003426218032836914, "learning_rate": 2e-05, "loss": 1.347, "loss/crossentropy": 2.831153154373169, "loss/dist_ce": 0.0, "loss/fcd": 1.140625, "loss/idx": 13.0, "loss/logits": 0.20638611912727356, "step": 1880 }, { "epoch": 0.02808720322532477, "grad_norm": 0.53125, "grad_norm_var": 0.0034708499908447264, "learning_rate": 2e-05, "loss": 1.308, "loss/crossentropy": 2.6058216094970703, "loss/dist_ce": 0.0, "loss/fcd": 1.109375, "loss/idx": 13.0, "loss/logits": 0.1986655741930008, "step": 1881 }, { "epoch": 0.028102135284455727, "grad_norm": 0.69921875, "grad_norm_var": 0.003934717178344727, "learning_rate": 2e-05, "loss": 1.3506, "loss/crossentropy": 2.816981315612793, "loss/dist_ce": 0.0, "loss/fcd": 1.140625, "loss/idx": 13.0, "loss/logits": 0.20997676253318787, "step": 1882 }, { "epoch": 0.02811706734358668, "grad_norm": 0.546875, "grad_norm_var": 0.0039951165517171225, "learning_rate": 2e-05, "loss": 1.2804, "loss/crossentropy": 2.183166265487671, "loss/dist_ce": 0.0, "loss/fcd": 1.109375, "loss/idx": 13.0, "loss/logits": 0.17098873853683472, "step": 1883 }, { "epoch": 0.028131999402717635, "grad_norm": 0.55078125, "grad_norm_var": 0.0039003849029541015, "learning_rate": 2e-05, "loss": 1.3684, "loss/crossentropy": 2.391815423965454, "loss/dist_ce": 0.0, "loss/fcd": 1.1484375, "loss/idx": 13.0, "loss/logits": 0.21994663774967194, "step": 1884 }, { "epoch": 0.028146931461848587, "grad_norm": 0.53125, "grad_norm_var": 0.004031991958618164, "learning_rate": 2e-05, "loss": 1.1912, "loss/crossentropy": 2.4962005615234375, "loss/dist_ce": 0.0, "loss/fcd": 1.03125, "loss/idx": 13.0, "loss/logits": 0.1599755585193634, "step": 1885 }, { "epoch": 0.028161863520979543, "grad_norm": 0.5, "grad_norm_var": 0.0043625990549723305, "learning_rate": 2e-05, "loss": 1.2411, "loss/crossentropy": 2.674856662750244, "loss/dist_ce": 0.0, "loss/fcd": 1.0625, "loss/idx": 13.0, "loss/logits": 0.1785699725151062, "step": 1886 }, { "epoch": 0.0281767955801105, "grad_norm": 0.5234375, "grad_norm_var": 0.004513661066691081, "learning_rate": 2e-05, "loss": 1.3122, "loss/crossentropy": 2.5995376110076904, "loss/dist_ce": 0.0, "loss/fcd": 1.1171875, "loss/idx": 13.0, "loss/logits": 0.19502834975719452, "step": 1887 }, { "epoch": 0.02819172763924145, "grad_norm": 0.625, "grad_norm_var": 0.004541254043579102, "learning_rate": 2e-05, "loss": 1.3003, "loss/crossentropy": 2.6286256313323975, "loss/dist_ce": 0.0, "loss/fcd": 1.1171875, "loss/idx": 13.0, "loss/logits": 0.1830824315547943, "step": 1888 }, { "epoch": 0.028206659698372407, "grad_norm": 0.53515625, "grad_norm_var": 0.004270362854003906, "learning_rate": 2e-05, "loss": 1.2272, "loss/crossentropy": 2.5657100677490234, "loss/dist_ce": 0.0, "loss/fcd": 1.0546875, "loss/idx": 13.0, "loss/logits": 0.17253029346466064, "step": 1889 }, { "epoch": 0.02822159175750336, "grad_norm": 0.51171875, "grad_norm_var": 0.0045017878214518225, "learning_rate": 2e-05, "loss": 1.2384, "loss/crossentropy": 2.6371774673461914, "loss/dist_ce": 0.0, "loss/fcd": 1.0703125, "loss/idx": 13.0, "loss/logits": 0.1681075394153595, "step": 1890 }, { "epoch": 0.028236523816634315, "grad_norm": 0.4921875, "grad_norm_var": 0.004872639973958333, "learning_rate": 2e-05, "loss": 1.2299, "loss/crossentropy": 2.384807825088501, "loss/dist_ce": 0.0, "loss/fcd": 1.0546875, "loss/idx": 13.0, "loss/logits": 0.17519429326057434, "step": 1891 }, { "epoch": 0.028251455875765268, "grad_norm": 0.515625, "grad_norm_var": 0.004972775777180989, "learning_rate": 2e-05, "loss": 1.2124, "loss/crossentropy": 2.781672954559326, "loss/dist_ce": 0.0, "loss/fcd": 1.0390625, "loss/idx": 13.0, "loss/logits": 0.17337191104888916, "step": 1892 }, { "epoch": 0.028266387934896223, "grad_norm": 0.4921875, "grad_norm_var": 0.004120826721191406, "learning_rate": 2e-05, "loss": 1.1777, "loss/crossentropy": 2.4862048625946045, "loss/dist_ce": 0.0, "loss/fcd": 1.0234375, "loss/idx": 13.0, "loss/logits": 0.15426138043403625, "step": 1893 }, { "epoch": 0.028281319994027176, "grad_norm": 0.52734375, "grad_norm_var": 0.003922271728515625, "learning_rate": 2e-05, "loss": 1.2273, "loss/crossentropy": 2.573594808578491, "loss/dist_ce": 0.0, "loss/fcd": 1.0625, "loss/idx": 13.0, "loss/logits": 0.16478094458580017, "step": 1894 }, { "epoch": 0.02829625205315813, "grad_norm": 0.578125, "grad_norm_var": 0.002988433837890625, "learning_rate": 2e-05, "loss": 1.2974, "loss/crossentropy": 2.44994854927063, "loss/dist_ce": 0.0, "loss/fcd": 1.09375, "loss/idx": 13.0, "loss/logits": 0.2036007046699524, "step": 1895 }, { "epoch": 0.028311184112289084, "grad_norm": 0.53125, "grad_norm_var": 0.0028187433878580728, "learning_rate": 2e-05, "loss": 1.291, "loss/crossentropy": 2.642103433609009, "loss/dist_ce": 0.0, "loss/fcd": 1.09375, "loss/idx": 13.0, "loss/logits": 0.19725409150123596, "step": 1896 }, { "epoch": 0.02832611617142004, "grad_norm": 0.5234375, "grad_norm_var": 0.0028350194295247394, "learning_rate": 2e-05, "loss": 1.2426, "loss/crossentropy": 2.4922900199890137, "loss/dist_ce": 0.0, "loss/fcd": 1.0703125, "loss/idx": 13.0, "loss/logits": 0.17228657007217407, "step": 1897 }, { "epoch": 0.028341048230550992, "grad_norm": 0.5078125, "grad_norm_var": 0.0011309305826822916, "learning_rate": 2e-05, "loss": 1.2007, "loss/crossentropy": 2.5027291774749756, "loss/dist_ce": 0.0, "loss/fcd": 1.046875, "loss/idx": 13.0, "loss/logits": 0.15385723114013672, "step": 1898 }, { "epoch": 0.028355980289681948, "grad_norm": 0.484375, "grad_norm_var": 0.0012407938639322916, "learning_rate": 2e-05, "loss": 1.1203, "loss/crossentropy": 2.66294527053833, "loss/dist_ce": 0.0, "loss/fcd": 0.97265625, "loss/idx": 13.0, "loss/logits": 0.14767876267433167, "step": 1899 }, { "epoch": 0.0283709123488129, "grad_norm": 0.474609375, "grad_norm_var": 0.0013604323069254557, "learning_rate": 2e-05, "loss": 1.1664, "loss/crossentropy": 2.5771396160125732, "loss/dist_ce": 0.0, "loss/fcd": 1.015625, "loss/idx": 13.0, "loss/logits": 0.1507551074028015, "step": 1900 }, { "epoch": 0.028385844407943856, "grad_norm": 0.62109375, "grad_norm_var": 0.001974598566691081, "learning_rate": 2e-05, "loss": 1.3821, "loss/crossentropy": 2.398294448852539, "loss/dist_ce": 0.0, "loss/fcd": 1.1640625, "loss/idx": 13.0, "loss/logits": 0.21805456280708313, "step": 1901 }, { "epoch": 0.02840077646707481, "grad_norm": 0.6171875, "grad_norm_var": 0.0023999373118082684, "learning_rate": 2e-05, "loss": 1.2681, "loss/crossentropy": 2.537140130996704, "loss/dist_ce": 0.0, "loss/fcd": 1.09375, "loss/idx": 13.0, "loss/logits": 0.17434020340442657, "step": 1902 }, { "epoch": 0.028415708526205764, "grad_norm": 0.5625, "grad_norm_var": 0.0024349053700764975, "learning_rate": 2e-05, "loss": 1.2614, "loss/crossentropy": 2.519807815551758, "loss/dist_ce": 0.0, "loss/fcd": 1.0859375, "loss/idx": 13.0, "loss/logits": 0.17546257376670837, "step": 1903 }, { "epoch": 0.028430640585336717, "grad_norm": 0.474609375, "grad_norm_var": 0.0020934422810872395, "learning_rate": 2e-05, "loss": 1.2129, "loss/crossentropy": 2.592402696609497, "loss/dist_ce": 0.0, "loss/fcd": 1.0390625, "loss/idx": 13.0, "loss/logits": 0.17381396889686584, "step": 1904 }, { "epoch": 0.028445572644467673, "grad_norm": 0.6484375, "grad_norm_var": 0.003002421061197917, "learning_rate": 2e-05, "loss": 1.3098, "loss/crossentropy": 2.6101558208465576, "loss/dist_ce": 0.0, "loss/fcd": 1.125, "loss/idx": 13.0, "loss/logits": 0.18479961156845093, "step": 1905 }, { "epoch": 0.028460504703598625, "grad_norm": 0.49609375, "grad_norm_var": 0.0030665079752604167, "learning_rate": 2e-05, "loss": 1.1539, "loss/crossentropy": 2.396393060684204, "loss/dist_ce": 0.0, "loss/fcd": 1.0078125, "loss/idx": 13.0, "loss/logits": 0.14609143137931824, "step": 1906 }, { "epoch": 0.02847543676272958, "grad_norm": 0.578125, "grad_norm_var": 0.0030469258626302084, "learning_rate": 2e-05, "loss": 1.3732, "loss/crossentropy": 2.566880226135254, "loss/dist_ce": 0.0, "loss/fcd": 1.171875, "loss/idx": 13.0, "loss/logits": 0.20131400227546692, "step": 1907 }, { "epoch": 0.028490368821860533, "grad_norm": 0.9765625, "grad_norm_var": 0.014855448404947917, "learning_rate": 2e-05, "loss": 1.4376, "loss/crossentropy": 1.9921377897262573, "loss/dist_ce": 0.0, "loss/fcd": 1.265625, "loss/idx": 13.0, "loss/logits": 0.17201855778694153, "step": 1908 }, { "epoch": 0.02850530088099149, "grad_norm": 0.77734375, "grad_norm_var": 0.017041460673014323, "learning_rate": 2e-05, "loss": 1.592, "loss/crossentropy": 2.6982979774475098, "loss/dist_ce": 0.0, "loss/fcd": 1.328125, "loss/idx": 13.0, "loss/logits": 0.26390308141708374, "step": 1909 }, { "epoch": 0.02852023294012244, "grad_norm": 0.5390625, "grad_norm_var": 0.016958109537760415, "learning_rate": 2e-05, "loss": 1.224, "loss/crossentropy": 2.5811877250671387, "loss/dist_ce": 0.0, "loss/fcd": 1.0625, "loss/idx": 13.0, "loss/logits": 0.1615004688501358, "step": 1910 }, { "epoch": 0.028535164999253397, "grad_norm": 0.54296875, "grad_norm_var": 0.017076555887858072, "learning_rate": 2e-05, "loss": 1.3145, "loss/crossentropy": 2.500891923904419, "loss/dist_ce": 0.0, "loss/fcd": 1.1328125, "loss/idx": 13.0, "loss/logits": 0.1816907823085785, "step": 1911 }, { "epoch": 0.028550097058384353, "grad_norm": 0.5625, "grad_norm_var": 0.016914812723795573, "learning_rate": 2e-05, "loss": 1.2898, "loss/crossentropy": 2.510209798812866, "loss/dist_ce": 0.0, "loss/fcd": 1.078125, "loss/idx": 13.0, "loss/logits": 0.2116694152355194, "step": 1912 }, { "epoch": 0.028565029117515305, "grad_norm": 0.6640625, "grad_norm_var": 0.016965166727701823, "learning_rate": 2e-05, "loss": 1.2248, "loss/crossentropy": 2.9473345279693604, "loss/dist_ce": 0.0, "loss/fcd": 1.0625, "loss/idx": 13.0, "loss/logits": 0.1623196303844452, "step": 1913 }, { "epoch": 0.02857996117664626, "grad_norm": 0.58203125, "grad_norm_var": 0.016442108154296874, "learning_rate": 2e-05, "loss": 1.3948, "loss/crossentropy": 2.6808791160583496, "loss/dist_ce": 0.0, "loss/fcd": 1.1796875, "loss/idx": 13.0, "loss/logits": 0.21510747075080872, "step": 1914 }, { "epoch": 0.028594893235777213, "grad_norm": 0.58984375, "grad_norm_var": 0.015509986877441406, "learning_rate": 2e-05, "loss": 1.3086, "loss/crossentropy": 2.472118616104126, "loss/dist_ce": 0.0, "loss/fcd": 1.1328125, "loss/idx": 13.0, "loss/logits": 0.1758304238319397, "step": 1915 }, { "epoch": 0.02860982529490817, "grad_norm": 0.59375, "grad_norm_var": 0.01429899533589681, "learning_rate": 2e-05, "loss": 1.2624, "loss/crossentropy": 2.7138822078704834, "loss/dist_ce": 0.0, "loss/fcd": 1.1015625, "loss/idx": 13.0, "loss/logits": 0.16082213819026947, "step": 1916 }, { "epoch": 0.02862475735403912, "grad_norm": 0.5546875, "grad_norm_var": 0.014512999852498373, "learning_rate": 2e-05, "loss": 1.2013, "loss/crossentropy": 2.4729011058807373, "loss/dist_ce": 0.0, "loss/fcd": 1.0390625, "loss/idx": 13.0, "loss/logits": 0.1622874140739441, "step": 1917 }, { "epoch": 0.028639689413170077, "grad_norm": 0.5078125, "grad_norm_var": 0.015155649185180664, "learning_rate": 2e-05, "loss": 1.1805, "loss/crossentropy": 2.375136613845825, "loss/dist_ce": 0.0, "loss/fcd": 1.0390625, "loss/idx": 13.0, "loss/logits": 0.1414240300655365, "step": 1918 }, { "epoch": 0.02865462147230103, "grad_norm": 0.5546875, "grad_norm_var": 0.015201807022094727, "learning_rate": 2e-05, "loss": 1.2548, "loss/crossentropy": 2.6161394119262695, "loss/dist_ce": 0.0, "loss/fcd": 1.0859375, "loss/idx": 13.0, "loss/logits": 0.16885605454444885, "step": 1919 }, { "epoch": 0.028669553531431986, "grad_norm": 0.5390625, "grad_norm_var": 0.014361000061035157, "learning_rate": 2e-05, "loss": 1.2967, "loss/crossentropy": 2.6225621700286865, "loss/dist_ce": 0.0, "loss/fcd": 1.1015625, "loss/idx": 13.0, "loss/logits": 0.19516368210315704, "step": 1920 }, { "epoch": 0.028684485590562938, "grad_norm": 0.5390625, "grad_norm_var": 0.014499855041503907, "learning_rate": 2e-05, "loss": 1.2501, "loss/crossentropy": 2.6922271251678467, "loss/dist_ce": 0.0, "loss/fcd": 1.0703125, "loss/idx": 13.0, "loss/logits": 0.17982667684555054, "step": 1921 }, { "epoch": 0.028699417649693894, "grad_norm": 0.490234375, "grad_norm_var": 0.014583063125610352, "learning_rate": 2e-05, "loss": 1.2416, "loss/crossentropy": 2.497199535369873, "loss/dist_ce": 0.0, "loss/fcd": 1.0625, "loss/idx": 13.0, "loss/logits": 0.17905518412590027, "step": 1922 }, { "epoch": 0.028714349708824846, "grad_norm": 0.498046875, "grad_norm_var": 0.015211931864420573, "learning_rate": 2e-05, "loss": 1.2402, "loss/crossentropy": 2.8170697689056396, "loss/dist_ce": 0.0, "loss/fcd": 1.0703125, "loss/idx": 13.0, "loss/logits": 0.16988661885261536, "step": 1923 }, { "epoch": 0.028729281767955802, "grad_norm": 0.578125, "grad_norm_var": 0.004835955301920573, "learning_rate": 2e-05, "loss": 1.2604, "loss/crossentropy": 2.3041558265686035, "loss/dist_ce": 0.0, "loss/fcd": 1.1015625, "loss/idx": 13.0, "loss/logits": 0.1588452011346817, "step": 1924 }, { "epoch": 0.028744213827086754, "grad_norm": 0.546875, "grad_norm_var": 0.0017712910970052083, "learning_rate": 2e-05, "loss": 1.2265, "loss/crossentropy": 2.748405694961548, "loss/dist_ce": 0.0, "loss/fcd": 1.0546875, "loss/idx": 13.0, "loss/logits": 0.17180919647216797, "step": 1925 }, { "epoch": 0.02875914588621771, "grad_norm": 0.6640625, "grad_norm_var": 0.002479298909505208, "learning_rate": 2e-05, "loss": 1.3679, "loss/crossentropy": 2.416738510131836, "loss/dist_ce": 0.0, "loss/fcd": 1.1796875, "loss/idx": 13.0, "loss/logits": 0.18823902308940887, "step": 1926 }, { "epoch": 0.028774077945348663, "grad_norm": 0.515625, "grad_norm_var": 0.002599016825358073, "learning_rate": 2e-05, "loss": 1.1838, "loss/crossentropy": 2.5889949798583984, "loss/dist_ce": 0.0, "loss/fcd": 1.0234375, "loss/idx": 13.0, "loss/logits": 0.16037039458751678, "step": 1927 }, { "epoch": 0.02878901000447962, "grad_norm": 0.5234375, "grad_norm_var": 0.002688026428222656, "learning_rate": 2e-05, "loss": 1.3251, "loss/crossentropy": 2.383061408996582, "loss/dist_ce": 0.0, "loss/fcd": 1.140625, "loss/idx": 13.0, "loss/logits": 0.1844463050365448, "step": 1928 }, { "epoch": 0.02880394206361057, "grad_norm": 0.50390625, "grad_norm_var": 0.0020441691080729167, "learning_rate": 2e-05, "loss": 1.1616, "loss/crossentropy": 2.659311532974243, "loss/dist_ce": 0.0, "loss/fcd": 1.0078125, "loss/idx": 13.0, "loss/logits": 0.153801828622818, "step": 1929 }, { "epoch": 0.028818874122741527, "grad_norm": 0.5234375, "grad_norm_var": 0.001999346415201823, "learning_rate": 2e-05, "loss": 1.204, "loss/crossentropy": 2.457728862762451, "loss/dist_ce": 0.0, "loss/fcd": 1.046875, "loss/idx": 13.0, "loss/logits": 0.1570843905210495, "step": 1930 }, { "epoch": 0.02883380618187248, "grad_norm": 0.50390625, "grad_norm_var": 0.001948992411295573, "learning_rate": 2e-05, "loss": 1.2224, "loss/crossentropy": 2.628582715988159, "loss/dist_ce": 0.0, "loss/fcd": 1.046875, "loss/idx": 13.0, "loss/logits": 0.17548592388629913, "step": 1931 }, { "epoch": 0.028848738241003435, "grad_norm": 0.49609375, "grad_norm_var": 0.001842498779296875, "learning_rate": 2e-05, "loss": 1.179, "loss/crossentropy": 2.5765113830566406, "loss/dist_ce": 0.0, "loss/fcd": 1.0234375, "loss/idx": 13.0, "loss/logits": 0.1555609107017517, "step": 1932 }, { "epoch": 0.028863670300134387, "grad_norm": 0.54296875, "grad_norm_var": 0.0018182754516601562, "learning_rate": 2e-05, "loss": 1.2661, "loss/crossentropy": 2.8126022815704346, "loss/dist_ce": 0.0, "loss/fcd": 1.0859375, "loss/idx": 13.0, "loss/logits": 0.1801205724477768, "step": 1933 }, { "epoch": 0.028878602359265343, "grad_norm": 0.5234375, "grad_norm_var": 0.001781145731608073, "learning_rate": 2e-05, "loss": 1.2714, "loss/crossentropy": 2.6158759593963623, "loss/dist_ce": 0.0, "loss/fcd": 1.0859375, "loss/idx": 13.0, "loss/logits": 0.18547698855400085, "step": 1934 }, { "epoch": 0.028893534418396295, "grad_norm": 0.625, "grad_norm_var": 0.0022846857706705728, "learning_rate": 2e-05, "loss": 1.3012, "loss/crossentropy": 2.6109371185302734, "loss/dist_ce": 0.0, "loss/fcd": 1.1171875, "loss/idx": 13.0, "loss/logits": 0.18406124413013458, "step": 1935 }, { "epoch": 0.02890846647752725, "grad_norm": 0.55859375, "grad_norm_var": 0.0023104349772135415, "learning_rate": 2e-05, "loss": 1.1956, "loss/crossentropy": 2.6355507373809814, "loss/dist_ce": 0.0, "loss/fcd": 1.03125, "loss/idx": 13.0, "loss/logits": 0.1643276810646057, "step": 1936 }, { "epoch": 0.028923398536658203, "grad_norm": 0.5390625, "grad_norm_var": 0.0023104349772135415, "learning_rate": 2e-05, "loss": 1.3375, "loss/crossentropy": 2.5399065017700195, "loss/dist_ce": 0.0, "loss/fcd": 1.1484375, "loss/idx": 13.0, "loss/logits": 0.1890271008014679, "step": 1937 }, { "epoch": 0.02893833059578916, "grad_norm": 0.5234375, "grad_norm_var": 0.002161010106404622, "learning_rate": 2e-05, "loss": 1.2741, "loss/crossentropy": 2.486199378967285, "loss/dist_ce": 0.0, "loss/fcd": 1.09375, "loss/idx": 13.0, "loss/logits": 0.18031063675880432, "step": 1938 }, { "epoch": 0.028953262654920115, "grad_norm": 0.490234375, "grad_norm_var": 0.002210219701131185, "learning_rate": 2e-05, "loss": 1.2398, "loss/crossentropy": 2.4644899368286133, "loss/dist_ce": 0.0, "loss/fcd": 1.0625, "loss/idx": 13.0, "loss/logits": 0.17728056013584137, "step": 1939 }, { "epoch": 0.028968194714051067, "grad_norm": 0.5, "grad_norm_var": 0.00220640500386556, "learning_rate": 2e-05, "loss": 1.1981, "loss/crossentropy": 2.615004301071167, "loss/dist_ce": 0.0, "loss/fcd": 1.03125, "loss/idx": 13.0, "loss/logits": 0.16684943437576294, "step": 1940 }, { "epoch": 0.028983126773182023, "grad_norm": 0.578125, "grad_norm_var": 0.0023116906483968097, "learning_rate": 2e-05, "loss": 1.322, "loss/crossentropy": 2.47305965423584, "loss/dist_ce": 0.0, "loss/fcd": 1.140625, "loss/idx": 13.0, "loss/logits": 0.181331604719162, "step": 1941 }, { "epoch": 0.028998058832312976, "grad_norm": 0.5234375, "grad_norm_var": 0.0011878808339436848, "learning_rate": 2e-05, "loss": 1.3178, "loss/crossentropy": 2.2365729808807373, "loss/dist_ce": 0.0, "loss/fcd": 1.125, "loss/idx": 13.0, "loss/logits": 0.19283056259155273, "step": 1942 }, { "epoch": 0.02901299089144393, "grad_norm": 0.50390625, "grad_norm_var": 0.0012180169423421225, "learning_rate": 2e-05, "loss": 1.2095, "loss/crossentropy": 2.5759778022766113, "loss/dist_ce": 0.0, "loss/fcd": 1.046875, "loss/idx": 13.0, "loss/logits": 0.16261249780654907, "step": 1943 }, { "epoch": 0.029027922950574884, "grad_norm": 0.546875, "grad_norm_var": 0.00123594601949056, "learning_rate": 2e-05, "loss": 1.2683, "loss/crossentropy": 2.4381752014160156, "loss/dist_ce": 0.0, "loss/fcd": 1.09375, "loss/idx": 13.0, "loss/logits": 0.17450745403766632, "step": 1944 }, { "epoch": 0.02904285500970584, "grad_norm": 0.5078125, "grad_norm_var": 0.0012232303619384766, "learning_rate": 2e-05, "loss": 1.2061, "loss/crossentropy": 2.6488664150238037, "loss/dist_ce": 0.0, "loss/fcd": 1.046875, "loss/idx": 13.0, "loss/logits": 0.159229576587677, "step": 1945 }, { "epoch": 0.029057787068836792, "grad_norm": 0.8203125, "grad_norm_var": 0.006456232070922852, "learning_rate": 2e-05, "loss": 1.3904, "loss/crossentropy": 2.58434796333313, "loss/dist_ce": 0.0, "loss/fcd": 1.203125, "loss/idx": 13.0, "loss/logits": 0.18732021749019623, "step": 1946 }, { "epoch": 0.029072719127967748, "grad_norm": 0.60546875, "grad_norm_var": 0.006490945816040039, "learning_rate": 2e-05, "loss": 1.3589, "loss/crossentropy": 2.2552731037139893, "loss/dist_ce": 0.0, "loss/fcd": 1.1796875, "loss/idx": 13.0, "loss/logits": 0.1792486011981964, "step": 1947 }, { "epoch": 0.0290876511870987, "grad_norm": 0.52734375, "grad_norm_var": 0.006305297215779622, "learning_rate": 2e-05, "loss": 1.207, "loss/crossentropy": 2.5927445888519287, "loss/dist_ce": 0.0, "loss/fcd": 1.0390625, "loss/idx": 13.0, "loss/logits": 0.1679755449295044, "step": 1948 }, { "epoch": 0.029102583246229656, "grad_norm": 0.5625, "grad_norm_var": 0.006291945775349935, "learning_rate": 2e-05, "loss": 1.244, "loss/crossentropy": 2.586740255355835, "loss/dist_ce": 0.0, "loss/fcd": 1.0703125, "loss/idx": 13.0, "loss/logits": 0.17367993295192719, "step": 1949 }, { "epoch": 0.02911751530536061, "grad_norm": 0.515625, "grad_norm_var": 0.006332254409790039, "learning_rate": 2e-05, "loss": 1.1496, "loss/crossentropy": 2.475268840789795, "loss/dist_ce": 0.0, "loss/fcd": 1.0, "loss/idx": 13.0, "loss/logits": 0.14958719909191132, "step": 1950 }, { "epoch": 0.029132447364491564, "grad_norm": 0.484375, "grad_norm_var": 0.006311655044555664, "learning_rate": 2e-05, "loss": 1.1883, "loss/crossentropy": 2.3651411533355713, "loss/dist_ce": 0.0, "loss/fcd": 1.03125, "loss/idx": 13.0, "loss/logits": 0.15700486302375793, "step": 1951 }, { "epoch": 0.029147379423622517, "grad_norm": 0.46484375, "grad_norm_var": 0.006743478775024414, "learning_rate": 2e-05, "loss": 1.2257, "loss/crossentropy": 2.7126286029815674, "loss/dist_ce": 0.0, "loss/fcd": 1.0625, "loss/idx": 13.0, "loss/logits": 0.16317632794380188, "step": 1952 }, { "epoch": 0.029162311482753472, "grad_norm": 0.51953125, "grad_norm_var": 0.006778446833292643, "learning_rate": 2e-05, "loss": 1.2039, "loss/crossentropy": 2.6610074043273926, "loss/dist_ce": 0.0, "loss/fcd": 1.0390625, "loss/idx": 13.0, "loss/logits": 0.16480238735675812, "step": 1953 }, { "epoch": 0.029177243541884425, "grad_norm": 0.51953125, "grad_norm_var": 0.0067891279856363935, "learning_rate": 2e-05, "loss": 1.2575, "loss/crossentropy": 2.511665105819702, "loss/dist_ce": 0.0, "loss/fcd": 1.078125, "loss/idx": 13.0, "loss/logits": 0.17938058078289032, "step": 1954 }, { "epoch": 0.02919217560101538, "grad_norm": 0.51953125, "grad_norm_var": 0.006641070048014323, "learning_rate": 2e-05, "loss": 1.1791, "loss/crossentropy": 2.600038528442383, "loss/dist_ce": 0.0, "loss/fcd": 1.0234375, "loss/idx": 13.0, "loss/logits": 0.15568949282169342, "step": 1955 }, { "epoch": 0.029207107660146333, "grad_norm": 0.5703125, "grad_norm_var": 0.006540362040201823, "learning_rate": 2e-05, "loss": 1.2662, "loss/crossentropy": 2.5734775066375732, "loss/dist_ce": 0.0, "loss/fcd": 1.09375, "loss/idx": 13.0, "loss/logits": 0.17242102324962616, "step": 1956 }, { "epoch": 0.02922203971927729, "grad_norm": 0.5390625, "grad_norm_var": 0.006479326883951823, "learning_rate": 2e-05, "loss": 1.2526, "loss/crossentropy": 2.4726715087890625, "loss/dist_ce": 0.0, "loss/fcd": 1.078125, "loss/idx": 13.0, "loss/logits": 0.17450538277626038, "step": 1957 }, { "epoch": 0.02923697177840824, "grad_norm": 0.53515625, "grad_norm_var": 0.006453196207682292, "learning_rate": 2e-05, "loss": 1.2294, "loss/crossentropy": 2.485107898712158, "loss/dist_ce": 0.0, "loss/fcd": 1.0703125, "loss/idx": 13.0, "loss/logits": 0.1591225266456604, "step": 1958 }, { "epoch": 0.029251903837539197, "grad_norm": 0.453125, "grad_norm_var": 0.006901995340983073, "learning_rate": 2e-05, "loss": 1.1976, "loss/crossentropy": 2.5497329235076904, "loss/dist_ce": 0.0, "loss/fcd": 1.03125, "loss/idx": 13.0, "loss/logits": 0.16637209057807922, "step": 1959 }, { "epoch": 0.02926683589667015, "grad_norm": 0.486328125, "grad_norm_var": 0.0071015516916910805, "learning_rate": 2e-05, "loss": 1.1731, "loss/crossentropy": 2.803480625152588, "loss/dist_ce": 0.0, "loss/fcd": 1.015625, "loss/idx": 13.0, "loss/logits": 0.15749675035476685, "step": 1960 }, { "epoch": 0.029281767955801105, "grad_norm": 0.52734375, "grad_norm_var": 0.0070430596669514975, "learning_rate": 2e-05, "loss": 1.2392, "loss/crossentropy": 2.4313740730285645, "loss/dist_ce": 0.0, "loss/fcd": 1.0703125, "loss/idx": 13.0, "loss/logits": 0.16886743903160095, "step": 1961 }, { "epoch": 0.029296700014932057, "grad_norm": 0.46875, "grad_norm_var": 0.00165861447652181, "learning_rate": 2e-05, "loss": 1.1526, "loss/crossentropy": 2.5858538150787354, "loss/dist_ce": 0.0, "loss/fcd": 1.0, "loss/idx": 13.0, "loss/logits": 0.15264388918876648, "step": 1962 }, { "epoch": 0.029311632074063013, "grad_norm": 0.5703125, "grad_norm_var": 0.0013290246327718098, "learning_rate": 2e-05, "loss": 1.1973, "loss/crossentropy": 2.4390578269958496, "loss/dist_ce": 0.0, "loss/fcd": 1.03125, "loss/idx": 13.0, "loss/logits": 0.16606196761131287, "step": 1963 }, { "epoch": 0.02932656413319397, "grad_norm": 0.9921875, "grad_norm_var": 0.015507364273071289, "learning_rate": 2e-05, "loss": 1.5494, "loss/crossentropy": 2.5813605785369873, "loss/dist_ce": 0.0, "loss/fcd": 1.2578125, "loss/idx": 13.0, "loss/logits": 0.29163628816604614, "step": 1964 }, { "epoch": 0.02934149619232492, "grad_norm": 0.53125, "grad_norm_var": 0.015497700373331705, "learning_rate": 2e-05, "loss": 1.2957, "loss/crossentropy": 2.640507221221924, "loss/dist_ce": 0.0, "loss/fcd": 1.09375, "loss/idx": 13.0, "loss/logits": 0.20195528864860535, "step": 1965 }, { "epoch": 0.029356428251455877, "grad_norm": 0.498046875, "grad_norm_var": 0.015582529703776042, "learning_rate": 2e-05, "loss": 1.2077, "loss/crossentropy": 2.6269850730895996, "loss/dist_ce": 0.0, "loss/fcd": 1.0390625, "loss/idx": 13.0, "loss/logits": 0.16860657930374146, "step": 1966 }, { "epoch": 0.02937136031058683, "grad_norm": 0.53125, "grad_norm_var": 0.015356699625651041, "learning_rate": 2e-05, "loss": 1.1539, "loss/crossentropy": 2.636483669281006, "loss/dist_ce": 0.0, "loss/fcd": 1.0078125, "loss/idx": 13.0, "loss/logits": 0.14610238373279572, "step": 1967 }, { "epoch": 0.029386292369717786, "grad_norm": 0.5078125, "grad_norm_var": 0.015010515848795572, "learning_rate": 2e-05, "loss": 1.1888, "loss/crossentropy": 2.5731468200683594, "loss/dist_ce": 0.0, "loss/fcd": 1.03125, "loss/idx": 13.0, "loss/logits": 0.1575092077255249, "step": 1968 }, { "epoch": 0.029401224428848738, "grad_norm": 0.447265625, "grad_norm_var": 0.015612141291300455, "learning_rate": 2e-05, "loss": 1.1274, "loss/crossentropy": 2.7225778102874756, "loss/dist_ce": 0.0, "loss/fcd": 0.9765625, "loss/idx": 13.0, "loss/logits": 0.15085333585739136, "step": 1969 }, { "epoch": 0.029416156487979694, "grad_norm": 0.53515625, "grad_norm_var": 0.015577300389607748, "learning_rate": 2e-05, "loss": 1.3017, "loss/crossentropy": 2.6382668018341064, "loss/dist_ce": 0.0, "loss/fcd": 1.1171875, "loss/idx": 13.0, "loss/logits": 0.18452683091163635, "step": 1970 }, { "epoch": 0.029431088547110646, "grad_norm": 0.5078125, "grad_norm_var": 0.01562498410542806, "learning_rate": 2e-05, "loss": 1.2374, "loss/crossentropy": 2.362830638885498, "loss/dist_ce": 0.0, "loss/fcd": 1.0703125, "loss/idx": 13.0, "loss/logits": 0.16712644696235657, "step": 1971 }, { "epoch": 0.029446020606241602, "grad_norm": 0.490234375, "grad_norm_var": 0.015742937723795574, "learning_rate": 2e-05, "loss": 1.2171, "loss/crossentropy": 2.442208766937256, "loss/dist_ce": 0.0, "loss/fcd": 1.0546875, "loss/idx": 13.0, "loss/logits": 0.16240856051445007, "step": 1972 }, { "epoch": 0.029460952665372554, "grad_norm": 0.53125, "grad_norm_var": 0.015746498107910158, "learning_rate": 2e-05, "loss": 1.2984, "loss/crossentropy": 2.450517416000366, "loss/dist_ce": 0.0, "loss/fcd": 1.109375, "loss/idx": 13.0, "loss/logits": 0.1889958381652832, "step": 1973 }, { "epoch": 0.02947588472450351, "grad_norm": 0.58203125, "grad_norm_var": 0.015863990783691405, "learning_rate": 2e-05, "loss": 1.2289, "loss/crossentropy": 2.8067851066589355, "loss/dist_ce": 0.0, "loss/fcd": 1.0625, "loss/idx": 13.0, "loss/logits": 0.16644850373268127, "step": 1974 }, { "epoch": 0.029490816783634462, "grad_norm": 1.171875, "grad_norm_var": 0.03970534006754557, "learning_rate": 2e-05, "loss": 1.3591, "loss/crossentropy": 2.7080118656158447, "loss/dist_ce": 0.0, "loss/fcd": 1.125, "loss/idx": 13.0, "loss/logits": 0.23414579033851624, "step": 1975 }, { "epoch": 0.029505748842765418, "grad_norm": 0.6015625, "grad_norm_var": 0.0390010674794515, "learning_rate": 2e-05, "loss": 1.4244, "loss/crossentropy": 2.4435675144195557, "loss/dist_ce": 0.0, "loss/fcd": 1.21875, "loss/idx": 13.0, "loss/logits": 0.20565560460090637, "step": 1976 }, { "epoch": 0.02952068090189637, "grad_norm": 0.55859375, "grad_norm_var": 0.03878693580627442, "learning_rate": 2e-05, "loss": 1.3483, "loss/crossentropy": 2.673099994659424, "loss/dist_ce": 0.0, "loss/fcd": 1.15625, "loss/idx": 13.0, "loss/logits": 0.1920124590396881, "step": 1977 }, { "epoch": 0.029535612961027326, "grad_norm": 0.54296875, "grad_norm_var": 0.037878529230753584, "learning_rate": 2e-05, "loss": 1.2699, "loss/crossentropy": 2.5343728065490723, "loss/dist_ce": 0.0, "loss/fcd": 1.09375, "loss/idx": 13.0, "loss/logits": 0.17618075013160706, "step": 1978 }, { "epoch": 0.02955054502015828, "grad_norm": 0.5390625, "grad_norm_var": 0.03806316057840983, "learning_rate": 2e-05, "loss": 1.2738, "loss/crossentropy": 2.402545690536499, "loss/dist_ce": 0.0, "loss/fcd": 1.1015625, "loss/idx": 13.0, "loss/logits": 0.1722050905227661, "step": 1979 }, { "epoch": 0.029565477079289235, "grad_norm": 0.578125, "grad_norm_var": 0.0270174503326416, "learning_rate": 2e-05, "loss": 1.2149, "loss/crossentropy": 2.5633673667907715, "loss/dist_ce": 0.0, "loss/fcd": 1.046875, "loss/idx": 13.0, "loss/logits": 0.16800308227539062, "step": 1980 }, { "epoch": 0.029580409138420187, "grad_norm": 0.609375, "grad_norm_var": 0.02697294553120931, "learning_rate": 2e-05, "loss": 1.4391, "loss/crossentropy": 2.921246290206909, "loss/dist_ce": 0.0, "loss/fcd": 1.1875, "loss/idx": 13.0, "loss/logits": 0.251582533121109, "step": 1981 }, { "epoch": 0.029595341197551143, "grad_norm": 0.462890625, "grad_norm_var": 0.027420409520467124, "learning_rate": 2e-05, "loss": 1.1457, "loss/crossentropy": 2.622143507003784, "loss/dist_ce": 0.0, "loss/fcd": 0.99609375, "loss/idx": 13.0, "loss/logits": 0.14962665736675262, "step": 1982 }, { "epoch": 0.029610273256682095, "grad_norm": 0.494140625, "grad_norm_var": 0.02772210439046224, "learning_rate": 2e-05, "loss": 1.2095, "loss/crossentropy": 2.3120386600494385, "loss/dist_ce": 0.0, "loss/fcd": 1.046875, "loss/idx": 13.0, "loss/logits": 0.16262733936309814, "step": 1983 }, { "epoch": 0.02962520531581305, "grad_norm": 0.5546875, "grad_norm_var": 0.02745507558186849, "learning_rate": 2e-05, "loss": 1.2747, "loss/crossentropy": 2.4596571922302246, "loss/dist_ce": 0.0, "loss/fcd": 1.09375, "loss/idx": 13.0, "loss/logits": 0.18092194199562073, "step": 1984 }, { "epoch": 0.029640137374944003, "grad_norm": 0.83984375, "grad_norm_var": 0.030378325780232748, "learning_rate": 2e-05, "loss": 1.2322, "loss/crossentropy": 2.4688382148742676, "loss/dist_ce": 0.0, "loss/fcd": 1.0703125, "loss/idx": 13.0, "loss/logits": 0.16190896928310394, "step": 1985 }, { "epoch": 0.02965506943407496, "grad_norm": 0.5625, "grad_norm_var": 0.030188735326131186, "learning_rate": 2e-05, "loss": 1.4595, "loss/crossentropy": 2.3537650108337402, "loss/dist_ce": 0.0, "loss/fcd": 1.2265625, "loss/idx": 13.0, "loss/logits": 0.23292958736419678, "step": 1986 }, { "epoch": 0.02967000149320591, "grad_norm": 0.515625, "grad_norm_var": 0.03009476661682129, "learning_rate": 2e-05, "loss": 1.2017, "loss/crossentropy": 2.5475752353668213, "loss/dist_ce": 0.0, "loss/fcd": 1.0390625, "loss/idx": 13.0, "loss/logits": 0.1625993549823761, "step": 1987 }, { "epoch": 0.029684933552336867, "grad_norm": 0.498046875, "grad_norm_var": 0.02998197873433431, "learning_rate": 2e-05, "loss": 1.1774, "loss/crossentropy": 2.5423738956451416, "loss/dist_ce": 0.0, "loss/fcd": 1.0234375, "loss/idx": 13.0, "loss/logits": 0.15400215983390808, "step": 1988 }, { "epoch": 0.02969986561146782, "grad_norm": 0.5859375, "grad_norm_var": 0.029648192723592124, "learning_rate": 2e-05, "loss": 1.3544, "loss/crossentropy": 2.679858446121216, "loss/dist_ce": 0.0, "loss/fcd": 1.1640625, "loss/idx": 13.0, "loss/logits": 0.1902952790260315, "step": 1989 }, { "epoch": 0.029714797670598776, "grad_norm": 0.462890625, "grad_norm_var": 0.0309173583984375, "learning_rate": 2e-05, "loss": 1.1498, "loss/crossentropy": 2.4342339038848877, "loss/dist_ce": 0.0, "loss/fcd": 0.99609375, "loss/idx": 13.0, "loss/logits": 0.1536850929260254, "step": 1990 }, { "epoch": 0.02972972972972973, "grad_norm": 0.59375, "grad_norm_var": 0.007619222005208333, "learning_rate": 2e-05, "loss": 1.3155, "loss/crossentropy": 2.6562082767486572, "loss/dist_ce": 0.0, "loss/fcd": 1.125, "loss/idx": 13.0, "loss/logits": 0.19045662879943848, "step": 1991 }, { "epoch": 0.029744661788860684, "grad_norm": 0.515625, "grad_norm_var": 0.007633209228515625, "learning_rate": 2e-05, "loss": 1.2124, "loss/crossentropy": 2.6417014598846436, "loss/dist_ce": 0.0, "loss/fcd": 1.046875, "loss/idx": 13.0, "loss/logits": 0.16551363468170166, "step": 1992 }, { "epoch": 0.02975959384799164, "grad_norm": 0.47265625, "grad_norm_var": 0.0080780029296875, "learning_rate": 2e-05, "loss": 1.2153, "loss/crossentropy": 2.5185062885284424, "loss/dist_ce": 0.0, "loss/fcd": 1.0390625, "loss/idx": 13.0, "loss/logits": 0.176223486661911, "step": 1993 }, { "epoch": 0.029774525907122592, "grad_norm": 0.65234375, "grad_norm_var": 0.008697509765625, "learning_rate": 2e-05, "loss": 1.2088, "loss/crossentropy": 2.354360818862915, "loss/dist_ce": 0.0, "loss/fcd": 1.03125, "loss/idx": 13.0, "loss/logits": 0.17753687500953674, "step": 1994 }, { "epoch": 0.029789457966253548, "grad_norm": 0.55078125, "grad_norm_var": 0.008675575256347656, "learning_rate": 2e-05, "loss": 1.29, "loss/crossentropy": 2.5912888050079346, "loss/dist_ce": 0.0, "loss/fcd": 1.1171875, "loss/idx": 13.0, "loss/logits": 0.17279267311096191, "step": 1995 }, { "epoch": 0.0298043900253845, "grad_norm": 0.53125, "grad_norm_var": 0.008695411682128906, "learning_rate": 2e-05, "loss": 1.2898, "loss/crossentropy": 2.7931675910949707, "loss/dist_ce": 0.0, "loss/fcd": 1.09375, "loss/idx": 13.0, "loss/logits": 0.1960277259349823, "step": 1996 }, { "epoch": 0.029819322084515456, "grad_norm": 0.5234375, "grad_norm_var": 0.008549944559733073, "learning_rate": 2e-05, "loss": 1.2553, "loss/crossentropy": 2.6213860511779785, "loss/dist_ce": 0.0, "loss/fcd": 1.0625, "loss/idx": 13.0, "loss/logits": 0.19277352094650269, "step": 1997 }, { "epoch": 0.02983425414364641, "grad_norm": 0.60546875, "grad_norm_var": 0.008144998550415039, "learning_rate": 2e-05, "loss": 1.3088, "loss/crossentropy": 2.811062812805176, "loss/dist_ce": 0.0, "loss/fcd": 1.109375, "loss/idx": 13.0, "loss/logits": 0.19940659403800964, "step": 1998 }, { "epoch": 0.029849186202777364, "grad_norm": 0.470703125, "grad_norm_var": 0.008384943008422852, "learning_rate": 2e-05, "loss": 1.1252, "loss/crossentropy": 2.78082013130188, "loss/dist_ce": 0.0, "loss/fcd": 0.98046875, "loss/idx": 13.0, "loss/logits": 0.1447007954120636, "step": 1999 }, { "epoch": 0.029864118261908316, "grad_norm": 0.462890625, "grad_norm_var": 0.008957926432291667, "learning_rate": 2e-05, "loss": 1.1896, "loss/crossentropy": 2.4754817485809326, "loss/dist_ce": 0.0, "loss/fcd": 1.0390625, "loss/idx": 13.0, "loss/logits": 0.1505032777786255, "step": 2000 } ], "logging_steps": 1, "max_steps": 100000, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 2000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": true, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.03506581880832e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }