diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,26018 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.029864118261908316, + "eval_steps": 2000, + "global_step": 2000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 1.4932059130954158e-05, + "grad_norm": 0.8359375, + "learning_rate": 2e-05, + "loss": 1.3962, + "loss/crossentropy": 2.609541177749634, + "loss/dist_ce": 0.0, + "loss/fcd": 1.203125, + "loss/idx": 12.0, + "loss/logits": 0.19302886724472046, + "step": 1 + }, + { + "epoch": 2.9864118261908317e-05, + "grad_norm": 0.80859375, + "learning_rate": 2e-05, + "loss": 1.2844, + "loss/crossentropy": 2.702785015106201, + "loss/dist_ce": 0.0, + "loss/fcd": 1.109375, + "loss/idx": 12.0, + "loss/logits": 0.1750669926404953, + "step": 2 + }, + { + "epoch": 4.4796177392862473e-05, + "grad_norm": 0.74609375, + "learning_rate": 2e-05, + "loss": 1.2457, + "loss/crossentropy": 2.620382308959961, + "loss/dist_ce": 0.0, + "loss/fcd": 1.078125, + "loss/idx": 12.0, + "loss/logits": 0.16754117608070374, + "step": 3 + }, + { + "epoch": 5.9728236523816634e-05, + "grad_norm": 0.7265625, + "learning_rate": 2e-05, + "loss": 1.3366, + "loss/crossentropy": 2.566118001937866, + "loss/dist_ce": 0.0, + "loss/fcd": 1.15625, + "loss/idx": 12.0, + "loss/logits": 0.18033114075660706, + "step": 4 + }, + { + "epoch": 7.466029565477079e-05, + "grad_norm": 0.6953125, + "learning_rate": 2e-05, + "loss": 1.292, + "loss/crossentropy": 2.561917304992676, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1171875, + "loss/idx": 12.0, + "loss/logits": 0.17484982311725616, + "step": 5 + }, + { + "epoch": 8.959235478572495e-05, + "grad_norm": 0.734375, + "learning_rate": 2e-05, + "loss": 1.315, + "loss/crossentropy": 2.606935977935791, + "loss/dist_ce": 0.0, + "loss/fcd": 1.140625, + "loss/idx": 12.0, + "loss/logits": 0.17437440156936646, + "step": 6 + }, + { + "epoch": 0.00010452441391667911, + "grad_norm": 0.74609375, + "learning_rate": 2e-05, + "loss": 1.4291, + "loss/crossentropy": 2.555368661880493, + "loss/dist_ce": 0.0, + "loss/fcd": 1.234375, + "loss/idx": 12.0, + "loss/logits": 0.19471214711666107, + "step": 7 + }, + { + "epoch": 0.00011945647304763327, + "grad_norm": 0.78515625, + "learning_rate": 2e-05, + "loss": 1.5255, + "loss/crossentropy": 2.5476245880126953, + "loss/dist_ce": 0.0, + "loss/fcd": 1.28125, + "loss/idx": 12.0, + "loss/logits": 0.24422992765903473, + "step": 8 + }, + { + "epoch": 0.00013438853217858743, + "grad_norm": 0.69140625, + "learning_rate": 2e-05, + "loss": 1.3099, + "loss/crossentropy": 2.2711799144744873, + "loss/dist_ce": 0.0, + "loss/fcd": 1.140625, + "loss/idx": 12.0, + "loss/logits": 0.16932150721549988, + "step": 9 + }, + { + "epoch": 0.00014932059130954157, + "grad_norm": 0.84375, + "learning_rate": 2e-05, + "loss": 1.3835, + "loss/crossentropy": 2.5518879890441895, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1875, + "loss/idx": 12.0, + "loss/logits": 0.19604898989200592, + "step": 10 + }, + { + "epoch": 0.00016425265044049575, + "grad_norm": 0.828125, + "learning_rate": 2e-05, + "loss": 1.4655, + "loss/crossentropy": 2.5716614723205566, + "loss/dist_ce": 0.0, + "loss/fcd": 1.2421875, + "loss/idx": 12.0, + "loss/logits": 0.2233092039823532, + "step": 11 + }, + { + "epoch": 0.0001791847095714499, + "grad_norm": 0.71484375, + "learning_rate": 2e-05, + "loss": 1.3127, + "loss/crossentropy": 2.6517248153686523, + "loss/dist_ce": 0.0, + "loss/fcd": 1.140625, + "loss/idx": 12.0, + "loss/logits": 0.17209960520267487, + "step": 12 + }, + { + "epoch": 0.00019411676870240407, + "grad_norm": 0.7109375, + "learning_rate": 2e-05, + "loss": 1.403, + "loss/crossentropy": 2.4482481479644775, + "loss/dist_ce": 0.0, + "loss/fcd": 1.2265625, + "loss/idx": 12.0, + "loss/logits": 0.17639976739883423, + "step": 13 + }, + { + "epoch": 0.00020904882783335821, + "grad_norm": 0.73046875, + "learning_rate": 2e-05, + "loss": 1.2437, + "loss/crossentropy": 2.5636048316955566, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0859375, + "loss/idx": 12.0, + "loss/logits": 0.1577576994895935, + "step": 14 + }, + { + "epoch": 0.0002239808869643124, + "grad_norm": 0.59375, + "learning_rate": 2e-05, + "loss": 1.2923, + "loss/crossentropy": 2.559340238571167, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1171875, + "loss/idx": 12.0, + "loss/logits": 0.17509810626506805, + "step": 15 + }, + { + "epoch": 0.00023891294609526653, + "grad_norm": 0.6171875, + "grad_norm_var": 0.005140113830566406, + "learning_rate": 2e-05, + "loss": 1.2483, + "loss/crossentropy": 2.5928773880004883, + "loss/dist_ce": 0.0, + "loss/fcd": 1.078125, + "loss/idx": 12.0, + "loss/logits": 0.17019006609916687, + "step": 16 + }, + { + "epoch": 0.0002538450052262207, + "grad_norm": 0.58984375, + "grad_norm_var": 0.005712890625, + "learning_rate": 2e-05, + "loss": 1.3097, + "loss/crossentropy": 2.7123403549194336, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1328125, + "loss/idx": 12.0, + "loss/logits": 0.176863431930542, + "step": 17 + }, + { + "epoch": 0.00026877706435717485, + "grad_norm": 0.58984375, + "grad_norm_var": 0.006197102864583333, + "learning_rate": 2e-05, + "loss": 1.3289, + "loss/crossentropy": 2.7460012435913086, + "loss/dist_ce": 0.0, + "loss/fcd": 1.140625, + "loss/idx": 12.0, + "loss/logits": 0.18828681111335754, + "step": 18 + }, + { + "epoch": 0.000283709123488129, + "grad_norm": 1.28125, + "grad_norm_var": 0.026744524637858074, + "learning_rate": 2e-05, + "loss": 1.6745, + "loss/crossentropy": 2.6814517974853516, + "loss/dist_ce": 0.0, + "loss/fcd": 1.4140625, + "loss/idx": 12.0, + "loss/logits": 0.2604835033416748, + "step": 19 + }, + { + "epoch": 0.00029864118261908315, + "grad_norm": 0.58984375, + "grad_norm_var": 0.028202056884765625, + "learning_rate": 2e-05, + "loss": 1.2942, + "loss/crossentropy": 2.8028454780578613, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1171875, + "loss/idx": 12.0, + "loss/logits": 0.1770225465297699, + "step": 20 + }, + { + "epoch": 0.00031357324175003735, + "grad_norm": 0.5859375, + "grad_norm_var": 0.02951227823893229, + "learning_rate": 2e-05, + "loss": 1.3311, + "loss/crossentropy": 2.590350866317749, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1328125, + "loss/idx": 12.0, + "loss/logits": 0.19830407202243805, + "step": 21 + }, + { + "epoch": 0.0003285053008809915, + "grad_norm": 0.6953125, + "grad_norm_var": 0.029569498697916665, + "learning_rate": 2e-05, + "loss": 1.4609, + "loss/crossentropy": 2.5552988052368164, + "loss/dist_ce": 0.0, + "loss/fcd": 1.2421875, + "loss/idx": 12.0, + "loss/logits": 0.21873216331005096, + "step": 22 + }, + { + "epoch": 0.00034343736001194564, + "grad_norm": 0.7265625, + "grad_norm_var": 0.029537391662597657, + "learning_rate": 2e-05, + "loss": 1.4856, + "loss/crossentropy": 2.727858066558838, + "loss/dist_ce": 0.0, + "loss/fcd": 1.265625, + "loss/idx": 12.0, + "loss/logits": 0.21994858980178833, + "step": 23 + }, + { + "epoch": 0.0003583694191428998, + "grad_norm": 0.6484375, + "grad_norm_var": 0.029579671223958333, + "learning_rate": 2e-05, + "loss": 1.371, + "loss/crossentropy": 2.3856289386749268, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1875, + "loss/idx": 12.0, + "loss/logits": 0.18346479535102844, + "step": 24 + }, + { + "epoch": 0.000373301478273854, + "grad_norm": 0.59375, + "grad_norm_var": 0.03048089345296224, + "learning_rate": 2e-05, + "loss": 1.282, + "loss/crossentropy": 2.530938148498535, + "loss/dist_ce": 0.0, + "loss/fcd": 1.125, + "loss/idx": 12.0, + "loss/logits": 0.15697604417800903, + "step": 25 + }, + { + "epoch": 0.00038823353740480814, + "grad_norm": 0.640625, + "grad_norm_var": 0.029403114318847658, + "learning_rate": 2e-05, + "loss": 1.4404, + "loss/crossentropy": 2.4393346309661865, + "loss/dist_ce": 0.0, + "loss/fcd": 1.2421875, + "loss/idx": 12.0, + "loss/logits": 0.19825200736522675, + "step": 26 + }, + { + "epoch": 0.0004031655965357623, + "grad_norm": 0.60546875, + "grad_norm_var": 0.028580474853515624, + "learning_rate": 2e-05, + "loss": 1.3121, + "loss/crossentropy": 2.767091989517212, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1328125, + "loss/idx": 12.0, + "loss/logits": 0.1792472004890442, + "step": 27 + }, + { + "epoch": 0.00041809765566671643, + "grad_norm": 0.671875, + "grad_norm_var": 0.028508440653483073, + "learning_rate": 2e-05, + "loss": 1.428, + "loss/crossentropy": 2.497144937515259, + "loss/dist_ce": 0.0, + "loss/fcd": 1.2109375, + "loss/idx": 12.0, + "loss/logits": 0.21709555387496948, + "step": 28 + }, + { + "epoch": 0.0004330297147976706, + "grad_norm": 0.6015625, + "grad_norm_var": 0.028796831766764324, + "learning_rate": 2e-05, + "loss": 1.2806, + "loss/crossentropy": 2.503953218460083, + "loss/dist_ce": 0.0, + "loss/fcd": 1.09375, + "loss/idx": 12.0, + "loss/logits": 0.18687917292118073, + "step": 29 + }, + { + "epoch": 0.0004479617739286248, + "grad_norm": 0.6875, + "grad_norm_var": 0.028580729166666666, + "learning_rate": 2e-05, + "loss": 1.3877, + "loss/crossentropy": 2.498080015182495, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1875, + "loss/idx": 12.0, + "loss/logits": 0.20022732019424438, + "step": 30 + }, + { + "epoch": 0.0004628938330595789, + "grad_norm": 0.66015625, + "grad_norm_var": 0.02818190256754557, + "learning_rate": 2e-05, + "loss": 1.3267, + "loss/crossentropy": 2.71108341217041, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1484375, + "loss/idx": 12.0, + "loss/logits": 0.1782739758491516, + "step": 31 + }, + { + "epoch": 0.00047782589219053307, + "grad_norm": 1.390625, + "grad_norm_var": 0.05970350901285807, + "learning_rate": 2e-05, + "loss": 1.7218, + "loss/crossentropy": 2.422400951385498, + "loss/dist_ce": 0.0, + "loss/fcd": 1.46875, + "loss/idx": 12.0, + "loss/logits": 0.2530236542224884, + "step": 32 + }, + { + "epoch": 0.0004927579513214872, + "grad_norm": 0.67578125, + "grad_norm_var": 0.058646074930826825, + "learning_rate": 2e-05, + "loss": 1.4255, + "loss/crossentropy": 2.750814437866211, + "loss/dist_ce": 0.0, + "loss/fcd": 1.21875, + "loss/idx": 12.0, + "loss/logits": 0.20671629905700684, + "step": 33 + }, + { + "epoch": 0.0005076900104524414, + "grad_norm": 1.1015625, + "grad_norm_var": 0.0656005859375, + "learning_rate": 2e-05, + "loss": 1.3081, + "loss/crossentropy": 2.8478612899780273, + "loss/dist_ce": 0.0, + "loss/fcd": 1.15625, + "loss/idx": 12.0, + "loss/logits": 0.15186628699302673, + "step": 34 + }, + { + "epoch": 0.0005226220695833955, + "grad_norm": 0.69140625, + "grad_norm_var": 0.04633274078369141, + "learning_rate": 2e-05, + "loss": 1.3061, + "loss/crossentropy": 2.4926323890686035, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1328125, + "loss/idx": 12.0, + "loss/logits": 0.17327454686164856, + "step": 35 + }, + { + "epoch": 0.0005375541287143497, + "grad_norm": 0.8125, + "grad_norm_var": 0.045481109619140626, + "learning_rate": 2e-05, + "loss": 1.3796, + "loss/crossentropy": 2.6361165046691895, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1875, + "loss/idx": 12.0, + "loss/logits": 0.1921408474445343, + "step": 36 + }, + { + "epoch": 0.0005524861878453039, + "grad_norm": 0.67578125, + "grad_norm_var": 0.044178199768066403, + "learning_rate": 2e-05, + "loss": 1.3217, + "loss/crossentropy": 2.5386064052581787, + "loss/dist_ce": 0.0, + "loss/fcd": 1.140625, + "loss/idx": 12.0, + "loss/logits": 0.18107610940933228, + "step": 37 + }, + { + "epoch": 0.000567418246976258, + "grad_norm": 0.5859375, + "grad_norm_var": 0.045613034566243486, + "learning_rate": 2e-05, + "loss": 1.3732, + "loss/crossentropy": 2.542595863342285, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1796875, + "loss/idx": 12.0, + "loss/logits": 0.19354979693889618, + "step": 38 + }, + { + "epoch": 0.0005823503061072122, + "grad_norm": 0.57421875, + "grad_norm_var": 0.047247060139973956, + "learning_rate": 2e-05, + "loss": 1.2929, + "loss/crossentropy": 2.339637279510498, + "loss/dist_ce": 0.0, + "loss/fcd": 1.109375, + "loss/idx": 12.0, + "loss/logits": 0.18351908028125763, + "step": 39 + }, + { + "epoch": 0.0005972823652381663, + "grad_norm": 0.5546875, + "grad_norm_var": 0.048766835530598955, + "learning_rate": 2e-05, + "loss": 1.2777, + "loss/crossentropy": 2.626256227493286, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1171875, + "loss/idx": 12.0, + "loss/logits": 0.16047844290733337, + "step": 40 + }, + { + "epoch": 0.0006122144243691205, + "grad_norm": 0.6015625, + "grad_norm_var": 0.048638916015625, + "learning_rate": 2e-05, + "loss": 1.2725, + "loss/crossentropy": 2.5306503772735596, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1015625, + "loss/idx": 12.0, + "loss/logits": 0.17098692059516907, + "step": 41 + }, + { + "epoch": 0.0006271464835000747, + "grad_norm": 0.6328125, + "grad_norm_var": 0.04872614542643229, + "learning_rate": 2e-05, + "loss": 1.4129, + "loss/crossentropy": 2.4811081886291504, + "loss/dist_ce": 0.0, + "loss/fcd": 1.2109375, + "loss/idx": 12.0, + "loss/logits": 0.20194479823112488, + "step": 42 + }, + { + "epoch": 0.0006420785426310288, + "grad_norm": 0.58203125, + "grad_norm_var": 0.049119059244791666, + "learning_rate": 2e-05, + "loss": 1.274, + "loss/crossentropy": 2.4502978324890137, + "loss/dist_ce": 0.0, + "loss/fcd": 1.109375, + "loss/idx": 12.0, + "loss/logits": 0.16458770632743835, + "step": 43 + }, + { + "epoch": 0.000657010601761983, + "grad_norm": 0.69140625, + "grad_norm_var": 0.04902083079020182, + "learning_rate": 2e-05, + "loss": 1.5054, + "loss/crossentropy": 2.416804075241089, + "loss/dist_ce": 0.0, + "loss/fcd": 1.28125, + "loss/idx": 12.0, + "loss/logits": 0.22411639988422394, + "step": 44 + }, + { + "epoch": 0.0006719426608929372, + "grad_norm": 0.79296875, + "grad_norm_var": 0.048288726806640626, + "learning_rate": 2e-05, + "loss": 1.4078, + "loss/crossentropy": 2.742251396179199, + "loss/dist_ce": 0.0, + "loss/fcd": 1.21875, + "loss/idx": 12.0, + "loss/logits": 0.18909800052642822, + "step": 45 + }, + { + "epoch": 0.0006868747200238913, + "grad_norm": 0.70703125, + "grad_norm_var": 0.04819685618082682, + "learning_rate": 2e-05, + "loss": 1.4589, + "loss/crossentropy": 2.6130266189575195, + "loss/dist_ce": 0.0, + "loss/fcd": 1.2265625, + "loss/idx": 12.0, + "loss/logits": 0.232346311211586, + "step": 46 + }, + { + "epoch": 0.0007018067791548455, + "grad_norm": 0.69140625, + "grad_norm_var": 0.04795373280843099, + "learning_rate": 2e-05, + "loss": 1.4667, + "loss/crossentropy": 2.4245617389678955, + "loss/dist_ce": 0.0, + "loss/fcd": 1.25, + "loss/idx": 12.0, + "loss/logits": 0.21665045619010925, + "step": 47 + }, + { + "epoch": 0.0007167388382857996, + "grad_norm": 0.56640625, + "grad_norm_var": 0.018373616536458335, + "learning_rate": 2e-05, + "loss": 1.3413, + "loss/crossentropy": 2.6861648559570312, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1484375, + "loss/idx": 12.0, + "loss/logits": 0.19289466738700867, + "step": 48 + }, + { + "epoch": 0.0007316708974167538, + "grad_norm": 0.54296875, + "grad_norm_var": 0.019614410400390626, + "learning_rate": 2e-05, + "loss": 1.2877, + "loss/crossentropy": 2.713362455368042, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1171875, + "loss/idx": 12.0, + "loss/logits": 0.17047566175460815, + "step": 49 + }, + { + "epoch": 0.000746602956547708, + "grad_norm": 0.5234375, + "grad_norm_var": 0.007645416259765625, + "learning_rate": 2e-05, + "loss": 1.2322, + "loss/crossentropy": 2.661055564880371, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0703125, + "loss/idx": 12.0, + "loss/logits": 0.1618708372116089, + "step": 50 + }, + { + "epoch": 0.0007615350156786621, + "grad_norm": 0.62890625, + "grad_norm_var": 0.0074541727701822914, + "learning_rate": 2e-05, + "loss": 1.3611, + "loss/crossentropy": 2.7492740154266357, + "loss/dist_ce": 0.0, + "loss/fcd": 1.171875, + "loss/idx": 12.0, + "loss/logits": 0.1892453134059906, + "step": 51 + }, + { + "epoch": 0.0007764670748096163, + "grad_norm": 0.640625, + "grad_norm_var": 0.005238596598307292, + "learning_rate": 2e-05, + "loss": 1.4406, + "loss/crossentropy": 2.341090679168701, + "loss/dist_ce": 0.0, + "loss/fcd": 1.25, + "loss/idx": 12.0, + "loss/logits": 0.19059142470359802, + "step": 52 + }, + { + "epoch": 0.0007913991339405704, + "grad_norm": 0.5703125, + "grad_norm_var": 0.005212847391764323, + "learning_rate": 2e-05, + "loss": 1.2364, + "loss/crossentropy": 2.77945613861084, + "loss/dist_ce": 0.0, + "loss/fcd": 1.078125, + "loss/idx": 12.0, + "loss/logits": 0.15829287469387054, + "step": 53 + }, + { + "epoch": 0.0008063311930715246, + "grad_norm": 0.6328125, + "grad_norm_var": 0.0051502863566080725, + "learning_rate": 2e-05, + "loss": 1.2944, + "loss/crossentropy": 2.8645284175872803, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1171875, + "loss/idx": 12.0, + "loss/logits": 0.1772599220275879, + "step": 54 + }, + { + "epoch": 0.0008212632522024788, + "grad_norm": 0.546875, + "grad_norm_var": 0.0053670247395833336, + "learning_rate": 2e-05, + "loss": 1.3286, + "loss/crossentropy": 2.2789793014526367, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1328125, + "loss/idx": 12.0, + "loss/logits": 0.1957651972770691, + "step": 55 + }, + { + "epoch": 0.0008361953113334329, + "grad_norm": 0.61328125, + "grad_norm_var": 0.00507806142171224, + "learning_rate": 2e-05, + "loss": 1.4199, + "loss/crossentropy": 2.734726667404175, + "loss/dist_ce": 0.0, + "loss/fcd": 1.203125, + "loss/idx": 12.0, + "loss/logits": 0.21681487560272217, + "step": 56 + }, + { + "epoch": 0.0008511273704643871, + "grad_norm": 0.5859375, + "grad_norm_var": 0.0051375706990559895, + "learning_rate": 2e-05, + "loss": 1.2676, + "loss/crossentropy": 2.6685731410980225, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1015625, + "loss/idx": 12.0, + "loss/logits": 0.16598857939243317, + "step": 57 + }, + { + "epoch": 0.0008660594295953411, + "grad_norm": 0.62109375, + "grad_norm_var": 0.005128987630208333, + "learning_rate": 2e-05, + "loss": 1.2632, + "loss/crossentropy": 2.5432820320129395, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1015625, + "loss/idx": 12.0, + "loss/logits": 0.16164365410804749, + "step": 58 + }, + { + "epoch": 0.0008809914887262953, + "grad_norm": 0.671875, + "grad_norm_var": 0.0051655451456705725, + "learning_rate": 2e-05, + "loss": 1.3446, + "loss/crossentropy": 2.400588035583496, + "loss/dist_ce": 0.0, + "loss/fcd": 1.171875, + "loss/idx": 12.0, + "loss/logits": 0.17270785570144653, + "step": 59 + }, + { + "epoch": 0.0008959235478572496, + "grad_norm": 0.57421875, + "grad_norm_var": 0.0050129572550455725, + "learning_rate": 2e-05, + "loss": 1.2711, + "loss/crossentropy": 2.9404215812683105, + "loss/dist_ce": 0.0, + "loss/fcd": 1.109375, + "loss/idx": 12.0, + "loss/logits": 0.16168653964996338, + "step": 60 + }, + { + "epoch": 0.0009108556069882036, + "grad_norm": 0.640625, + "grad_norm_var": 0.002937571207682292, + "learning_rate": 2e-05, + "loss": 1.343, + "loss/crossentropy": 2.4893975257873535, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1640625, + "loss/idx": 12.0, + "loss/logits": 0.17896610498428345, + "step": 61 + }, + { + "epoch": 0.0009257876661191578, + "grad_norm": 0.5625, + "grad_norm_var": 0.0023706436157226564, + "learning_rate": 2e-05, + "loss": 1.286, + "loss/crossentropy": 2.544616460800171, + "loss/dist_ce": 0.0, + "loss/fcd": 1.109375, + "loss/idx": 12.0, + "loss/logits": 0.17664407193660736, + "step": 62 + }, + { + "epoch": 0.000940719725250112, + "grad_norm": 0.62890625, + "grad_norm_var": 0.0018599828084309895, + "learning_rate": 2e-05, + "loss": 1.3906, + "loss/crossentropy": 2.5881662368774414, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1953125, + "loss/idx": 12.0, + "loss/logits": 0.19525527954101562, + "step": 63 + }, + { + "epoch": 0.0009556517843810661, + "grad_norm": 0.578125, + "grad_norm_var": 0.0018208821614583333, + "learning_rate": 2e-05, + "loss": 1.1933, + "loss/crossentropy": 2.7437222003936768, + "loss/dist_ce": 0.0, + "loss/fcd": 1.046875, + "loss/idx": 12.0, + "loss/logits": 0.14645323157310486, + "step": 64 + }, + { + "epoch": 0.0009705838435120203, + "grad_norm": 0.6328125, + "grad_norm_var": 0.0016702651977539063, + "learning_rate": 2e-05, + "loss": 1.4415, + "loss/crossentropy": 2.6524834632873535, + "loss/dist_ce": 0.0, + "loss/fcd": 1.2265625, + "loss/idx": 12.0, + "loss/logits": 0.21492895483970642, + "step": 65 + }, + { + "epoch": 0.0009855159026429744, + "grad_norm": 0.59765625, + "grad_norm_var": 0.001224517822265625, + "learning_rate": 2e-05, + "loss": 1.3729, + "loss/crossentropy": 2.853750705718994, + "loss/dist_ce": 0.0, + "loss/fcd": 1.171875, + "loss/idx": 12.0, + "loss/logits": 0.2009965479373932, + "step": 66 + }, + { + "epoch": 0.0010004479617739285, + "grad_norm": 0.55078125, + "grad_norm_var": 0.0013872782389322917, + "learning_rate": 2e-05, + "loss": 1.205, + "loss/crossentropy": 2.3778915405273438, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0625, + "loss/idx": 12.0, + "loss/logits": 0.1425057351589203, + "step": 67 + }, + { + "epoch": 0.0010153800209048828, + "grad_norm": 0.640625, + "grad_norm_var": 0.0013872782389322917, + "learning_rate": 2e-05, + "loss": 1.4857, + "loss/crossentropy": 2.0992250442504883, + "loss/dist_ce": 0.0, + "loss/fcd": 1.265625, + "loss/idx": 12.0, + "loss/logits": 0.22008036077022552, + "step": 68 + }, + { + "epoch": 0.001030312080035837, + "grad_norm": 0.57421875, + "grad_norm_var": 0.0013711929321289062, + "learning_rate": 2e-05, + "loss": 1.3042, + "loss/crossentropy": 2.557607650756836, + "loss/dist_ce": 0.0, + "loss/fcd": 1.125, + "loss/idx": 12.0, + "loss/logits": 0.17921671271324158, + "step": 69 + }, + { + "epoch": 0.001045244139166791, + "grad_norm": 0.53515625, + "grad_norm_var": 0.0015825907389322917, + "learning_rate": 2e-05, + "loss": 1.1715, + "loss/crossentropy": 2.5834288597106934, + "loss/dist_ce": 0.0, + "loss/fcd": 1.03125, + "loss/idx": 12.0, + "loss/logits": 0.14024239778518677, + "step": 70 + }, + { + "epoch": 0.0010601761982977453, + "grad_norm": 0.67578125, + "grad_norm_var": 0.0017567316691080729, + "learning_rate": 2e-05, + "loss": 1.4302, + "loss/crossentropy": 2.542942523956299, + "loss/dist_ce": 0.0, + "loss/fcd": 1.2265625, + "loss/idx": 12.0, + "loss/logits": 0.2036018967628479, + "step": 71 + }, + { + "epoch": 0.0010751082574286994, + "grad_norm": 0.5625, + "grad_norm_var": 0.0018633524576822916, + "learning_rate": 2e-05, + "loss": 1.252, + "loss/crossentropy": 2.635038137435913, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0859375, + "loss/idx": 12.0, + "loss/logits": 0.16602204740047455, + "step": 72 + }, + { + "epoch": 0.0010900403165596535, + "grad_norm": 0.5859375, + "grad_norm_var": 0.0018633524576822916, + "learning_rate": 2e-05, + "loss": 1.3169, + "loss/crossentropy": 2.5402872562408447, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1328125, + "loss/idx": 12.0, + "loss/logits": 0.18406565487384796, + "step": 73 + }, + { + "epoch": 0.0011049723756906078, + "grad_norm": 0.5546875, + "grad_norm_var": 0.001970354715983073, + "learning_rate": 2e-05, + "loss": 1.2864, + "loss/crossentropy": 2.5613765716552734, + "loss/dist_ce": 0.0, + "loss/fcd": 1.109375, + "loss/idx": 12.0, + "loss/logits": 0.177069291472435, + "step": 74 + }, + { + "epoch": 0.001119904434821562, + "grad_norm": 0.56640625, + "grad_norm_var": 0.0016253153483072917, + "learning_rate": 2e-05, + "loss": 1.2234, + "loss/crossentropy": 2.5081839561462402, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0703125, + "loss/idx": 12.0, + "loss/logits": 0.15304957330226898, + "step": 75 + }, + { + "epoch": 0.001134836493952516, + "grad_norm": 0.6796875, + "grad_norm_var": 0.002080217997233073, + "learning_rate": 2e-05, + "loss": 1.3845, + "loss/crossentropy": 2.4124844074249268, + "loss/dist_ce": 0.0, + "loss/fcd": 1.203125, + "loss/idx": 12.0, + "loss/logits": 0.18140918016433716, + "step": 76 + }, + { + "epoch": 0.0011497685530834703, + "grad_norm": 0.87109375, + "grad_norm_var": 0.006712849934895833, + "learning_rate": 2e-05, + "loss": 1.4476, + "loss/crossentropy": 2.6826772689819336, + "loss/dist_ce": 0.0, + "loss/fcd": 1.21875, + "loss/idx": 12.0, + "loss/logits": 0.22889642417430878, + "step": 77 + }, + { + "epoch": 0.0011647006122144244, + "grad_norm": 0.67578125, + "grad_norm_var": 0.0067626317342122395, + "learning_rate": 2e-05, + "loss": 1.5034, + "loss/crossentropy": 2.1851906776428223, + "loss/dist_ce": 0.0, + "loss/fcd": 1.3125, + "loss/idx": 12.0, + "loss/logits": 0.19086772203445435, + "step": 78 + }, + { + "epoch": 0.0011796326713453785, + "grad_norm": 0.60546875, + "grad_norm_var": 0.00676720937093099, + "learning_rate": 2e-05, + "loss": 1.1937, + "loss/crossentropy": 2.655413866043091, + "loss/dist_ce": 0.0, + "loss/fcd": 1.046875, + "loss/idx": 12.0, + "loss/logits": 0.1467989981174469, + "step": 79 + }, + { + "epoch": 0.0011945647304763326, + "grad_norm": 0.64453125, + "grad_norm_var": 0.006690470377604166, + "learning_rate": 2e-05, + "loss": 1.3068, + "loss/crossentropy": 2.6873862743377686, + "loss/dist_ce": 0.0, + "loss/fcd": 1.125, + "loss/idx": 12.0, + "loss/logits": 0.18183887004852295, + "step": 80 + }, + { + "epoch": 0.001209496789607287, + "grad_norm": 0.65625, + "grad_norm_var": 0.006758371988932292, + "learning_rate": 2e-05, + "loss": 1.334, + "loss/crossentropy": 2.697080373764038, + "loss/dist_ce": 0.0, + "loss/fcd": 1.140625, + "loss/idx": 12.0, + "loss/logits": 0.19333507120609283, + "step": 81 + }, + { + "epoch": 0.001224428848738241, + "grad_norm": 0.5234375, + "grad_norm_var": 0.007358741760253906, + "learning_rate": 2e-05, + "loss": 1.19, + "loss/crossentropy": 2.5790112018585205, + "loss/dist_ce": 0.0, + "loss/fcd": 1.046875, + "loss/idx": 12.0, + "loss/logits": 0.14316534996032715, + "step": 82 + }, + { + "epoch": 0.001239360907869195, + "grad_norm": 0.54296875, + "grad_norm_var": 0.007433509826660157, + "learning_rate": 2e-05, + "loss": 1.2365, + "loss/crossentropy": 2.552305221557617, + "loss/dist_ce": 0.0, + "loss/fcd": 1.078125, + "loss/idx": 12.0, + "loss/logits": 0.15837864577770233, + "step": 83 + }, + { + "epoch": 0.0012542929670001494, + "grad_norm": 0.7421875, + "grad_norm_var": 0.008379046122233074, + "learning_rate": 2e-05, + "loss": 1.5673, + "loss/crossentropy": 2.189481019973755, + "loss/dist_ce": 0.0, + "loss/fcd": 1.3515625, + "loss/idx": 12.0, + "loss/logits": 0.21574443578720093, + "step": 84 + }, + { + "epoch": 0.0012692250261311035, + "grad_norm": 0.72265625, + "grad_norm_var": 0.008755938212076823, + "learning_rate": 2e-05, + "loss": 1.4194, + "loss/crossentropy": 2.523226261138916, + "loss/dist_ce": 0.0, + "loss/fcd": 1.21875, + "loss/idx": 12.0, + "loss/logits": 0.20060396194458008, + "step": 85 + }, + { + "epoch": 0.0012841570852620576, + "grad_norm": 0.56640625, + "grad_norm_var": 0.008404986063639323, + "learning_rate": 2e-05, + "loss": 1.3345, + "loss/crossentropy": 2.6093485355377197, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1484375, + "loss/idx": 12.0, + "loss/logits": 0.1860586702823639, + "step": 86 + }, + { + "epoch": 0.0012990891443930119, + "grad_norm": 0.5390625, + "grad_norm_var": 0.008847808837890625, + "learning_rate": 2e-05, + "loss": 1.2861, + "loss/crossentropy": 2.5548102855682373, + "loss/dist_ce": 0.0, + "loss/fcd": 1.109375, + "loss/idx": 12.0, + "loss/logits": 0.17671170830726624, + "step": 87 + }, + { + "epoch": 0.001314021203523966, + "grad_norm": 1.2265625, + "grad_norm_var": 0.030658976236979166, + "learning_rate": 2e-05, + "loss": 1.7591, + "loss/crossentropy": 2.089616298675537, + "loss/dist_ce": 0.0, + "loss/fcd": 1.5, + "loss/idx": 12.0, + "loss/logits": 0.259127676486969, + "step": 88 + }, + { + "epoch": 0.00132895326265492, + "grad_norm": 0.6484375, + "grad_norm_var": 0.030211385091145834, + "learning_rate": 2e-05, + "loss": 1.3975, + "loss/crossentropy": 2.563173770904541, + "loss/dist_ce": 0.0, + "loss/fcd": 1.203125, + "loss/idx": 12.0, + "loss/logits": 0.19433674216270447, + "step": 89 + }, + { + "epoch": 0.0013438853217858744, + "grad_norm": 0.60546875, + "grad_norm_var": 0.029572486877441406, + "learning_rate": 2e-05, + "loss": 1.4763, + "loss/crossentropy": 2.6154820919036865, + "loss/dist_ce": 0.0, + "loss/fcd": 1.2421875, + "loss/idx": 12.0, + "loss/logits": 0.23407219350337982, + "step": 90 + }, + { + "epoch": 0.0013588173809168285, + "grad_norm": 0.62890625, + "grad_norm_var": 0.02890313466389974, + "learning_rate": 2e-05, + "loss": 1.4736, + "loss/crossentropy": 2.460665225982666, + "loss/dist_ce": 0.0, + "loss/fcd": 1.2265625, + "loss/idx": 12.0, + "loss/logits": 0.2470313310623169, + "step": 91 + }, + { + "epoch": 0.0013737494400477826, + "grad_norm": 0.65625, + "grad_norm_var": 0.02893822987874349, + "learning_rate": 2e-05, + "loss": 1.2087, + "loss/crossentropy": 2.815687417984009, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0546875, + "loss/idx": 12.0, + "loss/logits": 0.1540374755859375, + "step": 92 + }, + { + "epoch": 0.0013886814991787367, + "grad_norm": 0.53125, + "grad_norm_var": 0.027428181966145833, + "learning_rate": 2e-05, + "loss": 1.2891, + "loss/crossentropy": 2.7470219135284424, + "loss/dist_ce": 0.0, + "loss/fcd": 1.109375, + "loss/idx": 12.0, + "loss/logits": 0.17968374490737915, + "step": 93 + }, + { + "epoch": 0.001403613558309691, + "grad_norm": 0.5390625, + "grad_norm_var": 0.02825819651285807, + "learning_rate": 2e-05, + "loss": 1.178, + "loss/crossentropy": 2.582261562347412, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0234375, + "loss/idx": 12.0, + "loss/logits": 0.15451796352863312, + "step": 94 + }, + { + "epoch": 0.001418545617440645, + "grad_norm": 0.9609375, + "grad_norm_var": 0.03410746256510417, + "learning_rate": 2e-05, + "loss": 1.6059, + "loss/crossentropy": 2.5164883136749268, + "loss/dist_ce": 0.0, + "loss/fcd": 1.359375, + "loss/idx": 12.0, + "loss/logits": 0.24653397500514984, + "step": 95 + }, + { + "epoch": 0.0014334776765715992, + "grad_norm": 0.515625, + "grad_norm_var": 0.03559919993082682, + "learning_rate": 2e-05, + "loss": 1.1938, + "loss/crossentropy": 2.423619031906128, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0390625, + "loss/idx": 12.0, + "loss/logits": 0.154771625995636, + "step": 96 + }, + { + "epoch": 0.0014484097357025535, + "grad_norm": 0.55859375, + "grad_norm_var": 0.03628107706705729, + "learning_rate": 2e-05, + "loss": 1.2842, + "loss/crossentropy": 2.7245681285858154, + "loss/dist_ce": 0.0, + "loss/fcd": 1.109375, + "loss/idx": 12.0, + "loss/logits": 0.17486220598220825, + "step": 97 + }, + { + "epoch": 0.0014633417948335076, + "grad_norm": 0.515625, + "grad_norm_var": 0.036423746744791666, + "learning_rate": 2e-05, + "loss": 1.2278, + "loss/crossentropy": 2.596822500228882, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0703125, + "loss/idx": 12.0, + "loss/logits": 0.15747135877609253, + "step": 98 + }, + { + "epoch": 0.0014782738539644616, + "grad_norm": 0.58984375, + "grad_norm_var": 0.035853068033854164, + "learning_rate": 2e-05, + "loss": 1.3496, + "loss/crossentropy": 2.319052219390869, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1484375, + "loss/idx": 12.0, + "loss/logits": 0.20119673013687134, + "step": 99 + }, + { + "epoch": 0.001493205913095416, + "grad_norm": 0.51171875, + "grad_norm_var": 0.036622047424316406, + "learning_rate": 2e-05, + "loss": 1.2263, + "loss/crossentropy": 2.8021628856658936, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0703125, + "loss/idx": 12.0, + "loss/logits": 0.15594205260276794, + "step": 100 + }, + { + "epoch": 0.00150813797222637, + "grad_norm": 0.56640625, + "grad_norm_var": 0.03652540842692057, + "learning_rate": 2e-05, + "loss": 1.3127, + "loss/crossentropy": 2.776284694671631, + "loss/dist_ce": 0.0, + "loss/fcd": 1.125, + "loss/idx": 12.0, + "loss/logits": 0.1876915991306305, + "step": 101 + }, + { + "epoch": 0.0015230700313573241, + "grad_norm": 0.53125, + "grad_norm_var": 0.03692423502604167, + "learning_rate": 2e-05, + "loss": 1.2802, + "loss/crossentropy": 2.548290967941284, + "loss/dist_ce": 0.0, + "loss/fcd": 1.109375, + "loss/idx": 12.0, + "loss/logits": 0.17086531221866608, + "step": 102 + }, + { + "epoch": 0.0015380020904882782, + "grad_norm": 0.96484375, + "grad_norm_var": 0.04293257395426432, + "learning_rate": 2e-05, + "loss": 1.6039, + "loss/crossentropy": 2.2254953384399414, + "loss/dist_ce": 0.0, + "loss/fcd": 1.359375, + "loss/idx": 12.0, + "loss/logits": 0.24454209208488464, + "step": 103 + }, + { + "epoch": 0.0015529341496192325, + "grad_norm": 0.5234375, + "grad_norm_var": 0.020662371317545572, + "learning_rate": 2e-05, + "loss": 1.1787, + "loss/crossentropy": 2.4688055515289307, + "loss/dist_ce": 0.0, + "loss/fcd": 1.03125, + "loss/idx": 12.0, + "loss/logits": 0.14745503664016724, + "step": 104 + }, + { + "epoch": 0.0015678662087501866, + "grad_norm": 0.5390625, + "grad_norm_var": 0.020929400126139322, + "learning_rate": 2e-05, + "loss": 1.3059, + "loss/crossentropy": 2.3868861198425293, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1328125, + "loss/idx": 12.0, + "loss/logits": 0.17307642102241516, + "step": 105 + }, + { + "epoch": 0.0015827982678811407, + "grad_norm": 0.546875, + "grad_norm_var": 0.02116877237955729, + "learning_rate": 2e-05, + "loss": 1.2415, + "loss/crossentropy": 2.660839319229126, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0703125, + "loss/idx": 12.0, + "loss/logits": 0.17113830149173737, + "step": 106 + }, + { + "epoch": 0.001597730327012095, + "grad_norm": 0.65234375, + "grad_norm_var": 0.021277872721354167, + "learning_rate": 2e-05, + "loss": 1.3782, + "loss/crossentropy": 2.385211944580078, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1875, + "loss/idx": 12.0, + "loss/logits": 0.19073891639709473, + "step": 107 + }, + { + "epoch": 0.0016126623861430491, + "grad_norm": 0.58203125, + "grad_norm_var": 0.021129290262858074, + "learning_rate": 2e-05, + "loss": 1.2911, + "loss/crossentropy": 2.6263484954833984, + "loss/dist_ce": 0.0, + "loss/fcd": 1.109375, + "loss/idx": 12.0, + "loss/logits": 0.18175816535949707, + "step": 108 + }, + { + "epoch": 0.0016275944452740032, + "grad_norm": 0.62890625, + "grad_norm_var": 0.020806630452473957, + "learning_rate": 2e-05, + "loss": 1.3712, + "loss/crossentropy": 2.2086172103881836, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1953125, + "loss/idx": 12.0, + "loss/logits": 0.17591926455497742, + "step": 109 + }, + { + "epoch": 0.0016425265044049575, + "grad_norm": 0.5390625, + "grad_norm_var": 0.020806630452473957, + "learning_rate": 2e-05, + "loss": 1.2894, + "loss/crossentropy": 2.563765048980713, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1171875, + "loss/idx": 12.0, + "loss/logits": 0.1721740961074829, + "step": 110 + }, + { + "epoch": 0.0016574585635359116, + "grad_norm": 0.5859375, + "grad_norm_var": 0.011944325764973958, + "learning_rate": 2e-05, + "loss": 1.2239, + "loss/crossentropy": 2.4681785106658936, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0625, + "loss/idx": 12.0, + "loss/logits": 0.16138719022274017, + "step": 111 + }, + { + "epoch": 0.0016723906226668657, + "grad_norm": 0.61328125, + "grad_norm_var": 0.011643918355305989, + "learning_rate": 2e-05, + "loss": 1.3192, + "loss/crossentropy": 2.641052484512329, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1484375, + "loss/idx": 12.0, + "loss/logits": 0.17078858613967896, + "step": 112 + }, + { + "epoch": 0.00168732268179782, + "grad_norm": 0.6640625, + "grad_norm_var": 0.011889394124348958, + "learning_rate": 2e-05, + "loss": 1.4469, + "loss/crossentropy": 2.031921863555908, + "loss/dist_ce": 0.0, + "loss/fcd": 1.234375, + "loss/idx": 12.0, + "loss/logits": 0.21249458193778992, + "step": 113 + }, + { + "epoch": 0.0017022547409287741, + "grad_norm": 0.59765625, + "grad_norm_var": 0.01141808827718099, + "learning_rate": 2e-05, + "loss": 1.3311, + "loss/crossentropy": 2.430854082107544, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1484375, + "loss/idx": 12.0, + "loss/logits": 0.1826540231704712, + "step": 114 + }, + { + "epoch": 0.0017171868000597282, + "grad_norm": 0.671875, + "grad_norm_var": 0.011702473958333333, + "learning_rate": 2e-05, + "loss": 1.267, + "loss/crossentropy": 2.7262022495269775, + "loss/dist_ce": 0.0, + "loss/fcd": 1.09375, + "loss/idx": 12.0, + "loss/logits": 0.17326927185058594, + "step": 115 + }, + { + "epoch": 0.0017321188591906823, + "grad_norm": 0.5703125, + "grad_norm_var": 0.011169370015462239, + "learning_rate": 2e-05, + "loss": 1.1959, + "loss/crossentropy": 2.7001309394836426, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0390625, + "loss/idx": 12.0, + "loss/logits": 0.1568140983581543, + "step": 116 + }, + { + "epoch": 0.0017470509183216366, + "grad_norm": 0.54296875, + "grad_norm_var": 0.01134332021077474, + "learning_rate": 2e-05, + "loss": 1.273, + "loss/crossentropy": 2.252134323120117, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1015625, + "loss/idx": 12.0, + "loss/logits": 0.1714470088481903, + "step": 117 + }, + { + "epoch": 0.0017619829774525907, + "grad_norm": 0.58984375, + "grad_norm_var": 0.010945638020833334, + "learning_rate": 2e-05, + "loss": 1.2842, + "loss/crossentropy": 2.4008445739746094, + "loss/dist_ce": 0.0, + "loss/fcd": 1.109375, + "loss/idx": 12.0, + "loss/logits": 0.17484425008296967, + "step": 118 + }, + { + "epoch": 0.0017769150365835448, + "grad_norm": 0.58984375, + "grad_norm_var": 0.0021565755208333335, + "learning_rate": 2e-05, + "loss": 1.3173, + "loss/crossentropy": 2.561591863632202, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1328125, + "loss/idx": 12.0, + "loss/logits": 0.18449710309505463, + "step": 119 + }, + { + "epoch": 0.001791847095714499, + "grad_norm": 0.78125, + "grad_norm_var": 0.0040280659993489586, + "learning_rate": 2e-05, + "loss": 1.4499, + "loss/crossentropy": 2.298919677734375, + "loss/dist_ce": 0.0, + "loss/fcd": 1.2578125, + "loss/idx": 12.0, + "loss/logits": 0.19211535155773163, + "step": 120 + }, + { + "epoch": 0.0018067791548454532, + "grad_norm": 0.59765625, + "grad_norm_var": 0.0037200291951497394, + "learning_rate": 2e-05, + "loss": 1.3325, + "loss/crossentropy": 2.5992934703826904, + "loss/dist_ce": 0.0, + "loss/fcd": 1.140625, + "loss/idx": 12.0, + "loss/logits": 0.19187986850738525, + "step": 121 + }, + { + "epoch": 0.0018217112139764073, + "grad_norm": 0.52734375, + "grad_norm_var": 0.003907267252604167, + "learning_rate": 2e-05, + "loss": 1.2822, + "loss/crossentropy": 2.6393258571624756, + "loss/dist_ce": 0.0, + "loss/fcd": 1.109375, + "loss/idx": 12.0, + "loss/logits": 0.17281374335289001, + "step": 122 + }, + { + "epoch": 0.0018366432731073616, + "grad_norm": 0.66015625, + "grad_norm_var": 0.003956858317057292, + "learning_rate": 2e-05, + "loss": 1.5573, + "loss/crossentropy": 2.521825075149536, + "loss/dist_ce": 0.0, + "loss/fcd": 1.296875, + "loss/idx": 12.0, + "loss/logits": 0.26045745611190796, + "step": 123 + }, + { + "epoch": 0.0018515753322383157, + "grad_norm": 0.625, + "grad_norm_var": 0.00391839345296224, + "learning_rate": 2e-05, + "loss": 1.4125, + "loss/crossentropy": 2.404489278793335, + "loss/dist_ce": 0.0, + "loss/fcd": 1.2109375, + "loss/idx": 12.0, + "loss/logits": 0.2015247493982315, + "step": 124 + }, + { + "epoch": 0.0018665073913692698, + "grad_norm": 0.546875, + "grad_norm_var": 0.004149373372395833, + "learning_rate": 2e-05, + "loss": 1.3093, + "loss/crossentropy": 2.4341983795166016, + "loss/dist_ce": 0.0, + "loss/fcd": 1.140625, + "loss/idx": 12.0, + "loss/logits": 0.16871654987335205, + "step": 125 + }, + { + "epoch": 0.001881439450500224, + "grad_norm": 0.71484375, + "grad_norm_var": 0.00450127919514974, + "learning_rate": 2e-05, + "loss": 1.4935, + "loss/crossentropy": 2.46096134185791, + "loss/dist_ce": 0.0, + "loss/fcd": 1.28125, + "loss/idx": 12.0, + "loss/logits": 0.2122688889503479, + "step": 126 + }, + { + "epoch": 0.0018963715096311782, + "grad_norm": 0.5078125, + "grad_norm_var": 0.00521081288655599, + "learning_rate": 2e-05, + "loss": 1.1707, + "loss/crossentropy": 2.6259350776672363, + "loss/dist_ce": 0.0, + "loss/fcd": 1.03125, + "loss/idx": 12.0, + "loss/logits": 0.13940490782260895, + "step": 127 + }, + { + "epoch": 0.0019113035687621323, + "grad_norm": 0.59765625, + "grad_norm_var": 0.005224545796712239, + "learning_rate": 2e-05, + "loss": 1.3569, + "loss/crossentropy": 2.5117621421813965, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1640625, + "loss/idx": 12.0, + "loss/logits": 0.19283828139305115, + "step": 128 + }, + { + "epoch": 0.0019262356278930864, + "grad_norm": 0.53125, + "grad_norm_var": 0.005397478739420573, + "learning_rate": 2e-05, + "loss": 1.3103, + "loss/crossentropy": 2.7031896114349365, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1328125, + "loss/idx": 12.0, + "loss/logits": 0.1774989813566208, + "step": 129 + }, + { + "epoch": 0.0019411676870240407, + "grad_norm": 0.63671875, + "grad_norm_var": 0.005463600158691406, + "learning_rate": 2e-05, + "loss": 1.2254, + "loss/crossentropy": 2.3635990619659424, + "loss/dist_ce": 0.0, + "loss/fcd": 1.078125, + "loss/idx": 12.0, + "loss/logits": 0.14724132418632507, + "step": 130 + }, + { + "epoch": 0.0019560997461549948, + "grad_norm": 0.51171875, + "grad_norm_var": 0.005653889973958334, + "learning_rate": 2e-05, + "loss": 1.2139, + "loss/crossentropy": 2.5982468128204346, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0625, + "loss/idx": 12.0, + "loss/logits": 0.15138475596904755, + "step": 131 + }, + { + "epoch": 0.001971031805285949, + "grad_norm": 0.55859375, + "grad_norm_var": 0.0057021458943684895, + "learning_rate": 2e-05, + "loss": 1.2847, + "loss/crossentropy": 2.7570533752441406, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1171875, + "loss/idx": 12.0, + "loss/logits": 0.1674671769142151, + "step": 132 + }, + { + "epoch": 0.001985963864416903, + "grad_norm": 0.66796875, + "grad_norm_var": 0.005812009175618489, + "learning_rate": 2e-05, + "loss": 1.362, + "loss/crossentropy": 2.513373374938965, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1875, + "loss/idx": 12.0, + "loss/logits": 0.1745297610759735, + "step": 133 + }, + { + "epoch": 0.002000895923547857, + "grad_norm": 0.5859375, + "grad_norm_var": 0.0058197021484375, + "learning_rate": 2e-05, + "loss": 1.2976, + "loss/crossentropy": 2.5316669940948486, + "loss/dist_ce": 0.0, + "loss/fcd": 1.125, + "loss/idx": 12.0, + "loss/logits": 0.1726335734128952, + "step": 134 + }, + { + "epoch": 0.0020158279826788116, + "grad_norm": 0.63671875, + "grad_norm_var": 0.005877685546875, + "learning_rate": 2e-05, + "loss": 1.395, + "loss/crossentropy": 2.557969331741333, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1953125, + "loss/idx": 12.0, + "loss/logits": 0.19964131712913513, + "step": 135 + }, + { + "epoch": 0.0020307600418097657, + "grad_norm": 0.67578125, + "grad_norm_var": 0.004100990295410156, + "learning_rate": 2e-05, + "loss": 1.216, + "loss/crossentropy": 2.6619279384613037, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0703125, + "loss/idx": 12.0, + "loss/logits": 0.14569371938705444, + "step": 136 + }, + { + "epoch": 0.0020456921009407198, + "grad_norm": 0.56640625, + "grad_norm_var": 0.00416711171468099, + "learning_rate": 2e-05, + "loss": 1.2596, + "loss/crossentropy": 2.415104389190674, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1015625, + "loss/idx": 12.0, + "loss/logits": 0.1580391526222229, + "step": 137 + }, + { + "epoch": 0.002060624160071674, + "grad_norm": 0.640625, + "grad_norm_var": 0.003918202718098959, + "learning_rate": 2e-05, + "loss": 1.4596, + "loss/crossentropy": 2.653369903564453, + "loss/dist_ce": 0.0, + "loss/fcd": 1.2265625, + "loss/idx": 12.0, + "loss/logits": 0.2330111861228943, + "step": 138 + }, + { + "epoch": 0.002075556219202628, + "grad_norm": 0.8984375, + "grad_norm_var": 0.009250831604003907, + "learning_rate": 2e-05, + "loss": 1.4266, + "loss/crossentropy": 2.440645694732666, + "loss/dist_ce": 0.0, + "loss/fcd": 1.21875, + "loss/idx": 12.0, + "loss/logits": 0.2078884392976761, + "step": 139 + }, + { + "epoch": 0.002090488278333582, + "grad_norm": 0.58984375, + "grad_norm_var": 0.009299468994140626, + "learning_rate": 2e-05, + "loss": 1.2998, + "loss/crossentropy": 2.5413432121276855, + "loss/dist_ce": 0.0, + "loss/fcd": 1.125, + "loss/idx": 12.0, + "loss/logits": 0.1748320460319519, + "step": 140 + }, + { + "epoch": 0.0021054203374645366, + "grad_norm": 0.6171875, + "grad_norm_var": 0.008953857421875, + "learning_rate": 2e-05, + "loss": 1.4042, + "loss/crossentropy": 2.618962287902832, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1953125, + "loss/idx": 12.0, + "loss/logits": 0.20887598395347595, + "step": 141 + }, + { + "epoch": 0.0021203523965954907, + "grad_norm": 0.5703125, + "grad_norm_var": 0.008452796936035156, + "learning_rate": 2e-05, + "loss": 1.2957, + "loss/crossentropy": 2.5938720703125, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1171875, + "loss/idx": 12.0, + "loss/logits": 0.17847199738025665, + "step": 142 + }, + { + "epoch": 0.0021352844557264447, + "grad_norm": 0.56640625, + "grad_norm_var": 0.007852935791015625, + "learning_rate": 2e-05, + "loss": 1.2851, + "loss/crossentropy": 2.5696861743927, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1171875, + "loss/idx": 12.0, + "loss/logits": 0.167904332280159, + "step": 143 + }, + { + "epoch": 0.002150216514857399, + "grad_norm": 0.5390625, + "grad_norm_var": 0.008208656311035156, + "learning_rate": 2e-05, + "loss": 1.294, + "loss/crossentropy": 2.61997389793396, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1171875, + "loss/idx": 12.0, + "loss/logits": 0.17682784795761108, + "step": 144 + }, + { + "epoch": 0.002165148573988353, + "grad_norm": 0.91796875, + "grad_norm_var": 0.013388824462890626, + "learning_rate": 2e-05, + "loss": 1.4053, + "loss/crossentropy": 2.3499011993408203, + "loss/dist_ce": 0.0, + "loss/fcd": 1.21875, + "loss/idx": 12.0, + "loss/logits": 0.18656103312969208, + "step": 145 + }, + { + "epoch": 0.002180080633119307, + "grad_norm": 0.69921875, + "grad_norm_var": 0.013637034098307292, + "learning_rate": 2e-05, + "loss": 1.4045, + "loss/crossentropy": 2.7330119609832764, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1875, + "loss/idx": 12.0, + "loss/logits": 0.21699491143226624, + "step": 146 + }, + { + "epoch": 0.002195012692250261, + "grad_norm": 0.5625, + "grad_norm_var": 0.01292870839436849, + "learning_rate": 2e-05, + "loss": 1.2919, + "loss/crossentropy": 2.430858612060547, + "loss/dist_ce": 0.0, + "loss/fcd": 1.125, + "loss/idx": 12.0, + "loss/logits": 0.16690674424171448, + "step": 147 + }, + { + "epoch": 0.0022099447513812156, + "grad_norm": 0.53125, + "grad_norm_var": 0.0132843017578125, + "learning_rate": 2e-05, + "loss": 1.2364, + "loss/crossentropy": 2.6031928062438965, + "loss/dist_ce": 0.0, + "loss/fcd": 1.078125, + "loss/idx": 12.0, + "loss/logits": 0.1582430899143219, + "step": 148 + }, + { + "epoch": 0.0022248768105121697, + "grad_norm": 0.55859375, + "grad_norm_var": 0.0136474609375, + "learning_rate": 2e-05, + "loss": 1.2916, + "loss/crossentropy": 2.755666494369507, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1171875, + "loss/idx": 12.0, + "loss/logits": 0.17443333566188812, + "step": 149 + }, + { + "epoch": 0.002239808869643124, + "grad_norm": 0.60546875, + "grad_norm_var": 0.013544146219889324, + "learning_rate": 2e-05, + "loss": 1.3718, + "loss/crossentropy": 2.467615842819214, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1875, + "loss/idx": 12.0, + "loss/logits": 0.18433909118175507, + "step": 150 + }, + { + "epoch": 0.002254740928774078, + "grad_norm": 0.5390625, + "grad_norm_var": 0.014130655924479167, + "learning_rate": 2e-05, + "loss": 1.2852, + "loss/crossentropy": 2.555243730545044, + "loss/dist_ce": 0.0, + "loss/fcd": 1.109375, + "loss/idx": 12.0, + "loss/logits": 0.17583933472633362, + "step": 151 + }, + { + "epoch": 0.002269672987905032, + "grad_norm": 0.6171875, + "grad_norm_var": 0.013986651102701824, + "learning_rate": 2e-05, + "loss": 1.3463, + "loss/crossentropy": 2.4775476455688477, + "loss/dist_ce": 0.0, + "loss/fcd": 1.15625, + "loss/idx": 12.0, + "loss/logits": 0.19002023339271545, + "step": 152 + }, + { + "epoch": 0.002284605047035986, + "grad_norm": 0.546875, + "grad_norm_var": 0.014166259765625, + "learning_rate": 2e-05, + "loss": 1.2389, + "loss/crossentropy": 2.747436761856079, + "loss/dist_ce": 0.0, + "loss/fcd": 1.09375, + "loss/idx": 12.0, + "loss/logits": 0.14515338838100433, + "step": 153 + }, + { + "epoch": 0.0022995371061669406, + "grad_norm": 0.625, + "grad_norm_var": 0.014148966471354166, + "learning_rate": 2e-05, + "loss": 1.3508, + "loss/crossentropy": 2.550513505935669, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1484375, + "loss/idx": 12.0, + "loss/logits": 0.20233154296875, + "step": 154 + }, + { + "epoch": 0.0023144691652978947, + "grad_norm": 0.609375, + "grad_norm_var": 0.008794911702473958, + "learning_rate": 2e-05, + "loss": 1.3803, + "loss/crossentropy": 2.761523723602295, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1875, + "loss/idx": 12.0, + "loss/logits": 0.19279026985168457, + "step": 155 + }, + { + "epoch": 0.002329401224428849, + "grad_norm": 0.625, + "grad_norm_var": 0.008796628316243489, + "learning_rate": 2e-05, + "loss": 1.3714, + "loss/crossentropy": 2.6067566871643066, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1484375, + "loss/idx": 12.0, + "loss/logits": 0.22298991680145264, + "step": 156 + }, + { + "epoch": 0.002344333283559803, + "grad_norm": 0.486328125, + "grad_norm_var": 0.00970927874247233, + "learning_rate": 2e-05, + "loss": 1.166, + "loss/crossentropy": 2.559112548828125, + "loss/dist_ce": 0.0, + "loss/fcd": 1.015625, + "loss/idx": 12.0, + "loss/logits": 0.1503329873085022, + "step": 157 + }, + { + "epoch": 0.002359265342690757, + "grad_norm": 0.5546875, + "grad_norm_var": 0.009786335627237956, + "learning_rate": 2e-05, + "loss": 1.2461, + "loss/crossentropy": 2.5379300117492676, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0859375, + "loss/idx": 12.0, + "loss/logits": 0.16015547513961792, + "step": 158 + }, + { + "epoch": 0.002374197401821711, + "grad_norm": 0.55859375, + "grad_norm_var": 0.009824101130167644, + "learning_rate": 2e-05, + "loss": 1.3303, + "loss/crossentropy": 2.4808268547058105, + "loss/dist_ce": 0.0, + "loss/fcd": 1.15625, + "loss/idx": 12.0, + "loss/logits": 0.1740918755531311, + "step": 159 + }, + { + "epoch": 0.002389129460952665, + "grad_norm": 0.478515625, + "grad_norm_var": 0.01053314208984375, + "learning_rate": 2e-05, + "loss": 1.1762, + "loss/crossentropy": 2.506295919418335, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0234375, + "loss/idx": 12.0, + "loss/logits": 0.15280288457870483, + "step": 160 + }, + { + "epoch": 0.0024040615200836197, + "grad_norm": 0.5703125, + "grad_norm_var": 0.0031035741170247397, + "learning_rate": 2e-05, + "loss": 1.2835, + "loss/crossentropy": 2.7988052368164062, + "loss/dist_ce": 0.0, + "loss/fcd": 1.109375, + "loss/idx": 12.0, + "loss/logits": 0.17412468791007996, + "step": 161 + }, + { + "epoch": 0.002418993579214574, + "grad_norm": 0.57421875, + "grad_norm_var": 0.001976458231608073, + "learning_rate": 2e-05, + "loss": 1.3794, + "loss/crossentropy": 2.4478092193603516, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1875, + "loss/idx": 12.0, + "loss/logits": 0.19193212687969208, + "step": 162 + }, + { + "epoch": 0.002433925638345528, + "grad_norm": 0.52734375, + "grad_norm_var": 0.0020662943522135415, + "learning_rate": 2e-05, + "loss": 1.2295, + "loss/crossentropy": 2.6355202198028564, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0703125, + "loss/idx": 12.0, + "loss/logits": 0.1592077910900116, + "step": 163 + }, + { + "epoch": 0.002448857697476482, + "grad_norm": 0.5234375, + "grad_norm_var": 0.0021031697591145835, + "learning_rate": 2e-05, + "loss": 1.2719, + "loss/crossentropy": 2.6865055561065674, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1015625, + "loss/idx": 12.0, + "loss/logits": 0.17034964263439178, + "step": 164 + }, + { + "epoch": 0.002463789756607436, + "grad_norm": 0.625, + "grad_norm_var": 0.002344195048014323, + "learning_rate": 2e-05, + "loss": 1.4733, + "loss/crossentropy": 2.4416587352752686, + "loss/dist_ce": 0.0, + "loss/fcd": 1.2421875, + "loss/idx": 12.0, + "loss/logits": 0.23110373318195343, + "step": 165 + }, + { + "epoch": 0.00247872181573839, + "grad_norm": 0.5625, + "grad_norm_var": 0.0022371927897135418, + "learning_rate": 2e-05, + "loss": 1.4188, + "loss/crossentropy": 2.4493439197540283, + "loss/dist_ce": 0.0, + "loss/fcd": 1.21875, + "loss/idx": 12.0, + "loss/logits": 0.20008787512779236, + "step": 166 + }, + { + "epoch": 0.0024936538748693447, + "grad_norm": 0.55078125, + "grad_norm_var": 0.002206865946451823, + "learning_rate": 2e-05, + "loss": 1.2253, + "loss/crossentropy": 2.712056875228882, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0625, + "loss/idx": 12.0, + "loss/logits": 0.16277046501636505, + "step": 167 + }, + { + "epoch": 0.002508585934000299, + "grad_norm": 0.6171875, + "grad_norm_var": 0.002206865946451823, + "learning_rate": 2e-05, + "loss": 1.4075, + "loss/crossentropy": 2.426335096359253, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1953125, + "loss/idx": 12.0, + "loss/logits": 0.21219667792320251, + "step": 168 + }, + { + "epoch": 0.002523517993131253, + "grad_norm": 0.5859375, + "grad_norm_var": 0.0022094090779622394, + "learning_rate": 2e-05, + "loss": 1.4624, + "loss/crossentropy": 2.7482047080993652, + "loss/dist_ce": 0.0, + "loss/fcd": 1.234375, + "loss/idx": 12.0, + "loss/logits": 0.22799468040466309, + "step": 169 + }, + { + "epoch": 0.002538450052262207, + "grad_norm": 0.55078125, + "grad_norm_var": 0.0019810994466145835, + "learning_rate": 2e-05, + "loss": 1.2488, + "loss/crossentropy": 2.6758551597595215, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0859375, + "loss/idx": 12.0, + "loss/logits": 0.16287586092948914, + "step": 170 + }, + { + "epoch": 0.002553382111393161, + "grad_norm": 0.5625, + "grad_norm_var": 0.0018254597981770834, + "learning_rate": 2e-05, + "loss": 1.2867, + "loss/crossentropy": 2.5498204231262207, + "loss/dist_ce": 0.0, + "loss/fcd": 1.109375, + "loss/idx": 12.0, + "loss/logits": 0.17730045318603516, + "step": 171 + }, + { + "epoch": 0.002568314170524115, + "grad_norm": 0.53515625, + "grad_norm_var": 0.001546160380045573, + "learning_rate": 2e-05, + "loss": 1.2392, + "loss/crossentropy": 2.717747926712036, + "loss/dist_ce": 0.0, + "loss/fcd": 1.078125, + "loss/idx": 12.0, + "loss/logits": 0.16104069352149963, + "step": 172 + }, + { + "epoch": 0.0025832462296550692, + "grad_norm": 0.78515625, + "grad_norm_var": 0.004432789484659831, + "learning_rate": 2e-05, + "loss": 1.3913, + "loss/crossentropy": 2.50931715965271, + "loss/dist_ce": 0.0, + "loss/fcd": 1.203125, + "loss/idx": 12.0, + "loss/logits": 0.18817588686943054, + "step": 173 + }, + { + "epoch": 0.0025981782887860238, + "grad_norm": 0.5234375, + "grad_norm_var": 0.0045685927073160805, + "learning_rate": 2e-05, + "loss": 1.2485, + "loss/crossentropy": 2.446424961090088, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0859375, + "loss/idx": 12.0, + "loss/logits": 0.16260743141174316, + "step": 174 + }, + { + "epoch": 0.002613110347916978, + "grad_norm": 0.58984375, + "grad_norm_var": 0.00457927385965983, + "learning_rate": 2e-05, + "loss": 1.4057, + "loss/crossentropy": 2.4681448936462402, + "loss/dist_ce": 0.0, + "loss/fcd": 1.203125, + "loss/idx": 12.0, + "loss/logits": 0.20260846614837646, + "step": 175 + }, + { + "epoch": 0.002628042407047932, + "grad_norm": 0.5546875, + "grad_norm_var": 0.003986040751139323, + "learning_rate": 2e-05, + "loss": 1.2474, + "loss/crossentropy": 2.8297557830810547, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0859375, + "loss/idx": 12.0, + "loss/logits": 0.1614263653755188, + "step": 176 + }, + { + "epoch": 0.002642974466178886, + "grad_norm": 0.55078125, + "grad_norm_var": 0.0040283203125, + "learning_rate": 2e-05, + "loss": 1.1839, + "loss/crossentropy": 2.594815731048584, + "loss/dist_ce": 0.0, + "loss/fcd": 1.03125, + "loss/idx": 12.0, + "loss/logits": 0.1526121199131012, + "step": 177 + }, + { + "epoch": 0.00265790652530984, + "grad_norm": 0.51953125, + "grad_norm_var": 0.004229482014973958, + "learning_rate": 2e-05, + "loss": 1.3533, + "loss/crossentropy": 2.3881778717041016, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1484375, + "loss/idx": 12.0, + "loss/logits": 0.20487645268440247, + "step": 178 + }, + { + "epoch": 0.0026728385844407942, + "grad_norm": 0.5, + "grad_norm_var": 0.00444176991780599, + "learning_rate": 2e-05, + "loss": 1.1749, + "loss/crossentropy": 2.5949110984802246, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0234375, + "loss/idx": 12.0, + "loss/logits": 0.15150848031044006, + "step": 179 + }, + { + "epoch": 0.0026877706435717488, + "grad_norm": 0.58203125, + "grad_norm_var": 0.004284413655598959, + "learning_rate": 2e-05, + "loss": 1.2842, + "loss/crossentropy": 2.8450071811676025, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1171875, + "loss/idx": 12.0, + "loss/logits": 0.16698706150054932, + "step": 180 + }, + { + "epoch": 0.002702702702702703, + "grad_norm": 0.48828125, + "grad_norm_var": 0.004535865783691406, + "learning_rate": 2e-05, + "loss": 1.2074, + "loss/crossentropy": 2.5095417499542236, + "loss/dist_ce": 0.0, + "loss/fcd": 1.046875, + "loss/idx": 12.0, + "loss/logits": 0.1605302095413208, + "step": 181 + }, + { + "epoch": 0.002717634761833657, + "grad_norm": 0.6015625, + "grad_norm_var": 0.004612159729003906, + "learning_rate": 2e-05, + "loss": 1.359, + "loss/crossentropy": 2.6669504642486572, + "loss/dist_ce": 0.0, + "loss/fcd": 1.15625, + "loss/idx": 12.0, + "loss/logits": 0.20277641713619232, + "step": 182 + }, + { + "epoch": 0.002732566820964611, + "grad_norm": 0.5625, + "grad_norm_var": 0.0045928955078125, + "learning_rate": 2e-05, + "loss": 1.1927, + "loss/crossentropy": 2.5329155921936035, + "loss/dist_ce": 0.0, + "loss/fcd": 1.046875, + "loss/idx": 12.0, + "loss/logits": 0.14582672715187073, + "step": 183 + }, + { + "epoch": 0.002747498880095565, + "grad_norm": 0.56640625, + "grad_norm_var": 0.004430071512858073, + "learning_rate": 2e-05, + "loss": 1.354, + "loss/crossentropy": 2.6010122299194336, + "loss/dist_ce": 0.0, + "loss/fcd": 1.15625, + "loss/idx": 12.0, + "loss/logits": 0.19775637984275818, + "step": 184 + }, + { + "epoch": 0.0027624309392265192, + "grad_norm": 0.56640625, + "grad_norm_var": 0.0044024149576822914, + "learning_rate": 2e-05, + "loss": 1.3048, + "loss/crossentropy": 2.39243483543396, + "loss/dist_ce": 0.0, + "loss/fcd": 1.125, + "loss/idx": 12.0, + "loss/logits": 0.17982321977615356, + "step": 185 + }, + { + "epoch": 0.0027773629983574733, + "grad_norm": 0.515625, + "grad_norm_var": 0.004546038309733073, + "learning_rate": 2e-05, + "loss": 1.2813, + "loss/crossentropy": 2.5203473567962646, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1171875, + "loss/idx": 12.0, + "loss/logits": 0.16413094103336334, + "step": 186 + }, + { + "epoch": 0.002792295057488428, + "grad_norm": 0.68359375, + "grad_norm_var": 0.0054585774739583336, + "learning_rate": 2e-05, + "loss": 1.5839, + "loss/crossentropy": 2.391411542892456, + "loss/dist_ce": 0.0, + "loss/fcd": 1.328125, + "loss/idx": 12.0, + "loss/logits": 0.2557827830314636, + "step": 187 + }, + { + "epoch": 0.002807227116619382, + "grad_norm": 0.53125, + "grad_norm_var": 0.00547784169514974, + "learning_rate": 2e-05, + "loss": 1.2742, + "loss/crossentropy": 2.7311580181121826, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1015625, + "loss/idx": 12.0, + "loss/logits": 0.17259860038757324, + "step": 188 + }, + { + "epoch": 0.002822159175750336, + "grad_norm": 0.54296875, + "grad_norm_var": 0.0021982192993164062, + "learning_rate": 2e-05, + "loss": 1.2556, + "loss/crossentropy": 2.5843493938446045, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0859375, + "loss/idx": 12.0, + "loss/logits": 0.16964314877986908, + "step": 189 + }, + { + "epoch": 0.00283709123488129, + "grad_norm": 0.5390625, + "grad_norm_var": 0.002147865295410156, + "learning_rate": 2e-05, + "loss": 1.2337, + "loss/crossentropy": 2.5467610359191895, + "loss/dist_ce": 0.0, + "loss/fcd": 1.078125, + "loss/idx": 12.0, + "loss/logits": 0.15552528202533722, + "step": 190 + }, + { + "epoch": 0.002852023294012244, + "grad_norm": 0.5859375, + "grad_norm_var": 0.0021311442057291665, + "learning_rate": 2e-05, + "loss": 1.3968, + "loss/crossentropy": 2.381016492843628, + "loss/dist_ce": 0.0, + "loss/fcd": 1.203125, + "loss/idx": 12.0, + "loss/logits": 0.19368363916873932, + "step": 191 + }, + { + "epoch": 0.0028669553531431983, + "grad_norm": 0.55078125, + "grad_norm_var": 0.002132606506347656, + "learning_rate": 2e-05, + "loss": 1.257, + "loss/crossentropy": 2.8202872276306152, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0859375, + "loss/idx": 12.0, + "loss/logits": 0.1710858792066574, + "step": 192 + }, + { + "epoch": 0.0028818874122741524, + "grad_norm": 0.67578125, + "grad_norm_var": 0.0030318578084309895, + "learning_rate": 2e-05, + "loss": 1.4907, + "loss/crossentropy": 2.774198055267334, + "loss/dist_ce": 0.0, + "loss/fcd": 1.25, + "loss/idx": 12.0, + "loss/logits": 0.24069786071777344, + "step": 193 + }, + { + "epoch": 0.002896819471405107, + "grad_norm": 0.6015625, + "grad_norm_var": 0.0029744466145833334, + "learning_rate": 2e-05, + "loss": 1.2802, + "loss/crossentropy": 2.622847557067871, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1015625, + "loss/idx": 12.0, + "loss/logits": 0.17865772545337677, + "step": 194 + }, + { + "epoch": 0.002911751530536061, + "grad_norm": 0.515625, + "grad_norm_var": 0.0028472900390625, + "learning_rate": 2e-05, + "loss": 1.1273, + "loss/crossentropy": 2.44476056098938, + "loss/dist_ce": 0.0, + "loss/fcd": 0.9921875, + "loss/idx": 12.0, + "loss/logits": 0.13513167202472687, + "step": 195 + }, + { + "epoch": 0.002926683589667015, + "grad_norm": 0.6015625, + "grad_norm_var": 0.002904192606608073, + "learning_rate": 2e-05, + "loss": 1.4153, + "loss/crossentropy": 2.6610636711120605, + "loss/dist_ce": 0.0, + "loss/fcd": 1.203125, + "loss/idx": 12.0, + "loss/logits": 0.21217352151870728, + "step": 196 + }, + { + "epoch": 0.002941615648797969, + "grad_norm": 0.6640625, + "grad_norm_var": 0.002907053629557292, + "learning_rate": 2e-05, + "loss": 1.48, + "loss/crossentropy": 2.434468984603882, + "loss/dist_ce": 0.0, + "loss/fcd": 1.28125, + "loss/idx": 12.0, + "loss/logits": 0.19873744249343872, + "step": 197 + }, + { + "epoch": 0.0029565477079289233, + "grad_norm": 0.51171875, + "grad_norm_var": 0.0031717300415039064, + "learning_rate": 2e-05, + "loss": 1.2033, + "loss/crossentropy": 2.4664409160614014, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0390625, + "loss/idx": 12.0, + "loss/logits": 0.1642196774482727, + "step": 198 + }, + { + "epoch": 0.0029714797670598774, + "grad_norm": 0.58984375, + "grad_norm_var": 0.0031695048014322917, + "learning_rate": 2e-05, + "loss": 1.3863, + "loss/crossentropy": 2.6245970726013184, + "loss/dist_ce": 0.0, + "loss/fcd": 1.2109375, + "loss/idx": 12.0, + "loss/logits": 0.17531853914260864, + "step": 199 + }, + { + "epoch": 0.002986411826190832, + "grad_norm": 0.546875, + "grad_norm_var": 0.0032225926717122395, + "learning_rate": 2e-05, + "loss": 1.2205, + "loss/crossentropy": 2.24330735206604, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0859375, + "loss/idx": 12.0, + "loss/logits": 0.13458162546157837, + "step": 200 + }, + { + "epoch": 0.003001343885321786, + "grad_norm": 0.54296875, + "grad_norm_var": 0.0032882054646809896, + "learning_rate": 2e-05, + "loss": 1.2716, + "loss/crossentropy": 2.551177978515625, + "loss/dist_ce": 0.0, + "loss/fcd": 1.09375, + "loss/idx": 12.0, + "loss/logits": 0.17783880233764648, + "step": 201 + }, + { + "epoch": 0.00301627594445274, + "grad_norm": 0.53515625, + "grad_norm_var": 0.0031575520833333334, + "learning_rate": 2e-05, + "loss": 1.1211, + "loss/crossentropy": 2.748706102371216, + "loss/dist_ce": 0.0, + "loss/fcd": 0.984375, + "loss/idx": 12.0, + "loss/logits": 0.13667932152748108, + "step": 202 + }, + { + "epoch": 0.003031208003583694, + "grad_norm": 0.55078125, + "grad_norm_var": 0.002357737223307292, + "learning_rate": 2e-05, + "loss": 1.2637, + "loss/crossentropy": 2.560105800628662, + "loss/dist_ce": 0.0, + "loss/fcd": 1.109375, + "loss/idx": 12.0, + "loss/logits": 0.15430858731269836, + "step": 203 + }, + { + "epoch": 0.0030461400627146483, + "grad_norm": 0.53125, + "grad_norm_var": 0.002357737223307292, + "learning_rate": 2e-05, + "loss": 1.2106, + "loss/crossentropy": 2.519240140914917, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0546875, + "loss/idx": 12.0, + "loss/logits": 0.15595705807209015, + "step": 204 + }, + { + "epoch": 0.0030610721218456024, + "grad_norm": 0.5625, + "grad_norm_var": 0.002316729227701823, + "learning_rate": 2e-05, + "loss": 1.2509, + "loss/crossentropy": 2.8139536380767822, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0859375, + "loss/idx": 12.0, + "loss/logits": 0.1649806946516037, + "step": 205 + }, + { + "epoch": 0.0030760041809765565, + "grad_norm": 0.5234375, + "grad_norm_var": 0.002394549051920573, + "learning_rate": 2e-05, + "loss": 1.2708, + "loss/crossentropy": 2.5938565731048584, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1015625, + "loss/idx": 12.0, + "loss/logits": 0.16919182240962982, + "step": 206 + }, + { + "epoch": 0.003090936240107511, + "grad_norm": 0.59375, + "grad_norm_var": 0.0024169286092122397, + "learning_rate": 2e-05, + "loss": 1.3906, + "loss/crossentropy": 2.7333359718322754, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1875, + "loss/idx": 12.0, + "loss/logits": 0.20309199392795563, + "step": 207 + }, + { + "epoch": 0.003105868299238465, + "grad_norm": 0.5625, + "grad_norm_var": 0.0023976643880208332, + "learning_rate": 2e-05, + "loss": 1.2733, + "loss/crossentropy": 2.491389513015747, + "loss/dist_ce": 0.0, + "loss/fcd": 1.09375, + "loss/idx": 12.0, + "loss/logits": 0.17951218783855438, + "step": 208 + }, + { + "epoch": 0.003120800358369419, + "grad_norm": 0.55859375, + "grad_norm_var": 0.0015927632649739584, + "learning_rate": 2e-05, + "loss": 1.2968, + "loss/crossentropy": 2.423560380935669, + "loss/dist_ce": 0.0, + "loss/fcd": 1.125, + "loss/idx": 12.0, + "loss/logits": 0.17177289724349976, + "step": 209 + }, + { + "epoch": 0.0031357324175003733, + "grad_norm": 0.53515625, + "grad_norm_var": 0.0015181859334309896, + "learning_rate": 2e-05, + "loss": 1.2264, + "loss/crossentropy": 2.598179817199707, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0703125, + "loss/idx": 12.0, + "loss/logits": 0.15610140562057495, + "step": 210 + }, + { + "epoch": 0.0031506644766313274, + "grad_norm": 0.8828125, + "grad_norm_var": 0.007877031962076822, + "learning_rate": 2e-05, + "loss": 1.3882, + "loss/crossentropy": 2.521733045578003, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1875, + "loss/idx": 12.0, + "loss/logits": 0.2007344365119934, + "step": 211 + }, + { + "epoch": 0.0031655965357622814, + "grad_norm": 0.5625, + "grad_norm_var": 0.00786431630452474, + "learning_rate": 2e-05, + "loss": 1.2254, + "loss/crossentropy": 2.67375111579895, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0625, + "loss/idx": 12.0, + "loss/logits": 0.16288092732429504, + "step": 212 + }, + { + "epoch": 0.003180528594893236, + "grad_norm": 0.56640625, + "grad_norm_var": 0.007344563802083333, + "learning_rate": 2e-05, + "loss": 1.2893, + "loss/crossentropy": 2.398942470550537, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1171875, + "loss/idx": 12.0, + "loss/logits": 0.17206481099128723, + "step": 213 + }, + { + "epoch": 0.00319546065402419, + "grad_norm": 0.57421875, + "grad_norm_var": 0.0070841471354166664, + "learning_rate": 2e-05, + "loss": 1.2304, + "loss/crossentropy": 2.648841142654419, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0703125, + "loss/idx": 12.0, + "loss/logits": 0.16006574034690857, + "step": 214 + }, + { + "epoch": 0.003210392713155144, + "grad_norm": 0.60546875, + "grad_norm_var": 0.007127888997395833, + "learning_rate": 2e-05, + "loss": 1.3215, + "loss/crossentropy": 2.5586395263671875, + "loss/dist_ce": 0.0, + "loss/fcd": 1.140625, + "loss/idx": 12.0, + "loss/logits": 0.18091149628162384, + "step": 215 + }, + { + "epoch": 0.0032253247722860983, + "grad_norm": 0.828125, + "grad_norm_var": 0.010936482747395834, + "learning_rate": 2e-05, + "loss": 1.4868, + "loss/crossentropy": 2.3754496574401855, + "loss/dist_ce": 0.0, + "loss/fcd": 1.28125, + "loss/idx": 12.0, + "loss/logits": 0.20558351278305054, + "step": 216 + }, + { + "epoch": 0.0032402568314170523, + "grad_norm": 0.57421875, + "grad_norm_var": 0.0107818603515625, + "learning_rate": 2e-05, + "loss": 1.2598, + "loss/crossentropy": 2.624929189682007, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1015625, + "loss/idx": 12.0, + "loss/logits": 0.15819445252418518, + "step": 217 + }, + { + "epoch": 0.0032551888905480064, + "grad_norm": 0.55859375, + "grad_norm_var": 0.010623931884765625, + "learning_rate": 2e-05, + "loss": 1.3027, + "loss/crossentropy": 2.589963674545288, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1328125, + "loss/idx": 12.0, + "loss/logits": 0.16989606618881226, + "step": 218 + }, + { + "epoch": 0.0032701209496789605, + "grad_norm": 0.58203125, + "grad_norm_var": 0.010487620035807292, + "learning_rate": 2e-05, + "loss": 1.3079, + "loss/crossentropy": 2.4396564960479736, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1328125, + "loss/idx": 12.0, + "loss/logits": 0.17511269450187683, + "step": 219 + }, + { + "epoch": 0.003285053008809915, + "grad_norm": 0.59375, + "grad_norm_var": 0.010158030192057292, + "learning_rate": 2e-05, + "loss": 1.3446, + "loss/crossentropy": 2.4202687740325928, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1640625, + "loss/idx": 12.0, + "loss/logits": 0.1805376559495926, + "step": 220 + }, + { + "epoch": 0.003299985067940869, + "grad_norm": 0.5234375, + "grad_norm_var": 0.010469563802083333, + "learning_rate": 2e-05, + "loss": 1.1811, + "loss/crossentropy": 2.558259963989258, + "loss/dist_ce": 0.0, + "loss/fcd": 1.03125, + "loss/idx": 12.0, + "loss/logits": 0.14983907341957092, + "step": 221 + }, + { + "epoch": 0.0033149171270718232, + "grad_norm": 0.5234375, + "grad_norm_var": 0.010469563802083333, + "learning_rate": 2e-05, + "loss": 1.2646, + "loss/crossentropy": 2.6712141036987305, + "loss/dist_ce": 0.0, + "loss/fcd": 1.09375, + "loss/idx": 12.0, + "loss/logits": 0.1708478480577469, + "step": 222 + }, + { + "epoch": 0.0033298491862027773, + "grad_norm": 0.51171875, + "grad_norm_var": 0.01097558339436849, + "learning_rate": 2e-05, + "loss": 1.2885, + "loss/crossentropy": 2.7163174152374268, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1171875, + "loss/idx": 12.0, + "loss/logits": 0.17126557230949402, + "step": 223 + }, + { + "epoch": 0.0033447812453337314, + "grad_norm": 0.5703125, + "grad_norm_var": 0.010944048563639322, + "learning_rate": 2e-05, + "loss": 1.2871, + "loss/crossentropy": 2.553407907485962, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1171875, + "loss/idx": 12.0, + "loss/logits": 0.1699165552854538, + "step": 224 + }, + { + "epoch": 0.0033597133044646855, + "grad_norm": 0.54296875, + "grad_norm_var": 0.011039161682128906, + "learning_rate": 2e-05, + "loss": 1.2835, + "loss/crossentropy": 2.4799208641052246, + "loss/dist_ce": 0.0, + "loss/fcd": 1.125, + "loss/idx": 12.0, + "loss/logits": 0.15845397114753723, + "step": 225 + }, + { + "epoch": 0.00337464536359564, + "grad_norm": 0.54296875, + "grad_norm_var": 0.010979652404785156, + "learning_rate": 2e-05, + "loss": 1.2787, + "loss/crossentropy": 2.745670795440674, + "loss/dist_ce": 0.0, + "loss/fcd": 1.109375, + "loss/idx": 12.0, + "loss/logits": 0.16931718587875366, + "step": 226 + }, + { + "epoch": 0.003389577422726594, + "grad_norm": 0.60546875, + "grad_norm_var": 0.0051971435546875, + "learning_rate": 2e-05, + "loss": 1.3744, + "loss/crossentropy": 2.385148286819458, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1640625, + "loss/idx": 12.0, + "loss/logits": 0.21031928062438965, + "step": 227 + }, + { + "epoch": 0.0034045094818575482, + "grad_norm": 0.53515625, + "grad_norm_var": 0.005304400126139323, + "learning_rate": 2e-05, + "loss": 1.3196, + "loss/crossentropy": 2.319401264190674, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1328125, + "loss/idx": 12.0, + "loss/logits": 0.1868358552455902, + "step": 228 + }, + { + "epoch": 0.0034194415409885023, + "grad_norm": 0.5859375, + "grad_norm_var": 0.005299631754557292, + "learning_rate": 2e-05, + "loss": 1.3091, + "loss/crossentropy": 2.3205957412719727, + "loss/dist_ce": 0.0, + "loss/fcd": 1.140625, + "loss/idx": 12.0, + "loss/logits": 0.16845813393592834, + "step": 229 + }, + { + "epoch": 0.0034343736001194564, + "grad_norm": 0.62109375, + "grad_norm_var": 0.0054094950358072914, + "learning_rate": 2e-05, + "loss": 1.4095, + "loss/crossentropy": 2.571542978286743, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1953125, + "loss/idx": 12.0, + "loss/logits": 0.21414990723133087, + "step": 230 + }, + { + "epoch": 0.0034493056592504105, + "grad_norm": 0.57421875, + "grad_norm_var": 0.0053708394368489586, + "learning_rate": 2e-05, + "loss": 1.2948, + "loss/crossentropy": 2.497636556625366, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1171875, + "loss/idx": 12.0, + "loss/logits": 0.1775931417942047, + "step": 231 + }, + { + "epoch": 0.0034642377183813646, + "grad_norm": 0.48828125, + "grad_norm_var": 0.0013274510701497396, + "learning_rate": 2e-05, + "loss": 1.1676, + "loss/crossentropy": 2.585477352142334, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0234375, + "loss/idx": 12.0, + "loss/logits": 0.14414075016975403, + "step": 232 + }, + { + "epoch": 0.003479169777512319, + "grad_norm": 0.70703125, + "grad_norm_var": 0.002710914611816406, + "learning_rate": 2e-05, + "loss": 1.3074, + "loss/crossentropy": 2.792330503463745, + "loss/dist_ce": 0.0, + "loss/fcd": 1.140625, + "loss/idx": 12.0, + "loss/logits": 0.16680563986301422, + "step": 233 + }, + { + "epoch": 0.003494101836643273, + "grad_norm": 1.015625, + "grad_norm_var": 0.015274810791015624, + "learning_rate": 2e-05, + "loss": 1.3413, + "loss/crossentropy": 3.3043012619018555, + "loss/dist_ce": 0.0, + "loss/fcd": 1.203125, + "loss/idx": 12.0, + "loss/logits": 0.1381298005580902, + "step": 234 + }, + { + "epoch": 0.0035090338957742273, + "grad_norm": 0.58203125, + "grad_norm_var": 0.015274810791015624, + "learning_rate": 2e-05, + "loss": 1.2644, + "loss/crossentropy": 2.5281269550323486, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1015625, + "loss/idx": 12.0, + "loss/logits": 0.16281655430793762, + "step": 235 + }, + { + "epoch": 0.0035239659549051814, + "grad_norm": 0.53515625, + "grad_norm_var": 0.015500831604003906, + "learning_rate": 2e-05, + "loss": 1.2285, + "loss/crossentropy": 2.3544559478759766, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0703125, + "loss/idx": 12.0, + "loss/logits": 0.1581532061100006, + "step": 236 + }, + { + "epoch": 0.0035388980140361355, + "grad_norm": 1.46875, + "grad_norm_var": 0.0627664566040039, + "learning_rate": 2e-05, + "loss": 1.5838, + "loss/crossentropy": 2.6616692543029785, + "loss/dist_ce": 0.0, + "loss/fcd": 1.40625, + "loss/idx": 12.0, + "loss/logits": 0.17750610411167145, + "step": 237 + }, + { + "epoch": 0.0035538300731670896, + "grad_norm": 0.59375, + "grad_norm_var": 0.061882972717285156, + "learning_rate": 2e-05, + "loss": 1.4213, + "loss/crossentropy": 2.2179884910583496, + "loss/dist_ce": 0.0, + "loss/fcd": 1.2265625, + "loss/idx": 12.0, + "loss/logits": 0.1947222650051117, + "step": 238 + }, + { + "epoch": 0.003568762132298044, + "grad_norm": 0.48828125, + "grad_norm_var": 0.062365150451660155, + "learning_rate": 2e-05, + "loss": 1.1698, + "loss/crossentropy": 2.5931408405303955, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0234375, + "loss/idx": 12.0, + "loss/logits": 0.14633141458034515, + "step": 239 + }, + { + "epoch": 0.003583694191428998, + "grad_norm": 0.80859375, + "grad_norm_var": 0.06326878865559896, + "learning_rate": 2e-05, + "loss": 1.6428, + "loss/crossentropy": 2.3654534816741943, + "loss/dist_ce": 0.0, + "loss/fcd": 1.359375, + "loss/idx": 12.0, + "loss/logits": 0.28339850902557373, + "step": 240 + }, + { + "epoch": 0.0035986262505599523, + "grad_norm": 0.55078125, + "grad_norm_var": 0.06314188639322917, + "learning_rate": 2e-05, + "loss": 1.2658, + "loss/crossentropy": 2.5046465396881104, + "loss/dist_ce": 0.0, + "loss/fcd": 1.09375, + "loss/idx": 12.0, + "loss/logits": 0.1720809042453766, + "step": 241 + }, + { + "epoch": 0.0036135583096909064, + "grad_norm": 0.54296875, + "grad_norm_var": 0.06314188639322917, + "learning_rate": 2e-05, + "loss": 1.3045, + "loss/crossentropy": 2.845069646835327, + "loss/dist_ce": 0.0, + "loss/fcd": 1.125, + "loss/idx": 12.0, + "loss/logits": 0.17947256565093994, + "step": 242 + }, + { + "epoch": 0.0036284903688218605, + "grad_norm": 0.52734375, + "grad_norm_var": 0.0641845703125, + "learning_rate": 2e-05, + "loss": 1.2002, + "loss/crossentropy": 2.601872682571411, + "loss/dist_ce": 0.0, + "loss/fcd": 1.046875, + "loss/idx": 12.0, + "loss/logits": 0.15330010652542114, + "step": 243 + }, + { + "epoch": 0.0036434224279528146, + "grad_norm": 0.7734375, + "grad_norm_var": 0.06363773345947266, + "learning_rate": 2e-05, + "loss": 1.316, + "loss/crossentropy": 2.63727068901062, + "loss/dist_ce": 0.0, + "loss/fcd": 1.125, + "loss/idx": 12.0, + "loss/logits": 0.19103199243545532, + "step": 244 + }, + { + "epoch": 0.0036583544870837687, + "grad_norm": 0.53515625, + "grad_norm_var": 0.0644287109375, + "learning_rate": 2e-05, + "loss": 1.2223, + "loss/crossentropy": 2.713804006576538, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0546875, + "loss/idx": 12.0, + "loss/logits": 0.16758683323860168, + "step": 245 + }, + { + "epoch": 0.003673286546214723, + "grad_norm": 0.5859375, + "grad_norm_var": 0.0647623062133789, + "learning_rate": 2e-05, + "loss": 1.2487, + "loss/crossentropy": 2.487891674041748, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0703125, + "loss/idx": 12.0, + "loss/logits": 0.1783592849969864, + "step": 246 + }, + { + "epoch": 0.0036882186053456773, + "grad_norm": 0.546875, + "grad_norm_var": 0.06517130533854167, + "learning_rate": 2e-05, + "loss": 1.2519, + "loss/crossentropy": 2.4887685775756836, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0859375, + "loss/idx": 12.0, + "loss/logits": 0.16592524945735931, + "step": 247 + }, + { + "epoch": 0.0037031506644766314, + "grad_norm": 0.63671875, + "grad_norm_var": 0.06291478474934896, + "learning_rate": 2e-05, + "loss": 1.3709, + "loss/crossentropy": 2.388550043106079, + "loss/dist_ce": 0.0, + "loss/fcd": 1.171875, + "loss/idx": 12.0, + "loss/logits": 0.19902655482292175, + "step": 248 + }, + { + "epoch": 0.0037180827236075855, + "grad_norm": 0.51953125, + "grad_norm_var": 0.06446507771809896, + "learning_rate": 2e-05, + "loss": 1.1646, + "loss/crossentropy": 2.6850759983062744, + "loss/dist_ce": 0.0, + "loss/fcd": 1.015625, + "loss/idx": 12.0, + "loss/logits": 0.14900657534599304, + "step": 249 + }, + { + "epoch": 0.0037330147827385396, + "grad_norm": 0.55859375, + "grad_norm_var": 0.056423886617024736, + "learning_rate": 2e-05, + "loss": 1.2667, + "loss/crossentropy": 2.3784635066986084, + "loss/dist_ce": 0.0, + "loss/fcd": 1.109375, + "loss/idx": 12.0, + "loss/logits": 0.1572844386100769, + "step": 250 + }, + { + "epoch": 0.0037479468418694937, + "grad_norm": 0.6171875, + "grad_norm_var": 0.05622533162434896, + "learning_rate": 2e-05, + "loss": 1.3129, + "loss/crossentropy": 2.604210376739502, + "loss/dist_ce": 0.0, + "loss/fcd": 1.140625, + "loss/idx": 12.0, + "loss/logits": 0.17225471138954163, + "step": 251 + }, + { + "epoch": 0.003762878901000448, + "grad_norm": 0.51953125, + "grad_norm_var": 0.056465403238932295, + "learning_rate": 2e-05, + "loss": 1.235, + "loss/crossentropy": 2.756725549697876, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0703125, + "loss/idx": 12.0, + "loss/logits": 0.16468365490436554, + "step": 252 + }, + { + "epoch": 0.0037778109601314023, + "grad_norm": 0.52734375, + "grad_norm_var": 0.008092689514160156, + "learning_rate": 2e-05, + "loss": 1.3165, + "loss/crossentropy": 2.769130229949951, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1328125, + "loss/idx": 12.0, + "loss/logits": 0.183657705783844, + "step": 253 + }, + { + "epoch": 0.0037927430192623564, + "grad_norm": 0.5, + "grad_norm_var": 0.008510780334472657, + "learning_rate": 2e-05, + "loss": 1.2422, + "loss/crossentropy": 2.54736065864563, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0625, + "loss/idx": 12.0, + "loss/logits": 0.17974743247032166, + "step": 254 + }, + { + "epoch": 0.0038076750783933105, + "grad_norm": 0.4921875, + "grad_norm_var": 0.008465321858723958, + "learning_rate": 2e-05, + "loss": 1.2686, + "loss/crossentropy": 2.463906764984131, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1015625, + "loss/idx": 12.0, + "loss/logits": 0.16703197360038757, + "step": 255 + }, + { + "epoch": 0.0038226071375242645, + "grad_norm": 0.478515625, + "grad_norm_var": 0.0051102797190348305, + "learning_rate": 2e-05, + "loss": 1.0878, + "loss/crossentropy": 2.503507614135742, + "loss/dist_ce": 0.0, + "loss/fcd": 0.95703125, + "loss/idx": 12.0, + "loss/logits": 0.13080117106437683, + "step": 256 + }, + { + "epoch": 0.0038375391966552186, + "grad_norm": 0.609375, + "grad_norm_var": 0.005276219050089518, + "learning_rate": 2e-05, + "loss": 1.3397, + "loss/crossentropy": 2.539508819580078, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1640625, + "loss/idx": 12.0, + "loss/logits": 0.17561593651771545, + "step": 257 + }, + { + "epoch": 0.0038524712557861727, + "grad_norm": 0.71484375, + "grad_norm_var": 0.00671690305074056, + "learning_rate": 2e-05, + "loss": 1.3383, + "loss/crossentropy": 2.8462297916412354, + "loss/dist_ce": 0.0, + "loss/fcd": 1.140625, + "loss/idx": 12.0, + "loss/logits": 0.1976788341999054, + "step": 258 + }, + { + "epoch": 0.0038674033149171273, + "grad_norm": 0.53125, + "grad_norm_var": 0.006694904963175456, + "learning_rate": 2e-05, + "loss": 1.2151, + "loss/crossentropy": 2.5267205238342285, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0625, + "loss/idx": 12.0, + "loss/logits": 0.15262790024280548, + "step": 259 + }, + { + "epoch": 0.0038823353740480814, + "grad_norm": 0.58203125, + "grad_norm_var": 0.003835026423136393, + "learning_rate": 2e-05, + "loss": 1.2916, + "loss/crossentropy": 2.566385269165039, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1171875, + "loss/idx": 12.0, + "loss/logits": 0.17439204454421997, + "step": 260 + }, + { + "epoch": 0.0038972674331790354, + "grad_norm": 0.55078125, + "grad_norm_var": 0.0037991682688395183, + "learning_rate": 2e-05, + "loss": 1.2338, + "loss/crossentropy": 2.4941866397857666, + "loss/dist_ce": 0.0, + "loss/fcd": 1.078125, + "loss/idx": 12.0, + "loss/logits": 0.15566781163215637, + "step": 261 + }, + { + "epoch": 0.0039121994923099895, + "grad_norm": 0.671875, + "grad_norm_var": 0.004550282160441081, + "learning_rate": 2e-05, + "loss": 1.3631, + "loss/crossentropy": 2.345531702041626, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1875, + "loss/idx": 12.0, + "loss/logits": 0.17564262449741364, + "step": 262 + }, + { + "epoch": 0.003927131551440944, + "grad_norm": 0.51171875, + "grad_norm_var": 0.0047173659006754555, + "learning_rate": 2e-05, + "loss": 1.2948, + "loss/crossentropy": 2.719332695007324, + "loss/dist_ce": 0.0, + "loss/fcd": 1.125, + "loss/idx": 12.0, + "loss/logits": 0.1697680652141571, + "step": 263 + }, + { + "epoch": 0.003942063610571898, + "grad_norm": 0.5078125, + "grad_norm_var": 0.004503361384073893, + "learning_rate": 2e-05, + "loss": 1.2181, + "loss/crossentropy": 2.4483582973480225, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0546875, + "loss/idx": 12.0, + "loss/logits": 0.1634569764137268, + "step": 264 + }, + { + "epoch": 0.003956995669702852, + "grad_norm": 0.55078125, + "grad_norm_var": 0.004413334528605143, + "learning_rate": 2e-05, + "loss": 1.3577, + "loss/crossentropy": 2.466128349304199, + "loss/dist_ce": 0.0, + "loss/fcd": 1.15625, + "loss/idx": 12.0, + "loss/logits": 0.20145492255687714, + "step": 265 + }, + { + "epoch": 0.003971927728833806, + "grad_norm": 0.474609375, + "grad_norm_var": 0.00484460194905599, + "learning_rate": 2e-05, + "loss": 1.1326, + "loss/crossentropy": 2.559739351272583, + "loss/dist_ce": 0.0, + "loss/fcd": 0.99609375, + "loss/idx": 12.0, + "loss/logits": 0.1364823430776596, + "step": 266 + }, + { + "epoch": 0.00398685978796476, + "grad_norm": 0.6015625, + "grad_norm_var": 0.004725074768066407, + "learning_rate": 2e-05, + "loss": 1.3285, + "loss/crossentropy": 2.488020896911621, + "loss/dist_ce": 0.0, + "loss/fcd": 1.140625, + "loss/idx": 12.0, + "loss/logits": 0.18785008788108826, + "step": 267 + }, + { + "epoch": 0.004001791847095714, + "grad_norm": 0.5390625, + "grad_norm_var": 0.0046656290690104164, + "learning_rate": 2e-05, + "loss": 1.3112, + "loss/crossentropy": 2.3776917457580566, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1328125, + "loss/idx": 12.0, + "loss/logits": 0.17841312289237976, + "step": 268 + }, + { + "epoch": 0.004016723906226669, + "grad_norm": 0.70703125, + "grad_norm_var": 0.006075286865234375, + "learning_rate": 2e-05, + "loss": 1.4075, + "loss/crossentropy": 2.6192870140075684, + "loss/dist_ce": 0.0, + "loss/fcd": 1.2265625, + "loss/idx": 12.0, + "loss/logits": 0.18098263442516327, + "step": 269 + }, + { + "epoch": 0.004031655965357623, + "grad_norm": 0.5859375, + "grad_norm_var": 0.005803934733072917, + "learning_rate": 2e-05, + "loss": 1.2378, + "loss/crossentropy": 2.7433526515960693, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0859375, + "loss/idx": 12.0, + "loss/logits": 0.15190985798835754, + "step": 270 + }, + { + "epoch": 0.004046588024488577, + "grad_norm": 0.515625, + "grad_norm_var": 0.005597178141276042, + "learning_rate": 2e-05, + "loss": 1.2776, + "loss/crossentropy": 2.572636842727661, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1015625, + "loss/idx": 12.0, + "loss/logits": 0.17604324221611023, + "step": 271 + }, + { + "epoch": 0.004061520083619531, + "grad_norm": 0.55078125, + "grad_norm_var": 0.0050343672434488935, + "learning_rate": 2e-05, + "loss": 1.3095, + "loss/crossentropy": 2.5277881622314453, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1328125, + "loss/idx": 12.0, + "loss/logits": 0.17667779326438904, + "step": 272 + }, + { + "epoch": 0.004076452142750485, + "grad_norm": 0.54296875, + "grad_norm_var": 0.005008427302042643, + "learning_rate": 2e-05, + "loss": 1.2538, + "loss/crossentropy": 2.541118621826172, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0859375, + "loss/idx": 12.0, + "loss/logits": 0.1678951382637024, + "step": 273 + }, + { + "epoch": 0.0040913842018814395, + "grad_norm": 0.51171875, + "grad_norm_var": 0.0036959171295166014, + "learning_rate": 2e-05, + "loss": 1.205, + "loss/crossentropy": 2.4771127700805664, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0390625, + "loss/idx": 12.0, + "loss/logits": 0.16591356694698334, + "step": 274 + }, + { + "epoch": 0.004106316261012394, + "grad_norm": 0.52734375, + "grad_norm_var": 0.0037110487620035807, + "learning_rate": 2e-05, + "loss": 1.2732, + "loss/crossentropy": 2.7245373725891113, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1015625, + "loss/idx": 12.0, + "loss/logits": 0.1716119647026062, + "step": 275 + }, + { + "epoch": 0.004121248320143348, + "grad_norm": 0.62109375, + "grad_norm_var": 0.003930393854777018, + "learning_rate": 2e-05, + "loss": 1.3769, + "loss/crossentropy": 2.581962823867798, + "loss/dist_ce": 0.0, + "loss/fcd": 1.203125, + "loss/idx": 12.0, + "loss/logits": 0.17378033697605133, + "step": 276 + }, + { + "epoch": 0.004136180379274302, + "grad_norm": 0.51953125, + "grad_norm_var": 0.004032627741495768, + "learning_rate": 2e-05, + "loss": 1.203, + "loss/crossentropy": 2.607046604156494, + "loss/dist_ce": 0.0, + "loss/fcd": 1.046875, + "loss/idx": 12.0, + "loss/logits": 0.15608051419258118, + "step": 277 + }, + { + "epoch": 0.004151112438405256, + "grad_norm": 0.515625, + "grad_norm_var": 0.0032010237375895184, + "learning_rate": 2e-05, + "loss": 1.1885, + "loss/crossentropy": 2.6051206588745117, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0390625, + "loss/idx": 12.0, + "loss/logits": 0.1494494080543518, + "step": 278 + }, + { + "epoch": 0.00416604449753621, + "grad_norm": 0.6015625, + "grad_norm_var": 0.0032595157623291015, + "learning_rate": 2e-05, + "loss": 1.3056, + "loss/crossentropy": 2.5447115898132324, + "loss/dist_ce": 0.0, + "loss/fcd": 1.125, + "loss/idx": 12.0, + "loss/logits": 0.18058553338050842, + "step": 279 + }, + { + "epoch": 0.004180976556667164, + "grad_norm": 0.99609375, + "grad_norm_var": 0.015116866429646809, + "learning_rate": 2e-05, + "loss": 1.4477, + "loss/crossentropy": 2.5050199031829834, + "loss/dist_ce": 0.0, + "loss/fcd": 1.28125, + "loss/idx": 12.0, + "loss/logits": 0.1664591133594513, + "step": 280 + }, + { + "epoch": 0.004195908615798118, + "grad_norm": 0.52734375, + "grad_norm_var": 0.015258391698201498, + "learning_rate": 2e-05, + "loss": 1.2369, + "loss/crossentropy": 2.526259422302246, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0703125, + "loss/idx": 12.0, + "loss/logits": 0.1665583997964859, + "step": 281 + }, + { + "epoch": 0.004210840674929073, + "grad_norm": 0.5234375, + "grad_norm_var": 0.014697710673014322, + "learning_rate": 2e-05, + "loss": 1.2211, + "loss/crossentropy": 2.4373950958251953, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0703125, + "loss/idx": 12.0, + "loss/logits": 0.15083222091197968, + "step": 282 + }, + { + "epoch": 0.004225772734060027, + "grad_norm": 1.9375, + "grad_norm_var": 0.12889601389567057, + "learning_rate": 2e-05, + "loss": 1.5671, + "loss/crossentropy": 2.5966246128082275, + "loss/dist_ce": 0.0, + "loss/fcd": 1.34375, + "loss/idx": 12.0, + "loss/logits": 0.2233429104089737, + "step": 283 + }, + { + "epoch": 0.004240704793190981, + "grad_norm": 0.5078125, + "grad_norm_var": 0.12950331370035809, + "learning_rate": 2e-05, + "loss": 1.1653, + "loss/crossentropy": 2.717890739440918, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0234375, + "loss/idx": 12.0, + "loss/logits": 0.14189787209033966, + "step": 284 + }, + { + "epoch": 0.004255636852321935, + "grad_norm": 0.5625, + "grad_norm_var": 0.13006083170572916, + "learning_rate": 2e-05, + "loss": 1.2765, + "loss/crossentropy": 2.6981263160705566, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1015625, + "loss/idx": 12.0, + "loss/logits": 0.17491918802261353, + "step": 285 + }, + { + "epoch": 0.0042705689114528895, + "grad_norm": 0.515625, + "grad_norm_var": 0.13105646769205728, + "learning_rate": 2e-05, + "loss": 1.3008, + "loss/crossentropy": 2.629808187484741, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1328125, + "loss/idx": 12.0, + "loss/logits": 0.16800275444984436, + "step": 286 + }, + { + "epoch": 0.004285500970583844, + "grad_norm": 0.55078125, + "grad_norm_var": 0.1304814020792643, + "learning_rate": 2e-05, + "loss": 1.2972, + "loss/crossentropy": 2.7882864475250244, + "loss/dist_ce": 0.0, + "loss/fcd": 1.109375, + "loss/idx": 12.0, + "loss/logits": 0.1878424435853958, + "step": 287 + }, + { + "epoch": 0.004300433029714798, + "grad_norm": 0.51953125, + "grad_norm_var": 0.1309849421183268, + "learning_rate": 2e-05, + "loss": 1.2366, + "loss/crossentropy": 2.4925997257232666, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0703125, + "loss/idx": 12.0, + "loss/logits": 0.16624397039413452, + "step": 288 + }, + { + "epoch": 0.004315365088845752, + "grad_norm": 0.5625, + "grad_norm_var": 0.13071695963541666, + "learning_rate": 2e-05, + "loss": 1.2994, + "loss/crossentropy": 2.804802417755127, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1171875, + "loss/idx": 12.0, + "loss/logits": 0.18225803971290588, + "step": 289 + }, + { + "epoch": 0.004330297147976706, + "grad_norm": 0.515625, + "grad_norm_var": 0.13064263661702474, + "learning_rate": 2e-05, + "loss": 1.2799, + "loss/crossentropy": 2.5255799293518066, + "loss/dist_ce": 0.0, + "loss/fcd": 1.09375, + "loss/idx": 12.0, + "loss/logits": 0.1861756294965744, + "step": 290 + }, + { + "epoch": 0.00434522920710766, + "grad_norm": 0.55859375, + "grad_norm_var": 0.13016554514567058, + "learning_rate": 2e-05, + "loss": 1.3202, + "loss/crossentropy": 2.419299602508545, + "loss/dist_ce": 0.0, + "loss/fcd": 1.140625, + "loss/idx": 12.0, + "loss/logits": 0.179526686668396, + "step": 291 + }, + { + "epoch": 0.004360161266238614, + "grad_norm": 0.490234375, + "grad_norm_var": 0.13188754717508952, + "learning_rate": 2e-05, + "loss": 1.1478, + "loss/crossentropy": 2.4527664184570312, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0078125, + "loss/idx": 12.0, + "loss/logits": 0.14002804458141327, + "step": 292 + }, + { + "epoch": 0.004375093325369568, + "grad_norm": 0.58984375, + "grad_norm_var": 0.13097087542215982, + "learning_rate": 2e-05, + "loss": 1.334, + "loss/crossentropy": 2.6172571182250977, + "loss/dist_ce": 0.0, + "loss/fcd": 1.15625, + "loss/idx": 12.0, + "loss/logits": 0.17773565649986267, + "step": 293 + }, + { + "epoch": 0.004390025384500522, + "grad_norm": 0.578125, + "grad_norm_var": 0.130056365331014, + "learning_rate": 2e-05, + "loss": 1.2928, + "loss/crossentropy": 2.644300699234009, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1171875, + "loss/idx": 12.0, + "loss/logits": 0.17558696866035461, + "step": 294 + }, + { + "epoch": 0.004404957443631477, + "grad_norm": 0.52734375, + "grad_norm_var": 0.13096477190653483, + "learning_rate": 2e-05, + "loss": 1.2111, + "loss/crossentropy": 2.701231002807617, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0703125, + "loss/idx": 12.0, + "loss/logits": 0.14079666137695312, + "step": 295 + }, + { + "epoch": 0.004419889502762431, + "grad_norm": 0.498046875, + "grad_norm_var": 0.12374617258707682, + "learning_rate": 2e-05, + "loss": 1.1536, + "loss/crossentropy": 2.5944371223449707, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0078125, + "loss/idx": 12.0, + "loss/logits": 0.14583569765090942, + "step": 296 + }, + { + "epoch": 0.004434821561893385, + "grad_norm": 1.2265625, + "grad_norm_var": 0.14540328979492187, + "learning_rate": 2e-05, + "loss": 1.3935, + "loss/crossentropy": 2.704197883605957, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1875, + "loss/idx": 12.0, + "loss/logits": 0.205990731716156, + "step": 297 + }, + { + "epoch": 0.0044497536210243395, + "grad_norm": 0.5859375, + "grad_norm_var": 0.14445521036783854, + "learning_rate": 2e-05, + "loss": 1.3361, + "loss/crossentropy": 2.60979962348938, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1484375, + "loss/idx": 12.0, + "loss/logits": 0.18767428398132324, + "step": 298 + }, + { + "epoch": 0.0044646856801552936, + "grad_norm": 0.52734375, + "grad_norm_var": 0.030499712626139323, + "learning_rate": 2e-05, + "loss": 1.3711, + "loss/crossentropy": 2.3737831115722656, + "loss/dist_ce": 0.0, + "loss/fcd": 1.171875, + "loss/idx": 12.0, + "loss/logits": 0.1992519050836563, + "step": 299 + }, + { + "epoch": 0.004479617739286248, + "grad_norm": 0.5625, + "grad_norm_var": 0.03014367421468099, + "learning_rate": 2e-05, + "loss": 1.3883, + "loss/crossentropy": 2.3658077716827393, + "loss/dist_ce": 0.0, + "loss/fcd": 1.203125, + "loss/idx": 12.0, + "loss/logits": 0.18512727320194244, + "step": 300 + }, + { + "epoch": 0.004494549798417202, + "grad_norm": 0.5234375, + "grad_norm_var": 0.030359840393066405, + "learning_rate": 2e-05, + "loss": 1.2623, + "loss/crossentropy": 2.541961431503296, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0859375, + "loss/idx": 12.0, + "loss/logits": 0.1764090359210968, + "step": 301 + }, + { + "epoch": 0.004509481857548156, + "grad_norm": 0.55859375, + "grad_norm_var": 0.030087788899739582, + "learning_rate": 2e-05, + "loss": 1.2756, + "loss/crossentropy": 2.721057891845703, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1015625, + "loss/idx": 12.0, + "loss/logits": 0.17400358617305756, + "step": 302 + }, + { + "epoch": 0.00452441391667911, + "grad_norm": 0.61328125, + "grad_norm_var": 0.030038960774739585, + "learning_rate": 2e-05, + "loss": 1.3062, + "loss/crossentropy": 2.519401788711548, + "loss/dist_ce": 0.0, + "loss/fcd": 1.140625, + "loss/idx": 12.0, + "loss/logits": 0.16561126708984375, + "step": 303 + }, + { + "epoch": 0.004539345975810064, + "grad_norm": 0.640625, + "grad_norm_var": 0.02982018788655599, + "learning_rate": 2e-05, + "loss": 1.2715, + "loss/crossentropy": 2.8109989166259766, + "loss/dist_ce": 0.0, + "loss/fcd": 1.109375, + "loss/idx": 12.0, + "loss/logits": 0.1621253490447998, + "step": 304 + }, + { + "epoch": 0.004554278034941018, + "grad_norm": 0.5546875, + "grad_norm_var": 0.02986036936442057, + "learning_rate": 2e-05, + "loss": 1.1901, + "loss/crossentropy": 2.4882760047912598, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0390625, + "loss/idx": 12.0, + "loss/logits": 0.15099531412124634, + "step": 305 + }, + { + "epoch": 0.004569210094071972, + "grad_norm": 0.60546875, + "grad_norm_var": 0.029390970865885418, + "learning_rate": 2e-05, + "loss": 1.3448, + "loss/crossentropy": 2.2777934074401855, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1484375, + "loss/idx": 12.0, + "loss/logits": 0.19636347889900208, + "step": 306 + }, + { + "epoch": 0.004584142153202926, + "grad_norm": 0.478515625, + "grad_norm_var": 0.03026096026102702, + "learning_rate": 2e-05, + "loss": 1.1451, + "loss/crossentropy": 2.3440771102905273, + "loss/dist_ce": 0.0, + "loss/fcd": 1.015625, + "loss/idx": 12.0, + "loss/logits": 0.12943994998931885, + "step": 307 + }, + { + "epoch": 0.004599074212333881, + "grad_norm": 0.490234375, + "grad_norm_var": 0.03026096026102702, + "learning_rate": 2e-05, + "loss": 1.2176, + "loss/crossentropy": 2.565845251083374, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0625, + "loss/idx": 12.0, + "loss/logits": 0.15513211488723755, + "step": 308 + }, + { + "epoch": 0.004614006271464835, + "grad_norm": 0.78125, + "grad_norm_var": 0.03235446612040202, + "learning_rate": 2e-05, + "loss": 1.3562, + "loss/crossentropy": 2.702106475830078, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1796875, + "loss/idx": 12.0, + "loss/logits": 0.17654916644096375, + "step": 309 + }, + { + "epoch": 0.0046289383305957894, + "grad_norm": 0.5234375, + "grad_norm_var": 0.03277014096577962, + "learning_rate": 2e-05, + "loss": 1.2382, + "loss/crossentropy": 2.545269727706909, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0703125, + "loss/idx": 12.0, + "loss/logits": 0.16790466010570526, + "step": 310 + }, + { + "epoch": 0.0046438703897267435, + "grad_norm": 0.54296875, + "grad_norm_var": 0.032621367772420244, + "learning_rate": 2e-05, + "loss": 1.2643, + "loss/crossentropy": 2.53337025642395, + "loss/dist_ce": 0.0, + "loss/fcd": 1.09375, + "loss/idx": 12.0, + "loss/logits": 0.17057746648788452, + "step": 311 + }, + { + "epoch": 0.004658802448857698, + "grad_norm": 0.61328125, + "grad_norm_var": 0.03177642822265625, + "learning_rate": 2e-05, + "loss": 1.3344, + "loss/crossentropy": 2.68241548538208, + "loss/dist_ce": 0.0, + "loss/fcd": 1.140625, + "loss/idx": 12.0, + "loss/logits": 0.1938180923461914, + "step": 312 + }, + { + "epoch": 0.004673734507988652, + "grad_norm": 0.58984375, + "grad_norm_var": 0.005132484436035156, + "learning_rate": 2e-05, + "loss": 1.3496, + "loss/crossentropy": 2.7061564922332764, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1484375, + "loss/idx": 12.0, + "loss/logits": 0.201199471950531, + "step": 313 + }, + { + "epoch": 0.004688666567119606, + "grad_norm": 0.67578125, + "grad_norm_var": 0.005774434407552083, + "learning_rate": 2e-05, + "loss": 1.4513, + "loss/crossentropy": 2.6005072593688965, + "loss/dist_ce": 0.0, + "loss/fcd": 1.2421875, + "loss/idx": 12.0, + "loss/logits": 0.20914174616336823, + "step": 314 + }, + { + "epoch": 0.00470359862625056, + "grad_norm": 0.546875, + "grad_norm_var": 0.00566094716389974, + "learning_rate": 2e-05, + "loss": 1.2654, + "loss/crossentropy": 2.7302968502044678, + "loss/dist_ce": 0.0, + "loss/fcd": 1.09375, + "loss/idx": 12.0, + "loss/logits": 0.1716044545173645, + "step": 315 + }, + { + "epoch": 0.004718530685381514, + "grad_norm": 0.66796875, + "grad_norm_var": 0.0060918172200520836, + "learning_rate": 2e-05, + "loss": 1.5089, + "loss/crossentropy": 2.707127094268799, + "loss/dist_ce": 0.0, + "loss/fcd": 1.2734375, + "loss/idx": 12.0, + "loss/logits": 0.23551242053508759, + "step": 316 + }, + { + "epoch": 0.004733462744512468, + "grad_norm": 0.5703125, + "grad_norm_var": 0.005826314290364583, + "learning_rate": 2e-05, + "loss": 1.2823, + "loss/crossentropy": 2.592078924179077, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1171875, + "loss/idx": 12.0, + "loss/logits": 0.16508805751800537, + "step": 317 + }, + { + "epoch": 0.004748394803643422, + "grad_norm": 0.57421875, + "grad_norm_var": 0.005774434407552083, + "learning_rate": 2e-05, + "loss": 1.2459, + "loss/crossentropy": 2.4738409519195557, + "loss/dist_ce": 0.0, + "loss/fcd": 1.078125, + "loss/idx": 12.0, + "loss/logits": 0.16779488325119019, + "step": 318 + }, + { + "epoch": 0.004763326862774376, + "grad_norm": 0.5625, + "grad_norm_var": 0.005790138244628906, + "learning_rate": 2e-05, + "loss": 1.3254, + "loss/crossentropy": 2.321881055831909, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1484375, + "loss/idx": 12.0, + "loss/logits": 0.1769586205482483, + "step": 319 + }, + { + "epoch": 0.00477825892190533, + "grad_norm": 0.55859375, + "grad_norm_var": 0.005641937255859375, + "learning_rate": 2e-05, + "loss": 1.2608, + "loss/crossentropy": 2.6014082431793213, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0859375, + "loss/idx": 12.0, + "loss/logits": 0.17482781410217285, + "step": 320 + }, + { + "epoch": 0.004793190981036285, + "grad_norm": 0.66015625, + "grad_norm_var": 0.005932044982910156, + "learning_rate": 2e-05, + "loss": 1.3777, + "loss/crossentropy": 2.7242867946624756, + "loss/dist_ce": 0.0, + "loss/fcd": 1.171875, + "loss/idx": 12.0, + "loss/logits": 0.20579351484775543, + "step": 321 + }, + { + "epoch": 0.004808123040167239, + "grad_norm": 0.53515625, + "grad_norm_var": 0.006096839904785156, + "learning_rate": 2e-05, + "loss": 1.2116, + "loss/crossentropy": 2.6165010929107666, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0546875, + "loss/idx": 12.0, + "loss/logits": 0.1569620966911316, + "step": 322 + }, + { + "epoch": 0.0048230550992981935, + "grad_norm": 0.55859375, + "grad_norm_var": 0.005353275934855143, + "learning_rate": 2e-05, + "loss": 1.2708, + "loss/crossentropy": 2.8225040435791016, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1015625, + "loss/idx": 12.0, + "loss/logits": 0.16922032833099365, + "step": 323 + }, + { + "epoch": 0.004837987158429148, + "grad_norm": 0.56640625, + "grad_norm_var": 0.004695574442545573, + "learning_rate": 2e-05, + "loss": 1.2872, + "loss/crossentropy": 2.497213125228882, + "loss/dist_ce": 0.0, + "loss/fcd": 1.125, + "loss/idx": 12.0, + "loss/logits": 0.16218045353889465, + "step": 324 + }, + { + "epoch": 0.004852919217560102, + "grad_norm": 0.7265625, + "grad_norm_var": 0.0035277684529622395, + "learning_rate": 2e-05, + "loss": 1.5149, + "loss/crossentropy": 2.276867389678955, + "loss/dist_ce": 0.0, + "loss/fcd": 1.2890625, + "loss/idx": 12.0, + "loss/logits": 0.22580742835998535, + "step": 325 + }, + { + "epoch": 0.004867851276691056, + "grad_norm": 0.52734375, + "grad_norm_var": 0.0034929911295572915, + "learning_rate": 2e-05, + "loss": 1.2771, + "loss/crossentropy": 2.719574213027954, + "loss/dist_ce": 0.0, + "loss/fcd": 1.09375, + "loss/idx": 12.0, + "loss/logits": 0.18333600461483002, + "step": 326 + }, + { + "epoch": 0.00488278333582201, + "grad_norm": 0.58203125, + "grad_norm_var": 0.0033315022786458335, + "learning_rate": 2e-05, + "loss": 1.4085, + "loss/crossentropy": 2.591456413269043, + "loss/dist_ce": 0.0, + "loss/fcd": 1.2109375, + "loss/idx": 12.0, + "loss/logits": 0.1975385546684265, + "step": 327 + }, + { + "epoch": 0.004897715394952964, + "grad_norm": 0.54296875, + "grad_norm_var": 0.0034665425618489584, + "learning_rate": 2e-05, + "loss": 1.2639, + "loss/crossentropy": 2.5922658443450928, + "loss/dist_ce": 0.0, + "loss/fcd": 1.078125, + "loss/idx": 12.0, + "loss/logits": 0.18573111295700073, + "step": 328 + }, + { + "epoch": 0.004912647454083918, + "grad_norm": 0.52734375, + "grad_norm_var": 0.003714752197265625, + "learning_rate": 2e-05, + "loss": 1.1678, + "loss/crossentropy": 2.6114420890808105, + "loss/dist_ce": 0.0, + "loss/fcd": 1.015625, + "loss/idx": 12.0, + "loss/logits": 0.1521402895450592, + "step": 329 + }, + { + "epoch": 0.004927579513214872, + "grad_norm": 0.6171875, + "grad_norm_var": 0.003231239318847656, + "learning_rate": 2e-05, + "loss": 1.3684, + "loss/crossentropy": 2.68989896774292, + "loss/dist_ce": 0.0, + "loss/fcd": 1.171875, + "loss/idx": 12.0, + "loss/logits": 0.1965211182832718, + "step": 330 + }, + { + "epoch": 0.004942511572345826, + "grad_norm": 0.63671875, + "grad_norm_var": 0.003305816650390625, + "learning_rate": 2e-05, + "loss": 1.2985, + "loss/crossentropy": 2.6312994956970215, + "loss/dist_ce": 0.0, + "loss/fcd": 1.109375, + "loss/idx": 12.0, + "loss/logits": 0.18911811709403992, + "step": 331 + }, + { + "epoch": 0.00495744363147678, + "grad_norm": 0.52734375, + "grad_norm_var": 0.003049468994140625, + "learning_rate": 2e-05, + "loss": 1.2595, + "loss/crossentropy": 2.5765206813812256, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0859375, + "loss/idx": 12.0, + "loss/logits": 0.17355464398860931, + "step": 332 + }, + { + "epoch": 0.004972375690607734, + "grad_norm": 0.57421875, + "grad_norm_var": 0.0030455907185872396, + "learning_rate": 2e-05, + "loss": 1.463, + "loss/crossentropy": 2.599970579147339, + "loss/dist_ce": 0.0, + "loss/fcd": 1.2421875, + "loss/idx": 12.0, + "loss/logits": 0.22083698213100433, + "step": 333 + }, + { + "epoch": 0.004987307749738689, + "grad_norm": 0.5703125, + "grad_norm_var": 0.003049468994140625, + "learning_rate": 2e-05, + "loss": 1.3341, + "loss/crossentropy": 2.6213738918304443, + "loss/dist_ce": 0.0, + "loss/fcd": 1.15625, + "loss/idx": 12.0, + "loss/logits": 0.17781299352645874, + "step": 334 + }, + { + "epoch": 0.0050022398088696435, + "grad_norm": 0.55078125, + "grad_norm_var": 0.003084754943847656, + "learning_rate": 2e-05, + "loss": 1.2978, + "loss/crossentropy": 2.6995716094970703, + "loss/dist_ce": 0.0, + "loss/fcd": 1.125, + "loss/idx": 12.0, + "loss/logits": 0.1727505922317505, + "step": 335 + }, + { + "epoch": 0.005017171868000598, + "grad_norm": 0.51171875, + "grad_norm_var": 0.0033487319946289063, + "learning_rate": 2e-05, + "loss": 1.1847, + "loss/crossentropy": 2.6471781730651855, + "loss/dist_ce": 0.0, + "loss/fcd": 1.03125, + "loss/idx": 12.0, + "loss/logits": 0.15342769026756287, + "step": 336 + }, + { + "epoch": 0.005032103927131552, + "grad_norm": 0.66015625, + "grad_norm_var": 0.0033487319946289063, + "learning_rate": 2e-05, + "loss": 1.5682, + "loss/crossentropy": 2.331575393676758, + "loss/dist_ce": 0.0, + "loss/fcd": 1.3359375, + "loss/idx": 12.0, + "loss/logits": 0.2322523295879364, + "step": 337 + }, + { + "epoch": 0.005047035986262506, + "grad_norm": 0.51171875, + "grad_norm_var": 0.0035104751586914062, + "learning_rate": 2e-05, + "loss": 1.2217, + "loss/crossentropy": 2.6712942123413086, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0625, + "loss/idx": 12.0, + "loss/logits": 0.15924036502838135, + "step": 338 + }, + { + "epoch": 0.00506196804539346, + "grad_norm": 0.498046875, + "grad_norm_var": 0.0038677056630452475, + "learning_rate": 2e-05, + "loss": 1.1757, + "loss/crossentropy": 2.6050660610198975, + "loss/dist_ce": 0.0, + "loss/fcd": 1.015625, + "loss/idx": 12.0, + "loss/logits": 0.16003666818141937, + "step": 339 + }, + { + "epoch": 0.005076900104524414, + "grad_norm": 0.5078125, + "grad_norm_var": 0.004115660985310872, + "learning_rate": 2e-05, + "loss": 1.2327, + "loss/crossentropy": 2.417349100112915, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0703125, + "loss/idx": 12.0, + "loss/logits": 0.16236600279808044, + "step": 340 + }, + { + "epoch": 0.005091832163655368, + "grad_norm": 0.57421875, + "grad_norm_var": 0.0023254235585530598, + "learning_rate": 2e-05, + "loss": 1.3032, + "loss/crossentropy": 2.8112549781799316, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1171875, + "loss/idx": 12.0, + "loss/logits": 0.1859789788722992, + "step": 341 + }, + { + "epoch": 0.005106764222786322, + "grad_norm": 0.625, + "grad_norm_var": 0.0025288740793863933, + "learning_rate": 2e-05, + "loss": 1.3182, + "loss/crossentropy": 2.38157057762146, + "loss/dist_ce": 0.0, + "loss/fcd": 1.140625, + "loss/idx": 12.0, + "loss/logits": 0.17755183577537537, + "step": 342 + }, + { + "epoch": 0.005121696281917276, + "grad_norm": 0.61328125, + "grad_norm_var": 0.0026667118072509766, + "learning_rate": 2e-05, + "loss": 1.2629, + "loss/crossentropy": 2.5023629665374756, + "loss/dist_ce": 0.0, + "loss/fcd": 1.09375, + "loss/idx": 12.0, + "loss/logits": 0.16914424300193787, + "step": 343 + }, + { + "epoch": 0.00513662834104823, + "grad_norm": 0.6328125, + "grad_norm_var": 0.00290067990620931, + "learning_rate": 2e-05, + "loss": 1.4134, + "loss/crossentropy": 2.5533506870269775, + "loss/dist_ce": 0.0, + "loss/fcd": 1.2109375, + "loss/idx": 12.0, + "loss/logits": 0.2024352252483368, + "step": 344 + }, + { + "epoch": 0.005151560400179184, + "grad_norm": 0.52734375, + "grad_norm_var": 0.00290067990620931, + "learning_rate": 2e-05, + "loss": 1.1969, + "loss/crossentropy": 2.6583385467529297, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0390625, + "loss/idx": 12.0, + "loss/logits": 0.15788167715072632, + "step": 345 + }, + { + "epoch": 0.0051664924593101385, + "grad_norm": 0.51953125, + "grad_norm_var": 0.002897500991821289, + "learning_rate": 2e-05, + "loss": 1.1828, + "loss/crossentropy": 2.7132012844085693, + "loss/dist_ce": 0.0, + "loss/fcd": 1.03125, + "loss/idx": 12.0, + "loss/logits": 0.1515815556049347, + "step": 346 + }, + { + "epoch": 0.0051814245184410935, + "grad_norm": 0.52734375, + "grad_norm_var": 0.002600208918253581, + "learning_rate": 2e-05, + "loss": 1.1514, + "loss/crossentropy": 2.7961926460266113, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0078125, + "loss/idx": 12.0, + "loss/logits": 0.14358270168304443, + "step": 347 + }, + { + "epoch": 0.0051963565775720475, + "grad_norm": 0.50390625, + "grad_norm_var": 0.0027310530344645183, + "learning_rate": 2e-05, + "loss": 1.1785, + "loss/crossentropy": 2.5494437217712402, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0234375, + "loss/idx": 12.0, + "loss/logits": 0.15509198606014252, + "step": 348 + }, + { + "epoch": 0.005211288636703002, + "grad_norm": 0.5625, + "grad_norm_var": 0.002712361017862956, + "learning_rate": 2e-05, + "loss": 1.3236, + "loss/crossentropy": 2.6538543701171875, + "loss/dist_ce": 0.0, + "loss/fcd": 1.125, + "loss/idx": 12.0, + "loss/logits": 0.1985650658607483, + "step": 349 + }, + { + "epoch": 0.005226220695833956, + "grad_norm": 0.55859375, + "grad_norm_var": 0.0026986281077067058, + "learning_rate": 2e-05, + "loss": 1.3186, + "loss/crossentropy": 2.7938647270202637, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1328125, + "loss/idx": 12.0, + "loss/logits": 0.18574939668178558, + "step": 350 + }, + { + "epoch": 0.00524115275496491, + "grad_norm": 0.56640625, + "grad_norm_var": 0.002704477310180664, + "learning_rate": 2e-05, + "loss": 1.2769, + "loss/crossentropy": 2.6201303005218506, + "loss/dist_ce": 0.0, + "loss/fcd": 1.109375, + "loss/idx": 12.0, + "loss/logits": 0.16753770411014557, + "step": 351 + }, + { + "epoch": 0.005256084814095864, + "grad_norm": 0.60546875, + "grad_norm_var": 0.002696847915649414, + "learning_rate": 2e-05, + "loss": 1.4235, + "loss/crossentropy": 2.322477102279663, + "loss/dist_ce": 0.0, + "loss/fcd": 1.234375, + "loss/idx": 12.0, + "loss/logits": 0.18908751010894775, + "step": 352 + }, + { + "epoch": 0.005271016873226818, + "grad_norm": 0.59765625, + "grad_norm_var": 0.0021241346995035807, + "learning_rate": 2e-05, + "loss": 1.3541, + "loss/crossentropy": 2.4053268432617188, + "loss/dist_ce": 0.0, + "loss/fcd": 1.15625, + "loss/idx": 12.0, + "loss/logits": 0.1978444755077362, + "step": 353 + }, + { + "epoch": 0.005285948932357772, + "grad_norm": 0.58203125, + "grad_norm_var": 0.0019971052805582683, + "learning_rate": 2e-05, + "loss": 1.3755, + "loss/crossentropy": 2.505030393600464, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1875, + "loss/idx": 12.0, + "loss/logits": 0.18796448409557343, + "step": 354 + }, + { + "epoch": 0.005300880991488726, + "grad_norm": 0.5234375, + "grad_norm_var": 0.0018187840779622397, + "learning_rate": 2e-05, + "loss": 1.213, + "loss/crossentropy": 2.673353433609009, + "loss/dist_ce": 0.0, + "loss/fcd": 1.046875, + "loss/idx": 12.0, + "loss/logits": 0.1661037802696228, + "step": 355 + }, + { + "epoch": 0.00531581305061968, + "grad_norm": 1.6015625, + "grad_norm_var": 0.06836236317952474, + "learning_rate": 2e-05, + "loss": 1.3959, + "loss/crossentropy": 1.8594982624053955, + "loss/dist_ce": 0.0, + "loss/fcd": 1.265625, + "loss/idx": 12.0, + "loss/logits": 0.1302594691514969, + "step": 356 + }, + { + "epoch": 0.005330745109750634, + "grad_norm": 0.69140625, + "grad_norm_var": 0.06830895741780599, + "learning_rate": 2e-05, + "loss": 1.4121, + "loss/crossentropy": 2.4870803356170654, + "loss/dist_ce": 0.0, + "loss/fcd": 1.21875, + "loss/idx": 12.0, + "loss/logits": 0.19338038563728333, + "step": 357 + }, + { + "epoch": 0.0053456771688815885, + "grad_norm": 0.5625, + "grad_norm_var": 0.06867720286051432, + "learning_rate": 2e-05, + "loss": 1.3687, + "loss/crossentropy": 2.3528172969818115, + "loss/dist_ce": 0.0, + "loss/fcd": 1.171875, + "loss/idx": 12.0, + "loss/logits": 0.19687314331531525, + "step": 358 + }, + { + "epoch": 0.0053606092280125426, + "grad_norm": 0.57421875, + "grad_norm_var": 0.06889082590738932, + "learning_rate": 2e-05, + "loss": 1.4012, + "loss/crossentropy": 2.2566211223602295, + "loss/dist_ce": 0.0, + "loss/fcd": 1.21875, + "loss/idx": 12.0, + "loss/logits": 0.18240945041179657, + "step": 359 + }, + { + "epoch": 0.0053755412871434975, + "grad_norm": 0.54296875, + "grad_norm_var": 0.06940409342447916, + "learning_rate": 2e-05, + "loss": 1.3009, + "loss/crossentropy": 2.8327674865722656, + "loss/dist_ce": 0.0, + "loss/fcd": 1.109375, + "loss/idx": 12.0, + "loss/logits": 0.19157303869724274, + "step": 360 + }, + { + "epoch": 0.005390473346274452, + "grad_norm": 0.4921875, + "grad_norm_var": 0.06995283762613932, + "learning_rate": 2e-05, + "loss": 1.2257, + "loss/crossentropy": 2.573021173477173, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0625, + "loss/idx": 12.0, + "loss/logits": 0.16322672367095947, + "step": 361 + }, + { + "epoch": 0.005405405405405406, + "grad_norm": 0.5546875, + "grad_norm_var": 0.06953226725260417, + "learning_rate": 2e-05, + "loss": 1.2096, + "loss/crossentropy": 2.7299983501434326, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0546875, + "loss/idx": 12.0, + "loss/logits": 0.1548905074596405, + "step": 362 + }, + { + "epoch": 0.00542033746453636, + "grad_norm": 0.5859375, + "grad_norm_var": 0.06896101633707682, + "learning_rate": 2e-05, + "loss": 1.3065, + "loss/crossentropy": 2.708285331726074, + "loss/dist_ce": 0.0, + "loss/fcd": 1.125, + "loss/idx": 12.0, + "loss/logits": 0.18154960870742798, + "step": 363 + }, + { + "epoch": 0.005435269523667314, + "grad_norm": 0.57421875, + "grad_norm_var": 0.06807295481363933, + "learning_rate": 2e-05, + "loss": 1.2052, + "loss/crossentropy": 2.836975336074829, + "loss/dist_ce": 0.0, + "loss/fcd": 1.046875, + "loss/idx": 12.0, + "loss/logits": 0.15837103128433228, + "step": 364 + }, + { + "epoch": 0.005450201582798268, + "grad_norm": 0.8125, + "grad_norm_var": 0.06952966054280599, + "learning_rate": 2e-05, + "loss": 1.7326, + "loss/crossentropy": 2.6771881580352783, + "loss/dist_ce": 0.0, + "loss/fcd": 1.4453125, + "loss/idx": 12.0, + "loss/logits": 0.28733551502227783, + "step": 365 + }, + { + "epoch": 0.005465133641929222, + "grad_norm": 0.5546875, + "grad_norm_var": 0.06957906087239583, + "learning_rate": 2e-05, + "loss": 1.3377, + "loss/crossentropy": 2.6525285243988037, + "loss/dist_ce": 0.0, + "loss/fcd": 1.15625, + "loss/idx": 12.0, + "loss/logits": 0.18148328363895416, + "step": 366 + }, + { + "epoch": 0.005480065701060176, + "grad_norm": 0.52734375, + "grad_norm_var": 0.07011693318684896, + "learning_rate": 2e-05, + "loss": 1.2411, + "loss/crossentropy": 2.624660015106201, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0859375, + "loss/idx": 12.0, + "loss/logits": 0.1551695019006729, + "step": 367 + }, + { + "epoch": 0.00549499776019113, + "grad_norm": 0.62890625, + "grad_norm_var": 0.07001546223958334, + "learning_rate": 2e-05, + "loss": 1.2908, + "loss/crossentropy": 2.51216197013855, + "loss/dist_ce": 0.0, + "loss/fcd": 1.125, + "loss/idx": 12.0, + "loss/logits": 0.16580525040626526, + "step": 368 + }, + { + "epoch": 0.005509929819322084, + "grad_norm": 0.62109375, + "grad_norm_var": 0.06988499959309896, + "learning_rate": 2e-05, + "loss": 1.3284, + "loss/crossentropy": 2.7236480712890625, + "loss/dist_ce": 0.0, + "loss/fcd": 1.140625, + "loss/idx": 12.0, + "loss/logits": 0.18772506713867188, + "step": 369 + }, + { + "epoch": 0.0055248618784530384, + "grad_norm": 0.58203125, + "grad_norm_var": 0.06988499959309896, + "learning_rate": 2e-05, + "loss": 1.2295, + "loss/crossentropy": 2.7294671535491943, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0625, + "loss/idx": 12.0, + "loss/logits": 0.16700142621994019, + "step": 370 + }, + { + "epoch": 0.0055397939375839925, + "grad_norm": 0.6015625, + "grad_norm_var": 0.06892878214518229, + "learning_rate": 2e-05, + "loss": 1.2704, + "loss/crossentropy": 2.6511926651000977, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1015625, + "loss/idx": 12.0, + "loss/logits": 0.16887199878692627, + "step": 371 + }, + { + "epoch": 0.005554725996714947, + "grad_norm": 0.52734375, + "grad_norm_var": 0.005724016825358073, + "learning_rate": 2e-05, + "loss": 1.2096, + "loss/crossentropy": 2.696777105331421, + "loss/dist_ce": 0.0, + "loss/fcd": 1.046875, + "loss/idx": 12.0, + "loss/logits": 0.1627081334590912, + "step": 372 + }, + { + "epoch": 0.005569658055845901, + "grad_norm": 0.5703125, + "grad_norm_var": 0.004996744791666666, + "learning_rate": 2e-05, + "loss": 1.2566, + "loss/crossentropy": 2.6416056156158447, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0859375, + "loss/idx": 12.0, + "loss/logits": 0.1706182360649109, + "step": 373 + }, + { + "epoch": 0.005584590114976856, + "grad_norm": 0.5390625, + "grad_norm_var": 0.005092112223307291, + "learning_rate": 2e-05, + "loss": 1.2752, + "loss/crossentropy": 2.701991319656372, + "loss/dist_ce": 0.0, + "loss/fcd": 1.109375, + "loss/idx": 12.0, + "loss/logits": 0.1657867729663849, + "step": 374 + }, + { + "epoch": 0.00559952217410781, + "grad_norm": 0.51171875, + "grad_norm_var": 0.005389149983723958, + "learning_rate": 2e-05, + "loss": 1.1155, + "loss/crossentropy": 2.4490652084350586, + "loss/dist_ce": 0.0, + "loss/fcd": 0.9765625, + "loss/idx": 12.0, + "loss/logits": 0.1389380842447281, + "step": 375 + }, + { + "epoch": 0.005614454233238764, + "grad_norm": 0.5390625, + "grad_norm_var": 0.005407651265462239, + "learning_rate": 2e-05, + "loss": 1.239, + "loss/crossentropy": 2.5287365913391113, + "loss/dist_ce": 0.0, + "loss/fcd": 1.078125, + "loss/idx": 12.0, + "loss/logits": 0.1609053909778595, + "step": 376 + }, + { + "epoch": 0.005629386292369718, + "grad_norm": 0.48828125, + "grad_norm_var": 0.005452473958333333, + "learning_rate": 2e-05, + "loss": 1.1656, + "loss/crossentropy": 2.5641028881073, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0234375, + "loss/idx": 12.0, + "loss/logits": 0.1421457827091217, + "step": 377 + }, + { + "epoch": 0.005644318351500672, + "grad_norm": 0.51171875, + "grad_norm_var": 0.005690956115722656, + "learning_rate": 2e-05, + "loss": 1.2171, + "loss/crossentropy": 2.590167284011841, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0625, + "loss/idx": 12.0, + "loss/logits": 0.1546175181865692, + "step": 378 + }, + { + "epoch": 0.005659250410631626, + "grad_norm": 0.734375, + "grad_norm_var": 0.007314491271972656, + "learning_rate": 2e-05, + "loss": 1.3905, + "loss/crossentropy": 2.8093738555908203, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1875, + "loss/idx": 12.0, + "loss/logits": 0.2029891312122345, + "step": 379 + }, + { + "epoch": 0.00567418246976258, + "grad_norm": 0.58203125, + "grad_norm_var": 0.0073094050089518225, + "learning_rate": 2e-05, + "loss": 1.412, + "loss/crossentropy": 2.499774217605591, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1953125, + "loss/idx": 12.0, + "loss/logits": 0.21667346358299255, + "step": 380 + }, + { + "epoch": 0.005689114528893534, + "grad_norm": 0.55859375, + "grad_norm_var": 0.0035776774088541667, + "learning_rate": 2e-05, + "loss": 1.1953, + "loss/crossentropy": 2.5769855976104736, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0390625, + "loss/idx": 12.0, + "loss/logits": 0.15628309547901154, + "step": 381 + }, + { + "epoch": 0.005704046588024488, + "grad_norm": 0.5546875, + "grad_norm_var": 0.0035776774088541667, + "learning_rate": 2e-05, + "loss": 1.3599, + "loss/crossentropy": 2.3219430446624756, + "loss/dist_ce": 0.0, + "loss/fcd": 1.15625, + "loss/idx": 12.0, + "loss/logits": 0.20361411571502686, + "step": 382 + }, + { + "epoch": 0.0057189786471554425, + "grad_norm": 0.498046875, + "grad_norm_var": 0.0037877241770426433, + "learning_rate": 2e-05, + "loss": 1.1961, + "loss/crossentropy": 2.521282911300659, + "loss/dist_ce": 0.0, + "loss/fcd": 1.046875, + "loss/idx": 12.0, + "loss/logits": 0.14920538663864136, + "step": 383 + }, + { + "epoch": 0.005733910706286397, + "grad_norm": 0.49609375, + "grad_norm_var": 0.0037682692209879557, + "learning_rate": 2e-05, + "loss": 1.2406, + "loss/crossentropy": 2.704392671585083, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0859375, + "loss/idx": 12.0, + "loss/logits": 0.15467111766338348, + "step": 384 + }, + { + "epoch": 0.005748842765417351, + "grad_norm": 0.59765625, + "grad_norm_var": 0.0036030928293863933, + "learning_rate": 2e-05, + "loss": 1.2842, + "loss/crossentropy": 2.674487829208374, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1015625, + "loss/idx": 12.0, + "loss/logits": 0.18259578943252563, + "step": 385 + }, + { + "epoch": 0.005763774824548305, + "grad_norm": 0.498046875, + "grad_norm_var": 0.003750038146972656, + "learning_rate": 2e-05, + "loss": 1.1793, + "loss/crossentropy": 2.782691478729248, + "loss/dist_ce": 0.0, + "loss/fcd": 1.03125, + "loss/idx": 12.0, + "loss/logits": 0.14800548553466797, + "step": 386 + }, + { + "epoch": 0.00577870688367926, + "grad_norm": 0.58203125, + "grad_norm_var": 0.0036410013834635418, + "learning_rate": 2e-05, + "loss": 1.3885, + "loss/crossentropy": 2.3553826808929443, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1953125, + "loss/idx": 12.0, + "loss/logits": 0.1931438148021698, + "step": 387 + }, + { + "epoch": 0.005793638942810214, + "grad_norm": 0.52734375, + "grad_norm_var": 0.0036410013834635418, + "learning_rate": 2e-05, + "loss": 1.3019, + "loss/crossentropy": 2.500701427459717, + "loss/dist_ce": 0.0, + "loss/fcd": 1.125, + "loss/idx": 12.0, + "loss/logits": 0.17689158022403717, + "step": 388 + }, + { + "epoch": 0.005808571001941168, + "grad_norm": 0.53125, + "grad_norm_var": 0.00362701416015625, + "learning_rate": 2e-05, + "loss": 1.3106, + "loss/crossentropy": 2.561689615249634, + "loss/dist_ce": 0.0, + "loss/fcd": 1.125, + "loss/idx": 12.0, + "loss/logits": 0.1855672150850296, + "step": 389 + }, + { + "epoch": 0.005823503061072122, + "grad_norm": 0.578125, + "grad_norm_var": 0.0036816914876302083, + "learning_rate": 2e-05, + "loss": 1.1974, + "loss/crossentropy": 2.4336016178131104, + "loss/dist_ce": 0.0, + "loss/fcd": 1.046875, + "loss/idx": 12.0, + "loss/logits": 0.1505298614501953, + "step": 390 + }, + { + "epoch": 0.005838435120203076, + "grad_norm": 0.54296875, + "grad_norm_var": 0.003586069742838542, + "learning_rate": 2e-05, + "loss": 1.2446, + "loss/crossentropy": 2.371649980545044, + "loss/dist_ce": 0.0, + "loss/fcd": 1.09375, + "loss/idx": 12.0, + "loss/logits": 0.15085071325302124, + "step": 391 + }, + { + "epoch": 0.00585336717933403, + "grad_norm": 0.5546875, + "grad_norm_var": 0.003575897216796875, + "learning_rate": 2e-05, + "loss": 1.2947, + "loss/crossentropy": 2.6943581104278564, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1171875, + "loss/idx": 12.0, + "loss/logits": 0.1775604784488678, + "step": 392 + }, + { + "epoch": 0.005868299238464984, + "grad_norm": 0.703125, + "grad_norm_var": 0.004628435770670573, + "learning_rate": 2e-05, + "loss": 1.4691, + "loss/crossentropy": 2.582803726196289, + "loss/dist_ce": 0.0, + "loss/fcd": 1.265625, + "loss/idx": 12.0, + "loss/logits": 0.20347224175930023, + "step": 393 + }, + { + "epoch": 0.005883231297595938, + "grad_norm": 0.5625, + "grad_norm_var": 0.004424285888671875, + "learning_rate": 2e-05, + "loss": 1.2891, + "loss/crossentropy": 2.5786473751068115, + "loss/dist_ce": 0.0, + "loss/fcd": 1.109375, + "loss/idx": 12.0, + "loss/logits": 0.17973393201828003, + "step": 394 + }, + { + "epoch": 0.0058981633567268925, + "grad_norm": 0.640625, + "grad_norm_var": 0.002904510498046875, + "learning_rate": 2e-05, + "loss": 1.2877, + "loss/crossentropy": 2.0246448516845703, + "loss/dist_ce": 0.0, + "loss/fcd": 1.125, + "loss/idx": 12.0, + "loss/logits": 0.16268569231033325, + "step": 395 + }, + { + "epoch": 0.005913095415857847, + "grad_norm": 0.50390625, + "grad_norm_var": 0.003087615966796875, + "learning_rate": 2e-05, + "loss": 1.2483, + "loss/crossentropy": 2.597654104232788, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0703125, + "loss/idx": 12.0, + "loss/logits": 0.17802008986473083, + "step": 396 + }, + { + "epoch": 0.005928027474988801, + "grad_norm": 0.53125, + "grad_norm_var": 0.0031325658162434894, + "learning_rate": 2e-05, + "loss": 1.1191, + "loss/crossentropy": 2.6132638454437256, + "loss/dist_ce": 0.0, + "loss/fcd": 0.9921875, + "loss/idx": 12.0, + "loss/logits": 0.12693801522254944, + "step": 397 + }, + { + "epoch": 0.005942959534119755, + "grad_norm": 0.578125, + "grad_norm_var": 0.0031615575154622395, + "learning_rate": 2e-05, + "loss": 1.3393, + "loss/crossentropy": 2.3903872966766357, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1640625, + "loss/idx": 12.0, + "loss/logits": 0.17523705959320068, + "step": 398 + }, + { + "epoch": 0.005957891593250709, + "grad_norm": 0.8671875, + "grad_norm_var": 0.00873411496480306, + "learning_rate": 2e-05, + "loss": 1.7769, + "loss/crossentropy": 2.4915974140167236, + "loss/dist_ce": 0.0, + "loss/fcd": 1.5078125, + "loss/idx": 12.0, + "loss/logits": 0.2690865993499756, + "step": 399 + }, + { + "epoch": 0.005972823652381664, + "grad_norm": 0.50390625, + "grad_norm_var": 0.008649555842081706, + "learning_rate": 2e-05, + "loss": 1.2025, + "loss/crossentropy": 2.4921159744262695, + "loss/dist_ce": 0.0, + "loss/fcd": 1.046875, + "loss/idx": 12.0, + "loss/logits": 0.1556488275527954, + "step": 400 + }, + { + "epoch": 0.005987755711512618, + "grad_norm": 0.609375, + "grad_norm_var": 0.008683506647745769, + "learning_rate": 2e-05, + "loss": 1.416, + "loss/crossentropy": 2.8674352169036865, + "loss/dist_ce": 0.0, + "loss/fcd": 1.203125, + "loss/idx": 12.0, + "loss/logits": 0.2129102349281311, + "step": 401 + }, + { + "epoch": 0.006002687770643572, + "grad_norm": 0.466796875, + "grad_norm_var": 0.009094985326131184, + "learning_rate": 2e-05, + "loss": 1.1529, + "loss/crossentropy": 2.636471748352051, + "loss/dist_ce": 0.0, + "loss/fcd": 1.015625, + "loss/idx": 12.0, + "loss/logits": 0.1373138427734375, + "step": 402 + }, + { + "epoch": 0.006017619829774526, + "grad_norm": 1.5703125, + "grad_norm_var": 0.07038000424702963, + "learning_rate": 2e-05, + "loss": 1.518, + "loss/crossentropy": 2.292736530303955, + "loss/dist_ce": 0.0, + "loss/fcd": 1.2734375, + "loss/idx": 12.0, + "loss/logits": 0.24452432990074158, + "step": 403 + }, + { + "epoch": 0.00603255188890548, + "grad_norm": 0.55859375, + "grad_norm_var": 0.06996343930562338, + "learning_rate": 2e-05, + "loss": 1.3221, + "loss/crossentropy": 2.7590014934539795, + "loss/dist_ce": 0.0, + "loss/fcd": 1.140625, + "loss/idx": 12.0, + "loss/logits": 0.18144939839839935, + "step": 404 + }, + { + "epoch": 0.006047483948036434, + "grad_norm": 0.58203125, + "grad_norm_var": 0.06936173439025879, + "learning_rate": 2e-05, + "loss": 1.3817, + "loss/crossentropy": 2.3799400329589844, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1953125, + "loss/idx": 12.0, + "loss/logits": 0.186435729265213, + "step": 405 + }, + { + "epoch": 0.006062416007167388, + "grad_norm": 0.490234375, + "grad_norm_var": 0.07065277099609375, + "learning_rate": 2e-05, + "loss": 1.2203, + "loss/crossentropy": 2.47124981880188, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0703125, + "loss/idx": 12.0, + "loss/logits": 0.14994587004184723, + "step": 406 + }, + { + "epoch": 0.0060773480662983425, + "grad_norm": 0.53515625, + "grad_norm_var": 0.07075932820638021, + "learning_rate": 2e-05, + "loss": 1.2415, + "loss/crossentropy": 2.5655479431152344, + "loss/dist_ce": 0.0, + "loss/fcd": 1.078125, + "loss/idx": 12.0, + "loss/logits": 0.16334325075149536, + "step": 407 + }, + { + "epoch": 0.0060922801254292966, + "grad_norm": 0.56640625, + "grad_norm_var": 0.07063287099202474, + "learning_rate": 2e-05, + "loss": 1.2976, + "loss/crossentropy": 2.325611114501953, + "loss/dist_ce": 0.0, + "loss/fcd": 1.140625, + "loss/idx": 12.0, + "loss/logits": 0.15702426433563232, + "step": 408 + }, + { + "epoch": 0.006107212184560251, + "grad_norm": 0.46484375, + "grad_norm_var": 0.07223459879557291, + "learning_rate": 2e-05, + "loss": 1.1735, + "loss/crossentropy": 2.4934916496276855, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0390625, + "loss/idx": 12.0, + "loss/logits": 0.13443033397197723, + "step": 409 + }, + { + "epoch": 0.006122144243691205, + "grad_norm": 0.57421875, + "grad_norm_var": 0.07214247385660807, + "learning_rate": 2e-05, + "loss": 1.2699, + "loss/crossentropy": 2.694333791732788, + "loss/dist_ce": 0.0, + "loss/fcd": 1.09375, + "loss/idx": 12.0, + "loss/logits": 0.17613661289215088, + "step": 410 + }, + { + "epoch": 0.006137076302822159, + "grad_norm": 0.5703125, + "grad_norm_var": 0.07233015696207683, + "learning_rate": 2e-05, + "loss": 1.2064, + "loss/crossentropy": 2.710723400115967, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0546875, + "loss/idx": 12.0, + "loss/logits": 0.151699036359787, + "step": 411 + }, + { + "epoch": 0.006152008361953113, + "grad_norm": 0.5546875, + "grad_norm_var": 0.07168299357096354, + "learning_rate": 2e-05, + "loss": 1.3126, + "loss/crossentropy": 2.4599716663360596, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1328125, + "loss/idx": 12.0, + "loss/logits": 0.1798081398010254, + "step": 412 + }, + { + "epoch": 0.006166940421084068, + "grad_norm": 0.5390625, + "grad_norm_var": 0.07158762613932292, + "learning_rate": 2e-05, + "loss": 1.2299, + "loss/crossentropy": 2.5145263671875, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0703125, + "loss/idx": 12.0, + "loss/logits": 0.15962854027748108, + "step": 413 + }, + { + "epoch": 0.006181872480215022, + "grad_norm": 0.5234375, + "grad_norm_var": 0.07213058471679687, + "learning_rate": 2e-05, + "loss": 1.2114, + "loss/crossentropy": 2.5661544799804688, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0625, + "loss/idx": 12.0, + "loss/logits": 0.1489061415195465, + "step": 414 + }, + { + "epoch": 0.006196804539345976, + "grad_norm": 0.515625, + "grad_norm_var": 0.06843414306640624, + "learning_rate": 2e-05, + "loss": 1.2189, + "loss/crossentropy": 2.6023993492126465, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0546875, + "loss/idx": 12.0, + "loss/logits": 0.16417017579078674, + "step": 415 + }, + { + "epoch": 0.00621173659847693, + "grad_norm": 0.5078125, + "grad_norm_var": 0.06838423411051432, + "learning_rate": 2e-05, + "loss": 1.2502, + "loss/crossentropy": 2.4928228855133057, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1015625, + "loss/idx": 12.0, + "loss/logits": 0.14868320524692535, + "step": 416 + }, + { + "epoch": 0.006226668657607884, + "grad_norm": 0.5703125, + "grad_norm_var": 0.0684401830037435, + "learning_rate": 2e-05, + "loss": 1.3053, + "loss/crossentropy": 2.4073102474212646, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1328125, + "loss/idx": 12.0, + "loss/logits": 0.1725049614906311, + "step": 417 + }, + { + "epoch": 0.006241600716738838, + "grad_norm": 0.5625, + "grad_norm_var": 0.06732099850972494, + "learning_rate": 2e-05, + "loss": 1.2862, + "loss/crossentropy": 2.30145525932312, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1171875, + "loss/idx": 12.0, + "loss/logits": 0.1689978539943695, + "step": 418 + }, + { + "epoch": 0.0062565327758697924, + "grad_norm": 0.494140625, + "grad_norm_var": 0.0012425740559895834, + "learning_rate": 2e-05, + "loss": 1.2577, + "loss/crossentropy": 2.6010894775390625, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0859375, + "loss/idx": 12.0, + "loss/logits": 0.17175039649009705, + "step": 419 + }, + { + "epoch": 0.0062714648350007465, + "grad_norm": 0.53125, + "grad_norm_var": 0.0012145360310872396, + "learning_rate": 2e-05, + "loss": 1.2791, + "loss/crossentropy": 2.4492437839508057, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1015625, + "loss/idx": 12.0, + "loss/logits": 0.17758145928382874, + "step": 420 + }, + { + "epoch": 0.006286396894131701, + "grad_norm": 0.494140625, + "grad_norm_var": 0.0011623223622639974, + "learning_rate": 2e-05, + "loss": 1.2604, + "loss/crossentropy": 2.675687313079834, + "loss/dist_ce": 0.0, + "loss/fcd": 1.09375, + "loss/idx": 12.0, + "loss/logits": 0.16660946607589722, + "step": 421 + }, + { + "epoch": 0.006301328953262655, + "grad_norm": 0.63671875, + "grad_norm_var": 0.0017094930013020833, + "learning_rate": 2e-05, + "loss": 1.3558, + "loss/crossentropy": 2.114575147628784, + "loss/dist_ce": 0.0, + "loss/fcd": 1.203125, + "loss/idx": 12.0, + "loss/logits": 0.15267613530158997, + "step": 422 + }, + { + "epoch": 0.006316261012393609, + "grad_norm": 0.484375, + "grad_norm_var": 0.0019037246704101563, + "learning_rate": 2e-05, + "loss": 1.196, + "loss/crossentropy": 2.609177827835083, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0390625, + "loss/idx": 12.0, + "loss/logits": 0.15691204369068146, + "step": 423 + }, + { + "epoch": 0.006331193071524563, + "grad_norm": 0.71484375, + "grad_norm_var": 0.003865496317545573, + "learning_rate": 2e-05, + "loss": 1.3174, + "loss/crossentropy": 2.4156365394592285, + "loss/dist_ce": 0.0, + "loss/fcd": 1.140625, + "loss/idx": 12.0, + "loss/logits": 0.17677493393421173, + "step": 424 + }, + { + "epoch": 0.006346125130655517, + "grad_norm": 0.5390625, + "grad_norm_var": 0.0034052530924479165, + "learning_rate": 2e-05, + "loss": 1.2332, + "loss/crossentropy": 2.472134590148926, + "loss/dist_ce": 0.0, + "loss/fcd": 1.078125, + "loss/idx": 12.0, + "loss/logits": 0.15507778525352478, + "step": 425 + }, + { + "epoch": 0.006361057189786472, + "grad_norm": 0.51953125, + "grad_norm_var": 0.0034212748209635417, + "learning_rate": 2e-05, + "loss": 1.309, + "loss/crossentropy": 2.4787662029266357, + "loss/dist_ce": 0.0, + "loss/fcd": 1.125, + "loss/idx": 12.0, + "loss/logits": 0.18404549360275269, + "step": 426 + }, + { + "epoch": 0.006375989248917426, + "grad_norm": 0.51171875, + "grad_norm_var": 0.003456560770670573, + "learning_rate": 2e-05, + "loss": 1.2076, + "loss/crossentropy": 2.5816304683685303, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0390625, + "loss/idx": 12.0, + "loss/logits": 0.16856208443641663, + "step": 427 + }, + { + "epoch": 0.00639092130804838, + "grad_norm": 0.53515625, + "grad_norm_var": 0.0034517923990885417, + "learning_rate": 2e-05, + "loss": 1.3818, + "loss/crossentropy": 2.716796398162842, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1640625, + "loss/idx": 12.0, + "loss/logits": 0.21773184835910797, + "step": 428 + }, + { + "epoch": 0.006405853367179334, + "grad_norm": 0.54296875, + "grad_norm_var": 0.003450965881347656, + "learning_rate": 2e-05, + "loss": 1.1217, + "loss/crossentropy": 2.690194845199585, + "loss/dist_ce": 0.0, + "loss/fcd": 0.984375, + "loss/idx": 12.0, + "loss/logits": 0.13730208575725555, + "step": 429 + }, + { + "epoch": 0.006420785426310288, + "grad_norm": 0.52734375, + "grad_norm_var": 0.0034418741861979166, + "learning_rate": 2e-05, + "loss": 1.1929, + "loss/crossentropy": 2.5792243480682373, + "loss/dist_ce": 0.0, + "loss/fcd": 1.046875, + "loss/idx": 12.0, + "loss/logits": 0.1459844559431076, + "step": 430 + }, + { + "epoch": 0.006435717485441242, + "grad_norm": 0.625, + "grad_norm_var": 0.0037907918294270834, + "learning_rate": 2e-05, + "loss": 1.4976, + "loss/crossentropy": 2.277367353439331, + "loss/dist_ce": 0.0, + "loss/fcd": 1.265625, + "loss/idx": 12.0, + "loss/logits": 0.23194152116775513, + "step": 431 + }, + { + "epoch": 0.0064506495445721965, + "grad_norm": 0.5546875, + "grad_norm_var": 0.003665669759114583, + "learning_rate": 2e-05, + "loss": 1.2346, + "loss/crossentropy": 2.6008970737457275, + "loss/dist_ce": 0.0, + "loss/fcd": 1.078125, + "loss/idx": 12.0, + "loss/logits": 0.15651211142539978, + "step": 432 + }, + { + "epoch": 0.006465581603703151, + "grad_norm": 0.4921875, + "grad_norm_var": 0.003864034016927083, + "learning_rate": 2e-05, + "loss": 1.1928, + "loss/crossentropy": 2.4385178089141846, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0390625, + "loss/idx": 12.0, + "loss/logits": 0.15374068915843964, + "step": 433 + }, + { + "epoch": 0.006480513662834105, + "grad_norm": 0.53515625, + "grad_norm_var": 0.0038573582967122394, + "learning_rate": 2e-05, + "loss": 1.2827, + "loss/crossentropy": 2.591653823852539, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1015625, + "loss/idx": 12.0, + "loss/logits": 0.18110871315002441, + "step": 434 + }, + { + "epoch": 0.006495445721965059, + "grad_norm": 0.55078125, + "grad_norm_var": 0.00366514523824056, + "learning_rate": 2e-05, + "loss": 1.2192, + "loss/crossentropy": 2.5972487926483154, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0703125, + "loss/idx": 12.0, + "loss/logits": 0.14887724816799164, + "step": 435 + }, + { + "epoch": 0.006510377781096013, + "grad_norm": 0.5234375, + "grad_norm_var": 0.0036881605784098307, + "learning_rate": 2e-05, + "loss": 1.2291, + "loss/crossentropy": 2.6286871433258057, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0703125, + "loss/idx": 12.0, + "loss/logits": 0.15877765417099, + "step": 436 + }, + { + "epoch": 0.006525309840226967, + "grad_norm": 0.5, + "grad_norm_var": 0.003647295633951823, + "learning_rate": 2e-05, + "loss": 1.2138, + "loss/crossentropy": 2.6377251148223877, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0546875, + "loss/idx": 12.0, + "loss/logits": 0.15911364555358887, + "step": 437 + }, + { + "epoch": 0.006540241899357921, + "grad_norm": 0.58203125, + "grad_norm_var": 0.003198687235514323, + "learning_rate": 2e-05, + "loss": 1.3142, + "loss/crossentropy": 2.6262760162353516, + "loss/dist_ce": 0.0, + "loss/fcd": 1.125, + "loss/idx": 12.0, + "loss/logits": 0.1891767680644989, + "step": 438 + }, + { + "epoch": 0.006555173958488876, + "grad_norm": 0.470703125, + "grad_norm_var": 0.0033229668935139973, + "learning_rate": 2e-05, + "loss": 1.1893, + "loss/crossentropy": 2.4748237133026123, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0390625, + "loss/idx": 12.0, + "loss/logits": 0.1502346694469452, + "step": 439 + }, + { + "epoch": 0.00657010601761983, + "grad_norm": 0.51953125, + "grad_norm_var": 0.001291640599568685, + "learning_rate": 2e-05, + "loss": 1.2477, + "loss/crossentropy": 2.4698257446289062, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0859375, + "loss/idx": 12.0, + "loss/logits": 0.16178780794143677, + "step": 440 + }, + { + "epoch": 0.006585038076750784, + "grad_norm": 0.51953125, + "grad_norm_var": 0.001299905776977539, + "learning_rate": 2e-05, + "loss": 1.2288, + "loss/crossentropy": 2.2895803451538086, + "loss/dist_ce": 0.0, + "loss/fcd": 1.078125, + "loss/idx": 12.0, + "loss/logits": 0.1506274938583374, + "step": 441 + }, + { + "epoch": 0.006599970135881738, + "grad_norm": 0.59375, + "grad_norm_var": 0.0015221754709879556, + "learning_rate": 2e-05, + "loss": 1.295, + "loss/crossentropy": 2.6053385734558105, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1171875, + "loss/idx": 12.0, + "loss/logits": 0.1778094321489334, + "step": 442 + }, + { + "epoch": 0.006614902195012692, + "grad_norm": 0.7109375, + "grad_norm_var": 0.003344456354777018, + "learning_rate": 2e-05, + "loss": 1.4482, + "loss/crossentropy": 2.7315480709075928, + "loss/dist_ce": 0.0, + "loss/fcd": 1.234375, + "loss/idx": 12.0, + "loss/logits": 0.2138347327709198, + "step": 443 + }, + { + "epoch": 0.0066298342541436465, + "grad_norm": 0.55859375, + "grad_norm_var": 0.0033356825510660808, + "learning_rate": 2e-05, + "loss": 1.2976, + "loss/crossentropy": 2.4568560123443604, + "loss/dist_ce": 0.0, + "loss/fcd": 1.125, + "loss/idx": 12.0, + "loss/logits": 0.17260998487472534, + "step": 444 + }, + { + "epoch": 0.006644766313274601, + "grad_norm": 0.55078125, + "grad_norm_var": 0.003331740697224935, + "learning_rate": 2e-05, + "loss": 1.215, + "loss/crossentropy": 2.6158523559570312, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0625, + "loss/idx": 12.0, + "loss/logits": 0.1525478959083557, + "step": 445 + }, + { + "epoch": 0.006659698372405555, + "grad_norm": 0.56640625, + "grad_norm_var": 0.0033044020334879556, + "learning_rate": 2e-05, + "loss": 1.1846, + "loss/crossentropy": 2.301199436187744, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0390625, + "loss/idx": 12.0, + "loss/logits": 0.14553159475326538, + "step": 446 + }, + { + "epoch": 0.006674630431536509, + "grad_norm": 0.6171875, + "grad_norm_var": 0.0032335758209228516, + "learning_rate": 2e-05, + "loss": 1.289, + "loss/crossentropy": 2.4552204608917236, + "loss/dist_ce": 0.0, + "loss/fcd": 1.125, + "loss/idx": 12.0, + "loss/logits": 0.163978710770607, + "step": 447 + }, + { + "epoch": 0.006689562490667463, + "grad_norm": 0.53125, + "grad_norm_var": 0.003262186050415039, + "learning_rate": 2e-05, + "loss": 1.3447, + "loss/crossentropy": 2.410238742828369, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1484375, + "loss/idx": 12.0, + "loss/logits": 0.19623002409934998, + "step": 448 + }, + { + "epoch": 0.006704494549798417, + "grad_norm": 0.5546875, + "grad_norm_var": 0.0030129591623942057, + "learning_rate": 2e-05, + "loss": 1.2677, + "loss/crossentropy": 2.524290084838867, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1015625, + "loss/idx": 12.0, + "loss/logits": 0.1661645472049713, + "step": 449 + }, + { + "epoch": 0.006719426608929371, + "grad_norm": 0.546875, + "grad_norm_var": 0.0029900709788004557, + "learning_rate": 2e-05, + "loss": 1.303, + "loss/crossentropy": 2.5573766231536865, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1171875, + "loss/idx": 12.0, + "loss/logits": 0.18579092621803284, + "step": 450 + }, + { + "epoch": 0.006734358668060325, + "grad_norm": 0.73828125, + "grad_norm_var": 0.005056111017862955, + "learning_rate": 2e-05, + "loss": 1.4536, + "loss/crossentropy": 2.5473504066467285, + "loss/dist_ce": 0.0, + "loss/fcd": 1.25, + "loss/idx": 12.0, + "loss/logits": 0.20360559225082397, + "step": 451 + }, + { + "epoch": 0.00674929072719128, + "grad_norm": 0.80078125, + "grad_norm_var": 0.008224980036417643, + "learning_rate": 2e-05, + "loss": 1.5703, + "loss/crossentropy": 2.538242816925049, + "loss/dist_ce": 0.0, + "loss/fcd": 1.265625, + "loss/idx": 12.0, + "loss/logits": 0.3046456277370453, + "step": 452 + }, + { + "epoch": 0.006764222786322234, + "grad_norm": 0.65234375, + "grad_norm_var": 0.007947270075480144, + "learning_rate": 2e-05, + "loss": 1.3163, + "loss/crossentropy": 2.378641128540039, + "loss/dist_ce": 0.0, + "loss/fcd": 1.140625, + "loss/idx": 12.0, + "loss/logits": 0.17571493983268738, + "step": 453 + }, + { + "epoch": 0.006779154845453188, + "grad_norm": 0.66015625, + "grad_norm_var": 0.008197768529256185, + "learning_rate": 2e-05, + "loss": 1.3884, + "loss/crossentropy": 2.3066680431365967, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1953125, + "loss/idx": 12.0, + "loss/logits": 0.19306373596191406, + "step": 454 + }, + { + "epoch": 0.006794086904584142, + "grad_norm": 0.69140625, + "grad_norm_var": 0.007452392578125, + "learning_rate": 2e-05, + "loss": 1.3553, + "loss/crossentropy": 2.4432783126831055, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1484375, + "loss/idx": 12.0, + "loss/logits": 0.20682473480701447, + "step": 455 + }, + { + "epoch": 0.0068090189637150965, + "grad_norm": 0.52734375, + "grad_norm_var": 0.007358551025390625, + "learning_rate": 2e-05, + "loss": 1.2052, + "loss/crossentropy": 2.449265718460083, + "loss/dist_ce": 0.0, + "loss/fcd": 1.046875, + "loss/idx": 12.0, + "loss/logits": 0.15830518305301666, + "step": 456 + }, + { + "epoch": 0.0068239510228460505, + "grad_norm": 0.6171875, + "grad_norm_var": 0.006727536519368489, + "learning_rate": 2e-05, + "loss": 1.3472, + "loss/crossentropy": 2.5877275466918945, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1640625, + "loss/idx": 12.0, + "loss/logits": 0.1831488013267517, + "step": 457 + }, + { + "epoch": 0.006838883081977005, + "grad_norm": 0.703125, + "grad_norm_var": 0.007094256083170573, + "learning_rate": 2e-05, + "loss": 1.4365, + "loss/crossentropy": 2.5130813121795654, + "loss/dist_ce": 0.0, + "loss/fcd": 1.21875, + "loss/idx": 12.0, + "loss/logits": 0.21774663031101227, + "step": 458 + }, + { + "epoch": 0.006853815141107959, + "grad_norm": 0.6796875, + "grad_norm_var": 0.006804339090983073, + "learning_rate": 2e-05, + "loss": 1.2911, + "loss/crossentropy": 2.49009108543396, + "loss/dist_ce": 0.0, + "loss/fcd": 1.125, + "loss/idx": 12.0, + "loss/logits": 0.16606926918029785, + "step": 459 + }, + { + "epoch": 0.006868747200238913, + "grad_norm": 0.53515625, + "grad_norm_var": 0.007045427958170573, + "learning_rate": 2e-05, + "loss": 1.205, + "loss/crossentropy": 2.6400084495544434, + "loss/dist_ce": 0.0, + "loss/fcd": 1.046875, + "loss/idx": 12.0, + "loss/logits": 0.15812143683433533, + "step": 460 + }, + { + "epoch": 0.006883679259369867, + "grad_norm": 0.5703125, + "grad_norm_var": 0.006880442301432292, + "learning_rate": 2e-05, + "loss": 1.2798, + "loss/crossentropy": 2.443336248397827, + "loss/dist_ce": 0.0, + "loss/fcd": 1.09375, + "loss/idx": 12.0, + "loss/logits": 0.18602094054222107, + "step": 461 + }, + { + "epoch": 0.006898611318500821, + "grad_norm": 0.5234375, + "grad_norm_var": 0.00732873280843099, + "learning_rate": 2e-05, + "loss": 1.2403, + "loss/crossentropy": 2.221099376678467, + "loss/dist_ce": 0.0, + "loss/fcd": 1.078125, + "loss/idx": 12.0, + "loss/logits": 0.16219985485076904, + "step": 462 + }, + { + "epoch": 0.006913543377631775, + "grad_norm": 0.53125, + "grad_norm_var": 0.007843462626139323, + "learning_rate": 2e-05, + "loss": 1.2735, + "loss/crossentropy": 2.568125009536743, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1015625, + "loss/idx": 12.0, + "loss/logits": 0.17195746302604675, + "step": 463 + }, + { + "epoch": 0.006928475436762729, + "grad_norm": 0.6328125, + "grad_norm_var": 0.007334327697753907, + "learning_rate": 2e-05, + "loss": 1.2505, + "loss/crossentropy": 2.489778518676758, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0859375, + "loss/idx": 12.0, + "loss/logits": 0.1646047830581665, + "step": 464 + }, + { + "epoch": 0.006943407495893684, + "grad_norm": 0.546875, + "grad_norm_var": 0.007409095764160156, + "learning_rate": 2e-05, + "loss": 1.299, + "loss/crossentropy": 2.5794758796691895, + "loss/dist_ce": 0.0, + "loss/fcd": 1.125, + "loss/idx": 12.0, + "loss/logits": 0.1739940047264099, + "step": 465 + }, + { + "epoch": 0.006958339555024638, + "grad_norm": 0.57421875, + "grad_norm_var": 0.0071807861328125, + "learning_rate": 2e-05, + "loss": 1.3035, + "loss/crossentropy": 2.4775702953338623, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1328125, + "loss/idx": 12.0, + "loss/logits": 0.17069995403289795, + "step": 466 + }, + { + "epoch": 0.006973271614155592, + "grad_norm": 0.494140625, + "grad_norm_var": 0.007186746597290039, + "learning_rate": 2e-05, + "loss": 1.16, + "loss/crossentropy": 2.651700258255005, + "loss/dist_ce": 0.0, + "loss/fcd": 1.015625, + "loss/idx": 12.0, + "loss/logits": 0.14432933926582336, + "step": 467 + }, + { + "epoch": 0.006988203673286546, + "grad_norm": 0.53515625, + "grad_norm_var": 0.004795948664347331, + "learning_rate": 2e-05, + "loss": 1.2376, + "loss/crossentropy": 2.6250648498535156, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0703125, + "loss/idx": 12.0, + "loss/logits": 0.1673288345336914, + "step": 468 + }, + { + "epoch": 0.0070031357324175005, + "grad_norm": 0.5703125, + "grad_norm_var": 0.004558293024698893, + "learning_rate": 2e-05, + "loss": 1.2862, + "loss/crossentropy": 2.5528345108032227, + "loss/dist_ce": 0.0, + "loss/fcd": 1.109375, + "loss/idx": 12.0, + "loss/logits": 0.1767835021018982, + "step": 469 + }, + { + "epoch": 0.007018067791548455, + "grad_norm": 0.474609375, + "grad_norm_var": 0.0049010594685872395, + "learning_rate": 2e-05, + "loss": 1.166, + "loss/crossentropy": 2.459474563598633, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0234375, + "loss/idx": 12.0, + "loss/logits": 0.14254210889339447, + "step": 470 + }, + { + "epoch": 0.007032999850679409, + "grad_norm": 0.47265625, + "grad_norm_var": 0.0045094172159830725, + "learning_rate": 2e-05, + "loss": 1.1841, + "loss/crossentropy": 2.6750423908233643, + "loss/dist_ce": 0.0, + "loss/fcd": 1.03125, + "loss/idx": 12.0, + "loss/logits": 0.15288898348808289, + "step": 471 + }, + { + "epoch": 0.007047931909810363, + "grad_norm": 0.546875, + "grad_norm_var": 0.004443613688151041, + "learning_rate": 2e-05, + "loss": 1.3021, + "loss/crossentropy": 2.6386568546295166, + "loss/dist_ce": 0.0, + "loss/fcd": 1.125, + "loss/idx": 12.0, + "loss/logits": 0.1770569384098053, + "step": 472 + }, + { + "epoch": 0.007062863968941317, + "grad_norm": 0.515625, + "grad_norm_var": 0.004354349772135417, + "learning_rate": 2e-05, + "loss": 1.2594, + "loss/crossentropy": 2.706113576889038, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0859375, + "loss/idx": 12.0, + "loss/logits": 0.17347458004951477, + "step": 473 + }, + { + "epoch": 0.007077796028072271, + "grad_norm": 0.62109375, + "grad_norm_var": 0.003172747294108073, + "learning_rate": 2e-05, + "loss": 1.2548, + "loss/crossentropy": 2.6007003784179688, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0859375, + "loss/idx": 12.0, + "loss/logits": 0.16884616017341614, + "step": 474 + }, + { + "epoch": 0.007092728087203225, + "grad_norm": 0.55859375, + "grad_norm_var": 0.0020197550455729168, + "learning_rate": 2e-05, + "loss": 1.2828, + "loss/crossentropy": 2.5858445167541504, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1171875, + "loss/idx": 12.0, + "loss/logits": 0.16564837098121643, + "step": 475 + }, + { + "epoch": 0.007107660146334179, + "grad_norm": 0.478515625, + "grad_norm_var": 0.0022866408030192058, + "learning_rate": 2e-05, + "loss": 1.1638, + "loss/crossentropy": 2.386579751968384, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0234375, + "loss/idx": 12.0, + "loss/logits": 0.14035974442958832, + "step": 476 + }, + { + "epoch": 0.007122592205465133, + "grad_norm": 0.6015625, + "grad_norm_var": 0.0024722894032796223, + "learning_rate": 2e-05, + "loss": 1.2395, + "loss/crossentropy": 2.5994484424591064, + "loss/dist_ce": 0.0, + "loss/fcd": 1.078125, + "loss/idx": 12.0, + "loss/logits": 0.1613873541355133, + "step": 477 + }, + { + "epoch": 0.007137524264596088, + "grad_norm": 0.53125, + "grad_norm_var": 0.0024563948313395183, + "learning_rate": 2e-05, + "loss": 1.3159, + "loss/crossentropy": 2.616286039352417, + "loss/dist_ce": 0.0, + "loss/fcd": 1.140625, + "loss/idx": 12.0, + "loss/logits": 0.1752912998199463, + "step": 478 + }, + { + "epoch": 0.007152456323727042, + "grad_norm": 0.53125, + "grad_norm_var": 0.0024563948313395183, + "learning_rate": 2e-05, + "loss": 1.1997, + "loss/crossentropy": 2.7094411849975586, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0390625, + "loss/idx": 12.0, + "loss/logits": 0.16060031950473785, + "step": 479 + }, + { + "epoch": 0.007167388382857996, + "grad_norm": 0.59375, + "grad_norm_var": 0.0020831902821858723, + "learning_rate": 2e-05, + "loss": 1.2477, + "loss/crossentropy": 2.761913776397705, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0859375, + "loss/idx": 12.0, + "loss/logits": 0.16173386573791504, + "step": 480 + }, + { + "epoch": 0.0071823204419889505, + "grad_norm": 0.56640625, + "grad_norm_var": 0.0021238803863525392, + "learning_rate": 2e-05, + "loss": 1.2121, + "loss/crossentropy": 2.754338502883911, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0546875, + "loss/idx": 12.0, + "loss/logits": 0.15745031833648682, + "step": 481 + }, + { + "epoch": 0.007197252501119905, + "grad_norm": 0.67578125, + "grad_norm_var": 0.0032099246978759765, + "learning_rate": 2e-05, + "loss": 1.3736, + "loss/crossentropy": 2.5661139488220215, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1875, + "loss/idx": 12.0, + "loss/logits": 0.18609124422073364, + "step": 482 + }, + { + "epoch": 0.007212184560250859, + "grad_norm": 0.54296875, + "grad_norm_var": 0.003008460998535156, + "learning_rate": 2e-05, + "loss": 1.2358, + "loss/crossentropy": 2.3610997200012207, + "loss/dist_ce": 0.0, + "loss/fcd": 1.078125, + "loss/idx": 12.0, + "loss/logits": 0.15763944387435913, + "step": 483 + }, + { + "epoch": 0.007227116619381813, + "grad_norm": 0.76171875, + "grad_norm_var": 0.00573724110921224, + "learning_rate": 2e-05, + "loss": 1.4967, + "loss/crossentropy": 2.4400460720062256, + "loss/dist_ce": 0.0, + "loss/fcd": 1.28125, + "loss/idx": 12.0, + "loss/logits": 0.21541057527065277, + "step": 484 + }, + { + "epoch": 0.007242048678512767, + "grad_norm": 0.5390625, + "grad_norm_var": 0.00577691396077474, + "learning_rate": 2e-05, + "loss": 1.2353, + "loss/crossentropy": 2.7640154361724854, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0703125, + "loss/idx": 12.0, + "loss/logits": 0.16499778628349304, + "step": 485 + }, + { + "epoch": 0.007256980737643721, + "grad_norm": 0.47265625, + "grad_norm_var": 0.005800231297810873, + "learning_rate": 2e-05, + "loss": 1.1987, + "loss/crossentropy": 2.694920301437378, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0390625, + "loss/idx": 12.0, + "loss/logits": 0.15962320566177368, + "step": 486 + }, + { + "epoch": 0.007271912796774675, + "grad_norm": 0.55078125, + "grad_norm_var": 0.005239470799763998, + "learning_rate": 2e-05, + "loss": 1.2831, + "loss/crossentropy": 2.8368988037109375, + "loss/dist_ce": 0.0, + "loss/fcd": 1.109375, + "loss/idx": 12.0, + "loss/logits": 0.17376646399497986, + "step": 487 + }, + { + "epoch": 0.007286844855905629, + "grad_norm": 0.5234375, + "grad_norm_var": 0.005339797337849935, + "learning_rate": 2e-05, + "loss": 1.2062, + "loss/crossentropy": 2.394050121307373, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0625, + "loss/idx": 12.0, + "loss/logits": 0.14370450377464294, + "step": 488 + }, + { + "epoch": 0.007301776915036583, + "grad_norm": 0.56640625, + "grad_norm_var": 0.005156310399373373, + "learning_rate": 2e-05, + "loss": 1.2921, + "loss/crossentropy": 2.6243531703948975, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1171875, + "loss/idx": 12.0, + "loss/logits": 0.17488384246826172, + "step": 489 + }, + { + "epoch": 0.007316708974167537, + "grad_norm": 0.498046875, + "grad_norm_var": 0.005259450276692708, + "learning_rate": 2e-05, + "loss": 1.2395, + "loss/crossentropy": 2.6446375846862793, + "loss/dist_ce": 0.0, + "loss/fcd": 1.078125, + "loss/idx": 12.0, + "loss/logits": 0.16142112016677856, + "step": 490 + }, + { + "epoch": 0.007331641033298492, + "grad_norm": 0.498046875, + "grad_norm_var": 0.005516163508097331, + "learning_rate": 2e-05, + "loss": 1.2261, + "loss/crossentropy": 2.3884522914886475, + "loss/dist_ce": 0.0, + "loss/fcd": 1.078125, + "loss/idx": 12.0, + "loss/logits": 0.14793342351913452, + "step": 491 + }, + { + "epoch": 0.007346573092429446, + "grad_norm": 0.515625, + "grad_norm_var": 0.00520782470703125, + "learning_rate": 2e-05, + "loss": 1.2395, + "loss/crossentropy": 2.611057758331299, + "loss/dist_ce": 0.0, + "loss/fcd": 1.078125, + "loss/idx": 12.0, + "loss/logits": 0.16138747334480286, + "step": 492 + }, + { + "epoch": 0.0073615051515604005, + "grad_norm": 0.55078125, + "grad_norm_var": 0.005091285705566407, + "learning_rate": 2e-05, + "loss": 1.235, + "loss/crossentropy": 2.760316848754883, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0703125, + "loss/idx": 12.0, + "loss/logits": 0.16464349627494812, + "step": 493 + }, + { + "epoch": 0.0073764372106913546, + "grad_norm": 0.5859375, + "grad_norm_var": 0.005087725321451823, + "learning_rate": 2e-05, + "loss": 1.3381, + "loss/crossentropy": 2.4796364307403564, + "loss/dist_ce": 0.0, + "loss/fcd": 1.140625, + "loss/idx": 12.0, + "loss/logits": 0.19745641946792603, + "step": 494 + }, + { + "epoch": 0.007391369269822309, + "grad_norm": 0.7578125, + "grad_norm_var": 0.0074035008748372395, + "learning_rate": 2e-05, + "loss": 1.5507, + "loss/crossentropy": 2.632960796356201, + "loss/dist_ce": 0.0, + "loss/fcd": 1.3125, + "loss/idx": 12.0, + "loss/logits": 0.23823606967926025, + "step": 495 + }, + { + "epoch": 0.007406301328953263, + "grad_norm": 0.578125, + "grad_norm_var": 0.007379595438639323, + "learning_rate": 2e-05, + "loss": 1.3475, + "loss/crossentropy": 2.4685165882110596, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1796875, + "loss/idx": 12.0, + "loss/logits": 0.16783174872398376, + "step": 496 + }, + { + "epoch": 0.007421233388084217, + "grad_norm": 0.60546875, + "grad_norm_var": 0.00743554433186849, + "learning_rate": 2e-05, + "loss": 1.2911, + "loss/crossentropy": 2.674586534500122, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1171875, + "loss/idx": 12.0, + "loss/logits": 0.17387951910495758, + "step": 497 + }, + { + "epoch": 0.007436165447215171, + "grad_norm": 0.53125, + "grad_norm_var": 0.006826273600260417, + "learning_rate": 2e-05, + "loss": 1.1969, + "loss/crossentropy": 2.428183078765869, + "loss/dist_ce": 0.0, + "loss/fcd": 1.046875, + "loss/idx": 12.0, + "loss/logits": 0.15000945329666138, + "step": 498 + }, + { + "epoch": 0.007451097506346125, + "grad_norm": 0.466796875, + "grad_norm_var": 0.007436863581339518, + "learning_rate": 2e-05, + "loss": 1.1538, + "loss/crossentropy": 2.430645704269409, + "loss/dist_ce": 0.0, + "loss/fcd": 1.015625, + "loss/idx": 12.0, + "loss/logits": 0.13817110657691956, + "step": 499 + }, + { + "epoch": 0.007466029565477079, + "grad_norm": 0.5625, + "grad_norm_var": 0.004628864924112955, + "learning_rate": 2e-05, + "loss": 1.2676, + "loss/crossentropy": 2.6222951412200928, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1015625, + "loss/idx": 12.0, + "loss/logits": 0.16602060198783875, + "step": 500 + }, + { + "epoch": 0.007480961624608033, + "grad_norm": 0.51953125, + "grad_norm_var": 0.004681634902954102, + "learning_rate": 2e-05, + "loss": 1.2974, + "loss/crossentropy": 2.3674845695495605, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1328125, + "loss/idx": 12.0, + "loss/logits": 0.16458511352539062, + "step": 501 + }, + { + "epoch": 0.007495893683738987, + "grad_norm": 0.62109375, + "grad_norm_var": 0.00454875628153483, + "learning_rate": 2e-05, + "loss": 1.1938, + "loss/crossentropy": 2.597952127456665, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0390625, + "loss/idx": 12.0, + "loss/logits": 0.15470948815345764, + "step": 502 + }, + { + "epoch": 0.007510825742869941, + "grad_norm": 0.5078125, + "grad_norm_var": 0.004706811904907226, + "learning_rate": 2e-05, + "loss": 1.1595, + "loss/crossentropy": 2.6619369983673096, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0, + "loss/idx": 12.0, + "loss/logits": 0.15948548913002014, + "step": 503 + }, + { + "epoch": 0.007525757802000896, + "grad_norm": 0.5390625, + "grad_norm_var": 0.004655186335245768, + "learning_rate": 2e-05, + "loss": 1.2799, + "loss/crossentropy": 2.4827001094818115, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1015625, + "loss/idx": 12.0, + "loss/logits": 0.17829132080078125, + "step": 504 + }, + { + "epoch": 0.0075406898611318504, + "grad_norm": 0.71875, + "grad_norm_var": 0.006306568781534831, + "learning_rate": 2e-05, + "loss": 1.3244, + "loss/crossentropy": 2.6411755084991455, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1328125, + "loss/idx": 12.0, + "loss/logits": 0.19154754281044006, + "step": 505 + }, + { + "epoch": 0.0075556219202628045, + "grad_norm": 0.53125, + "grad_norm_var": 0.00607446034749349, + "learning_rate": 2e-05, + "loss": 1.1721, + "loss/crossentropy": 2.6066651344299316, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0234375, + "loss/idx": 12.0, + "loss/logits": 0.1486879289150238, + "step": 506 + }, + { + "epoch": 0.007570553979393759, + "grad_norm": 1.09375, + "grad_norm_var": 0.022688023249308267, + "learning_rate": 2e-05, + "loss": 1.8032, + "loss/crossentropy": 2.4730823040008545, + "loss/dist_ce": 0.0, + "loss/fcd": 1.5390625, + "loss/idx": 12.0, + "loss/logits": 0.264101505279541, + "step": 507 + }, + { + "epoch": 0.007585486038524713, + "grad_norm": 0.55078125, + "grad_norm_var": 0.022344700495402017, + "learning_rate": 2e-05, + "loss": 1.3164, + "loss/crossentropy": 2.587294101715088, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1328125, + "loss/idx": 12.0, + "loss/logits": 0.18363715708255768, + "step": 508 + }, + { + "epoch": 0.007600418097655667, + "grad_norm": 0.5234375, + "grad_norm_var": 0.02259837786356608, + "learning_rate": 2e-05, + "loss": 1.186, + "loss/crossentropy": 2.5303711891174316, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0390625, + "loss/idx": 12.0, + "loss/logits": 0.14696934819221497, + "step": 509 + }, + { + "epoch": 0.007615350156786621, + "grad_norm": 0.5390625, + "grad_norm_var": 0.022860066095987955, + "learning_rate": 2e-05, + "loss": 1.2258, + "loss/crossentropy": 2.4873058795928955, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0625, + "loss/idx": 12.0, + "loss/logits": 0.16325148940086365, + "step": 510 + }, + { + "epoch": 0.007630282215917575, + "grad_norm": 0.58203125, + "grad_norm_var": 0.021160618464152018, + "learning_rate": 2e-05, + "loss": 1.3853, + "loss/crossentropy": 2.499600887298584, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1640625, + "loss/idx": 12.0, + "loss/logits": 0.22127383947372437, + "step": 511 + }, + { + "epoch": 0.007645214275048529, + "grad_norm": 0.50390625, + "grad_norm_var": 0.02164139747619629, + "learning_rate": 2e-05, + "loss": 1.1943, + "loss/crossentropy": 2.507723569869995, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0390625, + "loss/idx": 12.0, + "loss/logits": 0.1552710235118866, + "step": 512 + }, + { + "epoch": 0.007660146334179483, + "grad_norm": 0.56640625, + "grad_norm_var": 0.021642033259073892, + "learning_rate": 2e-05, + "loss": 1.2238, + "loss/crossentropy": 2.786867380142212, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0703125, + "loss/idx": 12.0, + "loss/logits": 0.15346887707710266, + "step": 513 + }, + { + "epoch": 0.007675078393310437, + "grad_norm": 0.78125, + "grad_norm_var": 0.023761987686157227, + "learning_rate": 2e-05, + "loss": 1.5063, + "loss/crossentropy": 2.463454246520996, + "loss/dist_ce": 0.0, + "loss/fcd": 1.2890625, + "loss/idx": 12.0, + "loss/logits": 0.21725308895111084, + "step": 514 + }, + { + "epoch": 0.007690010452441391, + "grad_norm": 0.546875, + "grad_norm_var": 0.022735595703125, + "learning_rate": 2e-05, + "loss": 1.3415, + "loss/crossentropy": 2.8017828464508057, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1484375, + "loss/idx": 12.0, + "loss/logits": 0.19304242730140686, + "step": 515 + }, + { + "epoch": 0.0077049425115723455, + "grad_norm": 0.6796875, + "grad_norm_var": 0.022922515869140625, + "learning_rate": 2e-05, + "loss": 1.3884, + "loss/crossentropy": 2.391439914703369, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1953125, + "loss/idx": 12.0, + "loss/logits": 0.1931254267692566, + "step": 516 + }, + { + "epoch": 0.0077198745707032996, + "grad_norm": 0.66015625, + "grad_norm_var": 0.022409820556640626, + "learning_rate": 2e-05, + "loss": 1.3426, + "loss/crossentropy": 2.592740058898926, + "loss/dist_ce": 0.0, + "loss/fcd": 1.171875, + "loss/idx": 12.0, + "loss/logits": 0.17067590355873108, + "step": 517 + }, + { + "epoch": 0.0077348066298342545, + "grad_norm": 0.52734375, + "grad_norm_var": 0.022965240478515624, + "learning_rate": 2e-05, + "loss": 1.2129, + "loss/crossentropy": 2.682835340499878, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0546875, + "loss/idx": 12.0, + "loss/logits": 0.15816320478916168, + "step": 518 + }, + { + "epoch": 0.007749738688965209, + "grad_norm": 0.5546875, + "grad_norm_var": 0.022428131103515624, + "learning_rate": 2e-05, + "loss": 1.3196, + "loss/crossentropy": 2.841771125793457, + "loss/dist_ce": 0.0, + "loss/fcd": 1.125, + "loss/idx": 12.0, + "loss/logits": 0.19456353783607483, + "step": 519 + }, + { + "epoch": 0.007764670748096163, + "grad_norm": 0.62109375, + "grad_norm_var": 0.021978187561035156, + "learning_rate": 2e-05, + "loss": 1.3445, + "loss/crossentropy": 2.2829132080078125, + "loss/dist_ce": 0.0, + "loss/fcd": 1.171875, + "loss/idx": 12.0, + "loss/logits": 0.17264382541179657, + "step": 520 + }, + { + "epoch": 0.007779602807227117, + "grad_norm": 0.5625, + "grad_norm_var": 0.02152551015218099, + "learning_rate": 2e-05, + "loss": 1.2809, + "loss/crossentropy": 2.52323317527771, + "loss/dist_ce": 0.0, + "loss/fcd": 1.109375, + "loss/idx": 12.0, + "loss/logits": 0.17150172591209412, + "step": 521 + }, + { + "epoch": 0.007794534866358071, + "grad_norm": 0.56640625, + "grad_norm_var": 0.021214803059895832, + "learning_rate": 2e-05, + "loss": 1.3486, + "loss/crossentropy": 2.559417724609375, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1484375, + "loss/idx": 12.0, + "loss/logits": 0.20019292831420898, + "step": 522 + }, + { + "epoch": 0.007809466925489025, + "grad_norm": 0.59375, + "grad_norm_var": 0.005003865559895833, + "learning_rate": 2e-05, + "loss": 1.2866, + "loss/crossentropy": 2.2705278396606445, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1171875, + "loss/idx": 12.0, + "loss/logits": 0.1694566011428833, + "step": 523 + }, + { + "epoch": 0.007824398984619979, + "grad_norm": 0.53515625, + "grad_norm_var": 0.00509033203125, + "learning_rate": 2e-05, + "loss": 1.2626, + "loss/crossentropy": 2.5298879146575928, + "loss/dist_ce": 0.0, + "loss/fcd": 1.09375, + "loss/idx": 12.0, + "loss/logits": 0.16886663436889648, + "step": 524 + }, + { + "epoch": 0.007839331043750933, + "grad_norm": 0.62109375, + "grad_norm_var": 0.00489800771077474, + "learning_rate": 2e-05, + "loss": 1.3763, + "loss/crossentropy": 2.5087928771972656, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1875, + "loss/idx": 12.0, + "loss/logits": 0.18884728848934174, + "step": 525 + }, + { + "epoch": 0.007854263102881887, + "grad_norm": 0.6796875, + "grad_norm_var": 0.00517724355061849, + "learning_rate": 2e-05, + "loss": 1.3889, + "loss/crossentropy": 2.522796869277954, + "loss/dist_ce": 0.0, + "loss/fcd": 1.2109375, + "loss/idx": 12.0, + "loss/logits": 0.17794877290725708, + "step": 526 + }, + { + "epoch": 0.007869195162012841, + "grad_norm": 0.57421875, + "grad_norm_var": 0.0051986058553059895, + "learning_rate": 2e-05, + "loss": 1.1859, + "loss/crossentropy": 2.575768232345581, + "loss/dist_ce": 0.0, + "loss/fcd": 1.03125, + "loss/idx": 12.0, + "loss/logits": 0.1546417772769928, + "step": 527 + }, + { + "epoch": 0.007884127221143795, + "grad_norm": 1.2734375, + "grad_norm_var": 0.032515462239583334, + "learning_rate": 2e-05, + "loss": 1.826, + "loss/crossentropy": 2.2920806407928467, + "loss/dist_ce": 0.0, + "loss/fcd": 1.4453125, + "loss/idx": 12.0, + "loss/logits": 0.38067495822906494, + "step": 528 + }, + { + "epoch": 0.00789905928027475, + "grad_norm": 0.54296875, + "grad_norm_var": 0.03280003865559896, + "learning_rate": 2e-05, + "loss": 1.3219, + "loss/crossentropy": 2.600083351135254, + "loss/dist_ce": 0.0, + "loss/fcd": 1.140625, + "loss/idx": 12.0, + "loss/logits": 0.18125802278518677, + "step": 529 + }, + { + "epoch": 0.007913991339405704, + "grad_norm": 0.71484375, + "grad_norm_var": 0.03186944325764974, + "learning_rate": 2e-05, + "loss": 1.35, + "loss/crossentropy": 2.3968639373779297, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1875, + "loss/idx": 12.0, + "loss/logits": 0.1625278890132904, + "step": 530 + }, + { + "epoch": 0.007928923398536658, + "grad_norm": 0.578125, + "grad_norm_var": 0.031538836161295575, + "learning_rate": 2e-05, + "loss": 1.327, + "loss/crossentropy": 2.8377633094787598, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1328125, + "loss/idx": 12.0, + "loss/logits": 0.1941969245672226, + "step": 531 + }, + { + "epoch": 0.007943855457667612, + "grad_norm": 0.53515625, + "grad_norm_var": 0.032133992513020834, + "learning_rate": 2e-05, + "loss": 1.2754, + "loss/crossentropy": 2.437499523162842, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1015625, + "loss/idx": 12.0, + "loss/logits": 0.17379987239837646, + "step": 532 + }, + { + "epoch": 0.007958787516798566, + "grad_norm": 0.53125, + "grad_norm_var": 0.032719357808430986, + "learning_rate": 2e-05, + "loss": 1.3285, + "loss/crossentropy": 2.624368190765381, + "loss/dist_ce": 0.0, + "loss/fcd": 1.140625, + "loss/idx": 12.0, + "loss/logits": 0.18789049983024597, + "step": 533 + }, + { + "epoch": 0.00797371957592952, + "grad_norm": 0.5078125, + "grad_norm_var": 0.032999420166015626, + "learning_rate": 2e-05, + "loss": 1.1739, + "loss/crossentropy": 2.702047824859619, + "loss/dist_ce": 0.0, + "loss/fcd": 1.03125, + "loss/idx": 12.0, + "loss/logits": 0.14263641834259033, + "step": 534 + }, + { + "epoch": 0.007988651635060474, + "grad_norm": 0.59375, + "grad_norm_var": 0.032731119791666666, + "learning_rate": 2e-05, + "loss": 1.3157, + "loss/crossentropy": 2.7412476539611816, + "loss/dist_ce": 0.0, + "loss/fcd": 1.140625, + "loss/idx": 12.0, + "loss/logits": 0.17504537105560303, + "step": 535 + }, + { + "epoch": 0.008003583694191428, + "grad_norm": 0.66796875, + "grad_norm_var": 0.032831827799479164, + "learning_rate": 2e-05, + "loss": 1.3453, + "loss/crossentropy": 2.646535634994507, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1484375, + "loss/idx": 12.0, + "loss/logits": 0.19684037566184998, + "step": 536 + }, + { + "epoch": 0.008018515753322384, + "grad_norm": 0.55859375, + "grad_norm_var": 0.03286787668863932, + "learning_rate": 2e-05, + "loss": 1.3575, + "loss/crossentropy": 2.4461722373962402, + "loss/dist_ce": 0.0, + "loss/fcd": 1.171875, + "loss/idx": 12.0, + "loss/logits": 0.18560636043548584, + "step": 537 + }, + { + "epoch": 0.008033447812453338, + "grad_norm": 0.55078125, + "grad_norm_var": 0.033014869689941405, + "learning_rate": 2e-05, + "loss": 1.2235, + "loss/crossentropy": 2.7021944522857666, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0625, + "loss/idx": 12.0, + "loss/logits": 0.16099202632904053, + "step": 538 + }, + { + "epoch": 0.008048379871584292, + "grad_norm": 0.52734375, + "grad_norm_var": 0.033599599202473955, + "learning_rate": 2e-05, + "loss": 1.1725, + "loss/crossentropy": 2.6571671962738037, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0234375, + "loss/idx": 12.0, + "loss/logits": 0.14905613660812378, + "step": 539 + }, + { + "epoch": 0.008063311930715246, + "grad_norm": 0.5859375, + "grad_norm_var": 0.03315575917561849, + "learning_rate": 2e-05, + "loss": 1.3608, + "loss/crossentropy": 2.54116153717041, + "loss/dist_ce": 0.0, + "loss/fcd": 1.171875, + "loss/idx": 12.0, + "loss/logits": 0.18891382217407227, + "step": 540 + }, + { + "epoch": 0.0080782439898462, + "grad_norm": 0.5078125, + "grad_norm_var": 0.034057362874348955, + "learning_rate": 2e-05, + "loss": 1.2093, + "loss/crossentropy": 2.552755355834961, + "loss/dist_ce": 0.0, + "loss/fcd": 1.046875, + "loss/idx": 12.0, + "loss/logits": 0.16238316893577576, + "step": 541 + }, + { + "epoch": 0.008093176048977154, + "grad_norm": 0.5703125, + "grad_norm_var": 0.033943430582682295, + "learning_rate": 2e-05, + "loss": 1.21, + "loss/crossentropy": 2.474992275238037, + "loss/dist_ce": 0.0, + "loss/fcd": 1.046875, + "loss/idx": 12.0, + "loss/logits": 0.16313990950584412, + "step": 542 + }, + { + "epoch": 0.008108108108108109, + "grad_norm": 0.64453125, + "grad_norm_var": 0.03388163248697917, + "learning_rate": 2e-05, + "loss": 1.4651, + "loss/crossentropy": 2.347513198852539, + "loss/dist_ce": 0.0, + "loss/fcd": 1.2421875, + "loss/idx": 12.0, + "loss/logits": 0.22293345630168915, + "step": 543 + }, + { + "epoch": 0.008123040167239063, + "grad_norm": 0.51953125, + "grad_norm_var": 0.003536415100097656, + "learning_rate": 2e-05, + "loss": 1.1549, + "loss/crossentropy": 2.6457583904266357, + "loss/dist_ce": 0.0, + "loss/fcd": 1.015625, + "loss/idx": 12.0, + "loss/logits": 0.13925421237945557, + "step": 544 + }, + { + "epoch": 0.008137972226370017, + "grad_norm": 0.52734375, + "grad_norm_var": 0.0036101659138997396, + "learning_rate": 2e-05, + "loss": 1.2341, + "loss/crossentropy": 2.5168657302856445, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0703125, + "loss/idx": 12.0, + "loss/logits": 0.16381186246871948, + "step": 545 + }, + { + "epoch": 0.00815290428550097, + "grad_norm": 0.48828125, + "grad_norm_var": 0.0024449030558268228, + "learning_rate": 2e-05, + "loss": 1.1984, + "loss/crossentropy": 2.518291711807251, + "loss/dist_ce": 0.0, + "loss/fcd": 1.046875, + "loss/idx": 12.0, + "loss/logits": 0.15153710544109344, + "step": 546 + }, + { + "epoch": 0.008167836344631925, + "grad_norm": 0.65234375, + "grad_norm_var": 0.003009033203125, + "learning_rate": 2e-05, + "loss": 1.42, + "loss/crossentropy": 2.566648244857788, + "loss/dist_ce": 0.0, + "loss/fcd": 1.234375, + "loss/idx": 12.0, + "loss/logits": 0.18559721112251282, + "step": 547 + }, + { + "epoch": 0.008182768403762879, + "grad_norm": 0.498046875, + "grad_norm_var": 0.0032207330067952475, + "learning_rate": 2e-05, + "loss": 1.1543, + "loss/crossentropy": 2.616701364517212, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0078125, + "loss/idx": 12.0, + "loss/logits": 0.14649763703346252, + "step": 548 + }, + { + "epoch": 0.008197700462893833, + "grad_norm": 0.56640625, + "grad_norm_var": 0.003171523412068685, + "learning_rate": 2e-05, + "loss": 1.2761, + "loss/crossentropy": 2.670872688293457, + "loss/dist_ce": 0.0, + "loss/fcd": 1.109375, + "loss/idx": 12.0, + "loss/logits": 0.16675950586795807, + "step": 549 + }, + { + "epoch": 0.008212632522024787, + "grad_norm": 0.55859375, + "grad_norm_var": 0.0029764652252197267, + "learning_rate": 2e-05, + "loss": 1.2667, + "loss/crossentropy": 2.361363410949707, + "loss/dist_ce": 0.0, + "loss/fcd": 1.09375, + "loss/idx": 12.0, + "loss/logits": 0.172979474067688, + "step": 550 + }, + { + "epoch": 0.008227564581155741, + "grad_norm": 0.5078125, + "grad_norm_var": 0.0030925591786702474, + "learning_rate": 2e-05, + "loss": 1.3208, + "loss/crossentropy": 2.6893482208251953, + "loss/dist_ce": 0.0, + "loss/fcd": 1.140625, + "loss/idx": 12.0, + "loss/logits": 0.18016394972801208, + "step": 551 + }, + { + "epoch": 0.008242496640286695, + "grad_norm": 0.53515625, + "grad_norm_var": 0.0022516727447509767, + "learning_rate": 2e-05, + "loss": 1.2538, + "loss/crossentropy": 2.5788474082946777, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0859375, + "loss/idx": 12.0, + "loss/logits": 0.16790160536766052, + "step": 552 + }, + { + "epoch": 0.00825742869941765, + "grad_norm": 0.53515625, + "grad_norm_var": 0.002258920669555664, + "learning_rate": 2e-05, + "loss": 1.2469, + "loss/crossentropy": 2.7054810523986816, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0859375, + "loss/idx": 12.0, + "loss/logits": 0.16093963384628296, + "step": 553 + }, + { + "epoch": 0.008272360758548604, + "grad_norm": 0.7421875, + "grad_norm_var": 0.004607884089152018, + "learning_rate": 2e-05, + "loss": 1.4921, + "loss/crossentropy": 2.5595617294311523, + "loss/dist_ce": 0.0, + "loss/fcd": 1.25, + "loss/idx": 12.0, + "loss/logits": 0.2420843541622162, + "step": 554 + }, + { + "epoch": 0.008287292817679558, + "grad_norm": 0.53125, + "grad_norm_var": 0.004591608047485351, + "learning_rate": 2e-05, + "loss": 1.2663, + "loss/crossentropy": 3.0859930515289307, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1015625, + "loss/idx": 12.0, + "loss/logits": 0.16476529836654663, + "step": 555 + }, + { + "epoch": 0.008302224876810512, + "grad_norm": 0.546875, + "grad_norm_var": 0.004555368423461914, + "learning_rate": 2e-05, + "loss": 1.3437, + "loss/crossentropy": 2.519860029220581, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1484375, + "loss/idx": 12.0, + "loss/logits": 0.1952182948589325, + "step": 556 + }, + { + "epoch": 0.008317156935941466, + "grad_norm": 0.50390625, + "grad_norm_var": 0.0045825799306233725, + "learning_rate": 2e-05, + "loss": 1.2765, + "loss/crossentropy": 2.5596110820770264, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1015625, + "loss/idx": 12.0, + "loss/logits": 0.17497843503952026, + "step": 557 + }, + { + "epoch": 0.00833208899507242, + "grad_norm": 0.494140625, + "grad_norm_var": 0.004819997151692708, + "learning_rate": 2e-05, + "loss": 1.1638, + "loss/crossentropy": 2.6705644130706787, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0078125, + "loss/idx": 12.0, + "loss/logits": 0.1560230553150177, + "step": 558 + }, + { + "epoch": 0.008347021054203374, + "grad_norm": 0.53125, + "grad_norm_var": 0.004242897033691406, + "learning_rate": 2e-05, + "loss": 1.2259, + "loss/crossentropy": 2.432647466659546, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0703125, + "loss/idx": 12.0, + "loss/logits": 0.1556343138217926, + "step": 559 + }, + { + "epoch": 0.008361953113334328, + "grad_norm": 0.58203125, + "grad_norm_var": 0.0042652765909830725, + "learning_rate": 2e-05, + "loss": 1.3295, + "loss/crossentropy": 2.516251564025879, + "loss/dist_ce": 0.0, + "loss/fcd": 1.140625, + "loss/idx": 12.0, + "loss/logits": 0.18889586627483368, + "step": 560 + }, + { + "epoch": 0.008376885172465282, + "grad_norm": 1.0390625, + "grad_norm_var": 0.019082132975260416, + "learning_rate": 2e-05, + "loss": 1.6007, + "loss/crossentropy": 2.3152377605438232, + "loss/dist_ce": 0.0, + "loss/fcd": 1.34375, + "loss/idx": 12.0, + "loss/logits": 0.2569894790649414, + "step": 561 + }, + { + "epoch": 0.008391817231596236, + "grad_norm": 0.55859375, + "grad_norm_var": 0.018512217203776042, + "learning_rate": 2e-05, + "loss": 1.1934, + "loss/crossentropy": 2.74574875831604, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0390625, + "loss/idx": 12.0, + "loss/logits": 0.15431912243366241, + "step": 562 + }, + { + "epoch": 0.008406749290727192, + "grad_norm": 0.494140625, + "grad_norm_var": 0.01868602434794108, + "learning_rate": 2e-05, + "loss": 1.2298, + "loss/crossentropy": 2.555695056915283, + "loss/dist_ce": 0.0, + "loss/fcd": 1.078125, + "loss/idx": 12.0, + "loss/logits": 0.1516464799642563, + "step": 563 + }, + { + "epoch": 0.008421681349858146, + "grad_norm": 0.5390625, + "grad_norm_var": 0.018361918131510415, + "learning_rate": 2e-05, + "loss": 1.1969, + "loss/crossentropy": 2.6115164756774902, + "loss/dist_ce": 0.0, + "loss/fcd": 1.046875, + "loss/idx": 12.0, + "loss/logits": 0.1500345766544342, + "step": 564 + }, + { + "epoch": 0.0084366134089891, + "grad_norm": 0.5859375, + "grad_norm_var": 0.018352699279785157, + "learning_rate": 2e-05, + "loss": 1.311, + "loss/crossentropy": 2.468400478363037, + "loss/dist_ce": 0.0, + "loss/fcd": 1.140625, + "loss/idx": 12.0, + "loss/logits": 0.17037324607372284, + "step": 565 + }, + { + "epoch": 0.008451545468120054, + "grad_norm": 0.62890625, + "grad_norm_var": 0.018457984924316405, + "learning_rate": 2e-05, + "loss": 1.2793, + "loss/crossentropy": 2.5954904556274414, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1015625, + "loss/idx": 12.0, + "loss/logits": 0.17773208022117615, + "step": 566 + }, + { + "epoch": 0.008466477527251009, + "grad_norm": 0.57421875, + "grad_norm_var": 0.01805267333984375, + "learning_rate": 2e-05, + "loss": 1.3878, + "loss/crossentropy": 2.4109535217285156, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1953125, + "loss/idx": 12.0, + "loss/logits": 0.19250379502773285, + "step": 567 + }, + { + "epoch": 0.008481409586381963, + "grad_norm": 0.4921875, + "grad_norm_var": 0.018475786844889323, + "learning_rate": 2e-05, + "loss": 1.1878, + "loss/crossentropy": 2.3730239868164062, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0390625, + "loss/idx": 12.0, + "loss/logits": 0.14876341819763184, + "step": 568 + }, + { + "epoch": 0.008496341645512917, + "grad_norm": 0.5625, + "grad_norm_var": 0.01833648681640625, + "learning_rate": 2e-05, + "loss": 1.3256, + "loss/crossentropy": 2.475072145462036, + "loss/dist_ce": 0.0, + "loss/fcd": 1.140625, + "loss/idx": 12.0, + "loss/logits": 0.18494603037834167, + "step": 569 + }, + { + "epoch": 0.00851127370464387, + "grad_norm": 0.5234375, + "grad_norm_var": 0.016826883951822916, + "learning_rate": 2e-05, + "loss": 1.153, + "loss/crossentropy": 2.537174940109253, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0078125, + "loss/idx": 12.0, + "loss/logits": 0.1451968550682068, + "step": 570 + }, + { + "epoch": 0.008526205763774825, + "grad_norm": 0.51171875, + "grad_norm_var": 0.016962623596191405, + "learning_rate": 2e-05, + "loss": 1.2862, + "loss/crossentropy": 2.5827090740203857, + "loss/dist_ce": 0.0, + "loss/fcd": 1.109375, + "loss/idx": 12.0, + "loss/logits": 0.1767972707748413, + "step": 571 + }, + { + "epoch": 0.008541137822905779, + "grad_norm": 0.57421875, + "grad_norm_var": 0.01691411336263021, + "learning_rate": 2e-05, + "loss": 1.3106, + "loss/crossentropy": 2.3603527545928955, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1328125, + "loss/idx": 12.0, + "loss/logits": 0.17780755460262299, + "step": 572 + }, + { + "epoch": 0.008556069882036733, + "grad_norm": 0.5390625, + "grad_norm_var": 0.01665948232014974, + "learning_rate": 2e-05, + "loss": 1.2905, + "loss/crossentropy": 2.3075900077819824, + "loss/dist_ce": 0.0, + "loss/fcd": 1.125, + "loss/idx": 12.0, + "loss/logits": 0.16551288962364197, + "step": 573 + }, + { + "epoch": 0.008571001941167687, + "grad_norm": 0.62109375, + "grad_norm_var": 0.01626585324605306, + "learning_rate": 2e-05, + "loss": 1.3477, + "loss/crossentropy": 2.506995439529419, + "loss/dist_ce": 0.0, + "loss/fcd": 1.15625, + "loss/idx": 12.0, + "loss/logits": 0.19144755601882935, + "step": 574 + }, + { + "epoch": 0.008585934000298641, + "grad_norm": 0.71484375, + "grad_norm_var": 0.01706070899963379, + "learning_rate": 2e-05, + "loss": 1.5414, + "loss/crossentropy": 2.3521230220794678, + "loss/dist_ce": 0.0, + "loss/fcd": 1.3203125, + "loss/idx": 12.0, + "loss/logits": 0.2211102545261383, + "step": 575 + }, + { + "epoch": 0.008600866059429595, + "grad_norm": 0.51953125, + "grad_norm_var": 0.01742386817932129, + "learning_rate": 2e-05, + "loss": 1.3131, + "loss/crossentropy": 2.6059999465942383, + "loss/dist_ce": 0.0, + "loss/fcd": 1.125, + "loss/idx": 12.0, + "loss/logits": 0.18809755146503448, + "step": 576 + }, + { + "epoch": 0.00861579811856055, + "grad_norm": 0.59375, + "grad_norm_var": 0.0032976627349853515, + "learning_rate": 2e-05, + "loss": 1.403, + "loss/crossentropy": 2.578143835067749, + "loss/dist_ce": 0.0, + "loss/fcd": 1.2109375, + "loss/idx": 12.0, + "loss/logits": 0.19208408892154694, + "step": 577 + }, + { + "epoch": 0.008630730177691504, + "grad_norm": 0.52734375, + "grad_norm_var": 0.0033836205800374347, + "learning_rate": 2e-05, + "loss": 1.2463, + "loss/crossentropy": 2.582569122314453, + "loss/dist_ce": 0.0, + "loss/fcd": 1.078125, + "loss/idx": 12.0, + "loss/logits": 0.16822180151939392, + "step": 578 + }, + { + "epoch": 0.008645662236822458, + "grad_norm": 0.65234375, + "grad_norm_var": 0.0035033543904622396, + "learning_rate": 2e-05, + "loss": 1.4421, + "loss/crossentropy": 2.4856603145599365, + "loss/dist_ce": 0.0, + "loss/fcd": 1.21875, + "loss/idx": 12.0, + "loss/logits": 0.22337768971920013, + "step": 579 + }, + { + "epoch": 0.008660594295953412, + "grad_norm": 0.498046875, + "grad_norm_var": 0.0037914117177327475, + "learning_rate": 2e-05, + "loss": 1.2455, + "loss/crossentropy": 2.372971534729004, + "loss/dist_ce": 0.0, + "loss/fcd": 1.09375, + "loss/idx": 12.0, + "loss/logits": 0.15174807608127594, + "step": 580 + }, + { + "epoch": 0.008675526355084366, + "grad_norm": 0.53125, + "grad_norm_var": 0.003861729303995768, + "learning_rate": 2e-05, + "loss": 1.2073, + "loss/crossentropy": 2.7692155838012695, + "loss/dist_ce": 0.0, + "loss/fcd": 1.046875, + "loss/idx": 12.0, + "loss/logits": 0.1604694277048111, + "step": 581 + }, + { + "epoch": 0.00869045841421532, + "grad_norm": 0.6015625, + "grad_norm_var": 0.003681039810180664, + "learning_rate": 2e-05, + "loss": 1.2814, + "loss/crossentropy": 2.6545069217681885, + "loss/dist_ce": 0.0, + "loss/fcd": 1.109375, + "loss/idx": 12.0, + "loss/logits": 0.17197799682617188, + "step": 582 + }, + { + "epoch": 0.008705390473346274, + "grad_norm": 0.462890625, + "grad_norm_var": 0.004316139221191406, + "learning_rate": 2e-05, + "loss": 1.1049, + "loss/crossentropy": 2.683872938156128, + "loss/dist_ce": 0.0, + "loss/fcd": 0.97265625, + "loss/idx": 12.0, + "loss/logits": 0.13223010301589966, + "step": 583 + }, + { + "epoch": 0.008720322532477228, + "grad_norm": 0.5625, + "grad_norm_var": 0.004009437561035156, + "learning_rate": 2e-05, + "loss": 1.311, + "loss/crossentropy": 2.4532418251037598, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1328125, + "loss/idx": 12.0, + "loss/logits": 0.17814823985099792, + "step": 584 + }, + { + "epoch": 0.008735254591608182, + "grad_norm": 0.50390625, + "grad_norm_var": 0.00422210693359375, + "learning_rate": 2e-05, + "loss": 1.2304, + "loss/crossentropy": 2.5566062927246094, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0625, + "loss/idx": 12.0, + "loss/logits": 0.16788913309574127, + "step": 585 + }, + { + "epoch": 0.008750186650739136, + "grad_norm": 0.53125, + "grad_norm_var": 0.004189300537109375, + "learning_rate": 2e-05, + "loss": 1.2254, + "loss/crossentropy": 2.733366012573242, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0546875, + "loss/idx": 12.0, + "loss/logits": 0.17072361707687378, + "step": 586 + }, + { + "epoch": 0.00876511870987009, + "grad_norm": 0.6640625, + "grad_norm_var": 0.004677772521972656, + "learning_rate": 2e-05, + "loss": 1.2663, + "loss/crossentropy": 2.689211845397949, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0859375, + "loss/idx": 12.0, + "loss/logits": 0.1803634762763977, + "step": 587 + }, + { + "epoch": 0.008780050769001044, + "grad_norm": 0.53125, + "grad_norm_var": 0.0047609965006510414, + "learning_rate": 2e-05, + "loss": 1.1999, + "loss/crossentropy": 2.5483739376068115, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0390625, + "loss/idx": 12.0, + "loss/logits": 0.16079798340797424, + "step": 588 + }, + { + "epoch": 0.008794982828131999, + "grad_norm": 0.67578125, + "grad_norm_var": 0.00543969472249349, + "learning_rate": 2e-05, + "loss": 1.2913, + "loss/crossentropy": 2.7356832027435303, + "loss/dist_ce": 0.0, + "loss/fcd": 1.125, + "loss/idx": 12.0, + "loss/logits": 0.16630901396274567, + "step": 589 + }, + { + "epoch": 0.008809914887262954, + "grad_norm": 0.65234375, + "grad_norm_var": 0.005695025126139323, + "learning_rate": 2e-05, + "loss": 1.3674, + "loss/crossentropy": 2.607666015625, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1796875, + "loss/idx": 12.0, + "loss/logits": 0.18775686621665955, + "step": 590 + }, + { + "epoch": 0.008824846946393908, + "grad_norm": 0.494140625, + "grad_norm_var": 0.004665867487589518, + "learning_rate": 2e-05, + "loss": 1.2201, + "loss/crossentropy": 2.525197982788086, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0625, + "loss/idx": 12.0, + "loss/logits": 0.15761615335941315, + "step": 591 + }, + { + "epoch": 0.008839779005524863, + "grad_norm": 0.55078125, + "grad_norm_var": 0.0045473575592041016, + "learning_rate": 2e-05, + "loss": 1.2657, + "loss/crossentropy": 2.3301258087158203, + "loss/dist_ce": 0.0, + "loss/fcd": 1.078125, + "loss/idx": 12.0, + "loss/logits": 0.18756815791130066, + "step": 592 + }, + { + "epoch": 0.008854711064655817, + "grad_norm": 0.5546875, + "grad_norm_var": 0.004490772883097331, + "learning_rate": 2e-05, + "loss": 1.2597, + "loss/crossentropy": 2.576165199279785, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0859375, + "loss/idx": 12.0, + "loss/logits": 0.17378875613212585, + "step": 593 + }, + { + "epoch": 0.00886964312378677, + "grad_norm": 0.49609375, + "grad_norm_var": 0.004696766535441081, + "learning_rate": 2e-05, + "loss": 1.1644, + "loss/crossentropy": 2.6105165481567383, + "loss/dist_ce": 0.0, + "loss/fcd": 1.015625, + "loss/idx": 12.0, + "loss/logits": 0.14877916872501373, + "step": 594 + }, + { + "epoch": 0.008884575182917725, + "grad_norm": 0.59375, + "grad_norm_var": 0.0041913191477457685, + "learning_rate": 2e-05, + "loss": 1.2399, + "loss/crossentropy": 2.6493771076202393, + "loss/dist_ce": 0.0, + "loss/fcd": 1.078125, + "loss/idx": 12.0, + "loss/logits": 0.16178762912750244, + "step": 595 + }, + { + "epoch": 0.008899507242048679, + "grad_norm": 0.50390625, + "grad_norm_var": 0.004147783915201823, + "learning_rate": 2e-05, + "loss": 1.1916, + "loss/crossentropy": 2.4031014442443848, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0390625, + "loss/idx": 12.0, + "loss/logits": 0.15257549285888672, + "step": 596 + }, + { + "epoch": 0.008914439301179633, + "grad_norm": 0.53125, + "grad_norm_var": 0.004147783915201823, + "learning_rate": 2e-05, + "loss": 1.219, + "loss/crossentropy": 2.6648566722869873, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0546875, + "loss/idx": 12.0, + "loss/logits": 0.16428467631340027, + "step": 597 + }, + { + "epoch": 0.008929371360310587, + "grad_norm": 0.478515625, + "grad_norm_var": 0.004361073176066081, + "learning_rate": 2e-05, + "loss": 1.1408, + "loss/crossentropy": 2.4159302711486816, + "loss/dist_ce": 0.0, + "loss/fcd": 0.99609375, + "loss/idx": 12.0, + "loss/logits": 0.14472083747386932, + "step": 598 + }, + { + "epoch": 0.008944303419441541, + "grad_norm": 0.62890625, + "grad_norm_var": 0.00417327880859375, + "learning_rate": 2e-05, + "loss": 1.3695, + "loss/crossentropy": 2.6700379848480225, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1640625, + "loss/idx": 12.0, + "loss/logits": 0.20542120933532715, + "step": 599 + }, + { + "epoch": 0.008959235478572495, + "grad_norm": 0.55859375, + "grad_norm_var": 0.004172706604003906, + "learning_rate": 2e-05, + "loss": 1.2161, + "loss/crossentropy": 2.6093008518218994, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0625, + "loss/idx": 12.0, + "loss/logits": 0.15361179411411285, + "step": 600 + }, + { + "epoch": 0.00897416753770345, + "grad_norm": 0.5703125, + "grad_norm_var": 0.003957621256510417, + "learning_rate": 2e-05, + "loss": 1.322, + "loss/crossentropy": 2.6258511543273926, + "loss/dist_ce": 0.0, + "loss/fcd": 1.140625, + "loss/idx": 12.0, + "loss/logits": 0.18138790130615234, + "step": 601 + }, + { + "epoch": 0.008989099596834403, + "grad_norm": 0.640625, + "grad_norm_var": 0.004235331217447917, + "learning_rate": 2e-05, + "loss": 1.521, + "loss/crossentropy": 2.5401480197906494, + "loss/dist_ce": 0.0, + "loss/fcd": 1.296875, + "loss/idx": 12.0, + "loss/logits": 0.22416627407073975, + "step": 602 + }, + { + "epoch": 0.009004031655965358, + "grad_norm": 0.58203125, + "grad_norm_var": 0.003630510965983073, + "learning_rate": 2e-05, + "loss": 1.3557, + "loss/crossentropy": 2.61795973777771, + "loss/dist_ce": 0.0, + "loss/fcd": 1.171875, + "loss/idx": 12.0, + "loss/logits": 0.1838516741991043, + "step": 603 + }, + { + "epoch": 0.009018963715096312, + "grad_norm": 0.55859375, + "grad_norm_var": 0.0035535176595052082, + "learning_rate": 2e-05, + "loss": 1.2715, + "loss/crossentropy": 2.5267536640167236, + "loss/dist_ce": 0.0, + "loss/fcd": 1.109375, + "loss/idx": 12.0, + "loss/logits": 0.16214627027511597, + "step": 604 + }, + { + "epoch": 0.009033895774227266, + "grad_norm": 0.5703125, + "grad_norm_var": 0.0027175267537434896, + "learning_rate": 2e-05, + "loss": 1.2625, + "loss/crossentropy": 2.503329038619995, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1015625, + "loss/idx": 12.0, + "loss/logits": 0.1608983874320984, + "step": 605 + }, + { + "epoch": 0.00904882783335822, + "grad_norm": 0.5703125, + "grad_norm_var": 0.0021313985188802084, + "learning_rate": 2e-05, + "loss": 1.3283, + "loss/crossentropy": 2.4117980003356934, + "loss/dist_ce": 0.0, + "loss/fcd": 1.15625, + "loss/idx": 12.0, + "loss/logits": 0.1720181107521057, + "step": 606 + }, + { + "epoch": 0.009063759892489174, + "grad_norm": 0.55078125, + "grad_norm_var": 0.0018709659576416015, + "learning_rate": 2e-05, + "loss": 1.2359, + "loss/crossentropy": 2.691850423812866, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0625, + "loss/idx": 12.0, + "loss/logits": 0.1734330952167511, + "step": 607 + }, + { + "epoch": 0.009078691951620128, + "grad_norm": 0.52734375, + "grad_norm_var": 0.001930093765258789, + "learning_rate": 2e-05, + "loss": 1.2051, + "loss/crossentropy": 2.514829635620117, + "loss/dist_ce": 0.0, + "loss/fcd": 1.046875, + "loss/idx": 12.0, + "loss/logits": 0.15824373066425323, + "step": 608 + }, + { + "epoch": 0.009093624010751082, + "grad_norm": 0.52734375, + "grad_norm_var": 0.0019861698150634766, + "learning_rate": 2e-05, + "loss": 1.2934, + "loss/crossentropy": 2.523049831390381, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1171875, + "loss/idx": 12.0, + "loss/logits": 0.17622219026088715, + "step": 609 + }, + { + "epoch": 0.009108556069882036, + "grad_norm": 0.5078125, + "grad_norm_var": 0.001901865005493164, + "learning_rate": 2e-05, + "loss": 1.1657, + "loss/crossentropy": 2.4890053272247314, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0234375, + "loss/idx": 12.0, + "loss/logits": 0.14229866862297058, + "step": 610 + }, + { + "epoch": 0.00912348812901299, + "grad_norm": 0.546875, + "grad_norm_var": 0.0018049716949462891, + "learning_rate": 2e-05, + "loss": 1.279, + "loss/crossentropy": 2.4220166206359863, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1015625, + "loss/idx": 12.0, + "loss/logits": 0.1774497777223587, + "step": 611 + }, + { + "epoch": 0.009138420188143944, + "grad_norm": 0.53515625, + "grad_norm_var": 0.0016600131988525391, + "learning_rate": 2e-05, + "loss": 1.2903, + "loss/crossentropy": 2.6942455768585205, + "loss/dist_ce": 0.0, + "loss/fcd": 1.109375, + "loss/idx": 12.0, + "loss/logits": 0.1808762103319168, + "step": 612 + }, + { + "epoch": 0.009153352247274898, + "grad_norm": 0.5234375, + "grad_norm_var": 0.0016888777414957683, + "learning_rate": 2e-05, + "loss": 1.3349, + "loss/crossentropy": 2.5151455402374268, + "loss/dist_ce": 0.0, + "loss/fcd": 1.15625, + "loss/idx": 12.0, + "loss/logits": 0.17866647243499756, + "step": 613 + }, + { + "epoch": 0.009168284306405853, + "grad_norm": 0.65234375, + "grad_norm_var": 0.0018091201782226562, + "learning_rate": 2e-05, + "loss": 1.3485, + "loss/crossentropy": 2.5160040855407715, + "loss/dist_ce": 0.0, + "loss/fcd": 1.15625, + "loss/idx": 12.0, + "loss/logits": 0.19221991300582886, + "step": 614 + }, + { + "epoch": 0.009183216365536807, + "grad_norm": 0.546875, + "grad_norm_var": 0.0015380859375, + "learning_rate": 2e-05, + "loss": 1.2869, + "loss/crossentropy": 2.493112087249756, + "loss/dist_ce": 0.0, + "loss/fcd": 1.125, + "loss/idx": 12.0, + "loss/logits": 0.16190239787101746, + "step": 615 + }, + { + "epoch": 0.009198148424667763, + "grad_norm": 0.66015625, + "grad_norm_var": 0.0021563212076822916, + "learning_rate": 2e-05, + "loss": 1.2114, + "loss/crossentropy": 2.559445381164551, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0625, + "loss/idx": 12.0, + "loss/logits": 0.14888477325439453, + "step": 616 + }, + { + "epoch": 0.009213080483798717, + "grad_norm": 0.609375, + "grad_norm_var": 0.002269490559895833, + "learning_rate": 2e-05, + "loss": 1.3424, + "loss/crossentropy": 2.2312510013580322, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1875, + "loss/idx": 12.0, + "loss/logits": 0.15492798388004303, + "step": 617 + }, + { + "epoch": 0.00922801254292967, + "grad_norm": 0.5078125, + "grad_norm_var": 0.002109527587890625, + "learning_rate": 2e-05, + "loss": 1.1577, + "loss/crossentropy": 2.72347354888916, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0078125, + "loss/idx": 12.0, + "loss/logits": 0.14986461400985718, + "step": 618 + }, + { + "epoch": 0.009242944602060625, + "grad_norm": 0.51953125, + "grad_norm_var": 0.0021787007649739583, + "learning_rate": 2e-05, + "loss": 1.3219, + "loss/crossentropy": 2.683133125305176, + "loss/dist_ce": 0.0, + "loss/fcd": 1.140625, + "loss/idx": 12.0, + "loss/logits": 0.18123763799667358, + "step": 619 + }, + { + "epoch": 0.009257876661191579, + "grad_norm": 0.640625, + "grad_norm_var": 0.0026152928670247397, + "learning_rate": 2e-05, + "loss": 1.2966, + "loss/crossentropy": 2.640098810195923, + "loss/dist_ce": 0.0, + "loss/fcd": 1.125, + "loss/idx": 12.0, + "loss/logits": 0.17159606516361237, + "step": 620 + }, + { + "epoch": 0.009272808720322533, + "grad_norm": 0.50390625, + "grad_norm_var": 0.0028195699055989583, + "learning_rate": 2e-05, + "loss": 1.1244, + "loss/crossentropy": 2.522538661956787, + "loss/dist_ce": 0.0, + "loss/fcd": 0.9921875, + "loss/idx": 12.0, + "loss/logits": 0.13216978311538696, + "step": 621 + }, + { + "epoch": 0.009287740779453487, + "grad_norm": 0.55859375, + "grad_norm_var": 0.0028090794881184896, + "learning_rate": 2e-05, + "loss": 1.2727, + "loss/crossentropy": 2.597336530685425, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1015625, + "loss/idx": 12.0, + "loss/logits": 0.171115443110466, + "step": 622 + }, + { + "epoch": 0.009302672838584441, + "grad_norm": 1.625, + "grad_norm_var": 0.07398656209309896, + "learning_rate": 2e-05, + "loss": 1.4228, + "loss/crossentropy": 3.3440957069396973, + "loss/dist_ce": 0.0, + "loss/fcd": 1.25, + "loss/idx": 12.0, + "loss/logits": 0.1728263944387436, + "step": 623 + }, + { + "epoch": 0.009317604897715395, + "grad_norm": 0.5859375, + "grad_norm_var": 0.073442014058431, + "learning_rate": 2e-05, + "loss": 1.2968, + "loss/crossentropy": 2.5757291316986084, + "loss/dist_ce": 0.0, + "loss/fcd": 1.125, + "loss/idx": 12.0, + "loss/logits": 0.1718222051858902, + "step": 624 + }, + { + "epoch": 0.00933253695684635, + "grad_norm": 0.53515625, + "grad_norm_var": 0.0733407974243164, + "learning_rate": 2e-05, + "loss": 1.2285, + "loss/crossentropy": 2.482468605041504, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0703125, + "loss/idx": 12.0, + "loss/logits": 0.1582183539867401, + "step": 625 + }, + { + "epoch": 0.009347469015977303, + "grad_norm": 0.5703125, + "grad_norm_var": 0.0725778579711914, + "learning_rate": 2e-05, + "loss": 1.4247, + "loss/crossentropy": 2.2671515941619873, + "loss/dist_ce": 0.0, + "loss/fcd": 1.203125, + "loss/idx": 12.0, + "loss/logits": 0.2216186374425888, + "step": 626 + }, + { + "epoch": 0.009362401075108258, + "grad_norm": 0.56640625, + "grad_norm_var": 0.0723785400390625, + "learning_rate": 2e-05, + "loss": 1.2715, + "loss/crossentropy": 2.383666515350342, + "loss/dist_ce": 0.0, + "loss/fcd": 1.078125, + "loss/idx": 12.0, + "loss/logits": 0.19337865710258484, + "step": 627 + }, + { + "epoch": 0.009377333134239212, + "grad_norm": 0.484375, + "grad_norm_var": 0.07320753733317058, + "learning_rate": 2e-05, + "loss": 1.1818, + "loss/crossentropy": 2.647897243499756, + "loss/dist_ce": 0.0, + "loss/fcd": 1.03125, + "loss/idx": 12.0, + "loss/logits": 0.15054702758789062, + "step": 628 + }, + { + "epoch": 0.009392265193370166, + "grad_norm": 0.53515625, + "grad_norm_var": 0.07304865519205729, + "learning_rate": 2e-05, + "loss": 1.2364, + "loss/crossentropy": 2.7212166786193848, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0703125, + "loss/idx": 12.0, + "loss/logits": 0.16603776812553406, + "step": 629 + }, + { + "epoch": 0.00940719725250112, + "grad_norm": 0.54296875, + "grad_norm_var": 0.07349014282226562, + "learning_rate": 2e-05, + "loss": 1.3161, + "loss/crossentropy": 2.613542318344116, + "loss/dist_ce": 0.0, + "loss/fcd": 1.125, + "loss/idx": 12.0, + "loss/logits": 0.19105498492717743, + "step": 630 + }, + { + "epoch": 0.009422129311632074, + "grad_norm": 0.5625, + "grad_norm_var": 0.07334365844726562, + "learning_rate": 2e-05, + "loss": 1.2691, + "loss/crossentropy": 2.55253529548645, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1015625, + "loss/idx": 12.0, + "loss/logits": 0.16752402484416962, + "step": 631 + }, + { + "epoch": 0.009437061370763028, + "grad_norm": 0.6640625, + "grad_norm_var": 0.07336266835530598, + "learning_rate": 2e-05, + "loss": 1.2329, + "loss/crossentropy": 2.688462734222412, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0625, + "loss/idx": 12.0, + "loss/logits": 0.1703636348247528, + "step": 632 + }, + { + "epoch": 0.009451993429893982, + "grad_norm": 0.55078125, + "grad_norm_var": 0.07370503743489583, + "learning_rate": 2e-05, + "loss": 1.2271, + "loss/crossentropy": 2.488166570663452, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0625, + "loss/idx": 12.0, + "loss/logits": 0.16464681923389435, + "step": 633 + }, + { + "epoch": 0.009466925489024936, + "grad_norm": 0.50390625, + "grad_norm_var": 0.073765500386556, + "learning_rate": 2e-05, + "loss": 1.226, + "loss/crossentropy": 2.471609115600586, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0703125, + "loss/idx": 12.0, + "loss/logits": 0.155724436044693, + "step": 634 + }, + { + "epoch": 0.00948185754815589, + "grad_norm": 0.546875, + "grad_norm_var": 0.0734392801920573, + "learning_rate": 2e-05, + "loss": 1.2426, + "loss/crossentropy": 2.4193055629730225, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0859375, + "loss/idx": 12.0, + "loss/logits": 0.15662391483783722, + "step": 635 + }, + { + "epoch": 0.009496789607286844, + "grad_norm": 0.53515625, + "grad_norm_var": 0.07389418284098308, + "learning_rate": 2e-05, + "loss": 1.2934, + "loss/crossentropy": 2.5824780464172363, + "loss/dist_ce": 0.0, + "loss/fcd": 1.109375, + "loss/idx": 12.0, + "loss/logits": 0.18399065732955933, + "step": 636 + }, + { + "epoch": 0.009511721666417798, + "grad_norm": 0.61328125, + "grad_norm_var": 0.07299340565999349, + "learning_rate": 2e-05, + "loss": 1.2094, + "loss/crossentropy": 2.6305127143859863, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0546875, + "loss/idx": 12.0, + "loss/logits": 0.15475600957870483, + "step": 637 + }, + { + "epoch": 0.009526653725548753, + "grad_norm": 0.59375, + "grad_norm_var": 0.07276509602864584, + "learning_rate": 2e-05, + "loss": 1.2025, + "loss/crossentropy": 2.63283371925354, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0390625, + "loss/idx": 12.0, + "loss/logits": 0.163466677069664, + "step": 638 + }, + { + "epoch": 0.009541585784679707, + "grad_norm": 0.5, + "grad_norm_var": 0.0020131429036458334, + "learning_rate": 2e-05, + "loss": 1.1414, + "loss/crossentropy": 2.563518762588501, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0, + "loss/idx": 12.0, + "loss/logits": 0.14137643575668335, + "step": 639 + }, + { + "epoch": 0.00955651784381066, + "grad_norm": 0.5234375, + "grad_norm_var": 0.0020050048828125, + "learning_rate": 2e-05, + "loss": 1.143, + "loss/crossentropy": 2.7085118293762207, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0, + "loss/idx": 12.0, + "loss/logits": 0.14298370480537415, + "step": 640 + }, + { + "epoch": 0.009571449902941615, + "grad_norm": 0.482421875, + "grad_norm_var": 0.0022955417633056642, + "learning_rate": 2e-05, + "loss": 1.0613, + "loss/crossentropy": 2.453303098678589, + "loss/dist_ce": 0.0, + "loss/fcd": 0.94140625, + "loss/idx": 12.0, + "loss/logits": 0.11989939212799072, + "step": 641 + }, + { + "epoch": 0.00958638196207257, + "grad_norm": 0.55859375, + "grad_norm_var": 0.0022699832916259766, + "learning_rate": 2e-05, + "loss": 1.3519, + "loss/crossentropy": 2.531151294708252, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1484375, + "loss/idx": 12.0, + "loss/logits": 0.20343336462974548, + "step": 642 + }, + { + "epoch": 0.009601314021203525, + "grad_norm": 0.5078125, + "grad_norm_var": 0.0023386478424072266, + "learning_rate": 2e-05, + "loss": 1.226, + "loss/crossentropy": 2.747368097305298, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0625, + "loss/idx": 12.0, + "loss/logits": 0.1635451763868332, + "step": 643 + }, + { + "epoch": 0.009616246080334479, + "grad_norm": 0.490234375, + "grad_norm_var": 0.002294158935546875, + "learning_rate": 2e-05, + "loss": 1.1569, + "loss/crossentropy": 2.6408183574676514, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0078125, + "loss/idx": 12.0, + "loss/logits": 0.14904196560382843, + "step": 644 + }, + { + "epoch": 0.009631178139465433, + "grad_norm": 0.578125, + "grad_norm_var": 0.002356402079264323, + "learning_rate": 2e-05, + "loss": 1.292, + "loss/crossentropy": 2.741863489151001, + "loss/dist_ce": 0.0, + "loss/fcd": 1.125, + "loss/idx": 12.0, + "loss/logits": 0.16696280241012573, + "step": 645 + }, + { + "epoch": 0.009646110198596387, + "grad_norm": 0.5078125, + "grad_norm_var": 0.002453104654947917, + "learning_rate": 2e-05, + "loss": 1.2767, + "loss/crossentropy": 2.5982935428619385, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1015625, + "loss/idx": 12.0, + "loss/logits": 0.17511314153671265, + "step": 646 + }, + { + "epoch": 0.009661042257727341, + "grad_norm": 0.59375, + "grad_norm_var": 0.0025873819986979166, + "learning_rate": 2e-05, + "loss": 1.3794, + "loss/crossentropy": 2.221534252166748, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1953125, + "loss/idx": 12.0, + "loss/logits": 0.18411913514137268, + "step": 647 + }, + { + "epoch": 0.009675974316858295, + "grad_norm": 0.5078125, + "grad_norm_var": 0.0016718546549479166, + "learning_rate": 2e-05, + "loss": 1.2579, + "loss/crossentropy": 2.5770785808563232, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0859375, + "loss/idx": 12.0, + "loss/logits": 0.17191347479820251, + "step": 648 + }, + { + "epoch": 0.00969090637598925, + "grad_norm": 0.5390625, + "grad_norm_var": 0.001659075419108073, + "learning_rate": 2e-05, + "loss": 1.1903, + "loss/crossentropy": 2.5759243965148926, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0390625, + "loss/idx": 12.0, + "loss/logits": 0.15121833980083466, + "step": 649 + }, + { + "epoch": 0.009705838435120203, + "grad_norm": 0.53515625, + "grad_norm_var": 0.0015848159790039062, + "learning_rate": 2e-05, + "loss": 1.2818, + "loss/crossentropy": 2.791914224624634, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1015625, + "loss/idx": 12.0, + "loss/logits": 0.18024246394634247, + "step": 650 + }, + { + "epoch": 0.009720770494251157, + "grad_norm": 0.53515625, + "grad_norm_var": 0.001580047607421875, + "learning_rate": 2e-05, + "loss": 1.3524, + "loss/crossentropy": 2.622012138366699, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1640625, + "loss/idx": 12.0, + "loss/logits": 0.18833482265472412, + "step": 651 + }, + { + "epoch": 0.009735702553382112, + "grad_norm": 0.5703125, + "grad_norm_var": 0.0016458511352539062, + "learning_rate": 2e-05, + "loss": 1.318, + "loss/crossentropy": 2.5040524005889893, + "loss/dist_ce": 0.0, + "loss/fcd": 1.140625, + "loss/idx": 12.0, + "loss/logits": 0.1773415207862854, + "step": 652 + }, + { + "epoch": 0.009750634612513066, + "grad_norm": 0.53125, + "grad_norm_var": 0.001262664794921875, + "learning_rate": 2e-05, + "loss": 1.2424, + "loss/crossentropy": 2.6082024574279785, + "loss/dist_ce": 0.0, + "loss/fcd": 1.078125, + "loss/idx": 12.0, + "loss/logits": 0.16425400972366333, + "step": 653 + }, + { + "epoch": 0.00976556667164402, + "grad_norm": 0.5, + "grad_norm_var": 0.001073455810546875, + "learning_rate": 2e-05, + "loss": 1.2584, + "loss/crossentropy": 2.7121970653533936, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0859375, + "loss/idx": 12.0, + "loss/logits": 0.17245107889175415, + "step": 654 + }, + { + "epoch": 0.009780498730774974, + "grad_norm": 0.46484375, + "grad_norm_var": 0.0012857437133789063, + "learning_rate": 2e-05, + "loss": 1.1393, + "loss/crossentropy": 2.605868101119995, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0, + "loss/idx": 12.0, + "loss/logits": 0.13928866386413574, + "step": 655 + }, + { + "epoch": 0.009795430789905928, + "grad_norm": 0.62890625, + "grad_norm_var": 0.00193634033203125, + "learning_rate": 2e-05, + "loss": 1.4061, + "loss/crossentropy": 2.5328142642974854, + "loss/dist_ce": 0.0, + "loss/fcd": 1.2109375, + "loss/idx": 12.0, + "loss/logits": 0.1951315701007843, + "step": 656 + }, + { + "epoch": 0.009810362849036882, + "grad_norm": 0.56640625, + "grad_norm_var": 0.0018085320790608725, + "learning_rate": 2e-05, + "loss": 1.1623, + "loss/crossentropy": 2.591285228729248, + "loss/dist_ce": 0.0, + "loss/fcd": 1.015625, + "loss/idx": 12.0, + "loss/logits": 0.1467236578464508, + "step": 657 + }, + { + "epoch": 0.009825294908167836, + "grad_norm": 0.53125, + "grad_norm_var": 0.0017818291982014975, + "learning_rate": 2e-05, + "loss": 1.2404, + "loss/crossentropy": 2.678740978240967, + "loss/dist_ce": 0.0, + "loss/fcd": 1.078125, + "loss/idx": 12.0, + "loss/logits": 0.16228888928890228, + "step": 658 + }, + { + "epoch": 0.00984022696729879, + "grad_norm": 0.50390625, + "grad_norm_var": 0.0017978509267171225, + "learning_rate": 2e-05, + "loss": 1.197, + "loss/crossentropy": 2.5414230823516846, + "loss/dist_ce": 0.0, + "loss/fcd": 1.046875, + "loss/idx": 12.0, + "loss/logits": 0.15012088418006897, + "step": 659 + }, + { + "epoch": 0.009855159026429744, + "grad_norm": 0.62109375, + "grad_norm_var": 0.0020608901977539062, + "learning_rate": 2e-05, + "loss": 1.369, + "loss/crossentropy": 2.403024435043335, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1875, + "loss/idx": 12.0, + "loss/logits": 0.18149128556251526, + "step": 660 + }, + { + "epoch": 0.009870091085560698, + "grad_norm": 0.48828125, + "grad_norm_var": 0.0021647135416666668, + "learning_rate": 2e-05, + "loss": 1.2125, + "loss/crossentropy": 2.634225368499756, + "loss/dist_ce": 0.0, + "loss/fcd": 1.046875, + "loss/idx": 12.0, + "loss/logits": 0.16562089323997498, + "step": 661 + }, + { + "epoch": 0.009885023144691652, + "grad_norm": 0.52734375, + "grad_norm_var": 0.0021071751912434896, + "learning_rate": 2e-05, + "loss": 1.1779, + "loss/crossentropy": 2.8136699199676514, + "loss/dist_ce": 0.0, + "loss/fcd": 1.015625, + "loss/idx": 12.0, + "loss/logits": 0.16229188442230225, + "step": 662 + }, + { + "epoch": 0.009899955203822607, + "grad_norm": 0.6015625, + "grad_norm_var": 0.0021666844685872396, + "learning_rate": 2e-05, + "loss": 1.417, + "loss/crossentropy": 2.579958915710449, + "loss/dist_ce": 0.0, + "loss/fcd": 1.21875, + "loss/idx": 12.0, + "loss/logits": 0.19826620817184448, + "step": 663 + }, + { + "epoch": 0.00991488726295356, + "grad_norm": 0.65625, + "grad_norm_var": 0.0028914769490559896, + "learning_rate": 2e-05, + "loss": 1.4121, + "loss/crossentropy": 2.441436529159546, + "loss/dist_ce": 0.0, + "loss/fcd": 1.203125, + "loss/idx": 12.0, + "loss/logits": 0.20899558067321777, + "step": 664 + }, + { + "epoch": 0.009929819322084515, + "grad_norm": 0.51171875, + "grad_norm_var": 0.0029782613118489584, + "learning_rate": 2e-05, + "loss": 1.2005, + "loss/crossentropy": 2.5218026638031006, + "loss/dist_ce": 0.0, + "loss/fcd": 1.046875, + "loss/idx": 12.0, + "loss/logits": 0.1536444127559662, + "step": 665 + }, + { + "epoch": 0.009944751381215469, + "grad_norm": 0.515625, + "grad_norm_var": 0.0030364354451497396, + "learning_rate": 2e-05, + "loss": 1.2123, + "loss/crossentropy": 2.6483314037323, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0546875, + "loss/idx": 12.0, + "loss/logits": 0.15763527154922485, + "step": 666 + }, + { + "epoch": 0.009959683440346423, + "grad_norm": 0.56640625, + "grad_norm_var": 0.0030476252237955728, + "learning_rate": 2e-05, + "loss": 1.1919, + "loss/crossentropy": 2.708850383758545, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0390625, + "loss/idx": 12.0, + "loss/logits": 0.1528831422328949, + "step": 667 + }, + { + "epoch": 0.009974615499477379, + "grad_norm": 0.5078125, + "grad_norm_var": 0.003114763895670573, + "learning_rate": 2e-05, + "loss": 1.202, + "loss/crossentropy": 2.585891008377075, + "loss/dist_ce": 0.0, + "loss/fcd": 1.046875, + "loss/idx": 12.0, + "loss/logits": 0.15510644018650055, + "step": 668 + }, + { + "epoch": 0.009989547558608333, + "grad_norm": 0.50390625, + "grad_norm_var": 0.003212229410807292, + "learning_rate": 2e-05, + "loss": 1.2312, + "loss/crossentropy": 2.487285614013672, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0703125, + "loss/idx": 12.0, + "loss/logits": 0.16083845496177673, + "step": 669 + }, + { + "epoch": 0.010004479617739287, + "grad_norm": 0.494140625, + "grad_norm_var": 0.0032483259836832683, + "learning_rate": 2e-05, + "loss": 1.1847, + "loss/crossentropy": 2.699873924255371, + "loss/dist_ce": 0.0, + "loss/fcd": 1.03125, + "loss/idx": 12.0, + "loss/logits": 0.15343204140663147, + "step": 670 + }, + { + "epoch": 0.010019411676870241, + "grad_norm": 0.578125, + "grad_norm_var": 0.002868509292602539, + "learning_rate": 2e-05, + "loss": 1.2348, + "loss/crossentropy": 2.5884616374969482, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0546875, + "loss/idx": 12.0, + "loss/logits": 0.18012240529060364, + "step": 671 + }, + { + "epoch": 0.010034343736001195, + "grad_norm": 0.5703125, + "grad_norm_var": 0.002467966079711914, + "learning_rate": 2e-05, + "loss": 1.1819, + "loss/crossentropy": 2.445880889892578, + "loss/dist_ce": 0.0, + "loss/fcd": 1.03125, + "loss/idx": 12.0, + "loss/logits": 0.1506287306547165, + "step": 672 + }, + { + "epoch": 0.01004927579513215, + "grad_norm": 0.54296875, + "grad_norm_var": 0.0024401187896728516, + "learning_rate": 2e-05, + "loss": 1.2579, + "loss/crossentropy": 2.3222944736480713, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1015625, + "loss/idx": 12.0, + "loss/logits": 0.15637820959091187, + "step": 673 + }, + { + "epoch": 0.010064207854263103, + "grad_norm": 0.53125, + "grad_norm_var": 0.0024401187896728516, + "learning_rate": 2e-05, + "loss": 1.2937, + "loss/crossentropy": 2.3992764949798584, + "loss/dist_ce": 0.0, + "loss/fcd": 1.109375, + "loss/idx": 12.0, + "loss/logits": 0.1843375265598297, + "step": 674 + }, + { + "epoch": 0.010079139913394057, + "grad_norm": 0.5625, + "grad_norm_var": 0.0023333072662353516, + "learning_rate": 2e-05, + "loss": 1.3425, + "loss/crossentropy": 2.4430034160614014, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1484375, + "loss/idx": 12.0, + "loss/logits": 0.1940709352493286, + "step": 675 + }, + { + "epoch": 0.010094071972525012, + "grad_norm": 0.56640625, + "grad_norm_var": 0.0019924004872639975, + "learning_rate": 2e-05, + "loss": 1.2304, + "loss/crossentropy": 2.6293787956237793, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0625, + "loss/idx": 12.0, + "loss/logits": 0.16787387430667877, + "step": 676 + }, + { + "epoch": 0.010109004031655966, + "grad_norm": 0.53125, + "grad_norm_var": 0.0017811934153238933, + "learning_rate": 2e-05, + "loss": 1.2056, + "loss/crossentropy": 2.738004207611084, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0546875, + "loss/idx": 12.0, + "loss/logits": 0.15091140568256378, + "step": 677 + }, + { + "epoch": 0.01012393609078692, + "grad_norm": 0.5390625, + "grad_norm_var": 0.0017575422922770183, + "learning_rate": 2e-05, + "loss": 1.2279, + "loss/crossentropy": 2.484860897064209, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0703125, + "loss/idx": 12.0, + "loss/logits": 0.15762406587600708, + "step": 678 + }, + { + "epoch": 0.010138868149917874, + "grad_norm": 0.4921875, + "grad_norm_var": 0.0017343997955322266, + "learning_rate": 2e-05, + "loss": 1.2223, + "loss/crossentropy": 2.5346856117248535, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0546875, + "loss/idx": 12.0, + "loss/logits": 0.1675938218832016, + "step": 679 + }, + { + "epoch": 0.010153800209048828, + "grad_norm": 0.54296875, + "grad_norm_var": 0.0008088270823160807, + "learning_rate": 2e-05, + "loss": 1.2444, + "loss/crossentropy": 2.5577213764190674, + "loss/dist_ce": 0.0, + "loss/fcd": 1.078125, + "loss/idx": 12.0, + "loss/logits": 0.1663036346435547, + "step": 680 + }, + { + "epoch": 0.010168732268179782, + "grad_norm": 0.52734375, + "grad_norm_var": 0.0007760206858317058, + "learning_rate": 2e-05, + "loss": 1.2781, + "loss/crossentropy": 2.5874197483062744, + "loss/dist_ce": 0.0, + "loss/fcd": 1.09375, + "loss/idx": 12.0, + "loss/logits": 0.1843561828136444, + "step": 681 + }, + { + "epoch": 0.010183664327310736, + "grad_norm": 0.56640625, + "grad_norm_var": 0.0008008162180582683, + "learning_rate": 2e-05, + "loss": 1.2692, + "loss/crossentropy": 2.458387613296509, + "loss/dist_ce": 0.0, + "loss/fcd": 1.109375, + "loss/idx": 12.0, + "loss/logits": 0.15981708467006683, + "step": 682 + }, + { + "epoch": 0.01019859638644169, + "grad_norm": 0.65625, + "grad_norm_var": 0.0016343275705973308, + "learning_rate": 2e-05, + "loss": 1.4047, + "loss/crossentropy": 2.532681941986084, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1953125, + "loss/idx": 12.0, + "loss/logits": 0.20941874384880066, + "step": 683 + }, + { + "epoch": 0.010213528445572644, + "grad_norm": 0.54296875, + "grad_norm_var": 0.0015393416086832681, + "learning_rate": 2e-05, + "loss": 1.1006, + "loss/crossentropy": 2.7314085960388184, + "loss/dist_ce": 0.0, + "loss/fcd": 0.97265625, + "loss/idx": 12.0, + "loss/logits": 0.12796571850776672, + "step": 684 + }, + { + "epoch": 0.010228460504703598, + "grad_norm": 0.515625, + "grad_norm_var": 0.0014809767405192058, + "learning_rate": 2e-05, + "loss": 1.1961, + "loss/crossentropy": 2.662325859069824, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0390625, + "loss/idx": 12.0, + "loss/logits": 0.15699967741966248, + "step": 685 + }, + { + "epoch": 0.010243392563834552, + "grad_norm": 0.62109375, + "grad_norm_var": 0.0015853246053059895, + "learning_rate": 2e-05, + "loss": 1.3114, + "loss/crossentropy": 2.2350664138793945, + "loss/dist_ce": 0.0, + "loss/fcd": 1.140625, + "loss/idx": 12.0, + "loss/logits": 0.17081522941589355, + "step": 686 + }, + { + "epoch": 0.010258324622965507, + "grad_norm": 0.53125, + "grad_norm_var": 0.0015807469685872396, + "learning_rate": 2e-05, + "loss": 1.2881, + "loss/crossentropy": 2.4432411193847656, + "loss/dist_ce": 0.0, + "loss/fcd": 1.109375, + "loss/idx": 12.0, + "loss/logits": 0.17870429158210754, + "step": 687 + }, + { + "epoch": 0.01027325668209646, + "grad_norm": 0.87109375, + "grad_norm_var": 0.0079498291015625, + "learning_rate": 2e-05, + "loss": 1.5581, + "loss/crossentropy": 2.6931755542755127, + "loss/dist_ce": 0.0, + "loss/fcd": 1.34375, + "loss/idx": 12.0, + "loss/logits": 0.2143464982509613, + "step": 688 + }, + { + "epoch": 0.010288188741227415, + "grad_norm": 0.58984375, + "grad_norm_var": 0.00791015625, + "learning_rate": 2e-05, + "loss": 1.298, + "loss/crossentropy": 2.6877715587615967, + "loss/dist_ce": 0.0, + "loss/fcd": 1.125, + "loss/idx": 12.0, + "loss/logits": 0.1730184555053711, + "step": 689 + }, + { + "epoch": 0.010303120800358369, + "grad_norm": 0.53515625, + "grad_norm_var": 0.007888730367024739, + "learning_rate": 2e-05, + "loss": 1.2563, + "loss/crossentropy": 2.4702701568603516, + "loss/dist_ce": 0.0, + "loss/fcd": 1.09375, + "loss/idx": 12.0, + "loss/logits": 0.1625899374485016, + "step": 690 + }, + { + "epoch": 0.010318052859489323, + "grad_norm": 0.56640625, + "grad_norm_var": 0.007883453369140625, + "learning_rate": 2e-05, + "loss": 1.3245, + "loss/crossentropy": 2.4865882396698, + "loss/dist_ce": 0.0, + "loss/fcd": 1.140625, + "loss/idx": 12.0, + "loss/logits": 0.1839236319065094, + "step": 691 + }, + { + "epoch": 0.010332984918620277, + "grad_norm": 0.498046875, + "grad_norm_var": 0.008251174290974935, + "learning_rate": 2e-05, + "loss": 1.2138, + "loss/crossentropy": 2.5186684131622314, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0546875, + "loss/idx": 12.0, + "loss/logits": 0.1591419279575348, + "step": 692 + }, + { + "epoch": 0.010347916977751231, + "grad_norm": 0.56640625, + "grad_norm_var": 0.008144744237263997, + "learning_rate": 2e-05, + "loss": 1.276, + "loss/crossentropy": 2.6817424297332764, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1015625, + "loss/idx": 12.0, + "loss/logits": 0.17447999119758606, + "step": 693 + }, + { + "epoch": 0.010362849036882187, + "grad_norm": 0.546875, + "grad_norm_var": 0.008113590876261394, + "learning_rate": 2e-05, + "loss": 1.3738, + "loss/crossentropy": 2.546079397201538, + "loss/dist_ce": 0.0, + "loss/fcd": 1.15625, + "loss/idx": 12.0, + "loss/logits": 0.21759110689163208, + "step": 694 + }, + { + "epoch": 0.010377781096013141, + "grad_norm": 0.57421875, + "grad_norm_var": 0.007648960749308268, + "learning_rate": 2e-05, + "loss": 1.2983, + "loss/crossentropy": 2.4528214931488037, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1171875, + "loss/idx": 12.0, + "loss/logits": 0.18111282587051392, + "step": 695 + }, + { + "epoch": 0.010392713155144095, + "grad_norm": 0.5546875, + "grad_norm_var": 0.007602421442667643, + "learning_rate": 2e-05, + "loss": 1.2616, + "loss/crossentropy": 2.686127185821533, + "loss/dist_ce": 0.0, + "loss/fcd": 1.09375, + "loss/idx": 12.0, + "loss/logits": 0.16781781613826752, + "step": 696 + }, + { + "epoch": 0.01040764521427505, + "grad_norm": 0.60546875, + "grad_norm_var": 0.007446018854777018, + "learning_rate": 2e-05, + "loss": 1.3101, + "loss/crossentropy": 2.54367995262146, + "loss/dist_ce": 0.0, + "loss/fcd": 1.140625, + "loss/idx": 12.0, + "loss/logits": 0.1694590151309967, + "step": 697 + }, + { + "epoch": 0.010422577273406003, + "grad_norm": 0.546875, + "grad_norm_var": 0.007515319188435872, + "learning_rate": 2e-05, + "loss": 1.2795, + "loss/crossentropy": 2.6796493530273438, + "loss/dist_ce": 0.0, + "loss/fcd": 1.109375, + "loss/idx": 12.0, + "loss/logits": 0.17017018795013428, + "step": 698 + }, + { + "epoch": 0.010437509332536957, + "grad_norm": 0.60546875, + "grad_norm_var": 0.007178099950154623, + "learning_rate": 2e-05, + "loss": 1.3181, + "loss/crossentropy": 2.4632835388183594, + "loss/dist_ce": 0.0, + "loss/fcd": 1.140625, + "loss/idx": 12.0, + "loss/logits": 0.1774502694606781, + "step": 699 + }, + { + "epoch": 0.010452441391667911, + "grad_norm": 0.515625, + "grad_norm_var": 0.007357899347941081, + "learning_rate": 2e-05, + "loss": 1.1805, + "loss/crossentropy": 2.715266466140747, + "loss/dist_ce": 0.0, + "loss/fcd": 1.03125, + "loss/idx": 12.0, + "loss/logits": 0.14927467703819275, + "step": 700 + }, + { + "epoch": 0.010467373450798866, + "grad_norm": 0.5390625, + "grad_norm_var": 0.007198063532511393, + "learning_rate": 2e-05, + "loss": 1.2262, + "loss/crossentropy": 2.36779522895813, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0703125, + "loss/idx": 12.0, + "loss/logits": 0.1558821052312851, + "step": 701 + }, + { + "epoch": 0.01048230550992982, + "grad_norm": 0.609375, + "grad_norm_var": 0.007141224543253581, + "learning_rate": 2e-05, + "loss": 1.3637, + "loss/crossentropy": 2.60080885887146, + "loss/dist_ce": 0.0, + "loss/fcd": 1.171875, + "loss/idx": 12.0, + "loss/logits": 0.19179855287075043, + "step": 702 + }, + { + "epoch": 0.010497237569060774, + "grad_norm": 0.55078125, + "grad_norm_var": 0.007042042414347331, + "learning_rate": 2e-05, + "loss": 1.2853, + "loss/crossentropy": 2.4730658531188965, + "loss/dist_ce": 0.0, + "loss/fcd": 1.109375, + "loss/idx": 12.0, + "loss/logits": 0.1759296953678131, + "step": 703 + }, + { + "epoch": 0.010512169628191728, + "grad_norm": 0.6171875, + "grad_norm_var": 0.001206827163696289, + "learning_rate": 2e-05, + "loss": 1.4148, + "loss/crossentropy": 2.071516990661621, + "loss/dist_ce": 0.0, + "loss/fcd": 1.21875, + "loss/idx": 12.0, + "loss/logits": 0.1960187554359436, + "step": 704 + }, + { + "epoch": 0.010527101687322682, + "grad_norm": 0.56640625, + "grad_norm_var": 0.0011599063873291016, + "learning_rate": 2e-05, + "loss": 1.3046, + "loss/crossentropy": 2.5084221363067627, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1328125, + "loss/idx": 12.0, + "loss/logits": 0.1718074232339859, + "step": 705 + }, + { + "epoch": 0.010542033746453636, + "grad_norm": 0.71484375, + "grad_norm_var": 0.0025256951649983723, + "learning_rate": 2e-05, + "loss": 1.4087, + "loss/crossentropy": 2.703789234161377, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1875, + "loss/idx": 12.0, + "loss/logits": 0.22122114896774292, + "step": 706 + }, + { + "epoch": 0.01055696580558459, + "grad_norm": 0.59375, + "grad_norm_var": 0.0025461673736572265, + "learning_rate": 2e-05, + "loss": 1.2491, + "loss/crossentropy": 2.3329367637634277, + "loss/dist_ce": 0.0, + "loss/fcd": 1.09375, + "loss/idx": 12.0, + "loss/logits": 0.15533186495304108, + "step": 707 + }, + { + "epoch": 0.010571897864715544, + "grad_norm": 0.52734375, + "grad_norm_var": 0.0022979736328125, + "learning_rate": 2e-05, + "loss": 1.2526, + "loss/crossentropy": 2.6226069927215576, + "loss/dist_ce": 0.0, + "loss/fcd": 1.09375, + "loss/idx": 12.0, + "loss/logits": 0.15883222222328186, + "step": 708 + }, + { + "epoch": 0.010586829923846498, + "grad_norm": 0.5625, + "grad_norm_var": 0.002304522196451823, + "learning_rate": 2e-05, + "loss": 1.2615, + "loss/crossentropy": 2.480863332748413, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1015625, + "loss/idx": 12.0, + "loss/logits": 0.15995028614997864, + "step": 709 + }, + { + "epoch": 0.010601761982977452, + "grad_norm": 0.53125, + "grad_norm_var": 0.002382342020670573, + "learning_rate": 2e-05, + "loss": 1.2916, + "loss/crossentropy": 2.587437868118286, + "loss/dist_ce": 0.0, + "loss/fcd": 1.109375, + "loss/idx": 12.0, + "loss/logits": 0.1821785271167755, + "step": 710 + }, + { + "epoch": 0.010616694042108406, + "grad_norm": 0.48828125, + "grad_norm_var": 0.002863502502441406, + "learning_rate": 2e-05, + "loss": 1.2136, + "loss/crossentropy": 2.6183738708496094, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0625, + "loss/idx": 12.0, + "loss/logits": 0.15106935799121857, + "step": 711 + }, + { + "epoch": 0.01063162610123936, + "grad_norm": 0.59765625, + "grad_norm_var": 0.002887980143229167, + "learning_rate": 2e-05, + "loss": 1.3275, + "loss/crossentropy": 2.7090811729431152, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1484375, + "loss/idx": 12.0, + "loss/logits": 0.17910704016685486, + "step": 712 + }, + { + "epoch": 0.010646558160370315, + "grad_norm": 0.5703125, + "grad_norm_var": 0.002814165751139323, + "learning_rate": 2e-05, + "loss": 1.4072, + "loss/crossentropy": 2.540616273880005, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1953125, + "loss/idx": 12.0, + "loss/logits": 0.21185210347175598, + "step": 713 + }, + { + "epoch": 0.010661490219501269, + "grad_norm": 0.59765625, + "grad_norm_var": 0.002811686197916667, + "learning_rate": 2e-05, + "loss": 1.3452, + "loss/crossentropy": 2.587371349334717, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1640625, + "loss/idx": 12.0, + "loss/logits": 0.18108756840229034, + "step": 714 + }, + { + "epoch": 0.010676422278632223, + "grad_norm": 0.51171875, + "grad_norm_var": 0.0029703776041666665, + "learning_rate": 2e-05, + "loss": 1.1809, + "loss/crossentropy": 2.543191909790039, + "loss/dist_ce": 0.0, + "loss/fcd": 1.03125, + "loss/idx": 12.0, + "loss/logits": 0.14962825179100037, + "step": 715 + }, + { + "epoch": 0.010691354337763177, + "grad_norm": 0.53125, + "grad_norm_var": 0.0028757731119791667, + "learning_rate": 2e-05, + "loss": 1.2791, + "loss/crossentropy": 2.650752544403076, + "loss/dist_ce": 0.0, + "loss/fcd": 1.109375, + "loss/idx": 12.0, + "loss/logits": 0.16970160603523254, + "step": 716 + }, + { + "epoch": 0.010706286396894131, + "grad_norm": 0.51953125, + "grad_norm_var": 0.0029784520467122395, + "learning_rate": 2e-05, + "loss": 1.2036, + "loss/crossentropy": 2.721712350845337, + "loss/dist_ce": 0.0, + "loss/fcd": 1.046875, + "loss/idx": 12.0, + "loss/logits": 0.15671955049037933, + "step": 717 + }, + { + "epoch": 0.010721218456025085, + "grad_norm": 0.490234375, + "grad_norm_var": 0.0032101790110270183, + "learning_rate": 2e-05, + "loss": 1.2164, + "loss/crossentropy": 2.4478020668029785, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0546875, + "loss/idx": 12.0, + "loss/logits": 0.16174601018428802, + "step": 718 + }, + { + "epoch": 0.01073615051515604, + "grad_norm": 0.54296875, + "grad_norm_var": 0.0032242933909098306, + "learning_rate": 2e-05, + "loss": 1.2431, + "loss/crossentropy": 2.5908939838409424, + "loss/dist_ce": 0.0, + "loss/fcd": 1.078125, + "loss/idx": 12.0, + "loss/logits": 0.1649816781282425, + "step": 719 + }, + { + "epoch": 0.010751082574286995, + "grad_norm": 0.6171875, + "grad_norm_var": 0.0032242933909098306, + "learning_rate": 2e-05, + "loss": 1.3963, + "loss/crossentropy": 2.453248977661133, + "loss/dist_ce": 0.0, + "loss/fcd": 1.203125, + "loss/idx": 12.0, + "loss/logits": 0.19315370917320251, + "step": 720 + }, + { + "epoch": 0.01076601463341795, + "grad_norm": 0.828125, + "grad_norm_var": 0.00772258440653483, + "learning_rate": 2e-05, + "loss": 1.3003, + "loss/crossentropy": 2.477731704711914, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1328125, + "loss/idx": 12.0, + "loss/logits": 0.16752782464027405, + "step": 721 + }, + { + "epoch": 0.010780946692548903, + "grad_norm": 0.67578125, + "grad_norm_var": 0.007097609837849935, + "learning_rate": 2e-05, + "loss": 1.4393, + "loss/crossentropy": 2.5516936779022217, + "loss/dist_ce": 0.0, + "loss/fcd": 1.2421875, + "loss/idx": 12.0, + "loss/logits": 0.1970784217119217, + "step": 722 + }, + { + "epoch": 0.010795878751679857, + "grad_norm": 0.62890625, + "grad_norm_var": 0.007266982396443685, + "learning_rate": 2e-05, + "loss": 1.4367, + "loss/crossentropy": 2.6101393699645996, + "loss/dist_ce": 0.0, + "loss/fcd": 1.2265625, + "loss/idx": 12.0, + "loss/logits": 0.21016988158226013, + "step": 723 + }, + { + "epoch": 0.010810810810810811, + "grad_norm": 0.50390625, + "grad_norm_var": 0.007454284032185872, + "learning_rate": 2e-05, + "loss": 1.2009, + "loss/crossentropy": 2.484208822250366, + "loss/dist_ce": 0.0, + "loss/fcd": 1.03125, + "loss/idx": 12.0, + "loss/logits": 0.1696794629096985, + "step": 724 + }, + { + "epoch": 0.010825742869941766, + "grad_norm": 0.5703125, + "grad_norm_var": 0.0074452559153238935, + "learning_rate": 2e-05, + "loss": 1.2156, + "loss/crossentropy": 2.6739721298217773, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0625, + "loss/idx": 12.0, + "loss/logits": 0.15314337611198425, + "step": 725 + }, + { + "epoch": 0.01084067492907272, + "grad_norm": 0.578125, + "grad_norm_var": 0.007307163874308268, + "learning_rate": 2e-05, + "loss": 1.3033, + "loss/crossentropy": 2.596822500228882, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1328125, + "loss/idx": 12.0, + "loss/logits": 0.17045846581459045, + "step": 726 + }, + { + "epoch": 0.010855606988203674, + "grad_norm": 0.57421875, + "grad_norm_var": 0.0067378838857014975, + "learning_rate": 2e-05, + "loss": 1.2591, + "loss/crossentropy": 2.6170504093170166, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1015625, + "loss/idx": 12.0, + "loss/logits": 0.1575794816017151, + "step": 727 + }, + { + "epoch": 0.010870539047334628, + "grad_norm": 0.5859375, + "grad_norm_var": 0.00672453244527181, + "learning_rate": 2e-05, + "loss": 1.3929, + "loss/crossentropy": 2.4130985736846924, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1953125, + "loss/idx": 12.0, + "loss/logits": 0.19758152961730957, + "step": 728 + }, + { + "epoch": 0.010885471106465582, + "grad_norm": 0.52734375, + "grad_norm_var": 0.006911961237589518, + "learning_rate": 2e-05, + "loss": 1.2616, + "loss/crossentropy": 2.508866786956787, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0859375, + "loss/idx": 12.0, + "loss/logits": 0.1756208837032318, + "step": 729 + }, + { + "epoch": 0.010900403165596536, + "grad_norm": 0.53125, + "grad_norm_var": 0.007033014297485351, + "learning_rate": 2e-05, + "loss": 1.1935, + "loss/crossentropy": 2.5010106563568115, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0390625, + "loss/idx": 12.0, + "loss/logits": 0.15440748631954193, + "step": 730 + }, + { + "epoch": 0.01091533522472749, + "grad_norm": 0.5234375, + "grad_norm_var": 0.006941080093383789, + "learning_rate": 2e-05, + "loss": 1.1836, + "loss/crossentropy": 2.571686029434204, + "loss/dist_ce": 0.0, + "loss/fcd": 1.03125, + "loss/idx": 12.0, + "loss/logits": 0.15230761468410492, + "step": 731 + }, + { + "epoch": 0.010930267283858444, + "grad_norm": 0.5078125, + "grad_norm_var": 0.007117700576782226, + "learning_rate": 2e-05, + "loss": 1.2025, + "loss/crossentropy": 2.565176486968994, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0546875, + "loss/idx": 12.0, + "loss/logits": 0.14785486459732056, + "step": 732 + }, + { + "epoch": 0.010945199342989398, + "grad_norm": 0.54296875, + "grad_norm_var": 0.006977701187133789, + "learning_rate": 2e-05, + "loss": 1.2446, + "loss/crossentropy": 2.555361747741699, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0859375, + "loss/idx": 12.0, + "loss/logits": 0.15867501497268677, + "step": 733 + }, + { + "epoch": 0.010960131402120352, + "grad_norm": 0.58203125, + "grad_norm_var": 0.006445058186848958, + "learning_rate": 2e-05, + "loss": 1.3327, + "loss/crossentropy": 2.6520895957946777, + "loss/dist_ce": 0.0, + "loss/fcd": 1.140625, + "loss/idx": 12.0, + "loss/logits": 0.19203567504882812, + "step": 734 + }, + { + "epoch": 0.010975063461251306, + "grad_norm": 0.515625, + "grad_norm_var": 0.006635983784993489, + "learning_rate": 2e-05, + "loss": 1.1915, + "loss/crossentropy": 2.4574670791625977, + "loss/dist_ce": 0.0, + "loss/fcd": 1.046875, + "loss/idx": 12.0, + "loss/logits": 0.14458885788917542, + "step": 735 + }, + { + "epoch": 0.01098999552038226, + "grad_norm": 0.5546875, + "grad_norm_var": 0.0065769831339518225, + "learning_rate": 2e-05, + "loss": 1.3282, + "loss/crossentropy": 2.5529794692993164, + "loss/dist_ce": 0.0, + "loss/fcd": 1.15625, + "loss/idx": 12.0, + "loss/logits": 0.17192694544792175, + "step": 736 + }, + { + "epoch": 0.011004927579513215, + "grad_norm": 0.63671875, + "grad_norm_var": 0.0024553934733072915, + "learning_rate": 2e-05, + "loss": 1.3139, + "loss/crossentropy": 2.5145184993743896, + "loss/dist_ce": 0.0, + "loss/fcd": 1.140625, + "loss/idx": 12.0, + "loss/logits": 0.17325331270694733, + "step": 737 + }, + { + "epoch": 0.011019859638644169, + "grad_norm": 0.70703125, + "grad_norm_var": 0.0029782613118489584, + "learning_rate": 2e-05, + "loss": 1.5417, + "loss/crossentropy": 2.3598456382751465, + "loss/dist_ce": 0.0, + "loss/fcd": 1.2890625, + "loss/idx": 12.0, + "loss/logits": 0.2526322901248932, + "step": 738 + }, + { + "epoch": 0.011034791697775123, + "grad_norm": 0.55859375, + "grad_norm_var": 0.0027058919270833335, + "learning_rate": 2e-05, + "loss": 1.341, + "loss/crossentropy": 2.66998291015625, + "loss/dist_ce": 0.0, + "loss/fcd": 1.140625, + "loss/idx": 12.0, + "loss/logits": 0.20035037398338318, + "step": 739 + }, + { + "epoch": 0.011049723756906077, + "grad_norm": 0.51171875, + "grad_norm_var": 0.0026486714680989585, + "learning_rate": 2e-05, + "loss": 1.2284, + "loss/crossentropy": 2.527078628540039, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0546875, + "loss/idx": 12.0, + "loss/logits": 0.17375284433364868, + "step": 740 + }, + { + "epoch": 0.011064655816037031, + "grad_norm": 0.53125, + "grad_norm_var": 0.0027058919270833335, + "learning_rate": 2e-05, + "loss": 1.2061, + "loss/crossentropy": 2.663687229156494, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0546875, + "loss/idx": 12.0, + "loss/logits": 0.15137803554534912, + "step": 741 + }, + { + "epoch": 0.011079587875167985, + "grad_norm": 0.58203125, + "grad_norm_var": 0.0027160008748372396, + "learning_rate": 2e-05, + "loss": 1.3247, + "loss/crossentropy": 2.5827627182006836, + "loss/dist_ce": 0.0, + "loss/fcd": 1.140625, + "loss/idx": 12.0, + "loss/logits": 0.1840566098690033, + "step": 742 + }, + { + "epoch": 0.01109451993429894, + "grad_norm": 0.52734375, + "grad_norm_var": 0.0027694066365559896, + "learning_rate": 2e-05, + "loss": 1.2818, + "loss/crossentropy": 2.684978723526001, + "loss/dist_ce": 0.0, + "loss/fcd": 1.109375, + "loss/idx": 12.0, + "loss/logits": 0.17245015501976013, + "step": 743 + }, + { + "epoch": 0.011109451993429893, + "grad_norm": 0.53125, + "grad_norm_var": 0.002751604715983073, + "learning_rate": 2e-05, + "loss": 1.2827, + "loss/crossentropy": 2.6512203216552734, + "loss/dist_ce": 0.0, + "loss/fcd": 1.109375, + "loss/idx": 12.0, + "loss/logits": 0.17336753010749817, + "step": 744 + }, + { + "epoch": 0.011124384052560847, + "grad_norm": 0.53125, + "grad_norm_var": 0.0027384440104166667, + "learning_rate": 2e-05, + "loss": 1.2578, + "loss/crossentropy": 2.4307868480682373, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1015625, + "loss/idx": 12.0, + "loss/logits": 0.15619587898254395, + "step": 745 + }, + { + "epoch": 0.011139316111691801, + "grad_norm": 0.51953125, + "grad_norm_var": 0.002783648173014323, + "learning_rate": 2e-05, + "loss": 1.1904, + "loss/crossentropy": 2.794666290283203, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0390625, + "loss/idx": 12.0, + "loss/logits": 0.1513822376728058, + "step": 746 + }, + { + "epoch": 0.011154248170822757, + "grad_norm": 0.5078125, + "grad_norm_var": 0.0028624852498372396, + "learning_rate": 2e-05, + "loss": 1.2677, + "loss/crossentropy": 2.5204267501831055, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1015625, + "loss/idx": 12.0, + "loss/logits": 0.16616524755954742, + "step": 747 + }, + { + "epoch": 0.011169180229953711, + "grad_norm": 0.48046875, + "grad_norm_var": 0.003073883056640625, + "learning_rate": 2e-05, + "loss": 1.1894, + "loss/crossentropy": 2.630366563796997, + "loss/dist_ce": 0.0, + "loss/fcd": 1.03125, + "loss/idx": 12.0, + "loss/logits": 0.1581723392009735, + "step": 748 + }, + { + "epoch": 0.011184112289084665, + "grad_norm": 0.51953125, + "grad_norm_var": 0.0031341552734375, + "learning_rate": 2e-05, + "loss": 1.2354, + "loss/crossentropy": 2.55263090133667, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0703125, + "loss/idx": 12.0, + "loss/logits": 0.16509470343589783, + "step": 749 + }, + { + "epoch": 0.01119904434821562, + "grad_norm": 0.478515625, + "grad_norm_var": 0.003359079360961914, + "learning_rate": 2e-05, + "loss": 1.1695, + "loss/crossentropy": 2.5358965396881104, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0234375, + "loss/idx": 12.0, + "loss/logits": 0.14609494805335999, + "step": 750 + }, + { + "epoch": 0.011213976407346574, + "grad_norm": 0.5625, + "grad_norm_var": 0.003323221206665039, + "learning_rate": 2e-05, + "loss": 1.3294, + "loss/crossentropy": 2.5020415782928467, + "loss/dist_ce": 0.0, + "loss/fcd": 1.140625, + "loss/idx": 12.0, + "loss/logits": 0.18877586722373962, + "step": 751 + }, + { + "epoch": 0.011228908466477528, + "grad_norm": 0.50390625, + "grad_norm_var": 0.0034273624420166015, + "learning_rate": 2e-05, + "loss": 1.2785, + "loss/crossentropy": 2.7053909301757812, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1015625, + "loss/idx": 12.0, + "loss/logits": 0.17688840627670288, + "step": 752 + }, + { + "epoch": 0.011243840525608482, + "grad_norm": 0.59375, + "grad_norm_var": 0.0030063470204671223, + "learning_rate": 2e-05, + "loss": 1.3516, + "loss/crossentropy": 2.440305471420288, + "loss/dist_ce": 0.0, + "loss/fcd": 1.15625, + "loss/idx": 12.0, + "loss/logits": 0.1953657865524292, + "step": 753 + }, + { + "epoch": 0.011258772584739436, + "grad_norm": 0.62890625, + "grad_norm_var": 0.0016521294911702475, + "learning_rate": 2e-05, + "loss": 1.3206, + "loss/crossentropy": 2.6588921546936035, + "loss/dist_ce": 0.0, + "loss/fcd": 1.140625, + "loss/idx": 12.0, + "loss/logits": 0.17999888956546783, + "step": 754 + }, + { + "epoch": 0.01127370464387039, + "grad_norm": 0.55859375, + "grad_norm_var": 0.0016521294911702475, + "learning_rate": 2e-05, + "loss": 1.1845, + "loss/crossentropy": 2.6511154174804688, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0390625, + "loss/idx": 12.0, + "loss/logits": 0.14538779854774475, + "step": 755 + }, + { + "epoch": 0.011288636703001344, + "grad_norm": 0.50390625, + "grad_norm_var": 0.001680739720662435, + "learning_rate": 2e-05, + "loss": 1.2223, + "loss/crossentropy": 2.579488754272461, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0625, + "loss/idx": 12.0, + "loss/logits": 0.15982869267463684, + "step": 756 + }, + { + "epoch": 0.011303568762132298, + "grad_norm": 0.51171875, + "grad_norm_var": 0.0017144362131754558, + "learning_rate": 2e-05, + "loss": 1.208, + "loss/crossentropy": 2.4899256229400635, + "loss/dist_ce": 0.0, + "loss/fcd": 1.046875, + "loss/idx": 12.0, + "loss/logits": 0.16107669472694397, + "step": 757 + }, + { + "epoch": 0.011318500821263252, + "grad_norm": 0.5625, + "grad_norm_var": 0.001612710952758789, + "learning_rate": 2e-05, + "loss": 1.2585, + "loss/crossentropy": 2.424293279647827, + "loss/dist_ce": 0.0, + "loss/fcd": 1.09375, + "loss/idx": 12.0, + "loss/logits": 0.16471582651138306, + "step": 758 + }, + { + "epoch": 0.011333432880394206, + "grad_norm": 0.58984375, + "grad_norm_var": 0.0018131097157796225, + "learning_rate": 2e-05, + "loss": 1.2444, + "loss/crossentropy": 2.6930503845214844, + "loss/dist_ce": 0.0, + "loss/fcd": 1.078125, + "loss/idx": 12.0, + "loss/logits": 0.1662687361240387, + "step": 759 + }, + { + "epoch": 0.01134836493952516, + "grad_norm": 0.55859375, + "grad_norm_var": 0.0018407026926676431, + "learning_rate": 2e-05, + "loss": 1.1926, + "loss/crossentropy": 2.4682953357696533, + "loss/dist_ce": 0.0, + "loss/fcd": 1.046875, + "loss/idx": 12.0, + "loss/logits": 0.14569604396820068, + "step": 760 + }, + { + "epoch": 0.011363296998656115, + "grad_norm": 0.671875, + "grad_norm_var": 0.002946201960245768, + "learning_rate": 2e-05, + "loss": 1.3256, + "loss/crossentropy": 2.677440643310547, + "loss/dist_ce": 0.0, + "loss/fcd": 1.140625, + "loss/idx": 12.0, + "loss/logits": 0.1850149929523468, + "step": 761 + }, + { + "epoch": 0.011378229057787069, + "grad_norm": 0.490234375, + "grad_norm_var": 0.003107134501139323, + "learning_rate": 2e-05, + "loss": 1.1689, + "loss/crossentropy": 2.6096267700195312, + "loss/dist_ce": 0.0, + "loss/fcd": 1.015625, + "loss/idx": 12.0, + "loss/logits": 0.15331940352916718, + "step": 762 + }, + { + "epoch": 0.011393161116918023, + "grad_norm": 0.515625, + "grad_norm_var": 0.003072039286295573, + "learning_rate": 2e-05, + "loss": 1.1372, + "loss/crossentropy": 2.6900992393493652, + "loss/dist_ce": 0.0, + "loss/fcd": 0.984375, + "loss/idx": 12.0, + "loss/logits": 0.15282993018627167, + "step": 763 + }, + { + "epoch": 0.011408093176048977, + "grad_norm": 0.48828125, + "grad_norm_var": 0.003007952372233073, + "learning_rate": 2e-05, + "loss": 1.2392, + "loss/crossentropy": 2.6735024452209473, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0703125, + "loss/idx": 12.0, + "loss/logits": 0.16892734169960022, + "step": 764 + }, + { + "epoch": 0.011423025235179931, + "grad_norm": 0.53515625, + "grad_norm_var": 0.0029677708943684896, + "learning_rate": 2e-05, + "loss": 1.2317, + "loss/crossentropy": 2.434821367263794, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0703125, + "loss/idx": 12.0, + "loss/logits": 0.16139961779117584, + "step": 765 + }, + { + "epoch": 0.011437957294310885, + "grad_norm": 0.51953125, + "grad_norm_var": 0.00269773801167806, + "learning_rate": 2e-05, + "loss": 1.2853, + "loss/crossentropy": 2.514920234680176, + "loss/dist_ce": 0.0, + "loss/fcd": 1.125, + "loss/idx": 12.0, + "loss/logits": 0.160325825214386, + "step": 766 + }, + { + "epoch": 0.011452889353441839, + "grad_norm": 0.5625, + "grad_norm_var": 0.00269773801167806, + "learning_rate": 2e-05, + "loss": 1.256, + "loss/crossentropy": 2.647106409072876, + "loss/dist_ce": 0.0, + "loss/fcd": 1.078125, + "loss/idx": 12.0, + "loss/logits": 0.17788350582122803, + "step": 767 + }, + { + "epoch": 0.011467821412572793, + "grad_norm": 0.62890625, + "grad_norm_var": 0.00291136105855306, + "learning_rate": 2e-05, + "loss": 1.2025, + "loss/crossentropy": 2.546200752258301, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0546875, + "loss/idx": 12.0, + "loss/logits": 0.14780209958553314, + "step": 768 + }, + { + "epoch": 0.011482753471703747, + "grad_norm": 0.515625, + "grad_norm_var": 0.002915175755818685, + "learning_rate": 2e-05, + "loss": 1.2694, + "loss/crossentropy": 2.6620922088623047, + "loss/dist_ce": 0.0, + "loss/fcd": 1.09375, + "loss/idx": 12.0, + "loss/logits": 0.17561113834381104, + "step": 769 + }, + { + "epoch": 0.011497685530834701, + "grad_norm": 0.55078125, + "grad_norm_var": 0.0025019168853759764, + "learning_rate": 2e-05, + "loss": 1.2409, + "loss/crossentropy": 2.733931064605713, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0859375, + "loss/idx": 12.0, + "loss/logits": 0.15500226616859436, + "step": 770 + }, + { + "epoch": 0.011512617589965655, + "grad_norm": 0.515625, + "grad_norm_var": 0.002555068333943685, + "learning_rate": 2e-05, + "loss": 1.1855, + "loss/crossentropy": 2.6056385040283203, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0390625, + "loss/idx": 12.0, + "loss/logits": 0.14647497236728668, + "step": 771 + }, + { + "epoch": 0.01152754964909661, + "grad_norm": 0.5625, + "grad_norm_var": 0.002448256810506185, + "learning_rate": 2e-05, + "loss": 1.2866, + "loss/crossentropy": 2.3778257369995117, + "loss/dist_ce": 0.0, + "loss/fcd": 1.125, + "loss/idx": 12.0, + "loss/logits": 0.1616000533103943, + "step": 772 + }, + { + "epoch": 0.011542481708227565, + "grad_norm": 0.51953125, + "grad_norm_var": 0.0024135430653889974, + "learning_rate": 2e-05, + "loss": 1.2556, + "loss/crossentropy": 2.5326895713806152, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0859375, + "loss/idx": 12.0, + "loss/logits": 0.1696186363697052, + "step": 773 + }, + { + "epoch": 0.01155741376735852, + "grad_norm": 0.64453125, + "grad_norm_var": 0.0029796441396077473, + "learning_rate": 2e-05, + "loss": 1.4358, + "loss/crossentropy": 2.400148630142212, + "loss/dist_ce": 0.0, + "loss/fcd": 1.2421875, + "loss/idx": 12.0, + "loss/logits": 0.19359630346298218, + "step": 774 + }, + { + "epoch": 0.011572345826489474, + "grad_norm": 0.60546875, + "grad_norm_var": 0.003068908055623372, + "learning_rate": 2e-05, + "loss": 1.2743, + "loss/crossentropy": 2.5815742015838623, + "loss/dist_ce": 0.0, + "loss/fcd": 1.109375, + "loss/idx": 12.0, + "loss/logits": 0.164922833442688, + "step": 775 + }, + { + "epoch": 0.011587277885620428, + "grad_norm": 0.494140625, + "grad_norm_var": 0.0033002217610677083, + "learning_rate": 2e-05, + "loss": 1.1144, + "loss/crossentropy": 2.582948684692383, + "loss/dist_ce": 0.0, + "loss/fcd": 0.98046875, + "loss/idx": 12.0, + "loss/logits": 0.13390488922595978, + "step": 776 + }, + { + "epoch": 0.011602209944751382, + "grad_norm": 0.78515625, + "grad_norm_var": 0.005923906962076823, + "learning_rate": 2e-05, + "loss": 1.4428, + "loss/crossentropy": 2.4833004474639893, + "loss/dist_ce": 0.0, + "loss/fcd": 1.234375, + "loss/idx": 12.0, + "loss/logits": 0.20838040113449097, + "step": 777 + }, + { + "epoch": 0.011617142003882336, + "grad_norm": 0.5390625, + "grad_norm_var": 0.0056294600168863935, + "learning_rate": 2e-05, + "loss": 1.2143, + "loss/crossentropy": 2.672031879425049, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0546875, + "loss/idx": 12.0, + "loss/logits": 0.15958479046821594, + "step": 778 + }, + { + "epoch": 0.01163207406301329, + "grad_norm": 0.53125, + "grad_norm_var": 0.0055493513743082685, + "learning_rate": 2e-05, + "loss": 1.2599, + "loss/crossentropy": 2.485051155090332, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0859375, + "loss/idx": 12.0, + "loss/logits": 0.17396780848503113, + "step": 779 + }, + { + "epoch": 0.011647006122144244, + "grad_norm": 0.515625, + "grad_norm_var": 0.005325937271118164, + "learning_rate": 2e-05, + "loss": 1.1861, + "loss/crossentropy": 2.4769065380096436, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0390625, + "loss/idx": 12.0, + "loss/logits": 0.14706002175807953, + "step": 780 + }, + { + "epoch": 0.011661938181275198, + "grad_norm": 0.5234375, + "grad_norm_var": 0.005379724502563477, + "learning_rate": 2e-05, + "loss": 1.2594, + "loss/crossentropy": 2.6715643405914307, + "loss/dist_ce": 0.0, + "loss/fcd": 1.09375, + "loss/idx": 12.0, + "loss/logits": 0.1656656712293625, + "step": 781 + }, + { + "epoch": 0.011676870240406152, + "grad_norm": 0.51171875, + "grad_norm_var": 0.005429188410441081, + "learning_rate": 2e-05, + "loss": 1.2034, + "loss/crossentropy": 2.6194474697113037, + "loss/dist_ce": 0.0, + "loss/fcd": 1.046875, + "loss/idx": 12.0, + "loss/logits": 0.1565479040145874, + "step": 782 + }, + { + "epoch": 0.011691802299537106, + "grad_norm": 0.5703125, + "grad_norm_var": 0.005432621637980143, + "learning_rate": 2e-05, + "loss": 1.2735, + "loss/crossentropy": 2.634058713912964, + "loss/dist_ce": 0.0, + "loss/fcd": 1.109375, + "loss/idx": 12.0, + "loss/logits": 0.16408279538154602, + "step": 783 + }, + { + "epoch": 0.01170673435866806, + "grad_norm": 0.498046875, + "grad_norm_var": 0.005359141031901041, + "learning_rate": 2e-05, + "loss": 1.2094, + "loss/crossentropy": 2.4988787174224854, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0625, + "loss/idx": 12.0, + "loss/logits": 0.14692716300487518, + "step": 784 + }, + { + "epoch": 0.011721666417799015, + "grad_norm": 0.482421875, + "grad_norm_var": 0.005603138605753581, + "learning_rate": 2e-05, + "loss": 1.1987, + "loss/crossentropy": 2.6358911991119385, + "loss/dist_ce": 0.0, + "loss/fcd": 1.046875, + "loss/idx": 12.0, + "loss/logits": 0.15186846256256104, + "step": 785 + }, + { + "epoch": 0.011736598476929969, + "grad_norm": 0.48828125, + "grad_norm_var": 0.005866607030232747, + "learning_rate": 2e-05, + "loss": 1.115, + "loss/crossentropy": 2.591740131378174, + "loss/dist_ce": 0.0, + "loss/fcd": 0.98046875, + "loss/idx": 12.0, + "loss/logits": 0.13453420996665955, + "step": 786 + }, + { + "epoch": 0.011751530536060923, + "grad_norm": 0.546875, + "grad_norm_var": 0.005787769953409831, + "learning_rate": 2e-05, + "loss": 1.2283, + "loss/crossentropy": 2.444399833679199, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0703125, + "loss/idx": 12.0, + "loss/logits": 0.15801313519477844, + "step": 787 + }, + { + "epoch": 0.011766462595191877, + "grad_norm": 0.5625, + "grad_norm_var": 0.005787769953409831, + "learning_rate": 2e-05, + "loss": 1.3204, + "loss/crossentropy": 2.599078416824341, + "loss/dist_ce": 0.0, + "loss/fcd": 1.125, + "loss/idx": 12.0, + "loss/logits": 0.19537828862667084, + "step": 788 + }, + { + "epoch": 0.011781394654322831, + "grad_norm": 0.6875, + "grad_norm_var": 0.006843042373657226, + "learning_rate": 2e-05, + "loss": 1.3781, + "loss/crossentropy": 2.5863723754882812, + "loss/dist_ce": 0.0, + "loss/fcd": 1.203125, + "loss/idx": 12.0, + "loss/logits": 0.174960196018219, + "step": 789 + }, + { + "epoch": 0.011796326713453785, + "grad_norm": 0.53515625, + "grad_norm_var": 0.006381972630818685, + "learning_rate": 2e-05, + "loss": 1.2814, + "loss/crossentropy": 2.3581089973449707, + "loss/dist_ce": 0.0, + "loss/fcd": 1.09375, + "loss/idx": 12.0, + "loss/logits": 0.18761396408081055, + "step": 790 + }, + { + "epoch": 0.011811258772584739, + "grad_norm": 0.490234375, + "grad_norm_var": 0.006433550516764323, + "learning_rate": 2e-05, + "loss": 1.2193, + "loss/crossentropy": 2.3400156497955322, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0625, + "loss/idx": 12.0, + "loss/logits": 0.15682953596115112, + "step": 791 + }, + { + "epoch": 0.011826190831715693, + "grad_norm": 0.58984375, + "grad_norm_var": 0.0063237349192301435, + "learning_rate": 2e-05, + "loss": 1.2358, + "loss/crossentropy": 2.597322702407837, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0703125, + "loss/idx": 12.0, + "loss/logits": 0.16544011235237122, + "step": 792 + }, + { + "epoch": 0.011841122890846647, + "grad_norm": 0.546875, + "grad_norm_var": 0.002515268325805664, + "learning_rate": 2e-05, + "loss": 1.255, + "loss/crossentropy": 2.618713617324829, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0859375, + "loss/idx": 12.0, + "loss/logits": 0.16905856132507324, + "step": 793 + }, + { + "epoch": 0.011856054949977601, + "grad_norm": 0.498046875, + "grad_norm_var": 0.002618408203125, + "learning_rate": 2e-05, + "loss": 1.2597, + "loss/crossentropy": 2.586865186691284, + "loss/dist_ce": 0.0, + "loss/fcd": 1.09375, + "loss/idx": 12.0, + "loss/logits": 0.16593782603740692, + "step": 794 + }, + { + "epoch": 0.011870987009108555, + "grad_norm": 0.62109375, + "grad_norm_var": 0.003064409891764323, + "learning_rate": 2e-05, + "loss": 1.4704, + "loss/crossentropy": 2.281075954437256, + "loss/dist_ce": 0.0, + "loss/fcd": 1.265625, + "loss/idx": 12.0, + "loss/logits": 0.2047278881072998, + "step": 795 + }, + { + "epoch": 0.01188591906823951, + "grad_norm": 0.53125, + "grad_norm_var": 0.003025245666503906, + "learning_rate": 2e-05, + "loss": 1.2074, + "loss/crossentropy": 2.56146502494812, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0546875, + "loss/idx": 12.0, + "loss/logits": 0.1527431160211563, + "step": 796 + }, + { + "epoch": 0.011900851127370464, + "grad_norm": 0.6328125, + "grad_norm_var": 0.003491655985514323, + "learning_rate": 2e-05, + "loss": 1.3282, + "loss/crossentropy": 2.544595241546631, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1484375, + "loss/idx": 12.0, + "loss/logits": 0.17975857853889465, + "step": 797 + }, + { + "epoch": 0.011915783186501418, + "grad_norm": 0.5234375, + "grad_norm_var": 0.003441111246744792, + "learning_rate": 2e-05, + "loss": 1.1355, + "loss/crossentropy": 2.5778257846832275, + "loss/dist_ce": 0.0, + "loss/fcd": 0.99609375, + "loss/idx": 12.0, + "loss/logits": 0.1393672674894333, + "step": 798 + }, + { + "epoch": 0.011930715245632374, + "grad_norm": 0.462890625, + "grad_norm_var": 0.003875589370727539, + "learning_rate": 2e-05, + "loss": 1.1403, + "loss/crossentropy": 2.7800252437591553, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0, + "loss/idx": 12.0, + "loss/logits": 0.1403147280216217, + "step": 799 + }, + { + "epoch": 0.011945647304763328, + "grad_norm": 0.58984375, + "grad_norm_var": 0.003844960530598958, + "learning_rate": 2e-05, + "loss": 1.3118, + "loss/crossentropy": 2.5456595420837402, + "loss/dist_ce": 0.0, + "loss/fcd": 1.125, + "loss/idx": 12.0, + "loss/logits": 0.18681630492210388, + "step": 800 + }, + { + "epoch": 0.011960579363894282, + "grad_norm": 0.53125, + "grad_norm_var": 0.0035584608713785807, + "learning_rate": 2e-05, + "loss": 1.2338, + "loss/crossentropy": 2.46443247795105, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0703125, + "loss/idx": 12.0, + "loss/logits": 0.16349388659000397, + "step": 801 + }, + { + "epoch": 0.011975511423025236, + "grad_norm": 0.53125, + "grad_norm_var": 0.0033066908518473307, + "learning_rate": 2e-05, + "loss": 1.2967, + "loss/crossentropy": 2.6430821418762207, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1171875, + "loss/idx": 12.0, + "loss/logits": 0.17946891486644745, + "step": 802 + }, + { + "epoch": 0.01199044348215619, + "grad_norm": 0.48828125, + "grad_norm_var": 0.0035851637522379557, + "learning_rate": 2e-05, + "loss": 1.1305, + "loss/crossentropy": 2.51657772064209, + "loss/dist_ce": 0.0, + "loss/fcd": 0.9921875, + "loss/idx": 12.0, + "loss/logits": 0.13827310502529144, + "step": 803 + }, + { + "epoch": 0.012005375541287144, + "grad_norm": 0.55078125, + "grad_norm_var": 0.0035763899485270183, + "learning_rate": 2e-05, + "loss": 1.2621, + "loss/crossentropy": 2.462827205657959, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1015625, + "loss/idx": 12.0, + "loss/logits": 0.16057859361171722, + "step": 804 + }, + { + "epoch": 0.012020307600418098, + "grad_norm": 0.482421875, + "grad_norm_var": 0.002463213602701823, + "learning_rate": 2e-05, + "loss": 1.1779, + "loss/crossentropy": 2.651974678039551, + "loss/dist_ce": 0.0, + "loss/fcd": 1.03125, + "loss/idx": 12.0, + "loss/logits": 0.1466044932603836, + "step": 805 + }, + { + "epoch": 0.012035239659549052, + "grad_norm": 0.486328125, + "grad_norm_var": 0.002629709243774414, + "learning_rate": 2e-05, + "loss": 1.1487, + "loss/crossentropy": 2.6436069011688232, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0, + "loss/idx": 12.0, + "loss/logits": 0.1487184464931488, + "step": 806 + }, + { + "epoch": 0.012050171718680006, + "grad_norm": 0.5078125, + "grad_norm_var": 0.002544593811035156, + "learning_rate": 2e-05, + "loss": 1.1571, + "loss/crossentropy": 2.6559572219848633, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0078125, + "loss/idx": 12.0, + "loss/logits": 0.14927107095718384, + "step": 807 + }, + { + "epoch": 0.01206510377781096, + "grad_norm": 0.5703125, + "grad_norm_var": 0.0024279276529947918, + "learning_rate": 2e-05, + "loss": 1.2641, + "loss/crossentropy": 2.5706608295440674, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1015625, + "loss/idx": 12.0, + "loss/logits": 0.16250476241111755, + "step": 808 + }, + { + "epoch": 0.012080035836941914, + "grad_norm": 0.55859375, + "grad_norm_var": 0.002455584208170573, + "learning_rate": 2e-05, + "loss": 1.2112, + "loss/crossentropy": 2.501370906829834, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0546875, + "loss/idx": 12.0, + "loss/logits": 0.15652695298194885, + "step": 809 + }, + { + "epoch": 0.012094967896072869, + "grad_norm": 0.5390625, + "grad_norm_var": 0.002356449762980143, + "learning_rate": 2e-05, + "loss": 1.2613, + "loss/crossentropy": 2.7686879634857178, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0859375, + "loss/idx": 12.0, + "loss/logits": 0.17538884282112122, + "step": 810 + }, + { + "epoch": 0.012109899955203823, + "grad_norm": 0.5546875, + "grad_norm_var": 0.0018960158030192056, + "learning_rate": 2e-05, + "loss": 1.2619, + "loss/crossentropy": 2.6589901447296143, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0859375, + "loss/idx": 12.0, + "loss/logits": 0.17594030499458313, + "step": 811 + }, + { + "epoch": 0.012124832014334777, + "grad_norm": 0.578125, + "grad_norm_var": 0.0020173231760660807, + "learning_rate": 2e-05, + "loss": 1.2311, + "loss/crossentropy": 2.3903369903564453, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0703125, + "loss/idx": 12.0, + "loss/logits": 0.16083644330501556, + "step": 812 + }, + { + "epoch": 0.01213976407346573, + "grad_norm": 0.53515625, + "grad_norm_var": 0.0013624668121337891, + "learning_rate": 2e-05, + "loss": 1.3336, + "loss/crossentropy": 2.5818023681640625, + "loss/dist_ce": 0.0, + "loss/fcd": 1.140625, + "loss/idx": 12.0, + "loss/logits": 0.19297264516353607, + "step": 813 + }, + { + "epoch": 0.012154696132596685, + "grad_norm": 0.6796875, + "grad_norm_var": 0.0027383009592692057, + "learning_rate": 2e-05, + "loss": 1.3454, + "loss/crossentropy": 2.5759518146514893, + "loss/dist_ce": 0.0, + "loss/fcd": 1.171875, + "loss/idx": 12.0, + "loss/logits": 0.17351481318473816, + "step": 814 + }, + { + "epoch": 0.012169628191727639, + "grad_norm": 0.5234375, + "grad_norm_var": 0.0023416519165039063, + "learning_rate": 2e-05, + "loss": 1.2492, + "loss/crossentropy": 2.6695284843444824, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0859375, + "loss/idx": 12.0, + "loss/logits": 0.16327539086341858, + "step": 815 + }, + { + "epoch": 0.012184560250858593, + "grad_norm": 0.53515625, + "grad_norm_var": 0.0021956761678059897, + "learning_rate": 2e-05, + "loss": 1.3266, + "loss/crossentropy": 2.319685459136963, + "loss/dist_ce": 0.0, + "loss/fcd": 1.140625, + "loss/idx": 12.0, + "loss/logits": 0.18595829606056213, + "step": 816 + }, + { + "epoch": 0.012199492309989547, + "grad_norm": 0.515625, + "grad_norm_var": 0.0022307713826497395, + "learning_rate": 2e-05, + "loss": 1.2435, + "loss/crossentropy": 2.4563002586364746, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0859375, + "loss/idx": 12.0, + "loss/logits": 0.15760290622711182, + "step": 817 + }, + { + "epoch": 0.012214424369120501, + "grad_norm": 0.58203125, + "grad_norm_var": 0.0023340861002604167, + "learning_rate": 2e-05, + "loss": 1.324, + "loss/crossentropy": 2.5283279418945312, + "loss/dist_ce": 0.0, + "loss/fcd": 1.15625, + "loss/idx": 12.0, + "loss/logits": 0.16777649521827698, + "step": 818 + }, + { + "epoch": 0.012229356428251455, + "grad_norm": 0.515625, + "grad_norm_var": 0.0021814346313476563, + "learning_rate": 2e-05, + "loss": 1.2184, + "loss/crossentropy": 2.776594638824463, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0625, + "loss/idx": 12.0, + "loss/logits": 0.15586011111736298, + "step": 819 + }, + { + "epoch": 0.01224428848738241, + "grad_norm": 0.5078125, + "grad_norm_var": 0.002261861165364583, + "learning_rate": 2e-05, + "loss": 1.2115, + "loss/crossentropy": 2.5414376258850098, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0546875, + "loss/idx": 12.0, + "loss/logits": 0.15676796436309814, + "step": 820 + }, + { + "epoch": 0.012259220546513364, + "grad_norm": 0.49609375, + "grad_norm_var": 0.0021649519602457683, + "learning_rate": 2e-05, + "loss": 1.2741, + "loss/crossentropy": 2.4355485439300537, + "loss/dist_ce": 0.0, + "loss/fcd": 1.109375, + "loss/idx": 12.0, + "loss/logits": 0.16474318504333496, + "step": 821 + }, + { + "epoch": 0.012274152605644318, + "grad_norm": 0.50390625, + "grad_norm_var": 0.0020517985026041667, + "learning_rate": 2e-05, + "loss": 1.1965, + "loss/crossentropy": 2.4327552318573, + "loss/dist_ce": 0.0, + "loss/fcd": 1.046875, + "loss/idx": 12.0, + "loss/logits": 0.1496448963880539, + "step": 822 + }, + { + "epoch": 0.012289084664775272, + "grad_norm": 0.55859375, + "grad_norm_var": 0.0019683202107747396, + "learning_rate": 2e-05, + "loss": 1.3843, + "loss/crossentropy": 2.559605360031128, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1875, + "loss/idx": 12.0, + "loss/logits": 0.19683653116226196, + "step": 823 + }, + { + "epoch": 0.012304016723906226, + "grad_norm": 0.5234375, + "grad_norm_var": 0.0019606908162434896, + "learning_rate": 2e-05, + "loss": 1.2149, + "loss/crossentropy": 2.5897631645202637, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0546875, + "loss/idx": 12.0, + "loss/logits": 0.1601862609386444, + "step": 824 + }, + { + "epoch": 0.012318948783037182, + "grad_norm": 0.55859375, + "grad_norm_var": 0.0019606908162434896, + "learning_rate": 2e-05, + "loss": 1.2356, + "loss/crossentropy": 2.696868658065796, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0703125, + "loss/idx": 12.0, + "loss/logits": 0.16526451706886292, + "step": 825 + }, + { + "epoch": 0.012333880842168136, + "grad_norm": 0.6484375, + "grad_norm_var": 0.0026336034138997396, + "learning_rate": 2e-05, + "loss": 1.303, + "loss/crossentropy": 2.709047794342041, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1328125, + "loss/idx": 12.0, + "loss/logits": 0.17016342282295227, + "step": 826 + }, + { + "epoch": 0.01234881290129909, + "grad_norm": 0.54296875, + "grad_norm_var": 0.0026364644368489583, + "learning_rate": 2e-05, + "loss": 1.2634, + "loss/crossentropy": 2.4290895462036133, + "loss/dist_ce": 0.0, + "loss/fcd": 1.09375, + "loss/idx": 12.0, + "loss/logits": 0.16962262988090515, + "step": 827 + }, + { + "epoch": 0.012363744960430044, + "grad_norm": 0.515625, + "grad_norm_var": 0.0026486714680989585, + "learning_rate": 2e-05, + "loss": 1.2122, + "loss/crossentropy": 2.5961177349090576, + "loss/dist_ce": 0.0, + "loss/fcd": 1.046875, + "loss/idx": 12.0, + "loss/logits": 0.16535454988479614, + "step": 828 + }, + { + "epoch": 0.012378677019560998, + "grad_norm": 0.5390625, + "grad_norm_var": 0.002643775939941406, + "learning_rate": 2e-05, + "loss": 1.2599, + "loss/crossentropy": 2.601069211959839, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1015625, + "loss/idx": 12.0, + "loss/logits": 0.15833136439323425, + "step": 829 + }, + { + "epoch": 0.012393609078691952, + "grad_norm": 0.515625, + "grad_norm_var": 0.0014154434204101563, + "learning_rate": 2e-05, + "loss": 1.2341, + "loss/crossentropy": 2.6840217113494873, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0703125, + "loss/idx": 12.0, + "loss/logits": 0.16378390789031982, + "step": 830 + }, + { + "epoch": 0.012408541137822906, + "grad_norm": 0.59375, + "grad_norm_var": 0.0016031265258789062, + "learning_rate": 2e-05, + "loss": 1.3101, + "loss/crossentropy": 2.6370441913604736, + "loss/dist_ce": 0.0, + "loss/fcd": 1.125, + "loss/idx": 12.0, + "loss/logits": 0.18508955836296082, + "step": 831 + }, + { + "epoch": 0.01242347319695386, + "grad_norm": 0.57421875, + "grad_norm_var": 0.0016692479451497395, + "learning_rate": 2e-05, + "loss": 1.3561, + "loss/crossentropy": 2.3607304096221924, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1796875, + "loss/idx": 12.0, + "loss/logits": 0.1763923168182373, + "step": 832 + }, + { + "epoch": 0.012438405256084814, + "grad_norm": 0.482421875, + "grad_norm_var": 0.0018602848052978516, + "learning_rate": 2e-05, + "loss": 1.1611, + "loss/crossentropy": 2.4794504642486572, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0234375, + "loss/idx": 12.0, + "loss/logits": 0.13761460781097412, + "step": 833 + }, + { + "epoch": 0.012453337315215769, + "grad_norm": 0.53515625, + "grad_norm_var": 0.0017420291900634766, + "learning_rate": 2e-05, + "loss": 1.3138, + "loss/crossentropy": 2.515113353729248, + "loss/dist_ce": 0.0, + "loss/fcd": 1.125, + "loss/idx": 12.0, + "loss/logits": 0.18878057599067688, + "step": 834 + }, + { + "epoch": 0.012468269374346723, + "grad_norm": 0.5703125, + "grad_norm_var": 0.0017642815907796224, + "learning_rate": 2e-05, + "loss": 1.3528, + "loss/crossentropy": 2.7300655841827393, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1484375, + "loss/idx": 12.0, + "loss/logits": 0.2043353170156479, + "step": 835 + }, + { + "epoch": 0.012483201433477677, + "grad_norm": 0.50390625, + "grad_norm_var": 0.001782846450805664, + "learning_rate": 2e-05, + "loss": 1.1872, + "loss/crossentropy": 2.455660104751587, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0390625, + "loss/idx": 12.0, + "loss/logits": 0.14813506603240967, + "step": 836 + }, + { + "epoch": 0.01249813349260863, + "grad_norm": 0.5078125, + "grad_norm_var": 0.0017206668853759766, + "learning_rate": 2e-05, + "loss": 1.2639, + "loss/crossentropy": 2.3937854766845703, + "loss/dist_ce": 0.0, + "loss/fcd": 1.109375, + "loss/idx": 12.0, + "loss/logits": 0.1545325219631195, + "step": 837 + }, + { + "epoch": 0.012513065551739585, + "grad_norm": 0.52734375, + "grad_norm_var": 0.001635599136352539, + "learning_rate": 2e-05, + "loss": 1.201, + "loss/crossentropy": 2.608152389526367, + "loss/dist_ce": 0.0, + "loss/fcd": 1.046875, + "loss/idx": 12.0, + "loss/logits": 0.15416556596755981, + "step": 838 + }, + { + "epoch": 0.012527997610870539, + "grad_norm": 0.6171875, + "grad_norm_var": 0.0019674777984619142, + "learning_rate": 2e-05, + "loss": 1.3746, + "loss/crossentropy": 2.6124019622802734, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1796875, + "loss/idx": 12.0, + "loss/logits": 0.19495141506195068, + "step": 839 + }, + { + "epoch": 0.012542929670001493, + "grad_norm": 0.546875, + "grad_norm_var": 0.0019274234771728515, + "learning_rate": 2e-05, + "loss": 1.145, + "loss/crossentropy": 2.5301358699798584, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0, + "loss/idx": 12.0, + "loss/logits": 0.14495986700057983, + "step": 840 + }, + { + "epoch": 0.012557861729132447, + "grad_norm": 0.490234375, + "grad_norm_var": 0.002129364013671875, + "learning_rate": 2e-05, + "loss": 1.0545, + "loss/crossentropy": 2.5924758911132812, + "loss/dist_ce": 0.0, + "loss/fcd": 0.93359375, + "loss/idx": 12.0, + "loss/logits": 0.12090451270341873, + "step": 841 + }, + { + "epoch": 0.012572793788263401, + "grad_norm": 0.6328125, + "grad_norm_var": 0.001927947998046875, + "learning_rate": 2e-05, + "loss": 1.2721, + "loss/crossentropy": 2.3995118141174316, + "loss/dist_ce": 0.0, + "loss/fcd": 1.109375, + "loss/idx": 12.0, + "loss/logits": 0.16268111765384674, + "step": 842 + }, + { + "epoch": 0.012587725847394355, + "grad_norm": 0.54296875, + "grad_norm_var": 0.001927947998046875, + "learning_rate": 2e-05, + "loss": 1.3116, + "loss/crossentropy": 2.5142955780029297, + "loss/dist_ce": 0.0, + "loss/fcd": 1.125, + "loss/idx": 12.0, + "loss/logits": 0.18662039935588837, + "step": 843 + }, + { + "epoch": 0.01260265790652531, + "grad_norm": 0.6015625, + "grad_norm_var": 0.00207061767578125, + "learning_rate": 2e-05, + "loss": 1.336, + "loss/crossentropy": 2.4487991333007812, + "loss/dist_ce": 0.0, + "loss/fcd": 1.15625, + "loss/idx": 12.0, + "loss/logits": 0.17975641787052155, + "step": 844 + }, + { + "epoch": 0.012617589965656264, + "grad_norm": 0.56640625, + "grad_norm_var": 0.002081743876139323, + "learning_rate": 2e-05, + "loss": 1.2271, + "loss/crossentropy": 2.799164295196533, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0546875, + "loss/idx": 12.0, + "loss/logits": 0.17243638634681702, + "step": 845 + }, + { + "epoch": 0.012632522024787218, + "grad_norm": 0.494140625, + "grad_norm_var": 0.002210601170857747, + "learning_rate": 2e-05, + "loss": 1.2006, + "loss/crossentropy": 2.636415958404541, + "loss/dist_ce": 0.0, + "loss/fcd": 1.046875, + "loss/idx": 12.0, + "loss/logits": 0.1537102460861206, + "step": 846 + }, + { + "epoch": 0.012647454083918172, + "grad_norm": 0.49609375, + "grad_norm_var": 0.0022264957427978516, + "learning_rate": 2e-05, + "loss": 1.2061, + "loss/crossentropy": 2.377129316329956, + "loss/dist_ce": 0.0, + "loss/fcd": 1.046875, + "loss/idx": 12.0, + "loss/logits": 0.15926313400268555, + "step": 847 + }, + { + "epoch": 0.012662386143049126, + "grad_norm": 0.58984375, + "grad_norm_var": 0.0023066043853759766, + "learning_rate": 2e-05, + "loss": 1.2835, + "loss/crossentropy": 2.4958112239837646, + "loss/dist_ce": 0.0, + "loss/fcd": 1.09375, + "loss/idx": 12.0, + "loss/logits": 0.1897384524345398, + "step": 848 + }, + { + "epoch": 0.01267731820218008, + "grad_norm": 0.49609375, + "grad_norm_var": 0.0022059122721354166, + "learning_rate": 2e-05, + "loss": 1.2354, + "loss/crossentropy": 2.732330560684204, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0625, + "loss/idx": 12.0, + "loss/logits": 0.17292934656143188, + "step": 849 + }, + { + "epoch": 0.012692250261311034, + "grad_norm": 0.5078125, + "grad_norm_var": 0.0022882461547851563, + "learning_rate": 2e-05, + "loss": 1.1959, + "loss/crossentropy": 2.8346798419952393, + "loss/dist_ce": 0.0, + "loss/fcd": 1.046875, + "loss/idx": 12.0, + "loss/logits": 0.14898401498794556, + "step": 850 + }, + { + "epoch": 0.01270718232044199, + "grad_norm": 0.546875, + "grad_norm_var": 0.002237892150878906, + "learning_rate": 2e-05, + "loss": 1.1314, + "loss/crossentropy": 2.544158697128296, + "loss/dist_ce": 0.0, + "loss/fcd": 0.98828125, + "loss/idx": 12.0, + "loss/logits": 0.14313051104545593, + "step": 851 + }, + { + "epoch": 0.012722114379572944, + "grad_norm": 0.51171875, + "grad_norm_var": 0.002202288309733073, + "learning_rate": 2e-05, + "loss": 1.2397, + "loss/crossentropy": 2.5338921546936035, + "loss/dist_ce": 0.0, + "loss/fcd": 1.078125, + "loss/idx": 12.0, + "loss/logits": 0.16162025928497314, + "step": 852 + }, + { + "epoch": 0.012737046438703898, + "grad_norm": 0.671875, + "grad_norm_var": 0.0031315485636393228, + "learning_rate": 2e-05, + "loss": 1.2927, + "loss/crossentropy": 2.4311304092407227, + "loss/dist_ce": 0.0, + "loss/fcd": 1.125, + "loss/idx": 12.0, + "loss/logits": 0.16769030690193176, + "step": 853 + }, + { + "epoch": 0.012751978497834852, + "grad_norm": 0.5078125, + "grad_norm_var": 0.0032208760579427085, + "learning_rate": 2e-05, + "loss": 1.2106, + "loss/crossentropy": 2.668276309967041, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0546875, + "loss/idx": 12.0, + "loss/logits": 0.15591159462928772, + "step": 854 + }, + { + "epoch": 0.012766910556965806, + "grad_norm": 0.515625, + "grad_norm_var": 0.0029729207356770835, + "learning_rate": 2e-05, + "loss": 1.3015, + "loss/crossentropy": 2.482598304748535, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1171875, + "loss/idx": 12.0, + "loss/logits": 0.18431049585342407, + "step": 855 + }, + { + "epoch": 0.01278184261609676, + "grad_norm": 0.53125, + "grad_norm_var": 0.0029841105143229166, + "learning_rate": 2e-05, + "loss": 1.3188, + "loss/crossentropy": 2.7403719425201416, + "loss/dist_ce": 0.0, + "loss/fcd": 1.125, + "loss/idx": 12.0, + "loss/logits": 0.19376035034656525, + "step": 856 + }, + { + "epoch": 0.012796774675227714, + "grad_norm": 0.53515625, + "grad_norm_var": 0.0027885278065999347, + "learning_rate": 2e-05, + "loss": 1.2216, + "loss/crossentropy": 2.433129072189331, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0546875, + "loss/idx": 12.0, + "loss/logits": 0.16688039898872375, + "step": 857 + }, + { + "epoch": 0.012811706734358668, + "grad_norm": 0.48828125, + "grad_norm_var": 0.002435668309529622, + "learning_rate": 2e-05, + "loss": 1.2295, + "loss/crossentropy": 2.5015032291412354, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0703125, + "loss/idx": 12.0, + "loss/logits": 0.15920080244541168, + "step": 858 + }, + { + "epoch": 0.012826638793489623, + "grad_norm": 0.57421875, + "grad_norm_var": 0.0025185743967692056, + "learning_rate": 2e-05, + "loss": 1.2442, + "loss/crossentropy": 2.488417625427246, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0625, + "loss/idx": 12.0, + "loss/logits": 0.1816791594028473, + "step": 859 + }, + { + "epoch": 0.012841570852620577, + "grad_norm": 0.52734375, + "grad_norm_var": 0.0022504011789957683, + "learning_rate": 2e-05, + "loss": 1.2024, + "loss/crossentropy": 2.665573835372925, + "loss/dist_ce": 0.0, + "loss/fcd": 1.046875, + "loss/idx": 12.0, + "loss/logits": 0.15554329752922058, + "step": 860 + }, + { + "epoch": 0.01285650291175153, + "grad_norm": 0.8046875, + "grad_norm_var": 0.006795740127563477, + "learning_rate": 2e-05, + "loss": 1.4828, + "loss/crossentropy": 2.3143980503082275, + "loss/dist_ce": 0.0, + "loss/fcd": 1.2578125, + "loss/idx": 12.0, + "loss/logits": 0.22494599223136902, + "step": 861 + }, + { + "epoch": 0.012871434970882485, + "grad_norm": 0.578125, + "grad_norm_var": 0.006611887613932292, + "learning_rate": 2e-05, + "loss": 1.2513, + "loss/crossentropy": 2.563246965408325, + "loss/dist_ce": 0.0, + "loss/fcd": 1.09375, + "loss/idx": 12.0, + "loss/logits": 0.15756869316101074, + "step": 862 + }, + { + "epoch": 0.012886367030013439, + "grad_norm": 0.5859375, + "grad_norm_var": 0.00640862782796224, + "learning_rate": 2e-05, + "loss": 1.2763, + "loss/crossentropy": 2.4165306091308594, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1015625, + "loss/idx": 12.0, + "loss/logits": 0.17470136284828186, + "step": 863 + }, + { + "epoch": 0.012901299089144393, + "grad_norm": 0.58203125, + "grad_norm_var": 0.006382179260253906, + "learning_rate": 2e-05, + "loss": 1.2261, + "loss/crossentropy": 2.7474277019500732, + "loss/dist_ce": 0.0, + "loss/fcd": 1.078125, + "loss/idx": 12.0, + "loss/logits": 0.1479920744895935, + "step": 864 + }, + { + "epoch": 0.012916231148275347, + "grad_norm": 0.486328125, + "grad_norm_var": 0.006471745173136393, + "learning_rate": 2e-05, + "loss": 1.1288, + "loss/crossentropy": 2.628307819366455, + "loss/dist_ce": 0.0, + "loss/fcd": 0.98828125, + "loss/idx": 12.0, + "loss/logits": 0.14052993059158325, + "step": 865 + }, + { + "epoch": 0.012931163207406301, + "grad_norm": 0.5703125, + "grad_norm_var": 0.00628355344136556, + "learning_rate": 2e-05, + "loss": 1.2903, + "loss/crossentropy": 2.6581015586853027, + "loss/dist_ce": 0.0, + "loss/fcd": 1.125, + "loss/idx": 12.0, + "loss/logits": 0.16525937616825104, + "step": 866 + }, + { + "epoch": 0.012946095266537255, + "grad_norm": 0.51171875, + "grad_norm_var": 0.00643919308980306, + "learning_rate": 2e-05, + "loss": 1.2729, + "loss/crossentropy": 2.480175256729126, + "loss/dist_ce": 0.0, + "loss/fcd": 1.09375, + "loss/idx": 12.0, + "loss/logits": 0.17913323640823364, + "step": 867 + }, + { + "epoch": 0.01296102732566821, + "grad_norm": 0.55859375, + "grad_norm_var": 0.006266005833943685, + "learning_rate": 2e-05, + "loss": 1.1887, + "loss/crossentropy": 2.5329577922821045, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0390625, + "loss/idx": 12.0, + "loss/logits": 0.14967145025730133, + "step": 868 + }, + { + "epoch": 0.012975959384799163, + "grad_norm": 0.5, + "grad_norm_var": 0.005647770563761393, + "learning_rate": 2e-05, + "loss": 1.2151, + "loss/crossentropy": 2.5917510986328125, + "loss/dist_ce": 0.0, + "loss/fcd": 1.046875, + "loss/idx": 12.0, + "loss/logits": 0.16825619339942932, + "step": 869 + }, + { + "epoch": 0.012990891443930118, + "grad_norm": 0.55859375, + "grad_norm_var": 0.005498997370402018, + "learning_rate": 2e-05, + "loss": 1.279, + "loss/crossentropy": 2.703120231628418, + "loss/dist_ce": 0.0, + "loss/fcd": 1.09375, + "loss/idx": 12.0, + "loss/logits": 0.1852026879787445, + "step": 870 + }, + { + "epoch": 0.013005823503061072, + "grad_norm": 0.6015625, + "grad_norm_var": 0.005489206314086914, + "learning_rate": 2e-05, + "loss": 1.3381, + "loss/crossentropy": 2.6077189445495605, + "loss/dist_ce": 0.0, + "loss/fcd": 1.15625, + "loss/idx": 12.0, + "loss/logits": 0.18187852203845978, + "step": 871 + }, + { + "epoch": 0.013020755562192026, + "grad_norm": 0.52734375, + "grad_norm_var": 0.005506245295206705, + "learning_rate": 2e-05, + "loss": 1.2242, + "loss/crossentropy": 2.229553461074829, + "loss/dist_ce": 0.0, + "loss/fcd": 1.078125, + "loss/idx": 12.0, + "loss/logits": 0.14606034755706787, + "step": 872 + }, + { + "epoch": 0.01303568762132298, + "grad_norm": 0.5078125, + "grad_norm_var": 0.005650440851847331, + "learning_rate": 2e-05, + "loss": 1.0931, + "loss/crossentropy": 2.4416704177856445, + "loss/dist_ce": 0.0, + "loss/fcd": 0.9609375, + "loss/idx": 12.0, + "loss/logits": 0.13215383887290955, + "step": 873 + }, + { + "epoch": 0.013050619680453934, + "grad_norm": 0.6015625, + "grad_norm_var": 0.00536650021870931, + "learning_rate": 2e-05, + "loss": 1.3962, + "loss/crossentropy": 2.580476760864258, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1875, + "loss/idx": 12.0, + "loss/logits": 0.20873740315437317, + "step": 874 + }, + { + "epoch": 0.013065551739584888, + "grad_norm": 0.625, + "grad_norm_var": 0.005574782689412435, + "learning_rate": 2e-05, + "loss": 1.4374, + "loss/crossentropy": 2.3341832160949707, + "loss/dist_ce": 0.0, + "loss/fcd": 1.2421875, + "loss/idx": 12.0, + "loss/logits": 0.19524669647216797, + "step": 875 + }, + { + "epoch": 0.013080483798715842, + "grad_norm": 0.546875, + "grad_norm_var": 0.005486408869425456, + "learning_rate": 2e-05, + "loss": 1.3415, + "loss/crossentropy": 2.2510344982147217, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1484375, + "loss/idx": 12.0, + "loss/logits": 0.19304239749908447, + "step": 876 + }, + { + "epoch": 0.013095415857846796, + "grad_norm": 0.609375, + "grad_norm_var": 0.0018020470937093098, + "learning_rate": 2e-05, + "loss": 1.3444, + "loss/crossentropy": 2.42315936088562, + "loss/dist_ce": 0.0, + "loss/fcd": 1.171875, + "loss/idx": 12.0, + "loss/logits": 0.1725073605775833, + "step": 877 + }, + { + "epoch": 0.013110347916977752, + "grad_norm": 0.6484375, + "grad_norm_var": 0.0022861321767171225, + "learning_rate": 2e-05, + "loss": 1.206, + "loss/crossentropy": 2.3976566791534424, + "loss/dist_ce": 0.0, + "loss/fcd": 1.046875, + "loss/idx": 12.0, + "loss/logits": 0.1590883731842041, + "step": 878 + }, + { + "epoch": 0.013125279976108706, + "grad_norm": 0.5703125, + "grad_norm_var": 0.0022553602854410806, + "learning_rate": 2e-05, + "loss": 1.3037, + "loss/crossentropy": 2.417356014251709, + "loss/dist_ce": 0.0, + "loss/fcd": 1.125, + "loss/idx": 12.0, + "loss/logits": 0.17873302102088928, + "step": 879 + }, + { + "epoch": 0.01314021203523966, + "grad_norm": 0.48828125, + "grad_norm_var": 0.0025651137034098308, + "learning_rate": 2e-05, + "loss": 1.25, + "loss/crossentropy": 2.579538345336914, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0859375, + "loss/idx": 12.0, + "loss/logits": 0.1640225350856781, + "step": 880 + }, + { + "epoch": 0.013155144094370614, + "grad_norm": 0.6171875, + "grad_norm_var": 0.002402178446451823, + "learning_rate": 2e-05, + "loss": 1.397, + "loss/crossentropy": 2.5399506092071533, + "loss/dist_ce": 0.0, + "loss/fcd": 1.203125, + "loss/idx": 12.0, + "loss/logits": 0.1938353031873703, + "step": 881 + }, + { + "epoch": 0.013170076153501568, + "grad_norm": 0.55859375, + "grad_norm_var": 0.0024027506510416667, + "learning_rate": 2e-05, + "loss": 1.3593, + "loss/crossentropy": 2.570274591445923, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1640625, + "loss/idx": 12.0, + "loss/logits": 0.19524267315864563, + "step": 882 + }, + { + "epoch": 0.013185008212632522, + "grad_norm": 0.57421875, + "grad_norm_var": 0.0022074381510416665, + "learning_rate": 2e-05, + "loss": 1.2983, + "loss/crossentropy": 2.8754701614379883, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1171875, + "loss/idx": 12.0, + "loss/logits": 0.18115171790122986, + "step": 883 + }, + { + "epoch": 0.013199940271763477, + "grad_norm": 0.56640625, + "grad_norm_var": 0.002201080322265625, + "learning_rate": 2e-05, + "loss": 1.2878, + "loss/crossentropy": 2.8438570499420166, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1171875, + "loss/idx": 12.0, + "loss/logits": 0.1705954670906067, + "step": 884 + }, + { + "epoch": 0.01321487233089443, + "grad_norm": 0.55078125, + "grad_norm_var": 0.0018960952758789063, + "learning_rate": 2e-05, + "loss": 1.3113, + "loss/crossentropy": 2.6516339778900146, + "loss/dist_ce": 0.0, + "loss/fcd": 1.125, + "loss/idx": 12.0, + "loss/logits": 0.1863495111465454, + "step": 885 + }, + { + "epoch": 0.013229804390025385, + "grad_norm": 0.486328125, + "grad_norm_var": 0.0023518721262613933, + "learning_rate": 2e-05, + "loss": 1.1807, + "loss/crossentropy": 2.603248119354248, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0390625, + "loss/idx": 12.0, + "loss/logits": 0.14165057241916656, + "step": 886 + }, + { + "epoch": 0.013244736449156339, + "grad_norm": 0.55078125, + "grad_norm_var": 0.002282444636027018, + "learning_rate": 2e-05, + "loss": 1.2735, + "loss/crossentropy": 2.409811019897461, + "loss/dist_ce": 0.0, + "loss/fcd": 1.109375, + "loss/idx": 12.0, + "loss/logits": 0.16412316262722015, + "step": 887 + }, + { + "epoch": 0.013259668508287293, + "grad_norm": 0.51953125, + "grad_norm_var": 0.0023247877756754558, + "learning_rate": 2e-05, + "loss": 1.2055, + "loss/crossentropy": 2.680741548538208, + "loss/dist_ce": 0.0, + "loss/fcd": 1.046875, + "loss/idx": 12.0, + "loss/logits": 0.15857717394828796, + "step": 888 + }, + { + "epoch": 0.013274600567418247, + "grad_norm": 1.84375, + "grad_norm_var": 0.10388995806376139, + "learning_rate": 2e-05, + "loss": 1.3991, + "loss/crossentropy": 2.831256628036499, + "loss/dist_ce": 0.0, + "loss/fcd": 1.15625, + "loss/idx": 12.0, + "loss/logits": 0.2428184151649475, + "step": 889 + }, + { + "epoch": 0.013289532626549201, + "grad_norm": 0.640625, + "grad_norm_var": 0.10374690691630045, + "learning_rate": 2e-05, + "loss": 1.3895, + "loss/crossentropy": 2.541971206665039, + "loss/dist_ce": 0.0, + "loss/fcd": 1.203125, + "loss/idx": 12.0, + "loss/logits": 0.18634910881519318, + "step": 890 + }, + { + "epoch": 0.013304464685680155, + "grad_norm": 0.58203125, + "grad_norm_var": 0.10400427182515462, + "learning_rate": 2e-05, + "loss": 1.2184, + "loss/crossentropy": 2.6130967140197754, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0546875, + "loss/idx": 12.0, + "loss/logits": 0.16373895108699799, + "step": 891 + }, + { + "epoch": 0.01331939674481111, + "grad_norm": 0.546875, + "grad_norm_var": 0.10400427182515462, + "learning_rate": 2e-05, + "loss": 1.19, + "loss/crossentropy": 2.482786178588867, + "loss/dist_ce": 0.0, + "loss/fcd": 1.046875, + "loss/idx": 12.0, + "loss/logits": 0.14307764172554016, + "step": 892 + }, + { + "epoch": 0.013334328803942063, + "grad_norm": 0.484375, + "grad_norm_var": 0.10560949643452962, + "learning_rate": 2e-05, + "loss": 1.1326, + "loss/crossentropy": 2.541658639907837, + "loss/dist_ce": 0.0, + "loss/fcd": 0.98828125, + "loss/idx": 12.0, + "loss/logits": 0.1443423479795456, + "step": 893 + }, + { + "epoch": 0.013349260863073018, + "grad_norm": 0.65625, + "grad_norm_var": 0.1056228478749593, + "learning_rate": 2e-05, + "loss": 1.3966, + "loss/crossentropy": 2.303764581680298, + "loss/dist_ce": 0.0, + "loss/fcd": 1.203125, + "loss/idx": 12.0, + "loss/logits": 0.1934724748134613, + "step": 894 + }, + { + "epoch": 0.013364192922203972, + "grad_norm": 0.4921875, + "grad_norm_var": 0.10672783851623535, + "learning_rate": 2e-05, + "loss": 1.1688, + "loss/crossentropy": 2.6050453186035156, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0234375, + "loss/idx": 12.0, + "loss/logits": 0.1453702449798584, + "step": 895 + }, + { + "epoch": 0.013379124981334926, + "grad_norm": 0.546875, + "grad_norm_var": 0.10579705238342285, + "learning_rate": 2e-05, + "loss": 1.2725, + "loss/crossentropy": 2.772411823272705, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1015625, + "loss/idx": 12.0, + "loss/logits": 0.17090922594070435, + "step": 896 + }, + { + "epoch": 0.01339405704046588, + "grad_norm": 0.78515625, + "grad_norm_var": 0.10708196957906087, + "learning_rate": 2e-05, + "loss": 1.7039, + "loss/crossentropy": 2.8474199771881104, + "loss/dist_ce": 0.0, + "loss/fcd": 1.3984375, + "loss/idx": 12.0, + "loss/logits": 0.30551040172576904, + "step": 897 + }, + { + "epoch": 0.013408989099596834, + "grad_norm": 1.328125, + "grad_norm_var": 0.13481214841206868, + "learning_rate": 2e-05, + "loss": 1.5814, + "loss/crossentropy": 2.6117067337036133, + "loss/dist_ce": 0.0, + "loss/fcd": 1.3203125, + "loss/idx": 12.0, + "loss/logits": 0.2610923647880554, + "step": 898 + }, + { + "epoch": 0.013423921158727788, + "grad_norm": 0.55859375, + "grad_norm_var": 0.13508350054423016, + "learning_rate": 2e-05, + "loss": 1.2476, + "loss/crossentropy": 2.5433382987976074, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0859375, + "loss/idx": 12.0, + "loss/logits": 0.16162243485450745, + "step": 899 + }, + { + "epoch": 0.013438853217858742, + "grad_norm": 0.59375, + "grad_norm_var": 0.1346571445465088, + "learning_rate": 2e-05, + "loss": 1.3289, + "loss/crossentropy": 2.3905489444732666, + "loss/dist_ce": 0.0, + "loss/fcd": 1.140625, + "loss/idx": 12.0, + "loss/logits": 0.18823018670082092, + "step": 900 + }, + { + "epoch": 0.013453785276989696, + "grad_norm": 0.60546875, + "grad_norm_var": 0.1337714989980062, + "learning_rate": 2e-05, + "loss": 1.222, + "loss/crossentropy": 2.8749287128448486, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0625, + "loss/idx": 12.0, + "loss/logits": 0.15945787727832794, + "step": 901 + }, + { + "epoch": 0.01346871733612065, + "grad_norm": 0.73828125, + "grad_norm_var": 0.13051751454671223, + "learning_rate": 2e-05, + "loss": 1.2667, + "loss/crossentropy": 2.56249737739563, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1015625, + "loss/idx": 12.0, + "loss/logits": 0.16514375805854797, + "step": 902 + }, + { + "epoch": 0.013483649395251604, + "grad_norm": 0.53125, + "grad_norm_var": 0.13097432454427083, + "learning_rate": 2e-05, + "loss": 1.3496, + "loss/crossentropy": 2.5442233085632324, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1796875, + "loss/idx": 12.0, + "loss/logits": 0.16989666223526, + "step": 903 + }, + { + "epoch": 0.01349858145438256, + "grad_norm": 0.60546875, + "grad_norm_var": 0.12918675740559896, + "learning_rate": 2e-05, + "loss": 1.3425, + "loss/crossentropy": 2.4813945293426514, + "loss/dist_ce": 0.0, + "loss/fcd": 1.15625, + "loss/idx": 12.0, + "loss/logits": 0.18623802065849304, + "step": 904 + }, + { + "epoch": 0.013513513513513514, + "grad_norm": 0.54296875, + "grad_norm_var": 0.04024499257405599, + "learning_rate": 2e-05, + "loss": 1.1192, + "loss/crossentropy": 2.650784969329834, + "loss/dist_ce": 0.0, + "loss/fcd": 0.98046875, + "loss/idx": 12.0, + "loss/logits": 0.1387377828359604, + "step": 905 + }, + { + "epoch": 0.013528445572644468, + "grad_norm": 0.58203125, + "grad_norm_var": 0.04045384724934896, + "learning_rate": 2e-05, + "loss": 1.3307, + "loss/crossentropy": 2.550124168395996, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1484375, + "loss/idx": 12.0, + "loss/logits": 0.1822178065776825, + "step": 906 + }, + { + "epoch": 0.013543377631775422, + "grad_norm": 0.61328125, + "grad_norm_var": 0.040289052327473956, + "learning_rate": 2e-05, + "loss": 1.4404, + "loss/crossentropy": 2.7158029079437256, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1953125, + "loss/idx": 12.0, + "loss/logits": 0.24511241912841797, + "step": 907 + }, + { + "epoch": 0.013558309690906377, + "grad_norm": 0.5078125, + "grad_norm_var": 0.0408599853515625, + "learning_rate": 2e-05, + "loss": 1.2551, + "loss/crossentropy": 2.580396890640259, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0859375, + "loss/idx": 12.0, + "loss/logits": 0.16913624107837677, + "step": 908 + }, + { + "epoch": 0.01357324175003733, + "grad_norm": 0.57421875, + "grad_norm_var": 0.03955122629801432, + "learning_rate": 2e-05, + "loss": 1.2294, + "loss/crossentropy": 2.6257517337799072, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0625, + "loss/idx": 12.0, + "loss/logits": 0.16689662635326385, + "step": 909 + }, + { + "epoch": 0.013588173809168285, + "grad_norm": 0.61328125, + "grad_norm_var": 0.039581298828125, + "learning_rate": 2e-05, + "loss": 1.3795, + "loss/crossentropy": 2.5408027172088623, + "loss/dist_ce": 0.0, + "loss/fcd": 1.203125, + "loss/idx": 12.0, + "loss/logits": 0.1763436645269394, + "step": 910 + }, + { + "epoch": 0.013603105868299239, + "grad_norm": 0.54296875, + "grad_norm_var": 0.038750648498535156, + "learning_rate": 2e-05, + "loss": 1.2114, + "loss/crossentropy": 2.5792062282562256, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0546875, + "loss/idx": 12.0, + "loss/logits": 0.1567598581314087, + "step": 911 + }, + { + "epoch": 0.013618037927430193, + "grad_norm": 0.5234375, + "grad_norm_var": 0.039081764221191403, + "learning_rate": 2e-05, + "loss": 1.2693, + "loss/crossentropy": 2.639925956726074, + "loss/dist_ce": 0.0, + "loss/fcd": 1.09375, + "loss/idx": 12.0, + "loss/logits": 0.17551803588867188, + "step": 912 + }, + { + "epoch": 0.013632969986561147, + "grad_norm": 0.51953125, + "grad_norm_var": 0.03836409250895182, + "learning_rate": 2e-05, + "loss": 1.2264, + "loss/crossentropy": 2.6736505031585693, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0625, + "loss/idx": 12.0, + "loss/logits": 0.1639370322227478, + "step": 913 + }, + { + "epoch": 0.013647902045692101, + "grad_norm": 0.625, + "grad_norm_var": 0.0032307306925455728, + "learning_rate": 2e-05, + "loss": 1.3798, + "loss/crossentropy": 2.731677293777466, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1875, + "loss/idx": 12.0, + "loss/logits": 0.19231237471103668, + "step": 914 + }, + { + "epoch": 0.013662834104823055, + "grad_norm": 0.51171875, + "grad_norm_var": 0.003500811258951823, + "learning_rate": 2e-05, + "loss": 1.2027, + "loss/crossentropy": 2.551647424697876, + "loss/dist_ce": 0.0, + "loss/fcd": 1.046875, + "loss/idx": 12.0, + "loss/logits": 0.15584951639175415, + "step": 915 + }, + { + "epoch": 0.01367776616395401, + "grad_norm": 0.53515625, + "grad_norm_var": 0.0035837809244791668, + "learning_rate": 2e-05, + "loss": 1.2998, + "loss/crossentropy": 2.5736746788024902, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1171875, + "loss/idx": 12.0, + "loss/logits": 0.18263675272464752, + "step": 916 + }, + { + "epoch": 0.013692698223084963, + "grad_norm": 0.50390625, + "grad_norm_var": 0.003792063395182292, + "learning_rate": 2e-05, + "loss": 1.2474, + "loss/crossentropy": 2.5645570755004883, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0859375, + "loss/idx": 12.0, + "loss/logits": 0.16151148080825806, + "step": 917 + }, + { + "epoch": 0.013707630282215917, + "grad_norm": 0.49609375, + "grad_norm_var": 0.0019236246744791666, + "learning_rate": 2e-05, + "loss": 1.1903, + "loss/crossentropy": 2.7008137702941895, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0390625, + "loss/idx": 12.0, + "loss/logits": 0.1512683928012848, + "step": 918 + }, + { + "epoch": 0.013722562341346872, + "grad_norm": 0.5625, + "grad_norm_var": 0.0018992106119791667, + "learning_rate": 2e-05, + "loss": 1.2117, + "loss/crossentropy": 2.576681613922119, + "loss/dist_ce": 0.0, + "loss/fcd": 1.046875, + "loss/idx": 12.0, + "loss/logits": 0.16480094194412231, + "step": 919 + }, + { + "epoch": 0.013737494400477826, + "grad_norm": 0.62109375, + "grad_norm_var": 0.0020222981770833333, + "learning_rate": 2e-05, + "loss": 1.2225, + "loss/crossentropy": 2.2980234622955322, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0703125, + "loss/idx": 12.0, + "loss/logits": 0.1522141844034195, + "step": 920 + }, + { + "epoch": 0.01375242645960878, + "grad_norm": 0.490234375, + "grad_norm_var": 0.0022785027821858725, + "learning_rate": 2e-05, + "loss": 1.1559, + "loss/crossentropy": 2.6368680000305176, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0078125, + "loss/idx": 12.0, + "loss/logits": 0.14811134338378906, + "step": 921 + }, + { + "epoch": 0.013767358518739734, + "grad_norm": 0.58203125, + "grad_norm_var": 0.0022785027821858725, + "learning_rate": 2e-05, + "loss": 1.2656, + "loss/crossentropy": 2.3946034908294678, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1015625, + "loss/idx": 12.0, + "loss/logits": 0.16408073902130127, + "step": 922 + }, + { + "epoch": 0.013782290577870688, + "grad_norm": 0.58203125, + "grad_norm_var": 0.0020816644032796224, + "learning_rate": 2e-05, + "loss": 1.2416, + "loss/crossentropy": 2.6989355087280273, + "loss/dist_ce": 0.0, + "loss/fcd": 1.078125, + "loss/idx": 12.0, + "loss/logits": 0.16348567605018616, + "step": 923 + }, + { + "epoch": 0.013797222637001642, + "grad_norm": 0.51171875, + "grad_norm_var": 0.0020609378814697267, + "learning_rate": 2e-05, + "loss": 1.1568, + "loss/crossentropy": 2.457051992416382, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0078125, + "loss/idx": 12.0, + "loss/logits": 0.1490294486284256, + "step": 924 + }, + { + "epoch": 0.013812154696132596, + "grad_norm": 0.494140625, + "grad_norm_var": 0.002199745178222656, + "learning_rate": 2e-05, + "loss": 1.209, + "loss/crossentropy": 2.6960437297821045, + "loss/dist_ce": 0.0, + "loss/fcd": 1.046875, + "loss/idx": 12.0, + "loss/logits": 0.16211289167404175, + "step": 925 + }, + { + "epoch": 0.01382708675526355, + "grad_norm": 0.57421875, + "grad_norm_var": 0.0019378026326497396, + "learning_rate": 2e-05, + "loss": 1.2168, + "loss/crossentropy": 2.486149549484253, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0625, + "loss/idx": 12.0, + "loss/logits": 0.1543126404285431, + "step": 926 + }, + { + "epoch": 0.013842018814394504, + "grad_norm": 0.74609375, + "grad_norm_var": 0.00453637440999349, + "learning_rate": 2e-05, + "loss": 1.2962, + "loss/crossentropy": 2.531057119369507, + "loss/dist_ce": 0.0, + "loss/fcd": 1.109375, + "loss/idx": 12.0, + "loss/logits": 0.1868131458759308, + "step": 927 + }, + { + "epoch": 0.013856950873525458, + "grad_norm": 0.60546875, + "grad_norm_var": 0.004612477620442709, + "learning_rate": 2e-05, + "loss": 1.3788, + "loss/crossentropy": 2.445338726043701, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1796875, + "loss/idx": 12.0, + "loss/logits": 0.19916069507598877, + "step": 928 + }, + { + "epoch": 0.013871882932656412, + "grad_norm": 0.55078125, + "grad_norm_var": 0.004504648844401041, + "learning_rate": 2e-05, + "loss": 1.2241, + "loss/crossentropy": 2.5794379711151123, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0703125, + "loss/idx": 12.0, + "loss/logits": 0.15381918847560883, + "step": 929 + }, + { + "epoch": 0.013886814991787368, + "grad_norm": 0.546875, + "grad_norm_var": 0.004229990641276041, + "learning_rate": 2e-05, + "loss": 1.3103, + "loss/crossentropy": 2.6032307147979736, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1328125, + "loss/idx": 12.0, + "loss/logits": 0.17746026813983917, + "step": 930 + }, + { + "epoch": 0.013901747050918322, + "grad_norm": 0.53515625, + "grad_norm_var": 0.004122416178385417, + "learning_rate": 2e-05, + "loss": 1.2506, + "loss/crossentropy": 2.637634515762329, + "loss/dist_ce": 0.0, + "loss/fcd": 1.09375, + "loss/idx": 12.0, + "loss/logits": 0.15681351721286774, + "step": 931 + }, + { + "epoch": 0.013916679110049276, + "grad_norm": 0.5078125, + "grad_norm_var": 0.004254595438639323, + "learning_rate": 2e-05, + "loss": 1.3071, + "loss/crossentropy": 2.4573991298675537, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1328125, + "loss/idx": 12.0, + "loss/logits": 0.17433026432991028, + "step": 932 + }, + { + "epoch": 0.01393161116918023, + "grad_norm": 0.515625, + "grad_norm_var": 0.004180399576822916, + "learning_rate": 2e-05, + "loss": 1.2987, + "loss/crossentropy": 2.4106740951538086, + "loss/dist_ce": 0.0, + "loss/fcd": 1.125, + "loss/idx": 12.0, + "loss/logits": 0.1737232506275177, + "step": 933 + }, + { + "epoch": 0.013946543228311185, + "grad_norm": 0.62109375, + "grad_norm_var": 0.0041315714518229164, + "learning_rate": 2e-05, + "loss": 1.3689, + "loss/crossentropy": 2.61064076423645, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1875, + "loss/idx": 12.0, + "loss/logits": 0.181406170129776, + "step": 934 + }, + { + "epoch": 0.013961475287442139, + "grad_norm": 0.65234375, + "grad_norm_var": 0.0046009699503580725, + "learning_rate": 2e-05, + "loss": 1.2803, + "loss/crossentropy": 2.4577136039733887, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1171875, + "loss/idx": 12.0, + "loss/logits": 0.16315729916095734, + "step": 935 + }, + { + "epoch": 0.013976407346573093, + "grad_norm": 0.51953125, + "grad_norm_var": 0.004567909240722656, + "learning_rate": 2e-05, + "loss": 1.2566, + "loss/crossentropy": 2.2968335151672363, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0859375, + "loss/idx": 12.0, + "loss/logits": 0.1706232726573944, + "step": 936 + }, + { + "epoch": 0.013991339405704047, + "grad_norm": 0.50390625, + "grad_norm_var": 0.004443852106730143, + "learning_rate": 2e-05, + "loss": 1.162, + "loss/crossentropy": 2.473719596862793, + "loss/dist_ce": 0.0, + "loss/fcd": 1.015625, + "loss/idx": 12.0, + "loss/logits": 0.146341010928154, + "step": 937 + }, + { + "epoch": 0.014006271464835001, + "grad_norm": 0.51171875, + "grad_norm_var": 0.004598347345987955, + "learning_rate": 2e-05, + "loss": 1.1695, + "loss/crossentropy": 2.7954347133636475, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0234375, + "loss/idx": 12.0, + "loss/logits": 0.1460927426815033, + "step": 938 + }, + { + "epoch": 0.014021203523965955, + "grad_norm": 0.478515625, + "grad_norm_var": 0.004979960123697917, + "learning_rate": 2e-05, + "loss": 1.1899, + "loss/crossentropy": 2.491903781890869, + "loss/dist_ce": 0.0, + "loss/fcd": 1.046875, + "loss/idx": 12.0, + "loss/logits": 0.14299684762954712, + "step": 939 + }, + { + "epoch": 0.01403613558309691, + "grad_norm": 0.60546875, + "grad_norm_var": 0.004992167154947917, + "learning_rate": 2e-05, + "loss": 1.2752, + "loss/crossentropy": 2.651737928390503, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1171875, + "loss/idx": 12.0, + "loss/logits": 0.15803387761116028, + "step": 940 + }, + { + "epoch": 0.014051067642227863, + "grad_norm": 0.9140625, + "grad_norm_var": 0.012295007705688477, + "learning_rate": 2e-05, + "loss": 1.4622, + "loss/crossentropy": 2.304933547973633, + "loss/dist_ce": 0.0, + "loss/fcd": 1.2578125, + "loss/idx": 12.0, + "loss/logits": 0.20437663793563843, + "step": 941 + }, + { + "epoch": 0.014065999701358817, + "grad_norm": 0.5390625, + "grad_norm_var": 0.012431192398071288, + "learning_rate": 2e-05, + "loss": 1.2685, + "loss/crossentropy": 2.5132477283477783, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1015625, + "loss/idx": 12.0, + "loss/logits": 0.16695216298103333, + "step": 942 + }, + { + "epoch": 0.014080931760489772, + "grad_norm": 0.5, + "grad_norm_var": 0.010917139053344727, + "learning_rate": 2e-05, + "loss": 1.1307, + "loss/crossentropy": 2.515592336654663, + "loss/dist_ce": 0.0, + "loss/fcd": 0.9921875, + "loss/idx": 12.0, + "loss/logits": 0.13850846886634827, + "step": 943 + }, + { + "epoch": 0.014095863819620726, + "grad_norm": 0.5, + "grad_norm_var": 0.011102533340454102, + "learning_rate": 2e-05, + "loss": 1.2549, + "loss/crossentropy": 2.392749071121216, + "loss/dist_ce": 0.0, + "loss/fcd": 1.09375, + "loss/idx": 12.0, + "loss/logits": 0.16117683053016663, + "step": 944 + }, + { + "epoch": 0.01411079587875168, + "grad_norm": 0.6875, + "grad_norm_var": 0.012054936091105143, + "learning_rate": 2e-05, + "loss": 1.4447, + "loss/crossentropy": 2.619136095046997, + "loss/dist_ce": 0.0, + "loss/fcd": 1.234375, + "loss/idx": 12.0, + "loss/logits": 0.21033376455307007, + "step": 945 + }, + { + "epoch": 0.014125727937882634, + "grad_norm": 0.5078125, + "grad_norm_var": 0.012276824315388997, + "learning_rate": 2e-05, + "loss": 1.2539, + "loss/crossentropy": 2.70526385307312, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0859375, + "loss/idx": 12.0, + "loss/logits": 0.16797472536563873, + "step": 946 + }, + { + "epoch": 0.014140659997013588, + "grad_norm": 0.5625, + "grad_norm_var": 0.012201166152954102, + "learning_rate": 2e-05, + "loss": 1.3053, + "loss/crossentropy": 2.671525239944458, + "loss/dist_ce": 0.0, + "loss/fcd": 1.125, + "loss/idx": 12.0, + "loss/logits": 0.1802533119916916, + "step": 947 + }, + { + "epoch": 0.014155592056144542, + "grad_norm": 0.54296875, + "grad_norm_var": 0.011984872817993163, + "learning_rate": 2e-05, + "loss": 1.2934, + "loss/crossentropy": 2.5209715366363525, + "loss/dist_ce": 0.0, + "loss/fcd": 1.125, + "loss/idx": 12.0, + "loss/logits": 0.16836240887641907, + "step": 948 + }, + { + "epoch": 0.014170524115275496, + "grad_norm": 0.546875, + "grad_norm_var": 0.011808379491170248, + "learning_rate": 2e-05, + "loss": 1.2201, + "loss/crossentropy": 2.352734327316284, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0703125, + "loss/idx": 12.0, + "loss/logits": 0.14974716305732727, + "step": 949 + }, + { + "epoch": 0.01418545617440645, + "grad_norm": 0.64453125, + "grad_norm_var": 0.011988051732381185, + "learning_rate": 2e-05, + "loss": 1.3441, + "loss/crossentropy": 2.5131075382232666, + "loss/dist_ce": 0.0, + "loss/fcd": 1.140625, + "loss/idx": 12.0, + "loss/logits": 0.20351985096931458, + "step": 950 + }, + { + "epoch": 0.014200388233537404, + "grad_norm": 0.54296875, + "grad_norm_var": 0.011623112360636394, + "learning_rate": 2e-05, + "loss": 1.1609, + "loss/crossentropy": 2.6400680541992188, + "loss/dist_ce": 0.0, + "loss/fcd": 1.015625, + "loss/idx": 12.0, + "loss/logits": 0.14532148838043213, + "step": 951 + }, + { + "epoch": 0.014215320292668358, + "grad_norm": 0.5625, + "grad_norm_var": 0.011453866958618164, + "learning_rate": 2e-05, + "loss": 1.2853, + "loss/crossentropy": 2.4889473915100098, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1171875, + "loss/idx": 12.0, + "loss/logits": 0.1681395173072815, + "step": 952 + }, + { + "epoch": 0.014230252351799312, + "grad_norm": 0.66015625, + "grad_norm_var": 0.01156322161356608, + "learning_rate": 2e-05, + "loss": 1.3387, + "loss/crossentropy": 2.304206609725952, + "loss/dist_ce": 0.0, + "loss/fcd": 1.15625, + "loss/idx": 12.0, + "loss/logits": 0.18247899413108826, + "step": 953 + }, + { + "epoch": 0.014245184410930267, + "grad_norm": 0.62890625, + "grad_norm_var": 0.011328617731730143, + "learning_rate": 2e-05, + "loss": 1.3366, + "loss/crossentropy": 2.4242758750915527, + "loss/dist_ce": 0.0, + "loss/fcd": 1.15625, + "loss/idx": 12.0, + "loss/logits": 0.18031245470046997, + "step": 954 + }, + { + "epoch": 0.01426011647006122, + "grad_norm": 0.59375, + "grad_norm_var": 0.010461171468098959, + "learning_rate": 2e-05, + "loss": 1.2421, + "loss/crossentropy": 2.681748151779175, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0703125, + "loss/idx": 12.0, + "loss/logits": 0.17177164554595947, + "step": 955 + }, + { + "epoch": 0.014275048529192176, + "grad_norm": 0.484375, + "grad_norm_var": 0.011227862040201823, + "learning_rate": 2e-05, + "loss": 1.1478, + "loss/crossentropy": 2.6037683486938477, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0, + "loss/idx": 12.0, + "loss/logits": 0.14775672554969788, + "step": 956 + }, + { + "epoch": 0.01428998058832313, + "grad_norm": 0.88671875, + "grad_norm_var": 0.010088094075520833, + "learning_rate": 2e-05, + "loss": 1.5679, + "loss/crossentropy": 2.7519948482513428, + "loss/dist_ce": 0.0, + "loss/fcd": 1.296875, + "loss/idx": 12.0, + "loss/logits": 0.2710202634334564, + "step": 957 + }, + { + "epoch": 0.014304912647454085, + "grad_norm": 0.5078125, + "grad_norm_var": 0.0103485107421875, + "learning_rate": 2e-05, + "loss": 1.2848, + "loss/crossentropy": 2.3958358764648438, + "loss/dist_ce": 0.0, + "loss/fcd": 1.109375, + "loss/idx": 12.0, + "loss/logits": 0.1754007339477539, + "step": 958 + }, + { + "epoch": 0.014319844706585039, + "grad_norm": 0.60546875, + "grad_norm_var": 0.009848976135253906, + "learning_rate": 2e-05, + "loss": 1.496, + "loss/crossentropy": 2.428069591522217, + "loss/dist_ce": 0.0, + "loss/fcd": 1.2734375, + "loss/idx": 12.0, + "loss/logits": 0.2225390374660492, + "step": 959 + }, + { + "epoch": 0.014334776765715993, + "grad_norm": 0.5234375, + "grad_norm_var": 0.009597206115722656, + "learning_rate": 2e-05, + "loss": 1.2105, + "loss/crossentropy": 2.5576891899108887, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0546875, + "loss/idx": 12.0, + "loss/logits": 0.15579423308372498, + "step": 960 + }, + { + "epoch": 0.014349708824846947, + "grad_norm": 0.51953125, + "grad_norm_var": 0.009244537353515625, + "learning_rate": 2e-05, + "loss": 1.2722, + "loss/crossentropy": 2.3733532428741455, + "loss/dist_ce": 0.0, + "loss/fcd": 1.09375, + "loss/idx": 12.0, + "loss/logits": 0.17849120497703552, + "step": 961 + }, + { + "epoch": 0.014364640883977901, + "grad_norm": 0.5078125, + "grad_norm_var": 0.009244537353515625, + "learning_rate": 2e-05, + "loss": 1.1481, + "loss/crossentropy": 2.5622546672821045, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0078125, + "loss/idx": 12.0, + "loss/logits": 0.14027273654937744, + "step": 962 + }, + { + "epoch": 0.014379572943108855, + "grad_norm": 0.6171875, + "grad_norm_var": 0.009285481770833333, + "learning_rate": 2e-05, + "loss": 1.2864, + "loss/crossentropy": 2.436474561691284, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1171875, + "loss/idx": 12.0, + "loss/logits": 0.16921366751194, + "step": 963 + }, + { + "epoch": 0.01439450500223981, + "grad_norm": 0.71875, + "grad_norm_var": 0.010209592183430989, + "learning_rate": 2e-05, + "loss": 1.4983, + "loss/crossentropy": 2.7202954292297363, + "loss/dist_ce": 0.0, + "loss/fcd": 1.2421875, + "loss/idx": 12.0, + "loss/logits": 0.25612711906433105, + "step": 964 + }, + { + "epoch": 0.014409437061370763, + "grad_norm": 0.515625, + "grad_norm_var": 0.010479164123535157, + "learning_rate": 2e-05, + "loss": 1.2016, + "loss/crossentropy": 2.370645523071289, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0546875, + "loss/idx": 12.0, + "loss/logits": 0.14687049388885498, + "step": 965 + }, + { + "epoch": 0.014424369120501717, + "grad_norm": 0.52734375, + "grad_norm_var": 0.010563087463378907, + "learning_rate": 2e-05, + "loss": 1.3283, + "loss/crossentropy": 2.7037761211395264, + "loss/dist_ce": 0.0, + "loss/fcd": 1.140625, + "loss/idx": 12.0, + "loss/logits": 0.18769502639770508, + "step": 966 + }, + { + "epoch": 0.014439301179632671, + "grad_norm": 0.51171875, + "grad_norm_var": 0.010810279846191406, + "learning_rate": 2e-05, + "loss": 1.2502, + "loss/crossentropy": 2.5512642860412598, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0859375, + "loss/idx": 12.0, + "loss/logits": 0.16423162817955017, + "step": 967 + }, + { + "epoch": 0.014454233238763626, + "grad_norm": 0.482421875, + "grad_norm_var": 0.011458698908487957, + "learning_rate": 2e-05, + "loss": 1.173, + "loss/crossentropy": 2.560576915740967, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0234375, + "loss/idx": 12.0, + "loss/logits": 0.1495593786239624, + "step": 968 + }, + { + "epoch": 0.01446916529789458, + "grad_norm": 0.70703125, + "grad_norm_var": 0.01209270159403483, + "learning_rate": 2e-05, + "loss": 1.3237, + "loss/crossentropy": 2.5032005310058594, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1328125, + "loss/idx": 12.0, + "loss/logits": 0.1908506453037262, + "step": 969 + }, + { + "epoch": 0.014484097357025534, + "grad_norm": 0.55078125, + "grad_norm_var": 0.01200242042541504, + "learning_rate": 2e-05, + "loss": 1.1987, + "loss/crossentropy": 2.6098976135253906, + "loss/dist_ce": 0.0, + "loss/fcd": 1.046875, + "loss/idx": 12.0, + "loss/logits": 0.15182408690452576, + "step": 970 + }, + { + "epoch": 0.014499029416156488, + "grad_norm": 0.54296875, + "grad_norm_var": 0.01206192970275879, + "learning_rate": 2e-05, + "loss": 1.2095, + "loss/crossentropy": 2.6711175441741943, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0546875, + "loss/idx": 12.0, + "loss/logits": 0.15485651791095734, + "step": 971 + }, + { + "epoch": 0.014513961475287442, + "grad_norm": 0.55078125, + "grad_norm_var": 0.011530160903930664, + "learning_rate": 2e-05, + "loss": 1.2459, + "loss/crossentropy": 2.7971813678741455, + "loss/dist_ce": 0.0, + "loss/fcd": 1.078125, + "loss/idx": 12.0, + "loss/logits": 0.16782422363758087, + "step": 972 + }, + { + "epoch": 0.014528893534418396, + "grad_norm": 0.52734375, + "grad_norm_var": 0.004891316095987956, + "learning_rate": 2e-05, + "loss": 1.3771, + "loss/crossentropy": 2.5039517879486084, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1953125, + "loss/idx": 12.0, + "loss/logits": 0.18174386024475098, + "step": 973 + }, + { + "epoch": 0.01454382559354935, + "grad_norm": 0.52734375, + "grad_norm_var": 0.0047864119211832685, + "learning_rate": 2e-05, + "loss": 1.2518, + "loss/crossentropy": 2.4862096309661865, + "loss/dist_ce": 0.0, + "loss/fcd": 1.078125, + "loss/idx": 12.0, + "loss/logits": 0.1736270785331726, + "step": 974 + }, + { + "epoch": 0.014558757652680304, + "grad_norm": 0.76953125, + "grad_norm_var": 0.007496754328409831, + "learning_rate": 2e-05, + "loss": 1.32, + "loss/crossentropy": 2.335102081298828, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1484375, + "loss/idx": 12.0, + "loss/logits": 0.17159795761108398, + "step": 975 + }, + { + "epoch": 0.014573689711811258, + "grad_norm": 0.546875, + "grad_norm_var": 0.007389561335245768, + "learning_rate": 2e-05, + "loss": 1.2981, + "loss/crossentropy": 2.5871307849884033, + "loss/dist_ce": 0.0, + "loss/fcd": 1.125, + "loss/idx": 12.0, + "loss/logits": 0.1731010526418686, + "step": 976 + }, + { + "epoch": 0.014588621770942212, + "grad_norm": 0.484375, + "grad_norm_var": 0.0077042738596598305, + "learning_rate": 2e-05, + "loss": 1.2361, + "loss/crossentropy": 2.640996217727661, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0625, + "loss/idx": 12.0, + "loss/logits": 0.17356736958026886, + "step": 977 + }, + { + "epoch": 0.014603553830073166, + "grad_norm": 0.54296875, + "grad_norm_var": 0.007499424616495768, + "learning_rate": 2e-05, + "loss": 1.2752, + "loss/crossentropy": 2.4583990573883057, + "loss/dist_ce": 0.0, + "loss/fcd": 1.109375, + "loss/idx": 12.0, + "loss/logits": 0.16585969924926758, + "step": 978 + }, + { + "epoch": 0.01461848588920412, + "grad_norm": 0.56640625, + "grad_norm_var": 0.007342386245727539, + "learning_rate": 2e-05, + "loss": 1.2629, + "loss/crossentropy": 2.440524101257324, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1015625, + "loss/idx": 12.0, + "loss/logits": 0.16134318709373474, + "step": 979 + }, + { + "epoch": 0.014633417948335075, + "grad_norm": 0.50390625, + "grad_norm_var": 0.00588072141011556, + "learning_rate": 2e-05, + "loss": 1.2182, + "loss/crossentropy": 2.598440647125244, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0625, + "loss/idx": 12.0, + "loss/logits": 0.15566852688789368, + "step": 980 + }, + { + "epoch": 0.014648350007466029, + "grad_norm": 0.5234375, + "grad_norm_var": 0.005844990412394206, + "learning_rate": 2e-05, + "loss": 1.1937, + "loss/crossentropy": 2.516242742538452, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0390625, + "loss/idx": 12.0, + "loss/logits": 0.15460515022277832, + "step": 981 + }, + { + "epoch": 0.014663282066596985, + "grad_norm": 0.51953125, + "grad_norm_var": 0.005876652399698893, + "learning_rate": 2e-05, + "loss": 1.1113, + "loss/crossentropy": 2.374080181121826, + "loss/dist_ce": 0.0, + "loss/fcd": 0.97265625, + "loss/idx": 12.0, + "loss/logits": 0.13867977261543274, + "step": 982 + }, + { + "epoch": 0.014678214125727939, + "grad_norm": 0.91015625, + "grad_norm_var": 0.013574330012003581, + "learning_rate": 2e-05, + "loss": 1.4734, + "loss/crossentropy": 2.396512269973755, + "loss/dist_ce": 0.0, + "loss/fcd": 1.265625, + "loss/idx": 12.0, + "loss/logits": 0.20776385068893433, + "step": 983 + }, + { + "epoch": 0.014693146184858893, + "grad_norm": 0.55859375, + "grad_norm_var": 0.012961260477701823, + "learning_rate": 2e-05, + "loss": 1.1341, + "loss/crossentropy": 2.6779584884643555, + "loss/dist_ce": 0.0, + "loss/fcd": 0.984375, + "loss/idx": 12.0, + "loss/logits": 0.14977312088012695, + "step": 984 + }, + { + "epoch": 0.014708078243989847, + "grad_norm": 0.51953125, + "grad_norm_var": 0.012064043680826824, + "learning_rate": 2e-05, + "loss": 1.2131, + "loss/crossentropy": 2.5011801719665527, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0546875, + "loss/idx": 12.0, + "loss/logits": 0.1583985984325409, + "step": 985 + }, + { + "epoch": 0.014723010303120801, + "grad_norm": 0.5234375, + "grad_norm_var": 0.012186431884765625, + "learning_rate": 2e-05, + "loss": 1.2479, + "loss/crossentropy": 2.4844136238098145, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0859375, + "loss/idx": 12.0, + "loss/logits": 0.16194245219230652, + "step": 986 + }, + { + "epoch": 0.014737942362251755, + "grad_norm": 0.55859375, + "grad_norm_var": 0.012145741780598959, + "learning_rate": 2e-05, + "loss": 1.2528, + "loss/crossentropy": 2.221806526184082, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0859375, + "loss/idx": 12.0, + "loss/logits": 0.16683350503444672, + "step": 987 + }, + { + "epoch": 0.014752874421382709, + "grad_norm": 0.58984375, + "grad_norm_var": 0.0121368408203125, + "learning_rate": 2e-05, + "loss": 1.3223, + "loss/crossentropy": 2.5378036499023438, + "loss/dist_ce": 0.0, + "loss/fcd": 1.125, + "loss/idx": 12.0, + "loss/logits": 0.19727113842964172, + "step": 988 + }, + { + "epoch": 0.014767806480513663, + "grad_norm": 0.546875, + "grad_norm_var": 0.012041155497233074, + "learning_rate": 2e-05, + "loss": 1.1958, + "loss/crossentropy": 2.597865581512451, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0390625, + "loss/idx": 12.0, + "loss/logits": 0.1567688286304474, + "step": 989 + }, + { + "epoch": 0.014782738539644617, + "grad_norm": 0.498046875, + "grad_norm_var": 0.012278858820597332, + "learning_rate": 2e-05, + "loss": 1.1742, + "loss/crossentropy": 2.7298946380615234, + "loss/dist_ce": 0.0, + "loss/fcd": 1.03125, + "loss/idx": 12.0, + "loss/logits": 0.14295458793640137, + "step": 990 + }, + { + "epoch": 0.014797670598775571, + "grad_norm": 0.546875, + "grad_norm_var": 0.009531895319620768, + "learning_rate": 2e-05, + "loss": 1.2208, + "loss/crossentropy": 2.394542932510376, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0625, + "loss/idx": 12.0, + "loss/logits": 0.1583481729030609, + "step": 991 + }, + { + "epoch": 0.014812602657906525, + "grad_norm": 0.50390625, + "grad_norm_var": 0.009715127944946288, + "learning_rate": 2e-05, + "loss": 1.3119, + "loss/crossentropy": 2.8174941539764404, + "loss/dist_ce": 0.0, + "loss/fcd": 1.109375, + "loss/idx": 12.0, + "loss/logits": 0.20255360007286072, + "step": 992 + }, + { + "epoch": 0.01482753471703748, + "grad_norm": 0.52734375, + "grad_norm_var": 0.009419997533162435, + "learning_rate": 2e-05, + "loss": 1.1653, + "loss/crossentropy": 2.6789565086364746, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0234375, + "loss/idx": 12.0, + "loss/logits": 0.14188051223754883, + "step": 993 + }, + { + "epoch": 0.014842466776168434, + "grad_norm": 0.5390625, + "grad_norm_var": 0.009429152806599934, + "learning_rate": 2e-05, + "loss": 1.2278, + "loss/crossentropy": 2.6719415187835693, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0625, + "loss/idx": 12.0, + "loss/logits": 0.16530798375606537, + "step": 994 + }, + { + "epoch": 0.014857398835299388, + "grad_norm": 0.5, + "grad_norm_var": 0.00963451067606608, + "learning_rate": 2e-05, + "loss": 1.1983, + "loss/crossentropy": 2.6873083114624023, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0390625, + "loss/idx": 12.0, + "loss/logits": 0.15920904278755188, + "step": 995 + }, + { + "epoch": 0.014872330894430342, + "grad_norm": 0.55078125, + "grad_norm_var": 0.009456745783487956, + "learning_rate": 2e-05, + "loss": 1.2986, + "loss/crossentropy": 2.5818088054656982, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1171875, + "loss/idx": 12.0, + "loss/logits": 0.18138398230075836, + "step": 996 + }, + { + "epoch": 0.014887262953561296, + "grad_norm": 0.53515625, + "grad_norm_var": 0.009412495295206706, + "learning_rate": 2e-05, + "loss": 1.3294, + "loss/crossentropy": 2.5327281951904297, + "loss/dist_ce": 0.0, + "loss/fcd": 1.15625, + "loss/idx": 12.0, + "loss/logits": 0.1731855869293213, + "step": 997 + }, + { + "epoch": 0.01490219501269225, + "grad_norm": 0.59375, + "grad_norm_var": 0.009376255671183269, + "learning_rate": 2e-05, + "loss": 1.3438, + "loss/crossentropy": 2.5216119289398193, + "loss/dist_ce": 0.0, + "loss/fcd": 1.15625, + "loss/idx": 12.0, + "loss/logits": 0.18757742643356323, + "step": 998 + }, + { + "epoch": 0.014917127071823204, + "grad_norm": 0.4921875, + "grad_norm_var": 0.0009270826975504557, + "learning_rate": 2e-05, + "loss": 1.1393, + "loss/crossentropy": 2.517537832260132, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0, + "loss/idx": 12.0, + "loss/logits": 0.13927598297595978, + "step": 999 + }, + { + "epoch": 0.014932059130954158, + "grad_norm": 0.546875, + "grad_norm_var": 0.0009011427561442057, + "learning_rate": 2e-05, + "loss": 1.3215, + "loss/crossentropy": 2.4011945724487305, + "loss/dist_ce": 0.0, + "loss/fcd": 1.15625, + "loss/idx": 12.0, + "loss/logits": 0.16521359980106354, + "step": 1000 + }, + { + "epoch": 0.014946991190085112, + "grad_norm": 0.52734375, + "grad_norm_var": 0.0008880456288655598, + "learning_rate": 2e-05, + "loss": 1.3127, + "loss/crossentropy": 2.5374038219451904, + "loss/dist_ce": 0.0, + "loss/fcd": 1.140625, + "loss/idx": 12.0, + "loss/logits": 0.17211785912513733, + "step": 1001 + }, + { + "epoch": 0.014961923249216066, + "grad_norm": 0.51171875, + "grad_norm_var": 0.0009166558583577473, + "learning_rate": 2e-05, + "loss": 1.3361, + "loss/crossentropy": 2.324538230895996, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1484375, + "loss/idx": 13.0, + "loss/logits": 0.1876780390739441, + "step": 1002 + }, + { + "epoch": 0.01497685530834702, + "grad_norm": 1.0546875, + "grad_norm_var": 0.017824538548787437, + "learning_rate": 2e-05, + "loss": 1.3262, + "loss/crossentropy": 2.5217180252075195, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1328125, + "loss/idx": 13.0, + "loss/logits": 0.19337180256843567, + "step": 1003 + }, + { + "epoch": 0.014991787367477975, + "grad_norm": 0.91796875, + "grad_norm_var": 0.02557371457417806, + "learning_rate": 2e-05, + "loss": 1.2566, + "loss/crossentropy": 2.5869264602661133, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0859375, + "loss/idx": 13.0, + "loss/logits": 0.17063188552856445, + "step": 1004 + }, + { + "epoch": 0.015006719426608929, + "grad_norm": 1.0078125, + "grad_norm_var": 0.03638443946838379, + "learning_rate": 2e-05, + "loss": 1.5329, + "loss/crossentropy": 2.5455098152160645, + "loss/dist_ce": 0.0, + "loss/fcd": 1.28125, + "loss/idx": 13.0, + "loss/logits": 0.25163742899894714, + "step": 1005 + }, + { + "epoch": 0.015021651485739883, + "grad_norm": 0.953125, + "grad_norm_var": 0.042180315653483076, + "learning_rate": 2e-05, + "loss": 1.3301, + "loss/crossentropy": 2.805210590362549, + "loss/dist_ce": 0.0, + "loss/fcd": 1.140625, + "loss/idx": 13.0, + "loss/logits": 0.18944759666919708, + "step": 1006 + }, + { + "epoch": 0.015036583544870837, + "grad_norm": 0.92578125, + "grad_norm_var": 0.04623209635416667, + "learning_rate": 2e-05, + "loss": 1.397, + "loss/crossentropy": 2.673166275024414, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1875, + "loss/idx": 13.0, + "loss/logits": 0.209548681974411, + "step": 1007 + }, + { + "epoch": 0.015051515604001793, + "grad_norm": 0.80859375, + "grad_norm_var": 0.04536921183268229, + "learning_rate": 2e-05, + "loss": 1.353, + "loss/crossentropy": 2.5452094078063965, + "loss/dist_ce": 0.0, + "loss/fcd": 1.15625, + "loss/idx": 13.0, + "loss/logits": 0.1967710256576538, + "step": 1008 + }, + { + "epoch": 0.015066447663132747, + "grad_norm": 0.796875, + "grad_norm_var": 0.044171587626139326, + "learning_rate": 2e-05, + "loss": 1.2264, + "loss/crossentropy": 2.614394426345825, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0703125, + "loss/idx": 13.0, + "loss/logits": 0.15607105195522308, + "step": 1009 + }, + { + "epoch": 0.015081379722263701, + "grad_norm": 0.80859375, + "grad_norm_var": 0.04278971354166667, + "learning_rate": 2e-05, + "loss": 1.4443, + "loss/crossentropy": 2.5008745193481445, + "loss/dist_ce": 0.0, + "loss/fcd": 1.234375, + "loss/idx": 13.0, + "loss/logits": 0.20992863178253174, + "step": 1010 + }, + { + "epoch": 0.015096311781394655, + "grad_norm": 1.0078125, + "grad_norm_var": 0.04396336873372396, + "learning_rate": 2e-05, + "loss": 1.6703, + "loss/crossentropy": 2.8562960624694824, + "loss/dist_ce": 0.0, + "loss/fcd": 1.3828125, + "loss/idx": 13.0, + "loss/logits": 0.28753405809402466, + "step": 1011 + }, + { + "epoch": 0.015111243840525609, + "grad_norm": 0.734375, + "grad_norm_var": 0.04113356272379557, + "learning_rate": 2e-05, + "loss": 1.3257, + "loss/crossentropy": 2.57487416267395, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1328125, + "loss/idx": 13.0, + "loss/logits": 0.19289004802703857, + "step": 1012 + }, + { + "epoch": 0.015126175899656563, + "grad_norm": 0.7109375, + "grad_norm_var": 0.03770319620768229, + "learning_rate": 2e-05, + "loss": 1.2732, + "loss/crossentropy": 2.6012637615203857, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1015625, + "loss/idx": 13.0, + "loss/logits": 0.17167770862579346, + "step": 1013 + }, + { + "epoch": 0.015141107958787517, + "grad_norm": 0.734375, + "grad_norm_var": 0.03554255167643229, + "learning_rate": 2e-05, + "loss": 1.2555, + "loss/crossentropy": 2.406501531600952, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1015625, + "loss/idx": 13.0, + "loss/logits": 0.15391208231449127, + "step": 1014 + }, + { + "epoch": 0.015156040017918471, + "grad_norm": 0.72265625, + "grad_norm_var": 0.029904619852701823, + "learning_rate": 2e-05, + "loss": 1.2593, + "loss/crossentropy": 2.6688358783721924, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0859375, + "loss/idx": 13.0, + "loss/logits": 0.17337998747825623, + "step": 1015 + }, + { + "epoch": 0.015170972077049425, + "grad_norm": 0.74609375, + "grad_norm_var": 0.025712076822916666, + "learning_rate": 2e-05, + "loss": 1.36, + "loss/crossentropy": 2.654153823852539, + "loss/dist_ce": 0.0, + "loss/fcd": 1.15625, + "loss/idx": 13.0, + "loss/logits": 0.20374688506126404, + "step": 1016 + }, + { + "epoch": 0.01518590413618038, + "grad_norm": 0.640625, + "grad_norm_var": 0.02223656972249349, + "learning_rate": 2e-05, + "loss": 1.34, + "loss/crossentropy": 2.397961139678955, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1484375, + "loss/idx": 13.0, + "loss/logits": 0.19154663383960724, + "step": 1017 + }, + { + "epoch": 0.015200836195311334, + "grad_norm": 0.6640625, + "grad_norm_var": 0.017473347981770835, + "learning_rate": 2e-05, + "loss": 1.2604, + "loss/crossentropy": 2.5992560386657715, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0859375, + "loss/idx": 13.0, + "loss/logits": 0.17447641491889954, + "step": 1018 + }, + { + "epoch": 0.015215768254442288, + "grad_norm": 0.66015625, + "grad_norm_var": 0.015232276916503907, + "learning_rate": 2e-05, + "loss": 1.3838, + "loss/crossentropy": 2.2740230560302734, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1875, + "loss/idx": 13.0, + "loss/logits": 0.19633014500141144, + "step": 1019 + }, + { + "epoch": 0.015230700313573242, + "grad_norm": 0.59765625, + "grad_norm_var": 0.01671288808186849, + "learning_rate": 2e-05, + "loss": 1.2765, + "loss/crossentropy": 2.6276321411132812, + "loss/dist_ce": 0.0, + "loss/fcd": 1.09375, + "loss/idx": 13.0, + "loss/logits": 0.18275076150894165, + "step": 1020 + }, + { + "epoch": 0.015245632372704196, + "grad_norm": 0.7890625, + "grad_norm_var": 0.013131141662597656, + "learning_rate": 2e-05, + "loss": 1.5263, + "loss/crossentropy": 2.5657906532287598, + "loss/dist_ce": 0.0, + "loss/fcd": 1.25, + "loss/idx": 13.0, + "loss/logits": 0.27633097767829895, + "step": 1021 + }, + { + "epoch": 0.01526056443183515, + "grad_norm": 0.65234375, + "grad_norm_var": 0.011393229166666666, + "learning_rate": 2e-05, + "loss": 1.2135, + "loss/crossentropy": 2.5474112033843994, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0625, + "loss/idx": 13.0, + "loss/logits": 0.1510053277015686, + "step": 1022 + }, + { + "epoch": 0.015275496490966104, + "grad_norm": 0.8984375, + "grad_norm_var": 0.010799090067545572, + "learning_rate": 2e-05, + "loss": 1.6306, + "loss/crossentropy": 2.913024425506592, + "loss/dist_ce": 0.0, + "loss/fcd": 1.3515625, + "loss/idx": 13.0, + "loss/logits": 0.278994619846344, + "step": 1023 + }, + { + "epoch": 0.015290428550097058, + "grad_norm": 0.59375, + "grad_norm_var": 0.011956532796223959, + "learning_rate": 2e-05, + "loss": 1.2351, + "loss/crossentropy": 2.50426983833313, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0625, + "loss/idx": 13.0, + "loss/logits": 0.172644704580307, + "step": 1024 + }, + { + "epoch": 0.015305360609228012, + "grad_norm": 0.77734375, + "grad_norm_var": 0.011818885803222656, + "learning_rate": 2e-05, + "loss": 1.4706, + "loss/crossentropy": 2.4272844791412354, + "loss/dist_ce": 0.0, + "loss/fcd": 1.265625, + "loss/idx": 13.0, + "loss/logits": 0.20502135157585144, + "step": 1025 + }, + { + "epoch": 0.015320292668358966, + "grad_norm": 0.625, + "grad_norm_var": 0.012090810139973958, + "learning_rate": 2e-05, + "loss": 1.3414, + "loss/crossentropy": 2.1835615634918213, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1484375, + "loss/idx": 13.0, + "loss/logits": 0.19297602772712708, + "step": 1026 + }, + { + "epoch": 0.01533522472748992, + "grad_norm": 1.0078125, + "grad_norm_var": 0.012090810139973958, + "learning_rate": 2e-05, + "loss": 1.4718, + "loss/crossentropy": 2.999908924102783, + "loss/dist_ce": 0.0, + "loss/fcd": 1.2421875, + "loss/idx": 13.0, + "loss/logits": 0.2296452820301056, + "step": 1027 + }, + { + "epoch": 0.015350156786620875, + "grad_norm": 0.6484375, + "grad_norm_var": 0.012412516276041667, + "learning_rate": 2e-05, + "loss": 1.2357, + "loss/crossentropy": 2.710797071456909, + "loss/dist_ce": 0.0, + "loss/fcd": 1.078125, + "loss/idx": 13.0, + "loss/logits": 0.15754911303520203, + "step": 1028 + }, + { + "epoch": 0.015365088845751829, + "grad_norm": 0.609375, + "grad_norm_var": 0.013136545817057291, + "learning_rate": 2e-05, + "loss": 1.264, + "loss/crossentropy": 2.562701940536499, + "loss/dist_ce": 0.0, + "loss/fcd": 1.09375, + "loss/idx": 13.0, + "loss/logits": 0.17028763890266418, + "step": 1029 + }, + { + "epoch": 0.015380020904882783, + "grad_norm": 0.5703125, + "grad_norm_var": 0.014295450846354167, + "learning_rate": 2e-05, + "loss": 1.2993, + "loss/crossentropy": 2.5587525367736816, + "loss/dist_ce": 0.0, + "loss/fcd": 1.109375, + "loss/idx": 13.0, + "loss/logits": 0.1899479776620865, + "step": 1030 + }, + { + "epoch": 0.015394952964013737, + "grad_norm": 0.6640625, + "grad_norm_var": 0.014334551493326823, + "learning_rate": 2e-05, + "loss": 1.2617, + "loss/crossentropy": 2.8359696865081787, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0859375, + "loss/idx": 13.0, + "loss/logits": 0.17577612400054932, + "step": 1031 + }, + { + "epoch": 0.015409885023144691, + "grad_norm": 0.61328125, + "grad_norm_var": 0.014559364318847657, + "learning_rate": 2e-05, + "loss": 1.2215, + "loss/crossentropy": 2.6557297706604004, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0546875, + "loss/idx": 13.0, + "loss/logits": 0.16680006682872772, + "step": 1032 + }, + { + "epoch": 0.015424817082275645, + "grad_norm": 0.54296875, + "grad_norm_var": 0.015775299072265624, + "learning_rate": 2e-05, + "loss": 1.1809, + "loss/crossentropy": 2.598259687423706, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0234375, + "loss/idx": 13.0, + "loss/logits": 0.15747103095054626, + "step": 1033 + }, + { + "epoch": 0.015439749141406599, + "grad_norm": 0.59765625, + "grad_norm_var": 0.01621087392171224, + "learning_rate": 2e-05, + "loss": 1.293, + "loss/crossentropy": 2.3652303218841553, + "loss/dist_ce": 0.0, + "loss/fcd": 1.109375, + "loss/idx": 13.0, + "loss/logits": 0.18361005187034607, + "step": 1034 + }, + { + "epoch": 0.015454681200537555, + "grad_norm": 0.60546875, + "grad_norm_var": 0.016527748107910155, + "learning_rate": 2e-05, + "loss": 1.2546, + "loss/crossentropy": 2.5818984508514404, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0859375, + "loss/idx": 13.0, + "loss/logits": 0.16868820786476135, + "step": 1035 + }, + { + "epoch": 0.015469613259668509, + "grad_norm": 0.61328125, + "grad_norm_var": 0.016382789611816405, + "learning_rate": 2e-05, + "loss": 1.2933, + "loss/crossentropy": 2.655116081237793, + "loss/dist_ce": 0.0, + "loss/fcd": 1.109375, + "loss/idx": 13.0, + "loss/logits": 0.1839730441570282, + "step": 1036 + }, + { + "epoch": 0.015484545318799463, + "grad_norm": 0.546875, + "grad_norm_var": 0.016382789611816405, + "learning_rate": 2e-05, + "loss": 1.2268, + "loss/crossentropy": 2.541788339614868, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0625, + "loss/idx": 13.0, + "loss/logits": 0.16428236663341522, + "step": 1037 + }, + { + "epoch": 0.015499477377930417, + "grad_norm": 0.55859375, + "grad_norm_var": 0.017032814025878907, + "learning_rate": 2e-05, + "loss": 1.264, + "loss/crossentropy": 2.7263083457946777, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0859375, + "loss/idx": 13.0, + "loss/logits": 0.17802877724170685, + "step": 1038 + }, + { + "epoch": 0.015514409437061371, + "grad_norm": 0.546875, + "grad_norm_var": 0.013324928283691407, + "learning_rate": 2e-05, + "loss": 1.2371, + "loss/crossentropy": 2.8279428482055664, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0546875, + "loss/idx": 13.0, + "loss/logits": 0.18236854672431946, + "step": 1039 + }, + { + "epoch": 0.015529341496192325, + "grad_norm": 0.53125, + "grad_norm_var": 0.013892555236816406, + "learning_rate": 2e-05, + "loss": 1.2164, + "loss/crossentropy": 2.6993460655212402, + "loss/dist_ce": 0.0, + "loss/fcd": 1.046875, + "loss/idx": 13.0, + "loss/logits": 0.16949616372585297, + "step": 1040 + }, + { + "epoch": 0.01554427355532328, + "grad_norm": 0.5703125, + "grad_norm_var": 0.012467193603515624, + "learning_rate": 2e-05, + "loss": 1.2521, + "loss/crossentropy": 2.532785654067993, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0859375, + "loss/idx": 13.0, + "loss/logits": 0.16616228222846985, + "step": 1041 + }, + { + "epoch": 0.015559205614454234, + "grad_norm": 0.6171875, + "grad_norm_var": 0.012461344401041666, + "learning_rate": 2e-05, + "loss": 1.2867, + "loss/crossentropy": 2.5695011615753174, + "loss/dist_ce": 0.0, + "loss/fcd": 1.109375, + "loss/idx": 13.0, + "loss/logits": 0.1772921234369278, + "step": 1042 + }, + { + "epoch": 0.015574137673585188, + "grad_norm": 0.53515625, + "grad_norm_var": 0.001683489481608073, + "learning_rate": 2e-05, + "loss": 1.2487, + "loss/crossentropy": 2.461456537246704, + "loss/dist_ce": 0.0, + "loss/fcd": 1.078125, + "loss/idx": 13.0, + "loss/logits": 0.1706066131591797, + "step": 1043 + }, + { + "epoch": 0.015589069732716142, + "grad_norm": 0.63671875, + "grad_norm_var": 0.0015940348307291667, + "learning_rate": 2e-05, + "loss": 1.2816, + "loss/crossentropy": 2.573848247528076, + "loss/dist_ce": 0.0, + "loss/fcd": 1.109375, + "loss/idx": 13.0, + "loss/logits": 0.1722354292869568, + "step": 1044 + }, + { + "epoch": 0.015604001791847096, + "grad_norm": 0.5546875, + "grad_norm_var": 0.001602935791015625, + "learning_rate": 2e-05, + "loss": 1.179, + "loss/crossentropy": 2.5369319915771484, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0234375, + "loss/idx": 13.0, + "loss/logits": 0.1555238664150238, + "step": 1045 + }, + { + "epoch": 0.01561893385097805, + "grad_norm": 0.62890625, + "grad_norm_var": 0.0017297744750976562, + "learning_rate": 2e-05, + "loss": 1.3623, + "loss/crossentropy": 2.646678924560547, + "loss/dist_ce": 0.0, + "loss/fcd": 1.171875, + "loss/idx": 13.0, + "loss/logits": 0.19046559929847717, + "step": 1046 + }, + { + "epoch": 0.015633865910109002, + "grad_norm": 0.52734375, + "grad_norm_var": 0.0014605204264322917, + "learning_rate": 2e-05, + "loss": 1.2948, + "loss/crossentropy": 2.7469944953918457, + "loss/dist_ce": 0.0, + "loss/fcd": 1.109375, + "loss/idx": 13.0, + "loss/logits": 0.18540921807289124, + "step": 1047 + }, + { + "epoch": 0.015648797969239958, + "grad_norm": 0.61328125, + "grad_norm_var": 0.0014605204264322917, + "learning_rate": 2e-05, + "loss": 1.246, + "loss/crossentropy": 2.574913501739502, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0703125, + "loss/idx": 13.0, + "loss/logits": 0.17573140561580658, + "step": 1048 + }, + { + "epoch": 0.015663730028370914, + "grad_norm": 0.5, + "grad_norm_var": 0.0017689387003580728, + "learning_rate": 2e-05, + "loss": 1.2004, + "loss/crossentropy": 2.689748525619507, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0390625, + "loss/idx": 13.0, + "loss/logits": 0.16129891574382782, + "step": 1049 + }, + { + "epoch": 0.015678662087501866, + "grad_norm": 0.48828125, + "grad_norm_var": 0.0021712621053059894, + "learning_rate": 2e-05, + "loss": 1.2207, + "loss/crossentropy": 2.6240992546081543, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0546875, + "loss/idx": 13.0, + "loss/logits": 0.16604174673557281, + "step": 1050 + }, + { + "epoch": 0.015693594146632822, + "grad_norm": 0.58984375, + "grad_norm_var": 0.0021066665649414062, + "learning_rate": 2e-05, + "loss": 1.3609, + "loss/crossentropy": 2.4291961193084717, + "loss/dist_ce": 0.0, + "loss/fcd": 1.15625, + "loss/idx": 13.0, + "loss/logits": 0.20462797582149506, + "step": 1051 + }, + { + "epoch": 0.015708526205763775, + "grad_norm": 0.53515625, + "grad_norm_var": 0.0019973119099934897, + "learning_rate": 2e-05, + "loss": 1.1655, + "loss/crossentropy": 2.6186795234680176, + "loss/dist_ce": 0.0, + "loss/fcd": 1.015625, + "loss/idx": 13.0, + "loss/logits": 0.1498570740222931, + "step": 1052 + }, + { + "epoch": 0.01572345826489473, + "grad_norm": 0.515625, + "grad_norm_var": 0.002118364969889323, + "learning_rate": 2e-05, + "loss": 1.3027, + "loss/crossentropy": 2.752519369125366, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1171875, + "loss/idx": 13.0, + "loss/logits": 0.18552502989768982, + "step": 1053 + }, + { + "epoch": 0.015738390324025683, + "grad_norm": 0.76171875, + "grad_norm_var": 0.0046772638956705725, + "learning_rate": 2e-05, + "loss": 1.3994, + "loss/crossentropy": 2.4993913173675537, + "loss/dist_ce": 0.0, + "loss/fcd": 1.203125, + "loss/idx": 13.0, + "loss/logits": 0.1962486207485199, + "step": 1054 + }, + { + "epoch": 0.01575332238315664, + "grad_norm": 0.54296875, + "grad_norm_var": 0.004691314697265625, + "learning_rate": 2e-05, + "loss": 1.213, + "loss/crossentropy": 2.584184408187866, + "loss/dist_ce": 0.0, + "loss/fcd": 1.046875, + "loss/idx": 13.0, + "loss/logits": 0.1661267727613449, + "step": 1055 + }, + { + "epoch": 0.01576825444228759, + "grad_norm": 0.55078125, + "grad_norm_var": 0.00460961659749349, + "learning_rate": 2e-05, + "loss": 1.3191, + "loss/crossentropy": 2.708296775817871, + "loss/dist_ce": 0.0, + "loss/fcd": 1.125, + "loss/idx": 13.0, + "loss/logits": 0.1941191405057907, + "step": 1056 + }, + { + "epoch": 0.015783186501418547, + "grad_norm": 0.5390625, + "grad_norm_var": 0.004681841532389323, + "learning_rate": 2e-05, + "loss": 1.3431, + "loss/crossentropy": 2.7722063064575195, + "loss/dist_ce": 0.0, + "loss/fcd": 1.140625, + "loss/idx": 13.0, + "loss/logits": 0.20250889658927917, + "step": 1057 + }, + { + "epoch": 0.0157981185605495, + "grad_norm": 0.494140625, + "grad_norm_var": 0.004871098200480143, + "learning_rate": 2e-05, + "loss": 1.1876, + "loss/crossentropy": 2.6514158248901367, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0234375, + "loss/idx": 13.0, + "loss/logits": 0.16421112418174744, + "step": 1058 + }, + { + "epoch": 0.015813050619680455, + "grad_norm": 0.56640625, + "grad_norm_var": 0.004814640680948893, + "learning_rate": 2e-05, + "loss": 1.2953, + "loss/crossentropy": 2.65419864654541, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1015625, + "loss/idx": 13.0, + "loss/logits": 0.19374266266822815, + "step": 1059 + }, + { + "epoch": 0.015827982678811407, + "grad_norm": 0.6953125, + "grad_norm_var": 0.0055871168772379555, + "learning_rate": 2e-05, + "loss": 1.3446, + "loss/crossentropy": 2.398844003677368, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1484375, + "loss/idx": 13.0, + "loss/logits": 0.19617103040218353, + "step": 1060 + }, + { + "epoch": 0.015842914737942363, + "grad_norm": 0.58984375, + "grad_norm_var": 0.005597416559855143, + "learning_rate": 2e-05, + "loss": 1.2199, + "loss/crossentropy": 2.6054039001464844, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0625, + "loss/idx": 13.0, + "loss/logits": 0.15744562447071075, + "step": 1061 + }, + { + "epoch": 0.015857846797073315, + "grad_norm": 0.8125, + "grad_norm_var": 0.009117492039998372, + "learning_rate": 2e-05, + "loss": 1.2721, + "loss/crossentropy": 2.718179702758789, + "loss/dist_ce": 0.0, + "loss/fcd": 1.078125, + "loss/idx": 13.0, + "loss/logits": 0.1940201371908188, + "step": 1062 + }, + { + "epoch": 0.01587277885620427, + "grad_norm": 0.765625, + "grad_norm_var": 0.010909255345662434, + "learning_rate": 2e-05, + "loss": 1.422, + "loss/crossentropy": 2.877493381500244, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1875, + "loss/idx": 13.0, + "loss/logits": 0.2344614416360855, + "step": 1063 + }, + { + "epoch": 0.015887710915335224, + "grad_norm": 0.62890625, + "grad_norm_var": 0.01095732053120931, + "learning_rate": 2e-05, + "loss": 1.4013, + "loss/crossentropy": 2.4012539386749268, + "loss/dist_ce": 0.0, + "loss/fcd": 1.203125, + "loss/idx": 13.0, + "loss/logits": 0.1981934905052185, + "step": 1064 + }, + { + "epoch": 0.01590264297446618, + "grad_norm": 0.53515625, + "grad_norm_var": 0.01057279904683431, + "learning_rate": 2e-05, + "loss": 1.2988, + "loss/crossentropy": 2.4239413738250732, + "loss/dist_ce": 0.0, + "loss/fcd": 1.109375, + "loss/idx": 13.0, + "loss/logits": 0.18942078948020935, + "step": 1065 + }, + { + "epoch": 0.015917575033597132, + "grad_norm": 0.62109375, + "grad_norm_var": 0.009684356053670247, + "learning_rate": 2e-05, + "loss": 1.3502, + "loss/crossentropy": 2.4800355434417725, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1484375, + "loss/idx": 13.0, + "loss/logits": 0.2017608880996704, + "step": 1066 + }, + { + "epoch": 0.015932507092728088, + "grad_norm": 0.5234375, + "grad_norm_var": 0.010129658381144206, + "learning_rate": 2e-05, + "loss": 1.2782, + "loss/crossentropy": 2.4825854301452637, + "loss/dist_ce": 0.0, + "loss/fcd": 1.109375, + "loss/idx": 13.0, + "loss/logits": 0.16887298226356506, + "step": 1067 + }, + { + "epoch": 0.01594743915185904, + "grad_norm": 0.494140625, + "grad_norm_var": 0.010615984598795572, + "learning_rate": 2e-05, + "loss": 1.2356, + "loss/crossentropy": 2.487943649291992, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0625, + "loss/idx": 13.0, + "loss/logits": 0.17307588458061218, + "step": 1068 + }, + { + "epoch": 0.015962371210989996, + "grad_norm": 0.52734375, + "grad_norm_var": 0.010489145914713541, + "learning_rate": 2e-05, + "loss": 1.2028, + "loss/crossentropy": 2.282266855239868, + "loss/dist_ce": 0.0, + "loss/fcd": 1.046875, + "loss/idx": 13.0, + "loss/logits": 0.1559266895055771, + "step": 1069 + }, + { + "epoch": 0.015977303270120948, + "grad_norm": 0.5078125, + "grad_norm_var": 0.00914605458577474, + "learning_rate": 2e-05, + "loss": 1.2308, + "loss/crossentropy": 2.5250964164733887, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0546875, + "loss/idx": 13.0, + "loss/logits": 0.17615896463394165, + "step": 1070 + }, + { + "epoch": 0.015992235329251904, + "grad_norm": 0.546875, + "grad_norm_var": 0.009123992919921876, + "learning_rate": 2e-05, + "loss": 1.2686, + "loss/crossentropy": 2.4969565868377686, + "loss/dist_ce": 0.0, + "loss/fcd": 1.09375, + "loss/idx": 13.0, + "loss/logits": 0.17482084035873413, + "step": 1071 + }, + { + "epoch": 0.016007167388382856, + "grad_norm": 0.72265625, + "grad_norm_var": 0.010131072998046876, + "learning_rate": 2e-05, + "loss": 1.4118, + "loss/crossentropy": 2.3616156578063965, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1875, + "loss/idx": 13.0, + "loss/logits": 0.2243403196334839, + "step": 1072 + }, + { + "epoch": 0.016022099447513812, + "grad_norm": 0.6171875, + "grad_norm_var": 0.009897104899088542, + "learning_rate": 2e-05, + "loss": 1.2862, + "loss/crossentropy": 2.4683635234832764, + "loss/dist_ce": 0.0, + "loss/fcd": 1.109375, + "loss/idx": 13.0, + "loss/logits": 0.17678457498550415, + "step": 1073 + }, + { + "epoch": 0.016037031506644768, + "grad_norm": 0.57421875, + "grad_norm_var": 0.009135293960571288, + "learning_rate": 2e-05, + "loss": 1.2834, + "loss/crossentropy": 2.6184275150299072, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1015625, + "loss/idx": 13.0, + "loss/logits": 0.1817963421344757, + "step": 1074 + }, + { + "epoch": 0.01605196356577572, + "grad_norm": 0.6015625, + "grad_norm_var": 0.009017419815063477, + "learning_rate": 2e-05, + "loss": 1.4443, + "loss/crossentropy": 2.3622615337371826, + "loss/dist_ce": 0.0, + "loss/fcd": 1.2421875, + "loss/idx": 13.0, + "loss/logits": 0.20212198793888092, + "step": 1075 + }, + { + "epoch": 0.016066895624906676, + "grad_norm": 0.5078125, + "grad_norm_var": 0.009087610244750976, + "learning_rate": 2e-05, + "loss": 1.2307, + "loss/crossentropy": 2.550598382949829, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0625, + "loss/idx": 13.0, + "loss/logits": 0.1681874394416809, + "step": 1076 + }, + { + "epoch": 0.01608182768403763, + "grad_norm": 0.5234375, + "grad_norm_var": 0.009439961115519205, + "learning_rate": 2e-05, + "loss": 1.2534, + "loss/crossentropy": 2.5999906063079834, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0703125, + "loss/idx": 13.0, + "loss/logits": 0.183104008436203, + "step": 1077 + }, + { + "epoch": 0.016096759743168584, + "grad_norm": 0.546875, + "grad_norm_var": 0.006123971939086914, + "learning_rate": 2e-05, + "loss": 1.3151, + "loss/crossentropy": 2.518636703491211, + "loss/dist_ce": 0.0, + "loss/fcd": 1.125, + "loss/idx": 13.0, + "loss/logits": 0.190095454454422, + "step": 1078 + }, + { + "epoch": 0.016111691802299537, + "grad_norm": 0.53515625, + "grad_norm_var": 0.0036707401275634767, + "learning_rate": 2e-05, + "loss": 1.2221, + "loss/crossentropy": 2.473445177078247, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0625, + "loss/idx": 13.0, + "loss/logits": 0.15955936908721924, + "step": 1079 + }, + { + "epoch": 0.016126623861430493, + "grad_norm": 0.50390625, + "grad_norm_var": 0.0035547733306884764, + "learning_rate": 2e-05, + "loss": 1.2402, + "loss/crossentropy": 2.5388095378875732, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0703125, + "loss/idx": 13.0, + "loss/logits": 0.16985295712947845, + "step": 1080 + }, + { + "epoch": 0.016141555920561445, + "grad_norm": 0.5625, + "grad_norm_var": 0.003527180353800456, + "learning_rate": 2e-05, + "loss": 1.3255, + "loss/crossentropy": 2.577446222305298, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1328125, + "loss/idx": 13.0, + "loss/logits": 0.19264891743659973, + "step": 1081 + }, + { + "epoch": 0.0161564879796924, + "grad_norm": 0.47265625, + "grad_norm_var": 0.00364073117574056, + "learning_rate": 2e-05, + "loss": 1.1981, + "loss/crossentropy": 2.561262369155884, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0390625, + "loss/idx": 13.0, + "loss/logits": 0.1590115875005722, + "step": 1082 + }, + { + "epoch": 0.016171420038823353, + "grad_norm": 0.58984375, + "grad_norm_var": 0.0036990960439046224, + "learning_rate": 2e-05, + "loss": 1.3113, + "loss/crossentropy": 2.3770103454589844, + "loss/dist_ce": 0.0, + "loss/fcd": 1.125, + "loss/idx": 13.0, + "loss/logits": 0.18632197380065918, + "step": 1083 + }, + { + "epoch": 0.01618635209795431, + "grad_norm": 0.6796875, + "grad_norm_var": 0.004416338602701823, + "learning_rate": 2e-05, + "loss": 1.4186, + "loss/crossentropy": 2.4611380100250244, + "loss/dist_ce": 0.0, + "loss/fcd": 1.21875, + "loss/idx": 13.0, + "loss/logits": 0.19985929131507874, + "step": 1084 + }, + { + "epoch": 0.01620128415708526, + "grad_norm": 0.52734375, + "grad_norm_var": 0.004416338602701823, + "learning_rate": 2e-05, + "loss": 1.2343, + "loss/crossentropy": 2.508803129196167, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0625, + "loss/idx": 13.0, + "loss/logits": 0.17179152369499207, + "step": 1085 + }, + { + "epoch": 0.016216216216216217, + "grad_norm": 0.7578125, + "grad_norm_var": 0.00645898183186849, + "learning_rate": 2e-05, + "loss": 1.271, + "loss/crossentropy": 2.593477964401245, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0703125, + "loss/idx": 13.0, + "loss/logits": 0.20067906379699707, + "step": 1086 + }, + { + "epoch": 0.01623114827534717, + "grad_norm": 0.515625, + "grad_norm_var": 0.006655311584472657, + "learning_rate": 2e-05, + "loss": 1.1943, + "loss/crossentropy": 2.566943407058716, + "loss/dist_ce": 0.0, + "loss/fcd": 1.03125, + "loss/idx": 13.0, + "loss/logits": 0.16300199925899506, + "step": 1087 + }, + { + "epoch": 0.016246080334478125, + "grad_norm": 0.54296875, + "grad_norm_var": 0.005193010965983073, + "learning_rate": 2e-05, + "loss": 1.1865, + "loss/crossentropy": 2.6508474349975586, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0234375, + "loss/idx": 13.0, + "loss/logits": 0.1630159616470337, + "step": 1088 + }, + { + "epoch": 0.016261012393609078, + "grad_norm": 0.47265625, + "grad_norm_var": 0.005515289306640625, + "learning_rate": 2e-05, + "loss": 1.1207, + "loss/crossentropy": 2.662527561187744, + "loss/dist_ce": 0.0, + "loss/fcd": 0.98046875, + "loss/idx": 13.0, + "loss/logits": 0.14020352065563202, + "step": 1089 + }, + { + "epoch": 0.016275944452740033, + "grad_norm": 0.65625, + "grad_norm_var": 0.006122779846191406, + "learning_rate": 2e-05, + "loss": 1.2878, + "loss/crossentropy": 2.4766385555267334, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0859375, + "loss/idx": 13.0, + "loss/logits": 0.2018873393535614, + "step": 1090 + }, + { + "epoch": 0.016290876511870986, + "grad_norm": 0.52734375, + "grad_norm_var": 0.006078084309895833, + "learning_rate": 2e-05, + "loss": 1.3068, + "loss/crossentropy": 2.465484380722046, + "loss/dist_ce": 0.0, + "loss/fcd": 1.125, + "loss/idx": 13.0, + "loss/logits": 0.18179619312286377, + "step": 1091 + }, + { + "epoch": 0.01630580857100194, + "grad_norm": 0.6015625, + "grad_norm_var": 0.006004842122395834, + "learning_rate": 2e-05, + "loss": 1.383, + "loss/crossentropy": 2.4888992309570312, + "loss/dist_ce": 0.0, + "loss/fcd": 1.171875, + "loss/idx": 13.0, + "loss/logits": 0.21115002036094666, + "step": 1092 + }, + { + "epoch": 0.016320740630132894, + "grad_norm": 0.66015625, + "grad_norm_var": 0.006443214416503906, + "learning_rate": 2e-05, + "loss": 1.5633, + "loss/crossentropy": 2.5339741706848145, + "loss/dist_ce": 0.0, + "loss/fcd": 1.3125, + "loss/idx": 13.0, + "loss/logits": 0.2507936358451843, + "step": 1093 + }, + { + "epoch": 0.01633567268926385, + "grad_norm": 0.48046875, + "grad_norm_var": 0.006941477457682292, + "learning_rate": 2e-05, + "loss": 1.1839, + "loss/crossentropy": 2.5070247650146484, + "loss/dist_ce": 0.0, + "loss/fcd": 1.03125, + "loss/idx": 13.0, + "loss/logits": 0.1526746153831482, + "step": 1094 + }, + { + "epoch": 0.016350604748394802, + "grad_norm": 0.58203125, + "grad_norm_var": 0.0068743387858072914, + "learning_rate": 2e-05, + "loss": 1.3714, + "loss/crossentropy": 2.454162359237671, + "loss/dist_ce": 0.0, + "loss/fcd": 1.171875, + "loss/idx": 13.0, + "loss/logits": 0.19956299662590027, + "step": 1095 + }, + { + "epoch": 0.016365536807525758, + "grad_norm": 0.5, + "grad_norm_var": 0.006910133361816406, + "learning_rate": 2e-05, + "loss": 1.1066, + "loss/crossentropy": 2.6837570667266846, + "loss/dist_ce": 0.0, + "loss/fcd": 0.9609375, + "loss/idx": 13.0, + "loss/logits": 0.14565584063529968, + "step": 1096 + }, + { + "epoch": 0.01638046886665671, + "grad_norm": 0.6015625, + "grad_norm_var": 0.006963539123535156, + "learning_rate": 2e-05, + "loss": 1.3122, + "loss/crossentropy": 2.467454433441162, + "loss/dist_ce": 0.0, + "loss/fcd": 1.125, + "loss/idx": 13.0, + "loss/logits": 0.18724094331264496, + "step": 1097 + }, + { + "epoch": 0.016395400925787666, + "grad_norm": 0.65234375, + "grad_norm_var": 0.006577491760253906, + "learning_rate": 2e-05, + "loss": 1.3914, + "loss/crossentropy": 2.465182065963745, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1953125, + "loss/idx": 13.0, + "loss/logits": 0.19610178470611572, + "step": 1098 + }, + { + "epoch": 0.01641033298491862, + "grad_norm": 0.5703125, + "grad_norm_var": 0.006586710611979167, + "learning_rate": 2e-05, + "loss": 1.3055, + "loss/crossentropy": 2.500645637512207, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1328125, + "loss/idx": 13.0, + "loss/logits": 0.17266511917114258, + "step": 1099 + }, + { + "epoch": 0.016425265044049574, + "grad_norm": 0.59375, + "grad_norm_var": 0.005940500895182292, + "learning_rate": 2e-05, + "loss": 1.3624, + "loss/crossentropy": 2.6782145500183105, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1484375, + "loss/idx": 13.0, + "loss/logits": 0.21395066380500793, + "step": 1100 + }, + { + "epoch": 0.01644019710318053, + "grad_norm": 0.5546875, + "grad_norm_var": 0.005803871154785156, + "learning_rate": 2e-05, + "loss": 1.2681, + "loss/crossentropy": 2.5062341690063477, + "loss/dist_ce": 0.0, + "loss/fcd": 1.09375, + "loss/idx": 13.0, + "loss/logits": 0.17436693608760834, + "step": 1101 + }, + { + "epoch": 0.016455129162311483, + "grad_norm": 0.53125, + "grad_norm_var": 0.0036208470662434894, + "learning_rate": 2e-05, + "loss": 1.2351, + "loss/crossentropy": 2.7024452686309814, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0625, + "loss/idx": 13.0, + "loss/logits": 0.17257672548294067, + "step": 1102 + }, + { + "epoch": 0.01647006122144244, + "grad_norm": 0.5546875, + "grad_norm_var": 0.003458086649576823, + "learning_rate": 2e-05, + "loss": 1.3367, + "loss/crossentropy": 2.4043080806732178, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1484375, + "loss/idx": 13.0, + "loss/logits": 0.1883116364479065, + "step": 1103 + }, + { + "epoch": 0.01648499328057339, + "grad_norm": 0.55078125, + "grad_norm_var": 0.0034362157185872395, + "learning_rate": 2e-05, + "loss": 1.2919, + "loss/crossentropy": 2.891212224960327, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1015625, + "loss/idx": 13.0, + "loss/logits": 0.19036878645420074, + "step": 1104 + }, + { + "epoch": 0.016499925339704347, + "grad_norm": 0.515625, + "grad_norm_var": 0.0030047098795572915, + "learning_rate": 2e-05, + "loss": 1.2393, + "loss/crossentropy": 2.4849252700805664, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0703125, + "loss/idx": 13.0, + "loss/logits": 0.16896244883537292, + "step": 1105 + }, + { + "epoch": 0.0165148573988353, + "grad_norm": 0.6953125, + "grad_norm_var": 0.0035451253255208335, + "learning_rate": 2e-05, + "loss": 1.376, + "loss/crossentropy": 2.5396435260772705, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1875, + "loss/idx": 13.0, + "loss/logits": 0.18845024704933167, + "step": 1106 + }, + { + "epoch": 0.016529789457966255, + "grad_norm": 0.4765625, + "grad_norm_var": 0.004017066955566406, + "learning_rate": 2e-05, + "loss": 1.0813, + "loss/crossentropy": 2.8166513442993164, + "loss/dist_ce": 0.0, + "loss/fcd": 0.9453125, + "loss/idx": 13.0, + "loss/logits": 0.13603615760803223, + "step": 1107 + }, + { + "epoch": 0.016544721517097207, + "grad_norm": 0.609375, + "grad_norm_var": 0.004053688049316407, + "learning_rate": 2e-05, + "loss": 1.3427, + "loss/crossentropy": 2.6875510215759277, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1484375, + "loss/idx": 13.0, + "loss/logits": 0.1942945122718811, + "step": 1108 + }, + { + "epoch": 0.016559653576228163, + "grad_norm": 0.5078125, + "grad_norm_var": 0.003684234619140625, + "learning_rate": 2e-05, + "loss": 1.2945, + "loss/crossentropy": 2.7076685428619385, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1015625, + "loss/idx": 13.0, + "loss/logits": 0.19290290772914886, + "step": 1109 + }, + { + "epoch": 0.016574585635359115, + "grad_norm": 0.5703125, + "grad_norm_var": 0.003223609924316406, + "learning_rate": 2e-05, + "loss": 1.2231, + "loss/crossentropy": 2.6345443725585938, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0546875, + "loss/idx": 13.0, + "loss/logits": 0.16839203238487244, + "step": 1110 + }, + { + "epoch": 0.01658951769449007, + "grad_norm": 0.5546875, + "grad_norm_var": 0.003214263916015625, + "learning_rate": 2e-05, + "loss": 1.305, + "loss/crossentropy": 2.5353426933288574, + "loss/dist_ce": 0.0, + "loss/fcd": 1.125, + "loss/idx": 13.0, + "loss/logits": 0.17995133996009827, + "step": 1111 + }, + { + "epoch": 0.016604449753621024, + "grad_norm": 0.53125, + "grad_norm_var": 0.0030047098795572915, + "learning_rate": 2e-05, + "loss": 1.2216, + "loss/crossentropy": 2.474064826965332, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0546875, + "loss/idx": 13.0, + "loss/logits": 0.1669555902481079, + "step": 1112 + }, + { + "epoch": 0.01661938181275198, + "grad_norm": 0.53125, + "grad_norm_var": 0.0029886881510416668, + "learning_rate": 2e-05, + "loss": 1.2778, + "loss/crossentropy": 2.3786816596984863, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1015625, + "loss/idx": 13.0, + "loss/logits": 0.17628170549869537, + "step": 1113 + }, + { + "epoch": 0.01663431387188293, + "grad_norm": 0.494140625, + "grad_norm_var": 0.002657810846964518, + "learning_rate": 2e-05, + "loss": 1.2608, + "loss/crossentropy": 2.5369842052459717, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0859375, + "loss/idx": 13.0, + "loss/logits": 0.17481496930122375, + "step": 1114 + }, + { + "epoch": 0.016649245931013888, + "grad_norm": 0.59765625, + "grad_norm_var": 0.0027690728505452475, + "learning_rate": 2e-05, + "loss": 1.3831, + "loss/crossentropy": 2.764906167984009, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1796875, + "loss/idx": 13.0, + "loss/logits": 0.20345237851142883, + "step": 1115 + }, + { + "epoch": 0.01666417799014484, + "grad_norm": 0.57421875, + "grad_norm_var": 0.0026902357737223306, + "learning_rate": 2e-05, + "loss": 1.42, + "loss/crossentropy": 2.3619296550750732, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1953125, + "loss/idx": 13.0, + "loss/logits": 0.22464251518249512, + "step": 1116 + }, + { + "epoch": 0.016679110049275796, + "grad_norm": 0.515625, + "grad_norm_var": 0.0027773380279541016, + "learning_rate": 2e-05, + "loss": 1.2734, + "loss/crossentropy": 2.6021158695220947, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0859375, + "loss/idx": 13.0, + "loss/logits": 0.18749283254146576, + "step": 1117 + }, + { + "epoch": 0.016694042108406748, + "grad_norm": 0.703125, + "grad_norm_var": 0.004178857803344727, + "learning_rate": 2e-05, + "loss": 1.4517, + "loss/crossentropy": 2.440187931060791, + "loss/dist_ce": 0.0, + "loss/fcd": 1.25, + "loss/idx": 13.0, + "loss/logits": 0.20173925161361694, + "step": 1118 + }, + { + "epoch": 0.016708974167537704, + "grad_norm": 0.5546875, + "grad_norm_var": 0.004178857803344727, + "learning_rate": 2e-05, + "loss": 1.4031, + "loss/crossentropy": 2.343580722808838, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1875, + "loss/idx": 13.0, + "loss/logits": 0.21561887860298157, + "step": 1119 + }, + { + "epoch": 0.016723906226668656, + "grad_norm": 0.6328125, + "grad_norm_var": 0.004483270645141602, + "learning_rate": 2e-05, + "loss": 1.4455, + "loss/crossentropy": 2.702117681503296, + "loss/dist_ce": 0.0, + "loss/fcd": 1.203125, + "loss/idx": 13.0, + "loss/logits": 0.24238847196102142, + "step": 1120 + }, + { + "epoch": 0.016738838285799612, + "grad_norm": 0.7734375, + "grad_norm_var": 0.006887674331665039, + "learning_rate": 2e-05, + "loss": 1.5223, + "loss/crossentropy": 2.4388134479522705, + "loss/dist_ce": 0.0, + "loss/fcd": 1.2890625, + "loss/idx": 13.0, + "loss/logits": 0.23323826491832733, + "step": 1121 + }, + { + "epoch": 0.016753770344930564, + "grad_norm": 0.6015625, + "grad_norm_var": 0.006028604507446289, + "learning_rate": 2e-05, + "loss": 1.3789, + "loss/crossentropy": 2.9548146724700928, + "loss/dist_ce": 0.0, + "loss/fcd": 1.171875, + "loss/idx": 13.0, + "loss/logits": 0.20707201957702637, + "step": 1122 + }, + { + "epoch": 0.01676870240406152, + "grad_norm": 0.62109375, + "grad_norm_var": 0.005402866999308268, + "learning_rate": 2e-05, + "loss": 1.2119, + "loss/crossentropy": 2.4389278888702393, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0546875, + "loss/idx": 13.0, + "loss/logits": 0.1572415977716446, + "step": 1123 + }, + { + "epoch": 0.016783634463192473, + "grad_norm": 0.640625, + "grad_norm_var": 0.0055620670318603516, + "learning_rate": 2e-05, + "loss": 1.4007, + "loss/crossentropy": 2.364082098007202, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1953125, + "loss/idx": 13.0, + "loss/logits": 0.20534616708755493, + "step": 1124 + }, + { + "epoch": 0.01679856652232343, + "grad_norm": 0.67578125, + "grad_norm_var": 0.005534728368123372, + "learning_rate": 2e-05, + "loss": 1.3358, + "loss/crossentropy": 2.7452070713043213, + "loss/dist_ce": 0.0, + "loss/fcd": 1.140625, + "loss/idx": 13.0, + "loss/logits": 0.19514214992523193, + "step": 1125 + }, + { + "epoch": 0.016813498581454384, + "grad_norm": 0.57421875, + "grad_norm_var": 0.005521122614542643, + "learning_rate": 2e-05, + "loss": 1.3119, + "loss/crossentropy": 2.608513116836548, + "loss/dist_ce": 0.0, + "loss/fcd": 1.109375, + "loss/idx": 13.0, + "loss/logits": 0.2025601863861084, + "step": 1126 + }, + { + "epoch": 0.016828430640585337, + "grad_norm": 0.546875, + "grad_norm_var": 0.0055705865224202475, + "learning_rate": 2e-05, + "loss": 1.2455, + "loss/crossentropy": 2.6770386695861816, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0703125, + "loss/idx": 13.0, + "loss/logits": 0.17516326904296875, + "step": 1127 + }, + { + "epoch": 0.016843362699716292, + "grad_norm": 0.6171875, + "grad_norm_var": 0.0052670637766520185, + "learning_rate": 2e-05, + "loss": 1.2574, + "loss/crossentropy": 2.6152310371398926, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0859375, + "loss/idx": 13.0, + "loss/logits": 0.17150159180164337, + "step": 1128 + }, + { + "epoch": 0.016858294758847245, + "grad_norm": 0.57421875, + "grad_norm_var": 0.0049691359202067055, + "learning_rate": 2e-05, + "loss": 1.2696, + "loss/crossentropy": 2.4857561588287354, + "loss/dist_ce": 0.0, + "loss/fcd": 1.09375, + "loss/idx": 13.0, + "loss/logits": 0.1758662760257721, + "step": 1129 + }, + { + "epoch": 0.0168732268179782, + "grad_norm": 0.54296875, + "grad_norm_var": 0.004389381408691407, + "learning_rate": 2e-05, + "loss": 1.2337, + "loss/crossentropy": 2.617910623550415, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0703125, + "loss/idx": 13.0, + "loss/logits": 0.1634291708469391, + "step": 1130 + }, + { + "epoch": 0.016888158877109153, + "grad_norm": 0.5234375, + "grad_norm_var": 0.004847208658854167, + "learning_rate": 2e-05, + "loss": 1.2272, + "loss/crossentropy": 2.406134605407715, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0625, + "loss/idx": 13.0, + "loss/logits": 0.16469454765319824, + "step": 1131 + }, + { + "epoch": 0.01690309093624011, + "grad_norm": 0.578125, + "grad_norm_var": 0.0048323949178059895, + "learning_rate": 2e-05, + "loss": 1.3356, + "loss/crossentropy": 2.5499508380889893, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1328125, + "loss/idx": 13.0, + "loss/logits": 0.2028346061706543, + "step": 1132 + }, + { + "epoch": 0.01691802299537106, + "grad_norm": 0.5390625, + "grad_norm_var": 0.0045882542928059895, + "learning_rate": 2e-05, + "loss": 1.2556, + "loss/crossentropy": 2.609267234802246, + "loss/dist_ce": 0.0, + "loss/fcd": 1.078125, + "loss/idx": 13.0, + "loss/logits": 0.17749843001365662, + "step": 1133 + }, + { + "epoch": 0.016932955054502017, + "grad_norm": 0.92578125, + "grad_norm_var": 0.010564168294270834, + "learning_rate": 2e-05, + "loss": 1.5228, + "loss/crossentropy": 2.679161787033081, + "loss/dist_ce": 0.0, + "loss/fcd": 1.25, + "loss/idx": 13.0, + "loss/logits": 0.2727677822113037, + "step": 1134 + }, + { + "epoch": 0.01694788711363297, + "grad_norm": 0.55078125, + "grad_norm_var": 0.010599199930826824, + "learning_rate": 2e-05, + "loss": 1.2627, + "loss/crossentropy": 2.477123737335205, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0859375, + "loss/idx": 13.0, + "loss/logits": 0.1767975389957428, + "step": 1135 + }, + { + "epoch": 0.016962819172763925, + "grad_norm": 0.52734375, + "grad_norm_var": 0.011112467447916666, + "learning_rate": 2e-05, + "loss": 1.1844, + "loss/crossentropy": 2.6230485439300537, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0234375, + "loss/idx": 13.0, + "loss/logits": 0.16095471382141113, + "step": 1136 + }, + { + "epoch": 0.016977751231894878, + "grad_norm": 0.640625, + "grad_norm_var": 0.009378814697265625, + "learning_rate": 2e-05, + "loss": 1.3407, + "loss/crossentropy": 2.6676883697509766, + "loss/dist_ce": 0.0, + "loss/fcd": 1.140625, + "loss/idx": 13.0, + "loss/logits": 0.20009231567382812, + "step": 1137 + }, + { + "epoch": 0.016992683291025833, + "grad_norm": 0.578125, + "grad_norm_var": 0.009423828125, + "learning_rate": 2e-05, + "loss": 1.3611, + "loss/crossentropy": 2.3190951347351074, + "loss/dist_ce": 0.0, + "loss/fcd": 1.171875, + "loss/idx": 13.0, + "loss/logits": 0.18917620182037354, + "step": 1138 + }, + { + "epoch": 0.017007615350156786, + "grad_norm": 0.58984375, + "grad_norm_var": 0.00941162109375, + "learning_rate": 2e-05, + "loss": 1.4375, + "loss/crossentropy": 2.5025153160095215, + "loss/dist_ce": 0.0, + "loss/fcd": 1.2109375, + "loss/idx": 13.0, + "loss/logits": 0.22656379640102386, + "step": 1139 + }, + { + "epoch": 0.01702254740928774, + "grad_norm": 0.57421875, + "grad_norm_var": 0.00934136708577474, + "learning_rate": 2e-05, + "loss": 1.3811, + "loss/crossentropy": 2.7573082447052, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1640625, + "loss/idx": 13.0, + "loss/logits": 0.2170415222644806, + "step": 1140 + }, + { + "epoch": 0.017037479468418694, + "grad_norm": 0.57421875, + "grad_norm_var": 0.00892480214436849, + "learning_rate": 2e-05, + "loss": 1.2483, + "loss/crossentropy": 2.612473249435425, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0703125, + "loss/idx": 13.0, + "loss/logits": 0.17800605297088623, + "step": 1141 + }, + { + "epoch": 0.01705241152754965, + "grad_norm": 0.72265625, + "grad_norm_var": 0.00996850331624349, + "learning_rate": 2e-05, + "loss": 1.4418, + "loss/crossentropy": 2.723935604095459, + "loss/dist_ce": 0.0, + "loss/fcd": 1.21875, + "loss/idx": 13.0, + "loss/logits": 0.22303350269794464, + "step": 1142 + }, + { + "epoch": 0.017067343586680602, + "grad_norm": 0.52734375, + "grad_norm_var": 0.010131581624348959, + "learning_rate": 2e-05, + "loss": 1.2575, + "loss/crossentropy": 2.4441022872924805, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0859375, + "loss/idx": 13.0, + "loss/logits": 0.17158004641532898, + "step": 1143 + }, + { + "epoch": 0.017082275645811558, + "grad_norm": 0.55078125, + "grad_norm_var": 0.010247230529785156, + "learning_rate": 2e-05, + "loss": 1.2389, + "loss/crossentropy": 2.5366430282592773, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0625, + "loss/idx": 13.0, + "loss/logits": 0.1764407753944397, + "step": 1144 + }, + { + "epoch": 0.01709720770494251, + "grad_norm": 0.52734375, + "grad_norm_var": 0.010514259338378906, + "learning_rate": 2e-05, + "loss": 1.2357, + "loss/crossentropy": 2.7200098037719727, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0625, + "loss/idx": 13.0, + "loss/logits": 0.1732417792081833, + "step": 1145 + }, + { + "epoch": 0.017112139764073466, + "grad_norm": 0.6328125, + "grad_norm_var": 0.010430908203125, + "learning_rate": 2e-05, + "loss": 1.4005, + "loss/crossentropy": 2.693432331085205, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1875, + "loss/idx": 13.0, + "loss/logits": 0.21298031508922577, + "step": 1146 + }, + { + "epoch": 0.01712707182320442, + "grad_norm": 0.6796875, + "grad_norm_var": 0.010410563151041666, + "learning_rate": 2e-05, + "loss": 1.4696, + "loss/crossentropy": 2.30118989944458, + "loss/dist_ce": 0.0, + "loss/fcd": 1.265625, + "loss/idx": 13.0, + "loss/logits": 0.20401182770729065, + "step": 1147 + }, + { + "epoch": 0.017142003882335374, + "grad_norm": 0.52734375, + "grad_norm_var": 0.010770098368326823, + "learning_rate": 2e-05, + "loss": 1.1958, + "loss/crossentropy": 2.67213773727417, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0390625, + "loss/idx": 13.0, + "loss/logits": 0.15674254298210144, + "step": 1148 + }, + { + "epoch": 0.017156935941466327, + "grad_norm": 0.52734375, + "grad_norm_var": 0.010880533854166667, + "learning_rate": 2e-05, + "loss": 1.2911, + "loss/crossentropy": 2.788635730743408, + "loss/dist_ce": 0.0, + "loss/fcd": 1.09375, + "loss/idx": 13.0, + "loss/logits": 0.19733943045139313, + "step": 1149 + }, + { + "epoch": 0.017171868000597282, + "grad_norm": 0.671875, + "grad_norm_var": 0.003999773661295573, + "learning_rate": 2e-05, + "loss": 1.2827, + "loss/crossentropy": 2.373025894165039, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1015625, + "loss/idx": 13.0, + "loss/logits": 0.18114924430847168, + "step": 1150 + }, + { + "epoch": 0.017186800059728235, + "grad_norm": 0.5546875, + "grad_norm_var": 0.003981526692708333, + "learning_rate": 2e-05, + "loss": 1.3812, + "loss/crossentropy": 2.4267663955688477, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1796875, + "loss/idx": 13.0, + "loss/logits": 0.2015307992696762, + "step": 1151 + }, + { + "epoch": 0.01720173211885919, + "grad_norm": 0.5703125, + "grad_norm_var": 0.003750038146972656, + "learning_rate": 2e-05, + "loss": 1.263, + "loss/crossentropy": 2.414804458618164, + "loss/dist_ce": 0.0, + "loss/fcd": 1.078125, + "loss/idx": 13.0, + "loss/logits": 0.1849151998758316, + "step": 1152 + }, + { + "epoch": 0.017216664177990147, + "grad_norm": 0.58984375, + "grad_norm_var": 0.0035723368326822918, + "learning_rate": 2e-05, + "loss": 1.3073, + "loss/crossentropy": 2.3469202518463135, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1171875, + "loss/idx": 13.0, + "loss/logits": 0.19007420539855957, + "step": 1153 + }, + { + "epoch": 0.0172315962371211, + "grad_norm": 0.7109375, + "grad_norm_var": 0.004510498046875, + "learning_rate": 2e-05, + "loss": 1.36, + "loss/crossentropy": 2.6144962310791016, + "loss/dist_ce": 0.0, + "loss/fcd": 1.171875, + "loss/idx": 13.0, + "loss/logits": 0.18814045190811157, + "step": 1154 + }, + { + "epoch": 0.017246528296252055, + "grad_norm": 0.53515625, + "grad_norm_var": 0.004740142822265625, + "learning_rate": 2e-05, + "loss": 1.1889, + "loss/crossentropy": 2.7515432834625244, + "loss/dist_ce": 0.0, + "loss/fcd": 1.03125, + "loss/idx": 13.0, + "loss/logits": 0.15762561559677124, + "step": 1155 + }, + { + "epoch": 0.017261460355383007, + "grad_norm": 0.48828125, + "grad_norm_var": 0.005408732096354166, + "learning_rate": 2e-05, + "loss": 1.142, + "loss/crossentropy": 2.41715407371521, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0, + "loss/idx": 13.0, + "loss/logits": 0.14197902381420135, + "step": 1156 + }, + { + "epoch": 0.017276392414513963, + "grad_norm": 0.5390625, + "grad_norm_var": 0.005545488993326823, + "learning_rate": 2e-05, + "loss": 1.2009, + "loss/crossentropy": 2.6549770832061768, + "loss/dist_ce": 0.0, + "loss/fcd": 1.03125, + "loss/idx": 13.0, + "loss/logits": 0.16962096095085144, + "step": 1157 + }, + { + "epoch": 0.017291324473644915, + "grad_norm": 0.59765625, + "grad_norm_var": 0.004223060607910156, + "learning_rate": 2e-05, + "loss": 1.3874, + "loss/crossentropy": 2.5990960597991943, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1875, + "loss/idx": 13.0, + "loss/logits": 0.1999187469482422, + "step": 1158 + }, + { + "epoch": 0.01730625653277587, + "grad_norm": 0.875, + "grad_norm_var": 0.009479777018229166, + "learning_rate": 2e-05, + "loss": 1.4493, + "loss/crossentropy": 2.7729477882385254, + "loss/dist_ce": 0.0, + "loss/fcd": 1.2265625, + "loss/idx": 13.0, + "loss/logits": 0.22276878356933594, + "step": 1159 + }, + { + "epoch": 0.017321188591906823, + "grad_norm": 0.58984375, + "grad_norm_var": 0.009325917561848958, + "learning_rate": 2e-05, + "loss": 1.3001, + "loss/crossentropy": 3.0190675258636475, + "loss/dist_ce": 0.0, + "loss/fcd": 1.125, + "loss/idx": 13.0, + "loss/logits": 0.17512644827365875, + "step": 1160 + }, + { + "epoch": 0.01733612065103778, + "grad_norm": 0.546875, + "grad_norm_var": 0.009157752990722657, + "learning_rate": 2e-05, + "loss": 1.2894, + "loss/crossentropy": 2.6917951107025146, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1015625, + "loss/idx": 13.0, + "loss/logits": 0.18787968158721924, + "step": 1161 + }, + { + "epoch": 0.01735105271016873, + "grad_norm": 0.55859375, + "grad_norm_var": 0.009200032552083333, + "learning_rate": 2e-05, + "loss": 1.396, + "loss/crossentropy": 2.4003427028656006, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1796875, + "loss/idx": 13.0, + "loss/logits": 0.21633100509643555, + "step": 1162 + }, + { + "epoch": 0.017365984769299687, + "grad_norm": 0.494140625, + "grad_norm_var": 0.009322341283162434, + "learning_rate": 2e-05, + "loss": 1.2178, + "loss/crossentropy": 2.5328409671783447, + "loss/dist_ce": 0.0, + "loss/fcd": 1.046875, + "loss/idx": 13.0, + "loss/logits": 0.17093852162361145, + "step": 1163 + }, + { + "epoch": 0.01738091682843064, + "grad_norm": 0.5, + "grad_norm_var": 0.009583139419555664, + "learning_rate": 2e-05, + "loss": 1.2184, + "loss/crossentropy": 2.6213250160217285, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0546875, + "loss/idx": 13.0, + "loss/logits": 0.16373440623283386, + "step": 1164 + }, + { + "epoch": 0.017395848887561596, + "grad_norm": 0.515625, + "grad_norm_var": 0.009680795669555663, + "learning_rate": 2e-05, + "loss": 1.2133, + "loss/crossentropy": 2.523061513900757, + "loss/dist_ce": 0.0, + "loss/fcd": 1.046875, + "loss/idx": 13.0, + "loss/logits": 0.1664462834596634, + "step": 1165 + }, + { + "epoch": 0.017410780946692548, + "grad_norm": 0.57421875, + "grad_norm_var": 0.009127664566040038, + "learning_rate": 2e-05, + "loss": 1.2568, + "loss/crossentropy": 2.529860496520996, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0703125, + "loss/idx": 13.0, + "loss/logits": 0.18652039766311646, + "step": 1166 + }, + { + "epoch": 0.017425713005823504, + "grad_norm": 0.66796875, + "grad_norm_var": 0.009584919611612955, + "learning_rate": 2e-05, + "loss": 1.3846, + "loss/crossentropy": 2.6294994354248047, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1640625, + "loss/idx": 13.0, + "loss/logits": 0.2205285131931305, + "step": 1167 + }, + { + "epoch": 0.017440645064954456, + "grad_norm": 0.79296875, + "grad_norm_var": 0.012259403864542643, + "learning_rate": 2e-05, + "loss": 1.5955, + "loss/crossentropy": 2.7622482776641846, + "loss/dist_ce": 0.0, + "loss/fcd": 1.3203125, + "loss/idx": 13.0, + "loss/logits": 0.2751520276069641, + "step": 1168 + }, + { + "epoch": 0.017455577124085412, + "grad_norm": 0.494140625, + "grad_norm_var": 0.01294244130452474, + "learning_rate": 2e-05, + "loss": 1.3178, + "loss/crossentropy": 2.5934271812438965, + "loss/dist_ce": 0.0, + "loss/fcd": 1.125, + "loss/idx": 13.0, + "loss/logits": 0.19280162453651428, + "step": 1169 + }, + { + "epoch": 0.017470509183216364, + "grad_norm": 0.498046875, + "grad_norm_var": 0.012414026260375976, + "learning_rate": 2e-05, + "loss": 1.0967, + "loss/crossentropy": 2.6452291011810303, + "loss/dist_ce": 0.0, + "loss/fcd": 0.953125, + "loss/idx": 13.0, + "loss/logits": 0.14357757568359375, + "step": 1170 + }, + { + "epoch": 0.01748544124234732, + "grad_norm": 0.52734375, + "grad_norm_var": 0.012463744481404622, + "learning_rate": 2e-05, + "loss": 1.1795, + "loss/crossentropy": 2.3400866985321045, + "loss/dist_ce": 0.0, + "loss/fcd": 1.015625, + "loss/idx": 13.0, + "loss/logits": 0.16387495398521423, + "step": 1171 + }, + { + "epoch": 0.017500373301478273, + "grad_norm": 0.5078125, + "grad_norm_var": 0.012252028783162434, + "learning_rate": 2e-05, + "loss": 1.2582, + "loss/crossentropy": 2.6386451721191406, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0859375, + "loss/idx": 13.0, + "loss/logits": 0.17226719856262207, + "step": 1172 + }, + { + "epoch": 0.01751530536060923, + "grad_norm": 0.58984375, + "grad_norm_var": 0.012136316299438477, + "learning_rate": 2e-05, + "loss": 1.3139, + "loss/crossentropy": 2.514528274536133, + "loss/dist_ce": 0.0, + "loss/fcd": 1.125, + "loss/idx": 13.0, + "loss/logits": 0.18887433409690857, + "step": 1173 + }, + { + "epoch": 0.01753023741974018, + "grad_norm": 0.52734375, + "grad_norm_var": 0.012309122085571288, + "learning_rate": 2e-05, + "loss": 1.225, + "loss/crossentropy": 2.5932071208953857, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0546875, + "loss/idx": 13.0, + "loss/logits": 0.1703052967786789, + "step": 1174 + }, + { + "epoch": 0.017545169478871137, + "grad_norm": 0.52734375, + "grad_norm_var": 0.006130075454711914, + "learning_rate": 2e-05, + "loss": 1.1948, + "loss/crossentropy": 2.6108434200286865, + "loss/dist_ce": 0.0, + "loss/fcd": 1.03125, + "loss/idx": 13.0, + "loss/logits": 0.163537859916687, + "step": 1175 + }, + { + "epoch": 0.01756010153800209, + "grad_norm": 0.6171875, + "grad_norm_var": 0.006296523412068685, + "learning_rate": 2e-05, + "loss": 1.4186, + "loss/crossentropy": 2.4866180419921875, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1875, + "loss/idx": 13.0, + "loss/logits": 0.23113754391670227, + "step": 1176 + }, + { + "epoch": 0.017575033597133045, + "grad_norm": 0.87109375, + "grad_norm_var": 0.012354516983032226, + "learning_rate": 2e-05, + "loss": 1.2533, + "loss/crossentropy": 2.478165626525879, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0859375, + "loss/idx": 13.0, + "loss/logits": 0.16738183796405792, + "step": 1177 + }, + { + "epoch": 0.017589965656263997, + "grad_norm": 0.50390625, + "grad_norm_var": 0.012690083185831705, + "learning_rate": 2e-05, + "loss": 1.2588, + "loss/crossentropy": 2.6207003593444824, + "loss/dist_ce": 0.0, + "loss/fcd": 1.078125, + "loss/idx": 13.0, + "loss/logits": 0.1807083934545517, + "step": 1178 + }, + { + "epoch": 0.017604897715394953, + "grad_norm": 0.53515625, + "grad_norm_var": 0.012349955240885417, + "learning_rate": 2e-05, + "loss": 1.2262, + "loss/crossentropy": 2.719006299972534, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0546875, + "loss/idx": 13.0, + "loss/logits": 0.1715444028377533, + "step": 1179 + }, + { + "epoch": 0.01761982977452591, + "grad_norm": 0.625, + "grad_norm_var": 0.012024434407552083, + "learning_rate": 2e-05, + "loss": 1.3838, + "loss/crossentropy": 2.394318103790283, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1796875, + "loss/idx": 13.0, + "loss/logits": 0.20415380597114563, + "step": 1180 + }, + { + "epoch": 0.01763476183365686, + "grad_norm": 0.5390625, + "grad_norm_var": 0.011839040120442708, + "learning_rate": 2e-05, + "loss": 1.2821, + "loss/crossentropy": 2.665447235107422, + "loss/dist_ce": 0.0, + "loss/fcd": 1.09375, + "loss/idx": 13.0, + "loss/logits": 0.18832457065582275, + "step": 1181 + }, + { + "epoch": 0.017649693892787817, + "grad_norm": 0.60546875, + "grad_norm_var": 0.011845143636067708, + "learning_rate": 2e-05, + "loss": 1.3763, + "loss/crossentropy": 2.765406608581543, + "loss/dist_ce": 0.0, + "loss/fcd": 1.171875, + "loss/idx": 13.0, + "loss/logits": 0.2044430673122406, + "step": 1182 + }, + { + "epoch": 0.01766462595191877, + "grad_norm": 0.62890625, + "grad_norm_var": 0.01153106689453125, + "learning_rate": 2e-05, + "loss": 1.288, + "loss/crossentropy": 2.4687106609344482, + "loss/dist_ce": 0.0, + "loss/fcd": 1.078125, + "loss/idx": 13.0, + "loss/logits": 0.20986786484718323, + "step": 1183 + }, + { + "epoch": 0.017679558011049725, + "grad_norm": 0.478515625, + "grad_norm_var": 0.009071842829386393, + "learning_rate": 2e-05, + "loss": 1.2231, + "loss/crossentropy": 2.5798656940460205, + "loss/dist_ce": 0.0, + "loss/fcd": 1.046875, + "loss/idx": 13.0, + "loss/logits": 0.17622314393520355, + "step": 1184 + }, + { + "epoch": 0.017694490070180677, + "grad_norm": 0.6328125, + "grad_norm_var": 0.00892175038655599, + "learning_rate": 2e-05, + "loss": 1.1994, + "loss/crossentropy": 2.4811015129089355, + "loss/dist_ce": 0.0, + "loss/fcd": 1.046875, + "loss/idx": 13.0, + "loss/logits": 0.1525367796421051, + "step": 1185 + }, + { + "epoch": 0.017709422129311633, + "grad_norm": 0.60546875, + "grad_norm_var": 0.008527485529581706, + "learning_rate": 2e-05, + "loss": 1.3831, + "loss/crossentropy": 2.6285629272460938, + "loss/dist_ce": 0.0, + "loss/fcd": 1.171875, + "loss/idx": 13.0, + "loss/logits": 0.21119612455368042, + "step": 1186 + }, + { + "epoch": 0.017724354188442586, + "grad_norm": 0.55078125, + "grad_norm_var": 0.008389012018839518, + "learning_rate": 2e-05, + "loss": 1.251, + "loss/crossentropy": 2.726686716079712, + "loss/dist_ce": 0.0, + "loss/fcd": 1.078125, + "loss/idx": 13.0, + "loss/logits": 0.1728990077972412, + "step": 1187 + }, + { + "epoch": 0.01773928624757354, + "grad_norm": 0.5078125, + "grad_norm_var": 0.008389012018839518, + "learning_rate": 2e-05, + "loss": 1.2366, + "loss/crossentropy": 2.5716280937194824, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0703125, + "loss/idx": 13.0, + "loss/logits": 0.16626042127609253, + "step": 1188 + }, + { + "epoch": 0.017754218306704494, + "grad_norm": 0.5078125, + "grad_norm_var": 0.008746830622355144, + "learning_rate": 2e-05, + "loss": 1.261, + "loss/crossentropy": 2.5722556114196777, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0859375, + "loss/idx": 13.0, + "loss/logits": 0.17502065002918243, + "step": 1189 + }, + { + "epoch": 0.01776915036583545, + "grad_norm": 0.51171875, + "grad_norm_var": 0.008869663874308268, + "learning_rate": 2e-05, + "loss": 1.1607, + "loss/crossentropy": 2.6687557697296143, + "loss/dist_ce": 0.0, + "loss/fcd": 1.015625, + "loss/idx": 13.0, + "loss/logits": 0.145055890083313, + "step": 1190 + }, + { + "epoch": 0.017784082424966402, + "grad_norm": 0.494140625, + "grad_norm_var": 0.009162839253743489, + "learning_rate": 2e-05, + "loss": 1.1919, + "loss/crossentropy": 2.54484486579895, + "loss/dist_ce": 0.0, + "loss/fcd": 1.03125, + "loss/idx": 13.0, + "loss/logits": 0.16066983342170715, + "step": 1191 + }, + { + "epoch": 0.017799014484097358, + "grad_norm": 0.5546875, + "grad_norm_var": 0.009063148498535156, + "learning_rate": 2e-05, + "loss": 1.2078, + "loss/crossentropy": 2.6763405799865723, + "loss/dist_ce": 0.0, + "loss/fcd": 1.046875, + "loss/idx": 13.0, + "loss/logits": 0.16090230643749237, + "step": 1192 + }, + { + "epoch": 0.01781394654322831, + "grad_norm": 0.55859375, + "grad_norm_var": 0.0027053197224934894, + "learning_rate": 2e-05, + "loss": 1.3166, + "loss/crossentropy": 2.541527509689331, + "loss/dist_ce": 0.0, + "loss/fcd": 1.125, + "loss/idx": 13.0, + "loss/logits": 0.1915903240442276, + "step": 1193 + }, + { + "epoch": 0.017828878602359266, + "grad_norm": 0.61328125, + "grad_norm_var": 0.0027444839477539064, + "learning_rate": 2e-05, + "loss": 1.2664, + "loss/crossentropy": 2.8826029300689697, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0859375, + "loss/idx": 13.0, + "loss/logits": 0.18048372864723206, + "step": 1194 + }, + { + "epoch": 0.01784381066149022, + "grad_norm": 0.546875, + "grad_norm_var": 0.002715301513671875, + "learning_rate": 2e-05, + "loss": 1.2014, + "loss/crossentropy": 2.8553855419158936, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0390625, + "loss/idx": 13.0, + "loss/logits": 0.16232401132583618, + "step": 1195 + }, + { + "epoch": 0.017858742720621174, + "grad_norm": 0.58203125, + "grad_norm_var": 0.002458635965983073, + "learning_rate": 2e-05, + "loss": 1.3419, + "loss/crossentropy": 2.377991199493408, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1640625, + "loss/idx": 13.0, + "loss/logits": 0.17786559462547302, + "step": 1196 + }, + { + "epoch": 0.017873674779752127, + "grad_norm": 0.53125, + "grad_norm_var": 0.002481524149576823, + "learning_rate": 2e-05, + "loss": 1.257, + "loss/crossentropy": 2.6475048065185547, + "loss/dist_ce": 0.0, + "loss/fcd": 1.078125, + "loss/idx": 13.0, + "loss/logits": 0.17889858782291412, + "step": 1197 + }, + { + "epoch": 0.017888606838883082, + "grad_norm": 0.60546875, + "grad_norm_var": 0.002481524149576823, + "learning_rate": 2e-05, + "loss": 1.3621, + "loss/crossentropy": 2.4759228229522705, + "loss/dist_ce": 0.0, + "loss/fcd": 1.15625, + "loss/idx": 13.0, + "loss/logits": 0.2058965563774109, + "step": 1198 + }, + { + "epoch": 0.017903538898014035, + "grad_norm": 0.5, + "grad_norm_var": 0.0022822062174479166, + "learning_rate": 2e-05, + "loss": 1.2517, + "loss/crossentropy": 2.5964038372039795, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0703125, + "loss/idx": 13.0, + "loss/logits": 0.18137618899345398, + "step": 1199 + }, + { + "epoch": 0.01791847095714499, + "grad_norm": 0.5859375, + "grad_norm_var": 0.001996342341105143, + "learning_rate": 2e-05, + "loss": 1.3369, + "loss/crossentropy": 2.4271297454833984, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1484375, + "loss/idx": 13.0, + "loss/logits": 0.18846943974494934, + "step": 1200 + }, + { + "epoch": 0.017933403016275943, + "grad_norm": 0.55859375, + "grad_norm_var": 0.0015759627024332682, + "learning_rate": 2e-05, + "loss": 1.3796, + "loss/crossentropy": 2.558906316757202, + "loss/dist_ce": 0.0, + "loss/fcd": 1.140625, + "loss/idx": 13.0, + "loss/logits": 0.23895680904388428, + "step": 1201 + }, + { + "epoch": 0.0179483350754069, + "grad_norm": 0.55859375, + "grad_norm_var": 0.0013722578684488933, + "learning_rate": 2e-05, + "loss": 1.2437, + "loss/crossentropy": 2.63244891166687, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0859375, + "loss/idx": 13.0, + "loss/logits": 0.15778325498104095, + "step": 1202 + }, + { + "epoch": 0.01796326713453785, + "grad_norm": 0.52734375, + "grad_norm_var": 0.0013978163401285808, + "learning_rate": 2e-05, + "loss": 1.2569, + "loss/crossentropy": 2.5377118587493896, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0859375, + "loss/idx": 13.0, + "loss/logits": 0.17098784446716309, + "step": 1203 + }, + { + "epoch": 0.017978199193668807, + "grad_norm": 0.5078125, + "grad_norm_var": 0.0013978163401285808, + "learning_rate": 2e-05, + "loss": 1.2133, + "loss/crossentropy": 2.4338786602020264, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0625, + "loss/idx": 13.0, + "loss/logits": 0.15082281827926636, + "step": 1204 + }, + { + "epoch": 0.017993131252799763, + "grad_norm": 0.6171875, + "grad_norm_var": 0.0015811761220296224, + "learning_rate": 2e-05, + "loss": 1.3381, + "loss/crossentropy": 2.4920754432678223, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1328125, + "loss/idx": 13.0, + "loss/logits": 0.20530369877815247, + "step": 1205 + }, + { + "epoch": 0.018008063311930715, + "grad_norm": 0.5078125, + "grad_norm_var": 0.0016038099924723308, + "learning_rate": 2e-05, + "loss": 1.226, + "loss/crossentropy": 2.680633783340454, + "loss/dist_ce": 0.0, + "loss/fcd": 1.046875, + "loss/idx": 13.0, + "loss/logits": 0.17915096879005432, + "step": 1206 + }, + { + "epoch": 0.01802299537106167, + "grad_norm": 0.65234375, + "grad_norm_var": 0.0019243876139322916, + "learning_rate": 2e-05, + "loss": 1.3583, + "loss/crossentropy": 2.5132720470428467, + "loss/dist_ce": 0.0, + "loss/fcd": 1.171875, + "loss/idx": 13.0, + "loss/logits": 0.18643470108509064, + "step": 1207 + }, + { + "epoch": 0.018037927430192623, + "grad_norm": 0.51171875, + "grad_norm_var": 0.0020873387654622394, + "learning_rate": 2e-05, + "loss": 1.2533, + "loss/crossentropy": 2.6504547595977783, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0703125, + "loss/idx": 13.0, + "loss/logits": 0.18298813700675964, + "step": 1208 + }, + { + "epoch": 0.01805285948932358, + "grad_norm": 0.61328125, + "grad_norm_var": 0.0022617975870768228, + "learning_rate": 2e-05, + "loss": 1.3371, + "loss/crossentropy": 2.7215096950531006, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1328125, + "loss/idx": 13.0, + "loss/logits": 0.20427459478378296, + "step": 1209 + }, + { + "epoch": 0.01806779154845453, + "grad_norm": 0.5390625, + "grad_norm_var": 0.002115631103515625, + "learning_rate": 2e-05, + "loss": 1.2598, + "loss/crossentropy": 2.750105857849121, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0859375, + "loss/idx": 13.0, + "loss/logits": 0.17385736107826233, + "step": 1210 + }, + { + "epoch": 0.018082723607585487, + "grad_norm": 0.57421875, + "grad_norm_var": 0.0021178563435872394, + "learning_rate": 2e-05, + "loss": 1.268, + "loss/crossentropy": 2.3910109996795654, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1015625, + "loss/idx": 13.0, + "loss/logits": 0.16642415523529053, + "step": 1211 + }, + { + "epoch": 0.01809765566671644, + "grad_norm": 0.578125, + "grad_norm_var": 0.0021077473958333332, + "learning_rate": 2e-05, + "loss": 1.2759, + "loss/crossentropy": 2.6208412647247314, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1015625, + "loss/idx": 13.0, + "loss/logits": 0.17433887720108032, + "step": 1212 + }, + { + "epoch": 0.018112587725847396, + "grad_norm": 0.5234375, + "grad_norm_var": 0.0021420796712239582, + "learning_rate": 2e-05, + "loss": 1.2242, + "loss/crossentropy": 2.5190320014953613, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0546875, + "loss/idx": 13.0, + "loss/logits": 0.16950549185276031, + "step": 1213 + }, + { + "epoch": 0.018127519784978348, + "grad_norm": 0.54296875, + "grad_norm_var": 0.0020078023274739585, + "learning_rate": 2e-05, + "loss": 1.2889, + "loss/crossentropy": 2.4932422637939453, + "loss/dist_ce": 0.0, + "loss/fcd": 1.109375, + "loss/idx": 13.0, + "loss/logits": 0.17951592803001404, + "step": 1214 + }, + { + "epoch": 0.018142451844109304, + "grad_norm": 0.51171875, + "grad_norm_var": 0.0019286473592122395, + "learning_rate": 2e-05, + "loss": 1.1917, + "loss/crossentropy": 2.5093982219696045, + "loss/dist_ce": 0.0, + "loss/fcd": 1.03125, + "loss/idx": 13.0, + "loss/logits": 0.16048479080200195, + "step": 1215 + }, + { + "epoch": 0.018157383903240256, + "grad_norm": 0.55859375, + "grad_norm_var": 0.0018694559733072917, + "learning_rate": 2e-05, + "loss": 1.3799, + "loss/crossentropy": 2.4472479820251465, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1640625, + "loss/idx": 13.0, + "loss/logits": 0.21582560241222382, + "step": 1216 + }, + { + "epoch": 0.018172315962371212, + "grad_norm": 0.5, + "grad_norm_var": 0.002057329813639323, + "learning_rate": 2e-05, + "loss": 1.234, + "loss/crossentropy": 2.574943780899048, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0546875, + "loss/idx": 13.0, + "loss/logits": 0.17926263809204102, + "step": 1217 + }, + { + "epoch": 0.018187248021502164, + "grad_norm": 0.58984375, + "grad_norm_var": 0.002147865295410156, + "learning_rate": 2e-05, + "loss": 1.2759, + "loss/crossentropy": 2.5302133560180664, + "loss/dist_ce": 0.0, + "loss/fcd": 1.109375, + "loss/idx": 13.0, + "loss/logits": 0.16648699343204498, + "step": 1218 + }, + { + "epoch": 0.01820218008063312, + "grad_norm": 0.5703125, + "grad_norm_var": 0.002113596598307292, + "learning_rate": 2e-05, + "loss": 1.2878, + "loss/crossentropy": 2.620727777481079, + "loss/dist_ce": 0.0, + "loss/fcd": 1.109375, + "loss/idx": 13.0, + "loss/logits": 0.1784372329711914, + "step": 1219 + }, + { + "epoch": 0.018217112139764072, + "grad_norm": 0.50390625, + "grad_norm_var": 0.0021397272745768228, + "learning_rate": 2e-05, + "loss": 1.2389, + "loss/crossentropy": 2.8881278038024902, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0625, + "loss/idx": 13.0, + "loss/logits": 0.1763637661933899, + "step": 1220 + }, + { + "epoch": 0.018232044198895028, + "grad_norm": 0.5234375, + "grad_norm_var": 0.001923052469889323, + "learning_rate": 2e-05, + "loss": 1.1603, + "loss/crossentropy": 2.626577854156494, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0078125, + "loss/idx": 13.0, + "loss/logits": 0.15248607099056244, + "step": 1221 + }, + { + "epoch": 0.01824697625802598, + "grad_norm": 0.51953125, + "grad_norm_var": 0.0018656412760416667, + "learning_rate": 2e-05, + "loss": 1.3248, + "loss/crossentropy": 2.4721579551696777, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1328125, + "loss/idx": 13.0, + "loss/logits": 0.19197949767112732, + "step": 1222 + }, + { + "epoch": 0.018261908317156936, + "grad_norm": 0.59765625, + "grad_norm_var": 0.0013120015462239583, + "learning_rate": 2e-05, + "loss": 1.2957, + "loss/crossentropy": 2.1448283195495605, + "loss/dist_ce": 0.0, + "loss/fcd": 1.109375, + "loss/idx": 13.0, + "loss/logits": 0.1863187849521637, + "step": 1223 + }, + { + "epoch": 0.01827684037628789, + "grad_norm": 0.74609375, + "grad_norm_var": 0.003631337483723958, + "learning_rate": 2e-05, + "loss": 1.3701, + "loss/crossentropy": 2.505596399307251, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1875, + "loss/idx": 13.0, + "loss/logits": 0.1826344132423401, + "step": 1224 + }, + { + "epoch": 0.018291772435418845, + "grad_norm": 0.54296875, + "grad_norm_var": 0.003459676106770833, + "learning_rate": 2e-05, + "loss": 1.191, + "loss/crossentropy": 2.5166234970092773, + "loss/dist_ce": 0.0, + "loss/fcd": 1.03125, + "loss/idx": 13.0, + "loss/logits": 0.15974509716033936, + "step": 1225 + }, + { + "epoch": 0.018306704494549797, + "grad_norm": 0.5234375, + "grad_norm_var": 0.003513590494791667, + "learning_rate": 2e-05, + "loss": 1.2205, + "loss/crossentropy": 2.7810256481170654, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0546875, + "loss/idx": 13.0, + "loss/logits": 0.1658242642879486, + "step": 1226 + }, + { + "epoch": 0.018321636553680753, + "grad_norm": 0.5, + "grad_norm_var": 0.003683916727701823, + "learning_rate": 2e-05, + "loss": 1.2462, + "loss/crossentropy": 2.5059738159179688, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0703125, + "loss/idx": 13.0, + "loss/logits": 0.17585018277168274, + "step": 1227 + }, + { + "epoch": 0.018336568612811705, + "grad_norm": 0.59375, + "grad_norm_var": 0.0037535985310872396, + "learning_rate": 2e-05, + "loss": 1.4086, + "loss/crossentropy": 2.800818681716919, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1875, + "loss/idx": 13.0, + "loss/logits": 0.22110095620155334, + "step": 1228 + }, + { + "epoch": 0.01835150067194266, + "grad_norm": 0.50390625, + "grad_norm_var": 0.0038543701171875, + "learning_rate": 2e-05, + "loss": 1.1752, + "loss/crossentropy": 2.5914788246154785, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0234375, + "loss/idx": 13.0, + "loss/logits": 0.1517515331506729, + "step": 1229 + }, + { + "epoch": 0.018366432731073613, + "grad_norm": 0.49609375, + "grad_norm_var": 0.004046630859375, + "learning_rate": 2e-05, + "loss": 1.2081, + "loss/crossentropy": 2.6223032474517822, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0546875, + "loss/idx": 13.0, + "loss/logits": 0.153452530503273, + "step": 1230 + }, + { + "epoch": 0.01838136479020457, + "grad_norm": 0.7109375, + "grad_norm_var": 0.005541419982910157, + "learning_rate": 2e-05, + "loss": 1.4043, + "loss/crossentropy": 2.614365339279175, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1953125, + "loss/idx": 13.0, + "loss/logits": 0.2089834213256836, + "step": 1231 + }, + { + "epoch": 0.018396296849335525, + "grad_norm": 0.53125, + "grad_norm_var": 0.005597941080729167, + "learning_rate": 2e-05, + "loss": 1.2692, + "loss/crossentropy": 2.5265705585479736, + "loss/dist_ce": 0.0, + "loss/fcd": 1.09375, + "loss/idx": 13.0, + "loss/logits": 0.17544037103652954, + "step": 1232 + }, + { + "epoch": 0.018411228908466477, + "grad_norm": 0.63671875, + "grad_norm_var": 0.005680274963378906, + "learning_rate": 2e-05, + "loss": 1.4872, + "loss/crossentropy": 2.4311468601226807, + "loss/dist_ce": 0.0, + "loss/fcd": 1.2265625, + "loss/idx": 13.0, + "loss/logits": 0.2606537342071533, + "step": 1233 + }, + { + "epoch": 0.018426160967597433, + "grad_norm": 0.65234375, + "grad_norm_var": 0.006105486551920573, + "learning_rate": 2e-05, + "loss": 1.47, + "loss/crossentropy": 2.592331647872925, + "loss/dist_ce": 0.0, + "loss/fcd": 1.234375, + "loss/idx": 13.0, + "loss/logits": 0.23566170036792755, + "step": 1234 + }, + { + "epoch": 0.018441093026728386, + "grad_norm": 0.5390625, + "grad_norm_var": 0.00617364247639974, + "learning_rate": 2e-05, + "loss": 1.3208, + "loss/crossentropy": 2.7657077312469482, + "loss/dist_ce": 0.0, + "loss/fcd": 1.125, + "loss/idx": 13.0, + "loss/logits": 0.19580461084842682, + "step": 1235 + }, + { + "epoch": 0.01845602508585934, + "grad_norm": 0.52734375, + "grad_norm_var": 0.00600121815999349, + "learning_rate": 2e-05, + "loss": 1.2421, + "loss/crossentropy": 2.8439533710479736, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0546875, + "loss/idx": 13.0, + "loss/logits": 0.18744774162769318, + "step": 1236 + }, + { + "epoch": 0.018470957144990294, + "grad_norm": 0.64453125, + "grad_norm_var": 0.006141153971354166, + "learning_rate": 2e-05, + "loss": 1.2085, + "loss/crossentropy": 3.1795899868011475, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0625, + "loss/idx": 13.0, + "loss/logits": 0.14597997069358826, + "step": 1237 + }, + { + "epoch": 0.01848588920412125, + "grad_norm": 0.53515625, + "grad_norm_var": 0.006032307942708333, + "learning_rate": 2e-05, + "loss": 1.2747, + "loss/crossentropy": 2.7262730598449707, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0859375, + "loss/idx": 13.0, + "loss/logits": 0.1887180060148239, + "step": 1238 + }, + { + "epoch": 0.018500821263252202, + "grad_norm": 0.546875, + "grad_norm_var": 0.00607446034749349, + "learning_rate": 2e-05, + "loss": 1.3052, + "loss/crossentropy": 2.5821917057037354, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1171875, + "loss/idx": 13.0, + "loss/logits": 0.18802396953105927, + "step": 1239 + }, + { + "epoch": 0.018515753322383158, + "grad_norm": 0.57421875, + "grad_norm_var": 0.00404351552327474, + "learning_rate": 2e-05, + "loss": 1.3231, + "loss/crossentropy": 2.5506041049957275, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1484375, + "loss/idx": 13.0, + "loss/logits": 0.17467817664146423, + "step": 1240 + }, + { + "epoch": 0.01853068538151411, + "grad_norm": 0.5546875, + "grad_norm_var": 0.004015858968098958, + "learning_rate": 2e-05, + "loss": 1.2676, + "loss/crossentropy": 2.6439409255981445, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1015625, + "loss/idx": 13.0, + "loss/logits": 0.16604889929294586, + "step": 1241 + }, + { + "epoch": 0.018545617440645066, + "grad_norm": 0.58984375, + "grad_norm_var": 0.0039066950480143225, + "learning_rate": 2e-05, + "loss": 1.384, + "loss/crossentropy": 2.2677359580993652, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1953125, + "loss/idx": 13.0, + "loss/logits": 0.1886504888534546, + "step": 1242 + }, + { + "epoch": 0.01856054949977602, + "grad_norm": 0.5078125, + "grad_norm_var": 0.003836504618326823, + "learning_rate": 2e-05, + "loss": 1.2263, + "loss/crossentropy": 2.5122861862182617, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0625, + "loss/idx": 13.0, + "loss/logits": 0.16376549005508423, + "step": 1243 + }, + { + "epoch": 0.018575481558906974, + "grad_norm": 0.53125, + "grad_norm_var": 0.0038955052693684894, + "learning_rate": 2e-05, + "loss": 1.3596, + "loss/crossentropy": 2.6457715034484863, + "loss/dist_ce": 0.0, + "loss/fcd": 1.140625, + "loss/idx": 13.0, + "loss/logits": 0.21896395087242126, + "step": 1244 + }, + { + "epoch": 0.018590413618037926, + "grad_norm": 0.5234375, + "grad_norm_var": 0.003753407796223958, + "learning_rate": 2e-05, + "loss": 1.1308, + "loss/crossentropy": 2.356555700302124, + "loss/dist_ce": 0.0, + "loss/fcd": 0.98828125, + "loss/idx": 13.0, + "loss/logits": 0.14256852865219116, + "step": 1245 + }, + { + "epoch": 0.018605345677168882, + "grad_norm": 0.57421875, + "grad_norm_var": 0.0033770243326822916, + "learning_rate": 2e-05, + "loss": 1.2262, + "loss/crossentropy": 2.6997029781341553, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0546875, + "loss/idx": 13.0, + "loss/logits": 0.1715344786643982, + "step": 1246 + }, + { + "epoch": 0.018620277736299835, + "grad_norm": 0.55859375, + "grad_norm_var": 0.0020405451456705728, + "learning_rate": 2e-05, + "loss": 1.3046, + "loss/crossentropy": 2.6006901264190674, + "loss/dist_ce": 0.0, + "loss/fcd": 1.125, + "loss/idx": 13.0, + "loss/logits": 0.1796230673789978, + "step": 1247 + }, + { + "epoch": 0.01863520979543079, + "grad_norm": 0.5703125, + "grad_norm_var": 0.0019642512003580728, + "learning_rate": 2e-05, + "loss": 1.2811, + "loss/crossentropy": 2.6090171337127686, + "loss/dist_ce": 0.0, + "loss/fcd": 1.109375, + "loss/idx": 13.0, + "loss/logits": 0.1717006117105484, + "step": 1248 + }, + { + "epoch": 0.018650141854561743, + "grad_norm": 0.50390625, + "grad_norm_var": 0.0018259048461914062, + "learning_rate": 2e-05, + "loss": 1.2577, + "loss/crossentropy": 2.1411545276641846, + "loss/dist_ce": 0.0, + "loss/fcd": 1.09375, + "loss/idx": 13.0, + "loss/logits": 0.16393724083900452, + "step": 1249 + }, + { + "epoch": 0.0186650739136927, + "grad_norm": 0.5390625, + "grad_norm_var": 0.0012082417805989584, + "learning_rate": 2e-05, + "loss": 1.3501, + "loss/crossentropy": 2.4617106914520264, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1640625, + "loss/idx": 13.0, + "loss/logits": 0.18599101901054382, + "step": 1250 + }, + { + "epoch": 0.01868000597282365, + "grad_norm": 0.546875, + "grad_norm_var": 0.0011993408203125, + "learning_rate": 2e-05, + "loss": 1.2825, + "loss/crossentropy": 2.587517261505127, + "loss/dist_ce": 0.0, + "loss/fcd": 1.09375, + "loss/idx": 13.0, + "loss/logits": 0.18875601887702942, + "step": 1251 + }, + { + "epoch": 0.018694938031954607, + "grad_norm": 0.6171875, + "grad_norm_var": 0.0014113744099934896, + "learning_rate": 2e-05, + "loss": 1.2102, + "loss/crossentropy": 2.5046262741088867, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0546875, + "loss/idx": 13.0, + "loss/logits": 0.15552225708961487, + "step": 1252 + }, + { + "epoch": 0.01870987009108556, + "grad_norm": 0.6015625, + "grad_norm_var": 0.0010274251302083333, + "learning_rate": 2e-05, + "loss": 1.258, + "loss/crossentropy": 2.6143996715545654, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0859375, + "loss/idx": 13.0, + "loss/logits": 0.17202343046665192, + "step": 1253 + }, + { + "epoch": 0.018724802150216515, + "grad_norm": 0.5625, + "grad_norm_var": 0.001002947489420573, + "learning_rate": 2e-05, + "loss": 1.2749, + "loss/crossentropy": 2.6341636180877686, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1015625, + "loss/idx": 13.0, + "loss/logits": 0.1733606904745102, + "step": 1254 + }, + { + "epoch": 0.018739734209347467, + "grad_norm": 0.625, + "grad_norm_var": 0.001285235087076823, + "learning_rate": 2e-05, + "loss": 1.2895, + "loss/crossentropy": 2.7565486431121826, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1015625, + "loss/idx": 13.0, + "loss/logits": 0.1879180669784546, + "step": 1255 + }, + { + "epoch": 0.018754666268478423, + "grad_norm": 0.490234375, + "grad_norm_var": 0.0015811761220296224, + "learning_rate": 2e-05, + "loss": 1.1834, + "loss/crossentropy": 2.7358415126800537, + "loss/dist_ce": 0.0, + "loss/fcd": 1.015625, + "loss/idx": 13.0, + "loss/logits": 0.1677880734205246, + "step": 1256 + }, + { + "epoch": 0.01876959832760938, + "grad_norm": 0.55078125, + "grad_norm_var": 0.0015828291575113933, + "learning_rate": 2e-05, + "loss": 1.2657, + "loss/crossentropy": 2.661904811859131, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0859375, + "loss/idx": 13.0, + "loss/logits": 0.1798032820224762, + "step": 1257 + }, + { + "epoch": 0.01878453038674033, + "grad_norm": 0.56640625, + "grad_norm_var": 0.0015107313791910806, + "learning_rate": 2e-05, + "loss": 1.2148, + "loss/crossentropy": 2.6133949756622314, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0625, + "loss/idx": 13.0, + "loss/logits": 0.1523081362247467, + "step": 1258 + }, + { + "epoch": 0.018799462445871287, + "grad_norm": 0.59375, + "grad_norm_var": 0.0014393965403238932, + "learning_rate": 2e-05, + "loss": 1.2584, + "loss/crossentropy": 2.6154913902282715, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0703125, + "loss/idx": 13.0, + "loss/logits": 0.18810322880744934, + "step": 1259 + }, + { + "epoch": 0.01881439450500224, + "grad_norm": 0.578125, + "grad_norm_var": 0.0013989607493082683, + "learning_rate": 2e-05, + "loss": 1.3253, + "loss/crossentropy": 2.4987523555755615, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1328125, + "loss/idx": 13.0, + "loss/logits": 0.19253703951835632, + "step": 1260 + }, + { + "epoch": 0.018829326564133195, + "grad_norm": 0.5546875, + "grad_norm_var": 0.0012967268625895183, + "learning_rate": 2e-05, + "loss": 1.1954, + "loss/crossentropy": 2.6275758743286133, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0390625, + "loss/idx": 13.0, + "loss/logits": 0.1563832014799118, + "step": 1261 + }, + { + "epoch": 0.018844258623264148, + "grad_norm": 0.56640625, + "grad_norm_var": 0.0012904961903889975, + "learning_rate": 2e-05, + "loss": 1.1805, + "loss/crossentropy": 2.654968738555908, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0234375, + "loss/idx": 13.0, + "loss/logits": 0.1571100354194641, + "step": 1262 + }, + { + "epoch": 0.018859190682395104, + "grad_norm": 0.56640625, + "grad_norm_var": 0.001288588841756185, + "learning_rate": 2e-05, + "loss": 1.2607, + "loss/crossentropy": 2.5465691089630127, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0859375, + "loss/idx": 13.0, + "loss/logits": 0.17477178573608398, + "step": 1263 + }, + { + "epoch": 0.018874122741526056, + "grad_norm": 0.48046875, + "grad_norm_var": 0.0017243544260660806, + "learning_rate": 2e-05, + "loss": 1.2487, + "loss/crossentropy": 2.625317096710205, + "loss/dist_ce": 0.0, + "loss/fcd": 1.078125, + "loss/idx": 13.0, + "loss/logits": 0.17062163352966309, + "step": 1264 + }, + { + "epoch": 0.018889054800657012, + "grad_norm": 0.53515625, + "grad_norm_var": 0.0015559991200764973, + "learning_rate": 2e-05, + "loss": 1.2303, + "loss/crossentropy": 2.710569143295288, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0625, + "loss/idx": 13.0, + "loss/logits": 0.16783851385116577, + "step": 1265 + }, + { + "epoch": 0.018903986859787964, + "grad_norm": 0.5703125, + "grad_norm_var": 0.0015259901682535806, + "learning_rate": 2e-05, + "loss": 1.2482, + "loss/crossentropy": 2.6588263511657715, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0703125, + "loss/idx": 13.0, + "loss/logits": 0.1778830587863922, + "step": 1266 + }, + { + "epoch": 0.01891891891891892, + "grad_norm": 0.5234375, + "grad_norm_var": 0.0016102949778238932, + "learning_rate": 2e-05, + "loss": 1.2365, + "loss/crossentropy": 2.613532304763794, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0625, + "loss/idx": 13.0, + "loss/logits": 0.1740056276321411, + "step": 1267 + }, + { + "epoch": 0.018933850978049872, + "grad_norm": 0.515625, + "grad_norm_var": 0.0014995416005452475, + "learning_rate": 2e-05, + "loss": 1.2785, + "loss/crossentropy": 2.453362464904785, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1015625, + "loss/idx": 13.0, + "loss/logits": 0.17693351209163666, + "step": 1268 + }, + { + "epoch": 0.018948783037180828, + "grad_norm": 0.53125, + "grad_norm_var": 0.001372512181599935, + "learning_rate": 2e-05, + "loss": 1.3397, + "loss/crossentropy": 2.748429775238037, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1328125, + "loss/idx": 13.0, + "loss/logits": 0.20684918761253357, + "step": 1269 + }, + { + "epoch": 0.01896371509631178, + "grad_norm": 0.53125, + "grad_norm_var": 0.0013842105865478516, + "learning_rate": 2e-05, + "loss": 1.26, + "loss/crossentropy": 2.556675672531128, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0859375, + "loss/idx": 13.0, + "loss/logits": 0.17401845753192902, + "step": 1270 + }, + { + "epoch": 0.018978647155442736, + "grad_norm": 0.5, + "grad_norm_var": 0.0010892073313395181, + "learning_rate": 2e-05, + "loss": 1.2804, + "loss/crossentropy": 2.5527727603912354, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1015625, + "loss/idx": 13.0, + "loss/logits": 0.17879468202590942, + "step": 1271 + }, + { + "epoch": 0.01899357921457369, + "grad_norm": 0.88671875, + "grad_norm_var": 0.008236122131347657, + "learning_rate": 2e-05, + "loss": 1.4474, + "loss/crossentropy": 2.2951905727386475, + "loss/dist_ce": 0.0, + "loss/fcd": 1.234375, + "loss/idx": 13.0, + "loss/logits": 0.21297743916511536, + "step": 1272 + }, + { + "epoch": 0.019008511273704645, + "grad_norm": 0.52734375, + "grad_norm_var": 0.008316993713378906, + "learning_rate": 2e-05, + "loss": 1.2259, + "loss/crossentropy": 2.5471127033233643, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0546875, + "loss/idx": 13.0, + "loss/logits": 0.1712278127670288, + "step": 1273 + }, + { + "epoch": 0.019023443332835597, + "grad_norm": 0.5234375, + "grad_norm_var": 0.0084197998046875, + "learning_rate": 2e-05, + "loss": 1.3398, + "loss/crossentropy": 2.5242245197296143, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1328125, + "loss/idx": 13.0, + "loss/logits": 0.20698873698711395, + "step": 1274 + }, + { + "epoch": 0.019038375391966553, + "grad_norm": 0.466796875, + "grad_norm_var": 0.008881616592407226, + "learning_rate": 2e-05, + "loss": 1.1402, + "loss/crossentropy": 2.6311800479888916, + "loss/dist_ce": 0.0, + "loss/fcd": 0.99609375, + "loss/idx": 13.0, + "loss/logits": 0.14407965540885925, + "step": 1275 + }, + { + "epoch": 0.019053307451097505, + "grad_norm": 0.5625, + "grad_norm_var": 0.008845758438110352, + "learning_rate": 2e-05, + "loss": 1.3024, + "loss/crossentropy": 2.553720712661743, + "loss/dist_ce": 0.0, + "loss/fcd": 1.109375, + "loss/idx": 13.0, + "loss/logits": 0.19305121898651123, + "step": 1276 + }, + { + "epoch": 0.01906823951022846, + "grad_norm": 0.5390625, + "grad_norm_var": 0.008856693903605143, + "learning_rate": 2e-05, + "loss": 1.2891, + "loss/crossentropy": 2.5351758003234863, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1015625, + "loss/idx": 13.0, + "loss/logits": 0.1875266134738922, + "step": 1277 + }, + { + "epoch": 0.019083171569359413, + "grad_norm": 0.56640625, + "grad_norm_var": 0.008856693903605143, + "learning_rate": 2e-05, + "loss": 1.3024, + "loss/crossentropy": 2.4513752460479736, + "loss/dist_ce": 0.0, + "loss/fcd": 1.125, + "loss/idx": 13.0, + "loss/logits": 0.17735695838928223, + "step": 1278 + }, + { + "epoch": 0.01909810362849037, + "grad_norm": 0.53515625, + "grad_norm_var": 0.00885618527730306, + "learning_rate": 2e-05, + "loss": 1.2502, + "loss/crossentropy": 2.5539772510528564, + "loss/dist_ce": 0.0, + "loss/fcd": 1.078125, + "loss/idx": 13.0, + "loss/logits": 0.17203769087791443, + "step": 1279 + }, + { + "epoch": 0.01911303568762132, + "grad_norm": 0.56640625, + "grad_norm_var": 0.008524688084920247, + "learning_rate": 2e-05, + "loss": 1.2212, + "loss/crossentropy": 2.49832820892334, + "loss/dist_ce": 0.0, + "loss/fcd": 1.046875, + "loss/idx": 13.0, + "loss/logits": 0.17433378100395203, + "step": 1280 + }, + { + "epoch": 0.019127967746752277, + "grad_norm": 0.515625, + "grad_norm_var": 0.008600346247355143, + "learning_rate": 2e-05, + "loss": 1.2678, + "loss/crossentropy": 2.638371229171753, + "loss/dist_ce": 0.0, + "loss/fcd": 1.09375, + "loss/idx": 13.0, + "loss/logits": 0.17406100034713745, + "step": 1281 + }, + { + "epoch": 0.01914289980588323, + "grad_norm": 0.734375, + "grad_norm_var": 0.01064311663309733, + "learning_rate": 2e-05, + "loss": 1.4935, + "loss/crossentropy": 2.581265687942505, + "loss/dist_ce": 0.0, + "loss/fcd": 1.28125, + "loss/idx": 13.0, + "loss/logits": 0.21223483979701996, + "step": 1282 + }, + { + "epoch": 0.019157831865014185, + "grad_norm": 0.5625, + "grad_norm_var": 0.010526768366495768, + "learning_rate": 2e-05, + "loss": 1.3831, + "loss/crossentropy": 2.5751612186431885, + "loss/dist_ce": 0.0, + "loss/fcd": 1.171875, + "loss/idx": 13.0, + "loss/logits": 0.21123623847961426, + "step": 1283 + }, + { + "epoch": 0.01917276392414514, + "grad_norm": 0.462890625, + "grad_norm_var": 0.011058489481608072, + "learning_rate": 2e-05, + "loss": 1.1104, + "loss/crossentropy": 2.5615251064300537, + "loss/dist_ce": 0.0, + "loss/fcd": 0.9609375, + "loss/idx": 13.0, + "loss/logits": 0.14948110282421112, + "step": 1284 + }, + { + "epoch": 0.019187695983276094, + "grad_norm": 0.53125, + "grad_norm_var": 0.011058489481608072, + "learning_rate": 2e-05, + "loss": 1.2032, + "loss/crossentropy": 2.5952653884887695, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0390625, + "loss/idx": 13.0, + "loss/logits": 0.1641014665365219, + "step": 1285 + }, + { + "epoch": 0.01920262804240705, + "grad_norm": 0.5234375, + "grad_norm_var": 0.011095619201660157, + "learning_rate": 2e-05, + "loss": 1.2654, + "loss/crossentropy": 2.237917423248291, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0703125, + "loss/idx": 13.0, + "loss/logits": 0.19509923458099365, + "step": 1286 + }, + { + "epoch": 0.019217560101538002, + "grad_norm": 0.5390625, + "grad_norm_var": 0.010864194234212239, + "learning_rate": 2e-05, + "loss": 1.2028, + "loss/crossentropy": 2.4912095069885254, + "loss/dist_ce": 0.0, + "loss/fcd": 1.046875, + "loss/idx": 13.0, + "loss/logits": 0.15591605007648468, + "step": 1287 + }, + { + "epoch": 0.019232492160668958, + "grad_norm": 0.53515625, + "grad_norm_var": 0.0035170873006184896, + "learning_rate": 2e-05, + "loss": 1.2284, + "loss/crossentropy": 2.849947214126587, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0546875, + "loss/idx": 13.0, + "loss/logits": 0.17369095981121063, + "step": 1288 + }, + { + "epoch": 0.01924742421979991, + "grad_norm": 0.5546875, + "grad_norm_var": 0.003505961100260417, + "learning_rate": 2e-05, + "loss": 1.3009, + "loss/crossentropy": 2.718153953552246, + "loss/dist_ce": 0.0, + "loss/fcd": 1.125, + "loss/idx": 13.0, + "loss/logits": 0.17590981721878052, + "step": 1289 + }, + { + "epoch": 0.019262356278930866, + "grad_norm": 0.52734375, + "grad_norm_var": 0.0034957249959309896, + "learning_rate": 2e-05, + "loss": 1.2431, + "loss/crossentropy": 2.539076805114746, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0703125, + "loss/idx": 13.0, + "loss/logits": 0.17277245223522186, + "step": 1290 + }, + { + "epoch": 0.019277288338061818, + "grad_norm": 0.578125, + "grad_norm_var": 0.003107055028279622, + "learning_rate": 2e-05, + "loss": 1.3905, + "loss/crossentropy": 2.501863479614258, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1796875, + "loss/idx": 13.0, + "loss/logits": 0.21077433228492737, + "step": 1291 + }, + { + "epoch": 0.019292220397192774, + "grad_norm": 0.70703125, + "grad_norm_var": 0.004612588882446289, + "learning_rate": 2e-05, + "loss": 1.5402, + "loss/crossentropy": 2.5748298168182373, + "loss/dist_ce": 0.0, + "loss/fcd": 1.28125, + "loss/idx": 13.0, + "loss/logits": 0.2589094042778015, + "step": 1292 + }, + { + "epoch": 0.019307152456323726, + "grad_norm": 1.046875, + "grad_norm_var": 0.01923368771870931, + "learning_rate": 2e-05, + "loss": 1.4531, + "loss/crossentropy": 2.400190830230713, + "loss/dist_ce": 0.0, + "loss/fcd": 1.25, + "loss/idx": 13.0, + "loss/logits": 0.2031039446592331, + "step": 1293 + }, + { + "epoch": 0.019322084515454682, + "grad_norm": 0.486328125, + "grad_norm_var": 0.01991729736328125, + "learning_rate": 2e-05, + "loss": 1.1589, + "loss/crossentropy": 2.452254056930542, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0078125, + "loss/idx": 13.0, + "loss/logits": 0.15112650394439697, + "step": 1294 + }, + { + "epoch": 0.019337016574585635, + "grad_norm": 0.484375, + "grad_norm_var": 0.020435523986816407, + "learning_rate": 2e-05, + "loss": 1.2109, + "loss/crossentropy": 2.6677145957946777, + "loss/dist_ce": 0.0, + "loss/fcd": 1.046875, + "loss/idx": 13.0, + "loss/logits": 0.1640741527080536, + "step": 1295 + }, + { + "epoch": 0.01935194863371659, + "grad_norm": 0.59765625, + "grad_norm_var": 0.020420265197753907, + "learning_rate": 2e-05, + "loss": 1.3198, + "loss/crossentropy": 2.658365488052368, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1171875, + "loss/idx": 13.0, + "loss/logits": 0.202656090259552, + "step": 1296 + }, + { + "epoch": 0.019366880692847543, + "grad_norm": 0.609375, + "grad_norm_var": 0.020081520080566406, + "learning_rate": 2e-05, + "loss": 1.3752, + "loss/crossentropy": 2.398197650909424, + "loss/dist_ce": 0.0, + "loss/fcd": 1.15625, + "loss/idx": 13.0, + "loss/logits": 0.21891216933727264, + "step": 1297 + }, + { + "epoch": 0.0193818127519785, + "grad_norm": 0.51953125, + "grad_norm_var": 0.018903096516927082, + "learning_rate": 2e-05, + "loss": 1.2435, + "loss/crossentropy": 2.6253135204315186, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0703125, + "loss/idx": 13.0, + "loss/logits": 0.17318454384803772, + "step": 1298 + }, + { + "epoch": 0.01939674481110945, + "grad_norm": 0.51953125, + "grad_norm_var": 0.019113604227701822, + "learning_rate": 2e-05, + "loss": 1.2117, + "loss/crossentropy": 2.595719575881958, + "loss/dist_ce": 0.0, + "loss/fcd": 1.046875, + "loss/idx": 13.0, + "loss/logits": 0.1647767871618271, + "step": 1299 + }, + { + "epoch": 0.019411676870240407, + "grad_norm": 0.56640625, + "grad_norm_var": 0.018216435114542642, + "learning_rate": 2e-05, + "loss": 1.2591, + "loss/crossentropy": 2.4858176708221436, + "loss/dist_ce": 0.0, + "loss/fcd": 1.078125, + "loss/idx": 13.0, + "loss/logits": 0.1809636652469635, + "step": 1300 + }, + { + "epoch": 0.01942660892937136, + "grad_norm": 0.55859375, + "grad_norm_var": 0.018074909845987957, + "learning_rate": 2e-05, + "loss": 1.2194, + "loss/crossentropy": 2.464454174041748, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0546875, + "loss/idx": 13.0, + "loss/logits": 0.16469821333885193, + "step": 1301 + }, + { + "epoch": 0.019441540988502315, + "grad_norm": 0.49609375, + "grad_norm_var": 0.018344608942667644, + "learning_rate": 2e-05, + "loss": 1.1929, + "loss/crossentropy": 2.5054361820220947, + "loss/dist_ce": 0.0, + "loss/fcd": 1.03125, + "loss/idx": 13.0, + "loss/logits": 0.16160087287425995, + "step": 1302 + }, + { + "epoch": 0.019456473047633267, + "grad_norm": 0.53515625, + "grad_norm_var": 0.01836838722229004, + "learning_rate": 2e-05, + "loss": 1.1693, + "loss/crossentropy": 2.579584836959839, + "loss/dist_ce": 0.0, + "loss/fcd": 1.015625, + "loss/idx": 13.0, + "loss/logits": 0.15363293886184692, + "step": 1303 + }, + { + "epoch": 0.019471405106764223, + "grad_norm": 0.53515625, + "grad_norm_var": 0.01836838722229004, + "learning_rate": 2e-05, + "loss": 1.2129, + "loss/crossentropy": 2.4479448795318604, + "loss/dist_ce": 0.0, + "loss/fcd": 1.046875, + "loss/idx": 13.0, + "loss/logits": 0.16602320969104767, + "step": 1304 + }, + { + "epoch": 0.019486337165895175, + "grad_norm": 0.5703125, + "grad_norm_var": 0.018325408299763996, + "learning_rate": 2e-05, + "loss": 1.334, + "loss/crossentropy": 2.685575485229492, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1484375, + "loss/idx": 13.0, + "loss/logits": 0.18556523323059082, + "step": 1305 + }, + { + "epoch": 0.01950126922502613, + "grad_norm": 0.5625, + "grad_norm_var": 0.018138869603474935, + "learning_rate": 2e-05, + "loss": 1.3211, + "loss/crossentropy": 2.4253220558166504, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1328125, + "loss/idx": 13.0, + "loss/logits": 0.18828678131103516, + "step": 1306 + }, + { + "epoch": 0.019516201284157084, + "grad_norm": 0.55078125, + "grad_norm_var": 0.018213637669881187, + "learning_rate": 2e-05, + "loss": 1.2963, + "loss/crossentropy": 2.4698915481567383, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1015625, + "loss/idx": 13.0, + "loss/logits": 0.1947701871395111, + "step": 1307 + }, + { + "epoch": 0.01953113334328804, + "grad_norm": 0.578125, + "grad_norm_var": 0.017139418919881185, + "learning_rate": 2e-05, + "loss": 1.3248, + "loss/crossentropy": 2.5888915061950684, + "loss/dist_ce": 0.0, + "loss/fcd": 1.140625, + "loss/idx": 13.0, + "loss/logits": 0.18418516218662262, + "step": 1308 + }, + { + "epoch": 0.019546065402418992, + "grad_norm": 0.5, + "grad_norm_var": 0.0015004316965738932, + "learning_rate": 2e-05, + "loss": 1.1915, + "loss/crossentropy": 2.698927640914917, + "loss/dist_ce": 0.0, + "loss/fcd": 1.03125, + "loss/idx": 13.0, + "loss/logits": 0.16026300191879272, + "step": 1309 + }, + { + "epoch": 0.019560997461549948, + "grad_norm": 0.498046875, + "grad_norm_var": 0.0014222304026285807, + "learning_rate": 2e-05, + "loss": 1.1648, + "loss/crossentropy": 2.4757766723632812, + "loss/dist_ce": 0.0, + "loss/fcd": 1.015625, + "loss/idx": 13.0, + "loss/logits": 0.14918801188468933, + "step": 1310 + }, + { + "epoch": 0.019575929520680904, + "grad_norm": 0.90234375, + "grad_norm_var": 0.009095875422159831, + "learning_rate": 2e-05, + "loss": 1.5092, + "loss/crossentropy": 2.956965684890747, + "loss/dist_ce": 0.0, + "loss/fcd": 1.25, + "loss/idx": 13.0, + "loss/logits": 0.2591836452484131, + "step": 1311 + }, + { + "epoch": 0.019590861579811856, + "grad_norm": 0.5, + "grad_norm_var": 0.009315220514933269, + "learning_rate": 2e-05, + "loss": 1.227, + "loss/crossentropy": 2.597576379776001, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0546875, + "loss/idx": 13.0, + "loss/logits": 0.17235302925109863, + "step": 1312 + }, + { + "epoch": 0.01960579363894281, + "grad_norm": 0.51953125, + "grad_norm_var": 0.009259653091430665, + "learning_rate": 2e-05, + "loss": 1.2917, + "loss/crossentropy": 2.78556752204895, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1015625, + "loss/idx": 13.0, + "loss/logits": 0.19013968110084534, + "step": 1313 + }, + { + "epoch": 0.019620725698073764, + "grad_norm": 0.58203125, + "grad_norm_var": 0.009191497166951498, + "learning_rate": 2e-05, + "loss": 1.3658, + "loss/crossentropy": 2.502690553665161, + "loss/dist_ce": 0.0, + "loss/fcd": 1.171875, + "loss/idx": 13.0, + "loss/logits": 0.19392375648021698, + "step": 1314 + }, + { + "epoch": 0.01963565775720472, + "grad_norm": 0.62109375, + "grad_norm_var": 0.00927580197652181, + "learning_rate": 2e-05, + "loss": 1.297, + "loss/crossentropy": 2.6288228034973145, + "loss/dist_ce": 0.0, + "loss/fcd": 1.109375, + "loss/idx": 13.0, + "loss/logits": 0.18761998414993286, + "step": 1315 + }, + { + "epoch": 0.019650589816335672, + "grad_norm": 0.875, + "grad_norm_var": 0.015192524592081705, + "learning_rate": 2e-05, + "loss": 1.2782, + "loss/crossentropy": 2.738717794418335, + "loss/dist_ce": 0.0, + "loss/fcd": 1.09375, + "loss/idx": 13.0, + "loss/logits": 0.1844879537820816, + "step": 1316 + }, + { + "epoch": 0.019665521875466628, + "grad_norm": 0.5546875, + "grad_norm_var": 0.015208037694295247, + "learning_rate": 2e-05, + "loss": 1.2752, + "loss/crossentropy": 2.660844564437866, + "loss/dist_ce": 0.0, + "loss/fcd": 1.09375, + "loss/idx": 13.0, + "loss/logits": 0.18141654133796692, + "step": 1317 + }, + { + "epoch": 0.01968045393459758, + "grad_norm": 0.47265625, + "grad_norm_var": 0.01552427609761556, + "learning_rate": 2e-05, + "loss": 1.204, + "loss/crossentropy": 2.632686138153076, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0390625, + "loss/idx": 13.0, + "loss/logits": 0.16492034494876862, + "step": 1318 + }, + { + "epoch": 0.019695385993728536, + "grad_norm": 0.53515625, + "grad_norm_var": 0.01552427609761556, + "learning_rate": 2e-05, + "loss": 1.1667, + "loss/crossentropy": 2.642601728439331, + "loss/dist_ce": 0.0, + "loss/fcd": 1.015625, + "loss/idx": 13.0, + "loss/logits": 0.1511181890964508, + "step": 1319 + }, + { + "epoch": 0.01971031805285949, + "grad_norm": 0.4609375, + "grad_norm_var": 0.01636020342508952, + "learning_rate": 2e-05, + "loss": 1.2313, + "loss/crossentropy": 2.6074163913726807, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0546875, + "loss/idx": 13.0, + "loss/logits": 0.17656564712524414, + "step": 1320 + }, + { + "epoch": 0.019725250111990444, + "grad_norm": 0.828125, + "grad_norm_var": 0.020174519220987955, + "learning_rate": 2e-05, + "loss": 1.2131, + "loss/crossentropy": 2.755542755126953, + "loss/dist_ce": 0.0, + "loss/fcd": 1.046875, + "loss/idx": 13.0, + "loss/logits": 0.16619285941123962, + "step": 1321 + }, + { + "epoch": 0.019740182171121397, + "grad_norm": 0.61328125, + "grad_norm_var": 0.020106744766235352, + "learning_rate": 2e-05, + "loss": 1.3149, + "loss/crossentropy": 2.5030322074890137, + "loss/dist_ce": 0.0, + "loss/fcd": 1.125, + "loss/idx": 13.0, + "loss/logits": 0.18986953794956207, + "step": 1322 + }, + { + "epoch": 0.019755114230252353, + "grad_norm": 0.54296875, + "grad_norm_var": 0.02016129493713379, + "learning_rate": 2e-05, + "loss": 1.2952, + "loss/crossentropy": 2.4982292652130127, + "loss/dist_ce": 0.0, + "loss/fcd": 1.109375, + "loss/idx": 13.0, + "loss/logits": 0.185785174369812, + "step": 1323 + }, + { + "epoch": 0.019770046289383305, + "grad_norm": 0.56640625, + "grad_norm_var": 0.020202493667602538, + "learning_rate": 2e-05, + "loss": 1.2813, + "loss/crossentropy": 2.668501138687134, + "loss/dist_ce": 0.0, + "loss/fcd": 1.09375, + "loss/idx": 13.0, + "loss/logits": 0.1875031292438507, + "step": 1324 + }, + { + "epoch": 0.01978497834851426, + "grad_norm": 0.50390625, + "grad_norm_var": 0.02015226682027181, + "learning_rate": 2e-05, + "loss": 1.2135, + "loss/crossentropy": 2.592796564102173, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0390625, + "loss/idx": 13.0, + "loss/logits": 0.17439380288124084, + "step": 1325 + }, + { + "epoch": 0.019799910407645213, + "grad_norm": 0.51953125, + "grad_norm_var": 0.019893328348795574, + "learning_rate": 2e-05, + "loss": 1.3221, + "loss/crossentropy": 2.619886636734009, + "loss/dist_ce": 0.0, + "loss/fcd": 1.125, + "loss/idx": 13.0, + "loss/logits": 0.19710972905158997, + "step": 1326 + }, + { + "epoch": 0.01981484246677617, + "grad_norm": 0.6484375, + "grad_norm_var": 0.013682047526041666, + "learning_rate": 2e-05, + "loss": 1.5511, + "loss/crossentropy": 2.5196568965911865, + "loss/dist_ce": 0.0, + "loss/fcd": 1.3046875, + "loss/idx": 13.0, + "loss/logits": 0.2463717758655548, + "step": 1327 + }, + { + "epoch": 0.01982977452590712, + "grad_norm": 0.6484375, + "grad_norm_var": 0.013396962483723959, + "learning_rate": 2e-05, + "loss": 1.3578, + "loss/crossentropy": 2.68768572807312, + "loss/dist_ce": 0.0, + "loss/fcd": 1.171875, + "loss/idx": 13.0, + "loss/logits": 0.1859176903963089, + "step": 1328 + }, + { + "epoch": 0.019844706585038077, + "grad_norm": 0.46484375, + "grad_norm_var": 0.014121500651041667, + "learning_rate": 2e-05, + "loss": 1.1804, + "loss/crossentropy": 2.659552812576294, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0234375, + "loss/idx": 13.0, + "loss/logits": 0.15694257616996765, + "step": 1329 + }, + { + "epoch": 0.01985963864416903, + "grad_norm": 0.470703125, + "grad_norm_var": 0.015012089411417644, + "learning_rate": 2e-05, + "loss": 1.2425, + "loss/crossentropy": 2.698822498321533, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0625, + "loss/idx": 13.0, + "loss/logits": 0.1799800544977188, + "step": 1330 + }, + { + "epoch": 0.019874570703299985, + "grad_norm": 0.6015625, + "grad_norm_var": 0.014936431248982748, + "learning_rate": 2e-05, + "loss": 1.3199, + "loss/crossentropy": 2.2921221256256104, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1328125, + "loss/idx": 13.0, + "loss/logits": 0.18708041310310364, + "step": 1331 + }, + { + "epoch": 0.019889502762430938, + "grad_norm": 0.5859375, + "grad_norm_var": 0.008853133519490559, + "learning_rate": 2e-05, + "loss": 1.319, + "loss/crossentropy": 2.7646992206573486, + "loss/dist_ce": 0.0, + "loss/fcd": 1.125, + "loss/idx": 13.0, + "loss/logits": 0.19404536485671997, + "step": 1332 + }, + { + "epoch": 0.019904434821561894, + "grad_norm": 0.484375, + "grad_norm_var": 0.009245665868123372, + "learning_rate": 2e-05, + "loss": 1.2248, + "loss/crossentropy": 2.6878888607025146, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0546875, + "loss/idx": 13.0, + "loss/logits": 0.17010048031806946, + "step": 1333 + }, + { + "epoch": 0.019919366880692846, + "grad_norm": 0.5625, + "grad_norm_var": 0.008713388442993164, + "learning_rate": 2e-05, + "loss": 1.2177, + "loss/crossentropy": 2.486751079559326, + "loss/dist_ce": 0.0, + "loss/fcd": 1.046875, + "loss/idx": 13.0, + "loss/logits": 0.17087268829345703, + "step": 1334 + }, + { + "epoch": 0.0199342989398238, + "grad_norm": 0.56640625, + "grad_norm_var": 0.008650827407836913, + "learning_rate": 2e-05, + "loss": 1.3721, + "loss/crossentropy": 2.5809147357940674, + "loss/dist_ce": 0.0, + "loss/fcd": 1.171875, + "loss/idx": 13.0, + "loss/logits": 0.2001952826976776, + "step": 1335 + }, + { + "epoch": 0.019949230998954758, + "grad_norm": 0.53515625, + "grad_norm_var": 0.007947778701782227, + "learning_rate": 2e-05, + "loss": 1.2824, + "loss/crossentropy": 2.8265273571014404, + "loss/dist_ce": 0.0, + "loss/fcd": 1.09375, + "loss/idx": 13.0, + "loss/logits": 0.18861952424049377, + "step": 1336 + }, + { + "epoch": 0.01996416305808571, + "grad_norm": 0.72265625, + "grad_norm_var": 0.005032968521118164, + "learning_rate": 2e-05, + "loss": 1.2615, + "loss/crossentropy": 2.333455801010132, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1015625, + "loss/idx": 13.0, + "loss/logits": 0.1599290519952774, + "step": 1337 + }, + { + "epoch": 0.019979095117216666, + "grad_norm": 0.67578125, + "grad_norm_var": 0.005680958429972331, + "learning_rate": 2e-05, + "loss": 1.2279, + "loss/crossentropy": 2.4327051639556885, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0625, + "loss/idx": 13.0, + "loss/logits": 0.16535300016403198, + "step": 1338 + }, + { + "epoch": 0.019994027176347618, + "grad_norm": 0.53125, + "grad_norm_var": 0.005729786554972331, + "learning_rate": 2e-05, + "loss": 1.202, + "loss/crossentropy": 2.7616231441497803, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0390625, + "loss/idx": 13.0, + "loss/logits": 0.16289827227592468, + "step": 1339 + }, + { + "epoch": 0.020008959235478574, + "grad_norm": 0.609375, + "grad_norm_var": 0.005836089452107747, + "learning_rate": 2e-05, + "loss": 1.4349, + "loss/crossentropy": 2.4606475830078125, + "loss/dist_ce": 0.0, + "loss/fcd": 1.2109375, + "loss/idx": 13.0, + "loss/logits": 0.22398307919502258, + "step": 1340 + }, + { + "epoch": 0.020023891294609526, + "grad_norm": 0.58203125, + "grad_norm_var": 0.005522012710571289, + "learning_rate": 2e-05, + "loss": 1.2027, + "loss/crossentropy": 2.614811897277832, + "loss/dist_ce": 0.0, + "loss/fcd": 1.046875, + "loss/idx": 13.0, + "loss/logits": 0.1558469831943512, + "step": 1341 + }, + { + "epoch": 0.020038823353740482, + "grad_norm": 0.51171875, + "grad_norm_var": 0.005584192276000976, + "learning_rate": 2e-05, + "loss": 1.15, + "loss/crossentropy": 2.557891368865967, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0, + "loss/idx": 13.0, + "loss/logits": 0.14996448159217834, + "step": 1342 + }, + { + "epoch": 0.020053755412871434, + "grad_norm": 0.498046875, + "grad_norm_var": 0.0055266698201497395, + "learning_rate": 2e-05, + "loss": 1.2479, + "loss/crossentropy": 2.5452401638031006, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0703125, + "loss/idx": 13.0, + "loss/logits": 0.17753981053829193, + "step": 1343 + }, + { + "epoch": 0.02006868747200239, + "grad_norm": 0.484375, + "grad_norm_var": 0.00539849599202474, + "learning_rate": 2e-05, + "loss": 1.1854, + "loss/crossentropy": 2.5871667861938477, + "loss/dist_ce": 0.0, + "loss/fcd": 1.03125, + "loss/idx": 13.0, + "loss/logits": 0.1541636437177658, + "step": 1344 + }, + { + "epoch": 0.020083619531133343, + "grad_norm": 0.58984375, + "grad_norm_var": 0.004865455627441406, + "learning_rate": 2e-05, + "loss": 1.2847, + "loss/crossentropy": 2.5764195919036865, + "loss/dist_ce": 0.0, + "loss/fcd": 1.109375, + "loss/idx": 13.0, + "loss/logits": 0.17535443603992462, + "step": 1345 + }, + { + "epoch": 0.0200985515902643, + "grad_norm": 0.5390625, + "grad_norm_var": 0.004314152399698893, + "learning_rate": 2e-05, + "loss": 1.2904, + "loss/crossentropy": 2.349005699157715, + "loss/dist_ce": 0.0, + "loss/fcd": 1.09375, + "loss/idx": 13.0, + "loss/logits": 0.19662730395793915, + "step": 1346 + }, + { + "epoch": 0.02011348364939525, + "grad_norm": 0.5546875, + "grad_norm_var": 0.004238621393839518, + "learning_rate": 2e-05, + "loss": 1.2428, + "loss/crossentropy": 2.576354503631592, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0703125, + "loss/idx": 13.0, + "loss/logits": 0.1724541187286377, + "step": 1347 + }, + { + "epoch": 0.020128415708526207, + "grad_norm": 0.5234375, + "grad_norm_var": 0.004304742813110352, + "learning_rate": 2e-05, + "loss": 1.2087, + "loss/crossentropy": 2.7604458332061768, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0390625, + "loss/idx": 13.0, + "loss/logits": 0.16968321800231934, + "step": 1348 + }, + { + "epoch": 0.02014334776765716, + "grad_norm": 0.55859375, + "grad_norm_var": 0.00389402707417806, + "learning_rate": 2e-05, + "loss": 1.2885, + "loss/crossentropy": 2.5020458698272705, + "loss/dist_ce": 0.0, + "loss/fcd": 1.109375, + "loss/idx": 13.0, + "loss/logits": 0.17917054891586304, + "step": 1349 + }, + { + "epoch": 0.020158279826788115, + "grad_norm": 0.609375, + "grad_norm_var": 0.004013808568318685, + "learning_rate": 2e-05, + "loss": 1.4814, + "loss/crossentropy": 2.569965124130249, + "loss/dist_ce": 0.0, + "loss/fcd": 1.2421875, + "loss/idx": 13.0, + "loss/logits": 0.23924951255321503, + "step": 1350 + }, + { + "epoch": 0.020173211885919067, + "grad_norm": 0.54296875, + "grad_norm_var": 0.004053862889607748, + "learning_rate": 2e-05, + "loss": 1.2074, + "loss/crossentropy": 2.6130759716033936, + "loss/dist_ce": 0.0, + "loss/fcd": 1.046875, + "loss/idx": 13.0, + "loss/logits": 0.16052843630313873, + "step": 1351 + }, + { + "epoch": 0.020188143945050023, + "grad_norm": 0.546875, + "grad_norm_var": 0.00401304562886556, + "learning_rate": 2e-05, + "loss": 1.2967, + "loss/crossentropy": 2.5374364852905273, + "loss/dist_ce": 0.0, + "loss/fcd": 1.109375, + "loss/idx": 13.0, + "loss/logits": 0.18736249208450317, + "step": 1352 + }, + { + "epoch": 0.020203076004180975, + "grad_norm": 0.61328125, + "grad_norm_var": 0.0024981021881103514, + "learning_rate": 2e-05, + "loss": 1.3419, + "loss/crossentropy": 2.4236462116241455, + "loss/dist_ce": 0.0, + "loss/fcd": 1.15625, + "loss/idx": 13.0, + "loss/logits": 0.18566705286502838, + "step": 1353 + }, + { + "epoch": 0.02021800806331193, + "grad_norm": 0.48828125, + "grad_norm_var": 0.0018175601959228515, + "learning_rate": 2e-05, + "loss": 1.205, + "loss/crossentropy": 2.845634937286377, + "loss/dist_ce": 0.0, + "loss/fcd": 1.03125, + "loss/idx": 13.0, + "loss/logits": 0.17379263043403625, + "step": 1354 + }, + { + "epoch": 0.020232940122442884, + "grad_norm": 0.4921875, + "grad_norm_var": 0.002005116144816081, + "learning_rate": 2e-05, + "loss": 1.2024, + "loss/crossentropy": 2.4425594806671143, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0390625, + "loss/idx": 13.0, + "loss/logits": 0.16329693794250488, + "step": 1355 + }, + { + "epoch": 0.02024787218157384, + "grad_norm": 0.515625, + "grad_norm_var": 0.0017686049143473307, + "learning_rate": 2e-05, + "loss": 1.1883, + "loss/crossentropy": 2.421473741531372, + "loss/dist_ce": 0.0, + "loss/fcd": 1.03125, + "loss/idx": 13.0, + "loss/logits": 0.15703192353248596, + "step": 1356 + }, + { + "epoch": 0.020262804240704792, + "grad_norm": 0.52734375, + "grad_norm_var": 0.0016537825266520181, + "learning_rate": 2e-05, + "loss": 1.3472, + "loss/crossentropy": 2.572152853012085, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1484375, + "loss/idx": 13.0, + "loss/logits": 0.19879058003425598, + "step": 1357 + }, + { + "epoch": 0.020277736299835748, + "grad_norm": 1.96875, + "grad_norm_var": 0.12938116391499838, + "learning_rate": 2e-05, + "loss": 1.7107, + "loss/crossentropy": 2.3684675693511963, + "loss/dist_ce": 0.0, + "loss/fcd": 1.421875, + "loss/idx": 13.0, + "loss/logits": 0.2888346314430237, + "step": 1358 + }, + { + "epoch": 0.0202926683589667, + "grad_norm": 0.48046875, + "grad_norm_var": 0.12970574696858725, + "learning_rate": 2e-05, + "loss": 1.181, + "loss/crossentropy": 2.509111166000366, + "loss/dist_ce": 0.0, + "loss/fcd": 1.015625, + "loss/idx": 13.0, + "loss/logits": 0.16538101434707642, + "step": 1359 + }, + { + "epoch": 0.020307600418097656, + "grad_norm": 0.546875, + "grad_norm_var": 0.12875970204671225, + "learning_rate": 2e-05, + "loss": 1.2878, + "loss/crossentropy": 2.669523000717163, + "loss/dist_ce": 0.0, + "loss/fcd": 1.09375, + "loss/idx": 13.0, + "loss/logits": 0.1940631866455078, + "step": 1360 + }, + { + "epoch": 0.020322532477228608, + "grad_norm": 0.61328125, + "grad_norm_var": 0.12866509755452474, + "learning_rate": 2e-05, + "loss": 1.4801, + "loss/crossentropy": 2.491579055786133, + "loss/dist_ce": 0.0, + "loss/fcd": 1.2421875, + "loss/idx": 13.0, + "loss/logits": 0.23791025578975677, + "step": 1361 + }, + { + "epoch": 0.020337464536359564, + "grad_norm": 0.474609375, + "grad_norm_var": 0.12972830136617025, + "learning_rate": 2e-05, + "loss": 1.193, + "loss/crossentropy": 2.5316779613494873, + "loss/dist_ce": 0.0, + "loss/fcd": 1.03125, + "loss/idx": 13.0, + "loss/logits": 0.16174541413784027, + "step": 1362 + }, + { + "epoch": 0.02035239659549052, + "grad_norm": 0.49609375, + "grad_norm_var": 0.13051985104878744, + "learning_rate": 2e-05, + "loss": 1.2706, + "loss/crossentropy": 2.616147518157959, + "loss/dist_ce": 0.0, + "loss/fcd": 1.09375, + "loss/idx": 13.0, + "loss/logits": 0.17682576179504395, + "step": 1363 + }, + { + "epoch": 0.020367328654621472, + "grad_norm": 0.50390625, + "grad_norm_var": 0.13080786069234213, + "learning_rate": 2e-05, + "loss": 1.266, + "loss/crossentropy": 2.4946200847625732, + "loss/dist_ce": 0.0, + "loss/fcd": 1.09375, + "loss/idx": 13.0, + "loss/logits": 0.17224857211112976, + "step": 1364 + }, + { + "epoch": 0.020382260713752428, + "grad_norm": 0.5625, + "grad_norm_var": 0.13077492713928224, + "learning_rate": 2e-05, + "loss": 1.3026, + "loss/crossentropy": 2.7409372329711914, + "loss/dist_ce": 0.0, + "loss/fcd": 1.109375, + "loss/idx": 13.0, + "loss/logits": 0.19320283830165863, + "step": 1365 + }, + { + "epoch": 0.02039719277288338, + "grad_norm": 0.55859375, + "grad_norm_var": 0.13103445370992026, + "learning_rate": 2e-05, + "loss": 1.2833, + "loss/crossentropy": 2.5230894088745117, + "loss/dist_ce": 0.0, + "loss/fcd": 1.09375, + "loss/idx": 13.0, + "loss/logits": 0.18950234353542328, + "step": 1366 + }, + { + "epoch": 0.020412124832014336, + "grad_norm": 0.734375, + "grad_norm_var": 0.13133975664774578, + "learning_rate": 2e-05, + "loss": 1.4051, + "loss/crossentropy": 2.4019763469696045, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1953125, + "loss/idx": 13.0, + "loss/logits": 0.20979498326778412, + "step": 1367 + }, + { + "epoch": 0.02042705689114529, + "grad_norm": 0.50390625, + "grad_norm_var": 0.13194680213928223, + "learning_rate": 2e-05, + "loss": 1.227, + "loss/crossentropy": 2.572502851486206, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0546875, + "loss/idx": 13.0, + "loss/logits": 0.17235463857650757, + "step": 1368 + }, + { + "epoch": 0.020441988950276244, + "grad_norm": 0.57421875, + "grad_norm_var": 0.13212927182515463, + "learning_rate": 2e-05, + "loss": 1.3113, + "loss/crossentropy": 2.350470781326294, + "loss/dist_ce": 0.0, + "loss/fcd": 1.140625, + "loss/idx": 13.0, + "loss/logits": 0.17066575586795807, + "step": 1369 + }, + { + "epoch": 0.020456921009407197, + "grad_norm": 0.5703125, + "grad_norm_var": 0.13102644284566242, + "learning_rate": 2e-05, + "loss": 1.3753, + "loss/crossentropy": 2.5796074867248535, + "loss/dist_ce": 0.0, + "loss/fcd": 1.171875, + "loss/idx": 13.0, + "loss/logits": 0.20337635278701782, + "step": 1370 + }, + { + "epoch": 0.020471853068538153, + "grad_norm": 0.515625, + "grad_norm_var": 0.13062170346577961, + "learning_rate": 2e-05, + "loss": 1.1941, + "loss/crossentropy": 2.5275111198425293, + "loss/dist_ce": 0.0, + "loss/fcd": 1.03125, + "loss/idx": 13.0, + "loss/logits": 0.16285260021686554, + "step": 1371 + }, + { + "epoch": 0.020486785127669105, + "grad_norm": 0.470703125, + "grad_norm_var": 0.13145777384440105, + "learning_rate": 2e-05, + "loss": 1.1291, + "loss/crossentropy": 2.7132022380828857, + "loss/dist_ce": 0.0, + "loss/fcd": 0.97265625, + "loss/idx": 13.0, + "loss/logits": 0.156441330909729, + "step": 1372 + }, + { + "epoch": 0.02050171718680006, + "grad_norm": 0.6015625, + "grad_norm_var": 0.13077284495035807, + "learning_rate": 2e-05, + "loss": 1.3682, + "loss/crossentropy": 2.4123904705047607, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1796875, + "loss/idx": 13.0, + "loss/logits": 0.18850690126419067, + "step": 1373 + }, + { + "epoch": 0.020516649245931013, + "grad_norm": 0.47265625, + "grad_norm_var": 0.004807790120442708, + "learning_rate": 2e-05, + "loss": 1.1859, + "loss/crossentropy": 2.519430160522461, + "loss/dist_ce": 0.0, + "loss/fcd": 1.03125, + "loss/idx": 13.0, + "loss/logits": 0.1546916663646698, + "step": 1374 + }, + { + "epoch": 0.02053158130506197, + "grad_norm": 0.49609375, + "grad_norm_var": 0.004693857828776042, + "learning_rate": 2e-05, + "loss": 1.1965, + "loss/crossentropy": 2.733642816543579, + "loss/dist_ce": 0.0, + "loss/fcd": 1.03125, + "loss/idx": 13.0, + "loss/logits": 0.16526637971401215, + "step": 1375 + }, + { + "epoch": 0.02054651336419292, + "grad_norm": 0.49609375, + "grad_norm_var": 0.004831886291503907, + "learning_rate": 2e-05, + "loss": 1.1669, + "loss/crossentropy": 2.5803768634796143, + "loss/dist_ce": 0.0, + "loss/fcd": 1.015625, + "loss/idx": 13.0, + "loss/logits": 0.1512264758348465, + "step": 1376 + }, + { + "epoch": 0.020561445423323877, + "grad_norm": 0.59375, + "grad_norm_var": 0.0046656290690104164, + "learning_rate": 2e-05, + "loss": 1.3094, + "loss/crossentropy": 2.4563205242156982, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1171875, + "loss/idx": 13.0, + "loss/logits": 0.1922210454940796, + "step": 1377 + }, + { + "epoch": 0.02057637748245483, + "grad_norm": 0.498046875, + "grad_norm_var": 0.004498545328776042, + "learning_rate": 2e-05, + "loss": 1.2834, + "loss/crossentropy": 2.5536391735076904, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1015625, + "loss/idx": 13.0, + "loss/logits": 0.18183785676956177, + "step": 1378 + }, + { + "epoch": 0.020591309541585785, + "grad_norm": 0.51171875, + "grad_norm_var": 0.004421234130859375, + "learning_rate": 2e-05, + "loss": 1.1879, + "loss/crossentropy": 2.3600947856903076, + "loss/dist_ce": 0.0, + "loss/fcd": 1.03125, + "loss/idx": 13.0, + "loss/logits": 0.15669815242290497, + "step": 1379 + }, + { + "epoch": 0.020606241600716738, + "grad_norm": 0.5, + "grad_norm_var": 0.00444176991780599, + "learning_rate": 2e-05, + "loss": 1.2191, + "loss/crossentropy": 2.5085413455963135, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0546875, + "loss/idx": 13.0, + "loss/logits": 0.16443030536174774, + "step": 1380 + }, + { + "epoch": 0.020621173659847693, + "grad_norm": 0.51953125, + "grad_norm_var": 0.004435475667317708, + "learning_rate": 2e-05, + "loss": 1.2258, + "loss/crossentropy": 2.6902709007263184, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0625, + "loss/idx": 13.0, + "loss/logits": 0.16326066851615906, + "step": 1381 + }, + { + "epoch": 0.020636105718978646, + "grad_norm": 0.6015625, + "grad_norm_var": 0.004665565490722656, + "learning_rate": 2e-05, + "loss": 1.282, + "loss/crossentropy": 2.8826005458831787, + "loss/dist_ce": 0.0, + "loss/fcd": 1.09375, + "loss/idx": 13.0, + "loss/logits": 0.18827125430107117, + "step": 1382 + }, + { + "epoch": 0.0206510377781096, + "grad_norm": 0.59375, + "grad_norm_var": 0.0022806167602539063, + "learning_rate": 2e-05, + "loss": 1.3672, + "loss/crossentropy": 2.341332197189331, + "loss/dist_ce": 0.0, + "loss/fcd": 1.171875, + "loss/idx": 13.0, + "loss/logits": 0.19533485174179077, + "step": 1383 + }, + { + "epoch": 0.020665969837240554, + "grad_norm": 0.55859375, + "grad_norm_var": 0.0022592544555664062, + "learning_rate": 2e-05, + "loss": 1.2868, + "loss/crossentropy": 2.7123072147369385, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1171875, + "loss/idx": 13.0, + "loss/logits": 0.16963346302509308, + "step": 1384 + }, + { + "epoch": 0.02068090189637151, + "grad_norm": 0.55859375, + "grad_norm_var": 0.002194658915201823, + "learning_rate": 2e-05, + "loss": 1.224, + "loss/crossentropy": 2.544621706008911, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0546875, + "loss/idx": 13.0, + "loss/logits": 0.16929784417152405, + "step": 1385 + }, + { + "epoch": 0.020695833955502462, + "grad_norm": 0.5, + "grad_norm_var": 0.002171770731608073, + "learning_rate": 2e-05, + "loss": 1.2222, + "loss/crossentropy": 2.4783518314361572, + "loss/dist_ce": 0.0, + "loss/fcd": 1.046875, + "loss/idx": 13.0, + "loss/logits": 0.1753370463848114, + "step": 1386 + }, + { + "epoch": 0.020710766014633418, + "grad_norm": 0.466796875, + "grad_norm_var": 0.002417739232381185, + "learning_rate": 2e-05, + "loss": 1.1808, + "loss/crossentropy": 2.5404293537139893, + "loss/dist_ce": 0.0, + "loss/fcd": 1.015625, + "loss/idx": 13.0, + "loss/logits": 0.16517174243927002, + "step": 1387 + }, + { + "epoch": 0.020725698073764374, + "grad_norm": 0.51953125, + "grad_norm_var": 0.0021972020467122396, + "learning_rate": 2e-05, + "loss": 1.2112, + "loss/crossentropy": 2.6268763542175293, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0390625, + "loss/idx": 13.0, + "loss/logits": 0.17210936546325684, + "step": 1388 + }, + { + "epoch": 0.020740630132895326, + "grad_norm": 0.5625, + "grad_norm_var": 0.0019225438435872396, + "learning_rate": 2e-05, + "loss": 1.2616, + "loss/crossentropy": 2.555655002593994, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0859375, + "loss/idx": 13.0, + "loss/logits": 0.17567789554595947, + "step": 1389 + }, + { + "epoch": 0.020755562192026282, + "grad_norm": 0.49609375, + "grad_norm_var": 0.0017836888631184895, + "learning_rate": 2e-05, + "loss": 1.2087, + "loss/crossentropy": 2.5104944705963135, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0546875, + "loss/idx": 13.0, + "loss/logits": 0.15401938557624817, + "step": 1390 + }, + { + "epoch": 0.020770494251157234, + "grad_norm": 0.498046875, + "grad_norm_var": 0.001775217056274414, + "learning_rate": 2e-05, + "loss": 1.2413, + "loss/crossentropy": 2.602848768234253, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0703125, + "loss/idx": 13.0, + "loss/logits": 0.170992910861969, + "step": 1391 + }, + { + "epoch": 0.02078542631028819, + "grad_norm": 0.54296875, + "grad_norm_var": 0.001702737808227539, + "learning_rate": 2e-05, + "loss": 1.2999, + "loss/crossentropy": 2.5092689990997314, + "loss/dist_ce": 0.0, + "loss/fcd": 1.109375, + "loss/idx": 13.0, + "loss/logits": 0.19050829112529755, + "step": 1392 + }, + { + "epoch": 0.020800358369419143, + "grad_norm": 0.51171875, + "grad_norm_var": 0.0014544010162353515, + "learning_rate": 2e-05, + "loss": 1.2414, + "loss/crossentropy": 2.510206460952759, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0546875, + "loss/idx": 13.0, + "loss/logits": 0.18671731650829315, + "step": 1393 + }, + { + "epoch": 0.0208152904285501, + "grad_norm": 0.478515625, + "grad_norm_var": 0.00155485471089681, + "learning_rate": 2e-05, + "loss": 1.1186, + "loss/crossentropy": 2.801673650741577, + "loss/dist_ce": 0.0, + "loss/fcd": 0.96484375, + "loss/idx": 13.0, + "loss/logits": 0.15374580025672913, + "step": 1394 + }, + { + "epoch": 0.02083022248768105, + "grad_norm": 0.5078125, + "grad_norm_var": 0.0015633742014567057, + "learning_rate": 2e-05, + "loss": 1.196, + "loss/crossentropy": 2.528714418411255, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0390625, + "loss/idx": 13.0, + "loss/logits": 0.1569829136133194, + "step": 1395 + }, + { + "epoch": 0.020845154546812007, + "grad_norm": 0.52734375, + "grad_norm_var": 0.0015153090159098308, + "learning_rate": 2e-05, + "loss": 1.1815, + "loss/crossentropy": 2.5752596855163574, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0234375, + "loss/idx": 13.0, + "loss/logits": 0.15807154774665833, + "step": 1396 + }, + { + "epoch": 0.02086008660594296, + "grad_norm": 0.490234375, + "grad_norm_var": 0.0016009012858072917, + "learning_rate": 2e-05, + "loss": 1.1608, + "loss/crossentropy": 2.6302576065063477, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0078125, + "loss/idx": 13.0, + "loss/logits": 0.15298575162887573, + "step": 1397 + }, + { + "epoch": 0.020875018665073915, + "grad_norm": 0.53515625, + "grad_norm_var": 0.0012063980102539062, + "learning_rate": 2e-05, + "loss": 1.2726, + "loss/crossentropy": 2.4865758419036865, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1015625, + "loss/idx": 13.0, + "loss/logits": 0.17103740572929382, + "step": 1398 + }, + { + "epoch": 0.020889950724204867, + "grad_norm": 0.55078125, + "grad_norm_var": 0.0009091695149739584, + "learning_rate": 2e-05, + "loss": 1.2263, + "loss/crossentropy": 2.63130259513855, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0546875, + "loss/idx": 13.0, + "loss/logits": 0.17159323394298553, + "step": 1399 + }, + { + "epoch": 0.020904882783335823, + "grad_norm": 0.46875, + "grad_norm_var": 0.0009398778279622396, + "learning_rate": 2e-05, + "loss": 1.1366, + "loss/crossentropy": 2.5764553546905518, + "loss/dist_ce": 0.0, + "loss/fcd": 0.98828125, + "loss/idx": 13.0, + "loss/logits": 0.1483464539051056, + "step": 1400 + }, + { + "epoch": 0.020919814842466775, + "grad_norm": 0.478515625, + "grad_norm_var": 0.0008584181467692057, + "learning_rate": 2e-05, + "loss": 1.2287, + "loss/crossentropy": 2.4545843601226807, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0625, + "loss/idx": 13.0, + "loss/logits": 0.16616512835025787, + "step": 1401 + }, + { + "epoch": 0.02093474690159773, + "grad_norm": 0.58203125, + "grad_norm_var": 0.0011868635813395182, + "learning_rate": 2e-05, + "loss": 1.3896, + "loss/crossentropy": 2.566432476043701, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1796875, + "loss/idx": 13.0, + "loss/logits": 0.2099205106496811, + "step": 1402 + }, + { + "epoch": 0.020949678960728683, + "grad_norm": 0.5078125, + "grad_norm_var": 0.0010363260904947917, + "learning_rate": 2e-05, + "loss": 1.1944, + "loss/crossentropy": 2.694551706314087, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0234375, + "loss/idx": 13.0, + "loss/logits": 0.17097516357898712, + "step": 1403 + }, + { + "epoch": 0.02096461101985964, + "grad_norm": 0.52734375, + "grad_norm_var": 0.001043701171875, + "learning_rate": 2e-05, + "loss": 1.1465, + "loss/crossentropy": 2.600843906402588, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0, + "loss/idx": 13.0, + "loss/logits": 0.1465233862400055, + "step": 1404 + }, + { + "epoch": 0.02097954307899059, + "grad_norm": 0.59375, + "grad_norm_var": 0.0012959798177083334, + "learning_rate": 2e-05, + "loss": 1.28, + "loss/crossentropy": 2.5780487060546875, + "loss/dist_ce": 0.0, + "loss/fcd": 1.09375, + "loss/idx": 13.0, + "loss/logits": 0.18626543879508972, + "step": 1405 + }, + { + "epoch": 0.020994475138121547, + "grad_norm": 0.5, + "grad_norm_var": 0.001285235087076823, + "learning_rate": 2e-05, + "loss": 1.2806, + "loss/crossentropy": 2.555168867111206, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1015625, + "loss/idx": 13.0, + "loss/logits": 0.17903593182563782, + "step": 1406 + }, + { + "epoch": 0.0210094071972525, + "grad_norm": 0.5703125, + "grad_norm_var": 0.0014116764068603516, + "learning_rate": 2e-05, + "loss": 1.2721, + "loss/crossentropy": 2.6439151763916016, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1015625, + "loss/idx": 13.0, + "loss/logits": 0.17056259512901306, + "step": 1407 + }, + { + "epoch": 0.021024339256383456, + "grad_norm": 0.4921875, + "grad_norm_var": 0.0014397780100504558, + "learning_rate": 2e-05, + "loss": 1.192, + "loss/crossentropy": 2.6460981369018555, + "loss/dist_ce": 0.0, + "loss/fcd": 1.03125, + "loss/idx": 13.0, + "loss/logits": 0.16073650121688843, + "step": 1408 + }, + { + "epoch": 0.021039271315514408, + "grad_norm": 0.5390625, + "grad_norm_var": 0.0014557997385660807, + "learning_rate": 2e-05, + "loss": 1.1734, + "loss/crossentropy": 2.7225704193115234, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0234375, + "loss/idx": 13.0, + "loss/logits": 0.14998149871826172, + "step": 1409 + }, + { + "epoch": 0.021054203374645364, + "grad_norm": 0.486328125, + "grad_norm_var": 0.00141447385152181, + "learning_rate": 2e-05, + "loss": 1.1947, + "loss/crossentropy": 2.7134530544281006, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0234375, + "loss/idx": 13.0, + "loss/logits": 0.17125816643238068, + "step": 1410 + }, + { + "epoch": 0.021069135433776316, + "grad_norm": 0.65625, + "grad_norm_var": 0.0025040785471598308, + "learning_rate": 2e-05, + "loss": 1.4376, + "loss/crossentropy": 2.8304383754730225, + "loss/dist_ce": 0.0, + "loss/fcd": 1.234375, + "loss/idx": 13.0, + "loss/logits": 0.20326532423496246, + "step": 1411 + }, + { + "epoch": 0.021084067492907272, + "grad_norm": 0.796875, + "grad_norm_var": 0.00689098040262858, + "learning_rate": 2e-05, + "loss": 1.6109, + "loss/crossentropy": 2.3552732467651367, + "loss/dist_ce": 0.0, + "loss/fcd": 1.375, + "loss/idx": 13.0, + "loss/logits": 0.23586499691009521, + "step": 1412 + }, + { + "epoch": 0.021098999552038224, + "grad_norm": 0.486328125, + "grad_norm_var": 0.0069222609202067055, + "learning_rate": 2e-05, + "loss": 1.1958, + "loss/crossentropy": 2.665379524230957, + "loss/dist_ce": 0.0, + "loss/fcd": 1.03125, + "loss/idx": 13.0, + "loss/logits": 0.16456547379493713, + "step": 1413 + }, + { + "epoch": 0.02111393161116918, + "grad_norm": 0.490234375, + "grad_norm_var": 0.007126617431640625, + "learning_rate": 2e-05, + "loss": 1.2232, + "loss/crossentropy": 2.6303813457489014, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0546875, + "loss/idx": 13.0, + "loss/logits": 0.16855427622795105, + "step": 1414 + }, + { + "epoch": 0.021128863670300136, + "grad_norm": 0.546875, + "grad_norm_var": 0.007124773661295573, + "learning_rate": 2e-05, + "loss": 1.2708, + "loss/crossentropy": 2.770017385482788, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0859375, + "loss/idx": 13.0, + "loss/logits": 0.18481460213661194, + "step": 1415 + }, + { + "epoch": 0.02114379572943109, + "grad_norm": 0.482421875, + "grad_norm_var": 0.006997156143188477, + "learning_rate": 2e-05, + "loss": 1.2358, + "loss/crossentropy": 2.4545326232910156, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0625, + "loss/idx": 13.0, + "loss/logits": 0.17330729961395264, + "step": 1416 + }, + { + "epoch": 0.021158727788562044, + "grad_norm": 0.609375, + "grad_norm_var": 0.0068895975748697914, + "learning_rate": 2e-05, + "loss": 1.3533, + "loss/crossentropy": 2.5513885021209717, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1328125, + "loss/idx": 13.0, + "loss/logits": 0.2204749584197998, + "step": 1417 + }, + { + "epoch": 0.021173659847692997, + "grad_norm": 0.515625, + "grad_norm_var": 0.006918780008951823, + "learning_rate": 2e-05, + "loss": 1.2852, + "loss/crossentropy": 2.4511773586273193, + "loss/dist_ce": 0.0, + "loss/fcd": 1.09375, + "loss/idx": 13.0, + "loss/logits": 0.19149178266525269, + "step": 1418 + }, + { + "epoch": 0.021188591906823952, + "grad_norm": 0.5, + "grad_norm_var": 0.006966590881347656, + "learning_rate": 2e-05, + "loss": 1.2128, + "loss/crossentropy": 2.630352258682251, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0390625, + "loss/idx": 13.0, + "loss/logits": 0.1737767457962036, + "step": 1419 + }, + { + "epoch": 0.021203523965954905, + "grad_norm": 0.75, + "grad_norm_var": 0.009405517578125, + "learning_rate": 2e-05, + "loss": 1.4607, + "loss/crossentropy": 2.5818517208099365, + "loss/dist_ce": 0.0, + "loss/fcd": 1.2109375, + "loss/idx": 13.0, + "loss/logits": 0.24980100989341736, + "step": 1420 + }, + { + "epoch": 0.02121845602508586, + "grad_norm": 0.5078125, + "grad_norm_var": 0.009520212809244791, + "learning_rate": 2e-05, + "loss": 1.1809, + "loss/crossentropy": 2.4227993488311768, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0234375, + "loss/idx": 13.0, + "loss/logits": 0.15742167830467224, + "step": 1421 + }, + { + "epoch": 0.021233388084216813, + "grad_norm": 0.50390625, + "grad_norm_var": 0.009490903218587239, + "learning_rate": 2e-05, + "loss": 1.1791, + "loss/crossentropy": 2.596496820449829, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0234375, + "loss/idx": 13.0, + "loss/logits": 0.15562020242214203, + "step": 1422 + }, + { + "epoch": 0.02124832014334777, + "grad_norm": 0.5234375, + "grad_norm_var": 0.009553464253743489, + "learning_rate": 2e-05, + "loss": 1.2777, + "loss/crossentropy": 2.4700193405151367, + "loss/dist_ce": 0.0, + "loss/fcd": 1.09375, + "loss/idx": 13.0, + "loss/logits": 0.18397745490074158, + "step": 1423 + }, + { + "epoch": 0.02126325220247872, + "grad_norm": 0.515625, + "grad_norm_var": 0.00939019521077474, + "learning_rate": 2e-05, + "loss": 1.2129, + "loss/crossentropy": 2.675663471221924, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0546875, + "loss/idx": 13.0, + "loss/logits": 0.1582183986902237, + "step": 1424 + }, + { + "epoch": 0.021278184261609677, + "grad_norm": 0.640625, + "grad_norm_var": 0.009793535868326823, + "learning_rate": 2e-05, + "loss": 1.3463, + "loss/crossentropy": 2.5702874660491943, + "loss/dist_ce": 0.0, + "loss/fcd": 1.15625, + "loss/idx": 13.0, + "loss/logits": 0.19006071984767914, + "step": 1425 + }, + { + "epoch": 0.02129311632074063, + "grad_norm": 0.546875, + "grad_norm_var": 0.009401814142862955, + "learning_rate": 2e-05, + "loss": 1.4265, + "loss/crossentropy": 2.700167417526245, + "loss/dist_ce": 0.0, + "loss/fcd": 1.2109375, + "loss/idx": 13.0, + "loss/logits": 0.21554800868034363, + "step": 1426 + }, + { + "epoch": 0.021308048379871585, + "grad_norm": 0.48046875, + "grad_norm_var": 0.009241596857706705, + "learning_rate": 2e-05, + "loss": 1.1925, + "loss/crossentropy": 2.780790328979492, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0234375, + "loss/idx": 13.0, + "loss/logits": 0.1690763682126999, + "step": 1427 + }, + { + "epoch": 0.021322980439002538, + "grad_norm": 0.53125, + "grad_norm_var": 0.005121469497680664, + "learning_rate": 2e-05, + "loss": 1.2917, + "loss/crossentropy": 2.366610288619995, + "loss/dist_ce": 0.0, + "loss/fcd": 1.109375, + "loss/idx": 13.0, + "loss/logits": 0.18228915333747864, + "step": 1428 + }, + { + "epoch": 0.021337912498133493, + "grad_norm": 0.515625, + "grad_norm_var": 0.004967689514160156, + "learning_rate": 2e-05, + "loss": 1.3195, + "loss/crossentropy": 2.4140758514404297, + "loss/dist_ce": 0.0, + "loss/fcd": 1.125, + "loss/idx": 13.0, + "loss/logits": 0.19454874098300934, + "step": 1429 + }, + { + "epoch": 0.021352844557264446, + "grad_norm": 0.55078125, + "grad_norm_var": 0.004784886042277018, + "learning_rate": 2e-05, + "loss": 1.2121, + "loss/crossentropy": 2.3538424968719482, + "loss/dist_ce": 0.0, + "loss/fcd": 1.046875, + "loss/idx": 13.0, + "loss/logits": 0.1652664840221405, + "step": 1430 + }, + { + "epoch": 0.0213677766163954, + "grad_norm": 0.71875, + "grad_norm_var": 0.006673161188761393, + "learning_rate": 2e-05, + "loss": 1.3674, + "loss/crossentropy": 2.611670970916748, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1328125, + "loss/idx": 13.0, + "loss/logits": 0.23455733060836792, + "step": 1431 + }, + { + "epoch": 0.021382708675526354, + "grad_norm": 0.51171875, + "grad_norm_var": 0.006440226236979167, + "learning_rate": 2e-05, + "loss": 1.1873, + "loss/crossentropy": 2.4568278789520264, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0390625, + "loss/idx": 13.0, + "loss/logits": 0.14822477102279663, + "step": 1432 + }, + { + "epoch": 0.02139764073465731, + "grad_norm": 0.56640625, + "grad_norm_var": 0.0062590916951497395, + "learning_rate": 2e-05, + "loss": 1.26, + "loss/crossentropy": 2.767918348312378, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0859375, + "loss/idx": 13.0, + "loss/logits": 0.17406600713729858, + "step": 1433 + }, + { + "epoch": 0.021412572793788262, + "grad_norm": 0.50390625, + "grad_norm_var": 0.006329091389973959, + "learning_rate": 2e-05, + "loss": 1.2008, + "loss/crossentropy": 2.643681764602661, + "loss/dist_ce": 0.0, + "loss/fcd": 1.03125, + "loss/idx": 13.0, + "loss/logits": 0.16954705119132996, + "step": 1434 + }, + { + "epoch": 0.021427504852919218, + "grad_norm": 0.65234375, + "grad_norm_var": 0.0066787083943684895, + "learning_rate": 2e-05, + "loss": 1.4312, + "loss/crossentropy": 2.535278797149658, + "loss/dist_ce": 0.0, + "loss/fcd": 1.2109375, + "loss/idx": 13.0, + "loss/logits": 0.22029095888137817, + "step": 1435 + }, + { + "epoch": 0.02144243691205017, + "grad_norm": 0.5390625, + "grad_norm_var": 0.00422051747639974, + "learning_rate": 2e-05, + "loss": 1.3012, + "loss/crossentropy": 2.571066379547119, + "loss/dist_ce": 0.0, + "loss/fcd": 1.109375, + "loss/idx": 13.0, + "loss/logits": 0.19182069599628448, + "step": 1436 + }, + { + "epoch": 0.021457368971181126, + "grad_norm": 0.546875, + "grad_norm_var": 0.004093360900878906, + "learning_rate": 2e-05, + "loss": 1.2651, + "loss/crossentropy": 2.668915271759033, + "loss/dist_ce": 0.0, + "loss/fcd": 1.078125, + "loss/idx": 13.0, + "loss/logits": 0.18694379925727844, + "step": 1437 + }, + { + "epoch": 0.02147230103031208, + "grad_norm": 0.5390625, + "grad_norm_var": 0.003940582275390625, + "learning_rate": 2e-05, + "loss": 1.171, + "loss/crossentropy": 2.5324816703796387, + "loss/dist_ce": 0.0, + "loss/fcd": 1.015625, + "loss/idx": 13.0, + "loss/logits": 0.1553664356470108, + "step": 1438 + }, + { + "epoch": 0.021487233089443034, + "grad_norm": 0.50390625, + "grad_norm_var": 0.004047075907389323, + "learning_rate": 2e-05, + "loss": 1.245, + "loss/crossentropy": 2.754875421524048, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0703125, + "loss/idx": 13.0, + "loss/logits": 0.17472638189792633, + "step": 1439 + }, + { + "epoch": 0.02150216514857399, + "grad_norm": 0.55859375, + "grad_norm_var": 0.00394287109375, + "learning_rate": 2e-05, + "loss": 1.2567, + "loss/crossentropy": 2.51790189743042, + "loss/dist_ce": 0.0, + "loss/fcd": 1.078125, + "loss/idx": 13.0, + "loss/logits": 0.1785835325717926, + "step": 1440 + }, + { + "epoch": 0.021517097207704942, + "grad_norm": 0.5546875, + "grad_norm_var": 0.0034421284993489585, + "learning_rate": 2e-05, + "loss": 1.2437, + "loss/crossentropy": 2.6603500843048096, + "loss/dist_ce": 0.0, + "loss/fcd": 1.078125, + "loss/idx": 13.0, + "loss/logits": 0.1655387580394745, + "step": 1441 + }, + { + "epoch": 0.0215320292668359, + "grad_norm": 0.55078125, + "grad_norm_var": 0.0034407933553059896, + "learning_rate": 2e-05, + "loss": 1.2036, + "loss/crossentropy": 2.661207437515259, + "loss/dist_ce": 0.0, + "loss/fcd": 1.03125, + "loss/idx": 13.0, + "loss/logits": 0.17234490811824799, + "step": 1442 + }, + { + "epoch": 0.02154696132596685, + "grad_norm": 0.5234375, + "grad_norm_var": 0.0031491597493489582, + "learning_rate": 2e-05, + "loss": 1.0976, + "loss/crossentropy": 2.7283310890197754, + "loss/dist_ce": 0.0, + "loss/fcd": 0.95703125, + "loss/idx": 13.0, + "loss/logits": 0.14056336879730225, + "step": 1443 + }, + { + "epoch": 0.021561893385097806, + "grad_norm": 0.59765625, + "grad_norm_var": 0.003221575419108073, + "learning_rate": 2e-05, + "loss": 1.3786, + "loss/crossentropy": 2.423133134841919, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1796875, + "loss/idx": 13.0, + "loss/logits": 0.19894269108772278, + "step": 1444 + }, + { + "epoch": 0.02157682544422876, + "grad_norm": 0.5390625, + "grad_norm_var": 0.003122393290201823, + "learning_rate": 2e-05, + "loss": 1.1607, + "loss/crossentropy": 2.6964542865753174, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0078125, + "loss/idx": 13.0, + "loss/logits": 0.15290293097496033, + "step": 1445 + }, + { + "epoch": 0.021591757503359715, + "grad_norm": 0.58203125, + "grad_norm_var": 0.003145790100097656, + "learning_rate": 2e-05, + "loss": 1.2395, + "loss/crossentropy": 2.5600531101226807, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0703125, + "loss/idx": 13.0, + "loss/logits": 0.16921450197696686, + "step": 1446 + }, + { + "epoch": 0.021606689562490667, + "grad_norm": 0.46484375, + "grad_norm_var": 0.0018605550130208334, + "learning_rate": 2e-05, + "loss": 1.1883, + "loss/crossentropy": 2.557687282562256, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0234375, + "loss/idx": 13.0, + "loss/logits": 0.16487538814544678, + "step": 1447 + }, + { + "epoch": 0.021621621621621623, + "grad_norm": 0.6015625, + "grad_norm_var": 0.001955604553222656, + "learning_rate": 2e-05, + "loss": 1.319, + "loss/crossentropy": 2.5055274963378906, + "loss/dist_ce": 0.0, + "loss/fcd": 1.125, + "loss/idx": 13.0, + "loss/logits": 0.19401580095291138, + "step": 1448 + }, + { + "epoch": 0.021636553680752575, + "grad_norm": 0.515625, + "grad_norm_var": 0.0020159403483072918, + "learning_rate": 2e-05, + "loss": 1.1947, + "loss/crossentropy": 2.861999988555908, + "loss/dist_ce": 0.0, + "loss/fcd": 1.03125, + "loss/idx": 13.0, + "loss/logits": 0.16344812512397766, + "step": 1449 + }, + { + "epoch": 0.02165148573988353, + "grad_norm": 0.50390625, + "grad_norm_var": 0.0020159403483072918, + "learning_rate": 2e-05, + "loss": 1.2473, + "loss/crossentropy": 2.4316065311431885, + "loss/dist_ce": 0.0, + "loss/fcd": 1.078125, + "loss/idx": 13.0, + "loss/logits": 0.1691448986530304, + "step": 1450 + }, + { + "epoch": 0.021666417799014483, + "grad_norm": 0.70703125, + "grad_norm_var": 0.0029612223307291666, + "learning_rate": 2e-05, + "loss": 1.512, + "loss/crossentropy": 2.4315688610076904, + "loss/dist_ce": 0.0, + "loss/fcd": 1.2890625, + "loss/idx": 13.0, + "loss/logits": 0.22292405366897583, + "step": 1451 + }, + { + "epoch": 0.02168134985814544, + "grad_norm": 0.478515625, + "grad_norm_var": 0.00329283078511556, + "learning_rate": 2e-05, + "loss": 1.1174, + "loss/crossentropy": 2.7632553577423096, + "loss/dist_ce": 0.0, + "loss/fcd": 0.96875, + "loss/idx": 13.0, + "loss/logits": 0.14863690733909607, + "step": 1452 + }, + { + "epoch": 0.02169628191727639, + "grad_norm": 0.55078125, + "grad_norm_var": 0.0032932122548421225, + "learning_rate": 2e-05, + "loss": 1.3365, + "loss/crossentropy": 2.5227863788604736, + "loss/dist_ce": 0.0, + "loss/fcd": 1.140625, + "loss/idx": 13.0, + "loss/logits": 0.1958889216184616, + "step": 1453 + }, + { + "epoch": 0.021711213976407347, + "grad_norm": 0.58203125, + "grad_norm_var": 0.003356154759724935, + "learning_rate": 2e-05, + "loss": 1.2874, + "loss/crossentropy": 2.528759241104126, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0859375, + "loss/idx": 13.0, + "loss/logits": 0.201430082321167, + "step": 1454 + }, + { + "epoch": 0.0217261460355383, + "grad_norm": 0.455078125, + "grad_norm_var": 0.003811136881510417, + "learning_rate": 2e-05, + "loss": 1.1504, + "loss/crossentropy": 2.579636812210083, + "loss/dist_ce": 0.0, + "loss/fcd": 0.99609375, + "loss/idx": 13.0, + "loss/logits": 0.1543428748846054, + "step": 1455 + }, + { + "epoch": 0.021741078094669256, + "grad_norm": 0.5625, + "grad_norm_var": 0.0038176854451497395, + "learning_rate": 2e-05, + "loss": 1.293, + "loss/crossentropy": 2.594700574874878, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1015625, + "loss/idx": 13.0, + "loss/logits": 0.19148565828800201, + "step": 1456 + }, + { + "epoch": 0.021756010153800208, + "grad_norm": 0.609375, + "grad_norm_var": 0.00405267079671224, + "learning_rate": 2e-05, + "loss": 1.3389, + "loss/crossentropy": 2.5115139484405518, + "loss/dist_ce": 0.0, + "loss/fcd": 1.15625, + "loss/idx": 13.0, + "loss/logits": 0.1826532483100891, + "step": 1457 + }, + { + "epoch": 0.021770942212931164, + "grad_norm": 0.515625, + "grad_norm_var": 0.004133351643880208, + "learning_rate": 2e-05, + "loss": 1.1826, + "loss/crossentropy": 2.4750778675079346, + "loss/dist_ce": 0.0, + "loss/fcd": 1.03125, + "loss/idx": 13.0, + "loss/logits": 0.15130820870399475, + "step": 1458 + }, + { + "epoch": 0.021785874272062116, + "grad_norm": 0.8125, + "grad_norm_var": 0.008358256022135416, + "learning_rate": 2e-05, + "loss": 1.2464, + "loss/crossentropy": 2.7200441360473633, + "loss/dist_ce": 0.0, + "loss/fcd": 1.078125, + "loss/idx": 13.0, + "loss/logits": 0.16831210255622864, + "step": 1459 + }, + { + "epoch": 0.021800806331193072, + "grad_norm": 0.65234375, + "grad_norm_var": 0.008765920003255209, + "learning_rate": 2e-05, + "loss": 1.3127, + "loss/crossentropy": 2.4744434356689453, + "loss/dist_ce": 0.0, + "loss/fcd": 1.125, + "loss/idx": 13.0, + "loss/logits": 0.18768146634101868, + "step": 1460 + }, + { + "epoch": 0.021815738390324024, + "grad_norm": 0.55859375, + "grad_norm_var": 0.008707110087076824, + "learning_rate": 2e-05, + "loss": 1.286, + "loss/crossentropy": 2.4717938899993896, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1015625, + "loss/idx": 13.0, + "loss/logits": 0.18439996242523193, + "step": 1461 + }, + { + "epoch": 0.02183067044945498, + "grad_norm": 0.5, + "grad_norm_var": 0.009018198649088541, + "learning_rate": 2e-05, + "loss": 1.2192, + "loss/crossentropy": 2.5739426612854004, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0546875, + "loss/idx": 13.0, + "loss/logits": 0.16450907289981842, + "step": 1462 + }, + { + "epoch": 0.021845602508585932, + "grad_norm": 0.50390625, + "grad_norm_var": 0.008582051595052083, + "learning_rate": 2e-05, + "loss": 1.1912, + "loss/crossentropy": 2.4712491035461426, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0390625, + "loss/idx": 13.0, + "loss/logits": 0.15212152898311615, + "step": 1463 + }, + { + "epoch": 0.02186053456771689, + "grad_norm": 0.54296875, + "grad_norm_var": 0.008544858296712239, + "learning_rate": 2e-05, + "loss": 1.2548, + "loss/crossentropy": 2.2979588508605957, + "loss/dist_ce": 0.0, + "loss/fcd": 1.078125, + "loss/idx": 13.0, + "loss/logits": 0.1766788214445114, + "step": 1464 + }, + { + "epoch": 0.02187546662684784, + "grad_norm": 0.6640625, + "grad_norm_var": 0.008931414286295573, + "learning_rate": 2e-05, + "loss": 1.4653, + "loss/crossentropy": 2.055795669555664, + "loss/dist_ce": 0.0, + "loss/fcd": 1.28125, + "loss/idx": 13.0, + "loss/logits": 0.18409845232963562, + "step": 1465 + }, + { + "epoch": 0.021890398685978796, + "grad_norm": 0.5234375, + "grad_norm_var": 0.008770243326822916, + "learning_rate": 2e-05, + "loss": 1.1843, + "loss/crossentropy": 2.5227856636047363, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0234375, + "loss/idx": 13.0, + "loss/logits": 0.16089648008346558, + "step": 1466 + }, + { + "epoch": 0.021905330745109752, + "grad_norm": 0.484375, + "grad_norm_var": 0.007983843485514322, + "learning_rate": 2e-05, + "loss": 1.2299, + "loss/crossentropy": 2.611480474472046, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0546875, + "loss/idx": 13.0, + "loss/logits": 0.1751788854598999, + "step": 1467 + }, + { + "epoch": 0.021920262804240705, + "grad_norm": 0.46875, + "grad_norm_var": 0.008098840713500977, + "learning_rate": 2e-05, + "loss": 1.1278, + "loss/crossentropy": 2.6512179374694824, + "loss/dist_ce": 0.0, + "loss/fcd": 0.984375, + "loss/idx": 13.0, + "loss/logits": 0.14338138699531555, + "step": 1468 + }, + { + "epoch": 0.02193519486337166, + "grad_norm": 0.5, + "grad_norm_var": 0.008333571751912435, + "learning_rate": 2e-05, + "loss": 1.2038, + "loss/crossentropy": 2.5303971767425537, + "loss/dist_ce": 0.0, + "loss/fcd": 1.046875, + "loss/idx": 13.0, + "loss/logits": 0.15689189732074738, + "step": 1469 + }, + { + "epoch": 0.021950126922502613, + "grad_norm": 0.5546875, + "grad_norm_var": 0.008294407526652019, + "learning_rate": 2e-05, + "loss": 1.2558, + "loss/crossentropy": 2.435598850250244, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0859375, + "loss/idx": 13.0, + "loss/logits": 0.16985587775707245, + "step": 1470 + }, + { + "epoch": 0.02196505898163357, + "grad_norm": 0.51953125, + "grad_norm_var": 0.007680193583170573, + "learning_rate": 2e-05, + "loss": 1.264, + "loss/crossentropy": 2.4546844959259033, + "loss/dist_ce": 0.0, + "loss/fcd": 1.09375, + "loss/idx": 13.0, + "loss/logits": 0.1702587753534317, + "step": 1471 + }, + { + "epoch": 0.02197999104076452, + "grad_norm": 0.5078125, + "grad_norm_var": 0.007854652404785157, + "learning_rate": 2e-05, + "loss": 1.248, + "loss/crossentropy": 2.5671000480651855, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0703125, + "loss/idx": 13.0, + "loss/logits": 0.17768289148807526, + "step": 1472 + }, + { + "epoch": 0.021994923099895477, + "grad_norm": 0.48046875, + "grad_norm_var": 0.007999420166015625, + "learning_rate": 2e-05, + "loss": 1.1841, + "loss/crossentropy": 2.4480648040771484, + "loss/dist_ce": 0.0, + "loss/fcd": 1.03125, + "loss/idx": 13.0, + "loss/logits": 0.15280106663703918, + "step": 1473 + }, + { + "epoch": 0.02200985515902643, + "grad_norm": 0.51953125, + "grad_norm_var": 0.007982826232910157, + "learning_rate": 2e-05, + "loss": 1.2384, + "loss/crossentropy": 2.707908868789673, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0625, + "loss/idx": 13.0, + "loss/logits": 0.17594537138938904, + "step": 1474 + }, + { + "epoch": 0.022024787218157385, + "grad_norm": 0.498046875, + "grad_norm_var": 0.0031385898590087892, + "learning_rate": 2e-05, + "loss": 1.1188, + "loss/crossentropy": 2.5222115516662598, + "loss/dist_ce": 0.0, + "loss/fcd": 0.97265625, + "loss/idx": 13.0, + "loss/logits": 0.14612950384616852, + "step": 1475 + }, + { + "epoch": 0.022039719277288337, + "grad_norm": 0.5390625, + "grad_norm_var": 0.0020913283030192056, + "learning_rate": 2e-05, + "loss": 1.087, + "loss/crossentropy": 2.3494150638580322, + "loss/dist_ce": 0.0, + "loss/fcd": 0.95703125, + "loss/idx": 13.0, + "loss/logits": 0.13001090288162231, + "step": 1476 + }, + { + "epoch": 0.022054651336419293, + "grad_norm": 0.498046875, + "grad_norm_var": 0.002031707763671875, + "learning_rate": 2e-05, + "loss": 1.2642, + "loss/crossentropy": 2.571301221847534, + "loss/dist_ce": 0.0, + "loss/fcd": 1.078125, + "loss/idx": 13.0, + "loss/logits": 0.18606534600257874, + "step": 1477 + }, + { + "epoch": 0.022069583395550246, + "grad_norm": 0.50390625, + "grad_norm_var": 0.0020227432250976562, + "learning_rate": 2e-05, + "loss": 1.2481, + "loss/crossentropy": 2.5197813510894775, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0703125, + "loss/idx": 13.0, + "loss/logits": 0.17777016758918762, + "step": 1478 + }, + { + "epoch": 0.0220845154546812, + "grad_norm": 0.625, + "grad_norm_var": 0.002690887451171875, + "learning_rate": 2e-05, + "loss": 1.2635, + "loss/crossentropy": 2.594855785369873, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0703125, + "loss/idx": 13.0, + "loss/logits": 0.19315627217292786, + "step": 1479 + }, + { + "epoch": 0.022099447513812154, + "grad_norm": 0.59765625, + "grad_norm_var": 0.00299530029296875, + "learning_rate": 2e-05, + "loss": 1.3524, + "loss/crossentropy": 2.5978474617004395, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1328125, + "loss/idx": 13.0, + "loss/logits": 0.21956896781921387, + "step": 1480 + }, + { + "epoch": 0.02211437957294311, + "grad_norm": 0.58203125, + "grad_norm_var": 0.0019525527954101563, + "learning_rate": 2e-05, + "loss": 1.3905, + "loss/crossentropy": 2.5185165405273438, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1796875, + "loss/idx": 13.0, + "loss/logits": 0.21082568168640137, + "step": 1481 + }, + { + "epoch": 0.022129311632074062, + "grad_norm": 0.578125, + "grad_norm_var": 0.0021270116170247397, + "learning_rate": 2e-05, + "loss": 1.2688, + "loss/crossentropy": 2.6426360607147217, + "loss/dist_ce": 0.0, + "loss/fcd": 1.09375, + "loss/idx": 13.0, + "loss/logits": 0.17500849068164825, + "step": 1482 + }, + { + "epoch": 0.022144243691205018, + "grad_norm": 0.498046875, + "grad_norm_var": 0.0020581404368082683, + "learning_rate": 2e-05, + "loss": 1.2863, + "loss/crossentropy": 2.5090346336364746, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1015625, + "loss/idx": 13.0, + "loss/logits": 0.1846894472837448, + "step": 1483 + }, + { + "epoch": 0.02215917575033597, + "grad_norm": 0.578125, + "grad_norm_var": 0.00192106564839681, + "learning_rate": 2e-05, + "loss": 1.3025, + "loss/crossentropy": 2.463310718536377, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1171875, + "loss/idx": 13.0, + "loss/logits": 0.18533632159233093, + "step": 1484 + }, + { + "epoch": 0.022174107809466926, + "grad_norm": 0.546875, + "grad_norm_var": 0.0018318017323811849, + "learning_rate": 2e-05, + "loss": 1.2817, + "loss/crossentropy": 2.671934127807617, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1015625, + "loss/idx": 13.0, + "loss/logits": 0.180099219083786, + "step": 1485 + }, + { + "epoch": 0.02218903986859788, + "grad_norm": 0.4765625, + "grad_norm_var": 0.0020517826080322264, + "learning_rate": 2e-05, + "loss": 1.2166, + "loss/crossentropy": 2.4652843475341797, + "loss/dist_ce": 0.0, + "loss/fcd": 1.046875, + "loss/idx": 13.0, + "loss/logits": 0.1697027087211609, + "step": 1486 + }, + { + "epoch": 0.022203971927728834, + "grad_norm": 0.609375, + "grad_norm_var": 0.002379337946573893, + "learning_rate": 2e-05, + "loss": 1.3923, + "loss/crossentropy": 2.6956992149353027, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1796875, + "loss/idx": 13.0, + "loss/logits": 0.21260452270507812, + "step": 1487 + }, + { + "epoch": 0.022218903986859787, + "grad_norm": 0.47265625, + "grad_norm_var": 0.0026070753733317058, + "learning_rate": 2e-05, + "loss": 1.1581, + "loss/crossentropy": 2.682551145553589, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0078125, + "loss/idx": 13.0, + "loss/logits": 0.15028738975524902, + "step": 1488 + }, + { + "epoch": 0.022233836045990742, + "grad_norm": 0.6015625, + "grad_norm_var": 0.002599191665649414, + "learning_rate": 2e-05, + "loss": 1.3929, + "loss/crossentropy": 2.7902488708496094, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1796875, + "loss/idx": 13.0, + "loss/logits": 0.2132531702518463, + "step": 1489 + }, + { + "epoch": 0.022248768105121695, + "grad_norm": 0.5390625, + "grad_norm_var": 0.002555958429972331, + "learning_rate": 2e-05, + "loss": 1.2911, + "loss/crossentropy": 2.5189132690429688, + "loss/dist_ce": 0.0, + "loss/fcd": 1.109375, + "loss/idx": 13.0, + "loss/logits": 0.18173748254776, + "step": 1490 + }, + { + "epoch": 0.02226370016425265, + "grad_norm": 0.53515625, + "grad_norm_var": 0.0024022420247395834, + "learning_rate": 2e-05, + "loss": 1.1905, + "loss/crossentropy": 2.603708028793335, + "loss/dist_ce": 0.0, + "loss/fcd": 1.03125, + "loss/idx": 13.0, + "loss/logits": 0.1592203825712204, + "step": 1491 + }, + { + "epoch": 0.022278632223383603, + "grad_norm": 0.458984375, + "grad_norm_var": 0.0029072920481363934, + "learning_rate": 2e-05, + "loss": 1.1689, + "loss/crossentropy": 2.576422691345215, + "loss/dist_ce": 0.0, + "loss/fcd": 1.015625, + "loss/idx": 13.0, + "loss/logits": 0.15326352417469025, + "step": 1492 + }, + { + "epoch": 0.02229356428251456, + "grad_norm": 0.53125, + "grad_norm_var": 0.002773539225260417, + "learning_rate": 2e-05, + "loss": 1.1926, + "loss/crossentropy": 2.671536684036255, + "loss/dist_ce": 0.0, + "loss/fcd": 1.03125, + "loss/idx": 13.0, + "loss/logits": 0.1613408625125885, + "step": 1493 + }, + { + "epoch": 0.022308496341645515, + "grad_norm": 0.671875, + "grad_norm_var": 0.0035964330037434895, + "learning_rate": 2e-05, + "loss": 1.3892, + "loss/crossentropy": 2.2959797382354736, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1953125, + "loss/idx": 13.0, + "loss/logits": 0.19386935234069824, + "step": 1494 + }, + { + "epoch": 0.022323428400776467, + "grad_norm": 0.515625, + "grad_norm_var": 0.003343645731608073, + "learning_rate": 2e-05, + "loss": 1.2218, + "loss/crossentropy": 2.720688819885254, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0546875, + "loss/idx": 13.0, + "loss/logits": 0.16712167859077454, + "step": 1495 + }, + { + "epoch": 0.022338360459907423, + "grad_norm": 0.58203125, + "grad_norm_var": 0.0032587051391601562, + "learning_rate": 2e-05, + "loss": 1.3552, + "loss/crossentropy": 2.4977471828460693, + "loss/dist_ce": 0.0, + "loss/fcd": 1.171875, + "loss/idx": 13.0, + "loss/logits": 0.18331705033779144, + "step": 1496 + }, + { + "epoch": 0.022353292519038375, + "grad_norm": 0.56640625, + "grad_norm_var": 0.0032042821248372396, + "learning_rate": 2e-05, + "loss": 1.3051, + "loss/crossentropy": 2.5807783603668213, + "loss/dist_ce": 0.0, + "loss/fcd": 1.125, + "loss/idx": 13.0, + "loss/logits": 0.18012891709804535, + "step": 1497 + }, + { + "epoch": 0.02236822457816933, + "grad_norm": 0.474609375, + "grad_norm_var": 0.0034527937571207684, + "learning_rate": 2e-05, + "loss": 1.1901, + "loss/crossentropy": 2.650909900665283, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0234375, + "loss/idx": 13.0, + "loss/logits": 0.16662496328353882, + "step": 1498 + }, + { + "epoch": 0.022383156637300283, + "grad_norm": 0.50390625, + "grad_norm_var": 0.0034212748209635417, + "learning_rate": 2e-05, + "loss": 1.2063, + "loss/crossentropy": 2.6215226650238037, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0390625, + "loss/idx": 13.0, + "loss/logits": 0.16723014414310455, + "step": 1499 + }, + { + "epoch": 0.02239808869643124, + "grad_norm": 0.490234375, + "grad_norm_var": 0.0034749190012613933, + "learning_rate": 2e-05, + "loss": 1.1455, + "loss/crossentropy": 2.5625998973846436, + "loss/dist_ce": 0.0, + "loss/fcd": 0.99609375, + "loss/idx": 13.0, + "loss/logits": 0.14940068125724792, + "step": 1500 + }, + { + "epoch": 0.02241302075556219, + "grad_norm": 0.5390625, + "grad_norm_var": 0.003467416763305664, + "learning_rate": 2e-05, + "loss": 1.2824, + "loss/crossentropy": 2.5618104934692383, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1015625, + "loss/idx": 13.0, + "loss/logits": 0.18080656230449677, + "step": 1501 + }, + { + "epoch": 0.022427952814693147, + "grad_norm": 0.56640625, + "grad_norm_var": 0.0032656192779541016, + "learning_rate": 2e-05, + "loss": 1.3115, + "loss/crossentropy": 2.55956768989563, + "loss/dist_ce": 0.0, + "loss/fcd": 1.109375, + "loss/idx": 13.0, + "loss/logits": 0.20210814476013184, + "step": 1502 + }, + { + "epoch": 0.0224428848738241, + "grad_norm": 0.484375, + "grad_norm_var": 0.0031048933664957683, + "learning_rate": 2e-05, + "loss": 1.1382, + "loss/crossentropy": 2.7115917205810547, + "loss/dist_ce": 0.0, + "loss/fcd": 0.9921875, + "loss/idx": 13.0, + "loss/logits": 0.14602671563625336, + "step": 1503 + }, + { + "epoch": 0.022457816932955055, + "grad_norm": 0.6875, + "grad_norm_var": 0.004251845677693685, + "learning_rate": 2e-05, + "loss": 1.5219, + "loss/crossentropy": 2.5612852573394775, + "loss/dist_ce": 0.0, + "loss/fcd": 1.2890625, + "loss/idx": 13.0, + "loss/logits": 0.23288561403751373, + "step": 1504 + }, + { + "epoch": 0.022472748992086008, + "grad_norm": 0.486328125, + "grad_norm_var": 0.004239654541015625, + "learning_rate": 2e-05, + "loss": 1.2115, + "loss/crossentropy": 2.5381405353546143, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0390625, + "loss/idx": 13.0, + "loss/logits": 0.17247450351715088, + "step": 1505 + }, + { + "epoch": 0.022487681051216964, + "grad_norm": 0.8125, + "grad_norm_var": 0.008894856770833333, + "learning_rate": 2e-05, + "loss": 1.3213, + "loss/crossentropy": 2.6162917613983154, + "loss/dist_ce": 0.0, + "loss/fcd": 1.125, + "loss/idx": 13.0, + "loss/logits": 0.19628921151161194, + "step": 1506 + }, + { + "epoch": 0.022502613110347916, + "grad_norm": 0.48828125, + "grad_norm_var": 0.009166463216145834, + "learning_rate": 2e-05, + "loss": 1.1868, + "loss/crossentropy": 2.6764116287231445, + "loss/dist_ce": 0.0, + "loss/fcd": 1.03125, + "loss/idx": 13.0, + "loss/logits": 0.15558165311813354, + "step": 1507 + }, + { + "epoch": 0.022517545169478872, + "grad_norm": 0.6484375, + "grad_norm_var": 0.009016911188761393, + "learning_rate": 2e-05, + "loss": 1.2914, + "loss/crossentropy": 2.6869983673095703, + "loss/dist_ce": 0.0, + "loss/fcd": 1.09375, + "loss/idx": 13.0, + "loss/logits": 0.19766151905059814, + "step": 1508 + }, + { + "epoch": 0.022532477228609824, + "grad_norm": 0.52734375, + "grad_norm_var": 0.009035730361938476, + "learning_rate": 2e-05, + "loss": 1.19, + "loss/crossentropy": 2.548644781112671, + "loss/dist_ce": 0.0, + "loss/fcd": 1.03125, + "loss/idx": 13.0, + "loss/logits": 0.15873447060585022, + "step": 1509 + }, + { + "epoch": 0.02254740928774078, + "grad_norm": 0.482421875, + "grad_norm_var": 0.008587074279785157, + "learning_rate": 2e-05, + "loss": 1.1595, + "loss/crossentropy": 2.6315722465515137, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0, + "loss/idx": 13.0, + "loss/logits": 0.15952712297439575, + "step": 1510 + }, + { + "epoch": 0.022562341346871732, + "grad_norm": 0.85546875, + "grad_norm_var": 0.014090728759765626, + "learning_rate": 2e-05, + "loss": 1.2821, + "loss/crossentropy": 2.651576519012451, + "loss/dist_ce": 0.0, + "loss/fcd": 1.078125, + "loss/idx": 13.0, + "loss/logits": 0.2039494812488556, + "step": 1511 + }, + { + "epoch": 0.022577273406002688, + "grad_norm": 0.53125, + "grad_norm_var": 0.014202308654785157, + "learning_rate": 2e-05, + "loss": 1.2496, + "loss/crossentropy": 2.5648045539855957, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0859375, + "loss/idx": 13.0, + "loss/logits": 0.16364048421382904, + "step": 1512 + }, + { + "epoch": 0.02259220546513364, + "grad_norm": 0.63671875, + "grad_norm_var": 0.014463233947753906, + "learning_rate": 2e-05, + "loss": 1.1766, + "loss/crossentropy": 2.465576410293579, + "loss/dist_ce": 0.0, + "loss/fcd": 1.015625, + "loss/idx": 13.0, + "loss/logits": 0.1609870195388794, + "step": 1513 + }, + { + "epoch": 0.022607137524264596, + "grad_norm": 0.5390625, + "grad_norm_var": 0.013852167129516601, + "learning_rate": 2e-05, + "loss": 1.2467, + "loss/crossentropy": 2.382476568222046, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0859375, + "loss/idx": 13.0, + "loss/logits": 0.1607544869184494, + "step": 1514 + }, + { + "epoch": 0.02262206958339555, + "grad_norm": 0.5703125, + "grad_norm_var": 0.013454421361287435, + "learning_rate": 2e-05, + "loss": 1.3323, + "loss/crossentropy": 2.588047742843628, + "loss/dist_ce": 0.0, + "loss/fcd": 1.140625, + "loss/idx": 13.0, + "loss/logits": 0.19164502620697021, + "step": 1515 + }, + { + "epoch": 0.022637001642526505, + "grad_norm": 0.546875, + "grad_norm_var": 0.012946001688639323, + "learning_rate": 2e-05, + "loss": 1.2083, + "loss/crossentropy": 2.6603829860687256, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0390625, + "loss/idx": 13.0, + "loss/logits": 0.16927656531333923, + "step": 1516 + }, + { + "epoch": 0.022651933701657457, + "grad_norm": 0.625, + "grad_norm_var": 0.01285088857014974, + "learning_rate": 2e-05, + "loss": 1.2823, + "loss/crossentropy": 2.647714614868164, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1015625, + "loss/idx": 13.0, + "loss/logits": 0.18076732754707336, + "step": 1517 + }, + { + "epoch": 0.022666865760788413, + "grad_norm": 0.48828125, + "grad_norm_var": 0.013509559631347656, + "learning_rate": 2e-05, + "loss": 1.1729, + "loss/crossentropy": 2.624861717224121, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0078125, + "loss/idx": 13.0, + "loss/logits": 0.1651315689086914, + "step": 1518 + }, + { + "epoch": 0.02268179781991937, + "grad_norm": 0.515625, + "grad_norm_var": 0.013138262430826823, + "learning_rate": 2e-05, + "loss": 1.2071, + "loss/crossentropy": 2.556081771850586, + "loss/dist_ce": 0.0, + "loss/fcd": 1.046875, + "loss/idx": 13.0, + "loss/logits": 0.16020020842552185, + "step": 1519 + }, + { + "epoch": 0.02269672987905032, + "grad_norm": 0.51953125, + "grad_norm_var": 0.012719980875651042, + "learning_rate": 2e-05, + "loss": 1.2024, + "loss/crossentropy": 2.708319902420044, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0390625, + "loss/idx": 13.0, + "loss/logits": 0.16335517168045044, + "step": 1520 + }, + { + "epoch": 0.022711661938181277, + "grad_norm": 0.53515625, + "grad_norm_var": 0.01226181983947754, + "learning_rate": 2e-05, + "loss": 1.3096, + "loss/crossentropy": 2.614462375640869, + "loss/dist_ce": 0.0, + "loss/fcd": 1.125, + "loss/idx": 13.0, + "loss/logits": 0.18464729189872742, + "step": 1521 + }, + { + "epoch": 0.02272659399731223, + "grad_norm": 0.60546875, + "grad_norm_var": 0.008595641454060872, + "learning_rate": 2e-05, + "loss": 1.2894, + "loss/crossentropy": 2.635050058364868, + "loss/dist_ce": 0.0, + "loss/fcd": 1.109375, + "loss/idx": 13.0, + "loss/logits": 0.1800360381603241, + "step": 1522 + }, + { + "epoch": 0.022741526056443185, + "grad_norm": 0.470703125, + "grad_norm_var": 0.008805783589680989, + "learning_rate": 2e-05, + "loss": 1.1752, + "loss/crossentropy": 2.6181414127349854, + "loss/dist_ce": 0.0, + "loss/fcd": 1.015625, + "loss/idx": 13.0, + "loss/logits": 0.15956082940101624, + "step": 1523 + }, + { + "epoch": 0.022756458115574137, + "grad_norm": 0.5390625, + "grad_norm_var": 0.00838921864827474, + "learning_rate": 2e-05, + "loss": 1.2508, + "loss/crossentropy": 2.428799629211426, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0859375, + "loss/idx": 13.0, + "loss/logits": 0.1649072766304016, + "step": 1524 + }, + { + "epoch": 0.022771390174705093, + "grad_norm": 0.64453125, + "grad_norm_var": 0.00870965321858724, + "learning_rate": 2e-05, + "loss": 1.438, + "loss/crossentropy": 2.635791778564453, + "loss/dist_ce": 0.0, + "loss/fcd": 1.21875, + "loss/idx": 13.0, + "loss/logits": 0.21926459670066833, + "step": 1525 + }, + { + "epoch": 0.022786322233836045, + "grad_norm": 0.50390625, + "grad_norm_var": 0.008490228652954101, + "learning_rate": 2e-05, + "loss": 1.2384, + "loss/crossentropy": 2.5704894065856934, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0703125, + "loss/idx": 13.0, + "loss/logits": 0.16805407404899597, + "step": 1526 + }, + { + "epoch": 0.022801254292967, + "grad_norm": 0.52734375, + "grad_norm_var": 0.0027491092681884766, + "learning_rate": 2e-05, + "loss": 1.2464, + "loss/crossentropy": 2.7401912212371826, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0703125, + "loss/idx": 13.0, + "loss/logits": 0.17604930698871613, + "step": 1527 + }, + { + "epoch": 0.022816186352097954, + "grad_norm": 0.53125, + "grad_norm_var": 0.0027491092681884766, + "learning_rate": 2e-05, + "loss": 1.3106, + "loss/crossentropy": 2.4598357677459717, + "loss/dist_ce": 0.0, + "loss/fcd": 1.125, + "loss/idx": 13.0, + "loss/logits": 0.18563039600849152, + "step": 1528 + }, + { + "epoch": 0.02283111841122891, + "grad_norm": 0.578125, + "grad_norm_var": 0.002285623550415039, + "learning_rate": 2e-05, + "loss": 1.3289, + "loss/crossentropy": 2.4256432056427, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1484375, + "loss/idx": 13.0, + "loss/logits": 0.18043887615203857, + "step": 1529 + }, + { + "epoch": 0.022846050470359862, + "grad_norm": 0.51953125, + "grad_norm_var": 0.002328221003214518, + "learning_rate": 2e-05, + "loss": 1.2916, + "loss/crossentropy": 2.513697624206543, + "loss/dist_ce": 0.0, + "loss/fcd": 1.109375, + "loss/idx": 13.0, + "loss/logits": 0.1822143942117691, + "step": 1530 + }, + { + "epoch": 0.022860982529490818, + "grad_norm": 0.55859375, + "grad_norm_var": 0.0022973219553629556, + "learning_rate": 2e-05, + "loss": 1.2614, + "loss/crossentropy": 2.432072639465332, + "loss/dist_ce": 0.0, + "loss/fcd": 1.09375, + "loss/idx": 13.0, + "loss/logits": 0.1676977425813675, + "step": 1531 + }, + { + "epoch": 0.02287591458862177, + "grad_norm": 0.498046875, + "grad_norm_var": 0.0024296442667643228, + "learning_rate": 2e-05, + "loss": 1.1784, + "loss/crossentropy": 2.608954668045044, + "loss/dist_ce": 0.0, + "loss/fcd": 1.015625, + "loss/idx": 13.0, + "loss/logits": 0.16277967393398285, + "step": 1532 + }, + { + "epoch": 0.022890846647752726, + "grad_norm": 0.5625, + "grad_norm_var": 0.0019759496053059896, + "learning_rate": 2e-05, + "loss": 1.3215, + "loss/crossentropy": 2.744267225265503, + "loss/dist_ce": 0.0, + "loss/fcd": 1.125, + "loss/idx": 13.0, + "loss/logits": 0.19652540981769562, + "step": 1533 + }, + { + "epoch": 0.022905778706883678, + "grad_norm": 0.625, + "grad_norm_var": 0.0022496541341145834, + "learning_rate": 2e-05, + "loss": 1.3236, + "loss/crossentropy": 2.448258638381958, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1484375, + "loss/idx": 13.0, + "loss/logits": 0.17515279352664948, + "step": 1534 + }, + { + "epoch": 0.022920710766014634, + "grad_norm": 0.474609375, + "grad_norm_var": 0.0025203545888264974, + "learning_rate": 2e-05, + "loss": 1.1635, + "loss/crossentropy": 2.6411309242248535, + "loss/dist_ce": 0.0, + "loss/fcd": 1.015625, + "loss/idx": 13.0, + "loss/logits": 0.14791785180568695, + "step": 1535 + }, + { + "epoch": 0.022935642825145586, + "grad_norm": 0.52734375, + "grad_norm_var": 0.00249937375386556, + "learning_rate": 2e-05, + "loss": 1.2836, + "loss/crossentropy": 2.6703877449035645, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1015625, + "loss/idx": 13.0, + "loss/logits": 0.18201038241386414, + "step": 1536 + }, + { + "epoch": 0.022950574884276542, + "grad_norm": 0.59375, + "grad_norm_var": 0.0026462395985921224, + "learning_rate": 2e-05, + "loss": 1.3684, + "loss/crossentropy": 2.5531914234161377, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1640625, + "loss/idx": 13.0, + "loss/logits": 0.2043484002351761, + "step": 1537 + }, + { + "epoch": 0.022965506943407495, + "grad_norm": 0.54296875, + "grad_norm_var": 0.002407185236612956, + "learning_rate": 2e-05, + "loss": 1.2455, + "loss/crossentropy": 2.6327946186065674, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0703125, + "loss/idx": 13.0, + "loss/logits": 0.17522411048412323, + "step": 1538 + }, + { + "epoch": 0.02298043900253845, + "grad_norm": 0.5390625, + "grad_norm_var": 0.0020350138346354168, + "learning_rate": 2e-05, + "loss": 1.2707, + "loss/crossentropy": 2.820997476577759, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0625, + "loss/idx": 13.0, + "loss/logits": 0.20818859338760376, + "step": 1539 + }, + { + "epoch": 0.022995371061669403, + "grad_norm": 0.451171875, + "grad_norm_var": 0.002620808283487956, + "learning_rate": 2e-05, + "loss": 1.1121, + "loss/crossentropy": 2.2270283699035645, + "loss/dist_ce": 0.0, + "loss/fcd": 0.97265625, + "loss/idx": 13.0, + "loss/logits": 0.13947048783302307, + "step": 1540 + }, + { + "epoch": 0.02301030312080036, + "grad_norm": 0.51953125, + "grad_norm_var": 0.0018944899241129557, + "learning_rate": 2e-05, + "loss": 1.2607, + "loss/crossentropy": 2.485645294189453, + "loss/dist_ce": 0.0, + "loss/fcd": 1.078125, + "loss/idx": 13.0, + "loss/logits": 0.1826171576976776, + "step": 1541 + }, + { + "epoch": 0.02302523517993131, + "grad_norm": 0.515625, + "grad_norm_var": 0.0018551985422770182, + "learning_rate": 2e-05, + "loss": 1.2134, + "loss/crossentropy": 2.5995469093322754, + "loss/dist_ce": 0.0, + "loss/fcd": 1.046875, + "loss/idx": 13.0, + "loss/logits": 0.16654683649539948, + "step": 1542 + }, + { + "epoch": 0.023040167239062267, + "grad_norm": 0.640625, + "grad_norm_var": 0.002537393569946289, + "learning_rate": 2e-05, + "loss": 1.3908, + "loss/crossentropy": 2.5561470985412598, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1796875, + "loss/idx": 13.0, + "loss/logits": 0.2111516296863556, + "step": 1543 + }, + { + "epoch": 0.02305509929819322, + "grad_norm": 0.57421875, + "grad_norm_var": 0.002589146296183268, + "learning_rate": 2e-05, + "loss": 1.3671, + "loss/crossentropy": 2.6751511096954346, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1796875, + "loss/idx": 13.0, + "loss/logits": 0.18742230534553528, + "step": 1544 + }, + { + "epoch": 0.023070031357324175, + "grad_norm": 0.470703125, + "grad_norm_var": 0.0028365453084309897, + "learning_rate": 2e-05, + "loss": 1.1518, + "loss/crossentropy": 2.600984811782837, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0, + "loss/idx": 13.0, + "loss/logits": 0.15175320208072662, + "step": 1545 + }, + { + "epoch": 0.02308496341645513, + "grad_norm": 0.69921875, + "grad_norm_var": 0.004404131571451823, + "learning_rate": 2e-05, + "loss": 1.3883, + "loss/crossentropy": 2.5492827892303467, + "loss/dist_ce": 0.0, + "loss/fcd": 1.140625, + "loss/idx": 13.0, + "loss/logits": 0.24766142666339874, + "step": 1546 + }, + { + "epoch": 0.023099895475586083, + "grad_norm": 0.494140625, + "grad_norm_var": 0.004586140314737956, + "learning_rate": 2e-05, + "loss": 1.2833, + "loss/crossentropy": 2.368687629699707, + "loss/dist_ce": 0.0, + "loss/fcd": 1.09375, + "loss/idx": 13.0, + "loss/logits": 0.189529687166214, + "step": 1547 + }, + { + "epoch": 0.02311482753471704, + "grad_norm": 0.55859375, + "grad_norm_var": 0.004431915283203125, + "learning_rate": 2e-05, + "loss": 1.313, + "loss/crossentropy": 2.526824951171875, + "loss/dist_ce": 0.0, + "loss/fcd": 1.125, + "loss/idx": 13.0, + "loss/logits": 0.18795964121818542, + "step": 1548 + }, + { + "epoch": 0.02312975959384799, + "grad_norm": 0.5, + "grad_norm_var": 0.004566192626953125, + "learning_rate": 2e-05, + "loss": 1.2229, + "loss/crossentropy": 2.5661303997039795, + "loss/dist_ce": 0.0, + "loss/fcd": 1.046875, + "loss/idx": 13.0, + "loss/logits": 0.17601008713245392, + "step": 1549 + }, + { + "epoch": 0.023144691652978947, + "grad_norm": 0.578125, + "grad_norm_var": 0.004206085205078125, + "learning_rate": 2e-05, + "loss": 1.3267, + "loss/crossentropy": 2.568767786026001, + "loss/dist_ce": 0.0, + "loss/fcd": 1.125, + "loss/idx": 13.0, + "loss/logits": 0.20167773962020874, + "step": 1550 + }, + { + "epoch": 0.0231596237121099, + "grad_norm": 0.474609375, + "grad_norm_var": 0.004206085205078125, + "learning_rate": 2e-05, + "loss": 1.168, + "loss/crossentropy": 2.607841968536377, + "loss/dist_ce": 0.0, + "loss/fcd": 1.015625, + "loss/idx": 13.0, + "loss/logits": 0.15234646201133728, + "step": 1551 + }, + { + "epoch": 0.023174555771240855, + "grad_norm": 0.63671875, + "grad_norm_var": 0.004733022054036458, + "learning_rate": 2e-05, + "loss": 1.5019, + "loss/crossentropy": 2.2986114025115967, + "loss/dist_ce": 0.0, + "loss/fcd": 1.2890625, + "loss/idx": 13.0, + "loss/logits": 0.21287450194358826, + "step": 1552 + }, + { + "epoch": 0.023189487830371808, + "grad_norm": 0.609375, + "grad_norm_var": 0.004840850830078125, + "learning_rate": 2e-05, + "loss": 1.2576, + "loss/crossentropy": 2.6668930053710938, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0859375, + "loss/idx": 13.0, + "loss/logits": 0.17167997360229492, + "step": 1553 + }, + { + "epoch": 0.023204419889502764, + "grad_norm": 0.578125, + "grad_norm_var": 0.004883766174316406, + "learning_rate": 2e-05, + "loss": 1.2429, + "loss/crossentropy": 2.417006254196167, + "loss/dist_ce": 0.0, + "loss/fcd": 1.078125, + "loss/idx": 13.0, + "loss/logits": 0.16477948427200317, + "step": 1554 + }, + { + "epoch": 0.023219351948633716, + "grad_norm": 0.5859375, + "grad_norm_var": 0.004937171936035156, + "learning_rate": 2e-05, + "loss": 1.3243, + "loss/crossentropy": 2.640455722808838, + "loss/dist_ce": 0.0, + "loss/fcd": 1.125, + "loss/idx": 13.0, + "loss/logits": 0.19927874207496643, + "step": 1555 + }, + { + "epoch": 0.02323428400776467, + "grad_norm": 0.5, + "grad_norm_var": 0.004407485326131185, + "learning_rate": 2e-05, + "loss": 1.2037, + "loss/crossentropy": 2.4920125007629395, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0390625, + "loss/idx": 13.0, + "loss/logits": 0.1646072268486023, + "step": 1556 + }, + { + "epoch": 0.023249216066895624, + "grad_norm": 0.4921875, + "grad_norm_var": 0.004596185684204101, + "learning_rate": 2e-05, + "loss": 1.1922, + "loss/crossentropy": 2.554762601852417, + "loss/dist_ce": 0.0, + "loss/fcd": 1.03125, + "loss/idx": 13.0, + "loss/logits": 0.16096070408821106, + "step": 1557 + }, + { + "epoch": 0.02326414812602658, + "grad_norm": 0.5625, + "grad_norm_var": 0.004476404190063477, + "learning_rate": 2e-05, + "loss": 1.2629, + "loss/crossentropy": 2.679831027984619, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0859375, + "loss/idx": 13.0, + "loss/logits": 0.17697837948799133, + "step": 1558 + }, + { + "epoch": 0.023279080185157532, + "grad_norm": 0.57421875, + "grad_norm_var": 0.004035425186157226, + "learning_rate": 2e-05, + "loss": 1.3702, + "loss/crossentropy": 2.5628817081451416, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1796875, + "loss/idx": 13.0, + "loss/logits": 0.190528005361557, + "step": 1559 + }, + { + "epoch": 0.023294012244288488, + "grad_norm": 0.59375, + "grad_norm_var": 0.004107904434204101, + "learning_rate": 2e-05, + "loss": 1.3442, + "loss/crossentropy": 2.408496141433716, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1484375, + "loss/idx": 13.0, + "loss/logits": 0.19575239717960358, + "step": 1560 + }, + { + "epoch": 0.02330894430341944, + "grad_norm": 0.515625, + "grad_norm_var": 0.00371856689453125, + "learning_rate": 2e-05, + "loss": 1.3114, + "loss/crossentropy": 2.3996641635894775, + "loss/dist_ce": 0.0, + "loss/fcd": 1.125, + "loss/idx": 13.0, + "loss/logits": 0.1863994300365448, + "step": 1561 + }, + { + "epoch": 0.023323876362550396, + "grad_norm": 0.5078125, + "grad_norm_var": 0.0024443944295247395, + "learning_rate": 2e-05, + "loss": 1.1977, + "loss/crossentropy": 2.4856419563293457, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0390625, + "loss/idx": 13.0, + "loss/logits": 0.15866494178771973, + "step": 1562 + }, + { + "epoch": 0.02333880842168135, + "grad_norm": 0.546875, + "grad_norm_var": 0.002242263158162435, + "learning_rate": 2e-05, + "loss": 1.3169, + "loss/crossentropy": 2.454580545425415, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1328125, + "loss/idx": 13.0, + "loss/logits": 0.18412557244300842, + "step": 1563 + }, + { + "epoch": 0.023353740480812304, + "grad_norm": 0.58203125, + "grad_norm_var": 0.0023006280263264973, + "learning_rate": 2e-05, + "loss": 1.2856, + "loss/crossentropy": 2.640057325363159, + "loss/dist_ce": 0.0, + "loss/fcd": 1.09375, + "loss/idx": 13.0, + "loss/logits": 0.19184106588363647, + "step": 1564 + }, + { + "epoch": 0.023368672539943257, + "grad_norm": 0.490234375, + "grad_norm_var": 0.0023747762044270832, + "learning_rate": 2e-05, + "loss": 1.1696, + "loss/crossentropy": 2.4749562740325928, + "loss/dist_ce": 0.0, + "loss/fcd": 1.015625, + "loss/idx": 13.0, + "loss/logits": 0.15401500463485718, + "step": 1565 + }, + { + "epoch": 0.023383604599074213, + "grad_norm": 0.5, + "grad_norm_var": 0.0024815877278645832, + "learning_rate": 2e-05, + "loss": 1.1622, + "loss/crossentropy": 2.4315335750579834, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0078125, + "loss/idx": 13.0, + "loss/logits": 0.15435267984867096, + "step": 1566 + }, + { + "epoch": 0.023398536658205165, + "grad_norm": 0.515625, + "grad_norm_var": 0.002191527684529622, + "learning_rate": 2e-05, + "loss": 1.2196, + "loss/crossentropy": 2.7022817134857178, + "loss/dist_ce": 0.0, + "loss/fcd": 1.046875, + "loss/idx": 13.0, + "loss/logits": 0.17277126014232635, + "step": 1567 + }, + { + "epoch": 0.02341346871733612, + "grad_norm": 0.51953125, + "grad_norm_var": 0.00168608029683431, + "learning_rate": 2e-05, + "loss": 1.218, + "loss/crossentropy": 2.5333712100982666, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0546875, + "loss/idx": 13.0, + "loss/logits": 0.1633320152759552, + "step": 1568 + }, + { + "epoch": 0.023428400776467073, + "grad_norm": 0.5703125, + "grad_norm_var": 0.0014311313629150391, + "learning_rate": 2e-05, + "loss": 1.2387, + "loss/crossentropy": 2.753187417984009, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0703125, + "loss/idx": 13.0, + "loss/logits": 0.16835424304008484, + "step": 1569 + }, + { + "epoch": 0.02344333283559803, + "grad_norm": 0.515625, + "grad_norm_var": 0.0013548374176025391, + "learning_rate": 2e-05, + "loss": 1.1516, + "loss/crossentropy": 2.419675588607788, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0078125, + "loss/idx": 13.0, + "loss/logits": 0.14382390677928925, + "step": 1570 + }, + { + "epoch": 0.023458264894728985, + "grad_norm": 0.578125, + "grad_norm_var": 0.0013063907623291015, + "learning_rate": 2e-05, + "loss": 1.2286, + "loss/crossentropy": 2.2976953983306885, + "loss/dist_ce": 0.0, + "loss/fcd": 1.078125, + "loss/idx": 13.0, + "loss/logits": 0.15043297410011292, + "step": 1571 + }, + { + "epoch": 0.023473196953859937, + "grad_norm": 0.5703125, + "grad_norm_var": 0.001284646987915039, + "learning_rate": 2e-05, + "loss": 1.3051, + "loss/crossentropy": 2.31030535697937, + "loss/dist_ce": 0.0, + "loss/fcd": 1.125, + "loss/idx": 13.0, + "loss/logits": 0.18014974892139435, + "step": 1572 + }, + { + "epoch": 0.023488129012990893, + "grad_norm": 0.51953125, + "grad_norm_var": 0.0011582533518473307, + "learning_rate": 2e-05, + "loss": 1.2815, + "loss/crossentropy": 2.604799270629883, + "loss/dist_ce": 0.0, + "loss/fcd": 1.109375, + "loss/idx": 13.0, + "loss/logits": 0.17212437093257904, + "step": 1573 + }, + { + "epoch": 0.023503061072121845, + "grad_norm": 0.5078125, + "grad_norm_var": 0.0011911869049072265, + "learning_rate": 2e-05, + "loss": 1.2331, + "loss/crossentropy": 2.6337313652038574, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0703125, + "loss/idx": 13.0, + "loss/logits": 0.162765234708786, + "step": 1574 + }, + { + "epoch": 0.0235179931312528, + "grad_norm": 0.53515625, + "grad_norm_var": 0.001097726821899414, + "learning_rate": 2e-05, + "loss": 1.3357, + "loss/crossentropy": 2.5624470710754395, + "loss/dist_ce": 0.0, + "loss/fcd": 1.125, + "loss/idx": 13.0, + "loss/logits": 0.21068018674850464, + "step": 1575 + }, + { + "epoch": 0.023532925190383754, + "grad_norm": 0.546875, + "grad_norm_var": 0.0008711338043212891, + "learning_rate": 2e-05, + "loss": 1.3721, + "loss/crossentropy": 2.4522528648376465, + "loss/dist_ce": 0.0, + "loss/fcd": 1.171875, + "loss/idx": 13.0, + "loss/logits": 0.20022107660770416, + "step": 1576 + }, + { + "epoch": 0.02354785724951471, + "grad_norm": 0.5, + "grad_norm_var": 0.0009217421213785807, + "learning_rate": 2e-05, + "loss": 1.2497, + "loss/crossentropy": 2.6643879413604736, + "loss/dist_ce": 0.0, + "loss/fcd": 1.078125, + "loss/idx": 13.0, + "loss/logits": 0.17159616947174072, + "step": 1577 + }, + { + "epoch": 0.023562789308645662, + "grad_norm": 0.52734375, + "grad_norm_var": 0.0008835951487223307, + "learning_rate": 2e-05, + "loss": 1.2574, + "loss/crossentropy": 2.673330545425415, + "loss/dist_ce": 0.0, + "loss/fcd": 1.078125, + "loss/idx": 13.0, + "loss/logits": 0.1793064922094345, + "step": 1578 + }, + { + "epoch": 0.023577721367776618, + "grad_norm": 0.5390625, + "grad_norm_var": 0.0008727868398030598, + "learning_rate": 2e-05, + "loss": 1.2391, + "loss/crossentropy": 2.4731786251068115, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0546875, + "loss/idx": 13.0, + "loss/logits": 0.1844569444656372, + "step": 1579 + }, + { + "epoch": 0.02359265342690757, + "grad_norm": 0.7890625, + "grad_norm_var": 0.004923105239868164, + "learning_rate": 2e-05, + "loss": 1.335, + "loss/crossentropy": 2.567404270172119, + "loss/dist_ce": 0.0, + "loss/fcd": 1.15625, + "loss/idx": 13.0, + "loss/logits": 0.17879244685173035, + "step": 1580 + }, + { + "epoch": 0.023607585486038526, + "grad_norm": 0.640625, + "grad_norm_var": 0.005232747395833333, + "learning_rate": 2e-05, + "loss": 1.4227, + "loss/crossentropy": 2.81486177444458, + "loss/dist_ce": 0.0, + "loss/fcd": 1.203125, + "loss/idx": 13.0, + "loss/logits": 0.21953287720680237, + "step": 1581 + }, + { + "epoch": 0.023622517545169478, + "grad_norm": 0.55078125, + "grad_norm_var": 0.005023638407389323, + "learning_rate": 2e-05, + "loss": 1.3109, + "loss/crossentropy": 2.5990700721740723, + "loss/dist_ce": 0.0, + "loss/fcd": 1.125, + "loss/idx": 13.0, + "loss/logits": 0.18592441082000732, + "step": 1582 + }, + { + "epoch": 0.023637449604300434, + "grad_norm": 0.50390625, + "grad_norm_var": 0.005098215738932292, + "learning_rate": 2e-05, + "loss": 1.1859, + "loss/crossentropy": 2.800184488296509, + "loss/dist_ce": 0.0, + "loss/fcd": 1.03125, + "loss/idx": 13.0, + "loss/logits": 0.15463992953300476, + "step": 1583 + }, + { + "epoch": 0.023652381663431386, + "grad_norm": 0.578125, + "grad_norm_var": 0.005019060770670573, + "learning_rate": 2e-05, + "loss": 1.3002, + "loss/crossentropy": 2.5168678760528564, + "loss/dist_ce": 0.0, + "loss/fcd": 1.109375, + "loss/idx": 13.0, + "loss/logits": 0.19082492589950562, + "step": 1584 + }, + { + "epoch": 0.023667313722562342, + "grad_norm": 0.59375, + "grad_norm_var": 0.005083147684733073, + "learning_rate": 2e-05, + "loss": 1.2893, + "loss/crossentropy": 2.3718113899230957, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1015625, + "loss/idx": 13.0, + "loss/logits": 0.18773557245731354, + "step": 1585 + }, + { + "epoch": 0.023682245781693295, + "grad_norm": 0.53515625, + "grad_norm_var": 0.0049855550130208336, + "learning_rate": 2e-05, + "loss": 1.2704, + "loss/crossentropy": 2.434890031814575, + "loss/dist_ce": 0.0, + "loss/fcd": 1.078125, + "loss/idx": 13.0, + "loss/logits": 0.19229555130004883, + "step": 1586 + }, + { + "epoch": 0.02369717784082425, + "grad_norm": 0.57421875, + "grad_norm_var": 0.00497887929280599, + "learning_rate": 2e-05, + "loss": 1.2931, + "loss/crossentropy": 2.461606025695801, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1171875, + "loss/idx": 13.0, + "loss/logits": 0.1758810579776764, + "step": 1587 + }, + { + "epoch": 0.023712109899955203, + "grad_norm": 0.515625, + "grad_norm_var": 0.005114173889160157, + "learning_rate": 2e-05, + "loss": 1.2502, + "loss/crossentropy": 2.5047669410705566, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0703125, + "loss/idx": 13.0, + "loss/logits": 0.179841548204422, + "step": 1588 + }, + { + "epoch": 0.02372704195908616, + "grad_norm": 0.498046875, + "grad_norm_var": 0.005258417129516602, + "learning_rate": 2e-05, + "loss": 1.1957, + "loss/crossentropy": 2.441140651702881, + "loss/dist_ce": 0.0, + "loss/fcd": 1.046875, + "loss/idx": 13.0, + "loss/logits": 0.1488042175769806, + "step": 1589 + }, + { + "epoch": 0.02374197401821711, + "grad_norm": 0.486328125, + "grad_norm_var": 0.0054323832194010414, + "learning_rate": 2e-05, + "loss": 1.1902, + "loss/crossentropy": 2.5383846759796143, + "loss/dist_ce": 0.0, + "loss/fcd": 1.03125, + "loss/idx": 13.0, + "loss/logits": 0.15891912579536438, + "step": 1590 + }, + { + "epoch": 0.023756906077348067, + "grad_norm": 0.482421875, + "grad_norm_var": 0.005760685602823893, + "learning_rate": 2e-05, + "loss": 1.2418, + "loss/crossentropy": 2.5855181217193604, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0625, + "loss/idx": 13.0, + "loss/logits": 0.17932924628257751, + "step": 1591 + }, + { + "epoch": 0.02377183813647902, + "grad_norm": 0.53125, + "grad_norm_var": 0.005790440241495768, + "learning_rate": 2e-05, + "loss": 1.2238, + "loss/crossentropy": 2.640465259552002, + "loss/dist_ce": 0.0, + "loss/fcd": 1.046875, + "loss/idx": 13.0, + "loss/logits": 0.17696192860603333, + "step": 1592 + }, + { + "epoch": 0.023786770195609975, + "grad_norm": 0.58984375, + "grad_norm_var": 0.005661757787068685, + "learning_rate": 2e-05, + "loss": 1.3372, + "loss/crossentropy": 2.1728391647338867, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1328125, + "loss/idx": 13.0, + "loss/logits": 0.20438703894615173, + "step": 1593 + }, + { + "epoch": 0.023801702254740927, + "grad_norm": 0.5234375, + "grad_norm_var": 0.005678923924763998, + "learning_rate": 2e-05, + "loss": 1.2761, + "loss/crossentropy": 2.580386161804199, + "loss/dist_ce": 0.0, + "loss/fcd": 1.09375, + "loss/idx": 13.0, + "loss/logits": 0.18235522508621216, + "step": 1594 + }, + { + "epoch": 0.023816634313871883, + "grad_norm": 0.66796875, + "grad_norm_var": 0.006388076146443685, + "learning_rate": 2e-05, + "loss": 1.4137, + "loss/crossentropy": 2.3353309631347656, + "loss/dist_ce": 0.0, + "loss/fcd": 1.2109375, + "loss/idx": 13.0, + "loss/logits": 0.2027420699596405, + "step": 1595 + }, + { + "epoch": 0.023831566373002835, + "grad_norm": 0.58984375, + "grad_norm_var": 0.0029510339101155598, + "learning_rate": 2e-05, + "loss": 1.3433, + "loss/crossentropy": 2.4447288513183594, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1484375, + "loss/idx": 13.0, + "loss/logits": 0.19487924873828888, + "step": 1596 + }, + { + "epoch": 0.02384649843213379, + "grad_norm": 0.6171875, + "grad_norm_var": 0.002714141209920247, + "learning_rate": 2e-05, + "loss": 1.3034, + "loss/crossentropy": 2.6335086822509766, + "loss/dist_ce": 0.0, + "loss/fcd": 1.125, + "loss/idx": 13.0, + "loss/logits": 0.17839547991752625, + "step": 1597 + }, + { + "epoch": 0.023861430491264747, + "grad_norm": 0.546875, + "grad_norm_var": 0.002715921401977539, + "learning_rate": 2e-05, + "loss": 1.2338, + "loss/crossentropy": 2.8217856884002686, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0625, + "loss/idx": 13.0, + "loss/logits": 0.1713387668132782, + "step": 1598 + }, + { + "epoch": 0.0238763625503957, + "grad_norm": 0.57421875, + "grad_norm_var": 0.0025728702545166015, + "learning_rate": 2e-05, + "loss": 1.3584, + "loss/crossentropy": 2.3990859985351562, + "loss/dist_ce": 0.0, + "loss/fcd": 1.171875, + "loss/idx": 13.0, + "loss/logits": 0.18652434647083282, + "step": 1599 + }, + { + "epoch": 0.023891294609526655, + "grad_norm": 0.48046875, + "grad_norm_var": 0.002887582778930664, + "learning_rate": 2e-05, + "loss": 1.2107, + "loss/crossentropy": 2.350006103515625, + "loss/dist_ce": 0.0, + "loss/fcd": 1.046875, + "loss/idx": 13.0, + "loss/logits": 0.16381904482841492, + "step": 1600 + }, + { + "epoch": 0.023906226668657608, + "grad_norm": 0.55859375, + "grad_norm_var": 0.002761697769165039, + "learning_rate": 2e-05, + "loss": 1.2017, + "loss/crossentropy": 2.7609221935272217, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0390625, + "loss/idx": 13.0, + "loss/logits": 0.16260434687137604, + "step": 1601 + }, + { + "epoch": 0.023921158727788563, + "grad_norm": 0.5703125, + "grad_norm_var": 0.002777719497680664, + "learning_rate": 2e-05, + "loss": 1.2588, + "loss/crossentropy": 2.6673271656036377, + "loss/dist_ce": 0.0, + "loss/fcd": 1.078125, + "loss/idx": 13.0, + "loss/logits": 0.18064871430397034, + "step": 1602 + }, + { + "epoch": 0.023936090786919516, + "grad_norm": 0.5703125, + "grad_norm_var": 0.002766275405883789, + "learning_rate": 2e-05, + "loss": 1.3639, + "loss/crossentropy": 2.7430176734924316, + "loss/dist_ce": 0.0, + "loss/fcd": 1.171875, + "loss/idx": 13.0, + "loss/logits": 0.19205442070960999, + "step": 1603 + }, + { + "epoch": 0.02395102284605047, + "grad_norm": 0.494140625, + "grad_norm_var": 0.0028940836588541665, + "learning_rate": 2e-05, + "loss": 1.2124, + "loss/crossentropy": 2.53092360496521, + "loss/dist_ce": 0.0, + "loss/fcd": 1.046875, + "loss/idx": 13.0, + "loss/logits": 0.1655370444059372, + "step": 1604 + }, + { + "epoch": 0.023965954905181424, + "grad_norm": 0.55078125, + "grad_norm_var": 0.002710835138956706, + "learning_rate": 2e-05, + "loss": 1.3779, + "loss/crossentropy": 2.3667619228363037, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1796875, + "loss/idx": 13.0, + "loss/logits": 0.19819000363349915, + "step": 1605 + }, + { + "epoch": 0.02398088696431238, + "grad_norm": 0.55078125, + "grad_norm_var": 0.0024050394694010418, + "learning_rate": 2e-05, + "loss": 1.2546, + "loss/crossentropy": 2.244283676147461, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0859375, + "loss/idx": 13.0, + "loss/logits": 0.1686820089817047, + "step": 1606 + }, + { + "epoch": 0.023995819023443332, + "grad_norm": 0.5625, + "grad_norm_var": 0.002018594741821289, + "learning_rate": 2e-05, + "loss": 1.2936, + "loss/crossentropy": 2.713747501373291, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1171875, + "loss/idx": 13.0, + "loss/logits": 0.1764499694108963, + "step": 1607 + }, + { + "epoch": 0.024010751082574288, + "grad_norm": 0.47265625, + "grad_norm_var": 0.0024668216705322266, + "learning_rate": 2e-05, + "loss": 1.1716, + "loss/crossentropy": 2.828679323196411, + "loss/dist_ce": 0.0, + "loss/fcd": 1.015625, + "loss/idx": 13.0, + "loss/logits": 0.155990868806839, + "step": 1608 + }, + { + "epoch": 0.02402568314170524, + "grad_norm": 0.76171875, + "grad_norm_var": 0.005054457982381185, + "learning_rate": 2e-05, + "loss": 1.2463, + "loss/crossentropy": 2.567664623260498, + "loss/dist_ce": 0.0, + "loss/fcd": 1.078125, + "loss/idx": 13.0, + "loss/logits": 0.16816666722297668, + "step": 1609 + }, + { + "epoch": 0.024040615200836196, + "grad_norm": 0.98828125, + "grad_norm_var": 0.015782785415649415, + "learning_rate": 2e-05, + "loss": 1.342, + "loss/crossentropy": 2.756133556365967, + "loss/dist_ce": 0.0, + "loss/fcd": 1.140625, + "loss/idx": 13.0, + "loss/logits": 0.20140567421913147, + "step": 1610 + }, + { + "epoch": 0.02405554725996715, + "grad_norm": 0.484375, + "grad_norm_var": 0.0161592960357666, + "learning_rate": 2e-05, + "loss": 1.2295, + "loss/crossentropy": 2.6116912364959717, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0546875, + "loss/idx": 13.0, + "loss/logits": 0.1748572140932083, + "step": 1611 + }, + { + "epoch": 0.024070479319098104, + "grad_norm": 0.51171875, + "grad_norm_var": 0.016498804092407227, + "learning_rate": 2e-05, + "loss": 1.2077, + "loss/crossentropy": 2.7342991828918457, + "loss/dist_ce": 0.0, + "loss/fcd": 1.046875, + "loss/idx": 13.0, + "loss/logits": 0.16085676848888397, + "step": 1612 + }, + { + "epoch": 0.024085411378229057, + "grad_norm": 0.57421875, + "grad_norm_var": 0.016406488418579102, + "learning_rate": 2e-05, + "loss": 1.3091, + "loss/crossentropy": 2.641632318496704, + "loss/dist_ce": 0.0, + "loss/fcd": 1.125, + "loss/idx": 13.0, + "loss/logits": 0.18407666683197021, + "step": 1613 + }, + { + "epoch": 0.024100343437360013, + "grad_norm": 0.470703125, + "grad_norm_var": 0.017087745666503906, + "learning_rate": 2e-05, + "loss": 1.1709, + "loss/crossentropy": 2.5103869438171387, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0234375, + "loss/idx": 13.0, + "loss/logits": 0.14749841392040253, + "step": 1614 + }, + { + "epoch": 0.024115275496490965, + "grad_norm": 0.494140625, + "grad_norm_var": 0.0174807071685791, + "learning_rate": 2e-05, + "loss": 1.242, + "loss/crossentropy": 2.5742335319519043, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0703125, + "loss/idx": 13.0, + "loss/logits": 0.1717085838317871, + "step": 1615 + }, + { + "epoch": 0.02413020755562192, + "grad_norm": 0.546875, + "grad_norm_var": 0.01697703997294108, + "learning_rate": 2e-05, + "loss": 1.3399, + "loss/crossentropy": 2.6849160194396973, + "loss/dist_ce": 0.0, + "loss/fcd": 1.140625, + "loss/idx": 13.0, + "loss/logits": 0.19930125772953033, + "step": 1616 + }, + { + "epoch": 0.024145139614752873, + "grad_norm": 0.56640625, + "grad_norm_var": 0.01696623166402181, + "learning_rate": 2e-05, + "loss": 1.3899, + "loss/crossentropy": 2.5048611164093018, + "loss/dist_ce": 0.0, + "loss/fcd": 1.171875, + "loss/idx": 13.0, + "loss/logits": 0.2179987132549286, + "step": 1617 + }, + { + "epoch": 0.02416007167388383, + "grad_norm": 0.59765625, + "grad_norm_var": 0.01700272560119629, + "learning_rate": 2e-05, + "loss": 1.3458, + "loss/crossentropy": 2.225867509841919, + "loss/dist_ce": 0.0, + "loss/fcd": 1.171875, + "loss/idx": 13.0, + "loss/logits": 0.17394158244132996, + "step": 1618 + }, + { + "epoch": 0.02417500373301478, + "grad_norm": 0.5546875, + "grad_norm_var": 0.01702739397684733, + "learning_rate": 2e-05, + "loss": 1.2265, + "loss/crossentropy": 2.765789747238159, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0546875, + "loss/idx": 13.0, + "loss/logits": 0.17184340953826904, + "step": 1619 + }, + { + "epoch": 0.024189935792145737, + "grad_norm": 0.62890625, + "grad_norm_var": 0.01673018137613932, + "learning_rate": 2e-05, + "loss": 1.2812, + "loss/crossentropy": 2.774526596069336, + "loss/dist_ce": 0.0, + "loss/fcd": 1.109375, + "loss/idx": 13.0, + "loss/logits": 0.17186947166919708, + "step": 1620 + }, + { + "epoch": 0.02420486785127669, + "grad_norm": 0.57421875, + "grad_norm_var": 0.016666094462076824, + "learning_rate": 2e-05, + "loss": 1.3168, + "loss/crossentropy": 2.60170578956604, + "loss/dist_ce": 0.0, + "loss/fcd": 1.125, + "loss/idx": 13.0, + "loss/logits": 0.19177797436714172, + "step": 1621 + }, + { + "epoch": 0.024219799910407645, + "grad_norm": 0.5546875, + "grad_norm_var": 0.016649881998697918, + "learning_rate": 2e-05, + "loss": 1.2087, + "loss/crossentropy": 2.606919527053833, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0390625, + "loss/idx": 13.0, + "loss/logits": 0.16964177787303925, + "step": 1622 + }, + { + "epoch": 0.024234731969538598, + "grad_norm": 0.625, + "grad_norm_var": 0.016714986165364584, + "learning_rate": 2e-05, + "loss": 1.3444, + "loss/crossentropy": 2.387601375579834, + "loss/dist_ce": 0.0, + "loss/fcd": 1.171875, + "loss/idx": 13.0, + "loss/logits": 0.1725083887577057, + "step": 1623 + }, + { + "epoch": 0.024249664028669553, + "grad_norm": 0.5546875, + "grad_norm_var": 0.01587518056233724, + "learning_rate": 2e-05, + "loss": 1.3219, + "loss/crossentropy": 2.5789592266082764, + "loss/dist_ce": 0.0, + "loss/fcd": 1.125, + "loss/idx": 13.0, + "loss/logits": 0.19694536924362183, + "step": 1624 + }, + { + "epoch": 0.02426459608780051, + "grad_norm": 0.484375, + "grad_norm_var": 0.014444224039713542, + "learning_rate": 2e-05, + "loss": 1.2397, + "loss/crossentropy": 2.5164992809295654, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0625, + "loss/idx": 13.0, + "loss/logits": 0.17721965909004211, + "step": 1625 + }, + { + "epoch": 0.02427952814693146, + "grad_norm": 0.625, + "grad_norm_var": 0.002707354227701823, + "learning_rate": 2e-05, + "loss": 1.2661, + "loss/crossentropy": 2.8884806632995605, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0859375, + "loss/idx": 13.0, + "loss/logits": 0.18014132976531982, + "step": 1626 + }, + { + "epoch": 0.024294460206062417, + "grad_norm": 0.55859375, + "grad_norm_var": 0.00237274169921875, + "learning_rate": 2e-05, + "loss": 1.4009, + "loss/crossentropy": 2.534945249557495, + "loss/dist_ce": 0.0, + "loss/fcd": 1.203125, + "loss/idx": 13.0, + "loss/logits": 0.19778071343898773, + "step": 1627 + }, + { + "epoch": 0.02430939226519337, + "grad_norm": 0.5625, + "grad_norm_var": 0.0022231419881184895, + "learning_rate": 2e-05, + "loss": 1.3571, + "loss/crossentropy": 2.6048007011413574, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1484375, + "loss/idx": 13.0, + "loss/logits": 0.20868733525276184, + "step": 1628 + }, + { + "epoch": 0.024324324324324326, + "grad_norm": 0.76953125, + "grad_norm_var": 0.004957008361816406, + "learning_rate": 2e-05, + "loss": 1.4905, + "loss/crossentropy": 2.398721218109131, + "loss/dist_ce": 0.0, + "loss/fcd": 1.265625, + "loss/idx": 13.0, + "loss/logits": 0.22485968470573425, + "step": 1629 + }, + { + "epoch": 0.024339256383455278, + "grad_norm": 0.48828125, + "grad_norm_var": 0.004736566543579101, + "learning_rate": 2e-05, + "loss": 1.1915, + "loss/crossentropy": 2.464036703109741, + "loss/dist_ce": 0.0, + "loss/fcd": 1.03125, + "loss/idx": 13.0, + "loss/logits": 0.1602393090724945, + "step": 1630 + }, + { + "epoch": 0.024354188442586234, + "grad_norm": 0.5078125, + "grad_norm_var": 0.004602495829264323, + "learning_rate": 2e-05, + "loss": 1.174, + "loss/crossentropy": 2.519684076309204, + "loss/dist_ce": 0.0, + "loss/fcd": 1.015625, + "loss/idx": 13.0, + "loss/logits": 0.15833880007266998, + "step": 1631 + }, + { + "epoch": 0.024369120501717186, + "grad_norm": 0.578125, + "grad_norm_var": 0.004546546936035156, + "learning_rate": 2e-05, + "loss": 1.3098, + "loss/crossentropy": 2.3867616653442383, + "loss/dist_ce": 0.0, + "loss/fcd": 1.125, + "loss/idx": 13.0, + "loss/logits": 0.18479761481285095, + "step": 1632 + }, + { + "epoch": 0.024384052560848142, + "grad_norm": 0.53515625, + "grad_norm_var": 0.004651323954264323, + "learning_rate": 2e-05, + "loss": 1.2205, + "loss/crossentropy": 2.6318466663360596, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0546875, + "loss/idx": 13.0, + "loss/logits": 0.16580361127853394, + "step": 1633 + }, + { + "epoch": 0.024398984619979094, + "grad_norm": 0.486328125, + "grad_norm_var": 0.005088917414347331, + "learning_rate": 2e-05, + "loss": 1.1839, + "loss/crossentropy": 2.615931749343872, + "loss/dist_ce": 0.0, + "loss/fcd": 1.03125, + "loss/idx": 13.0, + "loss/logits": 0.15263916552066803, + "step": 1634 + }, + { + "epoch": 0.02441391667911005, + "grad_norm": 0.5078125, + "grad_norm_var": 0.005309406916300456, + "learning_rate": 2e-05, + "loss": 1.1888, + "loss/crossentropy": 2.495640277862549, + "loss/dist_ce": 0.0, + "loss/fcd": 1.03125, + "loss/idx": 13.0, + "loss/logits": 0.15757544338703156, + "step": 1635 + }, + { + "epoch": 0.024428848738241003, + "grad_norm": 0.53125, + "grad_norm_var": 0.005074167251586914, + "learning_rate": 2e-05, + "loss": 1.1999, + "loss/crossentropy": 2.633312463760376, + "loss/dist_ce": 0.0, + "loss/fcd": 1.03125, + "loss/idx": 13.0, + "loss/logits": 0.1686273217201233, + "step": 1636 + }, + { + "epoch": 0.02444378079737196, + "grad_norm": 0.478515625, + "grad_norm_var": 0.00545190175374349, + "learning_rate": 2e-05, + "loss": 1.2016, + "loss/crossentropy": 2.4377596378326416, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0390625, + "loss/idx": 13.0, + "loss/logits": 0.16258299350738525, + "step": 1637 + }, + { + "epoch": 0.02445871285650291, + "grad_norm": 0.51953125, + "grad_norm_var": 0.005521138509114583, + "learning_rate": 2e-05, + "loss": 1.184, + "loss/crossentropy": 2.7175605297088623, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0234375, + "loss/idx": 13.0, + "loss/logits": 0.16056030988693237, + "step": 1638 + }, + { + "epoch": 0.024473644915633867, + "grad_norm": 0.5390625, + "grad_norm_var": 0.005132293701171875, + "learning_rate": 2e-05, + "loss": 1.2285, + "loss/crossentropy": 2.648740530014038, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0546875, + "loss/idx": 13.0, + "loss/logits": 0.17380774021148682, + "step": 1639 + }, + { + "epoch": 0.02448857697476482, + "grad_norm": 0.55859375, + "grad_norm_var": 0.005138079325358073, + "learning_rate": 2e-05, + "loss": 1.2751, + "loss/crossentropy": 2.3271639347076416, + "loss/dist_ce": 0.0, + "loss/fcd": 1.09375, + "loss/idx": 13.0, + "loss/logits": 0.18137691915035248, + "step": 1640 + }, + { + "epoch": 0.024503509033895775, + "grad_norm": 0.482421875, + "grad_norm_var": 0.005154275894165039, + "learning_rate": 2e-05, + "loss": 1.2039, + "loss/crossentropy": 2.5689260959625244, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0390625, + "loss/idx": 13.0, + "loss/logits": 0.16480699181556702, + "step": 1641 + }, + { + "epoch": 0.024518441093026727, + "grad_norm": 1.40625, + "grad_norm_var": 0.05157914161682129, + "learning_rate": 2e-05, + "loss": 1.6373, + "loss/crossentropy": 2.645709991455078, + "loss/dist_ce": 0.0, + "loss/fcd": 1.3828125, + "loss/idx": 13.0, + "loss/logits": 0.25451111793518066, + "step": 1642 + }, + { + "epoch": 0.024533373152157683, + "grad_norm": 0.5390625, + "grad_norm_var": 0.051696125666300455, + "learning_rate": 2e-05, + "loss": 1.2251, + "loss/crossentropy": 2.5970401763916016, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0546875, + "loss/idx": 13.0, + "loss/logits": 0.17040389776229858, + "step": 1643 + }, + { + "epoch": 0.024548305211288635, + "grad_norm": 0.462890625, + "grad_norm_var": 0.05272318522135417, + "learning_rate": 2e-05, + "loss": 1.1941, + "loss/crossentropy": 2.482206344604492, + "loss/dist_ce": 0.0, + "loss/fcd": 1.046875, + "loss/idx": 13.0, + "loss/logits": 0.14718057215213776, + "step": 1644 + }, + { + "epoch": 0.02456323727041959, + "grad_norm": 0.54296875, + "grad_norm_var": 0.050414784749348955, + "learning_rate": 2e-05, + "loss": 1.3091, + "loss/crossentropy": 2.300069808959961, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1171875, + "loss/idx": 13.0, + "loss/logits": 0.19193615019321442, + "step": 1645 + }, + { + "epoch": 0.024578169329550544, + "grad_norm": 0.5390625, + "grad_norm_var": 0.050004005432128906, + "learning_rate": 2e-05, + "loss": 1.2527, + "loss/crossentropy": 2.6057968139648438, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0703125, + "loss/idx": 13.0, + "loss/logits": 0.18236851692199707, + "step": 1646 + }, + { + "epoch": 0.0245931013886815, + "grad_norm": 0.51953125, + "grad_norm_var": 0.049906158447265626, + "learning_rate": 2e-05, + "loss": 1.2824, + "loss/crossentropy": 2.6754231452941895, + "loss/dist_ce": 0.0, + "loss/fcd": 1.078125, + "loss/idx": 13.0, + "loss/logits": 0.2042427510023117, + "step": 1647 + }, + { + "epoch": 0.02460803344781245, + "grad_norm": 0.48828125, + "grad_norm_var": 0.050393104553222656, + "learning_rate": 2e-05, + "loss": 1.156, + "loss/crossentropy": 2.59796142578125, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0, + "loss/idx": 13.0, + "loss/logits": 0.156040221452713, + "step": 1648 + }, + { + "epoch": 0.024622965506943408, + "grad_norm": 0.640625, + "grad_norm_var": 0.05058364868164063, + "learning_rate": 2e-05, + "loss": 1.336, + "loss/crossentropy": 2.5802266597747803, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1484375, + "loss/idx": 13.0, + "loss/logits": 0.1875656247138977, + "step": 1649 + }, + { + "epoch": 0.024637897566074363, + "grad_norm": 0.6328125, + "grad_norm_var": 0.05014138221740723, + "learning_rate": 2e-05, + "loss": 1.2801, + "loss/crossentropy": 2.673807382583618, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0859375, + "loss/idx": 13.0, + "loss/logits": 0.19418036937713623, + "step": 1650 + }, + { + "epoch": 0.024652829625205316, + "grad_norm": 0.66796875, + "grad_norm_var": 0.05005796750386556, + "learning_rate": 2e-05, + "loss": 1.5013, + "loss/crossentropy": 2.436178684234619, + "loss/dist_ce": 0.0, + "loss/fcd": 1.2734375, + "loss/idx": 13.0, + "loss/logits": 0.22788530588150024, + "step": 1651 + }, + { + "epoch": 0.02466776168433627, + "grad_norm": 0.5546875, + "grad_norm_var": 0.04988745053609212, + "learning_rate": 2e-05, + "loss": 1.3592, + "loss/crossentropy": 2.245730400085449, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1484375, + "loss/idx": 13.0, + "loss/logits": 0.21073272824287415, + "step": 1652 + }, + { + "epoch": 0.024682693743467224, + "grad_norm": 0.51953125, + "grad_norm_var": 0.04933770497639974, + "learning_rate": 2e-05, + "loss": 1.2983, + "loss/crossentropy": 2.5788753032684326, + "loss/dist_ce": 0.0, + "loss/fcd": 1.109375, + "loss/idx": 13.0, + "loss/logits": 0.1889631152153015, + "step": 1653 + }, + { + "epoch": 0.02469762580259818, + "grad_norm": 0.494140625, + "grad_norm_var": 0.04965322812398275, + "learning_rate": 2e-05, + "loss": 1.1939, + "loss/crossentropy": 2.6785736083984375, + "loss/dist_ce": 0.0, + "loss/fcd": 1.03125, + "loss/idx": 13.0, + "loss/logits": 0.16263613104820251, + "step": 1654 + }, + { + "epoch": 0.024712557861729132, + "grad_norm": 0.5078125, + "grad_norm_var": 0.04996501604715983, + "learning_rate": 2e-05, + "loss": 1.2169, + "loss/crossentropy": 2.441274404525757, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0625, + "loss/idx": 13.0, + "loss/logits": 0.1544494479894638, + "step": 1655 + }, + { + "epoch": 0.024727489920860088, + "grad_norm": 0.55078125, + "grad_norm_var": 0.05000913937886556, + "learning_rate": 2e-05, + "loss": 1.2835, + "loss/crossentropy": 2.6980459690093994, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1015625, + "loss/idx": 13.0, + "loss/logits": 0.1819465607404709, + "step": 1656 + }, + { + "epoch": 0.02474242197999104, + "grad_norm": 0.53125, + "grad_norm_var": 0.04941349029541016, + "learning_rate": 2e-05, + "loss": 1.2336, + "loss/crossentropy": 2.562973976135254, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0703125, + "loss/idx": 13.0, + "loss/logits": 0.1632436215877533, + "step": 1657 + }, + { + "epoch": 0.024757354039121996, + "grad_norm": 0.53125, + "grad_norm_var": 0.003185462951660156, + "learning_rate": 2e-05, + "loss": 1.2196, + "loss/crossentropy": 2.669048547744751, + "loss/dist_ce": 0.0, + "loss/fcd": 1.046875, + "loss/idx": 13.0, + "loss/logits": 0.17273685336112976, + "step": 1658 + }, + { + "epoch": 0.02477228609825295, + "grad_norm": 0.43359375, + "grad_norm_var": 0.003966522216796875, + "learning_rate": 2e-05, + "loss": 1.1219, + "loss/crossentropy": 2.5299949645996094, + "loss/dist_ce": 0.0, + "loss/fcd": 0.9765625, + "loss/idx": 13.0, + "loss/logits": 0.14536432921886444, + "step": 1659 + }, + { + "epoch": 0.024787218157383904, + "grad_norm": 0.5390625, + "grad_norm_var": 0.003560495376586914, + "learning_rate": 2e-05, + "loss": 1.2227, + "loss/crossentropy": 2.4549379348754883, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0546875, + "loss/idx": 13.0, + "loss/logits": 0.16801050305366516, + "step": 1660 + }, + { + "epoch": 0.024802150216514857, + "grad_norm": 0.5, + "grad_norm_var": 0.003677988052368164, + "learning_rate": 2e-05, + "loss": 1.1772, + "loss/crossentropy": 2.661327600479126, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0234375, + "loss/idx": 13.0, + "loss/logits": 0.15374130010604858, + "step": 1661 + }, + { + "epoch": 0.024817082275645812, + "grad_norm": 0.484375, + "grad_norm_var": 0.003876479466756185, + "learning_rate": 2e-05, + "loss": 1.163, + "loss/crossentropy": 2.5966739654541016, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0, + "loss/idx": 13.0, + "loss/logits": 0.16298674046993256, + "step": 1662 + }, + { + "epoch": 0.024832014334776765, + "grad_norm": 0.50390625, + "grad_norm_var": 0.003928613662719726, + "learning_rate": 2e-05, + "loss": 1.1938, + "loss/crossentropy": 2.5892300605773926, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0234375, + "loss/idx": 13.0, + "loss/logits": 0.17037644982337952, + "step": 1663 + }, + { + "epoch": 0.02484694639390772, + "grad_norm": 0.60546875, + "grad_norm_var": 0.004037332534790039, + "learning_rate": 2e-05, + "loss": 1.4414, + "loss/crossentropy": 2.285784959793091, + "loss/dist_ce": 0.0, + "loss/fcd": 1.2421875, + "loss/idx": 13.0, + "loss/logits": 0.19916491210460663, + "step": 1664 + }, + { + "epoch": 0.024861878453038673, + "grad_norm": 0.58984375, + "grad_norm_var": 0.003541421890258789, + "learning_rate": 2e-05, + "loss": 1.2812, + "loss/crossentropy": 2.531317949295044, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1015625, + "loss/idx": 13.0, + "loss/logits": 0.1796582043170929, + "step": 1665 + }, + { + "epoch": 0.02487681051216963, + "grad_norm": 0.49609375, + "grad_norm_var": 0.003025166193644206, + "learning_rate": 2e-05, + "loss": 1.237, + "loss/crossentropy": 2.617490530014038, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0546875, + "loss/idx": 13.0, + "loss/logits": 0.1823294758796692, + "step": 1666 + }, + { + "epoch": 0.02489174257130058, + "grad_norm": 0.47265625, + "grad_norm_var": 0.0018648624420166016, + "learning_rate": 2e-05, + "loss": 1.2051, + "loss/crossentropy": 2.699169635772705, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0390625, + "loss/idx": 13.0, + "loss/logits": 0.16599290072917938, + "step": 1667 + }, + { + "epoch": 0.024906674630431537, + "grad_norm": 0.60546875, + "grad_norm_var": 0.0022632439931233725, + "learning_rate": 2e-05, + "loss": 1.391, + "loss/crossentropy": 2.284773588180542, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1796875, + "loss/idx": 13.0, + "loss/logits": 0.21133294701576233, + "step": 1668 + }, + { + "epoch": 0.02492160668956249, + "grad_norm": 0.52734375, + "grad_norm_var": 0.002263625462849935, + "learning_rate": 2e-05, + "loss": 1.3096, + "loss/crossentropy": 2.398963212966919, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1328125, + "loss/idx": 13.0, + "loss/logits": 0.17675255239009857, + "step": 1669 + }, + { + "epoch": 0.024936538748693445, + "grad_norm": 0.515625, + "grad_norm_var": 0.002208900451660156, + "learning_rate": 2e-05, + "loss": 1.2286, + "loss/crossentropy": 2.7221715450286865, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0546875, + "loss/idx": 13.0, + "loss/logits": 0.17392107844352722, + "step": 1670 + }, + { + "epoch": 0.024951470807824398, + "grad_norm": 0.51171875, + "grad_norm_var": 0.002201080322265625, + "learning_rate": 2e-05, + "loss": 1.2543, + "loss/crossentropy": 2.7048890590667725, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0703125, + "loss/idx": 13.0, + "loss/logits": 0.18397437036037445, + "step": 1671 + }, + { + "epoch": 0.024966402866955353, + "grad_norm": 0.6015625, + "grad_norm_var": 0.0025374730428059894, + "learning_rate": 2e-05, + "loss": 1.2544, + "loss/crossentropy": 2.427884101867676, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0859375, + "loss/idx": 13.0, + "loss/logits": 0.16849087178707123, + "step": 1672 + }, + { + "epoch": 0.024981334926086306, + "grad_norm": 0.546875, + "grad_norm_var": 0.0025593439737955728, + "learning_rate": 2e-05, + "loss": 1.2987, + "loss/crossentropy": 2.7837252616882324, + "loss/dist_ce": 0.0, + "loss/fcd": 1.109375, + "loss/idx": 13.0, + "loss/logits": 0.1893191933631897, + "step": 1673 + }, + { + "epoch": 0.02499626698521726, + "grad_norm": 0.5390625, + "grad_norm_var": 0.002565447489420573, + "learning_rate": 2e-05, + "loss": 1.3511, + "loss/crossentropy": 2.4340031147003174, + "loss/dist_ce": 0.0, + "loss/fcd": 1.140625, + "loss/idx": 13.0, + "loss/logits": 0.21051189303398132, + "step": 1674 + }, + { + "epoch": 0.025011199044348214, + "grad_norm": 0.5078125, + "grad_norm_var": 0.0019602457682291667, + "learning_rate": 2e-05, + "loss": 1.1157, + "loss/crossentropy": 2.662384510040283, + "loss/dist_ce": 0.0, + "loss/fcd": 0.96875, + "loss/idx": 13.0, + "loss/logits": 0.1469482183456421, + "step": 1675 + }, + { + "epoch": 0.02502613110347917, + "grad_norm": 0.51171875, + "grad_norm_var": 0.0019891738891601564, + "learning_rate": 2e-05, + "loss": 1.1957, + "loss/crossentropy": 2.637355327606201, + "loss/dist_ce": 0.0, + "loss/fcd": 1.03125, + "loss/idx": 13.0, + "loss/logits": 0.1644275039434433, + "step": 1676 + }, + { + "epoch": 0.025041063162610126, + "grad_norm": 0.60546875, + "grad_norm_var": 0.002227783203125, + "learning_rate": 2e-05, + "loss": 1.2562, + "loss/crossentropy": 2.616171360015869, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0859375, + "loss/idx": 13.0, + "loss/logits": 0.17029576003551483, + "step": 1677 + }, + { + "epoch": 0.025055995221741078, + "grad_norm": 0.73046875, + "grad_norm_var": 0.004218482971191406, + "learning_rate": 2e-05, + "loss": 1.3086, + "loss/crossentropy": 2.3912839889526367, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1171875, + "loss/idx": 13.0, + "loss/logits": 0.19144338369369507, + "step": 1678 + }, + { + "epoch": 0.025070927280872034, + "grad_norm": 0.52734375, + "grad_norm_var": 0.004094886779785156, + "learning_rate": 2e-05, + "loss": 1.2694, + "loss/crossentropy": 2.4914817810058594, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1015625, + "loss/idx": 13.0, + "loss/logits": 0.16782964766025543, + "step": 1679 + }, + { + "epoch": 0.025085859340002986, + "grad_norm": 0.48046875, + "grad_norm_var": 0.004245440165201823, + "learning_rate": 2e-05, + "loss": 1.1771, + "loss/crossentropy": 2.6493070125579834, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0234375, + "loss/idx": 13.0, + "loss/logits": 0.15367591381072998, + "step": 1680 + }, + { + "epoch": 0.025100791399133942, + "grad_norm": 0.58203125, + "grad_norm_var": 0.004205767313639323, + "learning_rate": 2e-05, + "loss": 1.2893, + "loss/crossentropy": 2.348886013031006, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1015625, + "loss/idx": 13.0, + "loss/logits": 0.18777824938297272, + "step": 1681 + }, + { + "epoch": 0.025115723458264894, + "grad_norm": 0.6796875, + "grad_norm_var": 0.005051422119140625, + "learning_rate": 2e-05, + "loss": 1.5518, + "loss/crossentropy": 2.78147029876709, + "loss/dist_ce": 0.0, + "loss/fcd": 1.265625, + "loss/idx": 13.0, + "loss/logits": 0.28619956970214844, + "step": 1682 + }, + { + "epoch": 0.02513065551739585, + "grad_norm": 0.59375, + "grad_norm_var": 0.004572486877441407, + "learning_rate": 2e-05, + "loss": 1.3742, + "loss/crossentropy": 2.9234964847564697, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1875, + "loss/idx": 13.0, + "loss/logits": 0.18667152523994446, + "step": 1683 + }, + { + "epoch": 0.025145587576526802, + "grad_norm": 0.54296875, + "grad_norm_var": 0.004493141174316406, + "learning_rate": 2e-05, + "loss": 1.2891, + "loss/crossentropy": 2.454906702041626, + "loss/dist_ce": 0.0, + "loss/fcd": 1.109375, + "loss/idx": 13.0, + "loss/logits": 0.1797175407409668, + "step": 1684 + }, + { + "epoch": 0.02516051963565776, + "grad_norm": 0.5, + "grad_norm_var": 0.0046689351399739586, + "learning_rate": 2e-05, + "loss": 1.2387, + "loss/crossentropy": 2.572845458984375, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0625, + "loss/idx": 13.0, + "loss/logits": 0.1761704385280609, + "step": 1685 + }, + { + "epoch": 0.02517545169478871, + "grad_norm": 0.498046875, + "grad_norm_var": 0.004794677098592122, + "learning_rate": 2e-05, + "loss": 1.1848, + "loss/crossentropy": 2.493640184402466, + "loss/dist_ce": 0.0, + "loss/fcd": 1.03125, + "loss/idx": 13.0, + "loss/logits": 0.15350091457366943, + "step": 1686 + }, + { + "epoch": 0.025190383753919666, + "grad_norm": 0.5625, + "grad_norm_var": 0.004629373550415039, + "learning_rate": 2e-05, + "loss": 1.196, + "loss/crossentropy": 2.6362812519073486, + "loss/dist_ce": 0.0, + "loss/fcd": 1.03125, + "loss/idx": 13.0, + "loss/logits": 0.16470015048980713, + "step": 1687 + }, + { + "epoch": 0.02520531581305062, + "grad_norm": 0.53515625, + "grad_norm_var": 0.004564523696899414, + "learning_rate": 2e-05, + "loss": 1.2056, + "loss/crossentropy": 2.431504249572754, + "loss/dist_ce": 0.0, + "loss/fcd": 1.046875, + "loss/idx": 13.0, + "loss/logits": 0.15874925255775452, + "step": 1688 + }, + { + "epoch": 0.025220247872181575, + "grad_norm": 0.49609375, + "grad_norm_var": 0.004807519912719727, + "learning_rate": 2e-05, + "loss": 1.2304, + "loss/crossentropy": 2.4948151111602783, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0625, + "loss/idx": 13.0, + "loss/logits": 0.16791260242462158, + "step": 1689 + }, + { + "epoch": 0.025235179931312527, + "grad_norm": 0.5234375, + "grad_norm_var": 0.004857619603474935, + "learning_rate": 2e-05, + "loss": 1.2481, + "loss/crossentropy": 2.4789421558380127, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0703125, + "loss/idx": 13.0, + "loss/logits": 0.1777571737766266, + "step": 1690 + }, + { + "epoch": 0.025250111990443483, + "grad_norm": 0.51953125, + "grad_norm_var": 0.00479276974995931, + "learning_rate": 2e-05, + "loss": 1.2302, + "loss/crossentropy": 2.2739431858062744, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0625, + "loss/idx": 13.0, + "loss/logits": 0.16770566999912262, + "step": 1691 + }, + { + "epoch": 0.025265044049574435, + "grad_norm": 0.515625, + "grad_norm_var": 0.004770898818969726, + "learning_rate": 2e-05, + "loss": 1.2194, + "loss/crossentropy": 2.639508008956909, + "loss/dist_ce": 0.0, + "loss/fcd": 1.046875, + "loss/idx": 13.0, + "loss/logits": 0.1725163459777832, + "step": 1692 + }, + { + "epoch": 0.02527997610870539, + "grad_norm": 0.5, + "grad_norm_var": 0.004767465591430664, + "learning_rate": 2e-05, + "loss": 1.2304, + "loss/crossentropy": 2.8804821968078613, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0546875, + "loss/idx": 13.0, + "loss/logits": 0.1756765991449356, + "step": 1693 + }, + { + "epoch": 0.025294908167836343, + "grad_norm": 1.1796875, + "grad_norm_var": 0.028237390518188476, + "learning_rate": 2e-05, + "loss": 1.9296, + "loss/crossentropy": 2.852426290512085, + "loss/dist_ce": 0.0, + "loss/fcd": 1.578125, + "loss/idx": 13.0, + "loss/logits": 0.35142582654953003, + "step": 1694 + }, + { + "epoch": 0.0253098402269673, + "grad_norm": 0.56640625, + "grad_norm_var": 0.028072722752889, + "learning_rate": 2e-05, + "loss": 1.2908, + "loss/crossentropy": 2.6262359619140625, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1015625, + "loss/idx": 13.0, + "loss/logits": 0.18921533226966858, + "step": 1695 + }, + { + "epoch": 0.02532477228609825, + "grad_norm": 0.53515625, + "grad_norm_var": 0.02753599484761556, + "learning_rate": 2e-05, + "loss": 1.3023, + "loss/crossentropy": 2.71830153465271, + "loss/dist_ce": 0.0, + "loss/fcd": 1.109375, + "loss/idx": 13.0, + "loss/logits": 0.19293737411499023, + "step": 1696 + }, + { + "epoch": 0.025339704345229207, + "grad_norm": 0.54296875, + "grad_norm_var": 0.02763708432515462, + "learning_rate": 2e-05, + "loss": 1.2915, + "loss/crossentropy": 2.5848164558410645, + "loss/dist_ce": 0.0, + "loss/fcd": 1.09375, + "loss/idx": 13.0, + "loss/logits": 0.19779077172279358, + "step": 1697 + }, + { + "epoch": 0.02535463640436016, + "grad_norm": 0.59375, + "grad_norm_var": 0.02696429888407389, + "learning_rate": 2e-05, + "loss": 1.3879, + "loss/crossentropy": 2.7005844116210938, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1796875, + "loss/idx": 13.0, + "loss/logits": 0.20819774270057678, + "step": 1698 + }, + { + "epoch": 0.025369568463491116, + "grad_norm": 0.47265625, + "grad_norm_var": 0.02758316993713379, + "learning_rate": 2e-05, + "loss": 1.1186, + "loss/crossentropy": 2.633463144302368, + "loss/dist_ce": 0.0, + "loss/fcd": 0.9765625, + "loss/idx": 13.0, + "loss/logits": 0.1420612782239914, + "step": 1699 + }, + { + "epoch": 0.025384500522622068, + "grad_norm": 0.546875, + "grad_norm_var": 0.02757121721903483, + "learning_rate": 2e-05, + "loss": 1.264, + "loss/crossentropy": 2.642566204071045, + "loss/dist_ce": 0.0, + "loss/fcd": 1.078125, + "loss/idx": 13.0, + "loss/logits": 0.1858481764793396, + "step": 1700 + }, + { + "epoch": 0.025399432581753024, + "grad_norm": 0.49609375, + "grad_norm_var": 0.02760758399963379, + "learning_rate": 2e-05, + "loss": 1.3168, + "loss/crossentropy": 2.675619125366211, + "loss/dist_ce": 0.0, + "loss/fcd": 1.125, + "loss/idx": 13.0, + "loss/logits": 0.19182080030441284, + "step": 1701 + }, + { + "epoch": 0.02541436464088398, + "grad_norm": 0.609375, + "grad_norm_var": 0.027347564697265625, + "learning_rate": 2e-05, + "loss": 1.3172, + "loss/crossentropy": 2.6206746101379395, + "loss/dist_ce": 0.0, + "loss/fcd": 1.125, + "loss/idx": 13.0, + "loss/logits": 0.19216333329677582, + "step": 1702 + }, + { + "epoch": 0.025429296700014932, + "grad_norm": 0.55078125, + "grad_norm_var": 0.027375221252441406, + "learning_rate": 2e-05, + "loss": 1.264, + "loss/crossentropy": 2.7116100788116455, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0703125, + "loss/idx": 13.0, + "loss/logits": 0.1937001645565033, + "step": 1703 + }, + { + "epoch": 0.025444228759145888, + "grad_norm": 0.53515625, + "grad_norm_var": 0.027375221252441406, + "learning_rate": 2e-05, + "loss": 1.3182, + "loss/crossentropy": 2.672400951385498, + "loss/dist_ce": 0.0, + "loss/fcd": 1.125, + "loss/idx": 13.0, + "loss/logits": 0.19316034018993378, + "step": 1704 + }, + { + "epoch": 0.02545916081827684, + "grad_norm": 0.546875, + "grad_norm_var": 0.027009073893229166, + "learning_rate": 2e-05, + "loss": 1.3287, + "loss/crossentropy": 2.59317946434021, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1328125, + "loss/idx": 13.0, + "loss/logits": 0.19592443108558655, + "step": 1705 + }, + { + "epoch": 0.025474092877407796, + "grad_norm": 0.466796875, + "grad_norm_var": 0.02761521339416504, + "learning_rate": 2e-05, + "loss": 1.2262, + "loss/crossentropy": 2.5838565826416016, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0625, + "loss/idx": 13.0, + "loss/logits": 0.16374395787715912, + "step": 1706 + }, + { + "epoch": 0.02548902493653875, + "grad_norm": 0.50390625, + "grad_norm_var": 0.027743132909138997, + "learning_rate": 2e-05, + "loss": 1.1815, + "loss/crossentropy": 2.6330811977386475, + "loss/dist_ce": 0.0, + "loss/fcd": 1.015625, + "loss/idx": 13.0, + "loss/logits": 0.16590842604637146, + "step": 1707 + }, + { + "epoch": 0.025503956995669704, + "grad_norm": 0.546875, + "grad_norm_var": 0.02756663958231608, + "learning_rate": 2e-05, + "loss": 1.2692, + "loss/crossentropy": 2.5513384342193604, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1015625, + "loss/idx": 13.0, + "loss/logits": 0.16764463484287262, + "step": 1708 + }, + { + "epoch": 0.025518889054800657, + "grad_norm": 0.50390625, + "grad_norm_var": 0.027528746922810873, + "learning_rate": 2e-05, + "loss": 1.1706, + "loss/crossentropy": 2.727013111114502, + "loss/dist_ce": 0.0, + "loss/fcd": 1.015625, + "loss/idx": 13.0, + "loss/logits": 0.15500634908676147, + "step": 1709 + }, + { + "epoch": 0.025533821113931612, + "grad_norm": 0.5859375, + "grad_norm_var": 0.0016778151194254557, + "learning_rate": 2e-05, + "loss": 1.3518, + "loss/crossentropy": 2.346733570098877, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1484375, + "loss/idx": 13.0, + "loss/logits": 0.20339244604110718, + "step": 1710 + }, + { + "epoch": 0.025548753173062565, + "grad_norm": 0.490234375, + "grad_norm_var": 0.0017491022745768229, + "learning_rate": 2e-05, + "loss": 1.1178, + "loss/crossentropy": 2.4764509201049805, + "loss/dist_ce": 0.0, + "loss/fcd": 0.97265625, + "loss/idx": 13.0, + "loss/logits": 0.145157128572464, + "step": 1711 + }, + { + "epoch": 0.02556368523219352, + "grad_norm": 0.55078125, + "grad_norm_var": 0.0017689387003580728, + "learning_rate": 2e-05, + "loss": 1.2626, + "loss/crossentropy": 2.531623125076294, + "loss/dist_ce": 0.0, + "loss/fcd": 1.078125, + "loss/idx": 13.0, + "loss/logits": 0.18448594212532043, + "step": 1712 + }, + { + "epoch": 0.025578617291324473, + "grad_norm": 0.5546875, + "grad_norm_var": 0.0017916361490885417, + "learning_rate": 2e-05, + "loss": 1.321, + "loss/crossentropy": 2.4013497829437256, + "loss/dist_ce": 0.0, + "loss/fcd": 1.140625, + "loss/idx": 13.0, + "loss/logits": 0.18041828274726868, + "step": 1713 + }, + { + "epoch": 0.02559354935045543, + "grad_norm": 0.53125, + "grad_norm_var": 0.001543426513671875, + "learning_rate": 2e-05, + "loss": 1.3033, + "loss/crossentropy": 2.6992998123168945, + "loss/dist_ce": 0.0, + "loss/fcd": 1.109375, + "loss/idx": 13.0, + "loss/logits": 0.1939333975315094, + "step": 1714 + }, + { + "epoch": 0.02560848140958638, + "grad_norm": 0.5078125, + "grad_norm_var": 0.0013483047485351562, + "learning_rate": 2e-05, + "loss": 1.1803, + "loss/crossentropy": 2.517133951187134, + "loss/dist_ce": 0.0, + "loss/fcd": 1.03125, + "loss/idx": 13.0, + "loss/logits": 0.14903077483177185, + "step": 1715 + }, + { + "epoch": 0.025623413468717337, + "grad_norm": 0.640625, + "grad_norm_var": 0.002071571350097656, + "learning_rate": 2e-05, + "loss": 1.6195, + "loss/crossentropy": 2.476125717163086, + "loss/dist_ce": 0.0, + "loss/fcd": 1.359375, + "loss/idx": 13.0, + "loss/logits": 0.26009926199913025, + "step": 1716 + }, + { + "epoch": 0.02563834552784829, + "grad_norm": 0.56640625, + "grad_norm_var": 0.001980018615722656, + "learning_rate": 2e-05, + "loss": 1.3432, + "loss/crossentropy": 2.2813334465026855, + "loss/dist_ce": 0.0, + "loss/fcd": 1.140625, + "loss/idx": 13.0, + "loss/logits": 0.20258614420890808, + "step": 1717 + }, + { + "epoch": 0.025653277586979245, + "grad_norm": 0.48828125, + "grad_norm_var": 0.0018282572428385416, + "learning_rate": 2e-05, + "loss": 1.1883, + "loss/crossentropy": 2.476081132888794, + "loss/dist_ce": 0.0, + "loss/fcd": 1.03125, + "loss/idx": 13.0, + "loss/logits": 0.15700972080230713, + "step": 1718 + }, + { + "epoch": 0.025668209646110197, + "grad_norm": 0.62890625, + "grad_norm_var": 0.002367401123046875, + "learning_rate": 2e-05, + "loss": 1.3278, + "loss/crossentropy": 2.189779281616211, + "loss/dist_ce": 0.0, + "loss/fcd": 1.140625, + "loss/idx": 13.0, + "loss/logits": 0.1871606558561325, + "step": 1719 + }, + { + "epoch": 0.025683141705241153, + "grad_norm": 0.546875, + "grad_norm_var": 0.002367591857910156, + "learning_rate": 2e-05, + "loss": 1.1858, + "loss/crossentropy": 2.363250732421875, + "loss/dist_ce": 0.0, + "loss/fcd": 1.03125, + "loss/idx": 13.0, + "loss/logits": 0.15456654131412506, + "step": 1720 + }, + { + "epoch": 0.025698073764372106, + "grad_norm": 0.498046875, + "grad_norm_var": 0.0024800459543863934, + "learning_rate": 2e-05, + "loss": 1.1912, + "loss/crossentropy": 2.333819627761841, + "loss/dist_ce": 0.0, + "loss/fcd": 1.03125, + "loss/idx": 13.0, + "loss/logits": 0.15996934473514557, + "step": 1721 + }, + { + "epoch": 0.02571300582350306, + "grad_norm": 0.515625, + "grad_norm_var": 0.002164141337076823, + "learning_rate": 2e-05, + "loss": 1.2155, + "loss/crossentropy": 2.6888930797576904, + "loss/dist_ce": 0.0, + "loss/fcd": 1.046875, + "loss/idx": 13.0, + "loss/logits": 0.16863149404525757, + "step": 1722 + }, + { + "epoch": 0.025727937882634014, + "grad_norm": 0.53125, + "grad_norm_var": 0.0020746866861979167, + "learning_rate": 2e-05, + "loss": 1.3424, + "loss/crossentropy": 2.607393980026245, + "loss/dist_ce": 0.0, + "loss/fcd": 1.140625, + "loss/idx": 13.0, + "loss/logits": 0.2017740160226822, + "step": 1723 + }, + { + "epoch": 0.02574286994176497, + "grad_norm": 0.51953125, + "grad_norm_var": 0.0021071751912434896, + "learning_rate": 2e-05, + "loss": 1.2383, + "loss/crossentropy": 2.4375128746032715, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0703125, + "loss/idx": 13.0, + "loss/logits": 0.16799165308475494, + "step": 1724 + }, + { + "epoch": 0.025757802000895922, + "grad_norm": 0.486328125, + "grad_norm_var": 0.00221403439839681, + "learning_rate": 2e-05, + "loss": 1.2517, + "loss/crossentropy": 2.7144973278045654, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0703125, + "loss/idx": 13.0, + "loss/logits": 0.18143236637115479, + "step": 1725 + }, + { + "epoch": 0.025772734060026878, + "grad_norm": 0.5546875, + "grad_norm_var": 0.00208433469136556, + "learning_rate": 2e-05, + "loss": 1.316, + "loss/crossentropy": 2.503783941268921, + "loss/dist_ce": 0.0, + "loss/fcd": 1.125, + "loss/idx": 13.0, + "loss/logits": 0.19102910161018372, + "step": 1726 + }, + { + "epoch": 0.02578766611915783, + "grad_norm": 0.56640625, + "grad_norm_var": 0.0019597371419270834, + "learning_rate": 2e-05, + "loss": 1.2821, + "loss/crossentropy": 2.499298572540283, + "loss/dist_ce": 0.0, + "loss/fcd": 1.109375, + "loss/idx": 13.0, + "loss/logits": 0.17273728549480438, + "step": 1727 + }, + { + "epoch": 0.025802598178288786, + "grad_norm": 0.5234375, + "grad_norm_var": 0.001977984110514323, + "learning_rate": 2e-05, + "loss": 1.259, + "loss/crossentropy": 2.5022952556610107, + "loss/dist_ce": 0.0, + "loss/fcd": 1.078125, + "loss/idx": 13.0, + "loss/logits": 0.18091173470020294, + "step": 1728 + }, + { + "epoch": 0.025817530237419742, + "grad_norm": 0.5390625, + "grad_norm_var": 0.0019652684529622394, + "learning_rate": 2e-05, + "loss": 1.3357, + "loss/crossentropy": 2.4653351306915283, + "loss/dist_ce": 0.0, + "loss/fcd": 1.140625, + "loss/idx": 13.0, + "loss/logits": 0.19503894448280334, + "step": 1729 + }, + { + "epoch": 0.025832462296550694, + "grad_norm": 0.546875, + "grad_norm_var": 0.0019617080688476562, + "learning_rate": 2e-05, + "loss": 1.1965, + "loss/crossentropy": 2.6985909938812256, + "loss/dist_ce": 0.0, + "loss/fcd": 1.03125, + "loss/idx": 13.0, + "loss/logits": 0.16525985300540924, + "step": 1730 + }, + { + "epoch": 0.02584739435568165, + "grad_norm": 0.51953125, + "grad_norm_var": 0.00191802978515625, + "learning_rate": 2e-05, + "loss": 1.2125, + "loss/crossentropy": 2.709136486053467, + "loss/dist_ce": 0.0, + "loss/fcd": 1.03125, + "loss/idx": 13.0, + "loss/logits": 0.18129640817642212, + "step": 1731 + }, + { + "epoch": 0.025862326414812602, + "grad_norm": 0.5078125, + "grad_norm_var": 0.0012738545735677083, + "learning_rate": 2e-05, + "loss": 1.2335, + "loss/crossentropy": 2.7097113132476807, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0625, + "loss/idx": 13.0, + "loss/logits": 0.17096135020256042, + "step": 1732 + }, + { + "epoch": 0.025877258473943558, + "grad_norm": 0.609375, + "grad_norm_var": 0.001576677958170573, + "learning_rate": 2e-05, + "loss": 1.2684, + "loss/crossentropy": 2.601806879043579, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0859375, + "loss/idx": 13.0, + "loss/logits": 0.18242479860782623, + "step": 1733 + }, + { + "epoch": 0.02589219053307451, + "grad_norm": 0.53125, + "grad_norm_var": 0.0014165242513020833, + "learning_rate": 2e-05, + "loss": 1.3134, + "loss/crossentropy": 2.4615402221679688, + "loss/dist_ce": 0.0, + "loss/fcd": 1.125, + "loss/idx": 13.0, + "loss/logits": 0.18838651478290558, + "step": 1734 + }, + { + "epoch": 0.025907122592205466, + "grad_norm": 0.51953125, + "grad_norm_var": 0.0008539835611979167, + "learning_rate": 2e-05, + "loss": 1.2638, + "loss/crossentropy": 2.577078104019165, + "loss/dist_ce": 0.0, + "loss/fcd": 1.078125, + "loss/idx": 13.0, + "loss/logits": 0.18569207191467285, + "step": 1735 + }, + { + "epoch": 0.02592205465133642, + "grad_norm": 0.5859375, + "grad_norm_var": 0.0010256449381510417, + "learning_rate": 2e-05, + "loss": 1.1798, + "loss/crossentropy": 2.5145535469055176, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0234375, + "loss/idx": 13.0, + "loss/logits": 0.15635466575622559, + "step": 1736 + }, + { + "epoch": 0.025936986710467375, + "grad_norm": 0.515625, + "grad_norm_var": 0.0009591261545817058, + "learning_rate": 2e-05, + "loss": 1.2788, + "loss/crossentropy": 2.691556453704834, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1015625, + "loss/idx": 13.0, + "loss/logits": 0.17723232507705688, + "step": 1737 + }, + { + "epoch": 0.025951918769598327, + "grad_norm": 0.609375, + "grad_norm_var": 0.0012566725413004558, + "learning_rate": 2e-05, + "loss": 1.2817, + "loss/crossentropy": 2.559390068054199, + "loss/dist_ce": 0.0, + "loss/fcd": 1.078125, + "loss/idx": 13.0, + "loss/logits": 0.20356020331382751, + "step": 1738 + }, + { + "epoch": 0.025966850828729283, + "grad_norm": 0.54296875, + "grad_norm_var": 0.0012490431467692058, + "learning_rate": 2e-05, + "loss": 1.2244, + "loss/crossentropy": 2.600177526473999, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0625, + "loss/idx": 13.0, + "loss/logits": 0.16185130178928375, + "step": 1739 + }, + { + "epoch": 0.025981782887860235, + "grad_norm": 0.55859375, + "grad_norm_var": 0.0012255191802978515, + "learning_rate": 2e-05, + "loss": 1.2417, + "loss/crossentropy": 2.7534642219543457, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0703125, + "loss/idx": 13.0, + "loss/logits": 0.17136284708976746, + "step": 1740 + }, + { + "epoch": 0.02599671494699119, + "grad_norm": 0.6640625, + "grad_norm_var": 0.0018142064412434895, + "learning_rate": 2e-05, + "loss": 1.5145, + "loss/crossentropy": 2.1807971000671387, + "loss/dist_ce": 0.0, + "loss/fcd": 1.2734375, + "loss/idx": 13.0, + "loss/logits": 0.2410614937543869, + "step": 1741 + }, + { + "epoch": 0.026011647006122143, + "grad_norm": 0.79296875, + "grad_norm_var": 0.005324045817057292, + "learning_rate": 2e-05, + "loss": 1.5923, + "loss/crossentropy": 2.4337072372436523, + "loss/dist_ce": 0.0, + "loss/fcd": 1.3515625, + "loss/idx": 13.0, + "loss/logits": 0.24070346355438232, + "step": 1742 + }, + { + "epoch": 0.0260265790652531, + "grad_norm": 0.49609375, + "grad_norm_var": 0.005674235026041667, + "learning_rate": 2e-05, + "loss": 1.1684, + "loss/crossentropy": 2.5552825927734375, + "loss/dist_ce": 0.0, + "loss/fcd": 1.015625, + "loss/idx": 13.0, + "loss/logits": 0.15277054905891418, + "step": 1743 + }, + { + "epoch": 0.02604151112438405, + "grad_norm": 0.80859375, + "grad_norm_var": 0.009122657775878906, + "learning_rate": 2e-05, + "loss": 1.5857, + "loss/crossentropy": 2.7988672256469727, + "loss/dist_ce": 0.0, + "loss/fcd": 1.265625, + "loss/idx": 13.0, + "loss/logits": 0.32007214426994324, + "step": 1744 + }, + { + "epoch": 0.026056443183515007, + "grad_norm": 0.5078125, + "grad_norm_var": 0.00937188466389974, + "learning_rate": 2e-05, + "loss": 1.1453, + "loss/crossentropy": 2.5977418422698975, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0, + "loss/idx": 13.0, + "loss/logits": 0.14532379806041718, + "step": 1745 + }, + { + "epoch": 0.02607137524264596, + "grad_norm": 0.5546875, + "grad_norm_var": 0.009338823954264323, + "learning_rate": 2e-05, + "loss": 1.2278, + "loss/crossentropy": 2.459958076477051, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0546875, + "loss/idx": 13.0, + "loss/logits": 0.1731024980545044, + "step": 1746 + }, + { + "epoch": 0.026086307301776916, + "grad_norm": 0.6015625, + "grad_norm_var": 0.009067789713541666, + "learning_rate": 2e-05, + "loss": 1.257, + "loss/crossentropy": 2.615450859069824, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0859375, + "loss/idx": 13.0, + "loss/logits": 0.17106476426124573, + "step": 1747 + }, + { + "epoch": 0.026101239360907868, + "grad_norm": 0.51953125, + "grad_norm_var": 0.008951250712076824, + "learning_rate": 2e-05, + "loss": 1.2121, + "loss/crossentropy": 2.551990270614624, + "loss/dist_ce": 0.0, + "loss/fcd": 1.046875, + "loss/idx": 13.0, + "loss/logits": 0.1652519255876541, + "step": 1748 + }, + { + "epoch": 0.026116171420038824, + "grad_norm": 0.55859375, + "grad_norm_var": 0.008971913655598959, + "learning_rate": 2e-05, + "loss": 1.3636, + "loss/crossentropy": 2.399773120880127, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1796875, + "loss/idx": 13.0, + "loss/logits": 0.1839275360107422, + "step": 1749 + }, + { + "epoch": 0.026131103479169776, + "grad_norm": 0.48828125, + "grad_norm_var": 0.00939782460530599, + "learning_rate": 2e-05, + "loss": 1.1708, + "loss/crossentropy": 2.5663254261016846, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0078125, + "loss/idx": 13.0, + "loss/logits": 0.16302275657653809, + "step": 1750 + }, + { + "epoch": 0.026146035538300732, + "grad_norm": 0.5625, + "grad_norm_var": 0.009150950113932292, + "learning_rate": 2e-05, + "loss": 1.3825, + "loss/crossentropy": 2.4493865966796875, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1875, + "loss/idx": 13.0, + "loss/logits": 0.1950165033340454, + "step": 1751 + }, + { + "epoch": 0.026160967597431684, + "grad_norm": 0.59765625, + "grad_norm_var": 0.009160296122233073, + "learning_rate": 2e-05, + "loss": 1.3006, + "loss/crossentropy": 2.4346399307250977, + "loss/dist_ce": 0.0, + "loss/fcd": 1.109375, + "loss/idx": 13.0, + "loss/logits": 0.1912744641304016, + "step": 1752 + }, + { + "epoch": 0.02617589965656264, + "grad_norm": 0.578125, + "grad_norm_var": 0.00881646474202474, + "learning_rate": 2e-05, + "loss": 1.2118, + "loss/crossentropy": 2.5589816570281982, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0546875, + "loss/idx": 13.0, + "loss/logits": 0.15713083744049072, + "step": 1753 + }, + { + "epoch": 0.026190831715693592, + "grad_norm": 0.55078125, + "grad_norm_var": 0.008880360921223959, + "learning_rate": 2e-05, + "loss": 1.3028, + "loss/crossentropy": 2.65625, + "loss/dist_ce": 0.0, + "loss/fcd": 1.109375, + "loss/idx": 13.0, + "loss/logits": 0.19340017437934875, + "step": 1754 + }, + { + "epoch": 0.026205763774824548, + "grad_norm": 0.55859375, + "grad_norm_var": 0.008805084228515624, + "learning_rate": 2e-05, + "loss": 1.342, + "loss/crossentropy": 2.6520872116088867, + "loss/dist_ce": 0.0, + "loss/fcd": 1.125, + "loss/idx": 13.0, + "loss/logits": 0.217010498046875, + "step": 1755 + }, + { + "epoch": 0.026220695833955504, + "grad_norm": 0.5078125, + "grad_norm_var": 0.00916131337483724, + "learning_rate": 2e-05, + "loss": 1.273, + "loss/crossentropy": 2.5524790287017822, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0859375, + "loss/idx": 13.0, + "loss/logits": 0.1870691478252411, + "step": 1756 + }, + { + "epoch": 0.026235627893086456, + "grad_norm": 0.5, + "grad_norm_var": 0.00909722646077474, + "learning_rate": 2e-05, + "loss": 1.2431, + "loss/crossentropy": 2.7991576194763184, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0625, + "loss/idx": 13.0, + "loss/logits": 0.180614173412323, + "step": 1757 + }, + { + "epoch": 0.026250559952217412, + "grad_norm": 0.53125, + "grad_norm_var": 0.005736287434895833, + "learning_rate": 2e-05, + "loss": 1.2284, + "loss/crossentropy": 2.677903652191162, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0546875, + "loss/idx": 13.0, + "loss/logits": 0.1736885905265808, + "step": 1758 + }, + { + "epoch": 0.026265492011348365, + "grad_norm": 0.484375, + "grad_norm_var": 0.00584100087483724, + "learning_rate": 2e-05, + "loss": 1.1874, + "loss/crossentropy": 2.39847469329834, + "loss/dist_ce": 0.0, + "loss/fcd": 1.03125, + "loss/idx": 13.0, + "loss/logits": 0.15619158744812012, + "step": 1759 + }, + { + "epoch": 0.02628042407047932, + "grad_norm": 0.49609375, + "grad_norm_var": 0.0014566421508789063, + "learning_rate": 2e-05, + "loss": 1.2115, + "loss/crossentropy": 2.773829936981201, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0390625, + "loss/idx": 13.0, + "loss/logits": 0.172477588057518, + "step": 1760 + }, + { + "epoch": 0.026295356129610273, + "grad_norm": 0.52734375, + "grad_norm_var": 0.0014035542805989583, + "learning_rate": 2e-05, + "loss": 1.2803, + "loss/crossentropy": 2.3061296939849854, + "loss/dist_ce": 0.0, + "loss/fcd": 1.109375, + "loss/idx": 13.0, + "loss/logits": 0.1709517389535904, + "step": 1761 + }, + { + "epoch": 0.02631028818874123, + "grad_norm": 0.4765625, + "grad_norm_var": 0.0016171773274739583, + "learning_rate": 2e-05, + "loss": 1.1998, + "loss/crossentropy": 2.6350574493408203, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0390625, + "loss/idx": 13.0, + "loss/logits": 0.16071206331253052, + "step": 1762 + }, + { + "epoch": 0.02632522024787218, + "grad_norm": 0.498046875, + "grad_norm_var": 0.0013501326243082683, + "learning_rate": 2e-05, + "loss": 1.1946, + "loss/crossentropy": 2.479978322982788, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0390625, + "loss/idx": 13.0, + "loss/logits": 0.155495285987854, + "step": 1763 + }, + { + "epoch": 0.026340152307003137, + "grad_norm": 0.486328125, + "grad_norm_var": 0.001453081766764323, + "learning_rate": 2e-05, + "loss": 1.1679, + "loss/crossentropy": 2.6815295219421387, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0234375, + "loss/idx": 13.0, + "loss/logits": 0.14445266127586365, + "step": 1764 + }, + { + "epoch": 0.02635508436613409, + "grad_norm": 0.62109375, + "grad_norm_var": 0.0019759496053059896, + "learning_rate": 2e-05, + "loss": 1.2447, + "loss/crossentropy": 2.6957926750183105, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0703125, + "loss/idx": 13.0, + "loss/logits": 0.17434380948543549, + "step": 1765 + }, + { + "epoch": 0.026370016425265045, + "grad_norm": 0.5234375, + "grad_norm_var": 0.0018620808919270833, + "learning_rate": 2e-05, + "loss": 1.1802, + "loss/crossentropy": 2.4276416301727295, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0234375, + "loss/idx": 13.0, + "loss/logits": 0.15674927830696106, + "step": 1766 + }, + { + "epoch": 0.026384948484395997, + "grad_norm": 0.50390625, + "grad_norm_var": 0.0018325169881184896, + "learning_rate": 2e-05, + "loss": 1.2002, + "loss/crossentropy": 2.8092257976531982, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0390625, + "loss/idx": 13.0, + "loss/logits": 0.1611369252204895, + "step": 1767 + }, + { + "epoch": 0.026399880543526953, + "grad_norm": 0.5546875, + "grad_norm_var": 0.001546478271484375, + "learning_rate": 2e-05, + "loss": 1.2647, + "loss/crossentropy": 2.672966718673706, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0859375, + "loss/idx": 13.0, + "loss/logits": 0.17874416708946228, + "step": 1768 + }, + { + "epoch": 0.026414812602657906, + "grad_norm": 0.51171875, + "grad_norm_var": 0.001350847880045573, + "learning_rate": 2e-05, + "loss": 1.1984, + "loss/crossentropy": 2.601475715637207, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0234375, + "loss/idx": 13.0, + "loss/logits": 0.1749573051929474, + "step": 1769 + }, + { + "epoch": 0.02642974466178886, + "grad_norm": 0.5390625, + "grad_norm_var": 0.0013125101725260417, + "learning_rate": 2e-05, + "loss": 1.3379, + "loss/crossentropy": 2.344796895980835, + "loss/dist_ce": 0.0, + "loss/fcd": 1.125, + "loss/idx": 13.0, + "loss/logits": 0.2128661870956421, + "step": 1770 + }, + { + "epoch": 0.026444676720919814, + "grad_norm": 0.4453125, + "grad_norm_var": 0.0015319188435872395, + "learning_rate": 2e-05, + "loss": 1.1482, + "loss/crossentropy": 2.720777750015259, + "loss/dist_ce": 0.0, + "loss/fcd": 0.9921875, + "loss/idx": 13.0, + "loss/logits": 0.15602290630340576, + "step": 1771 + }, + { + "epoch": 0.02645960878005077, + "grad_norm": 0.4921875, + "grad_norm_var": 0.0015578587849934896, + "learning_rate": 2e-05, + "loss": 1.1621, + "loss/crossentropy": 2.65152645111084, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0078125, + "loss/idx": 13.0, + "loss/logits": 0.15432672202587128, + "step": 1772 + }, + { + "epoch": 0.026474540839181722, + "grad_norm": 0.51953125, + "grad_norm_var": 0.0015505472819010416, + "learning_rate": 2e-05, + "loss": 1.2169, + "loss/crossentropy": 2.48185133934021, + "loss/dist_ce": 0.0, + "loss/fcd": 1.046875, + "loss/idx": 13.0, + "loss/logits": 0.16998444497585297, + "step": 1773 + }, + { + "epoch": 0.026489472898312678, + "grad_norm": 0.5703125, + "grad_norm_var": 0.0017400105794270833, + "learning_rate": 2e-05, + "loss": 1.1641, + "loss/crossentropy": 2.5464279651641846, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0234375, + "loss/idx": 13.0, + "loss/logits": 0.14069680869579315, + "step": 1774 + }, + { + "epoch": 0.02650440495744363, + "grad_norm": 0.6484375, + "grad_norm_var": 0.002738698323567708, + "learning_rate": 2e-05, + "loss": 1.3625, + "loss/crossentropy": 2.5577807426452637, + "loss/dist_ce": 0.0, + "loss/fcd": 1.171875, + "loss/idx": 13.0, + "loss/logits": 0.19061937928199768, + "step": 1775 + }, + { + "epoch": 0.026519337016574586, + "grad_norm": 0.494140625, + "grad_norm_var": 0.002746693293253581, + "learning_rate": 2e-05, + "loss": 1.1661, + "loss/crossentropy": 2.5085034370422363, + "loss/dist_ce": 0.0, + "loss/fcd": 1.015625, + "loss/idx": 13.0, + "loss/logits": 0.15048107504844666, + "step": 1776 + }, + { + "epoch": 0.02653426907570554, + "grad_norm": 0.50390625, + "grad_norm_var": 0.002776066462198893, + "learning_rate": 2e-05, + "loss": 1.2534, + "loss/crossentropy": 2.579402208328247, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0625, + "loss/idx": 13.0, + "loss/logits": 0.19094473123550415, + "step": 1777 + }, + { + "epoch": 0.026549201134836494, + "grad_norm": 0.52734375, + "grad_norm_var": 0.0026140689849853517, + "learning_rate": 2e-05, + "loss": 1.1913, + "loss/crossentropy": 2.696247100830078, + "loss/dist_ce": 0.0, + "loss/fcd": 1.03125, + "loss/idx": 13.0, + "loss/logits": 0.16005630791187286, + "step": 1778 + }, + { + "epoch": 0.026564133193967446, + "grad_norm": 0.51171875, + "grad_norm_var": 0.0025721232096354166, + "learning_rate": 2e-05, + "loss": 1.1651, + "loss/crossentropy": 2.7263166904449463, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0078125, + "loss/idx": 13.0, + "loss/logits": 0.1572725623846054, + "step": 1779 + }, + { + "epoch": 0.026579065253098402, + "grad_norm": 0.494140625, + "grad_norm_var": 0.002532196044921875, + "learning_rate": 2e-05, + "loss": 1.2075, + "loss/crossentropy": 2.452711820602417, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0390625, + "loss/idx": 13.0, + "loss/logits": 0.16845420002937317, + "step": 1780 + }, + { + "epoch": 0.026593997312229358, + "grad_norm": 0.46484375, + "grad_norm_var": 0.002135467529296875, + "learning_rate": 2e-05, + "loss": 1.1348, + "loss/crossentropy": 2.5930423736572266, + "loss/dist_ce": 0.0, + "loss/fcd": 0.98046875, + "loss/idx": 13.0, + "loss/logits": 0.15436789393424988, + "step": 1781 + }, + { + "epoch": 0.02660892937136031, + "grad_norm": 0.734375, + "grad_norm_var": 0.00503997802734375, + "learning_rate": 2e-05, + "loss": 1.3972, + "loss/crossentropy": 2.6180036067962646, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1953125, + "loss/idx": 13.0, + "loss/logits": 0.2018566131591797, + "step": 1782 + }, + { + "epoch": 0.026623861430491266, + "grad_norm": 0.578125, + "grad_norm_var": 0.00510400136311849, + "learning_rate": 2e-05, + "loss": 1.3258, + "loss/crossentropy": 2.556471109390259, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1328125, + "loss/idx": 13.0, + "loss/logits": 0.1930120587348938, + "step": 1783 + }, + { + "epoch": 0.02663879348962222, + "grad_norm": 0.494140625, + "grad_norm_var": 0.005189243952433268, + "learning_rate": 2e-05, + "loss": 1.2076, + "loss/crossentropy": 2.686593532562256, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0390625, + "loss/idx": 13.0, + "loss/logits": 0.16853547096252441, + "step": 1784 + }, + { + "epoch": 0.026653725548753174, + "grad_norm": 0.46875, + "grad_norm_var": 0.005427026748657226, + "learning_rate": 2e-05, + "loss": 1.2387, + "loss/crossentropy": 2.6526505947113037, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0546875, + "loss/idx": 13.0, + "loss/logits": 0.18397173285484314, + "step": 1785 + }, + { + "epoch": 0.026668657607884127, + "grad_norm": 0.640625, + "grad_norm_var": 0.00618907610575358, + "learning_rate": 2e-05, + "loss": 1.3109, + "loss/crossentropy": 2.4550328254699707, + "loss/dist_ce": 0.0, + "loss/fcd": 1.125, + "loss/idx": 13.0, + "loss/logits": 0.1859399974346161, + "step": 1786 + }, + { + "epoch": 0.026683589667015083, + "grad_norm": 0.5703125, + "grad_norm_var": 0.005641794204711914, + "learning_rate": 2e-05, + "loss": 1.341, + "loss/crossentropy": 2.7371487617492676, + "loss/dist_ce": 0.0, + "loss/fcd": 1.125, + "loss/idx": 13.0, + "loss/logits": 0.21602407097816467, + "step": 1787 + }, + { + "epoch": 0.026698521726146035, + "grad_norm": 0.66015625, + "grad_norm_var": 0.006232309341430664, + "learning_rate": 2e-05, + "loss": 1.3813, + "loss/crossentropy": 2.7988855838775635, + "loss/dist_ce": 0.0, + "loss/fcd": 1.171875, + "loss/idx": 13.0, + "loss/logits": 0.2093946784734726, + "step": 1788 + }, + { + "epoch": 0.02671345378527699, + "grad_norm": 0.5234375, + "grad_norm_var": 0.006214761734008789, + "learning_rate": 2e-05, + "loss": 1.3025, + "loss/crossentropy": 2.5311191082000732, + "loss/dist_ce": 0.0, + "loss/fcd": 1.109375, + "loss/idx": 13.0, + "loss/logits": 0.1931311935186386, + "step": 1789 + }, + { + "epoch": 0.026728385844407943, + "grad_norm": 0.50390625, + "grad_norm_var": 0.006357431411743164, + "learning_rate": 2e-05, + "loss": 1.2538, + "loss/crossentropy": 2.6057794094085693, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0703125, + "loss/idx": 13.0, + "loss/logits": 0.18352286517620087, + "step": 1790 + }, + { + "epoch": 0.0267433179035389, + "grad_norm": 0.52734375, + "grad_norm_var": 0.005703083674112956, + "learning_rate": 2e-05, + "loss": 1.1792, + "loss/crossentropy": 2.5075063705444336, + "loss/dist_ce": 0.0, + "loss/fcd": 1.015625, + "loss/idx": 13.0, + "loss/logits": 0.1635546088218689, + "step": 1791 + }, + { + "epoch": 0.02675824996266985, + "grad_norm": 0.625, + "grad_norm_var": 0.005910746256510417, + "learning_rate": 2e-05, + "loss": 1.3609, + "loss/crossentropy": 2.534444570541382, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1796875, + "loss/idx": 13.0, + "loss/logits": 0.18119356036186218, + "step": 1792 + }, + { + "epoch": 0.026773182021800807, + "grad_norm": 0.51953125, + "grad_norm_var": 0.005826314290364583, + "learning_rate": 2e-05, + "loss": 1.2744, + "loss/crossentropy": 2.717726230621338, + "loss/dist_ce": 0.0, + "loss/fcd": 1.09375, + "loss/idx": 13.0, + "loss/logits": 0.18060529232025146, + "step": 1793 + }, + { + "epoch": 0.02678811408093176, + "grad_norm": 0.58203125, + "grad_norm_var": 0.005828094482421875, + "learning_rate": 2e-05, + "loss": 1.3391, + "loss/crossentropy": 2.476660966873169, + "loss/dist_ce": 0.0, + "loss/fcd": 1.140625, + "loss/idx": 13.0, + "loss/logits": 0.1984376311302185, + "step": 1794 + }, + { + "epoch": 0.026803046140062715, + "grad_norm": 0.5, + "grad_norm_var": 0.005906105041503906, + "learning_rate": 2e-05, + "loss": 1.2207, + "loss/crossentropy": 2.204763174057007, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0546875, + "loss/idx": 13.0, + "loss/logits": 0.16599825024604797, + "step": 1795 + }, + { + "epoch": 0.026817978199193668, + "grad_norm": 0.56640625, + "grad_norm_var": 0.005642048517862956, + "learning_rate": 2e-05, + "loss": 1.314, + "loss/crossentropy": 2.2588131427764893, + "loss/dist_ce": 0.0, + "loss/fcd": 1.125, + "loss/idx": 13.0, + "loss/logits": 0.18899735808372498, + "step": 1796 + }, + { + "epoch": 0.026832910258324624, + "grad_norm": 0.546875, + "grad_norm_var": 0.0050225416819254555, + "learning_rate": 2e-05, + "loss": 1.266, + "loss/crossentropy": 2.7214598655700684, + "loss/dist_ce": 0.0, + "loss/fcd": 1.078125, + "loss/idx": 13.0, + "loss/logits": 0.1878923773765564, + "step": 1797 + }, + { + "epoch": 0.026847842317455576, + "grad_norm": 0.65234375, + "grad_norm_var": 0.0035912672678629558, + "learning_rate": 2e-05, + "loss": 1.4791, + "loss/crossentropy": 2.7222535610198975, + "loss/dist_ce": 0.0, + "loss/fcd": 1.2265625, + "loss/idx": 13.0, + "loss/logits": 0.25250500440597534, + "step": 1798 + }, + { + "epoch": 0.026862774376586532, + "grad_norm": 0.51953125, + "grad_norm_var": 0.0036637465159098308, + "learning_rate": 2e-05, + "loss": 1.2162, + "loss/crossentropy": 2.938828468322754, + "loss/dist_ce": 0.0, + "loss/fcd": 1.046875, + "loss/idx": 13.0, + "loss/logits": 0.16931620240211487, + "step": 1799 + }, + { + "epoch": 0.026877706435717484, + "grad_norm": 0.55859375, + "grad_norm_var": 0.003389422098795573, + "learning_rate": 2e-05, + "loss": 1.2342, + "loss/crossentropy": 2.7594287395477295, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0703125, + "loss/idx": 13.0, + "loss/logits": 0.16388383507728577, + "step": 1800 + }, + { + "epoch": 0.02689263849484844, + "grad_norm": 0.5625, + "grad_norm_var": 0.002794329325358073, + "learning_rate": 2e-05, + "loss": 1.3562, + "loss/crossentropy": 2.6729891300201416, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1484375, + "loss/idx": 13.0, + "loss/logits": 0.2077617645263672, + "step": 1801 + }, + { + "epoch": 0.026907570553979392, + "grad_norm": 0.462890625, + "grad_norm_var": 0.003004058202107747, + "learning_rate": 2e-05, + "loss": 1.1714, + "loss/crossentropy": 2.5881118774414062, + "loss/dist_ce": 0.0, + "loss/fcd": 1.015625, + "loss/idx": 13.0, + "loss/logits": 0.15578031539916992, + "step": 1802 + }, + { + "epoch": 0.026922502613110348, + "grad_norm": 0.74609375, + "grad_norm_var": 0.005292876561482748, + "learning_rate": 2e-05, + "loss": 1.3937, + "loss/crossentropy": 2.675889253616333, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1953125, + "loss/idx": 13.0, + "loss/logits": 0.19843614101409912, + "step": 1803 + }, + { + "epoch": 0.0269374346722413, + "grad_norm": 0.54296875, + "grad_norm_var": 0.004680617650349935, + "learning_rate": 2e-05, + "loss": 1.2514, + "loss/crossentropy": 2.4744293689727783, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0703125, + "loss/idx": 13.0, + "loss/logits": 0.18113207817077637, + "step": 1804 + }, + { + "epoch": 0.026952366731372256, + "grad_norm": 0.50390625, + "grad_norm_var": 0.004796330134073893, + "learning_rate": 2e-05, + "loss": 1.1718, + "loss/crossentropy": 2.706061363220215, + "loss/dist_ce": 0.0, + "loss/fcd": 1.015625, + "loss/idx": 13.0, + "loss/logits": 0.1561264991760254, + "step": 1805 + }, + { + "epoch": 0.02696729879050321, + "grad_norm": 0.5234375, + "grad_norm_var": 0.004680617650349935, + "learning_rate": 2e-05, + "loss": 1.1883, + "loss/crossentropy": 2.48380970954895, + "loss/dist_ce": 0.0, + "loss/fcd": 1.03125, + "loss/idx": 13.0, + "loss/logits": 0.15707790851593018, + "step": 1806 + }, + { + "epoch": 0.026982230849634165, + "grad_norm": 0.6484375, + "grad_norm_var": 0.005090570449829102, + "learning_rate": 2e-05, + "loss": 1.3034, + "loss/crossentropy": 2.59531569480896, + "loss/dist_ce": 0.0, + "loss/fcd": 1.125, + "loss/idx": 13.0, + "loss/logits": 0.17839841544628143, + "step": 1807 + }, + { + "epoch": 0.02699716290876512, + "grad_norm": 0.5234375, + "grad_norm_var": 0.004940144220987956, + "learning_rate": 2e-05, + "loss": 1.2338, + "loss/crossentropy": 2.7297565937042236, + "loss/dist_ce": 0.0, + "loss/fcd": 1.046875, + "loss/idx": 13.0, + "loss/logits": 0.1869654804468155, + "step": 1808 + }, + { + "epoch": 0.027012094967896073, + "grad_norm": 0.51171875, + "grad_norm_var": 0.0049860477447509766, + "learning_rate": 2e-05, + "loss": 1.2296, + "loss/crossentropy": 2.546668291091919, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0625, + "loss/idx": 13.0, + "loss/logits": 0.16706717014312744, + "step": 1809 + }, + { + "epoch": 0.02702702702702703, + "grad_norm": 0.4921875, + "grad_norm_var": 0.00522001584370931, + "learning_rate": 2e-05, + "loss": 1.2034, + "loss/crossentropy": 2.6435706615448, + "loss/dist_ce": 0.0, + "loss/fcd": 1.03125, + "loss/idx": 13.0, + "loss/logits": 0.17213952541351318, + "step": 1810 + }, + { + "epoch": 0.02704195908615798, + "grad_norm": 0.53125, + "grad_norm_var": 0.00505674680074056, + "learning_rate": 2e-05, + "loss": 1.2388, + "loss/crossentropy": 2.6985273361206055, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0625, + "loss/idx": 13.0, + "loss/logits": 0.17626091837882996, + "step": 1811 + }, + { + "epoch": 0.027056891145288937, + "grad_norm": 0.51953125, + "grad_norm_var": 0.005127700169881185, + "learning_rate": 2e-05, + "loss": 1.2427, + "loss/crossentropy": 2.4755988121032715, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0546875, + "loss/idx": 13.0, + "loss/logits": 0.18802842497825623, + "step": 1812 + }, + { + "epoch": 0.02707182320441989, + "grad_norm": 0.5078125, + "grad_norm_var": 0.005254220962524414, + "learning_rate": 2e-05, + "loss": 1.1422, + "loss/crossentropy": 2.6311707496643066, + "loss/dist_ce": 0.0, + "loss/fcd": 0.98828125, + "loss/idx": 13.0, + "loss/logits": 0.1539306342601776, + "step": 1813 + }, + { + "epoch": 0.027086755263550845, + "grad_norm": 0.578125, + "grad_norm_var": 0.00458982785542806, + "learning_rate": 2e-05, + "loss": 1.2927, + "loss/crossentropy": 2.8410966396331787, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0859375, + "loss/idx": 13.0, + "loss/logits": 0.20679564774036407, + "step": 1814 + }, + { + "epoch": 0.027101687322681797, + "grad_norm": 0.48828125, + "grad_norm_var": 0.004760217666625976, + "learning_rate": 2e-05, + "loss": 1.2574, + "loss/crossentropy": 2.6344683170318604, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0859375, + "loss/idx": 13.0, + "loss/logits": 0.1714169979095459, + "step": 1815 + }, + { + "epoch": 0.027116619381812753, + "grad_norm": 0.50390625, + "grad_norm_var": 0.0048394362131754555, + "learning_rate": 2e-05, + "loss": 1.2733, + "loss/crossentropy": 2.4946045875549316, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0859375, + "loss/idx": 13.0, + "loss/logits": 0.18738268315792084, + "step": 1816 + }, + { + "epoch": 0.027131551440943705, + "grad_norm": 0.53125, + "grad_norm_var": 0.004808410008748373, + "learning_rate": 2e-05, + "loss": 1.1028, + "loss/crossentropy": 2.759798526763916, + "loss/dist_ce": 0.0, + "loss/fcd": 0.94921875, + "loss/idx": 13.0, + "loss/logits": 0.15359053015708923, + "step": 1817 + }, + { + "epoch": 0.02714648350007466, + "grad_norm": 0.5078125, + "grad_norm_var": 0.004481951395670573, + "learning_rate": 2e-05, + "loss": 1.2177, + "loss/crossentropy": 2.5690910816192627, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0546875, + "loss/idx": 13.0, + "loss/logits": 0.16301162540912628, + "step": 1818 + }, + { + "epoch": 0.027161415559205614, + "grad_norm": 0.5546875, + "grad_norm_var": 0.001544189453125, + "learning_rate": 2e-05, + "loss": 1.2606, + "loss/crossentropy": 2.410215377807617, + "loss/dist_ce": 0.0, + "loss/fcd": 1.09375, + "loss/idx": 13.0, + "loss/logits": 0.16685530543327332, + "step": 1819 + }, + { + "epoch": 0.02717634761833657, + "grad_norm": 0.48828125, + "grad_norm_var": 0.0016314188639322917, + "learning_rate": 2e-05, + "loss": 1.1857, + "loss/crossentropy": 2.719848394393921, + "loss/dist_ce": 0.0, + "loss/fcd": 1.03125, + "loss/idx": 13.0, + "loss/logits": 0.15441551804542542, + "step": 1820 + }, + { + "epoch": 0.027191279677467522, + "grad_norm": 0.56640625, + "grad_norm_var": 0.0016924540201822917, + "learning_rate": 2e-05, + "loss": 1.2924, + "loss/crossentropy": 2.547701358795166, + "loss/dist_ce": 0.0, + "loss/fcd": 1.09375, + "loss/idx": 13.0, + "loss/logits": 0.1986786127090454, + "step": 1821 + }, + { + "epoch": 0.027206211736598478, + "grad_norm": 0.54296875, + "grad_norm_var": 0.0016997655232747395, + "learning_rate": 2e-05, + "loss": 1.2618, + "loss/crossentropy": 2.6193835735321045, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0859375, + "loss/idx": 13.0, + "loss/logits": 0.17586740851402283, + "step": 1822 + }, + { + "epoch": 0.02722114379572943, + "grad_norm": 0.65234375, + "grad_norm_var": 0.0017618815104166667, + "learning_rate": 2e-05, + "loss": 1.2637, + "loss/crossentropy": 2.512247323989868, + "loss/dist_ce": 0.0, + "loss/fcd": 1.078125, + "loss/idx": 13.0, + "loss/logits": 0.1855495274066925, + "step": 1823 + }, + { + "epoch": 0.027236075854860386, + "grad_norm": 0.5078125, + "grad_norm_var": 0.0017934163411458333, + "learning_rate": 2e-05, + "loss": 1.235, + "loss/crossentropy": 2.514695167541504, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0703125, + "loss/idx": 13.0, + "loss/logits": 0.16470174491405487, + "step": 1824 + }, + { + "epoch": 0.027251007913991338, + "grad_norm": 0.49609375, + "grad_norm_var": 0.0018473307291666666, + "learning_rate": 2e-05, + "loss": 1.1994, + "loss/crossentropy": 2.5031635761260986, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0390625, + "loss/idx": 13.0, + "loss/logits": 0.16035684943199158, + "step": 1825 + }, + { + "epoch": 0.027265939973122294, + "grad_norm": 0.53515625, + "grad_norm_var": 0.0017501195271809897, + "learning_rate": 2e-05, + "loss": 1.1901, + "loss/crossentropy": 2.6170687675476074, + "loss/dist_ce": 0.0, + "loss/fcd": 1.03125, + "loss/idx": 13.0, + "loss/logits": 0.1588709056377411, + "step": 1826 + }, + { + "epoch": 0.027280872032253246, + "grad_norm": 0.53515625, + "grad_norm_var": 0.0017506917317708333, + "learning_rate": 2e-05, + "loss": 1.29, + "loss/crossentropy": 2.467294216156006, + "loss/dist_ce": 0.0, + "loss/fcd": 1.109375, + "loss/idx": 13.0, + "loss/logits": 0.18067091703414917, + "step": 1827 + }, + { + "epoch": 0.027295804091384202, + "grad_norm": 0.48046875, + "grad_norm_var": 0.0019121805826822916, + "learning_rate": 2e-05, + "loss": 1.1621, + "loss/crossentropy": 2.658353805541992, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0, + "loss/idx": 13.0, + "loss/logits": 0.16208136081695557, + "step": 1828 + }, + { + "epoch": 0.027310736150515155, + "grad_norm": 0.484375, + "grad_norm_var": 0.0020151774088541666, + "learning_rate": 2e-05, + "loss": 1.1345, + "loss/crossentropy": 2.707339286804199, + "loss/dist_ce": 0.0, + "loss/fcd": 0.984375, + "loss/idx": 13.0, + "loss/logits": 0.15014402568340302, + "step": 1829 + }, + { + "epoch": 0.02732566820964611, + "grad_norm": 0.7578125, + "grad_norm_var": 0.0052263895670572914, + "learning_rate": 2e-05, + "loss": 1.3429, + "loss/crossentropy": 2.3926897048950195, + "loss/dist_ce": 0.0, + "loss/fcd": 1.171875, + "loss/idx": 13.0, + "loss/logits": 0.17105039954185486, + "step": 1830 + }, + { + "epoch": 0.027340600268777063, + "grad_norm": 0.60546875, + "grad_norm_var": 0.0052836100260416664, + "learning_rate": 2e-05, + "loss": 1.389, + "loss/crossentropy": 2.5021653175354004, + "loss/dist_ce": 0.0, + "loss/fcd": 1.203125, + "loss/idx": 13.0, + "loss/logits": 0.18584418296813965, + "step": 1831 + }, + { + "epoch": 0.02735553232790802, + "grad_norm": 0.52734375, + "grad_norm_var": 0.005183664957682291, + "learning_rate": 2e-05, + "loss": 1.1209, + "loss/crossentropy": 2.6762871742248535, + "loss/dist_ce": 0.0, + "loss/fcd": 0.97265625, + "loss/idx": 13.0, + "loss/logits": 0.14821532368659973, + "step": 1832 + }, + { + "epoch": 0.027370464387038974, + "grad_norm": 0.55859375, + "grad_norm_var": 0.0051680882771809895, + "learning_rate": 2e-05, + "loss": 1.2086, + "loss/crossentropy": 2.7123446464538574, + "loss/dist_ce": 0.0, + "loss/fcd": 1.046875, + "loss/idx": 13.0, + "loss/logits": 0.16171371936798096, + "step": 1833 + }, + { + "epoch": 0.027385396446169927, + "grad_norm": 0.5703125, + "grad_norm_var": 0.005060259501139323, + "learning_rate": 2e-05, + "loss": 1.2813, + "loss/crossentropy": 2.3558154106140137, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1015625, + "loss/idx": 13.0, + "loss/logits": 0.1797207146883011, + "step": 1834 + }, + { + "epoch": 0.027400328505300883, + "grad_norm": 0.5546875, + "grad_norm_var": 0.005060259501139323, + "learning_rate": 2e-05, + "loss": 1.2775, + "loss/crossentropy": 2.6045339107513428, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1015625, + "loss/idx": 13.0, + "loss/logits": 0.1759113073348999, + "step": 1835 + }, + { + "epoch": 0.027415260564431835, + "grad_norm": 0.51953125, + "grad_norm_var": 0.0048476537068684895, + "learning_rate": 2e-05, + "loss": 1.0849, + "loss/crossentropy": 2.608766794204712, + "loss/dist_ce": 0.0, + "loss/fcd": 0.94140625, + "loss/idx": 13.0, + "loss/logits": 0.14344725012779236, + "step": 1836 + }, + { + "epoch": 0.02743019262356279, + "grad_norm": 0.4609375, + "grad_norm_var": 0.005395253499348958, + "learning_rate": 2e-05, + "loss": 1.1483, + "loss/crossentropy": 2.852187395095825, + "loss/dist_ce": 0.0, + "loss/fcd": 0.9921875, + "loss/idx": 13.0, + "loss/logits": 0.1560676395893097, + "step": 1837 + }, + { + "epoch": 0.027445124682693743, + "grad_norm": 0.59765625, + "grad_norm_var": 0.005535888671875, + "learning_rate": 2e-05, + "loss": 1.3112, + "loss/crossentropy": 2.568100690841675, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1328125, + "loss/idx": 13.0, + "loss/logits": 0.17836827039718628, + "step": 1838 + }, + { + "epoch": 0.0274600567418247, + "grad_norm": 0.546875, + "grad_norm_var": 0.004830360412597656, + "learning_rate": 2e-05, + "loss": 1.2512, + "loss/crossentropy": 2.4580633640289307, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0703125, + "loss/idx": 13.0, + "loss/logits": 0.18085670471191406, + "step": 1839 + }, + { + "epoch": 0.02747498880095565, + "grad_norm": 0.51171875, + "grad_norm_var": 0.004811350504557292, + "learning_rate": 2e-05, + "loss": 1.1555, + "loss/crossentropy": 2.5179049968719482, + "loss/dist_ce": 0.0, + "loss/fcd": 1.015625, + "loss/idx": 13.0, + "loss/logits": 0.13983239233493805, + "step": 1840 + }, + { + "epoch": 0.027489920860086607, + "grad_norm": 0.53515625, + "grad_norm_var": 0.004644775390625, + "learning_rate": 2e-05, + "loss": 1.2934, + "loss/crossentropy": 2.557543992996216, + "loss/dist_ce": 0.0, + "loss/fcd": 1.09375, + "loss/idx": 13.0, + "loss/logits": 0.19966451823711395, + "step": 1841 + }, + { + "epoch": 0.02750485291921756, + "grad_norm": 0.5234375, + "grad_norm_var": 0.004674720764160156, + "learning_rate": 2e-05, + "loss": 1.2074, + "loss/crossentropy": 2.637477159500122, + "loss/dist_ce": 0.0, + "loss/fcd": 1.046875, + "loss/idx": 13.0, + "loss/logits": 0.16053849458694458, + "step": 1842 + }, + { + "epoch": 0.027519784978348515, + "grad_norm": 0.55859375, + "grad_norm_var": 0.004668617248535156, + "learning_rate": 2e-05, + "loss": 1.2512, + "loss/crossentropy": 2.749081611633301, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0703125, + "loss/idx": 13.0, + "loss/logits": 0.1809091717004776, + "step": 1843 + }, + { + "epoch": 0.027534717037479468, + "grad_norm": 0.59765625, + "grad_norm_var": 0.004447364807128906, + "learning_rate": 2e-05, + "loss": 1.2723, + "loss/crossentropy": 2.6513166427612305, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0859375, + "loss/idx": 13.0, + "loss/logits": 0.18633897602558136, + "step": 1844 + }, + { + "epoch": 0.027549649096610423, + "grad_norm": 0.62109375, + "grad_norm_var": 0.0042938232421875, + "learning_rate": 2e-05, + "loss": 1.2902, + "loss/crossentropy": 2.4566802978515625, + "loss/dist_ce": 0.0, + "loss/fcd": 1.109375, + "loss/idx": 13.0, + "loss/logits": 0.18077588081359863, + "step": 1845 + }, + { + "epoch": 0.027564581155741376, + "grad_norm": 0.46484375, + "grad_norm_var": 0.0021432876586914063, + "learning_rate": 2e-05, + "loss": 1.1489, + "loss/crossentropy": 2.5728206634521484, + "loss/dist_ce": 0.0, + "loss/fcd": 0.99609375, + "loss/idx": 13.0, + "loss/logits": 0.15283794701099396, + "step": 1846 + }, + { + "epoch": 0.02757951321487233, + "grad_norm": 0.4921875, + "grad_norm_var": 0.002064005533854167, + "learning_rate": 2e-05, + "loss": 1.1915, + "loss/crossentropy": 2.5025198459625244, + "loss/dist_ce": 0.0, + "loss/fcd": 1.03125, + "loss/idx": 13.0, + "loss/logits": 0.1602761149406433, + "step": 1847 + }, + { + "epoch": 0.027594445274003284, + "grad_norm": 0.478515625, + "grad_norm_var": 0.002295668919881185, + "learning_rate": 2e-05, + "loss": 1.154, + "loss/crossentropy": 2.742009401321411, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0, + "loss/idx": 13.0, + "loss/logits": 0.15398138761520386, + "step": 1848 + }, + { + "epoch": 0.02760937733313424, + "grad_norm": 0.484375, + "grad_norm_var": 0.0024261315663655597, + "learning_rate": 2e-05, + "loss": 1.2092, + "loss/crossentropy": 2.5691721439361572, + "loss/dist_ce": 0.0, + "loss/fcd": 1.03125, + "loss/idx": 13.0, + "loss/logits": 0.17790091037750244, + "step": 1849 + }, + { + "epoch": 0.027624309392265192, + "grad_norm": 0.498046875, + "grad_norm_var": 0.0023867289225260415, + "learning_rate": 2e-05, + "loss": 1.2136, + "loss/crossentropy": 2.7507925033569336, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0546875, + "loss/idx": 13.0, + "loss/logits": 0.1589614599943161, + "step": 1850 + }, + { + "epoch": 0.027639241451396148, + "grad_norm": 0.482421875, + "grad_norm_var": 0.002454360326131185, + "learning_rate": 2e-05, + "loss": 1.2141, + "loss/crossentropy": 2.597674608230591, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0390625, + "loss/idx": 13.0, + "loss/logits": 0.17499211430549622, + "step": 1851 + }, + { + "epoch": 0.0276541735105271, + "grad_norm": 0.59375, + "grad_norm_var": 0.0027611891428629557, + "learning_rate": 2e-05, + "loss": 1.2882, + "loss/crossentropy": 2.4266016483306885, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1015625, + "loss/idx": 13.0, + "loss/logits": 0.18663495779037476, + "step": 1852 + }, + { + "epoch": 0.027669105569658056, + "grad_norm": 0.515625, + "grad_norm_var": 0.002459446589152018, + "learning_rate": 2e-05, + "loss": 1.2623, + "loss/crossentropy": 2.6035945415496826, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1015625, + "loss/idx": 13.0, + "loss/logits": 0.1607305407524109, + "step": 1853 + }, + { + "epoch": 0.02768403762878901, + "grad_norm": 0.48828125, + "grad_norm_var": 0.002240482966105143, + "learning_rate": 2e-05, + "loss": 1.2368, + "loss/crossentropy": 2.7356526851654053, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0625, + "loss/idx": 13.0, + "loss/logits": 0.17425121366977692, + "step": 1854 + }, + { + "epoch": 0.027698969687919964, + "grad_norm": 0.53125, + "grad_norm_var": 0.0022092024485270184, + "learning_rate": 2e-05, + "loss": 1.1971, + "loss/crossentropy": 2.591665029525757, + "loss/dist_ce": 0.0, + "loss/fcd": 1.03125, + "loss/idx": 13.0, + "loss/logits": 0.16583198308944702, + "step": 1855 + }, + { + "epoch": 0.027713901747050917, + "grad_norm": 0.5703125, + "grad_norm_var": 0.0023312727610270184, + "learning_rate": 2e-05, + "loss": 1.2682, + "loss/crossentropy": 2.8954193592071533, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0859375, + "loss/idx": 13.0, + "loss/logits": 0.1822158247232437, + "step": 1856 + }, + { + "epoch": 0.027728833806181873, + "grad_norm": 0.55859375, + "grad_norm_var": 0.002390400568644206, + "learning_rate": 2e-05, + "loss": 1.4086, + "loss/crossentropy": 2.160271167755127, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1953125, + "loss/idx": 13.0, + "loss/logits": 0.21325430274009705, + "step": 1857 + }, + { + "epoch": 0.027743765865312825, + "grad_norm": 0.52734375, + "grad_norm_var": 0.002388620376586914, + "learning_rate": 2e-05, + "loss": 1.2161, + "loss/crossentropy": 2.652884006500244, + "loss/dist_ce": 0.0, + "loss/fcd": 1.046875, + "loss/idx": 13.0, + "loss/logits": 0.16925424337387085, + "step": 1858 + }, + { + "epoch": 0.02775869792444378, + "grad_norm": 0.54296875, + "grad_norm_var": 0.002342081069946289, + "learning_rate": 2e-05, + "loss": 1.3669, + "loss/crossentropy": 2.712960720062256, + "loss/dist_ce": 0.0, + "loss/fcd": 1.171875, + "loss/idx": 13.0, + "loss/logits": 0.19504106044769287, + "step": 1859 + }, + { + "epoch": 0.027773629983574737, + "grad_norm": 0.55078125, + "grad_norm_var": 0.0020437717437744142, + "learning_rate": 2e-05, + "loss": 1.2549, + "loss/crossentropy": 2.463047742843628, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0859375, + "loss/idx": 13.0, + "loss/logits": 0.16891682147979736, + "step": 1860 + }, + { + "epoch": 0.02778856204270569, + "grad_norm": 0.5390625, + "grad_norm_var": 0.001413583755493164, + "learning_rate": 2e-05, + "loss": 1.1812, + "loss/crossentropy": 2.5723025798797607, + "loss/dist_ce": 0.0, + "loss/fcd": 1.03125, + "loss/idx": 13.0, + "loss/logits": 0.14994603395462036, + "step": 1861 + }, + { + "epoch": 0.027803494101836645, + "grad_norm": 0.5546875, + "grad_norm_var": 0.0012585798899332683, + "learning_rate": 2e-05, + "loss": 1.3497, + "loss/crossentropy": 2.5939555168151855, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1484375, + "loss/idx": 13.0, + "loss/logits": 0.20121702551841736, + "step": 1862 + }, + { + "epoch": 0.027818426160967597, + "grad_norm": 0.5703125, + "grad_norm_var": 0.0012929121653238933, + "learning_rate": 2e-05, + "loss": 1.3497, + "loss/crossentropy": 2.595493793487549, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1484375, + "loss/idx": 13.0, + "loss/logits": 0.20122528076171875, + "step": 1863 + }, + { + "epoch": 0.027833358220098553, + "grad_norm": 0.515625, + "grad_norm_var": 0.001122283935546875, + "learning_rate": 2e-05, + "loss": 1.2143, + "loss/crossentropy": 2.7929248809814453, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0390625, + "loss/idx": 13.0, + "loss/logits": 0.17519071698188782, + "step": 1864 + }, + { + "epoch": 0.027848290279229505, + "grad_norm": 0.54296875, + "grad_norm_var": 0.0009592056274414062, + "learning_rate": 2e-05, + "loss": 1.2764, + "loss/crossentropy": 2.555880069732666, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1015625, + "loss/idx": 13.0, + "loss/logits": 0.17485883831977844, + "step": 1865 + }, + { + "epoch": 0.02786322233836046, + "grad_norm": 0.4765625, + "grad_norm_var": 0.001097853978474935, + "learning_rate": 2e-05, + "loss": 1.2594, + "loss/crossentropy": 2.3144571781158447, + "loss/dist_ce": 0.0, + "loss/fcd": 1.078125, + "loss/idx": 13.0, + "loss/logits": 0.18126043677330017, + "step": 1866 + }, + { + "epoch": 0.027878154397491414, + "grad_norm": 0.5859375, + "grad_norm_var": 0.001041412353515625, + "learning_rate": 2e-05, + "loss": 1.2723, + "loss/crossentropy": 2.8717572689056396, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1015625, + "loss/idx": 13.0, + "loss/logits": 0.17073537409305573, + "step": 1867 + }, + { + "epoch": 0.02789308645662237, + "grad_norm": 0.53125, + "grad_norm_var": 0.0008501688639322917, + "learning_rate": 2e-05, + "loss": 1.2706, + "loss/crossentropy": 2.798926830291748, + "loss/dist_ce": 0.0, + "loss/fcd": 1.078125, + "loss/idx": 13.0, + "loss/logits": 0.19248628616333008, + "step": 1868 + }, + { + "epoch": 0.02790801851575332, + "grad_norm": 0.56640625, + "grad_norm_var": 0.0008625666300455729, + "learning_rate": 2e-05, + "loss": 1.1766, + "loss/crossentropy": 2.5379748344421387, + "loss/dist_ce": 0.0, + "loss/fcd": 1.015625, + "loss/idx": 13.0, + "loss/logits": 0.16097001731395721, + "step": 1869 + }, + { + "epoch": 0.027922950574884278, + "grad_norm": 0.55078125, + "grad_norm_var": 0.0006692886352539062, + "learning_rate": 2e-05, + "loss": 1.2328, + "loss/crossentropy": 2.607471227645874, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0546875, + "loss/idx": 13.0, + "loss/logits": 0.17812922596931458, + "step": 1870 + }, + { + "epoch": 0.02793788263401523, + "grad_norm": 0.56640625, + "grad_norm_var": 0.00068359375, + "learning_rate": 2e-05, + "loss": 1.233, + "loss/crossentropy": 2.5563502311706543, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0546875, + "loss/idx": 13.0, + "loss/logits": 0.17826440930366516, + "step": 1871 + }, + { + "epoch": 0.027952814693146186, + "grad_norm": 0.62109375, + "grad_norm_var": 0.0010034561157226563, + "learning_rate": 2e-05, + "loss": 1.3474, + "loss/crossentropy": 2.455146551132202, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1640625, + "loss/idx": 13.0, + "loss/logits": 0.18337559700012207, + "step": 1872 + }, + { + "epoch": 0.027967746752277138, + "grad_norm": 0.498046875, + "grad_norm_var": 0.0011635939280192056, + "learning_rate": 2e-05, + "loss": 1.1542, + "loss/crossentropy": 2.446068525314331, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0078125, + "loss/idx": 13.0, + "loss/logits": 0.146395742893219, + "step": 1873 + }, + { + "epoch": 0.027982678811408094, + "grad_norm": 0.56640625, + "grad_norm_var": 0.0011604150136311849, + "learning_rate": 2e-05, + "loss": 1.3081, + "loss/crossentropy": 2.396256685256958, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1015625, + "loss/idx": 13.0, + "loss/logits": 0.20653998851776123, + "step": 1874 + }, + { + "epoch": 0.027997610870539046, + "grad_norm": 0.56640625, + "grad_norm_var": 0.0011768182118733724, + "learning_rate": 2e-05, + "loss": 1.2419, + "loss/crossentropy": 2.4543728828430176, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0703125, + "loss/idx": 13.0, + "loss/logits": 0.17160436511039734, + "step": 1875 + }, + { + "epoch": 0.028012542929670002, + "grad_norm": 0.53515625, + "grad_norm_var": 0.0011908054351806641, + "learning_rate": 2e-05, + "loss": 1.2603, + "loss/crossentropy": 2.67966365814209, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0625, + "loss/idx": 13.0, + "loss/logits": 0.1977955400943756, + "step": 1876 + }, + { + "epoch": 0.028027474988800954, + "grad_norm": 0.6875, + "grad_norm_var": 0.0023673852284749348, + "learning_rate": 2e-05, + "loss": 1.3841, + "loss/crossentropy": 2.7667315006256104, + "loss/dist_ce": 0.0, + "loss/fcd": 1.171875, + "loss/idx": 13.0, + "loss/logits": 0.21221590042114258, + "step": 1877 + }, + { + "epoch": 0.02804240704793191, + "grad_norm": 0.4921875, + "grad_norm_var": 0.0026430606842041014, + "learning_rate": 2e-05, + "loss": 1.1679, + "loss/crossentropy": 2.534827470779419, + "loss/dist_ce": 0.0, + "loss/fcd": 1.015625, + "loss/idx": 13.0, + "loss/logits": 0.1522301733493805, + "step": 1878 + }, + { + "epoch": 0.028057339107062863, + "grad_norm": 0.671875, + "grad_norm_var": 0.0035009860992431642, + "learning_rate": 2e-05, + "loss": 1.426, + "loss/crossentropy": 2.4057843685150146, + "loss/dist_ce": 0.0, + "loss/fcd": 1.2109375, + "loss/idx": 13.0, + "loss/logits": 0.2150222361087799, + "step": 1879 + }, + { + "epoch": 0.02807227116619382, + "grad_norm": 0.59765625, + "grad_norm_var": 0.003426218032836914, + "learning_rate": 2e-05, + "loss": 1.347, + "loss/crossentropy": 2.831153154373169, + "loss/dist_ce": 0.0, + "loss/fcd": 1.140625, + "loss/idx": 13.0, + "loss/logits": 0.20638611912727356, + "step": 1880 + }, + { + "epoch": 0.02808720322532477, + "grad_norm": 0.53125, + "grad_norm_var": 0.0034708499908447264, + "learning_rate": 2e-05, + "loss": 1.308, + "loss/crossentropy": 2.6058216094970703, + "loss/dist_ce": 0.0, + "loss/fcd": 1.109375, + "loss/idx": 13.0, + "loss/logits": 0.1986655741930008, + "step": 1881 + }, + { + "epoch": 0.028102135284455727, + "grad_norm": 0.69921875, + "grad_norm_var": 0.003934717178344727, + "learning_rate": 2e-05, + "loss": 1.3506, + "loss/crossentropy": 2.816981315612793, + "loss/dist_ce": 0.0, + "loss/fcd": 1.140625, + "loss/idx": 13.0, + "loss/logits": 0.20997676253318787, + "step": 1882 + }, + { + "epoch": 0.02811706734358668, + "grad_norm": 0.546875, + "grad_norm_var": 0.0039951165517171225, + "learning_rate": 2e-05, + "loss": 1.2804, + "loss/crossentropy": 2.183166265487671, + "loss/dist_ce": 0.0, + "loss/fcd": 1.109375, + "loss/idx": 13.0, + "loss/logits": 0.17098873853683472, + "step": 1883 + }, + { + "epoch": 0.028131999402717635, + "grad_norm": 0.55078125, + "grad_norm_var": 0.0039003849029541015, + "learning_rate": 2e-05, + "loss": 1.3684, + "loss/crossentropy": 2.391815423965454, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1484375, + "loss/idx": 13.0, + "loss/logits": 0.21994663774967194, + "step": 1884 + }, + { + "epoch": 0.028146931461848587, + "grad_norm": 0.53125, + "grad_norm_var": 0.004031991958618164, + "learning_rate": 2e-05, + "loss": 1.1912, + "loss/crossentropy": 2.4962005615234375, + "loss/dist_ce": 0.0, + "loss/fcd": 1.03125, + "loss/idx": 13.0, + "loss/logits": 0.1599755585193634, + "step": 1885 + }, + { + "epoch": 0.028161863520979543, + "grad_norm": 0.5, + "grad_norm_var": 0.0043625990549723305, + "learning_rate": 2e-05, + "loss": 1.2411, + "loss/crossentropy": 2.674856662750244, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0625, + "loss/idx": 13.0, + "loss/logits": 0.1785699725151062, + "step": 1886 + }, + { + "epoch": 0.0281767955801105, + "grad_norm": 0.5234375, + "grad_norm_var": 0.004513661066691081, + "learning_rate": 2e-05, + "loss": 1.3122, + "loss/crossentropy": 2.5995376110076904, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1171875, + "loss/idx": 13.0, + "loss/logits": 0.19502834975719452, + "step": 1887 + }, + { + "epoch": 0.02819172763924145, + "grad_norm": 0.625, + "grad_norm_var": 0.004541254043579102, + "learning_rate": 2e-05, + "loss": 1.3003, + "loss/crossentropy": 2.6286256313323975, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1171875, + "loss/idx": 13.0, + "loss/logits": 0.1830824315547943, + "step": 1888 + }, + { + "epoch": 0.028206659698372407, + "grad_norm": 0.53515625, + "grad_norm_var": 0.004270362854003906, + "learning_rate": 2e-05, + "loss": 1.2272, + "loss/crossentropy": 2.5657100677490234, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0546875, + "loss/idx": 13.0, + "loss/logits": 0.17253029346466064, + "step": 1889 + }, + { + "epoch": 0.02822159175750336, + "grad_norm": 0.51171875, + "grad_norm_var": 0.0045017878214518225, + "learning_rate": 2e-05, + "loss": 1.2384, + "loss/crossentropy": 2.6371774673461914, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0703125, + "loss/idx": 13.0, + "loss/logits": 0.1681075394153595, + "step": 1890 + }, + { + "epoch": 0.028236523816634315, + "grad_norm": 0.4921875, + "grad_norm_var": 0.004872639973958333, + "learning_rate": 2e-05, + "loss": 1.2299, + "loss/crossentropy": 2.384807825088501, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0546875, + "loss/idx": 13.0, + "loss/logits": 0.17519429326057434, + "step": 1891 + }, + { + "epoch": 0.028251455875765268, + "grad_norm": 0.515625, + "grad_norm_var": 0.004972775777180989, + "learning_rate": 2e-05, + "loss": 1.2124, + "loss/crossentropy": 2.781672954559326, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0390625, + "loss/idx": 13.0, + "loss/logits": 0.17337191104888916, + "step": 1892 + }, + { + "epoch": 0.028266387934896223, + "grad_norm": 0.4921875, + "grad_norm_var": 0.004120826721191406, + "learning_rate": 2e-05, + "loss": 1.1777, + "loss/crossentropy": 2.4862048625946045, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0234375, + "loss/idx": 13.0, + "loss/logits": 0.15426138043403625, + "step": 1893 + }, + { + "epoch": 0.028281319994027176, + "grad_norm": 0.52734375, + "grad_norm_var": 0.003922271728515625, + "learning_rate": 2e-05, + "loss": 1.2273, + "loss/crossentropy": 2.573594808578491, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0625, + "loss/idx": 13.0, + "loss/logits": 0.16478094458580017, + "step": 1894 + }, + { + "epoch": 0.02829625205315813, + "grad_norm": 0.578125, + "grad_norm_var": 0.002988433837890625, + "learning_rate": 2e-05, + "loss": 1.2974, + "loss/crossentropy": 2.44994854927063, + "loss/dist_ce": 0.0, + "loss/fcd": 1.09375, + "loss/idx": 13.0, + "loss/logits": 0.2036007046699524, + "step": 1895 + }, + { + "epoch": 0.028311184112289084, + "grad_norm": 0.53125, + "grad_norm_var": 0.0028187433878580728, + "learning_rate": 2e-05, + "loss": 1.291, + "loss/crossentropy": 2.642103433609009, + "loss/dist_ce": 0.0, + "loss/fcd": 1.09375, + "loss/idx": 13.0, + "loss/logits": 0.19725409150123596, + "step": 1896 + }, + { + "epoch": 0.02832611617142004, + "grad_norm": 0.5234375, + "grad_norm_var": 0.0028350194295247394, + "learning_rate": 2e-05, + "loss": 1.2426, + "loss/crossentropy": 2.4922900199890137, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0703125, + "loss/idx": 13.0, + "loss/logits": 0.17228657007217407, + "step": 1897 + }, + { + "epoch": 0.028341048230550992, + "grad_norm": 0.5078125, + "grad_norm_var": 0.0011309305826822916, + "learning_rate": 2e-05, + "loss": 1.2007, + "loss/crossentropy": 2.5027291774749756, + "loss/dist_ce": 0.0, + "loss/fcd": 1.046875, + "loss/idx": 13.0, + "loss/logits": 0.15385723114013672, + "step": 1898 + }, + { + "epoch": 0.028355980289681948, + "grad_norm": 0.484375, + "grad_norm_var": 0.0012407938639322916, + "learning_rate": 2e-05, + "loss": 1.1203, + "loss/crossentropy": 2.66294527053833, + "loss/dist_ce": 0.0, + "loss/fcd": 0.97265625, + "loss/idx": 13.0, + "loss/logits": 0.14767876267433167, + "step": 1899 + }, + { + "epoch": 0.0283709123488129, + "grad_norm": 0.474609375, + "grad_norm_var": 0.0013604323069254557, + "learning_rate": 2e-05, + "loss": 1.1664, + "loss/crossentropy": 2.5771396160125732, + "loss/dist_ce": 0.0, + "loss/fcd": 1.015625, + "loss/idx": 13.0, + "loss/logits": 0.1507551074028015, + "step": 1900 + }, + { + "epoch": 0.028385844407943856, + "grad_norm": 0.62109375, + "grad_norm_var": 0.001974598566691081, + "learning_rate": 2e-05, + "loss": 1.3821, + "loss/crossentropy": 2.398294448852539, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1640625, + "loss/idx": 13.0, + "loss/logits": 0.21805456280708313, + "step": 1901 + }, + { + "epoch": 0.02840077646707481, + "grad_norm": 0.6171875, + "grad_norm_var": 0.0023999373118082684, + "learning_rate": 2e-05, + "loss": 1.2681, + "loss/crossentropy": 2.537140130996704, + "loss/dist_ce": 0.0, + "loss/fcd": 1.09375, + "loss/idx": 13.0, + "loss/logits": 0.17434020340442657, + "step": 1902 + }, + { + "epoch": 0.028415708526205764, + "grad_norm": 0.5625, + "grad_norm_var": 0.0024349053700764975, + "learning_rate": 2e-05, + "loss": 1.2614, + "loss/crossentropy": 2.519807815551758, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0859375, + "loss/idx": 13.0, + "loss/logits": 0.17546257376670837, + "step": 1903 + }, + { + "epoch": 0.028430640585336717, + "grad_norm": 0.474609375, + "grad_norm_var": 0.0020934422810872395, + "learning_rate": 2e-05, + "loss": 1.2129, + "loss/crossentropy": 2.592402696609497, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0390625, + "loss/idx": 13.0, + "loss/logits": 0.17381396889686584, + "step": 1904 + }, + { + "epoch": 0.028445572644467673, + "grad_norm": 0.6484375, + "grad_norm_var": 0.003002421061197917, + "learning_rate": 2e-05, + "loss": 1.3098, + "loss/crossentropy": 2.6101558208465576, + "loss/dist_ce": 0.0, + "loss/fcd": 1.125, + "loss/idx": 13.0, + "loss/logits": 0.18479961156845093, + "step": 1905 + }, + { + "epoch": 0.028460504703598625, + "grad_norm": 0.49609375, + "grad_norm_var": 0.0030665079752604167, + "learning_rate": 2e-05, + "loss": 1.1539, + "loss/crossentropy": 2.396393060684204, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0078125, + "loss/idx": 13.0, + "loss/logits": 0.14609143137931824, + "step": 1906 + }, + { + "epoch": 0.02847543676272958, + "grad_norm": 0.578125, + "grad_norm_var": 0.0030469258626302084, + "learning_rate": 2e-05, + "loss": 1.3732, + "loss/crossentropy": 2.566880226135254, + "loss/dist_ce": 0.0, + "loss/fcd": 1.171875, + "loss/idx": 13.0, + "loss/logits": 0.20131400227546692, + "step": 1907 + }, + { + "epoch": 0.028490368821860533, + "grad_norm": 0.9765625, + "grad_norm_var": 0.014855448404947917, + "learning_rate": 2e-05, + "loss": 1.4376, + "loss/crossentropy": 1.9921377897262573, + "loss/dist_ce": 0.0, + "loss/fcd": 1.265625, + "loss/idx": 13.0, + "loss/logits": 0.17201855778694153, + "step": 1908 + }, + { + "epoch": 0.02850530088099149, + "grad_norm": 0.77734375, + "grad_norm_var": 0.017041460673014323, + "learning_rate": 2e-05, + "loss": 1.592, + "loss/crossentropy": 2.6982979774475098, + "loss/dist_ce": 0.0, + "loss/fcd": 1.328125, + "loss/idx": 13.0, + "loss/logits": 0.26390308141708374, + "step": 1909 + }, + { + "epoch": 0.02852023294012244, + "grad_norm": 0.5390625, + "grad_norm_var": 0.016958109537760415, + "learning_rate": 2e-05, + "loss": 1.224, + "loss/crossentropy": 2.5811877250671387, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0625, + "loss/idx": 13.0, + "loss/logits": 0.1615004688501358, + "step": 1910 + }, + { + "epoch": 0.028535164999253397, + "grad_norm": 0.54296875, + "grad_norm_var": 0.017076555887858072, + "learning_rate": 2e-05, + "loss": 1.3145, + "loss/crossentropy": 2.500891923904419, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1328125, + "loss/idx": 13.0, + "loss/logits": 0.1816907823085785, + "step": 1911 + }, + { + "epoch": 0.028550097058384353, + "grad_norm": 0.5625, + "grad_norm_var": 0.016914812723795573, + "learning_rate": 2e-05, + "loss": 1.2898, + "loss/crossentropy": 2.510209798812866, + "loss/dist_ce": 0.0, + "loss/fcd": 1.078125, + "loss/idx": 13.0, + "loss/logits": 0.2116694152355194, + "step": 1912 + }, + { + "epoch": 0.028565029117515305, + "grad_norm": 0.6640625, + "grad_norm_var": 0.016965166727701823, + "learning_rate": 2e-05, + "loss": 1.2248, + "loss/crossentropy": 2.9473345279693604, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0625, + "loss/idx": 13.0, + "loss/logits": 0.1623196303844452, + "step": 1913 + }, + { + "epoch": 0.02857996117664626, + "grad_norm": 0.58203125, + "grad_norm_var": 0.016442108154296874, + "learning_rate": 2e-05, + "loss": 1.3948, + "loss/crossentropy": 2.6808791160583496, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1796875, + "loss/idx": 13.0, + "loss/logits": 0.21510747075080872, + "step": 1914 + }, + { + "epoch": 0.028594893235777213, + "grad_norm": 0.58984375, + "grad_norm_var": 0.015509986877441406, + "learning_rate": 2e-05, + "loss": 1.3086, + "loss/crossentropy": 2.472118616104126, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1328125, + "loss/idx": 13.0, + "loss/logits": 0.1758304238319397, + "step": 1915 + }, + { + "epoch": 0.02860982529490817, + "grad_norm": 0.59375, + "grad_norm_var": 0.01429899533589681, + "learning_rate": 2e-05, + "loss": 1.2624, + "loss/crossentropy": 2.7138822078704834, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1015625, + "loss/idx": 13.0, + "loss/logits": 0.16082213819026947, + "step": 1916 + }, + { + "epoch": 0.02862475735403912, + "grad_norm": 0.5546875, + "grad_norm_var": 0.014512999852498373, + "learning_rate": 2e-05, + "loss": 1.2013, + "loss/crossentropy": 2.4729011058807373, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0390625, + "loss/idx": 13.0, + "loss/logits": 0.1622874140739441, + "step": 1917 + }, + { + "epoch": 0.028639689413170077, + "grad_norm": 0.5078125, + "grad_norm_var": 0.015155649185180664, + "learning_rate": 2e-05, + "loss": 1.1805, + "loss/crossentropy": 2.375136613845825, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0390625, + "loss/idx": 13.0, + "loss/logits": 0.1414240300655365, + "step": 1918 + }, + { + "epoch": 0.02865462147230103, + "grad_norm": 0.5546875, + "grad_norm_var": 0.015201807022094727, + "learning_rate": 2e-05, + "loss": 1.2548, + "loss/crossentropy": 2.6161394119262695, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0859375, + "loss/idx": 13.0, + "loss/logits": 0.16885605454444885, + "step": 1919 + }, + { + "epoch": 0.028669553531431986, + "grad_norm": 0.5390625, + "grad_norm_var": 0.014361000061035157, + "learning_rate": 2e-05, + "loss": 1.2967, + "loss/crossentropy": 2.6225621700286865, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1015625, + "loss/idx": 13.0, + "loss/logits": 0.19516368210315704, + "step": 1920 + }, + { + "epoch": 0.028684485590562938, + "grad_norm": 0.5390625, + "grad_norm_var": 0.014499855041503907, + "learning_rate": 2e-05, + "loss": 1.2501, + "loss/crossentropy": 2.6922271251678467, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0703125, + "loss/idx": 13.0, + "loss/logits": 0.17982667684555054, + "step": 1921 + }, + { + "epoch": 0.028699417649693894, + "grad_norm": 0.490234375, + "grad_norm_var": 0.014583063125610352, + "learning_rate": 2e-05, + "loss": 1.2416, + "loss/crossentropy": 2.497199535369873, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0625, + "loss/idx": 13.0, + "loss/logits": 0.17905518412590027, + "step": 1922 + }, + { + "epoch": 0.028714349708824846, + "grad_norm": 0.498046875, + "grad_norm_var": 0.015211931864420573, + "learning_rate": 2e-05, + "loss": 1.2402, + "loss/crossentropy": 2.8170697689056396, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0703125, + "loss/idx": 13.0, + "loss/logits": 0.16988661885261536, + "step": 1923 + }, + { + "epoch": 0.028729281767955802, + "grad_norm": 0.578125, + "grad_norm_var": 0.004835955301920573, + "learning_rate": 2e-05, + "loss": 1.2604, + "loss/crossentropy": 2.3041558265686035, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1015625, + "loss/idx": 13.0, + "loss/logits": 0.1588452011346817, + "step": 1924 + }, + { + "epoch": 0.028744213827086754, + "grad_norm": 0.546875, + "grad_norm_var": 0.0017712910970052083, + "learning_rate": 2e-05, + "loss": 1.2265, + "loss/crossentropy": 2.748405694961548, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0546875, + "loss/idx": 13.0, + "loss/logits": 0.17180919647216797, + "step": 1925 + }, + { + "epoch": 0.02875914588621771, + "grad_norm": 0.6640625, + "grad_norm_var": 0.002479298909505208, + "learning_rate": 2e-05, + "loss": 1.3679, + "loss/crossentropy": 2.416738510131836, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1796875, + "loss/idx": 13.0, + "loss/logits": 0.18823902308940887, + "step": 1926 + }, + { + "epoch": 0.028774077945348663, + "grad_norm": 0.515625, + "grad_norm_var": 0.002599016825358073, + "learning_rate": 2e-05, + "loss": 1.1838, + "loss/crossentropy": 2.5889949798583984, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0234375, + "loss/idx": 13.0, + "loss/logits": 0.16037039458751678, + "step": 1927 + }, + { + "epoch": 0.02878901000447962, + "grad_norm": 0.5234375, + "grad_norm_var": 0.002688026428222656, + "learning_rate": 2e-05, + "loss": 1.3251, + "loss/crossentropy": 2.383061408996582, + "loss/dist_ce": 0.0, + "loss/fcd": 1.140625, + "loss/idx": 13.0, + "loss/logits": 0.1844463050365448, + "step": 1928 + }, + { + "epoch": 0.02880394206361057, + "grad_norm": 0.50390625, + "grad_norm_var": 0.0020441691080729167, + "learning_rate": 2e-05, + "loss": 1.1616, + "loss/crossentropy": 2.659311532974243, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0078125, + "loss/idx": 13.0, + "loss/logits": 0.153801828622818, + "step": 1929 + }, + { + "epoch": 0.028818874122741527, + "grad_norm": 0.5234375, + "grad_norm_var": 0.001999346415201823, + "learning_rate": 2e-05, + "loss": 1.204, + "loss/crossentropy": 2.457728862762451, + "loss/dist_ce": 0.0, + "loss/fcd": 1.046875, + "loss/idx": 13.0, + "loss/logits": 0.1570843905210495, + "step": 1930 + }, + { + "epoch": 0.02883380618187248, + "grad_norm": 0.50390625, + "grad_norm_var": 0.001948992411295573, + "learning_rate": 2e-05, + "loss": 1.2224, + "loss/crossentropy": 2.628582715988159, + "loss/dist_ce": 0.0, + "loss/fcd": 1.046875, + "loss/idx": 13.0, + "loss/logits": 0.17548592388629913, + "step": 1931 + }, + { + "epoch": 0.028848738241003435, + "grad_norm": 0.49609375, + "grad_norm_var": 0.001842498779296875, + "learning_rate": 2e-05, + "loss": 1.179, + "loss/crossentropy": 2.5765113830566406, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0234375, + "loss/idx": 13.0, + "loss/logits": 0.1555609107017517, + "step": 1932 + }, + { + "epoch": 0.028863670300134387, + "grad_norm": 0.54296875, + "grad_norm_var": 0.0018182754516601562, + "learning_rate": 2e-05, + "loss": 1.2661, + "loss/crossentropy": 2.8126022815704346, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0859375, + "loss/idx": 13.0, + "loss/logits": 0.1801205724477768, + "step": 1933 + }, + { + "epoch": 0.028878602359265343, + "grad_norm": 0.5234375, + "grad_norm_var": 0.001781145731608073, + "learning_rate": 2e-05, + "loss": 1.2714, + "loss/crossentropy": 2.6158759593963623, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0859375, + "loss/idx": 13.0, + "loss/logits": 0.18547698855400085, + "step": 1934 + }, + { + "epoch": 0.028893534418396295, + "grad_norm": 0.625, + "grad_norm_var": 0.0022846857706705728, + "learning_rate": 2e-05, + "loss": 1.3012, + "loss/crossentropy": 2.6109371185302734, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1171875, + "loss/idx": 13.0, + "loss/logits": 0.18406124413013458, + "step": 1935 + }, + { + "epoch": 0.02890846647752725, + "grad_norm": 0.55859375, + "grad_norm_var": 0.0023104349772135415, + "learning_rate": 2e-05, + "loss": 1.1956, + "loss/crossentropy": 2.6355507373809814, + "loss/dist_ce": 0.0, + "loss/fcd": 1.03125, + "loss/idx": 13.0, + "loss/logits": 0.1643276810646057, + "step": 1936 + }, + { + "epoch": 0.028923398536658203, + "grad_norm": 0.5390625, + "grad_norm_var": 0.0023104349772135415, + "learning_rate": 2e-05, + "loss": 1.3375, + "loss/crossentropy": 2.5399065017700195, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1484375, + "loss/idx": 13.0, + "loss/logits": 0.1890271008014679, + "step": 1937 + }, + { + "epoch": 0.02893833059578916, + "grad_norm": 0.5234375, + "grad_norm_var": 0.002161010106404622, + "learning_rate": 2e-05, + "loss": 1.2741, + "loss/crossentropy": 2.486199378967285, + "loss/dist_ce": 0.0, + "loss/fcd": 1.09375, + "loss/idx": 13.0, + "loss/logits": 0.18031063675880432, + "step": 1938 + }, + { + "epoch": 0.028953262654920115, + "grad_norm": 0.490234375, + "grad_norm_var": 0.002210219701131185, + "learning_rate": 2e-05, + "loss": 1.2398, + "loss/crossentropy": 2.4644899368286133, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0625, + "loss/idx": 13.0, + "loss/logits": 0.17728056013584137, + "step": 1939 + }, + { + "epoch": 0.028968194714051067, + "grad_norm": 0.5, + "grad_norm_var": 0.00220640500386556, + "learning_rate": 2e-05, + "loss": 1.1981, + "loss/crossentropy": 2.615004301071167, + "loss/dist_ce": 0.0, + "loss/fcd": 1.03125, + "loss/idx": 13.0, + "loss/logits": 0.16684943437576294, + "step": 1940 + }, + { + "epoch": 0.028983126773182023, + "grad_norm": 0.578125, + "grad_norm_var": 0.0023116906483968097, + "learning_rate": 2e-05, + "loss": 1.322, + "loss/crossentropy": 2.47305965423584, + "loss/dist_ce": 0.0, + "loss/fcd": 1.140625, + "loss/idx": 13.0, + "loss/logits": 0.181331604719162, + "step": 1941 + }, + { + "epoch": 0.028998058832312976, + "grad_norm": 0.5234375, + "grad_norm_var": 0.0011878808339436848, + "learning_rate": 2e-05, + "loss": 1.3178, + "loss/crossentropy": 2.2365729808807373, + "loss/dist_ce": 0.0, + "loss/fcd": 1.125, + "loss/idx": 13.0, + "loss/logits": 0.19283056259155273, + "step": 1942 + }, + { + "epoch": 0.02901299089144393, + "grad_norm": 0.50390625, + "grad_norm_var": 0.0012180169423421225, + "learning_rate": 2e-05, + "loss": 1.2095, + "loss/crossentropy": 2.5759778022766113, + "loss/dist_ce": 0.0, + "loss/fcd": 1.046875, + "loss/idx": 13.0, + "loss/logits": 0.16261249780654907, + "step": 1943 + }, + { + "epoch": 0.029027922950574884, + "grad_norm": 0.546875, + "grad_norm_var": 0.00123594601949056, + "learning_rate": 2e-05, + "loss": 1.2683, + "loss/crossentropy": 2.4381752014160156, + "loss/dist_ce": 0.0, + "loss/fcd": 1.09375, + "loss/idx": 13.0, + "loss/logits": 0.17450745403766632, + "step": 1944 + }, + { + "epoch": 0.02904285500970584, + "grad_norm": 0.5078125, + "grad_norm_var": 0.0012232303619384766, + "learning_rate": 2e-05, + "loss": 1.2061, + "loss/crossentropy": 2.6488664150238037, + "loss/dist_ce": 0.0, + "loss/fcd": 1.046875, + "loss/idx": 13.0, + "loss/logits": 0.159229576587677, + "step": 1945 + }, + { + "epoch": 0.029057787068836792, + "grad_norm": 0.8203125, + "grad_norm_var": 0.006456232070922852, + "learning_rate": 2e-05, + "loss": 1.3904, + "loss/crossentropy": 2.58434796333313, + "loss/dist_ce": 0.0, + "loss/fcd": 1.203125, + "loss/idx": 13.0, + "loss/logits": 0.18732021749019623, + "step": 1946 + }, + { + "epoch": 0.029072719127967748, + "grad_norm": 0.60546875, + "grad_norm_var": 0.006490945816040039, + "learning_rate": 2e-05, + "loss": 1.3589, + "loss/crossentropy": 2.2552731037139893, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1796875, + "loss/idx": 13.0, + "loss/logits": 0.1792486011981964, + "step": 1947 + }, + { + "epoch": 0.0290876511870987, + "grad_norm": 0.52734375, + "grad_norm_var": 0.006305297215779622, + "learning_rate": 2e-05, + "loss": 1.207, + "loss/crossentropy": 2.5927445888519287, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0390625, + "loss/idx": 13.0, + "loss/logits": 0.1679755449295044, + "step": 1948 + }, + { + "epoch": 0.029102583246229656, + "grad_norm": 0.5625, + "grad_norm_var": 0.006291945775349935, + "learning_rate": 2e-05, + "loss": 1.244, + "loss/crossentropy": 2.586740255355835, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0703125, + "loss/idx": 13.0, + "loss/logits": 0.17367993295192719, + "step": 1949 + }, + { + "epoch": 0.02911751530536061, + "grad_norm": 0.515625, + "grad_norm_var": 0.006332254409790039, + "learning_rate": 2e-05, + "loss": 1.1496, + "loss/crossentropy": 2.475268840789795, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0, + "loss/idx": 13.0, + "loss/logits": 0.14958719909191132, + "step": 1950 + }, + { + "epoch": 0.029132447364491564, + "grad_norm": 0.484375, + "grad_norm_var": 0.006311655044555664, + "learning_rate": 2e-05, + "loss": 1.1883, + "loss/crossentropy": 2.3651411533355713, + "loss/dist_ce": 0.0, + "loss/fcd": 1.03125, + "loss/idx": 13.0, + "loss/logits": 0.15700486302375793, + "step": 1951 + }, + { + "epoch": 0.029147379423622517, + "grad_norm": 0.46484375, + "grad_norm_var": 0.006743478775024414, + "learning_rate": 2e-05, + "loss": 1.2257, + "loss/crossentropy": 2.7126286029815674, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0625, + "loss/idx": 13.0, + "loss/logits": 0.16317632794380188, + "step": 1952 + }, + { + "epoch": 0.029162311482753472, + "grad_norm": 0.51953125, + "grad_norm_var": 0.006778446833292643, + "learning_rate": 2e-05, + "loss": 1.2039, + "loss/crossentropy": 2.6610074043273926, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0390625, + "loss/idx": 13.0, + "loss/logits": 0.16480238735675812, + "step": 1953 + }, + { + "epoch": 0.029177243541884425, + "grad_norm": 0.51953125, + "grad_norm_var": 0.0067891279856363935, + "learning_rate": 2e-05, + "loss": 1.2575, + "loss/crossentropy": 2.511665105819702, + "loss/dist_ce": 0.0, + "loss/fcd": 1.078125, + "loss/idx": 13.0, + "loss/logits": 0.17938058078289032, + "step": 1954 + }, + { + "epoch": 0.02919217560101538, + "grad_norm": 0.51953125, + "grad_norm_var": 0.006641070048014323, + "learning_rate": 2e-05, + "loss": 1.1791, + "loss/crossentropy": 2.600038528442383, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0234375, + "loss/idx": 13.0, + "loss/logits": 0.15568949282169342, + "step": 1955 + }, + { + "epoch": 0.029207107660146333, + "grad_norm": 0.5703125, + "grad_norm_var": 0.006540362040201823, + "learning_rate": 2e-05, + "loss": 1.2662, + "loss/crossentropy": 2.5734775066375732, + "loss/dist_ce": 0.0, + "loss/fcd": 1.09375, + "loss/idx": 13.0, + "loss/logits": 0.17242102324962616, + "step": 1956 + }, + { + "epoch": 0.02922203971927729, + "grad_norm": 0.5390625, + "grad_norm_var": 0.006479326883951823, + "learning_rate": 2e-05, + "loss": 1.2526, + "loss/crossentropy": 2.4726715087890625, + "loss/dist_ce": 0.0, + "loss/fcd": 1.078125, + "loss/idx": 13.0, + "loss/logits": 0.17450538277626038, + "step": 1957 + }, + { + "epoch": 0.02923697177840824, + "grad_norm": 0.53515625, + "grad_norm_var": 0.006453196207682292, + "learning_rate": 2e-05, + "loss": 1.2294, + "loss/crossentropy": 2.485107898712158, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0703125, + "loss/idx": 13.0, + "loss/logits": 0.1591225266456604, + "step": 1958 + }, + { + "epoch": 0.029251903837539197, + "grad_norm": 0.453125, + "grad_norm_var": 0.006901995340983073, + "learning_rate": 2e-05, + "loss": 1.1976, + "loss/crossentropy": 2.5497329235076904, + "loss/dist_ce": 0.0, + "loss/fcd": 1.03125, + "loss/idx": 13.0, + "loss/logits": 0.16637209057807922, + "step": 1959 + }, + { + "epoch": 0.02926683589667015, + "grad_norm": 0.486328125, + "grad_norm_var": 0.0071015516916910805, + "learning_rate": 2e-05, + "loss": 1.1731, + "loss/crossentropy": 2.803480625152588, + "loss/dist_ce": 0.0, + "loss/fcd": 1.015625, + "loss/idx": 13.0, + "loss/logits": 0.15749675035476685, + "step": 1960 + }, + { + "epoch": 0.029281767955801105, + "grad_norm": 0.52734375, + "grad_norm_var": 0.0070430596669514975, + "learning_rate": 2e-05, + "loss": 1.2392, + "loss/crossentropy": 2.4313740730285645, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0703125, + "loss/idx": 13.0, + "loss/logits": 0.16886743903160095, + "step": 1961 + }, + { + "epoch": 0.029296700014932057, + "grad_norm": 0.46875, + "grad_norm_var": 0.00165861447652181, + "learning_rate": 2e-05, + "loss": 1.1526, + "loss/crossentropy": 2.5858538150787354, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0, + "loss/idx": 13.0, + "loss/logits": 0.15264388918876648, + "step": 1962 + }, + { + "epoch": 0.029311632074063013, + "grad_norm": 0.5703125, + "grad_norm_var": 0.0013290246327718098, + "learning_rate": 2e-05, + "loss": 1.1973, + "loss/crossentropy": 2.4390578269958496, + "loss/dist_ce": 0.0, + "loss/fcd": 1.03125, + "loss/idx": 13.0, + "loss/logits": 0.16606196761131287, + "step": 1963 + }, + { + "epoch": 0.02932656413319397, + "grad_norm": 0.9921875, + "grad_norm_var": 0.015507364273071289, + "learning_rate": 2e-05, + "loss": 1.5494, + "loss/crossentropy": 2.5813605785369873, + "loss/dist_ce": 0.0, + "loss/fcd": 1.2578125, + "loss/idx": 13.0, + "loss/logits": 0.29163628816604614, + "step": 1964 + }, + { + "epoch": 0.02934149619232492, + "grad_norm": 0.53125, + "grad_norm_var": 0.015497700373331705, + "learning_rate": 2e-05, + "loss": 1.2957, + "loss/crossentropy": 2.640507221221924, + "loss/dist_ce": 0.0, + "loss/fcd": 1.09375, + "loss/idx": 13.0, + "loss/logits": 0.20195528864860535, + "step": 1965 + }, + { + "epoch": 0.029356428251455877, + "grad_norm": 0.498046875, + "grad_norm_var": 0.015582529703776042, + "learning_rate": 2e-05, + "loss": 1.2077, + "loss/crossentropy": 2.6269850730895996, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0390625, + "loss/idx": 13.0, + "loss/logits": 0.16860657930374146, + "step": 1966 + }, + { + "epoch": 0.02937136031058683, + "grad_norm": 0.53125, + "grad_norm_var": 0.015356699625651041, + "learning_rate": 2e-05, + "loss": 1.1539, + "loss/crossentropy": 2.636483669281006, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0078125, + "loss/idx": 13.0, + "loss/logits": 0.14610238373279572, + "step": 1967 + }, + { + "epoch": 0.029386292369717786, + "grad_norm": 0.5078125, + "grad_norm_var": 0.015010515848795572, + "learning_rate": 2e-05, + "loss": 1.1888, + "loss/crossentropy": 2.5731468200683594, + "loss/dist_ce": 0.0, + "loss/fcd": 1.03125, + "loss/idx": 13.0, + "loss/logits": 0.1575092077255249, + "step": 1968 + }, + { + "epoch": 0.029401224428848738, + "grad_norm": 0.447265625, + "grad_norm_var": 0.015612141291300455, + "learning_rate": 2e-05, + "loss": 1.1274, + "loss/crossentropy": 2.7225778102874756, + "loss/dist_ce": 0.0, + "loss/fcd": 0.9765625, + "loss/idx": 13.0, + "loss/logits": 0.15085333585739136, + "step": 1969 + }, + { + "epoch": 0.029416156487979694, + "grad_norm": 0.53515625, + "grad_norm_var": 0.015577300389607748, + "learning_rate": 2e-05, + "loss": 1.3017, + "loss/crossentropy": 2.6382668018341064, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1171875, + "loss/idx": 13.0, + "loss/logits": 0.18452683091163635, + "step": 1970 + }, + { + "epoch": 0.029431088547110646, + "grad_norm": 0.5078125, + "grad_norm_var": 0.01562498410542806, + "learning_rate": 2e-05, + "loss": 1.2374, + "loss/crossentropy": 2.362830638885498, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0703125, + "loss/idx": 13.0, + "loss/logits": 0.16712644696235657, + "step": 1971 + }, + { + "epoch": 0.029446020606241602, + "grad_norm": 0.490234375, + "grad_norm_var": 0.015742937723795574, + "learning_rate": 2e-05, + "loss": 1.2171, + "loss/crossentropy": 2.442208766937256, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0546875, + "loss/idx": 13.0, + "loss/logits": 0.16240856051445007, + "step": 1972 + }, + { + "epoch": 0.029460952665372554, + "grad_norm": 0.53125, + "grad_norm_var": 0.015746498107910158, + "learning_rate": 2e-05, + "loss": 1.2984, + "loss/crossentropy": 2.450517416000366, + "loss/dist_ce": 0.0, + "loss/fcd": 1.109375, + "loss/idx": 13.0, + "loss/logits": 0.1889958381652832, + "step": 1973 + }, + { + "epoch": 0.02947588472450351, + "grad_norm": 0.58203125, + "grad_norm_var": 0.015863990783691405, + "learning_rate": 2e-05, + "loss": 1.2289, + "loss/crossentropy": 2.8067851066589355, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0625, + "loss/idx": 13.0, + "loss/logits": 0.16644850373268127, + "step": 1974 + }, + { + "epoch": 0.029490816783634462, + "grad_norm": 1.171875, + "grad_norm_var": 0.03970534006754557, + "learning_rate": 2e-05, + "loss": 1.3591, + "loss/crossentropy": 2.7080118656158447, + "loss/dist_ce": 0.0, + "loss/fcd": 1.125, + "loss/idx": 13.0, + "loss/logits": 0.23414579033851624, + "step": 1975 + }, + { + "epoch": 0.029505748842765418, + "grad_norm": 0.6015625, + "grad_norm_var": 0.0390010674794515, + "learning_rate": 2e-05, + "loss": 1.4244, + "loss/crossentropy": 2.4435675144195557, + "loss/dist_ce": 0.0, + "loss/fcd": 1.21875, + "loss/idx": 13.0, + "loss/logits": 0.20565560460090637, + "step": 1976 + }, + { + "epoch": 0.02952068090189637, + "grad_norm": 0.55859375, + "grad_norm_var": 0.03878693580627442, + "learning_rate": 2e-05, + "loss": 1.3483, + "loss/crossentropy": 2.673099994659424, + "loss/dist_ce": 0.0, + "loss/fcd": 1.15625, + "loss/idx": 13.0, + "loss/logits": 0.1920124590396881, + "step": 1977 + }, + { + "epoch": 0.029535612961027326, + "grad_norm": 0.54296875, + "grad_norm_var": 0.037878529230753584, + "learning_rate": 2e-05, + "loss": 1.2699, + "loss/crossentropy": 2.5343728065490723, + "loss/dist_ce": 0.0, + "loss/fcd": 1.09375, + "loss/idx": 13.0, + "loss/logits": 0.17618075013160706, + "step": 1978 + }, + { + "epoch": 0.02955054502015828, + "grad_norm": 0.5390625, + "grad_norm_var": 0.03806316057840983, + "learning_rate": 2e-05, + "loss": 1.2738, + "loss/crossentropy": 2.402545690536499, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1015625, + "loss/idx": 13.0, + "loss/logits": 0.1722050905227661, + "step": 1979 + }, + { + "epoch": 0.029565477079289235, + "grad_norm": 0.578125, + "grad_norm_var": 0.0270174503326416, + "learning_rate": 2e-05, + "loss": 1.2149, + "loss/crossentropy": 2.5633673667907715, + "loss/dist_ce": 0.0, + "loss/fcd": 1.046875, + "loss/idx": 13.0, + "loss/logits": 0.16800308227539062, + "step": 1980 + }, + { + "epoch": 0.029580409138420187, + "grad_norm": 0.609375, + "grad_norm_var": 0.02697294553120931, + "learning_rate": 2e-05, + "loss": 1.4391, + "loss/crossentropy": 2.921246290206909, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1875, + "loss/idx": 13.0, + "loss/logits": 0.251582533121109, + "step": 1981 + }, + { + "epoch": 0.029595341197551143, + "grad_norm": 0.462890625, + "grad_norm_var": 0.027420409520467124, + "learning_rate": 2e-05, + "loss": 1.1457, + "loss/crossentropy": 2.622143507003784, + "loss/dist_ce": 0.0, + "loss/fcd": 0.99609375, + "loss/idx": 13.0, + "loss/logits": 0.14962665736675262, + "step": 1982 + }, + { + "epoch": 0.029610273256682095, + "grad_norm": 0.494140625, + "grad_norm_var": 0.02772210439046224, + "learning_rate": 2e-05, + "loss": 1.2095, + "loss/crossentropy": 2.3120386600494385, + "loss/dist_ce": 0.0, + "loss/fcd": 1.046875, + "loss/idx": 13.0, + "loss/logits": 0.16262733936309814, + "step": 1983 + }, + { + "epoch": 0.02962520531581305, + "grad_norm": 0.5546875, + "grad_norm_var": 0.02745507558186849, + "learning_rate": 2e-05, + "loss": 1.2747, + "loss/crossentropy": 2.4596571922302246, + "loss/dist_ce": 0.0, + "loss/fcd": 1.09375, + "loss/idx": 13.0, + "loss/logits": 0.18092194199562073, + "step": 1984 + }, + { + "epoch": 0.029640137374944003, + "grad_norm": 0.83984375, + "grad_norm_var": 0.030378325780232748, + "learning_rate": 2e-05, + "loss": 1.2322, + "loss/crossentropy": 2.4688382148742676, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0703125, + "loss/idx": 13.0, + "loss/logits": 0.16190896928310394, + "step": 1985 + }, + { + "epoch": 0.02965506943407496, + "grad_norm": 0.5625, + "grad_norm_var": 0.030188735326131186, + "learning_rate": 2e-05, + "loss": 1.4595, + "loss/crossentropy": 2.3537650108337402, + "loss/dist_ce": 0.0, + "loss/fcd": 1.2265625, + "loss/idx": 13.0, + "loss/logits": 0.23292958736419678, + "step": 1986 + }, + { + "epoch": 0.02967000149320591, + "grad_norm": 0.515625, + "grad_norm_var": 0.03009476661682129, + "learning_rate": 2e-05, + "loss": 1.2017, + "loss/crossentropy": 2.5475752353668213, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0390625, + "loss/idx": 13.0, + "loss/logits": 0.1625993549823761, + "step": 1987 + }, + { + "epoch": 0.029684933552336867, + "grad_norm": 0.498046875, + "grad_norm_var": 0.02998197873433431, + "learning_rate": 2e-05, + "loss": 1.1774, + "loss/crossentropy": 2.5423738956451416, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0234375, + "loss/idx": 13.0, + "loss/logits": 0.15400215983390808, + "step": 1988 + }, + { + "epoch": 0.02969986561146782, + "grad_norm": 0.5859375, + "grad_norm_var": 0.029648192723592124, + "learning_rate": 2e-05, + "loss": 1.3544, + "loss/crossentropy": 2.679858446121216, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1640625, + "loss/idx": 13.0, + "loss/logits": 0.1902952790260315, + "step": 1989 + }, + { + "epoch": 0.029714797670598776, + "grad_norm": 0.462890625, + "grad_norm_var": 0.0309173583984375, + "learning_rate": 2e-05, + "loss": 1.1498, + "loss/crossentropy": 2.4342339038848877, + "loss/dist_ce": 0.0, + "loss/fcd": 0.99609375, + "loss/idx": 13.0, + "loss/logits": 0.1536850929260254, + "step": 1990 + }, + { + "epoch": 0.02972972972972973, + "grad_norm": 0.59375, + "grad_norm_var": 0.007619222005208333, + "learning_rate": 2e-05, + "loss": 1.3155, + "loss/crossentropy": 2.6562082767486572, + "loss/dist_ce": 0.0, + "loss/fcd": 1.125, + "loss/idx": 13.0, + "loss/logits": 0.19045662879943848, + "step": 1991 + }, + { + "epoch": 0.029744661788860684, + "grad_norm": 0.515625, + "grad_norm_var": 0.007633209228515625, + "learning_rate": 2e-05, + "loss": 1.2124, + "loss/crossentropy": 2.6417014598846436, + "loss/dist_ce": 0.0, + "loss/fcd": 1.046875, + "loss/idx": 13.0, + "loss/logits": 0.16551363468170166, + "step": 1992 + }, + { + "epoch": 0.02975959384799164, + "grad_norm": 0.47265625, + "grad_norm_var": 0.0080780029296875, + "learning_rate": 2e-05, + "loss": 1.2153, + "loss/crossentropy": 2.5185062885284424, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0390625, + "loss/idx": 13.0, + "loss/logits": 0.176223486661911, + "step": 1993 + }, + { + "epoch": 0.029774525907122592, + "grad_norm": 0.65234375, + "grad_norm_var": 0.008697509765625, + "learning_rate": 2e-05, + "loss": 1.2088, + "loss/crossentropy": 2.354360818862915, + "loss/dist_ce": 0.0, + "loss/fcd": 1.03125, + "loss/idx": 13.0, + "loss/logits": 0.17753687500953674, + "step": 1994 + }, + { + "epoch": 0.029789457966253548, + "grad_norm": 0.55078125, + "grad_norm_var": 0.008675575256347656, + "learning_rate": 2e-05, + "loss": 1.29, + "loss/crossentropy": 2.5912888050079346, + "loss/dist_ce": 0.0, + "loss/fcd": 1.1171875, + "loss/idx": 13.0, + "loss/logits": 0.17279267311096191, + "step": 1995 + }, + { + "epoch": 0.0298043900253845, + "grad_norm": 0.53125, + "grad_norm_var": 0.008695411682128906, + "learning_rate": 2e-05, + "loss": 1.2898, + "loss/crossentropy": 2.7931675910949707, + "loss/dist_ce": 0.0, + "loss/fcd": 1.09375, + "loss/idx": 13.0, + "loss/logits": 0.1960277259349823, + "step": 1996 + }, + { + "epoch": 0.029819322084515456, + "grad_norm": 0.5234375, + "grad_norm_var": 0.008549944559733073, + "learning_rate": 2e-05, + "loss": 1.2553, + "loss/crossentropy": 2.6213860511779785, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0625, + "loss/idx": 13.0, + "loss/logits": 0.19277352094650269, + "step": 1997 + }, + { + "epoch": 0.02983425414364641, + "grad_norm": 0.60546875, + "grad_norm_var": 0.008144998550415039, + "learning_rate": 2e-05, + "loss": 1.3088, + "loss/crossentropy": 2.811062812805176, + "loss/dist_ce": 0.0, + "loss/fcd": 1.109375, + "loss/idx": 13.0, + "loss/logits": 0.19940659403800964, + "step": 1998 + }, + { + "epoch": 0.029849186202777364, + "grad_norm": 0.470703125, + "grad_norm_var": 0.008384943008422852, + "learning_rate": 2e-05, + "loss": 1.1252, + "loss/crossentropy": 2.78082013130188, + "loss/dist_ce": 0.0, + "loss/fcd": 0.98046875, + "loss/idx": 13.0, + "loss/logits": 0.1447007954120636, + "step": 1999 + }, + { + "epoch": 0.029864118261908316, + "grad_norm": 0.462890625, + "grad_norm_var": 0.008957926432291667, + "learning_rate": 2e-05, + "loss": 1.1896, + "loss/crossentropy": 2.4754817485809326, + "loss/dist_ce": 0.0, + "loss/fcd": 1.0390625, + "loss/idx": 13.0, + "loss/logits": 0.1505032777786255, + "step": 2000 + } + ], + "logging_steps": 1, + "max_steps": 100000, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 2000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": true, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 3.03506581880832e+17, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}