diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,11018 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.017274289810760155, + "eval_steps": 1000, + "global_step": 1000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 1.7274289810760155e-05, + "grad_norm": 0.318359375, + "learning_rate": 0.01, + "loss": 1.4153, + "loss/crossentropy": 2.180091619491577, + "loss/fcd": 1.1796875, + "loss/logits": 0.2821359634399414, + "step": 1 + }, + { + "epoch": 3.454857962152031e-05, + "grad_norm": 0.3515625, + "learning_rate": 0.01, + "loss": 1.4401, + "loss/crossentropy": 2.49104380607605, + "loss/fcd": 1.1484375, + "loss/logits": 0.2602585107088089, + "step": 2 + }, + { + "epoch": 5.182286943228046e-05, + "grad_norm": 0.30859375, + "learning_rate": 0.01, + "loss": 1.4352, + "loss/crossentropy": 2.453463077545166, + "loss/fcd": 1.1875, + "loss/logits": 0.2847007066011429, + "step": 3 + }, + { + "epoch": 6.909715924304062e-05, + "grad_norm": 0.306640625, + "learning_rate": 0.01, + "loss": 1.3983, + "loss/crossentropy": 2.52145779132843, + "loss/fcd": 1.125, + "loss/logits": 0.2535911202430725, + "step": 4 + }, + { + "epoch": 8.637144905380078e-05, + "grad_norm": 0.35546875, + "learning_rate": 0.01, + "loss": 1.4077, + "loss/crossentropy": 2.364890694618225, + "loss/fcd": 1.10546875, + "loss/logits": 0.24292171746492386, + "step": 5 + }, + { + "epoch": 0.00010364573886456093, + "grad_norm": 0.310546875, + "learning_rate": 0.01, + "loss": 1.3824, + "loss/crossentropy": 2.3052154779434204, + "loss/fcd": 1.12890625, + "loss/logits": 0.24541093409061432, + "step": 6 + }, + { + "epoch": 0.00012092002867532108, + "grad_norm": 0.29296875, + "learning_rate": 0.01, + "loss": 1.4026, + "loss/crossentropy": 2.381744861602783, + "loss/fcd": 1.1171875, + "loss/logits": 0.2507929801940918, + "step": 7 + }, + { + "epoch": 0.00013819431848608124, + "grad_norm": 0.322265625, + "learning_rate": 0.01, + "loss": 1.4452, + "loss/crossentropy": 2.613944888114929, + "loss/fcd": 1.1796875, + "loss/logits": 0.27175769209861755, + "step": 8 + }, + { + "epoch": 0.0001554686082968414, + "grad_norm": 0.31640625, + "learning_rate": 0.01, + "loss": 1.4301, + "loss/crossentropy": 2.4438647031784058, + "loss/fcd": 1.2890625, + "loss/logits": 0.31327594816684723, + "step": 9 + }, + { + "epoch": 0.00017274289810760156, + "grad_norm": 0.322265625, + "learning_rate": 0.01, + "loss": 1.4258, + "loss/crossentropy": 2.689444422721863, + "loss/fcd": 1.20703125, + "loss/logits": 0.2705621272325516, + "step": 10 + }, + { + "epoch": 0.0001900171879183617, + "grad_norm": 0.283203125, + "learning_rate": 0.01, + "loss": 1.38, + "loss/crossentropy": 2.6325626373291016, + "loss/fcd": 1.16796875, + "loss/logits": 0.26059799641370773, + "step": 11 + }, + { + "epoch": 0.00020729147772912185, + "grad_norm": 0.294921875, + "learning_rate": 0.01, + "loss": 1.3964, + "loss/crossentropy": 2.2171366214752197, + "loss/fcd": 1.16015625, + "loss/logits": 0.25415121763944626, + "step": 12 + }, + { + "epoch": 0.00022456576753988202, + "grad_norm": 0.314453125, + "learning_rate": 0.01, + "loss": 1.4028, + "loss/crossentropy": 2.239351272583008, + "loss/fcd": 1.0625, + "loss/logits": 0.2298966646194458, + "step": 13 + }, + { + "epoch": 0.00024184005735064217, + "grad_norm": 0.31640625, + "learning_rate": 0.01, + "loss": 1.4218, + "loss/crossentropy": 2.712681293487549, + "loss/fcd": 1.1328125, + "loss/logits": 0.24666083604097366, + "step": 14 + }, + { + "epoch": 0.00025911434716140234, + "grad_norm": 0.3515625, + "learning_rate": 0.01, + "loss": 1.4074, + "loss/crossentropy": 2.6137157678604126, + "loss/fcd": 1.18359375, + "loss/logits": 0.2758009433746338, + "step": 15 + }, + { + "epoch": 0.0002763886369721625, + "grad_norm": 0.376953125, + "grad_norm_var": 0.0006428877512613932, + "learning_rate": 0.01, + "loss": 1.4429, + "loss/crossentropy": 2.266461730003357, + "loss/fcd": 1.203125, + "loss/logits": 0.26471851766109467, + "step": 16 + }, + { + "epoch": 0.00029366292678292263, + "grad_norm": 0.353515625, + "grad_norm_var": 0.0007058302561442057, + "learning_rate": 0.01, + "loss": 1.433, + "loss/crossentropy": 2.63763689994812, + "loss/fcd": 1.21875, + "loss/logits": 0.28894874453544617, + "step": 17 + }, + { + "epoch": 0.0003109372165936828, + "grad_norm": 0.30859375, + "grad_norm_var": 0.0006610711415608723, + "learning_rate": 0.01, + "loss": 1.4003, + "loss/crossentropy": 2.5304828882217407, + "loss/fcd": 1.13671875, + "loss/logits": 0.26741379499435425, + "step": 18 + }, + { + "epoch": 0.0003282115064044429, + "grad_norm": 0.3203125, + "grad_norm_var": 0.0006503899892171224, + "learning_rate": 0.01, + "loss": 1.4179, + "loss/crossentropy": 2.36896288394928, + "loss/fcd": 1.19921875, + "loss/logits": 0.2745219022035599, + "step": 19 + }, + { + "epoch": 0.0003454857962152031, + "grad_norm": 0.3125, + "grad_norm_var": 0.0006408055623372395, + "learning_rate": 0.01, + "loss": 1.4132, + "loss/crossentropy": 2.471444010734558, + "loss/fcd": 1.15234375, + "loss/logits": 0.24692216515541077, + "step": 20 + }, + { + "epoch": 0.00036276008602596327, + "grad_norm": 0.32421875, + "grad_norm_var": 0.0005624771118164062, + "learning_rate": 0.01, + "loss": 1.3532, + "loss/crossentropy": 2.4798572063446045, + "loss/fcd": 1.13671875, + "loss/logits": 0.24522659927606583, + "step": 21 + }, + { + "epoch": 0.0003800343758367234, + "grad_norm": 0.294921875, + "grad_norm_var": 0.0005975723266601563, + "learning_rate": 0.01, + "loss": 1.4057, + "loss/crossentropy": 2.3649370670318604, + "loss/fcd": 1.15234375, + "loss/logits": 0.26143455505371094, + "step": 22 + }, + { + "epoch": 0.00039730866564748356, + "grad_norm": 0.3125, + "grad_norm_var": 0.0005533854166666667, + "learning_rate": 0.01, + "loss": 1.4282, + "loss/crossentropy": 2.7900454998016357, + "loss/fcd": 1.109375, + "loss/logits": 0.256390705704689, + "step": 23 + }, + { + "epoch": 0.0004145829554582437, + "grad_norm": 0.283203125, + "grad_norm_var": 0.000638580322265625, + "learning_rate": 0.01, + "loss": 1.422, + "loss/crossentropy": 2.3018282651901245, + "loss/fcd": 1.14453125, + "loss/logits": 0.26084744930267334, + "step": 24 + }, + { + "epoch": 0.0004318572452690039, + "grad_norm": 0.298828125, + "grad_norm_var": 0.0006613254547119141, + "learning_rate": 0.01, + "loss": 1.4043, + "loss/crossentropy": 2.404328942298889, + "loss/fcd": 1.0390625, + "loss/logits": 0.24188002943992615, + "step": 25 + }, + { + "epoch": 0.00044913153507976405, + "grad_norm": 2.265625, + "grad_norm_var": 0.23812503814697267, + "learning_rate": 0.01, + "loss": 1.3559, + "loss/crossentropy": 2.5355838537216187, + "loss/fcd": 1.1640625, + "loss/logits": 0.24743662029504776, + "step": 26 + }, + { + "epoch": 0.0004664058248905242, + "grad_norm": 0.36328125, + "grad_norm_var": 0.23687055905659993, + "learning_rate": 0.01, + "loss": 1.4526, + "loss/crossentropy": 2.329304337501526, + "loss/fcd": 1.0625, + "loss/logits": 0.2358776032924652, + "step": 27 + }, + { + "epoch": 0.00048368011470128434, + "grad_norm": 0.279296875, + "grad_norm_var": 0.23719480832417805, + "learning_rate": 0.01, + "loss": 1.3243, + "loss/crossentropy": 2.1602375507354736, + "loss/fcd": 1.02734375, + "loss/logits": 0.21287230402231216, + "step": 28 + }, + { + "epoch": 0.0005009544045120445, + "grad_norm": 0.30078125, + "grad_norm_var": 0.2374394734700521, + "learning_rate": 0.01, + "loss": 1.4111, + "loss/crossentropy": 2.4278478622436523, + "loss/fcd": 1.1171875, + "loss/logits": 0.2437409982085228, + "step": 29 + }, + { + "epoch": 0.0005182286943228047, + "grad_norm": 0.27734375, + "grad_norm_var": 0.23818588256835938, + "learning_rate": 0.01, + "loss": 1.4091, + "loss/crossentropy": 2.5047000646591187, + "loss/fcd": 1.15625, + "loss/logits": 0.27113544940948486, + "step": 30 + }, + { + "epoch": 0.0005355029841335648, + "grad_norm": 0.373046875, + "grad_norm_var": 0.23796435991923015, + "learning_rate": 0.01, + "loss": 1.4498, + "loss/crossentropy": 2.3999940156936646, + "loss/fcd": 1.25, + "loss/logits": 0.27853211760520935, + "step": 31 + }, + { + "epoch": 0.000552777273944325, + "grad_norm": 0.3671875, + "grad_norm_var": 0.23805281321207683, + "learning_rate": 0.01, + "loss": 1.4805, + "loss/crossentropy": 2.511382222175598, + "loss/fcd": 1.3203125, + "loss/logits": 0.409069299697876, + "step": 32 + }, + { + "epoch": 0.0005700515637550852, + "grad_norm": 0.6875, + "grad_norm_var": 0.24118663469950358, + "learning_rate": 0.01, + "loss": 1.3432, + "loss/crossentropy": 2.5396409034729004, + "loss/fcd": 1.2421875, + "loss/logits": 0.25656259059906006, + "step": 33 + }, + { + "epoch": 0.0005873258535658453, + "grad_norm": 0.357421875, + "grad_norm_var": 0.24034620920817057, + "learning_rate": 0.01, + "loss": 1.4207, + "loss/crossentropy": 2.3687368631362915, + "loss/fcd": 1.109375, + "loss/logits": 0.23432840406894684, + "step": 34 + }, + { + "epoch": 0.0006046001433766055, + "grad_norm": 0.291015625, + "grad_norm_var": 0.2409596602121989, + "learning_rate": 0.01, + "loss": 1.4195, + "loss/crossentropy": 2.428983449935913, + "loss/fcd": 1.21484375, + "loss/logits": 0.2627260833978653, + "step": 35 + }, + { + "epoch": 0.0006218744331873656, + "grad_norm": 0.31640625, + "grad_norm_var": 0.2408828576405843, + "learning_rate": 0.01, + "loss": 1.372, + "loss/crossentropy": 2.827309250831604, + "loss/fcd": 1.10546875, + "loss/logits": 0.2433805763721466, + "step": 36 + }, + { + "epoch": 0.0006391487229981258, + "grad_norm": 0.28125, + "grad_norm_var": 0.24178783098856607, + "learning_rate": 0.01, + "loss": 1.388, + "loss/crossentropy": 2.4543423652648926, + "loss/fcd": 1.13671875, + "loss/logits": 0.2694346010684967, + "step": 37 + }, + { + "epoch": 0.0006564230128088858, + "grad_norm": 0.302734375, + "grad_norm_var": 0.24162036577860516, + "learning_rate": 0.01, + "loss": 1.4109, + "loss/crossentropy": 2.5903791189193726, + "loss/fcd": 1.12109375, + "loss/logits": 0.246421679854393, + "step": 38 + }, + { + "epoch": 0.000673697302619646, + "grad_norm": 0.32421875, + "grad_norm_var": 0.24139873186747232, + "learning_rate": 0.01, + "loss": 1.4232, + "loss/crossentropy": 2.248749613761902, + "loss/fcd": 1.09375, + "loss/logits": 0.23829498887062073, + "step": 39 + }, + { + "epoch": 0.0006909715924304062, + "grad_norm": 0.31640625, + "grad_norm_var": 0.24068241119384765, + "learning_rate": 0.01, + "loss": 1.4025, + "loss/crossentropy": 2.52192759513855, + "loss/fcd": 1.21875, + "loss/logits": 0.3120736628770828, + "step": 40 + }, + { + "epoch": 0.0007082458822411663, + "grad_norm": 0.294921875, + "grad_norm_var": 0.24076868693033854, + "learning_rate": 0.01, + "loss": 1.3388, + "loss/crossentropy": 2.4299440383911133, + "loss/fcd": 1.05078125, + "loss/logits": 0.21974454075098038, + "step": 41 + }, + { + "epoch": 0.0007255201720519265, + "grad_norm": 0.361328125, + "grad_norm_var": 0.00956584612528483, + "learning_rate": 0.01, + "loss": 1.3783, + "loss/crossentropy": 2.3354129791259766, + "loss/fcd": 1.12109375, + "loss/logits": 0.22372399270534515, + "step": 42 + }, + { + "epoch": 0.0007427944618626866, + "grad_norm": 0.2734375, + "grad_norm_var": 0.009831984837849935, + "learning_rate": 0.01, + "loss": 1.3578, + "loss/crossentropy": 2.3422107696533203, + "loss/fcd": 1.0703125, + "loss/logits": 0.22979970276355743, + "step": 43 + }, + { + "epoch": 0.0007600687516734468, + "grad_norm": 0.337890625, + "grad_norm_var": 0.009589751561482748, + "learning_rate": 0.01, + "loss": 1.4869, + "loss/crossentropy": 2.4120657444000244, + "loss/fcd": 1.22265625, + "loss/logits": 0.27795399725437164, + "step": 44 + }, + { + "epoch": 0.000777343041484207, + "grad_norm": 0.314453125, + "grad_norm_var": 0.009527333577473958, + "learning_rate": 0.01, + "loss": 1.3861, + "loss/crossentropy": 2.66101336479187, + "loss/fcd": 1.15625, + "loss/logits": 0.25736863911151886, + "step": 45 + }, + { + "epoch": 0.0007946173312949671, + "grad_norm": 0.298828125, + "grad_norm_var": 0.009370152155558269, + "learning_rate": 0.01, + "loss": 1.4078, + "loss/crossentropy": 2.5887415409088135, + "loss/fcd": 1.17578125, + "loss/logits": 0.285249263048172, + "step": 46 + }, + { + "epoch": 0.0008118916211057273, + "grad_norm": 0.271484375, + "grad_norm_var": 0.009616454442342123, + "learning_rate": 0.01, + "loss": 1.3142, + "loss/crossentropy": 2.5115991830825806, + "loss/fcd": 1.0703125, + "loss/logits": 0.23692379146814346, + "step": 47 + }, + { + "epoch": 0.0008291659109164874, + "grad_norm": 0.296875, + "grad_norm_var": 0.00964506467183431, + "learning_rate": 0.01, + "loss": 1.4093, + "loss/crossentropy": 2.5383851528167725, + "loss/fcd": 1.14453125, + "loss/logits": 0.2725464850664139, + "step": 48 + }, + { + "epoch": 0.0008464402007272476, + "grad_norm": 0.3046875, + "grad_norm_var": 0.0007040500640869141, + "learning_rate": 0.01, + "loss": 1.3871, + "loss/crossentropy": 2.3415656089782715, + "loss/fcd": 1.109375, + "loss/logits": 0.23871353268623352, + "step": 49 + }, + { + "epoch": 0.0008637144905380078, + "grad_norm": 0.310546875, + "grad_norm_var": 0.000538492202758789, + "learning_rate": 0.01, + "loss": 1.422, + "loss/crossentropy": 2.241709351539612, + "loss/fcd": 1.12109375, + "loss/logits": 0.2642487585544586, + "step": 50 + }, + { + "epoch": 0.0008809887803487679, + "grad_norm": 0.271484375, + "grad_norm_var": 0.0006014347076416015, + "learning_rate": 0.01, + "loss": 1.4018, + "loss/crossentropy": 2.18844211101532, + "loss/fcd": 1.0859375, + "loss/logits": 0.25836754590272903, + "step": 51 + }, + { + "epoch": 0.0008982630701595281, + "grad_norm": 0.291015625, + "grad_norm_var": 0.0006024678548177083, + "learning_rate": 0.01, + "loss": 1.3702, + "loss/crossentropy": 2.4040807485580444, + "loss/fcd": 1.12890625, + "loss/logits": 0.27216051518917084, + "step": 52 + }, + { + "epoch": 0.0009155373599702882, + "grad_norm": 0.302734375, + "grad_norm_var": 0.0005683739980061849, + "learning_rate": 0.01, + "loss": 1.375, + "loss/crossentropy": 2.3604718446731567, + "loss/fcd": 1.109375, + "loss/logits": 0.2563931792974472, + "step": 53 + }, + { + "epoch": 0.0009328116497810484, + "grad_norm": 0.328125, + "grad_norm_var": 0.0006024678548177083, + "learning_rate": 0.01, + "loss": 1.3398, + "loss/crossentropy": 2.3702304363250732, + "loss/fcd": 1.044921875, + "loss/logits": 0.23356395214796066, + "step": 54 + }, + { + "epoch": 0.0009500859395918086, + "grad_norm": 0.322265625, + "grad_norm_var": 0.000598001480102539, + "learning_rate": 0.01, + "loss": 1.4359, + "loss/crossentropy": 2.532386064529419, + "loss/fcd": 1.19921875, + "loss/logits": 0.29735907912254333, + "step": 55 + }, + { + "epoch": 0.0009673602294025687, + "grad_norm": 0.29296875, + "grad_norm_var": 0.0005999088287353515, + "learning_rate": 0.01, + "loss": 1.3103, + "loss/crossentropy": 2.4240375757217407, + "loss/fcd": 1.04296875, + "loss/logits": 0.2354799136519432, + "step": 56 + }, + { + "epoch": 0.0009846345192133289, + "grad_norm": 0.314453125, + "grad_norm_var": 0.0005986372629801433, + "learning_rate": 0.01, + "loss": 1.4436, + "loss/crossentropy": 2.6270374059677124, + "loss/fcd": 1.17578125, + "loss/logits": 0.2780339866876602, + "step": 57 + }, + { + "epoch": 0.001001908809024089, + "grad_norm": 0.30859375, + "grad_norm_var": 0.0003819147745768229, + "learning_rate": 0.01, + "loss": 1.4263, + "loss/crossentropy": 2.6478673219680786, + "loss/fcd": 1.1640625, + "loss/logits": 0.26073622703552246, + "step": 58 + }, + { + "epoch": 0.001019183098834849, + "grad_norm": 0.314453125, + "grad_norm_var": 0.00032817522684733074, + "learning_rate": 0.01, + "loss": 1.3944, + "loss/crossentropy": 2.596788763999939, + "loss/fcd": 1.13671875, + "loss/logits": 0.2364196628332138, + "step": 59 + }, + { + "epoch": 0.0010364573886456094, + "grad_norm": 0.388671875, + "grad_norm_var": 0.0007116794586181641, + "learning_rate": 0.01, + "loss": 1.4703, + "loss/crossentropy": 2.516297459602356, + "loss/fcd": 1.1484375, + "loss/logits": 0.2600822076201439, + "step": 60 + }, + { + "epoch": 0.0010537316784563695, + "grad_norm": 0.353515625, + "grad_norm_var": 0.0008394718170166016, + "learning_rate": 0.01, + "loss": 1.4355, + "loss/crossentropy": 2.3750414848327637, + "loss/fcd": 1.1171875, + "loss/logits": 0.257433146238327, + "step": 61 + }, + { + "epoch": 0.0010710059682671295, + "grad_norm": 0.341796875, + "grad_norm_var": 0.0008870283762613932, + "learning_rate": 0.01, + "loss": 1.4704, + "loss/crossentropy": 2.6349244117736816, + "loss/fcd": 1.171875, + "loss/logits": 0.27842070162296295, + "step": 62 + }, + { + "epoch": 0.0010882802580778899, + "grad_norm": 0.345703125, + "grad_norm_var": 0.000816965103149414, + "learning_rate": 0.01, + "loss": 1.4253, + "loss/crossentropy": 2.4561866521835327, + "loss/fcd": 1.1484375, + "loss/logits": 0.2601305991411209, + "step": 63 + }, + { + "epoch": 0.00110555454788865, + "grad_norm": 0.30078125, + "grad_norm_var": 0.0008069197336832682, + "learning_rate": 0.01, + "loss": 1.3542, + "loss/crossentropy": 2.4422744512557983, + "loss/fcd": 1.16015625, + "loss/logits": 0.25526949763298035, + "step": 64 + }, + { + "epoch": 0.00112282883769941, + "grad_norm": 0.41796875, + "grad_norm_var": 0.0014043013254801432, + "learning_rate": 0.01, + "loss": 1.4401, + "loss/crossentropy": 2.164702892303467, + "loss/fcd": 1.22265625, + "loss/logits": 0.20365531742572784, + "step": 65 + }, + { + "epoch": 0.0011401031275101703, + "grad_norm": 0.30078125, + "grad_norm_var": 0.0014294942220052084, + "learning_rate": 0.01, + "loss": 1.3525, + "loss/crossentropy": 2.7132447957992554, + "loss/fcd": 1.13671875, + "loss/logits": 0.2643866539001465, + "step": 66 + }, + { + "epoch": 0.0011573774173209304, + "grad_norm": 0.30859375, + "grad_norm_var": 0.0012522220611572265, + "learning_rate": 0.01, + "loss": 1.3225, + "loss/crossentropy": 2.4213372468948364, + "loss/fcd": 1.07421875, + "loss/logits": 0.2328537479043007, + "step": 67 + }, + { + "epoch": 0.0011746517071316905, + "grad_norm": 0.353515625, + "grad_norm_var": 0.00119627316792806, + "learning_rate": 0.01, + "loss": 1.3973, + "loss/crossentropy": 2.436795651912689, + "loss/fcd": 1.09765625, + "loss/logits": 0.2546040713787079, + "step": 68 + }, + { + "epoch": 0.0011919259969424506, + "grad_norm": 0.283203125, + "grad_norm_var": 0.0012935479482014975, + "learning_rate": 0.01, + "loss": 1.3866, + "loss/crossentropy": 2.274712562561035, + "loss/fcd": 1.15625, + "loss/logits": 0.26513542234897614, + "step": 69 + }, + { + "epoch": 0.001209200286753211, + "grad_norm": 0.296875, + "grad_norm_var": 0.0013611952463785807, + "learning_rate": 0.01, + "loss": 1.3986, + "loss/crossentropy": 2.4798693656921387, + "loss/fcd": 1.140625, + "loss/logits": 0.2789834886789322, + "step": 70 + }, + { + "epoch": 0.001226474576563971, + "grad_norm": 0.298828125, + "grad_norm_var": 0.0014126936594645182, + "learning_rate": 0.01, + "loss": 1.3394, + "loss/crossentropy": 2.496403932571411, + "loss/fcd": 1.10546875, + "loss/logits": 0.23832575976848602, + "step": 71 + }, + { + "epoch": 0.001243748866374731, + "grad_norm": 0.30078125, + "grad_norm_var": 0.0013817946116129558, + "learning_rate": 0.01, + "loss": 1.3945, + "loss/crossentropy": 2.330789804458618, + "loss/fcd": 1.078125, + "loss/logits": 0.23751115798950195, + "step": 72 + }, + { + "epoch": 0.0012610231561854914, + "grad_norm": 0.287109375, + "grad_norm_var": 0.0014734745025634765, + "learning_rate": 0.01, + "loss": 1.3859, + "loss/crossentropy": 2.5367313623428345, + "loss/fcd": 1.19921875, + "loss/logits": 0.2804088890552521, + "step": 73 + }, + { + "epoch": 0.0012782974459962515, + "grad_norm": 0.28515625, + "grad_norm_var": 0.001559305191040039, + "learning_rate": 0.01, + "loss": 1.3887, + "loss/crossentropy": 2.3117035627365112, + "loss/fcd": 1.0625, + "loss/logits": 0.2553889825940132, + "step": 74 + }, + { + "epoch": 0.0012955717358070116, + "grad_norm": 0.302734375, + "grad_norm_var": 0.001582193374633789, + "learning_rate": 0.01, + "loss": 1.4083, + "loss/crossentropy": 2.5574092864990234, + "loss/fcd": 1.12890625, + "loss/logits": 0.24754850566387177, + "step": 75 + }, + { + "epoch": 0.0013128460256177717, + "grad_norm": 0.287109375, + "grad_norm_var": 0.0013358910878499349, + "learning_rate": 0.01, + "loss": 1.39, + "loss/crossentropy": 2.5164517164230347, + "loss/fcd": 1.13671875, + "loss/logits": 0.23118755221366882, + "step": 76 + }, + { + "epoch": 0.001330120315428532, + "grad_norm": 0.3203125, + "grad_norm_var": 0.0012410481770833333, + "learning_rate": 0.01, + "loss": 1.4129, + "loss/crossentropy": 2.4725937843322754, + "loss/fcd": 1.11328125, + "loss/logits": 0.2354634776711464, + "step": 77 + }, + { + "epoch": 0.001347394605239292, + "grad_norm": 0.52734375, + "grad_norm_var": 0.0040692488352457685, + "learning_rate": 0.01, + "loss": 1.5435, + "loss/crossentropy": 2.067330479621887, + "loss/fcd": 1.0703125, + "loss/logits": 0.2535740062594414, + "step": 78 + }, + { + "epoch": 0.0013646688950500522, + "grad_norm": 0.30078125, + "grad_norm_var": 0.0040776570638020836, + "learning_rate": 0.01, + "loss": 1.3808, + "loss/crossentropy": 2.363155961036682, + "loss/fcd": 1.109375, + "loss/logits": 0.2392946034669876, + "step": 79 + }, + { + "epoch": 0.0013819431848608125, + "grad_norm": 0.310546875, + "grad_norm_var": 0.004054371515909831, + "learning_rate": 0.01, + "loss": 1.4014, + "loss/crossentropy": 2.561974883079529, + "loss/fcd": 1.12109375, + "loss/logits": 0.2719137519598007, + "step": 80 + }, + { + "epoch": 0.0013992174746715726, + "grad_norm": 0.33203125, + "grad_norm_var": 0.0034375349680582684, + "learning_rate": 0.01, + "loss": 1.3718, + "loss/crossentropy": 2.5669400691986084, + "loss/fcd": 1.1875, + "loss/logits": 0.27283619344234467, + "step": 81 + }, + { + "epoch": 0.0014164917644823327, + "grad_norm": 0.318359375, + "grad_norm_var": 0.003415362040201823, + "learning_rate": 0.01, + "loss": 1.423, + "loss/crossentropy": 2.3874313831329346, + "loss/fcd": 1.11328125, + "loss/logits": 0.25072336941957474, + "step": 82 + }, + { + "epoch": 0.001433766054293093, + "grad_norm": 0.29296875, + "grad_norm_var": 0.003453509012858073, + "learning_rate": 0.01, + "loss": 1.4176, + "loss/crossentropy": 2.711247205734253, + "loss/fcd": 1.23046875, + "loss/logits": 0.28591448068618774, + "step": 83 + }, + { + "epoch": 0.001451040344103853, + "grad_norm": 0.287109375, + "grad_norm_var": 0.0034200032552083332, + "learning_rate": 0.01, + "loss": 1.3905, + "loss/crossentropy": 2.549779772758484, + "loss/fcd": 1.12109375, + "loss/logits": 0.2730839252471924, + "step": 84 + }, + { + "epoch": 0.0014683146339146132, + "grad_norm": 0.322265625, + "grad_norm_var": 0.0033526102701822917, + "learning_rate": 0.01, + "loss": 1.3706, + "loss/crossentropy": 2.255567193031311, + "loss/fcd": 1.1015625, + "loss/logits": 0.2550910860300064, + "step": 85 + }, + { + "epoch": 0.0014855889237253732, + "grad_norm": 0.6015625, + "grad_norm_var": 0.008341471354166666, + "learning_rate": 0.01, + "loss": 1.629, + "loss/crossentropy": 2.245366394519806, + "loss/fcd": 1.58203125, + "loss/logits": 0.3177703619003296, + "step": 86 + }, + { + "epoch": 0.0015028632135361336, + "grad_norm": 0.361328125, + "grad_norm_var": 0.0082763671875, + "learning_rate": 0.01, + "loss": 1.3925, + "loss/crossentropy": 2.5329853296279907, + "loss/fcd": 1.1640625, + "loss/logits": 0.2691914439201355, + "step": 87 + }, + { + "epoch": 0.0015201375033468936, + "grad_norm": 0.337890625, + "grad_norm_var": 0.008169158299763998, + "learning_rate": 0.01, + "loss": 1.3783, + "loss/crossentropy": 2.573711633682251, + "loss/fcd": 1.11328125, + "loss/logits": 0.24663084745407104, + "step": 88 + }, + { + "epoch": 0.0015374117931576537, + "grad_norm": 0.3046875, + "grad_norm_var": 0.008059438069661458, + "learning_rate": 0.01, + "loss": 1.3466, + "loss/crossentropy": 2.4545916318893433, + "loss/fcd": 1.14453125, + "loss/logits": 0.22179614007472992, + "step": 89 + }, + { + "epoch": 0.001554686082968414, + "grad_norm": 0.353515625, + "grad_norm_var": 0.007821893692016602, + "learning_rate": 0.01, + "loss": 1.4058, + "loss/crossentropy": 2.0489944219589233, + "loss/fcd": 1.1484375, + "loss/logits": 0.25446537882089615, + "step": 90 + }, + { + "epoch": 0.0015719603727791741, + "grad_norm": 0.333984375, + "grad_norm_var": 0.007696262995402018, + "learning_rate": 0.01, + "loss": 1.4186, + "loss/crossentropy": 2.6278460025787354, + "loss/fcd": 1.17578125, + "loss/logits": 0.2563782036304474, + "step": 91 + }, + { + "epoch": 0.0015892346625899342, + "grad_norm": 0.33984375, + "grad_norm_var": 0.0074314753214518225, + "learning_rate": 0.01, + "loss": 1.4634, + "loss/crossentropy": 2.3578550815582275, + "loss/fcd": 1.13671875, + "loss/logits": 0.26509464532136917, + "step": 92 + }, + { + "epoch": 0.0016065089524006945, + "grad_norm": 0.30078125, + "grad_norm_var": 0.007539876302083333, + "learning_rate": 0.01, + "loss": 1.3685, + "loss/crossentropy": 2.53238308429718, + "loss/fcd": 1.0859375, + "loss/logits": 0.24864411354064941, + "step": 93 + }, + { + "epoch": 0.0016237832422114546, + "grad_norm": 0.302734375, + "grad_norm_var": 0.005428679784138997, + "learning_rate": 0.01, + "loss": 1.4136, + "loss/crossentropy": 2.3873801231384277, + "loss/fcd": 1.2265625, + "loss/logits": 0.2842061370611191, + "step": 94 + }, + { + "epoch": 0.0016410575320222147, + "grad_norm": 0.29296875, + "grad_norm_var": 0.005470768610636393, + "learning_rate": 0.01, + "loss": 1.4, + "loss/crossentropy": 2.576484441757202, + "loss/fcd": 1.203125, + "loss/logits": 0.2684750333428383, + "step": 95 + }, + { + "epoch": 0.0016583318218329748, + "grad_norm": 0.30859375, + "grad_norm_var": 0.0054779052734375, + "learning_rate": 0.01, + "loss": 1.4256, + "loss/crossentropy": 2.5171070098876953, + "loss/fcd": 1.24609375, + "loss/logits": 0.2969086170196533, + "step": 96 + }, + { + "epoch": 0.0016756061116437351, + "grad_norm": 0.306640625, + "grad_norm_var": 0.005534728368123372, + "learning_rate": 0.01, + "loss": 1.3949, + "loss/crossentropy": 2.6096785068511963, + "loss/fcd": 1.12890625, + "loss/logits": 0.2719826400279999, + "step": 97 + }, + { + "epoch": 0.0016928804014544952, + "grad_norm": 0.298828125, + "grad_norm_var": 0.0056027571360270185, + "learning_rate": 0.01, + "loss": 1.3758, + "loss/crossentropy": 2.366774320602417, + "loss/fcd": 1.1015625, + "loss/logits": 0.23891064524650574, + "step": 98 + }, + { + "epoch": 0.0017101546912652553, + "grad_norm": 0.34765625, + "grad_norm_var": 0.005489714940388997, + "learning_rate": 0.01, + "loss": 1.436, + "loss/crossentropy": 2.356974244117737, + "loss/fcd": 1.3046875, + "loss/logits": 0.2715897411108017, + "step": 99 + }, + { + "epoch": 0.0017274289810760156, + "grad_norm": 0.287109375, + "grad_norm_var": 0.005489714940388997, + "learning_rate": 0.01, + "loss": 1.3544, + "loss/crossentropy": 2.5830947160720825, + "loss/fcd": 1.15625, + "loss/logits": 0.28681397438049316, + "step": 100 + }, + { + "epoch": 0.0017447032708867757, + "grad_norm": 0.296875, + "grad_norm_var": 0.0055816650390625, + "learning_rate": 0.01, + "loss": 1.3767, + "loss/crossentropy": 2.538628339767456, + "loss/fcd": 1.0859375, + "loss/logits": 0.2549655809998512, + "step": 101 + }, + { + "epoch": 0.0017619775606975358, + "grad_norm": 0.302734375, + "grad_norm_var": 0.0005793094635009766, + "learning_rate": 0.01, + "loss": 1.3127, + "loss/crossentropy": 2.153649151325226, + "loss/fcd": 1.0546875, + "loss/logits": 0.23056582361459732, + "step": 102 + }, + { + "epoch": 0.0017792518505082959, + "grad_norm": 0.337890625, + "grad_norm_var": 0.0004759311676025391, + "learning_rate": 0.01, + "loss": 1.4807, + "loss/crossentropy": 2.7840667963027954, + "loss/fcd": 1.203125, + "loss/logits": 0.26921743154525757, + "step": 103 + }, + { + "epoch": 0.0017965261403190562, + "grad_norm": 0.3046875, + "grad_norm_var": 0.0004470189412434896, + "learning_rate": 0.01, + "loss": 1.4075, + "loss/crossentropy": 2.375385046005249, + "loss/fcd": 1.10546875, + "loss/logits": 0.2573629766702652, + "step": 104 + }, + { + "epoch": 0.0018138004301298163, + "grad_norm": 0.33203125, + "grad_norm_var": 0.0004608154296875, + "learning_rate": 0.01, + "loss": 1.3262, + "loss/crossentropy": 2.7132558822631836, + "loss/fcd": 1.109375, + "loss/logits": 0.2457902729511261, + "step": 105 + }, + { + "epoch": 0.0018310747199405764, + "grad_norm": 0.2890625, + "grad_norm_var": 0.00039315223693847656, + "learning_rate": 0.01, + "loss": 1.292, + "loss/crossentropy": 2.017941474914551, + "loss/fcd": 0.986328125, + "loss/logits": 0.20789727568626404, + "step": 106 + }, + { + "epoch": 0.0018483490097513367, + "grad_norm": 0.310546875, + "grad_norm_var": 0.00035691261291503906, + "learning_rate": 0.01, + "loss": 1.4188, + "loss/crossentropy": 2.457041621208191, + "loss/fcd": 1.11328125, + "loss/logits": 0.23911338299512863, + "step": 107 + }, + { + "epoch": 0.0018656232995620968, + "grad_norm": 0.267578125, + "grad_norm_var": 0.00039513905843098957, + "learning_rate": 0.01, + "loss": 1.3624, + "loss/crossentropy": 2.264693021774292, + "loss/fcd": 1.1328125, + "loss/logits": 0.23969107121229172, + "step": 108 + }, + { + "epoch": 0.0018828975893728569, + "grad_norm": 0.27734375, + "grad_norm_var": 0.0004439671834309896, + "learning_rate": 0.01, + "loss": 1.3602, + "loss/crossentropy": 2.5558459758758545, + "loss/fcd": 1.12890625, + "loss/logits": 0.24982617795467377, + "step": 109 + }, + { + "epoch": 0.0019001718791836172, + "grad_norm": 0.44921875, + "grad_norm_var": 0.0017612298329671224, + "learning_rate": 0.01, + "loss": 1.4482, + "loss/crossentropy": 2.623742938041687, + "loss/fcd": 1.140625, + "loss/logits": 0.2605845034122467, + "step": 110 + }, + { + "epoch": 0.0019174461689943773, + "grad_norm": 0.291015625, + "grad_norm_var": 0.0017667134602864583, + "learning_rate": 0.01, + "loss": 1.4127, + "loss/crossentropy": 2.7532334327697754, + "loss/fcd": 1.171875, + "loss/logits": 0.26577115058898926, + "step": 111 + }, + { + "epoch": 0.0019347204588051373, + "grad_norm": 0.294921875, + "grad_norm_var": 0.0017864068349202475, + "learning_rate": 0.01, + "loss": 1.3525, + "loss/crossentropy": 2.4502193927764893, + "loss/fcd": 1.05859375, + "loss/logits": 0.2550206333398819, + "step": 112 + }, + { + "epoch": 0.0019519947486158974, + "grad_norm": 0.33984375, + "grad_norm_var": 0.0018309911092122396, + "learning_rate": 0.01, + "loss": 1.4422, + "loss/crossentropy": 2.0644272565841675, + "loss/fcd": 1.08203125, + "loss/logits": 0.25845974683761597, + "step": 113 + }, + { + "epoch": 0.0019692690384266577, + "grad_norm": 0.30859375, + "grad_norm_var": 0.0018169244130452475, + "learning_rate": 0.01, + "loss": 1.3762, + "loss/crossentropy": 2.6453906297683716, + "loss/fcd": 1.16015625, + "loss/logits": 0.28696541488170624, + "step": 114 + }, + { + "epoch": 0.001986543328237418, + "grad_norm": 0.3125, + "grad_norm_var": 0.0017402489980061849, + "learning_rate": 0.01, + "loss": 1.3974, + "loss/crossentropy": 2.229590892791748, + "loss/fcd": 1.04296875, + "loss/logits": 0.22459837794303894, + "step": 115 + }, + { + "epoch": 0.002003817618048178, + "grad_norm": 0.294921875, + "grad_norm_var": 0.0017174879709879558, + "learning_rate": 0.01, + "loss": 1.3518, + "loss/crossentropy": 2.5267633199691772, + "loss/fcd": 1.08203125, + "loss/logits": 0.24026738852262497, + "step": 116 + }, + { + "epoch": 0.0020210919078589382, + "grad_norm": 0.328125, + "grad_norm_var": 0.0017108758290608724, + "learning_rate": 0.01, + "loss": 1.4729, + "loss/crossentropy": 2.3015085458755493, + "loss/fcd": 1.08203125, + "loss/logits": 0.23641249537467957, + "step": 117 + }, + { + "epoch": 0.002038366197669698, + "grad_norm": 0.306640625, + "grad_norm_var": 0.0017054080963134766, + "learning_rate": 0.01, + "loss": 1.4479, + "loss/crossentropy": 2.0869252681732178, + "loss/fcd": 1.14453125, + "loss/logits": 0.2337687686085701, + "step": 118 + }, + { + "epoch": 0.0020556404874804584, + "grad_norm": 0.3125, + "grad_norm_var": 0.0016692479451497395, + "learning_rate": 0.01, + "loss": 1.3789, + "loss/crossentropy": 2.620050311088562, + "loss/fcd": 1.1328125, + "loss/logits": 0.2408916875720024, + "step": 119 + }, + { + "epoch": 0.0020729147772912187, + "grad_norm": 0.296875, + "grad_norm_var": 0.0016824722290039063, + "learning_rate": 0.01, + "loss": 1.3728, + "loss/crossentropy": 2.406272053718567, + "loss/fcd": 1.28515625, + "loss/logits": 0.27460669726133347, + "step": 120 + }, + { + "epoch": 0.0020901890671019786, + "grad_norm": 0.298828125, + "grad_norm_var": 0.0016681512196858725, + "learning_rate": 0.01, + "loss": 1.3607, + "loss/crossentropy": 2.1980100870132446, + "loss/fcd": 1.013671875, + "loss/logits": 0.23184800148010254, + "step": 121 + }, + { + "epoch": 0.002107463356912739, + "grad_norm": 0.365234375, + "grad_norm_var": 0.0018063863118489584, + "learning_rate": 0.01, + "loss": 1.4133, + "loss/crossentropy": 2.672022223472595, + "loss/fcd": 1.24609375, + "loss/logits": 0.2712271511554718, + "step": 122 + }, + { + "epoch": 0.002124737646723499, + "grad_norm": 0.318359375, + "grad_norm_var": 0.0018046061197916667, + "learning_rate": 0.01, + "loss": 1.4161, + "loss/crossentropy": 2.161317527294159, + "loss/fcd": 1.1640625, + "loss/logits": 0.2415143996477127, + "step": 123 + }, + { + "epoch": 0.002142011936534259, + "grad_norm": 0.310546875, + "grad_norm_var": 0.0016402562459309896, + "learning_rate": 0.01, + "loss": 1.3432, + "loss/crossentropy": 2.4041404724121094, + "loss/fcd": 1.1171875, + "loss/logits": 0.2565518468618393, + "step": 124 + }, + { + "epoch": 0.0021592862263450194, + "grad_norm": 0.296875, + "grad_norm_var": 0.0015553792317708334, + "learning_rate": 0.01, + "loss": 1.3222, + "loss/crossentropy": 2.289466381072998, + "loss/fcd": 1.11328125, + "loss/logits": 0.2588811218738556, + "step": 125 + }, + { + "epoch": 0.0021765605161557797, + "grad_norm": 0.31640625, + "grad_norm_var": 0.0003751118977864583, + "learning_rate": 0.01, + "loss": 1.4145, + "loss/crossentropy": 2.0946825742721558, + "loss/fcd": 1.0546875, + "loss/logits": 0.22345608472824097, + "step": 126 + }, + { + "epoch": 0.0021938348059665396, + "grad_norm": 0.318359375, + "grad_norm_var": 0.0003452936808268229, + "learning_rate": 0.01, + "loss": 1.3904, + "loss/crossentropy": 2.4527688026428223, + "loss/fcd": 1.078125, + "loss/logits": 0.23762068152427673, + "step": 127 + }, + { + "epoch": 0.0022111090957773, + "grad_norm": 0.31640625, + "grad_norm_var": 0.0003202915191650391, + "learning_rate": 0.01, + "loss": 1.4117, + "loss/crossentropy": 2.6558061838150024, + "loss/fcd": 1.26953125, + "loss/logits": 0.3351695239543915, + "step": 128 + }, + { + "epoch": 0.00222838338558806, + "grad_norm": 0.3125, + "grad_norm_var": 0.00027667681376139324, + "learning_rate": 0.01, + "loss": 1.3841, + "loss/crossentropy": 2.3390719890594482, + "loss/fcd": 1.0703125, + "loss/logits": 0.23404338955879211, + "step": 129 + }, + { + "epoch": 0.00224565767539882, + "grad_norm": 0.302734375, + "grad_norm_var": 0.00028254191080729165, + "learning_rate": 0.01, + "loss": 1.3402, + "loss/crossentropy": 2.5888524055480957, + "loss/fcd": 1.09765625, + "loss/logits": 0.2385600358247757, + "step": 130 + }, + { + "epoch": 0.0022629319652095804, + "grad_norm": 0.365234375, + "grad_norm_var": 0.00045291582743326825, + "learning_rate": 0.01, + "loss": 1.4423, + "loss/crossentropy": 2.1622209548950195, + "loss/fcd": 1.125, + "loss/logits": 0.25934895873069763, + "step": 131 + }, + { + "epoch": 0.0022802062550203407, + "grad_norm": 0.349609375, + "grad_norm_var": 0.0004840691884358724, + "learning_rate": 0.01, + "loss": 1.5001, + "loss/crossentropy": 2.5385576486587524, + "loss/fcd": 1.18359375, + "loss/logits": 0.26659196615219116, + "step": 132 + }, + { + "epoch": 0.0022974805448311006, + "grad_norm": 0.2890625, + "grad_norm_var": 0.0005355676015218099, + "learning_rate": 0.01, + "loss": 1.3481, + "loss/crossentropy": 2.348211407661438, + "loss/fcd": 1.09375, + "loss/logits": 0.2560664862394333, + "step": 133 + }, + { + "epoch": 0.002314754834641861, + "grad_norm": 0.3125, + "grad_norm_var": 0.0005294164021809896, + "learning_rate": 0.01, + "loss": 1.3607, + "loss/crossentropy": 2.117067277431488, + "loss/fcd": 1.07421875, + "loss/logits": 0.22807861864566803, + "step": 134 + }, + { + "epoch": 0.0023320291244526207, + "grad_norm": 0.287109375, + "grad_norm_var": 0.0005870660146077474, + "learning_rate": 0.01, + "loss": 1.387, + "loss/crossentropy": 2.5187747478485107, + "loss/fcd": 1.16796875, + "loss/logits": 0.27947917580604553, + "step": 135 + }, + { + "epoch": 0.002349303414263381, + "grad_norm": 0.326171875, + "grad_norm_var": 0.0005658467610677084, + "learning_rate": 0.01, + "loss": 1.3995, + "loss/crossentropy": 2.4953707456588745, + "loss/fcd": 1.09375, + "loss/logits": 0.24946419894695282, + "step": 136 + }, + { + "epoch": 0.0023665777040741414, + "grad_norm": 0.30078125, + "grad_norm_var": 0.0005611260732014974, + "learning_rate": 0.01, + "loss": 1.4027, + "loss/crossentropy": 2.3007187843322754, + "loss/fcd": 1.2109375, + "loss/logits": 0.2944917380809784, + "step": 137 + }, + { + "epoch": 0.0023838519938849012, + "grad_norm": 0.3203125, + "grad_norm_var": 0.0004042943318684896, + "learning_rate": 0.01, + "loss": 1.3784, + "loss/crossentropy": 2.406763792037964, + "loss/fcd": 1.0625, + "loss/logits": 0.24067886918783188, + "step": 138 + }, + { + "epoch": 0.0024011262836956615, + "grad_norm": 0.287109375, + "grad_norm_var": 0.0004521052042643229, + "learning_rate": 0.01, + "loss": 1.394, + "loss/crossentropy": 2.3716171979904175, + "loss/fcd": 1.09375, + "loss/logits": 0.2490846812725067, + "step": 139 + }, + { + "epoch": 0.002418400573506422, + "grad_norm": 0.30859375, + "grad_norm_var": 0.0004530429840087891, + "learning_rate": 0.01, + "loss": 1.3992, + "loss/crossentropy": 2.298838496208191, + "loss/fcd": 1.12109375, + "loss/logits": 0.2580900937318802, + "step": 140 + }, + { + "epoch": 0.0024356748633171817, + "grad_norm": 0.287109375, + "grad_norm_var": 0.00048014322916666664, + "learning_rate": 0.01, + "loss": 1.3887, + "loss/crossentropy": 2.1861318349838257, + "loss/fcd": 1.09375, + "loss/logits": 0.25625482201576233, + "step": 141 + }, + { + "epoch": 0.002452949153127942, + "grad_norm": 0.28125, + "grad_norm_var": 0.0005390803019205729, + "learning_rate": 0.01, + "loss": 1.4149, + "loss/crossentropy": 2.5295623540878296, + "loss/fcd": 1.14453125, + "loss/logits": 0.24908355623483658, + "step": 142 + }, + { + "epoch": 0.0024702234429387023, + "grad_norm": 0.298828125, + "grad_norm_var": 0.0005419413248697917, + "learning_rate": 0.01, + "loss": 1.4095, + "loss/crossentropy": 2.4763203859329224, + "loss/fcd": 1.14453125, + "loss/logits": 0.25878605246543884, + "step": 143 + }, + { + "epoch": 0.002487497732749462, + "grad_norm": 0.32421875, + "grad_norm_var": 0.0005533854166666667, + "learning_rate": 0.01, + "loss": 1.4244, + "loss/crossentropy": 2.520187020301819, + "loss/fcd": 1.1328125, + "loss/logits": 0.24524306505918503, + "step": 144 + }, + { + "epoch": 0.0025047720225602225, + "grad_norm": 0.294921875, + "grad_norm_var": 0.0005658308664957683, + "learning_rate": 0.01, + "loss": 1.4039, + "loss/crossentropy": 2.517001748085022, + "loss/fcd": 1.1328125, + "loss/logits": 0.23872993886470795, + "step": 145 + }, + { + "epoch": 0.002522046312370983, + "grad_norm": 0.291015625, + "grad_norm_var": 0.0005833784739176433, + "learning_rate": 0.01, + "loss": 1.3182, + "loss/crossentropy": 2.4004757404327393, + "loss/fcd": 1.0625, + "loss/logits": 0.24094465374946594, + "step": 146 + }, + { + "epoch": 0.0025393206021817427, + "grad_norm": 0.283203125, + "grad_norm_var": 0.0003750960032145182, + "learning_rate": 0.01, + "loss": 1.3334, + "loss/crossentropy": 2.1713826656341553, + "loss/fcd": 1.12109375, + "loss/logits": 0.22458232194185257, + "step": 147 + }, + { + "epoch": 0.002556594891992503, + "grad_norm": 0.34375, + "grad_norm_var": 0.00034052530924479166, + "learning_rate": 0.01, + "loss": 1.3361, + "loss/crossentropy": 2.438323974609375, + "loss/fcd": 1.15234375, + "loss/logits": 0.24637237191200256, + "step": 148 + }, + { + "epoch": 0.0025738691818032633, + "grad_norm": 0.380859375, + "grad_norm_var": 0.0007058302561442057, + "learning_rate": 0.01, + "loss": 1.4953, + "loss/crossentropy": 2.450320243835449, + "loss/fcd": 1.1171875, + "loss/logits": 0.24158670753240585, + "step": 149 + }, + { + "epoch": 0.002591143471614023, + "grad_norm": 0.306640625, + "grad_norm_var": 0.0007044474283854166, + "learning_rate": 0.01, + "loss": 1.4629, + "loss/crossentropy": 2.294734477996826, + "loss/fcd": 1.2421875, + "loss/logits": 0.2762032076716423, + "step": 150 + }, + { + "epoch": 0.0026084177614247835, + "grad_norm": 0.33203125, + "grad_norm_var": 0.0007077376047770182, + "learning_rate": 0.01, + "loss": 1.4165, + "loss/crossentropy": 2.468201994895935, + "loss/fcd": 1.21875, + "loss/logits": 0.2507496029138565, + "step": 151 + }, + { + "epoch": 0.0026256920512355434, + "grad_norm": 0.314453125, + "grad_norm_var": 0.0006917158762613933, + "learning_rate": 0.01, + "loss": 1.395, + "loss/crossentropy": 2.353287696838379, + "loss/fcd": 1.18359375, + "loss/logits": 0.2722310647368431, + "step": 152 + }, + { + "epoch": 0.0026429663410463037, + "grad_norm": 0.294921875, + "grad_norm_var": 0.0007008234659830729, + "learning_rate": 0.01, + "loss": 1.3506, + "loss/crossentropy": 2.2797771692276, + "loss/fcd": 1.1484375, + "loss/logits": 0.2620129883289337, + "step": 153 + }, + { + "epoch": 0.002660240630857064, + "grad_norm": 0.287109375, + "grad_norm_var": 0.0007210890452067057, + "learning_rate": 0.01, + "loss": 1.3943, + "loss/crossentropy": 2.6261144876480103, + "loss/fcd": 1.21484375, + "loss/logits": 0.3041190207004547, + "step": 154 + }, + { + "epoch": 0.002677514920667824, + "grad_norm": 0.28125, + "grad_norm_var": 0.0007389704386393229, + "learning_rate": 0.01, + "loss": 1.4487, + "loss/crossentropy": 2.327589750289917, + "loss/fcd": 1.2890625, + "loss/logits": 0.333427369594574, + "step": 155 + }, + { + "epoch": 0.002694789210478584, + "grad_norm": 0.31640625, + "grad_norm_var": 0.0007445653279622396, + "learning_rate": 0.01, + "loss": 1.3842, + "loss/crossentropy": 2.4801390171051025, + "loss/fcd": 1.1640625, + "loss/logits": 0.23910623788833618, + "step": 156 + }, + { + "epoch": 0.0027120635002893445, + "grad_norm": 0.2890625, + "grad_norm_var": 0.0007395267486572266, + "learning_rate": 0.01, + "loss": 1.3487, + "loss/crossentropy": 2.577694535255432, + "loss/fcd": 1.16015625, + "loss/logits": 0.2568306028842926, + "step": 157 + }, + { + "epoch": 0.0027293377901001043, + "grad_norm": 0.314453125, + "grad_norm_var": 0.0006922403971354167, + "learning_rate": 0.01, + "loss": 1.3505, + "loss/crossentropy": 2.415543556213379, + "loss/fcd": 1.09375, + "loss/logits": 0.2512781471014023, + "step": 158 + }, + { + "epoch": 0.0027466120799108647, + "grad_norm": 0.302734375, + "grad_norm_var": 0.0006875991821289062, + "learning_rate": 0.01, + "loss": 1.4042, + "loss/crossentropy": 2.4328696727752686, + "loss/fcd": 1.0859375, + "loss/logits": 0.2584942355751991, + "step": 159 + }, + { + "epoch": 0.002763886369721625, + "grad_norm": 0.2578125, + "grad_norm_var": 0.0008356730143229167, + "learning_rate": 0.01, + "loss": 1.2883, + "loss/crossentropy": 2.344989776611328, + "loss/fcd": 1.06640625, + "loss/logits": 0.23677106201648712, + "step": 160 + }, + { + "epoch": 0.002781160659532385, + "grad_norm": 0.302734375, + "grad_norm_var": 0.0008282979329427083, + "learning_rate": 0.01, + "loss": 1.3544, + "loss/crossentropy": 2.3909146785736084, + "loss/fcd": 1.1171875, + "loss/logits": 0.26238836348056793, + "step": 161 + }, + { + "epoch": 0.002798434949343145, + "grad_norm": 0.341796875, + "grad_norm_var": 0.000886980692545573, + "learning_rate": 0.01, + "loss": 1.4284, + "loss/crossentropy": 2.6815162897109985, + "loss/fcd": 1.23828125, + "loss/logits": 0.28333599865436554, + "step": 162 + }, + { + "epoch": 0.0028157092391539055, + "grad_norm": 0.306640625, + "grad_norm_var": 0.0008396784464518229, + "learning_rate": 0.01, + "loss": 1.3743, + "loss/crossentropy": 2.363664388656616, + "loss/fcd": 1.11328125, + "loss/logits": 0.23216551542282104, + "step": 163 + }, + { + "epoch": 0.0028329835289646653, + "grad_norm": 0.287109375, + "grad_norm_var": 0.0007912794748942058, + "learning_rate": 0.01, + "loss": 1.3503, + "loss/crossentropy": 2.6360952854156494, + "loss/fcd": 1.19140625, + "loss/logits": 0.25444111227989197, + "step": 164 + }, + { + "epoch": 0.0028502578187754256, + "grad_norm": 0.298828125, + "grad_norm_var": 0.0004067579905192057, + "learning_rate": 0.01, + "loss": 1.3827, + "loss/crossentropy": 2.255971908569336, + "loss/fcd": 1.08984375, + "loss/logits": 0.2420385479927063, + "step": 165 + }, + { + "epoch": 0.002867532108586186, + "grad_norm": 0.328125, + "grad_norm_var": 0.00044854482014973957, + "learning_rate": 0.01, + "loss": 1.3572, + "loss/crossentropy": 2.5781320333480835, + "loss/fcd": 1.12109375, + "loss/logits": 0.2430611252784729, + "step": 166 + }, + { + "epoch": 0.002884806398396946, + "grad_norm": 0.298828125, + "grad_norm_var": 0.0003909905751546224, + "learning_rate": 0.01, + "loss": 1.394, + "loss/crossentropy": 2.698032259941101, + "loss/fcd": 1.16796875, + "loss/logits": 0.248212069272995, + "step": 167 + }, + { + "epoch": 0.002902080688207706, + "grad_norm": 0.291015625, + "grad_norm_var": 0.0003845055898030599, + "learning_rate": 0.01, + "loss": 1.4097, + "loss/crossentropy": 2.372989535331726, + "loss/fcd": 1.140625, + "loss/logits": 0.24837365001440048, + "step": 168 + }, + { + "epoch": 0.002919354978018466, + "grad_norm": 0.30859375, + "grad_norm_var": 0.0003870646158854167, + "learning_rate": 0.01, + "loss": 1.3624, + "loss/crossentropy": 2.555245876312256, + "loss/fcd": 1.12890625, + "loss/logits": 0.2645147144794464, + "step": 169 + }, + { + "epoch": 0.0029366292678292263, + "grad_norm": 0.291015625, + "grad_norm_var": 0.00038089752197265623, + "learning_rate": 0.01, + "loss": 1.3458, + "loss/crossentropy": 2.2800326347351074, + "loss/fcd": 1.0390625, + "loss/logits": 0.22108863294124603, + "step": 170 + }, + { + "epoch": 0.0029539035576399866, + "grad_norm": 0.30078125, + "grad_norm_var": 0.000353240966796875, + "learning_rate": 0.01, + "loss": 1.3788, + "loss/crossentropy": 2.638196110725403, + "loss/fcd": 1.15234375, + "loss/logits": 0.2918136268854141, + "step": 171 + }, + { + "epoch": 0.0029711778474507465, + "grad_norm": 0.294921875, + "grad_norm_var": 0.00034152666727701824, + "learning_rate": 0.01, + "loss": 1.3664, + "loss/crossentropy": 2.6176986694335938, + "loss/fcd": 1.1640625, + "loss/logits": 0.26864323019981384, + "step": 172 + }, + { + "epoch": 0.002988452137261507, + "grad_norm": 0.30859375, + "grad_norm_var": 0.0003345330556233724, + "learning_rate": 0.01, + "loss": 1.4184, + "loss/crossentropy": 2.62368905544281, + "loss/fcd": 1.25390625, + "loss/logits": 0.28509171307086945, + "step": 173 + }, + { + "epoch": 0.003005726427072267, + "grad_norm": 0.671875, + "grad_norm_var": 0.00890649159749349, + "learning_rate": 0.01, + "loss": 1.4685, + "loss/crossentropy": 2.309454083442688, + "loss/fcd": 1.125, + "loss/logits": 0.26153236627578735, + "step": 174 + }, + { + "epoch": 0.003023000716883027, + "grad_norm": 0.32421875, + "grad_norm_var": 0.00887309710184733, + "learning_rate": 0.01, + "loss": 1.4154, + "loss/crossentropy": 2.320811152458191, + "loss/fcd": 1.07421875, + "loss/logits": 0.24308273196220398, + "step": 175 + }, + { + "epoch": 0.0030402750066937873, + "grad_norm": 0.306640625, + "grad_norm_var": 0.008579444885253907, + "learning_rate": 0.01, + "loss": 1.3805, + "loss/crossentropy": 2.579828977584839, + "loss/fcd": 1.14453125, + "loss/logits": 0.2542525976896286, + "step": 176 + }, + { + "epoch": 0.0030575492965045476, + "grad_norm": 0.302734375, + "grad_norm_var": 0.008579444885253907, + "learning_rate": 0.01, + "loss": 1.3868, + "loss/crossentropy": 2.5000842809677124, + "loss/fcd": 1.18359375, + "loss/logits": 0.2917867451906204, + "step": 177 + }, + { + "epoch": 0.0030748235863153075, + "grad_norm": 0.291015625, + "grad_norm_var": 0.008653004964192709, + "learning_rate": 0.01, + "loss": 1.3679, + "loss/crossentropy": 2.5240609645843506, + "loss/fcd": 1.13671875, + "loss/logits": 0.2740897983312607, + "step": 178 + }, + { + "epoch": 0.0030920978761260678, + "grad_norm": 0.75, + "grad_norm_var": 0.019812758763631186, + "learning_rate": 0.01, + "loss": 1.423, + "loss/crossentropy": 2.383319854736328, + "loss/fcd": 1.16015625, + "loss/logits": 0.2834385186433792, + "step": 179 + }, + { + "epoch": 0.003109372165936828, + "grad_norm": 0.3125, + "grad_norm_var": 0.01962865193684896, + "learning_rate": 0.01, + "loss": 1.3861, + "loss/crossentropy": 2.3524543046951294, + "loss/fcd": 1.1796875, + "loss/logits": 0.24870187044143677, + "step": 180 + }, + { + "epoch": 0.003126646455747588, + "grad_norm": 0.333984375, + "grad_norm_var": 0.01944268544514974, + "learning_rate": 0.01, + "loss": 1.4644, + "loss/crossentropy": 2.768381118774414, + "loss/fcd": 1.26171875, + "loss/logits": 0.3117068111896515, + "step": 181 + }, + { + "epoch": 0.0031439207455583483, + "grad_norm": 0.3125, + "grad_norm_var": 0.019518470764160155, + "learning_rate": 0.01, + "loss": 1.4071, + "loss/crossentropy": 2.5678982734680176, + "loss/fcd": 1.14453125, + "loss/logits": 0.25002971291542053, + "step": 182 + }, + { + "epoch": 0.0031611950353691086, + "grad_norm": 0.3203125, + "grad_norm_var": 0.019382969538370768, + "learning_rate": 0.01, + "loss": 1.4044, + "loss/crossentropy": 2.6401069164276123, + "loss/fcd": 1.17578125, + "loss/logits": 0.2738536596298218, + "step": 183 + }, + { + "epoch": 0.0031784693251798684, + "grad_norm": 0.318359375, + "grad_norm_var": 0.019187148412068686, + "learning_rate": 0.01, + "loss": 1.4165, + "loss/crossentropy": 2.3614484071731567, + "loss/fcd": 1.1171875, + "loss/logits": 0.28841613233089447, + "step": 184 + }, + { + "epoch": 0.0031957436149906288, + "grad_norm": 0.337890625, + "grad_norm_var": 0.01904290517171224, + "learning_rate": 0.01, + "loss": 1.4151, + "loss/crossentropy": 2.2044495344161987, + "loss/fcd": 1.09765625, + "loss/logits": 0.25532982498407364, + "step": 185 + }, + { + "epoch": 0.003213017904801389, + "grad_norm": 0.279296875, + "grad_norm_var": 0.019160970052083334, + "learning_rate": 0.01, + "loss": 1.3233, + "loss/crossentropy": 2.657314658164978, + "loss/fcd": 1.1171875, + "loss/logits": 0.2434261366724968, + "step": 186 + }, + { + "epoch": 0.003230292194612149, + "grad_norm": 0.322265625, + "grad_norm_var": 0.019019174575805663, + "learning_rate": 0.01, + "loss": 1.457, + "loss/crossentropy": 2.509123682975769, + "loss/fcd": 1.16015625, + "loss/logits": 0.27627624571323395, + "step": 187 + }, + { + "epoch": 0.0032475664844229092, + "grad_norm": 0.298828125, + "grad_norm_var": 0.01898535092671712, + "learning_rate": 0.01, + "loss": 1.4612, + "loss/crossentropy": 2.4355961084365845, + "loss/fcd": 1.15625, + "loss/logits": 0.2809949368238449, + "step": 188 + }, + { + "epoch": 0.003264840774233669, + "grad_norm": 0.314453125, + "grad_norm_var": 0.018945821126302085, + "learning_rate": 0.01, + "loss": 1.4111, + "loss/crossentropy": 2.657699465751648, + "loss/fcd": 1.1484375, + "loss/logits": 0.26505863666534424, + "step": 189 + }, + { + "epoch": 0.0032821150640444294, + "grad_norm": 0.56640625, + "grad_norm_var": 0.01528771718343099, + "learning_rate": 0.01, + "loss": 1.4753, + "loss/crossentropy": 2.4757652282714844, + "loss/fcd": 1.0546875, + "loss/logits": 0.22812122106552124, + "step": 190 + }, + { + "epoch": 0.0032993893538551897, + "grad_norm": 0.296875, + "grad_norm_var": 0.015449269612630209, + "learning_rate": 0.01, + "loss": 1.3867, + "loss/crossentropy": 2.4966439604759216, + "loss/fcd": 1.14453125, + "loss/logits": 0.24755483120679855, + "step": 191 + }, + { + "epoch": 0.0033166636436659496, + "grad_norm": 0.30859375, + "grad_norm_var": 0.015437173843383788, + "learning_rate": 0.01, + "loss": 1.4331, + "loss/crossentropy": 2.2156739234924316, + "loss/fcd": 1.125, + "loss/logits": 0.24708709865808487, + "step": 192 + }, + { + "epoch": 0.00333393793347671, + "grad_norm": 0.337890625, + "grad_norm_var": 0.015273523330688477, + "learning_rate": 0.01, + "loss": 1.4652, + "loss/crossentropy": 2.5916343927383423, + "loss/fcd": 1.15625, + "loss/logits": 0.26975981891155243, + "step": 193 + }, + { + "epoch": 0.0033512122232874702, + "grad_norm": 0.294921875, + "grad_norm_var": 0.01524046262105306, + "learning_rate": 0.01, + "loss": 1.3916, + "loss/crossentropy": 2.4512441158294678, + "loss/fcd": 1.12890625, + "loss/logits": 0.2599586248397827, + "step": 194 + }, + { + "epoch": 0.00336848651309823, + "grad_norm": 0.271484375, + "grad_norm_var": 0.004449717203776042, + "learning_rate": 0.01, + "loss": 1.2906, + "loss/crossentropy": 2.4583925008773804, + "loss/fcd": 1.0859375, + "loss/logits": 0.22421551495790482, + "step": 195 + }, + { + "epoch": 0.0033857608029089904, + "grad_norm": 0.345703125, + "grad_norm_var": 0.004455931981404622, + "learning_rate": 0.01, + "loss": 1.4645, + "loss/crossentropy": 3.102002263069153, + "loss/fcd": 1.26953125, + "loss/logits": 0.31158843636512756, + "step": 196 + }, + { + "epoch": 0.0034030350927197507, + "grad_norm": 0.275390625, + "grad_norm_var": 0.00462950070699056, + "learning_rate": 0.01, + "loss": 1.3805, + "loss/crossentropy": 2.537242293357849, + "loss/fcd": 1.15234375, + "loss/logits": 0.24022500216960907, + "step": 197 + }, + { + "epoch": 0.0034203093825305106, + "grad_norm": 0.31640625, + "grad_norm_var": 0.004623905817667643, + "learning_rate": 0.01, + "loss": 1.4295, + "loss/crossentropy": 1.8695432543754578, + "loss/fcd": 1.24609375, + "loss/logits": 0.2338111400604248, + "step": 198 + }, + { + "epoch": 0.003437583672341271, + "grad_norm": 0.306640625, + "grad_norm_var": 0.004644711812337239, + "learning_rate": 0.01, + "loss": 1.4342, + "loss/crossentropy": 2.5979591608047485, + "loss/fcd": 1.17578125, + "loss/logits": 0.2477928102016449, + "step": 199 + }, + { + "epoch": 0.003454857962152031, + "grad_norm": 0.294921875, + "grad_norm_var": 0.004698117574055989, + "learning_rate": 0.01, + "loss": 1.3588, + "loss/crossentropy": 2.6363730430603027, + "loss/fcd": 1.12109375, + "loss/logits": 0.26602891087532043, + "step": 200 + }, + { + "epoch": 0.003472132251962791, + "grad_norm": 0.298828125, + "grad_norm_var": 0.004715919494628906, + "learning_rate": 0.01, + "loss": 1.3919, + "loss/crossentropy": 2.6225093603134155, + "loss/fcd": 1.09765625, + "loss/logits": 0.25346362590789795, + "step": 201 + }, + { + "epoch": 0.0034894065417735514, + "grad_norm": 0.296875, + "grad_norm_var": 0.004638528823852539, + "learning_rate": 0.01, + "loss": 1.3639, + "loss/crossentropy": 2.6900315284729004, + "loss/fcd": 1.140625, + "loss/logits": 0.25750475376844406, + "step": 202 + }, + { + "epoch": 0.0035066808315843117, + "grad_norm": 0.306640625, + "grad_norm_var": 0.0046525160471598305, + "learning_rate": 0.01, + "loss": 1.4454, + "loss/crossentropy": 2.4896918535232544, + "loss/fcd": 1.12109375, + "loss/logits": 0.24820879101753235, + "step": 203 + }, + { + "epoch": 0.0035239551213950716, + "grad_norm": 0.29296875, + "grad_norm_var": 0.004671732584635417, + "learning_rate": 0.01, + "loss": 1.3744, + "loss/crossentropy": 2.4207727909088135, + "loss/fcd": 1.1796875, + "loss/logits": 0.267853319644928, + "step": 204 + }, + { + "epoch": 0.003541229411205832, + "grad_norm": 0.296875, + "grad_norm_var": 0.004704777399698893, + "learning_rate": 0.01, + "loss": 1.3827, + "loss/crossentropy": 2.6077362298965454, + "loss/fcd": 1.19140625, + "loss/logits": 0.2449246495962143, + "step": 205 + }, + { + "epoch": 0.0035585037010165918, + "grad_norm": 0.302734375, + "grad_norm_var": 0.00035959879557291666, + "learning_rate": 0.01, + "loss": 1.36, + "loss/crossentropy": 2.2625831365585327, + "loss/fcd": 1.07421875, + "loss/logits": 0.25722844898700714, + "step": 206 + }, + { + "epoch": 0.003575777990827352, + "grad_norm": 0.326171875, + "grad_norm_var": 0.0003903547922770182, + "learning_rate": 0.01, + "loss": 1.4604, + "loss/crossentropy": 2.6293487548828125, + "loss/fcd": 1.15625, + "loss/logits": 0.2616356760263443, + "step": 207 + }, + { + "epoch": 0.0035930522806381124, + "grad_norm": 0.291015625, + "grad_norm_var": 0.00040022532145182293, + "learning_rate": 0.01, + "loss": 1.3362, + "loss/crossentropy": 2.4450851678848267, + "loss/fcd": 1.0859375, + "loss/logits": 0.23832125961780548, + "step": 208 + }, + { + "epoch": 0.0036103265704488722, + "grad_norm": 0.306640625, + "grad_norm_var": 0.0003178278605143229, + "learning_rate": 0.01, + "loss": 1.3815, + "loss/crossentropy": 2.265815496444702, + "loss/fcd": 1.25, + "loss/logits": 0.2856537625193596, + "step": 209 + }, + { + "epoch": 0.0036276008602596325, + "grad_norm": 0.298828125, + "grad_norm_var": 0.0003153483072916667, + "learning_rate": 0.01, + "loss": 1.3779, + "loss/crossentropy": 2.4830867052078247, + "loss/fcd": 1.18359375, + "loss/logits": 0.27156491577625275, + "step": 210 + }, + { + "epoch": 0.003644875150070393, + "grad_norm": 0.33203125, + "grad_norm_var": 0.00030007362365722654, + "learning_rate": 0.01, + "loss": 1.3838, + "loss/crossentropy": 2.4645248651504517, + "loss/fcd": 1.13671875, + "loss/logits": 0.2536320984363556, + "step": 211 + }, + { + "epoch": 0.0036621494398811527, + "grad_norm": 0.29296875, + "grad_norm_var": 0.000191497802734375, + "learning_rate": 0.01, + "loss": 1.3463, + "loss/crossentropy": 2.4574155807495117, + "loss/fcd": 1.04296875, + "loss/logits": 0.22712672501802444, + "step": 212 + }, + { + "epoch": 0.003679423729691913, + "grad_norm": 0.3125, + "grad_norm_var": 0.00014468828837076823, + "learning_rate": 0.01, + "loss": 1.3817, + "loss/crossentropy": 2.51455819606781, + "loss/fcd": 1.09765625, + "loss/logits": 0.24947896599769592, + "step": 213 + }, + { + "epoch": 0.0036966980195026733, + "grad_norm": 0.3046875, + "grad_norm_var": 0.00013477007548014323, + "learning_rate": 0.01, + "loss": 1.4295, + "loss/crossentropy": 2.5708523988723755, + "loss/fcd": 1.2109375, + "loss/logits": 0.3021456152200699, + "step": 214 + }, + { + "epoch": 0.0037139723093134332, + "grad_norm": 0.30859375, + "grad_norm_var": 0.0001357396443684896, + "learning_rate": 0.01, + "loss": 1.3823, + "loss/crossentropy": 2.696264386177063, + "loss/fcd": 1.12890625, + "loss/logits": 0.2742393985390663, + "step": 215 + }, + { + "epoch": 0.0037312465991241935, + "grad_norm": 0.326171875, + "grad_norm_var": 0.00015913645426432292, + "learning_rate": 0.01, + "loss": 1.4102, + "loss/crossentropy": 2.310886025428772, + "loss/fcd": 1.22265625, + "loss/logits": 0.2918149083852768, + "step": 216 + }, + { + "epoch": 0.003748520888934954, + "grad_norm": 0.302734375, + "grad_norm_var": 0.000156402587890625, + "learning_rate": 0.01, + "loss": 1.4137, + "loss/crossentropy": 2.2433084845542908, + "loss/fcd": 1.15625, + "loss/logits": 0.25447261333465576, + "step": 217 + }, + { + "epoch": 0.0037657951787457137, + "grad_norm": 0.322265625, + "grad_norm_var": 0.00016528765360514323, + "learning_rate": 0.01, + "loss": 1.4519, + "loss/crossentropy": 2.4079222679138184, + "loss/fcd": 1.140625, + "loss/logits": 0.2586686462163925, + "step": 218 + }, + { + "epoch": 0.003783069468556474, + "grad_norm": 0.279296875, + "grad_norm_var": 0.00021602312723795574, + "learning_rate": 0.01, + "loss": 1.415, + "loss/crossentropy": 2.460106134414673, + "loss/fcd": 1.15234375, + "loss/logits": 0.2525549978017807, + "step": 219 + }, + { + "epoch": 0.0038003437583672343, + "grad_norm": 0.2890625, + "grad_norm_var": 0.00022377967834472657, + "learning_rate": 0.01, + "loss": 1.4134, + "loss/crossentropy": 2.225171685218811, + "loss/fcd": 1.0546875, + "loss/logits": 0.22205037623643875, + "step": 220 + }, + { + "epoch": 0.003817618048177994, + "grad_norm": 0.26953125, + "grad_norm_var": 0.0003029982248942057, + "learning_rate": 0.01, + "loss": 1.3745, + "loss/crossentropy": 2.3788317441940308, + "loss/fcd": 1.1015625, + "loss/logits": 0.24135209619998932, + "step": 221 + }, + { + "epoch": 0.0038348923379887545, + "grad_norm": 0.294921875, + "grad_norm_var": 0.0003082116444905599, + "learning_rate": 0.01, + "loss": 1.4452, + "loss/crossentropy": 2.375778555870056, + "loss/fcd": 1.21484375, + "loss/logits": 0.25682032108306885, + "step": 222 + }, + { + "epoch": 0.0038521666277995144, + "grad_norm": 0.30078125, + "grad_norm_var": 0.0002720514933268229, + "learning_rate": 0.01, + "loss": 1.3877, + "loss/crossentropy": 2.4510881900787354, + "loss/fcd": 1.12109375, + "loss/logits": 0.2522790729999542, + "step": 223 + }, + { + "epoch": 0.0038694409176102747, + "grad_norm": 0.318359375, + "grad_norm_var": 0.00027872721354166665, + "learning_rate": 0.01, + "loss": 1.3783, + "loss/crossentropy": 2.4119985103607178, + "loss/fcd": 1.13671875, + "loss/logits": 0.22855417430400848, + "step": 224 + }, + { + "epoch": 0.003886715207421035, + "grad_norm": 0.2890625, + "grad_norm_var": 0.0002911726633707682, + "learning_rate": 0.01, + "loss": 1.4037, + "loss/crossentropy": 2.4024510383605957, + "loss/fcd": 1.16015625, + "loss/logits": 0.2690604329109192, + "step": 225 + }, + { + "epoch": 0.003903989497231795, + "grad_norm": 0.2734375, + "grad_norm_var": 0.00034427642822265625, + "learning_rate": 0.01, + "loss": 1.3468, + "loss/crossentropy": 2.550796151161194, + "loss/fcd": 1.078125, + "loss/logits": 0.24284164607524872, + "step": 226 + }, + { + "epoch": 0.003921263787042556, + "grad_norm": 0.427734375, + "grad_norm_var": 0.0013123671213785806, + "learning_rate": 0.01, + "loss": 1.4574, + "loss/crossentropy": 2.9375933408737183, + "loss/fcd": 1.3046875, + "loss/logits": 0.2513057738542557, + "step": 227 + }, + { + "epoch": 0.0039385380768533155, + "grad_norm": 0.318359375, + "grad_norm_var": 0.0013051350911458333, + "learning_rate": 0.01, + "loss": 1.4281, + "loss/crossentropy": 2.524444341659546, + "loss/fcd": 1.1953125, + "loss/logits": 0.27370719611644745, + "step": 228 + }, + { + "epoch": 0.003955812366664075, + "grad_norm": 0.33984375, + "grad_norm_var": 0.001366106669108073, + "learning_rate": 0.01, + "loss": 1.4307, + "loss/crossentropy": 2.486480951309204, + "loss/fcd": 1.11328125, + "loss/logits": 0.26038021594285965, + "step": 229 + }, + { + "epoch": 0.003973086656474836, + "grad_norm": 0.310546875, + "grad_norm_var": 0.0013638655344645181, + "learning_rate": 0.01, + "loss": 1.3844, + "loss/crossentropy": 2.48094379901886, + "loss/fcd": 1.10546875, + "loss/logits": 0.24342957884073257, + "step": 230 + }, + { + "epoch": 0.003990360946285596, + "grad_norm": 0.29296875, + "grad_norm_var": 0.0013834476470947266, + "learning_rate": 0.01, + "loss": 1.3685, + "loss/crossentropy": 2.241925358772278, + "loss/fcd": 1.14453125, + "loss/logits": 0.24210943281650543, + "step": 231 + }, + { + "epoch": 0.004007635236096356, + "grad_norm": 0.306640625, + "grad_norm_var": 0.0013643741607666016, + "learning_rate": 0.01, + "loss": 1.448, + "loss/crossentropy": 2.648869752883911, + "loss/fcd": 1.09375, + "loss/logits": 0.24799171090126038, + "step": 232 + }, + { + "epoch": 0.004024909525907116, + "grad_norm": 0.318359375, + "grad_norm_var": 0.0013676802317301431, + "learning_rate": 0.01, + "loss": 1.4431, + "loss/crossentropy": 2.63001549243927, + "loss/fcd": 1.15625, + "loss/logits": 0.27701297402381897, + "step": 233 + }, + { + "epoch": 0.0040421838157178765, + "grad_norm": 0.287109375, + "grad_norm_var": 0.0013848463694254556, + "learning_rate": 0.01, + "loss": 1.3746, + "loss/crossentropy": 2.2247713804244995, + "loss/fcd": 1.03125, + "loss/logits": 0.24730068445205688, + "step": 234 + }, + { + "epoch": 0.004059458105528636, + "grad_norm": 0.298828125, + "grad_norm_var": 0.0013358910878499349, + "learning_rate": 0.01, + "loss": 1.4197, + "loss/crossentropy": 2.511416435241699, + "loss/fcd": 1.15234375, + "loss/logits": 0.2583580017089844, + "step": 235 + }, + { + "epoch": 0.004076732395339396, + "grad_norm": 0.369140625, + "grad_norm_var": 0.0015294392903645833, + "learning_rate": 0.01, + "loss": 1.4459, + "loss/crossentropy": 2.366840362548828, + "loss/fcd": 1.171875, + "loss/logits": 0.2747315466403961, + "step": 236 + }, + { + "epoch": 0.004094006685150157, + "grad_norm": 0.34375, + "grad_norm_var": 0.0014388402303059896, + "learning_rate": 0.01, + "loss": 1.379, + "loss/crossentropy": 2.645435094833374, + "loss/fcd": 1.1796875, + "loss/logits": 0.2583626061677933, + "step": 237 + }, + { + "epoch": 0.004111280974960917, + "grad_norm": 0.333984375, + "grad_norm_var": 0.0014134089152018229, + "learning_rate": 0.01, + "loss": 1.3999, + "loss/crossentropy": 2.0519449710845947, + "loss/fcd": 1.09375, + "loss/logits": 0.2533458322286606, + "step": 238 + }, + { + "epoch": 0.004128555264771677, + "grad_norm": 0.306640625, + "grad_norm_var": 0.0014001051584879556, + "learning_rate": 0.01, + "loss": 1.3654, + "loss/crossentropy": 2.236992359161377, + "loss/fcd": 1.02734375, + "loss/logits": 0.23388498276472092, + "step": 239 + }, + { + "epoch": 0.0041458295545824374, + "grad_norm": 0.328125, + "grad_norm_var": 0.0014027277628580728, + "learning_rate": 0.01, + "loss": 1.3499, + "loss/crossentropy": 2.308284044265747, + "loss/fcd": 1.05859375, + "loss/logits": 0.23218639194965363, + "step": 240 + }, + { + "epoch": 0.004163103844393197, + "grad_norm": 0.322265625, + "grad_norm_var": 0.0013278802235921225, + "learning_rate": 0.01, + "loss": 1.4553, + "loss/crossentropy": 2.360711455345154, + "loss/fcd": 1.14453125, + "loss/logits": 0.24909411370754242, + "step": 241 + }, + { + "epoch": 0.004180378134203957, + "grad_norm": 0.298828125, + "grad_norm_var": 0.0011983235677083333, + "learning_rate": 0.01, + "loss": 1.4332, + "loss/crossentropy": 2.486197352409363, + "loss/fcd": 1.21875, + "loss/logits": 0.28059011697769165, + "step": 242 + }, + { + "epoch": 0.004197652424014718, + "grad_norm": 0.31640625, + "grad_norm_var": 0.0004508813222249349, + "learning_rate": 0.01, + "loss": 1.3993, + "loss/crossentropy": 2.461425542831421, + "loss/fcd": 1.12109375, + "loss/logits": 0.2716974467039108, + "step": 243 + }, + { + "epoch": 0.004214926713825478, + "grad_norm": 0.322265625, + "grad_norm_var": 0.0004518985748291016, + "learning_rate": 0.01, + "loss": 1.4236, + "loss/crossentropy": 2.344510316848755, + "loss/fcd": 1.203125, + "loss/logits": 0.2624819576740265, + "step": 244 + }, + { + "epoch": 0.004232201003636238, + "grad_norm": 0.27734375, + "grad_norm_var": 0.0005180199940999348, + "learning_rate": 0.01, + "loss": 1.3542, + "loss/crossentropy": 2.6375720500946045, + "loss/fcd": 1.1328125, + "loss/logits": 0.2671656012535095, + "step": 245 + }, + { + "epoch": 0.004249475293446998, + "grad_norm": 0.318359375, + "grad_norm_var": 0.0005176385243733724, + "learning_rate": 0.01, + "loss": 1.3853, + "loss/crossentropy": 2.4105772972106934, + "loss/fcd": 1.16796875, + "loss/logits": 0.2800147980451584, + "step": 246 + }, + { + "epoch": 0.004266749583257758, + "grad_norm": 0.302734375, + "grad_norm_var": 0.0004948298136393229, + "learning_rate": 0.01, + "loss": 1.3755, + "loss/crossentropy": 2.2956700325012207, + "loss/fcd": 1.12890625, + "loss/logits": 0.2564444988965988, + "step": 247 + }, + { + "epoch": 0.004284023873068518, + "grad_norm": 0.50390625, + "grad_norm_var": 0.0026893456776936847, + "learning_rate": 0.01, + "loss": 1.3836, + "loss/crossentropy": 2.3848729133605957, + "loss/fcd": 1.1640625, + "loss/logits": 0.2581590488553047, + "step": 248 + }, + { + "epoch": 0.004301298162879279, + "grad_norm": 0.326171875, + "grad_norm_var": 0.002683115005493164, + "learning_rate": 0.01, + "loss": 1.3791, + "loss/crossentropy": 2.6016765832901, + "loss/fcd": 1.1015625, + "loss/logits": 0.26704905927181244, + "step": 249 + }, + { + "epoch": 0.004318572452690039, + "grad_norm": 0.33984375, + "grad_norm_var": 0.002565956115722656, + "learning_rate": 0.01, + "loss": 1.4703, + "loss/crossentropy": 2.4796223640441895, + "loss/fcd": 1.28125, + "loss/logits": 0.30792760848999023, + "step": 250 + }, + { + "epoch": 0.004335846742500799, + "grad_norm": 0.296875, + "grad_norm_var": 0.002574777603149414, + "learning_rate": 0.01, + "loss": 1.3831, + "loss/crossentropy": 2.45810604095459, + "loss/fcd": 1.1484375, + "loss/logits": 0.26673202961683273, + "step": 251 + }, + { + "epoch": 0.004353121032311559, + "grad_norm": 0.296875, + "grad_norm_var": 0.0025400797526041667, + "learning_rate": 0.01, + "loss": 1.3379, + "loss/crossentropy": 2.37344229221344, + "loss/fcd": 1.0859375, + "loss/logits": 0.2348434329032898, + "step": 252 + }, + { + "epoch": 0.004370395322122319, + "grad_norm": 0.287109375, + "grad_norm_var": 0.002615213394165039, + "learning_rate": 0.01, + "loss": 1.4283, + "loss/crossentropy": 2.310893416404724, + "loss/fcd": 1.08984375, + "loss/logits": 0.22272542119026184, + "step": 253 + }, + { + "epoch": 0.004387669611933079, + "grad_norm": 0.455078125, + "grad_norm_var": 0.003699223200480143, + "learning_rate": 0.01, + "loss": 1.4319, + "loss/crossentropy": 2.287319302558899, + "loss/fcd": 1.1484375, + "loss/logits": 0.23187098652124405, + "step": 254 + }, + { + "epoch": 0.00440494390174384, + "grad_norm": 0.30859375, + "grad_norm_var": 0.003693072001139323, + "learning_rate": 0.01, + "loss": 1.3951, + "loss/crossentropy": 2.751601457595825, + "loss/fcd": 1.15625, + "loss/logits": 0.2715594172477722, + "step": 255 + }, + { + "epoch": 0.0044222181915546, + "grad_norm": 0.318359375, + "grad_norm_var": 0.0037031650543212892, + "learning_rate": 0.01, + "loss": 1.372, + "loss/crossentropy": 2.513296961784363, + "loss/fcd": 1.12109375, + "loss/logits": 0.23859571665525436, + "step": 256 + }, + { + "epoch": 0.00443949248136536, + "grad_norm": 0.306640625, + "grad_norm_var": 0.003735971450805664, + "learning_rate": 0.01, + "loss": 1.3787, + "loss/crossentropy": 2.501555562019348, + "loss/fcd": 1.125, + "loss/logits": 0.2450244277715683, + "step": 257 + }, + { + "epoch": 0.00445676677117612, + "grad_norm": 0.291015625, + "grad_norm_var": 0.00377195676167806, + "learning_rate": 0.01, + "loss": 1.3965, + "loss/crossentropy": 2.503899097442627, + "loss/fcd": 1.17578125, + "loss/logits": 0.28062424063682556, + "step": 258 + }, + { + "epoch": 0.00447404106098688, + "grad_norm": 0.326171875, + "grad_norm_var": 0.0037612279256184896, + "learning_rate": 0.01, + "loss": 1.3864, + "loss/crossentropy": 2.5635122060775757, + "loss/fcd": 1.1484375, + "loss/logits": 0.25401656329631805, + "step": 259 + }, + { + "epoch": 0.00449131535079764, + "grad_norm": 0.3125, + "grad_norm_var": 0.0037770430246988934, + "learning_rate": 0.01, + "loss": 1.3786, + "loss/crossentropy": 2.4950658082962036, + "loss/fcd": 1.12109375, + "loss/logits": 0.2641760855913162, + "step": 260 + }, + { + "epoch": 0.004508589640608401, + "grad_norm": 0.296875, + "grad_norm_var": 0.003665781021118164, + "learning_rate": 0.01, + "loss": 1.3656, + "loss/crossentropy": 2.4370001554489136, + "loss/fcd": 1.1015625, + "loss/logits": 0.249709352850914, + "step": 261 + }, + { + "epoch": 0.004525863930419161, + "grad_norm": 0.2890625, + "grad_norm_var": 0.003766632080078125, + "learning_rate": 0.01, + "loss": 1.3332, + "loss/crossentropy": 2.4650388956069946, + "loss/fcd": 1.14453125, + "loss/logits": 0.2645094692707062, + "step": 262 + }, + { + "epoch": 0.004543138220229921, + "grad_norm": 0.3046875, + "grad_norm_var": 0.0037601312001546224, + "learning_rate": 0.01, + "loss": 1.3832, + "loss/crossentropy": 2.677791714668274, + "loss/fcd": 1.16796875, + "loss/logits": 0.28196755796670914, + "step": 263 + }, + { + "epoch": 0.004560412510040681, + "grad_norm": 0.306640625, + "grad_norm_var": 0.0015848795572916666, + "learning_rate": 0.01, + "loss": 1.4601, + "loss/crossentropy": 2.4847524166107178, + "loss/fcd": 1.109375, + "loss/logits": 0.2580026537179947, + "step": 264 + }, + { + "epoch": 0.004577686799851441, + "grad_norm": 0.33203125, + "grad_norm_var": 0.0015946547190348306, + "learning_rate": 0.01, + "loss": 1.4087, + "loss/crossentropy": 2.4944722652435303, + "loss/fcd": 1.12109375, + "loss/logits": 0.2483246624469757, + "step": 265 + }, + { + "epoch": 0.004594961089662201, + "grad_norm": 0.279296875, + "grad_norm_var": 0.0016375223795572916, + "learning_rate": 0.01, + "loss": 1.3835, + "loss/crossentropy": 2.2753440141677856, + "loss/fcd": 1.046875, + "loss/logits": 0.24172206223011017, + "step": 266 + }, + { + "epoch": 0.004612235379472961, + "grad_norm": 0.3125, + "grad_norm_var": 0.0016192118326822916, + "learning_rate": 0.01, + "loss": 1.3721, + "loss/crossentropy": 2.4424277544021606, + "loss/fcd": 1.1640625, + "loss/logits": 0.2600102424621582, + "step": 267 + }, + { + "epoch": 0.004629509669283722, + "grad_norm": 0.287109375, + "grad_norm_var": 0.0016474246978759766, + "learning_rate": 0.01, + "loss": 1.3636, + "loss/crossentropy": 2.5198450088500977, + "loss/fcd": 1.09375, + "loss/logits": 0.25200945883989334, + "step": 268 + }, + { + "epoch": 0.004646783959094482, + "grad_norm": 0.294921875, + "grad_norm_var": 0.0016239007314046224, + "learning_rate": 0.01, + "loss": 1.3874, + "loss/crossentropy": 2.4488155841827393, + "loss/fcd": 1.1484375, + "loss/logits": 0.2999647855758667, + "step": 269 + }, + { + "epoch": 0.0046640582489052415, + "grad_norm": 0.2890625, + "grad_norm_var": 0.00022017161051432292, + "learning_rate": 0.01, + "loss": 1.3399, + "loss/crossentropy": 2.1886658668518066, + "loss/fcd": 1.03125, + "loss/logits": 0.241354301571846, + "step": 270 + }, + { + "epoch": 0.004681332538716002, + "grad_norm": 0.359375, + "grad_norm_var": 0.00041605631510416665, + "learning_rate": 0.01, + "loss": 1.3419, + "loss/crossentropy": 2.382296085357666, + "loss/fcd": 1.10546875, + "loss/logits": 0.2474452257156372, + "step": 271 + }, + { + "epoch": 0.004698606828526762, + "grad_norm": 0.298828125, + "grad_norm_var": 0.0004093805948893229, + "learning_rate": 0.01, + "loss": 1.325, + "loss/crossentropy": 2.569235324859619, + "loss/fcd": 1.15625, + "loss/logits": 0.24149076640605927, + "step": 272 + }, + { + "epoch": 0.004715881118337522, + "grad_norm": 0.2890625, + "grad_norm_var": 0.0004258314768473307, + "learning_rate": 0.01, + "loss": 1.2981, + "loss/crossentropy": 2.5184491872787476, + "loss/fcd": 1.09375, + "loss/logits": 0.25748542696237564, + "step": 273 + }, + { + "epoch": 0.004733155408148283, + "grad_norm": 0.31640625, + "grad_norm_var": 0.0004210789998372396, + "learning_rate": 0.01, + "loss": 1.3787, + "loss/crossentropy": 2.2780392169952393, + "loss/fcd": 1.140625, + "loss/logits": 0.24548518657684326, + "step": 274 + }, + { + "epoch": 0.004750429697959043, + "grad_norm": 0.3125, + "grad_norm_var": 0.0003958225250244141, + "learning_rate": 0.01, + "loss": 1.4055, + "loss/crossentropy": 2.392509341239929, + "loss/fcd": 1.21875, + "loss/logits": 0.25389473140239716, + "step": 275 + }, + { + "epoch": 0.0047677039877698025, + "grad_norm": 0.55078125, + "grad_norm_var": 0.004181019465128581, + "learning_rate": 0.01, + "loss": 1.3982, + "loss/crossentropy": 2.6148691177368164, + "loss/fcd": 1.1875, + "loss/logits": 0.27452078461647034, + "step": 276 + }, + { + "epoch": 0.004784978277580563, + "grad_norm": 0.3125, + "grad_norm_var": 0.004148213068644205, + "learning_rate": 0.01, + "loss": 1.3782, + "loss/crossentropy": 2.4390900135040283, + "loss/fcd": 1.12890625, + "loss/logits": 0.2340994030237198, + "step": 277 + }, + { + "epoch": 0.004802252567391323, + "grad_norm": 0.28515625, + "grad_norm_var": 0.004165760676066081, + "learning_rate": 0.01, + "loss": 1.3674, + "loss/crossentropy": 2.065169870853424, + "loss/fcd": 1.08203125, + "loss/logits": 0.23831525444984436, + "step": 278 + }, + { + "epoch": 0.004819526857202083, + "grad_norm": 0.33984375, + "grad_norm_var": 0.0041680494944254555, + "learning_rate": 0.01, + "loss": 1.3986, + "loss/crossentropy": 2.23395574092865, + "loss/fcd": 1.07421875, + "loss/logits": 0.25276701152324677, + "step": 279 + }, + { + "epoch": 0.004836801147012844, + "grad_norm": 0.302734375, + "grad_norm_var": 0.004177459081013997, + "learning_rate": 0.01, + "loss": 1.3866, + "loss/crossentropy": 2.5360673666000366, + "loss/fcd": 1.125, + "loss/logits": 0.2552696242928505, + "step": 280 + }, + { + "epoch": 0.0048540754368236036, + "grad_norm": 0.59375, + "grad_norm_var": 0.008786503473917644, + "learning_rate": 0.01, + "loss": 1.3841, + "loss/crossentropy": 2.64610493183136, + "loss/fcd": 1.14453125, + "loss/logits": 0.2660531848669052, + "step": 281 + }, + { + "epoch": 0.0048713497266343634, + "grad_norm": 0.306640625, + "grad_norm_var": 0.008615605036417643, + "learning_rate": 0.01, + "loss": 1.3781, + "loss/crossentropy": 2.206232786178589, + "loss/fcd": 1.04296875, + "loss/logits": 0.22382746636867523, + "step": 282 + }, + { + "epoch": 0.004888624016445124, + "grad_norm": 0.27734375, + "grad_norm_var": 0.008825031916300456, + "learning_rate": 0.01, + "loss": 1.3924, + "loss/crossentropy": 2.491134285926819, + "loss/fcd": 1.1953125, + "loss/logits": 0.28758758306503296, + "step": 283 + }, + { + "epoch": 0.004905898306255884, + "grad_norm": 0.314453125, + "grad_norm_var": 0.008684396743774414, + "learning_rate": 0.01, + "loss": 1.3821, + "loss/crossentropy": 2.418181896209717, + "loss/fcd": 1.08984375, + "loss/logits": 0.24221232533454895, + "step": 284 + }, + { + "epoch": 0.004923172596066644, + "grad_norm": 0.302734375, + "grad_norm_var": 0.00864103635152181, + "learning_rate": 0.01, + "loss": 1.385, + "loss/crossentropy": 2.532857298851013, + "loss/fcd": 1.11328125, + "loss/logits": 0.25721532106399536, + "step": 285 + }, + { + "epoch": 0.004940446885877405, + "grad_norm": 0.28515625, + "grad_norm_var": 0.008668883641560873, + "learning_rate": 0.01, + "loss": 1.3564, + "loss/crossentropy": 2.602588653564453, + "loss/fcd": 1.12890625, + "loss/logits": 0.2445499449968338, + "step": 286 + }, + { + "epoch": 0.0049577211756881645, + "grad_norm": 0.2890625, + "grad_norm_var": 0.008800490697224935, + "learning_rate": 0.01, + "loss": 1.3491, + "loss/crossentropy": 2.5629632472991943, + "loss/fcd": 1.13671875, + "loss/logits": 0.2607369050383568, + "step": 287 + }, + { + "epoch": 0.004974995465498924, + "grad_norm": 0.28515625, + "grad_norm_var": 0.008880043029785156, + "learning_rate": 0.01, + "loss": 1.3404, + "loss/crossentropy": 2.522684097290039, + "loss/fcd": 1.1171875, + "loss/logits": 0.27616211771965027, + "step": 288 + }, + { + "epoch": 0.004992269755309685, + "grad_norm": 0.26171875, + "grad_norm_var": 0.009095001220703124, + "learning_rate": 0.01, + "loss": 1.3618, + "loss/crossentropy": 2.6350889205932617, + "loss/fcd": 1.109375, + "loss/logits": 0.24171672016382217, + "step": 289 + }, + { + "epoch": 0.005009544045120445, + "grad_norm": 0.337890625, + "grad_norm_var": 0.009074894587198894, + "learning_rate": 0.01, + "loss": 1.459, + "loss/crossentropy": 2.98556649684906, + "loss/fcd": 1.21875, + "loss/logits": 0.2643963396549225, + "step": 290 + }, + { + "epoch": 0.005026818334931205, + "grad_norm": 0.251953125, + "grad_norm_var": 0.009484354654947917, + "learning_rate": 0.01, + "loss": 1.3693, + "loss/crossentropy": 2.230570673942566, + "loss/fcd": 1.06640625, + "loss/logits": 0.24412426352500916, + "step": 291 + }, + { + "epoch": 0.005044092624741966, + "grad_norm": 0.31640625, + "grad_norm_var": 0.006051127115885417, + "learning_rate": 0.01, + "loss": 1.3658, + "loss/crossentropy": 2.8022435903549194, + "loss/fcd": 1.1875, + "loss/logits": 0.2787918150424957, + "step": 292 + }, + { + "epoch": 0.0050613669145527255, + "grad_norm": 0.291015625, + "grad_norm_var": 0.006091165542602539, + "learning_rate": 0.01, + "loss": 1.3367, + "loss/crossentropy": 2.4132487773895264, + "loss/fcd": 1.18359375, + "loss/logits": 0.26599422097206116, + "step": 293 + }, + { + "epoch": 0.005078641204363485, + "grad_norm": 0.29296875, + "grad_norm_var": 0.00606382687886556, + "learning_rate": 0.01, + "loss": 1.3944, + "loss/crossentropy": 2.2870916724205017, + "loss/fcd": 1.11328125, + "loss/logits": 0.25007129460573196, + "step": 294 + }, + { + "epoch": 0.005095915494174246, + "grad_norm": 0.310546875, + "grad_norm_var": 0.0060225804646809895, + "learning_rate": 0.01, + "loss": 1.3933, + "loss/crossentropy": 2.60745906829834, + "loss/fcd": 1.13671875, + "loss/logits": 0.2817099541425705, + "step": 295 + }, + { + "epoch": 0.005113189783985006, + "grad_norm": 0.34765625, + "grad_norm_var": 0.006082900365193685, + "learning_rate": 0.01, + "loss": 1.4644, + "loss/crossentropy": 2.1799449920654297, + "loss/fcd": 1.12890625, + "loss/logits": 0.23855505883693695, + "step": 296 + }, + { + "epoch": 0.005130464073795766, + "grad_norm": 0.296875, + "grad_norm_var": 0.0006179650624593099, + "learning_rate": 0.01, + "loss": 1.3902, + "loss/crossentropy": 2.299877882003784, + "loss/fcd": 1.09765625, + "loss/logits": 0.24762696027755737, + "step": 297 + }, + { + "epoch": 0.005147738363606527, + "grad_norm": 0.310546875, + "grad_norm_var": 0.0006234327952067058, + "learning_rate": 0.01, + "loss": 1.3882, + "loss/crossentropy": 2.334827423095703, + "loss/fcd": 1.07421875, + "loss/logits": 0.23748627305030823, + "step": 298 + }, + { + "epoch": 0.0051650126534172865, + "grad_norm": 0.33203125, + "grad_norm_var": 0.0006581465403238932, + "learning_rate": 0.01, + "loss": 1.3226, + "loss/crossentropy": 2.4439618587493896, + "loss/fcd": 1.078125, + "loss/logits": 0.23564526438713074, + "step": 299 + }, + { + "epoch": 0.005182286943228046, + "grad_norm": 0.29296875, + "grad_norm_var": 0.0006502787272135417, + "learning_rate": 0.01, + "loss": 1.4317, + "loss/crossentropy": 2.4066379070281982, + "loss/fcd": 1.16796875, + "loss/logits": 0.28721271455287933, + "step": 300 + }, + { + "epoch": 0.005199561233038807, + "grad_norm": 0.337890625, + "grad_norm_var": 0.0007389704386393229, + "learning_rate": 0.01, + "loss": 1.392, + "loss/crossentropy": 2.6461589336395264, + "loss/fcd": 1.1640625, + "loss/logits": 0.2553107738494873, + "step": 301 + }, + { + "epoch": 0.005216835522849567, + "grad_norm": 0.28515625, + "grad_norm_var": 0.0007389704386393229, + "learning_rate": 0.01, + "loss": 1.3864, + "loss/crossentropy": 2.607328414916992, + "loss/fcd": 1.125, + "loss/logits": 0.26615719497203827, + "step": 302 + }, + { + "epoch": 0.005234109812660327, + "grad_norm": 0.3125, + "grad_norm_var": 0.0007313410441080729, + "learning_rate": 0.01, + "loss": 1.3974, + "loss/crossentropy": 2.5339640378952026, + "loss/fcd": 1.23046875, + "loss/logits": 0.29202982783317566, + "step": 303 + }, + { + "epoch": 0.005251384102471087, + "grad_norm": 0.271484375, + "grad_norm_var": 0.000777292251586914, + "learning_rate": 0.01, + "loss": 1.3597, + "loss/crossentropy": 2.418789029121399, + "loss/fcd": 1.0078125, + "loss/logits": 0.22410588711500168, + "step": 304 + }, + { + "epoch": 0.0052686583922818475, + "grad_norm": 0.26171875, + "grad_norm_var": 0.000777292251586914, + "learning_rate": 0.01, + "loss": 1.3612, + "loss/crossentropy": 2.333797812461853, + "loss/fcd": 1.15234375, + "loss/logits": 0.2548183798789978, + "step": 305 + }, + { + "epoch": 0.005285932682092607, + "grad_norm": 0.330078125, + "grad_norm_var": 0.0007448673248291015, + "learning_rate": 0.01, + "loss": 1.4106, + "loss/crossentropy": 2.444805860519409, + "loss/fcd": 1.1796875, + "loss/logits": 0.2654833495616913, + "step": 306 + }, + { + "epoch": 0.005303206971903367, + "grad_norm": 0.298828125, + "grad_norm_var": 0.0005655765533447265, + "learning_rate": 0.01, + "loss": 1.4068, + "loss/crossentropy": 2.478832244873047, + "loss/fcd": 1.15625, + "loss/logits": 0.27099600434303284, + "step": 307 + }, + { + "epoch": 0.005320481261714128, + "grad_norm": 0.34375, + "grad_norm_var": 0.0006519158681233724, + "learning_rate": 0.01, + "loss": 1.4297, + "loss/crossentropy": 2.276490032672882, + "loss/fcd": 1.2578125, + "loss/logits": 0.2906430959701538, + "step": 308 + }, + { + "epoch": 0.005337755551524888, + "grad_norm": 0.294921875, + "grad_norm_var": 0.0006444136301676433, + "learning_rate": 0.01, + "loss": 1.3362, + "loss/crossentropy": 2.1777199506759644, + "loss/fcd": 1.1171875, + "loss/logits": 0.2572901248931885, + "step": 309 + }, + { + "epoch": 0.005355029841335648, + "grad_norm": 0.349609375, + "grad_norm_var": 0.0007352193196614583, + "learning_rate": 0.01, + "loss": 1.4705, + "loss/crossentropy": 2.4591206312179565, + "loss/fcd": 1.09375, + "loss/logits": 0.2502119764685631, + "step": 310 + }, + { + "epoch": 0.0053723041311464085, + "grad_norm": 0.28515625, + "grad_norm_var": 0.0007771650950113932, + "learning_rate": 0.01, + "loss": 1.4149, + "loss/crossentropy": 2.377845048904419, + "loss/fcd": 1.1015625, + "loss/logits": 0.25507183372974396, + "step": 311 + }, + { + "epoch": 0.005389578420957168, + "grad_norm": 0.2890625, + "grad_norm_var": 0.0006932417551676432, + "learning_rate": 0.01, + "loss": 1.3878, + "loss/crossentropy": 2.6086690425872803, + "loss/fcd": 1.25, + "loss/logits": 0.28851139545440674, + "step": 312 + }, + { + "epoch": 0.005406852710767928, + "grad_norm": 0.306640625, + "grad_norm_var": 0.0006875991821289062, + "learning_rate": 0.01, + "loss": 1.3607, + "loss/crossentropy": 2.089534819126129, + "loss/fcd": 1.12890625, + "loss/logits": 0.22003582119941711, + "step": 313 + }, + { + "epoch": 0.005424127000578689, + "grad_norm": 0.388671875, + "grad_norm_var": 0.0011123021443684895, + "learning_rate": 0.01, + "loss": 1.3856, + "loss/crossentropy": 2.0762287974357605, + "loss/fcd": 1.1875, + "loss/logits": 0.23012210428714752, + "step": 314 + }, + { + "epoch": 0.005441401290389449, + "grad_norm": 0.291015625, + "grad_norm_var": 0.0011039574940999348, + "learning_rate": 0.01, + "loss": 1.3841, + "loss/crossentropy": 2.5591676235198975, + "loss/fcd": 1.05859375, + "loss/logits": 0.2246263027191162, + "step": 315 + }, + { + "epoch": 0.005458675580200209, + "grad_norm": 0.306640625, + "grad_norm_var": 0.0010869344075520833, + "learning_rate": 0.01, + "loss": 1.4073, + "loss/crossentropy": 2.412803888320923, + "loss/fcd": 1.12890625, + "loss/logits": 0.24091031402349472, + "step": 316 + }, + { + "epoch": 0.0054759498700109694, + "grad_norm": 0.333984375, + "grad_norm_var": 0.001073137919108073, + "learning_rate": 0.01, + "loss": 1.368, + "loss/crossentropy": 2.328226327896118, + "loss/fcd": 1.1328125, + "loss/logits": 0.2949056923389435, + "step": 317 + }, + { + "epoch": 0.005493224159821729, + "grad_norm": 0.30078125, + "grad_norm_var": 0.001038042704264323, + "learning_rate": 0.01, + "loss": 1.3639, + "loss/crossentropy": 2.2848289012908936, + "loss/fcd": 1.16796875, + "loss/logits": 0.25566980242729187, + "step": 318 + }, + { + "epoch": 0.005510498449632489, + "grad_norm": 0.27734375, + "grad_norm_var": 0.0011049906412760417, + "learning_rate": 0.01, + "loss": 1.3843, + "loss/crossentropy": 2.3968076705932617, + "loss/fcd": 1.10546875, + "loss/logits": 0.2567252665758133, + "step": 319 + }, + { + "epoch": 0.00552777273944325, + "grad_norm": 0.423828125, + "grad_norm_var": 0.001811663309733073, + "learning_rate": 0.01, + "loss": 1.3891, + "loss/crossentropy": 2.396988272666931, + "loss/fcd": 1.1484375, + "loss/logits": 0.24911059439182281, + "step": 320 + }, + { + "epoch": 0.00554504702925401, + "grad_norm": 0.28125, + "grad_norm_var": 0.001689910888671875, + "learning_rate": 0.01, + "loss": 1.3517, + "loss/crossentropy": 2.4934462308883667, + "loss/fcd": 1.09375, + "loss/logits": 0.2607601135969162, + "step": 321 + }, + { + "epoch": 0.00556232131906477, + "grad_norm": 0.294921875, + "grad_norm_var": 0.0017145156860351562, + "learning_rate": 0.01, + "loss": 1.4164, + "loss/crossentropy": 2.421591639518738, + "loss/fcd": 1.08203125, + "loss/logits": 0.2476629763841629, + "step": 322 + }, + { + "epoch": 0.00557959560887553, + "grad_norm": 0.318359375, + "grad_norm_var": 0.0016919453938802084, + "learning_rate": 0.01, + "loss": 1.4522, + "loss/crossentropy": 2.5826879739761353, + "loss/fcd": 1.12890625, + "loss/logits": 0.24336670339107513, + "step": 323 + }, + { + "epoch": 0.00559686989868629, + "grad_norm": 0.36328125, + "grad_norm_var": 0.0017831802368164062, + "learning_rate": 0.01, + "loss": 1.435, + "loss/crossentropy": 2.6005271673202515, + "loss/fcd": 1.16796875, + "loss/logits": 0.2697305530309677, + "step": 324 + }, + { + "epoch": 0.00561414418849705, + "grad_norm": 0.3203125, + "grad_norm_var": 0.001741647720336914, + "learning_rate": 0.01, + "loss": 1.4172, + "loss/crossentropy": 2.514216661453247, + "loss/fcd": 1.09375, + "loss/logits": 0.2561942785978317, + "step": 325 + }, + { + "epoch": 0.005631418478307811, + "grad_norm": 0.283203125, + "grad_norm_var": 0.0017611026763916016, + "learning_rate": 0.01, + "loss": 1.3803, + "loss/crossentropy": 2.6110743284225464, + "loss/fcd": 1.109375, + "loss/logits": 0.25072459131479263, + "step": 326 + }, + { + "epoch": 0.005648692768118571, + "grad_norm": 0.271484375, + "grad_norm_var": 0.0018299738566080728, + "learning_rate": 0.01, + "loss": 1.3267, + "loss/crossentropy": 2.3151168823242188, + "loss/fcd": 1.06640625, + "loss/logits": 0.22984758019447327, + "step": 327 + }, + { + "epoch": 0.005665967057929331, + "grad_norm": 0.30859375, + "grad_norm_var": 0.001784515380859375, + "learning_rate": 0.01, + "loss": 1.4146, + "loss/crossentropy": 2.610999584197998, + "loss/fcd": 1.16796875, + "loss/logits": 0.27360329031944275, + "step": 328 + }, + { + "epoch": 0.005683241347740091, + "grad_norm": 0.330078125, + "grad_norm_var": 0.00178680419921875, + "learning_rate": 0.01, + "loss": 1.4228, + "loss/crossentropy": 2.3715471029281616, + "loss/fcd": 1.125, + "loss/logits": 0.24973652511835098, + "step": 329 + }, + { + "epoch": 0.005700515637550851, + "grad_norm": 0.359375, + "grad_norm_var": 0.0015657901763916015, + "learning_rate": 0.01, + "loss": 1.3711, + "loss/crossentropy": 2.3313710689544678, + "loss/fcd": 1.0625, + "loss/logits": 0.2390831932425499, + "step": 330 + }, + { + "epoch": 0.005717789927361611, + "grad_norm": 0.3046875, + "grad_norm_var": 0.0015309651692708333, + "learning_rate": 0.01, + "loss": 1.3683, + "loss/crossentropy": 2.405033826828003, + "loss/fcd": 1.140625, + "loss/logits": 0.26245684921741486, + "step": 331 + }, + { + "epoch": 0.005735064217172372, + "grad_norm": 0.3046875, + "grad_norm_var": 0.0015340010325113932, + "learning_rate": 0.01, + "loss": 1.3872, + "loss/crossentropy": 2.6667896509170532, + "loss/fcd": 1.14453125, + "loss/logits": 0.2503022700548172, + "step": 332 + }, + { + "epoch": 0.005752338506983132, + "grad_norm": 0.302734375, + "grad_norm_var": 0.0015253543853759766, + "learning_rate": 0.01, + "loss": 1.3296, + "loss/crossentropy": 2.6033343076705933, + "loss/fcd": 1.1328125, + "loss/logits": 0.24763934314250946, + "step": 333 + }, + { + "epoch": 0.005769612796793892, + "grad_norm": 0.271484375, + "grad_norm_var": 0.0016357421875, + "learning_rate": 0.01, + "loss": 1.3707, + "loss/crossentropy": 2.3747464418411255, + "loss/fcd": 1.08984375, + "loss/logits": 0.24109259992837906, + "step": 334 + }, + { + "epoch": 0.005786887086604652, + "grad_norm": 0.30078125, + "grad_norm_var": 0.001557159423828125, + "learning_rate": 0.01, + "loss": 1.3676, + "loss/crossentropy": 2.064777910709381, + "loss/fcd": 1.1875, + "loss/logits": 0.20032966136932373, + "step": 335 + }, + { + "epoch": 0.005804161376415412, + "grad_norm": 0.31640625, + "grad_norm_var": 0.0007188002268473308, + "learning_rate": 0.01, + "loss": 1.415, + "loss/crossentropy": 2.395054817199707, + "loss/fcd": 1.140625, + "loss/logits": 0.26608574390411377, + "step": 336 + }, + { + "epoch": 0.005821435666226172, + "grad_norm": 0.27734375, + "grad_norm_var": 0.0007338047027587891, + "learning_rate": 0.01, + "loss": 1.3812, + "loss/crossentropy": 2.2238911390304565, + "loss/fcd": 1.0703125, + "loss/logits": 0.2315894290804863, + "step": 337 + }, + { + "epoch": 0.005838709956036932, + "grad_norm": 0.400390625, + "grad_norm_var": 0.0012453556060791015, + "learning_rate": 0.01, + "loss": 1.4817, + "loss/crossentropy": 2.6248074769973755, + "loss/fcd": 1.16015625, + "loss/logits": 0.28028567135334015, + "step": 338 + }, + { + "epoch": 0.005855984245847693, + "grad_norm": 0.287109375, + "grad_norm_var": 0.0012906233469645182, + "learning_rate": 0.01, + "loss": 1.3788, + "loss/crossentropy": 2.125354528427124, + "loss/fcd": 1.140625, + "loss/logits": 0.26438966393470764, + "step": 339 + }, + { + "epoch": 0.005873258535658453, + "grad_norm": 0.326171875, + "grad_norm_var": 0.0011260350545247396, + "learning_rate": 0.01, + "loss": 1.4301, + "loss/crossentropy": 2.301461696624756, + "loss/fcd": 1.1015625, + "loss/logits": 0.254987433552742, + "step": 340 + }, + { + "epoch": 0.0058905328254692125, + "grad_norm": 0.326171875, + "grad_norm_var": 0.001136000951131185, + "learning_rate": 0.01, + "loss": 1.4306, + "loss/crossentropy": 2.369805097579956, + "loss/fcd": 1.1015625, + "loss/logits": 0.25373272597789764, + "step": 341 + }, + { + "epoch": 0.005907807115279973, + "grad_norm": 0.30859375, + "grad_norm_var": 0.0010833104451497396, + "learning_rate": 0.01, + "loss": 1.4098, + "loss/crossentropy": 2.5944920778274536, + "loss/fcd": 1.23046875, + "loss/logits": 0.2799176275730133, + "step": 342 + }, + { + "epoch": 0.005925081405090733, + "grad_norm": 0.29296875, + "grad_norm_var": 0.0009953657786051433, + "learning_rate": 0.01, + "loss": 1.3992, + "loss/crossentropy": 2.13715797662735, + "loss/fcd": 1.04296875, + "loss/logits": 0.24987763166427612, + "step": 343 + }, + { + "epoch": 0.005942355694901493, + "grad_norm": 0.3046875, + "grad_norm_var": 0.0009989261627197266, + "learning_rate": 0.01, + "loss": 1.4174, + "loss/crossentropy": 2.4599469900131226, + "loss/fcd": 1.10546875, + "loss/logits": 0.2511429339647293, + "step": 344 + }, + { + "epoch": 0.005959629984712254, + "grad_norm": 0.333984375, + "grad_norm_var": 0.0010085900624593098, + "learning_rate": 0.01, + "loss": 1.3608, + "loss/crossentropy": 2.431983709335327, + "loss/fcd": 1.125, + "loss/logits": 0.2585323229432106, + "step": 345 + }, + { + "epoch": 0.005976904274523014, + "grad_norm": 0.35546875, + "grad_norm_var": 0.0009857018788655598, + "learning_rate": 0.01, + "loss": 1.3743, + "loss/crossentropy": 2.3239270448684692, + "loss/fcd": 1.1015625, + "loss/logits": 0.2441619336605072, + "step": 346 + }, + { + "epoch": 0.0059941785643337735, + "grad_norm": 0.3046875, + "grad_norm_var": 0.0009857018788655598, + "learning_rate": 0.01, + "loss": 1.3826, + "loss/crossentropy": 2.229923963546753, + "loss/fcd": 1.09765625, + "loss/logits": 0.22727931290864944, + "step": 347 + }, + { + "epoch": 0.006011452854144534, + "grad_norm": 0.275390625, + "grad_norm_var": 0.0010732014973958333, + "learning_rate": 0.01, + "loss": 1.3712, + "loss/crossentropy": 2.6727981567382812, + "loss/fcd": 1.15625, + "loss/logits": 0.28281402587890625, + "step": 348 + }, + { + "epoch": 0.006028727143955294, + "grad_norm": 0.392578125, + "grad_norm_var": 0.0014724095662434896, + "learning_rate": 0.01, + "loss": 1.4247, + "loss/crossentropy": 2.4443578720092773, + "loss/fcd": 1.15625, + "loss/logits": 0.2722969502210617, + "step": 349 + }, + { + "epoch": 0.006046001433766054, + "grad_norm": 0.3046875, + "grad_norm_var": 0.0013391971588134766, + "learning_rate": 0.01, + "loss": 1.3573, + "loss/crossentropy": 2.399729371070862, + "loss/fcd": 1.04296875, + "loss/logits": 0.22808712720870972, + "step": 350 + }, + { + "epoch": 0.006063275723576815, + "grad_norm": 0.32421875, + "grad_norm_var": 0.001315927505493164, + "learning_rate": 0.01, + "loss": 1.3975, + "loss/crossentropy": 2.521644949913025, + "loss/fcd": 1.16796875, + "loss/logits": 0.25423599034547806, + "step": 351 + }, + { + "epoch": 0.006080550013387575, + "grad_norm": 0.294921875, + "grad_norm_var": 0.0013570149739583334, + "learning_rate": 0.01, + "loss": 1.3756, + "loss/crossentropy": 2.263104200363159, + "loss/fcd": 1.09765625, + "loss/logits": 0.26695793122053146, + "step": 352 + }, + { + "epoch": 0.0060978243031983344, + "grad_norm": 0.322265625, + "grad_norm_var": 0.0012316226959228516, + "learning_rate": 0.01, + "loss": 1.3735, + "loss/crossentropy": 2.6748716831207275, + "loss/fcd": 1.16796875, + "loss/logits": 0.27432236075401306, + "step": 353 + }, + { + "epoch": 0.006115098593009095, + "grad_norm": 0.287109375, + "grad_norm_var": 0.0008518060048421223, + "learning_rate": 0.01, + "loss": 1.3334, + "loss/crossentropy": 2.3271913528442383, + "loss/fcd": 1.15625, + "loss/logits": 0.25318336486816406, + "step": 354 + }, + { + "epoch": 0.006132372882819855, + "grad_norm": 0.328125, + "grad_norm_var": 0.0008040746053059896, + "learning_rate": 0.01, + "loss": 1.4881, + "loss/crossentropy": 2.6528772115707397, + "loss/fcd": 1.296875, + "loss/logits": 0.3017214983701706, + "step": 355 + }, + { + "epoch": 0.006149647172630615, + "grad_norm": 0.306640625, + "grad_norm_var": 0.0008056640625, + "learning_rate": 0.01, + "loss": 1.3815, + "loss/crossentropy": 2.4514299631118774, + "loss/fcd": 1.16015625, + "loss/logits": 0.25581270456314087, + "step": 356 + }, + { + "epoch": 0.006166921462441376, + "grad_norm": 0.3203125, + "grad_norm_var": 0.000800180435180664, + "learning_rate": 0.01, + "loss": 1.3652, + "loss/crossentropy": 2.307224750518799, + "loss/fcd": 1.08984375, + "loss/logits": 0.24949809908866882, + "step": 357 + }, + { + "epoch": 0.0061841957522521356, + "grad_norm": 0.291015625, + "grad_norm_var": 0.000836944580078125, + "learning_rate": 0.01, + "loss": 1.3842, + "loss/crossentropy": 2.120967745780945, + "loss/fcd": 1.1328125, + "loss/logits": 0.2532486915588379, + "step": 358 + }, + { + "epoch": 0.006201470042062895, + "grad_norm": 0.291015625, + "grad_norm_var": 0.0008429050445556641, + "learning_rate": 0.01, + "loss": 1.4114, + "loss/crossentropy": 2.4582676887512207, + "loss/fcd": 1.23828125, + "loss/logits": 0.278301477432251, + "step": 359 + }, + { + "epoch": 0.006218744331873656, + "grad_norm": 0.275390625, + "grad_norm_var": 0.000936126708984375, + "learning_rate": 0.01, + "loss": 1.3794, + "loss/crossentropy": 2.636004090309143, + "loss/fcd": 1.21484375, + "loss/logits": 0.2849871665239334, + "step": 360 + }, + { + "epoch": 0.006236018621684416, + "grad_norm": 0.2890625, + "grad_norm_var": 0.0009364922841389974, + "learning_rate": 0.01, + "loss": 1.4538, + "loss/crossentropy": 2.55968701839447, + "loss/fcd": 1.2109375, + "loss/logits": 0.29454614222049713, + "step": 361 + }, + { + "epoch": 0.006253292911495176, + "grad_norm": 0.3046875, + "grad_norm_var": 0.0007910251617431641, + "learning_rate": 0.01, + "loss": 1.3948, + "loss/crossentropy": 2.3076229095458984, + "loss/fcd": 1.015625, + "loss/logits": 0.23156127333641052, + "step": 362 + }, + { + "epoch": 0.006270567201305937, + "grad_norm": 0.294921875, + "grad_norm_var": 0.0008000055948893229, + "learning_rate": 0.01, + "loss": 1.3462, + "loss/crossentropy": 2.3910467624664307, + "loss/fcd": 1.140625, + "loss/logits": 0.24528269469738007, + "step": 363 + }, + { + "epoch": 0.0062878414911166965, + "grad_norm": 0.291015625, + "grad_norm_var": 0.0007506688435872396, + "learning_rate": 0.01, + "loss": 1.4077, + "loss/crossentropy": 2.3372639417648315, + "loss/fcd": 1.21875, + "loss/logits": 0.2640947550535202, + "step": 364 + }, + { + "epoch": 0.006305115780927456, + "grad_norm": 0.314453125, + "grad_norm_var": 0.0002445856730143229, + "learning_rate": 0.01, + "loss": 1.3759, + "loss/crossentropy": 2.454505205154419, + "loss/fcd": 1.05078125, + "loss/logits": 0.2401072233915329, + "step": 365 + }, + { + "epoch": 0.006322390070738217, + "grad_norm": 0.314453125, + "grad_norm_var": 0.0002534071604410807, + "learning_rate": 0.01, + "loss": 1.3749, + "loss/crossentropy": 2.3645259141921997, + "loss/fcd": 1.125, + "loss/logits": 0.23153205960988998, + "step": 366 + }, + { + "epoch": 0.006339664360548977, + "grad_norm": 0.27734375, + "grad_norm_var": 0.0002587477366129557, + "learning_rate": 0.01, + "loss": 1.3546, + "loss/crossentropy": 2.494025230407715, + "loss/fcd": 1.14453125, + "loss/logits": 0.26557300239801407, + "step": 367 + }, + { + "epoch": 0.006356938650359737, + "grad_norm": 0.28125, + "grad_norm_var": 0.000279998779296875, + "learning_rate": 0.01, + "loss": 1.3496, + "loss/crossentropy": 2.3776293992996216, + "loss/fcd": 1.12109375, + "loss/logits": 0.2568487524986267, + "step": 368 + }, + { + "epoch": 0.006374212940170498, + "grad_norm": 0.30078125, + "grad_norm_var": 0.00024310747782389322, + "learning_rate": 0.01, + "loss": 1.3734, + "loss/crossentropy": 2.591793417930603, + "loss/fcd": 1.15234375, + "loss/logits": 0.27023325860500336, + "step": 369 + }, + { + "epoch": 0.0063914872299812575, + "grad_norm": 0.302734375, + "grad_norm_var": 0.0002357323964436849, + "learning_rate": 0.01, + "loss": 1.3944, + "loss/crossentropy": 2.415038585662842, + "loss/fcd": 1.15625, + "loss/logits": 0.3026815205812454, + "step": 370 + }, + { + "epoch": 0.006408761519792017, + "grad_norm": 0.3359375, + "grad_norm_var": 0.0002699375152587891, + "learning_rate": 0.01, + "loss": 1.3485, + "loss/crossentropy": 2.4911344051361084, + "loss/fcd": 1.08984375, + "loss/logits": 0.2093563750386238, + "step": 371 + }, + { + "epoch": 0.006426035809602778, + "grad_norm": 0.28515625, + "grad_norm_var": 0.00027815500895182293, + "learning_rate": 0.01, + "loss": 1.34, + "loss/crossentropy": 2.470622181892395, + "loss/fcd": 1.17578125, + "loss/logits": 0.2913671284914017, + "step": 372 + }, + { + "epoch": 0.006443310099413538, + "grad_norm": 0.3203125, + "grad_norm_var": 0.00027815500895182293, + "learning_rate": 0.01, + "loss": 1.4077, + "loss/crossentropy": 2.6227082014083862, + "loss/fcd": 1.09375, + "loss/logits": 0.24900969862937927, + "step": 373 + }, + { + "epoch": 0.006460584389224298, + "grad_norm": 0.32421875, + "grad_norm_var": 0.0003157138824462891, + "learning_rate": 0.01, + "loss": 1.4209, + "loss/crossentropy": 3.0212732553482056, + "loss/fcd": 1.22265625, + "loss/logits": 0.270741730928421, + "step": 374 + }, + { + "epoch": 0.006477858679035058, + "grad_norm": 0.326171875, + "grad_norm_var": 0.0003500461578369141, + "learning_rate": 0.01, + "loss": 1.465, + "loss/crossentropy": 2.8352737426757812, + "loss/fcd": 1.24609375, + "loss/logits": 0.31054478883743286, + "step": 375 + }, + { + "epoch": 0.0064951329688458185, + "grad_norm": 0.314453125, + "grad_norm_var": 0.0003049055735270182, + "learning_rate": 0.01, + "loss": 1.3951, + "loss/crossentropy": 2.450179100036621, + "loss/fcd": 1.140625, + "loss/logits": 0.24616704881191254, + "step": 376 + }, + { + "epoch": 0.006512407258656578, + "grad_norm": 0.283203125, + "grad_norm_var": 0.0003193537394205729, + "learning_rate": 0.01, + "loss": 1.3687, + "loss/crossentropy": 2.2392066717147827, + "loss/fcd": 1.025390625, + "loss/logits": 0.24169814586639404, + "step": 377 + }, + { + "epoch": 0.006529681548467338, + "grad_norm": 0.302734375, + "grad_norm_var": 0.0003195285797119141, + "learning_rate": 0.01, + "loss": 1.4315, + "loss/crossentropy": 2.6067546606063843, + "loss/fcd": 1.18359375, + "loss/logits": 0.31542879343032837, + "step": 378 + }, + { + "epoch": 0.006546955838278099, + "grad_norm": 0.298828125, + "grad_norm_var": 0.00031558672587076825, + "learning_rate": 0.01, + "loss": 1.3917, + "loss/crossentropy": 2.360989570617676, + "loss/fcd": 1.13671875, + "loss/logits": 0.25205816328525543, + "step": 379 + }, + { + "epoch": 0.006564230128088859, + "grad_norm": 0.322265625, + "grad_norm_var": 0.0003201643625895182, + "learning_rate": 0.01, + "loss": 1.4293, + "loss/crossentropy": 2.71570360660553, + "loss/fcd": 1.171875, + "loss/logits": 0.2731679454445839, + "step": 380 + }, + { + "epoch": 0.006581504417899619, + "grad_norm": 0.2890625, + "grad_norm_var": 0.00033359527587890626, + "learning_rate": 0.01, + "loss": 1.351, + "loss/crossentropy": 2.624392867088318, + "loss/fcd": 1.12109375, + "loss/logits": 0.2293551042675972, + "step": 381 + }, + { + "epoch": 0.0065987787077103795, + "grad_norm": 0.310546875, + "grad_norm_var": 0.00032958984375, + "learning_rate": 0.01, + "loss": 1.3969, + "loss/crossentropy": 2.1760467290878296, + "loss/fcd": 1.1171875, + "loss/logits": 0.23204915970563889, + "step": 382 + }, + { + "epoch": 0.006616052997521139, + "grad_norm": 0.2890625, + "grad_norm_var": 0.00029544830322265626, + "learning_rate": 0.01, + "loss": 1.3163, + "loss/crossentropy": 2.089251697063446, + "loss/fcd": 1.041015625, + "loss/logits": 0.21481642127037048, + "step": 383 + }, + { + "epoch": 0.006633327287331899, + "grad_norm": 0.3046875, + "grad_norm_var": 0.00025424957275390627, + "learning_rate": 0.01, + "loss": 1.413, + "loss/crossentropy": 2.1335262060165405, + "loss/fcd": 1.24609375, + "loss/logits": 0.29476068913936615, + "step": 384 + }, + { + "epoch": 0.00665060157714266, + "grad_norm": 0.287109375, + "grad_norm_var": 0.0002770582834879557, + "learning_rate": 0.01, + "loss": 1.3581, + "loss/crossentropy": 2.3327542543411255, + "loss/fcd": 1.0859375, + "loss/logits": 0.2519141435623169, + "step": 385 + }, + { + "epoch": 0.00666787586695342, + "grad_norm": 0.298828125, + "grad_norm_var": 0.0002797285715738932, + "learning_rate": 0.01, + "loss": 1.3991, + "loss/crossentropy": 2.521241784095764, + "loss/fcd": 1.1640625, + "loss/logits": 0.2740743160247803, + "step": 386 + }, + { + "epoch": 0.00668515015676418, + "grad_norm": 0.341796875, + "grad_norm_var": 0.00030543009440104165, + "learning_rate": 0.01, + "loss": 1.4046, + "loss/crossentropy": 2.5978543758392334, + "loss/fcd": 1.18359375, + "loss/logits": 0.24079592525959015, + "step": 387 + }, + { + "epoch": 0.0067024244465749405, + "grad_norm": 0.326171875, + "grad_norm_var": 0.00029575030008951824, + "learning_rate": 0.01, + "loss": 1.4169, + "loss/crossentropy": 2.246425747871399, + "loss/fcd": 1.19140625, + "loss/logits": 0.2572794705629349, + "step": 388 + }, + { + "epoch": 0.0067196987363857, + "grad_norm": 0.291015625, + "grad_norm_var": 0.0003040949503580729, + "learning_rate": 0.01, + "loss": 1.382, + "loss/crossentropy": 2.293286442756653, + "loss/fcd": 1.09375, + "loss/logits": 0.23658673465251923, + "step": 389 + }, + { + "epoch": 0.00673697302619646, + "grad_norm": 0.33203125, + "grad_norm_var": 0.00032596588134765626, + "learning_rate": 0.01, + "loss": 1.4607, + "loss/crossentropy": 2.441470980644226, + "loss/fcd": 1.31640625, + "loss/logits": 0.28673678636550903, + "step": 390 + }, + { + "epoch": 0.006754247316007221, + "grad_norm": 0.296875, + "grad_norm_var": 0.0003061771392822266, + "learning_rate": 0.01, + "loss": 1.3719, + "loss/crossentropy": 2.5365694761276245, + "loss/fcd": 1.16015625, + "loss/logits": 0.2776503562927246, + "step": 391 + }, + { + "epoch": 0.006771521605817981, + "grad_norm": 0.33984375, + "grad_norm_var": 0.00037663777669270834, + "learning_rate": 0.01, + "loss": 1.4373, + "loss/crossentropy": 2.517317056655884, + "loss/fcd": 1.15234375, + "loss/logits": 0.27259568870067596, + "step": 392 + }, + { + "epoch": 0.006788795895628741, + "grad_norm": 0.30078125, + "grad_norm_var": 0.0003398736317952474, + "learning_rate": 0.01, + "loss": 1.382, + "loss/crossentropy": 2.38772451877594, + "loss/fcd": 1.0859375, + "loss/logits": 0.24375227838754654, + "step": 393 + }, + { + "epoch": 0.006806070185439501, + "grad_norm": 0.28515625, + "grad_norm_var": 0.0003720601399739583, + "learning_rate": 0.01, + "loss": 1.3734, + "loss/crossentropy": 2.2084882259368896, + "loss/fcd": 1.076171875, + "loss/logits": 0.22468051314353943, + "step": 394 + }, + { + "epoch": 0.006823344475250261, + "grad_norm": 0.333984375, + "grad_norm_var": 0.00041039784749348957, + "learning_rate": 0.01, + "loss": 1.4417, + "loss/crossentropy": 2.4394543170928955, + "loss/fcd": 1.15234375, + "loss/logits": 0.25751829147338867, + "step": 395 + }, + { + "epoch": 0.006840618765061021, + "grad_norm": 0.28125, + "grad_norm_var": 0.0004447778065999349, + "learning_rate": 0.01, + "loss": 1.3414, + "loss/crossentropy": 2.365694999694824, + "loss/fcd": 1.07421875, + "loss/logits": 0.24539195746183395, + "step": 396 + }, + { + "epoch": 0.006857893054871782, + "grad_norm": 0.310546875, + "grad_norm_var": 0.00042292277018229164, + "learning_rate": 0.01, + "loss": 1.396, + "loss/crossentropy": 2.4616193771362305, + "loss/fcd": 1.203125, + "loss/logits": 0.2692428231239319, + "step": 397 + }, + { + "epoch": 0.006875167344682542, + "grad_norm": 0.330078125, + "grad_norm_var": 0.0004531224568684896, + "learning_rate": 0.01, + "loss": 1.403, + "loss/crossentropy": 2.2189152240753174, + "loss/fcd": 1.0859375, + "loss/logits": 0.24257495999336243, + "step": 398 + }, + { + "epoch": 0.006892441634493302, + "grad_norm": 0.306640625, + "grad_norm_var": 0.0004249413808186849, + "learning_rate": 0.01, + "loss": 1.3559, + "loss/crossentropy": 2.37640380859375, + "loss/fcd": 1.09375, + "loss/logits": 0.2584332674741745, + "step": 399 + }, + { + "epoch": 0.006909715924304062, + "grad_norm": 0.34765625, + "grad_norm_var": 0.0005074659983317057, + "learning_rate": 0.01, + "loss": 1.4774, + "loss/crossentropy": 2.384715437889099, + "loss/fcd": 1.1171875, + "loss/logits": 0.2619960308074951, + "step": 400 + }, + { + "epoch": 0.006926990214114822, + "grad_norm": 0.310546875, + "grad_norm_var": 0.00046054522196451825, + "learning_rate": 0.01, + "loss": 1.4446, + "loss/crossentropy": 2.1976479291915894, + "loss/fcd": 1.09375, + "loss/logits": 0.25502003729343414, + "step": 401 + }, + { + "epoch": 0.006944264503925582, + "grad_norm": 0.302734375, + "grad_norm_var": 0.0004532972971598307, + "learning_rate": 0.01, + "loss": 1.3809, + "loss/crossentropy": 2.278647780418396, + "loss/fcd": 1.1015625, + "loss/logits": 0.2284827083349228, + "step": 402 + }, + { + "epoch": 0.006961538793736343, + "grad_norm": 0.2734375, + "grad_norm_var": 0.0004994710286458333, + "learning_rate": 0.01, + "loss": 1.3505, + "loss/crossentropy": 2.4870028495788574, + "loss/fcd": 1.08984375, + "loss/logits": 0.2371172457933426, + "step": 403 + }, + { + "epoch": 0.006978813083547103, + "grad_norm": 0.314453125, + "grad_norm_var": 0.0004836400349934896, + "learning_rate": 0.01, + "loss": 1.4059, + "loss/crossentropy": 2.65886914730072, + "loss/fcd": 1.171875, + "loss/logits": 0.2828421890735626, + "step": 404 + }, + { + "epoch": 0.006996087373357863, + "grad_norm": 0.27734375, + "grad_norm_var": 0.0005295912424723308, + "learning_rate": 0.01, + "loss": 1.3245, + "loss/crossentropy": 2.1928412914276123, + "loss/fcd": 1.0234375, + "loss/logits": 0.22634898871183395, + "step": 405 + }, + { + "epoch": 0.007013361663168623, + "grad_norm": 0.3046875, + "grad_norm_var": 0.0004922072092692057, + "learning_rate": 0.01, + "loss": 1.4224, + "loss/crossentropy": 2.6360604763031006, + "loss/fcd": 1.24609375, + "loss/logits": 0.2727653980255127, + "step": 406 + }, + { + "epoch": 0.007030635952979383, + "grad_norm": 0.3125, + "grad_norm_var": 0.00048584938049316405, + "learning_rate": 0.01, + "loss": 1.3588, + "loss/crossentropy": 2.3004168272018433, + "loss/fcd": 1.078125, + "loss/logits": 0.239614799618721, + "step": 407 + }, + { + "epoch": 0.007047910242790143, + "grad_norm": 0.32421875, + "grad_norm_var": 0.0004352410634358724, + "learning_rate": 0.01, + "loss": 1.4105, + "loss/crossentropy": 2.3150475025177, + "loss/fcd": 1.09765625, + "loss/logits": 0.2282358631491661, + "step": 408 + }, + { + "epoch": 0.007065184532600904, + "grad_norm": 0.275390625, + "grad_norm_var": 0.0004974365234375, + "learning_rate": 0.01, + "loss": 1.3576, + "loss/crossentropy": 2.645399570465088, + "loss/fcd": 1.17578125, + "loss/logits": 0.2676645368337631, + "step": 409 + }, + { + "epoch": 0.007082458822411664, + "grad_norm": 0.3203125, + "grad_norm_var": 0.00047855377197265626, + "learning_rate": 0.01, + "loss": 1.4103, + "loss/crossentropy": 2.1640161275863647, + "loss/fcd": 1.12890625, + "loss/logits": 0.2479998767375946, + "step": 410 + }, + { + "epoch": 0.007099733112222424, + "grad_norm": 0.3046875, + "grad_norm_var": 0.0004301548004150391, + "learning_rate": 0.01, + "loss": 1.4286, + "loss/crossentropy": 2.5662118196487427, + "loss/fcd": 1.18359375, + "loss/logits": 0.2710702270269394, + "step": 411 + }, + { + "epoch": 0.0071170074020331835, + "grad_norm": 0.298828125, + "grad_norm_var": 0.000391387939453125, + "learning_rate": 0.01, + "loss": 1.369, + "loss/crossentropy": 2.1513331532478333, + "loss/fcd": 1.0625, + "loss/logits": 0.22271250188350677, + "step": 412 + }, + { + "epoch": 0.007134281691843944, + "grad_norm": 0.283203125, + "grad_norm_var": 0.00042565663655598957, + "learning_rate": 0.01, + "loss": 1.3294, + "loss/crossentropy": 2.2309274673461914, + "loss/fcd": 1.0546875, + "loss/logits": 0.24107103794813156, + "step": 413 + }, + { + "epoch": 0.007151555981654704, + "grad_norm": 0.287109375, + "grad_norm_var": 0.0003997802734375, + "learning_rate": 0.01, + "loss": 1.3924, + "loss/crossentropy": 2.6093149185180664, + "loss/fcd": 1.08984375, + "loss/logits": 0.24238202720880508, + "step": 414 + }, + { + "epoch": 0.007168830271465464, + "grad_norm": 0.3828125, + "grad_norm_var": 0.0008020877838134765, + "learning_rate": 0.01, + "loss": 1.4011, + "loss/crossentropy": 2.6286522150039673, + "loss/fcd": 1.1328125, + "loss/logits": 0.258474200963974, + "step": 415 + }, + { + "epoch": 0.007186104561276225, + "grad_norm": 0.279296875, + "grad_norm_var": 0.0007280985514322917, + "learning_rate": 0.01, + "loss": 1.3625, + "loss/crossentropy": 2.686766266822815, + "loss/fcd": 1.0859375, + "loss/logits": 0.24827048182487488, + "step": 416 + }, + { + "epoch": 0.007203378851086985, + "grad_norm": 0.287109375, + "grad_norm_var": 0.0007395426432291667, + "learning_rate": 0.01, + "loss": 1.3839, + "loss/crossentropy": 2.319527268409729, + "loss/fcd": 1.20703125, + "loss/logits": 0.2674332559108734, + "step": 417 + }, + { + "epoch": 0.0072206531408977445, + "grad_norm": 0.28515625, + "grad_norm_var": 0.0007565657297770183, + "learning_rate": 0.01, + "loss": 1.3619, + "loss/crossentropy": 2.3169610500335693, + "loss/fcd": 1.0859375, + "loss/logits": 0.23959346115589142, + "step": 418 + }, + { + "epoch": 0.007237927430708505, + "grad_norm": 0.328125, + "grad_norm_var": 0.0007449944814046223, + "learning_rate": 0.01, + "loss": 1.4737, + "loss/crossentropy": 2.5569876432418823, + "loss/fcd": 1.109375, + "loss/logits": 0.2552832216024399, + "step": 419 + }, + { + "epoch": 0.007255201720519265, + "grad_norm": 0.3046875, + "grad_norm_var": 0.0007374445597330729, + "learning_rate": 0.01, + "loss": 1.4197, + "loss/crossentropy": 2.0687599182128906, + "loss/fcd": 1.1640625, + "loss/logits": 0.2598320543766022, + "step": 420 + }, + { + "epoch": 0.007272476010330025, + "grad_norm": 0.294921875, + "grad_norm_var": 0.0006955305735270183, + "learning_rate": 0.01, + "loss": 1.4605, + "loss/crossentropy": 2.419862389564514, + "loss/fcd": 1.171875, + "loss/logits": 0.2556862235069275, + "step": 421 + }, + { + "epoch": 0.007289750300140786, + "grad_norm": 0.30078125, + "grad_norm_var": 0.0006964206695556641, + "learning_rate": 0.01, + "loss": 1.4071, + "loss/crossentropy": 2.5204795598983765, + "loss/fcd": 1.17578125, + "loss/logits": 0.2741318494081497, + "step": 422 + }, + { + "epoch": 0.007307024589951546, + "grad_norm": 0.310546875, + "grad_norm_var": 0.0006945292154947917, + "learning_rate": 0.01, + "loss": 1.4196, + "loss/crossentropy": 2.489278793334961, + "loss/fcd": 1.12109375, + "loss/logits": 0.25576694309711456, + "step": 423 + }, + { + "epoch": 0.0073242988797623055, + "grad_norm": 0.27734375, + "grad_norm_var": 0.0007067362467447917, + "learning_rate": 0.01, + "loss": 1.3185, + "loss/crossentropy": 2.3392102122306824, + "loss/fcd": 1.03125, + "loss/logits": 0.21298449486494064, + "step": 424 + }, + { + "epoch": 0.007341573169573066, + "grad_norm": 0.28125, + "grad_norm_var": 0.0006886641184488932, + "learning_rate": 0.01, + "loss": 1.3571, + "loss/crossentropy": 2.2977930307388306, + "loss/fcd": 1.0859375, + "loss/logits": 0.23583728075027466, + "step": 425 + }, + { + "epoch": 0.007358847459383826, + "grad_norm": 0.3203125, + "grad_norm_var": 0.0006886641184488932, + "learning_rate": 0.01, + "loss": 1.4277, + "loss/crossentropy": 2.6484419107437134, + "loss/fcd": 1.1953125, + "loss/logits": 0.27860742807388306, + "step": 426 + }, + { + "epoch": 0.007376121749194586, + "grad_norm": 0.275390625, + "grad_norm_var": 0.0007303873697916667, + "learning_rate": 0.01, + "loss": 1.3439, + "loss/crossentropy": 2.460866689682007, + "loss/fcd": 1.0703125, + "loss/logits": 0.23756644129753113, + "step": 427 + }, + { + "epoch": 0.007393396039005347, + "grad_norm": 0.27734375, + "grad_norm_var": 0.000762033462524414, + "learning_rate": 0.01, + "loss": 1.343, + "loss/crossentropy": 2.0784988403320312, + "loss/fcd": 1.044921875, + "loss/logits": 0.21802522987127304, + "step": 428 + }, + { + "epoch": 0.0074106703288161066, + "grad_norm": 0.3046875, + "grad_norm_var": 0.0007471720377604167, + "learning_rate": 0.01, + "loss": 1.3824, + "loss/crossentropy": 2.312214493751526, + "loss/fcd": 1.08203125, + "loss/logits": 0.2373846471309662, + "step": 429 + }, + { + "epoch": 0.0074279446186268664, + "grad_norm": 0.2890625, + "grad_norm_var": 0.0007441043853759766, + "learning_rate": 0.01, + "loss": 1.4031, + "loss/crossentropy": 2.43253231048584, + "loss/fcd": 1.0390625, + "loss/logits": 0.24533094465732574, + "step": 430 + }, + { + "epoch": 0.007445218908437627, + "grad_norm": 0.296875, + "grad_norm_var": 0.0002559502919514974, + "learning_rate": 0.01, + "loss": 1.3775, + "loss/crossentropy": 2.7691128253936768, + "loss/fcd": 1.13671875, + "loss/logits": 0.22900952398777008, + "step": 431 + }, + { + "epoch": 0.007462493198248387, + "grad_norm": 0.283203125, + "grad_norm_var": 0.0002489566802978516, + "learning_rate": 0.01, + "loss": 1.3613, + "loss/crossentropy": 2.231864333152771, + "loss/fcd": 1.05859375, + "loss/logits": 0.24191942811012268, + "step": 432 + }, + { + "epoch": 0.007479767488059147, + "grad_norm": 0.27734375, + "grad_norm_var": 0.00026493072509765626, + "learning_rate": 0.01, + "loss": 1.4166, + "loss/crossentropy": 2.343968152999878, + "loss/fcd": 1.0859375, + "loss/logits": 0.2661665081977844, + "step": 433 + }, + { + "epoch": 0.007497041777869908, + "grad_norm": 0.3203125, + "grad_norm_var": 0.000299835205078125, + "learning_rate": 0.01, + "loss": 1.3807, + "loss/crossentropy": 2.6194422245025635, + "loss/fcd": 1.11328125, + "loss/logits": 0.2392604500055313, + "step": 434 + }, + { + "epoch": 0.0075143160676806675, + "grad_norm": 0.283203125, + "grad_norm_var": 0.00023585955301920573, + "learning_rate": 0.01, + "loss": 1.2902, + "loss/crossentropy": 2.46696138381958, + "loss/fcd": 1.08203125, + "loss/logits": 0.26606328785419464, + "step": 435 + }, + { + "epoch": 0.007531590357491427, + "grad_norm": 0.291015625, + "grad_norm_var": 0.0002272923787434896, + "learning_rate": 0.01, + "loss": 1.3931, + "loss/crossentropy": 2.4375393390655518, + "loss/fcd": 1.14453125, + "loss/logits": 0.27766771614551544, + "step": 436 + }, + { + "epoch": 0.007548864647302188, + "grad_norm": 0.33984375, + "grad_norm_var": 0.0003665765126546224, + "learning_rate": 0.01, + "loss": 1.3732, + "loss/crossentropy": 2.3699560165405273, + "loss/fcd": 1.16796875, + "loss/logits": 0.2573126032948494, + "step": 437 + }, + { + "epoch": 0.007566138937112948, + "grad_norm": 0.306640625, + "grad_norm_var": 0.00037282307942708334, + "learning_rate": 0.01, + "loss": 1.4006, + "loss/crossentropy": 2.227339029312134, + "loss/fcd": 1.14453125, + "loss/logits": 0.2607281506061554, + "step": 438 + }, + { + "epoch": 0.007583413226923708, + "grad_norm": 0.2734375, + "grad_norm_var": 0.0003864129384358724, + "learning_rate": 0.01, + "loss": 1.3632, + "loss/crossentropy": 2.46047842502594, + "loss/fcd": 1.09765625, + "loss/logits": 0.24269723892211914, + "step": 439 + }, + { + "epoch": 0.007600687516734469, + "grad_norm": 0.294921875, + "grad_norm_var": 0.0003676732381184896, + "learning_rate": 0.01, + "loss": 1.3795, + "loss/crossentropy": 2.4994819164276123, + "loss/fcd": 1.14453125, + "loss/logits": 0.25722265988588333, + "step": 440 + }, + { + "epoch": 0.0076179618065452285, + "grad_norm": 0.265625, + "grad_norm_var": 0.0004109064737955729, + "learning_rate": 0.01, + "loss": 1.3064, + "loss/crossentropy": 2.5115902423858643, + "loss/fcd": 1.078125, + "loss/logits": 0.2371089681982994, + "step": 441 + }, + { + "epoch": 0.007635236096355988, + "grad_norm": 0.287109375, + "grad_norm_var": 0.0003619988759358724, + "learning_rate": 0.01, + "loss": 1.4358, + "loss/crossentropy": 2.380179762840271, + "loss/fcd": 1.1015625, + "loss/logits": 0.2640485018491745, + "step": 442 + }, + { + "epoch": 0.007652510386166749, + "grad_norm": 0.3359375, + "grad_norm_var": 0.000460052490234375, + "learning_rate": 0.01, + "loss": 1.4287, + "loss/crossentropy": 2.6699330806732178, + "loss/fcd": 1.23828125, + "loss/logits": 0.2810060381889343, + "step": 443 + }, + { + "epoch": 0.007669784675977509, + "grad_norm": 0.328125, + "grad_norm_var": 0.0004988988240559896, + "learning_rate": 0.01, + "loss": 1.4246, + "loss/crossentropy": 2.5262972116470337, + "loss/fcd": 1.13671875, + "loss/logits": 0.25480419397354126, + "step": 444 + }, + { + "epoch": 0.007687058965788269, + "grad_norm": 0.34765625, + "grad_norm_var": 0.000649261474609375, + "learning_rate": 0.01, + "loss": 1.3859, + "loss/crossentropy": 2.3320013284683228, + "loss/fcd": 1.05078125, + "loss/logits": 0.2234661728143692, + "step": 445 + }, + { + "epoch": 0.007704333255599029, + "grad_norm": 0.29296875, + "grad_norm_var": 0.0006438573201497396, + "learning_rate": 0.01, + "loss": 1.3848, + "loss/crossentropy": 2.448530673980713, + "loss/fcd": 1.234375, + "loss/logits": 0.2647833973169327, + "step": 446 + }, + { + "epoch": 0.0077216075454097895, + "grad_norm": 0.3046875, + "grad_norm_var": 0.000642840067545573, + "learning_rate": 0.01, + "loss": 1.4458, + "loss/crossentropy": 2.279269576072693, + "loss/fcd": 1.1640625, + "loss/logits": 0.2693684697151184, + "step": 447 + }, + { + "epoch": 0.007738881835220549, + "grad_norm": 0.296875, + "grad_norm_var": 0.0006202538808186849, + "learning_rate": 0.01, + "loss": 1.3777, + "loss/crossentropy": 2.6742255687713623, + "loss/fcd": 1.1796875, + "loss/logits": 0.2811601459980011, + "step": 448 + }, + { + "epoch": 0.007756156125031309, + "grad_norm": 0.326171875, + "grad_norm_var": 0.0006031672159830729, + "learning_rate": 0.01, + "loss": 1.4104, + "loss/crossentropy": 2.4074745178222656, + "loss/fcd": 1.078125, + "loss/logits": 0.24794109165668488, + "step": 449 + }, + { + "epoch": 0.00777343041484207, + "grad_norm": 0.302734375, + "grad_norm_var": 0.0005887190500895183, + "learning_rate": 0.01, + "loss": 1.3185, + "loss/crossentropy": 2.35663104057312, + "loss/fcd": 1.12109375, + "loss/logits": 0.22819262742996216, + "step": 450 + }, + { + "epoch": 0.00779070470465283, + "grad_norm": 0.30859375, + "grad_norm_var": 0.0005558649698893229, + "learning_rate": 0.01, + "loss": 1.3834, + "loss/crossentropy": 2.6186258792877197, + "loss/fcd": 1.12109375, + "loss/logits": 0.2587556540966034, + "step": 451 + }, + { + "epoch": 0.00780797899446359, + "grad_norm": 0.32421875, + "grad_norm_var": 0.0005566755930582683, + "learning_rate": 0.01, + "loss": 1.4106, + "loss/crossentropy": 2.6754432916641235, + "loss/fcd": 1.1328125, + "loss/logits": 0.2465488687157631, + "step": 452 + }, + { + "epoch": 0.00782525328427435, + "grad_norm": 0.287109375, + "grad_norm_var": 0.0005098978678385416, + "learning_rate": 0.01, + "loss": 1.3696, + "loss/crossentropy": 2.5379905700683594, + "loss/fcd": 1.16015625, + "loss/logits": 0.2804763838648796, + "step": 453 + }, + { + "epoch": 0.007842527574085111, + "grad_norm": 0.2890625, + "grad_norm_var": 0.0005257765452067058, + "learning_rate": 0.01, + "loss": 1.3851, + "loss/crossentropy": 2.5852067470550537, + "loss/fcd": 1.11328125, + "loss/logits": 0.23731224238872528, + "step": 454 + }, + { + "epoch": 0.007859801863895871, + "grad_norm": 0.302734375, + "grad_norm_var": 0.0004597345987955729, + "learning_rate": 0.01, + "loss": 1.4338, + "loss/crossentropy": 2.6572701930999756, + "loss/fcd": 1.234375, + "loss/logits": 0.28852197527885437, + "step": 455 + }, + { + "epoch": 0.007877076153706631, + "grad_norm": 0.30859375, + "grad_norm_var": 0.0004513899485270182, + "learning_rate": 0.01, + "loss": 1.3824, + "loss/crossentropy": 2.6901192665100098, + "loss/fcd": 1.12890625, + "loss/logits": 0.24115828424692154, + "step": 456 + }, + { + "epoch": 0.00789435044351739, + "grad_norm": 0.451171875, + "grad_norm_var": 0.00158538818359375, + "learning_rate": 0.01, + "loss": 1.5497, + "loss/crossentropy": 2.636592984199524, + "loss/fcd": 1.30859375, + "loss/logits": 0.36482033133506775, + "step": 457 + }, + { + "epoch": 0.00791162473332815, + "grad_norm": 0.30078125, + "grad_norm_var": 0.0015401045481363933, + "learning_rate": 0.01, + "loss": 1.3698, + "loss/crossentropy": 2.414226531982422, + "loss/fcd": 1.076171875, + "loss/logits": 0.2397611290216446, + "step": 458 + }, + { + "epoch": 0.00792889902313891, + "grad_norm": 0.3203125, + "grad_norm_var": 0.001520522435506185, + "learning_rate": 0.01, + "loss": 1.4165, + "loss/crossentropy": 2.463810086250305, + "loss/fcd": 1.1484375, + "loss/logits": 0.24305613338947296, + "step": 459 + }, + { + "epoch": 0.007946173312949672, + "grad_norm": 0.314453125, + "grad_norm_var": 0.0015141805013020833, + "learning_rate": 0.01, + "loss": 1.4418, + "loss/crossentropy": 2.451104521751404, + "loss/fcd": 1.296875, + "loss/logits": 0.30130288004875183, + "step": 460 + }, + { + "epoch": 0.007963447602760432, + "grad_norm": 0.31640625, + "grad_norm_var": 0.0014490763346354167, + "learning_rate": 0.01, + "loss": 1.3988, + "loss/crossentropy": 2.53925359249115, + "loss/fcd": 1.11328125, + "loss/logits": 0.24273447692394257, + "step": 461 + }, + { + "epoch": 0.007980721892571192, + "grad_norm": 0.318359375, + "grad_norm_var": 0.0014133294423421224, + "learning_rate": 0.01, + "loss": 1.3928, + "loss/crossentropy": 2.5229551792144775, + "loss/fcd": 1.1640625, + "loss/logits": 0.25667132437229156, + "step": 462 + }, + { + "epoch": 0.007997996182381952, + "grad_norm": 0.294921875, + "grad_norm_var": 0.0014353434244791666, + "learning_rate": 0.01, + "loss": 1.3347, + "loss/crossentropy": 2.341879367828369, + "loss/fcd": 1.12890625, + "loss/logits": 0.23053725808858871, + "step": 463 + }, + { + "epoch": 0.008015270472192712, + "grad_norm": 0.28515625, + "grad_norm_var": 0.0014744440714518229, + "learning_rate": 0.01, + "loss": 1.3569, + "loss/crossentropy": 2.2920732498168945, + "loss/fcd": 1.03515625, + "loss/logits": 0.23280857503414154, + "step": 464 + }, + { + "epoch": 0.008032544762003472, + "grad_norm": 0.349609375, + "grad_norm_var": 0.001541582743326823, + "learning_rate": 0.01, + "loss": 1.3894, + "loss/crossentropy": 2.515018939971924, + "loss/fcd": 1.10546875, + "loss/logits": 0.24030621349811554, + "step": 465 + }, + { + "epoch": 0.008049819051814231, + "grad_norm": 0.322265625, + "grad_norm_var": 0.0015279134114583334, + "learning_rate": 0.01, + "loss": 1.4597, + "loss/crossentropy": 2.2328585386276245, + "loss/fcd": 1.10546875, + "loss/logits": 0.25991010665893555, + "step": 466 + }, + { + "epoch": 0.008067093341624993, + "grad_norm": 0.306640625, + "grad_norm_var": 0.0015306949615478515, + "learning_rate": 0.01, + "loss": 1.4036, + "loss/crossentropy": 2.798638701438904, + "loss/fcd": 1.203125, + "loss/logits": 0.29376721382141113, + "step": 467 + }, + { + "epoch": 0.008084367631435753, + "grad_norm": 0.287109375, + "grad_norm_var": 0.0015871683756510417, + "learning_rate": 0.01, + "loss": 1.358, + "loss/crossentropy": 2.322153091430664, + "loss/fcd": 1.15625, + "loss/logits": 0.2475121170282364, + "step": 468 + }, + { + "epoch": 0.008101641921246513, + "grad_norm": 0.287109375, + "grad_norm_var": 0.0015871683756510417, + "learning_rate": 0.01, + "loss": 1.3756, + "loss/crossentropy": 2.2007282972335815, + "loss/fcd": 1.046875, + "loss/logits": 0.23374570161104202, + "step": 469 + }, + { + "epoch": 0.008118916211057273, + "grad_norm": 0.287109375, + "grad_norm_var": 0.001594400405883789, + "learning_rate": 0.01, + "loss": 1.366, + "loss/crossentropy": 2.408711314201355, + "loss/fcd": 1.1171875, + "loss/logits": 0.23746006190776825, + "step": 470 + }, + { + "epoch": 0.008136190500868033, + "grad_norm": 0.283203125, + "grad_norm_var": 0.0016522566477457682, + "learning_rate": 0.01, + "loss": 1.4157, + "loss/crossentropy": 2.328341841697693, + "loss/fcd": 1.15234375, + "loss/logits": 0.2784807085990906, + "step": 471 + }, + { + "epoch": 0.008153464790678792, + "grad_norm": 0.3046875, + "grad_norm_var": 0.0016563256581624349, + "learning_rate": 0.01, + "loss": 1.3845, + "loss/crossentropy": 2.414987564086914, + "loss/fcd": 1.26171875, + "loss/logits": 0.32799775153398514, + "step": 472 + }, + { + "epoch": 0.008170739080489554, + "grad_norm": 0.357421875, + "grad_norm_var": 0.0004951318105061848, + "learning_rate": 0.01, + "loss": 1.4935, + "loss/crossentropy": 2.597047209739685, + "loss/fcd": 1.34375, + "loss/logits": 0.3595212921500206, + "step": 473 + }, + { + "epoch": 0.008188013370300314, + "grad_norm": 0.314453125, + "grad_norm_var": 0.0004927953084309896, + "learning_rate": 0.01, + "loss": 1.4074, + "loss/crossentropy": 2.6870315074920654, + "loss/fcd": 1.15625, + "loss/logits": 0.2819272577762604, + "step": 474 + }, + { + "epoch": 0.008205287660111074, + "grad_norm": 0.302734375, + "grad_norm_var": 0.0004863580067952474, + "learning_rate": 0.01, + "loss": 1.4023, + "loss/crossentropy": 2.416118621826172, + "loss/fcd": 1.171875, + "loss/logits": 0.2792641520500183, + "step": 475 + }, + { + "epoch": 0.008222561949921834, + "grad_norm": 0.3203125, + "grad_norm_var": 0.0004933675130208334, + "learning_rate": 0.01, + "loss": 1.3668, + "loss/crossentropy": 2.4251519441604614, + "loss/fcd": 1.12890625, + "loss/logits": 0.25571418553590775, + "step": 476 + }, + { + "epoch": 0.008239836239732594, + "grad_norm": 0.33203125, + "grad_norm_var": 0.00052490234375, + "learning_rate": 0.01, + "loss": 1.396, + "loss/crossentropy": 2.2888123989105225, + "loss/fcd": 1.109375, + "loss/logits": 0.2410544455051422, + "step": 477 + }, + { + "epoch": 0.008257110529543353, + "grad_norm": 0.310546875, + "grad_norm_var": 0.000519561767578125, + "learning_rate": 0.01, + "loss": 1.3594, + "loss/crossentropy": 2.479097008705139, + "loss/fcd": 1.171875, + "loss/logits": 0.25502997636795044, + "step": 478 + }, + { + "epoch": 0.008274384819354115, + "grad_norm": 0.298828125, + "grad_norm_var": 0.000513140360514323, + "learning_rate": 0.01, + "loss": 1.3785, + "loss/crossentropy": 2.4117250442504883, + "loss/fcd": 1.1328125, + "loss/logits": 0.26754797995090485, + "step": 479 + }, + { + "epoch": 0.008291659109164875, + "grad_norm": 0.34375, + "grad_norm_var": 0.0005388895670572917, + "learning_rate": 0.01, + "loss": 1.4354, + "loss/crossentropy": 2.577602744102478, + "loss/fcd": 1.24609375, + "loss/logits": 0.2731374129652977, + "step": 480 + }, + { + "epoch": 0.008308933398975635, + "grad_norm": 0.283203125, + "grad_norm_var": 0.000490252176920573, + "learning_rate": 0.01, + "loss": 1.3588, + "loss/crossentropy": 2.3125388622283936, + "loss/fcd": 1.109375, + "loss/logits": 0.2633324861526489, + "step": 481 + }, + { + "epoch": 0.008326207688786395, + "grad_norm": 0.2890625, + "grad_norm_var": 0.0004997094472249349, + "learning_rate": 0.01, + "loss": 1.3518, + "loss/crossentropy": 2.3964109420776367, + "loss/fcd": 1.09375, + "loss/logits": 0.24801631271839142, + "step": 482 + }, + { + "epoch": 0.008343481978597155, + "grad_norm": 0.28515625, + "grad_norm_var": 0.0005289077758789063, + "learning_rate": 0.01, + "loss": 1.3619, + "loss/crossentropy": 2.5348154306411743, + "loss/fcd": 1.1328125, + "loss/logits": 0.273783415555954, + "step": 483 + }, + { + "epoch": 0.008360756268407914, + "grad_norm": 0.3046875, + "grad_norm_var": 0.0005053043365478516, + "learning_rate": 0.01, + "loss": 1.3716, + "loss/crossentropy": 2.525968909263611, + "loss/fcd": 1.11328125, + "loss/logits": 0.25891977548599243, + "step": 484 + }, + { + "epoch": 0.008378030558218676, + "grad_norm": 0.357421875, + "grad_norm_var": 0.000632333755493164, + "learning_rate": 0.01, + "loss": 1.4665, + "loss/crossentropy": 2.476569890975952, + "loss/fcd": 1.203125, + "loss/logits": 0.29254642128944397, + "step": 485 + }, + { + "epoch": 0.008395304848029436, + "grad_norm": 0.275390625, + "grad_norm_var": 0.000678110122680664, + "learning_rate": 0.01, + "loss": 1.3305, + "loss/crossentropy": 2.4879168272018433, + "loss/fcd": 1.08203125, + "loss/logits": 0.22623379528522491, + "step": 486 + }, + { + "epoch": 0.008412579137840196, + "grad_norm": 0.32421875, + "grad_norm_var": 0.0006357192993164063, + "learning_rate": 0.01, + "loss": 1.3945, + "loss/crossentropy": 2.4186280965805054, + "loss/fcd": 1.09375, + "loss/logits": 0.23819412291049957, + "step": 487 + }, + { + "epoch": 0.008429853427650956, + "grad_norm": 0.287109375, + "grad_norm_var": 0.0006739139556884765, + "learning_rate": 0.01, + "loss": 1.3611, + "loss/crossentropy": 2.2941300868988037, + "loss/fcd": 1.0625, + "loss/logits": 0.22146066278219223, + "step": 488 + }, + { + "epoch": 0.008447127717461715, + "grad_norm": 0.28125, + "grad_norm_var": 0.0005716323852539062, + "learning_rate": 0.01, + "loss": 1.3797, + "loss/crossentropy": 2.368129849433899, + "loss/fcd": 1.16796875, + "loss/logits": 0.2645361125469208, + "step": 489 + }, + { + "epoch": 0.008464402007272475, + "grad_norm": 0.296875, + "grad_norm_var": 0.0005732059478759765, + "learning_rate": 0.01, + "loss": 1.3563, + "loss/crossentropy": 2.5257701873779297, + "loss/fcd": 1.1171875, + "loss/logits": 0.2530096620321274, + "step": 490 + }, + { + "epoch": 0.008481676297083237, + "grad_norm": 0.263671875, + "grad_norm_var": 0.0006844679514567058, + "learning_rate": 0.01, + "loss": 1.3688, + "loss/crossentropy": 2.1511563062667847, + "loss/fcd": 1.0546875, + "loss/logits": 0.240036740899086, + "step": 491 + }, + { + "epoch": 0.008498950586893997, + "grad_norm": 0.298828125, + "grad_norm_var": 0.0006647109985351562, + "learning_rate": 0.01, + "loss": 1.3563, + "loss/crossentropy": 2.370754837989807, + "loss/fcd": 1.18359375, + "loss/logits": 0.2698900103569031, + "step": 492 + }, + { + "epoch": 0.008516224876704757, + "grad_norm": 0.302734375, + "grad_norm_var": 0.0006010532379150391, + "learning_rate": 0.01, + "loss": 1.4051, + "loss/crossentropy": 2.55213725566864, + "loss/fcd": 1.19140625, + "loss/logits": 0.26752666383981705, + "step": 493 + }, + { + "epoch": 0.008533499166515517, + "grad_norm": 0.314453125, + "grad_norm_var": 0.0006074110666910807, + "learning_rate": 0.01, + "loss": 1.3304, + "loss/crossentropy": 2.878965377807617, + "loss/fcd": 1.140625, + "loss/logits": 0.248264878988266, + "step": 494 + }, + { + "epoch": 0.008550773456326276, + "grad_norm": 0.3046875, + "grad_norm_var": 0.0006083170572916667, + "learning_rate": 0.01, + "loss": 1.3554, + "loss/crossentropy": 2.389639139175415, + "loss/fcd": 1.08203125, + "loss/logits": 0.2504645884037018, + "step": 495 + }, + { + "epoch": 0.008568047746137036, + "grad_norm": 0.26953125, + "grad_norm_var": 0.0005273818969726562, + "learning_rate": 0.01, + "loss": 1.353, + "loss/crossentropy": 2.261403799057007, + "loss/fcd": 1.03125, + "loss/logits": 0.22545771300792694, + "step": 496 + }, + { + "epoch": 0.008585322035947798, + "grad_norm": 0.328125, + "grad_norm_var": 0.0005760033925374349, + "learning_rate": 0.01, + "loss": 1.4314, + "loss/crossentropy": 2.755717635154724, + "loss/fcd": 1.18359375, + "loss/logits": 0.28124481439590454, + "step": 497 + }, + { + "epoch": 0.008602596325758558, + "grad_norm": 0.306640625, + "grad_norm_var": 0.0005721410115559895, + "learning_rate": 0.01, + "loss": 1.39, + "loss/crossentropy": 2.5332454442977905, + "loss/fcd": 1.14453125, + "loss/logits": 0.2577049881219864, + "step": 498 + }, + { + "epoch": 0.008619870615569318, + "grad_norm": 0.2734375, + "grad_norm_var": 0.0006039937337239584, + "learning_rate": 0.01, + "loss": 1.3284, + "loss/crossentropy": 2.3752284049987793, + "loss/fcd": 1.09375, + "loss/logits": 0.24180973321199417, + "step": 499 + }, + { + "epoch": 0.008637144905380078, + "grad_norm": 0.291015625, + "grad_norm_var": 0.0006058851877848308, + "learning_rate": 0.01, + "loss": 1.3868, + "loss/crossentropy": 2.299641966819763, + "loss/fcd": 1.1328125, + "loss/logits": 0.2509627118706703, + "step": 500 + }, + { + "epoch": 0.008654419195190837, + "grad_norm": 0.294921875, + "grad_norm_var": 0.00035869280497233074, + "learning_rate": 0.01, + "loss": 1.3756, + "loss/crossentropy": 2.2871060371398926, + "loss/fcd": 1.06640625, + "loss/logits": 0.22674021124839783, + "step": 501 + }, + { + "epoch": 0.008671693485001597, + "grad_norm": 0.310546875, + "grad_norm_var": 0.00034610430399576825, + "learning_rate": 0.01, + "loss": 1.3644, + "loss/crossentropy": 2.2024362087249756, + "loss/fcd": 1.09375, + "loss/logits": 0.2369084656238556, + "step": 502 + }, + { + "epoch": 0.008688967774812357, + "grad_norm": 0.28125, + "grad_norm_var": 0.00030414263407389325, + "learning_rate": 0.01, + "loss": 1.343, + "loss/crossentropy": 2.5880898237228394, + "loss/fcd": 1.16796875, + "loss/logits": 0.25857551395893097, + "step": 503 + }, + { + "epoch": 0.008706242064623119, + "grad_norm": 0.306640625, + "grad_norm_var": 0.00030986467997233075, + "learning_rate": 0.01, + "loss": 1.4237, + "loss/crossentropy": 2.3485684394836426, + "loss/fcd": 1.16796875, + "loss/logits": 0.266690656542778, + "step": 504 + }, + { + "epoch": 0.008723516354433879, + "grad_norm": 0.353515625, + "grad_norm_var": 0.0005009969075520834, + "learning_rate": 0.01, + "loss": 1.3758, + "loss/crossentropy": 2.539777636528015, + "loss/fcd": 1.078125, + "loss/logits": 0.24045251309871674, + "step": 505 + }, + { + "epoch": 0.008740790644244639, + "grad_norm": 0.2734375, + "grad_norm_var": 0.0005444844563802083, + "learning_rate": 0.01, + "loss": 1.3113, + "loss/crossentropy": 2.492120862007141, + "loss/fcd": 1.1171875, + "loss/logits": 0.2610347419977188, + "step": 506 + }, + { + "epoch": 0.008758064934055398, + "grad_norm": 0.28125, + "grad_norm_var": 0.0004825433095296224, + "learning_rate": 0.01, + "loss": 1.4436, + "loss/crossentropy": 2.5324673652648926, + "loss/fcd": 1.33984375, + "loss/logits": 0.3312453627586365, + "step": 507 + }, + { + "epoch": 0.008775339223866158, + "grad_norm": 0.275390625, + "grad_norm_var": 0.0005187829335530599, + "learning_rate": 0.01, + "loss": 1.3478, + "loss/crossentropy": 2.612854242324829, + "loss/fcd": 1.1640625, + "loss/logits": 0.261405885219574, + "step": 508 + }, + { + "epoch": 0.008792613513676918, + "grad_norm": 0.3203125, + "grad_norm_var": 0.0005492528279622395, + "learning_rate": 0.01, + "loss": 1.3917, + "loss/crossentropy": 2.4303773641586304, + "loss/fcd": 1.11328125, + "loss/logits": 0.24008433520793915, + "step": 509 + }, + { + "epoch": 0.00880988780348768, + "grad_norm": 0.306640625, + "grad_norm_var": 0.0005370457967122396, + "learning_rate": 0.01, + "loss": 1.3929, + "loss/crossentropy": 2.676490068435669, + "loss/fcd": 1.23828125, + "loss/logits": 0.291456863284111, + "step": 510 + }, + { + "epoch": 0.00882716209329844, + "grad_norm": 0.248046875, + "grad_norm_var": 0.0006914615631103515, + "learning_rate": 0.01, + "loss": 1.3138, + "loss/crossentropy": 2.1477047204971313, + "loss/fcd": 1.09765625, + "loss/logits": 0.2524523437023163, + "step": 511 + }, + { + "epoch": 0.0088444363831092, + "grad_norm": 0.29296875, + "grad_norm_var": 0.0006460666656494141, + "learning_rate": 0.01, + "loss": 1.3208, + "loss/crossentropy": 2.3151156902313232, + "loss/fcd": 1.08984375, + "loss/logits": 0.2605459988117218, + "step": 512 + }, + { + "epoch": 0.00886171067291996, + "grad_norm": 0.296875, + "grad_norm_var": 0.0005753676096598308, + "learning_rate": 0.01, + "loss": 1.357, + "loss/crossentropy": 2.4916226863861084, + "loss/fcd": 1.1640625, + "loss/logits": 0.25671282410621643, + "step": 513 + }, + { + "epoch": 0.00887898496273072, + "grad_norm": 0.328125, + "grad_norm_var": 0.0006388346354166667, + "learning_rate": 0.01, + "loss": 1.4206, + "loss/crossentropy": 2.2333791255950928, + "loss/fcd": 1.1484375, + "loss/logits": 0.28083400428295135, + "step": 514 + }, + { + "epoch": 0.00889625925254148, + "grad_norm": 0.287109375, + "grad_norm_var": 0.0006095727284749348, + "learning_rate": 0.01, + "loss": 1.3783, + "loss/crossentropy": 2.606614589691162, + "loss/fcd": 1.08203125, + "loss/logits": 0.2666025906801224, + "step": 515 + }, + { + "epoch": 0.00891353354235224, + "grad_norm": 0.3125, + "grad_norm_var": 0.0006219863891601563, + "learning_rate": 0.01, + "loss": 1.3641, + "loss/crossentropy": 2.5051095485687256, + "loss/fcd": 1.2109375, + "loss/logits": 0.25101958215236664, + "step": 516 + }, + { + "epoch": 0.008930807832163, + "grad_norm": 0.353515625, + "grad_norm_var": 0.000811767578125, + "learning_rate": 0.01, + "loss": 1.4186, + "loss/crossentropy": 2.3850373029708862, + "loss/fcd": 1.13671875, + "loss/logits": 0.27909501641988754, + "step": 517 + }, + { + "epoch": 0.00894808212197376, + "grad_norm": 0.298828125, + "grad_norm_var": 0.0008066177368164062, + "learning_rate": 0.01, + "loss": 1.366, + "loss/crossentropy": 2.217817187309265, + "loss/fcd": 1.03125, + "loss/logits": 0.23760483413934708, + "step": 518 + }, + { + "epoch": 0.00896535641178452, + "grad_norm": 0.294921875, + "grad_norm_var": 0.0007822513580322266, + "learning_rate": 0.01, + "loss": 1.3482, + "loss/crossentropy": 2.537502408027649, + "loss/fcd": 1.15234375, + "loss/logits": 0.27564045786857605, + "step": 519 + }, + { + "epoch": 0.00898263070159528, + "grad_norm": 0.333984375, + "grad_norm_var": 0.0008463382720947266, + "learning_rate": 0.01, + "loss": 1.4875, + "loss/crossentropy": 2.628643035888672, + "loss/fcd": 1.31640625, + "loss/logits": 0.30241404473781586, + "step": 520 + }, + { + "epoch": 0.00899990499140604, + "grad_norm": 0.310546875, + "grad_norm_var": 0.0006756941477457682, + "learning_rate": 0.01, + "loss": 1.4536, + "loss/crossentropy": 2.2907025814056396, + "loss/fcd": 1.1015625, + "loss/logits": 0.2538699805736542, + "step": 521 + }, + { + "epoch": 0.009017179281216802, + "grad_norm": 0.28515625, + "grad_norm_var": 0.0006413618723551432, + "learning_rate": 0.01, + "loss": 1.4079, + "loss/crossentropy": 2.5753923654556274, + "loss/fcd": 1.2109375, + "loss/logits": 0.2975587248802185, + "step": 522 + }, + { + "epoch": 0.009034453571027562, + "grad_norm": 0.31640625, + "grad_norm_var": 0.0006230513254801433, + "learning_rate": 0.01, + "loss": 1.3724, + "loss/crossentropy": 2.327569842338562, + "loss/fcd": 1.07421875, + "loss/logits": 0.2123243287205696, + "step": 523 + }, + { + "epoch": 0.009051727860838321, + "grad_norm": 0.345703125, + "grad_norm_var": 0.0006653944651285808, + "learning_rate": 0.01, + "loss": 1.484, + "loss/crossentropy": 2.4529794454574585, + "loss/fcd": 1.2109375, + "loss/logits": 0.2642442062497139, + "step": 524 + }, + { + "epoch": 0.009069002150649081, + "grad_norm": 0.29296875, + "grad_norm_var": 0.0006680647532145182, + "learning_rate": 0.01, + "loss": 1.3426, + "loss/crossentropy": 2.405073642730713, + "loss/fcd": 1.11328125, + "loss/logits": 0.24681153148412704, + "step": 525 + }, + { + "epoch": 0.009086276440459841, + "grad_norm": 0.291015625, + "grad_norm_var": 0.0006830692291259766, + "learning_rate": 0.01, + "loss": 1.3501, + "loss/crossentropy": 2.5336978435516357, + "loss/fcd": 1.13671875, + "loss/logits": 0.26675350964069366, + "step": 526 + }, + { + "epoch": 0.009103550730270601, + "grad_norm": 0.314453125, + "grad_norm_var": 0.00044960975646972655, + "learning_rate": 0.01, + "loss": 1.4051, + "loss/crossentropy": 2.306818962097168, + "loss/fcd": 1.1328125, + "loss/logits": 0.24449439346790314, + "step": 527 + }, + { + "epoch": 0.009120825020081363, + "grad_norm": 0.29296875, + "grad_norm_var": 0.00044960975646972655, + "learning_rate": 0.01, + "loss": 1.3847, + "loss/crossentropy": 2.394535183906555, + "loss/fcd": 1.15625, + "loss/logits": 0.2896339148283005, + "step": 528 + }, + { + "epoch": 0.009138099309892123, + "grad_norm": 0.32421875, + "grad_norm_var": 0.00044960975646972655, + "learning_rate": 0.01, + "loss": 1.383, + "loss/crossentropy": 2.502661347389221, + "loss/fcd": 1.10546875, + "loss/logits": 0.2570330798625946, + "step": 529 + }, + { + "epoch": 0.009155373599702882, + "grad_norm": 0.283203125, + "grad_norm_var": 0.0004755655924479167, + "learning_rate": 0.01, + "loss": 1.3914, + "loss/crossentropy": 2.5401047468185425, + "loss/fcd": 1.1328125, + "loss/logits": 0.25133057683706284, + "step": 530 + }, + { + "epoch": 0.009172647889513642, + "grad_norm": 0.294921875, + "grad_norm_var": 0.000457000732421875, + "learning_rate": 0.01, + "loss": 1.3288, + "loss/crossentropy": 2.357369303703308, + "loss/fcd": 1.140625, + "loss/logits": 0.25731976330280304, + "step": 531 + }, + { + "epoch": 0.009189922179324402, + "grad_norm": 0.279296875, + "grad_norm_var": 0.0005107720692952474, + "learning_rate": 0.01, + "loss": 1.3301, + "loss/crossentropy": 2.361912250518799, + "loss/fcd": 1.03125, + "loss/logits": 0.23256495594978333, + "step": 532 + }, + { + "epoch": 0.009207196469135162, + "grad_norm": 0.306640625, + "grad_norm_var": 0.0003574212392171224, + "learning_rate": 0.01, + "loss": 1.4286, + "loss/crossentropy": 2.5182912349700928, + "loss/fcd": 1.1328125, + "loss/logits": 0.24184302985668182, + "step": 533 + }, + { + "epoch": 0.009224470758945922, + "grad_norm": 0.302734375, + "grad_norm_var": 0.00035564104715983075, + "learning_rate": 0.01, + "loss": 1.3729, + "loss/crossentropy": 2.3095160722732544, + "loss/fcd": 1.068359375, + "loss/logits": 0.22853360325098038, + "step": 534 + }, + { + "epoch": 0.009241745048756684, + "grad_norm": 0.3046875, + "grad_norm_var": 0.00034936269124348957, + "learning_rate": 0.01, + "loss": 1.4586, + "loss/crossentropy": 2.4540841579437256, + "loss/fcd": 1.26953125, + "loss/logits": 0.3655036687850952, + "step": 535 + }, + { + "epoch": 0.009259019338567443, + "grad_norm": 0.271484375, + "grad_norm_var": 0.0003513971964518229, + "learning_rate": 0.01, + "loss": 1.3534, + "loss/crossentropy": 2.350268244743347, + "loss/fcd": 1.02734375, + "loss/logits": 0.21084149181842804, + "step": 536 + }, + { + "epoch": 0.009276293628378203, + "grad_norm": 0.314453125, + "grad_norm_var": 0.0003573099772135417, + "learning_rate": 0.01, + "loss": 1.4337, + "loss/crossentropy": 2.1304550170898438, + "loss/fcd": 1.15234375, + "loss/logits": 0.2608063519001007, + "step": 537 + }, + { + "epoch": 0.009293567918188963, + "grad_norm": 0.30078125, + "grad_norm_var": 0.00033899943033854164, + "learning_rate": 0.01, + "loss": 1.3731, + "loss/crossentropy": 2.4391915798187256, + "loss/fcd": 1.109375, + "loss/logits": 0.2429627627134323, + "step": 538 + }, + { + "epoch": 0.009310842207999723, + "grad_norm": 0.2490234375, + "grad_norm_var": 0.0004955569903055827, + "learning_rate": 0.01, + "loss": 1.3286, + "loss/crossentropy": 2.3171310424804688, + "loss/fcd": 1.078125, + "loss/logits": 0.2482328712940216, + "step": 539 + }, + { + "epoch": 0.009328116497810483, + "grad_norm": 0.34375, + "grad_norm_var": 0.0004833817481994629, + "learning_rate": 0.01, + "loss": 1.5811, + "loss/crossentropy": 2.376081347465515, + "loss/fcd": 1.10546875, + "loss/logits": 0.25213149189949036, + "step": 540 + }, + { + "epoch": 0.009345390787621245, + "grad_norm": 0.275390625, + "grad_norm_var": 0.0005142807960510254, + "learning_rate": 0.01, + "loss": 1.3856, + "loss/crossentropy": 2.4632989168167114, + "loss/fcd": 1.078125, + "loss/logits": 0.2334313914179802, + "step": 541 + }, + { + "epoch": 0.009362665077432004, + "grad_norm": 0.267578125, + "grad_norm_var": 0.0005667328834533692, + "learning_rate": 0.01, + "loss": 1.2882, + "loss/crossentropy": 2.177401542663574, + "loss/fcd": 1.0390625, + "loss/logits": 0.24528680741786957, + "step": 542 + }, + { + "epoch": 0.009379939367242764, + "grad_norm": 0.2890625, + "grad_norm_var": 0.0005423506100972493, + "learning_rate": 0.01, + "loss": 1.4063, + "loss/crossentropy": 2.4587985277175903, + "loss/fcd": 1.22265625, + "loss/logits": 0.2990281730890274, + "step": 543 + }, + { + "epoch": 0.009397213657053524, + "grad_norm": 0.275390625, + "grad_norm_var": 0.0005635221799214681, + "learning_rate": 0.01, + "loss": 1.3479, + "loss/crossentropy": 2.5811359882354736, + "loss/fcd": 1.1484375, + "loss/logits": 0.2688131481409073, + "step": 544 + }, + { + "epoch": 0.009414487946864284, + "grad_norm": 0.294921875, + "grad_norm_var": 0.0004939039548238119, + "learning_rate": 0.01, + "loss": 1.3695, + "loss/crossentropy": 2.51469349861145, + "loss/fcd": 1.16796875, + "loss/logits": 0.26591262221336365, + "step": 545 + }, + { + "epoch": 0.009431762236675044, + "grad_norm": 0.462890625, + "grad_norm_var": 0.002329091231028239, + "learning_rate": 0.01, + "loss": 1.3836, + "loss/crossentropy": 2.46504545211792, + "loss/fcd": 1.1171875, + "loss/logits": 0.25862205028533936, + "step": 546 + }, + { + "epoch": 0.009449036526485806, + "grad_norm": 0.279296875, + "grad_norm_var": 0.0023592273394266766, + "learning_rate": 0.01, + "loss": 1.3895, + "loss/crossentropy": 2.6220297813415527, + "loss/fcd": 1.109375, + "loss/logits": 0.2548370361328125, + "step": 547 + }, + { + "epoch": 0.009466310816296565, + "grad_norm": 0.283203125, + "grad_norm_var": 0.0023488322893778484, + "learning_rate": 0.01, + "loss": 1.3737, + "loss/crossentropy": 2.591723322868347, + "loss/fcd": 1.13671875, + "loss/logits": 0.25868477672338486, + "step": 548 + }, + { + "epoch": 0.009483585106107325, + "grad_norm": 0.345703125, + "grad_norm_var": 0.0024718562761942547, + "learning_rate": 0.01, + "loss": 1.4328, + "loss/crossentropy": 2.568224310874939, + "loss/fcd": 1.1875, + "loss/logits": 0.278149738907814, + "step": 549 + }, + { + "epoch": 0.009500859395918085, + "grad_norm": 0.2890625, + "grad_norm_var": 0.0024854302406311034, + "learning_rate": 0.01, + "loss": 1.4231, + "loss/crossentropy": 2.5823177099227905, + "loss/fcd": 1.25, + "loss/logits": 0.2855361998081207, + "step": 550 + }, + { + "epoch": 0.009518133685728845, + "grad_norm": 0.279296875, + "grad_norm_var": 0.0025197307268778482, + "learning_rate": 0.01, + "loss": 1.3549, + "loss/crossentropy": 2.8035439252853394, + "loss/fcd": 1.12890625, + "loss/logits": 0.26180362701416016, + "step": 551 + }, + { + "epoch": 0.009535407975539605, + "grad_norm": 0.30859375, + "grad_norm_var": 0.0024581233660380046, + "learning_rate": 0.01, + "loss": 1.3912, + "loss/crossentropy": 2.695222020149231, + "loss/fcd": 1.16796875, + "loss/logits": 0.26626719534397125, + "step": 552 + }, + { + "epoch": 0.009552682265350367, + "grad_norm": 0.296875, + "grad_norm_var": 0.002452115217844645, + "learning_rate": 0.01, + "loss": 1.3884, + "loss/crossentropy": 2.2692904472351074, + "loss/fcd": 1.12890625, + "loss/logits": 0.26358961313962936, + "step": 553 + }, + { + "epoch": 0.009569956555161126, + "grad_norm": 0.484375, + "grad_norm_var": 0.004515453179677328, + "learning_rate": 0.01, + "loss": 1.4362, + "loss/crossentropy": 2.587984561920166, + "loss/fcd": 1.203125, + "loss/logits": 0.28202252089977264, + "step": 554 + }, + { + "epoch": 0.009587230844971886, + "grad_norm": 0.28125, + "grad_norm_var": 0.004301055272420248, + "learning_rate": 0.01, + "loss": 1.3332, + "loss/crossentropy": 2.238184094429016, + "loss/fcd": 1.125, + "loss/logits": 0.25094330310821533, + "step": 555 + }, + { + "epoch": 0.009604505134782646, + "grad_norm": 0.3046875, + "grad_norm_var": 0.004252099990844726, + "learning_rate": 0.01, + "loss": 1.3777, + "loss/crossentropy": 2.2282315492630005, + "loss/fcd": 1.0625, + "loss/logits": 0.2441270500421524, + "step": 556 + }, + { + "epoch": 0.009621779424593406, + "grad_norm": 0.2890625, + "grad_norm_var": 0.004194132486979167, + "learning_rate": 0.01, + "loss": 1.3897, + "loss/crossentropy": 2.354749321937561, + "loss/fcd": 1.171875, + "loss/logits": 0.26998236775398254, + "step": 557 + }, + { + "epoch": 0.009639053714404166, + "grad_norm": 0.291015625, + "grad_norm_var": 0.0040819803873697914, + "learning_rate": 0.01, + "loss": 1.4072, + "loss/crossentropy": 2.3754522800445557, + "loss/fcd": 1.125, + "loss/logits": 0.27060529589653015, + "step": 558 + }, + { + "epoch": 0.009656328004214927, + "grad_norm": 0.306640625, + "grad_norm_var": 0.0040383497873942055, + "learning_rate": 0.01, + "loss": 1.3777, + "loss/crossentropy": 2.3385682106018066, + "loss/fcd": 1.109375, + "loss/logits": 0.24154536426067352, + "step": 559 + }, + { + "epoch": 0.009673602294025687, + "grad_norm": 0.30859375, + "grad_norm_var": 0.00392297108968099, + "learning_rate": 0.01, + "loss": 1.4435, + "loss/crossentropy": 2.525418996810913, + "loss/fcd": 1.15234375, + "loss/logits": 0.2767959535121918, + "step": 560 + }, + { + "epoch": 0.009690876583836447, + "grad_norm": 0.3203125, + "grad_norm_var": 0.003881438573201497, + "learning_rate": 0.01, + "loss": 1.3849, + "loss/crossentropy": 2.291569232940674, + "loss/fcd": 1.12890625, + "loss/logits": 0.28253524005413055, + "step": 561 + }, + { + "epoch": 0.009708150873647207, + "grad_norm": 0.32421875, + "grad_norm_var": 0.0024538675944010416, + "learning_rate": 0.01, + "loss": 1.3855, + "loss/crossentropy": 1.8735097646713257, + "loss/fcd": 1.171875, + "loss/logits": 0.186705082654953, + "step": 562 + }, + { + "epoch": 0.009725425163457967, + "grad_norm": 0.27734375, + "grad_norm_var": 0.002462625503540039, + "learning_rate": 0.01, + "loss": 1.3507, + "loss/crossentropy": 2.2446945905685425, + "loss/fcd": 1.0859375, + "loss/logits": 0.23518769443035126, + "step": 563 + }, + { + "epoch": 0.009742699453268727, + "grad_norm": 0.318359375, + "grad_norm_var": 0.002405405044555664, + "learning_rate": 0.01, + "loss": 1.3812, + "loss/crossentropy": 1.9525874853134155, + "loss/fcd": 1.2421875, + "loss/logits": 0.19731061905622482, + "step": 564 + }, + { + "epoch": 0.009759973743079488, + "grad_norm": 0.306640625, + "grad_norm_var": 0.00233610471089681, + "learning_rate": 0.01, + "loss": 1.3995, + "loss/crossentropy": 2.53279709815979, + "loss/fcd": 1.16796875, + "loss/logits": 0.27113981544971466, + "step": 565 + }, + { + "epoch": 0.009777248032890248, + "grad_norm": 0.275390625, + "grad_norm_var": 0.0023889541625976562, + "learning_rate": 0.01, + "loss": 1.346, + "loss/crossentropy": 2.4163317680358887, + "loss/fcd": 1.14453125, + "loss/logits": 0.26083898544311523, + "step": 566 + }, + { + "epoch": 0.009794522322701008, + "grad_norm": 0.306640625, + "grad_norm_var": 0.00232086181640625, + "learning_rate": 0.01, + "loss": 1.3419, + "loss/crossentropy": 2.4386374950408936, + "loss/fcd": 1.1171875, + "loss/logits": 0.2661859691143036, + "step": 567 + }, + { + "epoch": 0.009811796612511768, + "grad_norm": 0.296875, + "grad_norm_var": 0.0023355484008789062, + "learning_rate": 0.01, + "loss": 1.3659, + "loss/crossentropy": 2.509569525718689, + "loss/fcd": 1.203125, + "loss/logits": 0.263532429933548, + "step": 568 + }, + { + "epoch": 0.009829070902322528, + "grad_norm": 0.30859375, + "grad_norm_var": 0.00232086181640625, + "learning_rate": 0.01, + "loss": 1.434, + "loss/crossentropy": 2.400490880012512, + "loss/fcd": 1.1171875, + "loss/logits": 0.24774880707263947, + "step": 569 + }, + { + "epoch": 0.009846345192133288, + "grad_norm": 0.322265625, + "grad_norm_var": 0.0002483208974202474, + "learning_rate": 0.01, + "loss": 1.4385, + "loss/crossentropy": 2.390196442604065, + "loss/fcd": 1.07421875, + "loss/logits": 0.2328876331448555, + "step": 570 + }, + { + "epoch": 0.009863619481944048, + "grad_norm": 0.2890625, + "grad_norm_var": 0.00023013750712076823, + "learning_rate": 0.01, + "loss": 1.3399, + "loss/crossentropy": 2.399609327316284, + "loss/fcd": 1.17578125, + "loss/logits": 0.2631242126226425, + "step": 571 + }, + { + "epoch": 0.00988089377175481, + "grad_norm": 0.388671875, + "grad_norm_var": 0.0006914774576822917, + "learning_rate": 0.01, + "loss": 1.4129, + "loss/crossentropy": 2.5639859437942505, + "loss/fcd": 1.171875, + "loss/logits": 0.2520062252879143, + "step": 572 + }, + { + "epoch": 0.00989816806156557, + "grad_norm": 0.34765625, + "grad_norm_var": 0.000757280985514323, + "learning_rate": 0.01, + "loss": 1.4226, + "loss/crossentropy": 2.4615684747695923, + "loss/fcd": 1.12109375, + "loss/logits": 0.2613854482769966, + "step": 573 + }, + { + "epoch": 0.009915442351376329, + "grad_norm": 0.296875, + "grad_norm_var": 0.0007432142893473308, + "learning_rate": 0.01, + "loss": 1.4494, + "loss/crossentropy": 2.4410594701766968, + "loss/fcd": 1.1953125, + "loss/logits": 0.3067672997713089, + "step": 574 + }, + { + "epoch": 0.009932716641187089, + "grad_norm": 0.3125, + "grad_norm_var": 0.0007410685221354167, + "learning_rate": 0.01, + "loss": 1.4228, + "loss/crossentropy": 2.6319605112075806, + "loss/fcd": 1.0859375, + "loss/logits": 0.24571086466312408, + "step": 575 + }, + { + "epoch": 0.009949990930997849, + "grad_norm": 0.28125, + "grad_norm_var": 0.0008020401000976562, + "learning_rate": 0.01, + "loss": 1.2951, + "loss/crossentropy": 2.368131637573242, + "loss/fcd": 1.03515625, + "loss/logits": 0.23180700838565826, + "step": 576 + }, + { + "epoch": 0.009967265220808609, + "grad_norm": 0.275390625, + "grad_norm_var": 0.0008711338043212891, + "learning_rate": 0.01, + "loss": 1.3585, + "loss/crossentropy": 2.197615623474121, + "loss/fcd": 1.08203125, + "loss/logits": 0.2364010065793991, + "step": 577 + }, + { + "epoch": 0.00998453951061937, + "grad_norm": 0.2890625, + "grad_norm_var": 0.0008722782135009765, + "learning_rate": 0.01, + "loss": 1.3929, + "loss/crossentropy": 2.5560864210128784, + "loss/fcd": 1.11328125, + "loss/logits": 0.25519636273384094, + "step": 578 + }, + { + "epoch": 0.01000181380043013, + "grad_norm": 0.32421875, + "grad_norm_var": 0.0008318424224853516, + "learning_rate": 0.01, + "loss": 1.3903, + "loss/crossentropy": 2.290327787399292, + "loss/fcd": 1.08203125, + "loss/logits": 0.242530919611454, + "step": 579 + }, + { + "epoch": 0.01001908809024089, + "grad_norm": 0.279296875, + "grad_norm_var": 0.0008769830067952474, + "learning_rate": 0.01, + "loss": 1.3204, + "loss/crossentropy": 2.558402419090271, + "loss/fcd": 1.0703125, + "loss/logits": 0.24013052880764008, + "step": 580 + }, + { + "epoch": 0.01003636238005165, + "grad_norm": 0.314453125, + "grad_norm_var": 0.0008811791737874349, + "learning_rate": 0.01, + "loss": 1.3934, + "loss/crossentropy": 2.3049778938293457, + "loss/fcd": 1.140625, + "loss/logits": 0.24487978965044022, + "step": 581 + }, + { + "epoch": 0.01005363666986241, + "grad_norm": 0.275390625, + "grad_norm_var": 0.0008811791737874349, + "learning_rate": 0.01, + "loss": 1.3844, + "loss/crossentropy": 2.5796691179275513, + "loss/fcd": 1.10546875, + "loss/logits": 0.2458028495311737, + "step": 582 + }, + { + "epoch": 0.01007091095967317, + "grad_norm": 0.330078125, + "grad_norm_var": 0.0009151299794514974, + "learning_rate": 0.01, + "loss": 1.4305, + "loss/crossentropy": 2.3386783599853516, + "loss/fcd": 1.14453125, + "loss/logits": 0.24171485751867294, + "step": 583 + }, + { + "epoch": 0.010088185249483931, + "grad_norm": 0.298828125, + "grad_norm_var": 0.0009124120076497396, + "learning_rate": 0.01, + "loss": 1.386, + "loss/crossentropy": 2.3040322065353394, + "loss/fcd": 1.1171875, + "loss/logits": 0.25387245416641235, + "step": 584 + }, + { + "epoch": 0.010105459539294691, + "grad_norm": 0.306640625, + "grad_norm_var": 0.0009125868479410807, + "learning_rate": 0.01, + "loss": 1.3622, + "loss/crossentropy": 3.012826681137085, + "loss/fcd": 1.21484375, + "loss/logits": 0.255868136882782, + "step": 585 + }, + { + "epoch": 0.010122733829105451, + "grad_norm": 0.29296875, + "grad_norm_var": 0.0009113947550455729, + "learning_rate": 0.01, + "loss": 1.4032, + "loss/crossentropy": 2.7537986040115356, + "loss/fcd": 1.16015625, + "loss/logits": 0.25436051189899445, + "step": 586 + }, + { + "epoch": 0.010140008118916211, + "grad_norm": 0.251953125, + "grad_norm_var": 0.001083230972290039, + "learning_rate": 0.01, + "loss": 1.3117, + "loss/crossentropy": 2.14433491230011, + "loss/fcd": 1.10546875, + "loss/logits": 0.24180641025304794, + "step": 587 + }, + { + "epoch": 0.01015728240872697, + "grad_norm": 0.296875, + "grad_norm_var": 0.000574493408203125, + "learning_rate": 0.01, + "loss": 1.3691, + "loss/crossentropy": 2.101401686668396, + "loss/fcd": 1.1640625, + "loss/logits": 0.19958080351352692, + "step": 588 + }, + { + "epoch": 0.01017455669853773, + "grad_norm": 0.2890625, + "grad_norm_var": 0.00040378570556640623, + "learning_rate": 0.01, + "loss": 1.3827, + "loss/crossentropy": 2.436479330062866, + "loss/fcd": 1.09765625, + "loss/logits": 0.23494569957256317, + "step": 589 + }, + { + "epoch": 0.010191830988348492, + "grad_norm": 0.318359375, + "grad_norm_var": 0.00043892860412597656, + "learning_rate": 0.01, + "loss": 1.4279, + "loss/crossentropy": 2.6805481910705566, + "loss/fcd": 1.12890625, + "loss/logits": 0.2272372618317604, + "step": 590 + }, + { + "epoch": 0.010209105278159252, + "grad_norm": 0.263671875, + "grad_norm_var": 0.00048065185546875, + "learning_rate": 0.01, + "loss": 1.322, + "loss/crossentropy": 2.7796462774276733, + "loss/fcd": 1.1796875, + "loss/logits": 0.26299113035202026, + "step": 591 + }, + { + "epoch": 0.010226379567970012, + "grad_norm": 0.3046875, + "grad_norm_var": 0.000478363037109375, + "learning_rate": 0.01, + "loss": 1.3396, + "loss/crossentropy": 2.4198944568634033, + "loss/fcd": 1.1171875, + "loss/logits": 0.2373996302485466, + "step": 592 + }, + { + "epoch": 0.010243653857780772, + "grad_norm": 0.310546875, + "grad_norm_var": 0.00046634674072265625, + "learning_rate": 0.01, + "loss": 1.3618, + "loss/crossentropy": 2.5916903018951416, + "loss/fcd": 1.10546875, + "loss/logits": 0.23520419746637344, + "step": 593 + }, + { + "epoch": 0.010260928147591532, + "grad_norm": 0.291015625, + "grad_norm_var": 0.0004646142323811849, + "learning_rate": 0.01, + "loss": 1.4032, + "loss/crossentropy": 2.2067846059799194, + "loss/fcd": 1.05078125, + "loss/logits": 0.2392275035381317, + "step": 594 + }, + { + "epoch": 0.010278202437402292, + "grad_norm": 0.29296875, + "grad_norm_var": 0.0004112084706624349, + "learning_rate": 0.01, + "loss": 1.3551, + "loss/crossentropy": 2.5146957635879517, + "loss/fcd": 1.1484375, + "loss/logits": 0.2572908252477646, + "step": 595 + }, + { + "epoch": 0.010295476727213053, + "grad_norm": 0.314453125, + "grad_norm_var": 0.0004157861073811849, + "learning_rate": 0.01, + "loss": 1.4091, + "loss/crossentropy": 2.7353230714797974, + "loss/fcd": 1.1953125, + "loss/logits": 0.2845850735902786, + "step": 596 + }, + { + "epoch": 0.010312751017023813, + "grad_norm": 0.3125, + "grad_norm_var": 0.00041147867838541664, + "learning_rate": 0.01, + "loss": 1.4371, + "loss/crossentropy": 2.290863871574402, + "loss/fcd": 1.1171875, + "loss/logits": 0.25596096366643906, + "step": 597 + }, + { + "epoch": 0.010330025306834573, + "grad_norm": 0.3046875, + "grad_norm_var": 0.0003811995188395182, + "learning_rate": 0.01, + "loss": 1.3736, + "loss/crossentropy": 2.4351121187210083, + "loss/fcd": 1.15625, + "loss/logits": 0.2633766904473305, + "step": 598 + }, + { + "epoch": 0.010347299596645333, + "grad_norm": 0.29296875, + "grad_norm_var": 0.000312042236328125, + "learning_rate": 0.01, + "loss": 1.3671, + "loss/crossentropy": 2.3196725845336914, + "loss/fcd": 1.0859375, + "loss/logits": 0.23156649619340897, + "step": 599 + }, + { + "epoch": 0.010364573886456093, + "grad_norm": 0.267578125, + "grad_norm_var": 0.00036290486653645836, + "learning_rate": 0.01, + "loss": 1.3352, + "loss/crossentropy": 2.0654172897338867, + "loss/fcd": 1.0390625, + "loss/logits": 0.23978617042303085, + "step": 600 + }, + { + "epoch": 0.010381848176266853, + "grad_norm": 0.29296875, + "grad_norm_var": 0.00035233497619628905, + "learning_rate": 0.01, + "loss": 1.3716, + "loss/crossentropy": 2.0811039805412292, + "loss/fcd": 1.12109375, + "loss/logits": 0.2653958946466446, + "step": 601 + }, + { + "epoch": 0.010399122466077614, + "grad_norm": 0.2890625, + "grad_norm_var": 0.0003536065419514974, + "learning_rate": 0.01, + "loss": 1.3644, + "loss/crossentropy": 2.7797833681106567, + "loss/fcd": 1.2421875, + "loss/logits": 0.26879242062568665, + "step": 602 + }, + { + "epoch": 0.010416396755888374, + "grad_norm": 0.263671875, + "grad_norm_var": 0.0002975304921468099, + "learning_rate": 0.01, + "loss": 1.3323, + "loss/crossentropy": 2.2734681367874146, + "loss/fcd": 1.0625, + "loss/logits": 0.21455278247594833, + "step": 603 + }, + { + "epoch": 0.010433671045699134, + "grad_norm": 0.296875, + "grad_norm_var": 0.0002975304921468099, + "learning_rate": 0.01, + "loss": 1.3207, + "loss/crossentropy": 1.978046715259552, + "loss/fcd": 1.03515625, + "loss/logits": 0.23233170062303543, + "step": 604 + }, + { + "epoch": 0.010450945335509894, + "grad_norm": 0.306640625, + "grad_norm_var": 0.0003051122029622396, + "learning_rate": 0.01, + "loss": 1.4169, + "loss/crossentropy": 2.5054962635040283, + "loss/fcd": 1.2265625, + "loss/logits": 0.2957670986652374, + "step": 605 + }, + { + "epoch": 0.010468219625320654, + "grad_norm": 0.359375, + "grad_norm_var": 0.0005370934804280598, + "learning_rate": 0.01, + "loss": 1.4294, + "loss/crossentropy": 2.5767931938171387, + "loss/fcd": 1.18359375, + "loss/logits": 0.2684077024459839, + "step": 606 + }, + { + "epoch": 0.010485493915131414, + "grad_norm": 0.3203125, + "grad_norm_var": 0.00048039754231770835, + "learning_rate": 0.01, + "loss": 1.37, + "loss/crossentropy": 2.3274868726730347, + "loss/fcd": 1.0546875, + "loss/logits": 0.23180848360061646, + "step": 607 + }, + { + "epoch": 0.010502768204942173, + "grad_norm": 0.318359375, + "grad_norm_var": 0.0004983107248942057, + "learning_rate": 0.01, + "loss": 1.416, + "loss/crossentropy": 2.5422879457473755, + "loss/fcd": 1.0390625, + "loss/logits": 0.223361574113369, + "step": 608 + }, + { + "epoch": 0.010520042494752935, + "grad_norm": 0.328125, + "grad_norm_var": 0.0005373636881510417, + "learning_rate": 0.01, + "loss": 1.3627, + "loss/crossentropy": 2.570125699043274, + "loss/fcd": 1.125, + "loss/logits": 0.25247204303741455, + "step": 609 + }, + { + "epoch": 0.010537316784563695, + "grad_norm": 0.265625, + "grad_norm_var": 0.0006189823150634765, + "learning_rate": 0.01, + "loss": 1.316, + "loss/crossentropy": 2.2968589067459106, + "loss/fcd": 1.015625, + "loss/logits": 0.1994389146566391, + "step": 610 + }, + { + "epoch": 0.010554591074374455, + "grad_norm": 0.3046875, + "grad_norm_var": 0.0006140232086181641, + "learning_rate": 0.01, + "loss": 1.4265, + "loss/crossentropy": 2.493618369102478, + "loss/fcd": 1.15625, + "loss/logits": 0.2581065893173218, + "step": 611 + }, + { + "epoch": 0.010571865364185215, + "grad_norm": 0.3046875, + "grad_norm_var": 0.000604248046875, + "learning_rate": 0.01, + "loss": 1.4014, + "loss/crossentropy": 2.4227527379989624, + "loss/fcd": 1.19140625, + "loss/logits": 0.25313572585582733, + "step": 612 + }, + { + "epoch": 0.010589139653995975, + "grad_norm": 0.296875, + "grad_norm_var": 0.0005971272786458333, + "learning_rate": 0.01, + "loss": 1.3718, + "loss/crossentropy": 2.3819390535354614, + "loss/fcd": 1.06640625, + "loss/logits": 0.22010967135429382, + "step": 613 + }, + { + "epoch": 0.010606413943806734, + "grad_norm": 0.294921875, + "grad_norm_var": 0.000598001480102539, + "learning_rate": 0.01, + "loss": 1.5079, + "loss/crossentropy": 2.190422534942627, + "loss/fcd": 1.08984375, + "loss/logits": 0.24383512139320374, + "step": 614 + }, + { + "epoch": 0.010623688233617496, + "grad_norm": 0.29296875, + "grad_norm_var": 0.000598001480102539, + "learning_rate": 0.01, + "loss": 1.3733, + "loss/crossentropy": 2.5865895748138428, + "loss/fcd": 1.10546875, + "loss/logits": 0.24275009334087372, + "step": 615 + }, + { + "epoch": 0.010640962523428256, + "grad_norm": 0.2890625, + "grad_norm_var": 0.0005334854125976562, + "learning_rate": 0.01, + "loss": 1.3404, + "loss/crossentropy": 2.1975014209747314, + "loss/fcd": 1.046875, + "loss/logits": 0.2261335551738739, + "step": 616 + }, + { + "epoch": 0.010658236813239016, + "grad_norm": 0.33203125, + "grad_norm_var": 0.0005843480428059896, + "learning_rate": 0.01, + "loss": 1.4037, + "loss/crossentropy": 2.7723870277404785, + "loss/fcd": 1.234375, + "loss/logits": 0.2835993468761444, + "step": 617 + }, + { + "epoch": 0.010675511103049776, + "grad_norm": 0.287109375, + "grad_norm_var": 0.0005884647369384765, + "learning_rate": 0.01, + "loss": 1.3625, + "loss/crossentropy": 2.599759817123413, + "loss/fcd": 1.1953125, + "loss/logits": 0.285232275724411, + "step": 618 + }, + { + "epoch": 0.010692785392860536, + "grad_norm": 0.294921875, + "grad_norm_var": 0.0004821618398030599, + "learning_rate": 0.01, + "loss": 1.3733, + "loss/crossentropy": 2.4128291606903076, + "loss/fcd": 1.1796875, + "loss/logits": 0.26694832742214203, + "step": 619 + }, + { + "epoch": 0.010710059682671295, + "grad_norm": 0.306640625, + "grad_norm_var": 0.00047651926676432293, + "learning_rate": 0.01, + "loss": 1.3343, + "loss/crossentropy": 2.5237722396850586, + "loss/fcd": 1.1796875, + "loss/logits": 0.26433800160884857, + "step": 620 + }, + { + "epoch": 0.010727333972482057, + "grad_norm": 0.337890625, + "grad_norm_var": 0.0005385716756184896, + "learning_rate": 0.01, + "loss": 1.4112, + "loss/crossentropy": 2.317731261253357, + "loss/fcd": 1.2265625, + "loss/logits": 0.28476743400096893, + "step": 621 + }, + { + "epoch": 0.010744608262292817, + "grad_norm": 0.271484375, + "grad_norm_var": 0.0004234155019124349, + "learning_rate": 0.01, + "loss": 1.3603, + "loss/crossentropy": 2.3109618425369263, + "loss/fcd": 1.041015625, + "loss/logits": 0.2279675453901291, + "step": 622 + }, + { + "epoch": 0.010761882552103577, + "grad_norm": 0.275390625, + "grad_norm_var": 0.00044498443603515627, + "learning_rate": 0.01, + "loss": 1.3089, + "loss/crossentropy": 2.3984739780426025, + "loss/fcd": 1.09765625, + "loss/logits": 0.25493185222148895, + "step": 623 + }, + { + "epoch": 0.010779156841914337, + "grad_norm": 0.3046875, + "grad_norm_var": 0.0004232883453369141, + "learning_rate": 0.01, + "loss": 1.4079, + "loss/crossentropy": 2.1802881956100464, + "loss/fcd": 1.0703125, + "loss/logits": 0.23454807698726654, + "step": 624 + }, + { + "epoch": 0.010796431131725097, + "grad_norm": 0.326171875, + "grad_norm_var": 0.00041599273681640624, + "learning_rate": 0.01, + "loss": 1.3629, + "loss/crossentropy": 2.6050442457199097, + "loss/fcd": 1.0703125, + "loss/logits": 0.2245146408677101, + "step": 625 + }, + { + "epoch": 0.010813705421535856, + "grad_norm": 0.279296875, + "grad_norm_var": 0.0003667036692301432, + "learning_rate": 0.01, + "loss": 1.3622, + "loss/crossentropy": 2.4274967908859253, + "loss/fcd": 1.140625, + "loss/logits": 0.2685912102460861, + "step": 626 + }, + { + "epoch": 0.010830979711346618, + "grad_norm": 0.3125, + "grad_norm_var": 0.0003754774729410807, + "learning_rate": 0.01, + "loss": 1.4161, + "loss/crossentropy": 2.556549072265625, + "loss/fcd": 1.109375, + "loss/logits": 0.2520214840769768, + "step": 627 + }, + { + "epoch": 0.010848254001157378, + "grad_norm": 0.318359375, + "grad_norm_var": 0.0003949483235677083, + "learning_rate": 0.01, + "loss": 1.3802, + "loss/crossentropy": 2.2824164628982544, + "loss/fcd": 1.046875, + "loss/logits": 0.22343048453330994, + "step": 628 + }, + { + "epoch": 0.010865528290968138, + "grad_norm": 0.283203125, + "grad_norm_var": 0.0004146416982014974, + "learning_rate": 0.01, + "loss": 1.3555, + "loss/crossentropy": 2.500080108642578, + "loss/fcd": 1.0703125, + "loss/logits": 0.24835523962974548, + "step": 629 + }, + { + "epoch": 0.010882802580778898, + "grad_norm": 0.30859375, + "grad_norm_var": 0.00041631062825520836, + "learning_rate": 0.01, + "loss": 1.4, + "loss/crossentropy": 2.4014720916748047, + "loss/fcd": 1.140625, + "loss/logits": 0.236750990152359, + "step": 630 + }, + { + "epoch": 0.010900076870589658, + "grad_norm": 0.306640625, + "grad_norm_var": 0.00041286150614420575, + "learning_rate": 0.01, + "loss": 1.3707, + "loss/crossentropy": 2.3228918313980103, + "loss/fcd": 1.078125, + "loss/logits": 0.23406407982110977, + "step": 631 + }, + { + "epoch": 0.010917351160400417, + "grad_norm": 0.298828125, + "grad_norm_var": 0.0004018147786458333, + "learning_rate": 0.01, + "loss": 1.3885, + "loss/crossentropy": 2.50198233127594, + "loss/fcd": 1.1875, + "loss/logits": 0.258284330368042, + "step": 632 + }, + { + "epoch": 0.010934625450211179, + "grad_norm": 0.314453125, + "grad_norm_var": 0.0003524621327718099, + "learning_rate": 0.01, + "loss": 1.3978, + "loss/crossentropy": 2.637346863746643, + "loss/fcd": 1.15234375, + "loss/logits": 0.28542736172676086, + "step": 633 + }, + { + "epoch": 0.010951899740021939, + "grad_norm": 0.30859375, + "grad_norm_var": 0.00033969879150390624, + "learning_rate": 0.01, + "loss": 1.4403, + "loss/crossentropy": 2.4110260009765625, + "loss/fcd": 1.171875, + "loss/logits": 0.2651347145438194, + "step": 634 + }, + { + "epoch": 0.010969174029832699, + "grad_norm": 0.283203125, + "grad_norm_var": 0.000360870361328125, + "learning_rate": 0.01, + "loss": 1.4184, + "loss/crossentropy": 2.7041887044906616, + "loss/fcd": 1.171875, + "loss/logits": 0.2508121207356453, + "step": 635 + }, + { + "epoch": 0.010986448319643459, + "grad_norm": 0.28515625, + "grad_norm_var": 0.00037713050842285155, + "learning_rate": 0.01, + "loss": 1.401, + "loss/crossentropy": 2.4663859605789185, + "loss/fcd": 1.14453125, + "loss/logits": 0.2824552655220032, + "step": 636 + }, + { + "epoch": 0.011003722609454218, + "grad_norm": 0.326171875, + "grad_norm_var": 0.0003279209136962891, + "learning_rate": 0.01, + "loss": 1.3728, + "loss/crossentropy": 2.5915483236312866, + "loss/fcd": 1.11328125, + "loss/logits": 0.24787750095129013, + "step": 637 + }, + { + "epoch": 0.011020996899264978, + "grad_norm": 0.28125, + "grad_norm_var": 0.00029652913411458334, + "learning_rate": 0.01, + "loss": 1.3354, + "loss/crossentropy": 2.5775671005249023, + "loss/fcd": 1.11328125, + "loss/logits": 0.26823610067367554, + "step": 638 + }, + { + "epoch": 0.01103827118907574, + "grad_norm": 0.298828125, + "grad_norm_var": 0.00025151570638020835, + "learning_rate": 0.01, + "loss": 1.3522, + "loss/crossentropy": 2.3462886810302734, + "loss/fcd": 1.06640625, + "loss/logits": 0.23827192932367325, + "step": 639 + }, + { + "epoch": 0.0110555454788865, + "grad_norm": 0.3046875, + "grad_norm_var": 0.00025151570638020835, + "learning_rate": 0.01, + "loss": 1.4206, + "loss/crossentropy": 2.2796329855918884, + "loss/fcd": 1.09765625, + "loss/logits": 0.24281439930200577, + "step": 640 + }, + { + "epoch": 0.01107281976869726, + "grad_norm": 0.302734375, + "grad_norm_var": 0.00021107991536458334, + "learning_rate": 0.01, + "loss": 1.4093, + "loss/crossentropy": 2.2618002891540527, + "loss/fcd": 1.10546875, + "loss/logits": 0.24219272285699844, + "step": 641 + }, + { + "epoch": 0.01109009405850802, + "grad_norm": 0.515625, + "grad_norm_var": 0.0030247847239176433, + "learning_rate": 0.01, + "loss": 1.5002, + "loss/crossentropy": 2.628837466239929, + "loss/fcd": 1.1796875, + "loss/logits": 0.27036982774734497, + "step": 642 + }, + { + "epoch": 0.01110736834831878, + "grad_norm": 0.298828125, + "grad_norm_var": 0.0030420303344726564, + "learning_rate": 0.01, + "loss": 1.3437, + "loss/crossentropy": 2.377197504043579, + "loss/fcd": 1.078125, + "loss/logits": 0.2347392812371254, + "step": 643 + }, + { + "epoch": 0.01112464263812954, + "grad_norm": 0.296875, + "grad_norm_var": 0.0030603885650634767, + "learning_rate": 0.01, + "loss": 1.3465, + "loss/crossentropy": 2.241411805152893, + "loss/fcd": 1.04296875, + "loss/logits": 0.22135238349437714, + "step": 644 + }, + { + "epoch": 0.0111419169279403, + "grad_norm": 0.287109375, + "grad_norm_var": 0.00304563840230306, + "learning_rate": 0.01, + "loss": 1.3781, + "loss/crossentropy": 2.132224917411804, + "loss/fcd": 1.06640625, + "loss/logits": 0.24958615005016327, + "step": 645 + }, + { + "epoch": 0.01115919121775106, + "grad_norm": 0.294921875, + "grad_norm_var": 0.0030664443969726563, + "learning_rate": 0.01, + "loss": 1.3319, + "loss/crossentropy": 2.379546046257019, + "loss/fcd": 1.0703125, + "loss/logits": 0.23225411772727966, + "step": 646 + }, + { + "epoch": 0.01117646550756182, + "grad_norm": 0.283203125, + "grad_norm_var": 0.0031198501586914063, + "learning_rate": 0.01, + "loss": 1.3696, + "loss/crossentropy": 2.4151222705841064, + "loss/fcd": 1.08984375, + "loss/logits": 0.25382500886917114, + "step": 647 + }, + { + "epoch": 0.01119373979737258, + "grad_norm": 0.298828125, + "grad_norm_var": 0.0031198501586914063, + "learning_rate": 0.01, + "loss": 1.3725, + "loss/crossentropy": 2.4386223554611206, + "loss/fcd": 1.11328125, + "loss/logits": 0.24172081053256989, + "step": 648 + }, + { + "epoch": 0.01121101408718334, + "grad_norm": 0.294921875, + "grad_norm_var": 0.003135426839192708, + "learning_rate": 0.01, + "loss": 1.4136, + "loss/crossentropy": 2.4053245782852173, + "loss/fcd": 1.10546875, + "loss/logits": 0.2587142735719681, + "step": 649 + }, + { + "epoch": 0.0112282883769941, + "grad_norm": 0.310546875, + "grad_norm_var": 0.0031352837880452475, + "learning_rate": 0.01, + "loss": 1.3908, + "loss/crossentropy": 2.8473496437072754, + "loss/fcd": 1.203125, + "loss/logits": 0.24620139598846436, + "step": 650 + }, + { + "epoch": 0.01124556266680486, + "grad_norm": 0.28515625, + "grad_norm_var": 0.003128496805826823, + "learning_rate": 0.01, + "loss": 1.346, + "loss/crossentropy": 2.4264625310897827, + "loss/fcd": 1.04296875, + "loss/logits": 0.22718993574380875, + "step": 651 + }, + { + "epoch": 0.011262836956615622, + "grad_norm": 0.302734375, + "grad_norm_var": 0.003088871637980143, + "learning_rate": 0.01, + "loss": 1.3722, + "loss/crossentropy": 2.393033504486084, + "loss/fcd": 1.09375, + "loss/logits": 0.2361084669828415, + "step": 652 + }, + { + "epoch": 0.011280111246426382, + "grad_norm": 0.291015625, + "grad_norm_var": 0.0030968825022379557, + "learning_rate": 0.01, + "loss": 1.3962, + "loss/crossentropy": 2.5740654468536377, + "loss/fcd": 1.1328125, + "loss/logits": 0.27814269065856934, + "step": 653 + }, + { + "epoch": 0.011297385536237142, + "grad_norm": 0.291015625, + "grad_norm_var": 0.0030664443969726563, + "learning_rate": 0.01, + "loss": 1.3502, + "loss/crossentropy": 2.572822332382202, + "loss/fcd": 1.109375, + "loss/logits": 0.25307735800743103, + "step": 654 + }, + { + "epoch": 0.011314659826047901, + "grad_norm": 0.302734375, + "grad_norm_var": 0.003061676025390625, + "learning_rate": 0.01, + "loss": 1.3652, + "loss/crossentropy": 2.36893892288208, + "loss/fcd": 1.12109375, + "loss/logits": 0.24310748279094696, + "step": 655 + }, + { + "epoch": 0.011331934115858661, + "grad_norm": 0.28515625, + "grad_norm_var": 0.003099505106608073, + "learning_rate": 0.01, + "loss": 1.382, + "loss/crossentropy": 2.453968048095703, + "loss/fcd": 1.12109375, + "loss/logits": 0.2507154792547226, + "step": 656 + }, + { + "epoch": 0.011349208405669421, + "grad_norm": 0.30859375, + "grad_norm_var": 0.0030968825022379557, + "learning_rate": 0.01, + "loss": 1.4208, + "loss/crossentropy": 2.3706772327423096, + "loss/fcd": 1.203125, + "loss/logits": 0.2801935374736786, + "step": 657 + }, + { + "epoch": 0.011366482695480183, + "grad_norm": 0.28515625, + "grad_norm_var": 7.348060607910156e-05, + "learning_rate": 0.01, + "loss": 1.3745, + "loss/crossentropy": 2.3052316308021545, + "loss/fcd": 1.125, + "loss/logits": 0.24023611843585968, + "step": 658 + }, + { + "epoch": 0.011383756985290943, + "grad_norm": 0.263671875, + "grad_norm_var": 0.00013184547424316406, + "learning_rate": 0.01, + "loss": 1.3589, + "loss/crossentropy": 2.3989150524139404, + "loss/fcd": 1.08984375, + "loss/logits": 0.23345524072647095, + "step": 659 + }, + { + "epoch": 0.011401031275101703, + "grad_norm": 0.326171875, + "grad_norm_var": 0.000202178955078125, + "learning_rate": 0.01, + "loss": 1.4671, + "loss/crossentropy": 2.4908188581466675, + "loss/fcd": 1.08203125, + "loss/logits": 0.22981490939855576, + "step": 660 + }, + { + "epoch": 0.011418305564912462, + "grad_norm": 0.318359375, + "grad_norm_var": 0.000232696533203125, + "learning_rate": 0.01, + "loss": 1.3845, + "loss/crossentropy": 2.182092070579529, + "loss/fcd": 1.0390625, + "loss/logits": 0.22433090209960938, + "step": 661 + }, + { + "epoch": 0.011435579854723222, + "grad_norm": 0.29296875, + "grad_norm_var": 0.00023331642150878907, + "learning_rate": 0.01, + "loss": 1.3832, + "loss/crossentropy": 2.557218909263611, + "loss/fcd": 1.15234375, + "loss/logits": 0.26849667727947235, + "step": 662 + }, + { + "epoch": 0.011452854144533982, + "grad_norm": 0.26953125, + "grad_norm_var": 0.0002688090006510417, + "learning_rate": 0.01, + "loss": 1.3516, + "loss/crossentropy": 2.4368367195129395, + "loss/fcd": 1.09765625, + "loss/logits": 0.2600485235452652, + "step": 663 + }, + { + "epoch": 0.011470128434344744, + "grad_norm": 0.2734375, + "grad_norm_var": 0.0002975304921468099, + "learning_rate": 0.01, + "loss": 1.3436, + "loss/crossentropy": 2.283419609069824, + "loss/fcd": 1.10546875, + "loss/logits": 0.2451685667037964, + "step": 664 + }, + { + "epoch": 0.011487402724155504, + "grad_norm": 0.31640625, + "grad_norm_var": 0.0003295262654622396, + "learning_rate": 0.01, + "loss": 1.3917, + "loss/crossentropy": 2.2501282691955566, + "loss/fcd": 1.10546875, + "loss/logits": 0.23817522078752518, + "step": 665 + }, + { + "epoch": 0.011504677013966264, + "grad_norm": 0.32421875, + "grad_norm_var": 0.0003692468007405599, + "learning_rate": 0.01, + "loss": 1.3747, + "loss/crossentropy": 2.595417618751526, + "loss/fcd": 1.109375, + "loss/logits": 0.272259384393692, + "step": 666 + }, + { + "epoch": 0.011521951303777023, + "grad_norm": 0.263671875, + "grad_norm_var": 0.0004292170206705729, + "learning_rate": 0.01, + "loss": 1.3477, + "loss/crossentropy": 2.3635072708129883, + "loss/fcd": 1.09765625, + "loss/logits": 0.24695640057325363, + "step": 667 + }, + { + "epoch": 0.011539225593587783, + "grad_norm": 0.29296875, + "grad_norm_var": 0.0004246870676676432, + "learning_rate": 0.01, + "loss": 1.3744, + "loss/crossentropy": 2.310747981071472, + "loss/fcd": 1.08984375, + "loss/logits": 0.2579839900135994, + "step": 668 + }, + { + "epoch": 0.011556499883398543, + "grad_norm": 0.27734375, + "grad_norm_var": 0.00044193267822265623, + "learning_rate": 0.01, + "loss": 1.349, + "loss/crossentropy": 2.497538447380066, + "loss/fcd": 1.1328125, + "loss/logits": 0.26720890402793884, + "step": 669 + }, + { + "epoch": 0.011573774173209305, + "grad_norm": 0.3046875, + "grad_norm_var": 0.00044960975646972655, + "learning_rate": 0.01, + "loss": 1.3475, + "loss/crossentropy": 2.5883569717407227, + "loss/fcd": 1.2109375, + "loss/logits": 0.29579465091228485, + "step": 670 + }, + { + "epoch": 0.011591048463020065, + "grad_norm": 0.263671875, + "grad_norm_var": 0.0004998366038004557, + "learning_rate": 0.01, + "loss": 1.3349, + "loss/crossentropy": 2.2982797622680664, + "loss/fcd": 1.1328125, + "loss/logits": 0.22655323147773743, + "step": 671 + }, + { + "epoch": 0.011608322752830825, + "grad_norm": 0.359375, + "grad_norm_var": 0.0007800896962483724, + "learning_rate": 0.01, + "loss": 1.3753, + "loss/crossentropy": 2.4650286436080933, + "loss/fcd": 1.09765625, + "loss/logits": 0.24685797840356827, + "step": 672 + }, + { + "epoch": 0.011625597042641584, + "grad_norm": 0.27734375, + "grad_norm_var": 0.0007897535959879557, + "learning_rate": 0.01, + "loss": 1.3429, + "loss/crossentropy": 2.5849392414093018, + "loss/fcd": 1.140625, + "loss/logits": 0.2600446939468384, + "step": 673 + }, + { + "epoch": 0.011642871332452344, + "grad_norm": 0.28125, + "grad_norm_var": 0.0007954756418863932, + "learning_rate": 0.01, + "loss": 1.3721, + "loss/crossentropy": 2.4149436950683594, + "loss/fcd": 1.08984375, + "loss/logits": 0.24952851235866547, + "step": 674 + }, + { + "epoch": 0.011660145622263104, + "grad_norm": 0.34765625, + "grad_norm_var": 0.0008959452311197917, + "learning_rate": 0.01, + "loss": 1.4752, + "loss/crossentropy": 2.582419753074646, + "loss/fcd": 1.2578125, + "loss/logits": 0.2812621593475342, + "step": 675 + }, + { + "epoch": 0.011677419912073864, + "grad_norm": 0.26171875, + "grad_norm_var": 0.0009247938791910808, + "learning_rate": 0.01, + "loss": 1.3666, + "loss/crossentropy": 2.3817840814590454, + "loss/fcd": 1.078125, + "loss/logits": 0.24483423680067062, + "step": 676 + }, + { + "epoch": 0.011694694201884626, + "grad_norm": 0.283203125, + "grad_norm_var": 0.0008938948313395183, + "learning_rate": 0.01, + "loss": 1.3642, + "loss/crossentropy": 2.4791339635849, + "loss/fcd": 1.078125, + "loss/logits": 0.25220367312431335, + "step": 677 + }, + { + "epoch": 0.011711968491695385, + "grad_norm": 0.271484375, + "grad_norm_var": 0.0009230931599934895, + "learning_rate": 0.01, + "loss": 1.3435, + "loss/crossentropy": 2.3865939378738403, + "loss/fcd": 1.09375, + "loss/logits": 0.24416129291057587, + "step": 678 + }, + { + "epoch": 0.011729242781506145, + "grad_norm": 0.31640625, + "grad_norm_var": 0.0009215672810872396, + "learning_rate": 0.01, + "loss": 1.4158, + "loss/crossentropy": 2.514981508255005, + "loss/fcd": 1.15625, + "loss/logits": 0.27227045595645905, + "step": 679 + }, + { + "epoch": 0.011746517071316905, + "grad_norm": 0.30859375, + "grad_norm_var": 0.0008992513020833333, + "learning_rate": 0.01, + "loss": 1.399, + "loss/crossentropy": 2.660152792930603, + "loss/fcd": 1.11328125, + "loss/logits": 0.2607909142971039, + "step": 680 + }, + { + "epoch": 0.011763791361127665, + "grad_norm": 0.296875, + "grad_norm_var": 0.0008722305297851563, + "learning_rate": 0.01, + "loss": 1.4528, + "loss/crossentropy": 2.165284812450409, + "loss/fcd": 1.07421875, + "loss/logits": 0.2606969401240349, + "step": 681 + }, + { + "epoch": 0.011781065650938425, + "grad_norm": 0.330078125, + "grad_norm_var": 0.0008966922760009766, + "learning_rate": 0.01, + "loss": 1.4402, + "loss/crossentropy": 2.719216465950012, + "loss/fcd": 1.21875, + "loss/logits": 0.274374857544899, + "step": 682 + }, + { + "epoch": 0.011798339940749187, + "grad_norm": 0.357421875, + "grad_norm_var": 0.0010416507720947266, + "learning_rate": 0.01, + "loss": 1.4226, + "loss/crossentropy": 2.405388355255127, + "loss/fcd": 1.1640625, + "loss/logits": 0.2703537493944168, + "step": 683 + }, + { + "epoch": 0.011815614230559946, + "grad_norm": 0.310546875, + "grad_norm_var": 0.0010400772094726562, + "learning_rate": 0.01, + "loss": 1.4291, + "loss/crossentropy": 2.7011595964431763, + "loss/fcd": 1.1875, + "loss/logits": 0.25208880007267, + "step": 684 + }, + { + "epoch": 0.011832888520370706, + "grad_norm": 0.37109375, + "grad_norm_var": 0.0012689590454101562, + "learning_rate": 0.01, + "loss": 1.3541, + "loss/crossentropy": 2.5975828170776367, + "loss/fcd": 1.1015625, + "loss/logits": 0.23054596036672592, + "step": 685 + }, + { + "epoch": 0.011850162810181466, + "grad_norm": 0.302734375, + "grad_norm_var": 0.001270278294881185, + "learning_rate": 0.01, + "loss": 1.3724, + "loss/crossentropy": 2.202287197113037, + "loss/fcd": 1.0625, + "loss/logits": 0.24445781856775284, + "step": 686 + }, + { + "epoch": 0.011867437099992226, + "grad_norm": 0.314453125, + "grad_norm_var": 0.0011264642079671225, + "learning_rate": 0.01, + "loss": 1.3371, + "loss/crossentropy": 2.309388518333435, + "loss/fcd": 1.11328125, + "loss/logits": 0.2442098781466484, + "step": 687 + }, + { + "epoch": 0.011884711389802986, + "grad_norm": 0.294921875, + "grad_norm_var": 0.0009780248006184896, + "learning_rate": 0.01, + "loss": 1.3841, + "loss/crossentropy": 2.499300003051758, + "loss/fcd": 1.11328125, + "loss/logits": 0.26171083748340607, + "step": 688 + }, + { + "epoch": 0.011901985679613748, + "grad_norm": 0.333984375, + "grad_norm_var": 0.0009480635325113932, + "learning_rate": 0.01, + "loss": 1.4675, + "loss/crossentropy": 2.35269558429718, + "loss/fcd": 1.06640625, + "loss/logits": 0.2726883888244629, + "step": 689 + }, + { + "epoch": 0.011919259969424507, + "grad_norm": 0.333984375, + "grad_norm_var": 0.000909868876139323, + "learning_rate": 0.01, + "loss": 1.403, + "loss/crossentropy": 2.78786039352417, + "loss/fcd": 1.25390625, + "loss/logits": 0.3147393763065338, + "step": 690 + }, + { + "epoch": 0.011936534259235267, + "grad_norm": 0.275390625, + "grad_norm_var": 0.0009186903635660808, + "learning_rate": 0.01, + "loss": 1.3294, + "loss/crossentropy": 2.0689194798469543, + "loss/fcd": 1.03515625, + "loss/logits": 0.234086312353611, + "step": 691 + }, + { + "epoch": 0.011953808549046027, + "grad_norm": 0.29296875, + "grad_norm_var": 0.0007778008778889974, + "learning_rate": 0.01, + "loss": 1.3331, + "loss/crossentropy": 2.290665626525879, + "loss/fcd": 1.046875, + "loss/logits": 0.23476862162351608, + "step": 692 + }, + { + "epoch": 0.011971082838856787, + "grad_norm": 0.26953125, + "grad_norm_var": 0.0008422215779622396, + "learning_rate": 0.01, + "loss": 1.3703, + "loss/crossentropy": 2.4959352016448975, + "loss/fcd": 1.16015625, + "loss/logits": 0.2350049912929535, + "step": 693 + }, + { + "epoch": 0.011988357128667547, + "grad_norm": 0.291015625, + "grad_norm_var": 0.0007624308268229167, + "learning_rate": 0.01, + "loss": 1.3499, + "loss/crossentropy": 2.3858295679092407, + "loss/fcd": 1.0546875, + "loss/logits": 0.2346876710653305, + "step": 694 + }, + { + "epoch": 0.012005631418478309, + "grad_norm": 0.32421875, + "grad_norm_var": 0.0007703145345052083, + "learning_rate": 0.01, + "loss": 1.4174, + "loss/crossentropy": 2.5176814794540405, + "loss/fcd": 1.13671875, + "loss/logits": 0.2492477372288704, + "step": 695 + }, + { + "epoch": 0.012022905708289068, + "grad_norm": 0.365234375, + "grad_norm_var": 0.0009376366933186848, + "learning_rate": 0.01, + "loss": 1.4472, + "loss/crossentropy": 2.553426146507263, + "loss/fcd": 1.13671875, + "loss/logits": 0.25825950503349304, + "step": 696 + }, + { + "epoch": 0.012040179998099828, + "grad_norm": 0.3125, + "grad_norm_var": 0.0009119510650634766, + "learning_rate": 0.01, + "loss": 1.3902, + "loss/crossentropy": 2.524499535560608, + "loss/fcd": 1.17578125, + "loss/logits": 0.2615286335349083, + "step": 697 + }, + { + "epoch": 0.012057454287910588, + "grad_norm": 0.271484375, + "grad_norm_var": 0.001028299331665039, + "learning_rate": 0.01, + "loss": 1.3468, + "loss/crossentropy": 2.234209656715393, + "loss/fcd": 1.109375, + "loss/logits": 0.2630993127822876, + "step": 698 + }, + { + "epoch": 0.012074728577721348, + "grad_norm": 0.275390625, + "grad_norm_var": 0.0009722232818603516, + "learning_rate": 0.01, + "loss": 1.3397, + "loss/crossentropy": 2.595862627029419, + "loss/fcd": 1.09375, + "loss/logits": 0.25720856338739395, + "step": 699 + }, + { + "epoch": 0.012092002867532108, + "grad_norm": 0.306640625, + "grad_norm_var": 0.0009722232818603516, + "learning_rate": 0.01, + "loss": 1.3472, + "loss/crossentropy": 2.3556742668151855, + "loss/fcd": 1.0859375, + "loss/logits": 0.23623445630073547, + "step": 700 + }, + { + "epoch": 0.01210927715734287, + "grad_norm": 0.29296875, + "grad_norm_var": 0.0007013797760009766, + "learning_rate": 0.01, + "loss": 1.3728, + "loss/crossentropy": 2.286816358566284, + "loss/fcd": 1.04296875, + "loss/logits": 0.24584627896547318, + "step": 701 + }, + { + "epoch": 0.01212655144715363, + "grad_norm": 0.283203125, + "grad_norm_var": 0.0007274468739827474, + "learning_rate": 0.01, + "loss": 1.3878, + "loss/crossentropy": 2.2807174921035767, + "loss/fcd": 1.109375, + "loss/logits": 0.25587528198957443, + "step": 702 + }, + { + "epoch": 0.01214382573696439, + "grad_norm": 0.291015625, + "grad_norm_var": 0.0007240136464436848, + "learning_rate": 0.01, + "loss": 1.3971, + "loss/crossentropy": 2.5250132083892822, + "loss/fcd": 1.12109375, + "loss/logits": 0.2833500802516937, + "step": 703 + }, + { + "epoch": 0.01216110002677515, + "grad_norm": 0.314453125, + "grad_norm_var": 0.0007322788238525391, + "learning_rate": 0.01, + "loss": 1.3972, + "loss/crossentropy": 2.5938040018081665, + "loss/fcd": 1.1484375, + "loss/logits": 0.2679053843021393, + "step": 704 + }, + { + "epoch": 0.012178374316585909, + "grad_norm": 0.3046875, + "grad_norm_var": 0.000661468505859375, + "learning_rate": 0.01, + "loss": 1.3572, + "loss/crossentropy": 2.3809746503829956, + "loss/fcd": 1.1171875, + "loss/logits": 0.2514628916978836, + "step": 705 + }, + { + "epoch": 0.012195648606396669, + "grad_norm": 0.30078125, + "grad_norm_var": 0.0005812168121337891, + "learning_rate": 0.01, + "loss": 1.3698, + "loss/crossentropy": 2.3113526105880737, + "loss/fcd": 1.0703125, + "loss/logits": 0.24198968708515167, + "step": 706 + }, + { + "epoch": 0.01221292289620743, + "grad_norm": 0.287109375, + "grad_norm_var": 0.0005541324615478515, + "learning_rate": 0.01, + "loss": 1.3485, + "loss/crossentropy": 2.465987205505371, + "loss/fcd": 1.13671875, + "loss/logits": 0.2991575300693512, + "step": 707 + }, + { + "epoch": 0.01223019718601819, + "grad_norm": 0.3125, + "grad_norm_var": 0.0005623976389567058, + "learning_rate": 0.01, + "loss": 1.3754, + "loss/crossentropy": 2.4940463304519653, + "loss/fcd": 1.1640625, + "loss/logits": 0.2627300024032593, + "step": 708 + }, + { + "epoch": 0.01224747147582895, + "grad_norm": 0.29296875, + "grad_norm_var": 0.0005009810129801433, + "learning_rate": 0.01, + "loss": 1.378, + "loss/crossentropy": 2.6033318042755127, + "loss/fcd": 1.12109375, + "loss/logits": 0.2630281075835228, + "step": 709 + }, + { + "epoch": 0.01226474576563971, + "grad_norm": 0.322265625, + "grad_norm_var": 0.0005177656809488932, + "learning_rate": 0.01, + "loss": 1.3969, + "loss/crossentropy": 2.218273878097534, + "loss/fcd": 1.08984375, + "loss/logits": 0.23244468122720718, + "step": 710 + }, + { + "epoch": 0.01228202005545047, + "grad_norm": 0.3359375, + "grad_norm_var": 0.0005585829416910808, + "learning_rate": 0.01, + "loss": 1.4508, + "loss/crossentropy": 2.329068422317505, + "loss/fcd": 1.2109375, + "loss/logits": 0.24251049757003784, + "step": 711 + }, + { + "epoch": 0.01229929434526123, + "grad_norm": 0.302734375, + "grad_norm_var": 0.00029511451721191405, + "learning_rate": 0.01, + "loss": 1.4497, + "loss/crossentropy": 2.4693063497543335, + "loss/fcd": 1.125, + "loss/logits": 0.2587638199329376, + "step": 712 + }, + { + "epoch": 0.01231656863507199, + "grad_norm": 0.3125, + "grad_norm_var": 0.00029511451721191405, + "learning_rate": 0.01, + "loss": 1.371, + "loss/crossentropy": 2.4224281311035156, + "loss/fcd": 1.13671875, + "loss/logits": 0.27352161705493927, + "step": 713 + }, + { + "epoch": 0.012333842924882751, + "grad_norm": 0.3203125, + "grad_norm_var": 0.00025577545166015624, + "learning_rate": 0.01, + "loss": 1.3508, + "loss/crossentropy": 2.5101382732391357, + "loss/fcd": 1.1171875, + "loss/logits": 0.25151751190423965, + "step": 714 + }, + { + "epoch": 0.012351117214693511, + "grad_norm": 0.30078125, + "grad_norm_var": 0.0002010186513264974, + "learning_rate": 0.01, + "loss": 1.3949, + "loss/crossentropy": 2.765409469604492, + "loss/fcd": 1.10546875, + "loss/logits": 0.23425965011119843, + "step": 715 + }, + { + "epoch": 0.012368391504504271, + "grad_norm": 0.259765625, + "grad_norm_var": 0.0003284295399983724, + "learning_rate": 0.01, + "loss": 1.3346, + "loss/crossentropy": 2.446286678314209, + "loss/fcd": 1.0625, + "loss/logits": 0.23563802242279053, + "step": 716 + }, + { + "epoch": 0.012385665794315031, + "grad_norm": 0.365234375, + "grad_norm_var": 0.0005666097005208333, + "learning_rate": 0.01, + "loss": 1.485, + "loss/crossentropy": 2.3494917154312134, + "loss/fcd": 1.47265625, + "loss/logits": 0.2857535183429718, + "step": 717 + }, + { + "epoch": 0.01240294008412579, + "grad_norm": 0.28125, + "grad_norm_var": 0.0005729516347249349, + "learning_rate": 0.01, + "loss": 1.3814, + "loss/crossentropy": 2.3558719158172607, + "loss/fcd": 1.12109375, + "loss/logits": 0.24474655091762543, + "step": 718 + }, + { + "epoch": 0.01242021437393655, + "grad_norm": 0.26953125, + "grad_norm_var": 0.000646209716796875, + "learning_rate": 0.01, + "loss": 1.3125, + "loss/crossentropy": 2.364332675933838, + "loss/fcd": 1.1015625, + "loss/logits": 0.24612490087747574, + "step": 719 + }, + { + "epoch": 0.012437488663747312, + "grad_norm": 0.29296875, + "grad_norm_var": 0.0006484826405843099, + "learning_rate": 0.01, + "loss": 1.3629, + "loss/crossentropy": 2.218404769897461, + "loss/fcd": 1.15625, + "loss/logits": 0.2676163464784622, + "step": 720 + }, + { + "epoch": 0.012454762953558072, + "grad_norm": 0.2734375, + "grad_norm_var": 0.0007059574127197266, + "learning_rate": 0.01, + "loss": 1.3642, + "loss/crossentropy": 2.4319703578948975, + "loss/fcd": 1.12109375, + "loss/logits": 0.25568731129169464, + "step": 721 + }, + { + "epoch": 0.012472037243368832, + "grad_norm": 0.26953125, + "grad_norm_var": 0.0007715702056884765, + "learning_rate": 0.01, + "loss": 1.3565, + "loss/crossentropy": 2.603386163711548, + "loss/fcd": 1.09765625, + "loss/logits": 0.23648831248283386, + "step": 722 + }, + { + "epoch": 0.012489311533179592, + "grad_norm": 0.294921875, + "grad_norm_var": 0.000762033462524414, + "learning_rate": 0.01, + "loss": 1.4305, + "loss/crossentropy": 2.3345898389816284, + "loss/fcd": 1.05859375, + "loss/logits": 0.2294597253203392, + "step": 723 + }, + { + "epoch": 0.012506585822990352, + "grad_norm": 0.31640625, + "grad_norm_var": 0.0007692813873291015, + "learning_rate": 0.01, + "loss": 1.3885, + "loss/crossentropy": 2.315110445022583, + "loss/fcd": 1.15234375, + "loss/logits": 0.262426495552063, + "step": 724 + }, + { + "epoch": 0.012523860112801112, + "grad_norm": 0.28125, + "grad_norm_var": 0.0007898807525634766, + "learning_rate": 0.01, + "loss": 1.2937, + "loss/crossentropy": 2.2987769842147827, + "loss/fcd": 1.0, + "loss/logits": 0.21975189447402954, + "step": 725 + }, + { + "epoch": 0.012541134402611873, + "grad_norm": 0.29296875, + "grad_norm_var": 0.0007562637329101562, + "learning_rate": 0.01, + "loss": 1.3775, + "loss/crossentropy": 2.5773731470108032, + "loss/fcd": 1.16015625, + "loss/logits": 0.29223839938640594, + "step": 726 + }, + { + "epoch": 0.012558408692422633, + "grad_norm": 0.30859375, + "grad_norm_var": 0.0006650288899739584, + "learning_rate": 0.01, + "loss": 1.4041, + "loss/crossentropy": 2.138230562210083, + "loss/fcd": 1.0625, + "loss/logits": 0.24283046275377274, + "step": 727 + }, + { + "epoch": 0.012575682982233393, + "grad_norm": 0.333984375, + "grad_norm_var": 0.0007525126139322917, + "learning_rate": 0.01, + "loss": 1.4611, + "loss/crossentropy": 2.521793842315674, + "loss/fcd": 1.2265625, + "loss/logits": 0.2588220089673996, + "step": 728 + }, + { + "epoch": 0.012592957272044153, + "grad_norm": 0.3203125, + "grad_norm_var": 0.0007710774739583333, + "learning_rate": 0.01, + "loss": 1.3833, + "loss/crossentropy": 2.5079206228256226, + "loss/fcd": 1.13671875, + "loss/logits": 0.24896685779094696, + "step": 729 + }, + { + "epoch": 0.012610231561854913, + "grad_norm": 0.30859375, + "grad_norm_var": 0.0007460912068684896, + "learning_rate": 0.01, + "loss": 1.398, + "loss/crossentropy": 2.4435055255889893, + "loss/fcd": 1.1875, + "loss/logits": 0.2766249179840088, + "step": 730 + }, + { + "epoch": 0.012627505851665673, + "grad_norm": 0.353515625, + "grad_norm_var": 0.0009387811024983724, + "learning_rate": 0.01, + "loss": 1.482, + "loss/crossentropy": 2.480614185333252, + "loss/fcd": 1.12109375, + "loss/logits": 0.24479512870311737, + "step": 731 + }, + { + "epoch": 0.012644780141476434, + "grad_norm": 0.279296875, + "grad_norm_var": 0.0008542219797770183, + "learning_rate": 0.01, + "loss": 1.3214, + "loss/crossentropy": 2.556125283241272, + "loss/fcd": 1.0546875, + "loss/logits": 0.25190603733062744, + "step": 732 + }, + { + "epoch": 0.012662054431287194, + "grad_norm": 0.318359375, + "grad_norm_var": 0.0006001631418863933, + "learning_rate": 0.01, + "loss": 1.3992, + "loss/crossentropy": 2.2440203428268433, + "loss/fcd": 1.046875, + "loss/logits": 0.23071999847888947, + "step": 733 + }, + { + "epoch": 0.012679328721097954, + "grad_norm": 0.2890625, + "grad_norm_var": 0.0005847771962483723, + "learning_rate": 0.01, + "loss": 1.3884, + "loss/crossentropy": 2.366842269897461, + "loss/fcd": 1.13671875, + "loss/logits": 0.2621122822165489, + "step": 734 + }, + { + "epoch": 0.012696603010908714, + "grad_norm": 0.2890625, + "grad_norm_var": 0.0005288283030192057, + "learning_rate": 0.01, + "loss": 1.373, + "loss/crossentropy": 2.528809905052185, + "loss/fcd": 1.140625, + "loss/logits": 0.2601289302110672, + "step": 735 + }, + { + "epoch": 0.012713877300719474, + "grad_norm": 0.333984375, + "grad_norm_var": 0.0005879084269205729, + "learning_rate": 0.01, + "loss": 1.3657, + "loss/crossentropy": 2.1993446350097656, + "loss/fcd": 1.05859375, + "loss/logits": 0.2357948124408722, + "step": 736 + }, + { + "epoch": 0.012731151590530234, + "grad_norm": 0.2890625, + "grad_norm_var": 0.0005395889282226562, + "learning_rate": 0.01, + "loss": 1.3611, + "loss/crossentropy": 2.5157347917556763, + "loss/fcd": 1.109375, + "loss/logits": 0.2621786296367645, + "step": 737 + }, + { + "epoch": 0.012748425880340995, + "grad_norm": 0.31640625, + "grad_norm_var": 0.00045566558837890626, + "learning_rate": 0.01, + "loss": 1.3787, + "loss/crossentropy": 2.463285803794861, + "loss/fcd": 1.1328125, + "loss/logits": 0.2661950886249542, + "step": 738 + }, + { + "epoch": 0.012765700170151755, + "grad_norm": 0.314453125, + "grad_norm_var": 0.00044581095377604164, + "learning_rate": 0.01, + "loss": 1.3789, + "loss/crossentropy": 2.7613465785980225, + "loss/fcd": 1.09375, + "loss/logits": 0.24063792079687119, + "step": 739 + }, + { + "epoch": 0.012782974459962515, + "grad_norm": 0.337890625, + "grad_norm_var": 0.0004956404368082683, + "learning_rate": 0.01, + "loss": 1.3809, + "loss/crossentropy": 2.3430649042129517, + "loss/fcd": 1.05859375, + "loss/logits": 0.23180129379034042, + "step": 740 + }, + { + "epoch": 0.012800248749773275, + "grad_norm": 0.30078125, + "grad_norm_var": 0.0004435062408447266, + "learning_rate": 0.01, + "loss": 1.3546, + "loss/crossentropy": 2.347190737724304, + "loss/fcd": 1.1015625, + "loss/logits": 0.23613610118627548, + "step": 741 + }, + { + "epoch": 0.012817523039584035, + "grad_norm": 0.328125, + "grad_norm_var": 0.00043320655822753906, + "learning_rate": 0.01, + "loss": 1.414, + "loss/crossentropy": 2.3196645975112915, + "loss/fcd": 1.12890625, + "loss/logits": 0.27611708641052246, + "step": 742 + }, + { + "epoch": 0.012834797329394795, + "grad_norm": 0.28125, + "grad_norm_var": 0.0004990736643473308, + "learning_rate": 0.01, + "loss": 1.3861, + "loss/crossentropy": 2.4212803840637207, + "loss/fcd": 1.109375, + "loss/logits": 0.2471313625574112, + "step": 743 + }, + { + "epoch": 0.012852071619205556, + "grad_norm": 0.294921875, + "grad_norm_var": 0.0004806359608968099, + "learning_rate": 0.01, + "loss": 1.3723, + "loss/crossentropy": 2.527360200881958, + "loss/fcd": 1.109375, + "loss/logits": 0.24950604140758514, + "step": 744 + }, + { + "epoch": 0.012869345909016316, + "grad_norm": 0.302734375, + "grad_norm_var": 0.0004750569661458333, + "learning_rate": 0.01, + "loss": 1.3461, + "loss/crossentropy": 2.2922967672348022, + "loss/fcd": 1.07421875, + "loss/logits": 0.23927000910043716, + "step": 745 + }, + { + "epoch": 0.012886620198827076, + "grad_norm": 0.291015625, + "grad_norm_var": 0.0004943688710530599, + "learning_rate": 0.01, + "loss": 1.3785, + "loss/crossentropy": 2.133127212524414, + "loss/fcd": 1.078125, + "loss/logits": 0.23443202674388885, + "step": 746 + }, + { + "epoch": 0.012903894488637836, + "grad_norm": 0.28515625, + "grad_norm_var": 0.000366973876953125, + "learning_rate": 0.01, + "loss": 1.387, + "loss/crossentropy": 2.569379210472107, + "loss/fcd": 1.12109375, + "loss/logits": 0.26725105941295624, + "step": 747 + }, + { + "epoch": 0.012921168778448596, + "grad_norm": 0.294921875, + "grad_norm_var": 0.00033238728841145836, + "learning_rate": 0.01, + "loss": 1.4185, + "loss/crossentropy": 2.6103577613830566, + "loss/fcd": 1.13671875, + "loss/logits": 0.27920565009117126, + "step": 748 + }, + { + "epoch": 0.012938443068259356, + "grad_norm": 0.310546875, + "grad_norm_var": 0.00032145182291666666, + "learning_rate": 0.01, + "loss": 1.4161, + "loss/crossentropy": 2.3525288105010986, + "loss/fcd": 1.09375, + "loss/logits": 0.21820923686027527, + "step": 749 + }, + { + "epoch": 0.012955717358070115, + "grad_norm": 0.298828125, + "grad_norm_var": 0.0003083388010660807, + "learning_rate": 0.01, + "loss": 1.3429, + "loss/crossentropy": 2.563652276992798, + "loss/fcd": 1.1171875, + "loss/logits": 0.25008824467658997, + "step": 750 + }, + { + "epoch": 0.012972991647880877, + "grad_norm": 0.279296875, + "grad_norm_var": 0.00033416748046875, + "learning_rate": 0.01, + "loss": 1.3992, + "loss/crossentropy": 2.4368664026260376, + "loss/fcd": 1.08984375, + "loss/logits": 0.2636963874101639, + "step": 751 + }, + { + "epoch": 0.012990265937691637, + "grad_norm": 0.267578125, + "grad_norm_var": 0.00034173329671223957, + "learning_rate": 0.01, + "loss": 1.3548, + "loss/crossentropy": 2.47409451007843, + "loss/fcd": 1.1640625, + "loss/logits": 0.26615823060274124, + "step": 752 + }, + { + "epoch": 0.013007540227502397, + "grad_norm": 0.390625, + "grad_norm_var": 0.0008442560831705729, + "learning_rate": 0.01, + "loss": 1.4382, + "loss/crossentropy": 2.667958378791809, + "loss/fcd": 1.22265625, + "loss/logits": 0.29826460778713226, + "step": 753 + }, + { + "epoch": 0.013024814517313157, + "grad_norm": 0.28125, + "grad_norm_var": 0.0008722941080729166, + "learning_rate": 0.01, + "loss": 1.3857, + "loss/crossentropy": 2.2399171590805054, + "loss/fcd": 1.30078125, + "loss/logits": 0.3064821809530258, + "step": 754 + }, + { + "epoch": 0.013042088807123917, + "grad_norm": 0.30078125, + "grad_norm_var": 0.0008643945058186849, + "learning_rate": 0.01, + "loss": 1.4191, + "loss/crossentropy": 2.4244364500045776, + "loss/fcd": 1.1796875, + "loss/logits": 0.2772462069988251, + "step": 755 + }, + { + "epoch": 0.013059363096934676, + "grad_norm": 0.28125, + "grad_norm_var": 0.000800323486328125, + "learning_rate": 0.01, + "loss": 1.3482, + "loss/crossentropy": 2.6471344232559204, + "loss/fcd": 1.1484375, + "loss/logits": 0.2606939375400543, + "step": 756 + }, + { + "epoch": 0.013076637386745438, + "grad_norm": 0.345703125, + "grad_norm_var": 0.000935220718383789, + "learning_rate": 0.01, + "loss": 1.4094, + "loss/crossentropy": 2.4318645000457764, + "loss/fcd": 1.125, + "loss/logits": 0.2657194063067436, + "step": 757 + }, + { + "epoch": 0.013093911676556198, + "grad_norm": 0.3203125, + "grad_norm_var": 0.0009119510650634766, + "learning_rate": 0.01, + "loss": 1.4882, + "loss/crossentropy": 2.6587414741516113, + "loss/fcd": 1.1171875, + "loss/logits": 0.25396668910980225, + "step": 758 + }, + { + "epoch": 0.013111185966366958, + "grad_norm": 0.310546875, + "grad_norm_var": 0.0008859634399414062, + "learning_rate": 0.01, + "loss": 1.3734, + "loss/crossentropy": 2.320420742034912, + "loss/fcd": 1.0625, + "loss/logits": 0.22045490145683289, + "step": 759 + }, + { + "epoch": 0.013128460256177718, + "grad_norm": 0.318359375, + "grad_norm_var": 0.0008935928344726562, + "learning_rate": 0.01, + "loss": 1.4048, + "loss/crossentropy": 2.43363881111145, + "loss/fcd": 1.1171875, + "loss/logits": 0.2532464414834976, + "step": 760 + }, + { + "epoch": 0.013145734545988478, + "grad_norm": 0.28125, + "grad_norm_var": 0.0009287357330322266, + "learning_rate": 0.01, + "loss": 1.3617, + "loss/crossentropy": 2.5222312211990356, + "loss/fcd": 1.15234375, + "loss/logits": 0.29095427691936493, + "step": 761 + }, + { + "epoch": 0.013163008835799237, + "grad_norm": 0.310546875, + "grad_norm_var": 0.0009198347727457682, + "learning_rate": 0.01, + "loss": 1.3893, + "loss/crossentropy": 2.265801191329956, + "loss/fcd": 1.02734375, + "loss/logits": 0.23643554002046585, + "step": 762 + }, + { + "epoch": 0.013180283125609999, + "grad_norm": 0.390625, + "grad_norm_var": 0.0013386885325113933, + "learning_rate": 0.01, + "loss": 1.4154, + "loss/crossentropy": 2.1754260063171387, + "loss/fcd": 1.1640625, + "loss/logits": 0.244869664311409, + "step": 763 + }, + { + "epoch": 0.013197557415420759, + "grad_norm": 0.30859375, + "grad_norm_var": 0.001320330301920573, + "learning_rate": 0.01, + "loss": 1.3947, + "loss/crossentropy": 2.3228635787963867, + "loss/fcd": 1.03515625, + "loss/logits": 0.22359148412942886, + "step": 764 + }, + { + "epoch": 0.013214831705231519, + "grad_norm": 0.314453125, + "grad_norm_var": 0.0013203938802083333, + "learning_rate": 0.01, + "loss": 1.4051, + "loss/crossentropy": 2.5446053743362427, + "loss/fcd": 1.140625, + "loss/logits": 0.24661505222320557, + "step": 765 + }, + { + "epoch": 0.013232105995042279, + "grad_norm": 0.30859375, + "grad_norm_var": 0.0013085524241129556, + "learning_rate": 0.01, + "loss": 1.4116, + "loss/crossentropy": 2.4046772718429565, + "loss/fcd": 1.16015625, + "loss/logits": 0.26322653889656067, + "step": 766 + }, + { + "epoch": 0.013249380284853039, + "grad_norm": 0.287109375, + "grad_norm_var": 0.00127714474995931, + "learning_rate": 0.01, + "loss": 1.3094, + "loss/crossentropy": 2.397523880004883, + "loss/fcd": 1.08203125, + "loss/logits": 0.2391202375292778, + "step": 767 + }, + { + "epoch": 0.013266654574663798, + "grad_norm": 0.28125, + "grad_norm_var": 0.0012049357096354167, + "learning_rate": 0.01, + "loss": 1.3474, + "loss/crossentropy": 2.599183440208435, + "loss/fcd": 1.1640625, + "loss/logits": 0.2888915240764618, + "step": 768 + }, + { + "epoch": 0.01328392886447456, + "grad_norm": 0.353515625, + "grad_norm_var": 0.0009141127268473307, + "learning_rate": 0.01, + "loss": 1.4331, + "loss/crossentropy": 2.1059322357177734, + "loss/fcd": 1.0546875, + "loss/logits": 0.23741237819194794, + "step": 769 + }, + { + "epoch": 0.01330120315428532, + "grad_norm": 0.310546875, + "grad_norm_var": 0.0008471171061197917, + "learning_rate": 0.01, + "loss": 1.3643, + "loss/crossentropy": 2.697718620300293, + "loss/fcd": 1.1796875, + "loss/logits": 0.26706932485103607, + "step": 770 + }, + { + "epoch": 0.01331847744409608, + "grad_norm": 0.306640625, + "grad_norm_var": 0.0008389631907145183, + "learning_rate": 0.01, + "loss": 1.45, + "loss/crossentropy": 2.4075610637664795, + "loss/fcd": 1.1015625, + "loss/logits": 0.25129370391368866, + "step": 771 + }, + { + "epoch": 0.01333575173390684, + "grad_norm": 0.275390625, + "grad_norm_var": 0.0008669535319010417, + "learning_rate": 0.01, + "loss": 1.3877, + "loss/crossentropy": 2.7534801959991455, + "loss/fcd": 1.23828125, + "loss/logits": 0.30193065106868744, + "step": 772 + }, + { + "epoch": 0.0133530260237176, + "grad_norm": 0.29296875, + "grad_norm_var": 0.0008176008860270183, + "learning_rate": 0.01, + "loss": 1.3939, + "loss/crossentropy": 2.182551383972168, + "loss/fcd": 1.12890625, + "loss/logits": 0.28344330191612244, + "step": 773 + }, + { + "epoch": 0.01337030031352836, + "grad_norm": 0.28125, + "grad_norm_var": 0.000862741470336914, + "learning_rate": 0.01, + "loss": 1.412, + "loss/crossentropy": 2.510794520378113, + "loss/fcd": 1.16015625, + "loss/logits": 0.25014493614435196, + "step": 774 + }, + { + "epoch": 0.013387574603339121, + "grad_norm": 0.33984375, + "grad_norm_var": 0.000925445556640625, + "learning_rate": 0.01, + "loss": 1.4564, + "loss/crossentropy": 2.479841709136963, + "loss/fcd": 1.08203125, + "loss/logits": 0.25105684995651245, + "step": 775 + }, + { + "epoch": 0.013404848893149881, + "grad_norm": 0.3125, + "grad_norm_var": 0.0009211063385009766, + "learning_rate": 0.01, + "loss": 1.3963, + "loss/crossentropy": 2.639458417892456, + "loss/fcd": 1.10546875, + "loss/logits": 0.2490757405757904, + "step": 776 + }, + { + "epoch": 0.01342212318296064, + "grad_norm": 0.3203125, + "grad_norm_var": 0.0008683363596598307, + "learning_rate": 0.01, + "loss": 1.4124, + "loss/crossentropy": 2.6080870628356934, + "loss/fcd": 1.13671875, + "loss/logits": 0.24069885909557343, + "step": 777 + }, + { + "epoch": 0.0134393974727714, + "grad_norm": 0.3125, + "grad_norm_var": 0.0008681615193684896, + "learning_rate": 0.01, + "loss": 1.3733, + "loss/crossentropy": 2.3055442571640015, + "loss/fcd": 1.08203125, + "loss/logits": 0.2517802268266678, + "step": 778 + }, + { + "epoch": 0.01345667176258216, + "grad_norm": 0.3125, + "grad_norm_var": 0.00043328603108723957, + "learning_rate": 0.01, + "loss": 1.4781, + "loss/crossentropy": 2.5537742376327515, + "loss/fcd": 1.26953125, + "loss/logits": 0.30602647364139557, + "step": 779 + }, + { + "epoch": 0.01347394605239292, + "grad_norm": 0.328125, + "grad_norm_var": 0.0004603068033854167, + "learning_rate": 0.01, + "loss": 1.3961, + "loss/crossentropy": 2.371378183364868, + "loss/fcd": 1.12890625, + "loss/logits": 0.24987629055976868, + "step": 780 + }, + { + "epoch": 0.013491220342203682, + "grad_norm": 0.328125, + "grad_norm_var": 0.00048267046610514324, + "learning_rate": 0.01, + "loss": 1.3782, + "loss/crossentropy": 2.570296287536621, + "loss/fcd": 1.05859375, + "loss/logits": 0.22898489236831665, + "step": 781 + }, + { + "epoch": 0.013508494632014442, + "grad_norm": 0.29296875, + "grad_norm_var": 0.0004997094472249349, + "learning_rate": 0.01, + "loss": 1.3685, + "loss/crossentropy": 2.282141923904419, + "loss/fcd": 1.05859375, + "loss/logits": 0.2274707406759262, + "step": 782 + }, + { + "epoch": 0.013525768921825202, + "grad_norm": 0.349609375, + "grad_norm_var": 0.0005658308664957683, + "learning_rate": 0.01, + "loss": 1.41, + "loss/crossentropy": 2.378341317176819, + "loss/fcd": 1.21484375, + "loss/logits": 0.3016776442527771, + "step": 783 + }, + { + "epoch": 0.013543043211635962, + "grad_norm": 0.27734375, + "grad_norm_var": 0.0005829970041910808, + "learning_rate": 0.01, + "loss": 1.3398, + "loss/crossentropy": 2.6982511281967163, + "loss/fcd": 1.0703125, + "loss/logits": 0.23673634231090546, + "step": 784 + }, + { + "epoch": 0.013560317501446722, + "grad_norm": 0.30859375, + "grad_norm_var": 0.00046126047770182293, + "learning_rate": 0.01, + "loss": 1.3964, + "loss/crossentropy": 2.371803879737854, + "loss/fcd": 1.1328125, + "loss/logits": 0.23621678352355957, + "step": 785 + }, + { + "epoch": 0.013577591791257481, + "grad_norm": 0.29296875, + "grad_norm_var": 0.00047771135965983075, + "learning_rate": 0.01, + "loss": 1.3296, + "loss/crossentropy": 2.3509960174560547, + "loss/fcd": 1.05859375, + "loss/logits": 0.23912984877824783, + "step": 786 + }, + { + "epoch": 0.013594866081068241, + "grad_norm": 0.2734375, + "grad_norm_var": 0.0005536397298177083, + "learning_rate": 0.01, + "loss": 1.3796, + "loss/crossentropy": 2.4273725748062134, + "loss/fcd": 1.08984375, + "loss/logits": 0.2564089596271515, + "step": 787 + }, + { + "epoch": 0.013612140370879003, + "grad_norm": 0.283203125, + "grad_norm_var": 0.0005254109700520833, + "learning_rate": 0.01, + "loss": 1.3708, + "loss/crossentropy": 2.4844895601272583, + "loss/fcd": 1.09375, + "loss/logits": 0.24952378869056702, + "step": 788 + }, + { + "epoch": 0.013629414660689763, + "grad_norm": 0.322265625, + "grad_norm_var": 0.0005256493886311848, + "learning_rate": 0.01, + "loss": 1.412, + "loss/crossentropy": 2.415653347969055, + "loss/fcd": 1.1484375, + "loss/logits": 0.2528124749660492, + "step": 789 + }, + { + "epoch": 0.013646688950500523, + "grad_norm": 0.390625, + "grad_norm_var": 0.0008763472239176432, + "learning_rate": 0.01, + "loss": 1.4382, + "loss/crossentropy": 2.4079452753067017, + "loss/fcd": 1.18359375, + "loss/logits": 0.2838260903954506, + "step": 790 + }, + { + "epoch": 0.013663963240311282, + "grad_norm": 0.328125, + "grad_norm_var": 0.0008465925852457683, + "learning_rate": 0.01, + "loss": 1.446, + "loss/crossentropy": 2.3247077465057373, + "loss/fcd": 1.1171875, + "loss/logits": 0.2550275847315788, + "step": 791 + }, + { + "epoch": 0.013681237530122042, + "grad_norm": 0.314453125, + "grad_norm_var": 0.0008462905883789062, + "learning_rate": 0.01, + "loss": 1.3615, + "loss/crossentropy": 2.1464229822158813, + "loss/fcd": 1.13671875, + "loss/logits": 0.25474052131175995, + "step": 792 + }, + { + "epoch": 0.013698511819932802, + "grad_norm": 0.361328125, + "grad_norm_var": 0.0009821414947509765, + "learning_rate": 0.01, + "loss": 1.4535, + "loss/crossentropy": 2.4427038431167603, + "loss/fcd": 1.0859375, + "loss/logits": 0.2672760635614395, + "step": 793 + }, + { + "epoch": 0.013715786109743564, + "grad_norm": 0.314453125, + "grad_norm_var": 0.00098114013671875, + "learning_rate": 0.01, + "loss": 1.4375, + "loss/crossentropy": 2.502182126045227, + "loss/fcd": 1.203125, + "loss/logits": 0.30062489211559296, + "step": 794 + }, + { + "epoch": 0.013733060399554324, + "grad_norm": 0.29296875, + "grad_norm_var": 0.0010176976521809897, + "learning_rate": 0.01, + "loss": 1.3338, + "loss/crossentropy": 2.537824034690857, + "loss/fcd": 1.12109375, + "loss/logits": 0.24768686294555664, + "step": 795 + }, + { + "epoch": 0.013750334689365084, + "grad_norm": 0.26953125, + "grad_norm_var": 0.0011388142903645834, + "learning_rate": 0.01, + "loss": 1.3676, + "loss/crossentropy": 2.3750780820846558, + "loss/fcd": 1.076171875, + "loss/logits": 0.23160798847675323, + "step": 796 + }, + { + "epoch": 0.013767608979175843, + "grad_norm": 0.306640625, + "grad_norm_var": 0.001122903823852539, + "learning_rate": 0.01, + "loss": 1.3812, + "loss/crossentropy": 2.628328800201416, + "loss/fcd": 1.13671875, + "loss/logits": 0.2566673457622528, + "step": 797 + }, + { + "epoch": 0.013784883268986603, + "grad_norm": 0.353515625, + "grad_norm_var": 0.0012051900227864583, + "learning_rate": 0.01, + "loss": 1.3547, + "loss/crossentropy": 2.0953266620635986, + "loss/fcd": 1.1015625, + "loss/logits": 0.23933346569538116, + "step": 798 + }, + { + "epoch": 0.013802157558797363, + "grad_norm": 0.291015625, + "grad_norm_var": 0.0011489232381184896, + "learning_rate": 0.01, + "loss": 1.4166, + "loss/crossentropy": 2.7266474962234497, + "loss/fcd": 1.140625, + "loss/logits": 0.2656974792480469, + "step": 799 + }, + { + "epoch": 0.013819431848608125, + "grad_norm": 0.296875, + "grad_norm_var": 0.0010843912760416666, + "learning_rate": 0.01, + "loss": 1.3191, + "loss/crossentropy": 2.459654688835144, + "loss/fcd": 1.12109375, + "loss/logits": 0.2644665837287903, + "step": 800 + }, + { + "epoch": 0.013836706138418885, + "grad_norm": 0.30859375, + "grad_norm_var": 0.0010843912760416666, + "learning_rate": 0.01, + "loss": 1.4278, + "loss/crossentropy": 2.629300117492676, + "loss/fcd": 1.1171875, + "loss/logits": 0.24821807444095612, + "step": 801 + }, + { + "epoch": 0.013853980428229645, + "grad_norm": 0.30078125, + "grad_norm_var": 0.0010678609212239583, + "learning_rate": 0.01, + "loss": 1.3413, + "loss/crossentropy": 2.5803698301315308, + "loss/fcd": 1.15234375, + "loss/logits": 0.267608180642128, + "step": 802 + }, + { + "epoch": 0.013871254718040404, + "grad_norm": 0.275390625, + "grad_norm_var": 0.0010577996571858725, + "learning_rate": 0.01, + "loss": 1.3176, + "loss/crossentropy": 2.349183440208435, + "loss/fcd": 1.09375, + "loss/logits": 0.25479844957590103, + "step": 803 + }, + { + "epoch": 0.013888529007851164, + "grad_norm": 0.283203125, + "grad_norm_var": 0.0010577996571858725, + "learning_rate": 0.01, + "loss": 1.3783, + "loss/crossentropy": 2.618894100189209, + "loss/fcd": 1.1796875, + "loss/logits": 0.2711791917681694, + "step": 804 + }, + { + "epoch": 0.013905803297661924, + "grad_norm": 0.322265625, + "grad_norm_var": 0.0010577996571858725, + "learning_rate": 0.01, + "loss": 1.3966, + "loss/crossentropy": 2.3134875893592834, + "loss/fcd": 1.109375, + "loss/logits": 0.2539241313934326, + "step": 805 + }, + { + "epoch": 0.013923077587472686, + "grad_norm": 0.330078125, + "grad_norm_var": 0.0006611506144205729, + "learning_rate": 0.01, + "loss": 1.441, + "loss/crossentropy": 2.837363600730896, + "loss/fcd": 1.2578125, + "loss/logits": 0.32089151442050934, + "step": 806 + }, + { + "epoch": 0.013940351877283446, + "grad_norm": 0.2890625, + "grad_norm_var": 0.0006586074829101563, + "learning_rate": 0.01, + "loss": 1.3525, + "loss/crossentropy": 2.377834916114807, + "loss/fcd": 1.06640625, + "loss/logits": 0.23237647861242294, + "step": 807 + }, + { + "epoch": 0.013957626167094206, + "grad_norm": 0.271484375, + "grad_norm_var": 0.0007306416829427083, + "learning_rate": 0.01, + "loss": 1.3753, + "loss/crossentropy": 2.520345091819763, + "loss/fcd": 1.1484375, + "loss/logits": 0.26999618113040924, + "step": 808 + }, + { + "epoch": 0.013974900456904965, + "grad_norm": 0.275390625, + "grad_norm_var": 0.0005376180013020833, + "learning_rate": 0.01, + "loss": 1.3425, + "loss/crossentropy": 2.55434787273407, + "loss/fcd": 1.0859375, + "loss/logits": 0.26515287160873413, + "step": 809 + }, + { + "epoch": 0.013992174746715725, + "grad_norm": 0.28515625, + "grad_norm_var": 0.0005302270253499349, + "learning_rate": 0.01, + "loss": 1.405, + "loss/crossentropy": 2.320609927177429, + "loss/fcd": 1.1328125, + "loss/logits": 0.2443319857120514, + "step": 810 + }, + { + "epoch": 0.014009449036526485, + "grad_norm": 0.28125, + "grad_norm_var": 0.0005451043446858724, + "learning_rate": 0.01, + "loss": 1.3608, + "loss/crossentropy": 2.3824050426483154, + "loss/fcd": 1.1171875, + "loss/logits": 0.2653844952583313, + "step": 811 + }, + { + "epoch": 0.014026723326337247, + "grad_norm": 0.318359375, + "grad_norm_var": 0.0005200703938802084, + "learning_rate": 0.01, + "loss": 1.4786, + "loss/crossentropy": 2.459092617034912, + "loss/fcd": 1.18359375, + "loss/logits": 0.2695985734462738, + "step": 812 + }, + { + "epoch": 0.014043997616148007, + "grad_norm": 0.26953125, + "grad_norm_var": 0.0005698998769124349, + "learning_rate": 0.01, + "loss": 1.2888, + "loss/crossentropy": 2.4817110300064087, + "loss/fcd": 1.0546875, + "loss/logits": 0.22882136702537537, + "step": 813 + }, + { + "epoch": 0.014061271905958767, + "grad_norm": 0.306640625, + "grad_norm_var": 0.0003539880116780599, + "learning_rate": 0.01, + "loss": 1.3681, + "loss/crossentropy": 2.556985020637512, + "loss/fcd": 1.12109375, + "loss/logits": 0.25683027505874634, + "step": 814 + }, + { + "epoch": 0.014078546195769526, + "grad_norm": 0.30859375, + "grad_norm_var": 0.0003661473592122396, + "learning_rate": 0.01, + "loss": 1.4285, + "loss/crossentropy": 2.3824613094329834, + "loss/fcd": 1.10546875, + "loss/logits": 0.24755095690488815, + "step": 815 + }, + { + "epoch": 0.014095820485580286, + "grad_norm": 0.310546875, + "grad_norm_var": 0.00038094520568847655, + "learning_rate": 0.01, + "loss": 1.3345, + "loss/crossentropy": 2.1579148173332214, + "loss/fcd": 1.0625, + "loss/logits": 0.23608000576496124, + "step": 816 + }, + { + "epoch": 0.014113094775391046, + "grad_norm": 0.3203125, + "grad_norm_var": 0.0004091739654541016, + "learning_rate": 0.01, + "loss": 1.42, + "loss/crossentropy": 2.556256413459778, + "loss/fcd": 1.140625, + "loss/logits": 0.23912374675273895, + "step": 817 + }, + { + "epoch": 0.014130369065201808, + "grad_norm": 0.29296875, + "grad_norm_var": 0.00040879249572753904, + "learning_rate": 0.01, + "loss": 1.4296, + "loss/crossentropy": 2.497371554374695, + "loss/fcd": 1.08984375, + "loss/logits": 0.24960072338581085, + "step": 818 + }, + { + "epoch": 0.014147643355012568, + "grad_norm": 0.29296875, + "grad_norm_var": 0.000379180908203125, + "learning_rate": 0.01, + "loss": 1.4164, + "loss/crossentropy": 2.6055017709732056, + "loss/fcd": 1.24609375, + "loss/logits": 0.30321623384952545, + "step": 819 + }, + { + "epoch": 0.014164917644823328, + "grad_norm": 0.26953125, + "grad_norm_var": 0.00041667620340983075, + "learning_rate": 0.01, + "loss": 1.3435, + "loss/crossentropy": 2.520479202270508, + "loss/fcd": 1.12109375, + "loss/logits": 0.24647565186023712, + "step": 820 + }, + { + "epoch": 0.014182191934634087, + "grad_norm": 0.283203125, + "grad_norm_var": 0.00037789344787597656, + "learning_rate": 0.01, + "loss": 1.4303, + "loss/crossentropy": 2.4229378700256348, + "loss/fcd": 1.16015625, + "loss/logits": 0.27616265416145325, + "step": 821 + }, + { + "epoch": 0.014199466224444847, + "grad_norm": 0.294921875, + "grad_norm_var": 0.00028634071350097656, + "learning_rate": 0.01, + "loss": 1.4063, + "loss/crossentropy": 2.642806649208069, + "loss/fcd": 1.11328125, + "loss/logits": 0.24927609413862228, + "step": 822 + }, + { + "epoch": 0.014216740514255607, + "grad_norm": 0.341796875, + "grad_norm_var": 0.00044040679931640626, + "learning_rate": 0.01, + "loss": 1.4389, + "loss/crossentropy": 2.743402600288391, + "loss/fcd": 1.20703125, + "loss/logits": 0.2956629917025566, + "step": 823 + }, + { + "epoch": 0.014234014804066367, + "grad_norm": 0.298828125, + "grad_norm_var": 0.00040079752604166665, + "learning_rate": 0.01, + "loss": 1.4283, + "loss/crossentropy": 2.5851320028305054, + "loss/fcd": 1.20703125, + "loss/logits": 0.26086658239364624, + "step": 824 + }, + { + "epoch": 0.014251289093877129, + "grad_norm": 0.28515625, + "grad_norm_var": 0.0003787835439046224, + "learning_rate": 0.01, + "loss": 1.3569, + "loss/crossentropy": 2.5595767498016357, + "loss/fcd": 1.0703125, + "loss/logits": 0.24868559837341309, + "step": 825 + }, + { + "epoch": 0.014268563383687888, + "grad_norm": 0.30859375, + "grad_norm_var": 0.0003745873769124349, + "learning_rate": 0.01, + "loss": 1.3698, + "loss/crossentropy": 2.553021550178528, + "loss/fcd": 1.12109375, + "loss/logits": 0.25030215084552765, + "step": 826 + }, + { + "epoch": 0.014285837673498648, + "grad_norm": 0.31640625, + "grad_norm_var": 0.0003688653310139974, + "learning_rate": 0.01, + "loss": 1.3934, + "loss/crossentropy": 2.459465980529785, + "loss/fcd": 1.2578125, + "loss/logits": 0.27228477597236633, + "step": 827 + }, + { + "epoch": 0.014303111963309408, + "grad_norm": 0.373046875, + "grad_norm_var": 0.0006812890370686849, + "learning_rate": 0.01, + "loss": 1.403, + "loss/crossentropy": 2.5050086975097656, + "loss/fcd": 1.11328125, + "loss/logits": 0.2527881860733032, + "step": 828 + }, + { + "epoch": 0.014320386253120168, + "grad_norm": 0.296875, + "grad_norm_var": 0.0006002902984619141, + "learning_rate": 0.01, + "loss": 1.4282, + "loss/crossentropy": 2.5587570667266846, + "loss/fcd": 1.171875, + "loss/logits": 0.2506961077451706, + "step": 829 + }, + { + "epoch": 0.014337660542930928, + "grad_norm": 0.3046875, + "grad_norm_var": 0.000600433349609375, + "learning_rate": 0.01, + "loss": 1.3663, + "loss/crossentropy": 2.433290719985962, + "loss/fcd": 1.13671875, + "loss/logits": 0.23105743527412415, + "step": 830 + }, + { + "epoch": 0.01435493483274169, + "grad_norm": 0.345703125, + "grad_norm_var": 0.0006985823313395182, + "learning_rate": 0.01, + "loss": 1.4033, + "loss/crossentropy": 2.2913233041763306, + "loss/fcd": 1.140625, + "loss/logits": 0.2715977430343628, + "step": 831 + }, + { + "epoch": 0.01437220912255245, + "grad_norm": 0.2890625, + "grad_norm_var": 0.0007214864095052083, + "learning_rate": 0.01, + "loss": 1.3672, + "loss/crossentropy": 2.4408832788467407, + "loss/fcd": 1.109375, + "loss/logits": 0.23768731951713562, + "step": 832 + }, + { + "epoch": 0.01438948341236321, + "grad_norm": 0.3515625, + "grad_norm_var": 0.0008374532063802083, + "learning_rate": 0.01, + "loss": 1.3927, + "loss/crossentropy": 2.273505926132202, + "loss/fcd": 1.05078125, + "loss/logits": 0.2371639683842659, + "step": 833 + }, + { + "epoch": 0.01440675770217397, + "grad_norm": 0.302734375, + "grad_norm_var": 0.0008224328358968099, + "learning_rate": 0.01, + "loss": 1.4174, + "loss/crossentropy": 2.304438829421997, + "loss/fcd": 1.10546875, + "loss/logits": 0.2705874443054199, + "step": 834 + }, + { + "epoch": 0.014424031991984729, + "grad_norm": 0.318359375, + "grad_norm_var": 0.0008061091105143229, + "learning_rate": 0.01, + "loss": 1.413, + "loss/crossentropy": 2.4857107400894165, + "loss/fcd": 1.265625, + "loss/logits": 0.2602947950363159, + "step": 835 + }, + { + "epoch": 0.014441306281795489, + "grad_norm": 0.33984375, + "grad_norm_var": 0.0007237116495768229, + "learning_rate": 0.01, + "loss": 1.4423, + "loss/crossentropy": 2.4861044883728027, + "loss/fcd": 1.16015625, + "loss/logits": 0.25615356862545013, + "step": 836 + }, + { + "epoch": 0.01445858057160625, + "grad_norm": 0.267578125, + "grad_norm_var": 0.0008066177368164062, + "learning_rate": 0.01, + "loss": 1.3396, + "loss/crossentropy": 2.3363460302352905, + "loss/fcd": 1.03125, + "loss/logits": 0.2474212720990181, + "step": 837 + }, + { + "epoch": 0.01447585486141701, + "grad_norm": 0.306640625, + "grad_norm_var": 0.0007843017578125, + "learning_rate": 0.01, + "loss": 1.3885, + "loss/crossentropy": 2.332596778869629, + "loss/fcd": 1.0859375, + "loss/logits": 0.2456573098897934, + "step": 838 + }, + { + "epoch": 0.01449312915122777, + "grad_norm": 0.322265625, + "grad_norm_var": 0.0007394790649414062, + "learning_rate": 0.01, + "loss": 1.3709, + "loss/crossentropy": 2.613990545272827, + "loss/fcd": 1.1953125, + "loss/logits": 0.2681911140680313, + "step": 839 + }, + { + "epoch": 0.01451040344103853, + "grad_norm": 0.279296875, + "grad_norm_var": 0.000803375244140625, + "learning_rate": 0.01, + "loss": 1.3305, + "loss/crossentropy": 2.448235511779785, + "loss/fcd": 1.05859375, + "loss/logits": 0.22328373789787292, + "step": 840 + }, + { + "epoch": 0.01452767773084929, + "grad_norm": 0.275390625, + "grad_norm_var": 0.0008455753326416015, + "learning_rate": 0.01, + "loss": 1.3552, + "loss/crossentropy": 2.4329841136932373, + "loss/fcd": 1.171875, + "loss/logits": 0.2812986671924591, + "step": 841 + }, + { + "epoch": 0.01454495202066005, + "grad_norm": 0.310546875, + "grad_norm_var": 0.0008448282877604167, + "learning_rate": 0.01, + "loss": 1.4049, + "loss/crossentropy": 2.366762161254883, + "loss/fcd": 1.1640625, + "loss/logits": 0.2537970468401909, + "step": 842 + }, + { + "epoch": 0.014562226310470812, + "grad_norm": 0.29296875, + "grad_norm_var": 0.0008669535319010417, + "learning_rate": 0.01, + "loss": 1.3474, + "loss/crossentropy": 2.2118855714797974, + "loss/fcd": 1.05859375, + "loss/logits": 0.2319856360554695, + "step": 843 + }, + { + "epoch": 0.014579500600281571, + "grad_norm": 0.30078125, + "grad_norm_var": 0.0005958398183186849, + "learning_rate": 0.01, + "loss": 1.3672, + "loss/crossentropy": 2.427622437477112, + "loss/fcd": 1.1171875, + "loss/logits": 0.26083478331565857, + "step": 844 + }, + { + "epoch": 0.014596774890092331, + "grad_norm": 0.322265625, + "grad_norm_var": 0.000603485107421875, + "learning_rate": 0.01, + "loss": 1.3868, + "loss/crossentropy": 2.6780372858047485, + "loss/fcd": 1.2578125, + "loss/logits": 0.2781776934862137, + "step": 845 + }, + { + "epoch": 0.014614049179903091, + "grad_norm": 0.337890625, + "grad_norm_var": 0.0006572564442952473, + "learning_rate": 0.01, + "loss": 1.3767, + "loss/crossentropy": 2.36633038520813, + "loss/fcd": 1.140625, + "loss/logits": 0.2774253934621811, + "step": 846 + }, + { + "epoch": 0.014631323469713851, + "grad_norm": 0.263671875, + "grad_norm_var": 0.0006892999013264974, + "learning_rate": 0.01, + "loss": 1.3532, + "loss/crossentropy": 2.598803162574768, + "loss/fcd": 1.1484375, + "loss/logits": 0.27754758298397064, + "step": 847 + }, + { + "epoch": 0.014648597759524611, + "grad_norm": 0.275390625, + "grad_norm_var": 0.000730133056640625, + "learning_rate": 0.01, + "loss": 1.3245, + "loss/crossentropy": 2.323062777519226, + "loss/fcd": 1.0234375, + "loss/logits": 0.23049668222665787, + "step": 848 + }, + { + "epoch": 0.014665872049335373, + "grad_norm": 0.3203125, + "grad_norm_var": 0.0005938212076822916, + "learning_rate": 0.01, + "loss": 1.497, + "loss/crossentropy": 2.4116770029067993, + "loss/fcd": 1.1328125, + "loss/logits": 0.25217771530151367, + "step": 849 + }, + { + "epoch": 0.014683146339146132, + "grad_norm": 0.33203125, + "grad_norm_var": 0.0006493727366129557, + "learning_rate": 0.01, + "loss": 1.3825, + "loss/crossentropy": 2.784231662750244, + "loss/fcd": 1.23828125, + "loss/logits": 0.301376610994339, + "step": 850 + }, + { + "epoch": 0.014700420628956892, + "grad_norm": 0.68359375, + "grad_norm_var": 0.009682146708170573, + "learning_rate": 0.01, + "loss": 1.5242, + "loss/crossentropy": 2.3721545934677124, + "loss/fcd": 1.09765625, + "loss/logits": 0.2636701613664627, + "step": 851 + }, + { + "epoch": 0.014717694918767652, + "grad_norm": 0.33984375, + "grad_norm_var": 0.009682146708170573, + "learning_rate": 0.01, + "loss": 1.3491, + "loss/crossentropy": 2.5496045351028442, + "loss/fcd": 1.1171875, + "loss/logits": 0.2594982087612152, + "step": 852 + }, + { + "epoch": 0.014734969208578412, + "grad_norm": 0.310546875, + "grad_norm_var": 0.009457651774088542, + "learning_rate": 0.01, + "loss": 1.3208, + "loss/crossentropy": 2.211892247200012, + "loss/fcd": 1.0703125, + "loss/logits": 0.21089013665914536, + "step": 853 + }, + { + "epoch": 0.014752243498389172, + "grad_norm": 0.2890625, + "grad_norm_var": 0.009530750910441081, + "learning_rate": 0.01, + "loss": 1.3612, + "loss/crossentropy": 2.3918616771698, + "loss/fcd": 1.046875, + "loss/logits": 0.2475578412413597, + "step": 854 + }, + { + "epoch": 0.014769517788199932, + "grad_norm": 0.294921875, + "grad_norm_var": 0.009600178400675455, + "learning_rate": 0.01, + "loss": 1.3711, + "loss/crossentropy": 2.6660208702087402, + "loss/fcd": 1.125, + "loss/logits": 0.25626226514577866, + "step": 855 + }, + { + "epoch": 0.014786792078010693, + "grad_norm": 0.265625, + "grad_norm_var": 0.009698422749837239, + "learning_rate": 0.01, + "loss": 1.3272, + "loss/crossentropy": 2.4646941423416138, + "loss/fcd": 1.1015625, + "loss/logits": 0.24187320470809937, + "step": 856 + }, + { + "epoch": 0.014804066367821453, + "grad_norm": 0.296875, + "grad_norm_var": 0.00958250363667806, + "learning_rate": 0.01, + "loss": 1.349, + "loss/crossentropy": 2.889734983444214, + "loss/fcd": 1.23046875, + "loss/logits": 0.28400754928588867, + "step": 857 + }, + { + "epoch": 0.014821340657632213, + "grad_norm": 0.296875, + "grad_norm_var": 0.009624671936035157, + "learning_rate": 0.01, + "loss": 1.3696, + "loss/crossentropy": 2.4632620811462402, + "loss/fcd": 1.11328125, + "loss/logits": 0.24944238364696503, + "step": 858 + }, + { + "epoch": 0.014838614947442973, + "grad_norm": 0.28515625, + "grad_norm_var": 0.00966332753499349, + "learning_rate": 0.01, + "loss": 1.3636, + "loss/crossentropy": 2.38780677318573, + "loss/fcd": 1.1171875, + "loss/logits": 0.25044557452201843, + "step": 859 + }, + { + "epoch": 0.014855889237253733, + "grad_norm": 0.322265625, + "grad_norm_var": 0.009620141983032227, + "learning_rate": 0.01, + "loss": 1.392, + "loss/crossentropy": 2.523656487464905, + "loss/fcd": 1.10546875, + "loss/logits": 0.2575480043888092, + "step": 860 + }, + { + "epoch": 0.014873163527064493, + "grad_norm": 0.3125, + "grad_norm_var": 0.009632619222005208, + "learning_rate": 0.01, + "loss": 1.3788, + "loss/crossentropy": 2.3901199102401733, + "loss/fcd": 1.109375, + "loss/logits": 0.22918711602687836, + "step": 861 + }, + { + "epoch": 0.014890437816875254, + "grad_norm": 0.41015625, + "grad_norm_var": 0.010067224502563477, + "learning_rate": 0.01, + "loss": 1.392, + "loss/crossentropy": 2.2604238986968994, + "loss/fcd": 1.29296875, + "loss/logits": 0.28666311502456665, + "step": 862 + }, + { + "epoch": 0.014907712106686014, + "grad_norm": 0.296875, + "grad_norm_var": 0.00983727773030599, + "learning_rate": 0.01, + "loss": 1.3701, + "loss/crossentropy": 2.1219175457954407, + "loss/fcd": 1.11328125, + "loss/logits": 0.2484002709388733, + "step": 863 + }, + { + "epoch": 0.014924986396496774, + "grad_norm": 0.3046875, + "grad_norm_var": 0.00966490109761556, + "learning_rate": 0.01, + "loss": 1.4008, + "loss/crossentropy": 2.4230719804763794, + "loss/fcd": 1.08984375, + "loss/logits": 0.2542117089033127, + "step": 864 + }, + { + "epoch": 0.014942260686307534, + "grad_norm": 0.30859375, + "grad_norm_var": 0.009696563084920248, + "learning_rate": 0.01, + "loss": 1.3599, + "loss/crossentropy": 2.602153182029724, + "loss/fcd": 1.1171875, + "loss/logits": 0.2338126003742218, + "step": 865 + }, + { + "epoch": 0.014959534976118294, + "grad_norm": 0.310546875, + "grad_norm_var": 0.0097320556640625, + "learning_rate": 0.01, + "loss": 1.3659, + "loss/crossentropy": 2.3879982233047485, + "loss/fcd": 1.11328125, + "loss/logits": 0.25103290379047394, + "step": 866 + }, + { + "epoch": 0.014976809265929054, + "grad_norm": 0.2734375, + "grad_norm_var": 0.0010736465454101562, + "learning_rate": 0.01, + "loss": 1.348, + "loss/crossentropy": 2.4637222290039062, + "loss/fcd": 1.14453125, + "loss/logits": 0.2280896008014679, + "step": 867 + }, + { + "epoch": 0.014994083555739815, + "grad_norm": 0.298828125, + "grad_norm_var": 0.0010012149810791015, + "learning_rate": 0.01, + "loss": 1.3708, + "loss/crossentropy": 2.784236192703247, + "loss/fcd": 1.1640625, + "loss/logits": 0.28741903603076935, + "step": 868 + }, + { + "epoch": 0.015011357845550575, + "grad_norm": 0.298828125, + "grad_norm_var": 0.001000833511352539, + "learning_rate": 0.01, + "loss": 1.4288, + "loss/crossentropy": 2.6332989931106567, + "loss/fcd": 1.27734375, + "loss/logits": 0.3306438624858856, + "step": 869 + }, + { + "epoch": 0.015028632135361335, + "grad_norm": 0.359375, + "grad_norm_var": 0.0011690616607666015, + "learning_rate": 0.01, + "loss": 1.4187, + "loss/crossentropy": 2.3606460094451904, + "loss/fcd": 1.0546875, + "loss/logits": 0.23307877779006958, + "step": 870 + }, + { + "epoch": 0.015045906425172095, + "grad_norm": 0.29296875, + "grad_norm_var": 0.0011728286743164062, + "learning_rate": 0.01, + "loss": 1.3553, + "loss/crossentropy": 2.324714183807373, + "loss/fcd": 1.0859375, + "loss/logits": 0.2501022219657898, + "step": 871 + }, + { + "epoch": 0.015063180714982855, + "grad_norm": 0.283203125, + "grad_norm_var": 0.0010920047760009765, + "learning_rate": 0.01, + "loss": 1.3623, + "loss/crossentropy": 2.328053116798401, + "loss/fcd": 1.140625, + "loss/logits": 0.2553166151046753, + "step": 872 + }, + { + "epoch": 0.015080455004793615, + "grad_norm": 0.30078125, + "grad_norm_var": 0.00108640988667806, + "learning_rate": 0.01, + "loss": 1.4392, + "loss/crossentropy": 2.377878785133362, + "loss/fcd": 1.15625, + "loss/logits": 0.25394026935100555, + "step": 873 + }, + { + "epoch": 0.015097729294604376, + "grad_norm": 0.349609375, + "grad_norm_var": 0.0011700948079427084, + "learning_rate": 0.01, + "loss": 1.3398, + "loss/crossentropy": 2.542131185531616, + "loss/fcd": 1.0625, + "loss/logits": 0.24263548851013184, + "step": 874 + }, + { + "epoch": 0.015115003584415136, + "grad_norm": 0.30078125, + "grad_norm_var": 0.0011273701985677084, + "learning_rate": 0.01, + "loss": 1.3837, + "loss/crossentropy": 2.443636417388916, + "loss/fcd": 1.1328125, + "loss/logits": 0.27580726146698, + "step": 875 + }, + { + "epoch": 0.015132277874225896, + "grad_norm": 0.318359375, + "grad_norm_var": 0.0011240005493164062, + "learning_rate": 0.01, + "loss": 1.4357, + "loss/crossentropy": 2.752240300178528, + "loss/fcd": 1.17578125, + "loss/logits": 0.2472759708762169, + "step": 876 + }, + { + "epoch": 0.015149552164036656, + "grad_norm": 0.32421875, + "grad_norm_var": 0.00113067626953125, + "learning_rate": 0.01, + "loss": 1.3789, + "loss/crossentropy": 2.504664421081543, + "loss/fcd": 1.125, + "loss/logits": 0.25199174135923386, + "step": 877 + }, + { + "epoch": 0.015166826453847416, + "grad_norm": 0.326171875, + "grad_norm_var": 0.0004998366038004557, + "learning_rate": 0.01, + "loss": 1.3978, + "loss/crossentropy": 2.3523584604263306, + "loss/fcd": 1.15234375, + "loss/logits": 0.26311442255973816, + "step": 878 + }, + { + "epoch": 0.015184100743658176, + "grad_norm": 0.2890625, + "grad_norm_var": 0.0005164941151936849, + "learning_rate": 0.01, + "loss": 1.3575, + "loss/crossentropy": 2.3136786818504333, + "loss/fcd": 1.08984375, + "loss/logits": 0.25283563137054443, + "step": 879 + }, + { + "epoch": 0.015201375033468937, + "grad_norm": 0.3203125, + "grad_norm_var": 0.0005233605702718099, + "learning_rate": 0.01, + "loss": 1.4031, + "loss/crossentropy": 2.445231080055237, + "loss/fcd": 1.15234375, + "loss/logits": 0.251323863863945, + "step": 880 + }, + { + "epoch": 0.015218649323279697, + "grad_norm": 0.302734375, + "grad_norm_var": 0.0005263646443684895, + "learning_rate": 0.01, + "loss": 1.4241, + "loss/crossentropy": 2.5056021213531494, + "loss/fcd": 1.109375, + "loss/logits": 0.2573155537247658, + "step": 881 + }, + { + "epoch": 0.015235923613090457, + "grad_norm": 0.298828125, + "grad_norm_var": 0.0005330403645833333, + "learning_rate": 0.01, + "loss": 1.3779, + "loss/crossentropy": 2.4044970273971558, + "loss/fcd": 1.09765625, + "loss/logits": 0.24030664563179016, + "step": 882 + }, + { + "epoch": 0.015253197902901217, + "grad_norm": 0.279296875, + "grad_norm_var": 0.0005077203114827474, + "learning_rate": 0.01, + "loss": 1.3391, + "loss/crossentropy": 2.2992568016052246, + "loss/fcd": 1.037109375, + "loss/logits": 0.23432201147079468, + "step": 883 + }, + { + "epoch": 0.015270472192711977, + "grad_norm": 0.3125, + "grad_norm_var": 0.0005009333292643229, + "learning_rate": 0.01, + "loss": 1.3742, + "loss/crossentropy": 2.346727728843689, + "loss/fcd": 1.05078125, + "loss/logits": 0.2232709527015686, + "step": 884 + }, + { + "epoch": 0.015287746482522737, + "grad_norm": 0.28125, + "grad_norm_var": 0.0005459944407145182, + "learning_rate": 0.01, + "loss": 1.3237, + "loss/crossentropy": 1.982240617275238, + "loss/fcd": 1.0390625, + "loss/logits": 0.22034113854169846, + "step": 885 + }, + { + "epoch": 0.015305020772333498, + "grad_norm": 0.306640625, + "grad_norm_var": 0.0003636042277018229, + "learning_rate": 0.01, + "loss": 1.438, + "loss/crossentropy": 2.3263243436813354, + "loss/fcd": 1.1640625, + "loss/logits": 0.24902021139860153, + "step": 886 + }, + { + "epoch": 0.015322295062144258, + "grad_norm": 0.3203125, + "grad_norm_var": 0.0003649393717447917, + "learning_rate": 0.01, + "loss": 1.3869, + "loss/crossentropy": 2.56560879945755, + "loss/fcd": 1.1328125, + "loss/logits": 0.2558091878890991, + "step": 887 + }, + { + "epoch": 0.015339569351955018, + "grad_norm": 0.3125, + "grad_norm_var": 0.00032512346903483075, + "learning_rate": 0.01, + "loss": 1.3886, + "loss/crossentropy": 2.4856609106063843, + "loss/fcd": 1.1171875, + "loss/logits": 0.24640005826950073, + "step": 888 + }, + { + "epoch": 0.015356843641765778, + "grad_norm": 0.349609375, + "grad_norm_var": 0.0004208882649739583, + "learning_rate": 0.01, + "loss": 1.4196, + "loss/crossentropy": 2.55330491065979, + "loss/fcd": 1.14453125, + "loss/logits": 0.25765371322631836, + "step": 889 + }, + { + "epoch": 0.015374117931576538, + "grad_norm": 0.314453125, + "grad_norm_var": 0.0003218968709309896, + "learning_rate": 0.01, + "loss": 1.3971, + "loss/crossentropy": 2.6354317665100098, + "loss/fcd": 1.3203125, + "loss/logits": 0.3442998379468918, + "step": 890 + }, + { + "epoch": 0.015391392221387298, + "grad_norm": 0.296875, + "grad_norm_var": 0.00032755533854166664, + "learning_rate": 0.01, + "loss": 1.3998, + "loss/crossentropy": 2.034050762653351, + "loss/fcd": 1.11328125, + "loss/logits": 0.23992937058210373, + "step": 891 + }, + { + "epoch": 0.015408666511198058, + "grad_norm": 0.322265625, + "grad_norm_var": 0.0003330866495768229, + "learning_rate": 0.01, + "loss": 1.4362, + "loss/crossentropy": 2.7760528326034546, + "loss/fcd": 1.16796875, + "loss/logits": 0.2806248515844345, + "step": 892 + }, + { + "epoch": 0.01542594080100882, + "grad_norm": 0.34375, + "grad_norm_var": 0.000394439697265625, + "learning_rate": 0.01, + "loss": 1.4516, + "loss/crossentropy": 2.26086688041687, + "loss/fcd": 1.2109375, + "loss/logits": 0.31815242767333984, + "step": 893 + }, + { + "epoch": 0.015443215090819579, + "grad_norm": 0.333984375, + "grad_norm_var": 0.0004140218098958333, + "learning_rate": 0.01, + "loss": 1.3702, + "loss/crossentropy": 2.5985008478164673, + "loss/fcd": 1.12890625, + "loss/logits": 0.2603040784597397, + "step": 894 + }, + { + "epoch": 0.015460489380630339, + "grad_norm": 0.294921875, + "grad_norm_var": 0.0003986199696858724, + "learning_rate": 0.01, + "loss": 1.3509, + "loss/crossentropy": 2.3431901335716248, + "loss/fcd": 1.08984375, + "loss/logits": 0.22906331717967987, + "step": 895 + }, + { + "epoch": 0.015477763670441099, + "grad_norm": 0.296875, + "grad_norm_var": 0.0004066308339436849, + "learning_rate": 0.01, + "loss": 1.3537, + "loss/crossentropy": 2.4866254329681396, + "loss/fcd": 1.1015625, + "loss/logits": 0.23776976764202118, + "step": 896 + }, + { + "epoch": 0.015495037960251859, + "grad_norm": 0.3203125, + "grad_norm_var": 0.00040791829427083335, + "learning_rate": 0.01, + "loss": 1.3942, + "loss/crossentropy": 2.656658411026001, + "loss/fcd": 1.11328125, + "loss/logits": 0.265699565410614, + "step": 897 + }, + { + "epoch": 0.015512312250062619, + "grad_norm": 0.30078125, + "grad_norm_var": 0.0004048506418863932, + "learning_rate": 0.01, + "loss": 1.398, + "loss/crossentropy": 2.508056640625, + "loss/fcd": 1.14453125, + "loss/logits": 0.2679043859243393, + "step": 898 + }, + { + "epoch": 0.01552958653987338, + "grad_norm": 0.31640625, + "grad_norm_var": 0.00033086140950520834, + "learning_rate": 0.01, + "loss": 1.3786, + "loss/crossentropy": 2.241898775100708, + "loss/fcd": 1.08984375, + "loss/logits": 0.23984474688768387, + "step": 899 + }, + { + "epoch": 0.01554686082968414, + "grad_norm": 0.291015625, + "grad_norm_var": 0.0003639062245686849, + "learning_rate": 0.01, + "loss": 1.4062, + "loss/crossentropy": 2.563822388648987, + "loss/fcd": 1.12890625, + "loss/logits": 0.2376401573419571, + "step": 900 + }, + { + "epoch": 0.0155641351194949, + "grad_norm": 0.2890625, + "grad_norm_var": 0.0003350416819254557, + "learning_rate": 0.01, + "loss": 1.3943, + "loss/crossentropy": 2.4819493293762207, + "loss/fcd": 1.140625, + "loss/logits": 0.26604655385017395, + "step": 901 + }, + { + "epoch": 0.01558140940930566, + "grad_norm": 0.30078125, + "grad_norm_var": 0.0003422419230143229, + "learning_rate": 0.01, + "loss": 1.438, + "loss/crossentropy": 2.6099933385849, + "loss/fcd": 1.21484375, + "loss/logits": 0.2890657037496567, + "step": 902 + }, + { + "epoch": 0.01559868369911642, + "grad_norm": 0.3125, + "grad_norm_var": 0.00033817291259765627, + "learning_rate": 0.01, + "loss": 1.4034, + "loss/crossentropy": 2.5849201679229736, + "loss/fcd": 1.16796875, + "loss/logits": 0.2732825428247452, + "step": 903 + }, + { + "epoch": 0.01561595798892718, + "grad_norm": 0.322265625, + "grad_norm_var": 0.0003444512685139974, + "learning_rate": 0.01, + "loss": 1.3811, + "loss/crossentropy": 2.3671282529830933, + "loss/fcd": 1.10546875, + "loss/logits": 0.24938072264194489, + "step": 904 + }, + { + "epoch": 0.01563323227873794, + "grad_norm": 0.3125, + "grad_norm_var": 0.00024871826171875, + "learning_rate": 0.01, + "loss": 1.3843, + "loss/crossentropy": 2.1398147344589233, + "loss/fcd": 1.07421875, + "loss/logits": 0.2394903600215912, + "step": 905 + }, + { + "epoch": 0.0156505065685487, + "grad_norm": 0.2890625, + "grad_norm_var": 0.0002757867177327474, + "learning_rate": 0.01, + "loss": 1.3808, + "loss/crossentropy": 2.3531702756881714, + "loss/fcd": 1.08984375, + "loss/logits": 0.25511349737644196, + "step": 906 + }, + { + "epoch": 0.01566778085835946, + "grad_norm": 0.32421875, + "grad_norm_var": 0.0002784570058186849, + "learning_rate": 0.01, + "loss": 1.3835, + "loss/crossentropy": 2.5271737575531006, + "loss/fcd": 1.109375, + "loss/logits": 0.25303974002599716, + "step": 907 + }, + { + "epoch": 0.015685055148170222, + "grad_norm": 0.287109375, + "grad_norm_var": 0.0003013451894124349, + "learning_rate": 0.01, + "loss": 1.3966, + "loss/crossentropy": 2.50630259513855, + "loss/fcd": 1.25390625, + "loss/logits": 0.28888703882694244, + "step": 908 + }, + { + "epoch": 0.015702329437980982, + "grad_norm": 0.294921875, + "grad_norm_var": 0.00022068023681640626, + "learning_rate": 0.01, + "loss": 1.3997, + "loss/crossentropy": 2.6066339015960693, + "loss/fcd": 1.13671875, + "loss/logits": 0.24285603314638138, + "step": 909 + }, + { + "epoch": 0.015719603727791742, + "grad_norm": 0.296875, + "grad_norm_var": 0.00016541481018066405, + "learning_rate": 0.01, + "loss": 1.3514, + "loss/crossentropy": 2.349377751350403, + "loss/fcd": 1.07421875, + "loss/logits": 0.2379670813679695, + "step": 910 + }, + { + "epoch": 0.015736878017602502, + "grad_norm": 0.283203125, + "grad_norm_var": 0.00018677711486816406, + "learning_rate": 0.01, + "loss": 1.3697, + "loss/crossentropy": 2.493922233581543, + "loss/fcd": 1.109375, + "loss/logits": 0.2561178654432297, + "step": 911 + }, + { + "epoch": 0.015754152307413262, + "grad_norm": 0.279296875, + "grad_norm_var": 0.000218963623046875, + "learning_rate": 0.01, + "loss": 1.3902, + "loss/crossentropy": 2.17154997587204, + "loss/fcd": 1.04296875, + "loss/logits": 0.22504562884569168, + "step": 912 + }, + { + "epoch": 0.015771426597224022, + "grad_norm": 0.27734375, + "grad_norm_var": 0.00022525787353515624, + "learning_rate": 0.01, + "loss": 1.3458, + "loss/crossentropy": 2.4228713512420654, + "loss/fcd": 1.125, + "loss/logits": 0.26753516495227814, + "step": 913 + }, + { + "epoch": 0.01578870088703478, + "grad_norm": 0.3046875, + "grad_norm_var": 0.00022735595703125, + "learning_rate": 0.01, + "loss": 1.4741, + "loss/crossentropy": 2.25216805934906, + "loss/fcd": 1.23046875, + "loss/logits": 0.33171379566192627, + "step": 914 + }, + { + "epoch": 0.01580597517684554, + "grad_norm": 0.30859375, + "grad_norm_var": 0.000212860107421875, + "learning_rate": 0.01, + "loss": 1.3429, + "loss/crossentropy": 2.1387062072753906, + "loss/fcd": 1.07421875, + "loss/logits": 0.24266959726810455, + "step": 915 + }, + { + "epoch": 0.0158232494666563, + "grad_norm": 0.33203125, + "grad_norm_var": 0.00027794837951660155, + "learning_rate": 0.01, + "loss": 1.4493, + "loss/crossentropy": 2.02074271440506, + "loss/fcd": 1.23828125, + "loss/logits": 0.25191547721624374, + "step": 916 + }, + { + "epoch": 0.01584052375646706, + "grad_norm": 0.7734375, + "grad_norm_var": 0.01417692502339681, + "learning_rate": 0.01, + "loss": 1.4196, + "loss/crossentropy": 2.47384512424469, + "loss/fcd": 1.1484375, + "loss/logits": 0.2742984741926193, + "step": 917 + }, + { + "epoch": 0.01585779804627782, + "grad_norm": 0.291015625, + "grad_norm_var": 0.014222462972005209, + "learning_rate": 0.01, + "loss": 1.3766, + "loss/crossentropy": 2.5627119541168213, + "loss/fcd": 1.18359375, + "loss/logits": 0.27059850841760635, + "step": 918 + }, + { + "epoch": 0.01587507233608858, + "grad_norm": 0.31640625, + "grad_norm_var": 0.014214007059733073, + "learning_rate": 0.01, + "loss": 1.4257, + "loss/crossentropy": 2.5728260278701782, + "loss/fcd": 1.11328125, + "loss/logits": 0.26422248035669327, + "step": 919 + }, + { + "epoch": 0.015892346625899344, + "grad_norm": 0.306640625, + "grad_norm_var": 0.01424706776936849, + "learning_rate": 0.01, + "loss": 1.3441, + "loss/crossentropy": 2.3634893894195557, + "loss/fcd": 1.13671875, + "loss/logits": 0.2779320180416107, + "step": 920 + }, + { + "epoch": 0.015909620915710104, + "grad_norm": 0.3359375, + "grad_norm_var": 0.01422723134358724, + "learning_rate": 0.01, + "loss": 1.4555, + "loss/crossentropy": 2.176904857158661, + "loss/fcd": 1.2109375, + "loss/logits": 0.2693602591753006, + "step": 921 + }, + { + "epoch": 0.015926895205520864, + "grad_norm": 0.30078125, + "grad_norm_var": 0.014169820149739583, + "learning_rate": 0.01, + "loss": 1.3614, + "loss/crossentropy": 2.611035466194153, + "loss/fcd": 1.16796875, + "loss/logits": 0.2582136243581772, + "step": 922 + }, + { + "epoch": 0.015944169495331624, + "grad_norm": 0.3125, + "grad_norm_var": 0.014190610249837239, + "learning_rate": 0.01, + "loss": 1.3666, + "loss/crossentropy": 2.353346347808838, + "loss/fcd": 1.11328125, + "loss/logits": 0.24240678548812866, + "step": 923 + }, + { + "epoch": 0.015961443785142384, + "grad_norm": 0.283203125, + "grad_norm_var": 0.014214579264322917, + "learning_rate": 0.01, + "loss": 1.3461, + "loss/crossentropy": 2.3549081087112427, + "loss/fcd": 1.07421875, + "loss/logits": 0.2364579290151596, + "step": 924 + }, + { + "epoch": 0.015978718074953144, + "grad_norm": 0.29296875, + "grad_norm_var": 0.014224227269490559, + "learning_rate": 0.01, + "loss": 1.3649, + "loss/crossentropy": 2.4736167192459106, + "loss/fcd": 1.15234375, + "loss/logits": 0.26356005668640137, + "step": 925 + }, + { + "epoch": 0.015995992364763904, + "grad_norm": 0.310546875, + "grad_norm_var": 0.01417382558186849, + "learning_rate": 0.01, + "loss": 1.4136, + "loss/crossentropy": 2.3580808639526367, + "loss/fcd": 1.2578125, + "loss/logits": 0.2911546379327774, + "step": 926 + }, + { + "epoch": 0.016013266654574664, + "grad_norm": 0.298828125, + "grad_norm_var": 0.014087867736816407, + "learning_rate": 0.01, + "loss": 1.3582, + "loss/crossentropy": 2.476295828819275, + "loss/fcd": 1.171875, + "loss/logits": 0.267447791993618, + "step": 927 + }, + { + "epoch": 0.016030540944385423, + "grad_norm": 0.322265625, + "grad_norm_var": 0.013896942138671875, + "learning_rate": 0.01, + "loss": 1.411, + "loss/crossentropy": 2.6316243410110474, + "loss/fcd": 1.16796875, + "loss/logits": 0.2681735157966614, + "step": 928 + }, + { + "epoch": 0.016047815234196183, + "grad_norm": 0.28515625, + "grad_norm_var": 0.013840230305989583, + "learning_rate": 0.01, + "loss": 1.3853, + "loss/crossentropy": 2.5550700426101685, + "loss/fcd": 1.12109375, + "loss/logits": 0.25278639793395996, + "step": 929 + }, + { + "epoch": 0.016065089524006943, + "grad_norm": 0.296875, + "grad_norm_var": 0.013876597086588541, + "learning_rate": 0.01, + "loss": 1.4125, + "loss/crossentropy": 2.511132836341858, + "loss/fcd": 1.1796875, + "loss/logits": 0.26167523860931396, + "step": 930 + }, + { + "epoch": 0.016082363813817703, + "grad_norm": 0.279296875, + "grad_norm_var": 0.01403514544169108, + "learning_rate": 0.01, + "loss": 1.351, + "loss/crossentropy": 2.468320608139038, + "loss/fcd": 1.1484375, + "loss/logits": 0.254236102104187, + "step": 931 + }, + { + "epoch": 0.016099638103628463, + "grad_norm": 0.3125, + "grad_norm_var": 0.014063119888305664, + "learning_rate": 0.01, + "loss": 1.3762, + "loss/crossentropy": 2.7182319164276123, + "loss/fcd": 1.1171875, + "loss/logits": 0.25874409079551697, + "step": 932 + }, + { + "epoch": 0.016116912393439226, + "grad_norm": 0.30859375, + "grad_norm_var": 0.00023280779520670574, + "learning_rate": 0.01, + "loss": 1.3703, + "loss/crossentropy": 2.3206039667129517, + "loss/fcd": 1.09765625, + "loss/logits": 0.2651352882385254, + "step": 933 + }, + { + "epoch": 0.016134186683249986, + "grad_norm": 0.310546875, + "grad_norm_var": 0.00022454261779785155, + "learning_rate": 0.01, + "loss": 1.3802, + "loss/crossentropy": 2.498626470565796, + "loss/fcd": 1.13671875, + "loss/logits": 0.259146973490715, + "step": 934 + }, + { + "epoch": 0.016151460973060746, + "grad_norm": 0.28125, + "grad_norm_var": 0.00024628639221191406, + "learning_rate": 0.01, + "loss": 1.3612, + "loss/crossentropy": 2.3583563566207886, + "loss/fcd": 1.06640625, + "loss/logits": 0.24286328256130219, + "step": 935 + }, + { + "epoch": 0.016168735262871506, + "grad_norm": 0.29296875, + "grad_norm_var": 0.00025018056233723957, + "learning_rate": 0.01, + "loss": 1.4221, + "loss/crossentropy": 2.4976600408554077, + "loss/fcd": 1.234375, + "loss/logits": 0.274882972240448, + "step": 936 + }, + { + "epoch": 0.016186009552682266, + "grad_norm": 0.376953125, + "grad_norm_var": 0.0005435784657796224, + "learning_rate": 0.01, + "loss": 1.4084, + "loss/crossentropy": 2.4435365200042725, + "loss/fcd": 1.16796875, + "loss/logits": 0.2811162769794464, + "step": 937 + }, + { + "epoch": 0.016203283842493026, + "grad_norm": 0.28515625, + "grad_norm_var": 0.0005657037099202473, + "learning_rate": 0.01, + "loss": 1.3418, + "loss/crossentropy": 2.3197275400161743, + "loss/fcd": 1.1015625, + "loss/logits": 0.26322872936725616, + "step": 938 + }, + { + "epoch": 0.016220558132303785, + "grad_norm": 0.2890625, + "grad_norm_var": 0.0005706628163655599, + "learning_rate": 0.01, + "loss": 1.3881, + "loss/crossentropy": 2.6520742177963257, + "loss/fcd": 1.16015625, + "loss/logits": 0.2682619243860245, + "step": 939 + }, + { + "epoch": 0.016237832422114545, + "grad_norm": 0.28125, + "grad_norm_var": 0.0005757013956705729, + "learning_rate": 0.01, + "loss": 1.3772, + "loss/crossentropy": 2.4414173364639282, + "loss/fcd": 1.09375, + "loss/logits": 0.23820270597934723, + "step": 940 + }, + { + "epoch": 0.016255106711925305, + "grad_norm": 0.294921875, + "grad_norm_var": 0.0005737145741780599, + "learning_rate": 0.01, + "loss": 1.4165, + "loss/crossentropy": 2.4042497873306274, + "loss/fcd": 1.1328125, + "loss/logits": 0.2601849138736725, + "step": 941 + }, + { + "epoch": 0.016272381001736065, + "grad_norm": 0.30859375, + "grad_norm_var": 0.0005716323852539062, + "learning_rate": 0.01, + "loss": 1.4208, + "loss/crossentropy": 2.4315325021743774, + "loss/fcd": 1.16015625, + "loss/logits": 0.2801144868135452, + "step": 942 + }, + { + "epoch": 0.016289655291546825, + "grad_norm": 0.33984375, + "grad_norm_var": 0.0006620883941650391, + "learning_rate": 0.01, + "loss": 1.4608, + "loss/crossentropy": 2.545047879219055, + "loss/fcd": 1.2890625, + "loss/logits": 0.33230888843536377, + "step": 943 + }, + { + "epoch": 0.016306929581357585, + "grad_norm": 0.2734375, + "grad_norm_var": 0.0006926854451497396, + "learning_rate": 0.01, + "loss": 1.329, + "loss/crossentropy": 2.259741187095642, + "loss/fcd": 1.06640625, + "loss/logits": 0.2455529421567917, + "step": 944 + }, + { + "epoch": 0.016324203871168348, + "grad_norm": 0.294921875, + "grad_norm_var": 0.0006779829661051432, + "learning_rate": 0.01, + "loss": 1.3409, + "loss/crossentropy": 2.3239141702651978, + "loss/fcd": 1.10546875, + "loss/logits": 0.24660293757915497, + "step": 945 + }, + { + "epoch": 0.016341478160979108, + "grad_norm": 0.33203125, + "grad_norm_var": 0.0007329146067301432, + "learning_rate": 0.01, + "loss": 1.4681, + "loss/crossentropy": 2.3145695328712463, + "loss/fcd": 1.1015625, + "loss/logits": 0.24830932170152664, + "step": 946 + }, + { + "epoch": 0.016358752450789868, + "grad_norm": 0.330078125, + "grad_norm_var": 0.0007279555002848308, + "learning_rate": 0.01, + "loss": 1.5011, + "loss/crossentropy": 2.350569486618042, + "loss/fcd": 1.1875, + "loss/logits": 0.2759709805250168, + "step": 947 + }, + { + "epoch": 0.016376026740600628, + "grad_norm": 0.470703125, + "grad_norm_var": 0.0024080912272135416, + "learning_rate": 0.01, + "loss": 1.52, + "loss/crossentropy": 2.034683883190155, + "loss/fcd": 1.2421875, + "loss/logits": 0.28756849467754364, + "step": 948 + }, + { + "epoch": 0.016393301030411388, + "grad_norm": 0.33984375, + "grad_norm_var": 0.002434539794921875, + "learning_rate": 0.01, + "loss": 1.4182, + "loss/crossentropy": 2.5900092124938965, + "loss/fcd": 1.15625, + "loss/logits": 0.26620975136756897, + "step": 949 + }, + { + "epoch": 0.016410575320222148, + "grad_norm": 0.314453125, + "grad_norm_var": 0.002431170145670573, + "learning_rate": 0.01, + "loss": 1.4163, + "loss/crossentropy": 2.458656430244446, + "loss/fcd": 1.1953125, + "loss/logits": 0.27218569815158844, + "step": 950 + }, + { + "epoch": 0.016427849610032907, + "grad_norm": 0.326171875, + "grad_norm_var": 0.002330636978149414, + "learning_rate": 0.01, + "loss": 1.5638, + "loss/crossentropy": 2.581447720527649, + "loss/fcd": 1.2265625, + "loss/logits": 0.2988656759262085, + "step": 951 + }, + { + "epoch": 0.016445123899843667, + "grad_norm": 0.412109375, + "grad_norm_var": 0.002758216857910156, + "learning_rate": 0.01, + "loss": 1.5667, + "loss/crossentropy": 2.21357798576355, + "loss/fcd": 1.2421875, + "loss/logits": 0.30781693756580353, + "step": 952 + }, + { + "epoch": 0.016462398189654427, + "grad_norm": 0.29296875, + "grad_norm_var": 0.0026659488677978514, + "learning_rate": 0.01, + "loss": 1.3711, + "loss/crossentropy": 2.1623282432556152, + "loss/fcd": 1.06640625, + "loss/logits": 0.24749789386987686, + "step": 953 + }, + { + "epoch": 0.016479672479465187, + "grad_norm": 0.306640625, + "grad_norm_var": 0.0025832494099934894, + "learning_rate": 0.01, + "loss": 1.3663, + "loss/crossentropy": 2.683838129043579, + "loss/fcd": 1.19921875, + "loss/logits": 0.2529330998659134, + "step": 954 + }, + { + "epoch": 0.016496946769275947, + "grad_norm": 0.265625, + "grad_norm_var": 0.0027312596638997396, + "learning_rate": 0.01, + "loss": 1.3548, + "loss/crossentropy": 2.3420257568359375, + "loss/fcd": 1.0859375, + "loss/logits": 0.253268837928772, + "step": 955 + }, + { + "epoch": 0.016514221059086707, + "grad_norm": 0.35546875, + "grad_norm_var": 0.002652740478515625, + "learning_rate": 0.01, + "loss": 1.369, + "loss/crossentropy": 2.3265002965927124, + "loss/fcd": 1.1171875, + "loss/logits": 0.24975580722093582, + "step": 956 + }, + { + "epoch": 0.01653149534889747, + "grad_norm": 0.3203125, + "grad_norm_var": 0.0025789737701416016, + "learning_rate": 0.01, + "loss": 1.3913, + "loss/crossentropy": 2.4944993257522583, + "loss/fcd": 1.16796875, + "loss/logits": 0.25516972690820694, + "step": 957 + }, + { + "epoch": 0.01654876963870823, + "grad_norm": 0.3046875, + "grad_norm_var": 0.0025911808013916017, + "learning_rate": 0.01, + "loss": 1.3542, + "loss/crossentropy": 2.583009362220764, + "loss/fcd": 1.15625, + "loss/logits": 0.26096589863300323, + "step": 958 + }, + { + "epoch": 0.01656604392851899, + "grad_norm": 0.287109375, + "grad_norm_var": 0.002695465087890625, + "learning_rate": 0.01, + "loss": 1.4014, + "loss/crossentropy": 2.6060469150543213, + "loss/fcd": 1.1796875, + "loss/logits": 0.298343300819397, + "step": 959 + }, + { + "epoch": 0.01658331821832975, + "grad_norm": 0.322265625, + "grad_norm_var": 0.0024979750315348307, + "learning_rate": 0.01, + "loss": 1.4127, + "loss/crossentropy": 2.4206702709198, + "loss/fcd": 1.140625, + "loss/logits": 0.2592027187347412, + "step": 960 + }, + { + "epoch": 0.01660059250814051, + "grad_norm": 0.302734375, + "grad_norm_var": 0.002465550104777018, + "learning_rate": 0.01, + "loss": 1.4039, + "loss/crossentropy": 2.18042528629303, + "loss/fcd": 1.10546875, + "loss/logits": 0.28101974725723267, + "step": 961 + }, + { + "epoch": 0.01661786679795127, + "grad_norm": 0.353515625, + "grad_norm_var": 0.0024996439615885416, + "learning_rate": 0.01, + "loss": 1.3448, + "loss/crossentropy": 2.2248626947402954, + "loss/fcd": 1.07421875, + "loss/logits": 0.2191808819770813, + "step": 962 + }, + { + "epoch": 0.01663514108776203, + "grad_norm": 0.357421875, + "grad_norm_var": 0.002541033426920573, + "learning_rate": 0.01, + "loss": 1.4061, + "loss/crossentropy": 2.476745128631592, + "loss/fcd": 1.14453125, + "loss/logits": 0.26761066913604736, + "step": 963 + }, + { + "epoch": 0.01665241537757279, + "grad_norm": 0.33984375, + "grad_norm_var": 0.00121305783589681, + "learning_rate": 0.01, + "loss": 1.427, + "loss/crossentropy": 2.3096065521240234, + "loss/fcd": 1.234375, + "loss/logits": 0.42609208822250366, + "step": 964 + }, + { + "epoch": 0.01666968966738355, + "grad_norm": 0.31640625, + "grad_norm_var": 0.0012012322743733723, + "learning_rate": 0.01, + "loss": 1.3957, + "loss/crossentropy": 2.7282618284225464, + "loss/fcd": 1.20703125, + "loss/logits": 0.28854241967201233, + "step": 965 + }, + { + "epoch": 0.01668696395719431, + "grad_norm": 0.291015625, + "grad_norm_var": 0.0012641747792561848, + "learning_rate": 0.01, + "loss": 1.3752, + "loss/crossentropy": 2.339871048927307, + "loss/fcd": 1.08984375, + "loss/logits": 0.2586899399757385, + "step": 966 + }, + { + "epoch": 0.01670423824700507, + "grad_norm": 0.263671875, + "grad_norm_var": 0.001474746068318685, + "learning_rate": 0.01, + "loss": 1.3006, + "loss/crossentropy": 2.3013978004455566, + "loss/fcd": 1.046875, + "loss/logits": 0.22273491322994232, + "step": 967 + }, + { + "epoch": 0.01672151253681583, + "grad_norm": 0.30078125, + "grad_norm_var": 0.0008559544881184896, + "learning_rate": 0.01, + "loss": 1.4225, + "loss/crossentropy": 2.47222638130188, + "loss/fcd": 1.2421875, + "loss/logits": 0.2986321449279785, + "step": 968 + }, + { + "epoch": 0.01673878682662659, + "grad_norm": 0.29296875, + "grad_norm_var": 0.0008559544881184896, + "learning_rate": 0.01, + "loss": 1.4188, + "loss/crossentropy": 2.2383479475975037, + "loss/fcd": 1.2265625, + "loss/logits": 0.3132626414299011, + "step": 969 + }, + { + "epoch": 0.016756061116437352, + "grad_norm": 0.32421875, + "grad_norm_var": 0.0008643945058186849, + "learning_rate": 0.01, + "loss": 1.363, + "loss/crossentropy": 2.5179413557052612, + "loss/fcd": 1.09375, + "loss/logits": 0.2516755014657974, + "step": 970 + }, + { + "epoch": 0.016773335406248112, + "grad_norm": 0.279296875, + "grad_norm_var": 0.0007908503214518229, + "learning_rate": 0.01, + "loss": 1.3967, + "loss/crossentropy": 1.9743611812591553, + "loss/fcd": 1.05859375, + "loss/logits": 0.24054741859436035, + "step": 971 + }, + { + "epoch": 0.016790609696058872, + "grad_norm": 0.275390625, + "grad_norm_var": 0.000740671157836914, + "learning_rate": 0.01, + "loss": 1.3595, + "loss/crossentropy": 2.405099630355835, + "loss/fcd": 1.15234375, + "loss/logits": 0.28836295008659363, + "step": 972 + }, + { + "epoch": 0.01680788398586963, + "grad_norm": 0.3046875, + "grad_norm_var": 0.0007307529449462891, + "learning_rate": 0.01, + "loss": 1.4048, + "loss/crossentropy": 2.583898901939392, + "loss/fcd": 1.2109375, + "loss/logits": 0.2704490125179291, + "step": 973 + }, + { + "epoch": 0.01682515827568039, + "grad_norm": 0.3125, + "grad_norm_var": 0.0007318973541259766, + "learning_rate": 0.01, + "loss": 1.4402, + "loss/crossentropy": 2.486370801925659, + "loss/fcd": 1.140625, + "loss/logits": 0.2756696939468384, + "step": 974 + }, + { + "epoch": 0.01684243256549115, + "grad_norm": 0.28125, + "grad_norm_var": 0.0007501602172851563, + "learning_rate": 0.01, + "loss": 1.3591, + "loss/crossentropy": 2.421715497970581, + "loss/fcd": 1.0390625, + "loss/logits": 0.22876735776662827, + "step": 975 + }, + { + "epoch": 0.01685970685530191, + "grad_norm": 0.318359375, + "grad_norm_var": 0.0007433573404947917, + "learning_rate": 0.01, + "loss": 1.3213, + "loss/crossentropy": 2.4171801805496216, + "loss/fcd": 1.09765625, + "loss/logits": 0.23518116772174835, + "step": 976 + }, + { + "epoch": 0.01687698114511267, + "grad_norm": 0.4140625, + "grad_norm_var": 0.0014527479807535807, + "learning_rate": 0.01, + "loss": 1.4776, + "loss/crossentropy": 2.080851912498474, + "loss/fcd": 1.26953125, + "loss/logits": 0.22676381468772888, + "step": 977 + }, + { + "epoch": 0.01689425543492343, + "grad_norm": 0.2734375, + "grad_norm_var": 0.0014325459798177084, + "learning_rate": 0.01, + "loss": 1.3453, + "loss/crossentropy": 2.2649213075637817, + "loss/fcd": 1.09765625, + "loss/logits": 0.2382289096713066, + "step": 978 + }, + { + "epoch": 0.01691152972473419, + "grad_norm": 0.26953125, + "grad_norm_var": 0.00134886105855306, + "learning_rate": 0.01, + "loss": 1.4216, + "loss/crossentropy": 2.4842547178268433, + "loss/fcd": 1.16015625, + "loss/logits": 0.27352918684482574, + "step": 979 + }, + { + "epoch": 0.01692880401454495, + "grad_norm": 0.291015625, + "grad_norm_var": 0.0012618382771809897, + "learning_rate": 0.01, + "loss": 1.3782, + "loss/crossentropy": 2.163589835166931, + "loss/fcd": 1.125, + "loss/logits": 0.26143455505371094, + "step": 980 + }, + { + "epoch": 0.01694607830435571, + "grad_norm": 0.28515625, + "grad_norm_var": 0.0012567520141601562, + "learning_rate": 0.01, + "loss": 1.3912, + "loss/crossentropy": 2.421532988548279, + "loss/fcd": 1.08203125, + "loss/logits": 0.23204928636550903, + "step": 981 + }, + { + "epoch": 0.016963352594166474, + "grad_norm": 0.28125, + "grad_norm_var": 0.00127256711324056, + "learning_rate": 0.01, + "loss": 1.3826, + "loss/crossentropy": 2.607829451560974, + "loss/fcd": 1.125, + "loss/logits": 0.2582753002643585, + "step": 982 + }, + { + "epoch": 0.016980626883977234, + "grad_norm": 0.3359375, + "grad_norm_var": 0.0012684504191080729, + "learning_rate": 0.01, + "loss": 1.3938, + "loss/crossentropy": 2.430111050605774, + "loss/fcd": 1.10546875, + "loss/logits": 0.2326122149825096, + "step": 983 + }, + { + "epoch": 0.016997901173787994, + "grad_norm": 0.369140625, + "grad_norm_var": 0.001544936498006185, + "learning_rate": 0.01, + "loss": 1.4349, + "loss/crossentropy": 2.584348440170288, + "loss/fcd": 1.18359375, + "loss/logits": 0.27420538663864136, + "step": 984 + }, + { + "epoch": 0.017015175463598754, + "grad_norm": 0.333984375, + "grad_norm_var": 0.0015746434529622397, + "learning_rate": 0.01, + "loss": 1.4002, + "loss/crossentropy": 2.6233400106430054, + "loss/fcd": 1.13671875, + "loss/logits": 0.2728031575679779, + "step": 985 + }, + { + "epoch": 0.017032449753409513, + "grad_norm": 0.3203125, + "grad_norm_var": 0.001567840576171875, + "learning_rate": 0.01, + "loss": 1.3921, + "loss/crossentropy": 2.2127867937088013, + "loss/fcd": 1.1328125, + "loss/logits": 0.24761803448200226, + "step": 986 + }, + { + "epoch": 0.017049724043220273, + "grad_norm": 0.322265625, + "grad_norm_var": 0.0015125910441080729, + "learning_rate": 0.01, + "loss": 1.4117, + "loss/crossentropy": 2.4916510581970215, + "loss/fcd": 1.140625, + "loss/logits": 0.2528844252228737, + "step": 987 + }, + { + "epoch": 0.017066998333031033, + "grad_norm": 0.28515625, + "grad_norm_var": 0.0014711856842041016, + "learning_rate": 0.01, + "loss": 1.3702, + "loss/crossentropy": 2.076325237751007, + "loss/fcd": 1.109375, + "loss/logits": 0.24822547286748886, + "step": 988 + }, + { + "epoch": 0.017084272622841793, + "grad_norm": 0.28515625, + "grad_norm_var": 0.0015150547027587891, + "learning_rate": 0.01, + "loss": 1.375, + "loss/crossentropy": 2.2751649618148804, + "loss/fcd": 1.06640625, + "loss/logits": 0.2591545879840851, + "step": 989 + }, + { + "epoch": 0.017101546912652553, + "grad_norm": 0.3046875, + "grad_norm_var": 0.001517470677693685, + "learning_rate": 0.01, + "loss": 1.3438, + "loss/crossentropy": 2.564236044883728, + "loss/fcd": 1.10546875, + "loss/logits": 0.2575865834951401, + "step": 990 + }, + { + "epoch": 0.017118821202463313, + "grad_norm": 0.27734375, + "grad_norm_var": 0.0015337467193603516, + "learning_rate": 0.01, + "loss": 1.2948, + "loss/crossentropy": 2.322708249092102, + "loss/fcd": 1.0859375, + "loss/logits": 0.23693984001874924, + "step": 991 + }, + { + "epoch": 0.017136095492274073, + "grad_norm": 0.30078125, + "grad_norm_var": 0.0015344619750976562, + "learning_rate": 0.01, + "loss": 1.4124, + "loss/crossentropy": 2.4255528450012207, + "loss/fcd": 1.171875, + "loss/logits": 0.25587616115808487, + "step": 992 + }, + { + "epoch": 0.017153369782084833, + "grad_norm": 0.275390625, + "grad_norm_var": 0.0007997989654541015, + "learning_rate": 0.01, + "loss": 1.3437, + "loss/crossentropy": 2.5350613594055176, + "loss/fcd": 1.12109375, + "loss/logits": 0.25402751564979553, + "step": 993 + }, + { + "epoch": 0.017170644071895596, + "grad_norm": 0.30859375, + "grad_norm_var": 0.0007494449615478516, + "learning_rate": 0.01, + "loss": 1.4055, + "loss/crossentropy": 2.5626988410949707, + "loss/fcd": 1.14453125, + "loss/logits": 0.25801587104797363, + "step": 994 + }, + { + "epoch": 0.017187918361706356, + "grad_norm": 0.3046875, + "grad_norm_var": 0.000670480728149414, + "learning_rate": 0.01, + "loss": 1.3867, + "loss/crossentropy": 2.7328250408172607, + "loss/fcd": 1.171875, + "loss/logits": 0.28935085237026215, + "step": 995 + }, + { + "epoch": 0.017205192651517116, + "grad_norm": 0.287109375, + "grad_norm_var": 0.0006787459055582683, + "learning_rate": 0.01, + "loss": 1.3854, + "loss/crossentropy": 2.2958213090896606, + "loss/fcd": 1.1328125, + "loss/logits": 0.2697945237159729, + "step": 996 + }, + { + "epoch": 0.017222466941327876, + "grad_norm": 0.28515625, + "grad_norm_var": 0.0006787459055582683, + "learning_rate": 0.01, + "loss": 1.3576, + "loss/crossentropy": 2.314937472343445, + "loss/fcd": 1.09375, + "loss/logits": 0.24704495817422867, + "step": 997 + }, + { + "epoch": 0.017239741231138635, + "grad_norm": 0.32421875, + "grad_norm_var": 0.0006591637929280598, + "learning_rate": 0.01, + "loss": 1.4405, + "loss/crossentropy": 2.582629084587097, + "loss/fcd": 1.26171875, + "loss/logits": 0.335773229598999, + "step": 998 + }, + { + "epoch": 0.017257015520949395, + "grad_norm": 0.28515625, + "grad_norm_var": 0.0006277561187744141, + "learning_rate": 0.01, + "loss": 1.3605, + "loss/crossentropy": 2.299025297164917, + "loss/fcd": 1.052734375, + "loss/logits": 0.23469385504722595, + "step": 999 + }, + { + "epoch": 0.017274289810760155, + "grad_norm": 0.26953125, + "grad_norm_var": 0.00038700103759765626, + "learning_rate": 0.01, + "loss": 1.3825, + "loss/crossentropy": 2.467602014541626, + "loss/fcd": 1.15234375, + "loss/logits": 0.2697184160351753, + "step": 1000 + } + ], + "logging_steps": 1, + "max_steps": 300000, + "num_input_tokens_seen": 0, + "num_train_epochs": 6, + "save_steps": 1000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": true, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 9.70040442617856e+17, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}