qwen37 / trainer_state.json
semran1's picture
Upload folder using huggingface_hub
769171e verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.017274289810760155,
"eval_steps": 1000,
"global_step": 1000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 1.7274289810760155e-05,
"grad_norm": 0.318359375,
"learning_rate": 0.01,
"loss": 1.4153,
"loss/crossentropy": 2.180091619491577,
"loss/fcd": 1.1796875,
"loss/logits": 0.2821359634399414,
"step": 1
},
{
"epoch": 3.454857962152031e-05,
"grad_norm": 0.3515625,
"learning_rate": 0.01,
"loss": 1.4401,
"loss/crossentropy": 2.49104380607605,
"loss/fcd": 1.1484375,
"loss/logits": 0.2602585107088089,
"step": 2
},
{
"epoch": 5.182286943228046e-05,
"grad_norm": 0.30859375,
"learning_rate": 0.01,
"loss": 1.4352,
"loss/crossentropy": 2.453463077545166,
"loss/fcd": 1.1875,
"loss/logits": 0.2847007066011429,
"step": 3
},
{
"epoch": 6.909715924304062e-05,
"grad_norm": 0.306640625,
"learning_rate": 0.01,
"loss": 1.3983,
"loss/crossentropy": 2.52145779132843,
"loss/fcd": 1.125,
"loss/logits": 0.2535911202430725,
"step": 4
},
{
"epoch": 8.637144905380078e-05,
"grad_norm": 0.35546875,
"learning_rate": 0.01,
"loss": 1.4077,
"loss/crossentropy": 2.364890694618225,
"loss/fcd": 1.10546875,
"loss/logits": 0.24292171746492386,
"step": 5
},
{
"epoch": 0.00010364573886456093,
"grad_norm": 0.310546875,
"learning_rate": 0.01,
"loss": 1.3824,
"loss/crossentropy": 2.3052154779434204,
"loss/fcd": 1.12890625,
"loss/logits": 0.24541093409061432,
"step": 6
},
{
"epoch": 0.00012092002867532108,
"grad_norm": 0.29296875,
"learning_rate": 0.01,
"loss": 1.4026,
"loss/crossentropy": 2.381744861602783,
"loss/fcd": 1.1171875,
"loss/logits": 0.2507929801940918,
"step": 7
},
{
"epoch": 0.00013819431848608124,
"grad_norm": 0.322265625,
"learning_rate": 0.01,
"loss": 1.4452,
"loss/crossentropy": 2.613944888114929,
"loss/fcd": 1.1796875,
"loss/logits": 0.27175769209861755,
"step": 8
},
{
"epoch": 0.0001554686082968414,
"grad_norm": 0.31640625,
"learning_rate": 0.01,
"loss": 1.4301,
"loss/crossentropy": 2.4438647031784058,
"loss/fcd": 1.2890625,
"loss/logits": 0.31327594816684723,
"step": 9
},
{
"epoch": 0.00017274289810760156,
"grad_norm": 0.322265625,
"learning_rate": 0.01,
"loss": 1.4258,
"loss/crossentropy": 2.689444422721863,
"loss/fcd": 1.20703125,
"loss/logits": 0.2705621272325516,
"step": 10
},
{
"epoch": 0.0001900171879183617,
"grad_norm": 0.283203125,
"learning_rate": 0.01,
"loss": 1.38,
"loss/crossentropy": 2.6325626373291016,
"loss/fcd": 1.16796875,
"loss/logits": 0.26059799641370773,
"step": 11
},
{
"epoch": 0.00020729147772912185,
"grad_norm": 0.294921875,
"learning_rate": 0.01,
"loss": 1.3964,
"loss/crossentropy": 2.2171366214752197,
"loss/fcd": 1.16015625,
"loss/logits": 0.25415121763944626,
"step": 12
},
{
"epoch": 0.00022456576753988202,
"grad_norm": 0.314453125,
"learning_rate": 0.01,
"loss": 1.4028,
"loss/crossentropy": 2.239351272583008,
"loss/fcd": 1.0625,
"loss/logits": 0.2298966646194458,
"step": 13
},
{
"epoch": 0.00024184005735064217,
"grad_norm": 0.31640625,
"learning_rate": 0.01,
"loss": 1.4218,
"loss/crossentropy": 2.712681293487549,
"loss/fcd": 1.1328125,
"loss/logits": 0.24666083604097366,
"step": 14
},
{
"epoch": 0.00025911434716140234,
"grad_norm": 0.3515625,
"learning_rate": 0.01,
"loss": 1.4074,
"loss/crossentropy": 2.6137157678604126,
"loss/fcd": 1.18359375,
"loss/logits": 0.2758009433746338,
"step": 15
},
{
"epoch": 0.0002763886369721625,
"grad_norm": 0.376953125,
"grad_norm_var": 0.0006428877512613932,
"learning_rate": 0.01,
"loss": 1.4429,
"loss/crossentropy": 2.266461730003357,
"loss/fcd": 1.203125,
"loss/logits": 0.26471851766109467,
"step": 16
},
{
"epoch": 0.00029366292678292263,
"grad_norm": 0.353515625,
"grad_norm_var": 0.0007058302561442057,
"learning_rate": 0.01,
"loss": 1.433,
"loss/crossentropy": 2.63763689994812,
"loss/fcd": 1.21875,
"loss/logits": 0.28894874453544617,
"step": 17
},
{
"epoch": 0.0003109372165936828,
"grad_norm": 0.30859375,
"grad_norm_var": 0.0006610711415608723,
"learning_rate": 0.01,
"loss": 1.4003,
"loss/crossentropy": 2.5304828882217407,
"loss/fcd": 1.13671875,
"loss/logits": 0.26741379499435425,
"step": 18
},
{
"epoch": 0.0003282115064044429,
"grad_norm": 0.3203125,
"grad_norm_var": 0.0006503899892171224,
"learning_rate": 0.01,
"loss": 1.4179,
"loss/crossentropy": 2.36896288394928,
"loss/fcd": 1.19921875,
"loss/logits": 0.2745219022035599,
"step": 19
},
{
"epoch": 0.0003454857962152031,
"grad_norm": 0.3125,
"grad_norm_var": 0.0006408055623372395,
"learning_rate": 0.01,
"loss": 1.4132,
"loss/crossentropy": 2.471444010734558,
"loss/fcd": 1.15234375,
"loss/logits": 0.24692216515541077,
"step": 20
},
{
"epoch": 0.00036276008602596327,
"grad_norm": 0.32421875,
"grad_norm_var": 0.0005624771118164062,
"learning_rate": 0.01,
"loss": 1.3532,
"loss/crossentropy": 2.4798572063446045,
"loss/fcd": 1.13671875,
"loss/logits": 0.24522659927606583,
"step": 21
},
{
"epoch": 0.0003800343758367234,
"grad_norm": 0.294921875,
"grad_norm_var": 0.0005975723266601563,
"learning_rate": 0.01,
"loss": 1.4057,
"loss/crossentropy": 2.3649370670318604,
"loss/fcd": 1.15234375,
"loss/logits": 0.26143455505371094,
"step": 22
},
{
"epoch": 0.00039730866564748356,
"grad_norm": 0.3125,
"grad_norm_var": 0.0005533854166666667,
"learning_rate": 0.01,
"loss": 1.4282,
"loss/crossentropy": 2.7900454998016357,
"loss/fcd": 1.109375,
"loss/logits": 0.256390705704689,
"step": 23
},
{
"epoch": 0.0004145829554582437,
"grad_norm": 0.283203125,
"grad_norm_var": 0.000638580322265625,
"learning_rate": 0.01,
"loss": 1.422,
"loss/crossentropy": 2.3018282651901245,
"loss/fcd": 1.14453125,
"loss/logits": 0.26084744930267334,
"step": 24
},
{
"epoch": 0.0004318572452690039,
"grad_norm": 0.298828125,
"grad_norm_var": 0.0006613254547119141,
"learning_rate": 0.01,
"loss": 1.4043,
"loss/crossentropy": 2.404328942298889,
"loss/fcd": 1.0390625,
"loss/logits": 0.24188002943992615,
"step": 25
},
{
"epoch": 0.00044913153507976405,
"grad_norm": 2.265625,
"grad_norm_var": 0.23812503814697267,
"learning_rate": 0.01,
"loss": 1.3559,
"loss/crossentropy": 2.5355838537216187,
"loss/fcd": 1.1640625,
"loss/logits": 0.24743662029504776,
"step": 26
},
{
"epoch": 0.0004664058248905242,
"grad_norm": 0.36328125,
"grad_norm_var": 0.23687055905659993,
"learning_rate": 0.01,
"loss": 1.4526,
"loss/crossentropy": 2.329304337501526,
"loss/fcd": 1.0625,
"loss/logits": 0.2358776032924652,
"step": 27
},
{
"epoch": 0.00048368011470128434,
"grad_norm": 0.279296875,
"grad_norm_var": 0.23719480832417805,
"learning_rate": 0.01,
"loss": 1.3243,
"loss/crossentropy": 2.1602375507354736,
"loss/fcd": 1.02734375,
"loss/logits": 0.21287230402231216,
"step": 28
},
{
"epoch": 0.0005009544045120445,
"grad_norm": 0.30078125,
"grad_norm_var": 0.2374394734700521,
"learning_rate": 0.01,
"loss": 1.4111,
"loss/crossentropy": 2.4278478622436523,
"loss/fcd": 1.1171875,
"loss/logits": 0.2437409982085228,
"step": 29
},
{
"epoch": 0.0005182286943228047,
"grad_norm": 0.27734375,
"grad_norm_var": 0.23818588256835938,
"learning_rate": 0.01,
"loss": 1.4091,
"loss/crossentropy": 2.5047000646591187,
"loss/fcd": 1.15625,
"loss/logits": 0.27113544940948486,
"step": 30
},
{
"epoch": 0.0005355029841335648,
"grad_norm": 0.373046875,
"grad_norm_var": 0.23796435991923015,
"learning_rate": 0.01,
"loss": 1.4498,
"loss/crossentropy": 2.3999940156936646,
"loss/fcd": 1.25,
"loss/logits": 0.27853211760520935,
"step": 31
},
{
"epoch": 0.000552777273944325,
"grad_norm": 0.3671875,
"grad_norm_var": 0.23805281321207683,
"learning_rate": 0.01,
"loss": 1.4805,
"loss/crossentropy": 2.511382222175598,
"loss/fcd": 1.3203125,
"loss/logits": 0.409069299697876,
"step": 32
},
{
"epoch": 0.0005700515637550852,
"grad_norm": 0.6875,
"grad_norm_var": 0.24118663469950358,
"learning_rate": 0.01,
"loss": 1.3432,
"loss/crossentropy": 2.5396409034729004,
"loss/fcd": 1.2421875,
"loss/logits": 0.25656259059906006,
"step": 33
},
{
"epoch": 0.0005873258535658453,
"grad_norm": 0.357421875,
"grad_norm_var": 0.24034620920817057,
"learning_rate": 0.01,
"loss": 1.4207,
"loss/crossentropy": 2.3687368631362915,
"loss/fcd": 1.109375,
"loss/logits": 0.23432840406894684,
"step": 34
},
{
"epoch": 0.0006046001433766055,
"grad_norm": 0.291015625,
"grad_norm_var": 0.2409596602121989,
"learning_rate": 0.01,
"loss": 1.4195,
"loss/crossentropy": 2.428983449935913,
"loss/fcd": 1.21484375,
"loss/logits": 0.2627260833978653,
"step": 35
},
{
"epoch": 0.0006218744331873656,
"grad_norm": 0.31640625,
"grad_norm_var": 0.2408828576405843,
"learning_rate": 0.01,
"loss": 1.372,
"loss/crossentropy": 2.827309250831604,
"loss/fcd": 1.10546875,
"loss/logits": 0.2433805763721466,
"step": 36
},
{
"epoch": 0.0006391487229981258,
"grad_norm": 0.28125,
"grad_norm_var": 0.24178783098856607,
"learning_rate": 0.01,
"loss": 1.388,
"loss/crossentropy": 2.4543423652648926,
"loss/fcd": 1.13671875,
"loss/logits": 0.2694346010684967,
"step": 37
},
{
"epoch": 0.0006564230128088858,
"grad_norm": 0.302734375,
"grad_norm_var": 0.24162036577860516,
"learning_rate": 0.01,
"loss": 1.4109,
"loss/crossentropy": 2.5903791189193726,
"loss/fcd": 1.12109375,
"loss/logits": 0.246421679854393,
"step": 38
},
{
"epoch": 0.000673697302619646,
"grad_norm": 0.32421875,
"grad_norm_var": 0.24139873186747232,
"learning_rate": 0.01,
"loss": 1.4232,
"loss/crossentropy": 2.248749613761902,
"loss/fcd": 1.09375,
"loss/logits": 0.23829498887062073,
"step": 39
},
{
"epoch": 0.0006909715924304062,
"grad_norm": 0.31640625,
"grad_norm_var": 0.24068241119384765,
"learning_rate": 0.01,
"loss": 1.4025,
"loss/crossentropy": 2.52192759513855,
"loss/fcd": 1.21875,
"loss/logits": 0.3120736628770828,
"step": 40
},
{
"epoch": 0.0007082458822411663,
"grad_norm": 0.294921875,
"grad_norm_var": 0.24076868693033854,
"learning_rate": 0.01,
"loss": 1.3388,
"loss/crossentropy": 2.4299440383911133,
"loss/fcd": 1.05078125,
"loss/logits": 0.21974454075098038,
"step": 41
},
{
"epoch": 0.0007255201720519265,
"grad_norm": 0.361328125,
"grad_norm_var": 0.00956584612528483,
"learning_rate": 0.01,
"loss": 1.3783,
"loss/crossentropy": 2.3354129791259766,
"loss/fcd": 1.12109375,
"loss/logits": 0.22372399270534515,
"step": 42
},
{
"epoch": 0.0007427944618626866,
"grad_norm": 0.2734375,
"grad_norm_var": 0.009831984837849935,
"learning_rate": 0.01,
"loss": 1.3578,
"loss/crossentropy": 2.3422107696533203,
"loss/fcd": 1.0703125,
"loss/logits": 0.22979970276355743,
"step": 43
},
{
"epoch": 0.0007600687516734468,
"grad_norm": 0.337890625,
"grad_norm_var": 0.009589751561482748,
"learning_rate": 0.01,
"loss": 1.4869,
"loss/crossentropy": 2.4120657444000244,
"loss/fcd": 1.22265625,
"loss/logits": 0.27795399725437164,
"step": 44
},
{
"epoch": 0.000777343041484207,
"grad_norm": 0.314453125,
"grad_norm_var": 0.009527333577473958,
"learning_rate": 0.01,
"loss": 1.3861,
"loss/crossentropy": 2.66101336479187,
"loss/fcd": 1.15625,
"loss/logits": 0.25736863911151886,
"step": 45
},
{
"epoch": 0.0007946173312949671,
"grad_norm": 0.298828125,
"grad_norm_var": 0.009370152155558269,
"learning_rate": 0.01,
"loss": 1.4078,
"loss/crossentropy": 2.5887415409088135,
"loss/fcd": 1.17578125,
"loss/logits": 0.285249263048172,
"step": 46
},
{
"epoch": 0.0008118916211057273,
"grad_norm": 0.271484375,
"grad_norm_var": 0.009616454442342123,
"learning_rate": 0.01,
"loss": 1.3142,
"loss/crossentropy": 2.5115991830825806,
"loss/fcd": 1.0703125,
"loss/logits": 0.23692379146814346,
"step": 47
},
{
"epoch": 0.0008291659109164874,
"grad_norm": 0.296875,
"grad_norm_var": 0.00964506467183431,
"learning_rate": 0.01,
"loss": 1.4093,
"loss/crossentropy": 2.5383851528167725,
"loss/fcd": 1.14453125,
"loss/logits": 0.2725464850664139,
"step": 48
},
{
"epoch": 0.0008464402007272476,
"grad_norm": 0.3046875,
"grad_norm_var": 0.0007040500640869141,
"learning_rate": 0.01,
"loss": 1.3871,
"loss/crossentropy": 2.3415656089782715,
"loss/fcd": 1.109375,
"loss/logits": 0.23871353268623352,
"step": 49
},
{
"epoch": 0.0008637144905380078,
"grad_norm": 0.310546875,
"grad_norm_var": 0.000538492202758789,
"learning_rate": 0.01,
"loss": 1.422,
"loss/crossentropy": 2.241709351539612,
"loss/fcd": 1.12109375,
"loss/logits": 0.2642487585544586,
"step": 50
},
{
"epoch": 0.0008809887803487679,
"grad_norm": 0.271484375,
"grad_norm_var": 0.0006014347076416015,
"learning_rate": 0.01,
"loss": 1.4018,
"loss/crossentropy": 2.18844211101532,
"loss/fcd": 1.0859375,
"loss/logits": 0.25836754590272903,
"step": 51
},
{
"epoch": 0.0008982630701595281,
"grad_norm": 0.291015625,
"grad_norm_var": 0.0006024678548177083,
"learning_rate": 0.01,
"loss": 1.3702,
"loss/crossentropy": 2.4040807485580444,
"loss/fcd": 1.12890625,
"loss/logits": 0.27216051518917084,
"step": 52
},
{
"epoch": 0.0009155373599702882,
"grad_norm": 0.302734375,
"grad_norm_var": 0.0005683739980061849,
"learning_rate": 0.01,
"loss": 1.375,
"loss/crossentropy": 2.3604718446731567,
"loss/fcd": 1.109375,
"loss/logits": 0.2563931792974472,
"step": 53
},
{
"epoch": 0.0009328116497810484,
"grad_norm": 0.328125,
"grad_norm_var": 0.0006024678548177083,
"learning_rate": 0.01,
"loss": 1.3398,
"loss/crossentropy": 2.3702304363250732,
"loss/fcd": 1.044921875,
"loss/logits": 0.23356395214796066,
"step": 54
},
{
"epoch": 0.0009500859395918086,
"grad_norm": 0.322265625,
"grad_norm_var": 0.000598001480102539,
"learning_rate": 0.01,
"loss": 1.4359,
"loss/crossentropy": 2.532386064529419,
"loss/fcd": 1.19921875,
"loss/logits": 0.29735907912254333,
"step": 55
},
{
"epoch": 0.0009673602294025687,
"grad_norm": 0.29296875,
"grad_norm_var": 0.0005999088287353515,
"learning_rate": 0.01,
"loss": 1.3103,
"loss/crossentropy": 2.4240375757217407,
"loss/fcd": 1.04296875,
"loss/logits": 0.2354799136519432,
"step": 56
},
{
"epoch": 0.0009846345192133289,
"grad_norm": 0.314453125,
"grad_norm_var": 0.0005986372629801433,
"learning_rate": 0.01,
"loss": 1.4436,
"loss/crossentropy": 2.6270374059677124,
"loss/fcd": 1.17578125,
"loss/logits": 0.2780339866876602,
"step": 57
},
{
"epoch": 0.001001908809024089,
"grad_norm": 0.30859375,
"grad_norm_var": 0.0003819147745768229,
"learning_rate": 0.01,
"loss": 1.4263,
"loss/crossentropy": 2.6478673219680786,
"loss/fcd": 1.1640625,
"loss/logits": 0.26073622703552246,
"step": 58
},
{
"epoch": 0.001019183098834849,
"grad_norm": 0.314453125,
"grad_norm_var": 0.00032817522684733074,
"learning_rate": 0.01,
"loss": 1.3944,
"loss/crossentropy": 2.596788763999939,
"loss/fcd": 1.13671875,
"loss/logits": 0.2364196628332138,
"step": 59
},
{
"epoch": 0.0010364573886456094,
"grad_norm": 0.388671875,
"grad_norm_var": 0.0007116794586181641,
"learning_rate": 0.01,
"loss": 1.4703,
"loss/crossentropy": 2.516297459602356,
"loss/fcd": 1.1484375,
"loss/logits": 0.2600822076201439,
"step": 60
},
{
"epoch": 0.0010537316784563695,
"grad_norm": 0.353515625,
"grad_norm_var": 0.0008394718170166016,
"learning_rate": 0.01,
"loss": 1.4355,
"loss/crossentropy": 2.3750414848327637,
"loss/fcd": 1.1171875,
"loss/logits": 0.257433146238327,
"step": 61
},
{
"epoch": 0.0010710059682671295,
"grad_norm": 0.341796875,
"grad_norm_var": 0.0008870283762613932,
"learning_rate": 0.01,
"loss": 1.4704,
"loss/crossentropy": 2.6349244117736816,
"loss/fcd": 1.171875,
"loss/logits": 0.27842070162296295,
"step": 62
},
{
"epoch": 0.0010882802580778899,
"grad_norm": 0.345703125,
"grad_norm_var": 0.000816965103149414,
"learning_rate": 0.01,
"loss": 1.4253,
"loss/crossentropy": 2.4561866521835327,
"loss/fcd": 1.1484375,
"loss/logits": 0.2601305991411209,
"step": 63
},
{
"epoch": 0.00110555454788865,
"grad_norm": 0.30078125,
"grad_norm_var": 0.0008069197336832682,
"learning_rate": 0.01,
"loss": 1.3542,
"loss/crossentropy": 2.4422744512557983,
"loss/fcd": 1.16015625,
"loss/logits": 0.25526949763298035,
"step": 64
},
{
"epoch": 0.00112282883769941,
"grad_norm": 0.41796875,
"grad_norm_var": 0.0014043013254801432,
"learning_rate": 0.01,
"loss": 1.4401,
"loss/crossentropy": 2.164702892303467,
"loss/fcd": 1.22265625,
"loss/logits": 0.20365531742572784,
"step": 65
},
{
"epoch": 0.0011401031275101703,
"grad_norm": 0.30078125,
"grad_norm_var": 0.0014294942220052084,
"learning_rate": 0.01,
"loss": 1.3525,
"loss/crossentropy": 2.7132447957992554,
"loss/fcd": 1.13671875,
"loss/logits": 0.2643866539001465,
"step": 66
},
{
"epoch": 0.0011573774173209304,
"grad_norm": 0.30859375,
"grad_norm_var": 0.0012522220611572265,
"learning_rate": 0.01,
"loss": 1.3225,
"loss/crossentropy": 2.4213372468948364,
"loss/fcd": 1.07421875,
"loss/logits": 0.2328537479043007,
"step": 67
},
{
"epoch": 0.0011746517071316905,
"grad_norm": 0.353515625,
"grad_norm_var": 0.00119627316792806,
"learning_rate": 0.01,
"loss": 1.3973,
"loss/crossentropy": 2.436795651912689,
"loss/fcd": 1.09765625,
"loss/logits": 0.2546040713787079,
"step": 68
},
{
"epoch": 0.0011919259969424506,
"grad_norm": 0.283203125,
"grad_norm_var": 0.0012935479482014975,
"learning_rate": 0.01,
"loss": 1.3866,
"loss/crossentropy": 2.274712562561035,
"loss/fcd": 1.15625,
"loss/logits": 0.26513542234897614,
"step": 69
},
{
"epoch": 0.001209200286753211,
"grad_norm": 0.296875,
"grad_norm_var": 0.0013611952463785807,
"learning_rate": 0.01,
"loss": 1.3986,
"loss/crossentropy": 2.4798693656921387,
"loss/fcd": 1.140625,
"loss/logits": 0.2789834886789322,
"step": 70
},
{
"epoch": 0.001226474576563971,
"grad_norm": 0.298828125,
"grad_norm_var": 0.0014126936594645182,
"learning_rate": 0.01,
"loss": 1.3394,
"loss/crossentropy": 2.496403932571411,
"loss/fcd": 1.10546875,
"loss/logits": 0.23832575976848602,
"step": 71
},
{
"epoch": 0.001243748866374731,
"grad_norm": 0.30078125,
"grad_norm_var": 0.0013817946116129558,
"learning_rate": 0.01,
"loss": 1.3945,
"loss/crossentropy": 2.330789804458618,
"loss/fcd": 1.078125,
"loss/logits": 0.23751115798950195,
"step": 72
},
{
"epoch": 0.0012610231561854914,
"grad_norm": 0.287109375,
"grad_norm_var": 0.0014734745025634765,
"learning_rate": 0.01,
"loss": 1.3859,
"loss/crossentropy": 2.5367313623428345,
"loss/fcd": 1.19921875,
"loss/logits": 0.2804088890552521,
"step": 73
},
{
"epoch": 0.0012782974459962515,
"grad_norm": 0.28515625,
"grad_norm_var": 0.001559305191040039,
"learning_rate": 0.01,
"loss": 1.3887,
"loss/crossentropy": 2.3117035627365112,
"loss/fcd": 1.0625,
"loss/logits": 0.2553889825940132,
"step": 74
},
{
"epoch": 0.0012955717358070116,
"grad_norm": 0.302734375,
"grad_norm_var": 0.001582193374633789,
"learning_rate": 0.01,
"loss": 1.4083,
"loss/crossentropy": 2.5574092864990234,
"loss/fcd": 1.12890625,
"loss/logits": 0.24754850566387177,
"step": 75
},
{
"epoch": 0.0013128460256177717,
"grad_norm": 0.287109375,
"grad_norm_var": 0.0013358910878499349,
"learning_rate": 0.01,
"loss": 1.39,
"loss/crossentropy": 2.5164517164230347,
"loss/fcd": 1.13671875,
"loss/logits": 0.23118755221366882,
"step": 76
},
{
"epoch": 0.001330120315428532,
"grad_norm": 0.3203125,
"grad_norm_var": 0.0012410481770833333,
"learning_rate": 0.01,
"loss": 1.4129,
"loss/crossentropy": 2.4725937843322754,
"loss/fcd": 1.11328125,
"loss/logits": 0.2354634776711464,
"step": 77
},
{
"epoch": 0.001347394605239292,
"grad_norm": 0.52734375,
"grad_norm_var": 0.0040692488352457685,
"learning_rate": 0.01,
"loss": 1.5435,
"loss/crossentropy": 2.067330479621887,
"loss/fcd": 1.0703125,
"loss/logits": 0.2535740062594414,
"step": 78
},
{
"epoch": 0.0013646688950500522,
"grad_norm": 0.30078125,
"grad_norm_var": 0.0040776570638020836,
"learning_rate": 0.01,
"loss": 1.3808,
"loss/crossentropy": 2.363155961036682,
"loss/fcd": 1.109375,
"loss/logits": 0.2392946034669876,
"step": 79
},
{
"epoch": 0.0013819431848608125,
"grad_norm": 0.310546875,
"grad_norm_var": 0.004054371515909831,
"learning_rate": 0.01,
"loss": 1.4014,
"loss/crossentropy": 2.561974883079529,
"loss/fcd": 1.12109375,
"loss/logits": 0.2719137519598007,
"step": 80
},
{
"epoch": 0.0013992174746715726,
"grad_norm": 0.33203125,
"grad_norm_var": 0.0034375349680582684,
"learning_rate": 0.01,
"loss": 1.3718,
"loss/crossentropy": 2.5669400691986084,
"loss/fcd": 1.1875,
"loss/logits": 0.27283619344234467,
"step": 81
},
{
"epoch": 0.0014164917644823327,
"grad_norm": 0.318359375,
"grad_norm_var": 0.003415362040201823,
"learning_rate": 0.01,
"loss": 1.423,
"loss/crossentropy": 2.3874313831329346,
"loss/fcd": 1.11328125,
"loss/logits": 0.25072336941957474,
"step": 82
},
{
"epoch": 0.001433766054293093,
"grad_norm": 0.29296875,
"grad_norm_var": 0.003453509012858073,
"learning_rate": 0.01,
"loss": 1.4176,
"loss/crossentropy": 2.711247205734253,
"loss/fcd": 1.23046875,
"loss/logits": 0.28591448068618774,
"step": 83
},
{
"epoch": 0.001451040344103853,
"grad_norm": 0.287109375,
"grad_norm_var": 0.0034200032552083332,
"learning_rate": 0.01,
"loss": 1.3905,
"loss/crossentropy": 2.549779772758484,
"loss/fcd": 1.12109375,
"loss/logits": 0.2730839252471924,
"step": 84
},
{
"epoch": 0.0014683146339146132,
"grad_norm": 0.322265625,
"grad_norm_var": 0.0033526102701822917,
"learning_rate": 0.01,
"loss": 1.3706,
"loss/crossentropy": 2.255567193031311,
"loss/fcd": 1.1015625,
"loss/logits": 0.2550910860300064,
"step": 85
},
{
"epoch": 0.0014855889237253732,
"grad_norm": 0.6015625,
"grad_norm_var": 0.008341471354166666,
"learning_rate": 0.01,
"loss": 1.629,
"loss/crossentropy": 2.245366394519806,
"loss/fcd": 1.58203125,
"loss/logits": 0.3177703619003296,
"step": 86
},
{
"epoch": 0.0015028632135361336,
"grad_norm": 0.361328125,
"grad_norm_var": 0.0082763671875,
"learning_rate": 0.01,
"loss": 1.3925,
"loss/crossentropy": 2.5329853296279907,
"loss/fcd": 1.1640625,
"loss/logits": 0.2691914439201355,
"step": 87
},
{
"epoch": 0.0015201375033468936,
"grad_norm": 0.337890625,
"grad_norm_var": 0.008169158299763998,
"learning_rate": 0.01,
"loss": 1.3783,
"loss/crossentropy": 2.573711633682251,
"loss/fcd": 1.11328125,
"loss/logits": 0.24663084745407104,
"step": 88
},
{
"epoch": 0.0015374117931576537,
"grad_norm": 0.3046875,
"grad_norm_var": 0.008059438069661458,
"learning_rate": 0.01,
"loss": 1.3466,
"loss/crossentropy": 2.4545916318893433,
"loss/fcd": 1.14453125,
"loss/logits": 0.22179614007472992,
"step": 89
},
{
"epoch": 0.001554686082968414,
"grad_norm": 0.353515625,
"grad_norm_var": 0.007821893692016602,
"learning_rate": 0.01,
"loss": 1.4058,
"loss/crossentropy": 2.0489944219589233,
"loss/fcd": 1.1484375,
"loss/logits": 0.25446537882089615,
"step": 90
},
{
"epoch": 0.0015719603727791741,
"grad_norm": 0.333984375,
"grad_norm_var": 0.007696262995402018,
"learning_rate": 0.01,
"loss": 1.4186,
"loss/crossentropy": 2.6278460025787354,
"loss/fcd": 1.17578125,
"loss/logits": 0.2563782036304474,
"step": 91
},
{
"epoch": 0.0015892346625899342,
"grad_norm": 0.33984375,
"grad_norm_var": 0.0074314753214518225,
"learning_rate": 0.01,
"loss": 1.4634,
"loss/crossentropy": 2.3578550815582275,
"loss/fcd": 1.13671875,
"loss/logits": 0.26509464532136917,
"step": 92
},
{
"epoch": 0.0016065089524006945,
"grad_norm": 0.30078125,
"grad_norm_var": 0.007539876302083333,
"learning_rate": 0.01,
"loss": 1.3685,
"loss/crossentropy": 2.53238308429718,
"loss/fcd": 1.0859375,
"loss/logits": 0.24864411354064941,
"step": 93
},
{
"epoch": 0.0016237832422114546,
"grad_norm": 0.302734375,
"grad_norm_var": 0.005428679784138997,
"learning_rate": 0.01,
"loss": 1.4136,
"loss/crossentropy": 2.3873801231384277,
"loss/fcd": 1.2265625,
"loss/logits": 0.2842061370611191,
"step": 94
},
{
"epoch": 0.0016410575320222147,
"grad_norm": 0.29296875,
"grad_norm_var": 0.005470768610636393,
"learning_rate": 0.01,
"loss": 1.4,
"loss/crossentropy": 2.576484441757202,
"loss/fcd": 1.203125,
"loss/logits": 0.2684750333428383,
"step": 95
},
{
"epoch": 0.0016583318218329748,
"grad_norm": 0.30859375,
"grad_norm_var": 0.0054779052734375,
"learning_rate": 0.01,
"loss": 1.4256,
"loss/crossentropy": 2.5171070098876953,
"loss/fcd": 1.24609375,
"loss/logits": 0.2969086170196533,
"step": 96
},
{
"epoch": 0.0016756061116437351,
"grad_norm": 0.306640625,
"grad_norm_var": 0.005534728368123372,
"learning_rate": 0.01,
"loss": 1.3949,
"loss/crossentropy": 2.6096785068511963,
"loss/fcd": 1.12890625,
"loss/logits": 0.2719826400279999,
"step": 97
},
{
"epoch": 0.0016928804014544952,
"grad_norm": 0.298828125,
"grad_norm_var": 0.0056027571360270185,
"learning_rate": 0.01,
"loss": 1.3758,
"loss/crossentropy": 2.366774320602417,
"loss/fcd": 1.1015625,
"loss/logits": 0.23891064524650574,
"step": 98
},
{
"epoch": 0.0017101546912652553,
"grad_norm": 0.34765625,
"grad_norm_var": 0.005489714940388997,
"learning_rate": 0.01,
"loss": 1.436,
"loss/crossentropy": 2.356974244117737,
"loss/fcd": 1.3046875,
"loss/logits": 0.2715897411108017,
"step": 99
},
{
"epoch": 0.0017274289810760156,
"grad_norm": 0.287109375,
"grad_norm_var": 0.005489714940388997,
"learning_rate": 0.01,
"loss": 1.3544,
"loss/crossentropy": 2.5830947160720825,
"loss/fcd": 1.15625,
"loss/logits": 0.28681397438049316,
"step": 100
},
{
"epoch": 0.0017447032708867757,
"grad_norm": 0.296875,
"grad_norm_var": 0.0055816650390625,
"learning_rate": 0.01,
"loss": 1.3767,
"loss/crossentropy": 2.538628339767456,
"loss/fcd": 1.0859375,
"loss/logits": 0.2549655809998512,
"step": 101
},
{
"epoch": 0.0017619775606975358,
"grad_norm": 0.302734375,
"grad_norm_var": 0.0005793094635009766,
"learning_rate": 0.01,
"loss": 1.3127,
"loss/crossentropy": 2.153649151325226,
"loss/fcd": 1.0546875,
"loss/logits": 0.23056582361459732,
"step": 102
},
{
"epoch": 0.0017792518505082959,
"grad_norm": 0.337890625,
"grad_norm_var": 0.0004759311676025391,
"learning_rate": 0.01,
"loss": 1.4807,
"loss/crossentropy": 2.7840667963027954,
"loss/fcd": 1.203125,
"loss/logits": 0.26921743154525757,
"step": 103
},
{
"epoch": 0.0017965261403190562,
"grad_norm": 0.3046875,
"grad_norm_var": 0.0004470189412434896,
"learning_rate": 0.01,
"loss": 1.4075,
"loss/crossentropy": 2.375385046005249,
"loss/fcd": 1.10546875,
"loss/logits": 0.2573629766702652,
"step": 104
},
{
"epoch": 0.0018138004301298163,
"grad_norm": 0.33203125,
"grad_norm_var": 0.0004608154296875,
"learning_rate": 0.01,
"loss": 1.3262,
"loss/crossentropy": 2.7132558822631836,
"loss/fcd": 1.109375,
"loss/logits": 0.2457902729511261,
"step": 105
},
{
"epoch": 0.0018310747199405764,
"grad_norm": 0.2890625,
"grad_norm_var": 0.00039315223693847656,
"learning_rate": 0.01,
"loss": 1.292,
"loss/crossentropy": 2.017941474914551,
"loss/fcd": 0.986328125,
"loss/logits": 0.20789727568626404,
"step": 106
},
{
"epoch": 0.0018483490097513367,
"grad_norm": 0.310546875,
"grad_norm_var": 0.00035691261291503906,
"learning_rate": 0.01,
"loss": 1.4188,
"loss/crossentropy": 2.457041621208191,
"loss/fcd": 1.11328125,
"loss/logits": 0.23911338299512863,
"step": 107
},
{
"epoch": 0.0018656232995620968,
"grad_norm": 0.267578125,
"grad_norm_var": 0.00039513905843098957,
"learning_rate": 0.01,
"loss": 1.3624,
"loss/crossentropy": 2.264693021774292,
"loss/fcd": 1.1328125,
"loss/logits": 0.23969107121229172,
"step": 108
},
{
"epoch": 0.0018828975893728569,
"grad_norm": 0.27734375,
"grad_norm_var": 0.0004439671834309896,
"learning_rate": 0.01,
"loss": 1.3602,
"loss/crossentropy": 2.5558459758758545,
"loss/fcd": 1.12890625,
"loss/logits": 0.24982617795467377,
"step": 109
},
{
"epoch": 0.0019001718791836172,
"grad_norm": 0.44921875,
"grad_norm_var": 0.0017612298329671224,
"learning_rate": 0.01,
"loss": 1.4482,
"loss/crossentropy": 2.623742938041687,
"loss/fcd": 1.140625,
"loss/logits": 0.2605845034122467,
"step": 110
},
{
"epoch": 0.0019174461689943773,
"grad_norm": 0.291015625,
"grad_norm_var": 0.0017667134602864583,
"learning_rate": 0.01,
"loss": 1.4127,
"loss/crossentropy": 2.7532334327697754,
"loss/fcd": 1.171875,
"loss/logits": 0.26577115058898926,
"step": 111
},
{
"epoch": 0.0019347204588051373,
"grad_norm": 0.294921875,
"grad_norm_var": 0.0017864068349202475,
"learning_rate": 0.01,
"loss": 1.3525,
"loss/crossentropy": 2.4502193927764893,
"loss/fcd": 1.05859375,
"loss/logits": 0.2550206333398819,
"step": 112
},
{
"epoch": 0.0019519947486158974,
"grad_norm": 0.33984375,
"grad_norm_var": 0.0018309911092122396,
"learning_rate": 0.01,
"loss": 1.4422,
"loss/crossentropy": 2.0644272565841675,
"loss/fcd": 1.08203125,
"loss/logits": 0.25845974683761597,
"step": 113
},
{
"epoch": 0.0019692690384266577,
"grad_norm": 0.30859375,
"grad_norm_var": 0.0018169244130452475,
"learning_rate": 0.01,
"loss": 1.3762,
"loss/crossentropy": 2.6453906297683716,
"loss/fcd": 1.16015625,
"loss/logits": 0.28696541488170624,
"step": 114
},
{
"epoch": 0.001986543328237418,
"grad_norm": 0.3125,
"grad_norm_var": 0.0017402489980061849,
"learning_rate": 0.01,
"loss": 1.3974,
"loss/crossentropy": 2.229590892791748,
"loss/fcd": 1.04296875,
"loss/logits": 0.22459837794303894,
"step": 115
},
{
"epoch": 0.002003817618048178,
"grad_norm": 0.294921875,
"grad_norm_var": 0.0017174879709879558,
"learning_rate": 0.01,
"loss": 1.3518,
"loss/crossentropy": 2.5267633199691772,
"loss/fcd": 1.08203125,
"loss/logits": 0.24026738852262497,
"step": 116
},
{
"epoch": 0.0020210919078589382,
"grad_norm": 0.328125,
"grad_norm_var": 0.0017108758290608724,
"learning_rate": 0.01,
"loss": 1.4729,
"loss/crossentropy": 2.3015085458755493,
"loss/fcd": 1.08203125,
"loss/logits": 0.23641249537467957,
"step": 117
},
{
"epoch": 0.002038366197669698,
"grad_norm": 0.306640625,
"grad_norm_var": 0.0017054080963134766,
"learning_rate": 0.01,
"loss": 1.4479,
"loss/crossentropy": 2.0869252681732178,
"loss/fcd": 1.14453125,
"loss/logits": 0.2337687686085701,
"step": 118
},
{
"epoch": 0.0020556404874804584,
"grad_norm": 0.3125,
"grad_norm_var": 0.0016692479451497395,
"learning_rate": 0.01,
"loss": 1.3789,
"loss/crossentropy": 2.620050311088562,
"loss/fcd": 1.1328125,
"loss/logits": 0.2408916875720024,
"step": 119
},
{
"epoch": 0.0020729147772912187,
"grad_norm": 0.296875,
"grad_norm_var": 0.0016824722290039063,
"learning_rate": 0.01,
"loss": 1.3728,
"loss/crossentropy": 2.406272053718567,
"loss/fcd": 1.28515625,
"loss/logits": 0.27460669726133347,
"step": 120
},
{
"epoch": 0.0020901890671019786,
"grad_norm": 0.298828125,
"grad_norm_var": 0.0016681512196858725,
"learning_rate": 0.01,
"loss": 1.3607,
"loss/crossentropy": 2.1980100870132446,
"loss/fcd": 1.013671875,
"loss/logits": 0.23184800148010254,
"step": 121
},
{
"epoch": 0.002107463356912739,
"grad_norm": 0.365234375,
"grad_norm_var": 0.0018063863118489584,
"learning_rate": 0.01,
"loss": 1.4133,
"loss/crossentropy": 2.672022223472595,
"loss/fcd": 1.24609375,
"loss/logits": 0.2712271511554718,
"step": 122
},
{
"epoch": 0.002124737646723499,
"grad_norm": 0.318359375,
"grad_norm_var": 0.0018046061197916667,
"learning_rate": 0.01,
"loss": 1.4161,
"loss/crossentropy": 2.161317527294159,
"loss/fcd": 1.1640625,
"loss/logits": 0.2415143996477127,
"step": 123
},
{
"epoch": 0.002142011936534259,
"grad_norm": 0.310546875,
"grad_norm_var": 0.0016402562459309896,
"learning_rate": 0.01,
"loss": 1.3432,
"loss/crossentropy": 2.4041404724121094,
"loss/fcd": 1.1171875,
"loss/logits": 0.2565518468618393,
"step": 124
},
{
"epoch": 0.0021592862263450194,
"grad_norm": 0.296875,
"grad_norm_var": 0.0015553792317708334,
"learning_rate": 0.01,
"loss": 1.3222,
"loss/crossentropy": 2.289466381072998,
"loss/fcd": 1.11328125,
"loss/logits": 0.2588811218738556,
"step": 125
},
{
"epoch": 0.0021765605161557797,
"grad_norm": 0.31640625,
"grad_norm_var": 0.0003751118977864583,
"learning_rate": 0.01,
"loss": 1.4145,
"loss/crossentropy": 2.0946825742721558,
"loss/fcd": 1.0546875,
"loss/logits": 0.22345608472824097,
"step": 126
},
{
"epoch": 0.0021938348059665396,
"grad_norm": 0.318359375,
"grad_norm_var": 0.0003452936808268229,
"learning_rate": 0.01,
"loss": 1.3904,
"loss/crossentropy": 2.4527688026428223,
"loss/fcd": 1.078125,
"loss/logits": 0.23762068152427673,
"step": 127
},
{
"epoch": 0.0022111090957773,
"grad_norm": 0.31640625,
"grad_norm_var": 0.0003202915191650391,
"learning_rate": 0.01,
"loss": 1.4117,
"loss/crossentropy": 2.6558061838150024,
"loss/fcd": 1.26953125,
"loss/logits": 0.3351695239543915,
"step": 128
},
{
"epoch": 0.00222838338558806,
"grad_norm": 0.3125,
"grad_norm_var": 0.00027667681376139324,
"learning_rate": 0.01,
"loss": 1.3841,
"loss/crossentropy": 2.3390719890594482,
"loss/fcd": 1.0703125,
"loss/logits": 0.23404338955879211,
"step": 129
},
{
"epoch": 0.00224565767539882,
"grad_norm": 0.302734375,
"grad_norm_var": 0.00028254191080729165,
"learning_rate": 0.01,
"loss": 1.3402,
"loss/crossentropy": 2.5888524055480957,
"loss/fcd": 1.09765625,
"loss/logits": 0.2385600358247757,
"step": 130
},
{
"epoch": 0.0022629319652095804,
"grad_norm": 0.365234375,
"grad_norm_var": 0.00045291582743326825,
"learning_rate": 0.01,
"loss": 1.4423,
"loss/crossentropy": 2.1622209548950195,
"loss/fcd": 1.125,
"loss/logits": 0.25934895873069763,
"step": 131
},
{
"epoch": 0.0022802062550203407,
"grad_norm": 0.349609375,
"grad_norm_var": 0.0004840691884358724,
"learning_rate": 0.01,
"loss": 1.5001,
"loss/crossentropy": 2.5385576486587524,
"loss/fcd": 1.18359375,
"loss/logits": 0.26659196615219116,
"step": 132
},
{
"epoch": 0.0022974805448311006,
"grad_norm": 0.2890625,
"grad_norm_var": 0.0005355676015218099,
"learning_rate": 0.01,
"loss": 1.3481,
"loss/crossentropy": 2.348211407661438,
"loss/fcd": 1.09375,
"loss/logits": 0.2560664862394333,
"step": 133
},
{
"epoch": 0.002314754834641861,
"grad_norm": 0.3125,
"grad_norm_var": 0.0005294164021809896,
"learning_rate": 0.01,
"loss": 1.3607,
"loss/crossentropy": 2.117067277431488,
"loss/fcd": 1.07421875,
"loss/logits": 0.22807861864566803,
"step": 134
},
{
"epoch": 0.0023320291244526207,
"grad_norm": 0.287109375,
"grad_norm_var": 0.0005870660146077474,
"learning_rate": 0.01,
"loss": 1.387,
"loss/crossentropy": 2.5187747478485107,
"loss/fcd": 1.16796875,
"loss/logits": 0.27947917580604553,
"step": 135
},
{
"epoch": 0.002349303414263381,
"grad_norm": 0.326171875,
"grad_norm_var": 0.0005658467610677084,
"learning_rate": 0.01,
"loss": 1.3995,
"loss/crossentropy": 2.4953707456588745,
"loss/fcd": 1.09375,
"loss/logits": 0.24946419894695282,
"step": 136
},
{
"epoch": 0.0023665777040741414,
"grad_norm": 0.30078125,
"grad_norm_var": 0.0005611260732014974,
"learning_rate": 0.01,
"loss": 1.4027,
"loss/crossentropy": 2.3007187843322754,
"loss/fcd": 1.2109375,
"loss/logits": 0.2944917380809784,
"step": 137
},
{
"epoch": 0.0023838519938849012,
"grad_norm": 0.3203125,
"grad_norm_var": 0.0004042943318684896,
"learning_rate": 0.01,
"loss": 1.3784,
"loss/crossentropy": 2.406763792037964,
"loss/fcd": 1.0625,
"loss/logits": 0.24067886918783188,
"step": 138
},
{
"epoch": 0.0024011262836956615,
"grad_norm": 0.287109375,
"grad_norm_var": 0.0004521052042643229,
"learning_rate": 0.01,
"loss": 1.394,
"loss/crossentropy": 2.3716171979904175,
"loss/fcd": 1.09375,
"loss/logits": 0.2490846812725067,
"step": 139
},
{
"epoch": 0.002418400573506422,
"grad_norm": 0.30859375,
"grad_norm_var": 0.0004530429840087891,
"learning_rate": 0.01,
"loss": 1.3992,
"loss/crossentropy": 2.298838496208191,
"loss/fcd": 1.12109375,
"loss/logits": 0.2580900937318802,
"step": 140
},
{
"epoch": 0.0024356748633171817,
"grad_norm": 0.287109375,
"grad_norm_var": 0.00048014322916666664,
"learning_rate": 0.01,
"loss": 1.3887,
"loss/crossentropy": 2.1861318349838257,
"loss/fcd": 1.09375,
"loss/logits": 0.25625482201576233,
"step": 141
},
{
"epoch": 0.002452949153127942,
"grad_norm": 0.28125,
"grad_norm_var": 0.0005390803019205729,
"learning_rate": 0.01,
"loss": 1.4149,
"loss/crossentropy": 2.5295623540878296,
"loss/fcd": 1.14453125,
"loss/logits": 0.24908355623483658,
"step": 142
},
{
"epoch": 0.0024702234429387023,
"grad_norm": 0.298828125,
"grad_norm_var": 0.0005419413248697917,
"learning_rate": 0.01,
"loss": 1.4095,
"loss/crossentropy": 2.4763203859329224,
"loss/fcd": 1.14453125,
"loss/logits": 0.25878605246543884,
"step": 143
},
{
"epoch": 0.002487497732749462,
"grad_norm": 0.32421875,
"grad_norm_var": 0.0005533854166666667,
"learning_rate": 0.01,
"loss": 1.4244,
"loss/crossentropy": 2.520187020301819,
"loss/fcd": 1.1328125,
"loss/logits": 0.24524306505918503,
"step": 144
},
{
"epoch": 0.0025047720225602225,
"grad_norm": 0.294921875,
"grad_norm_var": 0.0005658308664957683,
"learning_rate": 0.01,
"loss": 1.4039,
"loss/crossentropy": 2.517001748085022,
"loss/fcd": 1.1328125,
"loss/logits": 0.23872993886470795,
"step": 145
},
{
"epoch": 0.002522046312370983,
"grad_norm": 0.291015625,
"grad_norm_var": 0.0005833784739176433,
"learning_rate": 0.01,
"loss": 1.3182,
"loss/crossentropy": 2.4004757404327393,
"loss/fcd": 1.0625,
"loss/logits": 0.24094465374946594,
"step": 146
},
{
"epoch": 0.0025393206021817427,
"grad_norm": 0.283203125,
"grad_norm_var": 0.0003750960032145182,
"learning_rate": 0.01,
"loss": 1.3334,
"loss/crossentropy": 2.1713826656341553,
"loss/fcd": 1.12109375,
"loss/logits": 0.22458232194185257,
"step": 147
},
{
"epoch": 0.002556594891992503,
"grad_norm": 0.34375,
"grad_norm_var": 0.00034052530924479166,
"learning_rate": 0.01,
"loss": 1.3361,
"loss/crossentropy": 2.438323974609375,
"loss/fcd": 1.15234375,
"loss/logits": 0.24637237191200256,
"step": 148
},
{
"epoch": 0.0025738691818032633,
"grad_norm": 0.380859375,
"grad_norm_var": 0.0007058302561442057,
"learning_rate": 0.01,
"loss": 1.4953,
"loss/crossentropy": 2.450320243835449,
"loss/fcd": 1.1171875,
"loss/logits": 0.24158670753240585,
"step": 149
},
{
"epoch": 0.002591143471614023,
"grad_norm": 0.306640625,
"grad_norm_var": 0.0007044474283854166,
"learning_rate": 0.01,
"loss": 1.4629,
"loss/crossentropy": 2.294734477996826,
"loss/fcd": 1.2421875,
"loss/logits": 0.2762032076716423,
"step": 150
},
{
"epoch": 0.0026084177614247835,
"grad_norm": 0.33203125,
"grad_norm_var": 0.0007077376047770182,
"learning_rate": 0.01,
"loss": 1.4165,
"loss/crossentropy": 2.468201994895935,
"loss/fcd": 1.21875,
"loss/logits": 0.2507496029138565,
"step": 151
},
{
"epoch": 0.0026256920512355434,
"grad_norm": 0.314453125,
"grad_norm_var": 0.0006917158762613933,
"learning_rate": 0.01,
"loss": 1.395,
"loss/crossentropy": 2.353287696838379,
"loss/fcd": 1.18359375,
"loss/logits": 0.2722310647368431,
"step": 152
},
{
"epoch": 0.0026429663410463037,
"grad_norm": 0.294921875,
"grad_norm_var": 0.0007008234659830729,
"learning_rate": 0.01,
"loss": 1.3506,
"loss/crossentropy": 2.2797771692276,
"loss/fcd": 1.1484375,
"loss/logits": 0.2620129883289337,
"step": 153
},
{
"epoch": 0.002660240630857064,
"grad_norm": 0.287109375,
"grad_norm_var": 0.0007210890452067057,
"learning_rate": 0.01,
"loss": 1.3943,
"loss/crossentropy": 2.6261144876480103,
"loss/fcd": 1.21484375,
"loss/logits": 0.3041190207004547,
"step": 154
},
{
"epoch": 0.002677514920667824,
"grad_norm": 0.28125,
"grad_norm_var": 0.0007389704386393229,
"learning_rate": 0.01,
"loss": 1.4487,
"loss/crossentropy": 2.327589750289917,
"loss/fcd": 1.2890625,
"loss/logits": 0.333427369594574,
"step": 155
},
{
"epoch": 0.002694789210478584,
"grad_norm": 0.31640625,
"grad_norm_var": 0.0007445653279622396,
"learning_rate": 0.01,
"loss": 1.3842,
"loss/crossentropy": 2.4801390171051025,
"loss/fcd": 1.1640625,
"loss/logits": 0.23910623788833618,
"step": 156
},
{
"epoch": 0.0027120635002893445,
"grad_norm": 0.2890625,
"grad_norm_var": 0.0007395267486572266,
"learning_rate": 0.01,
"loss": 1.3487,
"loss/crossentropy": 2.577694535255432,
"loss/fcd": 1.16015625,
"loss/logits": 0.2568306028842926,
"step": 157
},
{
"epoch": 0.0027293377901001043,
"grad_norm": 0.314453125,
"grad_norm_var": 0.0006922403971354167,
"learning_rate": 0.01,
"loss": 1.3505,
"loss/crossentropy": 2.415543556213379,
"loss/fcd": 1.09375,
"loss/logits": 0.2512781471014023,
"step": 158
},
{
"epoch": 0.0027466120799108647,
"grad_norm": 0.302734375,
"grad_norm_var": 0.0006875991821289062,
"learning_rate": 0.01,
"loss": 1.4042,
"loss/crossentropy": 2.4328696727752686,
"loss/fcd": 1.0859375,
"loss/logits": 0.2584942355751991,
"step": 159
},
{
"epoch": 0.002763886369721625,
"grad_norm": 0.2578125,
"grad_norm_var": 0.0008356730143229167,
"learning_rate": 0.01,
"loss": 1.2883,
"loss/crossentropy": 2.344989776611328,
"loss/fcd": 1.06640625,
"loss/logits": 0.23677106201648712,
"step": 160
},
{
"epoch": 0.002781160659532385,
"grad_norm": 0.302734375,
"grad_norm_var": 0.0008282979329427083,
"learning_rate": 0.01,
"loss": 1.3544,
"loss/crossentropy": 2.3909146785736084,
"loss/fcd": 1.1171875,
"loss/logits": 0.26238836348056793,
"step": 161
},
{
"epoch": 0.002798434949343145,
"grad_norm": 0.341796875,
"grad_norm_var": 0.000886980692545573,
"learning_rate": 0.01,
"loss": 1.4284,
"loss/crossentropy": 2.6815162897109985,
"loss/fcd": 1.23828125,
"loss/logits": 0.28333599865436554,
"step": 162
},
{
"epoch": 0.0028157092391539055,
"grad_norm": 0.306640625,
"grad_norm_var": 0.0008396784464518229,
"learning_rate": 0.01,
"loss": 1.3743,
"loss/crossentropy": 2.363664388656616,
"loss/fcd": 1.11328125,
"loss/logits": 0.23216551542282104,
"step": 163
},
{
"epoch": 0.0028329835289646653,
"grad_norm": 0.287109375,
"grad_norm_var": 0.0007912794748942058,
"learning_rate": 0.01,
"loss": 1.3503,
"loss/crossentropy": 2.6360952854156494,
"loss/fcd": 1.19140625,
"loss/logits": 0.25444111227989197,
"step": 164
},
{
"epoch": 0.0028502578187754256,
"grad_norm": 0.298828125,
"grad_norm_var": 0.0004067579905192057,
"learning_rate": 0.01,
"loss": 1.3827,
"loss/crossentropy": 2.255971908569336,
"loss/fcd": 1.08984375,
"loss/logits": 0.2420385479927063,
"step": 165
},
{
"epoch": 0.002867532108586186,
"grad_norm": 0.328125,
"grad_norm_var": 0.00044854482014973957,
"learning_rate": 0.01,
"loss": 1.3572,
"loss/crossentropy": 2.5781320333480835,
"loss/fcd": 1.12109375,
"loss/logits": 0.2430611252784729,
"step": 166
},
{
"epoch": 0.002884806398396946,
"grad_norm": 0.298828125,
"grad_norm_var": 0.0003909905751546224,
"learning_rate": 0.01,
"loss": 1.394,
"loss/crossentropy": 2.698032259941101,
"loss/fcd": 1.16796875,
"loss/logits": 0.248212069272995,
"step": 167
},
{
"epoch": 0.002902080688207706,
"grad_norm": 0.291015625,
"grad_norm_var": 0.0003845055898030599,
"learning_rate": 0.01,
"loss": 1.4097,
"loss/crossentropy": 2.372989535331726,
"loss/fcd": 1.140625,
"loss/logits": 0.24837365001440048,
"step": 168
},
{
"epoch": 0.002919354978018466,
"grad_norm": 0.30859375,
"grad_norm_var": 0.0003870646158854167,
"learning_rate": 0.01,
"loss": 1.3624,
"loss/crossentropy": 2.555245876312256,
"loss/fcd": 1.12890625,
"loss/logits": 0.2645147144794464,
"step": 169
},
{
"epoch": 0.0029366292678292263,
"grad_norm": 0.291015625,
"grad_norm_var": 0.00038089752197265623,
"learning_rate": 0.01,
"loss": 1.3458,
"loss/crossentropy": 2.2800326347351074,
"loss/fcd": 1.0390625,
"loss/logits": 0.22108863294124603,
"step": 170
},
{
"epoch": 0.0029539035576399866,
"grad_norm": 0.30078125,
"grad_norm_var": 0.000353240966796875,
"learning_rate": 0.01,
"loss": 1.3788,
"loss/crossentropy": 2.638196110725403,
"loss/fcd": 1.15234375,
"loss/logits": 0.2918136268854141,
"step": 171
},
{
"epoch": 0.0029711778474507465,
"grad_norm": 0.294921875,
"grad_norm_var": 0.00034152666727701824,
"learning_rate": 0.01,
"loss": 1.3664,
"loss/crossentropy": 2.6176986694335938,
"loss/fcd": 1.1640625,
"loss/logits": 0.26864323019981384,
"step": 172
},
{
"epoch": 0.002988452137261507,
"grad_norm": 0.30859375,
"grad_norm_var": 0.0003345330556233724,
"learning_rate": 0.01,
"loss": 1.4184,
"loss/crossentropy": 2.62368905544281,
"loss/fcd": 1.25390625,
"loss/logits": 0.28509171307086945,
"step": 173
},
{
"epoch": 0.003005726427072267,
"grad_norm": 0.671875,
"grad_norm_var": 0.00890649159749349,
"learning_rate": 0.01,
"loss": 1.4685,
"loss/crossentropy": 2.309454083442688,
"loss/fcd": 1.125,
"loss/logits": 0.26153236627578735,
"step": 174
},
{
"epoch": 0.003023000716883027,
"grad_norm": 0.32421875,
"grad_norm_var": 0.00887309710184733,
"learning_rate": 0.01,
"loss": 1.4154,
"loss/crossentropy": 2.320811152458191,
"loss/fcd": 1.07421875,
"loss/logits": 0.24308273196220398,
"step": 175
},
{
"epoch": 0.0030402750066937873,
"grad_norm": 0.306640625,
"grad_norm_var": 0.008579444885253907,
"learning_rate": 0.01,
"loss": 1.3805,
"loss/crossentropy": 2.579828977584839,
"loss/fcd": 1.14453125,
"loss/logits": 0.2542525976896286,
"step": 176
},
{
"epoch": 0.0030575492965045476,
"grad_norm": 0.302734375,
"grad_norm_var": 0.008579444885253907,
"learning_rate": 0.01,
"loss": 1.3868,
"loss/crossentropy": 2.5000842809677124,
"loss/fcd": 1.18359375,
"loss/logits": 0.2917867451906204,
"step": 177
},
{
"epoch": 0.0030748235863153075,
"grad_norm": 0.291015625,
"grad_norm_var": 0.008653004964192709,
"learning_rate": 0.01,
"loss": 1.3679,
"loss/crossentropy": 2.5240609645843506,
"loss/fcd": 1.13671875,
"loss/logits": 0.2740897983312607,
"step": 178
},
{
"epoch": 0.0030920978761260678,
"grad_norm": 0.75,
"grad_norm_var": 0.019812758763631186,
"learning_rate": 0.01,
"loss": 1.423,
"loss/crossentropy": 2.383319854736328,
"loss/fcd": 1.16015625,
"loss/logits": 0.2834385186433792,
"step": 179
},
{
"epoch": 0.003109372165936828,
"grad_norm": 0.3125,
"grad_norm_var": 0.01962865193684896,
"learning_rate": 0.01,
"loss": 1.3861,
"loss/crossentropy": 2.3524543046951294,
"loss/fcd": 1.1796875,
"loss/logits": 0.24870187044143677,
"step": 180
},
{
"epoch": 0.003126646455747588,
"grad_norm": 0.333984375,
"grad_norm_var": 0.01944268544514974,
"learning_rate": 0.01,
"loss": 1.4644,
"loss/crossentropy": 2.768381118774414,
"loss/fcd": 1.26171875,
"loss/logits": 0.3117068111896515,
"step": 181
},
{
"epoch": 0.0031439207455583483,
"grad_norm": 0.3125,
"grad_norm_var": 0.019518470764160155,
"learning_rate": 0.01,
"loss": 1.4071,
"loss/crossentropy": 2.5678982734680176,
"loss/fcd": 1.14453125,
"loss/logits": 0.25002971291542053,
"step": 182
},
{
"epoch": 0.0031611950353691086,
"grad_norm": 0.3203125,
"grad_norm_var": 0.019382969538370768,
"learning_rate": 0.01,
"loss": 1.4044,
"loss/crossentropy": 2.6401069164276123,
"loss/fcd": 1.17578125,
"loss/logits": 0.2738536596298218,
"step": 183
},
{
"epoch": 0.0031784693251798684,
"grad_norm": 0.318359375,
"grad_norm_var": 0.019187148412068686,
"learning_rate": 0.01,
"loss": 1.4165,
"loss/crossentropy": 2.3614484071731567,
"loss/fcd": 1.1171875,
"loss/logits": 0.28841613233089447,
"step": 184
},
{
"epoch": 0.0031957436149906288,
"grad_norm": 0.337890625,
"grad_norm_var": 0.01904290517171224,
"learning_rate": 0.01,
"loss": 1.4151,
"loss/crossentropy": 2.2044495344161987,
"loss/fcd": 1.09765625,
"loss/logits": 0.25532982498407364,
"step": 185
},
{
"epoch": 0.003213017904801389,
"grad_norm": 0.279296875,
"grad_norm_var": 0.019160970052083334,
"learning_rate": 0.01,
"loss": 1.3233,
"loss/crossentropy": 2.657314658164978,
"loss/fcd": 1.1171875,
"loss/logits": 0.2434261366724968,
"step": 186
},
{
"epoch": 0.003230292194612149,
"grad_norm": 0.322265625,
"grad_norm_var": 0.019019174575805663,
"learning_rate": 0.01,
"loss": 1.457,
"loss/crossentropy": 2.509123682975769,
"loss/fcd": 1.16015625,
"loss/logits": 0.27627624571323395,
"step": 187
},
{
"epoch": 0.0032475664844229092,
"grad_norm": 0.298828125,
"grad_norm_var": 0.01898535092671712,
"learning_rate": 0.01,
"loss": 1.4612,
"loss/crossentropy": 2.4355961084365845,
"loss/fcd": 1.15625,
"loss/logits": 0.2809949368238449,
"step": 188
},
{
"epoch": 0.003264840774233669,
"grad_norm": 0.314453125,
"grad_norm_var": 0.018945821126302085,
"learning_rate": 0.01,
"loss": 1.4111,
"loss/crossentropy": 2.657699465751648,
"loss/fcd": 1.1484375,
"loss/logits": 0.26505863666534424,
"step": 189
},
{
"epoch": 0.0032821150640444294,
"grad_norm": 0.56640625,
"grad_norm_var": 0.01528771718343099,
"learning_rate": 0.01,
"loss": 1.4753,
"loss/crossentropy": 2.4757652282714844,
"loss/fcd": 1.0546875,
"loss/logits": 0.22812122106552124,
"step": 190
},
{
"epoch": 0.0032993893538551897,
"grad_norm": 0.296875,
"grad_norm_var": 0.015449269612630209,
"learning_rate": 0.01,
"loss": 1.3867,
"loss/crossentropy": 2.4966439604759216,
"loss/fcd": 1.14453125,
"loss/logits": 0.24755483120679855,
"step": 191
},
{
"epoch": 0.0033166636436659496,
"grad_norm": 0.30859375,
"grad_norm_var": 0.015437173843383788,
"learning_rate": 0.01,
"loss": 1.4331,
"loss/crossentropy": 2.2156739234924316,
"loss/fcd": 1.125,
"loss/logits": 0.24708709865808487,
"step": 192
},
{
"epoch": 0.00333393793347671,
"grad_norm": 0.337890625,
"grad_norm_var": 0.015273523330688477,
"learning_rate": 0.01,
"loss": 1.4652,
"loss/crossentropy": 2.5916343927383423,
"loss/fcd": 1.15625,
"loss/logits": 0.26975981891155243,
"step": 193
},
{
"epoch": 0.0033512122232874702,
"grad_norm": 0.294921875,
"grad_norm_var": 0.01524046262105306,
"learning_rate": 0.01,
"loss": 1.3916,
"loss/crossentropy": 2.4512441158294678,
"loss/fcd": 1.12890625,
"loss/logits": 0.2599586248397827,
"step": 194
},
{
"epoch": 0.00336848651309823,
"grad_norm": 0.271484375,
"grad_norm_var": 0.004449717203776042,
"learning_rate": 0.01,
"loss": 1.2906,
"loss/crossentropy": 2.4583925008773804,
"loss/fcd": 1.0859375,
"loss/logits": 0.22421551495790482,
"step": 195
},
{
"epoch": 0.0033857608029089904,
"grad_norm": 0.345703125,
"grad_norm_var": 0.004455931981404622,
"learning_rate": 0.01,
"loss": 1.4645,
"loss/crossentropy": 3.102002263069153,
"loss/fcd": 1.26953125,
"loss/logits": 0.31158843636512756,
"step": 196
},
{
"epoch": 0.0034030350927197507,
"grad_norm": 0.275390625,
"grad_norm_var": 0.00462950070699056,
"learning_rate": 0.01,
"loss": 1.3805,
"loss/crossentropy": 2.537242293357849,
"loss/fcd": 1.15234375,
"loss/logits": 0.24022500216960907,
"step": 197
},
{
"epoch": 0.0034203093825305106,
"grad_norm": 0.31640625,
"grad_norm_var": 0.004623905817667643,
"learning_rate": 0.01,
"loss": 1.4295,
"loss/crossentropy": 1.8695432543754578,
"loss/fcd": 1.24609375,
"loss/logits": 0.2338111400604248,
"step": 198
},
{
"epoch": 0.003437583672341271,
"grad_norm": 0.306640625,
"grad_norm_var": 0.004644711812337239,
"learning_rate": 0.01,
"loss": 1.4342,
"loss/crossentropy": 2.5979591608047485,
"loss/fcd": 1.17578125,
"loss/logits": 0.2477928102016449,
"step": 199
},
{
"epoch": 0.003454857962152031,
"grad_norm": 0.294921875,
"grad_norm_var": 0.004698117574055989,
"learning_rate": 0.01,
"loss": 1.3588,
"loss/crossentropy": 2.6363730430603027,
"loss/fcd": 1.12109375,
"loss/logits": 0.26602891087532043,
"step": 200
},
{
"epoch": 0.003472132251962791,
"grad_norm": 0.298828125,
"grad_norm_var": 0.004715919494628906,
"learning_rate": 0.01,
"loss": 1.3919,
"loss/crossentropy": 2.6225093603134155,
"loss/fcd": 1.09765625,
"loss/logits": 0.25346362590789795,
"step": 201
},
{
"epoch": 0.0034894065417735514,
"grad_norm": 0.296875,
"grad_norm_var": 0.004638528823852539,
"learning_rate": 0.01,
"loss": 1.3639,
"loss/crossentropy": 2.6900315284729004,
"loss/fcd": 1.140625,
"loss/logits": 0.25750475376844406,
"step": 202
},
{
"epoch": 0.0035066808315843117,
"grad_norm": 0.306640625,
"grad_norm_var": 0.0046525160471598305,
"learning_rate": 0.01,
"loss": 1.4454,
"loss/crossentropy": 2.4896918535232544,
"loss/fcd": 1.12109375,
"loss/logits": 0.24820879101753235,
"step": 203
},
{
"epoch": 0.0035239551213950716,
"grad_norm": 0.29296875,
"grad_norm_var": 0.004671732584635417,
"learning_rate": 0.01,
"loss": 1.3744,
"loss/crossentropy": 2.4207727909088135,
"loss/fcd": 1.1796875,
"loss/logits": 0.267853319644928,
"step": 204
},
{
"epoch": 0.003541229411205832,
"grad_norm": 0.296875,
"grad_norm_var": 0.004704777399698893,
"learning_rate": 0.01,
"loss": 1.3827,
"loss/crossentropy": 2.6077362298965454,
"loss/fcd": 1.19140625,
"loss/logits": 0.2449246495962143,
"step": 205
},
{
"epoch": 0.0035585037010165918,
"grad_norm": 0.302734375,
"grad_norm_var": 0.00035959879557291666,
"learning_rate": 0.01,
"loss": 1.36,
"loss/crossentropy": 2.2625831365585327,
"loss/fcd": 1.07421875,
"loss/logits": 0.25722844898700714,
"step": 206
},
{
"epoch": 0.003575777990827352,
"grad_norm": 0.326171875,
"grad_norm_var": 0.0003903547922770182,
"learning_rate": 0.01,
"loss": 1.4604,
"loss/crossentropy": 2.6293487548828125,
"loss/fcd": 1.15625,
"loss/logits": 0.2616356760263443,
"step": 207
},
{
"epoch": 0.0035930522806381124,
"grad_norm": 0.291015625,
"grad_norm_var": 0.00040022532145182293,
"learning_rate": 0.01,
"loss": 1.3362,
"loss/crossentropy": 2.4450851678848267,
"loss/fcd": 1.0859375,
"loss/logits": 0.23832125961780548,
"step": 208
},
{
"epoch": 0.0036103265704488722,
"grad_norm": 0.306640625,
"grad_norm_var": 0.0003178278605143229,
"learning_rate": 0.01,
"loss": 1.3815,
"loss/crossentropy": 2.265815496444702,
"loss/fcd": 1.25,
"loss/logits": 0.2856537625193596,
"step": 209
},
{
"epoch": 0.0036276008602596325,
"grad_norm": 0.298828125,
"grad_norm_var": 0.0003153483072916667,
"learning_rate": 0.01,
"loss": 1.3779,
"loss/crossentropy": 2.4830867052078247,
"loss/fcd": 1.18359375,
"loss/logits": 0.27156491577625275,
"step": 210
},
{
"epoch": 0.003644875150070393,
"grad_norm": 0.33203125,
"grad_norm_var": 0.00030007362365722654,
"learning_rate": 0.01,
"loss": 1.3838,
"loss/crossentropy": 2.4645248651504517,
"loss/fcd": 1.13671875,
"loss/logits": 0.2536320984363556,
"step": 211
},
{
"epoch": 0.0036621494398811527,
"grad_norm": 0.29296875,
"grad_norm_var": 0.000191497802734375,
"learning_rate": 0.01,
"loss": 1.3463,
"loss/crossentropy": 2.4574155807495117,
"loss/fcd": 1.04296875,
"loss/logits": 0.22712672501802444,
"step": 212
},
{
"epoch": 0.003679423729691913,
"grad_norm": 0.3125,
"grad_norm_var": 0.00014468828837076823,
"learning_rate": 0.01,
"loss": 1.3817,
"loss/crossentropy": 2.51455819606781,
"loss/fcd": 1.09765625,
"loss/logits": 0.24947896599769592,
"step": 213
},
{
"epoch": 0.0036966980195026733,
"grad_norm": 0.3046875,
"grad_norm_var": 0.00013477007548014323,
"learning_rate": 0.01,
"loss": 1.4295,
"loss/crossentropy": 2.5708523988723755,
"loss/fcd": 1.2109375,
"loss/logits": 0.3021456152200699,
"step": 214
},
{
"epoch": 0.0037139723093134332,
"grad_norm": 0.30859375,
"grad_norm_var": 0.0001357396443684896,
"learning_rate": 0.01,
"loss": 1.3823,
"loss/crossentropy": 2.696264386177063,
"loss/fcd": 1.12890625,
"loss/logits": 0.2742393985390663,
"step": 215
},
{
"epoch": 0.0037312465991241935,
"grad_norm": 0.326171875,
"grad_norm_var": 0.00015913645426432292,
"learning_rate": 0.01,
"loss": 1.4102,
"loss/crossentropy": 2.310886025428772,
"loss/fcd": 1.22265625,
"loss/logits": 0.2918149083852768,
"step": 216
},
{
"epoch": 0.003748520888934954,
"grad_norm": 0.302734375,
"grad_norm_var": 0.000156402587890625,
"learning_rate": 0.01,
"loss": 1.4137,
"loss/crossentropy": 2.2433084845542908,
"loss/fcd": 1.15625,
"loss/logits": 0.25447261333465576,
"step": 217
},
{
"epoch": 0.0037657951787457137,
"grad_norm": 0.322265625,
"grad_norm_var": 0.00016528765360514323,
"learning_rate": 0.01,
"loss": 1.4519,
"loss/crossentropy": 2.4079222679138184,
"loss/fcd": 1.140625,
"loss/logits": 0.2586686462163925,
"step": 218
},
{
"epoch": 0.003783069468556474,
"grad_norm": 0.279296875,
"grad_norm_var": 0.00021602312723795574,
"learning_rate": 0.01,
"loss": 1.415,
"loss/crossentropy": 2.460106134414673,
"loss/fcd": 1.15234375,
"loss/logits": 0.2525549978017807,
"step": 219
},
{
"epoch": 0.0038003437583672343,
"grad_norm": 0.2890625,
"grad_norm_var": 0.00022377967834472657,
"learning_rate": 0.01,
"loss": 1.4134,
"loss/crossentropy": 2.225171685218811,
"loss/fcd": 1.0546875,
"loss/logits": 0.22205037623643875,
"step": 220
},
{
"epoch": 0.003817618048177994,
"grad_norm": 0.26953125,
"grad_norm_var": 0.0003029982248942057,
"learning_rate": 0.01,
"loss": 1.3745,
"loss/crossentropy": 2.3788317441940308,
"loss/fcd": 1.1015625,
"loss/logits": 0.24135209619998932,
"step": 221
},
{
"epoch": 0.0038348923379887545,
"grad_norm": 0.294921875,
"grad_norm_var": 0.0003082116444905599,
"learning_rate": 0.01,
"loss": 1.4452,
"loss/crossentropy": 2.375778555870056,
"loss/fcd": 1.21484375,
"loss/logits": 0.25682032108306885,
"step": 222
},
{
"epoch": 0.0038521666277995144,
"grad_norm": 0.30078125,
"grad_norm_var": 0.0002720514933268229,
"learning_rate": 0.01,
"loss": 1.3877,
"loss/crossentropy": 2.4510881900787354,
"loss/fcd": 1.12109375,
"loss/logits": 0.2522790729999542,
"step": 223
},
{
"epoch": 0.0038694409176102747,
"grad_norm": 0.318359375,
"grad_norm_var": 0.00027872721354166665,
"learning_rate": 0.01,
"loss": 1.3783,
"loss/crossentropy": 2.4119985103607178,
"loss/fcd": 1.13671875,
"loss/logits": 0.22855417430400848,
"step": 224
},
{
"epoch": 0.003886715207421035,
"grad_norm": 0.2890625,
"grad_norm_var": 0.0002911726633707682,
"learning_rate": 0.01,
"loss": 1.4037,
"loss/crossentropy": 2.4024510383605957,
"loss/fcd": 1.16015625,
"loss/logits": 0.2690604329109192,
"step": 225
},
{
"epoch": 0.003903989497231795,
"grad_norm": 0.2734375,
"grad_norm_var": 0.00034427642822265625,
"learning_rate": 0.01,
"loss": 1.3468,
"loss/crossentropy": 2.550796151161194,
"loss/fcd": 1.078125,
"loss/logits": 0.24284164607524872,
"step": 226
},
{
"epoch": 0.003921263787042556,
"grad_norm": 0.427734375,
"grad_norm_var": 0.0013123671213785806,
"learning_rate": 0.01,
"loss": 1.4574,
"loss/crossentropy": 2.9375933408737183,
"loss/fcd": 1.3046875,
"loss/logits": 0.2513057738542557,
"step": 227
},
{
"epoch": 0.0039385380768533155,
"grad_norm": 0.318359375,
"grad_norm_var": 0.0013051350911458333,
"learning_rate": 0.01,
"loss": 1.4281,
"loss/crossentropy": 2.524444341659546,
"loss/fcd": 1.1953125,
"loss/logits": 0.27370719611644745,
"step": 228
},
{
"epoch": 0.003955812366664075,
"grad_norm": 0.33984375,
"grad_norm_var": 0.001366106669108073,
"learning_rate": 0.01,
"loss": 1.4307,
"loss/crossentropy": 2.486480951309204,
"loss/fcd": 1.11328125,
"loss/logits": 0.26038021594285965,
"step": 229
},
{
"epoch": 0.003973086656474836,
"grad_norm": 0.310546875,
"grad_norm_var": 0.0013638655344645181,
"learning_rate": 0.01,
"loss": 1.3844,
"loss/crossentropy": 2.48094379901886,
"loss/fcd": 1.10546875,
"loss/logits": 0.24342957884073257,
"step": 230
},
{
"epoch": 0.003990360946285596,
"grad_norm": 0.29296875,
"grad_norm_var": 0.0013834476470947266,
"learning_rate": 0.01,
"loss": 1.3685,
"loss/crossentropy": 2.241925358772278,
"loss/fcd": 1.14453125,
"loss/logits": 0.24210943281650543,
"step": 231
},
{
"epoch": 0.004007635236096356,
"grad_norm": 0.306640625,
"grad_norm_var": 0.0013643741607666016,
"learning_rate": 0.01,
"loss": 1.448,
"loss/crossentropy": 2.648869752883911,
"loss/fcd": 1.09375,
"loss/logits": 0.24799171090126038,
"step": 232
},
{
"epoch": 0.004024909525907116,
"grad_norm": 0.318359375,
"grad_norm_var": 0.0013676802317301431,
"learning_rate": 0.01,
"loss": 1.4431,
"loss/crossentropy": 2.63001549243927,
"loss/fcd": 1.15625,
"loss/logits": 0.27701297402381897,
"step": 233
},
{
"epoch": 0.0040421838157178765,
"grad_norm": 0.287109375,
"grad_norm_var": 0.0013848463694254556,
"learning_rate": 0.01,
"loss": 1.3746,
"loss/crossentropy": 2.2247713804244995,
"loss/fcd": 1.03125,
"loss/logits": 0.24730068445205688,
"step": 234
},
{
"epoch": 0.004059458105528636,
"grad_norm": 0.298828125,
"grad_norm_var": 0.0013358910878499349,
"learning_rate": 0.01,
"loss": 1.4197,
"loss/crossentropy": 2.511416435241699,
"loss/fcd": 1.15234375,
"loss/logits": 0.2583580017089844,
"step": 235
},
{
"epoch": 0.004076732395339396,
"grad_norm": 0.369140625,
"grad_norm_var": 0.0015294392903645833,
"learning_rate": 0.01,
"loss": 1.4459,
"loss/crossentropy": 2.366840362548828,
"loss/fcd": 1.171875,
"loss/logits": 0.2747315466403961,
"step": 236
},
{
"epoch": 0.004094006685150157,
"grad_norm": 0.34375,
"grad_norm_var": 0.0014388402303059896,
"learning_rate": 0.01,
"loss": 1.379,
"loss/crossentropy": 2.645435094833374,
"loss/fcd": 1.1796875,
"loss/logits": 0.2583626061677933,
"step": 237
},
{
"epoch": 0.004111280974960917,
"grad_norm": 0.333984375,
"grad_norm_var": 0.0014134089152018229,
"learning_rate": 0.01,
"loss": 1.3999,
"loss/crossentropy": 2.0519449710845947,
"loss/fcd": 1.09375,
"loss/logits": 0.2533458322286606,
"step": 238
},
{
"epoch": 0.004128555264771677,
"grad_norm": 0.306640625,
"grad_norm_var": 0.0014001051584879556,
"learning_rate": 0.01,
"loss": 1.3654,
"loss/crossentropy": 2.236992359161377,
"loss/fcd": 1.02734375,
"loss/logits": 0.23388498276472092,
"step": 239
},
{
"epoch": 0.0041458295545824374,
"grad_norm": 0.328125,
"grad_norm_var": 0.0014027277628580728,
"learning_rate": 0.01,
"loss": 1.3499,
"loss/crossentropy": 2.308284044265747,
"loss/fcd": 1.05859375,
"loss/logits": 0.23218639194965363,
"step": 240
},
{
"epoch": 0.004163103844393197,
"grad_norm": 0.322265625,
"grad_norm_var": 0.0013278802235921225,
"learning_rate": 0.01,
"loss": 1.4553,
"loss/crossentropy": 2.360711455345154,
"loss/fcd": 1.14453125,
"loss/logits": 0.24909411370754242,
"step": 241
},
{
"epoch": 0.004180378134203957,
"grad_norm": 0.298828125,
"grad_norm_var": 0.0011983235677083333,
"learning_rate": 0.01,
"loss": 1.4332,
"loss/crossentropy": 2.486197352409363,
"loss/fcd": 1.21875,
"loss/logits": 0.28059011697769165,
"step": 242
},
{
"epoch": 0.004197652424014718,
"grad_norm": 0.31640625,
"grad_norm_var": 0.0004508813222249349,
"learning_rate": 0.01,
"loss": 1.3993,
"loss/crossentropy": 2.461425542831421,
"loss/fcd": 1.12109375,
"loss/logits": 0.2716974467039108,
"step": 243
},
{
"epoch": 0.004214926713825478,
"grad_norm": 0.322265625,
"grad_norm_var": 0.0004518985748291016,
"learning_rate": 0.01,
"loss": 1.4236,
"loss/crossentropy": 2.344510316848755,
"loss/fcd": 1.203125,
"loss/logits": 0.2624819576740265,
"step": 244
},
{
"epoch": 0.004232201003636238,
"grad_norm": 0.27734375,
"grad_norm_var": 0.0005180199940999348,
"learning_rate": 0.01,
"loss": 1.3542,
"loss/crossentropy": 2.6375720500946045,
"loss/fcd": 1.1328125,
"loss/logits": 0.2671656012535095,
"step": 245
},
{
"epoch": 0.004249475293446998,
"grad_norm": 0.318359375,
"grad_norm_var": 0.0005176385243733724,
"learning_rate": 0.01,
"loss": 1.3853,
"loss/crossentropy": 2.4105772972106934,
"loss/fcd": 1.16796875,
"loss/logits": 0.2800147980451584,
"step": 246
},
{
"epoch": 0.004266749583257758,
"grad_norm": 0.302734375,
"grad_norm_var": 0.0004948298136393229,
"learning_rate": 0.01,
"loss": 1.3755,
"loss/crossentropy": 2.2956700325012207,
"loss/fcd": 1.12890625,
"loss/logits": 0.2564444988965988,
"step": 247
},
{
"epoch": 0.004284023873068518,
"grad_norm": 0.50390625,
"grad_norm_var": 0.0026893456776936847,
"learning_rate": 0.01,
"loss": 1.3836,
"loss/crossentropy": 2.3848729133605957,
"loss/fcd": 1.1640625,
"loss/logits": 0.2581590488553047,
"step": 248
},
{
"epoch": 0.004301298162879279,
"grad_norm": 0.326171875,
"grad_norm_var": 0.002683115005493164,
"learning_rate": 0.01,
"loss": 1.3791,
"loss/crossentropy": 2.6016765832901,
"loss/fcd": 1.1015625,
"loss/logits": 0.26704905927181244,
"step": 249
},
{
"epoch": 0.004318572452690039,
"grad_norm": 0.33984375,
"grad_norm_var": 0.002565956115722656,
"learning_rate": 0.01,
"loss": 1.4703,
"loss/crossentropy": 2.4796223640441895,
"loss/fcd": 1.28125,
"loss/logits": 0.30792760848999023,
"step": 250
},
{
"epoch": 0.004335846742500799,
"grad_norm": 0.296875,
"grad_norm_var": 0.002574777603149414,
"learning_rate": 0.01,
"loss": 1.3831,
"loss/crossentropy": 2.45810604095459,
"loss/fcd": 1.1484375,
"loss/logits": 0.26673202961683273,
"step": 251
},
{
"epoch": 0.004353121032311559,
"grad_norm": 0.296875,
"grad_norm_var": 0.0025400797526041667,
"learning_rate": 0.01,
"loss": 1.3379,
"loss/crossentropy": 2.37344229221344,
"loss/fcd": 1.0859375,
"loss/logits": 0.2348434329032898,
"step": 252
},
{
"epoch": 0.004370395322122319,
"grad_norm": 0.287109375,
"grad_norm_var": 0.002615213394165039,
"learning_rate": 0.01,
"loss": 1.4283,
"loss/crossentropy": 2.310893416404724,
"loss/fcd": 1.08984375,
"loss/logits": 0.22272542119026184,
"step": 253
},
{
"epoch": 0.004387669611933079,
"grad_norm": 0.455078125,
"grad_norm_var": 0.003699223200480143,
"learning_rate": 0.01,
"loss": 1.4319,
"loss/crossentropy": 2.287319302558899,
"loss/fcd": 1.1484375,
"loss/logits": 0.23187098652124405,
"step": 254
},
{
"epoch": 0.00440494390174384,
"grad_norm": 0.30859375,
"grad_norm_var": 0.003693072001139323,
"learning_rate": 0.01,
"loss": 1.3951,
"loss/crossentropy": 2.751601457595825,
"loss/fcd": 1.15625,
"loss/logits": 0.2715594172477722,
"step": 255
},
{
"epoch": 0.0044222181915546,
"grad_norm": 0.318359375,
"grad_norm_var": 0.0037031650543212892,
"learning_rate": 0.01,
"loss": 1.372,
"loss/crossentropy": 2.513296961784363,
"loss/fcd": 1.12109375,
"loss/logits": 0.23859571665525436,
"step": 256
},
{
"epoch": 0.00443949248136536,
"grad_norm": 0.306640625,
"grad_norm_var": 0.003735971450805664,
"learning_rate": 0.01,
"loss": 1.3787,
"loss/crossentropy": 2.501555562019348,
"loss/fcd": 1.125,
"loss/logits": 0.2450244277715683,
"step": 257
},
{
"epoch": 0.00445676677117612,
"grad_norm": 0.291015625,
"grad_norm_var": 0.00377195676167806,
"learning_rate": 0.01,
"loss": 1.3965,
"loss/crossentropy": 2.503899097442627,
"loss/fcd": 1.17578125,
"loss/logits": 0.28062424063682556,
"step": 258
},
{
"epoch": 0.00447404106098688,
"grad_norm": 0.326171875,
"grad_norm_var": 0.0037612279256184896,
"learning_rate": 0.01,
"loss": 1.3864,
"loss/crossentropy": 2.5635122060775757,
"loss/fcd": 1.1484375,
"loss/logits": 0.25401656329631805,
"step": 259
},
{
"epoch": 0.00449131535079764,
"grad_norm": 0.3125,
"grad_norm_var": 0.0037770430246988934,
"learning_rate": 0.01,
"loss": 1.3786,
"loss/crossentropy": 2.4950658082962036,
"loss/fcd": 1.12109375,
"loss/logits": 0.2641760855913162,
"step": 260
},
{
"epoch": 0.004508589640608401,
"grad_norm": 0.296875,
"grad_norm_var": 0.003665781021118164,
"learning_rate": 0.01,
"loss": 1.3656,
"loss/crossentropy": 2.4370001554489136,
"loss/fcd": 1.1015625,
"loss/logits": 0.249709352850914,
"step": 261
},
{
"epoch": 0.004525863930419161,
"grad_norm": 0.2890625,
"grad_norm_var": 0.003766632080078125,
"learning_rate": 0.01,
"loss": 1.3332,
"loss/crossentropy": 2.4650388956069946,
"loss/fcd": 1.14453125,
"loss/logits": 0.2645094692707062,
"step": 262
},
{
"epoch": 0.004543138220229921,
"grad_norm": 0.3046875,
"grad_norm_var": 0.0037601312001546224,
"learning_rate": 0.01,
"loss": 1.3832,
"loss/crossentropy": 2.677791714668274,
"loss/fcd": 1.16796875,
"loss/logits": 0.28196755796670914,
"step": 263
},
{
"epoch": 0.004560412510040681,
"grad_norm": 0.306640625,
"grad_norm_var": 0.0015848795572916666,
"learning_rate": 0.01,
"loss": 1.4601,
"loss/crossentropy": 2.4847524166107178,
"loss/fcd": 1.109375,
"loss/logits": 0.2580026537179947,
"step": 264
},
{
"epoch": 0.004577686799851441,
"grad_norm": 0.33203125,
"grad_norm_var": 0.0015946547190348306,
"learning_rate": 0.01,
"loss": 1.4087,
"loss/crossentropy": 2.4944722652435303,
"loss/fcd": 1.12109375,
"loss/logits": 0.2483246624469757,
"step": 265
},
{
"epoch": 0.004594961089662201,
"grad_norm": 0.279296875,
"grad_norm_var": 0.0016375223795572916,
"learning_rate": 0.01,
"loss": 1.3835,
"loss/crossentropy": 2.2753440141677856,
"loss/fcd": 1.046875,
"loss/logits": 0.24172206223011017,
"step": 266
},
{
"epoch": 0.004612235379472961,
"grad_norm": 0.3125,
"grad_norm_var": 0.0016192118326822916,
"learning_rate": 0.01,
"loss": 1.3721,
"loss/crossentropy": 2.4424277544021606,
"loss/fcd": 1.1640625,
"loss/logits": 0.2600102424621582,
"step": 267
},
{
"epoch": 0.004629509669283722,
"grad_norm": 0.287109375,
"grad_norm_var": 0.0016474246978759766,
"learning_rate": 0.01,
"loss": 1.3636,
"loss/crossentropy": 2.5198450088500977,
"loss/fcd": 1.09375,
"loss/logits": 0.25200945883989334,
"step": 268
},
{
"epoch": 0.004646783959094482,
"grad_norm": 0.294921875,
"grad_norm_var": 0.0016239007314046224,
"learning_rate": 0.01,
"loss": 1.3874,
"loss/crossentropy": 2.4488155841827393,
"loss/fcd": 1.1484375,
"loss/logits": 0.2999647855758667,
"step": 269
},
{
"epoch": 0.0046640582489052415,
"grad_norm": 0.2890625,
"grad_norm_var": 0.00022017161051432292,
"learning_rate": 0.01,
"loss": 1.3399,
"loss/crossentropy": 2.1886658668518066,
"loss/fcd": 1.03125,
"loss/logits": 0.241354301571846,
"step": 270
},
{
"epoch": 0.004681332538716002,
"grad_norm": 0.359375,
"grad_norm_var": 0.00041605631510416665,
"learning_rate": 0.01,
"loss": 1.3419,
"loss/crossentropy": 2.382296085357666,
"loss/fcd": 1.10546875,
"loss/logits": 0.2474452257156372,
"step": 271
},
{
"epoch": 0.004698606828526762,
"grad_norm": 0.298828125,
"grad_norm_var": 0.0004093805948893229,
"learning_rate": 0.01,
"loss": 1.325,
"loss/crossentropy": 2.569235324859619,
"loss/fcd": 1.15625,
"loss/logits": 0.24149076640605927,
"step": 272
},
{
"epoch": 0.004715881118337522,
"grad_norm": 0.2890625,
"grad_norm_var": 0.0004258314768473307,
"learning_rate": 0.01,
"loss": 1.2981,
"loss/crossentropy": 2.5184491872787476,
"loss/fcd": 1.09375,
"loss/logits": 0.25748542696237564,
"step": 273
},
{
"epoch": 0.004733155408148283,
"grad_norm": 0.31640625,
"grad_norm_var": 0.0004210789998372396,
"learning_rate": 0.01,
"loss": 1.3787,
"loss/crossentropy": 2.2780392169952393,
"loss/fcd": 1.140625,
"loss/logits": 0.24548518657684326,
"step": 274
},
{
"epoch": 0.004750429697959043,
"grad_norm": 0.3125,
"grad_norm_var": 0.0003958225250244141,
"learning_rate": 0.01,
"loss": 1.4055,
"loss/crossentropy": 2.392509341239929,
"loss/fcd": 1.21875,
"loss/logits": 0.25389473140239716,
"step": 275
},
{
"epoch": 0.0047677039877698025,
"grad_norm": 0.55078125,
"grad_norm_var": 0.004181019465128581,
"learning_rate": 0.01,
"loss": 1.3982,
"loss/crossentropy": 2.6148691177368164,
"loss/fcd": 1.1875,
"loss/logits": 0.27452078461647034,
"step": 276
},
{
"epoch": 0.004784978277580563,
"grad_norm": 0.3125,
"grad_norm_var": 0.004148213068644205,
"learning_rate": 0.01,
"loss": 1.3782,
"loss/crossentropy": 2.4390900135040283,
"loss/fcd": 1.12890625,
"loss/logits": 0.2340994030237198,
"step": 277
},
{
"epoch": 0.004802252567391323,
"grad_norm": 0.28515625,
"grad_norm_var": 0.004165760676066081,
"learning_rate": 0.01,
"loss": 1.3674,
"loss/crossentropy": 2.065169870853424,
"loss/fcd": 1.08203125,
"loss/logits": 0.23831525444984436,
"step": 278
},
{
"epoch": 0.004819526857202083,
"grad_norm": 0.33984375,
"grad_norm_var": 0.0041680494944254555,
"learning_rate": 0.01,
"loss": 1.3986,
"loss/crossentropy": 2.23395574092865,
"loss/fcd": 1.07421875,
"loss/logits": 0.25276701152324677,
"step": 279
},
{
"epoch": 0.004836801147012844,
"grad_norm": 0.302734375,
"grad_norm_var": 0.004177459081013997,
"learning_rate": 0.01,
"loss": 1.3866,
"loss/crossentropy": 2.5360673666000366,
"loss/fcd": 1.125,
"loss/logits": 0.2552696242928505,
"step": 280
},
{
"epoch": 0.0048540754368236036,
"grad_norm": 0.59375,
"grad_norm_var": 0.008786503473917644,
"learning_rate": 0.01,
"loss": 1.3841,
"loss/crossentropy": 2.64610493183136,
"loss/fcd": 1.14453125,
"loss/logits": 0.2660531848669052,
"step": 281
},
{
"epoch": 0.0048713497266343634,
"grad_norm": 0.306640625,
"grad_norm_var": 0.008615605036417643,
"learning_rate": 0.01,
"loss": 1.3781,
"loss/crossentropy": 2.206232786178589,
"loss/fcd": 1.04296875,
"loss/logits": 0.22382746636867523,
"step": 282
},
{
"epoch": 0.004888624016445124,
"grad_norm": 0.27734375,
"grad_norm_var": 0.008825031916300456,
"learning_rate": 0.01,
"loss": 1.3924,
"loss/crossentropy": 2.491134285926819,
"loss/fcd": 1.1953125,
"loss/logits": 0.28758758306503296,
"step": 283
},
{
"epoch": 0.004905898306255884,
"grad_norm": 0.314453125,
"grad_norm_var": 0.008684396743774414,
"learning_rate": 0.01,
"loss": 1.3821,
"loss/crossentropy": 2.418181896209717,
"loss/fcd": 1.08984375,
"loss/logits": 0.24221232533454895,
"step": 284
},
{
"epoch": 0.004923172596066644,
"grad_norm": 0.302734375,
"grad_norm_var": 0.00864103635152181,
"learning_rate": 0.01,
"loss": 1.385,
"loss/crossentropy": 2.532857298851013,
"loss/fcd": 1.11328125,
"loss/logits": 0.25721532106399536,
"step": 285
},
{
"epoch": 0.004940446885877405,
"grad_norm": 0.28515625,
"grad_norm_var": 0.008668883641560873,
"learning_rate": 0.01,
"loss": 1.3564,
"loss/crossentropy": 2.602588653564453,
"loss/fcd": 1.12890625,
"loss/logits": 0.2445499449968338,
"step": 286
},
{
"epoch": 0.0049577211756881645,
"grad_norm": 0.2890625,
"grad_norm_var": 0.008800490697224935,
"learning_rate": 0.01,
"loss": 1.3491,
"loss/crossentropy": 2.5629632472991943,
"loss/fcd": 1.13671875,
"loss/logits": 0.2607369050383568,
"step": 287
},
{
"epoch": 0.004974995465498924,
"grad_norm": 0.28515625,
"grad_norm_var": 0.008880043029785156,
"learning_rate": 0.01,
"loss": 1.3404,
"loss/crossentropy": 2.522684097290039,
"loss/fcd": 1.1171875,
"loss/logits": 0.27616211771965027,
"step": 288
},
{
"epoch": 0.004992269755309685,
"grad_norm": 0.26171875,
"grad_norm_var": 0.009095001220703124,
"learning_rate": 0.01,
"loss": 1.3618,
"loss/crossentropy": 2.6350889205932617,
"loss/fcd": 1.109375,
"loss/logits": 0.24171672016382217,
"step": 289
},
{
"epoch": 0.005009544045120445,
"grad_norm": 0.337890625,
"grad_norm_var": 0.009074894587198894,
"learning_rate": 0.01,
"loss": 1.459,
"loss/crossentropy": 2.98556649684906,
"loss/fcd": 1.21875,
"loss/logits": 0.2643963396549225,
"step": 290
},
{
"epoch": 0.005026818334931205,
"grad_norm": 0.251953125,
"grad_norm_var": 0.009484354654947917,
"learning_rate": 0.01,
"loss": 1.3693,
"loss/crossentropy": 2.230570673942566,
"loss/fcd": 1.06640625,
"loss/logits": 0.24412426352500916,
"step": 291
},
{
"epoch": 0.005044092624741966,
"grad_norm": 0.31640625,
"grad_norm_var": 0.006051127115885417,
"learning_rate": 0.01,
"loss": 1.3658,
"loss/crossentropy": 2.8022435903549194,
"loss/fcd": 1.1875,
"loss/logits": 0.2787918150424957,
"step": 292
},
{
"epoch": 0.0050613669145527255,
"grad_norm": 0.291015625,
"grad_norm_var": 0.006091165542602539,
"learning_rate": 0.01,
"loss": 1.3367,
"loss/crossentropy": 2.4132487773895264,
"loss/fcd": 1.18359375,
"loss/logits": 0.26599422097206116,
"step": 293
},
{
"epoch": 0.005078641204363485,
"grad_norm": 0.29296875,
"grad_norm_var": 0.00606382687886556,
"learning_rate": 0.01,
"loss": 1.3944,
"loss/crossentropy": 2.2870916724205017,
"loss/fcd": 1.11328125,
"loss/logits": 0.25007129460573196,
"step": 294
},
{
"epoch": 0.005095915494174246,
"grad_norm": 0.310546875,
"grad_norm_var": 0.0060225804646809895,
"learning_rate": 0.01,
"loss": 1.3933,
"loss/crossentropy": 2.60745906829834,
"loss/fcd": 1.13671875,
"loss/logits": 0.2817099541425705,
"step": 295
},
{
"epoch": 0.005113189783985006,
"grad_norm": 0.34765625,
"grad_norm_var": 0.006082900365193685,
"learning_rate": 0.01,
"loss": 1.4644,
"loss/crossentropy": 2.1799449920654297,
"loss/fcd": 1.12890625,
"loss/logits": 0.23855505883693695,
"step": 296
},
{
"epoch": 0.005130464073795766,
"grad_norm": 0.296875,
"grad_norm_var": 0.0006179650624593099,
"learning_rate": 0.01,
"loss": 1.3902,
"loss/crossentropy": 2.299877882003784,
"loss/fcd": 1.09765625,
"loss/logits": 0.24762696027755737,
"step": 297
},
{
"epoch": 0.005147738363606527,
"grad_norm": 0.310546875,
"grad_norm_var": 0.0006234327952067058,
"learning_rate": 0.01,
"loss": 1.3882,
"loss/crossentropy": 2.334827423095703,
"loss/fcd": 1.07421875,
"loss/logits": 0.23748627305030823,
"step": 298
},
{
"epoch": 0.0051650126534172865,
"grad_norm": 0.33203125,
"grad_norm_var": 0.0006581465403238932,
"learning_rate": 0.01,
"loss": 1.3226,
"loss/crossentropy": 2.4439618587493896,
"loss/fcd": 1.078125,
"loss/logits": 0.23564526438713074,
"step": 299
},
{
"epoch": 0.005182286943228046,
"grad_norm": 0.29296875,
"grad_norm_var": 0.0006502787272135417,
"learning_rate": 0.01,
"loss": 1.4317,
"loss/crossentropy": 2.4066379070281982,
"loss/fcd": 1.16796875,
"loss/logits": 0.28721271455287933,
"step": 300
},
{
"epoch": 0.005199561233038807,
"grad_norm": 0.337890625,
"grad_norm_var": 0.0007389704386393229,
"learning_rate": 0.01,
"loss": 1.392,
"loss/crossentropy": 2.6461589336395264,
"loss/fcd": 1.1640625,
"loss/logits": 0.2553107738494873,
"step": 301
},
{
"epoch": 0.005216835522849567,
"grad_norm": 0.28515625,
"grad_norm_var": 0.0007389704386393229,
"learning_rate": 0.01,
"loss": 1.3864,
"loss/crossentropy": 2.607328414916992,
"loss/fcd": 1.125,
"loss/logits": 0.26615719497203827,
"step": 302
},
{
"epoch": 0.005234109812660327,
"grad_norm": 0.3125,
"grad_norm_var": 0.0007313410441080729,
"learning_rate": 0.01,
"loss": 1.3974,
"loss/crossentropy": 2.5339640378952026,
"loss/fcd": 1.23046875,
"loss/logits": 0.29202982783317566,
"step": 303
},
{
"epoch": 0.005251384102471087,
"grad_norm": 0.271484375,
"grad_norm_var": 0.000777292251586914,
"learning_rate": 0.01,
"loss": 1.3597,
"loss/crossentropy": 2.418789029121399,
"loss/fcd": 1.0078125,
"loss/logits": 0.22410588711500168,
"step": 304
},
{
"epoch": 0.0052686583922818475,
"grad_norm": 0.26171875,
"grad_norm_var": 0.000777292251586914,
"learning_rate": 0.01,
"loss": 1.3612,
"loss/crossentropy": 2.333797812461853,
"loss/fcd": 1.15234375,
"loss/logits": 0.2548183798789978,
"step": 305
},
{
"epoch": 0.005285932682092607,
"grad_norm": 0.330078125,
"grad_norm_var": 0.0007448673248291015,
"learning_rate": 0.01,
"loss": 1.4106,
"loss/crossentropy": 2.444805860519409,
"loss/fcd": 1.1796875,
"loss/logits": 0.2654833495616913,
"step": 306
},
{
"epoch": 0.005303206971903367,
"grad_norm": 0.298828125,
"grad_norm_var": 0.0005655765533447265,
"learning_rate": 0.01,
"loss": 1.4068,
"loss/crossentropy": 2.478832244873047,
"loss/fcd": 1.15625,
"loss/logits": 0.27099600434303284,
"step": 307
},
{
"epoch": 0.005320481261714128,
"grad_norm": 0.34375,
"grad_norm_var": 0.0006519158681233724,
"learning_rate": 0.01,
"loss": 1.4297,
"loss/crossentropy": 2.276490032672882,
"loss/fcd": 1.2578125,
"loss/logits": 0.2906430959701538,
"step": 308
},
{
"epoch": 0.005337755551524888,
"grad_norm": 0.294921875,
"grad_norm_var": 0.0006444136301676433,
"learning_rate": 0.01,
"loss": 1.3362,
"loss/crossentropy": 2.1777199506759644,
"loss/fcd": 1.1171875,
"loss/logits": 0.2572901248931885,
"step": 309
},
{
"epoch": 0.005355029841335648,
"grad_norm": 0.349609375,
"grad_norm_var": 0.0007352193196614583,
"learning_rate": 0.01,
"loss": 1.4705,
"loss/crossentropy": 2.4591206312179565,
"loss/fcd": 1.09375,
"loss/logits": 0.2502119764685631,
"step": 310
},
{
"epoch": 0.0053723041311464085,
"grad_norm": 0.28515625,
"grad_norm_var": 0.0007771650950113932,
"learning_rate": 0.01,
"loss": 1.4149,
"loss/crossentropy": 2.377845048904419,
"loss/fcd": 1.1015625,
"loss/logits": 0.25507183372974396,
"step": 311
},
{
"epoch": 0.005389578420957168,
"grad_norm": 0.2890625,
"grad_norm_var": 0.0006932417551676432,
"learning_rate": 0.01,
"loss": 1.3878,
"loss/crossentropy": 2.6086690425872803,
"loss/fcd": 1.25,
"loss/logits": 0.28851139545440674,
"step": 312
},
{
"epoch": 0.005406852710767928,
"grad_norm": 0.306640625,
"grad_norm_var": 0.0006875991821289062,
"learning_rate": 0.01,
"loss": 1.3607,
"loss/crossentropy": 2.089534819126129,
"loss/fcd": 1.12890625,
"loss/logits": 0.22003582119941711,
"step": 313
},
{
"epoch": 0.005424127000578689,
"grad_norm": 0.388671875,
"grad_norm_var": 0.0011123021443684895,
"learning_rate": 0.01,
"loss": 1.3856,
"loss/crossentropy": 2.0762287974357605,
"loss/fcd": 1.1875,
"loss/logits": 0.23012210428714752,
"step": 314
},
{
"epoch": 0.005441401290389449,
"grad_norm": 0.291015625,
"grad_norm_var": 0.0011039574940999348,
"learning_rate": 0.01,
"loss": 1.3841,
"loss/crossentropy": 2.5591676235198975,
"loss/fcd": 1.05859375,
"loss/logits": 0.2246263027191162,
"step": 315
},
{
"epoch": 0.005458675580200209,
"grad_norm": 0.306640625,
"grad_norm_var": 0.0010869344075520833,
"learning_rate": 0.01,
"loss": 1.4073,
"loss/crossentropy": 2.412803888320923,
"loss/fcd": 1.12890625,
"loss/logits": 0.24091031402349472,
"step": 316
},
{
"epoch": 0.0054759498700109694,
"grad_norm": 0.333984375,
"grad_norm_var": 0.001073137919108073,
"learning_rate": 0.01,
"loss": 1.368,
"loss/crossentropy": 2.328226327896118,
"loss/fcd": 1.1328125,
"loss/logits": 0.2949056923389435,
"step": 317
},
{
"epoch": 0.005493224159821729,
"grad_norm": 0.30078125,
"grad_norm_var": 0.001038042704264323,
"learning_rate": 0.01,
"loss": 1.3639,
"loss/crossentropy": 2.2848289012908936,
"loss/fcd": 1.16796875,
"loss/logits": 0.25566980242729187,
"step": 318
},
{
"epoch": 0.005510498449632489,
"grad_norm": 0.27734375,
"grad_norm_var": 0.0011049906412760417,
"learning_rate": 0.01,
"loss": 1.3843,
"loss/crossentropy": 2.3968076705932617,
"loss/fcd": 1.10546875,
"loss/logits": 0.2567252665758133,
"step": 319
},
{
"epoch": 0.00552777273944325,
"grad_norm": 0.423828125,
"grad_norm_var": 0.001811663309733073,
"learning_rate": 0.01,
"loss": 1.3891,
"loss/crossentropy": 2.396988272666931,
"loss/fcd": 1.1484375,
"loss/logits": 0.24911059439182281,
"step": 320
},
{
"epoch": 0.00554504702925401,
"grad_norm": 0.28125,
"grad_norm_var": 0.001689910888671875,
"learning_rate": 0.01,
"loss": 1.3517,
"loss/crossentropy": 2.4934462308883667,
"loss/fcd": 1.09375,
"loss/logits": 0.2607601135969162,
"step": 321
},
{
"epoch": 0.00556232131906477,
"grad_norm": 0.294921875,
"grad_norm_var": 0.0017145156860351562,
"learning_rate": 0.01,
"loss": 1.4164,
"loss/crossentropy": 2.421591639518738,
"loss/fcd": 1.08203125,
"loss/logits": 0.2476629763841629,
"step": 322
},
{
"epoch": 0.00557959560887553,
"grad_norm": 0.318359375,
"grad_norm_var": 0.0016919453938802084,
"learning_rate": 0.01,
"loss": 1.4522,
"loss/crossentropy": 2.5826879739761353,
"loss/fcd": 1.12890625,
"loss/logits": 0.24336670339107513,
"step": 323
},
{
"epoch": 0.00559686989868629,
"grad_norm": 0.36328125,
"grad_norm_var": 0.0017831802368164062,
"learning_rate": 0.01,
"loss": 1.435,
"loss/crossentropy": 2.6005271673202515,
"loss/fcd": 1.16796875,
"loss/logits": 0.2697305530309677,
"step": 324
},
{
"epoch": 0.00561414418849705,
"grad_norm": 0.3203125,
"grad_norm_var": 0.001741647720336914,
"learning_rate": 0.01,
"loss": 1.4172,
"loss/crossentropy": 2.514216661453247,
"loss/fcd": 1.09375,
"loss/logits": 0.2561942785978317,
"step": 325
},
{
"epoch": 0.005631418478307811,
"grad_norm": 0.283203125,
"grad_norm_var": 0.0017611026763916016,
"learning_rate": 0.01,
"loss": 1.3803,
"loss/crossentropy": 2.6110743284225464,
"loss/fcd": 1.109375,
"loss/logits": 0.25072459131479263,
"step": 326
},
{
"epoch": 0.005648692768118571,
"grad_norm": 0.271484375,
"grad_norm_var": 0.0018299738566080728,
"learning_rate": 0.01,
"loss": 1.3267,
"loss/crossentropy": 2.3151168823242188,
"loss/fcd": 1.06640625,
"loss/logits": 0.22984758019447327,
"step": 327
},
{
"epoch": 0.005665967057929331,
"grad_norm": 0.30859375,
"grad_norm_var": 0.001784515380859375,
"learning_rate": 0.01,
"loss": 1.4146,
"loss/crossentropy": 2.610999584197998,
"loss/fcd": 1.16796875,
"loss/logits": 0.27360329031944275,
"step": 328
},
{
"epoch": 0.005683241347740091,
"grad_norm": 0.330078125,
"grad_norm_var": 0.00178680419921875,
"learning_rate": 0.01,
"loss": 1.4228,
"loss/crossentropy": 2.3715471029281616,
"loss/fcd": 1.125,
"loss/logits": 0.24973652511835098,
"step": 329
},
{
"epoch": 0.005700515637550851,
"grad_norm": 0.359375,
"grad_norm_var": 0.0015657901763916015,
"learning_rate": 0.01,
"loss": 1.3711,
"loss/crossentropy": 2.3313710689544678,
"loss/fcd": 1.0625,
"loss/logits": 0.2390831932425499,
"step": 330
},
{
"epoch": 0.005717789927361611,
"grad_norm": 0.3046875,
"grad_norm_var": 0.0015309651692708333,
"learning_rate": 0.01,
"loss": 1.3683,
"loss/crossentropy": 2.405033826828003,
"loss/fcd": 1.140625,
"loss/logits": 0.26245684921741486,
"step": 331
},
{
"epoch": 0.005735064217172372,
"grad_norm": 0.3046875,
"grad_norm_var": 0.0015340010325113932,
"learning_rate": 0.01,
"loss": 1.3872,
"loss/crossentropy": 2.6667896509170532,
"loss/fcd": 1.14453125,
"loss/logits": 0.2503022700548172,
"step": 332
},
{
"epoch": 0.005752338506983132,
"grad_norm": 0.302734375,
"grad_norm_var": 0.0015253543853759766,
"learning_rate": 0.01,
"loss": 1.3296,
"loss/crossentropy": 2.6033343076705933,
"loss/fcd": 1.1328125,
"loss/logits": 0.24763934314250946,
"step": 333
},
{
"epoch": 0.005769612796793892,
"grad_norm": 0.271484375,
"grad_norm_var": 0.0016357421875,
"learning_rate": 0.01,
"loss": 1.3707,
"loss/crossentropy": 2.3747464418411255,
"loss/fcd": 1.08984375,
"loss/logits": 0.24109259992837906,
"step": 334
},
{
"epoch": 0.005786887086604652,
"grad_norm": 0.30078125,
"grad_norm_var": 0.001557159423828125,
"learning_rate": 0.01,
"loss": 1.3676,
"loss/crossentropy": 2.064777910709381,
"loss/fcd": 1.1875,
"loss/logits": 0.20032966136932373,
"step": 335
},
{
"epoch": 0.005804161376415412,
"grad_norm": 0.31640625,
"grad_norm_var": 0.0007188002268473308,
"learning_rate": 0.01,
"loss": 1.415,
"loss/crossentropy": 2.395054817199707,
"loss/fcd": 1.140625,
"loss/logits": 0.26608574390411377,
"step": 336
},
{
"epoch": 0.005821435666226172,
"grad_norm": 0.27734375,
"grad_norm_var": 0.0007338047027587891,
"learning_rate": 0.01,
"loss": 1.3812,
"loss/crossentropy": 2.2238911390304565,
"loss/fcd": 1.0703125,
"loss/logits": 0.2315894290804863,
"step": 337
},
{
"epoch": 0.005838709956036932,
"grad_norm": 0.400390625,
"grad_norm_var": 0.0012453556060791015,
"learning_rate": 0.01,
"loss": 1.4817,
"loss/crossentropy": 2.6248074769973755,
"loss/fcd": 1.16015625,
"loss/logits": 0.28028567135334015,
"step": 338
},
{
"epoch": 0.005855984245847693,
"grad_norm": 0.287109375,
"grad_norm_var": 0.0012906233469645182,
"learning_rate": 0.01,
"loss": 1.3788,
"loss/crossentropy": 2.125354528427124,
"loss/fcd": 1.140625,
"loss/logits": 0.26438966393470764,
"step": 339
},
{
"epoch": 0.005873258535658453,
"grad_norm": 0.326171875,
"grad_norm_var": 0.0011260350545247396,
"learning_rate": 0.01,
"loss": 1.4301,
"loss/crossentropy": 2.301461696624756,
"loss/fcd": 1.1015625,
"loss/logits": 0.254987433552742,
"step": 340
},
{
"epoch": 0.0058905328254692125,
"grad_norm": 0.326171875,
"grad_norm_var": 0.001136000951131185,
"learning_rate": 0.01,
"loss": 1.4306,
"loss/crossentropy": 2.369805097579956,
"loss/fcd": 1.1015625,
"loss/logits": 0.25373272597789764,
"step": 341
},
{
"epoch": 0.005907807115279973,
"grad_norm": 0.30859375,
"grad_norm_var": 0.0010833104451497396,
"learning_rate": 0.01,
"loss": 1.4098,
"loss/crossentropy": 2.5944920778274536,
"loss/fcd": 1.23046875,
"loss/logits": 0.2799176275730133,
"step": 342
},
{
"epoch": 0.005925081405090733,
"grad_norm": 0.29296875,
"grad_norm_var": 0.0009953657786051433,
"learning_rate": 0.01,
"loss": 1.3992,
"loss/crossentropy": 2.13715797662735,
"loss/fcd": 1.04296875,
"loss/logits": 0.24987763166427612,
"step": 343
},
{
"epoch": 0.005942355694901493,
"grad_norm": 0.3046875,
"grad_norm_var": 0.0009989261627197266,
"learning_rate": 0.01,
"loss": 1.4174,
"loss/crossentropy": 2.4599469900131226,
"loss/fcd": 1.10546875,
"loss/logits": 0.2511429339647293,
"step": 344
},
{
"epoch": 0.005959629984712254,
"grad_norm": 0.333984375,
"grad_norm_var": 0.0010085900624593098,
"learning_rate": 0.01,
"loss": 1.3608,
"loss/crossentropy": 2.431983709335327,
"loss/fcd": 1.125,
"loss/logits": 0.2585323229432106,
"step": 345
},
{
"epoch": 0.005976904274523014,
"grad_norm": 0.35546875,
"grad_norm_var": 0.0009857018788655598,
"learning_rate": 0.01,
"loss": 1.3743,
"loss/crossentropy": 2.3239270448684692,
"loss/fcd": 1.1015625,
"loss/logits": 0.2441619336605072,
"step": 346
},
{
"epoch": 0.0059941785643337735,
"grad_norm": 0.3046875,
"grad_norm_var": 0.0009857018788655598,
"learning_rate": 0.01,
"loss": 1.3826,
"loss/crossentropy": 2.229923963546753,
"loss/fcd": 1.09765625,
"loss/logits": 0.22727931290864944,
"step": 347
},
{
"epoch": 0.006011452854144534,
"grad_norm": 0.275390625,
"grad_norm_var": 0.0010732014973958333,
"learning_rate": 0.01,
"loss": 1.3712,
"loss/crossentropy": 2.6727981567382812,
"loss/fcd": 1.15625,
"loss/logits": 0.28281402587890625,
"step": 348
},
{
"epoch": 0.006028727143955294,
"grad_norm": 0.392578125,
"grad_norm_var": 0.0014724095662434896,
"learning_rate": 0.01,
"loss": 1.4247,
"loss/crossentropy": 2.4443578720092773,
"loss/fcd": 1.15625,
"loss/logits": 0.2722969502210617,
"step": 349
},
{
"epoch": 0.006046001433766054,
"grad_norm": 0.3046875,
"grad_norm_var": 0.0013391971588134766,
"learning_rate": 0.01,
"loss": 1.3573,
"loss/crossentropy": 2.399729371070862,
"loss/fcd": 1.04296875,
"loss/logits": 0.22808712720870972,
"step": 350
},
{
"epoch": 0.006063275723576815,
"grad_norm": 0.32421875,
"grad_norm_var": 0.001315927505493164,
"learning_rate": 0.01,
"loss": 1.3975,
"loss/crossentropy": 2.521644949913025,
"loss/fcd": 1.16796875,
"loss/logits": 0.25423599034547806,
"step": 351
},
{
"epoch": 0.006080550013387575,
"grad_norm": 0.294921875,
"grad_norm_var": 0.0013570149739583334,
"learning_rate": 0.01,
"loss": 1.3756,
"loss/crossentropy": 2.263104200363159,
"loss/fcd": 1.09765625,
"loss/logits": 0.26695793122053146,
"step": 352
},
{
"epoch": 0.0060978243031983344,
"grad_norm": 0.322265625,
"grad_norm_var": 0.0012316226959228516,
"learning_rate": 0.01,
"loss": 1.3735,
"loss/crossentropy": 2.6748716831207275,
"loss/fcd": 1.16796875,
"loss/logits": 0.27432236075401306,
"step": 353
},
{
"epoch": 0.006115098593009095,
"grad_norm": 0.287109375,
"grad_norm_var": 0.0008518060048421223,
"learning_rate": 0.01,
"loss": 1.3334,
"loss/crossentropy": 2.3271913528442383,
"loss/fcd": 1.15625,
"loss/logits": 0.25318336486816406,
"step": 354
},
{
"epoch": 0.006132372882819855,
"grad_norm": 0.328125,
"grad_norm_var": 0.0008040746053059896,
"learning_rate": 0.01,
"loss": 1.4881,
"loss/crossentropy": 2.6528772115707397,
"loss/fcd": 1.296875,
"loss/logits": 0.3017214983701706,
"step": 355
},
{
"epoch": 0.006149647172630615,
"grad_norm": 0.306640625,
"grad_norm_var": 0.0008056640625,
"learning_rate": 0.01,
"loss": 1.3815,
"loss/crossentropy": 2.4514299631118774,
"loss/fcd": 1.16015625,
"loss/logits": 0.25581270456314087,
"step": 356
},
{
"epoch": 0.006166921462441376,
"grad_norm": 0.3203125,
"grad_norm_var": 0.000800180435180664,
"learning_rate": 0.01,
"loss": 1.3652,
"loss/crossentropy": 2.307224750518799,
"loss/fcd": 1.08984375,
"loss/logits": 0.24949809908866882,
"step": 357
},
{
"epoch": 0.0061841957522521356,
"grad_norm": 0.291015625,
"grad_norm_var": 0.000836944580078125,
"learning_rate": 0.01,
"loss": 1.3842,
"loss/crossentropy": 2.120967745780945,
"loss/fcd": 1.1328125,
"loss/logits": 0.2532486915588379,
"step": 358
},
{
"epoch": 0.006201470042062895,
"grad_norm": 0.291015625,
"grad_norm_var": 0.0008429050445556641,
"learning_rate": 0.01,
"loss": 1.4114,
"loss/crossentropy": 2.4582676887512207,
"loss/fcd": 1.23828125,
"loss/logits": 0.278301477432251,
"step": 359
},
{
"epoch": 0.006218744331873656,
"grad_norm": 0.275390625,
"grad_norm_var": 0.000936126708984375,
"learning_rate": 0.01,
"loss": 1.3794,
"loss/crossentropy": 2.636004090309143,
"loss/fcd": 1.21484375,
"loss/logits": 0.2849871665239334,
"step": 360
},
{
"epoch": 0.006236018621684416,
"grad_norm": 0.2890625,
"grad_norm_var": 0.0009364922841389974,
"learning_rate": 0.01,
"loss": 1.4538,
"loss/crossentropy": 2.55968701839447,
"loss/fcd": 1.2109375,
"loss/logits": 0.29454614222049713,
"step": 361
},
{
"epoch": 0.006253292911495176,
"grad_norm": 0.3046875,
"grad_norm_var": 0.0007910251617431641,
"learning_rate": 0.01,
"loss": 1.3948,
"loss/crossentropy": 2.3076229095458984,
"loss/fcd": 1.015625,
"loss/logits": 0.23156127333641052,
"step": 362
},
{
"epoch": 0.006270567201305937,
"grad_norm": 0.294921875,
"grad_norm_var": 0.0008000055948893229,
"learning_rate": 0.01,
"loss": 1.3462,
"loss/crossentropy": 2.3910467624664307,
"loss/fcd": 1.140625,
"loss/logits": 0.24528269469738007,
"step": 363
},
{
"epoch": 0.0062878414911166965,
"grad_norm": 0.291015625,
"grad_norm_var": 0.0007506688435872396,
"learning_rate": 0.01,
"loss": 1.4077,
"loss/crossentropy": 2.3372639417648315,
"loss/fcd": 1.21875,
"loss/logits": 0.2640947550535202,
"step": 364
},
{
"epoch": 0.006305115780927456,
"grad_norm": 0.314453125,
"grad_norm_var": 0.0002445856730143229,
"learning_rate": 0.01,
"loss": 1.3759,
"loss/crossentropy": 2.454505205154419,
"loss/fcd": 1.05078125,
"loss/logits": 0.2401072233915329,
"step": 365
},
{
"epoch": 0.006322390070738217,
"grad_norm": 0.314453125,
"grad_norm_var": 0.0002534071604410807,
"learning_rate": 0.01,
"loss": 1.3749,
"loss/crossentropy": 2.3645259141921997,
"loss/fcd": 1.125,
"loss/logits": 0.23153205960988998,
"step": 366
},
{
"epoch": 0.006339664360548977,
"grad_norm": 0.27734375,
"grad_norm_var": 0.0002587477366129557,
"learning_rate": 0.01,
"loss": 1.3546,
"loss/crossentropy": 2.494025230407715,
"loss/fcd": 1.14453125,
"loss/logits": 0.26557300239801407,
"step": 367
},
{
"epoch": 0.006356938650359737,
"grad_norm": 0.28125,
"grad_norm_var": 0.000279998779296875,
"learning_rate": 0.01,
"loss": 1.3496,
"loss/crossentropy": 2.3776293992996216,
"loss/fcd": 1.12109375,
"loss/logits": 0.2568487524986267,
"step": 368
},
{
"epoch": 0.006374212940170498,
"grad_norm": 0.30078125,
"grad_norm_var": 0.00024310747782389322,
"learning_rate": 0.01,
"loss": 1.3734,
"loss/crossentropy": 2.591793417930603,
"loss/fcd": 1.15234375,
"loss/logits": 0.27023325860500336,
"step": 369
},
{
"epoch": 0.0063914872299812575,
"grad_norm": 0.302734375,
"grad_norm_var": 0.0002357323964436849,
"learning_rate": 0.01,
"loss": 1.3944,
"loss/crossentropy": 2.415038585662842,
"loss/fcd": 1.15625,
"loss/logits": 0.3026815205812454,
"step": 370
},
{
"epoch": 0.006408761519792017,
"grad_norm": 0.3359375,
"grad_norm_var": 0.0002699375152587891,
"learning_rate": 0.01,
"loss": 1.3485,
"loss/crossentropy": 2.4911344051361084,
"loss/fcd": 1.08984375,
"loss/logits": 0.2093563750386238,
"step": 371
},
{
"epoch": 0.006426035809602778,
"grad_norm": 0.28515625,
"grad_norm_var": 0.00027815500895182293,
"learning_rate": 0.01,
"loss": 1.34,
"loss/crossentropy": 2.470622181892395,
"loss/fcd": 1.17578125,
"loss/logits": 0.2913671284914017,
"step": 372
},
{
"epoch": 0.006443310099413538,
"grad_norm": 0.3203125,
"grad_norm_var": 0.00027815500895182293,
"learning_rate": 0.01,
"loss": 1.4077,
"loss/crossentropy": 2.6227082014083862,
"loss/fcd": 1.09375,
"loss/logits": 0.24900969862937927,
"step": 373
},
{
"epoch": 0.006460584389224298,
"grad_norm": 0.32421875,
"grad_norm_var": 0.0003157138824462891,
"learning_rate": 0.01,
"loss": 1.4209,
"loss/crossentropy": 3.0212732553482056,
"loss/fcd": 1.22265625,
"loss/logits": 0.270741730928421,
"step": 374
},
{
"epoch": 0.006477858679035058,
"grad_norm": 0.326171875,
"grad_norm_var": 0.0003500461578369141,
"learning_rate": 0.01,
"loss": 1.465,
"loss/crossentropy": 2.8352737426757812,
"loss/fcd": 1.24609375,
"loss/logits": 0.31054478883743286,
"step": 375
},
{
"epoch": 0.0064951329688458185,
"grad_norm": 0.314453125,
"grad_norm_var": 0.0003049055735270182,
"learning_rate": 0.01,
"loss": 1.3951,
"loss/crossentropy": 2.450179100036621,
"loss/fcd": 1.140625,
"loss/logits": 0.24616704881191254,
"step": 376
},
{
"epoch": 0.006512407258656578,
"grad_norm": 0.283203125,
"grad_norm_var": 0.0003193537394205729,
"learning_rate": 0.01,
"loss": 1.3687,
"loss/crossentropy": 2.2392066717147827,
"loss/fcd": 1.025390625,
"loss/logits": 0.24169814586639404,
"step": 377
},
{
"epoch": 0.006529681548467338,
"grad_norm": 0.302734375,
"grad_norm_var": 0.0003195285797119141,
"learning_rate": 0.01,
"loss": 1.4315,
"loss/crossentropy": 2.6067546606063843,
"loss/fcd": 1.18359375,
"loss/logits": 0.31542879343032837,
"step": 378
},
{
"epoch": 0.006546955838278099,
"grad_norm": 0.298828125,
"grad_norm_var": 0.00031558672587076825,
"learning_rate": 0.01,
"loss": 1.3917,
"loss/crossentropy": 2.360989570617676,
"loss/fcd": 1.13671875,
"loss/logits": 0.25205816328525543,
"step": 379
},
{
"epoch": 0.006564230128088859,
"grad_norm": 0.322265625,
"grad_norm_var": 0.0003201643625895182,
"learning_rate": 0.01,
"loss": 1.4293,
"loss/crossentropy": 2.71570360660553,
"loss/fcd": 1.171875,
"loss/logits": 0.2731679454445839,
"step": 380
},
{
"epoch": 0.006581504417899619,
"grad_norm": 0.2890625,
"grad_norm_var": 0.00033359527587890626,
"learning_rate": 0.01,
"loss": 1.351,
"loss/crossentropy": 2.624392867088318,
"loss/fcd": 1.12109375,
"loss/logits": 0.2293551042675972,
"step": 381
},
{
"epoch": 0.0065987787077103795,
"grad_norm": 0.310546875,
"grad_norm_var": 0.00032958984375,
"learning_rate": 0.01,
"loss": 1.3969,
"loss/crossentropy": 2.1760467290878296,
"loss/fcd": 1.1171875,
"loss/logits": 0.23204915970563889,
"step": 382
},
{
"epoch": 0.006616052997521139,
"grad_norm": 0.2890625,
"grad_norm_var": 0.00029544830322265626,
"learning_rate": 0.01,
"loss": 1.3163,
"loss/crossentropy": 2.089251697063446,
"loss/fcd": 1.041015625,
"loss/logits": 0.21481642127037048,
"step": 383
},
{
"epoch": 0.006633327287331899,
"grad_norm": 0.3046875,
"grad_norm_var": 0.00025424957275390627,
"learning_rate": 0.01,
"loss": 1.413,
"loss/crossentropy": 2.1335262060165405,
"loss/fcd": 1.24609375,
"loss/logits": 0.29476068913936615,
"step": 384
},
{
"epoch": 0.00665060157714266,
"grad_norm": 0.287109375,
"grad_norm_var": 0.0002770582834879557,
"learning_rate": 0.01,
"loss": 1.3581,
"loss/crossentropy": 2.3327542543411255,
"loss/fcd": 1.0859375,
"loss/logits": 0.2519141435623169,
"step": 385
},
{
"epoch": 0.00666787586695342,
"grad_norm": 0.298828125,
"grad_norm_var": 0.0002797285715738932,
"learning_rate": 0.01,
"loss": 1.3991,
"loss/crossentropy": 2.521241784095764,
"loss/fcd": 1.1640625,
"loss/logits": 0.2740743160247803,
"step": 386
},
{
"epoch": 0.00668515015676418,
"grad_norm": 0.341796875,
"grad_norm_var": 0.00030543009440104165,
"learning_rate": 0.01,
"loss": 1.4046,
"loss/crossentropy": 2.5978543758392334,
"loss/fcd": 1.18359375,
"loss/logits": 0.24079592525959015,
"step": 387
},
{
"epoch": 0.0067024244465749405,
"grad_norm": 0.326171875,
"grad_norm_var": 0.00029575030008951824,
"learning_rate": 0.01,
"loss": 1.4169,
"loss/crossentropy": 2.246425747871399,
"loss/fcd": 1.19140625,
"loss/logits": 0.2572794705629349,
"step": 388
},
{
"epoch": 0.0067196987363857,
"grad_norm": 0.291015625,
"grad_norm_var": 0.0003040949503580729,
"learning_rate": 0.01,
"loss": 1.382,
"loss/crossentropy": 2.293286442756653,
"loss/fcd": 1.09375,
"loss/logits": 0.23658673465251923,
"step": 389
},
{
"epoch": 0.00673697302619646,
"grad_norm": 0.33203125,
"grad_norm_var": 0.00032596588134765626,
"learning_rate": 0.01,
"loss": 1.4607,
"loss/crossentropy": 2.441470980644226,
"loss/fcd": 1.31640625,
"loss/logits": 0.28673678636550903,
"step": 390
},
{
"epoch": 0.006754247316007221,
"grad_norm": 0.296875,
"grad_norm_var": 0.0003061771392822266,
"learning_rate": 0.01,
"loss": 1.3719,
"loss/crossentropy": 2.5365694761276245,
"loss/fcd": 1.16015625,
"loss/logits": 0.2776503562927246,
"step": 391
},
{
"epoch": 0.006771521605817981,
"grad_norm": 0.33984375,
"grad_norm_var": 0.00037663777669270834,
"learning_rate": 0.01,
"loss": 1.4373,
"loss/crossentropy": 2.517317056655884,
"loss/fcd": 1.15234375,
"loss/logits": 0.27259568870067596,
"step": 392
},
{
"epoch": 0.006788795895628741,
"grad_norm": 0.30078125,
"grad_norm_var": 0.0003398736317952474,
"learning_rate": 0.01,
"loss": 1.382,
"loss/crossentropy": 2.38772451877594,
"loss/fcd": 1.0859375,
"loss/logits": 0.24375227838754654,
"step": 393
},
{
"epoch": 0.006806070185439501,
"grad_norm": 0.28515625,
"grad_norm_var": 0.0003720601399739583,
"learning_rate": 0.01,
"loss": 1.3734,
"loss/crossentropy": 2.2084882259368896,
"loss/fcd": 1.076171875,
"loss/logits": 0.22468051314353943,
"step": 394
},
{
"epoch": 0.006823344475250261,
"grad_norm": 0.333984375,
"grad_norm_var": 0.00041039784749348957,
"learning_rate": 0.01,
"loss": 1.4417,
"loss/crossentropy": 2.4394543170928955,
"loss/fcd": 1.15234375,
"loss/logits": 0.25751829147338867,
"step": 395
},
{
"epoch": 0.006840618765061021,
"grad_norm": 0.28125,
"grad_norm_var": 0.0004447778065999349,
"learning_rate": 0.01,
"loss": 1.3414,
"loss/crossentropy": 2.365694999694824,
"loss/fcd": 1.07421875,
"loss/logits": 0.24539195746183395,
"step": 396
},
{
"epoch": 0.006857893054871782,
"grad_norm": 0.310546875,
"grad_norm_var": 0.00042292277018229164,
"learning_rate": 0.01,
"loss": 1.396,
"loss/crossentropy": 2.4616193771362305,
"loss/fcd": 1.203125,
"loss/logits": 0.2692428231239319,
"step": 397
},
{
"epoch": 0.006875167344682542,
"grad_norm": 0.330078125,
"grad_norm_var": 0.0004531224568684896,
"learning_rate": 0.01,
"loss": 1.403,
"loss/crossentropy": 2.2189152240753174,
"loss/fcd": 1.0859375,
"loss/logits": 0.24257495999336243,
"step": 398
},
{
"epoch": 0.006892441634493302,
"grad_norm": 0.306640625,
"grad_norm_var": 0.0004249413808186849,
"learning_rate": 0.01,
"loss": 1.3559,
"loss/crossentropy": 2.37640380859375,
"loss/fcd": 1.09375,
"loss/logits": 0.2584332674741745,
"step": 399
},
{
"epoch": 0.006909715924304062,
"grad_norm": 0.34765625,
"grad_norm_var": 0.0005074659983317057,
"learning_rate": 0.01,
"loss": 1.4774,
"loss/crossentropy": 2.384715437889099,
"loss/fcd": 1.1171875,
"loss/logits": 0.2619960308074951,
"step": 400
},
{
"epoch": 0.006926990214114822,
"grad_norm": 0.310546875,
"grad_norm_var": 0.00046054522196451825,
"learning_rate": 0.01,
"loss": 1.4446,
"loss/crossentropy": 2.1976479291915894,
"loss/fcd": 1.09375,
"loss/logits": 0.25502003729343414,
"step": 401
},
{
"epoch": 0.006944264503925582,
"grad_norm": 0.302734375,
"grad_norm_var": 0.0004532972971598307,
"learning_rate": 0.01,
"loss": 1.3809,
"loss/crossentropy": 2.278647780418396,
"loss/fcd": 1.1015625,
"loss/logits": 0.2284827083349228,
"step": 402
},
{
"epoch": 0.006961538793736343,
"grad_norm": 0.2734375,
"grad_norm_var": 0.0004994710286458333,
"learning_rate": 0.01,
"loss": 1.3505,
"loss/crossentropy": 2.4870028495788574,
"loss/fcd": 1.08984375,
"loss/logits": 0.2371172457933426,
"step": 403
},
{
"epoch": 0.006978813083547103,
"grad_norm": 0.314453125,
"grad_norm_var": 0.0004836400349934896,
"learning_rate": 0.01,
"loss": 1.4059,
"loss/crossentropy": 2.65886914730072,
"loss/fcd": 1.171875,
"loss/logits": 0.2828421890735626,
"step": 404
},
{
"epoch": 0.006996087373357863,
"grad_norm": 0.27734375,
"grad_norm_var": 0.0005295912424723308,
"learning_rate": 0.01,
"loss": 1.3245,
"loss/crossentropy": 2.1928412914276123,
"loss/fcd": 1.0234375,
"loss/logits": 0.22634898871183395,
"step": 405
},
{
"epoch": 0.007013361663168623,
"grad_norm": 0.3046875,
"grad_norm_var": 0.0004922072092692057,
"learning_rate": 0.01,
"loss": 1.4224,
"loss/crossentropy": 2.6360604763031006,
"loss/fcd": 1.24609375,
"loss/logits": 0.2727653980255127,
"step": 406
},
{
"epoch": 0.007030635952979383,
"grad_norm": 0.3125,
"grad_norm_var": 0.00048584938049316405,
"learning_rate": 0.01,
"loss": 1.3588,
"loss/crossentropy": 2.3004168272018433,
"loss/fcd": 1.078125,
"loss/logits": 0.239614799618721,
"step": 407
},
{
"epoch": 0.007047910242790143,
"grad_norm": 0.32421875,
"grad_norm_var": 0.0004352410634358724,
"learning_rate": 0.01,
"loss": 1.4105,
"loss/crossentropy": 2.3150475025177,
"loss/fcd": 1.09765625,
"loss/logits": 0.2282358631491661,
"step": 408
},
{
"epoch": 0.007065184532600904,
"grad_norm": 0.275390625,
"grad_norm_var": 0.0004974365234375,
"learning_rate": 0.01,
"loss": 1.3576,
"loss/crossentropy": 2.645399570465088,
"loss/fcd": 1.17578125,
"loss/logits": 0.2676645368337631,
"step": 409
},
{
"epoch": 0.007082458822411664,
"grad_norm": 0.3203125,
"grad_norm_var": 0.00047855377197265626,
"learning_rate": 0.01,
"loss": 1.4103,
"loss/crossentropy": 2.1640161275863647,
"loss/fcd": 1.12890625,
"loss/logits": 0.2479998767375946,
"step": 410
},
{
"epoch": 0.007099733112222424,
"grad_norm": 0.3046875,
"grad_norm_var": 0.0004301548004150391,
"learning_rate": 0.01,
"loss": 1.4286,
"loss/crossentropy": 2.5662118196487427,
"loss/fcd": 1.18359375,
"loss/logits": 0.2710702270269394,
"step": 411
},
{
"epoch": 0.0071170074020331835,
"grad_norm": 0.298828125,
"grad_norm_var": 0.000391387939453125,
"learning_rate": 0.01,
"loss": 1.369,
"loss/crossentropy": 2.1513331532478333,
"loss/fcd": 1.0625,
"loss/logits": 0.22271250188350677,
"step": 412
},
{
"epoch": 0.007134281691843944,
"grad_norm": 0.283203125,
"grad_norm_var": 0.00042565663655598957,
"learning_rate": 0.01,
"loss": 1.3294,
"loss/crossentropy": 2.2309274673461914,
"loss/fcd": 1.0546875,
"loss/logits": 0.24107103794813156,
"step": 413
},
{
"epoch": 0.007151555981654704,
"grad_norm": 0.287109375,
"grad_norm_var": 0.0003997802734375,
"learning_rate": 0.01,
"loss": 1.3924,
"loss/crossentropy": 2.6093149185180664,
"loss/fcd": 1.08984375,
"loss/logits": 0.24238202720880508,
"step": 414
},
{
"epoch": 0.007168830271465464,
"grad_norm": 0.3828125,
"grad_norm_var": 0.0008020877838134765,
"learning_rate": 0.01,
"loss": 1.4011,
"loss/crossentropy": 2.6286522150039673,
"loss/fcd": 1.1328125,
"loss/logits": 0.258474200963974,
"step": 415
},
{
"epoch": 0.007186104561276225,
"grad_norm": 0.279296875,
"grad_norm_var": 0.0007280985514322917,
"learning_rate": 0.01,
"loss": 1.3625,
"loss/crossentropy": 2.686766266822815,
"loss/fcd": 1.0859375,
"loss/logits": 0.24827048182487488,
"step": 416
},
{
"epoch": 0.007203378851086985,
"grad_norm": 0.287109375,
"grad_norm_var": 0.0007395426432291667,
"learning_rate": 0.01,
"loss": 1.3839,
"loss/crossentropy": 2.319527268409729,
"loss/fcd": 1.20703125,
"loss/logits": 0.2674332559108734,
"step": 417
},
{
"epoch": 0.0072206531408977445,
"grad_norm": 0.28515625,
"grad_norm_var": 0.0007565657297770183,
"learning_rate": 0.01,
"loss": 1.3619,
"loss/crossentropy": 2.3169610500335693,
"loss/fcd": 1.0859375,
"loss/logits": 0.23959346115589142,
"step": 418
},
{
"epoch": 0.007237927430708505,
"grad_norm": 0.328125,
"grad_norm_var": 0.0007449944814046223,
"learning_rate": 0.01,
"loss": 1.4737,
"loss/crossentropy": 2.5569876432418823,
"loss/fcd": 1.109375,
"loss/logits": 0.2552832216024399,
"step": 419
},
{
"epoch": 0.007255201720519265,
"grad_norm": 0.3046875,
"grad_norm_var": 0.0007374445597330729,
"learning_rate": 0.01,
"loss": 1.4197,
"loss/crossentropy": 2.0687599182128906,
"loss/fcd": 1.1640625,
"loss/logits": 0.2598320543766022,
"step": 420
},
{
"epoch": 0.007272476010330025,
"grad_norm": 0.294921875,
"grad_norm_var": 0.0006955305735270183,
"learning_rate": 0.01,
"loss": 1.4605,
"loss/crossentropy": 2.419862389564514,
"loss/fcd": 1.171875,
"loss/logits": 0.2556862235069275,
"step": 421
},
{
"epoch": 0.007289750300140786,
"grad_norm": 0.30078125,
"grad_norm_var": 0.0006964206695556641,
"learning_rate": 0.01,
"loss": 1.4071,
"loss/crossentropy": 2.5204795598983765,
"loss/fcd": 1.17578125,
"loss/logits": 0.2741318494081497,
"step": 422
},
{
"epoch": 0.007307024589951546,
"grad_norm": 0.310546875,
"grad_norm_var": 0.0006945292154947917,
"learning_rate": 0.01,
"loss": 1.4196,
"loss/crossentropy": 2.489278793334961,
"loss/fcd": 1.12109375,
"loss/logits": 0.25576694309711456,
"step": 423
},
{
"epoch": 0.0073242988797623055,
"grad_norm": 0.27734375,
"grad_norm_var": 0.0007067362467447917,
"learning_rate": 0.01,
"loss": 1.3185,
"loss/crossentropy": 2.3392102122306824,
"loss/fcd": 1.03125,
"loss/logits": 0.21298449486494064,
"step": 424
},
{
"epoch": 0.007341573169573066,
"grad_norm": 0.28125,
"grad_norm_var": 0.0006886641184488932,
"learning_rate": 0.01,
"loss": 1.3571,
"loss/crossentropy": 2.2977930307388306,
"loss/fcd": 1.0859375,
"loss/logits": 0.23583728075027466,
"step": 425
},
{
"epoch": 0.007358847459383826,
"grad_norm": 0.3203125,
"grad_norm_var": 0.0006886641184488932,
"learning_rate": 0.01,
"loss": 1.4277,
"loss/crossentropy": 2.6484419107437134,
"loss/fcd": 1.1953125,
"loss/logits": 0.27860742807388306,
"step": 426
},
{
"epoch": 0.007376121749194586,
"grad_norm": 0.275390625,
"grad_norm_var": 0.0007303873697916667,
"learning_rate": 0.01,
"loss": 1.3439,
"loss/crossentropy": 2.460866689682007,
"loss/fcd": 1.0703125,
"loss/logits": 0.23756644129753113,
"step": 427
},
{
"epoch": 0.007393396039005347,
"grad_norm": 0.27734375,
"grad_norm_var": 0.000762033462524414,
"learning_rate": 0.01,
"loss": 1.343,
"loss/crossentropy": 2.0784988403320312,
"loss/fcd": 1.044921875,
"loss/logits": 0.21802522987127304,
"step": 428
},
{
"epoch": 0.0074106703288161066,
"grad_norm": 0.3046875,
"grad_norm_var": 0.0007471720377604167,
"learning_rate": 0.01,
"loss": 1.3824,
"loss/crossentropy": 2.312214493751526,
"loss/fcd": 1.08203125,
"loss/logits": 0.2373846471309662,
"step": 429
},
{
"epoch": 0.0074279446186268664,
"grad_norm": 0.2890625,
"grad_norm_var": 0.0007441043853759766,
"learning_rate": 0.01,
"loss": 1.4031,
"loss/crossentropy": 2.43253231048584,
"loss/fcd": 1.0390625,
"loss/logits": 0.24533094465732574,
"step": 430
},
{
"epoch": 0.007445218908437627,
"grad_norm": 0.296875,
"grad_norm_var": 0.0002559502919514974,
"learning_rate": 0.01,
"loss": 1.3775,
"loss/crossentropy": 2.7691128253936768,
"loss/fcd": 1.13671875,
"loss/logits": 0.22900952398777008,
"step": 431
},
{
"epoch": 0.007462493198248387,
"grad_norm": 0.283203125,
"grad_norm_var": 0.0002489566802978516,
"learning_rate": 0.01,
"loss": 1.3613,
"loss/crossentropy": 2.231864333152771,
"loss/fcd": 1.05859375,
"loss/logits": 0.24191942811012268,
"step": 432
},
{
"epoch": 0.007479767488059147,
"grad_norm": 0.27734375,
"grad_norm_var": 0.00026493072509765626,
"learning_rate": 0.01,
"loss": 1.4166,
"loss/crossentropy": 2.343968152999878,
"loss/fcd": 1.0859375,
"loss/logits": 0.2661665081977844,
"step": 433
},
{
"epoch": 0.007497041777869908,
"grad_norm": 0.3203125,
"grad_norm_var": 0.000299835205078125,
"learning_rate": 0.01,
"loss": 1.3807,
"loss/crossentropy": 2.6194422245025635,
"loss/fcd": 1.11328125,
"loss/logits": 0.2392604500055313,
"step": 434
},
{
"epoch": 0.0075143160676806675,
"grad_norm": 0.283203125,
"grad_norm_var": 0.00023585955301920573,
"learning_rate": 0.01,
"loss": 1.2902,
"loss/crossentropy": 2.46696138381958,
"loss/fcd": 1.08203125,
"loss/logits": 0.26606328785419464,
"step": 435
},
{
"epoch": 0.007531590357491427,
"grad_norm": 0.291015625,
"grad_norm_var": 0.0002272923787434896,
"learning_rate": 0.01,
"loss": 1.3931,
"loss/crossentropy": 2.4375393390655518,
"loss/fcd": 1.14453125,
"loss/logits": 0.27766771614551544,
"step": 436
},
{
"epoch": 0.007548864647302188,
"grad_norm": 0.33984375,
"grad_norm_var": 0.0003665765126546224,
"learning_rate": 0.01,
"loss": 1.3732,
"loss/crossentropy": 2.3699560165405273,
"loss/fcd": 1.16796875,
"loss/logits": 0.2573126032948494,
"step": 437
},
{
"epoch": 0.007566138937112948,
"grad_norm": 0.306640625,
"grad_norm_var": 0.00037282307942708334,
"learning_rate": 0.01,
"loss": 1.4006,
"loss/crossentropy": 2.227339029312134,
"loss/fcd": 1.14453125,
"loss/logits": 0.2607281506061554,
"step": 438
},
{
"epoch": 0.007583413226923708,
"grad_norm": 0.2734375,
"grad_norm_var": 0.0003864129384358724,
"learning_rate": 0.01,
"loss": 1.3632,
"loss/crossentropy": 2.46047842502594,
"loss/fcd": 1.09765625,
"loss/logits": 0.24269723892211914,
"step": 439
},
{
"epoch": 0.007600687516734469,
"grad_norm": 0.294921875,
"grad_norm_var": 0.0003676732381184896,
"learning_rate": 0.01,
"loss": 1.3795,
"loss/crossentropy": 2.4994819164276123,
"loss/fcd": 1.14453125,
"loss/logits": 0.25722265988588333,
"step": 440
},
{
"epoch": 0.0076179618065452285,
"grad_norm": 0.265625,
"grad_norm_var": 0.0004109064737955729,
"learning_rate": 0.01,
"loss": 1.3064,
"loss/crossentropy": 2.5115902423858643,
"loss/fcd": 1.078125,
"loss/logits": 0.2371089681982994,
"step": 441
},
{
"epoch": 0.007635236096355988,
"grad_norm": 0.287109375,
"grad_norm_var": 0.0003619988759358724,
"learning_rate": 0.01,
"loss": 1.4358,
"loss/crossentropy": 2.380179762840271,
"loss/fcd": 1.1015625,
"loss/logits": 0.2640485018491745,
"step": 442
},
{
"epoch": 0.007652510386166749,
"grad_norm": 0.3359375,
"grad_norm_var": 0.000460052490234375,
"learning_rate": 0.01,
"loss": 1.4287,
"loss/crossentropy": 2.6699330806732178,
"loss/fcd": 1.23828125,
"loss/logits": 0.2810060381889343,
"step": 443
},
{
"epoch": 0.007669784675977509,
"grad_norm": 0.328125,
"grad_norm_var": 0.0004988988240559896,
"learning_rate": 0.01,
"loss": 1.4246,
"loss/crossentropy": 2.5262972116470337,
"loss/fcd": 1.13671875,
"loss/logits": 0.25480419397354126,
"step": 444
},
{
"epoch": 0.007687058965788269,
"grad_norm": 0.34765625,
"grad_norm_var": 0.000649261474609375,
"learning_rate": 0.01,
"loss": 1.3859,
"loss/crossentropy": 2.3320013284683228,
"loss/fcd": 1.05078125,
"loss/logits": 0.2234661728143692,
"step": 445
},
{
"epoch": 0.007704333255599029,
"grad_norm": 0.29296875,
"grad_norm_var": 0.0006438573201497396,
"learning_rate": 0.01,
"loss": 1.3848,
"loss/crossentropy": 2.448530673980713,
"loss/fcd": 1.234375,
"loss/logits": 0.2647833973169327,
"step": 446
},
{
"epoch": 0.0077216075454097895,
"grad_norm": 0.3046875,
"grad_norm_var": 0.000642840067545573,
"learning_rate": 0.01,
"loss": 1.4458,
"loss/crossentropy": 2.279269576072693,
"loss/fcd": 1.1640625,
"loss/logits": 0.2693684697151184,
"step": 447
},
{
"epoch": 0.007738881835220549,
"grad_norm": 0.296875,
"grad_norm_var": 0.0006202538808186849,
"learning_rate": 0.01,
"loss": 1.3777,
"loss/crossentropy": 2.6742255687713623,
"loss/fcd": 1.1796875,
"loss/logits": 0.2811601459980011,
"step": 448
},
{
"epoch": 0.007756156125031309,
"grad_norm": 0.326171875,
"grad_norm_var": 0.0006031672159830729,
"learning_rate": 0.01,
"loss": 1.4104,
"loss/crossentropy": 2.4074745178222656,
"loss/fcd": 1.078125,
"loss/logits": 0.24794109165668488,
"step": 449
},
{
"epoch": 0.00777343041484207,
"grad_norm": 0.302734375,
"grad_norm_var": 0.0005887190500895183,
"learning_rate": 0.01,
"loss": 1.3185,
"loss/crossentropy": 2.35663104057312,
"loss/fcd": 1.12109375,
"loss/logits": 0.22819262742996216,
"step": 450
},
{
"epoch": 0.00779070470465283,
"grad_norm": 0.30859375,
"grad_norm_var": 0.0005558649698893229,
"learning_rate": 0.01,
"loss": 1.3834,
"loss/crossentropy": 2.6186258792877197,
"loss/fcd": 1.12109375,
"loss/logits": 0.2587556540966034,
"step": 451
},
{
"epoch": 0.00780797899446359,
"grad_norm": 0.32421875,
"grad_norm_var": 0.0005566755930582683,
"learning_rate": 0.01,
"loss": 1.4106,
"loss/crossentropy": 2.6754432916641235,
"loss/fcd": 1.1328125,
"loss/logits": 0.2465488687157631,
"step": 452
},
{
"epoch": 0.00782525328427435,
"grad_norm": 0.287109375,
"grad_norm_var": 0.0005098978678385416,
"learning_rate": 0.01,
"loss": 1.3696,
"loss/crossentropy": 2.5379905700683594,
"loss/fcd": 1.16015625,
"loss/logits": 0.2804763838648796,
"step": 453
},
{
"epoch": 0.007842527574085111,
"grad_norm": 0.2890625,
"grad_norm_var": 0.0005257765452067058,
"learning_rate": 0.01,
"loss": 1.3851,
"loss/crossentropy": 2.5852067470550537,
"loss/fcd": 1.11328125,
"loss/logits": 0.23731224238872528,
"step": 454
},
{
"epoch": 0.007859801863895871,
"grad_norm": 0.302734375,
"grad_norm_var": 0.0004597345987955729,
"learning_rate": 0.01,
"loss": 1.4338,
"loss/crossentropy": 2.6572701930999756,
"loss/fcd": 1.234375,
"loss/logits": 0.28852197527885437,
"step": 455
},
{
"epoch": 0.007877076153706631,
"grad_norm": 0.30859375,
"grad_norm_var": 0.0004513899485270182,
"learning_rate": 0.01,
"loss": 1.3824,
"loss/crossentropy": 2.6901192665100098,
"loss/fcd": 1.12890625,
"loss/logits": 0.24115828424692154,
"step": 456
},
{
"epoch": 0.00789435044351739,
"grad_norm": 0.451171875,
"grad_norm_var": 0.00158538818359375,
"learning_rate": 0.01,
"loss": 1.5497,
"loss/crossentropy": 2.636592984199524,
"loss/fcd": 1.30859375,
"loss/logits": 0.36482033133506775,
"step": 457
},
{
"epoch": 0.00791162473332815,
"grad_norm": 0.30078125,
"grad_norm_var": 0.0015401045481363933,
"learning_rate": 0.01,
"loss": 1.3698,
"loss/crossentropy": 2.414226531982422,
"loss/fcd": 1.076171875,
"loss/logits": 0.2397611290216446,
"step": 458
},
{
"epoch": 0.00792889902313891,
"grad_norm": 0.3203125,
"grad_norm_var": 0.001520522435506185,
"learning_rate": 0.01,
"loss": 1.4165,
"loss/crossentropy": 2.463810086250305,
"loss/fcd": 1.1484375,
"loss/logits": 0.24305613338947296,
"step": 459
},
{
"epoch": 0.007946173312949672,
"grad_norm": 0.314453125,
"grad_norm_var": 0.0015141805013020833,
"learning_rate": 0.01,
"loss": 1.4418,
"loss/crossentropy": 2.451104521751404,
"loss/fcd": 1.296875,
"loss/logits": 0.30130288004875183,
"step": 460
},
{
"epoch": 0.007963447602760432,
"grad_norm": 0.31640625,
"grad_norm_var": 0.0014490763346354167,
"learning_rate": 0.01,
"loss": 1.3988,
"loss/crossentropy": 2.53925359249115,
"loss/fcd": 1.11328125,
"loss/logits": 0.24273447692394257,
"step": 461
},
{
"epoch": 0.007980721892571192,
"grad_norm": 0.318359375,
"grad_norm_var": 0.0014133294423421224,
"learning_rate": 0.01,
"loss": 1.3928,
"loss/crossentropy": 2.5229551792144775,
"loss/fcd": 1.1640625,
"loss/logits": 0.25667132437229156,
"step": 462
},
{
"epoch": 0.007997996182381952,
"grad_norm": 0.294921875,
"grad_norm_var": 0.0014353434244791666,
"learning_rate": 0.01,
"loss": 1.3347,
"loss/crossentropy": 2.341879367828369,
"loss/fcd": 1.12890625,
"loss/logits": 0.23053725808858871,
"step": 463
},
{
"epoch": 0.008015270472192712,
"grad_norm": 0.28515625,
"grad_norm_var": 0.0014744440714518229,
"learning_rate": 0.01,
"loss": 1.3569,
"loss/crossentropy": 2.2920732498168945,
"loss/fcd": 1.03515625,
"loss/logits": 0.23280857503414154,
"step": 464
},
{
"epoch": 0.008032544762003472,
"grad_norm": 0.349609375,
"grad_norm_var": 0.001541582743326823,
"learning_rate": 0.01,
"loss": 1.3894,
"loss/crossentropy": 2.515018939971924,
"loss/fcd": 1.10546875,
"loss/logits": 0.24030621349811554,
"step": 465
},
{
"epoch": 0.008049819051814231,
"grad_norm": 0.322265625,
"grad_norm_var": 0.0015279134114583334,
"learning_rate": 0.01,
"loss": 1.4597,
"loss/crossentropy": 2.2328585386276245,
"loss/fcd": 1.10546875,
"loss/logits": 0.25991010665893555,
"step": 466
},
{
"epoch": 0.008067093341624993,
"grad_norm": 0.306640625,
"grad_norm_var": 0.0015306949615478515,
"learning_rate": 0.01,
"loss": 1.4036,
"loss/crossentropy": 2.798638701438904,
"loss/fcd": 1.203125,
"loss/logits": 0.29376721382141113,
"step": 467
},
{
"epoch": 0.008084367631435753,
"grad_norm": 0.287109375,
"grad_norm_var": 0.0015871683756510417,
"learning_rate": 0.01,
"loss": 1.358,
"loss/crossentropy": 2.322153091430664,
"loss/fcd": 1.15625,
"loss/logits": 0.2475121170282364,
"step": 468
},
{
"epoch": 0.008101641921246513,
"grad_norm": 0.287109375,
"grad_norm_var": 0.0015871683756510417,
"learning_rate": 0.01,
"loss": 1.3756,
"loss/crossentropy": 2.2007282972335815,
"loss/fcd": 1.046875,
"loss/logits": 0.23374570161104202,
"step": 469
},
{
"epoch": 0.008118916211057273,
"grad_norm": 0.287109375,
"grad_norm_var": 0.001594400405883789,
"learning_rate": 0.01,
"loss": 1.366,
"loss/crossentropy": 2.408711314201355,
"loss/fcd": 1.1171875,
"loss/logits": 0.23746006190776825,
"step": 470
},
{
"epoch": 0.008136190500868033,
"grad_norm": 0.283203125,
"grad_norm_var": 0.0016522566477457682,
"learning_rate": 0.01,
"loss": 1.4157,
"loss/crossentropy": 2.328341841697693,
"loss/fcd": 1.15234375,
"loss/logits": 0.2784807085990906,
"step": 471
},
{
"epoch": 0.008153464790678792,
"grad_norm": 0.3046875,
"grad_norm_var": 0.0016563256581624349,
"learning_rate": 0.01,
"loss": 1.3845,
"loss/crossentropy": 2.414987564086914,
"loss/fcd": 1.26171875,
"loss/logits": 0.32799775153398514,
"step": 472
},
{
"epoch": 0.008170739080489554,
"grad_norm": 0.357421875,
"grad_norm_var": 0.0004951318105061848,
"learning_rate": 0.01,
"loss": 1.4935,
"loss/crossentropy": 2.597047209739685,
"loss/fcd": 1.34375,
"loss/logits": 0.3595212921500206,
"step": 473
},
{
"epoch": 0.008188013370300314,
"grad_norm": 0.314453125,
"grad_norm_var": 0.0004927953084309896,
"learning_rate": 0.01,
"loss": 1.4074,
"loss/crossentropy": 2.6870315074920654,
"loss/fcd": 1.15625,
"loss/logits": 0.2819272577762604,
"step": 474
},
{
"epoch": 0.008205287660111074,
"grad_norm": 0.302734375,
"grad_norm_var": 0.0004863580067952474,
"learning_rate": 0.01,
"loss": 1.4023,
"loss/crossentropy": 2.416118621826172,
"loss/fcd": 1.171875,
"loss/logits": 0.2792641520500183,
"step": 475
},
{
"epoch": 0.008222561949921834,
"grad_norm": 0.3203125,
"grad_norm_var": 0.0004933675130208334,
"learning_rate": 0.01,
"loss": 1.3668,
"loss/crossentropy": 2.4251519441604614,
"loss/fcd": 1.12890625,
"loss/logits": 0.25571418553590775,
"step": 476
},
{
"epoch": 0.008239836239732594,
"grad_norm": 0.33203125,
"grad_norm_var": 0.00052490234375,
"learning_rate": 0.01,
"loss": 1.396,
"loss/crossentropy": 2.2888123989105225,
"loss/fcd": 1.109375,
"loss/logits": 0.2410544455051422,
"step": 477
},
{
"epoch": 0.008257110529543353,
"grad_norm": 0.310546875,
"grad_norm_var": 0.000519561767578125,
"learning_rate": 0.01,
"loss": 1.3594,
"loss/crossentropy": 2.479097008705139,
"loss/fcd": 1.171875,
"loss/logits": 0.25502997636795044,
"step": 478
},
{
"epoch": 0.008274384819354115,
"grad_norm": 0.298828125,
"grad_norm_var": 0.000513140360514323,
"learning_rate": 0.01,
"loss": 1.3785,
"loss/crossentropy": 2.4117250442504883,
"loss/fcd": 1.1328125,
"loss/logits": 0.26754797995090485,
"step": 479
},
{
"epoch": 0.008291659109164875,
"grad_norm": 0.34375,
"grad_norm_var": 0.0005388895670572917,
"learning_rate": 0.01,
"loss": 1.4354,
"loss/crossentropy": 2.577602744102478,
"loss/fcd": 1.24609375,
"loss/logits": 0.2731374129652977,
"step": 480
},
{
"epoch": 0.008308933398975635,
"grad_norm": 0.283203125,
"grad_norm_var": 0.000490252176920573,
"learning_rate": 0.01,
"loss": 1.3588,
"loss/crossentropy": 2.3125388622283936,
"loss/fcd": 1.109375,
"loss/logits": 0.2633324861526489,
"step": 481
},
{
"epoch": 0.008326207688786395,
"grad_norm": 0.2890625,
"grad_norm_var": 0.0004997094472249349,
"learning_rate": 0.01,
"loss": 1.3518,
"loss/crossentropy": 2.3964109420776367,
"loss/fcd": 1.09375,
"loss/logits": 0.24801631271839142,
"step": 482
},
{
"epoch": 0.008343481978597155,
"grad_norm": 0.28515625,
"grad_norm_var": 0.0005289077758789063,
"learning_rate": 0.01,
"loss": 1.3619,
"loss/crossentropy": 2.5348154306411743,
"loss/fcd": 1.1328125,
"loss/logits": 0.273783415555954,
"step": 483
},
{
"epoch": 0.008360756268407914,
"grad_norm": 0.3046875,
"grad_norm_var": 0.0005053043365478516,
"learning_rate": 0.01,
"loss": 1.3716,
"loss/crossentropy": 2.525968909263611,
"loss/fcd": 1.11328125,
"loss/logits": 0.25891977548599243,
"step": 484
},
{
"epoch": 0.008378030558218676,
"grad_norm": 0.357421875,
"grad_norm_var": 0.000632333755493164,
"learning_rate": 0.01,
"loss": 1.4665,
"loss/crossentropy": 2.476569890975952,
"loss/fcd": 1.203125,
"loss/logits": 0.29254642128944397,
"step": 485
},
{
"epoch": 0.008395304848029436,
"grad_norm": 0.275390625,
"grad_norm_var": 0.000678110122680664,
"learning_rate": 0.01,
"loss": 1.3305,
"loss/crossentropy": 2.4879168272018433,
"loss/fcd": 1.08203125,
"loss/logits": 0.22623379528522491,
"step": 486
},
{
"epoch": 0.008412579137840196,
"grad_norm": 0.32421875,
"grad_norm_var": 0.0006357192993164063,
"learning_rate": 0.01,
"loss": 1.3945,
"loss/crossentropy": 2.4186280965805054,
"loss/fcd": 1.09375,
"loss/logits": 0.23819412291049957,
"step": 487
},
{
"epoch": 0.008429853427650956,
"grad_norm": 0.287109375,
"grad_norm_var": 0.0006739139556884765,
"learning_rate": 0.01,
"loss": 1.3611,
"loss/crossentropy": 2.2941300868988037,
"loss/fcd": 1.0625,
"loss/logits": 0.22146066278219223,
"step": 488
},
{
"epoch": 0.008447127717461715,
"grad_norm": 0.28125,
"grad_norm_var": 0.0005716323852539062,
"learning_rate": 0.01,
"loss": 1.3797,
"loss/crossentropy": 2.368129849433899,
"loss/fcd": 1.16796875,
"loss/logits": 0.2645361125469208,
"step": 489
},
{
"epoch": 0.008464402007272475,
"grad_norm": 0.296875,
"grad_norm_var": 0.0005732059478759765,
"learning_rate": 0.01,
"loss": 1.3563,
"loss/crossentropy": 2.5257701873779297,
"loss/fcd": 1.1171875,
"loss/logits": 0.2530096620321274,
"step": 490
},
{
"epoch": 0.008481676297083237,
"grad_norm": 0.263671875,
"grad_norm_var": 0.0006844679514567058,
"learning_rate": 0.01,
"loss": 1.3688,
"loss/crossentropy": 2.1511563062667847,
"loss/fcd": 1.0546875,
"loss/logits": 0.240036740899086,
"step": 491
},
{
"epoch": 0.008498950586893997,
"grad_norm": 0.298828125,
"grad_norm_var": 0.0006647109985351562,
"learning_rate": 0.01,
"loss": 1.3563,
"loss/crossentropy": 2.370754837989807,
"loss/fcd": 1.18359375,
"loss/logits": 0.2698900103569031,
"step": 492
},
{
"epoch": 0.008516224876704757,
"grad_norm": 0.302734375,
"grad_norm_var": 0.0006010532379150391,
"learning_rate": 0.01,
"loss": 1.4051,
"loss/crossentropy": 2.55213725566864,
"loss/fcd": 1.19140625,
"loss/logits": 0.26752666383981705,
"step": 493
},
{
"epoch": 0.008533499166515517,
"grad_norm": 0.314453125,
"grad_norm_var": 0.0006074110666910807,
"learning_rate": 0.01,
"loss": 1.3304,
"loss/crossentropy": 2.878965377807617,
"loss/fcd": 1.140625,
"loss/logits": 0.248264878988266,
"step": 494
},
{
"epoch": 0.008550773456326276,
"grad_norm": 0.3046875,
"grad_norm_var": 0.0006083170572916667,
"learning_rate": 0.01,
"loss": 1.3554,
"loss/crossentropy": 2.389639139175415,
"loss/fcd": 1.08203125,
"loss/logits": 0.2504645884037018,
"step": 495
},
{
"epoch": 0.008568047746137036,
"grad_norm": 0.26953125,
"grad_norm_var": 0.0005273818969726562,
"learning_rate": 0.01,
"loss": 1.353,
"loss/crossentropy": 2.261403799057007,
"loss/fcd": 1.03125,
"loss/logits": 0.22545771300792694,
"step": 496
},
{
"epoch": 0.008585322035947798,
"grad_norm": 0.328125,
"grad_norm_var": 0.0005760033925374349,
"learning_rate": 0.01,
"loss": 1.4314,
"loss/crossentropy": 2.755717635154724,
"loss/fcd": 1.18359375,
"loss/logits": 0.28124481439590454,
"step": 497
},
{
"epoch": 0.008602596325758558,
"grad_norm": 0.306640625,
"grad_norm_var": 0.0005721410115559895,
"learning_rate": 0.01,
"loss": 1.39,
"loss/crossentropy": 2.5332454442977905,
"loss/fcd": 1.14453125,
"loss/logits": 0.2577049881219864,
"step": 498
},
{
"epoch": 0.008619870615569318,
"grad_norm": 0.2734375,
"grad_norm_var": 0.0006039937337239584,
"learning_rate": 0.01,
"loss": 1.3284,
"loss/crossentropy": 2.3752284049987793,
"loss/fcd": 1.09375,
"loss/logits": 0.24180973321199417,
"step": 499
},
{
"epoch": 0.008637144905380078,
"grad_norm": 0.291015625,
"grad_norm_var": 0.0006058851877848308,
"learning_rate": 0.01,
"loss": 1.3868,
"loss/crossentropy": 2.299641966819763,
"loss/fcd": 1.1328125,
"loss/logits": 0.2509627118706703,
"step": 500
},
{
"epoch": 0.008654419195190837,
"grad_norm": 0.294921875,
"grad_norm_var": 0.00035869280497233074,
"learning_rate": 0.01,
"loss": 1.3756,
"loss/crossentropy": 2.2871060371398926,
"loss/fcd": 1.06640625,
"loss/logits": 0.22674021124839783,
"step": 501
},
{
"epoch": 0.008671693485001597,
"grad_norm": 0.310546875,
"grad_norm_var": 0.00034610430399576825,
"learning_rate": 0.01,
"loss": 1.3644,
"loss/crossentropy": 2.2024362087249756,
"loss/fcd": 1.09375,
"loss/logits": 0.2369084656238556,
"step": 502
},
{
"epoch": 0.008688967774812357,
"grad_norm": 0.28125,
"grad_norm_var": 0.00030414263407389325,
"learning_rate": 0.01,
"loss": 1.343,
"loss/crossentropy": 2.5880898237228394,
"loss/fcd": 1.16796875,
"loss/logits": 0.25857551395893097,
"step": 503
},
{
"epoch": 0.008706242064623119,
"grad_norm": 0.306640625,
"grad_norm_var": 0.00030986467997233075,
"learning_rate": 0.01,
"loss": 1.4237,
"loss/crossentropy": 2.3485684394836426,
"loss/fcd": 1.16796875,
"loss/logits": 0.266690656542778,
"step": 504
},
{
"epoch": 0.008723516354433879,
"grad_norm": 0.353515625,
"grad_norm_var": 0.0005009969075520834,
"learning_rate": 0.01,
"loss": 1.3758,
"loss/crossentropy": 2.539777636528015,
"loss/fcd": 1.078125,
"loss/logits": 0.24045251309871674,
"step": 505
},
{
"epoch": 0.008740790644244639,
"grad_norm": 0.2734375,
"grad_norm_var": 0.0005444844563802083,
"learning_rate": 0.01,
"loss": 1.3113,
"loss/crossentropy": 2.492120862007141,
"loss/fcd": 1.1171875,
"loss/logits": 0.2610347419977188,
"step": 506
},
{
"epoch": 0.008758064934055398,
"grad_norm": 0.28125,
"grad_norm_var": 0.0004825433095296224,
"learning_rate": 0.01,
"loss": 1.4436,
"loss/crossentropy": 2.5324673652648926,
"loss/fcd": 1.33984375,
"loss/logits": 0.3312453627586365,
"step": 507
},
{
"epoch": 0.008775339223866158,
"grad_norm": 0.275390625,
"grad_norm_var": 0.0005187829335530599,
"learning_rate": 0.01,
"loss": 1.3478,
"loss/crossentropy": 2.612854242324829,
"loss/fcd": 1.1640625,
"loss/logits": 0.261405885219574,
"step": 508
},
{
"epoch": 0.008792613513676918,
"grad_norm": 0.3203125,
"grad_norm_var": 0.0005492528279622395,
"learning_rate": 0.01,
"loss": 1.3917,
"loss/crossentropy": 2.4303773641586304,
"loss/fcd": 1.11328125,
"loss/logits": 0.24008433520793915,
"step": 509
},
{
"epoch": 0.00880988780348768,
"grad_norm": 0.306640625,
"grad_norm_var": 0.0005370457967122396,
"learning_rate": 0.01,
"loss": 1.3929,
"loss/crossentropy": 2.676490068435669,
"loss/fcd": 1.23828125,
"loss/logits": 0.291456863284111,
"step": 510
},
{
"epoch": 0.00882716209329844,
"grad_norm": 0.248046875,
"grad_norm_var": 0.0006914615631103515,
"learning_rate": 0.01,
"loss": 1.3138,
"loss/crossentropy": 2.1477047204971313,
"loss/fcd": 1.09765625,
"loss/logits": 0.2524523437023163,
"step": 511
},
{
"epoch": 0.0088444363831092,
"grad_norm": 0.29296875,
"grad_norm_var": 0.0006460666656494141,
"learning_rate": 0.01,
"loss": 1.3208,
"loss/crossentropy": 2.3151156902313232,
"loss/fcd": 1.08984375,
"loss/logits": 0.2605459988117218,
"step": 512
},
{
"epoch": 0.00886171067291996,
"grad_norm": 0.296875,
"grad_norm_var": 0.0005753676096598308,
"learning_rate": 0.01,
"loss": 1.357,
"loss/crossentropy": 2.4916226863861084,
"loss/fcd": 1.1640625,
"loss/logits": 0.25671282410621643,
"step": 513
},
{
"epoch": 0.00887898496273072,
"grad_norm": 0.328125,
"grad_norm_var": 0.0006388346354166667,
"learning_rate": 0.01,
"loss": 1.4206,
"loss/crossentropy": 2.2333791255950928,
"loss/fcd": 1.1484375,
"loss/logits": 0.28083400428295135,
"step": 514
},
{
"epoch": 0.00889625925254148,
"grad_norm": 0.287109375,
"grad_norm_var": 0.0006095727284749348,
"learning_rate": 0.01,
"loss": 1.3783,
"loss/crossentropy": 2.606614589691162,
"loss/fcd": 1.08203125,
"loss/logits": 0.2666025906801224,
"step": 515
},
{
"epoch": 0.00891353354235224,
"grad_norm": 0.3125,
"grad_norm_var": 0.0006219863891601563,
"learning_rate": 0.01,
"loss": 1.3641,
"loss/crossentropy": 2.5051095485687256,
"loss/fcd": 1.2109375,
"loss/logits": 0.25101958215236664,
"step": 516
},
{
"epoch": 0.008930807832163,
"grad_norm": 0.353515625,
"grad_norm_var": 0.000811767578125,
"learning_rate": 0.01,
"loss": 1.4186,
"loss/crossentropy": 2.3850373029708862,
"loss/fcd": 1.13671875,
"loss/logits": 0.27909501641988754,
"step": 517
},
{
"epoch": 0.00894808212197376,
"grad_norm": 0.298828125,
"grad_norm_var": 0.0008066177368164062,
"learning_rate": 0.01,
"loss": 1.366,
"loss/crossentropy": 2.217817187309265,
"loss/fcd": 1.03125,
"loss/logits": 0.23760483413934708,
"step": 518
},
{
"epoch": 0.00896535641178452,
"grad_norm": 0.294921875,
"grad_norm_var": 0.0007822513580322266,
"learning_rate": 0.01,
"loss": 1.3482,
"loss/crossentropy": 2.537502408027649,
"loss/fcd": 1.15234375,
"loss/logits": 0.27564045786857605,
"step": 519
},
{
"epoch": 0.00898263070159528,
"grad_norm": 0.333984375,
"grad_norm_var": 0.0008463382720947266,
"learning_rate": 0.01,
"loss": 1.4875,
"loss/crossentropy": 2.628643035888672,
"loss/fcd": 1.31640625,
"loss/logits": 0.30241404473781586,
"step": 520
},
{
"epoch": 0.00899990499140604,
"grad_norm": 0.310546875,
"grad_norm_var": 0.0006756941477457682,
"learning_rate": 0.01,
"loss": 1.4536,
"loss/crossentropy": 2.2907025814056396,
"loss/fcd": 1.1015625,
"loss/logits": 0.2538699805736542,
"step": 521
},
{
"epoch": 0.009017179281216802,
"grad_norm": 0.28515625,
"grad_norm_var": 0.0006413618723551432,
"learning_rate": 0.01,
"loss": 1.4079,
"loss/crossentropy": 2.5753923654556274,
"loss/fcd": 1.2109375,
"loss/logits": 0.2975587248802185,
"step": 522
},
{
"epoch": 0.009034453571027562,
"grad_norm": 0.31640625,
"grad_norm_var": 0.0006230513254801433,
"learning_rate": 0.01,
"loss": 1.3724,
"loss/crossentropy": 2.327569842338562,
"loss/fcd": 1.07421875,
"loss/logits": 0.2123243287205696,
"step": 523
},
{
"epoch": 0.009051727860838321,
"grad_norm": 0.345703125,
"grad_norm_var": 0.0006653944651285808,
"learning_rate": 0.01,
"loss": 1.484,
"loss/crossentropy": 2.4529794454574585,
"loss/fcd": 1.2109375,
"loss/logits": 0.2642442062497139,
"step": 524
},
{
"epoch": 0.009069002150649081,
"grad_norm": 0.29296875,
"grad_norm_var": 0.0006680647532145182,
"learning_rate": 0.01,
"loss": 1.3426,
"loss/crossentropy": 2.405073642730713,
"loss/fcd": 1.11328125,
"loss/logits": 0.24681153148412704,
"step": 525
},
{
"epoch": 0.009086276440459841,
"grad_norm": 0.291015625,
"grad_norm_var": 0.0006830692291259766,
"learning_rate": 0.01,
"loss": 1.3501,
"loss/crossentropy": 2.5336978435516357,
"loss/fcd": 1.13671875,
"loss/logits": 0.26675350964069366,
"step": 526
},
{
"epoch": 0.009103550730270601,
"grad_norm": 0.314453125,
"grad_norm_var": 0.00044960975646972655,
"learning_rate": 0.01,
"loss": 1.4051,
"loss/crossentropy": 2.306818962097168,
"loss/fcd": 1.1328125,
"loss/logits": 0.24449439346790314,
"step": 527
},
{
"epoch": 0.009120825020081363,
"grad_norm": 0.29296875,
"grad_norm_var": 0.00044960975646972655,
"learning_rate": 0.01,
"loss": 1.3847,
"loss/crossentropy": 2.394535183906555,
"loss/fcd": 1.15625,
"loss/logits": 0.2896339148283005,
"step": 528
},
{
"epoch": 0.009138099309892123,
"grad_norm": 0.32421875,
"grad_norm_var": 0.00044960975646972655,
"learning_rate": 0.01,
"loss": 1.383,
"loss/crossentropy": 2.502661347389221,
"loss/fcd": 1.10546875,
"loss/logits": 0.2570330798625946,
"step": 529
},
{
"epoch": 0.009155373599702882,
"grad_norm": 0.283203125,
"grad_norm_var": 0.0004755655924479167,
"learning_rate": 0.01,
"loss": 1.3914,
"loss/crossentropy": 2.5401047468185425,
"loss/fcd": 1.1328125,
"loss/logits": 0.25133057683706284,
"step": 530
},
{
"epoch": 0.009172647889513642,
"grad_norm": 0.294921875,
"grad_norm_var": 0.000457000732421875,
"learning_rate": 0.01,
"loss": 1.3288,
"loss/crossentropy": 2.357369303703308,
"loss/fcd": 1.140625,
"loss/logits": 0.25731976330280304,
"step": 531
},
{
"epoch": 0.009189922179324402,
"grad_norm": 0.279296875,
"grad_norm_var": 0.0005107720692952474,
"learning_rate": 0.01,
"loss": 1.3301,
"loss/crossentropy": 2.361912250518799,
"loss/fcd": 1.03125,
"loss/logits": 0.23256495594978333,
"step": 532
},
{
"epoch": 0.009207196469135162,
"grad_norm": 0.306640625,
"grad_norm_var": 0.0003574212392171224,
"learning_rate": 0.01,
"loss": 1.4286,
"loss/crossentropy": 2.5182912349700928,
"loss/fcd": 1.1328125,
"loss/logits": 0.24184302985668182,
"step": 533
},
{
"epoch": 0.009224470758945922,
"grad_norm": 0.302734375,
"grad_norm_var": 0.00035564104715983075,
"learning_rate": 0.01,
"loss": 1.3729,
"loss/crossentropy": 2.3095160722732544,
"loss/fcd": 1.068359375,
"loss/logits": 0.22853360325098038,
"step": 534
},
{
"epoch": 0.009241745048756684,
"grad_norm": 0.3046875,
"grad_norm_var": 0.00034936269124348957,
"learning_rate": 0.01,
"loss": 1.4586,
"loss/crossentropy": 2.4540841579437256,
"loss/fcd": 1.26953125,
"loss/logits": 0.3655036687850952,
"step": 535
},
{
"epoch": 0.009259019338567443,
"grad_norm": 0.271484375,
"grad_norm_var": 0.0003513971964518229,
"learning_rate": 0.01,
"loss": 1.3534,
"loss/crossentropy": 2.350268244743347,
"loss/fcd": 1.02734375,
"loss/logits": 0.21084149181842804,
"step": 536
},
{
"epoch": 0.009276293628378203,
"grad_norm": 0.314453125,
"grad_norm_var": 0.0003573099772135417,
"learning_rate": 0.01,
"loss": 1.4337,
"loss/crossentropy": 2.1304550170898438,
"loss/fcd": 1.15234375,
"loss/logits": 0.2608063519001007,
"step": 537
},
{
"epoch": 0.009293567918188963,
"grad_norm": 0.30078125,
"grad_norm_var": 0.00033899943033854164,
"learning_rate": 0.01,
"loss": 1.3731,
"loss/crossentropy": 2.4391915798187256,
"loss/fcd": 1.109375,
"loss/logits": 0.2429627627134323,
"step": 538
},
{
"epoch": 0.009310842207999723,
"grad_norm": 0.2490234375,
"grad_norm_var": 0.0004955569903055827,
"learning_rate": 0.01,
"loss": 1.3286,
"loss/crossentropy": 2.3171310424804688,
"loss/fcd": 1.078125,
"loss/logits": 0.2482328712940216,
"step": 539
},
{
"epoch": 0.009328116497810483,
"grad_norm": 0.34375,
"grad_norm_var": 0.0004833817481994629,
"learning_rate": 0.01,
"loss": 1.5811,
"loss/crossentropy": 2.376081347465515,
"loss/fcd": 1.10546875,
"loss/logits": 0.25213149189949036,
"step": 540
},
{
"epoch": 0.009345390787621245,
"grad_norm": 0.275390625,
"grad_norm_var": 0.0005142807960510254,
"learning_rate": 0.01,
"loss": 1.3856,
"loss/crossentropy": 2.4632989168167114,
"loss/fcd": 1.078125,
"loss/logits": 0.2334313914179802,
"step": 541
},
{
"epoch": 0.009362665077432004,
"grad_norm": 0.267578125,
"grad_norm_var": 0.0005667328834533692,
"learning_rate": 0.01,
"loss": 1.2882,
"loss/crossentropy": 2.177401542663574,
"loss/fcd": 1.0390625,
"loss/logits": 0.24528680741786957,
"step": 542
},
{
"epoch": 0.009379939367242764,
"grad_norm": 0.2890625,
"grad_norm_var": 0.0005423506100972493,
"learning_rate": 0.01,
"loss": 1.4063,
"loss/crossentropy": 2.4587985277175903,
"loss/fcd": 1.22265625,
"loss/logits": 0.2990281730890274,
"step": 543
},
{
"epoch": 0.009397213657053524,
"grad_norm": 0.275390625,
"grad_norm_var": 0.0005635221799214681,
"learning_rate": 0.01,
"loss": 1.3479,
"loss/crossentropy": 2.5811359882354736,
"loss/fcd": 1.1484375,
"loss/logits": 0.2688131481409073,
"step": 544
},
{
"epoch": 0.009414487946864284,
"grad_norm": 0.294921875,
"grad_norm_var": 0.0004939039548238119,
"learning_rate": 0.01,
"loss": 1.3695,
"loss/crossentropy": 2.51469349861145,
"loss/fcd": 1.16796875,
"loss/logits": 0.26591262221336365,
"step": 545
},
{
"epoch": 0.009431762236675044,
"grad_norm": 0.462890625,
"grad_norm_var": 0.002329091231028239,
"learning_rate": 0.01,
"loss": 1.3836,
"loss/crossentropy": 2.46504545211792,
"loss/fcd": 1.1171875,
"loss/logits": 0.25862205028533936,
"step": 546
},
{
"epoch": 0.009449036526485806,
"grad_norm": 0.279296875,
"grad_norm_var": 0.0023592273394266766,
"learning_rate": 0.01,
"loss": 1.3895,
"loss/crossentropy": 2.6220297813415527,
"loss/fcd": 1.109375,
"loss/logits": 0.2548370361328125,
"step": 547
},
{
"epoch": 0.009466310816296565,
"grad_norm": 0.283203125,
"grad_norm_var": 0.0023488322893778484,
"learning_rate": 0.01,
"loss": 1.3737,
"loss/crossentropy": 2.591723322868347,
"loss/fcd": 1.13671875,
"loss/logits": 0.25868477672338486,
"step": 548
},
{
"epoch": 0.009483585106107325,
"grad_norm": 0.345703125,
"grad_norm_var": 0.0024718562761942547,
"learning_rate": 0.01,
"loss": 1.4328,
"loss/crossentropy": 2.568224310874939,
"loss/fcd": 1.1875,
"loss/logits": 0.278149738907814,
"step": 549
},
{
"epoch": 0.009500859395918085,
"grad_norm": 0.2890625,
"grad_norm_var": 0.0024854302406311034,
"learning_rate": 0.01,
"loss": 1.4231,
"loss/crossentropy": 2.5823177099227905,
"loss/fcd": 1.25,
"loss/logits": 0.2855361998081207,
"step": 550
},
{
"epoch": 0.009518133685728845,
"grad_norm": 0.279296875,
"grad_norm_var": 0.0025197307268778482,
"learning_rate": 0.01,
"loss": 1.3549,
"loss/crossentropy": 2.8035439252853394,
"loss/fcd": 1.12890625,
"loss/logits": 0.26180362701416016,
"step": 551
},
{
"epoch": 0.009535407975539605,
"grad_norm": 0.30859375,
"grad_norm_var": 0.0024581233660380046,
"learning_rate": 0.01,
"loss": 1.3912,
"loss/crossentropy": 2.695222020149231,
"loss/fcd": 1.16796875,
"loss/logits": 0.26626719534397125,
"step": 552
},
{
"epoch": 0.009552682265350367,
"grad_norm": 0.296875,
"grad_norm_var": 0.002452115217844645,
"learning_rate": 0.01,
"loss": 1.3884,
"loss/crossentropy": 2.2692904472351074,
"loss/fcd": 1.12890625,
"loss/logits": 0.26358961313962936,
"step": 553
},
{
"epoch": 0.009569956555161126,
"grad_norm": 0.484375,
"grad_norm_var": 0.004515453179677328,
"learning_rate": 0.01,
"loss": 1.4362,
"loss/crossentropy": 2.587984561920166,
"loss/fcd": 1.203125,
"loss/logits": 0.28202252089977264,
"step": 554
},
{
"epoch": 0.009587230844971886,
"grad_norm": 0.28125,
"grad_norm_var": 0.004301055272420248,
"learning_rate": 0.01,
"loss": 1.3332,
"loss/crossentropy": 2.238184094429016,
"loss/fcd": 1.125,
"loss/logits": 0.25094330310821533,
"step": 555
},
{
"epoch": 0.009604505134782646,
"grad_norm": 0.3046875,
"grad_norm_var": 0.004252099990844726,
"learning_rate": 0.01,
"loss": 1.3777,
"loss/crossentropy": 2.2282315492630005,
"loss/fcd": 1.0625,
"loss/logits": 0.2441270500421524,
"step": 556
},
{
"epoch": 0.009621779424593406,
"grad_norm": 0.2890625,
"grad_norm_var": 0.004194132486979167,
"learning_rate": 0.01,
"loss": 1.3897,
"loss/crossentropy": 2.354749321937561,
"loss/fcd": 1.171875,
"loss/logits": 0.26998236775398254,
"step": 557
},
{
"epoch": 0.009639053714404166,
"grad_norm": 0.291015625,
"grad_norm_var": 0.0040819803873697914,
"learning_rate": 0.01,
"loss": 1.4072,
"loss/crossentropy": 2.3754522800445557,
"loss/fcd": 1.125,
"loss/logits": 0.27060529589653015,
"step": 558
},
{
"epoch": 0.009656328004214927,
"grad_norm": 0.306640625,
"grad_norm_var": 0.0040383497873942055,
"learning_rate": 0.01,
"loss": 1.3777,
"loss/crossentropy": 2.3385682106018066,
"loss/fcd": 1.109375,
"loss/logits": 0.24154536426067352,
"step": 559
},
{
"epoch": 0.009673602294025687,
"grad_norm": 0.30859375,
"grad_norm_var": 0.00392297108968099,
"learning_rate": 0.01,
"loss": 1.4435,
"loss/crossentropy": 2.525418996810913,
"loss/fcd": 1.15234375,
"loss/logits": 0.2767959535121918,
"step": 560
},
{
"epoch": 0.009690876583836447,
"grad_norm": 0.3203125,
"grad_norm_var": 0.003881438573201497,
"learning_rate": 0.01,
"loss": 1.3849,
"loss/crossentropy": 2.291569232940674,
"loss/fcd": 1.12890625,
"loss/logits": 0.28253524005413055,
"step": 561
},
{
"epoch": 0.009708150873647207,
"grad_norm": 0.32421875,
"grad_norm_var": 0.0024538675944010416,
"learning_rate": 0.01,
"loss": 1.3855,
"loss/crossentropy": 1.8735097646713257,
"loss/fcd": 1.171875,
"loss/logits": 0.186705082654953,
"step": 562
},
{
"epoch": 0.009725425163457967,
"grad_norm": 0.27734375,
"grad_norm_var": 0.002462625503540039,
"learning_rate": 0.01,
"loss": 1.3507,
"loss/crossentropy": 2.2446945905685425,
"loss/fcd": 1.0859375,
"loss/logits": 0.23518769443035126,
"step": 563
},
{
"epoch": 0.009742699453268727,
"grad_norm": 0.318359375,
"grad_norm_var": 0.002405405044555664,
"learning_rate": 0.01,
"loss": 1.3812,
"loss/crossentropy": 1.9525874853134155,
"loss/fcd": 1.2421875,
"loss/logits": 0.19731061905622482,
"step": 564
},
{
"epoch": 0.009759973743079488,
"grad_norm": 0.306640625,
"grad_norm_var": 0.00233610471089681,
"learning_rate": 0.01,
"loss": 1.3995,
"loss/crossentropy": 2.53279709815979,
"loss/fcd": 1.16796875,
"loss/logits": 0.27113981544971466,
"step": 565
},
{
"epoch": 0.009777248032890248,
"grad_norm": 0.275390625,
"grad_norm_var": 0.0023889541625976562,
"learning_rate": 0.01,
"loss": 1.346,
"loss/crossentropy": 2.4163317680358887,
"loss/fcd": 1.14453125,
"loss/logits": 0.26083898544311523,
"step": 566
},
{
"epoch": 0.009794522322701008,
"grad_norm": 0.306640625,
"grad_norm_var": 0.00232086181640625,
"learning_rate": 0.01,
"loss": 1.3419,
"loss/crossentropy": 2.4386374950408936,
"loss/fcd": 1.1171875,
"loss/logits": 0.2661859691143036,
"step": 567
},
{
"epoch": 0.009811796612511768,
"grad_norm": 0.296875,
"grad_norm_var": 0.0023355484008789062,
"learning_rate": 0.01,
"loss": 1.3659,
"loss/crossentropy": 2.509569525718689,
"loss/fcd": 1.203125,
"loss/logits": 0.263532429933548,
"step": 568
},
{
"epoch": 0.009829070902322528,
"grad_norm": 0.30859375,
"grad_norm_var": 0.00232086181640625,
"learning_rate": 0.01,
"loss": 1.434,
"loss/crossentropy": 2.400490880012512,
"loss/fcd": 1.1171875,
"loss/logits": 0.24774880707263947,
"step": 569
},
{
"epoch": 0.009846345192133288,
"grad_norm": 0.322265625,
"grad_norm_var": 0.0002483208974202474,
"learning_rate": 0.01,
"loss": 1.4385,
"loss/crossentropy": 2.390196442604065,
"loss/fcd": 1.07421875,
"loss/logits": 0.2328876331448555,
"step": 570
},
{
"epoch": 0.009863619481944048,
"grad_norm": 0.2890625,
"grad_norm_var": 0.00023013750712076823,
"learning_rate": 0.01,
"loss": 1.3399,
"loss/crossentropy": 2.399609327316284,
"loss/fcd": 1.17578125,
"loss/logits": 0.2631242126226425,
"step": 571
},
{
"epoch": 0.00988089377175481,
"grad_norm": 0.388671875,
"grad_norm_var": 0.0006914774576822917,
"learning_rate": 0.01,
"loss": 1.4129,
"loss/crossentropy": 2.5639859437942505,
"loss/fcd": 1.171875,
"loss/logits": 0.2520062252879143,
"step": 572
},
{
"epoch": 0.00989816806156557,
"grad_norm": 0.34765625,
"grad_norm_var": 0.000757280985514323,
"learning_rate": 0.01,
"loss": 1.4226,
"loss/crossentropy": 2.4615684747695923,
"loss/fcd": 1.12109375,
"loss/logits": 0.2613854482769966,
"step": 573
},
{
"epoch": 0.009915442351376329,
"grad_norm": 0.296875,
"grad_norm_var": 0.0007432142893473308,
"learning_rate": 0.01,
"loss": 1.4494,
"loss/crossentropy": 2.4410594701766968,
"loss/fcd": 1.1953125,
"loss/logits": 0.3067672997713089,
"step": 574
},
{
"epoch": 0.009932716641187089,
"grad_norm": 0.3125,
"grad_norm_var": 0.0007410685221354167,
"learning_rate": 0.01,
"loss": 1.4228,
"loss/crossentropy": 2.6319605112075806,
"loss/fcd": 1.0859375,
"loss/logits": 0.24571086466312408,
"step": 575
},
{
"epoch": 0.009949990930997849,
"grad_norm": 0.28125,
"grad_norm_var": 0.0008020401000976562,
"learning_rate": 0.01,
"loss": 1.2951,
"loss/crossentropy": 2.368131637573242,
"loss/fcd": 1.03515625,
"loss/logits": 0.23180700838565826,
"step": 576
},
{
"epoch": 0.009967265220808609,
"grad_norm": 0.275390625,
"grad_norm_var": 0.0008711338043212891,
"learning_rate": 0.01,
"loss": 1.3585,
"loss/crossentropy": 2.197615623474121,
"loss/fcd": 1.08203125,
"loss/logits": 0.2364010065793991,
"step": 577
},
{
"epoch": 0.00998453951061937,
"grad_norm": 0.2890625,
"grad_norm_var": 0.0008722782135009765,
"learning_rate": 0.01,
"loss": 1.3929,
"loss/crossentropy": 2.5560864210128784,
"loss/fcd": 1.11328125,
"loss/logits": 0.25519636273384094,
"step": 578
},
{
"epoch": 0.01000181380043013,
"grad_norm": 0.32421875,
"grad_norm_var": 0.0008318424224853516,
"learning_rate": 0.01,
"loss": 1.3903,
"loss/crossentropy": 2.290327787399292,
"loss/fcd": 1.08203125,
"loss/logits": 0.242530919611454,
"step": 579
},
{
"epoch": 0.01001908809024089,
"grad_norm": 0.279296875,
"grad_norm_var": 0.0008769830067952474,
"learning_rate": 0.01,
"loss": 1.3204,
"loss/crossentropy": 2.558402419090271,
"loss/fcd": 1.0703125,
"loss/logits": 0.24013052880764008,
"step": 580
},
{
"epoch": 0.01003636238005165,
"grad_norm": 0.314453125,
"grad_norm_var": 0.0008811791737874349,
"learning_rate": 0.01,
"loss": 1.3934,
"loss/crossentropy": 2.3049778938293457,
"loss/fcd": 1.140625,
"loss/logits": 0.24487978965044022,
"step": 581
},
{
"epoch": 0.01005363666986241,
"grad_norm": 0.275390625,
"grad_norm_var": 0.0008811791737874349,
"learning_rate": 0.01,
"loss": 1.3844,
"loss/crossentropy": 2.5796691179275513,
"loss/fcd": 1.10546875,
"loss/logits": 0.2458028495311737,
"step": 582
},
{
"epoch": 0.01007091095967317,
"grad_norm": 0.330078125,
"grad_norm_var": 0.0009151299794514974,
"learning_rate": 0.01,
"loss": 1.4305,
"loss/crossentropy": 2.3386783599853516,
"loss/fcd": 1.14453125,
"loss/logits": 0.24171485751867294,
"step": 583
},
{
"epoch": 0.010088185249483931,
"grad_norm": 0.298828125,
"grad_norm_var": 0.0009124120076497396,
"learning_rate": 0.01,
"loss": 1.386,
"loss/crossentropy": 2.3040322065353394,
"loss/fcd": 1.1171875,
"loss/logits": 0.25387245416641235,
"step": 584
},
{
"epoch": 0.010105459539294691,
"grad_norm": 0.306640625,
"grad_norm_var": 0.0009125868479410807,
"learning_rate": 0.01,
"loss": 1.3622,
"loss/crossentropy": 3.012826681137085,
"loss/fcd": 1.21484375,
"loss/logits": 0.255868136882782,
"step": 585
},
{
"epoch": 0.010122733829105451,
"grad_norm": 0.29296875,
"grad_norm_var": 0.0009113947550455729,
"learning_rate": 0.01,
"loss": 1.4032,
"loss/crossentropy": 2.7537986040115356,
"loss/fcd": 1.16015625,
"loss/logits": 0.25436051189899445,
"step": 586
},
{
"epoch": 0.010140008118916211,
"grad_norm": 0.251953125,
"grad_norm_var": 0.001083230972290039,
"learning_rate": 0.01,
"loss": 1.3117,
"loss/crossentropy": 2.14433491230011,
"loss/fcd": 1.10546875,
"loss/logits": 0.24180641025304794,
"step": 587
},
{
"epoch": 0.01015728240872697,
"grad_norm": 0.296875,
"grad_norm_var": 0.000574493408203125,
"learning_rate": 0.01,
"loss": 1.3691,
"loss/crossentropy": 2.101401686668396,
"loss/fcd": 1.1640625,
"loss/logits": 0.19958080351352692,
"step": 588
},
{
"epoch": 0.01017455669853773,
"grad_norm": 0.2890625,
"grad_norm_var": 0.00040378570556640623,
"learning_rate": 0.01,
"loss": 1.3827,
"loss/crossentropy": 2.436479330062866,
"loss/fcd": 1.09765625,
"loss/logits": 0.23494569957256317,
"step": 589
},
{
"epoch": 0.010191830988348492,
"grad_norm": 0.318359375,
"grad_norm_var": 0.00043892860412597656,
"learning_rate": 0.01,
"loss": 1.4279,
"loss/crossentropy": 2.6805481910705566,
"loss/fcd": 1.12890625,
"loss/logits": 0.2272372618317604,
"step": 590
},
{
"epoch": 0.010209105278159252,
"grad_norm": 0.263671875,
"grad_norm_var": 0.00048065185546875,
"learning_rate": 0.01,
"loss": 1.322,
"loss/crossentropy": 2.7796462774276733,
"loss/fcd": 1.1796875,
"loss/logits": 0.26299113035202026,
"step": 591
},
{
"epoch": 0.010226379567970012,
"grad_norm": 0.3046875,
"grad_norm_var": 0.000478363037109375,
"learning_rate": 0.01,
"loss": 1.3396,
"loss/crossentropy": 2.4198944568634033,
"loss/fcd": 1.1171875,
"loss/logits": 0.2373996302485466,
"step": 592
},
{
"epoch": 0.010243653857780772,
"grad_norm": 0.310546875,
"grad_norm_var": 0.00046634674072265625,
"learning_rate": 0.01,
"loss": 1.3618,
"loss/crossentropy": 2.5916903018951416,
"loss/fcd": 1.10546875,
"loss/logits": 0.23520419746637344,
"step": 593
},
{
"epoch": 0.010260928147591532,
"grad_norm": 0.291015625,
"grad_norm_var": 0.0004646142323811849,
"learning_rate": 0.01,
"loss": 1.4032,
"loss/crossentropy": 2.2067846059799194,
"loss/fcd": 1.05078125,
"loss/logits": 0.2392275035381317,
"step": 594
},
{
"epoch": 0.010278202437402292,
"grad_norm": 0.29296875,
"grad_norm_var": 0.0004112084706624349,
"learning_rate": 0.01,
"loss": 1.3551,
"loss/crossentropy": 2.5146957635879517,
"loss/fcd": 1.1484375,
"loss/logits": 0.2572908252477646,
"step": 595
},
{
"epoch": 0.010295476727213053,
"grad_norm": 0.314453125,
"grad_norm_var": 0.0004157861073811849,
"learning_rate": 0.01,
"loss": 1.4091,
"loss/crossentropy": 2.7353230714797974,
"loss/fcd": 1.1953125,
"loss/logits": 0.2845850735902786,
"step": 596
},
{
"epoch": 0.010312751017023813,
"grad_norm": 0.3125,
"grad_norm_var": 0.00041147867838541664,
"learning_rate": 0.01,
"loss": 1.4371,
"loss/crossentropy": 2.290863871574402,
"loss/fcd": 1.1171875,
"loss/logits": 0.25596096366643906,
"step": 597
},
{
"epoch": 0.010330025306834573,
"grad_norm": 0.3046875,
"grad_norm_var": 0.0003811995188395182,
"learning_rate": 0.01,
"loss": 1.3736,
"loss/crossentropy": 2.4351121187210083,
"loss/fcd": 1.15625,
"loss/logits": 0.2633766904473305,
"step": 598
},
{
"epoch": 0.010347299596645333,
"grad_norm": 0.29296875,
"grad_norm_var": 0.000312042236328125,
"learning_rate": 0.01,
"loss": 1.3671,
"loss/crossentropy": 2.3196725845336914,
"loss/fcd": 1.0859375,
"loss/logits": 0.23156649619340897,
"step": 599
},
{
"epoch": 0.010364573886456093,
"grad_norm": 0.267578125,
"grad_norm_var": 0.00036290486653645836,
"learning_rate": 0.01,
"loss": 1.3352,
"loss/crossentropy": 2.0654172897338867,
"loss/fcd": 1.0390625,
"loss/logits": 0.23978617042303085,
"step": 600
},
{
"epoch": 0.010381848176266853,
"grad_norm": 0.29296875,
"grad_norm_var": 0.00035233497619628905,
"learning_rate": 0.01,
"loss": 1.3716,
"loss/crossentropy": 2.0811039805412292,
"loss/fcd": 1.12109375,
"loss/logits": 0.2653958946466446,
"step": 601
},
{
"epoch": 0.010399122466077614,
"grad_norm": 0.2890625,
"grad_norm_var": 0.0003536065419514974,
"learning_rate": 0.01,
"loss": 1.3644,
"loss/crossentropy": 2.7797833681106567,
"loss/fcd": 1.2421875,
"loss/logits": 0.26879242062568665,
"step": 602
},
{
"epoch": 0.010416396755888374,
"grad_norm": 0.263671875,
"grad_norm_var": 0.0002975304921468099,
"learning_rate": 0.01,
"loss": 1.3323,
"loss/crossentropy": 2.2734681367874146,
"loss/fcd": 1.0625,
"loss/logits": 0.21455278247594833,
"step": 603
},
{
"epoch": 0.010433671045699134,
"grad_norm": 0.296875,
"grad_norm_var": 0.0002975304921468099,
"learning_rate": 0.01,
"loss": 1.3207,
"loss/crossentropy": 1.978046715259552,
"loss/fcd": 1.03515625,
"loss/logits": 0.23233170062303543,
"step": 604
},
{
"epoch": 0.010450945335509894,
"grad_norm": 0.306640625,
"grad_norm_var": 0.0003051122029622396,
"learning_rate": 0.01,
"loss": 1.4169,
"loss/crossentropy": 2.5054962635040283,
"loss/fcd": 1.2265625,
"loss/logits": 0.2957670986652374,
"step": 605
},
{
"epoch": 0.010468219625320654,
"grad_norm": 0.359375,
"grad_norm_var": 0.0005370934804280598,
"learning_rate": 0.01,
"loss": 1.4294,
"loss/crossentropy": 2.5767931938171387,
"loss/fcd": 1.18359375,
"loss/logits": 0.2684077024459839,
"step": 606
},
{
"epoch": 0.010485493915131414,
"grad_norm": 0.3203125,
"grad_norm_var": 0.00048039754231770835,
"learning_rate": 0.01,
"loss": 1.37,
"loss/crossentropy": 2.3274868726730347,
"loss/fcd": 1.0546875,
"loss/logits": 0.23180848360061646,
"step": 607
},
{
"epoch": 0.010502768204942173,
"grad_norm": 0.318359375,
"grad_norm_var": 0.0004983107248942057,
"learning_rate": 0.01,
"loss": 1.416,
"loss/crossentropy": 2.5422879457473755,
"loss/fcd": 1.0390625,
"loss/logits": 0.223361574113369,
"step": 608
},
{
"epoch": 0.010520042494752935,
"grad_norm": 0.328125,
"grad_norm_var": 0.0005373636881510417,
"learning_rate": 0.01,
"loss": 1.3627,
"loss/crossentropy": 2.570125699043274,
"loss/fcd": 1.125,
"loss/logits": 0.25247204303741455,
"step": 609
},
{
"epoch": 0.010537316784563695,
"grad_norm": 0.265625,
"grad_norm_var": 0.0006189823150634765,
"learning_rate": 0.01,
"loss": 1.316,
"loss/crossentropy": 2.2968589067459106,
"loss/fcd": 1.015625,
"loss/logits": 0.1994389146566391,
"step": 610
},
{
"epoch": 0.010554591074374455,
"grad_norm": 0.3046875,
"grad_norm_var": 0.0006140232086181641,
"learning_rate": 0.01,
"loss": 1.4265,
"loss/crossentropy": 2.493618369102478,
"loss/fcd": 1.15625,
"loss/logits": 0.2581065893173218,
"step": 611
},
{
"epoch": 0.010571865364185215,
"grad_norm": 0.3046875,
"grad_norm_var": 0.000604248046875,
"learning_rate": 0.01,
"loss": 1.4014,
"loss/crossentropy": 2.4227527379989624,
"loss/fcd": 1.19140625,
"loss/logits": 0.25313572585582733,
"step": 612
},
{
"epoch": 0.010589139653995975,
"grad_norm": 0.296875,
"grad_norm_var": 0.0005971272786458333,
"learning_rate": 0.01,
"loss": 1.3718,
"loss/crossentropy": 2.3819390535354614,
"loss/fcd": 1.06640625,
"loss/logits": 0.22010967135429382,
"step": 613
},
{
"epoch": 0.010606413943806734,
"grad_norm": 0.294921875,
"grad_norm_var": 0.000598001480102539,
"learning_rate": 0.01,
"loss": 1.5079,
"loss/crossentropy": 2.190422534942627,
"loss/fcd": 1.08984375,
"loss/logits": 0.24383512139320374,
"step": 614
},
{
"epoch": 0.010623688233617496,
"grad_norm": 0.29296875,
"grad_norm_var": 0.000598001480102539,
"learning_rate": 0.01,
"loss": 1.3733,
"loss/crossentropy": 2.5865895748138428,
"loss/fcd": 1.10546875,
"loss/logits": 0.24275009334087372,
"step": 615
},
{
"epoch": 0.010640962523428256,
"grad_norm": 0.2890625,
"grad_norm_var": 0.0005334854125976562,
"learning_rate": 0.01,
"loss": 1.3404,
"loss/crossentropy": 2.1975014209747314,
"loss/fcd": 1.046875,
"loss/logits": 0.2261335551738739,
"step": 616
},
{
"epoch": 0.010658236813239016,
"grad_norm": 0.33203125,
"grad_norm_var": 0.0005843480428059896,
"learning_rate": 0.01,
"loss": 1.4037,
"loss/crossentropy": 2.7723870277404785,
"loss/fcd": 1.234375,
"loss/logits": 0.2835993468761444,
"step": 617
},
{
"epoch": 0.010675511103049776,
"grad_norm": 0.287109375,
"grad_norm_var": 0.0005884647369384765,
"learning_rate": 0.01,
"loss": 1.3625,
"loss/crossentropy": 2.599759817123413,
"loss/fcd": 1.1953125,
"loss/logits": 0.285232275724411,
"step": 618
},
{
"epoch": 0.010692785392860536,
"grad_norm": 0.294921875,
"grad_norm_var": 0.0004821618398030599,
"learning_rate": 0.01,
"loss": 1.3733,
"loss/crossentropy": 2.4128291606903076,
"loss/fcd": 1.1796875,
"loss/logits": 0.26694832742214203,
"step": 619
},
{
"epoch": 0.010710059682671295,
"grad_norm": 0.306640625,
"grad_norm_var": 0.00047651926676432293,
"learning_rate": 0.01,
"loss": 1.3343,
"loss/crossentropy": 2.5237722396850586,
"loss/fcd": 1.1796875,
"loss/logits": 0.26433800160884857,
"step": 620
},
{
"epoch": 0.010727333972482057,
"grad_norm": 0.337890625,
"grad_norm_var": 0.0005385716756184896,
"learning_rate": 0.01,
"loss": 1.4112,
"loss/crossentropy": 2.317731261253357,
"loss/fcd": 1.2265625,
"loss/logits": 0.28476743400096893,
"step": 621
},
{
"epoch": 0.010744608262292817,
"grad_norm": 0.271484375,
"grad_norm_var": 0.0004234155019124349,
"learning_rate": 0.01,
"loss": 1.3603,
"loss/crossentropy": 2.3109618425369263,
"loss/fcd": 1.041015625,
"loss/logits": 0.2279675453901291,
"step": 622
},
{
"epoch": 0.010761882552103577,
"grad_norm": 0.275390625,
"grad_norm_var": 0.00044498443603515627,
"learning_rate": 0.01,
"loss": 1.3089,
"loss/crossentropy": 2.3984739780426025,
"loss/fcd": 1.09765625,
"loss/logits": 0.25493185222148895,
"step": 623
},
{
"epoch": 0.010779156841914337,
"grad_norm": 0.3046875,
"grad_norm_var": 0.0004232883453369141,
"learning_rate": 0.01,
"loss": 1.4079,
"loss/crossentropy": 2.1802881956100464,
"loss/fcd": 1.0703125,
"loss/logits": 0.23454807698726654,
"step": 624
},
{
"epoch": 0.010796431131725097,
"grad_norm": 0.326171875,
"grad_norm_var": 0.00041599273681640624,
"learning_rate": 0.01,
"loss": 1.3629,
"loss/crossentropy": 2.6050442457199097,
"loss/fcd": 1.0703125,
"loss/logits": 0.2245146408677101,
"step": 625
},
{
"epoch": 0.010813705421535856,
"grad_norm": 0.279296875,
"grad_norm_var": 0.0003667036692301432,
"learning_rate": 0.01,
"loss": 1.3622,
"loss/crossentropy": 2.4274967908859253,
"loss/fcd": 1.140625,
"loss/logits": 0.2685912102460861,
"step": 626
},
{
"epoch": 0.010830979711346618,
"grad_norm": 0.3125,
"grad_norm_var": 0.0003754774729410807,
"learning_rate": 0.01,
"loss": 1.4161,
"loss/crossentropy": 2.556549072265625,
"loss/fcd": 1.109375,
"loss/logits": 0.2520214840769768,
"step": 627
},
{
"epoch": 0.010848254001157378,
"grad_norm": 0.318359375,
"grad_norm_var": 0.0003949483235677083,
"learning_rate": 0.01,
"loss": 1.3802,
"loss/crossentropy": 2.2824164628982544,
"loss/fcd": 1.046875,
"loss/logits": 0.22343048453330994,
"step": 628
},
{
"epoch": 0.010865528290968138,
"grad_norm": 0.283203125,
"grad_norm_var": 0.0004146416982014974,
"learning_rate": 0.01,
"loss": 1.3555,
"loss/crossentropy": 2.500080108642578,
"loss/fcd": 1.0703125,
"loss/logits": 0.24835523962974548,
"step": 629
},
{
"epoch": 0.010882802580778898,
"grad_norm": 0.30859375,
"grad_norm_var": 0.00041631062825520836,
"learning_rate": 0.01,
"loss": 1.4,
"loss/crossentropy": 2.4014720916748047,
"loss/fcd": 1.140625,
"loss/logits": 0.236750990152359,
"step": 630
},
{
"epoch": 0.010900076870589658,
"grad_norm": 0.306640625,
"grad_norm_var": 0.00041286150614420575,
"learning_rate": 0.01,
"loss": 1.3707,
"loss/crossentropy": 2.3228918313980103,
"loss/fcd": 1.078125,
"loss/logits": 0.23406407982110977,
"step": 631
},
{
"epoch": 0.010917351160400417,
"grad_norm": 0.298828125,
"grad_norm_var": 0.0004018147786458333,
"learning_rate": 0.01,
"loss": 1.3885,
"loss/crossentropy": 2.50198233127594,
"loss/fcd": 1.1875,
"loss/logits": 0.258284330368042,
"step": 632
},
{
"epoch": 0.010934625450211179,
"grad_norm": 0.314453125,
"grad_norm_var": 0.0003524621327718099,
"learning_rate": 0.01,
"loss": 1.3978,
"loss/crossentropy": 2.637346863746643,
"loss/fcd": 1.15234375,
"loss/logits": 0.28542736172676086,
"step": 633
},
{
"epoch": 0.010951899740021939,
"grad_norm": 0.30859375,
"grad_norm_var": 0.00033969879150390624,
"learning_rate": 0.01,
"loss": 1.4403,
"loss/crossentropy": 2.4110260009765625,
"loss/fcd": 1.171875,
"loss/logits": 0.2651347145438194,
"step": 634
},
{
"epoch": 0.010969174029832699,
"grad_norm": 0.283203125,
"grad_norm_var": 0.000360870361328125,
"learning_rate": 0.01,
"loss": 1.4184,
"loss/crossentropy": 2.7041887044906616,
"loss/fcd": 1.171875,
"loss/logits": 0.2508121207356453,
"step": 635
},
{
"epoch": 0.010986448319643459,
"grad_norm": 0.28515625,
"grad_norm_var": 0.00037713050842285155,
"learning_rate": 0.01,
"loss": 1.401,
"loss/crossentropy": 2.4663859605789185,
"loss/fcd": 1.14453125,
"loss/logits": 0.2824552655220032,
"step": 636
},
{
"epoch": 0.011003722609454218,
"grad_norm": 0.326171875,
"grad_norm_var": 0.0003279209136962891,
"learning_rate": 0.01,
"loss": 1.3728,
"loss/crossentropy": 2.5915483236312866,
"loss/fcd": 1.11328125,
"loss/logits": 0.24787750095129013,
"step": 637
},
{
"epoch": 0.011020996899264978,
"grad_norm": 0.28125,
"grad_norm_var": 0.00029652913411458334,
"learning_rate": 0.01,
"loss": 1.3354,
"loss/crossentropy": 2.5775671005249023,
"loss/fcd": 1.11328125,
"loss/logits": 0.26823610067367554,
"step": 638
},
{
"epoch": 0.01103827118907574,
"grad_norm": 0.298828125,
"grad_norm_var": 0.00025151570638020835,
"learning_rate": 0.01,
"loss": 1.3522,
"loss/crossentropy": 2.3462886810302734,
"loss/fcd": 1.06640625,
"loss/logits": 0.23827192932367325,
"step": 639
},
{
"epoch": 0.0110555454788865,
"grad_norm": 0.3046875,
"grad_norm_var": 0.00025151570638020835,
"learning_rate": 0.01,
"loss": 1.4206,
"loss/crossentropy": 2.2796329855918884,
"loss/fcd": 1.09765625,
"loss/logits": 0.24281439930200577,
"step": 640
},
{
"epoch": 0.01107281976869726,
"grad_norm": 0.302734375,
"grad_norm_var": 0.00021107991536458334,
"learning_rate": 0.01,
"loss": 1.4093,
"loss/crossentropy": 2.2618002891540527,
"loss/fcd": 1.10546875,
"loss/logits": 0.24219272285699844,
"step": 641
},
{
"epoch": 0.01109009405850802,
"grad_norm": 0.515625,
"grad_norm_var": 0.0030247847239176433,
"learning_rate": 0.01,
"loss": 1.5002,
"loss/crossentropy": 2.628837466239929,
"loss/fcd": 1.1796875,
"loss/logits": 0.27036982774734497,
"step": 642
},
{
"epoch": 0.01110736834831878,
"grad_norm": 0.298828125,
"grad_norm_var": 0.0030420303344726564,
"learning_rate": 0.01,
"loss": 1.3437,
"loss/crossentropy": 2.377197504043579,
"loss/fcd": 1.078125,
"loss/logits": 0.2347392812371254,
"step": 643
},
{
"epoch": 0.01112464263812954,
"grad_norm": 0.296875,
"grad_norm_var": 0.0030603885650634767,
"learning_rate": 0.01,
"loss": 1.3465,
"loss/crossentropy": 2.241411805152893,
"loss/fcd": 1.04296875,
"loss/logits": 0.22135238349437714,
"step": 644
},
{
"epoch": 0.0111419169279403,
"grad_norm": 0.287109375,
"grad_norm_var": 0.00304563840230306,
"learning_rate": 0.01,
"loss": 1.3781,
"loss/crossentropy": 2.132224917411804,
"loss/fcd": 1.06640625,
"loss/logits": 0.24958615005016327,
"step": 645
},
{
"epoch": 0.01115919121775106,
"grad_norm": 0.294921875,
"grad_norm_var": 0.0030664443969726563,
"learning_rate": 0.01,
"loss": 1.3319,
"loss/crossentropy": 2.379546046257019,
"loss/fcd": 1.0703125,
"loss/logits": 0.23225411772727966,
"step": 646
},
{
"epoch": 0.01117646550756182,
"grad_norm": 0.283203125,
"grad_norm_var": 0.0031198501586914063,
"learning_rate": 0.01,
"loss": 1.3696,
"loss/crossentropy": 2.4151222705841064,
"loss/fcd": 1.08984375,
"loss/logits": 0.25382500886917114,
"step": 647
},
{
"epoch": 0.01119373979737258,
"grad_norm": 0.298828125,
"grad_norm_var": 0.0031198501586914063,
"learning_rate": 0.01,
"loss": 1.3725,
"loss/crossentropy": 2.4386223554611206,
"loss/fcd": 1.11328125,
"loss/logits": 0.24172081053256989,
"step": 648
},
{
"epoch": 0.01121101408718334,
"grad_norm": 0.294921875,
"grad_norm_var": 0.003135426839192708,
"learning_rate": 0.01,
"loss": 1.4136,
"loss/crossentropy": 2.4053245782852173,
"loss/fcd": 1.10546875,
"loss/logits": 0.2587142735719681,
"step": 649
},
{
"epoch": 0.0112282883769941,
"grad_norm": 0.310546875,
"grad_norm_var": 0.0031352837880452475,
"learning_rate": 0.01,
"loss": 1.3908,
"loss/crossentropy": 2.8473496437072754,
"loss/fcd": 1.203125,
"loss/logits": 0.24620139598846436,
"step": 650
},
{
"epoch": 0.01124556266680486,
"grad_norm": 0.28515625,
"grad_norm_var": 0.003128496805826823,
"learning_rate": 0.01,
"loss": 1.346,
"loss/crossentropy": 2.4264625310897827,
"loss/fcd": 1.04296875,
"loss/logits": 0.22718993574380875,
"step": 651
},
{
"epoch": 0.011262836956615622,
"grad_norm": 0.302734375,
"grad_norm_var": 0.003088871637980143,
"learning_rate": 0.01,
"loss": 1.3722,
"loss/crossentropy": 2.393033504486084,
"loss/fcd": 1.09375,
"loss/logits": 0.2361084669828415,
"step": 652
},
{
"epoch": 0.011280111246426382,
"grad_norm": 0.291015625,
"grad_norm_var": 0.0030968825022379557,
"learning_rate": 0.01,
"loss": 1.3962,
"loss/crossentropy": 2.5740654468536377,
"loss/fcd": 1.1328125,
"loss/logits": 0.27814269065856934,
"step": 653
},
{
"epoch": 0.011297385536237142,
"grad_norm": 0.291015625,
"grad_norm_var": 0.0030664443969726563,
"learning_rate": 0.01,
"loss": 1.3502,
"loss/crossentropy": 2.572822332382202,
"loss/fcd": 1.109375,
"loss/logits": 0.25307735800743103,
"step": 654
},
{
"epoch": 0.011314659826047901,
"grad_norm": 0.302734375,
"grad_norm_var": 0.003061676025390625,
"learning_rate": 0.01,
"loss": 1.3652,
"loss/crossentropy": 2.36893892288208,
"loss/fcd": 1.12109375,
"loss/logits": 0.24310748279094696,
"step": 655
},
{
"epoch": 0.011331934115858661,
"grad_norm": 0.28515625,
"grad_norm_var": 0.003099505106608073,
"learning_rate": 0.01,
"loss": 1.382,
"loss/crossentropy": 2.453968048095703,
"loss/fcd": 1.12109375,
"loss/logits": 0.2507154792547226,
"step": 656
},
{
"epoch": 0.011349208405669421,
"grad_norm": 0.30859375,
"grad_norm_var": 0.0030968825022379557,
"learning_rate": 0.01,
"loss": 1.4208,
"loss/crossentropy": 2.3706772327423096,
"loss/fcd": 1.203125,
"loss/logits": 0.2801935374736786,
"step": 657
},
{
"epoch": 0.011366482695480183,
"grad_norm": 0.28515625,
"grad_norm_var": 7.348060607910156e-05,
"learning_rate": 0.01,
"loss": 1.3745,
"loss/crossentropy": 2.3052316308021545,
"loss/fcd": 1.125,
"loss/logits": 0.24023611843585968,
"step": 658
},
{
"epoch": 0.011383756985290943,
"grad_norm": 0.263671875,
"grad_norm_var": 0.00013184547424316406,
"learning_rate": 0.01,
"loss": 1.3589,
"loss/crossentropy": 2.3989150524139404,
"loss/fcd": 1.08984375,
"loss/logits": 0.23345524072647095,
"step": 659
},
{
"epoch": 0.011401031275101703,
"grad_norm": 0.326171875,
"grad_norm_var": 0.000202178955078125,
"learning_rate": 0.01,
"loss": 1.4671,
"loss/crossentropy": 2.4908188581466675,
"loss/fcd": 1.08203125,
"loss/logits": 0.22981490939855576,
"step": 660
},
{
"epoch": 0.011418305564912462,
"grad_norm": 0.318359375,
"grad_norm_var": 0.000232696533203125,
"learning_rate": 0.01,
"loss": 1.3845,
"loss/crossentropy": 2.182092070579529,
"loss/fcd": 1.0390625,
"loss/logits": 0.22433090209960938,
"step": 661
},
{
"epoch": 0.011435579854723222,
"grad_norm": 0.29296875,
"grad_norm_var": 0.00023331642150878907,
"learning_rate": 0.01,
"loss": 1.3832,
"loss/crossentropy": 2.557218909263611,
"loss/fcd": 1.15234375,
"loss/logits": 0.26849667727947235,
"step": 662
},
{
"epoch": 0.011452854144533982,
"grad_norm": 0.26953125,
"grad_norm_var": 0.0002688090006510417,
"learning_rate": 0.01,
"loss": 1.3516,
"loss/crossentropy": 2.4368367195129395,
"loss/fcd": 1.09765625,
"loss/logits": 0.2600485235452652,
"step": 663
},
{
"epoch": 0.011470128434344744,
"grad_norm": 0.2734375,
"grad_norm_var": 0.0002975304921468099,
"learning_rate": 0.01,
"loss": 1.3436,
"loss/crossentropy": 2.283419609069824,
"loss/fcd": 1.10546875,
"loss/logits": 0.2451685667037964,
"step": 664
},
{
"epoch": 0.011487402724155504,
"grad_norm": 0.31640625,
"grad_norm_var": 0.0003295262654622396,
"learning_rate": 0.01,
"loss": 1.3917,
"loss/crossentropy": 2.2501282691955566,
"loss/fcd": 1.10546875,
"loss/logits": 0.23817522078752518,
"step": 665
},
{
"epoch": 0.011504677013966264,
"grad_norm": 0.32421875,
"grad_norm_var": 0.0003692468007405599,
"learning_rate": 0.01,
"loss": 1.3747,
"loss/crossentropy": 2.595417618751526,
"loss/fcd": 1.109375,
"loss/logits": 0.272259384393692,
"step": 666
},
{
"epoch": 0.011521951303777023,
"grad_norm": 0.263671875,
"grad_norm_var": 0.0004292170206705729,
"learning_rate": 0.01,
"loss": 1.3477,
"loss/crossentropy": 2.3635072708129883,
"loss/fcd": 1.09765625,
"loss/logits": 0.24695640057325363,
"step": 667
},
{
"epoch": 0.011539225593587783,
"grad_norm": 0.29296875,
"grad_norm_var": 0.0004246870676676432,
"learning_rate": 0.01,
"loss": 1.3744,
"loss/crossentropy": 2.310747981071472,
"loss/fcd": 1.08984375,
"loss/logits": 0.2579839900135994,
"step": 668
},
{
"epoch": 0.011556499883398543,
"grad_norm": 0.27734375,
"grad_norm_var": 0.00044193267822265623,
"learning_rate": 0.01,
"loss": 1.349,
"loss/crossentropy": 2.497538447380066,
"loss/fcd": 1.1328125,
"loss/logits": 0.26720890402793884,
"step": 669
},
{
"epoch": 0.011573774173209305,
"grad_norm": 0.3046875,
"grad_norm_var": 0.00044960975646972655,
"learning_rate": 0.01,
"loss": 1.3475,
"loss/crossentropy": 2.5883569717407227,
"loss/fcd": 1.2109375,
"loss/logits": 0.29579465091228485,
"step": 670
},
{
"epoch": 0.011591048463020065,
"grad_norm": 0.263671875,
"grad_norm_var": 0.0004998366038004557,
"learning_rate": 0.01,
"loss": 1.3349,
"loss/crossentropy": 2.2982797622680664,
"loss/fcd": 1.1328125,
"loss/logits": 0.22655323147773743,
"step": 671
},
{
"epoch": 0.011608322752830825,
"grad_norm": 0.359375,
"grad_norm_var": 0.0007800896962483724,
"learning_rate": 0.01,
"loss": 1.3753,
"loss/crossentropy": 2.4650286436080933,
"loss/fcd": 1.09765625,
"loss/logits": 0.24685797840356827,
"step": 672
},
{
"epoch": 0.011625597042641584,
"grad_norm": 0.27734375,
"grad_norm_var": 0.0007897535959879557,
"learning_rate": 0.01,
"loss": 1.3429,
"loss/crossentropy": 2.5849392414093018,
"loss/fcd": 1.140625,
"loss/logits": 0.2600446939468384,
"step": 673
},
{
"epoch": 0.011642871332452344,
"grad_norm": 0.28125,
"grad_norm_var": 0.0007954756418863932,
"learning_rate": 0.01,
"loss": 1.3721,
"loss/crossentropy": 2.4149436950683594,
"loss/fcd": 1.08984375,
"loss/logits": 0.24952851235866547,
"step": 674
},
{
"epoch": 0.011660145622263104,
"grad_norm": 0.34765625,
"grad_norm_var": 0.0008959452311197917,
"learning_rate": 0.01,
"loss": 1.4752,
"loss/crossentropy": 2.582419753074646,
"loss/fcd": 1.2578125,
"loss/logits": 0.2812621593475342,
"step": 675
},
{
"epoch": 0.011677419912073864,
"grad_norm": 0.26171875,
"grad_norm_var": 0.0009247938791910808,
"learning_rate": 0.01,
"loss": 1.3666,
"loss/crossentropy": 2.3817840814590454,
"loss/fcd": 1.078125,
"loss/logits": 0.24483423680067062,
"step": 676
},
{
"epoch": 0.011694694201884626,
"grad_norm": 0.283203125,
"grad_norm_var": 0.0008938948313395183,
"learning_rate": 0.01,
"loss": 1.3642,
"loss/crossentropy": 2.4791339635849,
"loss/fcd": 1.078125,
"loss/logits": 0.25220367312431335,
"step": 677
},
{
"epoch": 0.011711968491695385,
"grad_norm": 0.271484375,
"grad_norm_var": 0.0009230931599934895,
"learning_rate": 0.01,
"loss": 1.3435,
"loss/crossentropy": 2.3865939378738403,
"loss/fcd": 1.09375,
"loss/logits": 0.24416129291057587,
"step": 678
},
{
"epoch": 0.011729242781506145,
"grad_norm": 0.31640625,
"grad_norm_var": 0.0009215672810872396,
"learning_rate": 0.01,
"loss": 1.4158,
"loss/crossentropy": 2.514981508255005,
"loss/fcd": 1.15625,
"loss/logits": 0.27227045595645905,
"step": 679
},
{
"epoch": 0.011746517071316905,
"grad_norm": 0.30859375,
"grad_norm_var": 0.0008992513020833333,
"learning_rate": 0.01,
"loss": 1.399,
"loss/crossentropy": 2.660152792930603,
"loss/fcd": 1.11328125,
"loss/logits": 0.2607909142971039,
"step": 680
},
{
"epoch": 0.011763791361127665,
"grad_norm": 0.296875,
"grad_norm_var": 0.0008722305297851563,
"learning_rate": 0.01,
"loss": 1.4528,
"loss/crossentropy": 2.165284812450409,
"loss/fcd": 1.07421875,
"loss/logits": 0.2606969401240349,
"step": 681
},
{
"epoch": 0.011781065650938425,
"grad_norm": 0.330078125,
"grad_norm_var": 0.0008966922760009766,
"learning_rate": 0.01,
"loss": 1.4402,
"loss/crossentropy": 2.719216465950012,
"loss/fcd": 1.21875,
"loss/logits": 0.274374857544899,
"step": 682
},
{
"epoch": 0.011798339940749187,
"grad_norm": 0.357421875,
"grad_norm_var": 0.0010416507720947266,
"learning_rate": 0.01,
"loss": 1.4226,
"loss/crossentropy": 2.405388355255127,
"loss/fcd": 1.1640625,
"loss/logits": 0.2703537493944168,
"step": 683
},
{
"epoch": 0.011815614230559946,
"grad_norm": 0.310546875,
"grad_norm_var": 0.0010400772094726562,
"learning_rate": 0.01,
"loss": 1.4291,
"loss/crossentropy": 2.7011595964431763,
"loss/fcd": 1.1875,
"loss/logits": 0.25208880007267,
"step": 684
},
{
"epoch": 0.011832888520370706,
"grad_norm": 0.37109375,
"grad_norm_var": 0.0012689590454101562,
"learning_rate": 0.01,
"loss": 1.3541,
"loss/crossentropy": 2.5975828170776367,
"loss/fcd": 1.1015625,
"loss/logits": 0.23054596036672592,
"step": 685
},
{
"epoch": 0.011850162810181466,
"grad_norm": 0.302734375,
"grad_norm_var": 0.001270278294881185,
"learning_rate": 0.01,
"loss": 1.3724,
"loss/crossentropy": 2.202287197113037,
"loss/fcd": 1.0625,
"loss/logits": 0.24445781856775284,
"step": 686
},
{
"epoch": 0.011867437099992226,
"grad_norm": 0.314453125,
"grad_norm_var": 0.0011264642079671225,
"learning_rate": 0.01,
"loss": 1.3371,
"loss/crossentropy": 2.309388518333435,
"loss/fcd": 1.11328125,
"loss/logits": 0.2442098781466484,
"step": 687
},
{
"epoch": 0.011884711389802986,
"grad_norm": 0.294921875,
"grad_norm_var": 0.0009780248006184896,
"learning_rate": 0.01,
"loss": 1.3841,
"loss/crossentropy": 2.499300003051758,
"loss/fcd": 1.11328125,
"loss/logits": 0.26171083748340607,
"step": 688
},
{
"epoch": 0.011901985679613748,
"grad_norm": 0.333984375,
"grad_norm_var": 0.0009480635325113932,
"learning_rate": 0.01,
"loss": 1.4675,
"loss/crossentropy": 2.35269558429718,
"loss/fcd": 1.06640625,
"loss/logits": 0.2726883888244629,
"step": 689
},
{
"epoch": 0.011919259969424507,
"grad_norm": 0.333984375,
"grad_norm_var": 0.000909868876139323,
"learning_rate": 0.01,
"loss": 1.403,
"loss/crossentropy": 2.78786039352417,
"loss/fcd": 1.25390625,
"loss/logits": 0.3147393763065338,
"step": 690
},
{
"epoch": 0.011936534259235267,
"grad_norm": 0.275390625,
"grad_norm_var": 0.0009186903635660808,
"learning_rate": 0.01,
"loss": 1.3294,
"loss/crossentropy": 2.0689194798469543,
"loss/fcd": 1.03515625,
"loss/logits": 0.234086312353611,
"step": 691
},
{
"epoch": 0.011953808549046027,
"grad_norm": 0.29296875,
"grad_norm_var": 0.0007778008778889974,
"learning_rate": 0.01,
"loss": 1.3331,
"loss/crossentropy": 2.290665626525879,
"loss/fcd": 1.046875,
"loss/logits": 0.23476862162351608,
"step": 692
},
{
"epoch": 0.011971082838856787,
"grad_norm": 0.26953125,
"grad_norm_var": 0.0008422215779622396,
"learning_rate": 0.01,
"loss": 1.3703,
"loss/crossentropy": 2.4959352016448975,
"loss/fcd": 1.16015625,
"loss/logits": 0.2350049912929535,
"step": 693
},
{
"epoch": 0.011988357128667547,
"grad_norm": 0.291015625,
"grad_norm_var": 0.0007624308268229167,
"learning_rate": 0.01,
"loss": 1.3499,
"loss/crossentropy": 2.3858295679092407,
"loss/fcd": 1.0546875,
"loss/logits": 0.2346876710653305,
"step": 694
},
{
"epoch": 0.012005631418478309,
"grad_norm": 0.32421875,
"grad_norm_var": 0.0007703145345052083,
"learning_rate": 0.01,
"loss": 1.4174,
"loss/crossentropy": 2.5176814794540405,
"loss/fcd": 1.13671875,
"loss/logits": 0.2492477372288704,
"step": 695
},
{
"epoch": 0.012022905708289068,
"grad_norm": 0.365234375,
"grad_norm_var": 0.0009376366933186848,
"learning_rate": 0.01,
"loss": 1.4472,
"loss/crossentropy": 2.553426146507263,
"loss/fcd": 1.13671875,
"loss/logits": 0.25825950503349304,
"step": 696
},
{
"epoch": 0.012040179998099828,
"grad_norm": 0.3125,
"grad_norm_var": 0.0009119510650634766,
"learning_rate": 0.01,
"loss": 1.3902,
"loss/crossentropy": 2.524499535560608,
"loss/fcd": 1.17578125,
"loss/logits": 0.2615286335349083,
"step": 697
},
{
"epoch": 0.012057454287910588,
"grad_norm": 0.271484375,
"grad_norm_var": 0.001028299331665039,
"learning_rate": 0.01,
"loss": 1.3468,
"loss/crossentropy": 2.234209656715393,
"loss/fcd": 1.109375,
"loss/logits": 0.2630993127822876,
"step": 698
},
{
"epoch": 0.012074728577721348,
"grad_norm": 0.275390625,
"grad_norm_var": 0.0009722232818603516,
"learning_rate": 0.01,
"loss": 1.3397,
"loss/crossentropy": 2.595862627029419,
"loss/fcd": 1.09375,
"loss/logits": 0.25720856338739395,
"step": 699
},
{
"epoch": 0.012092002867532108,
"grad_norm": 0.306640625,
"grad_norm_var": 0.0009722232818603516,
"learning_rate": 0.01,
"loss": 1.3472,
"loss/crossentropy": 2.3556742668151855,
"loss/fcd": 1.0859375,
"loss/logits": 0.23623445630073547,
"step": 700
},
{
"epoch": 0.01210927715734287,
"grad_norm": 0.29296875,
"grad_norm_var": 0.0007013797760009766,
"learning_rate": 0.01,
"loss": 1.3728,
"loss/crossentropy": 2.286816358566284,
"loss/fcd": 1.04296875,
"loss/logits": 0.24584627896547318,
"step": 701
},
{
"epoch": 0.01212655144715363,
"grad_norm": 0.283203125,
"grad_norm_var": 0.0007274468739827474,
"learning_rate": 0.01,
"loss": 1.3878,
"loss/crossentropy": 2.2807174921035767,
"loss/fcd": 1.109375,
"loss/logits": 0.25587528198957443,
"step": 702
},
{
"epoch": 0.01214382573696439,
"grad_norm": 0.291015625,
"grad_norm_var": 0.0007240136464436848,
"learning_rate": 0.01,
"loss": 1.3971,
"loss/crossentropy": 2.5250132083892822,
"loss/fcd": 1.12109375,
"loss/logits": 0.2833500802516937,
"step": 703
},
{
"epoch": 0.01216110002677515,
"grad_norm": 0.314453125,
"grad_norm_var": 0.0007322788238525391,
"learning_rate": 0.01,
"loss": 1.3972,
"loss/crossentropy": 2.5938040018081665,
"loss/fcd": 1.1484375,
"loss/logits": 0.2679053843021393,
"step": 704
},
{
"epoch": 0.012178374316585909,
"grad_norm": 0.3046875,
"grad_norm_var": 0.000661468505859375,
"learning_rate": 0.01,
"loss": 1.3572,
"loss/crossentropy": 2.3809746503829956,
"loss/fcd": 1.1171875,
"loss/logits": 0.2514628916978836,
"step": 705
},
{
"epoch": 0.012195648606396669,
"grad_norm": 0.30078125,
"grad_norm_var": 0.0005812168121337891,
"learning_rate": 0.01,
"loss": 1.3698,
"loss/crossentropy": 2.3113526105880737,
"loss/fcd": 1.0703125,
"loss/logits": 0.24198968708515167,
"step": 706
},
{
"epoch": 0.01221292289620743,
"grad_norm": 0.287109375,
"grad_norm_var": 0.0005541324615478515,
"learning_rate": 0.01,
"loss": 1.3485,
"loss/crossentropy": 2.465987205505371,
"loss/fcd": 1.13671875,
"loss/logits": 0.2991575300693512,
"step": 707
},
{
"epoch": 0.01223019718601819,
"grad_norm": 0.3125,
"grad_norm_var": 0.0005623976389567058,
"learning_rate": 0.01,
"loss": 1.3754,
"loss/crossentropy": 2.4940463304519653,
"loss/fcd": 1.1640625,
"loss/logits": 0.2627300024032593,
"step": 708
},
{
"epoch": 0.01224747147582895,
"grad_norm": 0.29296875,
"grad_norm_var": 0.0005009810129801433,
"learning_rate": 0.01,
"loss": 1.378,
"loss/crossentropy": 2.6033318042755127,
"loss/fcd": 1.12109375,
"loss/logits": 0.2630281075835228,
"step": 709
},
{
"epoch": 0.01226474576563971,
"grad_norm": 0.322265625,
"grad_norm_var": 0.0005177656809488932,
"learning_rate": 0.01,
"loss": 1.3969,
"loss/crossentropy": 2.218273878097534,
"loss/fcd": 1.08984375,
"loss/logits": 0.23244468122720718,
"step": 710
},
{
"epoch": 0.01228202005545047,
"grad_norm": 0.3359375,
"grad_norm_var": 0.0005585829416910808,
"learning_rate": 0.01,
"loss": 1.4508,
"loss/crossentropy": 2.329068422317505,
"loss/fcd": 1.2109375,
"loss/logits": 0.24251049757003784,
"step": 711
},
{
"epoch": 0.01229929434526123,
"grad_norm": 0.302734375,
"grad_norm_var": 0.00029511451721191405,
"learning_rate": 0.01,
"loss": 1.4497,
"loss/crossentropy": 2.4693063497543335,
"loss/fcd": 1.125,
"loss/logits": 0.2587638199329376,
"step": 712
},
{
"epoch": 0.01231656863507199,
"grad_norm": 0.3125,
"grad_norm_var": 0.00029511451721191405,
"learning_rate": 0.01,
"loss": 1.371,
"loss/crossentropy": 2.4224281311035156,
"loss/fcd": 1.13671875,
"loss/logits": 0.27352161705493927,
"step": 713
},
{
"epoch": 0.012333842924882751,
"grad_norm": 0.3203125,
"grad_norm_var": 0.00025577545166015624,
"learning_rate": 0.01,
"loss": 1.3508,
"loss/crossentropy": 2.5101382732391357,
"loss/fcd": 1.1171875,
"loss/logits": 0.25151751190423965,
"step": 714
},
{
"epoch": 0.012351117214693511,
"grad_norm": 0.30078125,
"grad_norm_var": 0.0002010186513264974,
"learning_rate": 0.01,
"loss": 1.3949,
"loss/crossentropy": 2.765409469604492,
"loss/fcd": 1.10546875,
"loss/logits": 0.23425965011119843,
"step": 715
},
{
"epoch": 0.012368391504504271,
"grad_norm": 0.259765625,
"grad_norm_var": 0.0003284295399983724,
"learning_rate": 0.01,
"loss": 1.3346,
"loss/crossentropy": 2.446286678314209,
"loss/fcd": 1.0625,
"loss/logits": 0.23563802242279053,
"step": 716
},
{
"epoch": 0.012385665794315031,
"grad_norm": 0.365234375,
"grad_norm_var": 0.0005666097005208333,
"learning_rate": 0.01,
"loss": 1.485,
"loss/crossentropy": 2.3494917154312134,
"loss/fcd": 1.47265625,
"loss/logits": 0.2857535183429718,
"step": 717
},
{
"epoch": 0.01240294008412579,
"grad_norm": 0.28125,
"grad_norm_var": 0.0005729516347249349,
"learning_rate": 0.01,
"loss": 1.3814,
"loss/crossentropy": 2.3558719158172607,
"loss/fcd": 1.12109375,
"loss/logits": 0.24474655091762543,
"step": 718
},
{
"epoch": 0.01242021437393655,
"grad_norm": 0.26953125,
"grad_norm_var": 0.000646209716796875,
"learning_rate": 0.01,
"loss": 1.3125,
"loss/crossentropy": 2.364332675933838,
"loss/fcd": 1.1015625,
"loss/logits": 0.24612490087747574,
"step": 719
},
{
"epoch": 0.012437488663747312,
"grad_norm": 0.29296875,
"grad_norm_var": 0.0006484826405843099,
"learning_rate": 0.01,
"loss": 1.3629,
"loss/crossentropy": 2.218404769897461,
"loss/fcd": 1.15625,
"loss/logits": 0.2676163464784622,
"step": 720
},
{
"epoch": 0.012454762953558072,
"grad_norm": 0.2734375,
"grad_norm_var": 0.0007059574127197266,
"learning_rate": 0.01,
"loss": 1.3642,
"loss/crossentropy": 2.4319703578948975,
"loss/fcd": 1.12109375,
"loss/logits": 0.25568731129169464,
"step": 721
},
{
"epoch": 0.012472037243368832,
"grad_norm": 0.26953125,
"grad_norm_var": 0.0007715702056884765,
"learning_rate": 0.01,
"loss": 1.3565,
"loss/crossentropy": 2.603386163711548,
"loss/fcd": 1.09765625,
"loss/logits": 0.23648831248283386,
"step": 722
},
{
"epoch": 0.012489311533179592,
"grad_norm": 0.294921875,
"grad_norm_var": 0.000762033462524414,
"learning_rate": 0.01,
"loss": 1.4305,
"loss/crossentropy": 2.3345898389816284,
"loss/fcd": 1.05859375,
"loss/logits": 0.2294597253203392,
"step": 723
},
{
"epoch": 0.012506585822990352,
"grad_norm": 0.31640625,
"grad_norm_var": 0.0007692813873291015,
"learning_rate": 0.01,
"loss": 1.3885,
"loss/crossentropy": 2.315110445022583,
"loss/fcd": 1.15234375,
"loss/logits": 0.262426495552063,
"step": 724
},
{
"epoch": 0.012523860112801112,
"grad_norm": 0.28125,
"grad_norm_var": 0.0007898807525634766,
"learning_rate": 0.01,
"loss": 1.2937,
"loss/crossentropy": 2.2987769842147827,
"loss/fcd": 1.0,
"loss/logits": 0.21975189447402954,
"step": 725
},
{
"epoch": 0.012541134402611873,
"grad_norm": 0.29296875,
"grad_norm_var": 0.0007562637329101562,
"learning_rate": 0.01,
"loss": 1.3775,
"loss/crossentropy": 2.5773731470108032,
"loss/fcd": 1.16015625,
"loss/logits": 0.29223839938640594,
"step": 726
},
{
"epoch": 0.012558408692422633,
"grad_norm": 0.30859375,
"grad_norm_var": 0.0006650288899739584,
"learning_rate": 0.01,
"loss": 1.4041,
"loss/crossentropy": 2.138230562210083,
"loss/fcd": 1.0625,
"loss/logits": 0.24283046275377274,
"step": 727
},
{
"epoch": 0.012575682982233393,
"grad_norm": 0.333984375,
"grad_norm_var": 0.0007525126139322917,
"learning_rate": 0.01,
"loss": 1.4611,
"loss/crossentropy": 2.521793842315674,
"loss/fcd": 1.2265625,
"loss/logits": 0.2588220089673996,
"step": 728
},
{
"epoch": 0.012592957272044153,
"grad_norm": 0.3203125,
"grad_norm_var": 0.0007710774739583333,
"learning_rate": 0.01,
"loss": 1.3833,
"loss/crossentropy": 2.5079206228256226,
"loss/fcd": 1.13671875,
"loss/logits": 0.24896685779094696,
"step": 729
},
{
"epoch": 0.012610231561854913,
"grad_norm": 0.30859375,
"grad_norm_var": 0.0007460912068684896,
"learning_rate": 0.01,
"loss": 1.398,
"loss/crossentropy": 2.4435055255889893,
"loss/fcd": 1.1875,
"loss/logits": 0.2766249179840088,
"step": 730
},
{
"epoch": 0.012627505851665673,
"grad_norm": 0.353515625,
"grad_norm_var": 0.0009387811024983724,
"learning_rate": 0.01,
"loss": 1.482,
"loss/crossentropy": 2.480614185333252,
"loss/fcd": 1.12109375,
"loss/logits": 0.24479512870311737,
"step": 731
},
{
"epoch": 0.012644780141476434,
"grad_norm": 0.279296875,
"grad_norm_var": 0.0008542219797770183,
"learning_rate": 0.01,
"loss": 1.3214,
"loss/crossentropy": 2.556125283241272,
"loss/fcd": 1.0546875,
"loss/logits": 0.25190603733062744,
"step": 732
},
{
"epoch": 0.012662054431287194,
"grad_norm": 0.318359375,
"grad_norm_var": 0.0006001631418863933,
"learning_rate": 0.01,
"loss": 1.3992,
"loss/crossentropy": 2.2440203428268433,
"loss/fcd": 1.046875,
"loss/logits": 0.23071999847888947,
"step": 733
},
{
"epoch": 0.012679328721097954,
"grad_norm": 0.2890625,
"grad_norm_var": 0.0005847771962483723,
"learning_rate": 0.01,
"loss": 1.3884,
"loss/crossentropy": 2.366842269897461,
"loss/fcd": 1.13671875,
"loss/logits": 0.2621122822165489,
"step": 734
},
{
"epoch": 0.012696603010908714,
"grad_norm": 0.2890625,
"grad_norm_var": 0.0005288283030192057,
"learning_rate": 0.01,
"loss": 1.373,
"loss/crossentropy": 2.528809905052185,
"loss/fcd": 1.140625,
"loss/logits": 0.2601289302110672,
"step": 735
},
{
"epoch": 0.012713877300719474,
"grad_norm": 0.333984375,
"grad_norm_var": 0.0005879084269205729,
"learning_rate": 0.01,
"loss": 1.3657,
"loss/crossentropy": 2.1993446350097656,
"loss/fcd": 1.05859375,
"loss/logits": 0.2357948124408722,
"step": 736
},
{
"epoch": 0.012731151590530234,
"grad_norm": 0.2890625,
"grad_norm_var": 0.0005395889282226562,
"learning_rate": 0.01,
"loss": 1.3611,
"loss/crossentropy": 2.5157347917556763,
"loss/fcd": 1.109375,
"loss/logits": 0.2621786296367645,
"step": 737
},
{
"epoch": 0.012748425880340995,
"grad_norm": 0.31640625,
"grad_norm_var": 0.00045566558837890626,
"learning_rate": 0.01,
"loss": 1.3787,
"loss/crossentropy": 2.463285803794861,
"loss/fcd": 1.1328125,
"loss/logits": 0.2661950886249542,
"step": 738
},
{
"epoch": 0.012765700170151755,
"grad_norm": 0.314453125,
"grad_norm_var": 0.00044581095377604164,
"learning_rate": 0.01,
"loss": 1.3789,
"loss/crossentropy": 2.7613465785980225,
"loss/fcd": 1.09375,
"loss/logits": 0.24063792079687119,
"step": 739
},
{
"epoch": 0.012782974459962515,
"grad_norm": 0.337890625,
"grad_norm_var": 0.0004956404368082683,
"learning_rate": 0.01,
"loss": 1.3809,
"loss/crossentropy": 2.3430649042129517,
"loss/fcd": 1.05859375,
"loss/logits": 0.23180129379034042,
"step": 740
},
{
"epoch": 0.012800248749773275,
"grad_norm": 0.30078125,
"grad_norm_var": 0.0004435062408447266,
"learning_rate": 0.01,
"loss": 1.3546,
"loss/crossentropy": 2.347190737724304,
"loss/fcd": 1.1015625,
"loss/logits": 0.23613610118627548,
"step": 741
},
{
"epoch": 0.012817523039584035,
"grad_norm": 0.328125,
"grad_norm_var": 0.00043320655822753906,
"learning_rate": 0.01,
"loss": 1.414,
"loss/crossentropy": 2.3196645975112915,
"loss/fcd": 1.12890625,
"loss/logits": 0.27611708641052246,
"step": 742
},
{
"epoch": 0.012834797329394795,
"grad_norm": 0.28125,
"grad_norm_var": 0.0004990736643473308,
"learning_rate": 0.01,
"loss": 1.3861,
"loss/crossentropy": 2.4212803840637207,
"loss/fcd": 1.109375,
"loss/logits": 0.2471313625574112,
"step": 743
},
{
"epoch": 0.012852071619205556,
"grad_norm": 0.294921875,
"grad_norm_var": 0.0004806359608968099,
"learning_rate": 0.01,
"loss": 1.3723,
"loss/crossentropy": 2.527360200881958,
"loss/fcd": 1.109375,
"loss/logits": 0.24950604140758514,
"step": 744
},
{
"epoch": 0.012869345909016316,
"grad_norm": 0.302734375,
"grad_norm_var": 0.0004750569661458333,
"learning_rate": 0.01,
"loss": 1.3461,
"loss/crossentropy": 2.2922967672348022,
"loss/fcd": 1.07421875,
"loss/logits": 0.23927000910043716,
"step": 745
},
{
"epoch": 0.012886620198827076,
"grad_norm": 0.291015625,
"grad_norm_var": 0.0004943688710530599,
"learning_rate": 0.01,
"loss": 1.3785,
"loss/crossentropy": 2.133127212524414,
"loss/fcd": 1.078125,
"loss/logits": 0.23443202674388885,
"step": 746
},
{
"epoch": 0.012903894488637836,
"grad_norm": 0.28515625,
"grad_norm_var": 0.000366973876953125,
"learning_rate": 0.01,
"loss": 1.387,
"loss/crossentropy": 2.569379210472107,
"loss/fcd": 1.12109375,
"loss/logits": 0.26725105941295624,
"step": 747
},
{
"epoch": 0.012921168778448596,
"grad_norm": 0.294921875,
"grad_norm_var": 0.00033238728841145836,
"learning_rate": 0.01,
"loss": 1.4185,
"loss/crossentropy": 2.6103577613830566,
"loss/fcd": 1.13671875,
"loss/logits": 0.27920565009117126,
"step": 748
},
{
"epoch": 0.012938443068259356,
"grad_norm": 0.310546875,
"grad_norm_var": 0.00032145182291666666,
"learning_rate": 0.01,
"loss": 1.4161,
"loss/crossentropy": 2.3525288105010986,
"loss/fcd": 1.09375,
"loss/logits": 0.21820923686027527,
"step": 749
},
{
"epoch": 0.012955717358070115,
"grad_norm": 0.298828125,
"grad_norm_var": 0.0003083388010660807,
"learning_rate": 0.01,
"loss": 1.3429,
"loss/crossentropy": 2.563652276992798,
"loss/fcd": 1.1171875,
"loss/logits": 0.25008824467658997,
"step": 750
},
{
"epoch": 0.012972991647880877,
"grad_norm": 0.279296875,
"grad_norm_var": 0.00033416748046875,
"learning_rate": 0.01,
"loss": 1.3992,
"loss/crossentropy": 2.4368664026260376,
"loss/fcd": 1.08984375,
"loss/logits": 0.2636963874101639,
"step": 751
},
{
"epoch": 0.012990265937691637,
"grad_norm": 0.267578125,
"grad_norm_var": 0.00034173329671223957,
"learning_rate": 0.01,
"loss": 1.3548,
"loss/crossentropy": 2.47409451007843,
"loss/fcd": 1.1640625,
"loss/logits": 0.26615823060274124,
"step": 752
},
{
"epoch": 0.013007540227502397,
"grad_norm": 0.390625,
"grad_norm_var": 0.0008442560831705729,
"learning_rate": 0.01,
"loss": 1.4382,
"loss/crossentropy": 2.667958378791809,
"loss/fcd": 1.22265625,
"loss/logits": 0.29826460778713226,
"step": 753
},
{
"epoch": 0.013024814517313157,
"grad_norm": 0.28125,
"grad_norm_var": 0.0008722941080729166,
"learning_rate": 0.01,
"loss": 1.3857,
"loss/crossentropy": 2.2399171590805054,
"loss/fcd": 1.30078125,
"loss/logits": 0.3064821809530258,
"step": 754
},
{
"epoch": 0.013042088807123917,
"grad_norm": 0.30078125,
"grad_norm_var": 0.0008643945058186849,
"learning_rate": 0.01,
"loss": 1.4191,
"loss/crossentropy": 2.4244364500045776,
"loss/fcd": 1.1796875,
"loss/logits": 0.2772462069988251,
"step": 755
},
{
"epoch": 0.013059363096934676,
"grad_norm": 0.28125,
"grad_norm_var": 0.000800323486328125,
"learning_rate": 0.01,
"loss": 1.3482,
"loss/crossentropy": 2.6471344232559204,
"loss/fcd": 1.1484375,
"loss/logits": 0.2606939375400543,
"step": 756
},
{
"epoch": 0.013076637386745438,
"grad_norm": 0.345703125,
"grad_norm_var": 0.000935220718383789,
"learning_rate": 0.01,
"loss": 1.4094,
"loss/crossentropy": 2.4318645000457764,
"loss/fcd": 1.125,
"loss/logits": 0.2657194063067436,
"step": 757
},
{
"epoch": 0.013093911676556198,
"grad_norm": 0.3203125,
"grad_norm_var": 0.0009119510650634766,
"learning_rate": 0.01,
"loss": 1.4882,
"loss/crossentropy": 2.6587414741516113,
"loss/fcd": 1.1171875,
"loss/logits": 0.25396668910980225,
"step": 758
},
{
"epoch": 0.013111185966366958,
"grad_norm": 0.310546875,
"grad_norm_var": 0.0008859634399414062,
"learning_rate": 0.01,
"loss": 1.3734,
"loss/crossentropy": 2.320420742034912,
"loss/fcd": 1.0625,
"loss/logits": 0.22045490145683289,
"step": 759
},
{
"epoch": 0.013128460256177718,
"grad_norm": 0.318359375,
"grad_norm_var": 0.0008935928344726562,
"learning_rate": 0.01,
"loss": 1.4048,
"loss/crossentropy": 2.43363881111145,
"loss/fcd": 1.1171875,
"loss/logits": 0.2532464414834976,
"step": 760
},
{
"epoch": 0.013145734545988478,
"grad_norm": 0.28125,
"grad_norm_var": 0.0009287357330322266,
"learning_rate": 0.01,
"loss": 1.3617,
"loss/crossentropy": 2.5222312211990356,
"loss/fcd": 1.15234375,
"loss/logits": 0.29095427691936493,
"step": 761
},
{
"epoch": 0.013163008835799237,
"grad_norm": 0.310546875,
"grad_norm_var": 0.0009198347727457682,
"learning_rate": 0.01,
"loss": 1.3893,
"loss/crossentropy": 2.265801191329956,
"loss/fcd": 1.02734375,
"loss/logits": 0.23643554002046585,
"step": 762
},
{
"epoch": 0.013180283125609999,
"grad_norm": 0.390625,
"grad_norm_var": 0.0013386885325113933,
"learning_rate": 0.01,
"loss": 1.4154,
"loss/crossentropy": 2.1754260063171387,
"loss/fcd": 1.1640625,
"loss/logits": 0.244869664311409,
"step": 763
},
{
"epoch": 0.013197557415420759,
"grad_norm": 0.30859375,
"grad_norm_var": 0.001320330301920573,
"learning_rate": 0.01,
"loss": 1.3947,
"loss/crossentropy": 2.3228635787963867,
"loss/fcd": 1.03515625,
"loss/logits": 0.22359148412942886,
"step": 764
},
{
"epoch": 0.013214831705231519,
"grad_norm": 0.314453125,
"grad_norm_var": 0.0013203938802083333,
"learning_rate": 0.01,
"loss": 1.4051,
"loss/crossentropy": 2.5446053743362427,
"loss/fcd": 1.140625,
"loss/logits": 0.24661505222320557,
"step": 765
},
{
"epoch": 0.013232105995042279,
"grad_norm": 0.30859375,
"grad_norm_var": 0.0013085524241129556,
"learning_rate": 0.01,
"loss": 1.4116,
"loss/crossentropy": 2.4046772718429565,
"loss/fcd": 1.16015625,
"loss/logits": 0.26322653889656067,
"step": 766
},
{
"epoch": 0.013249380284853039,
"grad_norm": 0.287109375,
"grad_norm_var": 0.00127714474995931,
"learning_rate": 0.01,
"loss": 1.3094,
"loss/crossentropy": 2.397523880004883,
"loss/fcd": 1.08203125,
"loss/logits": 0.2391202375292778,
"step": 767
},
{
"epoch": 0.013266654574663798,
"grad_norm": 0.28125,
"grad_norm_var": 0.0012049357096354167,
"learning_rate": 0.01,
"loss": 1.3474,
"loss/crossentropy": 2.599183440208435,
"loss/fcd": 1.1640625,
"loss/logits": 0.2888915240764618,
"step": 768
},
{
"epoch": 0.01328392886447456,
"grad_norm": 0.353515625,
"grad_norm_var": 0.0009141127268473307,
"learning_rate": 0.01,
"loss": 1.4331,
"loss/crossentropy": 2.1059322357177734,
"loss/fcd": 1.0546875,
"loss/logits": 0.23741237819194794,
"step": 769
},
{
"epoch": 0.01330120315428532,
"grad_norm": 0.310546875,
"grad_norm_var": 0.0008471171061197917,
"learning_rate": 0.01,
"loss": 1.3643,
"loss/crossentropy": 2.697718620300293,
"loss/fcd": 1.1796875,
"loss/logits": 0.26706932485103607,
"step": 770
},
{
"epoch": 0.01331847744409608,
"grad_norm": 0.306640625,
"grad_norm_var": 0.0008389631907145183,
"learning_rate": 0.01,
"loss": 1.45,
"loss/crossentropy": 2.4075610637664795,
"loss/fcd": 1.1015625,
"loss/logits": 0.25129370391368866,
"step": 771
},
{
"epoch": 0.01333575173390684,
"grad_norm": 0.275390625,
"grad_norm_var": 0.0008669535319010417,
"learning_rate": 0.01,
"loss": 1.3877,
"loss/crossentropy": 2.7534801959991455,
"loss/fcd": 1.23828125,
"loss/logits": 0.30193065106868744,
"step": 772
},
{
"epoch": 0.0133530260237176,
"grad_norm": 0.29296875,
"grad_norm_var": 0.0008176008860270183,
"learning_rate": 0.01,
"loss": 1.3939,
"loss/crossentropy": 2.182551383972168,
"loss/fcd": 1.12890625,
"loss/logits": 0.28344330191612244,
"step": 773
},
{
"epoch": 0.01337030031352836,
"grad_norm": 0.28125,
"grad_norm_var": 0.000862741470336914,
"learning_rate": 0.01,
"loss": 1.412,
"loss/crossentropy": 2.510794520378113,
"loss/fcd": 1.16015625,
"loss/logits": 0.25014493614435196,
"step": 774
},
{
"epoch": 0.013387574603339121,
"grad_norm": 0.33984375,
"grad_norm_var": 0.000925445556640625,
"learning_rate": 0.01,
"loss": 1.4564,
"loss/crossentropy": 2.479841709136963,
"loss/fcd": 1.08203125,
"loss/logits": 0.25105684995651245,
"step": 775
},
{
"epoch": 0.013404848893149881,
"grad_norm": 0.3125,
"grad_norm_var": 0.0009211063385009766,
"learning_rate": 0.01,
"loss": 1.3963,
"loss/crossentropy": 2.639458417892456,
"loss/fcd": 1.10546875,
"loss/logits": 0.2490757405757904,
"step": 776
},
{
"epoch": 0.01342212318296064,
"grad_norm": 0.3203125,
"grad_norm_var": 0.0008683363596598307,
"learning_rate": 0.01,
"loss": 1.4124,
"loss/crossentropy": 2.6080870628356934,
"loss/fcd": 1.13671875,
"loss/logits": 0.24069885909557343,
"step": 777
},
{
"epoch": 0.0134393974727714,
"grad_norm": 0.3125,
"grad_norm_var": 0.0008681615193684896,
"learning_rate": 0.01,
"loss": 1.3733,
"loss/crossentropy": 2.3055442571640015,
"loss/fcd": 1.08203125,
"loss/logits": 0.2517802268266678,
"step": 778
},
{
"epoch": 0.01345667176258216,
"grad_norm": 0.3125,
"grad_norm_var": 0.00043328603108723957,
"learning_rate": 0.01,
"loss": 1.4781,
"loss/crossentropy": 2.5537742376327515,
"loss/fcd": 1.26953125,
"loss/logits": 0.30602647364139557,
"step": 779
},
{
"epoch": 0.01347394605239292,
"grad_norm": 0.328125,
"grad_norm_var": 0.0004603068033854167,
"learning_rate": 0.01,
"loss": 1.3961,
"loss/crossentropy": 2.371378183364868,
"loss/fcd": 1.12890625,
"loss/logits": 0.24987629055976868,
"step": 780
},
{
"epoch": 0.013491220342203682,
"grad_norm": 0.328125,
"grad_norm_var": 0.00048267046610514324,
"learning_rate": 0.01,
"loss": 1.3782,
"loss/crossentropy": 2.570296287536621,
"loss/fcd": 1.05859375,
"loss/logits": 0.22898489236831665,
"step": 781
},
{
"epoch": 0.013508494632014442,
"grad_norm": 0.29296875,
"grad_norm_var": 0.0004997094472249349,
"learning_rate": 0.01,
"loss": 1.3685,
"loss/crossentropy": 2.282141923904419,
"loss/fcd": 1.05859375,
"loss/logits": 0.2274707406759262,
"step": 782
},
{
"epoch": 0.013525768921825202,
"grad_norm": 0.349609375,
"grad_norm_var": 0.0005658308664957683,
"learning_rate": 0.01,
"loss": 1.41,
"loss/crossentropy": 2.378341317176819,
"loss/fcd": 1.21484375,
"loss/logits": 0.3016776442527771,
"step": 783
},
{
"epoch": 0.013543043211635962,
"grad_norm": 0.27734375,
"grad_norm_var": 0.0005829970041910808,
"learning_rate": 0.01,
"loss": 1.3398,
"loss/crossentropy": 2.6982511281967163,
"loss/fcd": 1.0703125,
"loss/logits": 0.23673634231090546,
"step": 784
},
{
"epoch": 0.013560317501446722,
"grad_norm": 0.30859375,
"grad_norm_var": 0.00046126047770182293,
"learning_rate": 0.01,
"loss": 1.3964,
"loss/crossentropy": 2.371803879737854,
"loss/fcd": 1.1328125,
"loss/logits": 0.23621678352355957,
"step": 785
},
{
"epoch": 0.013577591791257481,
"grad_norm": 0.29296875,
"grad_norm_var": 0.00047771135965983075,
"learning_rate": 0.01,
"loss": 1.3296,
"loss/crossentropy": 2.3509960174560547,
"loss/fcd": 1.05859375,
"loss/logits": 0.23912984877824783,
"step": 786
},
{
"epoch": 0.013594866081068241,
"grad_norm": 0.2734375,
"grad_norm_var": 0.0005536397298177083,
"learning_rate": 0.01,
"loss": 1.3796,
"loss/crossentropy": 2.4273725748062134,
"loss/fcd": 1.08984375,
"loss/logits": 0.2564089596271515,
"step": 787
},
{
"epoch": 0.013612140370879003,
"grad_norm": 0.283203125,
"grad_norm_var": 0.0005254109700520833,
"learning_rate": 0.01,
"loss": 1.3708,
"loss/crossentropy": 2.4844895601272583,
"loss/fcd": 1.09375,
"loss/logits": 0.24952378869056702,
"step": 788
},
{
"epoch": 0.013629414660689763,
"grad_norm": 0.322265625,
"grad_norm_var": 0.0005256493886311848,
"learning_rate": 0.01,
"loss": 1.412,
"loss/crossentropy": 2.415653347969055,
"loss/fcd": 1.1484375,
"loss/logits": 0.2528124749660492,
"step": 789
},
{
"epoch": 0.013646688950500523,
"grad_norm": 0.390625,
"grad_norm_var": 0.0008763472239176432,
"learning_rate": 0.01,
"loss": 1.4382,
"loss/crossentropy": 2.4079452753067017,
"loss/fcd": 1.18359375,
"loss/logits": 0.2838260903954506,
"step": 790
},
{
"epoch": 0.013663963240311282,
"grad_norm": 0.328125,
"grad_norm_var": 0.0008465925852457683,
"learning_rate": 0.01,
"loss": 1.446,
"loss/crossentropy": 2.3247077465057373,
"loss/fcd": 1.1171875,
"loss/logits": 0.2550275847315788,
"step": 791
},
{
"epoch": 0.013681237530122042,
"grad_norm": 0.314453125,
"grad_norm_var": 0.0008462905883789062,
"learning_rate": 0.01,
"loss": 1.3615,
"loss/crossentropy": 2.1464229822158813,
"loss/fcd": 1.13671875,
"loss/logits": 0.25474052131175995,
"step": 792
},
{
"epoch": 0.013698511819932802,
"grad_norm": 0.361328125,
"grad_norm_var": 0.0009821414947509765,
"learning_rate": 0.01,
"loss": 1.4535,
"loss/crossentropy": 2.4427038431167603,
"loss/fcd": 1.0859375,
"loss/logits": 0.2672760635614395,
"step": 793
},
{
"epoch": 0.013715786109743564,
"grad_norm": 0.314453125,
"grad_norm_var": 0.00098114013671875,
"learning_rate": 0.01,
"loss": 1.4375,
"loss/crossentropy": 2.502182126045227,
"loss/fcd": 1.203125,
"loss/logits": 0.30062489211559296,
"step": 794
},
{
"epoch": 0.013733060399554324,
"grad_norm": 0.29296875,
"grad_norm_var": 0.0010176976521809897,
"learning_rate": 0.01,
"loss": 1.3338,
"loss/crossentropy": 2.537824034690857,
"loss/fcd": 1.12109375,
"loss/logits": 0.24768686294555664,
"step": 795
},
{
"epoch": 0.013750334689365084,
"grad_norm": 0.26953125,
"grad_norm_var": 0.0011388142903645834,
"learning_rate": 0.01,
"loss": 1.3676,
"loss/crossentropy": 2.3750780820846558,
"loss/fcd": 1.076171875,
"loss/logits": 0.23160798847675323,
"step": 796
},
{
"epoch": 0.013767608979175843,
"grad_norm": 0.306640625,
"grad_norm_var": 0.001122903823852539,
"learning_rate": 0.01,
"loss": 1.3812,
"loss/crossentropy": 2.628328800201416,
"loss/fcd": 1.13671875,
"loss/logits": 0.2566673457622528,
"step": 797
},
{
"epoch": 0.013784883268986603,
"grad_norm": 0.353515625,
"grad_norm_var": 0.0012051900227864583,
"learning_rate": 0.01,
"loss": 1.3547,
"loss/crossentropy": 2.0953266620635986,
"loss/fcd": 1.1015625,
"loss/logits": 0.23933346569538116,
"step": 798
},
{
"epoch": 0.013802157558797363,
"grad_norm": 0.291015625,
"grad_norm_var": 0.0011489232381184896,
"learning_rate": 0.01,
"loss": 1.4166,
"loss/crossentropy": 2.7266474962234497,
"loss/fcd": 1.140625,
"loss/logits": 0.2656974792480469,
"step": 799
},
{
"epoch": 0.013819431848608125,
"grad_norm": 0.296875,
"grad_norm_var": 0.0010843912760416666,
"learning_rate": 0.01,
"loss": 1.3191,
"loss/crossentropy": 2.459654688835144,
"loss/fcd": 1.12109375,
"loss/logits": 0.2644665837287903,
"step": 800
},
{
"epoch": 0.013836706138418885,
"grad_norm": 0.30859375,
"grad_norm_var": 0.0010843912760416666,
"learning_rate": 0.01,
"loss": 1.4278,
"loss/crossentropy": 2.629300117492676,
"loss/fcd": 1.1171875,
"loss/logits": 0.24821807444095612,
"step": 801
},
{
"epoch": 0.013853980428229645,
"grad_norm": 0.30078125,
"grad_norm_var": 0.0010678609212239583,
"learning_rate": 0.01,
"loss": 1.3413,
"loss/crossentropy": 2.5803698301315308,
"loss/fcd": 1.15234375,
"loss/logits": 0.267608180642128,
"step": 802
},
{
"epoch": 0.013871254718040404,
"grad_norm": 0.275390625,
"grad_norm_var": 0.0010577996571858725,
"learning_rate": 0.01,
"loss": 1.3176,
"loss/crossentropy": 2.349183440208435,
"loss/fcd": 1.09375,
"loss/logits": 0.25479844957590103,
"step": 803
},
{
"epoch": 0.013888529007851164,
"grad_norm": 0.283203125,
"grad_norm_var": 0.0010577996571858725,
"learning_rate": 0.01,
"loss": 1.3783,
"loss/crossentropy": 2.618894100189209,
"loss/fcd": 1.1796875,
"loss/logits": 0.2711791917681694,
"step": 804
},
{
"epoch": 0.013905803297661924,
"grad_norm": 0.322265625,
"grad_norm_var": 0.0010577996571858725,
"learning_rate": 0.01,
"loss": 1.3966,
"loss/crossentropy": 2.3134875893592834,
"loss/fcd": 1.109375,
"loss/logits": 0.2539241313934326,
"step": 805
},
{
"epoch": 0.013923077587472686,
"grad_norm": 0.330078125,
"grad_norm_var": 0.0006611506144205729,
"learning_rate": 0.01,
"loss": 1.441,
"loss/crossentropy": 2.837363600730896,
"loss/fcd": 1.2578125,
"loss/logits": 0.32089151442050934,
"step": 806
},
{
"epoch": 0.013940351877283446,
"grad_norm": 0.2890625,
"grad_norm_var": 0.0006586074829101563,
"learning_rate": 0.01,
"loss": 1.3525,
"loss/crossentropy": 2.377834916114807,
"loss/fcd": 1.06640625,
"loss/logits": 0.23237647861242294,
"step": 807
},
{
"epoch": 0.013957626167094206,
"grad_norm": 0.271484375,
"grad_norm_var": 0.0007306416829427083,
"learning_rate": 0.01,
"loss": 1.3753,
"loss/crossentropy": 2.520345091819763,
"loss/fcd": 1.1484375,
"loss/logits": 0.26999618113040924,
"step": 808
},
{
"epoch": 0.013974900456904965,
"grad_norm": 0.275390625,
"grad_norm_var": 0.0005376180013020833,
"learning_rate": 0.01,
"loss": 1.3425,
"loss/crossentropy": 2.55434787273407,
"loss/fcd": 1.0859375,
"loss/logits": 0.26515287160873413,
"step": 809
},
{
"epoch": 0.013992174746715725,
"grad_norm": 0.28515625,
"grad_norm_var": 0.0005302270253499349,
"learning_rate": 0.01,
"loss": 1.405,
"loss/crossentropy": 2.320609927177429,
"loss/fcd": 1.1328125,
"loss/logits": 0.2443319857120514,
"step": 810
},
{
"epoch": 0.014009449036526485,
"grad_norm": 0.28125,
"grad_norm_var": 0.0005451043446858724,
"learning_rate": 0.01,
"loss": 1.3608,
"loss/crossentropy": 2.3824050426483154,
"loss/fcd": 1.1171875,
"loss/logits": 0.2653844952583313,
"step": 811
},
{
"epoch": 0.014026723326337247,
"grad_norm": 0.318359375,
"grad_norm_var": 0.0005200703938802084,
"learning_rate": 0.01,
"loss": 1.4786,
"loss/crossentropy": 2.459092617034912,
"loss/fcd": 1.18359375,
"loss/logits": 0.2695985734462738,
"step": 812
},
{
"epoch": 0.014043997616148007,
"grad_norm": 0.26953125,
"grad_norm_var": 0.0005698998769124349,
"learning_rate": 0.01,
"loss": 1.2888,
"loss/crossentropy": 2.4817110300064087,
"loss/fcd": 1.0546875,
"loss/logits": 0.22882136702537537,
"step": 813
},
{
"epoch": 0.014061271905958767,
"grad_norm": 0.306640625,
"grad_norm_var": 0.0003539880116780599,
"learning_rate": 0.01,
"loss": 1.3681,
"loss/crossentropy": 2.556985020637512,
"loss/fcd": 1.12109375,
"loss/logits": 0.25683027505874634,
"step": 814
},
{
"epoch": 0.014078546195769526,
"grad_norm": 0.30859375,
"grad_norm_var": 0.0003661473592122396,
"learning_rate": 0.01,
"loss": 1.4285,
"loss/crossentropy": 2.3824613094329834,
"loss/fcd": 1.10546875,
"loss/logits": 0.24755095690488815,
"step": 815
},
{
"epoch": 0.014095820485580286,
"grad_norm": 0.310546875,
"grad_norm_var": 0.00038094520568847655,
"learning_rate": 0.01,
"loss": 1.3345,
"loss/crossentropy": 2.1579148173332214,
"loss/fcd": 1.0625,
"loss/logits": 0.23608000576496124,
"step": 816
},
{
"epoch": 0.014113094775391046,
"grad_norm": 0.3203125,
"grad_norm_var": 0.0004091739654541016,
"learning_rate": 0.01,
"loss": 1.42,
"loss/crossentropy": 2.556256413459778,
"loss/fcd": 1.140625,
"loss/logits": 0.23912374675273895,
"step": 817
},
{
"epoch": 0.014130369065201808,
"grad_norm": 0.29296875,
"grad_norm_var": 0.00040879249572753904,
"learning_rate": 0.01,
"loss": 1.4296,
"loss/crossentropy": 2.497371554374695,
"loss/fcd": 1.08984375,
"loss/logits": 0.24960072338581085,
"step": 818
},
{
"epoch": 0.014147643355012568,
"grad_norm": 0.29296875,
"grad_norm_var": 0.000379180908203125,
"learning_rate": 0.01,
"loss": 1.4164,
"loss/crossentropy": 2.6055017709732056,
"loss/fcd": 1.24609375,
"loss/logits": 0.30321623384952545,
"step": 819
},
{
"epoch": 0.014164917644823328,
"grad_norm": 0.26953125,
"grad_norm_var": 0.00041667620340983075,
"learning_rate": 0.01,
"loss": 1.3435,
"loss/crossentropy": 2.520479202270508,
"loss/fcd": 1.12109375,
"loss/logits": 0.24647565186023712,
"step": 820
},
{
"epoch": 0.014182191934634087,
"grad_norm": 0.283203125,
"grad_norm_var": 0.00037789344787597656,
"learning_rate": 0.01,
"loss": 1.4303,
"loss/crossentropy": 2.4229378700256348,
"loss/fcd": 1.16015625,
"loss/logits": 0.27616265416145325,
"step": 821
},
{
"epoch": 0.014199466224444847,
"grad_norm": 0.294921875,
"grad_norm_var": 0.00028634071350097656,
"learning_rate": 0.01,
"loss": 1.4063,
"loss/crossentropy": 2.642806649208069,
"loss/fcd": 1.11328125,
"loss/logits": 0.24927609413862228,
"step": 822
},
{
"epoch": 0.014216740514255607,
"grad_norm": 0.341796875,
"grad_norm_var": 0.00044040679931640626,
"learning_rate": 0.01,
"loss": 1.4389,
"loss/crossentropy": 2.743402600288391,
"loss/fcd": 1.20703125,
"loss/logits": 0.2956629917025566,
"step": 823
},
{
"epoch": 0.014234014804066367,
"grad_norm": 0.298828125,
"grad_norm_var": 0.00040079752604166665,
"learning_rate": 0.01,
"loss": 1.4283,
"loss/crossentropy": 2.5851320028305054,
"loss/fcd": 1.20703125,
"loss/logits": 0.26086658239364624,
"step": 824
},
{
"epoch": 0.014251289093877129,
"grad_norm": 0.28515625,
"grad_norm_var": 0.0003787835439046224,
"learning_rate": 0.01,
"loss": 1.3569,
"loss/crossentropy": 2.5595767498016357,
"loss/fcd": 1.0703125,
"loss/logits": 0.24868559837341309,
"step": 825
},
{
"epoch": 0.014268563383687888,
"grad_norm": 0.30859375,
"grad_norm_var": 0.0003745873769124349,
"learning_rate": 0.01,
"loss": 1.3698,
"loss/crossentropy": 2.553021550178528,
"loss/fcd": 1.12109375,
"loss/logits": 0.25030215084552765,
"step": 826
},
{
"epoch": 0.014285837673498648,
"grad_norm": 0.31640625,
"grad_norm_var": 0.0003688653310139974,
"learning_rate": 0.01,
"loss": 1.3934,
"loss/crossentropy": 2.459465980529785,
"loss/fcd": 1.2578125,
"loss/logits": 0.27228477597236633,
"step": 827
},
{
"epoch": 0.014303111963309408,
"grad_norm": 0.373046875,
"grad_norm_var": 0.0006812890370686849,
"learning_rate": 0.01,
"loss": 1.403,
"loss/crossentropy": 2.5050086975097656,
"loss/fcd": 1.11328125,
"loss/logits": 0.2527881860733032,
"step": 828
},
{
"epoch": 0.014320386253120168,
"grad_norm": 0.296875,
"grad_norm_var": 0.0006002902984619141,
"learning_rate": 0.01,
"loss": 1.4282,
"loss/crossentropy": 2.5587570667266846,
"loss/fcd": 1.171875,
"loss/logits": 0.2506961077451706,
"step": 829
},
{
"epoch": 0.014337660542930928,
"grad_norm": 0.3046875,
"grad_norm_var": 0.000600433349609375,
"learning_rate": 0.01,
"loss": 1.3663,
"loss/crossentropy": 2.433290719985962,
"loss/fcd": 1.13671875,
"loss/logits": 0.23105743527412415,
"step": 830
},
{
"epoch": 0.01435493483274169,
"grad_norm": 0.345703125,
"grad_norm_var": 0.0006985823313395182,
"learning_rate": 0.01,
"loss": 1.4033,
"loss/crossentropy": 2.2913233041763306,
"loss/fcd": 1.140625,
"loss/logits": 0.2715977430343628,
"step": 831
},
{
"epoch": 0.01437220912255245,
"grad_norm": 0.2890625,
"grad_norm_var": 0.0007214864095052083,
"learning_rate": 0.01,
"loss": 1.3672,
"loss/crossentropy": 2.4408832788467407,
"loss/fcd": 1.109375,
"loss/logits": 0.23768731951713562,
"step": 832
},
{
"epoch": 0.01438948341236321,
"grad_norm": 0.3515625,
"grad_norm_var": 0.0008374532063802083,
"learning_rate": 0.01,
"loss": 1.3927,
"loss/crossentropy": 2.273505926132202,
"loss/fcd": 1.05078125,
"loss/logits": 0.2371639683842659,
"step": 833
},
{
"epoch": 0.01440675770217397,
"grad_norm": 0.302734375,
"grad_norm_var": 0.0008224328358968099,
"learning_rate": 0.01,
"loss": 1.4174,
"loss/crossentropy": 2.304438829421997,
"loss/fcd": 1.10546875,
"loss/logits": 0.2705874443054199,
"step": 834
},
{
"epoch": 0.014424031991984729,
"grad_norm": 0.318359375,
"grad_norm_var": 0.0008061091105143229,
"learning_rate": 0.01,
"loss": 1.413,
"loss/crossentropy": 2.4857107400894165,
"loss/fcd": 1.265625,
"loss/logits": 0.2602947950363159,
"step": 835
},
{
"epoch": 0.014441306281795489,
"grad_norm": 0.33984375,
"grad_norm_var": 0.0007237116495768229,
"learning_rate": 0.01,
"loss": 1.4423,
"loss/crossentropy": 2.4861044883728027,
"loss/fcd": 1.16015625,
"loss/logits": 0.25615356862545013,
"step": 836
},
{
"epoch": 0.01445858057160625,
"grad_norm": 0.267578125,
"grad_norm_var": 0.0008066177368164062,
"learning_rate": 0.01,
"loss": 1.3396,
"loss/crossentropy": 2.3363460302352905,
"loss/fcd": 1.03125,
"loss/logits": 0.2474212720990181,
"step": 837
},
{
"epoch": 0.01447585486141701,
"grad_norm": 0.306640625,
"grad_norm_var": 0.0007843017578125,
"learning_rate": 0.01,
"loss": 1.3885,
"loss/crossentropy": 2.332596778869629,
"loss/fcd": 1.0859375,
"loss/logits": 0.2456573098897934,
"step": 838
},
{
"epoch": 0.01449312915122777,
"grad_norm": 0.322265625,
"grad_norm_var": 0.0007394790649414062,
"learning_rate": 0.01,
"loss": 1.3709,
"loss/crossentropy": 2.613990545272827,
"loss/fcd": 1.1953125,
"loss/logits": 0.2681911140680313,
"step": 839
},
{
"epoch": 0.01451040344103853,
"grad_norm": 0.279296875,
"grad_norm_var": 0.000803375244140625,
"learning_rate": 0.01,
"loss": 1.3305,
"loss/crossentropy": 2.448235511779785,
"loss/fcd": 1.05859375,
"loss/logits": 0.22328373789787292,
"step": 840
},
{
"epoch": 0.01452767773084929,
"grad_norm": 0.275390625,
"grad_norm_var": 0.0008455753326416015,
"learning_rate": 0.01,
"loss": 1.3552,
"loss/crossentropy": 2.4329841136932373,
"loss/fcd": 1.171875,
"loss/logits": 0.2812986671924591,
"step": 841
},
{
"epoch": 0.01454495202066005,
"grad_norm": 0.310546875,
"grad_norm_var": 0.0008448282877604167,
"learning_rate": 0.01,
"loss": 1.4049,
"loss/crossentropy": 2.366762161254883,
"loss/fcd": 1.1640625,
"loss/logits": 0.2537970468401909,
"step": 842
},
{
"epoch": 0.014562226310470812,
"grad_norm": 0.29296875,
"grad_norm_var": 0.0008669535319010417,
"learning_rate": 0.01,
"loss": 1.3474,
"loss/crossentropy": 2.2118855714797974,
"loss/fcd": 1.05859375,
"loss/logits": 0.2319856360554695,
"step": 843
},
{
"epoch": 0.014579500600281571,
"grad_norm": 0.30078125,
"grad_norm_var": 0.0005958398183186849,
"learning_rate": 0.01,
"loss": 1.3672,
"loss/crossentropy": 2.427622437477112,
"loss/fcd": 1.1171875,
"loss/logits": 0.26083478331565857,
"step": 844
},
{
"epoch": 0.014596774890092331,
"grad_norm": 0.322265625,
"grad_norm_var": 0.000603485107421875,
"learning_rate": 0.01,
"loss": 1.3868,
"loss/crossentropy": 2.6780372858047485,
"loss/fcd": 1.2578125,
"loss/logits": 0.2781776934862137,
"step": 845
},
{
"epoch": 0.014614049179903091,
"grad_norm": 0.337890625,
"grad_norm_var": 0.0006572564442952473,
"learning_rate": 0.01,
"loss": 1.3767,
"loss/crossentropy": 2.36633038520813,
"loss/fcd": 1.140625,
"loss/logits": 0.2774253934621811,
"step": 846
},
{
"epoch": 0.014631323469713851,
"grad_norm": 0.263671875,
"grad_norm_var": 0.0006892999013264974,
"learning_rate": 0.01,
"loss": 1.3532,
"loss/crossentropy": 2.598803162574768,
"loss/fcd": 1.1484375,
"loss/logits": 0.27754758298397064,
"step": 847
},
{
"epoch": 0.014648597759524611,
"grad_norm": 0.275390625,
"grad_norm_var": 0.000730133056640625,
"learning_rate": 0.01,
"loss": 1.3245,
"loss/crossentropy": 2.323062777519226,
"loss/fcd": 1.0234375,
"loss/logits": 0.23049668222665787,
"step": 848
},
{
"epoch": 0.014665872049335373,
"grad_norm": 0.3203125,
"grad_norm_var": 0.0005938212076822916,
"learning_rate": 0.01,
"loss": 1.497,
"loss/crossentropy": 2.4116770029067993,
"loss/fcd": 1.1328125,
"loss/logits": 0.25217771530151367,
"step": 849
},
{
"epoch": 0.014683146339146132,
"grad_norm": 0.33203125,
"grad_norm_var": 0.0006493727366129557,
"learning_rate": 0.01,
"loss": 1.3825,
"loss/crossentropy": 2.784231662750244,
"loss/fcd": 1.23828125,
"loss/logits": 0.301376610994339,
"step": 850
},
{
"epoch": 0.014700420628956892,
"grad_norm": 0.68359375,
"grad_norm_var": 0.009682146708170573,
"learning_rate": 0.01,
"loss": 1.5242,
"loss/crossentropy": 2.3721545934677124,
"loss/fcd": 1.09765625,
"loss/logits": 0.2636701613664627,
"step": 851
},
{
"epoch": 0.014717694918767652,
"grad_norm": 0.33984375,
"grad_norm_var": 0.009682146708170573,
"learning_rate": 0.01,
"loss": 1.3491,
"loss/crossentropy": 2.5496045351028442,
"loss/fcd": 1.1171875,
"loss/logits": 0.2594982087612152,
"step": 852
},
{
"epoch": 0.014734969208578412,
"grad_norm": 0.310546875,
"grad_norm_var": 0.009457651774088542,
"learning_rate": 0.01,
"loss": 1.3208,
"loss/crossentropy": 2.211892247200012,
"loss/fcd": 1.0703125,
"loss/logits": 0.21089013665914536,
"step": 853
},
{
"epoch": 0.014752243498389172,
"grad_norm": 0.2890625,
"grad_norm_var": 0.009530750910441081,
"learning_rate": 0.01,
"loss": 1.3612,
"loss/crossentropy": 2.3918616771698,
"loss/fcd": 1.046875,
"loss/logits": 0.2475578412413597,
"step": 854
},
{
"epoch": 0.014769517788199932,
"grad_norm": 0.294921875,
"grad_norm_var": 0.009600178400675455,
"learning_rate": 0.01,
"loss": 1.3711,
"loss/crossentropy": 2.6660208702087402,
"loss/fcd": 1.125,
"loss/logits": 0.25626226514577866,
"step": 855
},
{
"epoch": 0.014786792078010693,
"grad_norm": 0.265625,
"grad_norm_var": 0.009698422749837239,
"learning_rate": 0.01,
"loss": 1.3272,
"loss/crossentropy": 2.4646941423416138,
"loss/fcd": 1.1015625,
"loss/logits": 0.24187320470809937,
"step": 856
},
{
"epoch": 0.014804066367821453,
"grad_norm": 0.296875,
"grad_norm_var": 0.00958250363667806,
"learning_rate": 0.01,
"loss": 1.349,
"loss/crossentropy": 2.889734983444214,
"loss/fcd": 1.23046875,
"loss/logits": 0.28400754928588867,
"step": 857
},
{
"epoch": 0.014821340657632213,
"grad_norm": 0.296875,
"grad_norm_var": 0.009624671936035157,
"learning_rate": 0.01,
"loss": 1.3696,
"loss/crossentropy": 2.4632620811462402,
"loss/fcd": 1.11328125,
"loss/logits": 0.24944238364696503,
"step": 858
},
{
"epoch": 0.014838614947442973,
"grad_norm": 0.28515625,
"grad_norm_var": 0.00966332753499349,
"learning_rate": 0.01,
"loss": 1.3636,
"loss/crossentropy": 2.38780677318573,
"loss/fcd": 1.1171875,
"loss/logits": 0.25044557452201843,
"step": 859
},
{
"epoch": 0.014855889237253733,
"grad_norm": 0.322265625,
"grad_norm_var": 0.009620141983032227,
"learning_rate": 0.01,
"loss": 1.392,
"loss/crossentropy": 2.523656487464905,
"loss/fcd": 1.10546875,
"loss/logits": 0.2575480043888092,
"step": 860
},
{
"epoch": 0.014873163527064493,
"grad_norm": 0.3125,
"grad_norm_var": 0.009632619222005208,
"learning_rate": 0.01,
"loss": 1.3788,
"loss/crossentropy": 2.3901199102401733,
"loss/fcd": 1.109375,
"loss/logits": 0.22918711602687836,
"step": 861
},
{
"epoch": 0.014890437816875254,
"grad_norm": 0.41015625,
"grad_norm_var": 0.010067224502563477,
"learning_rate": 0.01,
"loss": 1.392,
"loss/crossentropy": 2.2604238986968994,
"loss/fcd": 1.29296875,
"loss/logits": 0.28666311502456665,
"step": 862
},
{
"epoch": 0.014907712106686014,
"grad_norm": 0.296875,
"grad_norm_var": 0.00983727773030599,
"learning_rate": 0.01,
"loss": 1.3701,
"loss/crossentropy": 2.1219175457954407,
"loss/fcd": 1.11328125,
"loss/logits": 0.2484002709388733,
"step": 863
},
{
"epoch": 0.014924986396496774,
"grad_norm": 0.3046875,
"grad_norm_var": 0.00966490109761556,
"learning_rate": 0.01,
"loss": 1.4008,
"loss/crossentropy": 2.4230719804763794,
"loss/fcd": 1.08984375,
"loss/logits": 0.2542117089033127,
"step": 864
},
{
"epoch": 0.014942260686307534,
"grad_norm": 0.30859375,
"grad_norm_var": 0.009696563084920248,
"learning_rate": 0.01,
"loss": 1.3599,
"loss/crossentropy": 2.602153182029724,
"loss/fcd": 1.1171875,
"loss/logits": 0.2338126003742218,
"step": 865
},
{
"epoch": 0.014959534976118294,
"grad_norm": 0.310546875,
"grad_norm_var": 0.0097320556640625,
"learning_rate": 0.01,
"loss": 1.3659,
"loss/crossentropy": 2.3879982233047485,
"loss/fcd": 1.11328125,
"loss/logits": 0.25103290379047394,
"step": 866
},
{
"epoch": 0.014976809265929054,
"grad_norm": 0.2734375,
"grad_norm_var": 0.0010736465454101562,
"learning_rate": 0.01,
"loss": 1.348,
"loss/crossentropy": 2.4637222290039062,
"loss/fcd": 1.14453125,
"loss/logits": 0.2280896008014679,
"step": 867
},
{
"epoch": 0.014994083555739815,
"grad_norm": 0.298828125,
"grad_norm_var": 0.0010012149810791015,
"learning_rate": 0.01,
"loss": 1.3708,
"loss/crossentropy": 2.784236192703247,
"loss/fcd": 1.1640625,
"loss/logits": 0.28741903603076935,
"step": 868
},
{
"epoch": 0.015011357845550575,
"grad_norm": 0.298828125,
"grad_norm_var": 0.001000833511352539,
"learning_rate": 0.01,
"loss": 1.4288,
"loss/crossentropy": 2.6332989931106567,
"loss/fcd": 1.27734375,
"loss/logits": 0.3306438624858856,
"step": 869
},
{
"epoch": 0.015028632135361335,
"grad_norm": 0.359375,
"grad_norm_var": 0.0011690616607666015,
"learning_rate": 0.01,
"loss": 1.4187,
"loss/crossentropy": 2.3606460094451904,
"loss/fcd": 1.0546875,
"loss/logits": 0.23307877779006958,
"step": 870
},
{
"epoch": 0.015045906425172095,
"grad_norm": 0.29296875,
"grad_norm_var": 0.0011728286743164062,
"learning_rate": 0.01,
"loss": 1.3553,
"loss/crossentropy": 2.324714183807373,
"loss/fcd": 1.0859375,
"loss/logits": 0.2501022219657898,
"step": 871
},
{
"epoch": 0.015063180714982855,
"grad_norm": 0.283203125,
"grad_norm_var": 0.0010920047760009765,
"learning_rate": 0.01,
"loss": 1.3623,
"loss/crossentropy": 2.328053116798401,
"loss/fcd": 1.140625,
"loss/logits": 0.2553166151046753,
"step": 872
},
{
"epoch": 0.015080455004793615,
"grad_norm": 0.30078125,
"grad_norm_var": 0.00108640988667806,
"learning_rate": 0.01,
"loss": 1.4392,
"loss/crossentropy": 2.377878785133362,
"loss/fcd": 1.15625,
"loss/logits": 0.25394026935100555,
"step": 873
},
{
"epoch": 0.015097729294604376,
"grad_norm": 0.349609375,
"grad_norm_var": 0.0011700948079427084,
"learning_rate": 0.01,
"loss": 1.3398,
"loss/crossentropy": 2.542131185531616,
"loss/fcd": 1.0625,
"loss/logits": 0.24263548851013184,
"step": 874
},
{
"epoch": 0.015115003584415136,
"grad_norm": 0.30078125,
"grad_norm_var": 0.0011273701985677084,
"learning_rate": 0.01,
"loss": 1.3837,
"loss/crossentropy": 2.443636417388916,
"loss/fcd": 1.1328125,
"loss/logits": 0.27580726146698,
"step": 875
},
{
"epoch": 0.015132277874225896,
"grad_norm": 0.318359375,
"grad_norm_var": 0.0011240005493164062,
"learning_rate": 0.01,
"loss": 1.4357,
"loss/crossentropy": 2.752240300178528,
"loss/fcd": 1.17578125,
"loss/logits": 0.2472759708762169,
"step": 876
},
{
"epoch": 0.015149552164036656,
"grad_norm": 0.32421875,
"grad_norm_var": 0.00113067626953125,
"learning_rate": 0.01,
"loss": 1.3789,
"loss/crossentropy": 2.504664421081543,
"loss/fcd": 1.125,
"loss/logits": 0.25199174135923386,
"step": 877
},
{
"epoch": 0.015166826453847416,
"grad_norm": 0.326171875,
"grad_norm_var": 0.0004998366038004557,
"learning_rate": 0.01,
"loss": 1.3978,
"loss/crossentropy": 2.3523584604263306,
"loss/fcd": 1.15234375,
"loss/logits": 0.26311442255973816,
"step": 878
},
{
"epoch": 0.015184100743658176,
"grad_norm": 0.2890625,
"grad_norm_var": 0.0005164941151936849,
"learning_rate": 0.01,
"loss": 1.3575,
"loss/crossentropy": 2.3136786818504333,
"loss/fcd": 1.08984375,
"loss/logits": 0.25283563137054443,
"step": 879
},
{
"epoch": 0.015201375033468937,
"grad_norm": 0.3203125,
"grad_norm_var": 0.0005233605702718099,
"learning_rate": 0.01,
"loss": 1.4031,
"loss/crossentropy": 2.445231080055237,
"loss/fcd": 1.15234375,
"loss/logits": 0.251323863863945,
"step": 880
},
{
"epoch": 0.015218649323279697,
"grad_norm": 0.302734375,
"grad_norm_var": 0.0005263646443684895,
"learning_rate": 0.01,
"loss": 1.4241,
"loss/crossentropy": 2.5056021213531494,
"loss/fcd": 1.109375,
"loss/logits": 0.2573155537247658,
"step": 881
},
{
"epoch": 0.015235923613090457,
"grad_norm": 0.298828125,
"grad_norm_var": 0.0005330403645833333,
"learning_rate": 0.01,
"loss": 1.3779,
"loss/crossentropy": 2.4044970273971558,
"loss/fcd": 1.09765625,
"loss/logits": 0.24030664563179016,
"step": 882
},
{
"epoch": 0.015253197902901217,
"grad_norm": 0.279296875,
"grad_norm_var": 0.0005077203114827474,
"learning_rate": 0.01,
"loss": 1.3391,
"loss/crossentropy": 2.2992568016052246,
"loss/fcd": 1.037109375,
"loss/logits": 0.23432201147079468,
"step": 883
},
{
"epoch": 0.015270472192711977,
"grad_norm": 0.3125,
"grad_norm_var": 0.0005009333292643229,
"learning_rate": 0.01,
"loss": 1.3742,
"loss/crossentropy": 2.346727728843689,
"loss/fcd": 1.05078125,
"loss/logits": 0.2232709527015686,
"step": 884
},
{
"epoch": 0.015287746482522737,
"grad_norm": 0.28125,
"grad_norm_var": 0.0005459944407145182,
"learning_rate": 0.01,
"loss": 1.3237,
"loss/crossentropy": 1.982240617275238,
"loss/fcd": 1.0390625,
"loss/logits": 0.22034113854169846,
"step": 885
},
{
"epoch": 0.015305020772333498,
"grad_norm": 0.306640625,
"grad_norm_var": 0.0003636042277018229,
"learning_rate": 0.01,
"loss": 1.438,
"loss/crossentropy": 2.3263243436813354,
"loss/fcd": 1.1640625,
"loss/logits": 0.24902021139860153,
"step": 886
},
{
"epoch": 0.015322295062144258,
"grad_norm": 0.3203125,
"grad_norm_var": 0.0003649393717447917,
"learning_rate": 0.01,
"loss": 1.3869,
"loss/crossentropy": 2.56560879945755,
"loss/fcd": 1.1328125,
"loss/logits": 0.2558091878890991,
"step": 887
},
{
"epoch": 0.015339569351955018,
"grad_norm": 0.3125,
"grad_norm_var": 0.00032512346903483075,
"learning_rate": 0.01,
"loss": 1.3886,
"loss/crossentropy": 2.4856609106063843,
"loss/fcd": 1.1171875,
"loss/logits": 0.24640005826950073,
"step": 888
},
{
"epoch": 0.015356843641765778,
"grad_norm": 0.349609375,
"grad_norm_var": 0.0004208882649739583,
"learning_rate": 0.01,
"loss": 1.4196,
"loss/crossentropy": 2.55330491065979,
"loss/fcd": 1.14453125,
"loss/logits": 0.25765371322631836,
"step": 889
},
{
"epoch": 0.015374117931576538,
"grad_norm": 0.314453125,
"grad_norm_var": 0.0003218968709309896,
"learning_rate": 0.01,
"loss": 1.3971,
"loss/crossentropy": 2.6354317665100098,
"loss/fcd": 1.3203125,
"loss/logits": 0.3442998379468918,
"step": 890
},
{
"epoch": 0.015391392221387298,
"grad_norm": 0.296875,
"grad_norm_var": 0.00032755533854166664,
"learning_rate": 0.01,
"loss": 1.3998,
"loss/crossentropy": 2.034050762653351,
"loss/fcd": 1.11328125,
"loss/logits": 0.23992937058210373,
"step": 891
},
{
"epoch": 0.015408666511198058,
"grad_norm": 0.322265625,
"grad_norm_var": 0.0003330866495768229,
"learning_rate": 0.01,
"loss": 1.4362,
"loss/crossentropy": 2.7760528326034546,
"loss/fcd": 1.16796875,
"loss/logits": 0.2806248515844345,
"step": 892
},
{
"epoch": 0.01542594080100882,
"grad_norm": 0.34375,
"grad_norm_var": 0.000394439697265625,
"learning_rate": 0.01,
"loss": 1.4516,
"loss/crossentropy": 2.26086688041687,
"loss/fcd": 1.2109375,
"loss/logits": 0.31815242767333984,
"step": 893
},
{
"epoch": 0.015443215090819579,
"grad_norm": 0.333984375,
"grad_norm_var": 0.0004140218098958333,
"learning_rate": 0.01,
"loss": 1.3702,
"loss/crossentropy": 2.5985008478164673,
"loss/fcd": 1.12890625,
"loss/logits": 0.2603040784597397,
"step": 894
},
{
"epoch": 0.015460489380630339,
"grad_norm": 0.294921875,
"grad_norm_var": 0.0003986199696858724,
"learning_rate": 0.01,
"loss": 1.3509,
"loss/crossentropy": 2.3431901335716248,
"loss/fcd": 1.08984375,
"loss/logits": 0.22906331717967987,
"step": 895
},
{
"epoch": 0.015477763670441099,
"grad_norm": 0.296875,
"grad_norm_var": 0.0004066308339436849,
"learning_rate": 0.01,
"loss": 1.3537,
"loss/crossentropy": 2.4866254329681396,
"loss/fcd": 1.1015625,
"loss/logits": 0.23776976764202118,
"step": 896
},
{
"epoch": 0.015495037960251859,
"grad_norm": 0.3203125,
"grad_norm_var": 0.00040791829427083335,
"learning_rate": 0.01,
"loss": 1.3942,
"loss/crossentropy": 2.656658411026001,
"loss/fcd": 1.11328125,
"loss/logits": 0.265699565410614,
"step": 897
},
{
"epoch": 0.015512312250062619,
"grad_norm": 0.30078125,
"grad_norm_var": 0.0004048506418863932,
"learning_rate": 0.01,
"loss": 1.398,
"loss/crossentropy": 2.508056640625,
"loss/fcd": 1.14453125,
"loss/logits": 0.2679043859243393,
"step": 898
},
{
"epoch": 0.01552958653987338,
"grad_norm": 0.31640625,
"grad_norm_var": 0.00033086140950520834,
"learning_rate": 0.01,
"loss": 1.3786,
"loss/crossentropy": 2.241898775100708,
"loss/fcd": 1.08984375,
"loss/logits": 0.23984474688768387,
"step": 899
},
{
"epoch": 0.01554686082968414,
"grad_norm": 0.291015625,
"grad_norm_var": 0.0003639062245686849,
"learning_rate": 0.01,
"loss": 1.4062,
"loss/crossentropy": 2.563822388648987,
"loss/fcd": 1.12890625,
"loss/logits": 0.2376401573419571,
"step": 900
},
{
"epoch": 0.0155641351194949,
"grad_norm": 0.2890625,
"grad_norm_var": 0.0003350416819254557,
"learning_rate": 0.01,
"loss": 1.3943,
"loss/crossentropy": 2.4819493293762207,
"loss/fcd": 1.140625,
"loss/logits": 0.26604655385017395,
"step": 901
},
{
"epoch": 0.01558140940930566,
"grad_norm": 0.30078125,
"grad_norm_var": 0.0003422419230143229,
"learning_rate": 0.01,
"loss": 1.438,
"loss/crossentropy": 2.6099933385849,
"loss/fcd": 1.21484375,
"loss/logits": 0.2890657037496567,
"step": 902
},
{
"epoch": 0.01559868369911642,
"grad_norm": 0.3125,
"grad_norm_var": 0.00033817291259765627,
"learning_rate": 0.01,
"loss": 1.4034,
"loss/crossentropy": 2.5849201679229736,
"loss/fcd": 1.16796875,
"loss/logits": 0.2732825428247452,
"step": 903
},
{
"epoch": 0.01561595798892718,
"grad_norm": 0.322265625,
"grad_norm_var": 0.0003444512685139974,
"learning_rate": 0.01,
"loss": 1.3811,
"loss/crossentropy": 2.3671282529830933,
"loss/fcd": 1.10546875,
"loss/logits": 0.24938072264194489,
"step": 904
},
{
"epoch": 0.01563323227873794,
"grad_norm": 0.3125,
"grad_norm_var": 0.00024871826171875,
"learning_rate": 0.01,
"loss": 1.3843,
"loss/crossentropy": 2.1398147344589233,
"loss/fcd": 1.07421875,
"loss/logits": 0.2394903600215912,
"step": 905
},
{
"epoch": 0.0156505065685487,
"grad_norm": 0.2890625,
"grad_norm_var": 0.0002757867177327474,
"learning_rate": 0.01,
"loss": 1.3808,
"loss/crossentropy": 2.3531702756881714,
"loss/fcd": 1.08984375,
"loss/logits": 0.25511349737644196,
"step": 906
},
{
"epoch": 0.01566778085835946,
"grad_norm": 0.32421875,
"grad_norm_var": 0.0002784570058186849,
"learning_rate": 0.01,
"loss": 1.3835,
"loss/crossentropy": 2.5271737575531006,
"loss/fcd": 1.109375,
"loss/logits": 0.25303974002599716,
"step": 907
},
{
"epoch": 0.015685055148170222,
"grad_norm": 0.287109375,
"grad_norm_var": 0.0003013451894124349,
"learning_rate": 0.01,
"loss": 1.3966,
"loss/crossentropy": 2.50630259513855,
"loss/fcd": 1.25390625,
"loss/logits": 0.28888703882694244,
"step": 908
},
{
"epoch": 0.015702329437980982,
"grad_norm": 0.294921875,
"grad_norm_var": 0.00022068023681640626,
"learning_rate": 0.01,
"loss": 1.3997,
"loss/crossentropy": 2.6066339015960693,
"loss/fcd": 1.13671875,
"loss/logits": 0.24285603314638138,
"step": 909
},
{
"epoch": 0.015719603727791742,
"grad_norm": 0.296875,
"grad_norm_var": 0.00016541481018066405,
"learning_rate": 0.01,
"loss": 1.3514,
"loss/crossentropy": 2.349377751350403,
"loss/fcd": 1.07421875,
"loss/logits": 0.2379670813679695,
"step": 910
},
{
"epoch": 0.015736878017602502,
"grad_norm": 0.283203125,
"grad_norm_var": 0.00018677711486816406,
"learning_rate": 0.01,
"loss": 1.3697,
"loss/crossentropy": 2.493922233581543,
"loss/fcd": 1.109375,
"loss/logits": 0.2561178654432297,
"step": 911
},
{
"epoch": 0.015754152307413262,
"grad_norm": 0.279296875,
"grad_norm_var": 0.000218963623046875,
"learning_rate": 0.01,
"loss": 1.3902,
"loss/crossentropy": 2.17154997587204,
"loss/fcd": 1.04296875,
"loss/logits": 0.22504562884569168,
"step": 912
},
{
"epoch": 0.015771426597224022,
"grad_norm": 0.27734375,
"grad_norm_var": 0.00022525787353515624,
"learning_rate": 0.01,
"loss": 1.3458,
"loss/crossentropy": 2.4228713512420654,
"loss/fcd": 1.125,
"loss/logits": 0.26753516495227814,
"step": 913
},
{
"epoch": 0.01578870088703478,
"grad_norm": 0.3046875,
"grad_norm_var": 0.00022735595703125,
"learning_rate": 0.01,
"loss": 1.4741,
"loss/crossentropy": 2.25216805934906,
"loss/fcd": 1.23046875,
"loss/logits": 0.33171379566192627,
"step": 914
},
{
"epoch": 0.01580597517684554,
"grad_norm": 0.30859375,
"grad_norm_var": 0.000212860107421875,
"learning_rate": 0.01,
"loss": 1.3429,
"loss/crossentropy": 2.1387062072753906,
"loss/fcd": 1.07421875,
"loss/logits": 0.24266959726810455,
"step": 915
},
{
"epoch": 0.0158232494666563,
"grad_norm": 0.33203125,
"grad_norm_var": 0.00027794837951660155,
"learning_rate": 0.01,
"loss": 1.4493,
"loss/crossentropy": 2.02074271440506,
"loss/fcd": 1.23828125,
"loss/logits": 0.25191547721624374,
"step": 916
},
{
"epoch": 0.01584052375646706,
"grad_norm": 0.7734375,
"grad_norm_var": 0.01417692502339681,
"learning_rate": 0.01,
"loss": 1.4196,
"loss/crossentropy": 2.47384512424469,
"loss/fcd": 1.1484375,
"loss/logits": 0.2742984741926193,
"step": 917
},
{
"epoch": 0.01585779804627782,
"grad_norm": 0.291015625,
"grad_norm_var": 0.014222462972005209,
"learning_rate": 0.01,
"loss": 1.3766,
"loss/crossentropy": 2.5627119541168213,
"loss/fcd": 1.18359375,
"loss/logits": 0.27059850841760635,
"step": 918
},
{
"epoch": 0.01587507233608858,
"grad_norm": 0.31640625,
"grad_norm_var": 0.014214007059733073,
"learning_rate": 0.01,
"loss": 1.4257,
"loss/crossentropy": 2.5728260278701782,
"loss/fcd": 1.11328125,
"loss/logits": 0.26422248035669327,
"step": 919
},
{
"epoch": 0.015892346625899344,
"grad_norm": 0.306640625,
"grad_norm_var": 0.01424706776936849,
"learning_rate": 0.01,
"loss": 1.3441,
"loss/crossentropy": 2.3634893894195557,
"loss/fcd": 1.13671875,
"loss/logits": 0.2779320180416107,
"step": 920
},
{
"epoch": 0.015909620915710104,
"grad_norm": 0.3359375,
"grad_norm_var": 0.01422723134358724,
"learning_rate": 0.01,
"loss": 1.4555,
"loss/crossentropy": 2.176904857158661,
"loss/fcd": 1.2109375,
"loss/logits": 0.2693602591753006,
"step": 921
},
{
"epoch": 0.015926895205520864,
"grad_norm": 0.30078125,
"grad_norm_var": 0.014169820149739583,
"learning_rate": 0.01,
"loss": 1.3614,
"loss/crossentropy": 2.611035466194153,
"loss/fcd": 1.16796875,
"loss/logits": 0.2582136243581772,
"step": 922
},
{
"epoch": 0.015944169495331624,
"grad_norm": 0.3125,
"grad_norm_var": 0.014190610249837239,
"learning_rate": 0.01,
"loss": 1.3666,
"loss/crossentropy": 2.353346347808838,
"loss/fcd": 1.11328125,
"loss/logits": 0.24240678548812866,
"step": 923
},
{
"epoch": 0.015961443785142384,
"grad_norm": 0.283203125,
"grad_norm_var": 0.014214579264322917,
"learning_rate": 0.01,
"loss": 1.3461,
"loss/crossentropy": 2.3549081087112427,
"loss/fcd": 1.07421875,
"loss/logits": 0.2364579290151596,
"step": 924
},
{
"epoch": 0.015978718074953144,
"grad_norm": 0.29296875,
"grad_norm_var": 0.014224227269490559,
"learning_rate": 0.01,
"loss": 1.3649,
"loss/crossentropy": 2.4736167192459106,
"loss/fcd": 1.15234375,
"loss/logits": 0.26356005668640137,
"step": 925
},
{
"epoch": 0.015995992364763904,
"grad_norm": 0.310546875,
"grad_norm_var": 0.01417382558186849,
"learning_rate": 0.01,
"loss": 1.4136,
"loss/crossentropy": 2.3580808639526367,
"loss/fcd": 1.2578125,
"loss/logits": 0.2911546379327774,
"step": 926
},
{
"epoch": 0.016013266654574664,
"grad_norm": 0.298828125,
"grad_norm_var": 0.014087867736816407,
"learning_rate": 0.01,
"loss": 1.3582,
"loss/crossentropy": 2.476295828819275,
"loss/fcd": 1.171875,
"loss/logits": 0.267447791993618,
"step": 927
},
{
"epoch": 0.016030540944385423,
"grad_norm": 0.322265625,
"grad_norm_var": 0.013896942138671875,
"learning_rate": 0.01,
"loss": 1.411,
"loss/crossentropy": 2.6316243410110474,
"loss/fcd": 1.16796875,
"loss/logits": 0.2681735157966614,
"step": 928
},
{
"epoch": 0.016047815234196183,
"grad_norm": 0.28515625,
"grad_norm_var": 0.013840230305989583,
"learning_rate": 0.01,
"loss": 1.3853,
"loss/crossentropy": 2.5550700426101685,
"loss/fcd": 1.12109375,
"loss/logits": 0.25278639793395996,
"step": 929
},
{
"epoch": 0.016065089524006943,
"grad_norm": 0.296875,
"grad_norm_var": 0.013876597086588541,
"learning_rate": 0.01,
"loss": 1.4125,
"loss/crossentropy": 2.511132836341858,
"loss/fcd": 1.1796875,
"loss/logits": 0.26167523860931396,
"step": 930
},
{
"epoch": 0.016082363813817703,
"grad_norm": 0.279296875,
"grad_norm_var": 0.01403514544169108,
"learning_rate": 0.01,
"loss": 1.351,
"loss/crossentropy": 2.468320608139038,
"loss/fcd": 1.1484375,
"loss/logits": 0.254236102104187,
"step": 931
},
{
"epoch": 0.016099638103628463,
"grad_norm": 0.3125,
"grad_norm_var": 0.014063119888305664,
"learning_rate": 0.01,
"loss": 1.3762,
"loss/crossentropy": 2.7182319164276123,
"loss/fcd": 1.1171875,
"loss/logits": 0.25874409079551697,
"step": 932
},
{
"epoch": 0.016116912393439226,
"grad_norm": 0.30859375,
"grad_norm_var": 0.00023280779520670574,
"learning_rate": 0.01,
"loss": 1.3703,
"loss/crossentropy": 2.3206039667129517,
"loss/fcd": 1.09765625,
"loss/logits": 0.2651352882385254,
"step": 933
},
{
"epoch": 0.016134186683249986,
"grad_norm": 0.310546875,
"grad_norm_var": 0.00022454261779785155,
"learning_rate": 0.01,
"loss": 1.3802,
"loss/crossentropy": 2.498626470565796,
"loss/fcd": 1.13671875,
"loss/logits": 0.259146973490715,
"step": 934
},
{
"epoch": 0.016151460973060746,
"grad_norm": 0.28125,
"grad_norm_var": 0.00024628639221191406,
"learning_rate": 0.01,
"loss": 1.3612,
"loss/crossentropy": 2.3583563566207886,
"loss/fcd": 1.06640625,
"loss/logits": 0.24286328256130219,
"step": 935
},
{
"epoch": 0.016168735262871506,
"grad_norm": 0.29296875,
"grad_norm_var": 0.00025018056233723957,
"learning_rate": 0.01,
"loss": 1.4221,
"loss/crossentropy": 2.4976600408554077,
"loss/fcd": 1.234375,
"loss/logits": 0.274882972240448,
"step": 936
},
{
"epoch": 0.016186009552682266,
"grad_norm": 0.376953125,
"grad_norm_var": 0.0005435784657796224,
"learning_rate": 0.01,
"loss": 1.4084,
"loss/crossentropy": 2.4435365200042725,
"loss/fcd": 1.16796875,
"loss/logits": 0.2811162769794464,
"step": 937
},
{
"epoch": 0.016203283842493026,
"grad_norm": 0.28515625,
"grad_norm_var": 0.0005657037099202473,
"learning_rate": 0.01,
"loss": 1.3418,
"loss/crossentropy": 2.3197275400161743,
"loss/fcd": 1.1015625,
"loss/logits": 0.26322872936725616,
"step": 938
},
{
"epoch": 0.016220558132303785,
"grad_norm": 0.2890625,
"grad_norm_var": 0.0005706628163655599,
"learning_rate": 0.01,
"loss": 1.3881,
"loss/crossentropy": 2.6520742177963257,
"loss/fcd": 1.16015625,
"loss/logits": 0.2682619243860245,
"step": 939
},
{
"epoch": 0.016237832422114545,
"grad_norm": 0.28125,
"grad_norm_var": 0.0005757013956705729,
"learning_rate": 0.01,
"loss": 1.3772,
"loss/crossentropy": 2.4414173364639282,
"loss/fcd": 1.09375,
"loss/logits": 0.23820270597934723,
"step": 940
},
{
"epoch": 0.016255106711925305,
"grad_norm": 0.294921875,
"grad_norm_var": 0.0005737145741780599,
"learning_rate": 0.01,
"loss": 1.4165,
"loss/crossentropy": 2.4042497873306274,
"loss/fcd": 1.1328125,
"loss/logits": 0.2601849138736725,
"step": 941
},
{
"epoch": 0.016272381001736065,
"grad_norm": 0.30859375,
"grad_norm_var": 0.0005716323852539062,
"learning_rate": 0.01,
"loss": 1.4208,
"loss/crossentropy": 2.4315325021743774,
"loss/fcd": 1.16015625,
"loss/logits": 0.2801144868135452,
"step": 942
},
{
"epoch": 0.016289655291546825,
"grad_norm": 0.33984375,
"grad_norm_var": 0.0006620883941650391,
"learning_rate": 0.01,
"loss": 1.4608,
"loss/crossentropy": 2.545047879219055,
"loss/fcd": 1.2890625,
"loss/logits": 0.33230888843536377,
"step": 943
},
{
"epoch": 0.016306929581357585,
"grad_norm": 0.2734375,
"grad_norm_var": 0.0006926854451497396,
"learning_rate": 0.01,
"loss": 1.329,
"loss/crossentropy": 2.259741187095642,
"loss/fcd": 1.06640625,
"loss/logits": 0.2455529421567917,
"step": 944
},
{
"epoch": 0.016324203871168348,
"grad_norm": 0.294921875,
"grad_norm_var": 0.0006779829661051432,
"learning_rate": 0.01,
"loss": 1.3409,
"loss/crossentropy": 2.3239141702651978,
"loss/fcd": 1.10546875,
"loss/logits": 0.24660293757915497,
"step": 945
},
{
"epoch": 0.016341478160979108,
"grad_norm": 0.33203125,
"grad_norm_var": 0.0007329146067301432,
"learning_rate": 0.01,
"loss": 1.4681,
"loss/crossentropy": 2.3145695328712463,
"loss/fcd": 1.1015625,
"loss/logits": 0.24830932170152664,
"step": 946
},
{
"epoch": 0.016358752450789868,
"grad_norm": 0.330078125,
"grad_norm_var": 0.0007279555002848308,
"learning_rate": 0.01,
"loss": 1.5011,
"loss/crossentropy": 2.350569486618042,
"loss/fcd": 1.1875,
"loss/logits": 0.2759709805250168,
"step": 947
},
{
"epoch": 0.016376026740600628,
"grad_norm": 0.470703125,
"grad_norm_var": 0.0024080912272135416,
"learning_rate": 0.01,
"loss": 1.52,
"loss/crossentropy": 2.034683883190155,
"loss/fcd": 1.2421875,
"loss/logits": 0.28756849467754364,
"step": 948
},
{
"epoch": 0.016393301030411388,
"grad_norm": 0.33984375,
"grad_norm_var": 0.002434539794921875,
"learning_rate": 0.01,
"loss": 1.4182,
"loss/crossentropy": 2.5900092124938965,
"loss/fcd": 1.15625,
"loss/logits": 0.26620975136756897,
"step": 949
},
{
"epoch": 0.016410575320222148,
"grad_norm": 0.314453125,
"grad_norm_var": 0.002431170145670573,
"learning_rate": 0.01,
"loss": 1.4163,
"loss/crossentropy": 2.458656430244446,
"loss/fcd": 1.1953125,
"loss/logits": 0.27218569815158844,
"step": 950
},
{
"epoch": 0.016427849610032907,
"grad_norm": 0.326171875,
"grad_norm_var": 0.002330636978149414,
"learning_rate": 0.01,
"loss": 1.5638,
"loss/crossentropy": 2.581447720527649,
"loss/fcd": 1.2265625,
"loss/logits": 0.2988656759262085,
"step": 951
},
{
"epoch": 0.016445123899843667,
"grad_norm": 0.412109375,
"grad_norm_var": 0.002758216857910156,
"learning_rate": 0.01,
"loss": 1.5667,
"loss/crossentropy": 2.21357798576355,
"loss/fcd": 1.2421875,
"loss/logits": 0.30781693756580353,
"step": 952
},
{
"epoch": 0.016462398189654427,
"grad_norm": 0.29296875,
"grad_norm_var": 0.0026659488677978514,
"learning_rate": 0.01,
"loss": 1.3711,
"loss/crossentropy": 2.1623282432556152,
"loss/fcd": 1.06640625,
"loss/logits": 0.24749789386987686,
"step": 953
},
{
"epoch": 0.016479672479465187,
"grad_norm": 0.306640625,
"grad_norm_var": 0.0025832494099934894,
"learning_rate": 0.01,
"loss": 1.3663,
"loss/crossentropy": 2.683838129043579,
"loss/fcd": 1.19921875,
"loss/logits": 0.2529330998659134,
"step": 954
},
{
"epoch": 0.016496946769275947,
"grad_norm": 0.265625,
"grad_norm_var": 0.0027312596638997396,
"learning_rate": 0.01,
"loss": 1.3548,
"loss/crossentropy": 2.3420257568359375,
"loss/fcd": 1.0859375,
"loss/logits": 0.253268837928772,
"step": 955
},
{
"epoch": 0.016514221059086707,
"grad_norm": 0.35546875,
"grad_norm_var": 0.002652740478515625,
"learning_rate": 0.01,
"loss": 1.369,
"loss/crossentropy": 2.3265002965927124,
"loss/fcd": 1.1171875,
"loss/logits": 0.24975580722093582,
"step": 956
},
{
"epoch": 0.01653149534889747,
"grad_norm": 0.3203125,
"grad_norm_var": 0.0025789737701416016,
"learning_rate": 0.01,
"loss": 1.3913,
"loss/crossentropy": 2.4944993257522583,
"loss/fcd": 1.16796875,
"loss/logits": 0.25516972690820694,
"step": 957
},
{
"epoch": 0.01654876963870823,
"grad_norm": 0.3046875,
"grad_norm_var": 0.0025911808013916017,
"learning_rate": 0.01,
"loss": 1.3542,
"loss/crossentropy": 2.583009362220764,
"loss/fcd": 1.15625,
"loss/logits": 0.26096589863300323,
"step": 958
},
{
"epoch": 0.01656604392851899,
"grad_norm": 0.287109375,
"grad_norm_var": 0.002695465087890625,
"learning_rate": 0.01,
"loss": 1.4014,
"loss/crossentropy": 2.6060469150543213,
"loss/fcd": 1.1796875,
"loss/logits": 0.298343300819397,
"step": 959
},
{
"epoch": 0.01658331821832975,
"grad_norm": 0.322265625,
"grad_norm_var": 0.0024979750315348307,
"learning_rate": 0.01,
"loss": 1.4127,
"loss/crossentropy": 2.4206702709198,
"loss/fcd": 1.140625,
"loss/logits": 0.2592027187347412,
"step": 960
},
{
"epoch": 0.01660059250814051,
"grad_norm": 0.302734375,
"grad_norm_var": 0.002465550104777018,
"learning_rate": 0.01,
"loss": 1.4039,
"loss/crossentropy": 2.18042528629303,
"loss/fcd": 1.10546875,
"loss/logits": 0.28101974725723267,
"step": 961
},
{
"epoch": 0.01661786679795127,
"grad_norm": 0.353515625,
"grad_norm_var": 0.0024996439615885416,
"learning_rate": 0.01,
"loss": 1.3448,
"loss/crossentropy": 2.2248626947402954,
"loss/fcd": 1.07421875,
"loss/logits": 0.2191808819770813,
"step": 962
},
{
"epoch": 0.01663514108776203,
"grad_norm": 0.357421875,
"grad_norm_var": 0.002541033426920573,
"learning_rate": 0.01,
"loss": 1.4061,
"loss/crossentropy": 2.476745128631592,
"loss/fcd": 1.14453125,
"loss/logits": 0.26761066913604736,
"step": 963
},
{
"epoch": 0.01665241537757279,
"grad_norm": 0.33984375,
"grad_norm_var": 0.00121305783589681,
"learning_rate": 0.01,
"loss": 1.427,
"loss/crossentropy": 2.3096065521240234,
"loss/fcd": 1.234375,
"loss/logits": 0.42609208822250366,
"step": 964
},
{
"epoch": 0.01666968966738355,
"grad_norm": 0.31640625,
"grad_norm_var": 0.0012012322743733723,
"learning_rate": 0.01,
"loss": 1.3957,
"loss/crossentropy": 2.7282618284225464,
"loss/fcd": 1.20703125,
"loss/logits": 0.28854241967201233,
"step": 965
},
{
"epoch": 0.01668696395719431,
"grad_norm": 0.291015625,
"grad_norm_var": 0.0012641747792561848,
"learning_rate": 0.01,
"loss": 1.3752,
"loss/crossentropy": 2.339871048927307,
"loss/fcd": 1.08984375,
"loss/logits": 0.2586899399757385,
"step": 966
},
{
"epoch": 0.01670423824700507,
"grad_norm": 0.263671875,
"grad_norm_var": 0.001474746068318685,
"learning_rate": 0.01,
"loss": 1.3006,
"loss/crossentropy": 2.3013978004455566,
"loss/fcd": 1.046875,
"loss/logits": 0.22273491322994232,
"step": 967
},
{
"epoch": 0.01672151253681583,
"grad_norm": 0.30078125,
"grad_norm_var": 0.0008559544881184896,
"learning_rate": 0.01,
"loss": 1.4225,
"loss/crossentropy": 2.47222638130188,
"loss/fcd": 1.2421875,
"loss/logits": 0.2986321449279785,
"step": 968
},
{
"epoch": 0.01673878682662659,
"grad_norm": 0.29296875,
"grad_norm_var": 0.0008559544881184896,
"learning_rate": 0.01,
"loss": 1.4188,
"loss/crossentropy": 2.2383479475975037,
"loss/fcd": 1.2265625,
"loss/logits": 0.3132626414299011,
"step": 969
},
{
"epoch": 0.016756061116437352,
"grad_norm": 0.32421875,
"grad_norm_var": 0.0008643945058186849,
"learning_rate": 0.01,
"loss": 1.363,
"loss/crossentropy": 2.5179413557052612,
"loss/fcd": 1.09375,
"loss/logits": 0.2516755014657974,
"step": 970
},
{
"epoch": 0.016773335406248112,
"grad_norm": 0.279296875,
"grad_norm_var": 0.0007908503214518229,
"learning_rate": 0.01,
"loss": 1.3967,
"loss/crossentropy": 1.9743611812591553,
"loss/fcd": 1.05859375,
"loss/logits": 0.24054741859436035,
"step": 971
},
{
"epoch": 0.016790609696058872,
"grad_norm": 0.275390625,
"grad_norm_var": 0.000740671157836914,
"learning_rate": 0.01,
"loss": 1.3595,
"loss/crossentropy": 2.405099630355835,
"loss/fcd": 1.15234375,
"loss/logits": 0.28836295008659363,
"step": 972
},
{
"epoch": 0.01680788398586963,
"grad_norm": 0.3046875,
"grad_norm_var": 0.0007307529449462891,
"learning_rate": 0.01,
"loss": 1.4048,
"loss/crossentropy": 2.583898901939392,
"loss/fcd": 1.2109375,
"loss/logits": 0.2704490125179291,
"step": 973
},
{
"epoch": 0.01682515827568039,
"grad_norm": 0.3125,
"grad_norm_var": 0.0007318973541259766,
"learning_rate": 0.01,
"loss": 1.4402,
"loss/crossentropy": 2.486370801925659,
"loss/fcd": 1.140625,
"loss/logits": 0.2756696939468384,
"step": 974
},
{
"epoch": 0.01684243256549115,
"grad_norm": 0.28125,
"grad_norm_var": 0.0007501602172851563,
"learning_rate": 0.01,
"loss": 1.3591,
"loss/crossentropy": 2.421715497970581,
"loss/fcd": 1.0390625,
"loss/logits": 0.22876735776662827,
"step": 975
},
{
"epoch": 0.01685970685530191,
"grad_norm": 0.318359375,
"grad_norm_var": 0.0007433573404947917,
"learning_rate": 0.01,
"loss": 1.3213,
"loss/crossentropy": 2.4171801805496216,
"loss/fcd": 1.09765625,
"loss/logits": 0.23518116772174835,
"step": 976
},
{
"epoch": 0.01687698114511267,
"grad_norm": 0.4140625,
"grad_norm_var": 0.0014527479807535807,
"learning_rate": 0.01,
"loss": 1.4776,
"loss/crossentropy": 2.080851912498474,
"loss/fcd": 1.26953125,
"loss/logits": 0.22676381468772888,
"step": 977
},
{
"epoch": 0.01689425543492343,
"grad_norm": 0.2734375,
"grad_norm_var": 0.0014325459798177084,
"learning_rate": 0.01,
"loss": 1.3453,
"loss/crossentropy": 2.2649213075637817,
"loss/fcd": 1.09765625,
"loss/logits": 0.2382289096713066,
"step": 978
},
{
"epoch": 0.01691152972473419,
"grad_norm": 0.26953125,
"grad_norm_var": 0.00134886105855306,
"learning_rate": 0.01,
"loss": 1.4216,
"loss/crossentropy": 2.4842547178268433,
"loss/fcd": 1.16015625,
"loss/logits": 0.27352918684482574,
"step": 979
},
{
"epoch": 0.01692880401454495,
"grad_norm": 0.291015625,
"grad_norm_var": 0.0012618382771809897,
"learning_rate": 0.01,
"loss": 1.3782,
"loss/crossentropy": 2.163589835166931,
"loss/fcd": 1.125,
"loss/logits": 0.26143455505371094,
"step": 980
},
{
"epoch": 0.01694607830435571,
"grad_norm": 0.28515625,
"grad_norm_var": 0.0012567520141601562,
"learning_rate": 0.01,
"loss": 1.3912,
"loss/crossentropy": 2.421532988548279,
"loss/fcd": 1.08203125,
"loss/logits": 0.23204928636550903,
"step": 981
},
{
"epoch": 0.016963352594166474,
"grad_norm": 0.28125,
"grad_norm_var": 0.00127256711324056,
"learning_rate": 0.01,
"loss": 1.3826,
"loss/crossentropy": 2.607829451560974,
"loss/fcd": 1.125,
"loss/logits": 0.2582753002643585,
"step": 982
},
{
"epoch": 0.016980626883977234,
"grad_norm": 0.3359375,
"grad_norm_var": 0.0012684504191080729,
"learning_rate": 0.01,
"loss": 1.3938,
"loss/crossentropy": 2.430111050605774,
"loss/fcd": 1.10546875,
"loss/logits": 0.2326122149825096,
"step": 983
},
{
"epoch": 0.016997901173787994,
"grad_norm": 0.369140625,
"grad_norm_var": 0.001544936498006185,
"learning_rate": 0.01,
"loss": 1.4349,
"loss/crossentropy": 2.584348440170288,
"loss/fcd": 1.18359375,
"loss/logits": 0.27420538663864136,
"step": 984
},
{
"epoch": 0.017015175463598754,
"grad_norm": 0.333984375,
"grad_norm_var": 0.0015746434529622397,
"learning_rate": 0.01,
"loss": 1.4002,
"loss/crossentropy": 2.6233400106430054,
"loss/fcd": 1.13671875,
"loss/logits": 0.2728031575679779,
"step": 985
},
{
"epoch": 0.017032449753409513,
"grad_norm": 0.3203125,
"grad_norm_var": 0.001567840576171875,
"learning_rate": 0.01,
"loss": 1.3921,
"loss/crossentropy": 2.2127867937088013,
"loss/fcd": 1.1328125,
"loss/logits": 0.24761803448200226,
"step": 986
},
{
"epoch": 0.017049724043220273,
"grad_norm": 0.322265625,
"grad_norm_var": 0.0015125910441080729,
"learning_rate": 0.01,
"loss": 1.4117,
"loss/crossentropy": 2.4916510581970215,
"loss/fcd": 1.140625,
"loss/logits": 0.2528844252228737,
"step": 987
},
{
"epoch": 0.017066998333031033,
"grad_norm": 0.28515625,
"grad_norm_var": 0.0014711856842041016,
"learning_rate": 0.01,
"loss": 1.3702,
"loss/crossentropy": 2.076325237751007,
"loss/fcd": 1.109375,
"loss/logits": 0.24822547286748886,
"step": 988
},
{
"epoch": 0.017084272622841793,
"grad_norm": 0.28515625,
"grad_norm_var": 0.0015150547027587891,
"learning_rate": 0.01,
"loss": 1.375,
"loss/crossentropy": 2.2751649618148804,
"loss/fcd": 1.06640625,
"loss/logits": 0.2591545879840851,
"step": 989
},
{
"epoch": 0.017101546912652553,
"grad_norm": 0.3046875,
"grad_norm_var": 0.001517470677693685,
"learning_rate": 0.01,
"loss": 1.3438,
"loss/crossentropy": 2.564236044883728,
"loss/fcd": 1.10546875,
"loss/logits": 0.2575865834951401,
"step": 990
},
{
"epoch": 0.017118821202463313,
"grad_norm": 0.27734375,
"grad_norm_var": 0.0015337467193603516,
"learning_rate": 0.01,
"loss": 1.2948,
"loss/crossentropy": 2.322708249092102,
"loss/fcd": 1.0859375,
"loss/logits": 0.23693984001874924,
"step": 991
},
{
"epoch": 0.017136095492274073,
"grad_norm": 0.30078125,
"grad_norm_var": 0.0015344619750976562,
"learning_rate": 0.01,
"loss": 1.4124,
"loss/crossentropy": 2.4255528450012207,
"loss/fcd": 1.171875,
"loss/logits": 0.25587616115808487,
"step": 992
},
{
"epoch": 0.017153369782084833,
"grad_norm": 0.275390625,
"grad_norm_var": 0.0007997989654541015,
"learning_rate": 0.01,
"loss": 1.3437,
"loss/crossentropy": 2.5350613594055176,
"loss/fcd": 1.12109375,
"loss/logits": 0.25402751564979553,
"step": 993
},
{
"epoch": 0.017170644071895596,
"grad_norm": 0.30859375,
"grad_norm_var": 0.0007494449615478516,
"learning_rate": 0.01,
"loss": 1.4055,
"loss/crossentropy": 2.5626988410949707,
"loss/fcd": 1.14453125,
"loss/logits": 0.25801587104797363,
"step": 994
},
{
"epoch": 0.017187918361706356,
"grad_norm": 0.3046875,
"grad_norm_var": 0.000670480728149414,
"learning_rate": 0.01,
"loss": 1.3867,
"loss/crossentropy": 2.7328250408172607,
"loss/fcd": 1.171875,
"loss/logits": 0.28935085237026215,
"step": 995
},
{
"epoch": 0.017205192651517116,
"grad_norm": 0.287109375,
"grad_norm_var": 0.0006787459055582683,
"learning_rate": 0.01,
"loss": 1.3854,
"loss/crossentropy": 2.2958213090896606,
"loss/fcd": 1.1328125,
"loss/logits": 0.2697945237159729,
"step": 996
},
{
"epoch": 0.017222466941327876,
"grad_norm": 0.28515625,
"grad_norm_var": 0.0006787459055582683,
"learning_rate": 0.01,
"loss": 1.3576,
"loss/crossentropy": 2.314937472343445,
"loss/fcd": 1.09375,
"loss/logits": 0.24704495817422867,
"step": 997
},
{
"epoch": 0.017239741231138635,
"grad_norm": 0.32421875,
"grad_norm_var": 0.0006591637929280598,
"learning_rate": 0.01,
"loss": 1.4405,
"loss/crossentropy": 2.582629084587097,
"loss/fcd": 1.26171875,
"loss/logits": 0.335773229598999,
"step": 998
},
{
"epoch": 0.017257015520949395,
"grad_norm": 0.28515625,
"grad_norm_var": 0.0006277561187744141,
"learning_rate": 0.01,
"loss": 1.3605,
"loss/crossentropy": 2.299025297164917,
"loss/fcd": 1.052734375,
"loss/logits": 0.23469385504722595,
"step": 999
},
{
"epoch": 0.017274289810760155,
"grad_norm": 0.26953125,
"grad_norm_var": 0.00038700103759765626,
"learning_rate": 0.01,
"loss": 1.3825,
"loss/crossentropy": 2.467602014541626,
"loss/fcd": 1.15234375,
"loss/logits": 0.2697184160351753,
"step": 1000
}
],
"logging_steps": 1,
"max_steps": 300000,
"num_input_tokens_seen": 0,
"num_train_epochs": 6,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": true,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 9.70040442617856e+17,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}