{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.5, "eval_steps": 2000, "global_step": 8000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.000125, "grad_norm": 384.0, "learning_rate": 1.18e-05, "loss": 99.3112, "loss/crossentropy": 9.301286220550537, "loss/hidden": 16.5625, "loss/jsd": 0.0, "loss/logits": 7.171189308166504, "step": 2 }, { "epoch": 0.00025, "grad_norm": 388.0, "learning_rate": 1.3600000000000002e-05, "loss": 98.5376, "loss/crossentropy": 9.283345699310303, "loss/hidden": 16.5625, "loss/jsd": 0.0, "loss/logits": 7.322719573974609, "step": 4 }, { "epoch": 0.000375, "grad_norm": 380.0, "learning_rate": 1.54e-05, "loss": 98.4698, "loss/crossentropy": 9.26666784286499, "loss/hidden": 16.625, "loss/jsd": 0.0, "loss/logits": 7.094146490097046, "step": 6 }, { "epoch": 0.0005, "grad_norm": 187.0, "learning_rate": 1.72e-05, "loss": 95.8811, "loss/crossentropy": 9.060422420501709, "loss/hidden": 16.5, "loss/jsd": 0.0, "loss/logits": 6.9519524574279785, "step": 8 }, { "epoch": 0.000625, "grad_norm": 158.0, "learning_rate": 1.9e-05, "loss": 91.1537, "loss/crossentropy": 8.855913162231445, "loss/hidden": 16.375, "loss/jsd": 0.0, "loss/logits": 6.698125123977661, "step": 10 }, { "epoch": 0.00075, "grad_norm": 135.0, "learning_rate": 2.0800000000000004e-05, "loss": 89.0469, "loss/crossentropy": 8.479426860809326, "loss/hidden": 16.3125, "loss/jsd": 0.0, "loss/logits": 6.1546266078948975, "step": 12 }, { "epoch": 0.000875, "grad_norm": 119.0, "learning_rate": 2.2600000000000004e-05, "loss": 87.3701, "loss/crossentropy": 8.417439937591553, "loss/hidden": 16.25, "loss/jsd": 0.0, "loss/logits": 6.330978155136108, "step": 14 }, { "epoch": 0.001, "grad_norm": 98.0, "grad_norm_var": 15809.7625, "learning_rate": 2.4400000000000004e-05, "loss": 81.7839, "loss/crossentropy": 7.888103723526001, "loss/hidden": 15.8125, "loss/jsd": 0.0, "loss/logits": 5.809406042098999, "step": 16 }, { "epoch": 0.001125, "grad_norm": 278.0, "grad_norm_var": 12072.916666666666, "learning_rate": 2.6200000000000003e-05, "loss": 83.0321, "loss/crossentropy": 7.949460506439209, "loss/hidden": 15.34375, "loss/jsd": 0.0, "loss/logits": 6.00595760345459, "step": 18 }, { "epoch": 0.00125, "grad_norm": 67.5, "grad_norm_var": 8976.948958333332, "learning_rate": 2.8000000000000003e-05, "loss": 79.5947, "loss/crossentropy": 7.64544939994812, "loss/hidden": 15.25, "loss/jsd": 0.0, "loss/logits": 5.5388875007629395, "step": 20 }, { "epoch": 0.001375, "grad_norm": 38.5, "grad_norm_var": 4950.315625, "learning_rate": 2.9800000000000006e-05, "loss": 74.6424, "loss/crossentropy": 7.209100246429443, "loss/hidden": 15.15625, "loss/jsd": 0.0, "loss/logits": 5.076019763946533, "step": 22 }, { "epoch": 0.0015, "grad_norm": 54.5, "grad_norm_var": 4140.295833333334, "learning_rate": 3.16e-05, "loss": 71.7249, "loss/crossentropy": 7.1052405834198, "loss/hidden": 15.0, "loss/jsd": 0.0, "loss/logits": 5.032779216766357, "step": 24 }, { "epoch": 0.001625, "grad_norm": 90.5, "grad_norm_var": 3923.795833333333, "learning_rate": 3.3400000000000005e-05, "loss": 69.0909, "loss/crossentropy": 6.593599557876587, "loss/hidden": 14.9375, "loss/jsd": 0.0, "loss/logits": 4.861028671264648, "step": 26 }, { "epoch": 0.00175, "grad_norm": 49.0, "grad_norm_var": 4052.4239583333333, "learning_rate": 3.520000000000001e-05, "loss": 64.7694, "loss/crossentropy": 6.363184213638306, "loss/hidden": 14.59375, "loss/jsd": 0.0, "loss/logits": 4.430697441101074, "step": 28 }, { "epoch": 0.001875, "grad_norm": 47.0, "grad_norm_var": 4244.118489583333, "learning_rate": 3.7e-05, "loss": 59.3223, "loss/crossentropy": 5.989596843719482, "loss/hidden": 13.84375, "loss/jsd": 0.0, "loss/logits": 4.165619850158691, "step": 30 }, { "epoch": 0.002, "grad_norm": 71.0, "grad_norm_var": 4305.730989583333, "learning_rate": 3.88e-05, "loss": 55.1302, "loss/crossentropy": 5.726909637451172, "loss/hidden": 13.53125, "loss/jsd": 0.0, "loss/logits": 3.7759565114974976, "step": 32 }, { "epoch": 0.002125, "grad_norm": 60.0, "grad_norm_var": 934.0955729166667, "learning_rate": 4.0600000000000004e-05, "loss": 50.1945, "loss/crossentropy": 5.208499431610107, "loss/hidden": 13.125, "loss/jsd": 0.0, "loss/logits": 3.081121802330017, "step": 34 }, { "epoch": 0.00225, "grad_norm": 46.75, "grad_norm_var": 266.72083333333336, "learning_rate": 4.240000000000001e-05, "loss": 46.3994, "loss/crossentropy": 4.913021564483643, "loss/hidden": 12.375, "loss/jsd": 0.0, "loss/logits": 2.866178512573242, "step": 36 }, { "epoch": 0.002375, "grad_norm": 51.75, "grad_norm_var": 237.22395833333334, "learning_rate": 4.420000000000001e-05, "loss": 42.1907, "loss/crossentropy": 4.504716157913208, "loss/hidden": 12.0625, "loss/jsd": 0.0, "loss/logits": 2.6037776470184326, "step": 38 }, { "epoch": 0.0025, "grad_norm": 50.0, "grad_norm_var": 236.25390625, "learning_rate": 4.600000000000001e-05, "loss": 39.115, "loss/crossentropy": 4.373331546783447, "loss/hidden": 11.375, "loss/jsd": 0.0, "loss/logits": 2.2266069650650024, "step": 40 }, { "epoch": 0.002625, "grad_norm": 33.0, "grad_norm_var": 164.56640625, "learning_rate": 4.78e-05, "loss": 36.1801, "loss/crossentropy": 4.276909589767456, "loss/hidden": 11.0625, "loss/jsd": 0.0, "loss/logits": 2.2537089586257935, "step": 42 }, { "epoch": 0.00275, "grad_norm": 41.0, "grad_norm_var": 170.54765625, "learning_rate": 4.96e-05, "loss": 33.7672, "loss/crossentropy": 3.979385256767273, "loss/hidden": 10.59375, "loss/jsd": 0.0, "loss/logits": 1.776978850364685, "step": 44 }, { "epoch": 0.002875, "grad_norm": 31.5, "grad_norm_var": 205.69140625, "learning_rate": 5.14e-05, "loss": 31.4663, "loss/crossentropy": 3.5722849369049072, "loss/hidden": 10.15625, "loss/jsd": 0.0, "loss/logits": 1.7410615682601929, "step": 46 }, { "epoch": 0.003, "grad_norm": 21.375, "grad_norm_var": 211.38020833333334, "learning_rate": 5.3200000000000006e-05, "loss": 29.7082, "loss/crossentropy": 3.679291844367981, "loss/hidden": 9.625, "loss/jsd": 0.0, "loss/logits": 1.594287633895874, "step": 48 }, { "epoch": 0.003125, "grad_norm": 23.125, "grad_norm_var": 105.7416015625, "learning_rate": 5.500000000000001e-05, "loss": 28.489, "loss/crossentropy": 3.9182190895080566, "loss/hidden": 9.40625, "loss/jsd": 0.0, "loss/logits": 1.5025497078895569, "step": 50 }, { "epoch": 0.00325, "grad_norm": 29.875, "grad_norm_var": 105.0306640625, "learning_rate": 5.680000000000001e-05, "loss": 27.5703, "loss/crossentropy": 3.526407241821289, "loss/hidden": 9.25, "loss/jsd": 0.0, "loss/logits": 1.494104266166687, "step": 52 }, { "epoch": 0.003375, "grad_norm": 19.625, "grad_norm_var": 99.2416015625, "learning_rate": 5.860000000000001e-05, "loss": 26.1189, "loss/crossentropy": 3.4616609811782837, "loss/hidden": 9.0, "loss/jsd": 0.0, "loss/logits": 1.3545405268669128, "step": 54 }, { "epoch": 0.0035, "grad_norm": 22.5, "grad_norm_var": 54.81920572916667, "learning_rate": 6.040000000000001e-05, "loss": 24.328, "loss/crossentropy": 3.308198928833008, "loss/hidden": 8.75, "loss/jsd": 0.0, "loss/logits": 1.2083913683891296, "step": 56 }, { "epoch": 0.003625, "grad_norm": 14.0625, "grad_norm_var": 59.79152018229167, "learning_rate": 6.220000000000001e-05, "loss": 24.2188, "loss/crossentropy": 3.5452929735183716, "loss/hidden": 8.4375, "loss/jsd": 0.0, "loss/logits": 1.2204867601394653, "step": 58 }, { "epoch": 0.00375, "grad_norm": 15.75, "grad_norm_var": 52.173177083333336, "learning_rate": 6.400000000000001e-05, "loss": 22.8282, "loss/crossentropy": 3.1143264770507812, "loss/hidden": 8.40625, "loss/jsd": 0.0, "loss/logits": 1.1705525517463684, "step": 60 }, { "epoch": 0.003875, "grad_norm": 20.125, "grad_norm_var": 38.25930989583333, "learning_rate": 6.58e-05, "loss": 22.306, "loss/crossentropy": 3.136604428291321, "loss/hidden": 7.96875, "loss/jsd": 0.0, "loss/logits": 1.1404522061347961, "step": 62 }, { "epoch": 0.004, "grad_norm": 16.5, "grad_norm_var": 40.18274739583333, "learning_rate": 6.76e-05, "loss": 21.058, "loss/crossentropy": 2.9673322439193726, "loss/hidden": 7.703125, "loss/jsd": 0.0, "loss/logits": 1.0015667080879211, "step": 64 }, { "epoch": 0.004125, "grad_norm": 11.6875, "grad_norm_var": 39.946614583333336, "learning_rate": 6.94e-05, "loss": 21.0828, "loss/crossentropy": 3.2232860326766968, "loss/hidden": 7.546875, "loss/jsd": 0.0, "loss/logits": 0.964312881231308, "step": 66 }, { "epoch": 0.00425, "grad_norm": 15.4375, "grad_norm_var": 32.87902018229167, "learning_rate": 7.120000000000001e-05, "loss": 20.2688, "loss/crossentropy": 3.371062755584717, "loss/hidden": 7.53125, "loss/jsd": 0.0, "loss/logits": 0.971402496099472, "step": 68 }, { "epoch": 0.004375, "grad_norm": 11.8125, "grad_norm_var": 37.155322265625, "learning_rate": 7.3e-05, "loss": 19.6652, "loss/crossentropy": 2.8037211894989014, "loss/hidden": 7.40625, "loss/jsd": 0.0, "loss/logits": 0.952717661857605, "step": 70 }, { "epoch": 0.0045, "grad_norm": 116.5, "grad_norm_var": 640.5003743489583, "learning_rate": 7.48e-05, "loss": 19.6559, "loss/crossentropy": 2.9093810319900513, "loss/hidden": 7.15625, "loss/jsd": 0.0, "loss/logits": 0.9704654216766357, "step": 72 }, { "epoch": 0.004625, "grad_norm": 9.4375, "grad_norm_var": 651.0841145833333, "learning_rate": 7.66e-05, "loss": 18.7849, "loss/crossentropy": 2.824882984161377, "loss/hidden": 7.0625, "loss/jsd": 0.0, "loss/logits": 0.8673952519893646, "step": 74 }, { "epoch": 0.00475, "grad_norm": 21.875, "grad_norm_var": 649.4197916666667, "learning_rate": 7.840000000000001e-05, "loss": 18.5261, "loss/crossentropy": 2.8125277757644653, "loss/hidden": 7.109375, "loss/jsd": 0.0, "loss/logits": 0.8680737912654877, "step": 76 }, { "epoch": 0.004875, "grad_norm": 12.1875, "grad_norm_var": 658.5675618489583, "learning_rate": 8.020000000000001e-05, "loss": 18.4968, "loss/crossentropy": 2.8050509691238403, "loss/hidden": 6.828125, "loss/jsd": 0.0, "loss/logits": 0.8595540523529053, "step": 78 }, { "epoch": 0.005, "grad_norm": 11.5, "grad_norm_var": 669.7122233072917, "learning_rate": 8.200000000000001e-05, "loss": 18.0691, "loss/crossentropy": 3.2670862674713135, "loss/hidden": 6.8125, "loss/jsd": 0.0, "loss/logits": 0.8241342604160309, "step": 80 }, { "epoch": 0.005125, "grad_norm": 12.625, "grad_norm_var": 669.6054524739583, "learning_rate": 8.38e-05, "loss": 17.4693, "loss/crossentropy": 2.700217127799988, "loss/hidden": 6.765625, "loss/jsd": 0.0, "loss/logits": 0.8141748309135437, "step": 82 }, { "epoch": 0.00525, "grad_norm": 11.4375, "grad_norm_var": 679.333056640625, "learning_rate": 8.560000000000001e-05, "loss": 16.8553, "loss/crossentropy": 2.619894862174988, "loss/hidden": 6.578125, "loss/jsd": 0.0, "loss/logits": 0.755303144454956, "step": 84 }, { "epoch": 0.005375, "grad_norm": 10.8125, "grad_norm_var": 680.3473307291666, "learning_rate": 8.740000000000001e-05, "loss": 16.8983, "loss/crossentropy": 2.8718719482421875, "loss/hidden": 6.484375, "loss/jsd": 0.0, "loss/logits": 0.8335458338260651, "step": 86 }, { "epoch": 0.0055, "grad_norm": 13.8125, "grad_norm_var": 8.321858723958334, "learning_rate": 8.92e-05, "loss": 16.8672, "loss/crossentropy": 2.806625247001648, "loss/hidden": 6.546875, "loss/jsd": 0.0, "loss/logits": 0.7781052589416504, "step": 88 }, { "epoch": 0.005625, "grad_norm": 12.1875, "grad_norm_var": 7.507014973958333, "learning_rate": 9.1e-05, "loss": 16.4737, "loss/crossentropy": 3.016478180885315, "loss/hidden": 6.375, "loss/jsd": 0.0, "loss/logits": 0.7285971939563751, "step": 90 }, { "epoch": 0.00575, "grad_norm": 13.25, "grad_norm_var": 1.43046875, "learning_rate": 9.28e-05, "loss": 16.47, "loss/crossentropy": 2.5847216844558716, "loss/hidden": 6.359375, "loss/jsd": 0.0, "loss/logits": 0.6861400604248047, "step": 92 }, { "epoch": 0.005875, "grad_norm": 9.875, "grad_norm_var": 1.4640462239583334, "learning_rate": 9.46e-05, "loss": 16.3726, "loss/crossentropy": 2.6700236797332764, "loss/hidden": 6.328125, "loss/jsd": 0.0, "loss/logits": 0.7058612108230591, "step": 94 }, { "epoch": 0.006, "grad_norm": 11.6875, "grad_norm_var": 1.329541015625, "learning_rate": 9.64e-05, "loss": 16.0121, "loss/crossentropy": 2.8999104499816895, "loss/hidden": 6.203125, "loss/jsd": 0.0, "loss/logits": 0.7061053812503815, "step": 96 }, { "epoch": 0.006125, "grad_norm": 12.25, "grad_norm_var": 1.3148274739583334, "learning_rate": 9.82e-05, "loss": 15.9048, "loss/crossentropy": 2.9132989645004272, "loss/hidden": 6.234375, "loss/jsd": 0.0, "loss/logits": 0.7202947437763214, "step": 98 }, { "epoch": 0.00625, "grad_norm": 10.1875, "grad_norm_var": 1.2620930989583334, "learning_rate": 0.0001, "loss": 15.4134, "loss/crossentropy": 2.6530884504318237, "loss/hidden": 5.90625, "loss/jsd": 0.0, "loss/logits": 0.6581160724163055, "step": 100 }, { "epoch": 0.006375, "grad_norm": 12.8125, "grad_norm_var": 2.23736572265625, "learning_rate": 0.0001, "loss": 15.5444, "loss/crossentropy": 2.285265564918518, "loss/hidden": 6.15625, "loss/jsd": 0.0, "loss/logits": 0.6365102231502533, "step": 102 }, { "epoch": 0.0065, "grad_norm": 12.25, "grad_norm_var": 1.9623006184895833, "learning_rate": 0.0001, "loss": 15.3962, "loss/crossentropy": 2.9150387048721313, "loss/hidden": 5.890625, "loss/jsd": 0.0, "loss/logits": 0.6743068099021912, "step": 104 }, { "epoch": 0.006625, "grad_norm": 11.375, "grad_norm_var": 1.9106730143229167, "learning_rate": 0.0001, "loss": 15.0494, "loss/crossentropy": 2.461984634399414, "loss/hidden": 5.875, "loss/jsd": 0.0, "loss/logits": 0.5759885013103485, "step": 106 }, { "epoch": 0.00675, "grad_norm": 9.4375, "grad_norm_var": 1.83931884765625, "learning_rate": 0.0001, "loss": 15.2, "loss/crossentropy": 2.545448660850525, "loss/hidden": 5.828125, "loss/jsd": 0.0, "loss/logits": 0.6016611158847809, "step": 108 }, { "epoch": 0.006875, "grad_norm": 9.5, "grad_norm_var": 2.117997233072917, "learning_rate": 0.0001, "loss": 14.7974, "loss/crossentropy": 2.70013689994812, "loss/hidden": 5.8125, "loss/jsd": 0.0, "loss/logits": 0.6241994798183441, "step": 110 }, { "epoch": 0.007, "grad_norm": 11.6875, "grad_norm_var": 2.129715983072917, "learning_rate": 0.0001, "loss": 14.9825, "loss/crossentropy": 2.7020071744918823, "loss/hidden": 5.765625, "loss/jsd": 0.0, "loss/logits": 0.6234863996505737, "step": 112 }, { "epoch": 0.007125, "grad_norm": 10.0, "grad_norm_var": 2.9886067708333335, "learning_rate": 0.0001, "loss": 14.717, "loss/crossentropy": 2.513030529022217, "loss/hidden": 5.765625, "loss/jsd": 0.0, "loss/logits": 0.6006259322166443, "step": 114 }, { "epoch": 0.00725, "grad_norm": 9.8125, "grad_norm_var": 2.908317057291667, "learning_rate": 0.0001, "loss": 14.5928, "loss/crossentropy": 2.696964979171753, "loss/hidden": 5.640625, "loss/jsd": 0.0, "loss/logits": 0.6187423169612885, "step": 116 }, { "epoch": 0.007375, "grad_norm": 8.0, "grad_norm_var": 2.4882771809895834, "learning_rate": 0.0001, "loss": 14.4255, "loss/crossentropy": 2.6013330221176147, "loss/hidden": 5.578125, "loss/jsd": 0.0, "loss/logits": 0.6197507381439209, "step": 118 }, { "epoch": 0.0075, "grad_norm": 9.5625, "grad_norm_var": 2.2302042643229165, "learning_rate": 0.0001, "loss": 14.3271, "loss/crossentropy": 2.411963939666748, "loss/hidden": 5.609375, "loss/jsd": 0.0, "loss/logits": 0.625188797712326, "step": 120 }, { "epoch": 0.007625, "grad_norm": 7.84375, "grad_norm_var": 2.220686848958333, "learning_rate": 0.0001, "loss": 14.2801, "loss/crossentropy": 2.6053736209869385, "loss/hidden": 5.6875, "loss/jsd": 0.0, "loss/logits": 0.6165933012962341, "step": 122 }, { "epoch": 0.00775, "grad_norm": 8.9375, "grad_norm_var": 1.343603515625, "learning_rate": 0.0001, "loss": 14.212, "loss/crossentropy": 2.693827986717224, "loss/hidden": 5.546875, "loss/jsd": 0.0, "loss/logits": 0.593802809715271, "step": 124 }, { "epoch": 0.007875, "grad_norm": 8.9375, "grad_norm_var": 1.392822265625, "learning_rate": 0.0001, "loss": 14.083, "loss/crossentropy": 2.649814248085022, "loss/hidden": 5.484375, "loss/jsd": 0.0, "loss/logits": 0.5663131475448608, "step": 126 }, { "epoch": 0.008, "grad_norm": 7.90625, "grad_norm_var": 0.8796183268229166, "learning_rate": 0.0001, "loss": 13.7585, "loss/crossentropy": 2.813218355178833, "loss/hidden": 5.640625, "loss/jsd": 0.0, "loss/logits": 0.619841456413269, "step": 128 }, { "epoch": 0.008125, "grad_norm": 9.4375, "grad_norm_var": 0.5834635416666667, "learning_rate": 0.0001, "loss": 13.7834, "loss/crossentropy": 2.496381640434265, "loss/hidden": 5.34375, "loss/jsd": 0.0, "loss/logits": 0.534807562828064, "step": 130 }, { "epoch": 0.00825, "grad_norm": 7.15625, "grad_norm_var": 0.73668212890625, "learning_rate": 0.0001, "loss": 13.8102, "loss/crossentropy": 2.587761878967285, "loss/hidden": 5.6875, "loss/jsd": 0.0, "loss/logits": 0.5979687869548798, "step": 132 }, { "epoch": 0.008375, "grad_norm": 11.0625, "grad_norm_var": 0.9780232747395833, "learning_rate": 0.0001, "loss": 13.7661, "loss/crossentropy": 3.0042346715927124, "loss/hidden": 5.4375, "loss/jsd": 0.0, "loss/logits": 0.568140983581543, "step": 134 }, { "epoch": 0.0085, "grad_norm": 9.5625, "grad_norm_var": 0.8787394205729167, "learning_rate": 0.0001, "loss": 13.7338, "loss/crossentropy": 2.4913647174835205, "loss/hidden": 5.53125, "loss/jsd": 0.0, "loss/logits": 0.5167834609746933, "step": 136 }, { "epoch": 0.008625, "grad_norm": 7.4375, "grad_norm_var": 1.00718994140625, "learning_rate": 0.0001, "loss": 13.6016, "loss/crossentropy": 2.468238115310669, "loss/hidden": 5.28125, "loss/jsd": 0.0, "loss/logits": 0.5614461004734039, "step": 138 }, { "epoch": 0.00875, "grad_norm": 8.8125, "grad_norm_var": 1.0800740559895834, "learning_rate": 0.0001, "loss": 13.5316, "loss/crossentropy": 2.4444445371627808, "loss/hidden": 5.3125, "loss/jsd": 0.0, "loss/logits": 0.5187713205814362, "step": 140 }, { "epoch": 0.008875, "grad_norm": 6.96875, "grad_norm_var": 1.2684895833333334, "learning_rate": 0.0001, "loss": 13.0968, "loss/crossentropy": 2.6214382648468018, "loss/hidden": 5.265625, "loss/jsd": 0.0, "loss/logits": 0.5483916699886322, "step": 142 }, { "epoch": 0.009, "grad_norm": 8.375, "grad_norm_var": 1.2163045247395834, "learning_rate": 0.0001, "loss": 13.202, "loss/crossentropy": 2.7945733070373535, "loss/hidden": 5.265625, "loss/jsd": 0.0, "loss/logits": 0.5334698259830475, "step": 144 }, { "epoch": 0.009125, "grad_norm": 8.375, "grad_norm_var": 1.11070556640625, "learning_rate": 0.0001, "loss": 13.4962, "loss/crossentropy": 2.6263811588287354, "loss/hidden": 5.3125, "loss/jsd": 0.0, "loss/logits": 0.5304541736841202, "step": 146 }, { "epoch": 0.00925, "grad_norm": 6.46875, "grad_norm_var": 1.1537394205729166, "learning_rate": 0.0001, "loss": 13.0194, "loss/crossentropy": 2.446092367172241, "loss/hidden": 5.109375, "loss/jsd": 0.0, "loss/logits": 0.4983871430158615, "step": 148 }, { "epoch": 0.009375, "grad_norm": 8.875, "grad_norm_var": 0.7020182291666667, "learning_rate": 0.0001, "loss": 13.4196, "loss/crossentropy": 2.954146981239319, "loss/hidden": 5.265625, "loss/jsd": 0.0, "loss/logits": 0.5594009757041931, "step": 150 }, { "epoch": 0.0095, "grad_norm": 7.9375, "grad_norm_var": 0.563916015625, "learning_rate": 0.0001, "loss": 13.1519, "loss/crossentropy": 2.7116650342941284, "loss/hidden": 5.25, "loss/jsd": 0.0, "loss/logits": 0.5489330589771271, "step": 152 }, { "epoch": 0.009625, "grad_norm": 6.875, "grad_norm_var": 0.66314697265625, "learning_rate": 0.0001, "loss": 12.9977, "loss/crossentropy": 2.6282447576522827, "loss/hidden": 5.1875, "loss/jsd": 0.0, "loss/logits": 0.4889778196811676, "step": 154 }, { "epoch": 0.00975, "grad_norm": 6.5625, "grad_norm_var": 0.7279256184895834, "learning_rate": 0.0001, "loss": 12.9168, "loss/crossentropy": 2.5541906356811523, "loss/hidden": 5.140625, "loss/jsd": 0.0, "loss/logits": 0.5468989908695221, "step": 156 }, { "epoch": 0.009875, "grad_norm": 8.25, "grad_norm_var": 0.7774739583333333, "learning_rate": 0.0001, "loss": 12.8049, "loss/crossentropy": 2.3998714685440063, "loss/hidden": 5.171875, "loss/jsd": 0.0, "loss/logits": 0.4967530369758606, "step": 158 }, { "epoch": 0.01, "grad_norm": 8.125, "grad_norm_var": 0.6618448893229166, "learning_rate": 0.0001, "loss": 12.8303, "loss/crossentropy": 2.5461435317993164, "loss/hidden": 5.09375, "loss/jsd": 0.0, "loss/logits": 0.5074218511581421, "step": 160 }, { "epoch": 0.010125, "grad_norm": 7.8125, "grad_norm_var": 0.6085245768229167, "learning_rate": 0.0001, "loss": 12.7334, "loss/crossentropy": 2.2296184301376343, "loss/hidden": 5.03125, "loss/jsd": 0.0, "loss/logits": 0.4967309385538101, "step": 162 }, { "epoch": 0.01025, "grad_norm": 7.28125, "grad_norm_var": 0.48121337890625, "learning_rate": 0.0001, "loss": 12.8275, "loss/crossentropy": 2.348438858985901, "loss/hidden": 5.09375, "loss/jsd": 0.0, "loss/logits": 0.47058284282684326, "step": 164 }, { "epoch": 0.010375, "grad_norm": 7.0, "grad_norm_var": 0.39407552083333336, "learning_rate": 0.0001, "loss": 12.6261, "loss/crossentropy": 2.4020251035690308, "loss/hidden": 4.96875, "loss/jsd": 0.0, "loss/logits": 0.4758017808198929, "step": 166 }, { "epoch": 0.0105, "grad_norm": 5.65625, "grad_norm_var": 0.6719685872395833, "learning_rate": 0.0001, "loss": 12.7439, "loss/crossentropy": 2.4506293535232544, "loss/hidden": 4.9375, "loss/jsd": 0.0, "loss/logits": 0.5312856733798981, "step": 168 }, { "epoch": 0.010625, "grad_norm": 7.40625, "grad_norm_var": 0.6060506184895833, "learning_rate": 0.0001, "loss": 12.582, "loss/crossentropy": 2.5331802368164062, "loss/hidden": 4.9375, "loss/jsd": 0.0, "loss/logits": 0.5095243901014328, "step": 170 }, { "epoch": 0.01075, "grad_norm": 8.625, "grad_norm_var": 0.6595662434895834, "learning_rate": 0.0001, "loss": 12.4879, "loss/crossentropy": 2.65364670753479, "loss/hidden": 5.140625, "loss/jsd": 0.0, "loss/logits": 0.543939620256424, "step": 172 }, { "epoch": 0.010875, "grad_norm": 6.21875, "grad_norm_var": 0.6911417643229166, "learning_rate": 0.0001, "loss": 12.2931, "loss/crossentropy": 2.2634752988815308, "loss/hidden": 4.890625, "loss/jsd": 0.0, "loss/logits": 0.45544371008872986, "step": 174 }, { "epoch": 0.011, "grad_norm": 7.21875, "grad_norm_var": 0.6577473958333333, "learning_rate": 0.0001, "loss": 12.3996, "loss/crossentropy": 2.3653087615966797, "loss/hidden": 4.921875, "loss/jsd": 0.0, "loss/logits": 0.4665989428758621, "step": 176 }, { "epoch": 0.011125, "grad_norm": 7.8125, "grad_norm_var": 0.6873006184895833, "learning_rate": 0.0001, "loss": 12.335, "loss/crossentropy": 2.274166226387024, "loss/hidden": 4.9375, "loss/jsd": 0.0, "loss/logits": 0.48722338676452637, "step": 178 }, { "epoch": 0.01125, "grad_norm": 8.875, "grad_norm_var": 0.8094685872395834, "learning_rate": 0.0001, "loss": 12.4249, "loss/crossentropy": 2.464481830596924, "loss/hidden": 4.953125, "loss/jsd": 0.0, "loss/logits": 0.5341024994850159, "step": 180 }, { "epoch": 0.011375, "grad_norm": 6.84375, "grad_norm_var": 0.8151692708333333, "learning_rate": 0.0001, "loss": 12.12, "loss/crossentropy": 2.5521204471588135, "loss/hidden": 5.09375, "loss/jsd": 0.0, "loss/logits": 0.5266247987747192, "step": 182 }, { "epoch": 0.0115, "grad_norm": 6.375, "grad_norm_var": 0.6587076822916667, "learning_rate": 0.0001, "loss": 12.0581, "loss/crossentropy": 2.380069375038147, "loss/hidden": 4.921875, "loss/jsd": 0.0, "loss/logits": 0.4448339492082596, "step": 184 }, { "epoch": 0.011625, "grad_norm": 7.46875, "grad_norm_var": 0.6198527018229166, "learning_rate": 0.0001, "loss": 12.2557, "loss/crossentropy": 2.6351869106292725, "loss/hidden": 4.921875, "loss/jsd": 0.0, "loss/logits": 0.51109179854393, "step": 186 }, { "epoch": 0.01175, "grad_norm": 6.625, "grad_norm_var": 0.461572265625, "learning_rate": 0.0001, "loss": 12.3102, "loss/crossentropy": 2.606539011001587, "loss/hidden": 4.90625, "loss/jsd": 0.0, "loss/logits": 0.4482808858156204, "step": 188 }, { "epoch": 0.011875, "grad_norm": 7.3125, "grad_norm_var": 0.40491129557291666, "learning_rate": 0.0001, "loss": 11.8983, "loss/crossentropy": 2.5177031755447388, "loss/hidden": 4.8125, "loss/jsd": 0.0, "loss/logits": 0.4657522886991501, "step": 190 }, { "epoch": 0.012, "grad_norm": 5.40625, "grad_norm_var": 0.5950358072916667, "learning_rate": 0.0001, "loss": 11.8962, "loss/crossentropy": 2.5478276014328003, "loss/hidden": 4.90625, "loss/jsd": 0.0, "loss/logits": 0.4511236548423767, "step": 192 }, { "epoch": 0.012125, "grad_norm": 7.25, "grad_norm_var": 0.5598592122395833, "learning_rate": 0.0001, "loss": 11.9408, "loss/crossentropy": 2.08566415309906, "loss/hidden": 4.90625, "loss/jsd": 0.0, "loss/logits": 0.4315005987882614, "step": 194 }, { "epoch": 0.01225, "grad_norm": 7.03125, "grad_norm_var": 0.40950113932291665, "learning_rate": 0.0001, "loss": 12.0969, "loss/crossentropy": 2.634473204612732, "loss/hidden": 4.84375, "loss/jsd": 0.0, "loss/logits": 0.4682666063308716, "step": 196 }, { "epoch": 0.012375, "grad_norm": 7.25, "grad_norm_var": 0.4279296875, "learning_rate": 0.0001, "loss": 12.0544, "loss/crossentropy": 2.6797198057174683, "loss/hidden": 4.84375, "loss/jsd": 0.0, "loss/logits": 0.4552183598279953, "step": 198 }, { "epoch": 0.0125, "grad_norm": 6.75, "grad_norm_var": 0.4488118489583333, "learning_rate": 0.0001, "loss": 11.9949, "loss/crossentropy": 2.9568880796432495, "loss/hidden": 4.921875, "loss/jsd": 0.0, "loss/logits": 0.5152240097522736, "step": 200 }, { "epoch": 0.012625, "grad_norm": 6.1875, "grad_norm_var": 0.4198527018229167, "learning_rate": 0.0001, "loss": 11.876, "loss/crossentropy": 2.4664944410324097, "loss/hidden": 4.734375, "loss/jsd": 0.0, "loss/logits": 0.4684390127658844, "step": 202 }, { "epoch": 0.01275, "grad_norm": 6.5625, "grad_norm_var": 0.41952718098958336, "learning_rate": 0.0001, "loss": 11.9644, "loss/crossentropy": 2.580668091773987, "loss/hidden": 4.78125, "loss/jsd": 0.0, "loss/logits": 0.4738956689834595, "step": 204 }, { "epoch": 0.012875, "grad_norm": 6.3125, "grad_norm_var": 0.39635009765625, "learning_rate": 0.0001, "loss": 12.0377, "loss/crossentropy": 2.367862343788147, "loss/hidden": 4.671875, "loss/jsd": 0.0, "loss/logits": 0.45996397733688354, "step": 206 }, { "epoch": 0.013, "grad_norm": 6.09375, "grad_norm_var": 0.27681884765625, "learning_rate": 0.0001, "loss": 11.9037, "loss/crossentropy": 2.5246529579162598, "loss/hidden": 4.765625, "loss/jsd": 0.0, "loss/logits": 0.46563032269477844, "step": 208 }, { "epoch": 0.013125, "grad_norm": 6.46875, "grad_norm_var": 0.24596354166666667, "learning_rate": 0.0001, "loss": 11.7128, "loss/crossentropy": 2.20585036277771, "loss/hidden": 4.640625, "loss/jsd": 0.0, "loss/logits": 0.41547301411628723, "step": 210 }, { "epoch": 0.01325, "grad_norm": 5.9375, "grad_norm_var": 0.2263671875, "learning_rate": 0.0001, "loss": 11.7218, "loss/crossentropy": 2.3064881563186646, "loss/hidden": 4.65625, "loss/jsd": 0.0, "loss/logits": 0.4325401932001114, "step": 212 }, { "epoch": 0.013375, "grad_norm": 7.03125, "grad_norm_var": 0.16054280598958334, "learning_rate": 0.0001, "loss": 11.5407, "loss/crossentropy": 2.3898541927337646, "loss/hidden": 4.609375, "loss/jsd": 0.0, "loss/logits": 0.43332116305828094, "step": 214 }, { "epoch": 0.0135, "grad_norm": 6.5625, "grad_norm_var": 0.150244140625, "learning_rate": 0.0001, "loss": 11.8046, "loss/crossentropy": 2.456748604774475, "loss/hidden": 4.6875, "loss/jsd": 0.0, "loss/logits": 0.4493984282016754, "step": 216 }, { "epoch": 0.013625, "grad_norm": 6.25, "grad_norm_var": 0.18606770833333333, "learning_rate": 0.0001, "loss": 11.8232, "loss/crossentropy": 2.7504690885543823, "loss/hidden": 4.6875, "loss/jsd": 0.0, "loss/logits": 0.44959259033203125, "step": 218 }, { "epoch": 0.01375, "grad_norm": 6.65625, "grad_norm_var": 0.18313802083333333, "learning_rate": 0.0001, "loss": 11.7742, "loss/crossentropy": 2.272356390953064, "loss/hidden": 4.625, "loss/jsd": 0.0, "loss/logits": 0.4380947947502136, "step": 220 }, { "epoch": 0.013875, "grad_norm": 5.71875, "grad_norm_var": 0.20331624348958333, "learning_rate": 0.0001, "loss": 11.7707, "loss/crossentropy": 2.539223790168762, "loss/hidden": 4.640625, "loss/jsd": 0.0, "loss/logits": 0.41349247097969055, "step": 222 }, { "epoch": 0.014, "grad_norm": 6.3125, "grad_norm_var": 0.21122639973958332, "learning_rate": 0.0001, "loss": 11.6392, "loss/crossentropy": 2.6272025108337402, "loss/hidden": 4.6875, "loss/jsd": 0.0, "loss/logits": 0.4406648874282837, "step": 224 }, { "epoch": 0.014125, "grad_norm": 5.65625, "grad_norm_var": 0.23255208333333333, "learning_rate": 0.0001, "loss": 11.3455, "loss/crossentropy": 2.1202113032341003, "loss/hidden": 4.5625, "loss/jsd": 0.0, "loss/logits": 0.42472922801971436, "step": 226 }, { "epoch": 0.01425, "grad_norm": 6.5625, "grad_norm_var": 0.23212483723958333, "learning_rate": 0.0001, "loss": 11.5275, "loss/crossentropy": 2.420728087425232, "loss/hidden": 4.640625, "loss/jsd": 0.0, "loss/logits": 0.4595801681280136, "step": 228 }, { "epoch": 0.014375, "grad_norm": 5.65625, "grad_norm_var": 0.22057291666666667, "learning_rate": 0.0001, "loss": 11.849, "loss/crossentropy": 2.58668851852417, "loss/hidden": 4.65625, "loss/jsd": 0.0, "loss/logits": 0.44095560908317566, "step": 230 }, { "epoch": 0.0145, "grad_norm": 5.28125, "grad_norm_var": 0.210546875, "learning_rate": 0.0001, "loss": 11.4306, "loss/crossentropy": 2.3560508489608765, "loss/hidden": 4.578125, "loss/jsd": 0.0, "loss/logits": 0.4567548930644989, "step": 232 }, { "epoch": 0.014625, "grad_norm": 5.78125, "grad_norm_var": 0.19872639973958334, "learning_rate": 0.0001, "loss": 11.41, "loss/crossentropy": 2.362083673477173, "loss/hidden": 4.546875, "loss/jsd": 0.0, "loss/logits": 0.4453700929880142, "step": 234 }, { "epoch": 0.01475, "grad_norm": 6.625, "grad_norm_var": 0.18801676432291667, "learning_rate": 0.0001, "loss": 11.3155, "loss/crossentropy": 2.6302807331085205, "loss/hidden": 4.65625, "loss/jsd": 0.0, "loss/logits": 0.45155879855155945, "step": 236 }, { "epoch": 0.014875, "grad_norm": 6.28125, "grad_norm_var": 0.16236979166666668, "learning_rate": 0.0001, "loss": 11.5746, "loss/crossentropy": 2.504029393196106, "loss/hidden": 4.53125, "loss/jsd": 0.0, "loss/logits": 0.423601895570755, "step": 238 }, { "epoch": 0.015, "grad_norm": 5.84375, "grad_norm_var": 0.14752604166666666, "learning_rate": 0.0001, "loss": 11.3137, "loss/crossentropy": 2.496834635734558, "loss/hidden": 4.484375, "loss/jsd": 0.0, "loss/logits": 0.44761495292186737, "step": 240 }, { "epoch": 0.015125, "grad_norm": 5.75, "grad_norm_var": 0.14377848307291666, "learning_rate": 0.0001, "loss": 11.215, "loss/crossentropy": 2.5440114736557007, "loss/hidden": 4.5, "loss/jsd": 0.0, "loss/logits": 0.4467613846063614, "step": 242 }, { "epoch": 0.01525, "grad_norm": 6.8125, "grad_norm_var": 0.16174723307291666, "learning_rate": 0.0001, "loss": 11.4828, "loss/crossentropy": 2.3933448791503906, "loss/hidden": 4.484375, "loss/jsd": 0.0, "loss/logits": 0.4251607805490494, "step": 244 }, { "epoch": 0.015375, "grad_norm": 5.28125, "grad_norm_var": 0.19026285807291668, "learning_rate": 0.0001, "loss": 11.2663, "loss/crossentropy": 2.709121346473694, "loss/hidden": 4.609375, "loss/jsd": 0.0, "loss/logits": 0.4511110782623291, "step": 246 }, { "epoch": 0.0155, "grad_norm": 6.375, "grad_norm_var": 0.18007405598958334, "learning_rate": 0.0001, "loss": 11.2542, "loss/crossentropy": 2.65135395526886, "loss/hidden": 4.546875, "loss/jsd": 0.0, "loss/logits": 0.42732760310173035, "step": 248 }, { "epoch": 0.015625, "grad_norm": 5.5, "grad_norm_var": 0.25390218098958334, "learning_rate": 0.0001, "loss": 11.0863, "loss/crossentropy": 2.531478524208069, "loss/hidden": 4.484375, "loss/jsd": 0.0, "loss/logits": 0.39773157238960266, "step": 250 }, { "epoch": 0.01575, "grad_norm": 5.75, "grad_norm_var": 0.23079427083333334, "learning_rate": 0.0001, "loss": 11.3519, "loss/crossentropy": 2.1759145259857178, "loss/hidden": 4.515625, "loss/jsd": 0.0, "loss/logits": 0.40788644552230835, "step": 252 }, { "epoch": 0.015875, "grad_norm": 6.28125, "grad_norm_var": 0.246337890625, "learning_rate": 0.0001, "loss": 10.9975, "loss/crossentropy": 2.486463189125061, "loss/hidden": 4.484375, "loss/jsd": 0.0, "loss/logits": 0.4049537926912308, "step": 254 }, { "epoch": 0.016, "grad_norm": 4.90625, "grad_norm_var": 0.32945556640625, "learning_rate": 0.0001, "loss": 10.9813, "loss/crossentropy": 2.3456650972366333, "loss/hidden": 4.5, "loss/jsd": 0.0, "loss/logits": 0.3914157599210739, "step": 256 }, { "epoch": 0.016125, "grad_norm": 5.8125, "grad_norm_var": 0.35302327473958334, "learning_rate": 0.0001, "loss": 11.124, "loss/crossentropy": 2.7621407508850098, "loss/hidden": 4.453125, "loss/jsd": 0.0, "loss/logits": 0.4217756986618042, "step": 258 }, { "epoch": 0.01625, "grad_norm": 5.09375, "grad_norm_var": 0.30201416015625, "learning_rate": 0.0001, "loss": 11.0913, "loss/crossentropy": 2.520516872406006, "loss/hidden": 4.484375, "loss/jsd": 0.0, "loss/logits": 0.42682692408561707, "step": 260 }, { "epoch": 0.016375, "grad_norm": 5.875, "grad_norm_var": 0.289697265625, "learning_rate": 0.0001, "loss": 11.0406, "loss/crossentropy": 2.668264865875244, "loss/hidden": 4.421875, "loss/jsd": 0.0, "loss/logits": 0.44012486934661865, "step": 262 }, { "epoch": 0.0165, "grad_norm": 5.71875, "grad_norm_var": 0.221337890625, "learning_rate": 0.0001, "loss": 11.1313, "loss/crossentropy": 2.6228344440460205, "loss/hidden": 4.40625, "loss/jsd": 0.0, "loss/logits": 0.47360049188137054, "step": 264 }, { "epoch": 0.016625, "grad_norm": 5.1875, "grad_norm_var": 0.2591796875, "learning_rate": 0.0001, "loss": 10.8727, "loss/crossentropy": 2.07044917345047, "loss/hidden": 4.359375, "loss/jsd": 0.0, "loss/logits": 0.3736244738101959, "step": 266 }, { "epoch": 0.01675, "grad_norm": 5.59375, "grad_norm_var": 0.24763997395833334, "learning_rate": 0.0001, "loss": 10.9825, "loss/crossentropy": 2.195676624774933, "loss/hidden": 4.375, "loss/jsd": 0.0, "loss/logits": 0.41074641048908234, "step": 268 }, { "epoch": 0.016875, "grad_norm": 5.15625, "grad_norm_var": 0.16717122395833334, "learning_rate": 0.0001, "loss": 10.9858, "loss/crossentropy": 2.6045761108398438, "loss/hidden": 4.390625, "loss/jsd": 0.0, "loss/logits": 0.43537537753582, "step": 270 }, { "epoch": 0.017, "grad_norm": 5.40625, "grad_norm_var": 0.17616780598958334, "learning_rate": 0.0001, "loss": 11.1417, "loss/crossentropy": 2.343075156211853, "loss/hidden": 4.4375, "loss/jsd": 0.0, "loss/logits": 0.3932172954082489, "step": 272 }, { "epoch": 0.017125, "grad_norm": 5.96875, "grad_norm_var": 0.18162434895833332, "learning_rate": 0.0001, "loss": 10.9988, "loss/crossentropy": 2.4649263620376587, "loss/hidden": 4.359375, "loss/jsd": 0.0, "loss/logits": 0.3909170925617218, "step": 274 }, { "epoch": 0.01725, "grad_norm": 4.5, "grad_norm_var": 0.23905843098958332, "learning_rate": 0.0001, "loss": 10.9516, "loss/crossentropy": 2.3632274866104126, "loss/hidden": 4.390625, "loss/jsd": 0.0, "loss/logits": 0.39788326621055603, "step": 276 }, { "epoch": 0.017375, "grad_norm": 5.78125, "grad_norm_var": 0.24231363932291666, "learning_rate": 0.0001, "loss": 10.9188, "loss/crossentropy": 2.5861356258392334, "loss/hidden": 4.390625, "loss/jsd": 0.0, "loss/logits": 0.41938433051109314, "step": 278 }, { "epoch": 0.0175, "grad_norm": 5.21875, "grad_norm_var": 0.23834635416666666, "learning_rate": 0.0001, "loss": 10.92, "loss/crossentropy": 2.5754419565200806, "loss/hidden": 4.390625, "loss/jsd": 0.0, "loss/logits": 0.4088515043258667, "step": 280 }, { "epoch": 0.017625, "grad_norm": 5.53125, "grad_norm_var": 0.21184895833333334, "learning_rate": 0.0001, "loss": 10.7969, "loss/crossentropy": 2.241589307785034, "loss/hidden": 4.328125, "loss/jsd": 0.0, "loss/logits": 0.38494938611984253, "step": 282 }, { "epoch": 0.01775, "grad_norm": 5.40625, "grad_norm_var": 0.3136678059895833, "learning_rate": 0.0001, "loss": 10.9124, "loss/crossentropy": 2.332160472869873, "loss/hidden": 4.40625, "loss/jsd": 0.0, "loss/logits": 0.4286513030529022, "step": 284 }, { "epoch": 0.017875, "grad_norm": 5.84375, "grad_norm_var": 0.2992472330729167, "learning_rate": 0.0001, "loss": 10.8958, "loss/crossentropy": 2.452752709388733, "loss/hidden": 4.328125, "loss/jsd": 0.0, "loss/logits": 0.37612101435661316, "step": 286 }, { "epoch": 0.018, "grad_norm": 4.84375, "grad_norm_var": 0.249072265625, "learning_rate": 0.0001, "loss": 10.7195, "loss/crossentropy": 2.2290940284729004, "loss/hidden": 4.28125, "loss/jsd": 0.0, "loss/logits": 0.37554706633090973, "step": 288 }, { "epoch": 0.018125, "grad_norm": 5.875, "grad_norm_var": 0.24680989583333332, "learning_rate": 0.0001, "loss": 10.7342, "loss/crossentropy": 2.4139484167099, "loss/hidden": 4.265625, "loss/jsd": 0.0, "loss/logits": 0.40335869789123535, "step": 290 }, { "epoch": 0.01825, "grad_norm": 4.78125, "grad_norm_var": 0.21402587890625, "learning_rate": 0.0001, "loss": 10.6719, "loss/crossentropy": 2.56483793258667, "loss/hidden": 4.25, "loss/jsd": 0.0, "loss/logits": 0.41296976804733276, "step": 292 }, { "epoch": 0.018375, "grad_norm": 6.375, "grad_norm_var": 0.24823811848958333, "learning_rate": 0.0001, "loss": 10.9241, "loss/crossentropy": 2.3277297019958496, "loss/hidden": 4.296875, "loss/jsd": 0.0, "loss/logits": 0.3984246253967285, "step": 294 }, { "epoch": 0.0185, "grad_norm": 4.71875, "grad_norm_var": 0.28815104166666666, "learning_rate": 0.0001, "loss": 10.7746, "loss/crossentropy": 2.3450149297714233, "loss/hidden": 4.296875, "loss/jsd": 0.0, "loss/logits": 0.3996598720550537, "step": 296 }, { "epoch": 0.018625, "grad_norm": 5.03125, "grad_norm_var": 0.352734375, "learning_rate": 0.0001, "loss": 10.5621, "loss/crossentropy": 2.317037582397461, "loss/hidden": 4.234375, "loss/jsd": 0.0, "loss/logits": 0.40160971879959106, "step": 298 }, { "epoch": 0.01875, "grad_norm": 5.3125, "grad_norm_var": 0.23987223307291666, "learning_rate": 0.0001, "loss": 10.9369, "loss/crossentropy": 2.5068975687026978, "loss/hidden": 4.390625, "loss/jsd": 0.0, "loss/logits": 0.410100519657135, "step": 300 }, { "epoch": 0.018875, "grad_norm": 4.9375, "grad_norm_var": 0.22589518229166666, "learning_rate": 0.0001, "loss": 10.7482, "loss/crossentropy": 2.351959705352783, "loss/hidden": 4.140625, "loss/jsd": 0.0, "loss/logits": 0.39118409156799316, "step": 302 }, { "epoch": 0.019, "grad_norm": 5.59375, "grad_norm_var": 0.21608072916666668, "learning_rate": 0.0001, "loss": 10.7665, "loss/crossentropy": 2.3877638578414917, "loss/hidden": 4.25, "loss/jsd": 0.0, "loss/logits": 0.39836424589157104, "step": 304 }, { "epoch": 0.019125, "grad_norm": 5.53125, "grad_norm_var": 0.19894205729166667, "learning_rate": 0.0001, "loss": 10.7703, "loss/crossentropy": 2.600021004676819, "loss/hidden": 4.296875, "loss/jsd": 0.0, "loss/logits": 0.41590385138988495, "step": 306 }, { "epoch": 0.01925, "grad_norm": 5.40625, "grad_norm_var": 0.18485921223958332, "learning_rate": 0.0001, "loss": 10.6674, "loss/crossentropy": 2.4758663177490234, "loss/hidden": 4.28125, "loss/jsd": 0.0, "loss/logits": 0.39325758814811707, "step": 308 }, { "epoch": 0.019375, "grad_norm": 5.25, "grad_norm_var": 0.10755208333333334, "learning_rate": 0.0001, "loss": 10.5967, "loss/crossentropy": 2.3135849237442017, "loss/hidden": 4.25, "loss/jsd": 0.0, "loss/logits": 0.3709346354007721, "step": 310 }, { "epoch": 0.0195, "grad_norm": 4.34375, "grad_norm_var": 0.20885416666666667, "learning_rate": 0.0001, "loss": 10.7752, "loss/crossentropy": 2.4139580726623535, "loss/hidden": 4.234375, "loss/jsd": 0.0, "loss/logits": 0.4005644619464874, "step": 312 }, { "epoch": 0.019625, "grad_norm": 6.03125, "grad_norm_var": 0.21534830729166668, "learning_rate": 0.0001, "loss": 10.779, "loss/crossentropy": 2.5271564722061157, "loss/hidden": 4.21875, "loss/jsd": 0.0, "loss/logits": 0.37980175018310547, "step": 314 }, { "epoch": 0.01975, "grad_norm": 4.53125, "grad_norm_var": 0.25982666015625, "learning_rate": 0.0001, "loss": 10.5117, "loss/crossentropy": 2.5739694833755493, "loss/hidden": 4.28125, "loss/jsd": 0.0, "loss/logits": 0.40202172100543976, "step": 316 }, { "epoch": 0.019875, "grad_norm": 6.625, "grad_norm_var": 0.36412353515625, "learning_rate": 0.0001, "loss": 10.5532, "loss/crossentropy": 2.245994448661804, "loss/hidden": 4.140625, "loss/jsd": 0.0, "loss/logits": 0.36041176319122314, "step": 318 }, { "epoch": 0.02, "grad_norm": 4.65625, "grad_norm_var": 0.392041015625, "learning_rate": 0.0001, "loss": 10.5765, "loss/crossentropy": 2.5354862213134766, "loss/hidden": 4.28125, "loss/jsd": 0.0, "loss/logits": 0.3732207715511322, "step": 320 }, { "epoch": 0.020125, "grad_norm": 5.8125, "grad_norm_var": 0.463134765625, "learning_rate": 0.0001, "loss": 10.4762, "loss/crossentropy": 2.3753507137298584, "loss/hidden": 4.140625, "loss/jsd": 0.0, "loss/logits": 0.3840855211019516, "step": 322 }, { "epoch": 0.02025, "grad_norm": 4.90625, "grad_norm_var": 0.4641927083333333, "learning_rate": 0.0001, "loss": 10.2923, "loss/crossentropy": 2.4433764219284058, "loss/hidden": 4.125, "loss/jsd": 0.0, "loss/logits": 0.38050127029418945, "step": 324 }, { "epoch": 0.020375, "grad_norm": 5.21875, "grad_norm_var": 0.5068644205729167, "learning_rate": 0.0001, "loss": 10.6858, "loss/crossentropy": 2.484220504760742, "loss/hidden": 4.25, "loss/jsd": 0.0, "loss/logits": 0.4011761546134949, "step": 326 }, { "epoch": 0.0205, "grad_norm": 5.3125, "grad_norm_var": 0.3748697916666667, "learning_rate": 0.0001, "loss": 10.7329, "loss/crossentropy": 2.6139092445373535, "loss/hidden": 4.203125, "loss/jsd": 0.0, "loss/logits": 0.4130321443080902, "step": 328 }, { "epoch": 0.020625, "grad_norm": 5.0, "grad_norm_var": 0.31311442057291666, "learning_rate": 0.0001, "loss": 10.606, "loss/crossentropy": 2.4625054597854614, "loss/hidden": 4.234375, "loss/jsd": 0.0, "loss/logits": 0.3967062383890152, "step": 330 }, { "epoch": 0.02075, "grad_norm": 5.0625, "grad_norm_var": 0.2874837239583333, "learning_rate": 0.0001, "loss": 10.5546, "loss/crossentropy": 2.3454889059066772, "loss/hidden": 4.234375, "loss/jsd": 0.0, "loss/logits": 0.3706536889076233, "step": 332 }, { "epoch": 0.020875, "grad_norm": 5.4375, "grad_norm_var": 0.18229166666666666, "learning_rate": 0.0001, "loss": 10.5487, "loss/crossentropy": 2.4348955154418945, "loss/hidden": 4.1875, "loss/jsd": 0.0, "loss/logits": 0.37259407341480255, "step": 334 }, { "epoch": 0.021, "grad_norm": 4.25, "grad_norm_var": 0.21061197916666666, "learning_rate": 0.0001, "loss": 10.3802, "loss/crossentropy": 2.3722680807113647, "loss/hidden": 4.15625, "loss/jsd": 0.0, "loss/logits": 0.37501636147499084, "step": 336 }, { "epoch": 0.021125, "grad_norm": 5.6875, "grad_norm_var": 0.17509358723958332, "learning_rate": 0.0001, "loss": 10.6118, "loss/crossentropy": 2.367267608642578, "loss/hidden": 4.15625, "loss/jsd": 0.0, "loss/logits": 0.37614843249320984, "step": 338 }, { "epoch": 0.02125, "grad_norm": 4.96875, "grad_norm_var": 0.18136393229166667, "learning_rate": 0.0001, "loss": 10.409, "loss/crossentropy": 2.5342684984207153, "loss/hidden": 4.21875, "loss/jsd": 0.0, "loss/logits": 0.4095509201288223, "step": 340 }, { "epoch": 0.021375, "grad_norm": 4.25, "grad_norm_var": 0.200634765625, "learning_rate": 0.0001, "loss": 10.347, "loss/crossentropy": 2.366121530532837, "loss/hidden": 4.15625, "loss/jsd": 0.0, "loss/logits": 0.38898931443691254, "step": 342 }, { "epoch": 0.0215, "grad_norm": 5.96875, "grad_norm_var": 0.391650390625, "learning_rate": 0.0001, "loss": 10.4284, "loss/crossentropy": 2.435874819755554, "loss/hidden": 4.265625, "loss/jsd": 0.0, "loss/logits": 0.4087076783180237, "step": 344 }, { "epoch": 0.021625, "grad_norm": 5.46875, "grad_norm_var": 0.39010416666666664, "learning_rate": 0.0001, "loss": 10.4112, "loss/crossentropy": 2.4133975505828857, "loss/hidden": 4.078125, "loss/jsd": 0.0, "loss/logits": 0.3934263288974762, "step": 346 }, { "epoch": 0.02175, "grad_norm": 4.8125, "grad_norm_var": 0.40168863932291665, "learning_rate": 0.0001, "loss": 10.59, "loss/crossentropy": 2.3464980125427246, "loss/hidden": 4.171875, "loss/jsd": 0.0, "loss/logits": 0.36868566274642944, "step": 348 }, { "epoch": 0.021875, "grad_norm": 5.15625, "grad_norm_var": 0.3732421875, "learning_rate": 0.0001, "loss": 10.5281, "loss/crossentropy": 2.492926597595215, "loss/hidden": 4.109375, "loss/jsd": 0.0, "loss/logits": 0.37226299941539764, "step": 350 }, { "epoch": 0.022, "grad_norm": 4.25, "grad_norm_var": 0.37948811848958336, "learning_rate": 0.0001, "loss": 10.2246, "loss/crossentropy": 2.3636070489883423, "loss/hidden": 4.171875, "loss/jsd": 0.0, "loss/logits": 0.3681965172290802, "step": 352 }, { "epoch": 0.022125, "grad_norm": 4.625, "grad_norm_var": 0.3692708333333333, "learning_rate": 0.0001, "loss": 10.5053, "loss/crossentropy": 2.288292169570923, "loss/hidden": 4.25, "loss/jsd": 0.0, "loss/logits": 0.3958088457584381, "step": 354 }, { "epoch": 0.02225, "grad_norm": 5.15625, "grad_norm_var": 0.38084309895833335, "learning_rate": 0.0001, "loss": 10.3737, "loss/crossentropy": 2.451295018196106, "loss/hidden": 4.15625, "loss/jsd": 0.0, "loss/logits": 0.39023011922836304, "step": 356 }, { "epoch": 0.022375, "grad_norm": 4.59375, "grad_norm_var": 0.3459269205729167, "learning_rate": 0.0001, "loss": 10.3773, "loss/crossentropy": 2.3242127895355225, "loss/hidden": 4.0546875, "loss/jsd": 0.0, "loss/logits": 0.39542824029922485, "step": 358 }, { "epoch": 0.0225, "grad_norm": 5.03125, "grad_norm_var": 0.132666015625, "learning_rate": 0.0001, "loss": 10.531, "loss/crossentropy": 2.5108137130737305, "loss/hidden": 4.15625, "loss/jsd": 0.0, "loss/logits": 0.3721470236778259, "step": 360 }, { "epoch": 0.022625, "grad_norm": 4.8125, "grad_norm_var": 0.101416015625, "learning_rate": 0.0001, "loss": 10.4355, "loss/crossentropy": 2.282769203186035, "loss/hidden": 4.0859375, "loss/jsd": 0.0, "loss/logits": 0.353266179561615, "step": 362 }, { "epoch": 0.02275, "grad_norm": 5.40625, "grad_norm_var": 0.13072916666666667, "learning_rate": 0.0001, "loss": 10.3682, "loss/crossentropy": 2.3975025415420532, "loss/hidden": 4.21875, "loss/jsd": 0.0, "loss/logits": 0.3848300874233246, "step": 364 }, { "epoch": 0.022875, "grad_norm": 4.59375, "grad_norm_var": 0.13717447916666667, "learning_rate": 0.0001, "loss": 10.3197, "loss/crossentropy": 2.5082876682281494, "loss/hidden": 4.203125, "loss/jsd": 0.0, "loss/logits": 0.38546115159988403, "step": 366 }, { "epoch": 0.023, "grad_norm": 5.75, "grad_norm_var": 0.23873697916666667, "learning_rate": 0.0001, "loss": 10.0812, "loss/crossentropy": 2.3333388566970825, "loss/hidden": 4.046875, "loss/jsd": 0.0, "loss/logits": 0.35471296310424805, "step": 368 }, { "epoch": 0.023125, "grad_norm": 7.78125, "grad_norm_var": 0.6998697916666666, "learning_rate": 0.0001, "loss": 10.6427, "loss/crossentropy": 2.568707585334778, "loss/hidden": 4.078125, "loss/jsd": 0.0, "loss/logits": 0.3596802055835724, "step": 370 }, { "epoch": 0.02325, "grad_norm": 5.3125, "grad_norm_var": 0.73492431640625, "learning_rate": 0.0001, "loss": 10.3651, "loss/crossentropy": 2.393819808959961, "loss/hidden": 4.140625, "loss/jsd": 0.0, "loss/logits": 0.3487332612276077, "step": 372 }, { "epoch": 0.023375, "grad_norm": 5.03125, "grad_norm_var": 0.7124348958333333, "learning_rate": 0.0001, "loss": 10.2767, "loss/crossentropy": 2.472244381904602, "loss/hidden": 4.140625, "loss/jsd": 0.0, "loss/logits": 0.40489913523197174, "step": 374 }, { "epoch": 0.0235, "grad_norm": 4.78125, "grad_norm_var": 0.7247029622395833, "learning_rate": 0.0001, "loss": 10.1461, "loss/crossentropy": 2.2458752393722534, "loss/hidden": 4.0625, "loss/jsd": 0.0, "loss/logits": 0.3645085096359253, "step": 376 }, { "epoch": 0.023625, "grad_norm": 4.40625, "grad_norm_var": 0.77890625, "learning_rate": 0.0001, "loss": 10.2872, "loss/crossentropy": 2.1981548070907593, "loss/hidden": 4.25, "loss/jsd": 0.0, "loss/logits": 0.38387705385684967, "step": 378 }, { "epoch": 0.02375, "grad_norm": 4.84375, "grad_norm_var": 0.7771443684895833, "learning_rate": 0.0001, "loss": 10.1281, "loss/crossentropy": 2.4362692832946777, "loss/hidden": 4.15625, "loss/jsd": 0.0, "loss/logits": 0.3760421574115753, "step": 380 }, { "epoch": 0.023875, "grad_norm": 4.65625, "grad_norm_var": 0.7679972330729167, "learning_rate": 0.0001, "loss": 10.2042, "loss/crossentropy": 2.4601200819015503, "loss/hidden": 4.125, "loss/jsd": 0.0, "loss/logits": 0.3811968266963959, "step": 382 }, { "epoch": 0.024, "grad_norm": 4.875, "grad_norm_var": 0.684228515625, "learning_rate": 0.0001, "loss": 10.2408, "loss/crossentropy": 2.6233471632003784, "loss/hidden": 4.046875, "loss/jsd": 0.0, "loss/logits": 0.36839838325977325, "step": 384 }, { "epoch": 0.024125, "grad_norm": 4.6875, "grad_norm_var": 0.165478515625, "learning_rate": 0.0001, "loss": 10.1497, "loss/crossentropy": 2.522361993789673, "loss/hidden": 4.03125, "loss/jsd": 0.0, "loss/logits": 0.3934475779533386, "step": 386 }, { "epoch": 0.02425, "grad_norm": 4.5, "grad_norm_var": 0.14563395182291666, "learning_rate": 0.0001, "loss": 10.1857, "loss/crossentropy": 2.573809266090393, "loss/hidden": 4.09375, "loss/jsd": 0.0, "loss/logits": 0.375105544924736, "step": 388 }, { "epoch": 0.024375, "grad_norm": 4.40625, "grad_norm_var": 0.13203125, "learning_rate": 0.0001, "loss": 10.0545, "loss/crossentropy": 2.458760142326355, "loss/hidden": 4.03125, "loss/jsd": 0.0, "loss/logits": 0.348113551735878, "step": 390 }, { "epoch": 0.0245, "grad_norm": 4.8125, "grad_norm_var": 0.13248291015625, "learning_rate": 0.0001, "loss": 10.1765, "loss/crossentropy": 2.8183611631393433, "loss/hidden": 4.109375, "loss/jsd": 0.0, "loss/logits": 0.39342811703681946, "step": 392 }, { "epoch": 0.024625, "grad_norm": 4.65625, "grad_norm_var": 0.11769205729166667, "learning_rate": 0.0001, "loss": 10.0009, "loss/crossentropy": 2.2332464456558228, "loss/hidden": 3.8828125, "loss/jsd": 0.0, "loss/logits": 0.3468857705593109, "step": 394 }, { "epoch": 0.02475, "grad_norm": 4.6875, "grad_norm_var": 0.14143473307291668, "learning_rate": 0.0001, "loss": 9.9097, "loss/crossentropy": 2.3098992109298706, "loss/hidden": 4.046875, "loss/jsd": 0.0, "loss/logits": 0.3603272885084152, "step": 396 }, { "epoch": 0.024875, "grad_norm": 4.46875, "grad_norm_var": 0.15129801432291667, "learning_rate": 0.0001, "loss": 10.1234, "loss/crossentropy": 2.1571128964424133, "loss/hidden": 4.1875, "loss/jsd": 0.0, "loss/logits": 0.3672170788049698, "step": 398 }, { "epoch": 0.025, "grad_norm": 4.8125, "grad_norm_var": 0.11464436848958333, "learning_rate": 0.0001, "loss": 10.1366, "loss/crossentropy": 2.3940361738204956, "loss/hidden": 3.984375, "loss/jsd": 0.0, "loss/logits": 0.3588118702173233, "step": 400 }, { "epoch": 0.025125, "grad_norm": 4.3125, "grad_norm_var": 0.1046875, "learning_rate": 0.0001, "loss": 10.248, "loss/crossentropy": 2.4594138860702515, "loss/hidden": 4.078125, "loss/jsd": 0.0, "loss/logits": 0.3485400527715683, "step": 402 }, { "epoch": 0.02525, "grad_norm": 4.9375, "grad_norm_var": 0.10377197265625, "learning_rate": 0.0001, "loss": 10.0681, "loss/crossentropy": 2.524109125137329, "loss/hidden": 4.015625, "loss/jsd": 0.0, "loss/logits": 0.3557968735694885, "step": 404 }, { "epoch": 0.025375, "grad_norm": 5.03125, "grad_norm_var": 0.10859375, "learning_rate": 0.0001, "loss": 10.0777, "loss/crossentropy": 2.3012895584106445, "loss/hidden": 4.0078125, "loss/jsd": 0.0, "loss/logits": 0.34439629316329956, "step": 406 }, { "epoch": 0.0255, "grad_norm": 4.28125, "grad_norm_var": 0.11404622395833333, "learning_rate": 0.0001, "loss": 9.8043, "loss/crossentropy": 2.4211593866348267, "loss/hidden": 3.90625, "loss/jsd": 0.0, "loss/logits": 0.3732890635728836, "step": 408 }, { "epoch": 0.025625, "grad_norm": 4.40625, "grad_norm_var": 0.11438395182291666, "learning_rate": 0.0001, "loss": 9.9315, "loss/crossentropy": 2.582412362098694, "loss/hidden": 3.9609375, "loss/jsd": 0.0, "loss/logits": 0.36784547567367554, "step": 410 }, { "epoch": 0.02575, "grad_norm": 4.0625, "grad_norm_var": 0.07857666015625, "learning_rate": 0.0001, "loss": 9.9946, "loss/crossentropy": 2.525751233100891, "loss/hidden": 3.953125, "loss/jsd": 0.0, "loss/logits": 0.3565828502178192, "step": 412 }, { "epoch": 0.025875, "grad_norm": 4.28125, "grad_norm_var": 0.092578125, "learning_rate": 0.0001, "loss": 10.2235, "loss/crossentropy": 2.670364737510681, "loss/hidden": 4.03125, "loss/jsd": 0.0, "loss/logits": 0.4017476439476013, "step": 414 }, { "epoch": 0.026, "grad_norm": 4.4375, "grad_norm_var": 0.08396809895833333, "learning_rate": 0.0001, "loss": 9.8333, "loss/crossentropy": 2.3298041820526123, "loss/hidden": 3.84375, "loss/jsd": 0.0, "loss/logits": 0.3325035125017166, "step": 416 }, { "epoch": 0.026125, "grad_norm": 4.34375, "grad_norm_var": 0.08318684895833334, "learning_rate": 0.0001, "loss": 10.0788, "loss/crossentropy": 2.240808844566345, "loss/hidden": 3.984375, "loss/jsd": 0.0, "loss/logits": 0.33864179253578186, "step": 418 }, { "epoch": 0.02625, "grad_norm": 4.4375, "grad_norm_var": 0.06643473307291667, "learning_rate": 0.0001, "loss": 9.933, "loss/crossentropy": 2.397716999053955, "loss/hidden": 3.8984375, "loss/jsd": 0.0, "loss/logits": 0.3628341108560562, "step": 420 }, { "epoch": 0.026375, "grad_norm": 4.375, "grad_norm_var": 0.06901041666666667, "learning_rate": 0.0001, "loss": 10.0998, "loss/crossentropy": 2.4601176977157593, "loss/hidden": 3.8359375, "loss/jsd": 0.0, "loss/logits": 0.3522925227880478, "step": 422 }, { "epoch": 0.0265, "grad_norm": 5.5, "grad_norm_var": 0.15636393229166667, "learning_rate": 0.0001, "loss": 10.1351, "loss/crossentropy": 2.4023250341415405, "loss/hidden": 4.078125, "loss/jsd": 0.0, "loss/logits": 0.37351013720035553, "step": 424 }, { "epoch": 0.026625, "grad_norm": 4.71875, "grad_norm_var": 0.16990559895833332, "learning_rate": 0.0001, "loss": 10.0776, "loss/crossentropy": 2.271855592727661, "loss/hidden": 4.0625, "loss/jsd": 0.0, "loss/logits": 0.3706711381673813, "step": 426 }, { "epoch": 0.02675, "grad_norm": 3.953125, "grad_norm_var": 0.17908426920572917, "learning_rate": 0.0001, "loss": 9.7587, "loss/crossentropy": 2.0610432028770447, "loss/hidden": 4.125, "loss/jsd": 0.0, "loss/logits": 0.3243980407714844, "step": 428 }, { "epoch": 0.026875, "grad_norm": 5.21875, "grad_norm_var": 0.20093485514322917, "learning_rate": 0.0001, "loss": 9.8788, "loss/crossentropy": 2.4309468269348145, "loss/hidden": 4.0, "loss/jsd": 0.0, "loss/logits": 0.4062986671924591, "step": 430 }, { "epoch": 0.027, "grad_norm": 4.65625, "grad_norm_var": 0.20027567545572916, "learning_rate": 0.0001, "loss": 10.0082, "loss/crossentropy": 2.4598418474197388, "loss/hidden": 3.9921875, "loss/jsd": 0.0, "loss/logits": 0.4272041916847229, "step": 432 }, { "epoch": 0.027125, "grad_norm": 5.03125, "grad_norm_var": 0.21314188639322917, "learning_rate": 0.0001, "loss": 9.9003, "loss/crossentropy": 2.539934992790222, "loss/hidden": 3.953125, "loss/jsd": 0.0, "loss/logits": 0.35184885561466217, "step": 434 }, { "epoch": 0.02725, "grad_norm": 4.625, "grad_norm_var": 0.20462137858072918, "learning_rate": 0.0001, "loss": 9.9416, "loss/crossentropy": 2.269451856613159, "loss/hidden": 3.90625, "loss/jsd": 0.0, "loss/logits": 0.35347896814346313, "step": 436 }, { "epoch": 0.027375, "grad_norm": 4.34375, "grad_norm_var": 0.1935455322265625, "learning_rate": 0.0001, "loss": 9.9578, "loss/crossentropy": 2.281239867210388, "loss/hidden": 4.03125, "loss/jsd": 0.0, "loss/logits": 0.3409070521593094, "step": 438 }, { "epoch": 0.0275, "grad_norm": 4.1875, "grad_norm_var": 0.1381988525390625, "learning_rate": 0.0001, "loss": 9.8183, "loss/crossentropy": 2.2763094305992126, "loss/hidden": 3.890625, "loss/jsd": 0.0, "loss/logits": 0.3811110109090805, "step": 440 }, { "epoch": 0.027625, "grad_norm": 4.59375, "grad_norm_var": 0.1252593994140625, "learning_rate": 0.0001, "loss": 9.833, "loss/crossentropy": 2.5895315408706665, "loss/hidden": 4.0, "loss/jsd": 0.0, "loss/logits": 0.36611178517341614, "step": 442 }, { "epoch": 0.02775, "grad_norm": 4.21875, "grad_norm_var": 0.10857747395833334, "learning_rate": 0.0001, "loss": 9.7918, "loss/crossentropy": 2.1158281564712524, "loss/hidden": 4.015625, "loss/jsd": 0.0, "loss/logits": 0.3374405652284622, "step": 444 }, { "epoch": 0.027875, "grad_norm": 5.96875, "grad_norm_var": 0.20597330729166666, "learning_rate": 0.0001, "loss": 9.9942, "loss/crossentropy": 2.446805953979492, "loss/hidden": 4.03125, "loss/jsd": 0.0, "loss/logits": 0.37695541977882385, "step": 446 }, { "epoch": 0.028, "grad_norm": 5.3125, "grad_norm_var": 0.24869791666666666, "learning_rate": 0.0001, "loss": 10.0719, "loss/crossentropy": 2.5359551906585693, "loss/hidden": 4.09375, "loss/jsd": 0.0, "loss/logits": 0.3757418543100357, "step": 448 }, { "epoch": 0.028125, "grad_norm": 4.125, "grad_norm_var": 0.24947916666666667, "learning_rate": 0.0001, "loss": 9.8412, "loss/crossentropy": 2.656207323074341, "loss/hidden": 4.09375, "loss/jsd": 0.0, "loss/logits": 0.42010098695755005, "step": 450 }, { "epoch": 0.02825, "grad_norm": 5.125, "grad_norm_var": 0.26568603515625, "learning_rate": 0.0001, "loss": 9.8241, "loss/crossentropy": 2.4152743816375732, "loss/hidden": 3.953125, "loss/jsd": 0.0, "loss/logits": 0.3626774847507477, "step": 452 }, { "epoch": 0.028375, "grad_norm": 4.3125, "grad_norm_var": 0.26858317057291664, "learning_rate": 0.0001, "loss": 10.136, "loss/crossentropy": 2.4247626066207886, "loss/hidden": 4.09375, "loss/jsd": 0.0, "loss/logits": 0.36247318983078003, "step": 454 }, { "epoch": 0.0285, "grad_norm": 4.9375, "grad_norm_var": 0.2647939046223958, "learning_rate": 0.0001, "loss": 9.8054, "loss/crossentropy": 2.523893713951111, "loss/hidden": 3.9453125, "loss/jsd": 0.0, "loss/logits": 0.3691753149032593, "step": 456 }, { "epoch": 0.028625, "grad_norm": 5.875, "grad_norm_var": 0.6781646728515625, "learning_rate": 0.0001, "loss": 10.0115, "loss/crossentropy": 2.2744319438934326, "loss/hidden": 3.9140625, "loss/jsd": 0.0, "loss/logits": 0.3499785512685776, "step": 458 }, { "epoch": 0.02875, "grad_norm": 4.96875, "grad_norm_var": 0.7750885009765625, "learning_rate": 0.0001, "loss": 9.8553, "loss/crossentropy": 2.383001685142517, "loss/hidden": 3.90625, "loss/jsd": 0.0, "loss/logits": 0.34379810094833374, "step": 460 }, { "epoch": 0.028875, "grad_norm": 4.28125, "grad_norm_var": 0.7739084879557292, "learning_rate": 0.0001, "loss": 9.9509, "loss/crossentropy": 2.269408345222473, "loss/hidden": 4.015625, "loss/jsd": 0.0, "loss/logits": 0.39304040372371674, "step": 462 }, { "epoch": 0.029, "grad_norm": 4.625, "grad_norm_var": 0.833544921875, "learning_rate": 0.0001, "loss": 9.8895, "loss/crossentropy": 2.305214285850525, "loss/hidden": 3.96875, "loss/jsd": 0.0, "loss/logits": 0.32623225450515747, "step": 464 }, { "epoch": 0.029125, "grad_norm": 4.21875, "grad_norm_var": 0.8327799479166667, "learning_rate": 0.0001, "loss": 9.9362, "loss/crossentropy": 2.5358622074127197, "loss/hidden": 3.8671875, "loss/jsd": 0.0, "loss/logits": 0.35859355330467224, "step": 466 }, { "epoch": 0.02925, "grad_norm": 4.3125, "grad_norm_var": 0.8972981770833334, "learning_rate": 0.0001, "loss": 9.9957, "loss/crossentropy": 2.490285634994507, "loss/hidden": 4.0234375, "loss/jsd": 0.0, "loss/logits": 0.35838285088539124, "step": 468 }, { "epoch": 0.029375, "grad_norm": 4.84375, "grad_norm_var": 0.8911417643229167, "learning_rate": 0.0001, "loss": 9.9963, "loss/crossentropy": 2.388529062271118, "loss/hidden": 3.8984375, "loss/jsd": 0.0, "loss/logits": 0.3805827349424362, "step": 470 }, { "epoch": 0.0295, "grad_norm": 4.09375, "grad_norm_var": 0.8791575113932292, "learning_rate": 0.0001, "loss": 9.9093, "loss/crossentropy": 2.464060425758362, "loss/hidden": 3.875, "loss/jsd": 0.0, "loss/logits": 0.35019560158252716, "step": 472 }, { "epoch": 0.029625, "grad_norm": 5.46875, "grad_norm_var": 0.37202046712239584, "learning_rate": 0.0001, "loss": 9.8783, "loss/crossentropy": 2.3649709224700928, "loss/hidden": 3.96875, "loss/jsd": 0.0, "loss/logits": 0.3797526955604553, "step": 474 }, { "epoch": 0.02975, "grad_norm": 4.25, "grad_norm_var": 0.29755757649739584, "learning_rate": 0.0001, "loss": 9.8323, "loss/crossentropy": 2.4717437028884888, "loss/hidden": 3.9375, "loss/jsd": 0.0, "loss/logits": 0.35063043236732483, "step": 476 }, { "epoch": 0.029875, "grad_norm": 4.34375, "grad_norm_var": 0.16384175618489583, "learning_rate": 0.0001, "loss": 9.9292, "loss/crossentropy": 2.5398319959640503, "loss/hidden": 4.03125, "loss/jsd": 0.0, "loss/logits": 0.37523798644542694, "step": 478 }, { "epoch": 0.03, "grad_norm": 4.40625, "grad_norm_var": 0.14685872395833333, "learning_rate": 0.0001, "loss": 9.7299, "loss/crossentropy": 2.080340564250946, "loss/hidden": 3.953125, "loss/jsd": 0.0, "loss/logits": 0.33192941546440125, "step": 480 }, { "epoch": 0.030125, "grad_norm": 4.46875, "grad_norm_var": 0.14511311848958333, "learning_rate": 0.0001, "loss": 9.6576, "loss/crossentropy": 2.2822307348251343, "loss/hidden": 3.796875, "loss/jsd": 0.0, "loss/logits": 0.336023285984993, "step": 482 }, { "epoch": 0.03025, "grad_norm": 4.34375, "grad_norm_var": 0.1125, "learning_rate": 0.0001, "loss": 9.5694, "loss/crossentropy": 2.286174952983856, "loss/hidden": 3.8203125, "loss/jsd": 0.0, "loss/logits": 0.3336353003978729, "step": 484 }, { "epoch": 0.030375, "grad_norm": 4.6875, "grad_norm_var": 0.13880208333333333, "learning_rate": 0.0001, "loss": 9.7847, "loss/crossentropy": 2.2369834184646606, "loss/hidden": 3.875, "loss/jsd": 0.0, "loss/logits": 0.32461969554424286, "step": 486 }, { "epoch": 0.0305, "grad_norm": 3.828125, "grad_norm_var": 0.16288960774739583, "learning_rate": 0.0001, "loss": 9.7289, "loss/crossentropy": 2.3086917400360107, "loss/hidden": 3.7734375, "loss/jsd": 0.0, "loss/logits": 0.321616530418396, "step": 488 }, { "epoch": 0.030625, "grad_norm": 4.21875, "grad_norm_var": 0.09160054524739583, "learning_rate": 0.0001, "loss": 9.8277, "loss/crossentropy": 2.3445401191711426, "loss/hidden": 3.859375, "loss/jsd": 0.0, "loss/logits": 0.3501063734292984, "step": 490 }, { "epoch": 0.03075, "grad_norm": 4.46875, "grad_norm_var": 0.0995513916015625, "learning_rate": 0.0001, "loss": 9.611, "loss/crossentropy": 1.9773722887039185, "loss/hidden": 3.734375, "loss/jsd": 0.0, "loss/logits": 0.30824264883995056, "step": 492 }, { "epoch": 0.030875, "grad_norm": 4.25, "grad_norm_var": 0.09944559733072916, "learning_rate": 0.0001, "loss": 9.5735, "loss/crossentropy": 2.428261160850525, "loss/hidden": 3.8203125, "loss/jsd": 0.0, "loss/logits": 0.35596518218517303, "step": 494 }, { "epoch": 0.031, "grad_norm": 4.125, "grad_norm_var": 0.09492085774739584, "learning_rate": 0.0001, "loss": 9.7677, "loss/crossentropy": 2.262718915939331, "loss/hidden": 3.8828125, "loss/jsd": 0.0, "loss/logits": 0.33079805970191956, "step": 496 }, { "epoch": 0.031125, "grad_norm": 4.5, "grad_norm_var": 0.10596415201822916, "learning_rate": 0.0001, "loss": 9.7701, "loss/crossentropy": 2.3702725172042847, "loss/hidden": 3.8203125, "loss/jsd": 0.0, "loss/logits": 0.3400324583053589, "step": 498 }, { "epoch": 0.03125, "grad_norm": 3.84375, "grad_norm_var": 0.13961588541666667, "learning_rate": 0.0001, "loss": 9.5602, "loss/crossentropy": 2.295218586921692, "loss/hidden": 3.8671875, "loss/jsd": 0.0, "loss/logits": 0.3482416272163391, "step": 500 }, { "epoch": 0.031375, "grad_norm": 5.34375, "grad_norm_var": 0.15602213541666668, "learning_rate": 0.0001, "loss": 10.0544, "loss/crossentropy": 2.445479154586792, "loss/hidden": 3.828125, "loss/jsd": 0.0, "loss/logits": 0.37152746319770813, "step": 502 }, { "epoch": 0.0315, "grad_norm": 4.9375, "grad_norm_var": 0.1950836181640625, "learning_rate": 0.0001, "loss": 9.5511, "loss/crossentropy": 2.223459005355835, "loss/hidden": 3.7890625, "loss/jsd": 0.0, "loss/logits": 0.32452794909477234, "step": 504 }, { "epoch": 0.031625, "grad_norm": 4.5625, "grad_norm_var": 0.19709370930989584, "learning_rate": 0.0001, "loss": 9.8003, "loss/crossentropy": 2.6400363445281982, "loss/hidden": 3.921875, "loss/jsd": 0.0, "loss/logits": 0.3358649015426636, "step": 506 }, { "epoch": 0.03175, "grad_norm": 4.1875, "grad_norm_var": 0.20300191243489582, "learning_rate": 0.0001, "loss": 9.7514, "loss/crossentropy": 2.548031210899353, "loss/hidden": 3.8359375, "loss/jsd": 0.0, "loss/logits": 0.3170333355665207, "step": 508 }, { "epoch": 0.031875, "grad_norm": 4.8125, "grad_norm_var": 0.21568094889322917, "learning_rate": 0.0001, "loss": 9.8207, "loss/crossentropy": 2.2956899404525757, "loss/hidden": 3.8984375, "loss/jsd": 0.0, "loss/logits": 0.3392469882965088, "step": 510 }, { "epoch": 0.032, "grad_norm": 4.34375, "grad_norm_var": 0.22424723307291666, "learning_rate": 0.0001, "loss": 9.6765, "loss/crossentropy": 2.353795349597931, "loss/hidden": 3.875, "loss/jsd": 0.0, "loss/logits": 0.33590464293956757, "step": 512 }, { "epoch": 0.032125, "grad_norm": 3.796875, "grad_norm_var": 0.23311258951822916, "learning_rate": 0.0001, "loss": 9.5705, "loss/crossentropy": 2.3682990074157715, "loss/hidden": 3.8046875, "loss/jsd": 0.0, "loss/logits": 0.3214150220155716, "step": 514 }, { "epoch": 0.03225, "grad_norm": 4.375, "grad_norm_var": 0.19153238932291666, "learning_rate": 0.0001, "loss": 9.5203, "loss/crossentropy": 2.2625592947006226, "loss/hidden": 3.84375, "loss/jsd": 0.0, "loss/logits": 0.35527725517749786, "step": 516 }, { "epoch": 0.032375, "grad_norm": 4.78125, "grad_norm_var": 0.16765950520833334, "learning_rate": 0.0001, "loss": 9.7962, "loss/crossentropy": 2.4448131322860718, "loss/hidden": 3.765625, "loss/jsd": 0.0, "loss/logits": 0.33070215582847595, "step": 518 }, { "epoch": 0.0325, "grad_norm": 3.875, "grad_norm_var": 0.143310546875, "learning_rate": 0.0001, "loss": 9.6837, "loss/crossentropy": 2.3028098344802856, "loss/hidden": 3.84375, "loss/jsd": 0.0, "loss/logits": 0.34870584309101105, "step": 520 }, { "epoch": 0.032625, "grad_norm": 4.09375, "grad_norm_var": 0.16155497233072916, "learning_rate": 0.0001, "loss": 9.5805, "loss/crossentropy": 2.181080639362335, "loss/hidden": 3.8359375, "loss/jsd": 0.0, "loss/logits": 0.332154244184494, "step": 522 }, { "epoch": 0.03275, "grad_norm": 4.09375, "grad_norm_var": 0.1529937744140625, "learning_rate": 0.0001, "loss": 9.5179, "loss/crossentropy": 2.337011694908142, "loss/hidden": 3.9609375, "loss/jsd": 0.0, "loss/logits": 0.3419201970100403, "step": 524 }, { "epoch": 0.032875, "grad_norm": 4.5, "grad_norm_var": 0.3694976806640625, "learning_rate": 0.0001, "loss": 9.6365, "loss/crossentropy": 2.354939341545105, "loss/hidden": 3.8359375, "loss/jsd": 0.0, "loss/logits": 0.35222889482975006, "step": 526 }, { "epoch": 0.033, "grad_norm": 3.8125, "grad_norm_var": 0.41142476399739586, "learning_rate": 0.0001, "loss": 9.4276, "loss/crossentropy": 2.2241241931915283, "loss/hidden": 3.8046875, "loss/jsd": 0.0, "loss/logits": 0.32786163687705994, "step": 528 }, { "epoch": 0.033125, "grad_norm": 4.40625, "grad_norm_var": 0.3993886311848958, "learning_rate": 0.0001, "loss": 9.5245, "loss/crossentropy": 2.4617063999176025, "loss/hidden": 3.8203125, "loss/jsd": 0.0, "loss/logits": 0.36403751373291016, "step": 530 }, { "epoch": 0.03325, "grad_norm": 4.15625, "grad_norm_var": 0.4066396077473958, "learning_rate": 0.0001, "loss": 9.5201, "loss/crossentropy": 2.2278032302856445, "loss/hidden": 3.796875, "loss/jsd": 0.0, "loss/logits": 0.3158974349498749, "step": 532 }, { "epoch": 0.033375, "grad_norm": 4.6875, "grad_norm_var": 0.3785634358723958, "learning_rate": 0.0001, "loss": 9.6706, "loss/crossentropy": 2.464658737182617, "loss/hidden": 3.859375, "loss/jsd": 0.0, "loss/logits": 0.3708791136741638, "step": 534 }, { "epoch": 0.0335, "grad_norm": 4.25, "grad_norm_var": 0.34780985514322915, "learning_rate": 0.0001, "loss": 9.4853, "loss/crossentropy": 2.659584403038025, "loss/hidden": 3.75, "loss/jsd": 0.0, "loss/logits": 0.3422502875328064, "step": 536 }, { "epoch": 0.033625, "grad_norm": 4.03125, "grad_norm_var": 0.3409830729166667, "learning_rate": 0.0001, "loss": 9.5004, "loss/crossentropy": 2.3510810136795044, "loss/hidden": 3.8359375, "loss/jsd": 0.0, "loss/logits": 0.3363768756389618, "step": 538 }, { "epoch": 0.03375, "grad_norm": 4.0625, "grad_norm_var": 0.3513336181640625, "learning_rate": 0.0001, "loss": 9.6363, "loss/crossentropy": 2.4384061098098755, "loss/hidden": 3.90625, "loss/jsd": 0.0, "loss/logits": 0.3327721059322357, "step": 540 }, { "epoch": 0.033875, "grad_norm": 4.46875, "grad_norm_var": 0.08550516764322917, "learning_rate": 0.0001, "loss": 9.628, "loss/crossentropy": 2.435794949531555, "loss/hidden": 3.75, "loss/jsd": 0.0, "loss/logits": 0.3326384872198105, "step": 542 }, { "epoch": 0.034, "grad_norm": 4.03125, "grad_norm_var": 0.06864827473958333, "learning_rate": 0.0001, "loss": 9.6422, "loss/crossentropy": 2.430909752845764, "loss/hidden": 3.765625, "loss/jsd": 0.0, "loss/logits": 0.33669474720954895, "step": 544 }, { "epoch": 0.034125, "grad_norm": 4.3125, "grad_norm_var": 0.06433817545572916, "learning_rate": 0.0001, "loss": 9.4545, "loss/crossentropy": 2.4339792728424072, "loss/hidden": 3.796875, "loss/jsd": 0.0, "loss/logits": 0.33400970697402954, "step": 546 }, { "epoch": 0.03425, "grad_norm": 3.78125, "grad_norm_var": 0.060334269205729166, "learning_rate": 0.0001, "loss": 9.6066, "loss/crossentropy": 2.601755380630493, "loss/hidden": 3.9140625, "loss/jsd": 0.0, "loss/logits": 0.3572891056537628, "step": 548 }, { "epoch": 0.034375, "grad_norm": 4.75, "grad_norm_var": 0.06419169108072917, "learning_rate": 0.0001, "loss": 9.5364, "loss/crossentropy": 2.263180732727051, "loss/hidden": 3.765625, "loss/jsd": 0.0, "loss/logits": 0.33265452086925507, "step": 550 }, { "epoch": 0.0345, "grad_norm": 4.21875, "grad_norm_var": 0.05921122233072917, "learning_rate": 0.0001, "loss": 9.4317, "loss/crossentropy": 2.6668169498443604, "loss/hidden": 3.796875, "loss/jsd": 0.0, "loss/logits": 0.3484792411327362, "step": 552 }, { "epoch": 0.034625, "grad_norm": 4.0, "grad_norm_var": 0.05729878743489583, "learning_rate": 0.0001, "loss": 9.5416, "loss/crossentropy": 2.488753318786621, "loss/hidden": 3.703125, "loss/jsd": 0.0, "loss/logits": 0.3451061546802521, "step": 554 }, { "epoch": 0.03475, "grad_norm": 4.0625, "grad_norm_var": 0.06444905598958334, "learning_rate": 0.0001, "loss": 9.3819, "loss/crossentropy": 2.5572561025619507, "loss/hidden": 3.796875, "loss/jsd": 0.0, "loss/logits": 0.3529306650161743, "step": 556 }, { "epoch": 0.034875, "grad_norm": 4.46875, "grad_norm_var": 0.06443684895833333, "learning_rate": 0.0001, "loss": 9.6456, "loss/crossentropy": 2.3274362087249756, "loss/hidden": 3.796875, "loss/jsd": 0.0, "loss/logits": 0.3618105351924896, "step": 558 }, { "epoch": 0.035, "grad_norm": 4.34375, "grad_norm_var": 0.059370930989583334, "learning_rate": 0.0001, "loss": 9.4529, "loss/crossentropy": 2.5755836963653564, "loss/hidden": 3.8515625, "loss/jsd": 0.0, "loss/logits": 0.3374823033809662, "step": 560 }, { "epoch": 0.035125, "grad_norm": 3.640625, "grad_norm_var": 0.0757232666015625, "learning_rate": 0.0001, "loss": 9.5006, "loss/crossentropy": 2.291811466217041, "loss/hidden": 3.7890625, "loss/jsd": 0.0, "loss/logits": 0.33205731213092804, "step": 562 }, { "epoch": 0.03525, "grad_norm": 4.125, "grad_norm_var": 0.07857666015625, "learning_rate": 0.0001, "loss": 9.4801, "loss/crossentropy": 2.378532886505127, "loss/hidden": 3.71875, "loss/jsd": 0.0, "loss/logits": 0.31333786249160767, "step": 564 }, { "epoch": 0.035375, "grad_norm": 4.46875, "grad_norm_var": 0.07685546875, "learning_rate": 0.0001, "loss": 9.425, "loss/crossentropy": 2.368729591369629, "loss/hidden": 3.796875, "loss/jsd": 0.0, "loss/logits": 0.29723505675792694, "step": 566 }, { "epoch": 0.0355, "grad_norm": 3.890625, "grad_norm_var": 0.0939605712890625, "learning_rate": 0.0001, "loss": 9.3256, "loss/crossentropy": 2.359733462333679, "loss/hidden": 3.7578125, "loss/jsd": 0.0, "loss/logits": 0.3121738135814667, "step": 568 }, { "epoch": 0.035625, "grad_norm": 4.9375, "grad_norm_var": 0.1553131103515625, "learning_rate": 0.0001, "loss": 9.4416, "loss/crossentropy": 2.315782904624939, "loss/hidden": 3.703125, "loss/jsd": 0.0, "loss/logits": 0.3432431221008301, "step": 570 }, { "epoch": 0.03575, "grad_norm": 3.3125, "grad_norm_var": 0.19507548014322917, "learning_rate": 0.0001, "loss": 9.5614, "loss/crossentropy": 2.3565382957458496, "loss/hidden": 3.8046875, "loss/jsd": 0.0, "loss/logits": 0.32801851630210876, "step": 572 }, { "epoch": 0.035875, "grad_norm": 4.25, "grad_norm_var": 0.19041239420572917, "learning_rate": 0.0001, "loss": 9.3502, "loss/crossentropy": 2.354498505592346, "loss/hidden": 3.75, "loss/jsd": 0.0, "loss/logits": 0.34335146844387054, "step": 574 }, { "epoch": 0.036, "grad_norm": 4.0625, "grad_norm_var": 0.1861968994140625, "learning_rate": 0.0001, "loss": 9.4591, "loss/crossentropy": 2.2208141088485718, "loss/hidden": 3.7421875, "loss/jsd": 0.0, "loss/logits": 0.32215404510498047, "step": 576 }, { "epoch": 0.036125, "grad_norm": 3.953125, "grad_norm_var": 0.17675374348958334, "learning_rate": 0.0001, "loss": 9.5684, "loss/crossentropy": 2.1560009717941284, "loss/hidden": 3.640625, "loss/jsd": 0.0, "loss/logits": 0.3317830264568329, "step": 578 }, { "epoch": 0.03625, "grad_norm": 3.84375, "grad_norm_var": 0.1708648681640625, "learning_rate": 0.0001, "loss": 9.384, "loss/crossentropy": 2.254258155822754, "loss/hidden": 3.78125, "loss/jsd": 0.0, "loss/logits": 0.34166762232780457, "step": 580 }, { "epoch": 0.036375, "grad_norm": 3.90625, "grad_norm_var": 0.14008687337239584, "learning_rate": 0.0001, "loss": 9.4391, "loss/crossentropy": 2.193941831588745, "loss/hidden": 3.7421875, "loss/jsd": 0.0, "loss/logits": 0.3121718168258667, "step": 582 }, { "epoch": 0.0365, "grad_norm": 4.1875, "grad_norm_var": 0.13092041015625, "learning_rate": 0.0001, "loss": 9.5948, "loss/crossentropy": 2.610209345817566, "loss/hidden": 3.7890625, "loss/jsd": 0.0, "loss/logits": 0.35641224682331085, "step": 584 }, { "epoch": 0.036625, "grad_norm": 3.9375, "grad_norm_var": 0.04394124348958333, "learning_rate": 0.0001, "loss": 9.334, "loss/crossentropy": 2.173751473426819, "loss/hidden": 3.6875, "loss/jsd": 0.0, "loss/logits": 0.3093992620706558, "step": 586 }, { "epoch": 0.03675, "grad_norm": 3.734375, "grad_norm_var": 0.021141560872395833, "learning_rate": 0.0001, "loss": 9.3647, "loss/crossentropy": 2.6784013509750366, "loss/hidden": 3.7578125, "loss/jsd": 0.0, "loss/logits": 0.3435199409723282, "step": 588 }, { "epoch": 0.036875, "grad_norm": 4.375, "grad_norm_var": 0.028416951497395832, "learning_rate": 0.0001, "loss": 9.6858, "loss/crossentropy": 2.5606144666671753, "loss/hidden": 3.7890625, "loss/jsd": 0.0, "loss/logits": 0.3252936899662018, "step": 590 }, { "epoch": 0.037, "grad_norm": 4.28125, "grad_norm_var": 0.07421773274739583, "learning_rate": 0.0001, "loss": 9.1905, "loss/crossentropy": 2.5036474466323853, "loss/hidden": 3.71875, "loss/jsd": 0.0, "loss/logits": 0.3341420292854309, "step": 592 }, { "epoch": 0.037125, "grad_norm": 4.0, "grad_norm_var": 0.07280171712239583, "learning_rate": 0.0001, "loss": 9.2089, "loss/crossentropy": 2.138327479362488, "loss/hidden": 3.6171875, "loss/jsd": 0.0, "loss/logits": 0.3032035082578659, "step": 594 }, { "epoch": 0.03725, "grad_norm": 3.296875, "grad_norm_var": 0.11728515625, "learning_rate": 0.0001, "loss": 9.1876, "loss/crossentropy": 2.0817145109176636, "loss/hidden": 3.625, "loss/jsd": 0.0, "loss/logits": 0.28961437940597534, "step": 596 }, { "epoch": 0.037375, "grad_norm": 4.40625, "grad_norm_var": 0.127685546875, "learning_rate": 0.0001, "loss": 9.5, "loss/crossentropy": 2.231162667274475, "loss/hidden": 3.734375, "loss/jsd": 0.0, "loss/logits": 0.32542233169078827, "step": 598 }, { "epoch": 0.0375, "grad_norm": 4.09375, "grad_norm_var": 0.12625325520833333, "learning_rate": 0.0001, "loss": 9.4883, "loss/crossentropy": 2.279044270515442, "loss/hidden": 3.6796875, "loss/jsd": 0.0, "loss/logits": 0.30954092741012573, "step": 600 }, { "epoch": 0.037625, "grad_norm": 3.96875, "grad_norm_var": 0.12911783854166667, "learning_rate": 0.0001, "loss": 9.5667, "loss/crossentropy": 2.124338150024414, "loss/hidden": 3.6953125, "loss/jsd": 0.0, "loss/logits": 0.3287663906812668, "step": 602 }, { "epoch": 0.03775, "grad_norm": 4.4375, "grad_norm_var": 0.1248443603515625, "learning_rate": 0.0001, "loss": 9.5844, "loss/crossentropy": 2.5788776874542236, "loss/hidden": 3.7421875, "loss/jsd": 0.0, "loss/logits": 0.34556926786899567, "step": 604 }, { "epoch": 0.037875, "grad_norm": 3.390625, "grad_norm_var": 0.15191650390625, "learning_rate": 0.0001, "loss": 9.4029, "loss/crossentropy": 2.511660575866699, "loss/hidden": 3.71875, "loss/jsd": 0.0, "loss/logits": 0.31718502938747406, "step": 606 }, { "epoch": 0.038, "grad_norm": 4.6875, "grad_norm_var": 0.13528238932291667, "learning_rate": 0.0001, "loss": 9.4312, "loss/crossentropy": 2.558152675628662, "loss/hidden": 3.703125, "loss/jsd": 0.0, "loss/logits": 0.3450692296028137, "step": 608 }, { "epoch": 0.038125, "grad_norm": 3.921875, "grad_norm_var": 0.13547261555989584, "learning_rate": 0.0001, "loss": 9.2833, "loss/crossentropy": 2.2010965943336487, "loss/hidden": 3.6875, "loss/jsd": 0.0, "loss/logits": 0.33724747598171234, "step": 610 }, { "epoch": 0.03825, "grad_norm": 3.90625, "grad_norm_var": 0.098974609375, "learning_rate": 0.0001, "loss": 9.4903, "loss/crossentropy": 2.499935030937195, "loss/hidden": 3.765625, "loss/jsd": 0.0, "loss/logits": 0.3286616951227188, "step": 612 }, { "epoch": 0.038375, "grad_norm": 4.0625, "grad_norm_var": 0.08684488932291666, "learning_rate": 0.0001, "loss": 9.3714, "loss/crossentropy": 2.29742568731308, "loss/hidden": 3.8046875, "loss/jsd": 0.0, "loss/logits": 0.3321594297885895, "step": 614 }, { "epoch": 0.0385, "grad_norm": 4.4375, "grad_norm_var": 0.10472005208333333, "learning_rate": 0.0001, "loss": 9.2124, "loss/crossentropy": 2.2705591917037964, "loss/hidden": 3.7421875, "loss/jsd": 0.0, "loss/logits": 0.33022937178611755, "step": 616 }, { "epoch": 0.038625, "grad_norm": 3.78125, "grad_norm_var": 0.105615234375, "learning_rate": 0.0001, "loss": 9.3996, "loss/crossentropy": 2.5177834033966064, "loss/hidden": 3.71875, "loss/jsd": 0.0, "loss/logits": 0.33882059156894684, "step": 618 }, { "epoch": 0.03875, "grad_norm": 3.9375, "grad_norm_var": 0.09058329264322916, "learning_rate": 0.0001, "loss": 9.3251, "loss/crossentropy": 2.3914138078689575, "loss/hidden": 3.75, "loss/jsd": 0.0, "loss/logits": 0.300808310508728, "step": 620 }, { "epoch": 0.038875, "grad_norm": 4.84375, "grad_norm_var": 0.107177734375, "learning_rate": 0.0001, "loss": 9.4366, "loss/crossentropy": 2.4114054441452026, "loss/hidden": 3.7578125, "loss/jsd": 0.0, "loss/logits": 0.3568413257598877, "step": 622 }, { "epoch": 0.039, "grad_norm": 4.03125, "grad_norm_var": 0.08601786295572916, "learning_rate": 0.0001, "loss": 9.1418, "loss/crossentropy": 2.3419090509414673, "loss/hidden": 3.6015625, "loss/jsd": 0.0, "loss/logits": 0.31717973947525024, "step": 624 }, { "epoch": 0.039125, "grad_norm": 4.25, "grad_norm_var": 0.09462890625, "learning_rate": 0.0001, "loss": 9.4373, "loss/crossentropy": 2.64203143119812, "loss/hidden": 3.7421875, "loss/jsd": 0.0, "loss/logits": 0.33858008682727814, "step": 626 }, { "epoch": 0.03925, "grad_norm": 3.625, "grad_norm_var": 0.10220947265625, "learning_rate": 0.0001, "loss": 9.3278, "loss/crossentropy": 2.0542389154434204, "loss/hidden": 3.6796875, "loss/jsd": 0.0, "loss/logits": 0.30543386936187744, "step": 628 }, { "epoch": 0.039375, "grad_norm": 3.875, "grad_norm_var": 0.13300679524739584, "learning_rate": 0.0001, "loss": 9.2397, "loss/crossentropy": 2.4575854539871216, "loss/hidden": 3.6953125, "loss/jsd": 0.0, "loss/logits": 0.3102063983678818, "step": 630 }, { "epoch": 0.0395, "grad_norm": 4.28125, "grad_norm_var": 0.1220611572265625, "learning_rate": 0.0001, "loss": 9.3452, "loss/crossentropy": 2.3602949380874634, "loss/hidden": 3.734375, "loss/jsd": 0.0, "loss/logits": 0.323737695813179, "step": 632 }, { "epoch": 0.039625, "grad_norm": 3.71875, "grad_norm_var": 0.12845052083333333, "learning_rate": 0.0001, "loss": 9.2681, "loss/crossentropy": 2.507497191429138, "loss/hidden": 3.703125, "loss/jsd": 0.0, "loss/logits": 0.3275406062602997, "step": 634 }, { "epoch": 0.03975, "grad_norm": 4.25, "grad_norm_var": 0.13587137858072917, "learning_rate": 0.0001, "loss": 9.36, "loss/crossentropy": 2.2765142917633057, "loss/hidden": 3.671875, "loss/jsd": 0.0, "loss/logits": 0.31396952271461487, "step": 636 }, { "epoch": 0.039875, "grad_norm": 3.46875, "grad_norm_var": 0.10793863932291667, "learning_rate": 0.0001, "loss": 9.2294, "loss/crossentropy": 2.341191053390503, "loss/hidden": 3.7421875, "loss/jsd": 0.0, "loss/logits": 0.32686179876327515, "step": 638 }, { "epoch": 0.04, "grad_norm": 4.8125, "grad_norm_var": 0.1619781494140625, "learning_rate": 0.0001, "loss": 9.2556, "loss/crossentropy": 2.252098858356476, "loss/hidden": 3.703125, "loss/jsd": 0.0, "loss/logits": 0.3205975890159607, "step": 640 }, { "epoch": 0.040125, "grad_norm": 3.4375, "grad_norm_var": 0.20991109212239584, "learning_rate": 0.0001, "loss": 9.3137, "loss/crossentropy": 2.2994823455810547, "loss/hidden": 3.8671875, "loss/jsd": 0.0, "loss/logits": 0.34002968668937683, "step": 642 }, { "epoch": 0.04025, "grad_norm": 4.375, "grad_norm_var": 0.21840718587239583, "learning_rate": 0.0001, "loss": 9.1512, "loss/crossentropy": 2.5480741262435913, "loss/hidden": 3.6875, "loss/jsd": 0.0, "loss/logits": 0.33601297438144684, "step": 644 }, { "epoch": 0.040375, "grad_norm": 3.578125, "grad_norm_var": 0.23454488118489583, "learning_rate": 0.0001, "loss": 9.3808, "loss/crossentropy": 2.524424910545349, "loss/hidden": 3.7109375, "loss/jsd": 0.0, "loss/logits": 0.3395262509584427, "step": 646 }, { "epoch": 0.0405, "grad_norm": 3.75, "grad_norm_var": 0.23319905598958332, "learning_rate": 0.0001, "loss": 9.1816, "loss/crossentropy": 2.2198326587677, "loss/hidden": 3.7578125, "loss/jsd": 0.0, "loss/logits": 0.33485807478427887, "step": 648 }, { "epoch": 0.040625, "grad_norm": 4.125, "grad_norm_var": 0.23205973307291666, "learning_rate": 0.0001, "loss": 9.252, "loss/crossentropy": 2.5186063051223755, "loss/hidden": 3.6796875, "loss/jsd": 0.0, "loss/logits": 0.33504799008369446, "step": 650 }, { "epoch": 0.04075, "grad_norm": 3.5, "grad_norm_var": 0.23567606608072916, "learning_rate": 0.0001, "loss": 9.299, "loss/crossentropy": 2.3492661714553833, "loss/hidden": 3.7109375, "loss/jsd": 0.0, "loss/logits": 0.31913943588733673, "step": 652 }, { "epoch": 0.040875, "grad_norm": 4.1875, "grad_norm_var": 0.23007405598958333, "learning_rate": 0.0001, "loss": 9.2997, "loss/crossentropy": 2.600319981575012, "loss/hidden": 3.6796875, "loss/jsd": 0.0, "loss/logits": 0.32478684186935425, "step": 654 }, { "epoch": 0.041, "grad_norm": 3.859375, "grad_norm_var": 0.17136942545572917, "learning_rate": 0.0001, "loss": 9.1048, "loss/crossentropy": 2.335653781890869, "loss/hidden": 3.640625, "loss/jsd": 0.0, "loss/logits": 0.3278462737798691, "step": 656 }, { "epoch": 0.041125, "grad_norm": 3.640625, "grad_norm_var": 0.12332356770833333, "learning_rate": 0.0001, "loss": 9.2115, "loss/crossentropy": 2.223168969154358, "loss/hidden": 3.6953125, "loss/jsd": 0.0, "loss/logits": 0.3341253995895386, "step": 658 }, { "epoch": 0.04125, "grad_norm": 5.3125, "grad_norm_var": 0.252490234375, "learning_rate": 0.0001, "loss": 9.1622, "loss/crossentropy": 2.424691677093506, "loss/hidden": 3.6015625, "loss/jsd": 0.0, "loss/logits": 0.3088841736316681, "step": 660 }, { "epoch": 0.041375, "grad_norm": 4.625, "grad_norm_var": 0.2993072509765625, "learning_rate": 0.0001, "loss": 9.1041, "loss/crossentropy": 2.07004451751709, "loss/hidden": 3.515625, "loss/jsd": 0.0, "loss/logits": 0.2844501733779907, "step": 662 }, { "epoch": 0.0415, "grad_norm": 5.03125, "grad_norm_var": 0.3312459309895833, "learning_rate": 0.0001, "loss": 9.3562, "loss/crossentropy": 2.32711398601532, "loss/hidden": 3.65625, "loss/jsd": 0.0, "loss/logits": 0.3303475081920624, "step": 664 }, { "epoch": 0.041625, "grad_norm": 4.3125, "grad_norm_var": 0.48313700358072914, "learning_rate": 0.0001, "loss": 9.2399, "loss/crossentropy": 2.3355051279067993, "loss/hidden": 3.609375, "loss/jsd": 0.0, "loss/logits": 0.34253838658332825, "step": 666 }, { "epoch": 0.04175, "grad_norm": 4.0, "grad_norm_var": 0.504443359375, "learning_rate": 0.0001, "loss": 9.3712, "loss/crossentropy": 2.423375368118286, "loss/hidden": 3.6171875, "loss/jsd": 0.0, "loss/logits": 0.31640373170375824, "step": 668 }, { "epoch": 0.041875, "grad_norm": 3.8125, "grad_norm_var": 0.49081624348958336, "learning_rate": 0.0001, "loss": 9.1568, "loss/crossentropy": 2.4355897903442383, "loss/hidden": 3.625, "loss/jsd": 0.0, "loss/logits": 0.3098563849925995, "step": 670 }, { "epoch": 0.042, "grad_norm": 3.65625, "grad_norm_var": 0.5179433186848958, "learning_rate": 0.0001, "loss": 9.0687, "loss/crossentropy": 2.151113748550415, "loss/hidden": 3.6015625, "loss/jsd": 0.0, "loss/logits": 0.28571945428848267, "step": 672 }, { "epoch": 0.042125, "grad_norm": 4.09375, "grad_norm_var": 0.45859375, "learning_rate": 0.0001, "loss": 9.126, "loss/crossentropy": 2.1033096313476562, "loss/hidden": 3.6328125, "loss/jsd": 0.0, "loss/logits": 0.30395573377609253, "step": 674 }, { "epoch": 0.04225, "grad_norm": 3.703125, "grad_norm_var": 0.41311848958333336, "learning_rate": 0.0001, "loss": 9.2339, "loss/crossentropy": 2.384363532066345, "loss/hidden": 3.65625, "loss/jsd": 0.0, "loss/logits": 0.3477473706007004, "step": 676 }, { "epoch": 0.042375, "grad_norm": 4.0, "grad_norm_var": 0.35420633951822916, "learning_rate": 0.0001, "loss": 9.2017, "loss/crossentropy": 2.3887627124786377, "loss/hidden": 3.65625, "loss/jsd": 0.0, "loss/logits": 0.3319309651851654, "step": 678 }, { "epoch": 0.0425, "grad_norm": 3.796875, "grad_norm_var": 0.28951416015625, "learning_rate": 0.0001, "loss": 9.0506, "loss/crossentropy": 2.3131089210510254, "loss/hidden": 3.625, "loss/jsd": 0.0, "loss/logits": 0.31970134377479553, "step": 680 }, { "epoch": 0.042625, "grad_norm": 3.515625, "grad_norm_var": 0.0540191650390625, "learning_rate": 0.0001, "loss": 9.3033, "loss/crossentropy": 2.2213594913482666, "loss/hidden": 3.7421875, "loss/jsd": 0.0, "loss/logits": 0.2932916283607483, "step": 682 }, { "epoch": 0.04275, "grad_norm": 3.9375, "grad_norm_var": 0.04342041015625, "learning_rate": 0.0001, "loss": 9.1272, "loss/crossentropy": 2.343689441680908, "loss/hidden": 3.609375, "loss/jsd": 0.0, "loss/logits": 0.31054770946502686, "step": 684 }, { "epoch": 0.042875, "grad_norm": 4.125, "grad_norm_var": 0.07541910807291667, "learning_rate": 0.0001, "loss": 9.0333, "loss/crossentropy": 2.426623225212097, "loss/hidden": 3.6875, "loss/jsd": 0.0, "loss/logits": 0.30797363817691803, "step": 686 }, { "epoch": 0.043, "grad_norm": 3.75, "grad_norm_var": 0.07333577473958333, "learning_rate": 0.0001, "loss": 9.1961, "loss/crossentropy": 2.1243041157722473, "loss/hidden": 3.671875, "loss/jsd": 0.0, "loss/logits": 0.27504249662160873, "step": 688 }, { "epoch": 0.043125, "grad_norm": 3.546875, "grad_norm_var": 0.07789306640625, "learning_rate": 0.0001, "loss": 9.3228, "loss/crossentropy": 2.2858647108078003, "loss/hidden": 3.6796875, "loss/jsd": 0.0, "loss/logits": 0.3059113025665283, "step": 690 }, { "epoch": 0.04325, "grad_norm": 4.03125, "grad_norm_var": 0.07827046712239584, "learning_rate": 0.0001, "loss": 9.2223, "loss/crossentropy": 2.6258697509765625, "loss/hidden": 3.65625, "loss/jsd": 0.0, "loss/logits": 0.32897868752479553, "step": 692 }, { "epoch": 0.043375, "grad_norm": 4.15625, "grad_norm_var": 0.0829986572265625, "learning_rate": 0.0001, "loss": 9.0107, "loss/crossentropy": 2.3375871181488037, "loss/hidden": 3.6484375, "loss/jsd": 0.0, "loss/logits": 0.30802060663700104, "step": 694 }, { "epoch": 0.0435, "grad_norm": 3.96875, "grad_norm_var": 0.08065999348958333, "learning_rate": 0.0001, "loss": 9.1397, "loss/crossentropy": 2.27328884601593, "loss/hidden": 3.65625, "loss/jsd": 0.0, "loss/logits": 0.32508768141269684, "step": 696 }, { "epoch": 0.043625, "grad_norm": 3.90625, "grad_norm_var": 0.07095947265625, "learning_rate": 0.0001, "loss": 9.0871, "loss/crossentropy": 2.3383896350860596, "loss/hidden": 3.546875, "loss/jsd": 0.0, "loss/logits": 0.3195807486772537, "step": 698 }, { "epoch": 0.04375, "grad_norm": 3.609375, "grad_norm_var": 0.07858784993489583, "learning_rate": 0.0001, "loss": 9.0647, "loss/crossentropy": 1.985029935836792, "loss/hidden": 3.5703125, "loss/jsd": 0.0, "loss/logits": 0.2951173782348633, "step": 700 }, { "epoch": 0.043875, "grad_norm": 3.5625, "grad_norm_var": 0.04241129557291667, "learning_rate": 0.0001, "loss": 9.2963, "loss/crossentropy": 2.236124038696289, "loss/hidden": 3.640625, "loss/jsd": 0.0, "loss/logits": 0.3033126890659332, "step": 702 }, { "epoch": 0.044, "grad_norm": 3.9375, "grad_norm_var": 0.03899332682291667, "learning_rate": 0.0001, "loss": 9.4002, "loss/crossentropy": 2.3327068090438843, "loss/hidden": 3.75, "loss/jsd": 0.0, "loss/logits": 0.35688331723213196, "step": 704 }, { "epoch": 0.044125, "grad_norm": 3.40625, "grad_norm_var": 0.04415690104166667, "learning_rate": 0.0001, "loss": 9.1288, "loss/crossentropy": 2.185292422771454, "loss/hidden": 3.6484375, "loss/jsd": 0.0, "loss/logits": 0.2997971922159195, "step": 706 }, { "epoch": 0.04425, "grad_norm": 4.03125, "grad_norm_var": 0.048193359375, "learning_rate": 0.0001, "loss": 9.1816, "loss/crossentropy": 2.592350959777832, "loss/hidden": 3.640625, "loss/jsd": 0.0, "loss/logits": 0.3380052447319031, "step": 708 }, { "epoch": 0.044375, "grad_norm": 3.625, "grad_norm_var": 0.050837198893229164, "learning_rate": 0.0001, "loss": 9.2751, "loss/crossentropy": 2.3508870601654053, "loss/hidden": 3.640625, "loss/jsd": 0.0, "loss/logits": 0.30894704163074493, "step": 710 }, { "epoch": 0.0445, "grad_norm": 3.703125, "grad_norm_var": 0.05436909993489583, "learning_rate": 0.0001, "loss": 9.2454, "loss/crossentropy": 2.4968451261520386, "loss/hidden": 3.671875, "loss/jsd": 0.0, "loss/logits": 0.3285796344280243, "step": 712 }, { "epoch": 0.044625, "grad_norm": 3.90625, "grad_norm_var": 0.10283203125, "learning_rate": 0.0001, "loss": 9.3351, "loss/crossentropy": 2.4106976985931396, "loss/hidden": 3.71875, "loss/jsd": 0.0, "loss/logits": 0.32071977853775024, "step": 714 }, { "epoch": 0.04475, "grad_norm": 3.609375, "grad_norm_var": 0.09908447265625, "learning_rate": 0.0001, "loss": 9.0656, "loss/crossentropy": 2.2552783489227295, "loss/hidden": 3.6875, "loss/jsd": 0.0, "loss/logits": 0.30507735908031464, "step": 716 }, { "epoch": 0.044875, "grad_norm": 3.96875, "grad_norm_var": 0.09814046223958334, "learning_rate": 0.0001, "loss": 9.0835, "loss/crossentropy": 2.4068862199783325, "loss/hidden": 3.6171875, "loss/jsd": 0.0, "loss/logits": 0.31281837821006775, "step": 718 }, { "epoch": 0.045, "grad_norm": 3.75, "grad_norm_var": 0.17351888020833334, "learning_rate": 0.0001, "loss": 9.2535, "loss/crossentropy": 2.2698957920074463, "loss/hidden": 3.6015625, "loss/jsd": 0.0, "loss/logits": 0.3157753646373749, "step": 720 }, { "epoch": 0.045125, "grad_norm": 4.25, "grad_norm_var": 0.25741780598958336, "learning_rate": 0.0001, "loss": 9.2178, "loss/crossentropy": 2.5466257333755493, "loss/hidden": 3.6171875, "loss/jsd": 0.0, "loss/logits": 0.3196914494037628, "step": 722 }, { "epoch": 0.04525, "grad_norm": 3.75, "grad_norm_var": 0.2650553385416667, "learning_rate": 0.0001, "loss": 9.0596, "loss/crossentropy": 2.259085774421692, "loss/hidden": 3.5859375, "loss/jsd": 0.0, "loss/logits": 0.30355823040008545, "step": 724 }, { "epoch": 0.045375, "grad_norm": 3.828125, "grad_norm_var": 0.2575836181640625, "learning_rate": 0.0001, "loss": 9.3315, "loss/crossentropy": 2.420317769050598, "loss/hidden": 3.5390625, "loss/jsd": 0.0, "loss/logits": 0.3066561073064804, "step": 726 }, { "epoch": 0.0455, "grad_norm": 3.65625, "grad_norm_var": 0.24367574055989583, "learning_rate": 0.0001, "loss": 9.074, "loss/crossentropy": 2.244703531265259, "loss/hidden": 3.65625, "loss/jsd": 0.0, "loss/logits": 0.30555886030197144, "step": 728 }, { "epoch": 0.045625, "grad_norm": 3.875, "grad_norm_var": 0.21819254557291667, "learning_rate": 0.0001, "loss": 9.1535, "loss/crossentropy": 2.3010120391845703, "loss/hidden": 3.5859375, "loss/jsd": 0.0, "loss/logits": 0.30051296949386597, "step": 730 }, { "epoch": 0.04575, "grad_norm": 3.8125, "grad_norm_var": 0.2000885009765625, "learning_rate": 0.0001, "loss": 9.2806, "loss/crossentropy": 2.0744789838790894, "loss/hidden": 3.5859375, "loss/jsd": 0.0, "loss/logits": 0.3020750731229782, "step": 732 }, { "epoch": 0.045875, "grad_norm": 4.875, "grad_norm_var": 0.2525390625, "learning_rate": 0.0001, "loss": 9.1755, "loss/crossentropy": 2.4247848987579346, "loss/hidden": 3.5703125, "loss/jsd": 0.0, "loss/logits": 0.30639201402664185, "step": 734 }, { "epoch": 0.046, "grad_norm": 3.6875, "grad_norm_var": 0.21112874348958333, "learning_rate": 0.0001, "loss": 9.2127, "loss/crossentropy": 2.228127598762512, "loss/hidden": 3.5703125, "loss/jsd": 0.0, "loss/logits": 0.2969563454389572, "step": 736 }, { "epoch": 0.046125, "grad_norm": 3.453125, "grad_norm_var": 0.12919514973958332, "learning_rate": 0.0001, "loss": 8.998, "loss/crossentropy": 2.3098256587982178, "loss/hidden": 3.6171875, "loss/jsd": 0.0, "loss/logits": 0.30860866606235504, "step": 738 }, { "epoch": 0.04625, "grad_norm": 3.46875, "grad_norm_var": 0.14089253743489583, "learning_rate": 0.0001, "loss": 8.9791, "loss/crossentropy": 2.135189712047577, "loss/hidden": 3.578125, "loss/jsd": 0.0, "loss/logits": 0.3053634464740753, "step": 740 }, { "epoch": 0.046375, "grad_norm": 3.71875, "grad_norm_var": 0.14488016764322917, "learning_rate": 0.0001, "loss": 9.0268, "loss/crossentropy": 2.4625048637390137, "loss/hidden": 3.6171875, "loss/jsd": 0.0, "loss/logits": 0.3120746314525604, "step": 742 }, { "epoch": 0.0465, "grad_norm": 3.734375, "grad_norm_var": 0.142333984375, "learning_rate": 0.0001, "loss": 9.1708, "loss/crossentropy": 2.1217297315597534, "loss/hidden": 3.625, "loss/jsd": 0.0, "loss/logits": 0.2820632755756378, "step": 744 }, { "epoch": 0.046625, "grad_norm": 3.453125, "grad_norm_var": 0.16352437337239584, "learning_rate": 0.0001, "loss": 8.8857, "loss/crossentropy": 2.454026937484741, "loss/hidden": 3.625, "loss/jsd": 0.0, "loss/logits": 0.28094005584716797, "step": 746 }, { "epoch": 0.04675, "grad_norm": 4.5, "grad_norm_var": 0.19820556640625, "learning_rate": 0.0001, "loss": 9.1644, "loss/crossentropy": 2.413946032524109, "loss/hidden": 3.6171875, "loss/jsd": 0.0, "loss/logits": 0.3286159932613373, "step": 748 }, { "epoch": 0.046875, "grad_norm": 3.265625, "grad_norm_var": 0.12294514973958333, "learning_rate": 0.0001, "loss": 8.881, "loss/crossentropy": 2.2393068075180054, "loss/hidden": 3.5703125, "loss/jsd": 0.0, "loss/logits": 0.30563026666641235, "step": 750 }, { "epoch": 0.047, "grad_norm": 3.25, "grad_norm_var": 0.11814676920572917, "learning_rate": 0.0001, "loss": 8.8969, "loss/crossentropy": 2.2508221864700317, "loss/hidden": 3.5390625, "loss/jsd": 0.0, "loss/logits": 0.2862202823162079, "step": 752 }, { "epoch": 0.047125, "grad_norm": 4.03125, "grad_norm_var": 0.13244527180989582, "learning_rate": 0.0001, "loss": 9.265, "loss/crossentropy": 2.1571661233901978, "loss/hidden": 3.640625, "loss/jsd": 0.0, "loss/logits": 0.3307356685400009, "step": 754 }, { "epoch": 0.04725, "grad_norm": 3.34375, "grad_norm_var": 0.13481343587239583, "learning_rate": 0.0001, "loss": 8.9922, "loss/crossentropy": 2.393697500228882, "loss/hidden": 3.6328125, "loss/jsd": 0.0, "loss/logits": 0.3230316936969757, "step": 756 }, { "epoch": 0.047375, "grad_norm": 3.640625, "grad_norm_var": 0.13456624348958332, "learning_rate": 0.0001, "loss": 8.9217, "loss/crossentropy": 2.249446392059326, "loss/hidden": 3.53125, "loss/jsd": 0.0, "loss/logits": 0.2918042242527008, "step": 758 }, { "epoch": 0.0475, "grad_norm": 3.75, "grad_norm_var": 0.13603413899739583, "learning_rate": 0.0001, "loss": 9.0298, "loss/crossentropy": 2.4479427337646484, "loss/hidden": 3.546875, "loss/jsd": 0.0, "loss/logits": 0.3056575655937195, "step": 760 }, { "epoch": 0.047625, "grad_norm": 3.75, "grad_norm_var": 0.12724609375, "learning_rate": 0.0001, "loss": 9.2332, "loss/crossentropy": 2.1675299406051636, "loss/hidden": 3.6796875, "loss/jsd": 0.0, "loss/logits": 0.303151935338974, "step": 762 }, { "epoch": 0.04775, "grad_norm": 3.96875, "grad_norm_var": 0.08153889973958334, "learning_rate": 0.0001, "loss": 9.0754, "loss/crossentropy": 2.5868079662323, "loss/hidden": 3.59375, "loss/jsd": 0.0, "loss/logits": 0.2945869415998459, "step": 764 }, { "epoch": 0.047875, "grad_norm": 3.609375, "grad_norm_var": 0.07388916015625, "learning_rate": 0.0001, "loss": 9.141, "loss/crossentropy": 2.535553216934204, "loss/hidden": 3.7109375, "loss/jsd": 0.0, "loss/logits": 0.32601119577884674, "step": 766 }, { "epoch": 0.048, "grad_norm": 4.0625, "grad_norm_var": 0.05016276041666667, "learning_rate": 0.0001, "loss": 9.2516, "loss/crossentropy": 2.5762476921081543, "loss/hidden": 3.640625, "loss/jsd": 0.0, "loss/logits": 0.34192636609077454, "step": 768 }, { "epoch": 0.048125, "grad_norm": 3.921875, "grad_norm_var": 0.0440582275390625, "learning_rate": 0.0001, "loss": 9.0968, "loss/crossentropy": 2.402553081512451, "loss/hidden": 3.6484375, "loss/jsd": 0.0, "loss/logits": 0.3043902814388275, "step": 770 }, { "epoch": 0.04825, "grad_norm": 3.3125, "grad_norm_var": 0.042801920572916666, "learning_rate": 0.0001, "loss": 9.0553, "loss/crossentropy": 2.4971920251846313, "loss/hidden": 3.578125, "loss/jsd": 0.0, "loss/logits": 0.30807921290397644, "step": 772 }, { "epoch": 0.048375, "grad_norm": 3.625, "grad_norm_var": 0.043843587239583336, "learning_rate": 0.0001, "loss": 9.1797, "loss/crossentropy": 2.2825552225112915, "loss/hidden": 3.6015625, "loss/jsd": 0.0, "loss/logits": 0.31863027811050415, "step": 774 }, { "epoch": 0.0485, "grad_norm": 3.625, "grad_norm_var": 0.04838765462239583, "learning_rate": 0.0001, "loss": 8.9698, "loss/crossentropy": 2.516672134399414, "loss/hidden": 3.6875, "loss/jsd": 0.0, "loss/logits": 0.31801968812942505, "step": 776 }, { "epoch": 0.048625, "grad_norm": 4.125, "grad_norm_var": 0.06825764973958333, "learning_rate": 0.0001, "loss": 8.9749, "loss/crossentropy": 2.495086908340454, "loss/hidden": 3.5078125, "loss/jsd": 0.0, "loss/logits": 0.31983429193496704, "step": 778 }, { "epoch": 0.04875, "grad_norm": 4.28125, "grad_norm_var": 0.12460530598958333, "learning_rate": 0.0001, "loss": 9.1932, "loss/crossentropy": 2.3892232179641724, "loss/hidden": 3.625, "loss/jsd": 0.0, "loss/logits": 0.3304327577352524, "step": 780 }, { "epoch": 0.048875, "grad_norm": 3.3125, "grad_norm_var": 0.17346598307291666, "learning_rate": 0.0001, "loss": 9.0205, "loss/crossentropy": 2.4275970458984375, "loss/hidden": 3.5546875, "loss/jsd": 0.0, "loss/logits": 0.29281261563301086, "step": 782 }, { "epoch": 0.049, "grad_norm": 3.90625, "grad_norm_var": 0.16988525390625, "learning_rate": 0.0001, "loss": 9.0345, "loss/crossentropy": 2.147684335708618, "loss/hidden": 3.6328125, "loss/jsd": 0.0, "loss/logits": 0.30130012333393097, "step": 784 }, { "epoch": 0.049125, "grad_norm": 3.53125, "grad_norm_var": 0.1740234375, "learning_rate": 0.0001, "loss": 9.1809, "loss/crossentropy": 2.387849450111389, "loss/hidden": 3.6328125, "loss/jsd": 0.0, "loss/logits": 0.32018375396728516, "step": 786 }, { "epoch": 0.04925, "grad_norm": 3.59375, "grad_norm_var": 0.16553446451822917, "learning_rate": 0.0001, "loss": 8.9704, "loss/crossentropy": 2.1901475191116333, "loss/hidden": 3.6328125, "loss/jsd": 0.0, "loss/logits": 0.2830745130777359, "step": 788 }, { "epoch": 0.049375, "grad_norm": 3.390625, "grad_norm_var": 0.17476806640625, "learning_rate": 0.0001, "loss": 9.0056, "loss/crossentropy": 2.476477026939392, "loss/hidden": 3.5859375, "loss/jsd": 0.0, "loss/logits": 0.30152270197868347, "step": 790 }, { "epoch": 0.0495, "grad_norm": 3.75, "grad_norm_var": 0.16788736979166666, "learning_rate": 0.0001, "loss": 9.2374, "loss/crossentropy": 2.4895143508911133, "loss/hidden": 3.6015625, "loss/jsd": 0.0, "loss/logits": 0.32273176312446594, "step": 792 }, { "epoch": 0.049625, "grad_norm": 3.390625, "grad_norm_var": 0.16460673014322916, "learning_rate": 0.0001, "loss": 8.8151, "loss/crossentropy": 2.1444047689437866, "loss/hidden": 3.453125, "loss/jsd": 0.0, "loss/logits": 0.2969019412994385, "step": 794 }, { "epoch": 0.04975, "grad_norm": 3.390625, "grad_norm_var": 0.06581624348958333, "learning_rate": 0.0001, "loss": 9.0359, "loss/crossentropy": 2.314146399497986, "loss/hidden": 3.4296875, "loss/jsd": 0.0, "loss/logits": 0.297823429107666, "step": 796 }, { "epoch": 0.049875, "grad_norm": 3.484375, "grad_norm_var": 0.046223958333333336, "learning_rate": 0.0001, "loss": 8.9617, "loss/crossentropy": 2.1535879373550415, "loss/hidden": 3.5546875, "loss/jsd": 0.0, "loss/logits": 0.29098525643348694, "step": 798 }, { "epoch": 0.05, "grad_norm": 3.59375, "grad_norm_var": 0.04029032389322917, "learning_rate": 0.0001, "loss": 8.9443, "loss/crossentropy": 2.5792254209518433, "loss/hidden": 3.5859375, "loss/jsd": 0.0, "loss/logits": 0.30909422039985657, "step": 800 }, { "epoch": 0.050125, "grad_norm": 3.671875, "grad_norm_var": 0.041258748372395834, "learning_rate": 0.0001, "loss": 8.9077, "loss/crossentropy": 2.387328624725342, "loss/hidden": 3.5859375, "loss/jsd": 0.0, "loss/logits": 0.35100018978118896, "step": 802 }, { "epoch": 0.05025, "grad_norm": 3.484375, "grad_norm_var": 0.0410308837890625, "learning_rate": 0.0001, "loss": 8.9889, "loss/crossentropy": 2.5987643003463745, "loss/hidden": 3.46875, "loss/jsd": 0.0, "loss/logits": 0.2944849133491516, "step": 804 }, { "epoch": 0.050375, "grad_norm": 3.46875, "grad_norm_var": 0.03965555826822917, "learning_rate": 0.0001, "loss": 8.688, "loss/crossentropy": 2.291478753089905, "loss/hidden": 3.484375, "loss/jsd": 0.0, "loss/logits": 0.3056093603372574, "step": 806 }, { "epoch": 0.0505, "grad_norm": 3.6875, "grad_norm_var": 0.030744425455729165, "learning_rate": 0.0001, "loss": 8.8988, "loss/crossentropy": 2.281537890434265, "loss/hidden": 3.6171875, "loss/jsd": 0.0, "loss/logits": 0.31349045038223267, "step": 808 }, { "epoch": 0.050625, "grad_norm": 3.546875, "grad_norm_var": 0.026634724934895833, "learning_rate": 0.0001, "loss": 9.0642, "loss/crossentropy": 2.16494482755661, "loss/hidden": 3.4375, "loss/jsd": 0.0, "loss/logits": 0.307197168469429, "step": 810 }, { "epoch": 0.05075, "grad_norm": 3.75, "grad_norm_var": 0.019953409830729168, "learning_rate": 0.0001, "loss": 8.9202, "loss/crossentropy": 2.296713352203369, "loss/hidden": 3.5, "loss/jsd": 0.0, "loss/logits": 0.30054841935634613, "step": 812 }, { "epoch": 0.050875, "grad_norm": 3.640625, "grad_norm_var": 0.021012369791666666, "learning_rate": 0.0001, "loss": 8.8666, "loss/crossentropy": 2.0488401055336, "loss/hidden": 3.515625, "loss/jsd": 0.0, "loss/logits": 0.2681543007493019, "step": 814 }, { "epoch": 0.051, "grad_norm": 3.75, "grad_norm_var": 0.02427978515625, "learning_rate": 0.0001, "loss": 8.9113, "loss/crossentropy": 2.1317135095596313, "loss/hidden": 3.484375, "loss/jsd": 0.0, "loss/logits": 0.2823093831539154, "step": 816 }, { "epoch": 0.051125, "grad_norm": 3.640625, "grad_norm_var": 0.03144124348958333, "learning_rate": 0.0001, "loss": 8.9404, "loss/crossentropy": 2.3469539880752563, "loss/hidden": 3.5703125, "loss/jsd": 0.0, "loss/logits": 0.30824559926986694, "step": 818 }, { "epoch": 0.05125, "grad_norm": 3.921875, "grad_norm_var": 0.05701395670572917, "learning_rate": 0.0001, "loss": 9.108, "loss/crossentropy": 2.4571563005447388, "loss/hidden": 3.6171875, "loss/jsd": 0.0, "loss/logits": 0.31656327843666077, "step": 820 }, { "epoch": 0.051375, "grad_norm": 3.65625, "grad_norm_var": 0.053141276041666664, "learning_rate": 0.0001, "loss": 9.0717, "loss/crossentropy": 2.3994951248168945, "loss/hidden": 3.5390625, "loss/jsd": 0.0, "loss/logits": 0.3029633164405823, "step": 822 }, { "epoch": 0.0515, "grad_norm": 3.6875, "grad_norm_var": 0.051985677083333334, "learning_rate": 0.0001, "loss": 8.9063, "loss/crossentropy": 2.2581586837768555, "loss/hidden": 3.4921875, "loss/jsd": 0.0, "loss/logits": 0.30481448769569397, "step": 824 }, { "epoch": 0.051625, "grad_norm": 3.125, "grad_norm_var": 0.07155659993489584, "learning_rate": 0.0001, "loss": 8.8898, "loss/crossentropy": 2.071722984313965, "loss/hidden": 3.5859375, "loss/jsd": 0.0, "loss/logits": 0.2941209524869919, "step": 826 }, { "epoch": 0.05175, "grad_norm": 3.625, "grad_norm_var": 0.0892578125, "learning_rate": 0.0001, "loss": 9.0358, "loss/crossentropy": 2.5443246364593506, "loss/hidden": 3.4375, "loss/jsd": 0.0, "loss/logits": 0.29996325075626373, "step": 828 }, { "epoch": 0.051875, "grad_norm": 3.40625, "grad_norm_var": 0.09868062337239583, "learning_rate": 0.0001, "loss": 8.9568, "loss/crossentropy": 2.384023070335388, "loss/hidden": 3.5, "loss/jsd": 0.0, "loss/logits": 0.31264999508857727, "step": 830 }, { "epoch": 0.052, "grad_norm": 3.953125, "grad_norm_var": 0.0974609375, "learning_rate": 0.0001, "loss": 8.9013, "loss/crossentropy": 2.4286372661590576, "loss/hidden": 3.515625, "loss/jsd": 0.0, "loss/logits": 0.32337459921836853, "step": 832 }, { "epoch": 0.052125, "grad_norm": 3.71875, "grad_norm_var": 0.08315327962239584, "learning_rate": 0.0001, "loss": 8.7277, "loss/crossentropy": 2.1722983717918396, "loss/hidden": 3.6484375, "loss/jsd": 0.0, "loss/logits": 0.3173587769269943, "step": 834 }, { "epoch": 0.05225, "grad_norm": 4.03125, "grad_norm_var": 0.07082417805989584, "learning_rate": 0.0001, "loss": 9.0322, "loss/crossentropy": 2.6337625980377197, "loss/hidden": 3.640625, "loss/jsd": 0.0, "loss/logits": 0.30448000133037567, "step": 836 }, { "epoch": 0.052375, "grad_norm": 3.53125, "grad_norm_var": 0.0727203369140625, "learning_rate": 0.0001, "loss": 8.9959, "loss/crossentropy": 2.303470253944397, "loss/hidden": 3.5625, "loss/jsd": 0.0, "loss/logits": 0.318126916885376, "step": 838 }, { "epoch": 0.0525, "grad_norm": 3.734375, "grad_norm_var": 0.0729888916015625, "learning_rate": 0.0001, "loss": 8.7451, "loss/crossentropy": 2.295042037963867, "loss/hidden": 3.625, "loss/jsd": 0.0, "loss/logits": 0.30773746967315674, "step": 840 }, { "epoch": 0.052625, "grad_norm": 3.0625, "grad_norm_var": 0.0851226806640625, "learning_rate": 0.0001, "loss": 8.6922, "loss/crossentropy": 2.060616612434387, "loss/hidden": 3.609375, "loss/jsd": 0.0, "loss/logits": 0.29626762866973877, "step": 842 }, { "epoch": 0.05275, "grad_norm": 3.671875, "grad_norm_var": 0.070556640625, "learning_rate": 0.0001, "loss": 8.9668, "loss/crossentropy": 2.2909332513809204, "loss/hidden": 3.65625, "loss/jsd": 0.0, "loss/logits": 0.2921592891216278, "step": 844 }, { "epoch": 0.052875, "grad_norm": 4.21875, "grad_norm_var": 0.092822265625, "learning_rate": 0.0001, "loss": 9.1657, "loss/crossentropy": 2.5100677013397217, "loss/hidden": 3.5390625, "loss/jsd": 0.0, "loss/logits": 0.3154117166996002, "step": 846 }, { "epoch": 0.053, "grad_norm": 3.546875, "grad_norm_var": 0.09158528645833333, "learning_rate": 0.0001, "loss": 9.1623, "loss/crossentropy": 2.3510499000549316, "loss/hidden": 3.5546875, "loss/jsd": 0.0, "loss/logits": 0.28174377977848053, "step": 848 }, { "epoch": 0.053125, "grad_norm": 3.5625, "grad_norm_var": 0.08963216145833333, "learning_rate": 0.0001, "loss": 8.8411, "loss/crossentropy": 2.361242413520813, "loss/hidden": 3.5390625, "loss/jsd": 0.0, "loss/logits": 0.2962343841791153, "step": 850 }, { "epoch": 0.05325, "grad_norm": 3.40625, "grad_norm_var": 0.08534749348958333, "learning_rate": 0.0001, "loss": 8.8426, "loss/crossentropy": 1.9918023943901062, "loss/hidden": 3.625, "loss/jsd": 0.0, "loss/logits": 0.26788248866796494, "step": 852 }, { "epoch": 0.053375, "grad_norm": 3.25, "grad_norm_var": 0.0908111572265625, "learning_rate": 0.0001, "loss": 8.9666, "loss/crossentropy": 2.6088958978652954, "loss/hidden": 3.5625, "loss/jsd": 0.0, "loss/logits": 0.3181132972240448, "step": 854 }, { "epoch": 0.0535, "grad_norm": 3.8125, "grad_norm_var": 0.09516499837239584, "learning_rate": 0.0001, "loss": 8.8605, "loss/crossentropy": 2.1732386350631714, "loss/hidden": 3.5859375, "loss/jsd": 0.0, "loss/logits": 0.3017688989639282, "step": 856 }, { "epoch": 0.053625, "grad_norm": 3.578125, "grad_norm_var": 0.080029296875, "learning_rate": 0.0001, "loss": 9.0765, "loss/crossentropy": 2.3053905963897705, "loss/hidden": 3.34375, "loss/jsd": 0.0, "loss/logits": 0.28416720032691956, "step": 858 }, { "epoch": 0.05375, "grad_norm": 3.53125, "grad_norm_var": 0.08489481608072917, "learning_rate": 0.0001, "loss": 8.9506, "loss/crossentropy": 2.3922606706619263, "loss/hidden": 3.5234375, "loss/jsd": 0.0, "loss/logits": 0.3041910231113434, "step": 860 }, { "epoch": 0.053875, "grad_norm": 3.625, "grad_norm_var": 0.059326171875, "learning_rate": 0.0001, "loss": 8.9744, "loss/crossentropy": 2.2312803864479065, "loss/hidden": 3.5234375, "loss/jsd": 0.0, "loss/logits": 0.28423628211021423, "step": 862 }, { "epoch": 0.054, "grad_norm": 3.5625, "grad_norm_var": 0.053938802083333334, "learning_rate": 0.0001, "loss": 8.8504, "loss/crossentropy": 2.4400585889816284, "loss/hidden": 3.65625, "loss/jsd": 0.0, "loss/logits": 0.32312142848968506, "step": 864 }, { "epoch": 0.054125, "grad_norm": 3.453125, "grad_norm_var": 0.06122639973958333, "learning_rate": 0.0001, "loss": 8.7439, "loss/crossentropy": 2.16322124004364, "loss/hidden": 3.5703125, "loss/jsd": 0.0, "loss/logits": 0.3096587359905243, "step": 866 }, { "epoch": 0.05425, "grad_norm": 3.59375, "grad_norm_var": 0.05967508951822917, "learning_rate": 0.0001, "loss": 8.7824, "loss/crossentropy": 2.2714940309524536, "loss/hidden": 3.453125, "loss/jsd": 0.0, "loss/logits": 0.31225262582302094, "step": 868 }, { "epoch": 0.054375, "grad_norm": 3.59375, "grad_norm_var": 0.061335245768229164, "learning_rate": 0.0001, "loss": 8.9188, "loss/crossentropy": 2.3233022689819336, "loss/hidden": 3.515625, "loss/jsd": 0.0, "loss/logits": 0.29183535277843475, "step": 870 }, { "epoch": 0.0545, "grad_norm": 3.46875, "grad_norm_var": 0.060628255208333336, "learning_rate": 0.0001, "loss": 8.8018, "loss/crossentropy": 2.1473275423049927, "loss/hidden": 3.5859375, "loss/jsd": 0.0, "loss/logits": 0.302778959274292, "step": 872 }, { "epoch": 0.054625, "grad_norm": 3.5, "grad_norm_var": 0.04723307291666667, "learning_rate": 0.0001, "loss": 8.7291, "loss/crossentropy": 2.10613477230072, "loss/hidden": 3.390625, "loss/jsd": 0.0, "loss/logits": 0.2919985055923462, "step": 874 }, { "epoch": 0.05475, "grad_norm": 3.828125, "grad_norm_var": 0.04791259765625, "learning_rate": 0.0001, "loss": 8.8805, "loss/crossentropy": 2.391135811805725, "loss/hidden": 3.484375, "loss/jsd": 0.0, "loss/logits": 0.303908035159111, "step": 876 }, { "epoch": 0.054875, "grad_norm": 4.21875, "grad_norm_var": 0.07213134765625, "learning_rate": 0.0001, "loss": 8.9757, "loss/crossentropy": 2.191763758659363, "loss/hidden": 3.453125, "loss/jsd": 0.0, "loss/logits": 0.29602205753326416, "step": 878 }, { "epoch": 0.055, "grad_norm": 3.09375, "grad_norm_var": 0.0880859375, "learning_rate": 0.0001, "loss": 8.9226, "loss/crossentropy": 2.3897584676742554, "loss/hidden": 3.484375, "loss/jsd": 0.0, "loss/logits": 0.2984778434038162, "step": 880 }, { "epoch": 0.055125, "grad_norm": 3.53125, "grad_norm_var": 0.10453999837239583, "learning_rate": 0.0001, "loss": 8.6909, "loss/crossentropy": 2.2274895906448364, "loss/hidden": 3.4921875, "loss/jsd": 0.0, "loss/logits": 0.3162970691919327, "step": 882 }, { "epoch": 0.05525, "grad_norm": 3.5, "grad_norm_var": 0.10286051432291667, "learning_rate": 0.0001, "loss": 8.7805, "loss/crossentropy": 2.238261342048645, "loss/hidden": 3.515625, "loss/jsd": 0.0, "loss/logits": 0.3091089427471161, "step": 884 }, { "epoch": 0.055375, "grad_norm": 3.578125, "grad_norm_var": 0.09251200358072917, "learning_rate": 0.0001, "loss": 8.8335, "loss/crossentropy": 2.398587703704834, "loss/hidden": 3.5, "loss/jsd": 0.0, "loss/logits": 0.30020007491111755, "step": 886 }, { "epoch": 0.0555, "grad_norm": 3.125, "grad_norm_var": 0.11856180826822917, "learning_rate": 0.0001, "loss": 8.6695, "loss/crossentropy": 2.362698554992676, "loss/hidden": 3.5859375, "loss/jsd": 0.0, "loss/logits": 0.3049694448709488, "step": 888 }, { "epoch": 0.055625, "grad_norm": 4.09375, "grad_norm_var": 0.13857014973958334, "learning_rate": 0.0001, "loss": 8.8959, "loss/crossentropy": 2.483735680580139, "loss/hidden": 3.578125, "loss/jsd": 0.0, "loss/logits": 0.3071689158678055, "step": 890 }, { "epoch": 0.05575, "grad_norm": 3.734375, "grad_norm_var": 0.1455230712890625, "learning_rate": 0.0001, "loss": 9.1407, "loss/crossentropy": 2.6666314601898193, "loss/hidden": 3.5859375, "loss/jsd": 0.0, "loss/logits": 0.3311910331249237, "step": 892 }, { "epoch": 0.055875, "grad_norm": 3.703125, "grad_norm_var": 0.1199615478515625, "learning_rate": 0.0001, "loss": 8.8907, "loss/crossentropy": 2.3543641567230225, "loss/hidden": 3.5703125, "loss/jsd": 0.0, "loss/logits": 0.30659469962120056, "step": 894 }, { "epoch": 0.056, "grad_norm": 3.546875, "grad_norm_var": 0.11165364583333333, "learning_rate": 0.0001, "loss": 8.542, "loss/crossentropy": 2.442333459854126, "loss/hidden": 3.515625, "loss/jsd": 0.0, "loss/logits": 0.2963842451572418, "step": 896 }, { "epoch": 0.056125, "grad_norm": 3.25, "grad_norm_var": 0.09463602701822917, "learning_rate": 0.0001, "loss": 8.8421, "loss/crossentropy": 2.5169384479522705, "loss/hidden": 3.609375, "loss/jsd": 0.0, "loss/logits": 0.31081072986125946, "step": 898 }, { "epoch": 0.05625, "grad_norm": 3.609375, "grad_norm_var": 0.09204813639322916, "learning_rate": 0.0001, "loss": 8.8349, "loss/crossentropy": 2.378996729850769, "loss/hidden": 3.5546875, "loss/jsd": 0.0, "loss/logits": 0.2988849878311157, "step": 900 }, { "epoch": 0.056375, "grad_norm": 4.0, "grad_norm_var": 0.10779520670572916, "learning_rate": 0.0001, "loss": 8.6611, "loss/crossentropy": 2.0278642177581787, "loss/hidden": 3.3671875, "loss/jsd": 0.0, "loss/logits": 0.26663029193878174, "step": 902 }, { "epoch": 0.0565, "grad_norm": 3.625, "grad_norm_var": 0.08515218098958334, "learning_rate": 0.0001, "loss": 8.8614, "loss/crossentropy": 2.3273751735687256, "loss/hidden": 3.4296875, "loss/jsd": 0.0, "loss/logits": 0.2889983803033829, "step": 904 }, { "epoch": 0.056625, "grad_norm": 3.46875, "grad_norm_var": 0.07197265625, "learning_rate": 0.0001, "loss": 8.8386, "loss/crossentropy": 2.0480875372886658, "loss/hidden": 3.421875, "loss/jsd": 0.0, "loss/logits": 0.30192601680755615, "step": 906 }, { "epoch": 0.05675, "grad_norm": 3.859375, "grad_norm_var": 0.05976155598958333, "learning_rate": 0.0001, "loss": 9.0464, "loss/crossentropy": 2.5360888242721558, "loss/hidden": 3.5, "loss/jsd": 0.0, "loss/logits": 0.28871724009513855, "step": 908 }, { "epoch": 0.056875, "grad_norm": 3.25, "grad_norm_var": 0.060846964518229164, "learning_rate": 0.0001, "loss": 8.9657, "loss/crossentropy": 2.59428608417511, "loss/hidden": 3.5546875, "loss/jsd": 0.0, "loss/logits": 0.3197096735239029, "step": 910 }, { "epoch": 0.057, "grad_norm": 3.46875, "grad_norm_var": 0.062027994791666666, "learning_rate": 0.0001, "loss": 8.8488, "loss/crossentropy": 2.4978867769241333, "loss/hidden": 3.640625, "loss/jsd": 0.0, "loss/logits": 0.38725124299526215, "step": 912 }, { "epoch": 0.057125, "grad_norm": 3.640625, "grad_norm_var": 0.060347493489583334, "learning_rate": 0.0001, "loss": 9.0475, "loss/crossentropy": 2.269066333770752, "loss/hidden": 3.4140625, "loss/jsd": 0.0, "loss/logits": 0.2860633432865143, "step": 914 }, { "epoch": 0.05725, "grad_norm": 3.03125, "grad_norm_var": 0.07659505208333334, "learning_rate": 0.0001, "loss": 8.6945, "loss/crossentropy": 2.3426260948181152, "loss/hidden": 3.4921875, "loss/jsd": 0.0, "loss/logits": 0.2847317010164261, "step": 916 }, { "epoch": 0.057375, "grad_norm": 3.53125, "grad_norm_var": 0.055562337239583336, "learning_rate": 0.0001, "loss": 8.7413, "loss/crossentropy": 2.3890886306762695, "loss/hidden": 3.390625, "loss/jsd": 0.0, "loss/logits": 0.2951260507106781, "step": 918 }, { "epoch": 0.0575, "grad_norm": 3.46875, "grad_norm_var": 0.05225321451822917, "learning_rate": 0.0001, "loss": 8.726, "loss/crossentropy": 2.324913740158081, "loss/hidden": 3.375, "loss/jsd": 0.0, "loss/logits": 0.2819140702486038, "step": 920 }, { "epoch": 0.057625, "grad_norm": 4.375, "grad_norm_var": 0.10100809733072917, "learning_rate": 0.0001, "loss": 8.8514, "loss/crossentropy": 2.4282515048980713, "loss/hidden": 3.53125, "loss/jsd": 0.0, "loss/logits": 0.3256274312734604, "step": 922 }, { "epoch": 0.05775, "grad_norm": 3.46875, "grad_norm_var": 0.10305582682291667, "learning_rate": 0.0001, "loss": 8.6487, "loss/crossentropy": 2.476005434989929, "loss/hidden": 3.578125, "loss/jsd": 0.0, "loss/logits": 0.3199878931045532, "step": 924 }, { "epoch": 0.057875, "grad_norm": 3.71875, "grad_norm_var": 0.13284098307291667, "learning_rate": 0.0001, "loss": 8.8988, "loss/crossentropy": 2.469366192817688, "loss/hidden": 3.5546875, "loss/jsd": 0.0, "loss/logits": 0.2891136407852173, "step": 926 }, { "epoch": 0.058, "grad_norm": 3.234375, "grad_norm_var": 0.14055989583333334, "learning_rate": 0.0001, "loss": 8.7523, "loss/crossentropy": 2.348281979560852, "loss/hidden": 3.484375, "loss/jsd": 0.0, "loss/logits": 0.27692942321300507, "step": 928 }, { "epoch": 0.058125, "grad_norm": 3.515625, "grad_norm_var": 0.14073893229166667, "learning_rate": 0.0001, "loss": 8.5853, "loss/crossentropy": 2.1680904626846313, "loss/hidden": 3.578125, "loss/jsd": 0.0, "loss/logits": 0.29489417374134064, "step": 930 }, { "epoch": 0.05825, "grad_norm": 3.65625, "grad_norm_var": 0.116650390625, "learning_rate": 0.0001, "loss": 8.7008, "loss/crossentropy": 2.139521837234497, "loss/hidden": 3.4453125, "loss/jsd": 0.0, "loss/logits": 0.263563334941864, "step": 932 }, { "epoch": 0.058375, "grad_norm": 3.265625, "grad_norm_var": 0.13267822265625, "learning_rate": 0.0001, "loss": 8.8043, "loss/crossentropy": 2.522794246673584, "loss/hidden": 3.5390625, "loss/jsd": 0.0, "loss/logits": 0.3044002056121826, "step": 934 }, { "epoch": 0.0585, "grad_norm": 3.765625, "grad_norm_var": 0.12968343098958332, "learning_rate": 0.0001, "loss": 8.9035, "loss/crossentropy": 2.4657033681869507, "loss/hidden": 3.5390625, "loss/jsd": 0.0, "loss/logits": 0.30104976892471313, "step": 936 }, { "epoch": 0.058625, "grad_norm": 5.53125, "grad_norm_var": 0.3282389322916667, "learning_rate": 0.0001, "loss": 8.9486, "loss/crossentropy": 2.3463146686553955, "loss/hidden": 3.53125, "loss/jsd": 0.0, "loss/logits": 0.303235799074173, "step": 938 }, { "epoch": 0.05875, "grad_norm": 3.515625, "grad_norm_var": 0.3243479410807292, "learning_rate": 0.0001, "loss": 8.9808, "loss/crossentropy": 2.2249823808670044, "loss/hidden": 3.3984375, "loss/jsd": 0.0, "loss/logits": 0.29463791847229004, "step": 940 }, { "epoch": 0.058875, "grad_norm": 3.25, "grad_norm_var": 0.30130106608072915, "learning_rate": 0.0001, "loss": 8.67, "loss/crossentropy": 2.334540367126465, "loss/hidden": 3.46875, "loss/jsd": 0.0, "loss/logits": 0.30720797181129456, "step": 942 }, { "epoch": 0.059, "grad_norm": 3.921875, "grad_norm_var": 0.2997029622395833, "learning_rate": 0.0001, "loss": 8.7021, "loss/crossentropy": 2.498934745788574, "loss/hidden": 3.46875, "loss/jsd": 0.0, "loss/logits": 0.2946365475654602, "step": 944 }, { "epoch": 0.059125, "grad_norm": 3.71875, "grad_norm_var": 0.2964508056640625, "learning_rate": 0.0001, "loss": 8.7885, "loss/crossentropy": 2.0534738898277283, "loss/hidden": 3.546875, "loss/jsd": 0.0, "loss/logits": 0.27286672592163086, "step": 946 }, { "epoch": 0.05925, "grad_norm": 3.125, "grad_norm_var": 0.3176066080729167, "learning_rate": 0.0001, "loss": 8.8829, "loss/crossentropy": 2.362457275390625, "loss/hidden": 3.4375, "loss/jsd": 0.0, "loss/logits": 0.2855580151081085, "step": 948 }, { "epoch": 0.059375, "grad_norm": 3.6875, "grad_norm_var": 0.30732320149739584, "learning_rate": 0.0001, "loss": 8.7018, "loss/crossentropy": 2.1891767382621765, "loss/hidden": 3.5390625, "loss/jsd": 0.0, "loss/logits": 0.31324711441993713, "step": 950 }, { "epoch": 0.0595, "grad_norm": 3.734375, "grad_norm_var": 0.31344401041666664, "learning_rate": 0.0001, "loss": 8.8816, "loss/crossentropy": 2.3532931804656982, "loss/hidden": 3.390625, "loss/jsd": 0.0, "loss/logits": 0.30756065249443054, "step": 952 }, { "epoch": 0.059625, "grad_norm": 3.265625, "grad_norm_var": 0.06813863118489584, "learning_rate": 0.0001, "loss": 8.7884, "loss/crossentropy": 2.365698456764221, "loss/hidden": 3.546875, "loss/jsd": 0.0, "loss/logits": 0.3289715647697449, "step": 954 }, { "epoch": 0.05975, "grad_norm": 3.671875, "grad_norm_var": 0.0709381103515625, "learning_rate": 0.0001, "loss": 8.9138, "loss/crossentropy": 2.614629626274109, "loss/hidden": 3.5703125, "loss/jsd": 0.0, "loss/logits": 0.30402082204818726, "step": 956 }, { "epoch": 0.059875, "grad_norm": 3.59375, "grad_norm_var": 0.07691650390625, "learning_rate": 0.0001, "loss": 8.663, "loss/crossentropy": 2.109455645084381, "loss/hidden": 3.53125, "loss/jsd": 0.0, "loss/logits": 0.2772120535373688, "step": 958 }, { "epoch": 0.06, "grad_norm": 3.765625, "grad_norm_var": 0.0668121337890625, "learning_rate": 0.0001, "loss": 8.8741, "loss/crossentropy": 2.380413055419922, "loss/hidden": 3.3671875, "loss/jsd": 0.0, "loss/logits": 0.3016493618488312, "step": 960 }, { "epoch": 0.060125, "grad_norm": 3.5, "grad_norm_var": 0.06373291015625, "learning_rate": 0.0001, "loss": 8.9362, "loss/crossentropy": 2.506435751914978, "loss/hidden": 3.578125, "loss/jsd": 0.0, "loss/logits": 0.335014671087265, "step": 962 }, { "epoch": 0.06025, "grad_norm": 3.40625, "grad_norm_var": 0.0563140869140625, "learning_rate": 0.0001, "loss": 8.6434, "loss/crossentropy": 1.9750906229019165, "loss/hidden": 3.359375, "loss/jsd": 0.0, "loss/logits": 0.2565095126628876, "step": 964 }, { "epoch": 0.060375, "grad_norm": 3.75, "grad_norm_var": 0.05491434733072917, "learning_rate": 0.0001, "loss": 8.7565, "loss/crossentropy": 2.2668718099594116, "loss/hidden": 3.46875, "loss/jsd": 0.0, "loss/logits": 0.2805168777704239, "step": 966 }, { "epoch": 0.0605, "grad_norm": 3.21875, "grad_norm_var": 0.044266764322916666, "learning_rate": 0.0001, "loss": 8.718, "loss/crossentropy": 2.2082256078720093, "loss/hidden": 3.421875, "loss/jsd": 0.0, "loss/logits": 0.2930946350097656, "step": 968 }, { "epoch": 0.060625, "grad_norm": 3.84375, "grad_norm_var": 0.0510894775390625, "learning_rate": 0.0001, "loss": 8.8922, "loss/crossentropy": 2.4699355363845825, "loss/hidden": 3.5234375, "loss/jsd": 0.0, "loss/logits": 0.31211861968040466, "step": 970 }, { "epoch": 0.06075, "grad_norm": 3.609375, "grad_norm_var": 0.07473856608072917, "learning_rate": 0.0001, "loss": 8.6174, "loss/crossentropy": 2.160146117210388, "loss/hidden": 3.4765625, "loss/jsd": 0.0, "loss/logits": 0.28759919106960297, "step": 972 }, { "epoch": 0.060875, "grad_norm": 3.09375, "grad_norm_var": 0.07998758951822917, "learning_rate": 0.0001, "loss": 8.9576, "loss/crossentropy": 2.245842933654785, "loss/hidden": 3.40625, "loss/jsd": 0.0, "loss/logits": 0.28311386704444885, "step": 974 }, { "epoch": 0.061, "grad_norm": 3.65625, "grad_norm_var": 0.07842508951822917, "learning_rate": 0.0001, "loss": 8.7692, "loss/crossentropy": 2.3998042345046997, "loss/hidden": 3.453125, "loss/jsd": 0.0, "loss/logits": 0.29600852727890015, "step": 976 }, { "epoch": 0.061125, "grad_norm": 3.53125, "grad_norm_var": 0.07856343587239584, "learning_rate": 0.0001, "loss": 8.7165, "loss/crossentropy": 2.4230899810791016, "loss/hidden": 3.4921875, "loss/jsd": 0.0, "loss/logits": 0.2921423017978668, "step": 978 }, { "epoch": 0.06125, "grad_norm": 3.203125, "grad_norm_var": 0.08714090983072917, "learning_rate": 0.0001, "loss": 8.5009, "loss/crossentropy": 2.105591118335724, "loss/hidden": 3.578125, "loss/jsd": 0.0, "loss/logits": 0.2892334759235382, "step": 980 }, { "epoch": 0.061375, "grad_norm": 3.671875, "grad_norm_var": 0.08603515625, "learning_rate": 0.0001, "loss": 8.8291, "loss/crossentropy": 2.342753052711487, "loss/hidden": 3.3671875, "loss/jsd": 0.0, "loss/logits": 0.2922486811876297, "step": 982 }, { "epoch": 0.0615, "grad_norm": 3.046875, "grad_norm_var": 0.09026285807291666, "learning_rate": 0.0001, "loss": 8.7629, "loss/crossentropy": 2.5545257329940796, "loss/hidden": 3.453125, "loss/jsd": 0.0, "loss/logits": 0.29155534505844116, "step": 984 }, { "epoch": 0.061625, "grad_norm": 3.53125, "grad_norm_var": 0.08010152180989584, "learning_rate": 0.0001, "loss": 8.635, "loss/crossentropy": 2.390330672264099, "loss/hidden": 3.4765625, "loss/jsd": 0.0, "loss/logits": 0.28914259374141693, "step": 986 }, { "epoch": 0.06175, "grad_norm": 3.109375, "grad_norm_var": 0.039484659830729164, "learning_rate": 0.0001, "loss": 8.6355, "loss/crossentropy": 2.031624495983124, "loss/hidden": 3.4375, "loss/jsd": 0.0, "loss/logits": 0.29533734917640686, "step": 988 }, { "epoch": 0.061875, "grad_norm": 3.625, "grad_norm_var": 0.04781494140625, "learning_rate": 0.0001, "loss": 8.8029, "loss/crossentropy": 2.1612058877944946, "loss/hidden": 3.5625, "loss/jsd": 0.0, "loss/logits": 0.2922551929950714, "step": 990 }, { "epoch": 0.062, "grad_norm": 3.15625, "grad_norm_var": 0.06575520833333333, "learning_rate": 0.0001, "loss": 8.8484, "loss/crossentropy": 2.2686573266983032, "loss/hidden": 3.421875, "loss/jsd": 0.0, "loss/logits": 0.28837865591049194, "step": 992 }, { "epoch": 0.062125, "grad_norm": 3.953125, "grad_norm_var": 0.08382161458333333, "learning_rate": 0.0001, "loss": 8.9572, "loss/crossentropy": 2.4839664697647095, "loss/hidden": 3.5703125, "loss/jsd": 0.0, "loss/logits": 0.30035223066806793, "step": 994 }, { "epoch": 0.06225, "grad_norm": 3.328125, "grad_norm_var": 0.0792388916015625, "learning_rate": 0.0001, "loss": 8.7334, "loss/crossentropy": 2.434647560119629, "loss/hidden": 3.4140625, "loss/jsd": 0.0, "loss/logits": 0.30409903824329376, "step": 996 }, { "epoch": 0.062375, "grad_norm": 4.09375, "grad_norm_var": 0.10009663899739583, "learning_rate": 0.0001, "loss": 8.7896, "loss/crossentropy": 2.349792718887329, "loss/hidden": 3.4296875, "loss/jsd": 0.0, "loss/logits": 0.273041769862175, "step": 998 }, { "epoch": 0.0625, "grad_norm": 2.875, "grad_norm_var": 0.1135162353515625, "learning_rate": 0.0001, "loss": 8.7517, "loss/crossentropy": 2.424883484840393, "loss/hidden": 3.390625, "loss/jsd": 0.0, "loss/logits": 0.28756730258464813, "step": 1000 }, { "epoch": 0.062625, "grad_norm": 3.1875, "grad_norm_var": 0.1218414306640625, "learning_rate": 0.0001, "loss": 8.5904, "loss/crossentropy": 2.4634835720062256, "loss/hidden": 3.453125, "loss/jsd": 0.0, "loss/logits": 0.28353893756866455, "step": 1002 }, { "epoch": 0.06275, "grad_norm": 3.421875, "grad_norm_var": 0.10761311848958334, "learning_rate": 0.0001, "loss": 8.6432, "loss/crossentropy": 2.3544020652770996, "loss/hidden": 3.453125, "loss/jsd": 0.0, "loss/logits": 0.291437104344368, "step": 1004 }, { "epoch": 0.062875, "grad_norm": 3.203125, "grad_norm_var": 0.11253255208333333, "learning_rate": 0.0001, "loss": 8.6969, "loss/crossentropy": 2.3183122873306274, "loss/hidden": 3.296875, "loss/jsd": 0.0, "loss/logits": 0.291623592376709, "step": 1006 }, { "epoch": 0.063, "grad_norm": 3.078125, "grad_norm_var": 0.10125325520833334, "learning_rate": 0.0001, "loss": 8.7425, "loss/crossentropy": 2.5724557638168335, "loss/hidden": 3.5390625, "loss/jsd": 0.0, "loss/logits": 0.3042745739221573, "step": 1008 }, { "epoch": 0.063125, "grad_norm": 3.484375, "grad_norm_var": 0.08701171875, "learning_rate": 0.0001, "loss": 8.7429, "loss/crossentropy": 2.393343210220337, "loss/hidden": 3.53125, "loss/jsd": 0.0, "loss/logits": 0.28968481719493866, "step": 1010 }, { "epoch": 0.06325, "grad_norm": 3.359375, "grad_norm_var": 0.0902740478515625, "learning_rate": 0.0001, "loss": 8.7322, "loss/crossentropy": 2.4881935119628906, "loss/hidden": 3.421875, "loss/jsd": 0.0, "loss/logits": 0.3185275048017502, "step": 1012 }, { "epoch": 0.063375, "grad_norm": 3.3125, "grad_norm_var": 0.05918680826822917, "learning_rate": 0.0001, "loss": 8.6637, "loss/crossentropy": 2.1747263073921204, "loss/hidden": 3.328125, "loss/jsd": 0.0, "loss/logits": 0.27498696744441986, "step": 1014 }, { "epoch": 0.0635, "grad_norm": 3.734375, "grad_norm_var": 0.09000651041666667, "learning_rate": 0.0001, "loss": 8.6898, "loss/crossentropy": 2.5582990646362305, "loss/hidden": 3.3515625, "loss/jsd": 0.0, "loss/logits": 0.3123309761285782, "step": 1016 }, { "epoch": 0.063625, "grad_norm": 2.96875, "grad_norm_var": 0.10186258951822917, "learning_rate": 0.0001, "loss": 8.4983, "loss/crossentropy": 2.2065166234970093, "loss/hidden": 3.484375, "loss/jsd": 0.0, "loss/logits": 0.2910768985748291, "step": 1018 }, { "epoch": 0.06375, "grad_norm": 3.171875, "grad_norm_var": 0.10444234212239584, "learning_rate": 0.0001, "loss": 8.5609, "loss/crossentropy": 2.3102041482925415, "loss/hidden": 3.40625, "loss/jsd": 0.0, "loss/logits": 0.2962404191493988, "step": 1020 }, { "epoch": 0.063875, "grad_norm": 3.171875, "grad_norm_var": 0.10598042805989584, "learning_rate": 0.0001, "loss": 8.619, "loss/crossentropy": 2.330891489982605, "loss/hidden": 3.34375, "loss/jsd": 0.0, "loss/logits": 0.301542192697525, "step": 1022 }, { "epoch": 0.064, "grad_norm": 3.390625, "grad_norm_var": 0.10001627604166667, "learning_rate": 0.0001, "loss": 8.4584, "loss/crossentropy": 2.4117361307144165, "loss/hidden": 3.5234375, "loss/jsd": 0.0, "loss/logits": 0.2964586764574051, "step": 1024 }, { "epoch": 0.064125, "grad_norm": 3.28125, "grad_norm_var": 0.0966796875, "learning_rate": 0.0001, "loss": 8.5442, "loss/crossentropy": 2.069741904735565, "loss/hidden": 3.3671875, "loss/jsd": 0.0, "loss/logits": 0.2646178603172302, "step": 1026 }, { "epoch": 0.06425, "grad_norm": 3.359375, "grad_norm_var": 0.09484049479166666, "learning_rate": 0.0001, "loss": 8.7027, "loss/crossentropy": 2.4950772523880005, "loss/hidden": 3.4609375, "loss/jsd": 0.0, "loss/logits": 0.30084407329559326, "step": 1028 }, { "epoch": 0.064375, "grad_norm": 3.46875, "grad_norm_var": 0.096923828125, "learning_rate": 0.0001, "loss": 8.6711, "loss/crossentropy": 2.3811144828796387, "loss/hidden": 3.453125, "loss/jsd": 0.0, "loss/logits": 0.2816159278154373, "step": 1030 }, { "epoch": 0.0645, "grad_norm": 3.265625, "grad_norm_var": 0.03141988118489583, "learning_rate": 0.0001, "loss": 8.5786, "loss/crossentropy": 2.146743893623352, "loss/hidden": 3.3046875, "loss/jsd": 0.0, "loss/logits": 0.2715151458978653, "step": 1032 }, { "epoch": 0.064625, "grad_norm": 3.53125, "grad_norm_var": 0.0274566650390625, "learning_rate": 0.0001, "loss": 8.6314, "loss/crossentropy": 2.453763008117676, "loss/hidden": 3.4453125, "loss/jsd": 0.0, "loss/logits": 0.28644131124019623, "step": 1034 }, { "epoch": 0.06475, "grad_norm": 3.375, "grad_norm_var": 0.024583943684895835, "learning_rate": 0.0001, "loss": 8.4581, "loss/crossentropy": 2.189074158668518, "loss/hidden": 3.4765625, "loss/jsd": 0.0, "loss/logits": 0.28300249576568604, "step": 1036 }, { "epoch": 0.064875, "grad_norm": 3.0625, "grad_norm_var": 0.038102213541666666, "learning_rate": 0.0001, "loss": 8.5097, "loss/crossentropy": 2.2738513946533203, "loss/hidden": 3.40625, "loss/jsd": 0.0, "loss/logits": 0.27797406911849976, "step": 1038 }, { "epoch": 0.065, "grad_norm": 3.3125, "grad_norm_var": 0.0379547119140625, "learning_rate": 0.0001, "loss": 8.7724, "loss/crossentropy": 2.5585055351257324, "loss/hidden": 3.5390625, "loss/jsd": 0.0, "loss/logits": 0.3508901298046112, "step": 1040 }, { "epoch": 0.065125, "grad_norm": 3.640625, "grad_norm_var": 0.04047749837239583, "learning_rate": 0.0001, "loss": 8.8154, "loss/crossentropy": 2.3115618228912354, "loss/hidden": 3.3515625, "loss/jsd": 0.0, "loss/logits": 0.2781260311603546, "step": 1042 }, { "epoch": 0.06525, "grad_norm": 3.25, "grad_norm_var": 0.0395660400390625, "learning_rate": 0.0001, "loss": 8.5646, "loss/crossentropy": 2.2015340328216553, "loss/hidden": 3.4609375, "loss/jsd": 0.0, "loss/logits": 0.29191769659519196, "step": 1044 }, { "epoch": 0.065375, "grad_norm": 3.640625, "grad_norm_var": 0.04152730305989583, "learning_rate": 0.0001, "loss": 8.7064, "loss/crossentropy": 2.4439064264297485, "loss/hidden": 3.46875, "loss/jsd": 0.0, "loss/logits": 0.31912754476070404, "step": 1046 }, { "epoch": 0.0655, "grad_norm": 3.390625, "grad_norm_var": 0.0404937744140625, "learning_rate": 0.0001, "loss": 8.7495, "loss/crossentropy": 2.611847996711731, "loss/hidden": 3.3671875, "loss/jsd": 0.0, "loss/logits": 0.31853775680065155, "step": 1048 }, { "epoch": 0.065625, "grad_norm": 3.75, "grad_norm_var": 0.04395243326822917, "learning_rate": 0.0001, "loss": 8.5285, "loss/crossentropy": 2.058500051498413, "loss/hidden": 3.296875, "loss/jsd": 0.0, "loss/logits": 0.2707058787345886, "step": 1050 }, { "epoch": 0.06575, "grad_norm": 3.453125, "grad_norm_var": 0.043355305989583336, "learning_rate": 0.0001, "loss": 8.6134, "loss/crossentropy": 2.3460679054260254, "loss/hidden": 3.4765625, "loss/jsd": 0.0, "loss/logits": 0.29806579649448395, "step": 1052 }, { "epoch": 0.065875, "grad_norm": 3.359375, "grad_norm_var": 0.028531901041666665, "learning_rate": 0.0001, "loss": 8.7906, "loss/crossentropy": 2.575559377670288, "loss/hidden": 3.421875, "loss/jsd": 0.0, "loss/logits": 0.29479941725730896, "step": 1054 }, { "epoch": 0.066, "grad_norm": 3.5, "grad_norm_var": 0.03673502604166667, "learning_rate": 0.0001, "loss": 8.573, "loss/crossentropy": 2.4485961198806763, "loss/hidden": 3.4453125, "loss/jsd": 0.0, "loss/logits": 0.3214751183986664, "step": 1056 }, { "epoch": 0.066125, "grad_norm": 3.28125, "grad_norm_var": 0.0400390625, "learning_rate": 0.0001, "loss": 8.5355, "loss/crossentropy": 2.397768259048462, "loss/hidden": 3.40625, "loss/jsd": 0.0, "loss/logits": 0.31295061111450195, "step": 1058 }, { "epoch": 0.06625, "grad_norm": 3.21875, "grad_norm_var": 0.040848795572916666, "learning_rate": 0.0001, "loss": 8.4975, "loss/crossentropy": 2.390447735786438, "loss/hidden": 3.375, "loss/jsd": 0.0, "loss/logits": 0.29537099599838257, "step": 1060 }, { "epoch": 0.066375, "grad_norm": 3.046875, "grad_norm_var": 0.04865620930989583, "learning_rate": 0.0001, "loss": 8.553, "loss/crossentropy": 2.417343854904175, "loss/hidden": 3.359375, "loss/jsd": 0.0, "loss/logits": 0.28230586647987366, "step": 1062 }, { "epoch": 0.0665, "grad_norm": 3.390625, "grad_norm_var": 0.046507771809895834, "learning_rate": 0.0001, "loss": 8.5829, "loss/crossentropy": 2.6324515342712402, "loss/hidden": 3.3046875, "loss/jsd": 0.0, "loss/logits": 0.3016352653503418, "step": 1064 }, { "epoch": 0.066625, "grad_norm": 3.453125, "grad_norm_var": 0.03913472493489583, "learning_rate": 0.0001, "loss": 8.5346, "loss/crossentropy": 2.323632597923279, "loss/hidden": 3.3984375, "loss/jsd": 0.0, "loss/logits": 0.2779388278722763, "step": 1066 }, { "epoch": 0.06675, "grad_norm": 2.953125, "grad_norm_var": 0.03640950520833333, "learning_rate": 0.0001, "loss": 8.3832, "loss/crossentropy": 2.393522083759308, "loss/hidden": 3.5078125, "loss/jsd": 0.0, "loss/logits": 0.29860424995422363, "step": 1068 }, { "epoch": 0.066875, "grad_norm": 3.4375, "grad_norm_var": 0.039839680989583334, "learning_rate": 0.0001, "loss": 8.6174, "loss/crossentropy": 2.263484001159668, "loss/hidden": 3.421875, "loss/jsd": 0.0, "loss/logits": 0.2802084982395172, "step": 1070 }, { "epoch": 0.067, "grad_norm": 3.296875, "grad_norm_var": 0.03242085774739583, "learning_rate": 0.0001, "loss": 8.4765, "loss/crossentropy": 2.1152660846710205, "loss/hidden": 3.359375, "loss/jsd": 0.0, "loss/logits": 0.2554662525653839, "step": 1072 }, { "epoch": 0.067125, "grad_norm": 3.171875, "grad_norm_var": 0.03629150390625, "learning_rate": 0.0001, "loss": 8.5434, "loss/crossentropy": 2.4824490547180176, "loss/hidden": 3.3203125, "loss/jsd": 0.0, "loss/logits": 0.28639310598373413, "step": 1074 }, { "epoch": 0.06725, "grad_norm": 3.0, "grad_norm_var": 0.033014933268229164, "learning_rate": 0.0001, "loss": 8.6253, "loss/crossentropy": 2.5830127000808716, "loss/hidden": 3.4140625, "loss/jsd": 0.0, "loss/logits": 0.3099432587623596, "step": 1076 }, { "epoch": 0.067375, "grad_norm": 3.96875, "grad_norm_var": 0.0666168212890625, "learning_rate": 0.0001, "loss": 8.7433, "loss/crossentropy": 2.580743908882141, "loss/hidden": 3.359375, "loss/jsd": 0.0, "loss/logits": 0.2936056852340698, "step": 1078 }, { "epoch": 0.0675, "grad_norm": 3.21875, "grad_norm_var": 0.07141011555989583, "learning_rate": 0.0001, "loss": 8.6721, "loss/crossentropy": 2.350903868675232, "loss/hidden": 3.3671875, "loss/jsd": 0.0, "loss/logits": 0.2968801259994507, "step": 1080 }, { "epoch": 0.067625, "grad_norm": 3.5, "grad_norm_var": 0.07302144368489584, "learning_rate": 0.0001, "loss": 8.4574, "loss/crossentropy": 2.257428526878357, "loss/hidden": 3.3984375, "loss/jsd": 0.0, "loss/logits": 0.2766515016555786, "step": 1082 }, { "epoch": 0.06775, "grad_norm": 3.265625, "grad_norm_var": 0.06972249348958333, "learning_rate": 0.0001, "loss": 8.6877, "loss/crossentropy": 2.2526296377182007, "loss/hidden": 3.40625, "loss/jsd": 0.0, "loss/logits": 0.28292417526245117, "step": 1084 }, { "epoch": 0.067875, "grad_norm": 3.5, "grad_norm_var": 0.06391499837239584, "learning_rate": 0.0001, "loss": 8.7502, "loss/crossentropy": 2.3044220209121704, "loss/hidden": 3.3125, "loss/jsd": 0.0, "loss/logits": 0.2810298800468445, "step": 1086 }, { "epoch": 0.068, "grad_norm": 3.125, "grad_norm_var": 0.06946614583333334, "learning_rate": 0.0001, "loss": 8.585, "loss/crossentropy": 2.2871402502059937, "loss/hidden": 3.3125, "loss/jsd": 0.0, "loss/logits": 0.27473995089530945, "step": 1088 }, { "epoch": 0.068125, "grad_norm": 3.890625, "grad_norm_var": 0.08155008951822916, "learning_rate": 0.0001, "loss": 8.7181, "loss/crossentropy": 2.49469530582428, "loss/hidden": 3.40625, "loss/jsd": 0.0, "loss/logits": 0.2961750328540802, "step": 1090 }, { "epoch": 0.06825, "grad_norm": 3.15625, "grad_norm_var": 0.07158915201822917, "learning_rate": 0.0001, "loss": 8.6174, "loss/crossentropy": 1.980285882949829, "loss/hidden": 3.3828125, "loss/jsd": 0.0, "loss/logits": 0.27205583453178406, "step": 1092 }, { "epoch": 0.068375, "grad_norm": 3.65625, "grad_norm_var": 0.06383056640625, "learning_rate": 0.0001, "loss": 8.383, "loss/crossentropy": 2.4063356518745422, "loss/hidden": 3.46875, "loss/jsd": 0.0, "loss/logits": 0.2908872812986374, "step": 1094 }, { "epoch": 0.0685, "grad_norm": 3.34375, "grad_norm_var": 0.05968424479166667, "learning_rate": 0.0001, "loss": 8.6962, "loss/crossentropy": 2.3776599168777466, "loss/hidden": 3.4140625, "loss/jsd": 0.0, "loss/logits": 0.2903301566839218, "step": 1096 }, { "epoch": 0.068625, "grad_norm": 3.515625, "grad_norm_var": 0.05542704264322917, "learning_rate": 0.0001, "loss": 8.4252, "loss/crossentropy": 2.25793194770813, "loss/hidden": 3.4609375, "loss/jsd": 0.0, "loss/logits": 0.2622709423303604, "step": 1098 }, { "epoch": 0.06875, "grad_norm": 3.0625, "grad_norm_var": 0.068017578125, "learning_rate": 0.0001, "loss": 8.4571, "loss/crossentropy": 2.0190887451171875, "loss/hidden": 3.3359375, "loss/jsd": 0.0, "loss/logits": 0.25865359604358673, "step": 1100 }, { "epoch": 0.068875, "grad_norm": 3.65625, "grad_norm_var": 0.15084228515625, "learning_rate": 0.0001, "loss": 8.7145, "loss/crossentropy": 2.419832944869995, "loss/hidden": 3.4921875, "loss/jsd": 0.0, "loss/logits": 0.36829979717731476, "step": 1102 }, { "epoch": 0.069, "grad_norm": 3.203125, "grad_norm_var": 0.18118489583333333, "learning_rate": 0.0001, "loss": 8.8892, "loss/crossentropy": 2.3291234970092773, "loss/hidden": 3.4375, "loss/jsd": 0.0, "loss/logits": 0.27904945611953735, "step": 1104 }, { "epoch": 0.069125, "grad_norm": 3.75, "grad_norm_var": 0.18000386555989584, "learning_rate": 0.0001, "loss": 8.7385, "loss/crossentropy": 2.3887641429901123, "loss/hidden": 3.390625, "loss/jsd": 0.0, "loss/logits": 0.3103363811969757, "step": 1106 }, { "epoch": 0.06925, "grad_norm": 3.171875, "grad_norm_var": 0.1803863525390625, "learning_rate": 0.0001, "loss": 8.4626, "loss/crossentropy": 2.2511096000671387, "loss/hidden": 3.328125, "loss/jsd": 0.0, "loss/logits": 0.2744671106338501, "step": 1108 }, { "epoch": 0.069375, "grad_norm": 3.453125, "grad_norm_var": 0.16078999837239583, "learning_rate": 0.0001, "loss": 8.6296, "loss/crossentropy": 2.251457929611206, "loss/hidden": 3.46875, "loss/jsd": 0.0, "loss/logits": 0.2970822751522064, "step": 1110 }, { "epoch": 0.0695, "grad_norm": 3.171875, "grad_norm_var": 0.18325907389322918, "learning_rate": 0.0001, "loss": 8.4855, "loss/crossentropy": 2.3782349824905396, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.30054476857185364, "step": 1112 }, { "epoch": 0.069625, "grad_norm": 3.375, "grad_norm_var": 0.18281962076822916, "learning_rate": 0.0001, "loss": 8.6186, "loss/crossentropy": 2.605000376701355, "loss/hidden": 3.484375, "loss/jsd": 0.0, "loss/logits": 0.298698827624321, "step": 1114 }, { "epoch": 0.06975, "grad_norm": 3.34375, "grad_norm_var": 0.15660807291666667, "learning_rate": 0.0001, "loss": 8.5642, "loss/crossentropy": 2.2239125967025757, "loss/hidden": 3.3671875, "loss/jsd": 0.0, "loss/logits": 0.284654900431633, "step": 1116 }, { "epoch": 0.069875, "grad_norm": 3.3125, "grad_norm_var": 0.09190165201822917, "learning_rate": 0.0001, "loss": 8.6873, "loss/crossentropy": 2.3841261863708496, "loss/hidden": 3.4296875, "loss/jsd": 0.0, "loss/logits": 0.2972448319196701, "step": 1118 }, { "epoch": 0.07, "grad_norm": 3.53125, "grad_norm_var": 0.04534098307291667, "learning_rate": 0.0001, "loss": 8.4881, "loss/crossentropy": 2.251163959503174, "loss/hidden": 3.3046875, "loss/jsd": 0.0, "loss/logits": 0.2940330058336258, "step": 1120 }, { "epoch": 0.070125, "grad_norm": 3.640625, "grad_norm_var": 0.041552734375, "learning_rate": 0.0001, "loss": 8.6624, "loss/crossentropy": 2.390018582344055, "loss/hidden": 3.453125, "loss/jsd": 0.0, "loss/logits": 0.31052152812480927, "step": 1122 }, { "epoch": 0.07025, "grad_norm": 2.875, "grad_norm_var": 0.06489156087239584, "learning_rate": 0.0001, "loss": 8.5129, "loss/crossentropy": 2.2108170986175537, "loss/hidden": 3.3359375, "loss/jsd": 0.0, "loss/logits": 0.2933100759983063, "step": 1124 }, { "epoch": 0.070375, "grad_norm": 3.25, "grad_norm_var": 0.06334635416666666, "learning_rate": 0.0001, "loss": 8.4376, "loss/crossentropy": 2.2771471738815308, "loss/hidden": 3.3046875, "loss/jsd": 0.0, "loss/logits": 0.28437741100788116, "step": 1126 }, { "epoch": 0.0705, "grad_norm": 3.453125, "grad_norm_var": 0.05369364420572917, "learning_rate": 0.0001, "loss": 8.4285, "loss/crossentropy": 2.2954673767089844, "loss/hidden": 3.3125, "loss/jsd": 0.0, "loss/logits": 0.2847554385662079, "step": 1128 }, { "epoch": 0.070625, "grad_norm": 3.3125, "grad_norm_var": 0.0545318603515625, "learning_rate": 0.0001, "loss": 8.4656, "loss/crossentropy": 2.271798253059387, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.2639819011092186, "step": 1130 }, { "epoch": 0.07075, "grad_norm": 3.78125, "grad_norm_var": 0.0674957275390625, "learning_rate": 0.0001, "loss": 8.6624, "loss/crossentropy": 2.5219074487686157, "loss/hidden": 3.390625, "loss/jsd": 0.0, "loss/logits": 0.30493326485157013, "step": 1132 }, { "epoch": 0.070875, "grad_norm": 2.75, "grad_norm_var": 0.09687093098958334, "learning_rate": 0.0001, "loss": 8.3251, "loss/crossentropy": 2.419742465019226, "loss/hidden": 3.3125, "loss/jsd": 0.0, "loss/logits": 0.29505568742752075, "step": 1134 }, { "epoch": 0.071, "grad_norm": 3.734375, "grad_norm_var": 0.1082916259765625, "learning_rate": 0.0001, "loss": 8.6851, "loss/crossentropy": 2.460718870162964, "loss/hidden": 3.5546875, "loss/jsd": 0.0, "loss/logits": 0.2991577684879303, "step": 1136 }, { "epoch": 0.071125, "grad_norm": 3.140625, "grad_norm_var": 0.08157145182291667, "learning_rate": 0.0001, "loss": 8.5178, "loss/crossentropy": 2.173828959465027, "loss/hidden": 3.4140625, "loss/jsd": 0.0, "loss/logits": 0.27509088814258575, "step": 1138 }, { "epoch": 0.07125, "grad_norm": 3.109375, "grad_norm_var": 0.07099202473958334, "learning_rate": 0.0001, "loss": 8.2813, "loss/crossentropy": 2.0987906455993652, "loss/hidden": 3.328125, "loss/jsd": 0.0, "loss/logits": 0.26560700684785843, "step": 1140 }, { "epoch": 0.071375, "grad_norm": 3.15625, "grad_norm_var": 0.0721588134765625, "learning_rate": 0.0001, "loss": 8.5089, "loss/crossentropy": 2.3777287006378174, "loss/hidden": 3.3203125, "loss/jsd": 0.0, "loss/logits": 0.27554096281528473, "step": 1142 }, { "epoch": 0.0715, "grad_norm": 3.390625, "grad_norm_var": 0.07099507649739584, "learning_rate": 0.0001, "loss": 8.3986, "loss/crossentropy": 2.294643998146057, "loss/hidden": 3.421875, "loss/jsd": 0.0, "loss/logits": 0.28792035579681396, "step": 1144 }, { "epoch": 0.071625, "grad_norm": 3.3125, "grad_norm_var": 0.07714436848958334, "learning_rate": 0.0001, "loss": 8.5452, "loss/crossentropy": 2.3531687259674072, "loss/hidden": 3.4140625, "loss/jsd": 0.0, "loss/logits": 0.2780257761478424, "step": 1146 }, { "epoch": 0.07175, "grad_norm": 3.15625, "grad_norm_var": 0.06738993326822916, "learning_rate": 0.0001, "loss": 8.5518, "loss/crossentropy": 2.0771710872650146, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.26807837188243866, "step": 1148 }, { "epoch": 0.071875, "grad_norm": 3.421875, "grad_norm_var": 0.04797770182291667, "learning_rate": 0.0001, "loss": 8.5416, "loss/crossentropy": 2.18049418926239, "loss/hidden": 3.4453125, "loss/jsd": 0.0, "loss/logits": 0.2739409804344177, "step": 1150 }, { "epoch": 0.072, "grad_norm": 3.09375, "grad_norm_var": 0.03434956868489583, "learning_rate": 0.0001, "loss": 8.4693, "loss/crossentropy": 2.666857123374939, "loss/hidden": 3.421875, "loss/jsd": 0.0, "loss/logits": 0.30170081555843353, "step": 1152 }, { "epoch": 0.072125, "grad_norm": 3.171875, "grad_norm_var": 0.06629130045572916, "learning_rate": 0.0001, "loss": 8.6786, "loss/crossentropy": 2.5841041803359985, "loss/hidden": 3.3671875, "loss/jsd": 0.0, "loss/logits": 0.30311837792396545, "step": 1154 }, { "epoch": 0.07225, "grad_norm": 3.171875, "grad_norm_var": 0.0630767822265625, "learning_rate": 0.0001, "loss": 8.5017, "loss/crossentropy": 2.2012165784835815, "loss/hidden": 3.4296875, "loss/jsd": 0.0, "loss/logits": 0.2716425508260727, "step": 1156 }, { "epoch": 0.072375, "grad_norm": 3.171875, "grad_norm_var": 0.06253255208333333, "learning_rate": 0.0001, "loss": 8.5665, "loss/crossentropy": 2.2168599367141724, "loss/hidden": 3.4140625, "loss/jsd": 0.0, "loss/logits": 0.30834463238716125, "step": 1158 }, { "epoch": 0.0725, "grad_norm": 4.15625, "grad_norm_var": 0.1092193603515625, "learning_rate": 0.0001, "loss": 8.4126, "loss/crossentropy": 2.1044594049453735, "loss/hidden": 3.3125, "loss/jsd": 0.0, "loss/logits": 0.2663833498954773, "step": 1160 }, { "epoch": 0.072625, "grad_norm": 2.828125, "grad_norm_var": 0.11414388020833334, "learning_rate": 0.0001, "loss": 8.3621, "loss/crossentropy": 2.109754800796509, "loss/hidden": 3.375, "loss/jsd": 0.0, "loss/logits": 0.27043384313583374, "step": 1162 }, { "epoch": 0.07275, "grad_norm": 3.53125, "grad_norm_var": 0.1210845947265625, "learning_rate": 0.0001, "loss": 8.4745, "loss/crossentropy": 2.384516477584839, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.27273619174957275, "step": 1164 }, { "epoch": 0.072875, "grad_norm": 2.984375, "grad_norm_var": 0.1336090087890625, "learning_rate": 0.0001, "loss": 8.4642, "loss/crossentropy": 2.214682459831238, "loss/hidden": 3.3984375, "loss/jsd": 0.0, "loss/logits": 0.27829450368881226, "step": 1166 }, { "epoch": 0.073, "grad_norm": 3.4375, "grad_norm_var": 0.12939046223958334, "learning_rate": 0.0001, "loss": 8.5784, "loss/crossentropy": 2.276426315307617, "loss/hidden": 3.421875, "loss/jsd": 0.0, "loss/logits": 0.29027001559734344, "step": 1168 }, { "epoch": 0.073125, "grad_norm": 3.09375, "grad_norm_var": 0.109130859375, "learning_rate": 0.0001, "loss": 8.353, "loss/crossentropy": 2.1743494272232056, "loss/hidden": 3.296875, "loss/jsd": 0.0, "loss/logits": 0.26005150377750397, "step": 1170 }, { "epoch": 0.07325, "grad_norm": 3.421875, "grad_norm_var": 0.11611226399739584, "learning_rate": 0.0001, "loss": 8.4193, "loss/crossentropy": 2.279123902320862, "loss/hidden": 3.34375, "loss/jsd": 0.0, "loss/logits": 0.2806738466024399, "step": 1172 }, { "epoch": 0.073375, "grad_norm": 3.390625, "grad_norm_var": 0.11841532389322916, "learning_rate": 0.0001, "loss": 8.5702, "loss/crossentropy": 2.050893008708954, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.253355473279953, "step": 1174 }, { "epoch": 0.0735, "grad_norm": 3.375, "grad_norm_var": 0.07099202473958334, "learning_rate": 0.0001, "loss": 8.7427, "loss/crossentropy": 2.5286508798599243, "loss/hidden": 3.4765625, "loss/jsd": 0.0, "loss/logits": 0.31759728491306305, "step": 1176 }, { "epoch": 0.073625, "grad_norm": 3.3125, "grad_norm_var": 0.0608795166015625, "learning_rate": 0.0001, "loss": 8.6737, "loss/crossentropy": 2.445157289505005, "loss/hidden": 3.4140625, "loss/jsd": 0.0, "loss/logits": 0.29573802649974823, "step": 1178 }, { "epoch": 0.07375, "grad_norm": 3.46875, "grad_norm_var": 0.05756734212239583, "learning_rate": 0.0001, "loss": 8.5287, "loss/crossentropy": 2.3890769481658936, "loss/hidden": 3.3515625, "loss/jsd": 0.0, "loss/logits": 0.2753005623817444, "step": 1180 }, { "epoch": 0.073875, "grad_norm": 3.375, "grad_norm_var": 0.052099609375, "learning_rate": 0.0001, "loss": 8.5552, "loss/crossentropy": 2.2562392950057983, "loss/hidden": 3.40625, "loss/jsd": 0.0, "loss/logits": 0.27074071764945984, "step": 1182 }, { "epoch": 0.074, "grad_norm": 3.1875, "grad_norm_var": 0.0451568603515625, "learning_rate": 0.0001, "loss": 8.4914, "loss/crossentropy": 1.960713267326355, "loss/hidden": 3.3046875, "loss/jsd": 0.0, "loss/logits": 0.264203280210495, "step": 1184 }, { "epoch": 0.074125, "grad_norm": 3.265625, "grad_norm_var": 0.036279296875, "learning_rate": 0.0001, "loss": 8.4506, "loss/crossentropy": 2.357021927833557, "loss/hidden": 3.328125, "loss/jsd": 0.0, "loss/logits": 0.28563813865184784, "step": 1186 }, { "epoch": 0.07425, "grad_norm": 3.125, "grad_norm_var": 0.03333333333333333, "learning_rate": 0.0001, "loss": 8.3867, "loss/crossentropy": 2.1780192852020264, "loss/hidden": 3.2890625, "loss/jsd": 0.0, "loss/logits": 0.26176655292510986, "step": 1188 }, { "epoch": 0.074375, "grad_norm": 3.328125, "grad_norm_var": 0.034228515625, "learning_rate": 0.0001, "loss": 8.4641, "loss/crossentropy": 2.075110673904419, "loss/hidden": 3.3828125, "loss/jsd": 0.0, "loss/logits": 0.29416830837726593, "step": 1190 }, { "epoch": 0.0745, "grad_norm": 3.09375, "grad_norm_var": 0.0424468994140625, "learning_rate": 0.0001, "loss": 8.3866, "loss/crossentropy": 2.152463436126709, "loss/hidden": 3.3984375, "loss/jsd": 0.0, "loss/logits": 0.2740413099527359, "step": 1192 }, { "epoch": 0.074625, "grad_norm": 3.28125, "grad_norm_var": 0.039449055989583336, "learning_rate": 0.0001, "loss": 8.4003, "loss/crossentropy": 2.381898283958435, "loss/hidden": 3.3515625, "loss/jsd": 0.0, "loss/logits": 0.29159918427467346, "step": 1194 }, { "epoch": 0.07475, "grad_norm": 2.84375, "grad_norm_var": 0.04258524576822917, "learning_rate": 0.0001, "loss": 8.5591, "loss/crossentropy": 2.6784266233444214, "loss/hidden": 3.421875, "loss/jsd": 0.0, "loss/logits": 0.30832037329673767, "step": 1196 }, { "epoch": 0.074875, "grad_norm": 3.296875, "grad_norm_var": 0.0288238525390625, "learning_rate": 0.0001, "loss": 8.477, "loss/crossentropy": 2.1786980628967285, "loss/hidden": 3.3359375, "loss/jsd": 0.0, "loss/logits": 0.28018996119499207, "step": 1198 }, { "epoch": 0.075, "grad_norm": 3.234375, "grad_norm_var": 0.0292877197265625, "learning_rate": 0.0001, "loss": 8.6585, "loss/crossentropy": 2.248537063598633, "loss/hidden": 3.34375, "loss/jsd": 0.0, "loss/logits": 0.28961239755153656, "step": 1200 }, { "epoch": 0.075125, "grad_norm": 3.140625, "grad_norm_var": 0.029621378580729166, "learning_rate": 0.0001, "loss": 8.4601, "loss/crossentropy": 2.343456268310547, "loss/hidden": 3.3359375, "loss/jsd": 0.0, "loss/logits": 0.2769605219364166, "step": 1202 }, { "epoch": 0.07525, "grad_norm": 3.140625, "grad_norm_var": 0.0279449462890625, "learning_rate": 0.0001, "loss": 8.5551, "loss/crossentropy": 2.3451120853424072, "loss/hidden": 3.34375, "loss/jsd": 0.0, "loss/logits": 0.31704986095428467, "step": 1204 }, { "epoch": 0.075375, "grad_norm": 4.125, "grad_norm_var": 0.07421875, "learning_rate": 0.0001, "loss": 8.2784, "loss/crossentropy": 2.2383298873901367, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.2643394321203232, "step": 1206 }, { "epoch": 0.0755, "grad_norm": 3.9375, "grad_norm_var": 0.10383707682291667, "learning_rate": 0.0001, "loss": 8.6391, "loss/crossentropy": 2.0706650018692017, "loss/hidden": 3.3515625, "loss/jsd": 0.0, "loss/logits": 0.2589013874530792, "step": 1208 }, { "epoch": 0.075625, "grad_norm": 3.375, "grad_norm_var": 0.10327046712239583, "learning_rate": 0.0001, "loss": 8.6296, "loss/crossentropy": 2.264005422592163, "loss/hidden": 3.3671875, "loss/jsd": 0.0, "loss/logits": 0.28355173766613007, "step": 1210 }, { "epoch": 0.07575, "grad_norm": 3.078125, "grad_norm_var": 0.09348042805989583, "learning_rate": 0.0001, "loss": 8.3618, "loss/crossentropy": 2.4925496578216553, "loss/hidden": 3.3671875, "loss/jsd": 0.0, "loss/logits": 0.2993692457675934, "step": 1212 }, { "epoch": 0.075875, "grad_norm": 3.375, "grad_norm_var": 0.09510091145833334, "learning_rate": 0.0001, "loss": 8.5563, "loss/crossentropy": 2.339760661125183, "loss/hidden": 3.2734375, "loss/jsd": 0.0, "loss/logits": 0.27024510502815247, "step": 1214 }, { "epoch": 0.076, "grad_norm": 3.1875, "grad_norm_var": 0.10005594889322916, "learning_rate": 0.0001, "loss": 8.5717, "loss/crossentropy": 2.357187867164612, "loss/hidden": 3.421875, "loss/jsd": 0.0, "loss/logits": 0.3173932731151581, "step": 1216 }, { "epoch": 0.076125, "grad_norm": 3.140625, "grad_norm_var": 0.10143229166666666, "learning_rate": 0.0001, "loss": 8.3378, "loss/crossentropy": 2.5177258253097534, "loss/hidden": 3.3359375, "loss/jsd": 0.0, "loss/logits": 0.29622797667980194, "step": 1218 }, { "epoch": 0.07625, "grad_norm": 3.25, "grad_norm_var": 0.098779296875, "learning_rate": 0.0001, "loss": 8.4854, "loss/crossentropy": 2.0992863178253174, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.2855496108531952, "step": 1220 }, { "epoch": 0.076375, "grad_norm": 3.03125, "grad_norm_var": 0.0710601806640625, "learning_rate": 0.0001, "loss": 8.3916, "loss/crossentropy": 2.4129068851470947, "loss/hidden": 3.4140625, "loss/jsd": 0.0, "loss/logits": 0.280709832906723, "step": 1222 }, { "epoch": 0.0765, "grad_norm": 3.359375, "grad_norm_var": 0.030301920572916665, "learning_rate": 0.0001, "loss": 8.3373, "loss/crossentropy": 2.0929447412490845, "loss/hidden": 3.328125, "loss/jsd": 0.0, "loss/logits": 0.25636833161115646, "step": 1224 }, { "epoch": 0.076625, "grad_norm": 3.0, "grad_norm_var": 0.027326456705729165, "learning_rate": 0.0001, "loss": 8.4648, "loss/crossentropy": 2.427080750465393, "loss/hidden": 3.3125, "loss/jsd": 0.0, "loss/logits": 0.2882840186357498, "step": 1226 }, { "epoch": 0.07675, "grad_norm": 3.140625, "grad_norm_var": 0.026585896809895832, "learning_rate": 0.0001, "loss": 8.3397, "loss/crossentropy": 2.4737452268600464, "loss/hidden": 3.296875, "loss/jsd": 0.0, "loss/logits": 0.28378529846668243, "step": 1228 }, { "epoch": 0.076875, "grad_norm": 3.234375, "grad_norm_var": 0.0234771728515625, "learning_rate": 0.0001, "loss": 8.6966, "loss/crossentropy": 2.6298500299453735, "loss/hidden": 3.3828125, "loss/jsd": 0.0, "loss/logits": 0.306839257478714, "step": 1230 }, { "epoch": 0.077, "grad_norm": 2.859375, "grad_norm_var": 0.03771158854166667, "learning_rate": 0.0001, "loss": 8.4196, "loss/crossentropy": 2.3488998413085938, "loss/hidden": 3.3359375, "loss/jsd": 0.0, "loss/logits": 0.27983449399471283, "step": 1232 }, { "epoch": 0.077125, "grad_norm": 3.828125, "grad_norm_var": 0.06613667805989583, "learning_rate": 0.0001, "loss": 8.5206, "loss/crossentropy": 2.2696053981781006, "loss/hidden": 3.328125, "loss/jsd": 0.0, "loss/logits": 0.2854994237422943, "step": 1234 }, { "epoch": 0.07725, "grad_norm": 2.765625, "grad_norm_var": 0.07351888020833333, "learning_rate": 0.0001, "loss": 8.2717, "loss/crossentropy": 2.174792766571045, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.2614743113517761, "step": 1236 }, { "epoch": 0.077375, "grad_norm": 3.921875, "grad_norm_var": 0.09851786295572916, "learning_rate": 0.0001, "loss": 8.6137, "loss/crossentropy": 2.2606674432754517, "loss/hidden": 3.40625, "loss/jsd": 0.0, "loss/logits": 0.28789395093917847, "step": 1238 }, { "epoch": 0.0775, "grad_norm": 3.921875, "grad_norm_var": 0.12114969889322917, "learning_rate": 0.0001, "loss": 8.4582, "loss/crossentropy": 2.104749917984009, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.27152082324028015, "step": 1240 }, { "epoch": 0.077625, "grad_norm": 3.28125, "grad_norm_var": 0.1411041259765625, "learning_rate": 0.0001, "loss": 8.3861, "loss/crossentropy": 2.198368549346924, "loss/hidden": 3.4140625, "loss/jsd": 0.0, "loss/logits": 0.26415789127349854, "step": 1242 }, { "epoch": 0.07775, "grad_norm": 3.015625, "grad_norm_var": 0.1497711181640625, "learning_rate": 0.0001, "loss": 8.5154, "loss/crossentropy": 2.4816545248031616, "loss/hidden": 3.4453125, "loss/jsd": 0.0, "loss/logits": 0.30805790424346924, "step": 1244 }, { "epoch": 0.077875, "grad_norm": 3.515625, "grad_norm_var": 0.1504547119140625, "learning_rate": 0.0001, "loss": 8.5166, "loss/crossentropy": 2.4640896320343018, "loss/hidden": 3.3671875, "loss/jsd": 0.0, "loss/logits": 0.29878415167331696, "step": 1246 }, { "epoch": 0.078, "grad_norm": 2.796875, "grad_norm_var": 0.15390218098958333, "learning_rate": 0.0001, "loss": 8.3599, "loss/crossentropy": 2.2658848762512207, "loss/hidden": 3.34375, "loss/jsd": 0.0, "loss/logits": 0.264192171394825, "step": 1248 }, { "epoch": 0.078125, "grad_norm": 3.5, "grad_norm_var": 0.142724609375, "learning_rate": 0.0001, "loss": 8.3164, "loss/crossentropy": 2.1921703815460205, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.2650914490222931, "step": 1250 }, { "epoch": 0.07825, "grad_norm": 2.921875, "grad_norm_var": 0.1355865478515625, "learning_rate": 0.0001, "loss": 8.5194, "loss/crossentropy": 2.2807745933532715, "loss/hidden": 3.40625, "loss/jsd": 0.0, "loss/logits": 0.28195033967494965, "step": 1252 }, { "epoch": 0.078375, "grad_norm": 3.34375, "grad_norm_var": 0.12167561848958333, "learning_rate": 0.0001, "loss": 8.4772, "loss/crossentropy": 2.145294189453125, "loss/hidden": 3.359375, "loss/jsd": 0.0, "loss/logits": 0.2704995721578598, "step": 1254 }, { "epoch": 0.0785, "grad_norm": 2.859375, "grad_norm_var": 0.1064849853515625, "learning_rate": 0.0001, "loss": 8.2792, "loss/crossentropy": 2.3187735080718994, "loss/hidden": 3.296875, "loss/jsd": 0.0, "loss/logits": 0.28507962822914124, "step": 1256 }, { "epoch": 0.078625, "grad_norm": 3.734375, "grad_norm_var": 0.08872782389322917, "learning_rate": 0.0001, "loss": 8.4454, "loss/crossentropy": 2.226912260055542, "loss/hidden": 3.296875, "loss/jsd": 0.0, "loss/logits": 0.2773582488298416, "step": 1258 }, { "epoch": 0.07875, "grad_norm": 2.921875, "grad_norm_var": 0.08684488932291666, "learning_rate": 0.0001, "loss": 8.4253, "loss/crossentropy": 2.524247169494629, "loss/hidden": 3.4140625, "loss/jsd": 0.0, "loss/logits": 0.28448525071144104, "step": 1260 }, { "epoch": 0.078875, "grad_norm": 3.109375, "grad_norm_var": 0.08153889973958334, "learning_rate": 0.0001, "loss": 8.4428, "loss/crossentropy": 2.434785842895508, "loss/hidden": 3.3046875, "loss/jsd": 0.0, "loss/logits": 0.2782677710056305, "step": 1262 }, { "epoch": 0.079, "grad_norm": 3.171875, "grad_norm_var": 0.09170633951822917, "learning_rate": 0.0001, "loss": 8.3836, "loss/crossentropy": 2.0830533504486084, "loss/hidden": 3.2734375, "loss/jsd": 0.0, "loss/logits": 0.27287817001342773, "step": 1264 }, { "epoch": 0.079125, "grad_norm": 3.265625, "grad_norm_var": 0.09251200358072917, "learning_rate": 0.0001, "loss": 8.3748, "loss/crossentropy": 2.2747669219970703, "loss/hidden": 3.328125, "loss/jsd": 0.0, "loss/logits": 0.3025789111852646, "step": 1266 }, { "epoch": 0.07925, "grad_norm": 3.140625, "grad_norm_var": 0.07610270182291666, "learning_rate": 0.0001, "loss": 8.4631, "loss/crossentropy": 2.2862067222595215, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.2721339762210846, "step": 1268 }, { "epoch": 0.079375, "grad_norm": 3.21875, "grad_norm_var": 0.07183837890625, "learning_rate": 0.0001, "loss": 8.4595, "loss/crossentropy": 2.3111391067504883, "loss/hidden": 3.296875, "loss/jsd": 0.0, "loss/logits": 0.2573637366294861, "step": 1270 }, { "epoch": 0.0795, "grad_norm": 3.0625, "grad_norm_var": 0.06552632649739583, "learning_rate": 0.0001, "loss": 8.3966, "loss/crossentropy": 2.5623552799224854, "loss/hidden": 3.4140625, "loss/jsd": 0.0, "loss/logits": 0.26713909208774567, "step": 1272 }, { "epoch": 0.079625, "grad_norm": 3.15625, "grad_norm_var": 0.04524637858072917, "learning_rate": 0.0001, "loss": 8.2061, "loss/crossentropy": 2.1350300312042236, "loss/hidden": 3.3515625, "loss/jsd": 0.0, "loss/logits": 0.26287516951560974, "step": 1274 }, { "epoch": 0.07975, "grad_norm": 3.046875, "grad_norm_var": 0.042740885416666666, "learning_rate": 0.0001, "loss": 8.5091, "loss/crossentropy": 2.3671988248825073, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.274411678314209, "step": 1276 }, { "epoch": 0.079875, "grad_norm": 3.109375, "grad_norm_var": 0.034989420572916666, "learning_rate": 0.0001, "loss": 8.4391, "loss/crossentropy": 2.280188202857971, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.2855434864759445, "step": 1278 }, { "epoch": 0.08, "grad_norm": 3.34375, "grad_norm_var": 0.013997395833333334, "learning_rate": 0.0001, "loss": 8.561, "loss/crossentropy": 2.09197735786438, "loss/hidden": 3.3046875, "loss/jsd": 0.0, "loss/logits": 0.2575538009405136, "step": 1280 }, { "epoch": 0.080125, "grad_norm": 2.859375, "grad_norm_var": 0.0140045166015625, "learning_rate": 0.0001, "loss": 8.374, "loss/crossentropy": 2.199427366256714, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.2671803832054138, "step": 1282 }, { "epoch": 0.08025, "grad_norm": 3.15625, "grad_norm_var": 0.016792805989583333, "learning_rate": 0.0001, "loss": 8.4494, "loss/crossentropy": 2.646793484687805, "loss/hidden": 3.34375, "loss/jsd": 0.0, "loss/logits": 0.26508933305740356, "step": 1284 }, { "epoch": 0.080375, "grad_norm": 3.125, "grad_norm_var": 0.017378743489583334, "learning_rate": 0.0001, "loss": 8.3257, "loss/crossentropy": 2.255902647972107, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.26800450682640076, "step": 1286 }, { "epoch": 0.0805, "grad_norm": 3.703125, "grad_norm_var": 0.03821207682291667, "learning_rate": 0.0001, "loss": 8.6049, "loss/crossentropy": 2.4666751623153687, "loss/hidden": 3.2734375, "loss/jsd": 0.0, "loss/logits": 0.2575417757034302, "step": 1288 }, { "epoch": 0.080625, "grad_norm": 3.234375, "grad_norm_var": 0.038874308268229164, "learning_rate": 0.0001, "loss": 8.4277, "loss/crossentropy": 2.2299275398254395, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.26189981400966644, "step": 1290 }, { "epoch": 0.08075, "grad_norm": 3.234375, "grad_norm_var": 0.0371978759765625, "learning_rate": 0.0001, "loss": 8.3189, "loss/crossentropy": 2.3310989141464233, "loss/hidden": 3.3203125, "loss/jsd": 0.0, "loss/logits": 0.26487114280462265, "step": 1292 }, { "epoch": 0.080875, "grad_norm": 3.6875, "grad_norm_var": 0.05325520833333333, "learning_rate": 0.0001, "loss": 8.4445, "loss/crossentropy": 2.2357208728790283, "loss/hidden": 3.2734375, "loss/jsd": 0.0, "loss/logits": 0.27044905722141266, "step": 1294 }, { "epoch": 0.081, "grad_norm": 2.828125, "grad_norm_var": 0.0831451416015625, "learning_rate": 0.0001, "loss": 8.2945, "loss/crossentropy": 2.4127997159957886, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.2716076225042343, "step": 1296 }, { "epoch": 0.081125, "grad_norm": 3.109375, "grad_norm_var": 0.0810546875, "learning_rate": 0.0001, "loss": 8.1039, "loss/crossentropy": 2.271215081214905, "loss/hidden": 3.296875, "loss/jsd": 0.0, "loss/logits": 0.26447129994630814, "step": 1298 }, { "epoch": 0.08125, "grad_norm": 2.953125, "grad_norm_var": 0.08394266764322916, "learning_rate": 0.0001, "loss": 8.4898, "loss/crossentropy": 2.214189291000366, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.26683974266052246, "step": 1300 }, { "epoch": 0.081375, "grad_norm": 3.03125, "grad_norm_var": 0.09244791666666667, "learning_rate": 0.0001, "loss": 8.4019, "loss/crossentropy": 2.1998738050460815, "loss/hidden": 3.3046875, "loss/jsd": 0.0, "loss/logits": 0.2771088480949402, "step": 1302 }, { "epoch": 0.0815, "grad_norm": 3.515625, "grad_norm_var": 0.08205464680989584, "learning_rate": 0.0001, "loss": 8.391, "loss/crossentropy": 2.10916006565094, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.26495426893234253, "step": 1304 }, { "epoch": 0.081625, "grad_norm": 3.015625, "grad_norm_var": 0.111962890625, "learning_rate": 0.0001, "loss": 8.4097, "loss/crossentropy": 2.428833842277527, "loss/hidden": 3.2890625, "loss/jsd": 0.0, "loss/logits": 0.28503939509391785, "step": 1306 }, { "epoch": 0.08175, "grad_norm": 3.046875, "grad_norm_var": 0.1131988525390625, "learning_rate": 0.0001, "loss": 8.441, "loss/crossentropy": 2.512449622154236, "loss/hidden": 3.3359375, "loss/jsd": 0.0, "loss/logits": 0.33712296187877655, "step": 1308 }, { "epoch": 0.081875, "grad_norm": 3.28125, "grad_norm_var": 0.088330078125, "learning_rate": 0.0001, "loss": 8.3889, "loss/crossentropy": 2.438145875930786, "loss/hidden": 3.296875, "loss/jsd": 0.0, "loss/logits": 0.2805769294500351, "step": 1310 }, { "epoch": 0.082, "grad_norm": 3.03125, "grad_norm_var": 0.0719390869140625, "learning_rate": 0.0001, "loss": 8.2675, "loss/crossentropy": 2.35923433303833, "loss/hidden": 3.328125, "loss/jsd": 0.0, "loss/logits": 0.2907231003046036, "step": 1312 }, { "epoch": 0.082125, "grad_norm": 3.125, "grad_norm_var": 0.07124735514322916, "learning_rate": 0.0001, "loss": 8.3908, "loss/crossentropy": 2.288873791694641, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.2815335690975189, "step": 1314 }, { "epoch": 0.08225, "grad_norm": 3.125, "grad_norm_var": 0.0664459228515625, "learning_rate": 0.0001, "loss": 8.4692, "loss/crossentropy": 2.4051743745803833, "loss/hidden": 3.3046875, "loss/jsd": 0.0, "loss/logits": 0.282541960477829, "step": 1316 }, { "epoch": 0.082375, "grad_norm": 3.734375, "grad_norm_var": 0.0790924072265625, "learning_rate": 0.0001, "loss": 8.2673, "loss/crossentropy": 2.115163564682007, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.26003655791282654, "step": 1318 }, { "epoch": 0.0825, "grad_norm": 2.984375, "grad_norm_var": 0.07752278645833334, "learning_rate": 0.0001, "loss": 8.3407, "loss/crossentropy": 2.294703960418701, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.2557590380311012, "step": 1320 }, { "epoch": 0.082625, "grad_norm": 3.140625, "grad_norm_var": 0.041356404622395836, "learning_rate": 0.0001, "loss": 8.3636, "loss/crossentropy": 2.6248332262039185, "loss/hidden": 3.34375, "loss/jsd": 0.0, "loss/logits": 0.2877664119005203, "step": 1322 }, { "epoch": 0.08275, "grad_norm": 3.234375, "grad_norm_var": 0.04487202962239583, "learning_rate": 0.0001, "loss": 8.392, "loss/crossentropy": 2.212457776069641, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.2759707272052765, "step": 1324 }, { "epoch": 0.082875, "grad_norm": 3.015625, "grad_norm_var": 0.0431304931640625, "learning_rate": 0.0001, "loss": 8.3794, "loss/crossentropy": 1.8898176550865173, "loss/hidden": 3.2890625, "loss/jsd": 0.0, "loss/logits": 0.28947535157203674, "step": 1326 }, { "epoch": 0.083, "grad_norm": 3.1875, "grad_norm_var": 0.04039713541666667, "learning_rate": 0.0001, "loss": 8.4226, "loss/crossentropy": 2.3176772594451904, "loss/hidden": 3.3125, "loss/jsd": 0.0, "loss/logits": 0.30063313245773315, "step": 1328 }, { "epoch": 0.083125, "grad_norm": 3.421875, "grad_norm_var": 0.04169514973958333, "learning_rate": 0.0001, "loss": 8.033, "loss/crossentropy": 1.9844502806663513, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.22963125258684158, "step": 1330 }, { "epoch": 0.08325, "grad_norm": 2.984375, "grad_norm_var": 0.04394124348958333, "learning_rate": 0.0001, "loss": 8.3364, "loss/crossentropy": 2.0860679745674133, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.28032663464546204, "step": 1332 }, { "epoch": 0.083375, "grad_norm": 3.171875, "grad_norm_var": 0.028120930989583334, "learning_rate": 0.0001, "loss": 8.2641, "loss/crossentropy": 2.2230706214904785, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.26876039803028107, "step": 1334 }, { "epoch": 0.0835, "grad_norm": 3.28125, "grad_norm_var": 0.026904296875, "learning_rate": 0.0001, "loss": 8.4189, "loss/crossentropy": 2.307973623275757, "loss/hidden": 3.3515625, "loss/jsd": 0.0, "loss/logits": 0.2883561700582504, "step": 1336 }, { "epoch": 0.083625, "grad_norm": 3.390625, "grad_norm_var": 0.0290679931640625, "learning_rate": 0.0001, "loss": 8.292, "loss/crossentropy": 2.3308621644973755, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.26050567626953125, "step": 1338 }, { "epoch": 0.08375, "grad_norm": 3.40625, "grad_norm_var": 0.03827718098958333, "learning_rate": 0.0001, "loss": 8.2063, "loss/crossentropy": 2.1553597450256348, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.27626167237758636, "step": 1340 }, { "epoch": 0.083875, "grad_norm": 3.09375, "grad_norm_var": 0.08676656087239583, "learning_rate": 0.0001, "loss": 8.5019, "loss/crossentropy": 2.21061909198761, "loss/hidden": 3.3203125, "loss/jsd": 0.0, "loss/logits": 0.27770161628723145, "step": 1342 }, { "epoch": 0.084, "grad_norm": 2.734375, "grad_norm_var": 0.1104400634765625, "learning_rate": 0.0001, "loss": 8.222, "loss/crossentropy": 2.2863982915878296, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.2523474544286728, "step": 1344 }, { "epoch": 0.084125, "grad_norm": 3.09375, "grad_norm_var": 0.11259358723958333, "learning_rate": 0.0001, "loss": 8.307, "loss/crossentropy": 2.4110026359558105, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.26567649841308594, "step": 1346 }, { "epoch": 0.08425, "grad_norm": 3.265625, "grad_norm_var": 0.10590718587239584, "learning_rate": 0.0001, "loss": 8.3584, "loss/crossentropy": 2.3141287565231323, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.2731921225786209, "step": 1348 }, { "epoch": 0.084375, "grad_norm": 3.140625, "grad_norm_var": 0.09840087890625, "learning_rate": 0.0001, "loss": 8.3221, "loss/crossentropy": 2.4939377307891846, "loss/hidden": 3.3125, "loss/jsd": 0.0, "loss/logits": 0.2732496112585068, "step": 1350 }, { "epoch": 0.0845, "grad_norm": 3.09375, "grad_norm_var": 0.09888916015625, "learning_rate": 0.0001, "loss": 8.4167, "loss/crossentropy": 2.6840182542800903, "loss/hidden": 3.34375, "loss/jsd": 0.0, "loss/logits": 0.2915331721305847, "step": 1352 }, { "epoch": 0.084625, "grad_norm": 2.984375, "grad_norm_var": 0.1009674072265625, "learning_rate": 0.0001, "loss": 8.361, "loss/crossentropy": 2.3241279125213623, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.25332190841436386, "step": 1354 }, { "epoch": 0.08475, "grad_norm": 2.96875, "grad_norm_var": 0.09444071451822916, "learning_rate": 0.0001, "loss": 8.128, "loss/crossentropy": 2.2505773305892944, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.27977539598941803, "step": 1356 }, { "epoch": 0.084875, "grad_norm": 3.609375, "grad_norm_var": 0.04345296223958333, "learning_rate": 0.0001, "loss": 8.383, "loss/crossentropy": 2.4839106798171997, "loss/hidden": 3.3984375, "loss/jsd": 0.0, "loss/logits": 0.2985747307538986, "step": 1358 }, { "epoch": 0.085, "grad_norm": 3.046875, "grad_norm_var": 0.03417561848958333, "learning_rate": 0.0001, "loss": 8.3302, "loss/crossentropy": 2.1147927045822144, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.26798177510499954, "step": 1360 }, { "epoch": 0.085125, "grad_norm": 3.03125, "grad_norm_var": 0.03313395182291667, "learning_rate": 0.0001, "loss": 8.2878, "loss/crossentropy": 2.341870665550232, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.29636865854263306, "step": 1362 }, { "epoch": 0.08525, "grad_norm": 3.296875, "grad_norm_var": 0.033772786458333336, "learning_rate": 0.0001, "loss": 8.3404, "loss/crossentropy": 2.3820383548736572, "loss/hidden": 3.4453125, "loss/jsd": 0.0, "loss/logits": 0.308891698718071, "step": 1364 }, { "epoch": 0.085375, "grad_norm": 2.984375, "grad_norm_var": 0.03592122395833333, "learning_rate": 0.0001, "loss": 8.2509, "loss/crossentropy": 2.406251907348633, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.2710695117712021, "step": 1366 }, { "epoch": 0.0855, "grad_norm": 3.09375, "grad_norm_var": 0.04088541666666667, "learning_rate": 0.0001, "loss": 8.4671, "loss/crossentropy": 2.4824094772338867, "loss/hidden": 3.328125, "loss/jsd": 0.0, "loss/logits": 0.2687048017978668, "step": 1368 }, { "epoch": 0.085625, "grad_norm": 3.140625, "grad_norm_var": 0.0371490478515625, "learning_rate": 0.0001, "loss": 8.2799, "loss/crossentropy": 2.1223180890083313, "loss/hidden": 3.3125, "loss/jsd": 0.0, "loss/logits": 0.25326162576675415, "step": 1370 }, { "epoch": 0.08575, "grad_norm": 3.015625, "grad_norm_var": 0.0335113525390625, "learning_rate": 0.0001, "loss": 8.2298, "loss/crossentropy": 2.3254255056381226, "loss/hidden": 3.2734375, "loss/jsd": 0.0, "loss/logits": 0.28770148754119873, "step": 1372 }, { "epoch": 0.085875, "grad_norm": 3.0, "grad_norm_var": 0.018928019205729167, "learning_rate": 0.0001, "loss": 8.405, "loss/crossentropy": 2.369110345840454, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.2755488455295563, "step": 1374 }, { "epoch": 0.086, "grad_norm": 2.90625, "grad_norm_var": 0.018919881184895834, "learning_rate": 0.0001, "loss": 8.1925, "loss/crossentropy": 2.5451020002365112, "loss/hidden": 3.3515625, "loss/jsd": 0.0, "loss/logits": 0.27585017681121826, "step": 1376 }, { "epoch": 0.086125, "grad_norm": 3.109375, "grad_norm_var": 0.018876139322916666, "learning_rate": 0.0001, "loss": 8.5378, "loss/crossentropy": 2.5744348764419556, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.28956979513168335, "step": 1378 }, { "epoch": 0.08625, "grad_norm": 3.046875, "grad_norm_var": 0.014729817708333334, "learning_rate": 0.0001, "loss": 8.3531, "loss/crossentropy": 2.0667566061019897, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.2600719928741455, "step": 1380 }, { "epoch": 0.086375, "grad_norm": 3.1875, "grad_norm_var": 0.017464192708333333, "learning_rate": 0.0001, "loss": 8.3839, "loss/crossentropy": 2.383557438850403, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.279419407248497, "step": 1382 }, { "epoch": 0.0865, "grad_norm": 3.125, "grad_norm_var": 0.015034993489583334, "learning_rate": 0.0001, "loss": 8.3527, "loss/crossentropy": 2.262708902359009, "loss/hidden": 3.2890625, "loss/jsd": 0.0, "loss/logits": 0.26019924879074097, "step": 1384 }, { "epoch": 0.086625, "grad_norm": 3.28125, "grad_norm_var": 0.01695556640625, "learning_rate": 0.0001, "loss": 8.4089, "loss/crossentropy": 2.422199249267578, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.257827490568161, "step": 1386 }, { "epoch": 0.08675, "grad_norm": 3.09375, "grad_norm_var": 0.015653483072916665, "learning_rate": 0.0001, "loss": 8.2639, "loss/crossentropy": 2.341743230819702, "loss/hidden": 3.3125, "loss/jsd": 0.0, "loss/logits": 0.2593771815299988, "step": 1388 }, { "epoch": 0.086875, "grad_norm": 3.828125, "grad_norm_var": 0.042578125, "learning_rate": 0.0001, "loss": 8.2408, "loss/crossentropy": 2.2627909183502197, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.262659452855587, "step": 1390 }, { "epoch": 0.087, "grad_norm": 2.84375, "grad_norm_var": 0.05591532389322917, "learning_rate": 0.0001, "loss": 8.202, "loss/crossentropy": 2.2393475770950317, "loss/hidden": 3.390625, "loss/jsd": 0.0, "loss/logits": 0.2788470536470413, "step": 1392 }, { "epoch": 0.087125, "grad_norm": 3.4375, "grad_norm_var": 0.06106363932291667, "learning_rate": 0.0001, "loss": 8.468, "loss/crossentropy": 2.256345748901367, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.2531740814447403, "step": 1394 }, { "epoch": 0.08725, "grad_norm": 3.359375, "grad_norm_var": 0.06864827473958333, "learning_rate": 0.0001, "loss": 8.3001, "loss/crossentropy": 2.263739228248596, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.26853087544441223, "step": 1396 }, { "epoch": 0.087375, "grad_norm": 3.125, "grad_norm_var": 0.067919921875, "learning_rate": 0.0001, "loss": 8.2254, "loss/crossentropy": 2.3907299041748047, "loss/hidden": 3.2109375, "loss/jsd": 0.0, "loss/logits": 0.26690760254859924, "step": 1398 }, { "epoch": 0.0875, "grad_norm": 3.15625, "grad_norm_var": 0.06586812337239584, "learning_rate": 0.0001, "loss": 8.4731, "loss/crossentropy": 2.3437803983688354, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.259974405169487, "step": 1400 }, { "epoch": 0.087625, "grad_norm": 2.90625, "grad_norm_var": 0.06672261555989584, "learning_rate": 0.0001, "loss": 8.4484, "loss/crossentropy": 2.3048956394195557, "loss/hidden": 3.2890625, "loss/jsd": 0.0, "loss/logits": 0.2741893529891968, "step": 1402 }, { "epoch": 0.08775, "grad_norm": 3.15625, "grad_norm_var": 0.06726888020833334, "learning_rate": 0.0001, "loss": 8.3698, "loss/crossentropy": 2.3523448705673218, "loss/hidden": 3.296875, "loss/jsd": 0.0, "loss/logits": 0.280557781457901, "step": 1404 }, { "epoch": 0.087875, "grad_norm": 3.25, "grad_norm_var": 0.033524576822916666, "learning_rate": 0.0001, "loss": 8.3848, "loss/crossentropy": 2.3498148918151855, "loss/hidden": 3.3828125, "loss/jsd": 0.0, "loss/logits": 0.3289744406938553, "step": 1406 }, { "epoch": 0.088, "grad_norm": 3.9375, "grad_norm_var": 0.06220296223958333, "learning_rate": 0.0001, "loss": 8.2261, "loss/crossentropy": 2.071715295314789, "loss/hidden": 3.3125, "loss/jsd": 0.0, "loss/logits": 0.26693500578403473, "step": 1408 }, { "epoch": 0.088125, "grad_norm": 3.421875, "grad_norm_var": 0.0634429931640625, "learning_rate": 0.0001, "loss": 8.2024, "loss/crossentropy": 2.333922863006592, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.27316156029701233, "step": 1410 }, { "epoch": 0.08825, "grad_norm": 3.265625, "grad_norm_var": 0.05562744140625, "learning_rate": 0.0001, "loss": 8.0937, "loss/crossentropy": 2.1679932475090027, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.25762278586626053, "step": 1412 }, { "epoch": 0.088375, "grad_norm": 2.96875, "grad_norm_var": 0.0583648681640625, "learning_rate": 0.0001, "loss": 8.2188, "loss/crossentropy": 2.3344188928604126, "loss/hidden": 3.3515625, "loss/jsd": 0.0, "loss/logits": 0.32225053012371063, "step": 1414 }, { "epoch": 0.0885, "grad_norm": 3.40625, "grad_norm_var": 0.06533915201822917, "learning_rate": 0.0001, "loss": 8.2794, "loss/crossentropy": 2.181140899658203, "loss/hidden": 3.328125, "loss/jsd": 0.0, "loss/logits": 0.25673504173755646, "step": 1416 }, { "epoch": 0.088625, "grad_norm": 3.015625, "grad_norm_var": 0.0631988525390625, "learning_rate": 0.0001, "loss": 8.4926, "loss/crossentropy": 2.5540969371795654, "loss/hidden": 3.53125, "loss/jsd": 0.0, "loss/logits": 0.28567659854888916, "step": 1418 }, { "epoch": 0.08875, "grad_norm": 3.0625, "grad_norm_var": 0.13014322916666668, "learning_rate": 0.0001, "loss": 8.3927, "loss/crossentropy": 2.308061122894287, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.26119405031204224, "step": 1420 }, { "epoch": 0.088875, "grad_norm": 4.125, "grad_norm_var": 0.22221577962239583, "learning_rate": 0.0001, "loss": 8.3563, "loss/crossentropy": 2.337175130844116, "loss/hidden": 3.2734375, "loss/jsd": 0.0, "loss/logits": 0.28924381732940674, "step": 1422 }, { "epoch": 0.089, "grad_norm": 3.0625, "grad_norm_var": 0.205322265625, "learning_rate": 0.0001, "loss": 8.2574, "loss/crossentropy": 2.383028507232666, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.2829654812812805, "step": 1424 }, { "epoch": 0.089125, "grad_norm": 2.8125, "grad_norm_var": 0.22934468587239584, "learning_rate": 0.0001, "loss": 8.0822, "loss/crossentropy": 2.2887638807296753, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.2677062898874283, "step": 1426 }, { "epoch": 0.08925, "grad_norm": 2.9375, "grad_norm_var": 0.23911031087239584, "learning_rate": 0.0001, "loss": 8.2723, "loss/crossentropy": 2.2472543716430664, "loss/hidden": 3.296875, "loss/jsd": 0.0, "loss/logits": 0.2706802934408188, "step": 1428 }, { "epoch": 0.089375, "grad_norm": 3.3125, "grad_norm_var": 0.23624674479166666, "learning_rate": 0.0001, "loss": 8.378, "loss/crossentropy": 2.517896294593811, "loss/hidden": 3.328125, "loss/jsd": 0.0, "loss/logits": 0.28536301851272583, "step": 1430 }, { "epoch": 0.0895, "grad_norm": 2.984375, "grad_norm_var": 0.23261311848958333, "learning_rate": 0.0001, "loss": 8.1376, "loss/crossentropy": 2.118437886238098, "loss/hidden": 3.2109375, "loss/jsd": 0.0, "loss/logits": 0.2512395307421684, "step": 1432 }, { "epoch": 0.089625, "grad_norm": 3.1875, "grad_norm_var": 0.22909749348958333, "learning_rate": 0.0001, "loss": 8.2629, "loss/crossentropy": 2.177670121192932, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.2840624302625656, "step": 1434 }, { "epoch": 0.08975, "grad_norm": 3.28125, "grad_norm_var": 0.16071675618489584, "learning_rate": 0.0001, "loss": 8.3297, "loss/crossentropy": 2.422105073928833, "loss/hidden": 3.2890625, "loss/jsd": 0.0, "loss/logits": 0.28876344859600067, "step": 1436 }, { "epoch": 0.089875, "grad_norm": 2.65625, "grad_norm_var": 0.03298238118489583, "learning_rate": 0.0001, "loss": 8.3754, "loss/crossentropy": 2.469232678413391, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.2923481911420822, "step": 1438 }, { "epoch": 0.09, "grad_norm": 3.03125, "grad_norm_var": 0.0335845947265625, "learning_rate": 0.0001, "loss": 8.2403, "loss/crossentropy": 2.3302817344665527, "loss/hidden": 3.3125, "loss/jsd": 0.0, "loss/logits": 0.26953746378421783, "step": 1440 }, { "epoch": 0.090125, "grad_norm": 3.171875, "grad_norm_var": 0.0303131103515625, "learning_rate": 0.0001, "loss": 8.2342, "loss/crossentropy": 2.1770907640457153, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.26305052638053894, "step": 1442 }, { "epoch": 0.09025, "grad_norm": 3.03125, "grad_norm_var": 0.0291412353515625, "learning_rate": 0.0001, "loss": 8.2963, "loss/crossentropy": 2.244715094566345, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.2524953857064247, "step": 1444 }, { "epoch": 0.090375, "grad_norm": 3.53125, "grad_norm_var": 0.0806549072265625, "learning_rate": 0.0001, "loss": 8.2455, "loss/crossentropy": 2.1872771978378296, "loss/hidden": 3.2890625, "loss/jsd": 0.0, "loss/logits": 0.28185485303401947, "step": 1446 }, { "epoch": 0.0905, "grad_norm": 2.796875, "grad_norm_var": 0.09325764973958334, "learning_rate": 0.0001, "loss": 8.1763, "loss/crossentropy": 2.388404607772827, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.25562550872564316, "step": 1448 }, { "epoch": 0.090625, "grad_norm": 3.34375, "grad_norm_var": 0.11158854166666667, "learning_rate": 0.0001, "loss": 8.2801, "loss/crossentropy": 2.2114795446395874, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.2575107663869858, "step": 1450 }, { "epoch": 0.09075, "grad_norm": 2.859375, "grad_norm_var": 0.11750895182291667, "learning_rate": 0.0001, "loss": 8.4195, "loss/crossentropy": 2.268153190612793, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.2590959519147873, "step": 1452 }, { "epoch": 0.090875, "grad_norm": 3.0, "grad_norm_var": 0.10434468587239583, "learning_rate": 0.0001, "loss": 8.1186, "loss/crossentropy": 2.3246419429779053, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.27099407464265823, "step": 1454 }, { "epoch": 0.091, "grad_norm": 3.046875, "grad_norm_var": 0.10079752604166667, "learning_rate": 0.0001, "loss": 8.3338, "loss/crossentropy": 2.5052762031555176, "loss/hidden": 3.3671875, "loss/jsd": 0.0, "loss/logits": 0.29280832409858704, "step": 1456 }, { "epoch": 0.091125, "grad_norm": 3.15625, "grad_norm_var": 0.09954325358072917, "learning_rate": 0.0001, "loss": 8.3336, "loss/crossentropy": 1.9449425339698792, "loss/hidden": 3.3671875, "loss/jsd": 0.0, "loss/logits": 0.275674507021904, "step": 1458 }, { "epoch": 0.09125, "grad_norm": 2.8125, "grad_norm_var": 0.10442301432291666, "learning_rate": 0.0001, "loss": 8.3734, "loss/crossentropy": 2.3953585624694824, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.28133782744407654, "step": 1460 }, { "epoch": 0.091375, "grad_norm": 3.015625, "grad_norm_var": 0.0473785400390625, "learning_rate": 0.0001, "loss": 8.0915, "loss/crossentropy": 2.2441056966781616, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.2531541734933853, "step": 1462 }, { "epoch": 0.0915, "grad_norm": 3.0, "grad_norm_var": 0.04156494140625, "learning_rate": 0.0001, "loss": 8.4787, "loss/crossentropy": 2.4373109340667725, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.2879520505666733, "step": 1464 }, { "epoch": 0.091625, "grad_norm": 2.96875, "grad_norm_var": 0.014850870768229166, "learning_rate": 0.0001, "loss": 8.101, "loss/crossentropy": 2.2889362573623657, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.25555726885795593, "step": 1466 }, { "epoch": 0.09175, "grad_norm": 3.171875, "grad_norm_var": 0.018778483072916668, "learning_rate": 0.0001, "loss": 8.1266, "loss/crossentropy": 2.2097198963165283, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.27271226048469543, "step": 1468 }, { "epoch": 0.091875, "grad_norm": 3.171875, "grad_norm_var": 0.020963541666666665, "learning_rate": 0.0001, "loss": 8.2212, "loss/crossentropy": 2.181049108505249, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.26682595908641815, "step": 1470 }, { "epoch": 0.092, "grad_norm": 3.03125, "grad_norm_var": 0.022163899739583333, "learning_rate": 0.0001, "loss": 8.2866, "loss/crossentropy": 2.311566710472107, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.24793966859579086, "step": 1472 }, { "epoch": 0.092125, "grad_norm": 3.203125, "grad_norm_var": 0.022704060872395834, "learning_rate": 0.0001, "loss": 8.178, "loss/crossentropy": 2.359419822692871, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.26220113039016724, "step": 1474 }, { "epoch": 0.09225, "grad_norm": 3.21875, "grad_norm_var": 0.022001139322916665, "learning_rate": 0.0001, "loss": 8.4216, "loss/crossentropy": 2.547469735145569, "loss/hidden": 3.3125, "loss/jsd": 0.0, "loss/logits": 0.2959955930709839, "step": 1476 }, { "epoch": 0.092375, "grad_norm": 3.03125, "grad_norm_var": 0.021100870768229165, "learning_rate": 0.0001, "loss": 8.2998, "loss/crossentropy": 2.4008055925369263, "loss/hidden": 3.3984375, "loss/jsd": 0.0, "loss/logits": 0.2753802388906479, "step": 1478 }, { "epoch": 0.0925, "grad_norm": 2.671875, "grad_norm_var": 0.028837076822916665, "learning_rate": 0.0001, "loss": 8.0442, "loss/crossentropy": 2.049844443798065, "loss/hidden": 3.2734375, "loss/jsd": 0.0, "loss/logits": 0.25805267691612244, "step": 1480 }, { "epoch": 0.092625, "grad_norm": 3.078125, "grad_norm_var": 0.029100545247395835, "learning_rate": 0.0001, "loss": 8.3892, "loss/crossentropy": 2.3216545581817627, "loss/hidden": 3.390625, "loss/jsd": 0.0, "loss/logits": 0.27284783124923706, "step": 1482 }, { "epoch": 0.09275, "grad_norm": 2.859375, "grad_norm_var": 0.0282867431640625, "learning_rate": 0.0001, "loss": 8.2379, "loss/crossentropy": 2.3917791843414307, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.26631103456020355, "step": 1484 }, { "epoch": 0.092875, "grad_norm": 3.21875, "grad_norm_var": 0.028125, "learning_rate": 0.0001, "loss": 8.5063, "loss/crossentropy": 2.536360025405884, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.2958754599094391, "step": 1486 }, { "epoch": 0.093, "grad_norm": 2.984375, "grad_norm_var": 0.0410308837890625, "learning_rate": 0.0001, "loss": 8.3401, "loss/crossentropy": 2.387327551841736, "loss/hidden": 3.2734375, "loss/jsd": 0.0, "loss/logits": 0.28280311822891235, "step": 1488 }, { "epoch": 0.093125, "grad_norm": 3.578125, "grad_norm_var": 0.05903218587239583, "learning_rate": 0.0001, "loss": 8.3749, "loss/crossentropy": 2.4892961978912354, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.29599107801914215, "step": 1490 }, { "epoch": 0.09325, "grad_norm": 3.21875, "grad_norm_var": 0.0744781494140625, "learning_rate": 0.0001, "loss": 8.2995, "loss/crossentropy": 2.233021378517151, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.269522100687027, "step": 1492 }, { "epoch": 0.093375, "grad_norm": 2.90625, "grad_norm_var": 0.08401285807291667, "learning_rate": 0.0001, "loss": 8.1287, "loss/crossentropy": 2.2778061628341675, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.2637213170528412, "step": 1494 }, { "epoch": 0.0935, "grad_norm": 3.21875, "grad_norm_var": 0.08079020182291667, "learning_rate": 0.0001, "loss": 8.5223, "loss/crossentropy": 2.364134907722473, "loss/hidden": 3.2890625, "loss/jsd": 0.0, "loss/logits": 0.2790801376104355, "step": 1496 }, { "epoch": 0.093625, "grad_norm": 2.734375, "grad_norm_var": 0.09123942057291666, "learning_rate": 0.0001, "loss": 8.1725, "loss/crossentropy": 2.0337949991226196, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.244078166782856, "step": 1498 }, { "epoch": 0.09375, "grad_norm": 3.03125, "grad_norm_var": 0.08502197265625, "learning_rate": 0.0001, "loss": 8.3314, "loss/crossentropy": 2.307387113571167, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.2793863117694855, "step": 1500 }, { "epoch": 0.093875, "grad_norm": 2.84375, "grad_norm_var": 0.09312744140625, "learning_rate": 0.0001, "loss": 7.9943, "loss/crossentropy": 2.4143176078796387, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.26304905116558075, "step": 1502 }, { "epoch": 0.094, "grad_norm": 3.296875, "grad_norm_var": 0.08209228515625, "learning_rate": 0.0001, "loss": 8.3215, "loss/crossentropy": 1.9640471935272217, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.24391764402389526, "step": 1504 }, { "epoch": 0.094125, "grad_norm": 3.03125, "grad_norm_var": 0.0652252197265625, "learning_rate": 0.0001, "loss": 8.204, "loss/crossentropy": 2.405478358268738, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.2646511495113373, "step": 1506 }, { "epoch": 0.09425, "grad_norm": 2.796875, "grad_norm_var": 0.046126302083333334, "learning_rate": 0.0001, "loss": 8.1211, "loss/crossentropy": 2.0469033122062683, "loss/hidden": 3.2109375, "loss/jsd": 0.0, "loss/logits": 0.2384241446852684, "step": 1508 }, { "epoch": 0.094375, "grad_norm": 3.1875, "grad_norm_var": 0.0455230712890625, "learning_rate": 0.0001, "loss": 8.1709, "loss/crossentropy": 2.2403076887130737, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.2608010023832321, "step": 1510 }, { "epoch": 0.0945, "grad_norm": 3.328125, "grad_norm_var": 0.04589436848958333, "learning_rate": 0.0001, "loss": 8.1339, "loss/crossentropy": 2.1499756574630737, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.271525114774704, "step": 1512 }, { "epoch": 0.094625, "grad_norm": 2.734375, "grad_norm_var": 0.04589436848958333, "learning_rate": 0.0001, "loss": 8.2903, "loss/crossentropy": 2.1646158695220947, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.2660486549139023, "step": 1514 }, { "epoch": 0.09475, "grad_norm": 2.984375, "grad_norm_var": 0.05085347493489583, "learning_rate": 0.0001, "loss": 7.9533, "loss/crossentropy": 2.1994398832321167, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.2655777484178543, "step": 1516 }, { "epoch": 0.094875, "grad_norm": 3.171875, "grad_norm_var": 0.047484334309895834, "learning_rate": 0.0001, "loss": 8.1863, "loss/crossentropy": 2.0542885661125183, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.2492573782801628, "step": 1518 }, { "epoch": 0.095, "grad_norm": 2.84375, "grad_norm_var": 0.046507771809895834, "learning_rate": 0.0001, "loss": 8.0486, "loss/crossentropy": 2.382603883743286, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.2706970274448395, "step": 1520 }, { "epoch": 0.095125, "grad_norm": 3.0, "grad_norm_var": 0.04146219889322917, "learning_rate": 0.0001, "loss": 8.288, "loss/crossentropy": 2.2416555881500244, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.26504258811473846, "step": 1522 }, { "epoch": 0.09525, "grad_norm": 3.046875, "grad_norm_var": 0.04072265625, "learning_rate": 0.0001, "loss": 8.3359, "loss/crossentropy": 2.483952045440674, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.288076788187027, "step": 1524 }, { "epoch": 0.095375, "grad_norm": 2.921875, "grad_norm_var": 0.03912353515625, "learning_rate": 0.0001, "loss": 8.3248, "loss/crossentropy": 2.6946524381637573, "loss/hidden": 3.2890625, "loss/jsd": 0.0, "loss/logits": 0.2882222831249237, "step": 1526 }, { "epoch": 0.0955, "grad_norm": 2.921875, "grad_norm_var": 0.01900634765625, "learning_rate": 0.0001, "loss": 8.2016, "loss/crossentropy": 1.9769355058670044, "loss/hidden": 3.2109375, "loss/jsd": 0.0, "loss/logits": 0.2325659841299057, "step": 1528 }, { "epoch": 0.095625, "grad_norm": 2.921875, "grad_norm_var": 0.01513671875, "learning_rate": 0.0001, "loss": 8.2965, "loss/crossentropy": 2.3487859964370728, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.26721881330013275, "step": 1530 }, { "epoch": 0.09575, "grad_norm": 3.59375, "grad_norm_var": 0.039534505208333334, "learning_rate": 0.0001, "loss": 8.1814, "loss/crossentropy": 2.2702786922454834, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.2642149329185486, "step": 1532 }, { "epoch": 0.095875, "grad_norm": 2.640625, "grad_norm_var": 0.056962076822916666, "learning_rate": 0.0001, "loss": 8.2477, "loss/crossentropy": 2.1052145957946777, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.2681325525045395, "step": 1534 }, { "epoch": 0.096, "grad_norm": 3.9375, "grad_norm_var": 0.10074869791666667, "learning_rate": 0.0001, "loss": 8.158, "loss/crossentropy": 2.356938362121582, "loss/hidden": 3.2734375, "loss/jsd": 0.0, "loss/logits": 0.2789629250764847, "step": 1536 }, { "epoch": 0.096125, "grad_norm": 2.671875, "grad_norm_var": 0.11516825358072917, "learning_rate": 0.0001, "loss": 8.2291, "loss/crossentropy": 2.1644341945648193, "loss/hidden": 3.296875, "loss/jsd": 0.0, "loss/logits": 0.2661993205547333, "step": 1538 }, { "epoch": 0.09625, "grad_norm": 3.34375, "grad_norm_var": 0.12259012858072917, "learning_rate": 0.0001, "loss": 8.1661, "loss/crossentropy": 2.512505531311035, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.283734068274498, "step": 1540 }, { "epoch": 0.096375, "grad_norm": 2.71875, "grad_norm_var": 0.12873942057291668, "learning_rate": 0.0001, "loss": 8.1602, "loss/crossentropy": 2.2558088302612305, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.26760005950927734, "step": 1542 }, { "epoch": 0.0965, "grad_norm": 3.0, "grad_norm_var": 0.12698160807291667, "learning_rate": 0.0001, "loss": 8.1477, "loss/crossentropy": 2.2201952934265137, "loss/hidden": 3.328125, "loss/jsd": 0.0, "loss/logits": 0.26513203978538513, "step": 1544 }, { "epoch": 0.096625, "grad_norm": 2.71875, "grad_norm_var": 0.13869527180989583, "learning_rate": 0.0001, "loss": 8.1612, "loss/crossentropy": 2.229737162590027, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.2448047399520874, "step": 1546 }, { "epoch": 0.09675, "grad_norm": 3.234375, "grad_norm_var": 0.12079671223958334, "learning_rate": 0.0001, "loss": 8.1329, "loss/crossentropy": 2.3728041648864746, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.2712114453315735, "step": 1548 }, { "epoch": 0.096875, "grad_norm": 3.0, "grad_norm_var": 0.10236002604166666, "learning_rate": 0.0001, "loss": 8.2579, "loss/crossentropy": 2.3911492824554443, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.28612037003040314, "step": 1550 }, { "epoch": 0.097, "grad_norm": 3.21875, "grad_norm_var": 0.054011027018229164, "learning_rate": 0.0001, "loss": 8.3094, "loss/crossentropy": 2.3950345516204834, "loss/hidden": 3.3515625, "loss/jsd": 0.0, "loss/logits": 0.28530459105968475, "step": 1552 }, { "epoch": 0.097125, "grad_norm": 2.96875, "grad_norm_var": 0.051656087239583336, "learning_rate": 0.0001, "loss": 8.228, "loss/crossentropy": 2.577568531036377, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.26758988201618195, "step": 1554 }, { "epoch": 0.09725, "grad_norm": 2.875, "grad_norm_var": 0.0450592041015625, "learning_rate": 0.0001, "loss": 8.2982, "loss/crossentropy": 2.4208312034606934, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.2721874862909317, "step": 1556 }, { "epoch": 0.097375, "grad_norm": 2.828125, "grad_norm_var": 0.03430582682291667, "learning_rate": 0.0001, "loss": 8.1512, "loss/crossentropy": 2.311411142349243, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.2621803656220436, "step": 1558 }, { "epoch": 0.0975, "grad_norm": 3.015625, "grad_norm_var": 0.036295572916666664, "learning_rate": 0.0001, "loss": 8.278, "loss/crossentropy": 2.180152475833893, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.255389466881752, "step": 1560 }, { "epoch": 0.097625, "grad_norm": 2.953125, "grad_norm_var": 0.027587890625, "learning_rate": 0.0001, "loss": 8.3438, "loss/crossentropy": 2.5294106006622314, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.26509322226047516, "step": 1562 }, { "epoch": 0.09775, "grad_norm": 3.03125, "grad_norm_var": 0.027099609375, "learning_rate": 0.0001, "loss": 8.2816, "loss/crossentropy": 2.1683244705200195, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.2431202381849289, "step": 1564 }, { "epoch": 0.097875, "grad_norm": 3.265625, "grad_norm_var": 0.029686482747395833, "learning_rate": 0.0001, "loss": 8.2192, "loss/crossentropy": 2.2188292741775513, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.29626937210559845, "step": 1566 }, { "epoch": 0.098, "grad_norm": 2.8125, "grad_norm_var": 0.0247955322265625, "learning_rate": 0.0001, "loss": 8.2902, "loss/crossentropy": 2.364318370819092, "loss/hidden": 3.3046875, "loss/jsd": 0.0, "loss/logits": 0.26975926756858826, "step": 1568 }, { "epoch": 0.098125, "grad_norm": 3.0625, "grad_norm_var": 0.02056884765625, "learning_rate": 0.0001, "loss": 8.3332, "loss/crossentropy": 2.54610013961792, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.28531205654144287, "step": 1570 }, { "epoch": 0.09825, "grad_norm": 3.09375, "grad_norm_var": 0.018659464518229165, "learning_rate": 0.0001, "loss": 8.2105, "loss/crossentropy": 2.206403374671936, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.24771679937839508, "step": 1572 }, { "epoch": 0.098375, "grad_norm": 3.078125, "grad_norm_var": 0.015913899739583334, "learning_rate": 0.0001, "loss": 8.3768, "loss/crossentropy": 2.3607594966888428, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.28001710772514343, "step": 1574 }, { "epoch": 0.0985, "grad_norm": 3.15625, "grad_norm_var": 0.021581013997395832, "learning_rate": 0.0001, "loss": 8.2712, "loss/crossentropy": 2.2735308408737183, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.26137876510620117, "step": 1576 }, { "epoch": 0.098625, "grad_norm": 2.734375, "grad_norm_var": 0.027912394205729166, "learning_rate": 0.0001, "loss": 8.2123, "loss/crossentropy": 2.3609447479248047, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.26791180670261383, "step": 1578 }, { "epoch": 0.09875, "grad_norm": 3.234375, "grad_norm_var": 0.028450520833333333, "learning_rate": 0.0001, "loss": 8.3814, "loss/crossentropy": 2.4629390239715576, "loss/hidden": 3.2734375, "loss/jsd": 0.0, "loss/logits": 0.29713912308216095, "step": 1580 }, { "epoch": 0.098875, "grad_norm": 3.171875, "grad_norm_var": 0.028902180989583335, "learning_rate": 0.0001, "loss": 8.3342, "loss/crossentropy": 2.4477301836013794, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.2905379682779312, "step": 1582 }, { "epoch": 0.099, "grad_norm": 2.84375, "grad_norm_var": 0.0256500244140625, "learning_rate": 0.0001, "loss": 8.1553, "loss/crossentropy": 2.310616612434387, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.28200674057006836, "step": 1584 }, { "epoch": 0.099125, "grad_norm": 2.984375, "grad_norm_var": 0.02744140625, "learning_rate": 0.0001, "loss": 8.202, "loss/crossentropy": 2.056865870952606, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.2651172876358032, "step": 1586 }, { "epoch": 0.09925, "grad_norm": 3.546875, "grad_norm_var": 0.04753316243489583, "learning_rate": 0.0001, "loss": 8.3285, "loss/crossentropy": 2.222190737724304, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.2600644379854202, "step": 1588 }, { "epoch": 0.099375, "grad_norm": 3.03125, "grad_norm_var": 0.05275777180989583, "learning_rate": 0.0001, "loss": 8.3277, "loss/crossentropy": 2.499345541000366, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.27901938557624817, "step": 1590 }, { "epoch": 0.0995, "grad_norm": 3.09375, "grad_norm_var": 0.04485270182291667, "learning_rate": 0.0001, "loss": 8.306, "loss/crossentropy": 2.4675090312957764, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.27757471799850464, "step": 1592 }, { "epoch": 0.099625, "grad_norm": 3.078125, "grad_norm_var": 0.03870035807291667, "learning_rate": 0.0001, "loss": 8.1512, "loss/crossentropy": 2.0948686599731445, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.24000447988510132, "step": 1594 }, { "epoch": 0.09975, "grad_norm": 3.078125, "grad_norm_var": 0.0365386962890625, "learning_rate": 0.0001, "loss": 8.2796, "loss/crossentropy": 2.2303179502487183, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.25618939101696014, "step": 1596 }, { "epoch": 0.099875, "grad_norm": 2.78125, "grad_norm_var": 0.03728841145833333, "learning_rate": 0.0001, "loss": 8.2309, "loss/crossentropy": 2.172394037246704, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.26014433801174164, "step": 1598 }, { "epoch": 0.1, "grad_norm": 3.0, "grad_norm_var": 0.0431640625, "learning_rate": 0.0001, "loss": 8.0142, "loss/crossentropy": 2.0257323384284973, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.23609444499015808, "step": 1600 }, { "epoch": 0.100125, "grad_norm": 2.8125, "grad_norm_var": 0.044514973958333336, "learning_rate": 0.0001, "loss": 8.0713, "loss/crossentropy": 2.1410731077194214, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.26623016595840454, "step": 1602 }, { "epoch": 0.10025, "grad_norm": 2.984375, "grad_norm_var": 0.021491495768229167, "learning_rate": 0.0001, "loss": 8.0494, "loss/crossentropy": 2.3695082664489746, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.2746554762125015, "step": 1604 }, { "epoch": 0.100375, "grad_norm": 2.90625, "grad_norm_var": 0.021637980143229166, "learning_rate": 0.0001, "loss": 8.1369, "loss/crossentropy": 2.2459890842437744, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.25808124244213104, "step": 1606 }, { "epoch": 0.1005, "grad_norm": 3.25, "grad_norm_var": 0.023591105143229166, "learning_rate": 0.0001, "loss": 8.3249, "loss/crossentropy": 2.4524621963500977, "loss/hidden": 3.2109375, "loss/jsd": 0.0, "loss/logits": 0.2866296321153641, "step": 1608 }, { "epoch": 0.100625, "grad_norm": 3.0, "grad_norm_var": 0.023128255208333334, "learning_rate": 0.0001, "loss": 8.2293, "loss/crossentropy": 2.3052438497543335, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.27561667561531067, "step": 1610 }, { "epoch": 0.10075, "grad_norm": 2.71875, "grad_norm_var": 0.021174112955729168, "learning_rate": 0.0001, "loss": 8.1306, "loss/crossentropy": 2.477377772331238, "loss/hidden": 3.328125, "loss/jsd": 0.0, "loss/logits": 0.26571913063526154, "step": 1612 }, { "epoch": 0.100875, "grad_norm": 2.90625, "grad_norm_var": 0.1718414306640625, "learning_rate": 0.0001, "loss": 8.4299, "loss/crossentropy": 2.2197115421295166, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.2796938568353653, "step": 1614 }, { "epoch": 0.101, "grad_norm": 3.078125, "grad_norm_var": 0.17489827473958333, "learning_rate": 0.0001, "loss": 8.2917, "loss/crossentropy": 2.2831382751464844, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.2742185890674591, "step": 1616 }, { "epoch": 0.101125, "grad_norm": 3.0625, "grad_norm_var": 0.17009989420572916, "learning_rate": 0.0001, "loss": 8.2352, "loss/crossentropy": 2.2610585689544678, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.26356393098831177, "step": 1618 }, { "epoch": 0.10125, "grad_norm": 3.046875, "grad_norm_var": 0.16988525390625, "learning_rate": 0.0001, "loss": 8.2457, "loss/crossentropy": 2.2453945875167847, "loss/hidden": 3.2734375, "loss/jsd": 0.0, "loss/logits": 0.2588353157043457, "step": 1620 }, { "epoch": 0.101375, "grad_norm": 3.203125, "grad_norm_var": 0.16035054524739584, "learning_rate": 0.0001, "loss": 8.2079, "loss/crossentropy": 2.2290210723876953, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.27493180334568024, "step": 1622 }, { "epoch": 0.1015, "grad_norm": 3.0, "grad_norm_var": 0.15942281087239582, "learning_rate": 0.0001, "loss": 8.3946, "loss/crossentropy": 2.3389216661453247, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.2627445012331009, "step": 1624 }, { "epoch": 0.101625, "grad_norm": 3.625, "grad_norm_var": 0.32203776041666665, "learning_rate": 0.0001, "loss": 8.2128, "loss/crossentropy": 2.193161904811859, "loss/hidden": 3.2734375, "loss/jsd": 0.0, "loss/logits": 0.2609563320875168, "step": 1626 }, { "epoch": 0.10175, "grad_norm": 2.9375, "grad_norm_var": 0.29754231770833334, "learning_rate": 0.0001, "loss": 8.2789, "loss/crossentropy": 2.4037156105041504, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.2840966284275055, "step": 1628 }, { "epoch": 0.101875, "grad_norm": 14.4375, "grad_norm_var": 8.05601298014323, "learning_rate": 0.0001, "loss": 8.7256, "loss/crossentropy": 2.2983932495117188, "loss/hidden": 3.2890625, "loss/jsd": 0.0, "loss/logits": 0.2895353436470032, "step": 1630 }, { "epoch": 0.102, "grad_norm": 3.359375, "grad_norm_var": 8.257710774739584, "learning_rate": 0.0001, "loss": 8.5242, "loss/crossentropy": 2.5548095703125, "loss/hidden": 3.2890625, "loss/jsd": 0.0, "loss/logits": 0.29210618138313293, "step": 1632 }, { "epoch": 0.102125, "grad_norm": 3.015625, "grad_norm_var": 8.2908203125, "learning_rate": 0.0001, "loss": 8.2313, "loss/crossentropy": 2.1125290393829346, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.26304440200328827, "step": 1634 }, { "epoch": 0.10225, "grad_norm": 3.0, "grad_norm_var": 8.321890258789063, "learning_rate": 0.0001, "loss": 8.3129, "loss/crossentropy": 2.4635796546936035, "loss/hidden": 3.3203125, "loss/jsd": 0.0, "loss/logits": 0.289856493473053, "step": 1636 }, { "epoch": 0.102375, "grad_norm": 3.359375, "grad_norm_var": 8.303043619791667, "learning_rate": 0.0001, "loss": 8.3342, "loss/crossentropy": 2.4671066999435425, "loss/hidden": 3.328125, "loss/jsd": 0.0, "loss/logits": 0.30989140272140503, "step": 1638 }, { "epoch": 0.1025, "grad_norm": 2.859375, "grad_norm_var": 8.330631510416667, "learning_rate": 0.0001, "loss": 8.3266, "loss/crossentropy": 2.2032480239868164, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.28856223821640015, "step": 1640 }, { "epoch": 0.102625, "grad_norm": 2.875, "grad_norm_var": 8.449762980143229, "learning_rate": 0.0001, "loss": 8.0619, "loss/crossentropy": 2.384071946144104, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.27716881036758423, "step": 1642 }, { "epoch": 0.10275, "grad_norm": 4.25, "grad_norm_var": 8.37940165201823, "learning_rate": 0.0001, "loss": 8.2985, "loss/crossentropy": 2.394433617591858, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.27403971552848816, "step": 1644 }, { "epoch": 0.102875, "grad_norm": 3.0, "grad_norm_var": 0.6259073893229167, "learning_rate": 0.0001, "loss": 8.1275, "loss/crossentropy": 2.5770705938339233, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.2827526032924652, "step": 1646 }, { "epoch": 0.103, "grad_norm": 2.9375, "grad_norm_var": 0.11741434733072917, "learning_rate": 0.0001, "loss": 8.1994, "loss/crossentropy": 2.203721523284912, "loss/hidden": 3.3046875, "loss/jsd": 0.0, "loss/logits": 0.2632629871368408, "step": 1648 }, { "epoch": 0.103125, "grad_norm": 2.78125, "grad_norm_var": 0.11988525390625, "learning_rate": 0.0001, "loss": 8.27, "loss/crossentropy": 2.190592408180237, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.2594703510403633, "step": 1650 }, { "epoch": 0.10325, "grad_norm": 3.109375, "grad_norm_var": 0.122119140625, "learning_rate": 0.0001, "loss": 8.3436, "loss/crossentropy": 2.3094884157180786, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.2539513558149338, "step": 1652 }, { "epoch": 0.103375, "grad_norm": 2.8125, "grad_norm_var": 0.12323811848958334, "learning_rate": 0.0001, "loss": 8.2993, "loss/crossentropy": 2.2542308568954468, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.27265970408916473, "step": 1654 }, { "epoch": 0.1035, "grad_norm": 3.65625, "grad_norm_var": 0.14322916666666666, "learning_rate": 0.0001, "loss": 8.2358, "loss/crossentropy": 2.150593101978302, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.26226024329662323, "step": 1656 }, { "epoch": 0.103625, "grad_norm": 2.796875, "grad_norm_var": 0.15575764973958334, "learning_rate": 0.0001, "loss": 8.2424, "loss/crossentropy": 2.3359317779541016, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.2621028572320938, "step": 1658 }, { "epoch": 0.10375, "grad_norm": 3.125, "grad_norm_var": 0.07243550618489583, "learning_rate": 0.0001, "loss": 8.209, "loss/crossentropy": 2.3354294300079346, "loss/hidden": 3.2734375, "loss/jsd": 0.0, "loss/logits": 0.28642134368419647, "step": 1660 }, { "epoch": 0.103875, "grad_norm": 2.890625, "grad_norm_var": 0.07413736979166667, "learning_rate": 0.0001, "loss": 8.0643, "loss/crossentropy": 2.2628813982009888, "loss/hidden": 3.3359375, "loss/jsd": 0.0, "loss/logits": 0.2720167338848114, "step": 1662 }, { "epoch": 0.104, "grad_norm": 2.640625, "grad_norm_var": 0.08414713541666667, "learning_rate": 0.0001, "loss": 8.2252, "loss/crossentropy": 2.405531644821167, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.2855593413114548, "step": 1664 }, { "epoch": 0.104125, "grad_norm": 3.453125, "grad_norm_var": 0.09845377604166666, "learning_rate": 0.0001, "loss": 8.2726, "loss/crossentropy": 2.348948836326599, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.2590363919734955, "step": 1666 }, { "epoch": 0.10425, "grad_norm": 2.6875, "grad_norm_var": 0.108740234375, "learning_rate": 0.0001, "loss": 8.0444, "loss/crossentropy": 2.275284171104431, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.2520073354244232, "step": 1668 }, { "epoch": 0.104375, "grad_norm": 3.171875, "grad_norm_var": 0.10839436848958334, "learning_rate": 0.0001, "loss": 7.9805, "loss/crossentropy": 2.160663425922394, "loss/hidden": 3.3125, "loss/jsd": 0.0, "loss/logits": 0.25372669100761414, "step": 1670 }, { "epoch": 0.1045, "grad_norm": 3.1875, "grad_norm_var": 0.0904205322265625, "learning_rate": 0.0001, "loss": 8.2607, "loss/crossentropy": 2.2359933853149414, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.2621243894100189, "step": 1672 }, { "epoch": 0.104625, "grad_norm": 2.78125, "grad_norm_var": 0.06806233723958334, "learning_rate": 0.0001, "loss": 7.9523, "loss/crossentropy": 1.9437886476516724, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.2611730396747589, "step": 1674 }, { "epoch": 0.10475, "grad_norm": 2.859375, "grad_norm_var": 0.06712137858072917, "learning_rate": 0.0001, "loss": 8.0401, "loss/crossentropy": 2.288792371749878, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.2550469785928726, "step": 1676 }, { "epoch": 0.104875, "grad_norm": 2.96875, "grad_norm_var": 0.07136128743489584, "learning_rate": 0.0001, "loss": 8.2327, "loss/crossentropy": 2.1720248460769653, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.24942373484373093, "step": 1678 }, { "epoch": 0.105, "grad_norm": 2.65625, "grad_norm_var": 0.06942952473958333, "learning_rate": 0.0001, "loss": 8.2368, "loss/crossentropy": 2.2941821813583374, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.2739366888999939, "step": 1680 }, { "epoch": 0.105125, "grad_norm": 3.0625, "grad_norm_var": 0.0464508056640625, "learning_rate": 0.0001, "loss": 8.234, "loss/crossentropy": 2.255189538002014, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.2702263593673706, "step": 1682 }, { "epoch": 0.10525, "grad_norm": 3.078125, "grad_norm_var": 0.0395660400390625, "learning_rate": 0.0001, "loss": 8.1081, "loss/crossentropy": 2.419864535331726, "loss/hidden": 3.2109375, "loss/jsd": 0.0, "loss/logits": 0.26446475088596344, "step": 1684 }, { "epoch": 0.105375, "grad_norm": 2.734375, "grad_norm_var": 0.041559855143229164, "learning_rate": 0.0001, "loss": 8.3004, "loss/crossentropy": 2.329113721847534, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.2699923515319824, "step": 1686 }, { "epoch": 0.1055, "grad_norm": 2.890625, "grad_norm_var": 0.02672119140625, "learning_rate": 0.0001, "loss": 8.0978, "loss/crossentropy": 2.290674090385437, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.2571800425648689, "step": 1688 }, { "epoch": 0.105625, "grad_norm": 2.703125, "grad_norm_var": 0.031787109375, "learning_rate": 0.0001, "loss": 7.992, "loss/crossentropy": 2.194283127784729, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.2518819496035576, "step": 1690 }, { "epoch": 0.10575, "grad_norm": 3.625, "grad_norm_var": 0.06621805826822917, "learning_rate": 0.0001, "loss": 8.0845, "loss/crossentropy": 2.3665404319763184, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.2546495646238327, "step": 1692 }, { "epoch": 0.105875, "grad_norm": 2.796875, "grad_norm_var": 0.06741536458333333, "learning_rate": 0.0001, "loss": 8.2148, "loss/crossentropy": 2.6665724515914917, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.26855309307575226, "step": 1694 }, { "epoch": 0.106, "grad_norm": 2.59375, "grad_norm_var": 0.06809488932291667, "learning_rate": 0.0001, "loss": 8.0213, "loss/crossentropy": 2.0011618733406067, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.2479224056005478, "step": 1696 }, { "epoch": 0.106125, "grad_norm": 3.0, "grad_norm_var": 0.06813151041666667, "learning_rate": 0.0001, "loss": 8.1117, "loss/crossentropy": 2.233310341835022, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.26155102252960205, "step": 1698 }, { "epoch": 0.10625, "grad_norm": 3.078125, "grad_norm_var": 0.067138671875, "learning_rate": 0.0001, "loss": 8.1435, "loss/crossentropy": 2.023264706134796, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.23082198202610016, "step": 1700 }, { "epoch": 0.106375, "grad_norm": 2.984375, "grad_norm_var": 0.06722005208333333, "learning_rate": 0.0001, "loss": 8.0233, "loss/crossentropy": 2.2503018379211426, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.24713517725467682, "step": 1702 }, { "epoch": 0.1065, "grad_norm": 2.796875, "grad_norm_var": 0.07294820149739584, "learning_rate": 0.0001, "loss": 7.9907, "loss/crossentropy": 2.557657241821289, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.26641707122325897, "step": 1704 }, { "epoch": 0.106625, "grad_norm": 2.78125, "grad_norm_var": 0.06492411295572917, "learning_rate": 0.0001, "loss": 7.9908, "loss/crossentropy": 2.2161107063293457, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.2457757443189621, "step": 1706 }, { "epoch": 0.10675, "grad_norm": 3.15625, "grad_norm_var": 0.041047159830729166, "learning_rate": 0.0001, "loss": 8.3599, "loss/crossentropy": 2.307586431503296, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.25822295993566513, "step": 1708 }, { "epoch": 0.106875, "grad_norm": 2.875, "grad_norm_var": 0.03759765625, "learning_rate": 0.0001, "loss": 8.1595, "loss/crossentropy": 2.215519666671753, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.2511584535241127, "step": 1710 }, { "epoch": 0.107, "grad_norm": 3.125, "grad_norm_var": 0.03365478515625, "learning_rate": 0.0001, "loss": 8.3037, "loss/crossentropy": 2.292387008666992, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.2549327313899994, "step": 1712 }, { "epoch": 0.107125, "grad_norm": 2.9375, "grad_norm_var": 0.03853251139322917, "learning_rate": 0.0001, "loss": 8.1552, "loss/crossentropy": 2.573517322540283, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.27707283198833466, "step": 1714 }, { "epoch": 0.10725, "grad_norm": 3.0625, "grad_norm_var": 0.03942057291666667, "learning_rate": 0.0001, "loss": 8.1879, "loss/crossentropy": 2.4223859310150146, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.26580674946308136, "step": 1716 }, { "epoch": 0.107375, "grad_norm": 2.828125, "grad_norm_var": 0.03720296223958333, "learning_rate": 0.0001, "loss": 8.2149, "loss/crossentropy": 2.5507869720458984, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.2808096259832382, "step": 1718 }, { "epoch": 0.1075, "grad_norm": 5.46875, "grad_norm_var": 0.42760009765625, "learning_rate": 0.0001, "loss": 8.2746, "loss/crossentropy": 2.4066158533096313, "loss/hidden": 3.2734375, "loss/jsd": 0.0, "loss/logits": 0.27704988420009613, "step": 1720 }, { "epoch": 0.107625, "grad_norm": 3.1875, "grad_norm_var": 0.42789306640625, "learning_rate": 0.0001, "loss": 8.1194, "loss/crossentropy": 2.3192564249038696, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.2466306835412979, "step": 1722 }, { "epoch": 0.10775, "grad_norm": 2.828125, "grad_norm_var": 0.4310943603515625, "learning_rate": 0.0001, "loss": 8.1758, "loss/crossentropy": 2.2524945735931396, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.26875488460063934, "step": 1724 }, { "epoch": 0.107875, "grad_norm": 2.921875, "grad_norm_var": 0.4318359375, "learning_rate": 0.0001, "loss": 8.27, "loss/crossentropy": 2.2209118604660034, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.25894972681999207, "step": 1726 }, { "epoch": 0.108, "grad_norm": 3.0625, "grad_norm_var": 0.43041890462239585, "learning_rate": 0.0001, "loss": 8.2942, "loss/crossentropy": 2.156678080558777, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.24751365184783936, "step": 1728 }, { "epoch": 0.108125, "grad_norm": 2.96875, "grad_norm_var": 0.42939453125, "learning_rate": 0.0001, "loss": 7.9607, "loss/crossentropy": 2.1267510652542114, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.24761785566806793, "step": 1730 }, { "epoch": 0.10825, "grad_norm": 2.6875, "grad_norm_var": 0.4383941650390625, "learning_rate": 0.0001, "loss": 7.9771, "loss/crossentropy": 2.300544857978821, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.23868989944458008, "step": 1732 }, { "epoch": 0.108375, "grad_norm": 2.890625, "grad_norm_var": 0.45806884765625, "learning_rate": 0.0001, "loss": 8.1917, "loss/crossentropy": 2.360519051551819, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.26660336554050446, "step": 1734 }, { "epoch": 0.1085, "grad_norm": 2.859375, "grad_norm_var": 0.06379292805989584, "learning_rate": 0.0001, "loss": 8.1922, "loss/crossentropy": 2.1705552339553833, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.2528446614742279, "step": 1736 }, { "epoch": 0.108625, "grad_norm": 3.0625, "grad_norm_var": 0.034357706705729164, "learning_rate": 0.0001, "loss": 8.1198, "loss/crossentropy": 2.299151659011841, "loss/hidden": 3.2109375, "loss/jsd": 0.0, "loss/logits": 0.25353947281837463, "step": 1738 }, { "epoch": 0.10875, "grad_norm": 2.53125, "grad_norm_var": 0.045210774739583334, "learning_rate": 0.0001, "loss": 8.2351, "loss/crossentropy": 2.172037899494171, "loss/hidden": 3.375, "loss/jsd": 0.0, "loss/logits": 0.2686986029148102, "step": 1740 }, { "epoch": 0.108875, "grad_norm": 3.3125, "grad_norm_var": 0.0550201416015625, "learning_rate": 0.0001, "loss": 8.0653, "loss/crossentropy": 2.3453006744384766, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.2615263909101486, "step": 1742 }, { "epoch": 0.109, "grad_norm": 2.59375, "grad_norm_var": 0.06112874348958333, "learning_rate": 0.0001, "loss": 8.0488, "loss/crossentropy": 2.4043914079666138, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.2809004932641983, "step": 1744 }, { "epoch": 0.109125, "grad_norm": 3.296875, "grad_norm_var": 0.06349995930989584, "learning_rate": 0.0001, "loss": 8.3458, "loss/crossentropy": 2.3624587059020996, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.29849672317504883, "step": 1746 }, { "epoch": 0.10925, "grad_norm": 3.25, "grad_norm_var": 0.06689453125, "learning_rate": 0.0001, "loss": 8.2422, "loss/crossentropy": 2.299923300743103, "loss/hidden": 3.2890625, "loss/jsd": 0.0, "loss/logits": 0.2804667204618454, "step": 1748 }, { "epoch": 0.109375, "grad_norm": 2.625, "grad_norm_var": 0.06067301432291667, "learning_rate": 0.0001, "loss": 8.0319, "loss/crossentropy": 2.017127275466919, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.236178919672966, "step": 1750 }, { "epoch": 0.1095, "grad_norm": 2.875, "grad_norm_var": 0.05732014973958333, "learning_rate": 0.0001, "loss": 8.2062, "loss/crossentropy": 2.665374517440796, "loss/hidden": 3.296875, "loss/jsd": 0.0, "loss/logits": 0.28587816655635834, "step": 1752 }, { "epoch": 0.109625, "grad_norm": 2.609375, "grad_norm_var": 0.07112528483072916, "learning_rate": 0.0001, "loss": 8.129, "loss/crossentropy": 2.2756296396255493, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.2523636817932129, "step": 1754 }, { "epoch": 0.10975, "grad_norm": 2.765625, "grad_norm_var": 0.058259073893229166, "learning_rate": 0.0001, "loss": 7.9899, "loss/crossentropy": 2.066028356552124, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.23234106600284576, "step": 1756 }, { "epoch": 0.109875, "grad_norm": 2.796875, "grad_norm_var": 0.05266520182291667, "learning_rate": 0.0001, "loss": 8.1501, "loss/crossentropy": 2.307652711868286, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.26358823478221893, "step": 1758 }, { "epoch": 0.11, "grad_norm": 2.84375, "grad_norm_var": 0.0484375, "learning_rate": 0.0001, "loss": 8.0521, "loss/crossentropy": 2.00510311126709, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.2376161813735962, "step": 1760 }, { "epoch": 0.110125, "grad_norm": 2.765625, "grad_norm_var": 0.04407450358072917, "learning_rate": 0.0001, "loss": 8.1974, "loss/crossentropy": 2.4039831161499023, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.27686847746372223, "step": 1762 }, { "epoch": 0.11025, "grad_norm": 2.8125, "grad_norm_var": 0.03303934733072917, "learning_rate": 0.0001, "loss": 8.0579, "loss/crossentropy": 2.3060104846954346, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.2945534288883209, "step": 1764 }, { "epoch": 0.110375, "grad_norm": 3.40625, "grad_norm_var": 0.050047810872395834, "learning_rate": 0.0001, "loss": 8.0388, "loss/crossentropy": 2.4445682764053345, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.27699966728687286, "step": 1766 }, { "epoch": 0.1105, "grad_norm": 2.75, "grad_norm_var": 0.05078837076822917, "learning_rate": 0.0001, "loss": 8.2157, "loss/crossentropy": 2.4216455221176147, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.28081244230270386, "step": 1768 }, { "epoch": 0.110625, "grad_norm": 2.859375, "grad_norm_var": 0.035521443684895834, "learning_rate": 0.0001, "loss": 8.0611, "loss/crossentropy": 1.7756622433662415, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.2289479374885559, "step": 1770 }, { "epoch": 0.11075, "grad_norm": 2.9375, "grad_norm_var": 0.03371480305989583, "learning_rate": 0.0001, "loss": 7.9772, "loss/crossentropy": 2.1726499795913696, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.2653361111879349, "step": 1772 }, { "epoch": 0.110875, "grad_norm": 2.984375, "grad_norm_var": 0.034016927083333336, "learning_rate": 0.0001, "loss": 8.0615, "loss/crossentropy": 2.3147062063217163, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.27281494438648224, "step": 1774 }, { "epoch": 0.111, "grad_norm": 3.09375, "grad_norm_var": 0.0338775634765625, "learning_rate": 0.0001, "loss": 8.1474, "loss/crossentropy": 2.1700609922409058, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.25692617893218994, "step": 1776 }, { "epoch": 0.111125, "grad_norm": 2.8125, "grad_norm_var": 0.0308990478515625, "learning_rate": 0.0001, "loss": 8.0753, "loss/crossentropy": 2.114788770675659, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.2623438090085983, "step": 1778 }, { "epoch": 0.11125, "grad_norm": 3.0625, "grad_norm_var": 0.03291727701822917, "learning_rate": 0.0001, "loss": 8.232, "loss/crossentropy": 2.3088366985321045, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.25789759308099747, "step": 1780 }, { "epoch": 0.111375, "grad_norm": 2.828125, "grad_norm_var": 0.0161773681640625, "learning_rate": 0.0001, "loss": 8.1283, "loss/crossentropy": 2.021032750606537, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.26125916838645935, "step": 1782 }, { "epoch": 0.1115, "grad_norm": 2.8125, "grad_norm_var": 0.016520182291666668, "learning_rate": 0.0001, "loss": 8.3159, "loss/crossentropy": 2.546655535697937, "loss/hidden": 3.2109375, "loss/jsd": 0.0, "loss/logits": 0.26530279219150543, "step": 1784 }, { "epoch": 0.111625, "grad_norm": 2.703125, "grad_norm_var": 0.018094889322916665, "learning_rate": 0.0001, "loss": 8.0182, "loss/crossentropy": 2.1505188941955566, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.23876308649778366, "step": 1786 }, { "epoch": 0.11175, "grad_norm": 2.875, "grad_norm_var": 0.019880167643229165, "learning_rate": 0.0001, "loss": 8.0242, "loss/crossentropy": 2.1757689714431763, "loss/hidden": 3.2109375, "loss/jsd": 0.0, "loss/logits": 0.2557087540626526, "step": 1788 }, { "epoch": 0.111875, "grad_norm": 2.953125, "grad_norm_var": 0.01763916015625, "learning_rate": 0.0001, "loss": 8.0262, "loss/crossentropy": 2.28451144695282, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.26241403818130493, "step": 1790 }, { "epoch": 0.112, "grad_norm": 2.765625, "grad_norm_var": 0.016145833333333335, "learning_rate": 0.0001, "loss": 8.0235, "loss/crossentropy": 2.11034619808197, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.2549893856048584, "step": 1792 }, { "epoch": 0.112125, "grad_norm": 2.828125, "grad_norm_var": 0.014969889322916667, "learning_rate": 0.0001, "loss": 8.076, "loss/crossentropy": 2.0472227931022644, "loss/hidden": 3.2109375, "loss/jsd": 0.0, "loss/logits": 0.2562412843108177, "step": 1794 }, { "epoch": 0.11225, "grad_norm": 2.796875, "grad_norm_var": 0.0120758056640625, "learning_rate": 0.0001, "loss": 7.928, "loss/crossentropy": 2.315675735473633, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.2601415067911148, "step": 1796 }, { "epoch": 0.112375, "grad_norm": 2.953125, "grad_norm_var": 0.022443644205729165, "learning_rate": 0.0001, "loss": 7.8579, "loss/crossentropy": 2.0402532815933228, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.2341025322675705, "step": 1798 }, { "epoch": 0.1125, "grad_norm": 2.78125, "grad_norm_var": 0.02135009765625, "learning_rate": 0.0001, "loss": 8.2218, "loss/crossentropy": 2.562678098678589, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.2757340967655182, "step": 1800 }, { "epoch": 0.112625, "grad_norm": 3.140625, "grad_norm_var": 0.024657185872395834, "learning_rate": 0.0001, "loss": 8.1885, "loss/crossentropy": 2.0969003438949585, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.2326880842447281, "step": 1802 }, { "epoch": 0.11275, "grad_norm": 2.734375, "grad_norm_var": 0.030720011393229166, "learning_rate": 0.0001, "loss": 8.1163, "loss/crossentropy": 2.332270383834839, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.25439298152923584, "step": 1804 }, { "epoch": 0.112875, "grad_norm": 3.125, "grad_norm_var": 0.03774312337239583, "learning_rate": 0.0001, "loss": 8.106, "loss/crossentropy": 2.162129521369934, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.26606719195842743, "step": 1806 }, { "epoch": 0.113, "grad_norm": 3.109375, "grad_norm_var": 0.0377593994140625, "learning_rate": 0.0001, "loss": 8.1417, "loss/crossentropy": 2.2731558084487915, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.2553500384092331, "step": 1808 }, { "epoch": 0.113125, "grad_norm": 2.6875, "grad_norm_var": 0.0411529541015625, "learning_rate": 0.0001, "loss": 8.0807, "loss/crossentropy": 2.10029274225235, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.23542343080043793, "step": 1810 }, { "epoch": 0.11325, "grad_norm": 2.875, "grad_norm_var": 0.04257405598958333, "learning_rate": 0.0001, "loss": 8.1634, "loss/crossentropy": 2.426058769226074, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.2891880124807358, "step": 1812 }, { "epoch": 0.113375, "grad_norm": 3.0625, "grad_norm_var": 0.033154296875, "learning_rate": 0.0001, "loss": 8.0769, "loss/crossentropy": 2.4051181077957153, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.2494017630815506, "step": 1814 }, { "epoch": 0.1135, "grad_norm": 2.671875, "grad_norm_var": 0.03585611979166667, "learning_rate": 0.0001, "loss": 8.0919, "loss/crossentropy": 2.0648642778396606, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.23387089371681213, "step": 1816 }, { "epoch": 0.113625, "grad_norm": 3.25, "grad_norm_var": 0.0592681884765625, "learning_rate": 0.0001, "loss": 8.078, "loss/crossentropy": 1.989893615245819, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.23796076327562332, "step": 1818 }, { "epoch": 0.11375, "grad_norm": 2.828125, "grad_norm_var": 0.0541656494140625, "learning_rate": 0.0001, "loss": 7.9998, "loss/crossentropy": 1.9235325455665588, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.22104668617248535, "step": 1820 }, { "epoch": 0.113875, "grad_norm": 2.828125, "grad_norm_var": 0.05319010416666667, "learning_rate": 0.0001, "loss": 8.0283, "loss/crossentropy": 2.39365816116333, "loss/hidden": 3.2109375, "loss/jsd": 0.0, "loss/logits": 0.25356078147888184, "step": 1822 }, { "epoch": 0.114, "grad_norm": 2.765625, "grad_norm_var": 0.05628153483072917, "learning_rate": 0.0001, "loss": 8.0969, "loss/crossentropy": 2.2098069190979004, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.2322394847869873, "step": 1824 }, { "epoch": 0.114125, "grad_norm": 2.59375, "grad_norm_var": 0.05894266764322917, "learning_rate": 0.0001, "loss": 8.0474, "loss/crossentropy": 2.3926165103912354, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.26291845738887787, "step": 1826 }, { "epoch": 0.11425, "grad_norm": 2.90625, "grad_norm_var": 0.057494099934895834, "learning_rate": 0.0001, "loss": 8.0912, "loss/crossentropy": 2.271665573120117, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.28214675188064575, "step": 1828 }, { "epoch": 0.114375, "grad_norm": 2.96875, "grad_norm_var": 0.055826822916666664, "learning_rate": 0.0001, "loss": 8.2937, "loss/crossentropy": 2.3243162631988525, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.25888554751873016, "step": 1830 }, { "epoch": 0.1145, "grad_norm": 2.828125, "grad_norm_var": 0.05383707682291667, "learning_rate": 0.0001, "loss": 8.158, "loss/crossentropy": 2.4674028158187866, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.2678636610507965, "step": 1832 }, { "epoch": 0.114625, "grad_norm": 2.90625, "grad_norm_var": 0.023795572916666667, "learning_rate": 0.0001, "loss": 8.2065, "loss/crossentropy": 2.2178725004196167, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.2575995400547981, "step": 1834 }, { "epoch": 0.11475, "grad_norm": 2.90625, "grad_norm_var": 0.023444620768229167, "learning_rate": 0.0001, "loss": 8.086, "loss/crossentropy": 2.173088550567627, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.25725461542606354, "step": 1836 }, { "epoch": 0.114875, "grad_norm": 2.953125, "grad_norm_var": 0.015576171875, "learning_rate": 0.0001, "loss": 8.013, "loss/crossentropy": 2.167203664779663, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.2551988810300827, "step": 1838 }, { "epoch": 0.115, "grad_norm": 2.828125, "grad_norm_var": 0.011714680989583334, "learning_rate": 0.0001, "loss": 8.0137, "loss/crossentropy": 2.324142336845398, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.25543810427188873, "step": 1840 }, { "epoch": 0.115125, "grad_norm": 2.6875, "grad_norm_var": 0.008675130208333333, "learning_rate": 0.0001, "loss": 7.8792, "loss/crossentropy": 2.3638752698898315, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.26999443769454956, "step": 1842 }, { "epoch": 0.11525, "grad_norm": 2.859375, "grad_norm_var": 0.014351399739583333, "learning_rate": 0.0001, "loss": 8.0577, "loss/crossentropy": 2.236335277557373, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.24563505500555038, "step": 1844 }, { "epoch": 0.115375, "grad_norm": 2.78125, "grad_norm_var": 0.018550618489583334, "learning_rate": 0.0001, "loss": 8.0579, "loss/crossentropy": 2.3817098140716553, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.25307345390319824, "step": 1846 }, { "epoch": 0.1155, "grad_norm": 2.71875, "grad_norm_var": 0.016373697916666666, "learning_rate": 0.0001, "loss": 7.796, "loss/crossentropy": 2.2049105167388916, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.2526453882455826, "step": 1848 }, { "epoch": 0.115625, "grad_norm": 2.984375, "grad_norm_var": 0.019429524739583332, "learning_rate": 0.0001, "loss": 8.2624, "loss/crossentropy": 2.544666051864624, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.27838000655174255, "step": 1850 }, { "epoch": 0.11575, "grad_norm": 2.859375, "grad_norm_var": 0.0191070556640625, "learning_rate": 0.0001, "loss": 8.0722, "loss/crossentropy": 2.4957003593444824, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.25582027435302734, "step": 1852 }, { "epoch": 0.115875, "grad_norm": 3.21875, "grad_norm_var": 0.030301920572916665, "learning_rate": 0.0001, "loss": 7.9738, "loss/crossentropy": 2.2852269411087036, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.27101629972457886, "step": 1854 }, { "epoch": 0.116, "grad_norm": 2.953125, "grad_norm_var": 0.030790201822916665, "learning_rate": 0.0001, "loss": 7.9897, "loss/crossentropy": 2.1064014434814453, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.24057136476039886, "step": 1856 }, { "epoch": 0.116125, "grad_norm": 3.140625, "grad_norm_var": 0.03762105305989583, "learning_rate": 0.0001, "loss": 8.1089, "loss/crossentropy": 2.352795124053955, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.23926259577274323, "step": 1858 }, { "epoch": 0.11625, "grad_norm": 2.484375, "grad_norm_var": 0.04638671875, "learning_rate": 0.0001, "loss": 7.8246, "loss/crossentropy": 2.085222840309143, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.21554403752088547, "step": 1860 }, { "epoch": 0.116375, "grad_norm": 2.65625, "grad_norm_var": 0.0461578369140625, "learning_rate": 0.0001, "loss": 7.9895, "loss/crossentropy": 1.9475982785224915, "loss/hidden": 3.2734375, "loss/jsd": 0.0, "loss/logits": 0.2195342779159546, "step": 1862 }, { "epoch": 0.1165, "grad_norm": 2.703125, "grad_norm_var": 0.04849853515625, "learning_rate": 0.0001, "loss": 7.9607, "loss/crossentropy": 2.4439034461975098, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.27569329738616943, "step": 1864 }, { "epoch": 0.116625, "grad_norm": 3.328125, "grad_norm_var": 0.06008199055989583, "learning_rate": 0.0001, "loss": 7.9813, "loss/crossentropy": 2.3087748289108276, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.2708819806575775, "step": 1866 }, { "epoch": 0.11675, "grad_norm": 2.859375, "grad_norm_var": 0.06638895670572917, "learning_rate": 0.0001, "loss": 8.0842, "loss/crossentropy": 2.2168221473693848, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.2662223279476166, "step": 1868 }, { "epoch": 0.116875, "grad_norm": 2.59375, "grad_norm_var": 0.06785380045572917, "learning_rate": 0.0001, "loss": 8.0107, "loss/crossentropy": 2.008695662021637, "loss/hidden": 3.015625, "loss/jsd": 0.0, "loss/logits": 0.24834764003753662, "step": 1870 }, { "epoch": 0.117, "grad_norm": 3.109375, "grad_norm_var": 0.07073160807291666, "learning_rate": 0.0001, "loss": 8.1764, "loss/crossentropy": 2.2949352860450745, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.23978671431541443, "step": 1872 }, { "epoch": 0.117125, "grad_norm": 2.8125, "grad_norm_var": 0.0585357666015625, "learning_rate": 0.0001, "loss": 7.9191, "loss/crossentropy": 2.1248743534088135, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.24373694509267807, "step": 1874 }, { "epoch": 0.11725, "grad_norm": 5.25, "grad_norm_var": 0.4083984375, "learning_rate": 0.0001, "loss": 8.1591, "loss/crossentropy": 2.4608160257339478, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.2699667811393738, "step": 1876 }, { "epoch": 0.117375, "grad_norm": 3.765625, "grad_norm_var": 0.43925679524739586, "learning_rate": 0.0001, "loss": 8.2212, "loss/crossentropy": 2.192078948020935, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.26026079058647156, "step": 1878 }, { "epoch": 0.1175, "grad_norm": 3.25, "grad_norm_var": 0.4112701416015625, "learning_rate": 0.0001, "loss": 7.974, "loss/crossentropy": 2.079145610332489, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.2560829073190689, "step": 1880 }, { "epoch": 0.117625, "grad_norm": 3.078125, "grad_norm_var": 0.4034657796223958, "learning_rate": 0.0001, "loss": 8.0137, "loss/crossentropy": 2.2801836133003235, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.2451099008321762, "step": 1882 }, { "epoch": 0.11775, "grad_norm": 2.734375, "grad_norm_var": 0.43884175618489585, "learning_rate": 0.0001, "loss": 8.0881, "loss/crossentropy": 2.373893141746521, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.26466645300388336, "step": 1884 }, { "epoch": 0.117875, "grad_norm": 3.0, "grad_norm_var": 0.413330078125, "learning_rate": 0.0001, "loss": 7.9685, "loss/crossentropy": 2.411695957183838, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.2512810304760933, "step": 1886 }, { "epoch": 0.118, "grad_norm": 2.640625, "grad_norm_var": 0.42668355305989586, "learning_rate": 0.0001, "loss": 8.0557, "loss/crossentropy": 2.049705147743225, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.2417662888765335, "step": 1888 }, { "epoch": 0.118125, "grad_norm": 2.890625, "grad_norm_var": 0.44470113118489585, "learning_rate": 0.0001, "loss": 8.0032, "loss/crossentropy": 2.1323426961898804, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.23288051038980484, "step": 1890 }, { "epoch": 0.11825, "grad_norm": 3.203125, "grad_norm_var": 0.1299957275390625, "learning_rate": 0.0001, "loss": 8.1034, "loss/crossentropy": 2.246406674385071, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.26154619455337524, "step": 1892 }, { "epoch": 0.118375, "grad_norm": 3.1875, "grad_norm_var": 0.07519429524739583, "learning_rate": 0.0001, "loss": 8.2607, "loss/crossentropy": 2.3726965188980103, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.2482641637325287, "step": 1894 }, { "epoch": 0.1185, "grad_norm": 2.921875, "grad_norm_var": 0.06081441243489583, "learning_rate": 0.0001, "loss": 8.0314, "loss/crossentropy": 2.2549182176589966, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.2727925777435303, "step": 1896 }, { "epoch": 0.118625, "grad_norm": 2.609375, "grad_norm_var": 0.06313374837239584, "learning_rate": 0.0001, "loss": 8.2286, "loss/crossentropy": 2.471170663833618, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.27910932898521423, "step": 1898 }, { "epoch": 0.11875, "grad_norm": 2.859375, "grad_norm_var": 0.05249735514322917, "learning_rate": 0.0001, "loss": 8.0647, "loss/crossentropy": 2.6244795322418213, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.28073398768901825, "step": 1900 }, { "epoch": 0.118875, "grad_norm": 2.71875, "grad_norm_var": 0.049128214518229164, "learning_rate": 0.0001, "loss": 7.8933, "loss/crossentropy": 2.3852927684783936, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.26651833206415176, "step": 1902 }, { "epoch": 0.119, "grad_norm": 2.703125, "grad_norm_var": 0.035521443684895834, "learning_rate": 0.0001, "loss": 8.0328, "loss/crossentropy": 2.5933183431625366, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.271125927567482, "step": 1904 }, { "epoch": 0.119125, "grad_norm": 3.4375, "grad_norm_var": 0.05452372233072917, "learning_rate": 0.0001, "loss": 8.0956, "loss/crossentropy": 2.2415446043014526, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.258744515478611, "step": 1906 }, { "epoch": 0.11925, "grad_norm": 2.390625, "grad_norm_var": 0.06295166015625, "learning_rate": 0.0001, "loss": 7.882, "loss/crossentropy": 2.1915100812911987, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.26428738236427307, "step": 1908 }, { "epoch": 0.119375, "grad_norm": 2.9375, "grad_norm_var": 0.059300740559895836, "learning_rate": 0.0001, "loss": 7.8918, "loss/crossentropy": 2.024084210395813, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.23683403432369232, "step": 1910 }, { "epoch": 0.1195, "grad_norm": 2.546875, "grad_norm_var": 0.08255208333333333, "learning_rate": 0.0001, "loss": 8.1278, "loss/crossentropy": 2.103874683380127, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.2549164593219757, "step": 1912 }, { "epoch": 0.119625, "grad_norm": 2.859375, "grad_norm_var": 0.0891021728515625, "learning_rate": 0.0001, "loss": 8.0439, "loss/crossentropy": 2.1909857988357544, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.25458941608667374, "step": 1914 }, { "epoch": 0.11975, "grad_norm": 2.734375, "grad_norm_var": 0.0890625, "learning_rate": 0.0001, "loss": 8.0189, "loss/crossentropy": 2.3163031339645386, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.26227420568466187, "step": 1916 }, { "epoch": 0.119875, "grad_norm": 2.984375, "grad_norm_var": 0.0921539306640625, "learning_rate": 0.0001, "loss": 8.1534, "loss/crossentropy": 2.4677486419677734, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.2759769409894943, "step": 1918 }, { "epoch": 0.12, "grad_norm": 2.859375, "grad_norm_var": 0.09339192708333334, "learning_rate": 0.0001, "loss": 8.004, "loss/crossentropy": 2.3451250791549683, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.24791867285966873, "step": 1920 }, { "epoch": 0.120125, "grad_norm": 2.53125, "grad_norm_var": 0.06575113932291667, "learning_rate": 0.0001, "loss": 7.9536, "loss/crossentropy": 2.39498770236969, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.26757124066352844, "step": 1922 }, { "epoch": 0.12025, "grad_norm": 2.734375, "grad_norm_var": 0.059912109375, "learning_rate": 0.0001, "loss": 7.9078, "loss/crossentropy": 2.137160062789917, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.23972496390342712, "step": 1924 }, { "epoch": 0.120375, "grad_norm": 3.203125, "grad_norm_var": 0.06774800618489583, "learning_rate": 0.0001, "loss": 7.9543, "loss/crossentropy": 2.354183554649353, "loss/hidden": 3.2734375, "loss/jsd": 0.0, "loss/logits": 0.2630910202860832, "step": 1926 }, { "epoch": 0.1205, "grad_norm": 2.796875, "grad_norm_var": 0.04582926432291667, "learning_rate": 0.0001, "loss": 7.9081, "loss/crossentropy": 2.0638818740844727, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.2652597352862358, "step": 1928 }, { "epoch": 0.120625, "grad_norm": 3.203125, "grad_norm_var": 0.052534993489583334, "learning_rate": 0.0001, "loss": 8.0497, "loss/crossentropy": 2.588783383369446, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.2830745279788971, "step": 1930 }, { "epoch": 0.12075, "grad_norm": 2.671875, "grad_norm_var": 0.05359700520833333, "learning_rate": 0.0001, "loss": 8.0327, "loss/crossentropy": 2.223568558692932, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.25885971635580063, "step": 1932 }, { "epoch": 0.120875, "grad_norm": 2.78125, "grad_norm_var": 0.049641927083333336, "learning_rate": 0.0001, "loss": 7.9963, "loss/crossentropy": 2.4340078830718994, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.2645603120326996, "step": 1934 }, { "epoch": 0.121, "grad_norm": 2.84375, "grad_norm_var": 0.048046875, "learning_rate": 0.0001, "loss": 7.7813, "loss/crossentropy": 1.9766615629196167, "loss/hidden": 3.0703125, "loss/jsd": 0.0, "loss/logits": 0.22679174691438675, "step": 1936 }, { "epoch": 0.121125, "grad_norm": 2.796875, "grad_norm_var": 0.043782552083333336, "learning_rate": 0.0001, "loss": 8.0526, "loss/crossentropy": 2.007621169090271, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.232595793902874, "step": 1938 }, { "epoch": 0.12125, "grad_norm": 2.875, "grad_norm_var": 0.03530171712239583, "learning_rate": 0.0001, "loss": 8.0147, "loss/crossentropy": 2.245633602142334, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.2521408647298813, "step": 1940 }, { "epoch": 0.121375, "grad_norm": 2.859375, "grad_norm_var": 0.05499674479166667, "learning_rate": 0.0001, "loss": 8.1478, "loss/crossentropy": 2.239235758781433, "loss/hidden": 3.3125, "loss/jsd": 0.0, "loss/logits": 0.29093019664287567, "step": 1942 }, { "epoch": 0.1215, "grad_norm": 3.125, "grad_norm_var": 0.05185139973958333, "learning_rate": 0.0001, "loss": 8.0329, "loss/crossentropy": 2.219251275062561, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.24115260690450668, "step": 1944 }, { "epoch": 0.121625, "grad_norm": 3.171875, "grad_norm_var": 0.079296875, "learning_rate": 0.0001, "loss": 8.2787, "loss/crossentropy": 2.3880057334899902, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.2628382295370102, "step": 1946 }, { "epoch": 0.12175, "grad_norm": 2.640625, "grad_norm_var": 0.07984619140625, "learning_rate": 0.0001, "loss": 8.0593, "loss/crossentropy": 1.9636898040771484, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.22413796931505203, "step": 1948 }, { "epoch": 0.121875, "grad_norm": 3.09375, "grad_norm_var": 0.07724202473958333, "learning_rate": 0.0001, "loss": 8.2408, "loss/crossentropy": 2.4159456491470337, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.2786535918712616, "step": 1950 }, { "epoch": 0.122, "grad_norm": 2.9375, "grad_norm_var": 0.06634114583333334, "learning_rate": 0.0001, "loss": 8.0623, "loss/crossentropy": 2.125056028366089, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.2584911435842514, "step": 1952 }, { "epoch": 0.122125, "grad_norm": 2.8125, "grad_norm_var": 0.07579752604166666, "learning_rate": 0.0001, "loss": 8.2316, "loss/crossentropy": 2.15469229221344, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.24431538581848145, "step": 1954 }, { "epoch": 0.12225, "grad_norm": 2.828125, "grad_norm_var": 0.0720855712890625, "learning_rate": 0.0001, "loss": 7.9661, "loss/crossentropy": 2.124357581138611, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.25534383952617645, "step": 1956 }, { "epoch": 0.122375, "grad_norm": 2.6875, "grad_norm_var": 0.058756510416666664, "learning_rate": 0.0001, "loss": 8.016, "loss/crossentropy": 2.344644784927368, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.26001378893852234, "step": 1958 }, { "epoch": 0.1225, "grad_norm": 2.78125, "grad_norm_var": 0.059130859375, "learning_rate": 0.0001, "loss": 8.0888, "loss/crossentropy": 2.242557406425476, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.25998418033123016, "step": 1960 }, { "epoch": 0.122625, "grad_norm": 2.578125, "grad_norm_var": 0.027339680989583334, "learning_rate": 0.0001, "loss": 7.9998, "loss/crossentropy": 2.1519815921783447, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.24088148772716522, "step": 1962 }, { "epoch": 0.12275, "grad_norm": 3.09375, "grad_norm_var": 0.0291015625, "learning_rate": 0.0001, "loss": 7.9661, "loss/crossentropy": 2.0413911938667297, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.2583937346935272, "step": 1964 }, { "epoch": 0.122875, "grad_norm": 2.5625, "grad_norm_var": 0.04091695149739583, "learning_rate": 0.0001, "loss": 8.0222, "loss/crossentropy": 2.404345154762268, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.23985719680786133, "step": 1966 }, { "epoch": 0.123, "grad_norm": 2.71875, "grad_norm_var": 0.044709269205729166, "learning_rate": 0.0001, "loss": 8.0141, "loss/crossentropy": 2.6169755458831787, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.2697141170501709, "step": 1968 }, { "epoch": 0.123125, "grad_norm": 2.90625, "grad_norm_var": 0.045832316080729164, "learning_rate": 0.0001, "loss": 8.0295, "loss/crossentropy": 2.2693088054656982, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.2512510120868683, "step": 1970 }, { "epoch": 0.12325, "grad_norm": 2.578125, "grad_norm_var": 0.050699869791666664, "learning_rate": 0.0001, "loss": 7.9091, "loss/crossentropy": 2.4330859184265137, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.2577967271208763, "step": 1972 }, { "epoch": 0.123375, "grad_norm": 3.046875, "grad_norm_var": 0.052490234375, "learning_rate": 0.0001, "loss": 8.1396, "loss/crossentropy": 2.560065507888794, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.2873089164495468, "step": 1974 }, { "epoch": 0.1235, "grad_norm": 3.1875, "grad_norm_var": 0.060155232747395836, "learning_rate": 0.0001, "loss": 7.8746, "loss/crossentropy": 2.174700140953064, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.2541915774345398, "step": 1976 }, { "epoch": 0.123625, "grad_norm": 2.765625, "grad_norm_var": 0.05668843587239583, "learning_rate": 0.0001, "loss": 7.947, "loss/crossentropy": 2.164485454559326, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.25544650852680206, "step": 1978 }, { "epoch": 0.12375, "grad_norm": 3.21875, "grad_norm_var": 0.06304423014322917, "learning_rate": 0.0001, "loss": 8.1361, "loss/crossentropy": 2.110231041908264, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.2481146827340126, "step": 1980 }, { "epoch": 0.123875, "grad_norm": 2.78125, "grad_norm_var": 0.047587076822916664, "learning_rate": 0.0001, "loss": 8.0873, "loss/crossentropy": 2.4308364391326904, "loss/hidden": 3.2734375, "loss/jsd": 0.0, "loss/logits": 0.2560970336198807, "step": 1982 }, { "epoch": 0.124, "grad_norm": 2.828125, "grad_norm_var": 0.0412750244140625, "learning_rate": 0.0001, "loss": 8.0323, "loss/crossentropy": 2.481392025947571, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.2720007449388504, "step": 1984 }, { "epoch": 0.124125, "grad_norm": 2.9375, "grad_norm_var": 0.041975911458333334, "learning_rate": 0.0001, "loss": 7.857, "loss/crossentropy": 2.1386367082595825, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.26166096329689026, "step": 1986 }, { "epoch": 0.12425, "grad_norm": 2.6875, "grad_norm_var": 0.04220377604166667, "learning_rate": 0.0001, "loss": 8.0093, "loss/crossentropy": 2.333972215652466, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.25886698067188263, "step": 1988 }, { "epoch": 0.124375, "grad_norm": 3.265625, "grad_norm_var": 0.05032145182291667, "learning_rate": 0.0001, "loss": 8.0834, "loss/crossentropy": 2.41828191280365, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.24262161552906036, "step": 1990 }, { "epoch": 0.1245, "grad_norm": 2.53125, "grad_norm_var": 0.046052042643229166, "learning_rate": 0.0001, "loss": 8.0389, "loss/crossentropy": 2.1740458011627197, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.23383785039186478, "step": 1992 }, { "epoch": 0.124625, "grad_norm": 2.5625, "grad_norm_var": 0.049117024739583334, "learning_rate": 0.0001, "loss": 8.0157, "loss/crossentropy": 2.2331719398498535, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.2640424221754074, "step": 1994 }, { "epoch": 0.12475, "grad_norm": 3.015625, "grad_norm_var": 0.04023335774739583, "learning_rate": 0.0001, "loss": 7.9991, "loss/crossentropy": 2.1523889303207397, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.2687607556581497, "step": 1996 }, { "epoch": 0.124875, "grad_norm": 2.859375, "grad_norm_var": 0.040526326497395834, "learning_rate": 0.0001, "loss": 7.9342, "loss/crossentropy": 2.171301484107971, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.24432216584682465, "step": 1998 }, { "epoch": 0.125, "grad_norm": 2.78125, "grad_norm_var": 0.04273681640625, "learning_rate": 0.0001, "loss": 8.1111, "loss/crossentropy": 2.2224671840667725, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.23374950140714645, "step": 2000 }, { "epoch": 0.125125, "grad_norm": 2.5, "grad_norm_var": 0.04103902180989583, "learning_rate": 0.0001, "loss": 8.0055, "loss/crossentropy": 2.1209537386894226, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.2525208741426468, "step": 2002 }, { "epoch": 0.12525, "grad_norm": 2.5625, "grad_norm_var": 0.04299723307291667, "learning_rate": 0.0001, "loss": 7.8668, "loss/crossentropy": 2.1079421639442444, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.25046999752521515, "step": 2004 }, { "epoch": 0.125375, "grad_norm": 3.078125, "grad_norm_var": 0.03804931640625, "learning_rate": 0.0001, "loss": 7.9843, "loss/crossentropy": 2.3907183408737183, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.2595418617129326, "step": 2006 }, { "epoch": 0.1255, "grad_norm": 2.78125, "grad_norm_var": 0.0336578369140625, "learning_rate": 0.0001, "loss": 7.9723, "loss/crossentropy": 2.081270694732666, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.25873951613903046, "step": 2008 }, { "epoch": 0.125625, "grad_norm": 3.046875, "grad_norm_var": 0.033675130208333334, "learning_rate": 0.0001, "loss": 8.0587, "loss/crossentropy": 2.204562723636627, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.24240562319755554, "step": 2010 }, { "epoch": 0.12575, "grad_norm": 2.6875, "grad_norm_var": 0.030549112955729166, "learning_rate": 0.0001, "loss": 8.0825, "loss/crossentropy": 2.234739661216736, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.23210185766220093, "step": 2012 }, { "epoch": 0.125875, "grad_norm": 2.984375, "grad_norm_var": 0.03386128743489583, "learning_rate": 0.0001, "loss": 7.9408, "loss/crossentropy": 2.155194342136383, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.24713550508022308, "step": 2014 }, { "epoch": 0.126, "grad_norm": 2.921875, "grad_norm_var": 0.03388671875, "learning_rate": 0.0001, "loss": 8.0006, "loss/crossentropy": 2.1440590620040894, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.24631793051958084, "step": 2016 }, { "epoch": 0.126125, "grad_norm": 3.375, "grad_norm_var": 0.06747945149739583, "learning_rate": 0.0001, "loss": 8.2657, "loss/crossentropy": 2.2782651782035828, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.25610214471817017, "step": 2018 }, { "epoch": 0.12625, "grad_norm": 2.703125, "grad_norm_var": 0.0567535400390625, "learning_rate": 0.0001, "loss": 7.9078, "loss/crossentropy": 2.2536474466323853, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.23063694685697556, "step": 2020 }, { "epoch": 0.126375, "grad_norm": 3.890625, "grad_norm_var": 0.1236724853515625, "learning_rate": 0.0001, "loss": 8.0493, "loss/crossentropy": 2.2990732192993164, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.28477251529693604, "step": 2022 }, { "epoch": 0.1265, "grad_norm": 3.15625, "grad_norm_var": 0.1252593994140625, "learning_rate": 0.0001, "loss": 8.0902, "loss/crossentropy": 2.3423362970352173, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.27799367904663086, "step": 2024 }, { "epoch": 0.126625, "grad_norm": 3.359375, "grad_norm_var": 0.7259999593098958, "learning_rate": 0.0001, "loss": 8.3429, "loss/crossentropy": 2.1187247037887573, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.25325731933116913, "step": 2026 }, { "epoch": 0.12675, "grad_norm": 3.328125, "grad_norm_var": 0.6796061197916666, "learning_rate": 0.0001, "loss": 8.2877, "loss/crossentropy": 2.6572694778442383, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.28467129170894623, "step": 2028 }, { "epoch": 0.126875, "grad_norm": 2.953125, "grad_norm_var": 0.6576171875, "learning_rate": 0.0001, "loss": 8.1366, "loss/crossentropy": 2.149785280227661, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.33148565888404846, "step": 2030 }, { "epoch": 0.127, "grad_norm": 2.984375, "grad_norm_var": 0.6441243489583334, "learning_rate": 0.0001, "loss": 8.016, "loss/crossentropy": 1.9550745487213135, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.21983042359352112, "step": 2032 }, { "epoch": 0.127125, "grad_norm": 2.59375, "grad_norm_var": 0.70074462890625, "learning_rate": 0.0001, "loss": 7.9985, "loss/crossentropy": 2.4743508100509644, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.2646654099225998, "step": 2034 }, { "epoch": 0.12725, "grad_norm": 2.953125, "grad_norm_var": 0.679931640625, "learning_rate": 0.0001, "loss": 7.7714, "loss/crossentropy": 2.2703075408935547, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.24541212618350983, "step": 2036 }, { "epoch": 0.127375, "grad_norm": 2.875, "grad_norm_var": 0.66455078125, "learning_rate": 0.0001, "loss": 8.2056, "loss/crossentropy": 2.400794506072998, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.2792155146598816, "step": 2038 }, { "epoch": 0.1275, "grad_norm": 2.703125, "grad_norm_var": 0.7059529622395834, "learning_rate": 0.0001, "loss": 7.9121, "loss/crossentropy": 2.221606969833374, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.27579738199710846, "step": 2040 }, { "epoch": 0.127625, "grad_norm": 2.875, "grad_norm_var": 0.05742085774739583, "learning_rate": 0.0001, "loss": 8.0984, "loss/crossentropy": 2.1297446489334106, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.2457616627216339, "step": 2042 }, { "epoch": 0.12775, "grad_norm": 6.28125, "grad_norm_var": 0.7566802978515625, "learning_rate": 0.0001, "loss": 8.1542, "loss/crossentropy": 2.068065047264099, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.2585330307483673, "step": 2044 }, { "epoch": 0.127875, "grad_norm": 3.515625, "grad_norm_var": 0.7755767822265625, "learning_rate": 0.0001, "loss": 8.2794, "loss/crossentropy": 2.285371780395508, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.26811927556991577, "step": 2046 }, { "epoch": 0.128, "grad_norm": 3.015625, "grad_norm_var": 0.7772420247395834, "learning_rate": 0.0001, "loss": 8.0508, "loss/crossentropy": 2.102661430835724, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.24708286672830582, "step": 2048 }, { "epoch": 0.128125, "grad_norm": 2.78125, "grad_norm_var": 0.7524648030598958, "learning_rate": 0.0001, "loss": 7.87, "loss/crossentropy": 1.9193878173828125, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.24057899415493011, "step": 2050 }, { "epoch": 0.12825, "grad_norm": 2.828125, "grad_norm_var": 0.7676096598307292, "learning_rate": 0.0001, "loss": 7.9655, "loss/crossentropy": 2.2599531412124634, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.2473764270544052, "step": 2052 }, { "epoch": 0.128375, "grad_norm": 3.03125, "grad_norm_var": 0.7643717447916667, "learning_rate": 0.0001, "loss": 8.0443, "loss/crossentropy": 2.1840824484825134, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.21873797476291656, "step": 2054 }, { "epoch": 0.1285, "grad_norm": 2.515625, "grad_norm_var": 0.7554026285807292, "learning_rate": 0.0001, "loss": 7.8934, "loss/crossentropy": 2.5172749757766724, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.2471792846918106, "step": 2056 }, { "epoch": 0.128625, "grad_norm": 2.71875, "grad_norm_var": 0.7780436197916667, "learning_rate": 0.0001, "loss": 7.9608, "loss/crossentropy": 2.5356470346450806, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.27692335844039917, "step": 2058 }, { "epoch": 0.12875, "grad_norm": 3.03125, "grad_norm_var": 0.08212483723958333, "learning_rate": 0.0001, "loss": 8.1766, "loss/crossentropy": 2.3050994873046875, "loss/hidden": 3.2109375, "loss/jsd": 0.0, "loss/logits": 0.27711644768714905, "step": 2060 }, { "epoch": 0.128875, "grad_norm": 2.78125, "grad_norm_var": 0.031245930989583334, "learning_rate": 0.0001, "loss": 8.0858, "loss/crossentropy": 2.037451386451721, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.234269917011261, "step": 2062 }, { "epoch": 0.129, "grad_norm": 2.609375, "grad_norm_var": 0.035456339518229164, "learning_rate": 0.0001, "loss": 7.9019, "loss/crossentropy": 2.333581566810608, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.2544917017221451, "step": 2064 }, { "epoch": 0.129125, "grad_norm": 2.71875, "grad_norm_var": 0.031248982747395834, "learning_rate": 0.0001, "loss": 8.1845, "loss/crossentropy": 2.3705108165740967, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.2595665156841278, "step": 2066 }, { "epoch": 0.12925, "grad_norm": 2.96875, "grad_norm_var": 0.03257548014322917, "learning_rate": 0.0001, "loss": 8.0113, "loss/crossentropy": 2.2181931734085083, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.2753777801990509, "step": 2068 }, { "epoch": 0.129375, "grad_norm": 2.6875, "grad_norm_var": 0.028678385416666667, "learning_rate": 0.0001, "loss": 8.046, "loss/crossentropy": 2.4384394884109497, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.260432630777359, "step": 2070 }, { "epoch": 0.1295, "grad_norm": 3.03125, "grad_norm_var": 0.022932942708333334, "learning_rate": 0.0001, "loss": 8.1072, "loss/crossentropy": 2.2950222492218018, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.2557060271501541, "step": 2072 }, { "epoch": 0.129625, "grad_norm": 2.796875, "grad_norm_var": 0.0248443603515625, "learning_rate": 0.0001, "loss": 7.8764, "loss/crossentropy": 1.9588146209716797, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.2530653849244118, "step": 2074 }, { "epoch": 0.12975, "grad_norm": 2.859375, "grad_norm_var": 0.020759073893229167, "learning_rate": 0.0001, "loss": 8.1442, "loss/crossentropy": 2.174315929412842, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.24201688915491104, "step": 2076 }, { "epoch": 0.129875, "grad_norm": 2.875, "grad_norm_var": 0.026642862955729166, "learning_rate": 0.0001, "loss": 7.8332, "loss/crossentropy": 1.8437206149101257, "loss/hidden": 3.0625, "loss/jsd": 0.0, "loss/logits": 0.2369949370622635, "step": 2078 }, { "epoch": 0.13, "grad_norm": 2.765625, "grad_norm_var": 0.0224761962890625, "learning_rate": 0.0001, "loss": 8.1136, "loss/crossentropy": 2.4198756217956543, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.2665669322013855, "step": 2080 }, { "epoch": 0.130125, "grad_norm": 2.765625, "grad_norm_var": 0.030143229166666667, "learning_rate": 0.0001, "loss": 7.8881, "loss/crossentropy": 2.051876664161682, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.23426489531993866, "step": 2082 }, { "epoch": 0.13025, "grad_norm": 2.59375, "grad_norm_var": 0.031473795572916664, "learning_rate": 0.0001, "loss": 8.0093, "loss/crossentropy": 2.455062747001648, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.2640880271792412, "step": 2084 }, { "epoch": 0.130375, "grad_norm": 2.875, "grad_norm_var": 0.03137613932291667, "learning_rate": 0.0001, "loss": 8.0219, "loss/crossentropy": 2.2999762296676636, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.2465459704399109, "step": 2086 }, { "epoch": 0.1305, "grad_norm": 2.828125, "grad_norm_var": 0.03183186848958333, "learning_rate": 0.0001, "loss": 8.0045, "loss/crossentropy": 2.2871525287628174, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.24762800335884094, "step": 2088 }, { "epoch": 0.130625, "grad_norm": 2.953125, "grad_norm_var": 0.03261617024739583, "learning_rate": 0.0001, "loss": 7.8668, "loss/crossentropy": 2.3214457035064697, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.25224703550338745, "step": 2090 }, { "epoch": 0.13075, "grad_norm": 2.390625, "grad_norm_var": 0.042313639322916666, "learning_rate": 0.0001, "loss": 8.0465, "loss/crossentropy": 2.1685001850128174, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.2591954469680786, "step": 2092 }, { "epoch": 0.130875, "grad_norm": 3.078125, "grad_norm_var": 0.041136678059895834, "learning_rate": 0.0001, "loss": 8.0284, "loss/crossentropy": 2.5781397819519043, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.26295357197523117, "step": 2094 }, { "epoch": 0.131, "grad_norm": 2.5, "grad_norm_var": 0.045466105143229164, "learning_rate": 0.0001, "loss": 7.8661, "loss/crossentropy": 2.1926894187927246, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.23267576098442078, "step": 2096 }, { "epoch": 0.131125, "grad_norm": 3.1875, "grad_norm_var": 0.04641520182291667, "learning_rate": 0.0001, "loss": 7.9553, "loss/crossentropy": 2.1911760568618774, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.25542213022708893, "step": 2098 }, { "epoch": 0.13125, "grad_norm": 2.78125, "grad_norm_var": 0.04755452473958333, "learning_rate": 0.0001, "loss": 7.958, "loss/crossentropy": 2.2775418758392334, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.24572113156318665, "step": 2100 }, { "epoch": 0.131375, "grad_norm": 2.78125, "grad_norm_var": 0.04748942057291667, "learning_rate": 0.0001, "loss": 8.0769, "loss/crossentropy": 2.3823719024658203, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.2616809457540512, "step": 2102 }, { "epoch": 0.1315, "grad_norm": 2.90625, "grad_norm_var": 0.047240193684895834, "learning_rate": 0.0001, "loss": 8.0382, "loss/crossentropy": 2.4556522369384766, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.2946828603744507, "step": 2104 }, { "epoch": 0.131625, "grad_norm": 3.0, "grad_norm_var": 0.05573628743489583, "learning_rate": 0.0001, "loss": 8.0877, "loss/crossentropy": 2.195865511894226, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.26489073038101196, "step": 2106 }, { "epoch": 0.13175, "grad_norm": 2.375, "grad_norm_var": 0.05614827473958333, "learning_rate": 0.0001, "loss": 7.9057, "loss/crossentropy": 2.373032331466675, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.24483656883239746, "step": 2108 }, { "epoch": 0.131875, "grad_norm": 2.765625, "grad_norm_var": 0.055497233072916666, "learning_rate": 0.0001, "loss": 8.0201, "loss/crossentropy": 2.4894858598709106, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.2641270160675049, "step": 2110 }, { "epoch": 0.132, "grad_norm": 2.71875, "grad_norm_var": 0.08896484375, "learning_rate": 0.0001, "loss": 7.9225, "loss/crossentropy": 2.312375068664551, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.24002444744110107, "step": 2112 }, { "epoch": 0.132125, "grad_norm": 2.765625, "grad_norm_var": 0.0932037353515625, "learning_rate": 0.0001, "loss": 7.9214, "loss/crossentropy": 2.000899076461792, "loss/hidden": 3.015625, "loss/jsd": 0.0, "loss/logits": 0.23694107681512833, "step": 2114 }, { "epoch": 0.13225, "grad_norm": 2.9375, "grad_norm_var": 0.09410807291666666, "learning_rate": 0.0001, "loss": 8.0863, "loss/crossentropy": 2.2505098581314087, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.2779449298977852, "step": 2116 }, { "epoch": 0.132375, "grad_norm": 2.90625, "grad_norm_var": 0.0931549072265625, "learning_rate": 0.0001, "loss": 8.2278, "loss/crossentropy": 2.533925771713257, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.27186477184295654, "step": 2118 }, { "epoch": 0.1325, "grad_norm": 3.0625, "grad_norm_var": 0.16379292805989584, "learning_rate": 0.0001, "loss": 7.9833, "loss/crossentropy": 2.2135390043258667, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.2325379028916359, "step": 2120 }, { "epoch": 0.132625, "grad_norm": 3.03125, "grad_norm_var": 0.1632232666015625, "learning_rate": 0.0001, "loss": 8.0453, "loss/crossentropy": 2.428161382675171, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.26223746687173843, "step": 2122 }, { "epoch": 0.13275, "grad_norm": 2.5625, "grad_norm_var": 0.1512603759765625, "learning_rate": 0.0001, "loss": 7.9349, "loss/crossentropy": 2.6289626359939575, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.272312268614769, "step": 2124 }, { "epoch": 0.132875, "grad_norm": 3.015625, "grad_norm_var": 0.15465087890625, "learning_rate": 0.0001, "loss": 8.0484, "loss/crossentropy": 2.081725239753723, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.25036202371120453, "step": 2126 }, { "epoch": 0.133, "grad_norm": 3.109375, "grad_norm_var": 0.12092692057291667, "learning_rate": 0.0001, "loss": 8.1453, "loss/crossentropy": 2.4221293926239014, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.2642197906970978, "step": 2128 }, { "epoch": 0.133125, "grad_norm": 2.609375, "grad_norm_var": 0.11172587076822917, "learning_rate": 0.0001, "loss": 8.0771, "loss/crossentropy": 2.385019063949585, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.2672020420432091, "step": 2130 }, { "epoch": 0.13325, "grad_norm": 3.046875, "grad_norm_var": 0.10976155598958333, "learning_rate": 0.0001, "loss": 7.9391, "loss/crossentropy": 2.356515049934387, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.23601362109184265, "step": 2132 }, { "epoch": 0.133375, "grad_norm": 2.796875, "grad_norm_var": 0.11910400390625, "learning_rate": 0.0001, "loss": 7.8618, "loss/crossentropy": 2.4709160327911377, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.26310209929943085, "step": 2134 }, { "epoch": 0.1335, "grad_norm": 2.625, "grad_norm_var": 0.04101155598958333, "learning_rate": 0.0001, "loss": 7.9999, "loss/crossentropy": 2.4431287050247192, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.25448351353406906, "step": 2136 }, { "epoch": 0.133625, "grad_norm": 2.734375, "grad_norm_var": 0.0377838134765625, "learning_rate": 0.0001, "loss": 8.0923, "loss/crossentropy": 2.361445426940918, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.24513405561447144, "step": 2138 }, { "epoch": 0.13375, "grad_norm": 2.84375, "grad_norm_var": 0.03205464680989583, "learning_rate": 0.0001, "loss": 8.0121, "loss/crossentropy": 2.400641083717346, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.2581202983856201, "step": 2140 }, { "epoch": 0.133875, "grad_norm": 2.390625, "grad_norm_var": 0.0412506103515625, "learning_rate": 0.0001, "loss": 7.8979, "loss/crossentropy": 2.0805707573890686, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.2818114757537842, "step": 2142 }, { "epoch": 0.134, "grad_norm": 3.375, "grad_norm_var": 0.05451558430989583, "learning_rate": 0.0001, "loss": 8.1184, "loss/crossentropy": 2.480680823326111, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.29633618891239166, "step": 2144 }, { "epoch": 0.134125, "grad_norm": 2.625, "grad_norm_var": 0.0557037353515625, "learning_rate": 0.0001, "loss": 7.8102, "loss/crossentropy": 2.002712309360504, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.23709578067064285, "step": 2146 }, { "epoch": 0.13425, "grad_norm": 2.609375, "grad_norm_var": 0.05565999348958333, "learning_rate": 0.0001, "loss": 7.8972, "loss/crossentropy": 2.405007004737854, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.2549060881137848, "step": 2148 }, { "epoch": 0.134375, "grad_norm": 2.6875, "grad_norm_var": 0.05328369140625, "learning_rate": 0.0001, "loss": 7.9409, "loss/crossentropy": 2.137619376182556, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.2625036686658859, "step": 2150 }, { "epoch": 0.1345, "grad_norm": 2.75, "grad_norm_var": 0.04928385416666667, "learning_rate": 0.0001, "loss": 7.983, "loss/crossentropy": 2.3216036558151245, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.26807793229818344, "step": 2152 }, { "epoch": 0.134625, "grad_norm": 3.265625, "grad_norm_var": 0.06470947265625, "learning_rate": 0.0001, "loss": 7.8514, "loss/crossentropy": 2.3814263343811035, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.2562423348426819, "step": 2154 }, { "epoch": 0.13475, "grad_norm": 2.75, "grad_norm_var": 0.07095947265625, "learning_rate": 0.0001, "loss": 8.0623, "loss/crossentropy": 2.3068708181381226, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.2527267262339592, "step": 2156 }, { "epoch": 0.134875, "grad_norm": 2.609375, "grad_norm_var": 0.05712483723958333, "learning_rate": 0.0001, "loss": 8.1027, "loss/crossentropy": 2.242267608642578, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.2507360577583313, "step": 2158 }, { "epoch": 0.135, "grad_norm": 2.671875, "grad_norm_var": 0.04262593587239583, "learning_rate": 0.0001, "loss": 7.7535, "loss/crossentropy": 2.259010672569275, "loss/hidden": 3.2109375, "loss/jsd": 0.0, "loss/logits": 0.2302761897444725, "step": 2160 }, { "epoch": 0.135125, "grad_norm": 2.90625, "grad_norm_var": 0.04046223958333333, "learning_rate": 0.0001, "loss": 7.8616, "loss/crossentropy": 2.195171058177948, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.23362931609153748, "step": 2162 }, { "epoch": 0.13525, "grad_norm": 2.765625, "grad_norm_var": 0.03961588541666667, "learning_rate": 0.0001, "loss": 7.9037, "loss/crossentropy": 2.306097149848938, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.27228541672229767, "step": 2164 }, { "epoch": 0.135375, "grad_norm": 2.75, "grad_norm_var": 0.039937337239583336, "learning_rate": 0.0001, "loss": 7.9223, "loss/crossentropy": 2.354483962059021, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.25157542526721954, "step": 2166 }, { "epoch": 0.1355, "grad_norm": 2.953125, "grad_norm_var": 0.04169514973958333, "learning_rate": 0.0001, "loss": 8.0153, "loss/crossentropy": 2.555723190307617, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.2608166038990021, "step": 2168 }, { "epoch": 0.135625, "grad_norm": 2.59375, "grad_norm_var": 0.026976521809895834, "learning_rate": 0.0001, "loss": 8.0232, "loss/crossentropy": 2.314175248146057, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.2557505890727043, "step": 2170 }, { "epoch": 0.13575, "grad_norm": 2.640625, "grad_norm_var": 0.0195709228515625, "learning_rate": 0.0001, "loss": 7.9036, "loss/crossentropy": 2.252376437187195, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.26094751060009, "step": 2172 }, { "epoch": 0.135875, "grad_norm": 2.578125, "grad_norm_var": 0.020182291666666668, "learning_rate": 0.0001, "loss": 7.7816, "loss/crossentropy": 2.0808385610580444, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.25295551121234894, "step": 2174 }, { "epoch": 0.136, "grad_norm": 2.703125, "grad_norm_var": 0.016145833333333335, "learning_rate": 0.0001, "loss": 7.9248, "loss/crossentropy": 2.2166486978530884, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.2506643235683441, "step": 2176 }, { "epoch": 0.136125, "grad_norm": 2.734375, "grad_norm_var": 0.0152496337890625, "learning_rate": 0.0001, "loss": 7.9413, "loss/crossentropy": 2.4114983081817627, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.24629026651382446, "step": 2178 }, { "epoch": 0.13625, "grad_norm": 2.984375, "grad_norm_var": 0.017508951822916667, "learning_rate": 0.0001, "loss": 7.9527, "loss/crossentropy": 1.9971612095832825, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.23709237575531006, "step": 2180 }, { "epoch": 0.136375, "grad_norm": 2.734375, "grad_norm_var": 0.0198638916015625, "learning_rate": 0.0001, "loss": 8.0183, "loss/crossentropy": 2.598210096359253, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.2603771388530731, "step": 2182 }, { "epoch": 0.1365, "grad_norm": 2.9375, "grad_norm_var": 0.018973795572916667, "learning_rate": 0.0001, "loss": 8.0132, "loss/crossentropy": 2.382105231285095, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.24392583966255188, "step": 2184 }, { "epoch": 0.136625, "grad_norm": 2.59375, "grad_norm_var": 0.022834269205729167, "learning_rate": 0.0001, "loss": 8.047, "loss/crossentropy": 2.4047285318374634, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.27039480209350586, "step": 2186 }, { "epoch": 0.13675, "grad_norm": 2.4375, "grad_norm_var": 0.032938639322916664, "learning_rate": 0.0001, "loss": 7.7846, "loss/crossentropy": 2.0133553743362427, "loss/hidden": 3.0546875, "loss/jsd": 0.0, "loss/logits": 0.242934912443161, "step": 2188 }, { "epoch": 0.136875, "grad_norm": 3.078125, "grad_norm_var": 0.043701171875, "learning_rate": 0.0001, "loss": 8.1042, "loss/crossentropy": 2.4624531269073486, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.26904913038015366, "step": 2190 }, { "epoch": 0.137, "grad_norm": 2.640625, "grad_norm_var": 0.048216756184895834, "learning_rate": 0.0001, "loss": 7.9589, "loss/crossentropy": 2.3441646099090576, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.2756097614765167, "step": 2192 }, { "epoch": 0.137125, "grad_norm": 2.609375, "grad_norm_var": 0.0473297119140625, "learning_rate": 0.0001, "loss": 7.9626, "loss/crossentropy": 2.0180709958076477, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.22330156713724136, "step": 2194 }, { "epoch": 0.13725, "grad_norm": 4.875, "grad_norm_var": 0.3212076822916667, "learning_rate": 0.0001, "loss": 8.1305, "loss/crossentropy": 2.247686982154846, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.257401205599308, "step": 2196 }, { "epoch": 0.137375, "grad_norm": 3.125, "grad_norm_var": 0.3214182535807292, "learning_rate": 0.0001, "loss": 7.8432, "loss/crossentropy": 2.387032985687256, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.2718297243118286, "step": 2198 }, { "epoch": 0.1375, "grad_norm": 2.84375, "grad_norm_var": 0.3174957275390625, "learning_rate": 0.0001, "loss": 7.9656, "loss/crossentropy": 2.1113094091415405, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.2628230005502701, "step": 2200 }, { "epoch": 0.137625, "grad_norm": 2.84375, "grad_norm_var": 0.3123931884765625, "learning_rate": 0.0001, "loss": 8.1272, "loss/crossentropy": 2.681854248046875, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.30167150497436523, "step": 2202 }, { "epoch": 0.13775, "grad_norm": 2.71875, "grad_norm_var": 0.28544514973958335, "learning_rate": 0.0001, "loss": 7.8842, "loss/crossentropy": 2.2539913654327393, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.2667968273162842, "step": 2204 }, { "epoch": 0.137875, "grad_norm": 2.984375, "grad_norm_var": 0.28813374837239586, "learning_rate": 0.0001, "loss": 7.994, "loss/crossentropy": 2.336976170539856, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.2730831503868103, "step": 2206 }, { "epoch": 0.138, "grad_norm": 2.90625, "grad_norm_var": 0.28128153483072915, "learning_rate": 0.0001, "loss": 7.8908, "loss/crossentropy": 2.137080729007721, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.2353602722287178, "step": 2208 }, { "epoch": 0.138125, "grad_norm": 2.5, "grad_norm_var": 0.28933919270833336, "learning_rate": 0.0001, "loss": 7.8058, "loss/crossentropy": 2.0887175798416138, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.2580728679895401, "step": 2210 }, { "epoch": 0.13825, "grad_norm": 2.640625, "grad_norm_var": 0.030269368489583334, "learning_rate": 0.0001, "loss": 8.0849, "loss/crossentropy": 1.7958271503448486, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.24919769912958145, "step": 2212 }, { "epoch": 0.138375, "grad_norm": 2.640625, "grad_norm_var": 0.0386871337890625, "learning_rate": 0.0001, "loss": 7.8756, "loss/crossentropy": 2.135870099067688, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.24682459235191345, "step": 2214 }, { "epoch": 0.1385, "grad_norm": 2.734375, "grad_norm_var": 0.08369038899739584, "learning_rate": 0.0001, "loss": 8.0304, "loss/crossentropy": 2.346623420715332, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.24961213767528534, "step": 2216 }, { "epoch": 0.138625, "grad_norm": 2.703125, "grad_norm_var": 0.07737630208333333, "learning_rate": 0.0001, "loss": 8.0556, "loss/crossentropy": 2.195094585418701, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.24484457075595856, "step": 2218 }, { "epoch": 0.13875, "grad_norm": 2.703125, "grad_norm_var": 0.07613525390625, "learning_rate": 0.0001, "loss": 8.0197, "loss/crossentropy": 2.2862359285354614, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.2567315921187401, "step": 2220 }, { "epoch": 0.138875, "grad_norm": 2.546875, "grad_norm_var": 0.07868550618489584, "learning_rate": 0.0001, "loss": 7.9319, "loss/crossentropy": 2.331244111061096, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.2514599338173866, "step": 2222 }, { "epoch": 0.139, "grad_norm": 3.15625, "grad_norm_var": 0.08726806640625, "learning_rate": 0.0001, "loss": 7.8826, "loss/crossentropy": 1.9546465873718262, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.23689769953489304, "step": 2224 }, { "epoch": 0.139125, "grad_norm": 2.3125, "grad_norm_var": 0.09897359212239583, "learning_rate": 0.0001, "loss": 7.859, "loss/crossentropy": 2.0505433082580566, "loss/hidden": 3.0625, "loss/jsd": 0.0, "loss/logits": 0.24545447528362274, "step": 2226 }, { "epoch": 0.13925, "grad_norm": 2.828125, "grad_norm_var": 0.10256245930989584, "learning_rate": 0.0001, "loss": 7.9768, "loss/crossentropy": 2.3643672466278076, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.2489551082253456, "step": 2228 }, { "epoch": 0.139375, "grad_norm": 2.96875, "grad_norm_var": 0.09179280598958334, "learning_rate": 0.0001, "loss": 7.9637, "loss/crossentropy": 2.4726024866104126, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.2436269074678421, "step": 2230 }, { "epoch": 0.1395, "grad_norm": 2.765625, "grad_norm_var": 0.04537760416666667, "learning_rate": 0.0001, "loss": 8.1119, "loss/crossentropy": 2.409575581550598, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.258441299200058, "step": 2232 }, { "epoch": 0.139625, "grad_norm": 2.546875, "grad_norm_var": 0.04572652180989583, "learning_rate": 0.0001, "loss": 7.8413, "loss/crossentropy": 2.37674617767334, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.26087239384651184, "step": 2234 }, { "epoch": 0.13975, "grad_norm": 2.796875, "grad_norm_var": 0.04651590983072917, "learning_rate": 0.0001, "loss": 7.9557, "loss/crossentropy": 2.270553708076477, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.24740490317344666, "step": 2236 }, { "epoch": 0.139875, "grad_norm": 2.703125, "grad_norm_var": 0.04517822265625, "learning_rate": 0.0001, "loss": 7.8289, "loss/crossentropy": 2.32234787940979, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.2481185868382454, "step": 2238 }, { "epoch": 0.14, "grad_norm": 2.71875, "grad_norm_var": 0.03321024576822917, "learning_rate": 0.0001, "loss": 8.0077, "loss/crossentropy": 2.4945857524871826, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.2944895774126053, "step": 2240 }, { "epoch": 0.140125, "grad_norm": 2.609375, "grad_norm_var": 0.0188385009765625, "learning_rate": 0.0001, "loss": 7.8994, "loss/crossentropy": 2.3556969165802, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.2561969757080078, "step": 2242 }, { "epoch": 0.14025, "grad_norm": 2.8125, "grad_norm_var": 0.013765462239583333, "learning_rate": 0.0001, "loss": 7.9348, "loss/crossentropy": 2.219905376434326, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.273133248090744, "step": 2244 }, { "epoch": 0.140375, "grad_norm": 2.734375, "grad_norm_var": 0.00855712890625, "learning_rate": 0.0001, "loss": 8.0431, "loss/crossentropy": 2.1384077668190002, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.237950399518013, "step": 2246 }, { "epoch": 0.1405, "grad_norm": 2.8125, "grad_norm_var": 0.0127349853515625, "learning_rate": 0.0001, "loss": 7.83, "loss/crossentropy": 2.3398306369781494, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.25574472546577454, "step": 2248 }, { "epoch": 0.140625, "grad_norm": 2.484375, "grad_norm_var": 0.015608723958333333, "learning_rate": 0.0001, "loss": 8.033, "loss/crossentropy": 2.24453866481781, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.23751161247491837, "step": 2250 }, { "epoch": 0.14075, "grad_norm": 2.65625, "grad_norm_var": 0.014388020833333333, "learning_rate": 0.0001, "loss": 7.8561, "loss/crossentropy": 1.9904406070709229, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.24353782087564468, "step": 2252 }, { "epoch": 0.140875, "grad_norm": 2.578125, "grad_norm_var": 0.017252604166666668, "learning_rate": 0.0001, "loss": 7.866, "loss/crossentropy": 2.367901563644409, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.24962928891181946, "step": 2254 }, { "epoch": 0.141, "grad_norm": 2.984375, "grad_norm_var": 0.0227447509765625, "learning_rate": 0.0001, "loss": 8.0945, "loss/crossentropy": 2.3909614086151123, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.2490956410765648, "step": 2256 }, { "epoch": 0.141125, "grad_norm": 2.75, "grad_norm_var": 0.0219635009765625, "learning_rate": 0.0001, "loss": 8.0642, "loss/crossentropy": 2.158316493034363, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.2531931698322296, "step": 2258 }, { "epoch": 0.14125, "grad_norm": 2.671875, "grad_norm_var": 0.0224761962890625, "learning_rate": 0.0001, "loss": 7.8171, "loss/crossentropy": 2.163187623023987, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.24659747630357742, "step": 2260 }, { "epoch": 0.141375, "grad_norm": 2.765625, "grad_norm_var": 0.02261962890625, "learning_rate": 0.0001, "loss": 7.8513, "loss/crossentropy": 2.2378615140914917, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.232786126434803, "step": 2262 }, { "epoch": 0.1415, "grad_norm": 2.859375, "grad_norm_var": 0.0192047119140625, "learning_rate": 0.0001, "loss": 7.9754, "loss/crossentropy": 2.50004506111145, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.24566112458705902, "step": 2264 }, { "epoch": 0.141625, "grad_norm": 2.734375, "grad_norm_var": 0.01441650390625, "learning_rate": 0.0001, "loss": 7.9826, "loss/crossentropy": 2.192861318588257, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.26260973513126373, "step": 2266 }, { "epoch": 0.14175, "grad_norm": 2.6875, "grad_norm_var": 0.0215240478515625, "learning_rate": 0.0001, "loss": 7.8327, "loss/crossentropy": 2.3900744915008545, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.23667296767234802, "step": 2268 }, { "epoch": 0.141875, "grad_norm": 2.671875, "grad_norm_var": 0.0174713134765625, "learning_rate": 0.0001, "loss": 7.9866, "loss/crossentropy": 2.459189772605896, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.2659824937582016, "step": 2270 }, { "epoch": 0.142, "grad_norm": 2.765625, "grad_norm_var": 0.0124176025390625, "learning_rate": 0.0001, "loss": 7.9221, "loss/crossentropy": 2.3119730949401855, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.2727499008178711, "step": 2272 }, { "epoch": 0.142125, "grad_norm": 3.03125, "grad_norm_var": 0.0190093994140625, "learning_rate": 0.0001, "loss": 8.0506, "loss/crossentropy": 2.065304398536682, "loss/hidden": 3.0078125, "loss/jsd": 0.0, "loss/logits": 0.2462991625070572, "step": 2274 }, { "epoch": 0.14225, "grad_norm": 2.484375, "grad_norm_var": 0.022021484375, "learning_rate": 0.0001, "loss": 7.8896, "loss/crossentropy": 2.293634057044983, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.23517119884490967, "step": 2276 }, { "epoch": 0.142375, "grad_norm": 2.640625, "grad_norm_var": 0.0225250244140625, "learning_rate": 0.0001, "loss": 7.9233, "loss/crossentropy": 2.193318486213684, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.24228639900684357, "step": 2278 }, { "epoch": 0.1425, "grad_norm": 2.90625, "grad_norm_var": 0.0252593994140625, "learning_rate": 0.0001, "loss": 7.9282, "loss/crossentropy": 2.3415223360061646, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.25495699793100357, "step": 2280 }, { "epoch": 0.142625, "grad_norm": 2.546875, "grad_norm_var": 0.027318318684895832, "learning_rate": 0.0001, "loss": 7.8591, "loss/crossentropy": 1.9665740132331848, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.2337390035390854, "step": 2282 }, { "epoch": 0.14275, "grad_norm": 2.828125, "grad_norm_var": 0.024193318684895833, "learning_rate": 0.0001, "loss": 7.7983, "loss/crossentropy": 2.0722063779830933, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.23800316452980042, "step": 2284 }, { "epoch": 0.142875, "grad_norm": 2.59375, "grad_norm_var": 0.025093587239583333, "learning_rate": 0.0001, "loss": 7.9698, "loss/crossentropy": 2.3465652465820312, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.2537553757429123, "step": 2286 }, { "epoch": 0.143, "grad_norm": 3.375, "grad_norm_var": 0.057938639322916666, "learning_rate": 0.0001, "loss": 7.8197, "loss/crossentropy": 2.4646483659744263, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.2531234845519066, "step": 2288 }, { "epoch": 0.143125, "grad_norm": 2.359375, "grad_norm_var": 0.06323140462239583, "learning_rate": 0.0001, "loss": 7.8108, "loss/crossentropy": 2.255491614341736, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.24858752638101578, "step": 2290 }, { "epoch": 0.14325, "grad_norm": 2.953125, "grad_norm_var": 0.06523030598958333, "learning_rate": 0.0001, "loss": 7.9138, "loss/crossentropy": 2.0624001026153564, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.23998911678791046, "step": 2292 }, { "epoch": 0.143375, "grad_norm": 2.609375, "grad_norm_var": 0.06442769368489583, "learning_rate": 0.0001, "loss": 7.9531, "loss/crossentropy": 2.4339091777801514, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.2569551467895508, "step": 2294 }, { "epoch": 0.1435, "grad_norm": 2.59375, "grad_norm_var": 0.06369527180989583, "learning_rate": 0.0001, "loss": 7.9041, "loss/crossentropy": 2.0448151230812073, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.2471894770860672, "step": 2296 }, { "epoch": 0.143625, "grad_norm": 2.578125, "grad_norm_var": 0.0640625, "learning_rate": 0.0001, "loss": 8.0666, "loss/crossentropy": 2.415607452392578, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.2683194726705551, "step": 2298 }, { "epoch": 0.14375, "grad_norm": 2.8125, "grad_norm_var": 0.061930338541666664, "learning_rate": 0.0001, "loss": 7.9572, "loss/crossentropy": 2.294751286506653, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.2594631314277649, "step": 2300 }, { "epoch": 0.143875, "grad_norm": 2.8125, "grad_norm_var": 0.06265869140625, "learning_rate": 0.0001, "loss": 7.7507, "loss/crossentropy": 2.219098746776581, "loss/hidden": 3.0390625, "loss/jsd": 0.0, "loss/logits": 0.2452593445777893, "step": 2302 }, { "epoch": 0.144, "grad_norm": 2.6875, "grad_norm_var": 0.024833170572916667, "learning_rate": 0.0001, "loss": 7.7452, "loss/crossentropy": 2.273571014404297, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.23514091968536377, "step": 2304 }, { "epoch": 0.144125, "grad_norm": 2.8125, "grad_norm_var": 0.028706868489583332, "learning_rate": 0.0001, "loss": 8.092, "loss/crossentropy": 2.470995545387268, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.2565620690584183, "step": 2306 }, { "epoch": 0.14425, "grad_norm": 2.671875, "grad_norm_var": 0.022508748372395835, "learning_rate": 0.0001, "loss": 8.0169, "loss/crossentropy": 2.4442650079727173, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.2491839900612831, "step": 2308 }, { "epoch": 0.144375, "grad_norm": 2.515625, "grad_norm_var": 0.024120076497395834, "learning_rate": 0.0001, "loss": 7.8243, "loss/crossentropy": 2.3042315244674683, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.24753264337778091, "step": 2310 }, { "epoch": 0.1445, "grad_norm": 2.609375, "grad_norm_var": 0.022835286458333333, "learning_rate": 0.0001, "loss": 7.8473, "loss/crossentropy": 2.247647523880005, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.242111474275589, "step": 2312 }, { "epoch": 0.144625, "grad_norm": 2.765625, "grad_norm_var": 0.021142578125, "learning_rate": 0.0001, "loss": 7.8245, "loss/crossentropy": 2.0948009490966797, "loss/hidden": 3.0546875, "loss/jsd": 0.0, "loss/logits": 0.22648434340953827, "step": 2314 }, { "epoch": 0.14475, "grad_norm": 2.578125, "grad_norm_var": 0.024540201822916666, "learning_rate": 0.0001, "loss": 7.9081, "loss/crossentropy": 2.3449655771255493, "loss/hidden": 3.015625, "loss/jsd": 0.0, "loss/logits": 0.25433051586151123, "step": 2316 }, { "epoch": 0.144875, "grad_norm": 2.40625, "grad_norm_var": 0.03191731770833333, "learning_rate": 0.0001, "loss": 7.8692, "loss/crossentropy": 2.1519018411636353, "loss/hidden": 2.9921875, "loss/jsd": 0.0, "loss/logits": 0.23911522328853607, "step": 2318 }, { "epoch": 0.145, "grad_norm": 3.046875, "grad_norm_var": 0.03612874348958333, "learning_rate": 0.0001, "loss": 8.0635, "loss/crossentropy": 2.0561267137527466, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.23246531933546066, "step": 2320 }, { "epoch": 0.145125, "grad_norm": 2.578125, "grad_norm_var": 0.031248982747395834, "learning_rate": 0.0001, "loss": 8.0962, "loss/crossentropy": 2.413579821586609, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.2741449773311615, "step": 2322 }, { "epoch": 0.14525, "grad_norm": 3.0, "grad_norm_var": 0.03997395833333333, "learning_rate": 0.0001, "loss": 7.8067, "loss/crossentropy": 2.215983271598816, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.24622830748558044, "step": 2324 }, { "epoch": 0.145375, "grad_norm": 2.484375, "grad_norm_var": 0.04117431640625, "learning_rate": 0.0001, "loss": 7.9424, "loss/crossentropy": 2.2071104049682617, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.24660057574510574, "step": 2326 }, { "epoch": 0.1455, "grad_norm": 2.890625, "grad_norm_var": 0.0425689697265625, "learning_rate": 0.0001, "loss": 8.0007, "loss/crossentropy": 2.1792843341827393, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.2711133062839508, "step": 2328 }, { "epoch": 0.145625, "grad_norm": 2.609375, "grad_norm_var": 0.0416015625, "learning_rate": 0.0001, "loss": 7.9595, "loss/crossentropy": 2.256834030151367, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.26283788681030273, "step": 2330 }, { "epoch": 0.14575, "grad_norm": 2.390625, "grad_norm_var": 0.04439697265625, "learning_rate": 0.0001, "loss": 7.7117, "loss/crossentropy": 1.91128808259964, "loss/hidden": 3.0546875, "loss/jsd": 0.0, "loss/logits": 0.23225348442792892, "step": 2332 }, { "epoch": 0.145875, "grad_norm": 2.78125, "grad_norm_var": 0.03492431640625, "learning_rate": 0.0001, "loss": 7.9979, "loss/crossentropy": 2.1611289978027344, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.2694767862558365, "step": 2334 }, { "epoch": 0.146, "grad_norm": 2.796875, "grad_norm_var": 0.029423014322916666, "learning_rate": 0.0001, "loss": 7.9771, "loss/crossentropy": 2.3651944398880005, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.24654380977153778, "step": 2336 }, { "epoch": 0.146125, "grad_norm": 2.78125, "grad_norm_var": 0.03247782389322917, "learning_rate": 0.0001, "loss": 8.0897, "loss/crossentropy": 2.365579605102539, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.26178716123104095, "step": 2338 }, { "epoch": 0.14625, "grad_norm": 2.875, "grad_norm_var": 0.025487263997395832, "learning_rate": 0.0001, "loss": 7.8216, "loss/crossentropy": 2.195146918296814, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.2360374853014946, "step": 2340 }, { "epoch": 0.146375, "grad_norm": 2.484375, "grad_norm_var": 0.02750244140625, "learning_rate": 0.0001, "loss": 8.0719, "loss/crossentropy": 2.6680378913879395, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.2590227723121643, "step": 2342 }, { "epoch": 0.1465, "grad_norm": 2.765625, "grad_norm_var": 0.025227864583333332, "learning_rate": 0.0001, "loss": 7.9196, "loss/crossentropy": 2.307919979095459, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.2748124748468399, "step": 2344 }, { "epoch": 0.146625, "grad_norm": 2.609375, "grad_norm_var": 0.025581868489583333, "learning_rate": 0.0001, "loss": 8.0198, "loss/crossentropy": 2.478832721710205, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.25123097002506256, "step": 2346 }, { "epoch": 0.14675, "grad_norm": 3.109375, "grad_norm_var": 0.025715128580729166, "learning_rate": 0.0001, "loss": 7.998, "loss/crossentropy": 2.1463791131973267, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.2501315772533417, "step": 2348 }, { "epoch": 0.146875, "grad_norm": 2.375, "grad_norm_var": 0.0401519775390625, "learning_rate": 0.0001, "loss": 7.9402, "loss/crossentropy": 2.287923812866211, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.2529391869902611, "step": 2350 }, { "epoch": 0.147, "grad_norm": 2.84375, "grad_norm_var": 0.04274800618489583, "learning_rate": 0.0001, "loss": 7.8521, "loss/crossentropy": 2.393770456314087, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.23246632516384125, "step": 2352 }, { "epoch": 0.147125, "grad_norm": 2.828125, "grad_norm_var": 0.03935445149739583, "learning_rate": 0.0001, "loss": 7.9868, "loss/crossentropy": 1.9886181354522705, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.2457761988043785, "step": 2354 }, { "epoch": 0.14725, "grad_norm": 2.734375, "grad_norm_var": 0.0428375244140625, "learning_rate": 0.0001, "loss": 7.9631, "loss/crossentropy": 2.1264249682426453, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.261639803647995, "step": 2356 }, { "epoch": 0.147375, "grad_norm": 2.875, "grad_norm_var": 0.0380767822265625, "learning_rate": 0.0001, "loss": 7.8278, "loss/crossentropy": 2.0697389245033264, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.22177913784980774, "step": 2358 }, { "epoch": 0.1475, "grad_norm": 2.859375, "grad_norm_var": 0.0396484375, "learning_rate": 0.0001, "loss": 7.9523, "loss/crossentropy": 2.355503797531128, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.257536381483078, "step": 2360 }, { "epoch": 0.147625, "grad_norm": 2.484375, "grad_norm_var": 0.0458984375, "learning_rate": 0.0001, "loss": 7.7195, "loss/crossentropy": 2.263971447944641, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.2459626868367195, "step": 2362 }, { "epoch": 0.14775, "grad_norm": 2.578125, "grad_norm_var": 0.034886678059895836, "learning_rate": 0.0001, "loss": 7.8342, "loss/crossentropy": 2.3069592714309692, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.25156907737255096, "step": 2364 }, { "epoch": 0.147875, "grad_norm": 2.890625, "grad_norm_var": 0.025902303059895833, "learning_rate": 0.0001, "loss": 7.8323, "loss/crossentropy": 2.343958616256714, "loss/hidden": 3.0390625, "loss/jsd": 0.0, "loss/logits": 0.23790227621793747, "step": 2366 }, { "epoch": 0.148, "grad_norm": 2.6875, "grad_norm_var": 0.02525634765625, "learning_rate": 0.0001, "loss": 8.0088, "loss/crossentropy": 2.1950390338897705, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.25015532970428467, "step": 2368 }, { "epoch": 0.148125, "grad_norm": 2.40625, "grad_norm_var": 0.0327789306640625, "learning_rate": 0.0001, "loss": 7.7179, "loss/crossentropy": 2.18330717086792, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.23950626701116562, "step": 2370 }, { "epoch": 0.14825, "grad_norm": 3.109375, "grad_norm_var": 0.04156494140625, "learning_rate": 0.0001, "loss": 7.89, "loss/crossentropy": 2.359447479248047, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.25177963823080063, "step": 2372 }, { "epoch": 0.148375, "grad_norm": 2.921875, "grad_norm_var": 0.048075358072916664, "learning_rate": 0.0001, "loss": 7.8616, "loss/crossentropy": 2.1051629185676575, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.23466359078884125, "step": 2374 }, { "epoch": 0.1485, "grad_norm": 2.921875, "grad_norm_var": 0.05561421712239583, "learning_rate": 0.0001, "loss": 7.8136, "loss/crossentropy": 2.4187822341918945, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.24878371506929398, "step": 2376 }, { "epoch": 0.148625, "grad_norm": 2.703125, "grad_norm_var": 0.05022786458333333, "learning_rate": 0.0001, "loss": 7.8544, "loss/crossentropy": 2.3044979572296143, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.23919613659381866, "step": 2378 }, { "epoch": 0.14875, "grad_norm": 2.515625, "grad_norm_var": 0.05283203125, "learning_rate": 0.0001, "loss": 7.9553, "loss/crossentropy": 2.334774613380432, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.26303067803382874, "step": 2380 }, { "epoch": 0.148875, "grad_norm": 2.8125, "grad_norm_var": 0.051268513997395834, "learning_rate": 0.0001, "loss": 7.8011, "loss/crossentropy": 2.1512030363082886, "loss/hidden": 3.0390625, "loss/jsd": 0.0, "loss/logits": 0.2616318315267563, "step": 2382 }, { "epoch": 0.149, "grad_norm": 2.546875, "grad_norm_var": 0.04903971354166667, "learning_rate": 0.0001, "loss": 7.9319, "loss/crossentropy": 2.1831018924713135, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.24666057527065277, "step": 2384 }, { "epoch": 0.149125, "grad_norm": 3.125, "grad_norm_var": 0.05806884765625, "learning_rate": 0.0001, "loss": 8.0404, "loss/crossentropy": 2.278907299041748, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.24930745363235474, "step": 2386 }, { "epoch": 0.14925, "grad_norm": 2.46875, "grad_norm_var": 0.04917704264322917, "learning_rate": 0.0001, "loss": 7.8234, "loss/crossentropy": 2.117949962615967, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.2466331273317337, "step": 2388 }, { "epoch": 0.149375, "grad_norm": 3.21875, "grad_norm_var": 0.05543212890625, "learning_rate": 0.0001, "loss": 7.9321, "loss/crossentropy": 2.4539263248443604, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.2619713842868805, "step": 2390 }, { "epoch": 0.1495, "grad_norm": 2.703125, "grad_norm_var": 0.0503082275390625, "learning_rate": 0.0001, "loss": 8.0366, "loss/crossentropy": 2.155607759952545, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.2557392567396164, "step": 2392 }, { "epoch": 0.149625, "grad_norm": 2.828125, "grad_norm_var": 0.0584625244140625, "learning_rate": 0.0001, "loss": 7.8701, "loss/crossentropy": 2.226928472518921, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.227652445435524, "step": 2394 }, { "epoch": 0.14975, "grad_norm": 2.53125, "grad_norm_var": 0.05657552083333333, "learning_rate": 0.0001, "loss": 7.9392, "loss/crossentropy": 2.152814030647278, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.22992898523807526, "step": 2396 }, { "epoch": 0.149875, "grad_norm": 2.546875, "grad_norm_var": 0.061258951822916664, "learning_rate": 0.0001, "loss": 7.877, "loss/crossentropy": 2.0306188464164734, "loss/hidden": 3.0703125, "loss/jsd": 0.0, "loss/logits": 0.2434871345758438, "step": 2398 }, { "epoch": 0.15, "grad_norm": 2.90625, "grad_norm_var": 0.057112630208333334, "learning_rate": 0.0001, "loss": 7.9455, "loss/crossentropy": 2.3572858572006226, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.24068891257047653, "step": 2400 }, { "epoch": 0.150125, "grad_norm": 2.3125, "grad_norm_var": 0.057112630208333334, "learning_rate": 0.0001, "loss": 7.8958, "loss/crossentropy": 2.4808624982833862, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.2544522359967232, "step": 2402 }, { "epoch": 0.15025, "grad_norm": 2.765625, "grad_norm_var": 0.055052693684895834, "learning_rate": 0.0001, "loss": 7.8099, "loss/crossentropy": 2.0727401971817017, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.25555629283189774, "step": 2404 }, { "epoch": 0.150375, "grad_norm": 2.921875, "grad_norm_var": 0.04289957682291667, "learning_rate": 0.0001, "loss": 7.9492, "loss/crossentropy": 2.3265267610549927, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.2484055981040001, "step": 2406 }, { "epoch": 0.1505, "grad_norm": 2.96875, "grad_norm_var": 0.05078837076822917, "learning_rate": 0.0001, "loss": 7.9314, "loss/crossentropy": 2.455584764480591, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.2524856925010681, "step": 2408 }, { "epoch": 0.150625, "grad_norm": 2.59375, "grad_norm_var": 0.044694010416666666, "learning_rate": 0.0001, "loss": 7.8646, "loss/crossentropy": 2.425659656524658, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.2739051878452301, "step": 2410 }, { "epoch": 0.15075, "grad_norm": 2.5625, "grad_norm_var": 0.042967732747395834, "learning_rate": 0.0001, "loss": 7.9698, "loss/crossentropy": 2.306099534034729, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.24583810567855835, "step": 2412 }, { "epoch": 0.150875, "grad_norm": 2.734375, "grad_norm_var": 0.04340718587239583, "learning_rate": 0.0001, "loss": 7.7465, "loss/crossentropy": 2.4911707639694214, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.24410546571016312, "step": 2414 }, { "epoch": 0.151, "grad_norm": 2.75, "grad_norm_var": 0.04588114420572917, "learning_rate": 0.0001, "loss": 7.8045, "loss/crossentropy": 2.461613178253174, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.2660996913909912, "step": 2416 }, { "epoch": 0.151125, "grad_norm": 2.328125, "grad_norm_var": 0.046708170572916666, "learning_rate": 0.0001, "loss": 7.7771, "loss/crossentropy": 2.1200402975082397, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.23757921904325485, "step": 2418 }, { "epoch": 0.15125, "grad_norm": 2.625, "grad_norm_var": 0.048029581705729164, "learning_rate": 0.0001, "loss": 7.8634, "loss/crossentropy": 2.0594701766967773, "loss/hidden": 2.953125, "loss/jsd": 0.0, "loss/logits": 0.2412218227982521, "step": 2420 }, { "epoch": 0.151375, "grad_norm": 3.390625, "grad_norm_var": 0.09226786295572917, "learning_rate": 0.0001, "loss": 7.9832, "loss/crossentropy": 2.171906590461731, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.24810399115085602, "step": 2422 }, { "epoch": 0.1515, "grad_norm": 2.328125, "grad_norm_var": 0.09123942057291666, "learning_rate": 0.0001, "loss": 7.8273, "loss/crossentropy": 2.5124112367630005, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.2484402135014534, "step": 2424 }, { "epoch": 0.151625, "grad_norm": 2.703125, "grad_norm_var": 0.0913726806640625, "learning_rate": 0.0001, "loss": 7.782, "loss/crossentropy": 2.3001959323883057, "loss/hidden": 3.0703125, "loss/jsd": 0.0, "loss/logits": 0.24032824486494064, "step": 2426 }, { "epoch": 0.15175, "grad_norm": 2.546875, "grad_norm_var": 0.0932525634765625, "learning_rate": 0.0001, "loss": 7.7919, "loss/crossentropy": 2.170712888240814, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.26182495057582855, "step": 2428 }, { "epoch": 0.151875, "grad_norm": 2.84375, "grad_norm_var": 0.093359375, "learning_rate": 0.0001, "loss": 7.9205, "loss/crossentropy": 2.2728850841522217, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.2512703761458397, "step": 2430 }, { "epoch": 0.152, "grad_norm": 2.46875, "grad_norm_var": 0.0877838134765625, "learning_rate": 0.0001, "loss": 7.7564, "loss/crossentropy": 2.2336456775665283, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.24472886323928833, "step": 2432 }, { "epoch": 0.152125, "grad_norm": 2.75, "grad_norm_var": 0.07998758951822917, "learning_rate": 0.0001, "loss": 7.792, "loss/crossentropy": 2.0555617809295654, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.21662656217813492, "step": 2434 }, { "epoch": 0.15225, "grad_norm": 2.6875, "grad_norm_var": 0.07681376139322917, "learning_rate": 0.0001, "loss": 7.852, "loss/crossentropy": 2.1744298934936523, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.25150124728679657, "step": 2436 }, { "epoch": 0.152375, "grad_norm": 2.78125, "grad_norm_var": 0.018342081705729166, "learning_rate": 0.0001, "loss": 7.7934, "loss/crossentropy": 2.0990302562713623, "loss/hidden": 3.03125, "loss/jsd": 0.0, "loss/logits": 0.22628428786993027, "step": 2438 }, { "epoch": 0.1525, "grad_norm": 2.671875, "grad_norm_var": 0.011844889322916666, "learning_rate": 0.0001, "loss": 7.8081, "loss/crossentropy": 2.2353204488754272, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.26493804156780243, "step": 2440 }, { "epoch": 0.152625, "grad_norm": 2.953125, "grad_norm_var": 0.017704264322916666, "learning_rate": 0.0001, "loss": 8.0923, "loss/crossentropy": 2.281570076942444, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.2562706768512726, "step": 2442 }, { "epoch": 0.15275, "grad_norm": 2.34375, "grad_norm_var": 0.0250396728515625, "learning_rate": 0.0001, "loss": 7.773, "loss/crossentropy": 2.251350522041321, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.2373836562037468, "step": 2444 }, { "epoch": 0.152875, "grad_norm": 3.34375, "grad_norm_var": 0.0528472900390625, "learning_rate": 0.0001, "loss": 7.976, "loss/crossentropy": 2.2960145473480225, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.25969095528125763, "step": 2446 }, { "epoch": 0.153, "grad_norm": 2.28125, "grad_norm_var": 0.05984700520833333, "learning_rate": 0.0001, "loss": 7.8294, "loss/crossentropy": 2.28712797164917, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.2389351725578308, "step": 2448 }, { "epoch": 0.153125, "grad_norm": 2.890625, "grad_norm_var": 0.06357421875, "learning_rate": 0.0001, "loss": 7.9498, "loss/crossentropy": 2.170054316520691, "loss/hidden": 3.046875, "loss/jsd": 0.0, "loss/logits": 0.24584101140499115, "step": 2450 }, { "epoch": 0.15325, "grad_norm": 2.78125, "grad_norm_var": 0.07652587890625, "learning_rate": 0.0001, "loss": 8.0025, "loss/crossentropy": 2.2106114625930786, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.24998192489147186, "step": 2452 }, { "epoch": 0.153375, "grad_norm": 2.4375, "grad_norm_var": 0.07939046223958333, "learning_rate": 0.0001, "loss": 7.7971, "loss/crossentropy": 2.2459070682525635, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.24219267070293427, "step": 2454 }, { "epoch": 0.1535, "grad_norm": 2.59375, "grad_norm_var": 0.08028971354166667, "learning_rate": 0.0001, "loss": 7.8238, "loss/crossentropy": 2.3142576217651367, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.25120753794908524, "step": 2456 }, { "epoch": 0.153625, "grad_norm": 2.640625, "grad_norm_var": 0.0760406494140625, "learning_rate": 0.0001, "loss": 7.9069, "loss/crossentropy": 2.390681028366089, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.2660007178783417, "step": 2458 }, { "epoch": 0.15375, "grad_norm": 2.6875, "grad_norm_var": 0.07095947265625, "learning_rate": 0.0001, "loss": 7.9599, "loss/crossentropy": 2.3622519969940186, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.2376810610294342, "step": 2460 }, { "epoch": 0.153875, "grad_norm": 2.765625, "grad_norm_var": 0.05569254557291667, "learning_rate": 0.0001, "loss": 7.7633, "loss/crossentropy": 2.587849259376526, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.2587762326002121, "step": 2462 }, { "epoch": 0.154, "grad_norm": 2.5625, "grad_norm_var": 0.04690348307291667, "learning_rate": 0.0001, "loss": 8.0367, "loss/crossentropy": 2.0119821429252625, "loss/hidden": 3.2109375, "loss/jsd": 0.0, "loss/logits": 0.23865149170160294, "step": 2464 }, { "epoch": 0.154125, "grad_norm": 2.578125, "grad_norm_var": 0.04401041666666667, "learning_rate": 0.0001, "loss": 7.9178, "loss/crossentropy": 2.278993248939514, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.2538345381617546, "step": 2466 }, { "epoch": 0.15425, "grad_norm": 2.578125, "grad_norm_var": 0.029352823893229168, "learning_rate": 0.0001, "loss": 7.8351, "loss/crossentropy": 2.2865071296691895, "loss/hidden": 3.0546875, "loss/jsd": 0.0, "loss/logits": 0.24299630522727966, "step": 2468 }, { "epoch": 0.154375, "grad_norm": 2.796875, "grad_norm_var": 0.027497355143229166, "learning_rate": 0.0001, "loss": 7.7031, "loss/crossentropy": 2.118988275527954, "loss/hidden": 3.0625, "loss/jsd": 0.0, "loss/logits": 0.23056157678365707, "step": 2470 }, { "epoch": 0.1545, "grad_norm": 2.25, "grad_norm_var": 0.0398101806640625, "learning_rate": 0.0001, "loss": 7.7647, "loss/crossentropy": 2.122451901435852, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.2540634050965309, "step": 2472 }, { "epoch": 0.154625, "grad_norm": 2.875, "grad_norm_var": 0.041845703125, "learning_rate": 0.0001, "loss": 7.8108, "loss/crossentropy": 2.2460381984710693, "loss/hidden": 3.0, "loss/jsd": 0.0, "loss/logits": 0.2485114336013794, "step": 2474 }, { "epoch": 0.15475, "grad_norm": 2.828125, "grad_norm_var": 0.0369049072265625, "learning_rate": 0.0001, "loss": 7.8238, "loss/crossentropy": 2.2321892976760864, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.24692068994045258, "step": 2476 }, { "epoch": 0.154875, "grad_norm": 2.671875, "grad_norm_var": 0.0273834228515625, "learning_rate": 0.0001, "loss": 7.8107, "loss/crossentropy": 2.47454035282135, "loss/hidden": 3.0703125, "loss/jsd": 0.0, "loss/logits": 0.24419991672039032, "step": 2478 }, { "epoch": 0.155, "grad_norm": 2.703125, "grad_norm_var": 0.026399739583333335, "learning_rate": 0.0001, "loss": 7.782, "loss/crossentropy": 2.1783688068389893, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.24744782596826553, "step": 2480 }, { "epoch": 0.155125, "grad_norm": 2.671875, "grad_norm_var": 0.029524739583333334, "learning_rate": 0.0001, "loss": 7.9437, "loss/crossentropy": 2.553811550140381, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.2530565932393074, "step": 2482 }, { "epoch": 0.15525, "grad_norm": 2.53125, "grad_norm_var": 0.04010009765625, "learning_rate": 0.0001, "loss": 7.8713, "loss/crossentropy": 2.441239356994629, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.2551605701446533, "step": 2484 }, { "epoch": 0.155375, "grad_norm": 2.5625, "grad_norm_var": 0.038309733072916664, "learning_rate": 0.0001, "loss": 7.7061, "loss/crossentropy": 2.370589256286621, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.25826428830623627, "step": 2486 }, { "epoch": 0.1555, "grad_norm": 2.640625, "grad_norm_var": 0.026390584309895833, "learning_rate": 0.0001, "loss": 7.7961, "loss/crossentropy": 2.1814417839050293, "loss/hidden": 3.046875, "loss/jsd": 0.0, "loss/logits": 0.2495630532503128, "step": 2488 }, { "epoch": 0.155625, "grad_norm": 2.890625, "grad_norm_var": 0.029173787434895834, "learning_rate": 0.0001, "loss": 7.8197, "loss/crossentropy": 2.4883482456207275, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.26348088681697845, "step": 2490 }, { "epoch": 0.15575, "grad_norm": 2.578125, "grad_norm_var": 0.027632649739583334, "learning_rate": 0.0001, "loss": 7.8219, "loss/crossentropy": 2.1873401403427124, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.24557159841060638, "step": 2492 }, { "epoch": 0.155875, "grad_norm": 2.75, "grad_norm_var": 0.02867431640625, "learning_rate": 0.0001, "loss": 7.8562, "loss/crossentropy": 2.3110402822494507, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.24427203088998795, "step": 2494 }, { "epoch": 0.156, "grad_norm": 2.953125, "grad_norm_var": 0.03600260416666667, "learning_rate": 0.0001, "loss": 7.8415, "loss/crossentropy": 2.2570624351501465, "loss/hidden": 3.0625, "loss/jsd": 0.0, "loss/logits": 0.24605220556259155, "step": 2496 }, { "epoch": 0.156125, "grad_norm": 2.3125, "grad_norm_var": 0.04087626139322917, "learning_rate": 0.0001, "loss": 7.6643, "loss/crossentropy": 2.2185534238815308, "loss/hidden": 2.96875, "loss/jsd": 0.0, "loss/logits": 0.2364451214671135, "step": 2498 }, { "epoch": 0.15625, "grad_norm": 3.078125, "grad_norm_var": 0.039948527018229166, "learning_rate": 0.0001, "loss": 7.8161, "loss/crossentropy": 2.2274473905563354, "loss/hidden": 3.0234375, "loss/jsd": 0.0, "loss/logits": 0.24973751604557037, "step": 2500 }, { "epoch": 0.156375, "grad_norm": 2.46875, "grad_norm_var": 0.042292277018229164, "learning_rate": 0.0001, "loss": 8.0038, "loss/crossentropy": 2.3013203144073486, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.24737702310085297, "step": 2502 }, { "epoch": 0.1565, "grad_norm": 2.484375, "grad_norm_var": 0.0455078125, "learning_rate": 0.0001, "loss": 7.7928, "loss/crossentropy": 2.5051584243774414, "loss/hidden": 3.2109375, "loss/jsd": 0.0, "loss/logits": 0.25412553548812866, "step": 2504 }, { "epoch": 0.156625, "grad_norm": 2.890625, "grad_norm_var": 0.0430084228515625, "learning_rate": 0.0001, "loss": 7.9481, "loss/crossentropy": 2.189783751964569, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.24579951167106628, "step": 2506 }, { "epoch": 0.15675, "grad_norm": 2.796875, "grad_norm_var": 0.0482818603515625, "learning_rate": 0.0001, "loss": 7.8348, "loss/crossentropy": 2.629545569419861, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.26841507852077484, "step": 2508 }, { "epoch": 0.156875, "grad_norm": 2.34375, "grad_norm_var": 0.060212198893229166, "learning_rate": 0.0001, "loss": 7.7123, "loss/crossentropy": 2.267096519470215, "loss/hidden": 2.9921875, "loss/jsd": 0.0, "loss/logits": 0.24167770147323608, "step": 2510 }, { "epoch": 0.157, "grad_norm": 2.546875, "grad_norm_var": 0.05322977701822917, "learning_rate": 0.0001, "loss": 7.6795, "loss/crossentropy": 2.170002818107605, "loss/hidden": 3.0546875, "loss/jsd": 0.0, "loss/logits": 0.2375175580382347, "step": 2512 }, { "epoch": 0.157125, "grad_norm": 2.890625, "grad_norm_var": 0.05366109212239583, "learning_rate": 0.0001, "loss": 7.8975, "loss/crossentropy": 2.1839526891708374, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.24612244218587875, "step": 2514 }, { "epoch": 0.15725, "grad_norm": 2.484375, "grad_norm_var": 0.0496734619140625, "learning_rate": 0.0001, "loss": 7.7503, "loss/crossentropy": 1.7978224754333496, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.22572653740644455, "step": 2516 }, { "epoch": 0.157375, "grad_norm": 2.9375, "grad_norm_var": 0.052567545572916666, "learning_rate": 0.0001, "loss": 7.9403, "loss/crossentropy": 2.2894846200942993, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.24215564131736755, "step": 2518 }, { "epoch": 0.1575, "grad_norm": 2.578125, "grad_norm_var": 0.052469889322916664, "learning_rate": 0.0001, "loss": 8.0173, "loss/crossentropy": 2.271655559539795, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.23966332525014877, "step": 2520 }, { "epoch": 0.157625, "grad_norm": 2.640625, "grad_norm_var": 0.051070149739583334, "learning_rate": 0.0001, "loss": 8.027, "loss/crossentropy": 2.30005145072937, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.2628418430685997, "step": 2522 }, { "epoch": 0.15775, "grad_norm": 2.65625, "grad_norm_var": 0.03728841145833333, "learning_rate": 0.0001, "loss": 7.5896, "loss/crossentropy": 2.0799155235290527, "loss/hidden": 3.0078125, "loss/jsd": 0.0, "loss/logits": 0.23638420552015305, "step": 2524 }, { "epoch": 0.157875, "grad_norm": 2.859375, "grad_norm_var": 0.03769124348958333, "learning_rate": 0.0001, "loss": 7.9816, "loss/crossentropy": 2.13996684551239, "loss/hidden": 3.0703125, "loss/jsd": 0.0, "loss/logits": 0.24061943590641022, "step": 2526 }, { "epoch": 0.158, "grad_norm": 2.546875, "grad_norm_var": 0.03584696451822917, "learning_rate": 0.0001, "loss": 7.89, "loss/crossentropy": 2.4007346630096436, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.2567130923271179, "step": 2528 }, { "epoch": 0.158125, "grad_norm": 2.859375, "grad_norm_var": 0.0336578369140625, "learning_rate": 0.0001, "loss": 7.8945, "loss/crossentropy": 2.3631176948547363, "loss/hidden": 3.0390625, "loss/jsd": 0.0, "loss/logits": 0.2415386661887169, "step": 2530 }, { "epoch": 0.15825, "grad_norm": 2.5625, "grad_norm_var": 0.02857666015625, "learning_rate": 0.0001, "loss": 7.7907, "loss/crossentropy": 2.058986485004425, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.24685738235712051, "step": 2532 }, { "epoch": 0.158375, "grad_norm": 2.5, "grad_norm_var": 0.02593994140625, "learning_rate": 0.0001, "loss": 7.8062, "loss/crossentropy": 2.2281254529953003, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.24689025431871414, "step": 2534 }, { "epoch": 0.1585, "grad_norm": 2.5625, "grad_norm_var": 0.023127237955729168, "learning_rate": 0.0001, "loss": 7.8028, "loss/crossentropy": 2.4382470846176147, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.27194739878177643, "step": 2536 }, { "epoch": 0.158625, "grad_norm": 2.96875, "grad_norm_var": 0.028083292643229167, "learning_rate": 0.0001, "loss": 7.8523, "loss/crossentropy": 2.2935184240341187, "loss/hidden": 3.0234375, "loss/jsd": 0.0, "loss/logits": 0.23303698748350143, "step": 2538 }, { "epoch": 0.15875, "grad_norm": 2.578125, "grad_norm_var": 0.0289703369140625, "learning_rate": 0.0001, "loss": 7.8038, "loss/crossentropy": 2.43733286857605, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.23855997622013092, "step": 2540 }, { "epoch": 0.158875, "grad_norm": 2.796875, "grad_norm_var": 0.020894368489583332, "learning_rate": 0.0001, "loss": 7.8183, "loss/crossentropy": 2.318352222442627, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.2344244346022606, "step": 2542 }, { "epoch": 0.159, "grad_norm": 2.546875, "grad_norm_var": 0.021468098958333334, "learning_rate": 0.0001, "loss": 7.7506, "loss/crossentropy": 2.172747015953064, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.23044558614492416, "step": 2544 }, { "epoch": 0.159125, "grad_norm": 2.625, "grad_norm_var": 0.017024739583333334, "learning_rate": 0.0001, "loss": 7.7647, "loss/crossentropy": 2.2639771699905396, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.2624407559633255, "step": 2546 }, { "epoch": 0.15925, "grad_norm": 2.59375, "grad_norm_var": 0.05847981770833333, "learning_rate": 0.0001, "loss": 7.8837, "loss/crossentropy": 2.484058380126953, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.2976628988981247, "step": 2548 }, { "epoch": 0.159375, "grad_norm": 2.671875, "grad_norm_var": 0.05621337890625, "learning_rate": 0.0001, "loss": 7.9374, "loss/crossentropy": 2.439212441444397, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.23525572568178177, "step": 2550 }, { "epoch": 0.1595, "grad_norm": 2.6875, "grad_norm_var": 0.061009724934895836, "learning_rate": 0.0001, "loss": 7.6554, "loss/crossentropy": 2.046541452407837, "loss/hidden": 3.0234375, "loss/jsd": 0.0, "loss/logits": 0.23553012311458588, "step": 2552 }, { "epoch": 0.159625, "grad_norm": 2.65625, "grad_norm_var": 0.058430989583333336, "learning_rate": 0.0001, "loss": 7.679, "loss/crossentropy": 2.027937591075897, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.24199260026216507, "step": 2554 }, { "epoch": 0.15975, "grad_norm": 2.4375, "grad_norm_var": 0.06506245930989583, "learning_rate": 0.0001, "loss": 7.6605, "loss/crossentropy": 1.922214150428772, "loss/hidden": 3.015625, "loss/jsd": 0.0, "loss/logits": 0.21722379326820374, "step": 2556 }, { "epoch": 0.159875, "grad_norm": 2.796875, "grad_norm_var": 0.06368815104166667, "learning_rate": 0.0001, "loss": 7.947, "loss/crossentropy": 2.41329824924469, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.23701953887939453, "step": 2558 }, { "epoch": 0.16, "grad_norm": 2.625, "grad_norm_var": 0.06347249348958334, "learning_rate": 0.0001, "loss": 7.9478, "loss/crossentropy": 2.364239811897278, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.26914364099502563, "step": 2560 }, { "epoch": 0.160125, "grad_norm": 2.53125, "grad_norm_var": 0.06280924479166666, "learning_rate": 0.0001, "loss": 7.8012, "loss/crossentropy": 2.2895541191101074, "loss/hidden": 3.0234375, "loss/jsd": 0.0, "loss/logits": 0.2240670844912529, "step": 2562 }, { "epoch": 0.16025, "grad_norm": 2.59375, "grad_norm_var": 0.022858683268229166, "learning_rate": 0.0001, "loss": 7.9514, "loss/crossentropy": 2.21063768863678, "loss/hidden": 3.0625, "loss/jsd": 0.0, "loss/logits": 0.23704807460308075, "step": 2564 }, { "epoch": 0.160375, "grad_norm": 2.421875, "grad_norm_var": 0.026102701822916668, "learning_rate": 0.0001, "loss": 7.9352, "loss/crossentropy": 2.1890910863876343, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.25638218969106674, "step": 2566 }, { "epoch": 0.1605, "grad_norm": 2.734375, "grad_norm_var": 0.023014322916666666, "learning_rate": 0.0001, "loss": 7.7254, "loss/crossentropy": 2.3455424308776855, "loss/hidden": 3.03125, "loss/jsd": 0.0, "loss/logits": 0.22738848626613617, "step": 2568 }, { "epoch": 0.160625, "grad_norm": 2.640625, "grad_norm_var": 0.018830362955729166, "learning_rate": 0.0001, "loss": 7.9244, "loss/crossentropy": 2.4760804176330566, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.2477440983057022, "step": 2570 }, { "epoch": 0.16075, "grad_norm": 2.78125, "grad_norm_var": 0.0157379150390625, "learning_rate": 0.0001, "loss": 7.8971, "loss/crossentropy": 2.2207542657852173, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.23943090438842773, "step": 2572 }, { "epoch": 0.160875, "grad_norm": 2.796875, "grad_norm_var": 0.0168609619140625, "learning_rate": 0.0001, "loss": 7.8168, "loss/crossentropy": 2.5181933641433716, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.2479179948568344, "step": 2574 }, { "epoch": 0.161, "grad_norm": 2.75, "grad_norm_var": 0.015721638997395832, "learning_rate": 0.0001, "loss": 7.7453, "loss/crossentropy": 2.37592613697052, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.24624846875667572, "step": 2576 }, { "epoch": 0.161125, "grad_norm": 3.15625, "grad_norm_var": 0.030204264322916667, "learning_rate": 0.0001, "loss": 7.8473, "loss/crossentropy": 2.2562596797943115, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.2466205209493637, "step": 2578 }, { "epoch": 0.16125, "grad_norm": 2.625, "grad_norm_var": 0.029866536458333332, "learning_rate": 0.0001, "loss": 7.8813, "loss/crossentropy": 2.23412823677063, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.2418230101466179, "step": 2580 }, { "epoch": 0.161375, "grad_norm": 2.625, "grad_norm_var": 0.023030598958333332, "learning_rate": 0.0001, "loss": 7.9116, "loss/crossentropy": 2.529879093170166, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.2786422669887543, "step": 2582 }, { "epoch": 0.1615, "grad_norm": 2.515625, "grad_norm_var": 0.0252593994140625, "learning_rate": 0.0001, "loss": 7.7615, "loss/crossentropy": 2.1202937364578247, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.24669316411018372, "step": 2584 }, { "epoch": 0.161625, "grad_norm": 2.59375, "grad_norm_var": 0.03206278483072917, "learning_rate": 0.0001, "loss": 7.7169, "loss/crossentropy": 2.2816332578659058, "loss/hidden": 3.0625, "loss/jsd": 0.0, "loss/logits": 0.23847512155771255, "step": 2586 }, { "epoch": 0.16175, "grad_norm": 2.828125, "grad_norm_var": 0.032136027018229166, "learning_rate": 0.0001, "loss": 7.7928, "loss/crossentropy": 2.264583945274353, "loss/hidden": 3.0703125, "loss/jsd": 0.0, "loss/logits": 0.23760029673576355, "step": 2588 }, { "epoch": 0.161875, "grad_norm": 2.4375, "grad_norm_var": 0.03433837890625, "learning_rate": 0.0001, "loss": 7.8559, "loss/crossentropy": 2.070025682449341, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.2591887414455414, "step": 2590 }, { "epoch": 0.162, "grad_norm": 2.453125, "grad_norm_var": 0.039388020833333336, "learning_rate": 0.0001, "loss": 7.8661, "loss/crossentropy": 2.3117960691452026, "loss/hidden": 3.0390625, "loss/jsd": 0.0, "loss/logits": 0.23791569471359253, "step": 2592 }, { "epoch": 0.162125, "grad_norm": 2.5625, "grad_norm_var": 0.0252593994140625, "learning_rate": 0.0001, "loss": 7.4415, "loss/crossentropy": 2.2540050745010376, "loss/hidden": 3.03125, "loss/jsd": 0.0, "loss/logits": 0.22730688750743866, "step": 2594 }, { "epoch": 0.16225, "grad_norm": 2.703125, "grad_norm_var": 0.03404947916666667, "learning_rate": 0.0001, "loss": 7.6522, "loss/crossentropy": 1.7815396785736084, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.2086879387497902, "step": 2596 }, { "epoch": 0.162375, "grad_norm": 2.625, "grad_norm_var": 0.03359375, "learning_rate": 0.0001, "loss": 7.7545, "loss/crossentropy": 2.38827908039093, "loss/hidden": 3.0703125, "loss/jsd": 0.0, "loss/logits": 0.24543824791908264, "step": 2598 }, { "epoch": 0.1625, "grad_norm": 2.84375, "grad_norm_var": 0.03720703125, "learning_rate": 0.0001, "loss": 7.8079, "loss/crossentropy": 2.4880915880203247, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.2978169023990631, "step": 2600 }, { "epoch": 0.162625, "grad_norm": 2.515625, "grad_norm_var": 0.03308817545572917, "learning_rate": 0.0001, "loss": 7.8877, "loss/crossentropy": 2.4218918085098267, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.24140693247318268, "step": 2602 }, { "epoch": 0.16275, "grad_norm": 2.53125, "grad_norm_var": 0.031053670247395835, "learning_rate": 0.0001, "loss": 7.8144, "loss/crossentropy": 2.266621947288513, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.26293135434389114, "step": 2604 }, { "epoch": 0.162875, "grad_norm": 2.578125, "grad_norm_var": 0.028499348958333334, "learning_rate": 0.0001, "loss": 7.7038, "loss/crossentropy": 2.1036359071731567, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.2307727113366127, "step": 2606 }, { "epoch": 0.163, "grad_norm": 2.6875, "grad_norm_var": 0.04426981608072917, "learning_rate": 0.0001, "loss": 7.8715, "loss/crossentropy": 2.2452776432037354, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.26242855191230774, "step": 2608 }, { "epoch": 0.163125, "grad_norm": 2.46875, "grad_norm_var": 0.04185791015625, "learning_rate": 0.0001, "loss": 7.8903, "loss/crossentropy": 2.2557637691497803, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.272522896528244, "step": 2610 }, { "epoch": 0.16325, "grad_norm": 2.65625, "grad_norm_var": 0.03326416015625, "learning_rate": 0.0001, "loss": 7.9233, "loss/crossentropy": 2.375272512435913, "loss/hidden": 3.015625, "loss/jsd": 0.0, "loss/logits": 0.2328314259648323, "step": 2612 }, { "epoch": 0.163375, "grad_norm": 2.984375, "grad_norm_var": 0.03937886555989583, "learning_rate": 0.0001, "loss": 7.8084, "loss/crossentropy": 2.2634716033935547, "loss/hidden": 3.0625, "loss/jsd": 0.0, "loss/logits": 0.2391228973865509, "step": 2614 }, { "epoch": 0.1635, "grad_norm": 2.75, "grad_norm_var": 0.04064127604166667, "learning_rate": 0.0001, "loss": 7.888, "loss/crossentropy": 2.5287814140319824, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.2443351447582245, "step": 2616 }, { "epoch": 0.163625, "grad_norm": 2.375, "grad_norm_var": 0.04646708170572917, "learning_rate": 0.0001, "loss": 7.6805, "loss/crossentropy": 2.142694592475891, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.24597708880901337, "step": 2618 }, { "epoch": 0.16375, "grad_norm": 3.25, "grad_norm_var": 0.06467997233072917, "learning_rate": 0.0001, "loss": 7.8287, "loss/crossentropy": 1.996739685535431, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.25455768406391144, "step": 2620 }, { "epoch": 0.163875, "grad_norm": 2.40625, "grad_norm_var": 0.06575419108072916, "learning_rate": 0.0001, "loss": 7.8684, "loss/crossentropy": 2.460938572883606, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.25081367045640945, "step": 2622 }, { "epoch": 0.164, "grad_norm": 2.828125, "grad_norm_var": 0.05423177083333333, "learning_rate": 0.0001, "loss": 7.8841, "loss/crossentropy": 2.4802383184432983, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.24186843633651733, "step": 2624 }, { "epoch": 0.164125, "grad_norm": 2.609375, "grad_norm_var": 0.05076395670572917, "learning_rate": 0.0001, "loss": 7.8454, "loss/crossentropy": 2.209325075149536, "loss/hidden": 3.0703125, "loss/jsd": 0.0, "loss/logits": 0.237510085105896, "step": 2626 }, { "epoch": 0.16425, "grad_norm": 2.28125, "grad_norm_var": 0.06262613932291666, "learning_rate": 0.0001, "loss": 7.7406, "loss/crossentropy": 2.0401015281677246, "loss/hidden": 2.9921875, "loss/jsd": 0.0, "loss/logits": 0.20650795102119446, "step": 2628 }, { "epoch": 0.164375, "grad_norm": 2.890625, "grad_norm_var": 0.06002604166666667, "learning_rate": 0.0001, "loss": 7.894, "loss/crossentropy": 2.5026817321777344, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.24748124927282333, "step": 2630 }, { "epoch": 0.1645, "grad_norm": 2.34375, "grad_norm_var": 0.0602691650390625, "learning_rate": 0.0001, "loss": 7.7024, "loss/crossentropy": 2.2391568422317505, "loss/hidden": 3.0234375, "loss/jsd": 0.0, "loss/logits": 0.22500373423099518, "step": 2632 }, { "epoch": 0.164625, "grad_norm": 2.578125, "grad_norm_var": 0.05440165201822917, "learning_rate": 0.0001, "loss": 7.7876, "loss/crossentropy": 2.2685747742652893, "loss/hidden": 3.0625, "loss/jsd": 0.0, "loss/logits": 0.2358316034078598, "step": 2634 }, { "epoch": 0.16475, "grad_norm": 2.703125, "grad_norm_var": 0.027220662434895834, "learning_rate": 0.0001, "loss": 7.7126, "loss/crossentropy": 2.2785946130752563, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.24026557803153992, "step": 2636 }, { "epoch": 0.164875, "grad_norm": 2.671875, "grad_norm_var": 0.024898274739583334, "learning_rate": 0.0001, "loss": 7.8153, "loss/crossentropy": 2.2721141576766968, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.2398277372121811, "step": 2638 }, { "epoch": 0.165, "grad_norm": 2.5625, "grad_norm_var": 0.023224894205729166, "learning_rate": 0.0001, "loss": 7.9432, "loss/crossentropy": 2.229793667793274, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.21887121349573135, "step": 2640 }, { "epoch": 0.165125, "grad_norm": 2.4375, "grad_norm_var": 0.023876953125, "learning_rate": 0.0001, "loss": 7.603, "loss/crossentropy": 2.1890532970428467, "loss/hidden": 3.0625, "loss/jsd": 0.0, "loss/logits": 0.23123417794704437, "step": 2642 }, { "epoch": 0.16525, "grad_norm": 2.53125, "grad_norm_var": 0.0223052978515625, "learning_rate": 0.0001, "loss": 7.6917, "loss/crossentropy": 2.172744870185852, "loss/hidden": 3.0, "loss/jsd": 0.0, "loss/logits": 0.22291851788759232, "step": 2644 }, { "epoch": 0.165375, "grad_norm": 2.75, "grad_norm_var": 0.017606608072916665, "learning_rate": 0.0001, "loss": 7.6527, "loss/crossentropy": 2.2864272594451904, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.24596457928419113, "step": 2646 }, { "epoch": 0.1655, "grad_norm": 2.53125, "grad_norm_var": 0.01695556640625, "learning_rate": 0.0001, "loss": 7.858, "loss/crossentropy": 2.428277611732483, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.25081363320350647, "step": 2648 }, { "epoch": 0.165625, "grad_norm": 2.671875, "grad_norm_var": 0.017508951822916667, "learning_rate": 0.0001, "loss": 7.8198, "loss/crossentropy": 2.2622756958007812, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.24526074528694153, "step": 2650 }, { "epoch": 0.16575, "grad_norm": 2.453125, "grad_norm_var": 0.0265289306640625, "learning_rate": 0.0001, "loss": 7.6316, "loss/crossentropy": 2.1581307649612427, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.24160084128379822, "step": 2652 }, { "epoch": 0.165875, "grad_norm": 2.78125, "grad_norm_var": 0.0283203125, "learning_rate": 0.0001, "loss": 7.8289, "loss/crossentropy": 2.2180778980255127, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.24087932705879211, "step": 2654 }, { "epoch": 0.166, "grad_norm": 2.796875, "grad_norm_var": 0.030939737955729168, "learning_rate": 0.0001, "loss": 7.8398, "loss/crossentropy": 2.3046650886535645, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.25179338455200195, "step": 2656 }, { "epoch": 0.166125, "grad_norm": 2.796875, "grad_norm_var": 0.02769775390625, "learning_rate": 0.0001, "loss": 7.7888, "loss/crossentropy": 2.251810073852539, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.23383210599422455, "step": 2658 }, { "epoch": 0.16625, "grad_norm": 2.515625, "grad_norm_var": 0.0210357666015625, "learning_rate": 0.0001, "loss": 7.6108, "loss/crossentropy": 2.326148748397827, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.267700731754303, "step": 2660 }, { "epoch": 0.166375, "grad_norm": 2.640625, "grad_norm_var": 0.021410115559895835, "learning_rate": 0.0001, "loss": 7.6615, "loss/crossentropy": 2.3175487518310547, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.2537999600172043, "step": 2662 }, { "epoch": 0.1665, "grad_norm": 2.609375, "grad_norm_var": 0.019310506184895833, "learning_rate": 0.0001, "loss": 7.7319, "loss/crossentropy": 2.4394067525863647, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.24439330399036407, "step": 2664 }, { "epoch": 0.166625, "grad_norm": 2.5, "grad_norm_var": 0.024934895833333335, "learning_rate": 0.0001, "loss": 7.7306, "loss/crossentropy": 2.377542495727539, "loss/hidden": 3.03125, "loss/jsd": 0.0, "loss/logits": 0.2411457523703575, "step": 2666 }, { "epoch": 0.16675, "grad_norm": 2.96875, "grad_norm_var": 0.026097615559895832, "learning_rate": 0.0001, "loss": 7.7291, "loss/crossentropy": 2.244265556335449, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.2636634260416031, "step": 2668 }, { "epoch": 0.166875, "grad_norm": 2.40625, "grad_norm_var": 0.029618326822916666, "learning_rate": 0.0001, "loss": 7.817, "loss/crossentropy": 2.3067715167999268, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.2648337334394455, "step": 2670 }, { "epoch": 0.167, "grad_norm": 2.78125, "grad_norm_var": 0.02916259765625, "learning_rate": 0.0001, "loss": 7.9386, "loss/crossentropy": 2.3284155130386353, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.25726044178009033, "step": 2672 }, { "epoch": 0.167125, "grad_norm": 2.609375, "grad_norm_var": 0.03279520670572917, "learning_rate": 0.0001, "loss": 7.6504, "loss/crossentropy": 2.1608939170837402, "loss/hidden": 2.984375, "loss/jsd": 0.0, "loss/logits": 0.23987659811973572, "step": 2674 }, { "epoch": 0.16725, "grad_norm": 3.0625, "grad_norm_var": 0.04436442057291667, "learning_rate": 0.0001, "loss": 7.7824, "loss/crossentropy": 2.156682312488556, "loss/hidden": 2.9609375, "loss/jsd": 0.0, "loss/logits": 0.22545601427555084, "step": 2676 }, { "epoch": 0.167375, "grad_norm": 2.765625, "grad_norm_var": 0.07141520182291666, "learning_rate": 0.0001, "loss": 7.8573, "loss/crossentropy": 2.1365780234336853, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.21754685044288635, "step": 2678 }, { "epoch": 0.1675, "grad_norm": 2.484375, "grad_norm_var": 0.07617899576822916, "learning_rate": 0.0001, "loss": 7.9647, "loss/crossentropy": 2.27071213722229, "loss/hidden": 3.0546875, "loss/jsd": 0.0, "loss/logits": 0.22438976168632507, "step": 2680 }, { "epoch": 0.167625, "grad_norm": 2.765625, "grad_norm_var": 0.0685455322265625, "learning_rate": 0.0001, "loss": 7.7773, "loss/crossentropy": 2.115522563457489, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.22477930784225464, "step": 2682 }, { "epoch": 0.16775, "grad_norm": 2.46875, "grad_norm_var": 0.06461181640625, "learning_rate": 0.0001, "loss": 7.7755, "loss/crossentropy": 2.215229034423828, "loss/hidden": 3.0625, "loss/jsd": 0.0, "loss/logits": 0.22990095615386963, "step": 2684 }, { "epoch": 0.167875, "grad_norm": 2.484375, "grad_norm_var": 0.06243082682291667, "learning_rate": 0.0001, "loss": 7.7615, "loss/crossentropy": 2.180325746536255, "loss/hidden": 3.0234375, "loss/jsd": 0.0, "loss/logits": 0.232216514647007, "step": 2686 }, { "epoch": 0.168, "grad_norm": 2.453125, "grad_norm_var": 0.0634185791015625, "learning_rate": 0.0001, "loss": 7.713, "loss/crossentropy": 2.4484771490097046, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.2512357458472252, "step": 2688 }, { "epoch": 0.168125, "grad_norm": 2.5625, "grad_norm_var": 0.0561431884765625, "learning_rate": 0.0001, "loss": 7.7087, "loss/crossentropy": 2.178835153579712, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.24023275077342987, "step": 2690 }, { "epoch": 0.16825, "grad_norm": 2.53125, "grad_norm_var": 0.045750935872395836, "learning_rate": 0.0001, "loss": 7.6464, "loss/crossentropy": 2.0620937943458557, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.23334022611379623, "step": 2692 }, { "epoch": 0.168375, "grad_norm": 2.53125, "grad_norm_var": 0.009740193684895834, "learning_rate": 0.0001, "loss": 7.8639, "loss/crossentropy": 2.4087116718292236, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.2341567426919937, "step": 2694 }, { "epoch": 0.1685, "grad_norm": 2.890625, "grad_norm_var": 0.014647420247395833, "learning_rate": 0.0001, "loss": 7.9106, "loss/crossentropy": 2.309167981147766, "loss/hidden": 3.0390625, "loss/jsd": 0.0, "loss/logits": 0.24564050883054733, "step": 2696 }, { "epoch": 0.168625, "grad_norm": 2.28125, "grad_norm_var": 0.016364542643229167, "learning_rate": 0.0001, "loss": 7.6039, "loss/crossentropy": 1.9676810503005981, "loss/hidden": 2.859375, "loss/jsd": 0.0, "loss/logits": 0.2102392315864563, "step": 2698 }, { "epoch": 0.16875, "grad_norm": 2.65625, "grad_norm_var": 0.017118326822916665, "learning_rate": 0.0001, "loss": 7.7754, "loss/crossentropy": 2.380856513977051, "loss/hidden": 3.0625, "loss/jsd": 0.0, "loss/logits": 0.2278568521142006, "step": 2700 }, { "epoch": 0.168875, "grad_norm": 2.828125, "grad_norm_var": 0.020335896809895834, "learning_rate": 0.0001, "loss": 7.8029, "loss/crossentropy": 2.2248435020446777, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.23989464342594147, "step": 2702 }, { "epoch": 0.169, "grad_norm": 2.65625, "grad_norm_var": 0.0283355712890625, "learning_rate": 0.0001, "loss": 7.7909, "loss/crossentropy": 2.2802765369415283, "loss/hidden": 3.046875, "loss/jsd": 0.0, "loss/logits": 0.23772113770246506, "step": 2704 }, { "epoch": 0.169125, "grad_norm": 2.640625, "grad_norm_var": 0.02998046875, "learning_rate": 0.0001, "loss": 7.8392, "loss/crossentropy": 2.320490837097168, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.232273131608963, "step": 2706 }, { "epoch": 0.16925, "grad_norm": 2.34375, "grad_norm_var": 0.036279296875, "learning_rate": 0.0001, "loss": 7.7277, "loss/crossentropy": 2.103710889816284, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.21974950283765793, "step": 2708 }, { "epoch": 0.169375, "grad_norm": 2.90625, "grad_norm_var": 0.0538726806640625, "learning_rate": 0.0001, "loss": 7.9849, "loss/crossentropy": 2.311343789100647, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.27306586503982544, "step": 2710 }, { "epoch": 0.1695, "grad_norm": 4.21875, "grad_norm_var": 0.21113993326822916, "learning_rate": 0.0001, "loss": 7.898, "loss/crossentropy": 2.248456120491028, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.23485350608825684, "step": 2712 }, { "epoch": 0.169625, "grad_norm": 3.640625, "grad_norm_var": 0.2821116129557292, "learning_rate": 0.0001, "loss": 8.0907, "loss/crossentropy": 2.3589508533477783, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.23614779859781265, "step": 2714 }, { "epoch": 0.16975, "grad_norm": 2.9375, "grad_norm_var": 0.29221903483072914, "learning_rate": 0.0001, "loss": 7.8759, "loss/crossentropy": 2.472365975379944, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.23715022206306458, "step": 2716 }, { "epoch": 0.169875, "grad_norm": 2.328125, "grad_norm_var": 0.3374420166015625, "learning_rate": 0.0001, "loss": 7.7942, "loss/crossentropy": 2.264963388442993, "loss/hidden": 3.0546875, "loss/jsd": 0.0, "loss/logits": 0.24661238491535187, "step": 2718 }, { "epoch": 0.17, "grad_norm": 2.828125, "grad_norm_var": 0.3333943684895833, "learning_rate": 0.0001, "loss": 7.9196, "loss/crossentropy": 2.477718949317932, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.25135859847068787, "step": 2720 }, { "epoch": 0.170125, "grad_norm": 2.40625, "grad_norm_var": 0.3395792643229167, "learning_rate": 0.0001, "loss": 7.8171, "loss/crossentropy": 2.2033207416534424, "loss/hidden": 3.046875, "loss/jsd": 0.0, "loss/logits": 0.24111789464950562, "step": 2722 }, { "epoch": 0.17025, "grad_norm": 2.671875, "grad_norm_var": 0.3217844645182292, "learning_rate": 0.0001, "loss": 7.7115, "loss/crossentropy": 2.3668792247772217, "loss/hidden": 3.0, "loss/jsd": 0.0, "loss/logits": 0.2309272140264511, "step": 2724 }, { "epoch": 0.170375, "grad_norm": 2.609375, "grad_norm_var": 0.32696940104166666, "learning_rate": 0.0001, "loss": 7.807, "loss/crossentropy": 2.359344244003296, "loss/hidden": 3.0625, "loss/jsd": 0.0, "loss/logits": 0.25724074244499207, "step": 2726 }, { "epoch": 0.1705, "grad_norm": 2.4375, "grad_norm_var": 0.21409505208333332, "learning_rate": 0.0001, "loss": 7.7598, "loss/crossentropy": 2.2009201049804688, "loss/hidden": 3.0234375, "loss/jsd": 0.0, "loss/logits": 0.23280736804008484, "step": 2728 }, { "epoch": 0.170625, "grad_norm": 2.59375, "grad_norm_var": 0.1070465087890625, "learning_rate": 0.0001, "loss": 7.8024, "loss/crossentropy": 2.1781824827194214, "loss/hidden": 3.0078125, "loss/jsd": 0.0, "loss/logits": 0.23048165440559387, "step": 2730 }, { "epoch": 0.17075, "grad_norm": 2.734375, "grad_norm_var": 0.041890462239583336, "learning_rate": 0.0001, "loss": 7.867, "loss/crossentropy": 2.2078417539596558, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.24096353352069855, "step": 2732 }, { "epoch": 0.170875, "grad_norm": 2.640625, "grad_norm_var": 0.03166910807291667, "learning_rate": 0.0001, "loss": 7.8545, "loss/crossentropy": 2.3574694395065308, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.2407936155796051, "step": 2734 }, { "epoch": 0.171, "grad_norm": 2.5, "grad_norm_var": 0.026285807291666668, "learning_rate": 0.0001, "loss": 7.7336, "loss/crossentropy": 2.3674607276916504, "loss/hidden": 3.0390625, "loss/jsd": 0.0, "loss/logits": 0.2338072955608368, "step": 2736 }, { "epoch": 0.171125, "grad_norm": 2.453125, "grad_norm_var": 0.025178019205729166, "learning_rate": 0.0001, "loss": 7.7552, "loss/crossentropy": 2.1523255109786987, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.23672043532133102, "step": 2738 }, { "epoch": 0.17125, "grad_norm": 2.375, "grad_norm_var": 0.026471964518229165, "learning_rate": 0.0001, "loss": 7.6336, "loss/crossentropy": 2.0955100655555725, "loss/hidden": 3.03125, "loss/jsd": 0.0, "loss/logits": 0.2361309677362442, "step": 2740 }, { "epoch": 0.171375, "grad_norm": 3.140625, "grad_norm_var": 0.036432902018229164, "learning_rate": 0.0001, "loss": 7.8829, "loss/crossentropy": 2.509047269821167, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.2578771486878395, "step": 2742 }, { "epoch": 0.1715, "grad_norm": 2.484375, "grad_norm_var": 0.03332926432291667, "learning_rate": 0.0001, "loss": 7.7991, "loss/crossentropy": 2.2338361740112305, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.26055432856082916, "step": 2744 }, { "epoch": 0.171625, "grad_norm": 2.59375, "grad_norm_var": 0.033967081705729166, "learning_rate": 0.0001, "loss": 7.7437, "loss/crossentropy": 2.1647003889083862, "loss/hidden": 3.0546875, "loss/jsd": 0.0, "loss/logits": 0.23635494709014893, "step": 2746 }, { "epoch": 0.17175, "grad_norm": 2.8125, "grad_norm_var": 0.03599344889322917, "learning_rate": 0.0001, "loss": 7.7051, "loss/crossentropy": 2.561861991882324, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.2592911720275879, "step": 2748 }, { "epoch": 0.171875, "grad_norm": 2.390625, "grad_norm_var": 0.03669331868489583, "learning_rate": 0.0001, "loss": 7.6254, "loss/crossentropy": 2.082680583000183, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.21699509769678116, "step": 2750 }, { "epoch": 0.172, "grad_norm": 6.8125, "grad_norm_var": 1.1539052327473958, "learning_rate": 0.0001, "loss": 7.8237, "loss/crossentropy": 2.5276395082473755, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.2522757425904274, "step": 2752 }, { "epoch": 0.172125, "grad_norm": 5.1875, "grad_norm_var": 6.189676920572917, "learning_rate": 0.0001, "loss": 8.0084, "loss/crossentropy": 2.2055013179779053, "loss/hidden": 2.984375, "loss/jsd": 0.0, "loss/logits": 0.23886322230100632, "step": 2754 }, { "epoch": 0.17225, "grad_norm": 2.375, "grad_norm_var": 6.181050618489583, "learning_rate": 0.0001, "loss": 7.9047, "loss/crossentropy": 2.394223690032959, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.2515896260738373, "step": 2756 }, { "epoch": 0.172375, "grad_norm": 2.90625, "grad_norm_var": 6.165526326497396, "learning_rate": 0.0001, "loss": 7.9735, "loss/crossentropy": 2.2563817501068115, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.23824837803840637, "step": 2758 }, { "epoch": 0.1725, "grad_norm": 2.484375, "grad_norm_var": 6.159468587239584, "learning_rate": 0.0001, "loss": 7.8461, "loss/crossentropy": 2.244271457195282, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.22762248665094376, "step": 2760 }, { "epoch": 0.172625, "grad_norm": 2.609375, "grad_norm_var": 6.164518229166666, "learning_rate": 0.0001, "loss": 7.7406, "loss/crossentropy": 2.183770179748535, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.21703775227069855, "step": 2762 }, { "epoch": 0.17275, "grad_norm": 3.5625, "grad_norm_var": 6.166910807291667, "learning_rate": 0.0001, "loss": 7.9424, "loss/crossentropy": 2.1734741926193237, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.2613416016101837, "step": 2764 }, { "epoch": 0.172875, "grad_norm": 2.859375, "grad_norm_var": 6.048502604166667, "learning_rate": 0.0001, "loss": 7.9174, "loss/crossentropy": 2.32150936126709, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.23922567069530487, "step": 2766 }, { "epoch": 0.173, "grad_norm": 2.65625, "grad_norm_var": 5.403804524739583, "learning_rate": 0.0001, "loss": 7.6948, "loss/crossentropy": 2.0935378074645996, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.23874947428703308, "step": 2768 }, { "epoch": 0.173125, "grad_norm": 2.703125, "grad_norm_var": 0.39658915201822914, "learning_rate": 0.0001, "loss": 7.9079, "loss/crossentropy": 2.2263898253440857, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.24427123367786407, "step": 2770 }, { "epoch": 0.17325, "grad_norm": 2.625, "grad_norm_var": 0.37922261555989584, "learning_rate": 0.0001, "loss": 7.8895, "loss/crossentropy": 2.1437301635742188, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.22952381521463394, "step": 2772 }, { "epoch": 0.173375, "grad_norm": 2.4375, "grad_norm_var": 0.40615946451822915, "learning_rate": 0.0001, "loss": 7.8372, "loss/crossentropy": 2.293881416320801, "loss/hidden": 3.0390625, "loss/jsd": 0.0, "loss/logits": 0.2393340766429901, "step": 2774 }, { "epoch": 0.1735, "grad_norm": 2.640625, "grad_norm_var": 0.4012980143229167, "learning_rate": 0.0001, "loss": 7.7442, "loss/crossentropy": 2.258249878883362, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.23926617950201035, "step": 2776 }, { "epoch": 0.173625, "grad_norm": 3.953125, "grad_norm_var": 0.4626210530598958, "learning_rate": 0.0001, "loss": 7.7726, "loss/crossentropy": 2.0506081581115723, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.2070065289735794, "step": 2778 }, { "epoch": 0.17375, "grad_norm": 2.5625, "grad_norm_var": 0.14410400390625, "learning_rate": 0.0001, "loss": 7.7289, "loss/crossentropy": 2.130703330039978, "loss/hidden": 3.046875, "loss/jsd": 0.0, "loss/logits": 0.23245615512132645, "step": 2780 }, { "epoch": 0.173875, "grad_norm": 2.578125, "grad_norm_var": 0.1447906494140625, "learning_rate": 0.0001, "loss": 7.8126, "loss/crossentropy": 2.336124062538147, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.2596941590309143, "step": 2782 }, { "epoch": 0.174, "grad_norm": 2.484375, "grad_norm_var": 0.13846028645833333, "learning_rate": 0.0001, "loss": 7.7806, "loss/crossentropy": 2.313853621482849, "loss/hidden": 3.0703125, "loss/jsd": 0.0, "loss/logits": 0.24538902938365936, "step": 2784 }, { "epoch": 0.174125, "grad_norm": 2.796875, "grad_norm_var": 0.13967997233072918, "learning_rate": 0.0001, "loss": 7.7471, "loss/crossentropy": 2.2884416580200195, "loss/hidden": 3.046875, "loss/jsd": 0.0, "loss/logits": 0.24766312539577484, "step": 2786 }, { "epoch": 0.17425, "grad_norm": 2.609375, "grad_norm_var": 0.14446512858072916, "learning_rate": 0.0001, "loss": 7.8361, "loss/crossentropy": 2.3857977390289307, "loss/hidden": 3.0703125, "loss/jsd": 0.0, "loss/logits": 0.2376108318567276, "step": 2788 }, { "epoch": 0.174375, "grad_norm": 2.46875, "grad_norm_var": 0.14230855305989584, "learning_rate": 0.0001, "loss": 7.7924, "loss/crossentropy": 2.2684574127197266, "loss/hidden": 3.015625, "loss/jsd": 0.0, "loss/logits": 0.25091154873371124, "step": 2790 }, { "epoch": 0.1745, "grad_norm": 2.671875, "grad_norm_var": 0.1416168212890625, "learning_rate": 0.0001, "loss": 7.7769, "loss/crossentropy": 2.2278876304626465, "loss/hidden": 3.0703125, "loss/jsd": 0.0, "loss/logits": 0.2252519577741623, "step": 2792 }, { "epoch": 0.174625, "grad_norm": 2.4375, "grad_norm_var": 0.03281962076822917, "learning_rate": 0.0001, "loss": 7.5981, "loss/crossentropy": 2.2489346265792847, "loss/hidden": 3.0, "loss/jsd": 0.0, "loss/logits": 0.23500560969114304, "step": 2794 }, { "epoch": 0.17475, "grad_norm": 2.5, "grad_norm_var": 0.022248331705729166, "learning_rate": 0.0001, "loss": 7.7545, "loss/crossentropy": 2.295899510383606, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.2576214596629143, "step": 2796 }, { "epoch": 0.174875, "grad_norm": 2.46875, "grad_norm_var": 0.0241851806640625, "learning_rate": 0.0001, "loss": 7.8388, "loss/crossentropy": 2.3035519123077393, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.2654409855604172, "step": 2798 }, { "epoch": 0.175, "grad_norm": 3.015625, "grad_norm_var": 0.037060546875, "learning_rate": 0.0001, "loss": 7.9235, "loss/crossentropy": 2.2352579832077026, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.2841232195496559, "step": 2800 }, { "epoch": 0.175125, "grad_norm": 2.890625, "grad_norm_var": 0.046483357747395836, "learning_rate": 0.0001, "loss": 7.709, "loss/crossentropy": 2.3198055028915405, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.23578546196222305, "step": 2802 }, { "epoch": 0.17525, "grad_norm": 3.15625, "grad_norm_var": 0.05718994140625, "learning_rate": 0.0001, "loss": 7.8402, "loss/crossentropy": 2.2245510816574097, "loss/hidden": 3.0, "loss/jsd": 0.0, "loss/logits": 0.23328936845064163, "step": 2804 }, { "epoch": 0.175375, "grad_norm": 2.703125, "grad_norm_var": 0.13108317057291666, "learning_rate": 0.0001, "loss": 7.7222, "loss/crossentropy": 2.3215973377227783, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.2323600873351097, "step": 2806 }, { "epoch": 0.1755, "grad_norm": 2.53125, "grad_norm_var": 0.13945210774739583, "learning_rate": 0.0001, "loss": 7.9641, "loss/crossentropy": 2.6317641735076904, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.25842827558517456, "step": 2808 }, { "epoch": 0.175625, "grad_norm": 2.46875, "grad_norm_var": 0.13935139973958333, "learning_rate": 0.0001, "loss": 7.82, "loss/crossentropy": 2.215041160583496, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.25014058500528336, "step": 2810 }, { "epoch": 0.17575, "grad_norm": 2.4375, "grad_norm_var": 0.13599853515625, "learning_rate": 0.0001, "loss": 7.7395, "loss/crossentropy": 2.1867226362228394, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.23424651473760605, "step": 2812 }, { "epoch": 0.175875, "grad_norm": 2.28125, "grad_norm_var": 0.1615631103515625, "learning_rate": 0.0001, "loss": 7.5309, "loss/crossentropy": 2.217566967010498, "loss/hidden": 3.0546875, "loss/jsd": 0.0, "loss/logits": 0.226304829120636, "step": 2814 }, { "epoch": 0.176, "grad_norm": 2.875, "grad_norm_var": 0.15181884765625, "learning_rate": 0.0001, "loss": 7.759, "loss/crossentropy": 2.0618727803230286, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.2439548224210739, "step": 2816 }, { "epoch": 0.176125, "grad_norm": 3.515625, "grad_norm_var": 0.6506795247395833, "learning_rate": 0.0001, "loss": 7.8753, "loss/crossentropy": 2.409805655479431, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.2505335807800293, "step": 2818 }, { "epoch": 0.17625, "grad_norm": 3.03125, "grad_norm_var": 0.6482248942057292, "learning_rate": 0.0001, "loss": 7.5632, "loss/crossentropy": 2.03094744682312, "loss/hidden": 3.0625, "loss/jsd": 0.0, "loss/logits": 0.21356388926506042, "step": 2820 }, { "epoch": 0.176375, "grad_norm": 2.265625, "grad_norm_var": 0.61324462890625, "learning_rate": 0.0001, "loss": 7.6257, "loss/crossentropy": 2.3141270875930786, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.25053539127111435, "step": 2822 }, { "epoch": 0.1765, "grad_norm": 2.71875, "grad_norm_var": 0.60732421875, "learning_rate": 0.0001, "loss": 7.8661, "loss/crossentropy": 2.174069106578827, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.22884509712457657, "step": 2824 }, { "epoch": 0.176625, "grad_norm": 2.625, "grad_norm_var": 0.6008626302083333, "learning_rate": 0.0001, "loss": 7.5315, "loss/crossentropy": 2.08145010471344, "loss/hidden": 3.0234375, "loss/jsd": 0.0, "loss/logits": 0.22816456109285355, "step": 2826 }, { "epoch": 0.17675, "grad_norm": 2.375, "grad_norm_var": 0.6063313802083333, "learning_rate": 0.0001, "loss": 7.6201, "loss/crossentropy": 2.094637870788574, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.2243911251425743, "step": 2828 }, { "epoch": 0.176875, "grad_norm": 2.53125, "grad_norm_var": 0.5699940999348958, "learning_rate": 0.0001, "loss": 7.7378, "loss/crossentropy": 2.1945928931236267, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.2399984896183014, "step": 2830 }, { "epoch": 0.177, "grad_norm": 2.5, "grad_norm_var": 0.5867421468098958, "learning_rate": 0.0001, "loss": 7.6547, "loss/crossentropy": 2.3012553453445435, "loss/hidden": 2.96875, "loss/jsd": 0.0, "loss/logits": 0.23795722424983978, "step": 2832 }, { "epoch": 0.177125, "grad_norm": 2.5, "grad_norm_var": 0.0286773681640625, "learning_rate": 0.0001, "loss": 7.6384, "loss/crossentropy": 2.424581289291382, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.2470892071723938, "step": 2834 }, { "epoch": 0.17725, "grad_norm": 2.671875, "grad_norm_var": 0.0132720947265625, "learning_rate": 0.0001, "loss": 7.7292, "loss/crossentropy": 2.136981964111328, "loss/hidden": 3.015625, "loss/jsd": 0.0, "loss/logits": 0.23704643547534943, "step": 2836 }, { "epoch": 0.177375, "grad_norm": 2.421875, "grad_norm_var": 0.01051025390625, "learning_rate": 0.0001, "loss": 7.8595, "loss/crossentropy": 2.285550355911255, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.23677106201648712, "step": 2838 }, { "epoch": 0.1775, "grad_norm": 2.859375, "grad_norm_var": 0.015262858072916666, "learning_rate": 0.0001, "loss": 7.6297, "loss/crossentropy": 2.1494513750076294, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.2412027269601822, "step": 2840 }, { "epoch": 0.177625, "grad_norm": 3.0, "grad_norm_var": 0.026984659830729167, "learning_rate": 0.0001, "loss": 7.7216, "loss/crossentropy": 2.605257034301758, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.25743308663368225, "step": 2842 }, { "epoch": 0.17775, "grad_norm": 2.5, "grad_norm_var": 0.028539021809895832, "learning_rate": 0.0001, "loss": 7.6461, "loss/crossentropy": 2.0102819204330444, "loss/hidden": 3.0625, "loss/jsd": 0.0, "loss/logits": 0.2197662517428398, "step": 2844 }, { "epoch": 0.177875, "grad_norm": 2.6875, "grad_norm_var": 0.028564453125, "learning_rate": 0.0001, "loss": 7.8794, "loss/crossentropy": 2.154181718826294, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.23210670053958893, "step": 2846 }, { "epoch": 0.178, "grad_norm": 2.421875, "grad_norm_var": 0.0362945556640625, "learning_rate": 0.0001, "loss": 7.8658, "loss/crossentropy": 2.0595306158065796, "loss/hidden": 3.0234375, "loss/jsd": 0.0, "loss/logits": 0.26342423260211945, "step": 2848 }, { "epoch": 0.178125, "grad_norm": 2.578125, "grad_norm_var": 0.03465067545572917, "learning_rate": 0.0001, "loss": 7.5678, "loss/crossentropy": 2.179203987121582, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.22831647098064423, "step": 2850 }, { "epoch": 0.17825, "grad_norm": 2.875, "grad_norm_var": 0.0369781494140625, "learning_rate": 0.0001, "loss": 7.6801, "loss/crossentropy": 2.0674314498901367, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.23797215521335602, "step": 2852 }, { "epoch": 0.178375, "grad_norm": 2.765625, "grad_norm_var": 0.0592681884765625, "learning_rate": 0.0001, "loss": 7.9571, "loss/crossentropy": 2.3534988164901733, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.26077476143836975, "step": 2854 }, { "epoch": 0.1785, "grad_norm": 2.578125, "grad_norm_var": 0.07317708333333334, "learning_rate": 0.0001, "loss": 7.6968, "loss/crossentropy": 2.10899019241333, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.22398869693279266, "step": 2856 }, { "epoch": 0.178625, "grad_norm": 2.4375, "grad_norm_var": 0.07069905598958333, "learning_rate": 0.0001, "loss": 7.8299, "loss/crossentropy": 2.319381833076477, "loss/hidden": 3.0703125, "loss/jsd": 0.0, "loss/logits": 0.24246351420879364, "step": 2858 }, { "epoch": 0.17875, "grad_norm": 2.484375, "grad_norm_var": 0.07053934733072917, "learning_rate": 0.0001, "loss": 7.6359, "loss/crossentropy": 2.083498954772949, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.23692196607589722, "step": 2860 }, { "epoch": 0.178875, "grad_norm": 2.359375, "grad_norm_var": 0.0753814697265625, "learning_rate": 0.0001, "loss": 7.8099, "loss/crossentropy": 2.1947683095932007, "loss/hidden": 2.953125, "loss/jsd": 0.0, "loss/logits": 0.24239200353622437, "step": 2862 }, { "epoch": 0.179, "grad_norm": 2.671875, "grad_norm_var": 0.06674702962239583, "learning_rate": 0.0001, "loss": 7.7765, "loss/crossentropy": 2.209209442138672, "loss/hidden": 3.0078125, "loss/jsd": 0.0, "loss/logits": 0.24729400128126144, "step": 2864 }, { "epoch": 0.179125, "grad_norm": 2.53125, "grad_norm_var": 0.0657135009765625, "learning_rate": 0.0001, "loss": 7.8832, "loss/crossentropy": 2.3722325563430786, "loss/hidden": 3.0234375, "loss/jsd": 0.0, "loss/logits": 0.24714642018079758, "step": 2866 }, { "epoch": 0.17925, "grad_norm": 2.578125, "grad_norm_var": 0.07121988932291666, "learning_rate": 0.0001, "loss": 7.7811, "loss/crossentropy": 2.4134687185287476, "loss/hidden": 3.0390625, "loss/jsd": 0.0, "loss/logits": 0.2480832040309906, "step": 2868 }, { "epoch": 0.179375, "grad_norm": 2.625, "grad_norm_var": 0.0347320556640625, "learning_rate": 0.0001, "loss": 7.7289, "loss/crossentropy": 2.2061760425567627, "loss/hidden": 2.9921875, "loss/jsd": 0.0, "loss/logits": 0.23551230877637863, "step": 2870 }, { "epoch": 0.1795, "grad_norm": 2.5625, "grad_norm_var": 0.030790201822916665, "learning_rate": 0.0001, "loss": 7.8462, "loss/crossentropy": 2.436152458190918, "loss/hidden": 3.0625, "loss/jsd": 0.0, "loss/logits": 0.2384696677327156, "step": 2872 }, { "epoch": 0.179625, "grad_norm": 2.375, "grad_norm_var": 0.03289388020833333, "learning_rate": 0.0001, "loss": 7.7489, "loss/crossentropy": 2.3130866289138794, "loss/hidden": 3.046875, "loss/jsd": 0.0, "loss/logits": 0.27254779636859894, "step": 2874 }, { "epoch": 0.17975, "grad_norm": 2.828125, "grad_norm_var": 0.0348052978515625, "learning_rate": 0.0001, "loss": 7.8657, "loss/crossentropy": 2.294617772102356, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.23688946664333344, "step": 2876 }, { "epoch": 0.179875, "grad_norm": 2.5625, "grad_norm_var": 0.030497233072916668, "learning_rate": 0.0001, "loss": 7.7343, "loss/crossentropy": 2.3553664684295654, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.2647460997104645, "step": 2878 }, { "epoch": 0.18, "grad_norm": 2.734375, "grad_norm_var": 0.031180826822916667, "learning_rate": 0.0001, "loss": 7.7173, "loss/crossentropy": 2.3102493286132812, "loss/hidden": 2.921875, "loss/jsd": 0.0, "loss/logits": 0.23434799164533615, "step": 2880 }, { "epoch": 0.180125, "grad_norm": 2.5, "grad_norm_var": 0.030378214518229165, "learning_rate": 0.0001, "loss": 7.7913, "loss/crossentropy": 2.298587918281555, "loss/hidden": 3.0078125, "loss/jsd": 0.0, "loss/logits": 0.22284646332263947, "step": 2882 }, { "epoch": 0.18025, "grad_norm": 2.78125, "grad_norm_var": 0.024128214518229166, "learning_rate": 0.0001, "loss": 7.6415, "loss/crossentropy": 2.1285579204559326, "loss/hidden": 2.96875, "loss/jsd": 0.0, "loss/logits": 0.23009717464447021, "step": 2884 }, { "epoch": 0.180375, "grad_norm": 2.546875, "grad_norm_var": 0.021419270833333334, "learning_rate": 0.0001, "loss": 7.7626, "loss/crossentropy": 2.1065726280212402, "loss/hidden": 2.96875, "loss/jsd": 0.0, "loss/logits": 0.20703819394111633, "step": 2886 }, { "epoch": 0.1805, "grad_norm": 2.734375, "grad_norm_var": 0.0209625244140625, "learning_rate": 0.0001, "loss": 7.8513, "loss/crossentropy": 2.2261340618133545, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.25797754526138306, "step": 2888 }, { "epoch": 0.180625, "grad_norm": 2.421875, "grad_norm_var": 0.0176910400390625, "learning_rate": 0.0001, "loss": 7.8405, "loss/crossentropy": 2.5347334146499634, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.25483807921409607, "step": 2890 }, { "epoch": 0.18075, "grad_norm": 2.5625, "grad_norm_var": 0.016141764322916665, "learning_rate": 0.0001, "loss": 7.8018, "loss/crossentropy": 2.2389495372772217, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.2264159545302391, "step": 2892 }, { "epoch": 0.180875, "grad_norm": 2.453125, "grad_norm_var": 0.017552693684895832, "learning_rate": 0.0001, "loss": 7.8118, "loss/crossentropy": 2.2665982246398926, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.24241438508033752, "step": 2894 }, { "epoch": 0.181, "grad_norm": 2.5625, "grad_norm_var": 0.017430623372395832, "learning_rate": 0.0001, "loss": 7.7171, "loss/crossentropy": 2.0678027272224426, "loss/hidden": 3.0, "loss/jsd": 0.0, "loss/logits": 0.21813754737377167, "step": 2896 }, { "epoch": 0.181125, "grad_norm": 2.59375, "grad_norm_var": 0.015034993489583334, "learning_rate": 0.0001, "loss": 7.7695, "loss/crossentropy": 2.3117023706436157, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.23616492748260498, "step": 2898 }, { "epoch": 0.18125, "grad_norm": 2.5625, "grad_norm_var": 0.016434733072916666, "learning_rate": 0.0001, "loss": 7.8045, "loss/crossentropy": 2.30752170085907, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.22399520874023438, "step": 2900 }, { "epoch": 0.181375, "grad_norm": 2.421875, "grad_norm_var": 0.016022745768229166, "learning_rate": 0.0001, "loss": 8.0111, "loss/crossentropy": 2.441025972366333, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.25707243382930756, "step": 2902 }, { "epoch": 0.1815, "grad_norm": 2.421875, "grad_norm_var": 0.012848917643229167, "learning_rate": 0.0001, "loss": 7.6732, "loss/crossentropy": 2.146459937095642, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.24353434145450592, "step": 2904 }, { "epoch": 0.181625, "grad_norm": 2.8125, "grad_norm_var": 0.017609659830729166, "learning_rate": 0.0001, "loss": 7.7333, "loss/crossentropy": 2.4673913717269897, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.2522246688604355, "step": 2906 }, { "epoch": 0.18175, "grad_norm": 2.40625, "grad_norm_var": 0.0198638916015625, "learning_rate": 0.0001, "loss": 7.7385, "loss/crossentropy": 2.1044957637786865, "loss/hidden": 2.9453125, "loss/jsd": 0.0, "loss/logits": 0.235269233584404, "step": 2908 }, { "epoch": 0.181875, "grad_norm": 2.328125, "grad_norm_var": 0.022347005208333333, "learning_rate": 0.0001, "loss": 7.4037, "loss/crossentropy": 2.2339547872543335, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.2398391216993332, "step": 2910 }, { "epoch": 0.182, "grad_norm": 2.6875, "grad_norm_var": 0.0226715087890625, "learning_rate": 0.0001, "loss": 7.5718, "loss/crossentropy": 2.1950976848602295, "loss/hidden": 3.015625, "loss/jsd": 0.0, "loss/logits": 0.23091710358858109, "step": 2912 }, { "epoch": 0.182125, "grad_norm": 2.640625, "grad_norm_var": 0.02310791015625, "learning_rate": 0.0001, "loss": 7.6153, "loss/crossentropy": 2.3033652305603027, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.26021257042884827, "step": 2914 }, { "epoch": 0.18225, "grad_norm": 2.4375, "grad_norm_var": 0.017838541666666666, "learning_rate": 0.0001, "loss": 7.8996, "loss/crossentropy": 2.4616453647613525, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.2583332806825638, "step": 2916 }, { "epoch": 0.182375, "grad_norm": 2.484375, "grad_norm_var": 0.018456013997395833, "learning_rate": 0.0001, "loss": 7.6534, "loss/crossentropy": 2.3851388692855835, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.24298951029777527, "step": 2918 }, { "epoch": 0.1825, "grad_norm": 2.515625, "grad_norm_var": 0.016890462239583334, "learning_rate": 0.0001, "loss": 7.6927, "loss/crossentropy": 2.409003496170044, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.22873351722955704, "step": 2920 }, { "epoch": 0.182625, "grad_norm": 2.828125, "grad_norm_var": 0.03720703125, "learning_rate": 0.0001, "loss": 7.7708, "loss/crossentropy": 2.233977437019348, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.250872403383255, "step": 2922 }, { "epoch": 0.18275, "grad_norm": 2.296875, "grad_norm_var": 0.03955790201822917, "learning_rate": 0.0001, "loss": 7.8246, "loss/crossentropy": 2.2683684825897217, "loss/hidden": 3.0390625, "loss/jsd": 0.0, "loss/logits": 0.2577130198478699, "step": 2924 }, { "epoch": 0.182875, "grad_norm": 2.734375, "grad_norm_var": 0.04023030598958333, "learning_rate": 0.0001, "loss": 7.7134, "loss/crossentropy": 2.2642526626586914, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.25480280816555023, "step": 2926 }, { "epoch": 0.183, "grad_norm": 2.546875, "grad_norm_var": 0.03713785807291667, "learning_rate": 0.0001, "loss": 7.8294, "loss/crossentropy": 2.176198959350586, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.22762203961610794, "step": 2928 }, { "epoch": 0.183125, "grad_norm": 2.65625, "grad_norm_var": 0.037007649739583336, "learning_rate": 0.0001, "loss": 7.7678, "loss/crossentropy": 2.5741217136383057, "loss/hidden": 3.0546875, "loss/jsd": 0.0, "loss/logits": 0.24319174140691757, "step": 2930 }, { "epoch": 0.18325, "grad_norm": 2.765625, "grad_norm_var": 0.03632405598958333, "learning_rate": 0.0001, "loss": 7.744, "loss/crossentropy": 2.4425946474075317, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.2534085810184479, "step": 2932 }, { "epoch": 0.183375, "grad_norm": 2.65625, "grad_norm_var": 0.0351470947265625, "learning_rate": 0.0001, "loss": 7.6372, "loss/crossentropy": 2.136154890060425, "loss/hidden": 3.0546875, "loss/jsd": 0.0, "loss/logits": 0.23775418102741241, "step": 2934 }, { "epoch": 0.1835, "grad_norm": 2.5, "grad_norm_var": 0.035380045572916664, "learning_rate": 0.0001, "loss": 7.8286, "loss/crossentropy": 2.249310851097107, "loss/hidden": 2.9921875, "loss/jsd": 0.0, "loss/logits": 0.24401143193244934, "step": 2936 }, { "epoch": 0.183625, "grad_norm": 2.71875, "grad_norm_var": 0.018912760416666667, "learning_rate": 0.0001, "loss": 7.8273, "loss/crossentropy": 2.3503148555755615, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.24899056553840637, "step": 2938 }, { "epoch": 0.18375, "grad_norm": 2.546875, "grad_norm_var": 0.012923177083333333, "learning_rate": 0.0001, "loss": 7.5477, "loss/crossentropy": 2.147356152534485, "loss/hidden": 3.046875, "loss/jsd": 0.0, "loss/logits": 0.24195496737957, "step": 2940 }, { "epoch": 0.183875, "grad_norm": 2.3125, "grad_norm_var": 0.017496744791666668, "learning_rate": 0.0001, "loss": 7.6031, "loss/crossentropy": 2.314660429954529, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.24584651738405228, "step": 2942 }, { "epoch": 0.184, "grad_norm": 2.796875, "grad_norm_var": 0.020829264322916666, "learning_rate": 0.0001, "loss": 7.6413, "loss/crossentropy": 2.0859320759773254, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.2166888415813446, "step": 2944 }, { "epoch": 0.184125, "grad_norm": 2.234375, "grad_norm_var": 0.028837076822916665, "learning_rate": 0.0001, "loss": 7.7158, "loss/crossentropy": 2.305862069129944, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.23023030161857605, "step": 2946 }, { "epoch": 0.18425, "grad_norm": 2.609375, "grad_norm_var": 0.03004150390625, "learning_rate": 0.0001, "loss": 7.635, "loss/crossentropy": 2.345908284187317, "loss/hidden": 3.0234375, "loss/jsd": 0.0, "loss/logits": 0.22712141275405884, "step": 2948 }, { "epoch": 0.184375, "grad_norm": 2.90625, "grad_norm_var": 0.03779296875, "learning_rate": 0.0001, "loss": 7.7227, "loss/crossentropy": 1.9320513010025024, "loss/hidden": 2.9140625, "loss/jsd": 0.0, "loss/logits": 0.20210829377174377, "step": 2950 }, { "epoch": 0.1845, "grad_norm": 2.265625, "grad_norm_var": 0.04317118326822917, "learning_rate": 0.0001, "loss": 7.6749, "loss/crossentropy": 2.2975679636001587, "loss/hidden": 3.0390625, "loss/jsd": 0.0, "loss/logits": 0.23657388985157013, "step": 2952 }, { "epoch": 0.184625, "grad_norm": 2.6875, "grad_norm_var": 0.04095052083333333, "learning_rate": 0.0001, "loss": 7.7486, "loss/crossentropy": 2.2370001077651978, "loss/hidden": 3.0078125, "loss/jsd": 0.0, "loss/logits": 0.24035517871379852, "step": 2954 }, { "epoch": 0.18475, "grad_norm": 2.46875, "grad_norm_var": 0.046418253580729166, "learning_rate": 0.0001, "loss": 7.77, "loss/crossentropy": 2.3344703912734985, "loss/hidden": 3.0625, "loss/jsd": 0.0, "loss/logits": 0.24448612332344055, "step": 2956 }, { "epoch": 0.184875, "grad_norm": 2.671875, "grad_norm_var": 0.0432037353515625, "learning_rate": 0.0001, "loss": 7.7287, "loss/crossentropy": 2.3184186220169067, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.24355477839708328, "step": 2958 }, { "epoch": 0.185, "grad_norm": 2.671875, "grad_norm_var": 0.04150390625, "learning_rate": 0.0001, "loss": 7.793, "loss/crossentropy": 2.4449312686920166, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.24685372412204742, "step": 2960 }, { "epoch": 0.185125, "grad_norm": 2.5, "grad_norm_var": 0.0351226806640625, "learning_rate": 0.0001, "loss": 7.7295, "loss/crossentropy": 2.0395036935806274, "loss/hidden": 2.890625, "loss/jsd": 0.0, "loss/logits": 0.22259121388196945, "step": 2962 }, { "epoch": 0.18525, "grad_norm": 2.5625, "grad_norm_var": 0.0314453125, "learning_rate": 0.0001, "loss": 7.6614, "loss/crossentropy": 2.029510021209717, "loss/hidden": 3.0, "loss/jsd": 0.0, "loss/logits": 0.22326287627220154, "step": 2964 }, { "epoch": 0.185375, "grad_norm": 2.65625, "grad_norm_var": 0.02506103515625, "learning_rate": 0.0001, "loss": 7.6142, "loss/crossentropy": 2.2890524864196777, "loss/hidden": 3.0625, "loss/jsd": 0.0, "loss/logits": 0.22059021890163422, "step": 2966 }, { "epoch": 0.1855, "grad_norm": 2.578125, "grad_norm_var": 0.021434529622395834, "learning_rate": 0.0001, "loss": 7.7904, "loss/crossentropy": 2.2007906436920166, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.22886135429143906, "step": 2968 }, { "epoch": 0.185625, "grad_norm": 2.578125, "grad_norm_var": 0.016097005208333334, "learning_rate": 0.0001, "loss": 7.7165, "loss/crossentropy": 2.4090301990509033, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.24099014699459076, "step": 2970 }, { "epoch": 0.18575, "grad_norm": 2.34375, "grad_norm_var": 0.014225260416666666, "learning_rate": 0.0001, "loss": 7.7287, "loss/crossentropy": 2.358201503753662, "loss/hidden": 2.984375, "loss/jsd": 0.0, "loss/logits": 0.2252245992422104, "step": 2972 }, { "epoch": 0.185875, "grad_norm": 2.71875, "grad_norm_var": 0.015523274739583334, "learning_rate": 0.0001, "loss": 7.623, "loss/crossentropy": 2.3175272941589355, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.23873913288116455, "step": 2974 }, { "epoch": 0.186, "grad_norm": 2.5625, "grad_norm_var": 0.0131988525390625, "learning_rate": 0.0001, "loss": 7.6915, "loss/crossentropy": 2.590595841407776, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.24967636168003082, "step": 2976 }, { "epoch": 0.186125, "grad_norm": 2.484375, "grad_norm_var": 0.015306599934895833, "learning_rate": 0.0001, "loss": 7.6723, "loss/crossentropy": 2.2069579362869263, "loss/hidden": 3.0078125, "loss/jsd": 0.0, "loss/logits": 0.22214916348457336, "step": 2978 }, { "epoch": 0.18625, "grad_norm": 2.65625, "grad_norm_var": 0.015729777018229165, "learning_rate": 0.0001, "loss": 7.8361, "loss/crossentropy": 2.3302581310272217, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.2437615841627121, "step": 2980 }, { "epoch": 0.186375, "grad_norm": 2.453125, "grad_norm_var": 0.014232381184895834, "learning_rate": 0.0001, "loss": 7.6333, "loss/crossentropy": 2.2672786712646484, "loss/hidden": 2.9921875, "loss/jsd": 0.0, "loss/logits": 0.22940023988485336, "step": 2982 }, { "epoch": 0.1865, "grad_norm": 2.65625, "grad_norm_var": 0.014290364583333333, "learning_rate": 0.0001, "loss": 7.7551, "loss/crossentropy": 2.528477191925049, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.24014821648597717, "step": 2984 }, { "epoch": 0.186625, "grad_norm": 2.796875, "grad_norm_var": 0.0172515869140625, "learning_rate": 0.0001, "loss": 7.6699, "loss/crossentropy": 2.134658455848694, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.2487226352095604, "step": 2986 }, { "epoch": 0.18675, "grad_norm": 2.34375, "grad_norm_var": 0.0167388916015625, "learning_rate": 0.0001, "loss": 7.6979, "loss/crossentropy": 2.3620848655700684, "loss/hidden": 3.0546875, "loss/jsd": 0.0, "loss/logits": 0.24307234585285187, "step": 2988 }, { "epoch": 0.186875, "grad_norm": 2.671875, "grad_norm_var": 0.015250651041666667, "learning_rate": 0.0001, "loss": 7.7099, "loss/crossentropy": 2.4233195781707764, "loss/hidden": 3.0234375, "loss/jsd": 0.0, "loss/logits": 0.2325136736035347, "step": 2990 }, { "epoch": 0.187, "grad_norm": 2.515625, "grad_norm_var": 0.015462239583333334, "learning_rate": 0.0001, "loss": 7.6485, "loss/crossentropy": 2.2925750017166138, "loss/hidden": 3.0546875, "loss/jsd": 0.0, "loss/logits": 0.24893249571323395, "step": 2992 }, { "epoch": 0.187125, "grad_norm": 4.125, "grad_norm_var": 0.16298421223958334, "learning_rate": 0.0001, "loss": 7.7527, "loss/crossentropy": 2.1467760801315308, "loss/hidden": 2.9921875, "loss/jsd": 0.0, "loss/logits": 0.23566482961177826, "step": 2994 }, { "epoch": 0.18725, "grad_norm": 2.4375, "grad_norm_var": 0.16752827962239583, "learning_rate": 0.0001, "loss": 7.7408, "loss/crossentropy": 2.103184461593628, "loss/hidden": 2.984375, "loss/jsd": 0.0, "loss/logits": 0.2323935329914093, "step": 2996 }, { "epoch": 0.187375, "grad_norm": 2.65625, "grad_norm_var": 0.16705322265625, "learning_rate": 0.0001, "loss": 7.7494, "loss/crossentropy": 2.174781620502472, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.25046705454587936, "step": 2998 }, { "epoch": 0.1875, "grad_norm": 2.625, "grad_norm_var": 0.16780192057291668, "learning_rate": 0.0001, "loss": 7.7174, "loss/crossentropy": 2.2673741579055786, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.22653405368328094, "step": 3000 }, { "epoch": 0.187625, "grad_norm": 3.296875, "grad_norm_var": 0.19199117024739584, "learning_rate": 0.0001, "loss": 7.7319, "loss/crossentropy": 2.157706141471863, "loss/hidden": 2.8984375, "loss/jsd": 0.0, "loss/logits": 0.2287752330303192, "step": 3002 }, { "epoch": 0.18775, "grad_norm": 3.0625, "grad_norm_var": 0.2681223551432292, "learning_rate": 0.0001, "loss": 7.8417, "loss/crossentropy": 2.0640329122543335, "loss/hidden": 2.9140625, "loss/jsd": 0.0, "loss/logits": 0.2371016889810562, "step": 3004 }, { "epoch": 0.187875, "grad_norm": 2.328125, "grad_norm_var": 0.28884989420572915, "learning_rate": 0.0001, "loss": 7.4793, "loss/crossentropy": 2.1649757027626038, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.22954751551151276, "step": 3006 }, { "epoch": 0.188, "grad_norm": 2.9375, "grad_norm_var": 0.2865397135416667, "learning_rate": 0.0001, "loss": 7.7681, "loss/crossentropy": 2.402653217315674, "loss/hidden": 3.0703125, "loss/jsd": 0.0, "loss/logits": 0.2439199835062027, "step": 3008 }, { "epoch": 0.188125, "grad_norm": 2.671875, "grad_norm_var": 0.16442057291666667, "learning_rate": 0.0001, "loss": 7.7058, "loss/crossentropy": 2.0360541343688965, "loss/hidden": 3.0625, "loss/jsd": 0.0, "loss/logits": 0.2370767444372177, "step": 3010 }, { "epoch": 0.18825, "grad_norm": 2.78125, "grad_norm_var": 0.16503499348958334, "learning_rate": 0.0001, "loss": 7.78, "loss/crossentropy": 2.5376839637756348, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.25698722898960114, "step": 3012 }, { "epoch": 0.188375, "grad_norm": 2.546875, "grad_norm_var": 0.16298421223958334, "learning_rate": 0.0001, "loss": 7.6674, "loss/crossentropy": 2.153092384338379, "loss/hidden": 2.90625, "loss/jsd": 0.0, "loss/logits": 0.22976408153772354, "step": 3014 }, { "epoch": 0.1885, "grad_norm": 2.421875, "grad_norm_var": 0.16728413899739583, "learning_rate": 0.0001, "loss": 7.9659, "loss/crossentropy": 2.4408687353134155, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.2630517780780792, "step": 3016 }, { "epoch": 0.188625, "grad_norm": 2.546875, "grad_norm_var": 0.15907796223958334, "learning_rate": 0.0001, "loss": 7.6187, "loss/crossentropy": 2.1757636070251465, "loss/hidden": 2.953125, "loss/jsd": 0.0, "loss/logits": 0.2302849441766739, "step": 3018 }, { "epoch": 0.18875, "grad_norm": 2.71875, "grad_norm_var": 0.03792215983072917, "learning_rate": 0.0001, "loss": 7.5604, "loss/crossentropy": 2.064240336418152, "loss/hidden": 3.0546875, "loss/jsd": 0.0, "loss/logits": 0.22439640015363693, "step": 3020 }, { "epoch": 0.188875, "grad_norm": 2.46875, "grad_norm_var": 0.03224995930989583, "learning_rate": 0.0001, "loss": 7.6954, "loss/crossentropy": 2.1221320629119873, "loss/hidden": 2.953125, "loss/jsd": 0.0, "loss/logits": 0.22975972294807434, "step": 3022 }, { "epoch": 0.189, "grad_norm": 2.59375, "grad_norm_var": 0.023714192708333335, "learning_rate": 0.0001, "loss": 7.8211, "loss/crossentropy": 2.243198275566101, "loss/hidden": 3.03125, "loss/jsd": 0.0, "loss/logits": 0.2282658889889717, "step": 3024 }, { "epoch": 0.189125, "grad_norm": 2.5, "grad_norm_var": 0.0191558837890625, "learning_rate": 0.0001, "loss": 7.6639, "loss/crossentropy": 1.9989042282104492, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.23544684797525406, "step": 3026 }, { "epoch": 0.18925, "grad_norm": 2.671875, "grad_norm_var": 0.015510050455729167, "learning_rate": 0.0001, "loss": 7.6059, "loss/crossentropy": 2.0583502054214478, "loss/hidden": 3.03125, "loss/jsd": 0.0, "loss/logits": 0.22483647614717484, "step": 3028 }, { "epoch": 0.189375, "grad_norm": 2.5625, "grad_norm_var": 0.019481404622395834, "learning_rate": 0.0001, "loss": 7.5609, "loss/crossentropy": 2.213624954223633, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.23435892909765244, "step": 3030 }, { "epoch": 0.1895, "grad_norm": 2.625, "grad_norm_var": 0.015445963541666666, "learning_rate": 0.0001, "loss": 7.7361, "loss/crossentropy": 2.2522560358047485, "loss/hidden": 3.0546875, "loss/jsd": 0.0, "loss/logits": 0.24112869054079056, "step": 3032 }, { "epoch": 0.189625, "grad_norm": 2.453125, "grad_norm_var": 0.011865234375, "learning_rate": 0.0001, "loss": 7.9651, "loss/crossentropy": 2.325987696647644, "loss/hidden": 2.9609375, "loss/jsd": 0.0, "loss/logits": 0.22860444337129593, "step": 3034 }, { "epoch": 0.18975, "grad_norm": 2.75, "grad_norm_var": 0.016649373372395835, "learning_rate": 0.0001, "loss": 7.6774, "loss/crossentropy": 2.292188882827759, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.24093221873044968, "step": 3036 }, { "epoch": 0.189875, "grad_norm": 2.859375, "grad_norm_var": 0.020677693684895835, "learning_rate": 0.0001, "loss": 7.7072, "loss/crossentropy": 2.1392345428466797, "loss/hidden": 3.0625, "loss/jsd": 0.0, "loss/logits": 0.2271246463060379, "step": 3038 }, { "epoch": 0.19, "grad_norm": 2.328125, "grad_norm_var": 0.025584920247395834, "learning_rate": 0.0001, "loss": 7.7131, "loss/crossentropy": 2.3634947538375854, "loss/hidden": 3.03125, "loss/jsd": 0.0, "loss/logits": 0.2574180141091347, "step": 3040 }, { "epoch": 0.190125, "grad_norm": 2.5, "grad_norm_var": 0.025406901041666666, "learning_rate": 0.0001, "loss": 7.6816, "loss/crossentropy": 2.224321484565735, "loss/hidden": 3.03125, "loss/jsd": 0.0, "loss/logits": 0.22878948599100113, "step": 3042 }, { "epoch": 0.19025, "grad_norm": 2.65625, "grad_norm_var": 0.0256500244140625, "learning_rate": 0.0001, "loss": 7.7158, "loss/crossentropy": 2.471584916114807, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.2526979222893715, "step": 3044 }, { "epoch": 0.190375, "grad_norm": 2.703125, "grad_norm_var": 0.02271728515625, "learning_rate": 0.0001, "loss": 7.4846, "loss/crossentropy": 2.2407480478286743, "loss/hidden": 3.0390625, "loss/jsd": 0.0, "loss/logits": 0.23275888711214066, "step": 3046 }, { "epoch": 0.1905, "grad_norm": 2.484375, "grad_norm_var": 0.023140462239583333, "learning_rate": 0.0001, "loss": 7.7875, "loss/crossentropy": 2.2426388263702393, "loss/hidden": 3.0625, "loss/jsd": 0.0, "loss/logits": 0.23194129765033722, "step": 3048 }, { "epoch": 0.190625, "grad_norm": 2.921875, "grad_norm_var": 0.029488118489583333, "learning_rate": 0.0001, "loss": 7.6974, "loss/crossentropy": 2.3478230237960815, "loss/hidden": 3.0546875, "loss/jsd": 0.0, "loss/logits": 0.24639128148555756, "step": 3050 }, { "epoch": 0.19075, "grad_norm": 2.390625, "grad_norm_var": 0.030973307291666665, "learning_rate": 0.0001, "loss": 7.8786, "loss/crossentropy": 2.3184871673583984, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.2819037437438965, "step": 3052 }, { "epoch": 0.190875, "grad_norm": 2.59375, "grad_norm_var": 0.025520833333333333, "learning_rate": 0.0001, "loss": 7.6416, "loss/crossentropy": 2.140998363494873, "loss/hidden": 3.0546875, "loss/jsd": 0.0, "loss/logits": 0.22248034179210663, "step": 3054 }, { "epoch": 0.191, "grad_norm": 2.40625, "grad_norm_var": 0.024540201822916666, "learning_rate": 0.0001, "loss": 7.7976, "loss/crossentropy": 2.492767333984375, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.2436331883072853, "step": 3056 }, { "epoch": 0.191125, "grad_norm": 2.796875, "grad_norm_var": 0.026851399739583334, "learning_rate": 0.0001, "loss": 7.6777, "loss/crossentropy": 1.9927314519882202, "loss/hidden": 2.9140625, "loss/jsd": 0.0, "loss/logits": 0.22899659723043442, "step": 3058 }, { "epoch": 0.19125, "grad_norm": 2.5, "grad_norm_var": 0.0286285400390625, "learning_rate": 0.0001, "loss": 7.9659, "loss/crossentropy": 2.2941343784332275, "loss/hidden": 3.0234375, "loss/jsd": 0.0, "loss/logits": 0.24661502987146378, "step": 3060 }, { "epoch": 0.191375, "grad_norm": 2.5, "grad_norm_var": 0.02808837890625, "learning_rate": 0.0001, "loss": 7.8332, "loss/crossentropy": 2.2587934732437134, "loss/hidden": 2.9140625, "loss/jsd": 0.0, "loss/logits": 0.23479507118463516, "step": 3062 }, { "epoch": 0.1915, "grad_norm": 2.484375, "grad_norm_var": 0.0285797119140625, "learning_rate": 0.0001, "loss": 7.7855, "loss/crossentropy": 2.448140263557434, "loss/hidden": 3.046875, "loss/jsd": 0.0, "loss/logits": 0.24534112215042114, "step": 3064 }, { "epoch": 0.191625, "grad_norm": 2.734375, "grad_norm_var": 0.022802734375, "learning_rate": 0.0001, "loss": 7.6054, "loss/crossentropy": 2.5346790552139282, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.2549649178981781, "step": 3066 }, { "epoch": 0.19175, "grad_norm": 2.390625, "grad_norm_var": 0.0201324462890625, "learning_rate": 0.0001, "loss": 7.6652, "loss/crossentropy": 2.139198064804077, "loss/hidden": 3.015625, "loss/jsd": 0.0, "loss/logits": 0.22500670701265335, "step": 3068 }, { "epoch": 0.191875, "grad_norm": 2.5, "grad_norm_var": 0.020052083333333335, "learning_rate": 0.0001, "loss": 7.6343, "loss/crossentropy": 2.2101441621780396, "loss/hidden": 3.03125, "loss/jsd": 0.0, "loss/logits": 0.24504300951957703, "step": 3070 }, { "epoch": 0.192, "grad_norm": 2.78125, "grad_norm_var": 0.018387858072916666, "learning_rate": 0.0001, "loss": 7.8094, "loss/crossentropy": 2.387241005897522, "loss/hidden": 3.0703125, "loss/jsd": 0.0, "loss/logits": 0.23291389644145966, "step": 3072 }, { "epoch": 0.192125, "grad_norm": 2.515625, "grad_norm_var": 0.015632120768229167, "learning_rate": 0.0001, "loss": 7.657, "loss/crossentropy": 2.015101671218872, "loss/hidden": 3.0078125, "loss/jsd": 0.0, "loss/logits": 0.2168959379196167, "step": 3074 }, { "epoch": 0.19225, "grad_norm": 2.359375, "grad_norm_var": 0.016044108072916667, "learning_rate": 0.0001, "loss": 7.5738, "loss/crossentropy": 2.201832890510559, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.25159038603305817, "step": 3076 }, { "epoch": 0.192375, "grad_norm": 2.640625, "grad_norm_var": 0.017699178059895834, "learning_rate": 0.0001, "loss": 7.7274, "loss/crossentropy": 2.4729052782058716, "loss/hidden": 3.0703125, "loss/jsd": 0.0, "loss/logits": 0.2636168450117111, "step": 3078 }, { "epoch": 0.1925, "grad_norm": 2.5625, "grad_norm_var": 0.015868123372395834, "learning_rate": 0.0001, "loss": 7.4003, "loss/crossentropy": 2.096401810646057, "loss/hidden": 3.0703125, "loss/jsd": 0.0, "loss/logits": 0.22800646722316742, "step": 3080 }, { "epoch": 0.192625, "grad_norm": 2.71875, "grad_norm_var": 0.017316691080729165, "learning_rate": 0.0001, "loss": 7.5855, "loss/crossentropy": 2.1243752241134644, "loss/hidden": 2.921875, "loss/jsd": 0.0, "loss/logits": 0.2130807489156723, "step": 3082 }, { "epoch": 0.19275, "grad_norm": 2.59375, "grad_norm_var": 0.016258748372395833, "learning_rate": 0.0001, "loss": 7.6234, "loss/crossentropy": 2.393889904022217, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.23582585901021957, "step": 3084 }, { "epoch": 0.192875, "grad_norm": 2.453125, "grad_norm_var": 0.0202789306640625, "learning_rate": 0.0001, "loss": 7.7359, "loss/crossentropy": 2.530544877052307, "loss/hidden": 3.03125, "loss/jsd": 0.0, "loss/logits": 0.23919443786144257, "step": 3086 }, { "epoch": 0.193, "grad_norm": 2.453125, "grad_norm_var": 0.018895467122395832, "learning_rate": 0.0001, "loss": 7.7122, "loss/crossentropy": 2.14614474773407, "loss/hidden": 3.015625, "loss/jsd": 0.0, "loss/logits": 0.23280902951955795, "step": 3088 }, { "epoch": 0.193125, "grad_norm": 2.546875, "grad_norm_var": 0.025519816080729167, "learning_rate": 0.0001, "loss": 7.6703, "loss/crossentropy": 2.1698378324508667, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.23564688116312027, "step": 3090 }, { "epoch": 0.19325, "grad_norm": 2.296875, "grad_norm_var": 0.0282379150390625, "learning_rate": 0.0001, "loss": 7.5935, "loss/crossentropy": 2.2789262533187866, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.23355238884687424, "step": 3092 }, { "epoch": 0.193375, "grad_norm": 2.78125, "grad_norm_var": 0.030013020833333334, "learning_rate": 0.0001, "loss": 7.8383, "loss/crossentropy": 2.548181891441345, "loss/hidden": 3.0703125, "loss/jsd": 0.0, "loss/logits": 0.242641419172287, "step": 3094 }, { "epoch": 0.1935, "grad_norm": 2.4375, "grad_norm_var": 0.031525675455729166, "learning_rate": 0.0001, "loss": 7.5252, "loss/crossentropy": 2.34587025642395, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.2449270710349083, "step": 3096 }, { "epoch": 0.193625, "grad_norm": 3.015625, "grad_norm_var": 0.04414774576822917, "learning_rate": 0.0001, "loss": 7.5994, "loss/crossentropy": 2.2254514694213867, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.2519787400960922, "step": 3098 }, { "epoch": 0.19375, "grad_norm": 2.640625, "grad_norm_var": 0.060301717122395834, "learning_rate": 0.0001, "loss": 7.623, "loss/crossentropy": 2.3490875959396362, "loss/hidden": 2.953125, "loss/jsd": 0.0, "loss/logits": 0.24305754899978638, "step": 3100 }, { "epoch": 0.193875, "grad_norm": 2.34375, "grad_norm_var": 0.0575836181640625, "learning_rate": 0.0001, "loss": 7.7137, "loss/crossentropy": 2.021351933479309, "loss/hidden": 3.015625, "loss/jsd": 0.0, "loss/logits": 0.2211918607354164, "step": 3102 }, { "epoch": 0.194, "grad_norm": 2.8125, "grad_norm_var": 0.060791015625, "learning_rate": 0.0001, "loss": 7.7423, "loss/crossentropy": 2.1594278812408447, "loss/hidden": 3.0, "loss/jsd": 0.0, "loss/logits": 0.24009329080581665, "step": 3104 }, { "epoch": 0.194125, "grad_norm": 2.578125, "grad_norm_var": 0.059137980143229164, "learning_rate": 0.0001, "loss": 7.7889, "loss/crossentropy": 2.459377884864807, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.251203328371048, "step": 3106 }, { "epoch": 0.19425, "grad_norm": 2.921875, "grad_norm_var": 0.05748291015625, "learning_rate": 0.0001, "loss": 7.5124, "loss/crossentropy": 2.0707362294197083, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.21844393759965897, "step": 3108 }, { "epoch": 0.194375, "grad_norm": 2.453125, "grad_norm_var": 0.101025390625, "learning_rate": 0.0001, "loss": 7.6732, "loss/crossentropy": 2.2419523000717163, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.2376825362443924, "step": 3110 }, { "epoch": 0.1945, "grad_norm": 2.46875, "grad_norm_var": 0.10181376139322916, "learning_rate": 0.0001, "loss": 7.6479, "loss/crossentropy": 2.3259233236312866, "loss/hidden": 3.0703125, "loss/jsd": 0.0, "loss/logits": 0.25602586567401886, "step": 3112 }, { "epoch": 0.194625, "grad_norm": 2.796875, "grad_norm_var": 0.0926910400390625, "learning_rate": 0.0001, "loss": 7.5447, "loss/crossentropy": 1.9941769242286682, "loss/hidden": 2.8828125, "loss/jsd": 0.0, "loss/logits": 0.21642977744340897, "step": 3114 }, { "epoch": 0.19475, "grad_norm": 2.578125, "grad_norm_var": 0.09576822916666666, "learning_rate": 0.0001, "loss": 7.8191, "loss/crossentropy": 2.5602025985717773, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.25449611991643906, "step": 3116 }, { "epoch": 0.194875, "grad_norm": 2.484375, "grad_norm_var": 0.09041239420572916, "learning_rate": 0.0001, "loss": 7.8136, "loss/crossentropy": 2.3339617252349854, "loss/hidden": 2.953125, "loss/jsd": 0.0, "loss/logits": 0.23222877830266953, "step": 3118 }, { "epoch": 0.195, "grad_norm": 2.453125, "grad_norm_var": 0.09040425618489584, "learning_rate": 0.0001, "loss": 7.7202, "loss/crossentropy": 2.4900479316711426, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.24146823585033417, "step": 3120 }, { "epoch": 0.195125, "grad_norm": 2.734375, "grad_norm_var": 0.09036356608072917, "learning_rate": 0.0001, "loss": 7.6583, "loss/crossentropy": 2.191547393798828, "loss/hidden": 3.046875, "loss/jsd": 0.0, "loss/logits": 0.2142081782221794, "step": 3122 }, { "epoch": 0.19525, "grad_norm": 2.375, "grad_norm_var": 0.090185546875, "learning_rate": 0.0001, "loss": 7.6174, "loss/crossentropy": 2.3771393299102783, "loss/hidden": 2.9921875, "loss/jsd": 0.0, "loss/logits": 0.2471369206905365, "step": 3124 }, { "epoch": 0.195375, "grad_norm": 2.625, "grad_norm_var": 0.04331766764322917, "learning_rate": 0.0001, "loss": 7.5707, "loss/crossentropy": 2.3110562562942505, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.23804593086242676, "step": 3126 }, { "epoch": 0.1955, "grad_norm": 2.5, "grad_norm_var": 0.040379842122395836, "learning_rate": 0.0001, "loss": 7.6465, "loss/crossentropy": 2.0701069831848145, "loss/hidden": 2.984375, "loss/jsd": 0.0, "loss/logits": 0.23578406125307083, "step": 3128 }, { "epoch": 0.195625, "grad_norm": 2.421875, "grad_norm_var": 0.038134765625, "learning_rate": 0.0001, "loss": 7.6923, "loss/crossentropy": 2.2268728017807007, "loss/hidden": 2.890625, "loss/jsd": 0.0, "loss/logits": 0.22668010741472244, "step": 3130 }, { "epoch": 0.19575, "grad_norm": 2.546875, "grad_norm_var": 0.018724568684895835, "learning_rate": 0.0001, "loss": 7.7925, "loss/crossentropy": 2.288881540298462, "loss/hidden": 3.046875, "loss/jsd": 0.0, "loss/logits": 0.2400398850440979, "step": 3132 }, { "epoch": 0.195875, "grad_norm": 2.546875, "grad_norm_var": 0.019310506184895833, "learning_rate": 0.0001, "loss": 7.5776, "loss/crossentropy": 2.3762770891189575, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.24239980429410934, "step": 3134 }, { "epoch": 0.196, "grad_norm": 2.359375, "grad_norm_var": 0.022037760416666666, "learning_rate": 0.0001, "loss": 7.5924, "loss/crossentropy": 2.0780075788497925, "loss/hidden": 3.0, "loss/jsd": 0.0, "loss/logits": 0.2166167050600052, "step": 3136 }, { "epoch": 0.196125, "grad_norm": 2.359375, "grad_norm_var": 0.013505045572916667, "learning_rate": 0.0001, "loss": 7.6296, "loss/crossentropy": 2.2321070432662964, "loss/hidden": 3.0546875, "loss/jsd": 0.0, "loss/logits": 0.257367268204689, "step": 3138 }, { "epoch": 0.19625, "grad_norm": 2.5, "grad_norm_var": 0.0124664306640625, "learning_rate": 0.0001, "loss": 7.655, "loss/crossentropy": 2.2101333141326904, "loss/hidden": 2.96875, "loss/jsd": 0.0, "loss/logits": 0.22183632850646973, "step": 3140 }, { "epoch": 0.196375, "grad_norm": 2.5, "grad_norm_var": 0.013451131184895833, "learning_rate": 0.0001, "loss": 7.7539, "loss/crossentropy": 2.3512450456619263, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.2585148215293884, "step": 3142 }, { "epoch": 0.1965, "grad_norm": 2.578125, "grad_norm_var": 0.012360636393229167, "learning_rate": 0.0001, "loss": 7.6486, "loss/crossentropy": 2.224077582359314, "loss/hidden": 3.0078125, "loss/jsd": 0.0, "loss/logits": 0.23895263671875, "step": 3144 }, { "epoch": 0.196625, "grad_norm": 2.40625, "grad_norm_var": 0.01246337890625, "learning_rate": 0.0001, "loss": 7.5564, "loss/crossentropy": 2.1337246894836426, "loss/hidden": 2.96875, "loss/jsd": 0.0, "loss/logits": 0.2377912774682045, "step": 3146 }, { "epoch": 0.19675, "grad_norm": 2.46875, "grad_norm_var": 0.017671712239583335, "learning_rate": 0.0001, "loss": 7.6371, "loss/crossentropy": 2.1832423210144043, "loss/hidden": 2.96875, "loss/jsd": 0.0, "loss/logits": 0.2349269688129425, "step": 3148 }, { "epoch": 0.196875, "grad_norm": 2.515625, "grad_norm_var": 0.0158203125, "learning_rate": 0.0001, "loss": 7.6147, "loss/crossentropy": 2.2364492416381836, "loss/hidden": 3.0703125, "loss/jsd": 0.0, "loss/logits": 0.24817125499248505, "step": 3150 }, { "epoch": 0.197, "grad_norm": 2.21875, "grad_norm_var": 0.019254557291666665, "learning_rate": 0.0001, "loss": 7.5449, "loss/crossentropy": 2.30988085269928, "loss/hidden": 2.984375, "loss/jsd": 0.0, "loss/logits": 0.26015302538871765, "step": 3152 }, { "epoch": 0.197125, "grad_norm": 2.671875, "grad_norm_var": 0.020392862955729167, "learning_rate": 0.0001, "loss": 7.7401, "loss/crossentropy": 2.053266227245331, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.24950231611728668, "step": 3154 }, { "epoch": 0.19725, "grad_norm": 2.640625, "grad_norm_var": 0.021126302083333333, "learning_rate": 0.0001, "loss": 7.6193, "loss/crossentropy": 2.2058684825897217, "loss/hidden": 2.9296875, "loss/jsd": 0.0, "loss/logits": 0.23218996822834015, "step": 3156 }, { "epoch": 0.197375, "grad_norm": 2.671875, "grad_norm_var": 0.031148274739583332, "learning_rate": 0.0001, "loss": 7.83, "loss/crossentropy": 2.3524067401885986, "loss/hidden": 3.046875, "loss/jsd": 0.0, "loss/logits": 0.24826574325561523, "step": 3158 }, { "epoch": 0.1975, "grad_norm": 2.609375, "grad_norm_var": 0.032079060872395836, "learning_rate": 0.0001, "loss": 7.6028, "loss/crossentropy": 2.124649167060852, "loss/hidden": 3.0625, "loss/jsd": 0.0, "loss/logits": 0.23834124207496643, "step": 3160 }, { "epoch": 0.197625, "grad_norm": 3.203125, "grad_norm_var": 0.05683186848958333, "learning_rate": 0.0001, "loss": 7.6409, "loss/crossentropy": 2.170323371887207, "loss/hidden": 3.0390625, "loss/jsd": 0.0, "loss/logits": 0.24094149470329285, "step": 3162 }, { "epoch": 0.19775, "grad_norm": 2.65625, "grad_norm_var": 0.06315816243489583, "learning_rate": 0.0001, "loss": 7.7351, "loss/crossentropy": 2.340814709663391, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.2378515675663948, "step": 3164 }, { "epoch": 0.197875, "grad_norm": 2.640625, "grad_norm_var": 0.0582672119140625, "learning_rate": 0.0001, "loss": 7.7131, "loss/crossentropy": 2.454757571220398, "loss/hidden": 3.015625, "loss/jsd": 0.0, "loss/logits": 0.25292622298002243, "step": 3166 }, { "epoch": 0.198, "grad_norm": 2.59375, "grad_norm_var": 0.0470703125, "learning_rate": 0.0001, "loss": 7.7054, "loss/crossentropy": 2.3518433570861816, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.2326052561402321, "step": 3168 }, { "epoch": 0.198125, "grad_norm": 2.375, "grad_norm_var": 0.049103800455729166, "learning_rate": 0.0001, "loss": 7.8036, "loss/crossentropy": 2.3869314193725586, "loss/hidden": 2.984375, "loss/jsd": 0.0, "loss/logits": 0.2253723442554474, "step": 3170 }, { "epoch": 0.19825, "grad_norm": 2.453125, "grad_norm_var": 0.05646158854166667, "learning_rate": 0.0001, "loss": 7.549, "loss/crossentropy": 2.1406772136688232, "loss/hidden": 2.96875, "loss/jsd": 0.0, "loss/logits": 0.2136363908648491, "step": 3172 }, { "epoch": 0.198375, "grad_norm": 2.65625, "grad_norm_var": 0.050959269205729164, "learning_rate": 0.0001, "loss": 7.7547, "loss/crossentropy": 2.276672065258026, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.2241700440645218, "step": 3174 }, { "epoch": 0.1985, "grad_norm": 2.234375, "grad_norm_var": 0.056696573893229164, "learning_rate": 0.0001, "loss": 7.5706, "loss/crossentropy": 2.0704278349876404, "loss/hidden": 3.046875, "loss/jsd": 0.0, "loss/logits": 0.22979146987199783, "step": 3176 }, { "epoch": 0.198625, "grad_norm": 2.625, "grad_norm_var": 0.025178019205729166, "learning_rate": 0.0001, "loss": 7.6319, "loss/crossentropy": 2.155359983444214, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.23411893844604492, "step": 3178 }, { "epoch": 0.19875, "grad_norm": 2.890625, "grad_norm_var": 0.026708984375, "learning_rate": 0.0001, "loss": 7.6763, "loss/crossentropy": 2.17472767829895, "loss/hidden": 2.9921875, "loss/jsd": 0.0, "loss/logits": 0.2461443468928337, "step": 3180 }, { "epoch": 0.198875, "grad_norm": 2.515625, "grad_norm_var": 0.02847900390625, "learning_rate": 0.0001, "loss": 7.61, "loss/crossentropy": 2.401307702064514, "loss/hidden": 3.0078125, "loss/jsd": 0.0, "loss/logits": 0.2419404238462448, "step": 3182 }, { "epoch": 0.199, "grad_norm": 2.359375, "grad_norm_var": 0.03395894368489583, "learning_rate": 0.0001, "loss": 7.5815, "loss/crossentropy": 2.303532361984253, "loss/hidden": 2.9296875, "loss/jsd": 0.0, "loss/logits": 0.23136408627033234, "step": 3184 }, { "epoch": 0.199125, "grad_norm": 2.8125, "grad_norm_var": 0.03798726399739583, "learning_rate": 0.0001, "loss": 7.7585, "loss/crossentropy": 2.691552758216858, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.24934251606464386, "step": 3186 }, { "epoch": 0.19925, "grad_norm": 2.234375, "grad_norm_var": 0.041975911458333334, "learning_rate": 0.0001, "loss": 7.6867, "loss/crossentropy": 2.1433998346328735, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.23058529198169708, "step": 3188 }, { "epoch": 0.199375, "grad_norm": 2.484375, "grad_norm_var": 0.04368387858072917, "learning_rate": 0.0001, "loss": 7.6658, "loss/crossentropy": 2.075712561607361, "loss/hidden": 3.046875, "loss/jsd": 0.0, "loss/logits": 0.23563528060913086, "step": 3190 }, { "epoch": 0.1995, "grad_norm": 2.515625, "grad_norm_var": 0.038849894205729166, "learning_rate": 0.0001, "loss": 7.6327, "loss/crossentropy": 2.239920735359192, "loss/hidden": 3.0390625, "loss/jsd": 0.0, "loss/logits": 0.24973846971988678, "step": 3192 }, { "epoch": 0.199625, "grad_norm": 2.5625, "grad_norm_var": 0.03837788899739583, "learning_rate": 0.0001, "loss": 7.6491, "loss/crossentropy": 2.2711654901504517, "loss/hidden": 2.921875, "loss/jsd": 0.0, "loss/logits": 0.2149556428194046, "step": 3194 }, { "epoch": 0.19975, "grad_norm": 2.703125, "grad_norm_var": 0.033543904622395836, "learning_rate": 0.0001, "loss": 7.6259, "loss/crossentropy": 2.2742727994918823, "loss/hidden": 3.0703125, "loss/jsd": 0.0, "loss/logits": 0.24378067255020142, "step": 3196 }, { "epoch": 0.199875, "grad_norm": 2.296875, "grad_norm_var": 0.03476155598958333, "learning_rate": 0.0001, "loss": 7.6477, "loss/crossentropy": 2.074104130268097, "loss/hidden": 3.0546875, "loss/jsd": 0.0, "loss/logits": 0.22755059599876404, "step": 3198 }, { "epoch": 0.2, "grad_norm": 2.71875, "grad_norm_var": 0.03190816243489583, "learning_rate": 0.0001, "loss": 7.4978, "loss/crossentropy": 2.2103521823883057, "loss/hidden": 3.03125, "loss/jsd": 0.0, "loss/logits": 0.22538809478282928, "step": 3200 }, { "epoch": 0.200125, "grad_norm": 2.28125, "grad_norm_var": 0.027132161458333335, "learning_rate": 0.0001, "loss": 7.4522, "loss/crossentropy": 2.262304186820984, "loss/hidden": 2.9453125, "loss/jsd": 0.0, "loss/logits": 0.22689391672611237, "step": 3202 }, { "epoch": 0.20025, "grad_norm": 2.484375, "grad_norm_var": 0.024527994791666667, "learning_rate": 0.0001, "loss": 7.6634, "loss/crossentropy": 2.159933626651764, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.2550048828125, "step": 3204 }, { "epoch": 0.200375, "grad_norm": 2.5, "grad_norm_var": 0.022847493489583332, "learning_rate": 0.0001, "loss": 7.7359, "loss/crossentropy": 2.2928545475006104, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.2492647022008896, "step": 3206 }, { "epoch": 0.2005, "grad_norm": 2.453125, "grad_norm_var": 0.022435506184895832, "learning_rate": 0.0001, "loss": 7.6104, "loss/crossentropy": 2.151831030845642, "loss/hidden": 2.875, "loss/jsd": 0.0, "loss/logits": 0.21393808722496033, "step": 3208 }, { "epoch": 0.200625, "grad_norm": 2.46875, "grad_norm_var": 0.022005208333333335, "learning_rate": 0.0001, "loss": 7.8164, "loss/crossentropy": 2.166096329689026, "loss/hidden": 3.046875, "loss/jsd": 0.0, "loss/logits": 0.23471974581480026, "step": 3210 }, { "epoch": 0.20075, "grad_norm": 2.734375, "grad_norm_var": 0.018277994791666665, "learning_rate": 0.0001, "loss": 7.565, "loss/crossentropy": 2.181807518005371, "loss/hidden": 3.015625, "loss/jsd": 0.0, "loss/logits": 0.23415996134281158, "step": 3212 }, { "epoch": 0.200875, "grad_norm": 2.453125, "grad_norm_var": 0.019071451822916665, "learning_rate": 0.0001, "loss": 7.5114, "loss/crossentropy": 2.1912107467651367, "loss/hidden": 2.984375, "loss/jsd": 0.0, "loss/logits": 0.22295787930488586, "step": 3214 }, { "epoch": 0.201, "grad_norm": 2.453125, "grad_norm_var": 0.01558837890625, "learning_rate": 0.0001, "loss": 7.4921, "loss/crossentropy": 2.297171950340271, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.25197841227054596, "step": 3216 }, { "epoch": 0.201125, "grad_norm": 2.359375, "grad_norm_var": 0.015364583333333333, "learning_rate": 0.0001, "loss": 7.7764, "loss/crossentropy": 2.484106659889221, "loss/hidden": 3.0625, "loss/jsd": 0.0, "loss/logits": 0.2731506675481796, "step": 3218 }, { "epoch": 0.20125, "grad_norm": 2.359375, "grad_norm_var": 0.017975870768229166, "learning_rate": 0.0001, "loss": 7.6025, "loss/crossentropy": 2.156631350517273, "loss/hidden": 2.953125, "loss/jsd": 0.0, "loss/logits": 0.21255087107419968, "step": 3220 }, { "epoch": 0.201375, "grad_norm": 2.421875, "grad_norm_var": 0.0179351806640625, "learning_rate": 0.0001, "loss": 7.5934, "loss/crossentropy": 2.3413909673690796, "loss/hidden": 2.8828125, "loss/jsd": 0.0, "loss/logits": 0.22059939801692963, "step": 3222 }, { "epoch": 0.2015, "grad_norm": 3.203125, "grad_norm_var": 0.05054423014322917, "learning_rate": 0.0001, "loss": 7.6958, "loss/crossentropy": 2.313757300376892, "loss/hidden": 3.046875, "loss/jsd": 0.0, "loss/logits": 0.23688851296901703, "step": 3224 }, { "epoch": 0.201625, "grad_norm": 2.640625, "grad_norm_var": 0.052643839518229166, "learning_rate": 0.0001, "loss": 7.5624, "loss/crossentropy": 2.3218902349472046, "loss/hidden": 2.953125, "loss/jsd": 0.0, "loss/logits": 0.23602160066366196, "step": 3226 }, { "epoch": 0.20175, "grad_norm": 2.421875, "grad_norm_var": 0.0511138916015625, "learning_rate": 0.0001, "loss": 7.6753, "loss/crossentropy": 2.6514742374420166, "loss/hidden": 3.0546875, "loss/jsd": 0.0, "loss/logits": 0.2617443650960922, "step": 3228 }, { "epoch": 0.201875, "grad_norm": 2.328125, "grad_norm_var": 0.05115559895833333, "learning_rate": 0.0001, "loss": 7.7007, "loss/crossentropy": 2.27648389339447, "loss/hidden": 3.03125, "loss/jsd": 0.0, "loss/logits": 0.23478404432535172, "step": 3230 }, { "epoch": 0.202, "grad_norm": 2.75, "grad_norm_var": 0.05434468587239583, "learning_rate": 0.0001, "loss": 7.7911, "loss/crossentropy": 2.3282746076583862, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.24819976091384888, "step": 3232 }, { "epoch": 0.202125, "grad_norm": 2.359375, "grad_norm_var": 0.05576883951822917, "learning_rate": 0.0001, "loss": 7.5815, "loss/crossentropy": 2.0597460865974426, "loss/hidden": 2.9609375, "loss/jsd": 0.0, "loss/logits": 0.23130206763744354, "step": 3234 }, { "epoch": 0.20225, "grad_norm": 2.484375, "grad_norm_var": 0.04765625, "learning_rate": 0.0001, "loss": 7.777, "loss/crossentropy": 2.3545076847076416, "loss/hidden": 2.921875, "loss/jsd": 0.0, "loss/logits": 0.22461315244436264, "step": 3236 }, { "epoch": 0.202375, "grad_norm": 2.734375, "grad_norm_var": 0.045506795247395836, "learning_rate": 0.0001, "loss": 7.6893, "loss/crossentropy": 2.2642041444778442, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.24759702384471893, "step": 3238 }, { "epoch": 0.2025, "grad_norm": 2.359375, "grad_norm_var": 0.020438639322916667, "learning_rate": 0.0001, "loss": 7.5394, "loss/crossentropy": 2.405397891998291, "loss/hidden": 3.046875, "loss/jsd": 0.0, "loss/logits": 0.24760686606168747, "step": 3240 }, { "epoch": 0.202625, "grad_norm": 2.4375, "grad_norm_var": 0.018050130208333334, "learning_rate": 0.0001, "loss": 7.5978, "loss/crossentropy": 1.8945466876029968, "loss/hidden": 2.9609375, "loss/jsd": 0.0, "loss/logits": 0.20826154947280884, "step": 3242 }, { "epoch": 0.20275, "grad_norm": 2.59375, "grad_norm_var": 0.018550618489583334, "learning_rate": 0.0001, "loss": 7.6012, "loss/crossentropy": 2.2214646339416504, "loss/hidden": 2.984375, "loss/jsd": 0.0, "loss/logits": 0.21611948311328888, "step": 3244 }, { "epoch": 0.202875, "grad_norm": 2.40625, "grad_norm_var": 0.01685791015625, "learning_rate": 0.0001, "loss": 7.5799, "loss/crossentropy": 2.41989004611969, "loss/hidden": 2.9609375, "loss/jsd": 0.0, "loss/logits": 0.2531380206346512, "step": 3246 }, { "epoch": 0.203, "grad_norm": 2.75, "grad_norm_var": 0.016499837239583332, "learning_rate": 0.0001, "loss": 7.6609, "loss/crossentropy": 2.1457839012145996, "loss/hidden": 2.953125, "loss/jsd": 0.0, "loss/logits": 0.22547265142202377, "step": 3248 }, { "epoch": 0.203125, "grad_norm": 2.515625, "grad_norm_var": 0.011815388997395834, "learning_rate": 0.0001, "loss": 7.6925, "loss/crossentropy": 2.202640414237976, "loss/hidden": 2.984375, "loss/jsd": 0.0, "loss/logits": 0.24964796006679535, "step": 3250 }, { "epoch": 0.20325, "grad_norm": 2.59375, "grad_norm_var": 0.01177978515625, "learning_rate": 0.0001, "loss": 7.8597, "loss/crossentropy": 2.2329805493354797, "loss/hidden": 3.03125, "loss/jsd": 0.0, "loss/logits": 0.21109139919281006, "step": 3252 }, { "epoch": 0.203375, "grad_norm": 2.546875, "grad_norm_var": 0.013602701822916667, "learning_rate": 0.0001, "loss": 7.6793, "loss/crossentropy": 2.37657368183136, "loss/hidden": 3.0234375, "loss/jsd": 0.0, "loss/logits": 0.22429364919662476, "step": 3254 }, { "epoch": 0.2035, "grad_norm": 2.515625, "grad_norm_var": 0.013654581705729167, "learning_rate": 0.0001, "loss": 7.5988, "loss/crossentropy": 2.240600347518921, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.24599803984165192, "step": 3256 }, { "epoch": 0.203625, "grad_norm": 2.296875, "grad_norm_var": 0.020670572916666668, "learning_rate": 0.0001, "loss": 7.6654, "loss/crossentropy": 2.5686757564544678, "loss/hidden": 3.0234375, "loss/jsd": 0.0, "loss/logits": 0.263079509139061, "step": 3258 }, { "epoch": 0.20375, "grad_norm": 2.53125, "grad_norm_var": 0.023265584309895834, "learning_rate": 0.0001, "loss": 7.824, "loss/crossentropy": 2.150991916656494, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.2586375027894974, "step": 3260 }, { "epoch": 0.203875, "grad_norm": 2.453125, "grad_norm_var": 0.026048787434895835, "learning_rate": 0.0001, "loss": 7.6643, "loss/crossentropy": 2.370198965072632, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.23691494762897491, "step": 3262 }, { "epoch": 0.204, "grad_norm": 2.28125, "grad_norm_var": 0.029390462239583335, "learning_rate": 0.0001, "loss": 7.5166, "loss/crossentropy": 2.2037036418914795, "loss/hidden": 3.03125, "loss/jsd": 0.0, "loss/logits": 0.23511488735675812, "step": 3264 }, { "epoch": 0.204125, "grad_norm": 2.625, "grad_norm_var": 0.032548014322916666, "learning_rate": 0.0001, "loss": 7.631, "loss/crossentropy": 2.2161275148391724, "loss/hidden": 3.015625, "loss/jsd": 0.0, "loss/logits": 0.22294826805591583, "step": 3266 }, { "epoch": 0.20425, "grad_norm": 3.015625, "grad_norm_var": 0.06555887858072916, "learning_rate": 0.0001, "loss": 7.8449, "loss/crossentropy": 2.403158664703369, "loss/hidden": 3.0078125, "loss/jsd": 0.0, "loss/logits": 0.24931098520755768, "step": 3268 }, { "epoch": 0.204375, "grad_norm": 2.4375, "grad_norm_var": 0.06297098795572917, "learning_rate": 0.0001, "loss": 7.8327, "loss/crossentropy": 2.4231287240982056, "loss/hidden": 3.0390625, "loss/jsd": 0.0, "loss/logits": 0.24163633584976196, "step": 3270 }, { "epoch": 0.2045, "grad_norm": 2.53125, "grad_norm_var": 0.06093648274739583, "learning_rate": 0.0001, "loss": 7.4887, "loss/crossentropy": 2.196571111679077, "loss/hidden": 2.9140625, "loss/jsd": 0.0, "loss/logits": 0.2219029664993286, "step": 3272 }, { "epoch": 0.204625, "grad_norm": 2.5, "grad_norm_var": 0.05232645670572917, "learning_rate": 0.0001, "loss": 7.6319, "loss/crossentropy": 2.336767315864563, "loss/hidden": 2.921875, "loss/jsd": 0.0, "loss/logits": 0.23825974017381668, "step": 3274 }, { "epoch": 0.20475, "grad_norm": 2.25, "grad_norm_var": 0.05592041015625, "learning_rate": 0.0001, "loss": 7.4521, "loss/crossentropy": 1.9283623099327087, "loss/hidden": 3.0703125, "loss/jsd": 0.0, "loss/logits": 0.2104254812002182, "step": 3276 }, { "epoch": 0.204875, "grad_norm": 2.75, "grad_norm_var": 0.05458882649739583, "learning_rate": 0.0001, "loss": 7.7259, "loss/crossentropy": 2.250017523765564, "loss/hidden": 2.984375, "loss/jsd": 0.0, "loss/logits": 0.23542233556509018, "step": 3278 }, { "epoch": 0.205, "grad_norm": 2.265625, "grad_norm_var": 0.053807576497395836, "learning_rate": 0.0001, "loss": 7.5853, "loss/crossentropy": 2.0355631709098816, "loss/hidden": 2.96875, "loss/jsd": 0.0, "loss/logits": 0.21646135300397873, "step": 3280 }, { "epoch": 0.205125, "grad_norm": 2.53125, "grad_norm_var": 0.056761678059895834, "learning_rate": 0.0001, "loss": 7.5638, "loss/crossentropy": 2.1133495569229126, "loss/hidden": 3.015625, "loss/jsd": 0.0, "loss/logits": 0.2248711958527565, "step": 3282 }, { "epoch": 0.20525, "grad_norm": 2.421875, "grad_norm_var": 0.0180816650390625, "learning_rate": 0.0001, "loss": 7.6095, "loss/crossentropy": 2.4140706062316895, "loss/hidden": 3.046875, "loss/jsd": 0.0, "loss/logits": 0.22342108935117722, "step": 3284 }, { "epoch": 0.205375, "grad_norm": 2.453125, "grad_norm_var": 0.01802978515625, "learning_rate": 0.0001, "loss": 7.6281, "loss/crossentropy": 2.4801841974258423, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.24090874940156937, "step": 3286 }, { "epoch": 0.2055, "grad_norm": 2.671875, "grad_norm_var": 0.021122233072916666, "learning_rate": 0.0001, "loss": 7.6194, "loss/crossentropy": 2.333125591278076, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.23281607031822205, "step": 3288 }, { "epoch": 0.205625, "grad_norm": 2.328125, "grad_norm_var": 0.022443644205729165, "learning_rate": 0.0001, "loss": 7.5728, "loss/crossentropy": 2.057736098766327, "loss/hidden": 3.0390625, "loss/jsd": 0.0, "loss/logits": 0.21230417490005493, "step": 3290 }, { "epoch": 0.20575, "grad_norm": 2.5, "grad_norm_var": 0.019025675455729165, "learning_rate": 0.0001, "loss": 7.5554, "loss/crossentropy": 2.2296417951583862, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.24230563640594482, "step": 3292 }, { "epoch": 0.205875, "grad_norm": 2.640625, "grad_norm_var": 0.0163482666015625, "learning_rate": 0.0001, "loss": 7.5161, "loss/crossentropy": 2.4877594709396362, "loss/hidden": 2.9921875, "loss/jsd": 0.0, "loss/logits": 0.23580122739076614, "step": 3294 }, { "epoch": 0.206, "grad_norm": 2.65625, "grad_norm_var": 0.015816243489583333, "learning_rate": 0.0001, "loss": 7.6607, "loss/crossentropy": 2.22783100605011, "loss/hidden": 2.9921875, "loss/jsd": 0.0, "loss/logits": 0.23493072390556335, "step": 3296 }, { "epoch": 0.206125, "grad_norm": 3.484375, "grad_norm_var": 0.06994527180989583, "learning_rate": 0.0001, "loss": 7.548, "loss/crossentropy": 2.177275240421295, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.23492375016212463, "step": 3298 }, { "epoch": 0.20625, "grad_norm": 2.1875, "grad_norm_var": 0.07803446451822917, "learning_rate": 0.0001, "loss": 7.612, "loss/crossentropy": 2.100243628025055, "loss/hidden": 3.0625, "loss/jsd": 0.0, "loss/logits": 0.21778713911771774, "step": 3300 }, { "epoch": 0.206375, "grad_norm": 2.40625, "grad_norm_var": 0.08179931640625, "learning_rate": 0.0001, "loss": 7.6651, "loss/crossentropy": 2.612051248550415, "loss/hidden": 2.96875, "loss/jsd": 0.0, "loss/logits": 0.236809641122818, "step": 3302 }, { "epoch": 0.2065, "grad_norm": 2.53125, "grad_norm_var": 0.10703125, "learning_rate": 0.0001, "loss": 7.7176, "loss/crossentropy": 2.3358840942382812, "loss/hidden": 2.953125, "loss/jsd": 0.0, "loss/logits": 0.22349786013364792, "step": 3304 }, { "epoch": 0.206625, "grad_norm": 2.390625, "grad_norm_var": 0.10436197916666666, "learning_rate": 0.0001, "loss": 7.6017, "loss/crossentropy": 2.0664124488830566, "loss/hidden": 3.0, "loss/jsd": 0.0, "loss/logits": 0.22234025597572327, "step": 3306 }, { "epoch": 0.20675, "grad_norm": 2.765625, "grad_norm_var": 0.105126953125, "learning_rate": 0.0001, "loss": 7.7791, "loss/crossentropy": 2.3125079870224, "loss/hidden": 2.9921875, "loss/jsd": 0.0, "loss/logits": 0.22895997017621994, "step": 3308 }, { "epoch": 0.206875, "grad_norm": 2.5625, "grad_norm_var": 0.10261942545572916, "learning_rate": 0.0001, "loss": 7.761, "loss/crossentropy": 2.0990543365478516, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.22909457981586456, "step": 3310 }, { "epoch": 0.207, "grad_norm": 2.359375, "grad_norm_var": 0.11274312337239584, "learning_rate": 0.0001, "loss": 7.6989, "loss/crossentropy": 2.3832221031188965, "loss/hidden": 3.046875, "loss/jsd": 0.0, "loss/logits": 0.23881246894598007, "step": 3312 }, { "epoch": 0.207125, "grad_norm": 2.5625, "grad_norm_var": 0.053099568684895834, "learning_rate": 0.0001, "loss": 7.5829, "loss/crossentropy": 2.070538818836212, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.22222436219453812, "step": 3314 }, { "epoch": 0.20725, "grad_norm": 2.359375, "grad_norm_var": 0.0491607666015625, "learning_rate": 0.0001, "loss": 7.7099, "loss/crossentropy": 2.0685949325561523, "loss/hidden": 3.0703125, "loss/jsd": 0.0, "loss/logits": 0.225556842982769, "step": 3316 }, { "epoch": 0.207375, "grad_norm": 2.453125, "grad_norm_var": 0.0464752197265625, "learning_rate": 0.0001, "loss": 7.5792, "loss/crossentropy": 2.2039034366607666, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.23356658220291138, "step": 3318 }, { "epoch": 0.2075, "grad_norm": 2.359375, "grad_norm_var": 0.017772420247395834, "learning_rate": 0.0001, "loss": 7.6484, "loss/crossentropy": 2.520377278327942, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.23397985100746155, "step": 3320 }, { "epoch": 0.207625, "grad_norm": 2.421875, "grad_norm_var": 0.018724568684895835, "learning_rate": 0.0001, "loss": 7.6201, "loss/crossentropy": 2.232245087623596, "loss/hidden": 2.9921875, "loss/jsd": 0.0, "loss/logits": 0.2378198802471161, "step": 3322 }, { "epoch": 0.20775, "grad_norm": 2.703125, "grad_norm_var": 0.017574055989583334, "learning_rate": 0.0001, "loss": 7.7734, "loss/crossentropy": 2.3666106462478638, "loss/hidden": 2.8984375, "loss/jsd": 0.0, "loss/logits": 0.22776059806346893, "step": 3324 }, { "epoch": 0.207875, "grad_norm": 2.453125, "grad_norm_var": 0.026090494791666665, "learning_rate": 0.0001, "loss": 7.692, "loss/crossentropy": 2.4405359029769897, "loss/hidden": 2.9921875, "loss/jsd": 0.0, "loss/logits": 0.23942655324935913, "step": 3326 }, { "epoch": 0.208, "grad_norm": 2.3125, "grad_norm_var": 0.026097615559895832, "learning_rate": 0.0001, "loss": 7.4516, "loss/crossentropy": 2.165894627571106, "loss/hidden": 2.9921875, "loss/jsd": 0.0, "loss/logits": 0.21770837903022766, "step": 3328 }, { "epoch": 0.208125, "grad_norm": 2.3125, "grad_norm_var": 0.026106770833333334, "learning_rate": 0.0001, "loss": 7.5851, "loss/crossentropy": 2.2975982427597046, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.24745838344097137, "step": 3330 }, { "epoch": 0.20825, "grad_norm": 2.28125, "grad_norm_var": 0.0307769775390625, "learning_rate": 0.0001, "loss": 7.5979, "loss/crossentropy": 2.1849515438079834, "loss/hidden": 3.0, "loss/jsd": 0.0, "loss/logits": 0.23390965163707733, "step": 3332 }, { "epoch": 0.208375, "grad_norm": 2.484375, "grad_norm_var": 0.028743489583333334, "learning_rate": 0.0001, "loss": 7.7235, "loss/crossentropy": 2.336664915084839, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.22321896255016327, "step": 3334 }, { "epoch": 0.2085, "grad_norm": 2.625, "grad_norm_var": 0.029752604166666665, "learning_rate": 0.0001, "loss": 7.6849, "loss/crossentropy": 2.4030131101608276, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.24722521752119064, "step": 3336 }, { "epoch": 0.208625, "grad_norm": 2.40625, "grad_norm_var": 0.028348795572916665, "learning_rate": 0.0001, "loss": 7.6468, "loss/crossentropy": 2.451479196548462, "loss/hidden": 3.0546875, "loss/jsd": 0.0, "loss/logits": 0.2548409700393677, "step": 3338 }, { "epoch": 0.20875, "grad_norm": 2.53125, "grad_norm_var": 0.024079386393229166, "learning_rate": 0.0001, "loss": 7.7279, "loss/crossentropy": 2.391486406326294, "loss/hidden": 2.984375, "loss/jsd": 0.0, "loss/logits": 0.24848993867635727, "step": 3340 }, { "epoch": 0.208875, "grad_norm": 2.5625, "grad_norm_var": 0.015412394205729167, "learning_rate": 0.0001, "loss": 7.6761, "loss/crossentropy": 2.242367148399353, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.23639392107725143, "step": 3342 }, { "epoch": 0.209, "grad_norm": 2.296875, "grad_norm_var": 0.016487630208333333, "learning_rate": 0.0001, "loss": 7.6526, "loss/crossentropy": 2.131048798561096, "loss/hidden": 2.875, "loss/jsd": 0.0, "loss/logits": 0.2150106355547905, "step": 3344 }, { "epoch": 0.209125, "grad_norm": 2.359375, "grad_norm_var": 0.0172515869140625, "learning_rate": 0.0001, "loss": 7.4834, "loss/crossentropy": 2.2289204597473145, "loss/hidden": 3.015625, "loss/jsd": 0.0, "loss/logits": 0.2406204640865326, "step": 3346 }, { "epoch": 0.20925, "grad_norm": 3.078125, "grad_norm_var": 0.04006245930989583, "learning_rate": 0.0001, "loss": 7.6395, "loss/crossentropy": 2.335645318031311, "loss/hidden": 3.0, "loss/jsd": 0.0, "loss/logits": 0.2360967919230461, "step": 3348 }, { "epoch": 0.209375, "grad_norm": 2.3125, "grad_norm_var": 0.0597076416015625, "learning_rate": 0.0001, "loss": 7.6344, "loss/crossentropy": 2.3322980403900146, "loss/hidden": 3.03125, "loss/jsd": 0.0, "loss/logits": 0.2533845752477646, "step": 3350 }, { "epoch": 0.2095, "grad_norm": 2.328125, "grad_norm_var": 0.06763916015625, "learning_rate": 0.0001, "loss": 7.394, "loss/crossentropy": 2.0071592926979065, "loss/hidden": 2.8828125, "loss/jsd": 0.0, "loss/logits": 0.21447932720184326, "step": 3352 }, { "epoch": 0.209625, "grad_norm": 2.671875, "grad_norm_var": 0.07714436848958334, "learning_rate": 0.0001, "loss": 7.5678, "loss/crossentropy": 2.3756041526794434, "loss/hidden": 3.015625, "loss/jsd": 0.0, "loss/logits": 0.23200811445713043, "step": 3354 }, { "epoch": 0.20975, "grad_norm": 2.34375, "grad_norm_var": 0.08035380045572917, "learning_rate": 0.0001, "loss": 7.4439, "loss/crossentropy": 1.9585599303245544, "loss/hidden": 2.9609375, "loss/jsd": 0.0, "loss/logits": 0.19989113509655, "step": 3356 }, { "epoch": 0.209875, "grad_norm": 2.640625, "grad_norm_var": 0.08413798014322917, "learning_rate": 0.0001, "loss": 7.3117, "loss/crossentropy": 2.1502009630203247, "loss/hidden": 2.96875, "loss/jsd": 0.0, "loss/logits": 0.22656698524951935, "step": 3358 }, { "epoch": 0.21, "grad_norm": 2.375, "grad_norm_var": 0.077978515625, "learning_rate": 0.0001, "loss": 7.6045, "loss/crossentropy": 2.3314971923828125, "loss/hidden": 3.0703125, "loss/jsd": 0.0, "loss/logits": 0.2628200501203537, "step": 3360 }, { "epoch": 0.210125, "grad_norm": 2.375, "grad_norm_var": 0.07669169108072917, "learning_rate": 0.0001, "loss": 7.6801, "loss/crossentropy": 2.407312273979187, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.2756526470184326, "step": 3362 }, { "epoch": 0.21025, "grad_norm": 2.578125, "grad_norm_var": 0.06636962890625, "learning_rate": 0.0001, "loss": 7.7946, "loss/crossentropy": 2.308638334274292, "loss/hidden": 2.90625, "loss/jsd": 0.0, "loss/logits": 0.22128069400787354, "step": 3364 }, { "epoch": 0.210375, "grad_norm": 2.625, "grad_norm_var": 0.048192342122395836, "learning_rate": 0.0001, "loss": 7.4601, "loss/crossentropy": 2.227054715156555, "loss/hidden": 2.96875, "loss/jsd": 0.0, "loss/logits": 0.223979651927948, "step": 3366 }, { "epoch": 0.2105, "grad_norm": 2.25, "grad_norm_var": 0.04175516764322917, "learning_rate": 0.0001, "loss": 7.3921, "loss/crossentropy": 2.1322200298309326, "loss/hidden": 3.046875, "loss/jsd": 0.0, "loss/logits": 0.25718845427036285, "step": 3368 }, { "epoch": 0.210625, "grad_norm": 2.640625, "grad_norm_var": 0.03443603515625, "learning_rate": 0.0001, "loss": 7.5104, "loss/crossentropy": 2.1420618891716003, "loss/hidden": 2.96875, "loss/jsd": 0.0, "loss/logits": 0.2320111319422722, "step": 3370 }, { "epoch": 0.21075, "grad_norm": 2.3125, "grad_norm_var": 0.03416239420572917, "learning_rate": 0.0001, "loss": 7.7232, "loss/crossentropy": 2.3579647541046143, "loss/hidden": 3.0078125, "loss/jsd": 0.0, "loss/logits": 0.2402171939611435, "step": 3372 }, { "epoch": 0.210875, "grad_norm": 3.34375, "grad_norm_var": 0.07431233723958333, "learning_rate": 0.0001, "loss": 7.4416, "loss/crossentropy": 2.3642072677612305, "loss/hidden": 2.9296875, "loss/jsd": 0.0, "loss/logits": 0.22273917496204376, "step": 3374 }, { "epoch": 0.211, "grad_norm": 2.328125, "grad_norm_var": 0.07669270833333333, "learning_rate": 0.0001, "loss": 7.4455, "loss/crossentropy": 2.2815465927124023, "loss/hidden": 3.0625, "loss/jsd": 0.0, "loss/logits": 0.23410624265670776, "step": 3376 }, { "epoch": 0.211125, "grad_norm": 2.546875, "grad_norm_var": 0.07057291666666667, "learning_rate": 0.0001, "loss": 7.5148, "loss/crossentropy": 2.066421687602997, "loss/hidden": 3.015625, "loss/jsd": 0.0, "loss/logits": 0.24254445731639862, "step": 3378 }, { "epoch": 0.21125, "grad_norm": 2.453125, "grad_norm_var": 0.06165364583333333, "learning_rate": 0.0001, "loss": 7.5341, "loss/crossentropy": 2.347060441970825, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.25217752158641815, "step": 3380 }, { "epoch": 0.211375, "grad_norm": 2.640625, "grad_norm_var": 0.06360270182291666, "learning_rate": 0.0001, "loss": 7.7507, "loss/crossentropy": 2.1539812088012695, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.26475973427295685, "step": 3382 }, { "epoch": 0.2115, "grad_norm": 2.609375, "grad_norm_var": 0.0606353759765625, "learning_rate": 0.0001, "loss": 7.7259, "loss/crossentropy": 2.1600695848464966, "loss/hidden": 2.9921875, "loss/jsd": 0.0, "loss/logits": 0.2167871668934822, "step": 3384 }, { "epoch": 0.211625, "grad_norm": 2.828125, "grad_norm_var": 0.06337788899739584, "learning_rate": 0.0001, "loss": 7.6085, "loss/crossentropy": 2.286532163619995, "loss/hidden": 3.0234375, "loss/jsd": 0.0, "loss/logits": 0.2332899570465088, "step": 3386 }, { "epoch": 0.21175, "grad_norm": 2.515625, "grad_norm_var": 0.05858968098958333, "learning_rate": 0.0001, "loss": 7.562, "loss/crossentropy": 2.1641604900360107, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.2164478898048401, "step": 3388 }, { "epoch": 0.211875, "grad_norm": 2.84375, "grad_norm_var": 0.0242095947265625, "learning_rate": 0.0001, "loss": 7.7682, "loss/crossentropy": 2.3409924507141113, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.23107275366783142, "step": 3390 }, { "epoch": 0.212, "grad_norm": 2.65625, "grad_norm_var": 0.019364420572916666, "learning_rate": 0.0001, "loss": 7.5186, "loss/crossentropy": 2.2500524520874023, "loss/hidden": 2.984375, "loss/jsd": 0.0, "loss/logits": 0.213922381401062, "step": 3392 }, { "epoch": 0.212125, "grad_norm": 2.328125, "grad_norm_var": 0.038426717122395836, "learning_rate": 0.0001, "loss": 7.5321, "loss/crossentropy": 2.1275582909584045, "loss/hidden": 2.890625, "loss/jsd": 0.0, "loss/logits": 0.22479213774204254, "step": 3394 }, { "epoch": 0.21225, "grad_norm": 2.578125, "grad_norm_var": 0.0404449462890625, "learning_rate": 0.0001, "loss": 7.5174, "loss/crossentropy": 2.212312698364258, "loss/hidden": 3.046875, "loss/jsd": 0.0, "loss/logits": 0.24751190841197968, "step": 3396 }, { "epoch": 0.212375, "grad_norm": 2.328125, "grad_norm_var": 0.042236328125, "learning_rate": 0.0001, "loss": 7.4043, "loss/crossentropy": 1.9285815954208374, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.19548063725233078, "step": 3398 }, { "epoch": 0.2125, "grad_norm": 2.5625, "grad_norm_var": 0.0437408447265625, "learning_rate": 0.0001, "loss": 7.6525, "loss/crossentropy": 2.2272496223449707, "loss/hidden": 2.9921875, "loss/jsd": 0.0, "loss/logits": 0.24632243812084198, "step": 3400 }, { "epoch": 0.212625, "grad_norm": 2.46875, "grad_norm_var": 0.03942057291666667, "learning_rate": 0.0001, "loss": 7.6239, "loss/crossentropy": 2.521925210952759, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.23339618742465973, "step": 3402 }, { "epoch": 0.21275, "grad_norm": 2.296875, "grad_norm_var": 0.04421284993489583, "learning_rate": 0.0001, "loss": 7.5628, "loss/crossentropy": 2.089443802833557, "loss/hidden": 3.015625, "loss/jsd": 0.0, "loss/logits": 0.2286679819226265, "step": 3404 }, { "epoch": 0.212875, "grad_norm": 2.59375, "grad_norm_var": 0.03938700358072917, "learning_rate": 0.0001, "loss": 7.6109, "loss/crossentropy": 2.3914257287979126, "loss/hidden": 2.984375, "loss/jsd": 0.0, "loss/logits": 0.23510746657848358, "step": 3406 }, { "epoch": 0.213, "grad_norm": 2.171875, "grad_norm_var": 0.038863118489583334, "learning_rate": 0.0001, "loss": 7.456, "loss/crossentropy": 2.1810909509658813, "loss/hidden": 2.96875, "loss/jsd": 0.0, "loss/logits": 0.21339446306228638, "step": 3408 }, { "epoch": 0.213125, "grad_norm": 2.640625, "grad_norm_var": 0.031126912434895834, "learning_rate": 0.0001, "loss": 7.6262, "loss/crossentropy": 2.068147301673889, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.23478543758392334, "step": 3410 }, { "epoch": 0.21325, "grad_norm": 2.4375, "grad_norm_var": 0.022786458333333332, "learning_rate": 0.0001, "loss": 7.6825, "loss/crossentropy": 2.299628734588623, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.24276187270879745, "step": 3412 }, { "epoch": 0.213375, "grad_norm": 2.53125, "grad_norm_var": 0.0220703125, "learning_rate": 0.0001, "loss": 7.5285, "loss/crossentropy": 2.539917826652527, "loss/hidden": 3.0546875, "loss/jsd": 0.0, "loss/logits": 0.24582375586032867, "step": 3414 }, { "epoch": 0.2135, "grad_norm": 2.578125, "grad_norm_var": 0.016039021809895835, "learning_rate": 0.0001, "loss": 7.7053, "loss/crossentropy": 2.2273647785186768, "loss/hidden": 2.96875, "loss/jsd": 0.0, "loss/logits": 0.24760635942220688, "step": 3416 }, { "epoch": 0.213625, "grad_norm": 2.421875, "grad_norm_var": 0.023143513997395834, "learning_rate": 0.0001, "loss": 7.5504, "loss/crossentropy": 2.0793908834457397, "loss/hidden": 2.8984375, "loss/jsd": 0.0, "loss/logits": 0.20350514352321625, "step": 3418 }, { "epoch": 0.21375, "grad_norm": 2.6875, "grad_norm_var": 0.030887858072916666, "learning_rate": 0.0001, "loss": 7.6719, "loss/crossentropy": 2.2238458395004272, "loss/hidden": 2.90625, "loss/jsd": 0.0, "loss/logits": 0.23141219466924667, "step": 3420 }, { "epoch": 0.213875, "grad_norm": 2.78125, "grad_norm_var": 0.03818359375, "learning_rate": 0.0001, "loss": 7.6894, "loss/crossentropy": 2.2496371269226074, "loss/hidden": 3.0078125, "loss/jsd": 0.0, "loss/logits": 0.24826089292764664, "step": 3422 }, { "epoch": 0.214, "grad_norm": 2.453125, "grad_norm_var": 0.031083170572916666, "learning_rate": 0.0001, "loss": 7.5904, "loss/crossentropy": 2.2201942205429077, "loss/hidden": 2.9921875, "loss/jsd": 0.0, "loss/logits": 0.22636444121599197, "step": 3424 }, { "epoch": 0.214125, "grad_norm": 2.5625, "grad_norm_var": 0.029117838541666666, "learning_rate": 0.0001, "loss": 7.7162, "loss/crossentropy": 2.387493371963501, "loss/hidden": 2.9296875, "loss/jsd": 0.0, "loss/logits": 0.23339012265205383, "step": 3426 }, { "epoch": 0.21425, "grad_norm": 2.625, "grad_norm_var": 0.038263956705729164, "learning_rate": 0.0001, "loss": 7.4986, "loss/crossentropy": 2.1064809560775757, "loss/hidden": 2.84375, "loss/jsd": 0.0, "loss/logits": 0.2052219733595848, "step": 3428 }, { "epoch": 0.214375, "grad_norm": 2.5, "grad_norm_var": 0.043603515625, "learning_rate": 0.0001, "loss": 7.7774, "loss/crossentropy": 2.430534839630127, "loss/hidden": 3.0703125, "loss/jsd": 0.0, "loss/logits": 0.2444680631160736, "step": 3430 }, { "epoch": 0.2145, "grad_norm": 2.4375, "grad_norm_var": 0.0421539306640625, "learning_rate": 0.0001, "loss": 7.6541, "loss/crossentropy": 2.2829513549804688, "loss/hidden": 3.03125, "loss/jsd": 0.0, "loss/logits": 0.23072397708892822, "step": 3432 }, { "epoch": 0.214625, "grad_norm": 2.4375, "grad_norm_var": 0.047215779622395836, "learning_rate": 0.0001, "loss": 7.4546, "loss/crossentropy": 2.240882158279419, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.24445254355669022, "step": 3434 }, { "epoch": 0.21475, "grad_norm": 2.8125, "grad_norm_var": 0.04260660807291667, "learning_rate": 0.0001, "loss": 7.585, "loss/crossentropy": 1.915956974029541, "loss/hidden": 2.9453125, "loss/jsd": 0.0, "loss/logits": 0.20426518470048904, "step": 3436 }, { "epoch": 0.214875, "grad_norm": 2.484375, "grad_norm_var": 0.038141886393229164, "learning_rate": 0.0001, "loss": 7.6817, "loss/crossentropy": 2.0629988312721252, "loss/hidden": 2.953125, "loss/jsd": 0.0, "loss/logits": 0.23025204241275787, "step": 3438 }, { "epoch": 0.215, "grad_norm": 2.390625, "grad_norm_var": 0.0386627197265625, "learning_rate": 0.0001, "loss": 7.4915, "loss/crossentropy": 2.1499475240707397, "loss/hidden": 2.9453125, "loss/jsd": 0.0, "loss/logits": 0.2223310023546219, "step": 3440 }, { "epoch": 0.215125, "grad_norm": 2.640625, "grad_norm_var": 0.0455474853515625, "learning_rate": 0.0001, "loss": 7.6482, "loss/crossentropy": 2.323388457298279, "loss/hidden": 3.0703125, "loss/jsd": 0.0, "loss/logits": 0.23185274004936218, "step": 3442 }, { "epoch": 0.21525, "grad_norm": 2.359375, "grad_norm_var": 0.0388580322265625, "learning_rate": 0.0001, "loss": 7.6131, "loss/crossentropy": 2.4512449502944946, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.2369954064488411, "step": 3444 }, { "epoch": 0.215375, "grad_norm": 2.4375, "grad_norm_var": 0.032373046875, "learning_rate": 0.0001, "loss": 7.637, "loss/crossentropy": 2.382017970085144, "loss/hidden": 3.0390625, "loss/jsd": 0.0, "loss/logits": 0.23922354727983475, "step": 3446 }, { "epoch": 0.2155, "grad_norm": 2.390625, "grad_norm_var": 0.03400065104166667, "learning_rate": 0.0001, "loss": 7.5746, "loss/crossentropy": 2.27813720703125, "loss/hidden": 2.984375, "loss/jsd": 0.0, "loss/logits": 0.2455897182226181, "step": 3448 }, { "epoch": 0.215625, "grad_norm": 2.5, "grad_norm_var": 0.024247233072916666, "learning_rate": 0.0001, "loss": 7.6352, "loss/crossentropy": 2.0362807512283325, "loss/hidden": 2.9296875, "loss/jsd": 0.0, "loss/logits": 0.24546430259943008, "step": 3450 }, { "epoch": 0.21575, "grad_norm": 2.390625, "grad_norm_var": 0.019071451822916665, "learning_rate": 0.0001, "loss": 7.5596, "loss/crossentropy": 2.3591285943984985, "loss/hidden": 3.015625, "loss/jsd": 0.0, "loss/logits": 0.2355157434940338, "step": 3452 }, { "epoch": 0.215875, "grad_norm": 2.5625, "grad_norm_var": 0.016950480143229165, "learning_rate": 0.0001, "loss": 7.4538, "loss/crossentropy": 2.0467506051063538, "loss/hidden": 2.9921875, "loss/jsd": 0.0, "loss/logits": 0.2161174640059471, "step": 3454 }, { "epoch": 0.216, "grad_norm": 2.234375, "grad_norm_var": 0.020247395833333334, "learning_rate": 0.0001, "loss": 7.744, "loss/crossentropy": 2.383505702018738, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.23139237612485886, "step": 3456 }, { "epoch": 0.216125, "grad_norm": 2.46875, "grad_norm_var": 0.007906087239583333, "learning_rate": 0.0001, "loss": 7.6497, "loss/crossentropy": 2.422740340232849, "loss/hidden": 2.96875, "loss/jsd": 0.0, "loss/logits": 0.2240883857011795, "step": 3458 }, { "epoch": 0.21625, "grad_norm": 2.4375, "grad_norm_var": 0.0074615478515625, "learning_rate": 0.0001, "loss": 7.6628, "loss/crossentropy": 2.494243025779724, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.2455742135643959, "step": 3460 }, { "epoch": 0.216375, "grad_norm": 2.546875, "grad_norm_var": 0.0082672119140625, "learning_rate": 0.0001, "loss": 7.5369, "loss/crossentropy": 2.1609995365142822, "loss/hidden": 2.984375, "loss/jsd": 0.0, "loss/logits": 0.2432066798210144, "step": 3462 }, { "epoch": 0.2165, "grad_norm": 2.359375, "grad_norm_var": 0.0080230712890625, "learning_rate": 0.0001, "loss": 7.6719, "loss/crossentropy": 2.152750015258789, "loss/hidden": 3.0078125, "loss/jsd": 0.0, "loss/logits": 0.22096887230873108, "step": 3464 }, { "epoch": 0.216625, "grad_norm": 2.5625, "grad_norm_var": 0.008275349934895834, "learning_rate": 0.0001, "loss": 7.6158, "loss/crossentropy": 2.2551519870758057, "loss/hidden": 2.890625, "loss/jsd": 0.0, "loss/logits": 0.22652066498994827, "step": 3466 }, { "epoch": 0.21675, "grad_norm": 2.390625, "grad_norm_var": 0.0111236572265625, "learning_rate": 0.0001, "loss": 7.6574, "loss/crossentropy": 2.2164549827575684, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.23051265627145767, "step": 3468 }, { "epoch": 0.216875, "grad_norm": 2.328125, "grad_norm_var": 0.01197509765625, "learning_rate": 0.0001, "loss": 7.5793, "loss/crossentropy": 2.5120718479156494, "loss/hidden": 3.0703125, "loss/jsd": 0.0, "loss/logits": 0.24404268711805344, "step": 3470 }, { "epoch": 0.217, "grad_norm": 2.390625, "grad_norm_var": 0.009358723958333334, "learning_rate": 0.0001, "loss": 7.4738, "loss/crossentropy": 2.171375274658203, "loss/hidden": 2.9296875, "loss/jsd": 0.0, "loss/logits": 0.23342353105545044, "step": 3472 }, { "epoch": 0.217125, "grad_norm": 2.34375, "grad_norm_var": 0.00963134765625, "learning_rate": 0.0001, "loss": 7.6139, "loss/crossentropy": 2.095268964767456, "loss/hidden": 2.8828125, "loss/jsd": 0.0, "loss/logits": 0.23750004172325134, "step": 3474 }, { "epoch": 0.21725, "grad_norm": 2.40625, "grad_norm_var": 0.009748331705729167, "learning_rate": 0.0001, "loss": 7.6385, "loss/crossentropy": 2.187831997871399, "loss/hidden": 2.875, "loss/jsd": 0.0, "loss/logits": 0.21564120054244995, "step": 3476 }, { "epoch": 0.217375, "grad_norm": 2.375, "grad_norm_var": 0.010758463541666667, "learning_rate": 0.0001, "loss": 7.649, "loss/crossentropy": 2.339821934700012, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.24484576284885406, "step": 3478 }, { "epoch": 0.2175, "grad_norm": 2.46875, "grad_norm_var": 0.010188802083333334, "learning_rate": 0.0001, "loss": 7.7182, "loss/crossentropy": 2.085095524787903, "loss/hidden": 2.9296875, "loss/jsd": 0.0, "loss/logits": 0.21802203357219696, "step": 3480 }, { "epoch": 0.217625, "grad_norm": 2.453125, "grad_norm_var": 0.009748331705729167, "learning_rate": 0.0001, "loss": 7.6739, "loss/crossentropy": 2.4005582332611084, "loss/hidden": 2.953125, "loss/jsd": 0.0, "loss/logits": 0.22154852002859116, "step": 3482 }, { "epoch": 0.21775, "grad_norm": 4.375, "grad_norm_var": 0.24265034993489584, "learning_rate": 0.0001, "loss": 7.5902, "loss/crossentropy": 2.235607147216797, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.3020609989762306, "step": 3484 }, { "epoch": 0.217875, "grad_norm": 2.703125, "grad_norm_var": 0.2362457275390625, "learning_rate": 0.0001, "loss": 7.5921, "loss/crossentropy": 2.37876033782959, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.238236665725708, "step": 3486 }, { "epoch": 0.218, "grad_norm": 2.828125, "grad_norm_var": 0.23375244140625, "learning_rate": 0.0001, "loss": 7.7093, "loss/crossentropy": 2.446916341781616, "loss/hidden": 2.9609375, "loss/jsd": 0.0, "loss/logits": 0.2403736189007759, "step": 3488 }, { "epoch": 0.218125, "grad_norm": 2.28125, "grad_norm_var": 0.23952534993489583, "learning_rate": 0.0001, "loss": 7.6001, "loss/crossentropy": 2.703190565109253, "loss/hidden": 3.0234375, "loss/jsd": 0.0, "loss/logits": 0.24328448623418808, "step": 3490 }, { "epoch": 0.21825, "grad_norm": 2.515625, "grad_norm_var": 0.23860677083333334, "learning_rate": 0.0001, "loss": 7.6135, "loss/crossentropy": 2.266252636909485, "loss/hidden": 3.03125, "loss/jsd": 0.0, "loss/logits": 0.2304093837738037, "step": 3492 }, { "epoch": 0.218375, "grad_norm": 3.0625, "grad_norm_var": 0.24362691243489584, "learning_rate": 0.0001, "loss": 7.6869, "loss/crossentropy": 2.2512608766555786, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.23177290707826614, "step": 3494 }, { "epoch": 0.2185, "grad_norm": 2.390625, "grad_norm_var": 0.24628804524739584, "learning_rate": 0.0001, "loss": 7.8453, "loss/crossentropy": 2.3944746255874634, "loss/hidden": 3.0, "loss/jsd": 0.0, "loss/logits": 0.22361087799072266, "step": 3496 }, { "epoch": 0.218625, "grad_norm": 2.34375, "grad_norm_var": 0.25588785807291664, "learning_rate": 0.0001, "loss": 7.3925, "loss/crossentropy": 1.9911410212516785, "loss/hidden": 2.8671875, "loss/jsd": 0.0, "loss/logits": 0.2150706946849823, "step": 3498 }, { "epoch": 0.21875, "grad_norm": 2.84375, "grad_norm_var": 0.05894266764322917, "learning_rate": 0.0001, "loss": 7.6356, "loss/crossentropy": 2.290215253829956, "loss/hidden": 2.96875, "loss/jsd": 0.0, "loss/logits": 0.22882460057735443, "step": 3500 }, { "epoch": 0.218875, "grad_norm": 2.15625, "grad_norm_var": 0.07018941243489583, "learning_rate": 0.0001, "loss": 7.5163, "loss/crossentropy": 2.201116681098938, "loss/hidden": 3.03125, "loss/jsd": 0.0, "loss/logits": 0.20892898738384247, "step": 3502 }, { "epoch": 0.219, "grad_norm": 2.5, "grad_norm_var": 0.06816304524739583, "learning_rate": 0.0001, "loss": 7.7937, "loss/crossentropy": 2.210504412651062, "loss/hidden": 2.9921875, "loss/jsd": 0.0, "loss/logits": 0.21811866015195847, "step": 3504 }, { "epoch": 0.219125, "grad_norm": 2.5625, "grad_norm_var": 0.06256510416666666, "learning_rate": 0.0001, "loss": 7.6281, "loss/crossentropy": 2.355741262435913, "loss/hidden": 2.90625, "loss/jsd": 0.0, "loss/logits": 0.23753728717565536, "step": 3506 }, { "epoch": 0.21925, "grad_norm": 2.296875, "grad_norm_var": 0.06705729166666667, "learning_rate": 0.0001, "loss": 7.5304, "loss/crossentropy": 2.2776095867156982, "loss/hidden": 3.03125, "loss/jsd": 0.0, "loss/logits": 0.22738799452781677, "step": 3508 }, { "epoch": 0.219375, "grad_norm": 2.40625, "grad_norm_var": 0.046605428059895836, "learning_rate": 0.0001, "loss": 7.5032, "loss/crossentropy": 2.295590400695801, "loss/hidden": 2.953125, "loss/jsd": 0.0, "loss/logits": 0.2449091300368309, "step": 3510 }, { "epoch": 0.2195, "grad_norm": 2.671875, "grad_norm_var": 0.03515625, "learning_rate": 0.0001, "loss": 7.6288, "loss/crossentropy": 2.326041340827942, "loss/hidden": 3.0625, "loss/jsd": 0.0, "loss/logits": 0.2660403698682785, "step": 3512 }, { "epoch": 0.219625, "grad_norm": 2.28125, "grad_norm_var": 0.03609619140625, "learning_rate": 0.0001, "loss": 7.5858, "loss/crossentropy": 2.432402729988098, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.25160669535398483, "step": 3514 }, { "epoch": 0.21975, "grad_norm": 2.890625, "grad_norm_var": 0.038895670572916666, "learning_rate": 0.0001, "loss": 7.818, "loss/crossentropy": 2.349487543106079, "loss/hidden": 2.9296875, "loss/jsd": 0.0, "loss/logits": 0.23811831325292587, "step": 3516 }, { "epoch": 0.219875, "grad_norm": 2.546875, "grad_norm_var": 0.0508941650390625, "learning_rate": 0.0001, "loss": 7.7315, "loss/crossentropy": 2.3854116201400757, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.24041782319545746, "step": 3518 }, { "epoch": 0.22, "grad_norm": 2.265625, "grad_norm_var": 0.048075358072916664, "learning_rate": 0.0001, "loss": 7.5721, "loss/crossentropy": 2.253451347351074, "loss/hidden": 2.921875, "loss/jsd": 0.0, "loss/logits": 0.2183404043316841, "step": 3520 }, { "epoch": 0.220125, "grad_norm": 2.453125, "grad_norm_var": 0.04853108723958333, "learning_rate": 0.0001, "loss": 7.4311, "loss/crossentropy": 1.989369809627533, "loss/hidden": 2.9375, "loss/jsd": 0.0, "loss/logits": 0.218755841255188, "step": 3522 }, { "epoch": 0.22025, "grad_norm": 2.390625, "grad_norm_var": 0.046187337239583334, "learning_rate": 0.0001, "loss": 7.5011, "loss/crossentropy": 2.3954477310180664, "loss/hidden": 3.03125, "loss/jsd": 0.0, "loss/logits": 0.254411518573761, "step": 3524 }, { "epoch": 0.220375, "grad_norm": 2.53125, "grad_norm_var": 0.04576416015625, "learning_rate": 0.0001, "loss": 7.5453, "loss/crossentropy": 1.9688183665275574, "loss/hidden": 3.0078125, "loss/jsd": 0.0, "loss/logits": 0.2124113142490387, "step": 3526 }, { "epoch": 0.2205, "grad_norm": 2.40625, "grad_norm_var": 0.04296468098958333, "learning_rate": 0.0001, "loss": 7.5413, "loss/crossentropy": 2.304739475250244, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.24053708463907242, "step": 3528 }, { "epoch": 0.220625, "grad_norm": 2.40625, "grad_norm_var": 0.040282185872395834, "learning_rate": 0.0001, "loss": 7.5082, "loss/crossentropy": 2.188236117362976, "loss/hidden": 3.0625, "loss/jsd": 0.0, "loss/logits": 0.2476644068956375, "step": 3530 }, { "epoch": 0.22075, "grad_norm": 2.28125, "grad_norm_var": 0.03193257649739583, "learning_rate": 0.0001, "loss": 7.4935, "loss/crossentropy": 2.452435612678528, "loss/hidden": 3.0078125, "loss/jsd": 0.0, "loss/logits": 0.2143031656742096, "step": 3532 }, { "epoch": 0.220875, "grad_norm": 2.21875, "grad_norm_var": 0.012613932291666666, "learning_rate": 0.0001, "loss": 7.4794, "loss/crossentropy": 2.0797160863876343, "loss/hidden": 3.015625, "loss/jsd": 0.0, "loss/logits": 0.2122730240225792, "step": 3534 }, { "epoch": 0.221, "grad_norm": 2.484375, "grad_norm_var": 0.010595703125, "learning_rate": 0.0001, "loss": 7.4632, "loss/crossentropy": 2.4266319274902344, "loss/hidden": 3.0078125, "loss/jsd": 0.0, "loss/logits": 0.2510553449392319, "step": 3536 }, { "epoch": 0.221125, "grad_norm": 2.203125, "grad_norm_var": 0.012450154622395833, "learning_rate": 0.0001, "loss": 7.6415, "loss/crossentropy": 2.059949517250061, "loss/hidden": 2.890625, "loss/jsd": 0.0, "loss/logits": 0.23185402899980545, "step": 3538 }, { "epoch": 0.22125, "grad_norm": 2.53125, "grad_norm_var": 0.01265869140625, "learning_rate": 0.0001, "loss": 7.6176, "loss/crossentropy": 2.5653083324432373, "loss/hidden": 3.046875, "loss/jsd": 0.0, "loss/logits": 0.23594635725021362, "step": 3540 }, { "epoch": 0.221375, "grad_norm": 2.265625, "grad_norm_var": 0.013667805989583334, "learning_rate": 0.0001, "loss": 7.481, "loss/crossentropy": 2.3301087617874146, "loss/hidden": 2.953125, "loss/jsd": 0.0, "loss/logits": 0.2452377825975418, "step": 3542 }, { "epoch": 0.2215, "grad_norm": 2.34375, "grad_norm_var": 0.0131011962890625, "learning_rate": 0.0001, "loss": 7.5606, "loss/crossentropy": 2.2409706115722656, "loss/hidden": 3.015625, "loss/jsd": 0.0, "loss/logits": 0.2660795971751213, "step": 3544 }, { "epoch": 0.221625, "grad_norm": 2.65625, "grad_norm_var": 0.020589192708333332, "learning_rate": 0.0001, "loss": 7.6275, "loss/crossentropy": 2.341962456703186, "loss/hidden": 2.890625, "loss/jsd": 0.0, "loss/logits": 0.23292769491672516, "step": 3546 }, { "epoch": 0.22175, "grad_norm": 2.546875, "grad_norm_var": 0.020503743489583334, "learning_rate": 0.0001, "loss": 7.5761, "loss/crossentropy": 2.27796733379364, "loss/hidden": 2.953125, "loss/jsd": 0.0, "loss/logits": 0.22092991322278976, "step": 3548 }, { "epoch": 0.221875, "grad_norm": 2.375, "grad_norm_var": 0.0172515869140625, "learning_rate": 0.0001, "loss": 7.5742, "loss/crossentropy": 2.189824938774109, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.23947207629680634, "step": 3550 }, { "epoch": 0.222, "grad_norm": 2.453125, "grad_norm_var": 0.017186482747395832, "learning_rate": 0.0001, "loss": 7.5813, "loss/crossentropy": 2.4192023277282715, "loss/hidden": 2.9609375, "loss/jsd": 0.0, "loss/logits": 0.24019119888544083, "step": 3552 }, { "epoch": 0.222125, "grad_norm": 2.375, "grad_norm_var": 0.0127838134765625, "learning_rate": 0.0001, "loss": 7.5347, "loss/crossentropy": 2.158234715461731, "loss/hidden": 2.9375, "loss/jsd": 0.0, "loss/logits": 0.2341742068529129, "step": 3554 }, { "epoch": 0.22225, "grad_norm": 3.203125, "grad_norm_var": 0.047587076822916664, "learning_rate": 0.0001, "loss": 7.378, "loss/crossentropy": 2.2715145349502563, "loss/hidden": 2.8828125, "loss/jsd": 0.0, "loss/logits": 0.22904645651578903, "step": 3556 }, { "epoch": 0.222375, "grad_norm": 2.375, "grad_norm_var": 0.26266276041666664, "learning_rate": 0.0001, "loss": 7.6181, "loss/crossentropy": 2.2331241369247437, "loss/hidden": 3.015625, "loss/jsd": 0.0, "loss/logits": 0.25211699306964874, "step": 3558 }, { "epoch": 0.2225, "grad_norm": 2.375, "grad_norm_var": 0.2688140869140625, "learning_rate": 0.0001, "loss": 7.5199, "loss/crossentropy": 1.9353508949279785, "loss/hidden": 2.90625, "loss/jsd": 0.0, "loss/logits": 0.2270013615489006, "step": 3560 }, { "epoch": 0.222625, "grad_norm": 2.40625, "grad_norm_var": 0.27647196451822914, "learning_rate": 0.0001, "loss": 7.5403, "loss/crossentropy": 2.4099196195602417, "loss/hidden": 2.9921875, "loss/jsd": 0.0, "loss/logits": 0.24662812799215317, "step": 3562 }, { "epoch": 0.22275, "grad_norm": 2.71875, "grad_norm_var": 0.27763264973958335, "learning_rate": 0.0001, "loss": 7.7591, "loss/crossentropy": 2.163403630256653, "loss/hidden": 2.9453125, "loss/jsd": 0.0, "loss/logits": 0.22525054216384888, "step": 3564 }, { "epoch": 0.222875, "grad_norm": 2.484375, "grad_norm_var": 0.27356363932291666, "learning_rate": 0.0001, "loss": 7.6724, "loss/crossentropy": 2.3632307052612305, "loss/hidden": 2.953125, "loss/jsd": 0.0, "loss/logits": 0.221424400806427, "step": 3566 }, { "epoch": 0.223, "grad_norm": 2.359375, "grad_norm_var": 0.2787394205729167, "learning_rate": 0.0001, "loss": 7.4518, "loss/crossentropy": 2.079641282558441, "loss/hidden": 2.859375, "loss/jsd": 0.0, "loss/logits": 0.22443066537380219, "step": 3568 }, { "epoch": 0.223125, "grad_norm": 2.578125, "grad_norm_var": 0.28662109375, "learning_rate": 0.0001, "loss": 7.5553, "loss/crossentropy": 2.0237990021705627, "loss/hidden": 2.8515625, "loss/jsd": 0.0, "loss/logits": 0.21297388523817062, "step": 3570 }, { "epoch": 0.22325, "grad_norm": 2.625, "grad_norm_var": 0.26023763020833335, "learning_rate": 0.0001, "loss": 7.9066, "loss/crossentropy": 2.4305167198181152, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.24352750182151794, "step": 3572 }, { "epoch": 0.223375, "grad_norm": 2.453125, "grad_norm_var": 0.03300679524739583, "learning_rate": 0.0001, "loss": 7.8042, "loss/crossentropy": 2.296820282936096, "loss/hidden": 2.9453125, "loss/jsd": 0.0, "loss/logits": 0.23312810063362122, "step": 3574 }, { "epoch": 0.2235, "grad_norm": 2.65625, "grad_norm_var": 0.029866536458333332, "learning_rate": 0.0001, "loss": 7.7462, "loss/crossentropy": 2.3262380361557007, "loss/hidden": 2.9921875, "loss/jsd": 0.0, "loss/logits": 0.22458947449922562, "step": 3576 }, { "epoch": 0.223625, "grad_norm": 2.6875, "grad_norm_var": 0.027179972330729166, "learning_rate": 0.0001, "loss": 7.6477, "loss/crossentropy": 2.4565069675445557, "loss/hidden": 3.0, "loss/jsd": 0.0, "loss/logits": 0.2450973466038704, "step": 3578 }, { "epoch": 0.22375, "grad_norm": 2.765625, "grad_norm_var": 0.028913370768229165, "learning_rate": 0.0001, "loss": 7.5068, "loss/crossentropy": 1.9984254240989685, "loss/hidden": 2.9296875, "loss/jsd": 0.0, "loss/logits": 0.19529377669095993, "step": 3580 }, { "epoch": 0.223875, "grad_norm": 2.5, "grad_norm_var": 0.02584228515625, "learning_rate": 0.0001, "loss": 7.5913, "loss/crossentropy": 2.2899560928344727, "loss/hidden": 3.0703125, "loss/jsd": 0.0, "loss/logits": 0.25301460921764374, "step": 3582 }, { "epoch": 0.224, "grad_norm": 2.421875, "grad_norm_var": 0.026764933268229166, "learning_rate": 0.0001, "loss": 7.5833, "loss/crossentropy": 2.1312328577041626, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.22322946041822433, "step": 3584 }, { "epoch": 0.224125, "grad_norm": 2.421875, "grad_norm_var": 0.017867024739583334, "learning_rate": 0.0001, "loss": 7.6728, "loss/crossentropy": 2.5150551795959473, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.2675798535346985, "step": 3586 }, { "epoch": 0.22425, "grad_norm": 2.5, "grad_norm_var": 0.0173492431640625, "learning_rate": 0.0001, "loss": 7.5826, "loss/crossentropy": 2.1855462789535522, "loss/hidden": 3.0078125, "loss/jsd": 0.0, "loss/logits": 0.21716494113206863, "step": 3588 }, { "epoch": 0.224375, "grad_norm": 2.328125, "grad_norm_var": 0.0208892822265625, "learning_rate": 0.0001, "loss": 7.4135, "loss/crossentropy": 2.4238641262054443, "loss/hidden": 3.0234375, "loss/jsd": 0.0, "loss/logits": 0.23732400685548782, "step": 3590 }, { "epoch": 0.2245, "grad_norm": 2.46875, "grad_norm_var": 0.020002237955729165, "learning_rate": 0.0001, "loss": 7.5734, "loss/crossentropy": 2.313872456550598, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.2245703637599945, "step": 3592 }, { "epoch": 0.224625, "grad_norm": 2.421875, "grad_norm_var": 0.017822265625, "learning_rate": 0.0001, "loss": 7.6389, "loss/crossentropy": 2.149410605430603, "loss/hidden": 2.96875, "loss/jsd": 0.0, "loss/logits": 0.22927816212177277, "step": 3594 }, { "epoch": 0.22475, "grad_norm": 2.671875, "grad_norm_var": 0.013117472330729166, "learning_rate": 0.0001, "loss": 7.594, "loss/crossentropy": 2.228400468826294, "loss/hidden": 2.984375, "loss/jsd": 0.0, "loss/logits": 0.23162036389112473, "step": 3596 }, { "epoch": 0.224875, "grad_norm": 2.3125, "grad_norm_var": 0.0120758056640625, "learning_rate": 0.0001, "loss": 7.6038, "loss/crossentropy": 2.181916832923889, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.21812283247709274, "step": 3598 }, { "epoch": 0.225, "grad_norm": 2.484375, "grad_norm_var": 0.0110015869140625, "learning_rate": 0.0001, "loss": 7.7592, "loss/crossentropy": 2.3809478282928467, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.2515050619840622, "step": 3600 }, { "epoch": 0.225125, "grad_norm": 2.34375, "grad_norm_var": 0.011324055989583333, "learning_rate": 0.0001, "loss": 7.5347, "loss/crossentropy": 2.1800352334976196, "loss/hidden": 2.8515625, "loss/jsd": 0.0, "loss/logits": 0.20716014504432678, "step": 3602 }, { "epoch": 0.22525, "grad_norm": 2.328125, "grad_norm_var": 0.01744384765625, "learning_rate": 0.0001, "loss": 7.5785, "loss/crossentropy": 2.423554301261902, "loss/hidden": 2.984375, "loss/jsd": 0.0, "loss/logits": 0.24492305517196655, "step": 3604 }, { "epoch": 0.225375, "grad_norm": 2.78125, "grad_norm_var": 0.028251139322916667, "learning_rate": 0.0001, "loss": 7.7038, "loss/crossentropy": 2.3537445068359375, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.2731524705886841, "step": 3606 }, { "epoch": 0.2255, "grad_norm": 2.609375, "grad_norm_var": 0.04365234375, "learning_rate": 0.0001, "loss": 7.7234, "loss/crossentropy": 2.3835121393203735, "loss/hidden": 2.953125, "loss/jsd": 0.0, "loss/logits": 0.22908172756433487, "step": 3608 }, { "epoch": 0.225625, "grad_norm": 2.296875, "grad_norm_var": 0.04636942545572917, "learning_rate": 0.0001, "loss": 7.6703, "loss/crossentropy": 2.6490813493728638, "loss/hidden": 2.96875, "loss/jsd": 0.0, "loss/logits": 0.22531013935804367, "step": 3610 }, { "epoch": 0.22575, "grad_norm": 2.34375, "grad_norm_var": 0.05110575358072917, "learning_rate": 0.0001, "loss": 7.5759, "loss/crossentropy": 2.1871402263641357, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.22304056584835052, "step": 3612 }, { "epoch": 0.225875, "grad_norm": 2.53125, "grad_norm_var": 0.0443511962890625, "learning_rate": 0.0001, "loss": 7.4932, "loss/crossentropy": 2.2581640481948853, "loss/hidden": 2.96875, "loss/jsd": 0.0, "loss/logits": 0.24387839436531067, "step": 3614 }, { "epoch": 0.226, "grad_norm": 2.5625, "grad_norm_var": 0.044489542643229164, "learning_rate": 0.0001, "loss": 7.7008, "loss/crossentropy": 2.2254581451416016, "loss/hidden": 2.9296875, "loss/jsd": 0.0, "loss/logits": 0.22644620388746262, "step": 3616 }, { "epoch": 0.226125, "grad_norm": 2.359375, "grad_norm_var": 0.04413960774739583, "learning_rate": 0.0001, "loss": 7.5712, "loss/crossentropy": 2.0950043201446533, "loss/hidden": 3.0390625, "loss/jsd": 0.0, "loss/logits": 0.24633889645338058, "step": 3618 }, { "epoch": 0.22625, "grad_norm": 2.296875, "grad_norm_var": 0.042292277018229164, "learning_rate": 0.0001, "loss": 7.6548, "loss/crossentropy": 2.2507599592208862, "loss/hidden": 3.0390625, "loss/jsd": 0.0, "loss/logits": 0.22745974361896515, "step": 3620 }, { "epoch": 0.226375, "grad_norm": 2.609375, "grad_norm_var": 0.032613118489583336, "learning_rate": 0.0001, "loss": 7.6329, "loss/crossentropy": 2.114292323589325, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.22122054547071457, "step": 3622 }, { "epoch": 0.2265, "grad_norm": 2.296875, "grad_norm_var": 0.011823527018229167, "learning_rate": 0.0001, "loss": 7.532, "loss/crossentropy": 2.210999310016632, "loss/hidden": 3.0546875, "loss/jsd": 0.0, "loss/logits": 0.22689851373434067, "step": 3624 }, { "epoch": 0.226625, "grad_norm": 2.34375, "grad_norm_var": 0.014583333333333334, "learning_rate": 0.0001, "loss": 7.4216, "loss/crossentropy": 2.5903425216674805, "loss/hidden": 3.0, "loss/jsd": 0.0, "loss/logits": 0.22328981757164001, "step": 3626 }, { "epoch": 0.22675, "grad_norm": 2.453125, "grad_norm_var": 0.01275634765625, "learning_rate": 0.0001, "loss": 7.5963, "loss/crossentropy": 2.3979709148406982, "loss/hidden": 2.9921875, "loss/jsd": 0.0, "loss/logits": 0.2354452759027481, "step": 3628 }, { "epoch": 0.226875, "grad_norm": 2.34375, "grad_norm_var": 0.01240234375, "learning_rate": 0.0001, "loss": 7.4937, "loss/crossentropy": 2.308434844017029, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.22410962730646133, "step": 3630 }, { "epoch": 0.227, "grad_norm": 2.140625, "grad_norm_var": 0.01529541015625, "learning_rate": 0.0001, "loss": 7.5951, "loss/crossentropy": 2.4131675958633423, "loss/hidden": 2.984375, "loss/jsd": 0.0, "loss/logits": 0.24779768288135529, "step": 3632 }, { "epoch": 0.227125, "grad_norm": 2.796875, "grad_norm_var": 0.024723307291666666, "learning_rate": 0.0001, "loss": 7.5681, "loss/crossentropy": 2.2655831575393677, "loss/hidden": 2.96875, "loss/jsd": 0.0, "loss/logits": 0.2255219966173172, "step": 3634 }, { "epoch": 0.22725, "grad_norm": 2.25, "grad_norm_var": 0.025487263997395832, "learning_rate": 0.0001, "loss": 7.4176, "loss/crossentropy": 2.3867735862731934, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.24707113206386566, "step": 3636 }, { "epoch": 0.227375, "grad_norm": 2.46875, "grad_norm_var": 0.022834269205729167, "learning_rate": 0.0001, "loss": 7.532, "loss/crossentropy": 2.3354828357696533, "loss/hidden": 2.9921875, "loss/jsd": 0.0, "loss/logits": 0.2387101650238037, "step": 3638 }, { "epoch": 0.2275, "grad_norm": 2.4375, "grad_norm_var": 0.022581990559895834, "learning_rate": 0.0001, "loss": 7.4607, "loss/crossentropy": 2.164547324180603, "loss/hidden": 2.953125, "loss/jsd": 0.0, "loss/logits": 0.21518608927726746, "step": 3640 }, { "epoch": 0.227625, "grad_norm": 2.5, "grad_norm_var": 0.0202056884765625, "learning_rate": 0.0001, "loss": 7.6245, "loss/crossentropy": 2.3133569955825806, "loss/hidden": 3.015625, "loss/jsd": 0.0, "loss/logits": 0.2465798780322075, "step": 3642 }, { "epoch": 0.22775, "grad_norm": 2.359375, "grad_norm_var": 0.02066650390625, "learning_rate": 0.0001, "loss": 7.5408, "loss/crossentropy": 2.3382097482681274, "loss/hidden": 2.96875, "loss/jsd": 0.0, "loss/logits": 0.22838010638952255, "step": 3644 }, { "epoch": 0.227875, "grad_norm": 2.484375, "grad_norm_var": 0.0206695556640625, "learning_rate": 0.0001, "loss": 7.4479, "loss/crossentropy": 2.2210439443588257, "loss/hidden": 3.0, "loss/jsd": 0.0, "loss/logits": 0.21334365755319595, "step": 3646 }, { "epoch": 0.228, "grad_norm": 2.3125, "grad_norm_var": 0.019310506184895833, "learning_rate": 0.0001, "loss": 7.5992, "loss/crossentropy": 2.1229522228240967, "loss/hidden": 2.9609375, "loss/jsd": 0.0, "loss/logits": 0.23556266725063324, "step": 3648 }, { "epoch": 0.228125, "grad_norm": 2.421875, "grad_norm_var": 0.009129842122395834, "learning_rate": 0.0001, "loss": 7.5133, "loss/crossentropy": 2.335044503211975, "loss/hidden": 3.0625, "loss/jsd": 0.0, "loss/logits": 0.25041337311267853, "step": 3650 }, { "epoch": 0.22825, "grad_norm": 2.4375, "grad_norm_var": 0.01851806640625, "learning_rate": 0.0001, "loss": 7.5085, "loss/crossentropy": 2.1980998516082764, "loss/hidden": 2.890625, "loss/jsd": 0.0, "loss/logits": 0.21712280064821243, "step": 3652 }, { "epoch": 0.228375, "grad_norm": 2.53125, "grad_norm_var": 0.021100870768229165, "learning_rate": 0.0001, "loss": 7.7472, "loss/crossentropy": 2.513595938682556, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.2537487596273422, "step": 3654 }, { "epoch": 0.2285, "grad_norm": 2.484375, "grad_norm_var": 0.019722493489583333, "learning_rate": 0.0001, "loss": 7.515, "loss/crossentropy": 2.3871147632598877, "loss/hidden": 3.0078125, "loss/jsd": 0.0, "loss/logits": 0.24276654422283173, "step": 3656 }, { "epoch": 0.228625, "grad_norm": 2.390625, "grad_norm_var": 0.021703084309895832, "learning_rate": 0.0001, "loss": 7.5239, "loss/crossentropy": 2.449997901916504, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.23972835391759872, "step": 3658 }, { "epoch": 0.22875, "grad_norm": 2.1875, "grad_norm_var": 0.023957316080729166, "learning_rate": 0.0001, "loss": 7.3961, "loss/crossentropy": 1.9787690043449402, "loss/hidden": 2.8671875, "loss/jsd": 0.0, "loss/logits": 0.21979594230651855, "step": 3660 }, { "epoch": 0.228875, "grad_norm": 2.234375, "grad_norm_var": 0.030777994791666666, "learning_rate": 0.0001, "loss": 7.4817, "loss/crossentropy": 2.327951431274414, "loss/hidden": 3.0, "loss/jsd": 0.0, "loss/logits": 0.2144240364432335, "step": 3662 }, { "epoch": 0.229, "grad_norm": 2.28125, "grad_norm_var": 0.028954060872395833, "learning_rate": 0.0001, "loss": 7.4867, "loss/crossentropy": 2.2228533029556274, "loss/hidden": 3.0234375, "loss/jsd": 0.0, "loss/logits": 0.2117415815591812, "step": 3664 }, { "epoch": 0.229125, "grad_norm": 2.40625, "grad_norm_var": 0.030817667643229168, "learning_rate": 0.0001, "loss": 7.5446, "loss/crossentropy": 2.5074501037597656, "loss/hidden": 2.984375, "loss/jsd": 0.0, "loss/logits": 0.2416432872414589, "step": 3666 }, { "epoch": 0.22925, "grad_norm": 2.671875, "grad_norm_var": 0.023502604166666666, "learning_rate": 0.0001, "loss": 7.5777, "loss/crossentropy": 2.1035314798355103, "loss/hidden": 2.84375, "loss/jsd": 0.0, "loss/logits": 0.21771013736724854, "step": 3668 }, { "epoch": 0.229375, "grad_norm": 2.484375, "grad_norm_var": 0.026463826497395832, "learning_rate": 0.0001, "loss": 7.5223, "loss/crossentropy": 2.156371831893921, "loss/hidden": 2.921875, "loss/jsd": 0.0, "loss/logits": 0.2226782739162445, "step": 3670 }, { "epoch": 0.2295, "grad_norm": 2.34375, "grad_norm_var": 0.0279296875, "learning_rate": 0.0001, "loss": 7.5712, "loss/crossentropy": 2.3843045234680176, "loss/hidden": 3.046875, "loss/jsd": 0.0, "loss/logits": 0.23753250390291214, "step": 3672 }, { "epoch": 0.229625, "grad_norm": 2.4375, "grad_norm_var": 0.024128214518229166, "learning_rate": 0.0001, "loss": 7.6631, "loss/crossentropy": 2.2664679288864136, "loss/hidden": 3.0, "loss/jsd": 0.0, "loss/logits": 0.23596899211406708, "step": 3674 }, { "epoch": 0.22975, "grad_norm": 2.5625, "grad_norm_var": 0.024051920572916666, "learning_rate": 0.0001, "loss": 7.3774, "loss/crossentropy": 2.0786343812942505, "loss/hidden": 2.9296875, "loss/jsd": 0.0, "loss/logits": 0.21789100021123886, "step": 3676 }, { "epoch": 0.229875, "grad_norm": 2.296875, "grad_norm_var": 0.0205230712890625, "learning_rate": 0.0001, "loss": 7.5573, "loss/crossentropy": 2.148313283920288, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.22734209895133972, "step": 3678 }, { "epoch": 0.23, "grad_norm": 2.453125, "grad_norm_var": 0.019624837239583335, "learning_rate": 0.0001, "loss": 7.5015, "loss/crossentropy": 2.3508098125457764, "loss/hidden": 2.953125, "loss/jsd": 0.0, "loss/logits": 0.22268230468034744, "step": 3680 }, { "epoch": 0.230125, "grad_norm": 2.296875, "grad_norm_var": 0.018773396809895832, "learning_rate": 0.0001, "loss": 7.5282, "loss/crossentropy": 2.419388175010681, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.2422405481338501, "step": 3682 }, { "epoch": 0.23025, "grad_norm": 2.453125, "grad_norm_var": 0.0137603759765625, "learning_rate": 0.0001, "loss": 7.5899, "loss/crossentropy": 2.2298837900161743, "loss/hidden": 2.9375, "loss/jsd": 0.0, "loss/logits": 0.21447760611772537, "step": 3684 }, { "epoch": 0.230375, "grad_norm": 2.546875, "grad_norm_var": 0.013605753580729166, "learning_rate": 0.0001, "loss": 7.59, "loss/crossentropy": 2.4269654750823975, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.24916177242994308, "step": 3686 }, { "epoch": 0.2305, "grad_norm": 2.546875, "grad_norm_var": 0.010431925455729166, "learning_rate": 0.0001, "loss": 7.4829, "loss/crossentropy": 1.9996501207351685, "loss/hidden": 3.0546875, "loss/jsd": 0.0, "loss/logits": 0.23489703238010406, "step": 3688 }, { "epoch": 0.230625, "grad_norm": 2.34375, "grad_norm_var": 0.014469401041666666, "learning_rate": 0.0001, "loss": 7.7753, "loss/crossentropy": 2.3211807012557983, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.237454354763031, "step": 3690 }, { "epoch": 0.23075, "grad_norm": 2.5, "grad_norm_var": 0.013277180989583333, "learning_rate": 0.0001, "loss": 7.5813, "loss/crossentropy": 2.1937203407287598, "loss/hidden": 3.0, "loss/jsd": 0.0, "loss/logits": 0.2389645278453827, "step": 3692 }, { "epoch": 0.230875, "grad_norm": 2.546875, "grad_norm_var": 0.01240234375, "learning_rate": 0.0001, "loss": 7.7269, "loss/crossentropy": 2.413442850112915, "loss/hidden": 3.0625, "loss/jsd": 0.0, "loss/logits": 0.24230879545211792, "step": 3694 }, { "epoch": 0.231, "grad_norm": 2.40625, "grad_norm_var": 0.0149078369140625, "learning_rate": 0.0001, "loss": 7.5778, "loss/crossentropy": 2.252353072166443, "loss/hidden": 2.984375, "loss/jsd": 0.0, "loss/logits": 0.22742437571287155, "step": 3696 }, { "epoch": 0.231125, "grad_norm": 2.703125, "grad_norm_var": 0.020670572916666668, "learning_rate": 0.0001, "loss": 7.6338, "loss/crossentropy": 2.2866071462631226, "loss/hidden": 2.9921875, "loss/jsd": 0.0, "loss/logits": 0.22707533836364746, "step": 3698 }, { "epoch": 0.23125, "grad_norm": 2.359375, "grad_norm_var": 0.023470052083333335, "learning_rate": 0.0001, "loss": 7.4712, "loss/crossentropy": 1.939712941646576, "loss/hidden": 3.0234375, "loss/jsd": 0.0, "loss/logits": 0.23170647770166397, "step": 3700 }, { "epoch": 0.231375, "grad_norm": 2.515625, "grad_norm_var": 0.022196451822916668, "learning_rate": 0.0001, "loss": 7.5799, "loss/crossentropy": 1.7738837003707886, "loss/hidden": 3.03125, "loss/jsd": 0.0, "loss/logits": 0.24364864826202393, "step": 3702 }, { "epoch": 0.2315, "grad_norm": 2.640625, "grad_norm_var": 0.0230865478515625, "learning_rate": 0.0001, "loss": 7.5375, "loss/crossentropy": 2.1018226742744446, "loss/hidden": 2.8828125, "loss/jsd": 0.0, "loss/logits": 0.22683311998844147, "step": 3704 }, { "epoch": 0.231625, "grad_norm": 2.515625, "grad_norm_var": 0.0196197509765625, "learning_rate": 0.0001, "loss": 7.6113, "loss/crossentropy": 2.2435100078582764, "loss/hidden": 2.96875, "loss/jsd": 0.0, "loss/logits": 0.22194090485572815, "step": 3706 }, { "epoch": 0.23175, "grad_norm": 2.390625, "grad_norm_var": 0.019147745768229165, "learning_rate": 0.0001, "loss": 7.5735, "loss/crossentropy": 2.114552319049835, "loss/hidden": 2.9140625, "loss/jsd": 0.0, "loss/logits": 0.21233541518449783, "step": 3708 }, { "epoch": 0.231875, "grad_norm": 2.46875, "grad_norm_var": 0.020164998372395833, "learning_rate": 0.0001, "loss": 7.4842, "loss/crossentropy": 2.340905785560608, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.23916912078857422, "step": 3710 }, { "epoch": 0.232, "grad_norm": 2.65625, "grad_norm_var": 0.018675740559895834, "learning_rate": 0.0001, "loss": 7.5672, "loss/crossentropy": 2.2121574878692627, "loss/hidden": 3.0390625, "loss/jsd": 0.0, "loss/logits": 0.22300127893686295, "step": 3712 }, { "epoch": 0.232125, "grad_norm": 2.28125, "grad_norm_var": 0.015852864583333334, "learning_rate": 0.0001, "loss": 7.5457, "loss/crossentropy": 2.350710391998291, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.2501726374030113, "step": 3714 }, { "epoch": 0.23225, "grad_norm": 2.515625, "grad_norm_var": 0.015983072916666667, "learning_rate": 0.0001, "loss": 7.5465, "loss/crossentropy": 2.4245604276657104, "loss/hidden": 2.90625, "loss/jsd": 0.0, "loss/logits": 0.22501108050346375, "step": 3716 }, { "epoch": 0.232375, "grad_norm": 2.3125, "grad_norm_var": 0.017215983072916666, "learning_rate": 0.0001, "loss": 7.6819, "loss/crossentropy": 2.2433913946151733, "loss/hidden": 2.90625, "loss/jsd": 0.0, "loss/logits": 0.23217583447694778, "step": 3718 }, { "epoch": 0.2325, "grad_norm": 2.21875, "grad_norm_var": 0.017194620768229165, "learning_rate": 0.0001, "loss": 7.462, "loss/crossentropy": 2.203797459602356, "loss/hidden": 2.9453125, "loss/jsd": 0.0, "loss/logits": 0.21597397327423096, "step": 3720 }, { "epoch": 0.232625, "grad_norm": 2.375, "grad_norm_var": 0.015653483072916665, "learning_rate": 0.0001, "loss": 7.4907, "loss/crossentropy": 2.274090051651001, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.22629251331090927, "step": 3722 }, { "epoch": 0.23275, "grad_norm": 2.515625, "grad_norm_var": 0.016402180989583334, "learning_rate": 0.0001, "loss": 7.5762, "loss/crossentropy": 2.1787149906158447, "loss/hidden": 2.9453125, "loss/jsd": 0.0, "loss/logits": 0.21793190389871597, "step": 3724 }, { "epoch": 0.232875, "grad_norm": 2.359375, "grad_norm_var": 0.018001302083333334, "learning_rate": 0.0001, "loss": 7.5742, "loss/crossentropy": 2.472353219985962, "loss/hidden": 2.984375, "loss/jsd": 0.0, "loss/logits": 0.2537408918142319, "step": 3726 }, { "epoch": 0.233, "grad_norm": 2.546875, "grad_norm_var": 0.012376912434895833, "learning_rate": 0.0001, "loss": 7.7366, "loss/crossentropy": 2.1297940015792847, "loss/hidden": 2.984375, "loss/jsd": 0.0, "loss/logits": 0.24158430099487305, "step": 3728 }, { "epoch": 0.233125, "grad_norm": 2.375, "grad_norm_var": 0.0106109619140625, "learning_rate": 0.0001, "loss": 7.5799, "loss/crossentropy": 2.307152509689331, "loss/hidden": 3.0234375, "loss/jsd": 0.0, "loss/logits": 0.2116793394088745, "step": 3730 }, { "epoch": 0.23325, "grad_norm": 2.484375, "grad_norm_var": 0.010465494791666667, "learning_rate": 0.0001, "loss": 7.4555, "loss/crossentropy": 1.9622553586959839, "loss/hidden": 3.046875, "loss/jsd": 0.0, "loss/logits": 0.22563175857067108, "step": 3732 }, { "epoch": 0.233375, "grad_norm": 2.46875, "grad_norm_var": 0.011637369791666666, "learning_rate": 0.0001, "loss": 7.558, "loss/crossentropy": 2.1677664518356323, "loss/hidden": 2.9609375, "loss/jsd": 0.0, "loss/logits": 0.22453109174966812, "step": 3734 }, { "epoch": 0.2335, "grad_norm": 2.578125, "grad_norm_var": 0.09955952962239584, "learning_rate": 0.0001, "loss": 7.7388, "loss/crossentropy": 2.192821502685547, "loss/hidden": 2.921875, "loss/jsd": 0.0, "loss/logits": 0.23431353271007538, "step": 3736 }, { "epoch": 0.233625, "grad_norm": 2.53125, "grad_norm_var": 0.10053609212239584, "learning_rate": 0.0001, "loss": 7.5906, "loss/crossentropy": 2.1779892444610596, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.23049022257328033, "step": 3738 }, { "epoch": 0.23375, "grad_norm": 2.28125, "grad_norm_var": 0.10708719889322917, "learning_rate": 0.0001, "loss": 7.4549, "loss/crossentropy": 2.402343273162842, "loss/hidden": 3.0078125, "loss/jsd": 0.0, "loss/logits": 0.2568327337503433, "step": 3740 }, { "epoch": 0.233875, "grad_norm": 2.359375, "grad_norm_var": 0.10690104166666667, "learning_rate": 0.0001, "loss": 7.5119, "loss/crossentropy": 2.1869853734970093, "loss/hidden": 2.9921875, "loss/jsd": 0.0, "loss/logits": 0.2425551563501358, "step": 3742 }, { "epoch": 0.234, "grad_norm": 2.375, "grad_norm_var": 0.11015218098958333, "learning_rate": 0.0001, "loss": 7.5073, "loss/crossentropy": 2.2516770362854004, "loss/hidden": 2.984375, "loss/jsd": 0.0, "loss/logits": 0.2116701528429985, "step": 3744 }, { "epoch": 0.234125, "grad_norm": 2.484375, "grad_norm_var": 0.11015218098958333, "learning_rate": 0.0001, "loss": 7.5986, "loss/crossentropy": 2.005309283733368, "loss/hidden": 2.890625, "loss/jsd": 0.0, "loss/logits": 0.20876743644475937, "step": 3746 }, { "epoch": 0.23425, "grad_norm": 2.4375, "grad_norm_var": 0.10966695149739583, "learning_rate": 0.0001, "loss": 7.4817, "loss/crossentropy": 2.5279784202575684, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.23321525007486343, "step": 3748 }, { "epoch": 0.234375, "grad_norm": 2.453125, "grad_norm_var": 0.11015218098958333, "learning_rate": 0.0001, "loss": 7.5474, "loss/crossentropy": 2.34593665599823, "loss/hidden": 2.953125, "loss/jsd": 0.0, "loss/logits": 0.23283468186855316, "step": 3750 }, { "epoch": 0.2345, "grad_norm": 2.1875, "grad_norm_var": 0.024665323893229167, "learning_rate": 0.0001, "loss": 7.4105, "loss/crossentropy": 2.249088764190674, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.24345380067825317, "step": 3752 }, { "epoch": 0.234625, "grad_norm": 2.328125, "grad_norm_var": 0.01451416015625, "learning_rate": 0.0001, "loss": 7.5255, "loss/crossentropy": 2.2118486166000366, "loss/hidden": 3.0, "loss/jsd": 0.0, "loss/logits": 0.2435833364725113, "step": 3754 }, { "epoch": 0.23475, "grad_norm": 2.59375, "grad_norm_var": 0.017411295572916666, "learning_rate": 0.0001, "loss": 7.6165, "loss/crossentropy": 2.2884024381637573, "loss/hidden": 2.9140625, "loss/jsd": 0.0, "loss/logits": 0.23158583790063858, "step": 3756 }, { "epoch": 0.234875, "grad_norm": 2.609375, "grad_norm_var": 0.0189361572265625, "learning_rate": 0.0001, "loss": 7.7033, "loss/crossentropy": 2.2991076707839966, "loss/hidden": 2.9296875, "loss/jsd": 0.0, "loss/logits": 0.23600934445858002, "step": 3758 }, { "epoch": 0.235, "grad_norm": 2.1875, "grad_norm_var": 0.020589192708333332, "learning_rate": 0.0001, "loss": 7.4587, "loss/crossentropy": 2.1402446627616882, "loss/hidden": 3.0703125, "loss/jsd": 0.0, "loss/logits": 0.21911373734474182, "step": 3760 }, { "epoch": 0.235125, "grad_norm": 2.453125, "grad_norm_var": 0.0215728759765625, "learning_rate": 0.0001, "loss": 7.6719, "loss/crossentropy": 2.234601616859436, "loss/hidden": 2.8671875, "loss/jsd": 0.0, "loss/logits": 0.22324847429990768, "step": 3762 }, { "epoch": 0.23525, "grad_norm": 2.34375, "grad_norm_var": 0.022412109375, "learning_rate": 0.0001, "loss": 7.5913, "loss/crossentropy": 2.2850943207740784, "loss/hidden": 2.8828125, "loss/jsd": 0.0, "loss/logits": 0.22513452172279358, "step": 3764 }, { "epoch": 0.235375, "grad_norm": 2.3125, "grad_norm_var": 0.01890869140625, "learning_rate": 0.0001, "loss": 7.6234, "loss/crossentropy": 2.122738838195801, "loss/hidden": 3.0, "loss/jsd": 0.0, "loss/logits": 0.22072789072990417, "step": 3766 }, { "epoch": 0.2355, "grad_norm": 2.421875, "grad_norm_var": 0.0145172119140625, "learning_rate": 0.0001, "loss": 7.5619, "loss/crossentropy": 2.4355037212371826, "loss/hidden": 2.9375, "loss/jsd": 0.0, "loss/logits": 0.24772146344184875, "step": 3768 }, { "epoch": 0.235625, "grad_norm": 2.453125, "grad_norm_var": 0.012572224934895833, "learning_rate": 0.0001, "loss": 7.5071, "loss/crossentropy": 1.8801981806755066, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.22862768173217773, "step": 3770 }, { "epoch": 0.23575, "grad_norm": 2.640625, "grad_norm_var": 0.017650349934895834, "learning_rate": 0.0001, "loss": 7.4754, "loss/crossentropy": 2.2766274213790894, "loss/hidden": 2.90625, "loss/jsd": 0.0, "loss/logits": 0.2313675582408905, "step": 3772 }, { "epoch": 0.235875, "grad_norm": 2.265625, "grad_norm_var": 0.018610636393229168, "learning_rate": 0.0001, "loss": 7.5267, "loss/crossentropy": 2.0864307284355164, "loss/hidden": 2.9140625, "loss/jsd": 0.0, "loss/logits": 0.2242383509874344, "step": 3774 }, { "epoch": 0.236, "grad_norm": 2.390625, "grad_norm_var": 0.01572265625, "learning_rate": 0.0001, "loss": 7.4776, "loss/crossentropy": 2.2763630151748657, "loss/hidden": 3.03125, "loss/jsd": 0.0, "loss/logits": 0.24077893048524857, "step": 3776 }, { "epoch": 0.236125, "grad_norm": 2.375, "grad_norm_var": 0.014127604166666667, "learning_rate": 0.0001, "loss": 7.5641, "loss/crossentropy": 2.2426193952560425, "loss/hidden": 2.96875, "loss/jsd": 0.0, "loss/logits": 0.2291230633854866, "step": 3778 }, { "epoch": 0.23625, "grad_norm": 2.28125, "grad_norm_var": 0.015558878580729166, "learning_rate": 0.0001, "loss": 7.4933, "loss/crossentropy": 2.081269860267639, "loss/hidden": 3.046875, "loss/jsd": 0.0, "loss/logits": 0.25176841020584106, "step": 3780 }, { "epoch": 0.236375, "grad_norm": 6.09375, "grad_norm_var": 1.8935546875, "learning_rate": 0.0001, "loss": 7.8648, "loss/crossentropy": 2.011550545692444, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.25546982884407043, "step": 3782 }, { "epoch": 0.2365, "grad_norm": 2.59375, "grad_norm_var": 1.8919993082682292, "learning_rate": 0.0001, "loss": 7.9057, "loss/crossentropy": 2.4982590675354004, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.2651009112596512, "step": 3784 }, { "epoch": 0.236625, "grad_norm": 2.71875, "grad_norm_var": 1.8650950113932292, "learning_rate": 0.0001, "loss": 7.6646, "loss/crossentropy": 2.2798246145248413, "loss/hidden": 2.921875, "loss/jsd": 0.0, "loss/logits": 0.22900397330522537, "step": 3786 }, { "epoch": 0.23675, "grad_norm": 2.390625, "grad_norm_var": 1.87945556640625, "learning_rate": 0.0001, "loss": 7.379, "loss/crossentropy": 2.03772509098053, "loss/hidden": 3.0546875, "loss/jsd": 0.0, "loss/logits": 0.22353503108024597, "step": 3788 }, { "epoch": 0.236875, "grad_norm": 2.734375, "grad_norm_var": 1.82320556640625, "learning_rate": 0.0001, "loss": 7.4055, "loss/crossentropy": 2.317867159843445, "loss/hidden": 2.9921875, "loss/jsd": 0.0, "loss/logits": 0.23167508840560913, "step": 3790 }, { "epoch": 0.237, "grad_norm": 2.1875, "grad_norm_var": 1.84205322265625, "learning_rate": 0.0001, "loss": 7.4846, "loss/crossentropy": 2.010311722755432, "loss/hidden": 2.921875, "loss/jsd": 0.0, "loss/logits": 0.20548687875270844, "step": 3792 }, { "epoch": 0.237125, "grad_norm": 2.359375, "grad_norm_var": 1.8334625244140625, "learning_rate": 0.0001, "loss": 7.5309, "loss/crossentropy": 2.325111150741577, "loss/hidden": 3.0078125, "loss/jsd": 0.0, "loss/logits": 0.2562504708766937, "step": 3794 }, { "epoch": 0.23725, "grad_norm": 2.328125, "grad_norm_var": 1.8178131103515625, "learning_rate": 0.0001, "loss": 7.5873, "loss/crossentropy": 2.0214288234710693, "loss/hidden": 2.9609375, "loss/jsd": 0.0, "loss/logits": 0.2158362790942192, "step": 3796 }, { "epoch": 0.237375, "grad_norm": 2.390625, "grad_norm_var": 0.1085845947265625, "learning_rate": 0.0001, "loss": 7.6152, "loss/crossentropy": 2.025280773639679, "loss/hidden": 3.015625, "loss/jsd": 0.0, "loss/logits": 0.2420121431350708, "step": 3798 }, { "epoch": 0.2375, "grad_norm": 2.453125, "grad_norm_var": 0.025788370768229166, "learning_rate": 0.0001, "loss": 7.5203, "loss/crossentropy": 2.2576816082000732, "loss/hidden": 3.0390625, "loss/jsd": 0.0, "loss/logits": 0.24750325828790665, "step": 3800 }, { "epoch": 0.237625, "grad_norm": 2.515625, "grad_norm_var": 0.019807942708333335, "learning_rate": 0.0001, "loss": 7.562, "loss/crossentropy": 2.4177106618881226, "loss/hidden": 2.96875, "loss/jsd": 0.0, "loss/logits": 0.2321905866265297, "step": 3802 }, { "epoch": 0.23775, "grad_norm": 2.25, "grad_norm_var": 0.017560831705729165, "learning_rate": 0.0001, "loss": 7.6246, "loss/crossentropy": 2.381041169166565, "loss/hidden": 2.9609375, "loss/jsd": 0.0, "loss/logits": 0.22734872996807098, "step": 3804 }, { "epoch": 0.237875, "grad_norm": 2.1875, "grad_norm_var": 0.014777628580729167, "learning_rate": 0.0001, "loss": 7.2978, "loss/crossentropy": 2.169810175895691, "loss/hidden": 2.953125, "loss/jsd": 0.0, "loss/logits": 0.21825361251831055, "step": 3806 }, { "epoch": 0.238, "grad_norm": 2.703125, "grad_norm_var": 0.01949462890625, "learning_rate": 0.0001, "loss": 7.6745, "loss/crossentropy": 2.2910315990448, "loss/hidden": 3.03125, "loss/jsd": 0.0, "loss/logits": 0.2431943118572235, "step": 3808 }, { "epoch": 0.238125, "grad_norm": 2.234375, "grad_norm_var": 0.020799763997395835, "learning_rate": 0.0001, "loss": 7.4715, "loss/crossentropy": 2.32381534576416, "loss/hidden": 2.9375, "loss/jsd": 0.0, "loss/logits": 0.21884576976299286, "step": 3810 }, { "epoch": 0.23825, "grad_norm": 2.40625, "grad_norm_var": 0.02027587890625, "learning_rate": 0.0001, "loss": 7.626, "loss/crossentropy": 2.3181525468826294, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.23518574982881546, "step": 3812 }, { "epoch": 0.238375, "grad_norm": 2.328125, "grad_norm_var": 0.021712239583333334, "learning_rate": 0.0001, "loss": 7.718, "loss/crossentropy": 2.4021564722061157, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.22601833939552307, "step": 3814 }, { "epoch": 0.2385, "grad_norm": 2.671875, "grad_norm_var": 0.026253255208333333, "learning_rate": 0.0001, "loss": 7.5776, "loss/crossentropy": 2.241186261177063, "loss/hidden": 2.84375, "loss/jsd": 0.0, "loss/logits": 0.22214636206626892, "step": 3816 }, { "epoch": 0.238625, "grad_norm": 2.203125, "grad_norm_var": 0.028076171875, "learning_rate": 0.0001, "loss": 7.4295, "loss/crossentropy": 2.3284627199172974, "loss/hidden": 2.9609375, "loss/jsd": 0.0, "loss/logits": 0.23751358687877655, "step": 3818 }, { "epoch": 0.23875, "grad_norm": 2.328125, "grad_norm_var": 0.025634765625, "learning_rate": 0.0001, "loss": 7.3839, "loss/crossentropy": 1.878059983253479, "loss/hidden": 3.0078125, "loss/jsd": 0.0, "loss/logits": 0.20184766501188278, "step": 3820 }, { "epoch": 0.238875, "grad_norm": 2.296875, "grad_norm_var": 0.021415201822916667, "learning_rate": 0.0001, "loss": 7.4835, "loss/crossentropy": 2.2008787393569946, "loss/hidden": 3.0, "loss/jsd": 0.0, "loss/logits": 0.2629189044237137, "step": 3822 }, { "epoch": 0.239, "grad_norm": 2.5625, "grad_norm_var": 0.016803995768229166, "learning_rate": 0.0001, "loss": 7.4507, "loss/crossentropy": 2.3513262271881104, "loss/hidden": 2.921875, "loss/jsd": 0.0, "loss/logits": 0.22654324024915695, "step": 3824 }, { "epoch": 0.239125, "grad_norm": 2.640625, "grad_norm_var": 0.0198638916015625, "learning_rate": 0.0001, "loss": 7.4914, "loss/crossentropy": 2.05319607257843, "loss/hidden": 2.875, "loss/jsd": 0.0, "loss/logits": 0.21194881200790405, "step": 3826 }, { "epoch": 0.23925, "grad_norm": 2.296875, "grad_norm_var": 0.020882161458333333, "learning_rate": 0.0001, "loss": 7.4726, "loss/crossentropy": 2.274933695793152, "loss/hidden": 3.0234375, "loss/jsd": 0.0, "loss/logits": 0.24475091695785522, "step": 3828 }, { "epoch": 0.239375, "grad_norm": 2.625, "grad_norm_var": 0.02076416015625, "learning_rate": 0.0001, "loss": 7.4758, "loss/crossentropy": 2.2083067893981934, "loss/hidden": 2.9296875, "loss/jsd": 0.0, "loss/logits": 0.21219877898693085, "step": 3830 }, { "epoch": 0.2395, "grad_norm": 2.5, "grad_norm_var": 0.0187164306640625, "learning_rate": 0.0001, "loss": 7.6724, "loss/crossentropy": 2.3150436878204346, "loss/hidden": 2.953125, "loss/jsd": 0.0, "loss/logits": 0.2536996081471443, "step": 3832 }, { "epoch": 0.239625, "grad_norm": 2.53125, "grad_norm_var": 0.014557902018229167, "learning_rate": 0.0001, "loss": 7.569, "loss/crossentropy": 2.2580004930496216, "loss/hidden": 2.953125, "loss/jsd": 0.0, "loss/logits": 0.21672644466161728, "step": 3834 }, { "epoch": 0.23975, "grad_norm": 2.375, "grad_norm_var": 0.014354451497395834, "learning_rate": 0.0001, "loss": 7.5683, "loss/crossentropy": 2.080252170562744, "loss/hidden": 3.0, "loss/jsd": 0.0, "loss/logits": 0.22742826491594315, "step": 3836 }, { "epoch": 0.239875, "grad_norm": 2.375, "grad_norm_var": 0.014774576822916666, "learning_rate": 0.0001, "loss": 7.3651, "loss/crossentropy": 2.1281388998031616, "loss/hidden": 2.9375, "loss/jsd": 0.0, "loss/logits": 0.2401324361562729, "step": 3838 }, { "epoch": 0.24, "grad_norm": 2.703125, "grad_norm_var": 0.03736572265625, "learning_rate": 0.0001, "loss": 7.5007, "loss/crossentropy": 2.2224199771881104, "loss/hidden": 2.984375, "loss/jsd": 0.0, "loss/logits": 0.23487379401922226, "step": 3840 }, { "epoch": 0.240125, "grad_norm": 2.421875, "grad_norm_var": 0.04025065104166667, "learning_rate": 0.0001, "loss": 7.632, "loss/crossentropy": 2.424517512321472, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.24695612490177155, "step": 3842 }, { "epoch": 0.24025, "grad_norm": 2.75, "grad_norm_var": 0.04254557291666667, "learning_rate": 0.0001, "loss": 7.5674, "loss/crossentropy": 2.1599162220954895, "loss/hidden": 2.9140625, "loss/jsd": 0.0, "loss/logits": 0.21321508288383484, "step": 3844 }, { "epoch": 0.240375, "grad_norm": 2.59375, "grad_norm_var": 0.0422271728515625, "learning_rate": 0.0001, "loss": 7.6312, "loss/crossentropy": 2.0522512793540955, "loss/hidden": 2.9609375, "loss/jsd": 0.0, "loss/logits": 0.21054793149232864, "step": 3846 }, { "epoch": 0.2405, "grad_norm": 2.46875, "grad_norm_var": 0.04182840983072917, "learning_rate": 0.0001, "loss": 7.6148, "loss/crossentropy": 2.3739218711853027, "loss/hidden": 3.03125, "loss/jsd": 0.0, "loss/logits": 0.24357128143310547, "step": 3848 }, { "epoch": 0.240625, "grad_norm": 2.125, "grad_norm_var": 0.050309244791666666, "learning_rate": 0.0001, "loss": 7.4666, "loss/crossentropy": 2.1554529666900635, "loss/hidden": 2.984375, "loss/jsd": 0.0, "loss/logits": 0.21792703121900558, "step": 3850 }, { "epoch": 0.24075, "grad_norm": 2.59375, "grad_norm_var": 0.05252278645833333, "learning_rate": 0.0001, "loss": 7.5105, "loss/crossentropy": 2.5243422985076904, "loss/hidden": 3.0234375, "loss/jsd": 0.0, "loss/logits": 0.23979288339614868, "step": 3852 }, { "epoch": 0.240875, "grad_norm": 2.21875, "grad_norm_var": 0.054032389322916666, "learning_rate": 0.0001, "loss": 7.6535, "loss/crossentropy": 2.169221580028534, "loss/hidden": 2.84375, "loss/jsd": 0.0, "loss/logits": 0.21038363128900528, "step": 3854 }, { "epoch": 0.241, "grad_norm": 2.34375, "grad_norm_var": 0.0281158447265625, "learning_rate": 0.0001, "loss": 7.4283, "loss/crossentropy": 2.375051498413086, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.24313046038150787, "step": 3856 }, { "epoch": 0.241125, "grad_norm": 2.453125, "grad_norm_var": 0.027339680989583334, "learning_rate": 0.0001, "loss": 7.5168, "loss/crossentropy": 2.504861831665039, "loss/hidden": 2.9375, "loss/jsd": 0.0, "loss/logits": 0.2226250097155571, "step": 3858 }, { "epoch": 0.24125, "grad_norm": 2.171875, "grad_norm_var": 0.022347005208333333, "learning_rate": 0.0001, "loss": 7.4035, "loss/crossentropy": 2.188872456550598, "loss/hidden": 2.90625, "loss/jsd": 0.0, "loss/logits": 0.22465746104717255, "step": 3860 }, { "epoch": 0.241375, "grad_norm": 2.34375, "grad_norm_var": 0.018831380208333335, "learning_rate": 0.0001, "loss": 7.4832, "loss/crossentropy": 2.109761595726013, "loss/hidden": 2.890625, "loss/jsd": 0.0, "loss/logits": 0.21876226365566254, "step": 3862 }, { "epoch": 0.2415, "grad_norm": 2.34375, "grad_norm_var": 0.0161041259765625, "learning_rate": 0.0001, "loss": 7.4889, "loss/crossentropy": 2.454153060913086, "loss/hidden": 2.953125, "loss/jsd": 0.0, "loss/logits": 0.2234111726284027, "step": 3864 }, { "epoch": 0.241625, "grad_norm": 2.359375, "grad_norm_var": 0.0175445556640625, "learning_rate": 0.0001, "loss": 7.5401, "loss/crossentropy": 2.2534934282302856, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.20882034301757812, "step": 3866 }, { "epoch": 0.24175, "grad_norm": 2.234375, "grad_norm_var": 0.01861572265625, "learning_rate": 0.0001, "loss": 7.3884, "loss/crossentropy": 2.46646249294281, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.23306798189878464, "step": 3868 }, { "epoch": 0.241875, "grad_norm": 2.46875, "grad_norm_var": 0.01939697265625, "learning_rate": 0.0001, "loss": 7.4742, "loss/crossentropy": 2.286616086959839, "loss/hidden": 2.9921875, "loss/jsd": 0.0, "loss/logits": 0.23071034252643585, "step": 3870 }, { "epoch": 0.242, "grad_norm": 2.203125, "grad_norm_var": 0.020002237955729165, "learning_rate": 0.0001, "loss": 7.4734, "loss/crossentropy": 2.3127135038375854, "loss/hidden": 3.03125, "loss/jsd": 0.0, "loss/logits": 0.2306494191288948, "step": 3872 }, { "epoch": 0.242125, "grad_norm": 2.34375, "grad_norm_var": 0.018636067708333332, "learning_rate": 0.0001, "loss": 7.3801, "loss/crossentropy": 2.292248845100403, "loss/hidden": 2.953125, "loss/jsd": 0.0, "loss/logits": 0.21103744953870773, "step": 3874 }, { "epoch": 0.24225, "grad_norm": 2.53125, "grad_norm_var": 0.020799763997395835, "learning_rate": 0.0001, "loss": 7.7475, "loss/crossentropy": 2.2768986225128174, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.25374574959278107, "step": 3876 }, { "epoch": 0.242375, "grad_norm": 2.640625, "grad_norm_var": 0.02554931640625, "learning_rate": 0.0001, "loss": 7.4706, "loss/crossentropy": 2.335216999053955, "loss/hidden": 3.0078125, "loss/jsd": 0.0, "loss/logits": 0.2206215113401413, "step": 3878 }, { "epoch": 0.2425, "grad_norm": 2.203125, "grad_norm_var": 0.027098592122395834, "learning_rate": 0.0001, "loss": 7.6204, "loss/crossentropy": 2.3161808252334595, "loss/hidden": 2.90625, "loss/jsd": 0.0, "loss/logits": 0.23353615403175354, "step": 3880 }, { "epoch": 0.242625, "grad_norm": 2.453125, "grad_norm_var": 0.024853515625, "learning_rate": 0.0001, "loss": 7.4924, "loss/crossentropy": 2.1703152656555176, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.23688945174217224, "step": 3882 }, { "epoch": 0.24275, "grad_norm": 2.78125, "grad_norm_var": 0.027176920572916666, "learning_rate": 0.0001, "loss": 7.529, "loss/crossentropy": 2.342598557472229, "loss/hidden": 2.9921875, "loss/jsd": 0.0, "loss/logits": 0.21841312944889069, "step": 3884 }, { "epoch": 0.242875, "grad_norm": 2.640625, "grad_norm_var": 0.09791259765625, "learning_rate": 0.0001, "loss": 7.3372, "loss/crossentropy": 2.1650543808937073, "loss/hidden": 3.015625, "loss/jsd": 0.0, "loss/logits": 0.22078751772642136, "step": 3886 }, { "epoch": 0.243, "grad_norm": 2.140625, "grad_norm_var": 0.11746419270833333, "learning_rate": 0.0001, "loss": 7.4851, "loss/crossentropy": 1.997887134552002, "loss/hidden": 2.8984375, "loss/jsd": 0.0, "loss/logits": 0.21081900596618652, "step": 3888 }, { "epoch": 0.243125, "grad_norm": 2.5, "grad_norm_var": 0.11201070149739584, "learning_rate": 0.0001, "loss": 7.6291, "loss/crossentropy": 2.2321996688842773, "loss/hidden": 2.921875, "loss/jsd": 0.0, "loss/logits": 0.22545063495635986, "step": 3890 }, { "epoch": 0.24325, "grad_norm": 2.578125, "grad_norm_var": 0.11261393229166666, "learning_rate": 0.0001, "loss": 7.5623, "loss/crossentropy": 2.522809147834778, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.24426687508821487, "step": 3892 }, { "epoch": 0.243375, "grad_norm": 2.1875, "grad_norm_var": 0.11669921875, "learning_rate": 0.0001, "loss": 7.4586, "loss/crossentropy": 2.3665112257003784, "loss/hidden": 3.0390625, "loss/jsd": 0.0, "loss/logits": 0.253150999546051, "step": 3894 }, { "epoch": 0.2435, "grad_norm": 2.78125, "grad_norm_var": 0.11687825520833334, "learning_rate": 0.0001, "loss": 7.5531, "loss/crossentropy": 2.4039018154144287, "loss/hidden": 2.8984375, "loss/jsd": 0.0, "loss/logits": 0.22742793709039688, "step": 3896 }, { "epoch": 0.243625, "grad_norm": 2.125, "grad_norm_var": 0.1285552978515625, "learning_rate": 0.0001, "loss": 7.5957, "loss/crossentropy": 2.3447670936584473, "loss/hidden": 2.890625, "loss/jsd": 0.0, "loss/logits": 0.2258382812142372, "step": 3898 }, { "epoch": 0.24375, "grad_norm": 2.390625, "grad_norm_var": 0.12775065104166666, "learning_rate": 0.0001, "loss": 7.3161, "loss/crossentropy": 2.020743668079376, "loss/hidden": 2.9921875, "loss/jsd": 0.0, "loss/logits": 0.2295529618859291, "step": 3900 }, { "epoch": 0.243875, "grad_norm": 2.4375, "grad_norm_var": 0.045685831705729166, "learning_rate": 0.0001, "loss": 7.4793, "loss/crossentropy": 2.0840115547180176, "loss/hidden": 3.0, "loss/jsd": 0.0, "loss/logits": 0.21463429182767868, "step": 3902 }, { "epoch": 0.244, "grad_norm": 2.671875, "grad_norm_var": 0.03689778645833333, "learning_rate": 0.0001, "loss": 7.7833, "loss/crossentropy": 2.3722068071365356, "loss/hidden": 3.015625, "loss/jsd": 0.0, "loss/logits": 0.22333616763353348, "step": 3904 }, { "epoch": 0.244125, "grad_norm": 2.40625, "grad_norm_var": 0.038605753580729166, "learning_rate": 0.0001, "loss": 7.4049, "loss/crossentropy": 2.5307204723358154, "loss/hidden": 2.9375, "loss/jsd": 0.0, "loss/logits": 0.23095162957906723, "step": 3906 }, { "epoch": 0.24425, "grad_norm": 2.34375, "grad_norm_var": 0.036554972330729164, "learning_rate": 0.0001, "loss": 7.5943, "loss/crossentropy": 2.427748680114746, "loss/hidden": 3.03125, "loss/jsd": 0.0, "loss/logits": 0.2420305386185646, "step": 3908 }, { "epoch": 0.244375, "grad_norm": 2.375, "grad_norm_var": 0.033984375, "learning_rate": 0.0001, "loss": 7.5523, "loss/crossentropy": 2.3058314323425293, "loss/hidden": 3.0078125, "loss/jsd": 0.0, "loss/logits": 0.25578539073467255, "step": 3910 }, { "epoch": 0.2445, "grad_norm": 2.28125, "grad_norm_var": 0.0274566650390625, "learning_rate": 0.0001, "loss": 7.2921, "loss/crossentropy": 1.9724953174591064, "loss/hidden": 2.8671875, "loss/jsd": 0.0, "loss/logits": 0.19885031133890152, "step": 3912 }, { "epoch": 0.244625, "grad_norm": 2.390625, "grad_norm_var": 0.020406087239583332, "learning_rate": 0.0001, "loss": 7.6238, "loss/crossentropy": 2.1780654191970825, "loss/hidden": 2.8984375, "loss/jsd": 0.0, "loss/logits": 0.2180589661002159, "step": 3914 }, { "epoch": 0.24475, "grad_norm": 2.375, "grad_norm_var": 0.017455037434895834, "learning_rate": 0.0001, "loss": 7.4598, "loss/crossentropy": 2.246911883354187, "loss/hidden": 2.9375, "loss/jsd": 0.0, "loss/logits": 0.22257865220308304, "step": 3916 }, { "epoch": 0.244875, "grad_norm": 2.484375, "grad_norm_var": 0.01953125, "learning_rate": 0.0001, "loss": 7.5233, "loss/crossentropy": 2.1807644367218018, "loss/hidden": 3.0, "loss/jsd": 0.0, "loss/logits": 0.23529939353466034, "step": 3918 }, { "epoch": 0.245, "grad_norm": 2.421875, "grad_norm_var": 0.015241495768229167, "learning_rate": 0.0001, "loss": 7.6941, "loss/crossentropy": 2.4168232679367065, "loss/hidden": 3.0, "loss/jsd": 0.0, "loss/logits": 0.24450047314167023, "step": 3920 }, { "epoch": 0.245125, "grad_norm": 2.640625, "grad_norm_var": 0.015836588541666665, "learning_rate": 0.0001, "loss": 7.3875, "loss/crossentropy": 2.1456319093704224, "loss/hidden": 2.953125, "loss/jsd": 0.0, "loss/logits": 0.22022899985313416, "step": 3922 }, { "epoch": 0.24525, "grad_norm": 2.46875, "grad_norm_var": 0.017183430989583335, "learning_rate": 0.0001, "loss": 7.5178, "loss/crossentropy": 2.1945712566375732, "loss/hidden": 2.8984375, "loss/jsd": 0.0, "loss/logits": 0.23692822456359863, "step": 3924 }, { "epoch": 0.245375, "grad_norm": 2.25, "grad_norm_var": 0.017943318684895834, "learning_rate": 0.0001, "loss": 7.5715, "loss/crossentropy": 2.334246516227722, "loss/hidden": 2.953125, "loss/jsd": 0.0, "loss/logits": 0.24413339793682098, "step": 3926 }, { "epoch": 0.2455, "grad_norm": 2.40625, "grad_norm_var": 0.01416015625, "learning_rate": 0.0001, "loss": 7.4425, "loss/crossentropy": 2.052259385585785, "loss/hidden": 2.9140625, "loss/jsd": 0.0, "loss/logits": 0.21555544435977936, "step": 3928 }, { "epoch": 0.245625, "grad_norm": 2.46875, "grad_norm_var": 0.0144195556640625, "learning_rate": 0.0001, "loss": 7.6389, "loss/crossentropy": 2.4917489290237427, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.24890758097171783, "step": 3930 }, { "epoch": 0.24575, "grad_norm": 2.359375, "grad_norm_var": 0.015250651041666667, "learning_rate": 0.0001, "loss": 7.5094, "loss/crossentropy": 2.3055362701416016, "loss/hidden": 3.0078125, "loss/jsd": 0.0, "loss/logits": 0.2225131392478943, "step": 3932 }, { "epoch": 0.245875, "grad_norm": 2.40625, "grad_norm_var": 0.013472493489583333, "learning_rate": 0.0001, "loss": 7.4905, "loss/crossentropy": 2.1624478101730347, "loss/hidden": 2.8671875, "loss/jsd": 0.0, "loss/logits": 0.2125004380941391, "step": 3934 }, { "epoch": 0.246, "grad_norm": 2.328125, "grad_norm_var": 0.013997395833333334, "learning_rate": 0.0001, "loss": 7.4723, "loss/crossentropy": 2.408942222595215, "loss/hidden": 2.9375, "loss/jsd": 0.0, "loss/logits": 0.23912448436021805, "step": 3936 }, { "epoch": 0.246125, "grad_norm": 2.234375, "grad_norm_var": 0.012760416666666666, "learning_rate": 0.0001, "loss": 7.2563, "loss/crossentropy": 2.1905765533447266, "loss/hidden": 2.8828125, "loss/jsd": 0.0, "loss/logits": 0.22357414662837982, "step": 3938 }, { "epoch": 0.24625, "grad_norm": 2.640625, "grad_norm_var": 0.0129547119140625, "learning_rate": 0.0001, "loss": 7.4517, "loss/crossentropy": 2.3154603242874146, "loss/hidden": 2.9453125, "loss/jsd": 0.0, "loss/logits": 0.23004180938005447, "step": 3940 }, { "epoch": 0.246375, "grad_norm": 2.3125, "grad_norm_var": 0.011620076497395833, "learning_rate": 0.0001, "loss": 7.4689, "loss/crossentropy": 2.5023341178894043, "loss/hidden": 2.984375, "loss/jsd": 0.0, "loss/logits": 0.24651438742876053, "step": 3942 }, { "epoch": 0.2465, "grad_norm": 2.34375, "grad_norm_var": 0.01148681640625, "learning_rate": 0.0001, "loss": 7.4688, "loss/crossentropy": 2.0356882214546204, "loss/hidden": 2.9296875, "loss/jsd": 0.0, "loss/logits": 0.20058569312095642, "step": 3944 }, { "epoch": 0.246625, "grad_norm": 2.5, "grad_norm_var": 0.010791015625, "learning_rate": 0.0001, "loss": 7.5573, "loss/crossentropy": 2.4191837310791016, "loss/hidden": 2.9296875, "loss/jsd": 0.0, "loss/logits": 0.23377195000648499, "step": 3946 }, { "epoch": 0.24675, "grad_norm": 2.5, "grad_norm_var": 0.010628255208333333, "learning_rate": 0.0001, "loss": 7.6407, "loss/crossentropy": 2.3417128324508667, "loss/hidden": 2.9453125, "loss/jsd": 0.0, "loss/logits": 0.22980307787656784, "step": 3948 }, { "epoch": 0.246875, "grad_norm": 2.5, "grad_norm_var": 0.014143880208333333, "learning_rate": 0.0001, "loss": 7.5853, "loss/crossentropy": 2.3083345890045166, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.23690129816532135, "step": 3950 }, { "epoch": 0.247, "grad_norm": 2.34375, "grad_norm_var": 0.017389933268229168, "learning_rate": 0.0001, "loss": 7.5956, "loss/crossentropy": 2.456955909729004, "loss/hidden": 2.921875, "loss/jsd": 0.0, "loss/logits": 0.2354580983519554, "step": 3952 }, { "epoch": 0.247125, "grad_norm": 2.359375, "grad_norm_var": 0.015055338541666666, "learning_rate": 0.0001, "loss": 7.4467, "loss/crossentropy": 2.162124276161194, "loss/hidden": 2.984375, "loss/jsd": 0.0, "loss/logits": 0.23025665432214737, "step": 3954 }, { "epoch": 0.24725, "grad_norm": 2.265625, "grad_norm_var": 0.013337198893229167, "learning_rate": 0.0001, "loss": 7.4083, "loss/crossentropy": 2.203832507133484, "loss/hidden": 2.9140625, "loss/jsd": 0.0, "loss/logits": 0.21881967037916183, "step": 3956 }, { "epoch": 0.247375, "grad_norm": 2.453125, "grad_norm_var": 0.012848917643229167, "learning_rate": 0.0001, "loss": 7.4759, "loss/crossentropy": 2.226723313331604, "loss/hidden": 2.859375, "loss/jsd": 0.0, "loss/logits": 0.22026114910840988, "step": 3958 }, { "epoch": 0.2475, "grad_norm": 2.421875, "grad_norm_var": 0.012678019205729167, "learning_rate": 0.0001, "loss": 7.476, "loss/crossentropy": 2.3727807998657227, "loss/hidden": 3.03125, "loss/jsd": 0.0, "loss/logits": 0.247600257396698, "step": 3960 }, { "epoch": 0.247625, "grad_norm": 2.375, "grad_norm_var": 0.012105305989583334, "learning_rate": 0.0001, "loss": 7.483, "loss/crossentropy": 2.2924267053604126, "loss/hidden": 2.9296875, "loss/jsd": 0.0, "loss/logits": 0.2366592362523079, "step": 3962 }, { "epoch": 0.24775, "grad_norm": 2.21875, "grad_norm_var": 0.013654581705729167, "learning_rate": 0.0001, "loss": 7.3951, "loss/crossentropy": 2.0975323915481567, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.2243112102150917, "step": 3964 }, { "epoch": 0.247875, "grad_norm": 2.53125, "grad_norm_var": 0.010553995768229166, "learning_rate": 0.0001, "loss": 7.3489, "loss/crossentropy": 2.21097195148468, "loss/hidden": 2.890625, "loss/jsd": 0.0, "loss/logits": 0.2190333753824234, "step": 3966 }, { "epoch": 0.248, "grad_norm": 2.453125, "grad_norm_var": 0.007840983072916667, "learning_rate": 0.0001, "loss": 7.4721, "loss/crossentropy": 2.2507461309432983, "loss/hidden": 2.921875, "loss/jsd": 0.0, "loss/logits": 0.23200294375419617, "step": 3968 }, { "epoch": 0.248125, "grad_norm": 2.234375, "grad_norm_var": 0.0075592041015625, "learning_rate": 0.0001, "loss": 7.4843, "loss/crossentropy": 2.191147208213806, "loss/hidden": 2.921875, "loss/jsd": 0.0, "loss/logits": 0.23013149946928024, "step": 3970 }, { "epoch": 0.24825, "grad_norm": 2.453125, "grad_norm_var": 0.00748291015625, "learning_rate": 0.0001, "loss": 7.5614, "loss/crossentropy": 2.4555280208587646, "loss/hidden": 3.0625, "loss/jsd": 0.0, "loss/logits": 0.2434312105178833, "step": 3972 }, { "epoch": 0.248375, "grad_norm": 2.375, "grad_norm_var": 0.007157389322916667, "learning_rate": 0.0001, "loss": 7.4256, "loss/crossentropy": 2.2261093854904175, "loss/hidden": 3.0, "loss/jsd": 0.0, "loss/logits": 0.23195409029722214, "step": 3974 }, { "epoch": 0.2485, "grad_norm": 2.59375, "grad_norm_var": 0.009528605143229167, "learning_rate": 0.0001, "loss": 7.4072, "loss/crossentropy": 2.1353044509887695, "loss/hidden": 2.9140625, "loss/jsd": 0.0, "loss/logits": 0.21157176792621613, "step": 3976 }, { "epoch": 0.248625, "grad_norm": 2.203125, "grad_norm_var": 0.0131988525390625, "learning_rate": 0.0001, "loss": 7.3719, "loss/crossentropy": 2.145276427268982, "loss/hidden": 2.96875, "loss/jsd": 0.0, "loss/logits": 0.21788546442985535, "step": 3978 }, { "epoch": 0.24875, "grad_norm": 2.484375, "grad_norm_var": 0.0127105712890625, "learning_rate": 0.0001, "loss": 7.4473, "loss/crossentropy": 2.37498140335083, "loss/hidden": 2.921875, "loss/jsd": 0.0, "loss/logits": 0.21876709163188934, "step": 3980 }, { "epoch": 0.248875, "grad_norm": 2.578125, "grad_norm_var": 0.01724853515625, "learning_rate": 0.0001, "loss": 7.589, "loss/crossentropy": 2.432854413986206, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.2542262375354767, "step": 3982 }, { "epoch": 0.249, "grad_norm": 2.21875, "grad_norm_var": 0.0197418212890625, "learning_rate": 0.0001, "loss": 7.5591, "loss/crossentropy": 2.451367735862732, "loss/hidden": 2.9296875, "loss/jsd": 0.0, "loss/logits": 0.22828736156225204, "step": 3984 }, { "epoch": 0.249125, "grad_norm": 2.375, "grad_norm_var": 0.0188629150390625, "learning_rate": 0.0001, "loss": 7.4799, "loss/crossentropy": 2.2237168550491333, "loss/hidden": 2.9140625, "loss/jsd": 0.0, "loss/logits": 0.23009125888347626, "step": 3986 }, { "epoch": 0.24925, "grad_norm": 2.25, "grad_norm_var": 0.020703125, "learning_rate": 0.0001, "loss": 7.6225, "loss/crossentropy": 2.0535144805908203, "loss/hidden": 3.0390625, "loss/jsd": 0.0, "loss/logits": 0.22887953370809555, "step": 3988 }, { "epoch": 0.249375, "grad_norm": 2.578125, "grad_norm_var": 0.0243804931640625, "learning_rate": 0.0001, "loss": 7.2587, "loss/crossentropy": 2.1713778972625732, "loss/hidden": 2.953125, "loss/jsd": 0.0, "loss/logits": 0.20236970484256744, "step": 3990 }, { "epoch": 0.2495, "grad_norm": 2.40625, "grad_norm_var": 0.029539998372395834, "learning_rate": 0.0001, "loss": 7.5803, "loss/crossentropy": 2.314449429512024, "loss/hidden": 2.890625, "loss/jsd": 0.0, "loss/logits": 0.2168670818209648, "step": 3992 }, { "epoch": 0.249625, "grad_norm": 2.28125, "grad_norm_var": 0.0293121337890625, "learning_rate": 0.0001, "loss": 7.5327, "loss/crossentropy": 2.2450716495513916, "loss/hidden": 3.046875, "loss/jsd": 0.0, "loss/logits": 0.23649980127811432, "step": 3994 }, { "epoch": 0.24975, "grad_norm": 2.28125, "grad_norm_var": 0.029488118489583333, "learning_rate": 0.0001, "loss": 7.3866, "loss/crossentropy": 2.172006130218506, "loss/hidden": 2.96875, "loss/jsd": 0.0, "loss/logits": 0.21691139042377472, "step": 3996 }, { "epoch": 0.249875, "grad_norm": 2.46875, "grad_norm_var": 0.025321451822916667, "learning_rate": 0.0001, "loss": 7.4429, "loss/crossentropy": 2.230931878089905, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.2391601949930191, "step": 3998 }, { "epoch": 0.25, "grad_norm": 2.703125, "grad_norm_var": 0.0368316650390625, "learning_rate": 0.0001, "loss": 7.4992, "loss/crossentropy": 2.2851897478103638, "loss/hidden": 3.0, "loss/jsd": 0.0, "loss/logits": 0.2219674438238144, "step": 4000 }, { "epoch": 0.250125, "grad_norm": 2.4375, "grad_norm_var": 0.03642578125, "learning_rate": 0.0001, "loss": 7.6042, "loss/crossentropy": 2.118333578109741, "loss/hidden": 3.0078125, "loss/jsd": 0.0, "loss/logits": 0.23840918391942978, "step": 4002 }, { "epoch": 0.25025, "grad_norm": 2.375, "grad_norm_var": 0.03258056640625, "learning_rate": 0.0001, "loss": 7.5683, "loss/crossentropy": 2.031722903251648, "loss/hidden": 3.0234375, "loss/jsd": 0.0, "loss/logits": 0.21818936616182327, "step": 4004 }, { "epoch": 0.250375, "grad_norm": 2.234375, "grad_norm_var": 0.03168843587239583, "learning_rate": 0.0001, "loss": 7.4845, "loss/crossentropy": 2.4262243509292603, "loss/hidden": 2.984375, "loss/jsd": 0.0, "loss/logits": 0.24300269782543182, "step": 4006 }, { "epoch": 0.2505, "grad_norm": 2.4375, "grad_norm_var": 0.024299112955729167, "learning_rate": 0.0001, "loss": 7.7033, "loss/crossentropy": 2.167011022567749, "loss/hidden": 3.03125, "loss/jsd": 0.0, "loss/logits": 0.23807096481323242, "step": 4008 }, { "epoch": 0.250625, "grad_norm": 2.421875, "grad_norm_var": 0.02398681640625, "learning_rate": 0.0001, "loss": 7.532, "loss/crossentropy": 2.434093475341797, "loss/hidden": 2.875, "loss/jsd": 0.0, "loss/logits": 0.22854039818048477, "step": 4010 }, { "epoch": 0.25075, "grad_norm": 2.109375, "grad_norm_var": 0.030745442708333334, "learning_rate": 0.0001, "loss": 7.3823, "loss/crossentropy": 2.121519684791565, "loss/hidden": 2.890625, "loss/jsd": 0.0, "loss/logits": 0.22766801714897156, "step": 4012 }, { "epoch": 0.250875, "grad_norm": 2.34375, "grad_norm_var": 0.027880859375, "learning_rate": 0.0001, "loss": 7.4289, "loss/crossentropy": 2.2950530648231506, "loss/hidden": 2.984375, "loss/jsd": 0.0, "loss/logits": 0.24578213691711426, "step": 4014 }, { "epoch": 0.251, "grad_norm": 2.671875, "grad_norm_var": 0.020589192708333332, "learning_rate": 0.0001, "loss": 7.5219, "loss/crossentropy": 2.3839573860168457, "loss/hidden": 3.0, "loss/jsd": 0.0, "loss/logits": 0.21992743760347366, "step": 4016 }, { "epoch": 0.251125, "grad_norm": 2.296875, "grad_norm_var": 0.022321573893229165, "learning_rate": 0.0001, "loss": 7.5561, "loss/crossentropy": 2.4837042093276978, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.23451413959264755, "step": 4018 }, { "epoch": 0.25125, "grad_norm": 2.4375, "grad_norm_var": 0.0230133056640625, "learning_rate": 0.0001, "loss": 7.5998, "loss/crossentropy": 2.4655433893203735, "loss/hidden": 2.875, "loss/jsd": 0.0, "loss/logits": 0.2359592318534851, "step": 4020 }, { "epoch": 0.251375, "grad_norm": 2.375, "grad_norm_var": 0.0247711181640625, "learning_rate": 0.0001, "loss": 7.3372, "loss/crossentropy": 2.121498227119446, "loss/hidden": 2.890625, "loss/jsd": 0.0, "loss/logits": 0.2136445865035057, "step": 4022 }, { "epoch": 0.2515, "grad_norm": 2.859375, "grad_norm_var": 0.039549763997395834, "learning_rate": 0.0001, "loss": 7.4977, "loss/crossentropy": 2.208828091621399, "loss/hidden": 2.8515625, "loss/jsd": 0.0, "loss/logits": 0.22217834740877151, "step": 4024 }, { "epoch": 0.251625, "grad_norm": 2.265625, "grad_norm_var": 0.06523030598958333, "learning_rate": 0.0001, "loss": 7.6357, "loss/crossentropy": 2.191303014755249, "loss/hidden": 2.984375, "loss/jsd": 0.0, "loss/logits": 0.25364626944065094, "step": 4026 }, { "epoch": 0.25175, "grad_norm": 2.34375, "grad_norm_var": 0.06326497395833333, "learning_rate": 0.0001, "loss": 7.4891, "loss/crossentropy": 2.271737813949585, "loss/hidden": 2.9609375, "loss/jsd": 0.0, "loss/logits": 0.23410511761903763, "step": 4028 }, { "epoch": 0.251875, "grad_norm": 2.453125, "grad_norm_var": 0.06123758951822917, "learning_rate": 0.0001, "loss": 7.5977, "loss/crossentropy": 2.153814435005188, "loss/hidden": 2.890625, "loss/jsd": 0.0, "loss/logits": 0.21588297933340073, "step": 4030 }, { "epoch": 0.252, "grad_norm": 2.421875, "grad_norm_var": 0.0592437744140625, "learning_rate": 0.0001, "loss": 7.2691, "loss/crossentropy": 2.183136820793152, "loss/hidden": 2.9140625, "loss/jsd": 0.0, "loss/logits": 0.22347302734851837, "step": 4032 }, { "epoch": 0.252125, "grad_norm": 2.25, "grad_norm_var": 0.060205078125, "learning_rate": 0.0001, "loss": 7.481, "loss/crossentropy": 2.1289315223693848, "loss/hidden": 2.984375, "loss/jsd": 0.0, "loss/logits": 0.2259645164012909, "step": 4034 }, { "epoch": 0.25225, "grad_norm": 2.4375, "grad_norm_var": 0.0599517822265625, "learning_rate": 0.0001, "loss": 7.5806, "loss/crossentropy": 1.9534605145454407, "loss/hidden": 2.859375, "loss/jsd": 0.0, "loss/logits": 0.21411054581403732, "step": 4036 }, { "epoch": 0.252375, "grad_norm": 2.375, "grad_norm_var": 0.06717122395833333, "learning_rate": 0.0001, "loss": 7.6651, "loss/crossentropy": 2.3216229677200317, "loss/hidden": 2.9609375, "loss/jsd": 0.0, "loss/logits": 0.21676085889339447, "step": 4038 }, { "epoch": 0.2525, "grad_norm": 2.296875, "grad_norm_var": 0.058470662434895834, "learning_rate": 0.0001, "loss": 7.5468, "loss/crossentropy": 2.086758255958557, "loss/hidden": 2.859375, "loss/jsd": 0.0, "loss/logits": 0.20101846754550934, "step": 4040 }, { "epoch": 0.252625, "grad_norm": 2.28125, "grad_norm_var": 0.03208719889322917, "learning_rate": 0.0001, "loss": 7.3466, "loss/crossentropy": 2.2384684085845947, "loss/hidden": 2.890625, "loss/jsd": 0.0, "loss/logits": 0.2282901331782341, "step": 4042 }, { "epoch": 0.25275, "grad_norm": 2.53125, "grad_norm_var": 0.028251139322916667, "learning_rate": 0.0001, "loss": 7.4619, "loss/crossentropy": 1.9721214771270752, "loss/hidden": 3.0, "loss/jsd": 0.0, "loss/logits": 0.21432989835739136, "step": 4044 }, { "epoch": 0.252875, "grad_norm": 2.34375, "grad_norm_var": 0.029313151041666666, "learning_rate": 0.0001, "loss": 7.3318, "loss/crossentropy": 1.9340474605560303, "loss/hidden": 2.921875, "loss/jsd": 0.0, "loss/logits": 0.21380113065242767, "step": 4046 }, { "epoch": 0.253, "grad_norm": 2.21875, "grad_norm_var": 0.0304351806640625, "learning_rate": 0.0001, "loss": 7.5475, "loss/crossentropy": 2.3662819862365723, "loss/hidden": 2.9453125, "loss/jsd": 0.0, "loss/logits": 0.21932249516248703, "step": 4048 }, { "epoch": 0.253125, "grad_norm": 2.515625, "grad_norm_var": 0.030940755208333334, "learning_rate": 0.0001, "loss": 7.5843, "loss/crossentropy": 2.2904332876205444, "loss/hidden": 3.0, "loss/jsd": 0.0, "loss/logits": 0.2326778918504715, "step": 4050 }, { "epoch": 0.25325, "grad_norm": 2.234375, "grad_norm_var": 0.032421875, "learning_rate": 0.0001, "loss": 7.4797, "loss/crossentropy": 2.4156243801116943, "loss/hidden": 2.90625, "loss/jsd": 0.0, "loss/logits": 0.23186896741390228, "step": 4052 }, { "epoch": 0.253375, "grad_norm": 2.40625, "grad_norm_var": 0.018408203125, "learning_rate": 0.0001, "loss": 7.4728, "loss/crossentropy": 2.2443253993988037, "loss/hidden": 2.8359375, "loss/jsd": 0.0, "loss/logits": 0.21730951219797134, "step": 4054 }, { "epoch": 0.2535, "grad_norm": 2.359375, "grad_norm_var": 0.019359334309895834, "learning_rate": 0.0001, "loss": 7.4368, "loss/crossentropy": 1.9812930226325989, "loss/hidden": 2.921875, "loss/jsd": 0.0, "loss/logits": 0.2251596599817276, "step": 4056 }, { "epoch": 0.253625, "grad_norm": 2.171875, "grad_norm_var": 0.020441691080729168, "learning_rate": 0.0001, "loss": 7.4598, "loss/crossentropy": 2.628469467163086, "loss/hidden": 2.9921875, "loss/jsd": 0.0, "loss/logits": 0.23521346598863602, "step": 4058 }, { "epoch": 0.25375, "grad_norm": 2.4375, "grad_norm_var": 0.018700154622395833, "learning_rate": 0.0001, "loss": 7.5237, "loss/crossentropy": 2.390172600746155, "loss/hidden": 2.9140625, "loss/jsd": 0.0, "loss/logits": 0.22036952525377274, "step": 4060 }, { "epoch": 0.253875, "grad_norm": 2.46875, "grad_norm_var": 0.019303385416666666, "learning_rate": 0.0001, "loss": 7.632, "loss/crossentropy": 2.174973249435425, "loss/hidden": 2.8828125, "loss/jsd": 0.0, "loss/logits": 0.22056061774492264, "step": 4062 }, { "epoch": 0.254, "grad_norm": 2.390625, "grad_norm_var": 0.020601399739583335, "learning_rate": 0.0001, "loss": 7.5629, "loss/crossentropy": 2.2380123138427734, "loss/hidden": 2.9609375, "loss/jsd": 0.0, "loss/logits": 0.2207566350698471, "step": 4064 }, { "epoch": 0.254125, "grad_norm": 2.359375, "grad_norm_var": 0.017821248372395834, "learning_rate": 0.0001, "loss": 7.4799, "loss/crossentropy": 2.308589458465576, "loss/hidden": 3.0078125, "loss/jsd": 0.0, "loss/logits": 0.22748146951198578, "step": 4066 }, { "epoch": 0.25425, "grad_norm": 2.578125, "grad_norm_var": 0.0190826416015625, "learning_rate": 0.0001, "loss": 7.6862, "loss/crossentropy": 2.6882801055908203, "loss/hidden": 3.0625, "loss/jsd": 0.0, "loss/logits": 0.2547220140695572, "step": 4068 }, { "epoch": 0.254375, "grad_norm": 2.15625, "grad_norm_var": 0.0201324462890625, "learning_rate": 0.0001, "loss": 7.5749, "loss/crossentropy": 2.417618155479431, "loss/hidden": 2.9609375, "loss/jsd": 0.0, "loss/logits": 0.23678294569253922, "step": 4070 }, { "epoch": 0.2545, "grad_norm": 2.609375, "grad_norm_var": 0.0204986572265625, "learning_rate": 0.0001, "loss": 7.356, "loss/crossentropy": 2.23097562789917, "loss/hidden": 2.875, "loss/jsd": 0.0, "loss/logits": 0.2246166616678238, "step": 4072 }, { "epoch": 0.254625, "grad_norm": 2.359375, "grad_norm_var": 0.02060546875, "learning_rate": 0.0001, "loss": 7.6507, "loss/crossentropy": 2.252380132675171, "loss/hidden": 3.0, "loss/jsd": 0.0, "loss/logits": 0.23461927473545074, "step": 4074 }, { "epoch": 0.25475, "grad_norm": 2.453125, "grad_norm_var": 0.01802978515625, "learning_rate": 0.0001, "loss": 7.5349, "loss/crossentropy": 2.2581721544265747, "loss/hidden": 2.9140625, "loss/jsd": 0.0, "loss/logits": 0.2264912948012352, "step": 4076 }, { "epoch": 0.254875, "grad_norm": 2.6875, "grad_norm_var": 0.022347005208333333, "learning_rate": 0.0001, "loss": 7.4877, "loss/crossentropy": 2.3790767192840576, "loss/hidden": 2.96875, "loss/jsd": 0.0, "loss/logits": 0.23550712317228317, "step": 4078 }, { "epoch": 0.255, "grad_norm": 2.34375, "grad_norm_var": 0.022248331705729166, "learning_rate": 0.0001, "loss": 7.4732, "loss/crossentropy": 2.3831344842910767, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.23731465637683868, "step": 4080 }, { "epoch": 0.255125, "grad_norm": 2.359375, "grad_norm_var": 0.026178995768229168, "learning_rate": 0.0001, "loss": 7.4923, "loss/crossentropy": 2.5071096420288086, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.22199269384145737, "step": 4082 }, { "epoch": 0.25525, "grad_norm": 2.46875, "grad_norm_var": 0.023758951822916666, "learning_rate": 0.0001, "loss": 7.5247, "loss/crossentropy": 2.30050528049469, "loss/hidden": 2.9140625, "loss/jsd": 0.0, "loss/logits": 0.22400212287902832, "step": 4084 }, { "epoch": 0.255375, "grad_norm": 2.203125, "grad_norm_var": 0.0220123291015625, "learning_rate": 0.0001, "loss": 7.4624, "loss/crossentropy": 2.2171316146850586, "loss/hidden": 2.8984375, "loss/jsd": 0.0, "loss/logits": 0.21539007127285004, "step": 4086 }, { "epoch": 0.2555, "grad_norm": 2.34375, "grad_norm_var": 0.023493448893229168, "learning_rate": 0.0001, "loss": 7.5157, "loss/crossentropy": 2.311915159225464, "loss/hidden": 2.921875, "loss/jsd": 0.0, "loss/logits": 0.2283788025379181, "step": 4088 }, { "epoch": 0.255625, "grad_norm": 2.265625, "grad_norm_var": 0.017997233072916667, "learning_rate": 0.0001, "loss": 7.3652, "loss/crossentropy": 2.3196401596069336, "loss/hidden": 3.0234375, "loss/jsd": 0.0, "loss/logits": 0.22029782831668854, "step": 4090 }, { "epoch": 0.25575, "grad_norm": 2.265625, "grad_norm_var": 0.01959228515625, "learning_rate": 0.0001, "loss": 7.3629, "loss/crossentropy": 2.2703882455825806, "loss/hidden": 2.8984375, "loss/jsd": 0.0, "loss/logits": 0.21397818624973297, "step": 4092 }, { "epoch": 0.255875, "grad_norm": 2.390625, "grad_norm_var": 0.0088531494140625, "learning_rate": 0.0001, "loss": 7.4362, "loss/crossentropy": 2.296531915664673, "loss/hidden": 2.8984375, "loss/jsd": 0.0, "loss/logits": 0.21631457656621933, "step": 4094 }, { "epoch": 0.256, "grad_norm": 2.1875, "grad_norm_var": 0.010481770833333333, "learning_rate": 0.0001, "loss": 7.5304, "loss/crossentropy": 2.178624987602234, "loss/hidden": 2.953125, "loss/jsd": 0.0, "loss/logits": 0.22841021418571472, "step": 4096 }, { "epoch": 0.256125, "grad_norm": 2.359375, "grad_norm_var": 0.009837849934895834, "learning_rate": 0.0001, "loss": 7.4817, "loss/crossentropy": 2.3095529079437256, "loss/hidden": 3.0, "loss/jsd": 0.0, "loss/logits": 0.22539246827363968, "step": 4098 }, { "epoch": 0.25625, "grad_norm": 2.1875, "grad_norm_var": 0.009618123372395834, "learning_rate": 0.0001, "loss": 7.4224, "loss/crossentropy": 2.3652232885360718, "loss/hidden": 2.90625, "loss/jsd": 0.0, "loss/logits": 0.24415633082389832, "step": 4100 }, { "epoch": 0.256375, "grad_norm": 2.546875, "grad_norm_var": 0.011083984375, "learning_rate": 0.0001, "loss": 7.4878, "loss/crossentropy": 2.0212921500205994, "loss/hidden": 2.859375, "loss/jsd": 0.0, "loss/logits": 0.22881153225898743, "step": 4102 }, { "epoch": 0.2565, "grad_norm": 2.484375, "grad_norm_var": 0.01650390625, "learning_rate": 0.0001, "loss": 7.4785, "loss/crossentropy": 2.3224822282791138, "loss/hidden": 3.0, "loss/jsd": 0.0, "loss/logits": 0.23468727618455887, "step": 4104 }, { "epoch": 0.256625, "grad_norm": 2.609375, "grad_norm_var": 0.019364420572916666, "learning_rate": 0.0001, "loss": 7.7486, "loss/crossentropy": 2.4106584787368774, "loss/hidden": 2.9375, "loss/jsd": 0.0, "loss/logits": 0.21804316341876984, "step": 4106 }, { "epoch": 0.25675, "grad_norm": 2.1875, "grad_norm_var": 0.0199371337890625, "learning_rate": 0.0001, "loss": 7.321, "loss/crossentropy": 2.2287791967391968, "loss/hidden": 2.8671875, "loss/jsd": 0.0, "loss/logits": 0.22060447931289673, "step": 4108 }, { "epoch": 0.256875, "grad_norm": 2.34375, "grad_norm_var": 0.0247955322265625, "learning_rate": 0.0001, "loss": 7.4268, "loss/crossentropy": 1.9771644473075867, "loss/hidden": 2.8359375, "loss/jsd": 0.0, "loss/logits": 0.20855499804019928, "step": 4110 }, { "epoch": 0.257, "grad_norm": 2.46875, "grad_norm_var": 0.0250396728515625, "learning_rate": 0.0001, "loss": 7.4497, "loss/crossentropy": 2.125575006008148, "loss/hidden": 2.9609375, "loss/jsd": 0.0, "loss/logits": 0.2247249186038971, "step": 4112 }, { "epoch": 0.257125, "grad_norm": 2.640625, "grad_norm_var": 0.028153483072916666, "learning_rate": 0.0001, "loss": 7.4864, "loss/crossentropy": 2.074004888534546, "loss/hidden": 2.890625, "loss/jsd": 0.0, "loss/logits": 0.21001387387514114, "step": 4114 }, { "epoch": 0.25725, "grad_norm": 2.21875, "grad_norm_var": 0.027367146809895833, "learning_rate": 0.0001, "loss": 7.4488, "loss/crossentropy": 1.9355474710464478, "loss/hidden": 2.8984375, "loss/jsd": 0.0, "loss/logits": 0.20440368354320526, "step": 4116 }, { "epoch": 0.257375, "grad_norm": 2.359375, "grad_norm_var": 0.025386555989583334, "learning_rate": 0.0001, "loss": 7.4847, "loss/crossentropy": 2.311020255088806, "loss/hidden": 2.953125, "loss/jsd": 0.0, "loss/logits": 0.20823492854833603, "step": 4118 }, { "epoch": 0.2575, "grad_norm": 2.234375, "grad_norm_var": 0.023363240559895835, "learning_rate": 0.0001, "loss": 7.4348, "loss/crossentropy": 2.3363587856292725, "loss/hidden": 2.875, "loss/jsd": 0.0, "loss/logits": 0.22158727049827576, "step": 4120 }, { "epoch": 0.257625, "grad_norm": 3.09375, "grad_norm_var": 1.232933553059896, "learning_rate": 0.0001, "loss": 7.5643, "loss/crossentropy": 2.253212571144104, "loss/hidden": 2.9296875, "loss/jsd": 0.0, "loss/logits": 0.2316179946064949, "step": 4122 }, { "epoch": 0.25775, "grad_norm": 2.328125, "grad_norm_var": 1.2166900634765625, "learning_rate": 0.0001, "loss": 7.5682, "loss/crossentropy": 2.22301983833313, "loss/hidden": 2.9921875, "loss/jsd": 0.0, "loss/logits": 0.23405539989471436, "step": 4124 }, { "epoch": 0.257875, "grad_norm": 2.625, "grad_norm_var": 1.179638671875, "learning_rate": 0.0001, "loss": 7.6354, "loss/crossentropy": 2.1657320261001587, "loss/hidden": 2.890625, "loss/jsd": 0.0, "loss/logits": 0.24702759087085724, "step": 4126 }, { "epoch": 0.258, "grad_norm": 2.265625, "grad_norm_var": 1.1891886393229167, "learning_rate": 0.0001, "loss": 7.6633, "loss/crossentropy": 2.708492875099182, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.248228058218956, "step": 4128 }, { "epoch": 0.258125, "grad_norm": 2.4375, "grad_norm_var": 1.183177693684896, "learning_rate": 0.0001, "loss": 7.567, "loss/crossentropy": 2.481553316116333, "loss/hidden": 3.0, "loss/jsd": 0.0, "loss/logits": 0.24093511700630188, "step": 4130 }, { "epoch": 0.25825, "grad_norm": 2.25, "grad_norm_var": 1.1793609619140626, "learning_rate": 0.0001, "loss": 7.5228, "loss/crossentropy": 2.336062788963318, "loss/hidden": 2.921875, "loss/jsd": 0.0, "loss/logits": 0.22933385521173477, "step": 4132 }, { "epoch": 0.258375, "grad_norm": 2.59375, "grad_norm_var": 1.1680338541666666, "learning_rate": 0.0001, "loss": 7.4909, "loss/crossentropy": 2.233590006828308, "loss/hidden": 2.90625, "loss/jsd": 0.0, "loss/logits": 0.22607532143592834, "step": 4134 }, { "epoch": 0.2585, "grad_norm": 2.5, "grad_norm_var": 1.15947265625, "learning_rate": 0.0001, "loss": 7.6554, "loss/crossentropy": 2.3016180992126465, "loss/hidden": 2.8828125, "loss/jsd": 0.0, "loss/logits": 0.21121040731668472, "step": 4136 }, { "epoch": 0.258625, "grad_norm": 2.40625, "grad_norm_var": 0.026317342122395834, "learning_rate": 0.0001, "loss": 7.551, "loss/crossentropy": 2.0917162895202637, "loss/hidden": 2.96875, "loss/jsd": 0.0, "loss/logits": 0.2196909263730049, "step": 4138 }, { "epoch": 0.25875, "grad_norm": 2.265625, "grad_norm_var": 0.025862630208333334, "learning_rate": 0.0001, "loss": 7.5891, "loss/crossentropy": 2.369123935699463, "loss/hidden": 2.9140625, "loss/jsd": 0.0, "loss/logits": 0.2268521413207054, "step": 4140 }, { "epoch": 0.258875, "grad_norm": 2.546875, "grad_norm_var": 0.016141764322916665, "learning_rate": 0.0001, "loss": 7.4561, "loss/crossentropy": 2.4183573722839355, "loss/hidden": 2.953125, "loss/jsd": 0.0, "loss/logits": 0.24558168649673462, "step": 4142 }, { "epoch": 0.259, "grad_norm": 2.359375, "grad_norm_var": 0.012906901041666667, "learning_rate": 0.0001, "loss": 7.5225, "loss/crossentropy": 2.1985327005386353, "loss/hidden": 3.0625, "loss/jsd": 0.0, "loss/logits": 0.2282828763127327, "step": 4144 }, { "epoch": 0.259125, "grad_norm": 2.28125, "grad_norm_var": 0.011205037434895834, "learning_rate": 0.0001, "loss": 7.4406, "loss/crossentropy": 2.2581781148910522, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.2538040652871132, "step": 4146 }, { "epoch": 0.25925, "grad_norm": 2.703125, "grad_norm_var": 0.01461181640625, "learning_rate": 0.0001, "loss": 7.562, "loss/crossentropy": 2.539314389228821, "loss/hidden": 2.984375, "loss/jsd": 0.0, "loss/logits": 0.23499274253845215, "step": 4148 }, { "epoch": 0.259375, "grad_norm": 2.28125, "grad_norm_var": 0.013948567708333333, "learning_rate": 0.0001, "loss": 7.5071, "loss/crossentropy": 2.0835620164871216, "loss/hidden": 2.8359375, "loss/jsd": 0.0, "loss/logits": 0.22590668499469757, "step": 4150 }, { "epoch": 0.2595, "grad_norm": 2.375, "grad_norm_var": 0.013509114583333334, "learning_rate": 0.0001, "loss": 7.4878, "loss/crossentropy": 2.1896191835403442, "loss/hidden": 2.921875, "loss/jsd": 0.0, "loss/logits": 0.22646436095237732, "step": 4152 }, { "epoch": 0.259625, "grad_norm": 2.40625, "grad_norm_var": 0.01422119140625, "learning_rate": 0.0001, "loss": 7.3402, "loss/crossentropy": 1.9305825233459473, "loss/hidden": 2.9609375, "loss/jsd": 0.0, "loss/logits": 0.20503179728984833, "step": 4154 }, { "epoch": 0.25975, "grad_norm": 2.171875, "grad_norm_var": 0.0172515869140625, "learning_rate": 0.0001, "loss": 7.417, "loss/crossentropy": 1.887523353099823, "loss/hidden": 2.9296875, "loss/jsd": 0.0, "loss/logits": 0.2075207680463791, "step": 4156 }, { "epoch": 0.259875, "grad_norm": 2.5625, "grad_norm_var": 0.017756144205729168, "learning_rate": 0.0001, "loss": 7.5039, "loss/crossentropy": 2.354191780090332, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.23857767134904861, "step": 4158 }, { "epoch": 0.26, "grad_norm": 2.21875, "grad_norm_var": 0.019873046875, "learning_rate": 0.0001, "loss": 7.3457, "loss/crossentropy": 2.1631126403808594, "loss/hidden": 2.8828125, "loss/jsd": 0.0, "loss/logits": 0.208695188164711, "step": 4160 }, { "epoch": 0.260125, "grad_norm": 2.3125, "grad_norm_var": 0.0183990478515625, "learning_rate": 0.0001, "loss": 7.337, "loss/crossentropy": 2.1133294701576233, "loss/hidden": 2.859375, "loss/jsd": 0.0, "loss/logits": 0.21781667321920395, "step": 4162 }, { "epoch": 0.26025, "grad_norm": 2.109375, "grad_norm_var": 0.014891560872395833, "learning_rate": 0.0001, "loss": 7.3932, "loss/crossentropy": 2.209348678588867, "loss/hidden": 2.890625, "loss/jsd": 0.0, "loss/logits": 0.21353087574243546, "step": 4164 }, { "epoch": 0.260375, "grad_norm": 2.5625, "grad_norm_var": 0.018000284830729168, "learning_rate": 0.0001, "loss": 7.5817, "loss/crossentropy": 2.3300869464874268, "loss/hidden": 2.9921875, "loss/jsd": 0.0, "loss/logits": 0.23223827034235, "step": 4166 }, { "epoch": 0.2605, "grad_norm": 2.265625, "grad_norm_var": 0.017829386393229167, "learning_rate": 0.0001, "loss": 7.4924, "loss/crossentropy": 2.2505780458450317, "loss/hidden": 2.890625, "loss/jsd": 0.0, "loss/logits": 0.21991939842700958, "step": 4168 }, { "epoch": 0.260625, "grad_norm": 2.265625, "grad_norm_var": 0.017936197916666667, "learning_rate": 0.0001, "loss": 7.4621, "loss/crossentropy": 2.1267285346984863, "loss/hidden": 2.9140625, "loss/jsd": 0.0, "loss/logits": 0.20734632015228271, "step": 4170 }, { "epoch": 0.26075, "grad_norm": 2.859375, "grad_norm_var": 0.03319905598958333, "learning_rate": 0.0001, "loss": 7.6821, "loss/crossentropy": 2.561190962791443, "loss/hidden": 3.0, "loss/jsd": 0.0, "loss/logits": 0.25469981133937836, "step": 4172 }, { "epoch": 0.260875, "grad_norm": 2.296875, "grad_norm_var": 0.03173421223958333, "learning_rate": 0.0001, "loss": 7.6492, "loss/crossentropy": 2.5840975046157837, "loss/hidden": 3.0546875, "loss/jsd": 0.0, "loss/logits": 0.24661777913570404, "step": 4174 }, { "epoch": 0.261, "grad_norm": 2.5625, "grad_norm_var": 0.032027180989583334, "learning_rate": 0.0001, "loss": 7.3485, "loss/crossentropy": 2.312406063079834, "loss/hidden": 2.984375, "loss/jsd": 0.0, "loss/logits": 0.23344486206769943, "step": 4176 }, { "epoch": 0.261125, "grad_norm": 2.203125, "grad_norm_var": 0.033219401041666666, "learning_rate": 0.0001, "loss": 7.4603, "loss/crossentropy": 2.3917373418807983, "loss/hidden": 2.9609375, "loss/jsd": 0.0, "loss/logits": 0.22443928569555283, "step": 4178 }, { "epoch": 0.26125, "grad_norm": 2.375, "grad_norm_var": 0.03388570149739583, "learning_rate": 0.0001, "loss": 7.5113, "loss/crossentropy": 2.3461934328079224, "loss/hidden": 3.015625, "loss/jsd": 0.0, "loss/logits": 0.2298140972852707, "step": 4180 }, { "epoch": 0.261375, "grad_norm": 2.390625, "grad_norm_var": 0.0312408447265625, "learning_rate": 0.0001, "loss": 7.5746, "loss/crossentropy": 2.3137890100479126, "loss/hidden": 2.921875, "loss/jsd": 0.0, "loss/logits": 0.23146747052669525, "step": 4182 }, { "epoch": 0.2615, "grad_norm": 2.53125, "grad_norm_var": 0.03052978515625, "learning_rate": 0.0001, "loss": 7.5043, "loss/crossentropy": 2.2696300745010376, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.2407674714922905, "step": 4184 }, { "epoch": 0.261625, "grad_norm": 2.234375, "grad_norm_var": 0.03245340983072917, "learning_rate": 0.0001, "loss": 7.4266, "loss/crossentropy": 2.3981817960739136, "loss/hidden": 3.0078125, "loss/jsd": 0.0, "loss/logits": 0.2357579469680786, "step": 4186 }, { "epoch": 0.26175, "grad_norm": 2.28125, "grad_norm_var": 0.022826131184895834, "learning_rate": 0.0001, "loss": 7.2983, "loss/crossentropy": 2.278700351715088, "loss/hidden": 2.9609375, "loss/jsd": 0.0, "loss/logits": 0.24243299663066864, "step": 4188 }, { "epoch": 0.261875, "grad_norm": 2.65625, "grad_norm_var": 0.0290191650390625, "learning_rate": 0.0001, "loss": 7.4591, "loss/crossentropy": 2.4734745025634766, "loss/hidden": 2.90625, "loss/jsd": 0.0, "loss/logits": 0.2231806293129921, "step": 4190 }, { "epoch": 0.262, "grad_norm": 2.5625, "grad_norm_var": 0.03518473307291667, "learning_rate": 0.0001, "loss": 7.535, "loss/crossentropy": 2.281826972961426, "loss/hidden": 2.9375, "loss/jsd": 0.0, "loss/logits": 0.2438691332936287, "step": 4192 }, { "epoch": 0.262125, "grad_norm": 2.140625, "grad_norm_var": 0.03855692545572917, "learning_rate": 0.0001, "loss": 7.4283, "loss/crossentropy": 2.1762090921401978, "loss/hidden": 3.0234375, "loss/jsd": 0.0, "loss/logits": 0.19439425319433212, "step": 4194 }, { "epoch": 0.26225, "grad_norm": 2.65625, "grad_norm_var": 0.03613993326822917, "learning_rate": 0.0001, "loss": 7.5443, "loss/crossentropy": 2.3899075984954834, "loss/hidden": 2.90625, "loss/jsd": 0.0, "loss/logits": 0.22394779324531555, "step": 4196 }, { "epoch": 0.262375, "grad_norm": 2.21875, "grad_norm_var": 0.03876953125, "learning_rate": 0.0001, "loss": 7.612, "loss/crossentropy": 2.250289797782898, "loss/hidden": 2.9609375, "loss/jsd": 0.0, "loss/logits": 0.22523928433656693, "step": 4198 }, { "epoch": 0.2625, "grad_norm": 2.515625, "grad_norm_var": 0.03834228515625, "learning_rate": 0.0001, "loss": 7.4899, "loss/crossentropy": 2.0721304416656494, "loss/hidden": 3.0625, "loss/jsd": 0.0, "loss/logits": 0.22663410007953644, "step": 4200 }, { "epoch": 0.262625, "grad_norm": 2.421875, "grad_norm_var": 0.0362945556640625, "learning_rate": 0.0001, "loss": 7.5435, "loss/crossentropy": 2.1173004508018494, "loss/hidden": 2.875, "loss/jsd": 0.0, "loss/logits": 0.20741065591573715, "step": 4202 }, { "epoch": 0.26275, "grad_norm": 2.171875, "grad_norm_var": 0.033186848958333334, "learning_rate": 0.0001, "loss": 7.5587, "loss/crossentropy": 2.3617148399353027, "loss/hidden": 2.953125, "loss/jsd": 0.0, "loss/logits": 0.22693531960248947, "step": 4204 }, { "epoch": 0.262875, "grad_norm": 2.390625, "grad_norm_var": 0.029784138997395834, "learning_rate": 0.0001, "loss": 7.49, "loss/crossentropy": 2.1566708087921143, "loss/hidden": 2.8671875, "loss/jsd": 0.0, "loss/logits": 0.22167697548866272, "step": 4206 }, { "epoch": 0.263, "grad_norm": 2.5, "grad_norm_var": 0.022001139322916665, "learning_rate": 0.0001, "loss": 7.4201, "loss/crossentropy": 2.2872395515441895, "loss/hidden": 2.90625, "loss/jsd": 0.0, "loss/logits": 0.23204121738672256, "step": 4208 }, { "epoch": 0.263125, "grad_norm": 2.265625, "grad_norm_var": 0.0179595947265625, "learning_rate": 0.0001, "loss": 7.4017, "loss/crossentropy": 2.0254051089286804, "loss/hidden": 3.0, "loss/jsd": 0.0, "loss/logits": 0.21901094913482666, "step": 4210 }, { "epoch": 0.26325, "grad_norm": 2.109375, "grad_norm_var": 0.017878214518229168, "learning_rate": 0.0001, "loss": 7.3872, "loss/crossentropy": 2.1917017698287964, "loss/hidden": 2.90625, "loss/jsd": 0.0, "loss/logits": 0.2296266034245491, "step": 4212 }, { "epoch": 0.263375, "grad_norm": 2.5, "grad_norm_var": 0.019050089518229167, "learning_rate": 0.0001, "loss": 7.5185, "loss/crossentropy": 2.329349994659424, "loss/hidden": 2.8671875, "loss/jsd": 0.0, "loss/logits": 0.22277437150478363, "step": 4214 }, { "epoch": 0.2635, "grad_norm": 2.21875, "grad_norm_var": 0.019661458333333333, "learning_rate": 0.0001, "loss": 7.4769, "loss/crossentropy": 2.3094968795776367, "loss/hidden": 2.9140625, "loss/jsd": 0.0, "loss/logits": 0.22067157924175262, "step": 4216 }, { "epoch": 0.263625, "grad_norm": 2.390625, "grad_norm_var": 0.015999348958333333, "learning_rate": 0.0001, "loss": 7.5023, "loss/crossentropy": 2.057901620864868, "loss/hidden": 3.046875, "loss/jsd": 0.0, "loss/logits": 0.21499791741371155, "step": 4218 }, { "epoch": 0.26375, "grad_norm": 2.203125, "grad_norm_var": 0.015283203125, "learning_rate": 0.0001, "loss": 7.3844, "loss/crossentropy": 2.2509127855300903, "loss/hidden": 2.921875, "loss/jsd": 0.0, "loss/logits": 0.21002254635095596, "step": 4220 }, { "epoch": 0.263875, "grad_norm": 2.453125, "grad_norm_var": 0.0160064697265625, "learning_rate": 0.0001, "loss": 7.3918, "loss/crossentropy": 2.280747413635254, "loss/hidden": 2.859375, "loss/jsd": 0.0, "loss/logits": 0.21898390352725983, "step": 4222 }, { "epoch": 0.264, "grad_norm": 2.25, "grad_norm_var": 0.015672810872395835, "learning_rate": 0.0001, "loss": 7.5155, "loss/crossentropy": 2.3069392442703247, "loss/hidden": 2.875, "loss/jsd": 0.0, "loss/logits": 0.23423085361719131, "step": 4224 }, { "epoch": 0.264125, "grad_norm": 2.3125, "grad_norm_var": 0.015185546875, "learning_rate": 0.0001, "loss": 7.4396, "loss/crossentropy": 2.0961318016052246, "loss/hidden": 3.03125, "loss/jsd": 0.0, "loss/logits": 0.23950795829296112, "step": 4226 }, { "epoch": 0.26425, "grad_norm": 2.265625, "grad_norm_var": 0.012691243489583334, "learning_rate": 0.0001, "loss": 7.4114, "loss/crossentropy": 2.2781461477279663, "loss/hidden": 2.8984375, "loss/jsd": 0.0, "loss/logits": 0.23504098504781723, "step": 4228 }, { "epoch": 0.264375, "grad_norm": 2.296875, "grad_norm_var": 0.011181640625, "learning_rate": 0.0001, "loss": 7.4173, "loss/crossentropy": 2.1842299699783325, "loss/hidden": 2.8671875, "loss/jsd": 0.0, "loss/logits": 0.21509061753749847, "step": 4230 }, { "epoch": 0.2645, "grad_norm": 2.328125, "grad_norm_var": 0.008817545572916667, "learning_rate": 0.0001, "loss": 7.5102, "loss/crossentropy": 2.3369181156158447, "loss/hidden": 2.9453125, "loss/jsd": 0.0, "loss/logits": 0.23115838319063187, "step": 4232 }, { "epoch": 0.264625, "grad_norm": 2.46875, "grad_norm_var": 0.018033854166666665, "learning_rate": 0.0001, "loss": 7.6056, "loss/crossentropy": 2.123593807220459, "loss/hidden": 2.8828125, "loss/jsd": 0.0, "loss/logits": 0.21955028176307678, "step": 4234 }, { "epoch": 0.26475, "grad_norm": 2.421875, "grad_norm_var": 0.019579060872395835, "learning_rate": 0.0001, "loss": 7.5573, "loss/crossentropy": 2.1841901540756226, "loss/hidden": 2.8984375, "loss/jsd": 0.0, "loss/logits": 0.20950836688280106, "step": 4236 }, { "epoch": 0.264875, "grad_norm": 2.1875, "grad_norm_var": 0.0201812744140625, "learning_rate": 0.0001, "loss": 7.4577, "loss/crossentropy": 2.180688500404358, "loss/hidden": 2.8359375, "loss/jsd": 0.0, "loss/logits": 0.21572377532720566, "step": 4238 }, { "epoch": 0.265, "grad_norm": 2.296875, "grad_norm_var": 0.0191314697265625, "learning_rate": 0.0001, "loss": 7.5163, "loss/crossentropy": 2.5182595252990723, "loss/hidden": 3.015625, "loss/jsd": 0.0, "loss/logits": 0.24843312799930573, "step": 4240 }, { "epoch": 0.265125, "grad_norm": 2.4375, "grad_norm_var": 0.020783487955729166, "learning_rate": 0.0001, "loss": 7.3331, "loss/crossentropy": 2.3631211519241333, "loss/hidden": 2.8828125, "loss/jsd": 0.0, "loss/logits": 0.22477340698242188, "step": 4242 }, { "epoch": 0.26525, "grad_norm": 2.109375, "grad_norm_var": 0.025874837239583334, "learning_rate": 0.0001, "loss": 7.3582, "loss/crossentropy": 2.392832636833191, "loss/hidden": 2.9296875, "loss/jsd": 0.0, "loss/logits": 0.23732294142246246, "step": 4244 }, { "epoch": 0.265375, "grad_norm": 2.640625, "grad_norm_var": 0.0303863525390625, "learning_rate": 0.0001, "loss": 7.5458, "loss/crossentropy": 2.4172616004943848, "loss/hidden": 2.984375, "loss/jsd": 0.0, "loss/logits": 0.25564996898174286, "step": 4246 }, { "epoch": 0.2655, "grad_norm": 2.25, "grad_norm_var": 0.030614217122395832, "learning_rate": 0.0001, "loss": 7.3352, "loss/crossentropy": 2.068120002746582, "loss/hidden": 2.8515625, "loss/jsd": 0.0, "loss/logits": 0.20718112587928772, "step": 4248 }, { "epoch": 0.265625, "grad_norm": 2.203125, "grad_norm_var": 0.025495402018229165, "learning_rate": 0.0001, "loss": 7.4747, "loss/crossentropy": 2.3594859838485718, "loss/hidden": 2.984375, "loss/jsd": 0.0, "loss/logits": 0.22751520574092865, "step": 4250 }, { "epoch": 0.26575, "grad_norm": 2.421875, "grad_norm_var": 0.023298136393229165, "learning_rate": 0.0001, "loss": 7.3998, "loss/crossentropy": 2.446703553199768, "loss/hidden": 2.9921875, "loss/jsd": 0.0, "loss/logits": 0.23083661496639252, "step": 4252 }, { "epoch": 0.265875, "grad_norm": 2.203125, "grad_norm_var": 0.026398722330729166, "learning_rate": 0.0001, "loss": 7.3885, "loss/crossentropy": 2.268368363380432, "loss/hidden": 2.9375, "loss/jsd": 0.0, "loss/logits": 0.222725510597229, "step": 4254 }, { "epoch": 0.266, "grad_norm": 2.1875, "grad_norm_var": 0.028446451822916666, "learning_rate": 0.0001, "loss": 7.5755, "loss/crossentropy": 2.152546525001526, "loss/hidden": 2.8828125, "loss/jsd": 0.0, "loss/logits": 0.22324977815151215, "step": 4256 }, { "epoch": 0.266125, "grad_norm": 2.3125, "grad_norm_var": 0.026220703125, "learning_rate": 0.0001, "loss": 7.3993, "loss/crossentropy": 2.101604700088501, "loss/hidden": 2.953125, "loss/jsd": 0.0, "loss/logits": 0.22246717661619186, "step": 4258 }, { "epoch": 0.26625, "grad_norm": 2.703125, "grad_norm_var": 0.029410807291666667, "learning_rate": 0.0001, "loss": 7.4041, "loss/crossentropy": 2.2383921146392822, "loss/hidden": 2.859375, "loss/jsd": 0.0, "loss/logits": 0.22142166644334793, "step": 4260 }, { "epoch": 0.266375, "grad_norm": 2.265625, "grad_norm_var": 0.023681640625, "learning_rate": 0.0001, "loss": 7.5145, "loss/crossentropy": 2.367012858390808, "loss/hidden": 2.953125, "loss/jsd": 0.0, "loss/logits": 0.22680655121803284, "step": 4262 }, { "epoch": 0.2665, "grad_norm": 2.203125, "grad_norm_var": 0.025732421875, "learning_rate": 0.0001, "loss": 7.2148, "loss/crossentropy": 2.4683737754821777, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.23372989892959595, "step": 4264 }, { "epoch": 0.266625, "grad_norm": 2.5, "grad_norm_var": 0.0251617431640625, "learning_rate": 0.0001, "loss": 7.4554, "loss/crossentropy": 2.2976629734039307, "loss/hidden": 2.8671875, "loss/jsd": 0.0, "loss/logits": 0.21854644268751144, "step": 4266 }, { "epoch": 0.26675, "grad_norm": 2.3125, "grad_norm_var": 0.025874837239583334, "learning_rate": 0.0001, "loss": 7.567, "loss/crossentropy": 2.0734461545944214, "loss/hidden": 2.9296875, "loss/jsd": 0.0, "loss/logits": 0.20711353421211243, "step": 4268 }, { "epoch": 0.266875, "grad_norm": 2.25, "grad_norm_var": 0.023094685872395833, "learning_rate": 0.0001, "loss": 7.3456, "loss/crossentropy": 2.2194411754608154, "loss/hidden": 2.9140625, "loss/jsd": 0.0, "loss/logits": 0.2097087875008583, "step": 4270 }, { "epoch": 0.267, "grad_norm": 2.375, "grad_norm_var": 0.0214263916015625, "learning_rate": 0.0001, "loss": 7.3546, "loss/crossentropy": 2.3072394132614136, "loss/hidden": 2.9453125, "loss/jsd": 0.0, "loss/logits": 0.2229909971356392, "step": 4272 }, { "epoch": 0.267125, "grad_norm": 2.546875, "grad_norm_var": 0.0269195556640625, "learning_rate": 0.0001, "loss": 7.4624, "loss/crossentropy": 2.3109829425811768, "loss/hidden": 2.9296875, "loss/jsd": 0.0, "loss/logits": 0.2220280021429062, "step": 4274 }, { "epoch": 0.26725, "grad_norm": 2.4375, "grad_norm_var": 0.020921834309895835, "learning_rate": 0.0001, "loss": 7.5419, "loss/crossentropy": 2.2977495193481445, "loss/hidden": 2.9296875, "loss/jsd": 0.0, "loss/logits": 0.24211393296718597, "step": 4276 }, { "epoch": 0.267375, "grad_norm": 2.203125, "grad_norm_var": 0.021605428059895834, "learning_rate": 0.0001, "loss": 7.5332, "loss/crossentropy": 2.2791624069213867, "loss/hidden": 2.8984375, "loss/jsd": 0.0, "loss/logits": 0.2135421559214592, "step": 4278 }, { "epoch": 0.2675, "grad_norm": 2.5, "grad_norm_var": 0.019489542643229166, "learning_rate": 0.0001, "loss": 7.5424, "loss/crossentropy": 2.4423515796661377, "loss/hidden": 2.9453125, "loss/jsd": 0.0, "loss/logits": 0.2313070297241211, "step": 4280 }, { "epoch": 0.267625, "grad_norm": 2.3125, "grad_norm_var": 0.021043904622395835, "learning_rate": 0.0001, "loss": 7.2078, "loss/crossentropy": 2.1874340772628784, "loss/hidden": 2.90625, "loss/jsd": 0.0, "loss/logits": 0.20884133875370026, "step": 4282 }, { "epoch": 0.26775, "grad_norm": 2.4375, "grad_norm_var": 0.0204986572265625, "learning_rate": 0.0001, "loss": 7.5178, "loss/crossentropy": 2.2358198165893555, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.24393440037965775, "step": 4284 }, { "epoch": 0.267875, "grad_norm": 2.28125, "grad_norm_var": 0.021776326497395835, "learning_rate": 0.0001, "loss": 7.2812, "loss/crossentropy": 2.064828336238861, "loss/hidden": 2.875, "loss/jsd": 0.0, "loss/logits": 0.2007676362991333, "step": 4286 }, { "epoch": 0.268, "grad_norm": 2.296875, "grad_norm_var": 0.022786458333333332, "learning_rate": 0.0001, "loss": 7.4434, "loss/crossentropy": 2.301008701324463, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.23284630477428436, "step": 4288 }, { "epoch": 0.268125, "grad_norm": 2.296875, "grad_norm_var": 0.018538411458333334, "learning_rate": 0.0001, "loss": 7.3878, "loss/crossentropy": 1.959551453590393, "loss/hidden": 2.9453125, "loss/jsd": 0.0, "loss/logits": 0.1952490210533142, "step": 4290 }, { "epoch": 0.26825, "grad_norm": 2.375, "grad_norm_var": 0.016890462239583334, "learning_rate": 0.0001, "loss": 7.5644, "loss/crossentropy": 2.352171301841736, "loss/hidden": 2.96875, "loss/jsd": 0.0, "loss/logits": 0.2467745542526245, "step": 4292 }, { "epoch": 0.268375, "grad_norm": 2.375, "grad_norm_var": 0.015478515625, "learning_rate": 0.0001, "loss": 7.343, "loss/crossentropy": 2.213460922241211, "loss/hidden": 2.953125, "loss/jsd": 0.0, "loss/logits": 0.2195800244808197, "step": 4294 }, { "epoch": 0.2685, "grad_norm": 2.375, "grad_norm_var": 0.0146484375, "learning_rate": 0.0001, "loss": 7.4869, "loss/crossentropy": 2.042128264904022, "loss/hidden": 3.0390625, "loss/jsd": 0.0, "loss/logits": 0.2317609265446663, "step": 4296 }, { "epoch": 0.268625, "grad_norm": 2.25, "grad_norm_var": 0.014769490559895833, "learning_rate": 0.0001, "loss": 7.3506, "loss/crossentropy": 1.9961625337600708, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.19491659104824066, "step": 4298 }, { "epoch": 0.26875, "grad_norm": 2.46875, "grad_norm_var": 0.017508951822916667, "learning_rate": 0.0001, "loss": 7.468, "loss/crossentropy": 2.118159055709839, "loss/hidden": 2.8828125, "loss/jsd": 0.0, "loss/logits": 0.2275976911187172, "step": 4300 }, { "epoch": 0.268875, "grad_norm": 2.421875, "grad_norm_var": 0.015095011393229166, "learning_rate": 0.0001, "loss": 7.3515, "loss/crossentropy": 2.206713318824768, "loss/hidden": 2.9296875, "loss/jsd": 0.0, "loss/logits": 0.21720553189516068, "step": 4302 }, { "epoch": 0.269, "grad_norm": 2.28125, "grad_norm_var": 0.01480712890625, "learning_rate": 0.0001, "loss": 7.4099, "loss/crossentropy": 2.2401949167251587, "loss/hidden": 2.890625, "loss/jsd": 0.0, "loss/logits": 0.21897585690021515, "step": 4304 }, { "epoch": 0.269125, "grad_norm": 2.21875, "grad_norm_var": 0.0141021728515625, "learning_rate": 0.0001, "loss": 7.3092, "loss/crossentropy": 2.1088130474090576, "loss/hidden": 2.8984375, "loss/jsd": 0.0, "loss/logits": 0.21423090249300003, "step": 4306 }, { "epoch": 0.26925, "grad_norm": 2.546875, "grad_norm_var": 0.014774576822916666, "learning_rate": 0.0001, "loss": 7.4478, "loss/crossentropy": 2.2879436016082764, "loss/hidden": 2.8984375, "loss/jsd": 0.0, "loss/logits": 0.20457830280065536, "step": 4308 }, { "epoch": 0.269375, "grad_norm": 2.46875, "grad_norm_var": 0.01558837890625, "learning_rate": 0.0001, "loss": 7.4249, "loss/crossentropy": 2.038703441619873, "loss/hidden": 2.9375, "loss/jsd": 0.0, "loss/logits": 0.21783077716827393, "step": 4310 }, { "epoch": 0.2695, "grad_norm": 2.375, "grad_norm_var": 0.01842041015625, "learning_rate": 0.0001, "loss": 7.6062, "loss/crossentropy": 2.4648650884628296, "loss/hidden": 2.9296875, "loss/jsd": 0.0, "loss/logits": 0.22937683761119843, "step": 4312 }, { "epoch": 0.269625, "grad_norm": 2.25, "grad_norm_var": 0.016950480143229165, "learning_rate": 0.0001, "loss": 7.4844, "loss/crossentropy": 2.0551947951316833, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.22646934539079666, "step": 4314 }, { "epoch": 0.26975, "grad_norm": 2.375, "grad_norm_var": 0.015165201822916667, "learning_rate": 0.0001, "loss": 7.4434, "loss/crossentropy": 2.237433969974518, "loss/hidden": 2.96875, "loss/jsd": 0.0, "loss/logits": 0.22162948548793793, "step": 4316 }, { "epoch": 0.269875, "grad_norm": 2.515625, "grad_norm_var": 0.015543619791666666, "learning_rate": 0.0001, "loss": 7.5201, "loss/crossentropy": 2.275120735168457, "loss/hidden": 3.0546875, "loss/jsd": 0.0, "loss/logits": 0.23471694439649582, "step": 4318 }, { "epoch": 0.27, "grad_norm": 2.078125, "grad_norm_var": 0.019733683268229166, "learning_rate": 0.0001, "loss": 7.3651, "loss/crossentropy": 2.355165123939514, "loss/hidden": 3.046875, "loss/jsd": 0.0, "loss/logits": 0.25206050276756287, "step": 4320 }, { "epoch": 0.270125, "grad_norm": 2.4375, "grad_norm_var": 0.018602498372395835, "learning_rate": 0.0001, "loss": 7.5587, "loss/crossentropy": 2.378506660461426, "loss/hidden": 2.9375, "loss/jsd": 0.0, "loss/logits": 0.22956155985593796, "step": 4322 }, { "epoch": 0.27025, "grad_norm": 2.515625, "grad_norm_var": 0.01636962890625, "learning_rate": 0.0001, "loss": 7.6102, "loss/crossentropy": 2.2196805477142334, "loss/hidden": 2.90625, "loss/jsd": 0.0, "loss/logits": 0.23681814968585968, "step": 4324 }, { "epoch": 0.270375, "grad_norm": 2.5625, "grad_norm_var": 0.0182525634765625, "learning_rate": 0.0001, "loss": 7.3513, "loss/crossentropy": 2.1617177724838257, "loss/hidden": 2.875, "loss/jsd": 0.0, "loss/logits": 0.19959037005901337, "step": 4326 }, { "epoch": 0.2705, "grad_norm": 2.421875, "grad_norm_var": 0.022386678059895835, "learning_rate": 0.0001, "loss": 7.4452, "loss/crossentropy": 2.2576276063919067, "loss/hidden": 2.9296875, "loss/jsd": 0.0, "loss/logits": 0.2321421429514885, "step": 4328 }, { "epoch": 0.270625, "grad_norm": 2.140625, "grad_norm_var": 0.025267537434895834, "learning_rate": 0.0001, "loss": 7.36, "loss/crossentropy": 2.213522434234619, "loss/hidden": 2.9921875, "loss/jsd": 0.0, "loss/logits": 0.2282974198460579, "step": 4330 }, { "epoch": 0.27075, "grad_norm": 2.21875, "grad_norm_var": 0.028880818684895834, "learning_rate": 0.0001, "loss": 7.3864, "loss/crossentropy": 1.9888715744018555, "loss/hidden": 2.8984375, "loss/jsd": 0.0, "loss/logits": 0.21721196174621582, "step": 4332 }, { "epoch": 0.270875, "grad_norm": 2.515625, "grad_norm_var": 0.028706868489583332, "learning_rate": 0.0001, "loss": 7.3121, "loss/crossentropy": 2.3635976314544678, "loss/hidden": 2.9296875, "loss/jsd": 0.0, "loss/logits": 0.22595931589603424, "step": 4334 }, { "epoch": 0.271, "grad_norm": 2.390625, "grad_norm_var": 0.022997029622395835, "learning_rate": 0.0001, "loss": 7.5235, "loss/crossentropy": 2.4795159101486206, "loss/hidden": 2.8984375, "loss/jsd": 0.0, "loss/logits": 0.23015367984771729, "step": 4336 }, { "epoch": 0.271125, "grad_norm": 2.984375, "grad_norm_var": 0.047998046875, "learning_rate": 0.0001, "loss": 7.4249, "loss/crossentropy": 2.234145760536194, "loss/hidden": 2.8125, "loss/jsd": 0.0, "loss/logits": 0.20242837071418762, "step": 4338 }, { "epoch": 0.27125, "grad_norm": 2.46875, "grad_norm_var": 0.04712626139322917, "learning_rate": 0.0001, "loss": 7.5575, "loss/crossentropy": 2.2627410888671875, "loss/hidden": 2.9453125, "loss/jsd": 0.0, "loss/logits": 0.2454666718840599, "step": 4340 }, { "epoch": 0.271375, "grad_norm": 2.34375, "grad_norm_var": 0.04541727701822917, "learning_rate": 0.0001, "loss": 7.3373, "loss/crossentropy": 2.4094841480255127, "loss/hidden": 2.96875, "loss/jsd": 0.0, "loss/logits": 0.22295518964529037, "step": 4342 }, { "epoch": 0.2715, "grad_norm": 2.328125, "grad_norm_var": 0.03918863932291667, "learning_rate": 0.0001, "loss": 7.489, "loss/crossentropy": 2.485226035118103, "loss/hidden": 3.0234375, "loss/jsd": 0.0, "loss/logits": 0.2179865539073944, "step": 4344 }, { "epoch": 0.271625, "grad_norm": 2.3125, "grad_norm_var": 0.03469645182291667, "learning_rate": 0.0001, "loss": 7.4442, "loss/crossentropy": 2.3933740854263306, "loss/hidden": 2.953125, "loss/jsd": 0.0, "loss/logits": 0.24797789752483368, "step": 4346 }, { "epoch": 0.27175, "grad_norm": 2.359375, "grad_norm_var": 0.028246053059895835, "learning_rate": 0.0001, "loss": 7.4128, "loss/crossentropy": 2.136129140853882, "loss/hidden": 2.9140625, "loss/jsd": 0.0, "loss/logits": 0.20815995335578918, "step": 4348 }, { "epoch": 0.271875, "grad_norm": 2.234375, "grad_norm_var": 0.029474894205729168, "learning_rate": 0.0001, "loss": 7.3621, "loss/crossentropy": 2.2934054136276245, "loss/hidden": 2.9375, "loss/jsd": 0.0, "loss/logits": 0.22534728795289993, "step": 4350 }, { "epoch": 0.272, "grad_norm": 2.40625, "grad_norm_var": 0.029808553059895833, "learning_rate": 0.0001, "loss": 7.7497, "loss/crossentropy": 2.2244023084640503, "loss/hidden": 3.046875, "loss/jsd": 0.0, "loss/logits": 0.24372966587543488, "step": 4352 }, { "epoch": 0.272125, "grad_norm": 2.375, "grad_norm_var": 0.004515584309895833, "learning_rate": 0.0001, "loss": 7.5103, "loss/crossentropy": 2.2522358894348145, "loss/hidden": 2.9453125, "loss/jsd": 0.0, "loss/logits": 0.2237512692809105, "step": 4354 }, { "epoch": 0.27225, "grad_norm": 2.53125, "grad_norm_var": 0.005533854166666667, "learning_rate": 0.0001, "loss": 7.5503, "loss/crossentropy": 2.006240487098694, "loss/hidden": 2.8828125, "loss/jsd": 0.0, "loss/logits": 0.21218693256378174, "step": 4356 }, { "epoch": 0.272375, "grad_norm": 2.34375, "grad_norm_var": 0.007356770833333333, "learning_rate": 0.0001, "loss": 7.4, "loss/crossentropy": 2.1753203868865967, "loss/hidden": 2.859375, "loss/jsd": 0.0, "loss/logits": 0.21662192791700363, "step": 4358 }, { "epoch": 0.2725, "grad_norm": 2.1875, "grad_norm_var": 0.008317057291666667, "learning_rate": 0.0001, "loss": 7.4924, "loss/crossentropy": 2.5050474405288696, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.23573968559503555, "step": 4360 }, { "epoch": 0.272625, "grad_norm": 2.1875, "grad_norm_var": 0.00982666015625, "learning_rate": 0.0001, "loss": 7.2394, "loss/crossentropy": 2.2561826705932617, "loss/hidden": 2.9453125, "loss/jsd": 0.0, "loss/logits": 0.2149110957980156, "step": 4362 }, { "epoch": 0.27275, "grad_norm": 2.34375, "grad_norm_var": 0.011262003580729167, "learning_rate": 0.0001, "loss": 7.5461, "loss/crossentropy": 2.4560667276382446, "loss/hidden": 2.9609375, "loss/jsd": 0.0, "loss/logits": 0.22948572784662247, "step": 4364 }, { "epoch": 0.272875, "grad_norm": 2.328125, "grad_norm_var": 0.010895792643229167, "learning_rate": 0.0001, "loss": 7.5201, "loss/crossentropy": 2.2445766925811768, "loss/hidden": 2.9609375, "loss/jsd": 0.0, "loss/logits": 0.24865839630365372, "step": 4366 }, { "epoch": 0.273, "grad_norm": 2.484375, "grad_norm_var": 0.010602823893229167, "learning_rate": 0.0001, "loss": 7.3433, "loss/crossentropy": 2.048405647277832, "loss/hidden": 2.828125, "loss/jsd": 0.0, "loss/logits": 0.21950747072696686, "step": 4368 }, { "epoch": 0.273125, "grad_norm": 2.1875, "grad_norm_var": 0.016405232747395835, "learning_rate": 0.0001, "loss": 7.478, "loss/crossentropy": 2.5548731088638306, "loss/hidden": 2.9609375, "loss/jsd": 0.0, "loss/logits": 0.24532774090766907, "step": 4370 }, { "epoch": 0.27325, "grad_norm": 2.328125, "grad_norm_var": 0.0150390625, "learning_rate": 0.0001, "loss": 7.4301, "loss/crossentropy": 2.158105969429016, "loss/hidden": 2.890625, "loss/jsd": 0.0, "loss/logits": 0.2423110455274582, "step": 4372 }, { "epoch": 0.273375, "grad_norm": 2.375, "grad_norm_var": 0.014518229166666667, "learning_rate": 0.0001, "loss": 7.5337, "loss/crossentropy": 2.3834049701690674, "loss/hidden": 2.9453125, "loss/jsd": 0.0, "loss/logits": 0.2357938289642334, "step": 4374 }, { "epoch": 0.2735, "grad_norm": 2.578125, "grad_norm_var": 0.017692057291666667, "learning_rate": 0.0001, "loss": 7.3858, "loss/crossentropy": 2.2216193675994873, "loss/hidden": 2.8828125, "loss/jsd": 0.0, "loss/logits": 0.23488235473632812, "step": 4376 }, { "epoch": 0.273625, "grad_norm": 2.3125, "grad_norm_var": 0.04396870930989583, "learning_rate": 0.0001, "loss": 7.4476, "loss/crossentropy": 2.11636745929718, "loss/hidden": 2.8984375, "loss/jsd": 0.0, "loss/logits": 0.21510595828294754, "step": 4378 }, { "epoch": 0.27375, "grad_norm": 2.203125, "grad_norm_var": 0.04899800618489583, "learning_rate": 0.0001, "loss": 7.572, "loss/crossentropy": 2.2707090377807617, "loss/hidden": 3.0234375, "loss/jsd": 0.0, "loss/logits": 0.22173020988702774, "step": 4380 }, { "epoch": 0.273875, "grad_norm": 2.203125, "grad_norm_var": 0.05029296875, "learning_rate": 0.0001, "loss": 7.3261, "loss/crossentropy": 2.270142912864685, "loss/hidden": 2.859375, "loss/jsd": 0.0, "loss/logits": 0.21372423321008682, "step": 4382 }, { "epoch": 0.274, "grad_norm": 2.484375, "grad_norm_var": 0.053120930989583336, "learning_rate": 0.0001, "loss": 7.4542, "loss/crossentropy": 2.403599977493286, "loss/hidden": 2.8515625, "loss/jsd": 0.0, "loss/logits": 0.20995523035526276, "step": 4384 }, { "epoch": 0.274125, "grad_norm": 2.484375, "grad_norm_var": 0.049071248372395834, "learning_rate": 0.0001, "loss": 7.6809, "loss/crossentropy": 2.2590737342834473, "loss/hidden": 2.8984375, "loss/jsd": 0.0, "loss/logits": 0.22851084172725677, "step": 4386 }, { "epoch": 0.27425, "grad_norm": 2.5, "grad_norm_var": 0.046647135416666666, "learning_rate": 0.0001, "loss": 7.5104, "loss/crossentropy": 2.303091526031494, "loss/hidden": 3.0, "loss/jsd": 0.0, "loss/logits": 0.24387390911579132, "step": 4388 }, { "epoch": 0.274375, "grad_norm": 2.59375, "grad_norm_var": 0.05137430826822917, "learning_rate": 0.0001, "loss": 7.5524, "loss/crossentropy": 2.3587981462478638, "loss/hidden": 2.9296875, "loss/jsd": 0.0, "loss/logits": 0.2259790152311325, "step": 4390 }, { "epoch": 0.2745, "grad_norm": 2.1875, "grad_norm_var": 0.05120442708333333, "learning_rate": 0.0001, "loss": 7.5764, "loss/crossentropy": 2.26511013507843, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.2363736778497696, "step": 4392 }, { "epoch": 0.274625, "grad_norm": 2.28125, "grad_norm_var": 0.028758748372395834, "learning_rate": 0.0001, "loss": 7.2611, "loss/crossentropy": 2.1224186420440674, "loss/hidden": 2.984375, "loss/jsd": 0.0, "loss/logits": 0.21887023746967316, "step": 4394 }, { "epoch": 0.27475, "grad_norm": 2.34375, "grad_norm_var": 0.024665323893229167, "learning_rate": 0.0001, "loss": 7.411, "loss/crossentropy": 2.4636008739471436, "loss/hidden": 2.8828125, "loss/jsd": 0.0, "loss/logits": 0.22841719537973404, "step": 4396 }, { "epoch": 0.274875, "grad_norm": 2.390625, "grad_norm_var": 0.022785441080729166, "learning_rate": 0.0001, "loss": 7.4425, "loss/crossentropy": 2.098508358001709, "loss/hidden": 2.8984375, "loss/jsd": 0.0, "loss/logits": 0.24196244776248932, "step": 4398 }, { "epoch": 0.275, "grad_norm": 2.296875, "grad_norm_var": 0.020856730143229165, "learning_rate": 0.0001, "loss": 7.6109, "loss/crossentropy": 2.245239734649658, "loss/hidden": 2.9140625, "loss/jsd": 0.0, "loss/logits": 0.2130843922495842, "step": 4400 }, { "epoch": 0.275125, "grad_norm": 2.234375, "grad_norm_var": 0.025374348958333334, "learning_rate": 0.0001, "loss": 7.3746, "loss/crossentropy": 2.1706331968307495, "loss/hidden": 2.828125, "loss/jsd": 0.0, "loss/logits": 0.21158546209335327, "step": 4402 }, { "epoch": 0.27525, "grad_norm": 2.296875, "grad_norm_var": 0.026200358072916666, "learning_rate": 0.0001, "loss": 7.2704, "loss/crossentropy": 2.114712119102478, "loss/hidden": 2.8203125, "loss/jsd": 0.0, "loss/logits": 0.21735002845525742, "step": 4404 }, { "epoch": 0.275375, "grad_norm": 2.375, "grad_norm_var": 0.01597900390625, "learning_rate": 0.0001, "loss": 7.4371, "loss/crossentropy": 2.4146808385849, "loss/hidden": 2.8984375, "loss/jsd": 0.0, "loss/logits": 0.2161477878689766, "step": 4406 }, { "epoch": 0.2755, "grad_norm": 2.28125, "grad_norm_var": 0.0154205322265625, "learning_rate": 0.0001, "loss": 7.5387, "loss/crossentropy": 2.5566210746765137, "loss/hidden": 3.015625, "loss/jsd": 0.0, "loss/logits": 0.23766440898180008, "step": 4408 }, { "epoch": 0.275625, "grad_norm": 2.125, "grad_norm_var": 0.01412353515625, "learning_rate": 0.0001, "loss": 7.3358, "loss/crossentropy": 2.216295003890991, "loss/hidden": 2.8828125, "loss/jsd": 0.0, "loss/logits": 0.21209152787923813, "step": 4410 }, { "epoch": 0.27575, "grad_norm": 2.328125, "grad_norm_var": 0.01796875, "learning_rate": 0.0001, "loss": 7.5077, "loss/crossentropy": 2.306955099105835, "loss/hidden": 3.0, "loss/jsd": 0.0, "loss/logits": 0.2539971098303795, "step": 4412 }, { "epoch": 0.275875, "grad_norm": 2.328125, "grad_norm_var": 0.018195597330729167, "learning_rate": 0.0001, "loss": 7.4567, "loss/crossentropy": 2.462777853012085, "loss/hidden": 2.8828125, "loss/jsd": 0.0, "loss/logits": 0.21741768717765808, "step": 4414 }, { "epoch": 0.276, "grad_norm": 2.734375, "grad_norm_var": 0.0329010009765625, "learning_rate": 0.0001, "loss": 7.5509, "loss/crossentropy": 2.155795156955719, "loss/hidden": 2.8671875, "loss/jsd": 0.0, "loss/logits": 0.22352226078510284, "step": 4416 }, { "epoch": 0.276125, "grad_norm": 3.234375, "grad_norm_var": 0.088720703125, "learning_rate": 0.0001, "loss": 7.4472, "loss/crossentropy": 2.4492448568344116, "loss/hidden": 3.0078125, "loss/jsd": 0.0, "loss/logits": 0.22944186627864838, "step": 4418 }, { "epoch": 0.27625, "grad_norm": 3.453125, "grad_norm_var": 0.14516499837239583, "learning_rate": 0.0001, "loss": 7.4556, "loss/crossentropy": 2.296768367290497, "loss/hidden": 2.90625, "loss/jsd": 0.0, "loss/logits": 0.2189040184020996, "step": 4420 }, { "epoch": 0.276375, "grad_norm": 2.1875, "grad_norm_var": 0.1495758056640625, "learning_rate": 0.0001, "loss": 7.4439, "loss/crossentropy": 2.4126769304275513, "loss/hidden": 2.9375, "loss/jsd": 0.0, "loss/logits": 0.23999958485364914, "step": 4422 }, { "epoch": 0.2765, "grad_norm": 2.40625, "grad_norm_var": 0.14530843098958332, "learning_rate": 0.0001, "loss": 7.501, "loss/crossentropy": 2.47876238822937, "loss/hidden": 2.96875, "loss/jsd": 0.0, "loss/logits": 0.23916874080896378, "step": 4424 }, { "epoch": 0.276625, "grad_norm": 2.40625, "grad_norm_var": 0.138720703125, "learning_rate": 0.0001, "loss": 7.5551, "loss/crossentropy": 2.282703399658203, "loss/hidden": 2.9296875, "loss/jsd": 0.0, "loss/logits": 0.2197970598936081, "step": 4426 }, { "epoch": 0.27675, "grad_norm": 2.171875, "grad_norm_var": 0.1488677978515625, "learning_rate": 0.0001, "loss": 7.4234, "loss/crossentropy": 2.26338267326355, "loss/hidden": 2.984375, "loss/jsd": 0.0, "loss/logits": 0.22207339107990265, "step": 4428 }, { "epoch": 0.276875, "grad_norm": 4.59375, "grad_norm_var": 0.3890462239583333, "learning_rate": 0.0001, "loss": 7.5694, "loss/crossentropy": 2.34802508354187, "loss/hidden": 3.0546875, "loss/jsd": 0.0, "loss/logits": 0.23877909779548645, "step": 4430 }, { "epoch": 0.277, "grad_norm": 2.28125, "grad_norm_var": 0.4018951416015625, "learning_rate": 0.0001, "loss": 7.6152, "loss/crossentropy": 2.4531562328338623, "loss/hidden": 2.984375, "loss/jsd": 0.0, "loss/logits": 0.24313053488731384, "step": 4432 }, { "epoch": 0.277125, "grad_norm": 2.265625, "grad_norm_var": 0.39807535807291666, "learning_rate": 0.0001, "loss": 7.2933, "loss/crossentropy": 2.1458455324172974, "loss/hidden": 2.84375, "loss/jsd": 0.0, "loss/logits": 0.21921583265066147, "step": 4434 }, { "epoch": 0.27725, "grad_norm": 2.453125, "grad_norm_var": 0.34549153645833336, "learning_rate": 0.0001, "loss": 7.3935, "loss/crossentropy": 2.3641769886016846, "loss/hidden": 2.9375, "loss/jsd": 0.0, "loss/logits": 0.2235148400068283, "step": 4436 }, { "epoch": 0.277375, "grad_norm": 2.21875, "grad_norm_var": 0.3485026041666667, "learning_rate": 0.0001, "loss": 7.4657, "loss/crossentropy": 2.2699997425079346, "loss/hidden": 2.9375, "loss/jsd": 0.0, "loss/logits": 0.2232022061944008, "step": 4438 }, { "epoch": 0.2775, "grad_norm": 2.453125, "grad_norm_var": 0.3463043212890625, "learning_rate": 0.0001, "loss": 7.489, "loss/crossentropy": 2.2820075154304504, "loss/hidden": 3.0, "loss/jsd": 0.0, "loss/logits": 0.22474562376737595, "step": 4440 }, { "epoch": 0.277625, "grad_norm": 2.28125, "grad_norm_var": 0.34402669270833336, "learning_rate": 0.0001, "loss": 7.2763, "loss/crossentropy": 2.3725186586380005, "loss/hidden": 2.96875, "loss/jsd": 0.0, "loss/logits": 0.2197442501783371, "step": 4442 }, { "epoch": 0.27775, "grad_norm": 2.40625, "grad_norm_var": 0.33488667805989586, "learning_rate": 0.0001, "loss": 7.3046, "loss/crossentropy": 2.0243775248527527, "loss/hidden": 2.8125, "loss/jsd": 0.0, "loss/logits": 0.20709318667650223, "step": 4444 }, { "epoch": 0.277875, "grad_norm": 2.5, "grad_norm_var": 0.019928995768229166, "learning_rate": 0.0001, "loss": 7.5691, "loss/crossentropy": 2.267683506011963, "loss/hidden": 3.03125, "loss/jsd": 0.0, "loss/logits": 0.23416732996702194, "step": 4446 }, { "epoch": 0.278, "grad_norm": 2.296875, "grad_norm_var": 0.015241495768229167, "learning_rate": 0.0001, "loss": 7.5059, "loss/crossentropy": 2.4565316438674927, "loss/hidden": 2.921875, "loss/jsd": 0.0, "loss/logits": 0.23150788247585297, "step": 4448 }, { "epoch": 0.278125, "grad_norm": 2.359375, "grad_norm_var": 0.015132649739583334, "learning_rate": 0.0001, "loss": 7.3083, "loss/crossentropy": 2.2579123973846436, "loss/hidden": 2.8671875, "loss/jsd": 0.0, "loss/logits": 0.23325718939304352, "step": 4450 }, { "epoch": 0.27825, "grad_norm": 2.3125, "grad_norm_var": 0.015819295247395834, "learning_rate": 0.0001, "loss": 7.4666, "loss/crossentropy": 2.2895156145095825, "loss/hidden": 2.875, "loss/jsd": 0.0, "loss/logits": 0.23578675836324692, "step": 4452 }, { "epoch": 0.278375, "grad_norm": 2.40625, "grad_norm_var": 0.014232381184895834, "learning_rate": 0.0001, "loss": 7.3926, "loss/crossentropy": 2.4191900491714478, "loss/hidden": 2.9609375, "loss/jsd": 0.0, "loss/logits": 0.23101849853992462, "step": 4454 }, { "epoch": 0.2785, "grad_norm": 2.625, "grad_norm_var": 0.01796875, "learning_rate": 0.0001, "loss": 7.5229, "loss/crossentropy": 2.1673884987831116, "loss/hidden": 3.0, "loss/jsd": 0.0, "loss/logits": 0.21298403292894363, "step": 4456 }, { "epoch": 0.278625, "grad_norm": 2.28125, "grad_norm_var": 0.02232666015625, "learning_rate": 0.0001, "loss": 7.2791, "loss/crossentropy": 1.9522064924240112, "loss/hidden": 2.84375, "loss/jsd": 0.0, "loss/logits": 0.1917770504951477, "step": 4458 }, { "epoch": 0.27875, "grad_norm": 2.234375, "grad_norm_var": 0.02037353515625, "learning_rate": 0.0001, "loss": 7.3455, "loss/crossentropy": 2.064726769924164, "loss/hidden": 2.859375, "loss/jsd": 0.0, "loss/logits": 0.2061791568994522, "step": 4460 }, { "epoch": 0.278875, "grad_norm": 2.21875, "grad_norm_var": 0.0209869384765625, "learning_rate": 0.0001, "loss": 7.5458, "loss/crossentropy": 2.57345187664032, "loss/hidden": 2.9609375, "loss/jsd": 0.0, "loss/logits": 0.24028793722391129, "step": 4462 }, { "epoch": 0.279, "grad_norm": 2.296875, "grad_norm_var": 0.021092732747395832, "learning_rate": 0.0001, "loss": 7.52, "loss/crossentropy": 2.1306859254837036, "loss/hidden": 2.875, "loss/jsd": 0.0, "loss/logits": 0.21655796468257904, "step": 4464 }, { "epoch": 0.279125, "grad_norm": 2.203125, "grad_norm_var": 0.019237263997395834, "learning_rate": 0.0001, "loss": 7.3362, "loss/crossentropy": 2.0370622873306274, "loss/hidden": 2.828125, "loss/jsd": 0.0, "loss/logits": 0.20722102373838425, "step": 4466 }, { "epoch": 0.27925, "grad_norm": 2.234375, "grad_norm_var": 0.023942057291666666, "learning_rate": 0.0001, "loss": 7.3497, "loss/crossentropy": 2.3192614316940308, "loss/hidden": 2.8359375, "loss/jsd": 0.0, "loss/logits": 0.2113727256655693, "step": 4468 }, { "epoch": 0.279375, "grad_norm": 2.390625, "grad_norm_var": 0.0229400634765625, "learning_rate": 0.0001, "loss": 7.3515, "loss/crossentropy": 2.378154754638672, "loss/hidden": 2.890625, "loss/jsd": 0.0, "loss/logits": 0.23498709499835968, "step": 4470 }, { "epoch": 0.2795, "grad_norm": 2.234375, "grad_norm_var": 0.017284138997395834, "learning_rate": 0.0001, "loss": 7.3089, "loss/crossentropy": 2.3729015588760376, "loss/hidden": 3.0078125, "loss/jsd": 0.0, "loss/logits": 0.24130545556545258, "step": 4472 }, { "epoch": 0.279625, "grad_norm": 2.203125, "grad_norm_var": 0.009761555989583334, "learning_rate": 0.0001, "loss": 7.37, "loss/crossentropy": 2.139391541481018, "loss/hidden": 2.96875, "loss/jsd": 0.0, "loss/logits": 0.24066069722175598, "step": 4474 }, { "epoch": 0.27975, "grad_norm": 2.234375, "grad_norm_var": 0.0101226806640625, "learning_rate": 0.0001, "loss": 7.3748, "loss/crossentropy": 2.561732530593872, "loss/hidden": 3.0078125, "loss/jsd": 0.0, "loss/logits": 0.2334682047367096, "step": 4476 }, { "epoch": 0.279875, "grad_norm": 2.375, "grad_norm_var": 0.0133453369140625, "learning_rate": 0.0001, "loss": 7.4443, "loss/crossentropy": 2.5658878087997437, "loss/hidden": 2.90625, "loss/jsd": 0.0, "loss/logits": 0.22343529760837555, "step": 4478 }, { "epoch": 0.28, "grad_norm": 2.234375, "grad_norm_var": 0.014322916666666666, "learning_rate": 0.0001, "loss": 7.2706, "loss/crossentropy": 2.389992117881775, "loss/hidden": 2.9140625, "loss/jsd": 0.0, "loss/logits": 0.2276037633419037, "step": 4480 }, { "epoch": 0.280125, "grad_norm": 2.28125, "grad_norm_var": 0.022044881184895834, "learning_rate": 0.0001, "loss": 7.5801, "loss/crossentropy": 2.496686816215515, "loss/hidden": 2.9453125, "loss/jsd": 0.0, "loss/logits": 0.2241569608449936, "step": 4482 }, { "epoch": 0.28025, "grad_norm": 2.265625, "grad_norm_var": 0.017658487955729166, "learning_rate": 0.0001, "loss": 7.3432, "loss/crossentropy": 2.33056378364563, "loss/hidden": 2.96875, "loss/jsd": 0.0, "loss/logits": 0.23244759440422058, "step": 4484 }, { "epoch": 0.280375, "grad_norm": 2.296875, "grad_norm_var": 0.017313639322916668, "learning_rate": 0.0001, "loss": 7.4858, "loss/crossentropy": 2.2192925214767456, "loss/hidden": 2.9453125, "loss/jsd": 0.0, "loss/logits": 0.21771762520074844, "step": 4486 }, { "epoch": 0.2805, "grad_norm": 2.296875, "grad_norm_var": 0.017902628580729166, "learning_rate": 0.0001, "loss": 7.3754, "loss/crossentropy": 2.1574283838272095, "loss/hidden": 2.859375, "loss/jsd": 0.0, "loss/logits": 0.20880410075187683, "step": 4488 }, { "epoch": 0.280625, "grad_norm": 2.1875, "grad_norm_var": 0.019266764322916668, "learning_rate": 0.0001, "loss": 7.4153, "loss/crossentropy": 2.3507992029190063, "loss/hidden": 2.9921875, "loss/jsd": 0.0, "loss/logits": 0.23332297801971436, "step": 4490 }, { "epoch": 0.28075, "grad_norm": 2.5, "grad_norm_var": 0.021833292643229165, "learning_rate": 0.0001, "loss": 7.4313, "loss/crossentropy": 2.2984365224838257, "loss/hidden": 3.015625, "loss/jsd": 0.0, "loss/logits": 0.22823016345500946, "step": 4492 }, { "epoch": 0.280875, "grad_norm": 2.125, "grad_norm_var": 0.021076456705729166, "learning_rate": 0.0001, "loss": 7.2686, "loss/crossentropy": 2.104506731033325, "loss/hidden": 2.8359375, "loss/jsd": 0.0, "loss/logits": 0.22106795012950897, "step": 4494 }, { "epoch": 0.281, "grad_norm": 2.109375, "grad_norm_var": 0.022196451822916668, "learning_rate": 0.0001, "loss": 7.537, "loss/crossentropy": 2.197329521179199, "loss/hidden": 2.8828125, "loss/jsd": 0.0, "loss/logits": 0.222248375415802, "step": 4496 }, { "epoch": 0.281125, "grad_norm": 2.28125, "grad_norm_var": 0.012995402018229166, "learning_rate": 0.0001, "loss": 7.3045, "loss/crossentropy": 2.2515393495559692, "loss/hidden": 2.890625, "loss/jsd": 0.0, "loss/logits": 0.22155503183603287, "step": 4498 }, { "epoch": 0.28125, "grad_norm": 2.53125, "grad_norm_var": 0.019986979166666665, "learning_rate": 0.0001, "loss": 7.3329, "loss/crossentropy": 2.1618112325668335, "loss/hidden": 2.90625, "loss/jsd": 0.0, "loss/logits": 0.22284910082817078, "step": 4500 }, { "epoch": 0.281375, "grad_norm": 2.109375, "grad_norm_var": 0.0228424072265625, "learning_rate": 0.0001, "loss": 7.2701, "loss/crossentropy": 2.088331937789917, "loss/hidden": 2.8828125, "loss/jsd": 0.0, "loss/logits": 0.23195888847112656, "step": 4502 }, { "epoch": 0.2815, "grad_norm": 2.578125, "grad_norm_var": 0.026529947916666668, "learning_rate": 0.0001, "loss": 7.4928, "loss/crossentropy": 2.327447533607483, "loss/hidden": 2.9609375, "loss/jsd": 0.0, "loss/logits": 0.22215363383293152, "step": 4504 }, { "epoch": 0.281625, "grad_norm": 2.21875, "grad_norm_var": 0.026334635416666665, "learning_rate": 0.0001, "loss": 7.4629, "loss/crossentropy": 2.294684410095215, "loss/hidden": 2.9453125, "loss/jsd": 0.0, "loss/logits": 0.2363322228193283, "step": 4506 }, { "epoch": 0.28175, "grad_norm": 2.25, "grad_norm_var": 0.02236328125, "learning_rate": 0.0001, "loss": 7.4728, "loss/crossentropy": 2.3789936304092407, "loss/hidden": 2.9140625, "loss/jsd": 0.0, "loss/logits": 0.22490855306386948, "step": 4508 }, { "epoch": 0.281875, "grad_norm": 2.296875, "grad_norm_var": 0.019554646809895833, "learning_rate": 0.0001, "loss": 7.45, "loss/crossentropy": 2.167789340019226, "loss/hidden": 2.8671875, "loss/jsd": 0.0, "loss/logits": 0.21532851457595825, "step": 4510 }, { "epoch": 0.282, "grad_norm": 2.265625, "grad_norm_var": 0.016795857747395834, "learning_rate": 0.0001, "loss": 7.3859, "loss/crossentropy": 2.197750449180603, "loss/hidden": 2.9375, "loss/jsd": 0.0, "loss/logits": 0.22491279989480972, "step": 4512 }, { "epoch": 0.282125, "grad_norm": 2.171875, "grad_norm_var": 0.017951456705729167, "learning_rate": 0.0001, "loss": 7.3363, "loss/crossentropy": 2.168722629547119, "loss/hidden": 2.828125, "loss/jsd": 0.0, "loss/logits": 0.2121475487947464, "step": 4514 }, { "epoch": 0.28225, "grad_norm": 2.4375, "grad_norm_var": 0.013407389322916666, "learning_rate": 0.0001, "loss": 7.3139, "loss/crossentropy": 2.154773235321045, "loss/hidden": 2.875, "loss/jsd": 0.0, "loss/logits": 0.22038134932518005, "step": 4516 }, { "epoch": 0.282375, "grad_norm": 2.390625, "grad_norm_var": 0.01109619140625, "learning_rate": 0.0001, "loss": 7.4864, "loss/crossentropy": 2.1986958980560303, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.22602716833353043, "step": 4518 }, { "epoch": 0.2825, "grad_norm": 2.4375, "grad_norm_var": 0.007225545247395834, "learning_rate": 0.0001, "loss": 7.2571, "loss/crossentropy": 2.1418489813804626, "loss/hidden": 2.84375, "loss/jsd": 0.0, "loss/logits": 0.20537669956684113, "step": 4520 }, { "epoch": 0.282625, "grad_norm": 2.296875, "grad_norm_var": 0.010081990559895834, "learning_rate": 0.0001, "loss": 7.478, "loss/crossentropy": 2.152292013168335, "loss/hidden": 2.921875, "loss/jsd": 0.0, "loss/logits": 0.21266800165176392, "step": 4522 }, { "epoch": 0.28275, "grad_norm": 2.265625, "grad_norm_var": 0.012181599934895834, "learning_rate": 0.0001, "loss": 7.3898, "loss/crossentropy": 2.3404839038848877, "loss/hidden": 2.953125, "loss/jsd": 0.0, "loss/logits": 0.21193323284387589, "step": 4524 }, { "epoch": 0.282875, "grad_norm": 2.59375, "grad_norm_var": 0.0173004150390625, "learning_rate": 0.0001, "loss": 7.5024, "loss/crossentropy": 2.012498140335083, "loss/hidden": 2.8359375, "loss/jsd": 0.0, "loss/logits": 0.21007203310728073, "step": 4526 }, { "epoch": 0.283, "grad_norm": 2.328125, "grad_norm_var": 0.016975911458333333, "learning_rate": 0.0001, "loss": 7.3939, "loss/crossentropy": 2.218207001686096, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.23073484003543854, "step": 4528 }, { "epoch": 0.283125, "grad_norm": 2.21875, "grad_norm_var": 0.016185506184895834, "learning_rate": 0.0001, "loss": 7.519, "loss/crossentropy": 2.3596348762512207, "loss/hidden": 2.9140625, "loss/jsd": 0.0, "loss/logits": 0.2409394308924675, "step": 4530 }, { "epoch": 0.28325, "grad_norm": 2.546875, "grad_norm_var": 0.016502888997395833, "learning_rate": 0.0001, "loss": 7.4134, "loss/crossentropy": 2.223081946372986, "loss/hidden": 2.859375, "loss/jsd": 0.0, "loss/logits": 0.22396845370531082, "step": 4532 }, { "epoch": 0.283375, "grad_norm": 2.3125, "grad_norm_var": 0.019755045572916668, "learning_rate": 0.0001, "loss": 7.3217, "loss/crossentropy": 2.258087635040283, "loss/hidden": 2.890625, "loss/jsd": 0.0, "loss/logits": 0.21903490275144577, "step": 4534 }, { "epoch": 0.2835, "grad_norm": 2.421875, "grad_norm_var": 0.022379557291666668, "learning_rate": 0.0001, "loss": 7.3878, "loss/crossentropy": 2.3071266412734985, "loss/hidden": 3.015625, "loss/jsd": 0.0, "loss/logits": 0.21915674209594727, "step": 4536 }, { "epoch": 0.283625, "grad_norm": 2.296875, "grad_norm_var": 0.020035807291666666, "learning_rate": 0.0001, "loss": 7.4188, "loss/crossentropy": 2.1818684339523315, "loss/hidden": 2.9609375, "loss/jsd": 0.0, "loss/logits": 0.22117509692907333, "step": 4538 }, { "epoch": 0.28375, "grad_norm": 2.453125, "grad_norm_var": 0.015851847330729165, "learning_rate": 0.0001, "loss": 7.4492, "loss/crossentropy": 2.411054253578186, "loss/hidden": 3.03125, "loss/jsd": 0.0, "loss/logits": 0.22131475806236267, "step": 4540 }, { "epoch": 0.283875, "grad_norm": 2.3125, "grad_norm_var": 0.0133209228515625, "learning_rate": 0.0001, "loss": 7.4197, "loss/crossentropy": 2.2065125703811646, "loss/hidden": 2.859375, "loss/jsd": 0.0, "loss/logits": 0.21521207690238953, "step": 4542 }, { "epoch": 0.284, "grad_norm": 2.453125, "grad_norm_var": 0.0134918212890625, "learning_rate": 0.0001, "loss": 7.4471, "loss/crossentropy": 2.266932725906372, "loss/hidden": 2.8828125, "loss/jsd": 0.0, "loss/logits": 0.22295700013637543, "step": 4544 }, { "epoch": 0.284125, "grad_norm": 2.328125, "grad_norm_var": 0.011872355143229167, "learning_rate": 0.0001, "loss": 7.5007, "loss/crossentropy": 2.360601305961609, "loss/hidden": 3.0546875, "loss/jsd": 0.0, "loss/logits": 0.23160873353481293, "step": 4546 }, { "epoch": 0.28425, "grad_norm": 2.21875, "grad_norm_var": 0.013895670572916666, "learning_rate": 0.0001, "loss": 7.3673, "loss/crossentropy": 2.1906508207321167, "loss/hidden": 3.0, "loss/jsd": 0.0, "loss/logits": 0.21694105863571167, "step": 4548 }, { "epoch": 0.284375, "grad_norm": 2.328125, "grad_norm_var": 0.011649576822916667, "learning_rate": 0.0001, "loss": 7.4986, "loss/crossentropy": 2.348879337310791, "loss/hidden": 3.03125, "loss/jsd": 0.0, "loss/logits": 0.213335782289505, "step": 4550 }, { "epoch": 0.2845, "grad_norm": 2.390625, "grad_norm_var": 0.008128865559895834, "learning_rate": 0.0001, "loss": 7.5221, "loss/crossentropy": 2.343509554862976, "loss/hidden": 2.90625, "loss/jsd": 0.0, "loss/logits": 0.24417825788259506, "step": 4552 }, { "epoch": 0.284625, "grad_norm": 2.1875, "grad_norm_var": 0.008524576822916666, "learning_rate": 0.0001, "loss": 7.4006, "loss/crossentropy": 2.243224263191223, "loss/hidden": 2.9375, "loss/jsd": 0.0, "loss/logits": 0.21264301240444183, "step": 4554 }, { "epoch": 0.28475, "grad_norm": 2.5, "grad_norm_var": 0.011823527018229167, "learning_rate": 0.0001, "loss": 7.4715, "loss/crossentropy": 2.383206009864807, "loss/hidden": 2.9609375, "loss/jsd": 0.0, "loss/logits": 0.2533396929502487, "step": 4556 }, { "epoch": 0.284875, "grad_norm": 2.140625, "grad_norm_var": 0.013695271809895833, "learning_rate": 0.0001, "loss": 7.2299, "loss/crossentropy": 2.2882405519485474, "loss/hidden": 2.9375, "loss/jsd": 0.0, "loss/logits": 0.23156971484422684, "step": 4558 }, { "epoch": 0.285, "grad_norm": 2.25, "grad_norm_var": 0.012548828125, "learning_rate": 0.0001, "loss": 7.4714, "loss/crossentropy": 2.3273061513900757, "loss/hidden": 2.8984375, "loss/jsd": 0.0, "loss/logits": 0.2340613454580307, "step": 4560 }, { "epoch": 0.285125, "grad_norm": 2.1875, "grad_norm_var": 0.01236572265625, "learning_rate": 0.0001, "loss": 7.2556, "loss/crossentropy": 2.206205368041992, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.21796930581331253, "step": 4562 }, { "epoch": 0.28525, "grad_norm": 2.328125, "grad_norm_var": 0.0104644775390625, "learning_rate": 0.0001, "loss": 7.3176, "loss/crossentropy": 2.26140820980072, "loss/hidden": 2.875, "loss/jsd": 0.0, "loss/logits": 0.21520189940929413, "step": 4564 }, { "epoch": 0.285375, "grad_norm": 2.28125, "grad_norm_var": 0.01904296875, "learning_rate": 0.0001, "loss": 7.4868, "loss/crossentropy": 2.1846988201141357, "loss/hidden": 2.890625, "loss/jsd": 0.0, "loss/logits": 0.23294325172901154, "step": 4566 }, { "epoch": 0.2855, "grad_norm": 2.484375, "grad_norm_var": 0.020262654622395834, "learning_rate": 0.0001, "loss": 7.3939, "loss/crossentropy": 2.1126757860183716, "loss/hidden": 2.84375, "loss/jsd": 0.0, "loss/logits": 0.20701631158590317, "step": 4568 }, { "epoch": 0.285625, "grad_norm": 2.828125, "grad_norm_var": 0.03251851399739583, "learning_rate": 0.0001, "loss": 7.4807, "loss/crossentropy": 2.407546877861023, "loss/hidden": 2.8828125, "loss/jsd": 0.0, "loss/logits": 0.22992483526468277, "step": 4570 }, { "epoch": 0.28575, "grad_norm": 2.296875, "grad_norm_var": 0.0304840087890625, "learning_rate": 0.0001, "loss": 7.2665, "loss/crossentropy": 2.281604766845703, "loss/hidden": 2.8515625, "loss/jsd": 0.0, "loss/logits": 0.2113427072763443, "step": 4572 }, { "epoch": 0.285875, "grad_norm": 2.3125, "grad_norm_var": 0.0279205322265625, "learning_rate": 0.0001, "loss": 7.5764, "loss/crossentropy": 2.199875235557556, "loss/hidden": 2.859375, "loss/jsd": 0.0, "loss/logits": 0.21796562522649765, "step": 4574 }, { "epoch": 0.286, "grad_norm": 2.296875, "grad_norm_var": 0.027318318684895832, "learning_rate": 0.0001, "loss": 7.4868, "loss/crossentropy": 2.4474005699157715, "loss/hidden": 3.0, "loss/jsd": 0.0, "loss/logits": 0.22615059465169907, "step": 4576 }, { "epoch": 0.286125, "grad_norm": 2.234375, "grad_norm_var": 0.025809733072916667, "learning_rate": 0.0001, "loss": 7.4363, "loss/crossentropy": 2.554097890853882, "loss/hidden": 2.8828125, "loss/jsd": 0.0, "loss/logits": 0.2136968970298767, "step": 4578 }, { "epoch": 0.28625, "grad_norm": 2.375, "grad_norm_var": 0.028218587239583332, "learning_rate": 0.0001, "loss": 7.3431, "loss/crossentropy": 2.4345656633377075, "loss/hidden": 3.03125, "loss/jsd": 0.0, "loss/logits": 0.22010788321495056, "step": 4580 }, { "epoch": 0.286375, "grad_norm": 2.359375, "grad_norm_var": 0.021240234375, "learning_rate": 0.0001, "loss": 7.6311, "loss/crossentropy": 2.0311567187309265, "loss/hidden": 2.9375, "loss/jsd": 0.0, "loss/logits": 0.21834005415439606, "step": 4582 }, { "epoch": 0.2865, "grad_norm": 2.5625, "grad_norm_var": 0.024019368489583335, "learning_rate": 0.0001, "loss": 7.5513, "loss/crossentropy": 2.2485233545303345, "loss/hidden": 2.9453125, "loss/jsd": 0.0, "loss/logits": 0.23373603075742722, "step": 4584 }, { "epoch": 0.286625, "grad_norm": 2.40625, "grad_norm_var": 0.009859212239583333, "learning_rate": 0.0001, "loss": 7.312, "loss/crossentropy": 2.2216570377349854, "loss/hidden": 2.875, "loss/jsd": 0.0, "loss/logits": 0.2155969738960266, "step": 4586 }, { "epoch": 0.28675, "grad_norm": 2.421875, "grad_norm_var": 0.009943644205729166, "learning_rate": 0.0001, "loss": 7.5092, "loss/crossentropy": 2.213111639022827, "loss/hidden": 2.8515625, "loss/jsd": 0.0, "loss/logits": 0.2212653011083603, "step": 4588 }, { "epoch": 0.286875, "grad_norm": 2.109375, "grad_norm_var": 0.013362630208333334, "learning_rate": 0.0001, "loss": 7.3797, "loss/crossentropy": 2.257219433784485, "loss/hidden": 2.984375, "loss/jsd": 0.0, "loss/logits": 0.22302033007144928, "step": 4590 }, { "epoch": 0.287, "grad_norm": 2.609375, "grad_norm_var": 0.018452962239583332, "learning_rate": 0.0001, "loss": 7.3395, "loss/crossentropy": 2.374780535697937, "loss/hidden": 2.890625, "loss/jsd": 0.0, "loss/logits": 0.2257033884525299, "step": 4592 }, { "epoch": 0.287125, "grad_norm": 2.40625, "grad_norm_var": 0.021761067708333335, "learning_rate": 0.0001, "loss": 7.2915, "loss/crossentropy": 2.033597767353058, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.22485361993312836, "step": 4594 }, { "epoch": 0.28725, "grad_norm": 2.171875, "grad_norm_var": 0.022591145833333333, "learning_rate": 0.0001, "loss": 7.4737, "loss/crossentropy": 2.3573769330978394, "loss/hidden": 2.9296875, "loss/jsd": 0.0, "loss/logits": 0.2300783023238182, "step": 4596 }, { "epoch": 0.287375, "grad_norm": 2.28125, "grad_norm_var": 0.0231109619140625, "learning_rate": 0.0001, "loss": 7.3957, "loss/crossentropy": 2.3102123737335205, "loss/hidden": 2.875, "loss/jsd": 0.0, "loss/logits": 0.22356858849525452, "step": 4598 }, { "epoch": 0.2875, "grad_norm": 2.140625, "grad_norm_var": 0.02281494140625, "learning_rate": 0.0001, "loss": 7.2862, "loss/crossentropy": 2.392950177192688, "loss/hidden": 2.875, "loss/jsd": 0.0, "loss/logits": 0.21436259150505066, "step": 4600 }, { "epoch": 0.287625, "grad_norm": 2.515625, "grad_norm_var": 0.0263824462890625, "learning_rate": 0.0001, "loss": 7.3518, "loss/crossentropy": 2.2781176567077637, "loss/hidden": 2.984375, "loss/jsd": 0.0, "loss/logits": 0.23186073452234268, "step": 4602 }, { "epoch": 0.28775, "grad_norm": 2.234375, "grad_norm_var": 0.0295074462890625, "learning_rate": 0.0001, "loss": 7.3298, "loss/crossentropy": 2.2567808628082275, "loss/hidden": 2.8671875, "loss/jsd": 0.0, "loss/logits": 0.2299715131521225, "step": 4604 }, { "epoch": 0.287875, "grad_norm": 2.171875, "grad_norm_var": 0.0271392822265625, "learning_rate": 0.0001, "loss": 7.3105, "loss/crossentropy": 1.9985857605934143, "loss/hidden": 2.859375, "loss/jsd": 0.0, "loss/logits": 0.21828313171863556, "step": 4606 }, { "epoch": 0.288, "grad_norm": 2.3125, "grad_norm_var": 0.022749837239583334, "learning_rate": 0.0001, "loss": 7.4778, "loss/crossentropy": 2.2161970138549805, "loss/hidden": 2.8359375, "loss/jsd": 0.0, "loss/logits": 0.2215772345662117, "step": 4608 }, { "epoch": 0.288125, "grad_norm": 2.21875, "grad_norm_var": 0.01998291015625, "learning_rate": 0.0001, "loss": 7.2087, "loss/crossentropy": 1.9949069619178772, "loss/hidden": 2.875, "loss/jsd": 0.0, "loss/logits": 0.1976795345544815, "step": 4610 }, { "epoch": 0.28825, "grad_norm": 2.296875, "grad_norm_var": 0.0176422119140625, "learning_rate": 0.0001, "loss": 7.3442, "loss/crossentropy": 2.1542601585388184, "loss/hidden": 2.796875, "loss/jsd": 0.0, "loss/logits": 0.20314577966928482, "step": 4612 }, { "epoch": 0.288375, "grad_norm": 2.359375, "grad_norm_var": 0.017822265625, "learning_rate": 0.0001, "loss": 7.4219, "loss/crossentropy": 2.1769785284996033, "loss/hidden": 2.9140625, "loss/jsd": 0.0, "loss/logits": 0.22160416841506958, "step": 4614 }, { "epoch": 0.2885, "grad_norm": 2.265625, "grad_norm_var": 0.016673787434895834, "learning_rate": 0.0001, "loss": 7.4635, "loss/crossentropy": 2.079149842262268, "loss/hidden": 2.953125, "loss/jsd": 0.0, "loss/logits": 0.21963858604431152, "step": 4616 }, { "epoch": 0.288625, "grad_norm": 2.265625, "grad_norm_var": 0.012743123372395833, "learning_rate": 0.0001, "loss": 7.4368, "loss/crossentropy": 2.1562063694000244, "loss/hidden": 2.875, "loss/jsd": 0.0, "loss/logits": 0.20194550603628159, "step": 4618 }, { "epoch": 0.28875, "grad_norm": 2.234375, "grad_norm_var": 0.0077626546223958336, "learning_rate": 0.0001, "loss": 7.3215, "loss/crossentropy": 2.285536289215088, "loss/hidden": 2.8125, "loss/jsd": 0.0, "loss/logits": 0.23264925926923752, "step": 4620 }, { "epoch": 0.288875, "grad_norm": 2.265625, "grad_norm_var": 0.010933430989583333, "learning_rate": 0.0001, "loss": 7.5954, "loss/crossentropy": 2.383559465408325, "loss/hidden": 2.921875, "loss/jsd": 0.0, "loss/logits": 0.21055196970701218, "step": 4622 }, { "epoch": 0.289, "grad_norm": 2.15625, "grad_norm_var": 0.01123046875, "learning_rate": 0.0001, "loss": 7.3199, "loss/crossentropy": 2.09514844417572, "loss/hidden": 2.8828125, "loss/jsd": 0.0, "loss/logits": 0.21392296999692917, "step": 4624 }, { "epoch": 0.289125, "grad_norm": 2.46875, "grad_norm_var": 0.012279256184895834, "learning_rate": 0.0001, "loss": 7.4886, "loss/crossentropy": 2.366590738296509, "loss/hidden": 2.9140625, "loss/jsd": 0.0, "loss/logits": 0.2233487293124199, "step": 4626 }, { "epoch": 0.28925, "grad_norm": 2.375, "grad_norm_var": 0.0121246337890625, "learning_rate": 0.0001, "loss": 7.627, "loss/crossentropy": 2.5399746894836426, "loss/hidden": 2.921875, "loss/jsd": 0.0, "loss/logits": 0.22708307206630707, "step": 4628 }, { "epoch": 0.289375, "grad_norm": 2.390625, "grad_norm_var": 0.014286295572916666, "learning_rate": 0.0001, "loss": 7.3466, "loss/crossentropy": 2.181055188179016, "loss/hidden": 2.921875, "loss/jsd": 0.0, "loss/logits": 0.21900714933872223, "step": 4630 }, { "epoch": 0.2895, "grad_norm": 2.4375, "grad_norm_var": 0.014240519205729166, "learning_rate": 0.0001, "loss": 7.3034, "loss/crossentropy": 2.297792911529541, "loss/hidden": 2.90625, "loss/jsd": 0.0, "loss/logits": 0.2245972827076912, "step": 4632 }, { "epoch": 0.289625, "grad_norm": 2.3125, "grad_norm_var": 0.012946573893229167, "learning_rate": 0.0001, "loss": 7.3323, "loss/crossentropy": 2.329147696495056, "loss/hidden": 3.0703125, "loss/jsd": 0.0, "loss/logits": 0.22863604873418808, "step": 4634 }, { "epoch": 0.28975, "grad_norm": 2.703125, "grad_norm_var": 0.23771158854166666, "learning_rate": 0.0001, "loss": 7.3751, "loss/crossentropy": 2.1996554136276245, "loss/hidden": 2.84375, "loss/jsd": 0.0, "loss/logits": 0.21396170556545258, "step": 4636 }, { "epoch": 0.289875, "grad_norm": 2.390625, "grad_norm_var": 0.23672587076822918, "learning_rate": 0.0001, "loss": 7.4831, "loss/crossentropy": 2.5707833766937256, "loss/hidden": 3.015625, "loss/jsd": 0.0, "loss/logits": 0.23703519254922867, "step": 4638 }, { "epoch": 0.29, "grad_norm": 2.5, "grad_norm_var": 0.22306315104166666, "learning_rate": 0.0001, "loss": 7.4067, "loss/crossentropy": 2.0104441046714783, "loss/hidden": 2.9375, "loss/jsd": 0.0, "loss/logits": 0.22016918659210205, "step": 4640 }, { "epoch": 0.290125, "grad_norm": 3.0, "grad_norm_var": 0.23544514973958333, "learning_rate": 0.0001, "loss": 7.3453, "loss/crossentropy": 2.236189603805542, "loss/hidden": 2.9296875, "loss/jsd": 0.0, "loss/logits": 0.22123342752456665, "step": 4642 }, { "epoch": 0.29025, "grad_norm": 1.953125, "grad_norm_var": 0.2549641927083333, "learning_rate": 0.0001, "loss": 7.1814, "loss/crossentropy": 2.1541532278060913, "loss/hidden": 2.8515625, "loss/jsd": 0.0, "loss/logits": 0.22347942739725113, "step": 4644 }, { "epoch": 0.290375, "grad_norm": 2.21875, "grad_norm_var": 0.2615559895833333, "learning_rate": 0.0001, "loss": 7.2884, "loss/crossentropy": 2.274155378341675, "loss/hidden": 2.890625, "loss/jsd": 0.0, "loss/logits": 0.23323418200016022, "step": 4646 }, { "epoch": 0.2905, "grad_norm": 2.4375, "grad_norm_var": 0.26042378743489586, "learning_rate": 0.0001, "loss": 7.4791, "loss/crossentropy": 2.08142626285553, "loss/hidden": 2.921875, "loss/jsd": 0.0, "loss/logits": 0.2270170971751213, "step": 4648 }, { "epoch": 0.290625, "grad_norm": 2.171875, "grad_norm_var": 0.2713368733723958, "learning_rate": 0.0001, "loss": 7.4897, "loss/crossentropy": 2.15802264213562, "loss/hidden": 2.90625, "loss/jsd": 0.0, "loss/logits": 0.20108021795749664, "step": 4650 }, { "epoch": 0.29075, "grad_norm": 2.5625, "grad_norm_var": 0.06603190104166666, "learning_rate": 0.0001, "loss": 7.5383, "loss/crossentropy": 2.215345621109009, "loss/hidden": 3.046875, "loss/jsd": 0.0, "loss/logits": 0.2371804639697075, "step": 4652 }, { "epoch": 0.290875, "grad_norm": 2.5, "grad_norm_var": 0.07634989420572917, "learning_rate": 0.0001, "loss": 7.274, "loss/crossentropy": 2.2180492281913757, "loss/hidden": 2.875, "loss/jsd": 0.0, "loss/logits": 0.21646256744861603, "step": 4654 }, { "epoch": 0.291, "grad_norm": 2.296875, "grad_norm_var": 0.07224019368489583, "learning_rate": 0.0001, "loss": 7.3643, "loss/crossentropy": 2.1802927255630493, "loss/hidden": 2.984375, "loss/jsd": 0.0, "loss/logits": 0.2141588106751442, "step": 4656 }, { "epoch": 0.291125, "grad_norm": 2.328125, "grad_norm_var": 0.04449462890625, "learning_rate": 0.0001, "loss": 7.649, "loss/crossentropy": 2.5158231258392334, "loss/hidden": 2.984375, "loss/jsd": 0.0, "loss/logits": 0.243204727768898, "step": 4658 }, { "epoch": 0.29125, "grad_norm": 2.328125, "grad_norm_var": 0.0333892822265625, "learning_rate": 0.0001, "loss": 7.463, "loss/crossentropy": 2.190263271331787, "loss/hidden": 2.8671875, "loss/jsd": 0.0, "loss/logits": 0.21288666874170303, "step": 4660 }, { "epoch": 0.291375, "grad_norm": 2.578125, "grad_norm_var": 0.0307281494140625, "learning_rate": 0.0001, "loss": 7.4629, "loss/crossentropy": 2.5436954498291016, "loss/hidden": 2.9453125, "loss/jsd": 0.0, "loss/logits": 0.2295963317155838, "step": 4662 }, { "epoch": 0.2915, "grad_norm": 2.390625, "grad_norm_var": 0.031201171875, "learning_rate": 0.0001, "loss": 7.5307, "loss/crossentropy": 2.388418436050415, "loss/hidden": 3.0, "loss/jsd": 0.0, "loss/logits": 0.21847142279148102, "step": 4664 }, { "epoch": 0.291625, "grad_norm": 2.3125, "grad_norm_var": 0.02662353515625, "learning_rate": 0.0001, "loss": 7.4585, "loss/crossentropy": 2.4331501722335815, "loss/hidden": 2.9609375, "loss/jsd": 0.0, "loss/logits": 0.2292798087000847, "step": 4666 }, { "epoch": 0.29175, "grad_norm": 2.265625, "grad_norm_var": 0.027144368489583334, "learning_rate": 0.0001, "loss": 7.2474, "loss/crossentropy": 2.1129366755485535, "loss/hidden": 3.0, "loss/jsd": 0.0, "loss/logits": 0.20572030544281006, "step": 4668 }, { "epoch": 0.291875, "grad_norm": 2.390625, "grad_norm_var": 0.010789998372395833, "learning_rate": 0.0001, "loss": 7.5378, "loss/crossentropy": 2.058245360851288, "loss/hidden": 2.875, "loss/jsd": 0.0, "loss/logits": 0.20938818156719208, "step": 4670 }, { "epoch": 0.292, "grad_norm": 2.4375, "grad_norm_var": 0.010204060872395834, "learning_rate": 0.0001, "loss": 7.5196, "loss/crossentropy": 2.34469473361969, "loss/hidden": 2.9296875, "loss/jsd": 0.0, "loss/logits": 0.2339082881808281, "step": 4672 }, { "epoch": 0.292125, "grad_norm": 2.46875, "grad_norm_var": 0.0115631103515625, "learning_rate": 0.0001, "loss": 7.2852, "loss/crossentropy": 2.3817514181137085, "loss/hidden": 2.875, "loss/jsd": 0.0, "loss/logits": 0.21657159179449081, "step": 4674 }, { "epoch": 0.29225, "grad_norm": 2.125, "grad_norm_var": 0.016988118489583332, "learning_rate": 0.0001, "loss": 7.3871, "loss/crossentropy": 2.382703423500061, "loss/hidden": 3.0, "loss/jsd": 0.0, "loss/logits": 0.2450731173157692, "step": 4676 }, { "epoch": 0.292375, "grad_norm": 2.25, "grad_norm_var": 0.014615885416666667, "learning_rate": 0.0001, "loss": 7.4901, "loss/crossentropy": 2.296135663986206, "loss/hidden": 3.0, "loss/jsd": 0.0, "loss/logits": 0.26525241136550903, "step": 4678 }, { "epoch": 0.2925, "grad_norm": 2.3125, "grad_norm_var": 0.012093098958333333, "learning_rate": 0.0001, "loss": 7.4537, "loss/crossentropy": 2.2322874069213867, "loss/hidden": 2.890625, "loss/jsd": 0.0, "loss/logits": 0.22614771127700806, "step": 4680 }, { "epoch": 0.292625, "grad_norm": 2.375, "grad_norm_var": 0.014232381184895834, "learning_rate": 0.0001, "loss": 7.3101, "loss/crossentropy": 2.408819079399109, "loss/hidden": 2.8515625, "loss/jsd": 0.0, "loss/logits": 0.21294642984867096, "step": 4682 }, { "epoch": 0.29275, "grad_norm": 2.328125, "grad_norm_var": 0.013434855143229167, "learning_rate": 0.0001, "loss": 7.3394, "loss/crossentropy": 2.1100034713745117, "loss/hidden": 2.9296875, "loss/jsd": 0.0, "loss/logits": 0.22328343987464905, "step": 4684 }, { "epoch": 0.292875, "grad_norm": 2.390625, "grad_norm_var": 0.0124664306640625, "learning_rate": 0.0001, "loss": 7.1731, "loss/crossentropy": 1.996330440044403, "loss/hidden": 2.84375, "loss/jsd": 0.0, "loss/logits": 0.18464084714651108, "step": 4686 }, { "epoch": 0.293, "grad_norm": 2.890625, "grad_norm_var": 0.035481770833333336, "learning_rate": 0.0001, "loss": 7.199, "loss/crossentropy": 2.1880674958229065, "loss/hidden": 2.8671875, "loss/jsd": 0.0, "loss/logits": 0.2431424781680107, "step": 4688 }, { "epoch": 0.293125, "grad_norm": 2.109375, "grad_norm_var": 0.0387847900390625, "learning_rate": 0.0001, "loss": 7.2796, "loss/crossentropy": 2.326804280281067, "loss/hidden": 2.90625, "loss/jsd": 0.0, "loss/logits": 0.21799682080745697, "step": 4690 }, { "epoch": 0.29325, "grad_norm": 2.359375, "grad_norm_var": 0.033524576822916666, "learning_rate": 0.0001, "loss": 7.5501, "loss/crossentropy": 2.309473156929016, "loss/hidden": 2.9375, "loss/jsd": 0.0, "loss/logits": 0.22959060966968536, "step": 4692 }, { "epoch": 0.293375, "grad_norm": 2.421875, "grad_norm_var": 0.03308817545572917, "learning_rate": 0.0001, "loss": 7.4887, "loss/crossentropy": 2.607773542404175, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.23161280155181885, "step": 4694 }, { "epoch": 0.2935, "grad_norm": 2.265625, "grad_norm_var": 0.034333292643229166, "learning_rate": 0.0001, "loss": 7.1605, "loss/crossentropy": 2.140324115753174, "loss/hidden": 2.8828125, "loss/jsd": 0.0, "loss/logits": 0.24282102286815643, "step": 4696 }, { "epoch": 0.293625, "grad_norm": 2.359375, "grad_norm_var": 0.03413798014322917, "learning_rate": 0.0001, "loss": 7.4359, "loss/crossentropy": 2.1101192831993103, "loss/hidden": 2.890625, "loss/jsd": 0.0, "loss/logits": 0.2048392966389656, "step": 4698 }, { "epoch": 0.29375, "grad_norm": 2.0625, "grad_norm_var": 0.040185546875, "learning_rate": 0.0001, "loss": 7.3764, "loss/crossentropy": 2.3710379600524902, "loss/hidden": 2.875, "loss/jsd": 0.0, "loss/logits": 0.21764564514160156, "step": 4700 }, { "epoch": 0.293875, "grad_norm": 2.34375, "grad_norm_var": 0.03961181640625, "learning_rate": 0.0001, "loss": 7.4356, "loss/crossentropy": 2.3584975004196167, "loss/hidden": 2.859375, "loss/jsd": 0.0, "loss/logits": 0.2174394130706787, "step": 4702 }, { "epoch": 0.294, "grad_norm": 2.296875, "grad_norm_var": 0.014957682291666666, "learning_rate": 0.0001, "loss": 7.4147, "loss/crossentropy": 2.118413209915161, "loss/hidden": 2.9921875, "loss/jsd": 0.0, "loss/logits": 0.19888906925916672, "step": 4704 }, { "epoch": 0.294125, "grad_norm": 2.359375, "grad_norm_var": 0.0112945556640625, "learning_rate": 0.0001, "loss": 7.3895, "loss/crossentropy": 2.2657341957092285, "loss/hidden": 2.9375, "loss/jsd": 0.0, "loss/logits": 0.23323117196559906, "step": 4706 }, { "epoch": 0.29425, "grad_norm": 2.28125, "grad_norm_var": 0.012328084309895833, "learning_rate": 0.0001, "loss": 7.2587, "loss/crossentropy": 2.2151424884796143, "loss/hidden": 2.8515625, "loss/jsd": 0.0, "loss/logits": 0.20538458228111267, "step": 4708 }, { "epoch": 0.294375, "grad_norm": 2.4375, "grad_norm_var": 0.010152180989583334, "learning_rate": 0.0001, "loss": 7.4708, "loss/crossentropy": 2.5433419942855835, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.26616372913122177, "step": 4710 }, { "epoch": 0.2945, "grad_norm": 2.421875, "grad_norm_var": 0.0125396728515625, "learning_rate": 0.0001, "loss": 7.2729, "loss/crossentropy": 2.154632091522217, "loss/hidden": 2.9453125, "loss/jsd": 0.0, "loss/logits": 0.2053903043270111, "step": 4712 }, { "epoch": 0.294625, "grad_norm": 2.28125, "grad_norm_var": 0.01275634765625, "learning_rate": 0.0001, "loss": 7.4455, "loss/crossentropy": 2.3647114038467407, "loss/hidden": 2.890625, "loss/jsd": 0.0, "loss/logits": 0.22039730101823807, "step": 4714 }, { "epoch": 0.29475, "grad_norm": 2.171875, "grad_norm_var": 0.0086822509765625, "learning_rate": 0.0001, "loss": 7.2138, "loss/crossentropy": 2.1929028034210205, "loss/hidden": 2.8359375, "loss/jsd": 0.0, "loss/logits": 0.21555405855178833, "step": 4716 }, { "epoch": 0.294875, "grad_norm": 2.421875, "grad_norm_var": 0.0091796875, "learning_rate": 0.0001, "loss": 7.36, "loss/crossentropy": 2.084302306175232, "loss/hidden": 2.8203125, "loss/jsd": 0.0, "loss/logits": 0.21005475521087646, "step": 4718 }, { "epoch": 0.295, "grad_norm": 2.25, "grad_norm_var": 0.009765625, "learning_rate": 0.0001, "loss": 7.3237, "loss/crossentropy": 2.346268892288208, "loss/hidden": 2.9375, "loss/jsd": 0.0, "loss/logits": 0.22263453155755997, "step": 4720 }, { "epoch": 0.295125, "grad_norm": 2.203125, "grad_norm_var": 0.015034993489583334, "learning_rate": 0.0001, "loss": 7.2444, "loss/crossentropy": 2.268470883369446, "loss/hidden": 2.9140625, "loss/jsd": 0.0, "loss/logits": 0.2138143628835678, "step": 4722 }, { "epoch": 0.29525, "grad_norm": 2.703125, "grad_norm_var": 0.0237945556640625, "learning_rate": 0.0001, "loss": 7.6037, "loss/crossentropy": 2.326021194458008, "loss/hidden": 2.875, "loss/jsd": 0.0, "loss/logits": 0.2048988789319992, "step": 4724 }, { "epoch": 0.295375, "grad_norm": 2.234375, "grad_norm_var": 0.024811808268229166, "learning_rate": 0.0001, "loss": 7.3422, "loss/crossentropy": 2.193223237991333, "loss/hidden": 2.8125, "loss/jsd": 0.0, "loss/logits": 0.2141074314713478, "step": 4726 }, { "epoch": 0.2955, "grad_norm": 2.203125, "grad_norm_var": 0.0246978759765625, "learning_rate": 0.0001, "loss": 7.4967, "loss/crossentropy": 2.308136224746704, "loss/hidden": 2.8359375, "loss/jsd": 0.0, "loss/logits": 0.21569503098726273, "step": 4728 }, { "epoch": 0.295625, "grad_norm": 2.203125, "grad_norm_var": 0.0247711181640625, "learning_rate": 0.0001, "loss": 7.3277, "loss/crossentropy": 1.9237273931503296, "loss/hidden": 2.8046875, "loss/jsd": 0.0, "loss/logits": 0.1951095387339592, "step": 4730 }, { "epoch": 0.29575, "grad_norm": 2.40625, "grad_norm_var": 0.0256988525390625, "learning_rate": 0.0001, "loss": 7.3125, "loss/crossentropy": 2.059843420982361, "loss/hidden": 2.8984375, "loss/jsd": 0.0, "loss/logits": 0.21607337892055511, "step": 4732 }, { "epoch": 0.295875, "grad_norm": 2.578125, "grad_norm_var": 0.030598958333333332, "learning_rate": 0.0001, "loss": 7.3797, "loss/crossentropy": 2.239774227142334, "loss/hidden": 2.921875, "loss/jsd": 0.0, "loss/logits": 0.24183505028486252, "step": 4734 }, { "epoch": 0.296, "grad_norm": 2.359375, "grad_norm_var": 0.030663045247395833, "learning_rate": 0.0001, "loss": 7.4556, "loss/crossentropy": 2.37227463722229, "loss/hidden": 2.9296875, "loss/jsd": 0.0, "loss/logits": 0.23292560875415802, "step": 4736 }, { "epoch": 0.296125, "grad_norm": 2.21875, "grad_norm_var": 0.0251861572265625, "learning_rate": 0.0001, "loss": 7.304, "loss/crossentropy": 2.0044411420822144, "loss/hidden": 2.8125, "loss/jsd": 0.0, "loss/logits": 0.19409935921430588, "step": 4738 }, { "epoch": 0.29625, "grad_norm": 2.234375, "grad_norm_var": 0.018094889322916665, "learning_rate": 0.0001, "loss": 7.3915, "loss/crossentropy": 2.3123366832733154, "loss/hidden": 2.859375, "loss/jsd": 0.0, "loss/logits": 0.2075807899236679, "step": 4740 }, { "epoch": 0.296375, "grad_norm": 2.3125, "grad_norm_var": 0.0167388916015625, "learning_rate": 0.0001, "loss": 7.3113, "loss/crossentropy": 1.761619508266449, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.22059950977563858, "step": 4742 }, { "epoch": 0.2965, "grad_norm": 2.5, "grad_norm_var": 0.016584269205729165, "learning_rate": 0.0001, "loss": 7.5001, "loss/crossentropy": 2.106764078140259, "loss/hidden": 2.8984375, "loss/jsd": 0.0, "loss/logits": 0.22241321206092834, "step": 4744 }, { "epoch": 0.296625, "grad_norm": 2.28125, "grad_norm_var": 0.0156646728515625, "learning_rate": 0.0001, "loss": 7.4221, "loss/crossentropy": 2.1658183336257935, "loss/hidden": 2.8515625, "loss/jsd": 0.0, "loss/logits": 0.2201010212302208, "step": 4746 }, { "epoch": 0.29675, "grad_norm": 2.21875, "grad_norm_var": 0.014216105143229166, "learning_rate": 0.0001, "loss": 7.2877, "loss/crossentropy": 2.288554072380066, "loss/hidden": 2.96875, "loss/jsd": 0.0, "loss/logits": 0.21139442175626755, "step": 4748 }, { "epoch": 0.296875, "grad_norm": 2.1875, "grad_norm_var": 0.012132771809895833, "learning_rate": 0.0001, "loss": 7.291, "loss/crossentropy": 2.209873080253601, "loss/hidden": 3.046875, "loss/jsd": 0.0, "loss/logits": 0.23227518796920776, "step": 4750 }, { "epoch": 0.297, "grad_norm": 2.21875, "grad_norm_var": 0.011668904622395834, "learning_rate": 0.0001, "loss": 7.3108, "loss/crossentropy": 2.2278741598129272, "loss/hidden": 2.8984375, "loss/jsd": 0.0, "loss/logits": 0.213672935962677, "step": 4752 }, { "epoch": 0.297125, "grad_norm": 2.546875, "grad_norm_var": 0.014351399739583333, "learning_rate": 0.0001, "loss": 7.4632, "loss/crossentropy": 2.1707485914230347, "loss/hidden": 2.859375, "loss/jsd": 0.0, "loss/logits": 0.21832922846078873, "step": 4754 }, { "epoch": 0.29725, "grad_norm": 2.21875, "grad_norm_var": 0.013277180989583333, "learning_rate": 0.0001, "loss": 7.3793, "loss/crossentropy": 2.068936765193939, "loss/hidden": 2.96875, "loss/jsd": 0.0, "loss/logits": 0.21937869489192963, "step": 4756 }, { "epoch": 0.297375, "grad_norm": 2.5625, "grad_norm_var": 0.0177642822265625, "learning_rate": 0.0001, "loss": 7.4439, "loss/crossentropy": 2.4402244091033936, "loss/hidden": 2.9296875, "loss/jsd": 0.0, "loss/logits": 0.2264351323246956, "step": 4758 }, { "epoch": 0.2975, "grad_norm": 2.53125, "grad_norm_var": 0.029377237955729166, "learning_rate": 0.0001, "loss": 7.4963, "loss/crossentropy": 2.351226568222046, "loss/hidden": 2.9296875, "loss/jsd": 0.0, "loss/logits": 0.23098966479301453, "step": 4760 }, { "epoch": 0.297625, "grad_norm": 2.140625, "grad_norm_var": 0.03259989420572917, "learning_rate": 0.0001, "loss": 7.392, "loss/crossentropy": 1.9955863952636719, "loss/hidden": 2.8671875, "loss/jsd": 0.0, "loss/logits": 0.21300213038921356, "step": 4762 }, { "epoch": 0.29775, "grad_norm": 2.390625, "grad_norm_var": 0.03186848958333333, "learning_rate": 0.0001, "loss": 7.5055, "loss/crossentropy": 2.4948599338531494, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.2379990816116333, "step": 4764 }, { "epoch": 0.297875, "grad_norm": 2.359375, "grad_norm_var": 0.0257720947265625, "learning_rate": 0.0001, "loss": 7.3315, "loss/crossentropy": 1.780558168888092, "loss/hidden": 2.8046875, "loss/jsd": 0.0, "loss/logits": 0.19055156409740448, "step": 4766 }, { "epoch": 0.298, "grad_norm": 2.359375, "grad_norm_var": 0.0232086181640625, "learning_rate": 0.0001, "loss": 7.4719, "loss/crossentropy": 2.25586998462677, "loss/hidden": 2.8359375, "loss/jsd": 0.0, "loss/logits": 0.2168411985039711, "step": 4768 }, { "epoch": 0.298125, "grad_norm": 2.296875, "grad_norm_var": 0.021703084309895832, "learning_rate": 0.0001, "loss": 7.3451, "loss/crossentropy": 2.1937352418899536, "loss/hidden": 2.96875, "loss/jsd": 0.0, "loss/logits": 0.21270034462213516, "step": 4770 }, { "epoch": 0.29825, "grad_norm": 2.296875, "grad_norm_var": 0.019123331705729166, "learning_rate": 0.0001, "loss": 7.4653, "loss/crossentropy": 2.2712541818618774, "loss/hidden": 2.890625, "loss/jsd": 0.0, "loss/logits": 0.23585833609104156, "step": 4772 }, { "epoch": 0.298375, "grad_norm": 2.453125, "grad_norm_var": 0.0224609375, "learning_rate": 0.0001, "loss": 7.392, "loss/crossentropy": 2.311362624168396, "loss/hidden": 2.8515625, "loss/jsd": 0.0, "loss/logits": 0.21948038041591644, "step": 4774 }, { "epoch": 0.2985, "grad_norm": 2.296875, "grad_norm_var": 0.010936482747395834, "learning_rate": 0.0001, "loss": 7.5252, "loss/crossentropy": 2.5121726989746094, "loss/hidden": 3.015625, "loss/jsd": 0.0, "loss/logits": 0.23548902571201324, "step": 4776 }, { "epoch": 0.298625, "grad_norm": 2.28125, "grad_norm_var": 0.010725911458333333, "learning_rate": 0.0001, "loss": 7.4346, "loss/crossentropy": 2.352226734161377, "loss/hidden": 2.921875, "loss/jsd": 0.0, "loss/logits": 0.2144220471382141, "step": 4778 }, { "epoch": 0.29875, "grad_norm": 2.34375, "grad_norm_var": 0.0104156494140625, "learning_rate": 0.0001, "loss": 7.3003, "loss/crossentropy": 1.966825008392334, "loss/hidden": 2.8359375, "loss/jsd": 0.0, "loss/logits": 0.20474375039339066, "step": 4780 }, { "epoch": 0.298875, "grad_norm": 2.171875, "grad_norm_var": 0.01158447265625, "learning_rate": 0.0001, "loss": 7.2383, "loss/crossentropy": 2.2786693572998047, "loss/hidden": 2.828125, "loss/jsd": 0.0, "loss/logits": 0.22437963634729385, "step": 4782 }, { "epoch": 0.299, "grad_norm": 2.125, "grad_norm_var": 0.012821451822916666, "learning_rate": 0.0001, "loss": 7.314, "loss/crossentropy": 2.1766642332077026, "loss/hidden": 3.015625, "loss/jsd": 0.0, "loss/logits": 0.21804757416248322, "step": 4784 }, { "epoch": 0.299125, "grad_norm": 2.203125, "grad_norm_var": 0.0131988525390625, "learning_rate": 0.0001, "loss": 7.3039, "loss/crossentropy": 2.009239912033081, "loss/hidden": 2.953125, "loss/jsd": 0.0, "loss/logits": 0.19700751453638077, "step": 4786 }, { "epoch": 0.29925, "grad_norm": 3.109375, "grad_norm_var": 0.0577545166015625, "learning_rate": 0.0001, "loss": 7.3357, "loss/crossentropy": 2.127845048904419, "loss/hidden": 2.859375, "loss/jsd": 0.0, "loss/logits": 0.21124648302793503, "step": 4788 }, { "epoch": 0.299375, "grad_norm": 2.265625, "grad_norm_var": 0.0581695556640625, "learning_rate": 0.0001, "loss": 7.3315, "loss/crossentropy": 2.303865075111389, "loss/hidden": 2.953125, "loss/jsd": 0.0, "loss/logits": 0.22746270895004272, "step": 4790 }, { "epoch": 0.2995, "grad_norm": 2.375, "grad_norm_var": 0.058984375, "learning_rate": 0.0001, "loss": 7.3872, "loss/crossentropy": 2.15032958984375, "loss/hidden": 2.9453125, "loss/jsd": 0.0, "loss/logits": 0.21064457297325134, "step": 4792 }, { "epoch": 0.299625, "grad_norm": 2.5625, "grad_norm_var": 0.059544881184895836, "learning_rate": 0.0001, "loss": 7.4218, "loss/crossentropy": 1.9902217388153076, "loss/hidden": 3.0, "loss/jsd": 0.0, "loss/logits": 0.2275134027004242, "step": 4794 }, { "epoch": 0.29975, "grad_norm": 2.296875, "grad_norm_var": 0.061421712239583336, "learning_rate": 0.0001, "loss": 7.3493, "loss/crossentropy": 2.365830898284912, "loss/hidden": 2.984375, "loss/jsd": 0.0, "loss/logits": 0.22473762929439545, "step": 4796 }, { "epoch": 0.299875, "grad_norm": 2.25, "grad_norm_var": 0.061253865559895836, "learning_rate": 0.0001, "loss": 7.2287, "loss/crossentropy": 2.406466007232666, "loss/hidden": 2.90625, "loss/jsd": 0.0, "loss/logits": 0.2222784459590912, "step": 4798 }, { "epoch": 0.3, "grad_norm": 2.0625, "grad_norm_var": 0.06142578125, "learning_rate": 0.0001, "loss": 7.4167, "loss/crossentropy": 2.2407450675964355, "loss/hidden": 2.84375, "loss/jsd": 0.0, "loss/logits": 0.2090248540043831, "step": 4800 }, { "epoch": 0.300125, "grad_norm": 2.46875, "grad_norm_var": 0.05963134765625, "learning_rate": 0.0001, "loss": 7.3583, "loss/crossentropy": 2.3130555152893066, "loss/hidden": 2.875, "loss/jsd": 0.0, "loss/logits": 0.21969569474458694, "step": 4802 }, { "epoch": 0.30025, "grad_norm": 2.125, "grad_norm_var": 0.0265625, "learning_rate": 0.0001, "loss": 7.2996, "loss/crossentropy": 2.301561713218689, "loss/hidden": 2.859375, "loss/jsd": 0.0, "loss/logits": 0.21655967831611633, "step": 4804 }, { "epoch": 0.300375, "grad_norm": 2.421875, "grad_norm_var": 0.02265625, "learning_rate": 0.0001, "loss": 7.3977, "loss/crossentropy": 2.4715986251831055, "loss/hidden": 2.828125, "loss/jsd": 0.0, "loss/logits": 0.22068122029304504, "step": 4806 }, { "epoch": 0.3005, "grad_norm": 2.578125, "grad_norm_var": 0.0252838134765625, "learning_rate": 0.0001, "loss": 7.3491, "loss/crossentropy": 2.0583669543266296, "loss/hidden": 2.8359375, "loss/jsd": 0.0, "loss/logits": 0.2092050388455391, "step": 4808 }, { "epoch": 0.300625, "grad_norm": 2.078125, "grad_norm_var": 0.026447550455729166, "learning_rate": 0.0001, "loss": 7.4016, "loss/crossentropy": 2.222812056541443, "loss/hidden": 2.921875, "loss/jsd": 0.0, "loss/logits": 0.20580071210861206, "step": 4810 }, { "epoch": 0.30075, "grad_norm": 2.515625, "grad_norm_var": 0.027692667643229165, "learning_rate": 0.0001, "loss": 7.3078, "loss/crossentropy": 2.467105746269226, "loss/hidden": 2.859375, "loss/jsd": 0.0, "loss/logits": 0.21557529270648956, "step": 4812 }, { "epoch": 0.300875, "grad_norm": 2.25, "grad_norm_var": 0.029719034830729168, "learning_rate": 0.0001, "loss": 7.4463, "loss/crossentropy": 2.0972548127174377, "loss/hidden": 2.953125, "loss/jsd": 0.0, "loss/logits": 0.22322270274162292, "step": 4814 }, { "epoch": 0.301, "grad_norm": 2.125, "grad_norm_var": 0.0301910400390625, "learning_rate": 0.0001, "loss": 7.3009, "loss/crossentropy": 2.113277792930603, "loss/hidden": 2.828125, "loss/jsd": 0.0, "loss/logits": 0.21659619361162186, "step": 4816 }, { "epoch": 0.301125, "grad_norm": 2.296875, "grad_norm_var": 0.02880859375, "learning_rate": 0.0001, "loss": 7.3721, "loss/crossentropy": 2.483347177505493, "loss/hidden": 2.890625, "loss/jsd": 0.0, "loss/logits": 0.2171446457505226, "step": 4818 }, { "epoch": 0.30125, "grad_norm": 2.265625, "grad_norm_var": 0.026927693684895834, "learning_rate": 0.0001, "loss": 7.3889, "loss/crossentropy": 2.417271375656128, "loss/hidden": 2.9140625, "loss/jsd": 0.0, "loss/logits": 0.22507987171411514, "step": 4820 }, { "epoch": 0.301375, "grad_norm": 2.375, "grad_norm_var": 0.02681884765625, "learning_rate": 0.0001, "loss": 7.4797, "loss/crossentropy": 2.3490647077560425, "loss/hidden": 2.921875, "loss/jsd": 0.0, "loss/logits": 0.2174646332859993, "step": 4822 }, { "epoch": 0.3015, "grad_norm": 2.359375, "grad_norm_var": 0.020164998372395833, "learning_rate": 0.0001, "loss": 7.5363, "loss/crossentropy": 2.400337815284729, "loss/hidden": 2.9609375, "loss/jsd": 0.0, "loss/logits": 0.2227081060409546, "step": 4824 }, { "epoch": 0.301625, "grad_norm": 3.0, "grad_norm_var": 0.04592692057291667, "learning_rate": 0.0001, "loss": 7.4569, "loss/crossentropy": 2.2321704030036926, "loss/hidden": 2.9375, "loss/jsd": 0.0, "loss/logits": 0.24002033472061157, "step": 4826 }, { "epoch": 0.30175, "grad_norm": 2.359375, "grad_norm_var": 0.04690348307291667, "learning_rate": 0.0001, "loss": 7.5359, "loss/crossentropy": 2.3871182203292847, "loss/hidden": 2.859375, "loss/jsd": 0.0, "loss/logits": 0.20659293234348297, "step": 4828 }, { "epoch": 0.301875, "grad_norm": 3.28125, "grad_norm_var": 0.09729410807291666, "learning_rate": 0.0001, "loss": 7.4514, "loss/crossentropy": 2.4146281480789185, "loss/hidden": 2.9296875, "loss/jsd": 0.0, "loss/logits": 0.22511692345142365, "step": 4830 }, { "epoch": 0.302, "grad_norm": 2.765625, "grad_norm_var": 0.09411519368489583, "learning_rate": 0.0001, "loss": 7.3884, "loss/crossentropy": 2.3637466430664062, "loss/hidden": 2.84375, "loss/jsd": 0.0, "loss/logits": 0.20597800612449646, "step": 4832 }, { "epoch": 0.302125, "grad_norm": 2.0625, "grad_norm_var": 0.10143229166666666, "learning_rate": 0.0001, "loss": 7.4427, "loss/crossentropy": 2.363954782485962, "loss/hidden": 2.796875, "loss/jsd": 0.0, "loss/logits": 0.21014075726270676, "step": 4834 }, { "epoch": 0.30225, "grad_norm": 2.390625, "grad_norm_var": 0.0969146728515625, "learning_rate": 0.0001, "loss": 7.2434, "loss/crossentropy": 2.245696544647217, "loss/hidden": 2.8671875, "loss/jsd": 0.0, "loss/logits": 0.20109941810369492, "step": 4836 }, { "epoch": 0.302375, "grad_norm": 2.421875, "grad_norm_var": 0.09761962890625, "learning_rate": 0.0001, "loss": 7.3634, "loss/crossentropy": 2.1231839656829834, "loss/hidden": 2.8828125, "loss/jsd": 0.0, "loss/logits": 0.2174079716205597, "step": 4838 }, { "epoch": 0.3025, "grad_norm": 2.15625, "grad_norm_var": 0.11103108723958334, "learning_rate": 0.0001, "loss": 7.2504, "loss/crossentropy": 2.1823160648345947, "loss/hidden": 2.953125, "loss/jsd": 0.0, "loss/logits": 0.21924307942390442, "step": 4840 }, { "epoch": 0.302625, "grad_norm": 2.40625, "grad_norm_var": 0.08931884765625, "learning_rate": 0.0001, "loss": 7.4531, "loss/crossentropy": 2.3826904296875, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.21867043524980545, "step": 4842 }, { "epoch": 0.30275, "grad_norm": 2.4375, "grad_norm_var": 0.08771158854166666, "learning_rate": 0.0001, "loss": 7.3825, "loss/crossentropy": 2.2450802326202393, "loss/hidden": 3.0, "loss/jsd": 0.0, "loss/logits": 0.2196601778268814, "step": 4844 }, { "epoch": 0.302875, "grad_norm": 2.65625, "grad_norm_var": 0.03821207682291667, "learning_rate": 0.0001, "loss": 7.5146, "loss/crossentropy": 2.1403591632843018, "loss/hidden": 3.015625, "loss/jsd": 0.0, "loss/logits": 0.2133881226181984, "step": 4846 }, { "epoch": 0.303, "grad_norm": 2.25, "grad_norm_var": 0.06558329264322917, "learning_rate": 0.0001, "loss": 7.435, "loss/crossentropy": 2.14195853471756, "loss/hidden": 2.9609375, "loss/jsd": 0.0, "loss/logits": 0.19571785628795624, "step": 4848 }, { "epoch": 0.303125, "grad_norm": 2.5625, "grad_norm_var": 0.060221354166666664, "learning_rate": 0.0001, "loss": 7.4894, "loss/crossentropy": 2.2988877296447754, "loss/hidden": 3.046875, "loss/jsd": 0.0, "loss/logits": 0.22777371853590012, "step": 4850 }, { "epoch": 0.30325, "grad_norm": 2.203125, "grad_norm_var": 0.06297098795572917, "learning_rate": 0.0001, "loss": 7.3589, "loss/crossentropy": 2.390594482421875, "loss/hidden": 2.953125, "loss/jsd": 0.0, "loss/logits": 0.2120780646800995, "step": 4852 }, { "epoch": 0.303375, "grad_norm": 2.328125, "grad_norm_var": 0.06382548014322917, "learning_rate": 0.0001, "loss": 7.4284, "loss/crossentropy": 2.379095196723938, "loss/hidden": 2.8203125, "loss/jsd": 0.0, "loss/logits": 0.20808186382055283, "step": 4854 }, { "epoch": 0.3035, "grad_norm": 2.375, "grad_norm_var": 0.0565093994140625, "learning_rate": 0.0001, "loss": 7.4303, "loss/crossentropy": 2.398724317550659, "loss/hidden": 2.859375, "loss/jsd": 0.0, "loss/logits": 0.22028641402721405, "step": 4856 }, { "epoch": 0.303625, "grad_norm": 2.546875, "grad_norm_var": 0.0544097900390625, "learning_rate": 0.0001, "loss": 7.5402, "loss/crossentropy": 2.4652938842773438, "loss/hidden": 2.9296875, "loss/jsd": 0.0, "loss/logits": 0.23258233815431595, "step": 4858 }, { "epoch": 0.30375, "grad_norm": 2.375, "grad_norm_var": 0.0521148681640625, "learning_rate": 0.0001, "loss": 7.5154, "loss/crossentropy": 2.15559184551239, "loss/hidden": 2.8828125, "loss/jsd": 0.0, "loss/logits": 0.2242182418704033, "step": 4860 }, { "epoch": 0.303875, "grad_norm": 2.328125, "grad_norm_var": 0.04914449055989583, "learning_rate": 0.0001, "loss": 7.516, "loss/crossentropy": 2.3568320274353027, "loss/hidden": 2.9140625, "loss/jsd": 0.0, "loss/logits": 0.22093456983566284, "step": 4862 }, { "epoch": 0.304, "grad_norm": 2.359375, "grad_norm_var": 0.018382771809895834, "learning_rate": 0.0001, "loss": 7.303, "loss/crossentropy": 2.130435824394226, "loss/hidden": 2.859375, "loss/jsd": 0.0, "loss/logits": 0.20797330886125565, "step": 4864 }, { "epoch": 0.304125, "grad_norm": 2.46875, "grad_norm_var": 0.017659505208333332, "learning_rate": 0.0001, "loss": 7.5087, "loss/crossentropy": 2.1923974752426147, "loss/hidden": 2.859375, "loss/jsd": 0.0, "loss/logits": 0.2099837362766266, "step": 4866 }, { "epoch": 0.30425, "grad_norm": 2.203125, "grad_norm_var": 0.018993123372395834, "learning_rate": 0.0001, "loss": 7.5138, "loss/crossentropy": 2.3074164986610413, "loss/hidden": 2.875, "loss/jsd": 0.0, "loss/logits": 0.21502834558486938, "step": 4868 }, { "epoch": 0.304375, "grad_norm": 2.609375, "grad_norm_var": 0.023274739583333332, "learning_rate": 0.0001, "loss": 7.6134, "loss/crossentropy": 2.399793028831482, "loss/hidden": 2.9140625, "loss/jsd": 0.0, "loss/logits": 0.2250315099954605, "step": 4870 }, { "epoch": 0.3045, "grad_norm": 2.203125, "grad_norm_var": 0.021906534830729168, "learning_rate": 0.0001, "loss": 7.4322, "loss/crossentropy": 2.5476996898651123, "loss/hidden": 2.90625, "loss/jsd": 0.0, "loss/logits": 0.21240823715925217, "step": 4872 }, { "epoch": 0.304625, "grad_norm": 2.1875, "grad_norm_var": 0.022847493489583332, "learning_rate": 0.0001, "loss": 7.3043, "loss/crossentropy": 2.406996250152588, "loss/hidden": 2.8828125, "loss/jsd": 0.0, "loss/logits": 0.21996094286441803, "step": 4874 }, { "epoch": 0.30475, "grad_norm": 2.140625, "grad_norm_var": 0.02496337890625, "learning_rate": 0.0001, "loss": 7.2396, "loss/crossentropy": 2.1408446431159973, "loss/hidden": 2.8671875, "loss/jsd": 0.0, "loss/logits": 0.212337464094162, "step": 4876 }, { "epoch": 0.304875, "grad_norm": 2.3125, "grad_norm_var": 0.022932942708333334, "learning_rate": 0.0001, "loss": 7.3172, "loss/crossentropy": 2.0700973868370056, "loss/hidden": 2.8671875, "loss/jsd": 0.0, "loss/logits": 0.21268072724342346, "step": 4878 }, { "epoch": 0.305, "grad_norm": 2.078125, "grad_norm_var": 0.023412068684895832, "learning_rate": 0.0001, "loss": 7.3727, "loss/crossentropy": 2.317967474460602, "loss/hidden": 2.8203125, "loss/jsd": 0.0, "loss/logits": 0.22330276668071747, "step": 4880 }, { "epoch": 0.305125, "grad_norm": 2.4375, "grad_norm_var": 0.020633951822916666, "learning_rate": 0.0001, "loss": 7.4016, "loss/crossentropy": 2.3994847536087036, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.2378486543893814, "step": 4882 }, { "epoch": 0.30525, "grad_norm": 2.046875, "grad_norm_var": 0.028446451822916666, "learning_rate": 0.0001, "loss": 7.3097, "loss/crossentropy": 2.103451728820801, "loss/hidden": 2.8046875, "loss/jsd": 0.0, "loss/logits": 0.20587532967329025, "step": 4884 }, { "epoch": 0.305375, "grad_norm": 2.375, "grad_norm_var": 0.019172159830729167, "learning_rate": 0.0001, "loss": 7.2349, "loss/crossentropy": 2.2608002424240112, "loss/hidden": 2.8984375, "loss/jsd": 0.0, "loss/logits": 0.21245233714580536, "step": 4886 }, { "epoch": 0.3055, "grad_norm": 2.25, "grad_norm_var": 0.018586222330729166, "learning_rate": 0.0001, "loss": 7.4389, "loss/crossentropy": 2.401768207550049, "loss/hidden": 2.859375, "loss/jsd": 0.0, "loss/logits": 0.22278450429439545, "step": 4888 }, { "epoch": 0.305625, "grad_norm": 2.296875, "grad_norm_var": 0.018798828125, "learning_rate": 0.0001, "loss": 7.3505, "loss/crossentropy": 2.100538969039917, "loss/hidden": 2.9453125, "loss/jsd": 0.0, "loss/logits": 0.23200103640556335, "step": 4890 }, { "epoch": 0.30575, "grad_norm": 2.40625, "grad_norm_var": 0.03186442057291667, "learning_rate": 0.0001, "loss": 7.2995, "loss/crossentropy": 2.3301326036453247, "loss/hidden": 3.0625, "loss/jsd": 0.0, "loss/logits": 0.24110360443592072, "step": 4892 }, { "epoch": 0.305875, "grad_norm": 2.3125, "grad_norm_var": 0.03230692545572917, "learning_rate": 0.0001, "loss": 7.3208, "loss/crossentropy": 2.4211736917495728, "loss/hidden": 2.984375, "loss/jsd": 0.0, "loss/logits": 0.22261983156204224, "step": 4894 }, { "epoch": 0.306, "grad_norm": 2.125, "grad_norm_var": 0.030964152018229166, "learning_rate": 0.0001, "loss": 7.2336, "loss/crossentropy": 2.1695640087127686, "loss/hidden": 2.875, "loss/jsd": 0.0, "loss/logits": 0.20674562454223633, "step": 4896 }, { "epoch": 0.306125, "grad_norm": 2.09375, "grad_norm_var": 0.031769816080729166, "learning_rate": 0.0001, "loss": 7.2113, "loss/crossentropy": 2.0899396538734436, "loss/hidden": 2.84375, "loss/jsd": 0.0, "loss/logits": 0.20201702415943146, "step": 4898 }, { "epoch": 0.30625, "grad_norm": 2.25, "grad_norm_var": 0.025055948893229166, "learning_rate": 0.0001, "loss": 7.1833, "loss/crossentropy": 2.3102033138275146, "loss/hidden": 2.8828125, "loss/jsd": 0.0, "loss/logits": 0.22870182991027832, "step": 4900 }, { "epoch": 0.306375, "grad_norm": 2.265625, "grad_norm_var": 0.024462890625, "learning_rate": 0.0001, "loss": 7.429, "loss/crossentropy": 2.093406856060028, "loss/hidden": 2.8125, "loss/jsd": 0.0, "loss/logits": 0.21099549531936646, "step": 4902 }, { "epoch": 0.3065, "grad_norm": 2.234375, "grad_norm_var": 0.024054972330729167, "learning_rate": 0.0001, "loss": 7.2861, "loss/crossentropy": 2.1686906814575195, "loss/hidden": 2.8203125, "loss/jsd": 0.0, "loss/logits": 0.20351973176002502, "step": 4904 }, { "epoch": 0.306625, "grad_norm": 2.390625, "grad_norm_var": 0.024247233072916666, "learning_rate": 0.0001, "loss": 7.4273, "loss/crossentropy": 2.0178955793380737, "loss/hidden": 2.890625, "loss/jsd": 0.0, "loss/logits": 0.2049720138311386, "step": 4906 }, { "epoch": 0.30675, "grad_norm": 2.15625, "grad_norm_var": 0.00933837890625, "learning_rate": 0.0001, "loss": 7.3421, "loss/crossentropy": 2.164846181869507, "loss/hidden": 2.9375, "loss/jsd": 0.0, "loss/logits": 0.20680507272481918, "step": 4908 }, { "epoch": 0.306875, "grad_norm": 2.28125, "grad_norm_var": 0.009398396809895833, "learning_rate": 0.0001, "loss": 7.3451, "loss/crossentropy": 2.324475646018982, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.22731740027666092, "step": 4910 }, { "epoch": 0.307, "grad_norm": 2.34375, "grad_norm_var": 0.009642537434895833, "learning_rate": 0.0001, "loss": 7.3873, "loss/crossentropy": 2.7607351541519165, "loss/hidden": 3.0, "loss/jsd": 0.0, "loss/logits": 0.239481620490551, "step": 4912 }, { "epoch": 0.307125, "grad_norm": 2.15625, "grad_norm_var": 0.0101470947265625, "learning_rate": 0.0001, "loss": 7.3776, "loss/crossentropy": 2.286571979522705, "loss/hidden": 2.8984375, "loss/jsd": 0.0, "loss/logits": 0.2324925810098648, "step": 4914 }, { "epoch": 0.30725, "grad_norm": 2.203125, "grad_norm_var": 0.009251912434895834, "learning_rate": 0.0001, "loss": 7.4509, "loss/crossentropy": 2.0468556880950928, "loss/hidden": 2.9140625, "loss/jsd": 0.0, "loss/logits": 0.220846489071846, "step": 4916 }, { "epoch": 0.307375, "grad_norm": 2.34375, "grad_norm_var": 0.017464192708333333, "learning_rate": 0.0001, "loss": 7.4308, "loss/crossentropy": 2.3432207107543945, "loss/hidden": 2.890625, "loss/jsd": 0.0, "loss/logits": 0.21841663122177124, "step": 4918 }, { "epoch": 0.3075, "grad_norm": 2.4375, "grad_norm_var": 0.0173248291015625, "learning_rate": 0.0001, "loss": 7.3261, "loss/crossentropy": 1.9974586367607117, "loss/hidden": 2.90625, "loss/jsd": 0.0, "loss/logits": 0.20223890990018845, "step": 4920 }, { "epoch": 0.307625, "grad_norm": 2.265625, "grad_norm_var": 0.017601521809895833, "learning_rate": 0.0001, "loss": 7.3917, "loss/crossentropy": 2.1487491130828857, "loss/hidden": 2.84375, "loss/jsd": 0.0, "loss/logits": 0.20649082213640213, "step": 4922 }, { "epoch": 0.30775, "grad_norm": 2.265625, "grad_norm_var": 0.016162109375, "learning_rate": 0.0001, "loss": 7.3709, "loss/crossentropy": 2.0778582096099854, "loss/hidden": 2.90625, "loss/jsd": 0.0, "loss/logits": 0.2082173004746437, "step": 4924 }, { "epoch": 0.307875, "grad_norm": 2.140625, "grad_norm_var": 0.0177642822265625, "learning_rate": 0.0001, "loss": 7.4308, "loss/crossentropy": 2.4916462898254395, "loss/hidden": 3.0, "loss/jsd": 0.0, "loss/logits": 0.23393811285495758, "step": 4926 }, { "epoch": 0.308, "grad_norm": 2.390625, "grad_norm_var": 0.016893513997395835, "learning_rate": 0.0001, "loss": 7.2895, "loss/crossentropy": 2.1852511167526245, "loss/hidden": 2.84375, "loss/jsd": 0.0, "loss/logits": 0.1971207559108734, "step": 4928 }, { "epoch": 0.308125, "grad_norm": 2.40625, "grad_norm_var": 0.014404296875, "learning_rate": 0.0001, "loss": 7.4778, "loss/crossentropy": 2.4292192459106445, "loss/hidden": 2.859375, "loss/jsd": 0.0, "loss/logits": 0.20822913944721222, "step": 4930 }, { "epoch": 0.30825, "grad_norm": 2.09375, "grad_norm_var": 0.017508951822916667, "learning_rate": 0.0001, "loss": 7.3199, "loss/crossentropy": 2.162381172180176, "loss/hidden": 2.9296875, "loss/jsd": 0.0, "loss/logits": 0.21929100155830383, "step": 4932 }, { "epoch": 0.308375, "grad_norm": 2.265625, "grad_norm_var": 0.00947265625, "learning_rate": 0.0001, "loss": 7.325, "loss/crossentropy": 2.1474106311798096, "loss/hidden": 2.8046875, "loss/jsd": 0.0, "loss/logits": 0.2182038575410843, "step": 4934 }, { "epoch": 0.3085, "grad_norm": 2.109375, "grad_norm_var": 0.009033203125, "learning_rate": 0.0001, "loss": 7.4231, "loss/crossentropy": 2.2566583156585693, "loss/hidden": 2.8515625, "loss/jsd": 0.0, "loss/logits": 0.20229701697826385, "step": 4936 }, { "epoch": 0.308625, "grad_norm": 2.234375, "grad_norm_var": 0.0088775634765625, "learning_rate": 0.0001, "loss": 7.4338, "loss/crossentropy": 2.361881971359253, "loss/hidden": 2.8984375, "loss/jsd": 0.0, "loss/logits": 0.22635971009731293, "step": 4938 }, { "epoch": 0.30875, "grad_norm": 2.515625, "grad_norm_var": 0.014403279622395833, "learning_rate": 0.0001, "loss": 7.4386, "loss/crossentropy": 2.214319109916687, "loss/hidden": 2.8984375, "loss/jsd": 0.0, "loss/logits": 0.23132434487342834, "step": 4940 }, { "epoch": 0.308875, "grad_norm": 2.5, "grad_norm_var": 0.016576131184895832, "learning_rate": 0.0001, "loss": 7.3068, "loss/crossentropy": 2.063450336456299, "loss/hidden": 2.8515625, "loss/jsd": 0.0, "loss/logits": 0.20887789130210876, "step": 4942 }, { "epoch": 0.309, "grad_norm": 2.0625, "grad_norm_var": 0.018062337239583334, "learning_rate": 0.0001, "loss": 7.3097, "loss/crossentropy": 2.1654560565948486, "loss/hidden": 2.8671875, "loss/jsd": 0.0, "loss/logits": 0.20234611630439758, "step": 4944 }, { "epoch": 0.309125, "grad_norm": 2.234375, "grad_norm_var": 0.0166168212890625, "learning_rate": 0.0001, "loss": 7.3565, "loss/crossentropy": 2.086349129676819, "loss/hidden": 2.8046875, "loss/jsd": 0.0, "loss/logits": 0.1944892778992653, "step": 4946 }, { "epoch": 0.30925, "grad_norm": 2.359375, "grad_norm_var": 0.015819295247395834, "learning_rate": 0.0001, "loss": 7.4902, "loss/crossentropy": 2.4319673776626587, "loss/hidden": 2.984375, "loss/jsd": 0.0, "loss/logits": 0.2345517948269844, "step": 4948 }, { "epoch": 0.309375, "grad_norm": 2.25, "grad_norm_var": 0.015869140625, "learning_rate": 0.0001, "loss": 7.3198, "loss/crossentropy": 2.2893803119659424, "loss/hidden": 2.8359375, "loss/jsd": 0.0, "loss/logits": 0.21564996242523193, "step": 4950 }, { "epoch": 0.3095, "grad_norm": 2.15625, "grad_norm_var": 0.0133697509765625, "learning_rate": 0.0001, "loss": 7.3128, "loss/crossentropy": 2.2920368909835815, "loss/hidden": 2.9609375, "loss/jsd": 0.0, "loss/logits": 0.227824367582798, "step": 4952 }, { "epoch": 0.309625, "grad_norm": 2.234375, "grad_norm_var": 0.0130035400390625, "learning_rate": 0.0001, "loss": 7.2905, "loss/crossentropy": 2.1902358531951904, "loss/hidden": 2.890625, "loss/jsd": 0.0, "loss/logits": 0.22382274270057678, "step": 4954 }, { "epoch": 0.30975, "grad_norm": 2.25, "grad_norm_var": 0.008885701497395834, "learning_rate": 0.0001, "loss": 7.1857, "loss/crossentropy": 2.1179298162460327, "loss/hidden": 2.890625, "loss/jsd": 0.0, "loss/logits": 0.19988138228654861, "step": 4956 }, { "epoch": 0.309875, "grad_norm": 2.265625, "grad_norm_var": 0.0048980712890625, "learning_rate": 0.0001, "loss": 7.3741, "loss/crossentropy": 2.7137014865875244, "loss/hidden": 2.9296875, "loss/jsd": 0.0, "loss/logits": 0.2347271665930748, "step": 4958 }, { "epoch": 0.31, "grad_norm": 2.328125, "grad_norm_var": 0.0032379150390625, "learning_rate": 0.0001, "loss": 7.5647, "loss/crossentropy": 2.31722891330719, "loss/hidden": 2.875, "loss/jsd": 0.0, "loss/logits": 0.21784493327140808, "step": 4960 }, { "epoch": 0.310125, "grad_norm": 2.671875, "grad_norm_var": 0.0297760009765625, "learning_rate": 0.0001, "loss": 7.5544, "loss/crossentropy": 2.3354387283325195, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.2259213700890541, "step": 4962 }, { "epoch": 0.31025, "grad_norm": 2.03125, "grad_norm_var": 0.0390777587890625, "learning_rate": 0.0001, "loss": 7.2878, "loss/crossentropy": 1.9992610216140747, "loss/hidden": 2.8046875, "loss/jsd": 0.0, "loss/logits": 0.206348218023777, "step": 4964 }, { "epoch": 0.310375, "grad_norm": 2.25, "grad_norm_var": 0.041304524739583334, "learning_rate": 0.0001, "loss": 7.1877, "loss/crossentropy": 2.0673335790634155, "loss/hidden": 2.9453125, "loss/jsd": 0.0, "loss/logits": 0.21330340206623077, "step": 4966 }, { "epoch": 0.3105, "grad_norm": 2.109375, "grad_norm_var": 0.04544270833333333, "learning_rate": 0.0001, "loss": 7.2614, "loss/crossentropy": 2.302871584892273, "loss/hidden": 2.8828125, "loss/jsd": 0.0, "loss/logits": 0.2201063483953476, "step": 4968 }, { "epoch": 0.310625, "grad_norm": 2.421875, "grad_norm_var": 0.0468414306640625, "learning_rate": 0.0001, "loss": 7.4131, "loss/crossentropy": 2.3394633531570435, "loss/hidden": 2.8828125, "loss/jsd": 0.0, "loss/logits": 0.2507524937391281, "step": 4970 }, { "epoch": 0.31075, "grad_norm": 2.171875, "grad_norm_var": 0.04869384765625, "learning_rate": 0.0001, "loss": 7.5202, "loss/crossentropy": 2.5619258880615234, "loss/hidden": 2.90625, "loss/jsd": 0.0, "loss/logits": 0.2357812374830246, "step": 4972 }, { "epoch": 0.310875, "grad_norm": 2.578125, "grad_norm_var": 0.05373433430989583, "learning_rate": 0.0001, "loss": 7.3841, "loss/crossentropy": 2.0160459876060486, "loss/hidden": 2.875, "loss/jsd": 0.0, "loss/logits": 0.22442921996116638, "step": 4974 }, { "epoch": 0.311, "grad_norm": 2.109375, "grad_norm_var": 0.0671875, "learning_rate": 0.0001, "loss": 7.4319, "loss/crossentropy": 2.474762439727783, "loss/hidden": 2.90625, "loss/jsd": 0.0, "loss/logits": 0.22143270820379257, "step": 4976 }, { "epoch": 0.311125, "grad_norm": 2.171875, "grad_norm_var": 0.04348551432291667, "learning_rate": 0.0001, "loss": 7.3106, "loss/crossentropy": 2.3460559844970703, "loss/hidden": 2.9453125, "loss/jsd": 0.0, "loss/logits": 0.21027249097824097, "step": 4978 }, { "epoch": 0.31125, "grad_norm": 2.21875, "grad_norm_var": 0.0361328125, "learning_rate": 0.0001, "loss": 7.2274, "loss/crossentropy": 1.8983039855957031, "loss/hidden": 2.9453125, "loss/jsd": 0.0, "loss/logits": 0.19185489416122437, "step": 4980 }, { "epoch": 0.311375, "grad_norm": 2.34375, "grad_norm_var": 0.0352447509765625, "learning_rate": 0.0001, "loss": 7.5014, "loss/crossentropy": 2.4294410943984985, "loss/hidden": 2.8671875, "loss/jsd": 0.0, "loss/logits": 0.21551431715488434, "step": 4982 }, { "epoch": 0.3115, "grad_norm": 2.484375, "grad_norm_var": 0.030516560872395834, "learning_rate": 0.0001, "loss": 7.1866, "loss/crossentropy": 2.183108687400818, "loss/hidden": 2.953125, "loss/jsd": 0.0, "loss/logits": 0.21338380873203278, "step": 4984 }, { "epoch": 0.311625, "grad_norm": 2.21875, "grad_norm_var": 0.033463541666666666, "learning_rate": 0.0001, "loss": 7.3706, "loss/crossentropy": 2.182175934314728, "loss/hidden": 2.9375, "loss/jsd": 0.0, "loss/logits": 0.21428421884775162, "step": 4986 }, { "epoch": 0.31175, "grad_norm": 2.40625, "grad_norm_var": 0.031468709309895836, "learning_rate": 0.0001, "loss": 7.5321, "loss/crossentropy": 2.381687641143799, "loss/hidden": 2.9453125, "loss/jsd": 0.0, "loss/logits": 0.21604669839143753, "step": 4988 }, { "epoch": 0.311875, "grad_norm": 2.203125, "grad_norm_var": 0.02672119140625, "learning_rate": 0.0001, "loss": 7.3109, "loss/crossentropy": 2.211581826210022, "loss/hidden": 2.890625, "loss/jsd": 0.0, "loss/logits": 0.22757954895496368, "step": 4990 }, { "epoch": 0.312, "grad_norm": 2.328125, "grad_norm_var": 0.012214152018229167, "learning_rate": 0.0001, "loss": 7.3448, "loss/crossentropy": 2.164597749710083, "loss/hidden": 2.9296875, "loss/jsd": 0.0, "loss/logits": 0.229446142911911, "step": 4992 }, { "epoch": 0.312125, "grad_norm": 2.3125, "grad_norm_var": 0.009943644205729166, "learning_rate": 0.0001, "loss": 7.2848, "loss/crossentropy": 2.303720474243164, "loss/hidden": 2.890625, "loss/jsd": 0.0, "loss/logits": 0.21908074617385864, "step": 4994 }, { "epoch": 0.31225, "grad_norm": 2.21875, "grad_norm_var": 0.011351521809895833, "learning_rate": 0.0001, "loss": 7.4035, "loss/crossentropy": 2.4575328826904297, "loss/hidden": 2.890625, "loss/jsd": 0.0, "loss/logits": 0.22248198091983795, "step": 4996 }, { "epoch": 0.312375, "grad_norm": 2.453125, "grad_norm_var": 0.018700154622395833, "learning_rate": 0.0001, "loss": 7.507, "loss/crossentropy": 2.3077362775802612, "loss/hidden": 2.8828125, "loss/jsd": 0.0, "loss/logits": 0.21921824663877487, "step": 4998 }, { "epoch": 0.3125, "grad_norm": 2.296875, "grad_norm_var": 0.017406209309895834, "learning_rate": 0.0001, "loss": 7.3155, "loss/crossentropy": 2.1894084215164185, "loss/hidden": 2.875, "loss/jsd": 0.0, "loss/logits": 0.21899671852588654, "step": 5000 }, { "epoch": 0.312625, "grad_norm": 2.34375, "grad_norm_var": 0.015135701497395833, "learning_rate": 0.0001, "loss": 7.3408, "loss/crossentropy": 2.262491822242737, "loss/hidden": 2.8046875, "loss/jsd": 0.0, "loss/logits": 0.19611389189958572, "step": 5002 }, { "epoch": 0.31275, "grad_norm": 2.53125, "grad_norm_var": 0.01842041015625, "learning_rate": 0.0001, "loss": 7.2152, "loss/crossentropy": 1.982454240322113, "loss/hidden": 2.9609375, "loss/jsd": 0.0, "loss/logits": 0.20768950879573822, "step": 5004 }, { "epoch": 0.312875, "grad_norm": 2.03125, "grad_norm_var": 0.023763020833333332, "learning_rate": 0.0001, "loss": 7.2139, "loss/crossentropy": 2.2430403232574463, "loss/hidden": 2.9140625, "loss/jsd": 0.0, "loss/logits": 0.21962688863277435, "step": 5006 }, { "epoch": 0.313, "grad_norm": 2.359375, "grad_norm_var": 0.027595011393229167, "learning_rate": 0.0001, "loss": 7.3438, "loss/crossentropy": 2.3544063568115234, "loss/hidden": 2.8984375, "loss/jsd": 0.0, "loss/logits": 0.22437047958374023, "step": 5008 }, { "epoch": 0.313125, "grad_norm": 2.109375, "grad_norm_var": 0.03183186848958333, "learning_rate": 0.0001, "loss": 7.3206, "loss/crossentropy": 2.342305541038513, "loss/hidden": 2.9296875, "loss/jsd": 0.0, "loss/logits": 0.2293403297662735, "step": 5010 }, { "epoch": 0.31325, "grad_norm": 2.28125, "grad_norm_var": 0.031769816080729166, "learning_rate": 0.0001, "loss": 7.2882, "loss/crossentropy": 2.2680485248565674, "loss/hidden": 2.859375, "loss/jsd": 0.0, "loss/logits": 0.22678761184215546, "step": 5012 }, { "epoch": 0.313375, "grad_norm": 2.390625, "grad_norm_var": 0.024507649739583335, "learning_rate": 0.0001, "loss": 7.4037, "loss/crossentropy": 2.13227915763855, "loss/hidden": 2.875, "loss/jsd": 0.0, "loss/logits": 0.20939987897872925, "step": 5014 }, { "epoch": 0.3135, "grad_norm": 2.15625, "grad_norm_var": 0.0258697509765625, "learning_rate": 0.0001, "loss": 7.3085, "loss/crossentropy": 2.2183371782302856, "loss/hidden": 2.9140625, "loss/jsd": 0.0, "loss/logits": 0.22399604320526123, "step": 5016 }, { "epoch": 0.313625, "grad_norm": 2.34375, "grad_norm_var": 0.023534138997395832, "learning_rate": 0.0001, "loss": 7.3265, "loss/crossentropy": 2.3377569913864136, "loss/hidden": 2.8125, "loss/jsd": 0.0, "loss/logits": 0.21345963329076767, "step": 5018 }, { "epoch": 0.31375, "grad_norm": 2.140625, "grad_norm_var": 0.022001139322916665, "learning_rate": 0.0001, "loss": 7.203, "loss/crossentropy": 2.2402291893959045, "loss/hidden": 2.8125, "loss/jsd": 0.0, "loss/logits": 0.2167833298444748, "step": 5020 }, { "epoch": 0.313875, "grad_norm": 2.21875, "grad_norm_var": 0.0185211181640625, "learning_rate": 0.0001, "loss": 7.1242, "loss/crossentropy": 2.1168267726898193, "loss/hidden": 2.9453125, "loss/jsd": 0.0, "loss/logits": 0.20187420397996902, "step": 5022 }, { "epoch": 0.314, "grad_norm": 2.328125, "grad_norm_var": 0.017838541666666666, "learning_rate": 0.0001, "loss": 7.3825, "loss/crossentropy": 2.1705461740493774, "loss/hidden": 2.828125, "loss/jsd": 0.0, "loss/logits": 0.21749653667211533, "step": 5024 }, { "epoch": 0.314125, "grad_norm": 2.453125, "grad_norm_var": 0.015705362955729166, "learning_rate": 0.0001, "loss": 7.3272, "loss/crossentropy": 2.2871659994125366, "loss/hidden": 2.875, "loss/jsd": 0.0, "loss/logits": 0.23058424144983292, "step": 5026 }, { "epoch": 0.31425, "grad_norm": 2.296875, "grad_norm_var": 0.0137359619140625, "learning_rate": 0.0001, "loss": 7.3611, "loss/crossentropy": 2.341141700744629, "loss/hidden": 2.8984375, "loss/jsd": 0.0, "loss/logits": 0.23287169635295868, "step": 5028 }, { "epoch": 0.314375, "grad_norm": 2.25, "grad_norm_var": 0.013993326822916667, "learning_rate": 0.0001, "loss": 7.3674, "loss/crossentropy": 1.9995309114456177, "loss/hidden": 2.8203125, "loss/jsd": 0.0, "loss/logits": 0.20135702937841415, "step": 5030 }, { "epoch": 0.3145, "grad_norm": 2.3125, "grad_norm_var": 0.012694295247395833, "learning_rate": 0.0001, "loss": 7.2974, "loss/crossentropy": 2.396213173866272, "loss/hidden": 2.8984375, "loss/jsd": 0.0, "loss/logits": 0.22502055764198303, "step": 5032 }, { "epoch": 0.314625, "grad_norm": 2.234375, "grad_norm_var": 0.014188639322916667, "learning_rate": 0.0001, "loss": 7.3376, "loss/crossentropy": 2.290814518928528, "loss/hidden": 2.8671875, "loss/jsd": 0.0, "loss/logits": 0.21020909398794174, "step": 5034 }, { "epoch": 0.31475, "grad_norm": 2.0, "grad_norm_var": 0.017414347330729166, "learning_rate": 0.0001, "loss": 7.2327, "loss/crossentropy": 2.5261220932006836, "loss/hidden": 2.8515625, "loss/jsd": 0.0, "loss/logits": 0.21174553781747818, "step": 5036 }, { "epoch": 0.314875, "grad_norm": 2.40625, "grad_norm_var": 0.020197550455729168, "learning_rate": 0.0001, "loss": 7.2107, "loss/crossentropy": 2.3141993284225464, "loss/hidden": 2.890625, "loss/jsd": 0.0, "loss/logits": 0.21525625884532928, "step": 5038 }, { "epoch": 0.315, "grad_norm": 2.640625, "grad_norm_var": 0.04215087890625, "learning_rate": 0.0001, "loss": 7.3146, "loss/crossentropy": 2.3335756063461304, "loss/hidden": 2.8515625, "loss/jsd": 0.0, "loss/logits": 0.21623239666223526, "step": 5040 }, { "epoch": 0.315125, "grad_norm": 2.09375, "grad_norm_var": 0.045633951822916664, "learning_rate": 0.0001, "loss": 7.2735, "loss/crossentropy": 2.2438907623291016, "loss/hidden": 2.8671875, "loss/jsd": 0.0, "loss/logits": 0.22927288711071014, "step": 5042 }, { "epoch": 0.31525, "grad_norm": 2.421875, "grad_norm_var": 0.047261555989583336, "learning_rate": 0.0001, "loss": 7.4352, "loss/crossentropy": 2.2331418991088867, "loss/hidden": 2.875, "loss/jsd": 0.0, "loss/logits": 0.21440240740776062, "step": 5044 }, { "epoch": 0.315375, "grad_norm": 2.46875, "grad_norm_var": 0.0459136962890625, "learning_rate": 0.0001, "loss": 7.4132, "loss/crossentropy": 2.272444486618042, "loss/hidden": 2.890625, "loss/jsd": 0.0, "loss/logits": 0.2074766531586647, "step": 5046 }, { "epoch": 0.3155, "grad_norm": 2.140625, "grad_norm_var": 0.047526041666666664, "learning_rate": 0.0001, "loss": 7.1822, "loss/crossentropy": 2.2327855825424194, "loss/hidden": 2.9140625, "loss/jsd": 0.0, "loss/logits": 0.23146355152130127, "step": 5048 }, { "epoch": 0.315625, "grad_norm": 2.3125, "grad_norm_var": 0.048151652018229164, "learning_rate": 0.0001, "loss": 7.1235, "loss/crossentropy": 2.138622522354126, "loss/hidden": 2.8359375, "loss/jsd": 0.0, "loss/logits": 0.2159329056739807, "step": 5050 }, { "epoch": 0.31575, "grad_norm": 2.359375, "grad_norm_var": 0.03786519368489583, "learning_rate": 0.0001, "loss": 7.3683, "loss/crossentropy": 2.407191514968872, "loss/hidden": 2.875, "loss/jsd": 0.0, "loss/logits": 0.2199866697192192, "step": 5052 }, { "epoch": 0.315875, "grad_norm": 2.390625, "grad_norm_var": 0.03487955729166667, "learning_rate": 0.0001, "loss": 7.3308, "loss/crossentropy": 2.131745457649231, "loss/hidden": 2.90625, "loss/jsd": 0.0, "loss/logits": 0.209860198199749, "step": 5054 }, { "epoch": 0.316, "grad_norm": 2.125, "grad_norm_var": 0.018387858072916666, "learning_rate": 0.0001, "loss": 7.2193, "loss/crossentropy": 2.0351319909095764, "loss/hidden": 2.859375, "loss/jsd": 0.0, "loss/logits": 0.2149025946855545, "step": 5056 }, { "epoch": 0.316125, "grad_norm": 2.234375, "grad_norm_var": 0.014046223958333333, "learning_rate": 0.0001, "loss": 7.1825, "loss/crossentropy": 2.4224579334259033, "loss/hidden": 2.859375, "loss/jsd": 0.0, "loss/logits": 0.21975557506084442, "step": 5058 }, { "epoch": 0.31625, "grad_norm": 2.125, "grad_norm_var": 0.015070597330729166, "learning_rate": 0.0001, "loss": 7.273, "loss/crossentropy": 2.1384547352790833, "loss/hidden": 2.8828125, "loss/jsd": 0.0, "loss/logits": 0.2082139551639557, "step": 5060 }, { "epoch": 0.316375, "grad_norm": 2.203125, "grad_norm_var": 0.013044230143229167, "learning_rate": 0.0001, "loss": 7.3584, "loss/crossentropy": 2.4295462369918823, "loss/hidden": 2.8359375, "loss/jsd": 0.0, "loss/logits": 0.22232317924499512, "step": 5062 }, { "epoch": 0.3165, "grad_norm": 2.171875, "grad_norm_var": 0.0131256103515625, "learning_rate": 0.0001, "loss": 7.3255, "loss/crossentropy": 2.2445785999298096, "loss/hidden": 2.859375, "loss/jsd": 0.0, "loss/logits": 0.2112947255373001, "step": 5064 }, { "epoch": 0.316625, "grad_norm": 2.265625, "grad_norm_var": 0.0092193603515625, "learning_rate": 0.0001, "loss": 7.3904, "loss/crossentropy": 2.491084575653076, "loss/hidden": 2.890625, "loss/jsd": 0.0, "loss/logits": 0.23876774311065674, "step": 5066 }, { "epoch": 0.31675, "grad_norm": 2.265625, "grad_norm_var": 0.00875244140625, "learning_rate": 0.0001, "loss": 7.3886, "loss/crossentropy": 2.209762454032898, "loss/hidden": 2.8203125, "loss/jsd": 0.0, "loss/logits": 0.21998873353004456, "step": 5068 }, { "epoch": 0.316875, "grad_norm": 2.1875, "grad_norm_var": 0.007373046875, "learning_rate": 0.0001, "loss": 7.354, "loss/crossentropy": 2.184313654899597, "loss/hidden": 2.9453125, "loss/jsd": 0.0, "loss/logits": 0.21638303250074387, "step": 5070 }, { "epoch": 0.317, "grad_norm": 2.40625, "grad_norm_var": 0.00654296875, "learning_rate": 0.0001, "loss": 7.2445, "loss/crossentropy": 1.932084858417511, "loss/hidden": 2.84375, "loss/jsd": 0.0, "loss/logits": 0.20044665038585663, "step": 5072 }, { "epoch": 0.317125, "grad_norm": 2.140625, "grad_norm_var": 0.00751953125, "learning_rate": 0.0001, "loss": 7.147, "loss/crossentropy": 2.099605619907379, "loss/hidden": 2.84375, "loss/jsd": 0.0, "loss/logits": 0.1961287409067154, "step": 5074 }, { "epoch": 0.31725, "grad_norm": 2.40625, "grad_norm_var": 0.0076080322265625, "learning_rate": 0.0001, "loss": 7.4359, "loss/crossentropy": 2.4930461645126343, "loss/hidden": 2.84375, "loss/jsd": 0.0, "loss/logits": 0.19766415655612946, "step": 5076 }, { "epoch": 0.317375, "grad_norm": 2.09375, "grad_norm_var": 0.009847005208333334, "learning_rate": 0.0001, "loss": 7.2892, "loss/crossentropy": 2.0933732390403748, "loss/hidden": 2.8125, "loss/jsd": 0.0, "loss/logits": 0.2190747633576393, "step": 5078 }, { "epoch": 0.3175, "grad_norm": 2.34375, "grad_norm_var": 0.009794108072916667, "learning_rate": 0.0001, "loss": 7.2631, "loss/crossentropy": 2.2265865802764893, "loss/hidden": 2.8125, "loss/jsd": 0.0, "loss/logits": 0.19832175970077515, "step": 5080 }, { "epoch": 0.317625, "grad_norm": 2.078125, "grad_norm_var": 0.012955729166666667, "learning_rate": 0.0001, "loss": 7.3764, "loss/crossentropy": 2.1202717423439026, "loss/hidden": 2.859375, "loss/jsd": 0.0, "loss/logits": 0.22476188838481903, "step": 5082 }, { "epoch": 0.31775, "grad_norm": 2.171875, "grad_norm_var": 0.013671875, "learning_rate": 0.0001, "loss": 7.182, "loss/crossentropy": 2.208008825778961, "loss/hidden": 2.828125, "loss/jsd": 0.0, "loss/logits": 0.21714557707309723, "step": 5084 }, { "epoch": 0.317875, "grad_norm": 2.328125, "grad_norm_var": 0.0148345947265625, "learning_rate": 0.0001, "loss": 7.2372, "loss/crossentropy": 2.3373151421546936, "loss/hidden": 2.875, "loss/jsd": 0.0, "loss/logits": 0.23226696997880936, "step": 5086 }, { "epoch": 0.318, "grad_norm": 2.40625, "grad_norm_var": 0.014256795247395834, "learning_rate": 0.0001, "loss": 7.3527, "loss/crossentropy": 2.235350489616394, "loss/hidden": 2.890625, "loss/jsd": 0.0, "loss/logits": 0.20813613384962082, "step": 5088 }, { "epoch": 0.318125, "grad_norm": 2.296875, "grad_norm_var": 0.014188639322916667, "learning_rate": 0.0001, "loss": 7.3671, "loss/crossentropy": 2.436918258666992, "loss/hidden": 2.90625, "loss/jsd": 0.0, "loss/logits": 0.2232442870736122, "step": 5090 }, { "epoch": 0.31825, "grad_norm": 2.21875, "grad_norm_var": 0.018684895833333333, "learning_rate": 0.0001, "loss": 7.2646, "loss/crossentropy": 2.3693257570266724, "loss/hidden": 2.9140625, "loss/jsd": 0.0, "loss/logits": 0.2209920957684517, "step": 5092 }, { "epoch": 0.318375, "grad_norm": 2.171875, "grad_norm_var": 0.020164998372395833, "learning_rate": 0.0001, "loss": 7.4097, "loss/crossentropy": 2.5247997045516968, "loss/hidden": 2.9453125, "loss/jsd": 0.0, "loss/logits": 0.23687294870615005, "step": 5094 }, { "epoch": 0.3185, "grad_norm": 2.359375, "grad_norm_var": 0.020384724934895834, "learning_rate": 0.0001, "loss": 7.2837, "loss/crossentropy": 2.009072959423065, "loss/hidden": 2.859375, "loss/jsd": 0.0, "loss/logits": 0.20863434672355652, "step": 5096 }, { "epoch": 0.318625, "grad_norm": 2.09375, "grad_norm_var": 0.019173177083333333, "learning_rate": 0.0001, "loss": 7.1841, "loss/crossentropy": 2.075112044811249, "loss/hidden": 2.9296875, "loss/jsd": 0.0, "loss/logits": 0.2084924578666687, "step": 5098 }, { "epoch": 0.31875, "grad_norm": 2.28125, "grad_norm_var": 0.017878214518229168, "learning_rate": 0.0001, "loss": 7.2721, "loss/crossentropy": 2.15164053440094, "loss/hidden": 2.859375, "loss/jsd": 0.0, "loss/logits": 0.20492035150527954, "step": 5100 }, { "epoch": 0.318875, "grad_norm": 2.390625, "grad_norm_var": 0.01708984375, "learning_rate": 0.0001, "loss": 7.3294, "loss/crossentropy": 1.9480576515197754, "loss/hidden": 2.8671875, "loss/jsd": 0.0, "loss/logits": 0.20920200645923615, "step": 5102 }, { "epoch": 0.319, "grad_norm": 2.09375, "grad_norm_var": 0.018163045247395832, "learning_rate": 0.0001, "loss": 7.3463, "loss/crossentropy": 2.2869019508361816, "loss/hidden": 2.8828125, "loss/jsd": 0.0, "loss/logits": 0.21756701171398163, "step": 5104 }, { "epoch": 0.319125, "grad_norm": 2.28125, "grad_norm_var": 0.017650349934895834, "learning_rate": 0.0001, "loss": 7.1405, "loss/crossentropy": 2.0993104577064514, "loss/hidden": 2.859375, "loss/jsd": 0.0, "loss/logits": 0.21797939389944077, "step": 5106 }, { "epoch": 0.31925, "grad_norm": 2.234375, "grad_norm_var": 0.0272613525390625, "learning_rate": 0.0001, "loss": 7.3592, "loss/crossentropy": 2.4255337715148926, "loss/hidden": 2.890625, "loss/jsd": 0.0, "loss/logits": 0.21611501276493073, "step": 5108 }, { "epoch": 0.319375, "grad_norm": 2.21875, "grad_norm_var": 0.0240631103515625, "learning_rate": 0.0001, "loss": 7.2214, "loss/crossentropy": 2.370956540107727, "loss/hidden": 2.84375, "loss/jsd": 0.0, "loss/logits": 0.23814254999160767, "step": 5110 }, { "epoch": 0.3195, "grad_norm": 2.421875, "grad_norm_var": 0.025191243489583334, "learning_rate": 0.0001, "loss": 7.2919, "loss/crossentropy": 2.354593515396118, "loss/hidden": 2.8828125, "loss/jsd": 0.0, "loss/logits": 0.22845982760190964, "step": 5112 }, { "epoch": 0.319625, "grad_norm": 2.171875, "grad_norm_var": 0.023582967122395833, "learning_rate": 0.0001, "loss": 7.3072, "loss/crossentropy": 2.1754024028778076, "loss/hidden": 3.015625, "loss/jsd": 0.0, "loss/logits": 0.23337450623512268, "step": 5114 }, { "epoch": 0.31975, "grad_norm": 2.34375, "grad_norm_var": 0.0242340087890625, "learning_rate": 0.0001, "loss": 7.3127, "loss/crossentropy": 2.1386443972587585, "loss/hidden": 2.8046875, "loss/jsd": 0.0, "loss/logits": 0.2071130946278572, "step": 5116 }, { "epoch": 0.319875, "grad_norm": 2.234375, "grad_norm_var": 0.023531087239583335, "learning_rate": 0.0001, "loss": 7.3132, "loss/crossentropy": 2.3322278261184692, "loss/hidden": 2.9140625, "loss/jsd": 0.0, "loss/logits": 0.21129751950502396, "step": 5118 }, { "epoch": 0.32, "grad_norm": 2.265625, "grad_norm_var": 0.0241119384765625, "learning_rate": 0.0001, "loss": 7.2513, "loss/crossentropy": 2.294466018676758, "loss/hidden": 2.8828125, "loss/jsd": 0.0, "loss/logits": 0.20739653706550598, "step": 5120 }, { "epoch": 0.320125, "grad_norm": 2.171875, "grad_norm_var": 0.023388671875, "learning_rate": 0.0001, "loss": 7.1934, "loss/crossentropy": 2.16942036151886, "loss/hidden": 2.84375, "loss/jsd": 0.0, "loss/logits": 0.19805258512496948, "step": 5122 }, { "epoch": 0.32025, "grad_norm": 2.625, "grad_norm_var": 0.018831380208333335, "learning_rate": 0.0001, "loss": 7.507, "loss/crossentropy": 2.132123589515686, "loss/hidden": 2.8359375, "loss/jsd": 0.0, "loss/logits": 0.21495044976472855, "step": 5124 }, { "epoch": 0.320375, "grad_norm": 2.0, "grad_norm_var": 0.024616495768229166, "learning_rate": 0.0001, "loss": 7.2239, "loss/crossentropy": 1.760918915271759, "loss/hidden": 2.9296875, "loss/jsd": 0.0, "loss/logits": 0.17673537880182266, "step": 5126 }, { "epoch": 0.3205, "grad_norm": 2.3125, "grad_norm_var": 0.024144490559895832, "learning_rate": 0.0001, "loss": 7.4896, "loss/crossentropy": 2.4206674098968506, "loss/hidden": 2.90625, "loss/jsd": 0.0, "loss/logits": 0.2217917963862419, "step": 5128 }, { "epoch": 0.320625, "grad_norm": 2.28125, "grad_norm_var": 0.025658162434895833, "learning_rate": 0.0001, "loss": 7.2327, "loss/crossentropy": 2.1627532839775085, "loss/hidden": 2.8203125, "loss/jsd": 0.0, "loss/logits": 0.20185734331607819, "step": 5130 }, { "epoch": 0.32075, "grad_norm": 2.3125, "grad_norm_var": 0.0330078125, "learning_rate": 0.0001, "loss": 7.4634, "loss/crossentropy": 2.143425464630127, "loss/hidden": 2.921875, "loss/jsd": 0.0, "loss/logits": 0.21182992309331894, "step": 5132 }, { "epoch": 0.320875, "grad_norm": 2.375, "grad_norm_var": 0.03430887858072917, "learning_rate": 0.0001, "loss": 7.3735, "loss/crossentropy": 1.9043779969215393, "loss/hidden": 2.921875, "loss/jsd": 0.0, "loss/logits": 0.19466465711593628, "step": 5134 }, { "epoch": 0.321, "grad_norm": 2.046875, "grad_norm_var": 0.04011942545572917, "learning_rate": 0.0001, "loss": 7.2799, "loss/crossentropy": 2.3145501613616943, "loss/hidden": 2.8671875, "loss/jsd": 0.0, "loss/logits": 0.23457962274551392, "step": 5136 }, { "epoch": 0.321125, "grad_norm": 2.484375, "grad_norm_var": 0.040648396809895834, "learning_rate": 0.0001, "loss": 7.3412, "loss/crossentropy": 2.338579297065735, "loss/hidden": 2.8828125, "loss/jsd": 0.0, "loss/logits": 0.23789451271295547, "step": 5138 }, { "epoch": 0.32125, "grad_norm": 2.578125, "grad_norm_var": 0.036848958333333334, "learning_rate": 0.0001, "loss": 7.509, "loss/crossentropy": 2.0653016567230225, "loss/hidden": 2.9453125, "loss/jsd": 0.0, "loss/logits": 0.23478703200817108, "step": 5140 }, { "epoch": 0.321375, "grad_norm": 2.25, "grad_norm_var": 0.029866536458333332, "learning_rate": 0.0001, "loss": 7.4456, "loss/crossentropy": 2.146829605102539, "loss/hidden": 2.8671875, "loss/jsd": 0.0, "loss/logits": 0.2174145206809044, "step": 5142 }, { "epoch": 0.3215, "grad_norm": 2.28125, "grad_norm_var": 0.030256144205729165, "learning_rate": 0.0001, "loss": 7.2912, "loss/crossentropy": 2.283796191215515, "loss/hidden": 2.9140625, "loss/jsd": 0.0, "loss/logits": 0.22230414301156998, "step": 5144 }, { "epoch": 0.321625, "grad_norm": 2.703125, "grad_norm_var": 0.040022786458333334, "learning_rate": 0.0001, "loss": 7.3014, "loss/crossentropy": 2.309414267539978, "loss/hidden": 2.9453125, "loss/jsd": 0.0, "loss/logits": 0.24046258628368378, "step": 5146 }, { "epoch": 0.32175, "grad_norm": 1.9375, "grad_norm_var": 0.045531209309895834, "learning_rate": 0.0001, "loss": 7.1654, "loss/crossentropy": 2.361426830291748, "loss/hidden": 2.8984375, "loss/jsd": 0.0, "loss/logits": 0.21131903678178787, "step": 5148 }, { "epoch": 0.321875, "grad_norm": 2.40625, "grad_norm_var": 0.048628743489583334, "learning_rate": 0.0001, "loss": 7.3457, "loss/crossentropy": 2.2447391748428345, "loss/hidden": 2.9375, "loss/jsd": 0.0, "loss/logits": 0.23755492269992828, "step": 5150 }, { "epoch": 0.322, "grad_norm": 2.171875, "grad_norm_var": 0.041304524739583334, "learning_rate": 0.0001, "loss": 7.2372, "loss/crossentropy": 2.0738277435302734, "loss/hidden": 2.8984375, "loss/jsd": 0.0, "loss/logits": 0.20171590149402618, "step": 5152 }, { "epoch": 0.322125, "grad_norm": 2.21875, "grad_norm_var": 0.0625396728515625, "learning_rate": 0.0001, "loss": 7.3119, "loss/crossentropy": 2.452193021774292, "loss/hidden": 2.8671875, "loss/jsd": 0.0, "loss/logits": 0.2211400717496872, "step": 5154 }, { "epoch": 0.32225, "grad_norm": 2.359375, "grad_norm_var": 0.05943094889322917, "learning_rate": 0.0001, "loss": 7.5275, "loss/crossentropy": 2.1492894887924194, "loss/hidden": 2.9140625, "loss/jsd": 0.0, "loss/logits": 0.22379465401172638, "step": 5156 }, { "epoch": 0.322375, "grad_norm": 2.296875, "grad_norm_var": 0.05969950358072917, "learning_rate": 0.0001, "loss": 7.1852, "loss/crossentropy": 2.1073238849639893, "loss/hidden": 2.84375, "loss/jsd": 0.0, "loss/logits": 0.22517702728509903, "step": 5158 }, { "epoch": 0.3225, "grad_norm": 2.375, "grad_norm_var": 0.06669921875, "learning_rate": 0.0001, "loss": 7.4154, "loss/crossentropy": 2.211340069770813, "loss/hidden": 2.8828125, "loss/jsd": 0.0, "loss/logits": 0.21814390271902084, "step": 5160 }, { "epoch": 0.322625, "grad_norm": 2.3125, "grad_norm_var": 0.05657145182291667, "learning_rate": 0.0001, "loss": 7.4603, "loss/crossentropy": 2.1871854066848755, "loss/hidden": 2.875, "loss/jsd": 0.0, "loss/logits": 0.21511095762252808, "step": 5162 }, { "epoch": 0.32275, "grad_norm": 2.46875, "grad_norm_var": 0.04453837076822917, "learning_rate": 0.0001, "loss": 7.3515, "loss/crossentropy": 2.2441636323928833, "loss/hidden": 2.859375, "loss/jsd": 0.0, "loss/logits": 0.23230791091918945, "step": 5164 }, { "epoch": 0.322875, "grad_norm": 2.15625, "grad_norm_var": 0.040892537434895834, "learning_rate": 0.0001, "loss": 7.1981, "loss/crossentropy": 2.134332001209259, "loss/hidden": 2.875, "loss/jsd": 0.0, "loss/logits": 0.21239123493433, "step": 5166 }, { "epoch": 0.323, "grad_norm": 3.125, "grad_norm_var": 0.07382405598958333, "learning_rate": 0.0001, "loss": 7.5831, "loss/crossentropy": 2.3247495889663696, "loss/hidden": 2.875, "loss/jsd": 0.0, "loss/logits": 0.21503940969705582, "step": 5168 }, { "epoch": 0.323125, "grad_norm": 2.484375, "grad_norm_var": 0.059244791666666664, "learning_rate": 0.0001, "loss": 7.5669, "loss/crossentropy": 2.187830626964569, "loss/hidden": 2.96875, "loss/jsd": 0.0, "loss/logits": 0.24586067348718643, "step": 5170 }, { "epoch": 0.32325, "grad_norm": 2.265625, "grad_norm_var": 0.06236572265625, "learning_rate": 0.0001, "loss": 7.2922, "loss/crossentropy": 2.291743755340576, "loss/hidden": 2.890625, "loss/jsd": 0.0, "loss/logits": 0.22773872315883636, "step": 5172 }, { "epoch": 0.323375, "grad_norm": 2.3125, "grad_norm_var": 0.06570536295572917, "learning_rate": 0.0001, "loss": 7.2504, "loss/crossentropy": 2.20532488822937, "loss/hidden": 2.859375, "loss/jsd": 0.0, "loss/logits": 0.21305081248283386, "step": 5174 }, { "epoch": 0.3235, "grad_norm": 2.25, "grad_norm_var": 0.06047770182291667, "learning_rate": 0.0001, "loss": 7.3579, "loss/crossentropy": 2.226097345352173, "loss/hidden": 2.8515625, "loss/jsd": 0.0, "loss/logits": 0.223523810505867, "step": 5176 }, { "epoch": 0.323625, "grad_norm": 2.234375, "grad_norm_var": 0.06304931640625, "learning_rate": 0.0001, "loss": 7.0957, "loss/crossentropy": 2.05389004945755, "loss/hidden": 2.84375, "loss/jsd": 0.0, "loss/logits": 0.20273905247449875, "step": 5178 }, { "epoch": 0.32375, "grad_norm": 2.109375, "grad_norm_var": 0.06721089680989584, "learning_rate": 0.0001, "loss": 7.2408, "loss/crossentropy": 2.0276909470558167, "loss/hidden": 2.8359375, "loss/jsd": 0.0, "loss/logits": 0.20203246176242828, "step": 5180 }, { "epoch": 0.323875, "grad_norm": 2.328125, "grad_norm_var": 0.06479390462239583, "learning_rate": 0.0001, "loss": 7.5044, "loss/crossentropy": 2.2829922437667847, "loss/hidden": 2.8828125, "loss/jsd": 0.0, "loss/logits": 0.21530025452375412, "step": 5182 }, { "epoch": 0.324, "grad_norm": 2.125, "grad_norm_var": 0.023453776041666666, "learning_rate": 0.0001, "loss": 7.3556, "loss/crossentropy": 2.1827311515808105, "loss/hidden": 2.90625, "loss/jsd": 0.0, "loss/logits": 0.218561053276062, "step": 5184 }, { "epoch": 0.324125, "grad_norm": 2.234375, "grad_norm_var": 0.01080322265625, "learning_rate": 0.0001, "loss": 7.3249, "loss/crossentropy": 2.091078519821167, "loss/hidden": 2.90625, "loss/jsd": 0.0, "loss/logits": 0.23558902740478516, "step": 5186 }, { "epoch": 0.32425, "grad_norm": 2.84375, "grad_norm_var": 0.0319244384765625, "learning_rate": 0.0001, "loss": 7.3744, "loss/crossentropy": 1.8757956624031067, "loss/hidden": 2.78125, "loss/jsd": 0.0, "loss/logits": 0.19631417095661163, "step": 5188 }, { "epoch": 0.324375, "grad_norm": 2.328125, "grad_norm_var": 0.031037394205729166, "learning_rate": 0.0001, "loss": 7.284, "loss/crossentropy": 2.342907428741455, "loss/hidden": 2.8984375, "loss/jsd": 0.0, "loss/logits": 0.22963730245828629, "step": 5190 }, { "epoch": 0.3245, "grad_norm": 2.28125, "grad_norm_var": 0.027179972330729166, "learning_rate": 0.0001, "loss": 7.3116, "loss/crossentropy": 2.244239926338196, "loss/hidden": 2.859375, "loss/jsd": 0.0, "loss/logits": 0.2171139419078827, "step": 5192 }, { "epoch": 0.324625, "grad_norm": 2.3125, "grad_norm_var": 0.026656087239583334, "learning_rate": 0.0001, "loss": 7.26, "loss/crossentropy": 2.035650849342346, "loss/hidden": 2.875, "loss/jsd": 0.0, "loss/logits": 0.2151600569486618, "step": 5194 }, { "epoch": 0.32475, "grad_norm": 2.265625, "grad_norm_var": 0.023274739583333332, "learning_rate": 0.0001, "loss": 7.3792, "loss/crossentropy": 2.309348702430725, "loss/hidden": 2.859375, "loss/jsd": 0.0, "loss/logits": 0.21702983975410461, "step": 5196 }, { "epoch": 0.324875, "grad_norm": 2.140625, "grad_norm_var": 0.026732381184895834, "learning_rate": 0.0001, "loss": 7.1716, "loss/crossentropy": 2.129282593727112, "loss/hidden": 2.8515625, "loss/jsd": 0.0, "loss/logits": 0.21354100108146667, "step": 5198 }, { "epoch": 0.325, "grad_norm": 2.28125, "grad_norm_var": 0.031981404622395834, "learning_rate": 0.0001, "loss": 7.2898, "loss/crossentropy": 2.0915945172309875, "loss/hidden": 2.828125, "loss/jsd": 0.0, "loss/logits": 0.19767944514751434, "step": 5200 }, { "epoch": 0.325125, "grad_norm": 2.515625, "grad_norm_var": 0.0355133056640625, "learning_rate": 0.0001, "loss": 7.249, "loss/crossentropy": 2.389148235321045, "loss/hidden": 2.8828125, "loss/jsd": 0.0, "loss/logits": 0.23801419138908386, "step": 5202 }, { "epoch": 0.32525, "grad_norm": 2.203125, "grad_norm_var": 0.014774576822916666, "learning_rate": 0.0001, "loss": 7.4405, "loss/crossentropy": 2.0830936431884766, "loss/hidden": 2.8046875, "loss/jsd": 0.0, "loss/logits": 0.19635440409183502, "step": 5204 }, { "epoch": 0.325375, "grad_norm": 2.390625, "grad_norm_var": 0.016304524739583333, "learning_rate": 0.0001, "loss": 7.2434, "loss/crossentropy": 2.287827789783478, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.22433650493621826, "step": 5206 }, { "epoch": 0.3255, "grad_norm": 2.1875, "grad_norm_var": 0.015111287434895834, "learning_rate": 0.0001, "loss": 7.4008, "loss/crossentropy": 2.550079107284546, "loss/hidden": 2.9921875, "loss/jsd": 0.0, "loss/logits": 0.22946575284004211, "step": 5208 }, { "epoch": 0.325625, "grad_norm": 2.234375, "grad_norm_var": 0.0140777587890625, "learning_rate": 0.0001, "loss": 7.2935, "loss/crossentropy": 2.1926426887512207, "loss/hidden": 2.8828125, "loss/jsd": 0.0, "loss/logits": 0.2117481455206871, "step": 5210 }, { "epoch": 0.32575, "grad_norm": 2.546875, "grad_norm_var": 0.020099894205729166, "learning_rate": 0.0001, "loss": 7.2581, "loss/crossentropy": 2.2146820425987244, "loss/hidden": 2.8984375, "loss/jsd": 0.0, "loss/logits": 0.2380428984761238, "step": 5212 }, { "epoch": 0.325875, "grad_norm": 2.046875, "grad_norm_var": 0.021708170572916668, "learning_rate": 0.0001, "loss": 7.2562, "loss/crossentropy": 2.3124881982803345, "loss/hidden": 2.875, "loss/jsd": 0.0, "loss/logits": 0.21675492078065872, "step": 5214 }, { "epoch": 0.326, "grad_norm": 2.21875, "grad_norm_var": 0.016161092122395835, "learning_rate": 0.0001, "loss": 7.1713, "loss/crossentropy": 2.3529210090637207, "loss/hidden": 2.9140625, "loss/jsd": 0.0, "loss/logits": 0.223765030503273, "step": 5216 }, { "epoch": 0.326125, "grad_norm": 2.46875, "grad_norm_var": 0.020930989583333334, "learning_rate": 0.0001, "loss": 7.3457, "loss/crossentropy": 2.2348451614379883, "loss/hidden": 2.8828125, "loss/jsd": 0.0, "loss/logits": 0.23559778928756714, "step": 5218 }, { "epoch": 0.32625, "grad_norm": 2.015625, "grad_norm_var": 0.026448567708333332, "learning_rate": 0.0001, "loss": 7.1213, "loss/crossentropy": 2.407866954803467, "loss/hidden": 2.8828125, "loss/jsd": 0.0, "loss/logits": 0.23116052150726318, "step": 5220 }, { "epoch": 0.326375, "grad_norm": 2.390625, "grad_norm_var": 0.025267537434895834, "learning_rate": 0.0001, "loss": 7.43, "loss/crossentropy": 2.282076358795166, "loss/hidden": 2.9375, "loss/jsd": 0.0, "loss/logits": 0.22940561920404434, "step": 5222 }, { "epoch": 0.3265, "grad_norm": 2.109375, "grad_norm_var": 0.028270467122395834, "learning_rate": 0.0001, "loss": 7.2729, "loss/crossentropy": 2.0684497356414795, "loss/hidden": 2.8125, "loss/jsd": 0.0, "loss/logits": 0.20412250608205795, "step": 5224 }, { "epoch": 0.326625, "grad_norm": 2.15625, "grad_norm_var": 0.029150390625, "learning_rate": 0.0001, "loss": 7.4137, "loss/crossentropy": 2.4470431804656982, "loss/hidden": 2.953125, "loss/jsd": 0.0, "loss/logits": 0.2289155125617981, "step": 5226 }, { "epoch": 0.32675, "grad_norm": 2.21875, "grad_norm_var": 0.025755818684895834, "learning_rate": 0.0001, "loss": 7.1882, "loss/crossentropy": 2.232643723487854, "loss/hidden": 2.9375, "loss/jsd": 0.0, "loss/logits": 0.22333452105522156, "step": 5228 }, { "epoch": 0.326875, "grad_norm": 2.21875, "grad_norm_var": 0.022977701822916665, "learning_rate": 0.0001, "loss": 7.1683, "loss/crossentropy": 1.9237089157104492, "loss/hidden": 2.8046875, "loss/jsd": 0.0, "loss/logits": 0.20895886421203613, "step": 5230 }, { "epoch": 0.327, "grad_norm": 2.34375, "grad_norm_var": 0.023460896809895833, "learning_rate": 0.0001, "loss": 7.392, "loss/crossentropy": 2.570519208908081, "loss/hidden": 2.9140625, "loss/jsd": 0.0, "loss/logits": 0.2476401850581169, "step": 5232 }, { "epoch": 0.327125, "grad_norm": 2.21875, "grad_norm_var": 0.011909993489583333, "learning_rate": 0.0001, "loss": 7.1545, "loss/crossentropy": 2.332336902618408, "loss/hidden": 2.8515625, "loss/jsd": 0.0, "loss/logits": 0.22975466400384903, "step": 5234 }, { "epoch": 0.32725, "grad_norm": 3.859375, "grad_norm_var": 0.1721588134765625, "learning_rate": 0.0001, "loss": 7.4894, "loss/crossentropy": 2.4853312969207764, "loss/hidden": 2.984375, "loss/jsd": 0.0, "loss/logits": 0.22591929137706757, "step": 5236 }, { "epoch": 0.327375, "grad_norm": 2.140625, "grad_norm_var": 0.17515869140625, "learning_rate": 0.0001, "loss": 7.3073, "loss/crossentropy": 2.4176896810531616, "loss/hidden": 2.8828125, "loss/jsd": 0.0, "loss/logits": 0.2283809781074524, "step": 5238 }, { "epoch": 0.3275, "grad_norm": 2.25, "grad_norm_var": 0.1721588134765625, "learning_rate": 0.0001, "loss": 7.441, "loss/crossentropy": 2.581329822540283, "loss/hidden": 2.9375, "loss/jsd": 0.0, "loss/logits": 0.24542076140642166, "step": 5240 }, { "epoch": 0.327625, "grad_norm": 2.4375, "grad_norm_var": 0.16851806640625, "learning_rate": 0.0001, "loss": 7.2261, "loss/crossentropy": 2.2448220252990723, "loss/hidden": 2.890625, "loss/jsd": 0.0, "loss/logits": 0.22246970981359482, "step": 5242 }, { "epoch": 0.32775, "grad_norm": 2.15625, "grad_norm_var": 0.16536051432291668, "learning_rate": 0.0001, "loss": 7.3095, "loss/crossentropy": 2.354673981666565, "loss/hidden": 2.9453125, "loss/jsd": 0.0, "loss/logits": 0.2191583588719368, "step": 5244 }, { "epoch": 0.327875, "grad_norm": 2.171875, "grad_norm_var": 0.16868082682291666, "learning_rate": 0.0001, "loss": 7.3025, "loss/crossentropy": 2.3827792406082153, "loss/hidden": 2.8984375, "loss/jsd": 0.0, "loss/logits": 0.23692472279071808, "step": 5246 }, { "epoch": 0.328, "grad_norm": 2.234375, "grad_norm_var": 0.17063395182291666, "learning_rate": 0.0001, "loss": 7.3818, "loss/crossentropy": 2.433183193206787, "loss/hidden": 2.8671875, "loss/jsd": 0.0, "loss/logits": 0.22385024279356003, "step": 5248 }, { "epoch": 0.328125, "grad_norm": 2.28125, "grad_norm_var": 0.16874898274739583, "learning_rate": 0.0001, "loss": 7.3701, "loss/crossentropy": 2.1747193932533264, "loss/hidden": 2.8515625, "loss/jsd": 0.0, "loss/logits": 0.2178487330675125, "step": 5250 }, { "epoch": 0.32825, "grad_norm": 2.4375, "grad_norm_var": 0.01500244140625, "learning_rate": 0.0001, "loss": 7.444, "loss/crossentropy": 2.0669074058532715, "loss/hidden": 2.921875, "loss/jsd": 0.0, "loss/logits": 0.22157609462738037, "step": 5252 }, { "epoch": 0.328375, "grad_norm": 2.25, "grad_norm_var": 0.014655558268229167, "learning_rate": 0.0001, "loss": 7.6055, "loss/crossentropy": 2.4753568172454834, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.2559722363948822, "step": 5254 }, { "epoch": 0.3285, "grad_norm": 2.28125, "grad_norm_var": 0.014208984375, "learning_rate": 0.0001, "loss": 7.3287, "loss/crossentropy": 2.3594852685928345, "loss/hidden": 2.8359375, "loss/jsd": 0.0, "loss/logits": 0.2157405987381935, "step": 5256 }, { "epoch": 0.328625, "grad_norm": 2.3125, "grad_norm_var": 0.012572224934895833, "learning_rate": 0.0001, "loss": 7.4131, "loss/crossentropy": 2.2439894676208496, "loss/hidden": 2.8828125, "loss/jsd": 0.0, "loss/logits": 0.21909688413143158, "step": 5258 }, { "epoch": 0.32875, "grad_norm": 2.375, "grad_norm_var": 0.010091145833333334, "learning_rate": 0.0001, "loss": 7.1475, "loss/crossentropy": 2.136144995689392, "loss/hidden": 3.0078125, "loss/jsd": 0.0, "loss/logits": 0.22025711089372635, "step": 5260 }, { "epoch": 0.328875, "grad_norm": 2.21875, "grad_norm_var": 0.009544881184895833, "learning_rate": 0.0001, "loss": 7.4255, "loss/crossentropy": 2.1109477281570435, "loss/hidden": 2.875, "loss/jsd": 0.0, "loss/logits": 0.2135395109653473, "step": 5262 }, { "epoch": 0.329, "grad_norm": 2.5, "grad_norm_var": 0.011844889322916666, "learning_rate": 0.0001, "loss": 7.5097, "loss/crossentropy": 2.205671548843384, "loss/hidden": 2.84375, "loss/jsd": 0.0, "loss/logits": 0.21154307574033737, "step": 5264 }, { "epoch": 0.329125, "grad_norm": 2.0, "grad_norm_var": 0.0176910400390625, "learning_rate": 0.0001, "loss": 7.1289, "loss/crossentropy": 2.044199228286743, "loss/hidden": 2.8984375, "loss/jsd": 0.0, "loss/logits": 0.19973917305469513, "step": 5266 }, { "epoch": 0.32925, "grad_norm": 2.578125, "grad_norm_var": 0.021708170572916668, "learning_rate": 0.0001, "loss": 7.4721, "loss/crossentropy": 2.234253406524658, "loss/hidden": 2.953125, "loss/jsd": 0.0, "loss/logits": 0.2099093198776245, "step": 5268 }, { "epoch": 0.329375, "grad_norm": 2.125, "grad_norm_var": 0.0228912353515625, "learning_rate": 0.0001, "loss": 7.2664, "loss/crossentropy": 2.252587676048279, "loss/hidden": 2.9453125, "loss/jsd": 0.0, "loss/logits": 0.21195102483034134, "step": 5270 }, { "epoch": 0.3295, "grad_norm": 2.59375, "grad_norm_var": 0.0292877197265625, "learning_rate": 0.0001, "loss": 7.3687, "loss/crossentropy": 2.134685754776001, "loss/hidden": 2.8203125, "loss/jsd": 0.0, "loss/logits": 0.2187366485595703, "step": 5272 }, { "epoch": 0.329625, "grad_norm": 2.0625, "grad_norm_var": 0.03511962890625, "learning_rate": 0.0001, "loss": 7.3743, "loss/crossentropy": 2.278490424156189, "loss/hidden": 2.9609375, "loss/jsd": 0.0, "loss/logits": 0.23104965686798096, "step": 5274 }, { "epoch": 0.32975, "grad_norm": 2.359375, "grad_norm_var": 0.03319905598958333, "learning_rate": 0.0001, "loss": 7.2294, "loss/crossentropy": 2.3663605451583862, "loss/hidden": 2.9453125, "loss/jsd": 0.0, "loss/logits": 0.2136199250817299, "step": 5276 }, { "epoch": 0.329875, "grad_norm": 2.15625, "grad_norm_var": 0.03323160807291667, "learning_rate": 0.0001, "loss": 7.399, "loss/crossentropy": 2.3179785013198853, "loss/hidden": 2.8984375, "loss/jsd": 0.0, "loss/logits": 0.20722012221813202, "step": 5278 }, { "epoch": 0.33, "grad_norm": 2.3125, "grad_norm_var": 0.029841105143229168, "learning_rate": 0.0001, "loss": 7.4934, "loss/crossentropy": 2.3918418884277344, "loss/hidden": 2.921875, "loss/jsd": 0.0, "loss/logits": 0.2301948517560959, "step": 5280 }, { "epoch": 0.330125, "grad_norm": 2.390625, "grad_norm_var": 0.025178019205729166, "learning_rate": 0.0001, "loss": 7.2977, "loss/crossentropy": 2.0368301272392273, "loss/hidden": 2.828125, "loss/jsd": 0.0, "loss/logits": 0.2010180726647377, "step": 5282 }, { "epoch": 0.33025, "grad_norm": 2.328125, "grad_norm_var": 0.020308430989583334, "learning_rate": 0.0001, "loss": 7.2924, "loss/crossentropy": 2.419578790664673, "loss/hidden": 2.875, "loss/jsd": 0.0, "loss/logits": 0.22721153497695923, "step": 5284 }, { "epoch": 0.330375, "grad_norm": 2.34375, "grad_norm_var": 0.016722615559895834, "learning_rate": 0.0001, "loss": 7.2706, "loss/crossentropy": 2.213322699069977, "loss/hidden": 2.890625, "loss/jsd": 0.0, "loss/logits": 0.2133590281009674, "step": 5286 }, { "epoch": 0.3305, "grad_norm": 2.3125, "grad_norm_var": 0.009862263997395834, "learning_rate": 0.0001, "loss": 7.3482, "loss/crossentropy": 2.2472715377807617, "loss/hidden": 2.875, "loss/jsd": 0.0, "loss/logits": 0.23156778514385223, "step": 5288 }, { "epoch": 0.330625, "grad_norm": 2.203125, "grad_norm_var": 0.007417805989583333, "learning_rate": 0.0001, "loss": 7.1872, "loss/crossentropy": 2.298330068588257, "loss/hidden": 2.8125, "loss/jsd": 0.0, "loss/logits": 0.20751896500587463, "step": 5290 }, { "epoch": 0.33075, "grad_norm": 2.125, "grad_norm_var": 0.008055623372395833, "learning_rate": 0.0001, "loss": 7.3076, "loss/crossentropy": 2.2136794328689575, "loss/hidden": 2.8125, "loss/jsd": 0.0, "loss/logits": 0.21609289944171906, "step": 5292 }, { "epoch": 0.330875, "grad_norm": 2.1875, "grad_norm_var": 0.007966105143229167, "learning_rate": 0.0001, "loss": 7.3623, "loss/crossentropy": 2.2835363149642944, "loss/hidden": 2.8515625, "loss/jsd": 0.0, "loss/logits": 0.22649526596069336, "step": 5294 }, { "epoch": 0.331, "grad_norm": 2.21875, "grad_norm_var": 0.007743326822916666, "learning_rate": 0.0001, "loss": 7.3962, "loss/crossentropy": 2.3068933486938477, "loss/hidden": 2.8671875, "loss/jsd": 0.0, "loss/logits": 0.21767152845859528, "step": 5296 }, { "epoch": 0.331125, "grad_norm": 2.328125, "grad_norm_var": 0.01090087890625, "learning_rate": 0.0001, "loss": 7.1498, "loss/crossentropy": 2.1658458709716797, "loss/hidden": 2.90625, "loss/jsd": 0.0, "loss/logits": 0.20870249718427658, "step": 5298 }, { "epoch": 0.33125, "grad_norm": 2.125, "grad_norm_var": 0.011454264322916666, "learning_rate": 0.0001, "loss": 7.2139, "loss/crossentropy": 2.022110342979431, "loss/hidden": 2.8125, "loss/jsd": 0.0, "loss/logits": 0.19558995962142944, "step": 5300 }, { "epoch": 0.331375, "grad_norm": 2.453125, "grad_norm_var": 0.014387003580729167, "learning_rate": 0.0001, "loss": 7.3322, "loss/crossentropy": 2.1405563354492188, "loss/hidden": 2.8046875, "loss/jsd": 0.0, "loss/logits": 0.20309502631425858, "step": 5302 }, { "epoch": 0.3315, "grad_norm": 2.1875, "grad_norm_var": 0.015641276041666666, "learning_rate": 0.0001, "loss": 7.3417, "loss/crossentropy": 2.1956971883773804, "loss/hidden": 2.8515625, "loss/jsd": 0.0, "loss/logits": 0.20331468433141708, "step": 5304 }, { "epoch": 0.331625, "grad_norm": 2.234375, "grad_norm_var": 0.0128814697265625, "learning_rate": 0.0001, "loss": 7.3134, "loss/crossentropy": 2.3754764795303345, "loss/hidden": 2.8359375, "loss/jsd": 0.0, "loss/logits": 0.23374854773283005, "step": 5306 }, { "epoch": 0.33175, "grad_norm": 2.359375, "grad_norm_var": 0.016695149739583335, "learning_rate": 0.0001, "loss": 7.2282, "loss/crossentropy": 2.068848133087158, "loss/hidden": 2.8828125, "loss/jsd": 0.0, "loss/logits": 0.19378670305013657, "step": 5308 }, { "epoch": 0.331875, "grad_norm": 2.046875, "grad_norm_var": 0.019205729166666668, "learning_rate": 0.0001, "loss": 7.1813, "loss/crossentropy": 2.3996471166610718, "loss/hidden": 2.953125, "loss/jsd": 0.0, "loss/logits": 0.2248714193701744, "step": 5310 }, { "epoch": 0.332, "grad_norm": 2.140625, "grad_norm_var": 0.020052083333333335, "learning_rate": 0.0001, "loss": 7.3708, "loss/crossentropy": 2.327105164527893, "loss/hidden": 2.84375, "loss/jsd": 0.0, "loss/logits": 0.2335279881954193, "step": 5312 }, { "epoch": 0.332125, "grad_norm": 2.203125, "grad_norm_var": 0.015478515625, "learning_rate": 0.0001, "loss": 7.3023, "loss/crossentropy": 2.254124402999878, "loss/hidden": 2.875, "loss/jsd": 0.0, "loss/logits": 0.21904382854700089, "step": 5314 }, { "epoch": 0.33225, "grad_norm": 2.125, "grad_norm_var": 0.0150054931640625, "learning_rate": 0.0001, "loss": 7.2638, "loss/crossentropy": 2.242727756500244, "loss/hidden": 3.0625, "loss/jsd": 0.0, "loss/logits": 0.23604810237884521, "step": 5316 }, { "epoch": 0.332375, "grad_norm": 2.21875, "grad_norm_var": 0.0114898681640625, "learning_rate": 0.0001, "loss": 7.3476, "loss/crossentropy": 2.1725869178771973, "loss/hidden": 2.890625, "loss/jsd": 0.0, "loss/logits": 0.22862936556339264, "step": 5318 }, { "epoch": 0.3325, "grad_norm": 2.4375, "grad_norm_var": 0.0150390625, "learning_rate": 0.0001, "loss": 7.4291, "loss/crossentropy": 2.289436936378479, "loss/hidden": 3.0390625, "loss/jsd": 0.0, "loss/logits": 0.2235860824584961, "step": 5320 }, { "epoch": 0.332625, "grad_norm": 2.03125, "grad_norm_var": 0.01962890625, "learning_rate": 0.0001, "loss": 7.1351, "loss/crossentropy": 2.306610345840454, "loss/hidden": 2.90625, "loss/jsd": 0.0, "loss/logits": 0.2362884059548378, "step": 5322 }, { "epoch": 0.33275, "grad_norm": 2.15625, "grad_norm_var": 0.015583292643229166, "learning_rate": 0.0001, "loss": 7.3527, "loss/crossentropy": 2.2097585201263428, "loss/hidden": 2.90625, "loss/jsd": 0.0, "loss/logits": 0.24638129770755768, "step": 5324 }, { "epoch": 0.332875, "grad_norm": 2.296875, "grad_norm_var": 0.015462239583333334, "learning_rate": 0.0001, "loss": 7.3412, "loss/crossentropy": 2.1767340898513794, "loss/hidden": 2.875, "loss/jsd": 0.0, "loss/logits": 0.2145010083913803, "step": 5326 }, { "epoch": 0.333, "grad_norm": 2.125, "grad_norm_var": 0.0224029541015625, "learning_rate": 0.0001, "loss": 7.2168, "loss/crossentropy": 2.1483041048049927, "loss/hidden": 2.8671875, "loss/jsd": 0.0, "loss/logits": 0.21507804840803146, "step": 5328 }, { "epoch": 0.333125, "grad_norm": 2.421875, "grad_norm_var": 0.024930826822916665, "learning_rate": 0.0001, "loss": 7.4416, "loss/crossentropy": 2.1952147483825684, "loss/hidden": 2.9921875, "loss/jsd": 0.0, "loss/logits": 0.21811091899871826, "step": 5330 }, { "epoch": 0.33325, "grad_norm": 2.40625, "grad_norm_var": 0.0267730712890625, "learning_rate": 0.0001, "loss": 7.4886, "loss/crossentropy": 2.136277675628662, "loss/hidden": 2.8984375, "loss/jsd": 0.0, "loss/logits": 0.2098674327135086, "step": 5332 }, { "epoch": 0.333375, "grad_norm": 2.109375, "grad_norm_var": 0.027962239583333333, "learning_rate": 0.0001, "loss": 7.187, "loss/crossentropy": 2.1514610052108765, "loss/hidden": 2.8125, "loss/jsd": 0.0, "loss/logits": 0.2044997215270996, "step": 5334 }, { "epoch": 0.3335, "grad_norm": 2.46875, "grad_norm_var": 0.026610310872395834, "learning_rate": 0.0001, "loss": 7.2754, "loss/crossentropy": 2.2177401781082153, "loss/hidden": 2.8515625, "loss/jsd": 0.0, "loss/logits": 0.202798530459404, "step": 5336 }, { "epoch": 0.333625, "grad_norm": 2.09375, "grad_norm_var": 0.021239217122395834, "learning_rate": 0.0001, "loss": 7.1166, "loss/crossentropy": 2.191626250743866, "loss/hidden": 2.8984375, "loss/jsd": 0.0, "loss/logits": 0.21423222124576569, "step": 5338 }, { "epoch": 0.33375, "grad_norm": 2.484375, "grad_norm_var": 0.0244293212890625, "learning_rate": 0.0001, "loss": 7.28, "loss/crossentropy": 2.3615646362304688, "loss/hidden": 2.8671875, "loss/jsd": 0.0, "loss/logits": 0.22018377482891083, "step": 5340 }, { "epoch": 0.333875, "grad_norm": 2.4375, "grad_norm_var": 0.029899088541666667, "learning_rate": 0.0001, "loss": 7.401, "loss/crossentropy": 2.231188416481018, "loss/hidden": 2.875, "loss/jsd": 0.0, "loss/logits": 0.22484582662582397, "step": 5342 }, { "epoch": 0.334, "grad_norm": 2.140625, "grad_norm_var": 0.026090494791666665, "learning_rate": 0.0001, "loss": 7.3716, "loss/crossentropy": 2.2002989053726196, "loss/hidden": 2.828125, "loss/jsd": 0.0, "loss/logits": 0.21874640136957169, "step": 5344 }, { "epoch": 0.334125, "grad_norm": 2.484375, "grad_norm_var": 0.0256744384765625, "learning_rate": 0.0001, "loss": 7.4587, "loss/crossentropy": 2.4450970888137817, "loss/hidden": 2.8671875, "loss/jsd": 0.0, "loss/logits": 0.21748078614473343, "step": 5346 }, { "epoch": 0.33425, "grad_norm": 3.015625, "grad_norm_var": 0.057673136393229164, "learning_rate": 0.0001, "loss": 7.2319, "loss/crossentropy": 2.109615385532379, "loss/hidden": 2.84375, "loss/jsd": 0.0, "loss/logits": 0.19727355241775513, "step": 5348 }, { "epoch": 0.334375, "grad_norm": 2.5625, "grad_norm_var": 0.052783203125, "learning_rate": 0.0001, "loss": 7.6111, "loss/crossentropy": 2.1436294317245483, "loss/hidden": 2.859375, "loss/jsd": 0.0, "loss/logits": 0.22486217319965363, "step": 5350 }, { "epoch": 0.3345, "grad_norm": 2.296875, "grad_norm_var": 0.05257059733072917, "learning_rate": 0.0001, "loss": 7.2229, "loss/crossentropy": 2.407299041748047, "loss/hidden": 2.984375, "loss/jsd": 0.0, "loss/logits": 0.23616841435432434, "step": 5352 }, { "epoch": 0.334625, "grad_norm": 2.328125, "grad_norm_var": 0.04553629557291667, "learning_rate": 0.0001, "loss": 7.2801, "loss/crossentropy": 2.192195415496826, "loss/hidden": 2.828125, "loss/jsd": 0.0, "loss/logits": 0.22064944356679916, "step": 5354 }, { "epoch": 0.33475, "grad_norm": 2.3125, "grad_norm_var": 0.04217122395833333, "learning_rate": 0.0001, "loss": 7.4254, "loss/crossentropy": 2.1264270544052124, "loss/hidden": 2.84375, "loss/jsd": 0.0, "loss/logits": 0.2075883150100708, "step": 5356 }, { "epoch": 0.334875, "grad_norm": 2.171875, "grad_norm_var": 0.0422760009765625, "learning_rate": 0.0001, "loss": 7.2029, "loss/crossentropy": 2.1121798753738403, "loss/hidden": 2.7890625, "loss/jsd": 0.0, "loss/logits": 0.21974780410528183, "step": 5358 }, { "epoch": 0.335, "grad_norm": 2.3125, "grad_norm_var": 0.0387847900390625, "learning_rate": 0.0001, "loss": 7.4508, "loss/crossentropy": 2.2427613735198975, "loss/hidden": 2.859375, "loss/jsd": 0.0, "loss/logits": 0.21257954835891724, "step": 5360 }, { "epoch": 0.335125, "grad_norm": 2.25, "grad_norm_var": 0.04039713541666667, "learning_rate": 0.0001, "loss": 7.4028, "loss/crossentropy": 2.186354875564575, "loss/hidden": 2.96875, "loss/jsd": 0.0, "loss/logits": 0.20210053771734238, "step": 5362 }, { "epoch": 0.33525, "grad_norm": 2.1875, "grad_norm_var": 0.012723795572916667, "learning_rate": 0.0001, "loss": 7.2075, "loss/crossentropy": 2.155713438987732, "loss/hidden": 2.859375, "loss/jsd": 0.0, "loss/logits": 0.203889898955822, "step": 5364 }, { "epoch": 0.335375, "grad_norm": 2.265625, "grad_norm_var": 0.008805338541666667, "learning_rate": 0.0001, "loss": 7.4549, "loss/crossentropy": 2.2281126976013184, "loss/hidden": 2.8515625, "loss/jsd": 0.0, "loss/logits": 0.22180020064115524, "step": 5366 }, { "epoch": 0.3355, "grad_norm": 2.28125, "grad_norm_var": 0.00972900390625, "learning_rate": 0.0001, "loss": 7.4261, "loss/crossentropy": 2.3512368202209473, "loss/hidden": 2.8515625, "loss/jsd": 0.0, "loss/logits": 0.22407396882772446, "step": 5368 }, { "epoch": 0.335625, "grad_norm": 2.1875, "grad_norm_var": 0.01324462890625, "learning_rate": 0.0001, "loss": 7.2495, "loss/crossentropy": 2.2067723274230957, "loss/hidden": 2.828125, "loss/jsd": 0.0, "loss/logits": 0.2186514213681221, "step": 5370 }, { "epoch": 0.33575, "grad_norm": 2.09375, "grad_norm_var": 0.01881103515625, "learning_rate": 0.0001, "loss": 7.1389, "loss/crossentropy": 2.170414924621582, "loss/hidden": 2.875, "loss/jsd": 0.0, "loss/logits": 0.20364930480718613, "step": 5372 }, { "epoch": 0.335875, "grad_norm": 2.1875, "grad_norm_var": 0.0207183837890625, "learning_rate": 0.0001, "loss": 7.1763, "loss/crossentropy": 2.1207195520401, "loss/hidden": 2.890625, "loss/jsd": 0.0, "loss/logits": 0.19605641067028046, "step": 5374 }, { "epoch": 0.336, "grad_norm": 2.703125, "grad_norm_var": 0.03235270182291667, "learning_rate": 0.0001, "loss": 7.358, "loss/crossentropy": 2.5215905904769897, "loss/hidden": 2.859375, "loss/jsd": 0.0, "loss/logits": 0.23153279721736908, "step": 5376 }, { "epoch": 0.336125, "grad_norm": 2.21875, "grad_norm_var": 0.0276519775390625, "learning_rate": 0.0001, "loss": 7.3453, "loss/crossentropy": 2.305363655090332, "loss/hidden": 2.8671875, "loss/jsd": 0.0, "loss/logits": 0.21856709569692612, "step": 5378 }, { "epoch": 0.33625, "grad_norm": 2.25, "grad_norm_var": 0.0273101806640625, "learning_rate": 0.0001, "loss": 7.3475, "loss/crossentropy": 1.9425964951515198, "loss/hidden": 2.7734375, "loss/jsd": 0.0, "loss/logits": 0.20577477663755417, "step": 5380 }, { "epoch": 0.336375, "grad_norm": 2.171875, "grad_norm_var": 0.028123982747395835, "learning_rate": 0.0001, "loss": 7.4189, "loss/crossentropy": 2.2299511432647705, "loss/hidden": 3.015625, "loss/jsd": 0.0, "loss/logits": 0.26048048585653305, "step": 5382 }, { "epoch": 0.3365, "grad_norm": 2.28125, "grad_norm_var": 0.027831013997395834, "learning_rate": 0.0001, "loss": 7.424, "loss/crossentropy": 2.1312129497528076, "loss/hidden": 2.8515625, "loss/jsd": 0.0, "loss/logits": 0.21301250159740448, "step": 5384 }, { "epoch": 0.336625, "grad_norm": 2.21875, "grad_norm_var": 0.023856608072916667, "learning_rate": 0.0001, "loss": 7.2993, "loss/crossentropy": 2.1534899473190308, "loss/hidden": 2.8984375, "loss/jsd": 0.0, "loss/logits": 0.23556435108184814, "step": 5386 }, { "epoch": 0.33675, "grad_norm": 2.296875, "grad_norm_var": 0.019270833333333334, "learning_rate": 0.0001, "loss": 7.3045, "loss/crossentropy": 2.2635024189949036, "loss/hidden": 2.921875, "loss/jsd": 0.0, "loss/logits": 0.2051224410533905, "step": 5388 }, { "epoch": 0.336875, "grad_norm": 2.15625, "grad_norm_var": 0.017317708333333334, "learning_rate": 0.0001, "loss": 7.3403, "loss/crossentropy": 2.6195231676101685, "loss/hidden": 2.9296875, "loss/jsd": 0.0, "loss/logits": 0.24129101634025574, "step": 5390 }, { "epoch": 0.337, "grad_norm": 2.578125, "grad_norm_var": 0.010285441080729167, "learning_rate": 0.0001, "loss": 7.2644, "loss/crossentropy": 2.0847758054733276, "loss/hidden": 2.984375, "loss/jsd": 0.0, "loss/logits": 0.20890209078788757, "step": 5392 }, { "epoch": 0.337125, "grad_norm": 2.15625, "grad_norm_var": 0.017308553059895832, "learning_rate": 0.0001, "loss": 7.4097, "loss/crossentropy": 2.41829776763916, "loss/hidden": 2.9453125, "loss/jsd": 0.0, "loss/logits": 0.23831215500831604, "step": 5394 }, { "epoch": 0.33725, "grad_norm": 2.40625, "grad_norm_var": 0.018285115559895832, "learning_rate": 0.0001, "loss": 7.3525, "loss/crossentropy": 2.169269323348999, "loss/hidden": 2.9453125, "loss/jsd": 0.0, "loss/logits": 0.21398558467626572, "step": 5396 }, { "epoch": 0.337375, "grad_norm": 2.15625, "grad_norm_var": 0.018424479166666667, "learning_rate": 0.0001, "loss": 7.3212, "loss/crossentropy": 2.198991537094116, "loss/hidden": 2.8515625, "loss/jsd": 0.0, "loss/logits": 0.21486544609069824, "step": 5398 }, { "epoch": 0.3375, "grad_norm": 2.15625, "grad_norm_var": 0.019562784830729166, "learning_rate": 0.0001, "loss": 7.1218, "loss/crossentropy": 2.184316039085388, "loss/hidden": 2.8203125, "loss/jsd": 0.0, "loss/logits": 0.20338526368141174, "step": 5400 }, { "epoch": 0.337625, "grad_norm": 2.328125, "grad_norm_var": 0.018290201822916668, "learning_rate": 0.0001, "loss": 7.321, "loss/crossentropy": 2.111461341381073, "loss/hidden": 2.9296875, "loss/jsd": 0.0, "loss/logits": 0.21006064862012863, "step": 5402 }, { "epoch": 0.33775, "grad_norm": 2.203125, "grad_norm_var": 0.019115193684895834, "learning_rate": 0.0001, "loss": 7.3327, "loss/crossentropy": 2.301071524620056, "loss/hidden": 2.8359375, "loss/jsd": 0.0, "loss/logits": 0.21048437803983688, "step": 5404 }, { "epoch": 0.337875, "grad_norm": 2.21875, "grad_norm_var": 0.0183746337890625, "learning_rate": 0.0001, "loss": 7.2345, "loss/crossentropy": 2.2816654443740845, "loss/hidden": 2.875, "loss/jsd": 0.0, "loss/logits": 0.22754280269145966, "step": 5406 }, { "epoch": 0.338, "grad_norm": 2.1875, "grad_norm_var": 0.013505045572916667, "learning_rate": 0.0001, "loss": 7.2538, "loss/crossentropy": 2.3284599781036377, "loss/hidden": 2.8671875, "loss/jsd": 0.0, "loss/logits": 0.23177867382764816, "step": 5408 }, { "epoch": 0.338125, "grad_norm": 2.21875, "grad_norm_var": 0.006266276041666667, "learning_rate": 0.0001, "loss": 7.1344, "loss/crossentropy": 2.009117007255554, "loss/hidden": 2.8359375, "loss/jsd": 0.0, "loss/logits": 0.195498988032341, "step": 5410 }, { "epoch": 0.33825, "grad_norm": 2.234375, "grad_norm_var": 0.005143229166666667, "learning_rate": 0.0001, "loss": 7.3075, "loss/crossentropy": 2.0180088877677917, "loss/hidden": 3.0625, "loss/jsd": 0.0, "loss/logits": 0.22797267884016037, "step": 5412 }, { "epoch": 0.338375, "grad_norm": 2.265625, "grad_norm_var": 0.004736328125, "learning_rate": 0.0001, "loss": 7.3005, "loss/crossentropy": 2.1382123231887817, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.2192755788564682, "step": 5414 }, { "epoch": 0.3385, "grad_norm": 2.203125, "grad_norm_var": 0.004325358072916666, "learning_rate": 0.0001, "loss": 7.3033, "loss/crossentropy": 2.408115029335022, "loss/hidden": 2.9375, "loss/jsd": 0.0, "loss/logits": 0.2371460422873497, "step": 5416 }, { "epoch": 0.338625, "grad_norm": 2.859375, "grad_norm_var": 0.16881103515625, "learning_rate": 0.0001, "loss": 7.3914, "loss/crossentropy": 2.1764304637908936, "loss/hidden": 2.875, "loss/jsd": 0.0, "loss/logits": 0.20545508712530136, "step": 5418 }, { "epoch": 0.33875, "grad_norm": 2.359375, "grad_norm_var": 0.16660054524739584, "learning_rate": 0.0001, "loss": 7.305, "loss/crossentropy": 2.1204302310943604, "loss/hidden": 2.8671875, "loss/jsd": 0.0, "loss/logits": 0.2243422120809555, "step": 5420 }, { "epoch": 0.338875, "grad_norm": 2.484375, "grad_norm_var": 0.16787821451822918, "learning_rate": 0.0001, "loss": 7.395, "loss/crossentropy": 2.1409205198287964, "loss/hidden": 2.8515625, "loss/jsd": 0.0, "loss/logits": 0.20757804811000824, "step": 5422 }, { "epoch": 0.339, "grad_norm": 2.109375, "grad_norm_var": 0.17183837890625, "learning_rate": 0.0001, "loss": 7.239, "loss/crossentropy": 2.3405885696411133, "loss/hidden": 2.890625, "loss/jsd": 0.0, "loss/logits": 0.2265135422348976, "step": 5424 }, { "epoch": 0.339125, "grad_norm": 2.265625, "grad_norm_var": 0.17040608723958334, "learning_rate": 0.0001, "loss": 7.305, "loss/crossentropy": 2.0772798657417297, "loss/hidden": 2.890625, "loss/jsd": 0.0, "loss/logits": 0.24080167710781097, "step": 5426 }, { "epoch": 0.33925, "grad_norm": 2.265625, "grad_norm_var": 0.16744791666666667, "learning_rate": 0.0001, "loss": 7.3626, "loss/crossentropy": 2.3165992498397827, "loss/hidden": 2.859375, "loss/jsd": 0.0, "loss/logits": 0.21086010336875916, "step": 5428 }, { "epoch": 0.339375, "grad_norm": 2.25, "grad_norm_var": 0.16593424479166666, "learning_rate": 0.0001, "loss": 7.2824, "loss/crossentropy": 2.2697300910949707, "loss/hidden": 2.890625, "loss/jsd": 0.0, "loss/logits": 0.216596320271492, "step": 5430 }, { "epoch": 0.3395, "grad_norm": 2.09375, "grad_norm_var": 0.1771636962890625, "learning_rate": 0.0001, "loss": 7.1865, "loss/crossentropy": 2.3810267448425293, "loss/hidden": 2.859375, "loss/jsd": 0.0, "loss/logits": 0.2208058461546898, "step": 5432 }, { "epoch": 0.339625, "grad_norm": 2.125, "grad_norm_var": 0.027860514322916665, "learning_rate": 0.0001, "loss": 7.2512, "loss/crossentropy": 2.255163311958313, "loss/hidden": 2.8125, "loss/jsd": 0.0, "loss/logits": 0.205363892018795, "step": 5434 }, { "epoch": 0.33975, "grad_norm": 2.484375, "grad_norm_var": 0.0278717041015625, "learning_rate": 0.0001, "loss": 7.2184, "loss/crossentropy": 2.0902993083000183, "loss/hidden": 2.8203125, "loss/jsd": 0.0, "loss/logits": 0.2042091339826584, "step": 5436 }, { "epoch": 0.339875, "grad_norm": 2.078125, "grad_norm_var": 0.016109212239583334, "learning_rate": 0.0001, "loss": 7.3953, "loss/crossentropy": 2.379234194755554, "loss/hidden": 2.828125, "loss/jsd": 0.0, "loss/logits": 0.22250881046056747, "step": 5438 }, { "epoch": 0.34, "grad_norm": 2.171875, "grad_norm_var": 0.015120442708333333, "learning_rate": 0.0001, "loss": 7.1556, "loss/crossentropy": 2.1854381561279297, "loss/hidden": 2.859375, "loss/jsd": 0.0, "loss/logits": 0.20145095884799957, "step": 5440 }, { "epoch": 0.340125, "grad_norm": 2.25, "grad_norm_var": 0.015453084309895834, "learning_rate": 0.0001, "loss": 7.3506, "loss/crossentropy": 2.4397945404052734, "loss/hidden": 2.8828125, "loss/jsd": 0.0, "loss/logits": 0.21113866567611694, "step": 5442 }, { "epoch": 0.34025, "grad_norm": 2.0, "grad_norm_var": 0.019580078125, "learning_rate": 0.0001, "loss": 7.2824, "loss/crossentropy": 2.2910887002944946, "loss/hidden": 2.8515625, "loss/jsd": 0.0, "loss/logits": 0.22225283086299896, "step": 5444 }, { "epoch": 0.340375, "grad_norm": 2.1875, "grad_norm_var": 0.022215779622395834, "learning_rate": 0.0001, "loss": 7.4318, "loss/crossentropy": 2.4982157945632935, "loss/hidden": 3.0, "loss/jsd": 0.0, "loss/logits": 0.23762008547782898, "step": 5446 }, { "epoch": 0.3405, "grad_norm": 2.34375, "grad_norm_var": 0.03228759765625, "learning_rate": 0.0001, "loss": 7.5355, "loss/crossentropy": 2.3466309309005737, "loss/hidden": 3.015625, "loss/jsd": 0.0, "loss/logits": 0.2286328449845314, "step": 5448 }, { "epoch": 0.340625, "grad_norm": 2.109375, "grad_norm_var": 0.033426920572916664, "learning_rate": 0.0001, "loss": 7.4912, "loss/crossentropy": 2.5358729362487793, "loss/hidden": 2.9296875, "loss/jsd": 0.0, "loss/logits": 0.24695628136396408, "step": 5450 }, { "epoch": 0.34075, "grad_norm": 2.671875, "grad_norm_var": 0.03862202962239583, "learning_rate": 0.0001, "loss": 7.5367, "loss/crossentropy": 2.3047229051589966, "loss/hidden": 2.90625, "loss/jsd": 0.0, "loss/logits": 0.21712347865104675, "step": 5452 }, { "epoch": 0.340875, "grad_norm": 2.09375, "grad_norm_var": 0.038386027018229164, "learning_rate": 0.0001, "loss": 7.119, "loss/crossentropy": 2.1131972074508667, "loss/hidden": 2.8046875, "loss/jsd": 0.0, "loss/logits": 0.21644818782806396, "step": 5454 }, { "epoch": 0.341, "grad_norm": 2.484375, "grad_norm_var": 0.042780558268229164, "learning_rate": 0.0001, "loss": 7.4856, "loss/crossentropy": 2.1927385330200195, "loss/hidden": 2.890625, "loss/jsd": 0.0, "loss/logits": 0.23934345692396164, "step": 5456 }, { "epoch": 0.341125, "grad_norm": 2.390625, "grad_norm_var": 0.04208577473958333, "learning_rate": 0.0001, "loss": 7.5131, "loss/crossentropy": 2.1948903799057007, "loss/hidden": 2.8828125, "loss/jsd": 0.0, "loss/logits": 0.2262982353568077, "step": 5458 }, { "epoch": 0.34125, "grad_norm": 2.359375, "grad_norm_var": 0.03450419108072917, "learning_rate": 0.0001, "loss": 7.2735, "loss/crossentropy": 2.2074116468429565, "loss/hidden": 2.8515625, "loss/jsd": 0.0, "loss/logits": 0.21651601791381836, "step": 5460 }, { "epoch": 0.341375, "grad_norm": 2.65625, "grad_norm_var": 0.4347320556640625, "learning_rate": 0.0001, "loss": 7.6036, "loss/crossentropy": 2.3377386331558228, "loss/hidden": 3.0546875, "loss/jsd": 0.0, "loss/logits": 0.260085329413414, "step": 5462 }, { "epoch": 0.3415, "grad_norm": 2.21875, "grad_norm_var": 0.43775634765625, "learning_rate": 0.0001, "loss": 7.3691, "loss/crossentropy": 2.5194804668426514, "loss/hidden": 2.859375, "loss/jsd": 0.0, "loss/logits": 0.2196458876132965, "step": 5464 }, { "epoch": 0.341625, "grad_norm": 2.140625, "grad_norm_var": 0.43778889973958335, "learning_rate": 0.0001, "loss": 7.3684, "loss/crossentropy": 2.222210168838501, "loss/hidden": 2.8359375, "loss/jsd": 0.0, "loss/logits": 0.20729803293943405, "step": 5466 }, { "epoch": 0.34175, "grad_norm": 2.1875, "grad_norm_var": 0.44607645670572915, "learning_rate": 0.0001, "loss": 7.5481, "loss/crossentropy": 2.0169124603271484, "loss/hidden": 2.9296875, "loss/jsd": 0.0, "loss/logits": 0.22089700400829315, "step": 5468 }, { "epoch": 0.341875, "grad_norm": 2.34375, "grad_norm_var": 0.4344146728515625, "learning_rate": 0.0001, "loss": 7.3955, "loss/crossentropy": 2.1813031435012817, "loss/hidden": 2.84375, "loss/jsd": 0.0, "loss/logits": 0.2193674072623253, "step": 5470 }, { "epoch": 0.342, "grad_norm": 2.0, "grad_norm_var": 0.45690816243489585, "learning_rate": 0.0001, "loss": 7.2806, "loss/crossentropy": 2.2984254360198975, "loss/hidden": 2.984375, "loss/jsd": 0.0, "loss/logits": 0.22460055351257324, "step": 5472 }, { "epoch": 0.342125, "grad_norm": 2.3125, "grad_norm_var": 0.46097005208333336, "learning_rate": 0.0001, "loss": 7.3134, "loss/crossentropy": 2.1755064725875854, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.21521113812923431, "step": 5474 }, { "epoch": 0.34225, "grad_norm": 2.1875, "grad_norm_var": 0.46314697265625, "learning_rate": 0.0001, "loss": 7.1628, "loss/crossentropy": 2.117392897605896, "loss/hidden": 2.8125, "loss/jsd": 0.0, "loss/logits": 0.19606874883174896, "step": 5476 }, { "epoch": 0.342375, "grad_norm": 2.3125, "grad_norm_var": 0.022516886393229168, "learning_rate": 0.0001, "loss": 7.2474, "loss/crossentropy": 2.127562403678894, "loss/hidden": 2.8515625, "loss/jsd": 0.0, "loss/logits": 0.21586275100708008, "step": 5478 }, { "epoch": 0.3425, "grad_norm": 2.609375, "grad_norm_var": 0.027958170572916666, "learning_rate": 0.0001, "loss": 7.4147, "loss/crossentropy": 2.201394200325012, "loss/hidden": 2.9140625, "loss/jsd": 0.0, "loss/logits": 0.21986886113882065, "step": 5480 }, { "epoch": 0.342625, "grad_norm": 2.03125, "grad_norm_var": 0.03655192057291667, "learning_rate": 0.0001, "loss": 7.2027, "loss/crossentropy": 2.0617064237594604, "loss/hidden": 2.84375, "loss/jsd": 0.0, "loss/logits": 0.22360235452651978, "step": 5482 }, { "epoch": 0.34275, "grad_norm": 2.109375, "grad_norm_var": 0.03845926920572917, "learning_rate": 0.0001, "loss": 7.3388, "loss/crossentropy": 2.266227602958679, "loss/hidden": 2.9296875, "loss/jsd": 0.0, "loss/logits": 0.21536527574062347, "step": 5484 }, { "epoch": 0.342875, "grad_norm": 2.453125, "grad_norm_var": 0.034012858072916666, "learning_rate": 0.0001, "loss": 7.2759, "loss/crossentropy": 2.3090654611587524, "loss/hidden": 2.859375, "loss/jsd": 0.0, "loss/logits": 0.2186703011393547, "step": 5486 }, { "epoch": 0.343, "grad_norm": 2.234375, "grad_norm_var": 0.028180948893229165, "learning_rate": 0.0001, "loss": 7.4189, "loss/crossentropy": 2.4446980953216553, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.24092496931552887, "step": 5488 }, { "epoch": 0.343125, "grad_norm": 2.140625, "grad_norm_var": 0.0316314697265625, "learning_rate": 0.0001, "loss": 7.3887, "loss/crossentropy": 2.301460862159729, "loss/hidden": 2.828125, "loss/jsd": 0.0, "loss/logits": 0.2261280044913292, "step": 5490 }, { "epoch": 0.34325, "grad_norm": 2.40625, "grad_norm_var": 0.031012980143229167, "learning_rate": 0.0001, "loss": 7.2817, "loss/crossentropy": 2.375279426574707, "loss/hidden": 2.8828125, "loss/jsd": 0.0, "loss/logits": 0.22997137159109116, "step": 5492 }, { "epoch": 0.343375, "grad_norm": 2.140625, "grad_norm_var": 0.03375244140625, "learning_rate": 0.0001, "loss": 7.3397, "loss/crossentropy": 2.174167513847351, "loss/hidden": 2.8671875, "loss/jsd": 0.0, "loss/logits": 0.22003310173749924, "step": 5494 }, { "epoch": 0.3435, "grad_norm": 2.296875, "grad_norm_var": 0.02275390625, "learning_rate": 0.0001, "loss": 7.2999, "loss/crossentropy": 2.264827609062195, "loss/hidden": 2.8671875, "loss/jsd": 0.0, "loss/logits": 0.22024701535701752, "step": 5496 }, { "epoch": 0.343625, "grad_norm": 2.078125, "grad_norm_var": 0.0152740478515625, "learning_rate": 0.0001, "loss": 7.2387, "loss/crossentropy": 2.186145544052124, "loss/hidden": 2.8515625, "loss/jsd": 0.0, "loss/logits": 0.21280007809400558, "step": 5498 }, { "epoch": 0.34375, "grad_norm": 2.5625, "grad_norm_var": 0.01982421875, "learning_rate": 0.0001, "loss": 7.3772, "loss/crossentropy": 2.481659770011902, "loss/hidden": 2.8671875, "loss/jsd": 0.0, "loss/logits": 0.2214002087712288, "step": 5500 }, { "epoch": 0.343875, "grad_norm": 2.125, "grad_norm_var": 0.019367472330729166, "learning_rate": 0.0001, "loss": 7.3132, "loss/crossentropy": 2.3691645860671997, "loss/hidden": 2.984375, "loss/jsd": 0.0, "loss/logits": 0.23550476133823395, "step": 5502 }, { "epoch": 0.344, "grad_norm": 2.234375, "grad_norm_var": 0.01949462890625, "learning_rate": 0.0001, "loss": 7.2032, "loss/crossentropy": 2.260165572166443, "loss/hidden": 2.8671875, "loss/jsd": 0.0, "loss/logits": 0.23349297046661377, "step": 5504 }, { "epoch": 0.344125, "grad_norm": 2.109375, "grad_norm_var": 0.01617431640625, "learning_rate": 0.0001, "loss": 7.3419, "loss/crossentropy": 2.2282174825668335, "loss/hidden": 2.890625, "loss/jsd": 0.0, "loss/logits": 0.22737015783786774, "step": 5506 }, { "epoch": 0.34425, "grad_norm": 2.578125, "grad_norm_var": 0.021663411458333334, "learning_rate": 0.0001, "loss": 7.2331, "loss/crossentropy": 2.3071209192276, "loss/hidden": 2.9921875, "loss/jsd": 0.0, "loss/logits": 0.23168734461069107, "step": 5508 }, { "epoch": 0.344375, "grad_norm": 2.296875, "grad_norm_var": 0.032013956705729166, "learning_rate": 0.0001, "loss": 7.5768, "loss/crossentropy": 2.337808847427368, "loss/hidden": 2.9453125, "loss/jsd": 0.0, "loss/logits": 0.22684383392333984, "step": 5510 }, { "epoch": 0.3445, "grad_norm": 2.125, "grad_norm_var": 0.03868815104166667, "learning_rate": 0.0001, "loss": 7.1271, "loss/crossentropy": 2.21714323759079, "loss/hidden": 2.8515625, "loss/jsd": 0.0, "loss/logits": 0.2065524309873581, "step": 5512 }, { "epoch": 0.344625, "grad_norm": 2.46875, "grad_norm_var": 0.03770243326822917, "learning_rate": 0.0001, "loss": 7.5043, "loss/crossentropy": 2.3815321922302246, "loss/hidden": 2.875, "loss/jsd": 0.0, "loss/logits": 0.22905410081148148, "step": 5514 }, { "epoch": 0.34475, "grad_norm": 2.21875, "grad_norm_var": 0.03234049479166667, "learning_rate": 0.0001, "loss": 7.2816, "loss/crossentropy": 2.4301551580429077, "loss/hidden": 2.9375, "loss/jsd": 0.0, "loss/logits": 0.2220115140080452, "step": 5516 }, { "epoch": 0.344875, "grad_norm": 2.28125, "grad_norm_var": 0.03194986979166667, "learning_rate": 0.0001, "loss": 7.2628, "loss/crossentropy": 2.404889702796936, "loss/hidden": 2.84375, "loss/jsd": 0.0, "loss/logits": 0.21883364766836166, "step": 5518 }, { "epoch": 0.345, "grad_norm": 2.21875, "grad_norm_var": 0.03205464680989583, "learning_rate": 0.0001, "loss": 7.3638, "loss/crossentropy": 2.228495955467224, "loss/hidden": 2.8125, "loss/jsd": 0.0, "loss/logits": 0.2163659930229187, "step": 5520 }, { "epoch": 0.345125, "grad_norm": 2.234375, "grad_norm_var": 0.029816691080729166, "learning_rate": 0.0001, "loss": 7.1626, "loss/crossentropy": 2.102945566177368, "loss/hidden": 2.8515625, "loss/jsd": 0.0, "loss/logits": 0.20724687725305557, "step": 5522 }, { "epoch": 0.34525, "grad_norm": 2.140625, "grad_norm_var": 0.025031534830729167, "learning_rate": 0.0001, "loss": 7.3196, "loss/crossentropy": 2.291762113571167, "loss/hidden": 2.9140625, "loss/jsd": 0.0, "loss/logits": 0.2216021940112114, "step": 5524 }, { "epoch": 0.345375, "grad_norm": 2.140625, "grad_norm_var": 0.014644368489583334, "learning_rate": 0.0001, "loss": 7.2901, "loss/crossentropy": 2.3197373151779175, "loss/hidden": 3.0, "loss/jsd": 0.0, "loss/logits": 0.2217041775584221, "step": 5526 }, { "epoch": 0.3455, "grad_norm": 2.171875, "grad_norm_var": 0.009891764322916666, "learning_rate": 0.0001, "loss": 7.1251, "loss/crossentropy": 2.16168212890625, "loss/hidden": 2.9921875, "loss/jsd": 0.0, "loss/logits": 0.2141154482960701, "step": 5528 }, { "epoch": 0.345625, "grad_norm": 2.359375, "grad_norm_var": 0.01103515625, "learning_rate": 0.0001, "loss": 7.2774, "loss/crossentropy": 2.3330233097076416, "loss/hidden": 2.9453125, "loss/jsd": 0.0, "loss/logits": 0.2168646603822708, "step": 5530 }, { "epoch": 0.34575, "grad_norm": 2.703125, "grad_norm_var": 0.025951131184895834, "learning_rate": 0.0001, "loss": 7.4739, "loss/crossentropy": 2.0444337129592896, "loss/hidden": 2.890625, "loss/jsd": 0.0, "loss/logits": 0.2103881537914276, "step": 5532 }, { "epoch": 0.345875, "grad_norm": 2.359375, "grad_norm_var": 0.025446573893229168, "learning_rate": 0.0001, "loss": 7.4652, "loss/crossentropy": 2.315095543861389, "loss/hidden": 2.9453125, "loss/jsd": 0.0, "loss/logits": 0.2205868437886238, "step": 5534 }, { "epoch": 0.346, "grad_norm": 2.3125, "grad_norm_var": 0.024323527018229166, "learning_rate": 0.0001, "loss": 7.2638, "loss/crossentropy": 2.2340970039367676, "loss/hidden": 2.8828125, "loss/jsd": 0.0, "loss/logits": 0.21409018337726593, "step": 5536 }, { "epoch": 0.346125, "grad_norm": 2.078125, "grad_norm_var": 0.027741495768229166, "learning_rate": 0.0001, "loss": 7.2674, "loss/crossentropy": 2.30459725856781, "loss/hidden": 2.8671875, "loss/jsd": 0.0, "loss/logits": 0.22205013036727905, "step": 5538 }, { "epoch": 0.34625, "grad_norm": 2.203125, "grad_norm_var": 0.030475870768229166, "learning_rate": 0.0001, "loss": 7.1644, "loss/crossentropy": 2.048761546611786, "loss/hidden": 2.765625, "loss/jsd": 0.0, "loss/logits": 0.19404233247041702, "step": 5540 }, { "epoch": 0.346375, "grad_norm": 2.28125, "grad_norm_var": 0.028515625, "learning_rate": 0.0001, "loss": 7.2601, "loss/crossentropy": 2.1471662521362305, "loss/hidden": 2.8671875, "loss/jsd": 0.0, "loss/logits": 0.2048891857266426, "step": 5542 }, { "epoch": 0.3465, "grad_norm": 2.328125, "grad_norm_var": 0.027274576822916667, "learning_rate": 0.0001, "loss": 7.0981, "loss/crossentropy": 2.147502064704895, "loss/hidden": 2.8828125, "loss/jsd": 0.0, "loss/logits": 0.21818849444389343, "step": 5544 }, { "epoch": 0.346625, "grad_norm": 2.765625, "grad_norm_var": 0.0386627197265625, "learning_rate": 0.0001, "loss": 7.2796, "loss/crossentropy": 2.213876247406006, "loss/hidden": 2.96875, "loss/jsd": 0.0, "loss/logits": 0.22943131625652313, "step": 5546 }, { "epoch": 0.34675, "grad_norm": 2.5, "grad_norm_var": 0.028055826822916668, "learning_rate": 0.0001, "loss": 7.3104, "loss/crossentropy": 2.327598452568054, "loss/hidden": 2.8515625, "loss/jsd": 0.0, "loss/logits": 0.22541771829128265, "step": 5548 }, { "epoch": 0.346875, "grad_norm": 2.171875, "grad_norm_var": 0.031083170572916666, "learning_rate": 0.0001, "loss": 7.2545, "loss/crossentropy": 2.065271317958832, "loss/hidden": 2.8828125, "loss/jsd": 0.0, "loss/logits": 0.2102024182677269, "step": 5550 }, { "epoch": 0.347, "grad_norm": 2.15625, "grad_norm_var": 0.031769816080729166, "learning_rate": 0.0001, "loss": 7.2822, "loss/crossentropy": 2.241680383682251, "loss/hidden": 2.8984375, "loss/jsd": 0.0, "loss/logits": 0.24006665498018265, "step": 5552 }, { "epoch": 0.347125, "grad_norm": 2.25, "grad_norm_var": 0.029816691080729166, "learning_rate": 0.0001, "loss": 7.2535, "loss/crossentropy": 1.9361643195152283, "loss/hidden": 2.875, "loss/jsd": 0.0, "loss/logits": 0.193950355052948, "step": 5554 }, { "epoch": 0.34725, "grad_norm": 2.515625, "grad_norm_var": 0.029618326822916666, "learning_rate": 0.0001, "loss": 7.2865, "loss/crossentropy": 2.2774378061294556, "loss/hidden": 2.890625, "loss/jsd": 0.0, "loss/logits": 0.2135343924164772, "step": 5556 }, { "epoch": 0.347375, "grad_norm": 2.1875, "grad_norm_var": 0.04593098958333333, "learning_rate": 0.0001, "loss": 7.2553, "loss/crossentropy": 2.092591166496277, "loss/hidden": 2.890625, "loss/jsd": 0.0, "loss/logits": 0.21611423045396805, "step": 5558 }, { "epoch": 0.3475, "grad_norm": 2.390625, "grad_norm_var": 0.04534098307291667, "learning_rate": 0.0001, "loss": 7.4553, "loss/crossentropy": 2.4065760374069214, "loss/hidden": 2.8671875, "loss/jsd": 0.0, "loss/logits": 0.21628361195325851, "step": 5560 }, { "epoch": 0.347625, "grad_norm": 2.09375, "grad_norm_var": 0.03795166015625, "learning_rate": 0.0001, "loss": 7.3378, "loss/crossentropy": 2.1736042499542236, "loss/hidden": 2.8828125, "loss/jsd": 0.0, "loss/logits": 0.21986880898475647, "step": 5562 }, { "epoch": 0.34775, "grad_norm": 2.109375, "grad_norm_var": 0.03855692545572917, "learning_rate": 0.0001, "loss": 7.1797, "loss/crossentropy": 2.377760887145996, "loss/hidden": 2.9375, "loss/jsd": 0.0, "loss/logits": 0.20999367535114288, "step": 5564 }, { "epoch": 0.347875, "grad_norm": 2.4375, "grad_norm_var": 0.03753255208333333, "learning_rate": 0.0001, "loss": 7.2805, "loss/crossentropy": 2.4437506198883057, "loss/hidden": 2.875, "loss/jsd": 0.0, "loss/logits": 0.2216576337814331, "step": 5566 }, { "epoch": 0.348, "grad_norm": 2.03125, "grad_norm_var": 0.04220377604166667, "learning_rate": 0.0001, "loss": 7.1347, "loss/crossentropy": 2.3025336265563965, "loss/hidden": 2.8359375, "loss/jsd": 0.0, "loss/logits": 0.21155836433172226, "step": 5568 }, { "epoch": 0.348125, "grad_norm": 2.265625, "grad_norm_var": 0.042902628580729164, "learning_rate": 0.0001, "loss": 7.2856, "loss/crossentropy": 2.147992491722107, "loss/hidden": 2.8359375, "loss/jsd": 0.0, "loss/logits": 0.20805560052394867, "step": 5570 }, { "epoch": 0.34825, "grad_norm": 2.125, "grad_norm_var": 0.037495930989583336, "learning_rate": 0.0001, "loss": 7.2576, "loss/crossentropy": 2.0868008732795715, "loss/hidden": 2.8828125, "loss/jsd": 0.0, "loss/logits": 0.2108987644314766, "step": 5572 }, { "epoch": 0.348375, "grad_norm": 2.21875, "grad_norm_var": 0.014159138997395833, "learning_rate": 0.0001, "loss": 7.4734, "loss/crossentropy": 2.1314677000045776, "loss/hidden": 2.8359375, "loss/jsd": 0.0, "loss/logits": 0.22252144664525986, "step": 5574 }, { "epoch": 0.3485, "grad_norm": 2.296875, "grad_norm_var": 0.013472493489583333, "learning_rate": 0.0001, "loss": 7.3078, "loss/crossentropy": 2.5350881814956665, "loss/hidden": 2.8828125, "loss/jsd": 0.0, "loss/logits": 0.22396673262119293, "step": 5576 }, { "epoch": 0.348625, "grad_norm": 2.21875, "grad_norm_var": 0.016917928059895834, "learning_rate": 0.0001, "loss": 7.298, "loss/crossentropy": 2.3182544708251953, "loss/hidden": 2.8359375, "loss/jsd": 0.0, "loss/logits": 0.2111162766814232, "step": 5578 }, { "epoch": 0.34875, "grad_norm": 2.34375, "grad_norm_var": 0.015876261393229167, "learning_rate": 0.0001, "loss": 7.4597, "loss/crossentropy": 2.1177011728286743, "loss/hidden": 2.8359375, "loss/jsd": 0.0, "loss/logits": 0.2125827968120575, "step": 5580 }, { "epoch": 0.348875, "grad_norm": 2.125, "grad_norm_var": 0.013231404622395833, "learning_rate": 0.0001, "loss": 7.2578, "loss/crossentropy": 2.2595534324645996, "loss/hidden": 2.8671875, "loss/jsd": 0.0, "loss/logits": 0.20692522078752518, "step": 5582 }, { "epoch": 0.349, "grad_norm": 2.265625, "grad_norm_var": 0.009651692708333333, "learning_rate": 0.0001, "loss": 7.4041, "loss/crossentropy": 2.2322030067443848, "loss/hidden": 2.8359375, "loss/jsd": 0.0, "loss/logits": 0.2103104293346405, "step": 5584 }, { "epoch": 0.349125, "grad_norm": 2.25, "grad_norm_var": 0.009598795572916667, "learning_rate": 0.0001, "loss": 7.3692, "loss/crossentropy": 2.342887282371521, "loss/hidden": 2.921875, "loss/jsd": 0.0, "loss/logits": 0.2122577279806137, "step": 5586 }, { "epoch": 0.34925, "grad_norm": 2.265625, "grad_norm_var": 0.0108062744140625, "learning_rate": 0.0001, "loss": 7.2918, "loss/crossentropy": 2.1660179495811462, "loss/hidden": 2.84375, "loss/jsd": 0.0, "loss/logits": 0.21258103847503662, "step": 5588 }, { "epoch": 0.349375, "grad_norm": 2.015625, "grad_norm_var": 0.013411458333333333, "learning_rate": 0.0001, "loss": 7.2614, "loss/crossentropy": 2.191131591796875, "loss/hidden": 2.8828125, "loss/jsd": 0.0, "loss/logits": 0.20719382166862488, "step": 5590 }, { "epoch": 0.3495, "grad_norm": 2.296875, "grad_norm_var": 0.013133748372395834, "learning_rate": 0.0001, "loss": 7.2375, "loss/crossentropy": 2.0313998460769653, "loss/hidden": 2.8515625, "loss/jsd": 0.0, "loss/logits": 0.2047080472111702, "step": 5592 }, { "epoch": 0.349625, "grad_norm": 2.671875, "grad_norm_var": 0.025048828125, "learning_rate": 0.0001, "loss": 7.3307, "loss/crossentropy": 2.2927592992782593, "loss/hidden": 2.890625, "loss/jsd": 0.0, "loss/logits": 0.2102295085787773, "step": 5594 }, { "epoch": 0.34975, "grad_norm": 2.171875, "grad_norm_var": 0.02720947265625, "learning_rate": 0.0001, "loss": 7.2445, "loss/crossentropy": 2.255831480026245, "loss/hidden": 2.8984375, "loss/jsd": 0.0, "loss/logits": 0.22202551364898682, "step": 5596 }, { "epoch": 0.349875, "grad_norm": 2.015625, "grad_norm_var": 0.0329010009765625, "learning_rate": 0.0001, "loss": 7.1646, "loss/crossentropy": 2.3973917961120605, "loss/hidden": 2.875, "loss/jsd": 0.0, "loss/logits": 0.23418821394443512, "step": 5598 }, { "epoch": 0.35, "grad_norm": 2.21875, "grad_norm_var": 0.032746378580729166, "learning_rate": 0.0001, "loss": 7.2829, "loss/crossentropy": 2.029415249824524, "loss/hidden": 2.7890625, "loss/jsd": 0.0, "loss/logits": 0.209575816988945, "step": 5600 }, { "epoch": 0.350125, "grad_norm": 2.390625, "grad_norm_var": 0.03385009765625, "learning_rate": 0.0001, "loss": 7.1971, "loss/crossentropy": 2.4901596307754517, "loss/hidden": 2.890625, "loss/jsd": 0.0, "loss/logits": 0.23769868910312653, "step": 5602 }, { "epoch": 0.35025, "grad_norm": 1.984375, "grad_norm_var": 0.03967692057291667, "learning_rate": 0.0001, "loss": 7.1465, "loss/crossentropy": 2.428591251373291, "loss/hidden": 2.921875, "loss/jsd": 0.0, "loss/logits": 0.22477445006370544, "step": 5604 }, { "epoch": 0.350375, "grad_norm": 2.265625, "grad_norm_var": 0.035888671875, "learning_rate": 0.0001, "loss": 7.1955, "loss/crossentropy": 2.120119094848633, "loss/hidden": 2.8359375, "loss/jsd": 0.0, "loss/logits": 0.2116844207048416, "step": 5606 }, { "epoch": 0.3505, "grad_norm": 2.25, "grad_norm_var": 0.03421223958333333, "learning_rate": 0.0001, "loss": 7.3445, "loss/crossentropy": 2.4258477687835693, "loss/hidden": 2.859375, "loss/jsd": 0.0, "loss/logits": 0.2168346494436264, "step": 5608 }, { "epoch": 0.350625, "grad_norm": 2.25, "grad_norm_var": 0.020243326822916668, "learning_rate": 0.0001, "loss": 7.2439, "loss/crossentropy": 2.255433678627014, "loss/hidden": 2.875, "loss/jsd": 0.0, "loss/logits": 0.21720143407583237, "step": 5610 }, { "epoch": 0.35075, "grad_norm": 2.109375, "grad_norm_var": 0.019481404622395834, "learning_rate": 0.0001, "loss": 7.3014, "loss/crossentropy": 2.3039766550064087, "loss/hidden": 2.8046875, "loss/jsd": 0.0, "loss/logits": 0.20905648171901703, "step": 5612 }, { "epoch": 0.350875, "grad_norm": 2.203125, "grad_norm_var": 0.014872233072916666, "learning_rate": 0.0001, "loss": 7.2678, "loss/crossentropy": 2.2413315773010254, "loss/hidden": 2.921875, "loss/jsd": 0.0, "loss/logits": 0.21247967332601547, "step": 5614 }, { "epoch": 0.351, "grad_norm": 2.234375, "grad_norm_var": 0.0152008056640625, "learning_rate": 0.0001, "loss": 7.4491, "loss/crossentropy": 2.434108018875122, "loss/hidden": 2.828125, "loss/jsd": 0.0, "loss/logits": 0.23025284707546234, "step": 5616 }, { "epoch": 0.351125, "grad_norm": 2.265625, "grad_norm_var": 0.013667805989583334, "learning_rate": 0.0001, "loss": 7.3377, "loss/crossentropy": 2.3039416074752808, "loss/hidden": 3.0546875, "loss/jsd": 0.0, "loss/logits": 0.23464076220989227, "step": 5618 }, { "epoch": 0.35125, "grad_norm": 2.28125, "grad_norm_var": 0.006371053059895834, "learning_rate": 0.0001, "loss": 7.1852, "loss/crossentropy": 2.198891282081604, "loss/hidden": 2.8203125, "loss/jsd": 0.0, "loss/logits": 0.20828387141227722, "step": 5620 }, { "epoch": 0.351375, "grad_norm": 2.140625, "grad_norm_var": 0.00660400390625, "learning_rate": 0.0001, "loss": 7.3417, "loss/crossentropy": 1.9807387590408325, "loss/hidden": 2.8671875, "loss/jsd": 0.0, "loss/logits": 0.18464645743370056, "step": 5622 }, { "epoch": 0.3515, "grad_norm": 2.328125, "grad_norm_var": 0.007331339518229166, "learning_rate": 0.0001, "loss": 7.3484, "loss/crossentropy": 2.1647424697875977, "loss/hidden": 2.8203125, "loss/jsd": 0.0, "loss/logits": 0.20414948463439941, "step": 5624 }, { "epoch": 0.351625, "grad_norm": 2.140625, "grad_norm_var": 0.0063385009765625, "learning_rate": 0.0001, "loss": 7.3053, "loss/crossentropy": 2.1328948736190796, "loss/hidden": 2.8515625, "loss/jsd": 0.0, "loss/logits": 0.22070185095071793, "step": 5626 }, { "epoch": 0.35175, "grad_norm": 2.046875, "grad_norm_var": 0.0060699462890625, "learning_rate": 0.0001, "loss": 7.1396, "loss/crossentropy": 2.041126549243927, "loss/hidden": 2.8359375, "loss/jsd": 0.0, "loss/logits": 0.20997025072574615, "step": 5628 }, { "epoch": 0.351875, "grad_norm": 2.171875, "grad_norm_var": 0.006126912434895834, "learning_rate": 0.0001, "loss": 7.1444, "loss/crossentropy": 2.0724629759788513, "loss/hidden": 2.84375, "loss/jsd": 0.0, "loss/logits": 0.20483851432800293, "step": 5630 }, { "epoch": 0.352, "grad_norm": 2.390625, "grad_norm_var": 0.007958984375, "learning_rate": 0.0001, "loss": 7.2539, "loss/crossentropy": 2.167839527130127, "loss/hidden": 2.890625, "loss/jsd": 0.0, "loss/logits": 0.21613173931837082, "step": 5632 }, { "epoch": 0.352125, "grad_norm": 2.078125, "grad_norm_var": 0.013037109375, "learning_rate": 0.0001, "loss": 7.1433, "loss/crossentropy": 2.105876088142395, "loss/hidden": 2.828125, "loss/jsd": 0.0, "loss/logits": 0.20869092643260956, "step": 5634 }, { "epoch": 0.35225, "grad_norm": 2.234375, "grad_norm_var": 0.012970987955729167, "learning_rate": 0.0001, "loss": 7.3603, "loss/crossentropy": 2.1784520745277405, "loss/hidden": 2.859375, "loss/jsd": 0.0, "loss/logits": 0.20698366314172745, "step": 5636 }, { "epoch": 0.352375, "grad_norm": 2.34375, "grad_norm_var": 0.013947550455729167, "learning_rate": 0.0001, "loss": 7.3438, "loss/crossentropy": 2.215299963951111, "loss/hidden": 2.8203125, "loss/jsd": 0.0, "loss/logits": 0.21201211214065552, "step": 5638 }, { "epoch": 0.3525, "grad_norm": 2.125, "grad_norm_var": 0.013939412434895833, "learning_rate": 0.0001, "loss": 7.2268, "loss/crossentropy": 2.0955730676651, "loss/hidden": 2.8359375, "loss/jsd": 0.0, "loss/logits": 0.20698635280132294, "step": 5640 }, { "epoch": 0.352625, "grad_norm": 2.4375, "grad_norm_var": 0.01763916015625, "learning_rate": 0.0001, "loss": 7.3076, "loss/crossentropy": 2.3292022943496704, "loss/hidden": 2.9140625, "loss/jsd": 0.0, "loss/logits": 0.24243366718292236, "step": 5642 }, { "epoch": 0.35275, "grad_norm": 2.234375, "grad_norm_var": 0.015672810872395835, "learning_rate": 0.0001, "loss": 7.229, "loss/crossentropy": 2.081650137901306, "loss/hidden": 2.765625, "loss/jsd": 0.0, "loss/logits": 0.2038363665342331, "step": 5644 }, { "epoch": 0.352875, "grad_norm": 2.09375, "grad_norm_var": 0.016337076822916668, "learning_rate": 0.0001, "loss": 7.1489, "loss/crossentropy": 2.1960020065307617, "loss/hidden": 2.9609375, "loss/jsd": 0.0, "loss/logits": 0.2301100641489029, "step": 5646 }, { "epoch": 0.353, "grad_norm": 2.234375, "grad_norm_var": 0.014583333333333334, "learning_rate": 0.0001, "loss": 7.247, "loss/crossentropy": 2.379251480102539, "loss/hidden": 2.8203125, "loss/jsd": 0.0, "loss/logits": 0.219240702688694, "step": 5648 }, { "epoch": 0.353125, "grad_norm": 2.234375, "grad_norm_var": 0.009273274739583334, "learning_rate": 0.0001, "loss": 7.3936, "loss/crossentropy": 2.380456566810608, "loss/hidden": 2.828125, "loss/jsd": 0.0, "loss/logits": 0.22222916781902313, "step": 5650 }, { "epoch": 0.35325, "grad_norm": 2.125, "grad_norm_var": 0.009650675455729167, "learning_rate": 0.0001, "loss": 7.2234, "loss/crossentropy": 2.2347010374069214, "loss/hidden": 2.9375, "loss/jsd": 0.0, "loss/logits": 0.22965724766254425, "step": 5652 }, { "epoch": 0.353375, "grad_norm": 2.1875, "grad_norm_var": 0.008226521809895833, "learning_rate": 0.0001, "loss": 7.4284, "loss/crossentropy": 2.169225573539734, "loss/hidden": 2.84375, "loss/jsd": 0.0, "loss/logits": 0.20262756943702698, "step": 5654 }, { "epoch": 0.3535, "grad_norm": 2.390625, "grad_norm_var": 0.010026041666666667, "learning_rate": 0.0001, "loss": 7.226, "loss/crossentropy": 2.189695119857788, "loss/hidden": 2.84375, "loss/jsd": 0.0, "loss/logits": 0.22685642540454865, "step": 5656 }, { "epoch": 0.353625, "grad_norm": 2.171875, "grad_norm_var": 0.008617146809895834, "learning_rate": 0.0001, "loss": 7.2095, "loss/crossentropy": 2.382868766784668, "loss/hidden": 2.875, "loss/jsd": 0.0, "loss/logits": 0.2241538092494011, "step": 5658 }, { "epoch": 0.35375, "grad_norm": 2.328125, "grad_norm_var": 0.011812337239583333, "learning_rate": 0.0001, "loss": 7.251, "loss/crossentropy": 2.085647702217102, "loss/hidden": 2.8359375, "loss/jsd": 0.0, "loss/logits": 0.22093788534402847, "step": 5660 }, { "epoch": 0.353875, "grad_norm": 2.234375, "grad_norm_var": 0.0098785400390625, "learning_rate": 0.0001, "loss": 7.2481, "loss/crossentropy": 1.988870620727539, "loss/hidden": 2.8203125, "loss/jsd": 0.0, "loss/logits": 0.20682383328676224, "step": 5662 }, { "epoch": 0.354, "grad_norm": 2.25, "grad_norm_var": 0.011031087239583333, "learning_rate": 0.0001, "loss": 7.3857, "loss/crossentropy": 2.2957637310028076, "loss/hidden": 2.8359375, "loss/jsd": 0.0, "loss/logits": 0.22826778888702393, "step": 5664 }, { "epoch": 0.354125, "grad_norm": 2.21875, "grad_norm_var": 0.010326131184895834, "learning_rate": 0.0001, "loss": 7.3686, "loss/crossentropy": 2.2440401315689087, "loss/hidden": 2.8515625, "loss/jsd": 0.0, "loss/logits": 0.21839071810245514, "step": 5666 }, { "epoch": 0.35425, "grad_norm": 2.46875, "grad_norm_var": 0.009618123372395834, "learning_rate": 0.0001, "loss": 7.1635, "loss/crossentropy": 2.2398467659950256, "loss/hidden": 2.8046875, "loss/jsd": 0.0, "loss/logits": 0.2106655314564705, "step": 5668 }, { "epoch": 0.354375, "grad_norm": 2.15625, "grad_norm_var": 0.009993489583333333, "learning_rate": 0.0001, "loss": 7.1971, "loss/crossentropy": 2.2117063999176025, "loss/hidden": 2.921875, "loss/jsd": 0.0, "loss/logits": 0.22577807307243347, "step": 5670 }, { "epoch": 0.3545, "grad_norm": 2.0625, "grad_norm_var": 0.012824503580729167, "learning_rate": 0.0001, "loss": 7.2778, "loss/crossentropy": 2.489393711090088, "loss/hidden": 2.8828125, "loss/jsd": 0.0, "loss/logits": 0.22771210968494415, "step": 5672 }, { "epoch": 0.354625, "grad_norm": 2.140625, "grad_norm_var": 0.0126861572265625, "learning_rate": 0.0001, "loss": 7.2026, "loss/crossentropy": 2.0024437308311462, "loss/hidden": 2.8828125, "loss/jsd": 0.0, "loss/logits": 0.20530618727207184, "step": 5674 }, { "epoch": 0.35475, "grad_norm": 2.421875, "grad_norm_var": 0.011668904622395834, "learning_rate": 0.0001, "loss": 7.2365, "loss/crossentropy": 2.390307068824768, "loss/hidden": 2.828125, "loss/jsd": 0.0, "loss/logits": 0.21883082389831543, "step": 5676 }, { "epoch": 0.354875, "grad_norm": 2.28125, "grad_norm_var": 0.011653645833333334, "learning_rate": 0.0001, "loss": 7.3552, "loss/crossentropy": 2.237827181816101, "loss/hidden": 2.859375, "loss/jsd": 0.0, "loss/logits": 0.21198499202728271, "step": 5678 }, { "epoch": 0.355, "grad_norm": 2.0, "grad_norm_var": 0.0149322509765625, "learning_rate": 0.0001, "loss": 7.1375, "loss/crossentropy": 2.071999192237854, "loss/hidden": 2.8671875, "loss/jsd": 0.0, "loss/logits": 0.20247787237167358, "step": 5680 }, { "epoch": 0.355125, "grad_norm": 2.078125, "grad_norm_var": 0.02051366170247396, "learning_rate": 0.0001, "loss": 7.2086, "loss/crossentropy": 2.006256639957428, "loss/hidden": 2.7734375, "loss/jsd": 0.0, "loss/logits": 0.19993876665830612, "step": 5682 }, { "epoch": 0.35525, "grad_norm": 2.09375, "grad_norm_var": 0.016534169514973957, "learning_rate": 0.0001, "loss": 7.2393, "loss/crossentropy": 2.4324631690979004, "loss/hidden": 2.9609375, "loss/jsd": 0.0, "loss/logits": 0.2262727990746498, "step": 5684 }, { "epoch": 0.355375, "grad_norm": 2.234375, "grad_norm_var": 0.015958404541015624, "learning_rate": 0.0001, "loss": 7.1675, "loss/crossentropy": 2.2547385692596436, "loss/hidden": 2.84375, "loss/jsd": 0.0, "loss/logits": 0.21899420768022537, "step": 5686 }, { "epoch": 0.3555, "grad_norm": 2.296875, "grad_norm_var": 0.016318511962890626, "learning_rate": 0.0001, "loss": 7.1257, "loss/crossentropy": 2.3510611057281494, "loss/hidden": 2.8828125, "loss/jsd": 0.0, "loss/logits": 0.21146484464406967, "step": 5688 }, { "epoch": 0.355625, "grad_norm": 2.140625, "grad_norm_var": 0.016094716389973958, "learning_rate": 0.0001, "loss": 7.2481, "loss/crossentropy": 2.157021403312683, "loss/hidden": 2.8125, "loss/jsd": 0.0, "loss/logits": 0.2122403085231781, "step": 5690 }, { "epoch": 0.35575, "grad_norm": 2.1875, "grad_norm_var": 0.012463124593098958, "learning_rate": 0.0001, "loss": 7.3541, "loss/crossentropy": 2.340176820755005, "loss/hidden": 2.953125, "loss/jsd": 0.0, "loss/logits": 0.21765124797821045, "step": 5692 }, { "epoch": 0.355875, "grad_norm": 2.359375, "grad_norm_var": 0.013303375244140625, "learning_rate": 0.0001, "loss": 7.4739, "loss/crossentropy": 2.550337553024292, "loss/hidden": 2.8671875, "loss/jsd": 0.0, "loss/logits": 0.2295769453048706, "step": 5694 }, { "epoch": 0.356, "grad_norm": 2.078125, "grad_norm_var": 0.012410227457682292, "learning_rate": 0.0001, "loss": 7.3001, "loss/crossentropy": 2.221598982810974, "loss/hidden": 2.8125, "loss/jsd": 0.0, "loss/logits": 0.20915590971708298, "step": 5696 }, { "epoch": 0.356125, "grad_norm": 2.265625, "grad_norm_var": 0.008756510416666667, "learning_rate": 0.0001, "loss": 7.3883, "loss/crossentropy": 2.3473533391952515, "loss/hidden": 2.875, "loss/jsd": 0.0, "loss/logits": 0.20821285992860794, "step": 5698 }, { "epoch": 0.35625, "grad_norm": 2.421875, "grad_norm_var": 0.011335245768229167, "learning_rate": 0.0001, "loss": 7.1648, "loss/crossentropy": 2.310244917869568, "loss/hidden": 2.8828125, "loss/jsd": 0.0, "loss/logits": 0.21906693279743195, "step": 5700 }, { "epoch": 0.356375, "grad_norm": 2.125, "grad_norm_var": 0.014481608072916667, "learning_rate": 0.0001, "loss": 7.3204, "loss/crossentropy": 2.333138346672058, "loss/hidden": 2.8515625, "loss/jsd": 0.0, "loss/logits": 0.23273716866970062, "step": 5702 }, { "epoch": 0.3565, "grad_norm": 2.140625, "grad_norm_var": 0.014069620768229167, "learning_rate": 0.0001, "loss": 7.1919, "loss/crossentropy": 2.0340664386749268, "loss/hidden": 2.796875, "loss/jsd": 0.0, "loss/logits": 0.22012364119291306, "step": 5704 }, { "epoch": 0.356625, "grad_norm": 2.921875, "grad_norm_var": 0.04997456868489583, "learning_rate": 0.0001, "loss": 7.4458, "loss/crossentropy": 2.390481472015381, "loss/hidden": 2.9375, "loss/jsd": 0.0, "loss/logits": 0.22539138793945312, "step": 5706 }, { "epoch": 0.35675, "grad_norm": 2.03125, "grad_norm_var": 0.05371805826822917, "learning_rate": 0.0001, "loss": 7.1741, "loss/crossentropy": 2.3384220600128174, "loss/hidden": 2.953125, "loss/jsd": 0.0, "loss/logits": 0.21385760605335236, "step": 5708 }, { "epoch": 0.356875, "grad_norm": 2.25, "grad_norm_var": 0.052734375, "learning_rate": 0.0001, "loss": 7.3415, "loss/crossentropy": 2.266079902648926, "loss/hidden": 2.796875, "loss/jsd": 0.0, "loss/logits": 0.21113939583301544, "step": 5710 }, { "epoch": 0.357, "grad_norm": 2.28125, "grad_norm_var": 0.05022379557291667, "learning_rate": 0.0001, "loss": 7.2609, "loss/crossentropy": 2.290347933769226, "loss/hidden": 2.8671875, "loss/jsd": 0.0, "loss/logits": 0.22176006436347961, "step": 5712 }, { "epoch": 0.357125, "grad_norm": 1.9375, "grad_norm_var": 0.059651692708333336, "learning_rate": 0.0001, "loss": 7.0321, "loss/crossentropy": 2.053887665271759, "loss/hidden": 2.796875, "loss/jsd": 0.0, "loss/logits": 0.20520412921905518, "step": 5714 }, { "epoch": 0.35725, "grad_norm": 2.140625, "grad_norm_var": 0.05829671223958333, "learning_rate": 0.0001, "loss": 7.1609, "loss/crossentropy": 2.2182360887527466, "loss/hidden": 2.8203125, "loss/jsd": 0.0, "loss/logits": 0.19742074608802795, "step": 5716 }, { "epoch": 0.357375, "grad_norm": 2.140625, "grad_norm_var": 0.05565999348958333, "learning_rate": 0.0001, "loss": 7.1431, "loss/crossentropy": 2.2058277130126953, "loss/hidden": 2.8203125, "loss/jsd": 0.0, "loss/logits": 0.20125412940979004, "step": 5718 }, { "epoch": 0.3575, "grad_norm": 2.265625, "grad_norm_var": 0.0562408447265625, "learning_rate": 0.0001, "loss": 7.252, "loss/crossentropy": 2.047255039215088, "loss/hidden": 2.8359375, "loss/jsd": 0.0, "loss/logits": 0.19879010319709778, "step": 5720 }, { "epoch": 0.357625, "grad_norm": 2.28125, "grad_norm_var": 0.013508097330729166, "learning_rate": 0.0001, "loss": 7.1866, "loss/crossentropy": 2.2525157928466797, "loss/hidden": 2.9375, "loss/jsd": 0.0, "loss/logits": 0.23531018197536469, "step": 5722 }, { "epoch": 0.35775, "grad_norm": 1.96875, "grad_norm_var": 0.017476399739583332, "learning_rate": 0.0001, "loss": 6.9843, "loss/crossentropy": 2.151524543762207, "loss/hidden": 2.8828125, "loss/jsd": 0.0, "loss/logits": 0.21920562535524368, "step": 5724 }, { "epoch": 0.357875, "grad_norm": 2.265625, "grad_norm_var": 0.020572916666666666, "learning_rate": 0.0001, "loss": 7.3308, "loss/crossentropy": 2.113102436065674, "loss/hidden": 2.84375, "loss/jsd": 0.0, "loss/logits": 0.20669876039028168, "step": 5726 }, { "epoch": 0.358, "grad_norm": 2.359375, "grad_norm_var": 0.023151652018229166, "learning_rate": 0.0001, "loss": 7.1118, "loss/crossentropy": 2.0734687447547913, "loss/hidden": 2.7890625, "loss/jsd": 0.0, "loss/logits": 0.20580605417490005, "step": 5728 }, { "epoch": 0.358125, "grad_norm": 2.140625, "grad_norm_var": 0.018180338541666667, "learning_rate": 0.0001, "loss": 7.2682, "loss/crossentropy": 2.2568482160568237, "loss/hidden": 2.953125, "loss/jsd": 0.0, "loss/logits": 0.21253487467765808, "step": 5730 }, { "epoch": 0.35825, "grad_norm": 2.3125, "grad_norm_var": 0.017723592122395833, "learning_rate": 0.0001, "loss": 7.4277, "loss/crossentropy": 2.1575281620025635, "loss/hidden": 2.8671875, "loss/jsd": 0.0, "loss/logits": 0.20935780555009842, "step": 5732 }, { "epoch": 0.358375, "grad_norm": 2.15625, "grad_norm_var": 0.02252197265625, "learning_rate": 0.0001, "loss": 7.3061, "loss/crossentropy": 2.1693350076675415, "loss/hidden": 2.828125, "loss/jsd": 0.0, "loss/logits": 0.21988856047391891, "step": 5734 }, { "epoch": 0.3585, "grad_norm": 2.390625, "grad_norm_var": 0.0248443603515625, "learning_rate": 0.0001, "loss": 7.3982, "loss/crossentropy": 2.4757591485977173, "loss/hidden": 2.8359375, "loss/jsd": 0.0, "loss/logits": 0.22211267799139023, "step": 5736 }, { "epoch": 0.358625, "grad_norm": 2.203125, "grad_norm_var": 0.022835286458333333, "learning_rate": 0.0001, "loss": 7.2263, "loss/crossentropy": 1.9915843605995178, "loss/hidden": 2.90625, "loss/jsd": 0.0, "loss/logits": 0.20769823342561722, "step": 5738 }, { "epoch": 0.35875, "grad_norm": 2.359375, "grad_norm_var": 0.011701456705729167, "learning_rate": 0.0001, "loss": 7.369, "loss/crossentropy": 2.206750988960266, "loss/hidden": 2.8515625, "loss/jsd": 0.0, "loss/logits": 0.2206248715519905, "step": 5740 }, { "epoch": 0.358875, "grad_norm": 2.390625, "grad_norm_var": 0.011865234375, "learning_rate": 0.0001, "loss": 7.373, "loss/crossentropy": 2.46751606464386, "loss/hidden": 2.8984375, "loss/jsd": 0.0, "loss/logits": 0.2296297699213028, "step": 5742 }, { "epoch": 0.359, "grad_norm": 2.140625, "grad_norm_var": 0.01314697265625, "learning_rate": 0.0001, "loss": 7.4235, "loss/crossentropy": 2.4122776985168457, "loss/hidden": 2.8984375, "loss/jsd": 0.0, "loss/logits": 0.21703825145959854, "step": 5744 }, { "epoch": 0.359125, "grad_norm": 2.25, "grad_norm_var": 0.012939453125, "learning_rate": 0.0001, "loss": 7.4364, "loss/crossentropy": 2.4436628818511963, "loss/hidden": 2.9296875, "loss/jsd": 0.0, "loss/logits": 0.21490082144737244, "step": 5746 }, { "epoch": 0.35925, "grad_norm": 2.0625, "grad_norm_var": 0.015934244791666666, "learning_rate": 0.0001, "loss": 7.0069, "loss/crossentropy": 2.0449106693267822, "loss/hidden": 2.8515625, "loss/jsd": 0.0, "loss/logits": 0.18983326852321625, "step": 5748 }, { "epoch": 0.359375, "grad_norm": 2.125, "grad_norm_var": 0.013898722330729167, "learning_rate": 0.0001, "loss": 7.1814, "loss/crossentropy": 2.1521427631378174, "loss/hidden": 2.8359375, "loss/jsd": 0.0, "loss/logits": 0.20896754413843155, "step": 5750 }, { "epoch": 0.3595, "grad_norm": 2.421875, "grad_norm_var": 0.012788899739583333, "learning_rate": 0.0001, "loss": 7.2468, "loss/crossentropy": 2.1787991523742676, "loss/hidden": 2.953125, "loss/jsd": 0.0, "loss/logits": 0.2136235386133194, "step": 5752 }, { "epoch": 0.359625, "grad_norm": 2.125, "grad_norm_var": 0.013395182291666667, "learning_rate": 0.0001, "loss": 7.1933, "loss/crossentropy": 2.145695447921753, "loss/hidden": 2.8203125, "loss/jsd": 0.0, "loss/logits": 0.20177044719457626, "step": 5754 }, { "epoch": 0.35975, "grad_norm": 2.328125, "grad_norm_var": 0.013916015625, "learning_rate": 0.0001, "loss": 7.2973, "loss/crossentropy": 2.315553665161133, "loss/hidden": 2.84375, "loss/jsd": 0.0, "loss/logits": 0.21916157007217407, "step": 5756 }, { "epoch": 0.359875, "grad_norm": 2.203125, "grad_norm_var": 0.011351521809895833, "learning_rate": 0.0001, "loss": 7.2535, "loss/crossentropy": 2.1844006776809692, "loss/hidden": 2.8671875, "loss/jsd": 0.0, "loss/logits": 0.21652723848819733, "step": 5758 }, { "epoch": 0.36, "grad_norm": 2.15625, "grad_norm_var": 0.010936482747395834, "learning_rate": 0.0001, "loss": 7.1545, "loss/crossentropy": 2.140083909034729, "loss/hidden": 2.78125, "loss/jsd": 0.0, "loss/logits": 0.2082248032093048, "step": 5760 }, { "epoch": 0.360125, "grad_norm": 2.21875, "grad_norm_var": 0.010640462239583334, "learning_rate": 0.0001, "loss": 7.3526, "loss/crossentropy": 2.4246350526809692, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.2654386907815933, "step": 5762 }, { "epoch": 0.36025, "grad_norm": 2.34375, "grad_norm_var": 0.011116536458333333, "learning_rate": 0.0001, "loss": 7.236, "loss/crossentropy": 2.190550684928894, "loss/hidden": 2.8359375, "loss/jsd": 0.0, "loss/logits": 0.21372459083795547, "step": 5764 }, { "epoch": 0.360375, "grad_norm": 2.1875, "grad_norm_var": 0.0099517822265625, "learning_rate": 0.0001, "loss": 7.198, "loss/crossentropy": 2.2206780910491943, "loss/hidden": 2.8671875, "loss/jsd": 0.0, "loss/logits": 0.21862171590328217, "step": 5766 }, { "epoch": 0.3605, "grad_norm": 2.203125, "grad_norm_var": 0.008036295572916666, "learning_rate": 0.0001, "loss": 7.139, "loss/crossentropy": 2.0877394676208496, "loss/hidden": 2.8515625, "loss/jsd": 0.0, "loss/logits": 0.21827848255634308, "step": 5768 }, { "epoch": 0.360625, "grad_norm": 1.9765625, "grad_norm_var": 0.010497792561848959, "learning_rate": 0.0001, "loss": 7.1267, "loss/crossentropy": 2.1852867603302, "loss/hidden": 2.8359375, "loss/jsd": 0.0, "loss/logits": 0.19439171254634857, "step": 5770 }, { "epoch": 0.36075, "grad_norm": 2.359375, "grad_norm_var": 0.009500885009765625, "learning_rate": 0.0001, "loss": 7.2532, "loss/crossentropy": 2.2738914489746094, "loss/hidden": 2.828125, "loss/jsd": 0.0, "loss/logits": 0.20999271422624588, "step": 5772 }, { "epoch": 0.360875, "grad_norm": 2.140625, "grad_norm_var": 0.010184478759765626, "learning_rate": 0.0001, "loss": 7.3997, "loss/crossentropy": 2.32577121257782, "loss/hidden": 2.84375, "loss/jsd": 0.0, "loss/logits": 0.20629432797431946, "step": 5774 }, { "epoch": 0.361, "grad_norm": 2.53125, "grad_norm_var": 0.017978668212890625, "learning_rate": 0.0001, "loss": 7.205, "loss/crossentropy": 2.3490023612976074, "loss/hidden": 2.875, "loss/jsd": 0.0, "loss/logits": 0.23214927315711975, "step": 5776 }, { "epoch": 0.361125, "grad_norm": 2.234375, "grad_norm_var": 0.02216364542643229, "learning_rate": 0.0001, "loss": 7.2525, "loss/crossentropy": 2.041129231452942, "loss/hidden": 2.8671875, "loss/jsd": 0.0, "loss/logits": 0.1971891075372696, "step": 5778 }, { "epoch": 0.36125, "grad_norm": 2.28125, "grad_norm_var": 0.020957183837890626, "learning_rate": 0.0001, "loss": 7.1419, "loss/crossentropy": 2.4754010438919067, "loss/hidden": 2.890625, "loss/jsd": 0.0, "loss/logits": 0.2261795774102211, "step": 5780 }, { "epoch": 0.361375, "grad_norm": 2.265625, "grad_norm_var": 0.020947011311848958, "learning_rate": 0.0001, "loss": 7.4145, "loss/crossentropy": 2.3293362855911255, "loss/hidden": 2.875, "loss/jsd": 0.0, "loss/logits": 0.2322985827922821, "step": 5782 }, { "epoch": 0.3615, "grad_norm": 2.21875, "grad_norm_var": 0.019608306884765624, "learning_rate": 0.0001, "loss": 7.2454, "loss/crossentropy": 2.2818493843078613, "loss/hidden": 2.8359375, "loss/jsd": 0.0, "loss/logits": 0.2164430171251297, "step": 5784 }, { "epoch": 0.361625, "grad_norm": 2.125, "grad_norm_var": 0.0157379150390625, "learning_rate": 0.0001, "loss": 7.3042, "loss/crossentropy": 2.418179750442505, "loss/hidden": 2.8828125, "loss/jsd": 0.0, "loss/logits": 0.22034931927919388, "step": 5786 }, { "epoch": 0.36175, "grad_norm": 2.453125, "grad_norm_var": 0.017431640625, "learning_rate": 0.0001, "loss": 7.2657, "loss/crossentropy": 2.1933167576789856, "loss/hidden": 2.8515625, "loss/jsd": 0.0, "loss/logits": 0.2206980139017105, "step": 5788 }, { "epoch": 0.361875, "grad_norm": 2.453125, "grad_norm_var": 0.018863932291666666, "learning_rate": 0.0001, "loss": 7.175, "loss/crossentropy": 2.2669215202331543, "loss/hidden": 2.8984375, "loss/jsd": 0.0, "loss/logits": 0.20475652068853378, "step": 5790 }, { "epoch": 0.362, "grad_norm": 2.15625, "grad_norm_var": 0.017643229166666666, "learning_rate": 0.0001, "loss": 7.247, "loss/crossentropy": 2.1614081859588623, "loss/hidden": 2.875, "loss/jsd": 0.0, "loss/logits": 0.20143014192581177, "step": 5792 }, { "epoch": 0.362125, "grad_norm": 2.09375, "grad_norm_var": 0.020113118489583335, "learning_rate": 0.0001, "loss": 7.1625, "loss/crossentropy": 2.088079333305359, "loss/hidden": 2.8046875, "loss/jsd": 0.0, "loss/logits": 0.20542144775390625, "step": 5794 }, { "epoch": 0.36225, "grad_norm": 2.234375, "grad_norm_var": 0.019310506184895833, "learning_rate": 0.0001, "loss": 7.362, "loss/crossentropy": 2.2279645204544067, "loss/hidden": 2.84375, "loss/jsd": 0.0, "loss/logits": 0.2040068879723549, "step": 5796 }, { "epoch": 0.362375, "grad_norm": 2.203125, "grad_norm_var": 0.018903605143229165, "learning_rate": 0.0001, "loss": 7.1413, "loss/crossentropy": 2.3002779483795166, "loss/hidden": 2.8515625, "loss/jsd": 0.0, "loss/logits": 0.2167448326945305, "step": 5798 }, { "epoch": 0.3625, "grad_norm": 2.28125, "grad_norm_var": 0.01959228515625, "learning_rate": 0.0001, "loss": 7.2912, "loss/crossentropy": 2.49115788936615, "loss/hidden": 2.8515625, "loss/jsd": 0.0, "loss/logits": 0.22317777574062347, "step": 5800 }, { "epoch": 0.362625, "grad_norm": 2.171875, "grad_norm_var": 0.019188435872395833, "learning_rate": 0.0001, "loss": 7.2608, "loss/crossentropy": 2.2841527462005615, "loss/hidden": 2.875, "loss/jsd": 0.0, "loss/logits": 0.2520540952682495, "step": 5802 }, { "epoch": 0.36275, "grad_norm": 2.1875, "grad_norm_var": 0.017210896809895834, "learning_rate": 0.0001, "loss": 7.1874, "loss/crossentropy": 2.3316045999526978, "loss/hidden": 2.890625, "loss/jsd": 0.0, "loss/logits": 0.20709974318742752, "step": 5804 }, { "epoch": 0.362875, "grad_norm": 2.234375, "grad_norm_var": 0.014354451497395834, "learning_rate": 0.0001, "loss": 7.2947, "loss/crossentropy": 2.1988085508346558, "loss/hidden": 2.8125, "loss/jsd": 0.0, "loss/logits": 0.21208590269088745, "step": 5806 }, { "epoch": 0.363, "grad_norm": 2.171875, "grad_norm_var": 0.007710774739583333, "learning_rate": 0.0001, "loss": 7.2853, "loss/crossentropy": 2.404086947441101, "loss/hidden": 2.8828125, "loss/jsd": 0.0, "loss/logits": 0.23354601860046387, "step": 5808 }, { "epoch": 0.363125, "grad_norm": 2.296875, "grad_norm_var": 0.0051422119140625, "learning_rate": 0.0001, "loss": 7.3537, "loss/crossentropy": 2.3549355268478394, "loss/hidden": 2.8828125, "loss/jsd": 0.0, "loss/logits": 0.21141213178634644, "step": 5810 }, { "epoch": 0.36325, "grad_norm": 2.140625, "grad_norm_var": 0.00552978515625, "learning_rate": 0.0001, "loss": 7.3451, "loss/crossentropy": 2.428610682487488, "loss/hidden": 2.8828125, "loss/jsd": 0.0, "loss/logits": 0.22301796823740005, "step": 5812 }, { "epoch": 0.363375, "grad_norm": 2.3125, "grad_norm_var": 0.0060536702473958336, "learning_rate": 0.0001, "loss": 7.3552, "loss/crossentropy": 2.033466935157776, "loss/hidden": 2.9296875, "loss/jsd": 0.0, "loss/logits": 0.196543850004673, "step": 5814 }, { "epoch": 0.3635, "grad_norm": 2.28125, "grad_norm_var": 0.005150349934895834, "learning_rate": 0.0001, "loss": 7.3336, "loss/crossentropy": 2.324714779853821, "loss/hidden": 2.9140625, "loss/jsd": 0.0, "loss/logits": 0.21623509377241135, "step": 5816 }, { "epoch": 0.363625, "grad_norm": 2.109375, "grad_norm_var": 0.004976399739583333, "learning_rate": 0.0001, "loss": 7.1483, "loss/crossentropy": 2.2404175996780396, "loss/hidden": 2.875, "loss/jsd": 0.0, "loss/logits": 0.20572540163993835, "step": 5818 }, { "epoch": 0.36375, "grad_norm": 2.59375, "grad_norm_var": 0.014676920572916667, "learning_rate": 0.0001, "loss": 7.375, "loss/crossentropy": 2.269519567489624, "loss/hidden": 2.859375, "loss/jsd": 0.0, "loss/logits": 0.2145681381225586, "step": 5820 }, { "epoch": 0.363875, "grad_norm": 2.046875, "grad_norm_var": 0.016779581705729168, "learning_rate": 0.0001, "loss": 7.2622, "loss/crossentropy": 2.202036142349243, "loss/hidden": 2.84375, "loss/jsd": 0.0, "loss/logits": 0.2079039216041565, "step": 5822 }, { "epoch": 0.364, "grad_norm": 2.203125, "grad_norm_var": 0.0169097900390625, "learning_rate": 0.0001, "loss": 7.193, "loss/crossentropy": 2.546125888824463, "loss/hidden": 2.921875, "loss/jsd": 0.0, "loss/logits": 0.21474044024944305, "step": 5824 }, { "epoch": 0.364125, "grad_norm": 2.328125, "grad_norm_var": 0.016624959309895833, "learning_rate": 0.0001, "loss": 7.2476, "loss/crossentropy": 2.285582184791565, "loss/hidden": 2.859375, "loss/jsd": 0.0, "loss/logits": 0.21190346777439117, "step": 5826 }, { "epoch": 0.36425, "grad_norm": 2.28125, "grad_norm_var": 0.016373697916666666, "learning_rate": 0.0001, "loss": 7.3289, "loss/crossentropy": 2.239107668399811, "loss/hidden": 2.9609375, "loss/jsd": 0.0, "loss/logits": 0.19788258522748947, "step": 5828 }, { "epoch": 0.364375, "grad_norm": 2.09375, "grad_norm_var": 0.017015584309895835, "learning_rate": 0.0001, "loss": 7.3511, "loss/crossentropy": 2.401890516281128, "loss/hidden": 2.9453125, "loss/jsd": 0.0, "loss/logits": 0.2497824877500534, "step": 5830 }, { "epoch": 0.3645, "grad_norm": 2.296875, "grad_norm_var": 0.019401041666666667, "learning_rate": 0.0001, "loss": 7.2613, "loss/crossentropy": 2.2030797004699707, "loss/hidden": 2.828125, "loss/jsd": 0.0, "loss/logits": 0.1958223581314087, "step": 5832 }, { "epoch": 0.364625, "grad_norm": 2.3125, "grad_norm_var": 0.0217926025390625, "learning_rate": 0.0001, "loss": 7.2132, "loss/crossentropy": 2.241069197654724, "loss/hidden": 2.875, "loss/jsd": 0.0, "loss/logits": 0.22120070457458496, "step": 5834 }, { "epoch": 0.36475, "grad_norm": 2.15625, "grad_norm_var": 0.014045206705729167, "learning_rate": 0.0001, "loss": 7.3255, "loss/crossentropy": 2.346863269805908, "loss/hidden": 2.859375, "loss/jsd": 0.0, "loss/logits": 0.22068895399570465, "step": 5836 }, { "epoch": 0.364875, "grad_norm": 2.484375, "grad_norm_var": 0.015950520833333332, "learning_rate": 0.0001, "loss": 7.3054, "loss/crossentropy": 2.157726287841797, "loss/hidden": 2.890625, "loss/jsd": 0.0, "loss/logits": 0.19728650152683258, "step": 5838 }, { "epoch": 0.365, "grad_norm": 2.203125, "grad_norm_var": 0.0157867431640625, "learning_rate": 0.0001, "loss": 7.1046, "loss/crossentropy": 2.1430864334106445, "loss/hidden": 2.8203125, "loss/jsd": 0.0, "loss/logits": 0.19296472519636154, "step": 5840 }, { "epoch": 0.365125, "grad_norm": 2.25, "grad_norm_var": 0.015152994791666667, "learning_rate": 0.0001, "loss": 7.2594, "loss/crossentropy": 2.453359365463257, "loss/hidden": 2.8203125, "loss/jsd": 0.0, "loss/logits": 0.2097448706626892, "step": 5842 }, { "epoch": 0.36525, "grad_norm": 2.359375, "grad_norm_var": 0.015885416666666666, "learning_rate": 0.0001, "loss": 7.2375, "loss/crossentropy": 2.137434482574463, "loss/hidden": 2.7890625, "loss/jsd": 0.0, "loss/logits": 0.19895398616790771, "step": 5844 }, { "epoch": 0.365375, "grad_norm": 2.203125, "grad_norm_var": 0.014354451497395834, "learning_rate": 0.0001, "loss": 7.3807, "loss/crossentropy": 2.1278064846992493, "loss/hidden": 2.859375, "loss/jsd": 0.0, "loss/logits": 0.2288549244403839, "step": 5846 }, { "epoch": 0.3655, "grad_norm": 2.140625, "grad_norm_var": 0.0134918212890625, "learning_rate": 0.0001, "loss": 7.1056, "loss/crossentropy": 2.465573310852051, "loss/hidden": 2.921875, "loss/jsd": 0.0, "loss/logits": 0.22544366866350174, "step": 5848 }, { "epoch": 0.365625, "grad_norm": 2.09375, "grad_norm_var": 0.011649576822916667, "learning_rate": 0.0001, "loss": 7.2308, "loss/crossentropy": 1.9526810050010681, "loss/hidden": 2.765625, "loss/jsd": 0.0, "loss/logits": 0.19054138660430908, "step": 5850 }, { "epoch": 0.36575, "grad_norm": 2.625, "grad_norm_var": 0.021903483072916667, "learning_rate": 0.0001, "loss": 7.3444, "loss/crossentropy": 2.3907413482666016, "loss/hidden": 2.8515625, "loss/jsd": 0.0, "loss/logits": 0.22131036221981049, "step": 5852 }, { "epoch": 0.365875, "grad_norm": 2.15625, "grad_norm_var": 0.020210774739583333, "learning_rate": 0.0001, "loss": 7.3632, "loss/crossentropy": 2.4008761644363403, "loss/hidden": 2.828125, "loss/jsd": 0.0, "loss/logits": 0.22065234184265137, "step": 5854 }, { "epoch": 0.366, "grad_norm": 2.078125, "grad_norm_var": 0.023111979166666668, "learning_rate": 0.0001, "loss": 7.1694, "loss/crossentropy": 2.389074921607971, "loss/hidden": 2.8828125, "loss/jsd": 0.0, "loss/logits": 0.21668195724487305, "step": 5856 }, { "epoch": 0.366125, "grad_norm": 2.203125, "grad_norm_var": 0.023469034830729166, "learning_rate": 0.0001, "loss": 7.1397, "loss/crossentropy": 2.4073052406311035, "loss/hidden": 2.8984375, "loss/jsd": 0.0, "loss/logits": 0.2183263897895813, "step": 5858 }, { "epoch": 0.36625, "grad_norm": 2.234375, "grad_norm_var": 0.02408447265625, "learning_rate": 0.0001, "loss": 7.2806, "loss/crossentropy": 2.45753812789917, "loss/hidden": 2.859375, "loss/jsd": 0.0, "loss/logits": 0.22091014683246613, "step": 5860 }, { "epoch": 0.366375, "grad_norm": 2.15625, "grad_norm_var": 0.024247233072916666, "learning_rate": 0.0001, "loss": 7.1823, "loss/crossentropy": 2.2068198919296265, "loss/hidden": 2.875, "loss/jsd": 0.0, "loss/logits": 0.2208571657538414, "step": 5862 }, { "epoch": 0.3665, "grad_norm": 1.9609375, "grad_norm_var": 0.027530670166015625, "learning_rate": 0.0001, "loss": 7.0967, "loss/crossentropy": 2.0362515449523926, "loss/hidden": 2.8359375, "loss/jsd": 0.0, "loss/logits": 0.1910586953163147, "step": 5864 }, { "epoch": 0.366625, "grad_norm": 2.296875, "grad_norm_var": 0.02529881795247396, "learning_rate": 0.0001, "loss": 7.2175, "loss/crossentropy": 2.3787707090377808, "loss/hidden": 2.8515625, "loss/jsd": 0.0, "loss/logits": 0.20581073313951492, "step": 5866 }, { "epoch": 0.36675, "grad_norm": 2.09375, "grad_norm_var": 0.014542388916015624, "learning_rate": 0.0001, "loss": 7.112, "loss/crossentropy": 2.222362995147705, "loss/hidden": 2.8515625, "loss/jsd": 0.0, "loss/logits": 0.21793870627880096, "step": 5868 }, { "epoch": 0.366875, "grad_norm": 2.265625, "grad_norm_var": 0.011909739176432291, "learning_rate": 0.0001, "loss": 7.1673, "loss/crossentropy": 2.208632707595825, "loss/hidden": 2.8984375, "loss/jsd": 0.0, "loss/logits": 0.22443342208862305, "step": 5870 }, { "epoch": 0.367, "grad_norm": 2.1875, "grad_norm_var": 0.010330963134765624, "learning_rate": 0.0001, "loss": 7.3225, "loss/crossentropy": 1.9650804996490479, "loss/hidden": 2.875, "loss/jsd": 0.0, "loss/logits": 0.20253446698188782, "step": 5872 }, { "epoch": 0.367125, "grad_norm": 2.15625, "grad_norm_var": 0.011270904541015625, "learning_rate": 0.0001, "loss": 7.2269, "loss/crossentropy": 2.1838788986206055, "loss/hidden": 2.8515625, "loss/jsd": 0.0, "loss/logits": 0.22055459022521973, "step": 5874 }, { "epoch": 0.36725, "grad_norm": 2.4375, "grad_norm_var": 0.013565826416015624, "learning_rate": 0.0001, "loss": 7.0766, "loss/crossentropy": 2.2628434896469116, "loss/hidden": 2.84375, "loss/jsd": 0.0, "loss/logits": 0.20941808074712753, "step": 5876 }, { "epoch": 0.367375, "grad_norm": 1.953125, "grad_norm_var": 0.01853205362955729, "learning_rate": 0.0001, "loss": 7.278, "loss/crossentropy": 2.1746416687965393, "loss/hidden": 2.890625, "loss/jsd": 0.0, "loss/logits": 0.2203633338212967, "step": 5878 }, { "epoch": 0.3675, "grad_norm": 2.40625, "grad_norm_var": 0.017560831705729165, "learning_rate": 0.0001, "loss": 7.2906, "loss/crossentropy": 2.2157901525497437, "loss/hidden": 2.8203125, "loss/jsd": 0.0, "loss/logits": 0.20976532995700836, "step": 5880 }, { "epoch": 0.367625, "grad_norm": 2.015625, "grad_norm_var": 0.020563761393229168, "learning_rate": 0.0001, "loss": 7.2316, "loss/crossentropy": 2.40118670463562, "loss/hidden": 2.875, "loss/jsd": 0.0, "loss/logits": 0.22522136569023132, "step": 5882 }, { "epoch": 0.36775, "grad_norm": 2.28125, "grad_norm_var": 0.019624837239583335, "learning_rate": 0.0001, "loss": 7.2647, "loss/crossentropy": 2.5736021995544434, "loss/hidden": 2.8828125, "loss/jsd": 0.0, "loss/logits": 0.2218579277396202, "step": 5884 }, { "epoch": 0.367875, "grad_norm": 2.34375, "grad_norm_var": 0.020270792643229167, "learning_rate": 0.0001, "loss": 7.3696, "loss/crossentropy": 2.074417471885681, "loss/hidden": 2.9609375, "loss/jsd": 0.0, "loss/logits": 0.21545039117336273, "step": 5886 }, { "epoch": 0.368, "grad_norm": 2.125, "grad_norm_var": 0.020865885416666667, "learning_rate": 0.0001, "loss": 7.1881, "loss/crossentropy": 2.3205907344818115, "loss/hidden": 2.8359375, "loss/jsd": 0.0, "loss/logits": 0.20773284137248993, "step": 5888 }, { "epoch": 0.368125, "grad_norm": 2.34375, "grad_norm_var": 0.021393839518229166, "learning_rate": 0.0001, "loss": 7.073, "loss/crossentropy": 1.872346818447113, "loss/hidden": 2.921875, "loss/jsd": 0.0, "loss/logits": 0.204973466694355, "step": 5890 }, { "epoch": 0.36825, "grad_norm": 2.171875, "grad_norm_var": 0.017332967122395834, "learning_rate": 0.0001, "loss": 7.2071, "loss/crossentropy": 2.166461229324341, "loss/hidden": 2.8671875, "loss/jsd": 0.0, "loss/logits": 0.21400277316570282, "step": 5892 }, { "epoch": 0.368375, "grad_norm": 2.265625, "grad_norm_var": 0.012279256184895834, "learning_rate": 0.0001, "loss": 7.1804, "loss/crossentropy": 2.2579694986343384, "loss/hidden": 2.84375, "loss/jsd": 0.0, "loss/logits": 0.22205153107643127, "step": 5894 }, { "epoch": 0.3685, "grad_norm": 2.234375, "grad_norm_var": 0.008882649739583333, "learning_rate": 0.0001, "loss": 7.3781, "loss/crossentropy": 2.3255600929260254, "loss/hidden": 2.875, "loss/jsd": 0.0, "loss/logits": 0.20994628965854645, "step": 5896 }, { "epoch": 0.368625, "grad_norm": 2.140625, "grad_norm_var": 0.00933837890625, "learning_rate": 0.0001, "loss": 7.2894, "loss/crossentropy": 2.6723464727401733, "loss/hidden": 2.90625, "loss/jsd": 0.0, "loss/logits": 0.21977736055850983, "step": 5898 }, { "epoch": 0.36875, "grad_norm": 2.265625, "grad_norm_var": 0.010212198893229166, "learning_rate": 0.0001, "loss": 7.2128, "loss/crossentropy": 2.2925814390182495, "loss/hidden": 2.8984375, "loss/jsd": 0.0, "loss/logits": 0.24049442261457443, "step": 5900 }, { "epoch": 0.368875, "grad_norm": 2.359375, "grad_norm_var": 0.0120513916015625, "learning_rate": 0.0001, "loss": 7.5009, "loss/crossentropy": 2.6006277799606323, "loss/hidden": 2.90625, "loss/jsd": 0.0, "loss/logits": 0.22706793248653412, "step": 5902 }, { "epoch": 0.369, "grad_norm": 2.34375, "grad_norm_var": 0.0119537353515625, "learning_rate": 0.0001, "loss": 7.2934, "loss/crossentropy": 2.4217220544815063, "loss/hidden": 2.8671875, "loss/jsd": 0.0, "loss/logits": 0.23213213682174683, "step": 5904 }, { "epoch": 0.369125, "grad_norm": 2.1875, "grad_norm_var": 0.01031494140625, "learning_rate": 0.0001, "loss": 7.2382, "loss/crossentropy": 2.1426541805267334, "loss/hidden": 2.921875, "loss/jsd": 0.0, "loss/logits": 0.18587464094161987, "step": 5906 }, { "epoch": 0.36925, "grad_norm": 2.25, "grad_norm_var": 0.013036092122395834, "learning_rate": 0.0001, "loss": 7.1082, "loss/crossentropy": 2.2752933502197266, "loss/hidden": 2.8671875, "loss/jsd": 0.0, "loss/logits": 0.21011126041412354, "step": 5908 }, { "epoch": 0.369375, "grad_norm": 2.546875, "grad_norm_var": 0.018550618489583334, "learning_rate": 0.0001, "loss": 7.3066, "loss/crossentropy": 2.1513302326202393, "loss/hidden": 2.84375, "loss/jsd": 0.0, "loss/logits": 0.2277527153491974, "step": 5910 }, { "epoch": 0.3695, "grad_norm": 2.328125, "grad_norm_var": 0.018550618489583334, "learning_rate": 0.0001, "loss": 7.2429, "loss/crossentropy": 2.327723264694214, "loss/hidden": 2.8359375, "loss/jsd": 0.0, "loss/logits": 0.22254379838705063, "step": 5912 }, { "epoch": 0.369625, "grad_norm": 2.234375, "grad_norm_var": 0.01705322265625, "learning_rate": 0.0001, "loss": 7.1405, "loss/crossentropy": 2.0437814593315125, "loss/hidden": 2.796875, "loss/jsd": 0.0, "loss/logits": 0.2087857574224472, "step": 5914 }, { "epoch": 0.36975, "grad_norm": 2.09375, "grad_norm_var": 0.0169342041015625, "learning_rate": 0.0001, "loss": 7.464, "loss/crossentropy": 2.3539143800735474, "loss/hidden": 2.828125, "loss/jsd": 0.0, "loss/logits": 0.21950966119766235, "step": 5916 }, { "epoch": 0.369875, "grad_norm": 2.203125, "grad_norm_var": 0.01630859375, "learning_rate": 0.0001, "loss": 7.2979, "loss/crossentropy": 2.4984445571899414, "loss/hidden": 2.8515625, "loss/jsd": 0.0, "loss/logits": 0.19562814384698868, "step": 5918 }, { "epoch": 0.37, "grad_norm": 2.1875, "grad_norm_var": 0.015355428059895834, "learning_rate": 0.0001, "loss": 7.3028, "loss/crossentropy": 2.4766656160354614, "loss/hidden": 2.8984375, "loss/jsd": 0.0, "loss/logits": 0.21741511672735214, "step": 5920 }, { "epoch": 0.370125, "grad_norm": 1.9765625, "grad_norm_var": 0.020672353108723958, "learning_rate": 0.0001, "loss": 7.1708, "loss/crossentropy": 2.243171215057373, "loss/hidden": 2.8125, "loss/jsd": 0.0, "loss/logits": 0.2148171290755272, "step": 5922 }, { "epoch": 0.37025, "grad_norm": 2.375, "grad_norm_var": 0.01923192342122396, "learning_rate": 0.0001, "loss": 7.321, "loss/crossentropy": 2.036896765232086, "loss/hidden": 2.84375, "loss/jsd": 0.0, "loss/logits": 0.2031344696879387, "step": 5924 }, { "epoch": 0.370375, "grad_norm": 2.15625, "grad_norm_var": 0.011864980061848959, "learning_rate": 0.0001, "loss": 7.2207, "loss/crossentropy": 2.084986448287964, "loss/hidden": 2.96875, "loss/jsd": 0.0, "loss/logits": 0.21330830454826355, "step": 5926 }, { "epoch": 0.3705, "grad_norm": 2.140625, "grad_norm_var": 0.009710439046223958, "learning_rate": 0.0001, "loss": 7.2631, "loss/crossentropy": 2.224330186843872, "loss/hidden": 2.859375, "loss/jsd": 0.0, "loss/logits": 0.21560171246528625, "step": 5928 }, { "epoch": 0.370625, "grad_norm": 2.34375, "grad_norm_var": 0.011277008056640624, "learning_rate": 0.0001, "loss": 7.311, "loss/crossentropy": 2.49066960811615, "loss/hidden": 2.828125, "loss/jsd": 0.0, "loss/logits": 0.21737707406282425, "step": 5930 }, { "epoch": 0.37075, "grad_norm": 2.046875, "grad_norm_var": 0.010528310139973959, "learning_rate": 0.0001, "loss": 7.3316, "loss/crossentropy": 2.3138331174850464, "loss/hidden": 2.8125, "loss/jsd": 0.0, "loss/logits": 0.21158859878778458, "step": 5932 }, { "epoch": 0.370875, "grad_norm": 2.1875, "grad_norm_var": 0.010483551025390624, "learning_rate": 0.0001, "loss": 7.1589, "loss/crossentropy": 2.152156710624695, "loss/hidden": 2.8203125, "loss/jsd": 0.0, "loss/logits": 0.2170645147562027, "step": 5934 }, { "epoch": 0.371, "grad_norm": 2.21875, "grad_norm_var": 0.010603586832682291, "learning_rate": 0.0001, "loss": 7.2144, "loss/crossentropy": 2.0164735317230225, "loss/hidden": 2.859375, "loss/jsd": 0.0, "loss/logits": 0.1979883462190628, "step": 5936 }, { "epoch": 0.371125, "grad_norm": 2.25, "grad_norm_var": 0.007059733072916667, "learning_rate": 0.0001, "loss": 7.0243, "loss/crossentropy": 1.9355103373527527, "loss/hidden": 2.8203125, "loss/jsd": 0.0, "loss/logits": 0.20030242949724197, "step": 5938 }, { "epoch": 0.37125, "grad_norm": 2.125, "grad_norm_var": 0.005646769205729167, "learning_rate": 0.0001, "loss": 7.2098, "loss/crossentropy": 2.3397440910339355, "loss/hidden": 2.8359375, "loss/jsd": 0.0, "loss/logits": 0.21775859594345093, "step": 5940 }, { "epoch": 0.371375, "grad_norm": 1.984375, "grad_norm_var": 0.00806884765625, "learning_rate": 0.0001, "loss": 7.0602, "loss/crossentropy": 2.1412546634674072, "loss/hidden": 2.859375, "loss/jsd": 0.0, "loss/logits": 0.22875963151454926, "step": 5942 }, { "epoch": 0.3715, "grad_norm": 2.53125, "grad_norm_var": 0.018485514322916667, "learning_rate": 0.0001, "loss": 7.2662, "loss/crossentropy": 2.4508490562438965, "loss/hidden": 2.90625, "loss/jsd": 0.0, "loss/logits": 0.2223380208015442, "step": 5944 }, { "epoch": 0.371625, "grad_norm": 2.140625, "grad_norm_var": 0.0183990478515625, "learning_rate": 0.0001, "loss": 7.3082, "loss/crossentropy": 2.4562530517578125, "loss/hidden": 2.8515625, "loss/jsd": 0.0, "loss/logits": 0.2685226500034332, "step": 5946 }, { "epoch": 0.37175, "grad_norm": 2.125, "grad_norm_var": 0.017411295572916666, "learning_rate": 0.0001, "loss": 7.1859, "loss/crossentropy": 2.600165367126465, "loss/hidden": 2.859375, "loss/jsd": 0.0, "loss/logits": 0.2223629280924797, "step": 5948 }, { "epoch": 0.371875, "grad_norm": 2.1875, "grad_norm_var": 0.017024739583333334, "learning_rate": 0.0001, "loss": 7.3726, "loss/crossentropy": 2.579715847969055, "loss/hidden": 2.8828125, "loss/jsd": 0.0, "loss/logits": 0.22339096665382385, "step": 5950 }, { "epoch": 0.372, "grad_norm": 2.109375, "grad_norm_var": 0.017964680989583332, "learning_rate": 0.0001, "loss": 7.1321, "loss/crossentropy": 2.3544031381607056, "loss/hidden": 2.875, "loss/jsd": 0.0, "loss/logits": 0.2049936279654503, "step": 5952 }, { "epoch": 0.372125, "grad_norm": 2.296875, "grad_norm_var": 0.018651326497395832, "learning_rate": 0.0001, "loss": 7.3593, "loss/crossentropy": 2.570642113685608, "loss/hidden": 2.9140625, "loss/jsd": 0.0, "loss/logits": 0.2149905562400818, "step": 5954 }, { "epoch": 0.37225, "grad_norm": 2.203125, "grad_norm_var": 0.017878214518229168, "learning_rate": 0.0001, "loss": 7.2501, "loss/crossentropy": 2.345090627670288, "loss/hidden": 2.8984375, "loss/jsd": 0.0, "loss/logits": 0.21142956614494324, "step": 5956 }, { "epoch": 0.372375, "grad_norm": 2.21875, "grad_norm_var": 0.014322916666666666, "learning_rate": 0.0001, "loss": 7.2556, "loss/crossentropy": 2.083262085914612, "loss/hidden": 2.9609375, "loss/jsd": 0.0, "loss/logits": 0.21225877851247787, "step": 5958 }, { "epoch": 0.3725, "grad_norm": 2.328125, "grad_norm_var": 0.0090728759765625, "learning_rate": 0.0001, "loss": 7.3038, "loss/crossentropy": 2.310503602027893, "loss/hidden": 2.8515625, "loss/jsd": 0.0, "loss/logits": 0.2126244381070137, "step": 5960 }, { "epoch": 0.372625, "grad_norm": 1.9921875, "grad_norm_var": 0.011934153238932292, "learning_rate": 0.0001, "loss": 7.1571, "loss/crossentropy": 2.414725184440613, "loss/hidden": 2.8515625, "loss/jsd": 0.0, "loss/logits": 0.2153559774160385, "step": 5962 }, { "epoch": 0.37275, "grad_norm": 2.078125, "grad_norm_var": 0.013293202718098958, "learning_rate": 0.0001, "loss": 7.1466, "loss/crossentropy": 2.1802865266799927, "loss/hidden": 2.9140625, "loss/jsd": 0.0, "loss/logits": 0.19948512315750122, "step": 5964 }, { "epoch": 0.372875, "grad_norm": 2.265625, "grad_norm_var": 0.013588205973307291, "learning_rate": 0.0001, "loss": 7.3194, "loss/crossentropy": 2.2380692958831787, "loss/hidden": 2.890625, "loss/jsd": 0.0, "loss/logits": 0.21608977019786835, "step": 5966 }, { "epoch": 0.373, "grad_norm": 2.21875, "grad_norm_var": 0.013390858968098959, "learning_rate": 0.0001, "loss": 7.216, "loss/crossentropy": 2.2425453662872314, "loss/hidden": 2.765625, "loss/jsd": 0.0, "loss/logits": 0.2108849212527275, "step": 5968 }, { "epoch": 0.373125, "grad_norm": 2.171875, "grad_norm_var": 0.012666575113932292, "learning_rate": 0.0001, "loss": 7.163, "loss/crossentropy": 2.289492964744568, "loss/hidden": 2.8671875, "loss/jsd": 0.0, "loss/logits": 0.20529530197381973, "step": 5970 }, { "epoch": 0.37325, "grad_norm": 2.125, "grad_norm_var": 0.012385813395182292, "learning_rate": 0.0001, "loss": 7.1534, "loss/crossentropy": 1.9850167036056519, "loss/hidden": 2.8046875, "loss/jsd": 0.0, "loss/logits": 0.2159864455461502, "step": 5972 }, { "epoch": 0.373375, "grad_norm": 2.53125, "grad_norm_var": 0.019954172770182292, "learning_rate": 0.0001, "loss": 7.1503, "loss/crossentropy": 2.3028509616851807, "loss/hidden": 2.828125, "loss/jsd": 0.0, "loss/logits": 0.21371204406023026, "step": 5974 }, { "epoch": 0.3735, "grad_norm": 2.109375, "grad_norm_var": 0.01669286092122396, "learning_rate": 0.0001, "loss": 7.2954, "loss/crossentropy": 2.3715981245040894, "loss/hidden": 2.875, "loss/jsd": 0.0, "loss/logits": 0.21668031811714172, "step": 5976 }, { "epoch": 0.373625, "grad_norm": 2.3125, "grad_norm_var": 0.016048177083333334, "learning_rate": 0.0001, "loss": 7.2445, "loss/crossentropy": 2.125378727912903, "loss/hidden": 2.8671875, "loss/jsd": 0.0, "loss/logits": 0.20972995460033417, "step": 5978 }, { "epoch": 0.37375, "grad_norm": 2.15625, "grad_norm_var": 0.014176432291666667, "learning_rate": 0.0001, "loss": 7.148, "loss/crossentropy": 2.019354999065399, "loss/hidden": 2.78125, "loss/jsd": 0.0, "loss/logits": 0.18493575602769852, "step": 5980 }, { "epoch": 0.373875, "grad_norm": 2.234375, "grad_norm_var": 0.016136678059895833, "learning_rate": 0.0001, "loss": 7.2904, "loss/crossentropy": 2.051500916481018, "loss/hidden": 2.796875, "loss/jsd": 0.0, "loss/logits": 0.21599026024341583, "step": 5982 }, { "epoch": 0.374, "grad_norm": 2.21875, "grad_norm_var": 0.017601521809895833, "learning_rate": 0.0001, "loss": 7.4102, "loss/crossentropy": 2.1651517152786255, "loss/hidden": 2.8203125, "loss/jsd": 0.0, "loss/logits": 0.21314673870801926, "step": 5984 }, { "epoch": 0.374125, "grad_norm": 2.0625, "grad_norm_var": 0.01842041015625, "learning_rate": 0.0001, "loss": 7.2403, "loss/crossentropy": 2.254274010658264, "loss/hidden": 2.7890625, "loss/jsd": 0.0, "loss/logits": 0.2058999389410019, "step": 5986 }, { "epoch": 0.37425, "grad_norm": 2.28125, "grad_norm_var": 0.018680826822916666, "learning_rate": 0.0001, "loss": 7.3251, "loss/crossentropy": 2.346308946609497, "loss/hidden": 2.859375, "loss/jsd": 0.0, "loss/logits": 0.20627041906118393, "step": 5988 }, { "epoch": 0.374375, "grad_norm": 2.15625, "grad_norm_var": 0.011356608072916666, "learning_rate": 0.0001, "loss": 7.2133, "loss/crossentropy": 2.4631309509277344, "loss/hidden": 2.890625, "loss/jsd": 0.0, "loss/logits": 0.2179097756743431, "step": 5990 }, { "epoch": 0.3745, "grad_norm": 2.21875, "grad_norm_var": 0.0103424072265625, "learning_rate": 0.0001, "loss": 7.2366, "loss/crossentropy": 2.558820962905884, "loss/hidden": 2.8359375, "loss/jsd": 0.0, "loss/logits": 0.20852772891521454, "step": 5992 }, { "epoch": 0.374625, "grad_norm": 2.140625, "grad_norm_var": 0.007982381184895833, "learning_rate": 0.0001, "loss": 7.2517, "loss/crossentropy": 2.252416253089905, "loss/hidden": 2.8828125, "loss/jsd": 0.0, "loss/logits": 0.21487032622098923, "step": 5994 }, { "epoch": 0.37475, "grad_norm": 2.515625, "grad_norm_var": 0.015999348958333333, "learning_rate": 0.0001, "loss": 7.3158, "loss/crossentropy": 2.4063336849212646, "loss/hidden": 2.859375, "loss/jsd": 0.0, "loss/logits": 0.22586210072040558, "step": 5996 }, { "epoch": 0.374875, "grad_norm": 2.21875, "grad_norm_var": 0.013179524739583334, "learning_rate": 0.0001, "loss": 7.1276, "loss/crossentropy": 1.848585605621338, "loss/hidden": 2.71875, "loss/jsd": 0.0, "loss/logits": 0.19894341379404068, "step": 5998 }, { "epoch": 0.375, "grad_norm": 2.34375, "grad_norm_var": 0.013654581705729167, "learning_rate": 0.0001, "loss": 7.1552, "loss/crossentropy": 2.4806891679763794, "loss/hidden": 2.8125, "loss/jsd": 0.0, "loss/logits": 0.2110806256532669, "step": 6000 }, { "epoch": 0.375125, "grad_norm": 2.328125, "grad_norm_var": 0.012451171875, "learning_rate": 0.0001, "loss": 7.2238, "loss/crossentropy": 2.1955052614212036, "loss/hidden": 2.8046875, "loss/jsd": 0.0, "loss/logits": 0.2050733044743538, "step": 6002 }, { "epoch": 0.37525, "grad_norm": 2.0, "grad_norm_var": 0.014990234375, "learning_rate": 0.0001, "loss": 7.2147, "loss/crossentropy": 2.2241486310958862, "loss/hidden": 2.78125, "loss/jsd": 0.0, "loss/logits": 0.20495334267616272, "step": 6004 }, { "epoch": 0.375375, "grad_norm": 2.203125, "grad_norm_var": 0.015355428059895834, "learning_rate": 0.0001, "loss": 7.0965, "loss/crossentropy": 2.167006492614746, "loss/hidden": 2.828125, "loss/jsd": 0.0, "loss/logits": 0.20993337780237198, "step": 6006 }, { "epoch": 0.3755, "grad_norm": 2.25, "grad_norm_var": 0.015620930989583334, "learning_rate": 0.0001, "loss": 7.1475, "loss/crossentropy": 2.1680710911750793, "loss/hidden": 2.765625, "loss/jsd": 0.0, "loss/logits": 0.2089712917804718, "step": 6008 }, { "epoch": 0.375625, "grad_norm": 2.28125, "grad_norm_var": 0.015555826822916667, "learning_rate": 0.0001, "loss": 7.1355, "loss/crossentropy": 2.338285446166992, "loss/hidden": 2.890625, "loss/jsd": 0.0, "loss/logits": 0.21749289333820343, "step": 6010 }, { "epoch": 0.37575, "grad_norm": 2.1875, "grad_norm_var": 0.0079498291015625, "learning_rate": 0.0001, "loss": 7.1924, "loss/crossentropy": 2.2684131860733032, "loss/hidden": 2.890625, "loss/jsd": 0.0, "loss/logits": 0.2123074233531952, "step": 6012 }, { "epoch": 0.375875, "grad_norm": 2.296875, "grad_norm_var": 0.008429972330729167, "learning_rate": 0.0001, "loss": 7.2788, "loss/crossentropy": 2.6743900775909424, "loss/hidden": 2.875, "loss/jsd": 0.0, "loss/logits": 0.21920502185821533, "step": 6014 }, { "epoch": 0.376, "grad_norm": 2.296875, "grad_norm_var": 0.006598917643229166, "learning_rate": 0.0001, "loss": 7.3491, "loss/crossentropy": 2.175094962120056, "loss/hidden": 2.9375, "loss/jsd": 0.0, "loss/logits": 0.2208651378750801, "step": 6016 }, { "epoch": 0.376125, "grad_norm": 2.15625, "grad_norm_var": 0.005757649739583333, "learning_rate": 0.0001, "loss": 7.2814, "loss/crossentropy": 2.417030453681946, "loss/hidden": 2.984375, "loss/jsd": 0.0, "loss/logits": 0.2150922417640686, "step": 6018 }, { "epoch": 0.37625, "grad_norm": 2.296875, "grad_norm_var": 0.004069010416666667, "learning_rate": 0.0001, "loss": 7.1846, "loss/crossentropy": 2.065057873725891, "loss/hidden": 2.765625, "loss/jsd": 0.0, "loss/logits": 0.18963027000427246, "step": 6020 }, { "epoch": 0.376375, "grad_norm": 2.1875, "grad_norm_var": 0.0033843994140625, "learning_rate": 0.0001, "loss": 7.099, "loss/crossentropy": 2.0630787014961243, "loss/hidden": 2.8359375, "loss/jsd": 0.0, "loss/logits": 0.2001161426305771, "step": 6022 }, { "epoch": 0.3765, "grad_norm": 2.09375, "grad_norm_var": 0.005052693684895833, "learning_rate": 0.0001, "loss": 7.2794, "loss/crossentropy": 2.018410086631775, "loss/hidden": 2.8984375, "loss/jsd": 0.0, "loss/logits": 0.21171020716428757, "step": 6024 }, { "epoch": 0.376625, "grad_norm": 2.140625, "grad_norm_var": 0.007242838541666667, "learning_rate": 0.0001, "loss": 7.04, "loss/crossentropy": 2.0343552231788635, "loss/hidden": 2.875, "loss/jsd": 0.0, "loss/logits": 0.1925143450498581, "step": 6026 }, { "epoch": 0.37675, "grad_norm": 2.265625, "grad_norm_var": 0.007731119791666667, "learning_rate": 0.0001, "loss": 7.2422, "loss/crossentropy": 2.3210513591766357, "loss/hidden": 2.8203125, "loss/jsd": 0.0, "loss/logits": 0.2177387848496437, "step": 6028 }, { "epoch": 0.376875, "grad_norm": 2.203125, "grad_norm_var": 0.007096354166666667, "learning_rate": 0.0001, "loss": 7.0303, "loss/crossentropy": 2.0913103818893433, "loss/hidden": 2.8125, "loss/jsd": 0.0, "loss/logits": 0.19993747025728226, "step": 6030 }, { "epoch": 0.377, "grad_norm": 2.15625, "grad_norm_var": 0.00611572265625, "learning_rate": 0.0001, "loss": 7.2291, "loss/crossentropy": 2.2609927654266357, "loss/hidden": 2.78125, "loss/jsd": 0.0, "loss/logits": 0.21157050877809525, "step": 6032 }, { "epoch": 0.377125, "grad_norm": 2.296875, "grad_norm_var": 0.007161458333333333, "learning_rate": 0.0001, "loss": 7.1153, "loss/crossentropy": 2.5453518629074097, "loss/hidden": 2.8984375, "loss/jsd": 0.0, "loss/logits": 0.22479583323001862, "step": 6034 }, { "epoch": 0.37725, "grad_norm": 2.1875, "grad_norm_var": 0.005952962239583333, "learning_rate": 0.0001, "loss": 7.2865, "loss/crossentropy": 2.421576499938965, "loss/hidden": 2.8671875, "loss/jsd": 0.0, "loss/logits": 0.22091317176818848, "step": 6036 }, { "epoch": 0.377375, "grad_norm": 2.046875, "grad_norm_var": 0.0066721598307291664, "learning_rate": 0.0001, "loss": 7.2464, "loss/crossentropy": 2.370941162109375, "loss/hidden": 2.859375, "loss/jsd": 0.0, "loss/logits": 0.21858198940753937, "step": 6038 }, { "epoch": 0.3775, "grad_norm": 2.171875, "grad_norm_var": 0.006004842122395834, "learning_rate": 0.0001, "loss": 7.2514, "loss/crossentropy": 2.3779879808425903, "loss/hidden": 2.890625, "loss/jsd": 0.0, "loss/logits": 0.21413833647966385, "step": 6040 }, { "epoch": 0.377625, "grad_norm": 2.34375, "grad_norm_var": 0.005671183268229167, "learning_rate": 0.0001, "loss": 7.3732, "loss/crossentropy": 2.073517680168152, "loss/hidden": 2.78125, "loss/jsd": 0.0, "loss/logits": 0.19559810310602188, "step": 6042 }, { "epoch": 0.37775, "grad_norm": 2.125, "grad_norm_var": 0.005810546875, "learning_rate": 0.0001, "loss": 7.2228, "loss/crossentropy": 2.136751651763916, "loss/hidden": 2.8359375, "loss/jsd": 0.0, "loss/logits": 0.20897220820188522, "step": 6044 }, { "epoch": 0.377875, "grad_norm": 2.1875, "grad_norm_var": 0.005810546875, "learning_rate": 0.0001, "loss": 7.102, "loss/crossentropy": 2.1889522671699524, "loss/hidden": 2.7734375, "loss/jsd": 0.0, "loss/logits": 0.20655735582113266, "step": 6046 }, { "epoch": 0.378, "grad_norm": 2.296875, "grad_norm_var": 0.006891886393229167, "learning_rate": 0.0001, "loss": 7.3464, "loss/crossentropy": 2.486303687095642, "loss/hidden": 2.8359375, "loss/jsd": 0.0, "loss/logits": 0.2155042290687561, "step": 6048 }, { "epoch": 0.378125, "grad_norm": 2.234375, "grad_norm_var": 0.006441243489583333, "learning_rate": 0.0001, "loss": 7.2726, "loss/crossentropy": 2.2876516580581665, "loss/hidden": 2.9140625, "loss/jsd": 0.0, "loss/logits": 0.22767490148544312, "step": 6050 }, { "epoch": 0.37825, "grad_norm": 2.359375, "grad_norm_var": 0.007868448893229166, "learning_rate": 0.0001, "loss": 7.2255, "loss/crossentropy": 2.4651968479156494, "loss/hidden": 2.828125, "loss/jsd": 0.0, "loss/logits": 0.2104952037334442, "step": 6052 }, { "epoch": 0.378375, "grad_norm": 2.234375, "grad_norm_var": 0.005467732747395833, "learning_rate": 0.0001, "loss": 7.3932, "loss/crossentropy": 2.672922730445862, "loss/hidden": 2.875, "loss/jsd": 0.0, "loss/logits": 0.24261310696601868, "step": 6054 }, { "epoch": 0.3785, "grad_norm": 2.625, "grad_norm_var": 0.0140625, "learning_rate": 0.0001, "loss": 7.1016, "loss/crossentropy": 1.833503007888794, "loss/hidden": 2.8125, "loss/jsd": 0.0, "loss/logits": 0.17841923236846924, "step": 6056 }, { "epoch": 0.378625, "grad_norm": 1.96875, "grad_norm_var": 0.01978759765625, "learning_rate": 0.0001, "loss": 7.203, "loss/crossentropy": 2.2275290489196777, "loss/hidden": 2.8671875, "loss/jsd": 0.0, "loss/logits": 0.20729243010282516, "step": 6058 }, { "epoch": 0.37875, "grad_norm": 2.1875, "grad_norm_var": 0.019050089518229167, "learning_rate": 0.0001, "loss": 7.2747, "loss/crossentropy": 2.679691195487976, "loss/hidden": 2.890625, "loss/jsd": 0.0, "loss/logits": 0.21618877351284027, "step": 6060 }, { "epoch": 0.378875, "grad_norm": 2.125, "grad_norm_var": 0.0194488525390625, "learning_rate": 0.0001, "loss": 7.1795, "loss/crossentropy": 2.263300061225891, "loss/hidden": 2.9609375, "loss/jsd": 0.0, "loss/logits": 0.2042461857199669, "step": 6062 }, { "epoch": 0.379, "grad_norm": 2.3125, "grad_norm_var": 0.019416300455729167, "learning_rate": 0.0001, "loss": 7.398, "loss/crossentropy": 2.6218732595443726, "loss/hidden": 2.8828125, "loss/jsd": 0.0, "loss/logits": 0.2375633716583252, "step": 6064 }, { "epoch": 0.379125, "grad_norm": 2.25, "grad_norm_var": 0.019440714518229166, "learning_rate": 0.0001, "loss": 7.3822, "loss/crossentropy": 2.30401611328125, "loss/hidden": 2.828125, "loss/jsd": 0.0, "loss/logits": 0.22317654639482498, "step": 6066 }, { "epoch": 0.37925, "grad_norm": 2.421875, "grad_norm_var": 0.023216756184895833, "learning_rate": 0.0001, "loss": 7.3373, "loss/crossentropy": 1.9362152814865112, "loss/hidden": 2.84375, "loss/jsd": 0.0, "loss/logits": 0.1887020766735077, "step": 6068 }, { "epoch": 0.379375, "grad_norm": 2.546875, "grad_norm_var": 0.029313151041666666, "learning_rate": 0.0001, "loss": 7.5001, "loss/crossentropy": 2.3842978477478027, "loss/hidden": 2.8359375, "loss/jsd": 0.0, "loss/logits": 0.21501502394676208, "step": 6070 }, { "epoch": 0.3795, "grad_norm": 2.34375, "grad_norm_var": 0.023298136393229165, "learning_rate": 0.0001, "loss": 7.1358, "loss/crossentropy": 2.22287917137146, "loss/hidden": 2.921875, "loss/jsd": 0.0, "loss/logits": 0.21563850343227386, "step": 6072 }, { "epoch": 0.379625, "grad_norm": 2.265625, "grad_norm_var": 0.017215983072916666, "learning_rate": 0.0001, "loss": 7.1006, "loss/crossentropy": 2.402526021003723, "loss/hidden": 2.8671875, "loss/jsd": 0.0, "loss/logits": 0.21390444040298462, "step": 6074 }, { "epoch": 0.37975, "grad_norm": 1.9375, "grad_norm_var": 0.0245513916015625, "learning_rate": 0.0001, "loss": 7.1442, "loss/crossentropy": 2.188621401786804, "loss/hidden": 2.8203125, "loss/jsd": 0.0, "loss/logits": 0.20676591992378235, "step": 6076 }, { "epoch": 0.379875, "grad_norm": 2.34375, "grad_norm_var": 0.022819010416666667, "learning_rate": 0.0001, "loss": 7.2357, "loss/crossentropy": 2.382845878601074, "loss/hidden": 2.859375, "loss/jsd": 0.0, "loss/logits": 0.21810097247362137, "step": 6078 }, { "epoch": 0.38, "grad_norm": 2.171875, "grad_norm_var": 0.0234375, "learning_rate": 0.0001, "loss": 7.266, "loss/crossentropy": 2.408275842666626, "loss/hidden": 2.8671875, "loss/jsd": 0.0, "loss/logits": 0.21064983308315277, "step": 6080 }, { "epoch": 0.380125, "grad_norm": 2.09375, "grad_norm_var": 0.026611328125, "learning_rate": 0.0001, "loss": 7.1686, "loss/crossentropy": 2.3628504276275635, "loss/hidden": 2.8125, "loss/jsd": 0.0, "loss/logits": 0.20509422570466995, "step": 6082 }, { "epoch": 0.38025, "grad_norm": 2.15625, "grad_norm_var": 0.022737630208333335, "learning_rate": 0.0001, "loss": 7.2245, "loss/crossentropy": 2.246079921722412, "loss/hidden": 2.8671875, "loss/jsd": 0.0, "loss/logits": 0.2075369581580162, "step": 6084 }, { "epoch": 0.380375, "grad_norm": 2.03125, "grad_norm_var": 0.016429646809895834, "learning_rate": 0.0001, "loss": 7.1207, "loss/crossentropy": 2.012663960456848, "loss/hidden": 2.875, "loss/jsd": 0.0, "loss/logits": 0.22296176850795746, "step": 6086 }, { "epoch": 0.3805, "grad_norm": 2.109375, "grad_norm_var": 0.0128082275390625, "learning_rate": 0.0001, "loss": 7.2217, "loss/crossentropy": 2.5077545642852783, "loss/hidden": 2.890625, "loss/jsd": 0.0, "loss/logits": 0.2282715067267418, "step": 6088 }, { "epoch": 0.380625, "grad_norm": 2.21875, "grad_norm_var": 0.0127105712890625, "learning_rate": 0.0001, "loss": 7.1857, "loss/crossentropy": 2.2846879959106445, "loss/hidden": 2.84375, "loss/jsd": 0.0, "loss/logits": 0.20731749385595322, "step": 6090 }, { "epoch": 0.38075, "grad_norm": 2.21875, "grad_norm_var": 0.0082916259765625, "learning_rate": 0.0001, "loss": 7.264, "loss/crossentropy": 2.340460181236267, "loss/hidden": 2.828125, "loss/jsd": 0.0, "loss/logits": 0.21707424521446228, "step": 6092 }, { "epoch": 0.380875, "grad_norm": 2.109375, "grad_norm_var": 0.0072743733723958336, "learning_rate": 0.0001, "loss": 7.2139, "loss/crossentropy": 2.317685127258301, "loss/hidden": 2.8671875, "loss/jsd": 0.0, "loss/logits": 0.20413777977228165, "step": 6094 }, { "epoch": 0.381, "grad_norm": 2.25, "grad_norm_var": 0.007111612955729167, "learning_rate": 0.0001, "loss": 7.3123, "loss/crossentropy": 2.2990000247955322, "loss/hidden": 2.890625, "loss/jsd": 0.0, "loss/logits": 0.22321767359972, "step": 6096 }, { "epoch": 0.381125, "grad_norm": 2.296875, "grad_norm_var": 0.006761678059895833, "learning_rate": 0.0001, "loss": 7.2415, "loss/crossentropy": 2.307106852531433, "loss/hidden": 2.90625, "loss/jsd": 0.0, "loss/logits": 0.2068295031785965, "step": 6098 }, { "epoch": 0.38125, "grad_norm": 2.09375, "grad_norm_var": 0.009663899739583334, "learning_rate": 0.0001, "loss": 7.2459, "loss/crossentropy": 2.272905468940735, "loss/hidden": 2.828125, "loss/jsd": 0.0, "loss/logits": 0.2071700543165207, "step": 6100 }, { "epoch": 0.381375, "grad_norm": 2.140625, "grad_norm_var": 0.007763671875, "learning_rate": 0.0001, "loss": 7.2205, "loss/crossentropy": 2.2466858625411987, "loss/hidden": 2.84375, "loss/jsd": 0.0, "loss/logits": 0.21905604004859924, "step": 6102 }, { "epoch": 0.3815, "grad_norm": 2.21875, "grad_norm_var": 0.00625, "learning_rate": 0.0001, "loss": 7.3163, "loss/crossentropy": 2.113400459289551, "loss/hidden": 2.7890625, "loss/jsd": 0.0, "loss/logits": 0.20151030272245407, "step": 6104 }, { "epoch": 0.381625, "grad_norm": 2.25, "grad_norm_var": 0.00826416015625, "learning_rate": 0.0001, "loss": 7.2066, "loss/crossentropy": 2.289364218711853, "loss/hidden": 2.859375, "loss/jsd": 0.0, "loss/logits": 0.2247104048728943, "step": 6106 }, { "epoch": 0.38175, "grad_norm": 2.234375, "grad_norm_var": 0.01080322265625, "learning_rate": 0.0001, "loss": 7.2602, "loss/crossentropy": 2.203786611557007, "loss/hidden": 2.8515625, "loss/jsd": 0.0, "loss/logits": 0.21391122043132782, "step": 6108 }, { "epoch": 0.381875, "grad_norm": 2.125, "grad_norm_var": 0.011454264322916666, "learning_rate": 0.0001, "loss": 7.1077, "loss/crossentropy": 2.2855257987976074, "loss/hidden": 3.0, "loss/jsd": 0.0, "loss/logits": 0.24084167182445526, "step": 6110 }, { "epoch": 0.382, "grad_norm": 2.09375, "grad_norm_var": 0.0131500244140625, "learning_rate": 0.0001, "loss": 7.1511, "loss/crossentropy": 2.326382040977478, "loss/hidden": 2.8671875, "loss/jsd": 0.0, "loss/logits": 0.22301767766475677, "step": 6112 }, { "epoch": 0.382125, "grad_norm": 2.1875, "grad_norm_var": 0.01357421875, "learning_rate": 0.0001, "loss": 7.3663, "loss/crossentropy": 2.4644904136657715, "loss/hidden": 2.8828125, "loss/jsd": 0.0, "loss/logits": 0.2120847851037979, "step": 6114 }, { "epoch": 0.38225, "grad_norm": 2.046875, "grad_norm_var": 0.017594401041666666, "learning_rate": 0.0001, "loss": 7.1899, "loss/crossentropy": 1.9932494163513184, "loss/hidden": 2.8203125, "loss/jsd": 0.0, "loss/logits": 0.1996365636587143, "step": 6116 }, { "epoch": 0.382375, "grad_norm": 2.171875, "grad_norm_var": 0.017769368489583333, "learning_rate": 0.0001, "loss": 7.2331, "loss/crossentropy": 2.2997413873672485, "loss/hidden": 2.890625, "loss/jsd": 0.0, "loss/logits": 0.21524758636951447, "step": 6118 }, { "epoch": 0.3825, "grad_norm": 2.234375, "grad_norm_var": 0.018277994791666665, "learning_rate": 0.0001, "loss": 7.1646, "loss/crossentropy": 2.412147045135498, "loss/hidden": 2.84375, "loss/jsd": 0.0, "loss/logits": 0.20587334036827087, "step": 6120 }, { "epoch": 0.382625, "grad_norm": 2.34375, "grad_norm_var": 0.020100911458333332, "learning_rate": 0.0001, "loss": 7.1928, "loss/crossentropy": 2.1714669466018677, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.22052258253097534, "step": 6122 }, { "epoch": 0.38275, "grad_norm": 2.234375, "grad_norm_var": 0.017887369791666666, "learning_rate": 0.0001, "loss": 7.2932, "loss/crossentropy": 2.3583080768585205, "loss/hidden": 2.9453125, "loss/jsd": 0.0, "loss/logits": 0.232616625726223, "step": 6124 }, { "epoch": 0.382875, "grad_norm": 2.234375, "grad_norm_var": 0.016999308268229166, "learning_rate": 0.0001, "loss": 7.252, "loss/crossentropy": 2.2754658460617065, "loss/hidden": 2.8046875, "loss/jsd": 0.0, "loss/logits": 0.21504772454500198, "step": 6126 }, { "epoch": 0.383, "grad_norm": 2.359375, "grad_norm_var": 0.023509724934895834, "learning_rate": 0.0001, "loss": 7.1404, "loss/crossentropy": 2.382510781288147, "loss/hidden": 2.875, "loss/jsd": 0.0, "loss/logits": 0.22090370953083038, "step": 6128 }, { "epoch": 0.383125, "grad_norm": 1.9921875, "grad_norm_var": 0.027787017822265624, "learning_rate": 0.0001, "loss": 7.1187, "loss/crossentropy": 2.0568565130233765, "loss/hidden": 2.796875, "loss/jsd": 0.0, "loss/logits": 0.19300862401723862, "step": 6130 }, { "epoch": 0.38325, "grad_norm": 2.140625, "grad_norm_var": 0.021183013916015625, "learning_rate": 0.0001, "loss": 7.2013, "loss/crossentropy": 2.395463228225708, "loss/hidden": 2.890625, "loss/jsd": 0.0, "loss/logits": 0.21835564076900482, "step": 6132 }, { "epoch": 0.383375, "grad_norm": 2.09375, "grad_norm_var": 0.020715077718098957, "learning_rate": 0.0001, "loss": 7.2018, "loss/crossentropy": 2.2619467973709106, "loss/hidden": 2.78125, "loss/jsd": 0.0, "loss/logits": 0.2159418836236, "step": 6134 }, { "epoch": 0.3835, "grad_norm": 2.328125, "grad_norm_var": 0.020918528238932293, "learning_rate": 0.0001, "loss": 7.4816, "loss/crossentropy": 2.2685283422470093, "loss/hidden": 2.875, "loss/jsd": 0.0, "loss/logits": 0.21295687556266785, "step": 6136 }, { "epoch": 0.383625, "grad_norm": 2.25, "grad_norm_var": 0.019311269124348957, "learning_rate": 0.0001, "loss": 7.1095, "loss/crossentropy": 2.2552493810653687, "loss/hidden": 2.796875, "loss/jsd": 0.0, "loss/logits": 0.2095998227596283, "step": 6138 }, { "epoch": 0.38375, "grad_norm": 2.203125, "grad_norm_var": 0.020139312744140624, "learning_rate": 0.0001, "loss": 7.2686, "loss/crossentropy": 2.3464255332946777, "loss/hidden": 2.765625, "loss/jsd": 0.0, "loss/logits": 0.20441381633281708, "step": 6140 }, { "epoch": 0.383875, "grad_norm": 2.078125, "grad_norm_var": 0.024421183268229167, "learning_rate": 0.0001, "loss": 7.2288, "loss/crossentropy": 2.287395715713501, "loss/hidden": 2.875, "loss/jsd": 0.0, "loss/logits": 0.22399558871984482, "step": 6142 }, { "epoch": 0.384, "grad_norm": 2.203125, "grad_norm_var": 0.011922200520833334, "learning_rate": 0.0001, "loss": 7.4251, "loss/crossentropy": 2.164630949497223, "loss/hidden": 2.8359375, "loss/jsd": 0.0, "loss/logits": 0.22558195143938065, "step": 6144 }, { "epoch": 0.384125, "grad_norm": 2.203125, "grad_norm_var": 0.010959625244140625, "learning_rate": 0.0001, "loss": 7.1384, "loss/crossentropy": 1.937812328338623, "loss/hidden": 2.8828125, "loss/jsd": 0.0, "loss/logits": 0.19157906621694565, "step": 6146 }, { "epoch": 0.38425, "grad_norm": 2.359375, "grad_norm_var": 0.013787587483723959, "learning_rate": 0.0001, "loss": 7.2158, "loss/crossentropy": 2.3608381748199463, "loss/hidden": 2.9140625, "loss/jsd": 0.0, "loss/logits": 0.24235235154628754, "step": 6148 }, { "epoch": 0.384375, "grad_norm": 2.265625, "grad_norm_var": 0.014587148030598959, "learning_rate": 0.0001, "loss": 7.1659, "loss/crossentropy": 2.4223140478134155, "loss/hidden": 2.84375, "loss/jsd": 0.0, "loss/logits": 0.21669473499059677, "step": 6150 }, { "epoch": 0.3845, "grad_norm": 2.265625, "grad_norm_var": 0.017937978108723957, "learning_rate": 0.0001, "loss": 7.0478, "loss/crossentropy": 2.2232367992401123, "loss/hidden": 2.84375, "loss/jsd": 0.0, "loss/logits": 0.21520529687404633, "step": 6152 }, { "epoch": 0.384625, "grad_norm": 1.984375, "grad_norm_var": 0.021602121988932292, "learning_rate": 0.0001, "loss": 7.2573, "loss/crossentropy": 2.498006224632263, "loss/hidden": 2.8828125, "loss/jsd": 0.0, "loss/logits": 0.22033587098121643, "step": 6154 }, { "epoch": 0.38475, "grad_norm": 2.40625, "grad_norm_var": 0.02343928019205729, "learning_rate": 0.0001, "loss": 7.2942, "loss/crossentropy": 2.367275357246399, "loss/hidden": 2.890625, "loss/jsd": 0.0, "loss/logits": 0.21903083473443985, "step": 6156 }, { "epoch": 0.384875, "grad_norm": 2.234375, "grad_norm_var": 0.0179351806640625, "learning_rate": 0.0001, "loss": 7.1205, "loss/crossentropy": 2.4437015056610107, "loss/hidden": 2.875, "loss/jsd": 0.0, "loss/logits": 0.21279380470514297, "step": 6158 }, { "epoch": 0.385, "grad_norm": 2.0625, "grad_norm_var": 0.021907552083333334, "learning_rate": 0.0001, "loss": 7.247, "loss/crossentropy": 2.4123772382736206, "loss/hidden": 2.859375, "loss/jsd": 0.0, "loss/logits": 0.22339122742414474, "step": 6160 }, { "epoch": 0.385125, "grad_norm": 2.375, "grad_norm_var": 0.023811848958333333, "learning_rate": 0.0001, "loss": 7.186, "loss/crossentropy": 2.2593997716903687, "loss/hidden": 2.8125, "loss/jsd": 0.0, "loss/logits": 0.2121337652206421, "step": 6162 }, { "epoch": 0.38525, "grad_norm": 2.03125, "grad_norm_var": 0.024283854166666667, "learning_rate": 0.0001, "loss": 7.2505, "loss/crossentropy": 2.4139418601989746, "loss/hidden": 2.8359375, "loss/jsd": 0.0, "loss/logits": 0.2320389673113823, "step": 6164 }, { "epoch": 0.385375, "grad_norm": 2.234375, "grad_norm_var": 0.023021443684895834, "learning_rate": 0.0001, "loss": 7.1636, "loss/crossentropy": 2.300944983959198, "loss/hidden": 2.890625, "loss/jsd": 0.0, "loss/logits": 0.21375388652086258, "step": 6166 }, { "epoch": 0.3855, "grad_norm": 2.015625, "grad_norm_var": 0.0188140869140625, "learning_rate": 0.0001, "loss": 7.1753, "loss/crossentropy": 2.328981041908264, "loss/hidden": 2.875, "loss/jsd": 0.0, "loss/logits": 0.2163231372833252, "step": 6168 }, { "epoch": 0.385625, "grad_norm": 2.28125, "grad_norm_var": 0.016825358072916668, "learning_rate": 0.0001, "loss": 7.257, "loss/crossentropy": 2.253252863883972, "loss/hidden": 2.84375, "loss/jsd": 0.0, "loss/logits": 0.1982560232281685, "step": 6170 }, { "epoch": 0.38575, "grad_norm": 2.140625, "grad_norm_var": 0.01328125, "learning_rate": 0.0001, "loss": 7.1984, "loss/crossentropy": 2.4192394018173218, "loss/hidden": 2.8046875, "loss/jsd": 0.0, "loss/logits": 0.20848099142313004, "step": 6172 }, { "epoch": 0.385875, "grad_norm": 2.140625, "grad_norm_var": 0.013410441080729167, "learning_rate": 0.0001, "loss": 7.1191, "loss/crossentropy": 2.1820013523101807, "loss/hidden": 2.8046875, "loss/jsd": 0.0, "loss/logits": 0.2143269181251526, "step": 6174 }, { "epoch": 0.386, "grad_norm": 2.296875, "grad_norm_var": 0.013410441080729167, "learning_rate": 0.0001, "loss": 7.0709, "loss/crossentropy": 2.271676182746887, "loss/hidden": 2.8359375, "loss/jsd": 0.0, "loss/logits": 0.208109050989151, "step": 6176 }, { "epoch": 0.386125, "grad_norm": 2.09375, "grad_norm_var": 0.008935546875, "learning_rate": 0.0001, "loss": 7.0798, "loss/crossentropy": 2.139783024787903, "loss/hidden": 2.859375, "loss/jsd": 0.0, "loss/logits": 0.2209583893418312, "step": 6178 }, { "epoch": 0.38625, "grad_norm": 2.359375, "grad_norm_var": 0.01142578125, "learning_rate": 0.0001, "loss": 7.1879, "loss/crossentropy": 2.22269070148468, "loss/hidden": 2.7734375, "loss/jsd": 0.0, "loss/logits": 0.20203150063753128, "step": 6180 }, { "epoch": 0.386375, "grad_norm": 2.171875, "grad_norm_var": 0.015510813395182291, "learning_rate": 0.0001, "loss": 7.1266, "loss/crossentropy": 2.225795269012451, "loss/hidden": 2.890625, "loss/jsd": 0.0, "loss/logits": 0.20338183641433716, "step": 6182 }, { "epoch": 0.3865, "grad_norm": 2.125, "grad_norm_var": 0.013512929280598959, "learning_rate": 0.0001, "loss": 7.1648, "loss/crossentropy": 2.114717125892639, "loss/hidden": 2.8828125, "loss/jsd": 0.0, "loss/logits": 0.22314286977052689, "step": 6184 }, { "epoch": 0.386625, "grad_norm": 2.09375, "grad_norm_var": 0.013219960530598958, "learning_rate": 0.0001, "loss": 7.1101, "loss/crossentropy": 2.0499364137649536, "loss/hidden": 2.890625, "loss/jsd": 0.0, "loss/logits": 0.20304933190345764, "step": 6186 }, { "epoch": 0.38675, "grad_norm": 2.125, "grad_norm_var": 0.013002268473307292, "learning_rate": 0.0001, "loss": 7.1289, "loss/crossentropy": 2.2222620248794556, "loss/hidden": 2.8984375, "loss/jsd": 0.0, "loss/logits": 0.2161303237080574, "step": 6188 }, { "epoch": 0.386875, "grad_norm": 2.1875, "grad_norm_var": 0.012294260660807292, "learning_rate": 0.0001, "loss": 7.1987, "loss/crossentropy": 2.296729564666748, "loss/hidden": 2.8359375, "loss/jsd": 0.0, "loss/logits": 0.2208261713385582, "step": 6190 }, { "epoch": 0.387, "grad_norm": 2.3125, "grad_norm_var": 0.016477203369140624, "learning_rate": 0.0001, "loss": 7.4317, "loss/crossentropy": 2.4254910945892334, "loss/hidden": 2.953125, "loss/jsd": 0.0, "loss/logits": 0.2514353543519974, "step": 6192 }, { "epoch": 0.387125, "grad_norm": 2.109375, "grad_norm_var": 0.01654230753580729, "learning_rate": 0.0001, "loss": 7.1742, "loss/crossentropy": 2.1946414709091187, "loss/hidden": 2.78125, "loss/jsd": 0.0, "loss/logits": 0.20449410378932953, "step": 6194 }, { "epoch": 0.38725, "grad_norm": 2.109375, "grad_norm_var": 0.012839508056640626, "learning_rate": 0.0001, "loss": 7.1523, "loss/crossentropy": 2.1898844242095947, "loss/hidden": 2.8671875, "loss/jsd": 0.0, "loss/logits": 0.2156129777431488, "step": 6196 }, { "epoch": 0.387375, "grad_norm": 2.328125, "grad_norm_var": 0.0109527587890625, "learning_rate": 0.0001, "loss": 7.158, "loss/crossentropy": 2.100374698638916, "loss/hidden": 2.8515625, "loss/jsd": 0.0, "loss/logits": 0.21593213081359863, "step": 6198 }, { "epoch": 0.3875, "grad_norm": 1.9453125, "grad_norm_var": 0.014564768473307291, "learning_rate": 0.0001, "loss": 7.0778, "loss/crossentropy": 2.119781017303467, "loss/hidden": 2.8203125, "loss/jsd": 0.0, "loss/logits": 0.19760458916425705, "step": 6200 }, { "epoch": 0.387625, "grad_norm": 2.265625, "grad_norm_var": 0.014088694254557292, "learning_rate": 0.0001, "loss": 7.1764, "loss/crossentropy": 2.4633294343948364, "loss/hidden": 2.828125, "loss/jsd": 0.0, "loss/logits": 0.20755568891763687, "step": 6202 }, { "epoch": 0.38775, "grad_norm": 2.15625, "grad_norm_var": 0.014168039957682291, "learning_rate": 0.0001, "loss": 7.2959, "loss/crossentropy": 2.351404905319214, "loss/hidden": 2.8359375, "loss/jsd": 0.0, "loss/logits": 0.21977319568395615, "step": 6204 }, { "epoch": 0.387875, "grad_norm": 2.171875, "grad_norm_var": 0.014178212483723958, "learning_rate": 0.0001, "loss": 7.2006, "loss/crossentropy": 2.1882619857788086, "loss/hidden": 2.828125, "loss/jsd": 0.0, "loss/logits": 0.2126511111855507, "step": 6206 }, { "epoch": 0.388, "grad_norm": 2.015625, "grad_norm_var": 0.009747060139973958, "learning_rate": 0.0001, "loss": 7.1983, "loss/crossentropy": 2.2526434659957886, "loss/hidden": 2.796875, "loss/jsd": 0.0, "loss/logits": 0.2065277025103569, "step": 6208 }, { "epoch": 0.388125, "grad_norm": 2.09375, "grad_norm_var": 0.009854888916015625, "learning_rate": 0.0001, "loss": 7.1293, "loss/crossentropy": 2.3055331707000732, "loss/hidden": 2.84375, "loss/jsd": 0.0, "loss/logits": 0.2006327137351036, "step": 6210 }, { "epoch": 0.38825, "grad_norm": 2.234375, "grad_norm_var": 0.010575103759765624, "learning_rate": 0.0001, "loss": 7.2887, "loss/crossentropy": 2.175672173500061, "loss/hidden": 2.8515625, "loss/jsd": 0.0, "loss/logits": 0.21848966926336288, "step": 6212 }, { "epoch": 0.388375, "grad_norm": 2.40625, "grad_norm_var": 0.013836415608723958, "learning_rate": 0.0001, "loss": 7.2577, "loss/crossentropy": 2.3525902032852173, "loss/hidden": 2.875, "loss/jsd": 0.0, "loss/logits": 0.22647082805633545, "step": 6214 }, { "epoch": 0.3885, "grad_norm": 2.1875, "grad_norm_var": 0.020490519205729165, "learning_rate": 0.0001, "loss": 7.279, "loss/crossentropy": 2.0665590167045593, "loss/hidden": 2.828125, "loss/jsd": 0.0, "loss/logits": 0.19694138318300247, "step": 6216 }, { "epoch": 0.388625, "grad_norm": 2.125, "grad_norm_var": 0.022809855143229165, "learning_rate": 0.0001, "loss": 7.233, "loss/crossentropy": 2.0745668411254883, "loss/hidden": 2.828125, "loss/jsd": 0.0, "loss/logits": 0.20363005250692368, "step": 6218 }, { "epoch": 0.38875, "grad_norm": 2.53125, "grad_norm_var": 0.028922526041666667, "learning_rate": 0.0001, "loss": 7.3839, "loss/crossentropy": 2.038256347179413, "loss/hidden": 2.859375, "loss/jsd": 0.0, "loss/logits": 0.21672368794679642, "step": 6220 }, { "epoch": 0.388875, "grad_norm": 3.0, "grad_norm_var": 0.06468098958333333, "learning_rate": 0.0001, "loss": 7.2287, "loss/crossentropy": 2.2980682849884033, "loss/hidden": 2.84375, "loss/jsd": 0.0, "loss/logits": 0.22767220437526703, "step": 6222 }, { "epoch": 0.389, "grad_norm": 2.03125, "grad_norm_var": 0.07187398274739583, "learning_rate": 0.0001, "loss": 7.1803, "loss/crossentropy": 2.175195574760437, "loss/hidden": 2.7734375, "loss/jsd": 0.0, "loss/logits": 0.20187747478485107, "step": 6224 }, { "epoch": 0.389125, "grad_norm": 2.1875, "grad_norm_var": 0.07619400024414062, "learning_rate": 0.0001, "loss": 7.0902, "loss/crossentropy": 2.0487667322158813, "loss/hidden": 2.796875, "loss/jsd": 0.0, "loss/logits": 0.19584185630083084, "step": 6226 }, { "epoch": 0.38925, "grad_norm": 2.3125, "grad_norm_var": 0.0768450419108073, "learning_rate": 0.0001, "loss": 7.3122, "loss/crossentropy": 2.1507842540740967, "loss/hidden": 2.8203125, "loss/jsd": 0.0, "loss/logits": 0.2099941298365593, "step": 6228 }, { "epoch": 0.389375, "grad_norm": 2.265625, "grad_norm_var": 0.07747573852539062, "learning_rate": 0.0001, "loss": 7.3023, "loss/crossentropy": 2.536380410194397, "loss/hidden": 2.8984375, "loss/jsd": 0.0, "loss/logits": 0.23754464834928513, "step": 6230 }, { "epoch": 0.3895, "grad_norm": 2.09375, "grad_norm_var": 0.07687149047851563, "learning_rate": 0.0001, "loss": 7.1825, "loss/crossentropy": 2.354156494140625, "loss/hidden": 2.90625, "loss/jsd": 0.0, "loss/logits": 0.22577545046806335, "step": 6232 }, { "epoch": 0.389625, "grad_norm": 2.234375, "grad_norm_var": 0.07278416951497396, "learning_rate": 0.0001, "loss": 7.1849, "loss/crossentropy": 2.0844647884368896, "loss/hidden": 2.8125, "loss/jsd": 0.0, "loss/logits": 0.2019355520606041, "step": 6234 }, { "epoch": 0.38975, "grad_norm": 2.203125, "grad_norm_var": 0.06987279256184896, "learning_rate": 0.0001, "loss": 7.1985, "loss/crossentropy": 2.2951064109802246, "loss/hidden": 2.859375, "loss/jsd": 0.0, "loss/logits": 0.20773043483495712, "step": 6236 }, { "epoch": 0.389875, "grad_norm": 2.25, "grad_norm_var": 0.031109364827473958, "learning_rate": 0.0001, "loss": 7.227, "loss/crossentropy": 2.1024820804595947, "loss/hidden": 2.8515625, "loss/jsd": 0.0, "loss/logits": 0.2022949606180191, "step": 6238 }, { "epoch": 0.39, "grad_norm": 2.09375, "grad_norm_var": 0.014829254150390625, "learning_rate": 0.0001, "loss": 7.149, "loss/crossentropy": 2.028247654438019, "loss/hidden": 2.7734375, "loss/jsd": 0.0, "loss/logits": 0.19261081516742706, "step": 6240 }, { "epoch": 0.390125, "grad_norm": 2.953125, "grad_norm_var": 0.04890848795572917, "learning_rate": 0.0001, "loss": 7.3008, "loss/crossentropy": 2.4559611082077026, "loss/hidden": 2.8125, "loss/jsd": 0.0, "loss/logits": 0.21937407553195953, "step": 6242 }, { "epoch": 0.39025, "grad_norm": 2.046875, "grad_norm_var": 0.050191243489583336, "learning_rate": 0.0001, "loss": 7.1446, "loss/crossentropy": 2.094746768474579, "loss/hidden": 2.8203125, "loss/jsd": 0.0, "loss/logits": 0.18669873476028442, "step": 6244 }, { "epoch": 0.390375, "grad_norm": 2.328125, "grad_norm_var": 0.05080973307291667, "learning_rate": 0.0001, "loss": 7.3141, "loss/crossentropy": 2.0901660323143005, "loss/hidden": 2.8515625, "loss/jsd": 0.0, "loss/logits": 0.2054583877325058, "step": 6246 }, { "epoch": 0.3905, "grad_norm": 2.265625, "grad_norm_var": 0.04784749348958333, "learning_rate": 0.0001, "loss": 7.323, "loss/crossentropy": 2.0414637327194214, "loss/hidden": 2.828125, "loss/jsd": 0.0, "loss/logits": 0.2149074748158455, "step": 6248 }, { "epoch": 0.390625, "grad_norm": 2.3125, "grad_norm_var": 0.04676106770833333, "learning_rate": 0.0001, "loss": 7.2818, "loss/crossentropy": 2.477187752723694, "loss/hidden": 2.8515625, "loss/jsd": 0.0, "loss/logits": 0.22341058403253555, "step": 6250 }, { "epoch": 0.39075, "grad_norm": 2.171875, "grad_norm_var": 0.04804280598958333, "learning_rate": 0.0001, "loss": 7.0282, "loss/crossentropy": 2.167757034301758, "loss/hidden": 2.8125, "loss/jsd": 0.0, "loss/logits": 0.19077523797750473, "step": 6252 }, { "epoch": 0.390875, "grad_norm": 1.984375, "grad_norm_var": 0.05100809733072917, "learning_rate": 0.0001, "loss": 7.2128, "loss/crossentropy": 2.4444233179092407, "loss/hidden": 2.8828125, "loss/jsd": 0.0, "loss/logits": 0.21900728344917297, "step": 6254 }, { "epoch": 0.391, "grad_norm": 2.15625, "grad_norm_var": 0.0484771728515625, "learning_rate": 0.0001, "loss": 7.1528, "loss/crossentropy": 2.353291869163513, "loss/hidden": 2.8046875, "loss/jsd": 0.0, "loss/logits": 0.21415339410305023, "step": 6256 }, { "epoch": 0.391125, "grad_norm": 2.171875, "grad_norm_var": 0.015192667643229166, "learning_rate": 0.0001, "loss": 7.091, "loss/crossentropy": 2.2490928173065186, "loss/hidden": 2.84375, "loss/jsd": 0.0, "loss/logits": 0.2167230099439621, "step": 6258 }, { "epoch": 0.39125, "grad_norm": 2.21875, "grad_norm_var": 0.013704427083333333, "learning_rate": 0.0001, "loss": 7.3306, "loss/crossentropy": 2.4165178537368774, "loss/hidden": 2.8671875, "loss/jsd": 0.0, "loss/logits": 0.2405092567205429, "step": 6260 }, { "epoch": 0.391375, "grad_norm": 5.4375, "grad_norm_var": 0.8871378580729167, "learning_rate": 0.0001, "loss": 7.3884, "loss/crossentropy": 2.242551565170288, "loss/hidden": 2.875, "loss/jsd": 0.0, "loss/logits": 0.23066271841526031, "step": 6262 }, { "epoch": 0.3915, "grad_norm": 13.0625, "grad_norm_var": 7.71259765625, "learning_rate": 0.0001, "loss": 7.8466, "loss/crossentropy": 2.4470694065093994, "loss/hidden": 2.859375, "loss/jsd": 0.0, "loss/logits": 0.26077815890312195, "step": 6264 }, { "epoch": 0.391625, "grad_norm": 2.625, "grad_norm_var": 7.606929524739583, "learning_rate": 0.0001, "loss": 7.3717, "loss/crossentropy": 2.3733749389648438, "loss/hidden": 2.875, "loss/jsd": 0.0, "loss/logits": 0.23013299703598022, "step": 6266 }, { "epoch": 0.39175, "grad_norm": 2.46875, "grad_norm_var": 7.546891276041666, "learning_rate": 0.0001, "loss": 7.2616, "loss/crossentropy": 1.922485888004303, "loss/hidden": 2.8359375, "loss/jsd": 0.0, "loss/logits": 0.2019965946674347, "step": 6268 }, { "epoch": 0.391875, "grad_norm": 2.015625, "grad_norm_var": 7.520503743489583, "learning_rate": 0.0001, "loss": 7.1054, "loss/crossentropy": 2.258482575416565, "loss/hidden": 2.890625, "loss/jsd": 0.0, "loss/logits": 0.22108732908964157, "step": 6270 }, { "epoch": 0.392, "grad_norm": 2.515625, "grad_norm_var": 7.477079264322916, "learning_rate": 0.0001, "loss": 7.0443, "loss/crossentropy": 2.2347971200942993, "loss/hidden": 2.8203125, "loss/jsd": 0.0, "loss/logits": 0.21819213032722473, "step": 6272 }, { "epoch": 0.392125, "grad_norm": 2.15625, "grad_norm_var": 7.528499348958333, "learning_rate": 0.0001, "loss": 7.2214, "loss/crossentropy": 2.2647920846939087, "loss/hidden": 2.8203125, "loss/jsd": 0.0, "loss/logits": 0.21231649070978165, "step": 6274 }, { "epoch": 0.39225, "grad_norm": 2.390625, "grad_norm_var": 7.526005045572917, "learning_rate": 0.0001, "loss": 7.4141, "loss/crossentropy": 2.3626712560653687, "loss/hidden": 2.8125, "loss/jsd": 0.0, "loss/logits": 0.21143031865358353, "step": 6276 }, { "epoch": 0.392375, "grad_norm": 2.515625, "grad_norm_var": 7.242333984375, "learning_rate": 0.0001, "loss": 7.2001, "loss/crossentropy": 2.018476188182831, "loss/hidden": 2.859375, "loss/jsd": 0.0, "loss/logits": 0.20615462213754654, "step": 6278 }, { "epoch": 0.3925, "grad_norm": 2.09375, "grad_norm_var": 0.07892252604166666, "learning_rate": 0.0001, "loss": 7.1264, "loss/crossentropy": 2.347910761833191, "loss/hidden": 2.8515625, "loss/jsd": 0.0, "loss/logits": 0.2185712829232216, "step": 6280 }, { "epoch": 0.392625, "grad_norm": 2.1875, "grad_norm_var": 0.027339680989583334, "learning_rate": 0.0001, "loss": 7.4132, "loss/crossentropy": 2.291501462459564, "loss/hidden": 2.8515625, "loss/jsd": 0.0, "loss/logits": 0.21928339451551437, "step": 6282 }, { "epoch": 0.39275, "grad_norm": 2.234375, "grad_norm_var": 0.0239166259765625, "learning_rate": 0.0001, "loss": 7.3247, "loss/crossentropy": 2.402106761932373, "loss/hidden": 2.9453125, "loss/jsd": 0.0, "loss/logits": 0.2256387323141098, "step": 6284 }, { "epoch": 0.392875, "grad_norm": 2.234375, "grad_norm_var": 0.020929972330729168, "learning_rate": 0.0001, "loss": 7.3261, "loss/crossentropy": 2.5922293663024902, "loss/hidden": 2.875, "loss/jsd": 0.0, "loss/logits": 0.23517914861440659, "step": 6286 }, { "epoch": 0.393, "grad_norm": 2.0625, "grad_norm_var": 0.018831380208333335, "learning_rate": 0.0001, "loss": 6.9919, "loss/crossentropy": 1.85099458694458, "loss/hidden": 2.921875, "loss/jsd": 0.0, "loss/logits": 0.19639435410499573, "step": 6288 }, { "epoch": 0.393125, "grad_norm": 2.109375, "grad_norm_var": 0.019466145833333334, "learning_rate": 0.0001, "loss": 7.1741, "loss/crossentropy": 2.6446659564971924, "loss/hidden": 2.859375, "loss/jsd": 0.0, "loss/logits": 0.20629265159368515, "step": 6290 }, { "epoch": 0.39325, "grad_norm": 2.1875, "grad_norm_var": 0.0166656494140625, "learning_rate": 0.0001, "loss": 7.0529, "loss/crossentropy": 2.020451545715332, "loss/hidden": 2.8671875, "loss/jsd": 0.0, "loss/logits": 0.2004483938217163, "step": 6292 }, { "epoch": 0.393375, "grad_norm": 2.296875, "grad_norm_var": 0.010456339518229166, "learning_rate": 0.0001, "loss": 7.1559, "loss/crossentropy": 2.37979257106781, "loss/hidden": 2.8359375, "loss/jsd": 0.0, "loss/logits": 0.23029006272554398, "step": 6294 }, { "epoch": 0.3935, "grad_norm": 2.265625, "grad_norm_var": 0.009598795572916667, "learning_rate": 0.0001, "loss": 7.1597, "loss/crossentropy": 1.9197645783424377, "loss/hidden": 2.796875, "loss/jsd": 0.0, "loss/logits": 0.18409430235624313, "step": 6296 }, { "epoch": 0.393625, "grad_norm": 2.203125, "grad_norm_var": 0.0141510009765625, "learning_rate": 0.0001, "loss": 7.2572, "loss/crossentropy": 2.4349591732025146, "loss/hidden": 2.8828125, "loss/jsd": 0.0, "loss/logits": 0.2220984399318695, "step": 6298 }, { "epoch": 0.39375, "grad_norm": 2.140625, "grad_norm_var": 0.018131256103515625, "learning_rate": 0.0001, "loss": 7.2722, "loss/crossentropy": 2.2742892503738403, "loss/hidden": 2.921875, "loss/jsd": 0.0, "loss/logits": 0.2200971096754074, "step": 6300 }, { "epoch": 0.393875, "grad_norm": 2.25, "grad_norm_var": 0.01762669881184896, "learning_rate": 0.0001, "loss": 7.1513, "loss/crossentropy": 2.224164128303528, "loss/hidden": 2.8359375, "loss/jsd": 0.0, "loss/logits": 0.21906382590532303, "step": 6302 }, { "epoch": 0.394, "grad_norm": 2.203125, "grad_norm_var": 0.01935399373372396, "learning_rate": 0.0001, "loss": 7.4193, "loss/crossentropy": 2.4383881092071533, "loss/hidden": 2.8671875, "loss/jsd": 0.0, "loss/logits": 0.21790315955877304, "step": 6304 }, { "epoch": 0.394125, "grad_norm": 2.0625, "grad_norm_var": 0.020763905843098958, "learning_rate": 0.0001, "loss": 7.258, "loss/crossentropy": 2.2842483520507812, "loss/hidden": 2.8359375, "loss/jsd": 0.0, "loss/logits": 0.21311689168214798, "step": 6306 }, { "epoch": 0.39425, "grad_norm": 2.171875, "grad_norm_var": 0.022474924723307293, "learning_rate": 0.0001, "loss": 7.2049, "loss/crossentropy": 2.2808109521865845, "loss/hidden": 2.8671875, "loss/jsd": 0.0, "loss/logits": 0.2112395316362381, "step": 6308 }, { "epoch": 0.394375, "grad_norm": 2.34375, "grad_norm_var": 0.023158518473307292, "learning_rate": 0.0001, "loss": 7.0865, "loss/crossentropy": 2.1974657773971558, "loss/hidden": 2.765625, "loss/jsd": 0.0, "loss/logits": 0.20438392460346222, "step": 6310 }, { "epoch": 0.3945, "grad_norm": 2.125, "grad_norm_var": 0.034126536051432295, "learning_rate": 0.0001, "loss": 7.1925, "loss/crossentropy": 2.0605512261390686, "loss/hidden": 2.828125, "loss/jsd": 0.0, "loss/logits": 0.20864906907081604, "step": 6312 }, { "epoch": 0.394625, "grad_norm": 2.171875, "grad_norm_var": 0.029797108968098958, "learning_rate": 0.0001, "loss": 7.0857, "loss/crossentropy": 2.132881999015808, "loss/hidden": 2.8203125, "loss/jsd": 0.0, "loss/logits": 0.19599150866270065, "step": 6314 }, { "epoch": 0.39475, "grad_norm": 2.09375, "grad_norm_var": 0.025439453125, "learning_rate": 0.0001, "loss": 7.2386, "loss/crossentropy": 2.266274333000183, "loss/hidden": 2.8828125, "loss/jsd": 0.0, "loss/logits": 0.234784334897995, "step": 6316 }, { "epoch": 0.394875, "grad_norm": 2.15625, "grad_norm_var": 0.025397745768229167, "learning_rate": 0.0001, "loss": 7.3488, "loss/crossentropy": 2.5363104343414307, "loss/hidden": 2.859375, "loss/jsd": 0.0, "loss/logits": 0.2150138020515442, "step": 6318 }, { "epoch": 0.395, "grad_norm": 2.140625, "grad_norm_var": 0.027229817708333333, "learning_rate": 0.0001, "loss": 7.2938, "loss/crossentropy": 2.313738703727722, "loss/hidden": 2.84375, "loss/jsd": 0.0, "loss/logits": 0.21550405770540237, "step": 6320 }, { "epoch": 0.395125, "grad_norm": 2.03125, "grad_norm_var": 0.02779541015625, "learning_rate": 0.0001, "loss": 7.1882, "loss/crossentropy": 2.285131096839905, "loss/hidden": 2.859375, "loss/jsd": 0.0, "loss/logits": 0.20362475514411926, "step": 6322 }, { "epoch": 0.39525, "grad_norm": 2.203125, "grad_norm_var": 0.026790364583333334, "learning_rate": 0.0001, "loss": 7.2794, "loss/crossentropy": 2.522578239440918, "loss/hidden": 2.90625, "loss/jsd": 0.0, "loss/logits": 0.2191583439707756, "step": 6324 }, { "epoch": 0.395375, "grad_norm": 2.203125, "grad_norm_var": 0.025755818684895834, "learning_rate": 0.0001, "loss": 7.1791, "loss/crossentropy": 2.460786819458008, "loss/hidden": 2.90625, "loss/jsd": 0.0, "loss/logits": 0.2294606864452362, "step": 6326 }, { "epoch": 0.3955, "grad_norm": 2.140625, "grad_norm_var": 0.01246337890625, "learning_rate": 0.0001, "loss": 7.1824, "loss/crossentropy": 2.282975435256958, "loss/hidden": 2.8125, "loss/jsd": 0.0, "loss/logits": 0.22038684040308, "step": 6328 }, { "epoch": 0.395625, "grad_norm": 2.21875, "grad_norm_var": 0.012165323893229166, "learning_rate": 0.0001, "loss": 7.129, "loss/crossentropy": 2.178337335586548, "loss/hidden": 2.828125, "loss/jsd": 0.0, "loss/logits": 0.2119307518005371, "step": 6330 }, { "epoch": 0.39575, "grad_norm": 2.328125, "grad_norm_var": 0.013981119791666666, "learning_rate": 0.0001, "loss": 7.311, "loss/crossentropy": 2.2752076387405396, "loss/hidden": 2.8515625, "loss/jsd": 0.0, "loss/logits": 0.20718948543071747, "step": 6332 }, { "epoch": 0.395875, "grad_norm": 2.625, "grad_norm_var": 0.025113932291666665, "learning_rate": 0.0001, "loss": 7.2538, "loss/crossentropy": 2.2171541452407837, "loss/hidden": 2.8203125, "loss/jsd": 0.0, "loss/logits": 0.2145935222506523, "step": 6334 }, { "epoch": 0.396, "grad_norm": 2.3125, "grad_norm_var": 0.03355204264322917, "learning_rate": 0.0001, "loss": 7.3479, "loss/crossentropy": 2.221325159072876, "loss/hidden": 2.8515625, "loss/jsd": 0.0, "loss/logits": 0.22985991835594177, "step": 6336 }, { "epoch": 0.396125, "grad_norm": 2.203125, "grad_norm_var": 0.028425089518229165, "learning_rate": 0.0001, "loss": 7.2308, "loss/crossentropy": 2.456564784049988, "loss/hidden": 2.8515625, "loss/jsd": 0.0, "loss/logits": 0.21483594179153442, "step": 6338 }, { "epoch": 0.39625, "grad_norm": 2.15625, "grad_norm_var": 0.0310455322265625, "learning_rate": 0.0001, "loss": 7.1604, "loss/crossentropy": 2.1799052953720093, "loss/hidden": 2.7890625, "loss/jsd": 0.0, "loss/logits": 0.2172977179288864, "step": 6340 }, { "epoch": 0.396375, "grad_norm": 2.15625, "grad_norm_var": 0.0314453125, "learning_rate": 0.0001, "loss": 7.3171, "loss/crossentropy": 2.308402419090271, "loss/hidden": 2.84375, "loss/jsd": 0.0, "loss/logits": 0.21601353585720062, "step": 6342 }, { "epoch": 0.3965, "grad_norm": 2.3125, "grad_norm_var": 0.03222249348958333, "learning_rate": 0.0001, "loss": 7.2262, "loss/crossentropy": 2.40183162689209, "loss/hidden": 2.8515625, "loss/jsd": 0.0, "loss/logits": 0.2093559354543686, "step": 6344 }, { "epoch": 0.396625, "grad_norm": 1.984375, "grad_norm_var": 0.034651692708333334, "learning_rate": 0.0001, "loss": 7.0315, "loss/crossentropy": 1.9722952842712402, "loss/hidden": 2.7890625, "loss/jsd": 0.0, "loss/logits": 0.19374338537454605, "step": 6346 }, { "epoch": 0.39675, "grad_norm": 2.234375, "grad_norm_var": 0.03699544270833333, "learning_rate": 0.0001, "loss": 7.1221, "loss/crossentropy": 2.2486064434051514, "loss/hidden": 2.8125, "loss/jsd": 0.0, "loss/logits": 0.19635196030139923, "step": 6348 }, { "epoch": 0.396875, "grad_norm": 2.15625, "grad_norm_var": 0.025829060872395834, "learning_rate": 0.0001, "loss": 7.1707, "loss/crossentropy": 2.4905707836151123, "loss/hidden": 2.953125, "loss/jsd": 0.0, "loss/logits": 0.23304647207260132, "step": 6350 }, { "epoch": 0.397, "grad_norm": 2.09375, "grad_norm_var": 0.0089996337890625, "learning_rate": 0.0001, "loss": 7.1753, "loss/crossentropy": 2.437414765357971, "loss/hidden": 2.8984375, "loss/jsd": 0.0, "loss/logits": 0.2328067496418953, "step": 6352 }, { "epoch": 0.397125, "grad_norm": 2.03125, "grad_norm_var": 0.011942545572916666, "learning_rate": 0.0001, "loss": 7.0693, "loss/crossentropy": 2.294760227203369, "loss/hidden": 2.8125, "loss/jsd": 0.0, "loss/logits": 0.21324169635772705, "step": 6354 }, { "epoch": 0.39725, "grad_norm": 2.34375, "grad_norm_var": 0.01558837890625, "learning_rate": 0.0001, "loss": 7.1353, "loss/crossentropy": 2.355687439441681, "loss/hidden": 2.859375, "loss/jsd": 0.0, "loss/logits": 0.2057061642408371, "step": 6356 }, { "epoch": 0.397375, "grad_norm": 1.96875, "grad_norm_var": 0.0186920166015625, "learning_rate": 0.0001, "loss": 7.1041, "loss/crossentropy": 2.3166561126708984, "loss/hidden": 2.8125, "loss/jsd": 0.0, "loss/logits": 0.19575699418783188, "step": 6358 }, { "epoch": 0.3975, "grad_norm": 2.21875, "grad_norm_var": 0.0171539306640625, "learning_rate": 0.0001, "loss": 6.97, "loss/crossentropy": 2.255427122116089, "loss/hidden": 2.7578125, "loss/jsd": 0.0, "loss/logits": 0.19496920704841614, "step": 6360 }, { "epoch": 0.397625, "grad_norm": 2.09375, "grad_norm_var": 0.016926066080729166, "learning_rate": 0.0001, "loss": 7.276, "loss/crossentropy": 2.5599944591522217, "loss/hidden": 2.9453125, "loss/jsd": 0.0, "loss/logits": 0.24714961647987366, "step": 6362 }, { "epoch": 0.39775, "grad_norm": 2.234375, "grad_norm_var": 0.01539306640625, "learning_rate": 0.0001, "loss": 7.1811, "loss/crossentropy": 2.349897623062134, "loss/hidden": 2.859375, "loss/jsd": 0.0, "loss/logits": 0.21329647302627563, "step": 6364 }, { "epoch": 0.397875, "grad_norm": 2.046875, "grad_norm_var": 0.01549072265625, "learning_rate": 0.0001, "loss": 7.1841, "loss/crossentropy": 2.190832495689392, "loss/hidden": 2.8671875, "loss/jsd": 0.0, "loss/logits": 0.219038225710392, "step": 6366 }, { "epoch": 0.398, "grad_norm": 2.265625, "grad_norm_var": 0.0158599853515625, "learning_rate": 0.0001, "loss": 7.2998, "loss/crossentropy": 2.4349499940872192, "loss/hidden": 2.84375, "loss/jsd": 0.0, "loss/logits": 0.21405059099197388, "step": 6368 }, { "epoch": 0.398125, "grad_norm": 2.234375, "grad_norm_var": 0.01129150390625, "learning_rate": 0.0001, "loss": 7.2085, "loss/crossentropy": 2.380038619041443, "loss/hidden": 2.859375, "loss/jsd": 0.0, "loss/logits": 0.21820517629384995, "step": 6370 }, { "epoch": 0.39825, "grad_norm": 2.171875, "grad_norm_var": 0.009186808268229167, "learning_rate": 0.0001, "loss": 7.1465, "loss/crossentropy": 2.311740756034851, "loss/hidden": 2.796875, "loss/jsd": 0.0, "loss/logits": 0.20036467909812927, "step": 6372 }, { "epoch": 0.398375, "grad_norm": 2.203125, "grad_norm_var": 0.005387369791666667, "learning_rate": 0.0001, "loss": 7.0727, "loss/crossentropy": 2.3335254192352295, "loss/hidden": 2.796875, "loss/jsd": 0.0, "loss/logits": 0.21295969933271408, "step": 6374 }, { "epoch": 0.3985, "grad_norm": 2.109375, "grad_norm_var": 0.00562744140625, "learning_rate": 0.0001, "loss": 7.2106, "loss/crossentropy": 2.2149945497512817, "loss/hidden": 2.8203125, "loss/jsd": 0.0, "loss/logits": 0.20361988991498947, "step": 6376 }, { "epoch": 0.398625, "grad_norm": 2.140625, "grad_norm_var": 0.005659993489583333, "learning_rate": 0.0001, "loss": 7.062, "loss/crossentropy": 2.217359721660614, "loss/hidden": 2.796875, "loss/jsd": 0.0, "loss/logits": 0.206251360476017, "step": 6378 }, { "epoch": 0.39875, "grad_norm": 2.15625, "grad_norm_var": 0.006461588541666666, "learning_rate": 0.0001, "loss": 7.0996, "loss/crossentropy": 2.3303332328796387, "loss/hidden": 2.90625, "loss/jsd": 0.0, "loss/logits": 0.22268137335777283, "step": 6380 }, { "epoch": 0.398875, "grad_norm": 2.125, "grad_norm_var": 0.0055572509765625, "learning_rate": 0.0001, "loss": 7.263, "loss/crossentropy": 2.365337371826172, "loss/hidden": 2.8203125, "loss/jsd": 0.0, "loss/logits": 0.21782244741916656, "step": 6382 }, { "epoch": 0.399, "grad_norm": 2.171875, "grad_norm_var": 0.010190582275390625, "learning_rate": 0.0001, "loss": 7.1806, "loss/crossentropy": 2.1229239106178284, "loss/hidden": 2.828125, "loss/jsd": 0.0, "loss/logits": 0.2005736082792282, "step": 6384 }, { "epoch": 0.399125, "grad_norm": 2.203125, "grad_norm_var": 0.009224192301432291, "learning_rate": 0.0001, "loss": 7.1557, "loss/crossentropy": 2.10502552986145, "loss/hidden": 2.78125, "loss/jsd": 0.0, "loss/logits": 0.21637042611837387, "step": 6386 }, { "epoch": 0.39925, "grad_norm": 2.296875, "grad_norm_var": 0.010229237874348958, "learning_rate": 0.0001, "loss": 7.3129, "loss/crossentropy": 2.272659182548523, "loss/hidden": 2.8515625, "loss/jsd": 0.0, "loss/logits": 0.20518405735492706, "step": 6388 }, { "epoch": 0.399375, "grad_norm": 2.109375, "grad_norm_var": 0.011130523681640626, "learning_rate": 0.0001, "loss": 7.2501, "loss/crossentropy": 2.0195122957229614, "loss/hidden": 2.859375, "loss/jsd": 0.0, "loss/logits": 0.19787423312664032, "step": 6390 }, { "epoch": 0.3995, "grad_norm": 2.015625, "grad_norm_var": 0.012631988525390625, "learning_rate": 0.0001, "loss": 6.965, "loss/crossentropy": 2.0998696088790894, "loss/hidden": 2.84375, "loss/jsd": 0.0, "loss/logits": 0.18977724760770798, "step": 6392 }, { "epoch": 0.399625, "grad_norm": 2.078125, "grad_norm_var": 0.011926015218098959, "learning_rate": 0.0001, "loss": 7.1537, "loss/crossentropy": 2.5123926401138306, "loss/hidden": 2.8515625, "loss/jsd": 0.0, "loss/logits": 0.22189470380544662, "step": 6394 }, { "epoch": 0.39975, "grad_norm": 2.546875, "grad_norm_var": 0.02241999308268229, "learning_rate": 0.0001, "loss": 7.145, "loss/crossentropy": 1.997516393661499, "loss/hidden": 2.828125, "loss/jsd": 0.0, "loss/logits": 0.19417890161275864, "step": 6396 }, { "epoch": 0.399875, "grad_norm": 2.0625, "grad_norm_var": 0.023083241780598958, "learning_rate": 0.0001, "loss": 7.0326, "loss/crossentropy": 2.183876097202301, "loss/hidden": 2.84375, "loss/jsd": 0.0, "loss/logits": 0.2068372145295143, "step": 6398 }, { "epoch": 0.4, "grad_norm": 2.125, "grad_norm_var": 0.021825154622395832, "learning_rate": 0.0001, "loss": 7.0986, "loss/crossentropy": 2.2808289527893066, "loss/hidden": 2.8125, "loss/jsd": 0.0, "loss/logits": 0.21026340126991272, "step": 6400 }, { "epoch": 0.400125, "grad_norm": 2.15625, "grad_norm_var": 0.021761067708333335, "learning_rate": 0.0001, "loss": 7.2804, "loss/crossentropy": 2.11923885345459, "loss/hidden": 2.8203125, "loss/jsd": 0.0, "loss/logits": 0.2049631029367447, "step": 6402 }, { "epoch": 0.40025, "grad_norm": 2.125, "grad_norm_var": 0.020601399739583335, "learning_rate": 0.0001, "loss": 7.1831, "loss/crossentropy": 2.2619943022727966, "loss/hidden": 2.8125, "loss/jsd": 0.0, "loss/logits": 0.22322535514831543, "step": 6404 }, { "epoch": 0.400375, "grad_norm": 1.9921875, "grad_norm_var": 0.02127049763997396, "learning_rate": 0.0001, "loss": 7.0207, "loss/crossentropy": 1.81930810213089, "loss/hidden": 2.7734375, "loss/jsd": 0.0, "loss/logits": 0.18869058787822723, "step": 6406 }, { "epoch": 0.4005, "grad_norm": 2.296875, "grad_norm_var": 0.02247899373372396, "learning_rate": 0.0001, "loss": 7.3292, "loss/crossentropy": 2.421749472618103, "loss/hidden": 2.8984375, "loss/jsd": 0.0, "loss/logits": 0.21702227741479874, "step": 6408 }, { "epoch": 0.400625, "grad_norm": 1.9921875, "grad_norm_var": 0.024112955729166666, "learning_rate": 0.0001, "loss": 7.2174, "loss/crossentropy": 2.363996148109436, "loss/hidden": 2.8125, "loss/jsd": 0.0, "loss/logits": 0.20497220754623413, "step": 6410 }, { "epoch": 0.40075, "grad_norm": 2.1875, "grad_norm_var": 0.011481730143229167, "learning_rate": 0.0001, "loss": 7.2747, "loss/crossentropy": 2.1107038259506226, "loss/hidden": 2.8671875, "loss/jsd": 0.0, "loss/logits": 0.2324059084057808, "step": 6412 }, { "epoch": 0.400875, "grad_norm": 2.015625, "grad_norm_var": 0.011673990885416667, "learning_rate": 0.0001, "loss": 7.1866, "loss/crossentropy": 2.1150999069213867, "loss/hidden": 2.875, "loss/jsd": 0.0, "loss/logits": 0.2122773751616478, "step": 6414 }, { "epoch": 0.401, "grad_norm": 2.40625, "grad_norm_var": 0.013849894205729166, "learning_rate": 0.0001, "loss": 7.2246, "loss/crossentropy": 2.143269181251526, "loss/hidden": 2.78125, "loss/jsd": 0.0, "loss/logits": 0.2062653973698616, "step": 6416 }, { "epoch": 0.401125, "grad_norm": 2.15625, "grad_norm_var": 0.018065388997395834, "learning_rate": 0.0001, "loss": 7.317, "loss/crossentropy": 2.1853350400924683, "loss/hidden": 2.828125, "loss/jsd": 0.0, "loss/logits": 0.1954660639166832, "step": 6418 }, { "epoch": 0.40125, "grad_norm": 2.109375, "grad_norm_var": 0.018114217122395835, "learning_rate": 0.0001, "loss": 7.1075, "loss/crossentropy": 2.1452205181121826, "loss/hidden": 2.796875, "loss/jsd": 0.0, "loss/logits": 0.2259805053472519, "step": 6420 }, { "epoch": 0.401375, "grad_norm": 2.15625, "grad_norm_var": 0.015215810139973958, "learning_rate": 0.0001, "loss": 7.1589, "loss/crossentropy": 2.5433114767074585, "loss/hidden": 2.828125, "loss/jsd": 0.0, "loss/logits": 0.20600835233926773, "step": 6422 }, { "epoch": 0.4015, "grad_norm": 1.9453125, "grad_norm_var": 0.016356404622395834, "learning_rate": 0.0001, "loss": 7.0635, "loss/crossentropy": 1.9214220643043518, "loss/hidden": 2.8359375, "loss/jsd": 0.0, "loss/logits": 0.18052654713392258, "step": 6424 }, { "epoch": 0.401625, "grad_norm": 2.3125, "grad_norm_var": 0.015728505452473958, "learning_rate": 0.0001, "loss": 7.2356, "loss/crossentropy": 2.359765887260437, "loss/hidden": 2.8203125, "loss/jsd": 0.0, "loss/logits": 0.22128498554229736, "step": 6426 }, { "epoch": 0.40175, "grad_norm": 2.0625, "grad_norm_var": 0.017470041910807293, "learning_rate": 0.0001, "loss": 7.2596, "loss/crossentropy": 2.2433066368103027, "loss/hidden": 2.875, "loss/jsd": 0.0, "loss/logits": 0.2121361345052719, "step": 6428 }, { "epoch": 0.401875, "grad_norm": 2.5, "grad_norm_var": 0.022226715087890626, "learning_rate": 0.0001, "loss": 7.2592, "loss/crossentropy": 2.251939296722412, "loss/hidden": 2.78125, "loss/jsd": 0.0, "loss/logits": 0.21200115978717804, "step": 6430 }, { "epoch": 0.402, "grad_norm": 2.1875, "grad_norm_var": 0.019909413655598958, "learning_rate": 0.0001, "loss": 7.1853, "loss/crossentropy": 2.20802104473114, "loss/hidden": 2.828125, "loss/jsd": 0.0, "loss/logits": 0.2019861936569214, "step": 6432 }, { "epoch": 0.402125, "grad_norm": 2.25, "grad_norm_var": 0.01645075480143229, "learning_rate": 0.0001, "loss": 7.1294, "loss/crossentropy": 2.509099006652832, "loss/hidden": 2.8515625, "loss/jsd": 0.0, "loss/logits": 0.22067614644765854, "step": 6434 }, { "epoch": 0.40225, "grad_norm": 2.515625, "grad_norm_var": 0.025785319010416665, "learning_rate": 0.0001, "loss": 7.0552, "loss/crossentropy": 2.2592573165893555, "loss/hidden": 2.859375, "loss/jsd": 0.0, "loss/logits": 0.21897969394922256, "step": 6436 }, { "epoch": 0.402375, "grad_norm": 2.296875, "grad_norm_var": 0.027347819010416666, "learning_rate": 0.0001, "loss": 7.3063, "loss/crossentropy": 2.221360445022583, "loss/hidden": 2.8671875, "loss/jsd": 0.0, "loss/logits": 0.21196063607931137, "step": 6438 }, { "epoch": 0.4025, "grad_norm": 2.03125, "grad_norm_var": 0.025294748942057292, "learning_rate": 0.0001, "loss": 7.1702, "loss/crossentropy": 2.428785562515259, "loss/hidden": 2.8828125, "loss/jsd": 0.0, "loss/logits": 0.22930145263671875, "step": 6440 }, { "epoch": 0.402625, "grad_norm": 2.09375, "grad_norm_var": 0.02661921183268229, "learning_rate": 0.0001, "loss": 7.2067, "loss/crossentropy": 2.287087917327881, "loss/hidden": 2.828125, "loss/jsd": 0.0, "loss/logits": 0.18991201370954514, "step": 6442 }, { "epoch": 0.40275, "grad_norm": 2.09375, "grad_norm_var": 0.02487767537434896, "learning_rate": 0.0001, "loss": 7.0066, "loss/crossentropy": 2.274364471435547, "loss/hidden": 2.828125, "loss/jsd": 0.0, "loss/logits": 0.21028787642717361, "step": 6444 }, { "epoch": 0.402875, "grad_norm": 2.40625, "grad_norm_var": 0.02175877888997396, "learning_rate": 0.0001, "loss": 7.1773, "loss/crossentropy": 2.046273946762085, "loss/hidden": 2.7890625, "loss/jsd": 0.0, "loss/logits": 0.19751162081956863, "step": 6446 }, { "epoch": 0.403, "grad_norm": 2.140625, "grad_norm_var": 0.023178863525390624, "learning_rate": 0.0001, "loss": 7.2427, "loss/crossentropy": 2.1227652430534363, "loss/hidden": 2.75, "loss/jsd": 0.0, "loss/logits": 0.19309522956609726, "step": 6448 }, { "epoch": 0.403125, "grad_norm": 2.203125, "grad_norm_var": 0.02542088826497396, "learning_rate": 0.0001, "loss": 7.0169, "loss/crossentropy": 2.1013576984405518, "loss/hidden": 2.765625, "loss/jsd": 0.0, "loss/logits": 0.2055366113781929, "step": 6450 }, { "epoch": 0.40325, "grad_norm": 2.234375, "grad_norm_var": 0.016097005208333334, "learning_rate": 0.0001, "loss": 7.2335, "loss/crossentropy": 2.3643540143966675, "loss/hidden": 2.84375, "loss/jsd": 0.0, "loss/logits": 0.21434146910905838, "step": 6452 }, { "epoch": 0.403375, "grad_norm": 2.171875, "grad_norm_var": 0.013102213541666666, "learning_rate": 0.0001, "loss": 7.1063, "loss/crossentropy": 2.2940242290496826, "loss/hidden": 2.8515625, "loss/jsd": 0.0, "loss/logits": 0.21527153253555298, "step": 6454 }, { "epoch": 0.4035, "grad_norm": 2.015625, "grad_norm_var": 0.01363525390625, "learning_rate": 0.0001, "loss": 7.1233, "loss/crossentropy": 2.432578206062317, "loss/hidden": 2.8828125, "loss/jsd": 0.0, "loss/logits": 0.21458515524864197, "step": 6456 }, { "epoch": 0.403625, "grad_norm": 2.515625, "grad_norm_var": 0.0198150634765625, "learning_rate": 0.0001, "loss": 7.161, "loss/crossentropy": 2.0361666083335876, "loss/hidden": 2.8203125, "loss/jsd": 0.0, "loss/logits": 0.21390791982412338, "step": 6458 }, { "epoch": 0.40375, "grad_norm": 2.28125, "grad_norm_var": 0.0194000244140625, "learning_rate": 0.0001, "loss": 7.1093, "loss/crossentropy": 2.171969771385193, "loss/hidden": 2.796875, "loss/jsd": 0.0, "loss/logits": 0.22019729018211365, "step": 6460 }, { "epoch": 0.403875, "grad_norm": 2.15625, "grad_norm_var": 0.0191314697265625, "learning_rate": 0.0001, "loss": 7.1831, "loss/crossentropy": 2.086853861808777, "loss/hidden": 2.8671875, "loss/jsd": 0.0, "loss/logits": 0.21678819507360458, "step": 6462 }, { "epoch": 0.404, "grad_norm": 2.078125, "grad_norm_var": 0.017015584309895835, "learning_rate": 0.0001, "loss": 7.0841, "loss/crossentropy": 2.077217400074005, "loss/hidden": 2.8125, "loss/jsd": 0.0, "loss/logits": 0.1952114701271057, "step": 6464 }, { "epoch": 0.404125, "grad_norm": 2.390625, "grad_norm_var": 0.018431599934895834, "learning_rate": 0.0001, "loss": 7.3492, "loss/crossentropy": 2.162355422973633, "loss/hidden": 2.8671875, "loss/jsd": 0.0, "loss/logits": 0.20112480223178864, "step": 6466 }, { "epoch": 0.40425, "grad_norm": 2.296875, "grad_norm_var": 0.018895467122395832, "learning_rate": 0.0001, "loss": 7.2183, "loss/crossentropy": 2.3603663444519043, "loss/hidden": 2.8515625, "loss/jsd": 0.0, "loss/logits": 0.22803721576929092, "step": 6468 }, { "epoch": 0.404375, "grad_norm": 2.265625, "grad_norm_var": 0.0194976806640625, "learning_rate": 0.0001, "loss": 7.4027, "loss/crossentropy": 2.0049321055412292, "loss/hidden": 2.8671875, "loss/jsd": 0.0, "loss/logits": 0.22556506842374802, "step": 6470 }, { "epoch": 0.4045, "grad_norm": 2.1875, "grad_norm_var": 0.01796875, "learning_rate": 0.0001, "loss": 7.0518, "loss/crossentropy": 2.002595067024231, "loss/hidden": 2.84375, "loss/jsd": 0.0, "loss/logits": 0.19405024498701096, "step": 6472 }, { "epoch": 0.404625, "grad_norm": 2.0625, "grad_norm_var": 0.015262858072916666, "learning_rate": 0.0001, "loss": 7.2985, "loss/crossentropy": 2.281522750854492, "loss/hidden": 2.828125, "loss/jsd": 0.0, "loss/logits": 0.21547602117061615, "step": 6474 }, { "epoch": 0.40475, "grad_norm": 2.3125, "grad_norm_var": 0.0160308837890625, "learning_rate": 0.0001, "loss": 7.0747, "loss/crossentropy": 2.1680938005447388, "loss/hidden": 2.828125, "loss/jsd": 0.0, "loss/logits": 0.21121270209550858, "step": 6476 }, { "epoch": 0.404875, "grad_norm": 2.109375, "grad_norm_var": 0.016796875, "learning_rate": 0.0001, "loss": 7.1666, "loss/crossentropy": 2.5115526914596558, "loss/hidden": 2.8828125, "loss/jsd": 0.0, "loss/logits": 0.2365971878170967, "step": 6478 }, { "epoch": 0.405, "grad_norm": 3.171875, "grad_norm_var": 0.06926676432291666, "learning_rate": 0.0001, "loss": 7.2812, "loss/crossentropy": 2.2533043026924133, "loss/hidden": 2.9140625, "loss/jsd": 0.0, "loss/logits": 0.224958136677742, "step": 6480 }, { "epoch": 0.405125, "grad_norm": 1.84375, "grad_norm_var": 0.07986653645833333, "learning_rate": 0.0001, "loss": 7.1225, "loss/crossentropy": 2.2455263137817383, "loss/hidden": 2.9453125, "loss/jsd": 0.0, "loss/logits": 0.2447589933872223, "step": 6482 }, { "epoch": 0.40525, "grad_norm": 2.25, "grad_norm_var": 0.0816070556640625, "learning_rate": 0.0001, "loss": 7.3032, "loss/crossentropy": 1.951097846031189, "loss/hidden": 2.8125, "loss/jsd": 0.0, "loss/logits": 0.20029456168413162, "step": 6484 }, { "epoch": 0.405375, "grad_norm": 2.109375, "grad_norm_var": 0.08391520182291666, "learning_rate": 0.0001, "loss": 7.2667, "loss/crossentropy": 2.215672254562378, "loss/hidden": 2.8515625, "loss/jsd": 0.0, "loss/logits": 0.2087327167391777, "step": 6486 }, { "epoch": 0.4055, "grad_norm": 2.21875, "grad_norm_var": 0.08147379557291666, "learning_rate": 0.0001, "loss": 7.1273, "loss/crossentropy": 1.8793167471885681, "loss/hidden": 2.828125, "loss/jsd": 0.0, "loss/logits": 0.2043544054031372, "step": 6488 }, { "epoch": 0.405625, "grad_norm": 2.203125, "grad_norm_var": 0.07893473307291667, "learning_rate": 0.0001, "loss": 7.2892, "loss/crossentropy": 2.3511608839035034, "loss/hidden": 2.921875, "loss/jsd": 0.0, "loss/logits": 0.20901018381118774, "step": 6490 }, { "epoch": 0.40575, "grad_norm": 2.09375, "grad_norm_var": 0.08166910807291666, "learning_rate": 0.0001, "loss": 7.1031, "loss/crossentropy": 2.2664103507995605, "loss/hidden": 2.828125, "loss/jsd": 0.0, "loss/logits": 0.2170920968055725, "step": 6492 }, { "epoch": 0.405875, "grad_norm": 1.953125, "grad_norm_var": 0.08240559895833334, "learning_rate": 0.0001, "loss": 7.2317, "loss/crossentropy": 2.4039368629455566, "loss/hidden": 2.8125, "loss/jsd": 0.0, "loss/logits": 0.2005300372838974, "step": 6494 }, { "epoch": 0.406, "grad_norm": 2.328125, "grad_norm_var": 0.018355305989583334, "learning_rate": 0.0001, "loss": 7.0248, "loss/crossentropy": 1.9945274591445923, "loss/hidden": 2.8125, "loss/jsd": 0.0, "loss/logits": 0.19976937770843506, "step": 6496 }, { "epoch": 0.406125, "grad_norm": 1.9921875, "grad_norm_var": 0.014977773030598959, "learning_rate": 0.0001, "loss": 7.0207, "loss/crossentropy": 2.2633402347564697, "loss/hidden": 2.84375, "loss/jsd": 0.0, "loss/logits": 0.20823977887630463, "step": 6498 }, { "epoch": 0.40625, "grad_norm": 2.265625, "grad_norm_var": 0.01840184529622396, "learning_rate": 0.0001, "loss": 7.3655, "loss/crossentropy": 2.1993072628974915, "loss/hidden": 2.90625, "loss/jsd": 0.0, "loss/logits": 0.2053944393992424, "step": 6500 }, { "epoch": 0.406375, "grad_norm": 1.9296875, "grad_norm_var": 0.022272745768229168, "learning_rate": 0.0001, "loss": 7.0907, "loss/crossentropy": 2.3876596689224243, "loss/hidden": 2.84375, "loss/jsd": 0.0, "loss/logits": 0.20196551084518433, "step": 6502 }, { "epoch": 0.4065, "grad_norm": 2.328125, "grad_norm_var": 0.0234619140625, "learning_rate": 0.0001, "loss": 7.2996, "loss/crossentropy": 2.2332963347434998, "loss/hidden": 2.8046875, "loss/jsd": 0.0, "loss/logits": 0.22670364379882812, "step": 6504 }, { "epoch": 0.406625, "grad_norm": 2.078125, "grad_norm_var": 0.0226470947265625, "learning_rate": 0.0001, "loss": 7.2239, "loss/crossentropy": 2.4286776781082153, "loss/hidden": 2.828125, "loss/jsd": 0.0, "loss/logits": 0.21485131978988647, "step": 6506 }, { "epoch": 0.40675, "grad_norm": 2.171875, "grad_norm_var": 0.023444620768229167, "learning_rate": 0.0001, "loss": 7.132, "loss/crossentropy": 2.1059845685958862, "loss/hidden": 2.8046875, "loss/jsd": 0.0, "loss/logits": 0.20297736674547195, "step": 6508 }, { "epoch": 0.406875, "grad_norm": 2.078125, "grad_norm_var": 0.021336873372395832, "learning_rate": 0.0001, "loss": 7.3092, "loss/crossentropy": 2.437384843826294, "loss/hidden": 2.8515625, "loss/jsd": 0.0, "loss/logits": 0.21660450100898743, "step": 6510 }, { "epoch": 0.407, "grad_norm": 2.390625, "grad_norm_var": 0.0242431640625, "learning_rate": 0.0001, "loss": 7.3282, "loss/crossentropy": 2.492189645767212, "loss/hidden": 2.875, "loss/jsd": 0.0, "loss/logits": 0.23497627675533295, "step": 6512 }, { "epoch": 0.407125, "grad_norm": 2.09375, "grad_norm_var": 0.021437327067057293, "learning_rate": 0.0001, "loss": 7.29, "loss/crossentropy": 2.0942156314849854, "loss/hidden": 2.9140625, "loss/jsd": 0.0, "loss/logits": 0.21186655014753342, "step": 6514 }, { "epoch": 0.40725, "grad_norm": 2.296875, "grad_norm_var": 0.01834284464518229, "learning_rate": 0.0001, "loss": 7.2034, "loss/crossentropy": 2.205212712287903, "loss/hidden": 2.90625, "loss/jsd": 0.0, "loss/logits": 0.23347996175289154, "step": 6516 }, { "epoch": 0.407375, "grad_norm": 2.1875, "grad_norm_var": 0.016682942708333332, "learning_rate": 0.0001, "loss": 7.2561, "loss/crossentropy": 2.18733811378479, "loss/hidden": 2.796875, "loss/jsd": 0.0, "loss/logits": 0.19546174257993698, "step": 6518 }, { "epoch": 0.4075, "grad_norm": 2.0625, "grad_norm_var": 0.016185506184895834, "learning_rate": 0.0001, "loss": 7.0733, "loss/crossentropy": 2.0966725945472717, "loss/hidden": 2.9140625, "loss/jsd": 0.0, "loss/logits": 0.22100364416837692, "step": 6520 }, { "epoch": 0.407625, "grad_norm": 2.140625, "grad_norm_var": 0.014680989583333333, "learning_rate": 0.0001, "loss": 7.07, "loss/crossentropy": 1.8778254985809326, "loss/hidden": 2.75, "loss/jsd": 0.0, "loss/logits": 0.2160200998187065, "step": 6522 }, { "epoch": 0.40775, "grad_norm": 2.359375, "grad_norm_var": 0.015071614583333334, "learning_rate": 0.0001, "loss": 7.1556, "loss/crossentropy": 2.265491247177124, "loss/hidden": 2.8359375, "loss/jsd": 0.0, "loss/logits": 0.23102690279483795, "step": 6524 }, { "epoch": 0.407875, "grad_norm": 2.296875, "grad_norm_var": 0.018745930989583333, "learning_rate": 0.0001, "loss": 7.2781, "loss/crossentropy": 2.2143566012382507, "loss/hidden": 2.9609375, "loss/jsd": 0.0, "loss/logits": 0.2069038674235344, "step": 6526 }, { "epoch": 0.408, "grad_norm": 2.21875, "grad_norm_var": 0.014383951822916666, "learning_rate": 0.0001, "loss": 7.297, "loss/crossentropy": 2.3243274688720703, "loss/hidden": 2.8359375, "loss/jsd": 0.0, "loss/logits": 0.21643533557653427, "step": 6528 }, { "epoch": 0.408125, "grad_norm": 2.3125, "grad_norm_var": 0.014191691080729167, "learning_rate": 0.0001, "loss": 7.2748, "loss/crossentropy": 2.3320904970169067, "loss/hidden": 2.828125, "loss/jsd": 0.0, "loss/logits": 0.2175091654062271, "step": 6530 }, { "epoch": 0.40825, "grad_norm": 2.171875, "grad_norm_var": 0.016087849934895832, "learning_rate": 0.0001, "loss": 7.2427, "loss/crossentropy": 2.1425468921661377, "loss/hidden": 2.875, "loss/jsd": 0.0, "loss/logits": 0.20631052553653717, "step": 6532 }, { "epoch": 0.408375, "grad_norm": 2.515625, "grad_norm_var": 0.0207183837890625, "learning_rate": 0.0001, "loss": 7.1293, "loss/crossentropy": 2.483289361000061, "loss/hidden": 2.921875, "loss/jsd": 0.0, "loss/logits": 0.23001858592033386, "step": 6534 }, { "epoch": 0.4085, "grad_norm": 2.09375, "grad_norm_var": 0.0211822509765625, "learning_rate": 0.0001, "loss": 7.2254, "loss/crossentropy": 2.507395625114441, "loss/hidden": 2.90625, "loss/jsd": 0.0, "loss/logits": 0.20727375894784927, "step": 6536 }, { "epoch": 0.408625, "grad_norm": 1.9921875, "grad_norm_var": 0.030450185139973957, "learning_rate": 0.0001, "loss": 7.0344, "loss/crossentropy": 2.13398277759552, "loss/hidden": 2.7265625, "loss/jsd": 0.0, "loss/logits": 0.18832530081272125, "step": 6538 }, { "epoch": 0.40875, "grad_norm": 2.359375, "grad_norm_var": 0.030755360921223957, "learning_rate": 0.0001, "loss": 7.1028, "loss/crossentropy": 2.128291964530945, "loss/hidden": 2.8046875, "loss/jsd": 0.0, "loss/logits": 0.2085724174976349, "step": 6540 }, { "epoch": 0.408875, "grad_norm": 2.265625, "grad_norm_var": 0.026468658447265626, "learning_rate": 0.0001, "loss": 7.3629, "loss/crossentropy": 2.387086510658264, "loss/hidden": 2.8515625, "loss/jsd": 0.0, "loss/logits": 0.23265989124774933, "step": 6542 }, { "epoch": 0.409, "grad_norm": 2.171875, "grad_norm_var": 0.02826512654622396, "learning_rate": 0.0001, "loss": 7.2463, "loss/crossentropy": 2.436495780944824, "loss/hidden": 2.875, "loss/jsd": 0.0, "loss/logits": 0.216141015291214, "step": 6544 }, { "epoch": 0.409125, "grad_norm": 2.546875, "grad_norm_var": 0.03478368123372396, "learning_rate": 0.0001, "loss": 7.322, "loss/crossentropy": 2.2948137521743774, "loss/hidden": 2.859375, "loss/jsd": 0.0, "loss/logits": 0.2177954763174057, "step": 6546 }, { "epoch": 0.40925, "grad_norm": 2.359375, "grad_norm_var": 0.032195790608723955, "learning_rate": 0.0001, "loss": 7.3163, "loss/crossentropy": 2.068236470222473, "loss/hidden": 2.7734375, "loss/jsd": 0.0, "loss/logits": 0.19945338368415833, "step": 6548 }, { "epoch": 0.409375, "grad_norm": 2.234375, "grad_norm_var": 0.026920318603515625, "learning_rate": 0.0001, "loss": 7.202, "loss/crossentropy": 2.0515838861465454, "loss/hidden": 2.8046875, "loss/jsd": 0.0, "loss/logits": 0.1940801441669464, "step": 6550 }, { "epoch": 0.4095, "grad_norm": 2.078125, "grad_norm_var": 0.025221506754557293, "learning_rate": 0.0001, "loss": 7.0551, "loss/crossentropy": 2.2386069297790527, "loss/hidden": 2.8359375, "loss/jsd": 0.0, "loss/logits": 0.21453995257616043, "step": 6552 }, { "epoch": 0.409625, "grad_norm": 2.140625, "grad_norm_var": 0.018553670247395834, "learning_rate": 0.0001, "loss": 7.1711, "loss/crossentropy": 2.238744616508484, "loss/hidden": 2.765625, "loss/jsd": 0.0, "loss/logits": 0.208785742521286, "step": 6554 }, { "epoch": 0.40975, "grad_norm": 2.078125, "grad_norm_var": 0.01754150390625, "learning_rate": 0.0001, "loss": 7.2699, "loss/crossentropy": 2.372469902038574, "loss/hidden": 2.9375, "loss/jsd": 0.0, "loss/logits": 0.2197486311197281, "step": 6556 }, { "epoch": 0.409875, "grad_norm": 2.296875, "grad_norm_var": 0.018724568684895835, "learning_rate": 0.0001, "loss": 7.3083, "loss/crossentropy": 2.257944345474243, "loss/hidden": 2.78125, "loss/jsd": 0.0, "loss/logits": 0.20727266371250153, "step": 6558 }, { "epoch": 0.41, "grad_norm": 2.140625, "grad_norm_var": 0.016162109375, "learning_rate": 0.0001, "loss": 7.2547, "loss/crossentropy": 2.3854445219039917, "loss/hidden": 2.890625, "loss/jsd": 0.0, "loss/logits": 0.2205938622355461, "step": 6560 }, { "epoch": 0.410125, "grad_norm": 2.171875, "grad_norm_var": 0.0113922119140625, "learning_rate": 0.0001, "loss": 6.9946, "loss/crossentropy": 2.099511981010437, "loss/hidden": 2.8359375, "loss/jsd": 0.0, "loss/logits": 0.2024962157011032, "step": 6562 }, { "epoch": 0.41025, "grad_norm": 2.015625, "grad_norm_var": 0.0093414306640625, "learning_rate": 0.0001, "loss": 7.1571, "loss/crossentropy": 2.1464951634407043, "loss/hidden": 2.8671875, "loss/jsd": 0.0, "loss/logits": 0.21468877792358398, "step": 6564 }, { "epoch": 0.410375, "grad_norm": 2.171875, "grad_norm_var": 0.00982666015625, "learning_rate": 0.0001, "loss": 7.2538, "loss/crossentropy": 2.3937063217163086, "loss/hidden": 2.8046875, "loss/jsd": 0.0, "loss/logits": 0.2055506482720375, "step": 6566 }, { "epoch": 0.4105, "grad_norm": 2.09375, "grad_norm_var": 0.010400390625, "learning_rate": 0.0001, "loss": 7.3326, "loss/crossentropy": 2.384568929672241, "loss/hidden": 2.8984375, "loss/jsd": 0.0, "loss/logits": 0.22990135848522186, "step": 6568 }, { "epoch": 0.410625, "grad_norm": 2.328125, "grad_norm_var": 0.012821451822916666, "learning_rate": 0.0001, "loss": 7.1655, "loss/crossentropy": 2.2787879705429077, "loss/hidden": 2.96875, "loss/jsd": 0.0, "loss/logits": 0.23064248263835907, "step": 6570 }, { "epoch": 0.41075, "grad_norm": 2.109375, "grad_norm_var": 0.012679036458333333, "learning_rate": 0.0001, "loss": 7.1028, "loss/crossentropy": 2.1466062664985657, "loss/hidden": 2.84375, "loss/jsd": 0.0, "loss/logits": 0.20553258806467056, "step": 6572 }, { "epoch": 0.410875, "grad_norm": 2.3125, "grad_norm_var": 0.0108795166015625, "learning_rate": 0.0001, "loss": 7.3206, "loss/crossentropy": 2.2528934478759766, "loss/hidden": 2.8203125, "loss/jsd": 0.0, "loss/logits": 0.20717480778694153, "step": 6574 }, { "epoch": 0.411, "grad_norm": 2.875, "grad_norm_var": 0.04734598795572917, "learning_rate": 0.0001, "loss": 7.0194, "loss/crossentropy": 1.7589277029037476, "loss/hidden": 2.8359375, "loss/jsd": 0.0, "loss/logits": 0.18202517926692963, "step": 6576 }, { "epoch": 0.411125, "grad_norm": 1.9765625, "grad_norm_var": 0.04872817993164062, "learning_rate": 0.0001, "loss": 7.148, "loss/crossentropy": 1.9737151265144348, "loss/hidden": 2.8359375, "loss/jsd": 0.0, "loss/logits": 0.1905621960759163, "step": 6578 }, { "epoch": 0.41125, "grad_norm": 2.546875, "grad_norm_var": 0.055216217041015626, "learning_rate": 0.0001, "loss": 7.1603, "loss/crossentropy": 2.265736222267151, "loss/hidden": 2.875, "loss/jsd": 0.0, "loss/logits": 0.21055534482002258, "step": 6580 }, { "epoch": 0.411375, "grad_norm": 2.046875, "grad_norm_var": 0.054593658447265624, "learning_rate": 0.0001, "loss": 7.1553, "loss/crossentropy": 2.3410770893096924, "loss/hidden": 2.9140625, "loss/jsd": 0.0, "loss/logits": 0.22156669199466705, "step": 6582 }, { "epoch": 0.4115, "grad_norm": 2.0625, "grad_norm_var": 0.056955718994140626, "learning_rate": 0.0001, "loss": 7.2238, "loss/crossentropy": 2.381894826889038, "loss/hidden": 2.8515625, "loss/jsd": 0.0, "loss/logits": 0.21481861174106598, "step": 6584 }, { "epoch": 0.411625, "grad_norm": 2.140625, "grad_norm_var": 0.05864639282226562, "learning_rate": 0.0001, "loss": 7.2543, "loss/crossentropy": 2.0804589986801147, "loss/hidden": 2.78125, "loss/jsd": 0.0, "loss/logits": 0.2047130987048149, "step": 6586 }, { "epoch": 0.41175, "grad_norm": 2.15625, "grad_norm_var": 0.055275217692057295, "learning_rate": 0.0001, "loss": 7.1031, "loss/crossentropy": 2.2733839750289917, "loss/hidden": 2.796875, "loss/jsd": 0.0, "loss/logits": 0.21279729157686234, "step": 6588 }, { "epoch": 0.411875, "grad_norm": 2.15625, "grad_norm_var": 0.05738703409830729, "learning_rate": 0.0001, "loss": 7.2073, "loss/crossentropy": 2.3205610513687134, "loss/hidden": 2.8046875, "loss/jsd": 0.0, "loss/logits": 0.21465665102005005, "step": 6590 }, { "epoch": 0.412, "grad_norm": 2.0625, "grad_norm_var": 0.02643407185872396, "learning_rate": 0.0001, "loss": 7.1259, "loss/crossentropy": 2.2790287733078003, "loss/hidden": 2.8359375, "loss/jsd": 0.0, "loss/logits": 0.20951499789953232, "step": 6592 }, { "epoch": 0.412125, "grad_norm": 2.484375, "grad_norm_var": 0.028148396809895834, "learning_rate": 0.0001, "loss": 7.3196, "loss/crossentropy": 2.3589893579483032, "loss/hidden": 2.84375, "loss/jsd": 0.0, "loss/logits": 0.21895043551921844, "step": 6594 }, { "epoch": 0.41225, "grad_norm": 2.203125, "grad_norm_var": 0.015413411458333333, "learning_rate": 0.0001, "loss": 6.9755, "loss/crossentropy": 2.2003557682037354, "loss/hidden": 2.8359375, "loss/jsd": 0.0, "loss/logits": 0.1951805129647255, "step": 6596 }, { "epoch": 0.412375, "grad_norm": 2.21875, "grad_norm_var": 0.012206013997395833, "learning_rate": 0.0001, "loss": 7.2822, "loss/crossentropy": 2.1846193075180054, "loss/hidden": 2.8671875, "loss/jsd": 0.0, "loss/logits": 0.2042449563741684, "step": 6598 }, { "epoch": 0.4125, "grad_norm": 2.1875, "grad_norm_var": 0.011449178059895834, "learning_rate": 0.0001, "loss": 7.1855, "loss/crossentropy": 2.1782132387161255, "loss/hidden": 2.9140625, "loss/jsd": 0.0, "loss/logits": 0.2254723384976387, "step": 6600 }, { "epoch": 0.412625, "grad_norm": 2.265625, "grad_norm_var": 0.0118316650390625, "learning_rate": 0.0001, "loss": 7.2243, "loss/crossentropy": 2.1209170818328857, "loss/hidden": 2.9375, "loss/jsd": 0.0, "loss/logits": 0.22717957943677902, "step": 6602 }, { "epoch": 0.41275, "grad_norm": 2.265625, "grad_norm_var": 0.01217041015625, "learning_rate": 0.0001, "loss": 7.2421, "loss/crossentropy": 2.050145149230957, "loss/hidden": 2.7890625, "loss/jsd": 0.0, "loss/logits": 0.19752968847751617, "step": 6604 }, { "epoch": 0.412875, "grad_norm": 2.234375, "grad_norm_var": 0.012593587239583334, "learning_rate": 0.0001, "loss": 7.2494, "loss/crossentropy": 2.1468405723571777, "loss/hidden": 2.8671875, "loss/jsd": 0.0, "loss/logits": 0.2092445194721222, "step": 6606 }, { "epoch": 0.413, "grad_norm": 2.0625, "grad_norm_var": 0.012059529622395834, "learning_rate": 0.0001, "loss": 7.1083, "loss/crossentropy": 2.13679301738739, "loss/hidden": 2.75, "loss/jsd": 0.0, "loss/logits": 0.20577143877744675, "step": 6608 }, { "epoch": 0.413125, "grad_norm": 2.375, "grad_norm_var": 0.008610026041666666, "learning_rate": 0.0001, "loss": 7.442, "loss/crossentropy": 2.2817904949188232, "loss/hidden": 2.890625, "loss/jsd": 0.0, "loss/logits": 0.20960254222154617, "step": 6610 }, { "epoch": 0.41325, "grad_norm": 2.421875, "grad_norm_var": 0.00849609375, "learning_rate": 0.0001, "loss": 7.2355, "loss/crossentropy": 2.309928774833679, "loss/hidden": 2.8203125, "loss/jsd": 0.0, "loss/logits": 0.2041112631559372, "step": 6612 }, { "epoch": 0.413375, "grad_norm": 2.359375, "grad_norm_var": 0.021214803059895832, "learning_rate": 0.0001, "loss": 7.2045, "loss/crossentropy": 1.9638023972511292, "loss/hidden": 2.7890625, "loss/jsd": 0.0, "loss/logits": 0.19546246528625488, "step": 6614 }, { "epoch": 0.4135, "grad_norm": 2.125, "grad_norm_var": 0.022069295247395832, "learning_rate": 0.0001, "loss": 7.2197, "loss/crossentropy": 2.2863436937332153, "loss/hidden": 2.828125, "loss/jsd": 0.0, "loss/logits": 0.20454485714435577, "step": 6616 }, { "epoch": 0.413625, "grad_norm": 2.15625, "grad_norm_var": 0.025191243489583334, "learning_rate": 0.0001, "loss": 7.0682, "loss/crossentropy": 2.2289178371429443, "loss/hidden": 2.8203125, "loss/jsd": 0.0, "loss/logits": 0.2051631659269333, "step": 6618 }, { "epoch": 0.41375, "grad_norm": 2.109375, "grad_norm_var": 0.026325480143229166, "learning_rate": 0.0001, "loss": 7.246, "loss/crossentropy": 2.1252601742744446, "loss/hidden": 2.953125, "loss/jsd": 0.0, "loss/logits": 0.21219032257795334, "step": 6620 }, { "epoch": 0.413875, "grad_norm": 2.109375, "grad_norm_var": 0.027228800455729167, "learning_rate": 0.0001, "loss": 7.1937, "loss/crossentropy": 2.377307415008545, "loss/hidden": 2.875, "loss/jsd": 0.0, "loss/logits": 0.22052890807390213, "step": 6622 }, { "epoch": 0.414, "grad_norm": 2.109375, "grad_norm_var": 0.028043619791666665, "learning_rate": 0.0001, "loss": 7.0238, "loss/crossentropy": 2.070056438446045, "loss/hidden": 2.8046875, "loss/jsd": 0.0, "loss/logits": 0.20407379418611526, "step": 6624 }, { "epoch": 0.414125, "grad_norm": 2.09375, "grad_norm_var": 0.027567545572916668, "learning_rate": 0.0001, "loss": 7.1428, "loss/crossentropy": 2.3578120470046997, "loss/hidden": 2.84375, "loss/jsd": 0.0, "loss/logits": 0.2143327072262764, "step": 6626 }, { "epoch": 0.41425, "grad_norm": 2.25, "grad_norm_var": 0.026170857747395835, "learning_rate": 0.0001, "loss": 7.1836, "loss/crossentropy": 2.4478999376296997, "loss/hidden": 2.875, "loss/jsd": 0.0, "loss/logits": 0.21191944181919098, "step": 6628 }, { "epoch": 0.414375, "grad_norm": 2.203125, "grad_norm_var": 0.0091705322265625, "learning_rate": 0.0001, "loss": 7.2445, "loss/crossentropy": 2.12015438079834, "loss/hidden": 2.9296875, "loss/jsd": 0.0, "loss/logits": 0.20925237238407135, "step": 6630 }, { "epoch": 0.4145, "grad_norm": 2.265625, "grad_norm_var": 0.009566243489583333, "learning_rate": 0.0001, "loss": 6.9971, "loss/crossentropy": 2.209274411201477, "loss/hidden": 2.84375, "loss/jsd": 0.0, "loss/logits": 0.21350758522748947, "step": 6632 }, { "epoch": 0.414625, "grad_norm": 2.234375, "grad_norm_var": 0.009077962239583333, "learning_rate": 0.0001, "loss": 7.2467, "loss/crossentropy": 2.0114998817443848, "loss/hidden": 2.84375, "loss/jsd": 0.0, "loss/logits": 0.2036866992712021, "step": 6634 }, { "epoch": 0.41475, "grad_norm": 2.21875, "grad_norm_var": 0.00732421875, "learning_rate": 0.0001, "loss": 7.3385, "loss/crossentropy": 2.2817453145980835, "loss/hidden": 2.8671875, "loss/jsd": 0.0, "loss/logits": 0.21267592161893845, "step": 6636 }, { "epoch": 0.414875, "grad_norm": 2.0, "grad_norm_var": 0.009300740559895833, "learning_rate": 0.0001, "loss": 7.3145, "loss/crossentropy": 2.3444933891296387, "loss/hidden": 2.8671875, "loss/jsd": 0.0, "loss/logits": 0.21684523671865463, "step": 6638 }, { "epoch": 0.415, "grad_norm": 2.28125, "grad_norm_var": 0.011150868733723958, "learning_rate": 0.0001, "loss": 7.2032, "loss/crossentropy": 2.2932777404785156, "loss/hidden": 2.8515625, "loss/jsd": 0.0, "loss/logits": 0.24464474618434906, "step": 6640 }, { "epoch": 0.415125, "grad_norm": 2.03125, "grad_norm_var": 0.012143707275390625, "learning_rate": 0.0001, "loss": 7.1644, "loss/crossentropy": 2.237114191055298, "loss/hidden": 2.8359375, "loss/jsd": 0.0, "loss/logits": 0.2008899301290512, "step": 6642 }, { "epoch": 0.41525, "grad_norm": 2.15625, "grad_norm_var": 0.010117340087890624, "learning_rate": 0.0001, "loss": 7.2987, "loss/crossentropy": 2.182630181312561, "loss/hidden": 2.84375, "loss/jsd": 0.0, "loss/logits": 0.1930861920118332, "step": 6644 }, { "epoch": 0.415375, "grad_norm": 2.40625, "grad_norm_var": 0.050872548421223955, "learning_rate": 0.0001, "loss": 7.2461, "loss/crossentropy": 2.0044440031051636, "loss/hidden": 2.828125, "loss/jsd": 0.0, "loss/logits": 0.18386952579021454, "step": 6646 }, { "epoch": 0.4155, "grad_norm": 2.078125, "grad_norm_var": 0.05223770141601562, "learning_rate": 0.0001, "loss": 7.1746, "loss/crossentropy": 2.2447410821914673, "loss/hidden": 2.84375, "loss/jsd": 0.0, "loss/logits": 0.2052413374185562, "step": 6648 }, { "epoch": 0.415625, "grad_norm": 2.15625, "grad_norm_var": 0.053254954020182294, "learning_rate": 0.0001, "loss": 7.1224, "loss/crossentropy": 1.9521759748458862, "loss/hidden": 2.8125, "loss/jsd": 0.0, "loss/logits": 0.21115896850824356, "step": 6650 }, { "epoch": 0.41575, "grad_norm": 2.09375, "grad_norm_var": 0.05448786417643229, "learning_rate": 0.0001, "loss": 7.347, "loss/crossentropy": 2.5570948123931885, "loss/hidden": 2.9296875, "loss/jsd": 0.0, "loss/logits": 0.24073059856891632, "step": 6652 }, { "epoch": 0.415875, "grad_norm": 2.125, "grad_norm_var": 0.052978261311848955, "learning_rate": 0.0001, "loss": 7.1036, "loss/crossentropy": 1.9738441109657288, "loss/hidden": 2.875, "loss/jsd": 0.0, "loss/logits": 0.17930065095424652, "step": 6654 }, { "epoch": 0.416, "grad_norm": 2.21875, "grad_norm_var": 0.05120035807291667, "learning_rate": 0.0001, "loss": 7.1908, "loss/crossentropy": 2.2183395624160767, "loss/hidden": 2.8046875, "loss/jsd": 0.0, "loss/logits": 0.19753951579332352, "step": 6656 }, { "epoch": 0.416125, "grad_norm": 2.171875, "grad_norm_var": 0.04938151041666667, "learning_rate": 0.0001, "loss": 7.0473, "loss/crossentropy": 2.2473442554473877, "loss/hidden": 2.8125, "loss/jsd": 0.0, "loss/logits": 0.2003185674548149, "step": 6658 }, { "epoch": 0.41625, "grad_norm": 2.015625, "grad_norm_var": 0.05120035807291667, "learning_rate": 0.0001, "loss": 7.1982, "loss/crossentropy": 2.0214603543281555, "loss/hidden": 2.96875, "loss/jsd": 0.0, "loss/logits": 0.2159905880689621, "step": 6660 }, { "epoch": 0.416375, "grad_norm": 2.125, "grad_norm_var": 0.011494700113932292, "learning_rate": 0.0001, "loss": 7.0941, "loss/crossentropy": 2.1045217514038086, "loss/hidden": 2.8203125, "loss/jsd": 0.0, "loss/logits": 0.21138650923967361, "step": 6662 }, { "epoch": 0.4165, "grad_norm": 2.234375, "grad_norm_var": 0.012595367431640626, "learning_rate": 0.0001, "loss": 7.1393, "loss/crossentropy": 2.2603907585144043, "loss/hidden": 2.890625, "loss/jsd": 0.0, "loss/logits": 0.21229936927556992, "step": 6664 }, { "epoch": 0.416625, "grad_norm": 2.21875, "grad_norm_var": 0.011553700764973958, "learning_rate": 0.0001, "loss": 6.9486, "loss/crossentropy": 2.166019320487976, "loss/hidden": 2.8671875, "loss/jsd": 0.0, "loss/logits": 0.20004819333553314, "step": 6666 }, { "epoch": 0.41675, "grad_norm": 2.109375, "grad_norm_var": 0.011386871337890625, "learning_rate": 0.0001, "loss": 7.2752, "loss/crossentropy": 2.4337663650512695, "loss/hidden": 2.8515625, "loss/jsd": 0.0, "loss/logits": 0.21502867341041565, "step": 6668 }, { "epoch": 0.416875, "grad_norm": 2.203125, "grad_norm_var": 0.011307525634765624, "learning_rate": 0.0001, "loss": 7.197, "loss/crossentropy": 2.383167862892151, "loss/hidden": 2.8203125, "loss/jsd": 0.0, "loss/logits": 0.22213765233755112, "step": 6670 }, { "epoch": 0.417, "grad_norm": 3.171875, "grad_norm_var": 0.07418797810872396, "learning_rate": 0.0001, "loss": 7.2443, "loss/crossentropy": 2.469019055366516, "loss/hidden": 2.8828125, "loss/jsd": 0.0, "loss/logits": 0.22149549424648285, "step": 6672 }, { "epoch": 0.417125, "grad_norm": 1.9765625, "grad_norm_var": 0.07734781901041667, "learning_rate": 0.0001, "loss": 7.1202, "loss/crossentropy": 2.223373532295227, "loss/hidden": 2.796875, "loss/jsd": 0.0, "loss/logits": 0.20211607217788696, "step": 6674 }, { "epoch": 0.41725, "grad_norm": 2.3125, "grad_norm_var": 0.07432047526041667, "learning_rate": 0.0001, "loss": 7.1912, "loss/crossentropy": 2.2044894695281982, "loss/hidden": 2.7890625, "loss/jsd": 0.0, "loss/logits": 0.21385891735553741, "step": 6676 }, { "epoch": 0.417375, "grad_norm": 2.0625, "grad_norm_var": 0.07100601196289062, "learning_rate": 0.0001, "loss": 7.2926, "loss/crossentropy": 2.291485071182251, "loss/hidden": 2.875, "loss/jsd": 0.0, "loss/logits": 0.22199992835521698, "step": 6678 }, { "epoch": 0.4175, "grad_norm": 2.34375, "grad_norm_var": 0.06825942993164062, "learning_rate": 0.0001, "loss": 7.2736, "loss/crossentropy": 2.199909269809723, "loss/hidden": 2.9296875, "loss/jsd": 0.0, "loss/logits": 0.21268168836832047, "step": 6680 }, { "epoch": 0.417625, "grad_norm": 2.296875, "grad_norm_var": 0.06856257120768229, "learning_rate": 0.0001, "loss": 7.255, "loss/crossentropy": 2.4666231870651245, "loss/hidden": 2.8671875, "loss/jsd": 0.0, "loss/logits": 0.22746828943490982, "step": 6682 }, { "epoch": 0.41775, "grad_norm": 2.359375, "grad_norm_var": 0.06691665649414062, "learning_rate": 0.0001, "loss": 7.3389, "loss/crossentropy": 2.2254520654678345, "loss/hidden": 2.859375, "loss/jsd": 0.0, "loss/logits": 0.19916392862796783, "step": 6684 }, { "epoch": 0.417875, "grad_norm": 2.15625, "grad_norm_var": 0.06772638956705729, "learning_rate": 0.0001, "loss": 7.3413, "loss/crossentropy": 2.230736017227173, "loss/hidden": 2.765625, "loss/jsd": 0.0, "loss/logits": 0.2047268971800804, "step": 6686 }, { "epoch": 0.418, "grad_norm": 2.109375, "grad_norm_var": 0.011329905192057291, "learning_rate": 0.0001, "loss": 7.376, "loss/crossentropy": 2.319286346435547, "loss/hidden": 2.828125, "loss/jsd": 0.0, "loss/logits": 0.21851042658090591, "step": 6688 }, { "epoch": 0.418125, "grad_norm": 2.375, "grad_norm_var": 0.011002604166666667, "learning_rate": 0.0001, "loss": 7.2427, "loss/crossentropy": 2.312131881713867, "loss/hidden": 2.828125, "loss/jsd": 0.0, "loss/logits": 0.22473173588514328, "step": 6690 }, { "epoch": 0.41825, "grad_norm": 2.125, "grad_norm_var": 0.014322916666666666, "learning_rate": 0.0001, "loss": 7.2875, "loss/crossentropy": 2.3510780334472656, "loss/hidden": 2.765625, "loss/jsd": 0.0, "loss/logits": 0.21289631724357605, "step": 6692 }, { "epoch": 0.418375, "grad_norm": 2.125, "grad_norm_var": 0.014436848958333333, "learning_rate": 0.0001, "loss": 7.2495, "loss/crossentropy": 2.302396059036255, "loss/hidden": 2.84375, "loss/jsd": 0.0, "loss/logits": 0.22023826837539673, "step": 6694 }, { "epoch": 0.4185, "grad_norm": 2.171875, "grad_norm_var": 0.0140533447265625, "learning_rate": 0.0001, "loss": 6.9816, "loss/crossentropy": 2.4627773761749268, "loss/hidden": 2.828125, "loss/jsd": 0.0, "loss/logits": 0.2272256538271904, "step": 6696 }, { "epoch": 0.418625, "grad_norm": 2.0625, "grad_norm_var": 0.01793212890625, "learning_rate": 0.0001, "loss": 7.2295, "loss/crossentropy": 2.464987277984619, "loss/hidden": 2.859375, "loss/jsd": 0.0, "loss/logits": 0.21357368677854538, "step": 6698 }, { "epoch": 0.41875, "grad_norm": 2.453125, "grad_norm_var": 0.0194000244140625, "learning_rate": 0.0001, "loss": 7.2894, "loss/crossentropy": 2.3194926977157593, "loss/hidden": 2.828125, "loss/jsd": 0.0, "loss/logits": 0.21601901948451996, "step": 6700 }, { "epoch": 0.418875, "grad_norm": 2.203125, "grad_norm_var": 0.0193267822265625, "learning_rate": 0.0001, "loss": 7.1178, "loss/crossentropy": 2.158350110054016, "loss/hidden": 2.796875, "loss/jsd": 0.0, "loss/logits": 0.21564631164073944, "step": 6702 }, { "epoch": 0.419, "grad_norm": 2.1875, "grad_norm_var": 0.018895467122395832, "learning_rate": 0.0001, "loss": 7.2231, "loss/crossentropy": 2.300287127494812, "loss/hidden": 2.8671875, "loss/jsd": 0.0, "loss/logits": 0.1997418850660324, "step": 6704 }, { "epoch": 0.419125, "grad_norm": 2.125, "grad_norm_var": 0.013744099934895834, "learning_rate": 0.0001, "loss": 7.2349, "loss/crossentropy": 2.378634810447693, "loss/hidden": 2.8515625, "loss/jsd": 0.0, "loss/logits": 0.21635089814662933, "step": 6706 }, { "epoch": 0.41925, "grad_norm": 2.15625, "grad_norm_var": 0.012105305989583334, "learning_rate": 0.0001, "loss": 7.2868, "loss/crossentropy": 2.3297730684280396, "loss/hidden": 2.8046875, "loss/jsd": 0.0, "loss/logits": 0.2152877151966095, "step": 6708 }, { "epoch": 0.419375, "grad_norm": 2.34375, "grad_norm_var": 0.010838826497395834, "learning_rate": 0.0001, "loss": 7.3518, "loss/crossentropy": 2.1663570404052734, "loss/hidden": 2.7734375, "loss/jsd": 0.0, "loss/logits": 0.20017527043819427, "step": 6710 }, { "epoch": 0.4195, "grad_norm": 2.015625, "grad_norm_var": 0.013109334309895833, "learning_rate": 0.0001, "loss": 7.2772, "loss/crossentropy": 2.153021812438965, "loss/hidden": 2.796875, "loss/jsd": 0.0, "loss/logits": 0.21046847850084305, "step": 6712 }, { "epoch": 0.419625, "grad_norm": 2.140625, "grad_norm_var": 0.011031087239583333, "learning_rate": 0.0001, "loss": 7.2319, "loss/crossentropy": 2.362241506576538, "loss/hidden": 2.8359375, "loss/jsd": 0.0, "loss/logits": 0.21138739585876465, "step": 6714 }, { "epoch": 0.41975, "grad_norm": 2.125, "grad_norm_var": 0.006669108072916667, "learning_rate": 0.0001, "loss": 7.2814, "loss/crossentropy": 2.3483238220214844, "loss/hidden": 2.8671875, "loss/jsd": 0.0, "loss/logits": 0.21662617474794388, "step": 6716 }, { "epoch": 0.419875, "grad_norm": 2.390625, "grad_norm_var": 0.009781901041666667, "learning_rate": 0.0001, "loss": 7.0603, "loss/crossentropy": 2.3899463415145874, "loss/hidden": 2.7734375, "loss/jsd": 0.0, "loss/logits": 0.20360279828310013, "step": 6718 }, { "epoch": 0.42, "grad_norm": 2.125, "grad_norm_var": 0.010868326822916666, "learning_rate": 0.0001, "loss": 7.1463, "loss/crossentropy": 1.9754029512405396, "loss/hidden": 2.796875, "loss/jsd": 0.0, "loss/logits": 0.17641819268465042, "step": 6720 }, { "epoch": 0.420125, "grad_norm": 2.0625, "grad_norm_var": 0.011359659830729167, "learning_rate": 0.0001, "loss": 7.3308, "loss/crossentropy": 2.39111590385437, "loss/hidden": 2.8359375, "loss/jsd": 0.0, "loss/logits": 0.21659883856773376, "step": 6722 }, { "epoch": 0.42025, "grad_norm": 2.109375, "grad_norm_var": 0.014902496337890625, "learning_rate": 0.0001, "loss": 7.0377, "loss/crossentropy": 2.3440463542938232, "loss/hidden": 2.7734375, "loss/jsd": 0.0, "loss/logits": 0.20923910290002823, "step": 6724 }, { "epoch": 0.420375, "grad_norm": 2.140625, "grad_norm_var": 0.012416330973307292, "learning_rate": 0.0001, "loss": 7.2015, "loss/crossentropy": 2.3062779903411865, "loss/hidden": 2.859375, "loss/jsd": 0.0, "loss/logits": 0.2264929711818695, "step": 6726 }, { "epoch": 0.4205, "grad_norm": 2.03125, "grad_norm_var": 0.011045074462890625, "learning_rate": 0.0001, "loss": 7.2369, "loss/crossentropy": 2.1152766346931458, "loss/hidden": 2.9609375, "loss/jsd": 0.0, "loss/logits": 0.20576580613851547, "step": 6728 }, { "epoch": 0.420625, "grad_norm": 2.1875, "grad_norm_var": 0.011270904541015625, "learning_rate": 0.0001, "loss": 7.2384, "loss/crossentropy": 2.3229598999023438, "loss/hidden": 2.75, "loss/jsd": 0.0, "loss/logits": 0.2038460299372673, "step": 6730 }, { "epoch": 0.42075, "grad_norm": 2.078125, "grad_norm_var": 0.010459136962890626, "learning_rate": 0.0001, "loss": 7.1322, "loss/crossentropy": 2.3162847757339478, "loss/hidden": 2.921875, "loss/jsd": 0.0, "loss/logits": 0.21878770738840103, "step": 6732 }, { "epoch": 0.420875, "grad_norm": 2.40625, "grad_norm_var": 0.013791656494140625, "learning_rate": 0.0001, "loss": 7.3428, "loss/crossentropy": 2.2077457904815674, "loss/hidden": 2.765625, "loss/jsd": 0.0, "loss/logits": 0.19963285326957703, "step": 6734 }, { "epoch": 0.421, "grad_norm": 1.984375, "grad_norm_var": 0.014994049072265625, "learning_rate": 0.0001, "loss": 7.1156, "loss/crossentropy": 2.162528872489929, "loss/hidden": 2.78125, "loss/jsd": 0.0, "loss/logits": 0.201595239341259, "step": 6736 }, { "epoch": 0.421125, "grad_norm": 2.0625, "grad_norm_var": 0.015634918212890626, "learning_rate": 0.0001, "loss": 7.1029, "loss/crossentropy": 1.9828236103057861, "loss/hidden": 2.8203125, "loss/jsd": 0.0, "loss/logits": 0.19044172018766403, "step": 6738 }, { "epoch": 0.42125, "grad_norm": 2.21875, "grad_norm_var": 0.0149078369140625, "learning_rate": 0.0001, "loss": 7.2027, "loss/crossentropy": 2.165103793144226, "loss/hidden": 2.953125, "loss/jsd": 0.0, "loss/logits": 0.21535291522741318, "step": 6740 }, { "epoch": 0.421375, "grad_norm": 1.9296875, "grad_norm_var": 0.01815973917643229, "learning_rate": 0.0001, "loss": 7.219, "loss/crossentropy": 2.0478169918060303, "loss/hidden": 2.765625, "loss/jsd": 0.0, "loss/logits": 0.20827841013669968, "step": 6742 }, { "epoch": 0.4215, "grad_norm": 2.125, "grad_norm_var": 0.01746393839518229, "learning_rate": 0.0001, "loss": 7.1333, "loss/crossentropy": 2.147496223449707, "loss/hidden": 2.890625, "loss/jsd": 0.0, "loss/logits": 0.21103401482105255, "step": 6744 }, { "epoch": 0.421625, "grad_norm": 2.140625, "grad_norm_var": 0.01730524698893229, "learning_rate": 0.0001, "loss": 7.0481, "loss/crossentropy": 2.247206687927246, "loss/hidden": 2.8046875, "loss/jsd": 0.0, "loss/logits": 0.2090415209531784, "step": 6746 }, { "epoch": 0.42175, "grad_norm": 2.0, "grad_norm_var": 0.0195953369140625, "learning_rate": 0.0001, "loss": 6.993, "loss/crossentropy": 2.2320778369903564, "loss/hidden": 2.8046875, "loss/jsd": 0.0, "loss/logits": 0.20059379935264587, "step": 6748 }, { "epoch": 0.421875, "grad_norm": 2.234375, "grad_norm_var": 0.01278076171875, "learning_rate": 0.0001, "loss": 7.2886, "loss/crossentropy": 2.1443710327148438, "loss/hidden": 2.8046875, "loss/jsd": 0.0, "loss/logits": 0.19475404173135757, "step": 6750 }, { "epoch": 0.422, "grad_norm": 2.296875, "grad_norm_var": 0.015868123372395834, "learning_rate": 0.0001, "loss": 7.1489, "loss/crossentropy": 2.279394507408142, "loss/hidden": 2.796875, "loss/jsd": 0.0, "loss/logits": 0.20637677609920502, "step": 6752 }, { "epoch": 0.422125, "grad_norm": 2.078125, "grad_norm_var": 0.014835611979166666, "learning_rate": 0.0001, "loss": 7.0726, "loss/crossentropy": 1.897760272026062, "loss/hidden": 2.796875, "loss/jsd": 0.0, "loss/logits": 0.2159346044063568, "step": 6754 }, { "epoch": 0.42225, "grad_norm": 2.109375, "grad_norm_var": 0.012621053059895833, "learning_rate": 0.0001, "loss": 7.1816, "loss/crossentropy": 2.283962607383728, "loss/hidden": 2.859375, "loss/jsd": 0.0, "loss/logits": 0.23135912418365479, "step": 6756 }, { "epoch": 0.422375, "grad_norm": 2.140625, "grad_norm_var": 0.009423573811848959, "learning_rate": 0.0001, "loss": 7.1644, "loss/crossentropy": 2.388902187347412, "loss/hidden": 2.828125, "loss/jsd": 0.0, "loss/logits": 0.21581237763166428, "step": 6758 }, { "epoch": 0.4225, "grad_norm": 2.0625, "grad_norm_var": 0.010473378499348958, "learning_rate": 0.0001, "loss": 7.1085, "loss/crossentropy": 2.092213809490204, "loss/hidden": 2.8203125, "loss/jsd": 0.0, "loss/logits": 0.20688287913799286, "step": 6760 }, { "epoch": 0.422625, "grad_norm": 2.28125, "grad_norm_var": 0.011773427327473959, "learning_rate": 0.0001, "loss": 7.0663, "loss/crossentropy": 2.2318451404571533, "loss/hidden": 2.828125, "loss/jsd": 0.0, "loss/logits": 0.19159415364265442, "step": 6762 }, { "epoch": 0.42275, "grad_norm": 2.265625, "grad_norm_var": 0.009723917643229166, "learning_rate": 0.0001, "loss": 7.0906, "loss/crossentropy": 2.011130690574646, "loss/hidden": 2.828125, "loss/jsd": 0.0, "loss/logits": 0.20954276621341705, "step": 6764 }, { "epoch": 0.422875, "grad_norm": 2.078125, "grad_norm_var": 0.0102935791015625, "learning_rate": 0.0001, "loss": 7.2399, "loss/crossentropy": 2.238163113594055, "loss/hidden": 2.8046875, "loss/jsd": 0.0, "loss/logits": 0.19764646142721176, "step": 6766 }, { "epoch": 0.423, "grad_norm": 1.921875, "grad_norm_var": 0.010285441080729167, "learning_rate": 0.0001, "loss": 7.0323, "loss/crossentropy": 2.180557131767273, "loss/hidden": 2.765625, "loss/jsd": 0.0, "loss/logits": 0.19082503020763397, "step": 6768 }, { "epoch": 0.423125, "grad_norm": 2.328125, "grad_norm_var": 0.012702433268229167, "learning_rate": 0.0001, "loss": 7.197, "loss/crossentropy": 2.274542212486267, "loss/hidden": 2.8359375, "loss/jsd": 0.0, "loss/logits": 0.21664000302553177, "step": 6770 }, { "epoch": 0.42325, "grad_norm": 2.046875, "grad_norm_var": 0.027489217122395833, "learning_rate": 0.0001, "loss": 7.0367, "loss/crossentropy": 2.271838068962097, "loss/hidden": 2.828125, "loss/jsd": 0.0, "loss/logits": 0.21325764805078506, "step": 6772 }, { "epoch": 0.423375, "grad_norm": 2.171875, "grad_norm_var": 0.030661773681640626, "learning_rate": 0.0001, "loss": 7.257, "loss/crossentropy": 2.0293312668800354, "loss/hidden": 2.7578125, "loss/jsd": 0.0, "loss/logits": 0.19651466608047485, "step": 6774 }, { "epoch": 0.4235, "grad_norm": 2.265625, "grad_norm_var": 0.030142974853515626, "learning_rate": 0.0001, "loss": 7.1123, "loss/crossentropy": 2.3418461084365845, "loss/hidden": 2.8828125, "loss/jsd": 0.0, "loss/logits": 0.2213895097374916, "step": 6776 }, { "epoch": 0.423625, "grad_norm": 2.234375, "grad_norm_var": 0.030594635009765624, "learning_rate": 0.0001, "loss": 7.2606, "loss/crossentropy": 2.3157602548599243, "loss/hidden": 3.0, "loss/jsd": 0.0, "loss/logits": 0.23133724927902222, "step": 6778 }, { "epoch": 0.42375, "grad_norm": 2.015625, "grad_norm_var": 0.031288401285807295, "learning_rate": 0.0001, "loss": 7.0713, "loss/crossentropy": 2.2262462377548218, "loss/hidden": 2.84375, "loss/jsd": 0.0, "loss/logits": 0.21064256131649017, "step": 6780 }, { "epoch": 0.423875, "grad_norm": 2.234375, "grad_norm_var": 0.03070856730143229, "learning_rate": 0.0001, "loss": 7.2447, "loss/crossentropy": 2.6724666357040405, "loss/hidden": 2.9140625, "loss/jsd": 0.0, "loss/logits": 0.2289513200521469, "step": 6782 }, { "epoch": 0.424, "grad_norm": 3.84375, "grad_norm_var": 0.1964617411295573, "learning_rate": 0.0001, "loss": 7.2904, "loss/crossentropy": 2.3239933252334595, "loss/hidden": 2.8671875, "loss/jsd": 0.0, "loss/logits": 0.21807952970266342, "step": 6784 }, { "epoch": 0.424125, "grad_norm": 2.484375, "grad_norm_var": 0.20003433227539064, "learning_rate": 0.0001, "loss": 7.2351, "loss/crossentropy": 2.0945045948028564, "loss/hidden": 2.7734375, "loss/jsd": 0.0, "loss/logits": 0.21246600896120071, "step": 6786 }, { "epoch": 0.42425, "grad_norm": 2.140625, "grad_norm_var": 0.19189224243164063, "learning_rate": 0.0001, "loss": 7.0303, "loss/crossentropy": 2.0538535118103027, "loss/hidden": 2.875, "loss/jsd": 0.0, "loss/logits": 0.19015134125947952, "step": 6788 }, { "epoch": 0.424375, "grad_norm": 2.03125, "grad_norm_var": 0.18770243326822916, "learning_rate": 0.0001, "loss": 7.107, "loss/crossentropy": 2.1239798069000244, "loss/hidden": 2.84375, "loss/jsd": 0.0, "loss/logits": 0.2081196829676628, "step": 6790 }, { "epoch": 0.4245, "grad_norm": 2.078125, "grad_norm_var": 0.19275614420572917, "learning_rate": 0.0001, "loss": 7.1894, "loss/crossentropy": 2.553703784942627, "loss/hidden": 2.9609375, "loss/jsd": 0.0, "loss/logits": 0.20920544862747192, "step": 6792 }, { "epoch": 0.424625, "grad_norm": 2.5, "grad_norm_var": 0.19491780598958333, "learning_rate": 0.0001, "loss": 7.2841, "loss/crossentropy": 2.5824981927871704, "loss/hidden": 2.921875, "loss/jsd": 0.0, "loss/logits": 0.21979796141386032, "step": 6794 }, { "epoch": 0.42475, "grad_norm": 2.046875, "grad_norm_var": 0.19247945149739584, "learning_rate": 0.0001, "loss": 7.0355, "loss/crossentropy": 2.185934007167816, "loss/hidden": 2.796875, "loss/jsd": 0.0, "loss/logits": 0.19819935411214828, "step": 6796 }, { "epoch": 0.424875, "grad_norm": 2.21875, "grad_norm_var": 0.18901265462239583, "learning_rate": 0.0001, "loss": 7.3111, "loss/crossentropy": 2.1742522716522217, "loss/hidden": 2.96875, "loss/jsd": 0.0, "loss/logits": 0.21659693121910095, "step": 6798 }, { "epoch": 0.425, "grad_norm": 2.09375, "grad_norm_var": 0.026676432291666666, "learning_rate": 0.0001, "loss": 7.2202, "loss/crossentropy": 2.3170909881591797, "loss/hidden": 2.84375, "loss/jsd": 0.0, "loss/logits": 0.21167440712451935, "step": 6800 }, { "epoch": 0.425125, "grad_norm": 2.15625, "grad_norm_var": 0.015461222330729166, "learning_rate": 0.0001, "loss": 7.229, "loss/crossentropy": 2.2571088075637817, "loss/hidden": 2.7734375, "loss/jsd": 0.0, "loss/logits": 0.21498262882232666, "step": 6802 }, { "epoch": 0.42525, "grad_norm": 2.3125, "grad_norm_var": 0.016535441080729168, "learning_rate": 0.0001, "loss": 7.127, "loss/crossentropy": 2.3108266592025757, "loss/hidden": 2.84375, "loss/jsd": 0.0, "loss/logits": 0.21807894110679626, "step": 6804 }, { "epoch": 0.425375, "grad_norm": 2.046875, "grad_norm_var": 0.0168853759765625, "learning_rate": 0.0001, "loss": 7.2834, "loss/crossentropy": 2.15952205657959, "loss/hidden": 2.796875, "loss/jsd": 0.0, "loss/logits": 0.19036553800106049, "step": 6806 }, { "epoch": 0.4255, "grad_norm": 2.140625, "grad_norm_var": 0.016356404622395834, "learning_rate": 0.0001, "loss": 7.2448, "loss/crossentropy": 2.15840220451355, "loss/hidden": 2.8046875, "loss/jsd": 0.0, "loss/logits": 0.20849886536598206, "step": 6808 }, { "epoch": 0.425625, "grad_norm": 2.078125, "grad_norm_var": 0.008039347330729167, "learning_rate": 0.0001, "loss": 7.2537, "loss/crossentropy": 2.4403945207595825, "loss/hidden": 2.8828125, "loss/jsd": 0.0, "loss/logits": 0.2224520891904831, "step": 6810 }, { "epoch": 0.42575, "grad_norm": 2.15625, "grad_norm_var": 0.0072743733723958336, "learning_rate": 0.0001, "loss": 7.1626, "loss/crossentropy": 2.308092713356018, "loss/hidden": 2.84375, "loss/jsd": 0.0, "loss/logits": 0.20341620594263077, "step": 6812 }, { "epoch": 0.425875, "grad_norm": 2.3125, "grad_norm_var": 0.009015909830729167, "learning_rate": 0.0001, "loss": 7.0819, "loss/crossentropy": 2.172079086303711, "loss/hidden": 2.875, "loss/jsd": 0.0, "loss/logits": 0.21274089068174362, "step": 6814 }, { "epoch": 0.426, "grad_norm": 2.140625, "grad_norm_var": 0.012919108072916666, "learning_rate": 0.0001, "loss": 7.3531, "loss/crossentropy": 2.1660048365592957, "loss/hidden": 2.796875, "loss/jsd": 0.0, "loss/logits": 0.21018584817647934, "step": 6816 }, { "epoch": 0.426125, "grad_norm": 2.1875, "grad_norm_var": 0.013460286458333333, "learning_rate": 0.0001, "loss": 7.1127, "loss/crossentropy": 2.2449586391448975, "loss/hidden": 2.8359375, "loss/jsd": 0.0, "loss/logits": 0.21250271797180176, "step": 6818 }, { "epoch": 0.42625, "grad_norm": 2.109375, "grad_norm_var": 0.012626139322916667, "learning_rate": 0.0001, "loss": 7.1219, "loss/crossentropy": 2.40755295753479, "loss/hidden": 2.8046875, "loss/jsd": 0.0, "loss/logits": 0.2143237292766571, "step": 6820 }, { "epoch": 0.426375, "grad_norm": 2.125, "grad_norm_var": 0.0127349853515625, "learning_rate": 0.0001, "loss": 7.1712, "loss/crossentropy": 2.077523171901703, "loss/hidden": 2.8671875, "loss/jsd": 0.0, "loss/logits": 0.21024423092603683, "step": 6822 }, { "epoch": 0.4265, "grad_norm": 2.140625, "grad_norm_var": 0.01285400390625, "learning_rate": 0.0001, "loss": 7.1869, "loss/crossentropy": 2.2102068662643433, "loss/hidden": 2.8359375, "loss/jsd": 0.0, "loss/logits": 0.22490687668323517, "step": 6824 }, { "epoch": 0.426625, "grad_norm": 2.21875, "grad_norm_var": 0.0135162353515625, "learning_rate": 0.0001, "loss": 7.144, "loss/crossentropy": 2.5356907844543457, "loss/hidden": 2.8046875, "loss/jsd": 0.0, "loss/logits": 0.2128807231783867, "step": 6826 }, { "epoch": 0.42675, "grad_norm": 1.984375, "grad_norm_var": 0.016258748372395833, "learning_rate": 0.0001, "loss": 7.1871, "loss/crossentropy": 2.4186887741088867, "loss/hidden": 2.8359375, "loss/jsd": 0.0, "loss/logits": 0.2111327275633812, "step": 6828 }, { "epoch": 0.426875, "grad_norm": 2.296875, "grad_norm_var": 0.0157623291015625, "learning_rate": 0.0001, "loss": 7.2081, "loss/crossentropy": 2.4064241647720337, "loss/hidden": 2.921875, "loss/jsd": 0.0, "loss/logits": 0.22412901371717453, "step": 6830 }, { "epoch": 0.427, "grad_norm": 2.046875, "grad_norm_var": 0.0109527587890625, "learning_rate": 0.0001, "loss": 7.1116, "loss/crossentropy": 2.1347469091415405, "loss/hidden": 2.8515625, "loss/jsd": 0.0, "loss/logits": 0.1934138983488083, "step": 6832 }, { "epoch": 0.427125, "grad_norm": 2.40625, "grad_norm_var": 0.012376912434895833, "learning_rate": 0.0001, "loss": 7.0814, "loss/crossentropy": 2.5839143991470337, "loss/hidden": 2.859375, "loss/jsd": 0.0, "loss/logits": 0.22045065462589264, "step": 6834 }, { "epoch": 0.42725, "grad_norm": 1.96875, "grad_norm_var": 0.014615885416666667, "learning_rate": 0.0001, "loss": 7.2174, "loss/crossentropy": 2.4431090354919434, "loss/hidden": 2.8515625, "loss/jsd": 0.0, "loss/logits": 0.2187180370092392, "step": 6836 }, { "epoch": 0.427375, "grad_norm": 2.234375, "grad_norm_var": 0.014191691080729167, "learning_rate": 0.0001, "loss": 7.2399, "loss/crossentropy": 2.3245296478271484, "loss/hidden": 2.7890625, "loss/jsd": 0.0, "loss/logits": 0.21662010997533798, "step": 6838 }, { "epoch": 0.4275, "grad_norm": 1.96875, "grad_norm_var": 0.016559855143229166, "learning_rate": 0.0001, "loss": 7.0713, "loss/crossentropy": 2.178970217704773, "loss/hidden": 2.890625, "loss/jsd": 0.0, "loss/logits": 0.21533022820949554, "step": 6840 }, { "epoch": 0.427625, "grad_norm": 2.109375, "grad_norm_var": 0.0154205322265625, "learning_rate": 0.0001, "loss": 7.0306, "loss/crossentropy": 2.044509172439575, "loss/hidden": 2.8046875, "loss/jsd": 0.0, "loss/logits": 0.19547070562839508, "step": 6842 }, { "epoch": 0.42775, "grad_norm": 2.046875, "grad_norm_var": 0.0145904541015625, "learning_rate": 0.0001, "loss": 7.1904, "loss/crossentropy": 2.6852035522460938, "loss/hidden": 2.8671875, "loss/jsd": 0.0, "loss/logits": 0.23019836843013763, "step": 6844 }, { "epoch": 0.427875, "grad_norm": 2.140625, "grad_norm_var": 0.011649576822916667, "learning_rate": 0.0001, "loss": 7.1795, "loss/crossentropy": 2.404147505760193, "loss/hidden": 2.84375, "loss/jsd": 0.0, "loss/logits": 0.21275443583726883, "step": 6846 }, { "epoch": 0.428, "grad_norm": 2.296875, "grad_norm_var": 0.013118489583333334, "learning_rate": 0.0001, "loss": 7.1491, "loss/crossentropy": 2.30082368850708, "loss/hidden": 2.8046875, "loss/jsd": 0.0, "loss/logits": 0.20821545273065567, "step": 6848 }, { "epoch": 0.428125, "grad_norm": 2.140625, "grad_norm_var": 0.0087066650390625, "learning_rate": 0.0001, "loss": 7.243, "loss/crossentropy": 2.41938316822052, "loss/hidden": 2.875, "loss/jsd": 0.0, "loss/logits": 0.2598809227347374, "step": 6850 }, { "epoch": 0.42825, "grad_norm": 2.1875, "grad_norm_var": 0.007552083333333333, "learning_rate": 0.0001, "loss": 7.1086, "loss/crossentropy": 2.22542405128479, "loss/hidden": 2.7265625, "loss/jsd": 0.0, "loss/logits": 0.2023932784795761, "step": 6852 }, { "epoch": 0.428375, "grad_norm": 2.1875, "grad_norm_var": 0.010448201497395834, "learning_rate": 0.0001, "loss": 7.2592, "loss/crossentropy": 2.057928144931793, "loss/hidden": 2.7578125, "loss/jsd": 0.0, "loss/logits": 0.1930096223950386, "step": 6854 }, { "epoch": 0.4285, "grad_norm": 2.0, "grad_norm_var": 0.010676066080729166, "learning_rate": 0.0001, "loss": 7.1224, "loss/crossentropy": 2.1819591522216797, "loss/hidden": 2.84375, "loss/jsd": 0.0, "loss/logits": 0.1996973529458046, "step": 6856 }, { "epoch": 0.428625, "grad_norm": 2.109375, "grad_norm_var": 0.010773722330729167, "learning_rate": 0.0001, "loss": 7.1747, "loss/crossentropy": 2.0980719327926636, "loss/hidden": 2.796875, "loss/jsd": 0.0, "loss/logits": 0.22104358673095703, "step": 6858 }, { "epoch": 0.42875, "grad_norm": 2.15625, "grad_norm_var": 0.016243489583333333, "learning_rate": 0.0001, "loss": 6.913, "loss/crossentropy": 2.1513818502426147, "loss/hidden": 2.859375, "loss/jsd": 0.0, "loss/logits": 0.21680350601673126, "step": 6860 }, { "epoch": 0.428875, "grad_norm": 2.59375, "grad_norm_var": 0.0395904541015625, "learning_rate": 0.0001, "loss": 7.2397, "loss/crossentropy": 2.2294562458992004, "loss/hidden": 2.84375, "loss/jsd": 0.0, "loss/logits": 0.22388208657503128, "step": 6862 }, { "epoch": 0.429, "grad_norm": 2.171875, "grad_norm_var": 0.03950093587239583, "learning_rate": 0.0001, "loss": 7.0361, "loss/crossentropy": 2.283179521560669, "loss/hidden": 2.796875, "loss/jsd": 0.0, "loss/logits": 0.21150479465723038, "step": 6864 }, { "epoch": 0.429125, "grad_norm": 2.125, "grad_norm_var": 0.03980712890625, "learning_rate": 0.0001, "loss": 7.1224, "loss/crossentropy": 2.3938897848129272, "loss/hidden": 2.84375, "loss/jsd": 0.0, "loss/logits": 0.2365514636039734, "step": 6866 }, { "epoch": 0.42925, "grad_norm": 2.328125, "grad_norm_var": 0.0417877197265625, "learning_rate": 0.0001, "loss": 7.1785, "loss/crossentropy": 2.1658443212509155, "loss/hidden": 2.8203125, "loss/jsd": 0.0, "loss/logits": 0.19529584795236588, "step": 6868 }, { "epoch": 0.429375, "grad_norm": 2.046875, "grad_norm_var": 0.04097900390625, "learning_rate": 0.0001, "loss": 7.1715, "loss/crossentropy": 2.0922370553016663, "loss/hidden": 2.75, "loss/jsd": 0.0, "loss/logits": 0.19775715470314026, "step": 6870 }, { "epoch": 0.4295, "grad_norm": 2.03125, "grad_norm_var": 0.03996480305989583, "learning_rate": 0.0001, "loss": 7.0673, "loss/crossentropy": 1.8662742376327515, "loss/hidden": 2.8359375, "loss/jsd": 0.0, "loss/logits": 0.2048928141593933, "step": 6872 }, { "epoch": 0.429625, "grad_norm": 2.15625, "grad_norm_var": 0.039990234375, "learning_rate": 0.0001, "loss": 7.2334, "loss/crossentropy": 2.25331974029541, "loss/hidden": 2.8828125, "loss/jsd": 0.0, "loss/logits": 0.2132648006081581, "step": 6874 }, { "epoch": 0.42975, "grad_norm": 2.171875, "grad_norm_var": 0.03132222493489583, "learning_rate": 0.0001, "loss": 7.0384, "loss/crossentropy": 2.0938963890075684, "loss/hidden": 2.796875, "loss/jsd": 0.0, "loss/logits": 0.19403553754091263, "step": 6876 }, { "epoch": 0.429875, "grad_norm": 2.171875, "grad_norm_var": 0.0096588134765625, "learning_rate": 0.0001, "loss": 7.1117, "loss/crossentropy": 2.201113998889923, "loss/hidden": 2.8046875, "loss/jsd": 0.0, "loss/logits": 0.2137242630124092, "step": 6878 }, { "epoch": 0.43, "grad_norm": 2.0625, "grad_norm_var": 0.009700520833333334, "learning_rate": 0.0001, "loss": 7.1669, "loss/crossentropy": 2.0272424817085266, "loss/hidden": 2.859375, "loss/jsd": 0.0, "loss/logits": 0.19501972198486328, "step": 6880 }, { "epoch": 0.430125, "grad_norm": 2.140625, "grad_norm_var": 0.009423828125, "learning_rate": 0.0001, "loss": 7.2121, "loss/crossentropy": 2.506343126296997, "loss/hidden": 2.90625, "loss/jsd": 0.0, "loss/logits": 0.22914932668209076, "step": 6882 }, { "epoch": 0.43025, "grad_norm": 2.109375, "grad_norm_var": 0.004011027018229167, "learning_rate": 0.0001, "loss": 7.2302, "loss/crossentropy": 2.3738802671432495, "loss/hidden": 2.875, "loss/jsd": 0.0, "loss/logits": 0.21538487076759338, "step": 6884 }, { "epoch": 0.430375, "grad_norm": 2.125, "grad_norm_var": 0.0032063802083333332, "learning_rate": 0.0001, "loss": 7.0062, "loss/crossentropy": 2.2716506719589233, "loss/hidden": 2.7890625, "loss/jsd": 0.0, "loss/logits": 0.20168288797140121, "step": 6886 }, { "epoch": 0.4305, "grad_norm": 2.046875, "grad_norm_var": 0.0031412760416666668, "learning_rate": 0.0001, "loss": 7.0321, "loss/crossentropy": 2.131265103816986, "loss/hidden": 2.8125, "loss/jsd": 0.0, "loss/logits": 0.19480124861001968, "step": 6888 }, { "epoch": 0.430625, "grad_norm": 2.140625, "grad_norm_var": 0.005765533447265625, "learning_rate": 0.0001, "loss": 7.0555, "loss/crossentropy": 2.56924045085907, "loss/hidden": 2.8125, "loss/jsd": 0.0, "loss/logits": 0.21264956146478653, "step": 6890 }, { "epoch": 0.43075, "grad_norm": 2.15625, "grad_norm_var": 0.005863189697265625, "learning_rate": 0.0001, "loss": 7.29, "loss/crossentropy": 2.1670228242874146, "loss/hidden": 2.9140625, "loss/jsd": 0.0, "loss/logits": 0.2205064371228218, "step": 6892 }, { "epoch": 0.430875, "grad_norm": 2.078125, "grad_norm_var": 0.010558827718098959, "learning_rate": 0.0001, "loss": 7.0508, "loss/crossentropy": 2.2021427154541016, "loss/hidden": 2.828125, "loss/jsd": 0.0, "loss/logits": 0.21743560582399368, "step": 6894 }, { "epoch": 0.431, "grad_norm": 2.28125, "grad_norm_var": 0.013138580322265624, "learning_rate": 0.0001, "loss": 7.1581, "loss/crossentropy": 2.3325300216674805, "loss/hidden": 2.890625, "loss/jsd": 0.0, "loss/logits": 0.22179429978132248, "step": 6896 }, { "epoch": 0.431125, "grad_norm": 2.140625, "grad_norm_var": 0.013879140218098959, "learning_rate": 0.0001, "loss": 7.134, "loss/crossentropy": 2.150698184967041, "loss/hidden": 2.796875, "loss/jsd": 0.0, "loss/logits": 0.21261726319789886, "step": 6898 }, { "epoch": 0.43125, "grad_norm": 2.359375, "grad_norm_var": 0.016928863525390626, "learning_rate": 0.0001, "loss": 7.0722, "loss/crossentropy": 2.262513518333435, "loss/hidden": 2.8515625, "loss/jsd": 0.0, "loss/logits": 0.2029724195599556, "step": 6900 }, { "epoch": 0.431375, "grad_norm": 2.015625, "grad_norm_var": 0.01788304646809896, "learning_rate": 0.0001, "loss": 7.0642, "loss/crossentropy": 2.2124452590942383, "loss/hidden": 2.78125, "loss/jsd": 0.0, "loss/logits": 0.20887747406959534, "step": 6902 }, { "epoch": 0.4315, "grad_norm": 2.3125, "grad_norm_var": 0.018552398681640624, "learning_rate": 0.0001, "loss": 7.3876, "loss/crossentropy": 2.4545419216156006, "loss/hidden": 2.890625, "loss/jsd": 0.0, "loss/logits": 0.263492152094841, "step": 6904 }, { "epoch": 0.431625, "grad_norm": 2.265625, "grad_norm_var": 0.016487630208333333, "learning_rate": 0.0001, "loss": 7.1829, "loss/crossentropy": 2.244446575641632, "loss/hidden": 2.7890625, "loss/jsd": 0.0, "loss/logits": 0.20837052166461945, "step": 6906 }, { "epoch": 0.43175, "grad_norm": 2.09375, "grad_norm_var": 0.016600545247395834, "learning_rate": 0.0001, "loss": 7.1886, "loss/crossentropy": 2.059956908226013, "loss/hidden": 2.71875, "loss/jsd": 0.0, "loss/logits": 0.18521716445684433, "step": 6908 }, { "epoch": 0.431875, "grad_norm": 2.296875, "grad_norm_var": 0.010380045572916666, "learning_rate": 0.0001, "loss": 7.0432, "loss/crossentropy": 2.323797821998596, "loss/hidden": 2.8046875, "loss/jsd": 0.0, "loss/logits": 0.20761998742818832, "step": 6910 }, { "epoch": 0.432, "grad_norm": 2.09375, "grad_norm_var": 0.010236612955729167, "learning_rate": 0.0001, "loss": 7.216, "loss/crossentropy": 2.1744368076324463, "loss/hidden": 2.78125, "loss/jsd": 0.0, "loss/logits": 0.1985497921705246, "step": 6912 }, { "epoch": 0.432125, "grad_norm": 2.15625, "grad_norm_var": 0.009935506184895833, "learning_rate": 0.0001, "loss": 7.0865, "loss/crossentropy": 2.1344690322875977, "loss/hidden": 2.8828125, "loss/jsd": 0.0, "loss/logits": 0.22802872955799103, "step": 6914 }, { "epoch": 0.43225, "grad_norm": 2.09375, "grad_norm_var": 0.007340494791666667, "learning_rate": 0.0001, "loss": 7.1136, "loss/crossentropy": 2.186421036720276, "loss/hidden": 2.78125, "loss/jsd": 0.0, "loss/logits": 0.19479546695947647, "step": 6916 }, { "epoch": 0.432375, "grad_norm": 2.203125, "grad_norm_var": 0.007942708333333333, "learning_rate": 0.0001, "loss": 7.1482, "loss/crossentropy": 1.894170880317688, "loss/hidden": 2.8046875, "loss/jsd": 0.0, "loss/logits": 0.18849200010299683, "step": 6918 }, { "epoch": 0.4325, "grad_norm": 2.375, "grad_norm_var": 0.0094146728515625, "learning_rate": 0.0001, "loss": 7.3064, "loss/crossentropy": 2.443792700767517, "loss/hidden": 2.8671875, "loss/jsd": 0.0, "loss/logits": 0.22042584419250488, "step": 6920 }, { "epoch": 0.432625, "grad_norm": 2.046875, "grad_norm_var": 0.012040201822916667, "learning_rate": 0.0001, "loss": 7.1741, "loss/crossentropy": 2.199779987335205, "loss/hidden": 2.8125, "loss/jsd": 0.0, "loss/logits": 0.21045687794685364, "step": 6922 }, { "epoch": 0.43275, "grad_norm": 2.09375, "grad_norm_var": 0.012369791666666666, "learning_rate": 0.0001, "loss": 7.0816, "loss/crossentropy": 1.9734878540039062, "loss/hidden": 2.8203125, "loss/jsd": 0.0, "loss/logits": 0.18932150304317474, "step": 6924 }, { "epoch": 0.432875, "grad_norm": 2.015625, "grad_norm_var": 0.01383056640625, "learning_rate": 0.0001, "loss": 7.0272, "loss/crossentropy": 2.060529947280884, "loss/hidden": 2.796875, "loss/jsd": 0.0, "loss/logits": 0.20331456512212753, "step": 6926 }, { "epoch": 0.433, "grad_norm": 2.109375, "grad_norm_var": 0.013232421875, "learning_rate": 0.0001, "loss": 7.1574, "loss/crossentropy": 2.1172688007354736, "loss/hidden": 2.71875, "loss/jsd": 0.0, "loss/logits": 0.2042495608329773, "step": 6928 }, { "epoch": 0.433125, "grad_norm": 2.09375, "grad_norm_var": 0.013524373372395834, "learning_rate": 0.0001, "loss": 7.2431, "loss/crossentropy": 2.218605160713196, "loss/hidden": 2.7734375, "loss/jsd": 0.0, "loss/logits": 0.2152126282453537, "step": 6930 }, { "epoch": 0.43325, "grad_norm": 2.03125, "grad_norm_var": 0.014020792643229167, "learning_rate": 0.0001, "loss": 7.0866, "loss/crossentropy": 2.047509551048279, "loss/hidden": 2.78125, "loss/jsd": 0.0, "loss/logits": 0.19541727006435394, "step": 6932 }, { "epoch": 0.433375, "grad_norm": 2.046875, "grad_norm_var": 0.014922841389973959, "learning_rate": 0.0001, "loss": 7.0279, "loss/crossentropy": 2.1803908348083496, "loss/hidden": 2.8203125, "loss/jsd": 0.0, "loss/logits": 0.18999575078487396, "step": 6934 }, { "epoch": 0.4335, "grad_norm": 2.171875, "grad_norm_var": 0.009893544514973958, "learning_rate": 0.0001, "loss": 7.2116, "loss/crossentropy": 2.3334230184555054, "loss/hidden": 2.8359375, "loss/jsd": 0.0, "loss/logits": 0.2261931300163269, "step": 6936 }, { "epoch": 0.433625, "grad_norm": 2.234375, "grad_norm_var": 0.008129628499348958, "learning_rate": 0.0001, "loss": 7.1294, "loss/crossentropy": 2.2865262031555176, "loss/hidden": 2.84375, "loss/jsd": 0.0, "loss/logits": 0.21169719099998474, "step": 6938 }, { "epoch": 0.43375, "grad_norm": 2.046875, "grad_norm_var": 0.008837636311848958, "learning_rate": 0.0001, "loss": 6.9919, "loss/crossentropy": 2.046865701675415, "loss/hidden": 2.7890625, "loss/jsd": 0.0, "loss/logits": 0.19481410086154938, "step": 6940 }, { "epoch": 0.433875, "grad_norm": 2.25, "grad_norm_var": 0.009787750244140626, "learning_rate": 0.0001, "loss": 7.2462, "loss/crossentropy": 2.44638729095459, "loss/hidden": 2.859375, "loss/jsd": 0.0, "loss/logits": 0.22355614602565765, "step": 6942 }, { "epoch": 0.434, "grad_norm": 2.0, "grad_norm_var": 0.010550689697265626, "learning_rate": 0.0001, "loss": 7.078, "loss/crossentropy": 2.1784814596176147, "loss/hidden": 2.7734375, "loss/jsd": 0.0, "loss/logits": 0.20400924235582352, "step": 6944 }, { "epoch": 0.434125, "grad_norm": 2.171875, "grad_norm_var": 0.011446126302083333, "learning_rate": 0.0001, "loss": 7.0531, "loss/crossentropy": 2.4585882425308228, "loss/hidden": 2.8515625, "loss/jsd": 0.0, "loss/logits": 0.22780652344226837, "step": 6946 }, { "epoch": 0.43425, "grad_norm": 2.234375, "grad_norm_var": 0.01248779296875, "learning_rate": 0.0001, "loss": 7.1479, "loss/crossentropy": 2.1119790077209473, "loss/hidden": 2.875, "loss/jsd": 0.0, "loss/logits": 0.199091836810112, "step": 6948 }, { "epoch": 0.434375, "grad_norm": 2.25, "grad_norm_var": 0.009987131754557291, "learning_rate": 0.0001, "loss": 7.1265, "loss/crossentropy": 2.2907246351242065, "loss/hidden": 2.890625, "loss/jsd": 0.0, "loss/logits": 0.2269321158528328, "step": 6950 }, { "epoch": 0.4345, "grad_norm": 1.921875, "grad_norm_var": 0.013183339436848959, "learning_rate": 0.0001, "loss": 7.0487, "loss/crossentropy": 2.2639458179473877, "loss/hidden": 2.765625, "loss/jsd": 0.0, "loss/logits": 0.19958817213773727, "step": 6952 }, { "epoch": 0.434625, "grad_norm": 2.21875, "grad_norm_var": 0.013014475504557291, "learning_rate": 0.0001, "loss": 7.0662, "loss/crossentropy": 2.39899480342865, "loss/hidden": 2.8984375, "loss/jsd": 0.0, "loss/logits": 0.21808726340532303, "step": 6954 }, { "epoch": 0.43475, "grad_norm": 2.046875, "grad_norm_var": 0.015541330973307291, "learning_rate": 0.0001, "loss": 7.1265, "loss/crossentropy": 2.3878051042556763, "loss/hidden": 2.8671875, "loss/jsd": 0.0, "loss/logits": 0.21249045431613922, "step": 6956 }, { "epoch": 0.434875, "grad_norm": 2.203125, "grad_norm_var": 0.014817047119140624, "learning_rate": 0.0001, "loss": 7.0378, "loss/crossentropy": 2.0612010955810547, "loss/hidden": 2.8359375, "loss/jsd": 0.0, "loss/logits": 0.2078692466020584, "step": 6958 }, { "epoch": 0.435, "grad_norm": 2.0625, "grad_norm_var": 0.014743804931640625, "learning_rate": 0.0001, "loss": 7.1189, "loss/crossentropy": 2.1870445013046265, "loss/hidden": 2.828125, "loss/jsd": 0.0, "loss/logits": 0.20125596970319748, "step": 6960 }, { "epoch": 0.435125, "grad_norm": 2.546875, "grad_norm_var": 0.0228424072265625, "learning_rate": 0.0001, "loss": 7.1462, "loss/crossentropy": 2.318724036216736, "loss/hidden": 2.828125, "loss/jsd": 0.0, "loss/logits": 0.20482215285301208, "step": 6962 }, { "epoch": 0.43525, "grad_norm": 2.046875, "grad_norm_var": 0.0268951416015625, "learning_rate": 0.0001, "loss": 7.1296, "loss/crossentropy": 2.5011104345321655, "loss/hidden": 2.9140625, "loss/jsd": 0.0, "loss/logits": 0.22151333093643188, "step": 6964 }, { "epoch": 0.435375, "grad_norm": 2.109375, "grad_norm_var": 0.026887003580729166, "learning_rate": 0.0001, "loss": 7.1403, "loss/crossentropy": 2.384117841720581, "loss/hidden": 2.875, "loss/jsd": 0.0, "loss/logits": 0.21883900463581085, "step": 6966 }, { "epoch": 0.4355, "grad_norm": 2.3125, "grad_norm_var": 0.023705037434895833, "learning_rate": 0.0001, "loss": 7.0673, "loss/crossentropy": 2.29778790473938, "loss/hidden": 2.8359375, "loss/jsd": 0.0, "loss/logits": 0.21907077729701996, "step": 6968 }, { "epoch": 0.435625, "grad_norm": 2.0625, "grad_norm_var": 0.021540323893229168, "learning_rate": 0.0001, "loss": 7.1069, "loss/crossentropy": 2.1342278718948364, "loss/hidden": 2.765625, "loss/jsd": 0.0, "loss/logits": 0.18501774966716766, "step": 6970 }, { "epoch": 0.43575, "grad_norm": 2.265625, "grad_norm_var": 0.022297159830729166, "learning_rate": 0.0001, "loss": 7.2041, "loss/crossentropy": 2.4118359088897705, "loss/hidden": 2.8671875, "loss/jsd": 0.0, "loss/logits": 0.1985846757888794, "step": 6972 }, { "epoch": 0.435875, "grad_norm": 2.125, "grad_norm_var": 0.022435506184895832, "learning_rate": 0.0001, "loss": 7.1099, "loss/crossentropy": 2.274700164794922, "loss/hidden": 2.828125, "loss/jsd": 0.0, "loss/logits": 0.1986750066280365, "step": 6974 }, { "epoch": 0.436, "grad_norm": 2.078125, "grad_norm_var": 0.022736612955729166, "learning_rate": 0.0001, "loss": 7.0998, "loss/crossentropy": 2.030650019645691, "loss/hidden": 2.828125, "loss/jsd": 0.0, "loss/logits": 0.20323613286018372, "step": 6976 }, { "epoch": 0.436125, "grad_norm": 1.984375, "grad_norm_var": 0.017096964518229167, "learning_rate": 0.0001, "loss": 7.0212, "loss/crossentropy": 2.1681824922561646, "loss/hidden": 2.8046875, "loss/jsd": 0.0, "loss/logits": 0.19574403017759323, "step": 6978 }, { "epoch": 0.43625, "grad_norm": 2.21875, "grad_norm_var": 0.012287394205729166, "learning_rate": 0.0001, "loss": 7.3248, "loss/crossentropy": 2.2973861694335938, "loss/hidden": 2.8359375, "loss/jsd": 0.0, "loss/logits": 0.2214999794960022, "step": 6980 }, { "epoch": 0.436375, "grad_norm": 2.125, "grad_norm_var": 0.014435831705729167, "learning_rate": 0.0001, "loss": 7.0755, "loss/crossentropy": 2.2429606914520264, "loss/hidden": 2.875, "loss/jsd": 0.0, "loss/logits": 0.21233399212360382, "step": 6982 }, { "epoch": 0.4365, "grad_norm": 2.078125, "grad_norm_var": 0.013199869791666667, "learning_rate": 0.0001, "loss": 7.0234, "loss/crossentropy": 2.0870251655578613, "loss/hidden": 2.8046875, "loss/jsd": 0.0, "loss/logits": 0.18894104659557343, "step": 6984 }, { "epoch": 0.436625, "grad_norm": 2.390625, "grad_norm_var": 0.024022420247395832, "learning_rate": 0.0001, "loss": 7.1037, "loss/crossentropy": 1.8943498730659485, "loss/hidden": 2.859375, "loss/jsd": 0.0, "loss/logits": 0.19573025405406952, "step": 6986 }, { "epoch": 0.43675, "grad_norm": 1.953125, "grad_norm_var": 0.023616536458333334, "learning_rate": 0.0001, "loss": 7.2237, "loss/crossentropy": 2.5429954528808594, "loss/hidden": 2.90625, "loss/jsd": 0.0, "loss/logits": 0.23790296912193298, "step": 6988 }, { "epoch": 0.436875, "grad_norm": 2.296875, "grad_norm_var": 0.024690755208333335, "learning_rate": 0.0001, "loss": 7.1511, "loss/crossentropy": 2.0892491340637207, "loss/hidden": 2.734375, "loss/jsd": 0.0, "loss/logits": 0.19531705975532532, "step": 6990 }, { "epoch": 0.437, "grad_norm": 2.015625, "grad_norm_var": 0.024828084309895835, "learning_rate": 0.0001, "loss": 7.2602, "loss/crossentropy": 2.1488635540008545, "loss/hidden": 2.859375, "loss/jsd": 0.0, "loss/logits": 0.21692749112844467, "step": 6992 }, { "epoch": 0.437125, "grad_norm": 1.9453125, "grad_norm_var": 0.02623265584309896, "learning_rate": 0.0001, "loss": 7.1101, "loss/crossentropy": 2.3074164390563965, "loss/hidden": 2.8359375, "loss/jsd": 0.0, "loss/logits": 0.20840033143758774, "step": 6994 }, { "epoch": 0.43725, "grad_norm": 2.21875, "grad_norm_var": 0.026318105061848958, "learning_rate": 0.0001, "loss": 7.0916, "loss/crossentropy": 2.2144097089767456, "loss/hidden": 2.8125, "loss/jsd": 0.0, "loss/logits": 0.21698354184627533, "step": 6996 }, { "epoch": 0.437375, "grad_norm": 2.03125, "grad_norm_var": 0.02490208943684896, "learning_rate": 0.0001, "loss": 7.2198, "loss/crossentropy": 2.2563615441322327, "loss/hidden": 2.7890625, "loss/jsd": 0.0, "loss/logits": 0.1933683454990387, "step": 6998 }, { "epoch": 0.4375, "grad_norm": 2.390625, "grad_norm_var": 0.028254954020182292, "learning_rate": 0.0001, "loss": 7.2343, "loss/crossentropy": 2.388770818710327, "loss/hidden": 2.828125, "loss/jsd": 0.0, "loss/logits": 0.21556702256202698, "step": 7000 }, { "epoch": 0.437625, "grad_norm": 1.8515625, "grad_norm_var": 0.022370402018229166, "learning_rate": 0.0001, "loss": 7.0739, "loss/crossentropy": 2.278029680252075, "loss/hidden": 2.828125, "loss/jsd": 0.0, "loss/logits": 0.20425931364297867, "step": 7002 }, { "epoch": 0.43775, "grad_norm": 2.09375, "grad_norm_var": 0.020137532552083334, "learning_rate": 0.0001, "loss": 7.0929, "loss/crossentropy": 2.2358744144439697, "loss/hidden": 2.859375, "loss/jsd": 0.0, "loss/logits": 0.196475088596344, "step": 7004 }, { "epoch": 0.437875, "grad_norm": 2.28125, "grad_norm_var": 0.0196685791015625, "learning_rate": 0.0001, "loss": 7.0986, "loss/crossentropy": 2.077066421508789, "loss/hidden": 2.7890625, "loss/jsd": 0.0, "loss/logits": 0.18660877645015717, "step": 7006 }, { "epoch": 0.438, "grad_norm": 2.4375, "grad_norm_var": 0.024275716145833334, "learning_rate": 0.0001, "loss": 7.1155, "loss/crossentropy": 2.2696588039398193, "loss/hidden": 2.8359375, "loss/jsd": 0.0, "loss/logits": 0.2037624940276146, "step": 7008 }, { "epoch": 0.438125, "grad_norm": 2.015625, "grad_norm_var": 0.02268040974934896, "learning_rate": 0.0001, "loss": 7.2975, "loss/crossentropy": 2.3751505613327026, "loss/hidden": 2.828125, "loss/jsd": 0.0, "loss/logits": 0.21261807531118393, "step": 7010 }, { "epoch": 0.43825, "grad_norm": 2.203125, "grad_norm_var": 0.022517649332682292, "learning_rate": 0.0001, "loss": 7.2177, "loss/crossentropy": 2.265856981277466, "loss/hidden": 2.796875, "loss/jsd": 0.0, "loss/logits": 0.1969621181488037, "step": 7012 }, { "epoch": 0.438375, "grad_norm": 2.15625, "grad_norm_var": 0.021996815999348957, "learning_rate": 0.0001, "loss": 7.0688, "loss/crossentropy": 2.275804281234741, "loss/hidden": 2.7890625, "loss/jsd": 0.0, "loss/logits": 0.21681126207113266, "step": 7014 }, { "epoch": 0.4385, "grad_norm": 2.125, "grad_norm_var": 0.018047841389973958, "learning_rate": 0.0001, "loss": 7.1952, "loss/crossentropy": 2.355897307395935, "loss/hidden": 2.8046875, "loss/jsd": 0.0, "loss/logits": 0.20308750867843628, "step": 7016 }, { "epoch": 0.438625, "grad_norm": 2.203125, "grad_norm_var": 0.011701456705729167, "learning_rate": 0.0001, "loss": 7.1911, "loss/crossentropy": 2.5430479049682617, "loss/hidden": 2.796875, "loss/jsd": 0.0, "loss/logits": 0.21198173612356186, "step": 7018 }, { "epoch": 0.43875, "grad_norm": 2.15625, "grad_norm_var": 0.010895792643229167, "learning_rate": 0.0001, "loss": 7.2542, "loss/crossentropy": 2.3191837072372437, "loss/hidden": 2.9296875, "loss/jsd": 0.0, "loss/logits": 0.21172154694795609, "step": 7020 }, { "epoch": 0.438875, "grad_norm": 2.171875, "grad_norm_var": 0.0104400634765625, "learning_rate": 0.0001, "loss": 7.164, "loss/crossentropy": 2.1724308729171753, "loss/hidden": 2.7734375, "loss/jsd": 0.0, "loss/logits": 0.21450695395469666, "step": 7022 }, { "epoch": 0.439, "grad_norm": 2.0625, "grad_norm_var": 0.00592041015625, "learning_rate": 0.0001, "loss": 7.1836, "loss/crossentropy": 2.2622636556625366, "loss/hidden": 2.8515625, "loss/jsd": 0.0, "loss/logits": 0.22127537429332733, "step": 7024 }, { "epoch": 0.439125, "grad_norm": 2.0625, "grad_norm_var": 0.003855133056640625, "learning_rate": 0.0001, "loss": 7.0389, "loss/crossentropy": 2.3678966760635376, "loss/hidden": 2.78125, "loss/jsd": 0.0, "loss/logits": 0.19926925003528595, "step": 7026 }, { "epoch": 0.43925, "grad_norm": 2.328125, "grad_norm_var": 0.006176503499348959, "learning_rate": 0.0001, "loss": 7.2079, "loss/crossentropy": 2.137107729911804, "loss/hidden": 2.78125, "loss/jsd": 0.0, "loss/logits": 0.2178208827972412, "step": 7028 }, { "epoch": 0.439375, "grad_norm": 1.953125, "grad_norm_var": 0.008664703369140625, "learning_rate": 0.0001, "loss": 7.1307, "loss/crossentropy": 2.3075451850891113, "loss/hidden": 2.859375, "loss/jsd": 0.0, "loss/logits": 0.21288493275642395, "step": 7030 }, { "epoch": 0.4395, "grad_norm": 2.296875, "grad_norm_var": 0.010509999593098958, "learning_rate": 0.0001, "loss": 7.1834, "loss/crossentropy": 2.046548902988434, "loss/hidden": 2.796875, "loss/jsd": 0.0, "loss/logits": 0.18741850554943085, "step": 7032 }, { "epoch": 0.439625, "grad_norm": 2.1875, "grad_norm_var": 0.010379791259765625, "learning_rate": 0.0001, "loss": 7.1691, "loss/crossentropy": 2.286561965942383, "loss/hidden": 2.8125, "loss/jsd": 0.0, "loss/logits": 0.20791344344615936, "step": 7034 }, { "epoch": 0.43975, "grad_norm": 2.234375, "grad_norm_var": 0.010599517822265625, "learning_rate": 0.0001, "loss": 7.3184, "loss/crossentropy": 2.287408947944641, "loss/hidden": 2.828125, "loss/jsd": 0.0, "loss/logits": 0.21978344023227692, "step": 7036 }, { "epoch": 0.439875, "grad_norm": 2.25, "grad_norm_var": 0.010916900634765626, "learning_rate": 0.0001, "loss": 7.2935, "loss/crossentropy": 2.6174110174179077, "loss/hidden": 2.8671875, "loss/jsd": 0.0, "loss/logits": 0.20811716467142105, "step": 7038 }, { "epoch": 0.44, "grad_norm": 2.171875, "grad_norm_var": 0.010231272379557291, "learning_rate": 0.0001, "loss": 7.3164, "loss/crossentropy": 2.4823784828186035, "loss/hidden": 2.8515625, "loss/jsd": 0.0, "loss/logits": 0.23784971982240677, "step": 7040 }, { "epoch": 0.440125, "grad_norm": 2.078125, "grad_norm_var": 0.007958984375, "learning_rate": 0.0001, "loss": 7.1676, "loss/crossentropy": 2.3906302452087402, "loss/hidden": 2.765625, "loss/jsd": 0.0, "loss/logits": 0.201579749584198, "step": 7042 }, { "epoch": 0.44025, "grad_norm": 1.984375, "grad_norm_var": 0.01168212890625, "learning_rate": 0.0001, "loss": 6.9151, "loss/crossentropy": 2.106368660926819, "loss/hidden": 2.765625, "loss/jsd": 0.0, "loss/logits": 0.17003612965345383, "step": 7044 }, { "epoch": 0.440375, "grad_norm": 2.578125, "grad_norm_var": 0.0222076416015625, "learning_rate": 0.0001, "loss": 7.172, "loss/crossentropy": 2.2014052867889404, "loss/hidden": 2.8203125, "loss/jsd": 0.0, "loss/logits": 0.21450527757406235, "step": 7046 }, { "epoch": 0.4405, "grad_norm": 2.328125, "grad_norm_var": 0.023249308268229168, "learning_rate": 0.0001, "loss": 7.0817, "loss/crossentropy": 2.2320194244384766, "loss/hidden": 2.8515625, "loss/jsd": 0.0, "loss/logits": 0.2144247591495514, "step": 7048 }, { "epoch": 0.440625, "grad_norm": 2.28125, "grad_norm_var": 0.023615519205729168, "learning_rate": 0.0001, "loss": 7.1826, "loss/crossentropy": 2.2494088411331177, "loss/hidden": 2.7890625, "loss/jsd": 0.0, "loss/logits": 0.1985793113708496, "step": 7050 }, { "epoch": 0.44075, "grad_norm": 2.046875, "grad_norm_var": 0.026253255208333333, "learning_rate": 0.0001, "loss": 7.1503, "loss/crossentropy": 2.260366201400757, "loss/hidden": 2.8125, "loss/jsd": 0.0, "loss/logits": 0.2241573929786682, "step": 7052 }, { "epoch": 0.440875, "grad_norm": 2.09375, "grad_norm_var": 0.026838175455729165, "learning_rate": 0.0001, "loss": 7.0095, "loss/crossentropy": 1.7234525084495544, "loss/hidden": 2.7421875, "loss/jsd": 0.0, "loss/logits": 0.1722937896847725, "step": 7054 }, { "epoch": 0.441, "grad_norm": 2.140625, "grad_norm_var": 0.02974853515625, "learning_rate": 0.0001, "loss": 7.1572, "loss/crossentropy": 2.1869869232177734, "loss/hidden": 2.7890625, "loss/jsd": 0.0, "loss/logits": 0.1890271008014679, "step": 7056 }, { "epoch": 0.441125, "grad_norm": 2.171875, "grad_norm_var": 0.0292633056640625, "learning_rate": 0.0001, "loss": 6.9738, "loss/crossentropy": 2.192438244819641, "loss/hidden": 2.84375, "loss/jsd": 0.0, "loss/logits": 0.21589791029691696, "step": 7058 }, { "epoch": 0.44125, "grad_norm": 2.140625, "grad_norm_var": 0.02275390625, "learning_rate": 0.0001, "loss": 7.2958, "loss/crossentropy": 2.5864726305007935, "loss/hidden": 2.921875, "loss/jsd": 0.0, "loss/logits": 0.23851511627435684, "step": 7060 }, { "epoch": 0.441375, "grad_norm": 2.328125, "grad_norm_var": 0.011546834309895834, "learning_rate": 0.0001, "loss": 7.2065, "loss/crossentropy": 2.293688654899597, "loss/hidden": 2.8125, "loss/jsd": 0.0, "loss/logits": 0.21563249826431274, "step": 7062 }, { "epoch": 0.4415, "grad_norm": 2.359375, "grad_norm_var": 0.011442057291666667, "learning_rate": 0.0001, "loss": 7.1256, "loss/crossentropy": 2.4867671728134155, "loss/hidden": 2.8046875, "loss/jsd": 0.0, "loss/logits": 0.21973519027233124, "step": 7064 }, { "epoch": 0.441625, "grad_norm": 2.03125, "grad_norm_var": 0.010863240559895833, "learning_rate": 0.0001, "loss": 7.2803, "loss/crossentropy": 2.2372056245803833, "loss/hidden": 2.8046875, "loss/jsd": 0.0, "loss/logits": 0.21072745323181152, "step": 7066 }, { "epoch": 0.44175, "grad_norm": 2.109375, "grad_norm_var": 0.011714680989583334, "learning_rate": 0.0001, "loss": 7.0581, "loss/crossentropy": 2.25705885887146, "loss/hidden": 2.8125, "loss/jsd": 0.0, "loss/logits": 0.21308179199695587, "step": 7068 }, { "epoch": 0.441875, "grad_norm": 2.0625, "grad_norm_var": 0.0117584228515625, "learning_rate": 0.0001, "loss": 7.0393, "loss/crossentropy": 2.0396016240119934, "loss/hidden": 2.7890625, "loss/jsd": 0.0, "loss/logits": 0.1958983615040779, "step": 7070 }, { "epoch": 0.442, "grad_norm": 1.984375, "grad_norm_var": 0.012010701497395833, "learning_rate": 0.0001, "loss": 7.1502, "loss/crossentropy": 2.4618523120880127, "loss/hidden": 2.8125, "loss/jsd": 0.0, "loss/logits": 0.21056653559207916, "step": 7072 }, { "epoch": 0.442125, "grad_norm": 2.28125, "grad_norm_var": 0.0133941650390625, "learning_rate": 0.0001, "loss": 7.2114, "loss/crossentropy": 2.572341799736023, "loss/hidden": 2.90625, "loss/jsd": 0.0, "loss/logits": 0.24748806655406952, "step": 7074 }, { "epoch": 0.44225, "grad_norm": 2.0625, "grad_norm_var": 0.0137603759765625, "learning_rate": 0.0001, "loss": 7.0992, "loss/crossentropy": 2.293982744216919, "loss/hidden": 2.7734375, "loss/jsd": 0.0, "loss/logits": 0.20247779786586761, "step": 7076 }, { "epoch": 0.442375, "grad_norm": 2.25, "grad_norm_var": 0.012809244791666667, "learning_rate": 0.0001, "loss": 7.0991, "loss/crossentropy": 2.3547102212905884, "loss/hidden": 2.8203125, "loss/jsd": 0.0, "loss/logits": 0.21820224821567535, "step": 7078 }, { "epoch": 0.4425, "grad_norm": 2.09375, "grad_norm_var": 0.009837849934895834, "learning_rate": 0.0001, "loss": 7.0733, "loss/crossentropy": 2.233034372329712, "loss/hidden": 2.8046875, "loss/jsd": 0.0, "loss/logits": 0.20219457149505615, "step": 7080 }, { "epoch": 0.442625, "grad_norm": 2.203125, "grad_norm_var": 0.009273274739583334, "learning_rate": 0.0001, "loss": 7.1626, "loss/crossentropy": 2.2339953184127808, "loss/hidden": 2.8125, "loss/jsd": 0.0, "loss/logits": 0.19660235941410065, "step": 7082 }, { "epoch": 0.44275, "grad_norm": 2.125, "grad_norm_var": 0.008128865559895834, "learning_rate": 0.0001, "loss": 7.0955, "loss/crossentropy": 2.2933523654937744, "loss/hidden": 2.7890625, "loss/jsd": 0.0, "loss/logits": 0.20504215359687805, "step": 7084 }, { "epoch": 0.442875, "grad_norm": 2.09375, "grad_norm_var": 0.008138020833333334, "learning_rate": 0.0001, "loss": 7.1335, "loss/crossentropy": 2.067702531814575, "loss/hidden": 2.9453125, "loss/jsd": 0.0, "loss/logits": 0.20143579691648483, "step": 7086 }, { "epoch": 0.443, "grad_norm": 1.96875, "grad_norm_var": 0.009570058186848958, "learning_rate": 0.0001, "loss": 7.1793, "loss/crossentropy": 2.0924493074417114, "loss/hidden": 2.7265625, "loss/jsd": 0.0, "loss/logits": 0.19646278768777847, "step": 7088 }, { "epoch": 0.443125, "grad_norm": 2.3125, "grad_norm_var": 0.010235341389973958, "learning_rate": 0.0001, "loss": 7.0845, "loss/crossentropy": 2.3074241876602173, "loss/hidden": 2.9453125, "loss/jsd": 0.0, "loss/logits": 0.20258978009223938, "step": 7090 }, { "epoch": 0.44325, "grad_norm": 1.9921875, "grad_norm_var": 0.013842519124348958, "learning_rate": 0.0001, "loss": 7.065, "loss/crossentropy": 2.421837568283081, "loss/hidden": 2.78125, "loss/jsd": 0.0, "loss/logits": 0.21192050725221634, "step": 7092 }, { "epoch": 0.443375, "grad_norm": 2.40625, "grad_norm_var": 0.017911529541015624, "learning_rate": 0.0001, "loss": 7.2564, "loss/crossentropy": 2.22128963470459, "loss/hidden": 2.78125, "loss/jsd": 0.0, "loss/logits": 0.21447572112083435, "step": 7094 }, { "epoch": 0.4435, "grad_norm": 2.015625, "grad_norm_var": 0.019449869791666668, "learning_rate": 0.0001, "loss": 7.0811, "loss/crossentropy": 1.8576909899711609, "loss/hidden": 2.8515625, "loss/jsd": 0.0, "loss/logits": 0.19812503457069397, "step": 7096 }, { "epoch": 0.443625, "grad_norm": 2.109375, "grad_norm_var": 0.01923828125, "learning_rate": 0.0001, "loss": 7.1299, "loss/crossentropy": 2.3352116346359253, "loss/hidden": 2.8125, "loss/jsd": 0.0, "loss/logits": 0.22965504229068756, "step": 7098 }, { "epoch": 0.44375, "grad_norm": 2.21875, "grad_norm_var": 0.021922810872395834, "learning_rate": 0.0001, "loss": 7.1568, "loss/crossentropy": 2.288171410560608, "loss/hidden": 2.859375, "loss/jsd": 0.0, "loss/logits": 0.22362970560789108, "step": 7100 }, { "epoch": 0.443875, "grad_norm": 2.0625, "grad_norm_var": 0.022798665364583335, "learning_rate": 0.0001, "loss": 7.2212, "loss/crossentropy": 2.535971522331238, "loss/hidden": 2.8828125, "loss/jsd": 0.0, "loss/logits": 0.21269136667251587, "step": 7102 }, { "epoch": 0.444, "grad_norm": 2.078125, "grad_norm_var": 0.01968968709309896, "learning_rate": 0.0001, "loss": 7.1576, "loss/crossentropy": 2.1160417795181274, "loss/hidden": 2.78125, "loss/jsd": 0.0, "loss/logits": 0.20902036875486374, "step": 7104 }, { "epoch": 0.444125, "grad_norm": 2.078125, "grad_norm_var": 0.01985448201497396, "learning_rate": 0.0001, "loss": 7.1795, "loss/crossentropy": 2.206043004989624, "loss/hidden": 2.8828125, "loss/jsd": 0.0, "loss/logits": 0.20614183694124222, "step": 7106 }, { "epoch": 0.44425, "grad_norm": 2.109375, "grad_norm_var": 0.015641021728515624, "learning_rate": 0.0001, "loss": 7.0607, "loss/crossentropy": 2.3254839181900024, "loss/hidden": 2.7890625, "loss/jsd": 0.0, "loss/logits": 0.21963687986135483, "step": 7108 }, { "epoch": 0.444375, "grad_norm": 2.15625, "grad_norm_var": 0.010493723551432292, "learning_rate": 0.0001, "loss": 7.122, "loss/crossentropy": 2.003136456012726, "loss/hidden": 2.7890625, "loss/jsd": 0.0, "loss/logits": 0.20006847381591797, "step": 7110 }, { "epoch": 0.4445, "grad_norm": 2.15625, "grad_norm_var": 0.0087310791015625, "learning_rate": 0.0001, "loss": 7.242, "loss/crossentropy": 1.9243924021720886, "loss/hidden": 2.765625, "loss/jsd": 0.0, "loss/logits": 0.20003072917461395, "step": 7112 }, { "epoch": 0.444625, "grad_norm": 2.296875, "grad_norm_var": 0.010383097330729167, "learning_rate": 0.0001, "loss": 7.0951, "loss/crossentropy": 2.0053927302360535, "loss/hidden": 2.8203125, "loss/jsd": 0.0, "loss/logits": 0.20178984850645065, "step": 7114 }, { "epoch": 0.44475, "grad_norm": 2.203125, "grad_norm_var": 0.0123931884765625, "learning_rate": 0.0001, "loss": 7.2092, "loss/crossentropy": 2.1276594400405884, "loss/hidden": 2.8125, "loss/jsd": 0.0, "loss/logits": 0.19504177570343018, "step": 7116 }, { "epoch": 0.444875, "grad_norm": 1.9453125, "grad_norm_var": 0.013303375244140625, "learning_rate": 0.0001, "loss": 7.1927, "loss/crossentropy": 2.3429445028305054, "loss/hidden": 2.8046875, "loss/jsd": 0.0, "loss/logits": 0.22777916491031647, "step": 7118 }, { "epoch": 0.445, "grad_norm": 2.546875, "grad_norm_var": 0.022885894775390624, "learning_rate": 0.0001, "loss": 7.0434, "loss/crossentropy": 1.9249637126922607, "loss/hidden": 2.8203125, "loss/jsd": 0.0, "loss/logits": 0.196383997797966, "step": 7120 }, { "epoch": 0.445125, "grad_norm": 2.15625, "grad_norm_var": 0.025658925374348957, "learning_rate": 0.0001, "loss": 7.0644, "loss/crossentropy": 2.101871132850647, "loss/hidden": 2.765625, "loss/jsd": 0.0, "loss/logits": 0.19870474934577942, "step": 7122 }, { "epoch": 0.44525, "grad_norm": 2.265625, "grad_norm_var": 0.02504450480143229, "learning_rate": 0.0001, "loss": 7.1175, "loss/crossentropy": 2.2008549571037292, "loss/hidden": 2.8671875, "loss/jsd": 0.0, "loss/logits": 0.19907881319522858, "step": 7124 }, { "epoch": 0.445375, "grad_norm": 2.0625, "grad_norm_var": 0.026838938395182293, "learning_rate": 0.0001, "loss": 7.2085, "loss/crossentropy": 1.9554465413093567, "loss/hidden": 2.859375, "loss/jsd": 0.0, "loss/logits": 0.20027147978544235, "step": 7126 }, { "epoch": 0.4455, "grad_norm": 2.203125, "grad_norm_var": 0.026747385660807293, "learning_rate": 0.0001, "loss": 7.3277, "loss/crossentropy": 2.2794995307922363, "loss/hidden": 2.828125, "loss/jsd": 0.0, "loss/logits": 0.21968071907758713, "step": 7128 }, { "epoch": 0.445625, "grad_norm": 2.171875, "grad_norm_var": 0.02599054972330729, "learning_rate": 0.0001, "loss": 7.1629, "loss/crossentropy": 2.1739684343338013, "loss/hidden": 2.7265625, "loss/jsd": 0.0, "loss/logits": 0.20093033462762833, "step": 7130 }, { "epoch": 0.44575, "grad_norm": 2.109375, "grad_norm_var": 0.02295710245768229, "learning_rate": 0.0001, "loss": 7.137, "loss/crossentropy": 2.3477087020874023, "loss/hidden": 2.828125, "loss/jsd": 0.0, "loss/logits": 0.2029757872223854, "step": 7132 }, { "epoch": 0.445875, "grad_norm": 2.25, "grad_norm_var": 0.019429524739583332, "learning_rate": 0.0001, "loss": 7.2484, "loss/crossentropy": 2.5823616981506348, "loss/hidden": 2.8359375, "loss/jsd": 0.0, "loss/logits": 0.22345758229494095, "step": 7134 }, { "epoch": 0.446, "grad_norm": 2.125, "grad_norm_var": 0.009859212239583333, "learning_rate": 0.0001, "loss": 7.0862, "loss/crossentropy": 2.093753218650818, "loss/hidden": 2.796875, "loss/jsd": 0.0, "loss/logits": 0.20466412603855133, "step": 7136 }, { "epoch": 0.446125, "grad_norm": 2.125, "grad_norm_var": 0.004231770833333333, "learning_rate": 0.0001, "loss": 7.0262, "loss/crossentropy": 2.1759002804756165, "loss/hidden": 2.8203125, "loss/jsd": 0.0, "loss/logits": 0.22160179913043976, "step": 7138 }, { "epoch": 0.44625, "grad_norm": 2.125, "grad_norm_var": 0.007342274983723958, "learning_rate": 0.0001, "loss": 7.0974, "loss/crossentropy": 2.324357032775879, "loss/hidden": 2.859375, "loss/jsd": 0.0, "loss/logits": 0.2030269131064415, "step": 7140 }, { "epoch": 0.446375, "grad_norm": 2.25, "grad_norm_var": 0.008015696207682292, "learning_rate": 0.0001, "loss": 7.2816, "loss/crossentropy": 2.4500794410705566, "loss/hidden": 2.9609375, "loss/jsd": 0.0, "loss/logits": 0.260087326169014, "step": 7142 }, { "epoch": 0.4465, "grad_norm": 2.046875, "grad_norm_var": 0.008392079671223959, "learning_rate": 0.0001, "loss": 7.2604, "loss/crossentropy": 2.3314210176467896, "loss/hidden": 2.7890625, "loss/jsd": 0.0, "loss/logits": 0.1982848197221756, "step": 7144 }, { "epoch": 0.446625, "grad_norm": 2.109375, "grad_norm_var": 0.008001454671223958, "learning_rate": 0.0001, "loss": 7.2175, "loss/crossentropy": 2.3275386095046997, "loss/hidden": 2.7890625, "loss/jsd": 0.0, "loss/logits": 0.20359043031930923, "step": 7146 }, { "epoch": 0.44675, "grad_norm": 2.375, "grad_norm_var": 0.011230214436848959, "learning_rate": 0.0001, "loss": 7.2505, "loss/crossentropy": 2.1986998319625854, "loss/hidden": 2.8359375, "loss/jsd": 0.0, "loss/logits": 0.2198983132839203, "step": 7148 }, { "epoch": 0.446875, "grad_norm": 2.109375, "grad_norm_var": 0.012115224202473959, "learning_rate": 0.0001, "loss": 7.2739, "loss/crossentropy": 1.9064122438430786, "loss/hidden": 2.8203125, "loss/jsd": 0.0, "loss/logits": 0.1959618628025055, "step": 7150 }, { "epoch": 0.447, "grad_norm": 2.109375, "grad_norm_var": 0.013669586181640625, "learning_rate": 0.0001, "loss": 7.1101, "loss/crossentropy": 2.3088358640670776, "loss/hidden": 2.8671875, "loss/jsd": 0.0, "loss/logits": 0.20778021216392517, "step": 7152 }, { "epoch": 0.447125, "grad_norm": 2.171875, "grad_norm_var": 0.014249420166015625, "learning_rate": 0.0001, "loss": 7.2034, "loss/crossentropy": 2.2749520540237427, "loss/hidden": 2.8125, "loss/jsd": 0.0, "loss/logits": 0.19885848462581635, "step": 7154 }, { "epoch": 0.44725, "grad_norm": 1.984375, "grad_norm_var": 0.011356608072916666, "learning_rate": 0.0001, "loss": 6.9199, "loss/crossentropy": 2.0392338633537292, "loss/hidden": 2.7578125, "loss/jsd": 0.0, "loss/logits": 0.18347318470478058, "step": 7156 }, { "epoch": 0.447375, "grad_norm": 2.203125, "grad_norm_var": 0.012626139322916667, "learning_rate": 0.0001, "loss": 7.198, "loss/crossentropy": 2.3574529886245728, "loss/hidden": 2.84375, "loss/jsd": 0.0, "loss/logits": 0.21006865054368973, "step": 7158 }, { "epoch": 0.4475, "grad_norm": 2.375, "grad_norm_var": 0.021922810872395834, "learning_rate": 0.0001, "loss": 7.1556, "loss/crossentropy": 2.3607640266418457, "loss/hidden": 2.8203125, "loss/jsd": 0.0, "loss/logits": 0.19356133043766022, "step": 7160 }, { "epoch": 0.447625, "grad_norm": 2.09375, "grad_norm_var": 0.023030598958333332, "learning_rate": 0.0001, "loss": 7.0782, "loss/crossentropy": 2.203671097755432, "loss/hidden": 2.828125, "loss/jsd": 0.0, "loss/logits": 0.22066359221935272, "step": 7162 }, { "epoch": 0.44775, "grad_norm": 2.09375, "grad_norm_var": 0.022261555989583334, "learning_rate": 0.0001, "loss": 7.0562, "loss/crossentropy": 2.1417211294174194, "loss/hidden": 2.7890625, "loss/jsd": 0.0, "loss/logits": 0.1960388571023941, "step": 7164 }, { "epoch": 0.447875, "grad_norm": 2.09375, "grad_norm_var": 0.021630859375, "learning_rate": 0.0001, "loss": 7.113, "loss/crossentropy": 2.0206031799316406, "loss/hidden": 2.8125, "loss/jsd": 0.0, "loss/logits": 0.18700659275054932, "step": 7166 }, { "epoch": 0.448, "grad_norm": 2.109375, "grad_norm_var": 0.02095947265625, "learning_rate": 0.0001, "loss": 7.094, "loss/crossentropy": 2.1299338340759277, "loss/hidden": 2.8203125, "loss/jsd": 0.0, "loss/logits": 0.1982402428984642, "step": 7168 }, { "epoch": 0.448125, "grad_norm": 2.203125, "grad_norm_var": 0.028413899739583335, "learning_rate": 0.0001, "loss": 7.152, "loss/crossentropy": 2.33224093914032, "loss/hidden": 2.8203125, "loss/jsd": 0.0, "loss/logits": 0.21684490144252777, "step": 7170 }, { "epoch": 0.44825, "grad_norm": 2.296875, "grad_norm_var": 0.025016276041666667, "learning_rate": 0.0001, "loss": 7.2652, "loss/crossentropy": 2.3137996196746826, "loss/hidden": 2.8515625, "loss/jsd": 0.0, "loss/logits": 0.2174111008644104, "step": 7172 }, { "epoch": 0.448375, "grad_norm": 2.1875, "grad_norm_var": 0.024421183268229167, "learning_rate": 0.0001, "loss": 7.1668, "loss/crossentropy": 1.9115217328071594, "loss/hidden": 2.8125, "loss/jsd": 0.0, "loss/logits": 0.21205194294452667, "step": 7174 }, { "epoch": 0.4485, "grad_norm": 2.0, "grad_norm_var": 0.018000284830729168, "learning_rate": 0.0001, "loss": 7.1978, "loss/crossentropy": 2.4868533611297607, "loss/hidden": 2.890625, "loss/jsd": 0.0, "loss/logits": 0.22938791662454605, "step": 7176 }, { "epoch": 0.448625, "grad_norm": 2.265625, "grad_norm_var": 0.018260701497395834, "learning_rate": 0.0001, "loss": 7.193, "loss/crossentropy": 2.2773613929748535, "loss/hidden": 2.828125, "loss/jsd": 0.0, "loss/logits": 0.20718929171562195, "step": 7178 }, { "epoch": 0.44875, "grad_norm": 2.078125, "grad_norm_var": 0.0171295166015625, "learning_rate": 0.0001, "loss": 7.1092, "loss/crossentropy": 2.048713266849518, "loss/hidden": 2.734375, "loss/jsd": 0.0, "loss/logits": 0.19681257009506226, "step": 7180 }, { "epoch": 0.448875, "grad_norm": 2.3125, "grad_norm_var": 0.015397135416666667, "learning_rate": 0.0001, "loss": 7.0797, "loss/crossentropy": 1.8432785868644714, "loss/hidden": 2.796875, "loss/jsd": 0.0, "loss/logits": 0.20921127498149872, "step": 7182 }, { "epoch": 0.449, "grad_norm": 2.140625, "grad_norm_var": 0.015387980143229167, "learning_rate": 0.0001, "loss": 7.1391, "loss/crossentropy": 2.129906117916107, "loss/hidden": 2.8828125, "loss/jsd": 0.0, "loss/logits": 0.2244158536195755, "step": 7184 }, { "epoch": 0.449125, "grad_norm": 2.21875, "grad_norm_var": 0.011551920572916667, "learning_rate": 0.0001, "loss": 7.1485, "loss/crossentropy": 2.1112728118896484, "loss/hidden": 2.8125, "loss/jsd": 0.0, "loss/logits": 0.20086340606212616, "step": 7186 }, { "epoch": 0.44925, "grad_norm": 2.0625, "grad_norm_var": 0.011714680989583334, "learning_rate": 0.0001, "loss": 7.2314, "loss/crossentropy": 2.256954550743103, "loss/hidden": 2.859375, "loss/jsd": 0.0, "loss/logits": 0.20468229055404663, "step": 7188 }, { "epoch": 0.449375, "grad_norm": 2.125, "grad_norm_var": 0.014453125, "learning_rate": 0.0001, "loss": 7.1437, "loss/crossentropy": 2.071455657482147, "loss/hidden": 2.734375, "loss/jsd": 0.0, "loss/logits": 0.20885341614484787, "step": 7190 }, { "epoch": 0.4495, "grad_norm": 2.046875, "grad_norm_var": 0.013114420572916667, "learning_rate": 0.0001, "loss": 7.0949, "loss/crossentropy": 2.562646746635437, "loss/hidden": 2.8125, "loss/jsd": 0.0, "loss/logits": 0.21791629493236542, "step": 7192 }, { "epoch": 0.449625, "grad_norm": 2.140625, "grad_norm_var": 0.009761555989583334, "learning_rate": 0.0001, "loss": 7.1798, "loss/crossentropy": 2.3666142225265503, "loss/hidden": 2.8359375, "loss/jsd": 0.0, "loss/logits": 0.2231055274605751, "step": 7194 }, { "epoch": 0.44975, "grad_norm": 2.109375, "grad_norm_var": 0.009300740559895833, "learning_rate": 0.0001, "loss": 7.2875, "loss/crossentropy": 2.2392258644104004, "loss/hidden": 2.8125, "loss/jsd": 0.0, "loss/logits": 0.21026718616485596, "step": 7196 }, { "epoch": 0.449875, "grad_norm": 2.015625, "grad_norm_var": 0.0059234619140625, "learning_rate": 0.0001, "loss": 7.0306, "loss/crossentropy": 2.531536340713501, "loss/hidden": 2.828125, "loss/jsd": 0.0, "loss/logits": 0.20711271464824677, "step": 7198 }, { "epoch": 0.45, "grad_norm": 2.453125, "grad_norm_var": 0.045699055989583334, "learning_rate": 0.0001, "loss": 7.1813, "loss/crossentropy": 2.32056200504303, "loss/hidden": 2.8671875, "loss/jsd": 0.0, "loss/logits": 0.21862630546092987, "step": 7200 }, { "epoch": 0.450125, "grad_norm": 2.296875, "grad_norm_var": 0.04778620402018229, "learning_rate": 0.0001, "loss": 7.3459, "loss/crossentropy": 2.6083868741989136, "loss/hidden": 2.84375, "loss/jsd": 0.0, "loss/logits": 0.2005249261856079, "step": 7202 }, { "epoch": 0.45025, "grad_norm": 2.171875, "grad_norm_var": 0.046781158447265624, "learning_rate": 0.0001, "loss": 6.9953, "loss/crossentropy": 2.134474039077759, "loss/hidden": 2.78125, "loss/jsd": 0.0, "loss/logits": 0.18911154568195343, "step": 7204 }, { "epoch": 0.450375, "grad_norm": 2.015625, "grad_norm_var": 0.046537017822265624, "learning_rate": 0.0001, "loss": 7.2096, "loss/crossentropy": 2.2817357778549194, "loss/hidden": 2.828125, "loss/jsd": 0.0, "loss/logits": 0.20316140353679657, "step": 7206 }, { "epoch": 0.4505, "grad_norm": 2.03125, "grad_norm_var": 0.049806467692057294, "learning_rate": 0.0001, "loss": 7.0024, "loss/crossentropy": 2.288169503211975, "loss/hidden": 2.8125, "loss/jsd": 0.0, "loss/logits": 0.20653357356786728, "step": 7208 }, { "epoch": 0.450625, "grad_norm": 2.25, "grad_norm_var": 0.050172678629557294, "learning_rate": 0.0001, "loss": 7.2516, "loss/crossentropy": 2.5531680583953857, "loss/hidden": 2.8515625, "loss/jsd": 0.0, "loss/logits": 0.21465806663036346, "step": 7210 }, { "epoch": 0.45075, "grad_norm": 2.1875, "grad_norm_var": 0.04960912068684896, "learning_rate": 0.0001, "loss": 7.3268, "loss/crossentropy": 2.2371240854263306, "loss/hidden": 2.84375, "loss/jsd": 0.0, "loss/logits": 0.21228843927383423, "step": 7212 }, { "epoch": 0.450875, "grad_norm": 2.21875, "grad_norm_var": 0.04690119425455729, "learning_rate": 0.0001, "loss": 7.2514, "loss/crossentropy": 2.17119038105011, "loss/hidden": 2.828125, "loss/jsd": 0.0, "loss/logits": 0.19861158728599548, "step": 7214 }, { "epoch": 0.451, "grad_norm": 1.9609375, "grad_norm_var": 0.0108551025390625, "learning_rate": 0.0001, "loss": 6.9546, "loss/crossentropy": 2.042115569114685, "loss/hidden": 2.75, "loss/jsd": 0.0, "loss/logits": 0.19691552966833115, "step": 7216 }, { "epoch": 0.451125, "grad_norm": 2.3125, "grad_norm_var": 0.009997304280598958, "learning_rate": 0.0001, "loss": 7.0743, "loss/crossentropy": 2.0181705951690674, "loss/hidden": 2.71875, "loss/jsd": 0.0, "loss/logits": 0.20092586427927017, "step": 7218 }, { "epoch": 0.45125, "grad_norm": 1.9453125, "grad_norm_var": 0.011661783854166666, "learning_rate": 0.0001, "loss": 7.0426, "loss/crossentropy": 2.313483238220215, "loss/hidden": 2.84375, "loss/jsd": 0.0, "loss/logits": 0.2026253342628479, "step": 7220 }, { "epoch": 0.451375, "grad_norm": 2.3125, "grad_norm_var": 0.014689127604166666, "learning_rate": 0.0001, "loss": 7.0506, "loss/crossentropy": 2.278806447982788, "loss/hidden": 2.8515625, "loss/jsd": 0.0, "loss/logits": 0.2161618396639824, "step": 7222 }, { "epoch": 0.4515, "grad_norm": 2.140625, "grad_norm_var": 0.011592610677083334, "learning_rate": 0.0001, "loss": 7.3406, "loss/crossentropy": 2.26158607006073, "loss/hidden": 2.8203125, "loss/jsd": 0.0, "loss/logits": 0.20861487835645676, "step": 7224 }, { "epoch": 0.451625, "grad_norm": 2.203125, "grad_norm_var": 0.011201985677083333, "learning_rate": 0.0001, "loss": 7.1772, "loss/crossentropy": 2.0447089076042175, "loss/hidden": 2.859375, "loss/jsd": 0.0, "loss/logits": 0.2081030160188675, "step": 7226 }, { "epoch": 0.45175, "grad_norm": 2.109375, "grad_norm_var": 0.012308756510416666, "learning_rate": 0.0001, "loss": 7.0837, "loss/crossentropy": 2.4593422412872314, "loss/hidden": 2.8984375, "loss/jsd": 0.0, "loss/logits": 0.22033347189426422, "step": 7228 }, { "epoch": 0.451875, "grad_norm": 2.1875, "grad_norm_var": 0.012352498372395833, "learning_rate": 0.0001, "loss": 7.0803, "loss/crossentropy": 2.1379048824310303, "loss/hidden": 2.796875, "loss/jsd": 0.0, "loss/logits": 0.212110698223114, "step": 7230 }, { "epoch": 0.452, "grad_norm": 2.078125, "grad_norm_var": 0.009631093343098958, "learning_rate": 0.0001, "loss": 7.0067, "loss/crossentropy": 2.147655963897705, "loss/hidden": 2.84375, "loss/jsd": 0.0, "loss/logits": 0.21026761829853058, "step": 7232 }, { "epoch": 0.452125, "grad_norm": 2.09375, "grad_norm_var": 0.008278147379557291, "learning_rate": 0.0001, "loss": 7.2995, "loss/crossentropy": 2.4965137243270874, "loss/hidden": 2.875, "loss/jsd": 0.0, "loss/logits": 0.2102838009595871, "step": 7234 }, { "epoch": 0.45225, "grad_norm": 2.34375, "grad_norm_var": 0.007420857747395833, "learning_rate": 0.0001, "loss": 7.2283, "loss/crossentropy": 2.5284390449523926, "loss/hidden": 2.8359375, "loss/jsd": 0.0, "loss/logits": 0.2260439619421959, "step": 7236 }, { "epoch": 0.452375, "grad_norm": 2.046875, "grad_norm_var": 0.006136067708333333, "learning_rate": 0.0001, "loss": 7.1149, "loss/crossentropy": 2.124226689338684, "loss/hidden": 2.796875, "loss/jsd": 0.0, "loss/logits": 0.1935354694724083, "step": 7238 }, { "epoch": 0.4525, "grad_norm": 2.046875, "grad_norm_var": 0.008177693684895833, "learning_rate": 0.0001, "loss": 7.1761, "loss/crossentropy": 2.31568443775177, "loss/hidden": 2.8203125, "loss/jsd": 0.0, "loss/logits": 0.19555098563432693, "step": 7240 }, { "epoch": 0.452625, "grad_norm": 2.125, "grad_norm_var": 0.00797119140625, "learning_rate": 0.0001, "loss": 7.0197, "loss/crossentropy": 2.2378604412078857, "loss/hidden": 2.84375, "loss/jsd": 0.0, "loss/logits": 0.2003757283091545, "step": 7242 }, { "epoch": 0.45275, "grad_norm": 1.9453125, "grad_norm_var": 0.008304595947265625, "learning_rate": 0.0001, "loss": 7.032, "loss/crossentropy": 2.0563949942588806, "loss/hidden": 2.8203125, "loss/jsd": 0.0, "loss/logits": 0.1994936764240265, "step": 7244 }, { "epoch": 0.452875, "grad_norm": 2.1875, "grad_norm_var": 0.009273020426432292, "learning_rate": 0.0001, "loss": 7.0266, "loss/crossentropy": 2.1362143754959106, "loss/hidden": 2.8203125, "loss/jsd": 0.0, "loss/logits": 0.19745449721813202, "step": 7246 }, { "epoch": 0.453, "grad_norm": 2.203125, "grad_norm_var": 0.009978993733723959, "learning_rate": 0.0001, "loss": 7.054, "loss/crossentropy": 2.285860776901245, "loss/hidden": 2.8671875, "loss/jsd": 0.0, "loss/logits": 0.21970739215612411, "step": 7248 }, { "epoch": 0.453125, "grad_norm": 2.015625, "grad_norm_var": 0.010746002197265625, "learning_rate": 0.0001, "loss": 7.1889, "loss/crossentropy": 2.350265145301819, "loss/hidden": 2.828125, "loss/jsd": 0.0, "loss/logits": 0.2033230885863304, "step": 7250 }, { "epoch": 0.45325, "grad_norm": 2.234375, "grad_norm_var": 0.006768544514973958, "learning_rate": 0.0001, "loss": 7.212, "loss/crossentropy": 2.2641106843948364, "loss/hidden": 2.859375, "loss/jsd": 0.0, "loss/logits": 0.20904739946126938, "step": 7252 }, { "epoch": 0.453375, "grad_norm": 1.9765625, "grad_norm_var": 0.0104248046875, "learning_rate": 0.0001, "loss": 7.2615, "loss/crossentropy": 2.3802077770233154, "loss/hidden": 2.8203125, "loss/jsd": 0.0, "loss/logits": 0.21182265132665634, "step": 7254 }, { "epoch": 0.4535, "grad_norm": 2.34375, "grad_norm_var": 0.013841756184895833, "learning_rate": 0.0001, "loss": 7.0786, "loss/crossentropy": 2.2918620109558105, "loss/hidden": 2.828125, "loss/jsd": 0.0, "loss/logits": 0.20877155661582947, "step": 7256 }, { "epoch": 0.453625, "grad_norm": 2.140625, "grad_norm_var": 0.0139312744140625, "learning_rate": 0.0001, "loss": 7.0746, "loss/crossentropy": 2.120309591293335, "loss/hidden": 2.7734375, "loss/jsd": 0.0, "loss/logits": 0.19328848272562027, "step": 7258 }, { "epoch": 0.45375, "grad_norm": 2.125, "grad_norm_var": 0.011901601155598959, "learning_rate": 0.0001, "loss": 7.2181, "loss/crossentropy": 2.221208691596985, "loss/hidden": 2.8515625, "loss/jsd": 0.0, "loss/logits": 0.21751584112644196, "step": 7260 }, { "epoch": 0.453875, "grad_norm": 2.421875, "grad_norm_var": 0.023978424072265626, "learning_rate": 0.0001, "loss": 7.1222, "loss/crossentropy": 2.4885802268981934, "loss/hidden": 2.84375, "loss/jsd": 0.0, "loss/logits": 0.2257223054766655, "step": 7262 }, { "epoch": 0.454, "grad_norm": 2.375, "grad_norm_var": 0.02561213175455729, "learning_rate": 0.0001, "loss": 7.1315, "loss/crossentropy": 2.1699594259262085, "loss/hidden": 2.75, "loss/jsd": 0.0, "loss/logits": 0.21807490289211273, "step": 7264 }, { "epoch": 0.454125, "grad_norm": 2.21875, "grad_norm_var": 0.022607167561848957, "learning_rate": 0.0001, "loss": 7.1284, "loss/crossentropy": 1.8780632615089417, "loss/hidden": 2.71875, "loss/jsd": 0.0, "loss/logits": 0.1910051330924034, "step": 7266 }, { "epoch": 0.45425, "grad_norm": 2.125, "grad_norm_var": 0.02069066365559896, "learning_rate": 0.0001, "loss": 7.1184, "loss/crossentropy": 2.246003210544586, "loss/hidden": 2.8203125, "loss/jsd": 0.0, "loss/logits": 0.22277537733316422, "step": 7268 }, { "epoch": 0.454375, "grad_norm": 2.375, "grad_norm_var": 0.017096964518229167, "learning_rate": 0.0001, "loss": 7.1533, "loss/crossentropy": 2.1062055826187134, "loss/hidden": 2.7890625, "loss/jsd": 0.0, "loss/logits": 0.22463850677013397, "step": 7270 }, { "epoch": 0.4545, "grad_norm": 2.25, "grad_norm_var": 0.015620930989583334, "learning_rate": 0.0001, "loss": 7.1315, "loss/crossentropy": 2.35440993309021, "loss/hidden": 2.8125, "loss/jsd": 0.0, "loss/logits": 0.2115459069609642, "step": 7272 }, { "epoch": 0.454625, "grad_norm": 2.03125, "grad_norm_var": 0.01744384765625, "learning_rate": 0.0001, "loss": 7.0647, "loss/crossentropy": 2.358685255050659, "loss/hidden": 2.7734375, "loss/jsd": 0.0, "loss/logits": 0.20285096019506454, "step": 7274 }, { "epoch": 0.45475, "grad_norm": 2.015625, "grad_norm_var": 0.020406087239583332, "learning_rate": 0.0001, "loss": 7.0577, "loss/crossentropy": 2.1627367734909058, "loss/hidden": 2.7890625, "loss/jsd": 0.0, "loss/logits": 0.19727136939764023, "step": 7276 }, { "epoch": 0.454875, "grad_norm": 2.0625, "grad_norm_var": 0.015458170572916667, "learning_rate": 0.0001, "loss": 7.0459, "loss/crossentropy": 2.0946096181869507, "loss/hidden": 2.7578125, "loss/jsd": 0.0, "loss/logits": 0.19678431004285812, "step": 7278 }, { "epoch": 0.455, "grad_norm": 2.1875, "grad_norm_var": 0.0161773681640625, "learning_rate": 0.0001, "loss": 7.2589, "loss/crossentropy": 2.244626998901367, "loss/hidden": 2.9453125, "loss/jsd": 0.0, "loss/logits": 0.22719382494688034, "step": 7280 }, { "epoch": 0.455125, "grad_norm": 2.296875, "grad_norm_var": 0.014632161458333333, "learning_rate": 0.0001, "loss": 7.3766, "loss/crossentropy": 2.2402459383010864, "loss/hidden": 2.859375, "loss/jsd": 0.0, "loss/logits": 0.20855902880430222, "step": 7282 }, { "epoch": 0.45525, "grad_norm": 2.1875, "grad_norm_var": 0.016844685872395834, "learning_rate": 0.0001, "loss": 7.1058, "loss/crossentropy": 2.4255205392837524, "loss/hidden": 2.84375, "loss/jsd": 0.0, "loss/logits": 0.1950306072831154, "step": 7284 }, { "epoch": 0.455375, "grad_norm": 2.109375, "grad_norm_var": 0.014286295572916666, "learning_rate": 0.0001, "loss": 7.1902, "loss/crossentropy": 2.4710735082626343, "loss/hidden": 2.828125, "loss/jsd": 0.0, "loss/logits": 0.21613246202468872, "step": 7286 }, { "epoch": 0.4555, "grad_norm": 2.203125, "grad_norm_var": 0.01373291015625, "learning_rate": 0.0001, "loss": 7.0767, "loss/crossentropy": 2.0865633487701416, "loss/hidden": 2.7890625, "loss/jsd": 0.0, "loss/logits": 0.19315458834171295, "step": 7288 }, { "epoch": 0.455625, "grad_norm": 2.046875, "grad_norm_var": 0.014232381184895834, "learning_rate": 0.0001, "loss": 7.1555, "loss/crossentropy": 2.431704044342041, "loss/hidden": 2.96875, "loss/jsd": 0.0, "loss/logits": 0.22103539109230042, "step": 7290 }, { "epoch": 0.45575, "grad_norm": 2.109375, "grad_norm_var": 0.013212076822916667, "learning_rate": 0.0001, "loss": 7.1897, "loss/crossentropy": 2.2944241762161255, "loss/hidden": 2.78125, "loss/jsd": 0.0, "loss/logits": 0.20230845361948013, "step": 7292 }, { "epoch": 0.455875, "grad_norm": 2.125, "grad_norm_var": 0.012744140625, "learning_rate": 0.0001, "loss": 7.2919, "loss/crossentropy": 2.1923948526382446, "loss/hidden": 2.8671875, "loss/jsd": 0.0, "loss/logits": 0.2351241260766983, "step": 7294 }, { "epoch": 0.456, "grad_norm": 2.21875, "grad_norm_var": 0.013498687744140625, "learning_rate": 0.0001, "loss": 7.0956, "loss/crossentropy": 1.9904609322547913, "loss/hidden": 2.75, "loss/jsd": 0.0, "loss/logits": 0.19044706225395203, "step": 7296 }, { "epoch": 0.456125, "grad_norm": 2.125, "grad_norm_var": 0.016371409098307293, "learning_rate": 0.0001, "loss": 7.2638, "loss/crossentropy": 2.392876386642456, "loss/hidden": 2.8125, "loss/jsd": 0.0, "loss/logits": 0.21254249662160873, "step": 7298 }, { "epoch": 0.45625, "grad_norm": 2.03125, "grad_norm_var": 0.01687800089518229, "learning_rate": 0.0001, "loss": 7.0909, "loss/crossentropy": 1.9285590052604675, "loss/hidden": 2.765625, "loss/jsd": 0.0, "loss/logits": 0.18553440272808075, "step": 7300 }, { "epoch": 0.456375, "grad_norm": 2.09375, "grad_norm_var": 0.016717274983723957, "learning_rate": 0.0001, "loss": 7.1137, "loss/crossentropy": 2.4444403648376465, "loss/hidden": 2.859375, "loss/jsd": 0.0, "loss/logits": 0.22047097980976105, "step": 7302 }, { "epoch": 0.4565, "grad_norm": 2.015625, "grad_norm_var": 0.01741511027018229, "learning_rate": 0.0001, "loss": 7.0704, "loss/crossentropy": 2.4315890073776245, "loss/hidden": 2.8125, "loss/jsd": 0.0, "loss/logits": 0.21392318606376648, "step": 7304 }, { "epoch": 0.456625, "grad_norm": 2.140625, "grad_norm_var": 0.013889312744140625, "learning_rate": 0.0001, "loss": 7.1225, "loss/crossentropy": 2.1737417578697205, "loss/hidden": 2.90625, "loss/jsd": 0.0, "loss/logits": 0.21062488108873367, "step": 7306 }, { "epoch": 0.45675, "grad_norm": 2.078125, "grad_norm_var": 0.015036773681640626, "learning_rate": 0.0001, "loss": 7.0151, "loss/crossentropy": 1.8432039022445679, "loss/hidden": 2.828125, "loss/jsd": 0.0, "loss/logits": 0.20271048694849014, "step": 7308 }, { "epoch": 0.456875, "grad_norm": 2.171875, "grad_norm_var": 0.014113108317057291, "learning_rate": 0.0001, "loss": 7.0773, "loss/crossentropy": 2.1769769191741943, "loss/hidden": 2.8203125, "loss/jsd": 0.0, "loss/logits": 0.20057760924100876, "step": 7310 }, { "epoch": 0.457, "grad_norm": 2.453125, "grad_norm_var": 0.017267862955729168, "learning_rate": 0.0001, "loss": 7.2829, "loss/crossentropy": 2.502546191215515, "loss/hidden": 2.84375, "loss/jsd": 0.0, "loss/logits": 0.21403612941503525, "step": 7312 }, { "epoch": 0.457125, "grad_norm": 2.09375, "grad_norm_var": 0.012547810872395834, "learning_rate": 0.0001, "loss": 7.1304, "loss/crossentropy": 2.3192015886306763, "loss/hidden": 2.8125, "loss/jsd": 0.0, "loss/logits": 0.21322068572044373, "step": 7314 }, { "epoch": 0.45725, "grad_norm": 2.140625, "grad_norm_var": 0.0109771728515625, "learning_rate": 0.0001, "loss": 7.0573, "loss/crossentropy": 2.21256947517395, "loss/hidden": 2.75, "loss/jsd": 0.0, "loss/logits": 0.19907121360301971, "step": 7316 }, { "epoch": 0.457375, "grad_norm": 2.078125, "grad_norm_var": 0.011083984375, "learning_rate": 0.0001, "loss": 7.1052, "loss/crossentropy": 2.3876004219055176, "loss/hidden": 2.8203125, "loss/jsd": 0.0, "loss/logits": 0.21536500751972198, "step": 7318 }, { "epoch": 0.4575, "grad_norm": 2.21875, "grad_norm_var": 0.0110748291015625, "learning_rate": 0.0001, "loss": 7.1542, "loss/crossentropy": 2.3090778589248657, "loss/hidden": 2.828125, "loss/jsd": 0.0, "loss/logits": 0.2097294181585312, "step": 7320 }, { "epoch": 0.457625, "grad_norm": 2.140625, "grad_norm_var": 0.011226399739583334, "learning_rate": 0.0001, "loss": 7.1163, "loss/crossentropy": 2.3780629634857178, "loss/hidden": 2.8125, "loss/jsd": 0.0, "loss/logits": 0.2092108651995659, "step": 7322 }, { "epoch": 0.45775, "grad_norm": 2.09375, "grad_norm_var": 0.009137980143229167, "learning_rate": 0.0001, "loss": 7.1624, "loss/crossentropy": 2.3224358558654785, "loss/hidden": 2.8046875, "loss/jsd": 0.0, "loss/logits": 0.20188771933317184, "step": 7324 }, { "epoch": 0.457875, "grad_norm": 1.9765625, "grad_norm_var": 0.014707183837890625, "learning_rate": 0.0001, "loss": 7.1187, "loss/crossentropy": 2.044081926345825, "loss/hidden": 2.8046875, "loss/jsd": 0.0, "loss/logits": 0.18789289891719818, "step": 7326 }, { "epoch": 0.458, "grad_norm": 2.375, "grad_norm_var": 0.012182362874348958, "learning_rate": 0.0001, "loss": 7.1447, "loss/crossentropy": 2.380860447883606, "loss/hidden": 2.8203125, "loss/jsd": 0.0, "loss/logits": 0.20718378573656082, "step": 7328 }, { "epoch": 0.458125, "grad_norm": 1.9921875, "grad_norm_var": 0.0149658203125, "learning_rate": 0.0001, "loss": 7.2424, "loss/crossentropy": 2.1657201051712036, "loss/hidden": 2.7734375, "loss/jsd": 0.0, "loss/logits": 0.19616171717643738, "step": 7330 }, { "epoch": 0.45825, "grad_norm": 2.078125, "grad_norm_var": 0.017341105143229167, "learning_rate": 0.0001, "loss": 6.9994, "loss/crossentropy": 2.148615837097168, "loss/hidden": 2.8046875, "loss/jsd": 0.0, "loss/logits": 0.20785433053970337, "step": 7332 }, { "epoch": 0.458375, "grad_norm": 2.28125, "grad_norm_var": 0.018513997395833332, "learning_rate": 0.0001, "loss": 7.1808, "loss/crossentropy": 2.2177847623825073, "loss/hidden": 2.8359375, "loss/jsd": 0.0, "loss/logits": 0.21924154460430145, "step": 7334 }, { "epoch": 0.4585, "grad_norm": 2.046875, "grad_norm_var": 0.020511881510416666, "learning_rate": 0.0001, "loss": 7.1623, "loss/crossentropy": 2.2308239936828613, "loss/hidden": 2.8515625, "loss/jsd": 0.0, "loss/logits": 0.2200755625963211, "step": 7336 }, { "epoch": 0.458625, "grad_norm": 2.140625, "grad_norm_var": 0.020857747395833334, "learning_rate": 0.0001, "loss": 7.1733, "loss/crossentropy": 1.9752941727638245, "loss/hidden": 2.78125, "loss/jsd": 0.0, "loss/logits": 0.19304338842630386, "step": 7338 }, { "epoch": 0.45875, "grad_norm": 2.34375, "grad_norm_var": 0.027058919270833332, "learning_rate": 0.0001, "loss": 7.3887, "loss/crossentropy": 2.253862977027893, "loss/hidden": 2.8203125, "loss/jsd": 0.0, "loss/logits": 0.21588093787431717, "step": 7340 }, { "epoch": 0.458875, "grad_norm": 2.15625, "grad_norm_var": 0.022045644124348958, "learning_rate": 0.0001, "loss": 7.1725, "loss/crossentropy": 2.3387417793273926, "loss/hidden": 2.7890625, "loss/jsd": 0.0, "loss/logits": 0.20970949530601501, "step": 7342 }, { "epoch": 0.459, "grad_norm": 2.109375, "grad_norm_var": 0.024379221598307292, "learning_rate": 0.0001, "loss": 7.222, "loss/crossentropy": 2.1245256066322327, "loss/hidden": 2.7890625, "loss/jsd": 0.0, "loss/logits": 0.22371243685483932, "step": 7344 }, { "epoch": 0.459125, "grad_norm": 2.328125, "grad_norm_var": 0.023216756184895833, "learning_rate": 0.0001, "loss": 7.257, "loss/crossentropy": 2.2408164739608765, "loss/hidden": 2.8203125, "loss/jsd": 0.0, "loss/logits": 0.2288973107933998, "step": 7346 }, { "epoch": 0.45925, "grad_norm": 1.8515625, "grad_norm_var": 0.02879613240559896, "learning_rate": 0.0001, "loss": 7.0951, "loss/crossentropy": 2.180490016937256, "loss/hidden": 2.7890625, "loss/jsd": 0.0, "loss/logits": 0.21135683357715607, "step": 7348 }, { "epoch": 0.459375, "grad_norm": 2.203125, "grad_norm_var": 0.02787043253580729, "learning_rate": 0.0001, "loss": 7.1622, "loss/crossentropy": 2.359486937522888, "loss/hidden": 2.859375, "loss/jsd": 0.0, "loss/logits": 0.21552064269781113, "step": 7350 }, { "epoch": 0.4595, "grad_norm": 2.25, "grad_norm_var": 0.02600886027018229, "learning_rate": 0.0001, "loss": 7.2729, "loss/crossentropy": 2.4955438375473022, "loss/hidden": 2.859375, "loss/jsd": 0.0, "loss/logits": 0.2092251032590866, "step": 7352 }, { "epoch": 0.459625, "grad_norm": 2.03125, "grad_norm_var": 0.026444244384765624, "learning_rate": 0.0001, "loss": 7.1748, "loss/crossentropy": 2.3510589599609375, "loss/hidden": 2.8046875, "loss/jsd": 0.0, "loss/logits": 0.20734377205371857, "step": 7354 }, { "epoch": 0.45975, "grad_norm": 2.203125, "grad_norm_var": 0.018332672119140626, "learning_rate": 0.0001, "loss": 7.1075, "loss/crossentropy": 2.3306682109832764, "loss/hidden": 2.78125, "loss/jsd": 0.0, "loss/logits": 0.20482201874256134, "step": 7356 }, { "epoch": 0.459875, "grad_norm": 1.984375, "grad_norm_var": 0.01725031534830729, "learning_rate": 0.0001, "loss": 7.1674, "loss/crossentropy": 2.3967502117156982, "loss/hidden": 2.859375, "loss/jsd": 0.0, "loss/logits": 0.21556542068719864, "step": 7358 }, { "epoch": 0.46, "grad_norm": 2.265625, "grad_norm_var": 0.017073313395182293, "learning_rate": 0.0001, "loss": 7.0717, "loss/crossentropy": 2.274154245853424, "loss/hidden": 2.796875, "loss/jsd": 0.0, "loss/logits": 0.21294216066598892, "step": 7360 }, { "epoch": 0.460125, "grad_norm": 2.140625, "grad_norm_var": 0.013783518473307292, "learning_rate": 0.0001, "loss": 7.2142, "loss/crossentropy": 2.230677008628845, "loss/hidden": 2.78125, "loss/jsd": 0.0, "loss/logits": 0.2010795623064041, "step": 7362 }, { "epoch": 0.46025, "grad_norm": 1.9609375, "grad_norm_var": 0.009504954020182291, "learning_rate": 0.0001, "loss": 7.0379, "loss/crossentropy": 2.353756308555603, "loss/hidden": 2.828125, "loss/jsd": 0.0, "loss/logits": 0.22095441818237305, "step": 7364 }, { "epoch": 0.460375, "grad_norm": 2.140625, "grad_norm_var": 0.009458160400390625, "learning_rate": 0.0001, "loss": 6.9673, "loss/crossentropy": 2.085910201072693, "loss/hidden": 2.8046875, "loss/jsd": 0.0, "loss/logits": 0.18464379012584686, "step": 7366 }, { "epoch": 0.4605, "grad_norm": 1.984375, "grad_norm_var": 0.009199778238932291, "learning_rate": 0.0001, "loss": 6.8746, "loss/crossentropy": 2.0555408000946045, "loss/hidden": 2.859375, "loss/jsd": 0.0, "loss/logits": 0.2172028347849846, "step": 7368 }, { "epoch": 0.460625, "grad_norm": 2.15625, "grad_norm_var": 0.009907786051432292, "learning_rate": 0.0001, "loss": 7.0269, "loss/crossentropy": 2.3041187524795532, "loss/hidden": 2.7578125, "loss/jsd": 0.0, "loss/logits": 0.21645274013280869, "step": 7370 }, { "epoch": 0.46075, "grad_norm": 2.015625, "grad_norm_var": 0.009531402587890625, "learning_rate": 0.0001, "loss": 6.9228, "loss/crossentropy": 1.9540690183639526, "loss/hidden": 2.796875, "loss/jsd": 0.0, "loss/logits": 0.184081070125103, "step": 7372 }, { "epoch": 0.460875, "grad_norm": 2.109375, "grad_norm_var": 0.009034983317057292, "learning_rate": 0.0001, "loss": 7.1386, "loss/crossentropy": 2.4415656328201294, "loss/hidden": 2.8046875, "loss/jsd": 0.0, "loss/logits": 0.20297012478113174, "step": 7374 }, { "epoch": 0.461, "grad_norm": 2.078125, "grad_norm_var": 0.006528472900390625, "learning_rate": 0.0001, "loss": 7.1107, "loss/crossentropy": 2.086025834083557, "loss/hidden": 2.765625, "loss/jsd": 0.0, "loss/logits": 0.19494184106588364, "step": 7376 }, { "epoch": 0.461125, "grad_norm": 2.109375, "grad_norm_var": 0.0063860575358072914, "learning_rate": 0.0001, "loss": 7.1613, "loss/crossentropy": 2.236558198928833, "loss/hidden": 2.8203125, "loss/jsd": 0.0, "loss/logits": 0.21862926334142685, "step": 7378 }, { "epoch": 0.46125, "grad_norm": 2.109375, "grad_norm_var": 0.006243642171223958, "learning_rate": 0.0001, "loss": 7.1739, "loss/crossentropy": 2.0421109199523926, "loss/hidden": 2.7265625, "loss/jsd": 0.0, "loss/logits": 0.2004089578986168, "step": 7380 }, { "epoch": 0.461375, "grad_norm": 2.109375, "grad_norm_var": 0.006573232014973959, "learning_rate": 0.0001, "loss": 7.0469, "loss/crossentropy": 2.216292977333069, "loss/hidden": 2.796875, "loss/jsd": 0.0, "loss/logits": 0.1917620524764061, "step": 7382 }, { "epoch": 0.4615, "grad_norm": 1.9921875, "grad_norm_var": 0.006883748372395833, "learning_rate": 0.0001, "loss": 7.1088, "loss/crossentropy": 2.209986925125122, "loss/hidden": 2.78125, "loss/jsd": 0.0, "loss/logits": 0.20837366580963135, "step": 7384 }, { "epoch": 0.461625, "grad_norm": 2.609375, "grad_norm_var": 0.022809855143229165, "learning_rate": 0.0001, "loss": 7.3591, "loss/crossentropy": 2.3474618196487427, "loss/hidden": 2.8125, "loss/jsd": 0.0, "loss/logits": 0.22284895181655884, "step": 7386 }, { "epoch": 0.46175, "grad_norm": 2.0, "grad_norm_var": 0.0231597900390625, "learning_rate": 0.0001, "loss": 7.0282, "loss/crossentropy": 2.0832881927490234, "loss/hidden": 2.8046875, "loss/jsd": 0.0, "loss/logits": 0.17219309508800507, "step": 7388 }, { "epoch": 0.461875, "grad_norm": 2.21875, "grad_norm_var": 0.023249308268229168, "learning_rate": 0.0001, "loss": 7.1468, "loss/crossentropy": 2.2276368141174316, "loss/hidden": 2.875, "loss/jsd": 0.0, "loss/logits": 0.19344830513000488, "step": 7390 }, { "epoch": 0.462, "grad_norm": 2.296875, "grad_norm_var": 0.025169881184895833, "learning_rate": 0.0001, "loss": 7.0684, "loss/crossentropy": 2.4470431804656982, "loss/hidden": 2.8203125, "loss/jsd": 0.0, "loss/logits": 0.22533931583166122, "step": 7392 }, { "epoch": 0.462125, "grad_norm": 1.9765625, "grad_norm_var": 0.02926813761393229, "learning_rate": 0.0001, "loss": 7.0658, "loss/crossentropy": 2.2905768156051636, "loss/hidden": 2.875, "loss/jsd": 0.0, "loss/logits": 0.20665580034255981, "step": 7394 }, { "epoch": 0.46225, "grad_norm": 2.359375, "grad_norm_var": 0.0297760009765625, "learning_rate": 0.0001, "loss": 7.1369, "loss/crossentropy": 2.167145550251007, "loss/hidden": 2.8125, "loss/jsd": 0.0, "loss/logits": 0.21655890345573425, "step": 7396 }, { "epoch": 0.462375, "grad_norm": 1.984375, "grad_norm_var": 0.03216552734375, "learning_rate": 0.0001, "loss": 7.0859, "loss/crossentropy": 2.15854412317276, "loss/hidden": 2.8046875, "loss/jsd": 0.0, "loss/logits": 0.20027539879083633, "step": 7398 }, { "epoch": 0.4625, "grad_norm": 2.28125, "grad_norm_var": 0.03135350545247396, "learning_rate": 0.0001, "loss": 7.1261, "loss/crossentropy": 2.1508615016937256, "loss/hidden": 2.8125, "loss/jsd": 0.0, "loss/logits": 0.18814773857593536, "step": 7400 }, { "epoch": 0.462625, "grad_norm": 1.9921875, "grad_norm_var": 0.0194732666015625, "learning_rate": 0.0001, "loss": 7.134, "loss/crossentropy": 1.842678189277649, "loss/hidden": 2.828125, "loss/jsd": 0.0, "loss/logits": 0.18930108845233917, "step": 7402 }, { "epoch": 0.46275, "grad_norm": 2.125, "grad_norm_var": 0.018400065104166665, "learning_rate": 0.0001, "loss": 7.2671, "loss/crossentropy": 2.4093555212020874, "loss/hidden": 2.8359375, "loss/jsd": 0.0, "loss/logits": 0.21225199848413467, "step": 7404 }, { "epoch": 0.462875, "grad_norm": 2.265625, "grad_norm_var": 0.019188435872395833, "learning_rate": 0.0001, "loss": 7.2534, "loss/crossentropy": 1.9698211550712585, "loss/hidden": 2.828125, "loss/jsd": 0.0, "loss/logits": 0.21386128664016724, "step": 7406 }, { "epoch": 0.463, "grad_norm": 1.9375, "grad_norm_var": 0.0200836181640625, "learning_rate": 0.0001, "loss": 7.2251, "loss/crossentropy": 2.4693849086761475, "loss/hidden": 2.8203125, "loss/jsd": 0.0, "loss/logits": 0.20892548561096191, "step": 7408 }, { "epoch": 0.463125, "grad_norm": 2.0625, "grad_norm_var": 0.01602350870768229, "learning_rate": 0.0001, "loss": 7.0863, "loss/crossentropy": 2.1525356769561768, "loss/hidden": 2.796875, "loss/jsd": 0.0, "loss/logits": 0.20484411716461182, "step": 7410 }, { "epoch": 0.46325, "grad_norm": 2.171875, "grad_norm_var": 0.023339589436848957, "learning_rate": 0.0001, "loss": 7.0279, "loss/crossentropy": 2.3783161640167236, "loss/hidden": 2.84375, "loss/jsd": 0.0, "loss/logits": 0.19495395570993423, "step": 7412 }, { "epoch": 0.463375, "grad_norm": 1.9921875, "grad_norm_var": 0.022850545247395833, "learning_rate": 0.0001, "loss": 7.2075, "loss/crossentropy": 2.3271056413650513, "loss/hidden": 2.78125, "loss/jsd": 0.0, "loss/logits": 0.19265718758106232, "step": 7414 }, { "epoch": 0.4635, "grad_norm": 3.0625, "grad_norm_var": 3.065482584635417, "learning_rate": 0.0001, "loss": 7.6398, "loss/crossentropy": 2.133812189102173, "loss/hidden": 2.8203125, "loss/jsd": 0.0, "loss/logits": 0.20769577473402023, "step": 7416 }, { "epoch": 0.463625, "grad_norm": 2.09375, "grad_norm_var": 3.042229970296224, "learning_rate": 0.0001, "loss": 7.213, "loss/crossentropy": 2.110043227672577, "loss/hidden": 2.9375, "loss/jsd": 0.0, "loss/logits": 0.21482352912425995, "step": 7418 }, { "epoch": 0.46375, "grad_norm": 2.125, "grad_norm_var": 3.038519032796224, "learning_rate": 0.0001, "loss": 7.1561, "loss/crossentropy": 2.31582248210907, "loss/hidden": 2.7890625, "loss/jsd": 0.0, "loss/logits": 0.19608522206544876, "step": 7420 }, { "epoch": 0.463875, "grad_norm": 2.0625, "grad_norm_var": 3.049450429280599, "learning_rate": 0.0001, "loss": 7.3156, "loss/crossentropy": 2.19181752204895, "loss/hidden": 2.9453125, "loss/jsd": 0.0, "loss/logits": 0.2320917323231697, "step": 7422 }, { "epoch": 0.464, "grad_norm": 2.21875, "grad_norm_var": 3.0374794006347656, "learning_rate": 0.0001, "loss": 7.1618, "loss/crossentropy": 2.466021180152893, "loss/hidden": 2.78125, "loss/jsd": 0.0, "loss/logits": 0.20255694538354874, "step": 7424 }, { "epoch": 0.464125, "grad_norm": 2.15625, "grad_norm_var": 3.039378865559896, "learning_rate": 0.0001, "loss": 7.1428, "loss/crossentropy": 2.425376534461975, "loss/hidden": 2.7890625, "loss/jsd": 0.0, "loss/logits": 0.19863475114107132, "step": 7426 }, { "epoch": 0.46425, "grad_norm": 2.265625, "grad_norm_var": 3.04412841796875, "learning_rate": 0.0001, "loss": 7.0199, "loss/crossentropy": 2.244443416595459, "loss/hidden": 2.7578125, "loss/jsd": 0.0, "loss/logits": 0.19751153886318207, "step": 7428 }, { "epoch": 0.464375, "grad_norm": 1.9609375, "grad_norm_var": 3.063792928059896, "learning_rate": 0.0001, "loss": 7.0883, "loss/crossentropy": 2.2773425579071045, "loss/hidden": 2.8203125, "loss/jsd": 0.0, "loss/logits": 0.2200421243906021, "step": 7430 }, { "epoch": 0.4645, "grad_norm": 2.3125, "grad_norm_var": 0.01649169921875, "learning_rate": 0.0001, "loss": 7.1049, "loss/crossentropy": 2.173538327217102, "loss/hidden": 2.84375, "loss/jsd": 0.0, "loss/logits": 0.21017886698246002, "step": 7432 }, { "epoch": 0.464625, "grad_norm": 2.15625, "grad_norm_var": 0.013334147135416667, "learning_rate": 0.0001, "loss": 7.2421, "loss/crossentropy": 2.190195083618164, "loss/hidden": 2.828125, "loss/jsd": 0.0, "loss/logits": 0.2085673287510872, "step": 7434 }, { "epoch": 0.46475, "grad_norm": 2.015625, "grad_norm_var": 0.014900716145833333, "learning_rate": 0.0001, "loss": 7.1757, "loss/crossentropy": 2.2361714839935303, "loss/hidden": 2.75, "loss/jsd": 0.0, "loss/logits": 0.20774667710065842, "step": 7436 }, { "epoch": 0.464875, "grad_norm": 2.03125, "grad_norm_var": 0.014533487955729167, "learning_rate": 0.0001, "loss": 7.0734, "loss/crossentropy": 2.3481364250183105, "loss/hidden": 2.796875, "loss/jsd": 0.0, "loss/logits": 0.19645172357559204, "step": 7438 }, { "epoch": 0.465, "grad_norm": 1.9296875, "grad_norm_var": 0.019789377848307293, "learning_rate": 0.0001, "loss": 7.0093, "loss/crossentropy": 1.8984848260879517, "loss/hidden": 2.7421875, "loss/jsd": 0.0, "loss/logits": 0.19228964298963547, "step": 7440 }, { "epoch": 0.465125, "grad_norm": 2.234375, "grad_norm_var": 0.019453938802083334, "learning_rate": 0.0001, "loss": 6.9896, "loss/crossentropy": 2.2871270179748535, "loss/hidden": 2.8515625, "loss/jsd": 0.0, "loss/logits": 0.2178485319018364, "step": 7442 }, { "epoch": 0.46525, "grad_norm": 2.09375, "grad_norm_var": 0.01668701171875, "learning_rate": 0.0001, "loss": 7.0334, "loss/crossentropy": 2.2983537912368774, "loss/hidden": 2.7578125, "loss/jsd": 0.0, "loss/logits": 0.1965767741203308, "step": 7444 }, { "epoch": 0.465375, "grad_norm": 2.03125, "grad_norm_var": 0.014768218994140625, "learning_rate": 0.0001, "loss": 7.2321, "loss/crossentropy": 2.178062319755554, "loss/hidden": 2.796875, "loss/jsd": 0.0, "loss/logits": 0.1993517056107521, "step": 7446 }, { "epoch": 0.4655, "grad_norm": 1.921875, "grad_norm_var": 0.013632965087890626, "learning_rate": 0.0001, "loss": 7.0158, "loss/crossentropy": 2.1353349685668945, "loss/hidden": 2.7578125, "loss/jsd": 0.0, "loss/logits": 0.1889559030532837, "step": 7448 }, { "epoch": 0.465625, "grad_norm": 2.421875, "grad_norm_var": 0.02373224894205729, "learning_rate": 0.0001, "loss": 7.2622, "loss/crossentropy": 2.2076762914657593, "loss/hidden": 2.734375, "loss/jsd": 0.0, "loss/logits": 0.2045726329088211, "step": 7450 }, { "epoch": 0.46575, "grad_norm": 2.15625, "grad_norm_var": 0.022900136311848958, "learning_rate": 0.0001, "loss": 7.1263, "loss/crossentropy": 2.4784727096557617, "loss/hidden": 2.8984375, "loss/jsd": 0.0, "loss/logits": 0.21691476553678513, "step": 7452 }, { "epoch": 0.465875, "grad_norm": 2.125, "grad_norm_var": 0.02217585245768229, "learning_rate": 0.0001, "loss": 7.0873, "loss/crossentropy": 1.949112594127655, "loss/hidden": 2.828125, "loss/jsd": 0.0, "loss/logits": 0.21352668851613998, "step": 7454 }, { "epoch": 0.466, "grad_norm": 2.046875, "grad_norm_var": 0.017710113525390626, "learning_rate": 0.0001, "loss": 7.0893, "loss/crossentropy": 1.9374956488609314, "loss/hidden": 2.78125, "loss/jsd": 0.0, "loss/logits": 0.19351773709058762, "step": 7456 }, { "epoch": 0.466125, "grad_norm": 2.171875, "grad_norm_var": 0.017227935791015624, "learning_rate": 0.0001, "loss": 7.107, "loss/crossentropy": 2.6141308546066284, "loss/hidden": 2.8671875, "loss/jsd": 0.0, "loss/logits": 0.22147035598754883, "step": 7458 }, { "epoch": 0.46625, "grad_norm": 2.109375, "grad_norm_var": 0.017411041259765624, "learning_rate": 0.0001, "loss": 7.2292, "loss/crossentropy": 2.4253780841827393, "loss/hidden": 2.84375, "loss/jsd": 0.0, "loss/logits": 0.21378996968269348, "step": 7460 }, { "epoch": 0.466375, "grad_norm": 2.125, "grad_norm_var": 0.017050933837890626, "learning_rate": 0.0001, "loss": 7.1957, "loss/crossentropy": 2.009346067905426, "loss/hidden": 2.84375, "loss/jsd": 0.0, "loss/logits": 0.2008848488330841, "step": 7462 }, { "epoch": 0.4665, "grad_norm": 2.359375, "grad_norm_var": 0.01788508097330729, "learning_rate": 0.0001, "loss": 7.2114, "loss/crossentropy": 2.048809766769409, "loss/hidden": 2.7890625, "loss/jsd": 0.0, "loss/logits": 0.19820420444011688, "step": 7464 }, { "epoch": 0.466625, "grad_norm": 2.03125, "grad_norm_var": 0.009893544514973958, "learning_rate": 0.0001, "loss": 7.2132, "loss/crossentropy": 2.3207257986068726, "loss/hidden": 2.7578125, "loss/jsd": 0.0, "loss/logits": 0.20752790570259094, "step": 7466 }, { "epoch": 0.46675, "grad_norm": 2.15625, "grad_norm_var": 0.010607655843098958, "learning_rate": 0.0001, "loss": 7.0494, "loss/crossentropy": 2.112247109413147, "loss/hidden": 2.7421875, "loss/jsd": 0.0, "loss/logits": 0.19827520847320557, "step": 7468 }, { "epoch": 0.466875, "grad_norm": 2.125, "grad_norm_var": 0.015447743733723958, "learning_rate": 0.0001, "loss": 7.1738, "loss/crossentropy": 2.2079219818115234, "loss/hidden": 2.828125, "loss/jsd": 0.0, "loss/logits": 0.22125893086194992, "step": 7470 }, { "epoch": 0.467, "grad_norm": 2.09375, "grad_norm_var": 0.013386027018229166, "learning_rate": 0.0001, "loss": 7.0565, "loss/crossentropy": 2.2796353101730347, "loss/hidden": 2.921875, "loss/jsd": 0.0, "loss/logits": 0.20792751014232635, "step": 7472 }, { "epoch": 0.467125, "grad_norm": 2.09375, "grad_norm_var": 0.0126129150390625, "learning_rate": 0.0001, "loss": 7.1541, "loss/crossentropy": 2.4691073894500732, "loss/hidden": 2.84375, "loss/jsd": 0.0, "loss/logits": 0.2315238192677498, "step": 7474 }, { "epoch": 0.46725, "grad_norm": 2.09375, "grad_norm_var": 0.012727864583333333, "learning_rate": 0.0001, "loss": 6.9937, "loss/crossentropy": 2.2356735467910767, "loss/hidden": 2.8203125, "loss/jsd": 0.0, "loss/logits": 0.19917455315589905, "step": 7476 }, { "epoch": 0.467375, "grad_norm": 2.140625, "grad_norm_var": 0.0129058837890625, "learning_rate": 0.0001, "loss": 7.1406, "loss/crossentropy": 2.2866714000701904, "loss/hidden": 2.828125, "loss/jsd": 0.0, "loss/logits": 0.21963119506835938, "step": 7478 }, { "epoch": 0.4675, "grad_norm": 2.140625, "grad_norm_var": 0.0102935791015625, "learning_rate": 0.0001, "loss": 7.0201, "loss/crossentropy": 2.419238805770874, "loss/hidden": 2.8515625, "loss/jsd": 0.0, "loss/logits": 0.21799220889806747, "step": 7480 }, { "epoch": 0.467625, "grad_norm": 2.171875, "grad_norm_var": 0.0093658447265625, "learning_rate": 0.0001, "loss": 7.0075, "loss/crossentropy": 2.33626389503479, "loss/hidden": 2.8046875, "loss/jsd": 0.0, "loss/logits": 0.20474457740783691, "step": 7482 }, { "epoch": 0.46775, "grad_norm": 2.0, "grad_norm_var": 0.009691365559895833, "learning_rate": 0.0001, "loss": 7.1476, "loss/crossentropy": 2.276577591896057, "loss/hidden": 2.84375, "loss/jsd": 0.0, "loss/logits": 0.2041800171136856, "step": 7484 }, { "epoch": 0.467875, "grad_norm": 2.375, "grad_norm_var": 0.008202107747395833, "learning_rate": 0.0001, "loss": 7.1914, "loss/crossentropy": 2.279148817062378, "loss/hidden": 2.7734375, "loss/jsd": 0.0, "loss/logits": 0.21196125447750092, "step": 7486 }, { "epoch": 0.468, "grad_norm": 1.921875, "grad_norm_var": 0.01376953125, "learning_rate": 0.0001, "loss": 7.0034, "loss/crossentropy": 2.241237759590149, "loss/hidden": 2.84375, "loss/jsd": 0.0, "loss/logits": 0.19696836173534393, "step": 7488 }, { "epoch": 0.468125, "grad_norm": 2.140625, "grad_norm_var": 0.01412353515625, "learning_rate": 0.0001, "loss": 7.1467, "loss/crossentropy": 2.1839855909347534, "loss/hidden": 2.828125, "loss/jsd": 0.0, "loss/logits": 0.19702082127332687, "step": 7490 }, { "epoch": 0.46825, "grad_norm": 2.078125, "grad_norm_var": 0.013988240559895834, "learning_rate": 0.0001, "loss": 7.133, "loss/crossentropy": 2.1689497232437134, "loss/hidden": 2.859375, "loss/jsd": 0.0, "loss/logits": 0.2228970304131508, "step": 7492 }, { "epoch": 0.468375, "grad_norm": 2.25, "grad_norm_var": 0.013060506184895833, "learning_rate": 0.0001, "loss": 7.1162, "loss/crossentropy": 2.161349654197693, "loss/hidden": 2.78125, "loss/jsd": 0.0, "loss/logits": 0.21048546582460403, "step": 7494 }, { "epoch": 0.4685, "grad_norm": 2.1875, "grad_norm_var": 0.016910807291666666, "learning_rate": 0.0001, "loss": 7.1184, "loss/crossentropy": 2.362098455429077, "loss/hidden": 2.765625, "loss/jsd": 0.0, "loss/logits": 0.20488198101520538, "step": 7496 }, { "epoch": 0.468625, "grad_norm": 2.203125, "grad_norm_var": 0.0183502197265625, "learning_rate": 0.0001, "loss": 6.9562, "loss/crossentropy": 1.9627411365509033, "loss/hidden": 2.765625, "loss/jsd": 0.0, "loss/logits": 0.19364163279533386, "step": 7498 }, { "epoch": 0.46875, "grad_norm": 2.09375, "grad_norm_var": 0.01845703125, "learning_rate": 0.0001, "loss": 7.062, "loss/crossentropy": 2.4354528188705444, "loss/hidden": 2.8359375, "loss/jsd": 0.0, "loss/logits": 0.21706019341945648, "step": 7500 }, { "epoch": 0.468875, "grad_norm": 2.828125, "grad_norm_var": 0.04521484375, "learning_rate": 0.0001, "loss": 7.2507, "loss/crossentropy": 2.263838768005371, "loss/hidden": 2.796875, "loss/jsd": 0.0, "loss/logits": 0.20706459134817123, "step": 7502 }, { "epoch": 0.469, "grad_norm": 2.375, "grad_norm_var": 0.0410552978515625, "learning_rate": 0.0001, "loss": 6.9699, "loss/crossentropy": 2.2720807790756226, "loss/hidden": 2.7890625, "loss/jsd": 0.0, "loss/logits": 0.21732699871063232, "step": 7504 }, { "epoch": 0.469125, "grad_norm": 2.3125, "grad_norm_var": 0.0393707275390625, "learning_rate": 0.0001, "loss": 7.3032, "loss/crossentropy": 2.527849555015564, "loss/hidden": 2.8125, "loss/jsd": 0.0, "loss/logits": 0.2073214203119278, "step": 7506 }, { "epoch": 0.46925, "grad_norm": 2.34375, "grad_norm_var": 0.0349761962890625, "learning_rate": 0.0001, "loss": 7.1871, "loss/crossentropy": 2.335466742515564, "loss/hidden": 2.8671875, "loss/jsd": 0.0, "loss/logits": 0.21534393727779388, "step": 7508 }, { "epoch": 0.469375, "grad_norm": 2.0625, "grad_norm_var": 0.04195556640625, "learning_rate": 0.0001, "loss": 7.0814, "loss/crossentropy": 2.4958906173706055, "loss/hidden": 2.8125, "loss/jsd": 0.0, "loss/logits": 0.20870383828878403, "step": 7510 }, { "epoch": 0.4695, "grad_norm": 1.9921875, "grad_norm_var": 0.046567535400390624, "learning_rate": 0.0001, "loss": 7.1191, "loss/crossentropy": 2.233251214027405, "loss/hidden": 2.8046875, "loss/jsd": 0.0, "loss/logits": 0.19957691431045532, "step": 7512 }, { "epoch": 0.469625, "grad_norm": 2.28125, "grad_norm_var": 0.04698867797851562, "learning_rate": 0.0001, "loss": 7.2159, "loss/crossentropy": 2.1861671209335327, "loss/hidden": 2.796875, "loss/jsd": 0.0, "loss/logits": 0.2184196561574936, "step": 7514 }, { "epoch": 0.46975, "grad_norm": 2.21875, "grad_norm_var": 0.041473134358723955, "learning_rate": 0.0001, "loss": 7.288, "loss/crossentropy": 2.273300290107727, "loss/hidden": 2.859375, "loss/jsd": 0.0, "loss/logits": 0.2080041542649269, "step": 7516 }, { "epoch": 0.469875, "grad_norm": 1.921875, "grad_norm_var": 0.02493260701497396, "learning_rate": 0.0001, "loss": 7.1214, "loss/crossentropy": 2.286626935005188, "loss/hidden": 2.890625, "loss/jsd": 0.0, "loss/logits": 0.23020273447036743, "step": 7518 }, { "epoch": 0.47, "grad_norm": 2.25, "grad_norm_var": 0.01982599894205729, "learning_rate": 0.0001, "loss": 7.1163, "loss/crossentropy": 2.25377357006073, "loss/hidden": 2.8046875, "loss/jsd": 0.0, "loss/logits": 0.19812580943107605, "step": 7520 }, { "epoch": 0.470125, "grad_norm": 2.25, "grad_norm_var": 0.01633478800455729, "learning_rate": 0.0001, "loss": 6.9896, "loss/crossentropy": 2.268404960632324, "loss/hidden": 2.8359375, "loss/jsd": 0.0, "loss/logits": 0.22042839229106903, "step": 7522 }, { "epoch": 0.47025, "grad_norm": 1.9375, "grad_norm_var": 0.016017405192057292, "learning_rate": 0.0001, "loss": 7.0678, "loss/crossentropy": 2.1755711436271667, "loss/hidden": 2.8671875, "loss/jsd": 0.0, "loss/logits": 0.21114220470190048, "step": 7524 }, { "epoch": 0.470375, "grad_norm": 2.3125, "grad_norm_var": 0.017093658447265625, "learning_rate": 0.0001, "loss": 7.1598, "loss/crossentropy": 2.4401893615722656, "loss/hidden": 3.0, "loss/jsd": 0.0, "loss/logits": 0.22914503514766693, "step": 7526 }, { "epoch": 0.4705, "grad_norm": 2.046875, "grad_norm_var": 0.01617431640625, "learning_rate": 0.0001, "loss": 7.2893, "loss/crossentropy": 2.613025188446045, "loss/hidden": 2.8515625, "loss/jsd": 0.0, "loss/logits": 0.21846508979797363, "step": 7528 }, { "epoch": 0.470625, "grad_norm": 2.359375, "grad_norm_var": 0.019331868489583334, "learning_rate": 0.0001, "loss": 7.2158, "loss/crossentropy": 2.3140453100204468, "loss/hidden": 2.7734375, "loss/jsd": 0.0, "loss/logits": 0.22654858231544495, "step": 7530 }, { "epoch": 0.47075, "grad_norm": 2.09375, "grad_norm_var": 0.0232330322265625, "learning_rate": 0.0001, "loss": 7.0316, "loss/crossentropy": 2.0402657985687256, "loss/hidden": 2.7890625, "loss/jsd": 0.0, "loss/logits": 0.18750952184200287, "step": 7532 }, { "epoch": 0.470875, "grad_norm": 2.046875, "grad_norm_var": 0.01988525390625, "learning_rate": 0.0001, "loss": 7.1331, "loss/crossentropy": 2.567373514175415, "loss/hidden": 2.9296875, "loss/jsd": 0.0, "loss/logits": 0.22395136952400208, "step": 7534 }, { "epoch": 0.471, "grad_norm": 2.140625, "grad_norm_var": 0.023465983072916665, "learning_rate": 0.0001, "loss": 7.2494, "loss/crossentropy": 2.3188188076019287, "loss/hidden": 2.828125, "loss/jsd": 0.0, "loss/logits": 0.21441180258989334, "step": 7536 }, { "epoch": 0.471125, "grad_norm": 2.296875, "grad_norm_var": 0.02314453125, "learning_rate": 0.0001, "loss": 7.0734, "loss/crossentropy": 2.2127068042755127, "loss/hidden": 2.8125, "loss/jsd": 0.0, "loss/logits": 0.1956922933459282, "step": 7538 }, { "epoch": 0.47125, "grad_norm": 2.015625, "grad_norm_var": 0.021028645833333335, "learning_rate": 0.0001, "loss": 6.9942, "loss/crossentropy": 2.3275548219680786, "loss/hidden": 2.7890625, "loss/jsd": 0.0, "loss/logits": 0.21980319917201996, "step": 7540 }, { "epoch": 0.471375, "grad_norm": 2.046875, "grad_norm_var": 0.02574462890625, "learning_rate": 0.0001, "loss": 7.2398, "loss/crossentropy": 2.232550859451294, "loss/hidden": 2.8046875, "loss/jsd": 0.0, "loss/logits": 0.18873104453086853, "step": 7542 }, { "epoch": 0.4715, "grad_norm": 2.140625, "grad_norm_var": 0.026496378580729167, "learning_rate": 0.0001, "loss": 7.1301, "loss/crossentropy": 2.1725897789001465, "loss/hidden": 2.75, "loss/jsd": 0.0, "loss/logits": 0.20075791329145432, "step": 7544 }, { "epoch": 0.471625, "grad_norm": 2.046875, "grad_norm_var": 0.023665364583333334, "learning_rate": 0.0001, "loss": 7.0794, "loss/crossentropy": 2.0849578976631165, "loss/hidden": 2.84375, "loss/jsd": 0.0, "loss/logits": 0.20648343861103058, "step": 7546 }, { "epoch": 0.47175, "grad_norm": 2.140625, "grad_norm_var": 0.020296223958333335, "learning_rate": 0.0001, "loss": 7.0611, "loss/crossentropy": 2.085715174674988, "loss/hidden": 2.8125, "loss/jsd": 0.0, "loss/logits": 0.2036331295967102, "step": 7548 }, { "epoch": 0.471875, "grad_norm": 2.15625, "grad_norm_var": 0.019319661458333335, "learning_rate": 0.0001, "loss": 7.1509, "loss/crossentropy": 1.9772019982337952, "loss/hidden": 2.8515625, "loss/jsd": 0.0, "loss/logits": 0.19998165220022202, "step": 7550 }, { "epoch": 0.472, "grad_norm": 2.0625, "grad_norm_var": 0.016141764322916665, "learning_rate": 0.0001, "loss": 7.0461, "loss/crossentropy": 2.2595661878585815, "loss/hidden": 2.8203125, "loss/jsd": 0.0, "loss/logits": 0.20990056544542313, "step": 7552 }, { "epoch": 0.472125, "grad_norm": 2.046875, "grad_norm_var": 0.016673787434895834, "learning_rate": 0.0001, "loss": 7.0228, "loss/crossentropy": 2.2455222606658936, "loss/hidden": 2.8125, "loss/jsd": 0.0, "loss/logits": 0.2194531187415123, "step": 7554 }, { "epoch": 0.47225, "grad_norm": 2.140625, "grad_norm_var": 0.014567057291666666, "learning_rate": 0.0001, "loss": 7.1894, "loss/crossentropy": 2.240777611732483, "loss/hidden": 2.78125, "loss/jsd": 0.0, "loss/logits": 0.20595575124025345, "step": 7556 }, { "epoch": 0.472375, "grad_norm": 2.109375, "grad_norm_var": 0.008161417643229167, "learning_rate": 0.0001, "loss": 7.06, "loss/crossentropy": 2.1476142406463623, "loss/hidden": 2.765625, "loss/jsd": 0.0, "loss/logits": 0.18031620979309082, "step": 7558 }, { "epoch": 0.4725, "grad_norm": 2.109375, "grad_norm_var": 0.008275349934895834, "learning_rate": 0.0001, "loss": 7.0674, "loss/crossentropy": 2.2802597284317017, "loss/hidden": 2.859375, "loss/jsd": 0.0, "loss/logits": 0.20469427108764648, "step": 7560 }, { "epoch": 0.472625, "grad_norm": 2.109375, "grad_norm_var": 0.007645670572916667, "learning_rate": 0.0001, "loss": 7.0796, "loss/crossentropy": 2.3202688694000244, "loss/hidden": 2.875, "loss/jsd": 0.0, "loss/logits": 0.2028881385922432, "step": 7562 }, { "epoch": 0.47275, "grad_norm": 2.109375, "grad_norm_var": 0.00732421875, "learning_rate": 0.0001, "loss": 7.0198, "loss/crossentropy": 2.185010552406311, "loss/hidden": 2.796875, "loss/jsd": 0.0, "loss/logits": 0.22524073719978333, "step": 7564 }, { "epoch": 0.472875, "grad_norm": 2.421875, "grad_norm_var": 0.01376953125, "learning_rate": 0.0001, "loss": 7.1435, "loss/crossentropy": 2.1675198078155518, "loss/hidden": 2.75, "loss/jsd": 0.0, "loss/logits": 0.19807401299476624, "step": 7566 }, { "epoch": 0.473, "grad_norm": 1.890625, "grad_norm_var": 0.016803995768229166, "learning_rate": 0.0001, "loss": 7.0201, "loss/crossentropy": 2.437238574028015, "loss/hidden": 2.96875, "loss/jsd": 0.0, "loss/logits": 0.2327703759074211, "step": 7568 }, { "epoch": 0.473125, "grad_norm": 2.09375, "grad_norm_var": 0.0144927978515625, "learning_rate": 0.0001, "loss": 6.9488, "loss/crossentropy": 2.4254690408706665, "loss/hidden": 2.8828125, "loss/jsd": 0.0, "loss/logits": 0.21333793550729752, "step": 7570 }, { "epoch": 0.47325, "grad_norm": 2.046875, "grad_norm_var": 0.016276041666666668, "learning_rate": 0.0001, "loss": 7.3178, "loss/crossentropy": 2.5538944005966187, "loss/hidden": 2.8671875, "loss/jsd": 0.0, "loss/logits": 0.21501559019088745, "step": 7572 }, { "epoch": 0.473375, "grad_norm": 2.078125, "grad_norm_var": 0.01484375, "learning_rate": 0.0001, "loss": 7.1061, "loss/crossentropy": 2.2689521312713623, "loss/hidden": 2.7421875, "loss/jsd": 0.0, "loss/logits": 0.19831790030002594, "step": 7574 }, { "epoch": 0.4735, "grad_norm": 2.28125, "grad_norm_var": 0.015523274739583334, "learning_rate": 0.0001, "loss": 7.0325, "loss/crossentropy": 1.871632695198059, "loss/hidden": 2.8046875, "loss/jsd": 0.0, "loss/logits": 0.19257056713104248, "step": 7576 }, { "epoch": 0.473625, "grad_norm": 1.953125, "grad_norm_var": 0.017496744791666668, "learning_rate": 0.0001, "loss": 7.165, "loss/crossentropy": 2.2190489768981934, "loss/hidden": 2.7890625, "loss/jsd": 0.0, "loss/logits": 0.20526662468910217, "step": 7578 }, { "epoch": 0.47375, "grad_norm": 2.109375, "grad_norm_var": 0.017316691080729165, "learning_rate": 0.0001, "loss": 7.3, "loss/crossentropy": 2.4316731691360474, "loss/hidden": 2.7890625, "loss/jsd": 0.0, "loss/logits": 0.21631304919719696, "step": 7580 }, { "epoch": 0.473875, "grad_norm": 2.0625, "grad_norm_var": 0.009273274739583334, "learning_rate": 0.0001, "loss": 7.1383, "loss/crossentropy": 2.157890558242798, "loss/hidden": 2.828125, "loss/jsd": 0.0, "loss/logits": 0.18887675553560257, "step": 7582 }, { "epoch": 0.474, "grad_norm": 2.015625, "grad_norm_var": 0.007763671875, "learning_rate": 0.0001, "loss": 7.1594, "loss/crossentropy": 2.0798093676567078, "loss/hidden": 2.8359375, "loss/jsd": 0.0, "loss/logits": 0.2017417550086975, "step": 7584 }, { "epoch": 0.474125, "grad_norm": 2.4375, "grad_norm_var": 0.01455078125, "learning_rate": 0.0001, "loss": 7.2126, "loss/crossentropy": 2.38494336605072, "loss/hidden": 2.921875, "loss/jsd": 0.0, "loss/logits": 0.22561874240636826, "step": 7586 }, { "epoch": 0.47425, "grad_norm": 2.125, "grad_norm_var": 0.018294270833333334, "learning_rate": 0.0001, "loss": 7.2926, "loss/crossentropy": 2.4800193309783936, "loss/hidden": 2.9140625, "loss/jsd": 0.0, "loss/logits": 0.2282564714550972, "step": 7588 }, { "epoch": 0.474375, "grad_norm": 2.28125, "grad_norm_var": 0.02139460245768229, "learning_rate": 0.0001, "loss": 7.0307, "loss/crossentropy": 2.2453829050064087, "loss/hidden": 2.8203125, "loss/jsd": 0.0, "loss/logits": 0.21037797629833221, "step": 7590 }, { "epoch": 0.4745, "grad_norm": 2.28125, "grad_norm_var": 0.021469879150390624, "learning_rate": 0.0001, "loss": 7.208, "loss/crossentropy": 2.2749218940734863, "loss/hidden": 2.8125, "loss/jsd": 0.0, "loss/logits": 0.2083357498049736, "step": 7592 }, { "epoch": 0.474625, "grad_norm": 2.015625, "grad_norm_var": 0.019608306884765624, "learning_rate": 0.0001, "loss": 7.0957, "loss/crossentropy": 2.182799458503723, "loss/hidden": 2.8125, "loss/jsd": 0.0, "loss/logits": 0.18513298779726028, "step": 7594 }, { "epoch": 0.47475, "grad_norm": 2.0625, "grad_norm_var": 0.020304107666015626, "learning_rate": 0.0001, "loss": 7.0361, "loss/crossentropy": 2.3911205530166626, "loss/hidden": 2.84375, "loss/jsd": 0.0, "loss/logits": 0.20220442116260529, "step": 7596 }, { "epoch": 0.474875, "grad_norm": 2.078125, "grad_norm_var": 0.019712066650390624, "learning_rate": 0.0001, "loss": 7.2206, "loss/crossentropy": 2.3215757608413696, "loss/hidden": 2.8359375, "loss/jsd": 0.0, "loss/logits": 0.2056894525885582, "step": 7598 }, { "epoch": 0.475, "grad_norm": 2.015625, "grad_norm_var": 0.019589996337890624, "learning_rate": 0.0001, "loss": 7.1877, "loss/crossentropy": 2.3881407976150513, "loss/hidden": 2.921875, "loss/jsd": 0.0, "loss/logits": 0.2328326404094696, "step": 7600 }, { "epoch": 0.475125, "grad_norm": 2.0625, "grad_norm_var": 0.01568578084309896, "learning_rate": 0.0001, "loss": 6.955, "loss/crossentropy": 2.414221405982971, "loss/hidden": 2.859375, "loss/jsd": 0.0, "loss/logits": 0.20993871241807938, "step": 7602 }, { "epoch": 0.47525, "grad_norm": 2.296875, "grad_norm_var": 0.012276204427083333, "learning_rate": 0.0001, "loss": 7.0465, "loss/crossentropy": 2.051502525806427, "loss/hidden": 2.7421875, "loss/jsd": 0.0, "loss/logits": 0.18645334243774414, "step": 7604 }, { "epoch": 0.475375, "grad_norm": 2.109375, "grad_norm_var": 0.010241444905598958, "learning_rate": 0.0001, "loss": 7.1153, "loss/crossentropy": 2.0045130848884583, "loss/hidden": 2.8046875, "loss/jsd": 0.0, "loss/logits": 0.206765778362751, "step": 7606 }, { "epoch": 0.4755, "grad_norm": 2.046875, "grad_norm_var": 0.008538564046223959, "learning_rate": 0.0001, "loss": 7.0222, "loss/crossentropy": 2.3240236043930054, "loss/hidden": 2.8046875, "loss/jsd": 0.0, "loss/logits": 0.19366633147001266, "step": 7608 }, { "epoch": 0.475625, "grad_norm": 2.171875, "grad_norm_var": 0.007956695556640626, "learning_rate": 0.0001, "loss": 7.0812, "loss/crossentropy": 1.9922118186950684, "loss/hidden": 2.7578125, "loss/jsd": 0.0, "loss/logits": 0.20807665586471558, "step": 7610 }, { "epoch": 0.47575, "grad_norm": 2.203125, "grad_norm_var": 0.010909016927083333, "learning_rate": 0.0001, "loss": 7.0856, "loss/crossentropy": 1.825062870979309, "loss/hidden": 2.7421875, "loss/jsd": 0.0, "loss/logits": 0.1701940894126892, "step": 7612 }, { "epoch": 0.475875, "grad_norm": 2.09375, "grad_norm_var": 0.0108642578125, "learning_rate": 0.0001, "loss": 7.2186, "loss/crossentropy": 2.29690420627594, "loss/hidden": 2.7734375, "loss/jsd": 0.0, "loss/logits": 0.20619598031044006, "step": 7614 }, { "epoch": 0.476, "grad_norm": 2.109375, "grad_norm_var": 0.010941569010416667, "learning_rate": 0.0001, "loss": 7.2496, "loss/crossentropy": 2.193692684173584, "loss/hidden": 2.8203125, "loss/jsd": 0.0, "loss/logits": 0.20736318081617355, "step": 7616 }, { "epoch": 0.476125, "grad_norm": 2.078125, "grad_norm_var": 0.010741170247395833, "learning_rate": 0.0001, "loss": 7.0215, "loss/crossentropy": 2.1778957843780518, "loss/hidden": 2.78125, "loss/jsd": 0.0, "loss/logits": 0.20710615813732147, "step": 7618 }, { "epoch": 0.47625, "grad_norm": 1.9375, "grad_norm_var": 0.010237375895182291, "learning_rate": 0.0001, "loss": 7.0398, "loss/crossentropy": 2.1610811948776245, "loss/hidden": 2.796875, "loss/jsd": 0.0, "loss/logits": 0.19665740430355072, "step": 7620 }, { "epoch": 0.476375, "grad_norm": 2.09375, "grad_norm_var": 0.008137766520182292, "learning_rate": 0.0001, "loss": 7.1509, "loss/crossentropy": 2.0157353281974792, "loss/hidden": 2.7265625, "loss/jsd": 0.0, "loss/logits": 0.19302255660295486, "step": 7622 }, { "epoch": 0.4765, "grad_norm": 2.0625, "grad_norm_var": 0.007816314697265625, "learning_rate": 0.0001, "loss": 7.122, "loss/crossentropy": 2.2813684940338135, "loss/hidden": 2.7421875, "loss/jsd": 0.0, "loss/logits": 0.20449642091989517, "step": 7624 }, { "epoch": 0.476625, "grad_norm": 2.078125, "grad_norm_var": 0.007500966389973958, "learning_rate": 0.0001, "loss": 6.9673, "loss/crossentropy": 2.1194839477539062, "loss/hidden": 2.8203125, "loss/jsd": 0.0, "loss/logits": 0.19238164275884628, "step": 7626 }, { "epoch": 0.47675, "grad_norm": 2.640625, "grad_norm_var": 0.02554931640625, "learning_rate": 0.0001, "loss": 7.1515, "loss/crossentropy": 2.177587628364563, "loss/hidden": 2.7109375, "loss/jsd": 0.0, "loss/logits": 0.207644984126091, "step": 7628 }, { "epoch": 0.476875, "grad_norm": 1.9453125, "grad_norm_var": 0.02942479451497396, "learning_rate": 0.0001, "loss": 7.0191, "loss/crossentropy": 2.3335689306259155, "loss/hidden": 2.8359375, "loss/jsd": 0.0, "loss/logits": 0.21252815425395966, "step": 7630 }, { "epoch": 0.477, "grad_norm": 2.09375, "grad_norm_var": 0.02950007120768229, "learning_rate": 0.0001, "loss": 7.1822, "loss/crossentropy": 2.325187921524048, "loss/hidden": 2.78125, "loss/jsd": 0.0, "loss/logits": 0.21348048746585846, "step": 7632 }, { "epoch": 0.477125, "grad_norm": 2.0625, "grad_norm_var": 0.029325103759765624, "learning_rate": 0.0001, "loss": 7.1423, "loss/crossentropy": 2.287824869155884, "loss/hidden": 2.8203125, "loss/jsd": 0.0, "loss/logits": 0.20349889993667603, "step": 7634 }, { "epoch": 0.47725, "grad_norm": 2.0, "grad_norm_var": 0.028574371337890626, "learning_rate": 0.0001, "loss": 7.0346, "loss/crossentropy": 2.2881882190704346, "loss/hidden": 2.7421875, "loss/jsd": 0.0, "loss/logits": 0.1959080845117569, "step": 7636 }, { "epoch": 0.477375, "grad_norm": 2.171875, "grad_norm_var": 0.029842122395833334, "learning_rate": 0.0001, "loss": 7.1162, "loss/crossentropy": 2.3250468969345093, "loss/hidden": 2.765625, "loss/jsd": 0.0, "loss/logits": 0.1951511949300766, "step": 7638 }, { "epoch": 0.4775, "grad_norm": 2.234375, "grad_norm_var": 0.030036417643229167, "learning_rate": 0.0001, "loss": 7.3131, "loss/crossentropy": 2.1652873754501343, "loss/hidden": 2.8125, "loss/jsd": 0.0, "loss/logits": 0.20676423609256744, "step": 7640 }, { "epoch": 0.477625, "grad_norm": 2.109375, "grad_norm_var": 0.0290924072265625, "learning_rate": 0.0001, "loss": 7.1448, "loss/crossentropy": 2.1286062598228455, "loss/hidden": 2.7734375, "loss/jsd": 0.0, "loss/logits": 0.20646179467439651, "step": 7642 }, { "epoch": 0.47775, "grad_norm": 2.109375, "grad_norm_var": 0.0110107421875, "learning_rate": 0.0001, "loss": 7.0397, "loss/crossentropy": 2.2193092107772827, "loss/hidden": 2.828125, "loss/jsd": 0.0, "loss/logits": 0.20696381479501724, "step": 7644 }, { "epoch": 0.477875, "grad_norm": 2.125, "grad_norm_var": 0.006876373291015625, "learning_rate": 0.0001, "loss": 7.2361, "loss/crossentropy": 2.1308083534240723, "loss/hidden": 2.8046875, "loss/jsd": 0.0, "loss/logits": 0.1990327164530754, "step": 7646 }, { "epoch": 0.478, "grad_norm": 2.03125, "grad_norm_var": 0.006748199462890625, "learning_rate": 0.0001, "loss": 7.2022, "loss/crossentropy": 2.4250513315200806, "loss/hidden": 2.8671875, "loss/jsd": 0.0, "loss/logits": 0.22592860460281372, "step": 7648 }, { "epoch": 0.478125, "grad_norm": 2.140625, "grad_norm_var": 0.006650543212890625, "learning_rate": 0.0001, "loss": 7.1834, "loss/crossentropy": 2.297890782356262, "loss/hidden": 2.8359375, "loss/jsd": 0.0, "loss/logits": 0.2043851688504219, "step": 7650 }, { "epoch": 0.47825, "grad_norm": 2.375, "grad_norm_var": 0.011191558837890626, "learning_rate": 0.0001, "loss": 6.9906, "loss/crossentropy": 2.03427255153656, "loss/hidden": 2.7734375, "loss/jsd": 0.0, "loss/logits": 0.18367985635995865, "step": 7652 }, { "epoch": 0.478375, "grad_norm": 1.8671875, "grad_norm_var": 0.015474192301432292, "learning_rate": 0.0001, "loss": 7.0992, "loss/crossentropy": 2.3343425989151, "loss/hidden": 2.765625, "loss/jsd": 0.0, "loss/logits": 0.210361510515213, "step": 7654 }, { "epoch": 0.4785, "grad_norm": 2.109375, "grad_norm_var": 0.015152740478515624, "learning_rate": 0.0001, "loss": 6.9502, "loss/crossentropy": 2.5432735681533813, "loss/hidden": 2.8515625, "loss/jsd": 0.0, "loss/logits": 0.2085108458995819, "step": 7656 }, { "epoch": 0.478625, "grad_norm": 2.046875, "grad_norm_var": 0.014745839436848958, "learning_rate": 0.0001, "loss": 7.1265, "loss/crossentropy": 2.376760482788086, "loss/hidden": 2.828125, "loss/jsd": 0.0, "loss/logits": 0.21046151965856552, "step": 7658 }, { "epoch": 0.47875, "grad_norm": 2.109375, "grad_norm_var": 0.014564768473307291, "learning_rate": 0.0001, "loss": 7.091, "loss/crossentropy": 2.1871683597564697, "loss/hidden": 2.7890625, "loss/jsd": 0.0, "loss/logits": 0.20009056478738785, "step": 7660 }, { "epoch": 0.478875, "grad_norm": 2.15625, "grad_norm_var": 0.015600331624348958, "learning_rate": 0.0001, "loss": 7.0751, "loss/crossentropy": 2.1381759643554688, "loss/hidden": 2.828125, "loss/jsd": 0.0, "loss/logits": 0.20151401311159134, "step": 7662 }, { "epoch": 0.479, "grad_norm": 2.078125, "grad_norm_var": 0.015228017171223959, "learning_rate": 0.0001, "loss": 7.1749, "loss/crossentropy": 2.204997420310974, "loss/hidden": 2.78125, "loss/jsd": 0.0, "loss/logits": 0.20210929214954376, "step": 7664 }, { "epoch": 0.479125, "grad_norm": 2.234375, "grad_norm_var": 0.021683502197265624, "learning_rate": 0.0001, "loss": 7.1606, "loss/crossentropy": 2.5145864486694336, "loss/hidden": 2.84375, "loss/jsd": 0.0, "loss/logits": 0.24845977127552032, "step": 7666 }, { "epoch": 0.47925, "grad_norm": 2.125, "grad_norm_var": 0.014534250895182291, "learning_rate": 0.0001, "loss": 7.0797, "loss/crossentropy": 2.1923307180404663, "loss/hidden": 2.765625, "loss/jsd": 0.0, "loss/logits": 0.19743437319993973, "step": 7668 }, { "epoch": 0.479375, "grad_norm": 2.0625, "grad_norm_var": 0.010326131184895834, "learning_rate": 0.0001, "loss": 7.1655, "loss/crossentropy": 2.2290977239608765, "loss/hidden": 2.828125, "loss/jsd": 0.0, "loss/logits": 0.18826957046985626, "step": 7670 }, { "epoch": 0.4795, "grad_norm": 1.9765625, "grad_norm_var": 0.0140777587890625, "learning_rate": 0.0001, "loss": 7.0097, "loss/crossentropy": 2.1928118467330933, "loss/hidden": 2.7421875, "loss/jsd": 0.0, "loss/logits": 0.20673301815986633, "step": 7672 }, { "epoch": 0.479625, "grad_norm": 2.1875, "grad_norm_var": 0.013825480143229167, "learning_rate": 0.0001, "loss": 7.1607, "loss/crossentropy": 2.399188995361328, "loss/hidden": 2.8203125, "loss/jsd": 0.0, "loss/logits": 0.21841491758823395, "step": 7674 }, { "epoch": 0.47975, "grad_norm": 1.9765625, "grad_norm_var": 0.01598078409830729, "learning_rate": 0.0001, "loss": 7.1132, "loss/crossentropy": 2.001654326915741, "loss/hidden": 2.765625, "loss/jsd": 0.0, "loss/logits": 0.17933339625597, "step": 7676 }, { "epoch": 0.479875, "grad_norm": 2.015625, "grad_norm_var": 0.01590550740559896, "learning_rate": 0.0001, "loss": 7.0738, "loss/crossentropy": 2.2801817655563354, "loss/hidden": 2.765625, "loss/jsd": 0.0, "loss/logits": 0.19666346907615662, "step": 7678 }, { "epoch": 0.48, "grad_norm": 2.109375, "grad_norm_var": 0.015582021077473958, "learning_rate": 0.0001, "loss": 7.1026, "loss/crossentropy": 2.081633687019348, "loss/hidden": 2.7578125, "loss/jsd": 0.0, "loss/logits": 0.19271892309188843, "step": 7680 }, { "epoch": 0.480125, "grad_norm": 2.203125, "grad_norm_var": 0.008813222249348959, "learning_rate": 0.0001, "loss": 7.211, "loss/crossentropy": 2.392168879508972, "loss/hidden": 2.765625, "loss/jsd": 0.0, "loss/logits": 0.18299797177314758, "step": 7682 }, { "epoch": 0.48025, "grad_norm": 2.140625, "grad_norm_var": 0.010147857666015624, "learning_rate": 0.0001, "loss": 7.3303, "loss/crossentropy": 2.3119935989379883, "loss/hidden": 2.921875, "loss/jsd": 0.0, "loss/logits": 0.23662632703781128, "step": 7684 }, { "epoch": 0.480375, "grad_norm": 1.9921875, "grad_norm_var": 0.009919230143229167, "learning_rate": 0.0001, "loss": 7.0347, "loss/crossentropy": 2.2318339347839355, "loss/hidden": 2.765625, "loss/jsd": 0.0, "loss/logits": 0.21657077223062515, "step": 7686 }, { "epoch": 0.4805, "grad_norm": 2.0625, "grad_norm_var": 0.007819620768229167, "learning_rate": 0.0001, "loss": 6.9663, "loss/crossentropy": 2.1833382844924927, "loss/hidden": 2.765625, "loss/jsd": 0.0, "loss/logits": 0.21761953830718994, "step": 7688 }, { "epoch": 0.480625, "grad_norm": 2.109375, "grad_norm_var": 0.006322224934895833, "learning_rate": 0.0001, "loss": 7.0488, "loss/crossentropy": 2.134721040725708, "loss/hidden": 2.828125, "loss/jsd": 0.0, "loss/logits": 0.1840914860367775, "step": 7690 }, { "epoch": 0.48075, "grad_norm": 2.140625, "grad_norm_var": 0.006231435139973958, "learning_rate": 0.0001, "loss": 7.0641, "loss/crossentropy": 2.2247310876846313, "loss/hidden": 2.8359375, "loss/jsd": 0.0, "loss/logits": 0.22055795043706894, "step": 7692 }, { "epoch": 0.480875, "grad_norm": 1.9609375, "grad_norm_var": 0.007892862955729166, "learning_rate": 0.0001, "loss": 7.0667, "loss/crossentropy": 2.27902615070343, "loss/hidden": 2.8125, "loss/jsd": 0.0, "loss/logits": 0.21320205926895142, "step": 7694 }, { "epoch": 0.481, "grad_norm": 2.0625, "grad_norm_var": 0.007950846354166667, "learning_rate": 0.0001, "loss": 7.0584, "loss/crossentropy": 2.0421899557113647, "loss/hidden": 2.765625, "loss/jsd": 0.0, "loss/logits": 0.1877262145280838, "step": 7696 }, { "epoch": 0.481125, "grad_norm": 2.15625, "grad_norm_var": 0.0074615478515625, "learning_rate": 0.0001, "loss": 7.0576, "loss/crossentropy": 2.2370803356170654, "loss/hidden": 2.796875, "loss/jsd": 0.0, "loss/logits": 0.20086784660816193, "step": 7698 }, { "epoch": 0.48125, "grad_norm": 2.1875, "grad_norm_var": 0.0062652587890625, "learning_rate": 0.0001, "loss": 7.141, "loss/crossentropy": 2.2639442682266235, "loss/hidden": 2.828125, "loss/jsd": 0.0, "loss/logits": 0.20139917731285095, "step": 7700 }, { "epoch": 0.481375, "grad_norm": 2.03125, "grad_norm_var": 0.005944569905598958, "learning_rate": 0.0001, "loss": 7.0808, "loss/crossentropy": 2.308246374130249, "loss/hidden": 2.765625, "loss/jsd": 0.0, "loss/logits": 0.2012246772646904, "step": 7702 }, { "epoch": 0.4815, "grad_norm": 2.25, "grad_norm_var": 0.007452138264973958, "learning_rate": 0.0001, "loss": 7.1243, "loss/crossentropy": 2.406078815460205, "loss/hidden": 2.84375, "loss/jsd": 0.0, "loss/logits": 0.20916583389043808, "step": 7704 }, { "epoch": 0.481625, "grad_norm": 2.40625, "grad_norm_var": 0.013730621337890625, "learning_rate": 0.0001, "loss": 7.0034, "loss/crossentropy": 2.3612505197525024, "loss/hidden": 2.78125, "loss/jsd": 0.0, "loss/logits": 0.2026866227388382, "step": 7706 }, { "epoch": 0.48175, "grad_norm": 2.015625, "grad_norm_var": 0.014928944905598958, "learning_rate": 0.0001, "loss": 7.0391, "loss/crossentropy": 2.1469807028770447, "loss/hidden": 2.796875, "loss/jsd": 0.0, "loss/logits": 0.1992548406124115, "step": 7708 }, { "epoch": 0.481875, "grad_norm": 2.078125, "grad_norm_var": 0.012711588541666667, "learning_rate": 0.0001, "loss": 7.1218, "loss/crossentropy": 2.0821534991264343, "loss/hidden": 2.796875, "loss/jsd": 0.0, "loss/logits": 0.2098289653658867, "step": 7710 }, { "epoch": 0.482, "grad_norm": 2.28125, "grad_norm_var": 0.015070597330729166, "learning_rate": 0.0001, "loss": 7.1686, "loss/crossentropy": 2.180022358894348, "loss/hidden": 2.7734375, "loss/jsd": 0.0, "loss/logits": 0.21401400864124298, "step": 7712 }, { "epoch": 0.482125, "grad_norm": 2.078125, "grad_norm_var": 0.014644368489583334, "learning_rate": 0.0001, "loss": 7.1485, "loss/crossentropy": 2.3946781158447266, "loss/hidden": 2.828125, "loss/jsd": 0.0, "loss/logits": 0.22900524735450745, "step": 7714 }, { "epoch": 0.48225, "grad_norm": 2.140625, "grad_norm_var": 0.015360514322916666, "learning_rate": 0.0001, "loss": 7.1556, "loss/crossentropy": 2.135510504245758, "loss/hidden": 2.78125, "loss/jsd": 0.0, "loss/logits": 0.21871786564588547, "step": 7716 }, { "epoch": 0.482375, "grad_norm": 2.140625, "grad_norm_var": 0.014777628580729167, "learning_rate": 0.0001, "loss": 7.2415, "loss/crossentropy": 2.286018133163452, "loss/hidden": 2.84375, "loss/jsd": 0.0, "loss/logits": 0.21493077278137207, "step": 7718 }, { "epoch": 0.4825, "grad_norm": 2.171875, "grad_norm_var": 0.017748006184895835, "learning_rate": 0.0001, "loss": 7.0218, "loss/crossentropy": 2.0190805792808533, "loss/hidden": 2.8125, "loss/jsd": 0.0, "loss/logits": 0.20633917301893234, "step": 7720 }, { "epoch": 0.482625, "grad_norm": 2.03125, "grad_norm_var": 0.0145660400390625, "learning_rate": 0.0001, "loss": 7.1712, "loss/crossentropy": 2.0265942811965942, "loss/hidden": 2.8203125, "loss/jsd": 0.0, "loss/logits": 0.20769783109426498, "step": 7722 }, { "epoch": 0.48275, "grad_norm": 2.0625, "grad_norm_var": 0.011454264322916666, "learning_rate": 0.0001, "loss": 6.9481, "loss/crossentropy": 2.119886040687561, "loss/hidden": 2.796875, "loss/jsd": 0.0, "loss/logits": 0.19638077169656754, "step": 7724 }, { "epoch": 0.482875, "grad_norm": 2.125, "grad_norm_var": 0.009696451822916667, "learning_rate": 0.0001, "loss": 7.1273, "loss/crossentropy": 2.1580886840820312, "loss/hidden": 2.8046875, "loss/jsd": 0.0, "loss/logits": 0.1923273205757141, "step": 7726 }, { "epoch": 0.483, "grad_norm": 2.0625, "grad_norm_var": 0.009137980143229167, "learning_rate": 0.0001, "loss": 7.0008, "loss/crossentropy": 2.1663527488708496, "loss/hidden": 2.765625, "loss/jsd": 0.0, "loss/logits": 0.20339468121528625, "step": 7728 }, { "epoch": 0.483125, "grad_norm": 2.125, "grad_norm_var": 0.008576456705729167, "learning_rate": 0.0001, "loss": 7.2413, "loss/crossentropy": 2.182821273803711, "loss/hidden": 2.7890625, "loss/jsd": 0.0, "loss/logits": 0.21904577314853668, "step": 7730 }, { "epoch": 0.48325, "grad_norm": 2.078125, "grad_norm_var": 0.008919270833333333, "learning_rate": 0.0001, "loss": 7.1494, "loss/crossentropy": 2.373465061187744, "loss/hidden": 2.859375, "loss/jsd": 0.0, "loss/logits": 0.2193896695971489, "step": 7732 }, { "epoch": 0.483375, "grad_norm": 1.9765625, "grad_norm_var": 0.0124176025390625, "learning_rate": 0.0001, "loss": 6.9891, "loss/crossentropy": 2.6929373741149902, "loss/hidden": 2.84375, "loss/jsd": 0.0, "loss/logits": 0.21133850514888763, "step": 7734 }, { "epoch": 0.4835, "grad_norm": 2.09375, "grad_norm_var": 0.0075185139973958336, "learning_rate": 0.0001, "loss": 7.241, "loss/crossentropy": 2.153095006942749, "loss/hidden": 2.859375, "loss/jsd": 0.0, "loss/logits": 0.2049623802304268, "step": 7736 }, { "epoch": 0.483625, "grad_norm": 2.328125, "grad_norm_var": 0.010358683268229167, "learning_rate": 0.0001, "loss": 7.2583, "loss/crossentropy": 2.349102020263672, "loss/hidden": 2.8515625, "loss/jsd": 0.0, "loss/logits": 0.20283705741167068, "step": 7738 }, { "epoch": 0.48375, "grad_norm": 2.0, "grad_norm_var": 0.0125152587890625, "learning_rate": 0.0001, "loss": 7.0513, "loss/crossentropy": 2.5229681730270386, "loss/hidden": 2.8828125, "loss/jsd": 0.0, "loss/logits": 0.22544901072978973, "step": 7740 }, { "epoch": 0.483875, "grad_norm": 2.078125, "grad_norm_var": 0.0131744384765625, "learning_rate": 0.0001, "loss": 7.2253, "loss/crossentropy": 2.4761338233947754, "loss/hidden": 2.796875, "loss/jsd": 0.0, "loss/logits": 0.20555153489112854, "step": 7742 }, { "epoch": 0.484, "grad_norm": 2.171875, "grad_norm_var": 0.016153971354166668, "learning_rate": 0.0001, "loss": 7.1065, "loss/crossentropy": 2.6251370906829834, "loss/hidden": 2.84375, "loss/jsd": 0.0, "loss/logits": 0.2124454379081726, "step": 7744 }, { "epoch": 0.484125, "grad_norm": 1.8828125, "grad_norm_var": 0.02028376261393229, "learning_rate": 0.0001, "loss": 6.936, "loss/crossentropy": 2.269635319709778, "loss/hidden": 2.859375, "loss/jsd": 0.0, "loss/logits": 0.21971495449543, "step": 7746 }, { "epoch": 0.48425, "grad_norm": 2.171875, "grad_norm_var": 0.02039159138997396, "learning_rate": 0.0001, "loss": 7.0399, "loss/crossentropy": 1.855255365371704, "loss/hidden": 2.765625, "loss/jsd": 0.0, "loss/logits": 0.17584873735904694, "step": 7748 }, { "epoch": 0.484375, "grad_norm": 2.265625, "grad_norm_var": 0.017459869384765625, "learning_rate": 0.0001, "loss": 7.079, "loss/crossentropy": 2.198126792907715, "loss/hidden": 2.8125, "loss/jsd": 0.0, "loss/logits": 0.20961421728134155, "step": 7750 }, { "epoch": 0.4845, "grad_norm": 2.125, "grad_norm_var": 0.016137440999348957, "learning_rate": 0.0001, "loss": 7.0269, "loss/crossentropy": 1.967193365097046, "loss/hidden": 2.8984375, "loss/jsd": 0.0, "loss/logits": 0.19744029641151428, "step": 7752 }, { "epoch": 0.484625, "grad_norm": 2.03125, "grad_norm_var": 0.014597320556640625, "learning_rate": 0.0001, "loss": 7.0634, "loss/crossentropy": 2.325754761695862, "loss/hidden": 2.75, "loss/jsd": 0.0, "loss/logits": 0.20395881682634354, "step": 7754 }, { "epoch": 0.48475, "grad_norm": 1.9609375, "grad_norm_var": 0.014699045817057292, "learning_rate": 0.0001, "loss": 7.0538, "loss/crossentropy": 2.288987159729004, "loss/hidden": 2.9296875, "loss/jsd": 0.0, "loss/logits": 0.2305854633450508, "step": 7756 }, { "epoch": 0.484875, "grad_norm": 2.234375, "grad_norm_var": 0.015207672119140625, "learning_rate": 0.0001, "loss": 7.3086, "loss/crossentropy": 2.116866946220398, "loss/hidden": 2.8125, "loss/jsd": 0.0, "loss/logits": 0.21432197093963623, "step": 7758 }, { "epoch": 0.485, "grad_norm": 2.203125, "grad_norm_var": 0.011791737874348958, "learning_rate": 0.0001, "loss": 7.2918, "loss/crossentropy": 2.250456213951111, "loss/hidden": 2.8046875, "loss/jsd": 0.0, "loss/logits": 0.22191955149173737, "step": 7760 }, { "epoch": 0.485125, "grad_norm": 2.140625, "grad_norm_var": 0.011043294270833334, "learning_rate": 0.0001, "loss": 7.1073, "loss/crossentropy": 2.2381151914596558, "loss/hidden": 2.8203125, "loss/jsd": 0.0, "loss/logits": 0.20548705756664276, "step": 7762 }, { "epoch": 0.48525, "grad_norm": 2.203125, "grad_norm_var": 0.011278279622395833, "learning_rate": 0.0001, "loss": 7.0467, "loss/crossentropy": 2.271865129470825, "loss/hidden": 2.8125, "loss/jsd": 0.0, "loss/logits": 0.20705430209636688, "step": 7764 }, { "epoch": 0.485375, "grad_norm": 2.28125, "grad_norm_var": 0.0121978759765625, "learning_rate": 0.0001, "loss": 7.1154, "loss/crossentropy": 2.157663583755493, "loss/hidden": 2.7578125, "loss/jsd": 0.0, "loss/logits": 0.19460859149694443, "step": 7766 }, { "epoch": 0.4855, "grad_norm": 2.171875, "grad_norm_var": 0.0123779296875, "learning_rate": 0.0001, "loss": 7.1723, "loss/crossentropy": 2.1781057119369507, "loss/hidden": 2.8515625, "loss/jsd": 0.0, "loss/logits": 0.221711665391922, "step": 7768 }, { "epoch": 0.485625, "grad_norm": 2.078125, "grad_norm_var": 0.0119049072265625, "learning_rate": 0.0001, "loss": 7.0052, "loss/crossentropy": 2.017356812953949, "loss/hidden": 2.7578125, "loss/jsd": 0.0, "loss/logits": 0.1983555629849434, "step": 7770 }, { "epoch": 0.48575, "grad_norm": 2.140625, "grad_norm_var": 0.011551920572916667, "learning_rate": 0.0001, "loss": 7.0771, "loss/crossentropy": 2.2357795238494873, "loss/hidden": 2.765625, "loss/jsd": 0.0, "loss/logits": 0.20853830128908157, "step": 7772 }, { "epoch": 0.485875, "grad_norm": 2.34375, "grad_norm_var": 0.016551717122395834, "learning_rate": 0.0001, "loss": 7.1366, "loss/crossentropy": 2.2986165285110474, "loss/hidden": 2.8203125, "loss/jsd": 0.0, "loss/logits": 0.22572917491197586, "step": 7774 }, { "epoch": 0.486, "grad_norm": 2.21875, "grad_norm_var": 0.01666259765625, "learning_rate": 0.0001, "loss": 7.2861, "loss/crossentropy": 2.2475717067718506, "loss/hidden": 2.78125, "loss/jsd": 0.0, "loss/logits": 0.21900728344917297, "step": 7776 }, { "epoch": 0.486125, "grad_norm": 2.0, "grad_norm_var": 0.015201822916666666, "learning_rate": 0.0001, "loss": 7.047, "loss/crossentropy": 2.310316801071167, "loss/hidden": 2.859375, "loss/jsd": 0.0, "loss/logits": 0.20573000609874725, "step": 7778 }, { "epoch": 0.48625, "grad_norm": 2.21875, "grad_norm_var": 0.0154449462890625, "learning_rate": 0.0001, "loss": 7.0287, "loss/crossentropy": 1.9829546213150024, "loss/hidden": 2.7734375, "loss/jsd": 0.0, "loss/logits": 0.2091880813241005, "step": 7780 }, { "epoch": 0.486375, "grad_norm": 1.953125, "grad_norm_var": 0.017392730712890624, "learning_rate": 0.0001, "loss": 6.9216, "loss/crossentropy": 2.3593095541000366, "loss/hidden": 2.734375, "loss/jsd": 0.0, "loss/logits": 0.19612394273281097, "step": 7782 }, { "epoch": 0.4865, "grad_norm": 2.25, "grad_norm_var": 0.020623524983723957, "learning_rate": 0.0001, "loss": 7.1751, "loss/crossentropy": 2.493327498435974, "loss/hidden": 2.828125, "loss/jsd": 0.0, "loss/logits": 0.23639556765556335, "step": 7784 }, { "epoch": 0.486625, "grad_norm": 2.15625, "grad_norm_var": 0.024253082275390626, "learning_rate": 0.0001, "loss": 7.0933, "loss/crossentropy": 2.37237012386322, "loss/hidden": 2.8203125, "loss/jsd": 0.0, "loss/logits": 0.2250986099243164, "step": 7786 }, { "epoch": 0.48675, "grad_norm": 2.03125, "grad_norm_var": 0.022658030192057293, "learning_rate": 0.0001, "loss": 7.1574, "loss/crossentropy": 2.3521467447280884, "loss/hidden": 2.8125, "loss/jsd": 0.0, "loss/logits": 0.21240167319774628, "step": 7788 }, { "epoch": 0.486875, "grad_norm": 2.234375, "grad_norm_var": 0.016200510660807292, "learning_rate": 0.0001, "loss": 7.0604, "loss/crossentropy": 2.2158159017562866, "loss/hidden": 2.734375, "loss/jsd": 0.0, "loss/logits": 0.19502168148756027, "step": 7790 }, { "epoch": 0.487, "grad_norm": 2.046875, "grad_norm_var": 0.015488433837890624, "learning_rate": 0.0001, "loss": 7.2302, "loss/crossentropy": 2.2819695472717285, "loss/hidden": 2.828125, "loss/jsd": 0.0, "loss/logits": 0.2108946293592453, "step": 7792 }, { "epoch": 0.487125, "grad_norm": 2.015625, "grad_norm_var": 0.015295155843098958, "learning_rate": 0.0001, "loss": 7.0113, "loss/crossentropy": 2.2795369625091553, "loss/hidden": 2.75, "loss/jsd": 0.0, "loss/logits": 0.20005237311124802, "step": 7794 }, { "epoch": 0.48725, "grad_norm": 2.203125, "grad_norm_var": 0.015087636311848958, "learning_rate": 0.0001, "loss": 7.1739, "loss/crossentropy": 2.5151994228363037, "loss/hidden": 2.7890625, "loss/jsd": 0.0, "loss/logits": 0.20011039823293686, "step": 7796 }, { "epoch": 0.487375, "grad_norm": 2.171875, "grad_norm_var": 0.013590494791666666, "learning_rate": 0.0001, "loss": 7.1157, "loss/crossentropy": 2.0404099225997925, "loss/hidden": 2.7890625, "loss/jsd": 0.0, "loss/logits": 0.21494626998901367, "step": 7798 }, { "epoch": 0.4875, "grad_norm": 2.046875, "grad_norm_var": 0.01080322265625, "learning_rate": 0.0001, "loss": 7.0035, "loss/crossentropy": 2.205460011959076, "loss/hidden": 2.8359375, "loss/jsd": 0.0, "loss/logits": 0.20000936090946198, "step": 7800 }, { "epoch": 0.487625, "grad_norm": 2.046875, "grad_norm_var": 0.008210245768229167, "learning_rate": 0.0001, "loss": 7.0885, "loss/crossentropy": 2.253046751022339, "loss/hidden": 2.8515625, "loss/jsd": 0.0, "loss/logits": 0.19893667101860046, "step": 7802 }, { "epoch": 0.48775, "grad_norm": 2.3125, "grad_norm_var": 0.010838826497395834, "learning_rate": 0.0001, "loss": 7.1092, "loss/crossentropy": 2.250741481781006, "loss/hidden": 2.7734375, "loss/jsd": 0.0, "loss/logits": 0.19479040056467056, "step": 7804 }, { "epoch": 0.487875, "grad_norm": 2.0625, "grad_norm_var": 0.010530598958333333, "learning_rate": 0.0001, "loss": 7.0907, "loss/crossentropy": 2.1254579424858093, "loss/hidden": 2.875, "loss/jsd": 0.0, "loss/logits": 0.2081373631954193, "step": 7806 }, { "epoch": 0.488, "grad_norm": 2.0625, "grad_norm_var": 0.01021728515625, "learning_rate": 0.0001, "loss": 6.9847, "loss/crossentropy": 2.2556028366088867, "loss/hidden": 2.75, "loss/jsd": 0.0, "loss/logits": 0.1956285983324051, "step": 7808 }, { "epoch": 0.488125, "grad_norm": 2.265625, "grad_norm_var": 0.010749308268229167, "learning_rate": 0.0001, "loss": 6.8772, "loss/crossentropy": 2.0442580580711365, "loss/hidden": 2.765625, "loss/jsd": 0.0, "loss/logits": 0.1880321502685547, "step": 7810 }, { "epoch": 0.48825, "grad_norm": 1.9921875, "grad_norm_var": 0.011329905192057291, "learning_rate": 0.0001, "loss": 6.9359, "loss/crossentropy": 1.964568853378296, "loss/hidden": 2.8046875, "loss/jsd": 0.0, "loss/logits": 0.21166903525590897, "step": 7812 }, { "epoch": 0.488375, "grad_norm": 2.234375, "grad_norm_var": 0.010786692301432291, "learning_rate": 0.0001, "loss": 7.1827, "loss/crossentropy": 2.41512668132782, "loss/hidden": 2.75, "loss/jsd": 0.0, "loss/logits": 0.20831845700740814, "step": 7814 }, { "epoch": 0.4885, "grad_norm": 1.9765625, "grad_norm_var": 0.01175537109375, "learning_rate": 0.0001, "loss": 7.1273, "loss/crossentropy": 2.370983600616455, "loss/hidden": 2.7421875, "loss/jsd": 0.0, "loss/logits": 0.20832911878824234, "step": 7816 }, { "epoch": 0.488625, "grad_norm": 2.140625, "grad_norm_var": 0.0101806640625, "learning_rate": 0.0001, "loss": 7.0815, "loss/crossentropy": 2.3157626390457153, "loss/hidden": 2.7578125, "loss/jsd": 0.0, "loss/logits": 0.1933603137731552, "step": 7818 }, { "epoch": 0.48875, "grad_norm": 1.9765625, "grad_norm_var": 0.008310699462890625, "learning_rate": 0.0001, "loss": 6.9595, "loss/crossentropy": 2.01755154132843, "loss/hidden": 2.7109375, "loss/jsd": 0.0, "loss/logits": 0.18479111790657043, "step": 7820 }, { "epoch": 0.488875, "grad_norm": 2.140625, "grad_norm_var": 0.008038075764973958, "learning_rate": 0.0001, "loss": 7.0953, "loss/crossentropy": 2.116927742958069, "loss/hidden": 2.8125, "loss/jsd": 0.0, "loss/logits": 0.1958414539694786, "step": 7822 }, { "epoch": 0.489, "grad_norm": 2.109375, "grad_norm_var": 0.007977040608723958, "learning_rate": 0.0001, "loss": 7.1639, "loss/crossentropy": 2.151541829109192, "loss/hidden": 2.71875, "loss/jsd": 0.0, "loss/logits": 0.1838311031460762, "step": 7824 }, { "epoch": 0.489125, "grad_norm": 2.125, "grad_norm_var": 0.0060727437337239586, "learning_rate": 0.0001, "loss": 7.053, "loss/crossentropy": 2.4051939249038696, "loss/hidden": 2.765625, "loss/jsd": 0.0, "loss/logits": 0.20532245934009552, "step": 7826 }, { "epoch": 0.48925, "grad_norm": 2.390625, "grad_norm_var": 0.013841756184895833, "learning_rate": 0.0001, "loss": 7.1092, "loss/crossentropy": 2.3758574724197388, "loss/hidden": 2.8515625, "loss/jsd": 0.0, "loss/logits": 0.2034212425351143, "step": 7828 }, { "epoch": 0.489375, "grad_norm": 2.015625, "grad_norm_var": 0.014408365885416666, "learning_rate": 0.0001, "loss": 7.1397, "loss/crossentropy": 2.2068817019462585, "loss/hidden": 2.84375, "loss/jsd": 0.0, "loss/logits": 0.20511388778686523, "step": 7830 }, { "epoch": 0.4895, "grad_norm": 2.171875, "grad_norm_var": 0.013342030843098958, "learning_rate": 0.0001, "loss": 7.1086, "loss/crossentropy": 2.3683449029922485, "loss/hidden": 2.8046875, "loss/jsd": 0.0, "loss/logits": 0.22085585445165634, "step": 7832 }, { "epoch": 0.489625, "grad_norm": 2.015625, "grad_norm_var": 0.016015370686848957, "learning_rate": 0.0001, "loss": 7.0163, "loss/crossentropy": 2.137399196624756, "loss/hidden": 2.8046875, "loss/jsd": 0.0, "loss/logits": 0.18337388336658478, "step": 7834 }, { "epoch": 0.48975, "grad_norm": 2.125, "grad_norm_var": 0.014411417643229167, "learning_rate": 0.0001, "loss": 7.1261, "loss/crossentropy": 2.2942042350769043, "loss/hidden": 2.7890625, "loss/jsd": 0.0, "loss/logits": 0.2159949615597725, "step": 7836 }, { "epoch": 0.489875, "grad_norm": 2.171875, "grad_norm_var": 0.014240519205729166, "learning_rate": 0.0001, "loss": 7.1013, "loss/crossentropy": 2.302956461906433, "loss/hidden": 2.765625, "loss/jsd": 0.0, "loss/logits": 0.22014276683330536, "step": 7838 }, { "epoch": 0.49, "grad_norm": 2.0625, "grad_norm_var": 0.015511067708333333, "learning_rate": 0.0001, "loss": 7.0424, "loss/crossentropy": 2.1671026945114136, "loss/hidden": 2.7265625, "loss/jsd": 0.0, "loss/logits": 0.18992354720830917, "step": 7840 }, { "epoch": 0.490125, "grad_norm": 1.9453125, "grad_norm_var": 0.017622629801432293, "learning_rate": 0.0001, "loss": 7.0956, "loss/crossentropy": 2.034866750240326, "loss/hidden": 2.75, "loss/jsd": 0.0, "loss/logits": 0.19620046019554138, "step": 7842 }, { "epoch": 0.49025, "grad_norm": 1.9765625, "grad_norm_var": 0.010016886393229167, "learning_rate": 0.0001, "loss": 7.1249, "loss/crossentropy": 2.373619318008423, "loss/hidden": 2.796875, "loss/jsd": 0.0, "loss/logits": 0.2142253741621971, "step": 7844 }, { "epoch": 0.490375, "grad_norm": 2.453125, "grad_norm_var": 0.018211873372395833, "learning_rate": 0.0001, "loss": 7.1376, "loss/crossentropy": 2.135084390640259, "loss/hidden": 2.734375, "loss/jsd": 0.0, "loss/logits": 0.20321637392044067, "step": 7846 }, { "epoch": 0.4905, "grad_norm": 1.9609375, "grad_norm_var": 0.02012914021809896, "learning_rate": 0.0001, "loss": 7.0496, "loss/crossentropy": 2.344667911529541, "loss/hidden": 2.796875, "loss/jsd": 0.0, "loss/logits": 0.2073012888431549, "step": 7848 }, { "epoch": 0.490625, "grad_norm": 2.203125, "grad_norm_var": 0.018790435791015626, "learning_rate": 0.0001, "loss": 6.8819, "loss/crossentropy": 2.205102324485779, "loss/hidden": 2.7734375, "loss/jsd": 0.0, "loss/logits": 0.19368459284305573, "step": 7850 }, { "epoch": 0.49075, "grad_norm": 1.96875, "grad_norm_var": 0.01999079386393229, "learning_rate": 0.0001, "loss": 6.962, "loss/crossentropy": 2.0952929258346558, "loss/hidden": 2.828125, "loss/jsd": 0.0, "loss/logits": 0.19991254806518555, "step": 7852 }, { "epoch": 0.490875, "grad_norm": 2.078125, "grad_norm_var": 0.02063776652018229, "learning_rate": 0.0001, "loss": 7.0236, "loss/crossentropy": 2.091593384742737, "loss/hidden": 2.8359375, "loss/jsd": 0.0, "loss/logits": 0.20015785843133926, "step": 7854 }, { "epoch": 0.491, "grad_norm": 2.203125, "grad_norm_var": 0.019758860270182293, "learning_rate": 0.0001, "loss": 7.1106, "loss/crossentropy": 2.1102964878082275, "loss/hidden": 2.8359375, "loss/jsd": 0.0, "loss/logits": 0.20344851911067963, "step": 7856 }, { "epoch": 0.491125, "grad_norm": 2.09375, "grad_norm_var": 0.020869954427083334, "learning_rate": 0.0001, "loss": 7.0628, "loss/crossentropy": 2.2350775003433228, "loss/hidden": 2.78125, "loss/jsd": 0.0, "loss/logits": 0.20482338964939117, "step": 7858 }, { "epoch": 0.49125, "grad_norm": 2.015625, "grad_norm_var": 0.019779459635416666, "learning_rate": 0.0001, "loss": 7.1046, "loss/crossentropy": 1.9230089783668518, "loss/hidden": 2.8515625, "loss/jsd": 0.0, "loss/logits": 0.19790837168693542, "step": 7860 }, { "epoch": 0.491375, "grad_norm": 1.984375, "grad_norm_var": 0.0128662109375, "learning_rate": 0.0001, "loss": 7.0854, "loss/crossentropy": 2.5659754276275635, "loss/hidden": 2.8125, "loss/jsd": 0.0, "loss/logits": 0.20197945088148117, "step": 7862 }, { "epoch": 0.4915, "grad_norm": 2.328125, "grad_norm_var": 0.012967681884765625, "learning_rate": 0.0001, "loss": 7.2325, "loss/crossentropy": 2.4076273441314697, "loss/hidden": 2.828125, "loss/jsd": 0.0, "loss/logits": 0.21878042817115784, "step": 7864 }, { "epoch": 0.491625, "grad_norm": 2.25, "grad_norm_var": 0.013685862223307291, "learning_rate": 0.0001, "loss": 7.192, "loss/crossentropy": 2.3655115365982056, "loss/hidden": 2.859375, "loss/jsd": 0.0, "loss/logits": 0.2367181032896042, "step": 7866 }, { "epoch": 0.49175, "grad_norm": 2.140625, "grad_norm_var": 0.03374201456705729, "learning_rate": 0.0001, "loss": 7.2599, "loss/crossentropy": 2.1696566343307495, "loss/hidden": 2.765625, "loss/jsd": 0.0, "loss/logits": 0.2092232033610344, "step": 7868 }, { "epoch": 0.491875, "grad_norm": 2.0625, "grad_norm_var": 0.03148778279622396, "learning_rate": 0.0001, "loss": 7.2385, "loss/crossentropy": 2.376840353012085, "loss/hidden": 2.84375, "loss/jsd": 0.0, "loss/logits": 0.21353555470705032, "step": 7870 }, { "epoch": 0.492, "grad_norm": 2.046875, "grad_norm_var": 0.03212865193684896, "learning_rate": 0.0001, "loss": 7.1475, "loss/crossentropy": 2.085893988609314, "loss/hidden": 2.7109375, "loss/jsd": 0.0, "loss/logits": 0.20016048848628998, "step": 7872 }, { "epoch": 0.492125, "grad_norm": 2.0, "grad_norm_var": 0.03144912719726563, "learning_rate": 0.0001, "loss": 7.0677, "loss/crossentropy": 2.2020708322525024, "loss/hidden": 2.7578125, "loss/jsd": 0.0, "loss/logits": 0.20381900668144226, "step": 7874 }, { "epoch": 0.49225, "grad_norm": 2.140625, "grad_norm_var": 0.029173787434895834, "learning_rate": 0.0001, "loss": 7.123, "loss/crossentropy": 2.22407865524292, "loss/hidden": 2.8203125, "loss/jsd": 0.0, "loss/logits": 0.20843693614006042, "step": 7876 }, { "epoch": 0.492375, "grad_norm": 2.15625, "grad_norm_var": 0.026432291666666666, "learning_rate": 0.0001, "loss": 7.2386, "loss/crossentropy": 2.563064932823181, "loss/hidden": 2.8359375, "loss/jsd": 0.0, "loss/logits": 0.23229733109474182, "step": 7878 }, { "epoch": 0.4925, "grad_norm": 2.125, "grad_norm_var": 0.024800618489583332, "learning_rate": 0.0001, "loss": 7.1823, "loss/crossentropy": 2.2542184591293335, "loss/hidden": 2.828125, "loss/jsd": 0.0, "loss/logits": 0.20725636184215546, "step": 7880 }, { "epoch": 0.492625, "grad_norm": 2.171875, "grad_norm_var": 0.024430338541666666, "learning_rate": 0.0001, "loss": 7.051, "loss/crossentropy": 2.041069507598877, "loss/hidden": 2.7421875, "loss/jsd": 0.0, "loss/logits": 0.20505793392658234, "step": 7882 }, { "epoch": 0.49275, "grad_norm": 2.078125, "grad_norm_var": 0.0036529541015625, "learning_rate": 0.0001, "loss": 7.2277, "loss/crossentropy": 2.295996069908142, "loss/hidden": 2.7734375, "loss/jsd": 0.0, "loss/logits": 0.20132000744342804, "step": 7884 }, { "epoch": 0.492875, "grad_norm": 2.828125, "grad_norm_var": 0.0341949462890625, "learning_rate": 0.0001, "loss": 7.2021, "loss/crossentropy": 2.152729630470276, "loss/hidden": 2.7890625, "loss/jsd": 0.0, "loss/logits": 0.21744219958782196, "step": 7886 }, { "epoch": 0.493, "grad_norm": 2.125, "grad_norm_var": 0.033324178059895834, "learning_rate": 0.0001, "loss": 7.0682, "loss/crossentropy": 2.158630609512329, "loss/hidden": 2.8203125, "loss/jsd": 0.0, "loss/logits": 0.20471574366092682, "step": 7888 }, { "epoch": 0.493125, "grad_norm": 2.234375, "grad_norm_var": 0.030858357747395832, "learning_rate": 0.0001, "loss": 6.9869, "loss/crossentropy": 2.0892770290374756, "loss/hidden": 2.7578125, "loss/jsd": 0.0, "loss/logits": 0.18370763957500458, "step": 7890 }, { "epoch": 0.49325, "grad_norm": 2.03125, "grad_norm_var": 0.03192952473958333, "learning_rate": 0.0001, "loss": 7.1989, "loss/crossentropy": 2.492920398712158, "loss/hidden": 2.8828125, "loss/jsd": 0.0, "loss/logits": 0.218817800283432, "step": 7892 }, { "epoch": 0.493375, "grad_norm": 2.140625, "grad_norm_var": 0.03240559895833333, "learning_rate": 0.0001, "loss": 7.1465, "loss/crossentropy": 2.3585835695266724, "loss/hidden": 2.828125, "loss/jsd": 0.0, "loss/logits": 0.203622005879879, "step": 7894 }, { "epoch": 0.4935, "grad_norm": 2.265625, "grad_norm_var": 0.03378499348958333, "learning_rate": 0.0001, "loss": 7.1171, "loss/crossentropy": 2.145370841026306, "loss/hidden": 2.78125, "loss/jsd": 0.0, "loss/logits": 0.19818150997161865, "step": 7896 }, { "epoch": 0.493625, "grad_norm": 2.171875, "grad_norm_var": 0.03325093587239583, "learning_rate": 0.0001, "loss": 7.239, "loss/crossentropy": 2.408655047416687, "loss/hidden": 2.8203125, "loss/jsd": 0.0, "loss/logits": 0.21892248839139938, "step": 7898 }, { "epoch": 0.49375, "grad_norm": 2.125, "grad_norm_var": 0.03749974568684896, "learning_rate": 0.0001, "loss": 7.0329, "loss/crossentropy": 2.255719542503357, "loss/hidden": 2.8046875, "loss/jsd": 0.0, "loss/logits": 0.21674372255802155, "step": 7900 }, { "epoch": 0.493875, "grad_norm": 2.171875, "grad_norm_var": 0.007592519124348958, "learning_rate": 0.0001, "loss": 7.1371, "loss/crossentropy": 2.2832452058792114, "loss/hidden": 2.8515625, "loss/jsd": 0.0, "loss/logits": 0.2183477208018303, "step": 7902 }, { "epoch": 0.494, "grad_norm": 2.03125, "grad_norm_var": 0.009100087483723958, "learning_rate": 0.0001, "loss": 7.1093, "loss/crossentropy": 2.5752521753311157, "loss/hidden": 2.828125, "loss/jsd": 0.0, "loss/logits": 0.22478312999010086, "step": 7904 }, { "epoch": 0.494125, "grad_norm": 2.140625, "grad_norm_var": 0.008141835530598959, "learning_rate": 0.0001, "loss": 7.0139, "loss/crossentropy": 2.229183316230774, "loss/hidden": 2.75, "loss/jsd": 0.0, "loss/logits": 0.2103634625673294, "step": 7906 }, { "epoch": 0.49425, "grad_norm": 2.078125, "grad_norm_var": 0.009653472900390625, "learning_rate": 0.0001, "loss": 7.1175, "loss/crossentropy": 2.100042998790741, "loss/hidden": 2.796875, "loss/jsd": 0.0, "loss/logits": 0.19252710044384003, "step": 7908 }, { "epoch": 0.494375, "grad_norm": 2.171875, "grad_norm_var": 0.013268788655598959, "learning_rate": 0.0001, "loss": 7.301, "loss/crossentropy": 2.3349530696868896, "loss/hidden": 2.890625, "loss/jsd": 0.0, "loss/logits": 0.22781674563884735, "step": 7910 }, { "epoch": 0.4945, "grad_norm": 2.0, "grad_norm_var": 0.013352203369140624, "learning_rate": 0.0001, "loss": 7.0135, "loss/crossentropy": 2.2375741004943848, "loss/hidden": 2.765625, "loss/jsd": 0.0, "loss/logits": 0.19559404999017715, "step": 7912 }, { "epoch": 0.494625, "grad_norm": 2.109375, "grad_norm_var": 0.012680816650390624, "learning_rate": 0.0001, "loss": 7.093, "loss/crossentropy": 2.447828769683838, "loss/hidden": 2.8828125, "loss/jsd": 0.0, "loss/logits": 0.2270418256521225, "step": 7914 }, { "epoch": 0.49475, "grad_norm": 2.28125, "grad_norm_var": 0.012059529622395834, "learning_rate": 0.0001, "loss": 6.9899, "loss/crossentropy": 2.0814391374588013, "loss/hidden": 2.7734375, "loss/jsd": 0.0, "loss/logits": 0.18889734894037247, "step": 7916 }, { "epoch": 0.494875, "grad_norm": 1.96875, "grad_norm_var": 0.0132476806640625, "learning_rate": 0.0001, "loss": 7.1485, "loss/crossentropy": 2.4870957136154175, "loss/hidden": 2.8125, "loss/jsd": 0.0, "loss/logits": 0.21650540828704834, "step": 7918 }, { "epoch": 0.495, "grad_norm": 2.25, "grad_norm_var": 0.0121734619140625, "learning_rate": 0.0001, "loss": 7.0928, "loss/crossentropy": 2.1004719734191895, "loss/hidden": 2.8203125, "loss/jsd": 0.0, "loss/logits": 0.19623607397079468, "step": 7920 }, { "epoch": 0.495125, "grad_norm": 2.515625, "grad_norm_var": 0.0206207275390625, "learning_rate": 0.0001, "loss": 7.2285, "loss/crossentropy": 2.592145800590515, "loss/hidden": 2.8359375, "loss/jsd": 0.0, "loss/logits": 0.20861030369997025, "step": 7922 }, { "epoch": 0.49525, "grad_norm": 2.03125, "grad_norm_var": 0.021271769205729166, "learning_rate": 0.0001, "loss": 7.0363, "loss/crossentropy": 2.118270993232727, "loss/hidden": 2.859375, "loss/jsd": 0.0, "loss/logits": 0.19777195155620575, "step": 7924 }, { "epoch": 0.495375, "grad_norm": 2.015625, "grad_norm_var": 0.0199615478515625, "learning_rate": 0.0001, "loss": 6.9582, "loss/crossentropy": 2.176929235458374, "loss/hidden": 2.8046875, "loss/jsd": 0.0, "loss/logits": 0.21006185561418533, "step": 7926 }, { "epoch": 0.4955, "grad_norm": 2.03125, "grad_norm_var": 0.019074503580729166, "learning_rate": 0.0001, "loss": 7.0891, "loss/crossentropy": 2.425865054130554, "loss/hidden": 2.765625, "loss/jsd": 0.0, "loss/logits": 0.20186667144298553, "step": 7928 }, { "epoch": 0.495625, "grad_norm": 2.046875, "grad_norm_var": 0.020685831705729168, "learning_rate": 0.0001, "loss": 7.0731, "loss/crossentropy": 2.2788286209106445, "loss/hidden": 2.796875, "loss/jsd": 0.0, "loss/logits": 0.20905498415231705, "step": 7930 }, { "epoch": 0.49575, "grad_norm": 2.140625, "grad_norm_var": 0.019010416666666665, "learning_rate": 0.0001, "loss": 7.0084, "loss/crossentropy": 2.188755750656128, "loss/hidden": 2.75, "loss/jsd": 0.0, "loss/logits": 0.21286778151988983, "step": 7932 }, { "epoch": 0.495875, "grad_norm": 1.96875, "grad_norm_var": 0.019416300455729167, "learning_rate": 0.0001, "loss": 6.9673, "loss/crossentropy": 2.3245939016342163, "loss/hidden": 2.8515625, "loss/jsd": 0.0, "loss/logits": 0.22744500637054443, "step": 7934 }, { "epoch": 0.496, "grad_norm": 2.265625, "grad_norm_var": 0.01890869140625, "learning_rate": 0.0001, "loss": 7.149, "loss/crossentropy": 2.371795654296875, "loss/hidden": 2.7890625, "loss/jsd": 0.0, "loss/logits": 0.21272078156471252, "step": 7936 }, { "epoch": 0.496125, "grad_norm": 1.96875, "grad_norm_var": 0.007624308268229167, "learning_rate": 0.0001, "loss": 6.9848, "loss/crossentropy": 2.1958614587783813, "loss/hidden": 2.8515625, "loss/jsd": 0.0, "loss/logits": 0.21580558270215988, "step": 7938 }, { "epoch": 0.49625, "grad_norm": 2.046875, "grad_norm_var": 0.007666015625, "learning_rate": 0.0001, "loss": 7.1961, "loss/crossentropy": 2.513580083847046, "loss/hidden": 2.84375, "loss/jsd": 0.0, "loss/logits": 0.202760711312294, "step": 7940 }, { "epoch": 0.496375, "grad_norm": 2.0625, "grad_norm_var": 0.00748291015625, "learning_rate": 0.0001, "loss": 7.1847, "loss/crossentropy": 1.978775680065155, "loss/hidden": 2.7421875, "loss/jsd": 0.0, "loss/logits": 0.19025252759456635, "step": 7942 }, { "epoch": 0.4965, "grad_norm": 2.34375, "grad_norm_var": 0.0118316650390625, "learning_rate": 0.0001, "loss": 6.9504, "loss/crossentropy": 2.2968384623527527, "loss/hidden": 2.921875, "loss/jsd": 0.0, "loss/logits": 0.21156945079565048, "step": 7944 }, { "epoch": 0.496625, "grad_norm": 2.90625, "grad_norm_var": 0.051774088541666666, "learning_rate": 0.0001, "loss": 7.197, "loss/crossentropy": 2.2490543127059937, "loss/hidden": 2.8203125, "loss/jsd": 0.0, "loss/logits": 0.20457617938518524, "step": 7946 }, { "epoch": 0.49675, "grad_norm": 2.09375, "grad_norm_var": 0.051488240559895836, "learning_rate": 0.0001, "loss": 7.0335, "loss/crossentropy": 2.0426313877105713, "loss/hidden": 2.7890625, "loss/jsd": 0.0, "loss/logits": 0.21022196114063263, "step": 7948 }, { "epoch": 0.496875, "grad_norm": 2.046875, "grad_norm_var": 0.0497222900390625, "learning_rate": 0.0001, "loss": 7.1089, "loss/crossentropy": 2.0203962326049805, "loss/hidden": 2.8828125, "loss/jsd": 0.0, "loss/logits": 0.1993423029780388, "step": 7950 }, { "epoch": 0.497, "grad_norm": 2.296875, "grad_norm_var": 0.0502349853515625, "learning_rate": 0.0001, "loss": 7.0083, "loss/crossentropy": 2.5307857990264893, "loss/hidden": 2.828125, "loss/jsd": 0.0, "loss/logits": 0.21577821671962738, "step": 7952 }, { "epoch": 0.497125, "grad_norm": 1.9453125, "grad_norm_var": 0.05158665974934896, "learning_rate": 0.0001, "loss": 7.037, "loss/crossentropy": 2.324398159980774, "loss/hidden": 2.8515625, "loss/jsd": 0.0, "loss/logits": 0.20606698840856552, "step": 7954 }, { "epoch": 0.49725, "grad_norm": 2.375, "grad_norm_var": 0.05232518513997396, "learning_rate": 0.0001, "loss": 7.2288, "loss/crossentropy": 2.4783180952072144, "loss/hidden": 2.8828125, "loss/jsd": 0.0, "loss/logits": 0.2216300517320633, "step": 7956 }, { "epoch": 0.497375, "grad_norm": 2.359375, "grad_norm_var": 0.051401519775390626, "learning_rate": 0.0001, "loss": 7.213, "loss/crossentropy": 2.411279320716858, "loss/hidden": 2.8359375, "loss/jsd": 0.0, "loss/logits": 0.21086601167917252, "step": 7958 }, { "epoch": 0.4975, "grad_norm": 2.09375, "grad_norm_var": 0.051092274983723956, "learning_rate": 0.0001, "loss": 7.2168, "loss/crossentropy": 2.1537816524505615, "loss/hidden": 2.8203125, "loss/jsd": 0.0, "loss/logits": 0.20549767464399338, "step": 7960 }, { "epoch": 0.497625, "grad_norm": 2.078125, "grad_norm_var": 0.016281890869140624, "learning_rate": 0.0001, "loss": 7.1023, "loss/crossentropy": 2.394239068031311, "loss/hidden": 2.796875, "loss/jsd": 0.0, "loss/logits": 0.21313968300819397, "step": 7962 }, { "epoch": 0.49775, "grad_norm": 2.125, "grad_norm_var": 0.016798655192057293, "learning_rate": 0.0001, "loss": 7.185, "loss/crossentropy": 2.42279052734375, "loss/hidden": 2.828125, "loss/jsd": 0.0, "loss/logits": 0.2198297083377838, "step": 7964 }, { "epoch": 0.497875, "grad_norm": 2.140625, "grad_norm_var": 0.01599909464518229, "learning_rate": 0.0001, "loss": 7.0906, "loss/crossentropy": 2.2097359895706177, "loss/hidden": 2.7578125, "loss/jsd": 0.0, "loss/logits": 0.20080310106277466, "step": 7966 }, { "epoch": 0.498, "grad_norm": 2.15625, "grad_norm_var": 0.014574940999348958, "learning_rate": 0.0001, "loss": 7.3154, "loss/crossentropy": 2.3493038415908813, "loss/hidden": 2.7578125, "loss/jsd": 0.0, "loss/logits": 0.19426824152469635, "step": 7968 }, { "epoch": 0.498125, "grad_norm": 2.125, "grad_norm_var": 0.0106597900390625, "learning_rate": 0.0001, "loss": 7.1954, "loss/crossentropy": 2.355955958366394, "loss/hidden": 2.859375, "loss/jsd": 0.0, "loss/logits": 0.2216314971446991, "step": 7970 }, { "epoch": 0.49825, "grad_norm": 2.078125, "grad_norm_var": 0.007811482747395833, "learning_rate": 0.0001, "loss": 7.2072, "loss/crossentropy": 2.63875412940979, "loss/hidden": 2.8515625, "loss/jsd": 0.0, "loss/logits": 0.21204157173633575, "step": 7972 }, { "epoch": 0.498375, "grad_norm": 2.171875, "grad_norm_var": 0.0047686258951822914, "learning_rate": 0.0001, "loss": 7.0455, "loss/crossentropy": 2.2208076119422913, "loss/hidden": 2.75, "loss/jsd": 0.0, "loss/logits": 0.2118864804506302, "step": 7974 }, { "epoch": 0.4985, "grad_norm": 2.046875, "grad_norm_var": 0.004939524332682291, "learning_rate": 0.0001, "loss": 7.228, "loss/crossentropy": 2.2854151725769043, "loss/hidden": 2.8515625, "loss/jsd": 0.0, "loss/logits": 0.20473261177539825, "step": 7976 }, { "epoch": 0.498625, "grad_norm": 2.484375, "grad_norm_var": 0.013315582275390625, "learning_rate": 0.0001, "loss": 7.1931, "loss/crossentropy": 2.1924540996551514, "loss/hidden": 2.8203125, "loss/jsd": 0.0, "loss/logits": 0.22139258682727814, "step": 7978 }, { "epoch": 0.49875, "grad_norm": 2.015625, "grad_norm_var": 0.017267862955729168, "learning_rate": 0.0001, "loss": 7.061, "loss/crossentropy": 2.3097928762435913, "loss/hidden": 2.78125, "loss/jsd": 0.0, "loss/logits": 0.21281752735376358, "step": 7980 }, { "epoch": 0.498875, "grad_norm": 2.15625, "grad_norm_var": 0.016605631510416666, "learning_rate": 0.0001, "loss": 7.2304, "loss/crossentropy": 2.5834310054779053, "loss/hidden": 2.8515625, "loss/jsd": 0.0, "loss/logits": 0.20607245713472366, "step": 7982 }, { "epoch": 0.499, "grad_norm": 1.984375, "grad_norm_var": 0.01800537109375, "learning_rate": 0.0001, "loss": 7.168, "loss/crossentropy": 2.2289873361587524, "loss/hidden": 2.84375, "loss/jsd": 0.0, "loss/logits": 0.1980709359049797, "step": 7984 }, { "epoch": 0.499125, "grad_norm": 2.15625, "grad_norm_var": 0.02115453084309896, "learning_rate": 0.0001, "loss": 7.1071, "loss/crossentropy": 2.3941906690597534, "loss/hidden": 2.8125, "loss/jsd": 0.0, "loss/logits": 0.2208108827471733, "step": 7986 }, { "epoch": 0.49925, "grad_norm": 2.671875, "grad_norm_var": 0.055149078369140625, "learning_rate": 0.0001, "loss": 7.3124, "loss/crossentropy": 2.1894861459732056, "loss/hidden": 3.0078125, "loss/jsd": 0.0, "loss/logits": 0.218056783080101, "step": 7988 }, { "epoch": 0.499375, "grad_norm": 2.0, "grad_norm_var": 0.05746256510416667, "learning_rate": 0.0001, "loss": 7.0268, "loss/crossentropy": 2.122998356819153, "loss/hidden": 2.796875, "loss/jsd": 0.0, "loss/logits": 0.20764590054750443, "step": 7990 }, { "epoch": 0.4995, "grad_norm": 2.21875, "grad_norm_var": 0.056574503580729164, "learning_rate": 0.0001, "loss": 7.0048, "loss/crossentropy": 2.28106951713562, "loss/hidden": 2.75, "loss/jsd": 0.0, "loss/logits": 0.20048896968364716, "step": 7992 }, { "epoch": 0.499625, "grad_norm": 2.0625, "grad_norm_var": 0.05073140462239583, "learning_rate": 0.0001, "loss": 7.197, "loss/crossentropy": 2.450806140899658, "loss/hidden": 2.828125, "loss/jsd": 0.0, "loss/logits": 0.20395724475383759, "step": 7994 }, { "epoch": 0.49975, "grad_norm": 2.1875, "grad_norm_var": 0.045171864827473956, "learning_rate": 0.0001, "loss": 7.0883, "loss/crossentropy": 2.2239577770233154, "loss/hidden": 2.8125, "loss/jsd": 0.0, "loss/logits": 0.22716625034809113, "step": 7996 }, { "epoch": 0.499875, "grad_norm": 2.21875, "grad_norm_var": 0.046213531494140626, "learning_rate": 0.0001, "loss": 7.1814, "loss/crossentropy": 2.3571836948394775, "loss/hidden": 2.8515625, "loss/jsd": 0.0, "loss/logits": 0.20826996862888336, "step": 7998 }, { "epoch": 0.5, "grad_norm": 2.0625, "grad_norm_var": 0.04478530883789063, "learning_rate": 0.0001, "loss": 7.208, "loss/crossentropy": 2.152498722076416, "loss/hidden": 2.765625, "loss/jsd": 0.0, "loss/logits": 0.18669088929891586, "step": 8000 } ], "logging_steps": 2, "max_steps": 16000, "num_input_tokens_seen": 0, "num_train_epochs": 9223372036854775807, "save_steps": 4000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.666362485243904e+19, "train_batch_size": 8, "trial_name": null, "trial_params": null }